3943 lines
128 KiB
JSON
3943 lines
128 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0273972602739727,
|
|
"eval_steps": 500,
|
|
"global_step": 300,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"completion_length": 2687.75,
|
|
"epoch": 0.003424657534246575,
|
|
"grad_norm": 0.18900136649608612,
|
|
"kl": 0.0,
|
|
"learning_rate": 3.3333333333333334e-08,
|
|
"loss": 0.0,
|
|
"reward": 0.3042634315788746,
|
|
"reward_std": 0.4194334000349045,
|
|
"rewards/cosine_scaled_reward": -0.14712543413043022,
|
|
"rewards/format_reward": 0.4513889104127884,
|
|
"step": 1
|
|
},
|
|
{
|
|
"completion_length": 2708.71533203125,
|
|
"epoch": 0.00684931506849315,
|
|
"grad_norm": 0.18016651272773743,
|
|
"kl": 0.0,
|
|
"learning_rate": 6.666666666666667e-08,
|
|
"loss": 0.0,
|
|
"reward": 0.3438135087490082,
|
|
"reward_std": 0.4610961228609085,
|
|
"rewards/cosine_scaled_reward": -0.1353531926870346,
|
|
"rewards/format_reward": 0.4791666716337204,
|
|
"step": 2
|
|
},
|
|
{
|
|
"completion_length": 2606.2569580078125,
|
|
"epoch": 0.010273972602739725,
|
|
"grad_norm": 0.20038573443889618,
|
|
"kl": 9.512901306152344e-05,
|
|
"learning_rate": 1e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.4459744915366173,
|
|
"reward_std": 0.4223913550376892,
|
|
"rewards/cosine_scaled_reward": -0.033192168921232224,
|
|
"rewards/format_reward": 0.4791666567325592,
|
|
"step": 3
|
|
},
|
|
{
|
|
"completion_length": 2475.6805419921875,
|
|
"epoch": 0.0136986301369863,
|
|
"grad_norm": 0.288876473903656,
|
|
"kl": 0.00010967254638671875,
|
|
"learning_rate": 1.3333333333333334e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.5504279434680939,
|
|
"reward_std": 0.4054145812988281,
|
|
"rewards/cosine_scaled_reward": -0.07457206398248672,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 4
|
|
},
|
|
{
|
|
"completion_length": 2784.541748046875,
|
|
"epoch": 0.017123287671232876,
|
|
"grad_norm": 0.14747899770736694,
|
|
"kl": 0.00010347366333007812,
|
|
"learning_rate": 1.6666666666666665e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.3733392059803009,
|
|
"reward_std": 0.4893380403518677,
|
|
"rewards/cosine_scaled_reward": -0.057216365821659565,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 5
|
|
},
|
|
{
|
|
"completion_length": 3103.0902099609375,
|
|
"epoch": 0.02054794520547945,
|
|
"grad_norm": 0.1839321404695511,
|
|
"kl": 0.00011086463928222656,
|
|
"learning_rate": 2e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.02957332320511341,
|
|
"reward_std": 0.3297805115580559,
|
|
"rewards/cosine_scaled_reward": -0.22042667865753174,
|
|
"rewards/format_reward": 0.2499999925494194,
|
|
"step": 6
|
|
},
|
|
{
|
|
"completion_length": 2373.9375,
|
|
"epoch": 0.023972602739726026,
|
|
"grad_norm": 0.22951197624206543,
|
|
"kl": 0.00012159347534179688,
|
|
"learning_rate": 2.3333333333333333e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.4939586818218231,
|
|
"reward_std": 0.3855544626712799,
|
|
"rewards/cosine_scaled_reward": -0.04770803824067116,
|
|
"rewards/format_reward": 0.5416666716337204,
|
|
"step": 7
|
|
},
|
|
{
|
|
"completion_length": 2863.65283203125,
|
|
"epoch": 0.0273972602739726,
|
|
"grad_norm": 0.1952303647994995,
|
|
"kl": 0.00010585784912109375,
|
|
"learning_rate": 2.6666666666666667e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.17456580698490143,
|
|
"reward_std": 0.4745761901140213,
|
|
"rewards/cosine_scaled_reward": -0.15182308107614517,
|
|
"rewards/format_reward": 0.3263888880610466,
|
|
"step": 8
|
|
},
|
|
{
|
|
"completion_length": 3127.5833740234375,
|
|
"epoch": 0.030821917808219176,
|
|
"grad_norm": 0.12419066578149796,
|
|
"kl": 0.00010228157043457031,
|
|
"learning_rate": 3e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.10362935438752174,
|
|
"reward_std": 0.3372735381126404,
|
|
"rewards/cosine_scaled_reward": -0.15331509709358215,
|
|
"rewards/format_reward": 0.2569444552063942,
|
|
"step": 9
|
|
},
|
|
{
|
|
"completion_length": 3021.9722900390625,
|
|
"epoch": 0.03424657534246575,
|
|
"grad_norm": 0.1988951563835144,
|
|
"kl": 0.00011277198791503906,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.13170428201556206,
|
|
"reward_std": 0.41356146335601807,
|
|
"rewards/cosine_scaled_reward": -0.16690683364868164,
|
|
"rewards/format_reward": 0.298611119389534,
|
|
"step": 10
|
|
},
|
|
{
|
|
"completion_length": 2931.1319580078125,
|
|
"epoch": 0.03767123287671233,
|
|
"grad_norm": 0.18511006236076355,
|
|
"kl": 0.0001304149627685547,
|
|
"learning_rate": 3.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.1742808436974883,
|
|
"reward_std": 0.32126323878765106,
|
|
"rewards/cosine_scaled_reward": -0.17294137924909592,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 11
|
|
},
|
|
{
|
|
"completion_length": 2932.5555419921875,
|
|
"epoch": 0.0410958904109589,
|
|
"grad_norm": 0.16578702628612518,
|
|
"kl": 0.00010228157043457031,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.19121046364307404,
|
|
"reward_std": 0.3839987516403198,
|
|
"rewards/cosine_scaled_reward": -0.16295619308948517,
|
|
"rewards/format_reward": 0.3541666567325592,
|
|
"step": 12
|
|
},
|
|
{
|
|
"completion_length": 2798.6180419921875,
|
|
"epoch": 0.04452054794520548,
|
|
"grad_norm": 0.15695109963417053,
|
|
"kl": 9.846687316894531e-05,
|
|
"learning_rate": 4.3333333333333335e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.40019115805625916,
|
|
"reward_std": 0.49739648401737213,
|
|
"rewards/cosine_scaled_reward": -0.05814217543229461,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 13
|
|
},
|
|
{
|
|
"completion_length": 2636.548583984375,
|
|
"epoch": 0.04794520547945205,
|
|
"grad_norm": 0.19918033480644226,
|
|
"kl": 0.00012230873107910156,
|
|
"learning_rate": 4.6666666666666666e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.5720034390687943,
|
|
"reward_std": 0.46172526478767395,
|
|
"rewards/cosine_scaled_reward": 0.009503423236310482,
|
|
"rewards/format_reward": 0.5625,
|
|
"step": 14
|
|
},
|
|
{
|
|
"completion_length": 2823.763916015625,
|
|
"epoch": 0.05136986301369863,
|
|
"grad_norm": 0.13800376653671265,
|
|
"kl": 0.00010466575622558594,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.46923990547657013,
|
|
"reward_std": 0.5656772404909134,
|
|
"rewards/cosine_scaled_reward": -0.037704543210566044,
|
|
"rewards/format_reward": 0.5069444477558136,
|
|
"step": 15
|
|
},
|
|
{
|
|
"completion_length": 2824.2777099609375,
|
|
"epoch": 0.0547945205479452,
|
|
"grad_norm": 0.19206716120243073,
|
|
"kl": 0.00011682510375976562,
|
|
"learning_rate": 5.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.23321796208620071,
|
|
"reward_std": 0.41186313331127167,
|
|
"rewards/cosine_scaled_reward": -0.10011539235711098,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 16
|
|
},
|
|
{
|
|
"completion_length": 2954.1112060546875,
|
|
"epoch": 0.05821917808219178,
|
|
"grad_norm": 0.19295988976955414,
|
|
"kl": 0.00010442733764648438,
|
|
"learning_rate": 5.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.28434962406754494,
|
|
"reward_std": 0.5011897683143616,
|
|
"rewards/cosine_scaled_reward": -0.07676149532198906,
|
|
"rewards/format_reward": 0.361111119389534,
|
|
"step": 17
|
|
},
|
|
{
|
|
"completion_length": 2896.78466796875,
|
|
"epoch": 0.06164383561643835,
|
|
"grad_norm": 0.16591955721378326,
|
|
"kl": 0.00012493133544921875,
|
|
"learning_rate": 6e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.1477920152246952,
|
|
"reward_std": 0.3473651483654976,
|
|
"rewards/cosine_scaled_reward": -0.1855413243174553,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 18
|
|
},
|
|
{
|
|
"completion_length": 2883.125,
|
|
"epoch": 0.06506849315068493,
|
|
"grad_norm": 0.24647116661071777,
|
|
"kl": 0.0001239776611328125,
|
|
"learning_rate": 6.333333333333332e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.23157373815774918,
|
|
"reward_std": 0.3507264107465744,
|
|
"rewards/cosine_scaled_reward": -0.10870405659079552,
|
|
"rewards/format_reward": 0.3402777761220932,
|
|
"step": 19
|
|
},
|
|
{
|
|
"completion_length": 2961.7362060546875,
|
|
"epoch": 0.0684931506849315,
|
|
"grad_norm": 0.18728116154670715,
|
|
"kl": 0.000133514404296875,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.17506830394268036,
|
|
"reward_std": 0.30466167628765106,
|
|
"rewards/cosine_scaled_reward": -0.13743170350790024,
|
|
"rewards/format_reward": 0.3125,
|
|
"step": 20
|
|
},
|
|
{
|
|
"completion_length": 2186.2501220703125,
|
|
"epoch": 0.07191780821917808,
|
|
"grad_norm": 0.21891450881958008,
|
|
"kl": 0.00015282630920410156,
|
|
"learning_rate": 7e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.8545757234096527,
|
|
"reward_std": 0.4611281454563141,
|
|
"rewards/cosine_scaled_reward": 0.1392979435622692,
|
|
"rewards/format_reward": 0.715277761220932,
|
|
"step": 21
|
|
},
|
|
{
|
|
"completion_length": 2804.2291259765625,
|
|
"epoch": 0.07534246575342465,
|
|
"grad_norm": 0.17504653334617615,
|
|
"kl": 0.000141143798828125,
|
|
"learning_rate": 7.333333333333332e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.26354077458381653,
|
|
"reward_std": 0.30501972138881683,
|
|
"rewards/cosine_scaled_reward": -0.16007035970687866,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 22
|
|
},
|
|
{
|
|
"completion_length": 2849.8958740234375,
|
|
"epoch": 0.07876712328767123,
|
|
"grad_norm": 0.16187402606010437,
|
|
"kl": 0.00014853477478027344,
|
|
"learning_rate": 7.666666666666667e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.2565463110804558,
|
|
"reward_std": 0.4471036493778229,
|
|
"rewards/cosine_scaled_reward": -0.11150925606489182,
|
|
"rewards/format_reward": 0.3680555522441864,
|
|
"step": 23
|
|
},
|
|
{
|
|
"completion_length": 2847.166748046875,
|
|
"epoch": 0.0821917808219178,
|
|
"grad_norm": 0.1545354276895523,
|
|
"kl": 0.0001323223114013672,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.4566986709833145,
|
|
"reward_std": 0.5350392460823059,
|
|
"rewards/cosine_scaled_reward": -0.015523582696914673,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 24
|
|
},
|
|
{
|
|
"completion_length": 2807.2708740234375,
|
|
"epoch": 0.08561643835616438,
|
|
"grad_norm": 0.18177877366542816,
|
|
"kl": 0.00020647048950195312,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.21164313331246376,
|
|
"reward_std": 0.3955174386501312,
|
|
"rewards/cosine_scaled_reward": -0.15641241893172264,
|
|
"rewards/format_reward": 0.3680555522441864,
|
|
"step": 25
|
|
},
|
|
{
|
|
"completion_length": 2927.194580078125,
|
|
"epoch": 0.08904109589041095,
|
|
"grad_norm": 0.17393368482589722,
|
|
"kl": 0.00020933151245117188,
|
|
"learning_rate": 8.666666666666667e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.370952308177948,
|
|
"reward_std": 0.4646989554166794,
|
|
"rewards/cosine_scaled_reward": -0.017936568707227707,
|
|
"rewards/format_reward": 0.388888880610466,
|
|
"step": 26
|
|
},
|
|
{
|
|
"completion_length": 2989.7291259765625,
|
|
"epoch": 0.09246575342465753,
|
|
"grad_norm": 0.19562838971614838,
|
|
"kl": 0.00015878677368164062,
|
|
"learning_rate": 9e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.10655032098293304,
|
|
"reward_std": 0.4004325717687607,
|
|
"rewards/cosine_scaled_reward": -0.20594968646764755,
|
|
"rewards/format_reward": 0.3125,
|
|
"step": 27
|
|
},
|
|
{
|
|
"completion_length": 3012.5555419921875,
|
|
"epoch": 0.0958904109589041,
|
|
"grad_norm": 0.15751628577709198,
|
|
"kl": 0.00021696090698242188,
|
|
"learning_rate": 9.333333333333333e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.3497147411108017,
|
|
"reward_std": 0.6247645616531372,
|
|
"rewards/cosine_scaled_reward": -0.0738963820040226,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 28
|
|
},
|
|
{
|
|
"completion_length": 3110.6181640625,
|
|
"epoch": 0.09931506849315068,
|
|
"grad_norm": 0.13668763637542725,
|
|
"kl": 0.00023889541625976562,
|
|
"learning_rate": 9.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.10539204813539982,
|
|
"reward_std": 0.30441631376743317,
|
|
"rewards/cosine_scaled_reward": -0.17933017387986183,
|
|
"rewards/format_reward": 0.2847222238779068,
|
|
"step": 29
|
|
},
|
|
{
|
|
"completion_length": 2887.2083740234375,
|
|
"epoch": 0.10273972602739725,
|
|
"grad_norm": 0.17280973494052887,
|
|
"kl": 0.00022649765014648438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"reward": 0.4436980187892914,
|
|
"reward_std": 0.5404154658317566,
|
|
"rewards/cosine_scaled_reward": 0.006197985261678696,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 30
|
|
},
|
|
{
|
|
"completion_length": 2932.757080078125,
|
|
"epoch": 0.10616438356164383,
|
|
"grad_norm": 0.15570040047168732,
|
|
"kl": 0.00044155120849609375,
|
|
"learning_rate": 9.99969538601693e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.16736368834972382,
|
|
"reward_std": 0.4045102745294571,
|
|
"rewards/cosine_scaled_reward": -0.15208075568079948,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 31
|
|
},
|
|
{
|
|
"completion_length": 3094.4444580078125,
|
|
"epoch": 0.1095890410958904,
|
|
"grad_norm": 0.14696453511714935,
|
|
"kl": 0.0003809928894042969,
|
|
"learning_rate": 9.998781585307575e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.1443577939644456,
|
|
"reward_std": 0.5129190236330032,
|
|
"rewards/cosine_scaled_reward": -0.16119776666164398,
|
|
"rewards/format_reward": 0.3055555522441864,
|
|
"step": 32
|
|
},
|
|
{
|
|
"completion_length": 2546.0556640625,
|
|
"epoch": 0.11301369863013698,
|
|
"grad_norm": 0.1807711124420166,
|
|
"kl": 0.00067138671875,
|
|
"learning_rate": 9.997258721585931e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.5040995180606842,
|
|
"reward_std": 0.43710532784461975,
|
|
"rewards/cosine_scaled_reward": 0.011043965816497803,
|
|
"rewards/format_reward": 0.4930555671453476,
|
|
"step": 33
|
|
},
|
|
{
|
|
"completion_length": 2832.5902099609375,
|
|
"epoch": 0.11643835616438356,
|
|
"grad_norm": 0.16727592051029205,
|
|
"kl": 0.000640869140625,
|
|
"learning_rate": 9.99512700102336e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.24528168514370918,
|
|
"reward_std": 0.4029017984867096,
|
|
"rewards/cosine_scaled_reward": -0.15749610401690006,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 34
|
|
},
|
|
{
|
|
"completion_length": 2865.923583984375,
|
|
"epoch": 0.11986301369863013,
|
|
"grad_norm": 0.1456187218427658,
|
|
"kl": 0.0006847381591796875,
|
|
"learning_rate": 9.992386712220707e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.27862629294395447,
|
|
"reward_std": 0.37985116243362427,
|
|
"rewards/cosine_scaled_reward": -0.11026258394122124,
|
|
"rewards/format_reward": 0.3888888955116272,
|
|
"step": 35
|
|
},
|
|
{
|
|
"completion_length": 2459.1527709960938,
|
|
"epoch": 0.1232876712328767,
|
|
"grad_norm": 0.16684532165527344,
|
|
"kl": 0.0010623931884765625,
|
|
"learning_rate": 9.989038226169207e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.5571352988481522,
|
|
"reward_std": 0.4538237154483795,
|
|
"rewards/cosine_scaled_reward": 0.015468628145754337,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 36
|
|
},
|
|
{
|
|
"completion_length": 2674.763916015625,
|
|
"epoch": 0.1267123287671233,
|
|
"grad_norm": 0.15782181918621063,
|
|
"kl": 0.001285552978515625,
|
|
"learning_rate": 9.985081996200277e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.3798440098762512,
|
|
"reward_std": 0.48301415145397186,
|
|
"rewards/cosine_scaled_reward": -0.08543377462774515,
|
|
"rewards/format_reward": 0.4652777910232544,
|
|
"step": 37
|
|
},
|
|
{
|
|
"completion_length": 2692.986083984375,
|
|
"epoch": 0.13013698630136986,
|
|
"grad_norm": 0.16332699358463287,
|
|
"kl": 0.0014743804931640625,
|
|
"learning_rate": 9.98051855792412e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.38910074532032013,
|
|
"reward_std": 0.3475951850414276,
|
|
"rewards/cosine_scaled_reward": -0.03451040526852012,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 38
|
|
},
|
|
{
|
|
"completion_length": 3102.4930419921875,
|
|
"epoch": 0.13356164383561644,
|
|
"grad_norm": 0.1224745586514473,
|
|
"kl": 0.00080108642578125,
|
|
"learning_rate": 9.975348529157229e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.22906357236206532,
|
|
"reward_std": 0.5051266252994537,
|
|
"rewards/cosine_scaled_reward": -0.10426976904273033,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 39
|
|
},
|
|
{
|
|
"completion_length": 2613.0347900390625,
|
|
"epoch": 0.136986301369863,
|
|
"grad_norm": 0.2610801160335541,
|
|
"kl": 0.00220489501953125,
|
|
"learning_rate": 9.969572609838744e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.39406873285770416,
|
|
"reward_std": 0.5201945602893829,
|
|
"rewards/cosine_scaled_reward": -0.13370903208851814,
|
|
"rewards/format_reward": 0.5277777910232544,
|
|
"step": 40
|
|
},
|
|
{
|
|
"completion_length": 2810.84033203125,
|
|
"epoch": 0.1404109589041096,
|
|
"grad_norm": 0.1520536243915558,
|
|
"kl": 0.00159454345703125,
|
|
"learning_rate": 9.963191581935677e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.26414267159998417,
|
|
"reward_std": 0.39119474589824677,
|
|
"rewards/cosine_scaled_reward": -0.12474621459841728,
|
|
"rewards/format_reward": 0.3888888955116272,
|
|
"step": 41
|
|
},
|
|
{
|
|
"completion_length": 2832.3958740234375,
|
|
"epoch": 0.14383561643835616,
|
|
"grad_norm": 0.16354550421237946,
|
|
"kl": 0.001583099365234375,
|
|
"learning_rate": 9.956206309337066e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.1603957675397396,
|
|
"reward_std": 0.2545732408761978,
|
|
"rewards/cosine_scaled_reward": -0.19377091526985168,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 42
|
|
},
|
|
{
|
|
"completion_length": 2594.4166259765625,
|
|
"epoch": 0.14726027397260275,
|
|
"grad_norm": 0.1575089693069458,
|
|
"kl": 0.002162933349609375,
|
|
"learning_rate": 9.948617737737001e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.4924927353858948,
|
|
"reward_std": 0.4764625281095505,
|
|
"rewards/cosine_scaled_reward": -0.05611838772892952,
|
|
"rewards/format_reward": 0.5486111044883728,
|
|
"step": 43
|
|
},
|
|
{
|
|
"completion_length": 3142.638916015625,
|
|
"epoch": 0.1506849315068493,
|
|
"grad_norm": 0.13056397438049316,
|
|
"kl": 0.0015392303466796875,
|
|
"learning_rate": 9.940426894506606e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.18407823704183102,
|
|
"reward_std": 0.40585757791996,
|
|
"rewards/cosine_scaled_reward": -0.0867550881812349,
|
|
"rewards/format_reward": 0.2708333432674408,
|
|
"step": 44
|
|
},
|
|
{
|
|
"completion_length": 2841.7501220703125,
|
|
"epoch": 0.1541095890410959,
|
|
"grad_norm": 0.1490578055381775,
|
|
"kl": 0.0020904541015625,
|
|
"learning_rate": 9.931634888554935e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.24261729046702385,
|
|
"reward_std": 0.4036310315132141,
|
|
"rewards/cosine_scaled_reward": -0.11849382892251015,
|
|
"rewards/format_reward": 0.3611111119389534,
|
|
"step": 45
|
|
},
|
|
{
|
|
"completion_length": 2715.6944580078125,
|
|
"epoch": 0.15753424657534246,
|
|
"grad_norm": 0.18108582496643066,
|
|
"kl": 0.00254058837890625,
|
|
"learning_rate": 9.922242910178859e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.40730662643909454,
|
|
"reward_std": 0.5260264724493027,
|
|
"rewards/cosine_scaled_reward": -0.05797116830945015,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 46
|
|
},
|
|
{
|
|
"completion_length": 2995.9862060546875,
|
|
"epoch": 0.16095890410958905,
|
|
"grad_norm": 0.23772801458835602,
|
|
"kl": 0.0029754638671875,
|
|
"learning_rate": 9.912252230901906e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.28154853731393814,
|
|
"reward_std": 0.44985754787921906,
|
|
"rewards/cosine_scaled_reward": -0.0726181073114276,
|
|
"rewards/format_reward": 0.3541666641831398,
|
|
"step": 47
|
|
},
|
|
{
|
|
"completion_length": 2810.7222900390625,
|
|
"epoch": 0.1643835616438356,
|
|
"grad_norm": 0.1889384686946869,
|
|
"kl": 0.00370025634765625,
|
|
"learning_rate": 9.901664203302124e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.3217255771160126,
|
|
"reward_std": 0.5001529008150101,
|
|
"rewards/cosine_scaled_reward": -0.08105220319703221,
|
|
"rewards/format_reward": 0.4027777761220932,
|
|
"step": 48
|
|
},
|
|
{
|
|
"completion_length": 2650.09033203125,
|
|
"epoch": 0.1678082191780822,
|
|
"grad_norm": 0.13828006386756897,
|
|
"kl": 0.00315093994140625,
|
|
"learning_rate": 9.890480260828965e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.37053926289081573,
|
|
"reward_std": 0.40743280947208405,
|
|
"rewards/cosine_scaled_reward": -0.12251630332320929,
|
|
"rewards/format_reward": 0.4930555522441864,
|
|
"step": 49
|
|
},
|
|
{
|
|
"completion_length": 2785.84716796875,
|
|
"epoch": 0.17123287671232876,
|
|
"grad_norm": 0.16287721693515778,
|
|
"kl": 0.00333404541015625,
|
|
"learning_rate": 9.878701917609207e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.3723617196083069,
|
|
"reward_std": 0.4880683571100235,
|
|
"rewards/cosine_scaled_reward": -0.09291603974997997,
|
|
"rewards/format_reward": 0.4652777910232544,
|
|
"step": 50
|
|
},
|
|
{
|
|
"completion_length": 2878.1181640625,
|
|
"epoch": 0.17465753424657535,
|
|
"grad_norm": 0.13556380569934845,
|
|
"kl": 0.0032501220703125,
|
|
"learning_rate": 9.866330768241983e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.2918383926153183,
|
|
"reward_std": 0.3286859840154648,
|
|
"rewards/cosine_scaled_reward": -0.1109393835067749,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 51
|
|
},
|
|
{
|
|
"completion_length": 2817.638916015625,
|
|
"epoch": 0.1780821917808219,
|
|
"grad_norm": 0.16504673659801483,
|
|
"kl": 0.0037384033203125,
|
|
"learning_rate": 9.853368487582886e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.23075967282056808,
|
|
"reward_std": 0.3352076858282089,
|
|
"rewards/cosine_scaled_reward": -0.08868478238582611,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 52
|
|
},
|
|
{
|
|
"completion_length": 2916.1112060546875,
|
|
"epoch": 0.1815068493150685,
|
|
"grad_norm": 0.13884864747524261,
|
|
"kl": 0.00319671630859375,
|
|
"learning_rate": 9.839816830517225e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.28766703605651855,
|
|
"reward_std": 0.39385148882865906,
|
|
"rewards/cosine_scaled_reward": -0.09427741169929504,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 53
|
|
},
|
|
{
|
|
"completion_length": 2991.6041259765625,
|
|
"epoch": 0.18493150684931506,
|
|
"grad_norm": 0.1436394602060318,
|
|
"kl": 0.0031585693359375,
|
|
"learning_rate": 9.825677631722435e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.35537297278642654,
|
|
"reward_std": 0.39448249340057373,
|
|
"rewards/cosine_scaled_reward": -0.04740479774773121,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 54
|
|
},
|
|
{
|
|
"completion_length": 2785.173583984375,
|
|
"epoch": 0.18835616438356165,
|
|
"grad_norm": 0.18955738842487335,
|
|
"kl": 0.0048980712890625,
|
|
"learning_rate": 9.8109528054197e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.2277716025710106,
|
|
"reward_std": 0.4378567188978195,
|
|
"rewards/cosine_scaled_reward": -0.1263950616121292,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 55
|
|
},
|
|
{
|
|
"completion_length": 2579.388916015625,
|
|
"epoch": 0.1917808219178082,
|
|
"grad_norm": 0.1459513008594513,
|
|
"kl": 0.003570556640625,
|
|
"learning_rate": 9.795644345114794e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.35765571892261505,
|
|
"reward_std": 0.39826689660549164,
|
|
"rewards/cosine_scaled_reward": -0.11456651613116264,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 56
|
|
},
|
|
{
|
|
"completion_length": 2586.9305419921875,
|
|
"epoch": 0.1952054794520548,
|
|
"grad_norm": 0.20002588629722595,
|
|
"kl": 0.00665283203125,
|
|
"learning_rate": 9.779754323328192e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.3011641800403595,
|
|
"reward_std": 0.397590771317482,
|
|
"rewards/cosine_scaled_reward": -0.1641135960817337,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 57
|
|
},
|
|
{
|
|
"completion_length": 2786.3333740234375,
|
|
"epoch": 0.19863013698630136,
|
|
"grad_norm": 0.15498584508895874,
|
|
"kl": 0.00461578369140625,
|
|
"learning_rate": 9.76328489131448e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.3455113209784031,
|
|
"reward_std": 0.38071802258491516,
|
|
"rewards/cosine_scaled_reward": -0.043377578258514404,
|
|
"rewards/format_reward": 0.3888889029622078,
|
|
"step": 58
|
|
},
|
|
{
|
|
"completion_length": 2466.763916015625,
|
|
"epoch": 0.20205479452054795,
|
|
"grad_norm": 0.14243252575397491,
|
|
"kl": 0.0069580078125,
|
|
"learning_rate": 9.746238278771125e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.5345294326543808,
|
|
"reward_std": 0.5026020854711533,
|
|
"rewards/cosine_scaled_reward": -0.021026152186095715,
|
|
"rewards/format_reward": 0.5555555522441864,
|
|
"step": 59
|
|
},
|
|
{
|
|
"completion_length": 2819.2083740234375,
|
|
"epoch": 0.2054794520547945,
|
|
"grad_norm": 0.13467197120189667,
|
|
"kl": 0.005340576171875,
|
|
"learning_rate": 9.728616793536587e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.32512621581554413,
|
|
"reward_std": 0.3411554992198944,
|
|
"rewards/cosine_scaled_reward": -0.07070711255073547,
|
|
"rewards/format_reward": 0.3958333432674408,
|
|
"step": 60
|
|
},
|
|
{
|
|
"completion_length": 2741.6944580078125,
|
|
"epoch": 0.2089041095890411,
|
|
"grad_norm": 0.1468200534582138,
|
|
"kl": 0.0046539306640625,
|
|
"learning_rate": 9.71042282127789e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.4192769527435303,
|
|
"reward_std": 0.4629937559366226,
|
|
"rewards/cosine_scaled_reward": -0.10850081220269203,
|
|
"rewards/format_reward": 0.5277777910232544,
|
|
"step": 61
|
|
},
|
|
{
|
|
"completion_length": 2945.27783203125,
|
|
"epoch": 0.21232876712328766,
|
|
"grad_norm": 0.14704585075378418,
|
|
"kl": 0.00518798828125,
|
|
"learning_rate": 9.69165882516764e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.35519421100616455,
|
|
"reward_std": 0.49601832032203674,
|
|
"rewards/cosine_scaled_reward": -0.019805820658802986,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 62
|
|
},
|
|
{
|
|
"completion_length": 2946.166748046875,
|
|
"epoch": 0.21575342465753425,
|
|
"grad_norm": 0.14856047928333282,
|
|
"kl": 0.004547119140625,
|
|
"learning_rate": 9.672327345550543e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.14313821494579315,
|
|
"reward_std": 0.30480653047561646,
|
|
"rewards/cosine_scaled_reward": -0.16241733357310295,
|
|
"rewards/format_reward": 0.305555559694767,
|
|
"step": 63
|
|
},
|
|
{
|
|
"completion_length": 2631.2708740234375,
|
|
"epoch": 0.2191780821917808,
|
|
"grad_norm": 0.1544739007949829,
|
|
"kl": 0.0066070556640625,
|
|
"learning_rate": 9.65243099959949e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.3399986997246742,
|
|
"reward_std": 0.41406454145908356,
|
|
"rewards/cosine_scaled_reward": -0.0905568664893508,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 64
|
|
},
|
|
{
|
|
"completion_length": 2305.416748046875,
|
|
"epoch": 0.2226027397260274,
|
|
"grad_norm": 0.18752287328243256,
|
|
"kl": 0.011260986328125,
|
|
"learning_rate": 9.631972480961233e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.43726037442684174,
|
|
"reward_std": 0.25909677147865295,
|
|
"rewards/cosine_scaled_reward": -0.06273962743580341,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 65
|
|
},
|
|
{
|
|
"completion_length": 2983.9444580078125,
|
|
"epoch": 0.22602739726027396,
|
|
"grad_norm": 0.11286786943674088,
|
|
"kl": 0.00555419921875,
|
|
"learning_rate": 9.610954559391704e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.2669789642095566,
|
|
"reward_std": 0.39550328254699707,
|
|
"rewards/cosine_scaled_reward": -0.12190994247794151,
|
|
"rewards/format_reward": 0.3888888955116272,
|
|
"step": 66
|
|
},
|
|
{
|
|
"completion_length": 2817.4652099609375,
|
|
"epoch": 0.22945205479452055,
|
|
"grad_norm": 0.12778045237064362,
|
|
"kl": 0.006805419921875,
|
|
"learning_rate": 9.589380080381038e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.37828030437231064,
|
|
"reward_std": 0.372231587767601,
|
|
"rewards/cosine_scaled_reward": -0.05921970750205219,
|
|
"rewards/format_reward": 0.4375000149011612,
|
|
"step": 67
|
|
},
|
|
{
|
|
"completion_length": 2652.541748046875,
|
|
"epoch": 0.2328767123287671,
|
|
"grad_norm": 0.21021802723407745,
|
|
"kl": 0.009674072265625,
|
|
"learning_rate": 9.567251964768342e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.36411239206790924,
|
|
"reward_std": 0.4231058210134506,
|
|
"rewards/cosine_scaled_reward": -0.04560980945825577,
|
|
"rewards/format_reward": 0.4097222238779068,
|
|
"step": 68
|
|
},
|
|
{
|
|
"completion_length": 2930.09716796875,
|
|
"epoch": 0.2363013698630137,
|
|
"grad_norm": 0.13323834538459778,
|
|
"kl": 0.0063323974609375,
|
|
"learning_rate": 9.54457320834625e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.2518671154975891,
|
|
"reward_std": 0.44373343884944916,
|
|
"rewards/cosine_scaled_reward": -0.10924399271607399,
|
|
"rewards/format_reward": 0.361111119389534,
|
|
"step": 69
|
|
},
|
|
{
|
|
"completion_length": 2892.1041259765625,
|
|
"epoch": 0.23972602739726026,
|
|
"grad_norm": 0.1291056126356125,
|
|
"kl": 0.0074005126953125,
|
|
"learning_rate": 9.521346881455354e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.32329247891902924,
|
|
"reward_std": 0.2955201715230942,
|
|
"rewards/cosine_scaled_reward": -0.04476307414006442,
|
|
"rewards/format_reward": 0.3680555522441864,
|
|
"step": 70
|
|
},
|
|
{
|
|
"completion_length": 3238.2847900390625,
|
|
"epoch": 0.24315068493150685,
|
|
"grad_norm": 0.12446684390306473,
|
|
"kl": 0.0060577392578125,
|
|
"learning_rate": 9.497576128568518e-07,
|
|
"loss": 0.0002,
|
|
"reward": 0.127724077552557,
|
|
"reward_std": 0.38546572625637054,
|
|
"rewards/cosine_scaled_reward": -0.129220362752676,
|
|
"rewards/format_reward": 0.256944440305233,
|
|
"step": 71
|
|
},
|
|
{
|
|
"completion_length": 2898.52783203125,
|
|
"epoch": 0.2465753424657534,
|
|
"grad_norm": 0.12787607312202454,
|
|
"kl": 0.007537841796875,
|
|
"learning_rate": 9.473264167865171e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.21395007893443108,
|
|
"reward_std": 0.3605284094810486,
|
|
"rewards/cosine_scaled_reward": -0.14716103300452232,
|
|
"rewards/format_reward": 0.361111119389534,
|
|
"step": 72
|
|
},
|
|
{
|
|
"completion_length": 2566.1666259765625,
|
|
"epoch": 0.25,
|
|
"grad_norm": 0.16552697122097015,
|
|
"kl": 0.010162353515625,
|
|
"learning_rate": 9.448414290795618e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.5014020800590515,
|
|
"reward_std": 0.3501627743244171,
|
|
"rewards/cosine_scaled_reward": 0.015290968120098114,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 73
|
|
},
|
|
{
|
|
"completion_length": 2927.8125,
|
|
"epoch": 0.2534246575342466,
|
|
"grad_norm": 0.1325031816959381,
|
|
"kl": 0.007843017578125,
|
|
"learning_rate": 9.42302986163543e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.20612417813390493,
|
|
"reward_std": 0.38437697291374207,
|
|
"rewards/cosine_scaled_reward": -0.1758202537894249,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 74
|
|
},
|
|
{
|
|
"completion_length": 2712.4583740234375,
|
|
"epoch": 0.2568493150684932,
|
|
"grad_norm": 0.1481809765100479,
|
|
"kl": 0.0088043212890625,
|
|
"learning_rate": 9.397114317029974e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.46940816938877106,
|
|
"reward_std": 0.4842826575040817,
|
|
"rewards/cosine_scaled_reward": -0.002814057283103466,
|
|
"rewards/format_reward": 0.472222238779068,
|
|
"step": 75
|
|
},
|
|
{
|
|
"completion_length": 2792.3055419921875,
|
|
"epoch": 0.2602739726027397,
|
|
"grad_norm": 0.22191229462623596,
|
|
"kl": 0.0104827880859375,
|
|
"learning_rate": 9.370671165529144e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.3087327107787132,
|
|
"reward_std": 0.3974086493253708,
|
|
"rewards/cosine_scaled_reward": -0.07321174256503582,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 76
|
|
},
|
|
{
|
|
"completion_length": 2848.0625,
|
|
"epoch": 0.2636986301369863,
|
|
"grad_norm": 0.14868301153182983,
|
|
"kl": 0.009918212890625,
|
|
"learning_rate": 9.343703987112365e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.361818864941597,
|
|
"reward_std": 0.47596603631973267,
|
|
"rewards/cosine_scaled_reward": -0.09651443734765053,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 77
|
|
},
|
|
{
|
|
"completion_length": 3004.96533203125,
|
|
"epoch": 0.2671232876712329,
|
|
"grad_norm": 0.1331622451543808,
|
|
"kl": 0.007659912109375,
|
|
"learning_rate": 9.316216432703916e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.4262048453092575,
|
|
"reward_std": 0.38969628512859344,
|
|
"rewards/cosine_scaled_reward": 0.030371490865945816,
|
|
"rewards/format_reward": 0.3958333283662796,
|
|
"step": 78
|
|
},
|
|
{
|
|
"completion_length": 2514.673583984375,
|
|
"epoch": 0.2705479452054795,
|
|
"grad_norm": 0.15792632102966309,
|
|
"kl": 0.010833740234375,
|
|
"learning_rate": 9.288212223678658e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.517068013548851,
|
|
"reward_std": 0.4114367365837097,
|
|
"rewards/cosine_scaled_reward": -0.003765310626477003,
|
|
"rewards/format_reward": 0.5208333283662796,
|
|
"step": 79
|
|
},
|
|
{
|
|
"completion_length": 2929.9652099609375,
|
|
"epoch": 0.273972602739726,
|
|
"grad_norm": 0.16112865507602692,
|
|
"kl": 0.009185791015625,
|
|
"learning_rate": 9.259695151358214e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.22491584718227386,
|
|
"reward_std": 0.3951188027858734,
|
|
"rewards/cosine_scaled_reward": -0.11536196433007717,
|
|
"rewards/format_reward": 0.3402777910232544,
|
|
"step": 80
|
|
},
|
|
{
|
|
"completion_length": 2740.84716796875,
|
|
"epoch": 0.2773972602739726,
|
|
"grad_norm": 0.16842201352119446,
|
|
"kl": 0.012237548828125,
|
|
"learning_rate": 9.230669076497687e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.4160406142473221,
|
|
"reward_std": 0.47858820855617523,
|
|
"rewards/cosine_scaled_reward": -0.007570529356598854,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 81
|
|
},
|
|
{
|
|
"completion_length": 2845.5069580078125,
|
|
"epoch": 0.2808219178082192,
|
|
"grad_norm": 0.20275162160396576,
|
|
"kl": 0.009857177734375,
|
|
"learning_rate": 9.20113792876298e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.3008575513958931,
|
|
"reward_std": 0.46689480543136597,
|
|
"rewards/cosine_scaled_reward": -0.088031355291605,
|
|
"rewards/format_reward": 0.3888888955116272,
|
|
"step": 82
|
|
},
|
|
{
|
|
"completion_length": 2746.25,
|
|
"epoch": 0.2842465753424658,
|
|
"grad_norm": 0.186171293258667,
|
|
"kl": 0.013397216796875,
|
|
"learning_rate": 9.171105706198774e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.2251388169825077,
|
|
"reward_std": 0.49258220195770264,
|
|
"rewards/cosine_scaled_reward": -0.1568056344985962,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 83
|
|
},
|
|
{
|
|
"completion_length": 2873.1805419921875,
|
|
"epoch": 0.2876712328767123,
|
|
"grad_norm": 0.1654582917690277,
|
|
"kl": 0.00982666015625,
|
|
"learning_rate": 9.140576474687263e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.33728019893169403,
|
|
"reward_std": 0.4015253335237503,
|
|
"rewards/cosine_scaled_reward": -0.05855315364897251,
|
|
"rewards/format_reward": 0.3958333432674408,
|
|
"step": 84
|
|
},
|
|
{
|
|
"completion_length": 2642.9376220703125,
|
|
"epoch": 0.2910958904109589,
|
|
"grad_norm": 0.1457146555185318,
|
|
"kl": 0.01275634765625,
|
|
"learning_rate": 9.109554367397697e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.5559843331575394,
|
|
"reward_std": 0.39349929988384247,
|
|
"rewards/cosine_scaled_reward": 0.0004287753254175186,
|
|
"rewards/format_reward": 0.5555555522441864,
|
|
"step": 85
|
|
},
|
|
{
|
|
"completion_length": 2919.9027099609375,
|
|
"epoch": 0.2945205479452055,
|
|
"grad_norm": 0.12657979130744934,
|
|
"kl": 0.0092926025390625,
|
|
"learning_rate": 9.078043584226815e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.4835028350353241,
|
|
"reward_std": 0.4722355157136917,
|
|
"rewards/cosine_scaled_reward": -0.009552719071507454,
|
|
"rewards/format_reward": 0.4930555671453476,
|
|
"step": 86
|
|
},
|
|
{
|
|
"completion_length": 2724.104248046875,
|
|
"epoch": 0.2979452054794521,
|
|
"grad_norm": 0.16625893115997314,
|
|
"kl": 0.0166015625,
|
|
"learning_rate": 9.046048391230247e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.23857301846146584,
|
|
"reward_std": 0.30345526337623596,
|
|
"rewards/cosine_scaled_reward": -0.09476033598184586,
|
|
"rewards/format_reward": 0.3333333283662796,
|
|
"step": 87
|
|
},
|
|
{
|
|
"completion_length": 3015.486083984375,
|
|
"epoch": 0.3013698630136986,
|
|
"grad_norm": 0.15486519038677216,
|
|
"kl": 0.01129150390625,
|
|
"learning_rate": 9.013573120044966e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.197230139747262,
|
|
"reward_std": 0.4105876684188843,
|
|
"rewards/cosine_scaled_reward": -0.15693652629852295,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 88
|
|
},
|
|
{
|
|
"completion_length": 3045.7708740234375,
|
|
"epoch": 0.3047945205479452,
|
|
"grad_norm": 0.11665515601634979,
|
|
"kl": 0.00848388671875,
|
|
"learning_rate": 8.980622167302837e-07,
|
|
"loss": 0.0003,
|
|
"reward": 0.2519669234752655,
|
|
"reward_std": 0.37049752473831177,
|
|
"rewards/cosine_scaled_reward": -0.07442197389900684,
|
|
"rewards/format_reward": 0.3263888955116272,
|
|
"step": 89
|
|
},
|
|
{
|
|
"completion_length": 2415.263916015625,
|
|
"epoch": 0.3082191780821918,
|
|
"grad_norm": 0.14136534929275513,
|
|
"kl": 0.01422119140625,
|
|
"learning_rate": 8.9471999940354e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.4783553332090378,
|
|
"reward_std": 0.37421566247940063,
|
|
"rewards/cosine_scaled_reward": -0.05636689253151417,
|
|
"rewards/format_reward": 0.5347222238779068,
|
|
"step": 90
|
|
},
|
|
{
|
|
"completion_length": 3079.4166259765625,
|
|
"epoch": 0.3116438356164384,
|
|
"grad_norm": 0.1431264877319336,
|
|
"kl": 0.011749267578125,
|
|
"learning_rate": 8.91331112506991e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.1510103940963745,
|
|
"reward_std": 0.4773586541414261,
|
|
"rewards/cosine_scaled_reward": -0.1684340313076973,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 91
|
|
},
|
|
{
|
|
"completion_length": 2879.6458740234375,
|
|
"epoch": 0.3150684931506849,
|
|
"grad_norm": 0.16839326918125153,
|
|
"kl": 0.013824462890625,
|
|
"learning_rate": 8.878960148416747e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.36638200283050537,
|
|
"reward_std": 0.4755648225545883,
|
|
"rewards/cosine_scaled_reward": -0.07111799996346235,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 92
|
|
},
|
|
{
|
|
"completion_length": 2882.326416015625,
|
|
"epoch": 0.3184931506849315,
|
|
"grad_norm": 0.14426778256893158,
|
|
"kl": 0.01336669921875,
|
|
"learning_rate": 8.844151714648274e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.41798925399780273,
|
|
"reward_std": 0.3813701719045639,
|
|
"rewards/cosine_scaled_reward": 0.015211460180580616,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 93
|
|
},
|
|
{
|
|
"completion_length": 2881.97216796875,
|
|
"epoch": 0.3219178082191781,
|
|
"grad_norm": 0.13846907019615173,
|
|
"kl": 0.012542724609375,
|
|
"learning_rate": 8.808890536269229e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.3407672867178917,
|
|
"reward_std": 0.5038859099149704,
|
|
"rewards/cosine_scaled_reward": -0.05506602302193642,
|
|
"rewards/format_reward": 0.3958333432674408,
|
|
"step": 94
|
|
},
|
|
{
|
|
"completion_length": 3092.4375,
|
|
"epoch": 0.3253424657534247,
|
|
"grad_norm": 0.1459702104330063,
|
|
"kl": 0.011962890625,
|
|
"learning_rate": 8.773181387078719e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.0870569609105587,
|
|
"reward_std": 0.36285020411014557,
|
|
"rewards/cosine_scaled_reward": -0.1837763711810112,
|
|
"rewards/format_reward": 0.2708333283662796,
|
|
"step": 95
|
|
},
|
|
{
|
|
"completion_length": 2850.486083984375,
|
|
"epoch": 0.3287671232876712,
|
|
"grad_norm": 0.16794843971729279,
|
|
"kl": 0.016632080078125,
|
|
"learning_rate": 8.737029101523929e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.2914813682436943,
|
|
"reward_std": 0.40039965510368347,
|
|
"rewards/cosine_scaled_reward": -0.12518527917563915,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 96
|
|
},
|
|
{
|
|
"completion_length": 2382.861083984375,
|
|
"epoch": 0.3321917808219178,
|
|
"grad_norm": 0.17770880460739136,
|
|
"kl": 0.01910400390625,
|
|
"learning_rate": 8.700438574045617e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.5590852797031403,
|
|
"reward_std": 0.3997005224227905,
|
|
"rewards/cosine_scaled_reward": -0.010359160602092743,
|
|
"rewards/format_reward": 0.5694444477558136,
|
|
"step": 97
|
|
},
|
|
{
|
|
"completion_length": 2623.6181640625,
|
|
"epoch": 0.3356164383561644,
|
|
"grad_norm": 0.14203575253486633,
|
|
"kl": 0.0145263671875,
|
|
"learning_rate": 8.663414758415478e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.4303872585296631,
|
|
"reward_std": 0.41416965425014496,
|
|
"rewards/cosine_scaled_reward": -0.055723853409290314,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 98
|
|
},
|
|
{
|
|
"completion_length": 2726.8056640625,
|
|
"epoch": 0.339041095890411,
|
|
"grad_norm": 0.14974938333034515,
|
|
"kl": 0.014617919921875,
|
|
"learning_rate": 8.625962667065487e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.5150108188390732,
|
|
"reward_std": 0.42872530221939087,
|
|
"rewards/cosine_scaled_reward": 0.02889970690011978,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 99
|
|
},
|
|
{
|
|
"completion_length": 2724.8472900390625,
|
|
"epoch": 0.3424657534246575,
|
|
"grad_norm": 0.15371288359165192,
|
|
"kl": 0.015380859375,
|
|
"learning_rate": 8.588087370409302e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.31959572434425354,
|
|
"reward_std": 0.4421093314886093,
|
|
"rewards/cosine_scaled_reward": -0.06234869919717312,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 100
|
|
},
|
|
{
|
|
"completion_length": 2685.0069580078125,
|
|
"epoch": 0.3458904109589041,
|
|
"grad_norm": 0.15011949837207794,
|
|
"kl": 0.017578125,
|
|
"learning_rate": 8.549793996155795e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.28529858589172363,
|
|
"reward_std": 0.3051328808069229,
|
|
"rewards/cosine_scaled_reward": -0.14525696635246277,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 101
|
|
},
|
|
{
|
|
"completion_length": 2744.3056640625,
|
|
"epoch": 0.3493150684931507,
|
|
"grad_norm": 0.16087760031223297,
|
|
"kl": 0.015289306640625,
|
|
"learning_rate": 8.511087728614862e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.4498697370290756,
|
|
"reward_std": 0.459157794713974,
|
|
"rewards/cosine_scaled_reward": 0.01931417128071189,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 102
|
|
},
|
|
{
|
|
"completion_length": 2870.5208740234375,
|
|
"epoch": 0.3527397260273973,
|
|
"grad_norm": 0.1468544751405716,
|
|
"kl": 0.01470947265625,
|
|
"learning_rate": 8.471973807995534e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.3780040740966797,
|
|
"reward_std": 0.5862710475921631,
|
|
"rewards/cosine_scaled_reward": -0.08727369178086519,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 103
|
|
},
|
|
{
|
|
"completion_length": 2846.5069580078125,
|
|
"epoch": 0.3561643835616438,
|
|
"grad_norm": 0.1548270583152771,
|
|
"kl": 0.018310546875,
|
|
"learning_rate": 8.432457529696548e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.35155677795410156,
|
|
"reward_std": 0.4553772062063217,
|
|
"rewards/cosine_scaled_reward": -0.0720543134957552,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 104
|
|
},
|
|
{
|
|
"completion_length": 2724.076416015625,
|
|
"epoch": 0.3595890410958904,
|
|
"grad_norm": 0.1372641921043396,
|
|
"kl": 0.01708984375,
|
|
"learning_rate": 8.392544243589427e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.23305706679821014,
|
|
"reward_std": 0.36483894288539886,
|
|
"rewards/cosine_scaled_reward": -0.16277626901865005,
|
|
"rewards/format_reward": 0.3958333283662796,
|
|
"step": 105
|
|
},
|
|
{
|
|
"completion_length": 2911.5069580078125,
|
|
"epoch": 0.363013698630137,
|
|
"grad_norm": 0.173682302236557,
|
|
"kl": 0.0185546875,
|
|
"learning_rate": 8.352239353294194e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.3065890637226403,
|
|
"reward_std": 0.30855099856853485,
|
|
"rewards/cosine_scaled_reward": -0.03368869423866272,
|
|
"rewards/format_reward": 0.3402777761220932,
|
|
"step": 106
|
|
},
|
|
{
|
|
"completion_length": 3053.5555419921875,
|
|
"epoch": 0.3664383561643836,
|
|
"grad_norm": 0.1345294862985611,
|
|
"kl": 0.01544189453125,
|
|
"learning_rate": 8.31154831544782e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.28517407923936844,
|
|
"reward_std": 0.4504627585411072,
|
|
"rewards/cosine_scaled_reward": -0.11065925285220146,
|
|
"rewards/format_reward": 0.3958333283662796,
|
|
"step": 107
|
|
},
|
|
{
|
|
"completion_length": 3053.4583740234375,
|
|
"epoch": 0.3698630136986301,
|
|
"grad_norm": 0.12478786706924438,
|
|
"kl": 0.015869140625,
|
|
"learning_rate": 8.270476638965461e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.18328540516085923,
|
|
"reward_std": 0.31476570665836334,
|
|
"rewards/cosine_scaled_reward": -0.14310350455343723,
|
|
"rewards/format_reward": 0.3263888880610466,
|
|
"step": 108
|
|
},
|
|
{
|
|
"completion_length": 2813.4444580078125,
|
|
"epoch": 0.3732876712328767,
|
|
"grad_norm": 0.13302823901176453,
|
|
"kl": 0.013427734375,
|
|
"learning_rate": 8.229029884294662e-07,
|
|
"loss": 0.0005,
|
|
"reward": 0.4454272836446762,
|
|
"reward_std": 0.4187168627977371,
|
|
"rewards/cosine_scaled_reward": 0.0009828601032495499,
|
|
"rewards/format_reward": 0.4444444477558136,
|
|
"step": 109
|
|
},
|
|
{
|
|
"completion_length": 3167.1875,
|
|
"epoch": 0.3767123287671233,
|
|
"grad_norm": 0.13403479754924774,
|
|
"kl": 0.013946533203125,
|
|
"learning_rate": 8.187213662662538e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.12785961106419563,
|
|
"reward_std": 0.4516500234603882,
|
|
"rewards/cosine_scaled_reward": -0.19158484041690826,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 110
|
|
},
|
|
{
|
|
"completion_length": 3124.513916015625,
|
|
"epoch": 0.3801369863013699,
|
|
"grad_norm": 0.13928255438804626,
|
|
"kl": 0.0164794921875,
|
|
"learning_rate": 8.145033635316128e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.1088075079023838,
|
|
"reward_std": 0.40094128996133804,
|
|
"rewards/cosine_scaled_reward": -0.1828591525554657,
|
|
"rewards/format_reward": 0.291666679084301,
|
|
"step": 111
|
|
},
|
|
{
|
|
"completion_length": 3192.8194580078125,
|
|
"epoch": 0.3835616438356164,
|
|
"grad_norm": 0.12524190545082092,
|
|
"kl": 0.015838623046875,
|
|
"learning_rate": 8.102495512755938e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.16609879583120346,
|
|
"reward_std": 0.4136479049921036,
|
|
"rewards/cosine_scaled_reward": -0.18806789070367813,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 112
|
|
},
|
|
{
|
|
"completion_length": 3000.625,
|
|
"epoch": 0.386986301369863,
|
|
"grad_norm": 0.15665502846240997,
|
|
"kl": 0.01593017578125,
|
|
"learning_rate": 8.059605053962833e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.34966103732585907,
|
|
"reward_std": 0.5462008714675903,
|
|
"rewards/cosine_scaled_reward": -0.025338975712656975,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 113
|
|
},
|
|
{
|
|
"completion_length": 3111.944580078125,
|
|
"epoch": 0.3904109589041096,
|
|
"grad_norm": 0.12206412851810455,
|
|
"kl": 0.0172119140625,
|
|
"learning_rate": 8.01636806561836e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.3253851607441902,
|
|
"reward_std": 0.46508027613162994,
|
|
"rewards/cosine_scaled_reward": -0.021837057545781136,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 114
|
|
},
|
|
{
|
|
"completion_length": 2817.8333740234375,
|
|
"epoch": 0.3938356164383562,
|
|
"grad_norm": 0.15154601633548737,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 7.972790401318627e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.19523771665990353,
|
|
"reward_std": 0.2818225920200348,
|
|
"rewards/cosine_scaled_reward": -0.17281781509518623,
|
|
"rewards/format_reward": 0.368055559694767,
|
|
"step": 115
|
|
},
|
|
{
|
|
"completion_length": 3147.47216796875,
|
|
"epoch": 0.3972602739726027,
|
|
"grad_norm": 0.11532028764486313,
|
|
"kl": 0.013916015625,
|
|
"learning_rate": 7.928877960781808e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.21391746401786804,
|
|
"reward_std": 0.4905927777290344,
|
|
"rewards/cosine_scaled_reward": -0.0777492057532072,
|
|
"rewards/format_reward": 0.2916666716337204,
|
|
"step": 116
|
|
},
|
|
{
|
|
"completion_length": 2670.5625,
|
|
"epoch": 0.4006849315068493,
|
|
"grad_norm": 0.16773977875709534,
|
|
"kl": 0.0164794921875,
|
|
"learning_rate": 7.884636689049422e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.4450993984937668,
|
|
"reward_std": 0.4119822680950165,
|
|
"rewards/cosine_scaled_reward": -0.08962283562868834,
|
|
"rewards/format_reward": 0.5347222238779068,
|
|
"step": 117
|
|
},
|
|
{
|
|
"completion_length": 3084.826416015625,
|
|
"epoch": 0.4041095890410959,
|
|
"grad_norm": 0.13086926937103271,
|
|
"kl": 0.0169677734375,
|
|
"learning_rate": 7.840072575681468e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.32114290446043015,
|
|
"reward_std": 0.3112673908472061,
|
|
"rewards/cosine_scaled_reward": -0.033023773692548275,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 118
|
|
},
|
|
{
|
|
"completion_length": 3206.5625,
|
|
"epoch": 0.4075342465753425,
|
|
"grad_norm": 0.1529916226863861,
|
|
"kl": 0.02081298828125,
|
|
"learning_rate": 7.795191653945538e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.08010175824165344,
|
|
"reward_std": 0.2550045773386955,
|
|
"rewards/cosine_scaled_reward": -0.14906491339206696,
|
|
"rewards/format_reward": 0.2291666641831398,
|
|
"step": 119
|
|
},
|
|
{
|
|
"completion_length": 2953.4097900390625,
|
|
"epoch": 0.410958904109589,
|
|
"grad_norm": 0.1457367241382599,
|
|
"kl": 0.01806640625,
|
|
"learning_rate": 7.75e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.4063476175069809,
|
|
"reward_std": 0.4592422544956207,
|
|
"rewards/cosine_scaled_reward": -0.024207940325140953,
|
|
"rewards/format_reward": 0.4305555671453476,
|
|
"step": 120
|
|
},
|
|
{
|
|
"completion_length": 2701.7569580078125,
|
|
"epoch": 0.4143835616438356,
|
|
"grad_norm": 0.16445693373680115,
|
|
"kl": 0.0205078125,
|
|
"learning_rate": 7.704503732071391e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.5053235739469528,
|
|
"reward_std": 0.5655853599309921,
|
|
"rewards/cosine_scaled_reward": -0.015509757213294506,
|
|
"rewards/format_reward": 0.5208333432674408,
|
|
"step": 121
|
|
},
|
|
{
|
|
"completion_length": 3087.791748046875,
|
|
"epoch": 0.4178082191780822,
|
|
"grad_norm": 0.11365609616041183,
|
|
"kl": 0.015625,
|
|
"learning_rate": 7.658709009626109e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.6354653835296631,
|
|
"reward_std": 0.5989937484264374,
|
|
"rewards/cosine_scaled_reward": 0.16324318200349808,
|
|
"rewards/format_reward": 0.472222238779068,
|
|
"step": 122
|
|
},
|
|
{
|
|
"completion_length": 2501.3958740234375,
|
|
"epoch": 0.4212328767123288,
|
|
"grad_norm": 0.14529232680797577,
|
|
"kl": 0.0194091796875,
|
|
"learning_rate": 7.612622032536507e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.4810824617743492,
|
|
"reward_std": 0.4910588413476944,
|
|
"rewards/cosine_scaled_reward": -0.06058423314243555,
|
|
"rewards/format_reward": 0.5416666567325592,
|
|
"step": 123
|
|
},
|
|
{
|
|
"completion_length": 3121.854248046875,
|
|
"epoch": 0.4246575342465753,
|
|
"grad_norm": 0.11533054709434509,
|
|
"kl": 0.0169677734375,
|
|
"learning_rate": 7.566249040241553e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.26003750413656235,
|
|
"reward_std": 0.44601309299468994,
|
|
"rewards/cosine_scaled_reward": -0.0732958409935236,
|
|
"rewards/format_reward": 0.3333333283662796,
|
|
"step": 124
|
|
},
|
|
{
|
|
"completion_length": 2981.65283203125,
|
|
"epoch": 0.4280821917808219,
|
|
"grad_norm": 0.14013110101222992,
|
|
"kl": 0.0194091796875,
|
|
"learning_rate": 7.51959631090208e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.17028066888451576,
|
|
"reward_std": 0.35746677219867706,
|
|
"rewards/cosine_scaled_reward": -0.16305266320705414,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 125
|
|
},
|
|
{
|
|
"completion_length": 3021.8055419921875,
|
|
"epoch": 0.4315068493150685,
|
|
"grad_norm": 0.12227875739336014,
|
|
"kl": 0.01800537109375,
|
|
"learning_rate": 7.472670160550848e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.35344888269901276,
|
|
"reward_std": 0.3790488839149475,
|
|
"rewards/cosine_scaled_reward": -0.0007177963852882385,
|
|
"rewards/format_reward": 0.3541666716337204,
|
|
"step": 126
|
|
},
|
|
{
|
|
"completion_length": 3308.8055419921875,
|
|
"epoch": 0.4349315068493151,
|
|
"grad_norm": 0.13628166913986206,
|
|
"kl": 0.01708984375,
|
|
"learning_rate": 7.425476942237444e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.2540893331170082,
|
|
"reward_std": 0.47945114970207214,
|
|
"rewards/cosine_scaled_reward": -0.07229956053197384,
|
|
"rewards/format_reward": 0.326388880610466,
|
|
"step": 127
|
|
},
|
|
{
|
|
"completion_length": 2813.6666259765625,
|
|
"epoch": 0.4383561643835616,
|
|
"grad_norm": 0.15076382458209991,
|
|
"kl": 0.019775390625,
|
|
"learning_rate": 7.37802304516818e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.47455114126205444,
|
|
"reward_std": 0.4427139610052109,
|
|
"rewards/cosine_scaled_reward": -0.03933773934841156,
|
|
"rewards/format_reward": 0.5138888955116272,
|
|
"step": 128
|
|
},
|
|
{
|
|
"completion_length": 2989.52783203125,
|
|
"epoch": 0.4417808219178082,
|
|
"grad_norm": 0.14209668338298798,
|
|
"kl": 0.01824951171875,
|
|
"learning_rate": 7.330314893841101e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.3041275106370449,
|
|
"reward_std": 0.4487529695034027,
|
|
"rewards/cosine_scaled_reward": -0.09865027293562889,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 129
|
|
},
|
|
{
|
|
"completion_length": 2724.0556640625,
|
|
"epoch": 0.4452054794520548,
|
|
"grad_norm": 0.13135673105716705,
|
|
"kl": 0.02044677734375,
|
|
"learning_rate": 7.282358947176205e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.4611601382493973,
|
|
"reward_std": 0.44737473130226135,
|
|
"rewards/cosine_scaled_reward": -0.059673219453543425,
|
|
"rewards/format_reward": 0.520833358168602,
|
|
"step": 130
|
|
},
|
|
{
|
|
"completion_length": 2713.166748046875,
|
|
"epoch": 0.4486301369863014,
|
|
"grad_norm": 0.14880216121673584,
|
|
"kl": 0.0208740234375,
|
|
"learning_rate": 7.234161697641017e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.5555524080991745,
|
|
"reward_std": 0.3877111077308655,
|
|
"rewards/cosine_scaled_reward": 0.05555241275578737,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 131
|
|
},
|
|
{
|
|
"completion_length": 3061.0625,
|
|
"epoch": 0.4520547945205479,
|
|
"grad_norm": 0.12646687030792236,
|
|
"kl": 0.02032470703125,
|
|
"learning_rate": 7.185729670371604e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.15745187550783157,
|
|
"reward_std": 0.23714770376682281,
|
|
"rewards/cosine_scaled_reward": -0.15504813194274902,
|
|
"rewards/format_reward": 0.3125,
|
|
"step": 132
|
|
},
|
|
{
|
|
"completion_length": 3232.3472900390625,
|
|
"epoch": 0.4554794520547945,
|
|
"grad_norm": 0.10323884338140488,
|
|
"kl": 0.0181884765625,
|
|
"learning_rate": 7.137069422289181e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.2314557433128357,
|
|
"reward_std": 0.39915989339351654,
|
|
"rewards/cosine_scaled_reward": -0.08104425063356757,
|
|
"rewards/format_reward": 0.3125000149011612,
|
|
"step": 133
|
|
},
|
|
{
|
|
"completion_length": 3193.8612060546875,
|
|
"epoch": 0.4589041095890411,
|
|
"grad_norm": 0.1457975059747696,
|
|
"kl": 0.02178955078125,
|
|
"learning_rate": 7.08818754121241e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.03552616201341152,
|
|
"reward_std": 0.36521604657173157,
|
|
"rewards/cosine_scaled_reward": -0.2630849555134773,
|
|
"rewards/format_reward": 0.298611119389534,
|
|
"step": 134
|
|
},
|
|
{
|
|
"completion_length": 3036.7291259765625,
|
|
"epoch": 0.4623287671232877,
|
|
"grad_norm": 0.12449899315834045,
|
|
"kl": 0.01971435546875,
|
|
"learning_rate": 7.039090644965509e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.6097220778465271,
|
|
"reward_std": 0.6045728027820587,
|
|
"rewards/cosine_scaled_reward": 0.10972205176949501,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 135
|
|
},
|
|
{
|
|
"completion_length": 2624.75,
|
|
"epoch": 0.4657534246575342,
|
|
"grad_norm": 0.15769356489181519,
|
|
"kl": 0.02471923828125,
|
|
"learning_rate": 6.989785380482312e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.4681246876716614,
|
|
"reward_std": 0.3646779954433441,
|
|
"rewards/cosine_scaled_reward": -0.052708632312715054,
|
|
"rewards/format_reward": 0.520833358168602,
|
|
"step": 136
|
|
},
|
|
{
|
|
"completion_length": 2946.8333740234375,
|
|
"epoch": 0.4691780821917808,
|
|
"grad_norm": 0.13552969694137573,
|
|
"kl": 0.02203369140625,
|
|
"learning_rate": 6.940278422906372e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.4625835418701172,
|
|
"reward_std": 0.4559210389852524,
|
|
"rewards/cosine_scaled_reward": 0.004250235855579376,
|
|
"rewards/format_reward": 0.4583333283662796,
|
|
"step": 137
|
|
},
|
|
{
|
|
"completion_length": 3224.576416015625,
|
|
"epoch": 0.4726027397260274,
|
|
"grad_norm": 0.1137109324336052,
|
|
"kl": 0.01885986328125,
|
|
"learning_rate": 6.890576474687263e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.27407026663422585,
|
|
"reward_std": 0.42793411016464233,
|
|
"rewards/cosine_scaled_reward": -0.03148530051112175,
|
|
"rewards/format_reward": 0.305555559694767,
|
|
"step": 138
|
|
},
|
|
{
|
|
"completion_length": 3156.84033203125,
|
|
"epoch": 0.476027397260274,
|
|
"grad_norm": 0.13348160684108734,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 6.840686264673168e-07,
|
|
"loss": 0.0007,
|
|
"reward": 0.2564939334988594,
|
|
"reward_std": 0.4258010536432266,
|
|
"rewards/cosine_scaled_reward": -0.06295053288340569,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 139
|
|
},
|
|
{
|
|
"completion_length": 3052.3125,
|
|
"epoch": 0.4794520547945205,
|
|
"grad_norm": 0.1428801566362381,
|
|
"kl": 0.021240234375,
|
|
"learning_rate": 6.790614547199906e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.3512444347143173,
|
|
"reward_std": 0.404025673866272,
|
|
"rewards/cosine_scaled_reward": -0.016811135224997997,
|
|
"rewards/format_reward": 0.3680555671453476,
|
|
"step": 140
|
|
},
|
|
{
|
|
"completion_length": 2792.486083984375,
|
|
"epoch": 0.4828767123287671,
|
|
"grad_norm": 0.13267238438129425,
|
|
"kl": 0.02069091796875,
|
|
"learning_rate": 6.740368101176495e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.6453568935394287,
|
|
"reward_std": 0.5250414609909058,
|
|
"rewards/cosine_scaled_reward": 0.11063468037173152,
|
|
"rewards/format_reward": 0.5347222238779068,
|
|
"step": 141
|
|
},
|
|
{
|
|
"completion_length": 2989.0972900390625,
|
|
"epoch": 0.4863013698630137,
|
|
"grad_norm": 0.12677060067653656,
|
|
"kl": 0.021484375,
|
|
"learning_rate": 6.68995372916741e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.4786522537469864,
|
|
"reward_std": 0.5605998337268829,
|
|
"rewards/cosine_scaled_reward": 0.013374458998441696,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 142
|
|
},
|
|
{
|
|
"completion_length": 3084.6180419921875,
|
|
"epoch": 0.4897260273972603,
|
|
"grad_norm": 0.12558940052986145,
|
|
"kl": 0.0198974609375,
|
|
"learning_rate": 6.639378256471608e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.3289758712053299,
|
|
"reward_std": 0.45005667209625244,
|
|
"rewards/cosine_scaled_reward": -0.018246358260512352,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 143
|
|
},
|
|
{
|
|
"completion_length": 3120.8541259765625,
|
|
"epoch": 0.4931506849315068,
|
|
"grad_norm": 0.13771818578243256,
|
|
"kl": 0.02197265625,
|
|
"learning_rate": 6.588648530198504e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.1804770578892203,
|
|
"reward_std": 0.40222403407096863,
|
|
"rewards/cosine_scaled_reward": -0.11813406273722649,
|
|
"rewards/format_reward": 0.2986111119389534,
|
|
"step": 144
|
|
},
|
|
{
|
|
"completion_length": 3007.013916015625,
|
|
"epoch": 0.4965753424657534,
|
|
"grad_norm": 0.19792306423187256,
|
|
"kl": 0.031005859375,
|
|
"learning_rate": 6.537771418340981e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.19204921275377274,
|
|
"reward_std": 0.30794692039489746,
|
|
"rewards/cosine_scaled_reward": -0.09961747378110886,
|
|
"rewards/format_reward": 0.2916666567325592,
|
|
"step": 145
|
|
},
|
|
{
|
|
"completion_length": 3020.9306640625,
|
|
"epoch": 0.5,
|
|
"grad_norm": 0.13727493584156036,
|
|
"kl": 0.0220947265625,
|
|
"learning_rate": 6.486753808845564e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.16832835972309113,
|
|
"reward_std": 0.5160266309976578,
|
|
"rewards/cosine_scaled_reward": -0.15806053578853607,
|
|
"rewards/format_reward": 0.3263888955116272,
|
|
"step": 146
|
|
},
|
|
{
|
|
"completion_length": 2718.4097900390625,
|
|
"epoch": 0.5034246575342466,
|
|
"grad_norm": 0.11808720976114273,
|
|
"kl": 0.021484375,
|
|
"learning_rate": 6.435602608679916e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.5802240371704102,
|
|
"reward_std": 0.44783809781074524,
|
|
"rewards/cosine_scaled_reward": 0.017724037170410156,
|
|
"rewards/format_reward": 0.5625000149011612,
|
|
"step": 147
|
|
},
|
|
{
|
|
"completion_length": 2859.6458740234375,
|
|
"epoch": 0.5068493150684932,
|
|
"grad_norm": 0.21733467280864716,
|
|
"kl": 0.03009033203125,
|
|
"learning_rate": 6.384324742897735e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.4155062139034271,
|
|
"reward_std": 0.4282771795988083,
|
|
"rewards/cosine_scaled_reward": 0.005784010514616966,
|
|
"rewards/format_reward": 0.4097222238779068,
|
|
"step": 148
|
|
},
|
|
{
|
|
"completion_length": 2950.701416015625,
|
|
"epoch": 0.5102739726027398,
|
|
"grad_norm": 0.10564743727445602,
|
|
"kl": 0.02130126953125,
|
|
"learning_rate": 6.332927153701215e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.6034330129623413,
|
|
"reward_std": 0.47583654522895813,
|
|
"rewards/cosine_scaled_reward": 0.1103774681687355,
|
|
"rewards/format_reward": 0.4930555671453476,
|
|
"step": 149
|
|
},
|
|
{
|
|
"completion_length": 3038.013916015625,
|
|
"epoch": 0.5136986301369864,
|
|
"grad_norm": 0.12356794625520706,
|
|
"kl": 0.02337646484375,
|
|
"learning_rate": 6.281416799501187e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.577631801366806,
|
|
"reward_std": 0.47365154325962067,
|
|
"rewards/cosine_scaled_reward": 0.13318736106157303,
|
|
"rewards/format_reward": 0.4444444477558136,
|
|
"step": 150
|
|
},
|
|
{
|
|
"completion_length": 3258.90966796875,
|
|
"epoch": 0.5171232876712328,
|
|
"grad_norm": 0.13418567180633545,
|
|
"kl": 0.0211181640625,
|
|
"learning_rate": 6.229800653975054e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.3129318729043007,
|
|
"reward_std": 0.5764759629964828,
|
|
"rewards/cosine_scaled_reward": -0.0551237054169178,
|
|
"rewards/format_reward": 0.3680555671453476,
|
|
"step": 151
|
|
},
|
|
{
|
|
"completion_length": 2671.361083984375,
|
|
"epoch": 0.5205479452054794,
|
|
"grad_norm": 0.1966993510723114,
|
|
"kl": 0.0272216796875,
|
|
"learning_rate": 6.178085705122674e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.5614952445030212,
|
|
"reward_std": 0.43348051607608795,
|
|
"rewards/cosine_scaled_reward": 0.040661935694515705,
|
|
"rewards/format_reward": 0.5208333432674408,
|
|
"step": 152
|
|
},
|
|
{
|
|
"completion_length": 3149.1666259765625,
|
|
"epoch": 0.523972602739726,
|
|
"grad_norm": 0.12545958161354065,
|
|
"kl": 0.0216064453125,
|
|
"learning_rate": 6.126278954320294e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.15634628385305405,
|
|
"reward_std": 0.38860486447811127,
|
|
"rewards/cosine_scaled_reward": -0.17698704451322556,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 153
|
|
},
|
|
{
|
|
"completion_length": 3038.9930419921875,
|
|
"epoch": 0.5273972602739726,
|
|
"grad_norm": 0.15269039571285248,
|
|
"kl": 0.0208740234375,
|
|
"learning_rate": 6.074387415372676e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.38375401496887207,
|
|
"reward_std": 0.48632751405239105,
|
|
"rewards/cosine_scaled_reward": -0.019023781642317772,
|
|
"rewards/format_reward": 0.4027777761220932,
|
|
"step": 154
|
|
},
|
|
{
|
|
"completion_length": 3070.8680419921875,
|
|
"epoch": 0.5308219178082192,
|
|
"grad_norm": 0.11029084771871567,
|
|
"kl": 0.02392578125,
|
|
"learning_rate": 6.022418113563535e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.26838141679763794,
|
|
"reward_std": 0.3721802681684494,
|
|
"rewards/cosine_scaled_reward": -0.07189634721726179,
|
|
"rewards/format_reward": 0.3402777910232544,
|
|
"step": 155
|
|
},
|
|
{
|
|
"completion_length": 3024.270751953125,
|
|
"epoch": 0.5342465753424658,
|
|
"grad_norm": 0.1171901524066925,
|
|
"kl": 0.02239990234375,
|
|
"learning_rate": 5.97037808470444e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.36915363371372223,
|
|
"reward_std": 0.40745308995246887,
|
|
"rewards/cosine_scaled_reward": 0.021931427530944347,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 156
|
|
},
|
|
{
|
|
"completion_length": 2957.673583984375,
|
|
"epoch": 0.5376712328767124,
|
|
"grad_norm": 0.14073632657527924,
|
|
"kl": 0.0245361328125,
|
|
"learning_rate": 5.918274374182266e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.18879153579473495,
|
|
"reward_std": 0.31644490361213684,
|
|
"rewards/cosine_scaled_reward": -0.14454180747270584,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 157
|
|
},
|
|
{
|
|
"completion_length": 2745.4306640625,
|
|
"epoch": 0.541095890410959,
|
|
"grad_norm": 0.15367640554904938,
|
|
"kl": 0.02471923828125,
|
|
"learning_rate": 5.866114036005362e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.647605836391449,
|
|
"reward_std": 0.45094963908195496,
|
|
"rewards/cosine_scaled_reward": 0.12677249684929848,
|
|
"rewards/format_reward": 0.5208333432674408,
|
|
"step": 158
|
|
},
|
|
{
|
|
"completion_length": 2848.479248046875,
|
|
"epoch": 0.5445205479452054,
|
|
"grad_norm": 0.14879484474658966,
|
|
"kl": 0.026123046875,
|
|
"learning_rate": 5.813904131848564e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.31949036195874214,
|
|
"reward_std": 0.43667902052402496,
|
|
"rewards/cosine_scaled_reward": -0.10412076953798532,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 159
|
|
},
|
|
{
|
|
"completion_length": 2885.5416259765625,
|
|
"epoch": 0.547945205479452,
|
|
"grad_norm": 0.12330173701047897,
|
|
"kl": 0.02471923828125,
|
|
"learning_rate": 5.761651730097142e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.47639837861061096,
|
|
"reward_std": 0.5159202069044113,
|
|
"rewards/cosine_scaled_reward": -0.00276830792427063,
|
|
"rewards/format_reward": 0.4791666716337204,
|
|
"step": 160
|
|
},
|
|
{
|
|
"completion_length": 3260.416748046875,
|
|
"epoch": 0.5513698630136986,
|
|
"grad_norm": 0.1258607804775238,
|
|
"kl": 0.02301025390625,
|
|
"learning_rate": 5.709363904889861e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.3024430572986603,
|
|
"reward_std": 0.6272812485694885,
|
|
"rewards/cosine_scaled_reward": -0.01700139231979847,
|
|
"rewards/format_reward": 0.3194444477558136,
|
|
"step": 161
|
|
},
|
|
{
|
|
"completion_length": 3207.2083740234375,
|
|
"epoch": 0.5547945205479452,
|
|
"grad_norm": 0.12467379122972488,
|
|
"kl": 0.02398681640625,
|
|
"learning_rate": 5.657047735161255e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.20557049103081226,
|
|
"reward_std": 0.3906491547822952,
|
|
"rewards/cosine_scaled_reward": -0.07220727764070034,
|
|
"rewards/format_reward": 0.2777777761220932,
|
|
"step": 162
|
|
},
|
|
{
|
|
"completion_length": 2876.319580078125,
|
|
"epoch": 0.5582191780821918,
|
|
"grad_norm": 0.13523170351982117,
|
|
"kl": 0.02081298828125,
|
|
"learning_rate": 5.604710303683253e-07,
|
|
"loss": 0.0008,
|
|
"reward": 0.5364240109920502,
|
|
"reward_std": 0.48902808129787445,
|
|
"rewards/cosine_scaled_reward": 0.06420179456472397,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 163
|
|
},
|
|
{
|
|
"completion_length": 2883.8958740234375,
|
|
"epoch": 0.5616438356164384,
|
|
"grad_norm": 0.17181335389614105,
|
|
"kl": 0.028564453125,
|
|
"learning_rate": 5.552358696106288e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.27160534262657166,
|
|
"reward_std": 0.360342800617218,
|
|
"rewards/cosine_scaled_reward": -0.10339467972517014,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 164
|
|
},
|
|
{
|
|
"completion_length": 3007.28466796875,
|
|
"epoch": 0.565068493150685,
|
|
"grad_norm": 0.12402219325304031,
|
|
"kl": 0.02569580078125,
|
|
"learning_rate": 5.5e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.4214746206998825,
|
|
"reward_std": 0.4252837300300598,
|
|
"rewards/cosine_scaled_reward": 0.004807952791452408,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 165
|
|
},
|
|
{
|
|
"completion_length": 2990.2291259765625,
|
|
"epoch": 0.5684931506849316,
|
|
"grad_norm": 0.1333877593278885,
|
|
"kl": 0.02728271484375,
|
|
"learning_rate": 5.447641303893714e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.3566634953022003,
|
|
"reward_std": 0.49885208904743195,
|
|
"rewards/cosine_scaled_reward": -0.05305874161422253,
|
|
"rewards/format_reward": 0.4097222238779068,
|
|
"step": 166
|
|
},
|
|
{
|
|
"completion_length": 3205.513916015625,
|
|
"epoch": 0.571917808219178,
|
|
"grad_norm": 0.14048750698566437,
|
|
"kl": 0.02264404296875,
|
|
"learning_rate": 5.395289696316747e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.29908060282468796,
|
|
"reward_std": 0.5275179445743561,
|
|
"rewards/cosine_scaled_reward": -0.07591940555721521,
|
|
"rewards/format_reward": 0.3750000149011612,
|
|
"step": 167
|
|
},
|
|
{
|
|
"completion_length": 2930.826416015625,
|
|
"epoch": 0.5753424657534246,
|
|
"grad_norm": 0.13344134390354156,
|
|
"kl": 0.0302734375,
|
|
"learning_rate": 5.342952264838747e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.41841088235378265,
|
|
"reward_std": 0.40377357602119446,
|
|
"rewards/cosine_scaled_reward": -0.05381134897470474,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 168
|
|
},
|
|
{
|
|
"completion_length": 2789.326416015625,
|
|
"epoch": 0.5787671232876712,
|
|
"grad_norm": 0.1511034071445465,
|
|
"kl": 0.02587890625,
|
|
"learning_rate": 5.29063609511014e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.45637938380241394,
|
|
"reward_std": 0.4706819951534271,
|
|
"rewards/cosine_scaled_reward": -0.00889836996793747,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 169
|
|
},
|
|
{
|
|
"completion_length": 3059.9930419921875,
|
|
"epoch": 0.5821917808219178,
|
|
"grad_norm": 0.14896270632743835,
|
|
"kl": 0.02703857421875,
|
|
"learning_rate": 5.238348269902859e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.18200979381799698,
|
|
"reward_std": 0.4613404721021652,
|
|
"rewards/cosine_scaled_reward": -0.19299022108316422,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 170
|
|
},
|
|
{
|
|
"completion_length": 3044.4930419921875,
|
|
"epoch": 0.5856164383561644,
|
|
"grad_norm": 0.12685967981815338,
|
|
"kl": 0.02545166015625,
|
|
"learning_rate": 5.186095868151436e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.24055355973541737,
|
|
"reward_std": 0.4408658444881439,
|
|
"rewards/cosine_scaled_reward": -0.14139089360833168,
|
|
"rewards/format_reward": 0.3819444477558136,
|
|
"step": 171
|
|
},
|
|
{
|
|
"completion_length": 2786.451416015625,
|
|
"epoch": 0.589041095890411,
|
|
"grad_norm": 0.14915940165519714,
|
|
"kl": 0.02362060546875,
|
|
"learning_rate": 5.133885963994639e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.23196261376142502,
|
|
"reward_std": 0.30038829147815704,
|
|
"rewards/cosine_scaled_reward": -0.14303740486502647,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 172
|
|
},
|
|
{
|
|
"completion_length": 3067.9306640625,
|
|
"epoch": 0.5924657534246576,
|
|
"grad_norm": 0.10505218058824539,
|
|
"kl": 0.0228271484375,
|
|
"learning_rate": 5.081725625817735e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.5079791992902756,
|
|
"reward_std": 0.5481714606285095,
|
|
"rewards/cosine_scaled_reward": 0.05659032240509987,
|
|
"rewards/format_reward": 0.451388880610466,
|
|
"step": 173
|
|
},
|
|
{
|
|
"completion_length": 2666.3541259765625,
|
|
"epoch": 0.5958904109589042,
|
|
"grad_norm": 0.135061576962471,
|
|
"kl": 0.02471923828125,
|
|
"learning_rate": 5.02962191529556e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.4175218790769577,
|
|
"reward_std": 0.37634842097759247,
|
|
"rewards/cosine_scaled_reward": -0.1033114567399025,
|
|
"rewards/format_reward": 0.5208333283662796,
|
|
"step": 174
|
|
},
|
|
{
|
|
"completion_length": 3136.0833740234375,
|
|
"epoch": 0.5993150684931506,
|
|
"grad_norm": 0.11110376566648483,
|
|
"kl": 0.026123046875,
|
|
"learning_rate": 4.977581886436462e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.4329180419445038,
|
|
"reward_std": 0.5168599039316177,
|
|
"rewards/cosine_scaled_reward": -0.025415293872356415,
|
|
"rewards/format_reward": 0.4583333283662796,
|
|
"step": 175
|
|
},
|
|
{
|
|
"completion_length": 3090.9583740234375,
|
|
"epoch": 0.6027397260273972,
|
|
"grad_norm": 0.12515892088413239,
|
|
"kl": 0.02777099609375,
|
|
"learning_rate": 4.925612584627324e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.3180784285068512,
|
|
"reward_std": 0.4640498459339142,
|
|
"rewards/cosine_scaled_reward": -0.0985882543027401,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 176
|
|
},
|
|
{
|
|
"completion_length": 3099.416748046875,
|
|
"epoch": 0.6061643835616438,
|
|
"grad_norm": 0.10670111328363419,
|
|
"kl": 0.027587890625,
|
|
"learning_rate": 4.873721045679706e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.37505099177360535,
|
|
"reward_std": 0.31386855244636536,
|
|
"rewards/cosine_scaled_reward": -0.013837885111570358,
|
|
"rewards/format_reward": 0.3888889029622078,
|
|
"step": 177
|
|
},
|
|
{
|
|
"completion_length": 2943.9306640625,
|
|
"epoch": 0.6095890410958904,
|
|
"grad_norm": 0.13467499613761902,
|
|
"kl": 0.0272216796875,
|
|
"learning_rate": 4.821914294877326e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.2806200385093689,
|
|
"reward_std": 0.48631517589092255,
|
|
"rewards/cosine_scaled_reward": -0.1152133010327816,
|
|
"rewards/format_reward": 0.3958333283662796,
|
|
"step": 178
|
|
},
|
|
{
|
|
"completion_length": 3079.7222900390625,
|
|
"epoch": 0.613013698630137,
|
|
"grad_norm": 0.1212821900844574,
|
|
"kl": 0.02349853515625,
|
|
"learning_rate": 4.770199346024947e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.34058963507413864,
|
|
"reward_std": 0.5203238129615784,
|
|
"rewards/cosine_scaled_reward": -0.03441034443676472,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 179
|
|
},
|
|
{
|
|
"completion_length": 2798.0555419921875,
|
|
"epoch": 0.6164383561643836,
|
|
"grad_norm": 0.14347128570079803,
|
|
"kl": 0.02813720703125,
|
|
"learning_rate": 4.7185832004988133e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.33986398577690125,
|
|
"reward_std": 0.4621936082839966,
|
|
"rewards/cosine_scaled_reward": -0.12541379779577255,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 180
|
|
},
|
|
{
|
|
"completion_length": 3015.9791259765625,
|
|
"epoch": 0.6198630136986302,
|
|
"grad_norm": 0.10863691568374634,
|
|
"kl": 0.0252685546875,
|
|
"learning_rate": 4.667072846298785e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.32850363850593567,
|
|
"reward_std": 0.4628835916519165,
|
|
"rewards/cosine_scaled_reward": -0.10205190535634756,
|
|
"rewards/format_reward": 0.4305555671453476,
|
|
"step": 181
|
|
},
|
|
{
|
|
"completion_length": 3229.4375,
|
|
"epoch": 0.6232876712328768,
|
|
"grad_norm": 0.13130250573158264,
|
|
"kl": 0.02691650390625,
|
|
"learning_rate": 4.6156752571022637e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.1762874647974968,
|
|
"reward_std": 0.45904412865638733,
|
|
"rewards/cosine_scaled_reward": -0.1084347516298294,
|
|
"rewards/format_reward": 0.2847222238779068,
|
|
"step": 182
|
|
},
|
|
{
|
|
"completion_length": 3043.9722900390625,
|
|
"epoch": 0.6267123287671232,
|
|
"grad_norm": 0.13968031108379364,
|
|
"kl": 0.02679443359375,
|
|
"learning_rate": 4.5643973913200837e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.3731088787317276,
|
|
"reward_std": 0.5312269479036331,
|
|
"rewards/cosine_scaled_reward": -0.029668924515135586,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 183
|
|
},
|
|
{
|
|
"completion_length": 3144.5625,
|
|
"epoch": 0.6301369863013698,
|
|
"grad_norm": 0.15973497927188873,
|
|
"kl": 0.03131103515625,
|
|
"learning_rate": 4.513246191154434e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.2171999216079712,
|
|
"reward_std": 0.43705086410045624,
|
|
"rewards/cosine_scaled_reward": -0.12307787034660578,
|
|
"rewards/format_reward": 0.3402777910232544,
|
|
"step": 184
|
|
},
|
|
{
|
|
"completion_length": 2747.4652099609375,
|
|
"epoch": 0.6335616438356164,
|
|
"grad_norm": 0.13849014043807983,
|
|
"kl": 0.02557373046875,
|
|
"learning_rate": 4.4622285816590186e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.5670523345470428,
|
|
"reward_std": 0.41843467950820923,
|
|
"rewards/cosine_scaled_reward": 0.07399673759937286,
|
|
"rewards/format_reward": 0.4930555671453476,
|
|
"step": 185
|
|
},
|
|
{
|
|
"completion_length": 2985.71533203125,
|
|
"epoch": 0.636986301369863,
|
|
"grad_norm": 0.15879811346530914,
|
|
"kl": 0.0255126953125,
|
|
"learning_rate": 4.4113514698014953e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.4425032064318657,
|
|
"reward_std": 0.6330223977565765,
|
|
"rewards/cosine_scaled_reward": -0.036663462640717626,
|
|
"rewards/format_reward": 0.4791666716337204,
|
|
"step": 186
|
|
},
|
|
{
|
|
"completion_length": 2936.0416259765625,
|
|
"epoch": 0.6404109589041096,
|
|
"grad_norm": 0.12994582951068878,
|
|
"kl": 0.0272216796875,
|
|
"learning_rate": 4.360621743528392e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.34703654050827026,
|
|
"reward_std": 0.4127475470304489,
|
|
"rewards/cosine_scaled_reward": -0.11824123747646809,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 187
|
|
},
|
|
{
|
|
"completion_length": 2758.3126220703125,
|
|
"epoch": 0.6438356164383562,
|
|
"grad_norm": 0.17351487278938293,
|
|
"kl": 0.03253173828125,
|
|
"learning_rate": 4.3100462708325914e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3991873562335968,
|
|
"reward_std": 0.5110540986061096,
|
|
"rewards/cosine_scaled_reward": -0.07303486485034227,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 188
|
|
},
|
|
{
|
|
"completion_length": 3067.34033203125,
|
|
"epoch": 0.6472602739726028,
|
|
"grad_norm": 0.17369891703128815,
|
|
"kl": 0.02911376953125,
|
|
"learning_rate": 4.2596318988235037e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.3477390259504318,
|
|
"reward_std": 0.5584754794836044,
|
|
"rewards/cosine_scaled_reward": -0.05503876507282257,
|
|
"rewards/format_reward": 0.4027777910232544,
|
|
"step": 189
|
|
},
|
|
{
|
|
"completion_length": 2643.451416015625,
|
|
"epoch": 0.6506849315068494,
|
|
"grad_norm": 0.15890643000602722,
|
|
"kl": 0.0362548828125,
|
|
"learning_rate": 4.209385452800095e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.3941483050584793,
|
|
"reward_std": 0.4240891933441162,
|
|
"rewards/cosine_scaled_reward": -0.05724058859050274,
|
|
"rewards/format_reward": 0.4513888955116272,
|
|
"step": 190
|
|
},
|
|
{
|
|
"completion_length": 3008.076416015625,
|
|
"epoch": 0.6541095890410958,
|
|
"grad_norm": 0.14325085282325745,
|
|
"kl": 0.02728271484375,
|
|
"learning_rate": 4.1593137353268303e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.4985590726137161,
|
|
"reward_std": 0.573657751083374,
|
|
"rewards/cosine_scaled_reward": 0.04717019200325012,
|
|
"rewards/format_reward": 0.4513888955116272,
|
|
"step": 191
|
|
},
|
|
{
|
|
"completion_length": 2820.451416015625,
|
|
"epoch": 0.6575342465753424,
|
|
"grad_norm": 0.14183150231838226,
|
|
"kl": 0.0311279296875,
|
|
"learning_rate": 4.1094235253127374e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.2823975309729576,
|
|
"reward_std": 0.4726349860429764,
|
|
"rewards/cosine_scaled_reward": -0.15510250255465508,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 192
|
|
},
|
|
{
|
|
"completion_length": 3112.888916015625,
|
|
"epoch": 0.660958904109589,
|
|
"grad_norm": 0.1434364914894104,
|
|
"kl": 0.0272216796875,
|
|
"learning_rate": 4.059721577093628e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.2338184304535389,
|
|
"reward_std": 0.513207420706749,
|
|
"rewards/cosine_scaled_reward": -0.0925704650580883,
|
|
"rewards/format_reward": 0.3263888955116272,
|
|
"step": 193
|
|
},
|
|
{
|
|
"completion_length": 2760.3958740234375,
|
|
"epoch": 0.6643835616438356,
|
|
"grad_norm": 0.15593542158603668,
|
|
"kl": 0.02716064453125,
|
|
"learning_rate": 4.0102146195176887e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.39227430522441864,
|
|
"reward_std": 0.431293249130249,
|
|
"rewards/cosine_scaled_reward": -0.05911456607282162,
|
|
"rewards/format_reward": 0.4513888955116272,
|
|
"step": 194
|
|
},
|
|
{
|
|
"completion_length": 3120.576416015625,
|
|
"epoch": 0.6678082191780822,
|
|
"grad_norm": 0.12067221850156784,
|
|
"kl": 0.02813720703125,
|
|
"learning_rate": 3.9609093550344907e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.33260975778102875,
|
|
"reward_std": 0.3859928399324417,
|
|
"rewards/cosine_scaled_reward": -0.014612471219152212,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 195
|
|
},
|
|
{
|
|
"completion_length": 2929.52783203125,
|
|
"epoch": 0.6712328767123288,
|
|
"grad_norm": 0.15069261193275452,
|
|
"kl": 0.0296630859375,
|
|
"learning_rate": 3.911812458787591e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.3924526572227478,
|
|
"reward_std": 0.48095013201236725,
|
|
"rewards/cosine_scaled_reward": -0.051991806365549564,
|
|
"rewards/format_reward": 0.4444444477558136,
|
|
"step": 196
|
|
},
|
|
{
|
|
"completion_length": 2846.9375,
|
|
"epoch": 0.6746575342465754,
|
|
"grad_norm": 0.1341117024421692,
|
|
"kl": 0.0289306640625,
|
|
"learning_rate": 3.86293057771082e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.5837523490190506,
|
|
"reward_std": 0.5863839089870453,
|
|
"rewards/cosine_scaled_reward": 0.07680792175233364,
|
|
"rewards/format_reward": 0.5069444328546524,
|
|
"step": 197
|
|
},
|
|
{
|
|
"completion_length": 2936.75,
|
|
"epoch": 0.678082191780822,
|
|
"grad_norm": 0.12375517934560776,
|
|
"kl": 0.02490234375,
|
|
"learning_rate": 3.8142703296283953e-07,
|
|
"loss": 0.001,
|
|
"reward": 0.421320840716362,
|
|
"reward_std": 0.48637837171554565,
|
|
"rewards/cosine_scaled_reward": 0.018543066456913948,
|
|
"rewards/format_reward": 0.4027777761220932,
|
|
"step": 198
|
|
},
|
|
{
|
|
"completion_length": 3141.076416015625,
|
|
"epoch": 0.6815068493150684,
|
|
"grad_norm": 0.14611783623695374,
|
|
"kl": 0.02886962890625,
|
|
"learning_rate": 3.7658383023589833e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.31543052941560745,
|
|
"reward_std": 0.484183669090271,
|
|
"rewards/cosine_scaled_reward": -0.059569500386714935,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 199
|
|
},
|
|
{
|
|
"completion_length": 2385.8680419921875,
|
|
"epoch": 0.684931506849315,
|
|
"grad_norm": 0.1563843935728073,
|
|
"kl": 0.02777099609375,
|
|
"learning_rate": 3.7176410528237945e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.546364039182663,
|
|
"reward_std": 0.4009109437465668,
|
|
"rewards/cosine_scaled_reward": -0.023080429062247276,
|
|
"rewards/format_reward": 0.5694444626569748,
|
|
"step": 200
|
|
},
|
|
{
|
|
"completion_length": 3089.6458740234375,
|
|
"epoch": 0.6883561643835616,
|
|
"grad_norm": 0.12665298581123352,
|
|
"kl": 0.03118896484375,
|
|
"learning_rate": 3.6696851061588994e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.17153404070995748,
|
|
"reward_std": 0.39160603284835815,
|
|
"rewards/cosine_scaled_reward": -0.18263262510299683,
|
|
"rewards/format_reward": 0.354166679084301,
|
|
"step": 201
|
|
},
|
|
{
|
|
"completion_length": 2912.826416015625,
|
|
"epoch": 0.6917808219178082,
|
|
"grad_norm": 0.1533837616443634,
|
|
"kl": 0.0333251953125,
|
|
"learning_rate": 3.62197695483182e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.4708049148321152,
|
|
"reward_std": 0.5301374197006226,
|
|
"rewards/cosine_scaled_reward": -0.01530623622238636,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 202
|
|
},
|
|
{
|
|
"completion_length": 2903.9652099609375,
|
|
"epoch": 0.6952054794520548,
|
|
"grad_norm": 0.1623891443014145,
|
|
"kl": 0.03265380859375,
|
|
"learning_rate": 3.5745230577625573e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3767779842019081,
|
|
"reward_std": 0.37230053544044495,
|
|
"rewards/cosine_scaled_reward": -0.025999773293733597,
|
|
"rewards/format_reward": 0.4027777761220932,
|
|
"step": 203
|
|
},
|
|
{
|
|
"completion_length": 2757.076416015625,
|
|
"epoch": 0.6986301369863014,
|
|
"grad_norm": 0.9816704988479614,
|
|
"kl": 0.03271484375,
|
|
"learning_rate": 3.5273298394491515e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.44859715551137924,
|
|
"reward_std": 0.3788439631462097,
|
|
"rewards/cosine_scaled_reward": 0.004152711480855942,
|
|
"rewards/format_reward": 0.4444444626569748,
|
|
"step": 204
|
|
},
|
|
{
|
|
"completion_length": 3004.104248046875,
|
|
"epoch": 0.702054794520548,
|
|
"grad_norm": 0.14980177581310272,
|
|
"kl": 0.0347900390625,
|
|
"learning_rate": 3.4804036890979205e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.26160044223070145,
|
|
"reward_std": 0.4303555190563202,
|
|
"rewards/cosine_scaled_reward": -0.0647884514182806,
|
|
"rewards/format_reward": 0.3263888955116272,
|
|
"step": 205
|
|
},
|
|
{
|
|
"completion_length": 2953.40283203125,
|
|
"epoch": 0.7054794520547946,
|
|
"grad_norm": 0.13833436369895935,
|
|
"kl": 0.031982421875,
|
|
"learning_rate": 3.433750959758446e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3072604089975357,
|
|
"reward_std": 0.5101044028997421,
|
|
"rewards/cosine_scaled_reward": -0.12329516559839249,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 206
|
|
},
|
|
{
|
|
"completion_length": 2859.96533203125,
|
|
"epoch": 0.708904109589041,
|
|
"grad_norm": 0.13901107013225555,
|
|
"kl": 0.03009033203125,
|
|
"learning_rate": 3.387377967463493e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.5190957188606262,
|
|
"reward_std": 0.4833519160747528,
|
|
"rewards/cosine_scaled_reward": 0.07465130463242531,
|
|
"rewards/format_reward": 0.4444444626569748,
|
|
"step": 207
|
|
},
|
|
{
|
|
"completion_length": 2990.9583740234375,
|
|
"epoch": 0.7123287671232876,
|
|
"grad_norm": 0.16973043978214264,
|
|
"kl": 0.0328369140625,
|
|
"learning_rate": 3.3412909903738936e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.1571333408355713,
|
|
"reward_std": 0.33002787828445435,
|
|
"rewards/cosine_scaled_reward": -0.13453333638608456,
|
|
"rewards/format_reward": 0.2916666716337204,
|
|
"step": 208
|
|
},
|
|
{
|
|
"completion_length": 2839.3126220703125,
|
|
"epoch": 0.7157534246575342,
|
|
"grad_norm": 0.13653664290905,
|
|
"kl": 0.03125,
|
|
"learning_rate": 3.295496267928609e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3505253791809082,
|
|
"reward_std": 0.47255241870880127,
|
|
"rewards/cosine_scaled_reward": -0.1008635088801384,
|
|
"rewards/format_reward": 0.451388880610466,
|
|
"step": 209
|
|
},
|
|
{
|
|
"completion_length": 2634.1737060546875,
|
|
"epoch": 0.7191780821917808,
|
|
"grad_norm": 0.16086305677890778,
|
|
"kl": 0.02947998046875,
|
|
"learning_rate": 3.250000000000001e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.4823562502861023,
|
|
"reward_std": 0.4473782926797867,
|
|
"rewards/cosine_scaled_reward": -0.04542151384521276,
|
|
"rewards/format_reward": 0.5277777910232544,
|
|
"step": 210
|
|
},
|
|
{
|
|
"completion_length": 2892.3958740234375,
|
|
"epoch": 0.7226027397260274,
|
|
"grad_norm": 0.13830755650997162,
|
|
"kl": 0.03363037109375,
|
|
"learning_rate": 3.204808346054461e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.4033074826002121,
|
|
"reward_std": 0.41210463643074036,
|
|
"rewards/cosine_scaled_reward": -0.10363698564469814,
|
|
"rewards/format_reward": 0.5069444477558136,
|
|
"step": 211
|
|
},
|
|
{
|
|
"completion_length": 2885.541748046875,
|
|
"epoch": 0.726027397260274,
|
|
"grad_norm": 0.1686064600944519,
|
|
"kl": 0.03106689453125,
|
|
"learning_rate": 3.159927424318531e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.3951975554227829,
|
|
"reward_std": 0.5239757895469666,
|
|
"rewards/cosine_scaled_reward": -0.04924688953906298,
|
|
"rewards/format_reward": 0.4444444477558136,
|
|
"step": 212
|
|
},
|
|
{
|
|
"completion_length": 2877.71533203125,
|
|
"epoch": 0.7294520547945206,
|
|
"grad_norm": 0.15673589706420898,
|
|
"kl": 0.034423828125,
|
|
"learning_rate": 3.115363310950578e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.2623257301747799,
|
|
"reward_std": 0.3177921772003174,
|
|
"rewards/cosine_scaled_reward": -0.08489650301635265,
|
|
"rewards/format_reward": 0.3472222164273262,
|
|
"step": 213
|
|
},
|
|
{
|
|
"completion_length": 3026.298583984375,
|
|
"epoch": 0.7328767123287672,
|
|
"grad_norm": 0.1316094845533371,
|
|
"kl": 0.0333251953125,
|
|
"learning_rate": 3.0711220392181934e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.4284323900938034,
|
|
"reward_std": 0.5533152520656586,
|
|
"rewards/cosine_scaled_reward": -0.009067630395293236,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 214
|
|
},
|
|
{
|
|
"completion_length": 2914.27783203125,
|
|
"epoch": 0.7363013698630136,
|
|
"grad_norm": 0.1542029231786728,
|
|
"kl": 0.02874755859375,
|
|
"learning_rate": 3.027209598681373e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.3265261799097061,
|
|
"reward_std": 0.4032795578241348,
|
|
"rewards/cosine_scaled_reward": -0.06930714938789606,
|
|
"rewards/format_reward": 0.3958333432674408,
|
|
"step": 215
|
|
},
|
|
{
|
|
"completion_length": 2932.25,
|
|
"epoch": 0.7397260273972602,
|
|
"grad_norm": 0.142373189330101,
|
|
"kl": 0.03369140625,
|
|
"learning_rate": 2.9836319343816397e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5414205491542816,
|
|
"reward_std": 0.5791518688201904,
|
|
"rewards/cosine_scaled_reward": 0.013642808422446251,
|
|
"rewards/format_reward": 0.5277777761220932,
|
|
"step": 216
|
|
},
|
|
{
|
|
"completion_length": 2438.7987060546875,
|
|
"epoch": 0.7431506849315068,
|
|
"grad_norm": 0.15347974002361298,
|
|
"kl": 0.03460693359375,
|
|
"learning_rate": 2.9403949460371677e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.6215761005878448,
|
|
"reward_std": 0.360453262925148,
|
|
"rewards/cosine_scaled_reward": 0.0312983263283968,
|
|
"rewards/format_reward": 0.5902777910232544,
|
|
"step": 217
|
|
},
|
|
{
|
|
"completion_length": 2892.2916259765625,
|
|
"epoch": 0.7465753424657534,
|
|
"grad_norm": 0.14735041558742523,
|
|
"kl": 0.0301513671875,
|
|
"learning_rate": 2.897504487244061e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.46117305755615234,
|
|
"reward_std": 0.4907253533601761,
|
|
"rewards/cosine_scaled_reward": 0.030617523938417435,
|
|
"rewards/format_reward": 0.4305555522441864,
|
|
"step": 218
|
|
},
|
|
{
|
|
"completion_length": 2625.59033203125,
|
|
"epoch": 0.75,
|
|
"grad_norm": 0.30190229415893555,
|
|
"kl": 0.0400390625,
|
|
"learning_rate": 2.854966364683872e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.4576456546783447,
|
|
"reward_std": 0.3912748843431473,
|
|
"rewards/cosine_scaled_reward": -0.014576543122529984,
|
|
"rewards/format_reward": 0.472222238779068,
|
|
"step": 219
|
|
},
|
|
{
|
|
"completion_length": 2769.8333740234375,
|
|
"epoch": 0.7534246575342466,
|
|
"grad_norm": 0.13973061740398407,
|
|
"kl": 0.035400390625,
|
|
"learning_rate": 2.812786337337463e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5715092867612839,
|
|
"reward_std": 0.45521561801433563,
|
|
"rewards/cosine_scaled_reward": 0.022898193448781967,
|
|
"rewards/format_reward": 0.5486111342906952,
|
|
"step": 220
|
|
},
|
|
{
|
|
"completion_length": 2313.9097900390625,
|
|
"epoch": 0.7568493150684932,
|
|
"grad_norm": 0.1722535640001297,
|
|
"kl": 0.0362548828125,
|
|
"learning_rate": 2.770970115705341e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.6356571912765503,
|
|
"reward_std": 0.31826692819595337,
|
|
"rewards/cosine_scaled_reward": -0.017120573669672012,
|
|
"rewards/format_reward": 0.6527777910232544,
|
|
"step": 221
|
|
},
|
|
{
|
|
"completion_length": 2617.8333740234375,
|
|
"epoch": 0.7602739726027398,
|
|
"grad_norm": 0.1793624311685562,
|
|
"kl": 0.0341796875,
|
|
"learning_rate": 2.729523361034538e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5496674627065659,
|
|
"reward_std": 0.45733560621738434,
|
|
"rewards/cosine_scaled_reward": 0.01494525047019124,
|
|
"rewards/format_reward": 0.5347222089767456,
|
|
"step": 222
|
|
},
|
|
{
|
|
"completion_length": 2766.7291259765625,
|
|
"epoch": 0.7636986301369864,
|
|
"grad_norm": 0.1420270800590515,
|
|
"kl": 0.02960205078125,
|
|
"learning_rate": 2.68845168455218e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.6161434650421143,
|
|
"reward_std": 0.4993426203727722,
|
|
"rewards/cosine_scaled_reward": 0.07447678688913584,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 223
|
|
},
|
|
{
|
|
"completion_length": 2775.375,
|
|
"epoch": 0.7671232876712328,
|
|
"grad_norm": 0.15780124068260193,
|
|
"kl": 0.0301513671875,
|
|
"learning_rate": 2.6477606467058035e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.5157144665718079,
|
|
"reward_std": 0.3755457401275635,
|
|
"rewards/cosine_scaled_reward": 0.03654780611395836,
|
|
"rewards/format_reward": 0.4791666716337204,
|
|
"step": 224
|
|
},
|
|
{
|
|
"completion_length": 2975.041748046875,
|
|
"epoch": 0.7705479452054794,
|
|
"grad_norm": 0.13926586508750916,
|
|
"kl": 0.03179931640625,
|
|
"learning_rate": 2.6074557564105724e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.19908806309103966,
|
|
"reward_std": 0.42198337614536285,
|
|
"rewards/cosine_scaled_reward": -0.14813418313860893,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 225
|
|
},
|
|
{
|
|
"completion_length": 3097.52783203125,
|
|
"epoch": 0.773972602739726,
|
|
"grad_norm": 0.1462773233652115,
|
|
"kl": 0.028076171875,
|
|
"learning_rate": 2.567542470303452e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.4122817665338516,
|
|
"reward_std": 0.5014311969280243,
|
|
"rewards/cosine_scaled_reward": -0.0321626765653491,
|
|
"rewards/format_reward": 0.4444444477558136,
|
|
"step": 226
|
|
},
|
|
{
|
|
"completion_length": 2771.5556640625,
|
|
"epoch": 0.7773972602739726,
|
|
"grad_norm": 0.1374952346086502,
|
|
"kl": 0.0338134765625,
|
|
"learning_rate": 2.528026192004466e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.4719505310058594,
|
|
"reward_std": 0.49139489233493805,
|
|
"rewards/cosine_scaled_reward": -0.014160582795739174,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 227
|
|
},
|
|
{
|
|
"completion_length": 2911.5833740234375,
|
|
"epoch": 0.7808219178082192,
|
|
"grad_norm": 0.1661740243434906,
|
|
"kl": 0.0330810546875,
|
|
"learning_rate": 2.488912271385139e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3701959401369095,
|
|
"reward_std": 0.479979932308197,
|
|
"rewards/cosine_scaled_reward": -0.06730403914116323,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 228
|
|
},
|
|
{
|
|
"completion_length": 2854.826416015625,
|
|
"epoch": 0.7842465753424658,
|
|
"grad_norm": 0.1530180722475052,
|
|
"kl": 0.03662109375,
|
|
"learning_rate": 2.450206003844205e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.47989606857299805,
|
|
"reward_std": 0.40031544864177704,
|
|
"rewards/cosine_scaled_reward": -0.027048394083976746,
|
|
"rewards/format_reward": 0.5069444626569748,
|
|
"step": 229
|
|
},
|
|
{
|
|
"completion_length": 3026.8055419921875,
|
|
"epoch": 0.7876712328767124,
|
|
"grad_norm": 0.12480524182319641,
|
|
"kl": 0.0284423828125,
|
|
"learning_rate": 2.411912629590699e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.38817086815834045,
|
|
"reward_std": 0.41600513458251953,
|
|
"rewards/cosine_scaled_reward": -0.014606935903429985,
|
|
"rewards/format_reward": 0.4027777761220932,
|
|
"step": 230
|
|
},
|
|
{
|
|
"completion_length": 2905.3333740234375,
|
|
"epoch": 0.791095890410959,
|
|
"grad_norm": 0.14893729984760284,
|
|
"kl": 0.033203125,
|
|
"learning_rate": 2.374037332934512e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.4625111222267151,
|
|
"reward_std": 0.45194628834724426,
|
|
"rewards/cosine_scaled_reward": -0.023599994368851185,
|
|
"rewards/format_reward": 0.4861111342906952,
|
|
"step": 231
|
|
},
|
|
{
|
|
"completion_length": 2417.1319580078125,
|
|
"epoch": 0.7945205479452054,
|
|
"grad_norm": 0.1770123839378357,
|
|
"kl": 0.0350341796875,
|
|
"learning_rate": 2.336585241584522e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.7192187607288361,
|
|
"reward_std": 0.39792077243328094,
|
|
"rewards/cosine_scaled_reward": 0.059496549889445305,
|
|
"rewards/format_reward": 0.659722238779068,
|
|
"step": 232
|
|
},
|
|
{
|
|
"completion_length": 3020.0833740234375,
|
|
"epoch": 0.797945205479452,
|
|
"grad_norm": 0.1579238921403885,
|
|
"kl": 0.0362548828125,
|
|
"learning_rate": 2.299561425954383e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.3186444193124771,
|
|
"reward_std": 0.5212821513414383,
|
|
"rewards/cosine_scaled_reward": -0.11885556951165199,
|
|
"rewards/format_reward": 0.4375,
|
|
"step": 233
|
|
},
|
|
{
|
|
"completion_length": 2965.21533203125,
|
|
"epoch": 0.8013698630136986,
|
|
"grad_norm": 0.13462784886360168,
|
|
"kl": 0.03240966796875,
|
|
"learning_rate": 2.2629708984760706e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.4752238541841507,
|
|
"reward_std": 0.40988166630268097,
|
|
"rewards/cosine_scaled_reward": 0.003001643344759941,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 234
|
|
},
|
|
{
|
|
"completion_length": 2902.916748046875,
|
|
"epoch": 0.8047945205479452,
|
|
"grad_norm": 0.13331294059753418,
|
|
"kl": 0.02911376953125,
|
|
"learning_rate": 2.2268186129212807e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.3500567376613617,
|
|
"reward_std": 0.4639824479818344,
|
|
"rewards/cosine_scaled_reward": -0.0735543726477772,
|
|
"rewards/format_reward": 0.4236111044883728,
|
|
"step": 235
|
|
},
|
|
{
|
|
"completion_length": 2946.5,
|
|
"epoch": 0.8082191780821918,
|
|
"grad_norm": 0.14252088963985443,
|
|
"kl": 0.03155517578125,
|
|
"learning_rate": 2.1911094637307714e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.3601393699645996,
|
|
"reward_std": 0.40262600779533386,
|
|
"rewards/cosine_scaled_reward": -0.05652729608118534,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 236
|
|
},
|
|
{
|
|
"completion_length": 3064.47216796875,
|
|
"epoch": 0.8116438356164384,
|
|
"grad_norm": 0.16358421742916107,
|
|
"kl": 0.03594970703125,
|
|
"learning_rate": 2.1558482853517253e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.3410459593869746,
|
|
"reward_std": 0.3948727697134018,
|
|
"rewards/cosine_scaled_reward": -0.020065151154994965,
|
|
"rewards/format_reward": 0.3611111119389534,
|
|
"step": 237
|
|
},
|
|
{
|
|
"completion_length": 2980.416748046875,
|
|
"epoch": 0.815068493150685,
|
|
"grad_norm": 0.36835718154907227,
|
|
"kl": 0.039306640625,
|
|
"learning_rate": 2.1210398515832536e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.2538940832018852,
|
|
"reward_std": 0.3796728700399399,
|
|
"rewards/cosine_scaled_reward": -0.10721703246235847,
|
|
"rewards/format_reward": 0.3611111119389534,
|
|
"step": 238
|
|
},
|
|
{
|
|
"completion_length": 2511.3333740234375,
|
|
"epoch": 0.8184931506849316,
|
|
"grad_norm": 0.15058235824108124,
|
|
"kl": 0.03302001953125,
|
|
"learning_rate": 2.08668887493009e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.42302054166793823,
|
|
"reward_std": 0.35738538205623627,
|
|
"rewards/cosine_scaled_reward": -0.11864613555371761,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 239
|
|
},
|
|
{
|
|
"completion_length": 2788.2430419921875,
|
|
"epoch": 0.821917808219178,
|
|
"grad_norm": 0.15094441175460815,
|
|
"kl": 0.03497314453125,
|
|
"learning_rate": 2.0528000059645995e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.4362386465072632,
|
|
"reward_std": 0.37140533328056335,
|
|
"rewards/cosine_scaled_reward": -0.04987248365068808,
|
|
"rewards/format_reward": 0.486111119389534,
|
|
"step": 240
|
|
},
|
|
{
|
|
"completion_length": 2689.46533203125,
|
|
"epoch": 0.8253424657534246,
|
|
"grad_norm": 0.15874545276165009,
|
|
"kl": 0.03240966796875,
|
|
"learning_rate": 2.0193778326971628e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.44388699531555176,
|
|
"reward_std": 0.3412891924381256,
|
|
"rewards/cosine_scaled_reward": -0.08389079011976719,
|
|
"rewards/format_reward": 0.5277777910232544,
|
|
"step": 241
|
|
},
|
|
{
|
|
"completion_length": 3123.8680419921875,
|
|
"epoch": 0.8287671232876712,
|
|
"grad_norm": 0.1301407366991043,
|
|
"kl": 0.0311279296875,
|
|
"learning_rate": 1.986426879955034e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.21631913632154465,
|
|
"reward_std": 0.39288755506277084,
|
|
"rewards/cosine_scaled_reward": -0.15173641964793205,
|
|
"rewards/format_reward": 0.3680555522441864,
|
|
"step": 242
|
|
},
|
|
{
|
|
"completion_length": 2796.3056640625,
|
|
"epoch": 0.8321917808219178,
|
|
"grad_norm": 0.1545010507106781,
|
|
"kl": 0.038818359375,
|
|
"learning_rate": 1.9539516087697517e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.3822927325963974,
|
|
"reward_std": 0.5006706863641739,
|
|
"rewards/cosine_scaled_reward": -0.08298505656421185,
|
|
"rewards/format_reward": 0.4652777761220932,
|
|
"step": 243
|
|
},
|
|
{
|
|
"completion_length": 2943.77783203125,
|
|
"epoch": 0.8356164383561644,
|
|
"grad_norm": 0.16031137108802795,
|
|
"kl": 0.0350341796875,
|
|
"learning_rate": 1.9219564157731844e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.3758165240287781,
|
|
"reward_std": 0.3894062936306,
|
|
"rewards/cosine_scaled_reward": -0.0339057189412415,
|
|
"rewards/format_reward": 0.409722238779068,
|
|
"step": 244
|
|
},
|
|
{
|
|
"completion_length": 2864.8819580078125,
|
|
"epoch": 0.839041095890411,
|
|
"grad_norm": 0.14703238010406494,
|
|
"kl": 0.0367431640625,
|
|
"learning_rate": 1.8904456326023027e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.3677906394004822,
|
|
"reward_std": 0.2914382070302963,
|
|
"rewards/cosine_scaled_reward": -0.06276494171470404,
|
|
"rewards/format_reward": 0.4305555671453476,
|
|
"step": 245
|
|
},
|
|
{
|
|
"completion_length": 2885.84033203125,
|
|
"epoch": 0.8424657534246576,
|
|
"grad_norm": 0.13430039584636688,
|
|
"kl": 0.03369140625,
|
|
"learning_rate": 1.8594235253127372e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.39504893124103546,
|
|
"reward_std": 0.447158083319664,
|
|
"rewards/cosine_scaled_reward": -0.07717327354475856,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 246
|
|
},
|
|
{
|
|
"completion_length": 2797.9583740234375,
|
|
"epoch": 0.8458904109589042,
|
|
"grad_norm": 0.15996281802654266,
|
|
"kl": 0.03466796875,
|
|
"learning_rate": 1.8288942938012267e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.4512728601694107,
|
|
"reward_std": 0.49667105078697205,
|
|
"rewards/cosine_scaled_reward": -0.03483825922012329,
|
|
"rewards/format_reward": 0.4861111044883728,
|
|
"step": 247
|
|
},
|
|
{
|
|
"completion_length": 2761.7362060546875,
|
|
"epoch": 0.8493150684931506,
|
|
"grad_norm": 0.16140304505825043,
|
|
"kl": 0.0394287109375,
|
|
"learning_rate": 1.7988620712370195e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.5905152261257172,
|
|
"reward_std": 0.4745737165212631,
|
|
"rewards/cosine_scaled_reward": 0.03495965828187764,
|
|
"rewards/format_reward": 0.5555555522441864,
|
|
"step": 248
|
|
},
|
|
{
|
|
"completion_length": 2945.3055419921875,
|
|
"epoch": 0.8527397260273972,
|
|
"grad_norm": 0.14055365324020386,
|
|
"kl": 0.0325927734375,
|
|
"learning_rate": 1.7693309235023127e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5377485156059265,
|
|
"reward_std": 0.5574042201042175,
|
|
"rewards/cosine_scaled_reward": 0.023859622422605753,
|
|
"rewards/format_reward": 0.5138888955116272,
|
|
"step": 249
|
|
},
|
|
{
|
|
"completion_length": 2663.6458740234375,
|
|
"epoch": 0.8561643835616438,
|
|
"grad_norm": 0.17629733681678772,
|
|
"kl": 0.03955078125,
|
|
"learning_rate": 1.7403048486417868e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.5798373818397522,
|
|
"reward_std": 0.5406672060489655,
|
|
"rewards/cosine_scaled_reward": 0.06594848074018955,
|
|
"rewards/format_reward": 0.5138888955116272,
|
|
"step": 250
|
|
},
|
|
{
|
|
"completion_length": 2895.1944580078125,
|
|
"epoch": 0.8595890410958904,
|
|
"grad_norm": 0.18740905821323395,
|
|
"kl": 0.03955078125,
|
|
"learning_rate": 1.711787776321341e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.3543848991394043,
|
|
"reward_std": 0.5366209447383881,
|
|
"rewards/cosine_scaled_reward": -0.09700398705899715,
|
|
"rewards/format_reward": 0.451388880610466,
|
|
"step": 251
|
|
},
|
|
{
|
|
"completion_length": 2812.7222900390625,
|
|
"epoch": 0.863013698630137,
|
|
"grad_norm": 0.13896240293979645,
|
|
"kl": 0.0306396484375,
|
|
"learning_rate": 1.6837835672960831e-07,
|
|
"loss": 0.0012,
|
|
"reward": 0.49542590975761414,
|
|
"reward_std": 0.42443887889385223,
|
|
"rewards/cosine_scaled_reward": -0.018462970852851868,
|
|
"rewards/format_reward": 0.513888880610466,
|
|
"step": 252
|
|
},
|
|
{
|
|
"completion_length": 2959.5069580078125,
|
|
"epoch": 0.8664383561643836,
|
|
"grad_norm": 0.1251368522644043,
|
|
"kl": 0.03564453125,
|
|
"learning_rate": 1.6562960128876353e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.2707902789115906,
|
|
"reward_std": 0.5160115361213684,
|
|
"rewards/cosine_scaled_reward": -0.14587640017271042,
|
|
"rewards/format_reward": 0.4166666567325592,
|
|
"step": 253
|
|
},
|
|
{
|
|
"completion_length": 2675.423583984375,
|
|
"epoch": 0.8698630136986302,
|
|
"grad_norm": 0.15856920182704926,
|
|
"kl": 0.0364990234375,
|
|
"learning_rate": 1.6293288344708566e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.6221481561660767,
|
|
"reward_std": 0.3958921879529953,
|
|
"rewards/cosine_scaled_reward": 0.03881483152508736,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 254
|
|
},
|
|
{
|
|
"completion_length": 3005.2362060546875,
|
|
"epoch": 0.8732876712328768,
|
|
"grad_norm": 0.1423628181219101,
|
|
"kl": 0.0340576171875,
|
|
"learning_rate": 1.6028856829700258e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.20419325679540634,
|
|
"reward_std": 0.3235137313604355,
|
|
"rewards/cosine_scaled_reward": -0.17080675438046455,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 255
|
|
},
|
|
{
|
|
"completion_length": 2853.2222900390625,
|
|
"epoch": 0.8767123287671232,
|
|
"grad_norm": 0.18754243850708008,
|
|
"kl": 0.0416259765625,
|
|
"learning_rate": 1.5769701383645698e-07,
|
|
"loss": 0.0017,
|
|
"reward": 0.5054954886436462,
|
|
"reward_std": 0.41478313505649567,
|
|
"rewards/cosine_scaled_reward": 0.047162143513560295,
|
|
"rewards/format_reward": 0.4583333283662796,
|
|
"step": 256
|
|
},
|
|
{
|
|
"completion_length": 3062.604248046875,
|
|
"epoch": 0.8801369863013698,
|
|
"grad_norm": 0.14577165246009827,
|
|
"kl": 0.035888671875,
|
|
"learning_rate": 1.551585709204381e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5648697018623352,
|
|
"reward_std": 0.6114741563796997,
|
|
"rewards/cosine_scaled_reward": 0.0787586160004139,
|
|
"rewards/format_reward": 0.4861111044883728,
|
|
"step": 257
|
|
},
|
|
{
|
|
"completion_length": 2579.1944580078125,
|
|
"epoch": 0.8835616438356164,
|
|
"grad_norm": 0.14230208098888397,
|
|
"kl": 0.03363037109375,
|
|
"learning_rate": 1.5267358321348285e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5974612534046173,
|
|
"reward_std": 0.35620053112506866,
|
|
"rewards/cosine_scaled_reward": 0.0557946152985096,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 258
|
|
},
|
|
{
|
|
"completion_length": 2702.3194580078125,
|
|
"epoch": 0.886986301369863,
|
|
"grad_norm": 0.1390962451696396,
|
|
"kl": 0.033935546875,
|
|
"learning_rate": 1.5024238714314825e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.6885968148708344,
|
|
"reward_std": 0.4262382835149765,
|
|
"rewards/cosine_scaled_reward": 0.09137461334466934,
|
|
"rewards/format_reward": 0.597222238779068,
|
|
"step": 259
|
|
},
|
|
{
|
|
"completion_length": 2968.2362060546875,
|
|
"epoch": 0.8904109589041096,
|
|
"grad_norm": 0.1457592248916626,
|
|
"kl": 0.03399658203125,
|
|
"learning_rate": 1.4786531185446452e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.33401037007570267,
|
|
"reward_std": 0.33265479654073715,
|
|
"rewards/cosine_scaled_reward": -0.00626740138977766,
|
|
"rewards/format_reward": 0.3402777761220932,
|
|
"step": 260
|
|
},
|
|
{
|
|
"completion_length": 3024.8680419921875,
|
|
"epoch": 0.8938356164383562,
|
|
"grad_norm": 0.14392928779125214,
|
|
"kl": 0.0400390625,
|
|
"learning_rate": 1.4554267916537495e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.30607690662145615,
|
|
"reward_std": 0.3898402601480484,
|
|
"rewards/cosine_scaled_reward": -0.027256430126726627,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 261
|
|
},
|
|
{
|
|
"completion_length": 2864.013916015625,
|
|
"epoch": 0.8972602739726028,
|
|
"grad_norm": 0.12024541944265366,
|
|
"kl": 0.0279541015625,
|
|
"learning_rate": 1.432748035231658e-07,
|
|
"loss": 0.0011,
|
|
"reward": 0.5254741907119751,
|
|
"reward_std": 0.46267665922641754,
|
|
"rewards/cosine_scaled_reward": -0.0231369249522686,
|
|
"rewards/format_reward": 0.548611119389534,
|
|
"step": 262
|
|
},
|
|
{
|
|
"completion_length": 2763.9583740234375,
|
|
"epoch": 0.9006849315068494,
|
|
"grad_norm": 0.24614769220352173,
|
|
"kl": 0.040771484375,
|
|
"learning_rate": 1.4106199196189608e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.35452011227607727,
|
|
"reward_std": 0.47811339795589447,
|
|
"rewards/cosine_scaled_reward": -0.09686877019703388,
|
|
"rewards/format_reward": 0.4513889104127884,
|
|
"step": 263
|
|
},
|
|
{
|
|
"completion_length": 2571.0,
|
|
"epoch": 0.9041095890410958,
|
|
"grad_norm": 0.14297829568386078,
|
|
"kl": 0.03631591796875,
|
|
"learning_rate": 1.3890454406082956e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.49256862699985504,
|
|
"reward_std": 0.3805427849292755,
|
|
"rewards/cosine_scaled_reward": -0.0074313730001449585,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 264
|
|
},
|
|
{
|
|
"completion_length": 2234.326416015625,
|
|
"epoch": 0.9075342465753424,
|
|
"grad_norm": 0.15764550864696503,
|
|
"kl": 0.032470703125,
|
|
"learning_rate": 1.3680275190387675e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.7705516219139099,
|
|
"reward_std": 0.4592142701148987,
|
|
"rewards/cosine_scaled_reward": 0.05527384765446186,
|
|
"rewards/format_reward": 0.7152777910232544,
|
|
"step": 265
|
|
},
|
|
{
|
|
"completion_length": 2849.0347900390625,
|
|
"epoch": 0.910958904109589,
|
|
"grad_norm": 0.13773973286151886,
|
|
"kl": 0.03289794921875,
|
|
"learning_rate": 1.3475690004005097e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5077018439769745,
|
|
"reward_std": 0.3785110265016556,
|
|
"rewards/cosine_scaled_reward": -0.013131474610418081,
|
|
"rewards/format_reward": 0.5208333432674408,
|
|
"step": 266
|
|
},
|
|
{
|
|
"completion_length": 3086.2431640625,
|
|
"epoch": 0.9143835616438356,
|
|
"grad_norm": 0.1191553846001625,
|
|
"kl": 0.0340576171875,
|
|
"learning_rate": 1.3276726544494571e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.38693176954984665,
|
|
"reward_std": 0.45597271621227264,
|
|
"rewards/cosine_scaled_reward": -0.1269571604207158,
|
|
"rewards/format_reward": 0.513888880610466,
|
|
"step": 267
|
|
},
|
|
{
|
|
"completion_length": 2707.8681640625,
|
|
"epoch": 0.9178082191780822,
|
|
"grad_norm": 0.1543823927640915,
|
|
"kl": 0.0347900390625,
|
|
"learning_rate": 1.308341174832359e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.6485514044761658,
|
|
"reward_std": 0.4456641525030136,
|
|
"rewards/cosine_scaled_reward": 0.03744027949869633,
|
|
"rewards/format_reward": 0.6111111044883728,
|
|
"step": 268
|
|
},
|
|
{
|
|
"completion_length": 2430.388916015625,
|
|
"epoch": 0.9212328767123288,
|
|
"grad_norm": 0.20193035900592804,
|
|
"kl": 0.040283203125,
|
|
"learning_rate": 1.2895771787221088e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.5673209726810455,
|
|
"reward_std": 0.47004686295986176,
|
|
"rewards/cosine_scaled_reward": -0.009067919105291367,
|
|
"rewards/format_reward": 0.5763888955116272,
|
|
"step": 269
|
|
},
|
|
{
|
|
"completion_length": 2542.5208740234375,
|
|
"epoch": 0.9246575342465754,
|
|
"grad_norm": 0.14499835669994354,
|
|
"kl": 0.036376953125,
|
|
"learning_rate": 1.2713832064634125e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.6222769916057587,
|
|
"reward_std": 0.46079638600349426,
|
|
"rewards/cosine_scaled_reward": 0.05283256620168686,
|
|
"rewards/format_reward": 0.5694444477558136,
|
|
"step": 270
|
|
},
|
|
{
|
|
"completion_length": 2701.8056640625,
|
|
"epoch": 0.928082191780822,
|
|
"grad_norm": 0.16896043717861176,
|
|
"kl": 0.0341796875,
|
|
"learning_rate": 1.2537617212288742e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.36604734510183334,
|
|
"reward_std": 0.361560583114624,
|
|
"rewards/cosine_scaled_reward": -0.0922860149294138,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 271
|
|
},
|
|
{
|
|
"completion_length": 2764.4583740234375,
|
|
"epoch": 0.9315068493150684,
|
|
"grad_norm": 0.16842736303806305,
|
|
"kl": 0.0345458984375,
|
|
"learning_rate": 1.2367151086855187e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5705253630876541,
|
|
"reward_std": 0.5290426909923553,
|
|
"rewards/cosine_scaled_reward": 0.07746978849172592,
|
|
"rewards/format_reward": 0.4930555522441864,
|
|
"step": 272
|
|
},
|
|
{
|
|
"completion_length": 2728.6597900390625,
|
|
"epoch": 0.934931506849315,
|
|
"grad_norm": 0.1411057859659195,
|
|
"kl": 0.0361328125,
|
|
"learning_rate": 1.220245676671809e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5213780552148819,
|
|
"reward_std": 0.5097576379776001,
|
|
"rewards/cosine_scaled_reward": -0.02723303623497486,
|
|
"rewards/format_reward": 0.5486111044883728,
|
|
"step": 273
|
|
},
|
|
{
|
|
"completion_length": 3240.3055419921875,
|
|
"epoch": 0.9383561643835616,
|
|
"grad_norm": 0.1661859005689621,
|
|
"kl": 0.03228759765625,
|
|
"learning_rate": 1.2043556548852063e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.25158608704805374,
|
|
"reward_std": 0.4693699926137924,
|
|
"rewards/cosine_scaled_reward": -0.09563614055514336,
|
|
"rewards/format_reward": 0.3472222238779068,
|
|
"step": 274
|
|
},
|
|
{
|
|
"completion_length": 2608.8472900390625,
|
|
"epoch": 0.9417808219178082,
|
|
"grad_norm": 0.14709579944610596,
|
|
"kl": 0.0390625,
|
|
"learning_rate": 1.1890471945802999e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.5507668703794479,
|
|
"reward_std": 0.32081814110279083,
|
|
"rewards/cosine_scaled_reward": 0.0021557584404945374,
|
|
"rewards/format_reward": 0.548611119389534,
|
|
"step": 275
|
|
},
|
|
{
|
|
"completion_length": 2953.8055419921875,
|
|
"epoch": 0.9452054794520548,
|
|
"grad_norm": 0.13219843804836273,
|
|
"kl": 0.035888671875,
|
|
"learning_rate": 1.1743223682775649e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.4363091439008713,
|
|
"reward_std": 0.5200669467449188,
|
|
"rewards/cosine_scaled_reward": -0.0428575212135911,
|
|
"rewards/format_reward": 0.4791666865348816,
|
|
"step": 276
|
|
},
|
|
{
|
|
"completion_length": 2877.375,
|
|
"epoch": 0.9486301369863014,
|
|
"grad_norm": 0.1316324770450592,
|
|
"kl": 0.038818359375,
|
|
"learning_rate": 1.160183169482775e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.5365364849567413,
|
|
"reward_std": 0.5149263441562653,
|
|
"rewards/cosine_scaled_reward": 0.05042533949017525,
|
|
"rewards/format_reward": 0.4861111342906952,
|
|
"step": 277
|
|
},
|
|
{
|
|
"completion_length": 2528.6944580078125,
|
|
"epoch": 0.952054794520548,
|
|
"grad_norm": 0.21519577503204346,
|
|
"kl": 0.03662109375,
|
|
"learning_rate": 1.1466315124171128e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.4645202308893204,
|
|
"reward_std": 0.4477800279855728,
|
|
"rewards/cosine_scaled_reward": -0.07714640907943249,
|
|
"rewards/format_reward": 0.5416666716337204,
|
|
"step": 278
|
|
},
|
|
{
|
|
"completion_length": 2701.9862060546875,
|
|
"epoch": 0.9554794520547946,
|
|
"grad_norm": 0.17575909197330475,
|
|
"kl": 0.0399169921875,
|
|
"learning_rate": 1.1336692317580158e-07,
|
|
"loss": 0.0016,
|
|
"reward": 0.40750324726104736,
|
|
"reward_std": 0.4330911487340927,
|
|
"rewards/cosine_scaled_reward": -0.07166343554854393,
|
|
"rewards/format_reward": 0.4791666716337204,
|
|
"step": 279
|
|
},
|
|
{
|
|
"completion_length": 2913.5,
|
|
"epoch": 0.958904109589041,
|
|
"grad_norm": 0.1264602094888687,
|
|
"kl": 0.03594970703125,
|
|
"learning_rate": 1.1212980823907929e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.5513550490140915,
|
|
"reward_std": 0.5005469620227814,
|
|
"rewards/cosine_scaled_reward": 0.04441063478589058,
|
|
"rewards/format_reward": 0.5069444626569748,
|
|
"step": 280
|
|
},
|
|
{
|
|
"completion_length": 2647.451416015625,
|
|
"epoch": 0.9623287671232876,
|
|
"grad_norm": 0.2159995436668396,
|
|
"kl": 0.0328369140625,
|
|
"learning_rate": 1.1095197391710362e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5046227127313614,
|
|
"reward_std": 0.4968739449977875,
|
|
"rewards/cosine_scaled_reward": -0.01621063333004713,
|
|
"rewards/format_reward": 0.5208333283662796,
|
|
"step": 281
|
|
},
|
|
{
|
|
"completion_length": 2636.2708740234375,
|
|
"epoch": 0.9657534246575342,
|
|
"grad_norm": 0.16840696334838867,
|
|
"kl": 0.0361328125,
|
|
"learning_rate": 1.0983357966978745e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.4477865546941757,
|
|
"reward_std": 0.47679105401039124,
|
|
"rewards/cosine_scaled_reward": -0.0730467566754669,
|
|
"rewards/format_reward": 0.5208333283662796,
|
|
"step": 282
|
|
},
|
|
{
|
|
"completion_length": 2957.4930419921875,
|
|
"epoch": 0.9691780821917808,
|
|
"grad_norm": 0.14714959263801575,
|
|
"kl": 0.03460693359375,
|
|
"learning_rate": 1.0877477690980931e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.35540203750133514,
|
|
"reward_std": 0.47071878612041473,
|
|
"rewards/cosine_scaled_reward": -0.09598685055971146,
|
|
"rewards/format_reward": 0.4513888955116272,
|
|
"step": 283
|
|
},
|
|
{
|
|
"completion_length": 2786.5208740234375,
|
|
"epoch": 0.9726027397260274,
|
|
"grad_norm": 0.1734078973531723,
|
|
"kl": 0.044677734375,
|
|
"learning_rate": 1.0777570898211405e-07,
|
|
"loss": 0.0018,
|
|
"reward": 0.3655647486448288,
|
|
"reward_std": 0.4497402310371399,
|
|
"rewards/cosine_scaled_reward": -0.05110191088169813,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 284
|
|
},
|
|
{
|
|
"completion_length": 2854.5069580078125,
|
|
"epoch": 0.976027397260274,
|
|
"grad_norm": 0.140737384557724,
|
|
"kl": 0.0367431640625,
|
|
"learning_rate": 1.068365111445064e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.3777775317430496,
|
|
"reward_std": 0.4247249662876129,
|
|
"rewards/cosine_scaled_reward": -0.09444468468427658,
|
|
"rewards/format_reward": 0.4722222238779068,
|
|
"step": 285
|
|
},
|
|
{
|
|
"completion_length": 2879.7501220703125,
|
|
"epoch": 0.9794520547945206,
|
|
"grad_norm": 0.15024779736995697,
|
|
"kl": 0.0333251953125,
|
|
"learning_rate": 1.0595731054933934e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.24627424031496048,
|
|
"reward_std": 0.3542858809232712,
|
|
"rewards/cosine_scaled_reward": -0.12178133055567741,
|
|
"rewards/format_reward": 0.3680555671453476,
|
|
"step": 286
|
|
},
|
|
{
|
|
"completion_length": 2901.6944580078125,
|
|
"epoch": 0.9828767123287672,
|
|
"grad_norm": 0.12313710153102875,
|
|
"kl": 0.03375244140625,
|
|
"learning_rate": 1.0513822622629978e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5626899600028992,
|
|
"reward_std": 0.47839629650115967,
|
|
"rewards/cosine_scaled_reward": 0.06268996931612492,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 287
|
|
},
|
|
{
|
|
"completion_length": 2720.4097900390625,
|
|
"epoch": 0.9863013698630136,
|
|
"grad_norm": 0.13372260332107544,
|
|
"kl": 0.03326416015625,
|
|
"learning_rate": 1.0437936906629334e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.42101193219423294,
|
|
"reward_std": 0.44517213106155396,
|
|
"rewards/cosine_scaled_reward": -0.07898806827142835,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 288
|
|
},
|
|
{
|
|
"completion_length": 2722.4305419921875,
|
|
"epoch": 0.9897260273972602,
|
|
"grad_norm": 0.18578775227069855,
|
|
"kl": 0.03375244140625,
|
|
"learning_rate": 1.0368084180643224e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.574064090847969,
|
|
"reward_std": 0.4914132207632065,
|
|
"rewards/cosine_scaled_reward": 0.018508493900299072,
|
|
"rewards/format_reward": 0.5555555522441864,
|
|
"step": 289
|
|
},
|
|
{
|
|
"completion_length": 2860.0208740234375,
|
|
"epoch": 0.9931506849315068,
|
|
"grad_norm": 0.16790378093719482,
|
|
"kl": 0.037841796875,
|
|
"learning_rate": 1.0304273901612565e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.4475916475057602,
|
|
"reward_std": 0.35639651119709015,
|
|
"rewards/cosine_scaled_reward": -0.05240832082927227,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 290
|
|
},
|
|
{
|
|
"completion_length": 2500.354248046875,
|
|
"epoch": 0.9965753424657534,
|
|
"grad_norm": 0.19861853122711182,
|
|
"kl": 0.0384521484375,
|
|
"learning_rate": 1.0246514708427701e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.5283551514148712,
|
|
"reward_std": 0.4327033758163452,
|
|
"rewards/cosine_scaled_reward": -0.03414486348628998,
|
|
"rewards/format_reward": 0.5625,
|
|
"step": 291
|
|
},
|
|
{
|
|
"completion_length": 3228.75,
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.14693370461463928,
|
|
"kl": 0.03466796875,
|
|
"learning_rate": 1.0194814420758804e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.8688084781169891,
|
|
"reward_std": 0.7189086824655533,
|
|
"rewards/cosine_scaled_reward": -0.00619150698184967,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 292
|
|
},
|
|
{
|
|
"completion_length": 2950.875,
|
|
"epoch": 1.0034246575342465,
|
|
"grad_norm": 0.1779322773218155,
|
|
"kl": 0.0443115234375,
|
|
"learning_rate": 1.0149180037997228e-07,
|
|
"loss": 0.0018,
|
|
"reward": 0.3629312068223953,
|
|
"reward_std": 0.47756847739219666,
|
|
"rewards/cosine_scaled_reward": -0.06067990604788065,
|
|
"rewards/format_reward": 0.423611119389534,
|
|
"step": 293
|
|
},
|
|
{
|
|
"completion_length": 2664.9375,
|
|
"epoch": 1.0068493150684932,
|
|
"grad_norm": 0.140543133020401,
|
|
"kl": 0.03173828125,
|
|
"learning_rate": 1.0109617738307911e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.42808833718299866,
|
|
"reward_std": 0.38089750707149506,
|
|
"rewards/cosine_scaled_reward": -0.08580057881772518,
|
|
"rewards/format_reward": 0.513888880610466,
|
|
"step": 294
|
|
},
|
|
{
|
|
"completion_length": 2873.826416015625,
|
|
"epoch": 1.0102739726027397,
|
|
"grad_norm": 0.14133252203464508,
|
|
"kl": 0.032958984375,
|
|
"learning_rate": 1.0076132877792932e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.463130921125412,
|
|
"reward_std": 0.6191278696060181,
|
|
"rewards/cosine_scaled_reward": -0.03686907887458801,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 295
|
|
},
|
|
{
|
|
"completion_length": 3035.78466796875,
|
|
"epoch": 1.0136986301369864,
|
|
"grad_norm": 0.12591783702373505,
|
|
"kl": 0.03314208984375,
|
|
"learning_rate": 1.0048729989766394e-07,
|
|
"loss": 0.0013,
|
|
"reward": 0.5283119380474091,
|
|
"reward_std": 0.4726581275463104,
|
|
"rewards/cosine_scaled_reward": 0.06997863575816154,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 296
|
|
},
|
|
{
|
|
"completion_length": 2424.90283203125,
|
|
"epoch": 1.0171232876712328,
|
|
"grad_norm": 0.14170390367507935,
|
|
"kl": 0.0355224609375,
|
|
"learning_rate": 1.002741278414069e-07,
|
|
"loss": 0.0014,
|
|
"reward": 0.8962737321853638,
|
|
"reward_std": 0.46189258992671967,
|
|
"rewards/cosine_scaled_reward": 0.20877373218536377,
|
|
"rewards/format_reward": 0.6875000298023224,
|
|
"step": 297
|
|
},
|
|
{
|
|
"completion_length": 2943.513916015625,
|
|
"epoch": 1.0205479452054795,
|
|
"grad_norm": 0.1521555632352829,
|
|
"kl": 0.0386962890625,
|
|
"learning_rate": 1.0012184146924223e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.26369042694568634,
|
|
"reward_std": 0.40857334434986115,
|
|
"rewards/cosine_scaled_reward": -0.15297627449035645,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 298
|
|
},
|
|
{
|
|
"completion_length": 2802.791748046875,
|
|
"epoch": 1.023972602739726,
|
|
"grad_norm": 0.13754691183567047,
|
|
"kl": 0.0372314453125,
|
|
"learning_rate": 1.0003046139830701e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.6113243997097015,
|
|
"reward_std": 0.4505104422569275,
|
|
"rewards/cosine_scaled_reward": 0.055768875405192375,
|
|
"rewards/format_reward": 0.5555555820465088,
|
|
"step": 299
|
|
},
|
|
{
|
|
"completion_length": 2618.9722900390625,
|
|
"epoch": 1.0273972602739727,
|
|
"grad_norm": 0.15953682363033295,
|
|
"kl": 0.0386962890625,
|
|
"learning_rate": 1e-07,
|
|
"loss": 0.0015,
|
|
"reward": 0.6579746007919312,
|
|
"reward_std": 0.4734661132097244,
|
|
"rewards/cosine_scaled_reward": 0.08158569037914276,
|
|
"rewards/format_reward": 0.5763888955116272,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.0273972602739727,
|
|
"step": 300,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.0008326698157195504,
|
|
"train_runtime": 30165.0377,
|
|
"train_samples_per_second": 0.239,
|
|
"train_steps_per_second": 0.01
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 300,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 100,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|