6056 lines
165 KiB
JSON
6056 lines
165 KiB
JSON
{
|
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
|
"episodes": 5000,
|
|
"epochs": 1,
|
|
"batch_size": 8,
|
|
"learning_rate": 5e-06,
|
|
"global_step": 2500,
|
|
"training_loss": 0.005647265207767487,
|
|
"training_reward_curve": {
|
|
"type": "unbounded_scalar",
|
|
"note": "Direct training_reward() scalar. Not comparable to eval_reward.",
|
|
"mean_start": 0.13,
|
|
"mean_end": 0.469
|
|
},
|
|
"eval_reward_before": {
|
|
"Fraud detection": 0.0,
|
|
"Decision accuracy": 0.0,
|
|
"Evidence quality": 0.3333333333333333,
|
|
"Calibration": 0.0,
|
|
"Reasoning quality": 0.8333333333333334
|
|
},
|
|
"eval_reward_after": {
|
|
"Fraud detection": 0.3333333333333333,
|
|
"Decision accuracy": 1.0,
|
|
"Evidence quality": 0.3333333333333333,
|
|
"Calibration": 1.0,
|
|
"Reasoning quality": 0.7916666666666666
|
|
},
|
|
"component_shift": {
|
|
"before": {
|
|
"Fraud detection": 0.0,
|
|
"Decision accuracy": 0.0,
|
|
"Evidence quality": 0.3333333333333333,
|
|
"Calibration": 0.0,
|
|
"Reasoning quality": 0.8333333333333334
|
|
},
|
|
"after": {
|
|
"Fraud detection": 0.3333333333333333,
|
|
"Decision accuracy": 1.0,
|
|
"Evidence quality": 0.3333333333333333,
|
|
"Calibration": 1.0,
|
|
"Reasoning quality": 0.7916666666666666
|
|
}
|
|
},
|
|
"log_history": [
|
|
{
|
|
"loss": 0.0008,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 4.9900000000000005e-06,
|
|
"rewards/reward_fn": 0.12996437549591064,
|
|
"reward": 0.12996437549591064,
|
|
"reward_std": 0.15663783259224145,
|
|
"completion_length": 72.6125,
|
|
"kl": 0.01886011641472578,
|
|
"epoch": 0.002,
|
|
"step": 5
|
|
},
|
|
{
|
|
"loss": 0.0017,
|
|
"grad_norm": 25.375,
|
|
"learning_rate": 4.980000000000001e-06,
|
|
"rewards/reward_fn": 0.28686500089243056,
|
|
"reward": 0.28686500089243056,
|
|
"reward_std": 0.1139603321440518,
|
|
"completion_length": 71.45,
|
|
"kl": 0.04206784293055534,
|
|
"epoch": 0.004,
|
|
"step": 10
|
|
},
|
|
{
|
|
"loss": 0.0018,
|
|
"grad_norm": 26.125,
|
|
"learning_rate": 4.970000000000001e-06,
|
|
"rewards/reward_fn": 0.33125562937930225,
|
|
"reward": 0.33125562937930225,
|
|
"reward_std": 0.10047997636720538,
|
|
"completion_length": 69.7625,
|
|
"kl": 0.04418694227933884,
|
|
"epoch": 0.006,
|
|
"step": 15
|
|
},
|
|
{
|
|
"loss": 0.0024,
|
|
"grad_norm": 29.5,
|
|
"learning_rate": 4.960000000000001e-06,
|
|
"rewards/reward_fn": 0.38998999876203017,
|
|
"reward": 0.38998999876203017,
|
|
"reward_std": 0.05469522252678871,
|
|
"completion_length": 66.0125,
|
|
"kl": 0.061039629578590396,
|
|
"epoch": 0.008,
|
|
"step": 20
|
|
},
|
|
{
|
|
"loss": 0.0121,
|
|
"grad_norm": 105.5,
|
|
"learning_rate": 4.95e-06,
|
|
"rewards/reward_fn": 0.31268125153146686,
|
|
"reward": 0.31268125153146686,
|
|
"reward_std": 0.05678519255015999,
|
|
"completion_length": 68.3625,
|
|
"kl": 0.30179612897336483,
|
|
"epoch": 0.01,
|
|
"step": 25
|
|
},
|
|
{
|
|
"loss": 0.0028,
|
|
"grad_norm": 31.0,
|
|
"learning_rate": 4.94e-06,
|
|
"rewards/reward_fn": 0.2681674983672565,
|
|
"reward": 0.2681674983672565,
|
|
"reward_std": 0.0353069698670879,
|
|
"completion_length": 65.6875,
|
|
"kl": 0.07095254212617874,
|
|
"epoch": 0.012,
|
|
"step": 30
|
|
},
|
|
{
|
|
"loss": 0.0041,
|
|
"grad_norm": 26.25,
|
|
"learning_rate": 4.93e-06,
|
|
"rewards/reward_fn": 0.3527887500880752,
|
|
"reward": 0.3527887500880752,
|
|
"reward_std": 0.05785412744153291,
|
|
"completion_length": 63.4,
|
|
"kl": 0.10367086306214332,
|
|
"epoch": 0.014,
|
|
"step": 35
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 4.92e-06,
|
|
"rewards/reward_fn": 0.34420499864791054,
|
|
"reward": 0.34420499864791054,
|
|
"reward_std": 0.06693777176551521,
|
|
"completion_length": 63.125,
|
|
"kl": 0.11868430003523826,
|
|
"epoch": 0.016,
|
|
"step": 40
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 4.9100000000000004e-06,
|
|
"rewards/reward_fn": 0.19720500293187798,
|
|
"reward": 0.19720500293187798,
|
|
"reward_std": 0.09952702496666462,
|
|
"completion_length": 62.625,
|
|
"kl": 0.17344569861888887,
|
|
"epoch": 0.018,
|
|
"step": 45
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 4.9000000000000005e-06,
|
|
"rewards/reward_fn": 0.37614874897699335,
|
|
"reward": 0.37614874897699335,
|
|
"reward_std": 0.05041897173505276,
|
|
"completion_length": 63.65,
|
|
"kl": 0.14020639136433602,
|
|
"epoch": 0.02,
|
|
"step": 50
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 4.890000000000001e-06,
|
|
"rewards/reward_fn": 0.20948062620591373,
|
|
"reward": 0.20948062620591373,
|
|
"reward_std": 0.11797074675559997,
|
|
"completion_length": 65.2,
|
|
"kl": 0.12582977935671807,
|
|
"epoch": 0.022,
|
|
"step": 55
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 4.880000000000001e-06,
|
|
"rewards/reward_fn": 0.2675649975077249,
|
|
"reward": 0.2675649975077249,
|
|
"reward_std": 0.10371575457975268,
|
|
"completion_length": 65.775,
|
|
"kl": 0.12268042787909508,
|
|
"epoch": 0.024,
|
|
"step": 60
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 27.25,
|
|
"learning_rate": 4.87e-06,
|
|
"rewards/reward_fn": 0.17759875237825326,
|
|
"reward": 0.17759875237825326,
|
|
"reward_std": 0.09766199714504183,
|
|
"completion_length": 64.4375,
|
|
"kl": 0.1204748086631298,
|
|
"epoch": 0.026,
|
|
"step": 65
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 4.86e-06,
|
|
"rewards/reward_fn": 0.3662331223487854,
|
|
"reward": 0.3662331223487854,
|
|
"reward_std": 0.13943021052982657,
|
|
"completion_length": 65.525,
|
|
"kl": 0.1409070000052452,
|
|
"epoch": 0.028,
|
|
"step": 70
|
|
},
|
|
{
|
|
"loss": 0.0041,
|
|
"grad_norm": 28.375,
|
|
"learning_rate": 4.85e-06,
|
|
"rewards/reward_fn": 0.35237686783075334,
|
|
"reward": 0.35237686783075334,
|
|
"reward_std": 0.14571735821664333,
|
|
"completion_length": 67.475,
|
|
"kl": 0.10161374881863594,
|
|
"epoch": 0.03,
|
|
"step": 75
|
|
},
|
|
{
|
|
"loss": 0.0081,
|
|
"grad_norm": 29.0,
|
|
"learning_rate": 4.84e-06,
|
|
"rewards/reward_fn": 0.34102812483906747,
|
|
"reward": 0.34102812483906747,
|
|
"reward_std": 0.12838326790370047,
|
|
"completion_length": 66.0875,
|
|
"kl": 0.2030480533838272,
|
|
"epoch": 0.032,
|
|
"step": 80
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 29.0,
|
|
"learning_rate": 4.83e-06,
|
|
"rewards/reward_fn": 0.38738313168287275,
|
|
"reward": 0.38738313168287275,
|
|
"reward_std": 0.08913053697906434,
|
|
"completion_length": 68.675,
|
|
"kl": 0.10850983113050461,
|
|
"epoch": 0.034,
|
|
"step": 85
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 26.75,
|
|
"learning_rate": 4.8200000000000004e-06,
|
|
"rewards/reward_fn": 0.37884062230587007,
|
|
"reward": 0.37884062230587007,
|
|
"reward_std": 0.11409456301480532,
|
|
"completion_length": 70.4,
|
|
"kl": 0.1510870262980461,
|
|
"epoch": 0.036,
|
|
"step": 90
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 28.5,
|
|
"learning_rate": 4.8100000000000005e-06,
|
|
"rewards/reward_fn": 0.3212599984370172,
|
|
"reward": 0.3212599984370172,
|
|
"reward_std": 0.11497495661024004,
|
|
"completion_length": 68.7875,
|
|
"kl": 0.10497871562838554,
|
|
"epoch": 0.038,
|
|
"step": 95
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 25.25,
|
|
"learning_rate": 4.800000000000001e-06,
|
|
"rewards/reward_fn": 0.2606187478464562,
|
|
"reward": 0.2606187478464562,
|
|
"reward_std": 0.11127406840678304,
|
|
"completion_length": 70.1125,
|
|
"kl": 0.13605541437864305,
|
|
"epoch": 0.04,
|
|
"step": 100
|
|
},
|
|
{
|
|
"loss": 0.004,
|
|
"grad_norm": 159.0,
|
|
"learning_rate": 4.79e-06,
|
|
"rewards/reward_fn": 0.3490543693304062,
|
|
"reward": 0.3490543693304062,
|
|
"reward_std": 0.17655093723442405,
|
|
"completion_length": 71.875,
|
|
"kl": 0.10081770941615105,
|
|
"epoch": 0.042,
|
|
"step": 105
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 24.75,
|
|
"learning_rate": 4.78e-06,
|
|
"rewards/reward_fn": 0.36398687958717346,
|
|
"reward": 0.36398687958717346,
|
|
"reward_std": 0.15169972144067287,
|
|
"completion_length": 72.0625,
|
|
"kl": 0.10862671732902526,
|
|
"epoch": 0.044,
|
|
"step": 110
|
|
},
|
|
{
|
|
"loss": 0.0037,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 4.77e-06,
|
|
"rewards/reward_fn": 0.3380812492221594,
|
|
"reward": 0.3380812492221594,
|
|
"reward_std": 0.1447692496702075,
|
|
"completion_length": 73.5125,
|
|
"kl": 0.09267130568623543,
|
|
"epoch": 0.046,
|
|
"step": 115
|
|
},
|
|
{
|
|
"loss": 0.0045,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 4.76e-06,
|
|
"rewards/reward_fn": 0.39886312037706373,
|
|
"reward": 0.39886312037706373,
|
|
"reward_std": 0.13123975209891797,
|
|
"completion_length": 75.6,
|
|
"kl": 0.11189883872866631,
|
|
"epoch": 0.048,
|
|
"step": 120
|
|
},
|
|
{
|
|
"loss": 0.0038,
|
|
"grad_norm": 25.125,
|
|
"learning_rate": 4.75e-06,
|
|
"rewards/reward_fn": 0.4117881193757057,
|
|
"reward": 0.4117881193757057,
|
|
"reward_std": 0.1342116856947541,
|
|
"completion_length": 77.5875,
|
|
"kl": 0.09554292932152748,
|
|
"epoch": 0.05,
|
|
"step": 125
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 4.74e-06,
|
|
"rewards/reward_fn": 0.43608374893665314,
|
|
"reward": 0.43608374893665314,
|
|
"reward_std": 0.10520601402968169,
|
|
"completion_length": 77.4625,
|
|
"kl": 0.10588956028223037,
|
|
"epoch": 0.052,
|
|
"step": 130
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 4.7300000000000005e-06,
|
|
"rewards/reward_fn": 0.4558625012636185,
|
|
"reward": 0.4558625012636185,
|
|
"reward_std": 0.06957857511006296,
|
|
"completion_length": 78.1875,
|
|
"kl": 0.12035084962844848,
|
|
"epoch": 0.054,
|
|
"step": 135
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 4.7200000000000005e-06,
|
|
"rewards/reward_fn": 0.40547625310719015,
|
|
"reward": 0.40547625310719015,
|
|
"reward_std": 0.09707445108797401,
|
|
"completion_length": 77.9,
|
|
"kl": 0.11794439107179641,
|
|
"epoch": 0.056,
|
|
"step": 140
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 4.71e-06,
|
|
"rewards/reward_fn": 0.33485061936080457,
|
|
"reward": 0.33485061936080457,
|
|
"reward_std": 0.09993105094181373,
|
|
"completion_length": 79.125,
|
|
"kl": 0.10510653629899025,
|
|
"epoch": 0.058,
|
|
"step": 145
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 4.7e-06,
|
|
"rewards/reward_fn": 0.3309887422248721,
|
|
"reward": 0.3309887422248721,
|
|
"reward_std": 0.06645656500477344,
|
|
"completion_length": 78.6125,
|
|
"kl": 0.10446615666151046,
|
|
"epoch": 0.06,
|
|
"step": 150
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 4.69e-06,
|
|
"rewards/reward_fn": 0.3643562486220617,
|
|
"reward": 0.3643562486220617,
|
|
"reward_std": 0.06148011786863208,
|
|
"completion_length": 76.725,
|
|
"kl": 0.1252933219075203,
|
|
"epoch": 0.062,
|
|
"step": 155
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 25.625,
|
|
"learning_rate": 4.680000000000001e-06,
|
|
"rewards/reward_fn": 0.3345375001837965,
|
|
"reward": 0.3345375001837965,
|
|
"reward_std": 0.0954028001986444,
|
|
"completion_length": 75.45,
|
|
"kl": 0.12887531742453576,
|
|
"epoch": 0.064,
|
|
"step": 160
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 4.670000000000001e-06,
|
|
"rewards/reward_fn": 0.3415462435106747,
|
|
"reward": 0.3415462435106747,
|
|
"reward_std": 0.07221131722908466,
|
|
"completion_length": 75.9375,
|
|
"kl": 0.11467845514416694,
|
|
"epoch": 0.066,
|
|
"step": 165
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 4.66e-06,
|
|
"rewards/reward_fn": 0.38888062462210654,
|
|
"reward": 0.38888062462210654,
|
|
"reward_std": 0.11567582259885967,
|
|
"completion_length": 76.375,
|
|
"kl": 0.12577201426029205,
|
|
"epoch": 0.068,
|
|
"step": 170
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 23.75,
|
|
"learning_rate": 4.65e-06,
|
|
"rewards/reward_fn": 0.27768062038812785,
|
|
"reward": 0.27768062038812785,
|
|
"reward_std": 0.04161863022018224,
|
|
"completion_length": 75.275,
|
|
"kl": 0.11094974502921104,
|
|
"epoch": 0.07,
|
|
"step": 175
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 4.6400000000000005e-06,
|
|
"rewards/reward_fn": 0.31952374114189297,
|
|
"reward": 0.31952374114189297,
|
|
"reward_std": 0.07254288513213396,
|
|
"completion_length": 77.3625,
|
|
"kl": 0.11998703256249428,
|
|
"epoch": 0.072,
|
|
"step": 180
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 25.125,
|
|
"learning_rate": 4.6300000000000006e-06,
|
|
"rewards/reward_fn": 0.34861375503242015,
|
|
"reward": 0.34861375503242015,
|
|
"reward_std": 0.10978359731379897,
|
|
"completion_length": 75.9625,
|
|
"kl": 0.15532704591751098,
|
|
"epoch": 0.074,
|
|
"step": 185
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 4.620000000000001e-06,
|
|
"rewards/reward_fn": 0.3256543739698827,
|
|
"reward": 0.3256543739698827,
|
|
"reward_std": 0.07666705958545209,
|
|
"completion_length": 76.825,
|
|
"kl": 0.1190482571721077,
|
|
"epoch": 0.076,
|
|
"step": 190
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 26.25,
|
|
"learning_rate": 4.610000000000001e-06,
|
|
"rewards/reward_fn": 0.3306443728506565,
|
|
"reward": 0.3306443728506565,
|
|
"reward_std": 0.12050404832698405,
|
|
"completion_length": 76.2375,
|
|
"kl": 0.11852994039654732,
|
|
"epoch": 0.078,
|
|
"step": 195
|
|
},
|
|
{
|
|
"loss": 0.0041,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 4.600000000000001e-06,
|
|
"rewards/reward_fn": 0.33713062135502697,
|
|
"reward": 0.33713062135502697,
|
|
"reward_std": 0.09549199095927179,
|
|
"completion_length": 75.725,
|
|
"kl": 0.10264018401503563,
|
|
"epoch": 0.08,
|
|
"step": 200
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 4.590000000000001e-06,
|
|
"rewards/reward_fn": 0.35562500059604646,
|
|
"reward": 0.35562500059604646,
|
|
"reward_std": 0.11331822639331221,
|
|
"completion_length": 76.55,
|
|
"kl": 0.11866414025425912,
|
|
"epoch": 0.082,
|
|
"step": 205
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 25.0,
|
|
"learning_rate": 4.58e-06,
|
|
"rewards/reward_fn": 0.3766068793833256,
|
|
"reward": 0.3766068793833256,
|
|
"reward_std": 0.10549901332706213,
|
|
"completion_length": 73.0875,
|
|
"kl": 0.14588759168982507,
|
|
"epoch": 0.084,
|
|
"step": 210
|
|
},
|
|
{
|
|
"loss": 0.004,
|
|
"grad_norm": 26.75,
|
|
"learning_rate": 4.57e-06,
|
|
"rewards/reward_fn": 0.38299812823534013,
|
|
"reward": 0.38299812823534013,
|
|
"reward_std": 0.09799009431153535,
|
|
"completion_length": 73.625,
|
|
"kl": 0.10109216421842575,
|
|
"epoch": 0.086,
|
|
"step": 215
|
|
},
|
|
{
|
|
"loss": 0.0041,
|
|
"grad_norm": 28.0,
|
|
"learning_rate": 4.56e-06,
|
|
"rewards/reward_fn": 0.37175500094890596,
|
|
"reward": 0.37175500094890596,
|
|
"reward_std": 0.10488205360015854,
|
|
"completion_length": 72.3375,
|
|
"kl": 0.10273240357637406,
|
|
"epoch": 0.088,
|
|
"step": 220
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 4.5500000000000005e-06,
|
|
"rewards/reward_fn": 0.3897412523627281,
|
|
"reward": 0.3897412523627281,
|
|
"reward_std": 0.14026562571525575,
|
|
"completion_length": 74.6125,
|
|
"kl": 0.1044769786298275,
|
|
"epoch": 0.09,
|
|
"step": 225
|
|
},
|
|
{
|
|
"loss": 0.0039,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 4.540000000000001e-06,
|
|
"rewards/reward_fn": 0.41331062465906143,
|
|
"reward": 0.41331062465906143,
|
|
"reward_std": 0.09166353384498507,
|
|
"completion_length": 73.95,
|
|
"kl": 0.0984603650867939,
|
|
"epoch": 0.092,
|
|
"step": 230
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 25.25,
|
|
"learning_rate": 4.530000000000001e-06,
|
|
"rewards/reward_fn": 0.3803025022149086,
|
|
"reward": 0.3803025022149086,
|
|
"reward_std": 0.11351661148946732,
|
|
"completion_length": 74.6,
|
|
"kl": 0.1073625199496746,
|
|
"epoch": 0.094,
|
|
"step": 235
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 40.25,
|
|
"learning_rate": 4.520000000000001e-06,
|
|
"rewards/reward_fn": 0.37668500542640687,
|
|
"reward": 0.37668500542640687,
|
|
"reward_std": 0.14612680403515696,
|
|
"completion_length": 73.75,
|
|
"kl": 0.1317383050918579,
|
|
"epoch": 0.096,
|
|
"step": 240
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 28.25,
|
|
"learning_rate": 4.510000000000001e-06,
|
|
"rewards/reward_fn": 0.37795437276363375,
|
|
"reward": 0.37795437276363375,
|
|
"reward_std": 0.11509951823391021,
|
|
"completion_length": 74.675,
|
|
"kl": 0.13217320367693902,
|
|
"epoch": 0.098,
|
|
"step": 245
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 4.5e-06,
|
|
"rewards/reward_fn": 0.4587212562561035,
|
|
"reward": 0.4587212562561035,
|
|
"reward_std": 0.05489388951100409,
|
|
"completion_length": 72.6375,
|
|
"kl": 0.14324783831834792,
|
|
"epoch": 0.1,
|
|
"step": 250
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 28.25,
|
|
"learning_rate": 4.49e-06,
|
|
"rewards/reward_fn": 0.36413499563932417,
|
|
"reward": 0.36413499563932417,
|
|
"reward_std": 0.14084610100835562,
|
|
"completion_length": 73.175,
|
|
"kl": 0.1550510197877884,
|
|
"epoch": 0.102,
|
|
"step": 255
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 4.48e-06,
|
|
"rewards/reward_fn": 0.41378499418497083,
|
|
"reward": 0.41378499418497083,
|
|
"reward_std": 0.09481649375520647,
|
|
"completion_length": 75.3625,
|
|
"kl": 0.1283886268734932,
|
|
"epoch": 0.104,
|
|
"step": 260
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 27.25,
|
|
"learning_rate": 4.47e-06,
|
|
"rewards/reward_fn": 0.45114499926567075,
|
|
"reward": 0.45114499926567075,
|
|
"reward_std": 0.04992847095709294,
|
|
"completion_length": 74.15,
|
|
"kl": 0.1244954839348793,
|
|
"epoch": 0.106,
|
|
"step": 265
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 27.625,
|
|
"learning_rate": 4.4600000000000005e-06,
|
|
"rewards/reward_fn": 0.4430062472820282,
|
|
"reward": 0.4430062472820282,
|
|
"reward_std": 0.06626461511477828,
|
|
"completion_length": 73.775,
|
|
"kl": 0.11374877691268921,
|
|
"epoch": 0.108,
|
|
"step": 270
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 4.450000000000001e-06,
|
|
"rewards/reward_fn": 0.3933093786239624,
|
|
"reward": 0.3933093786239624,
|
|
"reward_std": 0.07578937450889497,
|
|
"completion_length": 73.4875,
|
|
"kl": 0.11540523990988731,
|
|
"epoch": 0.11,
|
|
"step": 275
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 26.125,
|
|
"learning_rate": 4.440000000000001e-06,
|
|
"rewards/reward_fn": 0.34966062209568916,
|
|
"reward": 0.34966062209568916,
|
|
"reward_std": 0.12435578326694667,
|
|
"completion_length": 75.5375,
|
|
"kl": 0.13259521648287773,
|
|
"epoch": 0.112,
|
|
"step": 280
|
|
},
|
|
{
|
|
"loss": 0.0039,
|
|
"grad_norm": 24.625,
|
|
"learning_rate": 4.430000000000001e-06,
|
|
"rewards/reward_fn": 0.40091561824083327,
|
|
"reward": 0.40091561824083327,
|
|
"reward_std": 0.10028558413032443,
|
|
"completion_length": 75.9875,
|
|
"kl": 0.09868917912244797,
|
|
"epoch": 0.114,
|
|
"step": 285
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 4.42e-06,
|
|
"rewards/reward_fn": 0.3874093756079674,
|
|
"reward": 0.3874093756079674,
|
|
"reward_std": 0.11204615456517786,
|
|
"completion_length": 75.7,
|
|
"kl": 0.1249419741332531,
|
|
"epoch": 0.116,
|
|
"step": 290
|
|
},
|
|
{
|
|
"loss": 0.0039,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 4.41e-06,
|
|
"rewards/reward_fn": 0.40853061974048616,
|
|
"reward": 0.40853061974048616,
|
|
"reward_std": 0.1080384837463498,
|
|
"completion_length": 74.325,
|
|
"kl": 0.0986015535891056,
|
|
"epoch": 0.118,
|
|
"step": 295
|
|
},
|
|
{
|
|
"loss": 0.0037,
|
|
"grad_norm": 24.75,
|
|
"learning_rate": 4.4e-06,
|
|
"rewards/reward_fn": 0.41577999889850614,
|
|
"reward": 0.41577999889850614,
|
|
"reward_std": 0.10275121238082648,
|
|
"completion_length": 71.7125,
|
|
"kl": 0.09238781034946442,
|
|
"epoch": 0.12,
|
|
"step": 300
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 31.0,
|
|
"learning_rate": 4.39e-06,
|
|
"rewards/reward_fn": 0.4253518760204315,
|
|
"reward": 0.4253518760204315,
|
|
"reward_std": 0.09691239511594177,
|
|
"completion_length": 72.15,
|
|
"kl": 0.11536458730697632,
|
|
"epoch": 0.122,
|
|
"step": 305
|
|
},
|
|
{
|
|
"loss": 0.0039,
|
|
"grad_norm": 29.875,
|
|
"learning_rate": 4.38e-06,
|
|
"rewards/reward_fn": 0.437681245803833,
|
|
"reward": 0.437681245803833,
|
|
"reward_std": 0.08544279797933996,
|
|
"completion_length": 73.7875,
|
|
"kl": 0.09754163324832917,
|
|
"epoch": 0.124,
|
|
"step": 310
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 4.3700000000000005e-06,
|
|
"rewards/reward_fn": 0.4518656224012375,
|
|
"reward": 0.4518656224012375,
|
|
"reward_std": 0.07031336800428108,
|
|
"completion_length": 72.075,
|
|
"kl": 0.16523725241422654,
|
|
"epoch": 0.126,
|
|
"step": 315
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 4.360000000000001e-06,
|
|
"rewards/reward_fn": 0.4219950050115585,
|
|
"reward": 0.4219950050115585,
|
|
"reward_std": 0.10292997076176107,
|
|
"completion_length": 73.8375,
|
|
"kl": 0.1384974516928196,
|
|
"epoch": 0.128,
|
|
"step": 320
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 30.25,
|
|
"learning_rate": 4.350000000000001e-06,
|
|
"rewards/reward_fn": 0.3595293749123812,
|
|
"reward": 0.3595293749123812,
|
|
"reward_std": 0.09363621571101248,
|
|
"completion_length": 73.2125,
|
|
"kl": 0.11762727722525597,
|
|
"epoch": 0.13,
|
|
"step": 325
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 4.34e-06,
|
|
"rewards/reward_fn": 0.41323124766349795,
|
|
"reward": 0.41323124766349795,
|
|
"reward_std": 0.07829355036374182,
|
|
"completion_length": 75.7375,
|
|
"kl": 0.12051494792103767,
|
|
"epoch": 0.132,
|
|
"step": 330
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 4.33e-06,
|
|
"rewards/reward_fn": 0.4376243770122528,
|
|
"reward": 0.4376243770122528,
|
|
"reward_std": 0.08593541735317559,
|
|
"completion_length": 74.2125,
|
|
"kl": 0.11745435148477554,
|
|
"epoch": 0.134,
|
|
"step": 335
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 4.32e-06,
|
|
"rewards/reward_fn": 0.40846686959266665,
|
|
"reward": 0.40846686959266665,
|
|
"reward_std": 0.08748328550718724,
|
|
"completion_length": 74.1375,
|
|
"kl": 0.1059987798333168,
|
|
"epoch": 0.136,
|
|
"step": 340
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 4.31e-06,
|
|
"rewards/reward_fn": 0.4022818714380264,
|
|
"reward": 0.4022818714380264,
|
|
"reward_std": 0.12827726462855935,
|
|
"completion_length": 73.7375,
|
|
"kl": 0.12174244895577431,
|
|
"epoch": 0.138,
|
|
"step": 345
|
|
},
|
|
{
|
|
"loss": 0.0067,
|
|
"grad_norm": 25.375,
|
|
"learning_rate": 4.3e-06,
|
|
"rewards/reward_fn": 0.390729995071888,
|
|
"reward": 0.390729995071888,
|
|
"reward_std": 0.14277701806277038,
|
|
"completion_length": 75.6625,
|
|
"kl": 0.16858599781990052,
|
|
"epoch": 0.14,
|
|
"step": 350
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 4.2900000000000004e-06,
|
|
"rewards/reward_fn": 0.430366250872612,
|
|
"reward": 0.430366250872612,
|
|
"reward_std": 0.10421461121877655,
|
|
"completion_length": 75.8125,
|
|
"kl": 0.13511769324541092,
|
|
"epoch": 0.142,
|
|
"step": 355
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 4.2800000000000005e-06,
|
|
"rewards/reward_fn": 0.4330950051546097,
|
|
"reward": 0.4330950051546097,
|
|
"reward_std": 0.09782969739753752,
|
|
"completion_length": 74.5875,
|
|
"kl": 0.10710541978478431,
|
|
"epoch": 0.144,
|
|
"step": 360
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 4.270000000000001e-06,
|
|
"rewards/reward_fn": 0.4299006313085556,
|
|
"reward": 0.4299006313085556,
|
|
"reward_std": 0.09049425637349487,
|
|
"completion_length": 74.0125,
|
|
"kl": 0.13614988327026367,
|
|
"epoch": 0.146,
|
|
"step": 365
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 27.75,
|
|
"learning_rate": 4.26e-06,
|
|
"rewards/reward_fn": 0.4390475034713745,
|
|
"reward": 0.4390475034713745,
|
|
"reward_std": 0.08892962010577321,
|
|
"completion_length": 74.075,
|
|
"kl": 0.14286103025078772,
|
|
"epoch": 0.148,
|
|
"step": 370
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 4.25e-06,
|
|
"rewards/reward_fn": 0.4013475000858307,
|
|
"reward": 0.4013475000858307,
|
|
"reward_std": 0.13384937290102245,
|
|
"completion_length": 74.2375,
|
|
"kl": 0.15888455584645272,
|
|
"epoch": 0.15,
|
|
"step": 375
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 4.24e-06,
|
|
"rewards/reward_fn": 0.4265299946069717,
|
|
"reward": 0.4265299946069717,
|
|
"reward_std": 0.1011309385765344,
|
|
"completion_length": 75.3125,
|
|
"kl": 0.10906772464513778,
|
|
"epoch": 0.152,
|
|
"step": 380
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 4.23e-06,
|
|
"rewards/reward_fn": 0.4264387458562851,
|
|
"reward": 0.4264387458562851,
|
|
"reward_std": 0.09354419643059372,
|
|
"completion_length": 75.5625,
|
|
"kl": 0.11007295995950699,
|
|
"epoch": 0.154,
|
|
"step": 385
|
|
},
|
|
{
|
|
"loss": 0.0042,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 4.22e-06,
|
|
"rewards/reward_fn": 0.4492681235074997,
|
|
"reward": 0.4492681235074997,
|
|
"reward_std": 0.06859665396623313,
|
|
"completion_length": 75.8125,
|
|
"kl": 0.10581666082143784,
|
|
"epoch": 0.156,
|
|
"step": 390
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 4.21e-06,
|
|
"rewards/reward_fn": 0.4439112454652786,
|
|
"reward": 0.4439112454652786,
|
|
"reward_std": 0.06631400538608431,
|
|
"completion_length": 76.9375,
|
|
"kl": 0.13060626164078712,
|
|
"epoch": 0.158,
|
|
"step": 395
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 4.2000000000000004e-06,
|
|
"rewards/reward_fn": 0.42808999717235563,
|
|
"reward": 0.42808999717235563,
|
|
"reward_std": 0.10399190871976316,
|
|
"completion_length": 77.9375,
|
|
"kl": 0.12519487142562866,
|
|
"epoch": 0.16,
|
|
"step": 400
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 23.875,
|
|
"learning_rate": 4.1900000000000005e-06,
|
|
"rewards/reward_fn": 0.4617268741130829,
|
|
"reward": 0.4617268741130829,
|
|
"reward_std": 0.024455197062343358,
|
|
"completion_length": 78.35,
|
|
"kl": 0.1178566724061966,
|
|
"epoch": 0.162,
|
|
"step": 405
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 4.18e-06,
|
|
"rewards/reward_fn": 0.443281877040863,
|
|
"reward": 0.443281877040863,
|
|
"reward_std": 0.0685680250171572,
|
|
"completion_length": 78.2125,
|
|
"kl": 0.13702442422509192,
|
|
"epoch": 0.164,
|
|
"step": 410
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 4.17e-06,
|
|
"rewards/reward_fn": 0.4668212473392487,
|
|
"reward": 0.4668212473392487,
|
|
"reward_std": 0.011207807157188655,
|
|
"completion_length": 77.575,
|
|
"kl": 0.11055121570825577,
|
|
"epoch": 0.166,
|
|
"step": 415
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 4.16e-06,
|
|
"rewards/reward_fn": 0.4402331173419952,
|
|
"reward": 0.4402331173419952,
|
|
"reward_std": 0.058068437944166364,
|
|
"completion_length": 77.5,
|
|
"kl": 0.10733927562832832,
|
|
"epoch": 0.168,
|
|
"step": 420
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 4.15e-06,
|
|
"rewards/reward_fn": 0.4000462591648102,
|
|
"reward": 0.4000462591648102,
|
|
"reward_std": 0.12040392335038633,
|
|
"completion_length": 76.4,
|
|
"kl": 0.1479562886059284,
|
|
"epoch": 0.17,
|
|
"step": 425
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 24.75,
|
|
"learning_rate": 4.14e-06,
|
|
"rewards/reward_fn": 0.43545125126838685,
|
|
"reward": 0.43545125126838685,
|
|
"reward_std": 0.083320726826787,
|
|
"completion_length": 77.6875,
|
|
"kl": 0.10773153975605965,
|
|
"epoch": 0.172,
|
|
"step": 430
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 25.75,
|
|
"learning_rate": 4.13e-06,
|
|
"rewards/reward_fn": 0.44696625173091886,
|
|
"reward": 0.44696625173091886,
|
|
"reward_std": 0.0702914291061461,
|
|
"completion_length": 77.6875,
|
|
"kl": 0.14093699380755426,
|
|
"epoch": 0.174,
|
|
"step": 435
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 4.12e-06,
|
|
"rewards/reward_fn": 0.4151681214570999,
|
|
"reward": 0.4151681214570999,
|
|
"reward_std": 0.13133891765028238,
|
|
"completion_length": 77.3875,
|
|
"kl": 0.14335689023137094,
|
|
"epoch": 0.176,
|
|
"step": 440
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 37.5,
|
|
"learning_rate": 4.1100000000000005e-06,
|
|
"rewards/reward_fn": 0.4624956250190735,
|
|
"reward": 0.4624956250190735,
|
|
"reward_std": 0.02820514002814889,
|
|
"completion_length": 78.2625,
|
|
"kl": 0.1433185674250126,
|
|
"epoch": 0.178,
|
|
"step": 445
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 4.1e-06,
|
|
"rewards/reward_fn": 0.4109575003385544,
|
|
"reward": 0.4109575003385544,
|
|
"reward_std": 0.10657719178125262,
|
|
"completion_length": 77.425,
|
|
"kl": 0.14092796370387078,
|
|
"epoch": 0.18,
|
|
"step": 450
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 19.25,
|
|
"learning_rate": 4.09e-06,
|
|
"rewards/reward_fn": 0.4427300065755844,
|
|
"reward": 0.4427300065755844,
|
|
"reward_std": 0.07360082946252078,
|
|
"completion_length": 77.2,
|
|
"kl": 0.12334202900528908,
|
|
"epoch": 0.182,
|
|
"step": 455
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 4.08e-06,
|
|
"rewards/reward_fn": 0.4475331217050552,
|
|
"reward": 0.4475331217050552,
|
|
"reward_std": 0.07192960330285132,
|
|
"completion_length": 77.3,
|
|
"kl": 0.1150731973350048,
|
|
"epoch": 0.184,
|
|
"step": 460
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 4.07e-06,
|
|
"rewards/reward_fn": 0.4609056174755096,
|
|
"reward": 0.4609056174755096,
|
|
"reward_std": 0.03011263143271208,
|
|
"completion_length": 78.1375,
|
|
"kl": 0.11868541091680526,
|
|
"epoch": 0.186,
|
|
"step": 465
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 4.060000000000001e-06,
|
|
"rewards/reward_fn": 0.43565624952316284,
|
|
"reward": 0.43565624952316284,
|
|
"reward_std": 0.09692498000804335,
|
|
"completion_length": 77.7875,
|
|
"kl": 0.10825898423790932,
|
|
"epoch": 0.188,
|
|
"step": 470
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 4.05e-06,
|
|
"rewards/reward_fn": 0.4492074936628342,
|
|
"reward": 0.4492074936628342,
|
|
"reward_std": 0.054914072714746,
|
|
"completion_length": 77.5375,
|
|
"kl": 0.13897996991872788,
|
|
"epoch": 0.19,
|
|
"step": 475
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 4.04e-06,
|
|
"rewards/reward_fn": 0.4413031220436096,
|
|
"reward": 0.4413031220436096,
|
|
"reward_std": 0.09486053336877376,
|
|
"completion_length": 77.65,
|
|
"kl": 0.12382525056600571,
|
|
"epoch": 0.192,
|
|
"step": 480
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 25.375,
|
|
"learning_rate": 4.03e-06,
|
|
"rewards/reward_fn": 0.4565912544727325,
|
|
"reward": 0.4565912544727325,
|
|
"reward_std": 0.048468802426941696,
|
|
"completion_length": 77.8125,
|
|
"kl": 0.12045493870973586,
|
|
"epoch": 0.194,
|
|
"step": 485
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 4.0200000000000005e-06,
|
|
"rewards/reward_fn": 0.43663875162601473,
|
|
"reward": 0.43663875162601473,
|
|
"reward_std": 0.08210341725498438,
|
|
"completion_length": 78.4,
|
|
"kl": 0.12900268211960791,
|
|
"epoch": 0.196,
|
|
"step": 490
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 4.0100000000000006e-06,
|
|
"rewards/reward_fn": 0.45537562370300294,
|
|
"reward": 0.45537562370300294,
|
|
"reward_std": 0.0482569785322994,
|
|
"completion_length": 76.7125,
|
|
"kl": 0.14251393526792527,
|
|
"epoch": 0.198,
|
|
"step": 495
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"rewards/reward_fn": 0.4432006269693375,
|
|
"reward": 0.4432006269693375,
|
|
"reward_std": 0.06335797258652746,
|
|
"completion_length": 73.8625,
|
|
"kl": 0.1357534795999527,
|
|
"epoch": 0.2,
|
|
"step": 500
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 3.990000000000001e-06,
|
|
"rewards/reward_fn": 0.4444637417793274,
|
|
"reward": 0.4444637417793274,
|
|
"reward_std": 0.07501828772947192,
|
|
"completion_length": 77.9375,
|
|
"kl": 0.11708598956465721,
|
|
"epoch": 0.202,
|
|
"step": 505
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 3.980000000000001e-06,
|
|
"rewards/reward_fn": 0.4472743809223175,
|
|
"reward": 0.4472743809223175,
|
|
"reward_std": 0.05749309537932277,
|
|
"completion_length": 74.5875,
|
|
"kl": 0.14097338169813156,
|
|
"epoch": 0.204,
|
|
"step": 510
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 3.97e-06,
|
|
"rewards/reward_fn": 0.44517249464988706,
|
|
"reward": 0.44517249464988706,
|
|
"reward_std": 0.04624197790399194,
|
|
"completion_length": 74.3625,
|
|
"kl": 0.11476076990365983,
|
|
"epoch": 0.206,
|
|
"step": 515
|
|
},
|
|
{
|
|
"loss": 0.0045,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 3.96e-06,
|
|
"rewards/reward_fn": 0.46824624538421633,
|
|
"reward": 0.46824624538421633,
|
|
"reward_std": 0.01596033286768943,
|
|
"completion_length": 76.65,
|
|
"kl": 0.11366325318813324,
|
|
"epoch": 0.208,
|
|
"step": 520
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 32.25,
|
|
"learning_rate": 3.95e-06,
|
|
"rewards/reward_fn": 0.42703562378883364,
|
|
"reward": 0.42703562378883364,
|
|
"reward_std": 0.12119532297365368,
|
|
"completion_length": 72.925,
|
|
"kl": 0.15730374231934546,
|
|
"epoch": 0.21,
|
|
"step": 525
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 3.94e-06,
|
|
"rewards/reward_fn": 0.4221406221389771,
|
|
"reward": 0.4221406221389771,
|
|
"reward_std": 0.10592716310638935,
|
|
"completion_length": 74.5125,
|
|
"kl": 0.15577242150902748,
|
|
"epoch": 0.212,
|
|
"step": 530
|
|
},
|
|
{
|
|
"loss": 0.0045,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 3.9300000000000005e-06,
|
|
"rewards/reward_fn": 0.4661912500858307,
|
|
"reward": 0.4661912500858307,
|
|
"reward_std": 0.020173130772309377,
|
|
"completion_length": 75.8375,
|
|
"kl": 0.11145939379930496,
|
|
"epoch": 0.214,
|
|
"step": 535
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 3.920000000000001e-06,
|
|
"rewards/reward_fn": 0.441836878657341,
|
|
"reward": 0.441836878657341,
|
|
"reward_std": 0.07485336323734373,
|
|
"completion_length": 76.2125,
|
|
"kl": 0.12274321988224983,
|
|
"epoch": 0.216,
|
|
"step": 540
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 27.875,
|
|
"learning_rate": 3.910000000000001e-06,
|
|
"rewards/reward_fn": 0.41665250062942505,
|
|
"reward": 0.41665250062942505,
|
|
"reward_std": 0.11695102071389556,
|
|
"completion_length": 75.5375,
|
|
"kl": 0.1784944050014019,
|
|
"epoch": 0.218,
|
|
"step": 545
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 3.900000000000001e-06,
|
|
"rewards/reward_fn": 0.46246500313282013,
|
|
"reward": 0.46246500313282013,
|
|
"reward_std": 0.025297004880849273,
|
|
"completion_length": 77.3125,
|
|
"kl": 0.1214751310646534,
|
|
"epoch": 0.22,
|
|
"step": 550
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 3.89e-06,
|
|
"rewards/reward_fn": 0.4644468754529953,
|
|
"reward": 0.4644468754529953,
|
|
"reward_std": 0.012496462906710804,
|
|
"completion_length": 75.7375,
|
|
"kl": 0.11581535264849663,
|
|
"epoch": 0.222,
|
|
"step": 555
|
|
},
|
|
{
|
|
"loss": 0.0082,
|
|
"grad_norm": 36.0,
|
|
"learning_rate": 3.88e-06,
|
|
"rewards/reward_fn": 0.4410806208848953,
|
|
"reward": 0.4410806208848953,
|
|
"reward_std": 0.06957816896028816,
|
|
"completion_length": 74.8875,
|
|
"kl": 0.2040191449224949,
|
|
"epoch": 0.224,
|
|
"step": 560
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 3.87e-06,
|
|
"rewards/reward_fn": 0.4340968787670135,
|
|
"reward": 0.4340968787670135,
|
|
"reward_std": 0.08061990649439395,
|
|
"completion_length": 75.425,
|
|
"kl": 0.13628464713692665,
|
|
"epoch": 0.226,
|
|
"step": 565
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 25.0,
|
|
"learning_rate": 3.86e-06,
|
|
"rewards/reward_fn": 0.4491756230592728,
|
|
"reward": 0.4491756230592728,
|
|
"reward_std": 0.05307391991373152,
|
|
"completion_length": 75.0875,
|
|
"kl": 0.12775095850229262,
|
|
"epoch": 0.228,
|
|
"step": 570
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 3.85e-06,
|
|
"rewards/reward_fn": 0.44790937304496764,
|
|
"reward": 0.44790937304496764,
|
|
"reward_std": 0.04875197249930352,
|
|
"completion_length": 76.5625,
|
|
"kl": 0.13741603270173072,
|
|
"epoch": 0.23,
|
|
"step": 575
|
|
},
|
|
{
|
|
"loss": 0.0075,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 3.8400000000000005e-06,
|
|
"rewards/reward_fn": 0.4636618733406067,
|
|
"reward": 0.4636618733406067,
|
|
"reward_std": 0.027970095619093627,
|
|
"completion_length": 75.525,
|
|
"kl": 0.18872758597135544,
|
|
"epoch": 0.232,
|
|
"step": 580
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 3.830000000000001e-06,
|
|
"rewards/reward_fn": 0.44757687151432035,
|
|
"reward": 0.44757687151432035,
|
|
"reward_std": 0.05606174336280674,
|
|
"completion_length": 78.5875,
|
|
"kl": 0.143553277105093,
|
|
"epoch": 0.234,
|
|
"step": 585
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 3.820000000000001e-06,
|
|
"rewards/reward_fn": 0.474083748459816,
|
|
"reward": 0.474083748459816,
|
|
"reward_std": 0.013858947483822704,
|
|
"completion_length": 77.1375,
|
|
"kl": 0.1158306747674942,
|
|
"epoch": 0.236,
|
|
"step": 590
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 23.125,
|
|
"learning_rate": 3.8100000000000004e-06,
|
|
"rewards/reward_fn": 0.46378999650478364,
|
|
"reward": 0.46378999650478364,
|
|
"reward_std": 0.02867411085171625,
|
|
"completion_length": 78.075,
|
|
"kl": 0.1382530927658081,
|
|
"epoch": 0.238,
|
|
"step": 595
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 3.8000000000000005e-06,
|
|
"rewards/reward_fn": 0.44207625091075897,
|
|
"reward": 0.44207625091075897,
|
|
"reward_std": 0.07887064684182406,
|
|
"completion_length": 78.2125,
|
|
"kl": 0.17263479977846147,
|
|
"epoch": 0.24,
|
|
"step": 600
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 3.79e-06,
|
|
"rewards/reward_fn": 0.45089874863624574,
|
|
"reward": 0.45089874863624574,
|
|
"reward_std": 0.05866381305968389,
|
|
"completion_length": 78.15,
|
|
"kl": 0.14266471862792968,
|
|
"epoch": 0.242,
|
|
"step": 605
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 3.7800000000000002e-06,
|
|
"rewards/reward_fn": 0.44535249173641206,
|
|
"reward": 0.44535249173641206,
|
|
"reward_std": 0.06417759947944432,
|
|
"completion_length": 77.725,
|
|
"kl": 0.15949834659695625,
|
|
"epoch": 0.244,
|
|
"step": 610
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 3.7700000000000003e-06,
|
|
"rewards/reward_fn": 0.45778937339782716,
|
|
"reward": 0.45778937339782716,
|
|
"reward_std": 0.03863266622647643,
|
|
"completion_length": 78.0375,
|
|
"kl": 0.14478488713502885,
|
|
"epoch": 0.246,
|
|
"step": 615
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 3.7600000000000004e-06,
|
|
"rewards/reward_fn": 0.4707600027322769,
|
|
"reward": 0.4707600027322769,
|
|
"reward_std": 0.01137657801155001,
|
|
"completion_length": 78.65,
|
|
"kl": 0.12897173911333085,
|
|
"epoch": 0.248,
|
|
"step": 620
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 17.875,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"rewards/reward_fn": 0.46352937519550325,
|
|
"reward": 0.46352937519550325,
|
|
"reward_std": 0.024159080686513335,
|
|
"completion_length": 77.9375,
|
|
"kl": 0.1265575334429741,
|
|
"epoch": 0.25,
|
|
"step": 625
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 27.0,
|
|
"learning_rate": 3.74e-06,
|
|
"rewards/reward_fn": 0.42510437667369844,
|
|
"reward": 0.42510437667369844,
|
|
"reward_std": 0.0986353380489163,
|
|
"completion_length": 77.4875,
|
|
"kl": 0.16288376674056054,
|
|
"epoch": 0.252,
|
|
"step": 630
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 27.25,
|
|
"learning_rate": 3.7300000000000003e-06,
|
|
"rewards/reward_fn": 0.45724311769008635,
|
|
"reward": 0.45724311769008635,
|
|
"reward_std": 0.04627569923177362,
|
|
"completion_length": 79.15,
|
|
"kl": 0.14429674297571182,
|
|
"epoch": 0.254,
|
|
"step": 635
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 3.7200000000000004e-06,
|
|
"rewards/reward_fn": 0.45629062950611116,
|
|
"reward": 0.45629062950611116,
|
|
"reward_std": 0.04499068569857627,
|
|
"completion_length": 78.575,
|
|
"kl": 0.13493222519755363,
|
|
"epoch": 0.256,
|
|
"step": 640
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3.7100000000000005e-06,
|
|
"rewards/reward_fn": 0.45364187359809877,
|
|
"reward": 0.45364187359809877,
|
|
"reward_std": 0.06047176127322018,
|
|
"completion_length": 78.075,
|
|
"kl": 0.14743178635835646,
|
|
"epoch": 0.258,
|
|
"step": 645
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 3.7e-06,
|
|
"rewards/reward_fn": 0.46636125445365906,
|
|
"reward": 0.46636125445365906,
|
|
"reward_std": 0.024842010554857553,
|
|
"completion_length": 77.7,
|
|
"kl": 0.12465962767601013,
|
|
"epoch": 0.26,
|
|
"step": 650
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 3.6900000000000002e-06,
|
|
"rewards/reward_fn": 0.46913875043392184,
|
|
"reward": 0.46913875043392184,
|
|
"reward_std": 0.014119817013852298,
|
|
"completion_length": 79.1375,
|
|
"kl": 0.13569475561380387,
|
|
"epoch": 0.262,
|
|
"step": 655
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 3.6800000000000003e-06,
|
|
"rewards/reward_fn": 0.44111000895500185,
|
|
"reward": 0.44111000895500185,
|
|
"reward_std": 0.09162386588286608,
|
|
"completion_length": 78.7125,
|
|
"kl": 0.17013774663209916,
|
|
"epoch": 0.264,
|
|
"step": 660
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 3.6700000000000004e-06,
|
|
"rewards/reward_fn": 0.4559825032949448,
|
|
"reward": 0.4559825032949448,
|
|
"reward_std": 0.062304181954823436,
|
|
"completion_length": 77.8875,
|
|
"kl": 0.13616653084754943,
|
|
"epoch": 0.266,
|
|
"step": 665
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 3.66e-06,
|
|
"rewards/reward_fn": 0.45857687294483185,
|
|
"reward": 0.45857687294483185,
|
|
"reward_std": 0.027881676610559226,
|
|
"completion_length": 77.1125,
|
|
"kl": 0.1250321976840496,
|
|
"epoch": 0.268,
|
|
"step": 670
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3.65e-06,
|
|
"rewards/reward_fn": 0.46213499903678895,
|
|
"reward": 0.46213499903678895,
|
|
"reward_std": 0.026366882980801164,
|
|
"completion_length": 78.0625,
|
|
"kl": 0.1547384850680828,
|
|
"epoch": 0.27,
|
|
"step": 675
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 3.6400000000000003e-06,
|
|
"rewards/reward_fn": 0.4569937527179718,
|
|
"reward": 0.4569937527179718,
|
|
"reward_std": 0.04252268351847306,
|
|
"completion_length": 77.85,
|
|
"kl": 0.14238858669996263,
|
|
"epoch": 0.272,
|
|
"step": 680
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 3.6300000000000004e-06,
|
|
"rewards/reward_fn": 0.4151043713092804,
|
|
"reward": 0.4151043713092804,
|
|
"reward_std": 0.12278079790994526,
|
|
"completion_length": 77.125,
|
|
"kl": 0.15316254496574402,
|
|
"epoch": 0.274,
|
|
"step": 685
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 24.125,
|
|
"learning_rate": 3.62e-06,
|
|
"rewards/reward_fn": 0.45251187682151794,
|
|
"reward": 0.45251187682151794,
|
|
"reward_std": 0.05636680471943691,
|
|
"completion_length": 78.075,
|
|
"kl": 0.14139395952224731,
|
|
"epoch": 0.276,
|
|
"step": 690
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 3.61e-06,
|
|
"rewards/reward_fn": 0.462823748588562,
|
|
"reward": 0.462823748588562,
|
|
"reward_std": 0.021253089199308305,
|
|
"completion_length": 77.7625,
|
|
"kl": 0.1295616790652275,
|
|
"epoch": 0.278,
|
|
"step": 695
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 25.75,
|
|
"learning_rate": 3.6000000000000003e-06,
|
|
"rewards/reward_fn": 0.4587912499904633,
|
|
"reward": 0.4587912499904633,
|
|
"reward_std": 0.03155275412136689,
|
|
"completion_length": 79.1375,
|
|
"kl": 0.11457905992865562,
|
|
"epoch": 0.28,
|
|
"step": 700
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 3.5900000000000004e-06,
|
|
"rewards/reward_fn": 0.45730499029159544,
|
|
"reward": 0.45730499029159544,
|
|
"reward_std": 0.04703305826988071,
|
|
"completion_length": 77.0,
|
|
"kl": 0.14419187232851982,
|
|
"epoch": 0.282,
|
|
"step": 705
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 3.58e-06,
|
|
"rewards/reward_fn": 0.44802438020706176,
|
|
"reward": 0.44802438020706176,
|
|
"reward_std": 0.05318908016197384,
|
|
"completion_length": 76.4375,
|
|
"kl": 0.15136009827256203,
|
|
"epoch": 0.284,
|
|
"step": 710
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 3.57e-06,
|
|
"rewards/reward_fn": 0.45345875024795534,
|
|
"reward": 0.45345875024795534,
|
|
"reward_std": 0.05543687182944268,
|
|
"completion_length": 77.1125,
|
|
"kl": 0.14021009653806688,
|
|
"epoch": 0.286,
|
|
"step": 715
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 3.5600000000000002e-06,
|
|
"rewards/reward_fn": 0.45240687429904936,
|
|
"reward": 0.45240687429904936,
|
|
"reward_std": 0.05269500815775245,
|
|
"completion_length": 77.8125,
|
|
"kl": 0.1341713160276413,
|
|
"epoch": 0.288,
|
|
"step": 720
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 3.5500000000000003e-06,
|
|
"rewards/reward_fn": 0.4583718776702881,
|
|
"reward": 0.4583718776702881,
|
|
"reward_std": 0.04077405421994627,
|
|
"completion_length": 78.5375,
|
|
"kl": 0.13093890696763993,
|
|
"epoch": 0.29,
|
|
"step": 725
|
|
},
|
|
{
|
|
"loss": 0.0072,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 3.54e-06,
|
|
"rewards/reward_fn": 0.434508752822876,
|
|
"reward": 0.434508752822876,
|
|
"reward_std": 0.09370574047788978,
|
|
"completion_length": 76.6375,
|
|
"kl": 0.18059465438127517,
|
|
"epoch": 0.292,
|
|
"step": 730
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 3.53e-06,
|
|
"rewards/reward_fn": 0.4609118640422821,
|
|
"reward": 0.4609118640422821,
|
|
"reward_std": 0.04159380637574941,
|
|
"completion_length": 77.5875,
|
|
"kl": 0.14632384702563286,
|
|
"epoch": 0.294,
|
|
"step": 735
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 3.52e-06,
|
|
"rewards/reward_fn": 0.416993123292923,
|
|
"reward": 0.416993123292923,
|
|
"reward_std": 0.11569311295170337,
|
|
"completion_length": 76.4875,
|
|
"kl": 0.15963388308882714,
|
|
"epoch": 0.296,
|
|
"step": 740
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 3.5100000000000003e-06,
|
|
"rewards/reward_fn": 0.4675106227397919,
|
|
"reward": 0.4675106227397919,
|
|
"reward_std": 0.013280918868258596,
|
|
"completion_length": 78.3,
|
|
"kl": 0.13514449894428254,
|
|
"epoch": 0.298,
|
|
"step": 745
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 3.5e-06,
|
|
"rewards/reward_fn": 0.45719312131404877,
|
|
"reward": 0.45719312131404877,
|
|
"reward_std": 0.03967158079613,
|
|
"completion_length": 78.35,
|
|
"kl": 0.1188413679599762,
|
|
"epoch": 0.3,
|
|
"step": 750
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 23.875,
|
|
"learning_rate": 3.49e-06,
|
|
"rewards/reward_fn": 0.45698000490665436,
|
|
"reward": 0.45698000490665436,
|
|
"reward_std": 0.040315793512854727,
|
|
"completion_length": 77.0875,
|
|
"kl": 0.15275436490774155,
|
|
"epoch": 0.302,
|
|
"step": 755
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 3.48e-06,
|
|
"rewards/reward_fn": 0.4397631257772446,
|
|
"reward": 0.4397631257772446,
|
|
"reward_std": 0.05836378745734692,
|
|
"completion_length": 78.3,
|
|
"kl": 0.14592362120747565,
|
|
"epoch": 0.304,
|
|
"step": 760
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 3.4700000000000002e-06,
|
|
"rewards/reward_fn": 0.43903999626636503,
|
|
"reward": 0.43903999626636503,
|
|
"reward_std": 0.08307434991002083,
|
|
"completion_length": 78.5875,
|
|
"kl": 0.1567191883921623,
|
|
"epoch": 0.306,
|
|
"step": 765
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 16.25,
|
|
"learning_rate": 3.46e-06,
|
|
"rewards/reward_fn": 0.46542062163352965,
|
|
"reward": 0.46542062163352965,
|
|
"reward_std": 0.024025356164202094,
|
|
"completion_length": 78.125,
|
|
"kl": 0.14618832543492316,
|
|
"epoch": 0.308,
|
|
"step": 770
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 25.375,
|
|
"learning_rate": 3.45e-06,
|
|
"rewards/reward_fn": 0.46039311587810516,
|
|
"reward": 0.46039311587810516,
|
|
"reward_std": 0.03917545401491225,
|
|
"completion_length": 76.3375,
|
|
"kl": 0.15213449746370317,
|
|
"epoch": 0.31,
|
|
"step": 775
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 3.44e-06,
|
|
"rewards/reward_fn": 0.4595668792724609,
|
|
"reward": 0.4595668792724609,
|
|
"reward_std": 0.03896486459998414,
|
|
"completion_length": 77.925,
|
|
"kl": 0.1365116611123085,
|
|
"epoch": 0.312,
|
|
"step": 780
|
|
},
|
|
{
|
|
"loss": 0.0077,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 3.4300000000000006e-06,
|
|
"rewards/reward_fn": 0.4467168778181076,
|
|
"reward": 0.4467168778181076,
|
|
"reward_std": 0.06692771762609481,
|
|
"completion_length": 77.9,
|
|
"kl": 0.19316297993063927,
|
|
"epoch": 0.314,
|
|
"step": 785
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 3.4200000000000007e-06,
|
|
"rewards/reward_fn": 0.4581025063991547,
|
|
"reward": 0.4581025063991547,
|
|
"reward_std": 0.043769028829410674,
|
|
"completion_length": 75.4875,
|
|
"kl": 0.161041110008955,
|
|
"epoch": 0.316,
|
|
"step": 790
|
|
},
|
|
{
|
|
"loss": 0.0077,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 3.4100000000000004e-06,
|
|
"rewards/reward_fn": 0.4519962579011917,
|
|
"reward": 0.4519962579011917,
|
|
"reward_std": 0.07411843243753538,
|
|
"completion_length": 75.875,
|
|
"kl": 0.19167449921369553,
|
|
"epoch": 0.318,
|
|
"step": 795
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 23.125,
|
|
"learning_rate": 3.4000000000000005e-06,
|
|
"rewards/reward_fn": 0.45835437476634977,
|
|
"reward": 0.45835437476634977,
|
|
"reward_std": 0.03227461196947843,
|
|
"completion_length": 76.7125,
|
|
"kl": 0.14751672148704528,
|
|
"epoch": 0.32,
|
|
"step": 800
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 3.3900000000000006e-06,
|
|
"rewards/reward_fn": 0.4664868742227554,
|
|
"reward": 0.4664868742227554,
|
|
"reward_std": 0.0312751340912655,
|
|
"completion_length": 75.575,
|
|
"kl": 0.15016857534646988,
|
|
"epoch": 0.322,
|
|
"step": 805
|
|
},
|
|
{
|
|
"loss": 0.0073,
|
|
"grad_norm": 18.0,
|
|
"learning_rate": 3.3800000000000007e-06,
|
|
"rewards/reward_fn": 0.45181562900543215,
|
|
"reward": 0.45181562900543215,
|
|
"reward_std": 0.06425200761295854,
|
|
"completion_length": 77.3125,
|
|
"kl": 0.18286750614643096,
|
|
"epoch": 0.324,
|
|
"step": 810
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 3.3700000000000003e-06,
|
|
"rewards/reward_fn": 0.45358812212944033,
|
|
"reward": 0.45358812212944033,
|
|
"reward_std": 0.05638027461245656,
|
|
"completion_length": 77.3125,
|
|
"kl": 0.14141111373901366,
|
|
"epoch": 0.326,
|
|
"step": 815
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 3.3600000000000004e-06,
|
|
"rewards/reward_fn": 0.46734937429428103,
|
|
"reward": 0.46734937429428103,
|
|
"reward_std": 0.02419458368094638,
|
|
"completion_length": 77.1,
|
|
"kl": 0.13360125049948693,
|
|
"epoch": 0.328,
|
|
"step": 820
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 23.75,
|
|
"learning_rate": 3.3500000000000005e-06,
|
|
"rewards/reward_fn": 0.45999937057495116,
|
|
"reward": 0.45999937057495116,
|
|
"reward_std": 0.0442831747001037,
|
|
"completion_length": 76.3375,
|
|
"kl": 0.14247470945119858,
|
|
"epoch": 0.33,
|
|
"step": 825
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 25.0,
|
|
"learning_rate": 3.3400000000000006e-06,
|
|
"rewards/reward_fn": 0.4691068768501282,
|
|
"reward": 0.4691068768501282,
|
|
"reward_std": 0.013599430792964995,
|
|
"completion_length": 76.95,
|
|
"kl": 0.1476905442774296,
|
|
"epoch": 0.332,
|
|
"step": 830
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 3.3300000000000003e-06,
|
|
"rewards/reward_fn": 0.43902124762535094,
|
|
"reward": 0.43902124762535094,
|
|
"reward_std": 0.09761263309046626,
|
|
"completion_length": 76.625,
|
|
"kl": 0.16074835285544395,
|
|
"epoch": 0.334,
|
|
"step": 835
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 3.3200000000000004e-06,
|
|
"rewards/reward_fn": 0.4529812455177307,
|
|
"reward": 0.4529812455177307,
|
|
"reward_std": 0.04783163331449032,
|
|
"completion_length": 77.6125,
|
|
"kl": 0.1611533671617508,
|
|
"epoch": 0.336,
|
|
"step": 840
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 3.3100000000000005e-06,
|
|
"rewards/reward_fn": 0.45019249618053436,
|
|
"reward": 0.45019249618053436,
|
|
"reward_std": 0.0602539261453785,
|
|
"completion_length": 76.8125,
|
|
"kl": 0.1599690869450569,
|
|
"epoch": 0.338,
|
|
"step": 845
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 25.125,
|
|
"learning_rate": 3.3000000000000006e-06,
|
|
"rewards/reward_fn": 0.4448312520980835,
|
|
"reward": 0.4448312520980835,
|
|
"reward_std": 0.08103471701033413,
|
|
"completion_length": 74.9875,
|
|
"kl": 0.15435032844543456,
|
|
"epoch": 0.34,
|
|
"step": 850
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 3.2900000000000003e-06,
|
|
"rewards/reward_fn": 0.4587881326675415,
|
|
"reward": 0.4587881326675415,
|
|
"reward_std": 0.03882696847431362,
|
|
"completion_length": 75.775,
|
|
"kl": 0.14002252742648125,
|
|
"epoch": 0.342,
|
|
"step": 855
|
|
},
|
|
{
|
|
"loss": 0.0074,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 3.2800000000000004e-06,
|
|
"rewards/reward_fn": 0.4507631242275238,
|
|
"reward": 0.4507631242275238,
|
|
"reward_std": 0.05658294195309281,
|
|
"completion_length": 76.5125,
|
|
"kl": 0.18587008863687515,
|
|
"epoch": 0.344,
|
|
"step": 860
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 3.2700000000000005e-06,
|
|
"rewards/reward_fn": 0.46349187195301056,
|
|
"reward": 0.46349187195301056,
|
|
"reward_std": 0.032273246673867106,
|
|
"completion_length": 77.775,
|
|
"kl": 0.1273516111075878,
|
|
"epoch": 0.346,
|
|
"step": 865
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 3.2600000000000006e-06,
|
|
"rewards/reward_fn": 0.45839687883853913,
|
|
"reward": 0.45839687883853913,
|
|
"reward_std": 0.041816312330774964,
|
|
"completion_length": 76.4375,
|
|
"kl": 0.15825477614998817,
|
|
"epoch": 0.348,
|
|
"step": 870
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 3.2500000000000002e-06,
|
|
"rewards/reward_fn": 0.45482062697410586,
|
|
"reward": 0.45482062697410586,
|
|
"reward_std": 0.0653240518644452,
|
|
"completion_length": 76.7125,
|
|
"kl": 0.16268835961818695,
|
|
"epoch": 0.35,
|
|
"step": 875
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 3.2400000000000003e-06,
|
|
"rewards/reward_fn": 0.4411893755197525,
|
|
"reward": 0.4411893755197525,
|
|
"reward_std": 0.08931890472304076,
|
|
"completion_length": 76.125,
|
|
"kl": 0.15622055530548096,
|
|
"epoch": 0.352,
|
|
"step": 880
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 3.2300000000000004e-06,
|
|
"rewards/reward_fn": 0.4543387472629547,
|
|
"reward": 0.4543387472629547,
|
|
"reward_std": 0.05997409771662206,
|
|
"completion_length": 77.4875,
|
|
"kl": 0.14859429150819778,
|
|
"epoch": 0.354,
|
|
"step": 885
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 3.2200000000000005e-06,
|
|
"rewards/reward_fn": 0.4376031279563904,
|
|
"reward": 0.4376031279563904,
|
|
"reward_std": 0.07789694773964584,
|
|
"completion_length": 78.6875,
|
|
"kl": 0.13580713272094727,
|
|
"epoch": 0.356,
|
|
"step": 890
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 27.0,
|
|
"learning_rate": 3.21e-06,
|
|
"rewards/reward_fn": 0.4595912516117096,
|
|
"reward": 0.4595912516117096,
|
|
"reward_std": 0.03936622152104974,
|
|
"completion_length": 77.8,
|
|
"kl": 0.17524173483252525,
|
|
"epoch": 0.358,
|
|
"step": 895
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 3.2000000000000003e-06,
|
|
"rewards/reward_fn": 0.45086687207221987,
|
|
"reward": 0.45086687207221987,
|
|
"reward_std": 0.06653416159097106,
|
|
"completion_length": 78.225,
|
|
"kl": 0.13200628608465195,
|
|
"epoch": 0.36,
|
|
"step": 900
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 3.1900000000000004e-06,
|
|
"rewards/reward_fn": 0.44835312366485597,
|
|
"reward": 0.44835312366485597,
|
|
"reward_std": 0.061607802627258935,
|
|
"completion_length": 78.45,
|
|
"kl": 0.14225002825260163,
|
|
"epoch": 0.362,
|
|
"step": 905
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3.1800000000000005e-06,
|
|
"rewards/reward_fn": 0.45717000365257265,
|
|
"reward": 0.45717000365257265,
|
|
"reward_std": 0.041998466942459345,
|
|
"completion_length": 78.5125,
|
|
"kl": 0.12377910763025284,
|
|
"epoch": 0.364,
|
|
"step": 910
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 3.17e-06,
|
|
"rewards/reward_fn": 0.4658468782901764,
|
|
"reward": 0.4658468782901764,
|
|
"reward_std": 0.021458613453432918,
|
|
"completion_length": 77.6625,
|
|
"kl": 0.12044140994548798,
|
|
"epoch": 0.366,
|
|
"step": 915
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 3.1600000000000002e-06,
|
|
"rewards/reward_fn": 0.4457137495279312,
|
|
"reward": 0.4457137495279312,
|
|
"reward_std": 0.07773053634446114,
|
|
"completion_length": 77.0125,
|
|
"kl": 0.16893841549754143,
|
|
"epoch": 0.368,
|
|
"step": 920
|
|
},
|
|
{
|
|
"loss": 0.0072,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 3.1500000000000003e-06,
|
|
"rewards/reward_fn": 0.4391768783330917,
|
|
"reward": 0.4391768783330917,
|
|
"reward_std": 0.08680278662359342,
|
|
"completion_length": 76.8375,
|
|
"kl": 0.1803253024816513,
|
|
"epoch": 0.37,
|
|
"step": 925
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 3.1400000000000004e-06,
|
|
"rewards/reward_fn": 0.4521843731403351,
|
|
"reward": 0.4521843731403351,
|
|
"reward_std": 0.06457424827385694,
|
|
"completion_length": 77.875,
|
|
"kl": 0.14004313349723815,
|
|
"epoch": 0.372,
|
|
"step": 930
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 3.13e-06,
|
|
"rewards/reward_fn": 0.4524868756532669,
|
|
"reward": 0.4524868756532669,
|
|
"reward_std": 0.048214147449471056,
|
|
"completion_length": 77.3375,
|
|
"kl": 0.15322432667016983,
|
|
"epoch": 0.374,
|
|
"step": 935
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 3.12e-06,
|
|
"rewards/reward_fn": 0.4452850043773651,
|
|
"reward": 0.4452850043773651,
|
|
"reward_std": 0.07152452755253762,
|
|
"completion_length": 77.6,
|
|
"kl": 0.17651870474219322,
|
|
"epoch": 0.376,
|
|
"step": 940
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 3.1100000000000003e-06,
|
|
"rewards/reward_fn": 0.4586562544107437,
|
|
"reward": 0.4586562544107437,
|
|
"reward_std": 0.04483227517921477,
|
|
"completion_length": 78.25,
|
|
"kl": 0.13818887621164322,
|
|
"epoch": 0.378,
|
|
"step": 945
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 3.1000000000000004e-06,
|
|
"rewards/reward_fn": 0.4671787559986115,
|
|
"reward": 0.4671787559986115,
|
|
"reward_std": 0.02326571140438318,
|
|
"completion_length": 78.575,
|
|
"kl": 0.1284794516861439,
|
|
"epoch": 0.38,
|
|
"step": 950
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 3.09e-06,
|
|
"rewards/reward_fn": 0.4639474958181381,
|
|
"reward": 0.4639474958181381,
|
|
"reward_std": 0.03198056248947978,
|
|
"completion_length": 78.525,
|
|
"kl": 0.1249109148979187,
|
|
"epoch": 0.382,
|
|
"step": 955
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 26.5,
|
|
"learning_rate": 3.08e-06,
|
|
"rewards/reward_fn": 0.4462443798780441,
|
|
"reward": 0.4462443798780441,
|
|
"reward_std": 0.06451276817824692,
|
|
"completion_length": 77.025,
|
|
"kl": 0.13725997805595397,
|
|
"epoch": 0.384,
|
|
"step": 960
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 3.0700000000000003e-06,
|
|
"rewards/reward_fn": 0.43646687269210815,
|
|
"reward": 0.43646687269210815,
|
|
"reward_std": 0.10176013394957409,
|
|
"completion_length": 77.9625,
|
|
"kl": 0.14228134751319885,
|
|
"epoch": 0.386,
|
|
"step": 965
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3.0600000000000003e-06,
|
|
"rewards/reward_fn": 0.45134938657283785,
|
|
"reward": 0.45134938657283785,
|
|
"reward_std": 0.06808145013637841,
|
|
"completion_length": 78.4875,
|
|
"kl": 0.1274636261165142,
|
|
"epoch": 0.388,
|
|
"step": 970
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 3.05e-06,
|
|
"rewards/reward_fn": 0.43994062542915346,
|
|
"reward": 0.43994062542915346,
|
|
"reward_std": 0.09077681568451226,
|
|
"completion_length": 77.375,
|
|
"kl": 0.13992855474352836,
|
|
"epoch": 0.39,
|
|
"step": 975
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 3.04e-06,
|
|
"rewards/reward_fn": 0.4459106236696243,
|
|
"reward": 0.4459106236696243,
|
|
"reward_std": 0.07173144910484552,
|
|
"completion_length": 77.6875,
|
|
"kl": 0.13269591480493545,
|
|
"epoch": 0.392,
|
|
"step": 980
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 3.0300000000000002e-06,
|
|
"rewards/reward_fn": 0.452276873588562,
|
|
"reward": 0.452276873588562,
|
|
"reward_std": 0.058129315462429075,
|
|
"completion_length": 76.2125,
|
|
"kl": 0.14192070737481116,
|
|
"epoch": 0.394,
|
|
"step": 985
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 3.0200000000000003e-06,
|
|
"rewards/reward_fn": 0.4616843730211258,
|
|
"reward": 0.4616843730211258,
|
|
"reward_std": 0.027600679779425263,
|
|
"completion_length": 76.6,
|
|
"kl": 0.12908575385808946,
|
|
"epoch": 0.396,
|
|
"step": 990
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 3.01e-06,
|
|
"rewards/reward_fn": 0.45958187282085416,
|
|
"reward": 0.45958187282085416,
|
|
"reward_std": 0.041698419768363235,
|
|
"completion_length": 77.6375,
|
|
"kl": 0.14417157247662543,
|
|
"epoch": 0.398,
|
|
"step": 995
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3e-06,
|
|
"rewards/reward_fn": 0.45577124059200286,
|
|
"reward": 0.45577124059200286,
|
|
"reward_std": 0.061070334317628296,
|
|
"completion_length": 77.5625,
|
|
"kl": 0.11728422567248345,
|
|
"epoch": 0.4,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"loss": 0.0043,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 2.99e-06,
|
|
"rewards/reward_fn": 0.4588618755340576,
|
|
"reward": 0.4588618755340576,
|
|
"reward_std": 0.036662753293057904,
|
|
"completion_length": 76.8625,
|
|
"kl": 0.10696139335632324,
|
|
"epoch": 0.402,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 2.9800000000000003e-06,
|
|
"rewards/reward_fn": 0.4509599953889847,
|
|
"reward": 0.4509599953889847,
|
|
"reward_std": 0.04541698046959937,
|
|
"completion_length": 78.7375,
|
|
"kl": 0.1384617082774639,
|
|
"epoch": 0.404,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 25.25,
|
|
"learning_rate": 2.97e-06,
|
|
"rewards/reward_fn": 0.4705031216144562,
|
|
"reward": 0.4705031216144562,
|
|
"reward_std": 0.013416963210329414,
|
|
"completion_length": 76.9,
|
|
"kl": 0.11976072862744332,
|
|
"epoch": 0.406,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 2.96e-06,
|
|
"rewards/reward_fn": 0.46544250547885896,
|
|
"reward": 0.46544250547885896,
|
|
"reward_std": 0.026560991373844444,
|
|
"completion_length": 76.4625,
|
|
"kl": 0.13976338282227516,
|
|
"epoch": 0.408,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 2.95e-06,
|
|
"rewards/reward_fn": 0.46479061543941497,
|
|
"reward": 0.46479061543941497,
|
|
"reward_std": 0.02370762478094548,
|
|
"completion_length": 75.7625,
|
|
"kl": 0.1556813433766365,
|
|
"epoch": 0.41,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 2.9400000000000002e-06,
|
|
"rewards/reward_fn": 0.4578593820333481,
|
|
"reward": 0.4578593820333481,
|
|
"reward_std": 0.04385443233186379,
|
|
"completion_length": 76.7875,
|
|
"kl": 0.13361710608005523,
|
|
"epoch": 0.412,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"loss": 0.0081,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 2.93e-06,
|
|
"rewards/reward_fn": 0.42873625457286835,
|
|
"reward": 0.42873625457286835,
|
|
"reward_std": 0.11082857861183584,
|
|
"completion_length": 74.8125,
|
|
"kl": 0.20237903594970702,
|
|
"epoch": 0.414,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 18.5,
|
|
"learning_rate": 2.92e-06,
|
|
"rewards/reward_fn": 0.4592456161975861,
|
|
"reward": 0.4592456161975861,
|
|
"reward_std": 0.042502091301139446,
|
|
"completion_length": 76.25,
|
|
"kl": 0.1265183039009571,
|
|
"epoch": 0.416,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.91e-06,
|
|
"rewards/reward_fn": 0.44570625126361846,
|
|
"reward": 0.44570625126361846,
|
|
"reward_std": 0.07765153090003878,
|
|
"completion_length": 76.525,
|
|
"kl": 0.14906007573008537,
|
|
"epoch": 0.418,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"loss": 0.0067,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.9e-06,
|
|
"rewards/reward_fn": 0.4367462515830994,
|
|
"reward": 0.4367462515830994,
|
|
"reward_std": 0.0928474075277336,
|
|
"completion_length": 77.8,
|
|
"kl": 0.16817878931760788,
|
|
"epoch": 0.42,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 2.89e-06,
|
|
"rewards/reward_fn": 0.45984499156475067,
|
|
"reward": 0.45984499156475067,
|
|
"reward_std": 0.03933965916512534,
|
|
"completion_length": 77.275,
|
|
"kl": 0.16533141881227492,
|
|
"epoch": 0.422,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.88e-06,
|
|
"rewards/reward_fn": 0.43899562656879426,
|
|
"reward": 0.43899562656879426,
|
|
"reward_std": 0.08788106166757644,
|
|
"completion_length": 75.75,
|
|
"kl": 0.15769053027033805,
|
|
"epoch": 0.424,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.87e-06,
|
|
"rewards/reward_fn": 0.423912501335144,
|
|
"reward": 0.423912501335144,
|
|
"reward_std": 0.11766294327098877,
|
|
"completion_length": 76.475,
|
|
"kl": 0.14753883704543114,
|
|
"epoch": 0.426,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 25.25,
|
|
"learning_rate": 2.86e-06,
|
|
"rewards/reward_fn": 0.46032374203205106,
|
|
"reward": 0.46032374203205106,
|
|
"reward_std": 0.03572893298696726,
|
|
"completion_length": 77.825,
|
|
"kl": 0.13863224387168885,
|
|
"epoch": 0.428,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 2.85e-06,
|
|
"rewards/reward_fn": 0.4649974972009659,
|
|
"reward": 0.4649974972009659,
|
|
"reward_std": 0.03036914155818522,
|
|
"completion_length": 75.8,
|
|
"kl": 0.12206159606575966,
|
|
"epoch": 0.43,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 2.84e-06,
|
|
"rewards/reward_fn": 0.4573018759489059,
|
|
"reward": 0.4573018759489059,
|
|
"reward_std": 0.06353021854301914,
|
|
"completion_length": 77.575,
|
|
"kl": 0.1350351519882679,
|
|
"epoch": 0.432,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.83e-06,
|
|
"rewards/reward_fn": 0.43689249753952025,
|
|
"reward": 0.43689249753952025,
|
|
"reward_std": 0.09802878738846629,
|
|
"completion_length": 76.1625,
|
|
"kl": 0.13650911152362824,
|
|
"epoch": 0.434,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 2.82e-06,
|
|
"rewards/reward_fn": 0.465862500667572,
|
|
"reward": 0.465862500667572,
|
|
"reward_std": 0.023497561831027268,
|
|
"completion_length": 77.325,
|
|
"kl": 0.13802992850542067,
|
|
"epoch": 0.436,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 2.8100000000000006e-06,
|
|
"rewards/reward_fn": 0.4442381262779236,
|
|
"reward": 0.4442381262779236,
|
|
"reward_std": 0.0735843145288527,
|
|
"completion_length": 77.4875,
|
|
"kl": 0.14121268913149834,
|
|
"epoch": 0.438,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 2.8000000000000003e-06,
|
|
"rewards/reward_fn": 0.44780624806880953,
|
|
"reward": 0.44780624806880953,
|
|
"reward_std": 0.08063485231250525,
|
|
"completion_length": 77.8,
|
|
"kl": 0.14030690044164656,
|
|
"epoch": 0.44,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.7900000000000004e-06,
|
|
"rewards/reward_fn": 0.45844624638557435,
|
|
"reward": 0.45844624638557435,
|
|
"reward_std": 0.05268092898186296,
|
|
"completion_length": 77.7,
|
|
"kl": 0.14547136351466178,
|
|
"epoch": 0.442,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 2.7800000000000005e-06,
|
|
"rewards/reward_fn": 0.43803000152111055,
|
|
"reward": 0.43803000152111055,
|
|
"reward_std": 0.10099421259947121,
|
|
"completion_length": 78.0375,
|
|
"kl": 0.14180475547909738,
|
|
"epoch": 0.444,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"loss": 0.0045,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 2.7700000000000006e-06,
|
|
"rewards/reward_fn": 0.4747843772172928,
|
|
"reward": 0.4747843772172928,
|
|
"reward_std": 0.01129134335787967,
|
|
"completion_length": 77.5875,
|
|
"kl": 0.11243945509195327,
|
|
"epoch": 0.446,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 2.7600000000000003e-06,
|
|
"rewards/reward_fn": 0.4492399960756302,
|
|
"reward": 0.4492399960756302,
|
|
"reward_std": 0.06491480625700205,
|
|
"completion_length": 78.9375,
|
|
"kl": 0.15152825638651848,
|
|
"epoch": 0.448,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 23.25,
|
|
"learning_rate": 2.7500000000000004e-06,
|
|
"rewards/reward_fn": 0.44580812752246857,
|
|
"reward": 0.44580812752246857,
|
|
"reward_std": 0.08126231417991221,
|
|
"completion_length": 78.075,
|
|
"kl": 0.1693297281861305,
|
|
"epoch": 0.45,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 2.7400000000000004e-06,
|
|
"rewards/reward_fn": 0.4513862580060959,
|
|
"reward": 0.4513862580060959,
|
|
"reward_std": 0.04983757671434432,
|
|
"completion_length": 75.9625,
|
|
"kl": 0.1426179051399231,
|
|
"epoch": 0.452,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 2.7300000000000005e-06,
|
|
"rewards/reward_fn": 0.449181866645813,
|
|
"reward": 0.449181866645813,
|
|
"reward_std": 0.05518764650914818,
|
|
"completion_length": 76.775,
|
|
"kl": 0.1532064698636532,
|
|
"epoch": 0.454,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 2.7200000000000002e-06,
|
|
"rewards/reward_fn": 0.45171125829219816,
|
|
"reward": 0.45171125829219816,
|
|
"reward_std": 0.05260382960550487,
|
|
"completion_length": 77.9375,
|
|
"kl": 0.15185603350400925,
|
|
"epoch": 0.456,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 2.7100000000000003e-06,
|
|
"rewards/reward_fn": 0.4606387555599213,
|
|
"reward": 0.4606387555599213,
|
|
"reward_std": 0.03888747000601143,
|
|
"completion_length": 76.275,
|
|
"kl": 0.1298865035176277,
|
|
"epoch": 0.458,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 2.7000000000000004e-06,
|
|
"rewards/reward_fn": 0.43960937559604646,
|
|
"reward": 0.43960937559604646,
|
|
"reward_std": 0.1048707491834648,
|
|
"completion_length": 77.4625,
|
|
"kl": 0.14175623878836632,
|
|
"epoch": 0.46,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 2.6900000000000005e-06,
|
|
"rewards/reward_fn": 0.44681625366210936,
|
|
"reward": 0.44681625366210936,
|
|
"reward_std": 0.07011992897605523,
|
|
"completion_length": 78.4375,
|
|
"kl": 0.12534804567694663,
|
|
"epoch": 0.462,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.68e-06,
|
|
"rewards/reward_fn": 0.4641531229019165,
|
|
"reward": 0.4641531229019165,
|
|
"reward_std": 0.025870742078404875,
|
|
"completion_length": 78.5,
|
|
"kl": 0.10891071110963821,
|
|
"epoch": 0.464,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 2.6700000000000003e-06,
|
|
"rewards/reward_fn": 0.4655087530612946,
|
|
"reward": 0.4655087530612946,
|
|
"reward_std": 0.03508747317828238,
|
|
"completion_length": 78.4125,
|
|
"kl": 0.1420759491622448,
|
|
"epoch": 0.466,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 2.6600000000000004e-06,
|
|
"rewards/reward_fn": 0.45731625854969027,
|
|
"reward": 0.45731625854969027,
|
|
"reward_std": 0.03957532516214997,
|
|
"completion_length": 78.4375,
|
|
"kl": 0.13185337632894517,
|
|
"epoch": 0.468,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.6500000000000005e-06,
|
|
"rewards/reward_fn": 0.44373124837875366,
|
|
"reward": 0.44373124837875366,
|
|
"reward_std": 0.07896788076031953,
|
|
"completion_length": 76.2625,
|
|
"kl": 0.13021735474467278,
|
|
"epoch": 0.47,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.64e-06,
|
|
"rewards/reward_fn": 0.45281187295913694,
|
|
"reward": 0.45281187295913694,
|
|
"reward_std": 0.05061942492611706,
|
|
"completion_length": 78.2375,
|
|
"kl": 0.13731320798397065,
|
|
"epoch": 0.472,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 2.6300000000000002e-06,
|
|
"rewards/reward_fn": 0.4562093824148178,
|
|
"reward": 0.4562093824148178,
|
|
"reward_std": 0.040638361941091716,
|
|
"completion_length": 77.3625,
|
|
"kl": 0.1344783328473568,
|
|
"epoch": 0.474,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 18.125,
|
|
"learning_rate": 2.6200000000000003e-06,
|
|
"rewards/reward_fn": 0.4580037474632263,
|
|
"reward": 0.4580037474632263,
|
|
"reward_std": 0.05447399332770146,
|
|
"completion_length": 78.775,
|
|
"kl": 0.12405369728803635,
|
|
"epoch": 0.476,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 2.6100000000000004e-06,
|
|
"rewards/reward_fn": 0.45782187283039094,
|
|
"reward": 0.45782187283039094,
|
|
"reward_std": 0.04352846188703552,
|
|
"completion_length": 78.2375,
|
|
"kl": 0.14039622321724893,
|
|
"epoch": 0.478,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.6e-06,
|
|
"rewards/reward_fn": 0.470593124628067,
|
|
"reward": 0.470593124628067,
|
|
"reward_std": 0.007097184634767472,
|
|
"completion_length": 77.8875,
|
|
"kl": 0.1220773808658123,
|
|
"epoch": 0.48,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 2.59e-06,
|
|
"rewards/reward_fn": 0.45062249302864077,
|
|
"reward": 0.45062249302864077,
|
|
"reward_std": 0.06360151261324062,
|
|
"completion_length": 78.1875,
|
|
"kl": 0.16903574615716935,
|
|
"epoch": 0.482,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 2.5800000000000003e-06,
|
|
"rewards/reward_fn": 0.45593812465667727,
|
|
"reward": 0.45593812465667727,
|
|
"reward_std": 0.036037556815426794,
|
|
"completion_length": 77.325,
|
|
"kl": 0.17566560804843903,
|
|
"epoch": 0.484,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.5700000000000004e-06,
|
|
"rewards/reward_fn": 0.45412937700748446,
|
|
"reward": 0.45412937700748446,
|
|
"reward_std": 0.060397130448836836,
|
|
"completion_length": 78.6625,
|
|
"kl": 0.14816011264920234,
|
|
"epoch": 0.486,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 2.56e-06,
|
|
"rewards/reward_fn": 0.4632093787193298,
|
|
"reward": 0.4632093787193298,
|
|
"reward_std": 0.044997752620838584,
|
|
"completion_length": 79.2625,
|
|
"kl": 0.1313982665538788,
|
|
"epoch": 0.488,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 2.55e-06,
|
|
"rewards/reward_fn": 0.4657293736934662,
|
|
"reward": 0.4657293736934662,
|
|
"reward_std": 0.022073199006263165,
|
|
"completion_length": 78.9,
|
|
"kl": 0.12473629713058472,
|
|
"epoch": 0.49,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 23.125,
|
|
"learning_rate": 2.5400000000000002e-06,
|
|
"rewards/reward_fn": 0.4348493814468384,
|
|
"reward": 0.4348493814468384,
|
|
"reward_std": 0.07554407969582826,
|
|
"completion_length": 79.5125,
|
|
"kl": 0.1481925331056118,
|
|
"epoch": 0.492,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"loss": 0.0077,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 2.5300000000000003e-06,
|
|
"rewards/reward_fn": 0.43550437688827515,
|
|
"reward": 0.43550437688827515,
|
|
"reward_std": 0.10341594566125423,
|
|
"completion_length": 79.35,
|
|
"kl": 0.1917330376803875,
|
|
"epoch": 0.494,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 2.52e-06,
|
|
"rewards/reward_fn": 0.46648249924182894,
|
|
"reward": 0.46648249924182894,
|
|
"reward_std": 0.030170188657939433,
|
|
"completion_length": 78.1375,
|
|
"kl": 0.16498119458556176,
|
|
"epoch": 0.496,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 2.51e-06,
|
|
"rewards/reward_fn": 0.450721874833107,
|
|
"reward": 0.450721874833107,
|
|
"reward_std": 0.054543074569664896,
|
|
"completion_length": 78.0125,
|
|
"kl": 0.16144041568040848,
|
|
"epoch": 0.498,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 2.5e-06,
|
|
"rewards/reward_fn": 0.47377062737941744,
|
|
"reward": 0.47377062737941744,
|
|
"reward_std": 0.011301003873813897,
|
|
"completion_length": 77.975,
|
|
"kl": 0.13532672077417374,
|
|
"epoch": 0.5,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 2.4900000000000003e-06,
|
|
"rewards/reward_fn": 0.4674256265163422,
|
|
"reward": 0.4674256265163422,
|
|
"reward_std": 0.0174906364409253,
|
|
"completion_length": 79.825,
|
|
"kl": 0.12367920055985451,
|
|
"epoch": 0.502,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.4800000000000004e-06,
|
|
"rewards/reward_fn": 0.4363387554883957,
|
|
"reward": 0.4363387554883957,
|
|
"reward_std": 0.10196942522889003,
|
|
"completion_length": 78.7,
|
|
"kl": 0.15936801359057426,
|
|
"epoch": 0.504,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 2.47e-06,
|
|
"rewards/reward_fn": 0.45225499868392943,
|
|
"reward": 0.45225499868392943,
|
|
"reward_std": 0.059183214767836036,
|
|
"completion_length": 78.9125,
|
|
"kl": 0.17264233008027077,
|
|
"epoch": 0.506,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 2.46e-06,
|
|
"rewards/reward_fn": 0.4540149927139282,
|
|
"reward": 0.4540149927139282,
|
|
"reward_std": 0.05151141767855734,
|
|
"completion_length": 78.875,
|
|
"kl": 0.14860266521573068,
|
|
"epoch": 0.508,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.4500000000000003e-06,
|
|
"rewards/reward_fn": 0.46672500371932985,
|
|
"reward": 0.46672500371932985,
|
|
"reward_std": 0.0238963620737195,
|
|
"completion_length": 79.5875,
|
|
"kl": 0.12659351155161858,
|
|
"epoch": 0.51,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.4400000000000004e-06,
|
|
"rewards/reward_fn": 0.4593931257724762,
|
|
"reward": 0.4593931257724762,
|
|
"reward_std": 0.0301577219623141,
|
|
"completion_length": 79.65,
|
|
"kl": 0.15002150908112527,
|
|
"epoch": 0.512,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 18.0,
|
|
"learning_rate": 2.43e-06,
|
|
"rewards/reward_fn": 0.4625087469816208,
|
|
"reward": 0.4625087469816208,
|
|
"reward_std": 0.03460253309458494,
|
|
"completion_length": 79.1125,
|
|
"kl": 0.14838578924536705,
|
|
"epoch": 0.514,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 18.375,
|
|
"learning_rate": 2.42e-06,
|
|
"rewards/reward_fn": 0.4678725004196167,
|
|
"reward": 0.4678725004196167,
|
|
"reward_std": 0.02502680493053049,
|
|
"completion_length": 78.5125,
|
|
"kl": 0.11876562908291817,
|
|
"epoch": 0.516,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.4100000000000002e-06,
|
|
"rewards/reward_fn": 0.44823938310146333,
|
|
"reward": 0.44823938310146333,
|
|
"reward_std": 0.04440039648325182,
|
|
"completion_length": 78.2,
|
|
"kl": 0.13063137009739875,
|
|
"epoch": 0.518,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 25.125,
|
|
"learning_rate": 2.4000000000000003e-06,
|
|
"rewards/reward_fn": 0.44891312420368196,
|
|
"reward": 0.44891312420368196,
|
|
"reward_std": 0.07504934098105878,
|
|
"completion_length": 78.3625,
|
|
"kl": 0.15268274173140525,
|
|
"epoch": 0.52,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 2.39e-06,
|
|
"rewards/reward_fn": 0.4568462461233139,
|
|
"reward": 0.4568462461233139,
|
|
"reward_std": 0.056088435545098035,
|
|
"completion_length": 79.4125,
|
|
"kl": 0.14741537049412728,
|
|
"epoch": 0.522,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 2.38e-06,
|
|
"rewards/reward_fn": 0.4558568805456161,
|
|
"reward": 0.4558568805456161,
|
|
"reward_std": 0.05745224840939045,
|
|
"completion_length": 78.8375,
|
|
"kl": 0.1723767749965191,
|
|
"epoch": 0.524,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.37e-06,
|
|
"rewards/reward_fn": 0.4718281179666519,
|
|
"reward": 0.4718281179666519,
|
|
"reward_std": 0.014395223173778504,
|
|
"completion_length": 78.9875,
|
|
"kl": 0.14733590111136435,
|
|
"epoch": 0.526,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.3600000000000003e-06,
|
|
"rewards/reward_fn": 0.4554156303405762,
|
|
"reward": 0.4554156303405762,
|
|
"reward_std": 0.05159756838111207,
|
|
"completion_length": 79.175,
|
|
"kl": 0.14898578226566314,
|
|
"epoch": 0.528,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"loss": 0.0075,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.35e-06,
|
|
"rewards/reward_fn": 0.4550568699836731,
|
|
"reward": 0.4550568699836731,
|
|
"reward_std": 0.06614897139370442,
|
|
"completion_length": 78.3,
|
|
"kl": 0.18677168115973472,
|
|
"epoch": 0.53,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 2.3400000000000005e-06,
|
|
"rewards/reward_fn": 0.44545812010765073,
|
|
"reward": 0.44545812010765073,
|
|
"reward_std": 0.07346066441386938,
|
|
"completion_length": 79.625,
|
|
"kl": 0.1740099720656872,
|
|
"epoch": 0.532,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 19.0,
|
|
"learning_rate": 2.33e-06,
|
|
"rewards/reward_fn": 0.45567687749862673,
|
|
"reward": 0.45567687749862673,
|
|
"reward_std": 0.05622612689621746,
|
|
"completion_length": 79.0875,
|
|
"kl": 0.17623607516288758,
|
|
"epoch": 0.534,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 2.3200000000000002e-06,
|
|
"rewards/reward_fn": 0.4575300008058548,
|
|
"reward": 0.4575300008058548,
|
|
"reward_std": 0.06325785100925714,
|
|
"completion_length": 78.85,
|
|
"kl": 0.1518963485956192,
|
|
"epoch": 0.536,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 2.3100000000000003e-06,
|
|
"rewards/reward_fn": 0.4771687567234039,
|
|
"reward": 0.4771687567234039,
|
|
"reward_std": 0.01310007597785443,
|
|
"completion_length": 78.25,
|
|
"kl": 0.14608021229505538,
|
|
"epoch": 0.538,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"loss": 0.0076,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 2.3000000000000004e-06,
|
|
"rewards/reward_fn": 0.440699377655983,
|
|
"reward": 0.440699377655983,
|
|
"reward_std": 0.07997361421585084,
|
|
"completion_length": 78.5,
|
|
"kl": 0.19073922261595727,
|
|
"epoch": 0.54,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 2.29e-06,
|
|
"rewards/reward_fn": 0.46355812549591063,
|
|
"reward": 0.46355812549591063,
|
|
"reward_std": 0.042679897602647544,
|
|
"completion_length": 78.8125,
|
|
"kl": 0.15155968442559242,
|
|
"epoch": 0.542,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.375,
|
|
"learning_rate": 2.28e-06,
|
|
"rewards/reward_fn": 0.47639000713825225,
|
|
"reward": 0.47639000713825225,
|
|
"reward_std": 0.018505998922046275,
|
|
"completion_length": 79.6375,
|
|
"kl": 0.13178130090236664,
|
|
"epoch": 0.544,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 19.0,
|
|
"learning_rate": 2.2700000000000003e-06,
|
|
"rewards/reward_fn": 0.463755002617836,
|
|
"reward": 0.463755002617836,
|
|
"reward_std": 0.02542402143590152,
|
|
"completion_length": 78.775,
|
|
"kl": 0.1374554641544819,
|
|
"epoch": 0.546,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"loss": 0.0067,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 2.2600000000000004e-06,
|
|
"rewards/reward_fn": 0.45715188086032865,
|
|
"reward": 0.45715188086032865,
|
|
"reward_std": 0.0426810149801895,
|
|
"completion_length": 79.3375,
|
|
"kl": 0.16694772839546204,
|
|
"epoch": 0.548,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 2.25e-06,
|
|
"rewards/reward_fn": 0.46370500326156616,
|
|
"reward": 0.46370500326156616,
|
|
"reward_std": 0.023371401114854962,
|
|
"completion_length": 78.325,
|
|
"kl": 0.13150209859013556,
|
|
"epoch": 0.55,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 19.375,
|
|
"learning_rate": 2.24e-06,
|
|
"rewards/reward_fn": 0.44826062619686124,
|
|
"reward": 0.44826062619686124,
|
|
"reward_std": 0.06448173672542908,
|
|
"completion_length": 78.8375,
|
|
"kl": 0.15249428376555443,
|
|
"epoch": 0.552,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 23.25,
|
|
"learning_rate": 2.2300000000000002e-06,
|
|
"rewards/reward_fn": 0.46055562794208527,
|
|
"reward": 0.46055562794208527,
|
|
"reward_std": 0.04732920726528391,
|
|
"completion_length": 78.475,
|
|
"kl": 0.13974663913249968,
|
|
"epoch": 0.554,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 2.2200000000000003e-06,
|
|
"rewards/reward_fn": 0.4677337437868118,
|
|
"reward": 0.4677337437868118,
|
|
"reward_std": 0.02635425798362121,
|
|
"completion_length": 78.4125,
|
|
"kl": 0.1423714838922024,
|
|
"epoch": 0.556,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 2.21e-06,
|
|
"rewards/reward_fn": 0.4616131275892258,
|
|
"reward": 0.4616131275892258,
|
|
"reward_std": 0.03302627064986154,
|
|
"completion_length": 78.9375,
|
|
"kl": 0.1436442255973816,
|
|
"epoch": 0.558,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 2.2e-06,
|
|
"rewards/reward_fn": 0.46321562230587005,
|
|
"reward": 0.46321562230587005,
|
|
"reward_std": 0.04756553352344781,
|
|
"completion_length": 78.5,
|
|
"kl": 0.1262364447116852,
|
|
"epoch": 0.56,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 2.19e-06,
|
|
"rewards/reward_fn": 0.44202812314033507,
|
|
"reward": 0.44202812314033507,
|
|
"reward_std": 0.0773768131621182,
|
|
"completion_length": 78.9125,
|
|
"kl": 0.14810121133923532,
|
|
"epoch": 0.562,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 2.1800000000000003e-06,
|
|
"rewards/reward_fn": 0.46586625576019286,
|
|
"reward": 0.46586625576019286,
|
|
"reward_std": 0.032051419792696836,
|
|
"completion_length": 78.5625,
|
|
"kl": 0.1482535183429718,
|
|
"epoch": 0.564,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 24.75,
|
|
"learning_rate": 2.17e-06,
|
|
"rewards/reward_fn": 0.46913000345230105,
|
|
"reward": 0.46913000345230105,
|
|
"reward_std": 0.032656107540242375,
|
|
"completion_length": 78.5,
|
|
"kl": 0.11947640255093575,
|
|
"epoch": 0.566,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 2.16e-06,
|
|
"rewards/reward_fn": 0.439087501168251,
|
|
"reward": 0.439087501168251,
|
|
"reward_std": 0.09692132237832993,
|
|
"completion_length": 79.6625,
|
|
"kl": 0.1427506759762764,
|
|
"epoch": 0.568,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 2.15e-06,
|
|
"rewards/reward_fn": 0.4551968663930893,
|
|
"reward": 0.4551968663930893,
|
|
"reward_std": 0.043816833925666286,
|
|
"completion_length": 77.55,
|
|
"kl": 0.17263874933123588,
|
|
"epoch": 0.57,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 2.1400000000000003e-06,
|
|
"rewards/reward_fn": 0.4475862592458725,
|
|
"reward": 0.4475862592458725,
|
|
"reward_std": 0.061305654630996284,
|
|
"completion_length": 79.3375,
|
|
"kl": 0.1420199103653431,
|
|
"epoch": 0.572,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 2.13e-06,
|
|
"rewards/reward_fn": 0.4612312436103821,
|
|
"reward": 0.4612312436103821,
|
|
"reward_std": 0.04303327279048972,
|
|
"completion_length": 78.9125,
|
|
"kl": 0.13926436081528665,
|
|
"epoch": 0.574,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"loss": 0.0079,
|
|
"grad_norm": 38.0,
|
|
"learning_rate": 2.12e-06,
|
|
"rewards/reward_fn": 0.4627299964427948,
|
|
"reward": 0.4627299964427948,
|
|
"reward_std": 0.042749036371242256,
|
|
"completion_length": 77.8625,
|
|
"kl": 0.1982392191886902,
|
|
"epoch": 0.576,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 18.75,
|
|
"learning_rate": 2.11e-06,
|
|
"rewards/reward_fn": 0.4696300059556961,
|
|
"reward": 0.4696300059556961,
|
|
"reward_std": 0.029448882048018276,
|
|
"completion_length": 79.9125,
|
|
"kl": 0.14228403344750404,
|
|
"epoch": 0.578,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.1000000000000002e-06,
|
|
"rewards/reward_fn": 0.4625418782234192,
|
|
"reward": 0.4625418782234192,
|
|
"reward_std": 0.023554211598820984,
|
|
"completion_length": 78.7375,
|
|
"kl": 0.12463297769427299,
|
|
"epoch": 0.58,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.09e-06,
|
|
"rewards/reward_fn": 0.4666400045156479,
|
|
"reward": 0.4666400045156479,
|
|
"reward_std": 0.01901569733163342,
|
|
"completion_length": 79.2875,
|
|
"kl": 0.13114793226122856,
|
|
"epoch": 0.582,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"loss": 0.0074,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 2.08e-06,
|
|
"rewards/reward_fn": 0.4477406233549118,
|
|
"reward": 0.4477406233549118,
|
|
"reward_std": 0.06840260641183704,
|
|
"completion_length": 78.7625,
|
|
"kl": 0.1860959157347679,
|
|
"epoch": 0.584,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 18.25,
|
|
"learning_rate": 2.07e-06,
|
|
"rewards/reward_fn": 0.47093687057495115,
|
|
"reward": 0.47093687057495115,
|
|
"reward_std": 0.009799153183121235,
|
|
"completion_length": 77.5,
|
|
"kl": 0.1460045598447323,
|
|
"epoch": 0.586,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 2.06e-06,
|
|
"rewards/reward_fn": 0.4671725004911423,
|
|
"reward": 0.4671725004911423,
|
|
"reward_std": 0.029030334879644216,
|
|
"completion_length": 78.0125,
|
|
"kl": 0.14976133704185485,
|
|
"epoch": 0.588,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 2.05e-06,
|
|
"rewards/reward_fn": 0.4621724963188171,
|
|
"reward": 0.4621724963188171,
|
|
"reward_std": 0.042897804221138355,
|
|
"completion_length": 78.5875,
|
|
"kl": 0.1281472846865654,
|
|
"epoch": 0.59,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 2.04e-06,
|
|
"rewards/reward_fn": 0.4698318690061569,
|
|
"reward": 0.4698318690061569,
|
|
"reward_std": 0.023317102977307512,
|
|
"completion_length": 78.5375,
|
|
"kl": 0.12476283833384513,
|
|
"epoch": 0.592,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 2.0300000000000005e-06,
|
|
"rewards/reward_fn": 0.45168625712394717,
|
|
"reward": 0.45168625712394717,
|
|
"reward_std": 0.06396679894533008,
|
|
"completion_length": 78.775,
|
|
"kl": 0.15403145402669907,
|
|
"epoch": 0.594,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 2.02e-06,
|
|
"rewards/reward_fn": 0.45557625591754913,
|
|
"reward": 0.45557625591754913,
|
|
"reward_std": 0.04759975708439015,
|
|
"completion_length": 77.5875,
|
|
"kl": 0.14153004586696624,
|
|
"epoch": 0.596,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 2.0100000000000002e-06,
|
|
"rewards/reward_fn": 0.45877124965190885,
|
|
"reward": 0.45877124965190885,
|
|
"reward_std": 0.038299218472093347,
|
|
"completion_length": 79.1875,
|
|
"kl": 0.1336129680275917,
|
|
"epoch": 0.598,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"rewards/reward_fn": 0.45951750576496125,
|
|
"reward": 0.45951750576496125,
|
|
"reward_std": 0.043301355338189754,
|
|
"completion_length": 78.8625,
|
|
"kl": 0.1319414682686329,
|
|
"epoch": 0.6,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 19.25,
|
|
"learning_rate": 1.9900000000000004e-06,
|
|
"rewards/reward_fn": 0.4404612571001053,
|
|
"reward": 0.4404612571001053,
|
|
"reward_std": 0.07990776300430298,
|
|
"completion_length": 77.75,
|
|
"kl": 0.15399570986628533,
|
|
"epoch": 0.602,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 1.98e-06,
|
|
"rewards/reward_fn": 0.4647749960422516,
|
|
"reward": 0.4647749960422516,
|
|
"reward_std": 0.047874861588934434,
|
|
"completion_length": 78.975,
|
|
"kl": 0.14728261902928352,
|
|
"epoch": 0.604,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 19.25,
|
|
"learning_rate": 1.97e-06,
|
|
"rewards/reward_fn": 0.45807936787605286,
|
|
"reward": 0.45807936787605286,
|
|
"reward_std": 0.060872105229645965,
|
|
"completion_length": 78.6625,
|
|
"kl": 0.1391053855419159,
|
|
"epoch": 0.606,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.9600000000000003e-06,
|
|
"rewards/reward_fn": 0.4504493743181229,
|
|
"reward": 0.4504493743181229,
|
|
"reward_std": 0.06272484959335997,
|
|
"completion_length": 78.0125,
|
|
"kl": 0.17193232327699662,
|
|
"epoch": 0.608,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.9500000000000004e-06,
|
|
"rewards/reward_fn": 0.439431244134903,
|
|
"reward": 0.439431244134903,
|
|
"reward_std": 0.07358825565315782,
|
|
"completion_length": 78.675,
|
|
"kl": 0.16932241916656493,
|
|
"epoch": 0.61,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 1.94e-06,
|
|
"rewards/reward_fn": 0.4701712429523468,
|
|
"reward": 0.4701712429523468,
|
|
"reward_std": 0.025754676898941398,
|
|
"completion_length": 78.45,
|
|
"kl": 0.1536574937403202,
|
|
"epoch": 0.612,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 1.93e-06,
|
|
"rewards/reward_fn": 0.4685331225395203,
|
|
"reward": 0.4685331225395203,
|
|
"reward_std": 0.02594901086995378,
|
|
"completion_length": 78.625,
|
|
"kl": 0.12761929631233215,
|
|
"epoch": 0.614,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 1.9200000000000003e-06,
|
|
"rewards/reward_fn": 0.46238250136375425,
|
|
"reward": 0.46238250136375425,
|
|
"reward_std": 0.04514178307726979,
|
|
"completion_length": 77.6125,
|
|
"kl": 0.1310683749616146,
|
|
"epoch": 0.616,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 26.375,
|
|
"learning_rate": 1.9100000000000003e-06,
|
|
"rewards/reward_fn": 0.46453936994075773,
|
|
"reward": 0.46453936994075773,
|
|
"reward_std": 0.05459905466996133,
|
|
"completion_length": 78.525,
|
|
"kl": 0.12022457122802735,
|
|
"epoch": 0.618,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 19.0,
|
|
"learning_rate": 1.9000000000000002e-06,
|
|
"rewards/reward_fn": 0.46645999848842623,
|
|
"reward": 0.46645999848842623,
|
|
"reward_std": 0.024052193760871886,
|
|
"completion_length": 78.6875,
|
|
"kl": 0.16018542796373367,
|
|
"epoch": 0.62,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.8900000000000001e-06,
|
|
"rewards/reward_fn": 0.4562818706035614,
|
|
"reward": 0.4562818706035614,
|
|
"reward_std": 0.043363090697675945,
|
|
"completion_length": 78.7375,
|
|
"kl": 0.1383350558578968,
|
|
"epoch": 0.622,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.8800000000000002e-06,
|
|
"rewards/reward_fn": 0.47267499268054963,
|
|
"reward": 0.47267499268054963,
|
|
"reward_std": 0.03722939351573586,
|
|
"completion_length": 78.1625,
|
|
"kl": 0.14105435311794282,
|
|
"epoch": 0.624,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 1.87e-06,
|
|
"rewards/reward_fn": 0.4576281189918518,
|
|
"reward": 0.4576281189918518,
|
|
"reward_std": 0.05807865222450346,
|
|
"completion_length": 79.4,
|
|
"kl": 0.13264633268117904,
|
|
"epoch": 0.626,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 1.8600000000000002e-06,
|
|
"rewards/reward_fn": 0.42023812532424926,
|
|
"reward": 0.42023812532424926,
|
|
"reward_std": 0.11829792927019298,
|
|
"completion_length": 76.625,
|
|
"kl": 0.17229357063770295,
|
|
"epoch": 0.628,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 1.85e-06,
|
|
"rewards/reward_fn": 0.46860812306404115,
|
|
"reward": 0.46860812306404115,
|
|
"reward_std": 0.03086728664347902,
|
|
"completion_length": 78.7875,
|
|
"kl": 0.13762294948101045,
|
|
"epoch": 0.63,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 1.8400000000000002e-06,
|
|
"rewards/reward_fn": 0.46147062480449674,
|
|
"reward": 0.46147062480449674,
|
|
"reward_std": 0.04284065024694428,
|
|
"completion_length": 77.675,
|
|
"kl": 0.1324526160955429,
|
|
"epoch": 0.632,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 1.83e-06,
|
|
"rewards/reward_fn": 0.4482531249523163,
|
|
"reward": 0.4482531249523163,
|
|
"reward_std": 0.07075127304997295,
|
|
"completion_length": 75.85,
|
|
"kl": 0.16137402653694152,
|
|
"epoch": 0.634,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 1.8200000000000002e-06,
|
|
"rewards/reward_fn": 0.4649731248617172,
|
|
"reward": 0.4649731248617172,
|
|
"reward_std": 0.027589096594601868,
|
|
"completion_length": 77.5625,
|
|
"kl": 0.12578429877758027,
|
|
"epoch": 0.636,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1.81e-06,
|
|
"rewards/reward_fn": 0.46504874527454376,
|
|
"reward": 0.46504874527454376,
|
|
"reward_std": 0.02663288627518341,
|
|
"completion_length": 78.3875,
|
|
"kl": 0.12880957499146461,
|
|
"epoch": 0.638,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.8000000000000001e-06,
|
|
"rewards/reward_fn": 0.4477743715047836,
|
|
"reward": 0.4477743715047836,
|
|
"reward_std": 0.06575249675661325,
|
|
"completion_length": 76.825,
|
|
"kl": 0.14026456847786903,
|
|
"epoch": 0.64,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 1.79e-06,
|
|
"rewards/reward_fn": 0.46851625442504885,
|
|
"reward": 0.46851625442504885,
|
|
"reward_std": 0.03404894776176661,
|
|
"completion_length": 78.3,
|
|
"kl": 0.13332988694310188,
|
|
"epoch": 0.642,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1.7800000000000001e-06,
|
|
"rewards/reward_fn": 0.45667624771595,
|
|
"reward": 0.45667624771595,
|
|
"reward_std": 0.05264936711173505,
|
|
"completion_length": 78.5875,
|
|
"kl": 0.14069104120135306,
|
|
"epoch": 0.644,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 1.77e-06,
|
|
"rewards/reward_fn": 0.4541974991559982,
|
|
"reward": 0.4541974991559982,
|
|
"reward_std": 0.06876377174630761,
|
|
"completion_length": 78.325,
|
|
"kl": 0.1617581441998482,
|
|
"epoch": 0.646,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 1.76e-06,
|
|
"rewards/reward_fn": 0.47011750638484956,
|
|
"reward": 0.47011750638484956,
|
|
"reward_std": 0.027857921156100928,
|
|
"completion_length": 78.5,
|
|
"kl": 0.1346297614276409,
|
|
"epoch": 0.648,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 1.75e-06,
|
|
"rewards/reward_fn": 0.4520518720149994,
|
|
"reward": 0.4520518720149994,
|
|
"reward_std": 0.0729821051703766,
|
|
"completion_length": 77.825,
|
|
"kl": 0.161976557970047,
|
|
"epoch": 0.65,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 1.74e-06,
|
|
"rewards/reward_fn": 0.4532381296157837,
|
|
"reward": 0.4532381296157837,
|
|
"reward_std": 0.06829985191579908,
|
|
"completion_length": 77.525,
|
|
"kl": 0.13216826990246772,
|
|
"epoch": 0.652,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 1.73e-06,
|
|
"rewards/reward_fn": 0.4630106300115585,
|
|
"reward": 0.4630106300115585,
|
|
"reward_std": 0.05130832166178152,
|
|
"completion_length": 78.8875,
|
|
"kl": 0.12697028666734694,
|
|
"epoch": 0.654,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 1.72e-06,
|
|
"rewards/reward_fn": 0.4494799941778183,
|
|
"reward": 0.4494799941778183,
|
|
"reward_std": 0.06570386737585068,
|
|
"completion_length": 76.425,
|
|
"kl": 0.14841574504971505,
|
|
"epoch": 0.656,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 1.7100000000000004e-06,
|
|
"rewards/reward_fn": 0.45594811737537383,
|
|
"reward": 0.45594811737537383,
|
|
"reward_std": 0.05052668444113806,
|
|
"completion_length": 79.3875,
|
|
"kl": 0.11954338103532791,
|
|
"epoch": 0.658,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 1.7000000000000002e-06,
|
|
"rewards/reward_fn": 0.46476125419139863,
|
|
"reward": 0.46476125419139863,
|
|
"reward_std": 0.043870922236237675,
|
|
"completion_length": 78.7875,
|
|
"kl": 0.1275065064430237,
|
|
"epoch": 0.66,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 1.6900000000000003e-06,
|
|
"rewards/reward_fn": 0.46913999915122984,
|
|
"reward": 0.46913999915122984,
|
|
"reward_std": 0.006915005797054619,
|
|
"completion_length": 78.5375,
|
|
"kl": 0.12140461131930351,
|
|
"epoch": 0.662,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 1.6800000000000002e-06,
|
|
"rewards/reward_fn": 0.4625368744134903,
|
|
"reward": 0.4625368744134903,
|
|
"reward_std": 0.02914451065007597,
|
|
"completion_length": 77.95,
|
|
"kl": 0.1269746668636799,
|
|
"epoch": 0.664,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 24.125,
|
|
"learning_rate": 1.6700000000000003e-06,
|
|
"rewards/reward_fn": 0.4738156259059906,
|
|
"reward": 0.4738156259059906,
|
|
"reward_std": 0.020672354963608086,
|
|
"completion_length": 77.8,
|
|
"kl": 0.16429368406534195,
|
|
"epoch": 0.666,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 1.6600000000000002e-06,
|
|
"rewards/reward_fn": 0.4726106315851212,
|
|
"reward": 0.4726106315851212,
|
|
"reward_std": 0.011511084495577962,
|
|
"completion_length": 79.3375,
|
|
"kl": 0.13863546177744865,
|
|
"epoch": 0.668,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 1.6500000000000003e-06,
|
|
"rewards/reward_fn": 0.46120937168598175,
|
|
"reward": 0.46120937168598175,
|
|
"reward_std": 0.04015400728676468,
|
|
"completion_length": 78.25,
|
|
"kl": 0.13780420050024986,
|
|
"epoch": 0.67,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 1.6400000000000002e-06,
|
|
"rewards/reward_fn": 0.4591624945402145,
|
|
"reward": 0.4591624945402145,
|
|
"reward_std": 0.0403320163837634,
|
|
"completion_length": 78.8875,
|
|
"kl": 0.15152628272771834,
|
|
"epoch": 0.672,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 1.6300000000000003e-06,
|
|
"rewards/reward_fn": 0.46432062685489656,
|
|
"reward": 0.46432062685489656,
|
|
"reward_std": 0.03836179277859628,
|
|
"completion_length": 78.275,
|
|
"kl": 0.12853171303868294,
|
|
"epoch": 0.674,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 1.6200000000000002e-06,
|
|
"rewards/reward_fn": 0.4714624971151352,
|
|
"reward": 0.4714624971151352,
|
|
"reward_std": 0.028523307130672037,
|
|
"completion_length": 78.3125,
|
|
"kl": 0.14340822845697404,
|
|
"epoch": 0.676,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 1.6100000000000003e-06,
|
|
"rewards/reward_fn": 0.4591949999332428,
|
|
"reward": 0.4591949999332428,
|
|
"reward_std": 0.038035544892773034,
|
|
"completion_length": 78.15,
|
|
"kl": 0.14982439056038857,
|
|
"epoch": 0.678,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 1.6000000000000001e-06,
|
|
"rewards/reward_fn": 0.4653699994087219,
|
|
"reward": 0.4653699994087219,
|
|
"reward_std": 0.020481601386563852,
|
|
"completion_length": 76.7625,
|
|
"kl": 0.14447411969304086,
|
|
"epoch": 0.68,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.5900000000000002e-06,
|
|
"rewards/reward_fn": 0.4341324925422668,
|
|
"reward": 0.4341324925422668,
|
|
"reward_std": 0.09427430615760386,
|
|
"completion_length": 77.8,
|
|
"kl": 0.14823570474982262,
|
|
"epoch": 0.682,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 1.5800000000000001e-06,
|
|
"rewards/reward_fn": 0.45423062741756437,
|
|
"reward": 0.45423062741756437,
|
|
"reward_std": 0.04875338152050972,
|
|
"completion_length": 79.475,
|
|
"kl": 0.1357567824423313,
|
|
"epoch": 0.684,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 1.5700000000000002e-06,
|
|
"rewards/reward_fn": 0.4559962421655655,
|
|
"reward": 0.4559962421655655,
|
|
"reward_std": 0.06438031857833267,
|
|
"completion_length": 78.4875,
|
|
"kl": 0.15894640609622002,
|
|
"epoch": 0.686,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 1.56e-06,
|
|
"rewards/reward_fn": 0.4572743773460388,
|
|
"reward": 0.4572743773460388,
|
|
"reward_std": 0.04752160895150155,
|
|
"completion_length": 79.1125,
|
|
"kl": 0.16350691244006157,
|
|
"epoch": 0.688,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 22.875,
|
|
"learning_rate": 1.5500000000000002e-06,
|
|
"rewards/reward_fn": 0.451460000872612,
|
|
"reward": 0.451460000872612,
|
|
"reward_std": 0.06449790641199797,
|
|
"completion_length": 78.6375,
|
|
"kl": 0.17609091848134995,
|
|
"epoch": 0.69,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 18.0,
|
|
"learning_rate": 1.54e-06,
|
|
"rewards/reward_fn": 0.46678187251091,
|
|
"reward": 0.46678187251091,
|
|
"reward_std": 0.028732791543006897,
|
|
"completion_length": 77.3875,
|
|
"kl": 0.12948581501841544,
|
|
"epoch": 0.692,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 17.75,
|
|
"learning_rate": 1.5300000000000002e-06,
|
|
"rewards/reward_fn": 0.4550174981355667,
|
|
"reward": 0.4550174981355667,
|
|
"reward_std": 0.06296568798134103,
|
|
"completion_length": 78.3875,
|
|
"kl": 0.14993617683649063,
|
|
"epoch": 0.694,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 24.0,
|
|
"learning_rate": 1.52e-06,
|
|
"rewards/reward_fn": 0.45879937410354615,
|
|
"reward": 0.45879937410354615,
|
|
"reward_std": 0.04999549321364612,
|
|
"completion_length": 77.975,
|
|
"kl": 0.13548573106527328,
|
|
"epoch": 0.696,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 19.5,
|
|
"learning_rate": 1.5100000000000002e-06,
|
|
"rewards/reward_fn": 0.4645506262779236,
|
|
"reward": 0.4645506262779236,
|
|
"reward_std": 0.025334799219854175,
|
|
"completion_length": 79.025,
|
|
"kl": 0.1345573790371418,
|
|
"epoch": 0.698,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"loss": 0.0072,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 1.5e-06,
|
|
"rewards/reward_fn": 0.4387993663549423,
|
|
"reward": 0.4387993663549423,
|
|
"reward_std": 0.0916181854379829,
|
|
"completion_length": 78.8125,
|
|
"kl": 0.18051299825310707,
|
|
"epoch": 0.7,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 19.375,
|
|
"learning_rate": 1.4900000000000001e-06,
|
|
"rewards/reward_fn": 0.4713474988937378,
|
|
"reward": 0.4713474988937378,
|
|
"reward_std": 0.02058067887555808,
|
|
"completion_length": 78.3875,
|
|
"kl": 0.14509371370077134,
|
|
"epoch": 0.702,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 1.48e-06,
|
|
"rewards/reward_fn": 0.45444686710834503,
|
|
"reward": 0.45444686710834503,
|
|
"reward_std": 0.06304303905926645,
|
|
"completion_length": 77.1,
|
|
"kl": 0.1549811489880085,
|
|
"epoch": 0.704,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 1.4700000000000001e-06,
|
|
"rewards/reward_fn": 0.4627524971961975,
|
|
"reward": 0.4627524971961975,
|
|
"reward_std": 0.04254062173422426,
|
|
"completion_length": 78.4625,
|
|
"kl": 0.1384074404835701,
|
|
"epoch": 0.706,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 1.46e-06,
|
|
"rewards/reward_fn": 0.45813000202178955,
|
|
"reward": 0.45813000202178955,
|
|
"reward_std": 0.04893373708473518,
|
|
"completion_length": 78.0,
|
|
"kl": 0.15400241911411286,
|
|
"epoch": 0.708,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.45e-06,
|
|
"rewards/reward_fn": 0.4570950001478195,
|
|
"reward": 0.4570950001478195,
|
|
"reward_std": 0.04461987121030688,
|
|
"completion_length": 78.6625,
|
|
"kl": 0.14551043882966042,
|
|
"epoch": 0.71,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 19.0,
|
|
"learning_rate": 1.44e-06,
|
|
"rewards/reward_fn": 0.45253312587738037,
|
|
"reward": 0.45253312587738037,
|
|
"reward_std": 0.04993348123971373,
|
|
"completion_length": 79.375,
|
|
"kl": 0.15369636416435242,
|
|
"epoch": 0.712,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 1.43e-06,
|
|
"rewards/reward_fn": 0.4615906268358231,
|
|
"reward": 0.4615906268358231,
|
|
"reward_std": 0.0613109068479389,
|
|
"completion_length": 76.65,
|
|
"kl": 0.14984343126416205,
|
|
"epoch": 0.714,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.42e-06,
|
|
"rewards/reward_fn": 0.4519368767738342,
|
|
"reward": 0.4519368767738342,
|
|
"reward_std": 0.05483808619901538,
|
|
"completion_length": 78.6,
|
|
"kl": 0.12471728846430778,
|
|
"epoch": 0.716,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 1.41e-06,
|
|
"rewards/reward_fn": 0.4455212503671646,
|
|
"reward": 0.4455212503671646,
|
|
"reward_std": 0.06304481262341141,
|
|
"completion_length": 78.2625,
|
|
"kl": 0.14399517476558685,
|
|
"epoch": 0.718,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.4000000000000001e-06,
|
|
"rewards/reward_fn": 0.44046937823295595,
|
|
"reward": 0.44046937823295595,
|
|
"reward_std": 0.08519753144355491,
|
|
"completion_length": 78.35,
|
|
"kl": 0.17743645012378692,
|
|
"epoch": 0.72,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 1.3900000000000002e-06,
|
|
"rewards/reward_fn": 0.44510937929153443,
|
|
"reward": 0.44510937929153443,
|
|
"reward_std": 0.064357951504644,
|
|
"completion_length": 78.325,
|
|
"kl": 0.16583998426795005,
|
|
"epoch": 0.722,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 1.3800000000000001e-06,
|
|
"rewards/reward_fn": 0.4451799988746643,
|
|
"reward": 0.4451799988746643,
|
|
"reward_std": 0.06354925713967532,
|
|
"completion_length": 78.675,
|
|
"kl": 0.1327526532113552,
|
|
"epoch": 0.724,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 23.875,
|
|
"learning_rate": 1.3700000000000002e-06,
|
|
"rewards/reward_fn": 0.4643556296825409,
|
|
"reward": 0.4643556296825409,
|
|
"reward_std": 0.03843736774288118,
|
|
"completion_length": 77.925,
|
|
"kl": 0.13086711019277572,
|
|
"epoch": 0.726,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"loss": 0.0047,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.3600000000000001e-06,
|
|
"rewards/reward_fn": 0.47222812473773956,
|
|
"reward": 0.47222812473773956,
|
|
"reward_std": 0.014235112490132451,
|
|
"completion_length": 78.6625,
|
|
"kl": 0.11623715609312057,
|
|
"epoch": 0.728,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"loss": 0.0072,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 1.3500000000000002e-06,
|
|
"rewards/reward_fn": 0.4484899967908859,
|
|
"reward": 0.4484899967908859,
|
|
"reward_std": 0.06967922276817262,
|
|
"completion_length": 77.825,
|
|
"kl": 0.17991492599248887,
|
|
"epoch": 0.73,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 24.5,
|
|
"learning_rate": 1.34e-06,
|
|
"rewards/reward_fn": 0.46708749830722807,
|
|
"reward": 0.46708749830722807,
|
|
"reward_std": 0.02486464052926749,
|
|
"completion_length": 78.5875,
|
|
"kl": 0.13221397027373313,
|
|
"epoch": 0.732,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"loss": 0.0073,
|
|
"grad_norm": 18.375,
|
|
"learning_rate": 1.3300000000000002e-06,
|
|
"rewards/reward_fn": 0.45467875599861146,
|
|
"reward": 0.45467875599861146,
|
|
"reward_std": 0.06859695718158036,
|
|
"completion_length": 78.8375,
|
|
"kl": 0.18368308618664742,
|
|
"epoch": 0.734,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1.32e-06,
|
|
"rewards/reward_fn": 0.4613149970769882,
|
|
"reward": 0.4613149970769882,
|
|
"reward_std": 0.03261192251229659,
|
|
"completion_length": 78.35,
|
|
"kl": 0.12882784008979797,
|
|
"epoch": 0.736,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 24.125,
|
|
"learning_rate": 1.3100000000000002e-06,
|
|
"rewards/reward_fn": 0.4719943791627884,
|
|
"reward": 0.4719943791627884,
|
|
"reward_std": 0.009089648583903908,
|
|
"completion_length": 76.3375,
|
|
"kl": 0.13276104778051376,
|
|
"epoch": 0.738,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 1.3e-06,
|
|
"rewards/reward_fn": 0.45800375044345853,
|
|
"reward": 0.45800375044345853,
|
|
"reward_std": 0.0387735236203298,
|
|
"completion_length": 79.45,
|
|
"kl": 0.1281396232545376,
|
|
"epoch": 0.74,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 1.2900000000000001e-06,
|
|
"rewards/reward_fn": 0.46116250157356264,
|
|
"reward": 0.46116250157356264,
|
|
"reward_std": 0.03875681417994201,
|
|
"completion_length": 79.425,
|
|
"kl": 0.1366500124335289,
|
|
"epoch": 0.742,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 1.28e-06,
|
|
"rewards/reward_fn": 0.4411043733358383,
|
|
"reward": 0.4411043733358383,
|
|
"reward_std": 0.07198944769334048,
|
|
"completion_length": 78.2625,
|
|
"kl": 0.14997942075133325,
|
|
"epoch": 0.744,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1.2700000000000001e-06,
|
|
"rewards/reward_fn": 0.4610200017690659,
|
|
"reward": 0.4610200017690659,
|
|
"reward_std": 0.028940725000575186,
|
|
"completion_length": 79.1,
|
|
"kl": 0.1250425823032856,
|
|
"epoch": 0.746,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 1.26e-06,
|
|
"rewards/reward_fn": 0.44400312602519987,
|
|
"reward": 0.44400312602519987,
|
|
"reward_std": 0.07846251965966075,
|
|
"completion_length": 79.3,
|
|
"kl": 0.14869983717799187,
|
|
"epoch": 0.748,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 1.25e-06,
|
|
"rewards/reward_fn": 0.44204375743865965,
|
|
"reward": 0.44204375743865965,
|
|
"reward_std": 0.09281483425293117,
|
|
"completion_length": 78.525,
|
|
"kl": 0.17744441479444503,
|
|
"epoch": 0.75,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"loss": 0.0044,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 1.2400000000000002e-06,
|
|
"rewards/reward_fn": 0.4658162444829941,
|
|
"reward": 0.4658162444829941,
|
|
"reward_std": 0.02522226042347029,
|
|
"completion_length": 78.8,
|
|
"kl": 0.11068192198872566,
|
|
"epoch": 0.752,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 1.23e-06,
|
|
"rewards/reward_fn": 0.4581906199455261,
|
|
"reward": 0.4581906199455261,
|
|
"reward_std": 0.03355656263884157,
|
|
"completion_length": 77.15,
|
|
"kl": 0.13748234882950783,
|
|
"epoch": 0.754,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 1.2200000000000002e-06,
|
|
"rewards/reward_fn": 0.45401187539100646,
|
|
"reward": 0.45401187539100646,
|
|
"reward_std": 0.051014326070435344,
|
|
"completion_length": 78.05,
|
|
"kl": 0.14651698172092437,
|
|
"epoch": 0.756,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 1.21e-06,
|
|
"rewards/reward_fn": 0.47379874885082246,
|
|
"reward": 0.47379874885082246,
|
|
"reward_std": 0.015307459211908282,
|
|
"completion_length": 78.8125,
|
|
"kl": 0.12749662175774573,
|
|
"epoch": 0.758,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 1.2000000000000002e-06,
|
|
"rewards/reward_fn": 0.44789875447750094,
|
|
"reward": 0.44789875447750094,
|
|
"reward_std": 0.0735718347132206,
|
|
"completion_length": 78.3,
|
|
"kl": 0.1748662807047367,
|
|
"epoch": 0.76,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 1.19e-06,
|
|
"rewards/reward_fn": 0.4480418682098389,
|
|
"reward": 0.4480418682098389,
|
|
"reward_std": 0.068895304761827,
|
|
"completion_length": 79.225,
|
|
"kl": 0.15129087641835212,
|
|
"epoch": 0.762,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"loss": 0.0143,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 1.1800000000000001e-06,
|
|
"rewards/reward_fn": 0.4410243809223175,
|
|
"reward": 0.4410243809223175,
|
|
"reward_std": 0.08341788314282894,
|
|
"completion_length": 76.725,
|
|
"kl": 0.3567336067557335,
|
|
"epoch": 0.764,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 17.875,
|
|
"learning_rate": 1.1700000000000002e-06,
|
|
"rewards/reward_fn": 0.4720318764448166,
|
|
"reward": 0.4720318764448166,
|
|
"reward_std": 0.032237262232229114,
|
|
"completion_length": 78.0375,
|
|
"kl": 0.14518789127469062,
|
|
"epoch": 0.766,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 1.1600000000000001e-06,
|
|
"rewards/reward_fn": 0.4640293687582016,
|
|
"reward": 0.4640293687582016,
|
|
"reward_std": 0.030499694612808527,
|
|
"completion_length": 78.4,
|
|
"kl": 0.1303658217191696,
|
|
"epoch": 0.768,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"loss": 0.0073,
|
|
"grad_norm": 27.875,
|
|
"learning_rate": 1.1500000000000002e-06,
|
|
"rewards/reward_fn": 0.44796750247478484,
|
|
"reward": 0.44796750247478484,
|
|
"reward_std": 0.09624997415812686,
|
|
"completion_length": 78.65,
|
|
"kl": 0.18188868314027787,
|
|
"epoch": 0.77,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 1.14e-06,
|
|
"rewards/reward_fn": 0.45636438131332396,
|
|
"reward": 0.45636438131332396,
|
|
"reward_std": 0.08401111733401194,
|
|
"completion_length": 77.075,
|
|
"kl": 0.1628888465464115,
|
|
"epoch": 0.772,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 19.375,
|
|
"learning_rate": 1.1300000000000002e-06,
|
|
"rewards/reward_fn": 0.4671268731355667,
|
|
"reward": 0.4671268731355667,
|
|
"reward_std": 0.024293193663470446,
|
|
"completion_length": 78.3875,
|
|
"kl": 0.13081972151994706,
|
|
"epoch": 0.774,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"loss": 0.0068,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 1.12e-06,
|
|
"rewards/reward_fn": 0.45347937643527986,
|
|
"reward": 0.45347937643527986,
|
|
"reward_std": 0.06302163258660584,
|
|
"completion_length": 79.0,
|
|
"kl": 0.17032922431826591,
|
|
"epoch": 0.776,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1.1100000000000002e-06,
|
|
"rewards/reward_fn": 0.45575874745845796,
|
|
"reward": 0.45575874745845796,
|
|
"reward_std": 0.0562650595093146,
|
|
"completion_length": 78.85,
|
|
"kl": 0.12893958985805512,
|
|
"epoch": 0.778,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 1.1e-06,
|
|
"rewards/reward_fn": 0.4621687412261963,
|
|
"reward": 0.4621687412261963,
|
|
"reward_std": 0.0637435567798093,
|
|
"completion_length": 78.725,
|
|
"kl": 0.15337565019726754,
|
|
"epoch": 0.78,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 23.25,
|
|
"learning_rate": 1.0900000000000002e-06,
|
|
"rewards/reward_fn": 0.46070688366889956,
|
|
"reward": 0.46070688366889956,
|
|
"reward_std": 0.03493543366203085,
|
|
"completion_length": 77.375,
|
|
"kl": 0.1603299029171467,
|
|
"epoch": 0.782,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.25,
|
|
"learning_rate": 1.08e-06,
|
|
"rewards/reward_fn": 0.46500125527381897,
|
|
"reward": 0.46500125527381897,
|
|
"reward_std": 0.024533626122865825,
|
|
"completion_length": 78.7125,
|
|
"kl": 0.1345980040729046,
|
|
"epoch": 0.784,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"loss": 0.0072,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 1.0700000000000001e-06,
|
|
"rewards/reward_fn": 0.43493750393390657,
|
|
"reward": 0.43493750393390657,
|
|
"reward_std": 0.11233580666594208,
|
|
"completion_length": 78.8625,
|
|
"kl": 0.18072494119405746,
|
|
"epoch": 0.786,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 1.06e-06,
|
|
"rewards/reward_fn": 0.46299062967300414,
|
|
"reward": 0.46299062967300414,
|
|
"reward_std": 0.0409371492365608,
|
|
"completion_length": 78.275,
|
|
"kl": 0.12988597080111502,
|
|
"epoch": 0.788,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 1.0500000000000001e-06,
|
|
"rewards/reward_fn": 0.4538056284189224,
|
|
"reward": 0.4538056284189224,
|
|
"reward_std": 0.04634799053892493,
|
|
"completion_length": 78.4375,
|
|
"kl": 0.1439467839896679,
|
|
"epoch": 0.79,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 1.04e-06,
|
|
"rewards/reward_fn": 0.4611237466335297,
|
|
"reward": 0.4611237466335297,
|
|
"reward_std": 0.04574344952125102,
|
|
"completion_length": 78.6,
|
|
"kl": 0.13221421986818313,
|
|
"epoch": 0.792,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 1.03e-06,
|
|
"rewards/reward_fn": 0.4470118790864944,
|
|
"reward": 0.4470118790864944,
|
|
"reward_std": 0.08011215794831514,
|
|
"completion_length": 79.0,
|
|
"kl": 0.15436191707849503,
|
|
"epoch": 0.794,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 1.02e-06,
|
|
"rewards/reward_fn": 0.46021624803543093,
|
|
"reward": 0.46021624803543093,
|
|
"reward_std": 0.040483302506618205,
|
|
"completion_length": 79.525,
|
|
"kl": 0.13130446001887322,
|
|
"epoch": 0.796,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 1.01e-06,
|
|
"rewards/reward_fn": 0.45304437875747683,
|
|
"reward": 0.45304437875747683,
|
|
"reward_std": 0.06350767945405096,
|
|
"completion_length": 77.9125,
|
|
"kl": 0.1545679196715355,
|
|
"epoch": 0.798,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"rewards/reward_fn": 0.46792625188827514,
|
|
"reward": 0.46792625188827514,
|
|
"reward_std": 0.029082121956162155,
|
|
"completion_length": 77.5375,
|
|
"kl": 0.15671682581305504,
|
|
"epoch": 0.8,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 9.9e-07,
|
|
"rewards/reward_fn": 0.4429012507200241,
|
|
"reward": 0.4429012507200241,
|
|
"reward_std": 0.0852669625543058,
|
|
"completion_length": 78.9125,
|
|
"kl": 0.17405613735318184,
|
|
"epoch": 0.802,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 9.800000000000001e-07,
|
|
"rewards/reward_fn": 0.4483262479305267,
|
|
"reward": 0.4483262479305267,
|
|
"reward_std": 0.07652467372827232,
|
|
"completion_length": 77.325,
|
|
"kl": 0.15809645801782607,
|
|
"epoch": 0.804,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 9.7e-07,
|
|
"rewards/reward_fn": 0.45815313160419463,
|
|
"reward": 0.45815313160419463,
|
|
"reward_std": 0.045375860878266394,
|
|
"completion_length": 78.275,
|
|
"kl": 0.1531553089618683,
|
|
"epoch": 0.806,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 9.600000000000001e-07,
|
|
"rewards/reward_fn": 0.4604393750429153,
|
|
"reward": 0.4604393750429153,
|
|
"reward_std": 0.04589560895692557,
|
|
"completion_length": 77.825,
|
|
"kl": 0.1458041973412037,
|
|
"epoch": 0.808,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 9.500000000000001e-07,
|
|
"rewards/reward_fn": 0.4627868801355362,
|
|
"reward": 0.4627868801355362,
|
|
"reward_std": 0.041009452322032305,
|
|
"completion_length": 78.1625,
|
|
"kl": 0.1340768076479435,
|
|
"epoch": 0.81,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 9.400000000000001e-07,
|
|
"rewards/reward_fn": 0.47766625583171846,
|
|
"reward": 0.47766625583171846,
|
|
"reward_std": 0.008443673443980514,
|
|
"completion_length": 79.0625,
|
|
"kl": 0.1261758454144001,
|
|
"epoch": 0.812,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 9.300000000000001e-07,
|
|
"rewards/reward_fn": 0.45916875302791593,
|
|
"reward": 0.45916875302791593,
|
|
"reward_std": 0.05537645731819794,
|
|
"completion_length": 78.35,
|
|
"kl": 0.13748721331357955,
|
|
"epoch": 0.814,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 9.200000000000001e-07,
|
|
"rewards/reward_fn": 0.4704318791627884,
|
|
"reward": 0.4704318791627884,
|
|
"reward_std": 0.02074106188956648,
|
|
"completion_length": 78.2375,
|
|
"kl": 0.1439397320151329,
|
|
"epoch": 0.816,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 23.25,
|
|
"learning_rate": 9.100000000000001e-07,
|
|
"rewards/reward_fn": 0.457552495598793,
|
|
"reward": 0.457552495598793,
|
|
"reward_std": 0.04766743449727073,
|
|
"completion_length": 79.125,
|
|
"kl": 0.14166640490293503,
|
|
"epoch": 0.818,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 25.0,
|
|
"learning_rate": 9.000000000000001e-07,
|
|
"rewards/reward_fn": 0.46860311925411224,
|
|
"reward": 0.46860311925411224,
|
|
"reward_std": 0.03131808526813984,
|
|
"completion_length": 78.3,
|
|
"kl": 0.12001164257526398,
|
|
"epoch": 0.82,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 18.5,
|
|
"learning_rate": 8.900000000000001e-07,
|
|
"rewards/reward_fn": 0.4583775013685226,
|
|
"reward": 0.4583775013685226,
|
|
"reward_std": 0.0501600137562491,
|
|
"completion_length": 78.65,
|
|
"kl": 0.13961323350667953,
|
|
"epoch": 0.822,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 8.8e-07,
|
|
"rewards/reward_fn": 0.4555699944496155,
|
|
"reward": 0.4555699944496155,
|
|
"reward_std": 0.05186676031444222,
|
|
"completion_length": 77.7875,
|
|
"kl": 0.17678724601864815,
|
|
"epoch": 0.824,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"loss": 0.0048,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 8.7e-07,
|
|
"rewards/reward_fn": 0.46306562423706055,
|
|
"reward": 0.46306562423706055,
|
|
"reward_std": 0.025608734460547566,
|
|
"completion_length": 78.45,
|
|
"kl": 0.1207703597843647,
|
|
"epoch": 0.826,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 8.6e-07,
|
|
"rewards/reward_fn": 0.4619231253862381,
|
|
"reward": 0.4619231253862381,
|
|
"reward_std": 0.05284600446466357,
|
|
"completion_length": 78.5625,
|
|
"kl": 0.14405835717916488,
|
|
"epoch": 0.828,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 8.500000000000001e-07,
|
|
"rewards/reward_fn": 0.46677875220775605,
|
|
"reward": 0.46677875220775605,
|
|
"reward_std": 0.02917533617001027,
|
|
"completion_length": 76.925,
|
|
"kl": 0.14229361489415168,
|
|
"epoch": 0.83,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 8.400000000000001e-07,
|
|
"rewards/reward_fn": 0.46606625616550446,
|
|
"reward": 0.46606625616550446,
|
|
"reward_std": 0.0280997826019302,
|
|
"completion_length": 77.9125,
|
|
"kl": 0.13288158997893335,
|
|
"epoch": 0.832,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 26.75,
|
|
"learning_rate": 8.300000000000001e-07,
|
|
"rewards/reward_fn": 0.4598168820142746,
|
|
"reward": 0.4598168820142746,
|
|
"reward_std": 0.03902562449220568,
|
|
"completion_length": 78.4625,
|
|
"kl": 0.13612622767686844,
|
|
"epoch": 0.834,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 8.200000000000001e-07,
|
|
"rewards/reward_fn": 0.4634856253862381,
|
|
"reward": 0.4634856253862381,
|
|
"reward_std": 0.03273412830894813,
|
|
"completion_length": 78.8875,
|
|
"kl": 0.12488429546356201,
|
|
"epoch": 0.836,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 8.100000000000001e-07,
|
|
"rewards/reward_fn": 0.469024994969368,
|
|
"reward": 0.469024994969368,
|
|
"reward_std": 0.025262853922322394,
|
|
"completion_length": 79.1,
|
|
"kl": 0.12443113997578621,
|
|
"epoch": 0.838,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 8.000000000000001e-07,
|
|
"rewards/reward_fn": 0.4686718791723251,
|
|
"reward": 0.4686718791723251,
|
|
"reward_std": 0.03224018139299005,
|
|
"completion_length": 77.9125,
|
|
"kl": 0.15120850279927253,
|
|
"epoch": 0.84,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 24.125,
|
|
"learning_rate": 7.900000000000001e-07,
|
|
"rewards/reward_fn": 0.4641831278800964,
|
|
"reward": 0.4641831278800964,
|
|
"reward_std": 0.045767600310500714,
|
|
"completion_length": 77.9875,
|
|
"kl": 0.14055218696594238,
|
|
"epoch": 0.842,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 7.8e-07,
|
|
"rewards/reward_fn": 0.44297937452793124,
|
|
"reward": 0.44297937452793124,
|
|
"reward_std": 0.0778072669985704,
|
|
"completion_length": 77.5375,
|
|
"kl": 0.147323065251112,
|
|
"epoch": 0.844,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 7.7e-07,
|
|
"rewards/reward_fn": 0.4716887503862381,
|
|
"reward": 0.4716887503862381,
|
|
"reward_std": 0.02907162085175514,
|
|
"completion_length": 78.675,
|
|
"kl": 0.1302117206156254,
|
|
"epoch": 0.846,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 7.6e-07,
|
|
"rewards/reward_fn": 0.4569631278514862,
|
|
"reward": 0.4569631278514862,
|
|
"reward_std": 0.04407282890751958,
|
|
"completion_length": 79.1125,
|
|
"kl": 0.13647983074188233,
|
|
"epoch": 0.848,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"loss": 0.0046,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 7.5e-07,
|
|
"rewards/reward_fn": 0.47706499695777893,
|
|
"reward": 0.47706499695777893,
|
|
"reward_std": 0.009564002160914242,
|
|
"completion_length": 79.5625,
|
|
"kl": 0.11507855504751205,
|
|
"epoch": 0.85,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"loss": 0.0062,
|
|
"grad_norm": 21.625,
|
|
"learning_rate": 7.4e-07,
|
|
"rewards/reward_fn": 0.4304031223058701,
|
|
"reward": 0.4304031223058701,
|
|
"reward_std": 0.08106931184884161,
|
|
"completion_length": 78.85,
|
|
"kl": 0.15556320548057556,
|
|
"epoch": 0.852,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 7.3e-07,
|
|
"rewards/reward_fn": 0.44377937018871305,
|
|
"reward": 0.44377937018871305,
|
|
"reward_std": 0.08343072717543691,
|
|
"completion_length": 78.675,
|
|
"kl": 0.14879855364561081,
|
|
"epoch": 0.854,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 7.2e-07,
|
|
"rewards/reward_fn": 0.45706000328063967,
|
|
"reward": 0.45706000328063967,
|
|
"reward_std": 0.043012913013808426,
|
|
"completion_length": 78.25,
|
|
"kl": 0.1595211073756218,
|
|
"epoch": 0.856,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 19.0,
|
|
"learning_rate": 7.1e-07,
|
|
"rewards/reward_fn": 0.4507762461900711,
|
|
"reward": 0.4507762461900711,
|
|
"reward_std": 0.0820188666926697,
|
|
"completion_length": 78.475,
|
|
"kl": 0.1462649531662464,
|
|
"epoch": 0.858,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 7.000000000000001e-07,
|
|
"rewards/reward_fn": 0.46033436954021456,
|
|
"reward": 0.46033436954021456,
|
|
"reward_std": 0.05020685677882284,
|
|
"completion_length": 77.6375,
|
|
"kl": 0.13829350471496582,
|
|
"epoch": 0.86,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"loss": 0.0065,
|
|
"grad_norm": 20.375,
|
|
"learning_rate": 6.900000000000001e-07,
|
|
"rewards/reward_fn": 0.44231187999248506,
|
|
"reward": 0.44231187999248506,
|
|
"reward_std": 0.0736640966264531,
|
|
"completion_length": 77.6125,
|
|
"kl": 0.1614016644656658,
|
|
"epoch": 0.862,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 6.800000000000001e-07,
|
|
"rewards/reward_fn": 0.45599688291549684,
|
|
"reward": 0.45599688291549684,
|
|
"reward_std": 0.06550167343229987,
|
|
"completion_length": 78.2125,
|
|
"kl": 0.15353991836309433,
|
|
"epoch": 0.864,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 6.7e-07,
|
|
"rewards/reward_fn": 0.4656537532806396,
|
|
"reward": 0.4656537532806396,
|
|
"reward_std": 0.025680063420441,
|
|
"completion_length": 77.125,
|
|
"kl": 0.1347724623978138,
|
|
"epoch": 0.866,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"loss": 0.0067,
|
|
"grad_norm": 20.75,
|
|
"learning_rate": 6.6e-07,
|
|
"rewards/reward_fn": 0.4520668715238571,
|
|
"reward": 0.4520668715238571,
|
|
"reward_std": 0.07245250167325139,
|
|
"completion_length": 78.4375,
|
|
"kl": 0.16720658987760545,
|
|
"epoch": 0.868,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 6.5e-07,
|
|
"rewards/reward_fn": 0.46412250101566316,
|
|
"reward": 0.46412250101566316,
|
|
"reward_std": 0.03379640890052542,
|
|
"completion_length": 77.8625,
|
|
"kl": 0.134855917096138,
|
|
"epoch": 0.87,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 24.25,
|
|
"learning_rate": 6.4e-07,
|
|
"rewards/reward_fn": 0.4482081264257431,
|
|
"reward": 0.4482081264257431,
|
|
"reward_std": 0.07765977667877451,
|
|
"completion_length": 78.7875,
|
|
"kl": 0.13678457364439964,
|
|
"epoch": 0.872,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"loss": 0.0079,
|
|
"grad_norm": 23.0,
|
|
"learning_rate": 6.3e-07,
|
|
"rewards/reward_fn": 0.4295049995183945,
|
|
"reward": 0.4295049995183945,
|
|
"reward_std": 0.12403819523751736,
|
|
"completion_length": 76.825,
|
|
"kl": 0.1965901866555214,
|
|
"epoch": 0.874,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 6.200000000000001e-07,
|
|
"rewards/reward_fn": 0.4354356348514557,
|
|
"reward": 0.4354356348514557,
|
|
"reward_std": 0.10014819449279458,
|
|
"completion_length": 78.525,
|
|
"kl": 0.17188069224357605,
|
|
"epoch": 0.876,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 25.875,
|
|
"learning_rate": 6.100000000000001e-07,
|
|
"rewards/reward_fn": 0.4368724972009659,
|
|
"reward": 0.4368724972009659,
|
|
"reward_std": 0.09698029151186346,
|
|
"completion_length": 78.4875,
|
|
"kl": 0.17358247861266135,
|
|
"epoch": 0.878,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 20.25,
|
|
"learning_rate": 6.000000000000001e-07,
|
|
"rewards/reward_fn": 0.4686275005340576,
|
|
"reward": 0.4686275005340576,
|
|
"reward_std": 0.02711519307922572,
|
|
"completion_length": 78.2125,
|
|
"kl": 0.14824069589376448,
|
|
"epoch": 0.88,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 5.900000000000001e-07,
|
|
"rewards/reward_fn": 0.4606556236743927,
|
|
"reward": 0.4606556236743927,
|
|
"reward_std": 0.057382132229395214,
|
|
"completion_length": 78.55,
|
|
"kl": 0.1566497005522251,
|
|
"epoch": 0.882,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 5.800000000000001e-07,
|
|
"rewards/reward_fn": 0.4678006261587143,
|
|
"reward": 0.4678006261587143,
|
|
"reward_std": 0.028731092542875557,
|
|
"completion_length": 77.9625,
|
|
"kl": 0.14399609267711638,
|
|
"epoch": 0.884,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 5.7e-07,
|
|
"rewards/reward_fn": 0.45871124863624574,
|
|
"reward": 0.45871124863624574,
|
|
"reward_std": 0.061426320811733603,
|
|
"completion_length": 77.2625,
|
|
"kl": 0.13430218696594237,
|
|
"epoch": 0.886,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 5.6e-07,
|
|
"rewards/reward_fn": 0.46343562602996824,
|
|
"reward": 0.46343562602996824,
|
|
"reward_std": 0.048576657217927276,
|
|
"completion_length": 78.0375,
|
|
"kl": 0.13749194145202637,
|
|
"epoch": 0.888,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"loss": 0.005,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 5.5e-07,
|
|
"rewards/reward_fn": 0.45629812180995943,
|
|
"reward": 0.45629812180995943,
|
|
"reward_std": 0.05870918773580343,
|
|
"completion_length": 79.225,
|
|
"kl": 0.1249243251979351,
|
|
"epoch": 0.89,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 5.4e-07,
|
|
"rewards/reward_fn": 0.4663606256246567,
|
|
"reward": 0.4663606256246567,
|
|
"reward_std": 0.03157033738680184,
|
|
"completion_length": 77.35,
|
|
"kl": 0.1368165969848633,
|
|
"epoch": 0.892,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"loss": 0.0061,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 5.3e-07,
|
|
"rewards/reward_fn": 0.4638850033283234,
|
|
"reward": 0.4638850033283234,
|
|
"reward_std": 0.04434651714982465,
|
|
"completion_length": 77.3,
|
|
"kl": 0.1524613842368126,
|
|
"epoch": 0.894,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"loss": 0.0064,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 5.2e-07,
|
|
"rewards/reward_fn": 0.4575281262397766,
|
|
"reward": 0.4575281262397766,
|
|
"reward_std": 0.0726023374358192,
|
|
"completion_length": 77.475,
|
|
"kl": 0.16076251789927481,
|
|
"epoch": 0.896,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 5.1e-07,
|
|
"rewards/reward_fn": 0.4624299943447113,
|
|
"reward": 0.4624299943447113,
|
|
"reward_std": 0.03603266594000161,
|
|
"completion_length": 77.6375,
|
|
"kl": 0.14188418835401534,
|
|
"epoch": 0.898,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 5.000000000000001e-07,
|
|
"rewards/reward_fn": 0.4630206227302551,
|
|
"reward": 0.4630206227302551,
|
|
"reward_std": 0.04361341076437384,
|
|
"completion_length": 79.175,
|
|
"kl": 0.1345802366733551,
|
|
"epoch": 0.9,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 4.900000000000001e-07,
|
|
"rewards/reward_fn": 0.45432437062263487,
|
|
"reward": 0.45432437062263487,
|
|
"reward_std": 0.06151717790635303,
|
|
"completion_length": 77.125,
|
|
"kl": 0.16546293646097182,
|
|
"epoch": 0.902,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 4.800000000000001e-07,
|
|
"rewards/reward_fn": 0.4623143792152405,
|
|
"reward": 0.4623143792152405,
|
|
"reward_std": 0.04675731394672766,
|
|
"completion_length": 78.6875,
|
|
"kl": 0.13369161933660506,
|
|
"epoch": 0.904,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 4.7000000000000005e-07,
|
|
"rewards/reward_fn": 0.4644206166267395,
|
|
"reward": 0.4644206166267395,
|
|
"reward_std": 0.03467130603967235,
|
|
"completion_length": 78.0875,
|
|
"kl": 0.13208074048161506,
|
|
"epoch": 0.906,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 4.6000000000000004e-07,
|
|
"rewards/reward_fn": 0.4571474939584732,
|
|
"reward": 0.4571474939584732,
|
|
"reward_std": 0.05679858090588823,
|
|
"completion_length": 77.3,
|
|
"kl": 0.1495486691594124,
|
|
"epoch": 0.908,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 23.625,
|
|
"learning_rate": 4.5000000000000003e-07,
|
|
"rewards/reward_fn": 0.42995937168598175,
|
|
"reward": 0.42995937168598175,
|
|
"reward_std": 0.10549633367918432,
|
|
"completion_length": 77.675,
|
|
"kl": 0.17482125535607337,
|
|
"epoch": 0.91,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 22.125,
|
|
"learning_rate": 4.4e-07,
|
|
"rewards/reward_fn": 0.4737518787384033,
|
|
"reward": 0.4737518787384033,
|
|
"reward_std": 0.009154988103546202,
|
|
"completion_length": 78.3,
|
|
"kl": 0.1262164294719696,
|
|
"epoch": 0.912,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 23.5,
|
|
"learning_rate": 4.3e-07,
|
|
"rewards/reward_fn": 0.46946938037872316,
|
|
"reward": 0.46946938037872316,
|
|
"reward_std": 0.03301746472716331,
|
|
"completion_length": 78.7125,
|
|
"kl": 0.14844730645418167,
|
|
"epoch": 0.914,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"loss": 0.0078,
|
|
"grad_norm": 21.25,
|
|
"learning_rate": 4.2000000000000006e-07,
|
|
"rewards/reward_fn": 0.44804688096046447,
|
|
"reward": 0.44804688096046447,
|
|
"reward_std": 0.06899331058375538,
|
|
"completion_length": 78.1875,
|
|
"kl": 0.19460128620266914,
|
|
"epoch": 0.916,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 4.1000000000000004e-07,
|
|
"rewards/reward_fn": 0.4682606279850006,
|
|
"reward": 0.4682606279850006,
|
|
"reward_std": 0.05492446586722508,
|
|
"completion_length": 78.1375,
|
|
"kl": 0.13396066278219224,
|
|
"epoch": 0.918,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 18.875,
|
|
"learning_rate": 4.0000000000000003e-07,
|
|
"rewards/reward_fn": 0.45140312910079955,
|
|
"reward": 0.45140312910079955,
|
|
"reward_std": 0.04784779482288286,
|
|
"completion_length": 78.45,
|
|
"kl": 0.14303272366523742,
|
|
"epoch": 0.92,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 21.875,
|
|
"learning_rate": 3.9e-07,
|
|
"rewards/reward_fn": 0.4413818746805191,
|
|
"reward": 0.4413818746805191,
|
|
"reward_std": 0.07519180465023964,
|
|
"completion_length": 78.7,
|
|
"kl": 0.14771961718797683,
|
|
"epoch": 0.922,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 21.375,
|
|
"learning_rate": 3.8e-07,
|
|
"rewards/reward_fn": 0.46304125487804415,
|
|
"reward": 0.46304125487804415,
|
|
"reward_std": 0.042501320654992014,
|
|
"completion_length": 78.475,
|
|
"kl": 0.13340821117162704,
|
|
"epoch": 0.924,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 3.7e-07,
|
|
"rewards/reward_fn": 0.46366499960422514,
|
|
"reward": 0.46366499960422514,
|
|
"reward_std": 0.04564647800289094,
|
|
"completion_length": 78.75,
|
|
"kl": 0.142959389090538,
|
|
"epoch": 0.926,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 3.6e-07,
|
|
"rewards/reward_fn": 0.44440999925136565,
|
|
"reward": 0.44440999925136565,
|
|
"reward_std": 0.09289236271288245,
|
|
"completion_length": 77.9125,
|
|
"kl": 0.17386788129806519,
|
|
"epoch": 0.928,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.375,
|
|
"learning_rate": 3.5000000000000004e-07,
|
|
"rewards/reward_fn": 0.44386188089847567,
|
|
"reward": 0.44386188089847567,
|
|
"reward_std": 0.07522995788604021,
|
|
"completion_length": 77.9375,
|
|
"kl": 0.14186157137155533,
|
|
"epoch": 0.93,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 20.875,
|
|
"learning_rate": 3.4000000000000003e-07,
|
|
"rewards/reward_fn": 0.4641656279563904,
|
|
"reward": 0.4641656279563904,
|
|
"reward_std": 0.04270973342936486,
|
|
"completion_length": 78.125,
|
|
"kl": 0.13535649850964546,
|
|
"epoch": 0.932,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"loss": 0.0075,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 3.3e-07,
|
|
"rewards/reward_fn": 0.4358831226825714,
|
|
"reward": 0.4358831226825714,
|
|
"reward_std": 0.09227959238924086,
|
|
"completion_length": 78.2625,
|
|
"kl": 0.18859679996967316,
|
|
"epoch": 0.934,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 3.2e-07,
|
|
"rewards/reward_fn": 0.46294688284397123,
|
|
"reward": 0.46294688284397123,
|
|
"reward_std": 0.04259873778792098,
|
|
"completion_length": 78.7375,
|
|
"kl": 0.14116661995649338,
|
|
"epoch": 0.936,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 26.875,
|
|
"learning_rate": 3.1000000000000005e-07,
|
|
"rewards/reward_fn": 0.45820999443531035,
|
|
"reward": 0.45820999443531035,
|
|
"reward_std": 0.049100439576432106,
|
|
"completion_length": 78.6875,
|
|
"kl": 0.1331377424299717,
|
|
"epoch": 0.938,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 24.125,
|
|
"learning_rate": 3.0000000000000004e-07,
|
|
"rewards/reward_fn": 0.4553199976682663,
|
|
"reward": 0.4553199976682663,
|
|
"reward_std": 0.06563679699320346,
|
|
"completion_length": 77.9125,
|
|
"kl": 0.13799761608242989,
|
|
"epoch": 0.94,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 23.375,
|
|
"learning_rate": 2.9000000000000003e-07,
|
|
"rewards/reward_fn": 0.45740562677383423,
|
|
"reward": 0.45740562677383423,
|
|
"reward_std": 0.05445564701221883,
|
|
"completion_length": 77.05,
|
|
"kl": 0.17594465613365173,
|
|
"epoch": 0.942,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"loss": 0.0066,
|
|
"grad_norm": 28.25,
|
|
"learning_rate": 2.8e-07,
|
|
"rewards/reward_fn": 0.463620001077652,
|
|
"reward": 0.463620001077652,
|
|
"reward_std": 0.051476556318812074,
|
|
"completion_length": 77.0625,
|
|
"kl": 0.1647212788462639,
|
|
"epoch": 0.944,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 2.7e-07,
|
|
"rewards/reward_fn": 0.46528937220573424,
|
|
"reward": 0.46528937220573424,
|
|
"reward_std": 0.022915772977285087,
|
|
"completion_length": 78.4625,
|
|
"kl": 0.132051981985569,
|
|
"epoch": 0.946,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 19.875,
|
|
"learning_rate": 2.6e-07,
|
|
"rewards/reward_fn": 0.4677406221628189,
|
|
"reward": 0.4677406221628189,
|
|
"reward_std": 0.05549450130201876,
|
|
"completion_length": 78.125,
|
|
"kl": 0.1719025544822216,
|
|
"epoch": 0.948,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 2.5000000000000004e-07,
|
|
"rewards/reward_fn": 0.45939249396324155,
|
|
"reward": 0.45939249396324155,
|
|
"reward_std": 0.0691711014136672,
|
|
"completion_length": 78.875,
|
|
"kl": 0.14711649268865584,
|
|
"epoch": 0.95,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"loss": 0.0069,
|
|
"grad_norm": 23.125,
|
|
"learning_rate": 2.4000000000000003e-07,
|
|
"rewards/reward_fn": 0.4654675006866455,
|
|
"reward": 0.4654675006866455,
|
|
"reward_std": 0.030214719858486207,
|
|
"completion_length": 77.5,
|
|
"kl": 0.1717626817524433,
|
|
"epoch": 0.952,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 17.125,
|
|
"learning_rate": 2.3000000000000002e-07,
|
|
"rewards/reward_fn": 0.45497375130653384,
|
|
"reward": 0.45497375130653384,
|
|
"reward_std": 0.06808556367177516,
|
|
"completion_length": 77.825,
|
|
"kl": 0.15703836753964423,
|
|
"epoch": 0.954,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 22.75,
|
|
"learning_rate": 2.2e-07,
|
|
"rewards/reward_fn": 0.44606186747550963,
|
|
"reward": 0.44606186747550963,
|
|
"reward_std": 0.08989615420578048,
|
|
"completion_length": 78.0875,
|
|
"kl": 0.15060136690735818,
|
|
"epoch": 0.956,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"loss": 0.0051,
|
|
"grad_norm": 20.5,
|
|
"learning_rate": 2.1000000000000003e-07,
|
|
"rewards/reward_fn": 0.47334000170230867,
|
|
"reward": 0.47334000170230867,
|
|
"reward_std": 0.029013207624666394,
|
|
"completion_length": 78.675,
|
|
"kl": 0.1277802363038063,
|
|
"epoch": 0.958,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.125,
|
|
"learning_rate": 2.0000000000000002e-07,
|
|
"rewards/reward_fn": 0.4681881219148636,
|
|
"reward": 0.4681881219148636,
|
|
"reward_std": 0.02434324522037059,
|
|
"completion_length": 79.625,
|
|
"kl": 0.1305567964911461,
|
|
"epoch": 0.96,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"loss": 0.006,
|
|
"grad_norm": 18.25,
|
|
"learning_rate": 1.9e-07,
|
|
"rewards/reward_fn": 0.44461186826229093,
|
|
"reward": 0.44461186826229093,
|
|
"reward_std": 0.07648974631447344,
|
|
"completion_length": 79.3375,
|
|
"kl": 0.14975779727101327,
|
|
"epoch": 0.962,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"loss": 0.0049,
|
|
"grad_norm": 19.125,
|
|
"learning_rate": 1.8e-07,
|
|
"rewards/reward_fn": 0.46052125096321106,
|
|
"reward": 0.46052125096321106,
|
|
"reward_std": 0.0383532726438716,
|
|
"completion_length": 77.55,
|
|
"kl": 0.12272944673895836,
|
|
"epoch": 0.964,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 19.625,
|
|
"learning_rate": 1.7000000000000001e-07,
|
|
"rewards/reward_fn": 0.4670031249523163,
|
|
"reward": 0.4670031249523163,
|
|
"reward_std": 0.03299781592795625,
|
|
"completion_length": 78.025,
|
|
"kl": 0.14154839739203454,
|
|
"epoch": 0.966,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.375,
|
|
"learning_rate": 1.6e-07,
|
|
"rewards/reward_fn": 0.46661687791347506,
|
|
"reward": 0.46661687791347506,
|
|
"reward_std": 0.02604542833287269,
|
|
"completion_length": 77.2,
|
|
"kl": 0.14838956594467162,
|
|
"epoch": 0.968,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 1.5000000000000002e-07,
|
|
"rewards/reward_fn": 0.455153751373291,
|
|
"reward": 0.455153751373291,
|
|
"reward_std": 0.04773070907685906,
|
|
"completion_length": 78.75,
|
|
"kl": 0.1395164869725704,
|
|
"epoch": 0.97,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"loss": 0.0055,
|
|
"grad_norm": 18.625,
|
|
"learning_rate": 1.4e-07,
|
|
"rewards/reward_fn": 0.46802250742912294,
|
|
"reward": 0.46802250742912294,
|
|
"reward_std": 0.03353580196853727,
|
|
"completion_length": 78.1375,
|
|
"kl": 0.13703610971570016,
|
|
"epoch": 0.972,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"loss": 0.0058,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 1.3e-07,
|
|
"rewards/reward_fn": 0.44542625546455383,
|
|
"reward": 0.44542625546455383,
|
|
"reward_std": 0.07354072753805667,
|
|
"completion_length": 79.375,
|
|
"kl": 0.1450169213116169,
|
|
"epoch": 0.974,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"loss": 0.0059,
|
|
"grad_norm": 22.5,
|
|
"learning_rate": 1.2000000000000002e-07,
|
|
"rewards/reward_fn": 0.45854686498641967,
|
|
"reward": 0.45854686498641967,
|
|
"reward_std": 0.05262974831275642,
|
|
"completion_length": 78.1625,
|
|
"kl": 0.1467783972620964,
|
|
"epoch": 0.976,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 21.5,
|
|
"learning_rate": 1.1e-07,
|
|
"rewards/reward_fn": 0.4662149965763092,
|
|
"reward": 0.4662149965763092,
|
|
"reward_std": 0.030651798704639077,
|
|
"completion_length": 77.7375,
|
|
"kl": 0.1360873505473137,
|
|
"epoch": 0.978,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 19.75,
|
|
"learning_rate": 1.0000000000000001e-07,
|
|
"rewards/reward_fn": 0.4552900016307831,
|
|
"reward": 0.4552900016307831,
|
|
"reward_std": 0.05980135982390493,
|
|
"completion_length": 76.8625,
|
|
"kl": 0.15636155605316163,
|
|
"epoch": 0.98,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"loss": 0.0053,
|
|
"grad_norm": 22.0,
|
|
"learning_rate": 9e-08,
|
|
"rewards/reward_fn": 0.45424186289310453,
|
|
"reward": 0.45424186289310453,
|
|
"reward_std": 0.06591771512757987,
|
|
"completion_length": 77.5625,
|
|
"kl": 0.13355037719011306,
|
|
"epoch": 0.982,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"loss": 0.0067,
|
|
"grad_norm": 24.375,
|
|
"learning_rate": 8e-08,
|
|
"rewards/reward_fn": 0.438731250166893,
|
|
"reward": 0.438731250166893,
|
|
"reward_std": 0.10588476944249123,
|
|
"completion_length": 77.7,
|
|
"kl": 0.1672614686191082,
|
|
"epoch": 0.984,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"loss": 0.0045,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 7e-08,
|
|
"rewards/reward_fn": 0.4748474985361099,
|
|
"reward": 0.4748474985361099,
|
|
"reward_std": 0.011794954282231629,
|
|
"completion_length": 78.425,
|
|
"kl": 0.11192921400070191,
|
|
"epoch": 0.986,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"loss": 0.0056,
|
|
"grad_norm": 21.75,
|
|
"learning_rate": 6.000000000000001e-08,
|
|
"rewards/reward_fn": 0.47034125924110415,
|
|
"reward": 0.47034125924110415,
|
|
"reward_std": 0.028933694993611425,
|
|
"completion_length": 77.525,
|
|
"kl": 0.13980434015393256,
|
|
"epoch": 0.988,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"loss": 0.007,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 5.0000000000000004e-08,
|
|
"rewards/reward_fn": 0.46100749671459196,
|
|
"reward": 0.46100749671459196,
|
|
"reward_std": 0.046814579702913764,
|
|
"completion_length": 78.225,
|
|
"kl": 0.17536836490035057,
|
|
"epoch": 0.99,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"loss": 0.0052,
|
|
"grad_norm": 21.0,
|
|
"learning_rate": 4e-08,
|
|
"rewards/reward_fn": 0.45782187581062317,
|
|
"reward": 0.45782187581062317,
|
|
"reward_std": 0.06049414209555835,
|
|
"completion_length": 78.275,
|
|
"kl": 0.13118749782443045,
|
|
"epoch": 0.992,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"loss": 0.0063,
|
|
"grad_norm": 20.625,
|
|
"learning_rate": 3.0000000000000004e-08,
|
|
"rewards/reward_fn": 0.44442749917507174,
|
|
"reward": 0.44442749917507174,
|
|
"reward_std": 0.08098832431714982,
|
|
"completion_length": 77.925,
|
|
"kl": 0.158622158318758,
|
|
"epoch": 0.994,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"loss": 0.0054,
|
|
"grad_norm": 22.625,
|
|
"learning_rate": 2e-08,
|
|
"rewards/reward_fn": 0.47422375380992887,
|
|
"reward": 0.47422375380992887,
|
|
"reward_std": 0.018112805008422585,
|
|
"completion_length": 77.9875,
|
|
"kl": 0.13429155126214026,
|
|
"epoch": 0.996,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"loss": 0.0071,
|
|
"grad_norm": 20.0,
|
|
"learning_rate": 1e-08,
|
|
"rewards/reward_fn": 0.4550556272268295,
|
|
"reward": 0.4550556272268295,
|
|
"reward_std": 0.06616670698858798,
|
|
"completion_length": 78.4625,
|
|
"kl": 0.17691104635596275,
|
|
"epoch": 0.998,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"loss": 0.0057,
|
|
"grad_norm": 20.125,
|
|
"learning_rate": 0.0,
|
|
"rewards/reward_fn": 0.46899437308311465,
|
|
"reward": 0.46899437308311465,
|
|
"reward_std": 0.03491774908034131,
|
|
"completion_length": 78.35,
|
|
"kl": 0.14370609149336816,
|
|
"epoch": 1.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"train_runtime": 10996.7976,
|
|
"train_samples_per_second": 0.455,
|
|
"train_steps_per_second": 0.227,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.005647265207767487,
|
|
"epoch": 1.0,
|
|
"step": 2500
|
|
}
|
|
]
|
|
} |