Files
Qwen2.5-3B-Instruct-RG-Math/trainer_state.json
ModelHub XC a77026bb1a 初始化项目,由ModelHub XC社区提供模型
Model: zafstojano/Qwen2.5-3B-Instruct-RG-Math
Source: Original Platform
2026-05-01 17:10:10 +08:00

8444 lines
246 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.04,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1360.5,
"epoch": 6.666666666666667e-05,
"grad_norm": 62.92084503173828,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"reward": 0.4226251244544983,
"reward_std": 0.23553414642810822,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3742498755455017,
"rewards/tag_count_reward": 0.796875,
"step": 1
},
{
"completion_length": 2048.0,
"epoch": 0.00013333333333333334,
"grad_norm": 79.95831298828125,
"kl": 0.0,
"learning_rate": 1.6666666666666667e-08,
"loss": 0.0,
"reward": 0.7906050682067871,
"reward_std": 0.5845786929130554,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.3187699615955353,
"rewards/tag_count_reward": 0.984375,
"step": 2
},
{
"completion_length": 1188.5,
"epoch": 0.0002,
"grad_norm": 58.27819061279297,
"kl": 0.000675201416015625,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.0,
"reward": 0.5692380666732788,
"reward_std": 0.23440620303153992,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22763691842556,
"rewards/tag_count_reward": 0.796875,
"step": 3
},
{
"completion_length": 318.5,
"epoch": 0.0002666666666666667,
"grad_norm": 85.0557861328125,
"kl": 0.001190185546875,
"learning_rate": 5e-08,
"loss": 0.0,
"reward": 0.7729774713516235,
"reward_std": 0.2601366341114044,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13327254354953766,
"rewards/tag_count_reward": 0.90625,
"step": 4
},
{
"completion_length": 1023.0,
"epoch": 0.0003333333333333333,
"grad_norm": 104.47907257080078,
"kl": 0.00139617919921875,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0001,
"reward": 0.7300776243209839,
"reward_std": 0.2588834762573242,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2542974054813385,
"rewards/tag_count_reward": 0.984375,
"step": 5
},
{
"completion_length": 1320.0,
"epoch": 0.0004,
"grad_norm": 60.059425354003906,
"kl": 0.00128173828125,
"learning_rate": 8.333333333333333e-08,
"loss": 0.0001,
"reward": 0.5090208649635315,
"reward_std": 0.2359948605298996,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2722291350364685,
"rewards/tag_count_reward": 0.78125,
"step": 6
},
{
"completion_length": 1725.5,
"epoch": 0.00046666666666666666,
"grad_norm": 18.682714462280273,
"kl": 0.000553131103515625,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.4545783996582031,
"reward_std": 0.3350987732410431,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3266716003417969,
"rewards/tag_count_reward": 0.71875,
"step": 7
},
{
"completion_length": 2048.0,
"epoch": 0.0005333333333333334,
"grad_norm": 65.13062286376953,
"kl": 0.00049591064453125,
"learning_rate": 1.1666666666666667e-07,
"loss": 0.0,
"reward": 0.38973310589790344,
"reward_std": 0.4630505442619324,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.31339186429977417,
"rewards/tag_count_reward": 0.703125,
"step": 8
},
{
"completion_length": 638.5,
"epoch": 0.0006,
"grad_norm": 45.310523986816406,
"kl": 0.001953125,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0001,
"reward": 0.8727901577949524,
"reward_std": 0.3541698455810547,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.3147098422050476,
"rewards/tag_count_reward": 1.0,
"step": 9
},
{
"completion_length": 1209.0,
"epoch": 0.0006666666666666666,
"grad_norm": 30.45114517211914,
"kl": 0.0008392333984375,
"learning_rate": 1.5e-07,
"loss": 0.0,
"reward": 0.5161985754966736,
"reward_std": 0.23943105340003967,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2025514394044876,
"rewards/tag_count_reward": 0.71875,
"step": 10
},
{
"completion_length": 1543.0,
"epoch": 0.0007333333333333333,
"grad_norm": 73.57073974609375,
"kl": 0.00107574462890625,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0,
"reward": 0.6498641967773438,
"reward_std": 0.18601471185684204,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.28763580322265625,
"rewards/tag_count_reward": 0.9375,
"step": 11
},
{
"completion_length": 1177.5,
"epoch": 0.0008,
"grad_norm": 78.71764373779297,
"kl": 0.0019073486328125,
"learning_rate": 1.833333333333333e-07,
"loss": 0.0001,
"reward": 0.47592616081237793,
"reward_std": 0.25675779581069946,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.30532386898994446,
"rewards/tag_count_reward": 0.78125,
"step": 12
},
{
"completion_length": 1347.0,
"epoch": 0.0008666666666666666,
"grad_norm": 65.83430480957031,
"kl": 0.00299072265625,
"learning_rate": 2e-07,
"loss": 0.0001,
"reward": 0.42067980766296387,
"reward_std": 0.2540290951728821,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.34494519233703613,
"rewards/tag_count_reward": 0.765625,
"step": 13
},
{
"completion_length": 1529.0,
"epoch": 0.0009333333333333333,
"grad_norm": 39.00326919555664,
"kl": 0.00286865234375,
"learning_rate": 2.1666666666666667e-07,
"loss": 0.0001,
"reward": 0.4691365659236908,
"reward_std": 0.34750378131866455,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3121134638786316,
"rewards/tag_count_reward": 0.71875,
"step": 14
},
{
"completion_length": 1282.0,
"epoch": 0.001,
"grad_norm": 37.169288635253906,
"kl": 0.0035400390625,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.0001,
"reward": 0.5637781620025635,
"reward_std": 0.26130610704421997,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20184680819511414,
"rewards/tag_count_reward": 0.765625,
"step": 15
},
{
"completion_length": 1581.5,
"epoch": 0.0010666666666666667,
"grad_norm": 102.38505554199219,
"kl": 0.00457763671875,
"learning_rate": 2.5e-07,
"loss": 0.0002,
"reward": 0.6891039609909058,
"reward_std": 0.4037727117538452,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.32652103900909424,
"rewards/tag_count_reward": 0.953125,
"step": 16
},
{
"completion_length": 2048.0,
"epoch": 0.0011333333333333334,
"grad_norm": 91.1015625,
"kl": 0.011474609375,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0005,
"reward": 0.34357714653015137,
"reward_std": 0.18312102556228638,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.297047883272171,
"rewards/tag_count_reward": 0.640625,
"step": 17
},
{
"completion_length": 1239.0,
"epoch": 0.0012,
"grad_norm": 20.3443660736084,
"kl": 0.036376953125,
"learning_rate": 2.833333333333333e-07,
"loss": 0.0015,
"reward": 0.6245652437210083,
"reward_std": 0.2036026120185852,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2035597562789917,
"rewards/tag_count_reward": 0.828125,
"step": 18
},
{
"completion_length": 1572.0,
"epoch": 0.0012666666666666666,
"grad_norm": 62.626895904541016,
"kl": 0.048095703125,
"learning_rate": 3e-07,
"loss": 0.0019,
"reward": 0.45009517669677734,
"reward_std": 0.23423996567726135,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.31552982330322266,
"rewards/tag_count_reward": 0.765625,
"step": 19
},
{
"completion_length": 1256.5,
"epoch": 0.0013333333333333333,
"grad_norm": 72.09598541259766,
"kl": 0.05615234375,
"learning_rate": 3.166666666666666e-07,
"loss": 0.0022,
"reward": 0.49289625883102417,
"reward_std": 0.32105302810668945,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22585375607013702,
"rewards/tag_count_reward": 0.71875,
"step": 20
},
{
"completion_length": 2048.0,
"epoch": 0.0014,
"grad_norm": 90.30217742919922,
"kl": 0.0478515625,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0019,
"reward": 0.4728601574897766,
"reward_std": 0.3845285475254059,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2458898425102234,
"rewards/tag_count_reward": 0.71875,
"step": 21
},
{
"completion_length": 418.5,
"epoch": 0.0014666666666666667,
"grad_norm": 81.34673309326172,
"kl": 0.1162109375,
"learning_rate": 3.5e-07,
"loss": 0.0046,
"reward": 0.8560193777084351,
"reward_std": 0.1753513216972351,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14398059248924255,
"rewards/tag_count_reward": 1.0,
"step": 22
},
{
"completion_length": 2048.0,
"epoch": 0.0015333333333333334,
"grad_norm": 82.59819030761719,
"kl": 0.0966796875,
"learning_rate": 3.666666666666666e-07,
"loss": 0.0039,
"reward": 0.5138834118843079,
"reward_std": 0.2827845513820648,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3142416179180145,
"rewards/tag_count_reward": 0.828125,
"step": 23
},
{
"completion_length": 1181.0,
"epoch": 0.0016,
"grad_norm": 44.45237350463867,
"kl": 1.15625,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.0462,
"reward": 0.21006432175636292,
"reward_std": 0.24397452175617218,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3211856782436371,
"rewards/tag_count_reward": 0.53125,
"step": 24
},
{
"completion_length": 1337.0,
"epoch": 0.0016666666666666668,
"grad_norm": 50.711578369140625,
"kl": 1.640625,
"learning_rate": 4e-07,
"loss": 0.0652,
"reward": 0.5274523496627808,
"reward_std": 0.25214487314224243,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3475476801395416,
"rewards/tag_count_reward": 0.875,
"step": 25
},
{
"completion_length": 2048.0,
"epoch": 0.0017333333333333333,
"grad_norm": 49.71009826660156,
"kl": 2.203125,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.0879,
"reward": 0.6676552295684814,
"reward_std": 0.29243233799934387,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3948447108268738,
"rewards/tag_count_reward": 1.0,
"step": 26
},
{
"completion_length": 409.0,
"epoch": 0.0018,
"grad_norm": 23.617977142333984,
"kl": 1.96875,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0789,
"reward": 0.7921032905578613,
"reward_std": 0.20389382541179657,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20789667963981628,
"rewards/tag_count_reward": 1.0,
"step": 27
},
{
"completion_length": 347.5,
"epoch": 0.0018666666666666666,
"grad_norm": 77.3822021484375,
"kl": 3.34375,
"learning_rate": 4.5e-07,
"loss": 0.134,
"reward": 0.8200243711471558,
"reward_std": 0.16092178225517273,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.17997564375400543,
"rewards/tag_count_reward": 1.0,
"step": 28
},
{
"completion_length": 2048.0,
"epoch": 0.0019333333333333333,
"grad_norm": 55.70796585083008,
"kl": 3.421875,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.1365,
"reward": 0.5405373573303223,
"reward_std": 0.2981247901916504,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27196261286735535,
"rewards/tag_count_reward": 0.8125,
"step": 29
},
{
"completion_length": 1006.5,
"epoch": 0.002,
"grad_norm": 155.38221740722656,
"kl": 5.6875,
"learning_rate": 4.833333333333333e-07,
"loss": 0.2292,
"reward": 0.6248105764389038,
"reward_std": 0.24843928217887878,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2970644235610962,
"rewards/tag_count_reward": 0.921875,
"step": 30
},
{
"completion_length": 2048.0,
"epoch": 0.0020666666666666667,
"grad_norm": 138.4519500732422,
"kl": 5.125,
"learning_rate": 5e-07,
"loss": 0.205,
"reward": 0.7118746042251587,
"reward_std": 0.32819420099258423,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.4131254255771637,
"rewards/tag_count_reward": 1.0,
"step": 31
},
{
"completion_length": 2048.0,
"epoch": 0.0021333333333333334,
"grad_norm": 47.884742736816406,
"kl": 5.09375,
"learning_rate": 5.166666666666667e-07,
"loss": 0.2037,
"reward": 0.6176910400390625,
"reward_std": 0.17327076196670532,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3823089003562927,
"rewards/tag_count_reward": 1.0,
"step": 32
},
{
"completion_length": 684.5,
"epoch": 0.0022,
"grad_norm": 65.66542053222656,
"kl": 4.34375,
"learning_rate": 5.333333333333333e-07,
"loss": 0.1737,
"reward": 0.7614506483078003,
"reward_std": 0.18302768468856812,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2385493516921997,
"rewards/tag_count_reward": 1.0,
"step": 33
},
{
"completion_length": 1710.5,
"epoch": 0.002266666666666667,
"grad_norm": 53.532588958740234,
"kl": 3.171875,
"learning_rate": 5.5e-07,
"loss": 0.1268,
"reward": 0.524427056312561,
"reward_std": 0.2843332290649414,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20994791388511658,
"rewards/tag_count_reward": 0.734375,
"step": 34
},
{
"completion_length": 2048.0,
"epoch": 0.0023333333333333335,
"grad_norm": 53.95136642456055,
"kl": 2.65625,
"learning_rate": 5.666666666666666e-07,
"loss": 0.1065,
"reward": 0.1748514324426651,
"reward_std": 0.38427919149398804,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2782735824584961,
"rewards/tag_count_reward": 0.453125,
"step": 35
},
{
"completion_length": 1342.0,
"epoch": 0.0024,
"grad_norm": 66.91969299316406,
"kl": 4.25,
"learning_rate": 5.833333333333334e-07,
"loss": 0.1694,
"reward": 0.4533642530441284,
"reward_std": 0.3015359938144684,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3122607469558716,
"rewards/tag_count_reward": 0.765625,
"step": 36
},
{
"completion_length": 1331.0,
"epoch": 0.0024666666666666665,
"grad_norm": 45.550968170166016,
"kl": 4.40625,
"learning_rate": 6e-07,
"loss": 0.1765,
"reward": 0.6793862581253052,
"reward_std": 0.20818302035331726,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3206138014793396,
"rewards/tag_count_reward": 1.0,
"step": 37
},
{
"completion_length": 1590.0,
"epoch": 0.002533333333333333,
"grad_norm": 74.53897094726562,
"kl": 3.625,
"learning_rate": 6.166666666666667e-07,
"loss": 0.1458,
"reward": 0.3450509309768677,
"reward_std": 0.3167041540145874,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.4049490690231323,
"rewards/tag_count_reward": 0.6875,
"step": 38
},
{
"completion_length": 2048.0,
"epoch": 0.0026,
"grad_norm": 66.00804901123047,
"kl": 1.875,
"learning_rate": 6.333333333333332e-07,
"loss": 0.075,
"reward": 0.258808434009552,
"reward_std": 0.40460073947906494,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.288066565990448,
"rewards/tag_count_reward": 0.546875,
"step": 39
},
{
"completion_length": 1535.5,
"epoch": 0.0026666666666666666,
"grad_norm": 20.755111694335938,
"kl": 2.125,
"learning_rate": 6.5e-07,
"loss": 0.0851,
"reward": 0.6783057451248169,
"reward_std": 0.5646753907203674,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.18106922507286072,
"rewards/tag_count_reward": 0.796875,
"step": 40
},
{
"completion_length": 1706.5,
"epoch": 0.0027333333333333333,
"grad_norm": 14.439873695373535,
"kl": 4.8125,
"learning_rate": 6.666666666666666e-07,
"loss": 0.1935,
"reward": 0.7685257196426392,
"reward_std": 0.44565704464912415,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.35647428035736084,
"rewards/tag_count_reward": 1.0,
"step": 41
},
{
"completion_length": 1622.5,
"epoch": 0.0028,
"grad_norm": 17.968324661254883,
"kl": 2.984375,
"learning_rate": 6.833333333333333e-07,
"loss": 0.1196,
"reward": 0.4771651327610016,
"reward_std": 0.34532877802848816,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3822098970413208,
"rewards/tag_count_reward": 0.859375,
"step": 42
},
{
"completion_length": 1579.5,
"epoch": 0.0028666666666666667,
"grad_norm": 49.35251235961914,
"kl": 2.125,
"learning_rate": 7e-07,
"loss": 0.085,
"reward": 0.49850696325302124,
"reward_std": 0.1912895143032074,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26711803674697876,
"rewards/tag_count_reward": 0.765625,
"step": 43
},
{
"completion_length": 2048.0,
"epoch": 0.0029333333333333334,
"grad_norm": 96.884033203125,
"kl": 2.8125,
"learning_rate": 7.166666666666667e-07,
"loss": 0.113,
"reward": 0.6241928935050964,
"reward_std": 0.22991162538528442,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.36018210649490356,
"rewards/tag_count_reward": 0.984375,
"step": 44
},
{
"completion_length": 1337.5,
"epoch": 0.003,
"grad_norm": 10.082756996154785,
"kl": 1.5625,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0626,
"reward": 0.8175822496414185,
"reward_std": 0.2660192847251892,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22929275035858154,
"rewards/tag_count_reward": 0.984375,
"step": 45
},
{
"completion_length": 1441.5,
"epoch": 0.0030666666666666668,
"grad_norm": 50.631492614746094,
"kl": 1.5546875,
"learning_rate": 7.5e-07,
"loss": 0.0621,
"reward": 0.4033077657222748,
"reward_std": 0.2452845275402069,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3623172342777252,
"rewards/tag_count_reward": 0.765625,
"step": 46
},
{
"completion_length": 468.0,
"epoch": 0.0031333333333333335,
"grad_norm": 40.3426399230957,
"kl": 0.47265625,
"learning_rate": 7.666666666666667e-07,
"loss": 0.0189,
"reward": 0.8530340194702148,
"reward_std": 0.2936784029006958,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.20946595072746277,
"rewards/tag_count_reward": 1.0,
"step": 47
},
{
"completion_length": 1290.0,
"epoch": 0.0032,
"grad_norm": 25.1014461517334,
"kl": 2.078125,
"learning_rate": 7.833333333333333e-07,
"loss": 0.0831,
"reward": 0.4508376717567444,
"reward_std": 0.27874755859375,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3304123282432556,
"rewards/tag_count_reward": 0.78125,
"step": 48
},
{
"completion_length": 2018.5,
"epoch": 0.003266666666666667,
"grad_norm": 17.916645050048828,
"kl": 3.875,
"learning_rate": 8e-07,
"loss": 0.1551,
"reward": 0.7111445069313049,
"reward_std": 0.31034788489341736,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.33573055267333984,
"rewards/tag_count_reward": 0.984375,
"step": 49
},
{
"completion_length": 1838.5,
"epoch": 0.0033333333333333335,
"grad_norm": 69.96803283691406,
"kl": 3.46875,
"learning_rate": 8.166666666666666e-07,
"loss": 0.1391,
"reward": 0.44843411445617676,
"reward_std": 0.37969285249710083,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.30156588554382324,
"rewards/tag_count_reward": 0.75,
"step": 50
},
{
"completion_length": 2048.0,
"epoch": 0.0034,
"grad_norm": 209.96632385253906,
"kl": 9.75,
"learning_rate": 8.333333333333333e-07,
"loss": 0.39,
"reward": 0.5666791200637817,
"reward_std": 0.18180032074451447,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.41769587993621826,
"rewards/tag_count_reward": 0.984375,
"step": 51
},
{
"completion_length": 1529.5,
"epoch": 0.0034666666666666665,
"grad_norm": 159.65383911132812,
"kl": 8.625,
"learning_rate": 8.499999999999999e-07,
"loss": 0.3457,
"reward": 0.5927076935768127,
"reward_std": 0.16607308387756348,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.40729236602783203,
"rewards/tag_count_reward": 1.0,
"step": 52
},
{
"completion_length": 592.5,
"epoch": 0.003533333333333333,
"grad_norm": 218.92417907714844,
"kl": 6.4375,
"learning_rate": 8.666666666666667e-07,
"loss": 0.2579,
"reward": 0.9154659509658813,
"reward_std": 0.4596608579158783,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.24078400433063507,
"rewards/tag_count_reward": 0.90625,
"step": 53
},
{
"completion_length": 1021.0,
"epoch": 0.0036,
"grad_norm": 23.559619903564453,
"kl": 4.25,
"learning_rate": 8.833333333333333e-07,
"loss": 0.1696,
"reward": 0.5911154747009277,
"reward_std": 0.43817973136901855,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.26825952529907227,
"rewards/tag_count_reward": 0.796875,
"step": 54
},
{
"completion_length": 2048.0,
"epoch": 0.0036666666666666666,
"grad_norm": 55.155784606933594,
"kl": 3.28125,
"learning_rate": 9e-07,
"loss": 0.1314,
"reward": 0.4308491051197052,
"reward_std": 0.33698350191116333,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3347759246826172,
"rewards/tag_count_reward": 0.765625,
"step": 55
},
{
"completion_length": 1790.5,
"epoch": 0.0037333333333333333,
"grad_norm": 58.42893981933594,
"kl": 2.828125,
"learning_rate": 9.166666666666665e-07,
"loss": 0.1131,
"reward": 0.31651684641838074,
"reward_std": 0.27603378891944885,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27723315358161926,
"rewards/tag_count_reward": 0.59375,
"step": 56
},
{
"completion_length": 1152.5,
"epoch": 0.0038,
"grad_norm": 25.080169677734375,
"kl": 3.078125,
"learning_rate": 9.333333333333333e-07,
"loss": 0.1231,
"reward": 0.5117756128311157,
"reward_std": 0.28854095935821533,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3163493871688843,
"rewards/tag_count_reward": 0.828125,
"step": 57
},
{
"completion_length": 1621.0,
"epoch": 0.0038666666666666667,
"grad_norm": 48.9347038269043,
"kl": 2.453125,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0986,
"reward": 0.6456644535064697,
"reward_std": 0.28838276863098145,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22933553159236908,
"rewards/tag_count_reward": 0.875,
"step": 58
},
{
"completion_length": 1352.5,
"epoch": 0.003933333333333333,
"grad_norm": 64.43124389648438,
"kl": 1.984375,
"learning_rate": 9.666666666666666e-07,
"loss": 0.0796,
"reward": 0.7111755609512329,
"reward_std": 0.18634262681007385,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2731994390487671,
"rewards/tag_count_reward": 0.984375,
"step": 59
},
{
"completion_length": 579.0,
"epoch": 0.004,
"grad_norm": 68.34193420410156,
"kl": 1.609375,
"learning_rate": 9.833333333333332e-07,
"loss": 0.0641,
"reward": 0.7022674679756165,
"reward_std": 0.25780147314071655,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26648253202438354,
"rewards/tag_count_reward": 0.96875,
"step": 60
},
{
"completion_length": 1151.5,
"epoch": 0.004066666666666666,
"grad_norm": 32.08150100708008,
"kl": 1.3515625,
"learning_rate": 1e-06,
"loss": 0.054,
"reward": 0.873136043548584,
"reward_std": 0.4330424666404724,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.0956139862537384,
"rewards/tag_count_reward": 0.90625,
"step": 61
},
{
"completion_length": 1163.0,
"epoch": 0.0041333333333333335,
"grad_norm": 28.13393783569336,
"kl": 1.953125,
"learning_rate": 1e-06,
"loss": 0.0779,
"reward": 0.6770293712615967,
"reward_std": 0.2541940212249756,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.11984563618898392,
"rewards/tag_count_reward": 0.796875,
"step": 62
},
{
"completion_length": 682.0,
"epoch": 0.0042,
"grad_norm": 787.3125,
"kl": 5.1875,
"learning_rate": 1e-06,
"loss": 0.2088,
"reward": 0.5052896738052368,
"reward_std": 0.24993839859962463,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24471038579940796,
"rewards/tag_count_reward": 0.75,
"step": 63
},
{
"completion_length": 411.5,
"epoch": 0.004266666666666667,
"grad_norm": 50.115291595458984,
"kl": 1.453125,
"learning_rate": 1e-06,
"loss": 0.058,
"reward": 0.6861989498138428,
"reward_std": 0.24390053749084473,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20442602038383484,
"rewards/tag_count_reward": 0.890625,
"step": 64
},
{
"completion_length": 1717.5,
"epoch": 0.004333333333333333,
"grad_norm": 104.73741912841797,
"kl": 4.875,
"learning_rate": 1e-06,
"loss": 0.1942,
"reward": 0.612551212310791,
"reward_std": 0.21034899353981018,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1686987429857254,
"rewards/tag_count_reward": 0.78125,
"step": 65
},
{
"completion_length": 563.5,
"epoch": 0.0044,
"grad_norm": 616.6585083007812,
"kl": 12.1875,
"learning_rate": 1e-06,
"loss": 0.4881,
"reward": 0.9393866658210754,
"reward_std": 0.34601008892059326,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.24811333417892456,
"rewards/tag_count_reward": 1.0,
"step": 66
},
{
"completion_length": 2048.0,
"epoch": 0.0044666666666666665,
"grad_norm": 969.1438598632812,
"kl": 25.75,
"learning_rate": 1e-06,
"loss": 1.0353,
"reward": 0.590280294418335,
"reward_std": 0.1938992142677307,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.37846967577934265,
"rewards/tag_count_reward": 0.96875,
"step": 67
},
{
"completion_length": 1266.5,
"epoch": 0.004533333333333334,
"grad_norm": 99.06281280517578,
"kl": 4.3125,
"learning_rate": 1e-06,
"loss": 0.173,
"reward": 0.8209527134895325,
"reward_std": 0.4626120328903198,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.21029730141162872,
"rewards/tag_count_reward": 0.78125,
"step": 68
},
{
"completion_length": 1431.5,
"epoch": 0.0046,
"grad_norm": 16.082868576049805,
"kl": 0.8359375,
"learning_rate": 1e-06,
"loss": 0.0334,
"reward": 0.4943634867668152,
"reward_std": 0.3753243386745453,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13063651323318481,
"rewards/tag_count_reward": 0.625,
"step": 69
},
{
"completion_length": 511.5,
"epoch": 0.004666666666666667,
"grad_norm": 27.706239700317383,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.0861,
"reward": 0.5137104988098145,
"reward_std": 0.3204461336135864,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26753950119018555,
"rewards/tag_count_reward": 0.78125,
"step": 70
},
{
"completion_length": 1672.5,
"epoch": 0.004733333333333333,
"grad_norm": 28.113643646240234,
"kl": 2.71875,
"learning_rate": 1e-06,
"loss": 0.1085,
"reward": 0.49439576268196106,
"reward_std": 0.32019099593162537,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27122926712036133,
"rewards/tag_count_reward": 0.765625,
"step": 71
},
{
"completion_length": 1199.5,
"epoch": 0.0048,
"grad_norm": 32.67612075805664,
"kl": 3.625,
"learning_rate": 1e-06,
"loss": 0.1455,
"reward": 0.5977433919906616,
"reward_std": 0.23971763253211975,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2616316080093384,
"rewards/tag_count_reward": 0.859375,
"step": 72
},
{
"completion_length": 474.0,
"epoch": 0.004866666666666667,
"grad_norm": 123.9244384765625,
"kl": 4.0,
"learning_rate": 1e-06,
"loss": 0.1596,
"reward": 0.9720438122749329,
"reward_std": 0.5066829323768616,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.21545612812042236,
"rewards/tag_count_reward": 1.0,
"step": 73
},
{
"completion_length": 636.5,
"epoch": 0.004933333333333333,
"grad_norm": 544.687744140625,
"kl": 4.125,
"learning_rate": 1e-06,
"loss": 0.1657,
"reward": 0.5301218628883362,
"reward_std": 0.37352970242500305,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.188628152012825,
"rewards/tag_count_reward": 0.71875,
"step": 74
},
{
"completion_length": 1155.5,
"epoch": 0.005,
"grad_norm": 61.665035247802734,
"kl": 0.69921875,
"learning_rate": 1e-06,
"loss": 0.028,
"reward": 0.6461014747619629,
"reward_std": 0.4455444812774658,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1976485401391983,
"rewards/tag_count_reward": 0.78125,
"step": 75
},
{
"completion_length": 1879.0,
"epoch": 0.005066666666666666,
"grad_norm": 42.811279296875,
"kl": 1.65625,
"learning_rate": 1e-06,
"loss": 0.0661,
"reward": 0.4902087152004242,
"reward_std": 0.25922858715057373,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3535413146018982,
"rewards/tag_count_reward": 0.84375,
"step": 76
},
{
"completion_length": 1231.5,
"epoch": 0.0051333333333333335,
"grad_norm": 119.74993133544922,
"kl": 0.7421875,
"learning_rate": 1e-06,
"loss": 0.0298,
"reward": 0.7240487337112427,
"reward_std": 0.16741560399532318,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27595123648643494,
"rewards/tag_count_reward": 1.0,
"step": 77
},
{
"completion_length": 568.5,
"epoch": 0.0052,
"grad_norm": 78.45287322998047,
"kl": 3.125,
"learning_rate": 1e-06,
"loss": 0.1253,
"reward": 0.7721863985061646,
"reward_std": 0.33460527658462524,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22781357169151306,
"rewards/tag_count_reward": 0.9375,
"step": 78
},
{
"completion_length": 1466.5,
"epoch": 0.005266666666666667,
"grad_norm": 51.971961975097656,
"kl": 1.265625,
"learning_rate": 1e-06,
"loss": 0.0507,
"reward": 0.7405422329902649,
"reward_std": 0.33687448501586914,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1969577968120575,
"rewards/tag_count_reward": 0.875,
"step": 79
},
{
"completion_length": 1090.0,
"epoch": 0.005333333333333333,
"grad_norm": 67.31307220458984,
"kl": 1.234375,
"learning_rate": 1e-06,
"loss": 0.0491,
"reward": 0.6939834356307983,
"reward_std": 0.4383859634399414,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22789162397384644,
"rewards/tag_count_reward": 0.859375,
"step": 80
},
{
"completion_length": 543.5,
"epoch": 0.0054,
"grad_norm": 698.207763671875,
"kl": 18.625,
"learning_rate": 1e-06,
"loss": 0.744,
"reward": 0.6980682611465454,
"reward_std": 0.1692238748073578,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.301931768655777,
"rewards/tag_count_reward": 1.0,
"step": 81
},
{
"completion_length": 492.5,
"epoch": 0.0054666666666666665,
"grad_norm": 112.44371032714844,
"kl": 4.34375,
"learning_rate": 1e-06,
"loss": 0.1729,
"reward": 0.6759111881256104,
"reward_std": 0.28266435861587524,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24596385657787323,
"rewards/tag_count_reward": 0.921875,
"step": 82
},
{
"completion_length": 1402.0,
"epoch": 0.005533333333333334,
"grad_norm": 20.140390396118164,
"kl": 4.1875,
"learning_rate": 1e-06,
"loss": 0.1672,
"reward": 0.6321854591369629,
"reward_std": 0.25300371646881104,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2428145408630371,
"rewards/tag_count_reward": 0.875,
"step": 83
},
{
"completion_length": 1182.5,
"epoch": 0.0056,
"grad_norm": 29.856775283813477,
"kl": 2.8125,
"learning_rate": 1e-06,
"loss": 0.1122,
"reward": 0.6982576847076416,
"reward_std": 0.23495344817638397,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2236173450946808,
"rewards/tag_count_reward": 0.921875,
"step": 84
},
{
"completion_length": 383.5,
"epoch": 0.005666666666666667,
"grad_norm": 152.1378936767578,
"kl": 4.1875,
"learning_rate": 1e-06,
"loss": 0.1665,
"reward": 1.047166347503662,
"reward_std": 0.45125406980514526,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.21845868229866028,
"rewards/tag_count_reward": 0.890625,
"step": 85
},
{
"completion_length": 1728.5,
"epoch": 0.005733333333333333,
"grad_norm": 404.3030090332031,
"kl": 8.6875,
"learning_rate": 1e-06,
"loss": 0.3462,
"reward": 0.22871780395507812,
"reward_std": 0.3173619210720062,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2712821960449219,
"rewards/tag_count_reward": 0.5,
"step": 86
},
{
"completion_length": 659.0,
"epoch": 0.0058,
"grad_norm": 61.45455551147461,
"kl": 2.828125,
"learning_rate": 1e-06,
"loss": 0.113,
"reward": 0.6972514390945435,
"reward_std": 0.2451970875263214,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19337353110313416,
"rewards/tag_count_reward": 0.890625,
"step": 87
},
{
"completion_length": 597.5,
"epoch": 0.005866666666666667,
"grad_norm": 629.5557861328125,
"kl": 16.125,
"learning_rate": 1e-06,
"loss": 0.645,
"reward": 0.6588761806488037,
"reward_std": 0.17969873547554016,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3411238193511963,
"rewards/tag_count_reward": 1.0,
"step": 88
},
{
"completion_length": 1257.5,
"epoch": 0.005933333333333333,
"grad_norm": 41.06660842895508,
"kl": 1.4921875,
"learning_rate": 1e-06,
"loss": 0.0596,
"reward": 0.6366362571716309,
"reward_std": 0.27736175060272217,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19148872792720795,
"rewards/tag_count_reward": 0.828125,
"step": 89
},
{
"completion_length": 468.5,
"epoch": 0.006,
"grad_norm": 59.929656982421875,
"kl": 0.248046875,
"learning_rate": 1e-06,
"loss": 0.0099,
"reward": 0.7060214281082153,
"reward_std": 0.2982117533683777,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.15335358679294586,
"rewards/tag_count_reward": 0.859375,
"step": 90
},
{
"completion_length": 2048.0,
"epoch": 0.006066666666666666,
"grad_norm": 268.2857971191406,
"kl": 9.0,
"learning_rate": 1e-06,
"loss": 0.3589,
"reward": 0.4473886489868164,
"reward_std": 0.21605214476585388,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3026113510131836,
"rewards/tag_count_reward": 0.75,
"step": 91
},
{
"completion_length": 733.5,
"epoch": 0.0061333333333333335,
"grad_norm": 227.89134216308594,
"kl": 9.0625,
"learning_rate": 1e-06,
"loss": 0.3635,
"reward": 0.7792458534240723,
"reward_std": 0.3857659697532654,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.26762914657592773,
"rewards/tag_count_reward": 0.984375,
"step": 92
},
{
"completion_length": 2003.5,
"epoch": 0.0062,
"grad_norm": 34.40571975708008,
"kl": 1.390625,
"learning_rate": 1e-06,
"loss": 0.0559,
"reward": 0.6673760414123535,
"reward_std": 0.2075645476579666,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1919989287853241,
"rewards/tag_count_reward": 0.859375,
"step": 93
},
{
"completion_length": 296.0,
"epoch": 0.006266666666666667,
"grad_norm": 87.31964111328125,
"kl": 3.25,
"learning_rate": 1e-06,
"loss": 0.13,
"reward": 0.8700410723686218,
"reward_std": 0.17551617324352264,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.12995892763137817,
"rewards/tag_count_reward": 1.0,
"step": 94
},
{
"completion_length": 290.5,
"epoch": 0.006333333333333333,
"grad_norm": 105.713134765625,
"kl": 0.8046875,
"learning_rate": 1e-06,
"loss": 0.0322,
"reward": 0.8407634496688843,
"reward_std": 0.1777280569076538,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14361155033111572,
"rewards/tag_count_reward": 0.984375,
"step": 95
},
{
"completion_length": 2048.0,
"epoch": 0.0064,
"grad_norm": 84.93144226074219,
"kl": 1.453125,
"learning_rate": 1e-06,
"loss": 0.0582,
"reward": 0.6186965703964233,
"reward_std": 0.3762863278388977,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.35005342960357666,
"rewards/tag_count_reward": 0.71875,
"step": 96
},
{
"completion_length": 1466.0,
"epoch": 0.006466666666666667,
"grad_norm": 83.41734313964844,
"kl": 0.400390625,
"learning_rate": 1e-06,
"loss": 0.016,
"reward": 1.0745121240615845,
"reward_std": 0.5339106321334839,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.3317378759384155,
"rewards/tag_count_reward": 0.96875,
"step": 97
},
{
"completion_length": 1355.0,
"epoch": 0.006533333333333334,
"grad_norm": 79.6697006225586,
"kl": 0.6875,
"learning_rate": 1e-06,
"loss": 0.0276,
"reward": 0.5732134580612183,
"reward_std": 0.2440004199743271,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.30178651213645935,
"rewards/tag_count_reward": 0.875,
"step": 98
},
{
"completion_length": 595.0,
"epoch": 0.0066,
"grad_norm": 67.67591857910156,
"kl": 0.318359375,
"learning_rate": 1e-06,
"loss": 0.0128,
"reward": 0.8623976111412048,
"reward_std": 0.22806811332702637,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.12197738885879517,
"rewards/tag_count_reward": 0.984375,
"step": 99
},
{
"completion_length": 474.5,
"epoch": 0.006666666666666667,
"grad_norm": 40.00164794921875,
"kl": 0.7265625,
"learning_rate": 1e-06,
"loss": 0.029,
"reward": 0.7015387415885925,
"reward_std": 0.24650061130523682,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.09533626586198807,
"rewards/tag_count_reward": 0.796875,
"step": 100
},
{
"completion_length": 1289.5,
"epoch": 0.006733333333333333,
"grad_norm": 488.3539733886719,
"kl": 13.5625,
"learning_rate": 1e-06,
"loss": 0.5411,
"reward": 0.6548053622245789,
"reward_std": 0.20378378033638,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.34519466757774353,
"rewards/tag_count_reward": 1.0,
"step": 101
},
{
"completion_length": 1228.5,
"epoch": 0.0068,
"grad_norm": 264.9231262207031,
"kl": 9.25,
"learning_rate": 1e-06,
"loss": 0.3681,
"reward": 0.6209266185760498,
"reward_std": 0.19560487568378448,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3009483814239502,
"rewards/tag_count_reward": 0.921875,
"step": 102
},
{
"completion_length": 769.0,
"epoch": 0.006866666666666667,
"grad_norm": 88.28279113769531,
"kl": 4.3125,
"learning_rate": 1e-06,
"loss": 0.1713,
"reward": 0.8779873847961426,
"reward_std": 0.4074295163154602,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.215762659907341,
"rewards/tag_count_reward": 0.96875,
"step": 103
},
{
"completion_length": 281.5,
"epoch": 0.006933333333333333,
"grad_norm": 144.38900756835938,
"kl": 5.34375,
"learning_rate": 1e-06,
"loss": 0.2136,
"reward": 0.7748416662216187,
"reward_std": 0.18396000564098358,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22515833377838135,
"rewards/tag_count_reward": 1.0,
"step": 104
},
{
"completion_length": 637.5,
"epoch": 0.007,
"grad_norm": 65.10338592529297,
"kl": 3.125,
"learning_rate": 1e-06,
"loss": 0.1248,
"reward": 0.8497180938720703,
"reward_std": 0.4444595277309418,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.2752818763256073,
"rewards/tag_count_reward": 1.0,
"step": 105
},
{
"completion_length": 423.0,
"epoch": 0.007066666666666666,
"grad_norm": 42.835147857666016,
"kl": 1.5390625,
"learning_rate": 1e-06,
"loss": 0.0619,
"reward": 0.7825101613998413,
"reward_std": 0.30330055952072144,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2799898684024811,
"rewards/tag_count_reward": 1.0,
"step": 106
},
{
"completion_length": 2048.0,
"epoch": 0.0071333333333333335,
"grad_norm": 72.4932861328125,
"kl": 0.5,
"learning_rate": 1e-06,
"loss": 0.02,
"reward": 0.3152003288269043,
"reward_std": 0.36495572328567505,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2629246711730957,
"rewards/tag_count_reward": 0.578125,
"step": 107
},
{
"completion_length": 1259.0,
"epoch": 0.0072,
"grad_norm": 88.30027770996094,
"kl": 0.19921875,
"learning_rate": 1e-06,
"loss": 0.008,
"reward": 0.6244743466377258,
"reward_std": 0.27865296602249146,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21927565336227417,
"rewards/tag_count_reward": 0.84375,
"step": 108
},
{
"completion_length": 1879.0,
"epoch": 0.007266666666666667,
"grad_norm": 68.95305633544922,
"kl": 1.7109375,
"learning_rate": 1e-06,
"loss": 0.0685,
"reward": 0.44533634185791016,
"reward_std": 0.19798362255096436,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.38278865814208984,
"rewards/tag_count_reward": 0.828125,
"step": 109
},
{
"completion_length": 545.0,
"epoch": 0.007333333333333333,
"grad_norm": 26.842378616333008,
"kl": 0.7421875,
"learning_rate": 1e-06,
"loss": 0.0297,
"reward": 0.8213717341423035,
"reward_std": 0.46459102630615234,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.17862829566001892,
"rewards/tag_count_reward": 0.875,
"step": 110
},
{
"completion_length": 1188.0,
"epoch": 0.0074,
"grad_norm": 34.98936080932617,
"kl": 1.8828125,
"learning_rate": 1e-06,
"loss": 0.0754,
"reward": 0.8087401390075684,
"reward_std": 0.2381439059972763,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.25375983119010925,
"rewards/tag_count_reward": 1.0,
"step": 111
},
{
"completion_length": 2048.0,
"epoch": 0.007466666666666667,
"grad_norm": 66.2349853515625,
"kl": 1.015625,
"learning_rate": 1e-06,
"loss": 0.0406,
"reward": 0.5706478357315063,
"reward_std": 0.312145859003067,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.3356021046638489,
"rewards/tag_count_reward": 0.78125,
"step": 112
},
{
"completion_length": 1583.5,
"epoch": 0.007533333333333334,
"grad_norm": 68.85169219970703,
"kl": 3.171875,
"learning_rate": 1e-06,
"loss": 0.1268,
"reward": 0.7540740966796875,
"reward_std": 0.39475083351135254,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.3553008437156677,
"rewards/tag_count_reward": 0.984375,
"step": 113
},
{
"completion_length": 1604.5,
"epoch": 0.0076,
"grad_norm": 131.9192352294922,
"kl": 1.21875,
"learning_rate": 1e-06,
"loss": 0.0485,
"reward": 0.662670910358429,
"reward_std": 0.18053007125854492,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.30607908964157104,
"rewards/tag_count_reward": 0.96875,
"step": 114
},
{
"completion_length": 320.0,
"epoch": 0.007666666666666666,
"grad_norm": 85.9920654296875,
"kl": 1.03125,
"learning_rate": 1e-06,
"loss": 0.041,
"reward": 0.8732629418373108,
"reward_std": 0.18879486620426178,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1267370581626892,
"rewards/tag_count_reward": 1.0,
"step": 115
},
{
"completion_length": 1665.5,
"epoch": 0.007733333333333333,
"grad_norm": 256.2652282714844,
"kl": 0.39453125,
"learning_rate": 1e-06,
"loss": 0.0158,
"reward": 0.44586634635925293,
"reward_std": 0.32189178466796875,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.36663368344306946,
"rewards/tag_count_reward": 0.8125,
"step": 116
},
{
"completion_length": 947.0,
"epoch": 0.0078,
"grad_norm": 154.89486694335938,
"kl": 1.25,
"learning_rate": 1e-06,
"loss": 0.05,
"reward": 0.8331127166748047,
"reward_std": 0.28338319063186646,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2137623131275177,
"rewards/tag_count_reward": 0.984375,
"step": 117
},
{
"completion_length": 1757.5,
"epoch": 0.007866666666666666,
"grad_norm": 162.80978393554688,
"kl": 0.10302734375,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 0.864159345626831,
"reward_std": 0.37670308351516724,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.16709071397781372,
"rewards/tag_count_reward": 0.90625,
"step": 118
},
{
"completion_length": 1124.0,
"epoch": 0.007933333333333334,
"grad_norm": 573.138671875,
"kl": 10.875,
"learning_rate": 1e-06,
"loss": 0.4362,
"reward": 0.7860045433044434,
"reward_std": 0.382926344871521,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.29212039709091187,
"rewards/tag_count_reward": 0.953125,
"step": 119
},
{
"completion_length": 1036.5,
"epoch": 0.008,
"grad_norm": 3948.708251953125,
"kl": 71.0,
"learning_rate": 1e-06,
"loss": 2.868,
"reward": 0.6766761541366577,
"reward_std": 0.38920336961746216,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2764488458633423,
"rewards/tag_count_reward": 0.890625,
"step": 120
},
{
"completion_length": 1811.5,
"epoch": 0.008066666666666666,
"grad_norm": 90.52825927734375,
"kl": 4.25,
"learning_rate": 1e-06,
"loss": 0.1697,
"reward": 0.6224031448364258,
"reward_std": 0.32361501455307007,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19009681046009064,
"rewards/tag_count_reward": 0.8125,
"step": 121
},
{
"completion_length": 1185.5,
"epoch": 0.008133333333333333,
"grad_norm": 281.738525390625,
"kl": 6.9375,
"learning_rate": 1e-06,
"loss": 0.2773,
"reward": 0.6586467027664185,
"reward_std": 0.30995893478393555,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23197825253009796,
"rewards/tag_count_reward": 0.890625,
"step": 122
},
{
"completion_length": 710.5,
"epoch": 0.0082,
"grad_norm": 1714.2760009765625,
"kl": 29.25,
"learning_rate": 1e-06,
"loss": 1.1726,
"reward": 0.8991853594779968,
"reward_std": 0.3646455705165863,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.33518970012664795,
"rewards/tag_count_reward": 0.921875,
"step": 123
},
{
"completion_length": 1366.5,
"epoch": 0.008266666666666667,
"grad_norm": 467.15386962890625,
"kl": 12.4375,
"learning_rate": 1e-06,
"loss": 0.4967,
"reward": 0.8071024417877197,
"reward_std": 0.5944163799285889,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.2710225582122803,
"rewards/tag_count_reward": 0.890625,
"step": 124
},
{
"completion_length": 1361.5,
"epoch": 0.008333333333333333,
"grad_norm": 126.81051635742188,
"kl": 3.625,
"learning_rate": 1e-06,
"loss": 0.1452,
"reward": 0.2870804965496063,
"reward_std": 0.312913179397583,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2597945034503937,
"rewards/tag_count_reward": 0.546875,
"step": 125
},
{
"completion_length": 887.5,
"epoch": 0.0084,
"grad_norm": 122.0322494506836,
"kl": 3.84375,
"learning_rate": 1e-06,
"loss": 0.1538,
"reward": 0.7047094702720642,
"reward_std": 0.18850934505462646,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2640404999256134,
"rewards/tag_count_reward": 0.96875,
"step": 126
},
{
"completion_length": 551.0,
"epoch": 0.008466666666666667,
"grad_norm": 97.62600708007812,
"kl": 0.2099609375,
"learning_rate": 1e-06,
"loss": 0.0084,
"reward": 0.8664301633834839,
"reward_std": 0.37567955255508423,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.2585698366165161,
"rewards/tag_count_reward": 1.0,
"step": 127
},
{
"completion_length": 2048.0,
"epoch": 0.008533333333333334,
"grad_norm": 124.26470184326172,
"kl": 0.54296875,
"learning_rate": 1e-06,
"loss": 0.0218,
"reward": 0.1481974571943283,
"reward_std": 0.28645384311676025,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2893025577068329,
"rewards/tag_count_reward": 0.4375,
"step": 128
},
{
"completion_length": 566.5,
"epoch": 0.0086,
"grad_norm": 145.44088745117188,
"kl": 1.140625,
"learning_rate": 1e-06,
"loss": 0.0455,
"reward": 0.684267520904541,
"reward_std": 0.1783992052078247,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3001074194908142,
"rewards/tag_count_reward": 0.984375,
"step": 129
},
{
"completion_length": 287.0,
"epoch": 0.008666666666666666,
"grad_norm": 79.27010345458984,
"kl": 1.734375,
"learning_rate": 1e-06,
"loss": 0.0693,
"reward": 0.9607985019683838,
"reward_std": 0.5766100287437439,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.2110764980316162,
"rewards/tag_count_reward": 0.984375,
"step": 130
},
{
"completion_length": 1237.5,
"epoch": 0.008733333333333333,
"grad_norm": 166.70591735839844,
"kl": 1.265625,
"learning_rate": 1e-06,
"loss": 0.0507,
"reward": 0.5976251363754272,
"reward_std": 0.2768978476524353,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26174983382225037,
"rewards/tag_count_reward": 0.859375,
"step": 131
},
{
"completion_length": 1230.5,
"epoch": 0.0088,
"grad_norm": 86.69196319580078,
"kl": 0.84375,
"learning_rate": 1e-06,
"loss": 0.0338,
"reward": 0.9894083142280579,
"reward_std": 0.44239863753318787,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.18246668577194214,
"rewards/tag_count_reward": 0.921875,
"step": 132
},
{
"completion_length": 1223.5,
"epoch": 0.008866666666666667,
"grad_norm": 577.7119140625,
"kl": 11.5,
"learning_rate": 1e-06,
"loss": 0.4612,
"reward": 0.9148614406585693,
"reward_std": 0.3666042983531952,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.25701355934143066,
"rewards/tag_count_reward": 0.984375,
"step": 133
},
{
"completion_length": 1583.5,
"epoch": 0.008933333333333333,
"grad_norm": 1036.103271484375,
"kl": 27.375,
"learning_rate": 1e-06,
"loss": 1.0967,
"reward": 0.6422809958457947,
"reward_std": 0.30539625883102417,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3108440041542053,
"rewards/tag_count_reward": 0.890625,
"step": 134
},
{
"completion_length": 743.5,
"epoch": 0.009,
"grad_norm": 2467.851806640625,
"kl": 19.25,
"learning_rate": 1e-06,
"loss": 0.7744,
"reward": 0.5940513610839844,
"reward_std": 0.26661908626556396,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.10907360911369324,
"rewards/tag_count_reward": 0.703125,
"step": 135
},
{
"completion_length": 744.5,
"epoch": 0.009066666666666667,
"grad_norm": 506.23193359375,
"kl": 14.375,
"learning_rate": 1e-06,
"loss": 0.5751,
"reward": 0.7252035140991211,
"reward_std": 0.6573628187179565,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.21229654550552368,
"rewards/tag_count_reward": 0.75,
"step": 136
},
{
"completion_length": 1501.5,
"epoch": 0.009133333333333334,
"grad_norm": 425.3215637207031,
"kl": 13.125,
"learning_rate": 1e-06,
"loss": 0.5264,
"reward": 0.6281967759132385,
"reward_std": 0.33101916313171387,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.26242825388908386,
"rewards/tag_count_reward": 0.828125,
"step": 137
},
{
"completion_length": 1585.5,
"epoch": 0.0092,
"grad_norm": 46.23551940917969,
"kl": 2.015625,
"learning_rate": 1e-06,
"loss": 0.0807,
"reward": 0.3377422094345093,
"reward_std": 0.341916561126709,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24038279056549072,
"rewards/tag_count_reward": 0.578125,
"step": 138
},
{
"completion_length": 852.0,
"epoch": 0.009266666666666666,
"grad_norm": 20.030078887939453,
"kl": 2.265625,
"learning_rate": 1e-06,
"loss": 0.0906,
"reward": 0.48952001333236694,
"reward_std": 0.4157789945602417,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26047998666763306,
"rewards/tag_count_reward": 0.75,
"step": 139
},
{
"completion_length": 1278.0,
"epoch": 0.009333333333333334,
"grad_norm": 274.914306640625,
"kl": 7.8125,
"learning_rate": 1e-06,
"loss": 0.3133,
"reward": 0.8666358590126038,
"reward_std": 0.4360937476158142,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.36773911118507385,
"rewards/tag_count_reward": 0.984375,
"step": 140
},
{
"completion_length": 2048.0,
"epoch": 0.0094,
"grad_norm": 79.84725952148438,
"kl": 5.78125,
"learning_rate": 1e-06,
"loss": 0.232,
"reward": 0.4236172139644623,
"reward_std": 0.27087658643722534,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3576327860355377,
"rewards/tag_count_reward": 0.78125,
"step": 141
},
{
"completion_length": 1249.5,
"epoch": 0.009466666666666667,
"grad_norm": 65.60501098632812,
"kl": 1.7421875,
"learning_rate": 1e-06,
"loss": 0.0697,
"reward": 0.6048706769943237,
"reward_std": 0.22861841320991516,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2545042932033539,
"rewards/tag_count_reward": 0.859375,
"step": 142
},
{
"completion_length": 1514.5,
"epoch": 0.009533333333333333,
"grad_norm": 34.173702239990234,
"kl": 1.140625,
"learning_rate": 1e-06,
"loss": 0.0458,
"reward": 0.520450234413147,
"reward_std": 0.30032819509506226,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.354549765586853,
"rewards/tag_count_reward": 0.8125,
"step": 143
},
{
"completion_length": 2048.0,
"epoch": 0.0096,
"grad_norm": 67.23226165771484,
"kl": 4.0625,
"learning_rate": 1e-06,
"loss": 0.1631,
"reward": 0.1353166550397873,
"reward_std": 0.3215920329093933,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3490583300590515,
"rewards/tag_count_reward": 0.484375,
"step": 144
},
{
"completion_length": 687.0,
"epoch": 0.009666666666666667,
"grad_norm": 81.97444915771484,
"kl": 0.27734375,
"learning_rate": 1e-06,
"loss": 0.011,
"reward": 0.7245032787322998,
"reward_std": 0.20581424236297607,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2754966914653778,
"rewards/tag_count_reward": 1.0,
"step": 145
},
{
"completion_length": 430.0,
"epoch": 0.009733333333333333,
"grad_norm": 34.066314697265625,
"kl": 0.0751953125,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 0.9231314063072205,
"reward_std": 0.2355959713459015,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.12374356389045715,
"rewards/tag_count_reward": 0.984375,
"step": 146
},
{
"completion_length": 760.5,
"epoch": 0.0098,
"grad_norm": 80.5255355834961,
"kl": 0.201171875,
"learning_rate": 1e-06,
"loss": 0.0081,
"reward": 0.8242017030715942,
"reward_std": 0.2619200050830841,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.20704832673072815,
"rewards/tag_count_reward": 0.96875,
"step": 147
},
{
"completion_length": 1339.5,
"epoch": 0.009866666666666666,
"grad_norm": 75.45352172851562,
"kl": 4.03125,
"learning_rate": 1e-06,
"loss": 0.1615,
"reward": 0.37841227650642395,
"reward_std": 0.3038763403892517,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27783775329589844,
"rewards/tag_count_reward": 0.65625,
"step": 148
},
{
"completion_length": 617.5,
"epoch": 0.009933333333333334,
"grad_norm": 37.40899658203125,
"kl": 1.0703125,
"learning_rate": 1e-06,
"loss": 0.0427,
"reward": 0.9852249622344971,
"reward_std": 0.4898127317428589,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.10852503776550293,
"rewards/tag_count_reward": 0.84375,
"step": 149
},
{
"completion_length": 1091.5,
"epoch": 0.01,
"grad_norm": 118.6474838256836,
"kl": 2.734375,
"learning_rate": 1e-06,
"loss": 0.1095,
"reward": 0.9904427528381348,
"reward_std": 0.5234081745147705,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.29080721735954285,
"rewards/tag_count_reward": 0.96875,
"step": 150
},
{
"completion_length": 541.5,
"epoch": 0.010066666666666666,
"grad_norm": 438.11431884765625,
"kl": 10.3125,
"learning_rate": 1e-06,
"loss": 0.4122,
"reward": 0.5217373967170715,
"reward_std": 0.2800005078315735,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22826258838176727,
"rewards/tag_count_reward": 0.75,
"step": 151
},
{
"completion_length": 825.0,
"epoch": 0.010133333333333333,
"grad_norm": 1261.1268310546875,
"kl": 12.75,
"learning_rate": 1e-06,
"loss": 0.5119,
"reward": 0.46687906980514526,
"reward_std": 0.4136854410171509,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.12687088549137115,
"rewards/tag_count_reward": 0.53125,
"step": 152
},
{
"completion_length": 993.5,
"epoch": 0.0102,
"grad_norm": 538.96337890625,
"kl": 11.625,
"learning_rate": 1e-06,
"loss": 0.4671,
"reward": 0.27331826090812683,
"reward_std": 0.4327068328857422,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22668173909187317,
"rewards/tag_count_reward": 0.5,
"step": 153
},
{
"completion_length": 282.5,
"epoch": 0.010266666666666667,
"grad_norm": 27.94957160949707,
"kl": 1.75,
"learning_rate": 1e-06,
"loss": 0.0702,
"reward": 0.8316723108291626,
"reward_std": 0.20498070120811462,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1214526817202568,
"rewards/tag_count_reward": 0.953125,
"step": 154
},
{
"completion_length": 543.5,
"epoch": 0.010333333333333333,
"grad_norm": 67.51468658447266,
"kl": 1.59375,
"learning_rate": 1e-06,
"loss": 0.0636,
"reward": 1.0527127981185913,
"reward_std": 0.5268169641494751,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.2597872018814087,
"rewards/tag_count_reward": 1.0,
"step": 155
},
{
"completion_length": 1237.5,
"epoch": 0.0104,
"grad_norm": 152.0338592529297,
"kl": 1.21875,
"learning_rate": 1e-06,
"loss": 0.0488,
"reward": 0.5705561637878418,
"reward_std": 0.32453328371047974,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2263188511133194,
"rewards/tag_count_reward": 0.796875,
"step": 156
},
{
"completion_length": 652.0,
"epoch": 0.010466666666666668,
"grad_norm": 86.2015151977539,
"kl": 1.671875,
"learning_rate": 1e-06,
"loss": 0.0668,
"reward": 0.7413462400436401,
"reward_std": 0.536789059638977,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.16490373015403748,
"rewards/tag_count_reward": 0.78125,
"step": 157
},
{
"completion_length": 890.0,
"epoch": 0.010533333333333334,
"grad_norm": 10600.8759765625,
"kl": 76.0,
"learning_rate": 1e-06,
"loss": 3.049,
"reward": 0.7228549718856812,
"reward_std": 0.5175967812538147,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.18339505791664124,
"rewards/tag_count_reward": 0.78125,
"step": 158
},
{
"completion_length": 1430.5,
"epoch": 0.0106,
"grad_norm": 51.20515060424805,
"kl": 2.5625,
"learning_rate": 1e-06,
"loss": 0.1022,
"reward": 0.5848544836044312,
"reward_std": 0.34735676646232605,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.24327048659324646,
"rewards/tag_count_reward": 0.765625,
"step": 159
},
{
"completion_length": 699.0,
"epoch": 0.010666666666666666,
"grad_norm": 124.27230834960938,
"kl": 0.7578125,
"learning_rate": 1e-06,
"loss": 0.0303,
"reward": 1.1091445684432983,
"reward_std": 0.4381716549396515,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.14085541665554047,
"rewards/tag_count_reward": 0.875,
"step": 160
},
{
"completion_length": 472.0,
"epoch": 0.010733333333333333,
"grad_norm": 130.2908935546875,
"kl": 6.09375,
"learning_rate": 1e-06,
"loss": 0.2449,
"reward": 0.8178807497024536,
"reward_std": 0.49368974566459656,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.1821192353963852,
"rewards/tag_count_reward": 0.8125,
"step": 161
},
{
"completion_length": 2048.0,
"epoch": 0.0108,
"grad_norm": 161.0198211669922,
"kl": 7.0,
"learning_rate": 1e-06,
"loss": 0.2802,
"reward": 0.3412390947341919,
"reward_std": 0.3579902648925781,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23688587546348572,
"rewards/tag_count_reward": 0.578125,
"step": 162
},
{
"completion_length": 790.0,
"epoch": 0.010866666666666667,
"grad_norm": 148.21499633789062,
"kl": 5.84375,
"learning_rate": 1e-06,
"loss": 0.2336,
"reward": 0.9175270199775696,
"reward_std": 0.5335466265678406,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.19184798002243042,
"rewards/tag_count_reward": 0.859375,
"step": 163
},
{
"completion_length": 679.5,
"epoch": 0.010933333333333333,
"grad_norm": 100.95904541015625,
"kl": 2.59375,
"learning_rate": 1e-06,
"loss": 0.1042,
"reward": 0.5523126721382141,
"reward_std": 0.22545668482780457,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1976873278617859,
"rewards/tag_count_reward": 0.75,
"step": 164
},
{
"completion_length": 419.0,
"epoch": 0.011,
"grad_norm": 89.20986938476562,
"kl": 0.8046875,
"learning_rate": 1e-06,
"loss": 0.0321,
"reward": 1.2350327968597412,
"reward_std": 0.5273359417915344,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.04621714726090431,
"rewards/tag_count_reward": 0.90625,
"step": 165
},
{
"completion_length": 2048.0,
"epoch": 0.011066666666666667,
"grad_norm": 80.26315307617188,
"kl": 3.15625,
"learning_rate": 1e-06,
"loss": 0.1263,
"reward": 0.2963656485080719,
"reward_std": 0.39362943172454834,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3442593514919281,
"rewards/tag_count_reward": 0.578125,
"step": 166
},
{
"completion_length": 710.5,
"epoch": 0.011133333333333334,
"grad_norm": 152.8345184326172,
"kl": 0.72265625,
"learning_rate": 1e-06,
"loss": 0.0289,
"reward": 0.671838641166687,
"reward_std": 0.23800846934318542,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20316140353679657,
"rewards/tag_count_reward": 0.875,
"step": 167
},
{
"completion_length": 2048.0,
"epoch": 0.0112,
"grad_norm": 109.0709457397461,
"kl": 2.4375,
"learning_rate": 1e-06,
"loss": 0.0978,
"reward": 0.33459407091140747,
"reward_std": 0.21566098928451538,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3216559588909149,
"rewards/tag_count_reward": 0.65625,
"step": 168
},
{
"completion_length": 1306.0,
"epoch": 0.011266666666666666,
"grad_norm": 122.21932220458984,
"kl": 1.8125,
"learning_rate": 1e-06,
"loss": 0.0723,
"reward": 0.7176011204719543,
"reward_std": 0.44902274012565613,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.29802384972572327,
"rewards/tag_count_reward": 0.828125,
"step": 169
},
{
"completion_length": 345.5,
"epoch": 0.011333333333333334,
"grad_norm": 77.08424377441406,
"kl": 2.125,
"learning_rate": 1e-06,
"loss": 0.0854,
"reward": 1.0335280895233154,
"reward_std": 0.5491755604743958,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.13834688067436218,
"rewards/tag_count_reward": 0.921875,
"step": 170
},
{
"completion_length": 928.0,
"epoch": 0.0114,
"grad_norm": 92.87094116210938,
"kl": 2.765625,
"learning_rate": 1e-06,
"loss": 0.1105,
"reward": 0.6134524941444397,
"reward_std": 0.2891947627067566,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2146725207567215,
"rewards/tag_count_reward": 0.828125,
"step": 171
},
{
"completion_length": 1915.5,
"epoch": 0.011466666666666667,
"grad_norm": 96.58674621582031,
"kl": 5.46875,
"learning_rate": 1e-06,
"loss": 0.2197,
"reward": 0.4971694350242615,
"reward_std": 0.30854254961013794,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.28408053517341614,
"rewards/tag_count_reward": 0.78125,
"step": 172
},
{
"completion_length": 1015.5,
"epoch": 0.011533333333333333,
"grad_norm": 351.7418212890625,
"kl": 10.625,
"learning_rate": 1e-06,
"loss": 0.4244,
"reward": 0.578019380569458,
"reward_std": 0.5916845798492432,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.23448067903518677,
"rewards/tag_count_reward": 0.6875,
"step": 173
},
{
"completion_length": 1680.0,
"epoch": 0.0116,
"grad_norm": 150.63426208496094,
"kl": 3.40625,
"learning_rate": 1e-06,
"loss": 0.1358,
"reward": 0.2883574366569519,
"reward_std": 0.35795196890830994,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2585175633430481,
"rewards/tag_count_reward": 0.546875,
"step": 174
},
{
"completion_length": 1299.0,
"epoch": 0.011666666666666667,
"grad_norm": 49.87466812133789,
"kl": 1.25,
"learning_rate": 1e-06,
"loss": 0.0501,
"reward": 0.5055930614471436,
"reward_std": 0.2460363507270813,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21315690875053406,
"rewards/tag_count_reward": 0.71875,
"step": 175
},
{
"completion_length": 988.0,
"epoch": 0.011733333333333333,
"grad_norm": 97.638916015625,
"kl": 4.40625,
"learning_rate": 1e-06,
"loss": 0.1764,
"reward": 0.5065950155258179,
"reward_std": 0.3401361107826233,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.30590495467185974,
"rewards/tag_count_reward": 0.75,
"step": 176
},
{
"completion_length": 699.5,
"epoch": 0.0118,
"grad_norm": 98.37915802001953,
"kl": 1.296875,
"learning_rate": 1e-06,
"loss": 0.0516,
"reward": 0.46273481845855713,
"reward_std": 0.30883949995040894,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.17789018154144287,
"rewards/tag_count_reward": 0.640625,
"step": 177
},
{
"completion_length": 460.5,
"epoch": 0.011866666666666666,
"grad_norm": 128.74960327148438,
"kl": 0.5703125,
"learning_rate": 1e-06,
"loss": 0.023,
"reward": 1.0834062099456787,
"reward_std": 0.7106175422668457,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.08846880495548248,
"rewards/tag_count_reward": 0.984375,
"step": 178
},
{
"completion_length": 1096.0,
"epoch": 0.011933333333333334,
"grad_norm": 60.819427490234375,
"kl": 1.3828125,
"learning_rate": 1e-06,
"loss": 0.0553,
"reward": 0.9217681884765625,
"reward_std": 0.517062246799469,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.2032317817211151,
"rewards/tag_count_reward": 0.875,
"step": 179
},
{
"completion_length": 400.0,
"epoch": 0.012,
"grad_norm": 42.64118194580078,
"kl": 2.796875,
"learning_rate": 1e-06,
"loss": 0.1112,
"reward": 0.7124161720275879,
"reward_std": 0.2661663889884949,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14695881307125092,
"rewards/tag_count_reward": 0.859375,
"step": 180
},
{
"completion_length": 354.5,
"epoch": 0.012066666666666667,
"grad_norm": 1183.7877197265625,
"kl": 29.25,
"learning_rate": 1e-06,
"loss": 1.1724,
"reward": 0.8555076122283936,
"reward_std": 0.6536040306091309,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.20699241757392883,
"rewards/tag_count_reward": 0.8125,
"step": 181
},
{
"completion_length": 1114.5,
"epoch": 0.012133333333333333,
"grad_norm": 64.2253189086914,
"kl": 4.65625,
"learning_rate": 1e-06,
"loss": 0.1876,
"reward": 0.7184881567955017,
"reward_std": 0.39609208703041077,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.3283868432044983,
"rewards/tag_count_reward": 0.859375,
"step": 182
},
{
"completion_length": 1195.5,
"epoch": 0.0122,
"grad_norm": 60.13753128051758,
"kl": 2.46875,
"learning_rate": 1e-06,
"loss": 0.0985,
"reward": 0.6382501125335693,
"reward_std": 0.2662697732448578,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.12737488746643066,
"rewards/tag_count_reward": 0.765625,
"step": 183
},
{
"completion_length": 1723.0,
"epoch": 0.012266666666666667,
"grad_norm": 139.8374786376953,
"kl": 1.46875,
"learning_rate": 1e-06,
"loss": 0.0589,
"reward": 0.3703034520149231,
"reward_std": 0.3178945779800415,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1765715628862381,
"rewards/tag_count_reward": 0.546875,
"step": 184
},
{
"completion_length": 1334.0,
"epoch": 0.012333333333333333,
"grad_norm": 107.38422393798828,
"kl": 1.671875,
"learning_rate": 1e-06,
"loss": 0.0666,
"reward": 0.5631563663482666,
"reward_std": 0.363595187664032,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.10871861129999161,
"rewards/tag_count_reward": 0.671875,
"step": 185
},
{
"completion_length": 1153.0,
"epoch": 0.0124,
"grad_norm": 59.173736572265625,
"kl": 3.078125,
"learning_rate": 1e-06,
"loss": 0.1227,
"reward": 0.756859540939331,
"reward_std": 0.32119861245155334,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.29001548886299133,
"rewards/tag_count_reward": 0.984375,
"step": 186
},
{
"completion_length": 559.5,
"epoch": 0.012466666666666666,
"grad_norm": 77.07391357421875,
"kl": 1.390625,
"learning_rate": 1e-06,
"loss": 0.0556,
"reward": 0.8043298721313477,
"reward_std": 0.32981082797050476,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.14879512786865234,
"rewards/tag_count_reward": 0.765625,
"step": 187
},
{
"completion_length": 687.0,
"epoch": 0.012533333333333334,
"grad_norm": 181.65753173828125,
"kl": 6.6875,
"learning_rate": 1e-06,
"loss": 0.2685,
"reward": 0.7765844464302063,
"reward_std": 0.5066102743148804,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.1921655535697937,
"rewards/tag_count_reward": 0.78125,
"step": 188
},
{
"completion_length": 693.5,
"epoch": 0.0126,
"grad_norm": 153.3155517578125,
"kl": 4.1875,
"learning_rate": 1e-06,
"loss": 0.1682,
"reward": 0.8262364864349365,
"reward_std": 0.3578583598136902,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.31438854336738586,
"rewards/tag_count_reward": 0.953125,
"step": 189
},
{
"completion_length": 798.0,
"epoch": 0.012666666666666666,
"grad_norm": 50.24184036254883,
"kl": 2.375,
"learning_rate": 1e-06,
"loss": 0.0947,
"reward": 0.5746697187423706,
"reward_std": 0.26803144812583923,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2378302663564682,
"rewards/tag_count_reward": 0.8125,
"step": 190
},
{
"completion_length": 531.5,
"epoch": 0.012733333333333333,
"grad_norm": 59.12314224243164,
"kl": 1.59375,
"learning_rate": 1e-06,
"loss": 0.0635,
"reward": 0.6490483283996582,
"reward_std": 0.2684840261936188,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1947016566991806,
"rewards/tag_count_reward": 0.84375,
"step": 191
},
{
"completion_length": 1208.0,
"epoch": 0.0128,
"grad_norm": 59.42628860473633,
"kl": 4.46875,
"learning_rate": 1e-06,
"loss": 0.1795,
"reward": 0.8103557825088501,
"reward_std": 0.32967913150787354,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.23651918768882751,
"rewards/tag_count_reward": 0.984375,
"step": 192
},
{
"completion_length": 1280.0,
"epoch": 0.012866666666666667,
"grad_norm": 146.31739807128906,
"kl": 3.875,
"learning_rate": 1e-06,
"loss": 0.1556,
"reward": 0.6511543989181519,
"reward_std": 0.4586493670940399,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.20822066068649292,
"rewards/tag_count_reward": 0.671875,
"step": 193
},
{
"completion_length": 548.5,
"epoch": 0.012933333333333333,
"grad_norm": 86.45354461669922,
"kl": 3.09375,
"learning_rate": 1e-06,
"loss": 0.1237,
"reward": 0.5654099583625793,
"reward_std": 0.25625455379486084,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16896502673625946,
"rewards/tag_count_reward": 0.734375,
"step": 194
},
{
"completion_length": 453.5,
"epoch": 0.013,
"grad_norm": 109.8470230102539,
"kl": 0.359375,
"learning_rate": 1e-06,
"loss": 0.0144,
"reward": 0.8253956437110901,
"reward_std": 0.3288414478302002,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.2527293860912323,
"rewards/tag_count_reward": 0.953125,
"step": 195
},
{
"completion_length": 1591.5,
"epoch": 0.013066666666666667,
"grad_norm": 114.1063003540039,
"kl": 1.6171875,
"learning_rate": 1e-06,
"loss": 0.0646,
"reward": 0.299161821603775,
"reward_std": 0.1698484718799591,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24771319329738617,
"rewards/tag_count_reward": 0.546875,
"step": 196
},
{
"completion_length": 1003.5,
"epoch": 0.013133333333333334,
"grad_norm": 42.97124099731445,
"kl": 4.03125,
"learning_rate": 1e-06,
"loss": 0.1612,
"reward": 0.654625415802002,
"reward_std": 0.22866007685661316,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25162458419799805,
"rewards/tag_count_reward": 0.90625,
"step": 197
},
{
"completion_length": 1635.0,
"epoch": 0.0132,
"grad_norm": 145.948486328125,
"kl": 3.21875,
"learning_rate": 1e-06,
"loss": 0.1287,
"reward": 0.2930026054382324,
"reward_std": 0.30017927289009094,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2382473647594452,
"rewards/tag_count_reward": 0.53125,
"step": 198
},
{
"completion_length": 1250.5,
"epoch": 0.013266666666666666,
"grad_norm": 145.41311645507812,
"kl": 7.0,
"learning_rate": 1e-06,
"loss": 0.2798,
"reward": 0.7496009469032288,
"reward_std": 0.5164273977279663,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.20352405309677124,
"rewards/tag_count_reward": 0.765625,
"step": 199
},
{
"completion_length": 2048.0,
"epoch": 0.013333333333333334,
"grad_norm": 80.11041259765625,
"kl": 1.5625,
"learning_rate": 1e-06,
"loss": 0.0626,
"reward": 0.41792166233062744,
"reward_std": 0.18836495280265808,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.28520333766937256,
"rewards/tag_count_reward": 0.703125,
"step": 200
},
{
"completion_length": 1320.0,
"epoch": 0.0134,
"grad_norm": 404.68017578125,
"kl": 13.3125,
"learning_rate": 1e-06,
"loss": 0.5327,
"reward": 0.5508228540420532,
"reward_std": 0.47881072759628296,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2460521161556244,
"rewards/tag_count_reward": 0.734375,
"step": 201
},
{
"completion_length": 1161.0,
"epoch": 0.013466666666666667,
"grad_norm": 222.3137969970703,
"kl": 8.25,
"learning_rate": 1e-06,
"loss": 0.331,
"reward": 0.5511502027511597,
"reward_std": 0.2810462415218353,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2613498270511627,
"rewards/tag_count_reward": 0.75,
"step": 202
},
{
"completion_length": 1230.0,
"epoch": 0.013533333333333333,
"grad_norm": 132.03036499023438,
"kl": 7.6875,
"learning_rate": 1e-06,
"loss": 0.3065,
"reward": 0.47215622663497925,
"reward_std": 0.2874332070350647,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21534380316734314,
"rewards/tag_count_reward": 0.6875,
"step": 203
},
{
"completion_length": 2048.0,
"epoch": 0.0136,
"grad_norm": 145.11434936523438,
"kl": 5.8125,
"learning_rate": 1e-06,
"loss": 0.2328,
"reward": 0.339425265789032,
"reward_std": 0.2594582438468933,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.332449734210968,
"rewards/tag_count_reward": 0.671875,
"step": 204
},
{
"completion_length": 1684.0,
"epoch": 0.013666666666666667,
"grad_norm": 95.94991302490234,
"kl": 2.125,
"learning_rate": 1e-06,
"loss": 0.0852,
"reward": 0.4848323464393616,
"reward_std": 0.24807266891002655,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2807926535606384,
"rewards/tag_count_reward": 0.765625,
"step": 205
},
{
"completion_length": 2048.0,
"epoch": 0.013733333333333334,
"grad_norm": 25.70447540283203,
"kl": 3.15625,
"learning_rate": 1e-06,
"loss": 0.1265,
"reward": 0.7671686410903931,
"reward_std": 0.4565780758857727,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.26408135890960693,
"rewards/tag_count_reward": 0.96875,
"step": 206
},
{
"completion_length": 2048.0,
"epoch": 0.0138,
"grad_norm": 132.874267578125,
"kl": 1.703125,
"learning_rate": 1e-06,
"loss": 0.0683,
"reward": 0.5867856740951538,
"reward_std": 0.4050213098526001,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.3194643557071686,
"rewards/tag_count_reward": 0.71875,
"step": 207
},
{
"completion_length": 1151.5,
"epoch": 0.013866666666666666,
"grad_norm": 100.7705307006836,
"kl": 0.96875,
"learning_rate": 1e-06,
"loss": 0.0386,
"reward": 0.6679742336273193,
"reward_std": 0.43158334493637085,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22265075147151947,
"rewards/tag_count_reward": 0.828125,
"step": 208
},
{
"completion_length": 2048.0,
"epoch": 0.013933333333333334,
"grad_norm": 123.25395202636719,
"kl": 2.015625,
"learning_rate": 1e-06,
"loss": 0.0809,
"reward": 0.6185864210128784,
"reward_std": 0.4980039596557617,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.31891360878944397,
"rewards/tag_count_reward": 0.875,
"step": 209
},
{
"completion_length": 1145.0,
"epoch": 0.014,
"grad_norm": 67.15911865234375,
"kl": 1.0703125,
"learning_rate": 1e-06,
"loss": 0.0429,
"reward": 0.899847149848938,
"reward_std": 0.47282007336616516,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.1470278799533844,
"rewards/tag_count_reward": 0.796875,
"step": 210
},
{
"completion_length": 1179.5,
"epoch": 0.014066666666666667,
"grad_norm": 125.00872802734375,
"kl": 3.703125,
"learning_rate": 1e-06,
"loss": 0.1486,
"reward": 0.5287724733352661,
"reward_std": 0.22276633977890015,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2056024968624115,
"rewards/tag_count_reward": 0.734375,
"step": 211
},
{
"completion_length": 1127.0,
"epoch": 0.014133333333333333,
"grad_norm": 115.925048828125,
"kl": 3.015625,
"learning_rate": 1e-06,
"loss": 0.1203,
"reward": 1.0227022171020508,
"reward_std": 0.41721510887145996,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.13354776799678802,
"rewards/tag_count_reward": 0.71875,
"step": 212
},
{
"completion_length": 891.5,
"epoch": 0.0142,
"grad_norm": 319.3634033203125,
"kl": 5.78125,
"learning_rate": 1e-06,
"loss": 0.2325,
"reward": 1.142776608467102,
"reward_std": 0.6472839117050171,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.23222345113754272,
"rewards/tag_count_reward": 1.0,
"step": 213
},
{
"completion_length": 1132.5,
"epoch": 0.014266666666666667,
"grad_norm": 47.81877899169922,
"kl": 3.234375,
"learning_rate": 1e-06,
"loss": 0.1294,
"reward": 0.9716029763221741,
"reward_std": 0.5017786026000977,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.20027202367782593,
"rewards/tag_count_reward": 0.859375,
"step": 214
},
{
"completion_length": 2048.0,
"epoch": 0.014333333333333333,
"grad_norm": 94.92833709716797,
"kl": 6.375,
"learning_rate": 1e-06,
"loss": 0.2551,
"reward": 0.43028393387794495,
"reward_std": 0.3057904839515686,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27284103631973267,
"rewards/tag_count_reward": 0.703125,
"step": 215
},
{
"completion_length": 372.5,
"epoch": 0.0144,
"grad_norm": 48.02029800415039,
"kl": 2.015625,
"learning_rate": 1e-06,
"loss": 0.0801,
"reward": 0.7344586253166199,
"reward_std": 0.2993859648704529,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14054134488105774,
"rewards/tag_count_reward": 0.875,
"step": 216
},
{
"completion_length": 1484.5,
"epoch": 0.014466666666666666,
"grad_norm": 29.980205535888672,
"kl": 0.6875,
"learning_rate": 1e-06,
"loss": 0.0276,
"reward": 0.4853227734565735,
"reward_std": 0.326226145029068,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.10842724144458771,
"rewards/tag_count_reward": 0.59375,
"step": 217
},
{
"completion_length": 825.0,
"epoch": 0.014533333333333334,
"grad_norm": 115.4741439819336,
"kl": 5.75,
"learning_rate": 1e-06,
"loss": 0.23,
"reward": 0.534308135509491,
"reward_std": 0.4810641407966614,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.21569184958934784,
"rewards/tag_count_reward": 0.6875,
"step": 218
},
{
"completion_length": 1326.5,
"epoch": 0.0146,
"grad_norm": 71.67326354980469,
"kl": 2.5,
"learning_rate": 1e-06,
"loss": 0.0999,
"reward": 0.7836402058601379,
"reward_std": 0.5527104139328003,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.20073477923870087,
"rewards/tag_count_reward": 0.734375,
"step": 219
},
{
"completion_length": 838.5,
"epoch": 0.014666666666666666,
"grad_norm": 72.6811294555664,
"kl": 4.6875,
"learning_rate": 1e-06,
"loss": 0.1866,
"reward": 0.464601993560791,
"reward_std": 0.29503798484802246,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2697729766368866,
"rewards/tag_count_reward": 0.734375,
"step": 220
},
{
"completion_length": 1459.5,
"epoch": 0.014733333333333333,
"grad_norm": 198.50364685058594,
"kl": 3.140625,
"learning_rate": 1e-06,
"loss": 0.1256,
"reward": 0.6743848919868469,
"reward_std": 0.3051697909832001,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20061510801315308,
"rewards/tag_count_reward": 0.875,
"step": 221
},
{
"completion_length": 1243.5,
"epoch": 0.0148,
"grad_norm": 100.1600112915039,
"kl": 4.84375,
"learning_rate": 1e-06,
"loss": 0.1934,
"reward": 0.506662905216217,
"reward_std": 0.23734720051288605,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25896206498146057,
"rewards/tag_count_reward": 0.765625,
"step": 222
},
{
"completion_length": 1270.0,
"epoch": 0.014866666666666667,
"grad_norm": 380.06414794921875,
"kl": 8.0625,
"learning_rate": 1e-06,
"loss": 0.323,
"reward": 0.9009274840354919,
"reward_std": 0.6793674230575562,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.28657251596450806,
"rewards/tag_count_reward": 0.875,
"step": 223
},
{
"completion_length": 1717.5,
"epoch": 0.014933333333333333,
"grad_norm": 82.44145965576172,
"kl": 2.84375,
"learning_rate": 1e-06,
"loss": 0.1133,
"reward": 0.3924320340156555,
"reward_std": 0.28474855422973633,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2169429361820221,
"rewards/tag_count_reward": 0.609375,
"step": 224
},
{
"completion_length": 1228.5,
"epoch": 0.015,
"grad_norm": 315.5871276855469,
"kl": 8.8125,
"learning_rate": 1e-06,
"loss": 0.3517,
"reward": 1.075720191001892,
"reward_std": 0.5324565172195435,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.2680298388004303,
"rewards/tag_count_reward": 0.96875,
"step": 225
},
{
"completion_length": 520.0,
"epoch": 0.015066666666666667,
"grad_norm": 91.1231918334961,
"kl": 4.875,
"learning_rate": 1e-06,
"loss": 0.1961,
"reward": 0.737302303314209,
"reward_std": 0.2117619812488556,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2158227115869522,
"rewards/tag_count_reward": 0.953125,
"step": 226
},
{
"completion_length": 1367.5,
"epoch": 0.015133333333333334,
"grad_norm": 77.13899993896484,
"kl": 4.71875,
"learning_rate": 1e-06,
"loss": 0.1891,
"reward": 0.4703078269958496,
"reward_std": 0.23381000757217407,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20156718790531158,
"rewards/tag_count_reward": 0.671875,
"step": 227
},
{
"completion_length": 407.0,
"epoch": 0.0152,
"grad_norm": 110.41172790527344,
"kl": 1.421875,
"learning_rate": 1e-06,
"loss": 0.0571,
"reward": 0.6082684993743896,
"reward_std": 0.22766825556755066,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.15735645592212677,
"rewards/tag_count_reward": 0.765625,
"step": 228
},
{
"completion_length": 1122.0,
"epoch": 0.015266666666666666,
"grad_norm": 40.2192268371582,
"kl": 2.515625,
"learning_rate": 1e-06,
"loss": 0.1008,
"reward": 0.6912946701049805,
"reward_std": 0.28589141368865967,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2618303894996643,
"rewards/tag_count_reward": 0.890625,
"step": 229
},
{
"completion_length": 718.5,
"epoch": 0.015333333333333332,
"grad_norm": 34.91520309448242,
"kl": 2.78125,
"learning_rate": 1e-06,
"loss": 0.1116,
"reward": 0.6401130557060242,
"reward_std": 0.49446383118629456,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.20363695919513702,
"rewards/tag_count_reward": 0.71875,
"step": 230
},
{
"completion_length": 1270.0,
"epoch": 0.0154,
"grad_norm": 230.4547119140625,
"kl": 1.1484375,
"learning_rate": 1e-06,
"loss": 0.046,
"reward": 0.5606362819671631,
"reward_std": 0.22004806995391846,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2831137180328369,
"rewards/tag_count_reward": 0.84375,
"step": 231
},
{
"completion_length": 993.0,
"epoch": 0.015466666666666667,
"grad_norm": 123.76904296875,
"kl": 2.265625,
"learning_rate": 1e-06,
"loss": 0.0911,
"reward": 1.2356938123703003,
"reward_std": 0.5522989630699158,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.35805612802505493,
"rewards/tag_count_reward": 0.96875,
"step": 232
},
{
"completion_length": 1782.5,
"epoch": 0.015533333333333333,
"grad_norm": 369.7264709472656,
"kl": 0.90625,
"learning_rate": 1e-06,
"loss": 0.0363,
"reward": 0.20918411016464233,
"reward_std": 0.31902557611465454,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22831588983535767,
"rewards/tag_count_reward": 0.4375,
"step": 233
},
{
"completion_length": 1251.0,
"epoch": 0.0156,
"grad_norm": 147.57382202148438,
"kl": 0.52734375,
"learning_rate": 1e-06,
"loss": 0.0211,
"reward": 0.7122491002082825,
"reward_std": 0.23013344407081604,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.11587591469287872,
"rewards/tag_count_reward": 0.828125,
"step": 234
},
{
"completion_length": 1186.0,
"epoch": 0.015666666666666666,
"grad_norm": 66.60592651367188,
"kl": 2.359375,
"learning_rate": 1e-06,
"loss": 0.0946,
"reward": 0.5424002408981323,
"reward_std": 0.26648008823394775,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23884975910186768,
"rewards/tag_count_reward": 0.78125,
"step": 235
},
{
"completion_length": 1340.0,
"epoch": 0.015733333333333332,
"grad_norm": 320.10870361328125,
"kl": 11.25,
"learning_rate": 1e-06,
"loss": 0.4481,
"reward": 0.4441903233528137,
"reward_std": 0.19974735379219055,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24330966174602509,
"rewards/tag_count_reward": 0.6875,
"step": 236
},
{
"completion_length": 846.5,
"epoch": 0.0158,
"grad_norm": 116.87240600585938,
"kl": 5.84375,
"learning_rate": 1e-06,
"loss": 0.2336,
"reward": 1.1734495162963867,
"reward_std": 0.4857546091079712,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.13905052840709686,
"rewards/tag_count_reward": 0.9375,
"step": 237
},
{
"completion_length": 1278.5,
"epoch": 0.015866666666666668,
"grad_norm": 20.454734802246094,
"kl": 2.375,
"learning_rate": 1e-06,
"loss": 0.0949,
"reward": 0.5781991481781006,
"reward_std": 0.3597751259803772,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2030508816242218,
"rewards/tag_count_reward": 0.71875,
"step": 238
},
{
"completion_length": 1672.0,
"epoch": 0.015933333333333334,
"grad_norm": 34.63027572631836,
"kl": 4.21875,
"learning_rate": 1e-06,
"loss": 0.1687,
"reward": 0.37047719955444336,
"reward_std": 0.3707921504974365,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19202280044555664,
"rewards/tag_count_reward": 0.5625,
"step": 239
},
{
"completion_length": 270.5,
"epoch": 0.016,
"grad_norm": 54.505245208740234,
"kl": 1.21875,
"learning_rate": 1e-06,
"loss": 0.0489,
"reward": 1.076977252960205,
"reward_std": 0.3801991939544678,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.1730227917432785,
"rewards/tag_count_reward": 1.0,
"step": 240
},
{
"completion_length": 1284.0,
"epoch": 0.016066666666666667,
"grad_norm": 85.85352325439453,
"kl": 3.1875,
"learning_rate": 1e-06,
"loss": 0.1275,
"reward": 0.47834357619285583,
"reward_std": 0.33723020553588867,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14665640890598297,
"rewards/tag_count_reward": 0.625,
"step": 241
},
{
"completion_length": 1172.0,
"epoch": 0.016133333333333333,
"grad_norm": 45.62316131591797,
"kl": 5.0,
"learning_rate": 1e-06,
"loss": 0.2004,
"reward": 0.6529799103736877,
"reward_std": 0.5432481169700623,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.20639507472515106,
"rewards/tag_count_reward": 0.734375,
"step": 242
},
{
"completion_length": 2048.0,
"epoch": 0.0162,
"grad_norm": 107.2427978515625,
"kl": 5.125,
"learning_rate": 1e-06,
"loss": 0.2056,
"reward": 0.25203531980514526,
"reward_std": 0.327674925327301,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.29483968019485474,
"rewards/tag_count_reward": 0.484375,
"step": 243
},
{
"completion_length": 1438.0,
"epoch": 0.016266666666666665,
"grad_norm": 135.52386474609375,
"kl": 1.140625,
"learning_rate": 1e-06,
"loss": 0.0456,
"reward": 0.6074755191802979,
"reward_std": 0.307578980922699,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18939949572086334,
"rewards/tag_count_reward": 0.796875,
"step": 244
},
{
"completion_length": 1279.5,
"epoch": 0.01633333333333333,
"grad_norm": 72.26471710205078,
"kl": 0.8125,
"learning_rate": 1e-06,
"loss": 0.0324,
"reward": 0.637866735458374,
"reward_std": 0.388823926448822,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.25275832414627075,
"rewards/tag_count_reward": 0.828125,
"step": 245
},
{
"completion_length": 1477.5,
"epoch": 0.0164,
"grad_norm": 222.29373168945312,
"kl": 2.109375,
"learning_rate": 1e-06,
"loss": 0.0841,
"reward": 0.23366467654705048,
"reward_std": 0.26289576292037964,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25071030855178833,
"rewards/tag_count_reward": 0.484375,
"step": 246
},
{
"completion_length": 1201.0,
"epoch": 0.016466666666666668,
"grad_norm": 36.81657409667969,
"kl": 1.25,
"learning_rate": 1e-06,
"loss": 0.0501,
"reward": 0.4497566819190979,
"reward_std": 0.36574527621269226,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2689933180809021,
"rewards/tag_count_reward": 0.71875,
"step": 247
},
{
"completion_length": 1144.0,
"epoch": 0.016533333333333334,
"grad_norm": 95.82453155517578,
"kl": 1.1875,
"learning_rate": 1e-06,
"loss": 0.0475,
"reward": 0.8179270625114441,
"reward_std": 0.6401329040527344,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.1351979672908783,
"rewards/tag_count_reward": 0.828125,
"step": 248
},
{
"completion_length": 1120.5,
"epoch": 0.0166,
"grad_norm": 82.85420227050781,
"kl": 4.28125,
"learning_rate": 1e-06,
"loss": 0.1707,
"reward": 0.37731966376304626,
"reward_std": 0.30782580375671387,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26330533623695374,
"rewards/tag_count_reward": 0.640625,
"step": 249
},
{
"completion_length": 1404.5,
"epoch": 0.016666666666666666,
"grad_norm": 101.45523071289062,
"kl": 3.640625,
"learning_rate": 1e-06,
"loss": 0.1453,
"reward": 0.5992006063461304,
"reward_std": 0.5270397663116455,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.18204940855503082,
"rewards/tag_count_reward": 0.65625,
"step": 250
},
{
"completion_length": 1210.5,
"epoch": 0.016733333333333333,
"grad_norm": 202.80154418945312,
"kl": 8.875,
"learning_rate": 1e-06,
"loss": 0.3557,
"reward": 0.6146788001060486,
"reward_std": 0.30628567934036255,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.30719617009162903,
"rewards/tag_count_reward": 0.609375,
"step": 251
},
{
"completion_length": 343.5,
"epoch": 0.0168,
"grad_norm": 116.09471893310547,
"kl": 3.1875,
"learning_rate": 1e-06,
"loss": 0.1277,
"reward": 0.9948755502700806,
"reward_std": 0.40214404463768005,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.08324943482875824,
"rewards/tag_count_reward": 0.953125,
"step": 252
},
{
"completion_length": 907.5,
"epoch": 0.016866666666666665,
"grad_norm": 59.3304328918457,
"kl": 5.40625,
"learning_rate": 1e-06,
"loss": 0.2164,
"reward": 0.8052083253860474,
"reward_std": 0.4671739339828491,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.30416664481163025,
"rewards/tag_count_reward": 0.921875,
"step": 253
},
{
"completion_length": 1012.0,
"epoch": 0.016933333333333335,
"grad_norm": 105.59696960449219,
"kl": 5.46875,
"learning_rate": 1e-06,
"loss": 0.219,
"reward": 0.8351148366928101,
"reward_std": 0.5238816142082214,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.19613513350486755,
"rewards/tag_count_reward": 0.84375,
"step": 254
},
{
"completion_length": 1186.0,
"epoch": 0.017,
"grad_norm": 10.558053970336914,
"kl": 0.50390625,
"learning_rate": 1e-06,
"loss": 0.0202,
"reward": 0.6605161428451538,
"reward_std": 0.3056912124156952,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.15198387205600739,
"rewards/tag_count_reward": 0.8125,
"step": 255
},
{
"completion_length": 619.5,
"epoch": 0.017066666666666667,
"grad_norm": 368.22821044921875,
"kl": 1.203125,
"learning_rate": 1e-06,
"loss": 0.048,
"reward": 0.9540891647338867,
"reward_std": 0.4605638086795807,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.23341085016727448,
"rewards/tag_count_reward": 1.0,
"step": 256
},
{
"completion_length": 851.0,
"epoch": 0.017133333333333334,
"grad_norm": 60.82518005371094,
"kl": 1.1484375,
"learning_rate": 1e-06,
"loss": 0.0458,
"reward": 0.7120445370674133,
"reward_std": 0.1923874169588089,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24108049273490906,
"rewards/tag_count_reward": 0.953125,
"step": 257
},
{
"completion_length": 955.0,
"epoch": 0.0172,
"grad_norm": 88.13372802734375,
"kl": 3.640625,
"learning_rate": 1e-06,
"loss": 0.1454,
"reward": 0.5048485994338989,
"reward_std": 0.2593071162700653,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18265138566493988,
"rewards/tag_count_reward": 0.6875,
"step": 258
},
{
"completion_length": 1299.0,
"epoch": 0.017266666666666666,
"grad_norm": 50.409542083740234,
"kl": 2.59375,
"learning_rate": 1e-06,
"loss": 0.104,
"reward": 0.6450666785240173,
"reward_std": 0.39017635583877563,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.21430832147598267,
"rewards/tag_count_reward": 0.796875,
"step": 259
},
{
"completion_length": 746.0,
"epoch": 0.017333333333333333,
"grad_norm": 75.39633178710938,
"kl": 4.75,
"learning_rate": 1e-06,
"loss": 0.19,
"reward": 0.8463404178619385,
"reward_std": 0.5164816379547119,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.20053458213806152,
"rewards/tag_count_reward": 0.796875,
"step": 260
},
{
"completion_length": 720.5,
"epoch": 0.0174,
"grad_norm": 160.54527282714844,
"kl": 3.84375,
"learning_rate": 1e-06,
"loss": 0.1536,
"reward": 1.109606385231018,
"reward_std": 0.5404841303825378,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.20289357006549835,
"rewards/tag_count_reward": 1.0,
"step": 261
},
{
"completion_length": 1248.0,
"epoch": 0.017466666666666665,
"grad_norm": 339.8331604003906,
"kl": 5.125,
"learning_rate": 1e-06,
"loss": 0.2044,
"reward": 0.5907869338989258,
"reward_std": 0.255514919757843,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.12796306610107422,
"rewards/tag_count_reward": 0.71875,
"step": 262
},
{
"completion_length": 2011.0,
"epoch": 0.017533333333333335,
"grad_norm": 78.27387237548828,
"kl": 2.796875,
"learning_rate": 1e-06,
"loss": 0.1118,
"reward": 0.557160496711731,
"reward_std": 0.25057724118232727,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20846451818943024,
"rewards/tag_count_reward": 0.765625,
"step": 263
},
{
"completion_length": 1214.5,
"epoch": 0.0176,
"grad_norm": 116.71304321289062,
"kl": 1.65625,
"learning_rate": 1e-06,
"loss": 0.0659,
"reward": 0.880974292755127,
"reward_std": 0.3218742609024048,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.27527573704719543,
"rewards/tag_count_reward": 0.96875,
"step": 264
},
{
"completion_length": 1242.0,
"epoch": 0.017666666666666667,
"grad_norm": 141.15890502929688,
"kl": 3.0625,
"learning_rate": 1e-06,
"loss": 0.1228,
"reward": 0.9699192643165588,
"reward_std": 0.4596714377403259,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.21758070588111877,
"rewards/tag_count_reward": 1.0,
"step": 265
},
{
"completion_length": 1939.0,
"epoch": 0.017733333333333334,
"grad_norm": 193.25926208496094,
"kl": 6.375,
"learning_rate": 1e-06,
"loss": 0.2553,
"reward": 0.35130447149276733,
"reward_std": 0.2630960941314697,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25807052850723267,
"rewards/tag_count_reward": 0.609375,
"step": 266
},
{
"completion_length": 724.0,
"epoch": 0.0178,
"grad_norm": 447.6954650878906,
"kl": 5.71875,
"learning_rate": 1e-06,
"loss": 0.228,
"reward": 0.9759852886199951,
"reward_std": 0.4168573021888733,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.14901477098464966,
"rewards/tag_count_reward": 0.6875,
"step": 267
},
{
"completion_length": 1770.0,
"epoch": 0.017866666666666666,
"grad_norm": 105.21097564697266,
"kl": 4.3125,
"learning_rate": 1e-06,
"loss": 0.1719,
"reward": 0.5311283469200134,
"reward_std": 0.27244049310684204,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18762168288230896,
"rewards/tag_count_reward": 0.71875,
"step": 268
},
{
"completion_length": 1002.0,
"epoch": 0.017933333333333332,
"grad_norm": 195.61874389648438,
"kl": 1.78125,
"learning_rate": 1e-06,
"loss": 0.0713,
"reward": 0.4463762640953064,
"reward_std": 0.15696293115615845,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2879987955093384,
"rewards/tag_count_reward": 0.734375,
"step": 269
},
{
"completion_length": 715.5,
"epoch": 0.018,
"grad_norm": 138.14581298828125,
"kl": 1.203125,
"learning_rate": 1e-06,
"loss": 0.0482,
"reward": 1.0527253150939941,
"reward_std": 0.32684525847435,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.22852468490600586,
"rewards/tag_count_reward": 0.96875,
"step": 270
},
{
"completion_length": 650.0,
"epoch": 0.01806666666666667,
"grad_norm": 78.0746841430664,
"kl": 0.84375,
"learning_rate": 1e-06,
"loss": 0.0337,
"reward": 1.0004079341888428,
"reward_std": 0.6274251341819763,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.093342125415802,
"rewards/tag_count_reward": 0.90625,
"step": 271
},
{
"completion_length": 1242.0,
"epoch": 0.018133333333333335,
"grad_norm": 143.22390747070312,
"kl": 1.3671875,
"learning_rate": 1e-06,
"loss": 0.0548,
"reward": 0.3881889581680298,
"reward_std": 0.20968960225582123,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2993110120296478,
"rewards/tag_count_reward": 0.6875,
"step": 272
},
{
"completion_length": 1436.0,
"epoch": 0.0182,
"grad_norm": 115.1573715209961,
"kl": 1.1015625,
"learning_rate": 1e-06,
"loss": 0.0439,
"reward": 0.6134010553359985,
"reward_std": 0.3705349862575531,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.16784895956516266,
"rewards/tag_count_reward": 0.71875,
"step": 273
},
{
"completion_length": 1050.0,
"epoch": 0.018266666666666667,
"grad_norm": 48.76930618286133,
"kl": 1.359375,
"learning_rate": 1e-06,
"loss": 0.0542,
"reward": 0.7366237044334412,
"reward_std": 0.2547900378704071,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13837629556655884,
"rewards/tag_count_reward": 0.875,
"step": 274
},
{
"completion_length": 1306.5,
"epoch": 0.018333333333333333,
"grad_norm": 65.57711791992188,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.0863,
"reward": 0.6077868342399597,
"reward_std": 0.20309025049209595,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2515881657600403,
"rewards/tag_count_reward": 0.859375,
"step": 275
},
{
"completion_length": 539.5,
"epoch": 0.0184,
"grad_norm": 47.25210189819336,
"kl": 1.2265625,
"learning_rate": 1e-06,
"loss": 0.0491,
"reward": 1.2373601198196411,
"reward_std": 0.45193660259246826,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.09076492488384247,
"rewards/tag_count_reward": 0.890625,
"step": 276
},
{
"completion_length": 1508.0,
"epoch": 0.018466666666666666,
"grad_norm": 63.90336990356445,
"kl": 5.0625,
"learning_rate": 1e-06,
"loss": 0.2025,
"reward": 0.39768722653388977,
"reward_std": 0.34046733379364014,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16481277346611023,
"rewards/tag_count_reward": 0.5625,
"step": 277
},
{
"completion_length": 304.0,
"epoch": 0.018533333333333332,
"grad_norm": 190.8180694580078,
"kl": 7.34375,
"learning_rate": 1e-06,
"loss": 0.293,
"reward": 0.985604465007782,
"reward_std": 0.49539294838905334,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.13939552009105682,
"rewards/tag_count_reward": 1.0,
"step": 278
},
{
"completion_length": 878.5,
"epoch": 0.0186,
"grad_norm": 267.1549072265625,
"kl": 9.625,
"learning_rate": 1e-06,
"loss": 0.3847,
"reward": 0.7716439962387085,
"reward_std": 0.4284132421016693,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1502310335636139,
"rewards/tag_count_reward": 0.859375,
"step": 279
},
{
"completion_length": 1235.0,
"epoch": 0.018666666666666668,
"grad_norm": 213.63856506347656,
"kl": 11.125,
"learning_rate": 1e-06,
"loss": 0.443,
"reward": 0.884079098701477,
"reward_std": 0.45604008436203003,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.22529590129852295,
"rewards/tag_count_reward": 0.734375,
"step": 280
},
{
"completion_length": 288.5,
"epoch": 0.018733333333333334,
"grad_norm": 86.7800521850586,
"kl": 4.0625,
"learning_rate": 1e-06,
"loss": 0.1625,
"reward": 1.1818442344665527,
"reward_std": 0.4051699638366699,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.13065576553344727,
"rewards/tag_count_reward": 1.0,
"step": 281
},
{
"completion_length": 1323.0,
"epoch": 0.0188,
"grad_norm": 106.07861328125,
"kl": 1.8359375,
"learning_rate": 1e-06,
"loss": 0.0733,
"reward": 0.3720395565032959,
"reward_std": 0.2879902720451355,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2529604434967041,
"rewards/tag_count_reward": 0.625,
"step": 282
},
{
"completion_length": 1189.5,
"epoch": 0.018866666666666667,
"grad_norm": 162.4084930419922,
"kl": 6.4375,
"learning_rate": 1e-06,
"loss": 0.2579,
"reward": 0.7242240905761719,
"reward_std": 0.2956833243370056,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.29140087962150574,
"rewards/tag_count_reward": 0.953125,
"step": 283
},
{
"completion_length": 927.5,
"epoch": 0.018933333333333333,
"grad_norm": 127.29533386230469,
"kl": 2.28125,
"learning_rate": 1e-06,
"loss": 0.0912,
"reward": 1.453515648841858,
"reward_std": 0.6606445908546448,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.1558593511581421,
"rewards/tag_count_reward": 0.984375,
"step": 284
},
{
"completion_length": 1550.5,
"epoch": 0.019,
"grad_norm": 90.6812515258789,
"kl": 2.34375,
"learning_rate": 1e-06,
"loss": 0.0938,
"reward": 0.31715595722198486,
"reward_std": 0.30243024230003357,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3390940725803375,
"rewards/tag_count_reward": 0.65625,
"step": 285
},
{
"completion_length": 277.5,
"epoch": 0.019066666666666666,
"grad_norm": 60.94807052612305,
"kl": 0.171875,
"learning_rate": 1e-06,
"loss": 0.0069,
"reward": 1.2874398231506348,
"reward_std": 0.2819579541683197,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.16568517684936523,
"rewards/tag_count_reward": 0.953125,
"step": 286
},
{
"completion_length": 1065.0,
"epoch": 0.019133333333333332,
"grad_norm": 78.6766128540039,
"kl": 1.2890625,
"learning_rate": 1e-06,
"loss": 0.0517,
"reward": 0.28757408261299133,
"reward_std": 0.2649262547492981,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25930091738700867,
"rewards/tag_count_reward": 0.546875,
"step": 287
},
{
"completion_length": 1198.0,
"epoch": 0.0192,
"grad_norm": 26.8408260345459,
"kl": 3.28125,
"learning_rate": 1e-06,
"loss": 0.131,
"reward": 0.44327443838119507,
"reward_std": 0.3510250449180603,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.33797556161880493,
"rewards/tag_count_reward": 0.78125,
"step": 288
},
{
"completion_length": 771.0,
"epoch": 0.019266666666666668,
"grad_norm": 101.416259765625,
"kl": 1.1796875,
"learning_rate": 1e-06,
"loss": 0.047,
"reward": 0.7894538044929504,
"reward_std": 0.1728997826576233,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.17929621040821075,
"rewards/tag_count_reward": 0.96875,
"step": 289
},
{
"completion_length": 1305.0,
"epoch": 0.019333333333333334,
"grad_norm": 92.49710845947266,
"kl": 1.34375,
"learning_rate": 1e-06,
"loss": 0.0537,
"reward": 0.9623291492462158,
"reward_std": 0.44925224781036377,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.22517085075378418,
"rewards/tag_count_reward": 0.75,
"step": 290
},
{
"completion_length": 1113.5,
"epoch": 0.0194,
"grad_norm": 221.7279510498047,
"kl": 1.7109375,
"learning_rate": 1e-06,
"loss": 0.0685,
"reward": 1.2709362506866455,
"reward_std": 0.2591468393802643,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.21343865990638733,
"rewards/tag_count_reward": 0.984375,
"step": 291
},
{
"completion_length": 406.5,
"epoch": 0.019466666666666667,
"grad_norm": 38.19606399536133,
"kl": 1.4375,
"learning_rate": 1e-06,
"loss": 0.0577,
"reward": 0.5383668541908264,
"reward_std": 0.3133370578289032,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19600816071033478,
"rewards/tag_count_reward": 0.734375,
"step": 292
},
{
"completion_length": 1367.0,
"epoch": 0.019533333333333333,
"grad_norm": 171.46334838867188,
"kl": 10.5,
"learning_rate": 1e-06,
"loss": 0.4187,
"reward": 0.433968186378479,
"reward_std": 0.2997130751609802,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.300406813621521,
"rewards/tag_count_reward": 0.734375,
"step": 293
},
{
"completion_length": 1106.0,
"epoch": 0.0196,
"grad_norm": 78.32112121582031,
"kl": 6.5625,
"learning_rate": 1e-06,
"loss": 0.2641,
"reward": 0.29133471846580505,
"reward_std": 0.3040623068809509,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14616529643535614,
"rewards/tag_count_reward": 0.4375,
"step": 294
},
{
"completion_length": 523.0,
"epoch": 0.019666666666666666,
"grad_norm": 193.3687744140625,
"kl": 4.96875,
"learning_rate": 1e-06,
"loss": 0.198,
"reward": 1.0770788192749023,
"reward_std": 0.3124271333217621,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.26667121052742004,
"rewards/tag_count_reward": 0.96875,
"step": 295
},
{
"completion_length": 1180.5,
"epoch": 0.019733333333333332,
"grad_norm": 115.18844604492188,
"kl": 6.21875,
"learning_rate": 1e-06,
"loss": 0.2484,
"reward": 0.7966092824935913,
"reward_std": 0.47257116436958313,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.2033906877040863,
"rewards/tag_count_reward": 0.75,
"step": 296
},
{
"completion_length": 1163.0,
"epoch": 0.0198,
"grad_norm": 25.589466094970703,
"kl": 4.21875,
"learning_rate": 1e-06,
"loss": 0.169,
"reward": 0.6315311193466187,
"reward_std": 0.541244387626648,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.27471891045570374,
"rewards/tag_count_reward": 0.78125,
"step": 297
},
{
"completion_length": 1623.5,
"epoch": 0.019866666666666668,
"grad_norm": 223.26202392578125,
"kl": 2.21875,
"learning_rate": 1e-06,
"loss": 0.0888,
"reward": 0.6494944095611572,
"reward_std": 0.5408167839050293,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.2567555606365204,
"rewards/tag_count_reward": 0.71875,
"step": 298
},
{
"completion_length": 396.0,
"epoch": 0.019933333333333334,
"grad_norm": 72.4623794555664,
"kl": 1.8671875,
"learning_rate": 1e-06,
"loss": 0.0748,
"reward": 0.8335331082344055,
"reward_std": 0.1918882131576538,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1352168619632721,
"rewards/tag_count_reward": 0.96875,
"step": 299
},
{
"completion_length": 1047.0,
"epoch": 0.02,
"grad_norm": 53.39625930786133,
"kl": 1.9296875,
"learning_rate": 1e-06,
"loss": 0.0774,
"reward": 0.8796528577804565,
"reward_std": 0.5016852617263794,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.22972214221954346,
"rewards/tag_count_reward": 0.921875,
"step": 300
},
{
"completion_length": 826.0,
"epoch": 0.020066666666666667,
"grad_norm": 22.02448844909668,
"kl": 0.84375,
"learning_rate": 1e-06,
"loss": 0.0337,
"reward": 1.2643147706985474,
"reward_std": 0.3837927579879761,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.15756019949913025,
"rewards/tag_count_reward": 0.984375,
"step": 301
},
{
"completion_length": 1103.0,
"epoch": 0.020133333333333333,
"grad_norm": 74.74524688720703,
"kl": 1.6484375,
"learning_rate": 1e-06,
"loss": 0.0659,
"reward": 0.6457744240760803,
"reward_std": 0.35607969760894775,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16672557592391968,
"rewards/tag_count_reward": 0.8125,
"step": 302
},
{
"completion_length": 493.5,
"epoch": 0.0202,
"grad_norm": 43.90092849731445,
"kl": 1.8125,
"learning_rate": 1e-06,
"loss": 0.0724,
"reward": 1.023780345916748,
"reward_std": 0.5628347992897034,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.22621965408325195,
"rewards/tag_count_reward": 1.0,
"step": 303
},
{
"completion_length": 1063.0,
"epoch": 0.020266666666666665,
"grad_norm": 35.594295501708984,
"kl": 2.0,
"learning_rate": 1e-06,
"loss": 0.0801,
"reward": 1.0189833641052246,
"reward_std": 0.5456187725067139,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.10601675510406494,
"rewards/tag_count_reward": 0.8125,
"step": 304
},
{
"completion_length": 2048.0,
"epoch": 0.02033333333333333,
"grad_norm": 107.36284637451172,
"kl": 6.25,
"learning_rate": 1e-06,
"loss": 0.2504,
"reward": 0.26232224702835083,
"reward_std": 0.2836293578147888,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.31580275297164917,
"rewards/tag_count_reward": 0.578125,
"step": 305
},
{
"completion_length": 1701.0,
"epoch": 0.0204,
"grad_norm": 303.72821044921875,
"kl": 9.5,
"learning_rate": 1e-06,
"loss": 0.3803,
"reward": 0.5319410562515259,
"reward_std": 0.1759571135044098,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.4055589437484741,
"rewards/tag_count_reward": 0.9375,
"step": 306
},
{
"completion_length": 771.5,
"epoch": 0.020466666666666668,
"grad_norm": 59.3670539855957,
"kl": 2.953125,
"learning_rate": 1e-06,
"loss": 0.1183,
"reward": 1.1603425741195679,
"reward_std": 0.40957415103912354,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.19903242588043213,
"rewards/tag_count_reward": 0.921875,
"step": 307
},
{
"completion_length": 2048.0,
"epoch": 0.020533333333333334,
"grad_norm": 380.81689453125,
"kl": 17.5,
"learning_rate": 1e-06,
"loss": 0.7023,
"reward": 0.24851681292057037,
"reward_std": 0.20674453675746918,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22023317217826843,
"rewards/tag_count_reward": 0.46875,
"step": 308
},
{
"completion_length": 832.5,
"epoch": 0.0206,
"grad_norm": 132.42506408691406,
"kl": 8.1875,
"learning_rate": 1e-06,
"loss": 0.3275,
"reward": 0.7047885656356812,
"reward_std": 0.343413770198822,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18583643436431885,
"rewards/tag_count_reward": 0.890625,
"step": 309
},
{
"completion_length": 777.0,
"epoch": 0.020666666666666667,
"grad_norm": 76.12239837646484,
"kl": 3.6875,
"learning_rate": 1e-06,
"loss": 0.1471,
"reward": 0.8980907201766968,
"reward_std": 0.4372199773788452,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1175343245267868,
"rewards/tag_count_reward": 0.953125,
"step": 310
},
{
"completion_length": 1257.5,
"epoch": 0.020733333333333333,
"grad_norm": 271.6349182128906,
"kl": 6.65625,
"learning_rate": 1e-06,
"loss": 0.2665,
"reward": 1.0332744121551514,
"reward_std": 0.33565282821655273,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.2323506772518158,
"rewards/tag_count_reward": 0.953125,
"step": 311
},
{
"completion_length": 1716.5,
"epoch": 0.0208,
"grad_norm": 340.9157409667969,
"kl": 9.25,
"learning_rate": 1e-06,
"loss": 0.3681,
"reward": 0.30537861585617065,
"reward_std": 0.32000964879989624,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27274638414382935,
"rewards/tag_count_reward": 0.578125,
"step": 312
},
{
"completion_length": 430.0,
"epoch": 0.020866666666666665,
"grad_norm": 58.16603088378906,
"kl": 1.84375,
"learning_rate": 1e-06,
"loss": 0.0737,
"reward": 1.3933343887329102,
"reward_std": 0.26193928718566895,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.10666556656360626,
"rewards/tag_count_reward": 1.0,
"step": 313
},
{
"completion_length": 1505.0,
"epoch": 0.020933333333333335,
"grad_norm": 42.11033248901367,
"kl": 4.3125,
"learning_rate": 1e-06,
"loss": 0.1722,
"reward": 0.6482540369033813,
"reward_std": 0.22455057501792908,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.38299596309661865,
"rewards/tag_count_reward": 0.96875,
"step": 314
},
{
"completion_length": 302.0,
"epoch": 0.021,
"grad_norm": 42.860755920410156,
"kl": 1.796875,
"learning_rate": 1e-06,
"loss": 0.0718,
"reward": 1.214234471321106,
"reward_std": 0.6082682609558105,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.1295156180858612,
"rewards/tag_count_reward": 0.96875,
"step": 315
},
{
"completion_length": 590.0,
"epoch": 0.021066666666666668,
"grad_norm": 62.87367248535156,
"kl": 0.76953125,
"learning_rate": 1e-06,
"loss": 0.0308,
"reward": 0.9708921909332275,
"reward_std": 0.5262269973754883,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.10723280906677246,
"rewards/tag_count_reward": 0.828125,
"step": 316
},
{
"completion_length": 687.5,
"epoch": 0.021133333333333334,
"grad_norm": 32.127357482910156,
"kl": 0.34765625,
"learning_rate": 1e-06,
"loss": 0.014,
"reward": 0.9708912372589111,
"reward_std": 0.4935838580131531,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.21660876274108887,
"rewards/tag_count_reward": 0.9375,
"step": 317
},
{
"completion_length": 1581.0,
"epoch": 0.0212,
"grad_norm": 104.86175537109375,
"kl": 0.7421875,
"learning_rate": 1e-06,
"loss": 0.0296,
"reward": 0.49017536640167236,
"reward_std": 0.22941234707832336,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18169961869716644,
"rewards/tag_count_reward": 0.671875,
"step": 318
},
{
"completion_length": 1262.5,
"epoch": 0.021266666666666666,
"grad_norm": 116.81327819824219,
"kl": 0.419921875,
"learning_rate": 1e-06,
"loss": 0.0168,
"reward": 0.8134942054748535,
"reward_std": 0.4755869209766388,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.15525580942630768,
"rewards/tag_count_reward": 0.71875,
"step": 319
},
{
"completion_length": 849.5,
"epoch": 0.021333333333333333,
"grad_norm": 57.31502914428711,
"kl": 0.6875,
"learning_rate": 1e-06,
"loss": 0.0276,
"reward": 1.1615235805511475,
"reward_std": 0.528767466545105,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.041601456701755524,
"rewards/tag_count_reward": 0.765625,
"step": 320
},
{
"completion_length": 1183.5,
"epoch": 0.0214,
"grad_norm": 27.733661651611328,
"kl": 1.3125,
"learning_rate": 1e-06,
"loss": 0.0528,
"reward": 1.0065999031066895,
"reward_std": 0.43920642137527466,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.2590251564979553,
"rewards/tag_count_reward": 0.890625,
"step": 321
},
{
"completion_length": 696.5,
"epoch": 0.021466666666666665,
"grad_norm": 228.986572265625,
"kl": 0.953125,
"learning_rate": 1e-06,
"loss": 0.0381,
"reward": 1.0592111349105835,
"reward_std": 0.7358847856521606,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.23766390979290009,
"rewards/tag_count_reward": 0.984375,
"step": 322
},
{
"completion_length": 1235.0,
"epoch": 0.021533333333333335,
"grad_norm": 80.84215545654297,
"kl": 1.390625,
"learning_rate": 1e-06,
"loss": 0.0558,
"reward": 0.6002234220504761,
"reward_std": 0.2300834059715271,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2435266226530075,
"rewards/tag_count_reward": 0.84375,
"step": 323
},
{
"completion_length": 1185.0,
"epoch": 0.0216,
"grad_norm": 40.54029083251953,
"kl": 2.3125,
"learning_rate": 1e-06,
"loss": 0.0924,
"reward": 1.0903337001800537,
"reward_std": 0.5135502815246582,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.15966621041297913,
"rewards/tag_count_reward": 0.875,
"step": 324
},
{
"completion_length": 1124.5,
"epoch": 0.021666666666666667,
"grad_norm": 96.52243041992188,
"kl": 2.96875,
"learning_rate": 1e-06,
"loss": 0.1185,
"reward": 0.8188521862030029,
"reward_std": 0.4867573380470276,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.22802278399467468,
"rewards/tag_count_reward": 0.796875,
"step": 325
},
{
"completion_length": 629.5,
"epoch": 0.021733333333333334,
"grad_norm": 52.54798126220703,
"kl": 1.78125,
"learning_rate": 1e-06,
"loss": 0.0713,
"reward": 0.6936306953430176,
"reward_std": 0.2830107510089874,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.243869349360466,
"rewards/tag_count_reward": 0.875,
"step": 326
},
{
"completion_length": 1273.0,
"epoch": 0.0218,
"grad_norm": 553.539306640625,
"kl": 24.0,
"learning_rate": 1e-06,
"loss": 0.9609,
"reward": 0.5443458557128906,
"reward_std": 0.2812609076499939,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23690414428710938,
"rewards/tag_count_reward": 0.78125,
"step": 327
},
{
"completion_length": 1118.0,
"epoch": 0.021866666666666666,
"grad_norm": 15.364245414733887,
"kl": 1.671875,
"learning_rate": 1e-06,
"loss": 0.0669,
"reward": 1.1107473373413086,
"reward_std": 0.29542076587677,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.21737776696681976,
"rewards/tag_count_reward": 0.953125,
"step": 328
},
{
"completion_length": 304.0,
"epoch": 0.021933333333333332,
"grad_norm": 151.0471649169922,
"kl": 7.3125,
"learning_rate": 1e-06,
"loss": 0.2924,
"reward": 1.7604129314422607,
"reward_std": 0.4952170252799988,
"rewards/accuracy_reward": 0.875,
"rewards/len_reward": -0.11458698660135269,
"rewards/tag_count_reward": 1.0,
"step": 329
},
{
"completion_length": 1306.5,
"epoch": 0.022,
"grad_norm": 956.6917724609375,
"kl": 37.25,
"learning_rate": 1e-06,
"loss": 1.491,
"reward": 0.5319682359695435,
"reward_std": 0.287946492433548,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20240673422813416,
"rewards/tag_count_reward": 0.734375,
"step": 330
},
{
"completion_length": 1410.5,
"epoch": 0.022066666666666665,
"grad_norm": 397.27294921875,
"kl": 21.875,
"learning_rate": 1e-06,
"loss": 0.8763,
"reward": 0.913928210735321,
"reward_std": 0.5697895288467407,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.19544677436351776,
"rewards/tag_count_reward": 0.734375,
"step": 331
},
{
"completion_length": 2048.0,
"epoch": 0.022133333333333335,
"grad_norm": 85.96682739257812,
"kl": 9.375,
"learning_rate": 1e-06,
"loss": 0.3737,
"reward": 0.4683043956756592,
"reward_std": 0.2435683161020279,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2504456043243408,
"rewards/tag_count_reward": 0.71875,
"step": 332
},
{
"completion_length": 774.5,
"epoch": 0.0222,
"grad_norm": 28.857770919799805,
"kl": 1.6328125,
"learning_rate": 1e-06,
"loss": 0.0654,
"reward": 1.199403166770935,
"reward_std": 0.4144977331161499,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.06622181087732315,
"rewards/tag_count_reward": 0.953125,
"step": 333
},
{
"completion_length": 1293.0,
"epoch": 0.022266666666666667,
"grad_norm": 76.90970611572266,
"kl": 1.9921875,
"learning_rate": 1e-06,
"loss": 0.0795,
"reward": 0.8091883063316345,
"reward_std": 0.4681074619293213,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.22206170856952667,
"rewards/tag_count_reward": 0.84375,
"step": 334
},
{
"completion_length": 1191.5,
"epoch": 0.022333333333333334,
"grad_norm": 72.85279846191406,
"kl": 2.5,
"learning_rate": 1e-06,
"loss": 0.1002,
"reward": 1.10751211643219,
"reward_std": 0.5649300217628479,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.25186288356781006,
"rewards/tag_count_reward": 0.984375,
"step": 335
},
{
"completion_length": 319.5,
"epoch": 0.0224,
"grad_norm": 46.205345153808594,
"kl": 1.21875,
"learning_rate": 1e-06,
"loss": 0.0487,
"reward": 1.3281185626983643,
"reward_std": 0.45367008447647095,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.09375645965337753,
"rewards/tag_count_reward": 0.984375,
"step": 336
},
{
"completion_length": 375.0,
"epoch": 0.022466666666666666,
"grad_norm": 31.738218307495117,
"kl": 1.6875,
"learning_rate": 1e-06,
"loss": 0.0671,
"reward": 1.096465826034546,
"reward_std": 0.48942404985427856,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.1379092037677765,
"rewards/tag_count_reward": 0.984375,
"step": 337
},
{
"completion_length": 537.5,
"epoch": 0.022533333333333332,
"grad_norm": 154.14036560058594,
"kl": 2.421875,
"learning_rate": 1e-06,
"loss": 0.0972,
"reward": 0.854365348815918,
"reward_std": 0.3954009413719177,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.30188465118408203,
"rewards/tag_count_reward": 0.96875,
"step": 338
},
{
"completion_length": 1025.0,
"epoch": 0.0226,
"grad_norm": 210.96768188476562,
"kl": 1.203125,
"learning_rate": 1e-06,
"loss": 0.0479,
"reward": 0.7532867789268494,
"reward_std": 0.35458308458328247,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.24671322107315063,
"rewards/tag_count_reward": 0.9375,
"step": 339
},
{
"completion_length": 1158.5,
"epoch": 0.02266666666666667,
"grad_norm": 37.851768493652344,
"kl": 1.3125,
"learning_rate": 1e-06,
"loss": 0.0525,
"reward": 0.7112846970558167,
"reward_std": 0.26829543709754944,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13246530294418335,
"rewards/tag_count_reward": 0.84375,
"step": 340
},
{
"completion_length": 892.5,
"epoch": 0.022733333333333335,
"grad_norm": 24.23723602294922,
"kl": 0.671875,
"learning_rate": 1e-06,
"loss": 0.027,
"reward": 0.9460461735725403,
"reward_std": 0.41133737564086914,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.06957884132862091,
"rewards/tag_count_reward": 0.953125,
"step": 341
},
{
"completion_length": 681.0,
"epoch": 0.0228,
"grad_norm": 33.455047607421875,
"kl": 1.1328125,
"learning_rate": 1e-06,
"loss": 0.0454,
"reward": 0.9595123529434204,
"reward_std": 0.5215348601341248,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.14986258745193481,
"rewards/tag_count_reward": 0.859375,
"step": 342
},
{
"completion_length": 1745.5,
"epoch": 0.022866666666666667,
"grad_norm": 46.849483489990234,
"kl": 2.28125,
"learning_rate": 1e-06,
"loss": 0.0912,
"reward": 0.3965286910533905,
"reward_std": 0.2530428469181061,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3065963387489319,
"rewards/tag_count_reward": 0.703125,
"step": 343
},
{
"completion_length": 1157.0,
"epoch": 0.022933333333333333,
"grad_norm": 144.76483154296875,
"kl": 4.4375,
"learning_rate": 1e-06,
"loss": 0.1764,
"reward": 0.6540185213088989,
"reward_std": 0.1685352623462677,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3303564786911011,
"rewards/tag_count_reward": 0.984375,
"step": 344
},
{
"completion_length": 598.0,
"epoch": 0.023,
"grad_norm": 28.438884735107422,
"kl": 4.8125,
"learning_rate": 1e-06,
"loss": 0.1917,
"reward": 0.9016408324241638,
"reward_std": 0.42892104387283325,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.1452341079711914,
"rewards/tag_count_reward": 0.921875,
"step": 345
},
{
"completion_length": 1474.5,
"epoch": 0.023066666666666666,
"grad_norm": 68.08282470703125,
"kl": 9.75,
"learning_rate": 1e-06,
"loss": 0.3908,
"reward": 1.0328866243362427,
"reward_std": 0.4325779974460602,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.3264884054660797,
"rewards/tag_count_reward": 0.984375,
"step": 346
},
{
"completion_length": 2048.0,
"epoch": 0.023133333333333332,
"grad_norm": 49.9034309387207,
"kl": 7.0,
"learning_rate": 1e-06,
"loss": 0.2797,
"reward": 0.33096492290496826,
"reward_std": 0.2335730493068695,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27841007709503174,
"rewards/tag_count_reward": 0.609375,
"step": 347
},
{
"completion_length": 818.5,
"epoch": 0.0232,
"grad_norm": 19.60797691345215,
"kl": 4.8125,
"learning_rate": 1e-06,
"loss": 0.1927,
"reward": 1.0055264234542847,
"reward_std": 0.5644086599349976,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.10384850949048996,
"rewards/tag_count_reward": 0.796875,
"step": 348
},
{
"completion_length": 374.5,
"epoch": 0.023266666666666668,
"grad_norm": 94.12194061279297,
"kl": 1.015625,
"learning_rate": 1e-06,
"loss": 0.0406,
"reward": 1.3110102415084839,
"reward_std": 0.3672176003456116,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.1264897584915161,
"rewards/tag_count_reward": 1.0,
"step": 349
},
{
"completion_length": 1221.0,
"epoch": 0.023333333333333334,
"grad_norm": 324.7854309082031,
"kl": 1.40625,
"learning_rate": 1e-06,
"loss": 0.0564,
"reward": 0.5963558554649353,
"reward_std": 0.23897361755371094,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1848941296339035,
"rewards/tag_count_reward": 0.78125,
"step": 350
},
{
"completion_length": 389.5,
"epoch": 0.0234,
"grad_norm": 26.179595947265625,
"kl": 1.6953125,
"learning_rate": 1e-06,
"loss": 0.0679,
"reward": 1.3193278312683105,
"reward_std": 0.7391749620437622,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.008797142654657364,
"rewards/tag_count_reward": 0.890625,
"step": 351
},
{
"completion_length": 1109.5,
"epoch": 0.023466666666666667,
"grad_norm": 188.46897888183594,
"kl": 1.203125,
"learning_rate": 1e-06,
"loss": 0.0482,
"reward": 0.5779841542243958,
"reward_std": 0.22797349095344543,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.29701584577560425,
"rewards/tag_count_reward": 0.875,
"step": 352
},
{
"completion_length": 1664.5,
"epoch": 0.023533333333333333,
"grad_norm": 27.162336349487305,
"kl": 1.4375,
"learning_rate": 1e-06,
"loss": 0.0573,
"reward": 0.6244074106216431,
"reward_std": 0.3301810026168823,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23496761918067932,
"rewards/tag_count_reward": 0.859375,
"step": 353
},
{
"completion_length": 540.5,
"epoch": 0.0236,
"grad_norm": 76.66584777832031,
"kl": 1.78125,
"learning_rate": 1e-06,
"loss": 0.0715,
"reward": 1.0277178287506104,
"reward_std": 0.3999354839324951,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.23790714144706726,
"rewards/tag_count_reward": 0.953125,
"step": 354
},
{
"completion_length": 730.0,
"epoch": 0.023666666666666666,
"grad_norm": 120.81816864013672,
"kl": 2.015625,
"learning_rate": 1e-06,
"loss": 0.0808,
"reward": 1.1916608810424805,
"reward_std": 0.7615381479263306,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.10521401464939117,
"rewards/tag_count_reward": 0.921875,
"step": 355
},
{
"completion_length": 758.0,
"epoch": 0.023733333333333332,
"grad_norm": 37.24180221557617,
"kl": 5.6875,
"learning_rate": 1e-06,
"loss": 0.2278,
"reward": 0.8624981641769409,
"reward_std": 0.5419960021972656,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.26250186562538147,
"rewards/tag_count_reward": 1.0,
"step": 356
},
{
"completion_length": 1211.5,
"epoch": 0.0238,
"grad_norm": 81.53892517089844,
"kl": 5.0625,
"learning_rate": 1e-06,
"loss": 0.2026,
"reward": 1.141159176826477,
"reward_std": 0.481996089220047,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.14009076356887817,
"rewards/tag_count_reward": 0.90625,
"step": 357
},
{
"completion_length": 1180.0,
"epoch": 0.023866666666666668,
"grad_norm": 93.55269622802734,
"kl": 5.4375,
"learning_rate": 1e-06,
"loss": 0.2177,
"reward": 0.537026047706604,
"reward_std": 0.20559044182300568,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.322348952293396,
"rewards/tag_count_reward": 0.859375,
"step": 358
},
{
"completion_length": 2048.0,
"epoch": 0.023933333333333334,
"grad_norm": 194.85621643066406,
"kl": 1.4765625,
"learning_rate": 1e-06,
"loss": 0.0592,
"reward": 0.3867694139480591,
"reward_std": 0.34691348671913147,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16010558605194092,
"rewards/tag_count_reward": 0.546875,
"step": 359
},
{
"completion_length": 2048.0,
"epoch": 0.024,
"grad_norm": 172.2584686279297,
"kl": 1.328125,
"learning_rate": 1e-06,
"loss": 0.053,
"reward": 0.42840299010276794,
"reward_std": 0.3341846466064453,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16534700989723206,
"rewards/tag_count_reward": 0.59375,
"step": 360
},
{
"completion_length": 1219.5,
"epoch": 0.024066666666666667,
"grad_norm": 111.23456573486328,
"kl": 1.3125,
"learning_rate": 1e-06,
"loss": 0.0526,
"reward": 0.6558999419212341,
"reward_std": 0.4681927561759949,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.21910005807876587,
"rewards/tag_count_reward": 0.75,
"step": 361
},
{
"completion_length": 2048.0,
"epoch": 0.024133333333333333,
"grad_norm": 164.71762084960938,
"kl": 1.2890625,
"learning_rate": 1e-06,
"loss": 0.0516,
"reward": 0.41724294424057007,
"reward_std": 0.24640598893165588,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.28588205575942993,
"rewards/tag_count_reward": 0.703125,
"step": 362
},
{
"completion_length": 1185.0,
"epoch": 0.0242,
"grad_norm": 113.31256103515625,
"kl": 1.1328125,
"learning_rate": 1e-06,
"loss": 0.0454,
"reward": 1.023838758468628,
"reward_std": 0.38074183464050293,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.17928624153137207,
"rewards/tag_count_reward": 0.953125,
"step": 363
},
{
"completion_length": 1585.0,
"epoch": 0.024266666666666666,
"grad_norm": 168.71810913085938,
"kl": 0.984375,
"learning_rate": 1e-06,
"loss": 0.0392,
"reward": 0.7161804437637329,
"reward_std": 0.5223672389984131,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.22131958603858948,
"rewards/tag_count_reward": 0.75,
"step": 364
},
{
"completion_length": 393.5,
"epoch": 0.024333333333333332,
"grad_norm": 60.523162841796875,
"kl": 0.66796875,
"learning_rate": 1e-06,
"loss": 0.0268,
"reward": 1.1848499774932861,
"reward_std": 0.6366668939590454,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.06514997035264969,
"rewards/tag_count_reward": 1.0,
"step": 365
},
{
"completion_length": 1315.0,
"epoch": 0.0244,
"grad_norm": 80.97102355957031,
"kl": 1.5,
"learning_rate": 1e-06,
"loss": 0.06,
"reward": 1.1217434406280518,
"reward_std": 0.6317075490951538,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.31575655937194824,
"rewards/tag_count_reward": 1.0,
"step": 366
},
{
"completion_length": 1144.0,
"epoch": 0.024466666666666668,
"grad_norm": 135.45469665527344,
"kl": 0.88671875,
"learning_rate": 1e-06,
"loss": 0.0354,
"reward": 1.0854109525680542,
"reward_std": 0.27328023314476013,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.2583390176296234,
"rewards/tag_count_reward": 0.84375,
"step": 367
},
{
"completion_length": 610.0,
"epoch": 0.024533333333333334,
"grad_norm": 30.466493606567383,
"kl": 1.140625,
"learning_rate": 1e-06,
"loss": 0.0456,
"reward": 0.7019455432891846,
"reward_std": 0.171876460313797,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2668044865131378,
"rewards/tag_count_reward": 0.96875,
"step": 368
},
{
"completion_length": 423.0,
"epoch": 0.0246,
"grad_norm": 230.95326232910156,
"kl": 0.8046875,
"learning_rate": 1e-06,
"loss": 0.0323,
"reward": 1.2531075477600098,
"reward_std": 0.44291990995407104,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.10626740008592606,
"rewards/tag_count_reward": 0.984375,
"step": 369
},
{
"completion_length": 1254.5,
"epoch": 0.024666666666666667,
"grad_norm": 38.18868637084961,
"kl": 0.8515625,
"learning_rate": 1e-06,
"loss": 0.0341,
"reward": 0.7070168852806091,
"reward_std": 0.25675168633461,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18360809981822968,
"rewards/tag_count_reward": 0.890625,
"step": 370
},
{
"completion_length": 1166.0,
"epoch": 0.024733333333333333,
"grad_norm": 58.44999694824219,
"kl": 0.828125,
"learning_rate": 1e-06,
"loss": 0.0332,
"reward": 0.9627978801727295,
"reward_std": 0.5553252696990967,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.13095210492610931,
"rewards/tag_count_reward": 0.78125,
"step": 371
},
{
"completion_length": 336.0,
"epoch": 0.0248,
"grad_norm": 31.88998794555664,
"kl": 1.015625,
"learning_rate": 1e-06,
"loss": 0.0404,
"reward": 1.3841114044189453,
"reward_std": 0.3197627663612366,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.11588859558105469,
"rewards/tag_count_reward": 1.0,
"step": 372
},
{
"completion_length": 2048.0,
"epoch": 0.024866666666666665,
"grad_norm": 85.23079681396484,
"kl": 2.46875,
"learning_rate": 1e-06,
"loss": 0.0992,
"reward": 0.5388705730438232,
"reward_std": 0.177922785282135,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.32050439715385437,
"rewards/tag_count_reward": 0.859375,
"step": 373
},
{
"completion_length": 1783.5,
"epoch": 0.02493333333333333,
"grad_norm": 33.19438552856445,
"kl": 2.09375,
"learning_rate": 1e-06,
"loss": 0.0834,
"reward": 0.5565052628517151,
"reward_std": 0.3148510456085205,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1934947371482849,
"rewards/tag_count_reward": 0.75,
"step": 374
},
{
"completion_length": 471.5,
"epoch": 0.025,
"grad_norm": 162.33657836914062,
"kl": 8.0,
"learning_rate": 1e-06,
"loss": 0.3206,
"reward": 1.2218003273010254,
"reward_std": 0.6198499798774719,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.184449702501297,
"rewards/tag_count_reward": 0.84375,
"step": 375
},
{
"completion_length": 1204.5,
"epoch": 0.025066666666666668,
"grad_norm": 131.4541015625,
"kl": 11.625,
"learning_rate": 1e-06,
"loss": 0.4628,
"reward": 0.6212787628173828,
"reward_std": 0.25988760590553284,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19122126698493958,
"rewards/tag_count_reward": 0.8125,
"step": 376
},
{
"completion_length": 1207.5,
"epoch": 0.025133333333333334,
"grad_norm": 189.53819274902344,
"kl": 11.625,
"learning_rate": 1e-06,
"loss": 0.4687,
"reward": 0.6212553977966309,
"reward_std": 0.3231739401817322,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22249463200569153,
"rewards/tag_count_reward": 0.78125,
"step": 377
},
{
"completion_length": 1216.5,
"epoch": 0.0252,
"grad_norm": 268.6429138183594,
"kl": 19.875,
"learning_rate": 1e-06,
"loss": 0.7935,
"reward": 0.6878746747970581,
"reward_std": 0.23541045188903809,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2340003401041031,
"rewards/tag_count_reward": 0.921875,
"step": 378
},
{
"completion_length": 808.0,
"epoch": 0.025266666666666666,
"grad_norm": 178.39500427246094,
"kl": 14.125,
"learning_rate": 1e-06,
"loss": 0.5631,
"reward": 0.7823457717895508,
"reward_std": 0.34783172607421875,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.2176542580127716,
"rewards/tag_count_reward": 0.875,
"step": 379
},
{
"completion_length": 473.5,
"epoch": 0.025333333333333333,
"grad_norm": 119.30746459960938,
"kl": 9.625,
"learning_rate": 1e-06,
"loss": 0.3831,
"reward": 1.4035418033599854,
"reward_std": 0.501310408115387,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.19020821154117584,
"rewards/tag_count_reward": 0.96875,
"step": 380
},
{
"completion_length": 2048.0,
"epoch": 0.0254,
"grad_norm": 85.62954711914062,
"kl": 5.125,
"learning_rate": 1e-06,
"loss": 0.2055,
"reward": 0.5226788520812988,
"reward_std": 0.311183899641037,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19607111811637878,
"rewards/tag_count_reward": 0.71875,
"step": 381
},
{
"completion_length": 1282.5,
"epoch": 0.025466666666666665,
"grad_norm": 11.626707077026367,
"kl": 2.171875,
"learning_rate": 1e-06,
"loss": 0.0865,
"reward": 0.5935724377632141,
"reward_std": 0.27991387248039246,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1564275622367859,
"rewards/tag_count_reward": 0.75,
"step": 382
},
{
"completion_length": 1230.5,
"epoch": 0.025533333333333335,
"grad_norm": 99.81382751464844,
"kl": 1.7578125,
"learning_rate": 1e-06,
"loss": 0.0703,
"reward": 0.6769819855690002,
"reward_std": 0.3789372444152832,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.15114302933216095,
"rewards/tag_count_reward": 0.765625,
"step": 383
},
{
"completion_length": 1200.5,
"epoch": 0.0256,
"grad_norm": 33.18088150024414,
"kl": 1.1875,
"learning_rate": 1e-06,
"loss": 0.0475,
"reward": 1.2311782836914062,
"reward_std": 0.5231276750564575,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.050071749836206436,
"rewards/tag_count_reward": 0.96875,
"step": 384
},
{
"completion_length": 1315.5,
"epoch": 0.025666666666666667,
"grad_norm": 104.46115112304688,
"kl": 2.109375,
"learning_rate": 1e-06,
"loss": 0.0843,
"reward": 0.6653253436088562,
"reward_std": 0.4485510289669037,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.209674671292305,
"rewards/tag_count_reward": 0.75,
"step": 385
},
{
"completion_length": 2048.0,
"epoch": 0.025733333333333334,
"grad_norm": 30.411439895629883,
"kl": 1.296875,
"learning_rate": 1e-06,
"loss": 0.0519,
"reward": 0.29646408557891846,
"reward_std": 0.305771142244339,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18791092932224274,
"rewards/tag_count_reward": 0.484375,
"step": 386
},
{
"completion_length": 1025.5,
"epoch": 0.0258,
"grad_norm": 33.216915130615234,
"kl": 1.7421875,
"learning_rate": 1e-06,
"loss": 0.0697,
"reward": 0.619266927242279,
"reward_std": 0.30492904782295227,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.25573304295539856,
"rewards/tag_count_reward": 0.8125,
"step": 387
},
{
"completion_length": 1230.5,
"epoch": 0.025866666666666666,
"grad_norm": 27.491132736206055,
"kl": 0.70703125,
"learning_rate": 1e-06,
"loss": 0.0283,
"reward": 0.8222702741622925,
"reward_std": 0.47413820028305054,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.08397974073886871,
"rewards/tag_count_reward": 0.71875,
"step": 388
},
{
"completion_length": 970.0,
"epoch": 0.025933333333333333,
"grad_norm": 146.60391235351562,
"kl": 1.7265625,
"learning_rate": 1e-06,
"loss": 0.069,
"reward": 0.8505053520202637,
"reward_std": 0.37996864318847656,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.3057446777820587,
"rewards/tag_count_reward": 0.96875,
"step": 389
},
{
"completion_length": 302.0,
"epoch": 0.026,
"grad_norm": 29.51785659790039,
"kl": 0.494140625,
"learning_rate": 1e-06,
"loss": 0.0198,
"reward": 1.5183930397033691,
"reward_std": 0.4361667037010193,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.10660697519779205,
"rewards/tag_count_reward": 1.0,
"step": 390
},
{
"completion_length": 1902.0,
"epoch": 0.026066666666666665,
"grad_norm": 27.1646671295166,
"kl": 1.7421875,
"learning_rate": 1e-06,
"loss": 0.07,
"reward": 0.3741278648376465,
"reward_std": 0.4004511833190918,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1727471500635147,
"rewards/tag_count_reward": 0.546875,
"step": 391
},
{
"completion_length": 1133.5,
"epoch": 0.026133333333333335,
"grad_norm": 47.449825286865234,
"kl": 1.578125,
"learning_rate": 1e-06,
"loss": 0.0632,
"reward": 1.2241830825805664,
"reward_std": 0.5153764486312866,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.2601918578147888,
"rewards/tag_count_reward": 0.984375,
"step": 392
},
{
"completion_length": 2048.0,
"epoch": 0.0262,
"grad_norm": 69.18492126464844,
"kl": 3.0625,
"learning_rate": 1e-06,
"loss": 0.1226,
"reward": 0.22290146350860596,
"reward_std": 0.32242223620414734,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23022352159023285,
"rewards/tag_count_reward": 0.453125,
"step": 393
},
{
"completion_length": 1254.5,
"epoch": 0.026266666666666667,
"grad_norm": 75.0274658203125,
"kl": 1.25,
"learning_rate": 1e-06,
"loss": 0.0502,
"reward": 0.6211944222450256,
"reward_std": 0.2534863352775574,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.11318057775497437,
"rewards/tag_count_reward": 0.734375,
"step": 394
},
{
"completion_length": 1313.5,
"epoch": 0.026333333333333334,
"grad_norm": 109.3488998413086,
"kl": 10.125,
"learning_rate": 1e-06,
"loss": 0.4038,
"reward": 0.37912631034851074,
"reward_std": 0.24370431900024414,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.33962368965148926,
"rewards/tag_count_reward": 0.71875,
"step": 395
},
{
"completion_length": 2048.0,
"epoch": 0.0264,
"grad_norm": 85.0662612915039,
"kl": 6.65625,
"learning_rate": 1e-06,
"loss": 0.2659,
"reward": 0.2979457676410675,
"reward_std": 0.2590464949607849,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1864292174577713,
"rewards/tag_count_reward": 0.484375,
"step": 396
},
{
"completion_length": 1397.0,
"epoch": 0.026466666666666666,
"grad_norm": 143.81922912597656,
"kl": 5.8125,
"learning_rate": 1e-06,
"loss": 0.232,
"reward": 0.4804523289203644,
"reward_std": 0.3569881319999695,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.332047700881958,
"rewards/tag_count_reward": 0.75,
"step": 397
},
{
"completion_length": 1368.5,
"epoch": 0.026533333333333332,
"grad_norm": 109.19786834716797,
"kl": 3.375,
"learning_rate": 1e-06,
"loss": 0.1349,
"reward": 0.5585415363311768,
"reward_std": 0.44386589527130127,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22270843386650085,
"rewards/tag_count_reward": 0.71875,
"step": 398
},
{
"completion_length": 1180.5,
"epoch": 0.0266,
"grad_norm": 61.98153305053711,
"kl": 2.9375,
"learning_rate": 1e-06,
"loss": 0.1172,
"reward": 1.0090720653533936,
"reward_std": 0.4242977499961853,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.13155291974544525,
"rewards/tag_count_reward": 0.765625,
"step": 399
},
{
"completion_length": 1199.5,
"epoch": 0.02666666666666667,
"grad_norm": 107.26168060302734,
"kl": 4.71875,
"learning_rate": 1e-06,
"loss": 0.189,
"reward": 1.1559706926345825,
"reward_std": 0.2972278594970703,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.14090432226657867,
"rewards/tag_count_reward": 0.796875,
"step": 400
},
{
"completion_length": 1264.5,
"epoch": 0.026733333333333335,
"grad_norm": 57.82908248901367,
"kl": 5.21875,
"learning_rate": 1e-06,
"loss": 0.2085,
"reward": 0.454619437456131,
"reward_std": 0.21859362721443176,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20163056254386902,
"rewards/tag_count_reward": 0.65625,
"step": 401
},
{
"completion_length": 2048.0,
"epoch": 0.0268,
"grad_norm": 40.06622314453125,
"kl": 3.34375,
"learning_rate": 1e-06,
"loss": 0.1344,
"reward": 0.4545770287513733,
"reward_std": 0.31251251697540283,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2485479712486267,
"rewards/tag_count_reward": 0.703125,
"step": 402
},
{
"completion_length": 1084.5,
"epoch": 0.026866666666666667,
"grad_norm": 64.30767059326172,
"kl": 6.0625,
"learning_rate": 1e-06,
"loss": 0.2421,
"reward": 0.4500392973423004,
"reward_std": 0.3190680742263794,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22183571755886078,
"rewards/tag_count_reward": 0.671875,
"step": 403
},
{
"completion_length": 488.5,
"epoch": 0.026933333333333333,
"grad_norm": 16.057842254638672,
"kl": 0.87109375,
"learning_rate": 1e-06,
"loss": 0.0349,
"reward": 1.2301216125488281,
"reward_std": 0.3597370386123657,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.17612846195697784,
"rewards/tag_count_reward": 0.96875,
"step": 404
},
{
"completion_length": 431.0,
"epoch": 0.027,
"grad_norm": 13.406998634338379,
"kl": 1.53125,
"learning_rate": 1e-06,
"loss": 0.0608,
"reward": 1.2892142534255981,
"reward_std": 0.5871255993843079,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.07016070187091827,
"rewards/tag_count_reward": 0.984375,
"step": 405
},
{
"completion_length": 1112.5,
"epoch": 0.027066666666666666,
"grad_norm": 12.13965129852295,
"kl": 2.796875,
"learning_rate": 1e-06,
"loss": 0.1119,
"reward": 0.812352180480957,
"reward_std": 0.5745599269866943,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.09389783442020416,
"rewards/tag_count_reward": 0.84375,
"step": 406
},
{
"completion_length": 559.0,
"epoch": 0.027133333333333332,
"grad_norm": 54.97040557861328,
"kl": 1.5078125,
"learning_rate": 1e-06,
"loss": 0.0601,
"reward": 1.1173365116119385,
"reward_std": 0.5239236354827881,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.24203848838806152,
"rewards/tag_count_reward": 0.984375,
"step": 407
},
{
"completion_length": 2048.0,
"epoch": 0.0272,
"grad_norm": 78.96199035644531,
"kl": 4.5625,
"learning_rate": 1e-06,
"loss": 0.1822,
"reward": 0.23337377607822418,
"reward_std": 0.2871261239051819,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.329126238822937,
"rewards/tag_count_reward": 0.5625,
"step": 408
},
{
"completion_length": 376.0,
"epoch": 0.027266666666666668,
"grad_norm": 68.48465728759766,
"kl": 3.0625,
"learning_rate": 1e-06,
"loss": 0.1221,
"reward": 1.1270508766174316,
"reward_std": 0.49426913261413574,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.16982409358024597,
"rewards/tag_count_reward": 0.984375,
"step": 409
},
{
"completion_length": 892.0,
"epoch": 0.027333333333333334,
"grad_norm": 55.446083068847656,
"kl": 3.65625,
"learning_rate": 1e-06,
"loss": 0.1462,
"reward": 1.3488011360168457,
"reward_std": 0.569901704788208,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.2761989235877991,
"rewards/tag_count_reward": 1.0,
"step": 410
},
{
"completion_length": 1137.0,
"epoch": 0.0274,
"grad_norm": 116.13448333740234,
"kl": 5.53125,
"learning_rate": 1e-06,
"loss": 0.2212,
"reward": 0.6423210501670837,
"reward_std": 0.47986292839050293,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.17017894983291626,
"rewards/tag_count_reward": 0.625,
"step": 411
},
{
"completion_length": 2048.0,
"epoch": 0.027466666666666667,
"grad_norm": 81.79183959960938,
"kl": 4.8125,
"learning_rate": 1e-06,
"loss": 0.1923,
"reward": 0.30772507190704346,
"reward_std": 0.2979990243911743,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.36414992809295654,
"rewards/tag_count_reward": 0.671875,
"step": 412
},
{
"completion_length": 1155.5,
"epoch": 0.027533333333333333,
"grad_norm": 17.789812088012695,
"kl": 1.1015625,
"learning_rate": 1e-06,
"loss": 0.044,
"reward": 0.6761860847473145,
"reward_std": 0.23097765445709229,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18318893015384674,
"rewards/tag_count_reward": 0.859375,
"step": 413
},
{
"completion_length": 760.5,
"epoch": 0.0276,
"grad_norm": 140.0294952392578,
"kl": 2.265625,
"learning_rate": 1e-06,
"loss": 0.0901,
"reward": 1.1233935356140137,
"reward_std": 0.19936342537403107,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.3141065239906311,
"rewards/tag_count_reward": 1.0,
"step": 414
},
{
"completion_length": 2048.0,
"epoch": 0.027666666666666666,
"grad_norm": 95.42269134521484,
"kl": 4.65625,
"learning_rate": 1e-06,
"loss": 0.1864,
"reward": 0.12715484201908112,
"reward_std": 0.24717658758163452,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3103451728820801,
"rewards/tag_count_reward": 0.4375,
"step": 415
},
{
"completion_length": 1188.0,
"epoch": 0.027733333333333332,
"grad_norm": 22.912418365478516,
"kl": 1.203125,
"learning_rate": 1e-06,
"loss": 0.0481,
"reward": 1.0479087829589844,
"reward_std": 0.39824777841567993,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.21771611273288727,
"rewards/tag_count_reward": 0.953125,
"step": 416
},
{
"completion_length": 459.5,
"epoch": 0.0278,
"grad_norm": 88.90909576416016,
"kl": 3.765625,
"learning_rate": 1e-06,
"loss": 0.1509,
"reward": 0.6286050081253052,
"reward_std": 0.3392166495323181,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1682700514793396,
"rewards/tag_count_reward": 0.796875,
"step": 417
},
{
"completion_length": 2048.0,
"epoch": 0.027866666666666668,
"grad_norm": 17.91539192199707,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.0867,
"reward": 0.3854790925979614,
"reward_std": 0.2980719208717346,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23952090740203857,
"rewards/tag_count_reward": 0.625,
"step": 418
},
{
"completion_length": 1353.0,
"epoch": 0.027933333333333334,
"grad_norm": 19.329565048217773,
"kl": 2.5625,
"learning_rate": 1e-06,
"loss": 0.102,
"reward": 0.4859287738800049,
"reward_std": 0.24047116935253143,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2640712261199951,
"rewards/tag_count_reward": 0.75,
"step": 419
},
{
"completion_length": 680.5,
"epoch": 0.028,
"grad_norm": 29.351930618286133,
"kl": 1.90625,
"learning_rate": 1e-06,
"loss": 0.0763,
"reward": 1.1519155502319336,
"reward_std": 0.2905888557434082,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.17620949447155,
"rewards/tag_count_reward": 0.828125,
"step": 420
},
{
"completion_length": 477.0,
"epoch": 0.028066666666666667,
"grad_norm": 4.9814300537109375,
"kl": 1.140625,
"learning_rate": 1e-06,
"loss": 0.0455,
"reward": 1.3871073722839355,
"reward_std": 0.49295300245285034,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.14414256811141968,
"rewards/tag_count_reward": 0.96875,
"step": 421
},
{
"completion_length": 691.0,
"epoch": 0.028133333333333333,
"grad_norm": 23.486236572265625,
"kl": 2.421875,
"learning_rate": 1e-06,
"loss": 0.097,
"reward": 1.2017005681991577,
"reward_std": 0.33617621660232544,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.23579944670200348,
"rewards/tag_count_reward": 0.9375,
"step": 422
},
{
"completion_length": 397.5,
"epoch": 0.0282,
"grad_norm": 172.2124786376953,
"kl": 0.9765625,
"learning_rate": 1e-06,
"loss": 0.039,
"reward": 1.824448823928833,
"reward_std": 0.5625327229499817,
"rewards/accuracy_reward": 0.8125,
"rewards/len_reward": 0.01194883044809103,
"rewards/tag_count_reward": 1.0,
"step": 423
},
{
"completion_length": 1932.5,
"epoch": 0.028266666666666666,
"grad_norm": 70.06598663330078,
"kl": 6.9375,
"learning_rate": 1e-06,
"loss": 0.2758,
"reward": 0.1855810135602951,
"reward_std": 0.3005276918411255,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2675439715385437,
"rewards/tag_count_reward": 0.453125,
"step": 424
},
{
"completion_length": 1108.0,
"epoch": 0.028333333333333332,
"grad_norm": 241.70701599121094,
"kl": 1.4609375,
"learning_rate": 1e-06,
"loss": 0.0585,
"reward": 0.6527888774871826,
"reward_std": 0.2796846628189087,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19096116721630096,
"rewards/tag_count_reward": 0.84375,
"step": 425
},
{
"completion_length": 1694.0,
"epoch": 0.0284,
"grad_norm": 38.58192825317383,
"kl": 3.796875,
"learning_rate": 1e-06,
"loss": 0.1519,
"reward": 0.3635767698287964,
"reward_std": 0.33601176738739014,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1832982301712036,
"rewards/tag_count_reward": 0.546875,
"step": 426
},
{
"completion_length": 729.0,
"epoch": 0.028466666666666668,
"grad_norm": 5.525942325592041,
"kl": 3.484375,
"learning_rate": 1e-06,
"loss": 0.1392,
"reward": 0.8202959299087524,
"reward_std": 0.5015732645988464,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.19532907009124756,
"rewards/tag_count_reward": 0.828125,
"step": 427
},
{
"completion_length": 1260.5,
"epoch": 0.028533333333333334,
"grad_norm": 24.404516220092773,
"kl": 2.84375,
"learning_rate": 1e-06,
"loss": 0.1135,
"reward": 0.5966103076934814,
"reward_std": 0.26570066809654236,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.15338970720767975,
"rewards/tag_count_reward": 0.75,
"step": 428
},
{
"completion_length": 1728.0,
"epoch": 0.0286,
"grad_norm": 24.004438400268555,
"kl": 4.5625,
"learning_rate": 1e-06,
"loss": 0.1825,
"reward": 0.1035110354423523,
"reward_std": 0.3564677834510803,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3496139645576477,
"rewards/tag_count_reward": 0.453125,
"step": 429
},
{
"completion_length": 2012.0,
"epoch": 0.028666666666666667,
"grad_norm": 210.22830200195312,
"kl": 2.96875,
"learning_rate": 1e-06,
"loss": 0.1191,
"reward": 0.4417126774787903,
"reward_std": 0.2711828649044037,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2145373374223709,
"rewards/tag_count_reward": 0.65625,
"step": 430
},
{
"completion_length": 761.5,
"epoch": 0.028733333333333333,
"grad_norm": 177.37310791015625,
"kl": 2.03125,
"learning_rate": 1e-06,
"loss": 0.0812,
"reward": 1.385439395904541,
"reward_std": 0.45246297121047974,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.17706066370010376,
"rewards/tag_count_reward": 1.0,
"step": 431
},
{
"completion_length": 1500.0,
"epoch": 0.0288,
"grad_norm": 25.036670684814453,
"kl": 3.0625,
"learning_rate": 1e-06,
"loss": 0.1224,
"reward": 0.506252110004425,
"reward_std": 0.25970926880836487,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13437287509441376,
"rewards/tag_count_reward": 0.640625,
"step": 432
},
{
"completion_length": 463.0,
"epoch": 0.028866666666666665,
"grad_norm": 98.45127868652344,
"kl": 2.265625,
"learning_rate": 1e-06,
"loss": 0.0906,
"reward": 0.9743916392326355,
"reward_std": 0.5563583970069885,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.10373339056968689,
"rewards/tag_count_reward": 0.953125,
"step": 433
},
{
"completion_length": 1468.5,
"epoch": 0.028933333333333332,
"grad_norm": 61.40692901611328,
"kl": 5.15625,
"learning_rate": 1e-06,
"loss": 0.2059,
"reward": 0.7104079127311707,
"reward_std": 0.26031026244163513,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22709208726882935,
"rewards/tag_count_reward": 0.9375,
"step": 434
},
{
"completion_length": 597.5,
"epoch": 0.029,
"grad_norm": 120.39767456054688,
"kl": 4.28125,
"learning_rate": 1e-06,
"loss": 0.1715,
"reward": 1.5235540866851807,
"reward_std": 0.5599320530891418,
"rewards/accuracy_reward": 0.75,
"rewards/len_reward": -0.17957091331481934,
"rewards/tag_count_reward": 0.953125,
"step": 435
},
{
"completion_length": 1170.5,
"epoch": 0.029066666666666668,
"grad_norm": 59.792537689208984,
"kl": 4.25,
"learning_rate": 1e-06,
"loss": 0.1692,
"reward": 0.570354700088501,
"reward_std": 0.3200645446777344,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.11714527010917664,
"rewards/tag_count_reward": 0.6875,
"step": 436
},
{
"completion_length": 801.0,
"epoch": 0.029133333333333334,
"grad_norm": 85.89793395996094,
"kl": 3.0625,
"learning_rate": 1e-06,
"loss": 0.1229,
"reward": 0.7610937356948853,
"reward_std": 0.19216570258140564,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22328129410743713,
"rewards/tag_count_reward": 0.984375,
"step": 437
},
{
"completion_length": 1052.5,
"epoch": 0.0292,
"grad_norm": 33.27427673339844,
"kl": 2.84375,
"learning_rate": 1e-06,
"loss": 0.1139,
"reward": 0.5006247162818909,
"reward_std": 0.42141035199165344,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2493753433227539,
"rewards/tag_count_reward": 0.6875,
"step": 438
},
{
"completion_length": 1325.5,
"epoch": 0.029266666666666667,
"grad_norm": 124.21721649169922,
"kl": 1.78125,
"learning_rate": 1e-06,
"loss": 0.0716,
"reward": 0.8362706899642944,
"reward_std": 0.4848806858062744,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.21060431003570557,
"rewards/tag_count_reward": 0.859375,
"step": 439
},
{
"completion_length": 1308.0,
"epoch": 0.029333333333333333,
"grad_norm": 101.55350494384766,
"kl": 1.0625,
"learning_rate": 1e-06,
"loss": 0.0427,
"reward": 0.8429738879203796,
"reward_std": 0.3363175094127655,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.21952611207962036,
"rewards/tag_count_reward": 1.0,
"step": 440
},
{
"completion_length": 1663.0,
"epoch": 0.0294,
"grad_norm": 272.64434814453125,
"kl": 1.2421875,
"learning_rate": 1e-06,
"loss": 0.0497,
"reward": 0.5526822805404663,
"reward_std": 0.23517288267612457,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2441927045583725,
"rewards/tag_count_reward": 0.796875,
"step": 441
},
{
"completion_length": 376.5,
"epoch": 0.029466666666666665,
"grad_norm": 19.101346969604492,
"kl": 0.85546875,
"learning_rate": 1e-06,
"loss": 0.0342,
"reward": 1.2024941444396973,
"reward_std": 0.47450199723243713,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.14125582575798035,
"rewards/tag_count_reward": 0.96875,
"step": 442
},
{
"completion_length": 355.0,
"epoch": 0.029533333333333335,
"grad_norm": 39.646183013916016,
"kl": 0.9453125,
"learning_rate": 1e-06,
"loss": 0.0377,
"reward": 1.3451943397521973,
"reward_std": 0.27902498841285706,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.1548057198524475,
"rewards/tag_count_reward": 1.0,
"step": 443
},
{
"completion_length": 584.0,
"epoch": 0.0296,
"grad_norm": 42.00813293457031,
"kl": 0.95703125,
"learning_rate": 1e-06,
"loss": 0.0383,
"reward": 0.9564022421836853,
"reward_std": 0.5531929731369019,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.1685977727174759,
"rewards/tag_count_reward": 1.0,
"step": 444
},
{
"completion_length": 459.5,
"epoch": 0.029666666666666668,
"grad_norm": 143.62857055664062,
"kl": 1.0078125,
"learning_rate": 1e-06,
"loss": 0.0403,
"reward": 1.3355096578598022,
"reward_std": 0.41100364923477173,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.13324040174484253,
"rewards/tag_count_reward": 0.90625,
"step": 445
},
{
"completion_length": 1615.0,
"epoch": 0.029733333333333334,
"grad_norm": 39.87702941894531,
"kl": 2.671875,
"learning_rate": 1e-06,
"loss": 0.1069,
"reward": 0.5858045816421509,
"reward_std": 0.3898976445198059,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.22669541835784912,
"rewards/tag_count_reward": 0.6875,
"step": 446
},
{
"completion_length": 1805.0,
"epoch": 0.0298,
"grad_norm": 117.98473358154297,
"kl": 2.515625,
"learning_rate": 1e-06,
"loss": 0.1008,
"reward": 0.18895292282104492,
"reward_std": 0.19049280881881714,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23292209208011627,
"rewards/tag_count_reward": 0.421875,
"step": 447
},
{
"completion_length": 1490.0,
"epoch": 0.029866666666666666,
"grad_norm": 73.97403717041016,
"kl": 2.25,
"learning_rate": 1e-06,
"loss": 0.0899,
"reward": 0.24734440445899963,
"reward_std": 0.2963607907295227,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.25265559554100037,
"rewards/tag_count_reward": 0.5,
"step": 448
},
{
"completion_length": 1220.0,
"epoch": 0.029933333333333333,
"grad_norm": 36.06205749511719,
"kl": 3.1875,
"learning_rate": 1e-06,
"loss": 0.1276,
"reward": 0.6867581009864807,
"reward_std": 0.38040876388549805,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.1882418543100357,
"rewards/tag_count_reward": 0.6875,
"step": 449
},
{
"completion_length": 2048.0,
"epoch": 0.03,
"grad_norm": 155.3662872314453,
"kl": 5.3125,
"learning_rate": 1e-06,
"loss": 0.2113,
"reward": 0.3176334500312805,
"reward_std": 0.2072276473045349,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3542415499687195,
"rewards/tag_count_reward": 0.671875,
"step": 450
},
{
"completion_length": 1347.0,
"epoch": 0.030066666666666665,
"grad_norm": 47.34263610839844,
"kl": 2.46875,
"learning_rate": 1e-06,
"loss": 0.0993,
"reward": 1.1042916774749756,
"reward_std": 0.5063524842262268,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.31758344173431396,
"rewards/tag_count_reward": 0.984375,
"step": 451
},
{
"completion_length": 1206.5,
"epoch": 0.030133333333333335,
"grad_norm": 45.91167068481445,
"kl": 5.3125,
"learning_rate": 1e-06,
"loss": 0.2126,
"reward": 0.4902770221233368,
"reward_std": 0.2801367938518524,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3065979778766632,
"rewards/tag_count_reward": 0.796875,
"step": 452
},
{
"completion_length": 2048.0,
"epoch": 0.0302,
"grad_norm": 31.02351188659668,
"kl": 3.46875,
"learning_rate": 1e-06,
"loss": 0.1384,
"reward": 0.15944889187812805,
"reward_std": 0.33201614022254944,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19992613792419434,
"rewards/tag_count_reward": 0.359375,
"step": 453
},
{
"completion_length": 1568.0,
"epoch": 0.030266666666666667,
"grad_norm": 2.735738515853882,
"kl": 1.296875,
"learning_rate": 1e-06,
"loss": 0.0517,
"reward": 0.6618618965148926,
"reward_std": 0.2902847230434418,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16626305878162384,
"rewards/tag_count_reward": 0.828125,
"step": 454
},
{
"completion_length": 1286.0,
"epoch": 0.030333333333333334,
"grad_norm": 108.56804656982422,
"kl": 1.828125,
"learning_rate": 1e-06,
"loss": 0.0731,
"reward": 1.221577525138855,
"reward_std": 0.56404048204422,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.24717243015766144,
"rewards/tag_count_reward": 0.96875,
"step": 455
},
{
"completion_length": 1626.0,
"epoch": 0.0304,
"grad_norm": 260.0552978515625,
"kl": 3.34375,
"learning_rate": 1e-06,
"loss": 0.134,
"reward": 0.4680972397327423,
"reward_std": 0.3092855215072632,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3287777900695801,
"rewards/tag_count_reward": 0.796875,
"step": 456
},
{
"completion_length": 929.5,
"epoch": 0.030466666666666666,
"grad_norm": 7.954984188079834,
"kl": 2.46875,
"learning_rate": 1e-06,
"loss": 0.099,
"reward": 0.6224107146263123,
"reward_std": 0.2342901974916458,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26821425557136536,
"rewards/tag_count_reward": 0.890625,
"step": 457
},
{
"completion_length": 680.5,
"epoch": 0.030533333333333332,
"grad_norm": 311.03179931640625,
"kl": 1.46875,
"learning_rate": 1e-06,
"loss": 0.0587,
"reward": 0.8654361963272095,
"reward_std": 0.2987762689590454,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.18143880367279053,
"rewards/tag_count_reward": 0.984375,
"step": 458
},
{
"completion_length": 2048.0,
"epoch": 0.0306,
"grad_norm": 364.86083984375,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.086,
"reward": 0.6716626882553101,
"reward_std": 0.31359678506851196,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.37521231174468994,
"rewards/tag_count_reward": 0.984375,
"step": 459
},
{
"completion_length": 482.5,
"epoch": 0.030666666666666665,
"grad_norm": 203.19033813476562,
"kl": 2.328125,
"learning_rate": 1e-06,
"loss": 0.0935,
"reward": 1.6676654815673828,
"reward_std": 0.43626925349235535,
"rewards/accuracy_reward": 0.8125,
"rewards/len_reward": -0.1448344886302948,
"rewards/tag_count_reward": 1.0,
"step": 460
},
{
"completion_length": 369.5,
"epoch": 0.030733333333333335,
"grad_norm": 94.32376861572266,
"kl": 4.5625,
"learning_rate": 1e-06,
"loss": 0.1816,
"reward": 1.7893270254135132,
"reward_std": 0.4871293008327484,
"rewards/accuracy_reward": 0.875,
"rewards/len_reward": -0.08567289263010025,
"rewards/tag_count_reward": 1.0,
"step": 461
},
{
"completion_length": 899.5,
"epoch": 0.0308,
"grad_norm": 26.289045333862305,
"kl": 2.0,
"learning_rate": 1e-06,
"loss": 0.0802,
"reward": 0.5772469639778137,
"reward_std": 0.33308613300323486,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.15712805092334747,
"rewards/tag_count_reward": 0.734375,
"step": 462
},
{
"completion_length": 1619.5,
"epoch": 0.030866666666666667,
"grad_norm": 45.15757751464844,
"kl": 1.765625,
"learning_rate": 1e-06,
"loss": 0.0708,
"reward": 1.156102180480957,
"reward_std": 0.2771090269088745,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.2657727897167206,
"rewards/tag_count_reward": 0.984375,
"step": 463
},
{
"completion_length": 1230.5,
"epoch": 0.030933333333333334,
"grad_norm": 251.4700927734375,
"kl": 19.125,
"learning_rate": 1e-06,
"loss": 0.764,
"reward": 0.4844090938568115,
"reward_std": 0.26258766651153564,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2655909061431885,
"rewards/tag_count_reward": 0.75,
"step": 464
},
{
"completion_length": 1297.0,
"epoch": 0.031,
"grad_norm": 151.19776916503906,
"kl": 11.375,
"learning_rate": 1e-06,
"loss": 0.4533,
"reward": 0.826347291469574,
"reward_std": 0.5434819459915161,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.07990271598100662,
"rewards/tag_count_reward": 0.65625,
"step": 465
},
{
"completion_length": 453.5,
"epoch": 0.031066666666666666,
"grad_norm": 65.11819458007812,
"kl": 5.0,
"learning_rate": 1e-06,
"loss": 0.2003,
"reward": 0.833638072013855,
"reward_std": 0.20467182993888855,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1507369577884674,
"rewards/tag_count_reward": 0.984375,
"step": 466
},
{
"completion_length": 1464.5,
"epoch": 0.031133333333333332,
"grad_norm": 121.26884460449219,
"kl": 8.5,
"learning_rate": 1e-06,
"loss": 0.3395,
"reward": 0.6761493682861328,
"reward_std": 0.1760469675064087,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2926006019115448,
"rewards/tag_count_reward": 0.96875,
"step": 467
},
{
"completion_length": 1127.5,
"epoch": 0.0312,
"grad_norm": 77.94124603271484,
"kl": 4.71875,
"learning_rate": 1e-06,
"loss": 0.1894,
"reward": 1.0224368572235107,
"reward_std": 0.4776157736778259,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.24318821728229523,
"rewards/tag_count_reward": 0.828125,
"step": 468
},
{
"completion_length": 1644.0,
"epoch": 0.031266666666666665,
"grad_norm": 76.69334411621094,
"kl": 3.75,
"learning_rate": 1e-06,
"loss": 0.1496,
"reward": 0.41094398498535156,
"reward_std": 0.30710700154304504,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24530604481697083,
"rewards/tag_count_reward": 0.65625,
"step": 469
},
{
"completion_length": 366.5,
"epoch": 0.03133333333333333,
"grad_norm": 38.32693862915039,
"kl": 2.671875,
"learning_rate": 1e-06,
"loss": 0.1065,
"reward": 1.4753447771072388,
"reward_std": 0.39756450057029724,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.14965522289276123,
"rewards/tag_count_reward": 1.0,
"step": 470
},
{
"completion_length": 1699.5,
"epoch": 0.0314,
"grad_norm": 140.01515197753906,
"kl": 0.2890625,
"learning_rate": 1e-06,
"loss": 0.0116,
"reward": 0.8597678542137146,
"reward_std": 0.2188059687614441,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1089821383357048,
"rewards/tag_count_reward": 0.96875,
"step": 471
},
{
"completion_length": 483.5,
"epoch": 0.031466666666666664,
"grad_norm": 4.880711555480957,
"kl": 2.078125,
"learning_rate": 1e-06,
"loss": 0.0831,
"reward": 0.916806161403656,
"reward_std": 0.40184321999549866,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.1925688534975052,
"rewards/tag_count_reward": 0.984375,
"step": 472
},
{
"completion_length": 1589.0,
"epoch": 0.03153333333333333,
"grad_norm": 248.47195434570312,
"kl": 1.765625,
"learning_rate": 1e-06,
"loss": 0.0704,
"reward": 0.28684836626052856,
"reward_std": 0.29903551936149597,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.30690163373947144,
"rewards/tag_count_reward": 0.59375,
"step": 473
},
{
"completion_length": 951.0,
"epoch": 0.0316,
"grad_norm": 13.834942817687988,
"kl": 0.259765625,
"learning_rate": 1e-06,
"loss": 0.0104,
"reward": 1.1863715648651123,
"reward_std": 0.26852947473526,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.2355034500360489,
"rewards/tag_count_reward": 0.984375,
"step": 474
},
{
"completion_length": 1147.0,
"epoch": 0.03166666666666667,
"grad_norm": 36.051719665527344,
"kl": 0.31640625,
"learning_rate": 1e-06,
"loss": 0.0127,
"reward": 0.7529718279838562,
"reward_std": 0.32631242275238037,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2782781720161438,
"rewards/tag_count_reward": 0.96875,
"step": 475
},
{
"completion_length": 2048.0,
"epoch": 0.031733333333333336,
"grad_norm": 15.62546443939209,
"kl": 1.671875,
"learning_rate": 1e-06,
"loss": 0.0671,
"reward": 0.7044357061386108,
"reward_std": 0.39865821599960327,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.35806435346603394,
"rewards/tag_count_reward": 0.9375,
"step": 476
},
{
"completion_length": 1054.5,
"epoch": 0.0318,
"grad_norm": 27.04443359375,
"kl": 0.55078125,
"learning_rate": 1e-06,
"loss": 0.0221,
"reward": 0.7698061466217041,
"reward_std": 0.17838923633098602,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2301938831806183,
"rewards/tag_count_reward": 0.9375,
"step": 477
},
{
"completion_length": 1930.0,
"epoch": 0.03186666666666667,
"grad_norm": 16.326013565063477,
"kl": 1.484375,
"learning_rate": 1e-06,
"loss": 0.0593,
"reward": 0.614019513130188,
"reward_std": 0.34445932507514954,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3391055464744568,
"rewards/tag_count_reward": 0.890625,
"step": 478
},
{
"completion_length": 1036.5,
"epoch": 0.031933333333333334,
"grad_norm": 23.876251220703125,
"kl": 0.369140625,
"learning_rate": 1e-06,
"loss": 0.0147,
"reward": 0.9702253341674805,
"reward_std": 0.34339791536331177,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.06102466583251953,
"rewards/tag_count_reward": 0.96875,
"step": 479
},
{
"completion_length": 653.5,
"epoch": 0.032,
"grad_norm": 29.604717254638672,
"kl": 2.3125,
"learning_rate": 1e-06,
"loss": 0.0923,
"reward": 1.1755359172821045,
"reward_std": 0.43715834617614746,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.2619641125202179,
"rewards/tag_count_reward": 1.0,
"step": 480
},
{
"completion_length": 1473.0,
"epoch": 0.03206666666666667,
"grad_norm": 65.69683074951172,
"kl": 3.71875,
"learning_rate": 1e-06,
"loss": 0.1485,
"reward": 0.2577321529388428,
"reward_std": 0.24535515904426575,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2735178470611572,
"rewards/tag_count_reward": 0.53125,
"step": 481
},
{
"completion_length": 1509.0,
"epoch": 0.03213333333333333,
"grad_norm": 67.13792419433594,
"kl": 2.71875,
"learning_rate": 1e-06,
"loss": 0.108,
"reward": 0.3775395154953003,
"reward_std": 0.2782382071018219,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2943354845046997,
"rewards/tag_count_reward": 0.671875,
"step": 482
},
{
"completion_length": 1161.0,
"epoch": 0.0322,
"grad_norm": 162.6373291015625,
"kl": 2.78125,
"learning_rate": 1e-06,
"loss": 0.1118,
"reward": 0.9634556174278259,
"reward_std": 0.40102633833885193,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.23966939747333527,
"rewards/tag_count_reward": 0.765625,
"step": 483
},
{
"completion_length": 951.0,
"epoch": 0.032266666666666666,
"grad_norm": 36.65139389038086,
"kl": 1.0546875,
"learning_rate": 1e-06,
"loss": 0.0422,
"reward": 1.1727631092071533,
"reward_std": 0.47382399439811707,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.12411190569400787,
"rewards/tag_count_reward": 0.859375,
"step": 484
},
{
"completion_length": 1401.0,
"epoch": 0.03233333333333333,
"grad_norm": 54.291259765625,
"kl": 0.439453125,
"learning_rate": 1e-06,
"loss": 0.0176,
"reward": 0.5725162625312805,
"reward_std": 0.21343138813972473,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3493587374687195,
"rewards/tag_count_reward": 0.921875,
"step": 485
},
{
"completion_length": 496.5,
"epoch": 0.0324,
"grad_norm": 63.599273681640625,
"kl": 0.8671875,
"learning_rate": 1e-06,
"loss": 0.0345,
"reward": 1.1292247772216797,
"reward_std": 0.4646027684211731,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.1364002227783203,
"rewards/tag_count_reward": 0.953125,
"step": 486
},
{
"completion_length": 1243.5,
"epoch": 0.032466666666666665,
"grad_norm": 116.7339096069336,
"kl": 0.73828125,
"learning_rate": 1e-06,
"loss": 0.0296,
"reward": 0.8797893524169922,
"reward_std": 0.483071506023407,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.24521055817604065,
"rewards/tag_count_reward": 0.9375,
"step": 487
},
{
"completion_length": 446.0,
"epoch": 0.03253333333333333,
"grad_norm": 35.85505676269531,
"kl": 0.2001953125,
"learning_rate": 1e-06,
"loss": 0.008,
"reward": 1.3932549953460693,
"reward_std": 0.26571404933929443,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.10674503445625305,
"rewards/tag_count_reward": 1.0,
"step": 488
},
{
"completion_length": 1034.5,
"epoch": 0.0326,
"grad_norm": 81.21532440185547,
"kl": 1.453125,
"learning_rate": 1e-06,
"loss": 0.0584,
"reward": 1.1355910301208496,
"reward_std": 0.5194476246833801,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.1612839698791504,
"rewards/tag_count_reward": 0.921875,
"step": 489
},
{
"completion_length": 994.0,
"epoch": 0.03266666666666666,
"grad_norm": 22.129722595214844,
"kl": 0.6484375,
"learning_rate": 1e-06,
"loss": 0.0259,
"reward": 1.3354647159576416,
"reward_std": 0.4170573949813843,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.03953530639410019,
"rewards/tag_count_reward": 0.9375,
"step": 490
},
{
"completion_length": 854.0,
"epoch": 0.032733333333333337,
"grad_norm": 27.246200561523438,
"kl": 2.53125,
"learning_rate": 1e-06,
"loss": 0.101,
"reward": 1.1956677436828613,
"reward_std": 0.24232608079910278,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.30433231592178345,
"rewards/tag_count_reward": 1.0,
"step": 491
},
{
"completion_length": 1069.0,
"epoch": 0.0328,
"grad_norm": 38.06834030151367,
"kl": 2.171875,
"learning_rate": 1e-06,
"loss": 0.087,
"reward": 0.6666276454925537,
"reward_std": 0.1747668981552124,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2552473545074463,
"rewards/tag_count_reward": 0.921875,
"step": 492
},
{
"completion_length": 939.0,
"epoch": 0.03286666666666667,
"grad_norm": 46.978397369384766,
"kl": 5.34375,
"learning_rate": 1e-06,
"loss": 0.2134,
"reward": 0.9970148205757141,
"reward_std": 0.41218793392181396,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.2529851794242859,
"rewards/tag_count_reward": 1.0,
"step": 493
},
{
"completion_length": 837.5,
"epoch": 0.032933333333333335,
"grad_norm": 16.7191219329834,
"kl": 1.734375,
"learning_rate": 1e-06,
"loss": 0.0697,
"reward": 1.2853515148162842,
"reward_std": 0.30677878856658936,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.1677735447883606,
"rewards/tag_count_reward": 0.953125,
"step": 494
},
{
"completion_length": 880.5,
"epoch": 0.033,
"grad_norm": 103.61772155761719,
"kl": 5.96875,
"learning_rate": 1e-06,
"loss": 0.2386,
"reward": 0.5835074186325073,
"reward_std": 0.4315335750579834,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2289925515651703,
"rewards/tag_count_reward": 0.75,
"step": 495
},
{
"completion_length": 1742.5,
"epoch": 0.03306666666666667,
"grad_norm": 152.2026824951172,
"kl": 11.0,
"learning_rate": 1e-06,
"loss": 0.4405,
"reward": 0.16734576225280762,
"reward_std": 0.21883532404899597,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.3326542377471924,
"rewards/tag_count_reward": 0.5,
"step": 496
},
{
"completion_length": 1249.0,
"epoch": 0.033133333333333334,
"grad_norm": 100.76626586914062,
"kl": 3.21875,
"learning_rate": 1e-06,
"loss": 0.1287,
"reward": 0.8781791925430298,
"reward_std": 0.39289015531539917,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.3249457776546478,
"rewards/tag_count_reward": 0.953125,
"step": 497
},
{
"completion_length": 1309.0,
"epoch": 0.0332,
"grad_norm": 23.7191162109375,
"kl": 2.65625,
"learning_rate": 1e-06,
"loss": 0.106,
"reward": 0.6000075936317444,
"reward_std": 0.27661073207855225,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21249239146709442,
"rewards/tag_count_reward": 0.8125,
"step": 498
},
{
"completion_length": 872.5,
"epoch": 0.03326666666666667,
"grad_norm": 205.3312225341797,
"kl": 3.890625,
"learning_rate": 1e-06,
"loss": 0.1562,
"reward": 0.5276904702186584,
"reward_std": 0.20058870315551758,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22230952978134155,
"rewards/tag_count_reward": 0.75,
"step": 499
},
{
"completion_length": 1393.5,
"epoch": 0.03333333333333333,
"grad_norm": 34.567054748535156,
"kl": 5.0,
"learning_rate": 1e-06,
"loss": 0.1995,
"reward": 0.3974929451942444,
"reward_std": 0.34111130237579346,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16500705480575562,
"rewards/tag_count_reward": 0.5625,
"step": 500
},
{
"completion_length": 915.5,
"epoch": 0.0334,
"grad_norm": 68.95124053955078,
"kl": 6.6875,
"learning_rate": 1e-06,
"loss": 0.2672,
"reward": 0.48827648162841797,
"reward_std": 0.2557675242424011,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23047351837158203,
"rewards/tag_count_reward": 0.71875,
"step": 501
},
{
"completion_length": 1179.0,
"epoch": 0.033466666666666665,
"grad_norm": 91.65396118164062,
"kl": 4.4375,
"learning_rate": 1e-06,
"loss": 0.1769,
"reward": 1.28905189037323,
"reward_std": 0.4957786798477173,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.19532307982444763,
"rewards/tag_count_reward": 0.921875,
"step": 502
},
{
"completion_length": 406.0,
"epoch": 0.03353333333333333,
"grad_norm": 34.242103576660156,
"kl": 2.84375,
"learning_rate": 1e-06,
"loss": 0.1132,
"reward": 1.4901211261749268,
"reward_std": 0.4295300543308258,
"rewards/accuracy_reward": 0.625,
"rewards/len_reward": -0.11925392597913742,
"rewards/tag_count_reward": 0.984375,
"step": 503
},
{
"completion_length": 1280.0,
"epoch": 0.0336,
"grad_norm": 70.93849182128906,
"kl": 1.4296875,
"learning_rate": 1e-06,
"loss": 0.0571,
"reward": 0.742384135723114,
"reward_std": 0.19301137328147888,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.17949089407920837,
"rewards/tag_count_reward": 0.921875,
"step": 504
},
{
"completion_length": 1247.5,
"epoch": 0.033666666666666664,
"grad_norm": 30.341981887817383,
"kl": 1.625,
"learning_rate": 1e-06,
"loss": 0.065,
"reward": 0.9364212155342102,
"reward_std": 0.376028835773468,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.17295382916927338,
"rewards/tag_count_reward": 0.671875,
"step": 505
},
{
"completion_length": 505.0,
"epoch": 0.03373333333333333,
"grad_norm": 40.31377029418945,
"kl": 2.109375,
"learning_rate": 1e-06,
"loss": 0.0841,
"reward": 0.802386999130249,
"reward_std": 0.32473599910736084,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1976129710674286,
"rewards/tag_count_reward": 0.9375,
"step": 506
},
{
"completion_length": 574.5,
"epoch": 0.0338,
"grad_norm": 43.77798080444336,
"kl": 0.5625,
"learning_rate": 1e-06,
"loss": 0.0225,
"reward": 0.8121465444564819,
"reward_std": 0.19152596592903137,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14097848534584045,
"rewards/tag_count_reward": 0.953125,
"step": 507
},
{
"completion_length": 1466.0,
"epoch": 0.03386666666666667,
"grad_norm": 63.456520080566406,
"kl": 2.1875,
"learning_rate": 1e-06,
"loss": 0.0877,
"reward": 0.5162305235862732,
"reward_std": 0.22059300541877747,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2962694764137268,
"rewards/tag_count_reward": 0.8125,
"step": 508
},
{
"completion_length": 2048.0,
"epoch": 0.033933333333333336,
"grad_norm": 54.614505767822266,
"kl": 1.96875,
"learning_rate": 1e-06,
"loss": 0.079,
"reward": 0.4008627235889435,
"reward_std": 0.2848988473415375,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2710123062133789,
"rewards/tag_count_reward": 0.671875,
"step": 509
},
{
"completion_length": 1095.0,
"epoch": 0.034,
"grad_norm": 41.41899490356445,
"kl": 1.0703125,
"learning_rate": 1e-06,
"loss": 0.0429,
"reward": 0.7443307042121887,
"reward_std": 0.28202205896377563,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.13066931068897247,
"rewards/tag_count_reward": 0.875,
"step": 510
},
{
"completion_length": 502.0,
"epoch": 0.03406666666666667,
"grad_norm": 166.49993896484375,
"kl": 1.015625,
"learning_rate": 1e-06,
"loss": 0.0406,
"reward": 1.0923429727554321,
"reward_std": 0.49536144733428955,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.2045319825410843,
"rewards/tag_count_reward": 0.984375,
"step": 511
},
{
"completion_length": 2048.0,
"epoch": 0.034133333333333335,
"grad_norm": 97.36649322509766,
"kl": 3.15625,
"learning_rate": 1e-06,
"loss": 0.1261,
"reward": 0.47508713603019714,
"reward_std": 0.2021721601486206,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27491286396980286,
"rewards/tag_count_reward": 0.75,
"step": 512
},
{
"completion_length": 1942.5,
"epoch": 0.0342,
"grad_norm": 43.3392448425293,
"kl": 3.890625,
"learning_rate": 1e-06,
"loss": 0.1556,
"reward": 0.3828532099723816,
"reward_std": 0.25550660490989685,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2421467900276184,
"rewards/tag_count_reward": 0.625,
"step": 513
},
{
"completion_length": 806.0,
"epoch": 0.03426666666666667,
"grad_norm": 24.938037872314453,
"kl": 5.375,
"learning_rate": 1e-06,
"loss": 0.2155,
"reward": 0.39125797152519226,
"reward_std": 0.3487168550491333,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21811704337596893,
"rewards/tag_count_reward": 0.609375,
"step": 514
},
{
"completion_length": 492.0,
"epoch": 0.034333333333333334,
"grad_norm": 234.27557373046875,
"kl": 12.25,
"learning_rate": 1e-06,
"loss": 0.4907,
"reward": 1.2218936681747437,
"reward_std": 0.3230869174003601,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.16873131692409515,
"rewards/tag_count_reward": 0.953125,
"step": 515
},
{
"completion_length": 1637.0,
"epoch": 0.0344,
"grad_norm": 41.388702392578125,
"kl": 6.78125,
"learning_rate": 1e-06,
"loss": 0.2706,
"reward": 0.5030000805854797,
"reward_std": 0.25090712308883667,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.35637491941452026,
"rewards/tag_count_reward": 0.859375,
"step": 516
},
{
"completion_length": 455.0,
"epoch": 0.034466666666666666,
"grad_norm": 106.74304962158203,
"kl": 8.625,
"learning_rate": 1e-06,
"loss": 0.3451,
"reward": 1.1737362146377563,
"reward_std": 0.458330363035202,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.15438883006572723,
"rewards/tag_count_reward": 0.890625,
"step": 517
},
{
"completion_length": 1055.0,
"epoch": 0.03453333333333333,
"grad_norm": 203.50892639160156,
"kl": 8.125,
"learning_rate": 1e-06,
"loss": 0.3261,
"reward": 0.9100844264030457,
"reward_std": 0.40734437108039856,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.29304054379463196,
"rewards/tag_count_reward": 0.890625,
"step": 518
},
{
"completion_length": 706.0,
"epoch": 0.0346,
"grad_norm": 75.58966064453125,
"kl": 6.90625,
"learning_rate": 1e-06,
"loss": 0.2768,
"reward": 0.48510849475860596,
"reward_std": 0.19676175713539124,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24926650524139404,
"rewards/tag_count_reward": 0.734375,
"step": 519
},
{
"completion_length": 936.0,
"epoch": 0.034666666666666665,
"grad_norm": 32.32265853881836,
"kl": 2.53125,
"learning_rate": 1e-06,
"loss": 0.101,
"reward": 0.5524517893791199,
"reward_std": 0.35804426670074463,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.18192319571971893,
"rewards/tag_count_reward": 0.734375,
"step": 520
},
{
"completion_length": 612.0,
"epoch": 0.03473333333333333,
"grad_norm": 51.99385070800781,
"kl": 2.3125,
"learning_rate": 1e-06,
"loss": 0.0924,
"reward": 0.9991387724876404,
"reward_std": 0.6344587802886963,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.11023623496294022,
"rewards/tag_count_reward": 0.921875,
"step": 521
},
{
"completion_length": 836.5,
"epoch": 0.0348,
"grad_norm": 54.89631271362305,
"kl": 1.9921875,
"learning_rate": 1e-06,
"loss": 0.0796,
"reward": 1.1614923477172852,
"reward_std": 0.4174689054489136,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.22913259267807007,
"rewards/tag_count_reward": 0.953125,
"step": 522
},
{
"completion_length": 946.5,
"epoch": 0.034866666666666664,
"grad_norm": 62.0288200378418,
"kl": 2.03125,
"learning_rate": 1e-06,
"loss": 0.0809,
"reward": 1.1289585828781128,
"reward_std": 0.3178820013999939,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.08979140222072601,
"rewards/tag_count_reward": 0.71875,
"step": 523
},
{
"completion_length": 598.0,
"epoch": 0.03493333333333333,
"grad_norm": 83.11853790283203,
"kl": 1.609375,
"learning_rate": 1e-06,
"loss": 0.0646,
"reward": 1.320982813835144,
"reward_std": 0.3107944130897522,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.17901712656021118,
"rewards/tag_count_reward": 1.0,
"step": 524
},
{
"completion_length": 1150.5,
"epoch": 0.035,
"grad_norm": 141.64254760742188,
"kl": 1.71875,
"learning_rate": 1e-06,
"loss": 0.0685,
"reward": 0.6263207793235779,
"reward_std": 0.19915956258773804,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2018042504787445,
"rewards/tag_count_reward": 0.828125,
"step": 525
},
{
"completion_length": 643.0,
"epoch": 0.03506666666666667,
"grad_norm": 52.120609283447266,
"kl": 2.3125,
"learning_rate": 1e-06,
"loss": 0.093,
"reward": 0.5838699340820312,
"reward_std": 0.2685253620147705,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22863011062145233,
"rewards/tag_count_reward": 0.8125,
"step": 526
},
{
"completion_length": 534.0,
"epoch": 0.035133333333333336,
"grad_norm": 8086.248046875,
"kl": 106.5,
"learning_rate": 1e-06,
"loss": 4.2705,
"reward": 0.9970443248748779,
"reward_std": 0.3563997149467468,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.23733067512512207,
"rewards/tag_count_reward": 0.984375,
"step": 527
},
{
"completion_length": 1125.0,
"epoch": 0.0352,
"grad_norm": 86.96722412109375,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.0864,
"reward": 0.6056922674179077,
"reward_std": 0.2982494533061981,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.23805776238441467,
"rewards/tag_count_reward": 0.84375,
"step": 528
},
{
"completion_length": 1553.5,
"epoch": 0.03526666666666667,
"grad_norm": 44.041141510009766,
"kl": 3.65625,
"learning_rate": 1e-06,
"loss": 0.1466,
"reward": 0.6474529504776001,
"reward_std": 0.37525323033332825,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2587970197200775,
"rewards/tag_count_reward": 0.84375,
"step": 529
},
{
"completion_length": 1848.5,
"epoch": 0.035333333333333335,
"grad_norm": 147.4516143798828,
"kl": 6.25,
"learning_rate": 1e-06,
"loss": 0.2493,
"reward": 0.2758605480194092,
"reward_std": 0.26490387320518494,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.31788942217826843,
"rewards/tag_count_reward": 0.59375,
"step": 530
},
{
"completion_length": 1511.5,
"epoch": 0.0354,
"grad_norm": 93.3603744506836,
"kl": 8.125,
"learning_rate": 1e-06,
"loss": 0.3259,
"reward": 0.455405056476593,
"reward_std": 0.23336824774742126,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.357094943523407,
"rewards/tag_count_reward": 0.8125,
"step": 531
},
{
"completion_length": 1739.5,
"epoch": 0.03546666666666667,
"grad_norm": 202.96273803710938,
"kl": 4.25,
"learning_rate": 1e-06,
"loss": 0.1705,
"reward": 0.9460640549659729,
"reward_std": 0.34083792567253113,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.2414359152317047,
"rewards/tag_count_reward": 0.875,
"step": 532
},
{
"completion_length": 2048.0,
"epoch": 0.03553333333333333,
"grad_norm": 100.5988998413086,
"kl": 3.28125,
"learning_rate": 1e-06,
"loss": 0.1312,
"reward": 0.28255584836006165,
"reward_std": 0.268098920583725,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.20181915163993835,
"rewards/tag_count_reward": 0.484375,
"step": 533
},
{
"completion_length": 594.0,
"epoch": 0.0356,
"grad_norm": 59.22998046875,
"kl": 4.5625,
"learning_rate": 1e-06,
"loss": 0.1823,
"reward": 0.9103326797485352,
"reward_std": 0.38058894872665405,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.27716729044914246,
"rewards/tag_count_reward": 1.0,
"step": 534
},
{
"completion_length": 424.5,
"epoch": 0.035666666666666666,
"grad_norm": 41.5837516784668,
"kl": 0.8046875,
"learning_rate": 1e-06,
"loss": 0.0322,
"reward": 1.3207809925079346,
"reward_std": 0.5327502489089966,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.08546902239322662,
"rewards/tag_count_reward": 0.90625,
"step": 535
},
{
"completion_length": 1182.5,
"epoch": 0.03573333333333333,
"grad_norm": 82.11896514892578,
"kl": 3.3125,
"learning_rate": 1e-06,
"loss": 0.1324,
"reward": 0.8846542239189148,
"reward_std": 0.6816003918647766,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.2247207760810852,
"rewards/tag_count_reward": 0.921875,
"step": 536
},
{
"completion_length": 626.0,
"epoch": 0.0358,
"grad_norm": 77.67766571044922,
"kl": 1.484375,
"learning_rate": 1e-06,
"loss": 0.0593,
"reward": 1.4307773113250732,
"reward_std": 0.6569211483001709,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.13172270357608795,
"rewards/tag_count_reward": 1.0,
"step": 537
},
{
"completion_length": 438.0,
"epoch": 0.035866666666666665,
"grad_norm": 41.48287582397461,
"kl": 0.89453125,
"learning_rate": 1e-06,
"loss": 0.0357,
"reward": 1.1362364292144775,
"reward_std": 0.4892275929450989,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.14501355588436127,
"rewards/tag_count_reward": 0.96875,
"step": 538
},
{
"completion_length": 670.0,
"epoch": 0.03593333333333333,
"grad_norm": 58.74188232421875,
"kl": 1.328125,
"learning_rate": 1e-06,
"loss": 0.0533,
"reward": 0.8157967329025269,
"reward_std": 0.4505951404571533,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.18420328199863434,
"rewards/tag_count_reward": 0.875,
"step": 539
},
{
"completion_length": 459.5,
"epoch": 0.036,
"grad_norm": 114.96290588378906,
"kl": 1.484375,
"learning_rate": 1e-06,
"loss": 0.0591,
"reward": 1.6360636949539185,
"reward_std": 0.5903237462043762,
"rewards/accuracy_reward": 0.75,
"rewards/len_reward": -0.11393626034259796,
"rewards/tag_count_reward": 1.0,
"step": 540
},
{
"completion_length": 372.5,
"epoch": 0.036066666666666664,
"grad_norm": 79.44792175292969,
"kl": 1.5078125,
"learning_rate": 1e-06,
"loss": 0.0602,
"reward": 0.9838788509368896,
"reward_std": 0.5077656507492065,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.14112117886543274,
"rewards/tag_count_reward": 0.875,
"step": 541
},
{
"completion_length": 493.0,
"epoch": 0.03613333333333334,
"grad_norm": 98.48267364501953,
"kl": 1.21875,
"learning_rate": 1e-06,
"loss": 0.0489,
"reward": 1.4731286764144897,
"reward_std": 0.6060855388641357,
"rewards/accuracy_reward": 0.5625,
"rewards/len_reward": -0.042496293783187866,
"rewards/tag_count_reward": 0.953125,
"step": 542
},
{
"completion_length": 808.5,
"epoch": 0.0362,
"grad_norm": 187.82749938964844,
"kl": 1.0,
"learning_rate": 1e-06,
"loss": 0.04,
"reward": 1.1587413549423218,
"reward_std": 0.3639298379421234,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.09125865995883942,
"rewards/tag_count_reward": 0.8125,
"step": 543
},
{
"completion_length": 1173.0,
"epoch": 0.03626666666666667,
"grad_norm": 268.3917236328125,
"kl": 2.03125,
"learning_rate": 1e-06,
"loss": 0.081,
"reward": 1.0823649168014526,
"reward_std": 0.4572429060935974,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.19888514280319214,
"rewards/tag_count_reward": 0.84375,
"step": 544
},
{
"completion_length": 1165.0,
"epoch": 0.036333333333333336,
"grad_norm": 211.13243103027344,
"kl": 1.2421875,
"learning_rate": 1e-06,
"loss": 0.0497,
"reward": 0.5632283687591553,
"reward_std": 0.22577616572380066,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.17114657163619995,
"rewards/tag_count_reward": 0.734375,
"step": 545
},
{
"completion_length": 729.0,
"epoch": 0.0364,
"grad_norm": 7.84573221206665,
"kl": 1.2265625,
"learning_rate": 1e-06,
"loss": 0.0489,
"reward": 0.7691013216972351,
"reward_std": 0.23391905426979065,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2465236932039261,
"rewards/tag_count_reward": 0.953125,
"step": 546
},
{
"completion_length": 756.5,
"epoch": 0.03646666666666667,
"grad_norm": 35.259178161621094,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.0864,
"reward": 0.5018031597137451,
"reward_std": 0.24412024021148682,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2169468104839325,
"rewards/tag_count_reward": 0.71875,
"step": 547
},
{
"completion_length": 1541.5,
"epoch": 0.036533333333333334,
"grad_norm": 33.61385726928711,
"kl": 1.8828125,
"learning_rate": 1e-06,
"loss": 0.075,
"reward": 0.5826287269592285,
"reward_std": 0.4832291007041931,
"rewards/accuracy_reward": 0.1875,
"rewards/len_reward": -0.1829962581396103,
"rewards/tag_count_reward": 0.578125,
"step": 548
},
{
"completion_length": 416.5,
"epoch": 0.0366,
"grad_norm": 414.3907470703125,
"kl": 1.1953125,
"learning_rate": 1e-06,
"loss": 0.0479,
"reward": 1.6081717014312744,
"reward_std": 0.761642575263977,
"rewards/accuracy_reward": 0.6875,
"rewards/len_reward": -0.04807830601930618,
"rewards/tag_count_reward": 0.96875,
"step": 549
},
{
"completion_length": 1300.5,
"epoch": 0.03666666666666667,
"grad_norm": 67.41233825683594,
"kl": 2.734375,
"learning_rate": 1e-06,
"loss": 0.1091,
"reward": 0.5682820677757263,
"reward_std": 0.3351229429244995,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.19734293222427368,
"rewards/tag_count_reward": 0.703125,
"step": 550
},
{
"completion_length": 459.0,
"epoch": 0.03673333333333333,
"grad_norm": 81.06475830078125,
"kl": 10.0,
"learning_rate": 1e-06,
"loss": 0.4002,
"reward": 0.8082788586616516,
"reward_std": 0.38192644715309143,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.16047115623950958,
"rewards/tag_count_reward": 0.90625,
"step": 551
},
{
"completion_length": 1919.0,
"epoch": 0.0368,
"grad_norm": 190.73329162597656,
"kl": 6.375,
"learning_rate": 1e-06,
"loss": 0.2547,
"reward": 0.4826861619949341,
"reward_std": 0.422154039144516,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3298138678073883,
"rewards/tag_count_reward": 0.75,
"step": 552
},
{
"completion_length": 395.0,
"epoch": 0.036866666666666666,
"grad_norm": 25.296106338500977,
"kl": 1.875,
"learning_rate": 1e-06,
"loss": 0.0747,
"reward": 1.1998919248580933,
"reward_std": 0.40744996070861816,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.14385807514190674,
"rewards/tag_count_reward": 0.96875,
"step": 553
},
{
"completion_length": 1485.5,
"epoch": 0.03693333333333333,
"grad_norm": 41.97356033325195,
"kl": 5.25,
"learning_rate": 1e-06,
"loss": 0.2101,
"reward": 1.2462836503982544,
"reward_std": 0.27198371291160583,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.2537163496017456,
"rewards/tag_count_reward": 1.0,
"step": 554
},
{
"completion_length": 669.0,
"epoch": 0.037,
"grad_norm": 60.60013961791992,
"kl": 5.4375,
"learning_rate": 1e-06,
"loss": 0.2175,
"reward": 0.7133312225341797,
"reward_std": 0.3600277304649353,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.22416874766349792,
"rewards/tag_count_reward": 0.8125,
"step": 555
},
{
"completion_length": 866.0,
"epoch": 0.037066666666666664,
"grad_norm": 283.12860107421875,
"kl": 1.25,
"learning_rate": 1e-06,
"loss": 0.05,
"reward": 0.7453024387359619,
"reward_std": 0.19285044074058533,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22344759106636047,
"rewards/tag_count_reward": 0.96875,
"step": 556
},
{
"completion_length": 835.0,
"epoch": 0.03713333333333333,
"grad_norm": 46.98485565185547,
"kl": 4.125,
"learning_rate": 1e-06,
"loss": 0.1644,
"reward": 0.6793530583381653,
"reward_std": 0.28530818223953247,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21127192676067352,
"rewards/tag_count_reward": 0.890625,
"step": 557
},
{
"completion_length": 314.0,
"epoch": 0.0372,
"grad_norm": 79.10865783691406,
"kl": 5.90625,
"learning_rate": 1e-06,
"loss": 0.2369,
"reward": 1.6027872562408447,
"reward_std": 0.4462547302246094,
"rewards/accuracy_reward": 0.75,
"rewards/len_reward": -0.1472126841545105,
"rewards/tag_count_reward": 1.0,
"step": 558
},
{
"completion_length": 630.5,
"epoch": 0.03726666666666666,
"grad_norm": 62.99226760864258,
"kl": 8.1875,
"learning_rate": 1e-06,
"loss": 0.3277,
"reward": 0.7975714206695557,
"reward_std": 0.3944025933742523,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.26492857933044434,
"rewards/tag_count_reward": 1.0,
"step": 559
},
{
"completion_length": 1226.5,
"epoch": 0.037333333333333336,
"grad_norm": 61.31340026855469,
"kl": 3.9375,
"learning_rate": 1e-06,
"loss": 0.1574,
"reward": 0.5960041284561157,
"reward_std": 0.20416629314422607,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.24774591624736786,
"rewards/tag_count_reward": 0.84375,
"step": 560
},
{
"completion_length": 1597.5,
"epoch": 0.0374,
"grad_norm": 98.62857818603516,
"kl": 1.3046875,
"learning_rate": 1e-06,
"loss": 0.0522,
"reward": 0.6311689019203186,
"reward_std": 0.23550403118133545,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.134456068277359,
"rewards/tag_count_reward": 0.765625,
"step": 561
},
{
"completion_length": 500.0,
"epoch": 0.03746666666666667,
"grad_norm": 95.38880157470703,
"kl": 1.234375,
"learning_rate": 1e-06,
"loss": 0.0495,
"reward": 0.9405548572540283,
"reward_std": 0.4175971746444702,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.12194512784481049,
"rewards/tag_count_reward": 0.9375,
"step": 562
},
{
"completion_length": 594.0,
"epoch": 0.037533333333333335,
"grad_norm": 99.58438873291016,
"kl": 1.1171875,
"learning_rate": 1e-06,
"loss": 0.0447,
"reward": 0.7160016298294067,
"reward_std": 0.16242673993110657,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.26837339997291565,
"rewards/tag_count_reward": 0.984375,
"step": 563
},
{
"completion_length": 511.0,
"epoch": 0.0376,
"grad_norm": 72.11625671386719,
"kl": 1.515625,
"learning_rate": 1e-06,
"loss": 0.0607,
"reward": 0.7955929040908813,
"reward_std": 0.3326777517795563,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.28253206610679626,
"rewards/tag_count_reward": 0.953125,
"step": 564
},
{
"completion_length": 993.0,
"epoch": 0.03766666666666667,
"grad_norm": 36.98861312866211,
"kl": 2.15625,
"learning_rate": 1e-06,
"loss": 0.086,
"reward": 0.7062109708786011,
"reward_std": 0.19238844513893127,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.29378896951675415,
"rewards/tag_count_reward": 1.0,
"step": 565
},
{
"completion_length": 1724.0,
"epoch": 0.037733333333333334,
"grad_norm": 56.63056564331055,
"kl": 1.078125,
"learning_rate": 1e-06,
"loss": 0.0432,
"reward": 0.919497013092041,
"reward_std": 0.3396682143211365,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.2836279571056366,
"rewards/tag_count_reward": 0.953125,
"step": 566
},
{
"completion_length": 1594.5,
"epoch": 0.0378,
"grad_norm": 66.82796478271484,
"kl": 1.7578125,
"learning_rate": 1e-06,
"loss": 0.0702,
"reward": 0.5739055275917053,
"reward_std": 0.25649651885032654,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.16046947240829468,
"rewards/tag_count_reward": 0.734375,
"step": 567
},
{
"completion_length": 1549.0,
"epoch": 0.037866666666666667,
"grad_norm": 92.17615509033203,
"kl": 1.7890625,
"learning_rate": 1e-06,
"loss": 0.0715,
"reward": 0.6099597811698914,
"reward_std": 0.2543640434741974,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.21816526353359222,
"rewards/tag_count_reward": 0.828125,
"step": 568
},
{
"completion_length": 1274.5,
"epoch": 0.03793333333333333,
"grad_norm": 60.14912796020508,
"kl": 1.2734375,
"learning_rate": 1e-06,
"loss": 0.051,
"reward": 0.7341867685317993,
"reward_std": 0.21600724756717682,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2033131867647171,
"rewards/tag_count_reward": 0.9375,
"step": 569
},
{
"completion_length": 1292.5,
"epoch": 0.038,
"grad_norm": 19.547534942626953,
"kl": 4.28125,
"learning_rate": 1e-06,
"loss": 0.1717,
"reward": 0.8135941624641418,
"reward_std": 0.40806543827056885,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.20203083753585815,
"rewards/tag_count_reward": 0.953125,
"step": 570
},
{
"completion_length": 1476.5,
"epoch": 0.038066666666666665,
"grad_norm": 55.32059097290039,
"kl": 2.03125,
"learning_rate": 1e-06,
"loss": 0.0817,
"reward": 0.6646462678909302,
"reward_std": 0.3652288317680359,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.3041037619113922,
"rewards/tag_count_reward": 0.84375,
"step": 571
},
{
"completion_length": 1136.5,
"epoch": 0.03813333333333333,
"grad_norm": 76.0602798461914,
"kl": 0.84765625,
"learning_rate": 1e-06,
"loss": 0.0339,
"reward": 0.7735757827758789,
"reward_std": 0.30608898401260376,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.2420491874217987,
"rewards/tag_count_reward": 0.953125,
"step": 572
},
{
"completion_length": 1641.0,
"epoch": 0.0382,
"grad_norm": 51.28430938720703,
"kl": 4.4375,
"learning_rate": 1e-06,
"loss": 0.178,
"reward": 0.5504544973373413,
"reward_std": 0.24401339888572693,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2776705026626587,
"rewards/tag_count_reward": 0.828125,
"step": 573
},
{
"completion_length": 1398.0,
"epoch": 0.038266666666666664,
"grad_norm": 27.796009063720703,
"kl": 2.34375,
"learning_rate": 1e-06,
"loss": 0.0941,
"reward": 0.7129076719284058,
"reward_std": 0.18738889694213867,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.27146732807159424,
"rewards/tag_count_reward": 0.984375,
"step": 574
},
{
"completion_length": 1515.0,
"epoch": 0.03833333333333333,
"grad_norm": 16.124732971191406,
"kl": 1.359375,
"learning_rate": 1e-06,
"loss": 0.0546,
"reward": 0.7237770557403564,
"reward_std": 0.2166043221950531,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.19809791445732117,
"rewards/tag_count_reward": 0.921875,
"step": 575
},
{
"completion_length": 1228.0,
"epoch": 0.0384,
"grad_norm": 83.39096069335938,
"kl": 2.75,
"learning_rate": 1e-06,
"loss": 0.1097,
"reward": 1.0372037887573242,
"reward_std": 0.4406978487968445,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.2596711814403534,
"rewards/tag_count_reward": 0.984375,
"step": 576
},
{
"completion_length": 1316.0,
"epoch": 0.03846666666666667,
"grad_norm": 43.13058853149414,
"kl": 1.625,
"learning_rate": 1e-06,
"loss": 0.0653,
"reward": 0.5740906000137329,
"reward_std": 0.34053653478622437,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3165343701839447,
"rewards/tag_count_reward": 0.828125,
"step": 577
},
{
"completion_length": 1529.0,
"epoch": 0.038533333333333336,
"grad_norm": 36.661808013916016,
"kl": 1.875,
"learning_rate": 1e-06,
"loss": 0.0752,
"reward": 0.4301720857620239,
"reward_std": 0.29682278633117676,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14795291423797607,
"rewards/tag_count_reward": 0.578125,
"step": 578
},
{
"completion_length": 1967.5,
"epoch": 0.0386,
"grad_norm": 72.27436065673828,
"kl": 2.53125,
"learning_rate": 1e-06,
"loss": 0.1015,
"reward": 0.5470969676971436,
"reward_std": 0.3998749256134033,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.18727803230285645,
"rewards/tag_count_reward": 0.671875,
"step": 579
},
{
"completion_length": 376.5,
"epoch": 0.03866666666666667,
"grad_norm": 40.38228988647461,
"kl": 0.98828125,
"learning_rate": 1e-06,
"loss": 0.0396,
"reward": 1.3723690509796143,
"reward_std": 0.4802526831626892,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.12763085961341858,
"rewards/tag_count_reward": 1.0,
"step": 580
},
{
"completion_length": 1755.5,
"epoch": 0.038733333333333335,
"grad_norm": 105.07771301269531,
"kl": 4.25,
"learning_rate": 1e-06,
"loss": 0.1703,
"reward": 0.4931444525718689,
"reward_std": 0.33035823702812195,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.3037305474281311,
"rewards/tag_count_reward": 0.734375,
"step": 581
},
{
"completion_length": 1228.0,
"epoch": 0.0388,
"grad_norm": 174.9635467529297,
"kl": 2.78125,
"learning_rate": 1e-06,
"loss": 0.1113,
"reward": 0.9252910614013672,
"reward_std": 0.5327416062355042,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.26220887899398804,
"rewards/tag_count_reward": 0.9375,
"step": 582
},
{
"completion_length": 1432.0,
"epoch": 0.03886666666666667,
"grad_norm": 66.7169418334961,
"kl": 4.4375,
"learning_rate": 1e-06,
"loss": 0.1769,
"reward": 0.6800612211227417,
"reward_std": 0.22471702098846436,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.2886887788772583,
"rewards/tag_count_reward": 0.96875,
"step": 583
},
{
"completion_length": 1371.5,
"epoch": 0.038933333333333334,
"grad_norm": 167.17843627929688,
"kl": 1.3359375,
"learning_rate": 1e-06,
"loss": 0.0533,
"reward": 0.8494129180908203,
"reward_std": 0.1625041514635086,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1193370521068573,
"rewards/tag_count_reward": 0.96875,
"step": 584
},
{
"completion_length": 791.5,
"epoch": 0.039,
"grad_norm": 50.34352493286133,
"kl": 2.21875,
"learning_rate": 1e-06,
"loss": 0.0892,
"reward": 0.8169752359390259,
"reward_std": 0.2861425578594208,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.22989973425865173,
"rewards/tag_count_reward": 0.984375,
"step": 585
},
{
"completion_length": 1327.0,
"epoch": 0.039066666666666666,
"grad_norm": 114.80569458007812,
"kl": 1.3125,
"learning_rate": 1e-06,
"loss": 0.0525,
"reward": 0.978561520576477,
"reward_std": 0.35342609882354736,
"rewards/accuracy_reward": 0.3125,
"rewards/len_reward": -0.31831347942352295,
"rewards/tag_count_reward": 0.984375,
"step": 586
},
{
"completion_length": 576.0,
"epoch": 0.03913333333333333,
"grad_norm": 31.026878356933594,
"kl": 1.3125,
"learning_rate": 1e-06,
"loss": 0.0524,
"reward": 0.7971070408821106,
"reward_std": 0.31213486194610596,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.1716429740190506,
"rewards/tag_count_reward": 0.90625,
"step": 587
},
{
"completion_length": 544.5,
"epoch": 0.0392,
"grad_norm": 72.87944793701172,
"kl": 1.125,
"learning_rate": 1e-06,
"loss": 0.0449,
"reward": 1.0887908935546875,
"reward_std": 0.2934584617614746,
"rewards/accuracy_reward": 0.4375,
"rewards/len_reward": -0.1924591362476349,
"rewards/tag_count_reward": 0.84375,
"step": 588
},
{
"completion_length": 1214.0,
"epoch": 0.039266666666666665,
"grad_norm": 26.02140235900879,
"kl": 2.28125,
"learning_rate": 1e-06,
"loss": 0.0911,
"reward": 1.1631091833114624,
"reward_std": 0.526539146900177,
"rewards/accuracy_reward": 0.375,
"rewards/len_reward": -0.008765812031924725,
"rewards/tag_count_reward": 0.796875,
"step": 589
},
{
"completion_length": 999.0,
"epoch": 0.03933333333333333,
"grad_norm": 101.99327850341797,
"kl": 1.765625,
"learning_rate": 1e-06,
"loss": 0.0705,
"reward": 0.5965220332145691,
"reward_std": 0.23640874028205872,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.1847279667854309,
"rewards/tag_count_reward": 0.78125,
"step": 590
},
{
"completion_length": 797.5,
"epoch": 0.0394,
"grad_norm": 129.76966857910156,
"kl": 2.390625,
"learning_rate": 1e-06,
"loss": 0.0952,
"reward": 1.2040002346038818,
"reward_std": 0.31418269872665405,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.202249675989151,
"rewards/tag_count_reward": 0.90625,
"step": 591
},
{
"completion_length": 692.5,
"epoch": 0.039466666666666664,
"grad_norm": 79.64732360839844,
"kl": 1.8125,
"learning_rate": 1e-06,
"loss": 0.0726,
"reward": 0.5536153316497803,
"reward_std": 0.21468180418014526,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.14950962364673615,
"rewards/tag_count_reward": 0.703125,
"step": 592
},
{
"completion_length": 1234.5,
"epoch": 0.03953333333333333,
"grad_norm": 217.17437744140625,
"kl": 2.46875,
"learning_rate": 1e-06,
"loss": 0.0983,
"reward": 0.7479342222213745,
"reward_std": 0.18500711023807526,
"rewards/accuracy_reward": 0.0,
"rewards/len_reward": -0.22081580758094788,
"rewards/tag_count_reward": 0.96875,
"step": 593
},
{
"completion_length": 1058.5,
"epoch": 0.0396,
"grad_norm": 24.286544799804688,
"kl": 2.828125,
"learning_rate": 1e-06,
"loss": 0.1131,
"reward": 0.7602176666259766,
"reward_std": 0.3844819664955139,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.19290727376937866,
"rewards/tag_count_reward": 0.890625,
"step": 594
},
{
"completion_length": 1172.5,
"epoch": 0.03966666666666667,
"grad_norm": 67.3233413696289,
"kl": 6.34375,
"learning_rate": 1e-06,
"loss": 0.2544,
"reward": 0.7233742475509644,
"reward_std": 0.49401748180389404,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.13600073754787445,
"rewards/tag_count_reward": 0.796875,
"step": 595
},
{
"completion_length": 259.0,
"epoch": 0.039733333333333336,
"grad_norm": 26.167888641357422,
"kl": 1.3046875,
"learning_rate": 1e-06,
"loss": 0.0522,
"reward": 1.4047433137893677,
"reward_std": 0.27290037274360657,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.09525654464960098,
"rewards/tag_count_reward": 1.0,
"step": 596
},
{
"completion_length": 1156.0,
"epoch": 0.0398,
"grad_norm": 68.4002685546875,
"kl": 6.1875,
"learning_rate": 1e-06,
"loss": 0.248,
"reward": 0.8152648210525513,
"reward_std": 0.3991209864616394,
"rewards/accuracy_reward": 0.125,
"rewards/len_reward": -0.27848517894744873,
"rewards/tag_count_reward": 0.96875,
"step": 597
},
{
"completion_length": 707.0,
"epoch": 0.03986666666666667,
"grad_norm": 75.64336395263672,
"kl": 3.515625,
"learning_rate": 1e-06,
"loss": 0.1403,
"reward": 0.9840830564498901,
"reward_std": 0.4521138668060303,
"rewards/accuracy_reward": 0.25,
"rewards/len_reward": -0.2190418690443039,
"rewards/tag_count_reward": 0.953125,
"step": 598
},
{
"completion_length": 1399.5,
"epoch": 0.039933333333333335,
"grad_norm": 49.26580047607422,
"kl": 3.6875,
"learning_rate": 1e-06,
"loss": 0.1476,
"reward": 0.6098353862762451,
"reward_std": 0.25023001432418823,
"rewards/accuracy_reward": 0.0625,
"rewards/len_reward": -0.29641464352607727,
"rewards/tag_count_reward": 0.84375,
"step": 599
},
{
"completion_length": 1176.0,
"epoch": 0.04,
"grad_norm": 48.72330093383789,
"kl": 3.90625,
"learning_rate": 1e-06,
"loss": 0.1564,
"reward": 1.256005048751831,
"reward_std": 0.3112409710884094,
"rewards/accuracy_reward": 0.5,
"rewards/len_reward": -0.11899499595165253,
"rewards/tag_count_reward": 0.875,
"step": 600
},
{
"epoch": 0.04,
"step": 600,
"total_flos": 0.0,
"train_loss": 0.15968478212249465,
"train_runtime": 15013.7279,
"train_samples_per_second": 0.639,
"train_steps_per_second": 0.04
}
],
"logging_steps": 1,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}