Files
Qwen2.5-7B-BLEUBERI/trainer_state.json
ModelHub XC 1b513f59c8 初始化项目,由ModelHub XC社区提供模型
Model: yapeichang/Qwen2.5-7B-BLEUBERI
Source: Original Platform
2026-04-22 11:24:02 +08:00

31285 lines
1.1 MiB

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 262.625,
"completions/mean_terminated_length": 179.5,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0008,
"grad_norm": 5.598288059234619,
"kl": 0.0005154609680175781,
"learning_rate": 1.5873015873015872e-08,
"loss": 0.0537,
"num_tokens": 15100.0,
"reward": 0.04846250265836716,
"reward_std": 0.06843117624521255,
"rewards/bleu_reward_func/mean": 0.04846250265836716,
"rewards/bleu_reward_func/std": 0.07639143615961075,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 306.0,
"completions/mean_length": 248.09375,
"completions/mean_terminated_length": 128.13636779785156,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.0016,
"grad_norm": 7.323095321655273,
"kl": 0.0005979537963867188,
"learning_rate": 3.1746031746031744e-08,
"loss": 0.2393,
"num_tokens": 31479.0,
"reward": 0.03515050560235977,
"reward_std": 0.0315697155892849,
"rewards/bleu_reward_func/mean": 0.03515050560235977,
"rewards/bleu_reward_func/std": 0.048244670033454895,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 258.34375,
"completions/mean_terminated_length": 159.0869598388672,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0024,
"grad_norm": 5.801818370819092,
"kl": 0.0008335113525390625,
"learning_rate": 4.7619047619047613e-08,
"loss": 0.2227,
"num_tokens": 47330.0,
"reward": 0.0770750418305397,
"reward_std": 0.05211775749921799,
"rewards/bleu_reward_func/mean": 0.0770750418305397,
"rewards/bleu_reward_func/std": 0.07082299888134003,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 285.59375,
"completions/mean_terminated_length": 197.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0032,
"grad_norm": 6.7342329025268555,
"kl": 0.0007953643798828125,
"learning_rate": 6.349206349206349e-08,
"loss": 0.1714,
"num_tokens": 62101.0,
"reward": 0.05630416050553322,
"reward_std": 0.0387054979801178,
"rewards/bleu_reward_func/mean": 0.05630416050553322,
"rewards/bleu_reward_func/std": 0.05173136293888092,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 312.75,
"completions/mean_terminated_length": 208.38095092773438,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.004,
"grad_norm": 4.261541843414307,
"kl": 0.0007123947143554688,
"learning_rate": 7.936507936507936e-08,
"loss": 0.0096,
"num_tokens": 74629.0,
"reward": 0.03661263734102249,
"reward_std": 0.02765350043773651,
"rewards/bleu_reward_func/mean": 0.03661263734102249,
"rewards/bleu_reward_func/std": 0.05122661218047142,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 269.0625,
"completions/mean_terminated_length": 80.11111450195312,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0048,
"grad_norm": 25.550201416015625,
"kl": 0.000881195068359375,
"learning_rate": 9.523809523809523e-08,
"loss": -0.1788,
"num_tokens": 91711.0,
"reward": 0.01917407289147377,
"reward_std": 0.014019257389008999,
"rewards/bleu_reward_func/mean": 0.01917407289147377,
"rewards/bleu_reward_func/std": 0.024173468351364136,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 377.8125,
"completions/mean_terminated_length": 259.4117736816406,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0056,
"grad_norm": 3.7816717624664307,
"kl": 0.0007390975952148438,
"learning_rate": 1.111111111111111e-07,
"loss": -0.2289,
"num_tokens": 107369.0,
"reward": 0.02209433726966381,
"reward_std": 0.011734157800674438,
"rewards/bleu_reward_func/mean": 0.02209433726966381,
"rewards/bleu_reward_func/std": 0.023080473765730858,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 283.5625,
"completions/mean_terminated_length": 146.5,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0064,
"grad_norm": 4.340329647064209,
"kl": 0.000911712646484375,
"learning_rate": 1.2698412698412698e-07,
"loss": -0.0252,
"num_tokens": 125275.0,
"reward": 0.03392016887664795,
"reward_std": 0.04013249650597572,
"rewards/bleu_reward_func/mean": 0.03392016887664795,
"rewards/bleu_reward_func/std": 0.05353143438696861,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 282.09375,
"completions/mean_terminated_length": 192.13043212890625,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0072,
"grad_norm": 5.671853542327881,
"kl": 0.000640869140625,
"learning_rate": 1.4285714285714285e-07,
"loss": -0.4792,
"num_tokens": 142190.0,
"reward": 0.02354184165596962,
"reward_std": 0.015565130859613419,
"rewards/bleu_reward_func/mean": 0.02354184165596962,
"rewards/bleu_reward_func/std": 0.02305246703326702,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 421.625,
"completions/mean_terminated_length": 359.78948974609375,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.008,
"grad_norm": 3.240866184234619,
"kl": 0.0006823539733886719,
"learning_rate": 1.5873015873015872e-07,
"loss": -0.0021,
"num_tokens": 158282.0,
"reward": 0.02482026070356369,
"reward_std": 0.0131409652531147,
"rewards/bleu_reward_func/mean": 0.02482026070356369,
"rewards/bleu_reward_func/std": 0.015270248055458069,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 345.90625,
"completions/mean_terminated_length": 199.35293579101562,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0088,
"grad_norm": 3.652275800704956,
"kl": 0.0006260871887207031,
"learning_rate": 1.7460317460317458e-07,
"loss": -0.2852,
"num_tokens": 177455.0,
"reward": 0.03390186280012131,
"reward_std": 0.016770539805293083,
"rewards/bleu_reward_func/mean": 0.03390186280012131,
"rewards/bleu_reward_func/std": 0.04328485205769539,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 275.46875,
"completions/mean_terminated_length": 196.625,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0096,
"grad_norm": 73.21807098388672,
"kl": 0.0032701492309570312,
"learning_rate": 1.9047619047619045e-07,
"loss": 0.0661,
"num_tokens": 189486.0,
"reward": 0.022345196455717087,
"reward_std": 0.019753258675336838,
"rewards/bleu_reward_func/mean": 0.022345196455717087,
"rewards/bleu_reward_func/std": 0.020975911989808083,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 309.90625,
"completions/mean_terminated_length": 188.65000915527344,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0104,
"grad_norm": 5.3057379722595215,
"kl": 0.0005865097045898438,
"learning_rate": 2.0634920634920632e-07,
"loss": -0.1972,
"num_tokens": 203691.0,
"reward": 0.031099505722522736,
"reward_std": 0.04415294528007507,
"rewards/bleu_reward_func/mean": 0.031099505722522736,
"rewards/bleu_reward_func/std": 0.05319083109498024,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 211.5625,
"completions/mean_terminated_length": 127.43999481201172,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0112,
"grad_norm": 10.997786521911621,
"kl": 0.0007891654968261719,
"learning_rate": 2.222222222222222e-07,
"loss": 0.005,
"num_tokens": 220117.0,
"reward": 0.07334433495998383,
"reward_std": 0.05255947634577751,
"rewards/bleu_reward_func/mean": 0.07334433495998383,
"rewards/bleu_reward_func/std": 0.11127088218927383,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 322.71875,
"completions/mean_terminated_length": 175.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.012,
"grad_norm": 4.558916091918945,
"kl": 0.000732421875,
"learning_rate": 2.3809523809523806e-07,
"loss": -0.1845,
"num_tokens": 232508.0,
"reward": 0.01538888644427061,
"reward_std": 0.012768322601914406,
"rewards/bleu_reward_func/mean": 0.01538888644427061,
"rewards/bleu_reward_func/std": 0.01415330171585083,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 266.78125,
"completions/mean_terminated_length": 138.33334350585938,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0128,
"grad_norm": 4.418691158294678,
"kl": 0.0009031295776367188,
"learning_rate": 2.5396825396825396e-07,
"loss": 0.2931,
"num_tokens": 246325.0,
"reward": 0.04519380256533623,
"reward_std": 0.047629594802856445,
"rewards/bleu_reward_func/mean": 0.04519380256533623,
"rewards/bleu_reward_func/std": 0.09796681255102158,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 375.375,
"completions/mean_terminated_length": 238.75,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0136,
"grad_norm": 4.360379219055176,
"kl": 0.0007829666137695312,
"learning_rate": 2.698412698412698e-07,
"loss": 0.0392,
"num_tokens": 262393.0,
"reward": 0.02785748988389969,
"reward_std": 0.02370397374033928,
"rewards/bleu_reward_func/mean": 0.02785748988389969,
"rewards/bleu_reward_func/std": 0.031648874282836914,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 341.625,
"completions/mean_terminated_length": 171.25,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0144,
"grad_norm": 4.028530597686768,
"kl": 0.0006041526794433594,
"learning_rate": 2.857142857142857e-07,
"loss": -0.0867,
"num_tokens": 276509.0,
"reward": 0.03313319757580757,
"reward_std": 0.026780985295772552,
"rewards/bleu_reward_func/mean": 0.03313319757580757,
"rewards/bleu_reward_func/std": 0.03177988529205322,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 323.78125,
"completions/mean_terminated_length": 250.13043212890625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.0152,
"grad_norm": 6.029330253601074,
"kl": 0.0007543563842773438,
"learning_rate": 3.0158730158730156e-07,
"loss": 0.2177,
"num_tokens": 288774.0,
"reward": 0.04934918135404587,
"reward_std": 0.035659849643707275,
"rewards/bleu_reward_func/mean": 0.04934918135404587,
"rewards/bleu_reward_func/std": 0.046043358743190765,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 365.5625,
"completions/mean_terminated_length": 251.6666717529297,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.016,
"grad_norm": 3.2459499835968018,
"kl": 0.0007076263427734375,
"learning_rate": 3.1746031746031743e-07,
"loss": -0.1034,
"num_tokens": 302384.0,
"reward": 0.045273810625076294,
"reward_std": 0.033148057758808136,
"rewards/bleu_reward_func/mean": 0.045273810625076294,
"rewards/bleu_reward_func/std": 0.05641715228557587,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 309.4375,
"completions/mean_terminated_length": 170.84210205078125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0168,
"grad_norm": 3.209543228149414,
"kl": 0.0006561279296875,
"learning_rate": 3.333333333333333e-07,
"loss": -0.0094,
"num_tokens": 317406.0,
"reward": 0.10972930490970612,
"reward_std": 0.09467534720897675,
"rewards/bleu_reward_func/mean": 0.10972930490970612,
"rewards/bleu_reward_func/std": 0.1834246814250946,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 276.4375,
"completions/mean_terminated_length": 169.3636474609375,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.0176,
"grad_norm": 6.837025165557861,
"kl": 0.0008249282836914062,
"learning_rate": 3.4920634920634917e-07,
"loss": 0.192,
"num_tokens": 331436.0,
"reward": 0.08987575769424438,
"reward_std": 0.03435216099023819,
"rewards/bleu_reward_func/mean": 0.08987575769424438,
"rewards/bleu_reward_func/std": 0.13043095171451569,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 392.46875,
"completions/mean_terminated_length": 272.9375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.0184,
"grad_norm": 6.737916946411133,
"kl": 0.00086212158203125,
"learning_rate": 3.6507936507936504e-07,
"loss": -0.0441,
"num_tokens": 349715.0,
"reward": 0.027110569179058075,
"reward_std": 0.01938316598534584,
"rewards/bleu_reward_func/mean": 0.027110569179058075,
"rewards/bleu_reward_func/std": 0.021934401243925095,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 274.65625,
"completions/mean_terminated_length": 230.70370483398438,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0192,
"grad_norm": 10.491765022277832,
"kl": 0.0007648468017578125,
"learning_rate": 3.809523809523809e-07,
"loss": 0.269,
"num_tokens": 360336.0,
"reward": 0.03281049802899361,
"reward_std": 0.023013217374682426,
"rewards/bleu_reward_func/mean": 0.03281049802899361,
"rewards/bleu_reward_func/std": 0.026025522500276566,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 269.59375,
"completions/mean_terminated_length": 103.7368392944336,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.02,
"grad_norm": 5.670685291290283,
"kl": 0.0008592605590820312,
"learning_rate": 3.968253968253968e-07,
"loss": 0.2917,
"num_tokens": 374179.0,
"reward": 0.04281582683324814,
"reward_std": 0.0440773144364357,
"rewards/bleu_reward_func/mean": 0.04281582683324814,
"rewards/bleu_reward_func/std": 0.0797559842467308,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 296.96875,
"completions/mean_terminated_length": 184.33334350585938,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0208,
"grad_norm": 6.63213586807251,
"kl": 0.0011281967163085938,
"learning_rate": 4.1269841269841265e-07,
"loss": 0.0991,
"num_tokens": 386458.0,
"reward": 0.07768785208463669,
"reward_std": 0.08760131150484085,
"rewards/bleu_reward_func/mean": 0.07768785208463669,
"rewards/bleu_reward_func/std": 0.12583571672439575,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 280.4375,
"completions/mean_terminated_length": 175.18182373046875,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.0216,
"grad_norm": 5.1314802169799805,
"kl": 0.0008611679077148438,
"learning_rate": 4.285714285714285e-07,
"loss": 0.2129,
"num_tokens": 399600.0,
"reward": 0.034803349524736404,
"reward_std": 0.033125463873147964,
"rewards/bleu_reward_func/mean": 0.034803349524736404,
"rewards/bleu_reward_func/std": 0.04297792166471481,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 403.625,
"completions/mean_terminated_length": 196.72727966308594,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0224,
"grad_norm": 2.8215885162353516,
"kl": 0.0007638931274414062,
"learning_rate": 4.444444444444444e-07,
"loss": -0.2953,
"num_tokens": 415372.0,
"reward": 0.02452818863093853,
"reward_std": 0.018821807578206062,
"rewards/bleu_reward_func/mean": 0.02452818863093853,
"rewards/bleu_reward_func/std": 0.03300207853317261,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 213.84375,
"completions/mean_terminated_length": 114.45833587646484,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0232,
"grad_norm": 7.012094020843506,
"kl": 0.0006189346313476562,
"learning_rate": 4.6031746031746025e-07,
"loss": 0.119,
"num_tokens": 428351.0,
"reward": 0.055403269827365875,
"reward_std": 0.06412488222122192,
"rewards/bleu_reward_func/mean": 0.055403269827365875,
"rewards/bleu_reward_func/std": 0.07173087447881699,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 374.28125,
"completions/mean_terminated_length": 291.6499938964844,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.024,
"grad_norm": 5.251861095428467,
"kl": 0.0007734298706054688,
"learning_rate": 4.761904761904761e-07,
"loss": 0.0396,
"num_tokens": 443856.0,
"reward": 0.033150382339954376,
"reward_std": 0.029685020446777344,
"rewards/bleu_reward_func/mean": 0.033150382339954376,
"rewards/bleu_reward_func/std": 0.04449395835399628,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 352.6875,
"completions/mean_terminated_length": 212.11764526367188,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0248,
"grad_norm": 2.8306992053985596,
"kl": 0.0007534027099609375,
"learning_rate": 4.92063492063492e-07,
"loss": 0.0386,
"num_tokens": 458846.0,
"reward": 0.07098191231489182,
"reward_std": 0.07976502180099487,
"rewards/bleu_reward_func/mean": 0.07098191231489182,
"rewards/bleu_reward_func/std": 0.13301755487918854,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 314.96875,
"completions/mean_terminated_length": 249.2916717529297,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0256,
"grad_norm": 4.18798303604126,
"kl": 0.0013475418090820312,
"learning_rate": 5.079365079365079e-07,
"loss": 0.1184,
"num_tokens": 475693.0,
"reward": 0.06003670394420624,
"reward_std": 0.04762943834066391,
"rewards/bleu_reward_func/mean": 0.06003670394420624,
"rewards/bleu_reward_func/std": 0.06799852848052979,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 433.0,
"completions/mean_length": 246.09375,
"completions/mean_terminated_length": 157.45834350585938,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0264,
"grad_norm": 7.522784233093262,
"kl": 0.0013055801391601562,
"learning_rate": 5.238095238095238e-07,
"loss": 0.2574,
"num_tokens": 489632.0,
"reward": 0.035463202744722366,
"reward_std": 0.02683849260210991,
"rewards/bleu_reward_func/mean": 0.035463202744722366,
"rewards/bleu_reward_func/std": 0.05300255864858627,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 281.40625,
"completions/mean_terminated_length": 191.17391967773438,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0272,
"grad_norm": 4.312166213989258,
"kl": 0.0016279220581054688,
"learning_rate": 5.396825396825396e-07,
"loss": 0.0276,
"num_tokens": 503221.0,
"reward": 0.036928486078977585,
"reward_std": 0.030746515840291977,
"rewards/bleu_reward_func/mean": 0.036928486078977585,
"rewards/bleu_reward_func/std": 0.041675370186567307,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 363.0,
"completions/mean_terminated_length": 231.5294189453125,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.028,
"grad_norm": 5.934871196746826,
"kl": 0.001247406005859375,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0007,
"num_tokens": 519629.0,
"reward": 0.02279968000948429,
"reward_std": 0.0171576626598835,
"rewards/bleu_reward_func/mean": 0.02279968000948429,
"rewards/bleu_reward_func/std": 0.02809896320104599,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 245.125,
"completions/mean_terminated_length": 227.33334350585938,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0288,
"grad_norm": 6.152184963226318,
"kl": 0.00135040283203125,
"learning_rate": 5.714285714285714e-07,
"loss": 0.1277,
"num_tokens": 531009.0,
"reward": 0.08614860475063324,
"reward_std": 0.05592390149831772,
"rewards/bleu_reward_func/mean": 0.08614860475063324,
"rewards/bleu_reward_func/std": 0.07292494177818298,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 315.46875,
"completions/mean_terminated_length": 226.13636779785156,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.0296,
"grad_norm": 4.999361991882324,
"kl": 0.0010061264038085938,
"learning_rate": 5.873015873015873e-07,
"loss": -0.1945,
"num_tokens": 553904.0,
"reward": 0.022978566586971283,
"reward_std": 0.0320000983774662,
"rewards/bleu_reward_func/mean": 0.022978566586971283,
"rewards/bleu_reward_func/std": 0.05384916067123413,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 255.125,
"completions/mean_terminated_length": 195.84616088867188,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0304,
"grad_norm": 12.462119102478027,
"kl": 0.00140380859375,
"learning_rate": 6.031746031746031e-07,
"loss": -0.0499,
"num_tokens": 569980.0,
"reward": 0.06601191312074661,
"reward_std": 0.06571432948112488,
"rewards/bleu_reward_func/mean": 0.06601191312074661,
"rewards/bleu_reward_func/std": 0.11037519574165344,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 350.78125,
"completions/mean_terminated_length": 287.6956481933594,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0312,
"grad_norm": 4.318572044372559,
"kl": 0.0013608932495117188,
"learning_rate": 6.19047619047619e-07,
"loss": 0.1581,
"num_tokens": 584165.0,
"reward": 0.03686396777629852,
"reward_std": 0.00873212143778801,
"rewards/bleu_reward_func/mean": 0.03686396777629852,
"rewards/bleu_reward_func/std": 0.03987700119614601,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 395.40625,
"completions/mean_terminated_length": 315.631591796875,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.032,
"grad_norm": 2.8483028411865234,
"kl": 0.0015163421630859375,
"learning_rate": 6.349206349206349e-07,
"loss": 0.2409,
"num_tokens": 599602.0,
"reward": 0.012605215422809124,
"reward_std": 0.007717709057033062,
"rewards/bleu_reward_func/mean": 0.012605215422809124,
"rewards/bleu_reward_func/std": 0.008546828292310238,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 306.46875,
"completions/mean_terminated_length": 125.11764526367188,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.0328,
"grad_norm": 15.546673774719238,
"kl": 0.0033998489379882812,
"learning_rate": 6.507936507936507e-07,
"loss": 0.2262,
"num_tokens": 617761.0,
"reward": 0.037311654537916183,
"reward_std": 0.04001215100288391,
"rewards/bleu_reward_func/mean": 0.037311654537916183,
"rewards/bleu_reward_func/std": 0.05116492509841919,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 308.375,
"completions/mean_terminated_length": 77.60000610351562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0336,
"grad_norm": 9.35763168334961,
"kl": 0.0030651092529296875,
"learning_rate": 6.666666666666666e-07,
"loss": 0.1783,
"num_tokens": 637541.0,
"reward": 0.06710080057382584,
"reward_std": 0.0418785884976387,
"rewards/bleu_reward_func/mean": 0.06710080057382584,
"rewards/bleu_reward_func/std": 0.09365852922201157,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 336.59375,
"completions/mean_terminated_length": 216.57894897460938,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0344,
"grad_norm": 4.267389297485352,
"kl": 0.002620697021484375,
"learning_rate": 6.825396825396826e-07,
"loss": -0.0163,
"num_tokens": 650776.0,
"reward": 0.04351692646741867,
"reward_std": 0.03509015589952469,
"rewards/bleu_reward_func/mean": 0.04351692646741867,
"rewards/bleu_reward_func/std": 0.052853576838970184,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 240.5,
"completions/mean_terminated_length": 177.84616088867188,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.0352,
"grad_norm": 112.21627807617188,
"kl": 0.0047245025634765625,
"learning_rate": 6.984126984126983e-07,
"loss": 0.0521,
"num_tokens": 665576.0,
"reward": 0.05281548202037811,
"reward_std": 0.034495480358600616,
"rewards/bleu_reward_func/mean": 0.05281548202037811,
"rewards/bleu_reward_func/std": 0.0704483836889267,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 335.46875,
"completions/mean_terminated_length": 310.25,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.036,
"grad_norm": 30.739818572998047,
"kl": 0.00278472900390625,
"learning_rate": 7.142857142857143e-07,
"loss": -0.1091,
"num_tokens": 678375.0,
"reward": 0.04049266129732132,
"reward_std": 0.020605597645044327,
"rewards/bleu_reward_func/mean": 0.04049266129732132,
"rewards/bleu_reward_func/std": 0.04322003573179245,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 253.28125,
"completions/mean_terminated_length": 205.37037658691406,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.0368,
"grad_norm": 3.857532501220703,
"kl": 0.002582550048828125,
"learning_rate": 7.301587301587301e-07,
"loss": 0.1863,
"num_tokens": 693632.0,
"reward": 0.03602021187543869,
"reward_std": 0.03167928382754326,
"rewards/bleu_reward_func/mean": 0.03602021187543869,
"rewards/bleu_reward_func/std": 0.060269005596637726,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 379.125,
"completions/mean_terminated_length": 208.2857208251953,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.0376,
"grad_norm": 44.705684661865234,
"kl": 0.002960205078125,
"learning_rate": 7.46031746031746e-07,
"loss": -0.2537,
"num_tokens": 712244.0,
"reward": 0.009683560580015182,
"reward_std": 0.007736856117844582,
"rewards/bleu_reward_func/mean": 0.009683560580015182,
"rewards/bleu_reward_func/std": 0.010262547992169857,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 209.46875,
"completions/mean_terminated_length": 153.44444274902344,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0384,
"grad_norm": 4.927426815032959,
"kl": 0.010406494140625,
"learning_rate": 7.619047619047618e-07,
"loss": 0.2249,
"num_tokens": 722779.0,
"reward": 0.06434739381074905,
"reward_std": 0.062096044421195984,
"rewards/bleu_reward_func/mean": 0.06434739381074905,
"rewards/bleu_reward_func/std": 0.07261113822460175,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 319.53125,
"completions/mean_terminated_length": 187.84210205078125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0392,
"grad_norm": 6.036177158355713,
"kl": 0.0051422119140625,
"learning_rate": 7.777777777777778e-07,
"loss": 0.2132,
"num_tokens": 735892.0,
"reward": 0.0316137932240963,
"reward_std": 0.028243713080883026,
"rewards/bleu_reward_func/mean": 0.0316137932240963,
"rewards/bleu_reward_func/std": 0.032289810478687286,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 329.0625,
"completions/mean_terminated_length": 233.23809814453125,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.04,
"grad_norm": 6.000904560089111,
"kl": 0.0041103363037109375,
"learning_rate": 7.936507936507936e-07,
"loss": 0.164,
"num_tokens": 748926.0,
"reward": 0.031059542670845985,
"reward_std": 0.02046222612261772,
"rewards/bleu_reward_func/mean": 0.031059542670845985,
"rewards/bleu_reward_func/std": 0.029215287417173386,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 460.0,
"completions/mean_length": 240.03125,
"completions/mean_terminated_length": 201.17857360839844,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0408,
"grad_norm": 4.45400333404541,
"kl": 0.00446319580078125,
"learning_rate": 8.095238095238095e-07,
"loss": 0.095,
"num_tokens": 763935.0,
"reward": 0.06022896245121956,
"reward_std": 0.04401791840791702,
"rewards/bleu_reward_func/mean": 0.06022896245121956,
"rewards/bleu_reward_func/std": 0.06288844347000122,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 308.875,
"completions/mean_terminated_length": 169.89474487304688,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0416,
"grad_norm": 6.896392822265625,
"kl": 0.00627899169921875,
"learning_rate": 8.253968253968253e-07,
"loss": 0.1302,
"num_tokens": 781619.0,
"reward": 0.02847466617822647,
"reward_std": 0.024918708950281143,
"rewards/bleu_reward_func/mean": 0.02847466617822647,
"rewards/bleu_reward_func/std": 0.03209677338600159,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 309.9375,
"completions/mean_terminated_length": 263.3077087402344,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.0424,
"grad_norm": 5.093250751495361,
"kl": 0.004283905029296875,
"learning_rate": 8.412698412698413e-07,
"loss": -0.0777,
"num_tokens": 795977.0,
"reward": 0.07096201926469803,
"reward_std": 0.06636855751276016,
"rewards/bleu_reward_func/mean": 0.07096201926469803,
"rewards/bleu_reward_func/std": 0.09039857983589172,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 386.4375,
"completions/mean_terminated_length": 202.92308044433594,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0432,
"grad_norm": 3.2264180183410645,
"kl": 0.0028362274169921875,
"learning_rate": 8.57142857142857e-07,
"loss": -0.0863,
"num_tokens": 814135.0,
"reward": 0.014086933806538582,
"reward_std": 0.013363949954509735,
"rewards/bleu_reward_func/mean": 0.014086933806538582,
"rewards/bleu_reward_func/std": 0.01598522998392582,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 244.59375,
"completions/mean_terminated_length": 195.07408142089844,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.044,
"grad_norm": 8.706979751586914,
"kl": 0.00571441650390625,
"learning_rate": 8.73015873015873e-07,
"loss": 0.1609,
"num_tokens": 827226.0,
"reward": 0.0647934228181839,
"reward_std": 0.0345802828669548,
"rewards/bleu_reward_func/mean": 0.0647934228181839,
"rewards/bleu_reward_func/std": 0.04030924290418625,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 126.0625,
"completions/mean_terminated_length": 113.61289978027344,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0448,
"grad_norm": 8.598736763000488,
"kl": 0.0142974853515625,
"learning_rate": 8.888888888888888e-07,
"loss": 0.1419,
"num_tokens": 834268.0,
"reward": 0.04880748316645622,
"reward_std": 0.042880259454250336,
"rewards/bleu_reward_func/mean": 0.04880748316645622,
"rewards/bleu_reward_func/std": 0.05060458555817604,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 302.03125,
"completions/mean_terminated_length": 219.86956787109375,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0456,
"grad_norm": 6.926377296447754,
"kl": 0.009532928466796875,
"learning_rate": 9.047619047619047e-07,
"loss": -0.0374,
"num_tokens": 851701.0,
"reward": 0.06913506239652634,
"reward_std": 0.04138587415218353,
"rewards/bleu_reward_func/mean": 0.06913506239652634,
"rewards/bleu_reward_func/std": 0.0750163346529007,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 211.40625,
"completions/mean_terminated_length": 127.23999786376953,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0464,
"grad_norm": 7.853041648864746,
"kl": 0.0272064208984375,
"learning_rate": 9.206349206349205e-07,
"loss": 0.4023,
"num_tokens": 865434.0,
"reward": 0.12499310076236725,
"reward_std": 0.08980046212673187,
"rewards/bleu_reward_func/mean": 0.12499310076236725,
"rewards/bleu_reward_func/std": 0.13493874669075012,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 390.65625,
"completions/mean_terminated_length": 283.5882263183594,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0472,
"grad_norm": 2.4230539798736572,
"kl": 0.003765106201171875,
"learning_rate": 9.365079365079365e-07,
"loss": -0.0092,
"num_tokens": 884815.0,
"reward": 0.021261584013700485,
"reward_std": 0.027461236342787743,
"rewards/bleu_reward_func/mean": 0.021261584013700485,
"rewards/bleu_reward_func/std": 0.03110821731388569,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 281.59375,
"completions/mean_terminated_length": 191.43478393554688,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.048,
"grad_norm": 2.9367215633392334,
"kl": 0.00628662109375,
"learning_rate": 9.523809523809522e-07,
"loss": 0.2021,
"num_tokens": 896810.0,
"reward": 0.023613639175891876,
"reward_std": 0.02252291887998581,
"rewards/bleu_reward_func/mean": 0.023613639175891876,
"rewards/bleu_reward_func/std": 0.041281431913375854,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 272.34375,
"completions/mean_terminated_length": 227.9629669189453,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0488,
"grad_norm": 3.998281717300415,
"kl": 0.0076904296875,
"learning_rate": 9.682539682539682e-07,
"loss": -0.1513,
"num_tokens": 907349.0,
"reward": 0.07193129509687424,
"reward_std": 0.05195175111293793,
"rewards/bleu_reward_func/mean": 0.07193129509687424,
"rewards/bleu_reward_func/std": 0.07358168065547943,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 82.25,
"completions/mean_terminated_length": 82.25,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0496,
"grad_norm": 12.274957656860352,
"kl": 0.049835205078125,
"learning_rate": 9.84126984126984e-07,
"loss": 0.0211,
"num_tokens": 916605.0,
"reward": 0.1968570053577423,
"reward_std": 0.09575757384300232,
"rewards/bleu_reward_func/mean": 0.1968570053577423,
"rewards/bleu_reward_func/std": 0.14971531927585602,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 353.0,
"completions/mean_length": 209.71875,
"completions/mean_terminated_length": 178.44827270507812,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.0504,
"grad_norm": 4.69417142868042,
"kl": 0.0074615478515625,
"learning_rate": 1e-06,
"loss": -0.0102,
"num_tokens": 924772.0,
"reward": 0.026346374303102493,
"reward_std": 0.015668006613850594,
"rewards/bleu_reward_func/mean": 0.026346374303102493,
"rewards/bleu_reward_func/std": 0.016677534207701683,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 367.0,
"completions/mean_length": 147.34375,
"completions/mean_terminated_length": 95.25000762939453,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.0512,
"grad_norm": 7.854241371154785,
"kl": 0.019744873046875,
"learning_rate": 1e-06,
"loss": 0.1402,
"num_tokens": 932879.0,
"reward": 0.039189111441373825,
"reward_std": 0.034408073872327805,
"rewards/bleu_reward_func/mean": 0.039189111441373825,
"rewards/bleu_reward_func/std": 0.06643246859312057,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 217.53125,
"completions/mean_terminated_length": 163.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.052,
"grad_norm": 5.94617223739624,
"kl": 0.012752532958984375,
"learning_rate": 1e-06,
"loss": 0.0629,
"num_tokens": 944744.0,
"reward": 0.0992283821105957,
"reward_std": 0.04174066707491875,
"rewards/bleu_reward_func/mean": 0.0992283821105957,
"rewards/bleu_reward_func/std": 0.14538165926933289,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 360.84375,
"completions/mean_terminated_length": 281.66668701171875,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.0528,
"grad_norm": 2.7164971828460693,
"kl": 0.00780487060546875,
"learning_rate": 1e-06,
"loss": -0.1815,
"num_tokens": 959043.0,
"reward": 0.03164489567279816,
"reward_std": 0.024089161306619644,
"rewards/bleu_reward_func/mean": 0.03164489567279816,
"rewards/bleu_reward_func/std": 0.03230883181095123,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 174.375,
"completions/mean_terminated_length": 79.83999633789062,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.0536,
"grad_norm": 8.954367637634277,
"kl": 0.040679931640625,
"learning_rate": 1e-06,
"loss": 0.4022,
"num_tokens": 970487.0,
"reward": 0.1188623458147049,
"reward_std": 0.06528393179178238,
"rewards/bleu_reward_func/mean": 0.1188623458147049,
"rewards/bleu_reward_func/std": 0.10126637667417526,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 249.78125,
"completions/mean_terminated_length": 112.42857360839844,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.0544,
"grad_norm": 8.929741859436035,
"kl": 0.01055908203125,
"learning_rate": 1e-06,
"loss": -0.3676,
"num_tokens": 980432.0,
"reward": 0.04680415242910385,
"reward_std": 0.015473801642656326,
"rewards/bleu_reward_func/mean": 0.04680415242910385,
"rewards/bleu_reward_func/std": 0.05666949972510338,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 200.28125,
"completions/mean_terminated_length": 155.75,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.0552,
"grad_norm": 19.934701919555664,
"kl": 0.036396026611328125,
"learning_rate": 1e-06,
"loss": 0.1105,
"num_tokens": 988713.0,
"reward": 0.03349726274609566,
"reward_std": 0.007375569082796574,
"rewards/bleu_reward_func/mean": 0.03349726274609566,
"rewards/bleu_reward_func/std": 0.0360921286046505,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 295.1875,
"completions/mean_terminated_length": 146.84210205078125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.056,
"grad_norm": 3.6616406440734863,
"kl": 0.0112762451171875,
"learning_rate": 1e-06,
"loss": 0.081,
"num_tokens": 1002319.0,
"reward": 0.016106903553009033,
"reward_std": 0.008415726944804192,
"rewards/bleu_reward_func/mean": 0.016106903553009033,
"rewards/bleu_reward_func/std": 0.012413726188242435,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 270.1875,
"completions/mean_terminated_length": 235.6428680419922,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.0568,
"grad_norm": 11.310477256774902,
"kl": 0.02431488037109375,
"learning_rate": 1e-06,
"loss": 0.0216,
"num_tokens": 1014117.0,
"reward": 0.09336908906698227,
"reward_std": 0.04001408815383911,
"rewards/bleu_reward_func/mean": 0.09336908906698227,
"rewards/bleu_reward_func/std": 0.04507448151707649,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 298.96875,
"completions/mean_terminated_length": 187.38095092773438,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0576,
"grad_norm": 11.831945419311523,
"kl": 0.0236358642578125,
"learning_rate": 1e-06,
"loss": 0.2398,
"num_tokens": 1029324.0,
"reward": 0.06671467423439026,
"reward_std": 0.07224421948194504,
"rewards/bleu_reward_func/mean": 0.06671467423439026,
"rewards/bleu_reward_func/std": 0.09839192777872086,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 262.28125,
"completions/mean_terminated_length": 245.6333465576172,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.0584,
"grad_norm": 3.4266369342803955,
"kl": 0.01332855224609375,
"learning_rate": 1e-06,
"loss": -0.117,
"num_tokens": 1040677.0,
"reward": 0.048909105360507965,
"reward_std": 0.01749919354915619,
"rewards/bleu_reward_func/mean": 0.048909105360507965,
"rewards/bleu_reward_func/std": 0.046220190823078156,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 315.78125,
"completions/mean_terminated_length": 213.0,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.0592,
"grad_norm": 3.334998369216919,
"kl": 0.030864715576171875,
"learning_rate": 1e-06,
"loss": 0.0762,
"num_tokens": 1057350.0,
"reward": 0.06654933840036392,
"reward_std": 0.030867960304021835,
"rewards/bleu_reward_func/mean": 0.06654933840036392,
"rewards/bleu_reward_func/std": 0.04364337399601936,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 287.4375,
"completions/mean_terminated_length": 235.61538696289062,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.06,
"grad_norm": 12.321810722351074,
"kl": 0.05252838134765625,
"learning_rate": 1e-06,
"loss": 0.1111,
"num_tokens": 1072668.0,
"reward": 0.07815341651439667,
"reward_std": 0.05233295261859894,
"rewards/bleu_reward_func/mean": 0.07815341651439667,
"rewards/bleu_reward_func/std": 0.0646696388721466,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 306.3125,
"completions/mean_terminated_length": 182.90000915527344,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.0608,
"grad_norm": 3.883251905441284,
"kl": 0.0373382568359375,
"learning_rate": 1e-06,
"loss": 0.1517,
"num_tokens": 1086694.0,
"reward": 0.06417744606733322,
"reward_std": 0.034075379371643066,
"rewards/bleu_reward_func/mean": 0.06417744606733322,
"rewards/bleu_reward_func/std": 0.049788232892751694,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 415.0,
"completions/max_terminated_length": 415.0,
"completions/mean_length": 196.4375,
"completions/mean_terminated_length": 196.4375,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.0616,
"grad_norm": 6.460638523101807,
"kl": 0.05291748046875,
"learning_rate": 1e-06,
"loss": -0.0849,
"num_tokens": 1097476.0,
"reward": 0.08122064173221588,
"reward_std": 0.03298315033316612,
"rewards/bleu_reward_func/mean": 0.08122064173221588,
"rewards/bleu_reward_func/std": 0.047924816608428955,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 199.15625,
"completions/mean_terminated_length": 166.79310607910156,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0624,
"grad_norm": 6.63805627822876,
"kl": 0.07642364501953125,
"learning_rate": 1e-06,
"loss": -0.0967,
"num_tokens": 1108793.0,
"reward": 0.07887591421604156,
"reward_std": 0.05435461550951004,
"rewards/bleu_reward_func/mean": 0.07887591421604156,
"rewards/bleu_reward_func/std": 0.10201766341924667,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 193.0,
"completions/mean_terminated_length": 147.42857360839844,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0632,
"grad_norm": 14.087907791137695,
"kl": 0.0762939453125,
"learning_rate": 1e-06,
"loss": -0.056,
"num_tokens": 1118721.0,
"reward": 0.035933416336774826,
"reward_std": 0.02187356725335121,
"rewards/bleu_reward_func/mean": 0.035933416336774826,
"rewards/bleu_reward_func/std": 0.025764403864741325,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 192.9375,
"completions/mean_terminated_length": 133.8518524169922,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.064,
"grad_norm": 6.712767124176025,
"kl": 0.07209014892578125,
"learning_rate": 1e-06,
"loss": 0.2799,
"num_tokens": 1130095.0,
"reward": 0.04000134766101837,
"reward_std": 0.014790613204240799,
"rewards/bleu_reward_func/mean": 0.04000134766101837,
"rewards/bleu_reward_func/std": 0.028310615569353104,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 395.5625,
"completions/mean_terminated_length": 315.8947448730469,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.0648,
"grad_norm": 3.1348772048950195,
"kl": 0.012363433837890625,
"learning_rate": 1e-06,
"loss": -0.0477,
"num_tokens": 1145009.0,
"reward": 0.05394501984119415,
"reward_std": 0.019456665962934494,
"rewards/bleu_reward_func/mean": 0.05394501984119415,
"rewards/bleu_reward_func/std": 0.05528007075190544,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 297.25,
"completions/mean_terminated_length": 213.21739196777344,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0656,
"grad_norm": 4.045035362243652,
"kl": 0.02558135986328125,
"learning_rate": 1e-06,
"loss": -0.0948,
"num_tokens": 1156665.0,
"reward": 0.08088956773281097,
"reward_std": 0.031020794063806534,
"rewards/bleu_reward_func/mean": 0.08088956773281097,
"rewards/bleu_reward_func/std": 0.04719265177845955,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 252.65625,
"completions/mean_terminated_length": 204.629638671875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.0664,
"grad_norm": 7.020449161529541,
"kl": 0.022491455078125,
"learning_rate": 1e-06,
"loss": -0.2084,
"num_tokens": 1170686.0,
"reward": 0.048978567123413086,
"reward_std": 0.014538805931806564,
"rewards/bleu_reward_func/mean": 0.048978567123413086,
"rewards/bleu_reward_func/std": 0.03447263315320015,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 279.53125,
"completions/mean_terminated_length": 157.76190185546875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0672,
"grad_norm": 6.09721040725708,
"kl": 0.02556610107421875,
"learning_rate": 1e-06,
"loss": 0.0979,
"num_tokens": 1181927.0,
"reward": 0.07267215847969055,
"reward_std": 0.029872559010982513,
"rewards/bleu_reward_func/mean": 0.07267215847969055,
"rewards/bleu_reward_func/std": 0.05035723000764847,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 281.1875,
"completions/mean_terminated_length": 257.3103332519531,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.068,
"grad_norm": 5.706907272338867,
"kl": 0.030059814453125,
"learning_rate": 1e-06,
"loss": -0.0967,
"num_tokens": 1198797.0,
"reward": 0.05050581321120262,
"reward_std": 0.023779014125466347,
"rewards/bleu_reward_func/mean": 0.05050581321120262,
"rewards/bleu_reward_func/std": 0.03608938306570053,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 163.65625,
"completions/mean_terminated_length": 163.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0688,
"grad_norm": 7.1789960861206055,
"kl": 0.0573577880859375,
"learning_rate": 1e-06,
"loss": 0.0245,
"num_tokens": 1207106.0,
"reward": 0.07873363792896271,
"reward_std": 0.0395892933011055,
"rewards/bleu_reward_func/mean": 0.07873363792896271,
"rewards/bleu_reward_func/std": 0.0705900639295578,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 425.0,
"completions/mean_length": 304.6875,
"completions/mean_terminated_length": 223.56521606445312,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.0696,
"grad_norm": 3.9068655967712402,
"kl": 0.01955413818359375,
"learning_rate": 1e-06,
"loss": -0.2361,
"num_tokens": 1219760.0,
"reward": 0.03426438570022583,
"reward_std": 0.021733341738581657,
"rewards/bleu_reward_func/mean": 0.03426438570022583,
"rewards/bleu_reward_func/std": 0.031944356858730316,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 137.1875,
"completions/mean_terminated_length": 137.1875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.0704,
"grad_norm": 7.929437160491943,
"kl": 0.1163482666015625,
"learning_rate": 1e-06,
"loss": -0.0079,
"num_tokens": 1230702.0,
"reward": 0.13437795639038086,
"reward_std": 0.04989761859178543,
"rewards/bleu_reward_func/mean": 0.13437795639038086,
"rewards/bleu_reward_func/std": 0.08757011592388153,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 248.1875,
"completions/mean_terminated_length": 160.25,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0712,
"grad_norm": 3.5676372051239014,
"kl": 0.029571533203125,
"learning_rate": 1e-06,
"loss": 0.085,
"num_tokens": 1241748.0,
"reward": 0.06261839717626572,
"reward_std": 0.05303023010492325,
"rewards/bleu_reward_func/mean": 0.06261839717626572,
"rewards/bleu_reward_func/std": 0.07371754199266434,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 199.21875,
"completions/mean_terminated_length": 178.36666870117188,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.072,
"grad_norm": 13.081062316894531,
"kl": 0.0998382568359375,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 1254971.0,
"reward": 0.09151400625705719,
"reward_std": 0.049102533608675,
"rewards/bleu_reward_func/mean": 0.09151400625705719,
"rewards/bleu_reward_func/std": 0.08098553121089935,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 277.8125,
"completions/mean_terminated_length": 244.35714721679688,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0728,
"grad_norm": 4.541591167449951,
"kl": 0.0150604248046875,
"learning_rate": 1e-06,
"loss": -0.2243,
"num_tokens": 1268229.0,
"reward": 0.029024727642536163,
"reward_std": 0.02233259379863739,
"rewards/bleu_reward_func/mean": 0.029024727642536163,
"rewards/bleu_reward_func/std": 0.0296621173620224,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 175.875,
"completions/mean_terminated_length": 81.75999450683594,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.0736,
"grad_norm": 12.45702075958252,
"kl": 0.09152984619140625,
"learning_rate": 1e-06,
"loss": 0.3301,
"num_tokens": 1279753.0,
"reward": 0.06008782982826233,
"reward_std": 0.03770461678504944,
"rewards/bleu_reward_func/mean": 0.06008782982826233,
"rewards/bleu_reward_func/std": 0.056894708424806595,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 239.0,
"completions/max_terminated_length": 239.0,
"completions/mean_length": 71.03125,
"completions/mean_terminated_length": 71.03125,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0744,
"grad_norm": 8.271183967590332,
"kl": 0.0682220458984375,
"learning_rate": 1e-06,
"loss": 0.2167,
"num_tokens": 1288762.0,
"reward": 0.17779187858104706,
"reward_std": 0.02900426834821701,
"rewards/bleu_reward_func/mean": 0.17779187858104706,
"rewards/bleu_reward_func/std": 0.1678331196308136,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 132.28125,
"completions/mean_terminated_length": 78.03572082519531,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0752,
"grad_norm": 45.396934509277344,
"kl": 0.140625,
"learning_rate": 1e-06,
"loss": 0.1526,
"num_tokens": 1299835.0,
"reward": 0.1527654230594635,
"reward_std": 0.061802513897418976,
"rewards/bleu_reward_func/mean": 0.1527654230594635,
"rewards/bleu_reward_func/std": 0.10723396390676498,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 350.8125,
"completions/mean_terminated_length": 266.3809509277344,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.076,
"grad_norm": 4.4763383865356445,
"kl": 0.03443145751953125,
"learning_rate": 1e-06,
"loss": 0.0315,
"num_tokens": 1314877.0,
"reward": 0.08366496115922928,
"reward_std": 0.023002739995718002,
"rewards/bleu_reward_func/mean": 0.08366496115922928,
"rewards/bleu_reward_func/std": 0.07334847003221512,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 237.75,
"completions/mean_terminated_length": 209.37930297851562,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.0768,
"grad_norm": 13.181612968444824,
"kl": 0.0660247802734375,
"learning_rate": 1e-06,
"loss": -0.1104,
"num_tokens": 1326757.0,
"reward": 0.04618287831544876,
"reward_std": 0.022957133129239082,
"rewards/bleu_reward_func/mean": 0.04618287831544876,
"rewards/bleu_reward_func/std": 0.03049774467945099,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 326.71875,
"completions/mean_terminated_length": 254.21739196777344,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.0776,
"grad_norm": 5.014129161834717,
"kl": 0.022705078125,
"learning_rate": 1e-06,
"loss": 0.0227,
"num_tokens": 1340716.0,
"reward": 0.08603382110595703,
"reward_std": 0.022703565657138824,
"rewards/bleu_reward_func/mean": 0.08603382110595703,
"rewards/bleu_reward_func/std": 0.09760169684886932,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 205.03125,
"completions/mean_terminated_length": 195.1290283203125,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.0784,
"grad_norm": 10.452898025512695,
"kl": 0.101318359375,
"learning_rate": 1e-06,
"loss": -0.0956,
"num_tokens": 1354373.0,
"reward": 0.07816646993160248,
"reward_std": 0.03450850397348404,
"rewards/bleu_reward_func/mean": 0.07816646993160248,
"rewards/bleu_reward_func/std": 0.05475042015314102,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 240.4375,
"completions/mean_terminated_length": 190.1481475830078,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.0792,
"grad_norm": 8.82993221282959,
"kl": 0.0435791015625,
"learning_rate": 1e-06,
"loss": -0.1898,
"num_tokens": 1365875.0,
"reward": 0.027829378843307495,
"reward_std": 0.016982190310955048,
"rewards/bleu_reward_func/mean": 0.027829378843307495,
"rewards/bleu_reward_func/std": 0.019511230289936066,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 210.21875,
"completions/mean_terminated_length": 140.57693481445312,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.08,
"grad_norm": 14.261658668518066,
"kl": 0.1817169189453125,
"learning_rate": 1e-06,
"loss": -0.4193,
"num_tokens": 1375586.0,
"reward": 0.0430663600564003,
"reward_std": 0.023313239216804504,
"rewards/bleu_reward_func/mean": 0.0430663600564003,
"rewards/bleu_reward_func/std": 0.0409073531627655,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 210.90625,
"completions/mean_terminated_length": 201.19354248046875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.0808,
"grad_norm": 7.960334300994873,
"kl": 0.0833740234375,
"learning_rate": 1e-06,
"loss": 0.1121,
"num_tokens": 1384975.0,
"reward": 0.0974574014544487,
"reward_std": 0.03397291898727417,
"rewards/bleu_reward_func/mean": 0.0974574014544487,
"rewards/bleu_reward_func/std": 0.10795393586158752,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 306.21875,
"completions/mean_terminated_length": 225.69566345214844,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.0816,
"grad_norm": 3.8322501182556152,
"kl": 0.0701446533203125,
"learning_rate": 1e-06,
"loss": 0.0153,
"num_tokens": 1403126.0,
"reward": 0.07732782512903214,
"reward_std": 0.038768649101257324,
"rewards/bleu_reward_func/mean": 0.07732782512903214,
"rewards/bleu_reward_func/std": 0.06468553096055984,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 167.8125,
"completions/mean_terminated_length": 144.86666870117188,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0824,
"grad_norm": 6.311352729797363,
"kl": 0.06109619140625,
"learning_rate": 1e-06,
"loss": 0.1029,
"num_tokens": 1417360.0,
"reward": 0.23947298526763916,
"reward_std": 0.10021178424358368,
"rewards/bleu_reward_func/mean": 0.23947298526763916,
"rewards/bleu_reward_func/std": 0.40957576036453247,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 319.375,
"completions/mean_terminated_length": 126.75,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.0832,
"grad_norm": 6.871039867401123,
"kl": 0.06162261962890625,
"learning_rate": 1e-06,
"loss": 0.3071,
"num_tokens": 1431932.0,
"reward": 0.11237628757953644,
"reward_std": 0.05608592554926872,
"rewards/bleu_reward_func/mean": 0.11237628757953644,
"rewards/bleu_reward_func/std": 0.1758151650428772,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 293.0,
"completions/mean_length": 206.34375,
"completions/mean_terminated_length": 104.45833587646484,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.084,
"grad_norm": 13.681912422180176,
"kl": 0.063812255859375,
"learning_rate": 1e-06,
"loss": 0.3711,
"num_tokens": 1440791.0,
"reward": 0.13408097624778748,
"reward_std": 0.07736363261938095,
"rewards/bleu_reward_func/mean": 0.13408097624778748,
"rewards/bleu_reward_func/std": 0.10995227843523026,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 239.21875,
"completions/mean_terminated_length": 188.70370483398438,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.0848,
"grad_norm": 6.377567291259766,
"kl": 0.212432861328125,
"learning_rate": 1e-06,
"loss": 0.1742,
"num_tokens": 1452430.0,
"reward": 0.09214982390403748,
"reward_std": 0.037541188299655914,
"rewards/bleu_reward_func/mean": 0.09214982390403748,
"rewards/bleu_reward_func/std": 0.06507368385791779,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 382.84375,
"completions/mean_terminated_length": 236.4666748046875,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.0856,
"grad_norm": 2.956113338470459,
"kl": 0.015625,
"learning_rate": 1e-06,
"loss": 0.019,
"num_tokens": 1470353.0,
"reward": 0.029356852173805237,
"reward_std": 0.020268836989998817,
"rewards/bleu_reward_func/mean": 0.029356852173805237,
"rewards/bleu_reward_func/std": 0.031047984957695007,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 346.5,
"completions/mean_terminated_length": 247.1999969482422,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.0864,
"grad_norm": 5.237264156341553,
"kl": 0.0384063720703125,
"learning_rate": 1e-06,
"loss": -0.2289,
"num_tokens": 1483353.0,
"reward": 0.06388352811336517,
"reward_std": 0.03146419674158096,
"rewards/bleu_reward_func/mean": 0.06388352811336517,
"rewards/bleu_reward_func/std": 0.0666789561510086,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 326.1875,
"completions/mean_terminated_length": 199.05262756347656,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.0872,
"grad_norm": 5.559842109680176,
"kl": 0.03765106201171875,
"learning_rate": 1e-06,
"loss": -0.0076,
"num_tokens": 1497415.0,
"reward": 0.2991971969604492,
"reward_std": 0.10907518863677979,
"rewards/bleu_reward_func/mean": 0.2991971969604492,
"rewards/bleu_reward_func/std": 0.36222296953201294,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 238.0,
"completions/max_terminated_length": 238.0,
"completions/mean_length": 98.53125,
"completions/mean_terminated_length": 98.53125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.088,
"grad_norm": 8.213761329650879,
"kl": 0.078704833984375,
"learning_rate": 1e-06,
"loss": 0.1858,
"num_tokens": 1504528.0,
"reward": 0.048464857041835785,
"reward_std": 0.0210396908223629,
"rewards/bleu_reward_func/mean": 0.048464857041835785,
"rewards/bleu_reward_func/std": 0.03311728686094284,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 118.59375,
"completions/mean_terminated_length": 118.59375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.0888,
"grad_norm": 7.166593074798584,
"kl": 0.096771240234375,
"learning_rate": 1e-06,
"loss": -0.0566,
"num_tokens": 1517035.0,
"reward": 0.09873979538679123,
"reward_std": 0.03707325458526611,
"rewards/bleu_reward_func/mean": 0.09873979538679123,
"rewards/bleu_reward_func/std": 0.13200855255126953,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 401.5,
"completions/mean_terminated_length": 240.00001525878906,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.0896,
"grad_norm": 2.8505043983459473,
"kl": 0.02176666259765625,
"learning_rate": 1e-06,
"loss": -0.0153,
"num_tokens": 1534363.0,
"reward": 0.11044108867645264,
"reward_std": 0.03410620242357254,
"rewards/bleu_reward_func/mean": 0.11044108867645264,
"rewards/bleu_reward_func/std": 0.16289857029914856,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 125.96875,
"completions/mean_terminated_length": 125.96875,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0904,
"grad_norm": 7.475080490112305,
"kl": 0.100982666015625,
"learning_rate": 1e-06,
"loss": -0.0271,
"num_tokens": 1546258.0,
"reward": 0.12119434028863907,
"reward_std": 0.03986787050962448,
"rewards/bleu_reward_func/mean": 0.12119434028863907,
"rewards/bleu_reward_func/std": 0.10625314712524414,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 332.09375,
"completions/mean_terminated_length": 250.3181915283203,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.0912,
"grad_norm": 3.1941514015197754,
"kl": 0.01666259765625,
"learning_rate": 1e-06,
"loss": -0.0363,
"num_tokens": 1559133.0,
"reward": 0.05715271458029747,
"reward_std": 0.04336331784725189,
"rewards/bleu_reward_func/mean": 0.05715271458029747,
"rewards/bleu_reward_func/std": 0.05400845408439636,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 182.03125,
"completions/mean_terminated_length": 160.03334045410156,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.092,
"grad_norm": 8.439948081970215,
"kl": 0.0611572265625,
"learning_rate": 1e-06,
"loss": -0.2392,
"num_tokens": 1569774.0,
"reward": 0.0502852126955986,
"reward_std": 0.01610748842358589,
"rewards/bleu_reward_func/mean": 0.0502852126955986,
"rewards/bleu_reward_func/std": 0.040807489305734634,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 408.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 163.84375,
"completions/mean_terminated_length": 163.84375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.0928,
"grad_norm": 4.541551113128662,
"kl": 0.069915771484375,
"learning_rate": 1e-06,
"loss": -0.0109,
"num_tokens": 1580281.0,
"reward": 0.03980318829417229,
"reward_std": 0.01563824526965618,
"rewards/bleu_reward_func/mean": 0.03980318829417229,
"rewards/bleu_reward_func/std": 0.023048467934131622,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 288.875,
"completions/mean_terminated_length": 226.39999389648438,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0936,
"grad_norm": 8.210314750671387,
"kl": 0.09372711181640625,
"learning_rate": 1e-06,
"loss": 0.122,
"num_tokens": 1593285.0,
"reward": 0.0629456490278244,
"reward_std": 0.015063179656863213,
"rewards/bleu_reward_func/mean": 0.0629456490278244,
"rewards/bleu_reward_func/std": 0.03602227941155434,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 350.0,
"completions/mean_terminated_length": 265.1428527832031,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.0944,
"grad_norm": 2.992391586303711,
"kl": 0.02304840087890625,
"learning_rate": 1e-06,
"loss": -0.1711,
"num_tokens": 1606645.0,
"reward": 0.022465957328677177,
"reward_std": 0.016872048377990723,
"rewards/bleu_reward_func/mean": 0.022465957328677177,
"rewards/bleu_reward_func/std": 0.023790787905454636,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 285.375,
"completions/mean_terminated_length": 233.07693481445312,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.0952,
"grad_norm": 4.377739429473877,
"kl": 0.0274658203125,
"learning_rate": 1e-06,
"loss": -0.0351,
"num_tokens": 1617737.0,
"reward": 0.053562991321086884,
"reward_std": 0.025934984907507896,
"rewards/bleu_reward_func/mean": 0.053562991321086884,
"rewards/bleu_reward_func/std": 0.03459456190466881,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 285.25,
"completions/mean_terminated_length": 85.17646789550781,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.096,
"grad_norm": 5.66406774520874,
"kl": 0.032745361328125,
"learning_rate": 1e-06,
"loss": 0.1042,
"num_tokens": 1631409.0,
"reward": 0.05639251321554184,
"reward_std": 0.025049947202205658,
"rewards/bleu_reward_func/mean": 0.05639251321554184,
"rewards/bleu_reward_func/std": 0.04031047970056534,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 391.25,
"completions/mean_terminated_length": 254.40000915527344,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.0968,
"grad_norm": 2.7011678218841553,
"kl": 0.019012451171875,
"learning_rate": 1e-06,
"loss": -0.0595,
"num_tokens": 1647665.0,
"reward": 0.1355845332145691,
"reward_std": 0.03834523260593414,
"rewards/bleu_reward_func/mean": 0.1355845332145691,
"rewards/bleu_reward_func/std": 0.17731845378875732,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 360.09375,
"completions/mean_terminated_length": 241.94444274902344,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.0976,
"grad_norm": 3.2975172996520996,
"kl": 0.0248870849609375,
"learning_rate": 1e-06,
"loss": -0.0281,
"num_tokens": 1661164.0,
"reward": 0.02182621881365776,
"reward_std": 0.010437489487230778,
"rewards/bleu_reward_func/mean": 0.02182621881365776,
"rewards/bleu_reward_func/std": 0.019065655767917633,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 329.03125,
"completions/mean_terminated_length": 295.1481628417969,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.0984,
"grad_norm": 3.011232376098633,
"kl": 0.023040771484375,
"learning_rate": 1e-06,
"loss": -0.0609,
"num_tokens": 1674269.0,
"reward": 0.05195554345846176,
"reward_std": 0.020864665508270264,
"rewards/bleu_reward_func/mean": 0.05195554345846176,
"rewards/bleu_reward_func/std": 0.027087198570370674,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 301.0,
"completions/mean_terminated_length": 114.82353210449219,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.0992,
"grad_norm": 8.828636169433594,
"kl": 0.0798492431640625,
"learning_rate": 1e-06,
"loss": -0.0267,
"num_tokens": 1691797.0,
"reward": 0.1420682966709137,
"reward_std": 0.04143287241458893,
"rewards/bleu_reward_func/mean": 0.1420682966709137,
"rewards/bleu_reward_func/std": 0.07349839806556702,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 167.28125,
"completions/mean_terminated_length": 167.28125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.1,
"grad_norm": 7.268754482269287,
"kl": 0.104766845703125,
"learning_rate": 1e-06,
"loss": 0.2657,
"num_tokens": 1702150.0,
"reward": 0.16663971543312073,
"reward_std": 0.05392443761229515,
"rewards/bleu_reward_func/mean": 0.16663971543312073,
"rewards/bleu_reward_func/std": 0.09980462491512299,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 367.8125,
"completions/mean_terminated_length": 223.625,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.1008,
"grad_norm": 2.9197561740875244,
"kl": 0.0296630859375,
"learning_rate": 1e-06,
"loss": 0.069,
"num_tokens": 1720080.0,
"reward": 0.05814104527235031,
"reward_std": 0.023808015510439873,
"rewards/bleu_reward_func/mean": 0.05814104527235031,
"rewards/bleu_reward_func/std": 0.06258071959018707,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 222.0,
"completions/mean_length": 311.03125,
"completions/mean_terminated_length": 110.0625,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.1016,
"grad_norm": 3.6131699085235596,
"kl": 0.040496826171875,
"learning_rate": 1e-06,
"loss": -0.1103,
"num_tokens": 1736129.0,
"reward": 0.1627029925584793,
"reward_std": 0.048266101628541946,
"rewards/bleu_reward_func/mean": 0.1627029925584793,
"rewards/bleu_reward_func/std": 0.2640880048274994,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 182.1875,
"completions/mean_terminated_length": 89.83999633789062,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.1024,
"grad_norm": 5.8553900718688965,
"kl": 0.0695953369140625,
"learning_rate": 1e-06,
"loss": 0.2073,
"num_tokens": 1744807.0,
"reward": 0.05680542066693306,
"reward_std": 0.02900797501206398,
"rewards/bleu_reward_func/mean": 0.05680542066693306,
"rewards/bleu_reward_func/std": 0.062428779900074005,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 304.375,
"completions/mean_terminated_length": 235.1666717529297,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.1032,
"grad_norm": 2.625256299972534,
"kl": 0.014190673828125,
"learning_rate": 1e-06,
"loss": -0.3256,
"num_tokens": 1759979.0,
"reward": 0.07073010504245758,
"reward_std": 0.0585593655705452,
"rewards/bleu_reward_func/mean": 0.07073010504245758,
"rewards/bleu_reward_func/std": 0.0830271914601326,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 269.21875,
"completions/mean_terminated_length": 188.2916717529297,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.104,
"grad_norm": 7.594178199768066,
"kl": 0.019775390625,
"learning_rate": 1e-06,
"loss": 0.103,
"num_tokens": 1772810.0,
"reward": 0.03343900665640831,
"reward_std": 0.008691318333148956,
"rewards/bleu_reward_func/mean": 0.03343900665640831,
"rewards/bleu_reward_func/std": 0.027092551812529564,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 307.375,
"completions/mean_terminated_length": 286.2069091796875,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1048,
"grad_norm": 3.5237069129943848,
"kl": 0.0251312255859375,
"learning_rate": 1e-06,
"loss": 0.0902,
"num_tokens": 1786094.0,
"reward": 0.03853389620780945,
"reward_std": 0.016378795728087425,
"rewards/bleu_reward_func/mean": 0.03853389620780945,
"rewards/bleu_reward_func/std": 0.02983209490776062,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 244.03125,
"completions/mean_terminated_length": 194.40740966796875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1056,
"grad_norm": 3.428116798400879,
"kl": 0.072906494140625,
"learning_rate": 1e-06,
"loss": 0.1144,
"num_tokens": 1797551.0,
"reward": 0.1538739800453186,
"reward_std": 0.03595956414937973,
"rewards/bleu_reward_func/mean": 0.1538739800453186,
"rewards/bleu_reward_func/std": 0.21548843383789062,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 220.46875,
"completions/mean_terminated_length": 211.06451416015625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.1064,
"grad_norm": 4.208179950714111,
"kl": 0.058685302734375,
"learning_rate": 1e-06,
"loss": -0.1489,
"num_tokens": 1808726.0,
"reward": 0.18491268157958984,
"reward_std": 0.0416969433426857,
"rewards/bleu_reward_func/mean": 0.18491268157958984,
"rewards/bleu_reward_func/std": 0.2198871225118637,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 402.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 117.5625,
"completions/mean_terminated_length": 117.5625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1072,
"grad_norm": 9.312064170837402,
"kl": 0.2574310302734375,
"learning_rate": 1e-06,
"loss": -0.1699,
"num_tokens": 1816152.0,
"reward": 0.09744147956371307,
"reward_std": 0.03963543474674225,
"rewards/bleu_reward_func/mean": 0.09744147956371307,
"rewards/bleu_reward_func/std": 0.07821591198444366,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 284.5625,
"completions/mean_terminated_length": 165.42857360839844,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.108,
"grad_norm": 2.491009473800659,
"kl": 0.5361785888671875,
"learning_rate": 1e-06,
"loss": 0.0755,
"num_tokens": 1830010.0,
"reward": 0.07847163081169128,
"reward_std": 0.07447989284992218,
"rewards/bleu_reward_func/mean": 0.07847163081169128,
"rewards/bleu_reward_func/std": 0.1269197165966034,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 208.28125,
"completions/mean_terminated_length": 164.8928680419922,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1088,
"grad_norm": 9.84536075592041,
"kl": 0.137481689453125,
"learning_rate": 1e-06,
"loss": 0.0643,
"num_tokens": 1843019.0,
"reward": 0.23385955393314362,
"reward_std": 0.07621090114116669,
"rewards/bleu_reward_func/mean": 0.23385955393314362,
"rewards/bleu_reward_func/std": 0.2127569168806076,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 243.46875,
"completions/mean_terminated_length": 138.3913116455078,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.1096,
"grad_norm": 6.7033257484436035,
"kl": 0.051727294921875,
"learning_rate": 1e-06,
"loss": 0.1285,
"num_tokens": 1856450.0,
"reward": 0.04753299057483673,
"reward_std": 0.016634728759527206,
"rewards/bleu_reward_func/mean": 0.04753299057483673,
"rewards/bleu_reward_func/std": 0.030512619763612747,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 352.40625,
"completions/mean_terminated_length": 171.53334045410156,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.1104,
"grad_norm": 3.812756061553955,
"kl": 0.0177764892578125,
"learning_rate": 1e-06,
"loss": 0.1601,
"num_tokens": 1870943.0,
"reward": 0.04067971557378769,
"reward_std": 0.026344479992985725,
"rewards/bleu_reward_func/mean": 0.04067971557378769,
"rewards/bleu_reward_func/std": 0.06328170746564865,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 287.46875,
"completions/mean_terminated_length": 264.2413635253906,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1112,
"grad_norm": 4.916464805603027,
"kl": 0.06890869140625,
"learning_rate": 1e-06,
"loss": 0.0108,
"num_tokens": 1883774.0,
"reward": 0.1910865753889084,
"reward_std": 0.09566200524568558,
"rewards/bleu_reward_func/mean": 0.1910865753889084,
"rewards/bleu_reward_func/std": 0.2485995888710022,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 312.6875,
"completions/mean_terminated_length": 157.6666717529297,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.112,
"grad_norm": 5.02897310256958,
"kl": 0.081298828125,
"learning_rate": 1e-06,
"loss": 0.0347,
"num_tokens": 1900964.0,
"reward": 0.1273106336593628,
"reward_std": 0.037408363074064255,
"rewards/bleu_reward_func/mean": 0.1273106336593628,
"rewards/bleu_reward_func/std": 0.1255699247121811,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 287.40625,
"completions/mean_terminated_length": 199.52174377441406,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.1128,
"grad_norm": 3.5728607177734375,
"kl": 0.0290985107421875,
"learning_rate": 1e-06,
"loss": -0.0802,
"num_tokens": 1914153.0,
"reward": 0.1449739634990692,
"reward_std": 0.05561315268278122,
"rewards/bleu_reward_func/mean": 0.1449739634990692,
"rewards/bleu_reward_func/std": 0.10589203238487244,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 171.6875,
"completions/mean_terminated_length": 149.00001525878906,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.1136,
"grad_norm": 6.168550491333008,
"kl": 0.076751708984375,
"learning_rate": 1e-06,
"loss": 0.1377,
"num_tokens": 1924551.0,
"reward": 0.07935678958892822,
"reward_std": 0.044586654752492905,
"rewards/bleu_reward_func/mean": 0.07935678958892822,
"rewards/bleu_reward_func/std": 0.11080160737037659,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 188.28125,
"completions/mean_terminated_length": 166.70001220703125,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.1144,
"grad_norm": 5.237224102020264,
"kl": 0.042633056640625,
"learning_rate": 1e-06,
"loss": 0.1969,
"num_tokens": 1936192.0,
"reward": 0.07339954376220703,
"reward_std": 0.04980514198541641,
"rewards/bleu_reward_func/mean": 0.07339954376220703,
"rewards/bleu_reward_func/std": 0.06703697144985199,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 340.0,
"completions/mean_length": 168.03125,
"completions/mean_terminated_length": 53.375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1152,
"grad_norm": 6.578721523284912,
"kl": 0.148895263671875,
"learning_rate": 1e-06,
"loss": 0.2196,
"num_tokens": 1946361.0,
"reward": 0.2388084977865219,
"reward_std": 0.05400132015347481,
"rewards/bleu_reward_func/mean": 0.2388084977865219,
"rewards/bleu_reward_func/std": 0.2556310296058655,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 254.90625,
"completions/mean_terminated_length": 218.17857360839844,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.116,
"grad_norm": 5.644160270690918,
"kl": 0.10284423828125,
"learning_rate": 1e-06,
"loss": 0.0093,
"num_tokens": 1958990.0,
"reward": 0.05691784247756004,
"reward_std": 0.045338764786720276,
"rewards/bleu_reward_func/mean": 0.05691784247756004,
"rewards/bleu_reward_func/std": 0.051530975848436356,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 445.0,
"completions/mean_length": 138.125,
"completions/mean_terminated_length": 126.06451416015625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.1168,
"grad_norm": 6.659230709075928,
"kl": 0.07525634765625,
"learning_rate": 1e-06,
"loss": 0.3905,
"num_tokens": 1966178.0,
"reward": 0.08115407824516296,
"reward_std": 0.05008203536272049,
"rewards/bleu_reward_func/mean": 0.08115407824516296,
"rewards/bleu_reward_func/std": 0.060907039791345596,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 152.25,
"completions/mean_terminated_length": 69.23077392578125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1176,
"grad_norm": 10.029218673706055,
"kl": 0.1614227294921875,
"learning_rate": 1e-06,
"loss": 0.0338,
"num_tokens": 1977194.0,
"reward": 0.23646463453769684,
"reward_std": 0.09375543892383575,
"rewards/bleu_reward_func/mean": 0.23646463453769684,
"rewards/bleu_reward_func/std": 0.27427393198013306,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 300.84375,
"completions/mean_terminated_length": 174.15000915527344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1184,
"grad_norm": 7.6274027824401855,
"kl": 0.0694122314453125,
"learning_rate": 1e-06,
"loss": -0.0195,
"num_tokens": 1991693.0,
"reward": 0.1271597295999527,
"reward_std": 0.03925805538892746,
"rewards/bleu_reward_func/mean": 0.1271597295999527,
"rewards/bleu_reward_func/std": 0.20968182384967804,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 235.0625,
"completions/mean_terminated_length": 142.75,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.1192,
"grad_norm": 5.1624908447265625,
"kl": 0.074005126953125,
"learning_rate": 1e-06,
"loss": -0.0749,
"num_tokens": 2002359.0,
"reward": 0.0867965817451477,
"reward_std": 0.03743039071559906,
"rewards/bleu_reward_func/mean": 0.0867965817451477,
"rewards/bleu_reward_func/std": 0.06982331722974777,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 383.0,
"completions/mean_length": 180.125,
"completions/mean_terminated_length": 69.5,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.12,
"grad_norm": 5.687462329864502,
"kl": 0.079742431640625,
"learning_rate": 1e-06,
"loss": 0.1774,
"num_tokens": 2012139.0,
"reward": 0.08913667500019073,
"reward_std": 0.03803376108407974,
"rewards/bleu_reward_func/mean": 0.08913667500019073,
"rewards/bleu_reward_func/std": 0.07373686879873276,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 246.6875,
"completions/mean_terminated_length": 158.25,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.1208,
"grad_norm": 6.3145341873168945,
"kl": 0.168792724609375,
"learning_rate": 1e-06,
"loss": 0.0474,
"num_tokens": 2024961.0,
"reward": 0.09886027127504349,
"reward_std": 0.09059572219848633,
"rewards/bleu_reward_func/mean": 0.09886027127504349,
"rewards/bleu_reward_func/std": 0.20261086523532867,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 278.375,
"completions/mean_terminated_length": 200.5,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.1216,
"grad_norm": 6.056458473205566,
"kl": 0.04815673828125,
"learning_rate": 1e-06,
"loss": 0.0367,
"num_tokens": 2038621.0,
"reward": 0.040941424667835236,
"reward_std": 0.024181999266147614,
"rewards/bleu_reward_func/mean": 0.040941424667835236,
"rewards/bleu_reward_func/std": 0.031022800132632256,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 174.4375,
"completions/mean_terminated_length": 79.91999816894531,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1224,
"grad_norm": 11.584298133850098,
"kl": 0.10552978515625,
"learning_rate": 1e-06,
"loss": -0.3199,
"num_tokens": 2048947.0,
"reward": 0.057592377066612244,
"reward_std": 0.02831832319498062,
"rewards/bleu_reward_func/mean": 0.057592377066612244,
"rewards/bleu_reward_func/std": 0.0929059162735939,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 285.71875,
"completions/mean_terminated_length": 197.17391967773438,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.1232,
"grad_norm": 3.8603882789611816,
"kl": 0.042999267578125,
"learning_rate": 1e-06,
"loss": -0.1903,
"num_tokens": 2059978.0,
"reward": 0.0238445196300745,
"reward_std": 0.016163241118192673,
"rewards/bleu_reward_func/mean": 0.0238445196300745,
"rewards/bleu_reward_func/std": 0.020820245146751404,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 262.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 110.90625,
"completions/mean_terminated_length": 110.90625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.124,
"grad_norm": 9.231768608093262,
"kl": 0.134063720703125,
"learning_rate": 1e-06,
"loss": -0.12,
"num_tokens": 2068223.0,
"reward": 0.093255415558815,
"reward_std": 0.04695024713873863,
"rewards/bleu_reward_func/mean": 0.093255415558815,
"rewards/bleu_reward_func/std": 0.07957140356302261,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 196.875,
"completions/mean_terminated_length": 151.85714721679688,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.1248,
"grad_norm": 6.661685466766357,
"kl": 0.11859130859375,
"learning_rate": 1e-06,
"loss": -0.1122,
"num_tokens": 2079491.0,
"reward": 0.10441941022872925,
"reward_std": 0.06782116740942001,
"rewards/bleu_reward_func/mean": 0.10441941022872925,
"rewards/bleu_reward_func/std": 0.1558544933795929,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 417.84375,
"completions/mean_terminated_length": 353.4210510253906,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.1256,
"grad_norm": 2.281557559967041,
"kl": 0.020050048828125,
"learning_rate": 1e-06,
"loss": -0.049,
"num_tokens": 2094998.0,
"reward": 0.03994186595082283,
"reward_std": 0.020151065662503242,
"rewards/bleu_reward_func/mean": 0.03994186595082283,
"rewards/bleu_reward_func/std": 0.03798232972621918,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 186.75,
"completions/mean_terminated_length": 126.51851654052734,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1264,
"grad_norm": 5.349869251251221,
"kl": 0.052093505859375,
"learning_rate": 1e-06,
"loss": 0.0712,
"num_tokens": 2104006.0,
"reward": 0.060490936040878296,
"reward_std": 0.039247751235961914,
"rewards/bleu_reward_func/mean": 0.060490936040878296,
"rewards/bleu_reward_func/std": 0.06767360866069794,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 312.375,
"completions/mean_terminated_length": 266.3077087402344,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1272,
"grad_norm": 6.974966526031494,
"kl": 0.0902099609375,
"learning_rate": 1e-06,
"loss": -0.0785,
"num_tokens": 2121570.0,
"reward": 0.21717938780784607,
"reward_std": 0.08217764645814896,
"rewards/bleu_reward_func/mean": 0.21717938780784607,
"rewards/bleu_reward_func/std": 0.1689896285533905,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 350.25,
"completions/mean_terminated_length": 239.57894897460938,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.128,
"grad_norm": 8.79633617401123,
"kl": 0.10308837890625,
"learning_rate": 1e-06,
"loss": -0.0746,
"num_tokens": 2134322.0,
"reward": 0.027915209531784058,
"reward_std": 0.008189969696104527,
"rewards/bleu_reward_func/mean": 0.027915209531784058,
"rewards/bleu_reward_func/std": 0.021798407658934593,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 98.40625,
"completions/mean_terminated_length": 98.40625,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.1288,
"grad_norm": 9.201173782348633,
"kl": 0.112060546875,
"learning_rate": 1e-06,
"loss": 0.1926,
"num_tokens": 2139903.0,
"reward": 0.08629470318555832,
"reward_std": 0.0329008549451828,
"rewards/bleu_reward_func/mean": 0.08629470318555832,
"rewards/bleu_reward_func/std": 0.04737285524606705,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 408.75,
"completions/mean_terminated_length": 211.63636779785156,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.1296,
"grad_norm": 3.5992963314056396,
"kl": 0.034759521484375,
"learning_rate": 1e-06,
"loss": 0.2439,
"num_tokens": 2158343.0,
"reward": 0.07093626260757446,
"reward_std": 0.04270578920841217,
"rewards/bleu_reward_func/mean": 0.07093626260757446,
"rewards/bleu_reward_func/std": 0.09919130057096481,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 171.4375,
"completions/mean_terminated_length": 92.84616088867188,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.1304,
"grad_norm": 8.71653938293457,
"kl": 0.069793701171875,
"learning_rate": 1e-06,
"loss": -0.2102,
"num_tokens": 2166957.0,
"reward": 0.045812517404556274,
"reward_std": 0.0257731880992651,
"rewards/bleu_reward_func/mean": 0.045812517404556274,
"rewards/bleu_reward_func/std": 0.033692970871925354,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 435.65625,
"completions/mean_terminated_length": 383.4210510253906,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.1312,
"grad_norm": 2.308507204055786,
"kl": 0.020599365234375,
"learning_rate": 1e-06,
"loss": -0.142,
"num_tokens": 2182538.0,
"reward": 0.05681996047496796,
"reward_std": 0.022751763463020325,
"rewards/bleu_reward_func/mean": 0.05681996047496796,
"rewards/bleu_reward_func/std": 0.034446995705366135,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 422.1875,
"completions/mean_terminated_length": 368.3000183105469,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.132,
"grad_norm": 2.0656521320343018,
"kl": 0.0328369140625,
"learning_rate": 1e-06,
"loss": -0.1159,
"num_tokens": 2198440.0,
"reward": 0.08400298655033112,
"reward_std": 0.03193335980176926,
"rewards/bleu_reward_func/mean": 0.08400298655033112,
"rewards/bleu_reward_func/std": 0.05056838318705559,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 265.25,
"completions/mean_terminated_length": 117.20000457763672,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1328,
"grad_norm": 7.659345626831055,
"kl": 0.12249755859375,
"learning_rate": 1e-06,
"loss": -0.0683,
"num_tokens": 2212440.0,
"reward": 0.17941661179065704,
"reward_std": 0.040813662111759186,
"rewards/bleu_reward_func/mean": 0.17941661179065704,
"rewards/bleu_reward_func/std": 0.2576500475406647,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 410.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 171.625,
"completions/mean_terminated_length": 171.625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.1336,
"grad_norm": 4.253745079040527,
"kl": 0.03948974609375,
"learning_rate": 1e-06,
"loss": -0.0369,
"num_tokens": 2220868.0,
"reward": 0.15958541631698608,
"reward_std": 0.08837255835533142,
"rewards/bleu_reward_func/mean": 0.15958541631698608,
"rewards/bleu_reward_func/std": 0.2750999629497528,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 258.09375,
"completions/mean_terminated_length": 158.7391357421875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1344,
"grad_norm": 7.613649845123291,
"kl": 0.230072021484375,
"learning_rate": 1e-06,
"loss": 0.2337,
"num_tokens": 2232063.0,
"reward": 0.0643484890460968,
"reward_std": 0.04164566472172737,
"rewards/bleu_reward_func/mean": 0.0643484890460968,
"rewards/bleu_reward_func/std": 0.07561130821704865,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 312.75,
"completions/mean_terminated_length": 193.1999969482422,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1352,
"grad_norm": 7.345302104949951,
"kl": 0.19305419921875,
"learning_rate": 1e-06,
"loss": -0.2036,
"num_tokens": 2248271.0,
"reward": 0.04911228269338608,
"reward_std": 0.018512040376663208,
"rewards/bleu_reward_func/mean": 0.04911228269338608,
"rewards/bleu_reward_func/std": 0.05713532865047455,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 327.0,
"completions/mean_terminated_length": 230.09524536132812,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.136,
"grad_norm": 5.079345226287842,
"kl": 0.02252197265625,
"learning_rate": 1e-06,
"loss": 0.1003,
"num_tokens": 2261791.0,
"reward": 0.03408445790410042,
"reward_std": 0.007548983674496412,
"rewards/bleu_reward_func/mean": 0.03408445790410042,
"rewards/bleu_reward_func/std": 0.030450724065303802,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 204.0,
"completions/mean_length": 173.3125,
"completions/mean_terminated_length": 78.47999572753906,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.1368,
"grad_norm": 7.255119800567627,
"kl": 0.098876953125,
"learning_rate": 1e-06,
"loss": -0.1893,
"num_tokens": 2271041.0,
"reward": 0.08309763669967651,
"reward_std": 0.05162087082862854,
"rewards/bleu_reward_func/mean": 0.08309763669967651,
"rewards/bleu_reward_func/std": 0.08563226461410522,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 251.59375,
"completions/mean_terminated_length": 164.7916717529297,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1376,
"grad_norm": 9.955636024475098,
"kl": 0.094970703125,
"learning_rate": 1e-06,
"loss": 0.4501,
"num_tokens": 2281108.0,
"reward": 0.09667688608169556,
"reward_std": 0.047036267817020416,
"rewards/bleu_reward_func/mean": 0.09667688608169556,
"rewards/bleu_reward_func/std": 0.05911566689610481,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 259.625,
"completions/mean_terminated_length": 251.48385620117188,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.1384,
"grad_norm": 2.710672616958618,
"kl": 0.05035400390625,
"learning_rate": 1e-06,
"loss": -0.1131,
"num_tokens": 2292040.0,
"reward": 0.01771564967930317,
"reward_std": 0.0045564379543066025,
"rewards/bleu_reward_func/mean": 0.01771564967930317,
"rewards/bleu_reward_func/std": 0.009397609159350395,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 279.15625,
"completions/mean_terminated_length": 119.84210968017578,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.1392,
"grad_norm": 5.8446946144104,
"kl": 0.10295867919921875,
"learning_rate": 1e-06,
"loss": -0.0422,
"num_tokens": 2306453.0,
"reward": 0.10576937347650528,
"reward_std": 0.040997594594955444,
"rewards/bleu_reward_func/mean": 0.10576937347650528,
"rewards/bleu_reward_func/std": 0.15739315748214722,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 406.375,
"completions/mean_terminated_length": 270.5714416503906,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.14,
"grad_norm": 2.8740079402923584,
"kl": 0.022064208984375,
"learning_rate": 1e-06,
"loss": -0.001,
"num_tokens": 2321945.0,
"reward": 0.07392336428165436,
"reward_std": 0.027644775807857513,
"rewards/bleu_reward_func/mean": 0.07392336428165436,
"rewards/bleu_reward_func/std": 0.079840287566185,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 384.6875,
"completions/mean_terminated_length": 308.3000183105469,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1408,
"grad_norm": 2.571645498275757,
"kl": 0.0183868408203125,
"learning_rate": 1e-06,
"loss": -0.106,
"num_tokens": 2338695.0,
"reward": 0.043530724942684174,
"reward_std": 0.02269122190773487,
"rewards/bleu_reward_func/mean": 0.043530724942684174,
"rewards/bleu_reward_func/std": 0.029228538274765015,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 276.71875,
"completions/mean_terminated_length": 184.6521759033203,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.1416,
"grad_norm": 6.338461399078369,
"kl": 0.0661468505859375,
"learning_rate": 1e-06,
"loss": 0.2588,
"num_tokens": 2349054.0,
"reward": 0.04008907824754715,
"reward_std": 0.03199386969208717,
"rewards/bleu_reward_func/mean": 0.04008907824754715,
"rewards/bleu_reward_func/std": 0.05116712674498558,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 367.84375,
"completions/mean_terminated_length": 182.50001525878906,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1424,
"grad_norm": 7.893227577209473,
"kl": 0.1038818359375,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 2365297.0,
"reward": 0.05835431069135666,
"reward_std": 0.01447733398526907,
"rewards/bleu_reward_func/mean": 0.05835431069135666,
"rewards/bleu_reward_func/std": 0.05388018116354942,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 106.9375,
"completions/mean_terminated_length": 79.93333435058594,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1432,
"grad_norm": 13.133338928222656,
"kl": 0.378204345703125,
"learning_rate": 1e-06,
"loss": 0.1078,
"num_tokens": 2377279.0,
"reward": 0.27373576164245605,
"reward_std": 0.10149600356817245,
"rewards/bleu_reward_func/mean": 0.27373576164245605,
"rewards/bleu_reward_func/std": 0.21089527010917664,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 384.875,
"completions/mean_terminated_length": 318.28570556640625,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.144,
"grad_norm": 2.6368002891540527,
"kl": 0.02276611328125,
"learning_rate": 1e-06,
"loss": 0.0261,
"num_tokens": 2393507.0,
"reward": 0.06703202426433563,
"reward_std": 0.02514977753162384,
"rewards/bleu_reward_func/mean": 0.06703202426433563,
"rewards/bleu_reward_func/std": 0.05334871634840965,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 216.15625,
"completions/mean_terminated_length": 147.88462829589844,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1448,
"grad_norm": 26.23644256591797,
"kl": 0.052398681640625,
"learning_rate": 1e-06,
"loss": 0.2092,
"num_tokens": 2403920.0,
"reward": 0.07073464244604111,
"reward_std": 0.0369129553437233,
"rewards/bleu_reward_func/mean": 0.07073464244604111,
"rewards/bleu_reward_func/std": 0.04567345231771469,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 285.09375,
"completions/mean_terminated_length": 166.23809814453125,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1456,
"grad_norm": 12.993139266967773,
"kl": 0.135498046875,
"learning_rate": 1e-06,
"loss": 0.241,
"num_tokens": 2416907.0,
"reward": 0.05180336907505989,
"reward_std": 0.024485625326633453,
"rewards/bleu_reward_func/mean": 0.05180336907505989,
"rewards/bleu_reward_func/std": 0.03925548121333122,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 318.0,
"completions/mean_length": 196.96875,
"completions/mean_terminated_length": 91.95833587646484,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.1464,
"grad_norm": 8.714866638183594,
"kl": 0.2222137451171875,
"learning_rate": 1e-06,
"loss": 0.1099,
"num_tokens": 2428778.0,
"reward": 0.08364134281873703,
"reward_std": 0.042949263006448746,
"rewards/bleu_reward_func/mean": 0.08364134281873703,
"rewards/bleu_reward_func/std": 0.09259536862373352,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 314.09375,
"completions/mean_terminated_length": 293.6206970214844,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.1472,
"grad_norm": 2.9456305503845215,
"kl": 0.029815673828125,
"learning_rate": 1e-06,
"loss": -0.172,
"num_tokens": 2441829.0,
"reward": 0.11525549739599228,
"reward_std": 0.056866977363824844,
"rewards/bleu_reward_func/mean": 0.11525549739599228,
"rewards/bleu_reward_func/std": 0.10229503363370895,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 308.0,
"completions/mean_length": 310.21875,
"completions/mean_terminated_length": 108.4375,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.148,
"grad_norm": 4.448924541473389,
"kl": 0.118255615234375,
"learning_rate": 1e-06,
"loss": -0.0768,
"num_tokens": 2456612.0,
"reward": 0.1624433547258377,
"reward_std": 0.045910030603408813,
"rewards/bleu_reward_func/mean": 0.1624433547258377,
"rewards/bleu_reward_func/std": 0.19173115491867065,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 135.0,
"completions/mean_length": 134.09375,
"completions/mean_terminated_length": 28.279998779296875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.1488,
"grad_norm": 13.645524024963379,
"kl": 0.1475830078125,
"learning_rate": 1e-06,
"loss": 0.0664,
"num_tokens": 2464839.0,
"reward": 0.05004946142435074,
"reward_std": 0.03280433267354965,
"rewards/bleu_reward_func/mean": 0.05004946142435074,
"rewards/bleu_reward_func/std": 0.05075250193476677,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 383.28125,
"completions/mean_terminated_length": 332.9130554199219,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.1496,
"grad_norm": 2.3289942741394043,
"kl": 0.0212860107421875,
"learning_rate": 1e-06,
"loss": 0.005,
"num_tokens": 2478608.0,
"reward": 0.03798733651638031,
"reward_std": 0.014268442057073116,
"rewards/bleu_reward_func/mean": 0.03798733651638031,
"rewards/bleu_reward_func/std": 0.03045865148305893,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 355.9375,
"completions/mean_terminated_length": 303.91668701171875,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.1504,
"grad_norm": 3.192103862762451,
"kl": 0.0250701904296875,
"learning_rate": 1e-06,
"loss": 0.167,
"num_tokens": 2492518.0,
"reward": 0.02103330008685589,
"reward_std": 0.0090586943551898,
"rewards/bleu_reward_func/mean": 0.02103330008685589,
"rewards/bleu_reward_func/std": 0.01017869170755148,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 350.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 60.375,
"completions/mean_terminated_length": 60.375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.1512,
"grad_norm": 10.600973129272461,
"kl": 0.31732177734375,
"learning_rate": 1e-06,
"loss": -0.0974,
"num_tokens": 2503642.0,
"reward": 0.2223111093044281,
"reward_std": 0.05318839102983475,
"rewards/bleu_reward_func/mean": 0.2223111093044281,
"rewards/bleu_reward_func/std": 0.1549021303653717,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 468.3125,
"completions/mean_terminated_length": 384.9090881347656,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.152,
"grad_norm": 2.152480363845825,
"kl": 0.02301025390625,
"learning_rate": 1e-06,
"loss": -0.0043,
"num_tokens": 2521540.0,
"reward": 0.04742058366537094,
"reward_std": 0.0165211483836174,
"rewards/bleu_reward_func/mean": 0.04742058366537094,
"rewards/bleu_reward_func/std": 0.038380105048418045,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 295.5625,
"completions/mean_terminated_length": 147.4736785888672,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.1528,
"grad_norm": 3.0485126972198486,
"kl": 0.0381622314453125,
"learning_rate": 1e-06,
"loss": 0.5548,
"num_tokens": 2536702.0,
"reward": 0.059137165546417236,
"reward_std": 0.029524236917495728,
"rewards/bleu_reward_func/mean": 0.059137165546417236,
"rewards/bleu_reward_func/std": 0.04191603511571884,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 305.4375,
"completions/mean_terminated_length": 211.5454559326172,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.1536,
"grad_norm": 3.727417230606079,
"kl": 0.06072998046875,
"learning_rate": 1e-06,
"loss": 0.0741,
"num_tokens": 2553892.0,
"reward": 0.06053918972611427,
"reward_std": 0.025174250826239586,
"rewards/bleu_reward_func/mean": 0.06053918972611427,
"rewards/bleu_reward_func/std": 0.03798559308052063,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 327.0625,
"completions/mean_terminated_length": 216.10000610351562,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.1544,
"grad_norm": 22.730863571166992,
"kl": 0.1234130859375,
"learning_rate": 1e-06,
"loss": -0.1216,
"num_tokens": 2569950.0,
"reward": 0.14068183302879333,
"reward_std": 0.05201031640172005,
"rewards/bleu_reward_func/mean": 0.14068183302879333,
"rewards/bleu_reward_func/std": 0.1718810796737671,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 189.71875,
"completions/mean_terminated_length": 82.29167175292969,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.1552,
"grad_norm": 5.675025939941406,
"kl": 0.063934326171875,
"learning_rate": 1e-06,
"loss": -0.0105,
"num_tokens": 2579693.0,
"reward": 0.08947663754224777,
"reward_std": 0.029948215931653976,
"rewards/bleu_reward_func/mean": 0.08947663754224777,
"rewards/bleu_reward_func/std": 0.06868135929107666,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 210.5625,
"completions/mean_terminated_length": 167.5,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.156,
"grad_norm": 6.797698974609375,
"kl": 0.17242431640625,
"learning_rate": 1e-06,
"loss": -0.0415,
"num_tokens": 2592967.0,
"reward": 0.16623055934906006,
"reward_std": 0.08808746933937073,
"rewards/bleu_reward_func/mean": 0.16623055934906006,
"rewards/bleu_reward_func/std": 0.17983676493167877,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 277.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 119.09375,
"completions/mean_terminated_length": 119.09375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1568,
"grad_norm": 5.494751453399658,
"kl": 0.05902099609375,
"learning_rate": 1e-06,
"loss": 0.2727,
"num_tokens": 2602042.0,
"reward": 0.17185799777507782,
"reward_std": 0.10617370158433914,
"rewards/bleu_reward_func/mean": 0.17185799777507782,
"rewards/bleu_reward_func/std": 0.16121239960193634,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 368.5,
"completions/mean_terminated_length": 293.3333435058594,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1576,
"grad_norm": 4.1480302810668945,
"kl": 0.0312652587890625,
"learning_rate": 1e-06,
"loss": -0.1879,
"num_tokens": 2615826.0,
"reward": 0.051297686994075775,
"reward_std": 0.018504546955227852,
"rewards/bleu_reward_func/mean": 0.051297686994075775,
"rewards/bleu_reward_func/std": 0.034977275878190994,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 267.53125,
"completions/mean_terminated_length": 251.2333526611328,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.1584,
"grad_norm": 4.113983631134033,
"kl": 0.027557373046875,
"learning_rate": 1e-06,
"loss": 0.0749,
"num_tokens": 2626275.0,
"reward": 0.054141815751791,
"reward_std": 0.02476467750966549,
"rewards/bleu_reward_func/mean": 0.054141815751791,
"rewards/bleu_reward_func/std": 0.07109448313713074,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 112.96875,
"completions/mean_terminated_length": 112.96875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1592,
"grad_norm": 7.432074546813965,
"kl": 0.1759033203125,
"learning_rate": 1e-06,
"loss": 0.0648,
"num_tokens": 2633962.0,
"reward": 0.16682901978492737,
"reward_std": 0.07138749957084656,
"rewards/bleu_reward_func/mean": 0.16682901978492737,
"rewards/bleu_reward_func/std": 0.15276572108268738,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 451.78125,
"completions/mean_terminated_length": 404.9444580078125,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"epoch": 0.16,
"grad_norm": 2.110192060470581,
"kl": 0.022369384765625,
"learning_rate": 1e-06,
"loss": -0.0268,
"num_tokens": 2653971.0,
"reward": 0.11942745745182037,
"reward_std": 0.02005620300769806,
"rewards/bleu_reward_func/mean": 0.11942745745182037,
"rewards/bleu_reward_func/std": 0.09454692155122757,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 185.9375,
"completions/mean_terminated_length": 110.69231414794922,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.1608,
"grad_norm": 6.2729973793029785,
"kl": 0.063720703125,
"learning_rate": 1e-06,
"loss": -0.0659,
"num_tokens": 2664601.0,
"reward": 0.03557516261935234,
"reward_std": 0.021523961797356606,
"rewards/bleu_reward_func/mean": 0.03557516261935234,
"rewards/bleu_reward_func/std": 0.02618589997291565,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 292.5,
"completions/mean_terminated_length": 121.77777862548828,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1616,
"grad_norm": 5.936282157897949,
"kl": 0.0358734130859375,
"learning_rate": 1e-06,
"loss": -0.2742,
"num_tokens": 2679849.0,
"reward": 0.038136985152959824,
"reward_std": 0.022807471454143524,
"rewards/bleu_reward_func/mean": 0.038136985152959824,
"rewards/bleu_reward_func/std": 0.061121899634599686,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 230.96875,
"completions/mean_terminated_length": 221.90321350097656,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.1624,
"grad_norm": 8.785550117492676,
"kl": 0.1523590087890625,
"learning_rate": 1e-06,
"loss": -0.2049,
"num_tokens": 2693024.0,
"reward": 0.1289938986301422,
"reward_std": 0.045512765645980835,
"rewards/bleu_reward_func/mean": 0.1289938986301422,
"rewards/bleu_reward_func/std": 0.09638386219739914,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 361.8125,
"completions/mean_terminated_length": 168.71429443359375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1632,
"grad_norm": 6.617871284484863,
"kl": 0.0502777099609375,
"learning_rate": 1e-06,
"loss": 0.1304,
"num_tokens": 2710354.0,
"reward": 0.033049020916223526,
"reward_std": 0.017362549901008606,
"rewards/bleu_reward_func/mean": 0.033049020916223526,
"rewards/bleu_reward_func/std": 0.026102159172296524,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 281.4375,
"completions/mean_terminated_length": 176.63636779785156,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.164,
"grad_norm": 3.961705446243286,
"kl": 0.104736328125,
"learning_rate": 1e-06,
"loss": -0.0209,
"num_tokens": 2724680.0,
"reward": 0.1750263273715973,
"reward_std": 0.02830299735069275,
"rewards/bleu_reward_func/mean": 0.1750263273715973,
"rewards/bleu_reward_func/std": 0.13747908174991608,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 287.28125,
"completions/mean_terminated_length": 255.1785888671875,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"epoch": 0.1648,
"grad_norm": 3.098118305206299,
"kl": 0.020477294921875,
"learning_rate": 1e-06,
"loss": -0.1863,
"num_tokens": 2736209.0,
"reward": 0.06041261553764343,
"reward_std": 0.033261410892009735,
"rewards/bleu_reward_func/mean": 0.06041261553764343,
"rewards/bleu_reward_func/std": 0.046081364154815674,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 429.0,
"completions/mean_length": 230.1875,
"completions/mean_terminated_length": 136.25,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.1656,
"grad_norm": 6.539205551147461,
"kl": 0.0445404052734375,
"learning_rate": 1e-06,
"loss": 0.0949,
"num_tokens": 2749503.0,
"reward": 0.039952248334884644,
"reward_std": 0.05510722100734711,
"rewards/bleu_reward_func/mean": 0.039952248334884644,
"rewards/bleu_reward_func/std": 0.08833327889442444,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 385.8125,
"completions/mean_terminated_length": 223.57144165039062,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.1664,
"grad_norm": 3.262167453765869,
"kl": 0.0330657958984375,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 2763289.0,
"reward": 0.06319095194339752,
"reward_std": 0.021728292107582092,
"rewards/bleu_reward_func/mean": 0.06319095194339752,
"rewards/bleu_reward_func/std": 0.03750937059521675,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 182.71875,
"completions/mean_terminated_length": 172.09677124023438,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.1672,
"grad_norm": 6.667765140533447,
"kl": 0.113616943359375,
"learning_rate": 1e-06,
"loss": 0.3285,
"num_tokens": 2773104.0,
"reward": 0.1846303939819336,
"reward_std": 0.16774994134902954,
"rewards/bleu_reward_func/mean": 0.1846303939819336,
"rewards/bleu_reward_func/std": 0.20520828664302826,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 204.6875,
"completions/mean_terminated_length": 147.7777862548828,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.168,
"grad_norm": 5.794483661651611,
"kl": 0.09796142578125,
"learning_rate": 1e-06,
"loss": 0.1363,
"num_tokens": 2787670.0,
"reward": 0.09086121618747711,
"reward_std": 0.052026841789484024,
"rewards/bleu_reward_func/mean": 0.09086121618747711,
"rewards/bleu_reward_func/std": 0.09278357774019241,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 367.0,
"completions/mean_length": 399.8125,
"completions/mean_terminated_length": 113.11111450195312,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.1688,
"grad_norm": 11.576338768005371,
"kl": 0.0574493408203125,
"learning_rate": 1e-06,
"loss": 0.1204,
"num_tokens": 2805864.0,
"reward": 0.023652518168091774,
"reward_std": 0.01210303045809269,
"rewards/bleu_reward_func/mean": 0.023652518168091774,
"rewards/bleu_reward_func/std": 0.02501726523041725,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 157.6875,
"completions/mean_terminated_length": 58.47999954223633,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.1696,
"grad_norm": 10.228835105895996,
"kl": 0.1674041748046875,
"learning_rate": 1e-06,
"loss": 0.0582,
"num_tokens": 2816758.0,
"reward": 0.13800185918807983,
"reward_std": 0.047296687960624695,
"rewards/bleu_reward_func/mean": 0.13800185918807983,
"rewards/bleu_reward_func/std": 0.0863277018070221,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 402.15625,
"completions/mean_terminated_length": 359.1739196777344,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.1704,
"grad_norm": 2.593717336654663,
"kl": 0.01934814453125,
"learning_rate": 1e-06,
"loss": -0.0091,
"num_tokens": 2833347.0,
"reward": 0.05193600431084633,
"reward_std": 0.018484318628907204,
"rewards/bleu_reward_func/mean": 0.05193600431084633,
"rewards/bleu_reward_func/std": 0.04251272976398468,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 509.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 198.25,
"completions/mean_terminated_length": 198.25,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1712,
"grad_norm": 6.071621894836426,
"kl": 0.057373046875,
"learning_rate": 1e-06,
"loss": -0.1354,
"num_tokens": 2841195.0,
"reward": 0.062206219881772995,
"reward_std": 0.03749649226665497,
"rewards/bleu_reward_func/mean": 0.062206219881772995,
"rewards/bleu_reward_func/std": 0.0528765432536602,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 246.40625,
"completions/mean_terminated_length": 218.9310302734375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.172,
"grad_norm": 4.833486557006836,
"kl": 0.068695068359375,
"learning_rate": 1e-06,
"loss": 0.1631,
"num_tokens": 2851224.0,
"reward": 0.06542235612869263,
"reward_std": 0.03771442174911499,
"rewards/bleu_reward_func/mean": 0.06542235612869263,
"rewards/bleu_reward_func/std": 0.0579860620200634,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 465.15625,
"completions/mean_terminated_length": 412.0666809082031,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"epoch": 0.1728,
"grad_norm": 2.1820828914642334,
"kl": 0.0188446044921875,
"learning_rate": 1e-06,
"loss": 0.0277,
"num_tokens": 2870765.0,
"reward": 0.06440776586532593,
"reward_std": 0.013088207691907883,
"rewards/bleu_reward_func/mean": 0.06440776586532593,
"rewards/bleu_reward_func/std": 0.06307429075241089,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 224.0625,
"completions/mean_terminated_length": 170.74073791503906,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1736,
"grad_norm": 9.596390724182129,
"kl": 0.2492218017578125,
"learning_rate": 1e-06,
"loss": 0.1621,
"num_tokens": 2882151.0,
"reward": 0.15283548831939697,
"reward_std": 0.08103044331073761,
"rewards/bleu_reward_func/mean": 0.15283548831939697,
"rewards/bleu_reward_func/std": 0.13223250210285187,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 464.09375,
"completions/mean_terminated_length": 320.375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1744,
"grad_norm": 2.1536099910736084,
"kl": 0.02032470703125,
"learning_rate": 1e-06,
"loss": 0.0402,
"num_tokens": 2900658.0,
"reward": 0.02000538259744644,
"reward_std": 0.008671639487147331,
"rewards/bleu_reward_func/mean": 0.02000538259744644,
"rewards/bleu_reward_func/std": 0.01867109164595604,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 155.5,
"completions/mean_terminated_length": 55.68000030517578,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1752,
"grad_norm": 8.202893257141113,
"kl": 0.2074127197265625,
"learning_rate": 1e-06,
"loss": -0.0157,
"num_tokens": 2909778.0,
"reward": 0.1296558678150177,
"reward_std": 0.04394569993019104,
"rewards/bleu_reward_func/mean": 0.1296558678150177,
"rewards/bleu_reward_func/std": 0.05605300888419151,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 350.40625,
"completions/mean_terminated_length": 188.8125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.176,
"grad_norm": 5.133015155792236,
"kl": 0.0847930908203125,
"learning_rate": 1e-06,
"loss": 0.2562,
"num_tokens": 2926239.0,
"reward": 0.1607290506362915,
"reward_std": 0.12061528861522675,
"rewards/bleu_reward_func/mean": 0.1607290506362915,
"rewards/bleu_reward_func/std": 0.19297951459884644,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 377.09375,
"completions/mean_terminated_length": 306.4285888671875,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.1768,
"grad_norm": 2.917404890060425,
"kl": 0.02947998046875,
"learning_rate": 1e-06,
"loss": 0.0902,
"num_tokens": 2940970.0,
"reward": 0.05033531412482262,
"reward_std": 0.015085380524396896,
"rewards/bleu_reward_func/mean": 0.05033531412482262,
"rewards/bleu_reward_func/std": 0.03601166605949402,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 307.0,
"completions/max_terminated_length": 307.0,
"completions/mean_length": 98.625,
"completions/mean_terminated_length": 98.625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.1776,
"grad_norm": 9.739842414855957,
"kl": 0.28759765625,
"learning_rate": 1e-06,
"loss": 0.1954,
"num_tokens": 2951942.0,
"reward": 0.18511344492435455,
"reward_std": 0.09618590772151947,
"rewards/bleu_reward_func/mean": 0.18511344492435455,
"rewards/bleu_reward_func/std": 0.13407698273658752,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 155.1875,
"completions/mean_terminated_length": 131.40000915527344,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1784,
"grad_norm": 5.931830883026123,
"kl": 0.082611083984375,
"learning_rate": 1e-06,
"loss": 0.0924,
"num_tokens": 2959460.0,
"reward": 0.07271347939968109,
"reward_std": 0.05200031027197838,
"rewards/bleu_reward_func/mean": 0.07271347939968109,
"rewards/bleu_reward_func/std": 0.06765022873878479,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 130.71875,
"completions/mean_terminated_length": 105.30000305175781,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.1792,
"grad_norm": 7.8368730545043945,
"kl": 0.09417724609375,
"learning_rate": 1e-06,
"loss": 0.1685,
"num_tokens": 2966491.0,
"reward": 0.0899183601140976,
"reward_std": 0.05122753232717514,
"rewards/bleu_reward_func/mean": 0.0899183601140976,
"rewards/bleu_reward_func/std": 0.11120127141475677,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 367.5,
"completions/mean_terminated_length": 268.631591796875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.18,
"grad_norm": 3.555055618286133,
"kl": 0.0318603515625,
"learning_rate": 1e-06,
"loss": -0.0636,
"num_tokens": 2982515.0,
"reward": 0.11773502081632614,
"reward_std": 0.046606093645095825,
"rewards/bleu_reward_func/mean": 0.11773502081632614,
"rewards/bleu_reward_func/std": 0.15673232078552246,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 505.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 122.65625,
"completions/mean_terminated_length": 122.65625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.1808,
"grad_norm": 10.452176094055176,
"kl": 0.1519775390625,
"learning_rate": 1e-06,
"loss": 0.0183,
"num_tokens": 2991512.0,
"reward": 0.13446207344532013,
"reward_std": 0.060547836124897,
"rewards/bleu_reward_func/mean": 0.13446207344532013,
"rewards/bleu_reward_func/std": 0.07454977184534073,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 345.6875,
"completions/mean_terminated_length": 258.5714416503906,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.1816,
"grad_norm": 2.964317560195923,
"kl": 0.038421630859375,
"learning_rate": 1e-06,
"loss": 0.1194,
"num_tokens": 3005678.0,
"reward": 0.14132392406463623,
"reward_std": 0.05001860111951828,
"rewards/bleu_reward_func/mean": 0.14132392406463623,
"rewards/bleu_reward_func/std": 0.08175285160541534,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 390.125,
"completions/mean_terminated_length": 122.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.1824,
"grad_norm": 5.049752235412598,
"kl": 0.04425048828125,
"learning_rate": 1e-06,
"loss": 0.1754,
"num_tokens": 3021434.0,
"reward": 0.04336467757821083,
"reward_std": 0.018742987886071205,
"rewards/bleu_reward_func/mean": 0.04336467757821083,
"rewards/bleu_reward_func/std": 0.03402964025735855,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 364.0625,
"completions/mean_terminated_length": 249.0,
"completions/min_length": 111.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.1832,
"grad_norm": 3.026240348815918,
"kl": 0.0245361328125,
"learning_rate": 1e-06,
"loss": -0.2846,
"num_tokens": 3040956.0,
"reward": 0.028285246342420578,
"reward_std": 0.018473699688911438,
"rewards/bleu_reward_func/mean": 0.028285246342420578,
"rewards/bleu_reward_func/std": 0.02460222877562046,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 370.1875,
"completions/mean_terminated_length": 330.47998046875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.184,
"grad_norm": 2.621922731399536,
"kl": 0.0340728759765625,
"learning_rate": 1e-06,
"loss": -0.0699,
"num_tokens": 3058866.0,
"reward": 0.18184542655944824,
"reward_std": 0.06604617834091187,
"rewards/bleu_reward_func/mean": 0.18184542655944824,
"rewards/bleu_reward_func/std": 0.16794371604919434,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 213.71875,
"completions/mean_terminated_length": 97.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1848,
"grad_norm": 5.496671676635742,
"kl": 0.11822509765625,
"learning_rate": 1e-06,
"loss": 0.4021,
"num_tokens": 3071713.0,
"reward": 0.22397759556770325,
"reward_std": 0.09391038119792938,
"rewards/bleu_reward_func/mean": 0.22397759556770325,
"rewards/bleu_reward_func/std": 0.19180122017860413,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 267.0,
"completions/mean_length": 197.84375,
"completions/mean_terminated_length": 93.125,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.1856,
"grad_norm": 3.808242082595825,
"kl": 0.04571533203125,
"learning_rate": 1e-06,
"loss": 0.0821,
"num_tokens": 3079892.0,
"reward": 0.060666900128126144,
"reward_std": 0.029011715203523636,
"rewards/bleu_reward_func/mean": 0.060666900128126144,
"rewards/bleu_reward_func/std": 0.0762709304690361,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 441.0,
"completions/mean_length": 160.40625,
"completions/mean_terminated_length": 61.959999084472656,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.1864,
"grad_norm": 11.2310791015625,
"kl": 0.160797119140625,
"learning_rate": 1e-06,
"loss": 0.2881,
"num_tokens": 3087689.0,
"reward": 0.07089974731206894,
"reward_std": 0.03123306669294834,
"rewards/bleu_reward_func/mean": 0.07089974731206894,
"rewards/bleu_reward_func/std": 0.06456828862428665,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 437.0,
"completions/max_terminated_length": 437.0,
"completions/mean_length": 66.875,
"completions/mean_terminated_length": 66.875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.1872,
"grad_norm": 13.989295959472656,
"kl": 0.3311767578125,
"learning_rate": 1e-06,
"loss": -0.0331,
"num_tokens": 3093357.0,
"reward": 0.15325351059436798,
"reward_std": 0.0506255105137825,
"rewards/bleu_reward_func/mean": 0.15325351059436798,
"rewards/bleu_reward_func/std": 0.19497260451316833,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 260.75,
"completions/mean_terminated_length": 162.43478393554688,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.188,
"grad_norm": 7.557122230529785,
"kl": 0.173126220703125,
"learning_rate": 1e-06,
"loss": 0.1592,
"num_tokens": 3105749.0,
"reward": 0.20930011570453644,
"reward_std": 0.06161898747086525,
"rewards/bleu_reward_func/mean": 0.20930011570453644,
"rewards/bleu_reward_func/std": 0.2159973680973053,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 404.9375,
"completions/mean_terminated_length": 310.4705810546875,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1888,
"grad_norm": 4.613722324371338,
"kl": 0.025421142578125,
"learning_rate": 1e-06,
"loss": -0.1006,
"num_tokens": 3121795.0,
"reward": 0.02748030610382557,
"reward_std": 0.0075658103451132774,
"rewards/bleu_reward_func/mean": 0.02748030610382557,
"rewards/bleu_reward_func/std": 0.03438537195324898,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 389.09375,
"completions/mean_terminated_length": 266.1875,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.1896,
"grad_norm": 2.435314178466797,
"kl": 0.0276031494140625,
"learning_rate": 1e-06,
"loss": -0.1746,
"num_tokens": 3140006.0,
"reward": 0.10853572189807892,
"reward_std": 0.05605427548289299,
"rewards/bleu_reward_func/mean": 0.10853572189807892,
"rewards/bleu_reward_func/std": 0.1485956311225891,
"step": 237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 61.0,
"completions/mean_length": 149.78125,
"completions/mean_terminated_length": 29.041667938232422,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1904,
"grad_norm": 8.839442253112793,
"kl": 0.2632598876953125,
"learning_rate": 1e-06,
"loss": 0.0665,
"num_tokens": 3149743.0,
"reward": 0.13384486734867096,
"reward_std": 0.03735985979437828,
"rewards/bleu_reward_func/mean": 0.13384486734867096,
"rewards/bleu_reward_func/std": 0.17275770008563995,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 261.625,
"completions/mean_terminated_length": 178.1666717529297,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.1912,
"grad_norm": 4.326257228851318,
"kl": 0.14703369140625,
"learning_rate": 1e-06,
"loss": -0.0395,
"num_tokens": 3163195.0,
"reward": 0.16435688734054565,
"reward_std": 0.051772814244031906,
"rewards/bleu_reward_func/mean": 0.16435688734054565,
"rewards/bleu_reward_func/std": 0.13062016665935516,
"step": 239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 359.4375,
"completions/mean_terminated_length": 255.05262756347656,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.192,
"grad_norm": 6.709453582763672,
"kl": 0.097503662109375,
"learning_rate": 1e-06,
"loss": -0.0418,
"num_tokens": 3180209.0,
"reward": 0.10101380944252014,
"reward_std": 0.030364379286766052,
"rewards/bleu_reward_func/mean": 0.10101380944252014,
"rewards/bleu_reward_func/std": 0.08647928386926651,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 185.0,
"completions/mean_terminated_length": 93.43999481201172,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.1928,
"grad_norm": 9.118388175964355,
"kl": 0.14984130859375,
"learning_rate": 1e-06,
"loss": -0.0512,
"num_tokens": 3189425.0,
"reward": 0.19255727529525757,
"reward_std": 0.03786986321210861,
"rewards/bleu_reward_func/mean": 0.19255727529525757,
"rewards/bleu_reward_func/std": 0.18927834928035736,
"step": 241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 162.4375,
"completions/mean_terminated_length": 81.76923370361328,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.1936,
"grad_norm": 7.5658745765686035,
"kl": 0.1087493896484375,
"learning_rate": 1e-06,
"loss": 0.11,
"num_tokens": 3196911.0,
"reward": 0.08898752182722092,
"reward_std": 0.01980067417025566,
"rewards/bleu_reward_func/mean": 0.08898752182722092,
"rewards/bleu_reward_func/std": 0.09810609370470047,
"step": 242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 238.8125,
"completions/mean_terminated_length": 175.7692413330078,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.1944,
"grad_norm": 3.6591224670410156,
"kl": 0.068145751953125,
"learning_rate": 1e-06,
"loss": 0.0677,
"num_tokens": 3212161.0,
"reward": 0.16356298327445984,
"reward_std": 0.08266205340623856,
"rewards/bleu_reward_func/mean": 0.16356298327445984,
"rewards/bleu_reward_func/std": 0.17177340388298035,
"step": 243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 369.0625,
"completions/mean_terminated_length": 294.19049072265625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.1952,
"grad_norm": 2.8115499019622803,
"kl": 0.032623291015625,
"learning_rate": 1e-06,
"loss": 0.0943,
"num_tokens": 3228483.0,
"reward": 0.06906401365995407,
"reward_std": 0.025964463129639626,
"rewards/bleu_reward_func/mean": 0.06906401365995407,
"rewards/bleu_reward_func/std": 0.044564370065927505,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 253.875,
"completions/mean_terminated_length": 194.3076934814453,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.196,
"grad_norm": 6.153151512145996,
"kl": 0.072845458984375,
"learning_rate": 1e-06,
"loss": 0.1336,
"num_tokens": 3238447.0,
"reward": 0.05225534737110138,
"reward_std": 0.019162572920322418,
"rewards/bleu_reward_func/mean": 0.05225534737110138,
"rewards/bleu_reward_func/std": 0.04069560393691063,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 357.6875,
"completions/mean_terminated_length": 237.6666717529297,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.1968,
"grad_norm": 4.332682132720947,
"kl": 0.074615478515625,
"learning_rate": 1e-06,
"loss": -0.0015,
"num_tokens": 3252661.0,
"reward": 0.06644366681575775,
"reward_std": 0.029834389686584473,
"rewards/bleu_reward_func/mean": 0.06644366681575775,
"rewards/bleu_reward_func/std": 0.0527600534260273,
"step": 246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 433.0,
"completions/mean_length": 280.84375,
"completions/mean_terminated_length": 203.7916717529297,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.1976,
"grad_norm": 3.9714043140411377,
"kl": 0.046600341796875,
"learning_rate": 1e-06,
"loss": 0.0574,
"num_tokens": 3263392.0,
"reward": 0.04084426164627075,
"reward_std": 0.022724341601133347,
"rewards/bleu_reward_func/mean": 0.04084426164627075,
"rewards/bleu_reward_func/std": 0.03625248372554779,
"step": 247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 246.1875,
"completions/mean_terminated_length": 171.75999450683594,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.1984,
"grad_norm": 7.287817478179932,
"kl": 0.128570556640625,
"learning_rate": 1e-06,
"loss": 0.0058,
"num_tokens": 3274190.0,
"reward": 0.05778396502137184,
"reward_std": 0.020291190594434738,
"rewards/bleu_reward_func/mean": 0.05778396502137184,
"rewards/bleu_reward_func/std": 0.046611472964286804,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 317.21875,
"completions/mean_terminated_length": 183.94737243652344,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1992,
"grad_norm": 9.650996208190918,
"kl": 0.1335906982421875,
"learning_rate": 1e-06,
"loss": -0.0956,
"num_tokens": 3288605.0,
"reward": 0.15271537005901337,
"reward_std": 0.0891089141368866,
"rewards/bleu_reward_func/mean": 0.15271537005901337,
"rewards/bleu_reward_func/std": 0.1993638128042221,
"step": 249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 315.125,
"completions/mean_terminated_length": 238.0869598388672,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.2,
"grad_norm": 8.128390312194824,
"kl": 0.073394775390625,
"learning_rate": 1e-06,
"loss": 0.1129,
"num_tokens": 3303449.0,
"reward": 0.05582565814256668,
"reward_std": 0.04732588678598404,
"rewards/bleu_reward_func/mean": 0.05582565814256668,
"rewards/bleu_reward_func/std": 0.06975270062685013,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 397.34375,
"completions/mean_terminated_length": 282.6875,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.2008,
"grad_norm": 2.3954126834869385,
"kl": 0.027587890625,
"learning_rate": 1e-06,
"loss": -0.0758,
"num_tokens": 3322684.0,
"reward": 0.20381565392017365,
"reward_std": 0.06331950426101685,
"rewards/bleu_reward_func/mean": 0.20381565392017365,
"rewards/bleu_reward_func/std": 0.30689555406570435,
"step": 251
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 187.75,
"completions/mean_terminated_length": 177.29031372070312,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2016,
"grad_norm": 10.365123748779297,
"kl": 0.403076171875,
"learning_rate": 1e-06,
"loss": -0.0409,
"num_tokens": 3332612.0,
"reward": 0.09179520606994629,
"reward_std": 0.042515259236097336,
"rewards/bleu_reward_func/mean": 0.09179520606994629,
"rewards/bleu_reward_func/std": 0.06000783294439316,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 256.4375,
"completions/mean_terminated_length": 184.87998962402344,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.2024,
"grad_norm": 5.642463207244873,
"kl": 0.1347198486328125,
"learning_rate": 1e-06,
"loss": -0.0626,
"num_tokens": 3347706.0,
"reward": 0.12519359588623047,
"reward_std": 0.036009326577186584,
"rewards/bleu_reward_func/mean": 0.12519359588623047,
"rewards/bleu_reward_func/std": 0.1556256264448166,
"step": 253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 133.4375,
"completions/mean_terminated_length": 79.35714721679688,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.2032,
"grad_norm": 13.66992473602295,
"kl": 0.30645751953125,
"learning_rate": 1e-06,
"loss": -0.1315,
"num_tokens": 3353864.0,
"reward": 0.10196495056152344,
"reward_std": 0.05300650745630264,
"rewards/bleu_reward_func/mean": 0.10196495056152344,
"rewards/bleu_reward_func/std": 0.09023614972829819,
"step": 254
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 267.6875,
"completions/mean_terminated_length": 232.7857208251953,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.204,
"grad_norm": 24.040653228759766,
"kl": 0.062896728515625,
"learning_rate": 1e-06,
"loss": -0.1232,
"num_tokens": 3365758.0,
"reward": 0.03941156342625618,
"reward_std": 0.017305800691246986,
"rewards/bleu_reward_func/mean": 0.03941156342625618,
"rewards/bleu_reward_func/std": 0.02295033633708954,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 420.21875,
"completions/mean_terminated_length": 316.20001220703125,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.2048,
"grad_norm": 2.9320602416992188,
"kl": 0.033233642578125,
"learning_rate": 1e-06,
"loss": -0.1371,
"num_tokens": 3382765.0,
"reward": 0.05339156836271286,
"reward_std": 0.02982841432094574,
"rewards/bleu_reward_func/mean": 0.05339156836271286,
"rewards/bleu_reward_func/std": 0.07343700528144836,
"step": 256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 247.8125,
"completions/mean_terminated_length": 173.83999633789062,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2056,
"grad_norm": 8.614324569702148,
"kl": 0.1400146484375,
"learning_rate": 1e-06,
"loss": 0.1332,
"num_tokens": 3394391.0,
"reward": 0.06851230561733246,
"reward_std": 0.04152427613735199,
"rewards/bleu_reward_func/mean": 0.06851230561733246,
"rewards/bleu_reward_func/std": 0.056356508284807205,
"step": 257
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 365.625,
"completions/mean_terminated_length": 199.73333740234375,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.2064,
"grad_norm": 6.318526744842529,
"kl": 0.099365234375,
"learning_rate": 1e-06,
"loss": -0.0091,
"num_tokens": 3411195.0,
"reward": 0.08351869136095047,
"reward_std": 0.012093533761799335,
"rewards/bleu_reward_func/mean": 0.08351869136095047,
"rewards/bleu_reward_func/std": 0.08073550462722778,
"step": 258
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 134.65625,
"completions/mean_terminated_length": 109.50000762939453,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2072,
"grad_norm": 7.35445499420166,
"kl": 0.2371826171875,
"learning_rate": 1e-06,
"loss": 0.4692,
"num_tokens": 3419136.0,
"reward": 0.15089674293994904,
"reward_std": 0.06239618360996246,
"rewards/bleu_reward_func/mean": 0.15089674293994904,
"rewards/bleu_reward_func/std": 0.09912555664777756,
"step": 259
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 242.46875,
"completions/mean_terminated_length": 180.2692413330078,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.208,
"grad_norm": 4.740394592285156,
"kl": 0.08489990234375,
"learning_rate": 1e-06,
"loss": 0.0594,
"num_tokens": 3432127.0,
"reward": 0.05275239422917366,
"reward_std": 0.050225820392370224,
"rewards/bleu_reward_func/mean": 0.05275239422917366,
"rewards/bleu_reward_func/std": 0.07898835092782974,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 331.84375,
"completions/mean_terminated_length": 223.75,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.2088,
"grad_norm": 3.1740782260894775,
"kl": 0.05401611328125,
"learning_rate": 1e-06,
"loss": -0.0923,
"num_tokens": 3446746.0,
"reward": 0.12386887520551682,
"reward_std": 0.031204696744680405,
"rewards/bleu_reward_func/mean": 0.12386887520551682,
"rewards/bleu_reward_func/std": 0.1644604653120041,
"step": 261
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 329.5,
"completions/mean_terminated_length": 268.66668701171875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.2096,
"grad_norm": 2.937896728515625,
"kl": 0.033477783203125,
"learning_rate": 1e-06,
"loss": -0.0197,
"num_tokens": 3460890.0,
"reward": 0.05950773134827614,
"reward_std": 0.017293047159910202,
"rewards/bleu_reward_func/mean": 0.05950773134827614,
"rewards/bleu_reward_func/std": 0.04094443470239639,
"step": 262
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 455.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 135.40625,
"completions/mean_terminated_length": 135.40625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2104,
"grad_norm": 8.865147590637207,
"kl": 0.2249755859375,
"learning_rate": 1e-06,
"loss": -0.0363,
"num_tokens": 3475103.0,
"reward": 0.20508863031864166,
"reward_std": 0.040958937257528305,
"rewards/bleu_reward_func/mean": 0.20508863031864166,
"rewards/bleu_reward_func/std": 0.14616157114505768,
"step": 263
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 267.8125,
"completions/mean_terminated_length": 186.4166717529297,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.2112,
"grad_norm": 9.684611320495605,
"kl": 0.241973876953125,
"learning_rate": 1e-06,
"loss": 0.0071,
"num_tokens": 3487049.0,
"reward": 0.098166324198246,
"reward_std": 0.040819209069013596,
"rewards/bleu_reward_func/mean": 0.098166324198246,
"rewards/bleu_reward_func/std": 0.08471043407917023,
"step": 264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 260.875,
"completions/mean_terminated_length": 129.3333282470703,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.212,
"grad_norm": 10.798442840576172,
"kl": 0.1309814453125,
"learning_rate": 1e-06,
"loss": 0.3087,
"num_tokens": 3501029.0,
"reward": 0.12524467706680298,
"reward_std": 0.05395754426717758,
"rewards/bleu_reward_func/mean": 0.12524467706680298,
"rewards/bleu_reward_func/std": 0.1178852915763855,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 205.71875,
"completions/mean_terminated_length": 103.625,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2128,
"grad_norm": 6.346302032470703,
"kl": 0.132049560546875,
"learning_rate": 1e-06,
"loss": 0.0884,
"num_tokens": 3511372.0,
"reward": 0.10632273554801941,
"reward_std": 0.041688427329063416,
"rewards/bleu_reward_func/mean": 0.10632273554801941,
"rewards/bleu_reward_func/std": 0.09963962435722351,
"step": 266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 231.4375,
"completions/mean_terminated_length": 137.9166717529297,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.2136,
"grad_norm": 9.553611755371094,
"kl": 0.2503204345703125,
"learning_rate": 1e-06,
"loss": -0.085,
"num_tokens": 3524570.0,
"reward": 0.08381873369216919,
"reward_std": 0.026928268373012543,
"rewards/bleu_reward_func/mean": 0.08381873369216919,
"rewards/bleu_reward_func/std": 0.06075910106301308,
"step": 267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 266.8125,
"completions/mean_terminated_length": 198.1599884033203,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.2144,
"grad_norm": 5.0754289627075195,
"kl": 0.09002685546875,
"learning_rate": 1e-06,
"loss": -0.0985,
"num_tokens": 3535156.0,
"reward": 0.04936995357275009,
"reward_std": 0.02683193050324917,
"rewards/bleu_reward_func/mean": 0.04936995357275009,
"rewards/bleu_reward_func/std": 0.05894342064857483,
"step": 268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 423.0,
"completions/mean_length": 432.5,
"completions/mean_terminated_length": 300.0,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.2152,
"grad_norm": 2.118546724319458,
"kl": 0.0207061767578125,
"learning_rate": 1e-06,
"loss": -0.0481,
"num_tokens": 3555804.0,
"reward": 0.05241474509239197,
"reward_std": 0.019338509067893028,
"rewards/bleu_reward_func/mean": 0.05241474509239197,
"rewards/bleu_reward_func/std": 0.06824250519275665,
"step": 269
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 272.625,
"completions/mean_terminated_length": 192.83334350585938,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.216,
"grad_norm": 3.938976526260376,
"kl": 0.042999267578125,
"learning_rate": 1e-06,
"loss": -0.1052,
"num_tokens": 3572328.0,
"reward": 0.21750634908676147,
"reward_std": 0.06779822707176208,
"rewards/bleu_reward_func/mean": 0.21750634908676147,
"rewards/bleu_reward_func/std": 0.28914642333984375,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 293.3125,
"completions/mean_terminated_length": 178.76190185546875,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.2168,
"grad_norm": 3.788853645324707,
"kl": 0.041900634765625,
"learning_rate": 1e-06,
"loss": 0.0279,
"num_tokens": 3587050.0,
"reward": 0.04385410249233246,
"reward_std": 0.030311163514852524,
"rewards/bleu_reward_func/mean": 0.04385410249233246,
"rewards/bleu_reward_func/std": 0.047958169132471085,
"step": 271
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 232.25,
"completions/mean_terminated_length": 139.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2176,
"grad_norm": 12.908583641052246,
"kl": 0.464263916015625,
"learning_rate": 1e-06,
"loss": 0.2404,
"num_tokens": 3598874.0,
"reward": 0.1504618227481842,
"reward_std": 0.04004389047622681,
"rewards/bleu_reward_func/mean": 0.1504618227481842,
"rewards/bleu_reward_func/std": 0.16537794470787048,
"step": 272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 209.03125,
"completions/mean_terminated_length": 199.258056640625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.2184,
"grad_norm": 6.985334873199463,
"kl": 0.16595458984375,
"learning_rate": 1e-06,
"loss": -0.0408,
"num_tokens": 3610515.0,
"reward": 0.21218228340148926,
"reward_std": 0.09676108509302139,
"rewards/bleu_reward_func/mean": 0.21218228340148926,
"rewards/bleu_reward_func/std": 0.22182048857212067,
"step": 273
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 333.9375,
"completions/mean_terminated_length": 195.44444274902344,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.2192,
"grad_norm": 3.482099771499634,
"kl": 0.03741455078125,
"learning_rate": 1e-06,
"loss": -0.1819,
"num_tokens": 3623921.0,
"reward": 0.11982771754264832,
"reward_std": 0.063297338783741,
"rewards/bleu_reward_func/mean": 0.11982771754264832,
"rewards/bleu_reward_func/std": 0.09915972501039505,
"step": 274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 292.90625,
"completions/mean_terminated_length": 261.6071472167969,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.22,
"grad_norm": 4.1525559425354,
"kl": 0.041351318359375,
"learning_rate": 1e-06,
"loss": -0.0478,
"num_tokens": 3634918.0,
"reward": 0.05305434763431549,
"reward_std": 0.019571729004383087,
"rewards/bleu_reward_func/mean": 0.05305434763431549,
"rewards/bleu_reward_func/std": 0.04326590150594711,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 229.59375,
"completions/mean_terminated_length": 135.45834350585938,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2208,
"grad_norm": 14.463852882385254,
"kl": 0.257415771484375,
"learning_rate": 1e-06,
"loss": -0.0433,
"num_tokens": 3647289.0,
"reward": 0.23456689715385437,
"reward_std": 0.08336643874645233,
"rewards/bleu_reward_func/mean": 0.23456689715385437,
"rewards/bleu_reward_func/std": 0.2258531004190445,
"step": 276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 193.375,
"completions/mean_terminated_length": 87.16667175292969,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.2216,
"grad_norm": 21.709369659423828,
"kl": 0.1391448974609375,
"learning_rate": 1e-06,
"loss": -0.0033,
"num_tokens": 3656709.0,
"reward": 0.16775630414485931,
"reward_std": 0.03647792339324951,
"rewards/bleu_reward_func/mean": 0.16775630414485931,
"rewards/bleu_reward_func/std": 0.15713484585285187,
"step": 277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 219.78125,
"completions/mean_terminated_length": 122.375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2224,
"grad_norm": 7.275771141052246,
"kl": 0.22491455078125,
"learning_rate": 1e-06,
"loss": 0.072,
"num_tokens": 3670158.0,
"reward": 0.1231408566236496,
"reward_std": 0.022272268310189247,
"rewards/bleu_reward_func/mean": 0.1231408566236496,
"rewards/bleu_reward_func/std": 0.1077708899974823,
"step": 278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 339.46875,
"completions/mean_terminated_length": 261.04547119140625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2232,
"grad_norm": 3.146303176879883,
"kl": 0.0390625,
"learning_rate": 1e-06,
"loss": -0.1333,
"num_tokens": 3683925.0,
"reward": 0.0675458312034607,
"reward_std": 0.017428681254386902,
"rewards/bleu_reward_func/mean": 0.0675458312034607,
"rewards/bleu_reward_func/std": 0.05334463343024254,
"step": 279
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 321.9375,
"completions/mean_terminated_length": 268.7200012207031,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.224,
"grad_norm": 8.726150512695312,
"kl": 0.156707763671875,
"learning_rate": 1e-06,
"loss": 0.0175,
"num_tokens": 3699747.0,
"reward": 0.11248552799224854,
"reward_std": 0.03111671656370163,
"rewards/bleu_reward_func/mean": 0.11248552799224854,
"rewards/bleu_reward_func/std": 0.08908119797706604,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 297.375,
"completions/mean_terminated_length": 247.84616088867188,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.2248,
"grad_norm": 3.081026077270508,
"kl": 0.021881103515625,
"learning_rate": 1e-06,
"loss": -0.1217,
"num_tokens": 3712759.0,
"reward": 0.09313205629587173,
"reward_std": 0.03823218122124672,
"rewards/bleu_reward_func/mean": 0.09313205629587173,
"rewards/bleu_reward_func/std": 0.06713149696588516,
"step": 281
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 316.46875,
"completions/mean_terminated_length": 251.2916717529297,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.2256,
"grad_norm": 3.1275222301483154,
"kl": 0.035675048828125,
"learning_rate": 1e-06,
"loss": -0.1645,
"num_tokens": 3725598.0,
"reward": 0.032498396933078766,
"reward_std": 0.018658628687262535,
"rewards/bleu_reward_func/mean": 0.032498396933078766,
"rewards/bleu_reward_func/std": 0.019405974075198174,
"step": 282
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 194.71875,
"completions/mean_terminated_length": 173.56668090820312,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.2264,
"grad_norm": 4.276275634765625,
"kl": 0.08013916015625,
"learning_rate": 1e-06,
"loss": 0.0746,
"num_tokens": 3736861.0,
"reward": 0.12694165110588074,
"reward_std": 0.04432743415236473,
"rewards/bleu_reward_func/mean": 0.12694165110588074,
"rewards/bleu_reward_func/std": 0.13188457489013672,
"step": 283
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 268.375,
"completions/mean_terminated_length": 101.68421173095703,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.2272,
"grad_norm": 7.712943077087402,
"kl": 0.271392822265625,
"learning_rate": 1e-06,
"loss": -0.0787,
"num_tokens": 3751545.0,
"reward": 0.1655203104019165,
"reward_std": 0.08383054286241531,
"rewards/bleu_reward_func/mean": 0.1655203104019165,
"rewards/bleu_reward_func/std": 0.1525241732597351,
"step": 284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 342.125,
"completions/mean_terminated_length": 225.89474487304688,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.228,
"grad_norm": 3.6280434131622314,
"kl": 0.046173095703125,
"learning_rate": 1e-06,
"loss": -0.1072,
"num_tokens": 3765781.0,
"reward": 0.042814724147319794,
"reward_std": 0.026553209871053696,
"rewards/bleu_reward_func/mean": 0.042814724147319794,
"rewards/bleu_reward_func/std": 0.03911494091153145,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 208.96875,
"completions/mean_terminated_length": 199.19354248046875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.2288,
"grad_norm": 7.985737323760986,
"kl": 0.12091064453125,
"learning_rate": 1e-06,
"loss": -0.1044,
"num_tokens": 3779220.0,
"reward": 0.11331084370613098,
"reward_std": 0.025679122656583786,
"rewards/bleu_reward_func/mean": 0.11331084370613098,
"rewards/bleu_reward_func/std": 0.16165612637996674,
"step": 286
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 351.3125,
"completions/mean_terminated_length": 209.5294189453125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.2296,
"grad_norm": 3.912679433822632,
"kl": 0.026214599609375,
"learning_rate": 1e-06,
"loss": 0.1208,
"num_tokens": 3794550.0,
"reward": 0.01693039759993553,
"reward_std": 0.0203933697193861,
"rewards/bleu_reward_func/mean": 0.01693039759993553,
"rewards/bleu_reward_func/std": 0.02536601759493351,
"step": 287
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 274.65625,
"completions/mean_terminated_length": 240.75001525878906,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.2304,
"grad_norm": 6.236807346343994,
"kl": 0.1007537841796875,
"learning_rate": 1e-06,
"loss": -0.0844,
"num_tokens": 3808595.0,
"reward": 0.13739125430583954,
"reward_std": 0.042728863656520844,
"rewards/bleu_reward_func/mean": 0.13739125430583954,
"rewards/bleu_reward_func/std": 0.09978168457746506,
"step": 288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 260.15625,
"completions/mean_terminated_length": 87.84210968017578,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2312,
"grad_norm": 12.087539672851562,
"kl": 0.2603912353515625,
"learning_rate": 1e-06,
"loss": 0.1979,
"num_tokens": 3821552.0,
"reward": 0.1537414938211441,
"reward_std": 0.04864966496825218,
"rewards/bleu_reward_func/mean": 0.1537414938211441,
"rewards/bleu_reward_func/std": 0.08011970669031143,
"step": 289
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 274.0,
"completions/mean_length": 214.40625,
"completions/mean_terminated_length": 79.13636779785156,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.232,
"grad_norm": 6.511635780334473,
"kl": 0.12432861328125,
"learning_rate": 1e-06,
"loss": 0.2609,
"num_tokens": 3838117.0,
"reward": 0.19495005905628204,
"reward_std": 0.09461250901222229,
"rewards/bleu_reward_func/mean": 0.19495005905628204,
"rewards/bleu_reward_func/std": 0.20672400295734406,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 322.78125,
"completions/mean_terminated_length": 223.6666717529297,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.2328,
"grad_norm": 4.590160369873047,
"kl": 0.127716064453125,
"learning_rate": 1e-06,
"loss": -0.1225,
"num_tokens": 3853958.0,
"reward": 0.1360878348350525,
"reward_std": 0.03053300268948078,
"rewards/bleu_reward_func/mean": 0.1360878348350525,
"rewards/bleu_reward_func/std": 0.17878462374210358,
"step": 291
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 462.6875,
"completions/mean_terminated_length": 390.6153869628906,
"completions/min_length": 315.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.2336,
"grad_norm": 2.3334367275238037,
"kl": 0.030426025390625,
"learning_rate": 1e-06,
"loss": -0.0405,
"num_tokens": 3875596.0,
"reward": 0.06421424448490143,
"reward_std": 0.02072659507393837,
"rewards/bleu_reward_func/mean": 0.06421424448490143,
"rewards/bleu_reward_func/std": 0.02574257366359234,
"step": 292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 325.03125,
"completions/mean_terminated_length": 251.86956787109375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2344,
"grad_norm": 3.4702064990997314,
"kl": 0.03289794921875,
"learning_rate": 1e-06,
"loss": 0.1122,
"num_tokens": 3892021.0,
"reward": 0.04875369742512703,
"reward_std": 0.020287783816456795,
"rewards/bleu_reward_func/mean": 0.04875369742512703,
"rewards/bleu_reward_func/std": 0.0285445898771286,
"step": 293
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 107.375,
"completions/mean_terminated_length": 94.32257843017578,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.2352,
"grad_norm": 25.415559768676758,
"kl": 0.23876953125,
"learning_rate": 1e-06,
"loss": 0.1099,
"num_tokens": 3903457.0,
"reward": 0.12372880429029465,
"reward_std": 0.02668173238635063,
"rewards/bleu_reward_func/mean": 0.12372880429029465,
"rewards/bleu_reward_func/std": 0.12391357123851776,
"step": 294
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 244.09375,
"completions/mean_terminated_length": 122.31818389892578,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.236,
"grad_norm": 8.319884300231934,
"kl": 0.14251708984375,
"learning_rate": 1e-06,
"loss": -0.034,
"num_tokens": 3917028.0,
"reward": 0.16006486117839813,
"reward_std": 0.02584708109498024,
"rewards/bleu_reward_func/mean": 0.16006486117839813,
"rewards/bleu_reward_func/std": 0.1484500914812088,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 278.875,
"completions/mean_terminated_length": 139.0,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.2368,
"grad_norm": 4.291149616241455,
"kl": 0.131500244140625,
"learning_rate": 1e-06,
"loss": -0.192,
"num_tokens": 3929400.0,
"reward": 0.09954051673412323,
"reward_std": 0.03838299959897995,
"rewards/bleu_reward_func/mean": 0.09954051673412323,
"rewards/bleu_reward_func/std": 0.13533763587474823,
"step": 296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 288.9375,
"completions/mean_terminated_length": 187.5454559326172,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.2376,
"grad_norm": 5.0546417236328125,
"kl": 0.10858154296875,
"learning_rate": 1e-06,
"loss": -0.0687,
"num_tokens": 3942222.0,
"reward": 0.16907253861427307,
"reward_std": 0.03968513384461403,
"rewards/bleu_reward_func/mean": 0.16907253861427307,
"rewards/bleu_reward_func/std": 0.10800375789403915,
"step": 297
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 309.1875,
"completions/mean_terminated_length": 151.44444274902344,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.2384,
"grad_norm": 8.339001655578613,
"kl": 0.1490631103515625,
"learning_rate": 1e-06,
"loss": -0.0121,
"num_tokens": 3954316.0,
"reward": 0.06681232899427414,
"reward_std": 0.015474791638553143,
"rewards/bleu_reward_func/mean": 0.06681232899427414,
"rewards/bleu_reward_func/std": 0.06617429107427597,
"step": 298
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 285.59375,
"completions/mean_terminated_length": 109.5,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2392,
"grad_norm": 3.8715662956237793,
"kl": 0.050140380859375,
"learning_rate": 1e-06,
"loss": 0.1729,
"num_tokens": 3967087.0,
"reward": 0.1066230833530426,
"reward_std": 0.08889298141002655,
"rewards/bleu_reward_func/mean": 0.1066230833530426,
"rewards/bleu_reward_func/std": 0.14223438501358032,
"step": 299
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 120.9375,
"completions/mean_terminated_length": 120.9375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.24,
"grad_norm": 9.271559715270996,
"kl": 0.223388671875,
"learning_rate": 1e-06,
"loss": 0.0531,
"num_tokens": 3977797.0,
"reward": 0.09239183366298676,
"reward_std": 0.04012807458639145,
"rewards/bleu_reward_func/mean": 0.09239183366298676,
"rewards/bleu_reward_func/std": 0.07950045168399811,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 270.0,
"completions/mean_terminated_length": 202.239990234375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.2408,
"grad_norm": 8.53159236907959,
"kl": 0.18048095703125,
"learning_rate": 1e-06,
"loss": -0.1823,
"num_tokens": 3988157.0,
"reward": 0.04499006271362305,
"reward_std": 0.015048853121697903,
"rewards/bleu_reward_func/mean": 0.04499006271362305,
"rewards/bleu_reward_func/std": 0.036676883697509766,
"step": 301
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 224.0625,
"completions/mean_terminated_length": 157.61538696289062,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2416,
"grad_norm": 6.0366997718811035,
"kl": 0.099029541015625,
"learning_rate": 1e-06,
"loss": -0.1824,
"num_tokens": 4000135.0,
"reward": 0.1630059778690338,
"reward_std": 0.04720958322286606,
"rewards/bleu_reward_func/mean": 0.1630059778690338,
"rewards/bleu_reward_func/std": 0.1834760457277298,
"step": 302
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 169.3125,
"completions/mean_terminated_length": 90.23077392578125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2424,
"grad_norm": 9.543852806091309,
"kl": 0.35198974609375,
"learning_rate": 1e-06,
"loss": -0.2399,
"num_tokens": 4009009.0,
"reward": 0.06052142754197121,
"reward_std": 0.026765264570713043,
"rewards/bleu_reward_func/mean": 0.06052142754197121,
"rewards/bleu_reward_func/std": 0.052253786474466324,
"step": 303
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 365.78125,
"completions/mean_terminated_length": 265.7368469238281,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.2432,
"grad_norm": 3.007157564163208,
"kl": 0.0393524169921875,
"learning_rate": 1e-06,
"loss": 0.032,
"num_tokens": 4023690.0,
"reward": 0.025675857439637184,
"reward_std": 0.013720525428652763,
"rewards/bleu_reward_func/mean": 0.025675857439637184,
"rewards/bleu_reward_func/std": 0.022033939138054848,
"step": 304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 506.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 158.6875,
"completions/mean_terminated_length": 158.6875,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.244,
"grad_norm": 7.10622501373291,
"kl": 0.21661376953125,
"learning_rate": 1e-06,
"loss": 0.166,
"num_tokens": 4033848.0,
"reward": 0.19492439925670624,
"reward_std": 0.0628402829170227,
"rewards/bleu_reward_func/mean": 0.19492439925670624,
"rewards/bleu_reward_func/std": 0.22491495311260223,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 345.53125,
"completions/mean_terminated_length": 290.04168701171875,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.2448,
"grad_norm": 4.572328090667725,
"kl": 0.099700927734375,
"learning_rate": 1e-06,
"loss": 0.1016,
"num_tokens": 4047897.0,
"reward": 0.12647973001003265,
"reward_std": 0.03362637385725975,
"rewards/bleu_reward_func/mean": 0.12647973001003265,
"rewards/bleu_reward_func/std": 0.08024211972951889,
"step": 306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 217.6875,
"completions/mean_terminated_length": 175.6428680419922,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.2456,
"grad_norm": 7.489211082458496,
"kl": 0.17584228515625,
"learning_rate": 1e-06,
"loss": -0.1361,
"num_tokens": 4062471.0,
"reward": 0.15859398245811462,
"reward_std": 0.059820279479026794,
"rewards/bleu_reward_func/mean": 0.15859398245811462,
"rewards/bleu_reward_func/std": 0.11927466094493866,
"step": 307
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 304.0625,
"completions/mean_terminated_length": 209.5454559326172,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.2464,
"grad_norm": 6.605251789093018,
"kl": 0.15716552734375,
"learning_rate": 1e-06,
"loss": 0.1835,
"num_tokens": 4079553.0,
"reward": 0.048189468681812286,
"reward_std": 0.01783904619514942,
"rewards/bleu_reward_func/mean": 0.048189468681812286,
"rewards/bleu_reward_func/std": 0.037260618060827255,
"step": 308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 422.0,
"completions/mean_length": 150.46875,
"completions/mean_terminated_length": 67.03846740722656,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.2472,
"grad_norm": 20.150175094604492,
"kl": 0.31695556640625,
"learning_rate": 1e-06,
"loss": -0.2639,
"num_tokens": 4089536.0,
"reward": 0.19017143547534943,
"reward_std": 0.06138678267598152,
"rewards/bleu_reward_func/mean": 0.19017143547534943,
"rewards/bleu_reward_func/std": 0.25128865242004395,
"step": 309
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 171.3125,
"completions/mean_terminated_length": 136.0689697265625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.248,
"grad_norm": 6.626379013061523,
"kl": 0.103729248046875,
"learning_rate": 1e-06,
"loss": -0.1912,
"num_tokens": 4100146.0,
"reward": 0.08903198689222336,
"reward_std": 0.029232412576675415,
"rewards/bleu_reward_func/mean": 0.08903198689222336,
"rewards/bleu_reward_func/std": 0.09126507490873337,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 216.46875,
"completions/mean_terminated_length": 117.95833587646484,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2488,
"grad_norm": 5.2524285316467285,
"kl": 0.0589141845703125,
"learning_rate": 1e-06,
"loss": 0.3318,
"num_tokens": 4112841.0,
"reward": 0.07349678874015808,
"reward_std": 0.05337782949209213,
"rewards/bleu_reward_func/mean": 0.07349678874015808,
"rewards/bleu_reward_func/std": 0.10531707108020782,
"step": 311
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 285.0,
"completions/mean_terminated_length": 196.17391967773438,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.2496,
"grad_norm": 5.209020137786865,
"kl": 0.11212158203125,
"learning_rate": 1e-06,
"loss": -0.1362,
"num_tokens": 4125369.0,
"reward": 0.1321243941783905,
"reward_std": 0.035379908978939056,
"rewards/bleu_reward_func/mean": 0.1321243941783905,
"rewards/bleu_reward_func/std": 0.12779219448566437,
"step": 312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 397.1875,
"completions/mean_terminated_length": 205.83334350585938,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2504,
"grad_norm": 2.491729974746704,
"kl": 0.029266357421875,
"learning_rate": 1e-06,
"loss": -0.0819,
"num_tokens": 4142671.0,
"reward": 0.021221335977315903,
"reward_std": 0.008927191607654095,
"rewards/bleu_reward_func/mean": 0.021221335977315903,
"rewards/bleu_reward_func/std": 0.01940017379820347,
"step": 313
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 276.8125,
"completions/mean_terminated_length": 222.53846740722656,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.2512,
"grad_norm": 3.1302947998046875,
"kl": 0.030731201171875,
"learning_rate": 1e-06,
"loss": -0.09,
"num_tokens": 4158681.0,
"reward": 0.18806447088718414,
"reward_std": 0.04276939481496811,
"rewards/bleu_reward_func/mean": 0.18806447088718414,
"rewards/bleu_reward_func/std": 0.2711097002029419,
"step": 314
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 202.71875,
"completions/mean_terminated_length": 182.10000610351562,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.252,
"grad_norm": 9.11577320098877,
"kl": 0.321502685546875,
"learning_rate": 1e-06,
"loss": 0.2469,
"num_tokens": 4168304.0,
"reward": 0.17324072122573853,
"reward_std": 0.07514998316764832,
"rewards/bleu_reward_func/mean": 0.17324072122573853,
"rewards/bleu_reward_func/std": 0.15059800446033478,
"step": 315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 310.65625,
"completions/mean_terminated_length": 133.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2528,
"grad_norm": 4.476902961730957,
"kl": 0.22100830078125,
"learning_rate": 1e-06,
"loss": -0.0695,
"num_tokens": 4183237.0,
"reward": 0.11044389009475708,
"reward_std": 0.04662460461258888,
"rewards/bleu_reward_func/mean": 0.11044389009475708,
"rewards/bleu_reward_func/std": 0.13189704716205597,
"step": 316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 326.40625,
"completions/mean_terminated_length": 264.54168701171875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.2536,
"grad_norm": 4.724470138549805,
"kl": 0.039764404296875,
"learning_rate": 1e-06,
"loss": 0.0133,
"num_tokens": 4196786.0,
"reward": 0.1738719940185547,
"reward_std": 0.06735121458768845,
"rewards/bleu_reward_func/mean": 0.1738719940185547,
"rewards/bleu_reward_func/std": 0.15234871208667755,
"step": 317
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 264.65625,
"completions/mean_terminated_length": 116.25,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2544,
"grad_norm": 7.755268096923828,
"kl": 0.23388671875,
"learning_rate": 1e-06,
"loss": 0.0091,
"num_tokens": 4211319.0,
"reward": 0.16174694895744324,
"reward_std": 0.04472574219107628,
"rewards/bleu_reward_func/mean": 0.16174694895744324,
"rewards/bleu_reward_func/std": 0.13533204793930054,
"step": 318
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 326.0625,
"completions/mean_terminated_length": 228.6666717529297,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.2552,
"grad_norm": 3.5100746154785156,
"kl": 0.0638427734375,
"learning_rate": 1e-06,
"loss": -0.0174,
"num_tokens": 4224641.0,
"reward": 0.14605101943016052,
"reward_std": 0.039064351469278336,
"rewards/bleu_reward_func/mean": 0.14605101943016052,
"rewards/bleu_reward_func/std": 0.1437525898218155,
"step": 319
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 387.5,
"completions/mean_terminated_length": 180.0,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.256,
"grad_norm": 3.499901056289673,
"kl": 0.03240966796875,
"learning_rate": 1e-06,
"loss": -0.101,
"num_tokens": 4244041.0,
"reward": 0.038129642605781555,
"reward_std": 0.0157744400203228,
"rewards/bleu_reward_func/mean": 0.038129642605781555,
"rewards/bleu_reward_func/std": 0.030961766839027405,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 235.75,
"completions/mean_terminated_length": 207.1724090576172,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2568,
"grad_norm": 6.800954341888428,
"kl": 0.172210693359375,
"learning_rate": 1e-06,
"loss": -0.2682,
"num_tokens": 4257425.0,
"reward": 0.08078090846538544,
"reward_std": 0.0318281352519989,
"rewards/bleu_reward_func/mean": 0.08078090846538544,
"rewards/bleu_reward_func/std": 0.060885149985551834,
"step": 321
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 52.0,
"completions/mean_length": 153.5,
"completions/mean_terminated_length": 34.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2576,
"grad_norm": 6.995741367340088,
"kl": 0.197662353515625,
"learning_rate": 1e-06,
"loss": 0.0321,
"num_tokens": 4270729.0,
"reward": 0.3046156167984009,
"reward_std": 0.045112840831279755,
"rewards/bleu_reward_func/mean": 0.3046156167984009,
"rewards/bleu_reward_func/std": 0.17106564342975616,
"step": 322
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 309.0625,
"completions/mean_terminated_length": 216.8181915283203,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2584,
"grad_norm": 8.159075736999512,
"kl": 0.11962890625,
"learning_rate": 1e-06,
"loss": 0.0599,
"num_tokens": 4286907.0,
"reward": 0.11749087274074554,
"reward_std": 0.04918123036623001,
"rewards/bleu_reward_func/mean": 0.11749087274074554,
"rewards/bleu_reward_func/std": 0.12518151104450226,
"step": 323
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 424.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 156.90625,
"completions/mean_terminated_length": 156.90625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2592,
"grad_norm": 7.079853057861328,
"kl": 0.09991455078125,
"learning_rate": 1e-06,
"loss": 0.0397,
"num_tokens": 4295536.0,
"reward": 0.11096417158842087,
"reward_std": 0.04051455110311508,
"rewards/bleu_reward_func/mean": 0.11096417158842087,
"rewards/bleu_reward_func/std": 0.1420901119709015,
"step": 324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 150.875,
"completions/mean_terminated_length": 49.7599983215332,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.26,
"grad_norm": 8.065258026123047,
"kl": 0.167816162109375,
"learning_rate": 1e-06,
"loss": -0.0243,
"num_tokens": 4306404.0,
"reward": 0.13756218552589417,
"reward_std": 0.02154640108346939,
"rewards/bleu_reward_func/mean": 0.13756218552589417,
"rewards/bleu_reward_func/std": 0.14523112773895264,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 445.0,
"completions/mean_length": 385.75,
"completions/mean_terminated_length": 310.0,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.2608,
"grad_norm": 2.441365957260132,
"kl": 0.019775390625,
"learning_rate": 1e-06,
"loss": 0.024,
"num_tokens": 4323836.0,
"reward": 0.023768192157149315,
"reward_std": 0.009069718420505524,
"rewards/bleu_reward_func/mean": 0.023768192157149315,
"rewards/bleu_reward_func/std": 0.029040560126304626,
"step": 326
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 319.46875,
"completions/mean_terminated_length": 203.9499969482422,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.2616,
"grad_norm": 5.7556071281433105,
"kl": 0.091705322265625,
"learning_rate": 1e-06,
"loss": 0.0042,
"num_tokens": 4338667.0,
"reward": 0.07871399819850922,
"reward_std": 0.03653344139456749,
"rewards/bleu_reward_func/mean": 0.07871399819850922,
"rewards/bleu_reward_func/std": 0.06572794169187546,
"step": 327
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 264.84375,
"completions/mean_terminated_length": 152.5,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.2624,
"grad_norm": 6.231250286102295,
"kl": 0.1138916015625,
"learning_rate": 1e-06,
"loss": -0.0458,
"num_tokens": 4351270.0,
"reward": 0.16190959513187408,
"reward_std": 0.02650507725775242,
"rewards/bleu_reward_func/mean": 0.16190959513187408,
"rewards/bleu_reward_func/std": 0.15018552541732788,
"step": 328
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 350.625,
"completions/mean_terminated_length": 277.2727355957031,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.2632,
"grad_norm": 2.828697681427002,
"kl": 0.02972412109375,
"learning_rate": 1e-06,
"loss": 0.0748,
"num_tokens": 4366018.0,
"reward": 0.07461819052696228,
"reward_std": 0.034676797688007355,
"rewards/bleu_reward_func/mean": 0.07461819052696228,
"rewards/bleu_reward_func/std": 0.10171358287334442,
"step": 329
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 330.96875,
"completions/mean_terminated_length": 171.23529052734375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.264,
"grad_norm": 7.326402187347412,
"kl": 0.0977630615234375,
"learning_rate": 1e-06,
"loss": 0.2768,
"num_tokens": 4378353.0,
"reward": 0.07485680282115936,
"reward_std": 0.04837151616811752,
"rewards/bleu_reward_func/mean": 0.07485680282115936,
"rewards/bleu_reward_func/std": 0.04874453693628311,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/max_terminated_length": 194.0,
"completions/mean_length": 65.40625,
"completions/mean_terminated_length": 65.40625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2648,
"grad_norm": 12.08074951171875,
"kl": 0.335693359375,
"learning_rate": 1e-06,
"loss": 0.1573,
"num_tokens": 4384062.0,
"reward": 0.19588544964790344,
"reward_std": 0.09824244678020477,
"rewards/bleu_reward_func/mean": 0.19588544964790344,
"rewards/bleu_reward_func/std": 0.16972649097442627,
"step": 331
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 270.65625,
"completions/mean_terminated_length": 203.0800018310547,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.2656,
"grad_norm": 4.561427593231201,
"kl": 0.039154052734375,
"learning_rate": 1e-06,
"loss": -0.0917,
"num_tokens": 4394427.0,
"reward": 0.06531640887260437,
"reward_std": 0.018873782828450203,
"rewards/bleu_reward_func/mean": 0.06531640887260437,
"rewards/bleu_reward_func/std": 0.059104837477207184,
"step": 332
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 200.25,
"completions/mean_terminated_length": 155.71429443359375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2664,
"grad_norm": 6.5239057540893555,
"kl": 0.169952392578125,
"learning_rate": 1e-06,
"loss": 0.0545,
"num_tokens": 4409739.0,
"reward": 0.23698079586029053,
"reward_std": 0.08829502761363983,
"rewards/bleu_reward_func/mean": 0.23698079586029053,
"rewards/bleu_reward_func/std": 0.2539888322353363,
"step": 333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 142.1875,
"completions/mean_terminated_length": 103.93103790283203,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.2672,
"grad_norm": 6.988838195800781,
"kl": 0.192413330078125,
"learning_rate": 1e-06,
"loss": 0.0242,
"num_tokens": 4420033.0,
"reward": 0.18931233882904053,
"reward_std": 0.06329823285341263,
"rewards/bleu_reward_func/mean": 0.18931233882904053,
"rewards/bleu_reward_func/std": 0.16267651319503784,
"step": 334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 149.71875,
"completions/mean_terminated_length": 66.11538696289062,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.268,
"grad_norm": 16.95305061340332,
"kl": 0.3260498046875,
"learning_rate": 1e-06,
"loss": 0.3727,
"num_tokens": 4429376.0,
"reward": 0.11154920607805252,
"reward_std": 0.06479852646589279,
"rewards/bleu_reward_func/mean": 0.11154920607805252,
"rewards/bleu_reward_func/std": 0.07707681506872177,
"step": 335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 216.125,
"completions/mean_terminated_length": 185.51724243164062,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2688,
"grad_norm": 12.891951560974121,
"kl": 0.154815673828125,
"learning_rate": 1e-06,
"loss": 0.1506,
"num_tokens": 4438340.0,
"reward": 0.11881305277347565,
"reward_std": 0.04300341382622719,
"rewards/bleu_reward_func/mean": 0.11881305277347565,
"rewards/bleu_reward_func/std": 0.11628168076276779,
"step": 336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 259.375,
"completions/mean_terminated_length": 127.04762268066406,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.2696,
"grad_norm": 8.3147554397583,
"kl": 0.0980682373046875,
"learning_rate": 1e-06,
"loss": 0.0185,
"num_tokens": 4451328.0,
"reward": 0.11791149526834488,
"reward_std": 0.02945806086063385,
"rewards/bleu_reward_func/mean": 0.11791149526834488,
"rewards/bleu_reward_func/std": 0.06387177854776382,
"step": 337
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 159.28125,
"completions/mean_terminated_length": 60.52000045776367,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.2704,
"grad_norm": 6.874416828155518,
"kl": 0.235107421875,
"learning_rate": 1e-06,
"loss": 0.1172,
"num_tokens": 4461785.0,
"reward": 0.18331755697727203,
"reward_std": 0.05733542889356613,
"rewards/bleu_reward_func/mean": 0.18331755697727203,
"rewards/bleu_reward_func/std": 0.17218343913555145,
"step": 338
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 206.1875,
"completions/mean_terminated_length": 120.55999755859375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.2712,
"grad_norm": 7.444963455200195,
"kl": 0.08843994140625,
"learning_rate": 1e-06,
"loss": 0.3417,
"num_tokens": 4471031.0,
"reward": 0.08221863210201263,
"reward_std": 0.030037853866815567,
"rewards/bleu_reward_func/mean": 0.08221863210201263,
"rewards/bleu_reward_func/std": 0.05527469143271446,
"step": 339
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 256.84375,
"completions/mean_terminated_length": 197.9615478515625,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.272,
"grad_norm": 6855.86328125,
"kl": 1.03955078125,
"learning_rate": 1e-06,
"loss": 0.0411,
"num_tokens": 4485834.0,
"reward": 0.13405509293079376,
"reward_std": 0.03707335144281387,
"rewards/bleu_reward_func/mean": 0.13405509293079376,
"rewards/bleu_reward_func/std": 0.15687085688114166,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 211.0,
"completions/mean_terminated_length": 141.53846740722656,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.2728,
"grad_norm": 8.717283248901367,
"kl": 0.128326416015625,
"learning_rate": 1e-06,
"loss": -0.0381,
"num_tokens": 4496202.0,
"reward": 0.0755915641784668,
"reward_std": 0.029588045552372932,
"rewards/bleu_reward_func/mean": 0.0755915641784668,
"rewards/bleu_reward_func/std": 0.05914263799786568,
"step": 341
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 213.15625,
"completions/mean_terminated_length": 129.47999572753906,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2736,
"grad_norm": 9.269394874572754,
"kl": 0.21234130859375,
"learning_rate": 1e-06,
"loss": -0.0787,
"num_tokens": 4505447.0,
"reward": 0.11310072988271713,
"reward_std": 0.035067904740571976,
"rewards/bleu_reward_func/mean": 0.11310072988271713,
"rewards/bleu_reward_func/std": 0.10819036513566971,
"step": 342
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 353.90625,
"completions/mean_terminated_length": 174.73333740234375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2744,
"grad_norm": 6.147165298461914,
"kl": 0.0384521484375,
"learning_rate": 1e-06,
"loss": -0.0413,
"num_tokens": 4519052.0,
"reward": 0.06785966455936432,
"reward_std": 0.039666250348091125,
"rewards/bleu_reward_func/mean": 0.06785966455936432,
"rewards/bleu_reward_func/std": 0.059012189507484436,
"step": 343
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 370.84375,
"completions/mean_terminated_length": 261.0555725097656,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2752,
"grad_norm": 6.257096767425537,
"kl": 0.170440673828125,
"learning_rate": 1e-06,
"loss": 0.0397,
"num_tokens": 4533975.0,
"reward": 0.05020497739315033,
"reward_std": 0.009127253666520119,
"rewards/bleu_reward_func/mean": 0.05020497739315033,
"rewards/bleu_reward_func/std": 0.04745229333639145,
"step": 344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 260.1875,
"completions/mean_terminated_length": 64.33333587646484,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.276,
"grad_norm": 8.694131851196289,
"kl": 0.40167236328125,
"learning_rate": 1e-06,
"loss": 0.124,
"num_tokens": 4548765.0,
"reward": 0.17815490067005157,
"reward_std": 0.04761611297726631,
"rewards/bleu_reward_func/mean": 0.17815490067005157,
"rewards/bleu_reward_func/std": 0.22018791735172272,
"step": 345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 334.78125,
"completions/mean_terminated_length": 275.7083435058594,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2768,
"grad_norm": 6.1226325035095215,
"kl": 0.105133056640625,
"learning_rate": 1e-06,
"loss": -0.0251,
"num_tokens": 4565158.0,
"reward": 0.09645688533782959,
"reward_std": 0.0746307447552681,
"rewards/bleu_reward_func/mean": 0.09645688533782959,
"rewards/bleu_reward_func/std": 0.1715475171804428,
"step": 346
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 98.46875,
"completions/mean_terminated_length": 98.46875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.2776,
"grad_norm": 8.647904396057129,
"kl": 0.328857421875,
"learning_rate": 1e-06,
"loss": -0.0264,
"num_tokens": 4576637.0,
"reward": 0.3595752716064453,
"reward_std": 0.09626303613185883,
"rewards/bleu_reward_func/mean": 0.3595752716064453,
"rewards/bleu_reward_func/std": 0.293544203042984,
"step": 347
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 239.0,
"completions/mean_length": 191.28125,
"completions/mean_terminated_length": 84.375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2784,
"grad_norm": 7.7827630043029785,
"kl": 0.24041748046875,
"learning_rate": 1e-06,
"loss": 0.0475,
"num_tokens": 4586230.0,
"reward": 0.2051679939031601,
"reward_std": 0.029646433889865875,
"rewards/bleu_reward_func/mean": 0.2051679939031601,
"rewards/bleu_reward_func/std": 0.20678655803203583,
"step": 348
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 320.5,
"completions/mean_terminated_length": 256.66668701171875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.2792,
"grad_norm": 8.593353271484375,
"kl": 0.149017333984375,
"learning_rate": 1e-06,
"loss": -0.0651,
"num_tokens": 4603070.0,
"reward": 0.1438911259174347,
"reward_std": 0.06431536376476288,
"rewards/bleu_reward_func/mean": 0.1438911259174347,
"rewards/bleu_reward_func/std": 0.22814705967903137,
"step": 349
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 381.96875,
"completions/mean_terminated_length": 191.92308044433594,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.28,
"grad_norm": 2.2874648571014404,
"kl": 0.023284912109375,
"learning_rate": 1e-06,
"loss": 0.0246,
"num_tokens": 4619757.0,
"reward": 0.19660863280296326,
"reward_std": 0.08571420609951019,
"rewards/bleu_reward_func/mean": 0.19660863280296326,
"rewards/bleu_reward_func/std": 0.2662343680858612,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 380.53125,
"completions/mean_terminated_length": 290.5789489746094,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.2808,
"grad_norm": 2.8600640296936035,
"kl": 0.02679443359375,
"learning_rate": 1e-06,
"loss": 0.082,
"num_tokens": 4636806.0,
"reward": 0.05401962995529175,
"reward_std": 0.019372381269931793,
"rewards/bleu_reward_func/mean": 0.05401962995529175,
"rewards/bleu_reward_func/std": 0.026677841320633888,
"step": 351
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 328.125,
"completions/mean_terminated_length": 244.5454559326172,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2816,
"grad_norm": 6.117258548736572,
"kl": 0.10986328125,
"learning_rate": 1e-06,
"loss": 0.162,
"num_tokens": 4649338.0,
"reward": 0.12430500984191895,
"reward_std": 0.046015314757823944,
"rewards/bleu_reward_func/mean": 0.12430500984191895,
"rewards/bleu_reward_func/std": 0.11290674656629562,
"step": 352
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 335.15625,
"completions/mean_terminated_length": 179.11764526367188,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.2824,
"grad_norm": 9.883430480957031,
"kl": 0.134429931640625,
"learning_rate": 1e-06,
"loss": -0.0897,
"num_tokens": 4664823.0,
"reward": 0.10318648815155029,
"reward_std": 0.040948014706373215,
"rewards/bleu_reward_func/mean": 0.10318648815155029,
"rewards/bleu_reward_func/std": 0.098084457218647,
"step": 353
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 425.0,
"completions/mean_length": 213.96875,
"completions/mean_terminated_length": 204.35482788085938,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.2832,
"grad_norm": 3.7569406032562256,
"kl": 0.0567626953125,
"learning_rate": 1e-06,
"loss": 0.276,
"num_tokens": 4673198.0,
"reward": 0.02880779653787613,
"reward_std": 0.02136135660111904,
"rewards/bleu_reward_func/mean": 0.02880779653787613,
"rewards/bleu_reward_func/std": 0.031262028962373734,
"step": 354
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 382.34375,
"completions/mean_terminated_length": 134.8181915283203,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.284,
"grad_norm": 5.402606010437012,
"kl": 0.059539794921875,
"learning_rate": 1e-06,
"loss": 0.0167,
"num_tokens": 4690033.0,
"reward": 0.11326944082975388,
"reward_std": 0.04008851572871208,
"rewards/bleu_reward_func/mean": 0.11326944082975388,
"rewards/bleu_reward_func/std": 0.1632446050643921,
"step": 355
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 184.0,
"completions/max_terminated_length": 184.0,
"completions/mean_length": 49.5,
"completions/mean_terminated_length": 49.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.2848,
"grad_norm": 15.920856475830078,
"kl": 0.2000732421875,
"learning_rate": 1e-06,
"loss": 0.1343,
"num_tokens": 4696953.0,
"reward": 0.1625998467206955,
"reward_std": 0.10141640901565552,
"rewards/bleu_reward_func/mean": 0.1625998467206955,
"rewards/bleu_reward_func/std": 0.12067051976919174,
"step": 356
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 328.0,
"completions/max_terminated_length": 328.0,
"completions/mean_length": 103.71875,
"completions/mean_terminated_length": 103.71875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2856,
"grad_norm": 32.86006546020508,
"kl": 0.153564453125,
"learning_rate": 1e-06,
"loss": 0.1497,
"num_tokens": 4705000.0,
"reward": 0.05853947252035141,
"reward_std": 0.014492938295006752,
"rewards/bleu_reward_func/mean": 0.05853947252035141,
"rewards/bleu_reward_func/std": 0.02192818373441696,
"step": 357
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 430.0,
"completions/max_terminated_length": 430.0,
"completions/mean_length": 110.96875,
"completions/mean_terminated_length": 110.96875,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.2864,
"grad_norm": 8.785351753234863,
"kl": 0.1767578125,
"learning_rate": 1e-06,
"loss": 0.0291,
"num_tokens": 4713815.0,
"reward": 0.256367951631546,
"reward_std": 0.06547890603542328,
"rewards/bleu_reward_func/mean": 0.256367951631546,
"rewards/bleu_reward_func/std": 0.2225809097290039,
"step": 358
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 333.21875,
"completions/mean_terminated_length": 175.47059631347656,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.2872,
"grad_norm": 3.714874744415283,
"kl": 0.0813140869140625,
"learning_rate": 1e-06,
"loss": 0.0176,
"num_tokens": 4732606.0,
"reward": 0.08705547451972961,
"reward_std": 0.02976841665804386,
"rewards/bleu_reward_func/mean": 0.08705547451972961,
"rewards/bleu_reward_func/std": 0.041370097547769547,
"step": 359
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 159.0,
"completions/mean_terminated_length": 147.61289978027344,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.288,
"grad_norm": 7.568475723266602,
"kl": 0.069976806640625,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 4742086.0,
"reward": 0.05895683914422989,
"reward_std": 0.036796510219573975,
"rewards/bleu_reward_func/mean": 0.05895683914422989,
"rewards/bleu_reward_func/std": 0.06153297796845436,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 303.34375,
"completions/mean_terminated_length": 221.69566345214844,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.2888,
"grad_norm": 3.495642900466919,
"kl": 0.033843994140625,
"learning_rate": 1e-06,
"loss": 0.1195,
"num_tokens": 4755153.0,
"reward": 0.024642691016197205,
"reward_std": 0.00707631791010499,
"rewards/bleu_reward_func/mean": 0.024642691016197205,
"rewards/bleu_reward_func/std": 0.01350654847919941,
"step": 361
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 349.59375,
"completions/mean_terminated_length": 238.4736785888672,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.2896,
"grad_norm": 3.2497663497924805,
"kl": 0.032928466796875,
"learning_rate": 1e-06,
"loss": 0.0909,
"num_tokens": 4768724.0,
"reward": 0.06024404242634773,
"reward_std": 0.029051221907138824,
"rewards/bleu_reward_func/mean": 0.06024404242634773,
"rewards/bleu_reward_func/std": 0.05113474279642105,
"step": 362
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 308.21875,
"completions/mean_terminated_length": 201.4761962890625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.2904,
"grad_norm": 3.932180643081665,
"kl": 0.0535736083984375,
"learning_rate": 1e-06,
"loss": -0.1216,
"num_tokens": 4784611.0,
"reward": 0.10957776010036469,
"reward_std": 0.018995165824890137,
"rewards/bleu_reward_func/mean": 0.10957776010036469,
"rewards/bleu_reward_func/std": 0.12744034826755524,
"step": 363
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 315.15625,
"completions/mean_terminated_length": 260.0400085449219,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.2912,
"grad_norm": 3.873363971710205,
"kl": 0.04693603515625,
"learning_rate": 1e-06,
"loss": 0.1157,
"num_tokens": 4798600.0,
"reward": 0.06850136816501617,
"reward_std": 0.03206296265125275,
"rewards/bleu_reward_func/mean": 0.06850136816501617,
"rewards/bleu_reward_func/std": 0.06299194693565369,
"step": 364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 301.28125,
"completions/mean_terminated_length": 205.5,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.292,
"grad_norm": 3.491849184036255,
"kl": 0.050079345703125,
"learning_rate": 1e-06,
"loss": 0.1634,
"num_tokens": 4812193.0,
"reward": 0.0632539913058281,
"reward_std": 0.04620906710624695,
"rewards/bleu_reward_func/mean": 0.0632539913058281,
"rewards/bleu_reward_func/std": 0.08490858227014542,
"step": 365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 320.40625,
"completions/mean_terminated_length": 266.7599792480469,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.2928,
"grad_norm": 10.243452072143555,
"kl": 0.1219482421875,
"learning_rate": 1e-06,
"loss": 0.0432,
"num_tokens": 4824134.0,
"reward": 0.0788659006357193,
"reward_std": 0.019495027139782906,
"rewards/bleu_reward_func/mean": 0.0788659006357193,
"rewards/bleu_reward_func/std": 0.05461956560611725,
"step": 366
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 357.40625,
"completions/mean_terminated_length": 296.9130554199219,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.2936,
"grad_norm": 2.715989351272583,
"kl": 0.036712646484375,
"learning_rate": 1e-06,
"loss": -0.1141,
"num_tokens": 4839219.0,
"reward": 0.1387082040309906,
"reward_std": 0.025043122470378876,
"rewards/bleu_reward_func/mean": 0.1387082040309906,
"rewards/bleu_reward_func/std": 0.14657536149024963,
"step": 367
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 418.0,
"completions/mean_terminated_length": 261.3333435058594,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.2944,
"grad_norm": 2.414018154144287,
"kl": 0.029937744140625,
"learning_rate": 1e-06,
"loss": 0.0066,
"num_tokens": 4857699.0,
"reward": 0.06751300394535065,
"reward_std": 0.05967854708433151,
"rewards/bleu_reward_func/mean": 0.06751300394535065,
"rewards/bleu_reward_func/std": 0.08448994904756546,
"step": 368
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 414.0,
"completions/max_terminated_length": 414.0,
"completions/mean_length": 50.09375,
"completions/mean_terminated_length": 50.09375,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.2952,
"grad_norm": 12.07331657409668,
"kl": 0.331298828125,
"learning_rate": 1e-06,
"loss": -0.167,
"num_tokens": 4865766.0,
"reward": 0.2235146164894104,
"reward_std": 0.06765347719192505,
"rewards/bleu_reward_func/mean": 0.2235146164894104,
"rewards/bleu_reward_func/std": 0.15006797015666962,
"step": 369
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 300.125,
"completions/mean_terminated_length": 155.15789794921875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.296,
"grad_norm": 6.003938674926758,
"kl": 0.062225341796875,
"learning_rate": 1e-06,
"loss": 0.0295,
"num_tokens": 4883786.0,
"reward": 0.09686341136693954,
"reward_std": 0.04255010187625885,
"rewards/bleu_reward_func/mean": 0.09686341136693954,
"rewards/bleu_reward_func/std": 0.11752825975418091,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 381.84375,
"completions/mean_terminated_length": 338.4583435058594,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.2968,
"grad_norm": 2.782743215560913,
"kl": 0.041107177734375,
"learning_rate": 1e-06,
"loss": -0.0931,
"num_tokens": 4898397.0,
"reward": 0.06518180668354034,
"reward_std": 0.017261603847146034,
"rewards/bleu_reward_func/mean": 0.06518180668354034,
"rewards/bleu_reward_func/std": 0.07592527568340302,
"step": 371
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 191.71875,
"completions/mean_terminated_length": 170.36666870117188,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.2976,
"grad_norm": 6.834630489349365,
"kl": 0.155029296875,
"learning_rate": 1e-06,
"loss": -0.0674,
"num_tokens": 4907564.0,
"reward": 0.0751166045665741,
"reward_std": 0.03539106994867325,
"rewards/bleu_reward_func/mean": 0.0751166045665741,
"rewards/bleu_reward_func/std": 0.03759034350514412,
"step": 372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 167.65625,
"completions/mean_terminated_length": 156.5483856201172,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.2984,
"grad_norm": 9.550743103027344,
"kl": 0.1636962890625,
"learning_rate": 1e-06,
"loss": 0.1533,
"num_tokens": 4916969.0,
"reward": 0.12691722810268402,
"reward_std": 0.019398069009184837,
"rewards/bleu_reward_func/mean": 0.12691722810268402,
"rewards/bleu_reward_func/std": 0.14723701775074005,
"step": 373
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 267.6875,
"completions/mean_terminated_length": 199.27999877929688,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.2992,
"grad_norm": 5.277988433837891,
"kl": 0.143890380859375,
"learning_rate": 1e-06,
"loss": -0.2024,
"num_tokens": 4930367.0,
"reward": 0.21388903260231018,
"reward_std": 0.0590648353099823,
"rewards/bleu_reward_func/mean": 0.21388903260231018,
"rewards/bleu_reward_func/std": 0.2627076506614685,
"step": 374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 337.59375,
"completions/mean_terminated_length": 246.23809814453125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3,
"grad_norm": 10.797468185424805,
"kl": 0.12530517578125,
"learning_rate": 1e-06,
"loss": -0.06,
"num_tokens": 4948002.0,
"reward": 0.1380675733089447,
"reward_std": 0.049179110676050186,
"rewards/bleu_reward_func/mean": 0.1380675733089447,
"rewards/bleu_reward_func/std": 0.14962899684906006,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 250.375,
"completions/mean_terminated_length": 163.1666717529297,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.3008,
"grad_norm": 6.259679794311523,
"kl": 0.1361083984375,
"learning_rate": 1e-06,
"loss": -0.0324,
"num_tokens": 4963942.0,
"reward": 0.2779002785682678,
"reward_std": 0.049215167760849,
"rewards/bleu_reward_func/mean": 0.2779002785682678,
"rewards/bleu_reward_func/std": 0.247111514210701,
"step": 376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 228.96875,
"completions/mean_terminated_length": 188.5357208251953,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.3016,
"grad_norm": 9.751809120178223,
"kl": 0.12164306640625,
"learning_rate": 1e-06,
"loss": -0.0784,
"num_tokens": 4974437.0,
"reward": 0.12611877918243408,
"reward_std": 0.05333450064063072,
"rewards/bleu_reward_func/mean": 0.12611877918243408,
"rewards/bleu_reward_func/std": 0.11847065389156342,
"step": 377
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 428.34375,
"completions/mean_terminated_length": 306.0769348144531,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.3024,
"grad_norm": 1.8198633193969727,
"kl": 0.025543212890625,
"learning_rate": 1e-06,
"loss": 0.0668,
"num_tokens": 4993880.0,
"reward": 0.07207944989204407,
"reward_std": 0.019526129588484764,
"rewards/bleu_reward_func/mean": 0.07207944989204407,
"rewards/bleu_reward_func/std": 0.06778865307569504,
"step": 378
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 63.40625,
"completions/mean_terminated_length": 63.40625,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.3032,
"grad_norm": 6.933629512786865,
"kl": 0.13250732421875,
"learning_rate": 1e-06,
"loss": 0.3354,
"num_tokens": 5001909.0,
"reward": 0.12609761953353882,
"reward_std": 0.07611958682537079,
"rewards/bleu_reward_func/mean": 0.12609761953353882,
"rewards/bleu_reward_func/std": 0.09586605429649353,
"step": 379
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 205.25,
"completions/mean_terminated_length": 148.44444274902344,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.304,
"grad_norm": 8.042766571044922,
"kl": 0.155029296875,
"learning_rate": 1e-06,
"loss": 0.1587,
"num_tokens": 5012533.0,
"reward": 0.18430504202842712,
"reward_std": 0.09831003099679947,
"rewards/bleu_reward_func/mean": 0.18430504202842712,
"rewards/bleu_reward_func/std": 0.1858755648136139,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 333.46875,
"completions/mean_terminated_length": 211.3157958984375,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.3048,
"grad_norm": 19.682376861572266,
"kl": 0.1099853515625,
"learning_rate": 1e-06,
"loss": 0.015,
"num_tokens": 5028972.0,
"reward": 0.09155917167663574,
"reward_std": 0.012800632044672966,
"rewards/bleu_reward_func/mean": 0.09155917167663574,
"rewards/bleu_reward_func/std": 0.1374584585428238,
"step": 381
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 301.6875,
"completions/mean_terminated_length": 253.1538543701172,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.3056,
"grad_norm": 3.2053592205047607,
"kl": 0.051788330078125,
"learning_rate": 1e-06,
"loss": 0.2446,
"num_tokens": 5041090.0,
"reward": 0.06323020905256271,
"reward_std": 0.032996732741594315,
"rewards/bleu_reward_func/mean": 0.06323020905256271,
"rewards/bleu_reward_func/std": 0.05562639981508255,
"step": 382
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 369.375,
"completions/mean_terminated_length": 160.92308044433594,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.3064,
"grad_norm": 7.353909492492676,
"kl": 0.07830810546875,
"learning_rate": 1e-06,
"loss": 0.0282,
"num_tokens": 5057798.0,
"reward": 0.1571401059627533,
"reward_std": 0.02875007688999176,
"rewards/bleu_reward_func/mean": 0.1571401059627533,
"rewards/bleu_reward_func/std": 0.20372198522090912,
"step": 383
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 316.84375,
"completions/mean_terminated_length": 199.75,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3072,
"grad_norm": 3.984431743621826,
"kl": 0.066986083984375,
"learning_rate": 1e-06,
"loss": 0.0376,
"num_tokens": 5073449.0,
"reward": 0.03315318748354912,
"reward_std": 0.038507476449012756,
"rewards/bleu_reward_func/mean": 0.03315318748354912,
"rewards/bleu_reward_func/std": 0.06562887132167816,
"step": 384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 319.375,
"completions/mean_terminated_length": 126.75,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.308,
"grad_norm": 6.559665203094482,
"kl": 0.0677490234375,
"learning_rate": 1e-06,
"loss": 0.3157,
"num_tokens": 5087821.0,
"reward": 0.06230534613132477,
"reward_std": 0.03765605762600899,
"rewards/bleu_reward_func/mean": 0.06230534613132477,
"rewards/bleu_reward_func/std": 0.07213454693555832,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 319.15625,
"completions/mean_terminated_length": 265.1600036621094,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.3088,
"grad_norm": 3.6326193809509277,
"kl": 0.05596923828125,
"learning_rate": 1e-06,
"loss": -0.0929,
"num_tokens": 5100618.0,
"reward": 0.04398781806230545,
"reward_std": 0.02026546560227871,
"rewards/bleu_reward_func/mean": 0.04398781806230545,
"rewards/bleu_reward_func/std": 0.042056936770677567,
"step": 386
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 326.0,
"completions/mean_terminated_length": 181.3333282470703,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.3096,
"grad_norm": 4.189205646514893,
"kl": 0.0577392578125,
"learning_rate": 1e-06,
"loss": 0.0616,
"num_tokens": 5118850.0,
"reward": 0.10049895197153091,
"reward_std": 0.035130538046360016,
"rewards/bleu_reward_func/mean": 0.10049895197153091,
"rewards/bleu_reward_func/std": 0.0897059291601181,
"step": 387
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 357.125,
"completions/mean_terminated_length": 251.15789794921875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.3104,
"grad_norm": 8.503087997436523,
"kl": 0.1228790283203125,
"learning_rate": 1e-06,
"loss": 0.1152,
"num_tokens": 5131574.0,
"reward": 0.10157294571399689,
"reward_std": 0.05235150083899498,
"rewards/bleu_reward_func/mean": 0.10157294571399689,
"rewards/bleu_reward_func/std": 0.11832693964242935,
"step": 388
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 261.0625,
"completions/mean_terminated_length": 244.33334350585938,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.3112,
"grad_norm": 7.511518478393555,
"kl": 0.0782470703125,
"learning_rate": 1e-06,
"loss": 0.1551,
"num_tokens": 5142288.0,
"reward": 0.05309104174375534,
"reward_std": 0.0195770300924778,
"rewards/bleu_reward_func/mean": 0.05309104174375534,
"rewards/bleu_reward_func/std": 0.03859832510352135,
"step": 389
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 136.0,
"completions/max_terminated_length": 136.0,
"completions/mean_length": 77.8125,
"completions/mean_terminated_length": 77.8125,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.312,
"grad_norm": 7.358268737792969,
"kl": 0.1142578125,
"learning_rate": 1e-06,
"loss": 0.0692,
"num_tokens": 5147226.0,
"reward": 0.2647009789943695,
"reward_std": 0.0788542777299881,
"rewards/bleu_reward_func/mean": 0.2647009789943695,
"rewards/bleu_reward_func/std": 0.3669854998588562,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 376.0,
"completions/mean_length": 331.1875,
"completions/mean_terminated_length": 126.26667022705078,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3128,
"grad_norm": 6.546727180480957,
"kl": 0.131866455078125,
"learning_rate": 1e-06,
"loss": 0.0216,
"num_tokens": 5162552.0,
"reward": 0.06478870660066605,
"reward_std": 0.016362179070711136,
"rewards/bleu_reward_func/mean": 0.06478870660066605,
"rewards/bleu_reward_func/std": 0.07661883533000946,
"step": 391
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 305.5,
"completions/mean_terminated_length": 211.63636779785156,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.3136,
"grad_norm": 3.7394042015075684,
"kl": 0.0411224365234375,
"learning_rate": 1e-06,
"loss": -0.0472,
"num_tokens": 5174632.0,
"reward": 0.07655475288629532,
"reward_std": 0.04063459113240242,
"rewards/bleu_reward_func/mean": 0.07655475288629532,
"rewards/bleu_reward_func/std": 0.05244217440485954,
"step": 392
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 299.4375,
"completions/mean_terminated_length": 239.9199981689453,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.3144,
"grad_norm": 3.130519151687622,
"kl": 0.036407470703125,
"learning_rate": 1e-06,
"loss": 0.0573,
"num_tokens": 5189038.0,
"reward": 0.08177624642848969,
"reward_std": 0.03700428456068039,
"rewards/bleu_reward_func/mean": 0.08177624642848969,
"rewards/bleu_reward_func/std": 0.07332108914852142,
"step": 393
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 175.0,
"completions/mean_terminated_length": 152.53334045410156,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.3152,
"grad_norm": 6.235530853271484,
"kl": 0.119140625,
"learning_rate": 1e-06,
"loss": -0.1315,
"num_tokens": 5199614.0,
"reward": 0.08668357878923416,
"reward_std": 0.029862932860851288,
"rewards/bleu_reward_func/mean": 0.08668357878923416,
"rewards/bleu_reward_func/std": 0.04458598420023918,
"step": 394
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 303.21875,
"completions/mean_terminated_length": 140.8333282470703,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.316,
"grad_norm": 3.7761735916137695,
"kl": 0.051483154296875,
"learning_rate": 1e-06,
"loss": 0.4336,
"num_tokens": 5216893.0,
"reward": 0.04373088479042053,
"reward_std": 0.025996902957558632,
"rewards/bleu_reward_func/mean": 0.04373088479042053,
"rewards/bleu_reward_func/std": 0.035521000623703,
"step": 395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 102.375,
"completions/mean_terminated_length": 89.16128540039062,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.3168,
"grad_norm": 9.422346115112305,
"kl": 0.20269775390625,
"learning_rate": 1e-06,
"loss": -0.3887,
"num_tokens": 5222225.0,
"reward": 0.0936415046453476,
"reward_std": 0.07821927219629288,
"rewards/bleu_reward_func/mean": 0.0936415046453476,
"rewards/bleu_reward_func/std": 0.1016775444149971,
"step": 396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 187.875,
"completions/mean_terminated_length": 127.85185241699219,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.3176,
"grad_norm": 16.8355712890625,
"kl": 0.111968994140625,
"learning_rate": 1e-06,
"loss": 0.2515,
"num_tokens": 5234477.0,
"reward": 0.2821354866027832,
"reward_std": 0.16070716083049774,
"rewards/bleu_reward_func/mean": 0.2821354866027832,
"rewards/bleu_reward_func/std": 0.34524035453796387,
"step": 397
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 222.625,
"completions/mean_terminated_length": 169.0370330810547,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3184,
"grad_norm": 4.937644004821777,
"kl": 0.0895843505859375,
"learning_rate": 1e-06,
"loss": 0.1443,
"num_tokens": 5243161.0,
"reward": 0.04823939502239227,
"reward_std": 0.020888181403279305,
"rewards/bleu_reward_func/mean": 0.04823939502239227,
"rewards/bleu_reward_func/std": 0.032690465450286865,
"step": 398
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 330.5625,
"completions/mean_terminated_length": 304.64288330078125,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.3192,
"grad_norm": 2.7899651527404785,
"kl": 0.028900146484375,
"learning_rate": 1e-06,
"loss": -0.0361,
"num_tokens": 5257211.0,
"reward": 0.10274805128574371,
"reward_std": 0.03329307958483696,
"rewards/bleu_reward_func/mean": 0.10274805128574371,
"rewards/bleu_reward_func/std": 0.08635566383600235,
"step": 399
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 173.875,
"completions/mean_terminated_length": 125.5714340209961,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.32,
"grad_norm": 9.990334510803223,
"kl": 0.19793701171875,
"learning_rate": 1e-06,
"loss": 0.0415,
"num_tokens": 5265263.0,
"reward": 0.13340914249420166,
"reward_std": 0.06052035093307495,
"rewards/bleu_reward_func/mean": 0.13340914249420166,
"rewards/bleu_reward_func/std": 0.12332285940647125,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 174.0,
"completions/mean_length": 299.625,
"completions/mean_terminated_length": 87.25,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3208,
"grad_norm": 5.343194007873535,
"kl": 0.0823516845703125,
"learning_rate": 1e-06,
"loss": 0.1588,
"num_tokens": 5279491.0,
"reward": 0.04100114479660988,
"reward_std": 0.021917924284934998,
"rewards/bleu_reward_func/mean": 0.04100114479660988,
"rewards/bleu_reward_func/std": 0.059245530515909195,
"step": 401
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 363.96875,
"completions/mean_terminated_length": 296.68182373046875,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.3216,
"grad_norm": 2.502444267272949,
"kl": 0.0249786376953125,
"learning_rate": 1e-06,
"loss": -0.1811,
"num_tokens": 5298618.0,
"reward": 0.06452260166406631,
"reward_std": 0.043596021831035614,
"rewards/bleu_reward_func/mean": 0.06452260166406631,
"rewards/bleu_reward_func/std": 0.0457596592605114,
"step": 402
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 186.8125,
"completions/mean_terminated_length": 153.1724090576172,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.3224,
"grad_norm": 6.430903434753418,
"kl": 0.0926513671875,
"learning_rate": 1e-06,
"loss": 0.1091,
"num_tokens": 5308788.0,
"reward": 0.1375400573015213,
"reward_std": 0.044691912829875946,
"rewards/bleu_reward_func/mean": 0.1375400573015213,
"rewards/bleu_reward_func/std": 0.1667727530002594,
"step": 403
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 261.375,
"completions/mean_terminated_length": 163.30435180664062,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.3232,
"grad_norm": 7.348942279815674,
"kl": 0.114410400390625,
"learning_rate": 1e-06,
"loss": 0.124,
"num_tokens": 5325816.0,
"reward": 0.29955723881721497,
"reward_std": 0.09420829266309738,
"rewards/bleu_reward_func/mean": 0.29955723881721497,
"rewards/bleu_reward_func/std": 0.27135762572288513,
"step": 404
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 313.125,
"completions/mean_terminated_length": 284.71429443359375,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.324,
"grad_norm": 5.8108601570129395,
"kl": 0.0474853515625,
"learning_rate": 1e-06,
"loss": -0.0256,
"num_tokens": 5339252.0,
"reward": 0.125982865691185,
"reward_std": 0.03331389278173447,
"rewards/bleu_reward_func/mean": 0.125982865691185,
"rewards/bleu_reward_func/std": 0.07514968514442444,
"step": 405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 221.1875,
"completions/mean_terminated_length": 154.07693481445312,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.3248,
"grad_norm": 6.334237098693848,
"kl": 0.167236328125,
"learning_rate": 1e-06,
"loss": 0.2154,
"num_tokens": 5350538.0,
"reward": 0.12314164638519287,
"reward_std": 0.034954577684402466,
"rewards/bleu_reward_func/mean": 0.12314164638519287,
"rewards/bleu_reward_func/std": 0.11711690574884415,
"step": 406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 182.96875,
"completions/mean_terminated_length": 90.83999633789062,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3256,
"grad_norm": 6.83364200592041,
"kl": 0.09881591796875,
"learning_rate": 1e-06,
"loss": 0.2224,
"num_tokens": 5364465.0,
"reward": 0.23839128017425537,
"reward_std": 0.09448365867137909,
"rewards/bleu_reward_func/mean": 0.23839128017425537,
"rewards/bleu_reward_func/std": 0.17264093458652496,
"step": 407
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 251.59375,
"completions/mean_terminated_length": 164.7916717529297,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3264,
"grad_norm": 5.291790962219238,
"kl": 0.120269775390625,
"learning_rate": 1e-06,
"loss": 0.1051,
"num_tokens": 5379212.0,
"reward": 0.07936831563711166,
"reward_std": 0.026489000767469406,
"rewards/bleu_reward_func/mean": 0.07936831563711166,
"rewards/bleu_reward_func/std": 0.04656874015927315,
"step": 408
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 395.90625,
"completions/mean_terminated_length": 279.8125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.3272,
"grad_norm": 2.7209274768829346,
"kl": 0.033355712890625,
"learning_rate": 1e-06,
"loss": -0.0145,
"num_tokens": 5395369.0,
"reward": 0.05327831208705902,
"reward_std": 0.020644793286919594,
"rewards/bleu_reward_func/mean": 0.05327831208705902,
"rewards/bleu_reward_func/std": 0.044744666665792465,
"step": 409
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 279.125,
"completions/mean_terminated_length": 157.14285278320312,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.328,
"grad_norm": 6.835958003997803,
"kl": 0.17645263671875,
"learning_rate": 1e-06,
"loss": 0.1004,
"num_tokens": 5408861.0,
"reward": 0.15895725786685944,
"reward_std": 0.053282976150512695,
"rewards/bleu_reward_func/mean": 0.15895725786685944,
"rewards/bleu_reward_func/std": 0.1344875991344452,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 124.625,
"completions/mean_terminated_length": 112.1290283203125,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.3288,
"grad_norm": 7.9765801429748535,
"kl": 0.12908935546875,
"learning_rate": 1e-06,
"loss": 0.0791,
"num_tokens": 5422569.0,
"reward": 0.29637736082077026,
"reward_std": 0.07562527060508728,
"rewards/bleu_reward_func/mean": 0.29637736082077026,
"rewards/bleu_reward_func/std": 0.1916900873184204,
"step": 411
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 242.6875,
"completions/mean_terminated_length": 180.53846740722656,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.3296,
"grad_norm": 5.444021701812744,
"kl": 0.079376220703125,
"learning_rate": 1e-06,
"loss": -0.0428,
"num_tokens": 5432847.0,
"reward": 0.1152123510837555,
"reward_std": 0.07390551269054413,
"rewards/bleu_reward_func/mean": 0.1152123510837555,
"rewards/bleu_reward_func/std": 0.14451570808887482,
"step": 412
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 329.4375,
"completions/mean_terminated_length": 204.5263214111328,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3304,
"grad_norm": 14.007586479187012,
"kl": 0.074188232421875,
"learning_rate": 1e-06,
"loss": 0.0903,
"num_tokens": 5451693.0,
"reward": 0.13860949873924255,
"reward_std": 0.032740939408540726,
"rewards/bleu_reward_func/mean": 0.13860949873924255,
"rewards/bleu_reward_func/std": 0.15230515599250793,
"step": 413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 370.0,
"completions/mean_terminated_length": 322.66668701171875,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.3312,
"grad_norm": 2.6279470920562744,
"kl": 0.02838134765625,
"learning_rate": 1e-06,
"loss": 0.1011,
"num_tokens": 5465741.0,
"reward": 0.07638199627399445,
"reward_std": 0.018498672172427177,
"rewards/bleu_reward_func/mean": 0.07638199627399445,
"rewards/bleu_reward_func/std": 0.07297802716493607,
"step": 414
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 433.96875,
"completions/mean_terminated_length": 403.4347839355469,
"completions/min_length": 253.0,
"completions/min_terminated_length": 253.0,
"epoch": 0.332,
"grad_norm": 2.4823691844940186,
"kl": 0.035186767578125,
"learning_rate": 1e-06,
"loss": -0.0328,
"num_tokens": 5482924.0,
"reward": 0.06871578842401505,
"reward_std": 0.015666324645280838,
"rewards/bleu_reward_func/mean": 0.06871578842401505,
"rewards/bleu_reward_func/std": 0.03051225282251835,
"step": 415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 241.5,
"completions/mean_terminated_length": 179.07693481445312,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3328,
"grad_norm": 6.543938159942627,
"kl": 0.16180419921875,
"learning_rate": 1e-06,
"loss": 0.0852,
"num_tokens": 5494084.0,
"reward": 0.1368054300546646,
"reward_std": 0.05007235333323479,
"rewards/bleu_reward_func/mean": 0.1368054300546646,
"rewards/bleu_reward_func/std": 0.17140735685825348,
"step": 416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 440.0,
"completions/mean_terminated_length": 390.7368469238281,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.3336,
"grad_norm": 2.235297203063965,
"kl": 0.03033447265625,
"learning_rate": 1e-06,
"loss": -0.0838,
"num_tokens": 5511716.0,
"reward": 0.038143888115882874,
"reward_std": 0.01655811443924904,
"rewards/bleu_reward_func/mean": 0.038143888115882874,
"rewards/bleu_reward_func/std": 0.024868454784154892,
"step": 417
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 366.0,
"completions/mean_length": 182.3125,
"completions/mean_terminated_length": 72.41667175292969,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.3344,
"grad_norm": 7.744441509246826,
"kl": 0.191436767578125,
"learning_rate": 1e-06,
"loss": 0.3195,
"num_tokens": 5523022.0,
"reward": 0.31701600551605225,
"reward_std": 0.07194612175226212,
"rewards/bleu_reward_func/mean": 0.31701600551605225,
"rewards/bleu_reward_func/std": 0.3555218279361725,
"step": 418
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 366.15625,
"completions/mean_terminated_length": 237.47059631347656,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.3352,
"grad_norm": 6.1128129959106445,
"kl": 0.156463623046875,
"learning_rate": 1e-06,
"loss": -0.0201,
"num_tokens": 5541259.0,
"reward": 0.08823719620704651,
"reward_std": 0.024577319622039795,
"rewards/bleu_reward_func/mean": 0.08823719620704651,
"rewards/bleu_reward_func/std": 0.06854464113712311,
"step": 419
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 441.34375,
"completions/mean_terminated_length": 361.2666931152344,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.336,
"grad_norm": 1.8351125717163086,
"kl": 0.021087646484375,
"learning_rate": 1e-06,
"loss": 0.1005,
"num_tokens": 5559934.0,
"reward": 0.04894189164042473,
"reward_std": 0.02001025900244713,
"rewards/bleu_reward_func/mean": 0.04894189164042473,
"rewards/bleu_reward_func/std": 0.05484846979379654,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 439.0,
"completions/mean_length": 326.625,
"completions/mean_terminated_length": 264.8333435058594,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.3368,
"grad_norm": 2.533376932144165,
"kl": 0.036773681640625,
"learning_rate": 1e-06,
"loss": -0.0233,
"num_tokens": 5573530.0,
"reward": 0.040375903248786926,
"reward_std": 0.020407570526003838,
"rewards/bleu_reward_func/mean": 0.040375903248786926,
"rewards/bleu_reward_func/std": 0.03530384972691536,
"step": 421
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 387.0,
"completions/mean_terminated_length": 312.0,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.3376,
"grad_norm": 2.77024507522583,
"kl": 0.023895263671875,
"learning_rate": 1e-06,
"loss": -0.0548,
"num_tokens": 5587906.0,
"reward": 0.07852312177419662,
"reward_std": 0.01865551620721817,
"rewards/bleu_reward_func/mean": 0.07852312177419662,
"rewards/bleu_reward_func/std": 0.01962001994252205,
"step": 422
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 286.625,
"completions/mean_terminated_length": 223.51998901367188,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.3384,
"grad_norm": 10.24564266204834,
"kl": 0.1881103515625,
"learning_rate": 1e-06,
"loss": 0.3541,
"num_tokens": 5600862.0,
"reward": 0.1451932042837143,
"reward_std": 0.04526112228631973,
"rewards/bleu_reward_func/mean": 0.1451932042837143,
"rewards/bleu_reward_func/std": 0.11114869266748428,
"step": 423
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 374.0,
"completions/mean_terminated_length": 217.60000610351562,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.3392,
"grad_norm": 3.2997934818267822,
"kl": 0.03955078125,
"learning_rate": 1e-06,
"loss": 0.0442,
"num_tokens": 5614662.0,
"reward": 0.029227450489997864,
"reward_std": 0.015134407207369804,
"rewards/bleu_reward_func/mean": 0.029227450489997864,
"rewards/bleu_reward_func/std": 0.03273903205990791,
"step": 424
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 375.0,
"completions/mean_length": 266.59375,
"completions/mean_terminated_length": 184.7916717529297,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.34,
"grad_norm": 3.9605484008789062,
"kl": 0.05718994140625,
"learning_rate": 1e-06,
"loss": 0.3365,
"num_tokens": 5625193.0,
"reward": 0.07731406390666962,
"reward_std": 0.04166540876030922,
"rewards/bleu_reward_func/mean": 0.07731406390666962,
"rewards/bleu_reward_func/std": 0.07211390882730484,
"step": 425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 260.625,
"completions/mean_terminated_length": 88.63157653808594,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.3408,
"grad_norm": 11.257648468017578,
"kl": 0.2677154541015625,
"learning_rate": 1e-06,
"loss": 0.1169,
"num_tokens": 5640717.0,
"reward": 0.19435667991638184,
"reward_std": 0.055491410195827484,
"rewards/bleu_reward_func/mean": 0.19435667991638184,
"rewards/bleu_reward_func/std": 0.1956581324338913,
"step": 426
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 376.9375,
"completions/mean_terminated_length": 179.53846740722656,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.3416,
"grad_norm": 25.653825759887695,
"kl": 0.1090087890625,
"learning_rate": 1e-06,
"loss": 0.024,
"num_tokens": 5655923.0,
"reward": 0.11750101298093796,
"reward_std": 0.0449095293879509,
"rewards/bleu_reward_func/mean": 0.11750101298093796,
"rewards/bleu_reward_func/std": 0.10332971811294556,
"step": 427
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 313.0,
"completions/mean_terminated_length": 235.13043212890625,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.3424,
"grad_norm": 3.291689157485962,
"kl": 0.05206298828125,
"learning_rate": 1e-06,
"loss": -0.0722,
"num_tokens": 5672371.0,
"reward": 0.07329948246479034,
"reward_std": 0.04769134148955345,
"rewards/bleu_reward_func/mean": 0.07329948246479034,
"rewards/bleu_reward_func/std": 0.10588011890649796,
"step": 428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 218.1875,
"completions/mean_terminated_length": 176.21429443359375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3432,
"grad_norm": 70.38253784179688,
"kl": 0.111907958984375,
"learning_rate": 1e-06,
"loss": 0.2102,
"num_tokens": 5685137.0,
"reward": 0.057677462697029114,
"reward_std": 0.02635624073445797,
"rewards/bleu_reward_func/mean": 0.057677462697029114,
"rewards/bleu_reward_func/std": 0.03576910123229027,
"step": 429
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 276.34375,
"completions/mean_terminated_length": 184.13043212890625,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.344,
"grad_norm": 6.003584861755371,
"kl": 0.0679931640625,
"learning_rate": 1e-06,
"loss": 0.0754,
"num_tokens": 5699164.0,
"reward": 0.1447058618068695,
"reward_std": 0.02169397845864296,
"rewards/bleu_reward_func/mean": 0.1447058618068695,
"rewards/bleu_reward_func/std": 0.17934927344322205,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 190.0,
"completions/mean_terminated_length": 179.61289978027344,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.3448,
"grad_norm": 6.911223888397217,
"kl": 0.186920166015625,
"learning_rate": 1e-06,
"loss": -0.1198,
"num_tokens": 5707324.0,
"reward": 0.1218734011054039,
"reward_std": 0.029896825551986694,
"rewards/bleu_reward_func/mean": 0.1218734011054039,
"rewards/bleu_reward_func/std": 0.12784428894519806,
"step": 431
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 441.0,
"completions/max_terminated_length": 441.0,
"completions/mean_length": 161.3125,
"completions/mean_terminated_length": 161.3125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3456,
"grad_norm": 7.491186141967773,
"kl": 0.174072265625,
"learning_rate": 1e-06,
"loss": -0.0073,
"num_tokens": 5715774.0,
"reward": 0.24741162359714508,
"reward_std": 0.06959841400384903,
"rewards/bleu_reward_func/mean": 0.24741162359714508,
"rewards/bleu_reward_func/std": 0.12952403724193573,
"step": 432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 360.375,
"completions/mean_terminated_length": 280.952392578125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.3464,
"grad_norm": 2.933356523513794,
"kl": 0.05389404296875,
"learning_rate": 1e-06,
"loss": 0.1508,
"num_tokens": 5729490.0,
"reward": 0.047768086194992065,
"reward_std": 0.022835325449705124,
"rewards/bleu_reward_func/mean": 0.047768086194992065,
"rewards/bleu_reward_func/std": 0.03785131126642227,
"step": 433
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 290.65625,
"completions/mean_terminated_length": 216.875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.3472,
"grad_norm": 4.7709503173828125,
"kl": 0.202667236328125,
"learning_rate": 1e-06,
"loss": -0.0613,
"num_tokens": 5744007.0,
"reward": 0.17955930531024933,
"reward_std": 0.04158224165439606,
"rewards/bleu_reward_func/mean": 0.17955930531024933,
"rewards/bleu_reward_func/std": 0.16465015709400177,
"step": 434
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 471.46875,
"completions/mean_terminated_length": 430.9375,
"completions/min_length": 318.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.348,
"grad_norm": 2.0240750312805176,
"kl": 0.0249786376953125,
"learning_rate": 1e-06,
"loss": -0.0115,
"num_tokens": 5764798.0,
"reward": 0.06078977510333061,
"reward_std": 0.014253700152039528,
"rewards/bleu_reward_func/mean": 0.06078977510333061,
"rewards/bleu_reward_func/std": 0.061424292623996735,
"step": 435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 297.5,
"completions/mean_terminated_length": 213.56521606445312,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.3488,
"grad_norm": 6.2941999435424805,
"kl": 0.14788818359375,
"learning_rate": 1e-06,
"loss": -0.117,
"num_tokens": 5778070.0,
"reward": 0.18015003204345703,
"reward_std": 0.04164495691657066,
"rewards/bleu_reward_func/mean": 0.18015003204345703,
"rewards/bleu_reward_func/std": 0.25248411297798157,
"step": 436
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 319.0625,
"completions/mean_terminated_length": 265.0400085449219,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.3496,
"grad_norm": 6.8885884284973145,
"kl": 0.15325927734375,
"learning_rate": 1e-06,
"loss": 0.1008,
"num_tokens": 5794464.0,
"reward": 0.06313855201005936,
"reward_std": 0.01877717673778534,
"rewards/bleu_reward_func/mean": 0.06313855201005936,
"rewards/bleu_reward_func/std": 0.07749292254447937,
"step": 437
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 431.0,
"completions/mean_length": 174.96875,
"completions/mean_terminated_length": 126.8214340209961,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3504,
"grad_norm": 6.638815402984619,
"kl": 0.21893310546875,
"learning_rate": 1e-06,
"loss": -0.1077,
"num_tokens": 5808279.0,
"reward": 0.1649433970451355,
"reward_std": 0.03847195580601692,
"rewards/bleu_reward_func/mean": 0.1649433970451355,
"rewards/bleu_reward_func/std": 0.1434909999370575,
"step": 438
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 487.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 111.46875,
"completions/mean_terminated_length": 111.46875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.3512,
"grad_norm": 8.658287048339844,
"kl": 0.380615234375,
"learning_rate": 1e-06,
"loss": -0.0241,
"num_tokens": 5818006.0,
"reward": 0.16367265582084656,
"reward_std": 0.043664492666721344,
"rewards/bleu_reward_func/mean": 0.16367265582084656,
"rewards/bleu_reward_func/std": 0.09786061942577362,
"step": 439
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 183.0625,
"completions/mean_terminated_length": 161.1333465576172,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.352,
"grad_norm": 6.755293369293213,
"kl": 0.094757080078125,
"learning_rate": 1e-06,
"loss": -0.3775,
"num_tokens": 5827832.0,
"reward": 0.20365653932094574,
"reward_std": 0.022682592272758484,
"rewards/bleu_reward_func/mean": 0.20365653932094574,
"rewards/bleu_reward_func/std": 0.28341981768608093,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 374.15625,
"completions/mean_terminated_length": 196.92857360839844,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.3528,
"grad_norm": 3.652517557144165,
"kl": 0.04571533203125,
"learning_rate": 1e-06,
"loss": 0.1243,
"num_tokens": 5846877.0,
"reward": 0.028015542775392532,
"reward_std": 0.017580918967723846,
"rewards/bleu_reward_func/mean": 0.028015542775392532,
"rewards/bleu_reward_func/std": 0.018063105642795563,
"step": 441
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 247.96875,
"completions/mean_terminated_length": 174.0399932861328,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.3536,
"grad_norm": 5.749145984649658,
"kl": 0.2269287109375,
"learning_rate": 1e-06,
"loss": 0.0053,
"num_tokens": 5857276.0,
"reward": 0.24086514115333557,
"reward_std": 0.11034538596868515,
"rewards/bleu_reward_func/mean": 0.24086514115333557,
"rewards/bleu_reward_func/std": 0.2930907607078552,
"step": 442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 180.9375,
"completions/mean_terminated_length": 88.23999786376953,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3544,
"grad_norm": 6.519045352935791,
"kl": 0.3109130859375,
"learning_rate": 1e-06,
"loss": 0.0523,
"num_tokens": 5866418.0,
"reward": 0.14787587523460388,
"reward_std": 0.08442827314138412,
"rewards/bleu_reward_func/mean": 0.14787587523460388,
"rewards/bleu_reward_func/std": 0.13120223581790924,
"step": 443
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 290.6875,
"completions/mean_terminated_length": 239.61538696289062,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.3552,
"grad_norm": 3.2144103050231934,
"kl": 0.042388916015625,
"learning_rate": 1e-06,
"loss": -0.0012,
"num_tokens": 5878832.0,
"reward": 0.06585465371608734,
"reward_std": 0.03217202052474022,
"rewards/bleu_reward_func/mean": 0.06585465371608734,
"rewards/bleu_reward_func/std": 0.0564405731856823,
"step": 444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 226.0625,
"completions/mean_terminated_length": 196.48275756835938,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.356,
"grad_norm": 7.220034122467041,
"kl": 0.255218505859375,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 5894266.0,
"reward": 0.1998336911201477,
"reward_std": 0.05887780338525772,
"rewards/bleu_reward_func/mean": 0.1998336911201477,
"rewards/bleu_reward_func/std": 0.1896047741174698,
"step": 445
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 194.46875,
"completions/mean_terminated_length": 173.3000030517578,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.3568,
"grad_norm": 5.338675022125244,
"kl": 0.112518310546875,
"learning_rate": 1e-06,
"loss": 0.1041,
"num_tokens": 5902393.0,
"reward": 0.08252020180225372,
"reward_std": 0.041884347796440125,
"rewards/bleu_reward_func/mean": 0.08252020180225372,
"rewards/bleu_reward_func/std": 0.05604247748851776,
"step": 446
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 266.96875,
"completions/mean_terminated_length": 231.96429443359375,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.3576,
"grad_norm": 3.9111521244049072,
"kl": 0.048919677734375,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 5914296.0,
"reward": 0.2005537748336792,
"reward_std": 0.03531679883599281,
"rewards/bleu_reward_func/mean": 0.2005537748336792,
"rewards/bleu_reward_func/std": 0.1125224232673645,
"step": 447
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 274.125,
"completions/mean_terminated_length": 89.11111450195312,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.3584,
"grad_norm": 18.863727569580078,
"kl": 0.269134521484375,
"learning_rate": 1e-06,
"loss": 0.4411,
"num_tokens": 5927172.0,
"reward": 0.12709318101406097,
"reward_std": 0.020968245342373848,
"rewards/bleu_reward_func/mean": 0.12709318101406097,
"rewards/bleu_reward_func/std": 0.14331206679344177,
"step": 448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 378.21875,
"completions/mean_terminated_length": 297.95001220703125,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.3592,
"grad_norm": 2.759582996368408,
"kl": 0.02587890625,
"learning_rate": 1e-06,
"loss": 0.0122,
"num_tokens": 5944491.0,
"reward": 0.04890431463718414,
"reward_std": 0.01871412619948387,
"rewards/bleu_reward_func/mean": 0.04890431463718414,
"rewards/bleu_reward_func/std": 0.05281543731689453,
"step": 449
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 234.8125,
"completions/mean_terminated_length": 216.33334350585938,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.36,
"grad_norm": 5.44106388092041,
"kl": 0.08453369140625,
"learning_rate": 1e-06,
"loss": -0.2079,
"num_tokens": 5954413.0,
"reward": 0.08892585337162018,
"reward_std": 0.05316928029060364,
"rewards/bleu_reward_func/mean": 0.08892585337162018,
"rewards/bleu_reward_func/std": 0.09096309542655945,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 182.59375,
"completions/mean_terminated_length": 90.36000061035156,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.3608,
"grad_norm": 10.483473777770996,
"kl": 0.30169677734375,
"learning_rate": 1e-06,
"loss": 0.1221,
"num_tokens": 5965776.0,
"reward": 0.2010711133480072,
"reward_std": 0.035105034708976746,
"rewards/bleu_reward_func/mean": 0.2010711133480072,
"rewards/bleu_reward_func/std": 0.20054543018341064,
"step": 451
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 289.5,
"completions/mean_terminated_length": 266.4827575683594,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.3616,
"grad_norm": 8.551454544067383,
"kl": 0.21197509765625,
"learning_rate": 1e-06,
"loss": 0.0288,
"num_tokens": 5976704.0,
"reward": 0.03945029526948929,
"reward_std": 0.011974655091762543,
"rewards/bleu_reward_func/mean": 0.03945029526948929,
"rewards/bleu_reward_func/std": 0.027504391968250275,
"step": 452
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 451.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 241.90625,
"completions/mean_terminated_length": 241.90625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.3624,
"grad_norm": 5.853670597076416,
"kl": 0.209869384765625,
"learning_rate": 1e-06,
"loss": -0.0173,
"num_tokens": 5987269.0,
"reward": 0.09715719521045685,
"reward_std": 0.009554330259561539,
"rewards/bleu_reward_func/mean": 0.09715719521045685,
"rewards/bleu_reward_func/std": 0.0827893614768982,
"step": 453
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 258.6875,
"completions/mean_terminated_length": 143.5454559326172,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3632,
"grad_norm": 9.270416259765625,
"kl": 0.208038330078125,
"learning_rate": 1e-06,
"loss": 0.0672,
"num_tokens": 6001411.0,
"reward": 0.1554635763168335,
"reward_std": 0.03311417996883392,
"rewards/bleu_reward_func/mean": 0.1554635763168335,
"rewards/bleu_reward_func/std": 0.1801016479730606,
"step": 454
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 196.65625,
"completions/mean_terminated_length": 196.65625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.364,
"grad_norm": 11.34135913848877,
"kl": 0.224945068359375,
"learning_rate": 1e-06,
"loss": 0.3216,
"num_tokens": 6012208.0,
"reward": 0.058545198291540146,
"reward_std": 0.017396699637174606,
"rewards/bleu_reward_func/mean": 0.058545198291540146,
"rewards/bleu_reward_func/std": 0.04106508567929268,
"step": 455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 137.8125,
"completions/mean_terminated_length": 137.8125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.3648,
"grad_norm": 5.569085597991943,
"kl": 0.173675537109375,
"learning_rate": 1e-06,
"loss": -0.0886,
"num_tokens": 6020026.0,
"reward": 0.25735002756118774,
"reward_std": 0.08652571588754654,
"rewards/bleu_reward_func/mean": 0.25735002756118774,
"rewards/bleu_reward_func/std": 0.34091776609420776,
"step": 456
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 261.0625,
"completions/mean_terminated_length": 162.86956787109375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3656,
"grad_norm": 10.537775993347168,
"kl": 0.191162109375,
"learning_rate": 1e-06,
"loss": 0.1318,
"num_tokens": 6031956.0,
"reward": 0.12902843952178955,
"reward_std": 0.049239080399274826,
"rewards/bleu_reward_func/mean": 0.12902843952178955,
"rewards/bleu_reward_func/std": 0.1560073047876358,
"step": 457
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 267.65625,
"completions/mean_terminated_length": 156.59091186523438,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.3664,
"grad_norm": 8.385242462158203,
"kl": 0.13568115234375,
"learning_rate": 1e-06,
"loss": 0.1967,
"num_tokens": 6048289.0,
"reward": 0.09441059827804565,
"reward_std": 0.02894745022058487,
"rewards/bleu_reward_func/mean": 0.09441059827804565,
"rewards/bleu_reward_func/std": 0.07357289642095566,
"step": 458
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 144.3125,
"completions/mean_terminated_length": 76.22222137451172,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.3672,
"grad_norm": 9.128081321716309,
"kl": 0.275970458984375,
"learning_rate": 1e-06,
"loss": -0.0309,
"num_tokens": 6060955.0,
"reward": 0.23786574602127075,
"reward_std": 0.04663696512579918,
"rewards/bleu_reward_func/mean": 0.23786574602127075,
"rewards/bleu_reward_func/std": 0.15007296204566956,
"step": 459
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 293.25,
"completions/mean_terminated_length": 262.0,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.368,
"grad_norm": 10.163530349731445,
"kl": 0.1614837646484375,
"learning_rate": 1e-06,
"loss": 0.0314,
"num_tokens": 6075291.0,
"reward": 0.11764833331108093,
"reward_std": 0.025302093476057053,
"rewards/bleu_reward_func/mean": 0.11764833331108093,
"rewards/bleu_reward_func/std": 0.054068438708782196,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 160.78125,
"completions/mean_terminated_length": 137.36666870117188,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3688,
"grad_norm": 7.89539909362793,
"kl": 0.18206787109375,
"learning_rate": 1e-06,
"loss": -0.0019,
"num_tokens": 6083980.0,
"reward": 0.0945214033126831,
"reward_std": 0.046040039509534836,
"rewards/bleu_reward_func/mean": 0.0945214033126831,
"rewards/bleu_reward_func/std": 0.08345890045166016,
"step": 461
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 254.96875,
"completions/mean_terminated_length": 246.6774139404297,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3696,
"grad_norm": 6.462737560272217,
"kl": 0.130950927734375,
"learning_rate": 1e-06,
"loss": 0.0046,
"num_tokens": 6096555.0,
"reward": 0.04283145070075989,
"reward_std": 0.010249357670545578,
"rewards/bleu_reward_func/mean": 0.04283145070075989,
"rewards/bleu_reward_func/std": 0.038907162845134735,
"step": 462
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 208.0,
"completions/mean_length": 190.875,
"completions/mean_terminated_length": 83.83333587646484,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.3704,
"grad_norm": 5.569899559020996,
"kl": 0.115325927734375,
"learning_rate": 1e-06,
"loss": 0.006,
"num_tokens": 6105487.0,
"reward": 0.13501238822937012,
"reward_std": 0.034556735306978226,
"rewards/bleu_reward_func/mean": 0.13501238822937012,
"rewards/bleu_reward_func/std": 0.09039971977472305,
"step": 463
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 219.53125,
"completions/mean_terminated_length": 137.63999938964844,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.3712,
"grad_norm": 12.397187232971191,
"kl": 0.133880615234375,
"learning_rate": 1e-06,
"loss": -0.0829,
"num_tokens": 6117800.0,
"reward": 0.18308544158935547,
"reward_std": 0.06162799149751663,
"rewards/bleu_reward_func/mean": 0.18308544158935547,
"rewards/bleu_reward_func/std": 0.15996244549751282,
"step": 464
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 220.65625,
"completions/mean_terminated_length": 190.51724243164062,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.372,
"grad_norm": 7.391514301300049,
"kl": 0.11492919921875,
"learning_rate": 1e-06,
"loss": -0.0768,
"num_tokens": 6128413.0,
"reward": 0.05292118340730667,
"reward_std": 0.04890108108520508,
"rewards/bleu_reward_func/mean": 0.05292118340730667,
"rewards/bleu_reward_func/std": 0.07255055755376816,
"step": 465
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 488.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 194.15625,
"completions/mean_terminated_length": 194.15625,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.3728,
"grad_norm": 3.9842867851257324,
"kl": 0.050048828125,
"learning_rate": 1e-06,
"loss": -0.0064,
"num_tokens": 6137562.0,
"reward": 0.04538443684577942,
"reward_std": 0.024577371776103973,
"rewards/bleu_reward_func/mean": 0.04538443684577942,
"rewards/bleu_reward_func/std": 0.03160402178764343,
"step": 466
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 368.0,
"completions/mean_terminated_length": 311.6521911621094,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.3736,
"grad_norm": 2.496399402618408,
"kl": 0.025543212890625,
"learning_rate": 1e-06,
"loss": -0.0006,
"num_tokens": 6152090.0,
"reward": 0.062375668436288834,
"reward_std": 0.031018512323498726,
"rewards/bleu_reward_func/mean": 0.062375668436288834,
"rewards/bleu_reward_func/std": 0.06766829639673233,
"step": 467
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 341.59375,
"completions/mean_terminated_length": 302.2692565917969,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.3744,
"grad_norm": 4.490657329559326,
"kl": 0.042816162109375,
"learning_rate": 1e-06,
"loss": 0.2011,
"num_tokens": 6165917.0,
"reward": 0.06601699441671371,
"reward_std": 0.028723105788230896,
"rewards/bleu_reward_func/mean": 0.06601699441671371,
"rewards/bleu_reward_func/std": 0.039854664355516434,
"step": 468
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 250.21875,
"completions/mean_terminated_length": 147.78260803222656,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3752,
"grad_norm": 4.7490010261535645,
"kl": 0.1627197265625,
"learning_rate": 1e-06,
"loss": -0.0409,
"num_tokens": 6178940.0,
"reward": 0.15887555480003357,
"reward_std": 0.018191883340477943,
"rewards/bleu_reward_func/mean": 0.15887555480003357,
"rewards/bleu_reward_func/std": 0.21522025763988495,
"step": 469
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 396.125,
"completions/mean_terminated_length": 264.8000183105469,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.376,
"grad_norm": 3.33166241645813,
"kl": 0.046051025390625,
"learning_rate": 1e-06,
"loss": -0.0611,
"num_tokens": 6194232.0,
"reward": 0.0860922709107399,
"reward_std": 0.04104076325893402,
"rewards/bleu_reward_func/mean": 0.0860922709107399,
"rewards/bleu_reward_func/std": 0.13754135370254517,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 364.6875,
"completions/mean_terminated_length": 323.44000244140625,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.3768,
"grad_norm": 2.6695375442504883,
"kl": 0.038360595703125,
"learning_rate": 1e-06,
"loss": -0.0899,
"num_tokens": 6207750.0,
"reward": 0.05763555318117142,
"reward_std": 0.022492559626698494,
"rewards/bleu_reward_func/mean": 0.05763555318117142,
"rewards/bleu_reward_func/std": 0.034512683749198914,
"step": 471
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 439.0,
"completions/mean_length": 237.40625,
"completions/mean_terminated_length": 209.0,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.3776,
"grad_norm": 4.532895565032959,
"kl": 0.088165283203125,
"learning_rate": 1e-06,
"loss": 0.0908,
"num_tokens": 6220235.0,
"reward": 0.07317312806844711,
"reward_std": 0.02968096360564232,
"rewards/bleu_reward_func/mean": 0.07317312806844711,
"rewards/bleu_reward_func/std": 0.04997172951698303,
"step": 472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 310.25,
"completions/mean_terminated_length": 263.69232177734375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.3784,
"grad_norm": 2.9320926666259766,
"kl": 0.04736328125,
"learning_rate": 1e-06,
"loss": -0.0956,
"num_tokens": 6233691.0,
"reward": 0.07909499108791351,
"reward_std": 0.02384771592915058,
"rewards/bleu_reward_func/mean": 0.07909499108791351,
"rewards/bleu_reward_func/std": 0.08157114684581757,
"step": 473
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 276.8125,
"completions/mean_terminated_length": 233.25926208496094,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.3792,
"grad_norm": 5.839748859405518,
"kl": 0.14556884765625,
"learning_rate": 1e-06,
"loss": -0.0466,
"num_tokens": 6245669.0,
"reward": 0.10992265492677689,
"reward_std": 0.027910416945815086,
"rewards/bleu_reward_func/mean": 0.10992265492677689,
"rewards/bleu_reward_func/std": 0.11659030616283417,
"step": 474
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 194.09375,
"completions/mean_terminated_length": 172.90000915527344,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.38,
"grad_norm": 22.791318893432617,
"kl": 0.30963134765625,
"learning_rate": 1e-06,
"loss": -0.0749,
"num_tokens": 6255632.0,
"reward": 0.14596156775951385,
"reward_std": 0.0427117757499218,
"rewards/bleu_reward_func/mean": 0.14596156775951385,
"rewards/bleu_reward_func/std": 0.06039505451917648,
"step": 475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 474.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 147.03125,
"completions/mean_terminated_length": 147.03125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.3808,
"grad_norm": 7.391219615936279,
"kl": 0.34600830078125,
"learning_rate": 1e-06,
"loss": -0.0711,
"num_tokens": 6267817.0,
"reward": 0.155485600233078,
"reward_std": 0.03775210678577423,
"rewards/bleu_reward_func/mean": 0.155485600233078,
"rewards/bleu_reward_func/std": 0.14854131639003754,
"step": 476
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 176.09375,
"completions/mean_terminated_length": 153.70001220703125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.3816,
"grad_norm": 7.248151779174805,
"kl": 0.1455078125,
"learning_rate": 1e-06,
"loss": -0.1443,
"num_tokens": 6276772.0,
"reward": 0.08080196380615234,
"reward_std": 0.06804326176643372,
"rewards/bleu_reward_func/mean": 0.08080196380615234,
"rewards/bleu_reward_func/std": 0.11115432530641556,
"step": 477
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 230.03125,
"completions/mean_terminated_length": 151.0800018310547,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.3824,
"grad_norm": 8.359848022460938,
"kl": 0.151123046875,
"learning_rate": 1e-06,
"loss": 0.36,
"num_tokens": 6287581.0,
"reward": 0.06686853617429733,
"reward_std": 0.028161579743027687,
"rewards/bleu_reward_func/mean": 0.06686853617429733,
"rewards/bleu_reward_func/std": 0.054127294570207596,
"step": 478
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 120.03125,
"completions/mean_terminated_length": 120.03125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.3832,
"grad_norm": 7.5159101486206055,
"kl": 0.190155029296875,
"learning_rate": 1e-06,
"loss": 0.1074,
"num_tokens": 6297390.0,
"reward": 0.19040237367153168,
"reward_std": 0.05353376269340515,
"rewards/bleu_reward_func/mean": 0.19040237367153168,
"rewards/bleu_reward_func/std": 0.17947913706302643,
"step": 479
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 451.4375,
"completions/mean_terminated_length": 410.0,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.384,
"grad_norm": 2.1026315689086914,
"kl": 0.0289306640625,
"learning_rate": 1e-06,
"loss": 0.046,
"num_tokens": 6314548.0,
"reward": 0.09041387587785721,
"reward_std": 0.04015309736132622,
"rewards/bleu_reward_func/mean": 0.09041387587785721,
"rewards/bleu_reward_func/std": 0.09059884399175644,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 319.5625,
"completions/mean_terminated_length": 204.10000610351562,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.3848,
"grad_norm": 5.199352264404297,
"kl": 0.172821044921875,
"learning_rate": 1e-06,
"loss": -0.062,
"num_tokens": 6333886.0,
"reward": 0.13319844007492065,
"reward_std": 0.03567848354578018,
"rewards/bleu_reward_func/mean": 0.13319844007492065,
"rewards/bleu_reward_func/std": 0.12437637895345688,
"step": 481
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 208.9375,
"completions/mean_terminated_length": 152.8148193359375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.3856,
"grad_norm": 6.110198497772217,
"kl": 0.2414398193359375,
"learning_rate": 1e-06,
"loss": -0.0203,
"num_tokens": 6346564.0,
"reward": 0.19878074526786804,
"reward_std": 0.043283406645059586,
"rewards/bleu_reward_func/mean": 0.19878074526786804,
"rewards/bleu_reward_func/std": 0.1821635365486145,
"step": 482
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 237.53125,
"completions/mean_terminated_length": 186.70370483398438,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.3864,
"grad_norm": 8.788106918334961,
"kl": 0.17132568359375,
"learning_rate": 1e-06,
"loss": -0.0653,
"num_tokens": 6358741.0,
"reward": 0.07478289306163788,
"reward_std": 0.019201520830392838,
"rewards/bleu_reward_func/mean": 0.07478289306163788,
"rewards/bleu_reward_func/std": 0.05620751157402992,
"step": 483
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 202.4375,
"completions/mean_terminated_length": 115.75999450683594,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.3872,
"grad_norm": 7.577336311340332,
"kl": 0.109588623046875,
"learning_rate": 1e-06,
"loss": 0.3355,
"num_tokens": 6371787.0,
"reward": 0.09253311157226562,
"reward_std": 0.03513386473059654,
"rewards/bleu_reward_func/mean": 0.09253311157226562,
"rewards/bleu_reward_func/std": 0.0667162612080574,
"step": 484
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 331.625,
"completions/mean_terminated_length": 281.1199951171875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.388,
"grad_norm": 5.319460391998291,
"kl": 0.0972900390625,
"learning_rate": 1e-06,
"loss": 0.057,
"num_tokens": 6388231.0,
"reward": 0.16802164912223816,
"reward_std": 0.024459581822156906,
"rewards/bleu_reward_func/mean": 0.16802164912223816,
"rewards/bleu_reward_func/std": 0.17531749606132507,
"step": 485
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 222.6875,
"completions/mean_terminated_length": 155.92308044433594,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3888,
"grad_norm": 9.174544334411621,
"kl": 0.21746826171875,
"learning_rate": 1e-06,
"loss": 0.0552,
"num_tokens": 6399885.0,
"reward": 0.20374764502048492,
"reward_std": 0.02469576895236969,
"rewards/bleu_reward_func/mean": 0.20374764502048492,
"rewards/bleu_reward_func/std": 0.17774522304534912,
"step": 486
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 212.40625,
"completions/mean_terminated_length": 112.54167175292969,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.3896,
"grad_norm": 8.529189109802246,
"kl": 0.412689208984375,
"learning_rate": 1e-06,
"loss": -0.0442,
"num_tokens": 6410746.0,
"reward": 0.13253280520439148,
"reward_std": 0.03401318937540054,
"rewards/bleu_reward_func/mean": 0.13253280520439148,
"rewards/bleu_reward_func/std": 0.09572894126176834,
"step": 487
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 343.40625,
"completions/mean_terminated_length": 277.4347839355469,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.3904,
"grad_norm": 5.815334796905518,
"kl": 0.06719970703125,
"learning_rate": 1e-06,
"loss": 0.0079,
"num_tokens": 6424639.0,
"reward": 0.14998552203178406,
"reward_std": 0.03536435216665268,
"rewards/bleu_reward_func/mean": 0.14998552203178406,
"rewards/bleu_reward_func/std": 0.08015048503875732,
"step": 488
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 242.21875,
"completions/mean_terminated_length": 192.25926208496094,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.3912,
"grad_norm": 8.644153594970703,
"kl": 0.204193115234375,
"learning_rate": 1e-06,
"loss": 0.2233,
"num_tokens": 6437382.0,
"reward": 0.08585190027952194,
"reward_std": 0.032436732202768326,
"rewards/bleu_reward_func/mean": 0.08585190027952194,
"rewards/bleu_reward_func/std": 0.10239724069833755,
"step": 489
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 259.625,
"completions/mean_terminated_length": 108.20000457763672,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.392,
"grad_norm": 18.119718551635742,
"kl": 0.39862060546875,
"learning_rate": 1e-06,
"loss": 0.0372,
"num_tokens": 6450634.0,
"reward": 0.07857100665569305,
"reward_std": 0.010440990328788757,
"rewards/bleu_reward_func/mean": 0.07857100665569305,
"rewards/bleu_reward_func/std": 0.06719467043876648,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 417.0,
"completions/mean_length": 348.28125,
"completions/mean_terminated_length": 262.5238037109375,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.3928,
"grad_norm": 2.811199903488159,
"kl": 0.03350830078125,
"learning_rate": 1e-06,
"loss": 0.0223,
"num_tokens": 6464187.0,
"reward": 0.07400047779083252,
"reward_std": 0.021461695432662964,
"rewards/bleu_reward_func/mean": 0.07400047779083252,
"rewards/bleu_reward_func/std": 0.061210907995700836,
"step": 491
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 225.53125,
"completions/mean_terminated_length": 172.48147583007812,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.3936,
"grad_norm": 8.102995872497559,
"kl": 0.17437744140625,
"learning_rate": 1e-06,
"loss": -0.0621,
"num_tokens": 6477388.0,
"reward": 0.08205416798591614,
"reward_std": 0.02140321210026741,
"rewards/bleu_reward_func/mean": 0.08205416798591614,
"rewards/bleu_reward_func/std": 0.06504324823617935,
"step": 492
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 248.40625,
"completions/mean_terminated_length": 239.90321350097656,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.3944,
"grad_norm": 5.510415554046631,
"kl": 0.054595947265625,
"learning_rate": 1e-06,
"loss": 0.1424,
"num_tokens": 6490681.0,
"reward": 0.09917749464511871,
"reward_std": 0.03953540325164795,
"rewards/bleu_reward_func/mean": 0.09917749464511871,
"rewards/bleu_reward_func/std": 0.062214821577072144,
"step": 493
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 430.0,
"completions/mean_length": 347.65625,
"completions/mean_terminated_length": 202.64706420898438,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.3952,
"grad_norm": 3.754049301147461,
"kl": 0.065643310546875,
"learning_rate": 1e-06,
"loss": -0.0312,
"num_tokens": 6508054.0,
"reward": 0.04995376244187355,
"reward_std": 0.018671657890081406,
"rewards/bleu_reward_func/mean": 0.04995376244187355,
"rewards/bleu_reward_func/std": 0.021997425705194473,
"step": 494
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 304.78125,
"completions/mean_terminated_length": 121.94117736816406,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.396,
"grad_norm": 3.28092098236084,
"kl": 0.0880889892578125,
"learning_rate": 1e-06,
"loss": 0.2271,
"num_tokens": 6528167.0,
"reward": 0.21464568376541138,
"reward_std": 0.04326138645410538,
"rewards/bleu_reward_func/mean": 0.21464568376541138,
"rewards/bleu_reward_func/std": 0.2538887560367584,
"step": 495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 260.0,
"completions/mean_terminated_length": 161.3913116455078,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3968,
"grad_norm": 7.667696475982666,
"kl": 0.27783203125,
"learning_rate": 1e-06,
"loss": -0.0064,
"num_tokens": 6539503.0,
"reward": 0.14023897051811218,
"reward_std": 0.03843347355723381,
"rewards/bleu_reward_func/mean": 0.14023897051811218,
"rewards/bleu_reward_func/std": 0.12260077148675919,
"step": 496
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 312.78125,
"completions/mean_terminated_length": 275.8888854980469,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.3976,
"grad_norm": 5.650123596191406,
"kl": 0.122955322265625,
"learning_rate": 1e-06,
"loss": -0.218,
"num_tokens": 6556880.0,
"reward": 0.2068222463130951,
"reward_std": 0.08186712116003036,
"rewards/bleu_reward_func/mean": 0.2068222463130951,
"rewards/bleu_reward_func/std": 0.30478134751319885,
"step": 497
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 115.5,
"completions/mean_terminated_length": 115.5,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.3984,
"grad_norm": 9.522087097167969,
"kl": 0.31427001953125,
"learning_rate": 1e-06,
"loss": 0.3453,
"num_tokens": 6564648.0,
"reward": 0.21922443807125092,
"reward_std": 0.07997345924377441,
"rewards/bleu_reward_func/mean": 0.21922443807125092,
"rewards/bleu_reward_func/std": 0.12106078118085861,
"step": 498
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 283.46875,
"completions/mean_terminated_length": 194.04348754882812,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.3992,
"grad_norm": 4.472853183746338,
"kl": 0.10198974609375,
"learning_rate": 1e-06,
"loss": 0.0606,
"num_tokens": 6577575.0,
"reward": 0.1807648241519928,
"reward_std": 0.04940491169691086,
"rewards/bleu_reward_func/mean": 0.1807648241519928,
"rewards/bleu_reward_func/std": 0.2276194989681244,
"step": 499
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 257.6875,
"completions/mean_terminated_length": 210.59259033203125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.4,
"grad_norm": 3.429314136505127,
"kl": 0.120086669921875,
"learning_rate": 1e-06,
"loss": 0.1092,
"num_tokens": 6590341.0,
"reward": 0.13892096281051636,
"reward_std": 0.04246610775589943,
"rewards/bleu_reward_func/mean": 0.13892096281051636,
"rewards/bleu_reward_func/std": 0.12665794789791107,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 422.0,
"completions/mean_length": 150.78125,
"completions/mean_terminated_length": 126.70000457763672,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4008,
"grad_norm": 6.932479381561279,
"kl": 0.4072265625,
"learning_rate": 1e-06,
"loss": 0.1306,
"num_tokens": 6604182.0,
"reward": 0.13375571370124817,
"reward_std": 0.05735353007912636,
"rewards/bleu_reward_func/mean": 0.13375571370124817,
"rewards/bleu_reward_func/std": 0.14047691226005554,
"step": 501
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 329.375,
"completions/mean_terminated_length": 257.9130554199219,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.4016,
"grad_norm": 3.9977669715881348,
"kl": 0.0543212890625,
"learning_rate": 1e-06,
"loss": 0.1227,
"num_tokens": 6619994.0,
"reward": 0.08314976096153259,
"reward_std": 0.01850474253296852,
"rewards/bleu_reward_func/mean": 0.08314976096153259,
"rewards/bleu_reward_func/std": 0.03126469627022743,
"step": 502
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 328.875,
"completions/mean_terminated_length": 245.63636779785156,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4024,
"grad_norm": 8.637741088867188,
"kl": 0.294342041015625,
"learning_rate": 1e-06,
"loss": -0.0294,
"num_tokens": 6632862.0,
"reward": 0.21461226046085358,
"reward_std": 0.05726875364780426,
"rewards/bleu_reward_func/mean": 0.21461226046085358,
"rewards/bleu_reward_func/std": 0.19377335906028748,
"step": 503
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 345.71875,
"completions/mean_terminated_length": 245.9499969482422,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.4032,
"grad_norm": 5.415818691253662,
"kl": 0.11663818359375,
"learning_rate": 1e-06,
"loss": -0.1008,
"num_tokens": 6649573.0,
"reward": 0.10018286108970642,
"reward_std": 0.025530360639095306,
"rewards/bleu_reward_func/mean": 0.10018286108970642,
"rewards/bleu_reward_func/std": 0.08217810094356537,
"step": 504
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 293.21875,
"completions/mean_terminated_length": 123.05555725097656,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.404,
"grad_norm": 5.128636837005615,
"kl": 0.0567626953125,
"learning_rate": 1e-06,
"loss": 0.1747,
"num_tokens": 6661556.0,
"reward": 0.08193753659725189,
"reward_std": 0.036860473453998566,
"rewards/bleu_reward_func/mean": 0.08193753659725189,
"rewards/bleu_reward_func/std": 0.0639234408736229,
"step": 505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 383.1875,
"completions/mean_terminated_length": 254.375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.4048,
"grad_norm": 2.5557072162628174,
"kl": 0.03839111328125,
"learning_rate": 1e-06,
"loss": -0.0743,
"num_tokens": 6678938.0,
"reward": 0.05591622740030289,
"reward_std": 0.017734069377183914,
"rewards/bleu_reward_func/mean": 0.05591622740030289,
"rewards/bleu_reward_func/std": 0.04607876017689705,
"step": 506
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 271.0,
"completions/mean_length": 293.625,
"completions/mean_terminated_length": 75.25,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4056,
"grad_norm": 5.220037460327148,
"kl": 0.07464599609375,
"learning_rate": 1e-06,
"loss": 0.0253,
"num_tokens": 6691822.0,
"reward": 0.029562367126345634,
"reward_std": 0.03146641328930855,
"rewards/bleu_reward_func/mean": 0.029562367126345634,
"rewards/bleu_reward_func/std": 0.04593721404671669,
"step": 507
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 453.96875,
"completions/mean_terminated_length": 343.18182373046875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4064,
"grad_norm": 2.189230442047119,
"kl": 0.026153564453125,
"learning_rate": 1e-06,
"loss": -0.0992,
"num_tokens": 6709997.0,
"reward": 0.03725602477788925,
"reward_std": 0.02092660963535309,
"rewards/bleu_reward_func/mean": 0.03725602477788925,
"rewards/bleu_reward_func/std": 0.02429044619202614,
"step": 508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 106.34375,
"completions/mean_terminated_length": 79.30000305175781,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4072,
"grad_norm": 6.567111015319824,
"kl": 0.233154296875,
"learning_rate": 1e-06,
"loss": 0.2905,
"num_tokens": 6718912.0,
"reward": 0.15163123607635498,
"reward_std": 0.039707012474536896,
"rewards/bleu_reward_func/mean": 0.15163123607635498,
"rewards/bleu_reward_func/std": 0.12998701632022858,
"step": 509
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 119.03125,
"completions/mean_terminated_length": 106.3548355102539,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.408,
"grad_norm": 8.991604804992676,
"kl": 0.14703369140625,
"learning_rate": 1e-06,
"loss": 0.0972,
"num_tokens": 6728609.0,
"reward": 0.23723718523979187,
"reward_std": 0.07665139436721802,
"rewards/bleu_reward_func/mean": 0.23723718523979187,
"rewards/bleu_reward_func/std": 0.27060666680336,
"step": 510
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 288.375,
"completions/mean_terminated_length": 171.23809814453125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.4088,
"grad_norm": 6.349617958068848,
"kl": 0.168121337890625,
"learning_rate": 1e-06,
"loss": 0.0688,
"num_tokens": 6743789.0,
"reward": 0.1937231868505478,
"reward_std": 0.13082939386367798,
"rewards/bleu_reward_func/mean": 0.1937231868505478,
"rewards/bleu_reward_func/std": 0.25435397028923035,
"step": 511
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 401.96875,
"completions/mean_terminated_length": 291.9375,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.4096,
"grad_norm": 2.5390427112579346,
"kl": 0.03851318359375,
"learning_rate": 1e-06,
"loss": 0.1042,
"num_tokens": 6759732.0,
"reward": 0.029224077239632607,
"reward_std": 0.016936711966991425,
"rewards/bleu_reward_func/mean": 0.029224077239632607,
"rewards/bleu_reward_func/std": 0.022709792479872704,
"step": 512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 268.46875,
"completions/mean_terminated_length": 79.05555725097656,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.4104,
"grad_norm": 3.7983713150024414,
"kl": 0.137786865234375,
"learning_rate": 1e-06,
"loss": -0.0173,
"num_tokens": 6774051.0,
"reward": 0.20052862167358398,
"reward_std": 0.028155002743005753,
"rewards/bleu_reward_func/mean": 0.20052862167358398,
"rewards/bleu_reward_func/std": 0.2302575409412384,
"step": 513
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 388.71875,
"completions/mean_terminated_length": 360.2692565917969,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.4112,
"grad_norm": 2.4446346759796143,
"kl": 0.028076171875,
"learning_rate": 1e-06,
"loss": -0.0783,
"num_tokens": 6790610.0,
"reward": 0.10578086227178574,
"reward_std": 0.029093941673636436,
"rewards/bleu_reward_func/mean": 0.10578086227178574,
"rewards/bleu_reward_func/std": 0.08641202747821808,
"step": 514
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 275.34375,
"completions/mean_terminated_length": 259.5666809082031,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.412,
"grad_norm": 5.883263111114502,
"kl": 0.18634033203125,
"learning_rate": 1e-06,
"loss": 0.0231,
"num_tokens": 6803965.0,
"reward": 0.1322258561849594,
"reward_std": 0.030806170776486397,
"rewards/bleu_reward_func/mean": 0.1322258561849594,
"rewards/bleu_reward_func/std": 0.16078709065914154,
"step": 515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 333.84375,
"completions/mean_terminated_length": 274.4583435058594,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.4128,
"grad_norm": 3.016139507293701,
"kl": 0.03173828125,
"learning_rate": 1e-06,
"loss": 0.025,
"num_tokens": 6818840.0,
"reward": 0.09323176741600037,
"reward_std": 0.05342460051178932,
"rewards/bleu_reward_func/mean": 0.09323176741600037,
"rewards/bleu_reward_func/std": 0.06577997654676437,
"step": 516
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 276.5625,
"completions/mean_terminated_length": 210.63999938964844,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.4136,
"grad_norm": 4.685121059417725,
"kl": 0.050933837890625,
"learning_rate": 1e-06,
"loss": -0.1784,
"num_tokens": 6830770.0,
"reward": 0.03872024267911911,
"reward_std": 0.016178004443645477,
"rewards/bleu_reward_func/mean": 0.03872024267911911,
"rewards/bleu_reward_func/std": 0.025313377380371094,
"step": 517
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 362.875,
"completions/mean_terminated_length": 231.2941131591797,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.4144,
"grad_norm": 4.639893054962158,
"kl": 0.122711181640625,
"learning_rate": 1e-06,
"loss": 0.0545,
"num_tokens": 6848302.0,
"reward": 0.07996964454650879,
"reward_std": 0.01709877885878086,
"rewards/bleu_reward_func/mean": 0.07996964454650879,
"rewards/bleu_reward_func/std": 0.10056579113006592,
"step": 518
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 294.625,
"completions/mean_terminated_length": 263.5714416503906,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.4152,
"grad_norm": 9.085565567016602,
"kl": 0.1670379638671875,
"learning_rate": 1e-06,
"loss": -0.1226,
"num_tokens": 6859826.0,
"reward": 0.10505213588476181,
"reward_std": 0.05224030464887619,
"rewards/bleu_reward_func/mean": 0.10505213588476181,
"rewards/bleu_reward_func/std": 0.0725407749414444,
"step": 519
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 258.3125,
"completions/mean_terminated_length": 173.75,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.416,
"grad_norm": 7.134840965270996,
"kl": 0.175750732421875,
"learning_rate": 1e-06,
"loss": 0.1168,
"num_tokens": 6873516.0,
"reward": 0.21853026747703552,
"reward_std": 0.06429094821214676,
"rewards/bleu_reward_func/mean": 0.21853026747703552,
"rewards/bleu_reward_func/std": 0.14174966514110565,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 328.65625,
"completions/mean_terminated_length": 256.9130554199219,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.4168,
"grad_norm": 6.0497517585754395,
"kl": 0.131439208984375,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 6887761.0,
"reward": 0.0685054138302803,
"reward_std": 0.012891553342342377,
"rewards/bleu_reward_func/mean": 0.0685054138302803,
"rewards/bleu_reward_func/std": 0.057060711085796356,
"step": 521
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 267.0625,
"completions/mean_terminated_length": 221.70370483398438,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.4176,
"grad_norm": 4.475714683532715,
"kl": 0.047637939453125,
"learning_rate": 1e-06,
"loss": 0.1563,
"num_tokens": 6901251.0,
"reward": 0.18483126163482666,
"reward_std": 0.02913127839565277,
"rewards/bleu_reward_func/mean": 0.18483126163482666,
"rewards/bleu_reward_func/std": 0.16543246805667877,
"step": 522
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 228.53125,
"completions/mean_terminated_length": 163.11538696289062,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.4184,
"grad_norm": 7.8729143142700195,
"kl": 0.297882080078125,
"learning_rate": 1e-06,
"loss": 0.0428,
"num_tokens": 6912844.0,
"reward": 0.1846814900636673,
"reward_std": 0.10159599035978317,
"rewards/bleu_reward_func/mean": 0.1846814900636673,
"rewards/bleu_reward_func/std": 0.2030598670244217,
"step": 523
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 251.09375,
"completions/mean_terminated_length": 233.70001220703125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.4192,
"grad_norm": 4.603794097900391,
"kl": 0.08197021484375,
"learning_rate": 1e-06,
"loss": -0.0057,
"num_tokens": 6923895.0,
"reward": 0.11323156207799911,
"reward_std": 0.03932211175560951,
"rewards/bleu_reward_func/mean": 0.11323156207799911,
"rewards/bleu_reward_func/std": 0.08274988830089569,
"step": 524
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 288.0,
"completions/max_terminated_length": 288.0,
"completions/mean_length": 76.625,
"completions/mean_terminated_length": 76.625,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.42,
"grad_norm": 8.708818435668945,
"kl": 0.1953125,
"learning_rate": 1e-06,
"loss": 0.3306,
"num_tokens": 6934091.0,
"reward": 0.18468719720840454,
"reward_std": 0.0689420998096466,
"rewards/bleu_reward_func/mean": 0.18468719720840454,
"rewards/bleu_reward_func/std": 0.12529541552066803,
"step": 525
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 256.5,
"completions/mean_terminated_length": 220.00001525878906,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.4208,
"grad_norm": 15.188727378845215,
"kl": 0.10748291015625,
"learning_rate": 1e-06,
"loss": -0.0496,
"num_tokens": 6946923.0,
"reward": 0.09780866652727127,
"reward_std": 0.029562484472990036,
"rewards/bleu_reward_func/mean": 0.09780866652727127,
"rewards/bleu_reward_func/std": 0.09735672175884247,
"step": 526
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 246.75,
"completions/mean_terminated_length": 107.80952453613281,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4216,
"grad_norm": 5.919389247894287,
"kl": 0.1287841796875,
"learning_rate": 1e-06,
"loss": 0.0944,
"num_tokens": 6958675.0,
"reward": 0.049182113260030746,
"reward_std": 0.03928225487470627,
"rewards/bleu_reward_func/mean": 0.049182113260030746,
"rewards/bleu_reward_func/std": 0.05703483149409294,
"step": 527
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 298.59375,
"completions/mean_terminated_length": 268.1071472167969,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.4224,
"grad_norm": 4.162198066711426,
"kl": 0.048370361328125,
"learning_rate": 1e-06,
"loss": -0.0255,
"num_tokens": 6973038.0,
"reward": 0.19552364945411682,
"reward_std": 0.05411393195390701,
"rewards/bleu_reward_func/mean": 0.19552364945411682,
"rewards/bleu_reward_func/std": 0.11564817279577255,
"step": 528
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 343.125,
"completions/mean_terminated_length": 266.3636474609375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.4232,
"grad_norm": 7.422494411468506,
"kl": 0.06976318359375,
"learning_rate": 1e-06,
"loss": 0.0526,
"num_tokens": 6988034.0,
"reward": 0.03407738357782364,
"reward_std": 0.010626979172229767,
"rewards/bleu_reward_func/mean": 0.03407738357782364,
"rewards/bleu_reward_func/std": 0.027887288480997086,
"step": 529
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 249.0,
"completions/max_terminated_length": 249.0,
"completions/mean_length": 53.53125,
"completions/mean_terminated_length": 53.53125,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.424,
"grad_norm": 13.498769760131836,
"kl": 0.46209716796875,
"learning_rate": 1e-06,
"loss": 0.0087,
"num_tokens": 6995419.0,
"reward": 0.24595381319522858,
"reward_std": 0.09870806336402893,
"rewards/bleu_reward_func/mean": 0.24595381319522858,
"rewards/bleu_reward_func/std": 0.1663571149110794,
"step": 530
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 392.5,
"completions/mean_terminated_length": 287.058837890625,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.4248,
"grad_norm": 2.2019829750061035,
"kl": 0.0286407470703125,
"learning_rate": 1e-06,
"loss": -0.2369,
"num_tokens": 7014707.0,
"reward": 0.12730640172958374,
"reward_std": 0.03398028016090393,
"rewards/bleu_reward_func/mean": 0.12730640172958374,
"rewards/bleu_reward_func/std": 0.20578297972679138,
"step": 531
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 317.15625,
"completions/mean_terminated_length": 262.6000061035156,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.4256,
"grad_norm": 5.298976421356201,
"kl": 0.087127685546875,
"learning_rate": 1e-06,
"loss": -0.0481,
"num_tokens": 7030000.0,
"reward": 0.06116287037730217,
"reward_std": 0.04584234952926636,
"rewards/bleu_reward_func/mean": 0.06116287037730217,
"rewards/bleu_reward_func/std": 0.07913482189178467,
"step": 532
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 321.34375,
"completions/mean_terminated_length": 234.68182373046875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.4264,
"grad_norm": 4.799532413482666,
"kl": 0.1138916015625,
"learning_rate": 1e-06,
"loss": -0.0807,
"num_tokens": 7046643.0,
"reward": 0.08829933404922485,
"reward_std": 0.03609791770577431,
"rewards/bleu_reward_func/mean": 0.08829933404922485,
"rewards/bleu_reward_func/std": 0.10983619093894958,
"step": 533
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 196.84375,
"completions/mean_terminated_length": 151.82144165039062,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4272,
"grad_norm": 15.142457008361816,
"kl": 0.25970458984375,
"learning_rate": 1e-06,
"loss": -0.0027,
"num_tokens": 7057718.0,
"reward": 0.10968612134456635,
"reward_std": 0.05676144361495972,
"rewards/bleu_reward_func/mean": 0.10968612134456635,
"rewards/bleu_reward_func/std": 0.1397821009159088,
"step": 534
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 244.9375,
"completions/mean_terminated_length": 195.48147583007812,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.428,
"grad_norm": 6.9789934158325195,
"kl": 0.092926025390625,
"learning_rate": 1e-06,
"loss": 0.1057,
"num_tokens": 7073980.0,
"reward": 0.19463014602661133,
"reward_std": 0.09179598838090897,
"rewards/bleu_reward_func/mean": 0.19463014602661133,
"rewards/bleu_reward_func/std": 0.1903815120458603,
"step": 535
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 146.5,
"completions/mean_terminated_length": 122.13333892822266,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4288,
"grad_norm": 10.111763000488281,
"kl": 0.143310546875,
"learning_rate": 1e-06,
"loss": -0.0902,
"num_tokens": 7085228.0,
"reward": 0.15931251645088196,
"reward_std": 0.06651220470666885,
"rewards/bleu_reward_func/mean": 0.15931251645088196,
"rewards/bleu_reward_func/std": 0.10370245575904846,
"step": 536
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 245.09375,
"completions/mean_terminated_length": 183.5,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.4296,
"grad_norm": 11.313093185424805,
"kl": 0.135894775390625,
"learning_rate": 1e-06,
"loss": 0.2524,
"num_tokens": 7098583.0,
"reward": 0.08501166105270386,
"reward_std": 0.03819301724433899,
"rewards/bleu_reward_func/mean": 0.08501166105270386,
"rewards/bleu_reward_func/std": 0.0931810513138771,
"step": 537
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 261.5,
"completions/mean_terminated_length": 244.80001831054688,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.4304,
"grad_norm": 4.153919696807861,
"kl": 0.050628662109375,
"learning_rate": 1e-06,
"loss": -0.1541,
"num_tokens": 7108663.0,
"reward": 0.06835095584392548,
"reward_std": 0.042577650398015976,
"rewards/bleu_reward_func/mean": 0.06835095584392548,
"rewards/bleu_reward_func/std": 0.05704295262694359,
"step": 538
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 226.375,
"completions/mean_terminated_length": 173.48147583007812,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4312,
"grad_norm": 13.284401893615723,
"kl": 0.14434814453125,
"learning_rate": 1e-06,
"loss": -0.0842,
"num_tokens": 7118427.0,
"reward": 0.08002069592475891,
"reward_std": 0.029213791713118553,
"rewards/bleu_reward_func/mean": 0.08002069592475891,
"rewards/bleu_reward_func/std": 0.03687189891934395,
"step": 539
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 123.65625,
"completions/mean_terminated_length": 83.48275756835938,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.432,
"grad_norm": 23.51561164855957,
"kl": 0.1839599609375,
"learning_rate": 1e-06,
"loss": 0.4993,
"num_tokens": 7128856.0,
"reward": 0.2179010808467865,
"reward_std": 0.08272600173950195,
"rewards/bleu_reward_func/mean": 0.2179010808467865,
"rewards/bleu_reward_func/std": 0.26301127672195435,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 332.40625,
"completions/mean_terminated_length": 173.94117736816406,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.4328,
"grad_norm": 10.934617042541504,
"kl": 0.10113525390625,
"learning_rate": 1e-06,
"loss": -0.1254,
"num_tokens": 7141661.0,
"reward": 0.06413869559764862,
"reward_std": 0.05120678246021271,
"rewards/bleu_reward_func/mean": 0.06413869559764862,
"rewards/bleu_reward_func/std": 0.09179537743330002,
"step": 541
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 294.375,
"completions/mean_terminated_length": 221.83334350585938,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4336,
"grad_norm": 3.505484104156494,
"kl": 0.06707763671875,
"learning_rate": 1e-06,
"loss": 0.0835,
"num_tokens": 7153161.0,
"reward": 0.09516981989145279,
"reward_std": 0.044140610843896866,
"rewards/bleu_reward_func/mean": 0.09516981989145279,
"rewards/bleu_reward_func/std": 0.049775656312704086,
"step": 542
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 326.25,
"completions/mean_terminated_length": 214.8000030517578,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.4344,
"grad_norm": 7.354869842529297,
"kl": 0.10260009765625,
"learning_rate": 1e-06,
"loss": -0.1239,
"num_tokens": 7167649.0,
"reward": 0.03533574938774109,
"reward_std": 0.014214935712516308,
"rewards/bleu_reward_func/mean": 0.03533574938774109,
"rewards/bleu_reward_func/std": 0.027195338159799576,
"step": 543
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 225.78125,
"completions/mean_terminated_length": 159.73077392578125,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.4352,
"grad_norm": 5.206735610961914,
"kl": 0.09149169921875,
"learning_rate": 1e-06,
"loss": -0.0428,
"num_tokens": 7181226.0,
"reward": 0.22954684495925903,
"reward_std": 0.06006891652941704,
"rewards/bleu_reward_func/mean": 0.22954684495925903,
"rewards/bleu_reward_func/std": 0.11863149702548981,
"step": 544
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 250.75,
"completions/mean_terminated_length": 190.4615478515625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.436,
"grad_norm": 5.510367393493652,
"kl": 0.05084228515625,
"learning_rate": 1e-06,
"loss": 0.1971,
"num_tokens": 7191810.0,
"reward": 0.08453569561243057,
"reward_std": 0.050511520355939865,
"rewards/bleu_reward_func/mean": 0.08453569561243057,
"rewards/bleu_reward_func/std": 0.07364515960216522,
"step": 545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 388.34375,
"completions/mean_terminated_length": 314.1499938964844,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.4368,
"grad_norm": 3.61635160446167,
"kl": 0.045135498046875,
"learning_rate": 1e-06,
"loss": 0.0817,
"num_tokens": 7206789.0,
"reward": 0.050152119249105453,
"reward_std": 0.03165213763713837,
"rewards/bleu_reward_func/mean": 0.050152119249105453,
"rewards/bleu_reward_func/std": 0.05620579421520233,
"step": 546
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 160.0,
"completions/max_terminated_length": 160.0,
"completions/mean_length": 57.5,
"completions/mean_terminated_length": 57.5,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.4376,
"grad_norm": 8.828208923339844,
"kl": 0.25537109375,
"learning_rate": 1e-06,
"loss": 0.103,
"num_tokens": 7213949.0,
"reward": 0.20786888897418976,
"reward_std": 0.06727642565965652,
"rewards/bleu_reward_func/mean": 0.20786888897418976,
"rewards/bleu_reward_func/std": 0.1706974357366562,
"step": 547
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 275.125,
"completions/mean_terminated_length": 196.1666717529297,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.4384,
"grad_norm": 13.268147468566895,
"kl": 0.058074951171875,
"learning_rate": 1e-06,
"loss": -0.0694,
"num_tokens": 7227041.0,
"reward": 0.05118046700954437,
"reward_std": 0.02497515268623829,
"rewards/bleu_reward_func/mean": 0.05118046700954437,
"rewards/bleu_reward_func/std": 0.035916514694690704,
"step": 548
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 351.3125,
"completions/mean_terminated_length": 278.2727355957031,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.4392,
"grad_norm": 13.135753631591797,
"kl": 0.077850341796875,
"learning_rate": 1e-06,
"loss": 0.0555,
"num_tokens": 7244763.0,
"reward": 0.07840518653392792,
"reward_std": 0.022635504603385925,
"rewards/bleu_reward_func/mean": 0.07840518653392792,
"rewards/bleu_reward_func/std": 0.06580173969268799,
"step": 549
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 201.1875,
"completions/mean_terminated_length": 156.7857208251953,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.44,
"grad_norm": 7.055432319641113,
"kl": 0.25830078125,
"learning_rate": 1e-06,
"loss": -0.0546,
"num_tokens": 7256785.0,
"reward": 0.253431499004364,
"reward_std": 0.028121720999479294,
"rewards/bleu_reward_func/mean": 0.253431499004364,
"rewards/bleu_reward_func/std": 0.20365522801876068,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 336.8125,
"completions/mean_terminated_length": 287.7599792480469,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.4408,
"grad_norm": 3.6197187900543213,
"kl": 0.046844482421875,
"learning_rate": 1e-06,
"loss": 0.1119,
"num_tokens": 7268987.0,
"reward": 0.061213478446006775,
"reward_std": 0.01489005982875824,
"rewards/bleu_reward_func/mean": 0.061213478446006775,
"rewards/bleu_reward_func/std": 0.038935884833335876,
"step": 551
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 386.21875,
"completions/mean_terminated_length": 351.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.4416,
"grad_norm": 2.6066031455993652,
"kl": 0.034149169921875,
"learning_rate": 1e-06,
"loss": -0.1341,
"num_tokens": 7287306.0,
"reward": 0.11066319048404694,
"reward_std": 0.105903759598732,
"rewards/bleu_reward_func/mean": 0.11066319048404694,
"rewards/bleu_reward_func/std": 0.16723419725894928,
"step": 552
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 263.28125,
"completions/mean_terminated_length": 237.55172729492188,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4424,
"grad_norm": 9.281195640563965,
"kl": 0.226043701171875,
"learning_rate": 1e-06,
"loss": -0.1571,
"num_tokens": 7299899.0,
"reward": 0.06672249734401703,
"reward_std": 0.03525693714618683,
"rewards/bleu_reward_func/mean": 0.06672249734401703,
"rewards/bleu_reward_func/std": 0.0810592845082283,
"step": 553
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 233.84375,
"completions/mean_terminated_length": 182.3333282470703,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4432,
"grad_norm": 7.543862342834473,
"kl": 0.308807373046875,
"learning_rate": 1e-06,
"loss": 0.0082,
"num_tokens": 7313638.0,
"reward": 0.30825120210647583,
"reward_std": 0.07663644850254059,
"rewards/bleu_reward_func/mean": 0.30825120210647583,
"rewards/bleu_reward_func/std": 0.1689450740814209,
"step": 554
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 246.40625,
"completions/mean_terminated_length": 218.9310302734375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.444,
"grad_norm": 4.433272361755371,
"kl": 0.104339599609375,
"learning_rate": 1e-06,
"loss": 0.0582,
"num_tokens": 7324939.0,
"reward": 0.19763408601284027,
"reward_std": 0.028635632246732712,
"rewards/bleu_reward_func/mean": 0.19763408601284027,
"rewards/bleu_reward_func/std": 0.18309614062309265,
"step": 555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 280.0,
"completions/mean_length": 62.96875,
"completions/mean_terminated_length": 48.48386764526367,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4448,
"grad_norm": 10.322346687316895,
"kl": 0.1849365234375,
"learning_rate": 1e-06,
"loss": 0.432,
"num_tokens": 7330802.0,
"reward": 0.19210518896579742,
"reward_std": 0.03121430240571499,
"rewards/bleu_reward_func/mean": 0.19210518896579742,
"rewards/bleu_reward_func/std": 0.16853223741054535,
"step": 556
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 202.78125,
"completions/mean_terminated_length": 182.1666717529297,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.4456,
"grad_norm": 9.519190788269043,
"kl": 0.171844482421875,
"learning_rate": 1e-06,
"loss": 0.0451,
"num_tokens": 7339683.0,
"reward": 0.170665442943573,
"reward_std": 0.06568457931280136,
"rewards/bleu_reward_func/mean": 0.170665442943573,
"rewards/bleu_reward_func/std": 0.1584860235452652,
"step": 557
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 256.75,
"completions/mean_terminated_length": 123.04762268066406,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.4464,
"grad_norm": 5.70733118057251,
"kl": 0.177093505859375,
"learning_rate": 1e-06,
"loss": -0.0177,
"num_tokens": 7354059.0,
"reward": 0.11887075752019882,
"reward_std": 0.037268251180648804,
"rewards/bleu_reward_func/mean": 0.11887075752019882,
"rewards/bleu_reward_func/std": 0.09704269468784332,
"step": 558
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 237.28125,
"completions/mean_terminated_length": 145.70834350585938,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4472,
"grad_norm": 10.510950088500977,
"kl": 0.141387939453125,
"learning_rate": 1e-06,
"loss": 0.2298,
"num_tokens": 7369252.0,
"reward": 0.11686157435178757,
"reward_std": 0.06300412118434906,
"rewards/bleu_reward_func/mean": 0.11686157435178757,
"rewards/bleu_reward_func/std": 0.10008818656206131,
"step": 559
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 294.21875,
"completions/mean_terminated_length": 209.0,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.448,
"grad_norm": 12.427102088928223,
"kl": 0.15325927734375,
"learning_rate": 1e-06,
"loss": -0.1463,
"num_tokens": 7384539.0,
"reward": 0.10454531759023666,
"reward_std": 0.032633934170007706,
"rewards/bleu_reward_func/mean": 0.10454531759023666,
"rewards/bleu_reward_func/std": 0.09093461185693741,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 175.84375,
"completions/mean_terminated_length": 63.79166793823242,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4488,
"grad_norm": 18.402080535888672,
"kl": 0.187530517578125,
"learning_rate": 1e-06,
"loss": 0.6927,
"num_tokens": 7394846.0,
"reward": 0.21487680077552795,
"reward_std": 0.08058933913707733,
"rewards/bleu_reward_func/mean": 0.21487680077552795,
"rewards/bleu_reward_func/std": 0.20088493824005127,
"step": 561
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 425.21875,
"completions/mean_terminated_length": 379.76190185546875,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.4496,
"grad_norm": 2.861811637878418,
"kl": 0.030670166015625,
"learning_rate": 1e-06,
"loss": -0.0581,
"num_tokens": 7414069.0,
"reward": 0.09261719137430191,
"reward_std": 0.046390384435653687,
"rewards/bleu_reward_func/mean": 0.09261719137430191,
"rewards/bleu_reward_func/std": 0.14345434308052063,
"step": 562
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 372.53125,
"completions/mean_terminated_length": 277.1052551269531,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.4504,
"grad_norm": 3.737154960632324,
"kl": 0.07086181640625,
"learning_rate": 1e-06,
"loss": -0.0085,
"num_tokens": 7433038.0,
"reward": 0.13954411447048187,
"reward_std": 0.09964635223150253,
"rewards/bleu_reward_func/mean": 0.13954411447048187,
"rewards/bleu_reward_func/std": 0.2269161492586136,
"step": 563
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 217.5,
"completions/mean_terminated_length": 175.42857360839844,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.4512,
"grad_norm": 9.693827629089355,
"kl": 0.2703857421875,
"learning_rate": 1e-06,
"loss": 0.0569,
"num_tokens": 7442590.0,
"reward": 0.08689892292022705,
"reward_std": 0.046516068279743195,
"rewards/bleu_reward_func/mean": 0.08689892292022705,
"rewards/bleu_reward_func/std": 0.09460947662591934,
"step": 564
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 344.3125,
"completions/mean_terminated_length": 320.3571472167969,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.452,
"grad_norm": 3.287851333618164,
"kl": 0.03594970703125,
"learning_rate": 1e-06,
"loss": -0.0888,
"num_tokens": 7455856.0,
"reward": 0.09177221357822418,
"reward_std": 0.02658715285360813,
"rewards/bleu_reward_func/mean": 0.09177221357822418,
"rewards/bleu_reward_func/std": 0.04939228668808937,
"step": 565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 257.0,
"completions/mean_terminated_length": 141.09091186523438,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.4528,
"grad_norm": 5.329028129577637,
"kl": 0.21087646484375,
"learning_rate": 1e-06,
"loss": -0.0318,
"num_tokens": 7471128.0,
"reward": 0.30248120427131653,
"reward_std": 0.045193642377853394,
"rewards/bleu_reward_func/mean": 0.30248120427131653,
"rewards/bleu_reward_func/std": 0.09429154545068741,
"step": 566
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 396.5625,
"completions/mean_terminated_length": 306.77777099609375,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.4536,
"grad_norm": 2.8240556716918945,
"kl": 0.029327392578125,
"learning_rate": 1e-06,
"loss": -0.006,
"num_tokens": 7486778.0,
"reward": 0.046158432960510254,
"reward_std": 0.012592589482665062,
"rewards/bleu_reward_func/mean": 0.046158432960510254,
"rewards/bleu_reward_func/std": 0.0691753551363945,
"step": 567
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 221.5,
"completions/mean_terminated_length": 180.00001525878906,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.4544,
"grad_norm": 5.554408073425293,
"kl": 0.077850341796875,
"learning_rate": 1e-06,
"loss": -0.0114,
"num_tokens": 7501522.0,
"reward": 0.19211658835411072,
"reward_std": 0.052228912711143494,
"rewards/bleu_reward_func/mean": 0.19211658835411072,
"rewards/bleu_reward_func/std": 0.12220965325832367,
"step": 568
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 370.0,
"completions/mean_length": 282.90625,
"completions/mean_terminated_length": 104.72222137451172,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4552,
"grad_norm": 8.932016372680664,
"kl": 0.1943359375,
"learning_rate": 1e-06,
"loss": 0.0056,
"num_tokens": 7515511.0,
"reward": 0.08466814458370209,
"reward_std": 0.03040888160467148,
"rewards/bleu_reward_func/mean": 0.08466814458370209,
"rewards/bleu_reward_func/std": 0.07005324959754944,
"step": 569
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 450.0,
"completions/mean_terminated_length": 388.0,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.456,
"grad_norm": 1.950373888015747,
"kl": 0.0325927734375,
"learning_rate": 1e-06,
"loss": 0.0603,
"num_tokens": 7535791.0,
"reward": 0.06426975131034851,
"reward_std": 0.02304723486304283,
"rewards/bleu_reward_func/mean": 0.06426975131034851,
"rewards/bleu_reward_func/std": 0.04708797112107277,
"step": 570
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 200.03125,
"completions/mean_terminated_length": 167.7586212158203,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.4568,
"grad_norm": 8.692915916442871,
"kl": 0.300445556640625,
"learning_rate": 1e-06,
"loss": 0.179,
"num_tokens": 7548720.0,
"reward": 0.16858291625976562,
"reward_std": 0.04772442951798439,
"rewards/bleu_reward_func/mean": 0.16858291625976562,
"rewards/bleu_reward_func/std": 0.187880739569664,
"step": 571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 221.4375,
"completions/mean_terminated_length": 154.38462829589844,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.4576,
"grad_norm": 5.559481143951416,
"kl": 0.2091064453125,
"learning_rate": 1e-06,
"loss": 0.1461,
"num_tokens": 7559926.0,
"reward": 0.2749570608139038,
"reward_std": 0.07935648411512375,
"rewards/bleu_reward_func/mean": 0.2749570608139038,
"rewards/bleu_reward_func/std": 0.20695801079273224,
"step": 572
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 298.25,
"completions/mean_terminated_length": 227.0,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.4584,
"grad_norm": 4.713781833648682,
"kl": 0.10894775390625,
"learning_rate": 1e-06,
"loss": -0.1979,
"num_tokens": 7573214.0,
"reward": 0.11424913257360458,
"reward_std": 0.0238350722938776,
"rewards/bleu_reward_func/mean": 0.11424913257360458,
"rewards/bleu_reward_func/std": 0.1513095498085022,
"step": 573
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 301.375,
"completions/mean_terminated_length": 231.1666717529297,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.4592,
"grad_norm": 6.019801616668701,
"kl": 0.117706298828125,
"learning_rate": 1e-06,
"loss": -0.0467,
"num_tokens": 7584922.0,
"reward": 0.12773753702640533,
"reward_std": 0.03902646526694298,
"rewards/bleu_reward_func/mean": 0.12773753702640533,
"rewards/bleu_reward_func/std": 0.08676618337631226,
"step": 574
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 261.3125,
"completions/mean_terminated_length": 40.11764907836914,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.46,
"grad_norm": 20.097490310668945,
"kl": 0.32666015625,
"learning_rate": 1e-06,
"loss": 0.0115,
"num_tokens": 7599564.0,
"reward": 0.1775631606578827,
"reward_std": 0.05471285060048103,
"rewards/bleu_reward_func/mean": 0.1775631606578827,
"rewards/bleu_reward_func/std": 0.1462731659412384,
"step": 575
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 224.59375,
"completions/mean_terminated_length": 93.95455169677734,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.4608,
"grad_norm": 12.845754623413086,
"kl": 0.2415771484375,
"learning_rate": 1e-06,
"loss": 0.1681,
"num_tokens": 7609343.0,
"reward": 0.10711174458265305,
"reward_std": 0.03790780156850815,
"rewards/bleu_reward_func/mean": 0.10711174458265305,
"rewards/bleu_reward_func/std": 0.11842114478349686,
"step": 576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 429.15625,
"completions/mean_terminated_length": 217.44444274902344,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.4616,
"grad_norm": 3.16619873046875,
"kl": 0.031341552734375,
"learning_rate": 1e-06,
"loss": -0.1864,
"num_tokens": 7628708.0,
"reward": 0.12277669459581375,
"reward_std": 0.030532412230968475,
"rewards/bleu_reward_func/mean": 0.12277669459581375,
"rewards/bleu_reward_func/std": 0.14895910024642944,
"step": 577
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 190.625,
"completions/mean_terminated_length": 100.63999938964844,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.4624,
"grad_norm": 6.808165550231934,
"kl": 0.2611083984375,
"learning_rate": 1e-06,
"loss": -0.043,
"num_tokens": 7637504.0,
"reward": 0.06198694184422493,
"reward_std": 0.018319500610232353,
"rewards/bleu_reward_func/mean": 0.06198694184422493,
"rewards/bleu_reward_func/std": 0.05399094894528389,
"step": 578
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 422.0,
"completions/mean_length": 306.65625,
"completions/mean_terminated_length": 125.47058868408203,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.4632,
"grad_norm": 6.689248085021973,
"kl": 0.165435791015625,
"learning_rate": 1e-06,
"loss": 0.0301,
"num_tokens": 7651797.0,
"reward": 0.07045552134513855,
"reward_std": 0.018690217286348343,
"rewards/bleu_reward_func/mean": 0.07045552134513855,
"rewards/bleu_reward_func/std": 0.07137548923492432,
"step": 579
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 332.6875,
"completions/mean_terminated_length": 193.22222900390625,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.464,
"grad_norm": 4.129982948303223,
"kl": 0.050750732421875,
"learning_rate": 1e-06,
"loss": 0.2464,
"num_tokens": 7667787.0,
"reward": 0.09383320808410645,
"reward_std": 0.046889662742614746,
"rewards/bleu_reward_func/mean": 0.09383320808410645,
"rewards/bleu_reward_func/std": 0.10615876317024231,
"step": 580
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 145.375,
"completions/mean_terminated_length": 42.71999740600586,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.4648,
"grad_norm": 9.511686325073242,
"kl": 0.385894775390625,
"learning_rate": 1e-06,
"loss": 0.0501,
"num_tokens": 7679535.0,
"reward": 0.08031031489372253,
"reward_std": 0.036660827696323395,
"rewards/bleu_reward_func/mean": 0.08031031489372253,
"rewards/bleu_reward_func/std": 0.07939815521240234,
"step": 581
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 294.0,
"completions/mean_terminated_length": 271.4482727050781,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.4656,
"grad_norm": 7.2719950675964355,
"kl": 0.149932861328125,
"learning_rate": 1e-06,
"loss": 0.0885,
"num_tokens": 7690815.0,
"reward": 0.11769823729991913,
"reward_std": 0.02824997529387474,
"rewards/bleu_reward_func/mean": 0.11769823729991913,
"rewards/bleu_reward_func/std": 0.12788043916225433,
"step": 582
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 453.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 120.15625,
"completions/mean_terminated_length": 120.15625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.4664,
"grad_norm": 11.640869140625,
"kl": 0.19036865234375,
"learning_rate": 1e-06,
"loss": 0.1164,
"num_tokens": 7698172.0,
"reward": 0.052216824144124985,
"reward_std": 0.015741443261504173,
"rewards/bleu_reward_func/mean": 0.052216824144124985,
"rewards/bleu_reward_func/std": 0.01899011991918087,
"step": 583
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 236.46875,
"completions/mean_terminated_length": 144.625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.4672,
"grad_norm": 7.708470821380615,
"kl": 0.25323486328125,
"learning_rate": 1e-06,
"loss": 0.0057,
"num_tokens": 7711683.0,
"reward": 0.20987500250339508,
"reward_std": 0.050422437489032745,
"rewards/bleu_reward_func/mean": 0.20987500250339508,
"rewards/bleu_reward_func/std": 0.21432380378246307,
"step": 584
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 255.09375,
"completions/mean_terminated_length": 120.52381134033203,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.468,
"grad_norm": 8.178823471069336,
"kl": 0.181243896484375,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 7721638.0,
"reward": 0.11023026704788208,
"reward_std": 0.03732236102223396,
"rewards/bleu_reward_func/mean": 0.11023026704788208,
"rewards/bleu_reward_func/std": 0.06018221378326416,
"step": 585
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 209.40625,
"completions/mean_terminated_length": 124.68000030517578,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.4688,
"grad_norm": 6.163815498352051,
"kl": 0.057861328125,
"learning_rate": 1e-06,
"loss": -0.382,
"num_tokens": 7731483.0,
"reward": 0.022579330950975418,
"reward_std": 0.024172717705368996,
"rewards/bleu_reward_func/mean": 0.022579330950975418,
"rewards/bleu_reward_func/std": 0.03154170513153076,
"step": 586
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 179.28125,
"completions/mean_terminated_length": 131.75,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.4696,
"grad_norm": 24.94843864440918,
"kl": 0.303466796875,
"learning_rate": 1e-06,
"loss": -0.149,
"num_tokens": 7740828.0,
"reward": 0.062010329216718674,
"reward_std": 0.030193448066711426,
"rewards/bleu_reward_func/mean": 0.062010329216718674,
"rewards/bleu_reward_func/std": 0.04090145602822304,
"step": 587
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 372.0625,
"completions/mean_terminated_length": 276.3157958984375,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4704,
"grad_norm": 2.5675299167633057,
"kl": 0.03485107421875,
"learning_rate": 1e-06,
"loss": 0.0333,
"num_tokens": 7757862.0,
"reward": 0.037547022104263306,
"reward_std": 0.01179808471351862,
"rewards/bleu_reward_func/mean": 0.037547022104263306,
"rewards/bleu_reward_func/std": 0.03366583213210106,
"step": 588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 268.875,
"completions/mean_terminated_length": 223.8518524169922,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.4712,
"grad_norm": 9.238602638244629,
"kl": 0.09326171875,
"learning_rate": 1e-06,
"loss": 0.0178,
"num_tokens": 7769098.0,
"reward": 0.0967094898223877,
"reward_std": 0.041084855794906616,
"rewards/bleu_reward_func/mean": 0.0967094898223877,
"rewards/bleu_reward_func/std": 0.10235904902219772,
"step": 589
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 271.71875,
"completions/mean_terminated_length": 107.31578826904297,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.472,
"grad_norm": 12.115531921386719,
"kl": 0.196563720703125,
"learning_rate": 1e-06,
"loss": 0.3242,
"num_tokens": 7782097.0,
"reward": 0.19325336813926697,
"reward_std": 0.0921676903963089,
"rewards/bleu_reward_func/mean": 0.19325336813926697,
"rewards/bleu_reward_func/std": 0.24508582055568695,
"step": 590
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 197.59375,
"completions/mean_terminated_length": 176.6333465576172,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.4728,
"grad_norm": 9.336063385009766,
"kl": 0.17633056640625,
"learning_rate": 1e-06,
"loss": 0.0424,
"num_tokens": 7793588.0,
"reward": 0.14900264143943787,
"reward_std": 0.06498396396636963,
"rewards/bleu_reward_func/mean": 0.14900264143943787,
"rewards/bleu_reward_func/std": 0.13959822058677673,
"step": 591
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 213.96875,
"completions/mean_terminated_length": 145.1923065185547,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.4736,
"grad_norm": 40.333492279052734,
"kl": 0.1762237548828125,
"learning_rate": 1e-06,
"loss": 0.034,
"num_tokens": 7809987.0,
"reward": 0.11991982161998749,
"reward_std": 0.024838652461767197,
"rewards/bleu_reward_func/mean": 0.11991982161998749,
"rewards/bleu_reward_func/std": 0.13350419700145721,
"step": 592
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 349.46875,
"completions/mean_terminated_length": 295.29168701171875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.4744,
"grad_norm": 6.914525032043457,
"kl": 0.02703857421875,
"learning_rate": 1e-06,
"loss": 0.1437,
"num_tokens": 7826610.0,
"reward": 0.02816709131002426,
"reward_std": 0.015584287233650684,
"rewards/bleu_reward_func/mean": 0.02816709131002426,
"rewards/bleu_reward_func/std": 0.027631772682070732,
"step": 593
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 189.28125,
"completions/mean_terminated_length": 178.87095642089844,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4752,
"grad_norm": 26.64930534362793,
"kl": 0.619964599609375,
"learning_rate": 1e-06,
"loss": 0.0753,
"num_tokens": 7834363.0,
"reward": 0.1504502296447754,
"reward_std": 0.061798207461833954,
"rewards/bleu_reward_func/mean": 0.1504502296447754,
"rewards/bleu_reward_func/std": 0.1269664466381073,
"step": 594
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 317.125,
"completions/mean_terminated_length": 200.1999969482422,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.476,
"grad_norm": 8.9805908203125,
"kl": 0.294097900390625,
"learning_rate": 1e-06,
"loss": -0.0704,
"num_tokens": 7849831.0,
"reward": 0.13049980998039246,
"reward_std": 0.02749776840209961,
"rewards/bleu_reward_func/mean": 0.13049980998039246,
"rewards/bleu_reward_func/std": 0.109443299472332,
"step": 595
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 272.125,
"completions/mean_terminated_length": 192.1666717529297,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.4768,
"grad_norm": 4.8097429275512695,
"kl": 0.08526611328125,
"learning_rate": 1e-06,
"loss": -0.1088,
"num_tokens": 7861347.0,
"reward": 0.0661308616399765,
"reward_std": 0.02051004208624363,
"rewards/bleu_reward_func/mean": 0.0661308616399765,
"rewards/bleu_reward_func/std": 0.05708196386694908,
"step": 596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 206.125,
"completions/mean_terminated_length": 120.47999572753906,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4776,
"grad_norm": 7.351487636566162,
"kl": 0.334259033203125,
"learning_rate": 1e-06,
"loss": 0.0519,
"num_tokens": 7871295.0,
"reward": 0.16019710898399353,
"reward_std": 0.03656643629074097,
"rewards/bleu_reward_func/mean": 0.16019710898399353,
"rewards/bleu_reward_func/std": 0.19289268553256989,
"step": 597
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 349.3125,
"completions/mean_terminated_length": 186.625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.4784,
"grad_norm": 4.930379867553711,
"kl": 0.181121826171875,
"learning_rate": 1e-06,
"loss": -0.0202,
"num_tokens": 7887577.0,
"reward": 0.1285662055015564,
"reward_std": 0.03015293926000595,
"rewards/bleu_reward_func/mean": 0.1285662055015564,
"rewards/bleu_reward_func/std": 0.08600351959466934,
"step": 598
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 175.875,
"completions/mean_terminated_length": 153.4666748046875,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.4792,
"grad_norm": 15.110285758972168,
"kl": 0.34112548828125,
"learning_rate": 1e-06,
"loss": -0.1173,
"num_tokens": 7898453.0,
"reward": 0.09940430521965027,
"reward_std": 0.046547506004571915,
"rewards/bleu_reward_func/mean": 0.09940430521965027,
"rewards/bleu_reward_func/std": 0.05020095780491829,
"step": 599
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 334.03125,
"completions/mean_terminated_length": 315.6206970214844,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.48,
"grad_norm": 3.0496630668640137,
"kl": 0.03216552734375,
"learning_rate": 1e-06,
"loss": -0.1382,
"num_tokens": 7911190.0,
"reward": 0.08442967385053635,
"reward_std": 0.027117565274238586,
"rewards/bleu_reward_func/mean": 0.08442967385053635,
"rewards/bleu_reward_func/std": 0.07272256910800934,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 132.125,
"completions/mean_terminated_length": 119.87096405029297,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4808,
"grad_norm": 9.77161693572998,
"kl": 0.3096923828125,
"learning_rate": 1e-06,
"loss": 0.0712,
"num_tokens": 7923114.0,
"reward": 0.19400227069854736,
"reward_std": 0.08562377095222473,
"rewards/bleu_reward_func/mean": 0.19400227069854736,
"rewards/bleu_reward_func/std": 0.18403199315071106,
"step": 601
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 433.0,
"completions/mean_length": 270.53125,
"completions/mean_terminated_length": 105.31578826904297,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.4816,
"grad_norm": 5.418551445007324,
"kl": 0.132049560546875,
"learning_rate": 1e-06,
"loss": 0.057,
"num_tokens": 7937747.0,
"reward": 0.15049128234386444,
"reward_std": 0.024429049342870712,
"rewards/bleu_reward_func/mean": 0.15049128234386444,
"rewards/bleu_reward_func/std": 0.1750853955745697,
"step": 602
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 295.46875,
"completions/mean_terminated_length": 104.4117660522461,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.4824,
"grad_norm": 4.260025501251221,
"kl": 0.074981689453125,
"learning_rate": 1e-06,
"loss": 0.0337,
"num_tokens": 7951266.0,
"reward": 0.0902150496840477,
"reward_std": 0.0313844196498394,
"rewards/bleu_reward_func/mean": 0.0902150496840477,
"rewards/bleu_reward_func/std": 0.09559616446495056,
"step": 603
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 359.625,
"completions/mean_terminated_length": 255.36842346191406,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.4832,
"grad_norm": 3.981938362121582,
"kl": 0.05792236328125,
"learning_rate": 1e-06,
"loss": 0.068,
"num_tokens": 7967510.0,
"reward": 0.15250109136104584,
"reward_std": 0.050509147346019745,
"rewards/bleu_reward_func/mean": 0.15250109136104584,
"rewards/bleu_reward_func/std": 0.2119276374578476,
"step": 604
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 243.625,
"completions/mean_terminated_length": 205.2857208251953,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.484,
"grad_norm": 4.506271839141846,
"kl": 0.133819580078125,
"learning_rate": 1e-06,
"loss": 0.0768,
"num_tokens": 7979194.0,
"reward": 0.049993276596069336,
"reward_std": 0.01375819742679596,
"rewards/bleu_reward_func/mean": 0.049993276596069336,
"rewards/bleu_reward_func/std": 0.019665135070681572,
"step": 605
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 184.90625,
"completions/mean_terminated_length": 124.33333587646484,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4848,
"grad_norm": 13.16498851776123,
"kl": 0.163360595703125,
"learning_rate": 1e-06,
"loss": 0.2145,
"num_tokens": 7992215.0,
"reward": 0.16849525272846222,
"reward_std": 0.041973263025283813,
"rewards/bleu_reward_func/mean": 0.16849525272846222,
"rewards/bleu_reward_func/std": 0.11670318245887756,
"step": 606
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 278.3125,
"completions/mean_terminated_length": 278.3125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.4856,
"grad_norm": 9.305381774902344,
"kl": 0.1671142578125,
"learning_rate": 1e-06,
"loss": 0.0542,
"num_tokens": 8003561.0,
"reward": 0.14806249737739563,
"reward_std": 0.04475884884595871,
"rewards/bleu_reward_func/mean": 0.14806249737739563,
"rewards/bleu_reward_func/std": 0.10317616909742355,
"step": 607
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 392.3125,
"completions/mean_terminated_length": 320.5,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.4864,
"grad_norm": 8.050078392028809,
"kl": 0.0286865234375,
"learning_rate": 1e-06,
"loss": -0.0652,
"num_tokens": 8020771.0,
"reward": 0.09127211570739746,
"reward_std": 0.02751500904560089,
"rewards/bleu_reward_func/mean": 0.09127211570739746,
"rewards/bleu_reward_func/std": 0.04517889395356178,
"step": 608
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 293.0,
"completions/mean_terminated_length": 285.93548583984375,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.4872,
"grad_norm": 9.551247596740723,
"kl": 0.05633544921875,
"learning_rate": 1e-06,
"loss": 0.0629,
"num_tokens": 8033227.0,
"reward": 0.07062816619873047,
"reward_std": 0.032938919961452484,
"rewards/bleu_reward_func/mean": 0.07062816619873047,
"rewards/bleu_reward_func/std": 0.05320809781551361,
"step": 609
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 246.25,
"completions/mean_terminated_length": 184.92308044433594,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.488,
"grad_norm": 28.349098205566406,
"kl": 0.06927490234375,
"learning_rate": 1e-06,
"loss": 0.1921,
"num_tokens": 8048579.0,
"reward": 0.12640823423862457,
"reward_std": 0.028129609301686287,
"rewards/bleu_reward_func/mean": 0.12640823423862457,
"rewards/bleu_reward_func/std": 0.12890547513961792,
"step": 610
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 334.28125,
"completions/mean_terminated_length": 241.1904754638672,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.4888,
"grad_norm": 6.840433597564697,
"kl": 0.054046630859375,
"learning_rate": 1e-06,
"loss": 0.1175,
"num_tokens": 8063660.0,
"reward": 0.06585729867219925,
"reward_std": 0.016829343512654305,
"rewards/bleu_reward_func/mean": 0.06585729867219925,
"rewards/bleu_reward_func/std": 0.027104271575808525,
"step": 611
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 469.4375,
"completions/mean_terminated_length": 443.8999938964844,
"completions/min_length": 359.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.4896,
"grad_norm": 2.304220199584961,
"kl": 0.029449462890625,
"learning_rate": 1e-06,
"loss": -0.0018,
"num_tokens": 8081922.0,
"reward": 0.029903851449489594,
"reward_std": 0.007851570844650269,
"rewards/bleu_reward_func/mean": 0.029903851449489594,
"rewards/bleu_reward_func/std": 0.017835307866334915,
"step": 612
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 197.75,
"completions/mean_terminated_length": 187.61289978027344,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4904,
"grad_norm": 5.66249418258667,
"kl": 0.0855712890625,
"learning_rate": 1e-06,
"loss": 0.1286,
"num_tokens": 8091010.0,
"reward": 0.2724965810775757,
"reward_std": 0.06183997541666031,
"rewards/bleu_reward_func/mean": 0.2724965810775757,
"rewards/bleu_reward_func/std": 0.2708708643913269,
"step": 613
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 328.71875,
"completions/mean_terminated_length": 186.1666717529297,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.4912,
"grad_norm": 3.228691816329956,
"kl": 0.0655670166015625,
"learning_rate": 1e-06,
"loss": 0.0338,
"num_tokens": 8105809.0,
"reward": 0.12389599531888962,
"reward_std": 0.07396578788757324,
"rewards/bleu_reward_func/mean": 0.12389599531888962,
"rewards/bleu_reward_func/std": 0.18483103811740875,
"step": 614
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 317.59375,
"completions/mean_terminated_length": 215.76190185546875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.492,
"grad_norm": 3.5793278217315674,
"kl": 0.044097900390625,
"learning_rate": 1e-06,
"loss": 0.0314,
"num_tokens": 8118620.0,
"reward": 0.08205534517765045,
"reward_std": 0.032849013805389404,
"rewards/bleu_reward_func/mean": 0.08205534517765045,
"rewards/bleu_reward_func/std": 0.05394502356648445,
"step": 615
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 449.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 180.9375,
"completions/mean_terminated_length": 180.9375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.4928,
"grad_norm": 7.083518028259277,
"kl": 0.40167236328125,
"learning_rate": 1e-06,
"loss": -0.1081,
"num_tokens": 8128922.0,
"reward": 0.12701216340065002,
"reward_std": 0.03847620263695717,
"rewards/bleu_reward_func/mean": 0.12701216340065002,
"rewards/bleu_reward_func/std": 0.08405326306819916,
"step": 616
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 214.25,
"completions/mean_terminated_length": 145.53846740722656,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.4936,
"grad_norm": 6.532368183135986,
"kl": 0.239410400390625,
"learning_rate": 1e-06,
"loss": -0.0347,
"num_tokens": 8143162.0,
"reward": 0.11757355183362961,
"reward_std": 0.02820819616317749,
"rewards/bleu_reward_func/mean": 0.11757355183362961,
"rewards/bleu_reward_func/std": 0.10728771984577179,
"step": 617
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 364.375,
"completions/mean_terminated_length": 297.2727355957031,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.4944,
"grad_norm": 2.549912929534912,
"kl": 0.044189453125,
"learning_rate": 1e-06,
"loss": 0.0332,
"num_tokens": 8158198.0,
"reward": 0.04174516722559929,
"reward_std": 0.011650302447378635,
"rewards/bleu_reward_func/mean": 0.04174516722559929,
"rewards/bleu_reward_func/std": 0.03221089020371437,
"step": 618
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 441.0,
"completions/mean_length": 311.84375,
"completions/mean_terminated_length": 207.0,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.4952,
"grad_norm": 3.869034767150879,
"kl": 0.05462646484375,
"learning_rate": 1e-06,
"loss": 0.0031,
"num_tokens": 8171945.0,
"reward": 0.028928130865097046,
"reward_std": 0.012434298172593117,
"rewards/bleu_reward_func/mean": 0.028928130865097046,
"rewards/bleu_reward_func/std": 0.025789210572838783,
"step": 619
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 243.90625,
"completions/mean_terminated_length": 205.60714721679688,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.496,
"grad_norm": 6.46402645111084,
"kl": 0.304168701171875,
"learning_rate": 1e-06,
"loss": 0.0145,
"num_tokens": 8181710.0,
"reward": 0.13130733370780945,
"reward_std": 0.018212325870990753,
"rewards/bleu_reward_func/mean": 0.13130733370780945,
"rewards/bleu_reward_func/std": 0.09850703179836273,
"step": 620
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 242.0,
"completions/mean_terminated_length": 203.42857360839844,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.4968,
"grad_norm": 5.473258018493652,
"kl": 0.16949462890625,
"learning_rate": 1e-06,
"loss": -0.0354,
"num_tokens": 8193118.0,
"reward": 0.2502046227455139,
"reward_std": 0.03522457554936409,
"rewards/bleu_reward_func/mean": 0.2502046227455139,
"rewards/bleu_reward_func/std": 0.2565787732601166,
"step": 621
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 236.71875,
"completions/mean_terminated_length": 144.95834350585938,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.4976,
"grad_norm": 30.692569732666016,
"kl": 0.29986572265625,
"learning_rate": 1e-06,
"loss": -0.0477,
"num_tokens": 8207549.0,
"reward": 0.22225125133991241,
"reward_std": 0.033524345606565475,
"rewards/bleu_reward_func/mean": 0.22225125133991241,
"rewards/bleu_reward_func/std": 0.19432197511196136,
"step": 622
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 120.96875,
"completions/mean_terminated_length": 80.51724243164062,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4984,
"grad_norm": 7.908195495605469,
"kl": 0.380523681640625,
"learning_rate": 1e-06,
"loss": -0.0756,
"num_tokens": 8218324.0,
"reward": 0.23351526260375977,
"reward_std": 0.05452558770775795,
"rewards/bleu_reward_func/mean": 0.23351526260375977,
"rewards/bleu_reward_func/std": 0.1365489512681961,
"step": 623
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 415.0,
"completions/mean_length": 172.5625,
"completions/mean_terminated_length": 137.44827270507812,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.4992,
"grad_norm": 8.18444538116455,
"kl": 0.296051025390625,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 8228742.0,
"reward": 0.14677512645721436,
"reward_std": 0.04820986092090607,
"rewards/bleu_reward_func/mean": 0.14677512645721436,
"rewards/bleu_reward_func/std": 0.15982075035572052,
"step": 624
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 333.65625,
"completions/mean_terminated_length": 263.86956787109375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.5,
"grad_norm": 15.107973098754883,
"kl": 0.160400390625,
"learning_rate": 1e-06,
"loss": 0.0015,
"num_tokens": 8245059.0,
"reward": 0.13298457860946655,
"reward_std": 0.018914809450507164,
"rewards/bleu_reward_func/mean": 0.13298457860946655,
"rewards/bleu_reward_func/std": 0.07686522603034973,
"step": 625
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 299.53125,
"completions/mean_terminated_length": 260.1851806640625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5008,
"grad_norm": 131.54771423339844,
"kl": 0.102081298828125,
"learning_rate": 1e-06,
"loss": 0.1782,
"num_tokens": 8256620.0,
"reward": 0.1039574146270752,
"reward_std": 0.03130800276994705,
"rewards/bleu_reward_func/mean": 0.1039574146270752,
"rewards/bleu_reward_func/std": 0.05177094042301178,
"step": 626
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 202.15625,
"completions/mean_terminated_length": 181.50001525878906,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.5016,
"grad_norm": 8.109158515930176,
"kl": 0.213104248046875,
"learning_rate": 1e-06,
"loss": -0.0456,
"num_tokens": 8266585.0,
"reward": 0.23561137914657593,
"reward_std": 0.03910698741674423,
"rewards/bleu_reward_func/mean": 0.23561137914657593,
"rewards/bleu_reward_func/std": 0.1352909654378891,
"step": 627
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 259.5625,
"completions/mean_terminated_length": 144.8181915283203,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.5024,
"grad_norm": 5.162299633026123,
"kl": 0.147857666015625,
"learning_rate": 1e-06,
"loss": -0.0329,
"num_tokens": 8278827.0,
"reward": 0.1635468751192093,
"reward_std": 0.04077983647584915,
"rewards/bleu_reward_func/mean": 0.1635468751192093,
"rewards/bleu_reward_func/std": 0.1520238220691681,
"step": 628
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 354.84375,
"completions/mean_terminated_length": 302.4583435058594,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.5032,
"grad_norm": 3.0983669757843018,
"kl": 0.0435791015625,
"learning_rate": 1e-06,
"loss": 0.0095,
"num_tokens": 8293502.0,
"reward": 0.05737052857875824,
"reward_std": 0.021961018443107605,
"rewards/bleu_reward_func/mean": 0.05737052857875824,
"rewards/bleu_reward_func/std": 0.03505769371986389,
"step": 629
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 216.25,
"completions/mean_terminated_length": 185.65516662597656,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.504,
"grad_norm": 6.821644306182861,
"kl": 0.0950927734375,
"learning_rate": 1e-06,
"loss": -0.0482,
"num_tokens": 8302182.0,
"reward": 0.07243853062391281,
"reward_std": 0.06683069467544556,
"rewards/bleu_reward_func/mean": 0.07243853062391281,
"rewards/bleu_reward_func/std": 0.10312769562005997,
"step": 630
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 352.34375,
"completions/mean_terminated_length": 329.5357360839844,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5048,
"grad_norm": 3.5717883110046387,
"kl": 0.036041259765625,
"learning_rate": 1e-06,
"loss": 0.1298,
"num_tokens": 8318753.0,
"reward": 0.026654381304979324,
"reward_std": 0.024883870035409927,
"rewards/bleu_reward_func/mean": 0.026654381304979324,
"rewards/bleu_reward_func/std": 0.03104417398571968,
"step": 631
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 183.09375,
"completions/mean_terminated_length": 183.09375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5056,
"grad_norm": 6.945363998413086,
"kl": 0.234039306640625,
"learning_rate": 1e-06,
"loss": 0.0696,
"num_tokens": 8329596.0,
"reward": 0.18802031874656677,
"reward_std": 0.06351514160633087,
"rewards/bleu_reward_func/mean": 0.18802031874656677,
"rewards/bleu_reward_func/std": 0.16961929202079773,
"step": 632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 157.9375,
"completions/mean_terminated_length": 134.33334350585938,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5064,
"grad_norm": 99.53560638427734,
"kl": 0.190399169921875,
"learning_rate": 1e-06,
"loss": 0.0621,
"num_tokens": 8343090.0,
"reward": 0.06075248867273331,
"reward_std": 0.018952492624521255,
"rewards/bleu_reward_func/mean": 0.06075248867273331,
"rewards/bleu_reward_func/std": 0.057455144822597504,
"step": 633
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 291.71875,
"completions/mean_terminated_length": 268.9310302734375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.5072,
"grad_norm": 6.928218364715576,
"kl": 0.155853271484375,
"learning_rate": 1e-06,
"loss": 0.0112,
"num_tokens": 8354305.0,
"reward": 0.15043729543685913,
"reward_std": 0.04871266707777977,
"rewards/bleu_reward_func/mean": 0.15043729543685913,
"rewards/bleu_reward_func/std": 0.17611344158649445,
"step": 634
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 198.5625,
"completions/mean_terminated_length": 177.6666717529297,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.508,
"grad_norm": 4.697340488433838,
"kl": 0.094085693359375,
"learning_rate": 1e-06,
"loss": 0.0737,
"num_tokens": 8364091.0,
"reward": 0.06601180136203766,
"reward_std": 0.02227596938610077,
"rewards/bleu_reward_func/mean": 0.06601180136203766,
"rewards/bleu_reward_func/std": 0.043257758021354675,
"step": 635
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 410.78125,
"completions/mean_terminated_length": 332.0555725097656,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5088,
"grad_norm": 77.46025085449219,
"kl": 0.05377197265625,
"learning_rate": 1e-06,
"loss": -0.1447,
"num_tokens": 8381724.0,
"reward": 0.03671726584434509,
"reward_std": 0.015178699977695942,
"rewards/bleu_reward_func/mean": 0.03671726584434509,
"rewards/bleu_reward_func/std": 0.03225603699684143,
"step": 636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 333.25,
"completions/mean_terminated_length": 154.5,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5096,
"grad_norm": 6.136791706085205,
"kl": 0.124664306640625,
"learning_rate": 1e-06,
"loss": -0.0018,
"num_tokens": 8397476.0,
"reward": 0.16264644265174866,
"reward_std": 0.03679278865456581,
"rewards/bleu_reward_func/mean": 0.16264644265174866,
"rewards/bleu_reward_func/std": 0.16195148229599,
"step": 637
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 221.125,
"completions/mean_terminated_length": 179.57144165039062,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.5104,
"grad_norm": 4.400974750518799,
"kl": 0.058013916015625,
"learning_rate": 1e-06,
"loss": -0.0244,
"num_tokens": 8406544.0,
"reward": 0.059636689722537994,
"reward_std": 0.024229735136032104,
"rewards/bleu_reward_func/mean": 0.059636689722537994,
"rewards/bleu_reward_func/std": 0.04718983918428421,
"step": 638
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 287.09375,
"completions/mean_terminated_length": 235.19232177734375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.5112,
"grad_norm": 10.916999816894531,
"kl": 0.118011474609375,
"learning_rate": 1e-06,
"loss": 0.0766,
"num_tokens": 8422323.0,
"reward": 0.10468322038650513,
"reward_std": 0.021623361855745316,
"rewards/bleu_reward_func/mean": 0.10468322038650513,
"rewards/bleu_reward_func/std": 0.08197237551212311,
"step": 639
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 336.28125,
"completions/mean_terminated_length": 311.1785888671875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.512,
"grad_norm": 2.1528663635253906,
"kl": 0.033172607421875,
"learning_rate": 1e-06,
"loss": -0.0722,
"num_tokens": 8437100.0,
"reward": 0.10049737989902496,
"reward_std": 0.03208357095718384,
"rewards/bleu_reward_func/mean": 0.10049737989902496,
"rewards/bleu_reward_func/std": 0.0739847868680954,
"step": 640
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 361.375,
"completions/mean_terminated_length": 190.6666717529297,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.5128,
"grad_norm": 4.406239986419678,
"kl": 0.163421630859375,
"learning_rate": 1e-06,
"loss": 0.0712,
"num_tokens": 8450184.0,
"reward": 0.06270510703325272,
"reward_std": 0.01890096440911293,
"rewards/bleu_reward_func/mean": 0.06270510703325272,
"rewards/bleu_reward_func/std": 0.04299367591738701,
"step": 641
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 344.75,
"completions/mean_terminated_length": 268.727294921875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.5136,
"grad_norm": 3.8396599292755127,
"kl": 0.21343994140625,
"learning_rate": 1e-06,
"loss": -0.0071,
"num_tokens": 8464952.0,
"reward": 0.12920798361301422,
"reward_std": 0.025508491322398186,
"rewards/bleu_reward_func/mean": 0.12920798361301422,
"rewards/bleu_reward_func/std": 0.11343086510896683,
"step": 642
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 243.0625,
"completions/mean_terminated_length": 204.6428680419922,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5144,
"grad_norm": 9.709904670715332,
"kl": 0.127349853515625,
"learning_rate": 1e-06,
"loss": -0.0569,
"num_tokens": 8481010.0,
"reward": 0.08636625856161118,
"reward_std": 0.023468628525733948,
"rewards/bleu_reward_func/mean": 0.08636625856161118,
"rewards/bleu_reward_func/std": 0.11172276735305786,
"step": 643
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 430.0,
"completions/mean_length": 319.8125,
"completions/mean_terminated_length": 150.23529052734375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.5152,
"grad_norm": 4.022066116333008,
"kl": 0.095428466796875,
"learning_rate": 1e-06,
"loss": 0.0192,
"num_tokens": 8495668.0,
"reward": 0.13581258058547974,
"reward_std": 0.049042053520679474,
"rewards/bleu_reward_func/mean": 0.13581258058547974,
"rewards/bleu_reward_func/std": 0.10865607112646103,
"step": 644
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 200.40625,
"completions/mean_terminated_length": 113.15999603271484,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.516,
"grad_norm": 14.371395111083984,
"kl": 0.188232421875,
"learning_rate": 1e-06,
"loss": 0.1384,
"num_tokens": 8510193.0,
"reward": 0.16988492012023926,
"reward_std": 0.02835988998413086,
"rewards/bleu_reward_func/mean": 0.16988492012023926,
"rewards/bleu_reward_func/std": 0.22432467341423035,
"step": 645
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 350.625,
"completions/mean_terminated_length": 189.25,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.5168,
"grad_norm": 3.603522539138794,
"kl": 0.106658935546875,
"learning_rate": 1e-06,
"loss": 0.0683,
"num_tokens": 8527613.0,
"reward": 0.04950277507305145,
"reward_std": 0.02557562291622162,
"rewards/bleu_reward_func/mean": 0.04950277507305145,
"rewards/bleu_reward_func/std": 0.036064986139535904,
"step": 646
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 240.71875,
"completions/mean_terminated_length": 190.48147583007812,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.5176,
"grad_norm": 2.1311533451080322,
"kl": 0.040771484375,
"learning_rate": 1e-06,
"loss": -0.0675,
"num_tokens": 8540012.0,
"reward": 0.4242175817489624,
"reward_std": 0.05443207919597626,
"rewards/bleu_reward_func/mean": 0.4242175817489624,
"rewards/bleu_reward_func/std": 0.3835957646369934,
"step": 647
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 162.78125,
"completions/mean_terminated_length": 82.19231414794922,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.5184,
"grad_norm": 7.217045783996582,
"kl": 0.238250732421875,
"learning_rate": 1e-06,
"loss": -0.2492,
"num_tokens": 8550285.0,
"reward": 0.15483121573925018,
"reward_std": 0.04074571654200554,
"rewards/bleu_reward_func/mean": 0.15483121573925018,
"rewards/bleu_reward_func/std": 0.1628112941980362,
"step": 648
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 226.03125,
"completions/mean_terminated_length": 196.44827270507812,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5192,
"grad_norm": 9.426239013671875,
"kl": 0.326446533203125,
"learning_rate": 1e-06,
"loss": -0.0078,
"num_tokens": 8562006.0,
"reward": 0.22631683945655823,
"reward_std": 0.046764910221099854,
"rewards/bleu_reward_func/mean": 0.22631683945655823,
"rewards/bleu_reward_func/std": 0.24870522320270538,
"step": 649
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 408.4375,
"completions/mean_terminated_length": 373.91668701171875,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.52,
"grad_norm": 2.5924437046051025,
"kl": 0.03192138671875,
"learning_rate": 1e-06,
"loss": -0.0517,
"num_tokens": 8577388.0,
"reward": 0.050332337617874146,
"reward_std": 0.013445645570755005,
"rewards/bleu_reward_func/mean": 0.050332337617874146,
"rewards/bleu_reward_func/std": 0.04263650253415108,
"step": 650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 323.65625,
"completions/mean_terminated_length": 296.75,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.5208,
"grad_norm": 6.97017765045166,
"kl": 0.062255859375,
"learning_rate": 1e-06,
"loss": 0.075,
"num_tokens": 8589889.0,
"reward": 0.0671861320734024,
"reward_std": 0.020000552758574486,
"rewards/bleu_reward_func/mean": 0.0671861320734024,
"rewards/bleu_reward_func/std": 0.027637863531708717,
"step": 651
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 202.6875,
"completions/mean_terminated_length": 170.6896514892578,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5216,
"grad_norm": 5.9939727783203125,
"kl": 0.30169677734375,
"learning_rate": 1e-06,
"loss": 0.0126,
"num_tokens": 8603471.0,
"reward": 0.23086336255073547,
"reward_std": 0.03887036070227623,
"rewards/bleu_reward_func/mean": 0.23086336255073547,
"rewards/bleu_reward_func/std": 0.1954699456691742,
"step": 652
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 334.09375,
"completions/mean_terminated_length": 264.478271484375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.5224,
"grad_norm": 10.721747398376465,
"kl": 0.11480712890625,
"learning_rate": 1e-06,
"loss": -0.0181,
"num_tokens": 8618802.0,
"reward": 0.12768197059631348,
"reward_std": 0.018044453114271164,
"rewards/bleu_reward_func/mean": 0.12768197059631348,
"rewards/bleu_reward_func/std": 0.18208470940589905,
"step": 653
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 254.28125,
"completions/mean_terminated_length": 182.1199951171875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.5232,
"grad_norm": 21.832870483398438,
"kl": 0.224090576171875,
"learning_rate": 1e-06,
"loss": 0.0275,
"num_tokens": 8633395.0,
"reward": 0.23750805854797363,
"reward_std": 0.10584703087806702,
"rewards/bleu_reward_func/mean": 0.23750805854797363,
"rewards/bleu_reward_func/std": 0.24472850561141968,
"step": 654
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 293.375,
"completions/mean_terminated_length": 207.8260955810547,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.524,
"grad_norm": 14.025049209594727,
"kl": 0.1319580078125,
"learning_rate": 1e-06,
"loss": 0.2269,
"num_tokens": 8647583.0,
"reward": 0.08922699838876724,
"reward_std": 0.022407300770282745,
"rewards/bleu_reward_func/mean": 0.08922699838876724,
"rewards/bleu_reward_func/std": 0.05691966786980629,
"step": 655
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 417.34375,
"completions/mean_terminated_length": 374.3182067871094,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.5248,
"grad_norm": 2.506775379180908,
"kl": 0.030517578125,
"learning_rate": 1e-06,
"loss": -0.1182,
"num_tokens": 8664458.0,
"reward": 0.056899260729551315,
"reward_std": 0.024433575570583344,
"rewards/bleu_reward_func/mean": 0.056899260729551315,
"rewards/bleu_reward_func/std": 0.043169718235731125,
"step": 656
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 251.78125,
"completions/mean_terminated_length": 214.60714721679688,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5256,
"grad_norm": 7.267916202545166,
"kl": 0.123870849609375,
"learning_rate": 1e-06,
"loss": 0.0057,
"num_tokens": 8673803.0,
"reward": 0.10382385551929474,
"reward_std": 0.051886267960071564,
"rewards/bleu_reward_func/mean": 0.10382385551929474,
"rewards/bleu_reward_func/std": 0.06761174649000168,
"step": 657
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 288.0,
"completions/mean_length": 227.34375,
"completions/mean_terminated_length": 132.45834350585938,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.5264,
"grad_norm": 5.546152114868164,
"kl": 0.098236083984375,
"learning_rate": 1e-06,
"loss": -0.0623,
"num_tokens": 8685270.0,
"reward": 0.1565043181180954,
"reward_std": 0.08428065478801727,
"rewards/bleu_reward_func/mean": 0.1565043181180954,
"rewards/bleu_reward_func/std": 0.17227834463119507,
"step": 658
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 414.0,
"completions/mean_length": 152.96875,
"completions/mean_terminated_length": 86.48148345947266,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.5272,
"grad_norm": 14.263039588928223,
"kl": 0.30914306640625,
"learning_rate": 1e-06,
"loss": 0.0075,
"num_tokens": 8696005.0,
"reward": 0.24965888261795044,
"reward_std": 0.051375266164541245,
"rewards/bleu_reward_func/mean": 0.24965888261795044,
"rewards/bleu_reward_func/std": 0.21870571374893188,
"step": 659
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 165.0,
"completions/max_terminated_length": 165.0,
"completions/mean_length": 32.5625,
"completions/mean_terminated_length": 32.5625,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.528,
"grad_norm": 442.6045837402344,
"kl": 0.5015869140625,
"learning_rate": 1e-06,
"loss": 0.0936,
"num_tokens": 8704767.0,
"reward": 0.13235034048557281,
"reward_std": 0.07672514766454697,
"rewards/bleu_reward_func/mean": 0.13235034048557281,
"rewards/bleu_reward_func/std": 0.13803941011428833,
"step": 660
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 431.0,
"completions/mean_length": 252.4375,
"completions/mean_terminated_length": 74.84210205078125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5288,
"grad_norm": 5.065097332000732,
"kl": 0.18255615234375,
"learning_rate": 1e-06,
"loss": 0.0464,
"num_tokens": 8721005.0,
"reward": 0.2883501648902893,
"reward_std": 0.022871889173984528,
"rewards/bleu_reward_func/mean": 0.2883501648902893,
"rewards/bleu_reward_func/std": 0.23920658230781555,
"step": 661
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 178.625,
"completions/mean_terminated_length": 144.13792419433594,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5296,
"grad_norm": 4.1915202140808105,
"kl": 0.152069091796875,
"learning_rate": 1e-06,
"loss": 0.1483,
"num_tokens": 8729545.0,
"reward": 0.09845434874296188,
"reward_std": 0.049190133810043335,
"rewards/bleu_reward_func/mean": 0.09845434874296188,
"rewards/bleu_reward_func/std": 0.06372099369764328,
"step": 662
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 293.9375,
"completions/mean_terminated_length": 279.4000244140625,
"completions/min_length": 90.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.5304,
"grad_norm": 10.675251007080078,
"kl": 0.0498046875,
"learning_rate": 1e-06,
"loss": 0.1798,
"num_tokens": 8741895.0,
"reward": 0.09067553281784058,
"reward_std": 0.036186374723911285,
"rewards/bleu_reward_func/mean": 0.09067553281784058,
"rewards/bleu_reward_func/std": 0.057389046996831894,
"step": 663
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 157.75,
"completions/mean_terminated_length": 146.32257080078125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.5312,
"grad_norm": 42.93239212036133,
"kl": 0.20751953125,
"learning_rate": 1e-06,
"loss": 0.0697,
"num_tokens": 8752255.0,
"reward": 0.27352431416511536,
"reward_std": 0.07531043887138367,
"rewards/bleu_reward_func/mean": 0.27352431416511536,
"rewards/bleu_reward_func/std": 0.13157765567302704,
"step": 664
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 285.625,
"completions/mean_terminated_length": 222.239990234375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.532,
"grad_norm": 5.36870813369751,
"kl": 0.075042724609375,
"learning_rate": 1e-06,
"loss": -0.0052,
"num_tokens": 8764259.0,
"reward": 0.024750784039497375,
"reward_std": 0.022341227158904076,
"rewards/bleu_reward_func/mean": 0.024750784039497375,
"rewards/bleu_reward_func/std": 0.03164950758218765,
"step": 665
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 498.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 231.0,
"completions/mean_terminated_length": 231.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.5328,
"grad_norm": 5.825709342956543,
"kl": 0.2059326171875,
"learning_rate": 1e-06,
"loss": -0.126,
"num_tokens": 8776043.0,
"reward": 0.09888751804828644,
"reward_std": 0.027325943112373352,
"rewards/bleu_reward_func/mean": 0.09888751804828644,
"rewards/bleu_reward_func/std": 0.06260307133197784,
"step": 666
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 154.09375,
"completions/mean_terminated_length": 71.5,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5336,
"grad_norm": 13.691543579101562,
"kl": 0.3173828125,
"learning_rate": 1e-06,
"loss": -0.0016,
"num_tokens": 8787118.0,
"reward": 0.16891013085842133,
"reward_std": 0.033737391233444214,
"rewards/bleu_reward_func/mean": 0.16891013085842133,
"rewards/bleu_reward_func/std": 0.1783466339111328,
"step": 667
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 258.71875,
"completions/mean_terminated_length": 174.2916717529297,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.5344,
"grad_norm": 5.329594612121582,
"kl": 0.25360107421875,
"learning_rate": 1e-06,
"loss": -0.0877,
"num_tokens": 8801157.0,
"reward": 0.071600541472435,
"reward_std": 0.021211300045251846,
"rewards/bleu_reward_func/mean": 0.071600541472435,
"rewards/bleu_reward_func/std": 0.054277434945106506,
"step": 668
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 300.71875,
"completions/mean_terminated_length": 136.38888549804688,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.5352,
"grad_norm": 10.509814262390137,
"kl": 0.1150054931640625,
"learning_rate": 1e-06,
"loss": 0.0738,
"num_tokens": 8815244.0,
"reward": 0.09365338832139969,
"reward_std": 0.023113342002034187,
"rewards/bleu_reward_func/mean": 0.09365338832139969,
"rewards/bleu_reward_func/std": 0.0734696164727211,
"step": 669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 253.9375,
"completions/mean_terminated_length": 245.61289978027344,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.536,
"grad_norm": 23.27628517150879,
"kl": 0.17572021484375,
"learning_rate": 1e-06,
"loss": -0.1243,
"num_tokens": 8826930.0,
"reward": 0.10506822168827057,
"reward_std": 0.023658432066440582,
"rewards/bleu_reward_func/mean": 0.10506822168827057,
"rewards/bleu_reward_func/std": 0.06695646047592163,
"step": 670
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 447.71875,
"completions/mean_terminated_length": 374.86669921875,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.5368,
"grad_norm": 3.2729108333587646,
"kl": 0.028778076171875,
"learning_rate": 1e-06,
"loss": -0.0068,
"num_tokens": 8843889.0,
"reward": 0.025088129565119743,
"reward_std": 0.00651167519390583,
"rewards/bleu_reward_func/mean": 0.025088129565119743,
"rewards/bleu_reward_func/std": 0.02992870658636093,
"step": 671
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 220.0,
"completions/mean_terminated_length": 210.5806427001953,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.5376,
"grad_norm": 245.07106018066406,
"kl": 0.28985595703125,
"learning_rate": 1e-06,
"loss": 0.0867,
"num_tokens": 8855401.0,
"reward": 0.09645279496908188,
"reward_std": 0.0731353610754013,
"rewards/bleu_reward_func/mean": 0.09645279496908188,
"rewards/bleu_reward_func/std": 0.09792789071798325,
"step": 672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 257.0625,
"completions/mean_terminated_length": 230.6896514892578,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.5384,
"grad_norm": 4.84860897064209,
"kl": 0.12005615234375,
"learning_rate": 1e-06,
"loss": -0.1854,
"num_tokens": 8867371.0,
"reward": 0.11370459198951721,
"reward_std": 0.06978605687618256,
"rewards/bleu_reward_func/mean": 0.11370459198951721,
"rewards/bleu_reward_func/std": 0.18471869826316833,
"step": 673
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 237.78125,
"completions/mean_terminated_length": 161.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5392,
"grad_norm": 33.883243560791016,
"kl": 0.1602783203125,
"learning_rate": 1e-06,
"loss": -0.0528,
"num_tokens": 8882588.0,
"reward": 0.20929288864135742,
"reward_std": 0.04879160225391388,
"rewards/bleu_reward_func/mean": 0.20929288864135742,
"rewards/bleu_reward_func/std": 0.17186923325061798,
"step": 674
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 274.5,
"completions/mean_terminated_length": 240.57144165039062,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.54,
"grad_norm": 6.54965877532959,
"kl": 0.08441162109375,
"learning_rate": 1e-06,
"loss": -0.037,
"num_tokens": 8894508.0,
"reward": 0.04714702442288399,
"reward_std": 0.010213707573711872,
"rewards/bleu_reward_func/mean": 0.04714702442288399,
"rewards/bleu_reward_func/std": 0.04436042159795761,
"step": 675
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 183.375,
"completions/mean_terminated_length": 91.36000061035156,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5408,
"grad_norm": 11.63567066192627,
"kl": 0.16424560546875,
"learning_rate": 1e-06,
"loss": -0.0095,
"num_tokens": 8902704.0,
"reward": 0.18900102376937866,
"reward_std": 0.045921262353658676,
"rewards/bleu_reward_func/mean": 0.18900102376937866,
"rewards/bleu_reward_func/std": 0.25213196873664856,
"step": 676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 181.96875,
"completions/mean_terminated_length": 105.80769348144531,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5416,
"grad_norm": 180.27212524414062,
"kl": 0.203125,
"learning_rate": 1e-06,
"loss": 0.5786,
"num_tokens": 8917647.0,
"reward": 0.11598189175128937,
"reward_std": 0.0453701987862587,
"rewards/bleu_reward_func/mean": 0.11598189175128937,
"rewards/bleu_reward_func/std": 0.12164945900440216,
"step": 677
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 258.0,
"completions/mean_length": 187.59375,
"completions/mean_terminated_length": 79.45833587646484,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.5424,
"grad_norm": 13.031902313232422,
"kl": 0.232452392578125,
"learning_rate": 1e-06,
"loss": 0.1064,
"num_tokens": 8927434.0,
"reward": 0.2563853859901428,
"reward_std": 0.021821634843945503,
"rewards/bleu_reward_func/mean": 0.2563853859901428,
"rewards/bleu_reward_func/std": 0.25126466155052185,
"step": 678
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 157.53125,
"completions/mean_terminated_length": 133.90000915527344,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5432,
"grad_norm": 37.64163589477539,
"kl": 0.17242431640625,
"learning_rate": 1e-06,
"loss": -0.1013,
"num_tokens": 8939683.0,
"reward": 0.22671283781528473,
"reward_std": 0.05255472660064697,
"rewards/bleu_reward_func/mean": 0.22671283781528473,
"rewards/bleu_reward_func/std": 0.22751960158348083,
"step": 679
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 263.71875,
"completions/mean_terminated_length": 238.03448486328125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.544,
"grad_norm": 129.8694610595703,
"kl": 0.1982421875,
"learning_rate": 1e-06,
"loss": 0.0067,
"num_tokens": 8949442.0,
"reward": 0.08898493647575378,
"reward_std": 0.0456019788980484,
"rewards/bleu_reward_func/mean": 0.08898493647575378,
"rewards/bleu_reward_func/std": 0.10032162815332413,
"step": 680
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 510.1875,
"completions/mean_terminated_length": 483.0,
"completions/min_length": 457.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.5448,
"grad_norm": 16.652488708496094,
"kl": 0.063751220703125,
"learning_rate": 1e-06,
"loss": 0.0038,
"num_tokens": 8968384.0,
"reward": 0.04759781062602997,
"reward_std": 0.009598957374691963,
"rewards/bleu_reward_func/mean": 0.04759781062602997,
"rewards/bleu_reward_func/std": 0.050501517951488495,
"step": 681
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 374.875,
"completions/mean_terminated_length": 253.88235473632812,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.5456,
"grad_norm": 130.1652069091797,
"kl": 0.032989501953125,
"learning_rate": 1e-06,
"loss": -0.213,
"num_tokens": 8984364.0,
"reward": 0.07015404105186462,
"reward_std": 0.037277355790138245,
"rewards/bleu_reward_func/mean": 0.07015404105186462,
"rewards/bleu_reward_func/std": 0.10696472972631454,
"step": 682
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 221.59375,
"completions/mean_terminated_length": 221.59375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5464,
"grad_norm": 15.168272018432617,
"kl": 0.25604248046875,
"learning_rate": 1e-06,
"loss": 0.0072,
"num_tokens": 8993615.0,
"reward": 0.05222689360380173,
"reward_std": 0.015750503167510033,
"rewards/bleu_reward_func/mean": 0.05222689360380173,
"rewards/bleu_reward_func/std": 0.03590291365981102,
"step": 683
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 231.0625,
"completions/mean_terminated_length": 121.13043975830078,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.5472,
"grad_norm": 35.623695373535156,
"kl": 0.1175537109375,
"learning_rate": 1e-06,
"loss": -0.0464,
"num_tokens": 9004377.0,
"reward": 0.04063406586647034,
"reward_std": 0.028225397691130638,
"rewards/bleu_reward_func/mean": 0.04063406586647034,
"rewards/bleu_reward_func/std": 0.05525263398885727,
"step": 684
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 361.78125,
"completions/mean_terminated_length": 259.0,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.548,
"grad_norm": 593.3710327148438,
"kl": 0.0638427734375,
"learning_rate": 1e-06,
"loss": 0.1608,
"num_tokens": 9020330.0,
"reward": 0.1717539131641388,
"reward_std": 0.0751393511891365,
"rewards/bleu_reward_func/mean": 0.1717539131641388,
"rewards/bleu_reward_func/std": 0.25347769260406494,
"step": 685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 462.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 175.59375,
"completions/mean_terminated_length": 175.59375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.5488,
"grad_norm": 7.423861980438232,
"kl": 0.170654296875,
"learning_rate": 1e-06,
"loss": -0.0565,
"num_tokens": 9028509.0,
"reward": 0.091148242354393,
"reward_std": 0.017926650121808052,
"rewards/bleu_reward_func/mean": 0.091148242354393,
"rewards/bleu_reward_func/std": 0.07815965265035629,
"step": 686
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 455.15625,
"completions/mean_terminated_length": 398.3125,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.5496,
"grad_norm": 2.612523317337036,
"kl": 0.033416748046875,
"learning_rate": 1e-06,
"loss": -0.0994,
"num_tokens": 9046090.0,
"reward": 0.04202251136302948,
"reward_std": 0.015885071828961372,
"rewards/bleu_reward_func/mean": 0.04202251136302948,
"rewards/bleu_reward_func/std": 0.03677666559815407,
"step": 687
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 448.9375,
"completions/mean_terminated_length": 399.8888854980469,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.5504,
"grad_norm": 14.944954872131348,
"kl": 0.056488037109375,
"learning_rate": 1e-06,
"loss": 0.1249,
"num_tokens": 9064360.0,
"reward": 0.03943703696131706,
"reward_std": 0.024654783308506012,
"rewards/bleu_reward_func/mean": 0.03943703696131706,
"rewards/bleu_reward_func/std": 0.03771531209349632,
"step": 688
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 165.3125,
"completions/mean_terminated_length": 85.30769348144531,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.5512,
"grad_norm": 66.34259033203125,
"kl": 0.408477783203125,
"learning_rate": 1e-06,
"loss": -0.1375,
"num_tokens": 9078698.0,
"reward": 0.20156420767307281,
"reward_std": 0.07818345725536346,
"rewards/bleu_reward_func/mean": 0.20156420767307281,
"rewards/bleu_reward_func/std": 0.23512743413448334,
"step": 689
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 224.0,
"completions/mean_terminated_length": 170.6666717529297,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.552,
"grad_norm": 45.66501998901367,
"kl": 1.2672119140625,
"learning_rate": 1e-06,
"loss": 0.3161,
"num_tokens": 9087402.0,
"reward": 0.12671013176441193,
"reward_std": 0.03653056174516678,
"rewards/bleu_reward_func/mean": 0.12671013176441193,
"rewards/bleu_reward_func/std": 0.0971146747469902,
"step": 690
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 237.125,
"completions/mean_terminated_length": 93.14286041259766,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.5528,
"grad_norm": 497.3811950683594,
"kl": 0.163970947265625,
"learning_rate": 1e-06,
"loss": 0.069,
"num_tokens": 9099254.0,
"reward": 0.0354698970913887,
"reward_std": 0.02819395810365677,
"rewards/bleu_reward_func/mean": 0.0354698970913887,
"rewards/bleu_reward_func/std": 0.030991537496447563,
"step": 691
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 287.65625,
"completions/mean_terminated_length": 246.11111450195312,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.5536,
"grad_norm": 6.150320053100586,
"kl": 0.086273193359375,
"learning_rate": 1e-06,
"loss": 0.1272,
"num_tokens": 9110347.0,
"reward": 0.06792090833187103,
"reward_std": 0.02885974571108818,
"rewards/bleu_reward_func/mean": 0.06792090833187103,
"rewards/bleu_reward_func/std": 0.06222621724009514,
"step": 692
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 244.84375,
"completions/mean_terminated_length": 140.30435180664062,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5544,
"grad_norm": 36.0806999206543,
"kl": 0.23052978515625,
"learning_rate": 1e-06,
"loss": 0.0802,
"num_tokens": 9121382.0,
"reward": 0.09483693540096283,
"reward_std": 0.05147245526313782,
"rewards/bleu_reward_func/mean": 0.09483693540096283,
"rewards/bleu_reward_func/std": 0.08640998601913452,
"step": 693
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 155.0,
"completions/mean_terminated_length": 131.20001220703125,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.5552,
"grad_norm": 55.443695068359375,
"kl": 0.318450927734375,
"learning_rate": 1e-06,
"loss": -0.0617,
"num_tokens": 9132678.0,
"reward": 0.10359849035739899,
"reward_std": 0.07937172800302505,
"rewards/bleu_reward_func/mean": 0.10359849035739899,
"rewards/bleu_reward_func/std": 0.13979652523994446,
"step": 694
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 305.0,
"completions/mean_length": 236.65625,
"completions/mean_terminated_length": 71.45000457763672,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.556,
"grad_norm": 146.36070251464844,
"kl": 0.099609375,
"learning_rate": 1e-06,
"loss": 0.1483,
"num_tokens": 9145227.0,
"reward": 0.17079538106918335,
"reward_std": 0.05914284288883209,
"rewards/bleu_reward_func/mean": 0.17079538106918335,
"rewards/bleu_reward_func/std": 0.23325958847999573,
"step": 695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 228.5625,
"completions/mean_terminated_length": 134.08334350585938,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5568,
"grad_norm": 298.8739013671875,
"kl": 0.1568603515625,
"learning_rate": 1e-06,
"loss": -0.1223,
"num_tokens": 9160869.0,
"reward": 0.037290386855602264,
"reward_std": 0.014398223720490932,
"rewards/bleu_reward_func/mean": 0.037290386855602264,
"rewards/bleu_reward_func/std": 0.03690984100103378,
"step": 696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 229.3125,
"completions/mean_terminated_length": 164.07693481445312,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.5576,
"grad_norm": 220.50381469726562,
"kl": 0.365478515625,
"learning_rate": 1e-06,
"loss": 0.0947,
"num_tokens": 9170959.0,
"reward": 0.11910027265548706,
"reward_std": 0.04049726575613022,
"rewards/bleu_reward_func/mean": 0.11910027265548706,
"rewards/bleu_reward_func/std": 0.14281374216079712,
"step": 697
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 216.15625,
"completions/mean_terminated_length": 161.37037658691406,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.5584,
"grad_norm": 11.570438385009766,
"kl": 0.205352783203125,
"learning_rate": 1e-06,
"loss": 0.0257,
"num_tokens": 9181980.0,
"reward": 0.08099336922168732,
"reward_std": 0.04509742930531502,
"rewards/bleu_reward_func/mean": 0.08099336922168732,
"rewards/bleu_reward_func/std": 0.10288692265748978,
"step": 698
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 349.96875,
"completions/mean_terminated_length": 304.6000061035156,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.5592,
"grad_norm": 37.448883056640625,
"kl": 0.09320068359375,
"learning_rate": 1e-06,
"loss": 0.0422,
"num_tokens": 9197723.0,
"reward": 0.05653442069888115,
"reward_std": 0.026268266141414642,
"rewards/bleu_reward_func/mean": 0.05653442069888115,
"rewards/bleu_reward_func/std": 0.04277388006448746,
"step": 699
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 255.03125,
"completions/mean_terminated_length": 195.73077392578125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.56,
"grad_norm": 340.22479248046875,
"kl": 0.41278076171875,
"learning_rate": 1e-06,
"loss": -0.2392,
"num_tokens": 9210148.0,
"reward": 0.06149371713399887,
"reward_std": 0.023687850683927536,
"rewards/bleu_reward_func/mean": 0.06149371713399887,
"rewards/bleu_reward_func/std": 0.03754807263612747,
"step": 700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 282.84375,
"completions/mean_terminated_length": 145.35000610351562,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5608,
"grad_norm": 404.3658752441406,
"kl": 0.29132080078125,
"learning_rate": 1e-06,
"loss": 0.148,
"num_tokens": 9222815.0,
"reward": 0.05664993077516556,
"reward_std": 0.01803937554359436,
"rewards/bleu_reward_func/mean": 0.05664993077516556,
"rewards/bleu_reward_func/std": 0.02324024587869644,
"step": 701
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 205.0,
"completions/mean_length": 116.40625,
"completions/mean_terminated_length": 59.892860412597656,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.5616,
"grad_norm": 43.716766357421875,
"kl": 0.216552734375,
"learning_rate": 1e-06,
"loss": -0.0484,
"num_tokens": 9231460.0,
"reward": 0.1364556849002838,
"reward_std": 0.09380181133747101,
"rewards/bleu_reward_func/mean": 0.1364556849002838,
"rewards/bleu_reward_func/std": 0.21690192818641663,
"step": 702
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 305.65625,
"completions/mean_terminated_length": 211.8636474609375,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.5624,
"grad_norm": 745.7041625976562,
"kl": 0.14068603515625,
"learning_rate": 1e-06,
"loss": 0.1945,
"num_tokens": 9245265.0,
"reward": 0.04224216938018799,
"reward_std": 0.016487902030348778,
"rewards/bleu_reward_func/mean": 0.04224216938018799,
"rewards/bleu_reward_func/std": 0.025008555501699448,
"step": 703
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 115.0625,
"completions/mean_terminated_length": 102.25806427001953,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.5632,
"grad_norm": 216.922607421875,
"kl": 0.3056640625,
"learning_rate": 1e-06,
"loss": 0.0737,
"num_tokens": 9253899.0,
"reward": 0.08648309111595154,
"reward_std": 0.03777506947517395,
"rewards/bleu_reward_func/mean": 0.08648309111595154,
"rewards/bleu_reward_func/std": 0.05951961874961853,
"step": 704
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 253.71875,
"completions/mean_terminated_length": 98.75,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.564,
"grad_norm": 162.15737915039062,
"kl": 0.1361083984375,
"learning_rate": 1e-06,
"loss": -0.0623,
"num_tokens": 9269314.0,
"reward": 0.2322496771812439,
"reward_std": 0.045732706785202026,
"rewards/bleu_reward_func/mean": 0.2322496771812439,
"rewards/bleu_reward_func/std": 0.25273510813713074,
"step": 705
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 340.0625,
"completions/mean_terminated_length": 272.7826232910156,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.5648,
"grad_norm": 4.662173271179199,
"kl": 0.035003662109375,
"learning_rate": 1e-06,
"loss": 0.0266,
"num_tokens": 9282188.0,
"reward": 0.04237870126962662,
"reward_std": 0.01780301332473755,
"rewards/bleu_reward_func/mean": 0.04237870126962662,
"rewards/bleu_reward_func/std": 0.04967799782752991,
"step": 706
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 349.6875,
"completions/mean_terminated_length": 275.9090881347656,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.5656,
"grad_norm": 2.4642882347106934,
"kl": 0.04644775390625,
"learning_rate": 1e-06,
"loss": 0.0446,
"num_tokens": 9298362.0,
"reward": 0.0549091175198555,
"reward_std": 0.03525715321302414,
"rewards/bleu_reward_func/mean": 0.0549091175198555,
"rewards/bleu_reward_func/std": 0.051221489906311035,
"step": 707
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 249.875,
"completions/mean_terminated_length": 232.40000915527344,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.5664,
"grad_norm": 11.45730972290039,
"kl": 0.16558837890625,
"learning_rate": 1e-06,
"loss": 0.0823,
"num_tokens": 9308934.0,
"reward": 0.10365931689739227,
"reward_std": 0.028398117050528526,
"rewards/bleu_reward_func/mean": 0.10365931689739227,
"rewards/bleu_reward_func/std": 0.06428122520446777,
"step": 708
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 254.0625,
"completions/mean_terminated_length": 99.30000305175781,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5672,
"grad_norm": 7.227019786834717,
"kl": 0.154449462890625,
"learning_rate": 1e-06,
"loss": 0.0107,
"num_tokens": 9322640.0,
"reward": 0.18866762518882751,
"reward_std": 0.044271718710660934,
"rewards/bleu_reward_func/mean": 0.18866762518882751,
"rewards/bleu_reward_func/std": 0.1287185698747635,
"step": 709
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 304.96875,
"completions/mean_terminated_length": 283.5517272949219,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.568,
"grad_norm": 5.722259998321533,
"kl": 0.07122802734375,
"learning_rate": 1e-06,
"loss": -0.1128,
"num_tokens": 9337639.0,
"reward": 0.0695868507027626,
"reward_std": 0.022387558594346046,
"rewards/bleu_reward_func/mean": 0.0695868507027626,
"rewards/bleu_reward_func/std": 0.065777987241745,
"step": 710
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 150.09375,
"completions/mean_terminated_length": 125.9666748046875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.5688,
"grad_norm": 5.365989685058594,
"kl": 0.0738525390625,
"learning_rate": 1e-06,
"loss": 0.3885,
"num_tokens": 9346522.0,
"reward": 0.2348867505788803,
"reward_std": 0.09850712865591049,
"rewards/bleu_reward_func/mean": 0.2348867505788803,
"rewards/bleu_reward_func/std": 0.302653044462204,
"step": 711
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 342.65625,
"completions/mean_terminated_length": 265.68182373046875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5696,
"grad_norm": 7.032015800476074,
"kl": 0.060089111328125,
"learning_rate": 1e-06,
"loss": -0.0338,
"num_tokens": 9359935.0,
"reward": 0.053069278597831726,
"reward_std": 0.015607406385242939,
"rewards/bleu_reward_func/mean": 0.053069278597831726,
"rewards/bleu_reward_func/std": 0.0380670465528965,
"step": 712
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 372.0,
"completions/mean_length": 166.46875,
"completions/mean_terminated_length": 69.72000122070312,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.5704,
"grad_norm": 7.543166160583496,
"kl": 0.26068115234375,
"learning_rate": 1e-06,
"loss": 0.3241,
"num_tokens": 9367886.0,
"reward": 0.17086198925971985,
"reward_std": 0.059704020619392395,
"rewards/bleu_reward_func/mean": 0.17086198925971985,
"rewards/bleu_reward_func/std": 0.13924144208431244,
"step": 713
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 306.3125,
"completions/mean_terminated_length": 248.72000122070312,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.5712,
"grad_norm": 3.74664568901062,
"kl": 0.086090087890625,
"learning_rate": 1e-06,
"loss": -0.2145,
"num_tokens": 9381256.0,
"reward": 0.026702141389250755,
"reward_std": 0.010159555822610855,
"rewards/bleu_reward_func/mean": 0.026702141389250755,
"rewards/bleu_reward_func/std": 0.025492098182439804,
"step": 714
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 374.0,
"completions/mean_length": 202.40625,
"completions/mean_terminated_length": 145.07408142089844,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.572,
"grad_norm": 3.670616865158081,
"kl": 0.0810394287109375,
"learning_rate": 1e-06,
"loss": 0.3116,
"num_tokens": 9394869.0,
"reward": 0.09095513820648193,
"reward_std": 0.0500517264008522,
"rewards/bleu_reward_func/mean": 0.09095513820648193,
"rewards/bleu_reward_func/std": 0.07213761657476425,
"step": 715
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 221.5625,
"completions/mean_terminated_length": 202.20001220703125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.5728,
"grad_norm": 9.221464157104492,
"kl": 0.10400390625,
"learning_rate": 1e-06,
"loss": 0.3058,
"num_tokens": 9405335.0,
"reward": 0.12374541163444519,
"reward_std": 0.040764160454273224,
"rewards/bleu_reward_func/mean": 0.12374541163444519,
"rewards/bleu_reward_func/std": 0.13386160135269165,
"step": 716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 347.1875,
"completions/mean_terminated_length": 234.42105102539062,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5736,
"grad_norm": 12.991151809692383,
"kl": 0.157958984375,
"learning_rate": 1e-06,
"loss": 0.0424,
"num_tokens": 9420789.0,
"reward": 0.07029742747545242,
"reward_std": 0.012854170054197311,
"rewards/bleu_reward_func/mean": 0.07029742747545242,
"rewards/bleu_reward_func/std": 0.041719451546669006,
"step": 717
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 503.75,
"completions/mean_terminated_length": 474.2857360839844,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.5744,
"grad_norm": 2.107853889465332,
"kl": 0.028350830078125,
"learning_rate": 1e-06,
"loss": -0.0047,
"num_tokens": 9439789.0,
"reward": 0.05256051570177078,
"reward_std": 0.010154004208743572,
"rewards/bleu_reward_func/mean": 0.05256051570177078,
"rewards/bleu_reward_func/std": 0.03522626310586929,
"step": 718
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 244.15625,
"completions/mean_terminated_length": 235.51612854003906,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.5752,
"grad_norm": 3.158909559249878,
"kl": 0.0923919677734375,
"learning_rate": 1e-06,
"loss": -0.1019,
"num_tokens": 9451978.0,
"reward": 0.12841522693634033,
"reward_std": 0.05657704174518585,
"rewards/bleu_reward_func/mean": 0.12841522693634033,
"rewards/bleu_reward_func/std": 0.07523242384195328,
"step": 719
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 150.4375,
"completions/mean_terminated_length": 83.48148345947266,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.576,
"grad_norm": 8.099937438964844,
"kl": 0.214874267578125,
"learning_rate": 1e-06,
"loss": 0.0129,
"num_tokens": 9461736.0,
"reward": 0.08868992328643799,
"reward_std": 0.017071515321731567,
"rewards/bleu_reward_func/mean": 0.08868992328643799,
"rewards/bleu_reward_func/std": 0.08577441424131393,
"step": 720
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 447.65625,
"completions/mean_terminated_length": 383.3125,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"epoch": 0.5768,
"grad_norm": 2.4854142665863037,
"kl": 0.045318603515625,
"learning_rate": 1e-06,
"loss": -0.0504,
"num_tokens": 9477381.0,
"reward": 0.0682106539607048,
"reward_std": 0.022257793694734573,
"rewards/bleu_reward_func/mean": 0.0682106539607048,
"rewards/bleu_reward_func/std": 0.05095710977911949,
"step": 721
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 167.84375,
"completions/mean_terminated_length": 156.74192810058594,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.5776,
"grad_norm": 4.308876037597656,
"kl": 0.07659912109375,
"learning_rate": 1e-06,
"loss": 0.1774,
"num_tokens": 9489008.0,
"reward": 0.0951995924115181,
"reward_std": 0.033833228051662445,
"rewards/bleu_reward_func/mean": 0.0951995924115181,
"rewards/bleu_reward_func/std": 0.0729941874742508,
"step": 722
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 179.34375,
"completions/mean_terminated_length": 144.9310302734375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5784,
"grad_norm": 12.132597923278809,
"kl": 0.33624267578125,
"learning_rate": 1e-06,
"loss": 0.2153,
"num_tokens": 9498059.0,
"reward": 0.10619839280843735,
"reward_std": 0.04761648178100586,
"rewards/bleu_reward_func/mean": 0.10619839280843735,
"rewards/bleu_reward_func/std": 0.0809776559472084,
"step": 723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 374.78125,
"completions/mean_terminated_length": 268.0555725097656,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.5792,
"grad_norm": 7.476963996887207,
"kl": 0.1087646484375,
"learning_rate": 1e-06,
"loss": 0.0108,
"num_tokens": 9517972.0,
"reward": 0.17816489934921265,
"reward_std": 0.016055870801210403,
"rewards/bleu_reward_func/mean": 0.17816489934921265,
"rewards/bleu_reward_func/std": 0.266427606344223,
"step": 724
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 220.125,
"completions/mean_terminated_length": 152.7692413330078,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.58,
"grad_norm": 5.399383544921875,
"kl": 0.13983154296875,
"learning_rate": 1e-06,
"loss": 0.1212,
"num_tokens": 9530760.0,
"reward": 0.07149016857147217,
"reward_std": 0.023819390684366226,
"rewards/bleu_reward_func/mean": 0.07149016857147217,
"rewards/bleu_reward_func/std": 0.053011875599622726,
"step": 725
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 500.3125,
"completions/mean_terminated_length": 418.5,
"completions/min_length": 355.0,
"completions/min_terminated_length": 355.0,
"epoch": 0.5808,
"grad_norm": 2.243359088897705,
"kl": 0.028045654296875,
"learning_rate": 1e-06,
"loss": -0.0175,
"num_tokens": 9550330.0,
"reward": 0.04440176486968994,
"reward_std": 0.00922885537147522,
"rewards/bleu_reward_func/mean": 0.04440176486968994,
"rewards/bleu_reward_func/std": 0.03932040557265282,
"step": 726
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 154.65625,
"completions/mean_terminated_length": 103.60714721679688,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5816,
"grad_norm": 54.60096740722656,
"kl": 0.27252197265625,
"learning_rate": 1e-06,
"loss": 0.1714,
"num_tokens": 9557863.0,
"reward": 0.34987902641296387,
"reward_std": 0.09637948125600815,
"rewards/bleu_reward_func/mean": 0.34987902641296387,
"rewards/bleu_reward_func/std": 0.30998000502586365,
"step": 727
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 394.21875,
"completions/mean_terminated_length": 197.9166717529297,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.5824,
"grad_norm": 3.5557267665863037,
"kl": 0.04107666015625,
"learning_rate": 1e-06,
"loss": -0.2035,
"num_tokens": 9576342.0,
"reward": 0.03708350285887718,
"reward_std": 0.013400746509432793,
"rewards/bleu_reward_func/mean": 0.03708350285887718,
"rewards/bleu_reward_func/std": 0.030460968613624573,
"step": 728
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 287.53125,
"completions/mean_terminated_length": 272.5666809082031,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.5832,
"grad_norm": 5.337757110595703,
"kl": 0.106048583984375,
"learning_rate": 1e-06,
"loss": -0.0648,
"num_tokens": 9587719.0,
"reward": 0.08226186782121658,
"reward_std": 0.016267672181129456,
"rewards/bleu_reward_func/mean": 0.08226186782121658,
"rewards/bleu_reward_func/std": 0.047058336436748505,
"step": 729
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 353.46875,
"completions/mean_terminated_length": 291.4347839355469,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.584,
"grad_norm": 172.38458251953125,
"kl": 0.142059326171875,
"learning_rate": 1e-06,
"loss": -0.0203,
"num_tokens": 9604270.0,
"reward": 0.08777523040771484,
"reward_std": 0.028989041224122047,
"rewards/bleu_reward_func/mean": 0.08777523040771484,
"rewards/bleu_reward_func/std": 0.053535155951976776,
"step": 730
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 123.5625,
"completions/mean_terminated_length": 123.5625,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.5848,
"grad_norm": 14.68234920501709,
"kl": 0.1837158203125,
"learning_rate": 1e-06,
"loss": 0.053,
"num_tokens": 9613376.0,
"reward": 0.09781108796596527,
"reward_std": 0.03509049117565155,
"rewards/bleu_reward_func/mean": 0.09781108796596527,
"rewards/bleu_reward_func/std": 0.07531887292861938,
"step": 731
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 147.53125,
"completions/mean_terminated_length": 63.42308044433594,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.5856,
"grad_norm": 471.59912109375,
"kl": 0.191680908203125,
"learning_rate": 1e-06,
"loss": 0.0896,
"num_tokens": 9624305.0,
"reward": 0.08778894692659378,
"reward_std": 0.025603748857975006,
"rewards/bleu_reward_func/mean": 0.08778894692659378,
"rewards/bleu_reward_func/std": 0.06823020428419113,
"step": 732
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 254.0,
"completions/mean_terminated_length": 194.4615478515625,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.5864,
"grad_norm": 3.397786855697632,
"kl": 0.03460693359375,
"learning_rate": 1e-06,
"loss": -0.0596,
"num_tokens": 9634593.0,
"reward": 0.08798034489154816,
"reward_std": 0.02149152383208275,
"rewards/bleu_reward_func/mean": 0.08798034489154816,
"rewards/bleu_reward_func/std": 0.07060196995735168,
"step": 733
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 300.75,
"completions/mean_terminated_length": 261.629638671875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.5872,
"grad_norm": 11.675055503845215,
"kl": 0.0538330078125,
"learning_rate": 1e-06,
"loss": 0.1186,
"num_tokens": 9647729.0,
"reward": 0.20255795121192932,
"reward_std": 0.044919952750205994,
"rewards/bleu_reward_func/mean": 0.20255795121192932,
"rewards/bleu_reward_func/std": 0.23513151705265045,
"step": 734
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.0,
"completions/max_terminated_length": 381.0,
"completions/mean_length": 100.875,
"completions/mean_terminated_length": 100.875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.588,
"grad_norm": 7.969452381134033,
"kl": 0.2791748046875,
"learning_rate": 1e-06,
"loss": 0.1783,
"num_tokens": 9655565.0,
"reward": 0.1732563078403473,
"reward_std": 0.06255275756120682,
"rewards/bleu_reward_func/mean": 0.1732563078403473,
"rewards/bleu_reward_func/std": 0.14761896431446075,
"step": 735
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 255.78125,
"completions/mean_terminated_length": 196.6538543701172,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.5888,
"grad_norm": 37.51567840576172,
"kl": 0.139739990234375,
"learning_rate": 1e-06,
"loss": 0.0247,
"num_tokens": 9672070.0,
"reward": 0.3068300187587738,
"reward_std": 0.018469596281647682,
"rewards/bleu_reward_func/mean": 0.3068300187587738,
"rewards/bleu_reward_func/std": 0.29021966457366943,
"step": 736
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 367.15625,
"completions/mean_terminated_length": 301.31817626953125,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.5896,
"grad_norm": 2.4598867893218994,
"kl": 0.0330657958984375,
"learning_rate": 1e-06,
"loss": -0.0194,
"num_tokens": 9688219.0,
"reward": 0.05701170861721039,
"reward_std": 0.020281650125980377,
"rewards/bleu_reward_func/mean": 0.05701170861721039,
"rewards/bleu_reward_func/std": 0.055385053157806396,
"step": 737
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 318.46875,
"completions/mean_terminated_length": 230.5,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.5904,
"grad_norm": 4.623442649841309,
"kl": 0.045806884765625,
"learning_rate": 1e-06,
"loss": -0.2136,
"num_tokens": 9701082.0,
"reward": 0.05631488561630249,
"reward_std": 0.022235814481973648,
"rewards/bleu_reward_func/mean": 0.05631488561630249,
"rewards/bleu_reward_func/std": 0.0748782679438591,
"step": 738
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 429.0,
"completions/mean_length": 195.40625,
"completions/mean_terminated_length": 122.34616088867188,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.5912,
"grad_norm": 6.426390171051025,
"kl": 0.21075439453125,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 9712175.0,
"reward": 0.250314861536026,
"reward_std": 0.043683700263500214,
"rewards/bleu_reward_func/mean": 0.250314861536026,
"rewards/bleu_reward_func/std": 0.27451202273368835,
"step": 739
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 314.375,
"completions/mean_terminated_length": 195.8000030517578,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.592,
"grad_norm": 4.511030197143555,
"kl": 0.0513916015625,
"learning_rate": 1e-06,
"loss": 0.0376,
"num_tokens": 9727491.0,
"reward": 0.24225017428398132,
"reward_std": 0.0391615591943264,
"rewards/bleu_reward_func/mean": 0.24225017428398132,
"rewards/bleu_reward_func/std": 0.23075063526630402,
"step": 740
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 291.34375,
"completions/mean_terminated_length": 191.0454559326172,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.5928,
"grad_norm": 5.867874622344971,
"kl": 0.070037841796875,
"learning_rate": 1e-06,
"loss": 0.0656,
"num_tokens": 9740590.0,
"reward": 0.08597154170274734,
"reward_std": 0.053836189210414886,
"rewards/bleu_reward_func/mean": 0.08597154170274734,
"rewards/bleu_reward_func/std": 0.12926995754241943,
"step": 741
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 414.0,
"completions/mean_length": 205.34375,
"completions/mean_terminated_length": 148.55555725097656,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.5936,
"grad_norm": 6.713605880737305,
"kl": 0.12249755859375,
"learning_rate": 1e-06,
"loss": 0.1915,
"num_tokens": 9751033.0,
"reward": 0.19744500517845154,
"reward_std": 0.041014768183231354,
"rewards/bleu_reward_func/mean": 0.19744500517845154,
"rewards/bleu_reward_func/std": 0.1538456529378891,
"step": 742
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 249.46875,
"completions/mean_terminated_length": 200.8518524169922,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.5944,
"grad_norm": 5.988234043121338,
"kl": 0.22430419921875,
"learning_rate": 1e-06,
"loss": 0.0382,
"num_tokens": 9764368.0,
"reward": 0.2869833707809448,
"reward_std": 0.07026369869709015,
"rewards/bleu_reward_func/mean": 0.2869833707809448,
"rewards/bleu_reward_func/std": 0.2287815362215042,
"step": 743
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 265.3125,
"completions/mean_terminated_length": 196.239990234375,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.5952,
"grad_norm": 8.931143760681152,
"kl": 0.234375,
"learning_rate": 1e-06,
"loss": -0.1984,
"num_tokens": 9774882.0,
"reward": 0.07221800833940506,
"reward_std": 0.031748898327350616,
"rewards/bleu_reward_func/mean": 0.07221800833940506,
"rewards/bleu_reward_func/std": 0.06019110977649689,
"step": 744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 410.3125,
"completions/mean_terminated_length": 279.5714416503906,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.596,
"grad_norm": 2.886622190475464,
"kl": 0.031707763671875,
"learning_rate": 1e-06,
"loss": -0.0903,
"num_tokens": 9790372.0,
"reward": 0.030976204201579094,
"reward_std": 0.015047797001898289,
"rewards/bleu_reward_func/mean": 0.030976204201579094,
"rewards/bleu_reward_func/std": 0.033486902713775635,
"step": 745
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 241.84375,
"completions/mean_terminated_length": 166.1999969482422,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.5968,
"grad_norm": 4.601830005645752,
"kl": 0.069915771484375,
"learning_rate": 1e-06,
"loss": 0.0366,
"num_tokens": 9801439.0,
"reward": 0.10595028847455978,
"reward_std": 0.024186890572309494,
"rewards/bleu_reward_func/mean": 0.10595028847455978,
"rewards/bleu_reward_func/std": 0.10705985873937607,
"step": 746
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 311.53125,
"completions/mean_terminated_length": 255.39999389648438,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.5976,
"grad_norm": 2.84419322013855,
"kl": 0.05322265625,
"learning_rate": 1e-06,
"loss": -0.0815,
"num_tokens": 9816096.0,
"reward": 0.02818489633500576,
"reward_std": 0.008401873521506786,
"rewards/bleu_reward_func/mean": 0.02818489633500576,
"rewards/bleu_reward_func/std": 0.02303573302924633,
"step": 747
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 343.0,
"completions/max_terminated_length": 343.0,
"completions/mean_length": 100.78125,
"completions/mean_terminated_length": 100.78125,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.5984,
"grad_norm": 8.280403137207031,
"kl": 0.14306640625,
"learning_rate": 1e-06,
"loss": 0.0722,
"num_tokens": 9821761.0,
"reward": 0.07066097855567932,
"reward_std": 0.026687482371926308,
"rewards/bleu_reward_func/mean": 0.07066097855567932,
"rewards/bleu_reward_func/std": 0.04903886467218399,
"step": 748
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 129.1875,
"completions/mean_terminated_length": 129.1875,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.5992,
"grad_norm": 6.650774955749512,
"kl": 0.1456298828125,
"learning_rate": 1e-06,
"loss": 0.1458,
"num_tokens": 9829943.0,
"reward": 0.19511400163173676,
"reward_std": 0.03503159433603287,
"rewards/bleu_reward_func/mean": 0.19511400163173676,
"rewards/bleu_reward_func/std": 0.21101784706115723,
"step": 749
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 258.1875,
"completions/mean_terminated_length": 105.9000015258789,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.6,
"grad_norm": 5.664450645446777,
"kl": 0.081695556640625,
"learning_rate": 1e-06,
"loss": 0.008,
"num_tokens": 9841813.0,
"reward": 0.20738312602043152,
"reward_std": 0.07332950830459595,
"rewards/bleu_reward_func/mean": 0.20738312602043152,
"rewards/bleu_reward_func/std": 0.2197185456752777,
"step": 750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 160.03125,
"completions/mean_terminated_length": 136.56668090820312,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6008,
"grad_norm": 69.39850616455078,
"kl": 0.24407958984375,
"learning_rate": 1e-06,
"loss": 0.016,
"num_tokens": 9852910.0,
"reward": 0.2434358447790146,
"reward_std": 0.10366295278072357,
"rewards/bleu_reward_func/mean": 0.2434358447790146,
"rewards/bleu_reward_func/std": 0.18655826151371002,
"step": 751
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 316.875,
"completions/mean_terminated_length": 251.83334350585938,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.6016,
"grad_norm": 4.908503532409668,
"kl": 0.0460205078125,
"learning_rate": 1e-06,
"loss": 0.1812,
"num_tokens": 9865266.0,
"reward": 0.05490465834736824,
"reward_std": 0.02047915570437908,
"rewards/bleu_reward_func/mean": 0.05490465834736824,
"rewards/bleu_reward_func/std": 0.04037528112530708,
"step": 752
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 269.09375,
"completions/mean_terminated_length": 252.90000915527344,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.6024,
"grad_norm": 2.893157958984375,
"kl": 0.048187255859375,
"learning_rate": 1e-06,
"loss": -0.0419,
"num_tokens": 9876613.0,
"reward": 0.05797014757990837,
"reward_std": 0.029720589518547058,
"rewards/bleu_reward_func/mean": 0.05797014757990837,
"rewards/bleu_reward_func/std": 0.07483170926570892,
"step": 753
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 434.21875,
"completions/mean_terminated_length": 285.727294921875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.6032,
"grad_norm": 3.0334763526916504,
"kl": 0.042449951171875,
"learning_rate": 1e-06,
"loss": -0.167,
"num_tokens": 9894044.0,
"reward": 0.0762338861823082,
"reward_std": 0.02360478974878788,
"rewards/bleu_reward_func/mean": 0.0762338861823082,
"rewards/bleu_reward_func/std": 0.0673457533121109,
"step": 754
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 423.21875,
"completions/mean_terminated_length": 309.0714416503906,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.604,
"grad_norm": 2.2641522884368896,
"kl": 0.03399658203125,
"learning_rate": 1e-06,
"loss": -0.0917,
"num_tokens": 9911107.0,
"reward": 0.03881996497511864,
"reward_std": 0.012424922548234463,
"rewards/bleu_reward_func/mean": 0.03881996497511864,
"rewards/bleu_reward_func/std": 0.02825937233865261,
"step": 755
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 431.0,
"completions/mean_length": 196.53125,
"completions/mean_terminated_length": 108.19999694824219,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6048,
"grad_norm": 9.265530586242676,
"kl": 0.2891845703125,
"learning_rate": 1e-06,
"loss": 0.1189,
"num_tokens": 9924868.0,
"reward": 0.2168477475643158,
"reward_std": 0.07323689758777618,
"rewards/bleu_reward_func/mean": 0.2168477475643158,
"rewards/bleu_reward_func/std": 0.17768503725528717,
"step": 756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 163.53125,
"completions/mean_terminated_length": 152.29031372070312,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6056,
"grad_norm": 44.89513397216797,
"kl": 0.31927490234375,
"learning_rate": 1e-06,
"loss": 0.0321,
"num_tokens": 9935765.0,
"reward": 0.08691335469484329,
"reward_std": 0.03311008960008621,
"rewards/bleu_reward_func/mean": 0.08691335469484329,
"rewards/bleu_reward_func/std": 0.08314234763383865,
"step": 757
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 425.0,
"completions/max_terminated_length": 425.0,
"completions/mean_length": 97.375,
"completions/mean_terminated_length": 97.375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6064,
"grad_norm": 8.500920295715332,
"kl": 0.15838623046875,
"learning_rate": 1e-06,
"loss": 0.1392,
"num_tokens": 9944529.0,
"reward": 0.25747808814048767,
"reward_std": 0.048998236656188965,
"rewards/bleu_reward_func/mean": 0.25747808814048767,
"rewards/bleu_reward_func/std": 0.22997993230819702,
"step": 758
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 357.78125,
"completions/mean_terminated_length": 265.25,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.6072,
"grad_norm": 3.4296295642852783,
"kl": 0.05389404296875,
"learning_rate": 1e-06,
"loss": -0.0162,
"num_tokens": 9961058.0,
"reward": 0.07074315845966339,
"reward_std": 0.0662379041314125,
"rewards/bleu_reward_func/mean": 0.07074315845966339,
"rewards/bleu_reward_func/std": 0.1079547107219696,
"step": 759
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 207.375,
"completions/mean_terminated_length": 175.86207580566406,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.608,
"grad_norm": 6.224173545837402,
"kl": 0.1185302734375,
"learning_rate": 1e-06,
"loss": 0.0424,
"num_tokens": 9972646.0,
"reward": 0.09982403367757797,
"reward_std": 0.06204840913414955,
"rewards/bleu_reward_func/mean": 0.09982403367757797,
"rewards/bleu_reward_func/std": 0.10973682999610901,
"step": 760
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 152.8125,
"completions/mean_terminated_length": 128.86666870117188,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.6088,
"grad_norm": 12.398276329040527,
"kl": 0.240966796875,
"learning_rate": 1e-06,
"loss": 0.198,
"num_tokens": 9985136.0,
"reward": 0.059197958558797836,
"reward_std": 0.02872345596551895,
"rewards/bleu_reward_func/mean": 0.059197958558797836,
"rewards/bleu_reward_func/std": 0.04119112715125084,
"step": 761
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 377.6875,
"completions/mean_terminated_length": 273.22222900390625,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.6096,
"grad_norm": 3.9670443534851074,
"kl": 0.046600341796875,
"learning_rate": 1e-06,
"loss": 0.0648,
"num_tokens": 10003078.0,
"reward": 0.0413321927189827,
"reward_std": 0.015110660344362259,
"rewards/bleu_reward_func/mean": 0.0413321927189827,
"rewards/bleu_reward_func/std": 0.032528944313526154,
"step": 762
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 472.0,
"completions/mean_terminated_length": 413.5384826660156,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.6104,
"grad_norm": 2.092273712158203,
"kl": 0.0268096923828125,
"learning_rate": 1e-06,
"loss": 0.0147,
"num_tokens": 10020230.0,
"reward": 0.08549900352954865,
"reward_std": 0.03175706788897514,
"rewards/bleu_reward_func/mean": 0.08549900352954865,
"rewards/bleu_reward_func/std": 0.042792484164237976,
"step": 763
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 150.6875,
"completions/mean_terminated_length": 83.77777862548828,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6112,
"grad_norm": 9.28490924835205,
"kl": 0.265869140625,
"learning_rate": 1e-06,
"loss": 0.2918,
"num_tokens": 10031172.0,
"reward": 0.21292155981063843,
"reward_std": 0.06925603747367859,
"rewards/bleu_reward_func/mean": 0.21292155981063843,
"rewards/bleu_reward_func/std": 0.18994402885437012,
"step": 764
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 328.3125,
"completions/mean_terminated_length": 218.10000610351562,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.612,
"grad_norm": 21.738191604614258,
"kl": 0.1495361328125,
"learning_rate": 1e-06,
"loss": -0.0895,
"num_tokens": 10046358.0,
"reward": 0.09622834622859955,
"reward_std": 0.04176661744713783,
"rewards/bleu_reward_func/mean": 0.09622834622859955,
"rewards/bleu_reward_func/std": 0.09296616911888123,
"step": 765
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 288.21875,
"completions/mean_terminated_length": 246.7777862548828,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.6128,
"grad_norm": 5.178511142730713,
"kl": 0.1473388671875,
"learning_rate": 1e-06,
"loss": 0.0688,
"num_tokens": 10059877.0,
"reward": 0.088385209441185,
"reward_std": 0.016962474212050438,
"rewards/bleu_reward_func/mean": 0.088385209441185,
"rewards/bleu_reward_func/std": 0.08985943347215652,
"step": 766
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 311.46875,
"completions/mean_terminated_length": 244.625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6136,
"grad_norm": 11.883882522583008,
"kl": 0.2354583740234375,
"learning_rate": 1e-06,
"loss": -0.0238,
"num_tokens": 10073076.0,
"reward": 0.18446165323257446,
"reward_std": 0.10309243947267532,
"rewards/bleu_reward_func/mean": 0.18446165323257446,
"rewards/bleu_reward_func/std": 0.30338525772094727,
"step": 767
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 275.59375,
"completions/mean_terminated_length": 168.13636779785156,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6144,
"grad_norm": 5.438653469085693,
"kl": 0.081451416015625,
"learning_rate": 1e-06,
"loss": 0.0639,
"num_tokens": 10085871.0,
"reward": 0.06602545082569122,
"reward_std": 0.030349329113960266,
"rewards/bleu_reward_func/mean": 0.06602545082569122,
"rewards/bleu_reward_func/std": 0.04767395555973053,
"step": 768
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 467.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 136.09375,
"completions/mean_terminated_length": 136.09375,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6152,
"grad_norm": 5.674292087554932,
"kl": 0.26715087890625,
"learning_rate": 1e-06,
"loss": 0.2437,
"num_tokens": 10094858.0,
"reward": 0.07597756385803223,
"reward_std": 0.027629435062408447,
"rewards/bleu_reward_func/mean": 0.07597756385803223,
"rewards/bleu_reward_func/std": 0.054181892424821854,
"step": 769
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 107.9375,
"completions/mean_terminated_length": 94.9032211303711,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.616,
"grad_norm": 9.938776969909668,
"kl": 0.16571044921875,
"learning_rate": 1e-06,
"loss": 0.082,
"num_tokens": 10106672.0,
"reward": 0.3735446035861969,
"reward_std": 0.03898521885275841,
"rewards/bleu_reward_func/mean": 0.3735446035861969,
"rewards/bleu_reward_func/std": 0.30306297540664673,
"step": 770
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 338.6875,
"completions/mean_terminated_length": 280.91668701171875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.6168,
"grad_norm": 3.914602041244507,
"kl": 0.05010986328125,
"learning_rate": 1e-06,
"loss": -0.1112,
"num_tokens": 10120510.0,
"reward": 0.09634806215763092,
"reward_std": 0.04157658666372299,
"rewards/bleu_reward_func/mean": 0.09634806215763092,
"rewards/bleu_reward_func/std": 0.08702099323272705,
"step": 771
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 270.875,
"completions/mean_terminated_length": 215.23077392578125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6176,
"grad_norm": 5.461522102355957,
"kl": 0.3319091796875,
"learning_rate": 1e-06,
"loss": 0.0344,
"num_tokens": 10132138.0,
"reward": 0.18050828576087952,
"reward_std": 0.033299222588539124,
"rewards/bleu_reward_func/mean": 0.18050828576087952,
"rewards/bleu_reward_func/std": 0.21068614721298218,
"step": 772
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 131.375,
"completions/mean_terminated_length": 131.375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6184,
"grad_norm": 8.162845611572266,
"kl": 0.2459716796875,
"learning_rate": 1e-06,
"loss": 0.1064,
"num_tokens": 10142558.0,
"reward": 0.1030242070555687,
"reward_std": 0.05847536772489548,
"rewards/bleu_reward_func/mean": 0.1030242070555687,
"rewards/bleu_reward_func/std": 0.14961844682693481,
"step": 773
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 169.9375,
"completions/mean_terminated_length": 169.9375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.6192,
"grad_norm": 43.1834602355957,
"kl": 0.22735595703125,
"learning_rate": 1e-06,
"loss": -0.0649,
"num_tokens": 10151012.0,
"reward": 0.04478512331843376,
"reward_std": 0.012456279247999191,
"rewards/bleu_reward_func/mean": 0.04478512331843376,
"rewards/bleu_reward_func/std": 0.04301442950963974,
"step": 774
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 249.3125,
"completions/mean_terminated_length": 188.69232177734375,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.62,
"grad_norm": 8.241740226745605,
"kl": 0.188079833984375,
"learning_rate": 1e-06,
"loss": 0.1066,
"num_tokens": 10163446.0,
"reward": 0.15149368345737457,
"reward_std": 0.028546612709760666,
"rewards/bleu_reward_func/mean": 0.15149368345737457,
"rewards/bleu_reward_func/std": 0.14032159745693207,
"step": 775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 235.125,
"completions/mean_terminated_length": 126.78260803222656,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.6208,
"grad_norm": 8.033968925476074,
"kl": 0.22894287109375,
"learning_rate": 1e-06,
"loss": 0.098,
"num_tokens": 10174522.0,
"reward": 0.22483249008655548,
"reward_std": 0.04489654302597046,
"rewards/bleu_reward_func/mean": 0.22483249008655548,
"rewards/bleu_reward_func/std": 0.24184906482696533,
"step": 776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 273.0625,
"completions/mean_terminated_length": 179.56521606445312,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6216,
"grad_norm": 6.315563201904297,
"kl": 0.09613037109375,
"learning_rate": 1e-06,
"loss": 0.1547,
"num_tokens": 10188964.0,
"reward": 0.06417440623044968,
"reward_std": 0.01652311347424984,
"rewards/bleu_reward_func/mean": 0.06417440623044968,
"rewards/bleu_reward_func/std": 0.05222758278250694,
"step": 777
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 350.40625,
"completions/mean_terminated_length": 188.8125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.6224,
"grad_norm": 4.945059776306152,
"kl": 0.08636474609375,
"learning_rate": 1e-06,
"loss": 0.0638,
"num_tokens": 10203657.0,
"reward": 0.07400526106357574,
"reward_std": 0.03621644526720047,
"rewards/bleu_reward_func/mean": 0.07400526106357574,
"rewards/bleu_reward_func/std": 0.05740804970264435,
"step": 778
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 319.1875,
"completions/mean_terminated_length": 243.7391357421875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.6232,
"grad_norm": 4.364378929138184,
"kl": 0.07342529296875,
"learning_rate": 1e-06,
"loss": -0.0132,
"num_tokens": 10219919.0,
"reward": 0.24302205443382263,
"reward_std": 0.05576051399111748,
"rewards/bleu_reward_func/mean": 0.24302205443382263,
"rewards/bleu_reward_func/std": 0.21575042605400085,
"step": 779
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 298.75,
"completions/mean_terminated_length": 201.8181915283203,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.624,
"grad_norm": 3.959169864654541,
"kl": 0.06689453125,
"learning_rate": 1e-06,
"loss": 0.0405,
"num_tokens": 10234119.0,
"reward": 0.09677430242300034,
"reward_std": 0.02671782858669758,
"rewards/bleu_reward_func/mean": 0.09677430242300034,
"rewards/bleu_reward_func/std": 0.06890682131052017,
"step": 780
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 211.40625,
"completions/mean_terminated_length": 168.46429443359375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.6248,
"grad_norm": 10.254522323608398,
"kl": 0.30865478515625,
"learning_rate": 1e-06,
"loss": -0.044,
"num_tokens": 10247388.0,
"reward": 0.2194344401359558,
"reward_std": 0.04920031875371933,
"rewards/bleu_reward_func/mean": 0.2194344401359558,
"rewards/bleu_reward_func/std": 0.15552020072937012,
"step": 781
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 201.46875,
"completions/mean_terminated_length": 180.7666778564453,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.6256,
"grad_norm": 7.234709739685059,
"kl": 0.1651611328125,
"learning_rate": 1e-06,
"loss": 0.2297,
"num_tokens": 10256947.0,
"reward": 0.11007180064916611,
"reward_std": 0.07193183898925781,
"rewards/bleu_reward_func/mean": 0.11007180064916611,
"rewards/bleu_reward_func/std": 0.13098347187042236,
"step": 782
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 288.15625,
"completions/mean_terminated_length": 246.70370483398438,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.6264,
"grad_norm": 3.6756701469421387,
"kl": 0.050689697265625,
"learning_rate": 1e-06,
"loss": -0.2297,
"num_tokens": 10271504.0,
"reward": 0.07084184139966965,
"reward_std": 0.03263479843735695,
"rewards/bleu_reward_func/mean": 0.07084184139966965,
"rewards/bleu_reward_func/std": 0.07953313738107681,
"step": 783
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 370.0,
"completions/mean_length": 127.375,
"completions/mean_terminated_length": 114.96773529052734,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.6272,
"grad_norm": 7.3130879402160645,
"kl": 0.143096923828125,
"learning_rate": 1e-06,
"loss": -0.1119,
"num_tokens": 10281236.0,
"reward": 0.17116650938987732,
"reward_std": 0.040961284190416336,
"rewards/bleu_reward_func/mean": 0.17116650938987732,
"rewards/bleu_reward_func/std": 0.16110415756702423,
"step": 784
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 198.125,
"completions/mean_terminated_length": 140.0,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.628,
"grad_norm": 5.703468322753906,
"kl": 0.168975830078125,
"learning_rate": 1e-06,
"loss": 0.0673,
"num_tokens": 10293288.0,
"reward": 0.14155232906341553,
"reward_std": 0.059418316930532455,
"rewards/bleu_reward_func/mean": 0.14155232906341553,
"rewards/bleu_reward_func/std": 0.142944797873497,
"step": 785
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 312.5,
"completions/mean_terminated_length": 266.4615478515625,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.6288,
"grad_norm": 2.4137492179870605,
"kl": 0.023956298828125,
"learning_rate": 1e-06,
"loss": 0.0405,
"num_tokens": 10309744.0,
"reward": 0.16134724020957947,
"reward_std": 0.01978662982583046,
"rewards/bleu_reward_func/mean": 0.16134724020957947,
"rewards/bleu_reward_func/std": 0.16176313161849976,
"step": 786
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 281.34375,
"completions/mean_terminated_length": 228.11538696289062,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.6296,
"grad_norm": 5.432137489318848,
"kl": 0.126953125,
"learning_rate": 1e-06,
"loss": -0.0834,
"num_tokens": 10322867.0,
"reward": 0.13262051343917847,
"reward_std": 0.054468683898448944,
"rewards/bleu_reward_func/mean": 0.13262051343917847,
"rewards/bleu_reward_func/std": 0.1454581618309021,
"step": 787
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 157.8125,
"completions/mean_terminated_length": 121.17241668701172,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6304,
"grad_norm": 8.820817947387695,
"kl": 0.384033203125,
"learning_rate": 1e-06,
"loss": 0.1057,
"num_tokens": 10330085.0,
"reward": 0.14398705959320068,
"reward_std": 0.05267474800348282,
"rewards/bleu_reward_func/mean": 0.14398705959320068,
"rewards/bleu_reward_func/std": 0.12204661965370178,
"step": 788
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 313.875,
"completions/mean_terminated_length": 223.8181915283203,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6312,
"grad_norm": 6.252136707305908,
"kl": 0.202178955078125,
"learning_rate": 1e-06,
"loss": -0.0958,
"num_tokens": 10344937.0,
"reward": 0.08566081523895264,
"reward_std": 0.0418044775724411,
"rewards/bleu_reward_func/mean": 0.08566081523895264,
"rewards/bleu_reward_func/std": 0.1277945637702942,
"step": 789
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 476.40625,
"completions/mean_terminated_length": 408.4545593261719,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.632,
"grad_norm": 2.1677629947662354,
"kl": 0.03436279296875,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 10362222.0,
"reward": 0.04695405811071396,
"reward_std": 0.013839447870850563,
"rewards/bleu_reward_func/mean": 0.04695405811071396,
"rewards/bleu_reward_func/std": 0.03280064836144447,
"step": 790
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 404.46875,
"completions/mean_terminated_length": 296.9375,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.6328,
"grad_norm": 2.3538496494293213,
"kl": 0.0289459228515625,
"learning_rate": 1e-06,
"loss": -0.0297,
"num_tokens": 10382845.0,
"reward": 0.08459493517875671,
"reward_std": 0.029446884989738464,
"rewards/bleu_reward_func/mean": 0.08459493517875671,
"rewards/bleu_reward_func/std": 0.051741067320108414,
"step": 791
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 418.0,
"completions/mean_length": 133.0625,
"completions/mean_terminated_length": 107.80000305175781,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6336,
"grad_norm": 6.885672092437744,
"kl": 0.20733642578125,
"learning_rate": 1e-06,
"loss": 0.1098,
"num_tokens": 10391199.0,
"reward": 0.10581733286380768,
"reward_std": 0.034825149923563004,
"rewards/bleu_reward_func/mean": 0.10581733286380768,
"rewards/bleu_reward_func/std": 0.10278832167387009,
"step": 792
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 154.4375,
"completions/mean_terminated_length": 35.25,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.6344,
"grad_norm": 8.279248237609863,
"kl": 0.2750244140625,
"learning_rate": 1e-06,
"loss": -0.0444,
"num_tokens": 10399789.0,
"reward": 0.1634266972541809,
"reward_std": 0.029335156083106995,
"rewards/bleu_reward_func/mean": 0.1634266972541809,
"rewards/bleu_reward_func/std": 0.1743723601102829,
"step": 793
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 244.96875,
"completions/mean_terminated_length": 123.59091186523438,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6352,
"grad_norm": 5.577760219573975,
"kl": 0.230865478515625,
"learning_rate": 1e-06,
"loss": -0.0138,
"num_tokens": 10413116.0,
"reward": 0.18318259716033936,
"reward_std": 0.02782328985631466,
"rewards/bleu_reward_func/mean": 0.18318259716033936,
"rewards/bleu_reward_func/std": 0.14704957604408264,
"step": 794
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 354.78125,
"completions/mean_terminated_length": 344.3000183105469,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.636,
"grad_norm": 2.591658115386963,
"kl": 0.0296630859375,
"learning_rate": 1e-06,
"loss": -0.0201,
"num_tokens": 10426797.0,
"reward": 0.06094507500529289,
"reward_std": 0.02977069467306137,
"rewards/bleu_reward_func/mean": 0.06094507500529289,
"rewards/bleu_reward_func/std": 0.03347548097372055,
"step": 795
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 209.25,
"completions/mean_terminated_length": 189.06668090820312,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.6368,
"grad_norm": 7.372705936431885,
"kl": 0.226715087890625,
"learning_rate": 1e-06,
"loss": 0.0166,
"num_tokens": 10435821.0,
"reward": 0.17854920029640198,
"reward_std": 0.039038486778736115,
"rewards/bleu_reward_func/mean": 0.17854920029640198,
"rewards/bleu_reward_func/std": 0.11250942945480347,
"step": 796
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 510.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 139.90625,
"completions/mean_terminated_length": 139.90625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6376,
"grad_norm": 7.399951457977295,
"kl": 0.2593994140625,
"learning_rate": 1e-06,
"loss": 0.0215,
"num_tokens": 10444706.0,
"reward": 0.20415213704109192,
"reward_std": 0.05372469127178192,
"rewards/bleu_reward_func/mean": 0.20415213704109192,
"rewards/bleu_reward_func/std": 0.15420135855674744,
"step": 797
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 253.3125,
"completions/mean_terminated_length": 193.61538696289062,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6384,
"grad_norm": 5.452202320098877,
"kl": 0.207672119140625,
"learning_rate": 1e-06,
"loss": 0.0658,
"num_tokens": 10457300.0,
"reward": 0.18789556622505188,
"reward_std": 0.06054109334945679,
"rewards/bleu_reward_func/mean": 0.18789556622505188,
"rewards/bleu_reward_func/std": 0.18226853013038635,
"step": 798
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 164.53125,
"completions/mean_terminated_length": 100.18518829345703,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6392,
"grad_norm": 9.581258773803711,
"kl": 0.312286376953125,
"learning_rate": 1e-06,
"loss": -0.0913,
"num_tokens": 10465053.0,
"reward": 0.14276297390460968,
"reward_std": 0.028537599369883537,
"rewards/bleu_reward_func/mean": 0.14276297390460968,
"rewards/bleu_reward_func/std": 0.10928227007389069,
"step": 799
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 313.0,
"completions/max_terminated_length": 313.0,
"completions/mean_length": 56.59375,
"completions/mean_terminated_length": 56.59375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.64,
"grad_norm": 7.995264053344727,
"kl": 0.2779541015625,
"learning_rate": 1e-06,
"loss": 0.3759,
"num_tokens": 10477608.0,
"reward": 0.34325188398361206,
"reward_std": 0.07241753488779068,
"rewards/bleu_reward_func/mean": 0.34325188398361206,
"rewards/bleu_reward_func/std": 0.20597775280475616,
"step": 800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 191.5625,
"completions/mean_terminated_length": 117.61538696289062,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.6408,
"grad_norm": 4.229004859924316,
"kl": 0.0904541015625,
"learning_rate": 1e-06,
"loss": 0.1193,
"num_tokens": 10485490.0,
"reward": 0.11370354145765305,
"reward_std": 0.061382561922073364,
"rewards/bleu_reward_func/mean": 0.11370354145765305,
"rewards/bleu_reward_func/std": 0.15154796838760376,
"step": 801
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 439.0,
"completions/mean_length": 293.0,
"completions/mean_terminated_length": 99.76470947265625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.6416,
"grad_norm": 4.936343193054199,
"kl": 0.227630615234375,
"learning_rate": 1e-06,
"loss": 0.0723,
"num_tokens": 10497930.0,
"reward": 0.15342603623867035,
"reward_std": 0.018828846514225006,
"rewards/bleu_reward_func/mean": 0.15342603623867035,
"rewards/bleu_reward_func/std": 0.22573818266391754,
"step": 802
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 357.0,
"completions/mean_length": 206.9375,
"completions/mean_terminated_length": 163.35714721679688,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.6424,
"grad_norm": 4.5400471687316895,
"kl": 0.1302490234375,
"learning_rate": 1e-06,
"loss": -0.0265,
"num_tokens": 10509096.0,
"reward": 0.042201556265354156,
"reward_std": 0.01641710475087166,
"rewards/bleu_reward_func/mean": 0.042201556265354156,
"rewards/bleu_reward_func/std": 0.026252396404743195,
"step": 803
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 339.8125,
"completions/mean_terminated_length": 222.0,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.6432,
"grad_norm": 4.132330417633057,
"kl": 0.042144775390625,
"learning_rate": 1e-06,
"loss": -0.0638,
"num_tokens": 10522810.0,
"reward": 0.05155924707651138,
"reward_std": 0.017338326200842857,
"rewards/bleu_reward_func/mean": 0.05155924707651138,
"rewards/bleu_reward_func/std": 0.03961692750453949,
"step": 804
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 375.0,
"completions/mean_length": 251.0,
"completions/mean_terminated_length": 132.3636474609375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.644,
"grad_norm": 6.286128044128418,
"kl": 0.150299072265625,
"learning_rate": 1e-06,
"loss": 0.1961,
"num_tokens": 10533090.0,
"reward": 0.03828435763716698,
"reward_std": 0.01768323965370655,
"rewards/bleu_reward_func/mean": 0.03828435763716698,
"rewards/bleu_reward_func/std": 0.035699598491191864,
"step": 805
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 286.03125,
"completions/mean_terminated_length": 253.75001525878906,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.6448,
"grad_norm": 3.333425283432007,
"kl": 0.0326385498046875,
"learning_rate": 1e-06,
"loss": 0.0689,
"num_tokens": 10544131.0,
"reward": 0.11853313446044922,
"reward_std": 0.06690388172864914,
"rewards/bleu_reward_func/mean": 0.11853313446044922,
"rewards/bleu_reward_func/std": 0.14521227777004242,
"step": 806
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 176.6875,
"completions/mean_terminated_length": 165.87095642089844,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6456,
"grad_norm": 6.8076090812683105,
"kl": 0.24957275390625,
"learning_rate": 1e-06,
"loss": -0.1496,
"num_tokens": 10554641.0,
"reward": 0.12172487378120422,
"reward_std": 0.05724428966641426,
"rewards/bleu_reward_func/mean": 0.12172487378120422,
"rewards/bleu_reward_func/std": 0.11496427655220032,
"step": 807
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 146.875,
"completions/mean_terminated_length": 62.615386962890625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6464,
"grad_norm": 9.55725383758545,
"kl": 0.30224609375,
"learning_rate": 1e-06,
"loss": 0.0709,
"num_tokens": 10563829.0,
"reward": 0.20068883895874023,
"reward_std": 0.06663694977760315,
"rewards/bleu_reward_func/mean": 0.20068883895874023,
"rewards/bleu_reward_func/std": 0.13896267116069794,
"step": 808
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 176.0,
"completions/mean_length": 141.34375,
"completions/mean_terminated_length": 37.55999755859375,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6472,
"grad_norm": 26.54884147644043,
"kl": 0.41717529296875,
"learning_rate": 1e-06,
"loss": -0.1795,
"num_tokens": 10574624.0,
"reward": 0.22808396816253662,
"reward_std": 0.06877206265926361,
"rewards/bleu_reward_func/mean": 0.22808396816253662,
"rewards/bleu_reward_func/std": 0.21049334108829498,
"step": 809
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 308.0,
"completions/max_terminated_length": 308.0,
"completions/mean_length": 111.21875,
"completions/mean_terminated_length": 111.21875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.648,
"grad_norm": 13.591133117675781,
"kl": 0.272674560546875,
"learning_rate": 1e-06,
"loss": -0.0292,
"num_tokens": 10583671.0,
"reward": 0.2966269850730896,
"reward_std": 0.015265233814716339,
"rewards/bleu_reward_func/mean": 0.2966269850730896,
"rewards/bleu_reward_func/std": 0.24745707213878632,
"step": 810
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 94.34375,
"completions/mean_terminated_length": 80.87096405029297,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6488,
"grad_norm": 10.449114799499512,
"kl": 0.2784423828125,
"learning_rate": 1e-06,
"loss": -0.2645,
"num_tokens": 10592586.0,
"reward": 0.23048871755599976,
"reward_std": 0.05683053284883499,
"rewards/bleu_reward_func/mean": 0.23048871755599976,
"rewards/bleu_reward_func/std": 0.304109662771225,
"step": 811
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 184.65625,
"completions/mean_terminated_length": 75.54167175292969,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.6496,
"grad_norm": 6.2868170738220215,
"kl": 0.202392578125,
"learning_rate": 1e-06,
"loss": 0.0856,
"num_tokens": 10604447.0,
"reward": 0.06996900588274002,
"reward_std": 0.01753135770559311,
"rewards/bleu_reward_func/mean": 0.06996900588274002,
"rewards/bleu_reward_func/std": 0.07089151442050934,
"step": 812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 171.0,
"completions/max_terminated_length": 171.0,
"completions/mean_length": 46.96875,
"completions/mean_terminated_length": 46.96875,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.6504,
"grad_norm": 15.99052619934082,
"kl": 0.55419921875,
"learning_rate": 1e-06,
"loss": 0.1537,
"num_tokens": 10612710.0,
"reward": 0.08658448606729507,
"reward_std": 0.03601383790373802,
"rewards/bleu_reward_func/mean": 0.08658448606729507,
"rewards/bleu_reward_func/std": 0.05530841648578644,
"step": 813
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 355.0,
"completions/mean_terminated_length": 283.6363830566406,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.6512,
"grad_norm": 3.1796348094940186,
"kl": 0.059417724609375,
"learning_rate": 1e-06,
"loss": 0.1311,
"num_tokens": 10625326.0,
"reward": 0.11449694633483887,
"reward_std": 0.027395280078053474,
"rewards/bleu_reward_func/mean": 0.11449694633483887,
"rewards/bleu_reward_func/std": 0.05288613215088844,
"step": 814
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 128.96875,
"completions/mean_terminated_length": 103.43334197998047,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.652,
"grad_norm": 6.432667255401611,
"kl": 0.1822509765625,
"learning_rate": 1e-06,
"loss": 0.1721,
"num_tokens": 10635421.0,
"reward": 0.18185263872146606,
"reward_std": 0.0783199891448021,
"rewards/bleu_reward_func/mean": 0.18185263872146606,
"rewards/bleu_reward_func/std": 0.18959355354309082,
"step": 815
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 242.9375,
"completions/mean_terminated_length": 204.50001525878906,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6528,
"grad_norm": 4.343508720397949,
"kl": 0.06640625,
"learning_rate": 1e-06,
"loss": 0.1592,
"num_tokens": 10647283.0,
"reward": 0.10118309408426285,
"reward_std": 0.026538610458374023,
"rewards/bleu_reward_func/mean": 0.10118309408426285,
"rewards/bleu_reward_func/std": 0.08866976201534271,
"step": 816
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 217.5625,
"completions/mean_terminated_length": 175.50001525878906,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.6536,
"grad_norm": 4.102277755737305,
"kl": 0.141204833984375,
"learning_rate": 1e-06,
"loss": 0.2738,
"num_tokens": 10660189.0,
"reward": 0.23801954090595245,
"reward_std": 0.07484984397888184,
"rewards/bleu_reward_func/mean": 0.23801954090595245,
"rewards/bleu_reward_func/std": 0.168580561876297,
"step": 817
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 180.625,
"completions/mean_terminated_length": 169.93548583984375,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.6544,
"grad_norm": 5.659482002258301,
"kl": 0.08270263671875,
"learning_rate": 1e-06,
"loss": 0.1325,
"num_tokens": 10668393.0,
"reward": 0.08136270940303802,
"reward_std": 0.02622528187930584,
"rewards/bleu_reward_func/mean": 0.08136270940303802,
"rewards/bleu_reward_func/std": 0.03544744476675987,
"step": 818
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 289.375,
"completions/mean_terminated_length": 227.0399932861328,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.6552,
"grad_norm": 3.059807300567627,
"kl": 0.076446533203125,
"learning_rate": 1e-06,
"loss": -0.016,
"num_tokens": 10682749.0,
"reward": 0.07544586062431335,
"reward_std": 0.0220788661390543,
"rewards/bleu_reward_func/mean": 0.07544586062431335,
"rewards/bleu_reward_func/std": 0.04309820756316185,
"step": 819
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 407.0,
"completions/mean_length": 271.4375,
"completions/mean_terminated_length": 191.25,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.656,
"grad_norm": 4.126210689544678,
"kl": 0.06744384765625,
"learning_rate": 1e-06,
"loss": 0.2091,
"num_tokens": 10693011.0,
"reward": 0.12100762873888016,
"reward_std": 0.040202461183071136,
"rewards/bleu_reward_func/mean": 0.12100762873888016,
"rewards/bleu_reward_func/std": 0.09315716475248337,
"step": 820
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 360.34375,
"completions/mean_terminated_length": 226.5294189453125,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.6568,
"grad_norm": 3.169628620147705,
"kl": 0.030731201171875,
"learning_rate": 1e-06,
"loss": -0.1334,
"num_tokens": 10706846.0,
"reward": 0.041019197553396225,
"reward_std": 0.012767975218594074,
"rewards/bleu_reward_func/mean": 0.041019197553396225,
"rewards/bleu_reward_func/std": 0.050586286932229996,
"step": 821
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 378.0,
"completions/mean_length": 264.46875,
"completions/mean_terminated_length": 46.05882263183594,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6576,
"grad_norm": 6.150150775909424,
"kl": 0.174896240234375,
"learning_rate": 1e-06,
"loss": 0.0093,
"num_tokens": 10720189.0,
"reward": 0.10611159354448318,
"reward_std": 0.044405680149793625,
"rewards/bleu_reward_func/mean": 0.10611159354448318,
"rewards/bleu_reward_func/std": 0.10892455279827118,
"step": 822
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 266.84375,
"completions/mean_terminated_length": 210.2692413330078,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.6584,
"grad_norm": 6.753479957580566,
"kl": 0.20147705078125,
"learning_rate": 1e-06,
"loss": 0.0273,
"num_tokens": 10731600.0,
"reward": 0.10170187056064606,
"reward_std": 0.03044716641306877,
"rewards/bleu_reward_func/mean": 0.10170187056064606,
"rewards/bleu_reward_func/std": 0.05836126208305359,
"step": 823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 392.28125,
"completions/mean_terminated_length": 299.1666564941406,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.6592,
"grad_norm": 2.5884182453155518,
"kl": 0.0380859375,
"learning_rate": 1e-06,
"loss": -0.0465,
"num_tokens": 10748121.0,
"reward": 0.03844967484474182,
"reward_std": 0.012901275418698788,
"rewards/bleu_reward_func/mean": 0.03844967484474182,
"rewards/bleu_reward_func/std": 0.032823171466588974,
"step": 824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 270.5,
"completions/mean_terminated_length": 214.7692413330078,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.66,
"grad_norm": 4.168792247772217,
"kl": 0.083251953125,
"learning_rate": 1e-06,
"loss": -0.0346,
"num_tokens": 10763353.0,
"reward": 0.08619528263807297,
"reward_std": 0.02499576285481453,
"rewards/bleu_reward_func/mean": 0.08619528263807297,
"rewards/bleu_reward_func/std": 0.10102304071187973,
"step": 825
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 264.78125,
"completions/mean_terminated_length": 195.55999755859375,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.6608,
"grad_norm": 10.431943893432617,
"kl": 0.154876708984375,
"learning_rate": 1e-06,
"loss": 0.1006,
"num_tokens": 10776642.0,
"reward": 0.06286956369876862,
"reward_std": 0.027797410264611244,
"rewards/bleu_reward_func/mean": 0.06286956369876862,
"rewards/bleu_reward_func/std": 0.07537111639976501,
"step": 826
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 231.1875,
"completions/mean_terminated_length": 202.13792419433594,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6616,
"grad_norm": 7.486932754516602,
"kl": 0.15045166015625,
"learning_rate": 1e-06,
"loss": 0.2166,
"num_tokens": 10788656.0,
"reward": 0.10479411482810974,
"reward_std": 0.029287472367286682,
"rewards/bleu_reward_func/mean": 0.10479411482810974,
"rewards/bleu_reward_func/std": 0.07098822295665741,
"step": 827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 92.0,
"completions/mean_length": 165.3125,
"completions/mean_terminated_length": 49.75,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.6624,
"grad_norm": 7.1387104988098145,
"kl": 0.23382568359375,
"learning_rate": 1e-06,
"loss": 0.0727,
"num_tokens": 10795930.0,
"reward": 0.11342652887105942,
"reward_std": 0.027198534458875656,
"rewards/bleu_reward_func/mean": 0.11342652887105942,
"rewards/bleu_reward_func/std": 0.11971734464168549,
"step": 828
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 464.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 108.96875,
"completions/mean_terminated_length": 108.96875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6632,
"grad_norm": 8.27657413482666,
"kl": 0.36053466796875,
"learning_rate": 1e-06,
"loss": -0.0108,
"num_tokens": 10804241.0,
"reward": 0.28853511810302734,
"reward_std": 0.07218953967094421,
"rewards/bleu_reward_func/mean": 0.28853511810302734,
"rewards/bleu_reward_func/std": 0.20515379309654236,
"step": 829
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 51.875,
"completions/mean_terminated_length": 51.875,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.664,
"grad_norm": 8.845324516296387,
"kl": 0.4793701171875,
"learning_rate": 1e-06,
"loss": 0.1651,
"num_tokens": 10813957.0,
"reward": 0.2881876826286316,
"reward_std": 0.06279260665178299,
"rewards/bleu_reward_func/mean": 0.2881876826286316,
"rewards/bleu_reward_func/std": 0.23817574977874756,
"step": 830
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 268.46875,
"completions/mean_terminated_length": 157.77273559570312,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.6648,
"grad_norm": 5.753482818603516,
"kl": 0.143280029296875,
"learning_rate": 1e-06,
"loss": 0.0154,
"num_tokens": 10826356.0,
"reward": 0.13365639746189117,
"reward_std": 0.023316586390137672,
"rewards/bleu_reward_func/mean": 0.13365639746189117,
"rewards/bleu_reward_func/std": 0.20613247156143188,
"step": 831
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 194.125,
"completions/mean_terminated_length": 172.933349609375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.6656,
"grad_norm": 5.661513805389404,
"kl": 0.106048583984375,
"learning_rate": 1e-06,
"loss": 0.0722,
"num_tokens": 10836128.0,
"reward": 0.08124659210443497,
"reward_std": 0.016117524355649948,
"rewards/bleu_reward_func/mean": 0.08124659210443497,
"rewards/bleu_reward_func/std": 0.08725257217884064,
"step": 832
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 60.0,
"completions/max_terminated_length": 60.0,
"completions/mean_length": 26.625,
"completions/mean_terminated_length": 26.625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6664,
"grad_norm": 12.99825382232666,
"kl": 0.453857421875,
"learning_rate": 1e-06,
"loss": 0.1057,
"num_tokens": 10842116.0,
"reward": 0.3153986930847168,
"reward_std": 0.0658825933933258,
"rewards/bleu_reward_func/mean": 0.3153986930847168,
"rewards/bleu_reward_func/std": 0.17146961390972137,
"step": 833
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 185.5625,
"completions/mean_terminated_length": 94.15999603271484,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6672,
"grad_norm": 9.406866073608398,
"kl": 0.144378662109375,
"learning_rate": 1e-06,
"loss": 0.2779,
"num_tokens": 10852030.0,
"reward": 0.13975293934345245,
"reward_std": 0.04399016499519348,
"rewards/bleu_reward_func/mean": 0.13975293934345245,
"rewards/bleu_reward_func/std": 0.17490676045417786,
"step": 834
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 355.125,
"completions/mean_terminated_length": 198.25,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.668,
"grad_norm": 8.956555366516113,
"kl": 0.220977783203125,
"learning_rate": 1e-06,
"loss": -0.047,
"num_tokens": 10869930.0,
"reward": 0.04670516401529312,
"reward_std": 0.01496485248208046,
"rewards/bleu_reward_func/mean": 0.04670516401529312,
"rewards/bleu_reward_func/std": 0.03720833733677864,
"step": 835
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 370.71875,
"completions/mean_terminated_length": 331.1600036621094,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.6688,
"grad_norm": 2.452960729598999,
"kl": 0.048583984375,
"learning_rate": 1e-06,
"loss": -0.0511,
"num_tokens": 10884969.0,
"reward": 0.05606111139059067,
"reward_std": 0.01513909362256527,
"rewards/bleu_reward_func/mean": 0.05606111139059067,
"rewards/bleu_reward_func/std": 0.05016703903675079,
"step": 836
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 340.0,
"completions/mean_length": 200.40625,
"completions/mean_terminated_length": 96.54167175292969,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.6696,
"grad_norm": 4.966017723083496,
"kl": 0.11663818359375,
"learning_rate": 1e-06,
"loss": 0.2335,
"num_tokens": 10897982.0,
"reward": 0.0694584771990776,
"reward_std": 0.04167729243636131,
"rewards/bleu_reward_func/mean": 0.0694584771990776,
"rewards/bleu_reward_func/std": 0.06985452026128769,
"step": 837
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 150.125,
"completions/mean_terminated_length": 126.00000762939453,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6704,
"grad_norm": 6.741494178771973,
"kl": 0.11309814453125,
"learning_rate": 1e-06,
"loss": -0.1029,
"num_tokens": 10911978.0,
"reward": 0.2410159707069397,
"reward_std": 0.056731171905994415,
"rewards/bleu_reward_func/mean": 0.2410159707069397,
"rewards/bleu_reward_func/std": 0.20536428689956665,
"step": 838
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 198.8125,
"completions/mean_terminated_length": 140.8148193359375,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.6712,
"grad_norm": 5.741243839263916,
"kl": 0.2135009765625,
"learning_rate": 1e-06,
"loss": 0.1006,
"num_tokens": 10925892.0,
"reward": 0.2018284797668457,
"reward_std": 0.04848968982696533,
"rewards/bleu_reward_func/mean": 0.2018284797668457,
"rewards/bleu_reward_func/std": 0.19715876877307892,
"step": 839
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 158.5,
"completions/mean_terminated_length": 158.5,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.672,
"grad_norm": 6.41207218170166,
"kl": 0.09161376953125,
"learning_rate": 1e-06,
"loss": -0.0922,
"num_tokens": 10934300.0,
"reward": 0.0522538498044014,
"reward_std": 0.021779239177703857,
"rewards/bleu_reward_func/mean": 0.0522538498044014,
"rewards/bleu_reward_func/std": 0.02408943697810173,
"step": 840
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 389.8125,
"completions/mean_terminated_length": 316.5,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.6728,
"grad_norm": 2.996119976043701,
"kl": 0.06011962890625,
"learning_rate": 1e-06,
"loss": 0.0296,
"num_tokens": 10951166.0,
"reward": 0.08128196746110916,
"reward_std": 0.01865270733833313,
"rewards/bleu_reward_func/mean": 0.08128196746110916,
"rewards/bleu_reward_func/std": 0.05130209028720856,
"step": 841
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 211.25,
"completions/mean_terminated_length": 180.13792419433594,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.6736,
"grad_norm": 4.3674750328063965,
"kl": 0.07989501953125,
"learning_rate": 1e-06,
"loss": 0.2159,
"num_tokens": 10962038.0,
"reward": 0.035381607711315155,
"reward_std": 0.015435540117323399,
"rewards/bleu_reward_func/mean": 0.035381607711315155,
"rewards/bleu_reward_func/std": 0.02227640338242054,
"step": 842
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 326.15625,
"completions/mean_terminated_length": 241.68182373046875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.6744,
"grad_norm": 7.144293308258057,
"kl": 0.1136932373046875,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 10980947.0,
"reward": 0.24271616339683533,
"reward_std": 0.03907809406518936,
"rewards/bleu_reward_func/mean": 0.24271616339683533,
"rewards/bleu_reward_func/std": 0.21944448351860046,
"step": 843
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 318.6875,
"completions/mean_terminated_length": 274.0769348144531,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.6752,
"grad_norm": 3.7767539024353027,
"kl": 0.080078125,
"learning_rate": 1e-06,
"loss": -0.0971,
"num_tokens": 10993977.0,
"reward": 0.034242644906044006,
"reward_std": 0.01977381855249405,
"rewards/bleu_reward_func/mean": 0.034242644906044006,
"rewards/bleu_reward_func/std": 0.024919696152210236,
"step": 844
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 265.375,
"completions/mean_terminated_length": 73.55555725097656,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.676,
"grad_norm": 8.71445083618164,
"kl": 0.34588623046875,
"learning_rate": 1e-06,
"loss": 0.0431,
"num_tokens": 11007565.0,
"reward": 0.23305484652519226,
"reward_std": 0.05401034653186798,
"rewards/bleu_reward_func/mean": 0.23305484652519226,
"rewards/bleu_reward_func/std": 0.2091369926929474,
"step": 845
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 363.375,
"completions/mean_terminated_length": 305.2174072265625,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.6768,
"grad_norm": 5.406923770904541,
"kl": 0.061279296875,
"learning_rate": 1e-06,
"loss": -0.1012,
"num_tokens": 11023473.0,
"reward": 0.15702804923057556,
"reward_std": 0.02012755163013935,
"rewards/bleu_reward_func/mean": 0.15702804923057556,
"rewards/bleu_reward_func/std": 0.16683605313301086,
"step": 846
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 84.125,
"completions/mean_terminated_length": 70.32257843017578,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6776,
"grad_norm": 7.79757022857666,
"kl": 0.094970703125,
"learning_rate": 1e-06,
"loss": 0.1681,
"num_tokens": 11033277.0,
"reward": 0.10883745551109314,
"reward_std": 0.06399966031312943,
"rewards/bleu_reward_func/mean": 0.10883745551109314,
"rewards/bleu_reward_func/std": 0.11935968697071075,
"step": 847
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 282.375,
"completions/mean_terminated_length": 239.8518524169922,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.6784,
"grad_norm": 7.137828350067139,
"kl": 0.18231201171875,
"learning_rate": 1e-06,
"loss": -0.1134,
"num_tokens": 11047977.0,
"reward": 0.07731978595256805,
"reward_std": 0.026035165414214134,
"rewards/bleu_reward_func/mean": 0.07731978595256805,
"rewards/bleu_reward_func/std": 0.08138881623744965,
"step": 848
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 427.0,
"completions/mean_length": 180.21875,
"completions/mean_terminated_length": 118.77777862548828,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6792,
"grad_norm": 10.036446571350098,
"kl": 0.2659912109375,
"learning_rate": 1e-06,
"loss": 0.1133,
"num_tokens": 11058320.0,
"reward": 0.13975116610527039,
"reward_std": 0.02090391516685486,
"rewards/bleu_reward_func/mean": 0.13975116610527039,
"rewards/bleu_reward_func/std": 0.15142837166786194,
"step": 849
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 369.6875,
"completions/mean_terminated_length": 284.3000183105469,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.68,
"grad_norm": 3.8194141387939453,
"kl": 0.038970947265625,
"learning_rate": 1e-06,
"loss": 0.0524,
"num_tokens": 11073662.0,
"reward": 0.08185985684394836,
"reward_std": 0.033635906875133514,
"rewards/bleu_reward_func/mean": 0.08185985684394836,
"rewards/bleu_reward_func/std": 0.06655923277139664,
"step": 850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 54.0,
"completions/mean_length": 106.5,
"completions/mean_terminated_length": 31.407407760620117,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.6808,
"grad_norm": 6.564956188201904,
"kl": 0.1971435546875,
"learning_rate": 1e-06,
"loss": 0.1071,
"num_tokens": 11085022.0,
"reward": 0.22551283240318298,
"reward_std": 0.04716075211763382,
"rewards/bleu_reward_func/mean": 0.22551283240318298,
"rewards/bleu_reward_func/std": 0.16660061478614807,
"step": 851
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 417.0,
"completions/mean_length": 154.375,
"completions/mean_terminated_length": 130.53334045410156,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.6816,
"grad_norm": 15.921568870544434,
"kl": 0.28778076171875,
"learning_rate": 1e-06,
"loss": -0.0597,
"num_tokens": 11095378.0,
"reward": 0.1252121478319168,
"reward_std": 0.05527370423078537,
"rewards/bleu_reward_func/mean": 0.1252121478319168,
"rewards/bleu_reward_func/std": 0.12923383712768555,
"step": 852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 305.9375,
"completions/mean_terminated_length": 212.27273559570312,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.6824,
"grad_norm": 8.1792631149292,
"kl": 0.1992645263671875,
"learning_rate": 1e-06,
"loss": 0.0667,
"num_tokens": 11109616.0,
"reward": 0.0659586489200592,
"reward_std": 0.027510065585374832,
"rewards/bleu_reward_func/mean": 0.0659586489200592,
"rewards/bleu_reward_func/std": 0.08249466121196747,
"step": 853
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 341.125,
"completions/mean_terminated_length": 224.2105255126953,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.6832,
"grad_norm": 2.265425443649292,
"kl": 0.039276123046875,
"learning_rate": 1e-06,
"loss": 0.2352,
"num_tokens": 11123036.0,
"reward": 0.10829215496778488,
"reward_std": 0.10021056979894638,
"rewards/bleu_reward_func/mean": 0.10829215496778488,
"rewards/bleu_reward_func/std": 0.15350966155529022,
"step": 854
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 398.5625,
"completions/mean_terminated_length": 285.125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.684,
"grad_norm": 2.628075361251831,
"kl": 0.047515869140625,
"learning_rate": 1e-06,
"loss": 0.1379,
"num_tokens": 11139950.0,
"reward": 0.04525969177484512,
"reward_std": 0.025323685258626938,
"rewards/bleu_reward_func/mean": 0.04525969177484512,
"rewards/bleu_reward_func/std": 0.04984954744577408,
"step": 855
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 489.0625,
"completions/mean_terminated_length": 389.66668701171875,
"completions/min_length": 325.0,
"completions/min_terminated_length": 325.0,
"epoch": 0.6848,
"grad_norm": 2.320620059967041,
"kl": 0.035888671875,
"learning_rate": 1e-06,
"loss": -0.0345,
"num_tokens": 11158672.0,
"reward": 0.016138827428221703,
"reward_std": 0.0038068746216595173,
"rewards/bleu_reward_func/mean": 0.016138827428221703,
"rewards/bleu_reward_func/std": 0.016928784549236298,
"step": 856
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 317.84375,
"completions/mean_terminated_length": 241.86956787109375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.6856,
"grad_norm": 3.551910638809204,
"kl": 0.109039306640625,
"learning_rate": 1e-06,
"loss": 0.0289,
"num_tokens": 11173291.0,
"reward": 0.20694701373577118,
"reward_std": 0.014496378600597382,
"rewards/bleu_reward_func/mean": 0.20694701373577118,
"rewards/bleu_reward_func/std": 0.2963625490665436,
"step": 857
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 147.125,
"completions/mean_terminated_length": 62.92308044433594,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.6864,
"grad_norm": 15.02341079711914,
"kl": 0.70330810546875,
"learning_rate": 1e-06,
"loss": 0.2886,
"num_tokens": 11182215.0,
"reward": 0.19951725006103516,
"reward_std": 0.052443791180849075,
"rewards/bleu_reward_func/mean": 0.19951725006103516,
"rewards/bleu_reward_func/std": 0.19433696568012238,
"step": 858
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 284.125,
"completions/mean_terminated_length": 128.2105255126953,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.6872,
"grad_norm": 8.070085525512695,
"kl": 0.110443115234375,
"learning_rate": 1e-06,
"loss": 0.1827,
"num_tokens": 11194667.0,
"reward": 0.04423338174819946,
"reward_std": 0.017294086515903473,
"rewards/bleu_reward_func/mean": 0.04423338174819946,
"rewards/bleu_reward_func/std": 0.047055598348379135,
"step": 859
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 329.71875,
"completions/mean_terminated_length": 287.65386962890625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.688,
"grad_norm": 2.8106324672698975,
"kl": 0.039154052734375,
"learning_rate": 1e-06,
"loss": -0.2058,
"num_tokens": 11207642.0,
"reward": 0.06786108016967773,
"reward_std": 0.0352618470788002,
"rewards/bleu_reward_func/mean": 0.06786108016967773,
"rewards/bleu_reward_func/std": 0.04090343415737152,
"step": 860
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 368.53125,
"completions/mean_terminated_length": 320.7083435058594,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.6888,
"grad_norm": 2.837979793548584,
"kl": 0.052001953125,
"learning_rate": 1e-06,
"loss": 0.107,
"num_tokens": 11221547.0,
"reward": 0.07621696591377258,
"reward_std": 0.029543904587626457,
"rewards/bleu_reward_func/mean": 0.07621696591377258,
"rewards/bleu_reward_func/std": 0.04072652757167816,
"step": 861
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 398.59375,
"completions/mean_terminated_length": 321.0,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.6896,
"grad_norm": 2.941206693649292,
"kl": 0.046844482421875,
"learning_rate": 1e-06,
"loss": -0.0139,
"num_tokens": 11237182.0,
"reward": 0.026391834020614624,
"reward_std": 0.016949903219938278,
"rewards/bleu_reward_func/mean": 0.026391834020614624,
"rewards/bleu_reward_func/std": 0.03409172222018242,
"step": 862
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 357.40625,
"completions/mean_terminated_length": 251.63157653808594,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.6904,
"grad_norm": 2.633358955383301,
"kl": 0.055328369140625,
"learning_rate": 1e-06,
"loss": -0.0911,
"num_tokens": 11251547.0,
"reward": 0.042053550481796265,
"reward_std": 0.021867552772164345,
"rewards/bleu_reward_func/mean": 0.042053550481796265,
"rewards/bleu_reward_func/std": 0.029616717249155045,
"step": 863
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 168.3125,
"completions/mean_terminated_length": 119.21429443359375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.6912,
"grad_norm": 4.309210777282715,
"kl": 0.13665771484375,
"learning_rate": 1e-06,
"loss": -0.1275,
"num_tokens": 11261245.0,
"reward": 0.1768188774585724,
"reward_std": 0.030298635363578796,
"rewards/bleu_reward_func/mean": 0.1768188774585724,
"rewards/bleu_reward_func/std": 0.12399855256080627,
"step": 864
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 252.78125,
"completions/mean_terminated_length": 151.3478240966797,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.692,
"grad_norm": 8.154788970947266,
"kl": 0.23638916015625,
"learning_rate": 1e-06,
"loss": 0.0438,
"num_tokens": 11275478.0,
"reward": 0.07366465032100677,
"reward_std": 0.029438909143209457,
"rewards/bleu_reward_func/mean": 0.07366465032100677,
"rewards/bleu_reward_func/std": 0.05699191242456436,
"step": 865
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 322.21875,
"completions/mean_terminated_length": 235.95455932617188,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6928,
"grad_norm": 2.933178663253784,
"kl": 0.04754638671875,
"learning_rate": 1e-06,
"loss": 0.0639,
"num_tokens": 11287317.0,
"reward": 0.041873324662446976,
"reward_std": 0.02685678005218506,
"rewards/bleu_reward_func/mean": 0.041873324662446976,
"rewards/bleu_reward_func/std": 0.039241958409547806,
"step": 866
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 208.1875,
"completions/mean_terminated_length": 89.30435180664062,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.6936,
"grad_norm": 6.312671661376953,
"kl": 0.1842041015625,
"learning_rate": 1e-06,
"loss": 0.1617,
"num_tokens": 11297835.0,
"reward": 0.103369802236557,
"reward_std": 0.04473632201552391,
"rewards/bleu_reward_func/mean": 0.103369802236557,
"rewards/bleu_reward_func/std": 0.10830661654472351,
"step": 867
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 274.375,
"completions/mean_terminated_length": 230.37037658691406,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.6944,
"grad_norm": 3.08828067779541,
"kl": 0.090667724609375,
"learning_rate": 1e-06,
"loss": -0.0115,
"num_tokens": 11312311.0,
"reward": 0.16378189623355865,
"reward_std": 0.0222244244068861,
"rewards/bleu_reward_func/mean": 0.16378189623355865,
"rewards/bleu_reward_func/std": 0.19553562998771667,
"step": 868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 216.15625,
"completions/mean_terminated_length": 117.54167175292969,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.6952,
"grad_norm": 5.62147331237793,
"kl": 0.15509033203125,
"learning_rate": 1e-06,
"loss": -0.0272,
"num_tokens": 11325348.0,
"reward": 0.059518001973629,
"reward_std": 0.028110869228839874,
"rewards/bleu_reward_func/mean": 0.059518001973629,
"rewards/bleu_reward_func/std": 0.048489734530448914,
"step": 869
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 404.46875,
"completions/mean_terminated_length": 296.9375,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.696,
"grad_norm": 3.3346071243286133,
"kl": 0.038360595703125,
"learning_rate": 1e-06,
"loss": -0.0205,
"num_tokens": 11343635.0,
"reward": 0.07933641970157623,
"reward_std": 0.021958988159894943,
"rewards/bleu_reward_func/mean": 0.07933641970157623,
"rewards/bleu_reward_func/std": 0.06096653267741203,
"step": 870
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 236.6875,
"completions/mean_terminated_length": 185.70370483398438,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6968,
"grad_norm": 4.446465015411377,
"kl": 0.17974853515625,
"learning_rate": 1e-06,
"loss": -0.0515,
"num_tokens": 11353817.0,
"reward": 0.11160654574632645,
"reward_std": 0.039265286177396774,
"rewards/bleu_reward_func/mean": 0.11160654574632645,
"rewards/bleu_reward_func/std": 0.08857923746109009,
"step": 871
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 260.125,
"completions/mean_terminated_length": 161.56521606445312,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.6976,
"grad_norm": 7.414572715759277,
"kl": 0.107696533203125,
"learning_rate": 1e-06,
"loss": -0.0123,
"num_tokens": 11367725.0,
"reward": 0.1780683994293213,
"reward_std": 0.015433109365403652,
"rewards/bleu_reward_func/mean": 0.1780683994293213,
"rewards/bleu_reward_func/std": 0.2229662984609604,
"step": 872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 408.0625,
"completions/mean_terminated_length": 360.8182067871094,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.6984,
"grad_norm": 3.227613925933838,
"kl": 0.0611572265625,
"learning_rate": 1e-06,
"loss": -0.0681,
"num_tokens": 11385423.0,
"reward": 0.0366949737071991,
"reward_std": 0.01884927786886692,
"rewards/bleu_reward_func/mean": 0.0366949737071991,
"rewards/bleu_reward_func/std": 0.028229771181941032,
"step": 873
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 222.21875,
"completions/mean_terminated_length": 168.55555725097656,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.6992,
"grad_norm": 7.29306697845459,
"kl": 0.142730712890625,
"learning_rate": 1e-06,
"loss": 0.11,
"num_tokens": 11397022.0,
"reward": 0.046667180955410004,
"reward_std": 0.020207617431879044,
"rewards/bleu_reward_func/mean": 0.046667180955410004,
"rewards/bleu_reward_func/std": 0.02555895410478115,
"step": 874
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 268.5,
"completions/mean_terminated_length": 173.21739196777344,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.7,
"grad_norm": 3.604617118835449,
"kl": 0.06298828125,
"learning_rate": 1e-06,
"loss": -0.3505,
"num_tokens": 11410046.0,
"reward": 0.07897455990314484,
"reward_std": 0.014880911447107792,
"rewards/bleu_reward_func/mean": 0.07897455990314484,
"rewards/bleu_reward_func/std": 0.08343996107578278,
"step": 875
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 325.46875,
"completions/mean_terminated_length": 227.76190185546875,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.7008,
"grad_norm": 8.727375030517578,
"kl": 0.133270263671875,
"learning_rate": 1e-06,
"loss": 0.3117,
"num_tokens": 11422725.0,
"reward": 0.07061035186052322,
"reward_std": 0.0419192910194397,
"rewards/bleu_reward_func/mean": 0.07061035186052322,
"rewards/bleu_reward_func/std": 0.07667659968137741,
"step": 876
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 181.375,
"completions/mean_terminated_length": 120.14814758300781,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7016,
"grad_norm": 7.317707061767578,
"kl": 0.21905517578125,
"learning_rate": 1e-06,
"loss": 0.4234,
"num_tokens": 11431937.0,
"reward": 0.10765747725963593,
"reward_std": 0.052248626947402954,
"rewards/bleu_reward_func/mean": 0.10765747725963593,
"rewards/bleu_reward_func/std": 0.05436404421925545,
"step": 877
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 220.75,
"completions/mean_terminated_length": 211.35482788085938,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.7024,
"grad_norm": 4.363486289978027,
"kl": 0.214019775390625,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 11443057.0,
"reward": 0.30547034740448,
"reward_std": 0.024015674367547035,
"rewards/bleu_reward_func/mean": 0.30547034740448,
"rewards/bleu_reward_func/std": 0.2281493991613388,
"step": 878
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 377.0,
"completions/mean_length": 205.46875,
"completions/mean_terminated_length": 119.63999938964844,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.7032,
"grad_norm": 8.484012603759766,
"kl": 0.189361572265625,
"learning_rate": 1e-06,
"loss": 0.1579,
"num_tokens": 11451232.0,
"reward": 0.0824245885014534,
"reward_std": 0.04487679526209831,
"rewards/bleu_reward_func/mean": 0.0824245885014534,
"rewards/bleu_reward_func/std": 0.07150331139564514,
"step": 879
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 141.375,
"completions/mean_terminated_length": 129.4193572998047,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.704,
"grad_norm": 5.958530902862549,
"kl": 0.1085205078125,
"learning_rate": 1e-06,
"loss": -0.0313,
"num_tokens": 11462604.0,
"reward": 0.04005417972803116,
"reward_std": 0.024934137240052223,
"rewards/bleu_reward_func/mean": 0.04005417972803116,
"rewards/bleu_reward_func/std": 0.03826345130801201,
"step": 880
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 283.03125,
"completions/mean_terminated_length": 126.36842346191406,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.7048,
"grad_norm": 6.075742244720459,
"kl": 0.3231201171875,
"learning_rate": 1e-06,
"loss": 0.0444,
"num_tokens": 11477461.0,
"reward": 0.10366402566432953,
"reward_std": 0.055370062589645386,
"rewards/bleu_reward_func/mean": 0.10366402566432953,
"rewards/bleu_reward_func/std": 0.11003145575523376,
"step": 881
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 361.0,
"completions/mean_terminated_length": 227.76470947265625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7056,
"grad_norm": 5.96100378036499,
"kl": 0.19482421875,
"learning_rate": 1e-06,
"loss": -0.0353,
"num_tokens": 11493365.0,
"reward": 0.1235186904668808,
"reward_std": 0.038026995956897736,
"rewards/bleu_reward_func/mean": 0.1235186904668808,
"rewards/bleu_reward_func/std": 0.05816841870546341,
"step": 882
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 172.09375,
"completions/mean_terminated_length": 109.14814758300781,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.7064,
"grad_norm": 6.117469787597656,
"kl": 0.201904296875,
"learning_rate": 1e-06,
"loss": 0.067,
"num_tokens": 11501456.0,
"reward": 0.15373189747333527,
"reward_std": 0.05197744071483612,
"rewards/bleu_reward_func/mean": 0.15373189747333527,
"rewards/bleu_reward_func/std": 0.10633216798305511,
"step": 883
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 263.15625,
"completions/mean_terminated_length": 193.47999572753906,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7072,
"grad_norm": 4.437005519866943,
"kl": 0.12554931640625,
"learning_rate": 1e-06,
"loss": 0.2681,
"num_tokens": 11517437.0,
"reward": 0.29476526379585266,
"reward_std": 0.13803553581237793,
"rewards/bleu_reward_func/mean": 0.29476526379585266,
"rewards/bleu_reward_func/std": 0.32065168023109436,
"step": 884
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 252.75,
"completions/mean_terminated_length": 134.90908813476562,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.708,
"grad_norm": 8.795902252197266,
"kl": 0.3934326171875,
"learning_rate": 1e-06,
"loss": -0.0341,
"num_tokens": 11529885.0,
"reward": 0.13262969255447388,
"reward_std": 0.037800293415784836,
"rewards/bleu_reward_func/mean": 0.13262969255447388,
"rewards/bleu_reward_func/std": 0.11564164608716965,
"step": 885
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 355.6875,
"completions/mean_terminated_length": 199.375,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.7088,
"grad_norm": 5.155429840087891,
"kl": 0.0558319091796875,
"learning_rate": 1e-06,
"loss": 0.0544,
"num_tokens": 11546155.0,
"reward": 0.10861489176750183,
"reward_std": 0.035860445350408554,
"rewards/bleu_reward_func/mean": 0.10861489176750183,
"rewards/bleu_reward_func/std": 0.08613201975822449,
"step": 886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 282.84375,
"completions/mean_terminated_length": 126.0526351928711,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.7096,
"grad_norm": 4.782761096954346,
"kl": 0.133056640625,
"learning_rate": 1e-06,
"loss": 0.0206,
"num_tokens": 11561390.0,
"reward": 0.0671025738120079,
"reward_std": 0.018492672592401505,
"rewards/bleu_reward_func/mean": 0.0671025738120079,
"rewards/bleu_reward_func/std": 0.06450604647397995,
"step": 887
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 331.6875,
"completions/mean_terminated_length": 261.13043212890625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.7104,
"grad_norm": 2.8767964839935303,
"kl": 0.06182861328125,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 11577356.0,
"reward": 0.1093081682920456,
"reward_std": 0.07805053889751434,
"rewards/bleu_reward_func/mean": 0.1093081682920456,
"rewards/bleu_reward_func/std": 0.17048169672489166,
"step": 888
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 397.375,
"completions/mean_terminated_length": 359.16668701171875,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.7112,
"grad_norm": 2.2424142360687256,
"kl": 0.0458984375,
"learning_rate": 1e-06,
"loss": -0.015,
"num_tokens": 11591432.0,
"reward": 0.06777183711528778,
"reward_std": 0.019787484779953957,
"rewards/bleu_reward_func/mean": 0.06777183711528778,
"rewards/bleu_reward_func/std": 0.041765324771404266,
"step": 889
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 292.59375,
"completions/mean_terminated_length": 160.9499969482422,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.712,
"grad_norm": 4.390679359436035,
"kl": 0.0987548828125,
"learning_rate": 1e-06,
"loss": 0.2246,
"num_tokens": 11603515.0,
"reward": 0.06538625806570053,
"reward_std": 0.03718053176999092,
"rewards/bleu_reward_func/mean": 0.06538625806570053,
"rewards/bleu_reward_func/std": 0.0816822499036789,
"step": 890
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 188.875,
"completions/mean_terminated_length": 155.44827270507812,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.7128,
"grad_norm": 5.5804290771484375,
"kl": 0.123260498046875,
"learning_rate": 1e-06,
"loss": -0.2822,
"num_tokens": 11612927.0,
"reward": 0.0781329870223999,
"reward_std": 0.049637503921985626,
"rewards/bleu_reward_func/mean": 0.0781329870223999,
"rewards/bleu_reward_func/std": 0.08602513372898102,
"step": 891
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 332.96875,
"completions/mean_terminated_length": 282.8399963378906,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.7136,
"grad_norm": 2.8855247497558594,
"kl": 0.0643310546875,
"learning_rate": 1e-06,
"loss": 0.1736,
"num_tokens": 11625662.0,
"reward": 0.03828759491443634,
"reward_std": 0.024871867150068283,
"rewards/bleu_reward_func/mean": 0.03828759491443634,
"rewards/bleu_reward_func/std": 0.03181852772831917,
"step": 892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 322.96875,
"completions/mean_terminated_length": 223.952392578125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.7144,
"grad_norm": 8.224991798400879,
"kl": 0.19036865234375,
"learning_rate": 1e-06,
"loss": 0.1601,
"num_tokens": 11642029.0,
"reward": 0.03835766017436981,
"reward_std": 0.013130895793437958,
"rewards/bleu_reward_func/mean": 0.03835766017436981,
"rewards/bleu_reward_func/std": 0.024478256702423096,
"step": 893
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 205.46875,
"completions/mean_terminated_length": 148.70370483398438,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.7152,
"grad_norm": 6.305431842803955,
"kl": 0.14776611328125,
"learning_rate": 1e-06,
"loss": -0.0135,
"num_tokens": 11657452.0,
"reward": 0.11420266330242157,
"reward_std": 0.04108916223049164,
"rewards/bleu_reward_func/mean": 0.11420266330242157,
"rewards/bleu_reward_func/std": 0.06337518244981766,
"step": 894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 336.71875,
"completions/mean_terminated_length": 200.38888549804688,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.716,
"grad_norm": 4.193768501281738,
"kl": 0.13232421875,
"learning_rate": 1e-06,
"loss": 0.3397,
"num_tokens": 11672971.0,
"reward": 0.07947193086147308,
"reward_std": 0.04811304062604904,
"rewards/bleu_reward_func/mean": 0.07947193086147308,
"rewards/bleu_reward_func/std": 0.10142233967781067,
"step": 895
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 123.09375,
"completions/mean_terminated_length": 97.16667175292969,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7168,
"grad_norm": 9.166439056396484,
"kl": 0.28271484375,
"learning_rate": 1e-06,
"loss": -0.1089,
"num_tokens": 11684334.0,
"reward": 0.27329233288764954,
"reward_std": 0.059711530804634094,
"rewards/bleu_reward_func/mean": 0.27329233288764954,
"rewards/bleu_reward_func/std": 0.1879579871892929,
"step": 896
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 246.0,
"completions/mean_terminated_length": 157.33334350585938,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7176,
"grad_norm": 4.767898082733154,
"kl": 0.112060546875,
"learning_rate": 1e-06,
"loss": 0.0037,
"num_tokens": 11700486.0,
"reward": 0.07844039797782898,
"reward_std": 0.034808725118637085,
"rewards/bleu_reward_func/mean": 0.07844039797782898,
"rewards/bleu_reward_func/std": 0.0884510949254036,
"step": 897
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 291.25,
"completions/mean_terminated_length": 140.2105255126953,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.7184,
"grad_norm": 5.6404266357421875,
"kl": 0.126312255859375,
"learning_rate": 1e-06,
"loss": 0.0453,
"num_tokens": 11712438.0,
"reward": 0.07097087800502777,
"reward_std": 0.03667715564370155,
"rewards/bleu_reward_func/mean": 0.07097087800502777,
"rewards/bleu_reward_func/std": 0.08086320012807846,
"step": 898
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 264.6875,
"completions/mean_terminated_length": 229.35714721679688,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.7192,
"grad_norm": 4.8742499351501465,
"kl": 0.10357666015625,
"learning_rate": 1e-06,
"loss": 0.1854,
"num_tokens": 11725396.0,
"reward": 0.21620362997055054,
"reward_std": 0.07608456909656525,
"rewards/bleu_reward_func/mean": 0.21620362997055054,
"rewards/bleu_reward_func/std": 0.2514094114303589,
"step": 899
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 329.46875,
"completions/mean_terminated_length": 268.625,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.72,
"grad_norm": 4.756425857543945,
"kl": 0.10198974609375,
"learning_rate": 1e-06,
"loss": 0.0469,
"num_tokens": 11738307.0,
"reward": 0.053835704922676086,
"reward_std": 0.012895071879029274,
"rewards/bleu_reward_func/mean": 0.053835704922676086,
"rewards/bleu_reward_func/std": 0.03540419042110443,
"step": 900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 401.65625,
"completions/mean_terminated_length": 259.7857360839844,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.7208,
"grad_norm": 3.1605112552642822,
"kl": 0.04974365234375,
"learning_rate": 1e-06,
"loss": 0.2297,
"num_tokens": 11756496.0,
"reward": 0.24388237297534943,
"reward_std": 0.1161736249923706,
"rewards/bleu_reward_func/mean": 0.24388237297534943,
"rewards/bleu_reward_func/std": 0.3413524627685547,
"step": 901
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 271.59375,
"completions/mean_terminated_length": 246.72413635253906,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7216,
"grad_norm": 4.188157081604004,
"kl": 0.1815185546875,
"learning_rate": 1e-06,
"loss": -0.0606,
"num_tokens": 11768691.0,
"reward": 0.12793870270252228,
"reward_std": 0.04022746905684471,
"rewards/bleu_reward_func/mean": 0.12793870270252228,
"rewards/bleu_reward_func/std": 0.15937677025794983,
"step": 902
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 282.65625,
"completions/mean_terminated_length": 178.4091033935547,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.7224,
"grad_norm": 4.868112087249756,
"kl": 0.1282958984375,
"learning_rate": 1e-06,
"loss": -0.0395,
"num_tokens": 11780800.0,
"reward": 0.09620735794305801,
"reward_std": 0.021982625126838684,
"rewards/bleu_reward_func/mean": 0.09620735794305801,
"rewards/bleu_reward_func/std": 0.07161340862512589,
"step": 903
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 277.09375,
"completions/mean_terminated_length": 222.88462829589844,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.7232,
"grad_norm": 7.519046783447266,
"kl": 0.212158203125,
"learning_rate": 1e-06,
"loss": -0.1487,
"num_tokens": 11795403.0,
"reward": 0.07747071981430054,
"reward_std": 0.03376290947198868,
"rewards/bleu_reward_func/mean": 0.07747071981430054,
"rewards/bleu_reward_func/std": 0.055931881070137024,
"step": 904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 253.4375,
"completions/mean_terminated_length": 167.25,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.724,
"grad_norm": 8.574625015258789,
"kl": 0.1490478515625,
"learning_rate": 1e-06,
"loss": 0.0058,
"num_tokens": 11806985.0,
"reward": 0.06105317175388336,
"reward_std": 0.019554441794753075,
"rewards/bleu_reward_func/mean": 0.06105317175388336,
"rewards/bleu_reward_func/std": 0.038146011531353,
"step": 905
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 407.0,
"completions/mean_length": 72.1875,
"completions/mean_terminated_length": 58.0,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7248,
"grad_norm": 10.871319770812988,
"kl": 0.32177734375,
"learning_rate": 1e-06,
"loss": 0.2813,
"num_tokens": 11812335.0,
"reward": 0.09286689758300781,
"reward_std": 0.02634507045149803,
"rewards/bleu_reward_func/mean": 0.09286689758300781,
"rewards/bleu_reward_func/std": 0.04922043904662132,
"step": 906
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 239.3125,
"completions/mean_terminated_length": 200.35714721679688,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7256,
"grad_norm": 23.129505157470703,
"kl": 0.1500244140625,
"learning_rate": 1e-06,
"loss": 0.0482,
"num_tokens": 11828945.0,
"reward": 0.07308115810155869,
"reward_std": 0.014882557094097137,
"rewards/bleu_reward_func/mean": 0.07308115810155869,
"rewards/bleu_reward_func/std": 0.08316269516944885,
"step": 907
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 429.8125,
"completions/mean_terminated_length": 292.8333435058594,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.7264,
"grad_norm": 11.226503372192383,
"kl": 0.059967041015625,
"learning_rate": 1e-06,
"loss": 0.0269,
"num_tokens": 11848283.0,
"reward": 0.04945487529039383,
"reward_std": 0.016689039766788483,
"rewards/bleu_reward_func/mean": 0.04945487529039383,
"rewards/bleu_reward_func/std": 0.04881744086742401,
"step": 908
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 225.09375,
"completions/mean_terminated_length": 129.45834350585938,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7272,
"grad_norm": 8.629831314086914,
"kl": 0.2176055908203125,
"learning_rate": 1e-06,
"loss": 0.5547,
"num_tokens": 11858854.0,
"reward": 0.14837728440761566,
"reward_std": 0.06372867524623871,
"rewards/bleu_reward_func/mean": 0.14837728440761566,
"rewards/bleu_reward_func/std": 0.18777750432491302,
"step": 909
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 262.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 74.25,
"completions/mean_terminated_length": 74.25,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.728,
"grad_norm": 13.281477928161621,
"kl": 0.438720703125,
"learning_rate": 1e-06,
"loss": 0.0534,
"num_tokens": 11869046.0,
"reward": 0.20467601716518402,
"reward_std": 0.04131526127457619,
"rewards/bleu_reward_func/mean": 0.20467601716518402,
"rewards/bleu_reward_func/std": 0.14035604894161224,
"step": 910
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 341.625,
"completions/mean_terminated_length": 148.53334045410156,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7288,
"grad_norm": 6.93421745300293,
"kl": 0.177642822265625,
"learning_rate": 1e-06,
"loss": 0.0368,
"num_tokens": 11883586.0,
"reward": 0.18407613039016724,
"reward_std": 0.020998071879148483,
"rewards/bleu_reward_func/mean": 0.18407613039016724,
"rewards/bleu_reward_func/std": 0.2021336704492569,
"step": 911
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 222.1875,
"completions/mean_terminated_length": 168.51852416992188,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7296,
"grad_norm": 4.430690765380859,
"kl": 0.18560791015625,
"learning_rate": 1e-06,
"loss": 0.1092,
"num_tokens": 11894448.0,
"reward": 0.1439959555864334,
"reward_std": 0.04086273908615112,
"rewards/bleu_reward_func/mean": 0.1439959555864334,
"rewards/bleu_reward_func/std": 0.1705217957496643,
"step": 912
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 397.25,
"completions/mean_terminated_length": 345.0909118652344,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.7304,
"grad_norm": 2.7031519412994385,
"kl": 0.05963134765625,
"learning_rate": 1e-06,
"loss": 0.0584,
"num_tokens": 11909408.0,
"reward": 0.09079495072364807,
"reward_std": 0.021243298426270485,
"rewards/bleu_reward_func/mean": 0.09079495072364807,
"rewards/bleu_reward_func/std": 0.1052529513835907,
"step": 913
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 282.9375,
"completions/mean_terminated_length": 126.21052551269531,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.7312,
"grad_norm": 6.302535057067871,
"kl": 0.17462158203125,
"learning_rate": 1e-06,
"loss": -0.1643,
"num_tokens": 11923182.0,
"reward": 0.10389965772628784,
"reward_std": 0.03838275372982025,
"rewards/bleu_reward_func/mean": 0.10389965772628784,
"rewards/bleu_reward_func/std": 0.10838860273361206,
"step": 914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 411.4375,
"completions/mean_terminated_length": 282.14288330078125,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.732,
"grad_norm": 2.603992223739624,
"kl": 0.052398681640625,
"learning_rate": 1e-06,
"loss": -0.0025,
"num_tokens": 11939596.0,
"reward": 0.06734529137611389,
"reward_std": 0.0207513514906168,
"rewards/bleu_reward_func/mean": 0.06734529137611389,
"rewards/bleu_reward_func/std": 0.05821956321597099,
"step": 915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 111.90625,
"completions/mean_terminated_length": 111.90625,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.7328,
"grad_norm": 15.395035743713379,
"kl": 0.2227783203125,
"learning_rate": 1e-06,
"loss": 0.3837,
"num_tokens": 11947113.0,
"reward": 0.24101027846336365,
"reward_std": 0.07465855032205582,
"rewards/bleu_reward_func/mean": 0.24101027846336365,
"rewards/bleu_reward_func/std": 0.17581383883953094,
"step": 916
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 312.90625,
"completions/mean_terminated_length": 176.68421936035156,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.7336,
"grad_norm": 8.670394897460938,
"kl": 0.054595947265625,
"learning_rate": 1e-06,
"loss": -0.1736,
"num_tokens": 11965846.0,
"reward": 0.2349693477153778,
"reward_std": 0.042255695909261703,
"rewards/bleu_reward_func/mean": 0.2349693477153778,
"rewards/bleu_reward_func/std": 0.37363162636756897,
"step": 917
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 384.34375,
"completions/mean_terminated_length": 317.4761962890625,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.7344,
"grad_norm": 2.5599606037139893,
"kl": 0.06549072265625,
"learning_rate": 1e-06,
"loss": -0.0089,
"num_tokens": 11980945.0,
"reward": 0.061901748180389404,
"reward_std": 0.02856561914086342,
"rewards/bleu_reward_func/mean": 0.061901748180389404,
"rewards/bleu_reward_func/std": 0.04196527600288391,
"step": 918
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 203.9375,
"completions/mean_terminated_length": 146.88888549804688,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.7352,
"grad_norm": 4.163308143615723,
"kl": 0.08990478515625,
"learning_rate": 1e-06,
"loss": -0.0627,
"num_tokens": 11994255.0,
"reward": 0.06285493075847626,
"reward_std": 0.027241935953497887,
"rewards/bleu_reward_func/mean": 0.06285493075847626,
"rewards/bleu_reward_func/std": 0.03245123475790024,
"step": 919
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 445.90625,
"completions/mean_terminated_length": 406.25,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"epoch": 0.736,
"grad_norm": 2.6394991874694824,
"kl": 0.0596923828125,
"learning_rate": 1e-06,
"loss": -0.0646,
"num_tokens": 12010980.0,
"reward": 0.05676237493753433,
"reward_std": 0.014085400849580765,
"rewards/bleu_reward_func/mean": 0.05676237493753433,
"rewards/bleu_reward_func/std": 0.03611414507031441,
"step": 920
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 286.0,
"completions/mean_length": 106.0625,
"completions/mean_terminated_length": 92.96774291992188,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.7368,
"grad_norm": 8.88719654083252,
"kl": 0.19818115234375,
"learning_rate": 1e-06,
"loss": -0.1221,
"num_tokens": 12017206.0,
"reward": 0.08727812767028809,
"reward_std": 0.05162365734577179,
"rewards/bleu_reward_func/mean": 0.08727812767028809,
"rewards/bleu_reward_func/std": 0.07182831317186356,
"step": 921
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 195.25,
"completions/mean_terminated_length": 162.48275756835938,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7376,
"grad_norm": 7.483645439147949,
"kl": 0.206787109375,
"learning_rate": 1e-06,
"loss": -0.0919,
"num_tokens": 12028214.0,
"reward": 0.18070882558822632,
"reward_std": 0.04944847524166107,
"rewards/bleu_reward_func/mean": 0.18070882558822632,
"rewards/bleu_reward_func/std": 0.19004972279071808,
"step": 922
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 285.75,
"completions/mean_terminated_length": 130.94737243652344,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.7384,
"grad_norm": 4.784849643707275,
"kl": 0.07586669921875,
"learning_rate": 1e-06,
"loss": 0.131,
"num_tokens": 12042278.0,
"reward": 0.05333679914474487,
"reward_std": 0.03152618184685707,
"rewards/bleu_reward_func/mean": 0.05333679914474487,
"rewards/bleu_reward_func/std": 0.055619917809963226,
"step": 923
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 51.0,
"completions/mean_length": 300.0625,
"completions/mean_terminated_length": 27.571430206298828,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.7392,
"grad_norm": 5.440861701965332,
"kl": 0.145477294921875,
"learning_rate": 1e-06,
"loss": 0.0245,
"num_tokens": 12056416.0,
"reward": 0.14792697131633759,
"reward_std": 0.02701294980943203,
"rewards/bleu_reward_func/mean": 0.14792697131633759,
"rewards/bleu_reward_func/std": 0.15142259001731873,
"step": 924
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 286.15625,
"completions/mean_terminated_length": 271.1000061035156,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.74,
"grad_norm": 3.949329137802124,
"kl": 0.085693359375,
"learning_rate": 1e-06,
"loss": 0.051,
"num_tokens": 12069725.0,
"reward": 0.07858790457248688,
"reward_std": 0.02233020029962063,
"rewards/bleu_reward_func/mean": 0.07858790457248688,
"rewards/bleu_reward_func/std": 0.07242675125598907,
"step": 925
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 128.75,
"completions/mean_terminated_length": 89.10344696044922,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7408,
"grad_norm": 11.58963394165039,
"kl": 0.43695068359375,
"learning_rate": 1e-06,
"loss": 0.0795,
"num_tokens": 12080085.0,
"reward": 0.11042273789644241,
"reward_std": 0.017381731420755386,
"rewards/bleu_reward_func/mean": 0.11042273789644241,
"rewards/bleu_reward_func/std": 0.04870026186108589,
"step": 926
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 295.40625,
"completions/mean_terminated_length": 126.94444274902344,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7416,
"grad_norm": 9.117962837219238,
"kl": 0.22723388671875,
"learning_rate": 1e-06,
"loss": -0.4999,
"num_tokens": 12092682.0,
"reward": 0.03860364854335785,
"reward_std": 0.019805099815130234,
"rewards/bleu_reward_func/mean": 0.03860364854335785,
"rewards/bleu_reward_func/std": 0.024968957528471947,
"step": 927
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 297.71875,
"completions/mean_terminated_length": 200.3181915283203,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.7424,
"grad_norm": 3.2410967350006104,
"kl": 0.1171875,
"learning_rate": 1e-06,
"loss": -0.0166,
"num_tokens": 12107465.0,
"reward": 0.16988132894039154,
"reward_std": 0.03467182815074921,
"rewards/bleu_reward_func/mean": 0.16988132894039154,
"rewards/bleu_reward_func/std": 0.1373591423034668,
"step": 928
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 355.4375,
"completions/mean_terminated_length": 233.6666717529297,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7432,
"grad_norm": 14.617587089538574,
"kl": 0.15447998046875,
"learning_rate": 1e-06,
"loss": 0.0284,
"num_tokens": 12121815.0,
"reward": 0.14590570330619812,
"reward_std": 0.026923291385173798,
"rewards/bleu_reward_func/mean": 0.14590570330619812,
"rewards/bleu_reward_func/std": 0.2141415923833847,
"step": 929
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 262.78125,
"completions/mean_terminated_length": 227.1785888671875,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.744,
"grad_norm": 3.273005962371826,
"kl": 0.1136474609375,
"learning_rate": 1e-06,
"loss": -0.2043,
"num_tokens": 12136200.0,
"reward": 0.08558979630470276,
"reward_std": 0.03035646863281727,
"rewards/bleu_reward_func/mean": 0.08558979630470276,
"rewards/bleu_reward_func/std": 0.0643271803855896,
"step": 930
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 285.78125,
"completions/mean_terminated_length": 150.0500030517578,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.7448,
"grad_norm": 6.722025394439697,
"kl": 0.117340087890625,
"learning_rate": 1e-06,
"loss": 0.0744,
"num_tokens": 12147369.0,
"reward": 0.071571946144104,
"reward_std": 0.020615192130208015,
"rewards/bleu_reward_func/mean": 0.071571946144104,
"rewards/bleu_reward_func/std": 0.06541716307401657,
"step": 931
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 391.03125,
"completions/mean_terminated_length": 357.1600036621094,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.7456,
"grad_norm": 2.371354103088379,
"kl": 0.039031982421875,
"learning_rate": 1e-06,
"loss": -0.0517,
"num_tokens": 12165498.0,
"reward": 0.07511453330516815,
"reward_std": 0.020994337275624275,
"rewards/bleu_reward_func/mean": 0.07511453330516815,
"rewards/bleu_reward_func/std": 0.043336208909749985,
"step": 932
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 321.59375,
"completions/mean_terminated_length": 235.0454559326172,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.7464,
"grad_norm": 2.6573996543884277,
"kl": 0.0595703125,
"learning_rate": 1e-06,
"loss": 0.2007,
"num_tokens": 12178413.0,
"reward": 0.07766060531139374,
"reward_std": 0.030490310862660408,
"rewards/bleu_reward_func/mean": 0.07766060531139374,
"rewards/bleu_reward_func/std": 0.05291305482387543,
"step": 933
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 389.9375,
"completions/mean_terminated_length": 326.0,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.7472,
"grad_norm": 2.6120660305023193,
"kl": 0.0733642578125,
"learning_rate": 1e-06,
"loss": 0.0789,
"num_tokens": 12194507.0,
"reward": 0.10922634601593018,
"reward_std": 0.0321655347943306,
"rewards/bleu_reward_func/mean": 0.10922634601593018,
"rewards/bleu_reward_func/std": 0.10983148962259293,
"step": 934
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 228.21875,
"completions/mean_terminated_length": 198.86207580566406,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.748,
"grad_norm": 7.4974236488342285,
"kl": 0.24505615234375,
"learning_rate": 1e-06,
"loss": -0.0333,
"num_tokens": 12206842.0,
"reward": 0.16729718446731567,
"reward_std": 0.050741568207740784,
"rewards/bleu_reward_func/mean": 0.16729718446731567,
"rewards/bleu_reward_func/std": 0.2129126340150833,
"step": 935
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 248.5625,
"completions/mean_terminated_length": 90.5,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7488,
"grad_norm": 8.694432258605957,
"kl": 0.124114990234375,
"learning_rate": 1e-06,
"loss": 0.0072,
"num_tokens": 12216596.0,
"reward": 0.10160160809755325,
"reward_std": 0.03439757227897644,
"rewards/bleu_reward_func/mean": 0.10160160809755325,
"rewards/bleu_reward_func/std": 0.06317181140184402,
"step": 936
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 173.75,
"completions/mean_terminated_length": 79.04000091552734,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.7496,
"grad_norm": 7.92829704284668,
"kl": 0.2811279296875,
"learning_rate": 1e-06,
"loss": -0.2142,
"num_tokens": 12226292.0,
"reward": 0.11500123143196106,
"reward_std": 0.030234824866056442,
"rewards/bleu_reward_func/mean": 0.11500123143196106,
"rewards/bleu_reward_func/std": 0.12273158878087997,
"step": 937
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 234.875,
"completions/mean_terminated_length": 170.92308044433594,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.7504,
"grad_norm": 8.183011054992676,
"kl": 0.261383056640625,
"learning_rate": 1e-06,
"loss": 0.017,
"num_tokens": 12238216.0,
"reward": 0.39511436223983765,
"reward_std": 0.1106102392077446,
"rewards/bleu_reward_func/mean": 0.39511436223983765,
"rewards/bleu_reward_func/std": 0.3091021776199341,
"step": 938
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 315.6875,
"completions/mean_terminated_length": 163.0,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.7512,
"grad_norm": 4.409646511077881,
"kl": 0.092132568359375,
"learning_rate": 1e-06,
"loss": -0.0114,
"num_tokens": 12254990.0,
"reward": 0.1955508440732956,
"reward_std": 0.016137830913066864,
"rewards/bleu_reward_func/mean": 0.1955508440732956,
"rewards/bleu_reward_func/std": 0.26973703503608704,
"step": 939
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 424.0,
"completions/mean_length": 276.875,
"completions/mean_terminated_length": 69.4117660522461,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.752,
"grad_norm": 7.45650577545166,
"kl": 0.2735595703125,
"learning_rate": 1e-06,
"loss": -0.0045,
"num_tokens": 12268242.0,
"reward": 0.07165145874023438,
"reward_std": 0.020489612594246864,
"rewards/bleu_reward_func/mean": 0.07165145874023438,
"rewards/bleu_reward_func/std": 0.04259462654590607,
"step": 940
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 150.8125,
"completions/mean_terminated_length": 113.44827270507812,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.7528,
"grad_norm": 10.180867195129395,
"kl": 0.4688720703125,
"learning_rate": 1e-06,
"loss": 0.122,
"num_tokens": 12276916.0,
"reward": 0.15257704257965088,
"reward_std": 0.051439568400382996,
"rewards/bleu_reward_func/mean": 0.15257704257965088,
"rewards/bleu_reward_func/std": 0.11688338220119476,
"step": 941
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 270.65625,
"completions/mean_terminated_length": 176.21739196777344,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7536,
"grad_norm": 5.519646167755127,
"kl": 0.07073974609375,
"learning_rate": 1e-06,
"loss": 0.437,
"num_tokens": 12291593.0,
"reward": 0.06806058436632156,
"reward_std": 0.05050808936357498,
"rewards/bleu_reward_func/mean": 0.06806058436632156,
"rewards/bleu_reward_func/std": 0.06130353361368179,
"step": 942
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 405.625,
"completions/mean_terminated_length": 375.8399963378906,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.7544,
"grad_norm": 1.8502905368804932,
"kl": 0.0356292724609375,
"learning_rate": 1e-06,
"loss": 0.0076,
"num_tokens": 12310893.0,
"reward": 0.21372246742248535,
"reward_std": 0.0709368884563446,
"rewards/bleu_reward_func/mean": 0.21372246742248535,
"rewards/bleu_reward_func/std": 0.1763986349105835,
"step": 943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 300.0,
"completions/mean_length": 245.625,
"completions/mean_terminated_length": 141.3913116455078,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.7552,
"grad_norm": 3.3902478218078613,
"kl": 0.05377197265625,
"learning_rate": 1e-06,
"loss": 0.076,
"num_tokens": 12323369.0,
"reward": 0.0433628112077713,
"reward_std": 0.03261272981762886,
"rewards/bleu_reward_func/mean": 0.0433628112077713,
"rewards/bleu_reward_func/std": 0.0436432845890522,
"step": 944
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 270.0625,
"completions/mean_terminated_length": 189.4166717529297,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.756,
"grad_norm": 5.770748615264893,
"kl": 0.088531494140625,
"learning_rate": 1e-06,
"loss": 0.0047,
"num_tokens": 12333803.0,
"reward": 0.10272043943405151,
"reward_std": 0.03215545043349266,
"rewards/bleu_reward_func/mean": 0.10272043943405151,
"rewards/bleu_reward_func/std": 0.11694183200597763,
"step": 945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 224.25,
"completions/mean_terminated_length": 194.48275756835938,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7568,
"grad_norm": 7.7613067626953125,
"kl": 0.20135498046875,
"learning_rate": 1e-06,
"loss": 0.1591,
"num_tokens": 12344915.0,
"reward": 0.08291373401880264,
"reward_std": 0.024335253983736038,
"rewards/bleu_reward_func/mean": 0.08291373401880264,
"rewards/bleu_reward_func/std": 0.03890189528465271,
"step": 946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 455.6875,
"completions/mean_terminated_length": 373.3846435546875,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.7576,
"grad_norm": 2.599454879760742,
"kl": 0.0421142578125,
"learning_rate": 1e-06,
"loss": -0.0235,
"num_tokens": 12362161.0,
"reward": 0.03875226154923439,
"reward_std": 0.020147912204265594,
"rewards/bleu_reward_func/mean": 0.03875226154923439,
"rewards/bleu_reward_func/std": 0.023408547043800354,
"step": 947
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 197.875,
"completions/mean_terminated_length": 125.3846206665039,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.7584,
"grad_norm": 6.522151947021484,
"kl": 0.1690673828125,
"learning_rate": 1e-06,
"loss": 0.1959,
"num_tokens": 12375709.0,
"reward": 0.2021377682685852,
"reward_std": 0.0921662300825119,
"rewards/bleu_reward_func/mean": 0.2021377682685852,
"rewards/bleu_reward_func/std": 0.28283461928367615,
"step": 948
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 256.21875,
"completions/mean_terminated_length": 122.23809814453125,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.7592,
"grad_norm": 5.671032905578613,
"kl": 0.1173095703125,
"learning_rate": 1e-06,
"loss": -0.0303,
"num_tokens": 12385764.0,
"reward": 0.0564446821808815,
"reward_std": 0.02071106806397438,
"rewards/bleu_reward_func/mean": 0.0564446821808815,
"rewards/bleu_reward_func/std": 0.030088067054748535,
"step": 949
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 255.59375,
"completions/mean_terminated_length": 229.0689697265625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.76,
"grad_norm": 6.897347927093506,
"kl": 0.2220458984375,
"learning_rate": 1e-06,
"loss": 0.1234,
"num_tokens": 12396999.0,
"reward": 0.09963001310825348,
"reward_std": 0.05010713264346123,
"rewards/bleu_reward_func/mean": 0.09963001310825348,
"rewards/bleu_reward_func/std": 0.08052106946706772,
"step": 950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 456.5,
"completions/mean_terminated_length": 418.52630615234375,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.7608,
"grad_norm": 2.444167137145996,
"kl": 0.04534912109375,
"learning_rate": 1e-06,
"loss": -0.0097,
"num_tokens": 12418039.0,
"reward": 0.03447666019201279,
"reward_std": 0.01355208083987236,
"rewards/bleu_reward_func/mean": 0.03447666019201279,
"rewards/bleu_reward_func/std": 0.022434458136558533,
"step": 951
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 279.5625,
"completions/mean_terminated_length": 120.52631378173828,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.7616,
"grad_norm": 8.821101188659668,
"kl": 0.091888427734375,
"learning_rate": 1e-06,
"loss": 0.1929,
"num_tokens": 12431177.0,
"reward": 0.11506980657577515,
"reward_std": 0.033062804490327835,
"rewards/bleu_reward_func/mean": 0.11506980657577515,
"rewards/bleu_reward_func/std": 0.0943976491689682,
"step": 952
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 269.78125,
"completions/mean_terminated_length": 201.95999145507812,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.7624,
"grad_norm": 6.004775524139404,
"kl": 0.07818603515625,
"learning_rate": 1e-06,
"loss": -0.1255,
"num_tokens": 12445970.0,
"reward": 0.09985020756721497,
"reward_std": 0.0198547150939703,
"rewards/bleu_reward_func/mean": 0.09985020756721497,
"rewards/bleu_reward_func/std": 0.08852815628051758,
"step": 953
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 318.8125,
"completions/mean_terminated_length": 231.0,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.7632,
"grad_norm": 6.8956804275512695,
"kl": 0.175567626953125,
"learning_rate": 1e-06,
"loss": 0.0679,
"num_tokens": 12458004.0,
"reward": 0.1692689061164856,
"reward_std": 0.03958010673522949,
"rewards/bleu_reward_func/mean": 0.1692689061164856,
"rewards/bleu_reward_func/std": 0.13855873048305511,
"step": 954
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 250.9375,
"completions/mean_terminated_length": 148.78260803222656,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.764,
"grad_norm": 6.749716758728027,
"kl": 0.293182373046875,
"learning_rate": 1e-06,
"loss": 0.021,
"num_tokens": 12472626.0,
"reward": 0.1224028617143631,
"reward_std": 0.027801956981420517,
"rewards/bleu_reward_func/mean": 0.1224028617143631,
"rewards/bleu_reward_func/std": 0.07426659762859344,
"step": 955
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 147.6875,
"completions/mean_terminated_length": 123.40000915527344,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7648,
"grad_norm": 9.662991523742676,
"kl": 0.46905517578125,
"learning_rate": 1e-06,
"loss": 0.0171,
"num_tokens": 12481928.0,
"reward": 0.23031684756278992,
"reward_std": 0.0920054167509079,
"rewards/bleu_reward_func/mean": 0.23031684756278992,
"rewards/bleu_reward_func/std": 0.16612249612808228,
"step": 956
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 258.65625,
"completions/mean_terminated_length": 106.6500015258789,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7656,
"grad_norm": 10.25383472442627,
"kl": 0.2713623046875,
"learning_rate": 1e-06,
"loss": 0.0254,
"num_tokens": 12493805.0,
"reward": 0.15187731385231018,
"reward_std": 0.025371436029672623,
"rewards/bleu_reward_func/mean": 0.15187731385231018,
"rewards/bleu_reward_func/std": 0.12905065715312958,
"step": 957
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 85.40625,
"completions/mean_terminated_length": 71.64515686035156,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.7664,
"grad_norm": 8.22080135345459,
"kl": 0.393310546875,
"learning_rate": 1e-06,
"loss": 0.3247,
"num_tokens": 12502818.0,
"reward": 0.29921823740005493,
"reward_std": 0.11694261431694031,
"rewards/bleu_reward_func/mean": 0.29921823740005493,
"rewards/bleu_reward_func/std": 0.25639036297798157,
"step": 958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 386.09375,
"completions/mean_terminated_length": 260.1875,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.7672,
"grad_norm": 2.847195863723755,
"kl": 0.0467529296875,
"learning_rate": 1e-06,
"loss": 0.0852,
"num_tokens": 12520317.0,
"reward": 0.039544593542814255,
"reward_std": 0.016648683696985245,
"rewards/bleu_reward_func/mean": 0.039544593542814255,
"rewards/bleu_reward_func/std": 0.034897446632385254,
"step": 959
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 111.4375,
"completions/mean_terminated_length": 111.4375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.768,
"grad_norm": 8.353513717651367,
"kl": 0.22430419921875,
"learning_rate": 1e-06,
"loss": 0.2469,
"num_tokens": 12530027.0,
"reward": 0.26215416193008423,
"reward_std": 0.032358862459659576,
"rewards/bleu_reward_func/mean": 0.26215416193008423,
"rewards/bleu_reward_func/std": 0.22925570607185364,
"step": 960
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 456.6875,
"completions/mean_terminated_length": 315.3333435058594,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.7688,
"grad_norm": 2.2755558490753174,
"kl": 0.033355712890625,
"learning_rate": 1e-06,
"loss": 0.1802,
"num_tokens": 12548593.0,
"reward": 0.02548890933394432,
"reward_std": 0.01250866986811161,
"rewards/bleu_reward_func/mean": 0.02548890933394432,
"rewards/bleu_reward_func/std": 0.0143959391862154,
"step": 961
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 359.28125,
"completions/mean_terminated_length": 206.5625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.7696,
"grad_norm": 5.7301130294799805,
"kl": 0.165802001953125,
"learning_rate": 1e-06,
"loss": 0.0633,
"num_tokens": 12566850.0,
"reward": 0.09463217854499817,
"reward_std": 0.021320462226867676,
"rewards/bleu_reward_func/mean": 0.09463217854499817,
"rewards/bleu_reward_func/std": 0.10299301147460938,
"step": 962
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 388.125,
"completions/mean_terminated_length": 313.8000183105469,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.7704,
"grad_norm": 2.5917232036590576,
"kl": 0.07171630859375,
"learning_rate": 1e-06,
"loss": 0.1643,
"num_tokens": 12581438.0,
"reward": 0.06743638217449188,
"reward_std": 0.041416820138692856,
"rewards/bleu_reward_func/mean": 0.06743638217449188,
"rewards/bleu_reward_func/std": 0.0745474174618721,
"step": 963
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 173.53125,
"completions/mean_terminated_length": 162.61289978027344,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7712,
"grad_norm": 5.022093772888184,
"kl": 0.110015869140625,
"learning_rate": 1e-06,
"loss": 0.1948,
"num_tokens": 12593087.0,
"reward": 0.14474597573280334,
"reward_std": 0.039374105632305145,
"rewards/bleu_reward_func/mean": 0.14474597573280334,
"rewards/bleu_reward_func/std": 0.0781283900141716,
"step": 964
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 345.0,
"completions/max_terminated_length": 345.0,
"completions/mean_length": 129.53125,
"completions/mean_terminated_length": 129.53125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.772,
"grad_norm": 5.708756446838379,
"kl": 0.185546875,
"learning_rate": 1e-06,
"loss": -0.112,
"num_tokens": 12604360.0,
"reward": 0.17942924797534943,
"reward_std": 0.04769964888691902,
"rewards/bleu_reward_func/mean": 0.17942924797534943,
"rewards/bleu_reward_func/std": 0.20441435277462006,
"step": 965
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 143.15625,
"completions/mean_terminated_length": 39.87999725341797,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7728,
"grad_norm": 10.111825942993164,
"kl": 0.39947509765625,
"learning_rate": 1e-06,
"loss": -0.0058,
"num_tokens": 12613941.0,
"reward": 0.09816907346248627,
"reward_std": 0.009084422141313553,
"rewards/bleu_reward_func/mean": 0.09816907346248627,
"rewards/bleu_reward_func/std": 0.06435907632112503,
"step": 966
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 348.875,
"completions/mean_terminated_length": 110.46154022216797,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.7736,
"grad_norm": 4.259431838989258,
"kl": 0.144195556640625,
"learning_rate": 1e-06,
"loss": 0.1364,
"num_tokens": 12628633.0,
"reward": 0.11497487127780914,
"reward_std": 0.0397370383143425,
"rewards/bleu_reward_func/mean": 0.11497487127780914,
"rewards/bleu_reward_func/std": 0.08479689061641693,
"step": 967
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 363.0,
"completions/mean_length": 319.125,
"completions/mean_terminated_length": 100.53334045410156,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.7744,
"grad_norm": 5.853184700012207,
"kl": 0.2406005859375,
"learning_rate": 1e-06,
"loss": 0.0855,
"num_tokens": 12642293.0,
"reward": 0.1797194480895996,
"reward_std": 0.06623274832963943,
"rewards/bleu_reward_func/mean": 0.1797194480895996,
"rewards/bleu_reward_func/std": 0.20514002442359924,
"step": 968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 276.65625,
"completions/mean_terminated_length": 198.20834350585938,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.7752,
"grad_norm": 3.861785650253296,
"kl": 0.048126220703125,
"learning_rate": 1e-06,
"loss": 0.3173,
"num_tokens": 12653674.0,
"reward": 0.06044634059071541,
"reward_std": 0.02236868627369404,
"rewards/bleu_reward_func/mean": 0.06044634059071541,
"rewards/bleu_reward_func/std": 0.058306269347667694,
"step": 969
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 364.4375,
"completions/mean_terminated_length": 287.1428527832031,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.776,
"grad_norm": 2.8329427242279053,
"kl": 0.037445068359375,
"learning_rate": 1e-06,
"loss": 0.1258,
"num_tokens": 12668128.0,
"reward": 0.11927121132612228,
"reward_std": 0.0374884158372879,
"rewards/bleu_reward_func/mean": 0.11927121132612228,
"rewards/bleu_reward_func/std": 0.10864724963903427,
"step": 970
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 247.3125,
"completions/mean_terminated_length": 198.29629516601562,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.7768,
"grad_norm": 14.522860527038574,
"kl": 0.3580322265625,
"learning_rate": 1e-06,
"loss": 0.327,
"num_tokens": 12680266.0,
"reward": 0.11901617795228958,
"reward_std": 0.06829790771007538,
"rewards/bleu_reward_func/mean": 0.11901617795228958,
"rewards/bleu_reward_func/std": 0.0924401804804802,
"step": 971
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 384.0,
"completions/mean_length": 270.71875,
"completions/mean_terminated_length": 83.05555725097656,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.7776,
"grad_norm": 5.036475658416748,
"kl": 0.187713623046875,
"learning_rate": 1e-06,
"loss": 0.1832,
"num_tokens": 12692713.0,
"reward": 0.13872118294239044,
"reward_std": 0.0687481164932251,
"rewards/bleu_reward_func/mean": 0.13872118294239044,
"rewards/bleu_reward_func/std": 0.14044460654258728,
"step": 972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 244.90625,
"completions/mean_terminated_length": 155.875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.7784,
"grad_norm": 7.775391578674316,
"kl": 0.136444091796875,
"learning_rate": 1e-06,
"loss": 0.0436,
"num_tokens": 12706702.0,
"reward": 0.11104710400104523,
"reward_std": 0.03642675280570984,
"rewards/bleu_reward_func/mean": 0.11104710400104523,
"rewards/bleu_reward_func/std": 0.09262983500957489,
"step": 973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 293.96875,
"completions/mean_terminated_length": 194.8636474609375,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.7792,
"grad_norm": 36.50672149658203,
"kl": 1.153228759765625,
"learning_rate": 1e-06,
"loss": 0.1761,
"num_tokens": 12721461.0,
"reward": 0.2013707458972931,
"reward_std": 0.10103052109479904,
"rewards/bleu_reward_func/mean": 0.2013707458972931,
"rewards/bleu_reward_func/std": 0.16323500871658325,
"step": 974
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 164.5,
"completions/mean_terminated_length": 128.55172729492188,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.78,
"grad_norm": 9.472455024719238,
"kl": 0.2548828125,
"learning_rate": 1e-06,
"loss": 0.6259,
"num_tokens": 12733317.0,
"reward": 0.07841520756483078,
"reward_std": 0.029445767402648926,
"rewards/bleu_reward_func/mean": 0.07841520756483078,
"rewards/bleu_reward_func/std": 0.0620783306658268,
"step": 975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 247.09375,
"completions/mean_terminated_length": 229.433349609375,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.7808,
"grad_norm": 4.399056911468506,
"kl": 0.10150146484375,
"learning_rate": 1e-06,
"loss": 0.0484,
"num_tokens": 12746712.0,
"reward": 0.20041930675506592,
"reward_std": 0.03039298765361309,
"rewards/bleu_reward_func/mean": 0.20041930675506592,
"rewards/bleu_reward_func/std": 0.2551174759864807,
"step": 976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 382.375,
"completions/mean_terminated_length": 293.6842041015625,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.7816,
"grad_norm": 2.5359840393066406,
"kl": 0.05230712890625,
"learning_rate": 1e-06,
"loss": 0.0699,
"num_tokens": 12762020.0,
"reward": 0.047146447002887726,
"reward_std": 0.022996241226792336,
"rewards/bleu_reward_func/mean": 0.047146447002887726,
"rewards/bleu_reward_func/std": 0.04002131521701813,
"step": 977
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 329.90625,
"completions/mean_terminated_length": 234.52381896972656,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.7824,
"grad_norm": 6.270121097564697,
"kl": 0.1246337890625,
"learning_rate": 1e-06,
"loss": -0.0807,
"num_tokens": 12778233.0,
"reward": 0.1260184496641159,
"reward_std": 0.029545176774263382,
"rewards/bleu_reward_func/mean": 0.1260184496641159,
"rewards/bleu_reward_func/std": 0.12758195400238037,
"step": 978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 315.0,
"completions/mean_length": 163.6875,
"completions/mean_terminated_length": 127.6551742553711,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.7832,
"grad_norm": 11.418375015258789,
"kl": 0.1888427734375,
"learning_rate": 1e-06,
"loss": 0.4229,
"num_tokens": 12788943.0,
"reward": 0.22336477041244507,
"reward_std": 0.0984843298792839,
"rewards/bleu_reward_func/mean": 0.22336477041244507,
"rewards/bleu_reward_func/std": 0.1921825110912323,
"step": 979
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 141.6875,
"completions/mean_terminated_length": 117.00000762939453,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.784,
"grad_norm": 7.894710540771484,
"kl": 0.4378662109375,
"learning_rate": 1e-06,
"loss": 0.2326,
"num_tokens": 12797693.0,
"reward": 0.1561349630355835,
"reward_std": 0.05494026839733124,
"rewards/bleu_reward_func/mean": 0.1561349630355835,
"rewards/bleu_reward_func/std": 0.09167517721652985,
"step": 980
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 255.34375,
"completions/mean_terminated_length": 169.7916717529297,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.7848,
"grad_norm": 19.572107315063477,
"kl": 0.288604736328125,
"learning_rate": 1e-06,
"loss": 0.151,
"num_tokens": 12812768.0,
"reward": 0.06323938816785812,
"reward_std": 0.017744949087500572,
"rewards/bleu_reward_func/mean": 0.06323938816785812,
"rewards/bleu_reward_func/std": 0.07885830849409103,
"step": 981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 281.65625,
"completions/mean_terminated_length": 176.9545440673828,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.7856,
"grad_norm": 8.172053337097168,
"kl": 0.24444580078125,
"learning_rate": 1e-06,
"loss": 0.21,
"num_tokens": 12829453.0,
"reward": 0.0720784068107605,
"reward_std": 0.03868547081947327,
"rewards/bleu_reward_func/mean": 0.0720784068107605,
"rewards/bleu_reward_func/std": 0.05159585550427437,
"step": 982
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 235.84375,
"completions/mean_terminated_length": 110.31818389892578,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7864,
"grad_norm": 8.912215232849121,
"kl": 0.32025146484375,
"learning_rate": 1e-06,
"loss": 0.12,
"num_tokens": 12843376.0,
"reward": 0.1997315138578415,
"reward_std": 0.030267415568232536,
"rewards/bleu_reward_func/mean": 0.1997315138578415,
"rewards/bleu_reward_func/std": 0.17783835530281067,
"step": 983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 378.0,
"completions/mean_terminated_length": 317.0909118652344,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.7872,
"grad_norm": 2.6129956245422363,
"kl": 0.036346435546875,
"learning_rate": 1e-06,
"loss": -0.1504,
"num_tokens": 12859368.0,
"reward": 0.07119783759117126,
"reward_std": 0.018479108810424805,
"rewards/bleu_reward_func/mean": 0.07119783759117126,
"rewards/bleu_reward_func/std": 0.06165986508131027,
"step": 984
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 220.03125,
"completions/mean_terminated_length": 152.6538543701172,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.788,
"grad_norm": 5.786254405975342,
"kl": 0.15057373046875,
"learning_rate": 1e-06,
"loss": -0.1709,
"num_tokens": 12869665.0,
"reward": 0.14459165930747986,
"reward_std": 0.03573929890990257,
"rewards/bleu_reward_func/mean": 0.14459165930747986,
"rewards/bleu_reward_func/std": 0.13286592066287994,
"step": 985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 357.125,
"completions/mean_terminated_length": 220.47059631347656,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.7888,
"grad_norm": 8.607572555541992,
"kl": 0.24078369140625,
"learning_rate": 1e-06,
"loss": 0.0188,
"num_tokens": 12885501.0,
"reward": 0.09036614745855331,
"reward_std": 0.031877610832452774,
"rewards/bleu_reward_func/mean": 0.09036614745855331,
"rewards/bleu_reward_func/std": 0.05137631297111511,
"step": 986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 381.75,
"completions/mean_terminated_length": 280.4444580078125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.7896,
"grad_norm": 2.421383857727051,
"kl": 0.046905517578125,
"learning_rate": 1e-06,
"loss": 0.1548,
"num_tokens": 12899517.0,
"reward": 0.06479343771934509,
"reward_std": 0.01870723068714142,
"rewards/bleu_reward_func/mean": 0.06479343771934509,
"rewards/bleu_reward_func/std": 0.039773859083652496,
"step": 987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 272.3125,
"completions/mean_terminated_length": 238.07144165039062,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.7904,
"grad_norm": 3.447153329849243,
"kl": 0.06671142578125,
"learning_rate": 1e-06,
"loss": -0.0649,
"num_tokens": 12910175.0,
"reward": 0.0866774171590805,
"reward_std": 0.07288840413093567,
"rewards/bleu_reward_func/mean": 0.0866774171590805,
"rewards/bleu_reward_func/std": 0.10417941212654114,
"step": 988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 156.625,
"completions/mean_terminated_length": 74.61538696289062,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7912,
"grad_norm": 11.11733341217041,
"kl": 0.34686279296875,
"learning_rate": 1e-06,
"loss": 0.4111,
"num_tokens": 12920075.0,
"reward": 0.18872088193893433,
"reward_std": 0.05310884118080139,
"rewards/bleu_reward_func/mean": 0.18872088193893433,
"rewards/bleu_reward_func/std": 0.10052233934402466,
"step": 989
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 251.15625,
"completions/mean_terminated_length": 94.6500015258789,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.792,
"grad_norm": 4.343538761138916,
"kl": 0.0811767578125,
"learning_rate": 1e-06,
"loss": 0.255,
"num_tokens": 12932672.0,
"reward": 0.2421235740184784,
"reward_std": 0.03650471195578575,
"rewards/bleu_reward_func/mean": 0.2421235740184784,
"rewards/bleu_reward_func/std": 0.35968947410583496,
"step": 990
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 365.34375,
"completions/mean_terminated_length": 307.9565124511719,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.7928,
"grad_norm": 4.527655601501465,
"kl": 0.109619140625,
"learning_rate": 1e-06,
"loss": -0.0782,
"num_tokens": 12947603.0,
"reward": 0.05564543977379799,
"reward_std": 0.033515315502882004,
"rewards/bleu_reward_func/mean": 0.05564543977379799,
"rewards/bleu_reward_func/std": 0.03913462907075882,
"step": 991
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 128.46875,
"completions/mean_terminated_length": 57.4444465637207,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.7936,
"grad_norm": 9.973651885986328,
"kl": 0.1922607421875,
"learning_rate": 1e-06,
"loss": 0.9373,
"num_tokens": 12958538.0,
"reward": 0.3186902403831482,
"reward_std": 0.10882419347763062,
"rewards/bleu_reward_func/mean": 0.3186902403831482,
"rewards/bleu_reward_func/std": 0.2608534097671509,
"step": 992
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 328.375,
"completions/mean_terminated_length": 202.73684692382812,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7944,
"grad_norm": 7.580811023712158,
"kl": 0.290863037109375,
"learning_rate": 1e-06,
"loss": 0.1547,
"num_tokens": 12975334.0,
"reward": 0.16742870211601257,
"reward_std": 0.03473435714840889,
"rewards/bleu_reward_func/mean": 0.16742870211601257,
"rewards/bleu_reward_func/std": 0.1612749844789505,
"step": 993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 119.6875,
"completions/mean_terminated_length": 47.03703689575195,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.7952,
"grad_norm": 20.384323120117188,
"kl": 0.74737548828125,
"learning_rate": 1e-06,
"loss": -0.0224,
"num_tokens": 12988020.0,
"reward": 0.1509585976600647,
"reward_std": 0.0387745276093483,
"rewards/bleu_reward_func/mean": 0.1509585976600647,
"rewards/bleu_reward_func/std": 0.13122804462909698,
"step": 994
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 346.84375,
"completions/mean_terminated_length": 181.6875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.796,
"grad_norm": 4.161476135253906,
"kl": 0.05682373046875,
"learning_rate": 1e-06,
"loss": -0.1853,
"num_tokens": 13003199.0,
"reward": 0.026108039543032646,
"reward_std": 0.02537854015827179,
"rewards/bleu_reward_func/mean": 0.026108039543032646,
"rewards/bleu_reward_func/std": 0.03443064168095589,
"step": 995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 239.0,
"completions/mean_length": 181.0625,
"completions/mean_terminated_length": 70.75,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.7968,
"grad_norm": 14.441998481750488,
"kl": 0.575714111328125,
"learning_rate": 1e-06,
"loss": 0.1517,
"num_tokens": 13012889.0,
"reward": 0.10783781111240387,
"reward_std": 0.053533561527729034,
"rewards/bleu_reward_func/mean": 0.10783781111240387,
"rewards/bleu_reward_func/std": 0.09023794531822205,
"step": 996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 371.75,
"completions/mean_terminated_length": 166.7692413330078,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.7976,
"grad_norm": 3.627946615219116,
"kl": 0.089813232421875,
"learning_rate": 1e-06,
"loss": -0.0463,
"num_tokens": 13027321.0,
"reward": 0.07810983061790466,
"reward_std": 0.03820539265871048,
"rewards/bleu_reward_func/mean": 0.07810983061790466,
"rewards/bleu_reward_func/std": 0.07255319505929947,
"step": 997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 299.96875,
"completions/mean_terminated_length": 203.59091186523438,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.7984,
"grad_norm": 5.938675403594971,
"kl": 0.163330078125,
"learning_rate": 1e-06,
"loss": 0.1322,
"num_tokens": 13038888.0,
"reward": 0.0998745784163475,
"reward_std": 0.12165166437625885,
"rewards/bleu_reward_func/mean": 0.0998745784163475,
"rewards/bleu_reward_func/std": 0.2023635059595108,
"step": 998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 299.25,
"completions/mean_terminated_length": 171.60000610351562,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.7992,
"grad_norm": 5.83504581451416,
"kl": 0.288848876953125,
"learning_rate": 1e-06,
"loss": 0.0509,
"num_tokens": 13051904.0,
"reward": 0.10337799787521362,
"reward_std": 0.029087794944643974,
"rewards/bleu_reward_func/mean": 0.10337799787521362,
"rewards/bleu_reward_func/std": 0.07911896705627441,
"step": 999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 345.96875,
"completions/mean_terminated_length": 322.25,
"completions/min_length": 91.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.8,
"grad_norm": 2.916576385498047,
"kl": 0.078857421875,
"learning_rate": 1e-06,
"loss": 0.05,
"num_tokens": 13065431.0,
"reward": 0.05868455022573471,
"reward_std": 0.017369702458381653,
"rewards/bleu_reward_func/mean": 0.05868455022573471,
"rewards/bleu_reward_func/std": 0.04672805219888687,
"step": 1000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 266.8125,
"completions/mean_terminated_length": 250.4666748046875,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.8008,
"grad_norm": 5.116244316101074,
"kl": 0.20050048828125,
"learning_rate": 1e-06,
"loss": 0.0918,
"num_tokens": 13079225.0,
"reward": 0.13771796226501465,
"reward_std": 0.04302237555384636,
"rewards/bleu_reward_func/mean": 0.13771796226501465,
"rewards/bleu_reward_func/std": 0.10432249307632446,
"step": 1001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 316.1875,
"completions/mean_terminated_length": 239.56521606445312,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.8016,
"grad_norm": 3.050908088684082,
"kl": 0.0596923828125,
"learning_rate": 1e-06,
"loss": -0.0323,
"num_tokens": 13093775.0,
"reward": 0.07833529263734818,
"reward_std": 0.02821630984544754,
"rewards/bleu_reward_func/mean": 0.07833529263734818,
"rewards/bleu_reward_func/std": 0.06890382617712021,
"step": 1002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 380.0,
"completions/mean_length": 180.0,
"completions/mean_terminated_length": 132.57144165039062,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8024,
"grad_norm": 6.102024078369141,
"kl": 0.28985595703125,
"learning_rate": 1e-06,
"loss": 0.0137,
"num_tokens": 13104447.0,
"reward": 0.27597150206565857,
"reward_std": 0.04547630250453949,
"rewards/bleu_reward_func/mean": 0.27597150206565857,
"rewards/bleu_reward_func/std": 0.2288428395986557,
"step": 1003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 391.25,
"completions/mean_terminated_length": 254.40000915527344,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8032,
"grad_norm": 3.2652981281280518,
"kl": 0.084625244140625,
"learning_rate": 1e-06,
"loss": 0.1886,
"num_tokens": 13119655.0,
"reward": 0.13283666968345642,
"reward_std": 0.029899559915065765,
"rewards/bleu_reward_func/mean": 0.13283666968345642,
"rewards/bleu_reward_func/std": 0.1536840796470642,
"step": 1004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 203.8125,
"completions/mean_terminated_length": 183.2666778564453,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.804,
"grad_norm": 6.098532199859619,
"kl": 0.1204833984375,
"learning_rate": 1e-06,
"loss": 0.0735,
"num_tokens": 13128825.0,
"reward": 0.21581590175628662,
"reward_std": 0.10097475349903107,
"rewards/bleu_reward_func/mean": 0.21581590175628662,
"rewards/bleu_reward_func/std": 0.2611050307750702,
"step": 1005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 358.1875,
"completions/mean_terminated_length": 238.55555725097656,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8048,
"grad_norm": 7.441494464874268,
"kl": 0.191802978515625,
"learning_rate": 1e-06,
"loss": 0.0313,
"num_tokens": 13142175.0,
"reward": 0.20881031453609467,
"reward_std": 0.03906787186861038,
"rewards/bleu_reward_func/mean": 0.20881031453609467,
"rewards/bleu_reward_func/std": 0.17651565372943878,
"step": 1006
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 307.65625,
"completions/mean_terminated_length": 260.5,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.8056,
"grad_norm": 3.559936761856079,
"kl": 0.0572509765625,
"learning_rate": 1e-06,
"loss": 0.1028,
"num_tokens": 13153852.0,
"reward": 0.04283101111650467,
"reward_std": 0.04057364910840988,
"rewards/bleu_reward_func/mean": 0.04283101111650467,
"rewards/bleu_reward_func/std": 0.059128936380147934,
"step": 1007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 336.875,
"completions/mean_terminated_length": 245.1428680419922,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.8064,
"grad_norm": 4.735692024230957,
"kl": 0.069091796875,
"learning_rate": 1e-06,
"loss": 0.0972,
"num_tokens": 13171048.0,
"reward": 0.09638670086860657,
"reward_std": 0.019614677876234055,
"rewards/bleu_reward_func/mean": 0.09638670086860657,
"rewards/bleu_reward_func/std": 0.09023009240627289,
"step": 1008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 329.59375,
"completions/mean_terminated_length": 204.7894744873047,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.8072,
"grad_norm": 7.47179651260376,
"kl": 0.1318359375,
"learning_rate": 1e-06,
"loss": -0.0877,
"num_tokens": 13187195.0,
"reward": 0.036128196865320206,
"reward_std": 0.020984536036849022,
"rewards/bleu_reward_func/mean": 0.036128196865320206,
"rewards/bleu_reward_func/std": 0.038413140922784805,
"step": 1009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 252.0,
"completions/max_terminated_length": 252.0,
"completions/mean_length": 53.5625,
"completions/mean_terminated_length": 53.5625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.808,
"grad_norm": 10.647790908813477,
"kl": 0.482666015625,
"learning_rate": 1e-06,
"loss": -0.052,
"num_tokens": 13197629.0,
"reward": 0.34467193484306335,
"reward_std": 0.09173881262540817,
"rewards/bleu_reward_func/mean": 0.34467193484306335,
"rewards/bleu_reward_func/std": 0.23519103229045868,
"step": 1010
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 191.6875,
"completions/mean_terminated_length": 181.35482788085938,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8088,
"grad_norm": 6.100401878356934,
"kl": 0.141632080078125,
"learning_rate": 1e-06,
"loss": 0.0525,
"num_tokens": 13210699.0,
"reward": 0.23702389001846313,
"reward_std": 0.03205852955579758,
"rewards/bleu_reward_func/mean": 0.23702389001846313,
"rewards/bleu_reward_func/std": 0.1315021812915802,
"step": 1011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 374.4375,
"completions/mean_terminated_length": 218.53334045410156,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.8096,
"grad_norm": 8.772051811218262,
"kl": 0.103546142578125,
"learning_rate": 1e-06,
"loss": 0.0866,
"num_tokens": 13228945.0,
"reward": 0.13881856203079224,
"reward_std": 0.044034168124198914,
"rewards/bleu_reward_func/mean": 0.13881856203079224,
"rewards/bleu_reward_func/std": 0.0626484826207161,
"step": 1012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 147.96875,
"completions/mean_terminated_length": 95.96428680419922,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.8104,
"grad_norm": 14.525429725646973,
"kl": 0.4837646484375,
"learning_rate": 1e-06,
"loss": 0.0162,
"num_tokens": 13242400.0,
"reward": 0.20639193058013916,
"reward_std": 0.05956702679395676,
"rewards/bleu_reward_func/mean": 0.20639193058013916,
"rewards/bleu_reward_func/std": 0.12604379653930664,
"step": 1013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 510.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 149.90625,
"completions/mean_terminated_length": 149.90625,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.8112,
"grad_norm": 22.3626766204834,
"kl": 0.2891845703125,
"learning_rate": 1e-06,
"loss": 0.0612,
"num_tokens": 13252157.0,
"reward": 0.24893034994602203,
"reward_std": 0.03380701690912247,
"rewards/bleu_reward_func/mean": 0.24893034994602203,
"rewards/bleu_reward_func/std": 0.20013003051280975,
"step": 1014
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 28.0,
"completions/mean_length": 266.6875,
"completions/mean_terminated_length": 21.375,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.812,
"grad_norm": 7.276104927062988,
"kl": 0.26300048828125,
"learning_rate": 1e-06,
"loss": -0.0225,
"num_tokens": 13263955.0,
"reward": 0.14417850971221924,
"reward_std": 0.037887826561927795,
"rewards/bleu_reward_func/mean": 0.14417850971221924,
"rewards/bleu_reward_func/std": 0.1605955958366394,
"step": 1015
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 226.28125,
"completions/mean_terminated_length": 146.27999877929688,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.8128,
"grad_norm": 5.462210655212402,
"kl": 0.13653564453125,
"learning_rate": 1e-06,
"loss": 0.1195,
"num_tokens": 13273692.0,
"reward": 0.08337672054767609,
"reward_std": 0.02062853053212166,
"rewards/bleu_reward_func/mean": 0.08337672054767609,
"rewards/bleu_reward_func/std": 0.048102062195539474,
"step": 1016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 354.0625,
"completions/mean_terminated_length": 271.3333435058594,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.8136,
"grad_norm": 5.125792980194092,
"kl": 0.22271728515625,
"learning_rate": 1e-06,
"loss": 0.0213,
"num_tokens": 13289478.0,
"reward": 0.19775965809822083,
"reward_std": 0.04682963341474533,
"rewards/bleu_reward_func/mean": 0.19775965809822083,
"rewards/bleu_reward_func/std": 0.22582097351551056,
"step": 1017
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 183.65625,
"completions/mean_terminated_length": 107.8846206665039,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.8144,
"grad_norm": 7.784894943237305,
"kl": 0.38018798828125,
"learning_rate": 1e-06,
"loss": -0.2707,
"num_tokens": 13299747.0,
"reward": 0.06794524192810059,
"reward_std": 0.039994340389966965,
"rewards/bleu_reward_func/mean": 0.06794524192810059,
"rewards/bleu_reward_func/std": 0.0657891035079956,
"step": 1018
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 232.6875,
"completions/mean_terminated_length": 203.79310607910156,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.8152,
"grad_norm": 6.757852077484131,
"kl": 0.23583984375,
"learning_rate": 1e-06,
"loss": -0.0289,
"num_tokens": 13308865.0,
"reward": 0.1327855885028839,
"reward_std": 0.043607283383607864,
"rewards/bleu_reward_func/mean": 0.1327855885028839,
"rewards/bleu_reward_func/std": 0.1713796854019165,
"step": 1019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 116.03125,
"completions/mean_terminated_length": 75.06896209716797,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.816,
"grad_norm": 6.1786370277404785,
"kl": 0.21380615234375,
"learning_rate": 1e-06,
"loss": 0.1686,
"num_tokens": 13318914.0,
"reward": 0.23704446852207184,
"reward_std": 0.057613980025053024,
"rewards/bleu_reward_func/mean": 0.23704446852207184,
"rewards/bleu_reward_func/std": 0.21550458669662476,
"step": 1020
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 188.34375,
"completions/mean_terminated_length": 166.7666778564453,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.8168,
"grad_norm": 7.113982200622559,
"kl": 0.258514404296875,
"learning_rate": 1e-06,
"loss": 0.0406,
"num_tokens": 13329205.0,
"reward": 0.08146457374095917,
"reward_std": 0.02083425223827362,
"rewards/bleu_reward_func/mean": 0.08146457374095917,
"rewards/bleu_reward_func/std": 0.0736912190914154,
"step": 1021
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 248.1875,
"completions/mean_terminated_length": 144.95652770996094,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.8176,
"grad_norm": 10.716026306152344,
"kl": 0.3465576171875,
"learning_rate": 1e-06,
"loss": -0.0356,
"num_tokens": 13345867.0,
"reward": 0.1497185379266739,
"reward_std": 0.016201931983232498,
"rewards/bleu_reward_func/mean": 0.1497185379266739,
"rewards/bleu_reward_func/std": 0.17363472282886505,
"step": 1022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 306.59375,
"completions/mean_terminated_length": 238.125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8184,
"grad_norm": 5.428062915802002,
"kl": 0.152679443359375,
"learning_rate": 1e-06,
"loss": -0.0033,
"num_tokens": 13360854.0,
"reward": 0.1825982928276062,
"reward_std": 0.057225678116083145,
"rewards/bleu_reward_func/mean": 0.1825982928276062,
"rewards/bleu_reward_func/std": 0.1867101639509201,
"step": 1023
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 504.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 103.6875,
"completions/mean_terminated_length": 103.6875,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.8192,
"grad_norm": 9.348381042480469,
"kl": 0.17828369140625,
"learning_rate": 1e-06,
"loss": 0.0549,
"num_tokens": 13369980.0,
"reward": 0.11240973323583603,
"reward_std": 0.026126563549041748,
"rewards/bleu_reward_func/mean": 0.11240973323583603,
"rewards/bleu_reward_func/std": 0.11270570009946823,
"step": 1024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 225.875,
"completions/mean_terminated_length": 159.84616088867188,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.82,
"grad_norm": 7.571503162384033,
"kl": 0.237548828125,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 13384584.0,
"reward": 0.2324497401714325,
"reward_std": 0.0470973402261734,
"rewards/bleu_reward_func/mean": 0.2324497401714325,
"rewards/bleu_reward_func/std": 0.1243894025683403,
"step": 1025
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 139.34375,
"completions/mean_terminated_length": 114.50000762939453,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8208,
"grad_norm": 8.227386474609375,
"kl": 0.25714111328125,
"learning_rate": 1e-06,
"loss": 0.1704,
"num_tokens": 13391771.0,
"reward": 0.12480157613754272,
"reward_std": 0.04330623894929886,
"rewards/bleu_reward_func/mean": 0.12480157613754272,
"rewards/bleu_reward_func/std": 0.103439562022686,
"step": 1026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 214.3125,
"completions/mean_terminated_length": 130.95999145507812,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8216,
"grad_norm": 5.542022228240967,
"kl": 0.24603271484375,
"learning_rate": 1e-06,
"loss": -0.1391,
"num_tokens": 13403757.0,
"reward": 0.25766974687576294,
"reward_std": 0.03755660355091095,
"rewards/bleu_reward_func/mean": 0.25766974687576294,
"rewards/bleu_reward_func/std": 0.22421182692050934,
"step": 1027
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 459.53125,
"completions/mean_terminated_length": 413.23529052734375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.8224,
"grad_norm": 2.1804089546203613,
"kl": 0.04327392578125,
"learning_rate": 1e-06,
"loss": -0.1423,
"num_tokens": 13421078.0,
"reward": 0.07269357144832611,
"reward_std": 0.02826325222849846,
"rewards/bleu_reward_func/mean": 0.07269357144832611,
"rewards/bleu_reward_func/std": 0.034365471452474594,
"step": 1028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 233.90625,
"completions/mean_terminated_length": 194.17857360839844,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.8232,
"grad_norm": 5.306719779968262,
"kl": 0.1806640625,
"learning_rate": 1e-06,
"loss": -0.4029,
"num_tokens": 13432323.0,
"reward": 0.18320006132125854,
"reward_std": 0.08323986828327179,
"rewards/bleu_reward_func/mean": 0.18320006132125854,
"rewards/bleu_reward_func/std": 0.2284490317106247,
"step": 1029
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 258.4375,
"completions/mean_terminated_length": 187.44000244140625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.824,
"grad_norm": 3.841357707977295,
"kl": 0.070404052734375,
"learning_rate": 1e-06,
"loss": 0.1459,
"num_tokens": 13444169.0,
"reward": 0.09387044608592987,
"reward_std": 0.07637906074523926,
"rewards/bleu_reward_func/mean": 0.09387044608592987,
"rewards/bleu_reward_func/std": 0.10294011980295181,
"step": 1030
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 121.59375,
"completions/mean_terminated_length": 109.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.8248,
"grad_norm": 10.732538223266602,
"kl": 0.598388671875,
"learning_rate": 1e-06,
"loss": -0.0236,
"num_tokens": 13450364.0,
"reward": 0.18597961962223053,
"reward_std": 0.03610639274120331,
"rewards/bleu_reward_func/mean": 0.18597961962223053,
"rewards/bleu_reward_func/std": 0.14241203665733337,
"step": 1031
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 174.15625,
"completions/mean_terminated_length": 151.6333465576172,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.8256,
"grad_norm": 5.849789619445801,
"kl": 0.2237548828125,
"learning_rate": 1e-06,
"loss": -0.1461,
"num_tokens": 13458489.0,
"reward": 0.10662397742271423,
"reward_std": 0.044935449957847595,
"rewards/bleu_reward_func/mean": 0.10662397742271423,
"rewards/bleu_reward_func/std": 0.08882930874824524,
"step": 1032
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 365.75,
"completions/mean_terminated_length": 278.0,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.8264,
"grad_norm": 2.3378233909606934,
"kl": 0.040679931640625,
"learning_rate": 1e-06,
"loss": 0.018,
"num_tokens": 13477497.0,
"reward": 0.12139745056629181,
"reward_std": 0.030839571729302406,
"rewards/bleu_reward_func/mean": 0.12139745056629181,
"rewards/bleu_reward_func/std": 0.087521493434906,
"step": 1033
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 307.8125,
"completions/mean_terminated_length": 260.69232177734375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.8272,
"grad_norm": 5.579449653625488,
"kl": 0.14544677734375,
"learning_rate": 1e-06,
"loss": -0.0832,
"num_tokens": 13491595.0,
"reward": 0.06439976394176483,
"reward_std": 0.01632755994796753,
"rewards/bleu_reward_func/mean": 0.06439976394176483,
"rewards/bleu_reward_func/std": 0.025089839473366737,
"step": 1034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 227.90625,
"completions/mean_terminated_length": 162.34616088867188,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.828,
"grad_norm": 6.95845890045166,
"kl": 0.08489990234375,
"learning_rate": 1e-06,
"loss": 0.1636,
"num_tokens": 13502304.0,
"reward": 0.15672987699508667,
"reward_std": 0.07095484435558319,
"rewards/bleu_reward_func/mean": 0.15672987699508667,
"rewards/bleu_reward_func/std": 0.1326054334640503,
"step": 1035
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 162.59375,
"completions/mean_terminated_length": 139.3000030517578,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8288,
"grad_norm": 7.602659702301025,
"kl": 0.2142333984375,
"learning_rate": 1e-06,
"loss": -0.0952,
"num_tokens": 13512379.0,
"reward": 0.10166750848293304,
"reward_std": 0.022390395402908325,
"rewards/bleu_reward_func/mean": 0.10166750848293304,
"rewards/bleu_reward_func/std": 0.09791414439678192,
"step": 1036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 352.84375,
"completions/mean_terminated_length": 257.3500061035156,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.8296,
"grad_norm": 3.4908225536346436,
"kl": 0.0816650390625,
"learning_rate": 1e-06,
"loss": 0.1513,
"num_tokens": 13526246.0,
"reward": 0.10761404037475586,
"reward_std": 0.02660614624619484,
"rewards/bleu_reward_func/mean": 0.10761404037475586,
"rewards/bleu_reward_func/std": 0.08269859850406647,
"step": 1037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 154.03125,
"completions/mean_terminated_length": 87.74073791503906,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.8304,
"grad_norm": 8.316597938537598,
"kl": 0.3048095703125,
"learning_rate": 1e-06,
"loss": 0.108,
"num_tokens": 13538935.0,
"reward": 0.14819365739822388,
"reward_std": 0.07058853656053543,
"rewards/bleu_reward_func/mean": 0.14819365739822388,
"rewards/bleu_reward_func/std": 0.1550559103488922,
"step": 1038
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 237.75,
"completions/mean_terminated_length": 209.37930297851562,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8312,
"grad_norm": 4.724348545074463,
"kl": 0.1529541015625,
"learning_rate": 1e-06,
"loss": 0.0635,
"num_tokens": 13551415.0,
"reward": 0.16776956617832184,
"reward_std": 0.026334762573242188,
"rewards/bleu_reward_func/mean": 0.16776956617832184,
"rewards/bleu_reward_func/std": 0.18577900528907776,
"step": 1039
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 402.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 93.375,
"completions/mean_terminated_length": 93.375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.832,
"grad_norm": 8.702837944030762,
"kl": 0.33355712890625,
"learning_rate": 1e-06,
"loss": 0.0333,
"num_tokens": 13560747.0,
"reward": 0.1746351718902588,
"reward_std": 0.039413660764694214,
"rewards/bleu_reward_func/mean": 0.1746351718902588,
"rewards/bleu_reward_func/std": 0.13439369201660156,
"step": 1040
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 177.90625,
"completions/mean_terminated_length": 66.54167175292969,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.8328,
"grad_norm": 14.271418571472168,
"kl": 0.298431396484375,
"learning_rate": 1e-06,
"loss": -0.356,
"num_tokens": 13572328.0,
"reward": 0.0881040021777153,
"reward_std": 0.0392255075275898,
"rewards/bleu_reward_func/mean": 0.0881040021777153,
"rewards/bleu_reward_func/std": 0.086721271276474,
"step": 1041
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 243.34375,
"completions/mean_terminated_length": 138.21739196777344,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.8336,
"grad_norm": 3.9084980487823486,
"kl": 0.07525634765625,
"learning_rate": 1e-06,
"loss": 0.0813,
"num_tokens": 13586859.0,
"reward": 0.06463417410850525,
"reward_std": 0.022750139236450195,
"rewards/bleu_reward_func/mean": 0.06463417410850525,
"rewards/bleu_reward_func/std": 0.05624645948410034,
"step": 1042
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 328.96875,
"completions/mean_terminated_length": 267.9583435058594,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8344,
"grad_norm": 9.608210563659668,
"kl": 0.2337646484375,
"learning_rate": 1e-06,
"loss": 0.0092,
"num_tokens": 13600978.0,
"reward": 0.06980200856924057,
"reward_std": 0.015845034271478653,
"rewards/bleu_reward_func/mean": 0.06980200856924057,
"rewards/bleu_reward_func/std": 0.03303433954715729,
"step": 1043
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 305.8125,
"completions/mean_terminated_length": 145.44444274902344,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8352,
"grad_norm": 4.128615379333496,
"kl": 0.06011962890625,
"learning_rate": 1e-06,
"loss": 0.1566,
"num_tokens": 13613068.0,
"reward": 0.04747869074344635,
"reward_std": 0.013655820861458778,
"rewards/bleu_reward_func/mean": 0.04747869074344635,
"rewards/bleu_reward_func/std": 0.028707411140203476,
"step": 1044
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 426.53125,
"completions/mean_terminated_length": 368.0526428222656,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.836,
"grad_norm": 2.515371799468994,
"kl": 0.043975830078125,
"learning_rate": 1e-06,
"loss": -0.065,
"num_tokens": 13629173.0,
"reward": 0.028738608583807945,
"reward_std": 0.012511001899838448,
"rewards/bleu_reward_func/mean": 0.028738608583807945,
"rewards/bleu_reward_func/std": 0.014564147219061852,
"step": 1045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 206.03125,
"completions/mean_terminated_length": 135.42308044433594,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8368,
"grad_norm": 8.021496772766113,
"kl": 0.273529052734375,
"learning_rate": 1e-06,
"loss": 0.1291,
"num_tokens": 13637374.0,
"reward": 0.11877701431512833,
"reward_std": 0.04857534170150757,
"rewards/bleu_reward_func/mean": 0.11877701431512833,
"rewards/bleu_reward_func/std": 0.08409105986356735,
"step": 1046
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 251.65625,
"completions/mean_terminated_length": 214.46429443359375,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.8376,
"grad_norm": 3.2490315437316895,
"kl": 0.06829833984375,
"learning_rate": 1e-06,
"loss": -0.1358,
"num_tokens": 13648067.0,
"reward": 0.08158313482999802,
"reward_std": 0.02561478689312935,
"rewards/bleu_reward_func/mean": 0.08158313482999802,
"rewards/bleu_reward_func/std": 0.05671805888414383,
"step": 1047
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 269.125,
"completions/mean_terminated_length": 201.1199951171875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.8384,
"grad_norm": 5.032691955566406,
"kl": 0.1785888671875,
"learning_rate": 1e-06,
"loss": 0.0164,
"num_tokens": 13660719.0,
"reward": 0.18114086985588074,
"reward_std": 0.03815930336713791,
"rewards/bleu_reward_func/mean": 0.18114086985588074,
"rewards/bleu_reward_func/std": 0.14998804032802582,
"step": 1048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 355.0,
"completions/mean_length": 87.25,
"completions/mean_terminated_length": 73.54838562011719,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8392,
"grad_norm": 8.731785774230957,
"kl": 0.384765625,
"learning_rate": 1e-06,
"loss": 0.0594,
"num_tokens": 13670767.0,
"reward": 0.23021195828914642,
"reward_std": 0.09217022359371185,
"rewards/bleu_reward_func/mean": 0.23021195828914642,
"rewards/bleu_reward_func/std": 0.18223723769187927,
"step": 1049
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 269.25,
"completions/mean_terminated_length": 80.44444274902344,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.84,
"grad_norm": 8.531782150268555,
"kl": 0.29345703125,
"learning_rate": 1e-06,
"loss": 0.2277,
"num_tokens": 13685927.0,
"reward": 0.13349372148513794,
"reward_std": 0.053998030722141266,
"rewards/bleu_reward_func/mean": 0.13349372148513794,
"rewards/bleu_reward_func/std": 0.15102460980415344,
"step": 1050
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 375.4375,
"completions/mean_terminated_length": 238.875,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.8408,
"grad_norm": 7.057468414306641,
"kl": 0.099639892578125,
"learning_rate": 1e-06,
"loss": 0.1437,
"num_tokens": 13702781.0,
"reward": 0.02950356900691986,
"reward_std": 0.010849589481949806,
"rewards/bleu_reward_func/mean": 0.02950356900691986,
"rewards/bleu_reward_func/std": 0.02092377282679081,
"step": 1051
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 237.84375,
"completions/mean_terminated_length": 187.07408142089844,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8416,
"grad_norm": 8.259405136108398,
"kl": 0.57330322265625,
"learning_rate": 1e-06,
"loss": 0.1046,
"num_tokens": 13713560.0,
"reward": 0.1828644871711731,
"reward_std": 0.04976918175816536,
"rewards/bleu_reward_func/mean": 0.1828644871711731,
"rewards/bleu_reward_func/std": 0.13261918723583221,
"step": 1052
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 154.09375,
"completions/mean_terminated_length": 87.81481170654297,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.8424,
"grad_norm": 9.288348197937012,
"kl": 0.424560546875,
"learning_rate": 1e-06,
"loss": -0.0227,
"num_tokens": 13720387.0,
"reward": 0.1263371855020523,
"reward_std": 0.031269170343875885,
"rewards/bleu_reward_func/mean": 0.1263371855020523,
"rewards/bleu_reward_func/std": 0.1025131419301033,
"step": 1053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 94.0,
"completions/mean_length": 160.15625,
"completions/mean_terminated_length": 42.875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8432,
"grad_norm": 7.418272972106934,
"kl": 0.32086181640625,
"learning_rate": 1e-06,
"loss": -0.0571,
"num_tokens": 13731528.0,
"reward": 0.2602638304233551,
"reward_std": 0.07646072655916214,
"rewards/bleu_reward_func/mean": 0.2602638304233551,
"rewards/bleu_reward_func/std": 0.2308470755815506,
"step": 1054
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 183.46875,
"completions/mean_terminated_length": 122.62963104248047,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.844,
"grad_norm": 8.014601707458496,
"kl": 0.3927001953125,
"learning_rate": 1e-06,
"loss": 0.1042,
"num_tokens": 13742447.0,
"reward": 0.07197493314743042,
"reward_std": 0.012622429989278316,
"rewards/bleu_reward_func/mean": 0.07197493314743042,
"rewards/bleu_reward_func/std": 0.04327724501490593,
"step": 1055
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 240.59375,
"completions/mean_terminated_length": 222.50001525878906,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8448,
"grad_norm": 5.102974891662598,
"kl": 0.1640625,
"learning_rate": 1e-06,
"loss": -0.0185,
"num_tokens": 13755202.0,
"reward": 0.08292236924171448,
"reward_std": 0.023967744782567024,
"rewards/bleu_reward_func/mean": 0.08292236924171448,
"rewards/bleu_reward_func/std": 0.046691060066223145,
"step": 1056
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 379.0,
"completions/mean_length": 133.0625,
"completions/mean_terminated_length": 45.615386962890625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.8456,
"grad_norm": 12.413713455200195,
"kl": 0.4334716796875,
"learning_rate": 1e-06,
"loss": 0.1467,
"num_tokens": 13763268.0,
"reward": 0.09796961396932602,
"reward_std": 0.02291642501950264,
"rewards/bleu_reward_func/mean": 0.09796961396932602,
"rewards/bleu_reward_func/std": 0.04426925256848335,
"step": 1057
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 155.71875,
"completions/mean_terminated_length": 89.74073791503906,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.8464,
"grad_norm": 8.978612899780273,
"kl": 0.228515625,
"learning_rate": 1e-06,
"loss": -0.0356,
"num_tokens": 13771987.0,
"reward": 0.10166356712579727,
"reward_std": 0.055922288447618484,
"rewards/bleu_reward_func/mean": 0.10166356712579727,
"rewards/bleu_reward_func/std": 0.07498722523450851,
"step": 1058
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 408.84375,
"completions/mean_terminated_length": 276.21429443359375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8472,
"grad_norm": 1.850431203842163,
"kl": 0.046905517578125,
"learning_rate": 1e-06,
"loss": 0.1476,
"num_tokens": 13791446.0,
"reward": 0.07564342021942139,
"reward_std": 0.015303988009691238,
"rewards/bleu_reward_func/mean": 0.07564342021942139,
"rewards/bleu_reward_func/std": 0.09736621379852295,
"step": 1059
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 285.71875,
"completions/mean_terminated_length": 197.17391967773438,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.848,
"grad_norm": 4.632063388824463,
"kl": 0.21221923828125,
"learning_rate": 1e-06,
"loss": 0.0453,
"num_tokens": 13803221.0,
"reward": 0.12073882669210434,
"reward_std": 0.03981417790055275,
"rewards/bleu_reward_func/mean": 0.12073882669210434,
"rewards/bleu_reward_func/std": 0.07346338778734207,
"step": 1060
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 329.1875,
"completions/mean_terminated_length": 246.09091186523438,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.8488,
"grad_norm": 4.053706645965576,
"kl": 0.078857421875,
"learning_rate": 1e-06,
"loss": 0.2911,
"num_tokens": 13818331.0,
"reward": 0.06476722657680511,
"reward_std": 0.030476348474621773,
"rewards/bleu_reward_func/mean": 0.06476722657680511,
"rewards/bleu_reward_func/std": 0.06862985342741013,
"step": 1061
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 430.0,
"completions/mean_length": 156.5625,
"completions/mean_terminated_length": 57.03999710083008,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.8496,
"grad_norm": 8.067891120910645,
"kl": 0.1793212890625,
"learning_rate": 1e-06,
"loss": 0.195,
"num_tokens": 13826957.0,
"reward": 0.20933812856674194,
"reward_std": 0.051397278904914856,
"rewards/bleu_reward_func/mean": 0.20933812856674194,
"rewards/bleu_reward_func/std": 0.2901296019554138,
"step": 1062
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 227.09375,
"completions/mean_terminated_length": 115.60869598388672,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8504,
"grad_norm": 7.58860445022583,
"kl": 0.165283203125,
"learning_rate": 1e-06,
"loss": -0.0274,
"num_tokens": 13838440.0,
"reward": 0.11480045318603516,
"reward_std": 0.027013186365365982,
"rewards/bleu_reward_func/mean": 0.11480045318603516,
"rewards/bleu_reward_func/std": 0.11019645631313324,
"step": 1063
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 189.1875,
"completions/mean_terminated_length": 114.69231414794922,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8512,
"grad_norm": 5.411109447479248,
"kl": 0.1951904296875,
"learning_rate": 1e-06,
"loss": 0.0807,
"num_tokens": 13849406.0,
"reward": 0.1512412428855896,
"reward_std": 0.062053047120571136,
"rewards/bleu_reward_func/mean": 0.1512412428855896,
"rewards/bleu_reward_func/std": 0.12469635158777237,
"step": 1064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 259.5625,
"completions/mean_terminated_length": 127.33333587646484,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.852,
"grad_norm": 6.397778034210205,
"kl": 0.24896240234375,
"learning_rate": 1e-06,
"loss": -0.0087,
"num_tokens": 13864056.0,
"reward": 0.1552116870880127,
"reward_std": 0.027821514755487442,
"rewards/bleu_reward_func/mean": 0.1552116870880127,
"rewards/bleu_reward_func/std": 0.10892557352781296,
"step": 1065
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 139.8125,
"completions/mean_terminated_length": 139.8125,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.8528,
"grad_norm": 8.31270694732666,
"kl": 0.20196533203125,
"learning_rate": 1e-06,
"loss": 0.0519,
"num_tokens": 13871490.0,
"reward": 0.13230201601982117,
"reward_std": 0.04370046779513359,
"rewards/bleu_reward_func/mean": 0.13230201601982117,
"rewards/bleu_reward_func/std": 0.08659063279628754,
"step": 1066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 169.0625,
"completions/mean_terminated_length": 73.04000091552734,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8536,
"grad_norm": 6.236849784851074,
"kl": 0.39727783203125,
"learning_rate": 1e-06,
"loss": 0.1366,
"num_tokens": 13885556.0,
"reward": 0.17620697617530823,
"reward_std": 0.024748487398028374,
"rewards/bleu_reward_func/mean": 0.17620697617530823,
"rewards/bleu_reward_func/std": 0.10503184050321579,
"step": 1067
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 228.75,
"completions/mean_terminated_length": 176.29629516601562,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8544,
"grad_norm": 6.749774932861328,
"kl": 0.181121826171875,
"learning_rate": 1e-06,
"loss": -0.1016,
"num_tokens": 13894804.0,
"reward": 0.17489100992679596,
"reward_std": 0.042406514286994934,
"rewards/bleu_reward_func/mean": 0.17489100992679596,
"rewards/bleu_reward_func/std": 0.14329132437705994,
"step": 1068
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 246.6875,
"completions/mean_terminated_length": 126.09091186523438,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8552,
"grad_norm": 6.8404459953308105,
"kl": 0.31402587890625,
"learning_rate": 1e-06,
"loss": 0.0134,
"num_tokens": 13907514.0,
"reward": 0.16271845996379852,
"reward_std": 0.04602063074707985,
"rewards/bleu_reward_func/mean": 0.16271845996379852,
"rewards/bleu_reward_func/std": 0.12579885125160217,
"step": 1069
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 269.78125,
"completions/mean_terminated_length": 124.45000457763672,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.856,
"grad_norm": 4.438868522644043,
"kl": 0.177520751953125,
"learning_rate": 1e-06,
"loss": 0.0265,
"num_tokens": 13919603.0,
"reward": 0.13932910561561584,
"reward_std": 0.01856398582458496,
"rewards/bleu_reward_func/mean": 0.13932910561561584,
"rewards/bleu_reward_func/std": 0.14700213074684143,
"step": 1070
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 149.9375,
"completions/mean_terminated_length": 82.8888931274414,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.8568,
"grad_norm": 7.284147262573242,
"kl": 0.3516845703125,
"learning_rate": 1e-06,
"loss": 0.0398,
"num_tokens": 13927953.0,
"reward": 0.2614789605140686,
"reward_std": 0.07057315111160278,
"rewards/bleu_reward_func/mean": 0.2614789605140686,
"rewards/bleu_reward_func/std": 0.1882997453212738,
"step": 1071
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 243.78125,
"completions/mean_terminated_length": 168.67999267578125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.8576,
"grad_norm": 5.096418380737305,
"kl": 0.16357421875,
"learning_rate": 1e-06,
"loss": -0.0371,
"num_tokens": 13942570.0,
"reward": 0.07684318721294403,
"reward_std": 0.019258558750152588,
"rewards/bleu_reward_func/mean": 0.07684318721294403,
"rewards/bleu_reward_func/std": 0.03753623366355896,
"step": 1072
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 433.0,
"completions/mean_length": 271.625,
"completions/mean_terminated_length": 107.15789794921875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8584,
"grad_norm": 3.9537734985351562,
"kl": 0.081634521484375,
"learning_rate": 1e-06,
"loss": -0.0181,
"num_tokens": 13956350.0,
"reward": 0.1140797883272171,
"reward_std": 0.023730140179395676,
"rewards/bleu_reward_func/mean": 0.1140797883272171,
"rewards/bleu_reward_func/std": 0.1426122635602951,
"step": 1073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 323.4375,
"completions/mean_terminated_length": 237.72727966308594,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8592,
"grad_norm": 4.239878177642822,
"kl": 0.10797119140625,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 13971620.0,
"reward": 0.10953356325626373,
"reward_std": 0.07727043330669403,
"rewards/bleu_reward_func/mean": 0.10953356325626373,
"rewards/bleu_reward_func/std": 0.1143500879406929,
"step": 1074
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 447.5625,
"completions/mean_terminated_length": 383.125,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.86,
"grad_norm": 2.2002503871917725,
"kl": 0.049072265625,
"learning_rate": 1e-06,
"loss": -0.0472,
"num_tokens": 13990150.0,
"reward": 0.08343654125928879,
"reward_std": 0.02118324115872383,
"rewards/bleu_reward_func/mean": 0.08343654125928879,
"rewards/bleu_reward_func/std": 0.08093992620706558,
"step": 1075
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 472.125,
"completions/mean_terminated_length": 420.8571472167969,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.8608,
"grad_norm": 2.0650570392608643,
"kl": 0.032440185546875,
"learning_rate": 1e-06,
"loss": -0.0088,
"num_tokens": 14011170.0,
"reward": 0.04162130132317543,
"reward_std": 0.010478474199771881,
"rewards/bleu_reward_func/mean": 0.04162130132317543,
"rewards/bleu_reward_func/std": 0.017952080816030502,
"step": 1076
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 242.78125,
"completions/mean_terminated_length": 58.578948974609375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.8616,
"grad_norm": 4.682583808898926,
"kl": 0.316986083984375,
"learning_rate": 1e-06,
"loss": -0.0201,
"num_tokens": 14025299.0,
"reward": 0.2864699065685272,
"reward_std": 0.03820549696683884,
"rewards/bleu_reward_func/mean": 0.2864699065685272,
"rewards/bleu_reward_func/std": 0.26346415281295776,
"step": 1077
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 165.625,
"completions/mean_terminated_length": 50.16666793823242,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.8624,
"grad_norm": 7.724006652832031,
"kl": 0.23388671875,
"learning_rate": 1e-06,
"loss": -0.0182,
"num_tokens": 14037399.0,
"reward": 0.1795015037059784,
"reward_std": 0.07820923626422882,
"rewards/bleu_reward_func/mean": 0.1795015037059784,
"rewards/bleu_reward_func/std": 0.15337687730789185,
"step": 1078
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 172.0,
"completions/mean_length": 276.09375,
"completions/mean_terminated_length": 40.1875,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8632,
"grad_norm": 8.850776672363281,
"kl": 0.24285888671875,
"learning_rate": 1e-06,
"loss": -0.1453,
"num_tokens": 14052562.0,
"reward": 0.09584256261587143,
"reward_std": 0.01827467978000641,
"rewards/bleu_reward_func/mean": 0.09584256261587143,
"rewards/bleu_reward_func/std": 0.10066576302051544,
"step": 1079
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 174.0,
"completions/mean_length": 203.1875,
"completions/mean_terminated_length": 82.34782409667969,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.864,
"grad_norm": 4.735725402832031,
"kl": 0.0994873046875,
"learning_rate": 1e-06,
"loss": 0.2218,
"num_tokens": 14061568.0,
"reward": 0.27030178904533386,
"reward_std": 0.057654060423374176,
"rewards/bleu_reward_func/mean": 0.27030178904533386,
"rewards/bleu_reward_func/std": 0.16439270973205566,
"step": 1080
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 392.0,
"completions/mean_length": 237.875,
"completions/mean_terminated_length": 174.61538696289062,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8648,
"grad_norm": 4.542383193969727,
"kl": 0.0601806640625,
"learning_rate": 1e-06,
"loss": -0.1576,
"num_tokens": 14071236.0,
"reward": 0.05943232774734497,
"reward_std": 0.03797609731554985,
"rewards/bleu_reward_func/mean": 0.05943232774734497,
"rewards/bleu_reward_func/std": 0.07494883239269257,
"step": 1081
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 214.625,
"completions/mean_terminated_length": 146.0,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.8656,
"grad_norm": 9.019255638122559,
"kl": 0.340179443359375,
"learning_rate": 1e-06,
"loss": 0.0456,
"num_tokens": 14080680.0,
"reward": 0.09385368227958679,
"reward_std": 0.041810497641563416,
"rewards/bleu_reward_func/mean": 0.09385368227958679,
"rewards/bleu_reward_func/std": 0.04523298144340515,
"step": 1082
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 111.3125,
"completions/mean_terminated_length": 54.07143020629883,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.8664,
"grad_norm": 5.487109184265137,
"kl": 0.3203125,
"learning_rate": 1e-06,
"loss": 0.195,
"num_tokens": 14093178.0,
"reward": 0.2254057228565216,
"reward_std": 0.0354473814368248,
"rewards/bleu_reward_func/mean": 0.2254057228565216,
"rewards/bleu_reward_func/std": 0.15529486536979675,
"step": 1083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 473.03125,
"completions/mean_terminated_length": 408.0833435058594,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.8672,
"grad_norm": 2.1760456562042236,
"kl": 0.05029296875,
"learning_rate": 1e-06,
"loss": -0.1127,
"num_tokens": 14112195.0,
"reward": 0.11573594808578491,
"reward_std": 0.034562353044748306,
"rewards/bleu_reward_func/mean": 0.11573594808578491,
"rewards/bleu_reward_func/std": 0.0883888527750969,
"step": 1084
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 248.8125,
"completions/mean_terminated_length": 110.95238494873047,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.868,
"grad_norm": 20.237722396850586,
"kl": 0.27996826171875,
"learning_rate": 1e-06,
"loss": -0.0279,
"num_tokens": 14125485.0,
"reward": 0.06478704512119293,
"reward_std": 0.01746372878551483,
"rewards/bleu_reward_func/mean": 0.06478704512119293,
"rewards/bleu_reward_func/std": 0.04760226234793663,
"step": 1085
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 254.21875,
"completions/mean_terminated_length": 194.73077392578125,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.8688,
"grad_norm": 4.712401390075684,
"kl": 0.119384765625,
"learning_rate": 1e-06,
"loss": 0.2418,
"num_tokens": 14136628.0,
"reward": 0.07501716911792755,
"reward_std": 0.022581705823540688,
"rewards/bleu_reward_func/mean": 0.07501716911792755,
"rewards/bleu_reward_func/std": 0.045875921845436096,
"step": 1086
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 214.4375,
"completions/mean_terminated_length": 145.7692413330078,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.8696,
"grad_norm": 7.036587715148926,
"kl": 0.40423583984375,
"learning_rate": 1e-06,
"loss": -0.0659,
"num_tokens": 14152290.0,
"reward": 0.24361515045166016,
"reward_std": 0.05023983493447304,
"rewards/bleu_reward_func/mean": 0.24361515045166016,
"rewards/bleu_reward_func/std": 0.2318515181541443,
"step": 1087
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 310.15625,
"completions/mean_terminated_length": 281.3214416503906,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.8704,
"grad_norm": 2.8048856258392334,
"kl": 0.06768798828125,
"learning_rate": 1e-06,
"loss": -0.1012,
"num_tokens": 14164015.0,
"reward": 0.06632909178733826,
"reward_std": 0.022660713642835617,
"rewards/bleu_reward_func/mean": 0.06632909178733826,
"rewards/bleu_reward_func/std": 0.05323861911892891,
"step": 1088
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 246.0,
"completions/mean_length": 72.03125,
"completions/mean_terminated_length": 57.838706970214844,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.8712,
"grad_norm": 9.539746284484863,
"kl": 0.28515625,
"learning_rate": 1e-06,
"loss": 0.2369,
"num_tokens": 14175360.0,
"reward": 0.2565135359764099,
"reward_std": 0.06622748076915741,
"rewards/bleu_reward_func/mean": 0.2565135359764099,
"rewards/bleu_reward_func/std": 0.2024916261434555,
"step": 1089
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 253.8125,
"completions/mean_terminated_length": 136.4545440673828,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.872,
"grad_norm": 8.92574691772461,
"kl": 0.2467041015625,
"learning_rate": 1e-06,
"loss": 0.2116,
"num_tokens": 14187954.0,
"reward": 0.10993756353855133,
"reward_std": 0.041833557188510895,
"rewards/bleu_reward_func/mean": 0.10993756353855133,
"rewards/bleu_reward_func/std": 0.13020949065685272,
"step": 1090
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 193.625,
"completions/mean_terminated_length": 104.47999572753906,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.8728,
"grad_norm": 5.538515090942383,
"kl": 0.0902099609375,
"learning_rate": 1e-06,
"loss": -0.1573,
"num_tokens": 14196270.0,
"reward": 0.05786508321762085,
"reward_std": 0.02371850796043873,
"rewards/bleu_reward_func/mean": 0.05786508321762085,
"rewards/bleu_reward_func/std": 0.03462414816021919,
"step": 1091
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 223.09375,
"completions/mean_terminated_length": 156.42308044433594,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.8736,
"grad_norm": 5.6724443435668945,
"kl": 0.179168701171875,
"learning_rate": 1e-06,
"loss": 0.0128,
"num_tokens": 14208633.0,
"reward": 0.107764333486557,
"reward_std": 0.021315133199095726,
"rewards/bleu_reward_func/mean": 0.107764333486557,
"rewards/bleu_reward_func/std": 0.038448914885520935,
"step": 1092
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 166.78125,
"completions/mean_terminated_length": 143.7666778564453,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.8744,
"grad_norm": 5.278230667114258,
"kl": 0.2412109375,
"learning_rate": 1e-06,
"loss": -0.0013,
"num_tokens": 14218242.0,
"reward": 0.4157559275627136,
"reward_std": 0.037054967135190964,
"rewards/bleu_reward_func/mean": 0.4157559275627136,
"rewards/bleu_reward_func/std": 0.2559570372104645,
"step": 1093
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 313.09375,
"completions/mean_terminated_length": 114.1875,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.8752,
"grad_norm": 3.892812490463257,
"kl": 0.13092041015625,
"learning_rate": 1e-06,
"loss": 0.1517,
"num_tokens": 14232805.0,
"reward": 0.12693487107753754,
"reward_std": 0.04035983234643936,
"rewards/bleu_reward_func/mean": 0.12693487107753754,
"rewards/bleu_reward_func/std": 0.12727496027946472,
"step": 1094
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 205.1875,
"completions/mean_terminated_length": 161.35714721679688,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.876,
"grad_norm": 8.024927139282227,
"kl": 0.350433349609375,
"learning_rate": 1e-06,
"loss": 0.2352,
"num_tokens": 14244755.0,
"reward": 0.20502734184265137,
"reward_std": 0.09303287416696548,
"rewards/bleu_reward_func/mean": 0.20502734184265137,
"rewards/bleu_reward_func/std": 0.241799458861351,
"step": 1095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 361.1875,
"completions/mean_terminated_length": 282.19049072265625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8768,
"grad_norm": 2.944213390350342,
"kl": 0.068603515625,
"learning_rate": 1e-06,
"loss": -0.106,
"num_tokens": 14257801.0,
"reward": 0.04917216673493385,
"reward_std": 0.011258791200816631,
"rewards/bleu_reward_func/mean": 0.04917216673493385,
"rewards/bleu_reward_func/std": 0.038949303328990936,
"step": 1096
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 234.21875,
"completions/mean_terminated_length": 67.55000305175781,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8776,
"grad_norm": 6.089639663696289,
"kl": 0.22784423828125,
"learning_rate": 1e-06,
"loss": 0.1116,
"num_tokens": 14267648.0,
"reward": 0.10858422517776489,
"reward_std": 0.04780227690935135,
"rewards/bleu_reward_func/mean": 0.10858422517776489,
"rewards/bleu_reward_func/std": 0.07767506688833237,
"step": 1097
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 414.0,
"completions/mean_length": 383.21875,
"completions/mean_terminated_length": 217.6428680419922,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8784,
"grad_norm": 2.4417130947113037,
"kl": 0.039581298828125,
"learning_rate": 1e-06,
"loss": 0.101,
"num_tokens": 14284503.0,
"reward": 0.1223280131816864,
"reward_std": 0.058498185127973557,
"rewards/bleu_reward_func/mean": 0.1223280131816864,
"rewards/bleu_reward_func/std": 0.08976288139820099,
"step": 1098
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 390.0,
"completions/mean_length": 200.5,
"completions/mean_terminated_length": 78.60869598388672,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.8792,
"grad_norm": 6.311417102813721,
"kl": 0.34259033203125,
"learning_rate": 1e-06,
"loss": 0.0608,
"num_tokens": 14294455.0,
"reward": 0.12745052576065063,
"reward_std": 0.048099152743816376,
"rewards/bleu_reward_func/mean": 0.12745052576065063,
"rewards/bleu_reward_func/std": 0.12118762731552124,
"step": 1099
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 294.375,
"completions/mean_terminated_length": 244.1538543701172,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.88,
"grad_norm": 5.328085422515869,
"kl": 0.184539794921875,
"learning_rate": 1e-06,
"loss": -0.0179,
"num_tokens": 14311835.0,
"reward": 0.2192595899105072,
"reward_std": 0.042960211634635925,
"rewards/bleu_reward_func/mean": 0.2192595899105072,
"rewards/bleu_reward_func/std": 0.13524703681468964,
"step": 1100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 324.625,
"completions/mean_terminated_length": 262.16668701171875,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.8808,
"grad_norm": 2.6405608654022217,
"kl": 0.0440673828125,
"learning_rate": 1e-06,
"loss": 0.1205,
"num_tokens": 14327975.0,
"reward": 0.14465495944023132,
"reward_std": 0.04526882618665695,
"rewards/bleu_reward_func/mean": 0.14465495944023132,
"rewards/bleu_reward_func/std": 0.1039966493844986,
"step": 1101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 466.0,
"completions/mean_length": 336.125,
"completions/mean_terminated_length": 180.94117736816406,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.8816,
"grad_norm": 3.4212841987609863,
"kl": 0.0875244140625,
"learning_rate": 1e-06,
"loss": 0.1697,
"num_tokens": 14343595.0,
"reward": 0.07303881645202637,
"reward_std": 0.023782189935445786,
"rewards/bleu_reward_func/mean": 0.07303881645202637,
"rewards/bleu_reward_func/std": 0.10003393888473511,
"step": 1102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 226.0,
"completions/mean_terminated_length": 145.9199981689453,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.8824,
"grad_norm": 7.256896018981934,
"kl": 0.2523193359375,
"learning_rate": 1e-06,
"loss": 0.0389,
"num_tokens": 14357595.0,
"reward": 0.14129553735256195,
"reward_std": 0.05766978859901428,
"rewards/bleu_reward_func/mean": 0.14129553735256195,
"rewards/bleu_reward_func/std": 0.13982893526554108,
"step": 1103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 375.65625,
"completions/mean_terminated_length": 313.68182373046875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.8832,
"grad_norm": 2.429725170135498,
"kl": 0.0618896484375,
"learning_rate": 1e-06,
"loss": 0.0492,
"num_tokens": 14372368.0,
"reward": 0.0918026864528656,
"reward_std": 0.019557196646928787,
"rewards/bleu_reward_func/mean": 0.0918026864528656,
"rewards/bleu_reward_func/std": 0.0768144428730011,
"step": 1104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 121.0,
"completions/mean_length": 81.0625,
"completions/mean_terminated_length": 36.482757568359375,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.884,
"grad_norm": 10.26905345916748,
"kl": 0.4246826171875,
"learning_rate": 1e-06,
"loss": 0.4653,
"num_tokens": 14379298.0,
"reward": 0.1903451383113861,
"reward_std": 0.0727916806936264,
"rewards/bleu_reward_func/mean": 0.1903451383113861,
"rewards/bleu_reward_func/std": 0.18681129813194275,
"step": 1105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 307.0,
"completions/mean_length": 218.09375,
"completions/mean_terminated_length": 103.08695983886719,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8848,
"grad_norm": 6.238147735595703,
"kl": 0.2109375,
"learning_rate": 1e-06,
"loss": 0.1147,
"num_tokens": 14390325.0,
"reward": 0.10796605050563812,
"reward_std": 0.028253143653273582,
"rewards/bleu_reward_func/mean": 0.10796605050563812,
"rewards/bleu_reward_func/std": 0.08980042487382889,
"step": 1106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 392.0,
"completions/mean_length": 334.3125,
"completions/mean_terminated_length": 177.5294189453125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.8856,
"grad_norm": 5.977288246154785,
"kl": 0.12933349609375,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 14405719.0,
"reward": 0.08215081691741943,
"reward_std": 0.012335095554590225,
"rewards/bleu_reward_func/mean": 0.08215081691741943,
"rewards/bleu_reward_func/std": 0.0935206413269043,
"step": 1107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 368.25,
"completions/mean_terminated_length": 224.5,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.8864,
"grad_norm": 3.1431217193603516,
"kl": 0.046478271484375,
"learning_rate": 1e-06,
"loss": 0.0529,
"num_tokens": 14420959.0,
"reward": 0.07733479142189026,
"reward_std": 0.04769892990589142,
"rewards/bleu_reward_func/mean": 0.07733479142189026,
"rewards/bleu_reward_func/std": 0.07268877327442169,
"step": 1108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 205.0625,
"completions/mean_terminated_length": 148.22222900390625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.8872,
"grad_norm": 8.718172073364258,
"kl": 0.27215576171875,
"learning_rate": 1e-06,
"loss": -0.0245,
"num_tokens": 14431297.0,
"reward": 0.08119820058345795,
"reward_std": 0.02793770469725132,
"rewards/bleu_reward_func/mean": 0.08119820058345795,
"rewards/bleu_reward_func/std": 0.046033825725317,
"step": 1109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 218.59375,
"completions/mean_terminated_length": 188.2413787841797,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.888,
"grad_norm": 7.532926082611084,
"kl": 0.197265625,
"learning_rate": 1e-06,
"loss": 0.2,
"num_tokens": 14440556.0,
"reward": 0.16878977417945862,
"reward_std": 0.06408128887414932,
"rewards/bleu_reward_func/mean": 0.16878977417945862,
"rewards/bleu_reward_func/std": 0.17819638550281525,
"step": 1110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 254.875,
"completions/mean_terminated_length": 169.1666717529297,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.8888,
"grad_norm": 5.752809047698975,
"kl": 0.151947021484375,
"learning_rate": 1e-06,
"loss": -0.0323,
"num_tokens": 14453832.0,
"reward": 0.08754751831293106,
"reward_std": 0.041982948780059814,
"rewards/bleu_reward_func/mean": 0.08754751831293106,
"rewards/bleu_reward_func/std": 0.08986286073923111,
"step": 1111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 431.9375,
"completions/mean_terminated_length": 298.5,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.8896,
"grad_norm": 2.506591320037842,
"kl": 0.04913330078125,
"learning_rate": 1e-06,
"loss": 0.0059,
"num_tokens": 14470966.0,
"reward": 0.05948667228221893,
"reward_std": 0.031033214181661606,
"rewards/bleu_reward_func/mean": 0.05948667228221893,
"rewards/bleu_reward_func/std": 0.04187482222914696,
"step": 1112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 458.125,
"completions/mean_terminated_length": 404.25,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.8904,
"grad_norm": 2.1371476650238037,
"kl": 0.05169677734375,
"learning_rate": 1e-06,
"loss": 0.0275,
"num_tokens": 14488386.0,
"reward": 0.07635128498077393,
"reward_std": 0.02666424587368965,
"rewards/bleu_reward_func/mean": 0.07635128498077393,
"rewards/bleu_reward_func/std": 0.06230180338025093,
"step": 1113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 381.3125,
"completions/mean_terminated_length": 321.9090881347656,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.8912,
"grad_norm": 2.5617504119873047,
"kl": 0.05352783203125,
"learning_rate": 1e-06,
"loss": -0.0071,
"num_tokens": 14502500.0,
"reward": 0.1177460253238678,
"reward_std": 0.02880302257835865,
"rewards/bleu_reward_func/mean": 0.1177460253238678,
"rewards/bleu_reward_func/std": 0.06036384403705597,
"step": 1114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 354.875,
"completions/mean_terminated_length": 293.39129638671875,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.892,
"grad_norm": 6.915249824523926,
"kl": 0.162353515625,
"learning_rate": 1e-06,
"loss": -0.1012,
"num_tokens": 14515272.0,
"reward": 0.08914826065301895,
"reward_std": 0.03028780408203602,
"rewards/bleu_reward_func/mean": 0.08914826065301895,
"rewards/bleu_reward_func/std": 0.042083028703927994,
"step": 1115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 145.96875,
"completions/mean_terminated_length": 61.500003814697266,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8928,
"grad_norm": 8.211282730102539,
"kl": 0.33642578125,
"learning_rate": 1e-06,
"loss": -0.0331,
"num_tokens": 14523199.0,
"reward": 0.13895554840564728,
"reward_std": 0.06001996994018555,
"rewards/bleu_reward_func/mean": 0.13895554840564728,
"rewards/bleu_reward_func/std": 0.10717976838350296,
"step": 1116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 269.8125,
"completions/mean_terminated_length": 175.04348754882812,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.8936,
"grad_norm": 9.093502044677734,
"kl": 0.18402099609375,
"learning_rate": 1e-06,
"loss": -0.0448,
"num_tokens": 14535921.0,
"reward": 0.08600494265556335,
"reward_std": 0.01430382952094078,
"rewards/bleu_reward_func/mean": 0.08600494265556335,
"rewards/bleu_reward_func/std": 0.03352402523159981,
"step": 1117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 220.65625,
"completions/mean_terminated_length": 166.70370483398438,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.8944,
"grad_norm": 5.133798122406006,
"kl": 0.23126220703125,
"learning_rate": 1e-06,
"loss": -0.0611,
"num_tokens": 14546502.0,
"reward": 0.1281801015138626,
"reward_std": 0.033460669219493866,
"rewards/bleu_reward_func/mean": 0.1281801015138626,
"rewards/bleu_reward_func/std": 0.09999439865350723,
"step": 1118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 414.90625,
"completions/mean_terminated_length": 382.54168701171875,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"epoch": 0.8952,
"grad_norm": 2.3336856365203857,
"kl": 0.05023193359375,
"learning_rate": 1e-06,
"loss": -0.0381,
"num_tokens": 14563043.0,
"reward": 0.09009081870317459,
"reward_std": 0.024957649409770966,
"rewards/bleu_reward_func/mean": 0.09009081870317459,
"rewards/bleu_reward_func/std": 0.07389495521783829,
"step": 1119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 440.0,
"completions/mean_length": 249.375,
"completions/mean_terminated_length": 188.7692413330078,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.896,
"grad_norm": 4.001770496368408,
"kl": 0.06622314453125,
"learning_rate": 1e-06,
"loss": 0.4376,
"num_tokens": 14574703.0,
"reward": 0.12255299836397171,
"reward_std": 0.06612245738506317,
"rewards/bleu_reward_func/mean": 0.12255299836397171,
"rewards/bleu_reward_func/std": 0.1745522916316986,
"step": 1120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 341.96875,
"completions/mean_terminated_length": 264.68182373046875,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.8968,
"grad_norm": 5.190872669219971,
"kl": 0.0662841796875,
"learning_rate": 1e-06,
"loss": -0.2989,
"num_tokens": 14590934.0,
"reward": 0.015649927780032158,
"reward_std": 0.008883368223905563,
"rewards/bleu_reward_func/mean": 0.015649927780032158,
"rewards/bleu_reward_func/std": 0.014249512925744057,
"step": 1121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 204.71875,
"completions/mean_terminated_length": 118.68000030517578,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.8976,
"grad_norm": 7.992037296295166,
"kl": 0.22357177734375,
"learning_rate": 1e-06,
"loss": 0.0293,
"num_tokens": 14601957.0,
"reward": 0.14590373635292053,
"reward_std": 0.032411251217126846,
"rewards/bleu_reward_func/mean": 0.14590373635292053,
"rewards/bleu_reward_func/std": 0.1304609477519989,
"step": 1122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 298.4375,
"completions/mean_terminated_length": 132.3333282470703,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.8984,
"grad_norm": 6.297399044036865,
"kl": 0.05999755859375,
"learning_rate": 1e-06,
"loss": 0.3021,
"num_tokens": 14616019.0,
"reward": 0.071531280875206,
"reward_std": 0.02597668580710888,
"rewards/bleu_reward_func/mean": 0.071531280875206,
"rewards/bleu_reward_func/std": 0.05073075741529465,
"step": 1123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 353.90625,
"completions/mean_terminated_length": 174.73333740234375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.8992,
"grad_norm": 2.9494781494140625,
"kl": 0.0675048828125,
"learning_rate": 1e-06,
"loss": -0.0383,
"num_tokens": 14634248.0,
"reward": 0.20859137177467346,
"reward_std": 0.026030534878373146,
"rewards/bleu_reward_func/mean": 0.20859137177467346,
"rewards/bleu_reward_func/std": 0.25668489933013916,
"step": 1124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 337.03125,
"completions/mean_terminated_length": 200.94444274902344,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.9,
"grad_norm": 3.6260087490081787,
"kl": 0.06976318359375,
"learning_rate": 1e-06,
"loss": -0.0424,
"num_tokens": 14646505.0,
"reward": 0.03524015098810196,
"reward_std": 0.021195726469159126,
"rewards/bleu_reward_func/mean": 0.03524015098810196,
"rewards/bleu_reward_func/std": 0.04081031307578087,
"step": 1125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 377.0,
"completions/mean_length": 228.375,
"completions/mean_terminated_length": 117.39130401611328,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.9008,
"grad_norm": 6.014510154724121,
"kl": 0.232421875,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 14657525.0,
"reward": 0.06341119110584259,
"reward_std": 0.03255104646086693,
"rewards/bleu_reward_func/mean": 0.06341119110584259,
"rewards/bleu_reward_func/std": 0.04315832257270813,
"step": 1126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 238.21875,
"completions/mean_terminated_length": 175.03846740722656,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9016,
"grad_norm": 4.443993091583252,
"kl": 0.07666015625,
"learning_rate": 1e-06,
"loss": 0.1492,
"num_tokens": 14669820.0,
"reward": 0.050140924751758575,
"reward_std": 0.02134326659142971,
"rewards/bleu_reward_func/mean": 0.050140924751758575,
"rewards/bleu_reward_func/std": 0.054666925221681595,
"step": 1127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 206.8125,
"completions/mean_terminated_length": 121.36000061035156,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.9024,
"grad_norm": 6.487152099609375,
"kl": 0.30072021484375,
"learning_rate": 1e-06,
"loss": 0.1095,
"num_tokens": 14678878.0,
"reward": 0.20913344621658325,
"reward_std": 0.06204414367675781,
"rewards/bleu_reward_func/mean": 0.20913344621658325,
"rewards/bleu_reward_func/std": 0.15058699250221252,
"step": 1128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 295.0,
"completions/mean_length": 168.5625,
"completions/mean_terminated_length": 104.96296691894531,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.9032,
"grad_norm": 5.987677097320557,
"kl": 0.161376953125,
"learning_rate": 1e-06,
"loss": 0.1159,
"num_tokens": 14690400.0,
"reward": 0.22108127176761627,
"reward_std": 0.03181886300444603,
"rewards/bleu_reward_func/mean": 0.22108127176761627,
"rewards/bleu_reward_func/std": 0.21734413504600525,
"step": 1129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 224.40625,
"completions/mean_terminated_length": 194.65516662597656,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.904,
"grad_norm": 4.591039180755615,
"kl": 0.1512451171875,
"learning_rate": 1e-06,
"loss": 0.0318,
"num_tokens": 14703621.0,
"reward": 0.13979627192020416,
"reward_std": 0.024196792393922806,
"rewards/bleu_reward_func/mean": 0.13979627192020416,
"rewards/bleu_reward_func/std": 0.11370246112346649,
"step": 1130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 191.96875,
"completions/mean_terminated_length": 170.6333465576172,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.9048,
"grad_norm": 9.648924827575684,
"kl": 0.3162841796875,
"learning_rate": 1e-06,
"loss": 0.0993,
"num_tokens": 14715324.0,
"reward": 0.24474243819713593,
"reward_std": 0.03903892636299133,
"rewards/bleu_reward_func/mean": 0.24474243819713593,
"rewards/bleu_reward_func/std": 0.111121766269207,
"step": 1131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 311.0,
"completions/mean_length": 335.125,
"completions/mean_terminated_length": 158.25,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.9056,
"grad_norm": 3.5864415168762207,
"kl": 0.05462646484375,
"learning_rate": 1e-06,
"loss": 0.0438,
"num_tokens": 14728656.0,
"reward": 0.08658318221569061,
"reward_std": 0.03171471878886223,
"rewards/bleu_reward_func/mean": 0.08658318221569061,
"rewards/bleu_reward_func/std": 0.05243143439292908,
"step": 1132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 444.0,
"completions/mean_length": 196.90625,
"completions/mean_terminated_length": 186.74192810058594,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.9064,
"grad_norm": 5.391827583312988,
"kl": 0.149871826171875,
"learning_rate": 1e-06,
"loss": 0.209,
"num_tokens": 14743445.0,
"reward": 0.14488989114761353,
"reward_std": 0.05979035794734955,
"rewards/bleu_reward_func/mean": 0.14488989114761353,
"rewards/bleu_reward_func/std": 0.11484233289957047,
"step": 1133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 147.9375,
"completions/mean_terminated_length": 26.58333396911621,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.9072,
"grad_norm": 9.304245948791504,
"kl": 0.4073486328125,
"learning_rate": 1e-06,
"loss": 0.0082,
"num_tokens": 14755539.0,
"reward": 0.2897959053516388,
"reward_std": 0.11407680809497833,
"rewards/bleu_reward_func/mean": 0.2897959053516388,
"rewards/bleu_reward_func/std": 0.3296494483947754,
"step": 1134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 302.0625,
"completions/mean_terminated_length": 219.9130401611328,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.908,
"grad_norm": 4.041188716888428,
"kl": 0.08953857421875,
"learning_rate": 1e-06,
"loss": 0.0784,
"num_tokens": 14771221.0,
"reward": 0.0867033302783966,
"reward_std": 0.03230535611510277,
"rewards/bleu_reward_func/mean": 0.0867033302783966,
"rewards/bleu_reward_func/std": 0.05712318420410156,
"step": 1135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 353.21875,
"completions/mean_terminated_length": 244.57894897460938,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.9088,
"grad_norm": 3.4682111740112305,
"kl": 0.063140869140625,
"learning_rate": 1e-06,
"loss": -0.013,
"num_tokens": 14786884.0,
"reward": 0.2040112018585205,
"reward_std": 0.03282826021313667,
"rewards/bleu_reward_func/mean": 0.2040112018585205,
"rewards/bleu_reward_func/std": 0.2342006117105484,
"step": 1136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 323.46875,
"completions/mean_terminated_length": 176.8333282470703,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9096,
"grad_norm": 9.80130672454834,
"kl": 0.296661376953125,
"learning_rate": 1e-06,
"loss": 0.0879,
"num_tokens": 14807267.0,
"reward": 0.15428093075752258,
"reward_std": 0.047303371131420135,
"rewards/bleu_reward_func/mean": 0.15428093075752258,
"rewards/bleu_reward_func/std": 0.12975330650806427,
"step": 1137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 213.3125,
"completions/mean_terminated_length": 113.75,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.9104,
"grad_norm": 6.25737190246582,
"kl": 0.125823974609375,
"learning_rate": 1e-06,
"loss": 0.0125,
"num_tokens": 14815933.0,
"reward": 0.0721752792596817,
"reward_std": 0.021741271018981934,
"rewards/bleu_reward_func/mean": 0.0721752792596817,
"rewards/bleu_reward_func/std": 0.05829243361949921,
"step": 1138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 349.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 107.125,
"completions/mean_terminated_length": 107.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.9112,
"grad_norm": 25.477249145507812,
"kl": 0.55877685546875,
"learning_rate": 1e-06,
"loss": -0.2259,
"num_tokens": 14825321.0,
"reward": 0.1802026480436325,
"reward_std": 0.11938925087451935,
"rewards/bleu_reward_func/mean": 0.1802026480436325,
"rewards/bleu_reward_func/std": 0.1613418012857437,
"step": 1139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 201.0,
"completions/max_terminated_length": 201.0,
"completions/mean_length": 105.3125,
"completions/mean_terminated_length": 105.3125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.912,
"grad_norm": 11.604246139526367,
"kl": 0.2550048828125,
"learning_rate": 1e-06,
"loss": -0.0128,
"num_tokens": 14831115.0,
"reward": 0.19547826051712036,
"reward_std": 0.07176055759191513,
"rewards/bleu_reward_func/mean": 0.19547826051712036,
"rewards/bleu_reward_func/std": 0.11230416595935822,
"step": 1140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 255.59375,
"completions/mean_terminated_length": 208.11111450195312,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.9128,
"grad_norm": 6.936506271362305,
"kl": 0.14453125,
"learning_rate": 1e-06,
"loss": -0.1892,
"num_tokens": 14847286.0,
"reward": 0.07467533648014069,
"reward_std": 0.0442538745701313,
"rewards/bleu_reward_func/mean": 0.07467533648014069,
"rewards/bleu_reward_func/std": 0.0976758524775505,
"step": 1141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 491.0,
"completions/mean_length": 397.8125,
"completions/mean_terminated_length": 309.0,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.9136,
"grad_norm": 3.533027410507202,
"kl": 0.1064453125,
"learning_rate": 1e-06,
"loss": 0.2572,
"num_tokens": 14862600.0,
"reward": 0.07055975496768951,
"reward_std": 0.024620652198791504,
"rewards/bleu_reward_func/mean": 0.07055975496768951,
"rewards/bleu_reward_func/std": 0.04198145866394043,
"step": 1142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 374.34375,
"completions/mean_terminated_length": 144.9166717529297,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.9144,
"grad_norm": 3.106947422027588,
"kl": 0.05267333984375,
"learning_rate": 1e-06,
"loss": 0.3129,
"num_tokens": 14879211.0,
"reward": 0.047079749405384064,
"reward_std": 0.01572955772280693,
"rewards/bleu_reward_func/mean": 0.047079749405384064,
"rewards/bleu_reward_func/std": 0.03182151913642883,
"step": 1143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 489.0,
"completions/mean_length": 296.21875,
"completions/mean_terminated_length": 289.258056640625,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.9152,
"grad_norm": 9.539961814880371,
"kl": 0.26116943359375,
"learning_rate": 1e-06,
"loss": 0.3821,
"num_tokens": 14892074.0,
"reward": 0.04650323465466499,
"reward_std": 0.016895011067390442,
"rewards/bleu_reward_func/mean": 0.04650323465466499,
"rewards/bleu_reward_func/std": 0.020110802724957466,
"step": 1144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 237.75,
"completions/mean_terminated_length": 50.105262756347656,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.916,
"grad_norm": 10.258657455444336,
"kl": 0.3250732421875,
"learning_rate": 1e-06,
"loss": -0.0619,
"num_tokens": 14904698.0,
"reward": 0.17951351404190063,
"reward_std": 0.07376629114151001,
"rewards/bleu_reward_func/mean": 0.17951351404190063,
"rewards/bleu_reward_func/std": 0.17956046760082245,
"step": 1145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 286.78125,
"completions/mean_terminated_length": 151.65000915527344,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.9168,
"grad_norm": 6.667459964752197,
"kl": 0.08001708984375,
"learning_rate": 1e-06,
"loss": 0.5727,
"num_tokens": 14917451.0,
"reward": 0.03467312082648277,
"reward_std": 0.01676066778600216,
"rewards/bleu_reward_func/mean": 0.03467312082648277,
"rewards/bleu_reward_func/std": 0.02004723809659481,
"step": 1146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 29.0,
"completions/mean_length": 266.90625,
"completions/mean_terminated_length": 21.8125,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9176,
"grad_norm": 7.602176189422607,
"kl": 0.4158935546875,
"learning_rate": 1e-06,
"loss": 0.038,
"num_tokens": 14932352.0,
"reward": 0.19471007585525513,
"reward_std": 0.04646201431751251,
"rewards/bleu_reward_func/mean": 0.19471007585525513,
"rewards/bleu_reward_func/std": 0.1760382354259491,
"step": 1147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 315.15625,
"completions/mean_terminated_length": 162.05555725097656,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.9184,
"grad_norm": 6.479866027832031,
"kl": 0.109619140625,
"learning_rate": 1e-06,
"loss": 0.0595,
"num_tokens": 14947741.0,
"reward": 0.10786274820566177,
"reward_std": 0.036432720720767975,
"rewards/bleu_reward_func/mean": 0.10786274820566177,
"rewards/bleu_reward_func/std": 0.0789819210767746,
"step": 1148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 472.5,
"completions/mean_terminated_length": 427.7333679199219,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.9192,
"grad_norm": 2.4095027446746826,
"kl": 0.0462646484375,
"learning_rate": 1e-06,
"loss": 0.0305,
"num_tokens": 14966173.0,
"reward": 0.1280764639377594,
"reward_std": 0.03749135136604309,
"rewards/bleu_reward_func/mean": 0.1280764639377594,
"rewards/bleu_reward_func/std": 0.05864708498120308,
"step": 1149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 64.0,
"completions/mean_length": 255.90625,
"completions/mean_terminated_length": 29.941177368164062,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.92,
"grad_norm": 10.67159652709961,
"kl": 0.1556396484375,
"learning_rate": 1e-06,
"loss": 0.1007,
"num_tokens": 14979570.0,
"reward": 0.055816084146499634,
"reward_std": 0.020302332937717438,
"rewards/bleu_reward_func/mean": 0.055816084146499634,
"rewards/bleu_reward_func/std": 0.03035581111907959,
"step": 1150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 210.78125,
"completions/mean_terminated_length": 167.75,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.9208,
"grad_norm": 6.78237771987915,
"kl": 0.17840576171875,
"learning_rate": 1e-06,
"loss": 0.09,
"num_tokens": 14993923.0,
"reward": 0.07397650182247162,
"reward_std": 0.01999451220035553,
"rewards/bleu_reward_func/mean": 0.07397650182247162,
"rewards/bleu_reward_func/std": 0.0341508574783802,
"step": 1151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 227.34375,
"completions/mean_terminated_length": 115.95652770996094,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9216,
"grad_norm": 9.095620155334473,
"kl": 0.483001708984375,
"learning_rate": 1e-06,
"loss": 0.2875,
"num_tokens": 15005734.0,
"reward": 0.1028270274400711,
"reward_std": 0.05264887586236,
"rewards/bleu_reward_func/mean": 0.1028270274400711,
"rewards/bleu_reward_func/std": 0.08930659294128418,
"step": 1152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 436.0625,
"completions/mean_terminated_length": 269.0,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.9224,
"grad_norm": 2.607079029083252,
"kl": 0.04913330078125,
"learning_rate": 1e-06,
"loss": 0.2182,
"num_tokens": 15021632.0,
"reward": 0.08679656684398651,
"reward_std": 0.05561990663409233,
"rewards/bleu_reward_func/mean": 0.08679656684398651,
"rewards/bleu_reward_func/std": 0.09985605627298355,
"step": 1153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 248.28125,
"completions/mean_terminated_length": 128.40908813476562,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.9232,
"grad_norm": 6.871771812438965,
"kl": 0.18536376953125,
"learning_rate": 1e-06,
"loss": 0.02,
"num_tokens": 15035081.0,
"reward": 0.19525909423828125,
"reward_std": 0.04538525268435478,
"rewards/bleu_reward_func/mean": 0.19525909423828125,
"rewards/bleu_reward_func/std": 0.18253828585147858,
"step": 1154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 156.0,
"completions/mean_length": 285.65625,
"completions/mean_terminated_length": 59.3125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.924,
"grad_norm": 6.028687953948975,
"kl": 0.0938720703125,
"learning_rate": 1e-06,
"loss": 0.2695,
"num_tokens": 15049558.0,
"reward": 0.09561645239591599,
"reward_std": 0.0469173789024353,
"rewards/bleu_reward_func/mean": 0.09561645239591599,
"rewards/bleu_reward_func/std": 0.07949265837669373,
"step": 1155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 237.875,
"completions/mean_terminated_length": 161.1199951171875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.9248,
"grad_norm": 18.29808807373047,
"kl": 0.55096435546875,
"learning_rate": 1e-06,
"loss": 0.0911,
"num_tokens": 15060954.0,
"reward": 0.1312306672334671,
"reward_std": 0.05109435319900513,
"rewards/bleu_reward_func/mean": 0.1312306672334671,
"rewards/bleu_reward_func/std": 0.0968712568283081,
"step": 1156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 183.40625,
"completions/mean_terminated_length": 73.875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.9256,
"grad_norm": 8.553780555725098,
"kl": 0.2044677734375,
"learning_rate": 1e-06,
"loss": 0.3837,
"num_tokens": 15071151.0,
"reward": 0.08946996927261353,
"reward_std": 0.032011546194553375,
"rewards/bleu_reward_func/mean": 0.08946996927261353,
"rewards/bleu_reward_func/std": 0.08429201692342758,
"step": 1157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 187.40625,
"completions/mean_terminated_length": 153.8275909423828,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.9264,
"grad_norm": 7.326318740844727,
"kl": 0.14752197265625,
"learning_rate": 1e-06,
"loss": 0.5061,
"num_tokens": 15080252.0,
"reward": 0.1023559644818306,
"reward_std": 0.045405931770801544,
"rewards/bleu_reward_func/mean": 0.1023559644818306,
"rewards/bleu_reward_func/std": 0.07145705074071884,
"step": 1158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 304.4375,
"completions/mean_terminated_length": 246.3199920654297,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.9272,
"grad_norm": 5.513801574707031,
"kl": 0.1817626953125,
"learning_rate": 1e-06,
"loss": 0.1149,
"num_tokens": 15095610.0,
"reward": 0.11080615222454071,
"reward_std": 0.043468981981277466,
"rewards/bleu_reward_func/mean": 0.11080615222454071,
"rewards/bleu_reward_func/std": 0.04713428020477295,
"step": 1159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 347.0,
"completions/mean_length": 205.40625,
"completions/mean_terminated_length": 161.60714721679688,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.928,
"grad_norm": 7.627071857452393,
"kl": 0.318359375,
"learning_rate": 1e-06,
"loss": 0.1805,
"num_tokens": 15104167.0,
"reward": 0.0735570564866066,
"reward_std": 0.02132660523056984,
"rewards/bleu_reward_func/mean": 0.0735570564866066,
"rewards/bleu_reward_func/std": 0.05421363562345505,
"step": 1160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 256.25,
"completions/mean_terminated_length": 156.17391967773438,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.9288,
"grad_norm": 6.11322021484375,
"kl": 0.27099609375,
"learning_rate": 1e-06,
"loss": 0.1028,
"num_tokens": 15118823.0,
"reward": 0.1414988487958908,
"reward_std": 0.03222941979765892,
"rewards/bleu_reward_func/mean": 0.1414988487958908,
"rewards/bleu_reward_func/std": 0.15351709723472595,
"step": 1161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 417.8125,
"completions/mean_terminated_length": 238.0,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.9296,
"grad_norm": 3.5942113399505615,
"kl": 0.08154296875,
"learning_rate": 1e-06,
"loss": 0.1689,
"num_tokens": 15139105.0,
"reward": 0.12588296830654144,
"reward_std": 0.04371759667992592,
"rewards/bleu_reward_func/mean": 0.12588296830654144,
"rewards/bleu_reward_func/std": 0.14081913232803345,
"step": 1162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 318.125,
"completions/mean_terminated_length": 273.3846130371094,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.9304,
"grad_norm": 3.5201575756073,
"kl": 0.145751953125,
"learning_rate": 1e-06,
"loss": -0.0396,
"num_tokens": 15151437.0,
"reward": 0.06649903953075409,
"reward_std": 0.024907082319259644,
"rewards/bleu_reward_func/mean": 0.06649903953075409,
"rewards/bleu_reward_func/std": 0.04641694948077202,
"step": 1163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 263.8125,
"completions/mean_terminated_length": 166.69564819335938,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9312,
"grad_norm": 5.745899200439453,
"kl": 0.124755859375,
"learning_rate": 1e-06,
"loss": 0.1729,
"num_tokens": 15164791.0,
"reward": 0.33342817425727844,
"reward_std": 0.06225915253162384,
"rewards/bleu_reward_func/mean": 0.33342817425727844,
"rewards/bleu_reward_func/std": 0.3276880383491516,
"step": 1164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 280.1875,
"completions/mean_terminated_length": 272.70965576171875,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.932,
"grad_norm": 7.059730052947998,
"kl": 0.154541015625,
"learning_rate": 1e-06,
"loss": 0.2383,
"num_tokens": 15176629.0,
"reward": 0.03160897642374039,
"reward_std": 0.010618302971124649,
"rewards/bleu_reward_func/mean": 0.03160897642374039,
"rewards/bleu_reward_func/std": 0.016612010076642036,
"step": 1165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 295.59375,
"completions/mean_terminated_length": 281.16668701171875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.9328,
"grad_norm": 5.280132293701172,
"kl": 0.13616943359375,
"learning_rate": 1e-06,
"loss": 0.3116,
"num_tokens": 15188168.0,
"reward": 0.03221059590578079,
"reward_std": 0.02213170751929283,
"rewards/bleu_reward_func/mean": 0.03221059590578079,
"rewards/bleu_reward_func/std": 0.03985392674803734,
"step": 1166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 227.375,
"completions/mean_terminated_length": 132.5,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.9336,
"grad_norm": 6.5651021003723145,
"kl": 0.142578125,
"learning_rate": 1e-06,
"loss": 0.0658,
"num_tokens": 15201892.0,
"reward": 0.1807694286108017,
"reward_std": 0.1409318894147873,
"rewards/bleu_reward_func/mean": 0.1807694286108017,
"rewards/bleu_reward_func/std": 0.2797906994819641,
"step": 1167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 220.5,
"completions/mean_terminated_length": 190.34483337402344,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.9344,
"grad_norm": 7.170085430145264,
"kl": 0.400634765625,
"learning_rate": 1e-06,
"loss": 0.0572,
"num_tokens": 15212724.0,
"reward": 0.1451285183429718,
"reward_std": 0.0673985704779625,
"rewards/bleu_reward_func/mean": 0.1451285183429718,
"rewards/bleu_reward_func/std": 0.12026475369930267,
"step": 1168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 313.75,
"completions/mean_terminated_length": 247.6666717529297,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.9352,
"grad_norm": 5.603696823120117,
"kl": 0.152587890625,
"learning_rate": 1e-06,
"loss": 0.0616,
"num_tokens": 15228428.0,
"reward": 0.253650963306427,
"reward_std": 0.03022560104727745,
"rewards/bleu_reward_func/mean": 0.253650963306427,
"rewards/bleu_reward_func/std": 0.3357136845588684,
"step": 1169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 255.125,
"completions/mean_terminated_length": 154.60870361328125,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.936,
"grad_norm": 6.110992431640625,
"kl": 0.1976318359375,
"learning_rate": 1e-06,
"loss": -0.0162,
"num_tokens": 15242768.0,
"reward": 0.0904015377163887,
"reward_std": 0.025095967575907707,
"rewards/bleu_reward_func/mean": 0.0904015377163887,
"rewards/bleu_reward_func/std": 0.09678779542446136,
"step": 1170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 241.625,
"completions/mean_terminated_length": 213.65516662597656,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.9368,
"grad_norm": 7.32260799407959,
"kl": 0.28955078125,
"learning_rate": 1e-06,
"loss": 0.2098,
"num_tokens": 15256260.0,
"reward": 0.09696318954229355,
"reward_std": 0.04769141972064972,
"rewards/bleu_reward_func/mean": 0.09696318954229355,
"rewards/bleu_reward_func/std": 0.07404191046953201,
"step": 1171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 189.96875,
"completions/mean_terminated_length": 156.65516662597656,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.9376,
"grad_norm": 8.43685531616211,
"kl": 0.293212890625,
"learning_rate": 1e-06,
"loss": -0.1046,
"num_tokens": 15269899.0,
"reward": 0.09695740044116974,
"reward_std": 0.037594642490148544,
"rewards/bleu_reward_func/mean": 0.09695740044116974,
"rewards/bleu_reward_func/std": 0.05750608071684837,
"step": 1172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 343.03125,
"completions/mean_terminated_length": 125.78572082519531,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9384,
"grad_norm": 5.582010269165039,
"kl": 0.410400390625,
"learning_rate": 1e-06,
"loss": -0.1385,
"num_tokens": 15287108.0,
"reward": 0.1451946198940277,
"reward_std": 0.03915205970406532,
"rewards/bleu_reward_func/mean": 0.1451946198940277,
"rewards/bleu_reward_func/std": 0.17974776029586792,
"step": 1173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 331.84375,
"completions/mean_terminated_length": 261.34783935546875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.9392,
"grad_norm": 11.770851135253906,
"kl": 0.3424072265625,
"learning_rate": 1e-06,
"loss": 0.1869,
"num_tokens": 15304535.0,
"reward": 0.1413375586271286,
"reward_std": 0.06474150717258453,
"rewards/bleu_reward_func/mean": 0.1413375586271286,
"rewards/bleu_reward_func/std": 0.0782981589436531,
"step": 1174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 299.46875,
"completions/mean_terminated_length": 188.1428680419922,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.94,
"grad_norm": 5.467903137207031,
"kl": 0.198486328125,
"learning_rate": 1e-06,
"loss": -0.1137,
"num_tokens": 15317078.0,
"reward": 0.14337725937366486,
"reward_std": 0.03686724230647087,
"rewards/bleu_reward_func/mean": 0.14337725937366486,
"rewards/bleu_reward_func/std": 0.16096609830856323,
"step": 1175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 253.8125,
"completions/mean_terminated_length": 181.51998901367188,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.9408,
"grad_norm": 8.620965957641602,
"kl": 0.379150390625,
"learning_rate": 1e-06,
"loss": -0.0426,
"num_tokens": 15328768.0,
"reward": 0.1426527500152588,
"reward_std": 0.0550708994269371,
"rewards/bleu_reward_func/mean": 0.1426527500152588,
"rewards/bleu_reward_func/std": 0.12621666491031647,
"step": 1176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 193.21875,
"completions/mean_terminated_length": 86.95833587646484,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.9416,
"grad_norm": 10.362127304077148,
"kl": 0.565673828125,
"learning_rate": 1e-06,
"loss": 0.0348,
"num_tokens": 15340919.0,
"reward": 0.04140050709247589,
"reward_std": 0.019718483090400696,
"rewards/bleu_reward_func/mean": 0.04140050709247589,
"rewards/bleu_reward_func/std": 0.03685431182384491,
"step": 1177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 349.5625,
"completions/mean_terminated_length": 223.22222900390625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.9424,
"grad_norm": 5.097819805145264,
"kl": 0.2220458984375,
"learning_rate": 1e-06,
"loss": 0.0039,
"num_tokens": 15356545.0,
"reward": 0.14949634671211243,
"reward_std": 0.05013212561607361,
"rewards/bleu_reward_func/mean": 0.14949634671211243,
"rewards/bleu_reward_func/std": 0.19787877798080444,
"step": 1178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 403.0,
"completions/mean_length": 160.03125,
"completions/mean_terminated_length": 123.62068939208984,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9432,
"grad_norm": 10.772759437561035,
"kl": 0.300537109375,
"learning_rate": 1e-06,
"loss": 0.0346,
"num_tokens": 15364554.0,
"reward": 0.1290975958108902,
"reward_std": 0.07744569331407547,
"rewards/bleu_reward_func/mean": 0.1290975958108902,
"rewards/bleu_reward_func/std": 0.1356077641248703,
"step": 1179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 378.1875,
"completions/mean_terminated_length": 122.72727966308594,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.944,
"grad_norm": 6.254019260406494,
"kl": 0.2237548828125,
"learning_rate": 1e-06,
"loss": 0.0559,
"num_tokens": 15382792.0,
"reward": 0.11257205903530121,
"reward_std": 0.036544833332300186,
"rewards/bleu_reward_func/mean": 0.11257205903530121,
"rewards/bleu_reward_func/std": 0.10338166356086731,
"step": 1180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 214.0,
"completions/mean_length": 185.75,
"completions/mean_terminated_length": 37.45454788208008,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9448,
"grad_norm": 30.484220504760742,
"kl": 1.228515625,
"learning_rate": 1e-06,
"loss": 0.7897,
"num_tokens": 15391000.0,
"reward": 0.2840408384799957,
"reward_std": 0.15822984278202057,
"rewards/bleu_reward_func/mean": 0.2840408384799957,
"rewards/bleu_reward_func/std": 0.1896887719631195,
"step": 1181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 237.8125,
"completions/mean_terminated_length": 130.52174377441406,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9456,
"grad_norm": 18.71758270263672,
"kl": 1.376953125,
"learning_rate": 1e-06,
"loss": 0.403,
"num_tokens": 15401594.0,
"reward": 0.16319331526756287,
"reward_std": 0.07705336064100266,
"rewards/bleu_reward_func/mean": 0.16319331526756287,
"rewards/bleu_reward_func/std": 0.21551194787025452,
"step": 1182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 301.6875,
"completions/mean_terminated_length": 175.5,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.9464,
"grad_norm": 10.93260383605957,
"kl": 0.89013671875,
"learning_rate": 1e-06,
"loss": 0.073,
"num_tokens": 15415696.0,
"reward": 0.1893438994884491,
"reward_std": 0.050289541482925415,
"rewards/bleu_reward_func/mean": 0.1893438994884491,
"rewards/bleu_reward_func/std": 0.2888805866241455,
"step": 1183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 490.0,
"completions/mean_length": 409.75,
"completions/mean_terminated_length": 184.8000030517578,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9472,
"grad_norm": 18.76006317138672,
"kl": 0.81097412109375,
"learning_rate": 1e-06,
"loss": 0.1983,
"num_tokens": 15436760.0,
"reward": 0.06023106724023819,
"reward_std": 0.03375660628080368,
"rewards/bleu_reward_func/mean": 0.06023106724023819,
"rewards/bleu_reward_func/std": 0.07748028635978699,
"step": 1184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 319.875,
"completions/mean_terminated_length": 188.42105102539062,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.948,
"grad_norm": 8.845364570617676,
"kl": 1.5517578125,
"learning_rate": 1e-06,
"loss": 0.1722,
"num_tokens": 15449196.0,
"reward": 0.08687852323055267,
"reward_std": 0.02910173125565052,
"rewards/bleu_reward_func/mean": 0.08687852323055267,
"rewards/bleu_reward_func/std": 0.06549690663814545,
"step": 1185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 192.5625,
"completions/mean_terminated_length": 86.08333587646484,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.9488,
"grad_norm": 17.081815719604492,
"kl": 0.970703125,
"learning_rate": 1e-06,
"loss": 0.1558,
"num_tokens": 15458006.0,
"reward": 0.10367533564567566,
"reward_std": 0.04477589949965477,
"rewards/bleu_reward_func/mean": 0.10367533564567566,
"rewards/bleu_reward_func/std": 0.1153038814663887,
"step": 1186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 357.0,
"completions/mean_length": 165.3125,
"completions/mean_terminated_length": 115.78572082519531,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.9496,
"grad_norm": 18.12542152404785,
"kl": 1.48876953125,
"learning_rate": 1e-06,
"loss": 0.1714,
"num_tokens": 15469048.0,
"reward": 0.14421464502811432,
"reward_std": 0.05387473851442337,
"rewards/bleu_reward_func/mean": 0.14421464502811432,
"rewards/bleu_reward_func/std": 0.11958708614110947,
"step": 1187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 170.8125,
"completions/mean_terminated_length": 92.0769271850586,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.9504,
"grad_norm": 19.755910873413086,
"kl": 1.64697265625,
"learning_rate": 1e-06,
"loss": 0.083,
"num_tokens": 15478610.0,
"reward": 0.11767937242984772,
"reward_std": 0.038736552000045776,
"rewards/bleu_reward_func/mean": 0.11767937242984772,
"rewards/bleu_reward_func/std": 0.14759457111358643,
"step": 1188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 303.34375,
"completions/mean_terminated_length": 233.7916717529297,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.9512,
"grad_norm": 7.415678024291992,
"kl": 0.464111328125,
"learning_rate": 1e-06,
"loss": -0.0998,
"num_tokens": 15493469.0,
"reward": 0.13566911220550537,
"reward_std": 0.03367416933178902,
"rewards/bleu_reward_func/mean": 0.13566911220550537,
"rewards/bleu_reward_func/std": 0.16353026032447815,
"step": 1189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 371.59375,
"completions/mean_terminated_length": 307.7727355957031,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.952,
"grad_norm": 4.499618053436279,
"kl": 0.21990966796875,
"learning_rate": 1e-06,
"loss": -0.0327,
"num_tokens": 15510992.0,
"reward": 0.05285275727510452,
"reward_std": 0.026856746524572372,
"rewards/bleu_reward_func/mean": 0.05285275727510452,
"rewards/bleu_reward_func/std": 0.028620464727282524,
"step": 1190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 211.5,
"completions/mean_terminated_length": 155.8518524169922,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.9528,
"grad_norm": 11.282941818237305,
"kl": 0.353515625,
"learning_rate": 1e-06,
"loss": -0.0409,
"num_tokens": 15519336.0,
"reward": 0.09386638551950455,
"reward_std": 0.03402595967054367,
"rewards/bleu_reward_func/mean": 0.09386638551950455,
"rewards/bleu_reward_func/std": 0.08018817007541656,
"step": 1191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 53.5625,
"completions/mean_terminated_length": 53.5625,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9536,
"grad_norm": 30.713850021362305,
"kl": 0.921142578125,
"learning_rate": 1e-06,
"loss": 0.5322,
"num_tokens": 15524298.0,
"reward": 0.1515873372554779,
"reward_std": 0.03387141600251198,
"rewards/bleu_reward_func/mean": 0.1515873372554779,
"rewards/bleu_reward_func/std": 0.14795146882534027,
"step": 1192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 307.78125,
"completions/mean_terminated_length": 148.94444274902344,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.9544,
"grad_norm": 8.172345161437988,
"kl": 0.3499755859375,
"learning_rate": 1e-06,
"loss": -0.1691,
"num_tokens": 15539683.0,
"reward": 0.06142358481884003,
"reward_std": 0.017768073827028275,
"rewards/bleu_reward_func/mean": 0.06142358481884003,
"rewards/bleu_reward_func/std": 0.02769811637699604,
"step": 1193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 59.1875,
"completions/mean_terminated_length": 44.58064270019531,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9552,
"grad_norm": 10.542518615722656,
"kl": 0.6220703125,
"learning_rate": 1e-06,
"loss": -0.2357,
"num_tokens": 15546073.0,
"reward": 0.13228365778923035,
"reward_std": 0.05075054615736008,
"rewards/bleu_reward_func/mean": 0.13228365778923035,
"rewards/bleu_reward_func/std": 0.14825788140296936,
"step": 1194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 351.75,
"completions/mean_terminated_length": 255.60000610351562,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.956,
"grad_norm": 14.762670516967773,
"kl": 0.54180908203125,
"learning_rate": 1e-06,
"loss": 0.1487,
"num_tokens": 15561137.0,
"reward": 0.10621648281812668,
"reward_std": 0.04206620901823044,
"rewards/bleu_reward_func/mean": 0.10621648281812668,
"rewards/bleu_reward_func/std": 0.05661296099424362,
"step": 1195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 363.1875,
"completions/mean_terminated_length": 285.23809814453125,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.9568,
"grad_norm": 2.930482864379883,
"kl": 0.08624267578125,
"learning_rate": 1e-06,
"loss": -0.2447,
"num_tokens": 15575311.0,
"reward": 0.026056351140141487,
"reward_std": 0.01072642207145691,
"rewards/bleu_reward_func/mean": 0.026056351140141487,
"rewards/bleu_reward_func/std": 0.014662106521427631,
"step": 1196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 488.0,
"completions/mean_length": 334.6875,
"completions/mean_terminated_length": 228.3000030517578,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.9576,
"grad_norm": 4.765532493591309,
"kl": 0.20489501953125,
"learning_rate": 1e-06,
"loss": -0.0713,
"num_tokens": 15591997.0,
"reward": 0.18965111672878265,
"reward_std": 0.03347271308302879,
"rewards/bleu_reward_func/mean": 0.18965111672878265,
"rewards/bleu_reward_func/std": 0.24074162542819977,
"step": 1197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 206.71875,
"completions/mean_terminated_length": 186.36668395996094,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9584,
"grad_norm": 9.595136642456055,
"kl": 0.4879150390625,
"learning_rate": 1e-06,
"loss": -0.0063,
"num_tokens": 15601852.0,
"reward": 0.07087633013725281,
"reward_std": 0.024902882054448128,
"rewards/bleu_reward_func/mean": 0.07087633013725281,
"rewards/bleu_reward_func/std": 0.052057161927223206,
"step": 1198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 286.59375,
"completions/mean_terminated_length": 223.47999572753906,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.9592,
"grad_norm": 5.445902347564697,
"kl": 0.289306640625,
"learning_rate": 1e-06,
"loss": -0.0083,
"num_tokens": 15613743.0,
"reward": 0.10761390626430511,
"reward_std": 0.03891483694314957,
"rewards/bleu_reward_func/mean": 0.10761390626430511,
"rewards/bleu_reward_func/std": 0.10792107880115509,
"step": 1199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 112.0,
"completions/max_terminated_length": 112.0,
"completions/mean_length": 29.75,
"completions/mean_terminated_length": 29.75,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.96,
"grad_norm": 16.023887634277344,
"kl": 1.11669921875,
"learning_rate": 1e-06,
"loss": 0.1468,
"num_tokens": 15619671.0,
"reward": 0.3471377491950989,
"reward_std": 0.05158979445695877,
"rewards/bleu_reward_func/mean": 0.3471377491950989,
"rewards/bleu_reward_func/std": 0.13238590955734253,
"step": 1200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 325.5,
"completions/mean_terminated_length": 213.60000610351562,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.9608,
"grad_norm": 5.491018295288086,
"kl": 0.32122802734375,
"learning_rate": 1e-06,
"loss": -0.2311,
"num_tokens": 15634047.0,
"reward": 0.07880916446447372,
"reward_std": 0.030750975012779236,
"rewards/bleu_reward_func/mean": 0.07880916446447372,
"rewards/bleu_reward_func/std": 0.06850366294384003,
"step": 1201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 264.5625,
"completions/mean_terminated_length": 248.06668090820312,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.9616,
"grad_norm": 4.983512878417969,
"kl": 0.1092529296875,
"learning_rate": 1e-06,
"loss": -0.1154,
"num_tokens": 15647369.0,
"reward": 0.09226585179567337,
"reward_std": 0.04809027165174484,
"rewards/bleu_reward_func/mean": 0.09226585179567337,
"rewards/bleu_reward_func/std": 0.14570128917694092,
"step": 1202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 419.0,
"completions/max_terminated_length": 419.0,
"completions/mean_length": 160.71875,
"completions/mean_terminated_length": 160.71875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.9624,
"grad_norm": 4.987871170043945,
"kl": 0.1322021484375,
"learning_rate": 1e-06,
"loss": 0.0973,
"num_tokens": 15656384.0,
"reward": 0.036719270050525665,
"reward_std": 0.007080578710883856,
"rewards/bleu_reward_func/mean": 0.036719270050525665,
"rewards/bleu_reward_func/std": 0.018336299806833267,
"step": 1203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 238.03125,
"completions/mean_terminated_length": 146.70834350585938,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9632,
"grad_norm": 4.710721492767334,
"kl": 0.24249267578125,
"learning_rate": 1e-06,
"loss": -0.0163,
"num_tokens": 15666769.0,
"reward": 0.11304133385419846,
"reward_std": 0.03101547807455063,
"rewards/bleu_reward_func/mean": 0.11304133385419846,
"rewards/bleu_reward_func/std": 0.09199430793523788,
"step": 1204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 417.0,
"completions/max_terminated_length": 417.0,
"completions/mean_length": 129.78125,
"completions/mean_terminated_length": 129.78125,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.964,
"grad_norm": 9.284921646118164,
"kl": 0.5504150390625,
"learning_rate": 1e-06,
"loss": -0.0268,
"num_tokens": 15674122.0,
"reward": 0.07591907680034637,
"reward_std": 0.03484845906496048,
"rewards/bleu_reward_func/mean": 0.07591907680034637,
"rewards/bleu_reward_func/std": 0.05640895664691925,
"step": 1205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 280.15625,
"completions/mean_terminated_length": 158.71429443359375,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.9648,
"grad_norm": 5.940964698791504,
"kl": 0.234375,
"learning_rate": 1e-06,
"loss": 0.2716,
"num_tokens": 15690119.0,
"reward": 0.19026660919189453,
"reward_std": 0.06492812931537628,
"rewards/bleu_reward_func/mean": 0.19026660919189453,
"rewards/bleu_reward_func/std": 0.1680937260389328,
"step": 1206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 258.0,
"completions/mean_length": 190.96875,
"completions/mean_terminated_length": 131.51852416992188,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.9656,
"grad_norm": 4.697986602783203,
"kl": 0.17156982421875,
"learning_rate": 1e-06,
"loss": 0.1839,
"num_tokens": 15700302.0,
"reward": 0.29912805557250977,
"reward_std": 0.05129002407193184,
"rewards/bleu_reward_func/mean": 0.29912805557250977,
"rewards/bleu_reward_func/std": 0.33928823471069336,
"step": 1207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 403.34375,
"completions/mean_terminated_length": 346.4285888671875,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.9664,
"grad_norm": 3.426858901977539,
"kl": 0.05426025390625,
"learning_rate": 1e-06,
"loss": -0.041,
"num_tokens": 15718081.0,
"reward": 0.10875709354877472,
"reward_std": 0.017351722344756126,
"rewards/bleu_reward_func/mean": 0.10875709354877472,
"rewards/bleu_reward_func/std": 0.1039399579167366,
"step": 1208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 407.0,
"completions/mean_length": 236.59375,
"completions/mean_terminated_length": 111.40909576416016,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.9672,
"grad_norm": 5.4925079345703125,
"kl": 0.2030029296875,
"learning_rate": 1e-06,
"loss": -0.1031,
"num_tokens": 15728028.0,
"reward": 0.04499085620045662,
"reward_std": 0.01842951774597168,
"rewards/bleu_reward_func/mean": 0.04499085620045662,
"rewards/bleu_reward_func/std": 0.030968643724918365,
"step": 1209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 140.625,
"completions/mean_terminated_length": 128.64515686035156,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.968,
"grad_norm": 9.575289726257324,
"kl": 0.20196533203125,
"learning_rate": 1e-06,
"loss": 0.1409,
"num_tokens": 15735296.0,
"reward": 0.07829822599887848,
"reward_std": 0.03164747357368469,
"rewards/bleu_reward_func/mean": 0.07829822599887848,
"rewards/bleu_reward_func/std": 0.07768744975328445,
"step": 1210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 338.1875,
"completions/mean_terminated_length": 298.0769348144531,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.9688,
"grad_norm": 2.519958019256592,
"kl": 0.042205810546875,
"learning_rate": 1e-06,
"loss": 0.2122,
"num_tokens": 15750286.0,
"reward": 0.11583074182271957,
"reward_std": 0.0300702303647995,
"rewards/bleu_reward_func/mean": 0.11583074182271957,
"rewards/bleu_reward_func/std": 0.0626337081193924,
"step": 1211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 202.1875,
"completions/mean_terminated_length": 98.91667175292969,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9696,
"grad_norm": 7.297961235046387,
"kl": 0.3997802734375,
"learning_rate": 1e-06,
"loss": 0.1691,
"num_tokens": 15759468.0,
"reward": 0.14121456444263458,
"reward_std": 0.051856689155101776,
"rewards/bleu_reward_func/mean": 0.14121456444263458,
"rewards/bleu_reward_func/std": 0.1731884926557541,
"step": 1212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 183.1875,
"completions/mean_terminated_length": 33.727272033691406,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9704,
"grad_norm": 6.750447750091553,
"kl": 0.354522705078125,
"learning_rate": 1e-06,
"loss": 0.3063,
"num_tokens": 15771298.0,
"reward": 0.20002232491970062,
"reward_std": 0.05593840777873993,
"rewards/bleu_reward_func/mean": 0.20002232491970062,
"rewards/bleu_reward_func/std": 0.1818784922361374,
"step": 1213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 509.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 155.59375,
"completions/mean_terminated_length": 155.59375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.9712,
"grad_norm": 8.60672378540039,
"kl": 0.29888916015625,
"learning_rate": 1e-06,
"loss": -0.1436,
"num_tokens": 15781949.0,
"reward": 0.07949218153953552,
"reward_std": 0.030030012130737305,
"rewards/bleu_reward_func/mean": 0.07949218153953552,
"rewards/bleu_reward_func/std": 0.05354390665888786,
"step": 1214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 293.0,
"completions/mean_terminated_length": 193.4545440673828,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.972,
"grad_norm": 4.160828590393066,
"kl": 0.09783935546875,
"learning_rate": 1e-06,
"loss": -0.0156,
"num_tokens": 15795061.0,
"reward": 0.1600840538740158,
"reward_std": 0.05235850065946579,
"rewards/bleu_reward_func/mean": 0.1600840538740158,
"rewards/bleu_reward_func/std": 0.0939527377486229,
"step": 1215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 251.0,
"completions/mean_length": 148.8125,
"completions/mean_terminated_length": 65.0,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.9728,
"grad_norm": 5.613617420196533,
"kl": 0.1966552734375,
"learning_rate": 1e-06,
"loss": 0.011,
"num_tokens": 15804343.0,
"reward": 0.1325000822544098,
"reward_std": 0.054465532302856445,
"rewards/bleu_reward_func/mean": 0.1325000822544098,
"rewards/bleu_reward_func/std": 0.15841807425022125,
"step": 1216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 118.34375,
"completions/mean_terminated_length": 118.34375,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.9736,
"grad_norm": 7.5834503173828125,
"kl": 0.2967529296875,
"learning_rate": 1e-06,
"loss": 0.0808,
"num_tokens": 15815658.0,
"reward": 0.243885338306427,
"reward_std": 0.05274055525660515,
"rewards/bleu_reward_func/mean": 0.243885338306427,
"rewards/bleu_reward_func/std": 0.14211414754390717,
"step": 1217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 367.8125,
"completions/mean_terminated_length": 223.625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.9744,
"grad_norm": 3.8433821201324463,
"kl": 0.0970458984375,
"learning_rate": 1e-06,
"loss": -0.0201,
"num_tokens": 15831204.0,
"reward": 0.10006400942802429,
"reward_std": 0.02605537325143814,
"rewards/bleu_reward_func/mean": 0.10006400942802429,
"rewards/bleu_reward_func/std": 0.1093517392873764,
"step": 1218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 327.0,
"completions/mean_length": 191.90625,
"completions/mean_terminated_length": 132.62962341308594,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.9752,
"grad_norm": 25.382192611694336,
"kl": 0.375518798828125,
"learning_rate": 1e-06,
"loss": 0.2354,
"num_tokens": 15842769.0,
"reward": 0.21496494114398956,
"reward_std": 0.08334603905677795,
"rewards/bleu_reward_func/mean": 0.21496494114398956,
"rewards/bleu_reward_func/std": 0.3287891745567322,
"step": 1219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 193.0,
"completions/mean_terminated_length": 68.17391204833984,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.976,
"grad_norm": 8.422406196594238,
"kl": 0.158935546875,
"learning_rate": 1e-06,
"loss": 0.7192,
"num_tokens": 15852937.0,
"reward": 0.21234184503555298,
"reward_std": 0.10839352756738663,
"rewards/bleu_reward_func/mean": 0.21234184503555298,
"rewards/bleu_reward_func/std": 0.17191235721111298,
"step": 1220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 291.15625,
"completions/mean_terminated_length": 250.25926208496094,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.9768,
"grad_norm": 3.8467891216278076,
"kl": 0.101806640625,
"learning_rate": 1e-06,
"loss": -0.0472,
"num_tokens": 15865934.0,
"reward": 0.13597853481769562,
"reward_std": 0.034184906631708145,
"rewards/bleu_reward_func/mean": 0.13597853481769562,
"rewards/bleu_reward_func/std": 0.0799630656838417,
"step": 1221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 268.15625,
"completions/mean_terminated_length": 223.0,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.9776,
"grad_norm": 5.5957794189453125,
"kl": 0.1470947265625,
"learning_rate": 1e-06,
"loss": 0.137,
"num_tokens": 15881691.0,
"reward": 0.11758720874786377,
"reward_std": 0.05352931469678879,
"rewards/bleu_reward_func/mean": 0.11758720874786377,
"rewards/bleu_reward_func/std": 0.08839549124240875,
"step": 1222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 364.28125,
"completions/mean_terminated_length": 174.35714721679688,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.9784,
"grad_norm": 3.5430212020874023,
"kl": 0.0980224609375,
"learning_rate": 1e-06,
"loss": -0.0829,
"num_tokens": 15901716.0,
"reward": 0.10303943604230881,
"reward_std": 0.02435348369181156,
"rewards/bleu_reward_func/mean": 0.10303943604230881,
"rewards/bleu_reward_func/std": 0.10681937634944916,
"step": 1223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 350.09375,
"completions/mean_terminated_length": 207.23529052734375,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.9792,
"grad_norm": 3.817512035369873,
"kl": 0.06549072265625,
"learning_rate": 1e-06,
"loss": -0.2731,
"num_tokens": 15916239.0,
"reward": 0.025896022096276283,
"reward_std": 0.011423053219914436,
"rewards/bleu_reward_func/mean": 0.025896022096276283,
"rewards/bleu_reward_func/std": 0.01915143057703972,
"step": 1224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 512.0,
"completions/max_terminated_length": 58.0,
"completions/mean_length": 148.59375,
"completions/mean_terminated_length": 27.45833396911621,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.98,
"grad_norm": 8.734639167785645,
"kl": 0.29150390625,
"learning_rate": 1e-06,
"loss": 0.1203,
"num_tokens": 15926002.0,
"reward": 0.2994440793991089,
"reward_std": 0.08188341557979584,
"rewards/bleu_reward_func/mean": 0.2994440793991089,
"rewards/bleu_reward_func/std": 0.17080959677696228,
"step": 1225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 146.875,
"completions/mean_terminated_length": 109.10344696044922,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.9808,
"grad_norm": 6.3398637771606445,
"kl": 0.203369140625,
"learning_rate": 1e-06,
"loss": 0.0948,
"num_tokens": 15934822.0,
"reward": 0.08409433811903,
"reward_std": 0.022457323968410492,
"rewards/bleu_reward_func/mean": 0.08409433811903,
"rewards/bleu_reward_func/std": 0.12822076678276062,
"step": 1226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 127.15625,
"completions/mean_terminated_length": 114.74193572998047,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9816,
"grad_norm": 5.545539379119873,
"kl": 0.157470703125,
"learning_rate": 1e-06,
"loss": -0.038,
"num_tokens": 15941411.0,
"reward": 0.291486918926239,
"reward_std": 0.01802459917962551,
"rewards/bleu_reward_func/mean": 0.291486918926239,
"rewards/bleu_reward_func/std": 0.30036208033561707,
"step": 1227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 188.1875,
"completions/mean_terminated_length": 128.22222900390625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.9824,
"grad_norm": 4.385817050933838,
"kl": 0.12310791015625,
"learning_rate": 1e-06,
"loss": -0.2018,
"num_tokens": 15948961.0,
"reward": 0.050382573157548904,
"reward_std": 0.019635431468486786,
"rewards/bleu_reward_func/mean": 0.050382573157548904,
"rewards/bleu_reward_func/std": 0.0410877950489521,
"step": 1228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 167.40625,
"completions/mean_terminated_length": 131.7586212158203,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.9832,
"grad_norm": 6.975030422210693,
"kl": 0.20166015625,
"learning_rate": 1e-06,
"loss": 0.0432,
"num_tokens": 15959414.0,
"reward": 0.10200367867946625,
"reward_std": 0.012780029326677322,
"rewards/bleu_reward_func/mean": 0.10200367867946625,
"rewards/bleu_reward_func/std": 0.07971282303333282,
"step": 1229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 172.25,
"completions/mean_terminated_length": 149.60000610351562,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.984,
"grad_norm": 58.40047836303711,
"kl": 0.6114501953125,
"learning_rate": 1e-06,
"loss": 0.1056,
"num_tokens": 15970374.0,
"reward": 0.21096709370613098,
"reward_std": 0.05436326563358307,
"rewards/bleu_reward_func/mean": 0.21096709370613098,
"rewards/bleu_reward_func/std": 0.21129880845546722,
"step": 1230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 512.0,
"completions/max_terminated_length": 99.0,
"completions/mean_length": 282.875,
"completions/mean_terminated_length": 53.75,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.9848,
"grad_norm": 7.031259536743164,
"kl": 0.173828125,
"learning_rate": 1e-06,
"loss": -0.0097,
"num_tokens": 15985058.0,
"reward": 0.13087573647499084,
"reward_std": 0.03848683089017868,
"rewards/bleu_reward_func/mean": 0.13087573647499084,
"rewards/bleu_reward_func/std": 0.07179337739944458,
"step": 1231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 347.21875,
"completions/mean_terminated_length": 282.7391357421875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.9856,
"grad_norm": 13.313501358032227,
"kl": 0.432952880859375,
"learning_rate": 1e-06,
"loss": 0.1537,
"num_tokens": 16003825.0,
"reward": 0.06137411668896675,
"reward_std": 0.036481164395809174,
"rewards/bleu_reward_func/mean": 0.06137411668896675,
"rewards/bleu_reward_func/std": 0.05317319929599762,
"step": 1232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 114.03125,
"completions/mean_terminated_length": 87.50000762939453,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.9864,
"grad_norm": 12.291428565979004,
"kl": 0.4300537109375,
"learning_rate": 1e-06,
"loss": 0.2836,
"num_tokens": 16012714.0,
"reward": 0.05898230895400047,
"reward_std": 0.024662408977746964,
"rewards/bleu_reward_func/mean": 0.05898230895400047,
"rewards/bleu_reward_func/std": 0.05822930857539177,
"step": 1233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 255.46875,
"completions/mean_terminated_length": 155.0869598388672,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.9872,
"grad_norm": 6.375350475311279,
"kl": 0.20550537109375,
"learning_rate": 1e-06,
"loss": -0.0603,
"num_tokens": 16023033.0,
"reward": 0.12795159220695496,
"reward_std": 0.034560851752758026,
"rewards/bleu_reward_func/mean": 0.12795159220695496,
"rewards/bleu_reward_func/std": 0.05310589075088501,
"step": 1234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 335.375,
"completions/mean_terminated_length": 198.0,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.988,
"grad_norm": 2.701765775680542,
"kl": 0.1783447265625,
"learning_rate": 1e-06,
"loss": 0.016,
"num_tokens": 16037525.0,
"reward": 0.08859970420598984,
"reward_std": 0.012333719059824944,
"rewards/bleu_reward_func/mean": 0.08859970420598984,
"rewards/bleu_reward_func/std": 0.05836745351552963,
"step": 1235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 368.5625,
"completions/mean_terminated_length": 282.5,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.9888,
"grad_norm": 3.917327642440796,
"kl": 0.1302490234375,
"learning_rate": 1e-06,
"loss": -0.3326,
"num_tokens": 16051695.0,
"reward": 0.055808089673519135,
"reward_std": 0.020165979862213135,
"rewards/bleu_reward_func/mean": 0.055808089673519135,
"rewards/bleu_reward_func/std": 0.05799167603254318,
"step": 1236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 258.4375,
"completions/mean_terminated_length": 187.44000244140625,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.9896,
"grad_norm": 11.650849342346191,
"kl": 0.3966064453125,
"learning_rate": 1e-06,
"loss": -0.1616,
"num_tokens": 16063021.0,
"reward": 0.09570951759815216,
"reward_std": 0.041778795421123505,
"rewards/bleu_reward_func/mean": 0.09570951759815216,
"rewards/bleu_reward_func/std": 0.09836214780807495,
"step": 1237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 463.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 130.71875,
"completions/mean_terminated_length": 130.71875,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.9904,
"grad_norm": 6.946449279785156,
"kl": 0.3614501953125,
"learning_rate": 1e-06,
"loss": 0.0432,
"num_tokens": 16073188.0,
"reward": 0.19768103957176208,
"reward_std": 0.05326389521360397,
"rewards/bleu_reward_func/mean": 0.19768103957176208,
"rewards/bleu_reward_func/std": 0.14096693694591522,
"step": 1238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 153.59375,
"completions/mean_terminated_length": 53.23999786376953,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.9912,
"grad_norm": 15.68226432800293,
"kl": 0.60394287109375,
"learning_rate": 1e-06,
"loss": 0.1991,
"num_tokens": 16089367.0,
"reward": 0.19772392511367798,
"reward_std": 0.04295985400676727,
"rewards/bleu_reward_func/mean": 0.19772392511367798,
"rewards/bleu_reward_func/std": 0.15457068383693695,
"step": 1239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 202.1875,
"completions/mean_terminated_length": 181.53334045410156,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.992,
"grad_norm": 5.087695121765137,
"kl": 0.21142578125,
"learning_rate": 1e-06,
"loss": -0.1565,
"num_tokens": 16097533.0,
"reward": 0.04568080976605415,
"reward_std": 0.0273725725710392,
"rewards/bleu_reward_func/mean": 0.04568080976605415,
"rewards/bleu_reward_func/std": 0.05127081274986267,
"step": 1240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 149.0,
"completions/mean_length": 98.1875,
"completions/mean_terminated_length": 55.379310607910156,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.9928,
"grad_norm": 9.800594329833984,
"kl": 0.535400390625,
"learning_rate": 1e-06,
"loss": -0.2267,
"num_tokens": 16104579.0,
"reward": 0.1168278306722641,
"reward_std": 0.04581147059798241,
"rewards/bleu_reward_func/mean": 0.1168278306722641,
"rewards/bleu_reward_func/std": 0.08855386078357697,
"step": 1241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 301.0,
"completions/mean_length": 251.21875,
"completions/mean_terminated_length": 48.38888931274414,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.9936,
"grad_norm": 6.915892124176025,
"kl": 0.300262451171875,
"learning_rate": 1e-06,
"loss": 0.0052,
"num_tokens": 16117106.0,
"reward": 0.21942071616649628,
"reward_std": 0.06735092401504517,
"rewards/bleu_reward_func/mean": 0.21942071616649628,
"rewards/bleu_reward_func/std": 0.1295205056667328,
"step": 1242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 386.9375,
"completions/mean_terminated_length": 311.8999938964844,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.9944,
"grad_norm": 11.362247467041016,
"kl": 0.95458984375,
"learning_rate": 1e-06,
"loss": -0.2592,
"num_tokens": 16131824.0,
"reward": 0.03232087939977646,
"reward_std": 0.018025288358330727,
"rewards/bleu_reward_func/mean": 0.03232087939977646,
"rewards/bleu_reward_func/std": 0.026756620034575462,
"step": 1243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 511.0,
"completions/mean_length": 229.40625,
"completions/mean_terminated_length": 220.29031372070312,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.9952,
"grad_norm": 15.675792694091797,
"kl": 0.33203125,
"learning_rate": 1e-06,
"loss": 0.0615,
"num_tokens": 16145325.0,
"reward": 0.08530285954475403,
"reward_std": 0.03364046663045883,
"rewards/bleu_reward_func/mean": 0.08530285954475403,
"rewards/bleu_reward_func/std": 0.06811228394508362,
"step": 1244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 426.5625,
"completions/mean_terminated_length": 381.8095397949219,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.996,
"grad_norm": 2.0989584922790527,
"kl": 0.0555419921875,
"learning_rate": 1e-06,
"loss": -0.0865,
"num_tokens": 16164487.0,
"reward": 0.10918224602937698,
"reward_std": 0.043439704924821854,
"rewards/bleu_reward_func/mean": 0.10918224602937698,
"rewards/bleu_reward_func/std": 0.09625791013240814,
"step": 1245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 127.90625,
"completions/mean_terminated_length": 115.51612854003906,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9968,
"grad_norm": 8.359559059143066,
"kl": 0.6103515625,
"learning_rate": 1e-06,
"loss": 0.2382,
"num_tokens": 16172076.0,
"reward": 0.0722852572798729,
"reward_std": 0.0363241545855999,
"rewards/bleu_reward_func/mean": 0.0722852572798729,
"rewards/bleu_reward_func/std": 0.05653948336839676,
"step": 1246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 129.0,
"completions/max_terminated_length": 129.0,
"completions/mean_length": 63.21875,
"completions/mean_terminated_length": 63.21875,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.9976,
"grad_norm": 14.668207168579102,
"kl": 0.6405029296875,
"learning_rate": 1e-06,
"loss": 0.1996,
"num_tokens": 16178059.0,
"reward": 0.11710416525602341,
"reward_std": 0.044295113533735275,
"rewards/bleu_reward_func/mean": 0.11710416525602341,
"rewards/bleu_reward_func/std": 0.06186880171298981,
"step": 1247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 184.875,
"completions/mean_terminated_length": 174.32257080078125,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.9984,
"grad_norm": 9.499897956848145,
"kl": 0.4058837890625,
"learning_rate": 1e-06,
"loss": -0.1047,
"num_tokens": 16190527.0,
"reward": 0.14165818691253662,
"reward_std": 0.042991265654563904,
"rewards/bleu_reward_func/mean": 0.14165818691253662,
"rewards/bleu_reward_func/std": 0.1511020064353943,
"step": 1248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 233.25,
"completions/mean_terminated_length": 168.92308044433594,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.9992,
"grad_norm": 6.786505222320557,
"kl": 0.34844970703125,
"learning_rate": 1e-06,
"loss": -0.0929,
"num_tokens": 16203199.0,
"reward": 0.14652788639068604,
"reward_std": 0.05133647471666336,
"rewards/bleu_reward_func/mean": 0.14652788639068604,
"rewards/bleu_reward_func/std": 0.18619418144226074,
"step": 1249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 69.0,
"completions/max_terminated_length": 69.0,
"completions/mean_length": 39.75,
"completions/mean_terminated_length": 39.75,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 1.0,
"grad_norm": 9.613625526428223,
"kl": 0.37677001953125,
"learning_rate": 1e-06,
"loss": 0.061,
"num_tokens": 16214113.0,
"reward": 0.10052811354398727,
"reward_std": 0.05825551599264145,
"rewards/bleu_reward_func/mean": 0.10052811354398727,
"rewards/bleu_reward_func/std": 0.10802065581083298,
"step": 1250
}
],
"logging_steps": 1,
"max_steps": 1250,
"num_input_tokens_seen": 16214113,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}