2912 lines
107 KiB
JSON
2912 lines
107 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9997795900374697,
|
|
"eval_steps": 500,
|
|
"global_step": 567,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 348.9375,
|
|
"completions/max_terminated_length": 348.9375,
|
|
"completions/mean_length": 166.421875,
|
|
"completions/mean_terminated_length": 166.421875,
|
|
"completions/min_length": 46.9375,
|
|
"completions/min_terminated_length": 46.9375,
|
|
"epoch": 0.001763279700242451,
|
|
"grad_norm": 0.9710755373974377,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": -0.1942,
|
|
"num_tokens": 147932.0,
|
|
"reward": 0.7169940862804651,
|
|
"reward_std": 0.2023643054999411,
|
|
"rewards/format_reward/mean": 0.6015625,
|
|
"rewards/format_reward/std": 0.4860950894653797,
|
|
"rewards/qatch_metrics/mean": 0.7311028698459268,
|
|
"rewards/qatch_metrics/std": 0.21332682901993394,
|
|
"rewards/tag_count_reward/mean": 0.7080078125,
|
|
"rewards/tag_count_reward/std": 0.3562574004754424,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 320.859375,
|
|
"completions/max_terminated_length": 320.859375,
|
|
"completions/mean_length": 157.0234375,
|
|
"completions/mean_terminated_length": 157.0234375,
|
|
"completions/min_length": 45.984375,
|
|
"completions/min_terminated_length": 45.984375,
|
|
"epoch": 0.008816398501212255,
|
|
"grad_norm": 1.3231567694690758,
|
|
"kl": 0.000191517174243927,
|
|
"learning_rate": 7.017543859649122e-08,
|
|
"loss": -0.3181,
|
|
"num_tokens": 670628.0,
|
|
"reward": 0.7213839697651565,
|
|
"reward_std": 0.15228567429585382,
|
|
"rewards/format_reward/mean": 0.5888671875,
|
|
"rewards/format_reward/std": 0.4847157197073102,
|
|
"rewards/qatch_metrics/mean": 0.7386370480526239,
|
|
"rewards/qatch_metrics/std": 0.13745591492079257,
|
|
"rewards/tag_count_reward/mean": 0.693115234375,
|
|
"rewards/tag_count_reward/std": 0.36212220159359276,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 336.6,
|
|
"completions/max_terminated_length": 336.6,
|
|
"completions/mean_length": 168.38125,
|
|
"completions/mean_terminated_length": 168.38125,
|
|
"completions/min_length": 47.6625,
|
|
"completions/min_terminated_length": 47.6625,
|
|
"epoch": 0.01763279700242451,
|
|
"grad_norm": 1.2344979720798859,
|
|
"kl": 0.0002501368522644043,
|
|
"learning_rate": 1.5789473684210525e-07,
|
|
"loss": -0.2443,
|
|
"num_tokens": 1345916.0,
|
|
"reward": 0.6422883274964988,
|
|
"reward_std": 0.19879084336571395,
|
|
"rewards/format_reward/mean": 0.596875,
|
|
"rewards/format_reward/std": 0.4875950779765844,
|
|
"rewards/qatch_metrics/mean": 0.6440179711673408,
|
|
"rewards/qatch_metrics/std": 0.19703516885638236,
|
|
"rewards/tag_count_reward/mean": 0.7037109375,
|
|
"rewards/tag_count_reward/std": 0.3609486103057861,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 334.6375,
|
|
"completions/max_terminated_length": 334.6375,
|
|
"completions/mean_length": 172.48515625,
|
|
"completions/mean_terminated_length": 172.48515625,
|
|
"completions/min_length": 47.9625,
|
|
"completions/min_terminated_length": 47.9625,
|
|
"epoch": 0.026449195503636766,
|
|
"grad_norm": 1.0324306738907167,
|
|
"kl": 0.0003110170364379883,
|
|
"learning_rate": 2.456140350877193e-07,
|
|
"loss": -0.2119,
|
|
"num_tokens": 2061897.0,
|
|
"reward": 0.7045063060708344,
|
|
"reward_std": 0.17503634537570179,
|
|
"rewards/format_reward/mean": 0.6765625,
|
|
"rewards/format_reward/std": 0.4584621708840132,
|
|
"rewards/qatch_metrics/mean": 0.7045317724347114,
|
|
"rewards/qatch_metrics/std": 0.17522094006126282,
|
|
"rewards/tag_count_reward/mean": 0.7599609375,
|
|
"rewards/tag_count_reward/std": 0.34046435691416266,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 359.0375,
|
|
"completions/max_terminated_length": 359.0375,
|
|
"completions/mean_length": 213.50859375,
|
|
"completions/mean_terminated_length": 213.50859375,
|
|
"completions/min_length": 63.2125,
|
|
"completions/min_terminated_length": 63.2125,
|
|
"epoch": 0.03526559400484902,
|
|
"grad_norm": 0.7271916297615016,
|
|
"kl": 0.0008575439453125,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": -0.1464,
|
|
"num_tokens": 2779524.0,
|
|
"reward": 0.736725780274719,
|
|
"reward_std": 0.14974234900437294,
|
|
"rewards/format_reward/mean": 0.82265625,
|
|
"rewards/format_reward/std": 0.35349783338606355,
|
|
"rewards/qatch_metrics/mean": 0.7184940161881969,
|
|
"rewards/qatch_metrics/std": 0.14797394773922862,
|
|
"rewards/tag_count_reward/mean": 0.8748046875,
|
|
"rewards/tag_count_reward/std": 0.2501601692289114,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 345.9,
|
|
"completions/max_terminated_length": 345.9,
|
|
"completions/mean_length": 221.61953125,
|
|
"completions/mean_terminated_length": 221.61953125,
|
|
"completions/min_length": 110.9625,
|
|
"completions/min_terminated_length": 110.9625,
|
|
"epoch": 0.04408199250606128,
|
|
"grad_norm": 0.36089994298099687,
|
|
"kl": 0.002716684341430664,
|
|
"learning_rate": 4.2105263157894733e-07,
|
|
"loss": -0.0294,
|
|
"num_tokens": 3532397.0,
|
|
"reward": 0.7203821750357747,
|
|
"reward_std": 0.15227624527178704,
|
|
"rewards/format_reward/mean": 0.94375,
|
|
"rewards/format_reward/std": 0.16674657054245473,
|
|
"rewards/qatch_metrics/mean": 0.6798958362895065,
|
|
"rewards/qatch_metrics/std": 0.1698521633632481,
|
|
"rewards/tag_count_reward/mean": 0.9619140625,
|
|
"rewards/tag_count_reward/std": 0.11083460114896297,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 351.1625,
|
|
"completions/max_terminated_length": 351.1625,
|
|
"completions/mean_length": 233.1609375,
|
|
"completions/mean_terminated_length": 233.1609375,
|
|
"completions/min_length": 146.275,
|
|
"completions/min_terminated_length": 146.275,
|
|
"epoch": 0.05289839100727353,
|
|
"grad_norm": 0.39496099506167615,
|
|
"kl": 0.002033853530883789,
|
|
"learning_rate": 5.087719298245614e-07,
|
|
"loss": 0.0164,
|
|
"num_tokens": 4292411.0,
|
|
"reward": 0.7153215611353516,
|
|
"reward_std": 0.15457577785709872,
|
|
"rewards/format_reward/mean": 0.98828125,
|
|
"rewards/format_reward/std": 0.046875,
|
|
"rewards/qatch_metrics/mean": 0.6668190175667406,
|
|
"rewards/qatch_metrics/std": 0.1794413580093533,
|
|
"rewards/tag_count_reward/mean": 0.9939453125,
|
|
"rewards/tag_count_reward/std": 0.02421875,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 357.1375,
|
|
"completions/max_terminated_length": 357.1375,
|
|
"completions/mean_length": 236.98046875,
|
|
"completions/mean_terminated_length": 236.98046875,
|
|
"completions/min_length": 144.2375,
|
|
"completions/min_terminated_length": 144.2375,
|
|
"epoch": 0.06171478950848578,
|
|
"grad_norm": 0.40335285901379214,
|
|
"kl": 0.004096126556396485,
|
|
"learning_rate": 5.964912280701754e-07,
|
|
"loss": -0.0084,
|
|
"num_tokens": 5094210.0,
|
|
"reward": 0.7295356256887316,
|
|
"reward_std": 0.15927550423293724,
|
|
"rewards/format_reward/mean": 0.99296875,
|
|
"rewards/format_reward/std": 0.028125,
|
|
"rewards/qatch_metrics/mean": 0.6829440153203905,
|
|
"rewards/qatch_metrics/std": 0.1853304866242979,
|
|
"rewards/tag_count_reward/mean": 0.9947265625,
|
|
"rewards/tag_count_reward/std": 0.02109375,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 348.875,
|
|
"completions/max_terminated_length": 348.875,
|
|
"completions/mean_length": 225.73203125,
|
|
"completions/mean_terminated_length": 225.73203125,
|
|
"completions/min_length": 141.75,
|
|
"completions/min_terminated_length": 141.75,
|
|
"epoch": 0.07053118800969804,
|
|
"grad_norm": 0.28998715845893785,
|
|
"kl": 0.001429271697998047,
|
|
"learning_rate": 6.842105263157895e-07,
|
|
"loss": 0.0087,
|
|
"num_tokens": 5833499.0,
|
|
"reward": 0.7490684226155281,
|
|
"reward_std": 0.12702710015701085,
|
|
"rewards/format_reward/mean": 0.9984375,
|
|
"rewards/format_reward/std": 0.00625,
|
|
"rewards/qatch_metrics/mean": 0.7050161464139819,
|
|
"rewards/qatch_metrics/std": 0.14899895801208912,
|
|
"rewards/tag_count_reward/mean": 0.99921875,
|
|
"rewards/tag_count_reward/std": 0.003125,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 337.7625,
|
|
"completions/max_terminated_length": 337.7625,
|
|
"completions/mean_length": 234.1765625,
|
|
"completions/mean_terminated_length": 234.1765625,
|
|
"completions/min_length": 149.025,
|
|
"completions/min_terminated_length": 149.025,
|
|
"epoch": 0.0793475865109103,
|
|
"grad_norm": 0.3515825769486025,
|
|
"kl": 0.001779651641845703,
|
|
"learning_rate": 7.719298245614034e-07,
|
|
"loss": 0.0103,
|
|
"num_tokens": 6625773.0,
|
|
"reward": 0.7813593098893762,
|
|
"reward_std": 0.1548473397095222,
|
|
"rewards/format_reward/mean": 0.99765625,
|
|
"rewards/format_reward/std": 0.009375,
|
|
"rewards/qatch_metrics/mean": 0.7431203166022897,
|
|
"rewards/qatch_metrics/std": 0.18166826255619525,
|
|
"rewards/tag_count_reward/mean": 0.998828125,
|
|
"rewards/tag_count_reward/std": 0.0046875,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 345.325,
|
|
"completions/max_terminated_length": 345.325,
|
|
"completions/mean_length": 230.6203125,
|
|
"completions/mean_terminated_length": 230.6203125,
|
|
"completions/min_length": 150.1625,
|
|
"completions/min_terminated_length": 150.1625,
|
|
"epoch": 0.08816398501212255,
|
|
"grad_norm": 0.30438917372160834,
|
|
"kl": 0.0018757820129394532,
|
|
"learning_rate": 8.596491228070175e-07,
|
|
"loss": 0.0006,
|
|
"num_tokens": 7364007.0,
|
|
"reward": 0.7656656216830016,
|
|
"reward_std": 0.1071579195689992,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.724415887008945,
|
|
"rewards/qatch_metrics/std": 0.12601496450661215,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 340.725,
|
|
"completions/max_terminated_length": 340.725,
|
|
"completions/mean_length": 219.0265625,
|
|
"completions/mean_terminated_length": 219.0265625,
|
|
"completions/min_length": 140.775,
|
|
"completions/min_terminated_length": 140.775,
|
|
"epoch": 0.09698038351333481,
|
|
"grad_norm": 0.32051285299560317,
|
|
"kl": 0.002438640594482422,
|
|
"learning_rate": 9.473684210526315e-07,
|
|
"loss": 0.0062,
|
|
"num_tokens": 8093081.0,
|
|
"reward": 0.7895074604079128,
|
|
"reward_std": 0.10295408805832267,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7524651106446981,
|
|
"rewards/qatch_metrics/std": 0.12109476723708212,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 333.6125,
|
|
"completions/max_terminated_length": 333.6125,
|
|
"completions/mean_length": 214.33203125,
|
|
"completions/mean_terminated_length": 214.33203125,
|
|
"completions/min_length": 138.125,
|
|
"completions/min_terminated_length": 138.125,
|
|
"epoch": 0.10579678201454706,
|
|
"grad_norm": 0.3687506384245934,
|
|
"kl": 0.0034315109252929686,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0167,
|
|
"num_tokens": 8839074.0,
|
|
"reward": 0.7963492956012488,
|
|
"reward_std": 0.13961084922775627,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.76051432879176,
|
|
"rewards/qatch_metrics/std": 0.16409257110208272,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 314.45,
|
|
"completions/max_terminated_length": 314.45,
|
|
"completions/mean_length": 205.66484375,
|
|
"completions/mean_terminated_length": 205.66484375,
|
|
"completions/min_length": 138.325,
|
|
"completions/min_terminated_length": 138.325,
|
|
"epoch": 0.11461318051575932,
|
|
"grad_norm": 0.3598691865301115,
|
|
"kl": 0.004790306091308594,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0072,
|
|
"num_tokens": 9592197.0,
|
|
"reward": 0.7755136819556355,
|
|
"reward_std": 0.12071140363659652,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.735898441291647,
|
|
"rewards/qatch_metrics/std": 0.14201342400228895,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 329.4875,
|
|
"completions/max_terminated_length": 329.4875,
|
|
"completions/mean_length": 218.82734375,
|
|
"completions/mean_terminated_length": 218.82734375,
|
|
"completions/min_length": 144.225,
|
|
"completions/min_terminated_length": 144.225,
|
|
"epoch": 0.12342957901697156,
|
|
"grad_norm": 0.40718780074992844,
|
|
"kl": 0.005454826354980469,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0052,
|
|
"num_tokens": 10323832.0,
|
|
"reward": 0.8082829523831606,
|
|
"reward_std": 0.12197014213888906,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7744505235925316,
|
|
"rewards/qatch_metrics/std": 0.1434942939085886,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 336.075,
|
|
"completions/max_terminated_length": 336.075,
|
|
"completions/mean_length": 220.25234375,
|
|
"completions/mean_terminated_length": 220.25234375,
|
|
"completions/min_length": 142.125,
|
|
"completions/min_terminated_length": 142.125,
|
|
"epoch": 0.13224597751818382,
|
|
"grad_norm": 0.4066036566641524,
|
|
"kl": 0.005206871032714844,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.003,
|
|
"num_tokens": 11058347.0,
|
|
"reward": 0.7646768478676677,
|
|
"reward_std": 0.15216110937763005,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7231492260238156,
|
|
"rewards/qatch_metrics/std": 0.17901307856664062,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 323.4625,
|
|
"completions/max_terminated_length": 323.4625,
|
|
"completions/mean_length": 212.896875,
|
|
"completions/mean_terminated_length": 212.896875,
|
|
"completions/min_length": 138.65,
|
|
"completions/min_terminated_length": 138.65,
|
|
"epoch": 0.14106237601939609,
|
|
"grad_norm": 0.3757999153884869,
|
|
"kl": 0.006637382507324219,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0123,
|
|
"num_tokens": 11818007.0,
|
|
"reward": 0.7600463172420859,
|
|
"reward_std": 0.11857278576935641,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7177130273543298,
|
|
"rewards/qatch_metrics/std": 0.1394514435902238,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 318.2375,
|
|
"completions/max_terminated_length": 318.2375,
|
|
"completions/mean_length": 212.54140625,
|
|
"completions/mean_terminated_length": 212.54140625,
|
|
"completions/min_length": 140.9125,
|
|
"completions/min_terminated_length": 140.9125,
|
|
"epoch": 0.14987877452060833,
|
|
"grad_norm": 0.2843877461846444,
|
|
"kl": 0.006991195678710938,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0103,
|
|
"num_tokens": 12562892.0,
|
|
"reward": 0.7912707846611738,
|
|
"reward_std": 0.12222371264360846,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7544361999258399,
|
|
"rewards/qatch_metrics/std": 0.14379260735586286,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 316.575,
|
|
"completions/max_terminated_length": 316.575,
|
|
"completions/mean_length": 211.7421875,
|
|
"completions/mean_terminated_length": 211.7421875,
|
|
"completions/min_length": 143.1125,
|
|
"completions/min_terminated_length": 143.1125,
|
|
"epoch": 0.1586951730218206,
|
|
"grad_norm": 0.33442141566841666,
|
|
"kl": 0.00792999267578125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.001,
|
|
"num_tokens": 13308402.0,
|
|
"reward": 0.8190947765484452,
|
|
"reward_std": 0.12180806350661441,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7871703177705058,
|
|
"rewards/qatch_metrics/std": 0.14330361068132333,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 327.6,
|
|
"completions/max_terminated_length": 327.6,
|
|
"completions/mean_length": 220.39140625,
|
|
"completions/mean_terminated_length": 220.39140625,
|
|
"completions/min_length": 144.9375,
|
|
"completions/min_terminated_length": 144.9375,
|
|
"epoch": 0.16751157152303284,
|
|
"grad_norm": 0.2060349260059472,
|
|
"kl": 0.007589149475097656,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0053,
|
|
"num_tokens": 14090727.0,
|
|
"reward": 0.8084983274340629,
|
|
"reward_std": 0.09074296336621046,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.774703911319375,
|
|
"rewards/qatch_metrics/std": 0.1067564318422228,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 345.4375,
|
|
"completions/max_terminated_length": 345.4375,
|
|
"completions/mean_length": 233.321875,
|
|
"completions/mean_terminated_length": 233.321875,
|
|
"completions/min_length": 150.725,
|
|
"completions/min_terminated_length": 150.725,
|
|
"epoch": 0.1763279700242451,
|
|
"grad_norm": 0.3437681507721623,
|
|
"kl": 0.008571624755859375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0018,
|
|
"num_tokens": 14870899.0,
|
|
"reward": 0.7850898955017328,
|
|
"reward_std": 0.11421530576772057,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.747267971560359,
|
|
"rewards/qatch_metrics/std": 0.13395735200028866,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 328.7,
|
|
"completions/max_terminated_length": 328.7,
|
|
"completions/mean_length": 218.9609375,
|
|
"completions/mean_terminated_length": 218.9609375,
|
|
"completions/min_length": 147.7125,
|
|
"completions/min_terminated_length": 147.7125,
|
|
"epoch": 0.18514436852545735,
|
|
"grad_norm": 0.2956106829157968,
|
|
"kl": 0.0087432861328125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0023,
|
|
"num_tokens": 15637185.0,
|
|
"reward": 0.8008848559111357,
|
|
"reward_std": 0.1065025228075683,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7657468795776368,
|
|
"rewards/qatch_metrics/std": 0.12529709176160395,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 354.7,
|
|
"completions/max_terminated_length": 354.7,
|
|
"completions/mean_length": 241.70234375,
|
|
"completions/mean_terminated_length": 241.70234375,
|
|
"completions/min_length": 153.675,
|
|
"completions/min_terminated_length": 153.675,
|
|
"epoch": 0.19396076702666962,
|
|
"grad_norm": 0.3156816970551577,
|
|
"kl": 0.00846099853515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0043,
|
|
"num_tokens": 16427236.0,
|
|
"reward": 0.8307245042175054,
|
|
"reward_std": 0.12043557950855757,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8008523487951607,
|
|
"rewards/qatch_metrics/std": 0.14168892623565627,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 369.025,
|
|
"completions/max_terminated_length": 369.025,
|
|
"completions/mean_length": 242.18203125,
|
|
"completions/mean_terminated_length": 242.18203125,
|
|
"completions/min_length": 155.2875,
|
|
"completions/min_terminated_length": 155.2875,
|
|
"epoch": 0.20277716552788186,
|
|
"grad_norm": 0.35044764518300353,
|
|
"kl": 0.008629226684570312,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0172,
|
|
"num_tokens": 17204669.0,
|
|
"reward": 0.7873043244704604,
|
|
"reward_std": 0.11700194676523097,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7498731818312081,
|
|
"rewards/qatch_metrics/std": 0.13737923657754436,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 326.7,
|
|
"completions/max_terminated_length": 326.7,
|
|
"completions/mean_length": 223.1546875,
|
|
"completions/mean_terminated_length": 223.1546875,
|
|
"completions/min_length": 145.8125,
|
|
"completions/min_terminated_length": 145.8125,
|
|
"epoch": 0.21159356402909413,
|
|
"grad_norm": 0.256901537727429,
|
|
"kl": 0.008853530883789063,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0019,
|
|
"num_tokens": 17978371.0,
|
|
"reward": 0.8254757545888424,
|
|
"reward_std": 0.09378883789759129,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.794677343731746,
|
|
"rewards/qatch_metrics/std": 0.11033981533837504,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 320.0,
|
|
"completions/max_terminated_length": 320.0,
|
|
"completions/mean_length": 214.93828125,
|
|
"completions/mean_terminated_length": 214.93828125,
|
|
"completions/min_length": 144.85,
|
|
"completions/min_terminated_length": 144.85,
|
|
"epoch": 0.22040996253030637,
|
|
"grad_norm": 0.3734181940444686,
|
|
"kl": 0.009600830078125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0006,
|
|
"num_tokens": 18723748.0,
|
|
"reward": 0.7991562966257334,
|
|
"reward_std": 0.09936780408024788,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7637132841162384,
|
|
"rewards/qatch_metrics/std": 0.11690330407582224,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 328.3,
|
|
"completions/max_terminated_length": 328.3,
|
|
"completions/mean_length": 220.58515625,
|
|
"completions/mean_terminated_length": 220.58515625,
|
|
"completions/min_length": 137.9875,
|
|
"completions/min_terminated_length": 137.9875,
|
|
"epoch": 0.22922636103151864,
|
|
"grad_norm": 0.38207935979690344,
|
|
"kl": 0.010348129272460937,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.012,
|
|
"num_tokens": 19500401.0,
|
|
"reward": 0.812997136451304,
|
|
"reward_std": 0.12226196355186403,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7799966168124228,
|
|
"rewards/qatch_metrics/std": 0.14383760886266828,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 338.6125,
|
|
"completions/max_terminated_length": 338.6125,
|
|
"completions/mean_length": 220.6390625,
|
|
"completions/mean_terminated_length": 220.6390625,
|
|
"completions/min_length": 142.0375,
|
|
"completions/min_terminated_length": 142.0375,
|
|
"epoch": 0.23804275953273088,
|
|
"grad_norm": 0.3417278304849786,
|
|
"kl": 0.011602020263671875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0024,
|
|
"num_tokens": 20253139.0,
|
|
"reward": 0.7694992732256651,
|
|
"reward_std": 0.10531347445212305,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7288226603297516,
|
|
"rewards/qatch_metrics/std": 0.12389820874668658,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 312.3,
|
|
"completions/max_terminated_length": 312.3,
|
|
"completions/mean_length": 208.0296875,
|
|
"completions/mean_terminated_length": 208.0296875,
|
|
"completions/min_length": 136.55,
|
|
"completions/min_terminated_length": 136.55,
|
|
"epoch": 0.24685915803394312,
|
|
"grad_norm": 0.3425773253691935,
|
|
"kl": 0.010589218139648438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0116,
|
|
"num_tokens": 20952281.0,
|
|
"reward": 0.8347327882423997,
|
|
"reward_std": 0.10723668891005218,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8055679749464616,
|
|
"rewards/qatch_metrics/std": 0.1261608180589974,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 336.2375,
|
|
"completions/max_terminated_length": 336.2375,
|
|
"completions/mean_length": 226.9140625,
|
|
"completions/mean_terminated_length": 226.9140625,
|
|
"completions/min_length": 141.725,
|
|
"completions/min_terminated_length": 141.725,
|
|
"epoch": 0.2556755565351554,
|
|
"grad_norm": 0.3141154785420489,
|
|
"kl": 0.011150741577148437,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0045,
|
|
"num_tokens": 21701563.0,
|
|
"reward": 0.8115479167550802,
|
|
"reward_std": 0.11712859515100718,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.778395052952692,
|
|
"rewards/qatch_metrics/std": 0.13778491392731668,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 324.325,
|
|
"completions/max_terminated_length": 324.325,
|
|
"completions/mean_length": 214.4046875,
|
|
"completions/mean_terminated_length": 214.4046875,
|
|
"completions/min_length": 137.175,
|
|
"completions/min_terminated_length": 137.175,
|
|
"epoch": 0.26449195503636763,
|
|
"grad_norm": 0.346933040349207,
|
|
"kl": 0.020714950561523438,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0045,
|
|
"num_tokens": 22407937.0,
|
|
"reward": 0.8108294125646353,
|
|
"reward_std": 0.08920670928346226,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7774463596753776,
|
|
"rewards/qatch_metrics/std": 0.1049490759614855,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 346.025,
|
|
"completions/max_terminated_length": 346.025,
|
|
"completions/mean_length": 229.28125,
|
|
"completions/mean_terminated_length": 229.28125,
|
|
"completions/min_length": 140.1375,
|
|
"completions/min_terminated_length": 140.1375,
|
|
"epoch": 0.2733083535375799,
|
|
"grad_norm": 0.34731522656971436,
|
|
"kl": 0.011865234375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0106,
|
|
"num_tokens": 23184281.0,
|
|
"reward": 0.8204727115109562,
|
|
"reward_std": 0.12891971635399385,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.788791407039389,
|
|
"rewards/qatch_metrics/std": 0.15167025988921523,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 318.55,
|
|
"completions/max_terminated_length": 318.55,
|
|
"completions/mean_length": 209.17734375,
|
|
"completions/mean_terminated_length": 209.17734375,
|
|
"completions/min_length": 137.3625,
|
|
"completions/min_terminated_length": 137.3625,
|
|
"epoch": 0.28212475203879217,
|
|
"grad_norm": 0.3719362212745786,
|
|
"kl": 0.011676025390625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0151,
|
|
"num_tokens": 23893036.0,
|
|
"reward": 0.7991067057475447,
|
|
"reward_std": 0.09515065372979734,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.763758339243941,
|
|
"rewards/qatch_metrics/std": 0.1118375029604067,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 303.1375,
|
|
"completions/max_terminated_length": 303.1375,
|
|
"completions/mean_length": 201.36484375,
|
|
"completions/mean_terminated_length": 201.36484375,
|
|
"completions/min_length": 133.0625,
|
|
"completions/min_terminated_length": 133.0625,
|
|
"epoch": 0.2909411505400044,
|
|
"grad_norm": 0.41140648881811986,
|
|
"kl": 0.014212799072265626,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0031,
|
|
"num_tokens": 24627135.0,
|
|
"reward": 0.7533468393608928,
|
|
"reward_std": 0.10053728537168354,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7098197954706847,
|
|
"rewards/qatch_metrics/std": 0.11827916505280882,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 308.0,
|
|
"completions/max_terminated_length": 308.0,
|
|
"completions/mean_length": 203.5234375,
|
|
"completions/mean_terminated_length": 203.5234375,
|
|
"completions/min_length": 131.275,
|
|
"completions/min_terminated_length": 131.275,
|
|
"epoch": 0.29975754904121665,
|
|
"grad_norm": 0.40595987740034806,
|
|
"kl": 0.01388092041015625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0009,
|
|
"num_tokens": 25368413.0,
|
|
"reward": 0.7701841354370117,
|
|
"reward_std": 0.09663771950872616,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7296283877098176,
|
|
"rewards/qatch_metrics/std": 0.11369143746051122,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 328.725,
|
|
"completions/max_terminated_length": 328.725,
|
|
"completions/mean_length": 216.08125,
|
|
"completions/mean_terminated_length": 216.08125,
|
|
"completions/min_length": 138.325,
|
|
"completions/min_terminated_length": 138.325,
|
|
"epoch": 0.3085739475424289,
|
|
"grad_norm": 0.25083299753484695,
|
|
"kl": 0.01239776611328125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 26110613.0,
|
|
"reward": 0.8106268728151917,
|
|
"reward_std": 0.10756922718137503,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7772080772556365,
|
|
"rewards/qatch_metrics/std": 0.12655203738249837,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.4625,
|
|
"completions/max_terminated_length": 343.4625,
|
|
"completions/mean_length": 231.24453125,
|
|
"completions/mean_terminated_length": 231.24453125,
|
|
"completions/min_length": 149.375,
|
|
"completions/min_terminated_length": 149.375,
|
|
"epoch": 0.3173903460436412,
|
|
"grad_norm": 0.3299647961730041,
|
|
"kl": 0.012017822265625,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0011,
|
|
"num_tokens": 26876590.0,
|
|
"reward": 0.7713820965960622,
|
|
"reward_std": 0.11187393190339208,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7311411482747644,
|
|
"rewards/qatch_metrics/std": 0.13165212250314653,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 353.5875,
|
|
"completions/max_terminated_length": 353.5875,
|
|
"completions/mean_length": 242.12734375,
|
|
"completions/mean_terminated_length": 242.12734375,
|
|
"completions/min_length": 155.3125,
|
|
"completions/min_terminated_length": 155.3125,
|
|
"epoch": 0.32620674454485343,
|
|
"grad_norm": 0.32558550372348305,
|
|
"kl": 0.013112258911132813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0031,
|
|
"num_tokens": 27658417.0,
|
|
"reward": 0.8029140060767531,
|
|
"reward_std": 0.10181500271428376,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7681341207586229,
|
|
"rewards/qatch_metrics/std": 0.1197823622263968,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 358.3625,
|
|
"completions/max_terminated_length": 358.3625,
|
|
"completions/mean_length": 244.378125,
|
|
"completions/mean_terminated_length": 244.378125,
|
|
"completions/min_length": 150.175,
|
|
"completions/min_terminated_length": 150.175,
|
|
"epoch": 0.3350231430460657,
|
|
"grad_norm": 0.32369260650095066,
|
|
"kl": 0.012503814697265626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0035,
|
|
"num_tokens": 28450773.0,
|
|
"reward": 0.8509975776076317,
|
|
"reward_std": 0.10326635130477371,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.8248408894985915,
|
|
"rewards/qatch_metrics/std": 0.12098856130687637,
|
|
"rewards/tag_count_reward/mean": 0.99921875,
|
|
"rewards/tag_count_reward/std": 0.003125,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 346.8,
|
|
"completions/max_terminated_length": 346.8,
|
|
"completions/mean_length": 229.31484375,
|
|
"completions/mean_terminated_length": 229.31484375,
|
|
"completions/min_length": 142.2,
|
|
"completions/min_terminated_length": 142.2,
|
|
"epoch": 0.3438395415472779,
|
|
"grad_norm": 0.33109392130180526,
|
|
"kl": 0.012652969360351563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.005,
|
|
"num_tokens": 29193720.0,
|
|
"reward": 0.8215816805139184,
|
|
"reward_std": 0.10861785978777334,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7901994839310647,
|
|
"rewards/qatch_metrics/std": 0.12737212259089575,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 332.9,
|
|
"completions/max_terminated_length": 332.9,
|
|
"completions/mean_length": 226.34296875,
|
|
"completions/mean_terminated_length": 226.34296875,
|
|
"completions/min_length": 144.8,
|
|
"completions/min_terminated_length": 144.8,
|
|
"epoch": 0.3526559400484902,
|
|
"grad_norm": 0.34898255755360424,
|
|
"kl": 0.0120391845703125,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.001,
|
|
"num_tokens": 29936351.0,
|
|
"reward": 0.8118854926899075,
|
|
"reward_std": 0.08703446270665154,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.778688807785511,
|
|
"rewards/qatch_metrics/std": 0.10239349115872756,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 341.4875,
|
|
"completions/max_terminated_length": 341.4875,
|
|
"completions/mean_length": 233.30625,
|
|
"completions/mean_terminated_length": 233.30625,
|
|
"completions/min_length": 146.125,
|
|
"completions/min_terminated_length": 146.125,
|
|
"epoch": 0.36147233854970245,
|
|
"grad_norm": 0.2847757548262052,
|
|
"kl": 0.013054656982421874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 30723207.0,
|
|
"reward": 0.8451463960111141,
|
|
"reward_std": 0.07684196562040597,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8178192753344774,
|
|
"rewards/qatch_metrics/std": 0.09040231805993244,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 344.9125,
|
|
"completions/max_terminated_length": 344.9125,
|
|
"completions/mean_length": 233.31953125,
|
|
"completions/mean_terminated_length": 233.31953125,
|
|
"completions/min_length": 144.9875,
|
|
"completions/min_terminated_length": 144.9875,
|
|
"epoch": 0.3702887370509147,
|
|
"grad_norm": 0.4034177578209816,
|
|
"kl": 0.014620590209960937,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0075,
|
|
"num_tokens": 31508960.0,
|
|
"reward": 0.7541328657418489,
|
|
"reward_std": 0.10346053875982761,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7107445362955331,
|
|
"rewards/qatch_metrics/std": 0.12171828672289849,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 337.2125,
|
|
"completions/max_terminated_length": 337.2125,
|
|
"completions/mean_length": 229.95234375,
|
|
"completions/mean_terminated_length": 229.95234375,
|
|
"completions/min_length": 143.85,
|
|
"completions/min_terminated_length": 143.85,
|
|
"epoch": 0.37910513555212694,
|
|
"grad_norm": 0.3513936634634679,
|
|
"kl": 0.01492767333984375,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0053,
|
|
"num_tokens": 32267923.0,
|
|
"reward": 0.7756918715313077,
|
|
"reward_std": 0.08237711414694786,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7361080780159682,
|
|
"rewards/qatch_metrics/std": 0.09691425783094018,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 331.65,
|
|
"completions/max_terminated_length": 331.65,
|
|
"completions/mean_length": 225.4984375,
|
|
"completions/mean_terminated_length": 225.4984375,
|
|
"completions/min_length": 145.2,
|
|
"completions/min_terminated_length": 145.2,
|
|
"epoch": 0.38792153405333923,
|
|
"grad_norm": 0.2816074681768635,
|
|
"kl": 0.0148773193359375,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0012,
|
|
"num_tokens": 33038177.0,
|
|
"reward": 0.8032938608899712,
|
|
"reward_std": 0.09598501106374897,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7685809925198555,
|
|
"rewards/qatch_metrics/std": 0.11292354888282716,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 316.7,
|
|
"completions/max_terminated_length": 316.7,
|
|
"completions/mean_length": 215.88984375,
|
|
"completions/mean_terminated_length": 215.88984375,
|
|
"completions/min_length": 138.5125,
|
|
"completions/min_terminated_length": 138.5125,
|
|
"epoch": 0.3967379325545515,
|
|
"grad_norm": 0.2792708523998877,
|
|
"kl": 0.01438140869140625,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0031,
|
|
"num_tokens": 33775780.0,
|
|
"reward": 0.833816378749907,
|
|
"reward_std": 0.08319322268362157,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8044898479929543,
|
|
"rewards/qatch_metrics/std": 0.09787438737985213,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 321.1625,
|
|
"completions/max_terminated_length": 321.1625,
|
|
"completions/mean_length": 212.334375,
|
|
"completions/mean_terminated_length": 212.334375,
|
|
"completions/min_length": 140.6,
|
|
"completions/min_terminated_length": 140.6,
|
|
"epoch": 0.4055543310557637,
|
|
"grad_norm": 0.2700162867921189,
|
|
"kl": 0.015807723999023436,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0029,
|
|
"num_tokens": 34518032.0,
|
|
"reward": 0.7357370050624013,
|
|
"reward_std": 0.10150592336431145,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.6891023456788389,
|
|
"rewards/qatch_metrics/std": 0.11941873789764941,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 325.4625,
|
|
"completions/max_terminated_length": 325.4625,
|
|
"completions/mean_length": 218.98125,
|
|
"completions/mean_terminated_length": 218.98125,
|
|
"completions/min_length": 143.225,
|
|
"completions/min_terminated_length": 143.225,
|
|
"epoch": 0.41437072955697596,
|
|
"grad_norm": 0.41442176380867557,
|
|
"kl": 0.014601898193359376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0086,
|
|
"num_tokens": 35259240.0,
|
|
"reward": 0.7820553651079536,
|
|
"reward_std": 0.112267074175179,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.743594534881413,
|
|
"rewards/qatch_metrics/std": 0.13207891816273332,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 319.25,
|
|
"completions/max_terminated_length": 319.25,
|
|
"completions/mean_length": 209.9296875,
|
|
"completions/mean_terminated_length": 209.9296875,
|
|
"completions/min_length": 137.3125,
|
|
"completions/min_terminated_length": 137.3125,
|
|
"epoch": 0.42318712805818826,
|
|
"grad_norm": 0.3476187294524526,
|
|
"kl": 0.014452362060546875,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.002,
|
|
"num_tokens": 35975326.0,
|
|
"reward": 0.7921670457348228,
|
|
"reward_std": 0.09589202178794949,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.755490631237626,
|
|
"rewards/qatch_metrics/std": 0.1128141468463582,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 330.4,
|
|
"completions/max_terminated_length": 330.4,
|
|
"completions/mean_length": 219.6890625,
|
|
"completions/mean_terminated_length": 219.6890625,
|
|
"completions/min_length": 139.175,
|
|
"completions/min_terminated_length": 139.175,
|
|
"epoch": 0.4320035265594005,
|
|
"grad_norm": 0.24104253998662034,
|
|
"kl": 0.014841079711914062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0048,
|
|
"num_tokens": 36727824.0,
|
|
"reward": 0.799939620308578,
|
|
"reward_std": 0.10737682661347207,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7646578153668088,
|
|
"rewards/qatch_metrics/std": 0.1263537659888243,
|
|
"rewards/tag_count_reward/mean": 0.999609375,
|
|
"rewards/tag_count_reward/std": 0.0015625,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 327.9375,
|
|
"completions/max_terminated_length": 327.9375,
|
|
"completions/mean_length": 218.1921875,
|
|
"completions/mean_terminated_length": 218.1921875,
|
|
"completions/min_length": 140.15,
|
|
"completions/min_terminated_length": 140.15,
|
|
"epoch": 0.44081992506061274,
|
|
"grad_norm": 0.2539246494683978,
|
|
"kl": 0.014622879028320313,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0025,
|
|
"num_tokens": 37470134.0,
|
|
"reward": 0.7929010545834899,
|
|
"reward_std": 0.08832431975752116,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7563541710143908,
|
|
"rewards/qatch_metrics/std": 0.10391096852254122,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 369.9625,
|
|
"completions/max_terminated_length": 369.9625,
|
|
"completions/mean_length": 228.9,
|
|
"completions/mean_terminated_length": 228.9,
|
|
"completions/min_length": 142.2125,
|
|
"completions/min_terminated_length": 142.2125,
|
|
"epoch": 0.449636323561825,
|
|
"grad_norm": 0.389718556100374,
|
|
"kl": 0.013026809692382813,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0146,
|
|
"num_tokens": 38224342.0,
|
|
"reward": 0.8210916120558978,
|
|
"reward_std": 0.08054333752952517,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7895195348886773,
|
|
"rewards/qatch_metrics/std": 0.0947568719740957,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 360.125,
|
|
"completions/max_terminated_length": 360.125,
|
|
"completions/mean_length": 236.08203125,
|
|
"completions/mean_terminated_length": 236.08203125,
|
|
"completions/min_length": 150.35,
|
|
"completions/min_terminated_length": 150.35,
|
|
"epoch": 0.4584527220630373,
|
|
"grad_norm": 0.31386840054419024,
|
|
"kl": 0.01431121826171875,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0003,
|
|
"num_tokens": 39005375.0,
|
|
"reward": 0.7916189678013325,
|
|
"reward_std": 0.10870604729279876,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7548458357341588,
|
|
"rewards/qatch_metrics/std": 0.12788947536610068,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 361.3,
|
|
"completions/max_terminated_length": 361.3,
|
|
"completions/mean_length": 241.08046875,
|
|
"completions/mean_terminated_length": 241.08046875,
|
|
"completions/min_length": 143.625,
|
|
"completions/min_terminated_length": 143.625,
|
|
"epoch": 0.4672691205642495,
|
|
"grad_norm": 0.32199129625786044,
|
|
"kl": 0.014544677734375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0076,
|
|
"num_tokens": 39806326.0,
|
|
"reward": 0.8503130197525024,
|
|
"reward_std": 0.08854990721156356,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8238976599182933,
|
|
"rewards/qatch_metrics/std": 0.10417636758938897,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 357.9125,
|
|
"completions/max_terminated_length": 357.9125,
|
|
"completions/mean_length": 238.00703125,
|
|
"completions/mean_terminated_length": 238.00703125,
|
|
"completions/min_length": 150.7125,
|
|
"completions/min_terminated_length": 150.7125,
|
|
"epoch": 0.47608551906546176,
|
|
"grad_norm": 0.24307758730384177,
|
|
"kl": 0.0154083251953125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0007,
|
|
"num_tokens": 40579551.0,
|
|
"reward": 0.7649708043783903,
|
|
"reward_std": 0.09011161667294801,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7234950542595471,
|
|
"rewards/qatch_metrics/std": 0.1060136711690575,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 329.375,
|
|
"completions/max_terminated_length": 329.375,
|
|
"completions/mean_length": 218.865625,
|
|
"completions/mean_terminated_length": 218.865625,
|
|
"completions/min_length": 136.9375,
|
|
"completions/min_terminated_length": 136.9375,
|
|
"epoch": 0.484901917566674,
|
|
"grad_norm": 0.2437912279556105,
|
|
"kl": 0.01680908203125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0116,
|
|
"num_tokens": 41304819.0,
|
|
"reward": 0.7731978652998805,
|
|
"reward_std": 0.09575790755916387,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7332773502916098,
|
|
"rewards/qatch_metrics/std": 0.11264292849227786,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 297.425,
|
|
"completions/max_terminated_length": 297.425,
|
|
"completions/mean_length": 194.35234375,
|
|
"completions/mean_terminated_length": 194.35234375,
|
|
"completions/min_length": 129.725,
|
|
"completions/min_terminated_length": 129.725,
|
|
"epoch": 0.49371831606788624,
|
|
"grad_norm": 0.3022726877012163,
|
|
"kl": 0.017235565185546874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0071,
|
|
"num_tokens": 41986038.0,
|
|
"reward": 0.8300179397687316,
|
|
"reward_std": 0.08664830076304497,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8000210979953408,
|
|
"rewards/qatch_metrics/std": 0.10193918244767701,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 276.7875,
|
|
"completions/max_terminated_length": 276.7875,
|
|
"completions/mean_length": 183.3453125,
|
|
"completions/mean_terminated_length": 183.3453125,
|
|
"completions/min_length": 127.55,
|
|
"completions/min_terminated_length": 127.55,
|
|
"epoch": 0.5025347145690985,
|
|
"grad_norm": 0.22342235477555344,
|
|
"kl": 0.018723297119140624,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0009,
|
|
"num_tokens": 42654592.0,
|
|
"reward": 0.8566583547741174,
|
|
"reward_std": 0.08069641448455514,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8313627634197474,
|
|
"rewards/qatch_metrics/std": 0.09493696391291451,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 331.125,
|
|
"completions/max_terminated_length": 331.125,
|
|
"completions/mean_length": 220.7734375,
|
|
"completions/mean_terminated_length": 220.7734375,
|
|
"completions/min_length": 140.35,
|
|
"completions/min_terminated_length": 140.35,
|
|
"epoch": 0.5113511130703108,
|
|
"grad_norm": 0.34607047517160744,
|
|
"kl": 0.01548309326171875,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.006,
|
|
"num_tokens": 43404558.0,
|
|
"reward": 0.8139310251921416,
|
|
"reward_std": 0.0980742353014648,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7810953183798119,
|
|
"rewards/qatch_metrics/std": 0.11538145933300256,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 372.3625,
|
|
"completions/max_terminated_length": 372.3625,
|
|
"completions/mean_length": 253.30859375,
|
|
"completions/mean_terminated_length": 253.30859375,
|
|
"completions/min_length": 162.3375,
|
|
"completions/min_terminated_length": 162.3375,
|
|
"epoch": 0.5201675115715231,
|
|
"grad_norm": 0.26233887817165846,
|
|
"kl": 0.017038726806640626,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0014,
|
|
"num_tokens": 44202393.0,
|
|
"reward": 0.7521807491779328,
|
|
"reward_std": 0.0808277386619011,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7084479197394102,
|
|
"rewards/qatch_metrics/std": 0.09509145880147116,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 371.2875,
|
|
"completions/max_terminated_length": 371.2875,
|
|
"completions/mean_length": 256.74375,
|
|
"completions/mean_terminated_length": 256.74375,
|
|
"completions/min_length": 164.575,
|
|
"completions/min_terminated_length": 164.575,
|
|
"epoch": 0.5289839100727353,
|
|
"grad_norm": 0.27528750385577455,
|
|
"kl": 0.0166046142578125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0034,
|
|
"num_tokens": 44996449.0,
|
|
"reward": 0.7653900509700179,
|
|
"reward_std": 0.10405777737032622,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7239882865447725,
|
|
"rewards/qatch_metrics/std": 0.12242091761436313,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 383.75,
|
|
"completions/max_terminated_length": 383.75,
|
|
"completions/mean_length": 262.81875,
|
|
"completions/mean_terminated_length": 262.81875,
|
|
"completions/min_length": 163.2,
|
|
"completions/min_terminated_length": 163.2,
|
|
"epoch": 0.5378003085739476,
|
|
"grad_norm": 0.2916124436851988,
|
|
"kl": 0.016686248779296874,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0091,
|
|
"num_tokens": 45810553.0,
|
|
"reward": 0.751503630913794,
|
|
"reward_std": 0.10126840746961534,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7076513038016856,
|
|
"rewards/qatch_metrics/std": 0.1191393076442182,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 385.4,
|
|
"completions/max_terminated_length": 385.4,
|
|
"completions/mean_length": 255.634375,
|
|
"completions/mean_terminated_length": 255.634375,
|
|
"completions/min_length": 157.3125,
|
|
"completions/min_terminated_length": 157.3125,
|
|
"epoch": 0.5466167070751597,
|
|
"grad_norm": 0.3015279724579697,
|
|
"kl": 0.017456817626953124,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0074,
|
|
"num_tokens": 46626773.0,
|
|
"reward": 0.7629254685714841,
|
|
"reward_std": 0.14258774023037404,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.721100265783025,
|
|
"rewards/qatch_metrics/std": 0.16777858033310622,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 331.5375,
|
|
"completions/max_terminated_length": 331.5375,
|
|
"completions/mean_length": 217.5625,
|
|
"completions/mean_terminated_length": 217.5625,
|
|
"completions/min_length": 134.5375,
|
|
"completions/min_terminated_length": 134.5375,
|
|
"epoch": 0.555433105576372,
|
|
"grad_norm": 0.2736624916091261,
|
|
"kl": 0.016427993774414062,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0011,
|
|
"num_tokens": 47337797.0,
|
|
"reward": 0.8417512066662312,
|
|
"reward_std": 0.06132402071962133,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8138479188084602,
|
|
"rewards/qatch_metrics/std": 0.07210502420784906,
|
|
"rewards/tag_count_reward/mean": 0.999609375,
|
|
"rewards/tag_count_reward/std": 0.0015625,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 347.9625,
|
|
"completions/max_terminated_length": 347.9625,
|
|
"completions/mean_length": 225.61484375,
|
|
"completions/mean_terminated_length": 225.61484375,
|
|
"completions/min_length": 139.5625,
|
|
"completions/min_terminated_length": 139.5625,
|
|
"epoch": 0.5642495040775843,
|
|
"grad_norm": 0.3231991864642648,
|
|
"kl": 0.017105865478515624,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0004,
|
|
"num_tokens": 48108472.0,
|
|
"reward": 0.8285636451095343,
|
|
"reward_std": 0.09632078433714923,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7983101591467857,
|
|
"rewards/qatch_metrics/std": 0.11331857727491297,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 347.45,
|
|
"completions/max_terminated_length": 347.45,
|
|
"completions/mean_length": 231.76328125,
|
|
"completions/mean_terminated_length": 231.76328125,
|
|
"completions/min_length": 148.275,
|
|
"completions/min_terminated_length": 148.275,
|
|
"epoch": 0.5730659025787965,
|
|
"grad_norm": 0.2973174138867235,
|
|
"kl": 0.016844940185546876,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0095,
|
|
"num_tokens": 48899481.0,
|
|
"reward": 0.7851231107488275,
|
|
"reward_std": 0.09552590373905331,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7472036501319508,
|
|
"rewards/qatch_metrics/std": 0.11238342153937993,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 359.275,
|
|
"completions/max_terminated_length": 359.275,
|
|
"completions/mean_length": 234.6609375,
|
|
"completions/mean_terminated_length": 234.6609375,
|
|
"completions/min_length": 146.075,
|
|
"completions/min_terminated_length": 146.075,
|
|
"epoch": 0.5818823010800088,
|
|
"grad_norm": 0.3288208170160159,
|
|
"kl": 0.017102813720703124,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0158,
|
|
"num_tokens": 49645255.0,
|
|
"reward": 0.8200173564255238,
|
|
"reward_std": 0.09870836285626865,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7882671934901737,
|
|
"rewards/qatch_metrics/std": 0.11608153889974346,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 357.4375,
|
|
"completions/max_terminated_length": 357.4375,
|
|
"completions/mean_length": 230.21484375,
|
|
"completions/mean_terminated_length": 230.21484375,
|
|
"completions/min_length": 140.6625,
|
|
"completions/min_terminated_length": 140.6625,
|
|
"epoch": 0.5906986995812211,
|
|
"grad_norm": 0.4614331608564361,
|
|
"kl": 0.018951416015625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0084,
|
|
"num_tokens": 50437466.0,
|
|
"reward": 0.7783582922071218,
|
|
"reward_std": 0.09422233710065483,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7393484384519979,
|
|
"rewards/qatch_metrics/std": 0.1104362107347697,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 324.8875,
|
|
"completions/max_terminated_length": 324.8875,
|
|
"completions/mean_length": 209.18125,
|
|
"completions/mean_terminated_length": 209.18125,
|
|
"completions/min_length": 134.8375,
|
|
"completions/min_terminated_length": 134.8375,
|
|
"epoch": 0.5995150980824333,
|
|
"grad_norm": 0.2725327541303037,
|
|
"kl": 0.019439697265625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0057,
|
|
"num_tokens": 51167586.0,
|
|
"reward": 0.8479114411398768,
|
|
"reward_std": 0.07958720491733402,
|
|
"rewards/format_reward/mean": 0.9984375,
|
|
"rewards/format_reward/std": 0.00625,
|
|
"rewards/qatch_metrics/mean": 0.8213135461322963,
|
|
"rewards/qatch_metrics/std": 0.0926669392734766,
|
|
"rewards/tag_count_reward/mean": 0.9990234375,
|
|
"rewards/tag_count_reward/std": 0.00390625,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 345.0833333333333,
|
|
"completions/max_terminated_length": 345.0833333333333,
|
|
"completions/mean_length": 229.90755208333334,
|
|
"completions/mean_terminated_length": 229.90755208333334,
|
|
"completions/min_length": 146.02083333333334,
|
|
"completions/min_terminated_length": 146.02083333333334,
|
|
"epoch": 0.6083314965836456,
|
|
"grad_norm": 0.33113384827355125,
|
|
"kl": 0.019663492838541668,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0054,
|
|
"num_tokens": 51884877.0,
|
|
"reward": 0.7889465639988581,
|
|
"reward_std": 0.09934755021822639,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7517018277818958,
|
|
"rewards/qatch_metrics/std": 0.11687947452689211,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 336.4125,
|
|
"completions/max_terminated_length": 336.4125,
|
|
"completions/mean_length": 225.73203125,
|
|
"completions/mean_terminated_length": 225.73203125,
|
|
"completions/min_length": 142.075,
|
|
"completions/min_terminated_length": 142.075,
|
|
"epoch": 0.6171478950848578,
|
|
"grad_norm": 0.3566644104382955,
|
|
"kl": 0.020180511474609374,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0094,
|
|
"num_tokens": 52662630.0,
|
|
"reward": 0.8238930691033601,
|
|
"reward_std": 0.08169953491305933,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7928153664804996,
|
|
"rewards/qatch_metrics/std": 0.09611710296012461,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.2,
|
|
"completions/max_terminated_length": 343.2,
|
|
"completions/mean_length": 236.98046875,
|
|
"completions/mean_terminated_length": 236.98046875,
|
|
"completions/min_length": 150.9,
|
|
"completions/min_terminated_length": 150.9,
|
|
"epoch": 0.6259642935860701,
|
|
"grad_norm": 0.28516393114431005,
|
|
"kl": 0.017165374755859376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 53419117.0,
|
|
"reward": 0.8083362869918347,
|
|
"reward_std": 0.0969126635696739,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7746166706085205,
|
|
"rewards/qatch_metrics/std": 0.11360130067914724,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 355
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 352.6875,
|
|
"completions/max_terminated_length": 352.6875,
|
|
"completions/mean_length": 239.6953125,
|
|
"completions/mean_terminated_length": 239.6953125,
|
|
"completions/min_length": 146.55,
|
|
"completions/min_terminated_length": 146.55,
|
|
"epoch": 0.6347806920872824,
|
|
"grad_norm": 0.17994755171571641,
|
|
"kl": 0.017398834228515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 54185031.0,
|
|
"reward": 0.8027490990236401,
|
|
"reward_std": 0.05896135854927707,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7679401046669228,
|
|
"rewards/qatch_metrics/std": 0.06936630747477465,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 352.7125,
|
|
"completions/max_terminated_length": 352.7125,
|
|
"completions/mean_length": 239.52109375,
|
|
"completions/mean_terminated_length": 239.52109375,
|
|
"completions/min_length": 145.0125,
|
|
"completions/min_terminated_length": 145.0125,
|
|
"epoch": 0.6435970905884946,
|
|
"grad_norm": 0.23679998463114285,
|
|
"kl": 0.017606353759765624,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0013,
|
|
"num_tokens": 54936114.0,
|
|
"reward": 0.8317079545930028,
|
|
"reward_std": 0.08216131356894038,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8020208382047713,
|
|
"rewards/qatch_metrics/std": 0.0966144205071032,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 365
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 368.05,
|
|
"completions/max_terminated_length": 368.05,
|
|
"completions/mean_length": 253.25546875,
|
|
"completions/mean_terminated_length": 253.25546875,
|
|
"completions/min_length": 161.875,
|
|
"completions/min_terminated_length": 161.875,
|
|
"epoch": 0.6524134890897069,
|
|
"grad_norm": 0.2556963595685909,
|
|
"kl": 0.018042755126953126,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0036,
|
|
"num_tokens": 55728841.0,
|
|
"reward": 0.8250438664108515,
|
|
"reward_std": 0.08220216338286264,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7941807355555284,
|
|
"rewards/qatch_metrics/std": 0.09668861866011866,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 370
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 341.6625,
|
|
"completions/max_terminated_length": 341.6625,
|
|
"completions/mean_length": 237.73046875,
|
|
"completions/mean_terminated_length": 237.73046875,
|
|
"completions/min_length": 150.525,
|
|
"completions/min_terminated_length": 150.525,
|
|
"epoch": 0.6612298875909192,
|
|
"grad_norm": 0.2751968612090182,
|
|
"kl": 0.018801116943359376,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0012,
|
|
"num_tokens": 56499232.0,
|
|
"reward": 0.8543739832937718,
|
|
"reward_std": 0.09150902703240718,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.828675264492631,
|
|
"rewards/qatch_metrics/std": 0.10765768758892591,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 319.2875,
|
|
"completions/max_terminated_length": 319.2875,
|
|
"completions/mean_length": 219.77734375,
|
|
"completions/mean_terminated_length": 219.77734375,
|
|
"completions/min_length": 137.475,
|
|
"completions/min_terminated_length": 137.475,
|
|
"epoch": 0.6700462860921313,
|
|
"grad_norm": 0.31154862475726136,
|
|
"kl": 0.01962738037109375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 57237059.0,
|
|
"reward": 0.8459062715992332,
|
|
"reward_std": 0.08771184119395911,
|
|
"rewards/format_reward/mean": 0.9984375,
|
|
"rewards/format_reward/std": 0.00625,
|
|
"rewards/qatch_metrics/mean": 0.8189200535358395,
|
|
"rewards/qatch_metrics/std": 0.1028526050504297,
|
|
"rewards/tag_count_reward/mean": 0.999609375,
|
|
"rewards/tag_count_reward/std": 0.0015625,
|
|
"step": 380
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 302.25,
|
|
"completions/max_terminated_length": 302.25,
|
|
"completions/mean_length": 203.378125,
|
|
"completions/mean_terminated_length": 203.378125,
|
|
"completions/min_length": 137.75,
|
|
"completions/min_terminated_length": 137.75,
|
|
"epoch": 0.6788626845933436,
|
|
"grad_norm": 0.2352132785923824,
|
|
"kl": 0.02192230224609375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0049,
|
|
"num_tokens": 57972807.0,
|
|
"reward": 0.82975386492908,
|
|
"reward_std": 0.06049516258062795,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7997104193782434,
|
|
"rewards/qatch_metrics/std": 0.07117078317096456,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 304.5125,
|
|
"completions/max_terminated_length": 304.5125,
|
|
"completions/mean_length": 204.80234375,
|
|
"completions/mean_terminated_length": 204.80234375,
|
|
"completions/min_length": 136.125,
|
|
"completions/min_terminated_length": 136.125,
|
|
"epoch": 0.6876790830945558,
|
|
"grad_norm": 0.27452609767604863,
|
|
"kl": 0.022785186767578125,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0009,
|
|
"num_tokens": 58735722.0,
|
|
"reward": 0.821035155840218,
|
|
"reward_std": 0.09460214762948453,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7895565141923726,
|
|
"rewards/qatch_metrics/std": 0.11137145487591624,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 390
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 298.75,
|
|
"completions/max_terminated_length": 298.75,
|
|
"completions/mean_length": 199.359375,
|
|
"completions/mean_terminated_length": 199.359375,
|
|
"completions/min_length": 133.325,
|
|
"completions/min_terminated_length": 133.325,
|
|
"epoch": 0.6964954815957681,
|
|
"grad_norm": 0.36379488564541623,
|
|
"kl": 0.023514556884765624,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0139,
|
|
"num_tokens": 59453574.0,
|
|
"reward": 0.7852070070803165,
|
|
"reward_std": 0.11039148237323389,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7473023503087461,
|
|
"rewards/qatch_metrics/std": 0.12987233807798476,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 297.525,
|
|
"completions/max_terminated_length": 297.525,
|
|
"completions/mean_length": 192.63515625,
|
|
"completions/mean_terminated_length": 192.63515625,
|
|
"completions/min_length": 130.4375,
|
|
"completions/min_terminated_length": 130.4375,
|
|
"epoch": 0.7053118800969804,
|
|
"grad_norm": 0.31515335460233057,
|
|
"kl": 0.02534027099609375,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0013,
|
|
"num_tokens": 60164595.0,
|
|
"reward": 0.8131733348593115,
|
|
"reward_std": 0.09474163451232016,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7802039116621018,
|
|
"rewards/qatch_metrics/std": 0.11146075297147036,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 301.3375,
|
|
"completions/max_terminated_length": 301.3375,
|
|
"completions/mean_length": 194.86171875,
|
|
"completions/mean_terminated_length": 194.86171875,
|
|
"completions/min_length": 132.875,
|
|
"completions/min_terminated_length": 132.875,
|
|
"epoch": 0.7141282785981926,
|
|
"grad_norm": 0.36020622913136857,
|
|
"kl": 0.02452239990234375,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0044,
|
|
"num_tokens": 60876834.0,
|
|
"reward": 0.7978186447173357,
|
|
"reward_std": 0.12316365442238748,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7622429746668786,
|
|
"rewards/qatch_metrics/std": 0.14448482398875057,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 405
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 330.2875,
|
|
"completions/max_terminated_length": 330.2875,
|
|
"completions/mean_length": 222.946875,
|
|
"completions/mean_terminated_length": 222.946875,
|
|
"completions/min_length": 147.725,
|
|
"completions/min_terminated_length": 147.725,
|
|
"epoch": 0.7229446770994049,
|
|
"grad_norm": 0.23920846142153848,
|
|
"kl": 0.022620391845703126,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0015,
|
|
"num_tokens": 61628878.0,
|
|
"reward": 0.8119752595201135,
|
|
"reward_std": 0.09445713473833166,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7788403697311879,
|
|
"rewards/qatch_metrics/std": 0.11107292836531997,
|
|
"rewards/tag_count_reward/mean": 0.99921875,
|
|
"rewards/tag_count_reward/std": 0.003125,
|
|
"step": 410
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 362.675,
|
|
"completions/max_terminated_length": 362.675,
|
|
"completions/mean_length": 238.16640625,
|
|
"completions/mean_terminated_length": 238.16640625,
|
|
"completions/min_length": 149.55,
|
|
"completions/min_terminated_length": 149.55,
|
|
"epoch": 0.7317610756006171,
|
|
"grad_norm": 0.29869110706075575,
|
|
"kl": 0.02274169921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0076,
|
|
"num_tokens": 62407347.0,
|
|
"reward": 0.8343930047005415,
|
|
"reward_std": 0.1020151567645371,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.805168230831623,
|
|
"rewards/qatch_metrics/std": 0.12001783675514162,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 351.1625,
|
|
"completions/max_terminated_length": 351.1625,
|
|
"completions/mean_length": 238.10625,
|
|
"completions/mean_terminated_length": 238.10625,
|
|
"completions/min_length": 151.7,
|
|
"completions/min_terminated_length": 151.7,
|
|
"epoch": 0.7405774741018294,
|
|
"grad_norm": 0.2830612988897095,
|
|
"kl": 0.021949005126953126,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 63172059.0,
|
|
"reward": 0.82713833283633,
|
|
"reward_std": 0.07888575517572463,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7967367230914533,
|
|
"rewards/qatch_metrics/std": 0.0928413406247273,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 420
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 350.6625,
|
|
"completions/max_terminated_length": 350.6625,
|
|
"completions/mean_length": 234.71875,
|
|
"completions/mean_terminated_length": 234.71875,
|
|
"completions/min_length": 152.85,
|
|
"completions/min_terminated_length": 152.85,
|
|
"epoch": 0.7493938726030417,
|
|
"grad_norm": 0.303175281797131,
|
|
"kl": 0.023668670654296876,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0014,
|
|
"num_tokens": 63958371.0,
|
|
"reward": 0.8379335630685091,
|
|
"reward_std": 0.08907212568446994,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8093335981713607,
|
|
"rewards/qatch_metrics/std": 0.10479074087925255,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 356.15,
|
|
"completions/max_terminated_length": 356.15,
|
|
"completions/mean_length": 233.284375,
|
|
"completions/mean_terminated_length": 233.284375,
|
|
"completions/min_length": 144.625,
|
|
"completions/min_terminated_length": 144.625,
|
|
"epoch": 0.7582102711042539,
|
|
"grad_norm": 0.3362377001779979,
|
|
"kl": 0.020977020263671875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0094,
|
|
"num_tokens": 64755503.0,
|
|
"reward": 0.7857262346893549,
|
|
"reward_std": 0.0779630596237439,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.748039587616222,
|
|
"rewards/qatch_metrics/std": 0.09121573810989503,
|
|
"rewards/tag_count_reward/mean": 0.9994140625,
|
|
"rewards/tag_count_reward/std": 0.00234375,
|
|
"step": 430
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 342.0875,
|
|
"completions/max_terminated_length": 342.0875,
|
|
"completions/mean_length": 224.60390625,
|
|
"completions/mean_terminated_length": 224.60390625,
|
|
"completions/min_length": 139.6875,
|
|
"completions/min_terminated_length": 139.6875,
|
|
"epoch": 0.7670266696054662,
|
|
"grad_norm": 0.27147477635209977,
|
|
"kl": 0.023795318603515626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.007,
|
|
"num_tokens": 65500036.0,
|
|
"reward": 0.8154280468821525,
|
|
"reward_std": 0.0885659102234058,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7828565156087279,
|
|
"rewards/qatch_metrics/std": 0.10419519691495224,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.15,
|
|
"completions/max_terminated_length": 343.15,
|
|
"completions/mean_length": 225.971875,
|
|
"completions/mean_terminated_length": 225.971875,
|
|
"completions/min_length": 134.8875,
|
|
"completions/min_terminated_length": 134.8875,
|
|
"epoch": 0.7758430681066785,
|
|
"grad_norm": 0.23493992268435515,
|
|
"kl": 0.0210540771484375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.001,
|
|
"num_tokens": 66252064.0,
|
|
"reward": 0.8585389815270901,
|
|
"reward_std": 0.08183195294986945,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8335752619896084,
|
|
"rewards/qatch_metrics/std": 0.09627289194759214,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 355.575,
|
|
"completions/max_terminated_length": 355.575,
|
|
"completions/mean_length": 242.10859375,
|
|
"completions/mean_terminated_length": 242.10859375,
|
|
"completions/min_length": 151.175,
|
|
"completions/min_terminated_length": 151.175,
|
|
"epoch": 0.7846594666078907,
|
|
"grad_norm": 0.2140376472128798,
|
|
"kl": 0.02173614501953125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 67023275.0,
|
|
"reward": 0.87310497071594,
|
|
"reward_std": 0.0631409589201212,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8507117207162083,
|
|
"rewards/qatch_metrics/std": 0.07428348822286353,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 353.0,
|
|
"completions/max_terminated_length": 353.0,
|
|
"completions/mean_length": 226.27578125,
|
|
"completions/mean_terminated_length": 226.27578125,
|
|
"completions/min_length": 139.95,
|
|
"completions/min_terminated_length": 139.95,
|
|
"epoch": 0.793475865109103,
|
|
"grad_norm": 0.3122440505453943,
|
|
"kl": 0.021511077880859375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 67777324.0,
|
|
"reward": 0.8520105175673962,
|
|
"reward_std": 0.08210055120289325,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.8260210990672932,
|
|
"rewards/qatch_metrics/std": 0.09612429473781958,
|
|
"rewards/tag_count_reward/mean": 0.9994140625,
|
|
"rewards/tag_count_reward/std": 0.00234375,
|
|
"step": 450
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 327.425,
|
|
"completions/max_terminated_length": 327.425,
|
|
"completions/mean_length": 212.48125,
|
|
"completions/mean_terminated_length": 212.48125,
|
|
"completions/min_length": 135.5125,
|
|
"completions/min_terminated_length": 135.5125,
|
|
"epoch": 0.8022922636103151,
|
|
"grad_norm": 0.3121983142519688,
|
|
"kl": 0.022968292236328125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0106,
|
|
"num_tokens": 68512644.0,
|
|
"reward": 0.8407730983570219,
|
|
"reward_std": 0.06822409054730087,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8126742199849104,
|
|
"rewards/qatch_metrics/std": 0.08026364156394265,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 340.25,
|
|
"completions/max_terminated_length": 340.25,
|
|
"completions/mean_length": 223.44375,
|
|
"completions/mean_terminated_length": 223.44375,
|
|
"completions/min_length": 141.9625,
|
|
"completions/min_terminated_length": 141.9625,
|
|
"epoch": 0.8111086621115274,
|
|
"grad_norm": 0.3458299809901387,
|
|
"kl": 0.02670440673828125,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0033,
|
|
"num_tokens": 69262636.0,
|
|
"reward": 0.8292963147163391,
|
|
"reward_std": 0.08730166773311794,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.7992755254730582,
|
|
"rewards/qatch_metrics/std": 0.10261548004345969,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 460
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 340.2375,
|
|
"completions/max_terminated_length": 340.2375,
|
|
"completions/mean_length": 223.1640625,
|
|
"completions/mean_terminated_length": 223.1640625,
|
|
"completions/min_length": 144.9375,
|
|
"completions/min_terminated_length": 144.9375,
|
|
"epoch": 0.8199250606127397,
|
|
"grad_norm": 0.2539455340605116,
|
|
"kl": 0.02646636962890625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0074,
|
|
"num_tokens": 70028046.0,
|
|
"reward": 0.782775210775435,
|
|
"reward_std": 0.09664202006024425,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7444414100635186,
|
|
"rewards/qatch_metrics/std": 0.11369650048291077,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.65,
|
|
"completions/max_terminated_length": 343.65,
|
|
"completions/mean_length": 220.91953125,
|
|
"completions/mean_terminated_length": 220.91953125,
|
|
"completions/min_length": 142.9125,
|
|
"completions/min_terminated_length": 142.9125,
|
|
"epoch": 0.8287414591139519,
|
|
"grad_norm": 0.29340586589965106,
|
|
"kl": 0.023850250244140624,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0028,
|
|
"num_tokens": 70817127.0,
|
|
"reward": 0.83585394769907,
|
|
"reward_std": 0.08243265537312254,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8068869862705469,
|
|
"rewards/qatch_metrics/std": 0.09697959922486916,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 319.25,
|
|
"completions/max_terminated_length": 319.25,
|
|
"completions/mean_length": 212.4921875,
|
|
"completions/mean_terminated_length": 212.4921875,
|
|
"completions/min_length": 137.675,
|
|
"completions/min_terminated_length": 137.675,
|
|
"epoch": 0.8375578576151642,
|
|
"grad_norm": 0.30646618504129836,
|
|
"kl": 0.02505950927734375,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0023,
|
|
"num_tokens": 71553357.0,
|
|
"reward": 0.8542880930006505,
|
|
"reward_std": 0.07209075510618276,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8285742185791605,
|
|
"rewards/qatch_metrics/std": 0.08481265990703832,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 342.7375,
|
|
"completions/max_terminated_length": 342.7375,
|
|
"completions/mean_length": 233.45546875,
|
|
"completions/mean_terminated_length": 233.45546875,
|
|
"completions/min_length": 147.2125,
|
|
"completions/min_terminated_length": 147.2125,
|
|
"epoch": 0.8463742561163765,
|
|
"grad_norm": 0.3162519376309847,
|
|
"kl": 0.0266021728515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0054,
|
|
"num_tokens": 72361556.0,
|
|
"reward": 0.8315023425966501,
|
|
"reward_std": 0.10010532954183873,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8017674525035545,
|
|
"rewards/qatch_metrics/std": 0.11777098168386146,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 341.25,
|
|
"completions/max_terminated_length": 341.25,
|
|
"completions/mean_length": 234.0140625,
|
|
"completions/mean_terminated_length": 234.0140625,
|
|
"completions/min_length": 154.7,
|
|
"completions/min_terminated_length": 154.7,
|
|
"epoch": 0.8551906546175887,
|
|
"grad_norm": 0.297834941840183,
|
|
"kl": 0.02375946044921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0113,
|
|
"num_tokens": 73116902.0,
|
|
"reward": 0.7931279448792339,
|
|
"reward_std": 0.08709526733728126,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7566210999619216,
|
|
"rewards/qatch_metrics/std": 0.10246502548689022,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 324.5625,
|
|
"completions/max_terminated_length": 324.5625,
|
|
"completions/mean_length": 220.03203125,
|
|
"completions/mean_terminated_length": 220.03203125,
|
|
"completions/min_length": 138.8125,
|
|
"completions/min_terminated_length": 138.8125,
|
|
"epoch": 0.864007053118801,
|
|
"grad_norm": 0.20060310802569065,
|
|
"kl": 0.022344970703125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0071,
|
|
"num_tokens": 73868767.0,
|
|
"reward": 0.8470947509631515,
|
|
"reward_std": 0.06845701420679688,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8201114618219435,
|
|
"rewards/qatch_metrics/std": 0.08053766970988363,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 315.8,
|
|
"completions/max_terminated_length": 315.8,
|
|
"completions/mean_length": 211.5171875,
|
|
"completions/mean_terminated_length": 211.5171875,
|
|
"completions/min_length": 139.7625,
|
|
"completions/min_terminated_length": 139.7625,
|
|
"epoch": 0.8728234516200132,
|
|
"grad_norm": 0.2914817614366052,
|
|
"kl": 0.023822784423828125,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0034,
|
|
"num_tokens": 74608789.0,
|
|
"reward": 0.8249077584594489,
|
|
"reward_std": 0.08015898242010736,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7940091195050627,
|
|
"rewards/qatch_metrics/std": 0.09430469113285653,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 330.7,
|
|
"completions/max_terminated_length": 330.7,
|
|
"completions/mean_length": 225.6296875,
|
|
"completions/mean_terminated_length": 225.6296875,
|
|
"completions/min_length": 147.9125,
|
|
"completions/min_terminated_length": 147.9125,
|
|
"epoch": 0.8816398501212255,
|
|
"grad_norm": 0.27724680534940715,
|
|
"kl": 0.024609375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0044,
|
|
"num_tokens": 75370139.0,
|
|
"reward": 0.7901407666504383,
|
|
"reward_std": 0.08103454456286271,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7531067751348018,
|
|
"rewards/qatch_metrics/std": 0.09533476178003183,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 333.875,
|
|
"completions/max_terminated_length": 333.875,
|
|
"completions/mean_length": 220.46640625,
|
|
"completions/mean_terminated_length": 220.46640625,
|
|
"completions/min_length": 142.2,
|
|
"completions/min_terminated_length": 142.2,
|
|
"epoch": 0.8904562486224378,
|
|
"grad_norm": 0.23661752197680214,
|
|
"kl": 0.022271728515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0066,
|
|
"num_tokens": 76121648.0,
|
|
"reward": 0.8500555850565433,
|
|
"reward_std": 0.07330139055848121,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8235947942361236,
|
|
"rewards/qatch_metrics/std": 0.08623693531844764,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 317.7875,
|
|
"completions/max_terminated_length": 317.7875,
|
|
"completions/mean_length": 210.9671875,
|
|
"completions/mean_terminated_length": 210.9671875,
|
|
"completions/min_length": 138.975,
|
|
"completions/min_terminated_length": 138.975,
|
|
"epoch": 0.89927264712365,
|
|
"grad_norm": 0.21851493451825393,
|
|
"kl": 0.025273895263671874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0036,
|
|
"num_tokens": 76872790.0,
|
|
"reward": 0.8583780597895384,
|
|
"reward_std": 0.08067860676383135,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8333859436213971,
|
|
"rewards/qatch_metrics/std": 0.09491601307672681,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 321.775,
|
|
"completions/max_terminated_length": 321.775,
|
|
"completions/mean_length": 216.0375,
|
|
"completions/mean_terminated_length": 216.0375,
|
|
"completions/min_length": 141.1625,
|
|
"completions/min_terminated_length": 141.1625,
|
|
"epoch": 0.9080890456248623,
|
|
"grad_norm": 0.2829422652691582,
|
|
"kl": 0.025566864013671874,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0082,
|
|
"num_tokens": 77616342.0,
|
|
"reward": 0.8006692606955766,
|
|
"reward_std": 0.08726950597483665,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7654932357370854,
|
|
"rewards/qatch_metrics/std": 0.10267001276370138,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 290.275,
|
|
"completions/max_terminated_length": 290.275,
|
|
"completions/mean_length": 195.296875,
|
|
"completions/mean_terminated_length": 195.296875,
|
|
"completions/min_length": 130.9125,
|
|
"completions/min_terminated_length": 130.9125,
|
|
"epoch": 0.9169054441260746,
|
|
"grad_norm": 0.2228119611592202,
|
|
"kl": 0.023336029052734374,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0065,
|
|
"num_tokens": 78313506.0,
|
|
"reward": 0.7732636205852031,
|
|
"reward_std": 0.05938901338377036,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7332513069733977,
|
|
"rewards/qatch_metrics/std": 0.06986943007213994,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 305.6125,
|
|
"completions/max_terminated_length": 305.6125,
|
|
"completions/mean_length": 205.165625,
|
|
"completions/mean_terminated_length": 205.165625,
|
|
"completions/min_length": 137.2,
|
|
"completions/min_terminated_length": 137.2,
|
|
"epoch": 0.9257218426272867,
|
|
"grad_norm": 0.36003445762352576,
|
|
"kl": 0.023264312744140626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0079,
|
|
"num_tokens": 79054486.0,
|
|
"reward": 0.8107579160481692,
|
|
"reward_std": 0.07336599697882776,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7773622425273061,
|
|
"rewards/qatch_metrics/std": 0.08631294009974226,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 323.2125,
|
|
"completions/max_terminated_length": 323.2125,
|
|
"completions/mean_length": 220.25,
|
|
"completions/mean_terminated_length": 220.25,
|
|
"completions/min_length": 145.8625,
|
|
"completions/min_terminated_length": 145.8625,
|
|
"epoch": 0.934538241128499,
|
|
"grad_norm": 0.30049379873568555,
|
|
"kl": 0.0247039794921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0046,
|
|
"num_tokens": 79821798.0,
|
|
"reward": 0.8396705185994506,
|
|
"reward_std": 0.07967656154651195,
|
|
"rewards/format_reward/mean": 0.99921875,
|
|
"rewards/format_reward/std": 0.003125,
|
|
"rewards/qatch_metrics/mean": 0.811480475217104,
|
|
"rewards/qatch_metrics/std": 0.09332353379577399,
|
|
"rewards/tag_count_reward/mean": 0.9998046875,
|
|
"rewards/tag_count_reward/std": 0.00078125,
|
|
"step": 530
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 334.6,
|
|
"completions/max_terminated_length": 334.6,
|
|
"completions/mean_length": 224.428125,
|
|
"completions/mean_terminated_length": 224.428125,
|
|
"completions/min_length": 143.85,
|
|
"completions/min_terminated_length": 143.85,
|
|
"epoch": 0.9433546396297112,
|
|
"grad_norm": 0.26616721247221914,
|
|
"kl": 0.024881744384765626,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0114,
|
|
"num_tokens": 80588906.0,
|
|
"reward": 0.8039220564067364,
|
|
"reward_std": 0.08752607379574329,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7693200555630029,
|
|
"rewards/qatch_metrics/std": 0.10297185693052598,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 337.025,
|
|
"completions/max_terminated_length": 337.025,
|
|
"completions/mean_length": 222.8765625,
|
|
"completions/mean_terminated_length": 222.8765625,
|
|
"completions/min_length": 146.35,
|
|
"completions/min_terminated_length": 146.35,
|
|
"epoch": 0.9521710381309235,
|
|
"grad_norm": 0.26056623342737706,
|
|
"kl": 0.023919677734375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0073,
|
|
"num_tokens": 81348428.0,
|
|
"reward": 0.8161520700901747,
|
|
"reward_std": 0.06797666533384472,
|
|
"rewards/format_reward/mean": 0.9984375,
|
|
"rewards/format_reward/std": 0.00625,
|
|
"rewards/qatch_metrics/mean": 0.7839151054620743,
|
|
"rewards/qatch_metrics/std": 0.07955915104830638,
|
|
"rewards/tag_count_reward/mean": 0.999609375,
|
|
"rewards/tag_count_reward/std": 0.0015625,
|
|
"step": 540
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 329.45,
|
|
"completions/max_terminated_length": 329.45,
|
|
"completions/mean_length": 222.23515625,
|
|
"completions/mean_terminated_length": 222.23515625,
|
|
"completions/min_length": 145.9375,
|
|
"completions/min_terminated_length": 145.9375,
|
|
"epoch": 0.9609874366321358,
|
|
"grad_norm": 0.25072778848110355,
|
|
"kl": 0.02654876708984375,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0051,
|
|
"num_tokens": 82112537.0,
|
|
"reward": 0.8805699178948998,
|
|
"reward_std": 0.07740147057920695,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8594940162263811,
|
|
"rewards/qatch_metrics/std": 0.09106055875308812,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 326.8125,
|
|
"completions/max_terminated_length": 326.8125,
|
|
"completions/mean_length": 216.265625,
|
|
"completions/mean_terminated_length": 216.265625,
|
|
"completions/min_length": 142.5,
|
|
"completions/min_terminated_length": 142.5,
|
|
"epoch": 0.969803835133348,
|
|
"grad_norm": 0.25948373422406573,
|
|
"kl": 0.024462890625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0006,
|
|
"num_tokens": 82872173.0,
|
|
"reward": 0.8117451569065451,
|
|
"reward_std": 0.0860868067946285,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7785237034782767,
|
|
"rewards/qatch_metrics/std": 0.10127860223874449,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 337.1875,
|
|
"completions/max_terminated_length": 337.1875,
|
|
"completions/mean_length": 216.21640625,
|
|
"completions/mean_terminated_length": 216.21640625,
|
|
"completions/min_length": 137.8125,
|
|
"completions/min_terminated_length": 137.8125,
|
|
"epoch": 0.9786202336345603,
|
|
"grad_norm": 0.3645941597990175,
|
|
"kl": 0.02496337890625,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0009,
|
|
"num_tokens": 83634770.0,
|
|
"reward": 0.8101593714207411,
|
|
"reward_std": 0.08605160953738959,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7766580768162384,
|
|
"rewards/qatch_metrics/std": 0.10123719241237268,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.25,
|
|
"completions/max_terminated_length": 343.25,
|
|
"completions/mean_length": 231.0015625,
|
|
"completions/mean_terminated_length": 231.0015625,
|
|
"completions/min_length": 151.325,
|
|
"completions/min_terminated_length": 151.325,
|
|
"epoch": 0.9874366321357725,
|
|
"grad_norm": 0.21758575977885092,
|
|
"kl": 0.023052978515625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0066,
|
|
"num_tokens": 84398692.0,
|
|
"reward": 0.8315089832991361,
|
|
"reward_std": 0.07600237202714197,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8017752626910806,
|
|
"rewards/qatch_metrics/std": 0.08941456201137044,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 355.1375,
|
|
"completions/max_terminated_length": 355.1375,
|
|
"completions/mean_length": 232.215625,
|
|
"completions/mean_terminated_length": 232.215625,
|
|
"completions/min_length": 148.6875,
|
|
"completions/min_terminated_length": 148.6875,
|
|
"epoch": 0.9962530306369848,
|
|
"grad_norm": 0.2378336659692683,
|
|
"kl": 0.023166656494140625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0083,
|
|
"num_tokens": 85155592.0,
|
|
"reward": 0.7450484920293092,
|
|
"reward_std": 0.07769823344424368,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.7000570335425437,
|
|
"rewards/qatch_metrics/std": 0.09140969021318597,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 290.0625,
|
|
"completions/max_terminated_length": 290.0625,
|
|
"completions/mean_length": 187.46484375,
|
|
"completions/mean_terminated_length": 187.46484375,
|
|
"completions/min_length": 124.65625,
|
|
"completions/min_terminated_length": 124.65625,
|
|
"epoch": 0.9997795900374697,
|
|
"kl": 0.027322769165039062,
|
|
"num_tokens": 85427878.0,
|
|
"reward": 0.8777023833245039,
|
|
"reward_std": 0.08281319939123932,
|
|
"rewards/format_reward/mean": 1.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/qatch_metrics/mean": 0.8561204457655549,
|
|
"rewards/qatch_metrics/std": 0.09742730065772776,
|
|
"rewards/tag_count_reward/mean": 1.0,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 567,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.0013445673257480432,
|
|
"train_runtime": 158207.651,
|
|
"train_samples_per_second": 0.057,
|
|
"train_steps_per_second": 0.004
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 567,
|
|
"num_input_tokens_seen": 85427878,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 57,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|