923 lines
24 KiB
JSON
923 lines
24 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 11066,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.00018074807107917894,
|
||
|
|
"grad_norm": 58.25,
|
||
|
|
"learning_rate": 1.805054151624549e-07,
|
||
|
|
"loss": 4.652698516845703,
|
||
|
|
"step": 1,
|
||
|
|
"token_acc": 0.43327626145634
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.018074807107917895,
|
||
|
|
"grad_norm": 6.53125,
|
||
|
|
"learning_rate": 1.805054151624549e-05,
|
||
|
|
"loss": 3.920129641137942,
|
||
|
|
"step": 100,
|
||
|
|
"token_acc": 0.45332661759508064
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03614961421583579,
|
||
|
|
"grad_norm": 6.0,
|
||
|
|
"learning_rate": 3.610108303249098e-05,
|
||
|
|
"loss": 2.8419125366210936,
|
||
|
|
"step": 200,
|
||
|
|
"token_acc": 0.5193163006097973
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.054224421323753685,
|
||
|
|
"grad_norm": 5.625,
|
||
|
|
"learning_rate": 5.415162454873647e-05,
|
||
|
|
"loss": 2.2325677490234375,
|
||
|
|
"step": 300,
|
||
|
|
"token_acc": 0.577260581442613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07229922843167158,
|
||
|
|
"grad_norm": 5.09375,
|
||
|
|
"learning_rate": 7.220216606498195e-05,
|
||
|
|
"loss": 1.9477809143066407,
|
||
|
|
"step": 400,
|
||
|
|
"token_acc": 0.6095200383891847
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09037403553958948,
|
||
|
|
"grad_norm": 4.4375,
|
||
|
|
"learning_rate": 9.025270758122743e-05,
|
||
|
|
"loss": 1.7510385131835937,
|
||
|
|
"step": 500,
|
||
|
|
"token_acc": 0.6378192081318703
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10844884264750737,
|
||
|
|
"grad_norm": 4.28125,
|
||
|
|
"learning_rate": 9.999527526045029e-05,
|
||
|
|
"loss": 1.6535232543945313,
|
||
|
|
"step": 600,
|
||
|
|
"token_acc": 0.6496733713777781
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12652364975542527,
|
||
|
|
"grad_norm": 3.625,
|
||
|
|
"learning_rate": 9.99524110790929e-05,
|
||
|
|
"loss": 1.5711769104003905,
|
||
|
|
"step": 700,
|
||
|
|
"token_acc": 0.6620965629303417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14459845686334316,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 9.986493474590536e-05,
|
||
|
|
"loss": 1.5024029541015624,
|
||
|
|
"step": 800,
|
||
|
|
"token_acc": 0.6721741041947722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16267326397126106,
|
||
|
|
"grad_norm": 3.0,
|
||
|
|
"learning_rate": 9.973292438539405e-05,
|
||
|
|
"loss": 1.4528140258789062,
|
||
|
|
"step": 900,
|
||
|
|
"token_acc": 0.6810912279574309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18074807107917895,
|
||
|
|
"grad_norm": 3.421875,
|
||
|
|
"learning_rate": 9.955649789509624e-05,
|
||
|
|
"loss": 1.4288172912597656,
|
||
|
|
"step": 1000,
|
||
|
|
"token_acc": 0.6845218263208889
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19882287818709685,
|
||
|
|
"grad_norm": 3.4375,
|
||
|
|
"learning_rate": 9.933581284028659e-05,
|
||
|
|
"loss": 1.4166110229492188,
|
||
|
|
"step": 1100,
|
||
|
|
"token_acc": 0.68570654719404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21689768529501474,
|
||
|
|
"grad_norm": 2.90625,
|
||
|
|
"learning_rate": 9.907106631325671e-05,
|
||
|
|
"loss": 1.3844677734375,
|
||
|
|
"step": 1200,
|
||
|
|
"token_acc": 0.6901876362337299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23497249240293264,
|
||
|
|
"grad_norm": 2.375,
|
||
|
|
"learning_rate": 9.876249475729344e-05,
|
||
|
|
"loss": 1.3720639038085938,
|
||
|
|
"step": 1300,
|
||
|
|
"token_acc": 0.6921398802222583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25304729951085053,
|
||
|
|
"grad_norm": 2.6875,
|
||
|
|
"learning_rate": 9.841037375551294e-05,
|
||
|
|
"loss": 1.3502085876464844,
|
||
|
|
"step": 1400,
|
||
|
|
"token_acc": 0.6961120678024922
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27112210661876845,
|
||
|
|
"grad_norm": 2.53125,
|
||
|
|
"learning_rate": 9.801501778473935e-05,
|
||
|
|
"loss": 1.3242225646972656,
|
||
|
|
"step": 1500,
|
||
|
|
"token_acc": 0.7008815156964483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2891969137266863,
|
||
|
|
"grad_norm": 2.671875,
|
||
|
|
"learning_rate": 9.757677993464771e-05,
|
||
|
|
"loss": 1.3310586547851562,
|
||
|
|
"step": 1600,
|
||
|
|
"token_acc": 0.6988938129948864
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30727172083460425,
|
||
|
|
"grad_norm": 2.4375,
|
||
|
|
"learning_rate": 9.709605159242199e-05,
|
||
|
|
"loss": 1.3116970825195313,
|
||
|
|
"step": 1700,
|
||
|
|
"token_acc": 0.7015194406957871
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3253465279425221,
|
||
|
|
"grad_norm": 2.5625,
|
||
|
|
"learning_rate": 9.657326209320998e-05,
|
||
|
|
"loss": 1.3130838012695312,
|
||
|
|
"step": 1800,
|
||
|
|
"token_acc": 0.7008979147977925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34342133505044004,
|
||
|
|
"grad_norm": 2.453125,
|
||
|
|
"learning_rate": 9.600887833668701e-05,
|
||
|
|
"loss": 1.2876347351074218,
|
||
|
|
"step": 1900,
|
||
|
|
"token_acc": 0.7059691653488992
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3614961421583579,
|
||
|
|
"grad_norm": 2.28125,
|
||
|
|
"learning_rate": 9.540340437007106e-05,
|
||
|
|
"loss": 1.2805183410644532,
|
||
|
|
"step": 2000,
|
||
|
|
"token_acc": 0.7076493690679662
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3795709492662758,
|
||
|
|
"grad_norm": 2.3125,
|
||
|
|
"learning_rate": 9.475738093796172e-05,
|
||
|
|
"loss": 1.2861131286621095,
|
||
|
|
"step": 2100,
|
||
|
|
"token_acc": 0.7055881842553867
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3976457563741937,
|
||
|
|
"grad_norm": 2.375,
|
||
|
|
"learning_rate": 9.407138499940496e-05,
|
||
|
|
"loss": 1.268822479248047,
|
||
|
|
"step": 2200,
|
||
|
|
"token_acc": 0.7090846641855041
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4157205634821116,
|
||
|
|
"grad_norm": 2.59375,
|
||
|
|
"learning_rate": 9.334602921261492e-05,
|
||
|
|
"loss": 1.2444308471679688,
|
||
|
|
"step": 2300,
|
||
|
|
"token_acc": 0.7131349902523053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4337953705900295,
|
||
|
|
"grad_norm": 3.015625,
|
||
|
|
"learning_rate": 9.258196138781327e-05,
|
||
|
|
"loss": 1.2656473541259765,
|
||
|
|
"step": 2400,
|
||
|
|
"token_acc": 0.7094099462846465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4518701776979474,
|
||
|
|
"grad_norm": 2.265625,
|
||
|
|
"learning_rate": 9.177986390867419e-05,
|
||
|
|
"loss": 1.2598892211914063,
|
||
|
|
"step": 2500,
|
||
|
|
"token_acc": 0.7109632032447007
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4699449848058653,
|
||
|
|
"grad_norm": 2.5,
|
||
|
|
"learning_rate": 9.09404531228924e-05,
|
||
|
|
"loss": 1.2393927764892578,
|
||
|
|
"step": 2600,
|
||
|
|
"token_acc": 0.7133786975644326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4880197919137832,
|
||
|
|
"grad_norm": 2.3125,
|
||
|
|
"learning_rate": 9.0064478702418e-05,
|
||
|
|
"loss": 1.2412493896484376,
|
||
|
|
"step": 2700,
|
||
|
|
"token_acc": 0.7132032480227637
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5060945990217011,
|
||
|
|
"grad_norm": 2.5625,
|
||
|
|
"learning_rate": 8.915272297392945e-05,
|
||
|
|
"loss": 1.2382020568847656,
|
||
|
|
"step": 2800,
|
||
|
|
"token_acc": 0.7131689546842609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5241694061296189,
|
||
|
|
"grad_norm": 2.640625,
|
||
|
|
"learning_rate": 8.820600022014338e-05,
|
||
|
|
"loss": 1.2349536895751954,
|
||
|
|
"step": 2900,
|
||
|
|
"token_acc": 0.7140737742682136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5422442132375369,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 8.722515595258402e-05,
|
||
|
|
"loss": 1.2141342163085938,
|
||
|
|
"step": 3000,
|
||
|
|
"token_acc": 0.7181298317819385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5603190203454548,
|
||
|
|
"grad_norm": 2.078125,
|
||
|
|
"learning_rate": 8.621106615646292e-05,
|
||
|
|
"loss": 1.2058545684814452,
|
||
|
|
"step": 3100,
|
||
|
|
"token_acc": 0.7190379193709925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5783938274533726,
|
||
|
|
"grad_norm": 2.453125,
|
||
|
|
"learning_rate": 8.51646365083426e-05,
|
||
|
|
"loss": 1.2195273590087892,
|
||
|
|
"step": 3200,
|
||
|
|
"token_acc": 0.7170014250513891
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5964686345612905,
|
||
|
|
"grad_norm": 2.3125,
|
||
|
|
"learning_rate": 8.408680156728299e-05,
|
||
|
|
"loss": 1.2225239562988282,
|
||
|
|
"step": 3300,
|
||
|
|
"token_acc": 0.71660874090857
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6145434416692085,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 8.297852394019336e-05,
|
||
|
|
"loss": 1.1935769653320312,
|
||
|
|
"step": 3400,
|
||
|
|
"token_acc": 0.721348535831292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6326182487771264,
|
||
|
|
"grad_norm": 2.328125,
|
||
|
|
"learning_rate": 8.184079342213466e-05,
|
||
|
|
"loss": 1.198054428100586,
|
||
|
|
"step": 3500,
|
||
|
|
"token_acc": 0.7195931038612298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6506930558850442,
|
||
|
|
"grad_norm": 2.15625,
|
||
|
|
"learning_rate": 8.067462611234052e-05,
|
||
|
|
"loss": 1.2039249420166016,
|
||
|
|
"step": 3600,
|
||
|
|
"token_acc": 0.7198145640636711
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6687678629929621,
|
||
|
|
"grad_norm": 2.25,
|
||
|
|
"learning_rate": 7.948106350674593e-05,
|
||
|
|
"loss": 1.2044364166259767,
|
||
|
|
"step": 3700,
|
||
|
|
"token_acc": 0.7191590166885238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6868426701008801,
|
||
|
|
"grad_norm": 2.21875,
|
||
|
|
"learning_rate": 7.826117156783461e-05,
|
||
|
|
"loss": 1.1947254180908202,
|
||
|
|
"step": 3800,
|
||
|
|
"token_acc": 0.7211205989004562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7049174772087979,
|
||
|
|
"grad_norm": 2.171875,
|
||
|
|
"learning_rate": 7.701603977263513e-05,
|
||
|
|
"loss": 1.1763773345947266,
|
||
|
|
"step": 3900,
|
||
|
|
"token_acc": 0.7237809512705871
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7229922843167158,
|
||
|
|
"grad_norm": 2.546875,
|
||
|
|
"learning_rate": 7.574678013971672e-05,
|
||
|
|
"loss": 1.180088424682617,
|
||
|
|
"step": 4000,
|
||
|
|
"token_acc": 0.7241476445274628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7410670914246337,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 7.445452623605307e-05,
|
||
|
|
"loss": 1.1668415069580078,
|
||
|
|
"step": 4100,
|
||
|
|
"token_acc": 0.7264480800382547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7591418985325517,
|
||
|
|
"grad_norm": 2.015625,
|
||
|
|
"learning_rate": 7.314043216464158e-05,
|
||
|
|
"loss": 1.1807654571533204,
|
||
|
|
"step": 4200,
|
||
|
|
"token_acc": 0.7243762391137034
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7772167056404695,
|
||
|
|
"grad_norm": 2.328125,
|
||
|
|
"learning_rate": 7.180567153378193e-05,
|
||
|
|
"loss": 1.1845186614990235,
|
||
|
|
"step": 4300,
|
||
|
|
"token_acc": 0.7214888736557151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7952915127483874,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 7.045143640893474e-05,
|
||
|
|
"loss": 1.1583942413330077,
|
||
|
|
"step": 4400,
|
||
|
|
"token_acc": 0.7274813631693335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8133663198563053,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 6.907893624809609e-05,
|
||
|
|
"loss": 1.145471420288086,
|
||
|
|
"step": 4500,
|
||
|
|
"token_acc": 0.7298424303929127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8314411269642232,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 6.768939682163902e-05,
|
||
|
|
"loss": 1.1666727447509766,
|
||
|
|
"step": 4600,
|
||
|
|
"token_acc": 0.7264280244660613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8495159340721411,
|
||
|
|
"grad_norm": 2.09375,
|
||
|
|
"learning_rate": 6.628405911758647e-05,
|
||
|
|
"loss": 1.1622318267822265,
|
||
|
|
"step": 4700,
|
||
|
|
"token_acc": 0.7271438460195889
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.867590741180059,
|
||
|
|
"grad_norm": 2.15625,
|
||
|
|
"learning_rate": 6.486417823329354e-05,
|
||
|
|
"loss": 1.1364639282226563,
|
||
|
|
"step": 4800,
|
||
|
|
"token_acc": 0.7319144126512372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8856655482879768,
|
||
|
|
"grad_norm": 2.421875,
|
||
|
|
"learning_rate": 6.34310222545287e-05,
|
||
|
|
"loss": 1.1561846923828125,
|
||
|
|
"step": 4900,
|
||
|
|
"token_acc": 0.7282120514362783
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9037403553958948,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 6.198587112295526e-05,
|
||
|
|
"loss": 1.1464973449707032,
|
||
|
|
"step": 5000,
|
||
|
|
"token_acc": 0.7304544859576637
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9218151625038127,
|
||
|
|
"grad_norm": 2.0,
|
||
|
|
"learning_rate": 6.053001549302422e-05,
|
||
|
|
"loss": 1.135927963256836,
|
||
|
|
"step": 5100,
|
||
|
|
"token_acc": 0.7309169370264882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9398899696117305,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 5.906475557929985e-05,
|
||
|
|
"loss": 1.1362411499023437,
|
||
|
|
"step": 5200,
|
||
|
|
"token_acc": 0.7308731770004574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9579647767196484,
|
||
|
|
"grad_norm": 2.15625,
|
||
|
|
"learning_rate": 5.759139999524705e-05,
|
||
|
|
"loss": 1.132964859008789,
|
||
|
|
"step": 5300,
|
||
|
|
"token_acc": 0.7320876184986531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9760395838275664,
|
||
|
|
"grad_norm": 2.09375,
|
||
|
|
"learning_rate": 5.611126458451772e-05,
|
||
|
|
"loss": 1.1330313873291016,
|
||
|
|
"step": 5400,
|
||
|
|
"token_acc": 0.7321249508199824
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9941143909354843,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 5.462567124577992e-05,
|
||
|
|
"loss": 1.137665786743164,
|
||
|
|
"step": 5500,
|
||
|
|
"token_acc": 0.731388650772945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0121101207623049,
|
||
|
|
"grad_norm": 2.71875,
|
||
|
|
"learning_rate": 5.3135946752139385e-05,
|
||
|
|
"loss": 1.0870736694335938,
|
||
|
|
"step": 5600,
|
||
|
|
"token_acc": 0.7406857225004193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0301849278702229,
|
||
|
|
"grad_norm": 2.234375,
|
||
|
|
"learning_rate": 5.1643421566207615e-05,
|
||
|
|
"loss": 1.0574837493896485,
|
||
|
|
"step": 5700,
|
||
|
|
"token_acc": 0.7462299226419611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0482597349781408,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 5.0149428651874985e-05,
|
||
|
|
"loss": 1.0600157165527344,
|
||
|
|
"step": 5800,
|
||
|
|
"token_acc": 0.7451135106751428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0663345420860586,
|
||
|
|
"grad_norm": 2.296875,
|
||
|
|
"learning_rate": 4.86553022838499e-05,
|
||
|
|
"loss": 1.0566656494140625,
|
||
|
|
"step": 5900,
|
||
|
|
"token_acc": 0.7469262925837835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0844093491939766,
|
||
|
|
"grad_norm": 2.1875,
|
||
|
|
"learning_rate": 4.716237685602735e-05,
|
||
|
|
"loss": 1.073977508544922,
|
||
|
|
"step": 6000,
|
||
|
|
"token_acc": 0.7416494081610407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1024841563018946,
|
||
|
|
"grad_norm": 2.359375,
|
||
|
|
"learning_rate": 4.567198568975096e-05,
|
||
|
|
"loss": 1.0501838684082032,
|
||
|
|
"step": 6100,
|
||
|
|
"token_acc": 0.7475906446092413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1205589634098123,
|
||
|
|
"grad_norm": 1.9453125,
|
||
|
|
"learning_rate": 4.418545984303294e-05,
|
||
|
|
"loss": 1.0559381866455078,
|
||
|
|
"step": 6200,
|
||
|
|
"token_acc": 0.7470775683736796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1386337705177303,
|
||
|
|
"grad_norm": 2.09375,
|
||
|
|
"learning_rate": 4.2704126921795424e-05,
|
||
|
|
"loss": 1.0539588928222656,
|
||
|
|
"step": 6300,
|
||
|
|
"token_acc": 0.7466571018782976
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1567085776256483,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 4.1229309894194806e-05,
|
||
|
|
"loss": 1.0646955108642577,
|
||
|
|
"step": 6400,
|
||
|
|
"token_acc": 0.7450717568377625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.174783384733566,
|
||
|
|
"grad_norm": 1.8828125,
|
||
|
|
"learning_rate": 3.976232590908812e-05,
|
||
|
|
"loss": 1.0539531707763672,
|
||
|
|
"step": 6500,
|
||
|
|
"token_acc": 0.7465753023540981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.192858191841484,
|
||
|
|
"grad_norm": 2.25,
|
||
|
|
"learning_rate": 3.830448511969638e-05,
|
||
|
|
"loss": 1.0504056549072265,
|
||
|
|
"step": 6600,
|
||
|
|
"token_acc": 0.7470591527899016
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2109329989494018,
|
||
|
|
"grad_norm": 1.9921875,
|
||
|
|
"learning_rate": 3.6857089513516035e-05,
|
||
|
|
"loss": 1.0537297821044922,
|
||
|
|
"step": 6700,
|
||
|
|
"token_acc": 0.746264189895728
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2290078060573197,
|
||
|
|
"grad_norm": 2.078125,
|
||
|
|
"learning_rate": 3.542143174952282e-05,
|
||
|
|
"loss": 1.0569972229003906,
|
||
|
|
"step": 6800,
|
||
|
|
"token_acc": 0.7456408105039157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2470826131652377,
|
||
|
|
"grad_norm": 2.078125,
|
||
|
|
"learning_rate": 3.399879400370704e-05,
|
||
|
|
"loss": 1.0594657897949218,
|
||
|
|
"step": 6900,
|
||
|
|
"token_acc": 0.7464631930608675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2651574202731555,
|
||
|
|
"grad_norm": 2.296875,
|
||
|
|
"learning_rate": 3.259044682397107e-05,
|
||
|
|
"loss": 1.059138946533203,
|
||
|
|
"step": 7000,
|
||
|
|
"token_acc": 0.7466058265866723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2832322273810735,
|
||
|
|
"grad_norm": 2.140625,
|
||
|
|
"learning_rate": 3.119764799541187e-05,
|
||
|
|
"loss": 1.0547212982177734,
|
||
|
|
"step": 7100,
|
||
|
|
"token_acc": 0.7473527098438676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3013070344889912,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 2.9821641417001806e-05,
|
||
|
|
"loss": 1.0402613067626953,
|
||
|
|
"step": 7200,
|
||
|
|
"token_acc": 0.7493453306325137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3193818415969092,
|
||
|
|
"grad_norm": 2.140625,
|
||
|
|
"learning_rate": 2.846365599067111e-05,
|
||
|
|
"loss": 1.045955352783203,
|
||
|
|
"step": 7300,
|
||
|
|
"token_acc": 0.7483038206918617
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3374566487048272,
|
||
|
|
"grad_norm": 2.203125,
|
||
|
|
"learning_rate": 2.7124904523784144e-05,
|
||
|
|
"loss": 1.0378961944580078,
|
||
|
|
"step": 7400,
|
||
|
|
"token_acc": 0.7502343443728431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.355531455812745,
|
||
|
|
"grad_norm": 2.46875,
|
||
|
|
"learning_rate": 2.580658264598942e-05,
|
||
|
|
"loss": 1.0476718902587892,
|
||
|
|
"step": 7500,
|
||
|
|
"token_acc": 0.7483254533842428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.373606262920663,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 2.450986774141123e-05,
|
||
|
|
"loss": 1.0452989196777345,
|
||
|
|
"step": 7600,
|
||
|
|
"token_acc": 0.7482226148441731
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3916810700285809,
|
||
|
|
"grad_norm": 2.140625,
|
||
|
|
"learning_rate": 2.3235917897135934e-05,
|
||
|
|
"loss": 1.0406829833984375,
|
||
|
|
"step": 7700,
|
||
|
|
"token_acc": 0.74961852670988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4097558771364986,
|
||
|
|
"grad_norm": 2.3125,
|
||
|
|
"learning_rate": 2.1985870868932456e-05,
|
||
|
|
"loss": 1.0259892272949218,
|
||
|
|
"step": 7800,
|
||
|
|
"token_acc": 0.7533507908450509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4278306842444166,
|
||
|
|
"grad_norm": 2.421875,
|
||
|
|
"learning_rate": 2.076084306513049e-05,
|
||
|
|
"loss": 1.0363540649414062,
|
||
|
|
"step": 7900,
|
||
|
|
"token_acc": 0.7503043904274775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4459054913523346,
|
||
|
|
"grad_norm": 1.984375,
|
||
|
|
"learning_rate": 1.9561928549563968e-05,
|
||
|
|
"loss": 1.0644924926757813,
|
||
|
|
"step": 8000,
|
||
|
|
"token_acc": 0.7456255842220881
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4639802984602523,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 1.839019806447024e-05,
|
||
|
|
"loss": 1.0405005645751952,
|
||
|
|
"step": 8100,
|
||
|
|
"token_acc": 0.7495747171714061
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4820551055681703,
|
||
|
|
"grad_norm": 2.09375,
|
||
|
|
"learning_rate": 1.724669807421762e-05,
|
||
|
|
"loss": 1.0440809631347656,
|
||
|
|
"step": 8200,
|
||
|
|
"token_acc": 0.7486369722470563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5001299126760883,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 1.6132449830715263e-05,
|
||
|
|
"loss": 1.0530775451660157,
|
||
|
|
"step": 8300,
|
||
|
|
"token_acc": 0.7477425947235785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.518204719784006,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 1.5048448461340258e-05,
|
||
|
|
"loss": 1.0460784912109375,
|
||
|
|
"step": 8400,
|
||
|
|
"token_acc": 0.7489244523851678
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5362795268919238,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 1.3995662080196215e-05,
|
||
|
|
"loss": 1.0327759552001954,
|
||
|
|
"step": 8500,
|
||
|
|
"token_acc": 0.7517219276186928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5543543339998418,
|
||
|
|
"grad_norm": 1.8125,
|
||
|
|
"learning_rate": 1.2975030923497262e-05,
|
||
|
|
"loss": 1.037949981689453,
|
||
|
|
"step": 8600,
|
||
|
|
"token_acc": 0.7504533222058104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5724291411077598,
|
||
|
|
"grad_norm": 2.109375,
|
||
|
|
"learning_rate": 1.1987466509849655e-05,
|
||
|
|
"loss": 1.0523592376708983,
|
||
|
|
"step": 8700,
|
||
|
|
"token_acc": 0.7466913343954412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5905039482156775,
|
||
|
|
"grad_norm": 2.265625,
|
||
|
|
"learning_rate": 1.1033850826180781e-05,
|
||
|
|
"loss": 1.048785171508789,
|
||
|
|
"step": 8800,
|
||
|
|
"token_acc": 0.7472665235971139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6085787553235955,
|
||
|
|
"grad_norm": 2.0,
|
||
|
|
"learning_rate": 1.0115035540042784e-05,
|
||
|
|
"loss": 1.032520523071289,
|
||
|
|
"step": 8900,
|
||
|
|
"token_acc": 0.7507173368953239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6266535624315135,
|
||
|
|
"grad_norm": 2.21875,
|
||
|
|
"learning_rate": 9.231841238994194e-06,
|
||
|
|
"loss": 1.043864974975586,
|
||
|
|
"step": 9000,
|
||
|
|
"token_acc": 0.7493066372338768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6447283695394312,
|
||
|
|
"grad_norm": 1.90625,
|
||
|
|
"learning_rate": 8.385056697738796e-06,
|
||
|
|
"loss": 1.0542935943603515,
|
||
|
|
"step": 9100,
|
||
|
|
"token_acc": 0.7465372369884397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6628031766473492,
|
||
|
|
"grad_norm": 1.984375,
|
||
|
|
"learning_rate": 7.575438173676513e-06,
|
||
|
|
"loss": 1.031275177001953,
|
||
|
|
"step": 9200,
|
||
|
|
"token_acc": 0.7517255178380788
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6808779837552672,
|
||
|
|
"grad_norm": 2.125,
|
||
|
|
"learning_rate": 6.803708731495117e-06,
|
||
|
|
"loss": 1.0453128814697266,
|
||
|
|
"step": 9300,
|
||
|
|
"token_acc": 0.7487111883579803
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.698952790863185,
|
||
|
|
"grad_norm": 1.8984375,
|
||
|
|
"learning_rate": 6.070557597406163e-06,
|
||
|
|
"loss": 1.0431288146972657,
|
||
|
|
"step": 9400,
|
||
|
|
"token_acc": 0.7488178364241926
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.717027597971103,
|
||
|
|
"grad_norm": 1.96875,
|
||
|
|
"learning_rate": 5.376639543601858e-06,
|
||
|
|
"loss": 1.0395802307128905,
|
||
|
|
"step": 9500,
|
||
|
|
"token_acc": 0.7503542725542914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.735102405079021,
|
||
|
|
"grad_norm": 2.078125,
|
||
|
|
"learning_rate": 4.722574303482557e-06,
|
||
|
|
"loss": 1.0538075256347657,
|
||
|
|
"step": 9600,
|
||
|
|
"token_acc": 0.7472696432430282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7531772121869387,
|
||
|
|
"grad_norm": 2.234375,
|
||
|
|
"learning_rate": 4.1089460181771675e-06,
|
||
|
|
"loss": 1.046026382446289,
|
||
|
|
"step": 9700,
|
||
|
|
"token_acc": 0.7478882170852356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7712520192948564,
|
||
|
|
"grad_norm": 2.15625,
|
||
|
|
"learning_rate": 3.5363027148507423e-06,
|
||
|
|
"loss": 1.0465138244628907,
|
||
|
|
"step": 9800,
|
||
|
|
"token_acc": 0.7479565299418576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7893268264027746,
|
||
|
|
"grad_norm": 2.28125,
|
||
|
|
"learning_rate": 3.0051558172652316e-06,
|
||
|
|
"loss": 1.0322959899902344,
|
||
|
|
"step": 9900,
|
||
|
|
"token_acc": 0.752127805838985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8074016335106924,
|
||
|
|
"grad_norm": 2.21875,
|
||
|
|
"learning_rate": 2.5159796890304564e-06,
|
||
|
|
"loss": 1.0446186065673828,
|
||
|
|
"step": 10000,
|
||
|
|
"token_acc": 0.7488376764944241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8254764406186101,
|
||
|
|
"grad_norm": 2.0625,
|
||
|
|
"learning_rate": 2.069211209953287e-06,
|
||
|
|
"loss": 1.0309945678710937,
|
||
|
|
"step": 10100,
|
||
|
|
"token_acc": 0.7524413140197763
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8435512477265281,
|
||
|
|
"grad_norm": 1.8203125,
|
||
|
|
"learning_rate": 1.6652493858632823e-06,
|
||
|
|
"loss": 1.0363735198974608,
|
||
|
|
"step": 10200,
|
||
|
|
"token_acc": 0.7503588935333034
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.861626054834446,
|
||
|
|
"grad_norm": 2.296875,
|
||
|
|
"learning_rate": 1.3044549922633876e-06,
|
||
|
|
"loss": 1.0336082458496094,
|
||
|
|
"step": 10300,
|
||
|
|
"token_acc": 0.7506362801039103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8797008619423639,
|
||
|
|
"grad_norm": 2.046875,
|
||
|
|
"learning_rate": 9.871502521237975e-07,
|
||
|
|
"loss": 1.0328756713867187,
|
||
|
|
"step": 10400,
|
||
|
|
"token_acc": 0.7518247545659716
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8977756690502818,
|
||
|
|
"grad_norm": 1.8671875,
|
||
|
|
"learning_rate": 7.136185481068925e-07,
|
||
|
|
"loss": 1.0489426422119141,
|
||
|
|
"step": 10500,
|
||
|
|
"token_acc": 0.747691160714326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9158504761581998,
|
||
|
|
"grad_norm": 1.765625,
|
||
|
|
"learning_rate": 4.841041694801208e-07,
|
||
|
|
"loss": 1.0270442962646484,
|
||
|
|
"step": 10600,
|
||
|
|
"token_acc": 0.7519379627407866
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9339252832661176,
|
||
|
|
"grad_norm": 2.0,
|
||
|
|
"learning_rate": 2.988120939429684e-07,
|
||
|
|
"loss": 1.0303496551513671,
|
||
|
|
"step": 10700,
|
||
|
|
"token_acc": 0.7513509325616106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9520000903740355,
|
||
|
|
"grad_norm": 1.890625,
|
||
|
|
"learning_rate": 1.5790780456277355e-07,
|
||
|
|
"loss": 1.0394702911376954,
|
||
|
|
"step": 10800,
|
||
|
|
"token_acc": 0.7495070401423912
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9700748974819535,
|
||
|
|
"grad_norm": 2.390625,
|
||
|
|
"learning_rate": 6.15171419829752e-08,
|
||
|
|
"loss": 1.0353932189941406,
|
||
|
|
"step": 10900,
|
||
|
|
"token_acc": 0.7506620155660084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9881497045898713,
|
||
|
|
"grad_norm": 2.15625,
|
||
|
|
"learning_rate": 9.726192035691694e-09,
|
||
|
|
"loss": 1.0364900970458983,
|
||
|
|
"step": 11000,
|
||
|
|
"token_acc": 0.7511165613553197
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 100,
|
||
|
|
"max_steps": 11066,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 2,
|
||
|
|
"save_steps": 1000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 8.597718898108247e+17,
|
||
|
|
"train_batch_size": 8,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|