1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 300,
|
|
"best_metric": 0.67163587,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v21-20250507-064807/checkpoint-300",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 5.7837233543396,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 1.0828938484191895,
|
|
"memory(GiB)": 27.73,
|
|
"step": 1,
|
|
"token_acc": 0.7079992873686086,
|
|
"train_speed(iter/s)": 0.069407
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 2.584890842437744,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.8132728338241577,
|
|
"memory(GiB)": 27.77,
|
|
"step": 5,
|
|
"token_acc": 0.7766760462727232,
|
|
"train_speed(iter/s)": 0.127059
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 1.4381581544876099,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.7008798599243165,
|
|
"memory(GiB)": 27.77,
|
|
"step": 10,
|
|
"token_acc": 0.8064654179148698,
|
|
"train_speed(iter/s)": 0.144995
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.4875842332839966,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.6918133735656739,
|
|
"memory(GiB)": 27.77,
|
|
"step": 15,
|
|
"token_acc": 0.7917820548324563,
|
|
"train_speed(iter/s)": 0.148104
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 1.2409731149673462,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.6582849025726318,
|
|
"memory(GiB)": 27.77,
|
|
"step": 20,
|
|
"token_acc": 0.8092561024264626,
|
|
"train_speed(iter/s)": 0.1504
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.722944438457489,
|
|
"eval_runtime": 4.5169,
|
|
"eval_samples_per_second": 22.139,
|
|
"eval_steps_per_second": 5.535,
|
|
"eval_token_acc": 0.8038043327122556,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 1.244113802909851,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.6381969451904297,
|
|
"memory(GiB)": 27.77,
|
|
"step": 25,
|
|
"token_acc": 0.8135619641465316,
|
|
"train_speed(iter/s)": 0.138906
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 1.034762978553772,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.6519847869873047,
|
|
"memory(GiB)": 27.77,
|
|
"step": 30,
|
|
"token_acc": 0.8079350766456267,
|
|
"train_speed(iter/s)": 0.144048
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 1.1062158346176147,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.6522578716278076,
|
|
"memory(GiB)": 27.77,
|
|
"step": 35,
|
|
"token_acc": 0.7871428029296801,
|
|
"train_speed(iter/s)": 0.146527
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 1.048275113105774,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.6380832672119141,
|
|
"memory(GiB)": 30.0,
|
|
"step": 40,
|
|
"token_acc": 0.8310100032268474,
|
|
"train_speed(iter/s)": 0.147965
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.7094467878341675,
|
|
"eval_runtime": 4.5026,
|
|
"eval_samples_per_second": 22.209,
|
|
"eval_steps_per_second": 5.552,
|
|
"eval_token_acc": 0.8064740398787508,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 1.228413462638855,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.6573184967041016,
|
|
"memory(GiB)": 30.0,
|
|
"step": 45,
|
|
"token_acc": 0.793196216263126,
|
|
"train_speed(iter/s)": 0.142174
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 1.2188494205474854,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.6832234382629394,
|
|
"memory(GiB)": 30.0,
|
|
"step": 50,
|
|
"token_acc": 0.7763722873389811,
|
|
"train_speed(iter/s)": 0.14464
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 1.0591987371444702,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.653898048400879,
|
|
"memory(GiB)": 30.0,
|
|
"step": 55,
|
|
"token_acc": 0.8127191799298624,
|
|
"train_speed(iter/s)": 0.146174
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 1.0762509107589722,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.612049913406372,
|
|
"memory(GiB)": 30.0,
|
|
"step": 60,
|
|
"token_acc": 0.8167217591261857,
|
|
"train_speed(iter/s)": 0.147559
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.6970872282981873,
|
|
"eval_runtime": 4.4596,
|
|
"eval_samples_per_second": 22.423,
|
|
"eval_steps_per_second": 5.606,
|
|
"eval_token_acc": 0.8075586084151395,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 1.0156168937683105,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.6183786392211914,
|
|
"memory(GiB)": 30.0,
|
|
"step": 65,
|
|
"token_acc": 0.82515202980332,
|
|
"train_speed(iter/s)": 0.142388
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 1.1835277080535889,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.6588045597076416,
|
|
"memory(GiB)": 30.0,
|
|
"step": 70,
|
|
"token_acc": 0.8151584404952757,
|
|
"train_speed(iter/s)": 0.144193
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 1.1693778038024902,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.6500480651855469,
|
|
"memory(GiB)": 30.0,
|
|
"step": 75,
|
|
"token_acc": 0.8076034754555119,
|
|
"train_speed(iter/s)": 0.145372
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 1.0537551641464233,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.6718258380889892,
|
|
"memory(GiB)": 30.0,
|
|
"step": 80,
|
|
"token_acc": 0.7944381259859853,
|
|
"train_speed(iter/s)": 0.14633
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.6917301416397095,
|
|
"eval_runtime": 4.4847,
|
|
"eval_samples_per_second": 22.298,
|
|
"eval_steps_per_second": 5.575,
|
|
"eval_token_acc": 0.8087266053004811,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 1.139643669128418,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.6676198005676269,
|
|
"memory(GiB)": 30.0,
|
|
"step": 85,
|
|
"token_acc": 0.8008274744669155,
|
|
"train_speed(iter/s)": 0.143323
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 1.1069749593734741,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.6722775459289551,
|
|
"memory(GiB)": 30.0,
|
|
"step": 90,
|
|
"token_acc": 0.8005652779928759,
|
|
"train_speed(iter/s)": 0.144592
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 1.1481822729110718,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.6242180824279785,
|
|
"memory(GiB)": 30.0,
|
|
"step": 95,
|
|
"token_acc": 0.8088621855050695,
|
|
"train_speed(iter/s)": 0.145728
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 1.0265405178070068,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.6294228076934815,
|
|
"memory(GiB)": 30.0,
|
|
"step": 100,
|
|
"token_acc": 0.797233893557423,
|
|
"train_speed(iter/s)": 0.146555
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.6827206611633301,
|
|
"eval_runtime": 4.4877,
|
|
"eval_samples_per_second": 22.283,
|
|
"eval_steps_per_second": 5.571,
|
|
"eval_token_acc": 0.8118412636613921,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 1.1638389825820923,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.6758254051208497,
|
|
"memory(GiB)": 30.0,
|
|
"step": 105,
|
|
"token_acc": 0.8088623640012675,
|
|
"train_speed(iter/s)": 0.144045
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 1.2460920810699463,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.6744856834411621,
|
|
"memory(GiB)": 30.0,
|
|
"step": 110,
|
|
"token_acc": 0.8033372194695424,
|
|
"train_speed(iter/s)": 0.145146
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 1.1937848329544067,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.6776030540466309,
|
|
"memory(GiB)": 30.0,
|
|
"step": 115,
|
|
"token_acc": 0.7960356428441535,
|
|
"train_speed(iter/s)": 0.145963
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 1.2813752889633179,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.6523926734924317,
|
|
"memory(GiB)": 30.0,
|
|
"step": 120,
|
|
"token_acc": 0.8113255093959248,
|
|
"train_speed(iter/s)": 0.146806
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.6801063418388367,
|
|
"eval_runtime": 4.5179,
|
|
"eval_samples_per_second": 22.134,
|
|
"eval_steps_per_second": 5.533,
|
|
"eval_token_acc": 0.8103673628298896,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 1.2211530208587646,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.6863309383392334,
|
|
"memory(GiB)": 30.0,
|
|
"step": 125,
|
|
"token_acc": 0.7986384909941853,
|
|
"train_speed(iter/s)": 0.144918
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 1.0644088983535767,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.6193663597106933,
|
|
"memory(GiB)": 30.0,
|
|
"step": 130,
|
|
"token_acc": 0.8070464504820333,
|
|
"train_speed(iter/s)": 0.145439
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 1.0717216730117798,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.6962187767028809,
|
|
"memory(GiB)": 30.0,
|
|
"step": 135,
|
|
"token_acc": 0.7882992561955905,
|
|
"train_speed(iter/s)": 0.146161
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 1.0318809747695923,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.6141955375671386,
|
|
"memory(GiB)": 30.0,
|
|
"step": 140,
|
|
"token_acc": 0.8250999478532939,
|
|
"train_speed(iter/s)": 0.146718
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.6769556403160095,
|
|
"eval_runtime": 4.5137,
|
|
"eval_samples_per_second": 22.155,
|
|
"eval_steps_per_second": 5.539,
|
|
"eval_token_acc": 0.812314024305459,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 1.1931071281433105,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.6433291435241699,
|
|
"memory(GiB)": 30.0,
|
|
"step": 145,
|
|
"token_acc": 0.8096512634810674,
|
|
"train_speed(iter/s)": 0.145106
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.9276081323623657,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.6246243000030518,
|
|
"memory(GiB)": 30.0,
|
|
"step": 150,
|
|
"token_acc": 0.8373285914577848,
|
|
"train_speed(iter/s)": 0.145752
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 1.0196563005447388,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.6316683769226075,
|
|
"memory(GiB)": 30.0,
|
|
"step": 155,
|
|
"token_acc": 0.8189631162217006,
|
|
"train_speed(iter/s)": 0.14654
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 1.0861464738845825,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.574582290649414,
|
|
"memory(GiB)": 30.0,
|
|
"step": 160,
|
|
"token_acc": 0.822840260798696,
|
|
"train_speed(iter/s)": 0.147158
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.6779691576957703,
|
|
"eval_runtime": 4.5159,
|
|
"eval_samples_per_second": 22.144,
|
|
"eval_steps_per_second": 5.536,
|
|
"eval_token_acc": 0.8116465975138352,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 1.1801550388336182,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.5374558448791504,
|
|
"memory(GiB)": 30.0,
|
|
"step": 165,
|
|
"token_acc": 0.8396995365190986,
|
|
"train_speed(iter/s)": 0.145626
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 1.0841394662857056,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.5700150489807129,
|
|
"memory(GiB)": 30.0,
|
|
"step": 170,
|
|
"token_acc": 0.8313138512710858,
|
|
"train_speed(iter/s)": 0.146379
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 1.0907129049301147,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.530540657043457,
|
|
"memory(GiB)": 30.0,
|
|
"step": 175,
|
|
"token_acc": 0.8248816768086545,
|
|
"train_speed(iter/s)": 0.14678
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 1.0117602348327637,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.5229566097259521,
|
|
"memory(GiB)": 30.0,
|
|
"step": 180,
|
|
"token_acc": 0.8368724855693546,
|
|
"train_speed(iter/s)": 0.147192
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.6804619431495667,
|
|
"eval_runtime": 4.4977,
|
|
"eval_samples_per_second": 22.233,
|
|
"eval_steps_per_second": 5.558,
|
|
"eval_token_acc": 0.8114797408159292,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 1.050369381904602,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.548884916305542,
|
|
"memory(GiB)": 30.0,
|
|
"step": 185,
|
|
"token_acc": 0.8174426020408163,
|
|
"train_speed(iter/s)": 0.1459
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 1.1832817792892456,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.5427236557006836,
|
|
"memory(GiB)": 30.0,
|
|
"step": 190,
|
|
"token_acc": 0.8312639081497726,
|
|
"train_speed(iter/s)": 0.146405
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 1.0814838409423828,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.5683661937713623,
|
|
"memory(GiB)": 30.0,
|
|
"step": 195,
|
|
"token_acc": 0.8363592434074278,
|
|
"train_speed(iter/s)": 0.147077
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.9832177758216858,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.5114535808563232,
|
|
"memory(GiB)": 30.0,
|
|
"step": 200,
|
|
"token_acc": 0.8398058252427184,
|
|
"train_speed(iter/s)": 0.14749
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.6795041561126709,
|
|
"eval_runtime": 4.5398,
|
|
"eval_samples_per_second": 22.027,
|
|
"eval_steps_per_second": 5.507,
|
|
"eval_token_acc": 0.812147167607553,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 1.0313045978546143,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.5399807453155517,
|
|
"memory(GiB)": 30.0,
|
|
"step": 205,
|
|
"token_acc": 0.8344659940404622,
|
|
"train_speed(iter/s)": 0.14634
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 1.0118134021759033,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.5527422904968262,
|
|
"memory(GiB)": 30.0,
|
|
"step": 210,
|
|
"token_acc": 0.8313429020123443,
|
|
"train_speed(iter/s)": 0.146686
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 1.0724433660507202,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.5539234161376954,
|
|
"memory(GiB)": 30.0,
|
|
"step": 215,
|
|
"token_acc": 0.8117005197773436,
|
|
"train_speed(iter/s)": 0.147158
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 1.086374282836914,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.5297107219696044,
|
|
"memory(GiB)": 30.0,
|
|
"step": 220,
|
|
"token_acc": 0.8333011097792042,
|
|
"train_speed(iter/s)": 0.147561
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.6780869960784912,
|
|
"eval_runtime": 4.4783,
|
|
"eval_samples_per_second": 22.33,
|
|
"eval_steps_per_second": 5.582,
|
|
"eval_token_acc": 0.8137323062376596,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 1.1475560665130615,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.5562318801879883,
|
|
"memory(GiB)": 30.0,
|
|
"step": 225,
|
|
"token_acc": 0.820589226025445,
|
|
"train_speed(iter/s)": 0.146647
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 1.0739235877990723,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.5656192779541016,
|
|
"memory(GiB)": 30.0,
|
|
"step": 230,
|
|
"token_acc": 0.8434569629111267,
|
|
"train_speed(iter/s)": 0.147011
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 1.153382420539856,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.5392692565917969,
|
|
"memory(GiB)": 30.0,
|
|
"step": 235,
|
|
"token_acc": 0.8344013490725126,
|
|
"train_speed(iter/s)": 0.147492
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 1.0742793083190918,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.5323070049285888,
|
|
"memory(GiB)": 30.0,
|
|
"step": 240,
|
|
"token_acc": 0.8339429680501642,
|
|
"train_speed(iter/s)": 0.147851
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.6787331104278564,
|
|
"eval_runtime": 4.4769,
|
|
"eval_samples_per_second": 22.337,
|
|
"eval_steps_per_second": 5.584,
|
|
"eval_token_acc": 0.8122027865068551,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 1.012741208076477,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.5413528919219971,
|
|
"memory(GiB)": 30.0,
|
|
"step": 245,
|
|
"token_acc": 0.8438339287914254,
|
|
"train_speed(iter/s)": 0.14685
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 1.0688731670379639,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.5247974395751953,
|
|
"memory(GiB)": 30.0,
|
|
"step": 250,
|
|
"token_acc": 0.8370563375806298,
|
|
"train_speed(iter/s)": 0.147062
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 1.195318341255188,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.5625959873199463,
|
|
"memory(GiB)": 30.0,
|
|
"step": 255,
|
|
"token_acc": 0.8187680020947892,
|
|
"train_speed(iter/s)": 0.147527
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 1.0666542053222656,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.5226840972900391,
|
|
"memory(GiB)": 30.0,
|
|
"step": 260,
|
|
"token_acc": 0.8682154605263158,
|
|
"train_speed(iter/s)": 0.147705
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.6767453551292419,
|
|
"eval_runtime": 4.4986,
|
|
"eval_samples_per_second": 22.229,
|
|
"eval_steps_per_second": 5.557,
|
|
"eval_token_acc": 0.8135098306404516,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 1.1024754047393799,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.5334303379058838,
|
|
"memory(GiB)": 30.0,
|
|
"step": 265,
|
|
"token_acc": 0.8305059560662721,
|
|
"train_speed(iter/s)": 0.146899
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 1.0549464225769043,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.5367157936096192,
|
|
"memory(GiB)": 30.0,
|
|
"step": 270,
|
|
"token_acc": 0.8195051514205433,
|
|
"train_speed(iter/s)": 0.14719
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 1.1271840333938599,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.5592126369476318,
|
|
"memory(GiB)": 30.0,
|
|
"step": 275,
|
|
"token_acc": 0.8336587028601531,
|
|
"train_speed(iter/s)": 0.147572
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 1.0035786628723145,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.5336441040039063,
|
|
"memory(GiB)": 30.0,
|
|
"step": 280,
|
|
"token_acc": 0.8376280205561832,
|
|
"train_speed(iter/s)": 0.147801
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.675382137298584,
|
|
"eval_runtime": 4.4852,
|
|
"eval_samples_per_second": 22.296,
|
|
"eval_steps_per_second": 5.574,
|
|
"eval_token_acc": 0.8133151644928946,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 1.0249357223510742,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.5767297267913818,
|
|
"memory(GiB)": 30.0,
|
|
"step": 285,
|
|
"token_acc": 0.8033832987162484,
|
|
"train_speed(iter/s)": 0.146928
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.9631988406181335,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.5411409854888916,
|
|
"memory(GiB)": 30.0,
|
|
"step": 290,
|
|
"token_acc": 0.8262143620505396,
|
|
"train_speed(iter/s)": 0.147203
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 1.028135895729065,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.5459909439086914,
|
|
"memory(GiB)": 30.0,
|
|
"step": 295,
|
|
"token_acc": 0.8455647944260032,
|
|
"train_speed(iter/s)": 0.147451
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 1.0845921039581299,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.5368780612945556,
|
|
"memory(GiB)": 30.0,
|
|
"step": 300,
|
|
"token_acc": 0.8320271783191137,
|
|
"train_speed(iter/s)": 0.147671
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.6716358661651611,
|
|
"eval_runtime": 4.5215,
|
|
"eval_samples_per_second": 22.116,
|
|
"eval_steps_per_second": 5.529,
|
|
"eval_token_acc": 0.8139547818348675,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.9843314290046692,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.5163120269775391,
|
|
"memory(GiB)": 30.0,
|
|
"step": 305,
|
|
"token_acc": 0.8328818151032849,
|
|
"train_speed(iter/s)": 0.146859
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.1701740026474,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.5329459190368653,
|
|
"memory(GiB)": 30.0,
|
|
"step": 310,
|
|
"token_acc": 0.8684262230663435,
|
|
"train_speed(iter/s)": 0.147224
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.9886131286621094,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.47854862213134763,
|
|
"memory(GiB)": 30.0,
|
|
"step": 315,
|
|
"token_acc": 0.8571065805702677,
|
|
"train_speed(iter/s)": 0.147399
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 1.1649657487869263,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.46472911834716796,
|
|
"memory(GiB)": 30.0,
|
|
"step": 320,
|
|
"token_acc": 0.8510946618102064,
|
|
"train_speed(iter/s)": 0.147612
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.683362603187561,
|
|
"eval_runtime": 4.5092,
|
|
"eval_samples_per_second": 22.177,
|
|
"eval_steps_per_second": 5.544,
|
|
"eval_token_acc": 0.812425262104063,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 1.0194660425186157,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.4674004077911377,
|
|
"memory(GiB)": 30.0,
|
|
"step": 325,
|
|
"token_acc": 0.8521572339577521,
|
|
"train_speed(iter/s)": 0.146897
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 1.0168867111206055,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.4946479320526123,
|
|
"memory(GiB)": 30.0,
|
|
"step": 330,
|
|
"token_acc": 0.846690244227946,
|
|
"train_speed(iter/s)": 0.147164
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 1.0326777696609497,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.48072013854980467,
|
|
"memory(GiB)": 30.0,
|
|
"step": 335,
|
|
"token_acc": 0.8457147012835897,
|
|
"train_speed(iter/s)": 0.147447
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.9925091862678528,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.4736426830291748,
|
|
"memory(GiB)": 30.0,
|
|
"step": 340,
|
|
"token_acc": 0.8668122952098611,
|
|
"train_speed(iter/s)": 0.147682
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.687374472618103,
|
|
"eval_runtime": 4.4755,
|
|
"eval_samples_per_second": 22.344,
|
|
"eval_steps_per_second": 5.586,
|
|
"eval_token_acc": 0.8125643093523179,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 1.008774995803833,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.5085729598999024,
|
|
"memory(GiB)": 30.0,
|
|
"step": 345,
|
|
"token_acc": 0.8404898047254289,
|
|
"train_speed(iter/s)": 0.147118
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.9545453786849976,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.49334096908569336,
|
|
"memory(GiB)": 30.0,
|
|
"step": 350,
|
|
"token_acc": 0.8572497735061252,
|
|
"train_speed(iter/s)": 0.147268
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 1.0316177606582642,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.48065829277038574,
|
|
"memory(GiB)": 30.0,
|
|
"step": 355,
|
|
"token_acc": 0.8579491647410887,
|
|
"train_speed(iter/s)": 0.147552
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.9947900772094727,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.4793752670288086,
|
|
"memory(GiB)": 30.0,
|
|
"step": 360,
|
|
"token_acc": 0.8398426718189346,
|
|
"train_speed(iter/s)": 0.147791
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.6864572167396545,
|
|
"eval_runtime": 4.5103,
|
|
"eval_samples_per_second": 22.171,
|
|
"eval_steps_per_second": 5.543,
|
|
"eval_token_acc": 0.8118968825606941,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.9976386427879333,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.46926078796386717,
|
|
"memory(GiB)": 30.0,
|
|
"step": 365,
|
|
"token_acc": 0.8339437138994715,
|
|
"train_speed(iter/s)": 0.147274
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.9950555562973022,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.46298680305480955,
|
|
"memory(GiB)": 30.0,
|
|
"step": 370,
|
|
"token_acc": 0.8648511440693332,
|
|
"train_speed(iter/s)": 0.14743
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.9489296078681946,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.4750084400177002,
|
|
"memory(GiB)": 30.0,
|
|
"step": 375,
|
|
"token_acc": 0.8613484960635219,
|
|
"train_speed(iter/s)": 0.147589
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 1.0904533863067627,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.49940977096557615,
|
|
"memory(GiB)": 30.0,
|
|
"step": 380,
|
|
"token_acc": 0.8456219466366027,
|
|
"train_speed(iter/s)": 0.147838
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.6869800090789795,
|
|
"eval_runtime": 4.4689,
|
|
"eval_samples_per_second": 22.377,
|
|
"eval_steps_per_second": 5.594,
|
|
"eval_token_acc": 0.812230595956506,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.9920545816421509,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.4398204803466797,
|
|
"memory(GiB)": 30.0,
|
|
"step": 385,
|
|
"token_acc": 0.8489904129398532,
|
|
"train_speed(iter/s)": 0.147202
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.9930100440979004,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.4805948257446289,
|
|
"memory(GiB)": 30.0,
|
|
"step": 390,
|
|
"token_acc": 0.8426801497549115,
|
|
"train_speed(iter/s)": 0.14741
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 1.0105656385421753,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.4643260478973389,
|
|
"memory(GiB)": 30.0,
|
|
"step": 395,
|
|
"token_acc": 0.8793709396854699,
|
|
"train_speed(iter/s)": 0.14759
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.949529230594635,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.4940080165863037,
|
|
"memory(GiB)": 30.0,
|
|
"step": 400,
|
|
"token_acc": 0.8489606206997511,
|
|
"train_speed(iter/s)": 0.147848
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.6872583627700806,
|
|
"eval_runtime": 4.4941,
|
|
"eval_samples_per_second": 22.251,
|
|
"eval_steps_per_second": 5.563,
|
|
"eval_token_acc": 0.812397452654412,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 1.0164008140563965,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.4730886936187744,
|
|
"memory(GiB)": 30.0,
|
|
"step": 405,
|
|
"token_acc": 0.8358856213579076,
|
|
"train_speed(iter/s)": 0.147331
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.9890522956848145,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.4806517124176025,
|
|
"memory(GiB)": 30.0,
|
|
"step": 410,
|
|
"token_acc": 0.8606986899563319,
|
|
"train_speed(iter/s)": 0.147503
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 1.0946931838989258,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.48343396186828613,
|
|
"memory(GiB)": 30.0,
|
|
"step": 415,
|
|
"token_acc": 0.8560399806064223,
|
|
"train_speed(iter/s)": 0.147741
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.9739342331886292,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.4568051338195801,
|
|
"memory(GiB)": 30.0,
|
|
"step": 420,
|
|
"token_acc": 0.8513886113886114,
|
|
"train_speed(iter/s)": 0.147934
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.6869549751281738,
|
|
"eval_runtime": 4.4891,
|
|
"eval_samples_per_second": 22.276,
|
|
"eval_steps_per_second": 5.569,
|
|
"eval_token_acc": 0.8125643093523179,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 1.036109447479248,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.4853102684020996,
|
|
"memory(GiB)": 30.0,
|
|
"step": 425,
|
|
"token_acc": 0.8378954181386694,
|
|
"train_speed(iter/s)": 0.147362
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 1.012856125831604,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.4725470542907715,
|
|
"memory(GiB)": 30.0,
|
|
"step": 430,
|
|
"token_acc": 0.8562256448320653,
|
|
"train_speed(iter/s)": 0.147546
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.9843412041664124,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.47121171951293944,
|
|
"memory(GiB)": 30.0,
|
|
"step": 435,
|
|
"token_acc": 0.8520417505951291,
|
|
"train_speed(iter/s)": 0.14775
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 1.0212868452072144,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.47542705535888674,
|
|
"memory(GiB)": 30.0,
|
|
"step": 440,
|
|
"token_acc": 0.8456783799474098,
|
|
"train_speed(iter/s)": 0.147955
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.6869864463806152,
|
|
"eval_runtime": 4.4679,
|
|
"eval_samples_per_second": 22.382,
|
|
"eval_steps_per_second": 5.595,
|
|
"eval_token_acc": 0.812230595956506,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.9807941317558289,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.46687631607055663,
|
|
"memory(GiB)": 30.0,
|
|
"step": 445,
|
|
"token_acc": 0.8307434410089937,
|
|
"train_speed(iter/s)": 0.147443
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.9818356037139893,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.4775029182434082,
|
|
"memory(GiB)": 30.0,
|
|
"step": 450,
|
|
"token_acc": 0.8397012044747847,
|
|
"train_speed(iter/s)": 0.147671
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.9716631174087524,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.5018224716186523,
|
|
"memory(GiB)": 30.0,
|
|
"step": 455,
|
|
"token_acc": 0.8534569498346989,
|
|
"train_speed(iter/s)": 0.147853
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 1.012851357460022,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.48965134620666506,
|
|
"memory(GiB)": 30.0,
|
|
"step": 460,
|
|
"token_acc": 0.8447179410444411,
|
|
"train_speed(iter/s)": 0.148014
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.6869931817054749,
|
|
"eval_runtime": 4.4888,
|
|
"eval_samples_per_second": 22.278,
|
|
"eval_steps_per_second": 5.569,
|
|
"eval_token_acc": 0.8127033566005729,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.6868388652801514,
|
|
"eval_runtime": 4.4905,
|
|
"eval_samples_per_second": 22.269,
|
|
"eval_steps_per_second": 5.567,
|
|
"eval_token_acc": 0.8126199282516199,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.7722341067862835e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|