Files
qwen2.5vl-3b-sampled_10000_…/trainer_state.json
ModelHub XC 5a5318d7b5 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_10000_qwen2.5vl_sc
Source: Original Platform
2026-05-22 11:29:13 +08:00

1181 lines
33 KiB
JSON

{
"best_global_step": 300,
"best_metric": 0.67163587,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v21-20250507-064807/checkpoint-300",
"epoch": 2.9826262626262627,
"eval_steps": 20,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006464646464646465,
"grad_norm": 5.7837233543396,
"learning_rate": 9.999884400986087e-06,
"loss": 1.0828938484191895,
"memory(GiB)": 27.73,
"step": 1,
"token_acc": 0.7079992873686086,
"train_speed(iter/s)": 0.069407
},
{
"epoch": 0.03232323232323232,
"grad_norm": 2.584890842437744,
"learning_rate": 9.997110291906109e-06,
"loss": 0.8132728338241577,
"memory(GiB)": 27.77,
"step": 5,
"token_acc": 0.7766760462727232,
"train_speed(iter/s)": 0.127059
},
{
"epoch": 0.06464646464646465,
"grad_norm": 1.4381581544876099,
"learning_rate": 9.988444507789584e-06,
"loss": 0.7008798599243165,
"memory(GiB)": 27.77,
"step": 10,
"token_acc": 0.8064654179148698,
"train_speed(iter/s)": 0.144995
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.4875842332839966,
"learning_rate": 9.97401266428502e-06,
"loss": 0.6918133735656739,
"memory(GiB)": 27.77,
"step": 15,
"token_acc": 0.7917820548324563,
"train_speed(iter/s)": 0.148104
},
{
"epoch": 0.1292929292929293,
"grad_norm": 1.2409731149673462,
"learning_rate": 9.953831442918418e-06,
"loss": 0.6582849025726318,
"memory(GiB)": 27.77,
"step": 20,
"token_acc": 0.8092561024264626,
"train_speed(iter/s)": 0.1504
},
{
"epoch": 0.1292929292929293,
"eval_loss": 0.722944438457489,
"eval_runtime": 4.5169,
"eval_samples_per_second": 22.139,
"eval_steps_per_second": 5.535,
"eval_token_acc": 0.8038043327122556,
"step": 20
},
{
"epoch": 0.16161616161616163,
"grad_norm": 1.244113802909851,
"learning_rate": 9.927924170825266e-06,
"loss": 0.6381969451904297,
"memory(GiB)": 27.77,
"step": 25,
"token_acc": 0.8135619641465316,
"train_speed(iter/s)": 0.138906
},
{
"epoch": 0.19393939393939394,
"grad_norm": 1.034762978553772,
"learning_rate": 9.896320793787106e-06,
"loss": 0.6519847869873047,
"memory(GiB)": 27.77,
"step": 30,
"token_acc": 0.8079350766456267,
"train_speed(iter/s)": 0.144048
},
{
"epoch": 0.22626262626262628,
"grad_norm": 1.1062158346176147,
"learning_rate": 9.859057841617709e-06,
"loss": 0.6522578716278076,
"memory(GiB)": 27.77,
"step": 35,
"token_acc": 0.7871428029296801,
"train_speed(iter/s)": 0.146527
},
{
"epoch": 0.2585858585858586,
"grad_norm": 1.048275113105774,
"learning_rate": 9.816178385938867e-06,
"loss": 0.6380832672119141,
"memory(GiB)": 30.0,
"step": 40,
"token_acc": 0.8310100032268474,
"train_speed(iter/s)": 0.147965
},
{
"epoch": 0.2585858585858586,
"eval_loss": 0.7094467878341675,
"eval_runtime": 4.5026,
"eval_samples_per_second": 22.209,
"eval_steps_per_second": 5.552,
"eval_token_acc": 0.8064740398787508,
"step": 40
},
{
"epoch": 0.2909090909090909,
"grad_norm": 1.228413462638855,
"learning_rate": 9.767731990394638e-06,
"loss": 0.6573184967041016,
"memory(GiB)": 30.0,
"step": 45,
"token_acc": 0.793196216263126,
"train_speed(iter/s)": 0.142174
},
{
"epoch": 0.32323232323232326,
"grad_norm": 1.2188494205474854,
"learning_rate": 9.71377465336155e-06,
"loss": 0.6832234382629394,
"memory(GiB)": 30.0,
"step": 50,
"token_acc": 0.7763722873389811,
"train_speed(iter/s)": 0.14464
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.0591987371444702,
"learning_rate": 9.654368743221022e-06,
"loss": 0.653898048400879,
"memory(GiB)": 30.0,
"step": 55,
"token_acc": 0.8127191799298624,
"train_speed(iter/s)": 0.146174
},
{
"epoch": 0.3878787878787879,
"grad_norm": 1.0762509107589722,
"learning_rate": 9.589582926268798e-06,
"loss": 0.612049913406372,
"memory(GiB)": 30.0,
"step": 60,
"token_acc": 0.8167217591261857,
"train_speed(iter/s)": 0.147559
},
{
"epoch": 0.3878787878787879,
"eval_loss": 0.6970872282981873,
"eval_runtime": 4.4596,
"eval_samples_per_second": 22.423,
"eval_steps_per_second": 5.606,
"eval_token_acc": 0.8075586084151395,
"step": 60
},
{
"epoch": 0.4202020202020202,
"grad_norm": 1.0156168937683105,
"learning_rate": 9.519492087344724e-06,
"loss": 0.6183786392211914,
"memory(GiB)": 30.0,
"step": 65,
"token_acc": 0.82515202980332,
"train_speed(iter/s)": 0.142388
},
{
"epoch": 0.45252525252525255,
"grad_norm": 1.1835277080535889,
"learning_rate": 9.444177243274619e-06,
"loss": 0.6588045597076416,
"memory(GiB)": 30.0,
"step": 70,
"token_acc": 0.8151584404952757,
"train_speed(iter/s)": 0.144193
},
{
"epoch": 0.48484848484848486,
"grad_norm": 1.1693778038024902,
"learning_rate": 9.363725449224281e-06,
"loss": 0.6500480651855469,
"memory(GiB)": 30.0,
"step": 75,
"token_acc": 0.8076034754555119,
"train_speed(iter/s)": 0.145372
},
{
"epoch": 0.5171717171717172,
"grad_norm": 1.0537551641464233,
"learning_rate": 9.278229698073889e-06,
"loss": 0.6718258380889892,
"memory(GiB)": 30.0,
"step": 80,
"token_acc": 0.7944381259859853,
"train_speed(iter/s)": 0.14633
},
{
"epoch": 0.5171717171717172,
"eval_loss": 0.6917301416397095,
"eval_runtime": 4.4847,
"eval_samples_per_second": 22.298,
"eval_steps_per_second": 5.575,
"eval_token_acc": 0.8087266053004811,
"step": 80
},
{
"epoch": 0.5494949494949495,
"grad_norm": 1.139643669128418,
"learning_rate": 9.187788812929074e-06,
"loss": 0.6676198005676269,
"memory(GiB)": 30.0,
"step": 85,
"token_acc": 0.8008274744669155,
"train_speed(iter/s)": 0.143323
},
{
"epoch": 0.5818181818181818,
"grad_norm": 1.1069749593734741,
"learning_rate": 9.092507332892968e-06,
"loss": 0.6722775459289551,
"memory(GiB)": 30.0,
"step": 90,
"token_acc": 0.8005652779928759,
"train_speed(iter/s)": 0.144592
},
{
"epoch": 0.6141414141414141,
"grad_norm": 1.1481822729110718,
"learning_rate": 8.992495392231195e-06,
"loss": 0.6242180824279785,
"memory(GiB)": 30.0,
"step": 95,
"token_acc": 0.8088621855050695,
"train_speed(iter/s)": 0.145728
},
{
"epoch": 0.6464646464646465,
"grad_norm": 1.0265405178070068,
"learning_rate": 8.88786859306952e-06,
"loss": 0.6294228076934815,
"memory(GiB)": 30.0,
"step": 100,
"token_acc": 0.797233893557423,
"train_speed(iter/s)": 0.146555
},
{
"epoch": 0.6464646464646465,
"eval_loss": 0.6827206611633301,
"eval_runtime": 4.4877,
"eval_samples_per_second": 22.283,
"eval_steps_per_second": 5.571,
"eval_token_acc": 0.8118412636613921,
"step": 100
},
{
"epoch": 0.6787878787878788,
"grad_norm": 1.1638389825820923,
"learning_rate": 8.778747871771293e-06,
"loss": 0.6758254051208497,
"memory(GiB)": 30.0,
"step": 105,
"token_acc": 0.8088623640012675,
"train_speed(iter/s)": 0.144045
},
{
"epoch": 0.7111111111111111,
"grad_norm": 1.2460920810699463,
"learning_rate": 8.665259359149132e-06,
"loss": 0.6744856834411621,
"memory(GiB)": 30.0,
"step": 110,
"token_acc": 0.8033372194695424,
"train_speed(iter/s)": 0.145146
},
{
"epoch": 0.7434343434343434,
"grad_norm": 1.1937848329544067,
"learning_rate": 8.547534234672435e-06,
"loss": 0.6776030540466309,
"memory(GiB)": 30.0,
"step": 115,
"token_acc": 0.7960356428441535,
"train_speed(iter/s)": 0.145963
},
{
"epoch": 0.7757575757575758,
"grad_norm": 1.2813752889633179,
"learning_rate": 8.425708574839221e-06,
"loss": 0.6523926734924317,
"memory(GiB)": 30.0,
"step": 120,
"token_acc": 0.8113255093959248,
"train_speed(iter/s)": 0.146806
},
{
"epoch": 0.7757575757575758,
"eval_loss": 0.6801063418388367,
"eval_runtime": 4.5179,
"eval_samples_per_second": 22.134,
"eval_steps_per_second": 5.533,
"eval_token_acc": 0.8103673628298896,
"step": 120
},
{
"epoch": 0.8080808080808081,
"grad_norm": 1.2211530208587646,
"learning_rate": 8.299923195887599e-06,
"loss": 0.6863309383392334,
"memory(GiB)": 30.0,
"step": 125,
"token_acc": 0.7986384909941853,
"train_speed(iter/s)": 0.144918
},
{
"epoch": 0.8404040404040404,
"grad_norm": 1.0644088983535767,
"learning_rate": 8.170323491028625e-06,
"loss": 0.6193663597106933,
"memory(GiB)": 30.0,
"step": 130,
"token_acc": 0.8070464504820333,
"train_speed(iter/s)": 0.145439
},
{
"epoch": 0.8727272727272727,
"grad_norm": 1.0717216730117798,
"learning_rate": 8.03705926238874e-06,
"loss": 0.6962187767028809,
"memory(GiB)": 30.0,
"step": 135,
"token_acc": 0.7882992561955905,
"train_speed(iter/s)": 0.146161
},
{
"epoch": 0.9050505050505051,
"grad_norm": 1.0318809747695923,
"learning_rate": 7.900284547855992e-06,
"loss": 0.6141955375671386,
"memory(GiB)": 30.0,
"step": 140,
"token_acc": 0.8250999478532939,
"train_speed(iter/s)": 0.146718
},
{
"epoch": 0.9050505050505051,
"eval_loss": 0.6769556403160095,
"eval_runtime": 4.5137,
"eval_samples_per_second": 22.155,
"eval_steps_per_second": 5.539,
"eval_token_acc": 0.812314024305459,
"step": 140
},
{
"epoch": 0.9373737373737374,
"grad_norm": 1.1931071281433105,
"learning_rate": 7.760157443030234e-06,
"loss": 0.6433291435241699,
"memory(GiB)": 30.0,
"step": 145,
"token_acc": 0.8096512634810674,
"train_speed(iter/s)": 0.145106
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.9276081323623657,
"learning_rate": 7.616839918483061e-06,
"loss": 0.6246243000030518,
"memory(GiB)": 30.0,
"step": 150,
"token_acc": 0.8373285914577848,
"train_speed(iter/s)": 0.145752
},
{
"epoch": 1.0,
"grad_norm": 1.0196563005447388,
"learning_rate": 7.470497632538743e-06,
"loss": 0.6316683769226075,
"memory(GiB)": 30.0,
"step": 155,
"token_acc": 0.8189631162217006,
"train_speed(iter/s)": 0.14654
},
{
"epoch": 1.0323232323232323,
"grad_norm": 1.0861464738845825,
"learning_rate": 7.321299739792553e-06,
"loss": 0.574582290649414,
"memory(GiB)": 30.0,
"step": 160,
"token_acc": 0.822840260798696,
"train_speed(iter/s)": 0.147158
},
{
"epoch": 1.0323232323232323,
"eval_loss": 0.6779691576957703,
"eval_runtime": 4.5159,
"eval_samples_per_second": 22.144,
"eval_steps_per_second": 5.536,
"eval_token_acc": 0.8116465975138352,
"step": 160
},
{
"epoch": 1.0646464646464646,
"grad_norm": 1.1801550388336182,
"learning_rate": 7.169418695587791e-06,
"loss": 0.5374558448791504,
"memory(GiB)": 30.0,
"step": 165,
"token_acc": 0.8396995365190986,
"train_speed(iter/s)": 0.145626
},
{
"epoch": 1.096969696969697,
"grad_norm": 1.0841394662857056,
"learning_rate": 7.015030056677559e-06,
"loss": 0.5700150489807129,
"memory(GiB)": 30.0,
"step": 170,
"token_acc": 0.8313138512710858,
"train_speed(iter/s)": 0.146379
},
{
"epoch": 1.1292929292929292,
"grad_norm": 1.0907129049301147,
"learning_rate": 6.858312278301638e-06,
"loss": 0.530540657043457,
"memory(GiB)": 30.0,
"step": 175,
"token_acc": 0.8248816768086545,
"train_speed(iter/s)": 0.14678
},
{
"epoch": 1.1616161616161615,
"grad_norm": 1.0117602348327637,
"learning_rate": 6.699446507913083e-06,
"loss": 0.5229566097259521,
"memory(GiB)": 30.0,
"step": 180,
"token_acc": 0.8368724855693546,
"train_speed(iter/s)": 0.147192
},
{
"epoch": 1.1616161616161615,
"eval_loss": 0.6804619431495667,
"eval_runtime": 4.4977,
"eval_samples_per_second": 22.233,
"eval_steps_per_second": 5.558,
"eval_token_acc": 0.8114797408159292,
"step": 180
},
{
"epoch": 1.1939393939393939,
"grad_norm": 1.050369381904602,
"learning_rate": 6.53861637579291e-06,
"loss": 0.548884916305542,
"memory(GiB)": 30.0,
"step": 185,
"token_acc": 0.8174426020408163,
"train_speed(iter/s)": 0.1459
},
{
"epoch": 1.2262626262626264,
"grad_norm": 1.1832817792892456,
"learning_rate": 6.376007782794926e-06,
"loss": 0.5427236557006836,
"memory(GiB)": 30.0,
"step": 190,
"token_acc": 0.8312639081497726,
"train_speed(iter/s)": 0.146405
},
{
"epoch": 1.2585858585858585,
"grad_norm": 1.0814838409423828,
"learning_rate": 6.211808685466063e-06,
"loss": 0.5683661937713623,
"memory(GiB)": 30.0,
"step": 195,
"token_acc": 0.8363592434074278,
"train_speed(iter/s)": 0.147077
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.9832177758216858,
"learning_rate": 6.046208878790543e-06,
"loss": 0.5114535808563232,
"memory(GiB)": 30.0,
"step": 200,
"token_acc": 0.8398058252427184,
"train_speed(iter/s)": 0.14749
},
{
"epoch": 1.290909090909091,
"eval_loss": 0.6795041561126709,
"eval_runtime": 4.5398,
"eval_samples_per_second": 22.027,
"eval_steps_per_second": 5.507,
"eval_token_acc": 0.812147167607553,
"step": 200
},
{
"epoch": 1.3232323232323233,
"grad_norm": 1.0313045978546143,
"learning_rate": 5.879399776809047e-06,
"loss": 0.5399807453155517,
"memory(GiB)": 30.0,
"step": 205,
"token_acc": 0.8344659940404622,
"train_speed(iter/s)": 0.14634
},
{
"epoch": 1.3555555555555556,
"grad_norm": 1.0118134021759033,
"learning_rate": 5.711574191366427e-06,
"loss": 0.5527422904968262,
"memory(GiB)": 30.0,
"step": 210,
"token_acc": 0.8313429020123443,
"train_speed(iter/s)": 0.146686
},
{
"epoch": 1.387878787878788,
"grad_norm": 1.0724433660507202,
"learning_rate": 5.542926109243727e-06,
"loss": 0.5539234161376954,
"memory(GiB)": 30.0,
"step": 215,
"token_acc": 0.8117005197773436,
"train_speed(iter/s)": 0.147158
},
{
"epoch": 1.4202020202020202,
"grad_norm": 1.086374282836914,
"learning_rate": 5.373650467932122e-06,
"loss": 0.5297107219696044,
"memory(GiB)": 30.0,
"step": 220,
"token_acc": 0.8333011097792042,
"train_speed(iter/s)": 0.147561
},
{
"epoch": 1.4202020202020202,
"eval_loss": 0.6780869960784912,
"eval_runtime": 4.4783,
"eval_samples_per_second": 22.33,
"eval_steps_per_second": 5.582,
"eval_token_acc": 0.8137323062376596,
"step": 220
},
{
"epoch": 1.4525252525252526,
"grad_norm": 1.1475560665130615,
"learning_rate": 5.2039429303079294e-06,
"loss": 0.5562318801879883,
"memory(GiB)": 30.0,
"step": 225,
"token_acc": 0.820589226025445,
"train_speed(iter/s)": 0.146647
},
{
"epoch": 1.4848484848484849,
"grad_norm": 1.0739235877990723,
"learning_rate": 5.033999658469174e-06,
"loss": 0.5656192779541016,
"memory(GiB)": 30.0,
"step": 230,
"token_acc": 0.8434569629111267,
"train_speed(iter/s)": 0.147011
},
{
"epoch": 1.5171717171717172,
"grad_norm": 1.153382420539856,
"learning_rate": 4.864017086995112e-06,
"loss": 0.5392692565917969,
"memory(GiB)": 30.0,
"step": 235,
"token_acc": 0.8344013490725126,
"train_speed(iter/s)": 0.147492
},
{
"epoch": 1.5494949494949495,
"grad_norm": 1.0742793083190918,
"learning_rate": 4.694191695890788e-06,
"loss": 0.5323070049285888,
"memory(GiB)": 30.0,
"step": 240,
"token_acc": 0.8339429680501642,
"train_speed(iter/s)": 0.147851
},
{
"epoch": 1.5494949494949495,
"eval_loss": 0.6787331104278564,
"eval_runtime": 4.4769,
"eval_samples_per_second": 22.337,
"eval_steps_per_second": 5.584,
"eval_token_acc": 0.8122027865068551,
"step": 240
},
{
"epoch": 1.5818181818181818,
"grad_norm": 1.012741208076477,
"learning_rate": 4.524719783479088e-06,
"loss": 0.5413528919219971,
"memory(GiB)": 30.0,
"step": 245,
"token_acc": 0.8438339287914254,
"train_speed(iter/s)": 0.14685
},
{
"epoch": 1.614141414141414,
"grad_norm": 1.0688731670379639,
"learning_rate": 4.355797239502807e-06,
"loss": 0.5247974395751953,
"memory(GiB)": 30.0,
"step": 250,
"token_acc": 0.8370563375806298,
"train_speed(iter/s)": 0.147062
},
{
"epoch": 1.6464646464646466,
"grad_norm": 1.195318341255188,
"learning_rate": 4.187619318698971e-06,
"loss": 0.5625959873199463,
"memory(GiB)": 30.0,
"step": 255,
"token_acc": 0.8187680020947892,
"train_speed(iter/s)": 0.147527
},
{
"epoch": 1.6787878787878787,
"grad_norm": 1.0666542053222656,
"learning_rate": 4.020380415107167e-06,
"loss": 0.5226840972900391,
"memory(GiB)": 30.0,
"step": 260,
"token_acc": 0.8682154605263158,
"train_speed(iter/s)": 0.147705
},
{
"epoch": 1.6787878787878787,
"eval_loss": 0.6767453551292419,
"eval_runtime": 4.4986,
"eval_samples_per_second": 22.229,
"eval_steps_per_second": 5.557,
"eval_token_acc": 0.8135098306404516,
"step": 260
},
{
"epoch": 1.7111111111111112,
"grad_norm": 1.1024754047393799,
"learning_rate": 3.854273837372724e-06,
"loss": 0.5334303379058838,
"memory(GiB)": 30.0,
"step": 265,
"token_acc": 0.8305059560662721,
"train_speed(iter/s)": 0.146899
},
{
"epoch": 1.7434343434343433,
"grad_norm": 1.0549464225769043,
"learning_rate": 3.689491585304491e-06,
"loss": 0.5367157936096192,
"memory(GiB)": 30.0,
"step": 270,
"token_acc": 0.8195051514205433,
"train_speed(iter/s)": 0.14719
},
{
"epoch": 1.7757575757575759,
"grad_norm": 1.1271840333938599,
"learning_rate": 3.526224127945479e-06,
"loss": 0.5592126369476318,
"memory(GiB)": 30.0,
"step": 275,
"token_acc": 0.8336587028601531,
"train_speed(iter/s)": 0.147572
},
{
"epoch": 1.808080808080808,
"grad_norm": 1.0035786628723145,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.5336441040039063,
"memory(GiB)": 30.0,
"step": 280,
"token_acc": 0.8376280205561832,
"train_speed(iter/s)": 0.147801
},
{
"epoch": 1.808080808080808,
"eval_loss": 0.675382137298584,
"eval_runtime": 4.4852,
"eval_samples_per_second": 22.296,
"eval_steps_per_second": 5.574,
"eval_token_acc": 0.8133151644928946,
"step": 280
},
{
"epoch": 1.8404040404040405,
"grad_norm": 1.0249357223510742,
"learning_rate": 3.204986500762006e-06,
"loss": 0.5767297267913818,
"memory(GiB)": 30.0,
"step": 285,
"token_acc": 0.8033832987162484,
"train_speed(iter/s)": 0.146928
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.9631988406181335,
"learning_rate": 3.0473876441260786e-06,
"loss": 0.5411409854888916,
"memory(GiB)": 30.0,
"step": 290,
"token_acc": 0.8262143620505396,
"train_speed(iter/s)": 0.147203
},
{
"epoch": 1.905050505050505,
"grad_norm": 1.028135895729065,
"learning_rate": 2.8920457793817507e-06,
"loss": 0.5459909439086914,
"memory(GiB)": 30.0,
"step": 295,
"token_acc": 0.8455647944260032,
"train_speed(iter/s)": 0.147451
},
{
"epoch": 1.9373737373737374,
"grad_norm": 1.0845921039581299,
"learning_rate": 2.7391404635865725e-06,
"loss": 0.5368780612945556,
"memory(GiB)": 30.0,
"step": 300,
"token_acc": 0.8320271783191137,
"train_speed(iter/s)": 0.147671
},
{
"epoch": 1.9373737373737374,
"eval_loss": 0.6716358661651611,
"eval_runtime": 4.5215,
"eval_samples_per_second": 22.116,
"eval_steps_per_second": 5.529,
"eval_token_acc": 0.8139547818348675,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.9843314290046692,
"learning_rate": 2.5888484374320033e-06,
"loss": 0.5163120269775391,
"memory(GiB)": 30.0,
"step": 305,
"token_acc": 0.8328818151032849,
"train_speed(iter/s)": 0.146859
},
{
"epoch": 2.0,
"grad_norm": 1.1701740026474,
"learning_rate": 2.4413434209518137e-06,
"loss": 0.5329459190368653,
"memory(GiB)": 30.0,
"step": 310,
"token_acc": 0.8684262230663435,
"train_speed(iter/s)": 0.147224
},
{
"epoch": 2.0323232323232325,
"grad_norm": 0.9886131286621094,
"learning_rate": 2.296795912722014e-06,
"loss": 0.47854862213134763,
"memory(GiB)": 30.0,
"step": 315,
"token_acc": 0.8571065805702677,
"train_speed(iter/s)": 0.147399
},
{
"epoch": 2.0646464646464646,
"grad_norm": 1.1649657487869263,
"learning_rate": 2.1553729927843894e-06,
"loss": 0.46472911834716796,
"memory(GiB)": 30.0,
"step": 320,
"token_acc": 0.8510946618102064,
"train_speed(iter/s)": 0.147612
},
{
"epoch": 2.0646464646464646,
"eval_loss": 0.683362603187561,
"eval_runtime": 4.5092,
"eval_samples_per_second": 22.177,
"eval_steps_per_second": 5.544,
"eval_token_acc": 0.812425262104063,
"step": 320
},
{
"epoch": 2.096969696969697,
"grad_norm": 1.0194660425186157,
"learning_rate": 2.017238129521506e-06,
"loss": 0.4674004077911377,
"memory(GiB)": 30.0,
"step": 325,
"token_acc": 0.8521572339577521,
"train_speed(iter/s)": 0.146897
},
{
"epoch": 2.1292929292929292,
"grad_norm": 1.0168867111206055,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.4946479320526123,
"memory(GiB)": 30.0,
"step": 330,
"token_acc": 0.846690244227946,
"train_speed(iter/s)": 0.147164
},
{
"epoch": 2.1616161616161618,
"grad_norm": 1.0326777696609497,
"learning_rate": 1.7514672589449378e-06,
"loss": 0.48072013854980467,
"memory(GiB)": 30.0,
"step": 335,
"token_acc": 0.8457147012835897,
"train_speed(iter/s)": 0.147447
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.9925091862678528,
"learning_rate": 1.6241384517255854e-06,
"loss": 0.4736426830291748,
"memory(GiB)": 30.0,
"step": 340,
"token_acc": 0.8668122952098611,
"train_speed(iter/s)": 0.147682
},
{
"epoch": 2.193939393939394,
"eval_loss": 0.687374472618103,
"eval_runtime": 4.4755,
"eval_samples_per_second": 22.344,
"eval_steps_per_second": 5.586,
"eval_token_acc": 0.8125643093523179,
"step": 340
},
{
"epoch": 2.2262626262626264,
"grad_norm": 1.008774995803833,
"learning_rate": 1.500711746282192e-06,
"loss": 0.5085729598999024,
"memory(GiB)": 30.0,
"step": 345,
"token_acc": 0.8404898047254289,
"train_speed(iter/s)": 0.147118
},
{
"epoch": 2.2585858585858585,
"grad_norm": 0.9545453786849976,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.49334096908569336,
"memory(GiB)": 30.0,
"step": 350,
"token_acc": 0.8572497735061252,
"train_speed(iter/s)": 0.147268
},
{
"epoch": 2.290909090909091,
"grad_norm": 1.0316177606582642,
"learning_rate": 1.2661306328825818e-06,
"loss": 0.48065829277038574,
"memory(GiB)": 30.0,
"step": 355,
"token_acc": 0.8579491647410887,
"train_speed(iter/s)": 0.147552
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.9947900772094727,
"learning_rate": 1.1552473733031893e-06,
"loss": 0.4793752670288086,
"memory(GiB)": 30.0,
"step": 360,
"token_acc": 0.8398426718189346,
"train_speed(iter/s)": 0.147791
},
{
"epoch": 2.323232323232323,
"eval_loss": 0.6864572167396545,
"eval_runtime": 4.5103,
"eval_samples_per_second": 22.171,
"eval_steps_per_second": 5.543,
"eval_token_acc": 0.8118968825606941,
"step": 360
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.9976386427879333,
"learning_rate": 1.0488081988375493e-06,
"loss": 0.46926078796386717,
"memory(GiB)": 30.0,
"step": 365,
"token_acc": 0.8339437138994715,
"train_speed(iter/s)": 0.147274
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.9950555562973022,
"learning_rate": 9.469361407432431e-07,
"loss": 0.46298680305480955,
"memory(GiB)": 30.0,
"step": 370,
"token_acc": 0.8648511440693332,
"train_speed(iter/s)": 0.14743
},
{
"epoch": 2.4202020202020202,
"grad_norm": 0.9489296078681946,
"learning_rate": 8.497489512245971e-07,
"loss": 0.4750084400177002,
"memory(GiB)": 30.0,
"step": 375,
"token_acc": 0.8613484960635219,
"train_speed(iter/s)": 0.147589
},
{
"epoch": 2.4525252525252528,
"grad_norm": 1.0904533863067627,
"learning_rate": 7.573589673248833e-07,
"loss": 0.49940977096557615,
"memory(GiB)": 30.0,
"step": 380,
"token_acc": 0.8456219466366027,
"train_speed(iter/s)": 0.147838
},
{
"epoch": 2.4525252525252528,
"eval_loss": 0.6869800090789795,
"eval_runtime": 4.4689,
"eval_samples_per_second": 22.377,
"eval_steps_per_second": 5.594,
"eval_token_acc": 0.812230595956506,
"step": 380
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.9920545816421509,
"learning_rate": 6.698729810778065e-07,
"loss": 0.4398204803466797,
"memory(GiB)": 30.0,
"step": 385,
"token_acc": 0.8489904129398532,
"train_speed(iter/s)": 0.147202
},
{
"epoch": 2.517171717171717,
"grad_norm": 0.9930100440979004,
"learning_rate": 5.873921160683943e-07,
"loss": 0.4805948257446289,
"memory(GiB)": 30.0,
"step": 390,
"token_acc": 0.8426801497549115,
"train_speed(iter/s)": 0.14741
},
{
"epoch": 2.5494949494949495,
"grad_norm": 1.0105656385421753,
"learning_rate": 5.100117105459279e-07,
"loss": 0.4643260478973389,
"memory(GiB)": 30.0,
"step": 395,
"token_acc": 0.8793709396854699,
"train_speed(iter/s)": 0.14759
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.949529230594635,
"learning_rate": 4.3782120722406565e-07,
"loss": 0.4940080165863037,
"memory(GiB)": 30.0,
"step": 400,
"token_acc": 0.8489606206997511,
"train_speed(iter/s)": 0.147848
},
{
"epoch": 2.581818181818182,
"eval_loss": 0.6872583627700806,
"eval_runtime": 4.4941,
"eval_samples_per_second": 22.251,
"eval_steps_per_second": 5.563,
"eval_token_acc": 0.812397452654412,
"step": 400
},
{
"epoch": 2.614141414141414,
"grad_norm": 1.0164008140563965,
"learning_rate": 3.709040498955102e-07,
"loss": 0.4730886936187744,
"memory(GiB)": 30.0,
"step": 405,
"token_acc": 0.8358856213579076,
"train_speed(iter/s)": 0.147331
},
{
"epoch": 2.6464646464646466,
"grad_norm": 0.9890522956848145,
"learning_rate": 3.0933758698072023e-07,
"loss": 0.4806517124176025,
"memory(GiB)": 30.0,
"step": 410,
"token_acc": 0.8606986899563319,
"train_speed(iter/s)": 0.147503
},
{
"epoch": 2.6787878787878787,
"grad_norm": 1.0946931838989258,
"learning_rate": 2.531929821221768e-07,
"loss": 0.48343396186828613,
"memory(GiB)": 30.0,
"step": 415,
"token_acc": 0.8560399806064223,
"train_speed(iter/s)": 0.147741
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.9739342331886292,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.4568051338195801,
"memory(GiB)": 30.0,
"step": 420,
"token_acc": 0.8513886113886114,
"train_speed(iter/s)": 0.147934
},
{
"epoch": 2.7111111111111112,
"eval_loss": 0.6869549751281738,
"eval_runtime": 4.4891,
"eval_samples_per_second": 22.276,
"eval_steps_per_second": 5.569,
"eval_token_acc": 0.8125643093523179,
"step": 420
},
{
"epoch": 2.7434343434343433,
"grad_norm": 1.036109447479248,
"learning_rate": 1.5742259095662126e-07,
"loss": 0.4853102684020996,
"memory(GiB)": 30.0,
"step": 425,
"token_acc": 0.8378954181386694,
"train_speed(iter/s)": 0.147362
},
{
"epoch": 2.775757575757576,
"grad_norm": 1.012856125831604,
"learning_rate": 1.1790750403941231e-07,
"loss": 0.4725470542907715,
"memory(GiB)": 30.0,
"step": 430,
"token_acc": 0.8562256448320653,
"train_speed(iter/s)": 0.147546
},
{
"epoch": 2.808080808080808,
"grad_norm": 0.9843412041664124,
"learning_rate": 8.403554600248498e-08,
"loss": 0.47121171951293944,
"memory(GiB)": 30.0,
"step": 435,
"token_acc": 0.8520417505951291,
"train_speed(iter/s)": 0.14775
},
{
"epoch": 2.8404040404040405,
"grad_norm": 1.0212868452072144,
"learning_rate": 5.584586887435739e-08,
"loss": 0.47542705535888674,
"memory(GiB)": 30.0,
"step": 440,
"token_acc": 0.8456783799474098,
"train_speed(iter/s)": 0.147955
},
{
"epoch": 2.8404040404040405,
"eval_loss": 0.6869864463806152,
"eval_runtime": 4.4679,
"eval_samples_per_second": 22.382,
"eval_steps_per_second": 5.595,
"eval_token_acc": 0.812230595956506,
"step": 440
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.9807941317558289,
"learning_rate": 3.337105663029361e-08,
"loss": 0.46687631607055663,
"memory(GiB)": 30.0,
"step": 445,
"token_acc": 0.8307434410089937,
"train_speed(iter/s)": 0.147443
},
{
"epoch": 2.905050505050505,
"grad_norm": 0.9818356037139893,
"learning_rate": 1.6637087529033925e-08,
"loss": 0.4775029182434082,
"memory(GiB)": 30.0,
"step": 450,
"token_acc": 0.8397012044747847,
"train_speed(iter/s)": 0.147671
},
{
"epoch": 2.937373737373737,
"grad_norm": 0.9716631174087524,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.5018224716186523,
"memory(GiB)": 30.0,
"step": 455,
"token_acc": 0.8534569498346989,
"train_speed(iter/s)": 0.147853
},
{
"epoch": 2.9696969696969697,
"grad_norm": 1.012851357460022,
"learning_rate": 4.623907104084335e-10,
"loss": 0.48965134620666506,
"memory(GiB)": 30.0,
"step": 460,
"token_acc": 0.8447179410444411,
"train_speed(iter/s)": 0.148014
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.6869931817054749,
"eval_runtime": 4.4888,
"eval_samples_per_second": 22.278,
"eval_steps_per_second": 5.569,
"eval_token_acc": 0.8127033566005729,
"step": 460
},
{
"epoch": 2.9826262626262627,
"eval_loss": 0.6868388652801514,
"eval_runtime": 4.4905,
"eval_samples_per_second": 22.269,
"eval_steps_per_second": 5.567,
"eval_token_acc": 0.8126199282516199,
"step": 462
}
],
"logging_steps": 5,
"max_steps": 462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.7722341067862835e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}