Files
qwen2.5vl-3b-sampled_10000_…/trainer_state.json
ModelHub XC 2810fbac6f 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-sampled_10000_reflection-cot-7b
Source: Original Platform
2026-05-21 11:20:42 +08:00

1181 lines
33 KiB
JSON

{
"best_global_step": 260,
"best_metric": 0.33951408,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v13-20250507-015956/checkpoint-260",
"epoch": 2.9826262626262627,
"eval_steps": 20,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006464646464646465,
"grad_norm": 2.9768006801605225,
"learning_rate": 9.999884400986087e-06,
"loss": 0.4710537791252136,
"memory(GiB)": 27.77,
"step": 1,
"token_acc": 0.8540840602696272,
"train_speed(iter/s)": 0.065349
},
{
"epoch": 0.03232323232323232,
"grad_norm": 1.8765902519226074,
"learning_rate": 9.997110291906109e-06,
"loss": 0.3920785188674927,
"memory(GiB)": 27.77,
"step": 5,
"token_acc": 0.8758898589657488,
"train_speed(iter/s)": 0.119626
},
{
"epoch": 0.06464646464646465,
"grad_norm": 1.293035626411438,
"learning_rate": 9.988444507789584e-06,
"loss": 0.3347900867462158,
"memory(GiB)": 27.77,
"step": 10,
"token_acc": 0.886203631273416,
"train_speed(iter/s)": 0.137285
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.2047252655029297,
"learning_rate": 9.97401266428502e-06,
"loss": 0.2941281318664551,
"memory(GiB)": 27.77,
"step": 15,
"token_acc": 0.9130237482982907,
"train_speed(iter/s)": 0.139757
},
{
"epoch": 0.1292929292929293,
"grad_norm": 1.105458378791809,
"learning_rate": 9.953831442918418e-06,
"loss": 0.30655100345611574,
"memory(GiB)": 27.77,
"step": 20,
"token_acc": 0.8922364378410602,
"train_speed(iter/s)": 0.143912
},
{
"epoch": 0.1292929292929293,
"eval_loss": 0.4002552032470703,
"eval_runtime": 5.4131,
"eval_samples_per_second": 18.474,
"eval_steps_per_second": 4.618,
"eval_token_acc": 0.8938616739334873,
"step": 20
},
{
"epoch": 0.16161616161616163,
"grad_norm": 1.041798710823059,
"learning_rate": 9.927924170825266e-06,
"loss": 0.29069504737854,
"memory(GiB)": 29.53,
"step": 25,
"token_acc": 0.8984301348526458,
"train_speed(iter/s)": 0.13295
},
{
"epoch": 0.19393939393939394,
"grad_norm": 1.0589144229888916,
"learning_rate": 9.896320793787106e-06,
"loss": 0.33169257640838623,
"memory(GiB)": 29.53,
"step": 30,
"token_acc": 0.8989100590731341,
"train_speed(iter/s)": 0.138041
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.905042290687561,
"learning_rate": 9.859057841617709e-06,
"loss": 0.29272122383117677,
"memory(GiB)": 29.53,
"step": 35,
"token_acc": 0.9075644898907739,
"train_speed(iter/s)": 0.140237
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.8976985812187195,
"learning_rate": 9.816178385938867e-06,
"loss": 0.29633958339691163,
"memory(GiB)": 29.53,
"step": 40,
"token_acc": 0.9119097956307258,
"train_speed(iter/s)": 0.142425
},
{
"epoch": 0.2585858585858586,
"eval_loss": 0.375478595495224,
"eval_runtime": 5.311,
"eval_samples_per_second": 18.829,
"eval_steps_per_second": 4.707,
"eval_token_acc": 0.8989911943233307,
"step": 40
},
{
"epoch": 0.2909090909090909,
"grad_norm": 1.0213873386383057,
"learning_rate": 9.767731990394638e-06,
"loss": 0.2878244400024414,
"memory(GiB)": 29.53,
"step": 45,
"token_acc": 0.8998546863647372,
"train_speed(iter/s)": 0.137223
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.9807785749435425,
"learning_rate": 9.71377465336155e-06,
"loss": 0.30373663902282716,
"memory(GiB)": 29.53,
"step": 50,
"token_acc": 0.9004981024667932,
"train_speed(iter/s)": 0.139351
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.87921142578125,
"learning_rate": 9.654368743221022e-06,
"loss": 0.2892775535583496,
"memory(GiB)": 29.53,
"step": 55,
"token_acc": 0.8972517176764522,
"train_speed(iter/s)": 0.140773
},
{
"epoch": 0.3878787878787879,
"grad_norm": 1.0269665718078613,
"learning_rate": 9.589582926268798e-06,
"loss": 0.2767331600189209,
"memory(GiB)": 29.53,
"step": 60,
"token_acc": 0.917551472624873,
"train_speed(iter/s)": 0.142633
},
{
"epoch": 0.3878787878787879,
"eval_loss": 0.36320608854293823,
"eval_runtime": 5.3096,
"eval_samples_per_second": 18.834,
"eval_steps_per_second": 4.708,
"eval_token_acc": 0.9018979225442421,
"step": 60
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.93445885181427,
"learning_rate": 9.519492087344724e-06,
"loss": 0.26028871536254883,
"memory(GiB)": 29.53,
"step": 65,
"token_acc": 0.9259884974838246,
"train_speed(iter/s)": 0.137874
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.9748146533966064,
"learning_rate": 9.444177243274619e-06,
"loss": 0.29547710418701173,
"memory(GiB)": 29.53,
"step": 70,
"token_acc": 0.8966111739669199,
"train_speed(iter/s)": 0.140001
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.937225341796875,
"learning_rate": 9.363725449224281e-06,
"loss": 0.2684732675552368,
"memory(GiB)": 29.53,
"step": 75,
"token_acc": 0.9166046511627907,
"train_speed(iter/s)": 0.141231
},
{
"epoch": 0.5171717171717172,
"grad_norm": 1.026237964630127,
"learning_rate": 9.278229698073889e-06,
"loss": 0.2696544647216797,
"memory(GiB)": 29.53,
"step": 80,
"token_acc": 0.9159072741806554,
"train_speed(iter/s)": 0.142009
},
{
"epoch": 0.5171717171717172,
"eval_loss": 0.35313570499420166,
"eval_runtime": 5.3322,
"eval_samples_per_second": 18.754,
"eval_steps_per_second": 4.689,
"eval_token_acc": 0.9009575104727708,
"step": 80
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.9791724681854248,
"learning_rate": 9.187788812929074e-06,
"loss": 0.2900824546813965,
"memory(GiB)": 29.53,
"step": 85,
"token_acc": 0.9096507542880761,
"train_speed(iter/s)": 0.139042
},
{
"epoch": 0.5818181818181818,
"grad_norm": 1.288805603981018,
"learning_rate": 9.092507332892968e-06,
"loss": 0.2910241365432739,
"memory(GiB)": 31.36,
"step": 90,
"token_acc": 0.9108959132712102,
"train_speed(iter/s)": 0.14038
},
{
"epoch": 0.6141414141414141,
"grad_norm": 1.055824875831604,
"learning_rate": 8.992495392231195e-06,
"loss": 0.2932537794113159,
"memory(GiB)": 31.36,
"step": 95,
"token_acc": 0.904834793623984,
"train_speed(iter/s)": 0.141584
},
{
"epoch": 0.6464646464646465,
"grad_norm": 1.0129783153533936,
"learning_rate": 8.88786859306952e-06,
"loss": 0.26900548934936525,
"memory(GiB)": 31.36,
"step": 100,
"token_acc": 0.8921682782362172,
"train_speed(iter/s)": 0.142349
},
{
"epoch": 0.6464646464646465,
"eval_loss": 0.3519335985183716,
"eval_runtime": 5.3292,
"eval_samples_per_second": 18.764,
"eval_steps_per_second": 4.691,
"eval_token_acc": 0.9018979225442421,
"step": 100
},
{
"epoch": 0.6787878787878788,
"grad_norm": 1.1157113313674927,
"learning_rate": 8.778747871771293e-06,
"loss": 0.2843419790267944,
"memory(GiB)": 31.36,
"step": 105,
"token_acc": 0.9077419118455281,
"train_speed(iter/s)": 0.140295
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.9619855880737305,
"learning_rate": 8.665259359149132e-06,
"loss": 0.2657431125640869,
"memory(GiB)": 31.36,
"step": 110,
"token_acc": 0.9188039238860655,
"train_speed(iter/s)": 0.141014
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.9229025840759277,
"learning_rate": 8.547534234672435e-06,
"loss": 0.28671417236328123,
"memory(GiB)": 31.36,
"step": 115,
"token_acc": 0.9083705301168564,
"train_speed(iter/s)": 0.141923
},
{
"epoch": 0.7757575757575758,
"grad_norm": 1.197439432144165,
"learning_rate": 8.425708574839221e-06,
"loss": 0.26473350524902345,
"memory(GiB)": 31.36,
"step": 120,
"token_acc": 0.9178892300693734,
"train_speed(iter/s)": 0.14256
},
{
"epoch": 0.7757575757575758,
"eval_loss": 0.3486604392528534,
"eval_runtime": 5.3218,
"eval_samples_per_second": 18.791,
"eval_steps_per_second": 4.698,
"eval_token_acc": 0.9009147644695221,
"step": 120
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.9874864220619202,
"learning_rate": 8.299923195887599e-06,
"loss": 0.30171942710876465,
"memory(GiB)": 31.36,
"step": 125,
"token_acc": 0.9013058472507459,
"train_speed(iter/s)": 0.140695
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.916023850440979,
"learning_rate": 8.170323491028625e-06,
"loss": 0.2660797119140625,
"memory(GiB)": 31.36,
"step": 130,
"token_acc": 0.9089231715442571,
"train_speed(iter/s)": 0.141028
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.9316055774688721,
"learning_rate": 8.03705926238874e-06,
"loss": 0.26981799602508544,
"memory(GiB)": 31.36,
"step": 135,
"token_acc": 0.9050517346317711,
"train_speed(iter/s)": 0.141684
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.991523802280426,
"learning_rate": 7.900284547855992e-06,
"loss": 0.30231451988220215,
"memory(GiB)": 31.36,
"step": 140,
"token_acc": 0.9078060346831559,
"train_speed(iter/s)": 0.142094
},
{
"epoch": 0.9050505050505051,
"eval_loss": 0.34019821882247925,
"eval_runtime": 5.3029,
"eval_samples_per_second": 18.857,
"eval_steps_per_second": 4.714,
"eval_token_acc": 0.9021543985637342,
"step": 140
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.9023342132568359,
"learning_rate": 7.760157443030234e-06,
"loss": 0.26686046123504636,
"memory(GiB)": 31.36,
"step": 145,
"token_acc": 0.9070432898663288,
"train_speed(iter/s)": 0.140367
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.8874338269233704,
"learning_rate": 7.616839918483061e-06,
"loss": 0.287930965423584,
"memory(GiB)": 31.36,
"step": 150,
"token_acc": 0.9119464202274026,
"train_speed(iter/s)": 0.140934
},
{
"epoch": 1.0,
"grad_norm": 0.9795826077461243,
"learning_rate": 7.470497632538743e-06,
"loss": 0.267154598236084,
"memory(GiB)": 31.36,
"step": 155,
"token_acc": 0.9206813096488424,
"train_speed(iter/s)": 0.141745
},
{
"epoch": 1.0323232323232323,
"grad_norm": 1.0346843004226685,
"learning_rate": 7.321299739792553e-06,
"loss": 0.210396146774292,
"memory(GiB)": 31.36,
"step": 160,
"token_acc": 0.9283128167994207,
"train_speed(iter/s)": 0.142337
},
{
"epoch": 1.0323232323232323,
"eval_loss": 0.3432323932647705,
"eval_runtime": 5.3155,
"eval_samples_per_second": 18.813,
"eval_steps_per_second": 4.703,
"eval_token_acc": 0.90360776267419,
"step": 160
},
{
"epoch": 1.0646464646464646,
"grad_norm": 1.1043457984924316,
"learning_rate": 7.169418695587791e-06,
"loss": 0.18866196870803834,
"memory(GiB)": 31.36,
"step": 165,
"token_acc": 0.9338293722459005,
"train_speed(iter/s)": 0.140897
},
{
"epoch": 1.096969696969697,
"grad_norm": 1.0049176216125488,
"learning_rate": 7.015030056677559e-06,
"loss": 0.19572091102600098,
"memory(GiB)": 31.36,
"step": 170,
"token_acc": 0.9378101525153654,
"train_speed(iter/s)": 0.141758
},
{
"epoch": 1.1292929292929292,
"grad_norm": 1.0148508548736572,
"learning_rate": 6.858312278301638e-06,
"loss": 0.20892024040222168,
"memory(GiB)": 31.36,
"step": 175,
"token_acc": 0.9264176417641764,
"train_speed(iter/s)": 0.142192
},
{
"epoch": 1.1616161616161615,
"grad_norm": 0.8647620677947998,
"learning_rate": 6.699446507913083e-06,
"loss": 0.190657377243042,
"memory(GiB)": 31.36,
"step": 180,
"token_acc": 0.929062185462343,
"train_speed(iter/s)": 0.142547
},
{
"epoch": 1.1616161616161615,
"eval_loss": 0.3503071963787079,
"eval_runtime": 5.314,
"eval_samples_per_second": 18.818,
"eval_steps_per_second": 4.705,
"eval_token_acc": 0.9053176028041379,
"step": 180
},
{
"epoch": 1.1939393939393939,
"grad_norm": 0.734727680683136,
"learning_rate": 6.53861637579291e-06,
"loss": 0.19445569515228273,
"memory(GiB)": 31.36,
"step": 185,
"token_acc": 0.9243174259416815,
"train_speed(iter/s)": 0.141275
},
{
"epoch": 1.2262626262626264,
"grad_norm": 1.0062605142593384,
"learning_rate": 6.376007782794926e-06,
"loss": 0.23203377723693847,
"memory(GiB)": 31.36,
"step": 190,
"token_acc": 0.9284152664126429,
"train_speed(iter/s)": 0.142023
},
{
"epoch": 1.2585858585858585,
"grad_norm": 0.8467244505882263,
"learning_rate": 6.211808685466063e-06,
"loss": 0.20841593742370607,
"memory(GiB)": 31.36,
"step": 195,
"token_acc": 0.9305149796643181,
"train_speed(iter/s)": 0.14276
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.8289315700531006,
"learning_rate": 6.046208878790543e-06,
"loss": 0.20221335887908937,
"memory(GiB)": 31.36,
"step": 200,
"token_acc": 0.9386574826174541,
"train_speed(iter/s)": 0.143051
},
{
"epoch": 1.290909090909091,
"eval_loss": 0.3498072922229767,
"eval_runtime": 5.3519,
"eval_samples_per_second": 18.685,
"eval_steps_per_second": 4.671,
"eval_token_acc": 0.9033512866546978,
"step": 200
},
{
"epoch": 1.3232323232323233,
"grad_norm": 0.9581440091133118,
"learning_rate": 5.879399776809047e-06,
"loss": 0.1989992380142212,
"memory(GiB)": 31.36,
"step": 205,
"token_acc": 0.9317071351311029,
"train_speed(iter/s)": 0.14184
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.905442476272583,
"learning_rate": 5.711574191366427e-06,
"loss": 0.20228266716003418,
"memory(GiB)": 31.36,
"step": 210,
"token_acc": 0.9324250349735418,
"train_speed(iter/s)": 0.142185
},
{
"epoch": 1.387878787878788,
"grad_norm": 0.8766697645187378,
"learning_rate": 5.542926109243727e-06,
"loss": 0.18625075817108155,
"memory(GiB)": 31.36,
"step": 215,
"token_acc": 0.9340369393139841,
"train_speed(iter/s)": 0.142509
},
{
"epoch": 1.4202020202020202,
"grad_norm": 0.8826522827148438,
"learning_rate": 5.373650467932122e-06,
"loss": 0.18408771753311157,
"memory(GiB)": 31.36,
"step": 220,
"token_acc": 0.9386043390740326,
"train_speed(iter/s)": 0.14291
},
{
"epoch": 1.4202020202020202,
"eval_loss": 0.3430667817592621,
"eval_runtime": 5.3043,
"eval_samples_per_second": 18.853,
"eval_steps_per_second": 4.713,
"eval_token_acc": 0.9047619047619048,
"step": 220
},
{
"epoch": 1.4525252525252526,
"grad_norm": 1.0129814147949219,
"learning_rate": 5.2039429303079294e-06,
"loss": 0.20485594272613525,
"memory(GiB)": 31.36,
"step": 225,
"token_acc": 0.9242819843342036,
"train_speed(iter/s)": 0.142107
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.9322662353515625,
"learning_rate": 5.033999658469174e-06,
"loss": 0.1999491572380066,
"memory(GiB)": 31.36,
"step": 230,
"token_acc": 0.935610103166133,
"train_speed(iter/s)": 0.142426
},
{
"epoch": 1.5171717171717172,
"grad_norm": 0.9451385140419006,
"learning_rate": 4.864017086995112e-06,
"loss": 0.20325605869293212,
"memory(GiB)": 31.36,
"step": 235,
"token_acc": 0.9395159286317778,
"train_speed(iter/s)": 0.142786
},
{
"epoch": 1.5494949494949495,
"grad_norm": 0.7493119835853577,
"learning_rate": 4.694191695890788e-06,
"loss": 0.17417298555374144,
"memory(GiB)": 31.36,
"step": 240,
"token_acc": 0.9434940634843713,
"train_speed(iter/s)": 0.14319
},
{
"epoch": 1.5494949494949495,
"eval_loss": 0.3442021608352661,
"eval_runtime": 5.3253,
"eval_samples_per_second": 18.778,
"eval_steps_per_second": 4.695,
"eval_token_acc": 0.9033512866546978,
"step": 240
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.8448814749717712,
"learning_rate": 4.524719783479088e-06,
"loss": 0.17924880981445312,
"memory(GiB)": 31.36,
"step": 245,
"token_acc": 0.9297393970362801,
"train_speed(iter/s)": 0.142181
},
{
"epoch": 1.614141414141414,
"grad_norm": 0.8817701935768127,
"learning_rate": 4.355797239502807e-06,
"loss": 0.1808495044708252,
"memory(GiB)": 31.37,
"step": 250,
"token_acc": 0.932560963270262,
"train_speed(iter/s)": 0.142404
},
{
"epoch": 1.6464646464646466,
"grad_norm": 0.892765462398529,
"learning_rate": 4.187619318698971e-06,
"loss": 0.1975640058517456,
"memory(GiB)": 31.37,
"step": 255,
"token_acc": 0.9292730844793713,
"train_speed(iter/s)": 0.142833
},
{
"epoch": 1.6787878787878787,
"grad_norm": 0.8926272392272949,
"learning_rate": 4.020380415107167e-06,
"loss": 0.19108818769454955,
"memory(GiB)": 31.37,
"step": 260,
"token_acc": 0.9380416838629798,
"train_speed(iter/s)": 0.14303
},
{
"epoch": 1.6787878787878787,
"eval_loss": 0.3395140767097473,
"eval_runtime": 5.3138,
"eval_samples_per_second": 18.819,
"eval_steps_per_second": 4.705,
"eval_token_acc": 0.904719158758656,
"step": 260
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.9119723439216614,
"learning_rate": 3.854273837372724e-06,
"loss": 0.1968384265899658,
"memory(GiB)": 31.37,
"step": 265,
"token_acc": 0.9235020131049183,
"train_speed(iter/s)": 0.14221
},
{
"epoch": 1.7434343434343433,
"grad_norm": 0.8273991942405701,
"learning_rate": 3.689491585304491e-06,
"loss": 0.1967773199081421,
"memory(GiB)": 31.37,
"step": 270,
"token_acc": 0.9207749251850685,
"train_speed(iter/s)": 0.142463
},
{
"epoch": 1.7757575757575759,
"grad_norm": 1.0739576816558838,
"learning_rate": 3.526224127945479e-06,
"loss": 0.19146767854690552,
"memory(GiB)": 31.37,
"step": 275,
"token_acc": 0.9385914241279716,
"train_speed(iter/s)": 0.142947
},
{
"epoch": 1.808080808080808,
"grad_norm": 0.7444007992744446,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.18657138347625732,
"memory(GiB)": 31.37,
"step": 280,
"token_acc": 0.9347743648084945,
"train_speed(iter/s)": 0.143177
},
{
"epoch": 1.808080808080808,
"eval_loss": 0.33954280614852905,
"eval_runtime": 5.3308,
"eval_samples_per_second": 18.759,
"eval_steps_per_second": 4.69,
"eval_token_acc": 0.9033512866546978,
"step": 280
},
{
"epoch": 1.8404040404040405,
"grad_norm": 0.854715883731842,
"learning_rate": 3.204986500762006e-06,
"loss": 0.17565726041793822,
"memory(GiB)": 31.37,
"step": 285,
"token_acc": 0.9216834543608964,
"train_speed(iter/s)": 0.142226
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.9426065683364868,
"learning_rate": 3.0473876441260786e-06,
"loss": 0.18239200115203857,
"memory(GiB)": 31.37,
"step": 290,
"token_acc": 0.939308718134809,
"train_speed(iter/s)": 0.142498
},
{
"epoch": 1.905050505050505,
"grad_norm": 0.7922055721282959,
"learning_rate": 2.8920457793817507e-06,
"loss": 0.19457708597183226,
"memory(GiB)": 31.37,
"step": 295,
"token_acc": 0.9396199182102478,
"train_speed(iter/s)": 0.142765
},
{
"epoch": 1.9373737373737374,
"grad_norm": 1.0278714895248413,
"learning_rate": 2.7391404635865725e-06,
"loss": 0.1904490351676941,
"memory(GiB)": 31.37,
"step": 300,
"token_acc": 0.9375734430082256,
"train_speed(iter/s)": 0.143068
},
{
"epoch": 1.9373737373737374,
"eval_loss": 0.34051987528800964,
"eval_runtime": 5.283,
"eval_samples_per_second": 18.929,
"eval_steps_per_second": 4.732,
"eval_token_acc": 0.9058733008463709,
"step": 300
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.7814855575561523,
"learning_rate": 2.5888484374320033e-06,
"loss": 0.18078551292419434,
"memory(GiB)": 31.37,
"step": 305,
"token_acc": 0.9328905860866678,
"train_speed(iter/s)": 0.142327
},
{
"epoch": 2.0,
"grad_norm": 1.068848967552185,
"learning_rate": 2.4413434209518137e-06,
"loss": 0.1775040626525879,
"memory(GiB)": 31.37,
"step": 310,
"token_acc": 0.9349484536082474,
"train_speed(iter/s)": 0.142606
},
{
"epoch": 2.0323232323232325,
"grad_norm": 0.8913955092430115,
"learning_rate": 2.296795912722014e-06,
"loss": 0.13691856861114501,
"memory(GiB)": 31.37,
"step": 315,
"token_acc": 0.9597564204395023,
"train_speed(iter/s)": 0.142686
},
{
"epoch": 2.0646464646464646,
"grad_norm": 0.7020771503448486,
"learning_rate": 2.1553729927843894e-06,
"loss": 0.14630917310714722,
"memory(GiB)": 31.37,
"step": 320,
"token_acc": 0.9601399342732959,
"train_speed(iter/s)": 0.142875
},
{
"epoch": 2.0646464646464646,
"eval_loss": 0.3507172465324402,
"eval_runtime": 5.3226,
"eval_samples_per_second": 18.788,
"eval_steps_per_second": 4.697,
"eval_token_acc": 0.9062152688723605,
"step": 320
},
{
"epoch": 2.096969696969697,
"grad_norm": 0.7276548743247986,
"learning_rate": 2.017238129521506e-06,
"loss": 0.14279915094375611,
"memory(GiB)": 31.37,
"step": 325,
"token_acc": 0.9453313981615868,
"train_speed(iter/s)": 0.142166
},
{
"epoch": 2.1292929292929292,
"grad_norm": 0.9800724387168884,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.14380053281784058,
"memory(GiB)": 31.37,
"step": 330,
"token_acc": 0.9582820773386105,
"train_speed(iter/s)": 0.142491
},
{
"epoch": 2.1616161616161618,
"grad_norm": 0.764988899230957,
"learning_rate": 1.7514672589449378e-06,
"loss": 0.1322154998779297,
"memory(GiB)": 31.37,
"step": 335,
"token_acc": 0.9540802213001384,
"train_speed(iter/s)": 0.142717
},
{
"epoch": 2.193939393939394,
"grad_norm": 0.8425831198692322,
"learning_rate": 1.6241384517255854e-06,
"loss": 0.1310911536216736,
"memory(GiB)": 31.37,
"step": 340,
"token_acc": 0.9542607139305956,
"train_speed(iter/s)": 0.142908
},
{
"epoch": 2.193939393939394,
"eval_loss": 0.3661825954914093,
"eval_runtime": 5.3319,
"eval_samples_per_second": 18.755,
"eval_steps_per_second": 4.689,
"eval_token_acc": 0.9046764127554073,
"step": 340
},
{
"epoch": 2.2262626262626264,
"grad_norm": 0.8031105399131775,
"learning_rate": 1.500711746282192e-06,
"loss": 0.15711712837219238,
"memory(GiB)": 31.37,
"step": 345,
"token_acc": 0.9412384531628675,
"train_speed(iter/s)": 0.142295
},
{
"epoch": 2.2585858585858585,
"grad_norm": 0.7701563835144043,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.1371659517288208,
"memory(GiB)": 31.37,
"step": 350,
"token_acc": 0.9590588494599956,
"train_speed(iter/s)": 0.142428
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.7400316596031189,
"learning_rate": 1.2661306328825818e-06,
"loss": 0.13530057668685913,
"memory(GiB)": 31.37,
"step": 355,
"token_acc": 0.9558357869007278,
"train_speed(iter/s)": 0.142645
},
{
"epoch": 2.323232323232323,
"grad_norm": 0.8807271718978882,
"learning_rate": 1.1552473733031893e-06,
"loss": 0.13264925479888917,
"memory(GiB)": 31.37,
"step": 360,
"token_acc": 0.9480800203441553,
"train_speed(iter/s)": 0.142961
},
{
"epoch": 2.323232323232323,
"eval_loss": 0.3667658269405365,
"eval_runtime": 5.3077,
"eval_samples_per_second": 18.841,
"eval_steps_per_second": 4.71,
"eval_token_acc": 0.9054458408138839,
"step": 360
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.8146414160728455,
"learning_rate": 1.0488081988375493e-06,
"loss": 0.14928791522979737,
"memory(GiB)": 31.37,
"step": 365,
"token_acc": 0.9331647539389224,
"train_speed(iter/s)": 0.142423
},
{
"epoch": 2.3878787878787877,
"grad_norm": 0.8976642489433289,
"learning_rate": 9.469361407432431e-07,
"loss": 0.14599543809890747,
"memory(GiB)": 31.37,
"step": 370,
"token_acc": 0.9549465467503818,
"train_speed(iter/s)": 0.142603
},
{
"epoch": 2.4202020202020202,
"grad_norm": 0.7792761921882629,
"learning_rate": 8.497489512245971e-07,
"loss": 0.13710694313049315,
"memory(GiB)": 31.37,
"step": 375,
"token_acc": 0.9578516805975458,
"train_speed(iter/s)": 0.142793
},
{
"epoch": 2.4525252525252528,
"grad_norm": 0.9567272663116455,
"learning_rate": 7.573589673248833e-07,
"loss": 0.13858846426010132,
"memory(GiB)": 31.37,
"step": 380,
"token_acc": 0.9512100926879505,
"train_speed(iter/s)": 0.143064
},
{
"epoch": 2.4525252525252528,
"eval_loss": 0.3684777021408081,
"eval_runtime": 5.3389,
"eval_samples_per_second": 18.73,
"eval_steps_per_second": 4.683,
"eval_token_acc": 0.905018380781397,
"step": 380
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.7511286735534668,
"learning_rate": 6.698729810778065e-07,
"loss": 0.14136791229248047,
"memory(GiB)": 31.37,
"step": 385,
"token_acc": 0.9423156301596037,
"train_speed(iter/s)": 0.14245
},
{
"epoch": 2.517171717171717,
"grad_norm": 0.8342220783233643,
"learning_rate": 5.873921160683943e-07,
"loss": 0.13304708003997803,
"memory(GiB)": 31.37,
"step": 390,
"token_acc": 0.9584492790285859,
"train_speed(iter/s)": 0.142715
},
{
"epoch": 2.5494949494949495,
"grad_norm": 0.7826300263404846,
"learning_rate": 5.100117105459279e-07,
"loss": 0.1284404754638672,
"memory(GiB)": 31.37,
"step": 395,
"token_acc": 0.9555216985304958,
"train_speed(iter/s)": 0.142902
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.7116090655326843,
"learning_rate": 4.3782120722406565e-07,
"loss": 0.15817636251449585,
"memory(GiB)": 31.37,
"step": 400,
"token_acc": 0.9559717504022885,
"train_speed(iter/s)": 0.143144
},
{
"epoch": 2.581818181818182,
"eval_loss": 0.3664516508579254,
"eval_runtime": 5.3264,
"eval_samples_per_second": 18.774,
"eval_steps_per_second": 4.694,
"eval_token_acc": 0.9054885868171326,
"step": 400
},
{
"epoch": 2.614141414141414,
"grad_norm": 0.7377658486366272,
"learning_rate": 3.709040498955102e-07,
"loss": 0.13814414739608766,
"memory(GiB)": 31.37,
"step": 405,
"token_acc": 0.9434749034749035,
"train_speed(iter/s)": 0.142529
},
{
"epoch": 2.6464646464646466,
"grad_norm": 0.9466362595558167,
"learning_rate": 3.0933758698072023e-07,
"loss": 0.13764555454254152,
"memory(GiB)": 31.37,
"step": 410,
"token_acc": 0.9556354916067147,
"train_speed(iter/s)": 0.142696
},
{
"epoch": 2.6787878787878787,
"grad_norm": 0.8730055093765259,
"learning_rate": 2.531929821221768e-07,
"loss": 0.1323538064956665,
"memory(GiB)": 31.37,
"step": 415,
"token_acc": 0.951093389819949,
"train_speed(iter/s)": 0.142879
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.8799780011177063,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.1415894865989685,
"memory(GiB)": 31.37,
"step": 420,
"token_acc": 0.957655213984328,
"train_speed(iter/s)": 0.143168
},
{
"epoch": 2.7111111111111112,
"eval_loss": 0.3659995198249817,
"eval_runtime": 5.3257,
"eval_samples_per_second": 18.777,
"eval_steps_per_second": 4.694,
"eval_token_acc": 0.9056595708301274,
"step": 420
},
{
"epoch": 2.7434343434343433,
"grad_norm": 0.9417216777801514,
"learning_rate": 1.5742259095662126e-07,
"loss": 0.13300987482070922,
"memory(GiB)": 31.37,
"step": 425,
"token_acc": 0.9425528606965174,
"train_speed(iter/s)": 0.142605
},
{
"epoch": 2.775757575757576,
"grad_norm": 0.7449456453323364,
"learning_rate": 1.1790750403941231e-07,
"loss": 0.13196516036987305,
"memory(GiB)": 31.37,
"step": 430,
"token_acc": 0.9565181855333061,
"train_speed(iter/s)": 0.142783
},
{
"epoch": 2.808080808080808,
"grad_norm": 0.807500422000885,
"learning_rate": 8.403554600248498e-08,
"loss": 0.12061362266540528,
"memory(GiB)": 31.37,
"step": 435,
"token_acc": 0.9600038504115127,
"train_speed(iter/s)": 0.142955
},
{
"epoch": 2.8404040404040405,
"grad_norm": 0.8164981007575989,
"learning_rate": 5.584586887435739e-08,
"loss": 0.1402422547340393,
"memory(GiB)": 31.37,
"step": 440,
"token_acc": 0.9568223268439581,
"train_speed(iter/s)": 0.143151
},
{
"epoch": 2.8404040404040405,
"eval_loss": 0.3657556176185608,
"eval_runtime": 5.3232,
"eval_samples_per_second": 18.786,
"eval_steps_per_second": 4.696,
"eval_token_acc": 0.9058733008463709,
"step": 440
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.8275712728500366,
"learning_rate": 3.337105663029361e-08,
"loss": 0.13185644149780273,
"memory(GiB)": 31.37,
"step": 445,
"token_acc": 0.9404505730973607,
"train_speed(iter/s)": 0.142606
},
{
"epoch": 2.905050505050505,
"grad_norm": 0.8103200197219849,
"learning_rate": 1.6637087529033925e-08,
"loss": 0.12856653928756714,
"memory(GiB)": 31.37,
"step": 450,
"token_acc": 0.9603827178950516,
"train_speed(iter/s)": 0.14286
},
{
"epoch": 2.937373737373737,
"grad_norm": 0.7664045691490173,
"learning_rate": 5.6633040849601865e-09,
"loss": 0.12633774280548096,
"memory(GiB)": 31.37,
"step": 455,
"token_acc": 0.9499862901014533,
"train_speed(iter/s)": 0.142986
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.8392007350921631,
"learning_rate": 4.623907104084335e-10,
"loss": 0.15011647939682007,
"memory(GiB)": 31.37,
"step": 460,
"token_acc": 0.9552918794432762,
"train_speed(iter/s)": 0.143271
},
{
"epoch": 2.9696969696969697,
"eval_loss": 0.36587560176849365,
"eval_runtime": 5.3221,
"eval_samples_per_second": 18.79,
"eval_steps_per_second": 4.697,
"eval_token_acc": 0.9063007608788578,
"step": 460
},
{
"epoch": 2.9826262626262627,
"eval_loss": 0.36588189005851746,
"eval_runtime": 5.3146,
"eval_samples_per_second": 18.816,
"eval_steps_per_second": 4.704,
"eval_token_acc": 0.9059587928528683,
"step": 462
}
],
"logging_steps": 5,
"max_steps": 462,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.302620480776438e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}