Model: waltonfuture/qwen2.5vl-3b-sampled_10000_reflection-cot-7b Source: Original Platform
1181 lines
33 KiB
JSON
1181 lines
33 KiB
JSON
{
|
|
"best_global_step": 260,
|
|
"best_metric": 0.33951408,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v13-20250507-015956/checkpoint-260",
|
|
"epoch": 2.9826262626262627,
|
|
"eval_steps": 20,
|
|
"global_step": 462,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.006464646464646465,
|
|
"grad_norm": 2.9768006801605225,
|
|
"learning_rate": 9.999884400986087e-06,
|
|
"loss": 0.4710537791252136,
|
|
"memory(GiB)": 27.77,
|
|
"step": 1,
|
|
"token_acc": 0.8540840602696272,
|
|
"train_speed(iter/s)": 0.065349
|
|
},
|
|
{
|
|
"epoch": 0.03232323232323232,
|
|
"grad_norm": 1.8765902519226074,
|
|
"learning_rate": 9.997110291906109e-06,
|
|
"loss": 0.3920785188674927,
|
|
"memory(GiB)": 27.77,
|
|
"step": 5,
|
|
"token_acc": 0.8758898589657488,
|
|
"train_speed(iter/s)": 0.119626
|
|
},
|
|
{
|
|
"epoch": 0.06464646464646465,
|
|
"grad_norm": 1.293035626411438,
|
|
"learning_rate": 9.988444507789584e-06,
|
|
"loss": 0.3347900867462158,
|
|
"memory(GiB)": 27.77,
|
|
"step": 10,
|
|
"token_acc": 0.886203631273416,
|
|
"train_speed(iter/s)": 0.137285
|
|
},
|
|
{
|
|
"epoch": 0.09696969696969697,
|
|
"grad_norm": 1.2047252655029297,
|
|
"learning_rate": 9.97401266428502e-06,
|
|
"loss": 0.2941281318664551,
|
|
"memory(GiB)": 27.77,
|
|
"step": 15,
|
|
"token_acc": 0.9130237482982907,
|
|
"train_speed(iter/s)": 0.139757
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"grad_norm": 1.105458378791809,
|
|
"learning_rate": 9.953831442918418e-06,
|
|
"loss": 0.30655100345611574,
|
|
"memory(GiB)": 27.77,
|
|
"step": 20,
|
|
"token_acc": 0.8922364378410602,
|
|
"train_speed(iter/s)": 0.143912
|
|
},
|
|
{
|
|
"epoch": 0.1292929292929293,
|
|
"eval_loss": 0.4002552032470703,
|
|
"eval_runtime": 5.4131,
|
|
"eval_samples_per_second": 18.474,
|
|
"eval_steps_per_second": 4.618,
|
|
"eval_token_acc": 0.8938616739334873,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.16161616161616163,
|
|
"grad_norm": 1.041798710823059,
|
|
"learning_rate": 9.927924170825266e-06,
|
|
"loss": 0.29069504737854,
|
|
"memory(GiB)": 29.53,
|
|
"step": 25,
|
|
"token_acc": 0.8984301348526458,
|
|
"train_speed(iter/s)": 0.13295
|
|
},
|
|
{
|
|
"epoch": 0.19393939393939394,
|
|
"grad_norm": 1.0589144229888916,
|
|
"learning_rate": 9.896320793787106e-06,
|
|
"loss": 0.33169257640838623,
|
|
"memory(GiB)": 29.53,
|
|
"step": 30,
|
|
"token_acc": 0.8989100590731341,
|
|
"train_speed(iter/s)": 0.138041
|
|
},
|
|
{
|
|
"epoch": 0.22626262626262628,
|
|
"grad_norm": 0.905042290687561,
|
|
"learning_rate": 9.859057841617709e-06,
|
|
"loss": 0.29272122383117677,
|
|
"memory(GiB)": 29.53,
|
|
"step": 35,
|
|
"token_acc": 0.9075644898907739,
|
|
"train_speed(iter/s)": 0.140237
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"grad_norm": 0.8976985812187195,
|
|
"learning_rate": 9.816178385938867e-06,
|
|
"loss": 0.29633958339691163,
|
|
"memory(GiB)": 29.53,
|
|
"step": 40,
|
|
"token_acc": 0.9119097956307258,
|
|
"train_speed(iter/s)": 0.142425
|
|
},
|
|
{
|
|
"epoch": 0.2585858585858586,
|
|
"eval_loss": 0.375478595495224,
|
|
"eval_runtime": 5.311,
|
|
"eval_samples_per_second": 18.829,
|
|
"eval_steps_per_second": 4.707,
|
|
"eval_token_acc": 0.8989911943233307,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 1.0213873386383057,
|
|
"learning_rate": 9.767731990394638e-06,
|
|
"loss": 0.2878244400024414,
|
|
"memory(GiB)": 29.53,
|
|
"step": 45,
|
|
"token_acc": 0.8998546863647372,
|
|
"train_speed(iter/s)": 0.137223
|
|
},
|
|
{
|
|
"epoch": 0.32323232323232326,
|
|
"grad_norm": 0.9807785749435425,
|
|
"learning_rate": 9.71377465336155e-06,
|
|
"loss": 0.30373663902282716,
|
|
"memory(GiB)": 29.53,
|
|
"step": 50,
|
|
"token_acc": 0.9004981024667932,
|
|
"train_speed(iter/s)": 0.139351
|
|
},
|
|
{
|
|
"epoch": 0.35555555555555557,
|
|
"grad_norm": 0.87921142578125,
|
|
"learning_rate": 9.654368743221022e-06,
|
|
"loss": 0.2892775535583496,
|
|
"memory(GiB)": 29.53,
|
|
"step": 55,
|
|
"token_acc": 0.8972517176764522,
|
|
"train_speed(iter/s)": 0.140773
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"grad_norm": 1.0269665718078613,
|
|
"learning_rate": 9.589582926268798e-06,
|
|
"loss": 0.2767331600189209,
|
|
"memory(GiB)": 29.53,
|
|
"step": 60,
|
|
"token_acc": 0.917551472624873,
|
|
"train_speed(iter/s)": 0.142633
|
|
},
|
|
{
|
|
"epoch": 0.3878787878787879,
|
|
"eval_loss": 0.36320608854293823,
|
|
"eval_runtime": 5.3096,
|
|
"eval_samples_per_second": 18.834,
|
|
"eval_steps_per_second": 4.708,
|
|
"eval_token_acc": 0.9018979225442421,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.4202020202020202,
|
|
"grad_norm": 0.93445885181427,
|
|
"learning_rate": 9.519492087344724e-06,
|
|
"loss": 0.26028871536254883,
|
|
"memory(GiB)": 29.53,
|
|
"step": 65,
|
|
"token_acc": 0.9259884974838246,
|
|
"train_speed(iter/s)": 0.137874
|
|
},
|
|
{
|
|
"epoch": 0.45252525252525255,
|
|
"grad_norm": 0.9748146533966064,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.29547710418701173,
|
|
"memory(GiB)": 29.53,
|
|
"step": 70,
|
|
"token_acc": 0.8966111739669199,
|
|
"train_speed(iter/s)": 0.140001
|
|
},
|
|
{
|
|
"epoch": 0.48484848484848486,
|
|
"grad_norm": 0.937225341796875,
|
|
"learning_rate": 9.363725449224281e-06,
|
|
"loss": 0.2684732675552368,
|
|
"memory(GiB)": 29.53,
|
|
"step": 75,
|
|
"token_acc": 0.9166046511627907,
|
|
"train_speed(iter/s)": 0.141231
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"grad_norm": 1.026237964630127,
|
|
"learning_rate": 9.278229698073889e-06,
|
|
"loss": 0.2696544647216797,
|
|
"memory(GiB)": 29.53,
|
|
"step": 80,
|
|
"token_acc": 0.9159072741806554,
|
|
"train_speed(iter/s)": 0.142009
|
|
},
|
|
{
|
|
"epoch": 0.5171717171717172,
|
|
"eval_loss": 0.35313570499420166,
|
|
"eval_runtime": 5.3322,
|
|
"eval_samples_per_second": 18.754,
|
|
"eval_steps_per_second": 4.689,
|
|
"eval_token_acc": 0.9009575104727708,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.5494949494949495,
|
|
"grad_norm": 0.9791724681854248,
|
|
"learning_rate": 9.187788812929074e-06,
|
|
"loss": 0.2900824546813965,
|
|
"memory(GiB)": 29.53,
|
|
"step": 85,
|
|
"token_acc": 0.9096507542880761,
|
|
"train_speed(iter/s)": 0.139042
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 1.288805603981018,
|
|
"learning_rate": 9.092507332892968e-06,
|
|
"loss": 0.2910241365432739,
|
|
"memory(GiB)": 31.36,
|
|
"step": 90,
|
|
"token_acc": 0.9108959132712102,
|
|
"train_speed(iter/s)": 0.14038
|
|
},
|
|
{
|
|
"epoch": 0.6141414141414141,
|
|
"grad_norm": 1.055824875831604,
|
|
"learning_rate": 8.992495392231195e-06,
|
|
"loss": 0.2932537794113159,
|
|
"memory(GiB)": 31.36,
|
|
"step": 95,
|
|
"token_acc": 0.904834793623984,
|
|
"train_speed(iter/s)": 0.141584
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"grad_norm": 1.0129783153533936,
|
|
"learning_rate": 8.88786859306952e-06,
|
|
"loss": 0.26900548934936525,
|
|
"memory(GiB)": 31.36,
|
|
"step": 100,
|
|
"token_acc": 0.8921682782362172,
|
|
"train_speed(iter/s)": 0.142349
|
|
},
|
|
{
|
|
"epoch": 0.6464646464646465,
|
|
"eval_loss": 0.3519335985183716,
|
|
"eval_runtime": 5.3292,
|
|
"eval_samples_per_second": 18.764,
|
|
"eval_steps_per_second": 4.691,
|
|
"eval_token_acc": 0.9018979225442421,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.6787878787878788,
|
|
"grad_norm": 1.1157113313674927,
|
|
"learning_rate": 8.778747871771293e-06,
|
|
"loss": 0.2843419790267944,
|
|
"memory(GiB)": 31.36,
|
|
"step": 105,
|
|
"token_acc": 0.9077419118455281,
|
|
"train_speed(iter/s)": 0.140295
|
|
},
|
|
{
|
|
"epoch": 0.7111111111111111,
|
|
"grad_norm": 0.9619855880737305,
|
|
"learning_rate": 8.665259359149132e-06,
|
|
"loss": 0.2657431125640869,
|
|
"memory(GiB)": 31.36,
|
|
"step": 110,
|
|
"token_acc": 0.9188039238860655,
|
|
"train_speed(iter/s)": 0.141014
|
|
},
|
|
{
|
|
"epoch": 0.7434343434343434,
|
|
"grad_norm": 0.9229025840759277,
|
|
"learning_rate": 8.547534234672435e-06,
|
|
"loss": 0.28671417236328123,
|
|
"memory(GiB)": 31.36,
|
|
"step": 115,
|
|
"token_acc": 0.9083705301168564,
|
|
"train_speed(iter/s)": 0.141923
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"grad_norm": 1.197439432144165,
|
|
"learning_rate": 8.425708574839221e-06,
|
|
"loss": 0.26473350524902345,
|
|
"memory(GiB)": 31.36,
|
|
"step": 120,
|
|
"token_acc": 0.9178892300693734,
|
|
"train_speed(iter/s)": 0.14256
|
|
},
|
|
{
|
|
"epoch": 0.7757575757575758,
|
|
"eval_loss": 0.3486604392528534,
|
|
"eval_runtime": 5.3218,
|
|
"eval_samples_per_second": 18.791,
|
|
"eval_steps_per_second": 4.698,
|
|
"eval_token_acc": 0.9009147644695221,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.8080808080808081,
|
|
"grad_norm": 0.9874864220619202,
|
|
"learning_rate": 8.299923195887599e-06,
|
|
"loss": 0.30171942710876465,
|
|
"memory(GiB)": 31.36,
|
|
"step": 125,
|
|
"token_acc": 0.9013058472507459,
|
|
"train_speed(iter/s)": 0.140695
|
|
},
|
|
{
|
|
"epoch": 0.8404040404040404,
|
|
"grad_norm": 0.916023850440979,
|
|
"learning_rate": 8.170323491028625e-06,
|
|
"loss": 0.2660797119140625,
|
|
"memory(GiB)": 31.36,
|
|
"step": 130,
|
|
"token_acc": 0.9089231715442571,
|
|
"train_speed(iter/s)": 0.141028
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.9316055774688721,
|
|
"learning_rate": 8.03705926238874e-06,
|
|
"loss": 0.26981799602508544,
|
|
"memory(GiB)": 31.36,
|
|
"step": 135,
|
|
"token_acc": 0.9050517346317711,
|
|
"train_speed(iter/s)": 0.141684
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"grad_norm": 0.991523802280426,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.30231451988220215,
|
|
"memory(GiB)": 31.36,
|
|
"step": 140,
|
|
"token_acc": 0.9078060346831559,
|
|
"train_speed(iter/s)": 0.142094
|
|
},
|
|
{
|
|
"epoch": 0.9050505050505051,
|
|
"eval_loss": 0.34019821882247925,
|
|
"eval_runtime": 5.3029,
|
|
"eval_samples_per_second": 18.857,
|
|
"eval_steps_per_second": 4.714,
|
|
"eval_token_acc": 0.9021543985637342,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.9373737373737374,
|
|
"grad_norm": 0.9023342132568359,
|
|
"learning_rate": 7.760157443030234e-06,
|
|
"loss": 0.26686046123504636,
|
|
"memory(GiB)": 31.36,
|
|
"step": 145,
|
|
"token_acc": 0.9070432898663288,
|
|
"train_speed(iter/s)": 0.140367
|
|
},
|
|
{
|
|
"epoch": 0.9696969696969697,
|
|
"grad_norm": 0.8874338269233704,
|
|
"learning_rate": 7.616839918483061e-06,
|
|
"loss": 0.287930965423584,
|
|
"memory(GiB)": 31.36,
|
|
"step": 150,
|
|
"token_acc": 0.9119464202274026,
|
|
"train_speed(iter/s)": 0.140934
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.9795826077461243,
|
|
"learning_rate": 7.470497632538743e-06,
|
|
"loss": 0.267154598236084,
|
|
"memory(GiB)": 31.36,
|
|
"step": 155,
|
|
"token_acc": 0.9206813096488424,
|
|
"train_speed(iter/s)": 0.141745
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"grad_norm": 1.0346843004226685,
|
|
"learning_rate": 7.321299739792553e-06,
|
|
"loss": 0.210396146774292,
|
|
"memory(GiB)": 31.36,
|
|
"step": 160,
|
|
"token_acc": 0.9283128167994207,
|
|
"train_speed(iter/s)": 0.142337
|
|
},
|
|
{
|
|
"epoch": 1.0323232323232323,
|
|
"eval_loss": 0.3432323932647705,
|
|
"eval_runtime": 5.3155,
|
|
"eval_samples_per_second": 18.813,
|
|
"eval_steps_per_second": 4.703,
|
|
"eval_token_acc": 0.90360776267419,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.0646464646464646,
|
|
"grad_norm": 1.1043457984924316,
|
|
"learning_rate": 7.169418695587791e-06,
|
|
"loss": 0.18866196870803834,
|
|
"memory(GiB)": 31.36,
|
|
"step": 165,
|
|
"token_acc": 0.9338293722459005,
|
|
"train_speed(iter/s)": 0.140897
|
|
},
|
|
{
|
|
"epoch": 1.096969696969697,
|
|
"grad_norm": 1.0049176216125488,
|
|
"learning_rate": 7.015030056677559e-06,
|
|
"loss": 0.19572091102600098,
|
|
"memory(GiB)": 31.36,
|
|
"step": 170,
|
|
"token_acc": 0.9378101525153654,
|
|
"train_speed(iter/s)": 0.141758
|
|
},
|
|
{
|
|
"epoch": 1.1292929292929292,
|
|
"grad_norm": 1.0148508548736572,
|
|
"learning_rate": 6.858312278301638e-06,
|
|
"loss": 0.20892024040222168,
|
|
"memory(GiB)": 31.36,
|
|
"step": 175,
|
|
"token_acc": 0.9264176417641764,
|
|
"train_speed(iter/s)": 0.142192
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"grad_norm": 0.8647620677947998,
|
|
"learning_rate": 6.699446507913083e-06,
|
|
"loss": 0.190657377243042,
|
|
"memory(GiB)": 31.36,
|
|
"step": 180,
|
|
"token_acc": 0.929062185462343,
|
|
"train_speed(iter/s)": 0.142547
|
|
},
|
|
{
|
|
"epoch": 1.1616161616161615,
|
|
"eval_loss": 0.3503071963787079,
|
|
"eval_runtime": 5.314,
|
|
"eval_samples_per_second": 18.818,
|
|
"eval_steps_per_second": 4.705,
|
|
"eval_token_acc": 0.9053176028041379,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.1939393939393939,
|
|
"grad_norm": 0.734727680683136,
|
|
"learning_rate": 6.53861637579291e-06,
|
|
"loss": 0.19445569515228273,
|
|
"memory(GiB)": 31.36,
|
|
"step": 185,
|
|
"token_acc": 0.9243174259416815,
|
|
"train_speed(iter/s)": 0.141275
|
|
},
|
|
{
|
|
"epoch": 1.2262626262626264,
|
|
"grad_norm": 1.0062605142593384,
|
|
"learning_rate": 6.376007782794926e-06,
|
|
"loss": 0.23203377723693847,
|
|
"memory(GiB)": 31.36,
|
|
"step": 190,
|
|
"token_acc": 0.9284152664126429,
|
|
"train_speed(iter/s)": 0.142023
|
|
},
|
|
{
|
|
"epoch": 1.2585858585858585,
|
|
"grad_norm": 0.8467244505882263,
|
|
"learning_rate": 6.211808685466063e-06,
|
|
"loss": 0.20841593742370607,
|
|
"memory(GiB)": 31.36,
|
|
"step": 195,
|
|
"token_acc": 0.9305149796643181,
|
|
"train_speed(iter/s)": 0.14276
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"grad_norm": 0.8289315700531006,
|
|
"learning_rate": 6.046208878790543e-06,
|
|
"loss": 0.20221335887908937,
|
|
"memory(GiB)": 31.36,
|
|
"step": 200,
|
|
"token_acc": 0.9386574826174541,
|
|
"train_speed(iter/s)": 0.143051
|
|
},
|
|
{
|
|
"epoch": 1.290909090909091,
|
|
"eval_loss": 0.3498072922229767,
|
|
"eval_runtime": 5.3519,
|
|
"eval_samples_per_second": 18.685,
|
|
"eval_steps_per_second": 4.671,
|
|
"eval_token_acc": 0.9033512866546978,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.3232323232323233,
|
|
"grad_norm": 0.9581440091133118,
|
|
"learning_rate": 5.879399776809047e-06,
|
|
"loss": 0.1989992380142212,
|
|
"memory(GiB)": 31.36,
|
|
"step": 205,
|
|
"token_acc": 0.9317071351311029,
|
|
"train_speed(iter/s)": 0.14184
|
|
},
|
|
{
|
|
"epoch": 1.3555555555555556,
|
|
"grad_norm": 0.905442476272583,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.20228266716003418,
|
|
"memory(GiB)": 31.36,
|
|
"step": 210,
|
|
"token_acc": 0.9324250349735418,
|
|
"train_speed(iter/s)": 0.142185
|
|
},
|
|
{
|
|
"epoch": 1.387878787878788,
|
|
"grad_norm": 0.8766697645187378,
|
|
"learning_rate": 5.542926109243727e-06,
|
|
"loss": 0.18625075817108155,
|
|
"memory(GiB)": 31.36,
|
|
"step": 215,
|
|
"token_acc": 0.9340369393139841,
|
|
"train_speed(iter/s)": 0.142509
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"grad_norm": 0.8826522827148438,
|
|
"learning_rate": 5.373650467932122e-06,
|
|
"loss": 0.18408771753311157,
|
|
"memory(GiB)": 31.36,
|
|
"step": 220,
|
|
"token_acc": 0.9386043390740326,
|
|
"train_speed(iter/s)": 0.14291
|
|
},
|
|
{
|
|
"epoch": 1.4202020202020202,
|
|
"eval_loss": 0.3430667817592621,
|
|
"eval_runtime": 5.3043,
|
|
"eval_samples_per_second": 18.853,
|
|
"eval_steps_per_second": 4.713,
|
|
"eval_token_acc": 0.9047619047619048,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.4525252525252526,
|
|
"grad_norm": 1.0129814147949219,
|
|
"learning_rate": 5.2039429303079294e-06,
|
|
"loss": 0.20485594272613525,
|
|
"memory(GiB)": 31.36,
|
|
"step": 225,
|
|
"token_acc": 0.9242819843342036,
|
|
"train_speed(iter/s)": 0.142107
|
|
},
|
|
{
|
|
"epoch": 1.4848484848484849,
|
|
"grad_norm": 0.9322662353515625,
|
|
"learning_rate": 5.033999658469174e-06,
|
|
"loss": 0.1999491572380066,
|
|
"memory(GiB)": 31.36,
|
|
"step": 230,
|
|
"token_acc": 0.935610103166133,
|
|
"train_speed(iter/s)": 0.142426
|
|
},
|
|
{
|
|
"epoch": 1.5171717171717172,
|
|
"grad_norm": 0.9451385140419006,
|
|
"learning_rate": 4.864017086995112e-06,
|
|
"loss": 0.20325605869293212,
|
|
"memory(GiB)": 31.36,
|
|
"step": 235,
|
|
"token_acc": 0.9395159286317778,
|
|
"train_speed(iter/s)": 0.142786
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"grad_norm": 0.7493119835853577,
|
|
"learning_rate": 4.694191695890788e-06,
|
|
"loss": 0.17417298555374144,
|
|
"memory(GiB)": 31.36,
|
|
"step": 240,
|
|
"token_acc": 0.9434940634843713,
|
|
"train_speed(iter/s)": 0.14319
|
|
},
|
|
{
|
|
"epoch": 1.5494949494949495,
|
|
"eval_loss": 0.3442021608352661,
|
|
"eval_runtime": 5.3253,
|
|
"eval_samples_per_second": 18.778,
|
|
"eval_steps_per_second": 4.695,
|
|
"eval_token_acc": 0.9033512866546978,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.5818181818181818,
|
|
"grad_norm": 0.8448814749717712,
|
|
"learning_rate": 4.524719783479088e-06,
|
|
"loss": 0.17924880981445312,
|
|
"memory(GiB)": 31.36,
|
|
"step": 245,
|
|
"token_acc": 0.9297393970362801,
|
|
"train_speed(iter/s)": 0.142181
|
|
},
|
|
{
|
|
"epoch": 1.614141414141414,
|
|
"grad_norm": 0.8817701935768127,
|
|
"learning_rate": 4.355797239502807e-06,
|
|
"loss": 0.1808495044708252,
|
|
"memory(GiB)": 31.37,
|
|
"step": 250,
|
|
"token_acc": 0.932560963270262,
|
|
"train_speed(iter/s)": 0.142404
|
|
},
|
|
{
|
|
"epoch": 1.6464646464646466,
|
|
"grad_norm": 0.892765462398529,
|
|
"learning_rate": 4.187619318698971e-06,
|
|
"loss": 0.1975640058517456,
|
|
"memory(GiB)": 31.37,
|
|
"step": 255,
|
|
"token_acc": 0.9292730844793713,
|
|
"train_speed(iter/s)": 0.142833
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"grad_norm": 0.8926272392272949,
|
|
"learning_rate": 4.020380415107167e-06,
|
|
"loss": 0.19108818769454955,
|
|
"memory(GiB)": 31.37,
|
|
"step": 260,
|
|
"token_acc": 0.9380416838629798,
|
|
"train_speed(iter/s)": 0.14303
|
|
},
|
|
{
|
|
"epoch": 1.6787878787878787,
|
|
"eval_loss": 0.3395140767097473,
|
|
"eval_runtime": 5.3138,
|
|
"eval_samples_per_second": 18.819,
|
|
"eval_steps_per_second": 4.705,
|
|
"eval_token_acc": 0.904719158758656,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.7111111111111112,
|
|
"grad_norm": 0.9119723439216614,
|
|
"learning_rate": 3.854273837372724e-06,
|
|
"loss": 0.1968384265899658,
|
|
"memory(GiB)": 31.37,
|
|
"step": 265,
|
|
"token_acc": 0.9235020131049183,
|
|
"train_speed(iter/s)": 0.14221
|
|
},
|
|
{
|
|
"epoch": 1.7434343434343433,
|
|
"grad_norm": 0.8273991942405701,
|
|
"learning_rate": 3.689491585304491e-06,
|
|
"loss": 0.1967773199081421,
|
|
"memory(GiB)": 31.37,
|
|
"step": 270,
|
|
"token_acc": 0.9207749251850685,
|
|
"train_speed(iter/s)": 0.142463
|
|
},
|
|
{
|
|
"epoch": 1.7757575757575759,
|
|
"grad_norm": 1.0739576816558838,
|
|
"learning_rate": 3.526224127945479e-06,
|
|
"loss": 0.19146767854690552,
|
|
"memory(GiB)": 31.37,
|
|
"step": 275,
|
|
"token_acc": 0.9385914241279716,
|
|
"train_speed(iter/s)": 0.142947
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"grad_norm": 0.7444007992744446,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.18657138347625732,
|
|
"memory(GiB)": 31.37,
|
|
"step": 280,
|
|
"token_acc": 0.9347743648084945,
|
|
"train_speed(iter/s)": 0.143177
|
|
},
|
|
{
|
|
"epoch": 1.808080808080808,
|
|
"eval_loss": 0.33954280614852905,
|
|
"eval_runtime": 5.3308,
|
|
"eval_samples_per_second": 18.759,
|
|
"eval_steps_per_second": 4.69,
|
|
"eval_token_acc": 0.9033512866546978,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.8404040404040405,
|
|
"grad_norm": 0.854715883731842,
|
|
"learning_rate": 3.204986500762006e-06,
|
|
"loss": 0.17565726041793822,
|
|
"memory(GiB)": 31.37,
|
|
"step": 285,
|
|
"token_acc": 0.9216834543608964,
|
|
"train_speed(iter/s)": 0.142226
|
|
},
|
|
{
|
|
"epoch": 1.8727272727272726,
|
|
"grad_norm": 0.9426065683364868,
|
|
"learning_rate": 3.0473876441260786e-06,
|
|
"loss": 0.18239200115203857,
|
|
"memory(GiB)": 31.37,
|
|
"step": 290,
|
|
"token_acc": 0.939308718134809,
|
|
"train_speed(iter/s)": 0.142498
|
|
},
|
|
{
|
|
"epoch": 1.905050505050505,
|
|
"grad_norm": 0.7922055721282959,
|
|
"learning_rate": 2.8920457793817507e-06,
|
|
"loss": 0.19457708597183226,
|
|
"memory(GiB)": 31.37,
|
|
"step": 295,
|
|
"token_acc": 0.9396199182102478,
|
|
"train_speed(iter/s)": 0.142765
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"grad_norm": 1.0278714895248413,
|
|
"learning_rate": 2.7391404635865725e-06,
|
|
"loss": 0.1904490351676941,
|
|
"memory(GiB)": 31.37,
|
|
"step": 300,
|
|
"token_acc": 0.9375734430082256,
|
|
"train_speed(iter/s)": 0.143068
|
|
},
|
|
{
|
|
"epoch": 1.9373737373737374,
|
|
"eval_loss": 0.34051987528800964,
|
|
"eval_runtime": 5.283,
|
|
"eval_samples_per_second": 18.929,
|
|
"eval_steps_per_second": 4.732,
|
|
"eval_token_acc": 0.9058733008463709,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.9696969696969697,
|
|
"grad_norm": 0.7814855575561523,
|
|
"learning_rate": 2.5888484374320033e-06,
|
|
"loss": 0.18078551292419434,
|
|
"memory(GiB)": 31.37,
|
|
"step": 305,
|
|
"token_acc": 0.9328905860866678,
|
|
"train_speed(iter/s)": 0.142327
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.068848967552185,
|
|
"learning_rate": 2.4413434209518137e-06,
|
|
"loss": 0.1775040626525879,
|
|
"memory(GiB)": 31.37,
|
|
"step": 310,
|
|
"token_acc": 0.9349484536082474,
|
|
"train_speed(iter/s)": 0.142606
|
|
},
|
|
{
|
|
"epoch": 2.0323232323232325,
|
|
"grad_norm": 0.8913955092430115,
|
|
"learning_rate": 2.296795912722014e-06,
|
|
"loss": 0.13691856861114501,
|
|
"memory(GiB)": 31.37,
|
|
"step": 315,
|
|
"token_acc": 0.9597564204395023,
|
|
"train_speed(iter/s)": 0.142686
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"grad_norm": 0.7020771503448486,
|
|
"learning_rate": 2.1553729927843894e-06,
|
|
"loss": 0.14630917310714722,
|
|
"memory(GiB)": 31.37,
|
|
"step": 320,
|
|
"token_acc": 0.9601399342732959,
|
|
"train_speed(iter/s)": 0.142875
|
|
},
|
|
{
|
|
"epoch": 2.0646464646464646,
|
|
"eval_loss": 0.3507172465324402,
|
|
"eval_runtime": 5.3226,
|
|
"eval_samples_per_second": 18.788,
|
|
"eval_steps_per_second": 4.697,
|
|
"eval_token_acc": 0.9062152688723605,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.096969696969697,
|
|
"grad_norm": 0.7276548743247986,
|
|
"learning_rate": 2.017238129521506e-06,
|
|
"loss": 0.14279915094375611,
|
|
"memory(GiB)": 31.37,
|
|
"step": 325,
|
|
"token_acc": 0.9453313981615868,
|
|
"train_speed(iter/s)": 0.142166
|
|
},
|
|
{
|
|
"epoch": 2.1292929292929292,
|
|
"grad_norm": 0.9800724387168884,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.14380053281784058,
|
|
"memory(GiB)": 31.37,
|
|
"step": 330,
|
|
"token_acc": 0.9582820773386105,
|
|
"train_speed(iter/s)": 0.142491
|
|
},
|
|
{
|
|
"epoch": 2.1616161616161618,
|
|
"grad_norm": 0.764988899230957,
|
|
"learning_rate": 1.7514672589449378e-06,
|
|
"loss": 0.1322154998779297,
|
|
"memory(GiB)": 31.37,
|
|
"step": 335,
|
|
"token_acc": 0.9540802213001384,
|
|
"train_speed(iter/s)": 0.142717
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"grad_norm": 0.8425831198692322,
|
|
"learning_rate": 1.6241384517255854e-06,
|
|
"loss": 0.1310911536216736,
|
|
"memory(GiB)": 31.37,
|
|
"step": 340,
|
|
"token_acc": 0.9542607139305956,
|
|
"train_speed(iter/s)": 0.142908
|
|
},
|
|
{
|
|
"epoch": 2.193939393939394,
|
|
"eval_loss": 0.3661825954914093,
|
|
"eval_runtime": 5.3319,
|
|
"eval_samples_per_second": 18.755,
|
|
"eval_steps_per_second": 4.689,
|
|
"eval_token_acc": 0.9046764127554073,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 2.2262626262626264,
|
|
"grad_norm": 0.8031105399131775,
|
|
"learning_rate": 1.500711746282192e-06,
|
|
"loss": 0.15711712837219238,
|
|
"memory(GiB)": 31.37,
|
|
"step": 345,
|
|
"token_acc": 0.9412384531628675,
|
|
"train_speed(iter/s)": 0.142295
|
|
},
|
|
{
|
|
"epoch": 2.2585858585858585,
|
|
"grad_norm": 0.7701563835144043,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.1371659517288208,
|
|
"memory(GiB)": 31.37,
|
|
"step": 350,
|
|
"token_acc": 0.9590588494599956,
|
|
"train_speed(iter/s)": 0.142428
|
|
},
|
|
{
|
|
"epoch": 2.290909090909091,
|
|
"grad_norm": 0.7400316596031189,
|
|
"learning_rate": 1.2661306328825818e-06,
|
|
"loss": 0.13530057668685913,
|
|
"memory(GiB)": 31.37,
|
|
"step": 355,
|
|
"token_acc": 0.9558357869007278,
|
|
"train_speed(iter/s)": 0.142645
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"grad_norm": 0.8807271718978882,
|
|
"learning_rate": 1.1552473733031893e-06,
|
|
"loss": 0.13264925479888917,
|
|
"memory(GiB)": 31.37,
|
|
"step": 360,
|
|
"token_acc": 0.9480800203441553,
|
|
"train_speed(iter/s)": 0.142961
|
|
},
|
|
{
|
|
"epoch": 2.323232323232323,
|
|
"eval_loss": 0.3667658269405365,
|
|
"eval_runtime": 5.3077,
|
|
"eval_samples_per_second": 18.841,
|
|
"eval_steps_per_second": 4.71,
|
|
"eval_token_acc": 0.9054458408138839,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 2.3555555555555556,
|
|
"grad_norm": 0.8146414160728455,
|
|
"learning_rate": 1.0488081988375493e-06,
|
|
"loss": 0.14928791522979737,
|
|
"memory(GiB)": 31.37,
|
|
"step": 365,
|
|
"token_acc": 0.9331647539389224,
|
|
"train_speed(iter/s)": 0.142423
|
|
},
|
|
{
|
|
"epoch": 2.3878787878787877,
|
|
"grad_norm": 0.8976642489433289,
|
|
"learning_rate": 9.469361407432431e-07,
|
|
"loss": 0.14599543809890747,
|
|
"memory(GiB)": 31.37,
|
|
"step": 370,
|
|
"token_acc": 0.9549465467503818,
|
|
"train_speed(iter/s)": 0.142603
|
|
},
|
|
{
|
|
"epoch": 2.4202020202020202,
|
|
"grad_norm": 0.7792761921882629,
|
|
"learning_rate": 8.497489512245971e-07,
|
|
"loss": 0.13710694313049315,
|
|
"memory(GiB)": 31.37,
|
|
"step": 375,
|
|
"token_acc": 0.9578516805975458,
|
|
"train_speed(iter/s)": 0.142793
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"grad_norm": 0.9567272663116455,
|
|
"learning_rate": 7.573589673248833e-07,
|
|
"loss": 0.13858846426010132,
|
|
"memory(GiB)": 31.37,
|
|
"step": 380,
|
|
"token_acc": 0.9512100926879505,
|
|
"train_speed(iter/s)": 0.143064
|
|
},
|
|
{
|
|
"epoch": 2.4525252525252528,
|
|
"eval_loss": 0.3684777021408081,
|
|
"eval_runtime": 5.3389,
|
|
"eval_samples_per_second": 18.73,
|
|
"eval_steps_per_second": 4.683,
|
|
"eval_token_acc": 0.905018380781397,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 2.484848484848485,
|
|
"grad_norm": 0.7511286735534668,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.14136791229248047,
|
|
"memory(GiB)": 31.37,
|
|
"step": 385,
|
|
"token_acc": 0.9423156301596037,
|
|
"train_speed(iter/s)": 0.14245
|
|
},
|
|
{
|
|
"epoch": 2.517171717171717,
|
|
"grad_norm": 0.8342220783233643,
|
|
"learning_rate": 5.873921160683943e-07,
|
|
"loss": 0.13304708003997803,
|
|
"memory(GiB)": 31.37,
|
|
"step": 390,
|
|
"token_acc": 0.9584492790285859,
|
|
"train_speed(iter/s)": 0.142715
|
|
},
|
|
{
|
|
"epoch": 2.5494949494949495,
|
|
"grad_norm": 0.7826300263404846,
|
|
"learning_rate": 5.100117105459279e-07,
|
|
"loss": 0.1284404754638672,
|
|
"memory(GiB)": 31.37,
|
|
"step": 395,
|
|
"token_acc": 0.9555216985304958,
|
|
"train_speed(iter/s)": 0.142902
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"grad_norm": 0.7116090655326843,
|
|
"learning_rate": 4.3782120722406565e-07,
|
|
"loss": 0.15817636251449585,
|
|
"memory(GiB)": 31.37,
|
|
"step": 400,
|
|
"token_acc": 0.9559717504022885,
|
|
"train_speed(iter/s)": 0.143144
|
|
},
|
|
{
|
|
"epoch": 2.581818181818182,
|
|
"eval_loss": 0.3664516508579254,
|
|
"eval_runtime": 5.3264,
|
|
"eval_samples_per_second": 18.774,
|
|
"eval_steps_per_second": 4.694,
|
|
"eval_token_acc": 0.9054885868171326,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 2.614141414141414,
|
|
"grad_norm": 0.7377658486366272,
|
|
"learning_rate": 3.709040498955102e-07,
|
|
"loss": 0.13814414739608766,
|
|
"memory(GiB)": 31.37,
|
|
"step": 405,
|
|
"token_acc": 0.9434749034749035,
|
|
"train_speed(iter/s)": 0.142529
|
|
},
|
|
{
|
|
"epoch": 2.6464646464646466,
|
|
"grad_norm": 0.9466362595558167,
|
|
"learning_rate": 3.0933758698072023e-07,
|
|
"loss": 0.13764555454254152,
|
|
"memory(GiB)": 31.37,
|
|
"step": 410,
|
|
"token_acc": 0.9556354916067147,
|
|
"train_speed(iter/s)": 0.142696
|
|
},
|
|
{
|
|
"epoch": 2.6787878787878787,
|
|
"grad_norm": 0.8730055093765259,
|
|
"learning_rate": 2.531929821221768e-07,
|
|
"loss": 0.1323538064956665,
|
|
"memory(GiB)": 31.37,
|
|
"step": 415,
|
|
"token_acc": 0.951093389819949,
|
|
"train_speed(iter/s)": 0.142879
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"grad_norm": 0.8799780011177063,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.1415894865989685,
|
|
"memory(GiB)": 31.37,
|
|
"step": 420,
|
|
"token_acc": 0.957655213984328,
|
|
"train_speed(iter/s)": 0.143168
|
|
},
|
|
{
|
|
"epoch": 2.7111111111111112,
|
|
"eval_loss": 0.3659995198249817,
|
|
"eval_runtime": 5.3257,
|
|
"eval_samples_per_second": 18.777,
|
|
"eval_steps_per_second": 4.694,
|
|
"eval_token_acc": 0.9056595708301274,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 2.7434343434343433,
|
|
"grad_norm": 0.9417216777801514,
|
|
"learning_rate": 1.5742259095662126e-07,
|
|
"loss": 0.13300987482070922,
|
|
"memory(GiB)": 31.37,
|
|
"step": 425,
|
|
"token_acc": 0.9425528606965174,
|
|
"train_speed(iter/s)": 0.142605
|
|
},
|
|
{
|
|
"epoch": 2.775757575757576,
|
|
"grad_norm": 0.7449456453323364,
|
|
"learning_rate": 1.1790750403941231e-07,
|
|
"loss": 0.13196516036987305,
|
|
"memory(GiB)": 31.37,
|
|
"step": 430,
|
|
"token_acc": 0.9565181855333061,
|
|
"train_speed(iter/s)": 0.142783
|
|
},
|
|
{
|
|
"epoch": 2.808080808080808,
|
|
"grad_norm": 0.807500422000885,
|
|
"learning_rate": 8.403554600248498e-08,
|
|
"loss": 0.12061362266540528,
|
|
"memory(GiB)": 31.37,
|
|
"step": 435,
|
|
"token_acc": 0.9600038504115127,
|
|
"train_speed(iter/s)": 0.142955
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"grad_norm": 0.8164981007575989,
|
|
"learning_rate": 5.584586887435739e-08,
|
|
"loss": 0.1402422547340393,
|
|
"memory(GiB)": 31.37,
|
|
"step": 440,
|
|
"token_acc": 0.9568223268439581,
|
|
"train_speed(iter/s)": 0.143151
|
|
},
|
|
{
|
|
"epoch": 2.8404040404040405,
|
|
"eval_loss": 0.3657556176185608,
|
|
"eval_runtime": 5.3232,
|
|
"eval_samples_per_second": 18.786,
|
|
"eval_steps_per_second": 4.696,
|
|
"eval_token_acc": 0.9058733008463709,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 2.8727272727272726,
|
|
"grad_norm": 0.8275712728500366,
|
|
"learning_rate": 3.337105663029361e-08,
|
|
"loss": 0.13185644149780273,
|
|
"memory(GiB)": 31.37,
|
|
"step": 445,
|
|
"token_acc": 0.9404505730973607,
|
|
"train_speed(iter/s)": 0.142606
|
|
},
|
|
{
|
|
"epoch": 2.905050505050505,
|
|
"grad_norm": 0.8103200197219849,
|
|
"learning_rate": 1.6637087529033925e-08,
|
|
"loss": 0.12856653928756714,
|
|
"memory(GiB)": 31.37,
|
|
"step": 450,
|
|
"token_acc": 0.9603827178950516,
|
|
"train_speed(iter/s)": 0.14286
|
|
},
|
|
{
|
|
"epoch": 2.937373737373737,
|
|
"grad_norm": 0.7664045691490173,
|
|
"learning_rate": 5.6633040849601865e-09,
|
|
"loss": 0.12633774280548096,
|
|
"memory(GiB)": 31.37,
|
|
"step": 455,
|
|
"token_acc": 0.9499862901014533,
|
|
"train_speed(iter/s)": 0.142986
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"grad_norm": 0.8392007350921631,
|
|
"learning_rate": 4.623907104084335e-10,
|
|
"loss": 0.15011647939682007,
|
|
"memory(GiB)": 31.37,
|
|
"step": 460,
|
|
"token_acc": 0.9552918794432762,
|
|
"train_speed(iter/s)": 0.143271
|
|
},
|
|
{
|
|
"epoch": 2.9696969696969697,
|
|
"eval_loss": 0.36587560176849365,
|
|
"eval_runtime": 5.3221,
|
|
"eval_samples_per_second": 18.79,
|
|
"eval_steps_per_second": 4.697,
|
|
"eval_token_acc": 0.9063007608788578,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.9826262626262627,
|
|
"eval_loss": 0.36588189005851746,
|
|
"eval_runtime": 5.3146,
|
|
"eval_samples_per_second": 18.816,
|
|
"eval_steps_per_second": 4.704,
|
|
"eval_token_acc": 0.9059587928528683,
|
|
"step": 462
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 462,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.302620480776438e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|