721 lines
20 KiB
JSON
721 lines
20 KiB
JSON
{
|
|
"best_global_step": 180,
|
|
"best_metric": 0.27287108,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v32-20250504-043500/checkpoint-180",
|
|
"epoch": 2.9732620320855614,
|
|
"eval_steps": 20,
|
|
"global_step": 279,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0106951871657754,
|
|
"grad_norm": 3.2918701171875,
|
|
"learning_rate": 9.999683023724021e-06,
|
|
"loss": 0.2989569902420044,
|
|
"memory(GiB)": 28.98,
|
|
"step": 1,
|
|
"token_acc": 0.9019468186134852,
|
|
"train_speed(iter/s)": 0.075727
|
|
},
|
|
{
|
|
"epoch": 0.053475935828877004,
|
|
"grad_norm": 1.4581928253173828,
|
|
"learning_rate": 9.992077602401358e-06,
|
|
"loss": 0.26982036232948303,
|
|
"memory(GiB)": 28.98,
|
|
"step": 5,
|
|
"token_acc": 0.9124155874528606,
|
|
"train_speed(iter/s)": 0.162223
|
|
},
|
|
{
|
|
"epoch": 0.10695187165775401,
|
|
"grad_norm": 0.8813036680221558,
|
|
"learning_rate": 9.968335515358916e-06,
|
|
"loss": 0.26494245529174804,
|
|
"memory(GiB)": 28.98,
|
|
"step": 10,
|
|
"token_acc": 0.9064403726266386,
|
|
"train_speed(iter/s)": 0.196471
|
|
},
|
|
{
|
|
"epoch": 0.16042780748663102,
|
|
"grad_norm": 0.8264817595481873,
|
|
"learning_rate": 9.92884897657402e-06,
|
|
"loss": 0.27532644271850587,
|
|
"memory(GiB)": 28.98,
|
|
"step": 15,
|
|
"token_acc": 0.9036378177940428,
|
|
"train_speed(iter/s)": 0.206122
|
|
},
|
|
{
|
|
"epoch": 0.21390374331550802,
|
|
"grad_norm": 0.7590827345848083,
|
|
"learning_rate": 9.873743117270691e-06,
|
|
"loss": 0.24748692512512208,
|
|
"memory(GiB)": 28.98,
|
|
"step": 20,
|
|
"token_acc": 0.9165032561067131,
|
|
"train_speed(iter/s)": 0.213242
|
|
},
|
|
{
|
|
"epoch": 0.21390374331550802,
|
|
"eval_loss": 0.29857584834098816,
|
|
"eval_runtime": 1.6521,
|
|
"eval_samples_per_second": 36.318,
|
|
"eval_steps_per_second": 9.079,
|
|
"eval_token_acc": 0.9033078880407125,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.26737967914438504,
|
|
"grad_norm": 0.7544880509376526,
|
|
"learning_rate": 9.803192565659898e-06,
|
|
"loss": 0.2472740650177002,
|
|
"memory(GiB)": 28.98,
|
|
"step": 25,
|
|
"token_acc": 0.9127769919849128,
|
|
"train_speed(iter/s)": 0.193549
|
|
},
|
|
{
|
|
"epoch": 0.32085561497326204,
|
|
"grad_norm": 0.7806485891342163,
|
|
"learning_rate": 9.717420893549902e-06,
|
|
"loss": 0.2667980670928955,
|
|
"memory(GiB)": 28.98,
|
|
"step": 30,
|
|
"token_acc": 0.908313332992902,
|
|
"train_speed(iter/s)": 0.199526
|
|
},
|
|
{
|
|
"epoch": 0.37433155080213903,
|
|
"grad_norm": 0.7090319395065308,
|
|
"learning_rate": 9.616699907856368e-06,
|
|
"loss": 0.23824496269226075,
|
|
"memory(GiB)": 28.98,
|
|
"step": 35,
|
|
"token_acc": 0.9139585630821934,
|
|
"train_speed(iter/s)": 0.202496
|
|
},
|
|
{
|
|
"epoch": 0.42780748663101603,
|
|
"grad_norm": 0.6566728949546814,
|
|
"learning_rate": 9.501348789257373e-06,
|
|
"loss": 0.24109985828399658,
|
|
"memory(GiB)": 28.98,
|
|
"step": 40,
|
|
"token_acc": 0.9166758030917662,
|
|
"train_speed(iter/s)": 0.20511
|
|
},
|
|
{
|
|
"epoch": 0.42780748663101603,
|
|
"eval_loss": 0.2843839228153229,
|
|
"eval_runtime": 1.6484,
|
|
"eval_samples_per_second": 36.398,
|
|
"eval_steps_per_second": 9.1,
|
|
"eval_token_acc": 0.9075595065545785,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.48128342245989303,
|
|
"grad_norm": 0.6919281482696533,
|
|
"learning_rate": 9.371733080722911e-06,
|
|
"loss": 0.24000308513641358,
|
|
"memory(GiB)": 28.98,
|
|
"step": 45,
|
|
"token_acc": 0.9120257943391221,
|
|
"train_speed(iter/s)": 0.195603
|
|
},
|
|
{
|
|
"epoch": 0.5347593582887701,
|
|
"grad_norm": 0.6628720164299011,
|
|
"learning_rate": 9.228263529124199e-06,
|
|
"loss": 0.225927734375,
|
|
"memory(GiB)": 28.98,
|
|
"step": 50,
|
|
"token_acc": 0.922932112394543,
|
|
"train_speed(iter/s)": 0.199738
|
|
},
|
|
{
|
|
"epoch": 0.5882352941176471,
|
|
"grad_norm": 0.7601417899131775,
|
|
"learning_rate": 9.071394783593664e-06,
|
|
"loss": 0.24698638916015625,
|
|
"memory(GiB)": 28.98,
|
|
"step": 55,
|
|
"token_acc": 0.916304375460809,
|
|
"train_speed(iter/s)": 0.202796
|
|
},
|
|
{
|
|
"epoch": 0.6417112299465241,
|
|
"grad_norm": 0.7333827018737793,
|
|
"learning_rate": 8.90162395476046e-06,
|
|
"loss": 0.24123883247375488,
|
|
"memory(GiB)": 28.98,
|
|
"step": 60,
|
|
"token_acc": 0.9169203180670583,
|
|
"train_speed(iter/s)": 0.205478
|
|
},
|
|
{
|
|
"epoch": 0.6417112299465241,
|
|
"eval_loss": 0.2792617380619049,
|
|
"eval_runtime": 1.6412,
|
|
"eval_samples_per_second": 36.559,
|
|
"eval_steps_per_second": 9.14,
|
|
"eval_token_acc": 0.908719038876542,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.6951871657754011,
|
|
"grad_norm": 0.732627809047699,
|
|
"learning_rate": 8.719489039427256e-06,
|
|
"loss": 0.2210240602493286,
|
|
"memory(GiB)": 28.98,
|
|
"step": 65,
|
|
"token_acc": 0.9183851177518306,
|
|
"train_speed(iter/s)": 0.19878
|
|
},
|
|
{
|
|
"epoch": 0.7486631016042781,
|
|
"grad_norm": 0.7144444584846497,
|
|
"learning_rate": 8.525567215680397e-06,
|
|
"loss": 0.24620118141174316,
|
|
"memory(GiB)": 28.98,
|
|
"step": 70,
|
|
"token_acc": 0.9128896697452457,
|
|
"train_speed(iter/s)": 0.20038
|
|
},
|
|
{
|
|
"epoch": 0.8021390374331551,
|
|
"grad_norm": 0.736126184463501,
|
|
"learning_rate": 8.320473013836197e-06,
|
|
"loss": 0.23789706230163574,
|
|
"memory(GiB)": 28.98,
|
|
"step": 75,
|
|
"token_acc": 0.9134095303360337,
|
|
"train_speed(iter/s)": 0.202414
|
|
},
|
|
{
|
|
"epoch": 0.8556149732620321,
|
|
"grad_norm": 0.7036953568458557,
|
|
"learning_rate": 8.104856369019525e-06,
|
|
"loss": 0.23406553268432617,
|
|
"memory(GiB)": 28.98,
|
|
"step": 80,
|
|
"token_acc": 0.9200627693460746,
|
|
"train_speed(iter/s)": 0.204729
|
|
},
|
|
{
|
|
"epoch": 0.8556149732620321,
|
|
"eval_loss": 0.2767316699028015,
|
|
"eval_runtime": 1.6471,
|
|
"eval_samples_per_second": 36.427,
|
|
"eval_steps_per_second": 9.107,
|
|
"eval_token_acc": 0.9091699681128611,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 0.6711894273757935,
|
|
"learning_rate": 7.879400561546033e-06,
|
|
"loss": 0.23753652572631836,
|
|
"memory(GiB)": 28.98,
|
|
"step": 85,
|
|
"token_acc": 0.9143226902311286,
|
|
"train_speed(iter/s)": 0.200371
|
|
},
|
|
{
|
|
"epoch": 0.9625668449197861,
|
|
"grad_norm": 0.6814318895339966,
|
|
"learning_rate": 7.644820051634813e-06,
|
|
"loss": 0.23459360599517823,
|
|
"memory(GiB)": 28.98,
|
|
"step": 90,
|
|
"token_acc": 0.9142770409116383,
|
|
"train_speed(iter/s)": 0.201764
|
|
},
|
|
{
|
|
"epoch": 1.0106951871657754,
|
|
"grad_norm": 1.021850824356079,
|
|
"learning_rate": 7.401858215313228e-06,
|
|
"loss": 0.21953530311584474,
|
|
"memory(GiB)": 28.98,
|
|
"step": 95,
|
|
"token_acc": 0.9268375978563548,
|
|
"train_speed(iter/s)": 0.203547
|
|
},
|
|
{
|
|
"epoch": 1.0641711229946524,
|
|
"grad_norm": 0.7571138739585876,
|
|
"learning_rate": 7.151284988688731e-06,
|
|
"loss": 0.19227520227432252,
|
|
"memory(GiB)": 28.98,
|
|
"step": 100,
|
|
"token_acc": 0.9299330505442838,
|
|
"train_speed(iter/s)": 0.205218
|
|
},
|
|
{
|
|
"epoch": 1.0641711229946524,
|
|
"eval_loss": 0.27827945351600647,
|
|
"eval_runtime": 1.6406,
|
|
"eval_samples_per_second": 36.572,
|
|
"eval_steps_per_second": 9.143,
|
|
"eval_token_acc": 0.9097497342738429,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.1176470588235294,
|
|
"grad_norm": 0.6977977156639099,
|
|
"learning_rate": 6.893894428052881e-06,
|
|
"loss": 0.18528327941894532,
|
|
"memory(GiB)": 28.98,
|
|
"step": 105,
|
|
"token_acc": 0.9313939048472141,
|
|
"train_speed(iter/s)": 0.200929
|
|
},
|
|
{
|
|
"epoch": 1.1711229946524064,
|
|
"grad_norm": 0.7503034472465515,
|
|
"learning_rate": 6.6305021935494755e-06,
|
|
"loss": 0.191499924659729,
|
|
"memory(GiB)": 28.98,
|
|
"step": 110,
|
|
"token_acc": 0.934816576879125,
|
|
"train_speed(iter/s)": 0.202263
|
|
},
|
|
{
|
|
"epoch": 1.2245989304812834,
|
|
"grad_norm": 0.6970927715301514,
|
|
"learning_rate": 6.361942964380967e-06,
|
|
"loss": 0.18341017961502076,
|
|
"memory(GiB)": 28.98,
|
|
"step": 115,
|
|
"token_acc": 0.9350552403702598,
|
|
"train_speed(iter/s)": 0.203567
|
|
},
|
|
{
|
|
"epoch": 1.2780748663101604,
|
|
"grad_norm": 0.7112876176834106,
|
|
"learning_rate": 6.089067793744258e-06,
|
|
"loss": 0.19445158243179322,
|
|
"memory(GiB)": 28.98,
|
|
"step": 120,
|
|
"token_acc": 0.9335015519281871,
|
|
"train_speed(iter/s)": 0.204703
|
|
},
|
|
{
|
|
"epoch": 1.2780748663101604,
|
|
"eval_loss": 0.27691978216171265,
|
|
"eval_runtime": 1.6606,
|
|
"eval_samples_per_second": 36.132,
|
|
"eval_steps_per_second": 9.033,
|
|
"eval_token_acc": 0.9094920604245177,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.3315508021390374,
|
|
"grad_norm": 0.6548081636428833,
|
|
"learning_rate": 5.8127414118779825e-06,
|
|
"loss": 0.18807239532470704,
|
|
"memory(GiB)": 31.29,
|
|
"step": 125,
|
|
"token_acc": 0.9327750242123853,
|
|
"train_speed(iter/s)": 0.200918
|
|
},
|
|
{
|
|
"epoch": 1.3850267379679144,
|
|
"grad_norm": 0.709028422832489,
|
|
"learning_rate": 5.533839485767795e-06,
|
|
"loss": 0.19655026197433473,
|
|
"memory(GiB)": 31.29,
|
|
"step": 130,
|
|
"token_acc": 0.9308182054862607,
|
|
"train_speed(iter/s)": 0.201887
|
|
},
|
|
{
|
|
"epoch": 1.4385026737967914,
|
|
"grad_norm": 0.6588287949562073,
|
|
"learning_rate": 5.253245844193564e-06,
|
|
"loss": 0.19113950729370116,
|
|
"memory(GiB)": 31.29,
|
|
"step": 135,
|
|
"token_acc": 0.9270080346573307,
|
|
"train_speed(iter/s)": 0.202901
|
|
},
|
|
{
|
|
"epoch": 1.4919786096256684,
|
|
"grad_norm": 0.6656479239463806,
|
|
"learning_rate": 4.971849676912172e-06,
|
|
"loss": 0.18891613483428954,
|
|
"memory(GiB)": 31.29,
|
|
"step": 140,
|
|
"token_acc": 0.9305257651059378,
|
|
"train_speed(iter/s)": 0.203847
|
|
},
|
|
{
|
|
"epoch": 1.4919786096256684,
|
|
"eval_loss": 0.2746458649635315,
|
|
"eval_runtime": 1.6467,
|
|
"eval_samples_per_second": 36.436,
|
|
"eval_steps_per_second": 9.109,
|
|
"eval_token_acc": 0.9111991496762972,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.5454545454545454,
|
|
"grad_norm": 0.7020911574363708,
|
|
"learning_rate": 4.6905427168515914e-06,
|
|
"loss": 0.19171638488769532,
|
|
"memory(GiB)": 31.29,
|
|
"step": 145,
|
|
"token_acc": 0.9305895351590245,
|
|
"train_speed(iter/s)": 0.201182
|
|
},
|
|
{
|
|
"epoch": 1.5989304812834224,
|
|
"grad_norm": 0.6727572083473206,
|
|
"learning_rate": 4.410216414245771e-06,
|
|
"loss": 0.1821829557418823,
|
|
"memory(GiB)": 31.29,
|
|
"step": 150,
|
|
"token_acc": 0.9352090736503919,
|
|
"train_speed(iter/s)": 0.202227
|
|
},
|
|
{
|
|
"epoch": 1.6524064171122994,
|
|
"grad_norm": 0.6589164733886719,
|
|
"learning_rate": 4.131759111665349e-06,
|
|
"loss": 0.18441460132598878,
|
|
"memory(GiB)": 31.29,
|
|
"step": 155,
|
|
"token_acc": 0.9374578346368156,
|
|
"train_speed(iter/s)": 0.203318
|
|
},
|
|
{
|
|
"epoch": 1.7058823529411766,
|
|
"grad_norm": 0.6176323890686035,
|
|
"learning_rate": 3.856053228896442e-06,
|
|
"loss": 0.18946645259857178,
|
|
"memory(GiB)": 31.29,
|
|
"step": 160,
|
|
"token_acc": 0.9367611881372071,
|
|
"train_speed(iter/s)": 0.20408
|
|
},
|
|
{
|
|
"epoch": 1.7058823529411766,
|
|
"eval_loss": 0.2751389443874359,
|
|
"eval_runtime": 1.6421,
|
|
"eval_samples_per_second": 36.539,
|
|
"eval_steps_per_second": 9.135,
|
|
"eval_token_acc": 0.9112957773697942,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.7593582887700534,
|
|
"grad_norm": 0.6360734701156616,
|
|
"learning_rate": 3.58397246658848e-06,
|
|
"loss": 0.1823675274848938,
|
|
"memory(GiB)": 31.29,
|
|
"step": 165,
|
|
"token_acc": 0.9278697615463836,
|
|
"train_speed(iter/s)": 0.201592
|
|
},
|
|
{
|
|
"epoch": 1.8128342245989306,
|
|
"grad_norm": 0.5981405973434448,
|
|
"learning_rate": 3.316379037532644e-06,
|
|
"loss": 0.18013572692871094,
|
|
"memory(GiB)": 31.29,
|
|
"step": 170,
|
|
"token_acc": 0.9407218114408998,
|
|
"train_speed(iter/s)": 0.202459
|
|
},
|
|
{
|
|
"epoch": 1.8663101604278074,
|
|
"grad_norm": 0.5807086825370789,
|
|
"learning_rate": 3.0541209343448373e-06,
|
|
"loss": 0.1835346221923828,
|
|
"memory(GiB)": 31.29,
|
|
"step": 175,
|
|
"token_acc": 0.9373540226163772,
|
|
"train_speed(iter/s)": 0.203227
|
|
},
|
|
{
|
|
"epoch": 1.9197860962566846,
|
|
"grad_norm": 0.610285758972168,
|
|
"learning_rate": 2.7980292422118282e-06,
|
|
"loss": 0.18963263034820557,
|
|
"memory(GiB)": 31.29,
|
|
"step": 180,
|
|
"token_acc": 0.9329708446611044,
|
|
"train_speed(iter/s)": 0.204063
|
|
},
|
|
{
|
|
"epoch": 1.9197860962566846,
|
|
"eval_loss": 0.27287107706069946,
|
|
"eval_runtime": 1.6399,
|
|
"eval_samples_per_second": 36.589,
|
|
"eval_steps_per_second": 9.147,
|
|
"eval_token_acc": 0.911682288143782,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.9732620320855614,
|
|
"grad_norm": 0.6412176489830017,
|
|
"learning_rate": 2.548915505216333e-06,
|
|
"loss": 0.18783329725265502,
|
|
"memory(GiB)": 31.29,
|
|
"step": 185,
|
|
"token_acc": 0.9265865937289413,
|
|
"train_speed(iter/s)": 0.201898
|
|
},
|
|
{
|
|
"epoch": 2.021390374331551,
|
|
"grad_norm": 0.607214629650116,
|
|
"learning_rate": 2.307569154587056e-06,
|
|
"loss": 0.1662315845489502,
|
|
"memory(GiB)": 31.29,
|
|
"step": 190,
|
|
"token_acc": 0.9465564026359995,
|
|
"train_speed(iter/s)": 0.203071
|
|
},
|
|
{
|
|
"epoch": 2.0748663101604277,
|
|
"grad_norm": 0.6007011532783508,
|
|
"learning_rate": 2.074755007023461e-06,
|
|
"loss": 0.16532043218612671,
|
|
"memory(GiB)": 31.29,
|
|
"step": 195,
|
|
"token_acc": 0.9450870631362545,
|
|
"train_speed(iter/s)": 0.203942
|
|
},
|
|
{
|
|
"epoch": 2.128342245989305,
|
|
"grad_norm": 0.6896679997444153,
|
|
"learning_rate": 1.8512108410229878e-06,
|
|
"loss": 0.15121257305145264,
|
|
"memory(GiB)": 31.29,
|
|
"step": 200,
|
|
"token_acc": 0.9484533555566449,
|
|
"train_speed(iter/s)": 0.204504
|
|
},
|
|
{
|
|
"epoch": 2.128342245989305,
|
|
"eval_loss": 0.28094714879989624,
|
|
"eval_runtime": 1.6463,
|
|
"eval_samples_per_second": 36.446,
|
|
"eval_steps_per_second": 9.111,
|
|
"eval_token_acc": 0.9109736850581377,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 2.1818181818181817,
|
|
"grad_norm": 0.6233195662498474,
|
|
"learning_rate": 1.6376450588911985e-06,
|
|
"loss": 0.15310670137405397,
|
|
"memory(GiB)": 31.29,
|
|
"step": 205,
|
|
"token_acc": 0.9403647217565523,
|
|
"train_speed(iter/s)": 0.202351
|
|
},
|
|
{
|
|
"epoch": 2.235294117647059,
|
|
"grad_norm": 0.6323373913764954,
|
|
"learning_rate": 1.434734441843899e-06,
|
|
"loss": 0.15562598705291747,
|
|
"memory(GiB)": 31.29,
|
|
"step": 210,
|
|
"token_acc": 0.9448852085089503,
|
|
"train_speed(iter/s)": 0.202913
|
|
},
|
|
{
|
|
"epoch": 2.2887700534759357,
|
|
"grad_norm": 0.6409267783164978,
|
|
"learning_rate": 1.2431220053151832e-06,
|
|
"loss": 0.15542089939117432,
|
|
"memory(GiB)": 31.29,
|
|
"step": 215,
|
|
"token_acc": 0.9450054780164817,
|
|
"train_speed(iter/s)": 0.203493
|
|
},
|
|
{
|
|
"epoch": 2.342245989304813,
|
|
"grad_norm": 0.6448594331741333,
|
|
"learning_rate": 1.063414961267859e-06,
|
|
"loss": 0.1522960662841797,
|
|
"memory(GiB)": 31.29,
|
|
"step": 220,
|
|
"token_acc": 0.9481132075471698,
|
|
"train_speed(iter/s)": 0.204302
|
|
},
|
|
{
|
|
"epoch": 2.342245989304813,
|
|
"eval_loss": 0.2818092703819275,
|
|
"eval_runtime": 1.643,
|
|
"eval_samples_per_second": 36.519,
|
|
"eval_steps_per_second": 9.13,
|
|
"eval_token_acc": 0.9113924050632911,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 2.3957219251336896,
|
|
"grad_norm": 0.6113874316215515,
|
|
"learning_rate": 8.961827939636198e-07,
|
|
"loss": 0.16363799571990967,
|
|
"memory(GiB)": 31.29,
|
|
"step": 225,
|
|
"token_acc": 0.9390907965842993,
|
|
"train_speed(iter/s)": 0.202503
|
|
},
|
|
{
|
|
"epoch": 2.449197860962567,
|
|
"grad_norm": 0.597212016582489,
|
|
"learning_rate": 7.41955455290726e-07,
|
|
"loss": 0.15171511173248292,
|
|
"memory(GiB)": 31.29,
|
|
"step": 230,
|
|
"token_acc": 0.9467608786903596,
|
|
"train_speed(iter/s)": 0.20326
|
|
},
|
|
{
|
|
"epoch": 2.502673796791444,
|
|
"grad_norm": 0.6323869228363037,
|
|
"learning_rate": 6.012216853682001e-07,
|
|
"loss": 0.16323232650756836,
|
|
"memory(GiB)": 31.29,
|
|
"step": 235,
|
|
"token_acc": 0.9391786687427014,
|
|
"train_speed(iter/s)": 0.20378
|
|
},
|
|
{
|
|
"epoch": 2.556149732620321,
|
|
"grad_norm": 0.6109181642532349,
|
|
"learning_rate": 4.7442746374839363e-07,
|
|
"loss": 0.1464900016784668,
|
|
"memory(GiB)": 31.3,
|
|
"step": 240,
|
|
"token_acc": 0.9483738659414637,
|
|
"train_speed(iter/s)": 0.20435
|
|
},
|
|
{
|
|
"epoch": 2.556149732620321,
|
|
"eval_loss": 0.28033456206321716,
|
|
"eval_runtime": 1.6424,
|
|
"eval_samples_per_second": 36.532,
|
|
"eval_steps_per_second": 9.133,
|
|
"eval_token_acc": 0.9115856604502851,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 2.6096256684491976,
|
|
"grad_norm": 0.6514647006988525,
|
|
"learning_rate": 3.619745961260623e-07,
|
|
"loss": 0.1541598081588745,
|
|
"memory(GiB)": 31.3,
|
|
"step": 245,
|
|
"token_acc": 0.9415382075569038,
|
|
"train_speed(iter/s)": 0.202522
|
|
},
|
|
{
|
|
"epoch": 2.663101604278075,
|
|
"grad_norm": 0.5899693965911865,
|
|
"learning_rate": 2.6421944103256657e-07,
|
|
"loss": 0.15795296430587769,
|
|
"memory(GiB)": 31.3,
|
|
"step": 250,
|
|
"token_acc": 0.947255862532017,
|
|
"train_speed(iter/s)": 0.203017
|
|
},
|
|
{
|
|
"epoch": 2.716577540106952,
|
|
"grad_norm": 0.612455427646637,
|
|
"learning_rate": 1.814717805502958e-07,
|
|
"loss": 0.15344234704971313,
|
|
"memory(GiB)": 31.3,
|
|
"step": 255,
|
|
"token_acc": 0.9460515010284584,
|
|
"train_speed(iter/s)": 0.203605
|
|
},
|
|
{
|
|
"epoch": 2.770053475935829,
|
|
"grad_norm": 0.6128495931625366,
|
|
"learning_rate": 1.1399383862592928e-07,
|
|
"loss": 0.1595083236694336,
|
|
"memory(GiB)": 31.3,
|
|
"step": 260,
|
|
"token_acc": 0.9440190249702735,
|
|
"train_speed(iter/s)": 0.20408
|
|
},
|
|
{
|
|
"epoch": 2.770053475935829,
|
|
"eval_loss": 0.2802920639514923,
|
|
"eval_runtime": 1.6389,
|
|
"eval_samples_per_second": 36.609,
|
|
"eval_steps_per_second": 9.152,
|
|
"eval_token_acc": 0.9115856604502851,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 2.8235294117647056,
|
|
"grad_norm": 0.5782672166824341,
|
|
"learning_rate": 6.199945009349173e-08,
|
|
"loss": 0.15760741233825684,
|
|
"memory(GiB)": 31.3,
|
|
"step": 265,
|
|
"token_acc": 0.9367169337749707,
|
|
"train_speed(iter/s)": 0.202464
|
|
},
|
|
{
|
|
"epoch": 2.877005347593583,
|
|
"grad_norm": 0.6260784864425659,
|
|
"learning_rate": 2.5653383040524228e-08,
|
|
"loss": 0.14205594062805177,
|
|
"memory(GiB)": 31.3,
|
|
"step": 270,
|
|
"token_acc": 0.9525445321564256,
|
|
"train_speed(iter/s)": 0.202893
|
|
},
|
|
{
|
|
"epoch": 2.93048128342246,
|
|
"grad_norm": 0.6263572573661804,
|
|
"learning_rate": 5.0708166647628345e-09,
|
|
"loss": 0.1594037890434265,
|
|
"memory(GiB)": 31.3,
|
|
"step": 275,
|
|
"token_acc": 0.9494285781334335,
|
|
"train_speed(iter/s)": 0.203622
|
|
},
|
|
{
|
|
"epoch": 2.9732620320855614,
|
|
"eval_loss": 0.27989062666893005,
|
|
"eval_runtime": 1.6446,
|
|
"eval_samples_per_second": 36.483,
|
|
"eval_steps_per_second": 9.121,
|
|
"eval_token_acc": 0.9118433342996103,
|
|
"step": 279
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 279,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.8413396385162854e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|