5905 lines
167 KiB
JSON
5905 lines
167 KiB
JSON
{
|
|
"best_global_step": 1540,
|
|
"best_metric": 0.20705882,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v12-20250430-202042/checkpoint-1540",
|
|
"epoch": 2.9972559780478245,
|
|
"eval_steps": 20,
|
|
"global_step": 2391,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0012544100352802822,
|
|
"grad_norm": 2.5221166610717773,
|
|
"learning_rate": 9.999995684008912e-06,
|
|
"loss": 0.34258008003234863,
|
|
"memory(GiB)": 28.82,
|
|
"step": 1,
|
|
"token_acc": 0.8867170626349892,
|
|
"train_speed(iter/s)": 0.063413
|
|
},
|
|
{
|
|
"epoch": 0.006272050176401411,
|
|
"grad_norm": 1.4420850276947021,
|
|
"learning_rate": 9.999892100595329e-06,
|
|
"loss": 0.32799670100212097,
|
|
"memory(GiB)": 28.86,
|
|
"step": 5,
|
|
"token_acc": 0.8843052006239421,
|
|
"train_speed(iter/s)": 0.122976
|
|
},
|
|
{
|
|
"epoch": 0.012544100352802822,
|
|
"grad_norm": 1.0056296586990356,
|
|
"learning_rate": 9.999568407038233e-06,
|
|
"loss": 0.3203620672225952,
|
|
"memory(GiB)": 28.86,
|
|
"step": 10,
|
|
"token_acc": 0.891963781939332,
|
|
"train_speed(iter/s)": 0.138195
|
|
},
|
|
{
|
|
"epoch": 0.018816150529204233,
|
|
"grad_norm": 0.8973207473754883,
|
|
"learning_rate": 9.999028933299243e-06,
|
|
"loss": 0.2882222652435303,
|
|
"memory(GiB)": 28.86,
|
|
"step": 15,
|
|
"token_acc": 0.9004290263995999,
|
|
"train_speed(iter/s)": 0.14514
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"grad_norm": 0.9302909970283508,
|
|
"learning_rate": 9.99827370266192e-06,
|
|
"loss": 0.2774477481842041,
|
|
"memory(GiB)": 28.86,
|
|
"step": 20,
|
|
"token_acc": 0.9076236168501622,
|
|
"train_speed(iter/s)": 0.146392
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"eval_loss": 0.3028348684310913,
|
|
"eval_runtime": 29.847,
|
|
"eval_samples_per_second": 17.255,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.903765129060793,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.03136025088200706,
|
|
"grad_norm": 0.86534184217453,
|
|
"learning_rate": 9.99730274772184e-06,
|
|
"loss": 0.2807079553604126,
|
|
"memory(GiB)": 28.86,
|
|
"step": 25,
|
|
"token_acc": 0.9082198329790155,
|
|
"train_speed(iter/s)": 0.119433
|
|
},
|
|
{
|
|
"epoch": 0.037632301058408466,
|
|
"grad_norm": 1.1536375284194946,
|
|
"learning_rate": 9.996116110385186e-06,
|
|
"loss": 0.283935022354126,
|
|
"memory(GiB)": 28.86,
|
|
"step": 30,
|
|
"token_acc": 0.9082473717980356,
|
|
"train_speed(iter/s)": 0.125112
|
|
},
|
|
{
|
|
"epoch": 0.04390435123480988,
|
|
"grad_norm": 0.81349778175354,
|
|
"learning_rate": 9.99471384186694e-06,
|
|
"loss": 0.27507519721984863,
|
|
"memory(GiB)": 28.87,
|
|
"step": 35,
|
|
"token_acc": 0.9149028749028749,
|
|
"train_speed(iter/s)": 0.128991
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"grad_norm": 0.8251215815544128,
|
|
"learning_rate": 9.99309600268868e-06,
|
|
"loss": 0.25176520347595216,
|
|
"memory(GiB)": 28.87,
|
|
"step": 40,
|
|
"token_acc": 0.9197068447954133,
|
|
"train_speed(iter/s)": 0.131242
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"eval_loss": 0.2838481664657593,
|
|
"eval_runtime": 29.7701,
|
|
"eval_samples_per_second": 17.299,
|
|
"eval_steps_per_second": 4.333,
|
|
"eval_token_acc": 0.9088420373033275,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0564484515876127,
|
|
"grad_norm": 0.8325265645980835,
|
|
"learning_rate": 9.991262662675962e-06,
|
|
"loss": 0.26733884811401365,
|
|
"memory(GiB)": 28.87,
|
|
"step": 45,
|
|
"token_acc": 0.9162756526424058,
|
|
"train_speed(iter/s)": 0.119295
|
|
},
|
|
{
|
|
"epoch": 0.06272050176401411,
|
|
"grad_norm": 0.8100744485855103,
|
|
"learning_rate": 9.9892139009553e-06,
|
|
"loss": 0.2542246103286743,
|
|
"memory(GiB)": 28.87,
|
|
"step": 50,
|
|
"token_acc": 0.9066385049327594,
|
|
"train_speed(iter/s)": 0.122408
|
|
},
|
|
{
|
|
"epoch": 0.06899255194041552,
|
|
"grad_norm": 0.877231240272522,
|
|
"learning_rate": 9.986949805950763e-06,
|
|
"loss": 0.2703877925872803,
|
|
"memory(GiB)": 28.87,
|
|
"step": 55,
|
|
"token_acc": 0.9160455515225402,
|
|
"train_speed(iter/s)": 0.124536
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"grad_norm": 0.808023989200592,
|
|
"learning_rate": 9.984470475380154e-06,
|
|
"loss": 0.2656998157501221,
|
|
"memory(GiB)": 28.87,
|
|
"step": 60,
|
|
"token_acc": 0.9185868853481338,
|
|
"train_speed(iter/s)": 0.126941
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"eval_loss": 0.27411043643951416,
|
|
"eval_runtime": 29.8492,
|
|
"eval_samples_per_second": 17.253,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.9110513937391785,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.08153665229321834,
|
|
"grad_norm": 0.8025234937667847,
|
|
"learning_rate": 9.981776016250789e-06,
|
|
"loss": 0.25631260871887207,
|
|
"memory(GiB)": 28.87,
|
|
"step": 65,
|
|
"token_acc": 0.9179844371690116,
|
|
"train_speed(iter/s)": 0.119148
|
|
},
|
|
{
|
|
"epoch": 0.08780870246961976,
|
|
"grad_norm": 0.8097490072250366,
|
|
"learning_rate": 9.97886654485488e-06,
|
|
"loss": 0.24328134059906006,
|
|
"memory(GiB)": 28.87,
|
|
"step": 70,
|
|
"token_acc": 0.9217776363588153,
|
|
"train_speed(iter/s)": 0.121225
|
|
},
|
|
{
|
|
"epoch": 0.09408075264602117,
|
|
"grad_norm": 0.7601203918457031,
|
|
"learning_rate": 9.975742186764526e-06,
|
|
"loss": 0.24020836353302003,
|
|
"memory(GiB)": 28.87,
|
|
"step": 75,
|
|
"token_acc": 0.9274766452418747,
|
|
"train_speed(iter/s)": 0.12313
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"grad_norm": 0.8542447090148926,
|
|
"learning_rate": 9.972403076826272e-06,
|
|
"loss": 0.24563825130462646,
|
|
"memory(GiB)": 28.87,
|
|
"step": 80,
|
|
"token_acc": 0.9028298984927715,
|
|
"train_speed(iter/s)": 0.124748
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"eval_loss": 0.26768097281455994,
|
|
"eval_runtime": 29.7152,
|
|
"eval_samples_per_second": 17.331,
|
|
"eval_steps_per_second": 4.341,
|
|
"eval_token_acc": 0.9123132842272074,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.10662485299882399,
|
|
"grad_norm": 0.7627353072166443,
|
|
"learning_rate": 9.96884935915531e-06,
|
|
"loss": 0.2343665599822998,
|
|
"memory(GiB)": 28.87,
|
|
"step": 85,
|
|
"token_acc": 0.9181495379897511,
|
|
"train_speed(iter/s)": 0.119041
|
|
},
|
|
{
|
|
"epoch": 0.1128969031752254,
|
|
"grad_norm": 0.7160388827323914,
|
|
"learning_rate": 9.965081187129248e-06,
|
|
"loss": 0.23817930221557618,
|
|
"memory(GiB)": 28.87,
|
|
"step": 90,
|
|
"token_acc": 0.9190057254573384,
|
|
"train_speed(iter/s)": 0.120099
|
|
},
|
|
{
|
|
"epoch": 0.11916895335162682,
|
|
"grad_norm": 0.8704434633255005,
|
|
"learning_rate": 9.961098723381495e-06,
|
|
"loss": 0.24323635101318358,
|
|
"memory(GiB)": 28.87,
|
|
"step": 95,
|
|
"token_acc": 0.9187486766885454,
|
|
"train_speed(iter/s)": 0.121879
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"grad_norm": 0.7855771780014038,
|
|
"learning_rate": 9.956902139794236e-06,
|
|
"loss": 0.2694889545440674,
|
|
"memory(GiB)": 28.87,
|
|
"step": 100,
|
|
"token_acc": 0.9003986382285332,
|
|
"train_speed(iter/s)": 0.123256
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"eval_loss": 0.26312825083732605,
|
|
"eval_runtime": 29.8426,
|
|
"eval_samples_per_second": 17.257,
|
|
"eval_steps_per_second": 4.323,
|
|
"eval_token_acc": 0.9138979839098482,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.13171305370442962,
|
|
"grad_norm": 0.7281831502914429,
|
|
"learning_rate": 9.95249161749102e-06,
|
|
"loss": 0.2436688184738159,
|
|
"memory(GiB)": 28.87,
|
|
"step": 105,
|
|
"token_acc": 0.9221378504672897,
|
|
"train_speed(iter/s)": 0.118943
|
|
},
|
|
{
|
|
"epoch": 0.13798510388083104,
|
|
"grad_norm": 0.770193338394165,
|
|
"learning_rate": 9.94786734682894e-06,
|
|
"loss": 0.2424685001373291,
|
|
"memory(GiB)": 28.87,
|
|
"step": 110,
|
|
"token_acc": 0.9147483821400076,
|
|
"train_speed(iter/s)": 0.1202
|
|
},
|
|
{
|
|
"epoch": 0.14425715405723247,
|
|
"grad_norm": 0.7887458801269531,
|
|
"learning_rate": 9.943029527390415e-06,
|
|
"loss": 0.2566553592681885,
|
|
"memory(GiB)": 28.87,
|
|
"step": 115,
|
|
"token_acc": 0.917302101047781,
|
|
"train_speed(iter/s)": 0.121648
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"grad_norm": 0.7880879640579224,
|
|
"learning_rate": 9.93797836797458e-06,
|
|
"loss": 0.24933695793151855,
|
|
"memory(GiB)": 28.87,
|
|
"step": 120,
|
|
"token_acc": 0.916873418038744,
|
|
"train_speed(iter/s)": 0.122856
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"eval_loss": 0.2586906850337982,
|
|
"eval_runtime": 29.8223,
|
|
"eval_samples_per_second": 17.269,
|
|
"eval_steps_per_second": 4.326,
|
|
"eval_token_acc": 0.914292062666907,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.15680125441003528,
|
|
"grad_norm": 0.8280410170555115,
|
|
"learning_rate": 9.932714086588276e-06,
|
|
"loss": 0.24312918186187743,
|
|
"memory(GiB)": 28.87,
|
|
"step": 125,
|
|
"token_acc": 0.9201260500102441,
|
|
"train_speed(iter/s)": 0.11913
|
|
},
|
|
{
|
|
"epoch": 0.16307330458643668,
|
|
"grad_norm": 0.7925168871879578,
|
|
"learning_rate": 9.92723691043663e-06,
|
|
"loss": 0.23867030143737794,
|
|
"memory(GiB)": 28.87,
|
|
"step": 130,
|
|
"token_acc": 0.9171843234696182,
|
|
"train_speed(iter/s)": 0.12024
|
|
},
|
|
{
|
|
"epoch": 0.1693453547628381,
|
|
"grad_norm": 0.7491594552993774,
|
|
"learning_rate": 9.921547075913261e-06,
|
|
"loss": 0.235352087020874,
|
|
"memory(GiB)": 28.87,
|
|
"step": 135,
|
|
"token_acc": 0.9219220779220779,
|
|
"train_speed(iter/s)": 0.121219
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"grad_norm": 0.7498698234558105,
|
|
"learning_rate": 9.915644828590074e-06,
|
|
"loss": 0.2597238063812256,
|
|
"memory(GiB)": 28.87,
|
|
"step": 140,
|
|
"token_acc": 0.9098761257799772,
|
|
"train_speed(iter/s)": 0.122367
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"eval_loss": 0.25434646010398865,
|
|
"eval_runtime": 29.7793,
|
|
"eval_samples_per_second": 17.294,
|
|
"eval_steps_per_second": 4.332,
|
|
"eval_token_acc": 0.9158977239855616,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.18188945511564092,
|
|
"grad_norm": 0.7065432667732239,
|
|
"learning_rate": 9.909530423206657e-06,
|
|
"loss": 0.24801239967346192,
|
|
"memory(GiB)": 28.87,
|
|
"step": 145,
|
|
"token_acc": 0.9213938076007041,
|
|
"train_speed(iter/s)": 0.119043
|
|
},
|
|
{
|
|
"epoch": 0.18816150529204234,
|
|
"grad_norm": 0.7023429274559021,
|
|
"learning_rate": 9.903204123659288e-06,
|
|
"loss": 0.22359247207641603,
|
|
"memory(GiB)": 28.87,
|
|
"step": 150,
|
|
"token_acc": 0.922397476340694,
|
|
"train_speed(iter/s)": 0.120053
|
|
},
|
|
{
|
|
"epoch": 0.19443355546844374,
|
|
"grad_norm": 0.779660701751709,
|
|
"learning_rate": 9.896666202989553e-06,
|
|
"loss": 0.2474226951599121,
|
|
"memory(GiB)": 28.87,
|
|
"step": 155,
|
|
"token_acc": 0.9126028777783999,
|
|
"train_speed(iter/s)": 0.121177
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"grad_norm": 0.7874925136566162,
|
|
"learning_rate": 9.889916943372549e-06,
|
|
"loss": 0.2478172779083252,
|
|
"memory(GiB)": 28.87,
|
|
"step": 160,
|
|
"token_acc": 0.9174663613676709,
|
|
"train_speed(iter/s)": 0.122185
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"eval_loss": 0.2529737055301666,
|
|
"eval_runtime": 29.8233,
|
|
"eval_samples_per_second": 17.268,
|
|
"eval_steps_per_second": 4.325,
|
|
"eval_token_acc": 0.9160318784560497,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.20697765582124658,
|
|
"grad_norm": 0.6781697869300842,
|
|
"learning_rate": 9.882956636104714e-06,
|
|
"loss": 0.2371211528778076,
|
|
"memory(GiB)": 28.87,
|
|
"step": 165,
|
|
"token_acc": 0.9221280174789006,
|
|
"train_speed(iter/s)": 0.119331
|
|
},
|
|
{
|
|
"epoch": 0.21324970599764798,
|
|
"grad_norm": 0.7917264103889465,
|
|
"learning_rate": 9.875785581591253e-06,
|
|
"loss": 0.24579255580902098,
|
|
"memory(GiB)": 28.87,
|
|
"step": 170,
|
|
"token_acc": 0.9130986455782112,
|
|
"train_speed(iter/s)": 0.120095
|
|
},
|
|
{
|
|
"epoch": 0.2195217561740494,
|
|
"grad_norm": 0.7744810581207275,
|
|
"learning_rate": 9.868404089333171e-06,
|
|
"loss": 0.22069144248962402,
|
|
"memory(GiB)": 28.87,
|
|
"step": 175,
|
|
"token_acc": 0.9295828549559892,
|
|
"train_speed(iter/s)": 0.120995
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"grad_norm": 0.8563181757926941,
|
|
"learning_rate": 9.860812477913915e-06,
|
|
"loss": 0.23019468784332275,
|
|
"memory(GiB)": 28.87,
|
|
"step": 180,
|
|
"token_acc": 0.9216493698005064,
|
|
"train_speed(iter/s)": 0.121807
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"eval_loss": 0.25059598684310913,
|
|
"eval_runtime": 29.8473,
|
|
"eval_samples_per_second": 17.254,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.9165852656468132,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.23206585652685222,
|
|
"grad_norm": 0.7056512236595154,
|
|
"learning_rate": 9.853011074985628e-06,
|
|
"loss": 0.23237879276275636,
|
|
"memory(GiB)": 28.87,
|
|
"step": 185,
|
|
"token_acc": 0.9226392416106082,
|
|
"train_speed(iter/s)": 0.119324
|
|
},
|
|
{
|
|
"epoch": 0.23833790670325364,
|
|
"grad_norm": 0.8427721858024597,
|
|
"learning_rate": 9.845000217255e-06,
|
|
"loss": 0.23154301643371583,
|
|
"memory(GiB)": 28.87,
|
|
"step": 190,
|
|
"token_acc": 0.9154171704957679,
|
|
"train_speed(iter/s)": 0.120232
|
|
},
|
|
{
|
|
"epoch": 0.24460995687965503,
|
|
"grad_norm": 0.7186033725738525,
|
|
"learning_rate": 9.836780250468744e-06,
|
|
"loss": 0.23792126178741455,
|
|
"memory(GiB)": 28.87,
|
|
"step": 195,
|
|
"token_acc": 0.9164362118445082,
|
|
"train_speed(iter/s)": 0.120955
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"grad_norm": 0.7176758050918579,
|
|
"learning_rate": 9.82835152939867e-06,
|
|
"loss": 0.2266530990600586,
|
|
"memory(GiB)": 28.87,
|
|
"step": 200,
|
|
"token_acc": 0.9176911913693492,
|
|
"train_speed(iter/s)": 0.121635
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"eval_loss": 0.2479380965232849,
|
|
"eval_runtime": 29.8661,
|
|
"eval_samples_per_second": 17.244,
|
|
"eval_steps_per_second": 4.319,
|
|
"eval_token_acc": 0.9172476533448483,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.2571540572324579,
|
|
"grad_norm": 0.7367855906486511,
|
|
"learning_rate": 9.81971441782637e-06,
|
|
"loss": 0.2414193868637085,
|
|
"memory(GiB)": 28.87,
|
|
"step": 205,
|
|
"token_acc": 0.9207727327666089,
|
|
"train_speed(iter/s)": 0.119447
|
|
},
|
|
{
|
|
"epoch": 0.26342610740885924,
|
|
"grad_norm": 0.7605990767478943,
|
|
"learning_rate": 9.810869288527528e-06,
|
|
"loss": 0.22583391666412353,
|
|
"memory(GiB)": 28.87,
|
|
"step": 210,
|
|
"token_acc": 0.916309738503156,
|
|
"train_speed(iter/s)": 0.120034
|
|
},
|
|
{
|
|
"epoch": 0.26969815758526067,
|
|
"grad_norm": 0.8129069209098816,
|
|
"learning_rate": 9.801816523255811e-06,
|
|
"loss": 0.22113454341888428,
|
|
"memory(GiB)": 28.87,
|
|
"step": 215,
|
|
"token_acc": 0.9300042680324371,
|
|
"train_speed(iter/s)": 0.120689
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"grad_norm": 0.7405229210853577,
|
|
"learning_rate": 9.792556512726419e-06,
|
|
"loss": 0.23366448879241944,
|
|
"memory(GiB)": 28.87,
|
|
"step": 220,
|
|
"token_acc": 0.9338177623990773,
|
|
"train_speed(iter/s)": 0.121495
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"eval_loss": 0.24686363339424133,
|
|
"eval_runtime": 29.7985,
|
|
"eval_samples_per_second": 17.283,
|
|
"eval_steps_per_second": 4.329,
|
|
"eval_token_acc": 0.9180903111126018,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.2822422579380635,
|
|
"grad_norm": 0.8519447445869446,
|
|
"learning_rate": 9.783089656599196e-06,
|
|
"loss": 0.22918324470520018,
|
|
"memory(GiB)": 28.87,
|
|
"step": 225,
|
|
"token_acc": 0.9249420787728689,
|
|
"train_speed(iter/s)": 0.119596
|
|
},
|
|
{
|
|
"epoch": 0.28851430811446493,
|
|
"grad_norm": 0.7695819139480591,
|
|
"learning_rate": 9.773416363461401e-06,
|
|
"loss": 0.2181222677230835,
|
|
"memory(GiB)": 28.87,
|
|
"step": 230,
|
|
"token_acc": 0.9278282801744522,
|
|
"train_speed(iter/s)": 0.12021
|
|
},
|
|
{
|
|
"epoch": 0.2947863582908663,
|
|
"grad_norm": 0.7810352444648743,
|
|
"learning_rate": 9.763537050810064e-06,
|
|
"loss": 0.2256471872329712,
|
|
"memory(GiB)": 28.87,
|
|
"step": 235,
|
|
"token_acc": 0.9293833370318317,
|
|
"train_speed(iter/s)": 0.120752
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"grad_norm": 0.6736721992492676,
|
|
"learning_rate": 9.753452145033961e-06,
|
|
"loss": 0.2320047378540039,
|
|
"memory(GiB)": 28.87,
|
|
"step": 240,
|
|
"token_acc": 0.9163366778813231,
|
|
"train_speed(iter/s)": 0.121219
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"eval_loss": 0.2439332753419876,
|
|
"eval_runtime": 29.7876,
|
|
"eval_samples_per_second": 17.289,
|
|
"eval_steps_per_second": 4.331,
|
|
"eval_token_acc": 0.9183376584175642,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.30733045864366915,
|
|
"grad_norm": 0.7435624599456787,
|
|
"learning_rate": 9.743162081395227e-06,
|
|
"loss": 0.22847986221313477,
|
|
"memory(GiB)": 28.87,
|
|
"step": 245,
|
|
"token_acc": 0.9244482253414623,
|
|
"train_speed(iter/s)": 0.119441
|
|
},
|
|
{
|
|
"epoch": 0.31360250882007057,
|
|
"grad_norm": 0.7495070099830627,
|
|
"learning_rate": 9.73266730401056e-06,
|
|
"loss": 0.23787951469421387,
|
|
"memory(GiB)": 28.87,
|
|
"step": 250,
|
|
"token_acc": 0.9102755453501722,
|
|
"train_speed(iter/s)": 0.120195
|
|
},
|
|
{
|
|
"epoch": 0.319874558996472,
|
|
"grad_norm": 0.762973427772522,
|
|
"learning_rate": 9.72196826583205e-06,
|
|
"loss": 0.22795605659484863,
|
|
"memory(GiB)": 28.87,
|
|
"step": 255,
|
|
"token_acc": 0.9216171142676405,
|
|
"train_speed(iter/s)": 0.120779
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"grad_norm": 0.7174361944198608,
|
|
"learning_rate": 9.711065428627638e-06,
|
|
"loss": 0.22773213386535646,
|
|
"memory(GiB)": 28.87,
|
|
"step": 260,
|
|
"token_acc": 0.9152317529383672,
|
|
"train_speed(iter/s)": 0.121245
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"eval_loss": 0.24210233986377716,
|
|
"eval_runtime": 29.8271,
|
|
"eval_samples_per_second": 17.266,
|
|
"eval_steps_per_second": 4.325,
|
|
"eval_token_acc": 0.9190168154244103,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.3324186593492748,
|
|
"grad_norm": 0.6918441653251648,
|
|
"learning_rate": 9.699959262961182e-06,
|
|
"loss": 0.23561110496520996,
|
|
"memory(GiB)": 28.87,
|
|
"step": 265,
|
|
"token_acc": 0.9212337133550489,
|
|
"train_speed(iter/s)": 0.119577
|
|
},
|
|
{
|
|
"epoch": 0.3386907095256762,
|
|
"grad_norm": 0.7615500688552856,
|
|
"learning_rate": 9.688650248172145e-06,
|
|
"loss": 0.2438591480255127,
|
|
"memory(GiB)": 28.87,
|
|
"step": 270,
|
|
"token_acc": 0.9079355783308931,
|
|
"train_speed(iter/s)": 0.120126
|
|
},
|
|
{
|
|
"epoch": 0.3449627597020776,
|
|
"grad_norm": 0.7297282218933105,
|
|
"learning_rate": 9.677138872354916e-06,
|
|
"loss": 0.22617642879486083,
|
|
"memory(GiB)": 28.87,
|
|
"step": 275,
|
|
"token_acc": 0.9269484485707306,
|
|
"train_speed(iter/s)": 0.120692
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"grad_norm": 0.8113409876823425,
|
|
"learning_rate": 9.665425632337731e-06,
|
|
"loss": 0.24270424842834473,
|
|
"memory(GiB)": 28.87,
|
|
"step": 280,
|
|
"token_acc": 0.9174018389662028,
|
|
"train_speed(iter/s)": 0.121257
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"eval_loss": 0.2409682720899582,
|
|
"eval_runtime": 29.9049,
|
|
"eval_samples_per_second": 17.221,
|
|
"eval_steps_per_second": 4.314,
|
|
"eval_token_acc": 0.9193647785822388,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.3575068600548804,
|
|
"grad_norm": 0.7579072117805481,
|
|
"learning_rate": 9.653511033661242e-06,
|
|
"loss": 0.23863134384155274,
|
|
"memory(GiB)": 28.87,
|
|
"step": 285,
|
|
"token_acc": 0.9221180005684817,
|
|
"train_speed(iter/s)": 0.119665
|
|
},
|
|
{
|
|
"epoch": 0.36377891023128184,
|
|
"grad_norm": 0.7347179055213928,
|
|
"learning_rate": 9.641395590556689e-06,
|
|
"loss": 0.21491737365722657,
|
|
"memory(GiB)": 28.87,
|
|
"step": 290,
|
|
"token_acc": 0.9329900246145874,
|
|
"train_speed(iter/s)": 0.120176
|
|
},
|
|
{
|
|
"epoch": 0.37005096040768326,
|
|
"grad_norm": 0.7506256699562073,
|
|
"learning_rate": 9.629079825923712e-06,
|
|
"loss": 0.22804722785949708,
|
|
"memory(GiB)": 28.87,
|
|
"step": 295,
|
|
"token_acc": 0.9276003625313216,
|
|
"train_speed(iter/s)": 0.120703
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"grad_norm": 0.7653704285621643,
|
|
"learning_rate": 9.616564271307779e-06,
|
|
"loss": 0.2438521385192871,
|
|
"memory(GiB)": 28.87,
|
|
"step": 300,
|
|
"token_acc": 0.9164976705946835,
|
|
"train_speed(iter/s)": 0.121274
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"eval_loss": 0.23974527418613434,
|
|
"eval_runtime": 29.9433,
|
|
"eval_samples_per_second": 17.199,
|
|
"eval_steps_per_second": 4.308,
|
|
"eval_token_acc": 0.9196456645048233,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.3825950607604861,
|
|
"grad_norm": 0.7230799198150635,
|
|
"learning_rate": 9.603849466877249e-06,
|
|
"loss": 0.21197593212127686,
|
|
"memory(GiB)": 28.87,
|
|
"step": 305,
|
|
"token_acc": 0.9271648369422373,
|
|
"train_speed(iter/s)": 0.11978
|
|
},
|
|
{
|
|
"epoch": 0.3888671109368875,
|
|
"grad_norm": 0.7244482636451721,
|
|
"learning_rate": 9.59093596140005e-06,
|
|
"loss": 0.22695040702819824,
|
|
"memory(GiB)": 28.87,
|
|
"step": 310,
|
|
"token_acc": 0.9223127159030737,
|
|
"train_speed(iter/s)": 0.120294
|
|
},
|
|
{
|
|
"epoch": 0.3951391611132889,
|
|
"grad_norm": 0.7103644609451294,
|
|
"learning_rate": 9.577824312220006e-06,
|
|
"loss": 0.2294787883758545,
|
|
"memory(GiB)": 28.87,
|
|
"step": 315,
|
|
"token_acc": 0.9219038140678512,
|
|
"train_speed(iter/s)": 0.120646
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"grad_norm": 0.6850073337554932,
|
|
"learning_rate": 9.564515085232772e-06,
|
|
"loss": 0.22310760021209716,
|
|
"memory(GiB)": 28.87,
|
|
"step": 320,
|
|
"token_acc": 0.916691517336731,
|
|
"train_speed(iter/s)": 0.121107
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"eval_loss": 0.23897218704223633,
|
|
"eval_runtime": 29.8253,
|
|
"eval_samples_per_second": 17.267,
|
|
"eval_steps_per_second": 4.325,
|
|
"eval_token_acc": 0.9197756266481086,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.40768326146609174,
|
|
"grad_norm": 0.7237871289253235,
|
|
"learning_rate": 9.55100885486142e-06,
|
|
"loss": 0.2445456027984619,
|
|
"memory(GiB)": 28.87,
|
|
"step": 325,
|
|
"token_acc": 0.9257608045461343,
|
|
"train_speed(iter/s)": 0.119765
|
|
},
|
|
{
|
|
"epoch": 0.41395531164249316,
|
|
"grad_norm": 0.8139222860336304,
|
|
"learning_rate": 9.537306204031628e-06,
|
|
"loss": 0.2413849115371704,
|
|
"memory(GiB)": 28.87,
|
|
"step": 330,
|
|
"token_acc": 0.9230007008724653,
|
|
"train_speed(iter/s)": 0.120187
|
|
},
|
|
{
|
|
"epoch": 0.42022736181889453,
|
|
"grad_norm": 0.7721692323684692,
|
|
"learning_rate": 9.523407724146548e-06,
|
|
"loss": 0.22532095909118652,
|
|
"memory(GiB)": 28.87,
|
|
"step": 335,
|
|
"token_acc": 0.9216260855666258,
|
|
"train_speed(iter/s)": 0.120724
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"grad_norm": 0.7195901274681091,
|
|
"learning_rate": 9.509314015061263e-06,
|
|
"loss": 0.21710624694824218,
|
|
"memory(GiB)": 28.87,
|
|
"step": 340,
|
|
"token_acc": 0.926438437202603,
|
|
"train_speed(iter/s)": 0.121078
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"eval_loss": 0.2361544817686081,
|
|
"eval_runtime": 29.729,
|
|
"eval_samples_per_second": 17.323,
|
|
"eval_steps_per_second": 4.339,
|
|
"eval_token_acc": 0.9209872092097044,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.4327714621716974,
|
|
"grad_norm": 0.7148153185844421,
|
|
"learning_rate": 9.495025685056898e-06,
|
|
"loss": 0.21872997283935547,
|
|
"memory(GiB)": 28.87,
|
|
"step": 345,
|
|
"token_acc": 0.9274190386575708,
|
|
"train_speed(iter/s)": 0.119764
|
|
},
|
|
{
|
|
"epoch": 0.4390435123480988,
|
|
"grad_norm": 0.7195214033126831,
|
|
"learning_rate": 9.480543350814376e-06,
|
|
"loss": 0.22351717948913574,
|
|
"memory(GiB)": 28.87,
|
|
"step": 350,
|
|
"token_acc": 0.9214563773757728,
|
|
"train_speed(iter/s)": 0.120266
|
|
},
|
|
{
|
|
"epoch": 0.4453155625245002,
|
|
"grad_norm": 0.7162268161773682,
|
|
"learning_rate": 9.465867637387793e-06,
|
|
"loss": 0.24704561233520508,
|
|
"memory(GiB)": 28.87,
|
|
"step": 355,
|
|
"token_acc": 0.9105946522795992,
|
|
"train_speed(iter/s)": 0.120763
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"grad_norm": 0.6543457508087158,
|
|
"learning_rate": 9.450999178177445e-06,
|
|
"loss": 0.22001304626464843,
|
|
"memory(GiB)": 28.87,
|
|
"step": 360,
|
|
"token_acc": 0.9289104914584577,
|
|
"train_speed(iter/s)": 0.121125
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"eval_loss": 0.2345624417066574,
|
|
"eval_runtime": 29.6851,
|
|
"eval_samples_per_second": 17.349,
|
|
"eval_steps_per_second": 4.346,
|
|
"eval_token_acc": 0.9206140920886593,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.457859662877303,
|
|
"grad_norm": 0.7254881262779236,
|
|
"learning_rate": 9.435938614902494e-06,
|
|
"loss": 0.20390102863311768,
|
|
"memory(GiB)": 28.87,
|
|
"step": 365,
|
|
"token_acc": 0.9266752382392981,
|
|
"train_speed(iter/s)": 0.119898
|
|
},
|
|
{
|
|
"epoch": 0.46413171305370443,
|
|
"grad_norm": 0.7083726525306702,
|
|
"learning_rate": 9.42068659757326e-06,
|
|
"loss": 0.231141996383667,
|
|
"memory(GiB)": 28.87,
|
|
"step": 370,
|
|
"token_acc": 0.9151756790431099,
|
|
"train_speed(iter/s)": 0.12026
|
|
},
|
|
{
|
|
"epoch": 0.47040376323010585,
|
|
"grad_norm": 0.7314102053642273,
|
|
"learning_rate": 9.405243784463181e-06,
|
|
"loss": 0.22621698379516603,
|
|
"memory(GiB)": 28.87,
|
|
"step": 375,
|
|
"token_acc": 0.9306795666829633,
|
|
"train_speed(iter/s)": 0.120718
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"grad_norm": 0.7966891527175903,
|
|
"learning_rate": 9.389610842080394e-06,
|
|
"loss": 0.23102831840515137,
|
|
"memory(GiB)": 28.87,
|
|
"step": 380,
|
|
"token_acc": 0.9184476940382452,
|
|
"train_speed(iter/s)": 0.121124
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"eval_loss": 0.23356294631958008,
|
|
"eval_runtime": 29.8585,
|
|
"eval_samples_per_second": 17.248,
|
|
"eval_steps_per_second": 4.32,
|
|
"eval_token_acc": 0.9207440542319447,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.48294786358290864,
|
|
"grad_norm": 0.7005392909049988,
|
|
"learning_rate": 9.373788445138972e-06,
|
|
"loss": 0.21558718681335448,
|
|
"memory(GiB)": 28.87,
|
|
"step": 385,
|
|
"token_acc": 0.9273667904013094,
|
|
"train_speed(iter/s)": 0.119963
|
|
},
|
|
{
|
|
"epoch": 0.48921991375931007,
|
|
"grad_norm": 0.7006497383117676,
|
|
"learning_rate": 9.357777276529793e-06,
|
|
"loss": 0.21882824897766112,
|
|
"memory(GiB)": 28.87,
|
|
"step": 390,
|
|
"token_acc": 0.9215467262379661,
|
|
"train_speed(iter/s)": 0.120425
|
|
},
|
|
{
|
|
"epoch": 0.4954919639357115,
|
|
"grad_norm": 0.7023485898971558,
|
|
"learning_rate": 9.341578027291085e-06,
|
|
"loss": 0.2044372081756592,
|
|
"memory(GiB)": 28.87,
|
|
"step": 395,
|
|
"token_acc": 0.93374081873748,
|
|
"train_speed(iter/s)": 0.120782
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"grad_norm": 0.7891673445701599,
|
|
"learning_rate": 9.325191396578589e-06,
|
|
"loss": 0.2204671859741211,
|
|
"memory(GiB)": 28.87,
|
|
"step": 400,
|
|
"token_acc": 0.9241170268845545,
|
|
"train_speed(iter/s)": 0.121159
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"eval_loss": 0.23371295630931854,
|
|
"eval_runtime": 29.8363,
|
|
"eval_samples_per_second": 17.261,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9213351723675329,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.5080360642885143,
|
|
"grad_norm": 0.696616530418396,
|
|
"learning_rate": 9.308618091635382e-06,
|
|
"loss": 0.22419328689575196,
|
|
"memory(GiB)": 28.87,
|
|
"step": 405,
|
|
"token_acc": 0.9280618250678131,
|
|
"train_speed(iter/s)": 0.120013
|
|
},
|
|
{
|
|
"epoch": 0.5143081144649158,
|
|
"grad_norm": 0.7773451209068298,
|
|
"learning_rate": 9.291858827761359e-06,
|
|
"loss": 0.2144181489944458,
|
|
"memory(GiB)": 28.87,
|
|
"step": 410,
|
|
"token_acc": 0.930534196094362,
|
|
"train_speed(iter/s)": 0.12032
|
|
},
|
|
{
|
|
"epoch": 0.5205801646413172,
|
|
"grad_norm": 0.786472737789154,
|
|
"learning_rate": 9.274914328282359e-06,
|
|
"loss": 0.23719301223754882,
|
|
"memory(GiB)": 28.87,
|
|
"step": 415,
|
|
"token_acc": 0.9227249618708694,
|
|
"train_speed(iter/s)": 0.120643
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"grad_norm": 0.8163383603096008,
|
|
"learning_rate": 9.257785324518943e-06,
|
|
"loss": 0.22335872650146485,
|
|
"memory(GiB)": 28.87,
|
|
"step": 420,
|
|
"token_acc": 0.9292451461241548,
|
|
"train_speed(iter/s)": 0.120974
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"eval_loss": 0.2324906587600708,
|
|
"eval_runtime": 29.851,
|
|
"eval_samples_per_second": 17.252,
|
|
"eval_steps_per_second": 4.321,
|
|
"eval_token_acc": 0.9215112501100486,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.5331242649941199,
|
|
"grad_norm": 0.690645158290863,
|
|
"learning_rate": 9.240472555754835e-06,
|
|
"loss": 0.21186673641204834,
|
|
"memory(GiB)": 28.87,
|
|
"step": 425,
|
|
"token_acc": 0.9271501893311638,
|
|
"train_speed(iter/s)": 0.120028
|
|
},
|
|
{
|
|
"epoch": 0.5393963151705213,
|
|
"grad_norm": 0.7793926000595093,
|
|
"learning_rate": 9.222976769205013e-06,
|
|
"loss": 0.21735620498657227,
|
|
"memory(GiB)": 28.87,
|
|
"step": 430,
|
|
"token_acc": 0.929071782480291,
|
|
"train_speed(iter/s)": 0.120349
|
|
},
|
|
{
|
|
"epoch": 0.5456683653469228,
|
|
"grad_norm": 0.6697238683700562,
|
|
"learning_rate": 9.205298719983458e-06,
|
|
"loss": 0.2206124782562256,
|
|
"memory(GiB)": 31.15,
|
|
"step": 435,
|
|
"token_acc": 0.9281159722041485,
|
|
"train_speed(iter/s)": 0.120617
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"grad_norm": 1.481563687324524,
|
|
"learning_rate": 9.187439171070563e-06,
|
|
"loss": 0.22309460639953613,
|
|
"memory(GiB)": 31.15,
|
|
"step": 440,
|
|
"token_acc": 0.9234951798730308,
|
|
"train_speed(iter/s)": 0.120931
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"eval_loss": 0.2316710203886032,
|
|
"eval_runtime": 29.9096,
|
|
"eval_samples_per_second": 17.219,
|
|
"eval_steps_per_second": 4.313,
|
|
"eval_token_acc": 0.9212806721138972,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.5582124656997256,
|
|
"grad_norm": 0.8089612722396851,
|
|
"learning_rate": 9.169398893280208e-06,
|
|
"loss": 0.20908033847808838,
|
|
"memory(GiB)": 31.15,
|
|
"step": 445,
|
|
"token_acc": 0.9294068842705951,
|
|
"train_speed(iter/s)": 0.119914
|
|
},
|
|
{
|
|
"epoch": 0.564484515876127,
|
|
"grad_norm": 0.6420107483863831,
|
|
"learning_rate": 9.151178665226486e-06,
|
|
"loss": 0.22311244010925294,
|
|
"memory(GiB)": 31.15,
|
|
"step": 450,
|
|
"token_acc": 0.9273165858531712,
|
|
"train_speed(iter/s)": 0.120233
|
|
},
|
|
{
|
|
"epoch": 0.5707565660525284,
|
|
"grad_norm": 0.6611379981040955,
|
|
"learning_rate": 9.132779273290103e-06,
|
|
"loss": 0.21406009197235107,
|
|
"memory(GiB)": 31.15,
|
|
"step": 455,
|
|
"token_acc": 0.9297352623862347,
|
|
"train_speed(iter/s)": 0.120453
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"grad_norm": 0.665287435054779,
|
|
"learning_rate": 9.114201511584428e-06,
|
|
"loss": 0.22191643714904785,
|
|
"memory(GiB)": 31.15,
|
|
"step": 460,
|
|
"token_acc": 0.9259487069772417,
|
|
"train_speed(iter/s)": 0.120712
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"eval_loss": 0.23122188448905945,
|
|
"eval_runtime": 29.7667,
|
|
"eval_samples_per_second": 17.301,
|
|
"eval_steps_per_second": 4.334,
|
|
"eval_token_acc": 0.9220562526464066,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.5833006664053313,
|
|
"grad_norm": 0.8055879473686218,
|
|
"learning_rate": 9.095446181921237e-06,
|
|
"loss": 0.22888469696044922,
|
|
"memory(GiB)": 31.15,
|
|
"step": 465,
|
|
"token_acc": 0.9251080172827653,
|
|
"train_speed(iter/s)": 0.119856
|
|
},
|
|
{
|
|
"epoch": 0.5895727165817326,
|
|
"grad_norm": 0.6367520689964294,
|
|
"learning_rate": 9.07651409377609e-06,
|
|
"loss": 0.22404332160949708,
|
|
"memory(GiB)": 31.15,
|
|
"step": 470,
|
|
"token_acc": 0.918756183745583,
|
|
"train_speed(iter/s)": 0.120217
|
|
},
|
|
{
|
|
"epoch": 0.595844766758134,
|
|
"grad_norm": 0.7680770754814148,
|
|
"learning_rate": 9.057406064253404e-06,
|
|
"loss": 0.2318411111831665,
|
|
"memory(GiB)": 31.15,
|
|
"step": 475,
|
|
"token_acc": 0.9213946117274168,
|
|
"train_speed(iter/s)": 0.120561
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"grad_norm": 0.685991644859314,
|
|
"learning_rate": 9.038122918051184e-06,
|
|
"loss": 0.21981484889984132,
|
|
"memory(GiB)": 31.15,
|
|
"step": 480,
|
|
"token_acc": 0.9215212527964206,
|
|
"train_speed(iter/s)": 0.120844
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"eval_loss": 0.22933758795261383,
|
|
"eval_runtime": 29.8714,
|
|
"eval_samples_per_second": 17.241,
|
|
"eval_steps_per_second": 4.319,
|
|
"eval_token_acc": 0.9229072950685655,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.6083888671109369,
|
|
"grad_norm": 0.7308952808380127,
|
|
"learning_rate": 9.018665487425426e-06,
|
|
"loss": 0.21712393760681153,
|
|
"memory(GiB)": 31.15,
|
|
"step": 485,
|
|
"token_acc": 0.9292804826355687,
|
|
"train_speed(iter/s)": 0.119914
|
|
},
|
|
{
|
|
"epoch": 0.6146609172873383,
|
|
"grad_norm": 0.7039359211921692,
|
|
"learning_rate": 8.999034612154204e-06,
|
|
"loss": 0.22841830253601075,
|
|
"memory(GiB)": 31.15,
|
|
"step": 490,
|
|
"token_acc": 0.9264894745658607,
|
|
"train_speed(iter/s)": 0.12024
|
|
},
|
|
{
|
|
"epoch": 0.6209329674637397,
|
|
"grad_norm": 0.7186778783798218,
|
|
"learning_rate": 8.979231139501417e-06,
|
|
"loss": 0.21591267585754395,
|
|
"memory(GiB)": 31.15,
|
|
"step": 495,
|
|
"token_acc": 0.9271491015055853,
|
|
"train_speed(iter/s)": 0.120545
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"grad_norm": 0.8200941681861877,
|
|
"learning_rate": 8.95925592418023e-06,
|
|
"loss": 0.2159876823425293,
|
|
"memory(GiB)": 31.15,
|
|
"step": 500,
|
|
"token_acc": 0.9208719988216232,
|
|
"train_speed(iter/s)": 0.120825
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"eval_loss": 0.22962290048599243,
|
|
"eval_runtime": 29.8717,
|
|
"eval_samples_per_second": 17.24,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9228486024877269,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.6334770678165426,
|
|
"grad_norm": 0.7458428740501404,
|
|
"learning_rate": 8.939109828316184e-06,
|
|
"loss": 0.21893787384033203,
|
|
"memory(GiB)": 31.15,
|
|
"step": 505,
|
|
"token_acc": 0.92739813330044,
|
|
"train_speed(iter/s)": 0.119977
|
|
},
|
|
{
|
|
"epoch": 0.639749117992944,
|
|
"grad_norm": 0.6861876249313354,
|
|
"learning_rate": 8.918793721409973e-06,
|
|
"loss": 0.2168494701385498,
|
|
"memory(GiB)": 31.15,
|
|
"step": 510,
|
|
"token_acc": 0.9236680466488081,
|
|
"train_speed(iter/s)": 0.120232
|
|
},
|
|
{
|
|
"epoch": 0.6460211681693454,
|
|
"grad_norm": 0.6782411932945251,
|
|
"learning_rate": 8.898308480299937e-06,
|
|
"loss": 0.21742620468139648,
|
|
"memory(GiB)": 31.15,
|
|
"step": 515,
|
|
"token_acc": 0.9275666477595008,
|
|
"train_speed(iter/s)": 0.120504
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"grad_norm": 0.6103708744049072,
|
|
"learning_rate": 8.877654989124202e-06,
|
|
"loss": 0.20578155517578126,
|
|
"memory(GiB)": 31.15,
|
|
"step": 520,
|
|
"token_acc": 0.9377705924261522,
|
|
"train_speed(iter/s)": 0.120812
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"eval_loss": 0.22769030928611755,
|
|
"eval_runtime": 29.7319,
|
|
"eval_samples_per_second": 17.321,
|
|
"eval_steps_per_second": 4.339,
|
|
"eval_token_acc": 0.9225677165651425,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.6585652685221481,
|
|
"grad_norm": 0.6786876916885376,
|
|
"learning_rate": 8.856834139282531e-06,
|
|
"loss": 0.2087319850921631,
|
|
"memory(GiB)": 31.15,
|
|
"step": 525,
|
|
"token_acc": 0.9303061874753442,
|
|
"train_speed(iter/s)": 0.119999
|
|
},
|
|
{
|
|
"epoch": 0.6648373186985496,
|
|
"grad_norm": 0.6989500522613525,
|
|
"learning_rate": 8.835846829397843e-06,
|
|
"loss": 0.2093345880508423,
|
|
"memory(GiB)": 31.15,
|
|
"step": 530,
|
|
"token_acc": 0.9277319617252378,
|
|
"train_speed(iter/s)": 0.120304
|
|
},
|
|
{
|
|
"epoch": 0.671109368874951,
|
|
"grad_norm": 0.7903891801834106,
|
|
"learning_rate": 8.814693965277435e-06,
|
|
"loss": 0.2172760248184204,
|
|
"memory(GiB)": 31.15,
|
|
"step": 535,
|
|
"token_acc": 0.9253766898933953,
|
|
"train_speed(iter/s)": 0.120551
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"grad_norm": 0.7009393572807312,
|
|
"learning_rate": 8.793376459873888e-06,
|
|
"loss": 0.2161731481552124,
|
|
"memory(GiB)": 31.15,
|
|
"step": 540,
|
|
"token_acc": 0.9260658696142567,
|
|
"train_speed(iter/s)": 0.120818
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"eval_loss": 0.22710371017456055,
|
|
"eval_runtime": 29.8324,
|
|
"eval_samples_per_second": 17.263,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9232384889175831,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6836534692277538,
|
|
"grad_norm": 0.6882784962654114,
|
|
"learning_rate": 8.771895233245655e-06,
|
|
"loss": 0.20790476799011232,
|
|
"memory(GiB)": 31.15,
|
|
"step": 545,
|
|
"token_acc": 0.9269955800687545,
|
|
"train_speed(iter/s)": 0.120024
|
|
},
|
|
{
|
|
"epoch": 0.6899255194041553,
|
|
"grad_norm": 0.7261970639228821,
|
|
"learning_rate": 8.750251212517364e-06,
|
|
"loss": 0.21239514350891114,
|
|
"memory(GiB)": 31.15,
|
|
"step": 550,
|
|
"token_acc": 0.934424197347217,
|
|
"train_speed(iter/s)": 0.120259
|
|
},
|
|
{
|
|
"epoch": 0.6961975695805567,
|
|
"grad_norm": 0.7103798985481262,
|
|
"learning_rate": 8.728445331839796e-06,
|
|
"loss": 0.22427408695220946,
|
|
"memory(GiB)": 31.15,
|
|
"step": 555,
|
|
"token_acc": 0.926710200976352,
|
|
"train_speed(iter/s)": 0.120542
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"grad_norm": 0.6691487431526184,
|
|
"learning_rate": 8.706478532349567e-06,
|
|
"loss": 0.2043588399887085,
|
|
"memory(GiB)": 31.15,
|
|
"step": 560,
|
|
"token_acc": 0.9273906956709025,
|
|
"train_speed(iter/s)": 0.120786
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"eval_loss": 0.22625486552715302,
|
|
"eval_runtime": 29.9222,
|
|
"eval_samples_per_second": 17.211,
|
|
"eval_steps_per_second": 4.311,
|
|
"eval_token_acc": 0.9232091426271638,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.7087416699333595,
|
|
"grad_norm": 0.7665515542030334,
|
|
"learning_rate": 8.684351762128511e-06,
|
|
"loss": 0.21694588661193848,
|
|
"memory(GiB)": 31.15,
|
|
"step": 565,
|
|
"token_acc": 0.9275524304991059,
|
|
"train_speed(iter/s)": 0.120052
|
|
},
|
|
{
|
|
"epoch": 0.7150137201097608,
|
|
"grad_norm": 0.6899131536483765,
|
|
"learning_rate": 8.662065976162765e-06,
|
|
"loss": 0.20793275833129882,
|
|
"memory(GiB)": 31.15,
|
|
"step": 570,
|
|
"token_acc": 0.9312310457149275,
|
|
"train_speed(iter/s)": 0.12035
|
|
},
|
|
{
|
|
"epoch": 0.7212857702861623,
|
|
"grad_norm": 0.7864097356796265,
|
|
"learning_rate": 8.639622136301541e-06,
|
|
"loss": 0.21702027320861816,
|
|
"memory(GiB)": 33.6,
|
|
"step": 575,
|
|
"token_acc": 0.9272943626357281,
|
|
"train_speed(iter/s)": 0.120589
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"grad_norm": 0.729067325592041,
|
|
"learning_rate": 8.617021211215629e-06,
|
|
"loss": 0.20268304347991944,
|
|
"memory(GiB)": 33.6,
|
|
"step": 580,
|
|
"token_acc": 0.9264947749297842,
|
|
"train_speed(iter/s)": 0.120859
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"eval_loss": 0.22482995688915253,
|
|
"eval_runtime": 29.7802,
|
|
"eval_samples_per_second": 17.293,
|
|
"eval_steps_per_second": 4.332,
|
|
"eval_token_acc": 0.92376672214513,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.7338298706389651,
|
|
"grad_norm": 0.6976431608200073,
|
|
"learning_rate": 8.594264176355565e-06,
|
|
"loss": 0.2071969985961914,
|
|
"memory(GiB)": 33.6,
|
|
"step": 585,
|
|
"token_acc": 0.9269029680138362,
|
|
"train_speed(iter/s)": 0.120124
|
|
},
|
|
{
|
|
"epoch": 0.7401019208153665,
|
|
"grad_norm": 0.760837733745575,
|
|
"learning_rate": 8.571352013909558e-06,
|
|
"loss": 0.2175739288330078,
|
|
"memory(GiB)": 33.6,
|
|
"step": 590,
|
|
"token_acc": 0.9229339361969354,
|
|
"train_speed(iter/s)": 0.1204
|
|
},
|
|
{
|
|
"epoch": 0.7463739709917679,
|
|
"grad_norm": 0.646027147769928,
|
|
"learning_rate": 8.548285712761084e-06,
|
|
"loss": 0.21033940315246583,
|
|
"memory(GiB)": 33.6,
|
|
"step": 595,
|
|
"token_acc": 0.9275317355671036,
|
|
"train_speed(iter/s)": 0.120614
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"grad_norm": 0.732366681098938,
|
|
"learning_rate": 8.525066268446208e-06,
|
|
"loss": 0.2169095754623413,
|
|
"memory(GiB)": 33.6,
|
|
"step": 600,
|
|
"token_acc": 0.9253966064284729,
|
|
"train_speed(iter/s)": 0.120858
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"eval_loss": 0.22306384146213531,
|
|
"eval_runtime": 29.8297,
|
|
"eval_samples_per_second": 17.265,
|
|
"eval_steps_per_second": 4.325,
|
|
"eval_token_acc": 0.9240308387589035,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7589180713445708,
|
|
"grad_norm": 0.7121861577033997,
|
|
"learning_rate": 8.501694683110615e-06,
|
|
"loss": 0.22008817195892333,
|
|
"memory(GiB)": 33.6,
|
|
"step": 605,
|
|
"token_acc": 0.9294228080296467,
|
|
"train_speed(iter/s)": 0.120141
|
|
},
|
|
{
|
|
"epoch": 0.7651901215209722,
|
|
"grad_norm": 0.8034666776657104,
|
|
"learning_rate": 8.478171965466366e-06,
|
|
"loss": 0.21159706115722657,
|
|
"memory(GiB)": 33.6,
|
|
"step": 610,
|
|
"token_acc": 0.925722043939768,
|
|
"train_speed(iter/s)": 0.120368
|
|
},
|
|
{
|
|
"epoch": 0.7714621716973736,
|
|
"grad_norm": 0.791170597076416,
|
|
"learning_rate": 8.454499130748352e-06,
|
|
"loss": 0.20048816204071046,
|
|
"memory(GiB)": 33.6,
|
|
"step": 615,
|
|
"token_acc": 0.9308691482869845,
|
|
"train_speed(iter/s)": 0.120665
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"grad_norm": 0.7372978329658508,
|
|
"learning_rate": 8.43067720067048e-06,
|
|
"loss": 0.2082076072692871,
|
|
"memory(GiB)": 33.6,
|
|
"step": 620,
|
|
"token_acc": 0.93298405204675,
|
|
"train_speed(iter/s)": 0.120914
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"eval_loss": 0.22348882257938385,
|
|
"eval_runtime": 29.701,
|
|
"eval_samples_per_second": 17.34,
|
|
"eval_steps_per_second": 4.343,
|
|
"eval_token_acc": 0.9239637615236594,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.7840062720501764,
|
|
"grad_norm": 0.7372789978981018,
|
|
"learning_rate": 8.40670720338158e-06,
|
|
"loss": 0.22890782356262207,
|
|
"memory(GiB)": 33.6,
|
|
"step": 625,
|
|
"token_acc": 0.9263055911491702,
|
|
"train_speed(iter/s)": 0.120201
|
|
},
|
|
{
|
|
"epoch": 0.7902783222265778,
|
|
"grad_norm": 0.7267508506774902,
|
|
"learning_rate": 8.382590173421029e-06,
|
|
"loss": 0.21681501865386962,
|
|
"memory(GiB)": 33.6,
|
|
"step": 630,
|
|
"token_acc": 0.9380081814049795,
|
|
"train_speed(iter/s)": 0.120459
|
|
},
|
|
{
|
|
"epoch": 0.7965503724029792,
|
|
"grad_norm": 0.7548701167106628,
|
|
"learning_rate": 8.358327151674095e-06,
|
|
"loss": 0.21880314350128174,
|
|
"memory(GiB)": 33.6,
|
|
"step": 635,
|
|
"token_acc": 0.9205941088367449,
|
|
"train_speed(iter/s)": 0.120708
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"grad_norm": 0.7470581531524658,
|
|
"learning_rate": 8.33391918532702e-06,
|
|
"loss": 0.210282564163208,
|
|
"memory(GiB)": 33.6,
|
|
"step": 640,
|
|
"token_acc": 0.9302132163919552,
|
|
"train_speed(iter/s)": 0.120949
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"eval_loss": 0.22056862711906433,
|
|
"eval_runtime": 29.8322,
|
|
"eval_samples_per_second": 17.263,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9247183804201551,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.8090944727557821,
|
|
"grad_norm": 0.6555790901184082,
|
|
"learning_rate": 8.309367327821819e-06,
|
|
"loss": 0.19786405563354492,
|
|
"memory(GiB)": 33.6,
|
|
"step": 645,
|
|
"token_acc": 0.9318245266126937,
|
|
"train_speed(iter/s)": 0.12025
|
|
},
|
|
{
|
|
"epoch": 0.8153665229321835,
|
|
"grad_norm": 0.7353399395942688,
|
|
"learning_rate": 8.284672638810813e-06,
|
|
"loss": 0.2103184938430786,
|
|
"memory(GiB)": 33.6,
|
|
"step": 650,
|
|
"token_acc": 0.93095703125,
|
|
"train_speed(iter/s)": 0.120454
|
|
},
|
|
{
|
|
"epoch": 0.8216385731085849,
|
|
"grad_norm": 0.6436148881912231,
|
|
"learning_rate": 8.259836184110904e-06,
|
|
"loss": 0.20670008659362793,
|
|
"memory(GiB)": 33.6,
|
|
"step": 655,
|
|
"token_acc": 0.9274860182785432,
|
|
"train_speed(iter/s)": 0.120662
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"grad_norm": 0.6722457408905029,
|
|
"learning_rate": 8.234859035657557e-06,
|
|
"loss": 0.21930215358734131,
|
|
"memory(GiB)": 33.6,
|
|
"step": 660,
|
|
"token_acc": 0.926199601410394,
|
|
"train_speed(iter/s)": 0.120911
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"eval_loss": 0.22012893855571747,
|
|
"eval_runtime": 29.7417,
|
|
"eval_samples_per_second": 17.316,
|
|
"eval_steps_per_second": 4.337,
|
|
"eval_token_acc": 0.9244207251887595,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.8341826734613876,
|
|
"grad_norm": 0.661716103553772,
|
|
"learning_rate": 8.209742271458556e-06,
|
|
"loss": 0.21486959457397461,
|
|
"memory(GiB)": 33.6,
|
|
"step": 665,
|
|
"token_acc": 0.9290491629452907,
|
|
"train_speed(iter/s)": 0.120203
|
|
},
|
|
{
|
|
"epoch": 0.8404547236377891,
|
|
"grad_norm": 0.6564416885375977,
|
|
"learning_rate": 8.18448697554746e-06,
|
|
"loss": 0.19918079376220704,
|
|
"memory(GiB)": 33.6,
|
|
"step": 670,
|
|
"token_acc": 0.9267048282727726,
|
|
"train_speed(iter/s)": 0.120429
|
|
},
|
|
{
|
|
"epoch": 0.8467267738141905,
|
|
"grad_norm": 0.699549674987793,
|
|
"learning_rate": 8.159094237936828e-06,
|
|
"loss": 0.2177518367767334,
|
|
"memory(GiB)": 33.6,
|
|
"step": 675,
|
|
"token_acc": 0.9223363604783882,
|
|
"train_speed(iter/s)": 0.120656
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"grad_norm": 0.7374889850616455,
|
|
"learning_rate": 8.133565154571169e-06,
|
|
"loss": 0.22228724956512452,
|
|
"memory(GiB)": 33.6,
|
|
"step": 680,
|
|
"token_acc": 0.9125668288580525,
|
|
"train_speed(iter/s)": 0.120849
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"eval_loss": 0.2190328687429428,
|
|
"eval_runtime": 29.8942,
|
|
"eval_samples_per_second": 17.227,
|
|
"eval_steps_per_second": 4.315,
|
|
"eval_token_acc": 0.9247435343833715,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.8592708741669933,
|
|
"grad_norm": 0.6313470602035522,
|
|
"learning_rate": 8.107900827279638e-06,
|
|
"loss": 0.20030460357666016,
|
|
"memory(GiB)": 33.6,
|
|
"step": 685,
|
|
"token_acc": 0.9315766341697299,
|
|
"train_speed(iter/s)": 0.120164
|
|
},
|
|
{
|
|
"epoch": 0.8655429243433947,
|
|
"grad_norm": 0.7052416205406189,
|
|
"learning_rate": 8.082102363728494e-06,
|
|
"loss": 0.2170419692993164,
|
|
"memory(GiB)": 33.6,
|
|
"step": 690,
|
|
"token_acc": 0.9215518226488402,
|
|
"train_speed(iter/s)": 0.120386
|
|
},
|
|
{
|
|
"epoch": 0.8718149745197962,
|
|
"grad_norm": 0.7035377025604248,
|
|
"learning_rate": 8.056170877373277e-06,
|
|
"loss": 0.22514162063598633,
|
|
"memory(GiB)": 33.6,
|
|
"step": 695,
|
|
"token_acc": 0.9211979208747407,
|
|
"train_speed(iter/s)": 0.120592
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"grad_norm": 0.6568828821182251,
|
|
"learning_rate": 8.030107487410766e-06,
|
|
"loss": 0.20586962699890138,
|
|
"memory(GiB)": 33.6,
|
|
"step": 700,
|
|
"token_acc": 0.9286558694881192,
|
|
"train_speed(iter/s)": 0.120818
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"eval_loss": 0.21893204748630524,
|
|
"eval_runtime": 29.9213,
|
|
"eval_samples_per_second": 17.212,
|
|
"eval_steps_per_second": 4.311,
|
|
"eval_token_acc": 0.9248315732546294,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.884359074872599,
|
|
"grad_norm": 0.7014189958572388,
|
|
"learning_rate": 8.003913318730662e-06,
|
|
"loss": 0.2156972885131836,
|
|
"memory(GiB)": 33.6,
|
|
"step": 705,
|
|
"token_acc": 0.9312223183831391,
|
|
"train_speed(iter/s)": 0.120206
|
|
},
|
|
{
|
|
"epoch": 0.8906311250490004,
|
|
"grad_norm": 0.7113268971443176,
|
|
"learning_rate": 7.97758950186705e-06,
|
|
"loss": 0.20703303813934326,
|
|
"memory(GiB)": 33.6,
|
|
"step": 710,
|
|
"token_acc": 0.9297330786308018,
|
|
"train_speed(iter/s)": 0.120418
|
|
},
|
|
{
|
|
"epoch": 0.8969031752254017,
|
|
"grad_norm": 0.6587302684783936,
|
|
"learning_rate": 7.951137172949595e-06,
|
|
"loss": 0.20361075401306153,
|
|
"memory(GiB)": 33.6,
|
|
"step": 715,
|
|
"token_acc": 0.9325225225225225,
|
|
"train_speed(iter/s)": 0.120636
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"grad_norm": 0.6677445769309998,
|
|
"learning_rate": 7.924557473654516e-06,
|
|
"loss": 0.19508445262908936,
|
|
"memory(GiB)": 33.6,
|
|
"step": 720,
|
|
"token_acc": 0.9272533459670733,
|
|
"train_speed(iter/s)": 0.120834
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"eval_loss": 0.2177843153476715,
|
|
"eval_runtime": 29.9167,
|
|
"eval_samples_per_second": 17.214,
|
|
"eval_steps_per_second": 4.312,
|
|
"eval_token_acc": 0.9248483425634404,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.9094472755782046,
|
|
"grad_norm": 0.6732158064842224,
|
|
"learning_rate": 7.897851551155306e-06,
|
|
"loss": 0.20930843353271483,
|
|
"memory(GiB)": 33.6,
|
|
"step": 725,
|
|
"token_acc": 0.9286447233404372,
|
|
"train_speed(iter/s)": 0.120233
|
|
},
|
|
{
|
|
"epoch": 0.915719325754606,
|
|
"grad_norm": 0.7134806513786316,
|
|
"learning_rate": 7.871020558073217e-06,
|
|
"loss": 0.22407774925231932,
|
|
"memory(GiB)": 33.6,
|
|
"step": 730,
|
|
"token_acc": 0.9180210751919986,
|
|
"train_speed(iter/s)": 0.120451
|
|
},
|
|
{
|
|
"epoch": 0.9219913759310074,
|
|
"grad_norm": 0.7282131910324097,
|
|
"learning_rate": 7.844065652427523e-06,
|
|
"loss": 0.20888471603393555,
|
|
"memory(GiB)": 33.6,
|
|
"step": 735,
|
|
"token_acc": 0.9275435780462392,
|
|
"train_speed(iter/s)": 0.120674
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"grad_norm": 0.6692695021629333,
|
|
"learning_rate": 7.816987997585535e-06,
|
|
"loss": 0.2041374683380127,
|
|
"memory(GiB)": 33.6,
|
|
"step": 740,
|
|
"token_acc": 0.9322735248670875,
|
|
"train_speed(iter/s)": 0.12084
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"eval_loss": 0.21771369874477386,
|
|
"eval_runtime": 29.8739,
|
|
"eval_samples_per_second": 17.239,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9255442688790975,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.9345354762838103,
|
|
"grad_norm": 0.7032608985900879,
|
|
"learning_rate": 7.789788762212384e-06,
|
|
"loss": 0.19432848691940308,
|
|
"memory(GiB)": 33.6,
|
|
"step": 745,
|
|
"token_acc": 0.9331851716544383,
|
|
"train_speed(iter/s)": 0.120233
|
|
},
|
|
{
|
|
"epoch": 0.9408075264602117,
|
|
"grad_norm": 0.6778285503387451,
|
|
"learning_rate": 7.762469120220595e-06,
|
|
"loss": 0.20669918060302733,
|
|
"memory(GiB)": 33.6,
|
|
"step": 750,
|
|
"token_acc": 0.9296290992410139,
|
|
"train_speed(iter/s)": 0.120451
|
|
},
|
|
{
|
|
"epoch": 0.9470795766366131,
|
|
"grad_norm": 0.7147387266159058,
|
|
"learning_rate": 7.73503025071941e-06,
|
|
"loss": 0.2145129680633545,
|
|
"memory(GiB)": 33.6,
|
|
"step": 755,
|
|
"token_acc": 0.9212092639519123,
|
|
"train_speed(iter/s)": 0.120647
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"grad_norm": 0.6544482111930847,
|
|
"learning_rate": 7.7074733379639e-06,
|
|
"loss": 0.2081056594848633,
|
|
"memory(GiB)": 33.6,
|
|
"step": 760,
|
|
"token_acc": 0.9254992319508448,
|
|
"train_speed(iter/s)": 0.120855
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"eval_loss": 0.21670959889888763,
|
|
"eval_runtime": 29.8367,
|
|
"eval_samples_per_second": 17.261,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9255191149158809,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.9596236769894159,
|
|
"grad_norm": 0.6839804649353027,
|
|
"learning_rate": 7.679799571303861e-06,
|
|
"loss": 0.21366724967956544,
|
|
"memory(GiB)": 33.6,
|
|
"step": 765,
|
|
"token_acc": 0.9328849994693322,
|
|
"train_speed(iter/s)": 0.120321
|
|
},
|
|
{
|
|
"epoch": 0.9658957271658173,
|
|
"grad_norm": 0.7972912788391113,
|
|
"learning_rate": 7.65201014513247e-06,
|
|
"loss": 0.21506853103637696,
|
|
"memory(GiB)": 33.6,
|
|
"step": 770,
|
|
"token_acc": 0.9269746646795827,
|
|
"train_speed(iter/s)": 0.120492
|
|
},
|
|
{
|
|
"epoch": 0.9721677773422187,
|
|
"grad_norm": 0.6575592756271362,
|
|
"learning_rate": 7.62410625883474e-06,
|
|
"loss": 0.21630258560180665,
|
|
"memory(GiB)": 33.6,
|
|
"step": 775,
|
|
"token_acc": 0.92975748611615,
|
|
"train_speed(iter/s)": 0.120664
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"grad_norm": 0.6901047825813293,
|
|
"learning_rate": 7.596089116735765e-06,
|
|
"loss": 0.2089380741119385,
|
|
"memory(GiB)": 33.6,
|
|
"step": 780,
|
|
"token_acc": 0.9275595528864113,
|
|
"train_speed(iter/s)": 0.12086
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"eval_loss": 0.2157372534275055,
|
|
"eval_runtime": 29.8458,
|
|
"eval_samples_per_second": 17.255,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.9259257706545481,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.9847118776950216,
|
|
"grad_norm": 0.6576786041259766,
|
|
"learning_rate": 7.567959928048723e-06,
|
|
"loss": 0.2120821475982666,
|
|
"memory(GiB)": 33.6,
|
|
"step": 785,
|
|
"token_acc": 0.9295358776486603,
|
|
"train_speed(iter/s)": 0.120278
|
|
},
|
|
{
|
|
"epoch": 0.990983927871423,
|
|
"grad_norm": 0.8202281594276428,
|
|
"learning_rate": 7.5397199068227e-06,
|
|
"loss": 0.21234326362609862,
|
|
"memory(GiB)": 33.6,
|
|
"step": 790,
|
|
"token_acc": 0.9287775025499053,
|
|
"train_speed(iter/s)": 0.120438
|
|
},
|
|
{
|
|
"epoch": 0.9972559780478244,
|
|
"grad_norm": 0.6771529912948608,
|
|
"learning_rate": 7.511370271890286e-06,
|
|
"loss": 0.20872533321380615,
|
|
"memory(GiB)": 33.6,
|
|
"step": 795,
|
|
"token_acc": 0.9276382199405878,
|
|
"train_speed(iter/s)": 0.12063
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"grad_norm": 0.6259863376617432,
|
|
"learning_rate": 7.482912246814975e-06,
|
|
"loss": 0.1691659927368164,
|
|
"memory(GiB)": 33.6,
|
|
"step": 800,
|
|
"token_acc": 0.9478380434146627,
|
|
"train_speed(iter/s)": 0.120917
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"eval_loss": 0.21573711931705475,
|
|
"eval_runtime": 29.9639,
|
|
"eval_samples_per_second": 17.187,
|
|
"eval_steps_per_second": 4.305,
|
|
"eval_token_acc": 0.9261102330514692,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.008780870246962,
|
|
"grad_norm": 0.688946545124054,
|
|
"learning_rate": 7.454347059838351e-06,
|
|
"loss": 0.1593709945678711,
|
|
"memory(GiB)": 33.6,
|
|
"step": 805,
|
|
"token_acc": 0.9363939073284858,
|
|
"train_speed(iter/s)": 0.120397
|
|
},
|
|
{
|
|
"epoch": 1.0150529204233634,
|
|
"grad_norm": 0.740982174873352,
|
|
"learning_rate": 7.425675943827084e-06,
|
|
"loss": 0.17170259952545167,
|
|
"memory(GiB)": 33.6,
|
|
"step": 810,
|
|
"token_acc": 0.9332383983916904,
|
|
"train_speed(iter/s)": 0.120599
|
|
},
|
|
{
|
|
"epoch": 1.021324970599765,
|
|
"grad_norm": 0.7703794240951538,
|
|
"learning_rate": 7.3969001362197135e-06,
|
|
"loss": 0.15921430587768554,
|
|
"memory(GiB)": 33.6,
|
|
"step": 815,
|
|
"token_acc": 0.9395872420262664,
|
|
"train_speed(iter/s)": 0.120788
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"grad_norm": 0.6545404195785522,
|
|
"learning_rate": 7.3680208789732385e-06,
|
|
"loss": 0.15435378551483153,
|
|
"memory(GiB)": 33.6,
|
|
"step": 820,
|
|
"token_acc": 0.946012336917954,
|
|
"train_speed(iter/s)": 0.12096
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"eval_loss": 0.2201254665851593,
|
|
"eval_runtime": 29.934,
|
|
"eval_samples_per_second": 17.205,
|
|
"eval_steps_per_second": 4.309,
|
|
"eval_token_acc": 0.925619730768747,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.0338690709525675,
|
|
"grad_norm": 0.6074718236923218,
|
|
"learning_rate": 7.339039418509532e-06,
|
|
"loss": 0.15760223865509032,
|
|
"memory(GiB)": 33.6,
|
|
"step": 825,
|
|
"token_acc": 0.9387994171373061,
|
|
"train_speed(iter/s)": 0.120423
|
|
},
|
|
{
|
|
"epoch": 1.040141121128969,
|
|
"grad_norm": 0.7706517577171326,
|
|
"learning_rate": 7.309957005661521e-06,
|
|
"loss": 0.15146889686584472,
|
|
"memory(GiB)": 33.6,
|
|
"step": 830,
|
|
"token_acc": 0.9525346241764152,
|
|
"train_speed(iter/s)": 0.120632
|
|
},
|
|
{
|
|
"epoch": 1.0464131713053704,
|
|
"grad_norm": 0.7167385220527649,
|
|
"learning_rate": 7.280774895619219e-06,
|
|
"loss": 0.15735208988189697,
|
|
"memory(GiB)": 33.6,
|
|
"step": 835,
|
|
"token_acc": 0.9430896598332957,
|
|
"train_speed(iter/s)": 0.120825
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"grad_norm": 0.6908670663833618,
|
|
"learning_rate": 7.25149434787555e-06,
|
|
"loss": 0.15631234645843506,
|
|
"memory(GiB)": 33.6,
|
|
"step": 840,
|
|
"token_acc": 0.9427331753700342,
|
|
"train_speed(iter/s)": 0.121019
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"eval_loss": 0.2204887419939041,
|
|
"eval_runtime": 29.7677,
|
|
"eval_samples_per_second": 17.301,
|
|
"eval_steps_per_second": 4.334,
|
|
"eval_token_acc": 0.9257958085112626,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.0589572716581732,
|
|
"grad_norm": 0.6649175882339478,
|
|
"learning_rate": 7.2221166261719755e-06,
|
|
"loss": 0.14833444356918335,
|
|
"memory(GiB)": 33.6,
|
|
"step": 845,
|
|
"token_acc": 0.9379661510111051,
|
|
"train_speed(iter/s)": 0.120482
|
|
},
|
|
{
|
|
"epoch": 1.0652293218345747,
|
|
"grad_norm": 0.6566579937934875,
|
|
"learning_rate": 7.192642998443975e-06,
|
|
"loss": 0.15106643438339235,
|
|
"memory(GiB)": 33.6,
|
|
"step": 850,
|
|
"token_acc": 0.9515770402701145,
|
|
"train_speed(iter/s)": 0.120644
|
|
},
|
|
{
|
|
"epoch": 1.071501372010976,
|
|
"grad_norm": 0.7228676676750183,
|
|
"learning_rate": 7.163074736766299e-06,
|
|
"loss": 0.151542592048645,
|
|
"memory(GiB)": 33.6,
|
|
"step": 855,
|
|
"token_acc": 0.9434348954775242,
|
|
"train_speed(iter/s)": 0.120779
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"grad_norm": 0.6888749003410339,
|
|
"learning_rate": 7.133413117298081e-06,
|
|
"loss": 0.14555807113647462,
|
|
"memory(GiB)": 33.6,
|
|
"step": 860,
|
|
"token_acc": 0.9450332471906849,
|
|
"train_speed(iter/s)": 0.120951
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"eval_loss": 0.22007805109024048,
|
|
"eval_runtime": 29.794,
|
|
"eval_samples_per_second": 17.285,
|
|
"eval_steps_per_second": 4.33,
|
|
"eval_token_acc": 0.9259173860001425,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.084045472363779,
|
|
"grad_norm": 0.6592821478843689,
|
|
"learning_rate": 7.103659420227755e-06,
|
|
"loss": 0.1563601851463318,
|
|
"memory(GiB)": 33.6,
|
|
"step": 865,
|
|
"token_acc": 0.9376562343765623,
|
|
"train_speed(iter/s)": 0.120435
|
|
},
|
|
{
|
|
"epoch": 1.0903175225401802,
|
|
"grad_norm": 0.7310729622840881,
|
|
"learning_rate": 7.0738149297178005e-06,
|
|
"loss": 0.1602903962135315,
|
|
"memory(GiB)": 33.6,
|
|
"step": 870,
|
|
"token_acc": 0.9540862093385581,
|
|
"train_speed(iter/s)": 0.120585
|
|
},
|
|
{
|
|
"epoch": 1.0965895727165818,
|
|
"grad_norm": 0.7009090185165405,
|
|
"learning_rate": 7.04388093384932e-06,
|
|
"loss": 0.14554691314697266,
|
|
"memory(GiB)": 33.6,
|
|
"step": 875,
|
|
"token_acc": 0.9516893894487255,
|
|
"train_speed(iter/s)": 0.120725
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"grad_norm": 0.7225506901741028,
|
|
"learning_rate": 7.013858724566449e-06,
|
|
"loss": 0.16036466360092164,
|
|
"memory(GiB)": 33.6,
|
|
"step": 880,
|
|
"token_acc": 0.9487800335257962,
|
|
"train_speed(iter/s)": 0.120883
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"eval_loss": 0.22006595134735107,
|
|
"eval_runtime": 29.8492,
|
|
"eval_samples_per_second": 17.253,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.9258377317832902,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.1091336730693846,
|
|
"grad_norm": 0.7332343459129333,
|
|
"learning_rate": 6.983749597620588e-06,
|
|
"loss": 0.15600578784942626,
|
|
"memory(GiB)": 33.6,
|
|
"step": 885,
|
|
"token_acc": 0.9356690055649649,
|
|
"train_speed(iter/s)": 0.120401
|
|
},
|
|
{
|
|
"epoch": 1.115405723245786,
|
|
"grad_norm": 0.7584828734397888,
|
|
"learning_rate": 6.9535548525144894e-06,
|
|
"loss": 0.15730617046356202,
|
|
"memory(GiB)": 33.6,
|
|
"step": 890,
|
|
"token_acc": 0.9422360762461726,
|
|
"train_speed(iter/s)": 0.120564
|
|
},
|
|
{
|
|
"epoch": 1.1216777734221874,
|
|
"grad_norm": 0.6870989203453064,
|
|
"learning_rate": 6.923275792446159e-06,
|
|
"loss": 0.15372934341430664,
|
|
"memory(GiB)": 33.6,
|
|
"step": 895,
|
|
"token_acc": 0.9412487331470164,
|
|
"train_speed(iter/s)": 0.120704
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"grad_norm": 0.6541606187820435,
|
|
"learning_rate": 6.8929137242526216e-06,
|
|
"loss": 0.1524061918258667,
|
|
"memory(GiB)": 33.6,
|
|
"step": 900,
|
|
"token_acc": 0.9508530617643169,
|
|
"train_speed(iter/s)": 0.120832
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"eval_loss": 0.2202031910419464,
|
|
"eval_runtime": 29.8745,
|
|
"eval_samples_per_second": 17.239,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9259173860001425,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.1342218737749903,
|
|
"grad_norm": 0.63804692029953,
|
|
"learning_rate": 6.862469958353506e-06,
|
|
"loss": 0.15143206119537353,
|
|
"memory(GiB)": 33.6,
|
|
"step": 905,
|
|
"token_acc": 0.9388512882977574,
|
|
"train_speed(iter/s)": 0.120334
|
|
},
|
|
{
|
|
"epoch": 1.1404939239513916,
|
|
"grad_norm": 0.8169479966163635,
|
|
"learning_rate": 6.8319458086945026e-06,
|
|
"loss": 0.1651373863220215,
|
|
"memory(GiB)": 33.6,
|
|
"step": 910,
|
|
"token_acc": 0.9463410976706987,
|
|
"train_speed(iter/s)": 0.120499
|
|
},
|
|
{
|
|
"epoch": 1.146765974127793,
|
|
"grad_norm": 0.7615450024604797,
|
|
"learning_rate": 6.801342592690641e-06,
|
|
"loss": 0.15947287082672118,
|
|
"memory(GiB)": 33.6,
|
|
"step": 915,
|
|
"token_acc": 0.943523544080974,
|
|
"train_speed(iter/s)": 0.120662
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"grad_norm": 0.6645803451538086,
|
|
"learning_rate": 6.770661631169434e-06,
|
|
"loss": 0.14712635278701783,
|
|
"memory(GiB)": 33.6,
|
|
"step": 920,
|
|
"token_acc": 0.9416338351553735,
|
|
"train_speed(iter/s)": 0.120812
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"eval_loss": 0.21968427300453186,
|
|
"eval_runtime": 30.0062,
|
|
"eval_samples_per_second": 17.163,
|
|
"eval_steps_per_second": 4.299,
|
|
"eval_token_acc": 0.9260389634890224,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.1593100744805958,
|
|
"grad_norm": 0.6639111042022705,
|
|
"learning_rate": 6.739904248313879e-06,
|
|
"loss": 0.1582737922668457,
|
|
"memory(GiB)": 33.6,
|
|
"step": 925,
|
|
"token_acc": 0.9358828491280381,
|
|
"train_speed(iter/s)": 0.120318
|
|
},
|
|
{
|
|
"epoch": 1.1655821246569973,
|
|
"grad_norm": 0.7869791388511658,
|
|
"learning_rate": 6.709071771605292e-06,
|
|
"loss": 0.15897371768951415,
|
|
"memory(GiB)": 33.6,
|
|
"step": 930,
|
|
"token_acc": 0.9428512114831401,
|
|
"train_speed(iter/s)": 0.12045
|
|
},
|
|
{
|
|
"epoch": 1.1718541748333986,
|
|
"grad_norm": 0.7262241840362549,
|
|
"learning_rate": 6.678165531766029e-06,
|
|
"loss": 0.15734575986862182,
|
|
"memory(GiB)": 33.6,
|
|
"step": 935,
|
|
"token_acc": 0.9491624723709089,
|
|
"train_speed(iter/s)": 0.120601
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"grad_norm": 0.7573165893554688,
|
|
"learning_rate": 6.647186862702038e-06,
|
|
"loss": 0.1512979507446289,
|
|
"memory(GiB)": 33.6,
|
|
"step": 940,
|
|
"token_acc": 0.9467049494120864,
|
|
"train_speed(iter/s)": 0.120739
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"eval_loss": 0.22000892460346222,
|
|
"eval_runtime": 30.0307,
|
|
"eval_samples_per_second": 17.149,
|
|
"eval_steps_per_second": 4.296,
|
|
"eval_token_acc": 0.9255736151695168,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.1843982751862014,
|
|
"grad_norm": 0.6843467354774475,
|
|
"learning_rate": 6.616137101445301e-06,
|
|
"loss": 0.1581122875213623,
|
|
"memory(GiB)": 33.6,
|
|
"step": 945,
|
|
"token_acc": 0.9385528792778878,
|
|
"train_speed(iter/s)": 0.12026
|
|
},
|
|
{
|
|
"epoch": 1.190670325362603,
|
|
"grad_norm": 0.7229541540145874,
|
|
"learning_rate": 6.58501758809612e-06,
|
|
"loss": 0.17478140592575073,
|
|
"memory(GiB)": 33.6,
|
|
"step": 950,
|
|
"token_acc": 0.9408516112836927,
|
|
"train_speed(iter/s)": 0.120424
|
|
},
|
|
{
|
|
"epoch": 1.1969423755390043,
|
|
"grad_norm": 0.7466816306114197,
|
|
"learning_rate": 6.55382966576528e-06,
|
|
"loss": 0.15570859909057616,
|
|
"memory(GiB)": 33.6,
|
|
"step": 955,
|
|
"token_acc": 0.9458100145459925,
|
|
"train_speed(iter/s)": 0.120574
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"grad_norm": 0.7393471598625183,
|
|
"learning_rate": 6.522574680516081e-06,
|
|
"loss": 0.1629380464553833,
|
|
"memory(GiB)": 33.6,
|
|
"step": 960,
|
|
"token_acc": 0.94634954320764,
|
|
"train_speed(iter/s)": 0.120745
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"eval_loss": 0.22024324536323547,
|
|
"eval_runtime": 29.8937,
|
|
"eval_samples_per_second": 17.228,
|
|
"eval_steps_per_second": 4.315,
|
|
"eval_token_acc": 0.9261898872683215,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.2094864758918071,
|
|
"grad_norm": 0.7199200987815857,
|
|
"learning_rate": 6.491253981306245e-06,
|
|
"loss": 0.15614912509918213,
|
|
"memory(GiB)": 33.6,
|
|
"step": 965,
|
|
"token_acc": 0.9363130072672509,
|
|
"train_speed(iter/s)": 0.120314
|
|
},
|
|
{
|
|
"epoch": 1.2157585260682087,
|
|
"grad_norm": 0.7639812231063843,
|
|
"learning_rate": 6.459868919929691e-06,
|
|
"loss": 0.15401583909988403,
|
|
"memory(GiB)": 33.6,
|
|
"step": 970,
|
|
"token_acc": 0.9419335026939505,
|
|
"train_speed(iter/s)": 0.120443
|
|
},
|
|
{
|
|
"epoch": 1.22203057624461,
|
|
"grad_norm": 0.7610638737678528,
|
|
"learning_rate": 6.428420850958194e-06,
|
|
"loss": 0.15354688167572023,
|
|
"memory(GiB)": 33.6,
|
|
"step": 975,
|
|
"token_acc": 0.9475969889982628,
|
|
"train_speed(iter/s)": 0.120569
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"grad_norm": 0.7256251573562622,
|
|
"learning_rate": 6.3969111316829215e-06,
|
|
"loss": 0.15662674903869628,
|
|
"memory(GiB)": 33.6,
|
|
"step": 980,
|
|
"token_acc": 0.9459802620188146,
|
|
"train_speed(iter/s)": 0.120732
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"eval_loss": 0.21970878541469574,
|
|
"eval_runtime": 29.8153,
|
|
"eval_samples_per_second": 17.273,
|
|
"eval_steps_per_second": 4.327,
|
|
"eval_token_acc": 0.9261437716690912,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.2345746765974128,
|
|
"grad_norm": 0.7129140496253967,
|
|
"learning_rate": 6.365341122055857e-06,
|
|
"loss": 0.15520663261413575,
|
|
"memory(GiB)": 33.6,
|
|
"step": 985,
|
|
"token_acc": 0.9358353146537455,
|
|
"train_speed(iter/s)": 0.120322
|
|
},
|
|
{
|
|
"epoch": 1.2408467267738141,
|
|
"grad_norm": 0.6518073081970215,
|
|
"learning_rate": 6.333712184631093e-06,
|
|
"loss": 0.14519546031951905,
|
|
"memory(GiB)": 33.6,
|
|
"step": 990,
|
|
"token_acc": 0.9487179487179487,
|
|
"train_speed(iter/s)": 0.120454
|
|
},
|
|
{
|
|
"epoch": 1.2471187769502157,
|
|
"grad_norm": 0.6729604005813599,
|
|
"learning_rate": 6.302025684506042e-06,
|
|
"loss": 0.1582566022872925,
|
|
"memory(GiB)": 33.6,
|
|
"step": 995,
|
|
"token_acc": 0.9451847717388776,
|
|
"train_speed(iter/s)": 0.120603
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"grad_norm": 0.7031135559082031,
|
|
"learning_rate": 6.2702829892625e-06,
|
|
"loss": 0.1544743537902832,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1000,
|
|
"token_acc": 0.9461549355615352,
|
|
"train_speed(iter/s)": 0.120764
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"eval_loss": 0.21808215975761414,
|
|
"eval_runtime": 29.9101,
|
|
"eval_samples_per_second": 17.218,
|
|
"eval_steps_per_second": 4.313,
|
|
"eval_token_acc": 0.9264749655181087,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.2596628773030183,
|
|
"grad_norm": 0.7392243146896362,
|
|
"learning_rate": 6.238485468907637e-06,
|
|
"loss": 0.15514018535614013,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1005,
|
|
"token_acc": 0.9405148412279971,
|
|
"train_speed(iter/s)": 0.120335
|
|
},
|
|
{
|
|
"epoch": 1.2659349274794198,
|
|
"grad_norm": 0.6544946432113647,
|
|
"learning_rate": 6.2066344958148596e-06,
|
|
"loss": 0.15222200155258178,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1010,
|
|
"token_acc": 0.9450700357044768,
|
|
"train_speed(iter/s)": 0.120493
|
|
},
|
|
{
|
|
"epoch": 1.2722069776558214,
|
|
"grad_norm": 0.6442455649375916,
|
|
"learning_rate": 6.174731444664579e-06,
|
|
"loss": 0.1523426055908203,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1015,
|
|
"token_acc": 0.9424824791940429,
|
|
"train_speed(iter/s)": 0.120659
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"grad_norm": 0.6623610854148865,
|
|
"learning_rate": 6.14277769238489e-06,
|
|
"loss": 0.15341660976409913,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1020,
|
|
"token_acc": 0.94507058287796,
|
|
"train_speed(iter/s)": 0.120796
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"eval_loss": 0.21728560328483582,
|
|
"eval_runtime": 29.8731,
|
|
"eval_samples_per_second": 17.24,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9268396979847483,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.284751078008624,
|
|
"grad_norm": 0.6439575552940369,
|
|
"learning_rate": 6.110774618092128e-06,
|
|
"loss": 0.14585806131362916,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1025,
|
|
"token_acc": 0.9402694008845999,
|
|
"train_speed(iter/s)": 0.120366
|
|
},
|
|
{
|
|
"epoch": 1.2910231281850255,
|
|
"grad_norm": 0.6308783888816833,
|
|
"learning_rate": 6.07872360303136e-06,
|
|
"loss": 0.1529778242111206,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1030,
|
|
"token_acc": 0.9463726446578015,
|
|
"train_speed(iter/s)": 0.120542
|
|
},
|
|
{
|
|
"epoch": 1.2972951783614268,
|
|
"grad_norm": 0.6911998391151428,
|
|
"learning_rate": 6.046626030516766e-06,
|
|
"loss": 0.15263807773590088,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1035,
|
|
"token_acc": 0.9480356726509676,
|
|
"train_speed(iter/s)": 0.120685
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"grad_norm": 0.6479185223579407,
|
|
"learning_rate": 6.0144832858719256e-06,
|
|
"loss": 0.1511695623397827,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1040,
|
|
"token_acc": 0.9483408164318522,
|
|
"train_speed(iter/s)": 0.120811
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"eval_loss": 0.21816755831241608,
|
|
"eval_runtime": 29.8673,
|
|
"eval_samples_per_second": 17.243,
|
|
"eval_steps_per_second": 4.319,
|
|
"eval_token_acc": 0.9266594279150299,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.3098392787142297,
|
|
"grad_norm": 0.7152245044708252,
|
|
"learning_rate": 5.982296756370052e-06,
|
|
"loss": 0.15091612339019775,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1045,
|
|
"token_acc": 0.9368677988540223,
|
|
"train_speed(iter/s)": 0.120391
|
|
},
|
|
{
|
|
"epoch": 1.3161113288906312,
|
|
"grad_norm": 0.7569878101348877,
|
|
"learning_rate": 5.950067831174086e-06,
|
|
"loss": 0.1640252947807312,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1050,
|
|
"token_acc": 0.9438852605967474,
|
|
"train_speed(iter/s)": 0.120521
|
|
},
|
|
{
|
|
"epoch": 1.3223833790670325,
|
|
"grad_norm": 0.7000331282615662,
|
|
"learning_rate": 5.917797901276771e-06,
|
|
"loss": 0.1507915735244751,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1055,
|
|
"token_acc": 0.9414864333464412,
|
|
"train_speed(iter/s)": 0.120671
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"grad_norm": 0.6913698315620422,
|
|
"learning_rate": 5.885488359440592e-06,
|
|
"loss": 0.14514442682266235,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1060,
|
|
"token_acc": 0.9446129425437229,
|
|
"train_speed(iter/s)": 0.120797
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"eval_loss": 0.2164752334356308,
|
|
"eval_runtime": 30.0417,
|
|
"eval_samples_per_second": 17.143,
|
|
"eval_steps_per_second": 4.294,
|
|
"eval_token_acc": 0.9267306974774767,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.3349274794198354,
|
|
"grad_norm": 0.6658451557159424,
|
|
"learning_rate": 5.853140600137684e-06,
|
|
"loss": 0.15348198413848876,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1065,
|
|
"token_acc": 0.9394616144184715,
|
|
"train_speed(iter/s)": 0.120389
|
|
},
|
|
{
|
|
"epoch": 1.3411995295962367,
|
|
"grad_norm": 0.6750782132148743,
|
|
"learning_rate": 5.8207560194896325e-06,
|
|
"loss": 0.16195533275604249,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1070,
|
|
"token_acc": 0.9389181190397895,
|
|
"train_speed(iter/s)": 0.120519
|
|
},
|
|
{
|
|
"epoch": 1.3474715797726382,
|
|
"grad_norm": 0.6864067912101746,
|
|
"learning_rate": 5.78833601520723e-06,
|
|
"loss": 0.15502965450286865,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1075,
|
|
"token_acc": 0.9447475298539129,
|
|
"train_speed(iter/s)": 0.120654
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"grad_norm": 0.7045819759368896,
|
|
"learning_rate": 5.755881986530137e-06,
|
|
"loss": 0.16037662029266359,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1080,
|
|
"token_acc": 0.9435206662381578,
|
|
"train_speed(iter/s)": 0.120778
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"eval_loss": 0.21634995937347412,
|
|
"eval_runtime": 30.004,
|
|
"eval_samples_per_second": 17.164,
|
|
"eval_steps_per_second": 4.299,
|
|
"eval_token_acc": 0.9266720048966381,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.360015680125441,
|
|
"grad_norm": 0.7639293074607849,
|
|
"learning_rate": 5.723395334166506e-06,
|
|
"loss": 0.15927184820175172,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1085,
|
|
"token_acc": 0.9369159769632709,
|
|
"train_speed(iter/s)": 0.120386
|
|
},
|
|
{
|
|
"epoch": 1.3662877303018424,
|
|
"grad_norm": 0.6852443814277649,
|
|
"learning_rate": 5.6908774602325165e-06,
|
|
"loss": 0.14834917783737184,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1090,
|
|
"token_acc": 0.9440402603796291,
|
|
"train_speed(iter/s)": 0.120507
|
|
},
|
|
{
|
|
"epoch": 1.372559780478244,
|
|
"grad_norm": 0.6806090474128723,
|
|
"learning_rate": 5.6583297681918615e-06,
|
|
"loss": 0.14343435764312745,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1095,
|
|
"token_acc": 0.9514565363959733,
|
|
"train_speed(iter/s)": 0.120637
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"grad_norm": 0.7074826955795288,
|
|
"learning_rate": 5.625753662795183e-06,
|
|
"loss": 0.15417686700820923,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1100,
|
|
"token_acc": 0.9386321901831356,
|
|
"train_speed(iter/s)": 0.120771
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"eval_loss": 0.21541613340377808,
|
|
"eval_runtime": 29.9501,
|
|
"eval_samples_per_second": 17.195,
|
|
"eval_steps_per_second": 4.307,
|
|
"eval_token_acc": 0.9270451220176832,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.3851038808310467,
|
|
"grad_norm": 0.776336669921875,
|
|
"learning_rate": 5.59315055001943e-06,
|
|
"loss": 0.16252031326293945,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1105,
|
|
"token_acc": 0.9352993130520117,
|
|
"train_speed(iter/s)": 0.120392
|
|
},
|
|
{
|
|
"epoch": 1.391375931007448,
|
|
"grad_norm": 0.6779446005821228,
|
|
"learning_rate": 5.5605218370071836e-06,
|
|
"loss": 0.14336334466934203,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1110,
|
|
"token_acc": 0.9528518089352388,
|
|
"train_speed(iter/s)": 0.120505
|
|
},
|
|
{
|
|
"epoch": 1.3976479811838494,
|
|
"grad_norm": 0.690963089466095,
|
|
"learning_rate": 5.5278689320059305e-06,
|
|
"loss": 0.15652428865432738,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1115,
|
|
"token_acc": 0.9426378227494766,
|
|
"train_speed(iter/s)": 0.120669
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"grad_norm": 0.7639049887657166,
|
|
"learning_rate": 5.4951932443072764e-06,
|
|
"loss": 0.16521704196929932,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1120,
|
|
"token_acc": 0.9421646929220601,
|
|
"train_speed(iter/s)": 0.120808
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"eval_loss": 0.21606019139289856,
|
|
"eval_runtime": 29.9411,
|
|
"eval_samples_per_second": 17.2,
|
|
"eval_steps_per_second": 4.308,
|
|
"eval_token_acc": 0.9273176232858622,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.4101920815366524,
|
|
"grad_norm": 0.7224271297454834,
|
|
"learning_rate": 5.462496184186118e-06,
|
|
"loss": 0.15909309387207032,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1125,
|
|
"token_acc": 0.9397663407498653,
|
|
"train_speed(iter/s)": 0.120435
|
|
},
|
|
{
|
|
"epoch": 1.4164641317130537,
|
|
"grad_norm": 0.7142929434776306,
|
|
"learning_rate": 5.429779162839787e-06,
|
|
"loss": 0.16222875118255614,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1130,
|
|
"token_acc": 0.9455863719555118,
|
|
"train_speed(iter/s)": 0.120578
|
|
},
|
|
{
|
|
"epoch": 1.422736181889455,
|
|
"grad_norm": 0.6916890144348145,
|
|
"learning_rate": 5.397043592327129e-06,
|
|
"loss": 0.15585269927978515,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1135,
|
|
"token_acc": 0.9430803571428571,
|
|
"train_speed(iter/s)": 0.120706
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"grad_norm": 0.7470511198043823,
|
|
"learning_rate": 5.364290885507577e-06,
|
|
"loss": 0.1534827470779419,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1140,
|
|
"token_acc": 0.9486189913884969,
|
|
"train_speed(iter/s)": 0.120798
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"eval_loss": 0.21471655368804932,
|
|
"eval_runtime": 29.873,
|
|
"eval_samples_per_second": 17.24,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9275440089548109,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.435280282242258,
|
|
"grad_norm": 0.7307199835777283,
|
|
"learning_rate": 5.3315224559801555e-06,
|
|
"loss": 0.14947969913482667,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1145,
|
|
"token_acc": 0.939248102132458,
|
|
"train_speed(iter/s)": 0.120404
|
|
},
|
|
{
|
|
"epoch": 1.4415523324186594,
|
|
"grad_norm": 0.7301707863807678,
|
|
"learning_rate": 5.2987397180224795e-06,
|
|
"loss": 0.15617960691452026,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1150,
|
|
"token_acc": 0.9474308925933741,
|
|
"train_speed(iter/s)": 0.120508
|
|
},
|
|
{
|
|
"epoch": 1.4478243825950607,
|
|
"grad_norm": 0.7371909022331238,
|
|
"learning_rate": 5.265944086529714e-06,
|
|
"loss": 0.15231599807739257,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1155,
|
|
"token_acc": 0.9433844406587166,
|
|
"train_speed(iter/s)": 0.120639
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"grad_norm": 0.7134169340133667,
|
|
"learning_rate": 5.233136976953504e-06,
|
|
"loss": 0.158011531829834,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1160,
|
|
"token_acc": 0.9418867924528301,
|
|
"train_speed(iter/s)": 0.120795
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"eval_loss": 0.2147841602563858,
|
|
"eval_runtime": 29.9333,
|
|
"eval_samples_per_second": 17.205,
|
|
"eval_steps_per_second": 4.31,
|
|
"eval_token_acc": 0.927066083653697,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.4603684829478636,
|
|
"grad_norm": 0.7018805146217346,
|
|
"learning_rate": 5.200319805240884e-06,
|
|
"loss": 0.15690932273864747,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1165,
|
|
"token_acc": 0.9392478977732894,
|
|
"train_speed(iter/s)": 0.120417
|
|
},
|
|
{
|
|
"epoch": 1.4666405331242651,
|
|
"grad_norm": 0.7469993233680725,
|
|
"learning_rate": 5.167493987773175e-06,
|
|
"loss": 0.15955485105514527,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1170,
|
|
"token_acc": 0.9344873812438071,
|
|
"train_speed(iter/s)": 0.120557
|
|
},
|
|
{
|
|
"epoch": 1.4729125833006664,
|
|
"grad_norm": 0.7378620505332947,
|
|
"learning_rate": 5.134660941304838e-06,
|
|
"loss": 0.1497912287712097,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1175,
|
|
"token_acc": 0.9491378587597462,
|
|
"train_speed(iter/s)": 0.120684
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"grad_norm": 0.7320712208747864,
|
|
"learning_rate": 5.10182208290234e-06,
|
|
"loss": 0.15272881984710693,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1180,
|
|
"token_acc": 0.9514241554427025,
|
|
"train_speed(iter/s)": 0.120797
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"eval_loss": 0.21511444449424744,
|
|
"eval_runtime": 29.7909,
|
|
"eval_samples_per_second": 17.287,
|
|
"eval_steps_per_second": 4.33,
|
|
"eval_token_acc": 0.9274056621571201,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.4854566836534693,
|
|
"grad_norm": 0.647217333316803,
|
|
"learning_rate": 5.068978829882992e-06,
|
|
"loss": 0.15485861301422119,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1185,
|
|
"token_acc": 0.9385337002183685,
|
|
"train_speed(iter/s)": 0.120365
|
|
},
|
|
{
|
|
"epoch": 1.4917287338298706,
|
|
"grad_norm": 0.650230884552002,
|
|
"learning_rate": 5.036132599753771e-06,
|
|
"loss": 0.15730609893798828,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1190,
|
|
"token_acc": 0.9470712591523152,
|
|
"train_speed(iter/s)": 0.120508
|
|
},
|
|
{
|
|
"epoch": 1.4980007840062721,
|
|
"grad_norm": 0.704609751701355,
|
|
"learning_rate": 5.003284810150152e-06,
|
|
"loss": 0.14192657470703124,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1195,
|
|
"token_acc": 0.9487411800236114,
|
|
"train_speed(iter/s)": 0.120634
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"grad_norm": 0.6966667771339417,
|
|
"learning_rate": 4.970436878774907e-06,
|
|
"loss": 0.14936549663543702,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1200,
|
|
"token_acc": 0.940895846426327,
|
|
"train_speed(iter/s)": 0.120761
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"eval_loss": 0.2132187932729721,
|
|
"eval_runtime": 29.8173,
|
|
"eval_samples_per_second": 17.272,
|
|
"eval_steps_per_second": 4.326,
|
|
"eval_token_acc": 0.927678163425299,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.5105448843590747,
|
|
"grad_norm": 0.7017958760261536,
|
|
"learning_rate": 4.937590223336936e-06,
|
|
"loss": 0.15734946727752686,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1205,
|
|
"token_acc": 0.9369624490741228,
|
|
"train_speed(iter/s)": 0.120418
|
|
},
|
|
{
|
|
"epoch": 1.5168169345354763,
|
|
"grad_norm": 0.7165507674217224,
|
|
"learning_rate": 4.904746261490062e-06,
|
|
"loss": 0.15068832635879517,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1210,
|
|
"token_acc": 0.9456936989216113,
|
|
"train_speed(iter/s)": 0.120528
|
|
},
|
|
{
|
|
"epoch": 1.5230889847118778,
|
|
"grad_norm": 0.6853801012039185,
|
|
"learning_rate": 4.87190641077186e-06,
|
|
"loss": 0.15125684738159179,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1215,
|
|
"token_acc": 0.9438098534671744,
|
|
"train_speed(iter/s)": 0.120629
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"grad_norm": 0.6704487204551697,
|
|
"learning_rate": 4.8390720885424665e-06,
|
|
"loss": 0.14999151229858398,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1220,
|
|
"token_acc": 0.9518440275904198,
|
|
"train_speed(iter/s)": 0.120748
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"eval_loss": 0.21275770664215088,
|
|
"eval_runtime": 29.8668,
|
|
"eval_samples_per_second": 17.243,
|
|
"eval_steps_per_second": 4.319,
|
|
"eval_token_acc": 0.9274769317195668,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.5356330850646804,
|
|
"grad_norm": 0.6753378510475159,
|
|
"learning_rate": 4.806244711923408e-06,
|
|
"loss": 0.15547568798065187,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1225,
|
|
"token_acc": 0.9402999940240234,
|
|
"train_speed(iter/s)": 0.120393
|
|
},
|
|
{
|
|
"epoch": 1.541905135241082,
|
|
"grad_norm": 0.7050623893737793,
|
|
"learning_rate": 4.773425697736445e-06,
|
|
"loss": 0.14445589780807494,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1230,
|
|
"token_acc": 0.949685360241732,
|
|
"train_speed(iter/s)": 0.120511
|
|
},
|
|
{
|
|
"epoch": 1.5481771854174835,
|
|
"grad_norm": 0.7278842329978943,
|
|
"learning_rate": 4.7406164624424135e-06,
|
|
"loss": 0.14890639781951903,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1235,
|
|
"token_acc": 0.9443320079049501,
|
|
"train_speed(iter/s)": 0.120618
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"grad_norm": 0.7538560032844543,
|
|
"learning_rate": 4.707818422080094e-06,
|
|
"loss": 0.1574314832687378,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1240,
|
|
"token_acc": 0.9465081309868567,
|
|
"train_speed(iter/s)": 0.120754
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"eval_loss": 0.21373403072357178,
|
|
"eval_runtime": 29.905,
|
|
"eval_samples_per_second": 17.221,
|
|
"eval_steps_per_second": 4.314,
|
|
"eval_token_acc": 0.9275230473187971,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.5607212857702861,
|
|
"grad_norm": 0.7221870422363281,
|
|
"learning_rate": 4.675032992205099e-06,
|
|
"loss": 0.14533066749572754,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1245,
|
|
"token_acc": 0.940144099378882,
|
|
"train_speed(iter/s)": 0.120412
|
|
},
|
|
{
|
|
"epoch": 1.5669933359466874,
|
|
"grad_norm": 0.6934393048286438,
|
|
"learning_rate": 4.642261587828778e-06,
|
|
"loss": 0.1509866714477539,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1250,
|
|
"token_acc": 0.9452032867356739,
|
|
"train_speed(iter/s)": 0.120529
|
|
},
|
|
{
|
|
"epoch": 1.573265386123089,
|
|
"grad_norm": 0.7100276350975037,
|
|
"learning_rate": 4.609505623357135e-06,
|
|
"loss": 0.1503272294998169,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1255,
|
|
"token_acc": 0.945176036085127,
|
|
"train_speed(iter/s)": 0.12066
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"grad_norm": 0.7433052659034729,
|
|
"learning_rate": 4.576766512529799e-06,
|
|
"loss": 0.1667776346206665,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1260,
|
|
"token_acc": 0.9403739570010354,
|
|
"train_speed(iter/s)": 0.120774
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"eval_loss": 0.21276888251304626,
|
|
"eval_runtime": 29.8799,
|
|
"eval_samples_per_second": 17.236,
|
|
"eval_steps_per_second": 4.317,
|
|
"eval_token_acc": 0.9279380877118697,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.5858094864758918,
|
|
"grad_norm": 0.6793298125267029,
|
|
"learning_rate": 4.544045668358999e-06,
|
|
"loss": 0.1555434823036194,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1265,
|
|
"token_acc": 0.9374781493998369,
|
|
"train_speed(iter/s)": 0.120424
|
|
},
|
|
{
|
|
"epoch": 1.5920815366522931,
|
|
"grad_norm": 0.7161461710929871,
|
|
"learning_rate": 4.511344503068574e-06,
|
|
"loss": 0.15700291395187377,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1270,
|
|
"token_acc": 0.9427421933283598,
|
|
"train_speed(iter/s)": 0.120517
|
|
},
|
|
{
|
|
"epoch": 1.5983535868286947,
|
|
"grad_norm": 0.6421639919281006,
|
|
"learning_rate": 4.478664428033031e-06,
|
|
"loss": 0.1498015284538269,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1275,
|
|
"token_acc": 0.9436655491212029,
|
|
"train_speed(iter/s)": 0.120625
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"grad_norm": 0.7012119293212891,
|
|
"learning_rate": 4.446006853716628e-06,
|
|
"loss": 0.15100154876708985,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1280,
|
|
"token_acc": 0.9480080409356725,
|
|
"train_speed(iter/s)": 0.120734
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"eval_loss": 0.21206073462963104,
|
|
"eval_runtime": 29.6181,
|
|
"eval_samples_per_second": 17.388,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.9279967802927083,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.6108976871814975,
|
|
"grad_norm": 0.6183798909187317,
|
|
"learning_rate": 4.413373189612497e-06,
|
|
"loss": 0.14532687664031982,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1285,
|
|
"token_acc": 0.9389383520807664,
|
|
"train_speed(iter/s)": 0.120397
|
|
},
|
|
{
|
|
"epoch": 1.6171697373578988,
|
|
"grad_norm": 0.6965980529785156,
|
|
"learning_rate": 4.380764844181806e-06,
|
|
"loss": 0.15175777673721313,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1290,
|
|
"token_acc": 0.942019661331,
|
|
"train_speed(iter/s)": 0.120532
|
|
},
|
|
{
|
|
"epoch": 1.6234417875343001,
|
|
"grad_norm": 0.7467201352119446,
|
|
"learning_rate": 4.34818322479298e-06,
|
|
"loss": 0.1542289137840271,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1295,
|
|
"token_acc": 0.9416962545716651,
|
|
"train_speed(iter/s)": 0.120641
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"grad_norm": 0.7295259833335876,
|
|
"learning_rate": 4.315629737660956e-06,
|
|
"loss": 0.14708173274993896,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1300,
|
|
"token_acc": 0.9478447445877919,
|
|
"train_speed(iter/s)": 0.120737
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"eval_loss": 0.21190744638442993,
|
|
"eval_runtime": 29.8419,
|
|
"eval_samples_per_second": 17.258,
|
|
"eval_steps_per_second": 4.323,
|
|
"eval_token_acc": 0.928495667229836,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.6359858878871032,
|
|
"grad_norm": 0.7090888023376465,
|
|
"learning_rate": 4.283105787786482e-06,
|
|
"loss": 0.15199344158172606,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1305,
|
|
"token_acc": 0.9388616179391395,
|
|
"train_speed(iter/s)": 0.12039
|
|
},
|
|
{
|
|
"epoch": 1.6422579380635045,
|
|
"grad_norm": 0.6687735915184021,
|
|
"learning_rate": 4.250612778895492e-06,
|
|
"loss": 0.1566769599914551,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1310,
|
|
"token_acc": 0.9447847002229262,
|
|
"train_speed(iter/s)": 0.120504
|
|
},
|
|
{
|
|
"epoch": 1.6485299882399058,
|
|
"grad_norm": 0.7526585459709167,
|
|
"learning_rate": 4.218152113378513e-06,
|
|
"loss": 0.15292699337005616,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1315,
|
|
"token_acc": 0.9508290451686678,
|
|
"train_speed(iter/s)": 0.120624
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"grad_norm": 0.6574867367744446,
|
|
"learning_rate": 4.185725192230136e-06,
|
|
"loss": 0.1453101873397827,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1320,
|
|
"token_acc": 0.943889951905673,
|
|
"train_speed(iter/s)": 0.120725
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"eval_loss": 0.21127335727214813,
|
|
"eval_runtime": 29.9057,
|
|
"eval_samples_per_second": 17.221,
|
|
"eval_steps_per_second": 4.314,
|
|
"eval_token_acc": 0.9287597838436095,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.6610740885927089,
|
|
"grad_norm": 0.7367099523544312,
|
|
"learning_rate": 4.1533334149885594e-06,
|
|
"loss": 0.157798171043396,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1325,
|
|
"token_acc": 0.9389246418932946,
|
|
"train_speed(iter/s)": 0.1204
|
|
},
|
|
{
|
|
"epoch": 1.6673461387691102,
|
|
"grad_norm": 0.7515724897384644,
|
|
"learning_rate": 4.120978179675172e-06,
|
|
"loss": 0.149272882938385,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1330,
|
|
"token_acc": 0.9415829318651067,
|
|
"train_speed(iter/s)": 0.120496
|
|
},
|
|
{
|
|
"epoch": 1.6736181889455115,
|
|
"grad_norm": 0.7276756763458252,
|
|
"learning_rate": 4.088660882734228e-06,
|
|
"loss": 0.15989675521850585,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1335,
|
|
"token_acc": 0.9443810526931742,
|
|
"train_speed(iter/s)": 0.120605
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"grad_norm": 0.6809377670288086,
|
|
"learning_rate": 4.056382918972565e-06,
|
|
"loss": 0.150339674949646,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1340,
|
|
"token_acc": 0.9478320715760495,
|
|
"train_speed(iter/s)": 0.120703
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"eval_loss": 0.21092940866947174,
|
|
"eval_runtime": 29.83,
|
|
"eval_samples_per_second": 17.264,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9285711291194855,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.6861622892983144,
|
|
"grad_norm": 0.7006319165229797,
|
|
"learning_rate": 4.024145681499416e-06,
|
|
"loss": 0.14731377363204956,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1345,
|
|
"token_acc": 0.9404405572409874,
|
|
"train_speed(iter/s)": 0.120406
|
|
},
|
|
{
|
|
"epoch": 1.6924343394747159,
|
|
"grad_norm": 0.6811453700065613,
|
|
"learning_rate": 3.991950561666269e-06,
|
|
"loss": 0.14400005340576172,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1350,
|
|
"token_acc": 0.9514704326668783,
|
|
"train_speed(iter/s)": 0.120501
|
|
},
|
|
{
|
|
"epoch": 1.6987063896511172,
|
|
"grad_norm": 0.6884180307388306,
|
|
"learning_rate": 3.959798949006831e-06,
|
|
"loss": 0.1443554401397705,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1355,
|
|
"token_acc": 0.952190047945391,
|
|
"train_speed(iter/s)": 0.12061
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"grad_norm": 0.642373263835907,
|
|
"learning_rate": 3.927692231177053e-06,
|
|
"loss": 0.14928441047668456,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1360,
|
|
"token_acc": 0.9533662833875387,
|
|
"train_speed(iter/s)": 0.12072
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"eval_loss": 0.21123968064785004,
|
|
"eval_runtime": 29.8426,
|
|
"eval_samples_per_second": 17.257,
|
|
"eval_steps_per_second": 4.323,
|
|
"eval_token_acc": 0.9288436303876645,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.71125049000392,
|
|
"grad_norm": 0.673079252243042,
|
|
"learning_rate": 3.895631793895223e-06,
|
|
"loss": 0.14722020626068116,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1365,
|
|
"token_acc": 0.9392204906405309,
|
|
"train_speed(iter/s)": 0.120411
|
|
},
|
|
{
|
|
"epoch": 1.7175225401803216,
|
|
"grad_norm": 0.7413303256034851,
|
|
"learning_rate": 3.863619020882184e-06,
|
|
"loss": 0.1495545506477356,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1370,
|
|
"token_acc": 0.9497109224438773,
|
|
"train_speed(iter/s)": 0.120525
|
|
},
|
|
{
|
|
"epoch": 1.7237945903567229,
|
|
"grad_norm": 0.6686860918998718,
|
|
"learning_rate": 3.831655293801596e-06,
|
|
"loss": 0.15514848232269288,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1375,
|
|
"token_acc": 0.9489922206506365,
|
|
"train_speed(iter/s)": 0.120638
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"grad_norm": 0.6420913338661194,
|
|
"learning_rate": 3.7997419922003077e-06,
|
|
"loss": 0.15427151918411255,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1380,
|
|
"token_acc": 0.9402859545836838,
|
|
"train_speed(iter/s)": 0.120745
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"eval_loss": 0.2099025994539261,
|
|
"eval_runtime": 29.7426,
|
|
"eval_samples_per_second": 17.315,
|
|
"eval_steps_per_second": 4.337,
|
|
"eval_token_acc": 0.9287681684980149,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.7363386907095255,
|
|
"grad_norm": 0.6821540594100952,
|
|
"learning_rate": 3.7678804934488146e-06,
|
|
"loss": 0.158866024017334,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1385,
|
|
"token_acc": 0.9397809287559081,
|
|
"train_speed(iter/s)": 0.120441
|
|
},
|
|
{
|
|
"epoch": 1.742610740885927,
|
|
"grad_norm": 0.6591536998748779,
|
|
"learning_rate": 3.736072172681818e-06,
|
|
"loss": 0.1457535743713379,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1390,
|
|
"token_acc": 0.9491301798279906,
|
|
"train_speed(iter/s)": 0.120572
|
|
},
|
|
{
|
|
"epoch": 1.7488827910623286,
|
|
"grad_norm": 0.6923695802688599,
|
|
"learning_rate": 3.704318402738867e-06,
|
|
"loss": 0.14236855506896973,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1395,
|
|
"token_acc": 0.9516265603234001,
|
|
"train_speed(iter/s)": 0.120687
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"grad_norm": 0.6892858147621155,
|
|
"learning_rate": 3.672620554105111e-06,
|
|
"loss": 0.14654231071472168,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1400,
|
|
"token_acc": 0.9462526829555143,
|
|
"train_speed(iter/s)": 0.120781
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"eval_loss": 0.2089109718799591,
|
|
"eval_runtime": 29.8795,
|
|
"eval_samples_per_second": 17.236,
|
|
"eval_steps_per_second": 4.317,
|
|
"eval_token_acc": 0.9291622472550738,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.7614268914151312,
|
|
"grad_norm": 0.7279197573661804,
|
|
"learning_rate": 3.6409799948521473e-06,
|
|
"loss": 0.14290038347244263,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1405,
|
|
"token_acc": 0.9411003428074647,
|
|
"train_speed(iter/s)": 0.120448
|
|
},
|
|
{
|
|
"epoch": 1.7676989415915327,
|
|
"grad_norm": 0.7627122402191162,
|
|
"learning_rate": 3.6093980905789824e-06,
|
|
"loss": 0.16706535816192628,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1410,
|
|
"token_acc": 0.9413535575754067,
|
|
"train_speed(iter/s)": 0.120561
|
|
},
|
|
{
|
|
"epoch": 1.7739709917679343,
|
|
"grad_norm": 0.6972676515579224,
|
|
"learning_rate": 3.577876204353079e-06,
|
|
"loss": 0.1592485189437866,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1415,
|
|
"token_acc": 0.9439551849921834,
|
|
"train_speed(iter/s)": 0.120676
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"grad_norm": 0.6900568604469299,
|
|
"learning_rate": 3.5464156966515426e-06,
|
|
"loss": 0.14554288387298583,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1420,
|
|
"token_acc": 0.951461222546277,
|
|
"train_speed(iter/s)": 0.120765
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"eval_loss": 0.20876409113407135,
|
|
"eval_runtime": 29.5896,
|
|
"eval_samples_per_second": 17.405,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.9296234032473767,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.786515092120737,
|
|
"grad_norm": 0.684529721736908,
|
|
"learning_rate": 3.515017925302396e-06,
|
|
"loss": 0.14716337919235228,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1425,
|
|
"token_acc": 0.9401438678547113,
|
|
"train_speed(iter/s)": 0.120461
|
|
},
|
|
{
|
|
"epoch": 1.7927871422971384,
|
|
"grad_norm": 0.6805464029312134,
|
|
"learning_rate": 3.48368424542597e-06,
|
|
"loss": 0.16177623271942138,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1430,
|
|
"token_acc": 0.9364077811055218,
|
|
"train_speed(iter/s)": 0.12056
|
|
},
|
|
{
|
|
"epoch": 1.7990591924735397,
|
|
"grad_norm": 0.7333641648292542,
|
|
"learning_rate": 3.4524160093764288e-06,
|
|
"loss": 0.13987714052200317,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1435,
|
|
"token_acc": 0.94569744345486,
|
|
"train_speed(iter/s)": 0.120636
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"grad_norm": 0.6505182385444641,
|
|
"learning_rate": 3.421214566683395e-06,
|
|
"loss": 0.14928100109100342,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1440,
|
|
"token_acc": 0.9490582512161656,
|
|
"train_speed(iter/s)": 0.12074
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"eval_loss": 0.20936539769172668,
|
|
"eval_runtime": 29.7766,
|
|
"eval_samples_per_second": 17.295,
|
|
"eval_steps_per_second": 4.332,
|
|
"eval_token_acc": 0.9291370932918572,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.8116032928263426,
|
|
"grad_norm": 0.6788591742515564,
|
|
"learning_rate": 3.390081263993702e-06,
|
|
"loss": 0.1493847608566284,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1445,
|
|
"token_acc": 0.941620617599257,
|
|
"train_speed(iter/s)": 0.120458
|
|
},
|
|
{
|
|
"epoch": 1.817875343002744,
|
|
"grad_norm": 0.6276586055755615,
|
|
"learning_rate": 3.3590174450132828e-06,
|
|
"loss": 0.15320565700531005,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1450,
|
|
"token_acc": 0.9462449451184286,
|
|
"train_speed(iter/s)": 0.120567
|
|
},
|
|
{
|
|
"epoch": 1.8241473931791454,
|
|
"grad_norm": 0.7135562300682068,
|
|
"learning_rate": 3.3280244504491664e-06,
|
|
"loss": 0.15439343452453613,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1455,
|
|
"token_acc": 0.9476100611215954,
|
|
"train_speed(iter/s)": 0.120673
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"grad_norm": 0.7349167466163635,
|
|
"learning_rate": 3.297103617951618e-06,
|
|
"loss": 0.149544358253479,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1460,
|
|
"token_acc": 0.9508973838977895,
|
|
"train_speed(iter/s)": 0.120772
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"eval_loss": 0.20811545848846436,
|
|
"eval_runtime": 29.7337,
|
|
"eval_samples_per_second": 17.32,
|
|
"eval_steps_per_second": 4.339,
|
|
"eval_token_acc": 0.9293760559424142,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.8366914935319483,
|
|
"grad_norm": 0.6688914895057678,
|
|
"learning_rate": 3.2662562820564043e-06,
|
|
"loss": 0.147084379196167,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1465,
|
|
"token_acc": 0.9402785349655548,
|
|
"train_speed(iter/s)": 0.120466
|
|
},
|
|
{
|
|
"epoch": 1.8429635437083496,
|
|
"grad_norm": 0.7431237697601318,
|
|
"learning_rate": 3.2354837741271994e-06,
|
|
"loss": 0.15128002166748047,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1470,
|
|
"token_acc": 0.9480812641083521,
|
|
"train_speed(iter/s)": 0.120567
|
|
},
|
|
{
|
|
"epoch": 1.8492355938847511,
|
|
"grad_norm": 0.6432802677154541,
|
|
"learning_rate": 3.2047874222981134e-06,
|
|
"loss": 0.14261975288391113,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1475,
|
|
"token_acc": 0.9464294764583651,
|
|
"train_speed(iter/s)": 0.120663
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"grad_norm": 0.7087119221687317,
|
|
"learning_rate": 3.174168551416384e-06,
|
|
"loss": 0.1470237135887146,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1480,
|
|
"token_acc": 0.9525287905322931,
|
|
"train_speed(iter/s)": 0.120742
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"eval_loss": 0.20842251181602478,
|
|
"eval_runtime": 29.9521,
|
|
"eval_samples_per_second": 17.194,
|
|
"eval_steps_per_second": 4.307,
|
|
"eval_token_acc": 0.9292880170711564,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.861779694237554,
|
|
"grad_norm": 0.7013330459594727,
|
|
"learning_rate": 3.1436284829851883e-06,
|
|
"loss": 0.1440601348876953,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1485,
|
|
"token_acc": 0.94125851177291,
|
|
"train_speed(iter/s)": 0.120446
|
|
},
|
|
{
|
|
"epoch": 1.8680517444139553,
|
|
"grad_norm": 0.7456852793693542,
|
|
"learning_rate": 3.113168535106604e-06,
|
|
"loss": 0.15421888828277588,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1490,
|
|
"token_acc": 0.944170604009705,
|
|
"train_speed(iter/s)": 0.120552
|
|
},
|
|
{
|
|
"epoch": 1.8743237945903566,
|
|
"grad_norm": 0.7362022995948792,
|
|
"learning_rate": 3.08279002242473e-06,
|
|
"loss": 0.15221171379089354,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1495,
|
|
"token_acc": 0.9467418723959071,
|
|
"train_speed(iter/s)": 0.120646
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"grad_norm": 0.6779168844223022,
|
|
"learning_rate": 3.0524942560689387e-06,
|
|
"loss": 0.14756014347076415,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1500,
|
|
"token_acc": 0.9489157165213503,
|
|
"train_speed(iter/s)": 0.120733
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"eval_loss": 0.20791077613830566,
|
|
"eval_runtime": 29.8876,
|
|
"eval_samples_per_second": 17.231,
|
|
"eval_steps_per_second": 4.316,
|
|
"eval_token_acc": 0.9297491730634593,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.8868678949431597,
|
|
"grad_norm": 0.6737608909606934,
|
|
"learning_rate": 3.0222825435972948e-06,
|
|
"loss": 0.14706544876098632,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1505,
|
|
"token_acc": 0.9416515751653609,
|
|
"train_speed(iter/s)": 0.120438
|
|
},
|
|
{
|
|
"epoch": 1.893139945119561,
|
|
"grad_norm": 0.6713505983352661,
|
|
"learning_rate": 2.99215618894011e-06,
|
|
"loss": 0.14257076978683472,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1510,
|
|
"token_acc": 0.947631754503002,
|
|
"train_speed(iter/s)": 0.120524
|
|
},
|
|
{
|
|
"epoch": 1.8994119952959623,
|
|
"grad_norm": 0.7137247920036316,
|
|
"learning_rate": 2.9621164923436774e-06,
|
|
"loss": 0.14342806339263917,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1515,
|
|
"token_acc": 0.9520854223691699,
|
|
"train_speed(iter/s)": 0.120618
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"grad_norm": 0.6587111949920654,
|
|
"learning_rate": 2.9321647503141525e-06,
|
|
"loss": 0.14919731616973878,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1520,
|
|
"token_acc": 0.9465758429898736,
|
|
"train_speed(iter/s)": 0.120695
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"eval_loss": 0.20832034945487976,
|
|
"eval_runtime": 29.8838,
|
|
"eval_samples_per_second": 17.233,
|
|
"eval_steps_per_second": 4.317,
|
|
"eval_token_acc": 0.9296108262657684,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.9119560956487653,
|
|
"grad_norm": 0.6283432841300964,
|
|
"learning_rate": 2.902302255561585e-06,
|
|
"loss": 0.14435771703720093,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1525,
|
|
"token_acc": 0.9410489589892338,
|
|
"train_speed(iter/s)": 0.120411
|
|
},
|
|
{
|
|
"epoch": 1.9182281458251667,
|
|
"grad_norm": 0.6523663997650146,
|
|
"learning_rate": 2.87253029694414e-06,
|
|
"loss": 0.14620786905288696,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1530,
|
|
"token_acc": 0.9439097941523534,
|
|
"train_speed(iter/s)": 0.120487
|
|
},
|
|
{
|
|
"epoch": 1.924500196001568,
|
|
"grad_norm": 0.814400851726532,
|
|
"learning_rate": 2.8428501594124602e-06,
|
|
"loss": 0.14187668561935424,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1535,
|
|
"token_acc": 0.9552411118676178,
|
|
"train_speed(iter/s)": 0.120588
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"grad_norm": 0.680105447769165,
|
|
"learning_rate": 2.813263123954214e-06,
|
|
"loss": 0.14542250633239745,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1540,
|
|
"token_acc": 0.946710125341177,
|
|
"train_speed(iter/s)": 0.12067
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"eval_loss": 0.20705881714820862,
|
|
"eval_runtime": 29.9919,
|
|
"eval_samples_per_second": 17.171,
|
|
"eval_steps_per_second": 4.301,
|
|
"eval_token_acc": 0.9296569418649987,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.9370442963543708,
|
|
"grad_norm": 0.6705245971679688,
|
|
"learning_rate": 2.7837704675388045e-06,
|
|
"loss": 0.14242172241210938,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1545,
|
|
"token_acc": 0.9419143033907438,
|
|
"train_speed(iter/s)": 0.120382
|
|
},
|
|
{
|
|
"epoch": 1.9433163465307723,
|
|
"grad_norm": 0.6794357299804688,
|
|
"learning_rate": 2.7543734630622622e-06,
|
|
"loss": 0.14580047130584717,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1550,
|
|
"token_acc": 0.9519676920433064,
|
|
"train_speed(iter/s)": 0.120474
|
|
},
|
|
{
|
|
"epoch": 1.9495883967071737,
|
|
"grad_norm": 0.6641804575920105,
|
|
"learning_rate": 2.7250733792922997e-06,
|
|
"loss": 0.14899333715438842,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1555,
|
|
"token_acc": 0.9489447236180905,
|
|
"train_speed(iter/s)": 0.120542
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"grad_norm": 0.6625697016716003,
|
|
"learning_rate": 2.6958714808135546e-06,
|
|
"loss": 0.1446676015853882,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1560,
|
|
"token_acc": 0.9556168359941944,
|
|
"train_speed(iter/s)": 0.120652
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"eval_loss": 0.2073841542005539,
|
|
"eval_runtime": 29.9565,
|
|
"eval_samples_per_second": 17.192,
|
|
"eval_steps_per_second": 4.306,
|
|
"eval_token_acc": 0.929736596081851,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.9621324970599765,
|
|
"grad_norm": 0.6013507843017578,
|
|
"learning_rate": 2.6667690279730096e-06,
|
|
"loss": 0.1421922564506531,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1565,
|
|
"token_acc": 0.9436457058967458,
|
|
"train_speed(iter/s)": 0.120368
|
|
},
|
|
{
|
|
"epoch": 1.968404547236378,
|
|
"grad_norm": 0.7043313980102539,
|
|
"learning_rate": 2.6377672768256003e-06,
|
|
"loss": 0.1387406349182129,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1570,
|
|
"token_acc": 0.9493978394583139,
|
|
"train_speed(iter/s)": 0.12043
|
|
},
|
|
{
|
|
"epoch": 1.9746765974127793,
|
|
"grad_norm": 0.7223751544952393,
|
|
"learning_rate": 2.608867479080001e-06,
|
|
"loss": 0.14758012294769288,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1575,
|
|
"token_acc": 0.9447510837080315,
|
|
"train_speed(iter/s)": 0.120522
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"grad_norm": 0.7050609588623047,
|
|
"learning_rate": 2.5800708820446002e-06,
|
|
"loss": 0.14392924308776855,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1580,
|
|
"token_acc": 0.9467297587598339,
|
|
"train_speed(iter/s)": 0.120624
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"eval_loss": 0.20730111002922058,
|
|
"eval_runtime": 29.9348,
|
|
"eval_samples_per_second": 17.204,
|
|
"eval_steps_per_second": 4.309,
|
|
"eval_token_acc": 0.9300384436404493,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.987220697765582,
|
|
"grad_norm": 0.7090184092521667,
|
|
"learning_rate": 2.551378728573668e-06,
|
|
"loss": 0.140655517578125,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1585,
|
|
"token_acc": 0.9442207091256332,
|
|
"train_speed(iter/s)": 0.120333
|
|
},
|
|
{
|
|
"epoch": 1.9934927479419835,
|
|
"grad_norm": 0.6970275640487671,
|
|
"learning_rate": 2.5227922570137143e-06,
|
|
"loss": 0.15067524909973146,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1590,
|
|
"token_acc": 0.9442119257472057,
|
|
"train_speed(iter/s)": 0.120437
|
|
},
|
|
{
|
|
"epoch": 1.999764798118385,
|
|
"grad_norm": 0.6549662351608276,
|
|
"learning_rate": 2.4943127011500483e-06,
|
|
"loss": 0.1493726849555969,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1595,
|
|
"token_acc": 0.9526065156592507,
|
|
"train_speed(iter/s)": 0.120543
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"grad_norm": 0.6562784314155579,
|
|
"learning_rate": 2.465941290153514e-06,
|
|
"loss": 0.1286258101463318,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1600,
|
|
"token_acc": 0.9614165081272321,
|
|
"train_speed(iter/s)": 0.120686
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"eval_loss": 0.20796315371990204,
|
|
"eval_runtime": 29.931,
|
|
"eval_samples_per_second": 17.206,
|
|
"eval_steps_per_second": 4.31,
|
|
"eval_token_acc": 0.9298959045155556,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 2.0112896903175224,
|
|
"grad_norm": 0.6960498690605164,
|
|
"learning_rate": 2.4376792485274577e-06,
|
|
"loss": 0.1140947699546814,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1605,
|
|
"token_acc": 0.9474975268013971,
|
|
"train_speed(iter/s)": 0.120408
|
|
},
|
|
{
|
|
"epoch": 2.017561740493924,
|
|
"grad_norm": 0.6196739673614502,
|
|
"learning_rate": 2.409527796054863e-06,
|
|
"loss": 0.1102461576461792,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1610,
|
|
"token_acc": 0.9649468933272074,
|
|
"train_speed(iter/s)": 0.120505
|
|
},
|
|
{
|
|
"epoch": 2.0238337906703254,
|
|
"grad_norm": 0.6972460150718689,
|
|
"learning_rate": 2.38148814774572e-06,
|
|
"loss": 0.10172897577285767,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1615,
|
|
"token_acc": 0.9602301717784455,
|
|
"train_speed(iter/s)": 0.120575
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"grad_norm": 0.7768440842628479,
|
|
"learning_rate": 2.353561513784566e-06,
|
|
"loss": 0.10658919811248779,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1620,
|
|
"token_acc": 0.9643368583388412,
|
|
"train_speed(iter/s)": 0.120672
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"eval_loss": 0.2239648997783661,
|
|
"eval_runtime": 29.8719,
|
|
"eval_samples_per_second": 17.24,
|
|
"eval_steps_per_second": 4.318,
|
|
"eval_token_acc": 0.9290071311485719,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 2.036377891023128,
|
|
"grad_norm": 0.7098206281661987,
|
|
"learning_rate": 2.325749099478277e-06,
|
|
"loss": 0.10938189029693604,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1625,
|
|
"token_acc": 0.948818377439692,
|
|
"train_speed(iter/s)": 0.120401
|
|
},
|
|
{
|
|
"epoch": 2.04264994119953,
|
|
"grad_norm": 0.6591574549674988,
|
|
"learning_rate": 2.29805210520403e-06,
|
|
"loss": 0.10488543510437012,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1630,
|
|
"token_acc": 0.9647417816298272,
|
|
"train_speed(iter/s)": 0.120485
|
|
},
|
|
{
|
|
"epoch": 2.048921991375931,
|
|
"grad_norm": 0.6965081691741943,
|
|
"learning_rate": 2.270471726357501e-06,
|
|
"loss": 0.10199937820434571,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1635,
|
|
"token_acc": 0.9662853371466286,
|
|
"train_speed(iter/s)": 0.120557
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"grad_norm": 0.7288631200790405,
|
|
"learning_rate": 2.243009153301276e-06,
|
|
"loss": 0.10732921361923217,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1640,
|
|
"token_acc": 0.9629427346459488,
|
|
"train_speed(iter/s)": 0.120642
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"eval_loss": 0.2211890071630478,
|
|
"eval_runtime": 29.9921,
|
|
"eval_samples_per_second": 17.171,
|
|
"eval_steps_per_second": 4.301,
|
|
"eval_token_acc": 0.9286843219539599,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 2.0614660917287337,
|
|
"grad_norm": 0.6177812218666077,
|
|
"learning_rate": 2.215665571313468e-06,
|
|
"loss": 0.10446252822875976,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1645,
|
|
"token_acc": 0.946579760130165,
|
|
"train_speed(iter/s)": 0.120396
|
|
},
|
|
{
|
|
"epoch": 2.067738141905135,
|
|
"grad_norm": 0.6719108819961548,
|
|
"learning_rate": 2.188442160536562e-06,
|
|
"loss": 0.10937647819519043,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1650,
|
|
"token_acc": 0.9570199762322449,
|
|
"train_speed(iter/s)": 0.120466
|
|
},
|
|
{
|
|
"epoch": 2.074010192081537,
|
|
"grad_norm": 0.6592283844947815,
|
|
"learning_rate": 2.1613400959264845e-06,
|
|
"loss": 0.09818293452262879,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1655,
|
|
"token_acc": 0.9663418954827281,
|
|
"train_speed(iter/s)": 0.120538
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"grad_norm": 0.7156064510345459,
|
|
"learning_rate": 2.1343605472018954e-06,
|
|
"loss": 0.10150223970413208,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1660,
|
|
"token_acc": 0.9639677935587118,
|
|
"train_speed(iter/s)": 0.120626
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"eval_loss": 0.22235038876533508,
|
|
"eval_runtime": 29.8842,
|
|
"eval_samples_per_second": 17.233,
|
|
"eval_steps_per_second": 4.317,
|
|
"eval_token_acc": 0.9289903618397609,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 2.0865542924343394,
|
|
"grad_norm": 0.7405542731285095,
|
|
"learning_rate": 2.1075046787936842e-06,
|
|
"loss": 0.11414774656295776,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1665,
|
|
"token_acc": 0.9445764825060391,
|
|
"train_speed(iter/s)": 0.120361
|
|
},
|
|
{
|
|
"epoch": 2.0928263426107407,
|
|
"grad_norm": 0.6007137894630432,
|
|
"learning_rate": 2.0807736497947436e-06,
|
|
"loss": 0.1068692922592163,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1670,
|
|
"token_acc": 0.9613585407036289,
|
|
"train_speed(iter/s)": 0.120437
|
|
},
|
|
{
|
|
"epoch": 2.0990983927871425,
|
|
"grad_norm": 0.6333921551704407,
|
|
"learning_rate": 2.0541686139099164e-06,
|
|
"loss": 0.10767915248870849,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1675,
|
|
"token_acc": 0.9514025948814184,
|
|
"train_speed(iter/s)": 0.120521
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"grad_norm": 0.6882405877113342,
|
|
"learning_rate": 2.0276907194062167e-06,
|
|
"loss": 0.11104511022567749,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1680,
|
|
"token_acc": 0.9605332552007032,
|
|
"train_speed(iter/s)": 0.120608
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"eval_loss": 0.22228793799877167,
|
|
"eval_runtime": 29.9337,
|
|
"eval_samples_per_second": 17.205,
|
|
"eval_steps_per_second": 4.31,
|
|
"eval_token_acc": 0.9290364774389912,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 2.111642493139945,
|
|
"grad_norm": 0.7313436269760132,
|
|
"learning_rate": 2.0013411090632638e-06,
|
|
"loss": 0.1036494255065918,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1685,
|
|
"token_acc": 0.9467640326150703,
|
|
"train_speed(iter/s)": 0.120351
|
|
},
|
|
{
|
|
"epoch": 2.1179145433163464,
|
|
"grad_norm": 0.6357504725456238,
|
|
"learning_rate": 1.9751209201239696e-06,
|
|
"loss": 0.1004453420639038,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1690,
|
|
"token_acc": 0.9673980703392469,
|
|
"train_speed(iter/s)": 0.12045
|
|
},
|
|
{
|
|
"epoch": 2.1241865934927477,
|
|
"grad_norm": 0.6574280858039856,
|
|
"learning_rate": 1.9490312842454425e-06,
|
|
"loss": 0.09599907994270325,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1695,
|
|
"token_acc": 0.9642799567029778,
|
|
"train_speed(iter/s)": 0.120536
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"grad_norm": 0.6352968811988831,
|
|
"learning_rate": 1.9230733274501525e-06,
|
|
"loss": 0.10356111526489258,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1700,
|
|
"token_acc": 0.965252210367902,
|
|
"train_speed(iter/s)": 0.120605
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"eval_loss": 0.2226356714963913,
|
|
"eval_runtime": 29.9685,
|
|
"eval_samples_per_second": 17.185,
|
|
"eval_steps_per_second": 4.305,
|
|
"eval_token_acc": 0.9290155158029774,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 2.136730693845551,
|
|
"grad_norm": 0.6789947748184204,
|
|
"learning_rate": 1.8972481700773388e-06,
|
|
"loss": 0.10871880054473877,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1705,
|
|
"token_acc": 0.9450670361465422,
|
|
"train_speed(iter/s)": 0.120351
|
|
},
|
|
{
|
|
"epoch": 2.143002744021952,
|
|
"grad_norm": 0.6862888932228088,
|
|
"learning_rate": 1.8715569267346368e-06,
|
|
"loss": 0.10977823734283447,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1710,
|
|
"token_acc": 0.9672735959231341,
|
|
"train_speed(iter/s)": 0.120428
|
|
},
|
|
{
|
|
"epoch": 2.1492747941983534,
|
|
"grad_norm": 0.6545423865318298,
|
|
"learning_rate": 1.846000706249997e-06,
|
|
"loss": 0.10351777076721191,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1715,
|
|
"token_acc": 0.962293618920125,
|
|
"train_speed(iter/s)": 0.120499
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"grad_norm": 0.7650525569915771,
|
|
"learning_rate": 1.8205806116238055e-06,
|
|
"loss": 0.1088717222213745,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1720,
|
|
"token_acc": 0.9641661465130795,
|
|
"train_speed(iter/s)": 0.120563
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"eval_loss": 0.22163553535938263,
|
|
"eval_runtime": 29.9778,
|
|
"eval_samples_per_second": 17.179,
|
|
"eval_steps_per_second": 4.303,
|
|
"eval_token_acc": 0.9293844405968197,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 2.1618188945511565,
|
|
"grad_norm": 0.6473621129989624,
|
|
"learning_rate": 1.7952977399812988e-06,
|
|
"loss": 0.10216574668884278,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1725,
|
|
"token_acc": 0.9470272328316934,
|
|
"train_speed(iter/s)": 0.120306
|
|
},
|
|
{
|
|
"epoch": 2.168090944727558,
|
|
"grad_norm": 0.6527573466300964,
|
|
"learning_rate": 1.7701531825251888e-06,
|
|
"loss": 0.10740480422973633,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1730,
|
|
"token_acc": 0.9613802435723952,
|
|
"train_speed(iter/s)": 0.120405
|
|
},
|
|
{
|
|
"epoch": 2.174362994903959,
|
|
"grad_norm": 0.7024506330490112,
|
|
"learning_rate": 1.7451480244885938e-06,
|
|
"loss": 0.10878567695617676,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1735,
|
|
"token_acc": 0.9636387817528926,
|
|
"train_speed(iter/s)": 0.120492
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"grad_norm": 0.686829686164856,
|
|
"learning_rate": 1.720283345088178e-06,
|
|
"loss": 0.11087257862091064,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1740,
|
|
"token_acc": 0.9646302250803859,
|
|
"train_speed(iter/s)": 0.120584
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"eval_loss": 0.2214018851518631,
|
|
"eval_runtime": 29.9476,
|
|
"eval_samples_per_second": 17.197,
|
|
"eval_steps_per_second": 4.308,
|
|
"eval_token_acc": 0.9294934411040913,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 2.186907095256762,
|
|
"grad_norm": 0.6728172898292542,
|
|
"learning_rate": 1.695560217477582e-06,
|
|
"loss": 0.10692278146743775,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1745,
|
|
"token_acc": 0.9460528590459338,
|
|
"train_speed(iter/s)": 0.120324
|
|
},
|
|
{
|
|
"epoch": 2.1931791454331635,
|
|
"grad_norm": 0.6230509281158447,
|
|
"learning_rate": 1.6709797087011066e-06,
|
|
"loss": 0.10696847438812256,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1750,
|
|
"token_acc": 0.9615373673579699,
|
|
"train_speed(iter/s)": 0.120404
|
|
},
|
|
{
|
|
"epoch": 2.199451195609565,
|
|
"grad_norm": 0.6261888742446899,
|
|
"learning_rate": 1.6465428796476584e-06,
|
|
"loss": 0.10366283655166626,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1755,
|
|
"token_acc": 0.9615300546448088,
|
|
"train_speed(iter/s)": 0.120475
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"grad_norm": 0.7547042369842529,
|
|
"learning_rate": 1.6222507850049602e-06,
|
|
"loss": 0.1082529902458191,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1760,
|
|
"token_acc": 0.9618447339542474,
|
|
"train_speed(iter/s)": 0.120566
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"eval_loss": 0.22259920835494995,
|
|
"eval_runtime": 29.8466,
|
|
"eval_samples_per_second": 17.255,
|
|
"eval_steps_per_second": 4.322,
|
|
"eval_token_acc": 0.9292796324167508,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.211995295962368,
|
|
"grad_norm": 0.636020302772522,
|
|
"learning_rate": 1.598104473214031e-06,
|
|
"loss": 0.10505471229553223,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1765,
|
|
"token_acc": 0.9457782654231709,
|
|
"train_speed(iter/s)": 0.120309
|
|
},
|
|
{
|
|
"epoch": 2.218267346138769,
|
|
"grad_norm": 0.617948591709137,
|
|
"learning_rate": 1.5741049864239383e-06,
|
|
"loss": 0.10182752609252929,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1770,
|
|
"token_acc": 0.963300613814716,
|
|
"train_speed(iter/s)": 0.120392
|
|
},
|
|
{
|
|
"epoch": 2.2245393963151705,
|
|
"grad_norm": 0.7750356197357178,
|
|
"learning_rate": 1.550253360446815e-06,
|
|
"loss": 0.10825409889221191,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1775,
|
|
"token_acc": 0.9659604215960421,
|
|
"train_speed(iter/s)": 0.120481
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"grad_norm": 0.6845636367797852,
|
|
"learning_rate": 1.5265506247131617e-06,
|
|
"loss": 0.10921690464019776,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1780,
|
|
"token_acc": 0.9601528384279476,
|
|
"train_speed(iter/s)": 0.120568
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"eval_loss": 0.2214292734861374,
|
|
"eval_runtime": 29.9074,
|
|
"eval_samples_per_second": 17.22,
|
|
"eval_steps_per_second": 4.313,
|
|
"eval_token_acc": 0.9292964017255618,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.2370834966679736,
|
|
"grad_norm": 0.735072135925293,
|
|
"learning_rate": 1.5029978022274067e-06,
|
|
"loss": 0.11698575019836426,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1785,
|
|
"token_acc": 0.9456881099382594,
|
|
"train_speed(iter/s)": 0.120308
|
|
},
|
|
{
|
|
"epoch": 2.243355546844375,
|
|
"grad_norm": 0.6945551037788391,
|
|
"learning_rate": 1.47959590952376e-06,
|
|
"loss": 0.10342628955841064,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1790,
|
|
"token_acc": 0.9655268490374873,
|
|
"train_speed(iter/s)": 0.120393
|
|
},
|
|
{
|
|
"epoch": 2.249627597020776,
|
|
"grad_norm": 0.6537898778915405,
|
|
"learning_rate": 1.4563459566223358e-06,
|
|
"loss": 0.10419995784759521,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1795,
|
|
"token_acc": 0.9687678159779888,
|
|
"train_speed(iter/s)": 0.120479
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"grad_norm": 0.665179431438446,
|
|
"learning_rate": 1.4332489469855698e-06,
|
|
"loss": 0.10044981241226196,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1800,
|
|
"token_acc": 0.9632239107769107,
|
|
"train_speed(iter/s)": 0.120548
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"eval_loss": 0.22338801622390747,
|
|
"eval_runtime": 29.7434,
|
|
"eval_samples_per_second": 17.315,
|
|
"eval_steps_per_second": 4.337,
|
|
"eval_token_acc": 0.9293676712880087,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.262171697373579,
|
|
"grad_norm": 0.7259742617607117,
|
|
"learning_rate": 1.4103058774748923e-06,
|
|
"loss": 0.10409928560256958,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1805,
|
|
"token_acc": 0.9449998480658787,
|
|
"train_speed(iter/s)": 0.120311
|
|
},
|
|
{
|
|
"epoch": 2.2684437475499806,
|
|
"grad_norm": 0.6720697283744812,
|
|
"learning_rate": 1.3875177383077233e-06,
|
|
"loss": 0.11027616262435913,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1810,
|
|
"token_acc": 0.9617116015070705,
|
|
"train_speed(iter/s)": 0.120381
|
|
},
|
|
{
|
|
"epoch": 2.274715797726382,
|
|
"grad_norm": 0.6582772731781006,
|
|
"learning_rate": 1.3648855130147216e-06,
|
|
"loss": 0.10254979133605957,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1815,
|
|
"token_acc": 0.9641775983854692,
|
|
"train_speed(iter/s)": 0.120474
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"grad_norm": 0.6948631405830383,
|
|
"learning_rate": 1.3424101783973403e-06,
|
|
"loss": 0.10602834224700927,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1820,
|
|
"token_acc": 0.9628847951276872,
|
|
"train_speed(iter/s)": 0.120552
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"eval_loss": 0.22278502583503723,
|
|
"eval_runtime": 29.9808,
|
|
"eval_samples_per_second": 17.178,
|
|
"eval_steps_per_second": 4.303,
|
|
"eval_token_acc": 0.9293760559424142,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.2872598980791845,
|
|
"grad_norm": 0.6872897744178772,
|
|
"learning_rate": 1.3200927044856714e-06,
|
|
"loss": 0.11157424449920654,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1825,
|
|
"token_acc": 0.9470804190586105,
|
|
"train_speed(iter/s)": 0.120308
|
|
},
|
|
{
|
|
"epoch": 2.293531948255586,
|
|
"grad_norm": 0.7110899686813354,
|
|
"learning_rate": 1.2979340544965745e-06,
|
|
"loss": 0.10765695571899414,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1830,
|
|
"token_acc": 0.9619730551933942,
|
|
"train_speed(iter/s)": 0.120391
|
|
},
|
|
{
|
|
"epoch": 2.2998039984319876,
|
|
"grad_norm": 0.6805204749107361,
|
|
"learning_rate": 1.2759351847921053e-06,
|
|
"loss": 0.10896315574645996,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1835,
|
|
"token_acc": 0.9559441922637779,
|
|
"train_speed(iter/s)": 0.120473
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"grad_norm": 0.6619516015052795,
|
|
"learning_rate": 1.25409704483824e-06,
|
|
"loss": 0.1140247106552124,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1840,
|
|
"token_acc": 0.9612636720272272,
|
|
"train_speed(iter/s)": 0.12055
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"eval_loss": 0.2214568555355072,
|
|
"eval_runtime": 29.967,
|
|
"eval_samples_per_second": 17.186,
|
|
"eval_steps_per_second": 4.305,
|
|
"eval_token_acc": 0.9293173633615757,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.31234809878479,
|
|
"grad_norm": 0.6659355759620667,
|
|
"learning_rate": 1.232420577163902e-06,
|
|
"loss": 0.10365439653396606,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1845,
|
|
"token_acc": 0.9470177324019344,
|
|
"train_speed(iter/s)": 0.120308
|
|
},
|
|
{
|
|
"epoch": 2.3186201489611915,
|
|
"grad_norm": 0.6475389003753662,
|
|
"learning_rate": 1.2109067173202731e-06,
|
|
"loss": 0.10801565647125244,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1850,
|
|
"token_acc": 0.9615019262230197,
|
|
"train_speed(iter/s)": 0.12039
|
|
},
|
|
{
|
|
"epoch": 2.3248921991375933,
|
|
"grad_norm": 0.8018389344215393,
|
|
"learning_rate": 1.1895563938404203e-06,
|
|
"loss": 0.11211535930633545,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1855,
|
|
"token_acc": 0.9572030113563864,
|
|
"train_speed(iter/s)": 0.120471
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"grad_norm": 0.7461184859275818,
|
|
"learning_rate": 1.1683705281992202e-06,
|
|
"loss": 0.10777713060379028,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1860,
|
|
"token_acc": 0.959629618707794,
|
|
"train_speed(iter/s)": 0.120548
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"eval_loss": 0.2218623012304306,
|
|
"eval_runtime": 29.9356,
|
|
"eval_samples_per_second": 17.204,
|
|
"eval_steps_per_second": 4.309,
|
|
"eval_token_acc": 0.9295772876481464,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.337436299490396,
|
|
"grad_norm": 0.6567296385765076,
|
|
"learning_rate": 1.1473500347735927e-06,
|
|
"loss": 0.11489678621292114,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1865,
|
|
"token_acc": 0.9453161257195897,
|
|
"train_speed(iter/s)": 0.120343
|
|
},
|
|
{
|
|
"epoch": 2.343708349666797,
|
|
"grad_norm": 0.7275116443634033,
|
|
"learning_rate": 1.1264958208030224e-06,
|
|
"loss": 0.11094659566879272,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1870,
|
|
"token_acc": 0.9629886036851635,
|
|
"train_speed(iter/s)": 0.120428
|
|
},
|
|
{
|
|
"epoch": 2.349980399843199,
|
|
"grad_norm": 0.6866867542266846,
|
|
"learning_rate": 1.105808786350423e-06,
|
|
"loss": 0.11223549842834472,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1875,
|
|
"token_acc": 0.9640074018087519,
|
|
"train_speed(iter/s)": 0.120507
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"grad_norm": 0.7598003149032593,
|
|
"learning_rate": 1.085289824263273e-06,
|
|
"loss": 0.10983138084411621,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1880,
|
|
"token_acc": 0.9628211185993882,
|
|
"train_speed(iter/s)": 0.120591
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"eval_loss": 0.22093415260314941,
|
|
"eval_runtime": 29.9911,
|
|
"eval_samples_per_second": 17.172,
|
|
"eval_steps_per_second": 4.301,
|
|
"eval_token_acc": 0.9296737111738097,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.3625245001960016,
|
|
"grad_norm": 0.7150808572769165,
|
|
"learning_rate": 1.0649398201350907e-06,
|
|
"loss": 0.10479578971862794,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1885,
|
|
"token_acc": 0.9480835490841136,
|
|
"train_speed(iter/s)": 0.120353
|
|
},
|
|
{
|
|
"epoch": 2.368796550372403,
|
|
"grad_norm": 0.6622815728187561,
|
|
"learning_rate": 1.044759652267207e-06,
|
|
"loss": 0.10107295513153076,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1890,
|
|
"token_acc": 0.9652460603127552,
|
|
"train_speed(iter/s)": 0.120416
|
|
},
|
|
{
|
|
"epoch": 2.375068600548804,
|
|
"grad_norm": 0.6194722056388855,
|
|
"learning_rate": 1.024750191630864e-06,
|
|
"loss": 0.10245490074157715,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1895,
|
|
"token_acc": 0.9643746110765401,
|
|
"train_speed(iter/s)": 0.120494
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"grad_norm": 0.6570760011672974,
|
|
"learning_rate": 1.0049123018296158e-06,
|
|
"loss": 0.10547176599502564,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1900,
|
|
"token_acc": 0.9632573448738266,
|
|
"train_speed(iter/s)": 0.120572
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"eval_loss": 0.22135132551193237,
|
|
"eval_runtime": 29.9894,
|
|
"eval_samples_per_second": 17.173,
|
|
"eval_steps_per_second": 4.302,
|
|
"eval_token_acc": 0.9297952886626896,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.3876127009016073,
|
|
"grad_norm": 0.6823681592941284,
|
|
"learning_rate": 9.852468390620624e-07,
|
|
"loss": 0.11187875270843506,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1905,
|
|
"token_acc": 0.9470449919974708,
|
|
"train_speed(iter/s)": 0.120341
|
|
},
|
|
{
|
|
"epoch": 2.3938847510780086,
|
|
"grad_norm": 0.6770759224891663,
|
|
"learning_rate": 9.65754652084896e-07,
|
|
"loss": 0.1058814525604248,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1910,
|
|
"token_acc": 0.9609285414627324,
|
|
"train_speed(iter/s)": 0.120413
|
|
},
|
|
{
|
|
"epoch": 2.40015680125441,
|
|
"grad_norm": 0.6467224955558777,
|
|
"learning_rate": 9.464365821762611e-07,
|
|
"loss": 0.10833286046981812,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1915,
|
|
"token_acc": 0.9618701158717327,
|
|
"train_speed(iter/s)": 0.120485
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"grad_norm": 0.7155383825302124,
|
|
"learning_rate": 9.272934630994579e-07,
|
|
"loss": 0.1067124843597412,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1920,
|
|
"token_acc": 0.9663948320886975,
|
|
"train_speed(iter/s)": 0.120568
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"eval_loss": 0.22103355824947357,
|
|
"eval_runtime": 29.9865,
|
|
"eval_samples_per_second": 17.174,
|
|
"eval_steps_per_second": 4.302,
|
|
"eval_token_acc": 0.929770134699473,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.412700901607213,
|
|
"grad_norm": 0.6960355639457703,
|
|
"learning_rate": 9.083261210669458e-07,
|
|
"loss": 0.10286239385604859,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1925,
|
|
"token_acc": 0.9472781065088758,
|
|
"train_speed(iter/s)": 0.12034
|
|
},
|
|
{
|
|
"epoch": 2.4189729517836143,
|
|
"grad_norm": 0.8168506622314453,
|
|
"learning_rate": 8.895353747046903e-07,
|
|
"loss": 0.10974031686782837,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1930,
|
|
"token_acc": 0.9560156270763016,
|
|
"train_speed(iter/s)": 0.120426
|
|
},
|
|
{
|
|
"epoch": 2.4252450019600156,
|
|
"grad_norm": 0.7116117477416992,
|
|
"learning_rate": 8.70922035016829e-07,
|
|
"loss": 0.11615951061248779,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1935,
|
|
"token_acc": 0.9588699861295893,
|
|
"train_speed(iter/s)": 0.120504
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"grad_norm": 0.793286919593811,
|
|
"learning_rate": 8.524869053506718e-07,
|
|
"loss": 0.11020160913467407,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1940,
|
|
"token_acc": 0.9633706189410888,
|
|
"train_speed(iter/s)": 0.120586
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"eval_loss": 0.22128398716449738,
|
|
"eval_runtime": 29.8622,
|
|
"eval_samples_per_second": 17.246,
|
|
"eval_steps_per_second": 4.32,
|
|
"eval_token_acc": 0.9298959045155556,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.4377891023128186,
|
|
"grad_norm": 0.6766318678855896,
|
|
"learning_rate": 8.342307813620254e-07,
|
|
"loss": 0.10068587064743043,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1945,
|
|
"token_acc": 0.9491064989973427,
|
|
"train_speed(iter/s)": 0.120381
|
|
},
|
|
{
|
|
"epoch": 2.44406115248922,
|
|
"grad_norm": 0.6612719893455505,
|
|
"learning_rate": 8.161544509808522e-07,
|
|
"loss": 0.10740329027175903,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1950,
|
|
"token_acc": 0.9598074812125306,
|
|
"train_speed(iter/s)": 0.120452
|
|
},
|
|
{
|
|
"epoch": 2.4503332026656213,
|
|
"grad_norm": 0.7400087714195251,
|
|
"learning_rate": 7.982586943772663e-07,
|
|
"loss": 0.1041949987411499,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1955,
|
|
"token_acc": 0.9606271261647685,
|
|
"train_speed(iter/s)": 0.120526
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"grad_norm": 0.6745719909667969,
|
|
"learning_rate": 7.805442839278643e-07,
|
|
"loss": 0.10791645050048829,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1960,
|
|
"token_acc": 0.9610862521215862,
|
|
"train_speed(iter/s)": 0.120595
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"eval_loss": 0.22153809666633606,
|
|
"eval_runtime": 29.8544,
|
|
"eval_samples_per_second": 17.25,
|
|
"eval_steps_per_second": 4.321,
|
|
"eval_token_acc": 0.9296988651370263,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.4628773030184243,
|
|
"grad_norm": 0.6609322428703308,
|
|
"learning_rate": 7.630119841823808e-07,
|
|
"loss": 0.10820503234863281,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1965,
|
|
"token_acc": 0.9463575963963611,
|
|
"train_speed(iter/s)": 0.120382
|
|
},
|
|
{
|
|
"epoch": 2.4691493531948256,
|
|
"grad_norm": 0.6629140973091125,
|
|
"learning_rate": 7.456625518306976e-07,
|
|
"loss": 0.10982118844985962,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1970,
|
|
"token_acc": 0.9646770143802785,
|
|
"train_speed(iter/s)": 0.120462
|
|
},
|
|
{
|
|
"epoch": 2.475421403371227,
|
|
"grad_norm": 0.6865978837013245,
|
|
"learning_rate": 7.284967356701839e-07,
|
|
"loss": 0.10275110006332397,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1975,
|
|
"token_acc": 0.9637741118063815,
|
|
"train_speed(iter/s)": 0.120509
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"grad_norm": 0.6618251800537109,
|
|
"learning_rate": 7.115152765733768e-07,
|
|
"loss": 0.10197668075561524,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1980,
|
|
"token_acc": 0.9611416209019804,
|
|
"train_speed(iter/s)": 0.120592
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"eval_loss": 0.22130271792411804,
|
|
"eval_runtime": 29.9692,
|
|
"eval_samples_per_second": 17.184,
|
|
"eval_steps_per_second": 4.304,
|
|
"eval_token_acc": 0.9297156344458373,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.4879655037240296,
|
|
"grad_norm": 0.6805600523948669,
|
|
"learning_rate": 6.94718907456009e-07,
|
|
"loss": 0.11028853654861451,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1985,
|
|
"token_acc": 0.9475056321262383,
|
|
"train_speed(iter/s)": 0.120373
|
|
},
|
|
{
|
|
"epoch": 2.4942375539004313,
|
|
"grad_norm": 0.7021499276161194,
|
|
"learning_rate": 6.781083532453702e-07,
|
|
"loss": 0.10008060932159424,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1990,
|
|
"token_acc": 0.958559067450638,
|
|
"train_speed(iter/s)": 0.120444
|
|
},
|
|
{
|
|
"epoch": 2.5005096040768326,
|
|
"grad_norm": 0.68918377161026,
|
|
"learning_rate": 6.61684330849025e-07,
|
|
"loss": 0.10927926301956177,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1995,
|
|
"token_acc": 0.9565252438401806,
|
|
"train_speed(iter/s)": 0.120521
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"grad_norm": 0.6521994471549988,
|
|
"learning_rate": 6.454475491238682e-07,
|
|
"loss": 0.11399447917938232,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2000,
|
|
"token_acc": 0.9624060150375939,
|
|
"train_speed(iter/s)": 0.1206
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"eval_loss": 0.2205990105867386,
|
|
"eval_runtime": 30.0032,
|
|
"eval_samples_per_second": 17.165,
|
|
"eval_steps_per_second": 4.3,
|
|
"eval_token_acc": 0.9299881357140162,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.5130537044296353,
|
|
"grad_norm": 0.6786354184150696,
|
|
"learning_rate": 6.293987088455355e-07,
|
|
"loss": 0.10214885473251342,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2005,
|
|
"token_acc": 0.9475616708376412,
|
|
"train_speed(iter/s)": 0.120376
|
|
},
|
|
{
|
|
"epoch": 2.5193257546060366,
|
|
"grad_norm": 0.6837747097015381,
|
|
"learning_rate": 6.135385026781476e-07,
|
|
"loss": 0.10503888130187988,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2010,
|
|
"token_acc": 0.9636853327348222,
|
|
"train_speed(iter/s)": 0.120453
|
|
},
|
|
{
|
|
"epoch": 2.5255978047824383,
|
|
"grad_norm": 0.6747323274612427,
|
|
"learning_rate": 5.978676151444285e-07,
|
|
"loss": 0.10235412120819092,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2015,
|
|
"token_acc": 0.9618530311543985,
|
|
"train_speed(iter/s)": 0.120526
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"grad_norm": 0.6173009872436523,
|
|
"learning_rate": 5.823867225961516e-07,
|
|
"loss": 0.10736865997314453,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2020,
|
|
"token_acc": 0.9654360340644179,
|
|
"train_speed(iter/s)": 0.120592
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"eval_loss": 0.2209625095129013,
|
|
"eval_runtime": 29.8886,
|
|
"eval_samples_per_second": 17.231,
|
|
"eval_steps_per_second": 4.316,
|
|
"eval_token_acc": 0.9297785193538786,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.538141905135241,
|
|
"grad_norm": 0.7332006096839905,
|
|
"learning_rate": 5.670964931849521e-07,
|
|
"loss": 0.10466567277908326,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2025,
|
|
"token_acc": 0.9465066273634904,
|
|
"train_speed(iter/s)": 0.120388
|
|
},
|
|
{
|
|
"epoch": 2.5444139553116427,
|
|
"grad_norm": 0.7321441769599915,
|
|
"learning_rate": 5.519975868334914e-07,
|
|
"loss": 0.09656277894973755,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2030,
|
|
"token_acc": 0.9659533350385086,
|
|
"train_speed(iter/s)": 0.12046
|
|
},
|
|
{
|
|
"epoch": 2.550686005488044,
|
|
"grad_norm": 0.6885952949523926,
|
|
"learning_rate": 5.370906552069721e-07,
|
|
"loss": 0.11789785623550415,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2035,
|
|
"token_acc": 0.9600053756215562,
|
|
"train_speed(iter/s)": 0.120546
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"grad_norm": 0.6895261406898499,
|
|
"learning_rate": 5.22376341685013e-07,
|
|
"loss": 0.10133184194564819,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2040,
|
|
"token_acc": 0.9643827639751553,
|
|
"train_speed(iter/s)": 0.120608
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"eval_loss": 0.2212093323469162,
|
|
"eval_runtime": 29.9992,
|
|
"eval_samples_per_second": 17.167,
|
|
"eval_steps_per_second": 4.3,
|
|
"eval_token_acc": 0.9297617500450676,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.5632301058408467,
|
|
"grad_norm": 0.6516171097755432,
|
|
"learning_rate": 5.07855281333881e-07,
|
|
"loss": 0.11106686592102051,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2045,
|
|
"token_acc": 0.9468649356358528,
|
|
"train_speed(iter/s)": 0.120396
|
|
},
|
|
{
|
|
"epoch": 2.569502156017248,
|
|
"grad_norm": 0.682096004486084,
|
|
"learning_rate": 4.935281008790843e-07,
|
|
"loss": 0.10403594970703126,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2050,
|
|
"token_acc": 0.9631915123957995,
|
|
"train_speed(iter/s)": 0.120471
|
|
},
|
|
{
|
|
"epoch": 2.5757742061936497,
|
|
"grad_norm": 0.641323983669281,
|
|
"learning_rate": 4.793954186783195e-07,
|
|
"loss": 0.10982873439788818,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2055,
|
|
"token_acc": 0.9590548445010714,
|
|
"train_speed(iter/s)": 0.120551
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"grad_norm": 0.6976042985916138,
|
|
"learning_rate": 4.6545784469478386e-07,
|
|
"loss": 0.09905983209609985,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2060,
|
|
"token_acc": 0.9606581714709885,
|
|
"train_speed(iter/s)": 0.120623
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"eval_loss": 0.2210971564054489,
|
|
"eval_runtime": 29.7693,
|
|
"eval_samples_per_second": 17.3,
|
|
"eval_steps_per_second": 4.333,
|
|
"eval_token_acc": 0.9296611341922014,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.5883183065464523,
|
|
"grad_norm": 0.6458984613418579,
|
|
"learning_rate": 4.5171598047085153e-07,
|
|
"loss": 0.10688018798828125,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2065,
|
|
"token_acc": 0.9488463005339436,
|
|
"train_speed(iter/s)": 0.120424
|
|
},
|
|
{
|
|
"epoch": 2.5945903567228537,
|
|
"grad_norm": 0.7071846723556519,
|
|
"learning_rate": 4.381704191021119e-07,
|
|
"loss": 0.10872792005538941,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2070,
|
|
"token_acc": 0.9589340920905037,
|
|
"train_speed(iter/s)": 0.120501
|
|
},
|
|
{
|
|
"epoch": 2.600862406899255,
|
|
"grad_norm": 0.7648762464523315,
|
|
"learning_rate": 4.248217452117653e-07,
|
|
"loss": 0.10998923778533935,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2075,
|
|
"token_acc": 0.9643058531634149,
|
|
"train_speed(iter/s)": 0.120567
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"grad_norm": 0.7118704319000244,
|
|
"learning_rate": 4.1167053492540023e-07,
|
|
"loss": 0.11142784357070923,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2080,
|
|
"token_acc": 0.9593960300853928,
|
|
"train_speed(iter/s)": 0.120634
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"eval_loss": 0.22096213698387146,
|
|
"eval_runtime": 29.7839,
|
|
"eval_samples_per_second": 17.291,
|
|
"eval_steps_per_second": 4.331,
|
|
"eval_token_acc": 0.9297827116810813,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.613406507252058,
|
|
"grad_norm": 0.7053963541984558,
|
|
"learning_rate": 3.987173558461199e-07,
|
|
"loss": 0.10944682359695435,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2085,
|
|
"token_acc": 0.94411167563964,
|
|
"train_speed(iter/s)": 0.120434
|
|
},
|
|
{
|
|
"epoch": 2.6196785574284593,
|
|
"grad_norm": 0.6822431087493896,
|
|
"learning_rate": 3.8596276703004974e-07,
|
|
"loss": 0.10377117395401,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2090,
|
|
"token_acc": 0.9615926525074362,
|
|
"train_speed(iter/s)": 0.1205
|
|
},
|
|
{
|
|
"epoch": 2.625950607604861,
|
|
"grad_norm": 0.6763447523117065,
|
|
"learning_rate": 3.7340731896220393e-07,
|
|
"loss": 0.10526052713394166,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2095,
|
|
"token_acc": 0.9638053139407766,
|
|
"train_speed(iter/s)": 0.120571
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"grad_norm": 0.8038213849067688,
|
|
"learning_rate": 3.6105155353273305e-07,
|
|
"loss": 0.10737766027450561,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2100,
|
|
"token_acc": 0.960337552742616,
|
|
"train_speed(iter/s)": 0.120622
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"eval_loss": 0.22071143984794617,
|
|
"eval_runtime": 29.9597,
|
|
"eval_samples_per_second": 17.19,
|
|
"eval_steps_per_second": 4.306,
|
|
"eval_token_acc": 0.9298539812435281,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.6384947079576637,
|
|
"grad_norm": 0.7455542087554932,
|
|
"learning_rate": 3.488960040135303e-07,
|
|
"loss": 0.10756160020828247,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2105,
|
|
"token_acc": 0.9456971683355857,
|
|
"train_speed(iter/s)": 0.120413
|
|
},
|
|
{
|
|
"epoch": 2.644766758134065,
|
|
"grad_norm": 0.6782827377319336,
|
|
"learning_rate": 3.369411950352175e-07,
|
|
"loss": 0.10511963367462158,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2110,
|
|
"token_acc": 0.9645807367902379,
|
|
"train_speed(iter/s)": 0.120483
|
|
},
|
|
{
|
|
"epoch": 2.6510388083104663,
|
|
"grad_norm": 0.6251741051673889,
|
|
"learning_rate": 3.251876425645051e-07,
|
|
"loss": 0.10916777849197387,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2115,
|
|
"token_acc": 0.9620457248579358,
|
|
"train_speed(iter/s)": 0.120548
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"grad_norm": 0.7826245427131653,
|
|
"learning_rate": 3.136358538819162e-07,
|
|
"loss": 0.11334476470947266,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2120,
|
|
"token_acc": 0.9589237920833921,
|
|
"train_speed(iter/s)": 0.120628
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"eval_loss": 0.22062310576438904,
|
|
"eval_runtime": 29.8394,
|
|
"eval_samples_per_second": 17.259,
|
|
"eval_steps_per_second": 4.323,
|
|
"eval_token_acc": 0.930025866658841,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.6635829086632694,
|
|
"grad_norm": 0.7201940417289734,
|
|
"learning_rate": 3.0228632755990197e-07,
|
|
"loss": 0.1089336633682251,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2125,
|
|
"token_acc": 0.9453159041394336,
|
|
"train_speed(iter/s)": 0.120428
|
|
},
|
|
{
|
|
"epoch": 2.6698549588396707,
|
|
"grad_norm": 0.7320123910903931,
|
|
"learning_rate": 2.911395534413147e-07,
|
|
"loss": 0.10883692502975464,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2130,
|
|
"token_acc": 0.9583949549348438,
|
|
"train_speed(iter/s)": 0.120498
|
|
},
|
|
{
|
|
"epoch": 2.676127009016072,
|
|
"grad_norm": 0.6891148686408997,
|
|
"learning_rate": 2.8019601261827123e-07,
|
|
"loss": 0.1058511734008789,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2135,
|
|
"token_acc": 0.964820651358247,
|
|
"train_speed(iter/s)": 0.120566
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"grad_norm": 0.665065586566925,
|
|
"learning_rate": 2.694561774113863e-07,
|
|
"loss": 0.10193836688995361,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2140,
|
|
"token_acc": 0.9663121444471927,
|
|
"train_speed(iter/s)": 0.120643
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"eval_loss": 0.22056354582309723,
|
|
"eval_runtime": 29.9765,
|
|
"eval_samples_per_second": 17.18,
|
|
"eval_steps_per_second": 4.303,
|
|
"eval_token_acc": 0.9299629817507997,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.688671109368875,
|
|
"grad_norm": 0.746462345123291,
|
|
"learning_rate": 2.5892051134939256e-07,
|
|
"loss": 0.10777335166931153,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2145,
|
|
"token_acc": 0.9450254900930257,
|
|
"train_speed(iter/s)": 0.120438
|
|
},
|
|
{
|
|
"epoch": 2.6949431595452764,
|
|
"grad_norm": 0.7365299463272095,
|
|
"learning_rate": 2.485894691491253e-07,
|
|
"loss": 0.10137251615524293,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2150,
|
|
"token_acc": 0.9649379303011776,
|
|
"train_speed(iter/s)": 0.120509
|
|
},
|
|
{
|
|
"epoch": 2.7012152097216777,
|
|
"grad_norm": 0.6272339224815369,
|
|
"learning_rate": 2.384634966959076e-07,
|
|
"loss": 0.10637471675872803,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2155,
|
|
"token_acc": 0.9610995993921813,
|
|
"train_speed(iter/s)": 0.120579
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"grad_norm": 0.7295854091644287,
|
|
"learning_rate": 2.2854303102429808e-07,
|
|
"loss": 0.10675235986709594,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2160,
|
|
"token_acc": 0.961855927963982,
|
|
"train_speed(iter/s)": 0.120641
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"eval_loss": 0.22078193724155426,
|
|
"eval_runtime": 29.9825,
|
|
"eval_samples_per_second": 17.177,
|
|
"eval_steps_per_second": 4.303,
|
|
"eval_token_acc": 0.9299420201147859,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.7137593100744803,
|
|
"grad_norm": 0.7343592643737793,
|
|
"learning_rate": 2.1882850029923463e-07,
|
|
"loss": 0.1030248761177063,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2165,
|
|
"token_acc": 0.9477623684469362,
|
|
"train_speed(iter/s)": 0.120452
|
|
},
|
|
{
|
|
"epoch": 2.720031360250882,
|
|
"grad_norm": 0.6812222599983215,
|
|
"learning_rate": 2.093203237975483e-07,
|
|
"loss": 0.10615785121917724,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2170,
|
|
"token_acc": 0.9640974343723809,
|
|
"train_speed(iter/s)": 0.120515
|
|
},
|
|
{
|
|
"epoch": 2.7263034104272834,
|
|
"grad_norm": 0.6542791128158569,
|
|
"learning_rate": 2.0001891188987265e-07,
|
|
"loss": 0.10438240766525268,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2175,
|
|
"token_acc": 0.9666564149879543,
|
|
"train_speed(iter/s)": 0.120568
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"grad_norm": 0.6803576946258545,
|
|
"learning_rate": 1.9092466602293247e-07,
|
|
"loss": 0.11247079372406006,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2180,
|
|
"token_acc": 0.964954353454149,
|
|
"train_speed(iter/s)": 0.120636
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"eval_loss": 0.2204427272081375,
|
|
"eval_runtime": 29.9646,
|
|
"eval_samples_per_second": 17.187,
|
|
"eval_steps_per_second": 4.305,
|
|
"eval_token_acc": 0.930025866658841,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.7388475107800865,
|
|
"grad_norm": 0.7342873215675354,
|
|
"learning_rate": 1.8203797870221197e-07,
|
|
"loss": 0.10811096429824829,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2185,
|
|
"token_acc": 0.9444978916772424,
|
|
"train_speed(iter/s)": 0.120421
|
|
},
|
|
{
|
|
"epoch": 2.745119560956488,
|
|
"grad_norm": 0.6856096982955933,
|
|
"learning_rate": 1.7335923347502003e-07,
|
|
"loss": 0.10507526397705078,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2190,
|
|
"token_acc": 0.9626754255001493,
|
|
"train_speed(iter/s)": 0.12049
|
|
},
|
|
{
|
|
"epoch": 2.751391611132889,
|
|
"grad_norm": 0.7308098077774048,
|
|
"learning_rate": 1.6488880491393467e-07,
|
|
"loss": 0.10211585760116577,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2195,
|
|
"token_acc": 0.959207675642216,
|
|
"train_speed(iter/s)": 0.120553
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"grad_norm": 0.7188462018966675,
|
|
"learning_rate": 1.5662705860063465e-07,
|
|
"loss": 0.10453490018844605,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2200,
|
|
"token_acc": 0.9691890125907868,
|
|
"train_speed(iter/s)": 0.120613
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"eval_loss": 0.22036312520503998,
|
|
"eval_runtime": 29.8702,
|
|
"eval_samples_per_second": 17.241,
|
|
"eval_steps_per_second": 4.319,
|
|
"eval_token_acc": 0.9299294431331777,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.7639357114856917,
|
|
"grad_norm": 0.6238301396369934,
|
|
"learning_rate": 1.485743511101234e-07,
|
|
"loss": 0.10971046686172485,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2205,
|
|
"token_acc": 0.9466987384026312,
|
|
"train_speed(iter/s)": 0.120416
|
|
},
|
|
{
|
|
"epoch": 2.7702077616620935,
|
|
"grad_norm": 0.6658957600593567,
|
|
"learning_rate": 1.4073102999534017e-07,
|
|
"loss": 0.11103521585464478,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2210,
|
|
"token_acc": 0.9630309852479412,
|
|
"train_speed(iter/s)": 0.120481
|
|
},
|
|
{
|
|
"epoch": 2.776479811838495,
|
|
"grad_norm": 0.7214736938476562,
|
|
"learning_rate": 1.3309743377215468e-07,
|
|
"loss": 0.10242490768432617,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2215,
|
|
"token_acc": 0.9676976699508741,
|
|
"train_speed(iter/s)": 0.120536
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"grad_norm": 0.6873740553855896,
|
|
"learning_rate": 1.2567389190476287e-07,
|
|
"loss": 0.11063306331634522,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2220,
|
|
"token_acc": 0.962315525785547,
|
|
"train_speed(iter/s)": 0.120607
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"eval_loss": 0.22046540677547455,
|
|
"eval_runtime": 29.9929,
|
|
"eval_samples_per_second": 17.171,
|
|
"eval_steps_per_second": 4.301,
|
|
"eval_token_acc": 0.9297994809898923,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.7890239121912974,
|
|
"grad_norm": 0.6654419898986816,
|
|
"learning_rate": 1.1846072479146431e-07,
|
|
"loss": 0.09902162551879883,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2225,
|
|
"token_acc": 0.948145285935085,
|
|
"train_speed(iter/s)": 0.120399
|
|
},
|
|
{
|
|
"epoch": 2.7952959623676987,
|
|
"grad_norm": 0.6633173227310181,
|
|
"learning_rate": 1.114582437508327e-07,
|
|
"loss": 0.10771057605743409,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2230,
|
|
"token_acc": 0.9644760213143873,
|
|
"train_speed(iter/s)": 0.120473
|
|
},
|
|
{
|
|
"epoch": 2.8015680125441005,
|
|
"grad_norm": 0.6813404560089111,
|
|
"learning_rate": 1.0466675100828383e-07,
|
|
"loss": 0.10407230854034424,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2235,
|
|
"token_acc": 0.9577818418523915,
|
|
"train_speed(iter/s)": 0.120526
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"grad_norm": 0.6118773221969604,
|
|
"learning_rate": 9.808653968302607e-08,
|
|
"loss": 0.09836616516113281,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2240,
|
|
"token_acc": 0.963512739408732,
|
|
"train_speed(iter/s)": 0.120591
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"eval_loss": 0.22051523625850677,
|
|
"eval_runtime": 29.9565,
|
|
"eval_samples_per_second": 17.192,
|
|
"eval_steps_per_second": 4.306,
|
|
"eval_token_acc": 0.9300132896772327,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.814112112896903,
|
|
"grad_norm": 0.7042478322982788,
|
|
"learning_rate": 9.17178937754143e-08,
|
|
"loss": 0.1051060438156128,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2245,
|
|
"token_acc": 0.9483617807171872,
|
|
"train_speed(iter/s)": 0.120393
|
|
},
|
|
{
|
|
"epoch": 2.820384163073305,
|
|
"grad_norm": 0.5747640132904053,
|
|
"learning_rate": 8.556108815468756e-08,
|
|
"loss": 0.09687448740005493,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2250,
|
|
"token_acc": 0.9668047793409135,
|
|
"train_speed(iter/s)": 0.120457
|
|
},
|
|
{
|
|
"epoch": 2.8266562132497057,
|
|
"grad_norm": 0.7168111205101013,
|
|
"learning_rate": 7.961638854711296e-08,
|
|
"loss": 0.10943119525909424,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2255,
|
|
"token_acc": 0.9633674692232269,
|
|
"train_speed(iter/s)": 0.120513
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"grad_norm": 0.7360076308250427,
|
|
"learning_rate": 7.388405152450706e-08,
|
|
"loss": 0.10468497276306152,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2260,
|
|
"token_acc": 0.9621856728621001,
|
|
"train_speed(iter/s)": 0.120585
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"eval_loss": 0.2205626368522644,
|
|
"eval_runtime": 29.8508,
|
|
"eval_samples_per_second": 17.252,
|
|
"eval_steps_per_second": 4.321,
|
|
"eval_token_acc": 0.9299881357140162,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.839200313602509,
|
|
"grad_norm": 0.6543593406677246,
|
|
"learning_rate": 6.836432449317255e-08,
|
|
"loss": 0.10062656402587891,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2265,
|
|
"token_acc": 0.9466067584011981,
|
|
"train_speed(iter/s)": 0.120388
|
|
},
|
|
{
|
|
"epoch": 2.84547236377891,
|
|
"grad_norm": 0.6974958181381226,
|
|
"learning_rate": 6.305744568321281e-08,
|
|
"loss": 0.10191984176635742,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2270,
|
|
"token_acc": 0.9627590979146353,
|
|
"train_speed(iter/s)": 0.120453
|
|
},
|
|
{
|
|
"epoch": 2.851744413955312,
|
|
"grad_norm": 0.7041538953781128,
|
|
"learning_rate": 5.7963644138254175e-08,
|
|
"loss": 0.10670938491821289,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2275,
|
|
"token_acc": 0.9608410787749505,
|
|
"train_speed(iter/s)": 0.120516
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"grad_norm": 0.6692150831222534,
|
|
"learning_rate": 5.308313970555812e-08,
|
|
"loss": 0.1109403133392334,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2280,
|
|
"token_acc": 0.9577824715116431,
|
|
"train_speed(iter/s)": 0.120589
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"eval_loss": 0.22065654397010803,
|
|
"eval_runtime": 29.7512,
|
|
"eval_samples_per_second": 17.31,
|
|
"eval_steps_per_second": 4.336,
|
|
"eval_token_acc": 0.9299797510596107,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.8642885143081145,
|
|
"grad_norm": 0.7244044542312622,
|
|
"learning_rate": 4.841614302653341e-08,
|
|
"loss": 0.09498413801193237,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2285,
|
|
"token_acc": 0.9474822521052157,
|
|
"train_speed(iter/s)": 0.120388
|
|
},
|
|
{
|
|
"epoch": 2.870560564484516,
|
|
"grad_norm": 0.6670571565628052,
|
|
"learning_rate": 4.396285552764557e-08,
|
|
"loss": 0.10060865879058838,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2290,
|
|
"token_acc": 0.9639465521355285,
|
|
"train_speed(iter/s)": 0.120464
|
|
},
|
|
{
|
|
"epoch": 2.876832614660917,
|
|
"grad_norm": 0.7304293513298035,
|
|
"learning_rate": 3.9723469411723226e-08,
|
|
"loss": 0.10391623973846435,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2295,
|
|
"token_acc": 0.9663193870238685,
|
|
"train_speed(iter/s)": 0.12053
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"grad_norm": 0.6565809845924377,
|
|
"learning_rate": 3.5698167649660384e-08,
|
|
"loss": 0.10505614280700684,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2300,
|
|
"token_acc": 0.9635771315655959,
|
|
"train_speed(iter/s)": 0.12059
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"eval_loss": 0.22073638439178467,
|
|
"eval_runtime": 29.9946,
|
|
"eval_samples_per_second": 17.17,
|
|
"eval_steps_per_second": 4.301,
|
|
"eval_token_acc": 0.9300342513132465,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.88937671501372,
|
|
"grad_norm": 0.7473600506782532,
|
|
"learning_rate": 3.188712397252325e-08,
|
|
"loss": 0.10624938011169434,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2305,
|
|
"token_acc": 0.9467632230970124,
|
|
"train_speed(iter/s)": 0.120398
|
|
},
|
|
{
|
|
"epoch": 2.8956487651901215,
|
|
"grad_norm": 0.6958088874816895,
|
|
"learning_rate": 2.8290502864049553e-08,
|
|
"loss": 0.09693416357040405,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2310,
|
|
"token_acc": 0.9698841898459312,
|
|
"train_speed(iter/s)": 0.120461
|
|
},
|
|
{
|
|
"epoch": 2.901920815366523,
|
|
"grad_norm": 0.6366994976997375,
|
|
"learning_rate": 2.4908459553549257e-08,
|
|
"loss": 0.10598138570785523,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2315,
|
|
"token_acc": 0.9589003310040456,
|
|
"train_speed(iter/s)": 0.120529
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"grad_norm": 0.7216346263885498,
|
|
"learning_rate": 2.174114000920713e-08,
|
|
"loss": 0.10354976654052735,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2320,
|
|
"token_acc": 0.964891239164804,
|
|
"train_speed(iter/s)": 0.120594
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"eval_loss": 0.22061631083488464,
|
|
"eval_runtime": 29.9036,
|
|
"eval_samples_per_second": 17.222,
|
|
"eval_steps_per_second": 4.314,
|
|
"eval_token_acc": 0.930076174585274,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.914464915719326,
|
|
"grad_norm": 0.6899539828300476,
|
|
"learning_rate": 1.878868093177999e-08,
|
|
"loss": 0.10037648677825928,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2325,
|
|
"token_acc": 0.9473721149411458,
|
|
"train_speed(iter/s)": 0.120416
|
|
},
|
|
{
|
|
"epoch": 2.920736965895727,
|
|
"grad_norm": 0.6711906790733337,
|
|
"learning_rate": 1.6051209748698116e-08,
|
|
"loss": 0.1048201560974121,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2330,
|
|
"token_acc": 0.9611446773011098,
|
|
"train_speed(iter/s)": 0.120473
|
|
},
|
|
{
|
|
"epoch": 2.9270090160721285,
|
|
"grad_norm": 0.6756038665771484,
|
|
"learning_rate": 1.3528844608566848e-08,
|
|
"loss": 0.10034064054489136,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2335,
|
|
"token_acc": 0.959424851944726,
|
|
"train_speed(iter/s)": 0.120528
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"grad_norm": 0.7135525941848755,
|
|
"learning_rate": 1.1221694376064018e-08,
|
|
"loss": 0.10905985832214356,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2340,
|
|
"token_acc": 0.9628500906709652,
|
|
"train_speed(iter/s)": 0.120593
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"eval_loss": 0.22068579494953156,
|
|
"eval_runtime": 29.8342,
|
|
"eval_samples_per_second": 17.262,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.9299168661515694,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.9395531164249316,
|
|
"grad_norm": 0.7024103999137878,
|
|
"learning_rate": 9.129858627244802e-09,
|
|
"loss": 0.10470427274703979,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2345,
|
|
"token_acc": 0.9460003800114003,
|
|
"train_speed(iter/s)": 0.120414
|
|
},
|
|
{
|
|
"epoch": 2.945825166601333,
|
|
"grad_norm": 0.8265781402587891,
|
|
"learning_rate": 7.25342764524184e-09,
|
|
"loss": 0.10396888256072997,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2350,
|
|
"token_acc": 0.9658552348125807,
|
|
"train_speed(iter/s)": 0.120474
|
|
},
|
|
{
|
|
"epoch": 2.952097216777734,
|
|
"grad_norm": 0.6834991574287415,
|
|
"learning_rate": 5.592482416369449e-09,
|
|
"loss": 0.10240061283111572,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2355,
|
|
"token_acc": 0.9637053223821365,
|
|
"train_speed(iter/s)": 0.120538
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"grad_norm": 0.6985086798667908,
|
|
"learning_rate": 4.147094626628656e-09,
|
|
"loss": 0.10518196821212769,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2360,
|
|
"token_acc": 0.9589985193161933,
|
|
"train_speed(iter/s)": 0.120597
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"eval_loss": 0.22063224017620087,
|
|
"eval_runtime": 29.9892,
|
|
"eval_samples_per_second": 17.173,
|
|
"eval_steps_per_second": 4.302,
|
|
"eval_token_acc": 0.9300216743316382,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.9646413171305372,
|
|
"grad_norm": 0.6361654996871948,
|
|
"learning_rate": 2.9173266586113303e-09,
|
|
"loss": 0.11210713386535645,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2365,
|
|
"token_acc": 0.9465896933852769,
|
|
"train_speed(iter/s)": 0.120393
|
|
},
|
|
{
|
|
"epoch": 2.9709133673069386,
|
|
"grad_norm": 0.6814746260643005,
|
|
"learning_rate": 1.9032315888106724e-09,
|
|
"loss": 0.10490133762359619,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2370,
|
|
"token_acc": 0.9647347687658518,
|
|
"train_speed(iter/s)": 0.12045
|
|
},
|
|
{
|
|
"epoch": 2.97718541748334,
|
|
"grad_norm": 0.6994942426681519,
|
|
"learning_rate": 1.1048531853286027e-09,
|
|
"loss": 0.10961159467697143,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2375,
|
|
"token_acc": 0.9629837053630957,
|
|
"train_speed(iter/s)": 0.120504
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"grad_norm": 0.717983067035675,
|
|
"learning_rate": 5.222259059867174e-10,
|
|
"loss": 0.11561372280120849,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2380,
|
|
"token_acc": 0.9572651972483505,
|
|
"train_speed(iter/s)": 0.120566
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"eval_loss": 0.22063779830932617,
|
|
"eval_runtime": 29.6218,
|
|
"eval_samples_per_second": 17.386,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.9298959045155556,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.9897295178361425,
|
|
"grad_norm": 0.7122631072998047,
|
|
"learning_rate": 1.5537489683914442e-10,
|
|
"loss": 0.10554230213165283,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2385,
|
|
"token_acc": 0.9472894319111068,
|
|
"train_speed(iter/s)": 0.120389
|
|
},
|
|
{
|
|
"epoch": 2.9960015680125442,
|
|
"grad_norm": 0.6785567998886108,
|
|
"learning_rate": 4.315991088965632e-12,
|
|
"loss": 0.10587785243988038,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2390,
|
|
"token_acc": 0.9635652811337659,
|
|
"train_speed(iter/s)": 0.120449
|
|
},
|
|
{
|
|
"epoch": 2.9972559780478245,
|
|
"eval_loss": 0.22064490616321564,
|
|
"eval_runtime": 29.9722,
|
|
"eval_samples_per_second": 17.183,
|
|
"eval_steps_per_second": 4.304,
|
|
"eval_token_acc": 0.93000909735003,
|
|
"step": 2391
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2391,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.893078571339743e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|