1907 lines
54 KiB
JSON
1907 lines
54 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": 760,
|
||
|
|
"best_metric": 0.22517732,
|
||
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v36-20250515-204543/checkpoint-760",
|
||
|
|
"epoch": 0.9533516268130146,
|
||
|
|
"eval_steps": 20,
|
||
|
|
"global_step": 760,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0012544100352802822,
|
||
|
|
"grad_norm": 0.7007026672363281,
|
||
|
|
"learning_rate": 9.999995684008912e-06,
|
||
|
|
"loss": 0.09371452033519745,
|
||
|
|
"memory(GiB)": 30.15,
|
||
|
|
"step": 1,
|
||
|
|
"token_acc": 0.9615550755939525,
|
||
|
|
"train_speed(iter/s)": 0.062663
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.006272050176401411,
|
||
|
|
"grad_norm": 0.7769243121147156,
|
||
|
|
"learning_rate": 9.999892100595329e-06,
|
||
|
|
"loss": 0.10849708318710327,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 5,
|
||
|
|
"token_acc": 0.9562576748199528,
|
||
|
|
"train_speed(iter/s)": 0.122173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012544100352802822,
|
||
|
|
"grad_norm": 0.7894852757453918,
|
||
|
|
"learning_rate": 9.999568407038233e-06,
|
||
|
|
"loss": 0.12320096492767334,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 10,
|
||
|
|
"token_acc": 0.955855880061259,
|
||
|
|
"train_speed(iter/s)": 0.136645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.018816150529204233,
|
||
|
|
"grad_norm": 0.8094897866249084,
|
||
|
|
"learning_rate": 9.999028933299243e-06,
|
||
|
|
"loss": 0.11489032506942749,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 15,
|
||
|
|
"token_acc": 0.9583607506645961,
|
||
|
|
"train_speed(iter/s)": 0.144552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025088200705605645,
|
||
|
|
"grad_norm": 0.8118980526924133,
|
||
|
|
"learning_rate": 9.99827370266192e-06,
|
||
|
|
"loss": 0.11607390642166138,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 20,
|
||
|
|
"token_acc": 0.9584569732937686,
|
||
|
|
"train_speed(iter/s)": 0.146159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025088200705605645,
|
||
|
|
"eval_loss": 0.23747889697551727,
|
||
|
|
"eval_runtime": 29.1116,
|
||
|
|
"eval_samples_per_second": 17.691,
|
||
|
|
"eval_steps_per_second": 4.431,
|
||
|
|
"eval_token_acc": 0.9248609195450487,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03136025088200706,
|
||
|
|
"grad_norm": 0.7518147826194763,
|
||
|
|
"learning_rate": 9.99730274772184e-06,
|
||
|
|
"loss": 0.12006251811981201,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 25,
|
||
|
|
"token_acc": 0.9398439645614428,
|
||
|
|
"train_speed(iter/s)": 0.119433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.037632301058408466,
|
||
|
|
"grad_norm": 0.7859554886817932,
|
||
|
|
"learning_rate": 9.996116110385186e-06,
|
||
|
|
"loss": 0.12473204135894775,
|
||
|
|
"memory(GiB)": 30.19,
|
||
|
|
"step": 30,
|
||
|
|
"token_acc": 0.9547159567642268,
|
||
|
|
"train_speed(iter/s)": 0.124931
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04390435123480988,
|
||
|
|
"grad_norm": 0.7734975814819336,
|
||
|
|
"learning_rate": 9.99471384186694e-06,
|
||
|
|
"loss": 0.11890232563018799,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 35,
|
||
|
|
"token_acc": 0.9632012432012432,
|
||
|
|
"train_speed(iter/s)": 0.128582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05017640141121129,
|
||
|
|
"grad_norm": 0.7775484323501587,
|
||
|
|
"learning_rate": 9.99309600268868e-06,
|
||
|
|
"loss": 0.11513264179229736,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 40,
|
||
|
|
"token_acc": 0.9620845390377802,
|
||
|
|
"train_speed(iter/s)": 0.130866
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05017640141121129,
|
||
|
|
"eval_loss": 0.23900838196277618,
|
||
|
|
"eval_runtime": 29.1592,
|
||
|
|
"eval_samples_per_second": 17.662,
|
||
|
|
"eval_steps_per_second": 4.424,
|
||
|
|
"eval_token_acc": 0.9247099957657495,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0564484515876127,
|
||
|
|
"grad_norm": 0.7852229475975037,
|
||
|
|
"learning_rate": 9.991262662675962e-06,
|
||
|
|
"loss": 0.1213950753211975,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 45,
|
||
|
|
"token_acc": 0.9422833912915708,
|
||
|
|
"train_speed(iter/s)": 0.119133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06272050176401411,
|
||
|
|
"grad_norm": 0.7627941370010376,
|
||
|
|
"learning_rate": 9.9892139009553e-06,
|
||
|
|
"loss": 0.12013821601867676,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 50,
|
||
|
|
"token_acc": 0.9524904419431597,
|
||
|
|
"train_speed(iter/s)": 0.122203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06899255194041552,
|
||
|
|
"grad_norm": 0.7667288184165955,
|
||
|
|
"learning_rate": 9.986949805950763e-06,
|
||
|
|
"loss": 0.12547953128814698,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 55,
|
||
|
|
"token_acc": 0.9569144662104125,
|
||
|
|
"train_speed(iter/s)": 0.124401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07526460211681693,
|
||
|
|
"grad_norm": 0.8220178484916687,
|
||
|
|
"learning_rate": 9.984470475380154e-06,
|
||
|
|
"loss": 0.12330178022384644,
|
||
|
|
"memory(GiB)": 30.2,
|
||
|
|
"step": 60,
|
||
|
|
"token_acc": 0.9609616164135824,
|
||
|
|
"train_speed(iter/s)": 0.12684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07526460211681693,
|
||
|
|
"eval_loss": 0.2401651293039322,
|
||
|
|
"eval_runtime": 28.9982,
|
||
|
|
"eval_samples_per_second": 17.76,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9250453819419698,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08153665229321834,
|
||
|
|
"grad_norm": 0.7496252059936523,
|
||
|
|
"learning_rate": 9.982332112912999e-06,
|
||
|
|
"loss": 0.12913516759872437,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 65,
|
||
|
|
"token_acc": 0.9413472329138108,
|
||
|
|
"train_speed(iter/s)": 0.119192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08780870246961976,
|
||
|
|
"grad_norm": 0.742638349533081,
|
||
|
|
"learning_rate": 9.979465634221514e-06,
|
||
|
|
"loss": 0.11686735153198242,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 70,
|
||
|
|
"token_acc": 0.9583432768541352,
|
||
|
|
"train_speed(iter/s)": 0.121248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09408075264602117,
|
||
|
|
"grad_norm": 0.6979886889457703,
|
||
|
|
"learning_rate": 9.976384242979025e-06,
|
||
|
|
"loss": 0.11433117389678955,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 75,
|
||
|
|
"token_acc": 0.9620859246922897,
|
||
|
|
"train_speed(iter/s)": 0.123121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10035280282242258,
|
||
|
|
"grad_norm": 0.772607684135437,
|
||
|
|
"learning_rate": 9.973088072177646e-06,
|
||
|
|
"loss": 0.11932685375213622,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 80,
|
||
|
|
"token_acc": 0.9525376807136265,
|
||
|
|
"train_speed(iter/s)": 0.124671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10035280282242258,
|
||
|
|
"eval_loss": 0.2368009090423584,
|
||
|
|
"eval_runtime": 28.9145,
|
||
|
|
"eval_samples_per_second": 17.811,
|
||
|
|
"eval_steps_per_second": 4.461,
|
||
|
|
"eval_token_acc": 0.9248148039458184,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10662485299882399,
|
||
|
|
"grad_norm": 0.6924039721488953,
|
||
|
|
"learning_rate": 9.96957726407932e-06,
|
||
|
|
"loss": 0.11466219425201415,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 85,
|
||
|
|
"token_acc": 0.9409575111971916,
|
||
|
|
"train_speed(iter/s)": 0.119139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1128969031752254,
|
||
|
|
"grad_norm": 0.7015154957771301,
|
||
|
|
"learning_rate": 9.965851970209695e-06,
|
||
|
|
"loss": 0.11789379119873047,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 90,
|
||
|
|
"token_acc": 0.9585812037424941,
|
||
|
|
"train_speed(iter/s)": 0.120316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11916895335162682,
|
||
|
|
"grad_norm": 0.7769283652305603,
|
||
|
|
"learning_rate": 9.96191235135156e-06,
|
||
|
|
"loss": 0.12155482769012452,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 95,
|
||
|
|
"token_acc": 0.9554308702096125,
|
||
|
|
"train_speed(iter/s)": 0.122053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12544100352802823,
|
||
|
|
"grad_norm": 0.7704362869262695,
|
||
|
|
"learning_rate": 9.957758577537933e-06,
|
||
|
|
"loss": 0.13259472846984863,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 100,
|
||
|
|
"token_acc": 0.9502720633165537,
|
||
|
|
"train_speed(iter/s)": 0.123484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12544100352802823,
|
||
|
|
"eval_loss": 0.23640382289886475,
|
||
|
|
"eval_runtime": 29.2359,
|
||
|
|
"eval_samples_per_second": 17.615,
|
||
|
|
"eval_steps_per_second": 4.412,
|
||
|
|
"eval_token_acc": 0.9245884182768697,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13171305370442962,
|
||
|
|
"grad_norm": 0.7656592726707458,
|
||
|
|
"learning_rate": 9.953390828044698e-06,
|
||
|
|
"loss": 0.1214489221572876,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 105,
|
||
|
|
"token_acc": 0.94163746105919,
|
||
|
|
"train_speed(iter/s)": 0.119212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13798510388083104,
|
||
|
|
"grad_norm": 0.6987672448158264,
|
||
|
|
"learning_rate": 9.948809291382886e-06,
|
||
|
|
"loss": 0.12167651653289795,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 110,
|
||
|
|
"token_acc": 0.9554091191158653,
|
||
|
|
"train_speed(iter/s)": 0.120455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14425715405723247,
|
||
|
|
"grad_norm": 0.7173567414283752,
|
||
|
|
"learning_rate": 9.944014165290526e-06,
|
||
|
|
"loss": 0.12870512008666993,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 115,
|
||
|
|
"token_acc": 0.9555927368478797,
|
||
|
|
"train_speed(iter/s)": 0.121851
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15052920423363386,
|
||
|
|
"grad_norm": 0.733219563961029,
|
||
|
|
"learning_rate": 9.939005656724122e-06,
|
||
|
|
"loss": 0.12763895988464355,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 120,
|
||
|
|
"token_acc": 0.9537378141994336,
|
||
|
|
"train_speed(iter/s)": 0.123038
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15052920423363386,
|
||
|
|
"eval_loss": 0.2358570694923401,
|
||
|
|
"eval_runtime": 29.0168,
|
||
|
|
"eval_samples_per_second": 17.748,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9245213410416256,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15680125441003528,
|
||
|
|
"grad_norm": 0.765933096408844,
|
||
|
|
"learning_rate": 9.933783981849704e-06,
|
||
|
|
"loss": 0.12144865989685058,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 125,
|
||
|
|
"token_acc": 0.9409262529390531,
|
||
|
|
"train_speed(iter/s)": 0.119399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16307330458643668,
|
||
|
|
"grad_norm": 0.6981678009033203,
|
||
|
|
"learning_rate": 9.928349366033525e-06,
|
||
|
|
"loss": 0.12389117479324341,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 130,
|
||
|
|
"token_acc": 0.9555003388299074,
|
||
|
|
"train_speed(iter/s)": 0.120494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1693453547628381,
|
||
|
|
"grad_norm": 0.7008967399597168,
|
||
|
|
"learning_rate": 9.923848513216085e-06,
|
||
|
|
"loss": 0.12482867240905762,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 135,
|
||
|
|
"token_acc": 0.9500779220779221,
|
||
|
|
"train_speed(iter/s)": 0.121483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17561740493923952,
|
||
|
|
"grad_norm": 0.7450129985809326,
|
||
|
|
"learning_rate": 9.918031200957224e-06,
|
||
|
|
"loss": 0.1304723024368286,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 140,
|
||
|
|
"token_acc": 0.9544462545722805,
|
||
|
|
"train_speed(iter/s)": 0.122651
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17561740493923952,
|
||
|
|
"eval_loss": 0.23741304874420166,
|
||
|
|
"eval_runtime": 29.1676,
|
||
|
|
"eval_samples_per_second": 17.657,
|
||
|
|
"eval_steps_per_second": 4.423,
|
||
|
|
"eval_token_acc": 0.9250328049603616,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18188945511564092,
|
||
|
|
"grad_norm": 0.7213057279586792,
|
||
|
|
"learning_rate": 9.912001627642868e-06,
|
||
|
|
"loss": 0.12079639434814453,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 145,
|
||
|
|
"token_acc": 0.9405716060888475,
|
||
|
|
"train_speed(iter/s)": 0.119397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18816150529204234,
|
||
|
|
"grad_norm": 0.6824830174446106,
|
||
|
|
"learning_rate": 9.905760053507967e-06,
|
||
|
|
"loss": 0.11286978721618653,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 150,
|
||
|
|
"token_acc": 0.9568509120833905,
|
||
|
|
"train_speed(iter/s)": 0.120378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19443355546844374,
|
||
|
|
"grad_norm": 0.7245369553565979,
|
||
|
|
"learning_rate": 9.899306747937377e-06,
|
||
|
|
"loss": 0.12503495216369628,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 155,
|
||
|
|
"token_acc": 0.9548737472705896,
|
||
|
|
"train_speed(iter/s)": 0.121472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20070560564484516,
|
||
|
|
"grad_norm": 0.7653631567955017,
|
||
|
|
"learning_rate": 9.892641989454225e-06,
|
||
|
|
"loss": 0.1246172308921814,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 160,
|
||
|
|
"token_acc": 0.9559546167897296,
|
||
|
|
"train_speed(iter/s)": 0.122509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20070560564484516,
|
||
|
|
"eval_loss": 0.23944813013076782,
|
||
|
|
"eval_runtime": 29.28,
|
||
|
|
"eval_samples_per_second": 17.589,
|
||
|
|
"eval_steps_per_second": 4.406,
|
||
|
|
"eval_token_acc": 0.9244165328615568,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20697765582124658,
|
||
|
|
"grad_norm": 0.6885049939155579,
|
||
|
|
"learning_rate": 9.885766065707903e-06,
|
||
|
|
"loss": 0.12521634101867676,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 165,
|
||
|
|
"token_acc": 0.9397906012239098,
|
||
|
|
"train_speed(iter/s)": 0.119705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21324970599764798,
|
||
|
|
"grad_norm": 0.7737838625907898,
|
||
|
|
"learning_rate": 9.878679273461643e-06,
|
||
|
|
"loss": 0.12545130252838135,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 170,
|
||
|
|
"token_acc": 0.9525804833426775,
|
||
|
|
"train_speed(iter/s)": 0.120457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2195217561740494,
|
||
|
|
"grad_norm": 0.7562989592552185,
|
||
|
|
"learning_rate": 9.871381918579706e-06,
|
||
|
|
"loss": 0.11616495847702027,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 175,
|
||
|
|
"token_acc": 0.9607603010588085,
|
||
|
|
"train_speed(iter/s)": 0.121311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2257938063504508,
|
||
|
|
"grad_norm": 0.8050908446311951,
|
||
|
|
"learning_rate": 9.863874316014197e-06,
|
||
|
|
"loss": 0.11883351802825928,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 180,
|
||
|
|
"token_acc": 0.9569015887148382,
|
||
|
|
"train_speed(iter/s)": 0.122115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2257938063504508,
|
||
|
|
"eval_loss": 0.2381161004304886,
|
||
|
|
"eval_runtime": 29.157,
|
||
|
|
"eval_samples_per_second": 17.663,
|
||
|
|
"eval_steps_per_second": 4.424,
|
||
|
|
"eval_token_acc": 0.9248860735082651,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23206585652685222,
|
||
|
|
"grad_norm": 0.7181304097175598,
|
||
|
|
"learning_rate": 9.856156789791454e-06,
|
||
|
|
"loss": 0.12097489833831787,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 185,
|
||
|
|
"token_acc": 0.9404164162611713,
|
||
|
|
"train_speed(iter/s)": 0.119715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23833790670325364,
|
||
|
|
"grad_norm": 0.7890848517417908,
|
||
|
|
"learning_rate": 9.848229672998066e-06,
|
||
|
|
"loss": 0.1189950704574585,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 190,
|
||
|
|
"token_acc": 0.9567714631197098,
|
||
|
|
"train_speed(iter/s)": 0.120621
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24460995687965503,
|
||
|
|
"grad_norm": 0.7079647183418274,
|
||
|
|
"learning_rate": 9.840093307766511e-06,
|
||
|
|
"loss": 0.12529479265213012,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 195,
|
||
|
|
"token_acc": 0.954030785285677,
|
||
|
|
"train_speed(iter/s)": 0.12131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25088200705605646,
|
||
|
|
"grad_norm": 0.7202178835868835,
|
||
|
|
"learning_rate": 9.831748045260374e-06,
|
||
|
|
"loss": 0.12180191278457642,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 200,
|
||
|
|
"token_acc": 0.9546836066920402,
|
||
|
|
"train_speed(iter/s)": 0.121964
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25088200705605646,
|
||
|
|
"eval_loss": 0.23432905972003937,
|
||
|
|
"eval_runtime": 29.0534,
|
||
|
|
"eval_samples_per_second": 17.726,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.9248776888538597,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2571540572324579,
|
||
|
|
"grad_norm": 0.7178977727890015,
|
||
|
|
"learning_rate": 9.823194245659197e-06,
|
||
|
|
"loss": 0.12807730436325074,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 205,
|
||
|
|
"token_acc": 0.9399802586519479,
|
||
|
|
"train_speed(iter/s)": 0.119811
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26342610740885924,
|
||
|
|
"grad_norm": 0.7141036987304688,
|
||
|
|
"learning_rate": 9.814432278142934e-06,
|
||
|
|
"loss": 0.11557638645172119,
|
||
|
|
"memory(GiB)": 31.66,
|
||
|
|
"step": 210,
|
||
|
|
"token_acc": 0.9552243011722272,
|
||
|
|
"train_speed(iter/s)": 0.120373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26969815758526067,
|
||
|
|
"grad_norm": 0.8078102469444275,
|
||
|
|
"learning_rate": 9.805462520876015e-06,
|
||
|
|
"loss": 0.1150855302810669,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 215,
|
||
|
|
"token_acc": 0.9593783736285808,
|
||
|
|
"train_speed(iter/s)": 0.121017
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2759702077616621,
|
||
|
|
"grad_norm": 0.7020242214202881,
|
||
|
|
"learning_rate": 9.79628536099103e-06,
|
||
|
|
"loss": 0.1237363338470459,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 220,
|
||
|
|
"token_acc": 0.9599538638985006,
|
||
|
|
"train_speed(iter/s)": 0.121813
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2759702077616621,
|
||
|
|
"eval_loss": 0.23757396638393402,
|
||
|
|
"eval_runtime": 29.0264,
|
||
|
|
"eval_samples_per_second": 17.742,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9250914975412001,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2822422579380635,
|
||
|
|
"grad_norm": 0.8055130839347839,
|
||
|
|
"learning_rate": 9.786901194572012e-06,
|
||
|
|
"loss": 0.1192856788635254,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 225,
|
||
|
|
"token_acc": 0.9404110409842614,
|
||
|
|
"train_speed(iter/s)": 0.119956
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28851430811446493,
|
||
|
|
"grad_norm": 0.8203707337379456,
|
||
|
|
"learning_rate": 9.777310426637349e-06,
|
||
|
|
"loss": 0.11806493997573853,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 230,
|
||
|
|
"token_acc": 0.9590930586937103,
|
||
|
|
"train_speed(iter/s)": 0.120564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2947863582908663,
|
||
|
|
"grad_norm": 0.7641969919204712,
|
||
|
|
"learning_rate": 9.767513471122305e-06,
|
||
|
|
"loss": 0.11997225284576415,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 235,
|
||
|
|
"token_acc": 0.9602041571122124,
|
||
|
|
"train_speed(iter/s)": 0.121067
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3010584084672677,
|
||
|
|
"grad_norm": 0.733931839466095,
|
||
|
|
"learning_rate": 9.757510750861143e-06,
|
||
|
|
"loss": 0.12144792079925537,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 240,
|
||
|
|
"token_acc": 0.9531961770923274,
|
||
|
|
"train_speed(iter/s)": 0.121552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3010584084672677,
|
||
|
|
"eval_loss": 0.23732979595661163,
|
||
|
|
"eval_runtime": 29.0284,
|
||
|
|
"eval_samples_per_second": 17.741,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9247980346370074,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30733045864366915,
|
||
|
|
"grad_norm": 0.7051452398300171,
|
||
|
|
"learning_rate": 9.749360713849587e-06,
|
||
|
|
"loss": 0.12806930541992187,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 245,
|
||
|
|
"token_acc": 0.9387967295240796,
|
||
|
|
"train_speed(iter/s)": 0.119787
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31360250882007057,
|
||
|
|
"grad_norm": 0.7828952074050903,
|
||
|
|
"learning_rate": 9.741079488650608e-06,
|
||
|
|
"loss": 0.13568118810653687,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 250,
|
||
|
|
"token_acc": 0.948076923076923,
|
||
|
|
"train_speed(iter/s)": 0.12055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.319874558996472,
|
||
|
|
"grad_norm": 0.7669305205345154,
|
||
|
|
"learning_rate": 9.730543822588614e-06,
|
||
|
|
"loss": 0.12099459171295165,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 255,
|
||
|
|
"token_acc": 0.9579619299557192,
|
||
|
|
"train_speed(iter/s)": 0.121138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32614660917287336,
|
||
|
|
"grad_norm": 0.6992693543434143,
|
||
|
|
"learning_rate": 9.71980398738173e-06,
|
||
|
|
"loss": 0.12393572330474853,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 260,
|
||
|
|
"token_acc": 0.9492717094266536,
|
||
|
|
"train_speed(iter/s)": 0.121576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32614660917287336,
|
||
|
|
"eval_loss": 0.23609744012355804,
|
||
|
|
"eval_runtime": 29.0643,
|
||
|
|
"eval_samples_per_second": 17.719,
|
||
|
|
"eval_steps_per_second": 4.438,
|
||
|
|
"eval_token_acc": 0.9247644960193854,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3324186593492748,
|
||
|
|
"grad_norm": 0.7092508673667908,
|
||
|
|
"learning_rate": 9.708860446558685e-06,
|
||
|
|
"loss": 0.12540948390960693,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 265,
|
||
|
|
"token_acc": 0.9381209283387623,
|
||
|
|
"train_speed(iter/s)": 0.119928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3386907095256762,
|
||
|
|
"grad_norm": 0.7747306823730469,
|
||
|
|
"learning_rate": 9.6977136724401e-06,
|
||
|
|
"loss": 0.1322183132171631,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 270,
|
||
|
|
"token_acc": 0.948199121522694,
|
||
|
|
"train_speed(iter/s)": 0.120479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3449627597020776,
|
||
|
|
"grad_norm": 0.7813765406608582,
|
||
|
|
"learning_rate": 9.686364146118085e-06,
|
||
|
|
"loss": 0.12453765869140625,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 275,
|
||
|
|
"token_acc": 0.9574612482015366,
|
||
|
|
"train_speed(iter/s)": 0.121031
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35123480987847905,
|
||
|
|
"grad_norm": 0.8077899217605591,
|
||
|
|
"learning_rate": 9.674812357435497e-06,
|
||
|
|
"loss": 0.13067824840545655,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 280,
|
||
|
|
"token_acc": 0.9562002982107356,
|
||
|
|
"train_speed(iter/s)": 0.121591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35123480987847905,
|
||
|
|
"eval_loss": 0.23406127095222473,
|
||
|
|
"eval_runtime": 28.8857,
|
||
|
|
"eval_samples_per_second": 17.829,
|
||
|
|
"eval_steps_per_second": 4.466,
|
||
|
|
"eval_token_acc": 0.925296921574135,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3575068600548804,
|
||
|
|
"grad_norm": 0.7608367204666138,
|
||
|
|
"learning_rate": 9.663058804964784e-06,
|
||
|
|
"loss": 0.12904319763183594,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 285,
|
||
|
|
"token_acc": 0.9395176026312584,
|
||
|
|
"train_speed(iter/s)": 0.120021
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36377891023128184,
|
||
|
|
"grad_norm": 0.7974592447280884,
|
||
|
|
"learning_rate": 9.65110399598647e-06,
|
||
|
|
"loss": 0.11571755409240722,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 290,
|
||
|
|
"token_acc": 0.9629809560823941,
|
||
|
|
"train_speed(iter/s)": 0.120511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37005096040768326,
|
||
|
|
"grad_norm": 0.807405948638916,
|
||
|
|
"learning_rate": 9.638948446467268e-06,
|
||
|
|
"loss": 0.12567424774169922,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 295,
|
||
|
|
"token_acc": 0.9559097936770272,
|
||
|
|
"train_speed(iter/s)": 0.121048
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3763230105840847,
|
||
|
|
"grad_norm": 0.8031049966812134,
|
||
|
|
"learning_rate": 9.626592681037797e-06,
|
||
|
|
"loss": 0.12862168550491332,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 300,
|
||
|
|
"token_acc": 0.9541518224171006,
|
||
|
|
"train_speed(iter/s)": 0.121608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3763230105840847,
|
||
|
|
"eval_loss": 0.239148810505867,
|
||
|
|
"eval_runtime": 29.0314,
|
||
|
|
"eval_samples_per_second": 17.739,
|
||
|
|
"eval_steps_per_second": 4.443,
|
||
|
|
"eval_token_acc": 0.9250160356515505,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3825950607604861,
|
||
|
|
"grad_norm": 0.7160354256629944,
|
||
|
|
"learning_rate": 9.614037232969952e-06,
|
||
|
|
"loss": 0.11383086442947388,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 305,
|
||
|
|
"token_acc": 0.9417116516042943,
|
||
|
|
"train_speed(iter/s)": 0.120136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3888671109368875,
|
||
|
|
"grad_norm": 0.7129721641540527,
|
||
|
|
"learning_rate": 9.601282644153882e-06,
|
||
|
|
"loss": 0.12448391914367676,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 310,
|
||
|
|
"token_acc": 0.9534784033888903,
|
||
|
|
"train_speed(iter/s)": 0.120638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3951391611132889,
|
||
|
|
"grad_norm": 0.6943992972373962,
|
||
|
|
"learning_rate": 9.5883294650746e-06,
|
||
|
|
"loss": 0.12468962669372559,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 315,
|
||
|
|
"token_acc": 0.9542487486461961,
|
||
|
|
"train_speed(iter/s)": 0.12098
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4014112112896903,
|
||
|
|
"grad_norm": 0.6942281126976013,
|
||
|
|
"learning_rate": 9.575178254788235e-06,
|
||
|
|
"loss": 0.12767086029052735,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 320,
|
||
|
|
"token_acc": 0.9511481009569767,
|
||
|
|
"train_speed(iter/s)": 0.121441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4014112112896903,
|
||
|
|
"eval_loss": 0.23587600886821747,
|
||
|
|
"eval_runtime": 29.0365,
|
||
|
|
"eval_samples_per_second": 17.736,
|
||
|
|
"eval_steps_per_second": 4.443,
|
||
|
|
"eval_token_acc": 0.9246219568944917,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40768326146609174,
|
||
|
|
"grad_norm": 0.7060673236846924,
|
||
|
|
"learning_rate": 9.56182958089789e-06,
|
||
|
|
"loss": 0.1355045199394226,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 325,
|
||
|
|
"token_acc": 0.9405469567818154,
|
||
|
|
"train_speed(iter/s)": 0.120115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41395531164249316,
|
||
|
|
"grad_norm": 0.8162744045257568,
|
||
|
|
"learning_rate": 9.548284019529149e-06,
|
||
|
|
"loss": 0.13120698928833008,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 330,
|
||
|
|
"token_acc": 0.9542016095898688,
|
||
|
|
"train_speed(iter/s)": 0.120527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42022736181889453,
|
||
|
|
"grad_norm": 0.7768563032150269,
|
||
|
|
"learning_rate": 9.534542155305217e-06,
|
||
|
|
"loss": 0.12495183944702148,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 335,
|
||
|
|
"token_acc": 0.955458468751665,
|
||
|
|
"train_speed(iter/s)": 0.121049
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42649941199529595,
|
||
|
|
"grad_norm": 0.7437924146652222,
|
||
|
|
"learning_rate": 9.520604581321682e-06,
|
||
|
|
"loss": 0.12085769176483155,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 340,
|
||
|
|
"token_acc": 0.9580750533707143,
|
||
|
|
"train_speed(iter/s)": 0.121396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42649941199529595,
|
||
|
|
"eval_loss": 0.23598669469356537,
|
||
|
|
"eval_runtime": 29.1207,
|
||
|
|
"eval_samples_per_second": 17.685,
|
||
|
|
"eval_steps_per_second": 4.43,
|
||
|
|
"eval_token_acc": 0.9251292284860249,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4327714621716974,
|
||
|
|
"grad_norm": 0.7635094523429871,
|
||
|
|
"learning_rate": 9.506471899120917e-06,
|
||
|
|
"loss": 0.12304807901382446,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 345,
|
||
|
|
"token_acc": 0.9426115423821846,
|
||
|
|
"train_speed(iter/s)": 0.120096
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4390435123480988,
|
||
|
|
"grad_norm": 0.7481803297996521,
|
||
|
|
"learning_rate": 9.49214471866612e-06,
|
||
|
|
"loss": 0.1286768436431885,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 350,
|
||
|
|
"token_acc": 0.9525481515405949,
|
||
|
|
"train_speed(iter/s)": 0.120576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4453155625245002,
|
||
|
|
"grad_norm": 0.7514587640762329,
|
||
|
|
"learning_rate": 9.477623658314988e-06,
|
||
|
|
"loss": 0.13611133098602296,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 355,
|
||
|
|
"token_acc": 0.9467895891385546,
|
||
|
|
"train_speed(iter/s)": 0.121066
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4515876127009016,
|
||
|
|
"grad_norm": 0.6806954741477966,
|
||
|
|
"learning_rate": 9.462909344793028e-06,
|
||
|
|
"loss": 0.12503905296325685,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 360,
|
||
|
|
"token_acc": 0.9559137034194594,
|
||
|
|
"train_speed(iter/s)": 0.121424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4515876127009016,
|
||
|
|
"eval_loss": 0.23415741324424744,
|
||
|
|
"eval_runtime": 29.1302,
|
||
|
|
"eval_samples_per_second": 17.679,
|
||
|
|
"eval_steps_per_second": 4.428,
|
||
|
|
"eval_token_acc": 0.9256910003311939,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.457859662877303,
|
||
|
|
"grad_norm": 0.7245866060256958,
|
||
|
|
"learning_rate": 9.448002413166509e-06,
|
||
|
|
"loss": 0.11684945821762086,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 365,
|
||
|
|
"token_acc": 0.9405347148691575,
|
||
|
|
"train_speed(iter/s)": 0.12022
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46413171305370443,
|
||
|
|
"grad_norm": 0.7275366187095642,
|
||
|
|
"learning_rate": 9.43290350681505e-06,
|
||
|
|
"loss": 0.12942945957183838,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 370,
|
||
|
|
"token_acc": 0.948218290555694,
|
||
|
|
"train_speed(iter/s)": 0.120576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47040376323010585,
|
||
|
|
"grad_norm": 0.9837439060211182,
|
||
|
|
"learning_rate": 9.41761327740385e-06,
|
||
|
|
"loss": 0.13003346920013428,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 375,
|
||
|
|
"token_acc": 0.957800478604328,
|
||
|
|
"train_speed(iter/s)": 0.121019
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4766758134065073,
|
||
|
|
"grad_norm": 0.7750929594039917,
|
||
|
|
"learning_rate": 9.402132384855573e-06,
|
||
|
|
"loss": 0.12979254722595215,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 380,
|
||
|
|
"token_acc": 0.9498875140607425,
|
||
|
|
"train_speed(iter/s)": 0.121423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4766758134065073,
|
||
|
|
"eval_loss": 0.2338828444480896,
|
||
|
|
"eval_runtime": 29.2086,
|
||
|
|
"eval_samples_per_second": 17.632,
|
||
|
|
"eval_steps_per_second": 4.417,
|
||
|
|
"eval_token_acc": 0.9252759599381213,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48294786358290864,
|
||
|
|
"grad_norm": 0.6944046020507812,
|
||
|
|
"learning_rate": 9.389610842080394e-06,
|
||
|
|
"loss": 0.12626748085021972,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 385,
|
||
|
|
"token_acc": 0.9413088592055652,
|
||
|
|
"train_speed(iter/s)": 0.120293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48921991375931007,
|
||
|
|
"grad_norm": 0.7166000008583069,
|
||
|
|
"learning_rate": 9.373788445138972e-06,
|
||
|
|
"loss": 0.12364100217819214,
|
||
|
|
"memory(GiB)": 33.77,
|
||
|
|
"step": 390,
|
||
|
|
"token_acc": 0.9516249135684273,
|
||
|
|
"train_speed(iter/s)": 0.120761
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4954919639357115,
|
||
|
|
"grad_norm": 0.6891298294067383,
|
||
|
|
"learning_rate": 9.357777276529793e-06,
|
||
|
|
"loss": 0.11418641805648803,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 395,
|
||
|
|
"token_acc": 0.9607581283065386,
|
||
|
|
"train_speed(iter/s)": 0.121105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5017640141121129,
|
||
|
|
"grad_norm": 0.8014844059944153,
|
||
|
|
"learning_rate": 9.341578027291085e-06,
|
||
|
|
"loss": 0.12451854944229127,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 400,
|
||
|
|
"token_acc": 0.9554823405376911,
|
||
|
|
"train_speed(iter/s)": 0.121475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5017640141121129,
|
||
|
|
"eval_loss": 0.2350710779428482,
|
||
|
|
"eval_runtime": 28.9464,
|
||
|
|
"eval_samples_per_second": 17.792,
|
||
|
|
"eval_steps_per_second": 4.457,
|
||
|
|
"eval_token_acc": 0.9253011139013377,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5080360642885143,
|
||
|
|
"grad_norm": 0.7108286619186401,
|
||
|
|
"learning_rate": 9.325191396578589e-06,
|
||
|
|
"loss": 0.12221509218215942,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 405,
|
||
|
|
"token_acc": 0.9413378371462204,
|
||
|
|
"train_speed(iter/s)": 0.12034
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5143081144649158,
|
||
|
|
"grad_norm": 0.7911909222602844,
|
||
|
|
"learning_rate": 9.308618091635382e-06,
|
||
|
|
"loss": 0.12177256345748902,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 410,
|
||
|
|
"token_acc": 0.959347706235673,
|
||
|
|
"train_speed(iter/s)": 0.120651
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5205801646413172,
|
||
|
|
"grad_norm": 1.8789387941360474,
|
||
|
|
"learning_rate": 9.291858827761359e-06,
|
||
|
|
"loss": 0.1333709716796875,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 415,
|
||
|
|
"token_acc": 0.95194391673133,
|
||
|
|
"train_speed(iter/s)": 0.120981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5268522148177185,
|
||
|
|
"grad_norm": 0.7984176278114319,
|
||
|
|
"learning_rate": 9.274914328282359e-06,
|
||
|
|
"loss": 0.12819453477859497,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 420,
|
||
|
|
"token_acc": 0.957134979829933,
|
||
|
|
"train_speed(iter/s)": 0.121312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5268522148177185,
|
||
|
|
"eval_loss": 0.23512502014636993,
|
||
|
|
"eval_runtime": 29.1452,
|
||
|
|
"eval_samples_per_second": 17.67,
|
||
|
|
"eval_steps_per_second": 4.426,
|
||
|
|
"eval_token_acc": 0.9250202279787533,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5331242649941199,
|
||
|
|
"grad_norm": 0.7520173788070679,
|
||
|
|
"learning_rate": 9.257785324518943e-06,
|
||
|
|
"loss": 0.12105765342712402,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 425,
|
||
|
|
"token_acc": 0.9403581723767339,
|
||
|
|
"train_speed(iter/s)": 0.120363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5393963151705213,
|
||
|
|
"grad_norm": 0.8471489548683167,
|
||
|
|
"learning_rate": 9.240472555754835e-06,
|
||
|
|
"loss": 0.12356100082397461,
|
||
|
|
"memory(GiB)": 36.04,
|
||
|
|
"step": 430,
|
||
|
|
"token_acc": 0.9564814136828489,
|
||
|
|
"train_speed(iter/s)": 0.120684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5456683653469228,
|
||
|
|
"grad_norm": 0.7935863733291626,
|
||
|
|
"learning_rate": 9.222976769205013e-06,
|
||
|
|
"loss": 0.12740910053253174,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 435,
|
||
|
|
"token_acc": 0.9569268406943757,
|
||
|
|
"train_speed(iter/s)": 0.120949
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5519404155233242,
|
||
|
|
"grad_norm": 0.7470819354057312,
|
||
|
|
"learning_rate": 9.205298719983458e-06,
|
||
|
|
"loss": 0.12629660367965698,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 440,
|
||
|
|
"token_acc": 0.9542381848107219,
|
||
|
|
"train_speed(iter/s)": 0.121266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5519404155233242,
|
||
|
|
"eval_loss": 0.2329576462507248,
|
||
|
|
"eval_runtime": 29.0573,
|
||
|
|
"eval_samples_per_second": 17.724,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.925007650997145,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5582124656997256,
|
||
|
|
"grad_norm": 0.8528454899787903,
|
||
|
|
"learning_rate": 9.187439171070563e-06,
|
||
|
|
"loss": 0.11683663129806518,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 445,
|
||
|
|
"token_acc": 0.9422364773256167,
|
||
|
|
"train_speed(iter/s)": 0.12028
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.564484515876127,
|
||
|
|
"grad_norm": 0.6902926564216614,
|
||
|
|
"learning_rate": 9.173021369887053e-06,
|
||
|
|
"loss": 0.1320955276489258,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 450,
|
||
|
|
"token_acc": 0.9549962232889062,
|
||
|
|
"train_speed(iter/s)": 0.12058
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5707565660525284,
|
||
|
|
"grad_norm": 0.7285463213920593,
|
||
|
|
"learning_rate": 9.154837069223594e-06,
|
||
|
|
"loss": 0.12488093376159667,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 455,
|
||
|
|
"token_acc": 0.9579794738443663,
|
||
|
|
"train_speed(iter/s)": 0.120802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5770286162289299,
|
||
|
|
"grad_norm": 0.6829497218132019,
|
||
|
|
"learning_rate": 9.136473446781624e-06,
|
||
|
|
"loss": 0.12886552810668944,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 460,
|
||
|
|
"token_acc": 0.9550450619099832,
|
||
|
|
"train_speed(iter/s)": 0.121049
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5770286162289299,
|
||
|
|
"eval_loss": 0.2350022941827774,
|
||
|
|
"eval_runtime": 29.2036,
|
||
|
|
"eval_samples_per_second": 17.635,
|
||
|
|
"eval_steps_per_second": 4.417,
|
||
|
|
"eval_token_acc": 0.9253220755373516,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5833006664053313,
|
||
|
|
"grad_norm": 0.793786346912384,
|
||
|
|
"learning_rate": 9.11793129513072e-06,
|
||
|
|
"loss": 0.1309070110321045,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 465,
|
||
|
|
"token_acc": 0.9386901904304689,
|
||
|
|
"train_speed(iter/s)": 0.12018
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5895727165817326,
|
||
|
|
"grad_norm": 0.6860336661338806,
|
||
|
|
"learning_rate": 9.102969570306243e-06,
|
||
|
|
"loss": 0.13614410161972046,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 470,
|
||
|
|
"token_acc": 0.950530035335689,
|
||
|
|
"train_speed(iter/s)": 0.120521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.595844766758134,
|
||
|
|
"grad_norm": 0.7473175525665283,
|
||
|
|
"learning_rate": 9.084108087927778e-06,
|
||
|
|
"loss": 0.13468925952911376,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 475,
|
||
|
|
"token_acc": 0.9512150026413101,
|
||
|
|
"train_speed(iter/s)": 0.120848
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6021168169345354,
|
||
|
|
"grad_norm": 0.705289900302887,
|
||
|
|
"learning_rate": 9.065070336416794e-06,
|
||
|
|
"loss": 0.12688368558883667,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 480,
|
||
|
|
"token_acc": 0.9514988814317673,
|
||
|
|
"train_speed(iter/s)": 0.121127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6021168169345354,
|
||
|
|
"eval_loss": 0.23424042761325836,
|
||
|
|
"eval_runtime": 28.918,
|
||
|
|
"eval_samples_per_second": 17.809,
|
||
|
|
"eval_steps_per_second": 4.461,
|
||
|
|
"eval_token_acc": 0.9258838473825205,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6083888671109369,
|
||
|
|
"grad_norm": 0.7309315204620361,
|
||
|
|
"learning_rate": 9.045857137438114e-06,
|
||
|
|
"loss": 0.12572396993637086,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 485,
|
||
|
|
"token_acc": 0.9416618199382905,
|
||
|
|
"train_speed(iter/s)": 0.120198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6146609172873383,
|
||
|
|
"grad_norm": 0.7343481779098511,
|
||
|
|
"learning_rate": 9.02646932022883e-06,
|
||
|
|
"loss": 0.12929785251617432,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 490,
|
||
|
|
"token_acc": 0.9553018035624546,
|
||
|
|
"train_speed(iter/s)": 0.120514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6209329674637397,
|
||
|
|
"grad_norm": 0.7487764954566956,
|
||
|
|
"learning_rate": 9.006907721562515e-06,
|
||
|
|
"loss": 0.12204375267028808,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 495,
|
||
|
|
"token_acc": 0.9591731423020884,
|
||
|
|
"train_speed(iter/s)": 0.120819
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6272050176401411,
|
||
|
|
"grad_norm": 0.7962038516998291,
|
||
|
|
"learning_rate": 8.987173185713113e-06,
|
||
|
|
"loss": 0.12226212024688721,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 500,
|
||
|
|
"token_acc": 0.9564000589188393,
|
||
|
|
"train_speed(iter/s)": 0.121101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6272050176401411,
|
||
|
|
"eval_loss": 0.23693928122520447,
|
||
|
|
"eval_runtime": 29.1127,
|
||
|
|
"eval_samples_per_second": 17.69,
|
||
|
|
"eval_steps_per_second": 4.431,
|
||
|
|
"eval_token_acc": 0.9255736151695168,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6334770678165426,
|
||
|
|
"grad_norm": 0.7448955178260803,
|
||
|
|
"learning_rate": 8.967266564418485e-06,
|
||
|
|
"loss": 0.12553646564483642,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 505,
|
||
|
|
"token_acc": 0.9397639899675178,
|
||
|
|
"train_speed(iter/s)": 0.120264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.639749117992944,
|
||
|
|
"grad_norm": 0.6869771480560303,
|
||
|
|
"learning_rate": 8.947188716843668e-06,
|
||
|
|
"loss": 0.12530720233917236,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 510,
|
||
|
|
"token_acc": 0.9531347241388641,
|
||
|
|
"train_speed(iter/s)": 0.120511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6460211681693454,
|
||
|
|
"grad_norm": 0.7007948756217957,
|
||
|
|
"learning_rate": 8.926940509543786e-06,
|
||
|
|
"loss": 0.12557142972946167,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 515,
|
||
|
|
"token_acc": 0.9570901871809416,
|
||
|
|
"train_speed(iter/s)": 0.120789
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6522932183457467,
|
||
|
|
"grad_norm": 0.6679887771606445,
|
||
|
|
"learning_rate": 8.906522816426642e-06,
|
||
|
|
"loss": 0.11763076782226563,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 520,
|
||
|
|
"token_acc": 0.9630898229846002,
|
||
|
|
"train_speed(iter/s)": 0.121093
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6522932183457467,
|
||
|
|
"eval_loss": 0.23494519293308258,
|
||
|
|
"eval_runtime": 29.1208,
|
||
|
|
"eval_samples_per_second": 17.685,
|
||
|
|
"eval_steps_per_second": 4.43,
|
||
|
|
"eval_token_acc": 0.9250202279787533,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6585652685221481,
|
||
|
|
"grad_norm": 1.0034900903701782,
|
||
|
|
"learning_rate": 8.885936518715009e-06,
|
||
|
|
"loss": 0.12190806865692139,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 525,
|
||
|
|
"token_acc": 0.9413621144839724,
|
||
|
|
"train_speed(iter/s)": 0.12028
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6648373186985496,
|
||
|
|
"grad_norm": 0.7202277779579163,
|
||
|
|
"learning_rate": 8.865182504908593e-06,
|
||
|
|
"loss": 0.12205361127853394,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 530,
|
||
|
|
"token_acc": 0.9566966466480848,
|
||
|
|
"train_speed(iter/s)": 0.120583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.671109368874951,
|
||
|
|
"grad_norm": 0.7922447323799133,
|
||
|
|
"learning_rate": 8.84426167074569e-06,
|
||
|
|
"loss": 0.12360981702804566,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 535,
|
||
|
|
"token_acc": 0.9545589899350843,
|
||
|
|
"train_speed(iter/s)": 0.120816
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6773814190513524,
|
||
|
|
"grad_norm": 0.7317930459976196,
|
||
|
|
"learning_rate": 8.823174919164517e-06,
|
||
|
|
"loss": 0.12647807598114014,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 540,
|
||
|
|
"token_acc": 0.9562937062937062,
|
||
|
|
"train_speed(iter/s)": 0.121078
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6773814190513524,
|
||
|
|
"eval_loss": 0.234180748462677,
|
||
|
|
"eval_runtime": 29.282,
|
||
|
|
"eval_samples_per_second": 17.588,
|
||
|
|
"eval_steps_per_second": 4.405,
|
||
|
|
"eval_token_acc": 0.9257161542944103,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6836534692277538,
|
||
|
|
"grad_norm": 0.7069205045700073,
|
||
|
|
"learning_rate": 8.801923160264254e-06,
|
||
|
|
"loss": 0.12029304504394531,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 545,
|
||
|
|
"token_acc": 0.940614257111556,
|
||
|
|
"train_speed(iter/s)": 0.120293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6899255194041553,
|
||
|
|
"grad_norm": 0.7268481850624084,
|
||
|
|
"learning_rate": 8.78050731126575e-06,
|
||
|
|
"loss": 0.12312864065170288,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 550,
|
||
|
|
"token_acc": 0.9642680054543201,
|
||
|
|
"train_speed(iter/s)": 0.12052
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6961975695805567,
|
||
|
|
"grad_norm": 0.7602020502090454,
|
||
|
|
"learning_rate": 8.758928296471955e-06,
|
||
|
|
"loss": 0.12826888561248778,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 555,
|
||
|
|
"token_acc": 0.9557154631332023,
|
||
|
|
"train_speed(iter/s)": 0.120805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7024696197569581,
|
||
|
|
"grad_norm": 0.7089629173278809,
|
||
|
|
"learning_rate": 8.737187047228004e-06,
|
||
|
|
"loss": 0.12195276021957398,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 560,
|
||
|
|
"token_acc": 0.9564936463493431,
|
||
|
|
"train_speed(iter/s)": 0.121045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7024696197569581,
|
||
|
|
"eval_loss": 0.23239342868328094,
|
||
|
|
"eval_runtime": 28.8986,
|
||
|
|
"eval_samples_per_second": 17.821,
|
||
|
|
"eval_steps_per_second": 4.464,
|
||
|
|
"eval_token_acc": 0.9253933450997983,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7087416699333595,
|
||
|
|
"grad_norm": 0.7802727818489075,
|
||
|
|
"learning_rate": 8.715284501881039e-06,
|
||
|
|
"loss": 0.12478115558624267,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 565,
|
||
|
|
"token_acc": 0.9394610632417493,
|
||
|
|
"train_speed(iter/s)": 0.120318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7150137201097608,
|
||
|
|
"grad_norm": 0.692456841468811,
|
||
|
|
"learning_rate": 8.693221605739697e-06,
|
||
|
|
"loss": 0.12183520793914795,
|
||
|
|
"memory(GiB)": 38.32,
|
||
|
|
"step": 570,
|
||
|
|
"token_acc": 0.9573177580590813,
|
||
|
|
"train_speed(iter/s)": 0.120604
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7212857702861623,
|
||
|
|
"grad_norm": 0.783674955368042,
|
||
|
|
"learning_rate": 8.670999311033328e-06,
|
||
|
|
"loss": 0.1260378837585449,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 575,
|
||
|
|
"token_acc": 0.958084188606277,
|
||
|
|
"train_speed(iter/s)": 0.12084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7275578204625637,
|
||
|
|
"grad_norm": 0.7612254023551941,
|
||
|
|
"learning_rate": 8.648618576870877e-06,
|
||
|
|
"loss": 0.12205030918121337,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 580,
|
||
|
|
"token_acc": 0.9522026264517598,
|
||
|
|
"train_speed(iter/s)": 0.121105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7275578204625637,
|
||
|
|
"eval_loss": 0.23236523568630219,
|
||
|
|
"eval_runtime": 29.1132,
|
||
|
|
"eval_samples_per_second": 17.69,
|
||
|
|
"eval_steps_per_second": 4.431,
|
||
|
|
"eval_token_acc": 0.9257538852392352,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7338298706389651,
|
||
|
|
"grad_norm": 0.6874219179153442,
|
||
|
|
"learning_rate": 8.626080369199499e-06,
|
||
|
|
"loss": 0.12317302227020263,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 585,
|
||
|
|
"token_acc": 0.9398445420750253,
|
||
|
|
"train_speed(iter/s)": 0.120379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7401019208153665,
|
||
|
|
"grad_norm": 0.7761655449867249,
|
||
|
|
"learning_rate": 8.603385660762872e-06,
|
||
|
|
"loss": 0.1282115697860718,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 590,
|
||
|
|
"token_acc": 0.9520723436322532,
|
||
|
|
"train_speed(iter/s)": 0.120652
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7463739709917679,
|
||
|
|
"grad_norm": 0.6982787251472473,
|
||
|
|
"learning_rate": 8.58053543105921e-06,
|
||
|
|
"loss": 0.1281890869140625,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 595,
|
||
|
|
"token_acc": 0.953644096279635,
|
||
|
|
"train_speed(iter/s)": 0.120874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7526460211681694,
|
||
|
|
"grad_norm": 0.7546159625053406,
|
||
|
|
"learning_rate": 8.55753066629898e-06,
|
||
|
|
"loss": 0.12751117944717408,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 600,
|
||
|
|
"token_acc": 0.9540350393157677,
|
||
|
|
"train_speed(iter/s)": 0.121112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7526460211681694,
|
||
|
|
"eval_loss": 0.23002442717552185,
|
||
|
|
"eval_runtime": 29.1415,
|
||
|
|
"eval_samples_per_second": 17.672,
|
||
|
|
"eval_steps_per_second": 4.427,
|
||
|
|
"eval_token_acc": 0.9261940795955242,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7589180713445708,
|
||
|
|
"grad_norm": 0.7351900935173035,
|
||
|
|
"learning_rate": 8.534372359362357e-06,
|
||
|
|
"loss": 0.1303678870201111,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 605,
|
||
|
|
"token_acc": 0.9409314468422133,
|
||
|
|
"train_speed(iter/s)": 0.120394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7651901215209722,
|
||
|
|
"grad_norm": 0.8682727217674255,
|
||
|
|
"learning_rate": 8.51106150975635e-06,
|
||
|
|
"loss": 0.1233241081237793,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 610,
|
||
|
|
"token_acc": 0.95544310046902,
|
||
|
|
"train_speed(iter/s)": 0.120622
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7714621716973736,
|
||
|
|
"grad_norm": 0.7701956629753113,
|
||
|
|
"learning_rate": 8.487599123571675e-06,
|
||
|
|
"loss": 0.11557955741882324,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 615,
|
||
|
|
"token_acc": 0.9595501699938976,
|
||
|
|
"train_speed(iter/s)": 0.120911
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.777734221873775,
|
||
|
|
"grad_norm": 0.763529360294342,
|
||
|
|
"learning_rate": 8.463986213439337e-06,
|
||
|
|
"loss": 0.12450950145721436,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 620,
|
||
|
|
"token_acc": 0.9594680177327423,
|
||
|
|
"train_speed(iter/s)": 0.121153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.777734221873775,
|
||
|
|
"eval_loss": 0.2307971715927124,
|
||
|
|
"eval_runtime": 28.9628,
|
||
|
|
"eval_samples_per_second": 17.781,
|
||
|
|
"eval_steps_per_second": 4.454,
|
||
|
|
"eval_token_acc": 0.9257371159304242,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7840062720501764,
|
||
|
|
"grad_norm": 0.743877649307251,
|
||
|
|
"learning_rate": 8.440223798486913e-06,
|
||
|
|
"loss": 0.13349132537841796,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 625,
|
||
|
|
"token_acc": 0.9376087341521601,
|
||
|
|
"train_speed(iter/s)": 0.120442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7902783222265778,
|
||
|
|
"grad_norm": 0.7353401184082031,
|
||
|
|
"learning_rate": 8.416312904294572e-06,
|
||
|
|
"loss": 0.13025209903717042,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 630,
|
||
|
|
"token_acc": 0.960784808848038,
|
||
|
|
"train_speed(iter/s)": 0.120683
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7965503724029792,
|
||
|
|
"grad_norm": 0.7505218982696533,
|
||
|
|
"learning_rate": 8.397077977170049e-06,
|
||
|
|
"loss": 0.13371331691741944,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 635,
|
||
|
|
"token_acc": 0.9515476784822766,
|
||
|
|
"train_speed(iter/s)": 0.120921
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8028224225793806,
|
||
|
|
"grad_norm": 0.8705490231513977,
|
||
|
|
"learning_rate": 8.372902425234847e-06,
|
||
|
|
"loss": 0.12443286180496216,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 640,
|
||
|
|
"token_acc": 0.9559957659156207,
|
||
|
|
"train_speed(iter/s)": 0.12116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8028224225793806,
|
||
|
|
"eval_loss": 0.23105858266353607,
|
||
|
|
"eval_runtime": 29.1773,
|
||
|
|
"eval_samples_per_second": 17.651,
|
||
|
|
"eval_steps_per_second": 4.421,
|
||
|
|
"eval_token_acc": 0.9266720048966381,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8090944727557821,
|
||
|
|
"grad_norm": 0.720313549041748,
|
||
|
|
"learning_rate": 8.348581299634171e-06,
|
||
|
|
"loss": 0.12005361318588256,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 645,
|
||
|
|
"token_acc": 0.942721820579713,
|
||
|
|
"train_speed(iter/s)": 0.120467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8153665229321835,
|
||
|
|
"grad_norm": 0.7976186275482178,
|
||
|
|
"learning_rate": 8.324115650062005e-06,
|
||
|
|
"loss": 0.1226189136505127,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 650,
|
||
|
|
"token_acc": 0.95751953125,
|
||
|
|
"train_speed(iter/s)": 0.120671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8216385731085849,
|
||
|
|
"grad_norm": 0.7320578694343567,
|
||
|
|
"learning_rate": 8.29950653244996e-06,
|
||
|
|
"loss": 0.12214083671569824,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 655,
|
||
|
|
"token_acc": 0.9560769335697722,
|
||
|
|
"train_speed(iter/s)": 0.120871
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8279106232849863,
|
||
|
|
"grad_norm": 0.714158833026886,
|
||
|
|
"learning_rate": 8.27475500892169e-06,
|
||
|
|
"loss": 0.13046940565109252,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 660,
|
||
|
|
"token_acc": 0.9564310899892687,
|
||
|
|
"train_speed(iter/s)": 0.121118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8279106232849863,
|
||
|
|
"eval_loss": 0.2298216074705124,
|
||
|
|
"eval_runtime": 29.0928,
|
||
|
|
"eval_samples_per_second": 17.702,
|
||
|
|
"eval_steps_per_second": 4.434,
|
||
|
|
"eval_token_acc": 0.9264120806100674,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8341826734613876,
|
||
|
|
"grad_norm": 0.70967036485672,
|
||
|
|
"learning_rate": 8.249862147747062e-06,
|
||
|
|
"loss": 0.12797050476074218,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 665,
|
||
|
|
"token_acc": 0.940926979466161,
|
||
|
|
"train_speed(iter/s)": 0.120422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8404547236377891,
|
||
|
|
"grad_norm": 0.6546662449836731,
|
||
|
|
"learning_rate": 8.224829023296032e-06,
|
||
|
|
"loss": 0.12179737091064453,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 670,
|
||
|
|
"token_acc": 0.9526879044300647,
|
||
|
|
"train_speed(iter/s)": 0.12064
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8467267738141905,
|
||
|
|
"grad_norm": 0.7337839007377625,
|
||
|
|
"learning_rate": 8.199656715992292e-06,
|
||
|
|
"loss": 0.13117530345916747,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 675,
|
||
|
|
"token_acc": 0.9478685921294229,
|
||
|
|
"train_speed(iter/s)": 0.120857
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8529988239905919,
|
||
|
|
"grad_norm": 0.7467445731163025,
|
||
|
|
"learning_rate": 8.179419388376196e-06,
|
||
|
|
"loss": 0.13929787874221802,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 680,
|
||
|
|
"token_acc": 0.946326665465249,
|
||
|
|
"train_speed(iter/s)": 0.121055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8529988239905919,
|
||
|
|
"eval_loss": 0.22663576900959015,
|
||
|
|
"eval_runtime": 29.1409,
|
||
|
|
"eval_samples_per_second": 17.673,
|
||
|
|
"eval_steps_per_second": 4.427,
|
||
|
|
"eval_token_acc": 0.9265671967165693,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8592708741669933,
|
||
|
|
"grad_norm": 0.6901698708534241,
|
||
|
|
"learning_rate": 8.153999293750005e-06,
|
||
|
|
"loss": 0.1212563157081604,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 685,
|
||
|
|
"token_acc": 0.9411858718235576,
|
||
|
|
"train_speed(iter/s)": 0.120382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8655429243433947,
|
||
|
|
"grad_norm": 0.7482547163963318,
|
||
|
|
"learning_rate": 8.128443073265364e-06,
|
||
|
|
"loss": 0.13035836219787597,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 690,
|
||
|
|
"token_acc": 0.9523483030510799,
|
||
|
|
"train_speed(iter/s)": 0.120598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8718149745197962,
|
||
|
|
"grad_norm": 0.7634561657905579,
|
||
|
|
"learning_rate": 8.102751829922664e-06,
|
||
|
|
"loss": 0.13618214130401612,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 695,
|
||
|
|
"token_acc": 0.9506073092564165,
|
||
|
|
"train_speed(iter/s)": 0.1208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8780870246961976,
|
||
|
|
"grad_norm": 0.743306040763855,
|
||
|
|
"learning_rate": 8.082102363728494e-06,
|
||
|
|
"loss": 0.12926363945007324,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 700,
|
||
|
|
"token_acc": 0.9271781534460338,
|
||
|
|
"train_speed(iter/s)": 0.121029
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8780870246961976,
|
||
|
|
"eval_loss": 0.22914662957191467,
|
||
|
|
"eval_runtime": 28.9342,
|
||
|
|
"eval_samples_per_second": 17.799,
|
||
|
|
"eval_steps_per_second": 4.458,
|
||
|
|
"eval_token_acc": 0.9267055435142602,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.884359074872599,
|
||
|
|
"grad_norm": 0.774067759513855,
|
||
|
|
"learning_rate": 8.056170877373277e-06,
|
||
|
|
"loss": 0.12883291244506836,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 705,
|
||
|
|
"token_acc": 0.941404062515909,
|
||
|
|
"train_speed(iter/s)": 0.120445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8906311250490004,
|
||
|
|
"grad_norm": 0.731858491897583,
|
||
|
|
"learning_rate": 8.030107487410766e-06,
|
||
|
|
"loss": 0.1272268772125244,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 710,
|
||
|
|
"token_acc": 0.9559863699726366,
|
||
|
|
"train_speed(iter/s)": 0.120655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8969031752254017,
|
||
|
|
"grad_norm": 0.7816020846366882,
|
||
|
|
"learning_rate": 8.003913318730662e-06,
|
||
|
|
"loss": 0.12550874948501586,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 715,
|
||
|
|
"token_acc": 0.9573273273273273,
|
||
|
|
"train_speed(iter/s)": 0.120859
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9031752254018032,
|
||
|
|
"grad_norm": 0.7145897150039673,
|
||
|
|
"learning_rate": 7.97758950186705e-06,
|
||
|
|
"loss": 0.11747034788131713,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 720,
|
||
|
|
"token_acc": 0.9536420703541395,
|
||
|
|
"train_speed(iter/s)": 0.121052
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9031752254018032,
|
||
|
|
"eval_loss": 0.2287711501121521,
|
||
|
|
"eval_runtime": 28.9287,
|
||
|
|
"eval_samples_per_second": 17.802,
|
||
|
|
"eval_steps_per_second": 4.459,
|
||
|
|
"eval_token_acc": 0.9270073910728585,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9094472755782046,
|
||
|
|
"grad_norm": 0.7315598130226135,
|
||
|
|
"learning_rate": 7.951137172949595e-06,
|
||
|
|
"loss": 0.1277442455291748,
|
||
|
|
"memory(GiB)": 40.76,
|
||
|
|
"step": 725,
|
||
|
|
"token_acc": 0.9404945141684427,
|
||
|
|
"train_speed(iter/s)": 0.12046
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.915719325754606,
|
||
|
|
"grad_norm": 0.7869780659675598,
|
||
|
|
"learning_rate": 7.924557473654516e-06,
|
||
|
|
"loss": 0.13705768585205078,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 730,
|
||
|
|
"token_acc": 0.9484856989768581,
|
||
|
|
"train_speed(iter/s)": 0.120675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9219913759310074,
|
||
|
|
"grad_norm": 0.738673985004425,
|
||
|
|
"learning_rate": 7.897851551155306e-06,
|
||
|
|
"loss": 0.12492038011550903,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 735,
|
||
|
|
"token_acc": 0.9554785841007012,
|
||
|
|
"train_speed(iter/s)": 0.120899
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9282634261074089,
|
||
|
|
"grad_norm": 0.6920596361160278,
|
||
|
|
"learning_rate": 7.871020558073217e-06,
|
||
|
|
"loss": 0.12350271940231324,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 740,
|
||
|
|
"token_acc": 0.9585155697561742,
|
||
|
|
"train_speed(iter/s)": 0.121065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9282634261074089,
|
||
|
|
"eval_loss": 0.22647298872470856,
|
||
|
|
"eval_runtime": 29.274,
|
||
|
|
"eval_samples_per_second": 17.592,
|
||
|
|
"eval_steps_per_second": 4.407,
|
||
|
|
"eval_token_acc": 0.9271583148521576,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9345354762838103,
|
||
|
|
"grad_norm": 0.7176641821861267,
|
||
|
|
"learning_rate": 7.849466490796728e-06,
|
||
|
|
"loss": 0.12098994255065917,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 745,
|
||
|
|
"token_acc": 0.9415065810170385,
|
||
|
|
"train_speed(iter/s)": 0.120467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9408075264602117,
|
||
|
|
"grad_norm": 0.7007727026939392,
|
||
|
|
"learning_rate": 7.822413292469593e-06,
|
||
|
|
"loss": 0.12603325843811036,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 750,
|
||
|
|
"token_acc": 0.955721036803666,
|
||
|
|
"train_speed(iter/s)": 0.120684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9470795766366131,
|
||
|
|
"grad_norm": 0.7308924198150635,
|
||
|
|
"learning_rate": 7.79523827945686e-06,
|
||
|
|
"loss": 0.1311476469039917,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 755,
|
||
|
|
"token_acc": 0.9497907949790795,
|
||
|
|
"train_speed(iter/s)": 0.120874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9533516268130146,
|
||
|
|
"grad_norm": 0.701979398727417,
|
||
|
|
"learning_rate": 7.767942624625625e-06,
|
||
|
|
"loss": 0.12925295829772948,
|
||
|
|
"memory(GiB)": 40.77,
|
||
|
|
"step": 760,
|
||
|
|
"token_acc": 0.9502816180235535,
|
||
|
|
"train_speed(iter/s)": 0.121076
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9533516268130146,
|
||
|
|
"eval_loss": 0.2251773178577423,
|
||
|
|
"eval_runtime": 29.2786,
|
||
|
|
"eval_samples_per_second": 17.59,
|
||
|
|
"eval_steps_per_second": 4.406,
|
||
|
|
"eval_token_acc": 0.9271918534697796,
|
||
|
|
"step": 760
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 2391,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 3,
|
||
|
|
"save_steps": 20,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 9.199964448180142e+17,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|