6013 lines
171 KiB
JSON
6013 lines
171 KiB
JSON
{
|
|
"best_global_step": 1620,
|
|
"best_metric": 0.3465479,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v14-20250430-214816/checkpoint-1620",
|
|
"epoch": 2.9988481916609078,
|
|
"eval_steps": 20,
|
|
"global_step": 2439,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0012285955616985335,
|
|
"grad_norm": 4.7140889167785645,
|
|
"learning_rate": 9.99999585221637e-06,
|
|
"loss": 0.7454355955123901,
|
|
"memory(GiB)": 28.92,
|
|
"step": 1,
|
|
"token_acc": 0.8220973782771536,
|
|
"train_speed(iter/s)": 0.064112
|
|
},
|
|
{
|
|
"epoch": 0.006142977808492667,
|
|
"grad_norm": 2.3402233123779297,
|
|
"learning_rate": 9.999896305753298e-06,
|
|
"loss": 0.6025638580322266,
|
|
"memory(GiB)": 28.92,
|
|
"step": 5,
|
|
"token_acc": 0.8154806964420893,
|
|
"train_speed(iter/s)": 0.1223
|
|
},
|
|
{
|
|
"epoch": 0.012285955616985334,
|
|
"grad_norm": 1.1033470630645752,
|
|
"learning_rate": 9.99958522731419e-06,
|
|
"loss": 0.4681520462036133,
|
|
"memory(GiB)": 28.96,
|
|
"step": 10,
|
|
"token_acc": 0.8509727902413654,
|
|
"train_speed(iter/s)": 0.134255
|
|
},
|
|
{
|
|
"epoch": 0.018428933425478,
|
|
"grad_norm": 1.1683531999588013,
|
|
"learning_rate": 9.999066777585496e-06,
|
|
"loss": 0.4340578556060791,
|
|
"memory(GiB)": 30.5,
|
|
"step": 15,
|
|
"token_acc": 0.8624967569989859,
|
|
"train_speed(iter/s)": 0.141354
|
|
},
|
|
{
|
|
"epoch": 0.024571911233970668,
|
|
"grad_norm": 0.8643156290054321,
|
|
"learning_rate": 9.998340978071314e-06,
|
|
"loss": 0.438944673538208,
|
|
"memory(GiB)": 30.5,
|
|
"step": 20,
|
|
"token_acc": 0.8628914650122352,
|
|
"train_speed(iter/s)": 0.145504
|
|
},
|
|
{
|
|
"epoch": 0.024571911233970668,
|
|
"eval_loss": 0.43904876708984375,
|
|
"eval_runtime": 31.0999,
|
|
"eval_samples_per_second": 16.913,
|
|
"eval_steps_per_second": 4.244,
|
|
"eval_token_acc": 0.8671750972762646,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.030714889042463334,
|
|
"grad_norm": 0.882213830947876,
|
|
"learning_rate": 9.997407858876141e-06,
|
|
"loss": 0.4316856384277344,
|
|
"memory(GiB)": 32.21,
|
|
"step": 25,
|
|
"token_acc": 0.8673553096382113,
|
|
"train_speed(iter/s)": 0.118362
|
|
},
|
|
{
|
|
"epoch": 0.036857866850956,
|
|
"grad_norm": 0.876335859298706,
|
|
"learning_rate": 9.99626745870361e-06,
|
|
"loss": 0.4254283428192139,
|
|
"memory(GiB)": 32.21,
|
|
"step": 30,
|
|
"token_acc": 0.866745778634824,
|
|
"train_speed(iter/s)": 0.122938
|
|
},
|
|
{
|
|
"epoch": 0.043000844659448666,
|
|
"grad_norm": 0.8186553120613098,
|
|
"learning_rate": 9.994919824854899e-06,
|
|
"loss": 0.4170750617980957,
|
|
"memory(GiB)": 32.21,
|
|
"step": 35,
|
|
"token_acc": 0.8640802675585284,
|
|
"train_speed(iter/s)": 0.127141
|
|
},
|
|
{
|
|
"epoch": 0.049143822467941335,
|
|
"grad_norm": 0.8065207004547119,
|
|
"learning_rate": 9.993365013226757e-06,
|
|
"loss": 0.40838775634765623,
|
|
"memory(GiB)": 32.21,
|
|
"step": 40,
|
|
"token_acc": 0.8663708595604169,
|
|
"train_speed(iter/s)": 0.130143
|
|
},
|
|
{
|
|
"epoch": 0.049143822467941335,
|
|
"eval_loss": 0.41924959421157837,
|
|
"eval_runtime": 31.0376,
|
|
"eval_samples_per_second": 16.947,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8721037613488976,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.055286800276434005,
|
|
"grad_norm": 0.7789999842643738,
|
|
"learning_rate": 9.991603088309195e-06,
|
|
"loss": 0.4384481906890869,
|
|
"memory(GiB)": 32.21,
|
|
"step": 45,
|
|
"token_acc": 0.8650371852302875,
|
|
"train_speed(iter/s)": 0.117241
|
|
},
|
|
{
|
|
"epoch": 0.06142977808492667,
|
|
"grad_norm": 0.7491472959518433,
|
|
"learning_rate": 9.989634123182798e-06,
|
|
"loss": 0.3983407497406006,
|
|
"memory(GiB)": 32.21,
|
|
"step": 50,
|
|
"token_acc": 0.8744787141615986,
|
|
"train_speed(iter/s)": 0.120332
|
|
},
|
|
{
|
|
"epoch": 0.06757275589341934,
|
|
"grad_norm": 0.8437614440917969,
|
|
"learning_rate": 9.987458199515714e-06,
|
|
"loss": 0.4000354290008545,
|
|
"memory(GiB)": 32.21,
|
|
"step": 55,
|
|
"token_acc": 0.8653561422291064,
|
|
"train_speed(iter/s)": 0.123396
|
|
},
|
|
{
|
|
"epoch": 0.073715733701912,
|
|
"grad_norm": 0.7674087285995483,
|
|
"learning_rate": 9.985075407560247e-06,
|
|
"loss": 0.4135420799255371,
|
|
"memory(GiB)": 32.21,
|
|
"step": 60,
|
|
"token_acc": 0.872202027931892,
|
|
"train_speed(iter/s)": 0.125154
|
|
},
|
|
{
|
|
"epoch": 0.073715733701912,
|
|
"eval_loss": 0.4098711311817169,
|
|
"eval_runtime": 31.0819,
|
|
"eval_samples_per_second": 16.923,
|
|
"eval_steps_per_second": 4.247,
|
|
"eval_token_acc": 0.8743692174664938,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.07985871151040466,
|
|
"grad_norm": 0.8239404559135437,
|
|
"learning_rate": 9.982485846149125e-06,
|
|
"loss": 0.39459028244018557,
|
|
"memory(GiB)": 32.21,
|
|
"step": 65,
|
|
"token_acc": 0.8727861165617594,
|
|
"train_speed(iter/s)": 0.116909
|
|
},
|
|
{
|
|
"epoch": 0.08600168931889733,
|
|
"grad_norm": 0.8135547637939453,
|
|
"learning_rate": 9.979689622691393e-06,
|
|
"loss": 0.4003786087036133,
|
|
"memory(GiB)": 32.21,
|
|
"step": 70,
|
|
"token_acc": 0.8739415872132136,
|
|
"train_speed(iter/s)": 0.118714
|
|
},
|
|
{
|
|
"epoch": 0.09214466712739,
|
|
"grad_norm": 0.853965699672699,
|
|
"learning_rate": 9.976686853167967e-06,
|
|
"loss": 0.405532693862915,
|
|
"memory(GiB)": 32.21,
|
|
"step": 75,
|
|
"token_acc": 0.863868962219034,
|
|
"train_speed(iter/s)": 0.120582
|
|
},
|
|
{
|
|
"epoch": 0.09828764493588267,
|
|
"grad_norm": 0.7862138152122498,
|
|
"learning_rate": 9.973477662126818e-06,
|
|
"loss": 0.38930883407592776,
|
|
"memory(GiB)": 32.21,
|
|
"step": 80,
|
|
"token_acc": 0.8843768172126381,
|
|
"train_speed(iter/s)": 0.122421
|
|
},
|
|
{
|
|
"epoch": 0.09828764493588267,
|
|
"eval_loss": 0.4023858904838562,
|
|
"eval_runtime": 30.9765,
|
|
"eval_samples_per_second": 16.981,
|
|
"eval_steps_per_second": 4.261,
|
|
"eval_token_acc": 0.8762680501513186,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.10443062274437534,
|
|
"grad_norm": 0.7761799097061157,
|
|
"learning_rate": 9.970062182677802e-06,
|
|
"loss": 0.3841962099075317,
|
|
"memory(GiB)": 32.21,
|
|
"step": 85,
|
|
"token_acc": 0.8720659317731335,
|
|
"train_speed(iter/s)": 0.116555
|
|
},
|
|
{
|
|
"epoch": 0.11057360055286801,
|
|
"grad_norm": 0.7647544145584106,
|
|
"learning_rate": 9.966440556487149e-06,
|
|
"loss": 0.40062150955200193,
|
|
"memory(GiB)": 32.21,
|
|
"step": 90,
|
|
"token_acc": 0.8734117200834439,
|
|
"train_speed(iter/s)": 0.11815
|
|
},
|
|
{
|
|
"epoch": 0.11671657836136066,
|
|
"grad_norm": 0.8558200597763062,
|
|
"learning_rate": 9.962612933771575e-06,
|
|
"loss": 0.41026945114135743,
|
|
"memory(GiB)": 32.21,
|
|
"step": 95,
|
|
"token_acc": 0.8802111051978002,
|
|
"train_speed(iter/s)": 0.119854
|
|
},
|
|
{
|
|
"epoch": 0.12285955616985333,
|
|
"grad_norm": 0.8282895088195801,
|
|
"learning_rate": 9.958579473292067e-06,
|
|
"loss": 0.40637502670288084,
|
|
"memory(GiB)": 32.21,
|
|
"step": 100,
|
|
"token_acc": 0.8726802284082797,
|
|
"train_speed(iter/s)": 0.121692
|
|
},
|
|
{
|
|
"epoch": 0.12285955616985333,
|
|
"eval_loss": 0.39839640259742737,
|
|
"eval_runtime": 31.0438,
|
|
"eval_samples_per_second": 16.944,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8775339386078685,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.129002533978346,
|
|
"grad_norm": 0.8366308212280273,
|
|
"learning_rate": 9.95434034234728e-06,
|
|
"loss": 0.3811495780944824,
|
|
"memory(GiB)": 32.21,
|
|
"step": 105,
|
|
"token_acc": 0.875357573668792,
|
|
"train_speed(iter/s)": 0.117395
|
|
},
|
|
{
|
|
"epoch": 0.13514551178683867,
|
|
"grad_norm": 0.7479439377784729,
|
|
"learning_rate": 9.949895716766611e-06,
|
|
"loss": 0.38749701976776124,
|
|
"memory(GiB)": 32.21,
|
|
"step": 110,
|
|
"token_acc": 0.8701843549972431,
|
|
"train_speed(iter/s)": 0.118845
|
|
},
|
|
{
|
|
"epoch": 0.14128848959533133,
|
|
"grad_norm": 0.801934003829956,
|
|
"learning_rate": 9.945245780902899e-06,
|
|
"loss": 0.37144348621368406,
|
|
"memory(GiB)": 32.21,
|
|
"step": 115,
|
|
"token_acc": 0.8773385913426266,
|
|
"train_speed(iter/s)": 0.120098
|
|
},
|
|
{
|
|
"epoch": 0.147431467403824,
|
|
"grad_norm": 0.7849209308624268,
|
|
"learning_rate": 9.940390727624785e-06,
|
|
"loss": 0.4016891956329346,
|
|
"memory(GiB)": 32.21,
|
|
"step": 120,
|
|
"token_acc": 0.8671916991890818,
|
|
"train_speed(iter/s)": 0.121292
|
|
},
|
|
{
|
|
"epoch": 0.147431467403824,
|
|
"eval_loss": 0.39595848321914673,
|
|
"eval_runtime": 31.0135,
|
|
"eval_samples_per_second": 16.96,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.877976653696498,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.15357444521231667,
|
|
"grad_norm": 0.7716063857078552,
|
|
"learning_rate": 9.935330758308706e-06,
|
|
"loss": 0.38781228065490725,
|
|
"memory(GiB)": 32.21,
|
|
"step": 125,
|
|
"token_acc": 0.8762705679981929,
|
|
"train_speed(iter/s)": 0.1175
|
|
},
|
|
{
|
|
"epoch": 0.15971742302080932,
|
|
"grad_norm": 0.7710253000259399,
|
|
"learning_rate": 9.93006608283054e-06,
|
|
"loss": 0.3876336574554443,
|
|
"memory(GiB)": 32.21,
|
|
"step": 130,
|
|
"token_acc": 0.8821086956521739,
|
|
"train_speed(iter/s)": 0.118454
|
|
},
|
|
{
|
|
"epoch": 0.165860400829302,
|
|
"grad_norm": 0.7821493744850159,
|
|
"learning_rate": 9.924596919556917e-06,
|
|
"loss": 0.40181121826171873,
|
|
"memory(GiB)": 32.21,
|
|
"step": 135,
|
|
"token_acc": 0.8626237623762376,
|
|
"train_speed(iter/s)": 0.119818
|
|
},
|
|
{
|
|
"epoch": 0.17200337863779466,
|
|
"grad_norm": 0.8226854205131531,
|
|
"learning_rate": 9.918923495336138e-06,
|
|
"loss": 0.39958484172821046,
|
|
"memory(GiB)": 32.21,
|
|
"step": 140,
|
|
"token_acc": 0.8556235746008882,
|
|
"train_speed(iter/s)": 0.120946
|
|
},
|
|
{
|
|
"epoch": 0.17200337863779466,
|
|
"eval_loss": 0.393728107213974,
|
|
"eval_runtime": 31.0073,
|
|
"eval_samples_per_second": 16.964,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8784504971897968,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.17814635644628735,
|
|
"grad_norm": 0.7877047061920166,
|
|
"learning_rate": 9.913046045488787e-06,
|
|
"loss": 0.38108556270599364,
|
|
"memory(GiB)": 34.13,
|
|
"step": 145,
|
|
"token_acc": 0.8813046265713381,
|
|
"train_speed(iter/s)": 0.11771
|
|
},
|
|
{
|
|
"epoch": 0.18428933425478,
|
|
"grad_norm": 0.7512264251708984,
|
|
"learning_rate": 9.906964813797955e-06,
|
|
"loss": 0.3876554250717163,
|
|
"memory(GiB)": 34.13,
|
|
"step": 150,
|
|
"token_acc": 0.881988944871105,
|
|
"train_speed(iter/s)": 0.118688
|
|
},
|
|
{
|
|
"epoch": 0.19043231206327269,
|
|
"grad_norm": 0.7701375484466553,
|
|
"learning_rate": 9.900680052499138e-06,
|
|
"loss": 0.38112673759460447,
|
|
"memory(GiB)": 34.13,
|
|
"step": 155,
|
|
"token_acc": 0.8716818566661686,
|
|
"train_speed(iter/s)": 0.119662
|
|
},
|
|
{
|
|
"epoch": 0.19657528987176534,
|
|
"grad_norm": 0.7622193098068237,
|
|
"learning_rate": 9.894192022269773e-06,
|
|
"loss": 0.3982468843460083,
|
|
"memory(GiB)": 34.13,
|
|
"step": 160,
|
|
"token_acc": 0.8648266919817547,
|
|
"train_speed(iter/s)": 0.120545
|
|
},
|
|
{
|
|
"epoch": 0.19657528987176534,
|
|
"eval_loss": 0.39097315073013306,
|
|
"eval_runtime": 31.0177,
|
|
"eval_samples_per_second": 16.958,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8786718547341116,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.202718267680258,
|
|
"grad_norm": 0.7747094035148621,
|
|
"learning_rate": 9.887500992218421e-06,
|
|
"loss": 0.3932340621948242,
|
|
"memory(GiB)": 34.13,
|
|
"step": 165,
|
|
"token_acc": 0.8735516505058284,
|
|
"train_speed(iter/s)": 0.117866
|
|
},
|
|
{
|
|
"epoch": 0.20886124548875068,
|
|
"grad_norm": 0.7225446701049805,
|
|
"learning_rate": 9.880607239873614e-06,
|
|
"loss": 0.3682489633560181,
|
|
"memory(GiB)": 34.13,
|
|
"step": 170,
|
|
"token_acc": 0.8780595564195458,
|
|
"train_speed(iter/s)": 0.118651
|
|
},
|
|
{
|
|
"epoch": 0.21500422329724334,
|
|
"grad_norm": 0.7513542771339417,
|
|
"learning_rate": 9.873511051172331e-06,
|
|
"loss": 0.37564697265625,
|
|
"memory(GiB)": 34.13,
|
|
"step": 175,
|
|
"token_acc": 0.8798945693728777,
|
|
"train_speed(iter/s)": 0.119494
|
|
},
|
|
{
|
|
"epoch": 0.22114720110573602,
|
|
"grad_norm": 0.7389309406280518,
|
|
"learning_rate": 9.866212720448149e-06,
|
|
"loss": 0.3957530498504639,
|
|
"memory(GiB)": 34.13,
|
|
"step": 180,
|
|
"token_acc": 0.8693595046908373,
|
|
"train_speed(iter/s)": 0.120172
|
|
},
|
|
{
|
|
"epoch": 0.22114720110573602,
|
|
"eval_loss": 0.3888963460922241,
|
|
"eval_runtime": 31.0263,
|
|
"eval_samples_per_second": 16.953,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8793705144833549,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.22729017891422867,
|
|
"grad_norm": 0.8499953746795654,
|
|
"learning_rate": 9.85871255041903e-06,
|
|
"loss": 0.39398903846740724,
|
|
"memory(GiB)": 34.13,
|
|
"step": 185,
|
|
"token_acc": 0.8721618431945888,
|
|
"train_speed(iter/s)": 0.11765
|
|
},
|
|
{
|
|
"epoch": 0.23343315672272133,
|
|
"grad_norm": 0.7052657008171082,
|
|
"learning_rate": 9.85101085217477e-06,
|
|
"loss": 0.3804319381713867,
|
|
"memory(GiB)": 34.13,
|
|
"step": 190,
|
|
"token_acc": 0.8779494871039452,
|
|
"train_speed(iter/s)": 0.118504
|
|
},
|
|
{
|
|
"epoch": 0.239576134531214,
|
|
"grad_norm": 0.8443171977996826,
|
|
"learning_rate": 9.843107945164086e-06,
|
|
"loss": 0.3854555606842041,
|
|
"memory(GiB)": 34.13,
|
|
"step": 195,
|
|
"token_acc": 0.8738016136687233,
|
|
"train_speed(iter/s)": 0.119158
|
|
},
|
|
{
|
|
"epoch": 0.24571911233970667,
|
|
"grad_norm": 0.7444053292274475,
|
|
"learning_rate": 9.835004157181372e-06,
|
|
"loss": 0.3835892677307129,
|
|
"memory(GiB)": 34.13,
|
|
"step": 200,
|
|
"token_acc": 0.8789022648439094,
|
|
"train_speed(iter/s)": 0.119936
|
|
},
|
|
{
|
|
"epoch": 0.24571911233970667,
|
|
"eval_loss": 0.38609230518341064,
|
|
"eval_runtime": 30.9745,
|
|
"eval_samples_per_second": 16.982,
|
|
"eval_steps_per_second": 4.262,
|
|
"eval_token_acc": 0.880086467790748,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.2518620901481993,
|
|
"grad_norm": 0.7656287550926208,
|
|
"learning_rate": 9.826699824353106e-06,
|
|
"loss": 0.3835402488708496,
|
|
"memory(GiB)": 34.13,
|
|
"step": 205,
|
|
"token_acc": 0.8772318628475851,
|
|
"train_speed(iter/s)": 0.117635
|
|
},
|
|
{
|
|
"epoch": 0.258005067956692,
|
|
"grad_norm": 0.7985251545906067,
|
|
"learning_rate": 9.818195291123903e-06,
|
|
"loss": 0.37469916343688964,
|
|
"memory(GiB)": 36.59,
|
|
"step": 210,
|
|
"token_acc": 0.8918794474675596,
|
|
"train_speed(iter/s)": 0.11841
|
|
},
|
|
{
|
|
"epoch": 0.2641480457651847,
|
|
"grad_norm": 0.7901045680046082,
|
|
"learning_rate": 9.80949091024223e-06,
|
|
"loss": 0.39004669189453123,
|
|
"memory(GiB)": 36.59,
|
|
"step": 215,
|
|
"token_acc": 0.8694972278822917,
|
|
"train_speed(iter/s)": 0.119102
|
|
},
|
|
{
|
|
"epoch": 0.27029102357367735,
|
|
"grad_norm": 0.7759472727775574,
|
|
"learning_rate": 9.800587042745774e-06,
|
|
"loss": 0.37646257877349854,
|
|
"memory(GiB)": 36.59,
|
|
"step": 220,
|
|
"token_acc": 0.8768733180258252,
|
|
"train_speed(iter/s)": 0.119681
|
|
},
|
|
{
|
|
"epoch": 0.27029102357367735,
|
|
"eval_loss": 0.38418954610824585,
|
|
"eval_runtime": 30.9751,
|
|
"eval_samples_per_second": 16.981,
|
|
"eval_steps_per_second": 4.261,
|
|
"eval_token_acc": 0.8808370082144401,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.27643400138217,
|
|
"grad_norm": 0.7889726161956787,
|
|
"learning_rate": 9.791484057946465e-06,
|
|
"loss": 0.3830937385559082,
|
|
"memory(GiB)": 36.59,
|
|
"step": 225,
|
|
"token_acc": 0.8788355828537511,
|
|
"train_speed(iter/s)": 0.117815
|
|
},
|
|
{
|
|
"epoch": 0.28257697919066266,
|
|
"grad_norm": 0.8053146004676819,
|
|
"learning_rate": 9.782182333415168e-06,
|
|
"loss": 0.40045747756958006,
|
|
"memory(GiB)": 36.59,
|
|
"step": 230,
|
|
"token_acc": 0.8767751952143934,
|
|
"train_speed(iter/s)": 0.118387
|
|
},
|
|
{
|
|
"epoch": 0.2887199569991553,
|
|
"grad_norm": 0.7342280745506287,
|
|
"learning_rate": 9.772682254966009e-06,
|
|
"loss": 0.39071879386901853,
|
|
"memory(GiB)": 36.59,
|
|
"step": 235,
|
|
"token_acc": 0.8698379998127166,
|
|
"train_speed(iter/s)": 0.119097
|
|
},
|
|
{
|
|
"epoch": 0.294862934807648,
|
|
"grad_norm": 0.7769783139228821,
|
|
"learning_rate": 9.762984216640378e-06,
|
|
"loss": 0.38714871406555174,
|
|
"memory(GiB)": 36.59,
|
|
"step": 240,
|
|
"token_acc": 0.8766444973056945,
|
|
"train_speed(iter/s)": 0.119737
|
|
},
|
|
{
|
|
"epoch": 0.294862934807648,
|
|
"eval_loss": 0.38342124223709106,
|
|
"eval_runtime": 30.9908,
|
|
"eval_samples_per_second": 16.973,
|
|
"eval_steps_per_second": 4.259,
|
|
"eval_token_acc": 0.8811033290099438,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.3010059126161407,
|
|
"grad_norm": 0.7775170803070068,
|
|
"learning_rate": 9.753088620690589e-06,
|
|
"loss": 0.36563289165496826,
|
|
"memory(GiB)": 36.59,
|
|
"step": 245,
|
|
"token_acc": 0.8821107213664786,
|
|
"train_speed(iter/s)": 0.117883
|
|
},
|
|
{
|
|
"epoch": 0.30714889042463334,
|
|
"grad_norm": 0.7627344131469727,
|
|
"learning_rate": 9.742995877563187e-06,
|
|
"loss": 0.3691666841506958,
|
|
"memory(GiB)": 36.59,
|
|
"step": 250,
|
|
"token_acc": 0.8684178043301157,
|
|
"train_speed(iter/s)": 0.11847
|
|
},
|
|
{
|
|
"epoch": 0.313291868233126,
|
|
"grad_norm": 0.730969250202179,
|
|
"learning_rate": 9.732706405881931e-06,
|
|
"loss": 0.37671756744384766,
|
|
"memory(GiB)": 36.59,
|
|
"step": 255,
|
|
"token_acc": 0.8784978880675819,
|
|
"train_speed(iter/s)": 0.118913
|
|
},
|
|
{
|
|
"epoch": 0.31943484604161865,
|
|
"grad_norm": 0.7510061860084534,
|
|
"learning_rate": 9.722220632430428e-06,
|
|
"loss": 0.36403095722198486,
|
|
"memory(GiB)": 36.59,
|
|
"step": 260,
|
|
"token_acc": 0.884961560097506,
|
|
"train_speed(iter/s)": 0.1194
|
|
},
|
|
{
|
|
"epoch": 0.31943484604161865,
|
|
"eval_loss": 0.3818422555923462,
|
|
"eval_runtime": 30.9337,
|
|
"eval_samples_per_second": 17.004,
|
|
"eval_steps_per_second": 4.267,
|
|
"eval_token_acc": 0.8810894941634241,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.32557782385011136,
|
|
"grad_norm": 0.6700690984725952,
|
|
"learning_rate": 9.711538992134427e-06,
|
|
"loss": 0.37852253913879397,
|
|
"memory(GiB)": 36.59,
|
|
"step": 265,
|
|
"token_acc": 0.8780975219824141,
|
|
"train_speed(iter/s)": 0.117682
|
|
},
|
|
{
|
|
"epoch": 0.331720801658604,
|
|
"grad_norm": 0.7542963624000549,
|
|
"learning_rate": 9.700661928043787e-06,
|
|
"loss": 0.3520061016082764,
|
|
"memory(GiB)": 36.59,
|
|
"step": 270,
|
|
"token_acc": 0.8765217391304347,
|
|
"train_speed(iter/s)": 0.118172
|
|
},
|
|
{
|
|
"epoch": 0.33786377946709667,
|
|
"grad_norm": 0.6696748733520508,
|
|
"learning_rate": 9.689589891314094e-06,
|
|
"loss": 0.3755272150039673,
|
|
"memory(GiB)": 36.59,
|
|
"step": 275,
|
|
"token_acc": 0.8727695145026466,
|
|
"train_speed(iter/s)": 0.118608
|
|
},
|
|
{
|
|
"epoch": 0.3440067572755893,
|
|
"grad_norm": 0.7883334159851074,
|
|
"learning_rate": 9.678323341187956e-06,
|
|
"loss": 0.376280689239502,
|
|
"memory(GiB)": 36.59,
|
|
"step": 280,
|
|
"token_acc": 0.8781244037397443,
|
|
"train_speed(iter/s)": 0.119045
|
|
},
|
|
{
|
|
"epoch": 0.3440067572755893,
|
|
"eval_loss": 0.380220502614975,
|
|
"eval_runtime": 30.9631,
|
|
"eval_samples_per_second": 16.988,
|
|
"eval_steps_per_second": 4.263,
|
|
"eval_token_acc": 0.8814526588845655,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.350149735084082,
|
|
"grad_norm": 0.7125808596611023,
|
|
"learning_rate": 9.666862744975938e-06,
|
|
"loss": 0.3811634063720703,
|
|
"memory(GiB)": 36.59,
|
|
"step": 285,
|
|
"token_acc": 0.881547675634566,
|
|
"train_speed(iter/s)": 0.117616
|
|
},
|
|
{
|
|
"epoch": 0.3562927128925747,
|
|
"grad_norm": 0.7022562623023987,
|
|
"learning_rate": 9.655208578037198e-06,
|
|
"loss": 0.36770806312561033,
|
|
"memory(GiB)": 36.59,
|
|
"step": 290,
|
|
"token_acc": 0.8775136241403108,
|
|
"train_speed(iter/s)": 0.118155
|
|
},
|
|
{
|
|
"epoch": 0.36243569070106735,
|
|
"grad_norm": 0.7109845280647278,
|
|
"learning_rate": 9.643361323759763e-06,
|
|
"loss": 0.36910414695739746,
|
|
"memory(GiB)": 36.59,
|
|
"step": 295,
|
|
"token_acc": 0.8801465983159751,
|
|
"train_speed(iter/s)": 0.118621
|
|
},
|
|
{
|
|
"epoch": 0.36857866850956,
|
|
"grad_norm": 0.7310053706169128,
|
|
"learning_rate": 9.631321473540476e-06,
|
|
"loss": 0.36344945430755615,
|
|
"memory(GiB)": 36.59,
|
|
"step": 300,
|
|
"token_acc": 0.8726629026286561,
|
|
"train_speed(iter/s)": 0.119086
|
|
},
|
|
{
|
|
"epoch": 0.36857866850956,
|
|
"eval_loss": 0.3780768811702728,
|
|
"eval_runtime": 31.0728,
|
|
"eval_samples_per_second": 16.928,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8825006485084306,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.37472164631805266,
|
|
"grad_norm": 0.7264479994773865,
|
|
"learning_rate": 9.619089526764614e-06,
|
|
"loss": 0.380098819732666,
|
|
"memory(GiB)": 36.59,
|
|
"step": 305,
|
|
"token_acc": 0.8804112554112554,
|
|
"train_speed(iter/s)": 0.11773
|
|
},
|
|
{
|
|
"epoch": 0.38086462412654537,
|
|
"grad_norm": 0.8007322549819946,
|
|
"learning_rate": 9.60666599078518e-06,
|
|
"loss": 0.3628620862960815,
|
|
"memory(GiB)": 36.59,
|
|
"step": 310,
|
|
"token_acc": 0.8855827918881669,
|
|
"train_speed(iter/s)": 0.118139
|
|
},
|
|
{
|
|
"epoch": 0.387007601935038,
|
|
"grad_norm": 0.730522871017456,
|
|
"learning_rate": 9.59405138090186e-06,
|
|
"loss": 0.36655001640319823,
|
|
"memory(GiB)": 36.59,
|
|
"step": 315,
|
|
"token_acc": 0.8823326091250246,
|
|
"train_speed(iter/s)": 0.118659
|
|
},
|
|
{
|
|
"epoch": 0.3931505797435307,
|
|
"grad_norm": 0.7646607756614685,
|
|
"learning_rate": 9.581246220339636e-06,
|
|
"loss": 0.35800130367279054,
|
|
"memory(GiB)": 36.59,
|
|
"step": 320,
|
|
"token_acc": 0.8788769866274592,
|
|
"train_speed(iter/s)": 0.119038
|
|
},
|
|
{
|
|
"epoch": 0.3931505797435307,
|
|
"eval_loss": 0.37690821290016174,
|
|
"eval_runtime": 31.0274,
|
|
"eval_samples_per_second": 16.953,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8822689148292261,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.39929355755202334,
|
|
"grad_norm": 0.7562268972396851,
|
|
"learning_rate": 9.568251040227101e-06,
|
|
"loss": 0.384972071647644,
|
|
"memory(GiB)": 36.59,
|
|
"step": 325,
|
|
"token_acc": 0.8815015713117225,
|
|
"train_speed(iter/s)": 0.117697
|
|
},
|
|
{
|
|
"epoch": 0.405436535360516,
|
|
"grad_norm": 0.7428621053695679,
|
|
"learning_rate": 9.555066379574423e-06,
|
|
"loss": 0.3597818613052368,
|
|
"memory(GiB)": 36.59,
|
|
"step": 330,
|
|
"token_acc": 0.889793055068397,
|
|
"train_speed(iter/s)": 0.118163
|
|
},
|
|
{
|
|
"epoch": 0.4115795131690087,
|
|
"grad_norm": 0.7479391098022461,
|
|
"learning_rate": 9.541692785250983e-06,
|
|
"loss": 0.3805227279663086,
|
|
"memory(GiB)": 36.59,
|
|
"step": 335,
|
|
"token_acc": 0.8907455632716049,
|
|
"train_speed(iter/s)": 0.118498
|
|
},
|
|
{
|
|
"epoch": 0.41772249097750136,
|
|
"grad_norm": 0.6682092547416687,
|
|
"learning_rate": 9.528130811962693e-06,
|
|
"loss": 0.37683632373809817,
|
|
"memory(GiB)": 36.59,
|
|
"step": 340,
|
|
"token_acc": 0.8722201102452005,
|
|
"train_speed(iter/s)": 0.118896
|
|
},
|
|
{
|
|
"epoch": 0.41772249097750136,
|
|
"eval_loss": 0.3755421042442322,
|
|
"eval_runtime": 31.0414,
|
|
"eval_samples_per_second": 16.945,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.88294336359706,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.423865468785994,
|
|
"grad_norm": 0.7604427933692932,
|
|
"learning_rate": 9.514381022228997e-06,
|
|
"loss": 0.36464648246765136,
|
|
"memory(GiB)": 36.59,
|
|
"step": 345,
|
|
"token_acc": 0.8840508026994174,
|
|
"train_speed(iter/s)": 0.117605
|
|
},
|
|
{
|
|
"epoch": 0.43000844659448667,
|
|
"grad_norm": 0.7618926763534546,
|
|
"learning_rate": 9.50044398635953e-06,
|
|
"loss": 0.37844386100769045,
|
|
"memory(GiB)": 36.59,
|
|
"step": 350,
|
|
"token_acc": 0.8788168373151308,
|
|
"train_speed(iter/s)": 0.117953
|
|
},
|
|
{
|
|
"epoch": 0.4361514244029793,
|
|
"grad_norm": 0.6848899126052856,
|
|
"learning_rate": 9.486320282430469e-06,
|
|
"loss": 0.3681621551513672,
|
|
"memory(GiB)": 36.59,
|
|
"step": 355,
|
|
"token_acc": 0.8739398701268689,
|
|
"train_speed(iter/s)": 0.11841
|
|
},
|
|
{
|
|
"epoch": 0.44229440221147204,
|
|
"grad_norm": 0.7334110140800476,
|
|
"learning_rate": 9.472010496260545e-06,
|
|
"loss": 0.3769216060638428,
|
|
"memory(GiB)": 36.59,
|
|
"step": 360,
|
|
"token_acc": 0.8754503693028283,
|
|
"train_speed(iter/s)": 0.118855
|
|
},
|
|
{
|
|
"epoch": 0.44229440221147204,
|
|
"eval_loss": 0.3748551905155182,
|
|
"eval_runtime": 31.0629,
|
|
"eval_samples_per_second": 16.933,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8829156939040208,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.4484373800199647,
|
|
"grad_norm": 0.6733468770980835,
|
|
"learning_rate": 9.45751522138676e-06,
|
|
"loss": 0.3699374198913574,
|
|
"memory(GiB)": 36.59,
|
|
"step": 365,
|
|
"token_acc": 0.8818029853755239,
|
|
"train_speed(iter/s)": 0.117632
|
|
},
|
|
{
|
|
"epoch": 0.45458035782845735,
|
|
"grad_norm": 0.6975194811820984,
|
|
"learning_rate": 9.44283505903976e-06,
|
|
"loss": 0.3686963081359863,
|
|
"memory(GiB)": 36.59,
|
|
"step": 370,
|
|
"token_acc": 0.8794543496470025,
|
|
"train_speed(iter/s)": 0.118021
|
|
},
|
|
{
|
|
"epoch": 0.46072333563695,
|
|
"grad_norm": 0.7434240579605103,
|
|
"learning_rate": 9.427970618118888e-06,
|
|
"loss": 0.38825435638427735,
|
|
"memory(GiB)": 36.59,
|
|
"step": 375,
|
|
"token_acc": 0.875475461545598,
|
|
"train_speed(iter/s)": 0.118433
|
|
},
|
|
{
|
|
"epoch": 0.46686631344544266,
|
|
"grad_norm": 0.7431550621986389,
|
|
"learning_rate": 9.412922515166952e-06,
|
|
"loss": 0.36851983070373534,
|
|
"memory(GiB)": 36.59,
|
|
"step": 380,
|
|
"token_acc": 0.8677917508307813,
|
|
"train_speed(iter/s)": 0.118763
|
|
},
|
|
{
|
|
"epoch": 0.46686631344544266,
|
|
"eval_loss": 0.37393027544021606,
|
|
"eval_runtime": 31.0461,
|
|
"eval_samples_per_second": 16.943,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8833099870298314,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.47300929125393537,
|
|
"grad_norm": 0.7830028533935547,
|
|
"learning_rate": 9.39769137434463e-06,
|
|
"loss": 0.39449851512908934,
|
|
"memory(GiB)": 36.59,
|
|
"step": 385,
|
|
"token_acc": 0.8813338029015882,
|
|
"train_speed(iter/s)": 0.117631
|
|
},
|
|
{
|
|
"epoch": 0.479152269062428,
|
|
"grad_norm": 0.7592146992683411,
|
|
"learning_rate": 9.38227782740459e-06,
|
|
"loss": 0.3797061681747437,
|
|
"memory(GiB)": 36.59,
|
|
"step": 390,
|
|
"token_acc": 0.8725708251892791,
|
|
"train_speed(iter/s)": 0.118076
|
|
},
|
|
{
|
|
"epoch": 0.4852952468709207,
|
|
"grad_norm": 0.7044036388397217,
|
|
"learning_rate": 9.366682513665293e-06,
|
|
"loss": 0.34874444007873534,
|
|
"memory(GiB)": 36.59,
|
|
"step": 395,
|
|
"token_acc": 0.8924136680866491,
|
|
"train_speed(iter/s)": 0.118392
|
|
},
|
|
{
|
|
"epoch": 0.49143822467941334,
|
|
"grad_norm": 0.7327947020530701,
|
|
"learning_rate": 9.350906079984456e-06,
|
|
"loss": 0.3913299322128296,
|
|
"memory(GiB)": 36.59,
|
|
"step": 400,
|
|
"token_acc": 0.8793465520609494,
|
|
"train_speed(iter/s)": 0.118741
|
|
},
|
|
{
|
|
"epoch": 0.49143822467941334,
|
|
"eval_loss": 0.37309640645980835,
|
|
"eval_runtime": 31.0239,
|
|
"eval_samples_per_second": 16.955,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8834448767833982,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.497581202487906,
|
|
"grad_norm": 0.714108407497406,
|
|
"learning_rate": 9.334949180732245e-06,
|
|
"loss": 0.3835240364074707,
|
|
"memory(GiB)": 36.59,
|
|
"step": 405,
|
|
"token_acc": 0.8806726886733547,
|
|
"train_speed(iter/s)": 0.117711
|
|
},
|
|
{
|
|
"epoch": 0.5037241802963986,
|
|
"grad_norm": 0.6842460632324219,
|
|
"learning_rate": 9.31881247776412e-06,
|
|
"loss": 0.34242706298828124,
|
|
"memory(GiB)": 36.59,
|
|
"step": 410,
|
|
"token_acc": 0.8898083315651744,
|
|
"train_speed(iter/s)": 0.118116
|
|
},
|
|
{
|
|
"epoch": 0.5098671581048914,
|
|
"grad_norm": 0.7109769582748413,
|
|
"learning_rate": 9.302496640393383e-06,
|
|
"loss": 0.3699876546859741,
|
|
"memory(GiB)": 36.59,
|
|
"step": 415,
|
|
"token_acc": 0.8834916327453641,
|
|
"train_speed(iter/s)": 0.118429
|
|
},
|
|
{
|
|
"epoch": 0.516010135913384,
|
|
"grad_norm": 0.72795569896698,
|
|
"learning_rate": 9.286002345363418e-06,
|
|
"loss": 0.36434710025787354,
|
|
"memory(GiB)": 36.59,
|
|
"step": 420,
|
|
"token_acc": 0.8838608737513539,
|
|
"train_speed(iter/s)": 0.118728
|
|
},
|
|
{
|
|
"epoch": 0.516010135913384,
|
|
"eval_loss": 0.37135639786720276,
|
|
"eval_runtime": 31.0313,
|
|
"eval_samples_per_second": 16.951,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8840812797233031,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.5221531137218767,
|
|
"grad_norm": 0.6852810382843018,
|
|
"learning_rate": 9.26933027681963e-06,
|
|
"loss": 0.371048641204834,
|
|
"memory(GiB)": 36.59,
|
|
"step": 425,
|
|
"token_acc": 0.8814758591608687,
|
|
"train_speed(iter/s)": 0.117721
|
|
},
|
|
{
|
|
"epoch": 0.5282960915303694,
|
|
"grad_norm": 0.7316782474517822,
|
|
"learning_rate": 9.25248112628105e-06,
|
|
"loss": 0.3735438346862793,
|
|
"memory(GiB)": 36.59,
|
|
"step": 430,
|
|
"token_acc": 0.8826662287081789,
|
|
"train_speed(iter/s)": 0.117992
|
|
},
|
|
{
|
|
"epoch": 0.534439069338862,
|
|
"grad_norm": 0.7115728259086609,
|
|
"learning_rate": 9.235455592611667e-06,
|
|
"loss": 0.360302734375,
|
|
"memory(GiB)": 36.59,
|
|
"step": 435,
|
|
"token_acc": 0.8884788847888478,
|
|
"train_speed(iter/s)": 0.118327
|
|
},
|
|
{
|
|
"epoch": 0.5405820471473547,
|
|
"grad_norm": 0.6213703155517578,
|
|
"learning_rate": 9.218254381991438e-06,
|
|
"loss": 0.363602352142334,
|
|
"memory(GiB)": 36.59,
|
|
"step": 440,
|
|
"token_acc": 0.8796412181894034,
|
|
"train_speed(iter/s)": 0.118669
|
|
},
|
|
{
|
|
"epoch": 0.5405820471473547,
|
|
"eval_loss": 0.37061288952827454,
|
|
"eval_runtime": 31.0004,
|
|
"eval_samples_per_second": 16.968,
|
|
"eval_steps_per_second": 4.258,
|
|
"eval_token_acc": 0.8841919584954604,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.5467250249558473,
|
|
"grad_norm": 0.6319580674171448,
|
|
"learning_rate": 9.200878207886995e-06,
|
|
"loss": 0.36367177963256836,
|
|
"memory(GiB)": 36.59,
|
|
"step": 445,
|
|
"token_acc": 0.880615405975304,
|
|
"train_speed(iter/s)": 0.11768
|
|
},
|
|
{
|
|
"epoch": 0.55286800276434,
|
|
"grad_norm": 0.7951823472976685,
|
|
"learning_rate": 9.183327791022048e-06,
|
|
"loss": 0.37214341163635256,
|
|
"memory(GiB)": 36.59,
|
|
"step": 450,
|
|
"token_acc": 0.88060522696011,
|
|
"train_speed(iter/s)": 0.118044
|
|
},
|
|
{
|
|
"epoch": 0.5590109805728327,
|
|
"grad_norm": 0.7379077076911926,
|
|
"learning_rate": 9.165603859347503e-06,
|
|
"loss": 0.3636307716369629,
|
|
"memory(GiB)": 36.59,
|
|
"step": 455,
|
|
"token_acc": 0.8860057913311012,
|
|
"train_speed(iter/s)": 0.118377
|
|
},
|
|
{
|
|
"epoch": 0.5651539583813253,
|
|
"grad_norm": 0.6838334798812866,
|
|
"learning_rate": 9.147707148011255e-06,
|
|
"loss": 0.36699528694152833,
|
|
"memory(GiB)": 36.59,
|
|
"step": 460,
|
|
"token_acc": 0.8731886687471273,
|
|
"train_speed(iter/s)": 0.118711
|
|
},
|
|
{
|
|
"epoch": 0.5651539583813253,
|
|
"eval_loss": 0.3706679344177246,
|
|
"eval_runtime": 31.0119,
|
|
"eval_samples_per_second": 16.961,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.883956766104626,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.571296936189818,
|
|
"grad_norm": 0.7097205519676208,
|
|
"learning_rate": 9.129638399327707e-06,
|
|
"loss": 0.3835044622421265,
|
|
"memory(GiB)": 36.59,
|
|
"step": 465,
|
|
"token_acc": 0.8826179212466955,
|
|
"train_speed(iter/s)": 0.117847
|
|
},
|
|
{
|
|
"epoch": 0.5774399139983106,
|
|
"grad_norm": 0.7685003876686096,
|
|
"learning_rate": 9.111398362746969e-06,
|
|
"loss": 0.34739508628845217,
|
|
"memory(GiB)": 36.59,
|
|
"step": 470,
|
|
"token_acc": 0.8856424192063242,
|
|
"train_speed(iter/s)": 0.118092
|
|
},
|
|
{
|
|
"epoch": 0.5835828918068033,
|
|
"grad_norm": 0.684012234210968,
|
|
"learning_rate": 9.092987794823785e-06,
|
|
"loss": 0.35583484172821045,
|
|
"memory(GiB)": 36.59,
|
|
"step": 475,
|
|
"token_acc": 0.8870865428183053,
|
|
"train_speed(iter/s)": 0.118389
|
|
},
|
|
{
|
|
"epoch": 0.589725869615296,
|
|
"grad_norm": 0.7555577158927917,
|
|
"learning_rate": 9.074407459186144e-06,
|
|
"loss": 0.3742217540740967,
|
|
"memory(GiB)": 36.59,
|
|
"step": 480,
|
|
"token_acc": 0.8749733708902407,
|
|
"train_speed(iter/s)": 0.118715
|
|
},
|
|
{
|
|
"epoch": 0.589725869615296,
|
|
"eval_loss": 0.3698117733001709,
|
|
"eval_runtime": 31.0843,
|
|
"eval_samples_per_second": 16.922,
|
|
"eval_steps_per_second": 4.247,
|
|
"eval_token_acc": 0.8841297016861219,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.5958688474237887,
|
|
"grad_norm": 0.7176510095596313,
|
|
"learning_rate": 9.055658126503605e-06,
|
|
"loss": 0.35448513031005857,
|
|
"memory(GiB)": 36.59,
|
|
"step": 485,
|
|
"token_acc": 0.882393450149208,
|
|
"train_speed(iter/s)": 0.117835
|
|
},
|
|
{
|
|
"epoch": 0.6020118252322814,
|
|
"grad_norm": 0.7371023297309875,
|
|
"learning_rate": 9.036740574455345e-06,
|
|
"loss": 0.35907247066497805,
|
|
"memory(GiB)": 36.59,
|
|
"step": 490,
|
|
"token_acc": 0.8887174366887537,
|
|
"train_speed(iter/s)": 0.118083
|
|
},
|
|
{
|
|
"epoch": 0.608154803040774,
|
|
"grad_norm": 0.6593868136405945,
|
|
"learning_rate": 9.017655587697885e-06,
|
|
"loss": 0.36144974231719973,
|
|
"memory(GiB)": 36.59,
|
|
"step": 495,
|
|
"token_acc": 0.8897585166019836,
|
|
"train_speed(iter/s)": 0.118377
|
|
},
|
|
{
|
|
"epoch": 0.6142977808492667,
|
|
"grad_norm": 0.7346932291984558,
|
|
"learning_rate": 8.998403957832553e-06,
|
|
"loss": 0.35957746505737304,
|
|
"memory(GiB)": 36.59,
|
|
"step": 500,
|
|
"token_acc": 0.8936918488180564,
|
|
"train_speed(iter/s)": 0.118644
|
|
},
|
|
{
|
|
"epoch": 0.6142977808492667,
|
|
"eval_loss": 0.36875346302986145,
|
|
"eval_runtime": 30.9564,
|
|
"eval_samples_per_second": 16.992,
|
|
"eval_steps_per_second": 4.264,
|
|
"eval_token_acc": 0.8846000864677908,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.6204407586577594,
|
|
"grad_norm": 0.6690182089805603,
|
|
"learning_rate": 8.978986483372657e-06,
|
|
"loss": 0.36060357093811035,
|
|
"memory(GiB)": 36.59,
|
|
"step": 505,
|
|
"token_acc": 0.8834197325817438,
|
|
"train_speed(iter/s)": 0.117782
|
|
},
|
|
{
|
|
"epoch": 0.626583736466252,
|
|
"grad_norm": 0.6996055245399475,
|
|
"learning_rate": 8.959403969710346e-06,
|
|
"loss": 0.35636866092681885,
|
|
"memory(GiB)": 36.59,
|
|
"step": 510,
|
|
"token_acc": 0.8747046644744855,
|
|
"train_speed(iter/s)": 0.118099
|
|
},
|
|
{
|
|
"epoch": 0.6327267142747447,
|
|
"grad_norm": 0.7242439985275269,
|
|
"learning_rate": 8.939657229083223e-06,
|
|
"loss": 0.362790584564209,
|
|
"memory(GiB)": 36.59,
|
|
"step": 515,
|
|
"token_acc": 0.8795643179382369,
|
|
"train_speed(iter/s)": 0.11841
|
|
},
|
|
{
|
|
"epoch": 0.6388696920832373,
|
|
"grad_norm": 0.7438492178916931,
|
|
"learning_rate": 8.919747080540647e-06,
|
|
"loss": 0.36803131103515624,
|
|
"memory(GiB)": 36.59,
|
|
"step": 520,
|
|
"token_acc": 0.8868724794882492,
|
|
"train_speed(iter/s)": 0.118724
|
|
},
|
|
{
|
|
"epoch": 0.6388696920832373,
|
|
"eval_loss": 0.3668961226940155,
|
|
"eval_runtime": 31.0395,
|
|
"eval_samples_per_second": 16.946,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8850704712494596,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.64501266989173,
|
|
"grad_norm": 0.7443042993545532,
|
|
"learning_rate": 8.899674349909759e-06,
|
|
"loss": 0.3723003387451172,
|
|
"memory(GiB)": 36.59,
|
|
"step": 525,
|
|
"token_acc": 0.8819261436583553,
|
|
"train_speed(iter/s)": 0.117947
|
|
},
|
|
{
|
|
"epoch": 0.6511556477002227,
|
|
"grad_norm": 0.7717169523239136,
|
|
"learning_rate": 8.879439869761233e-06,
|
|
"loss": 0.37207541465759275,
|
|
"memory(GiB)": 36.59,
|
|
"step": 530,
|
|
"token_acc": 0.8742153725911633,
|
|
"train_speed(iter/s)": 0.118275
|
|
},
|
|
{
|
|
"epoch": 0.6572986255087153,
|
|
"grad_norm": 0.7282743453979492,
|
|
"learning_rate": 8.859044479374737e-06,
|
|
"loss": 0.3790937900543213,
|
|
"memory(GiB)": 36.59,
|
|
"step": 535,
|
|
"token_acc": 0.8727581424267062,
|
|
"train_speed(iter/s)": 0.118594
|
|
},
|
|
{
|
|
"epoch": 0.663441603317208,
|
|
"grad_norm": 0.7114688158035278,
|
|
"learning_rate": 8.838489024704131e-06,
|
|
"loss": 0.3806754112243652,
|
|
"memory(GiB)": 36.59,
|
|
"step": 540,
|
|
"token_acc": 0.8710419328609594,
|
|
"train_speed(iter/s)": 0.1188
|
|
},
|
|
{
|
|
"epoch": 0.663441603317208,
|
|
"eval_loss": 0.3665069043636322,
|
|
"eval_runtime": 31.0202,
|
|
"eval_samples_per_second": 16.957,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8851638564634674,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6695845811257006,
|
|
"grad_norm": 0.7657713890075684,
|
|
"learning_rate": 8.817774358342367e-06,
|
|
"loss": 0.3505518913269043,
|
|
"memory(GiB)": 36.59,
|
|
"step": 545,
|
|
"token_acc": 0.8844506134759065,
|
|
"train_speed(iter/s)": 0.118004
|
|
},
|
|
{
|
|
"epoch": 0.6757275589341933,
|
|
"grad_norm": 0.7225794196128845,
|
|
"learning_rate": 8.796901339486136e-06,
|
|
"loss": 0.36959023475646974,
|
|
"memory(GiB)": 36.59,
|
|
"step": 550,
|
|
"token_acc": 0.8763546536336592,
|
|
"train_speed(iter/s)": 0.118273
|
|
},
|
|
{
|
|
"epoch": 0.681870536742686,
|
|
"grad_norm": 0.6392650604248047,
|
|
"learning_rate": 8.775870833900226e-06,
|
|
"loss": 0.35045757293701174,
|
|
"memory(GiB)": 36.59,
|
|
"step": 555,
|
|
"token_acc": 0.879759337041662,
|
|
"train_speed(iter/s)": 0.118527
|
|
},
|
|
{
|
|
"epoch": 0.6880135145511787,
|
|
"grad_norm": 0.7549835443496704,
|
|
"learning_rate": 8.75468371388161e-06,
|
|
"loss": 0.3724693775177002,
|
|
"memory(GiB)": 36.59,
|
|
"step": 560,
|
|
"token_acc": 0.8853696026829382,
|
|
"train_speed(iter/s)": 0.11871
|
|
},
|
|
{
|
|
"epoch": 0.6880135145511787,
|
|
"eval_loss": 0.3657075762748718,
|
|
"eval_runtime": 31.0165,
|
|
"eval_samples_per_second": 16.959,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8855408560311284,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.6941564923596714,
|
|
"grad_norm": 0.7093605399131775,
|
|
"learning_rate": 8.733340858223268e-06,
|
|
"loss": 0.3566612720489502,
|
|
"memory(GiB)": 36.59,
|
|
"step": 565,
|
|
"token_acc": 0.8851378312772255,
|
|
"train_speed(iter/s)": 0.117959
|
|
},
|
|
{
|
|
"epoch": 0.700299470168164,
|
|
"grad_norm": 0.7799498438835144,
|
|
"learning_rate": 8.711843152177735e-06,
|
|
"loss": 0.35616464614868165,
|
|
"memory(GiB)": 36.59,
|
|
"step": 570,
|
|
"token_acc": 0.8828106906294872,
|
|
"train_speed(iter/s)": 0.118218
|
|
},
|
|
{
|
|
"epoch": 0.7064424479766567,
|
|
"grad_norm": 0.7166778445243835,
|
|
"learning_rate": 8.690191487420385e-06,
|
|
"loss": 0.36056735515594485,
|
|
"memory(GiB)": 36.59,
|
|
"step": 575,
|
|
"token_acc": 0.8801836905093237,
|
|
"train_speed(iter/s)": 0.118442
|
|
},
|
|
{
|
|
"epoch": 0.7125854257851494,
|
|
"grad_norm": 0.7225358486175537,
|
|
"learning_rate": 8.668386762012445e-06,
|
|
"loss": 0.3537228345870972,
|
|
"memory(GiB)": 36.59,
|
|
"step": 580,
|
|
"token_acc": 0.8771770513178728,
|
|
"train_speed(iter/s)": 0.118636
|
|
},
|
|
{
|
|
"epoch": 0.7125854257851494,
|
|
"eval_loss": 0.3655913472175598,
|
|
"eval_runtime": 31.0226,
|
|
"eval_samples_per_second": 16.955,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8858832684824903,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.718728403593642,
|
|
"grad_norm": 0.707248866558075,
|
|
"learning_rate": 8.646429880363746e-06,
|
|
"loss": 0.35696611404418943,
|
|
"memory(GiB)": 36.59,
|
|
"step": 585,
|
|
"token_acc": 0.8862164894194702,
|
|
"train_speed(iter/s)": 0.117878
|
|
},
|
|
{
|
|
"epoch": 0.7248713814021347,
|
|
"grad_norm": 0.7443193197250366,
|
|
"learning_rate": 8.624321753195209e-06,
|
|
"loss": 0.3900872468948364,
|
|
"memory(GiB)": 36.59,
|
|
"step": 590,
|
|
"token_acc": 0.8764379646896419,
|
|
"train_speed(iter/s)": 0.118131
|
|
},
|
|
{
|
|
"epoch": 0.7310143592106274,
|
|
"grad_norm": 0.6623369455337524,
|
|
"learning_rate": 8.602063297501069e-06,
|
|
"loss": 0.36977558135986327,
|
|
"memory(GiB)": 36.59,
|
|
"step": 595,
|
|
"token_acc": 0.880887231518028,
|
|
"train_speed(iter/s)": 0.118362
|
|
},
|
|
{
|
|
"epoch": 0.73715733701912,
|
|
"grad_norm": 0.7453055381774902,
|
|
"learning_rate": 8.579655436510847e-06,
|
|
"loss": 0.35259857177734377,
|
|
"memory(GiB)": 36.59,
|
|
"step": 600,
|
|
"token_acc": 0.8738677315671569,
|
|
"train_speed(iter/s)": 0.118577
|
|
},
|
|
{
|
|
"epoch": 0.73715733701912,
|
|
"eval_loss": 0.3638327419757843,
|
|
"eval_runtime": 31.0054,
|
|
"eval_samples_per_second": 16.965,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.885855598789451,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7433003148276127,
|
|
"grad_norm": 0.7225296497344971,
|
|
"learning_rate": 8.557099099651046e-06,
|
|
"loss": 0.36968977451324464,
|
|
"memory(GiB)": 36.59,
|
|
"step": 605,
|
|
"token_acc": 0.8823149650444696,
|
|
"train_speed(iter/s)": 0.11788
|
|
},
|
|
{
|
|
"epoch": 0.7494432926361053,
|
|
"grad_norm": 0.6900773048400879,
|
|
"learning_rate": 8.534395222506614e-06,
|
|
"loss": 0.36718852519989015,
|
|
"memory(GiB)": 36.59,
|
|
"step": 610,
|
|
"token_acc": 0.8860133630289533,
|
|
"train_speed(iter/s)": 0.118141
|
|
},
|
|
{
|
|
"epoch": 0.755586270444598,
|
|
"grad_norm": 0.671517014503479,
|
|
"learning_rate": 8.511544746782124e-06,
|
|
"loss": 0.36435210704803467,
|
|
"memory(GiB)": 36.59,
|
|
"step": 615,
|
|
"token_acc": 0.8798159594739043,
|
|
"train_speed(iter/s)": 0.118359
|
|
},
|
|
{
|
|
"epoch": 0.7617292482530907,
|
|
"grad_norm": 0.6808713674545288,
|
|
"learning_rate": 8.488548620262722e-06,
|
|
"loss": 0.36147489547729494,
|
|
"memory(GiB)": 36.59,
|
|
"step": 620,
|
|
"token_acc": 0.8823460793691529,
|
|
"train_speed(iter/s)": 0.118579
|
|
},
|
|
{
|
|
"epoch": 0.7617292482530907,
|
|
"eval_loss": 0.3635016977787018,
|
|
"eval_runtime": 31.0094,
|
|
"eval_samples_per_second": 16.963,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8860146995244271,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.7678722260615833,
|
|
"grad_norm": 0.7556000351905823,
|
|
"learning_rate": 8.465407796774816e-06,
|
|
"loss": 0.36651790142059326,
|
|
"memory(GiB)": 36.59,
|
|
"step": 625,
|
|
"token_acc": 0.8846928285600197,
|
|
"train_speed(iter/s)": 0.117873
|
|
},
|
|
{
|
|
"epoch": 0.774015203870076,
|
|
"grad_norm": 0.724098801612854,
|
|
"learning_rate": 8.442123236146509e-06,
|
|
"loss": 0.35537469387054443,
|
|
"memory(GiB)": 36.59,
|
|
"step": 630,
|
|
"token_acc": 0.8859382569251772,
|
|
"train_speed(iter/s)": 0.118118
|
|
},
|
|
{
|
|
"epoch": 0.7801581816785687,
|
|
"grad_norm": 0.728448748588562,
|
|
"learning_rate": 8.418695904167789e-06,
|
|
"loss": 0.3752614974975586,
|
|
"memory(GiB)": 36.59,
|
|
"step": 635,
|
|
"token_acc": 0.8905149297823024,
|
|
"train_speed(iter/s)": 0.118318
|
|
},
|
|
{
|
|
"epoch": 0.7863011594870614,
|
|
"grad_norm": 0.7735581994056702,
|
|
"learning_rate": 8.395126772550475e-06,
|
|
"loss": 0.3447936773300171,
|
|
"memory(GiB)": 36.59,
|
|
"step": 640,
|
|
"token_acc": 0.8823329283110571,
|
|
"train_speed(iter/s)": 0.118526
|
|
},
|
|
{
|
|
"epoch": 0.7863011594870614,
|
|
"eval_loss": 0.36254996061325073,
|
|
"eval_runtime": 31.0583,
|
|
"eval_samples_per_second": 16.936,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8862879377431907,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.7924441372955541,
|
|
"grad_norm": 0.6083407402038574,
|
|
"learning_rate": 8.371416818887907e-06,
|
|
"loss": 0.3541299343109131,
|
|
"memory(GiB)": 36.59,
|
|
"step": 645,
|
|
"token_acc": 0.8867384523493496,
|
|
"train_speed(iter/s)": 0.117839
|
|
},
|
|
{
|
|
"epoch": 0.7985871151040467,
|
|
"grad_norm": 0.7006340622901917,
|
|
"learning_rate": 8.347567026614398e-06,
|
|
"loss": 0.36687259674072265,
|
|
"memory(GiB)": 36.59,
|
|
"step": 650,
|
|
"token_acc": 0.878874098160756,
|
|
"train_speed(iter/s)": 0.118045
|
|
},
|
|
{
|
|
"epoch": 0.8047300929125394,
|
|
"grad_norm": 0.7071450352668762,
|
|
"learning_rate": 8.323578384964444e-06,
|
|
"loss": 0.354215145111084,
|
|
"memory(GiB)": 36.59,
|
|
"step": 655,
|
|
"token_acc": 0.8844807747626809,
|
|
"train_speed(iter/s)": 0.118259
|
|
},
|
|
{
|
|
"epoch": 0.810873070721032,
|
|
"grad_norm": 0.6859620809555054,
|
|
"learning_rate": 8.299451888931696e-06,
|
|
"loss": 0.33744206428527834,
|
|
"memory(GiB)": 36.59,
|
|
"step": 660,
|
|
"token_acc": 0.8832839002687923,
|
|
"train_speed(iter/s)": 0.118483
|
|
},
|
|
{
|
|
"epoch": 0.810873070721032,
|
|
"eval_loss": 0.36208656430244446,
|
|
"eval_runtime": 31.005,
|
|
"eval_samples_per_second": 16.965,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8863259835711198,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.8170160485295247,
|
|
"grad_norm": 0.6853975057601929,
|
|
"learning_rate": 8.275188539227687e-06,
|
|
"loss": 0.3501296043395996,
|
|
"memory(GiB)": 36.59,
|
|
"step": 665,
|
|
"token_acc": 0.8818506429867994,
|
|
"train_speed(iter/s)": 0.117792
|
|
},
|
|
{
|
|
"epoch": 0.8231590263380174,
|
|
"grad_norm": 0.672095775604248,
|
|
"learning_rate": 8.250789342240326e-06,
|
|
"loss": 0.3572331190109253,
|
|
"memory(GiB)": 36.59,
|
|
"step": 670,
|
|
"token_acc": 0.8840531998946537,
|
|
"train_speed(iter/s)": 0.118042
|
|
},
|
|
{
|
|
"epoch": 0.82930200414651,
|
|
"grad_norm": 0.6654704809188843,
|
|
"learning_rate": 8.22625530999215e-06,
|
|
"loss": 0.35687694549560545,
|
|
"memory(GiB)": 36.59,
|
|
"step": 675,
|
|
"token_acc": 0.8840721896461247,
|
|
"train_speed(iter/s)": 0.118263
|
|
},
|
|
{
|
|
"epoch": 0.8354449819550027,
|
|
"grad_norm": 0.6872120499610901,
|
|
"learning_rate": 8.201587460098362e-06,
|
|
"loss": 0.34873204231262206,
|
|
"memory(GiB)": 36.59,
|
|
"step": 680,
|
|
"token_acc": 0.884066094755313,
|
|
"train_speed(iter/s)": 0.118437
|
|
},
|
|
{
|
|
"epoch": 0.8354449819550027,
|
|
"eval_loss": 0.36098214983940125,
|
|
"eval_runtime": 31.0688,
|
|
"eval_samples_per_second": 16.93,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8864989191526157,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.8415879597634953,
|
|
"grad_norm": 0.6905971765518188,
|
|
"learning_rate": 8.176786815724601e-06,
|
|
"loss": 0.3643667221069336,
|
|
"memory(GiB)": 36.59,
|
|
"step": 685,
|
|
"token_acc": 0.8811371118426906,
|
|
"train_speed(iter/s)": 0.117814
|
|
},
|
|
{
|
|
"epoch": 0.847730937571988,
|
|
"grad_norm": 0.688023567199707,
|
|
"learning_rate": 8.151854405544526e-06,
|
|
"loss": 0.369766902923584,
|
|
"memory(GiB)": 36.59,
|
|
"step": 690,
|
|
"token_acc": 0.8848363488998546,
|
|
"train_speed(iter/s)": 0.118018
|
|
},
|
|
{
|
|
"epoch": 0.8538739153804807,
|
|
"grad_norm": 0.6458128690719604,
|
|
"learning_rate": 8.12679126369713e-06,
|
|
"loss": 0.3629646301269531,
|
|
"memory(GiB)": 36.59,
|
|
"step": 695,
|
|
"token_acc": 0.8775533863525702,
|
|
"train_speed(iter/s)": 0.118233
|
|
},
|
|
{
|
|
"epoch": 0.8600168931889733,
|
|
"grad_norm": 0.6942622065544128,
|
|
"learning_rate": 8.101598429743862e-06,
|
|
"loss": 0.3692671298980713,
|
|
"memory(GiB)": 36.59,
|
|
"step": 700,
|
|
"token_acc": 0.8780482002236338,
|
|
"train_speed(iter/s)": 0.118437
|
|
},
|
|
{
|
|
"epoch": 0.8600168931889733,
|
|
"eval_loss": 0.35998860001564026,
|
|
"eval_runtime": 31.0268,
|
|
"eval_samples_per_second": 16.953,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8866441850410722,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.866159870997466,
|
|
"grad_norm": 0.7322993278503418,
|
|
"learning_rate": 8.076276948625495e-06,
|
|
"loss": 0.36251187324523926,
|
|
"memory(GiB)": 36.59,
|
|
"step": 705,
|
|
"token_acc": 0.8850018575958389,
|
|
"train_speed(iter/s)": 0.117844
|
|
},
|
|
{
|
|
"epoch": 0.8723028488059587,
|
|
"grad_norm": 0.7000852823257446,
|
|
"learning_rate": 8.050827870618795e-06,
|
|
"loss": 0.352423095703125,
|
|
"memory(GiB)": 36.59,
|
|
"step": 710,
|
|
"token_acc": 0.8848280386093149,
|
|
"train_speed(iter/s)": 0.118064
|
|
},
|
|
{
|
|
"epoch": 0.8784458266144514,
|
|
"grad_norm": 0.7393072843551636,
|
|
"learning_rate": 8.02525225129295e-06,
|
|
"loss": 0.3464043140411377,
|
|
"memory(GiB)": 36.59,
|
|
"step": 715,
|
|
"token_acc": 0.8842617899915519,
|
|
"train_speed(iter/s)": 0.118282
|
|
},
|
|
{
|
|
"epoch": 0.8845888044229441,
|
|
"grad_norm": 0.676538348197937,
|
|
"learning_rate": 7.999551151465793e-06,
|
|
"loss": 0.3531349658966064,
|
|
"memory(GiB)": 36.59,
|
|
"step": 720,
|
|
"token_acc": 0.882141211070386,
|
|
"train_speed(iter/s)": 0.118479
|
|
},
|
|
{
|
|
"epoch": 0.8845888044229441,
|
|
"eval_loss": 0.3599785268306732,
|
|
"eval_runtime": 31.1371,
|
|
"eval_samples_per_second": 16.893,
|
|
"eval_steps_per_second": 4.239,
|
|
"eval_token_acc": 0.8867168179853004,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.8907317822314367,
|
|
"grad_norm": 0.6606590151786804,
|
|
"learning_rate": 7.973725637159795e-06,
|
|
"loss": 0.3510305881500244,
|
|
"memory(GiB)": 36.59,
|
|
"step": 725,
|
|
"token_acc": 0.8858640888051448,
|
|
"train_speed(iter/s)": 0.117866
|
|
},
|
|
{
|
|
"epoch": 0.8968747600399294,
|
|
"grad_norm": 0.6910014748573303,
|
|
"learning_rate": 7.947776779557862e-06,
|
|
"loss": 0.34729857444763185,
|
|
"memory(GiB)": 36.59,
|
|
"step": 730,
|
|
"token_acc": 0.8849401138817985,
|
|
"train_speed(iter/s)": 0.11805
|
|
},
|
|
{
|
|
"epoch": 0.903017737848422,
|
|
"grad_norm": 0.715715765953064,
|
|
"learning_rate": 7.921705654958886e-06,
|
|
"loss": 0.37070040702819823,
|
|
"memory(GiB)": 36.59,
|
|
"step": 735,
|
|
"token_acc": 0.873466112894091,
|
|
"train_speed(iter/s)": 0.118238
|
|
},
|
|
{
|
|
"epoch": 0.9091607156569147,
|
|
"grad_norm": 0.6847560405731201,
|
|
"learning_rate": 7.895513344733124e-06,
|
|
"loss": 0.3388267993927002,
|
|
"memory(GiB)": 36.59,
|
|
"step": 740,
|
|
"token_acc": 0.892940483205657,
|
|
"train_speed(iter/s)": 0.118418
|
|
},
|
|
{
|
|
"epoch": 0.9091607156569147,
|
|
"eval_loss": 0.35883787274360657,
|
|
"eval_runtime": 31.0744,
|
|
"eval_samples_per_second": 16.927,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8870696065715521,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.9153036934654074,
|
|
"grad_norm": 0.7038071155548096,
|
|
"learning_rate": 7.869200935277317e-06,
|
|
"loss": 0.3523221015930176,
|
|
"memory(GiB)": 36.59,
|
|
"step": 745,
|
|
"token_acc": 0.8841770158578834,
|
|
"train_speed(iter/s)": 0.117874
|
|
},
|
|
{
|
|
"epoch": 0.9214466712739,
|
|
"grad_norm": 0.7095304727554321,
|
|
"learning_rate": 7.842769517969665e-06,
|
|
"loss": 0.34724674224853513,
|
|
"memory(GiB)": 36.59,
|
|
"step": 750,
|
|
"token_acc": 0.8921830597616321,
|
|
"train_speed(iter/s)": 0.118073
|
|
},
|
|
{
|
|
"epoch": 0.9275896490823927,
|
|
"grad_norm": 0.7056006789207458,
|
|
"learning_rate": 7.816220189124527e-06,
|
|
"loss": 0.34354069232940676,
|
|
"memory(GiB)": 36.59,
|
|
"step": 755,
|
|
"token_acc": 0.8906672115144498,
|
|
"train_speed(iter/s)": 0.118273
|
|
},
|
|
{
|
|
"epoch": 0.9337326268908853,
|
|
"grad_norm": 0.6470732092857361,
|
|
"learning_rate": 7.789554049946966e-06,
|
|
"loss": 0.37253437042236326,
|
|
"memory(GiB)": 36.59,
|
|
"step": 760,
|
|
"token_acc": 0.8801472977363803,
|
|
"train_speed(iter/s)": 0.118474
|
|
},
|
|
{
|
|
"epoch": 0.9337326268908853,
|
|
"eval_loss": 0.3579709231853485,
|
|
"eval_runtime": 31.0126,
|
|
"eval_samples_per_second": 16.961,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8876368352788586,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.939875604699378,
|
|
"grad_norm": 0.671111524105072,
|
|
"learning_rate": 7.762772206487066e-06,
|
|
"loss": 0.3589931011199951,
|
|
"memory(GiB)": 36.59,
|
|
"step": 765,
|
|
"token_acc": 0.8832086813686086,
|
|
"train_speed(iter/s)": 0.117907
|
|
},
|
|
{
|
|
"epoch": 0.9460185825078707,
|
|
"grad_norm": 0.7187632322311401,
|
|
"learning_rate": 7.735875769594063e-06,
|
|
"loss": 0.34763507843017577,
|
|
"memory(GiB)": 36.59,
|
|
"step": 770,
|
|
"token_acc": 0.8847997559593846,
|
|
"train_speed(iter/s)": 0.118076
|
|
},
|
|
{
|
|
"epoch": 0.9521615603163633,
|
|
"grad_norm": 0.7212729454040527,
|
|
"learning_rate": 7.70886585487026e-06,
|
|
"loss": 0.3598261833190918,
|
|
"memory(GiB)": 36.59,
|
|
"step": 775,
|
|
"token_acc": 0.8688440332679189,
|
|
"train_speed(iter/s)": 0.118251
|
|
},
|
|
{
|
|
"epoch": 0.958304538124856,
|
|
"grad_norm": 0.6621137261390686,
|
|
"learning_rate": 7.681743582624761e-06,
|
|
"loss": 0.35757567882537844,
|
|
"memory(GiB)": 36.59,
|
|
"step": 780,
|
|
"token_acc": 0.8785007468259896,
|
|
"train_speed(iter/s)": 0.118454
|
|
},
|
|
{
|
|
"epoch": 0.958304538124856,
|
|
"eval_loss": 0.3576439321041107,
|
|
"eval_runtime": 31.075,
|
|
"eval_samples_per_second": 16.927,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8877751837440553,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.9644475159333487,
|
|
"grad_norm": 0.7074070572853088,
|
|
"learning_rate": 7.654510077827003e-06,
|
|
"loss": 0.3493576765060425,
|
|
"memory(GiB)": 36.59,
|
|
"step": 785,
|
|
"token_acc": 0.8852768310495931,
|
|
"train_speed(iter/s)": 0.117922
|
|
},
|
|
{
|
|
"epoch": 0.9705904937418414,
|
|
"grad_norm": 0.6370189189910889,
|
|
"learning_rate": 7.627166470060092e-06,
|
|
"loss": 0.3448970317840576,
|
|
"memory(GiB)": 36.59,
|
|
"step": 790,
|
|
"token_acc": 0.8896250845717751,
|
|
"train_speed(iter/s)": 0.118138
|
|
},
|
|
{
|
|
"epoch": 0.9767334715503341,
|
|
"grad_norm": 0.6875202655792236,
|
|
"learning_rate": 7.59971389347395e-06,
|
|
"loss": 0.36741271018981936,
|
|
"memory(GiB)": 36.59,
|
|
"step": 795,
|
|
"token_acc": 0.880575873679322,
|
|
"train_speed(iter/s)": 0.118312
|
|
},
|
|
{
|
|
"epoch": 0.9828764493588267,
|
|
"grad_norm": 0.7139670848846436,
|
|
"learning_rate": 7.572153486738281e-06,
|
|
"loss": 0.3554513692855835,
|
|
"memory(GiB)": 36.59,
|
|
"step": 800,
|
|
"token_acc": 0.8777580460748777,
|
|
"train_speed(iter/s)": 0.118491
|
|
},
|
|
{
|
|
"epoch": 0.9828764493588267,
|
|
"eval_loss": 0.3568785786628723,
|
|
"eval_runtime": 31.0006,
|
|
"eval_samples_per_second": 16.967,
|
|
"eval_steps_per_second": 4.258,
|
|
"eval_token_acc": 0.8877959360138349,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.9890194271673194,
|
|
"grad_norm": 0.7183944582939148,
|
|
"learning_rate": 7.544486392995325e-06,
|
|
"loss": 0.3408940076828003,
|
|
"memory(GiB)": 36.59,
|
|
"step": 805,
|
|
"token_acc": 0.8823203099663748,
|
|
"train_speed(iter/s)": 0.117937
|
|
},
|
|
{
|
|
"epoch": 0.995162404975812,
|
|
"grad_norm": 0.7064708471298218,
|
|
"learning_rate": 7.516713759812465e-06,
|
|
"loss": 0.3436570167541504,
|
|
"memory(GiB)": 36.59,
|
|
"step": 810,
|
|
"token_acc": 0.8865785782162089,
|
|
"train_speed(iter/s)": 0.118112
|
|
},
|
|
{
|
|
"epoch": 1.002457191123397,
|
|
"grad_norm": 0.7077184915542603,
|
|
"learning_rate": 7.4888367391346085e-06,
|
|
"loss": 0.40673046112060546,
|
|
"memory(GiB)": 36.59,
|
|
"step": 815,
|
|
"token_acc": 0.8932987364620939,
|
|
"train_speed(iter/s)": 0.11823
|
|
},
|
|
{
|
|
"epoch": 1.0086001689318898,
|
|
"grad_norm": 0.6631501317024231,
|
|
"learning_rate": 7.460856487236421e-06,
|
|
"loss": 0.32202835083007814,
|
|
"memory(GiB)": 36.59,
|
|
"step": 820,
|
|
"token_acc": 0.8988542163968578,
|
|
"train_speed(iter/s)": 0.118434
|
|
},
|
|
{
|
|
"epoch": 1.0086001689318898,
|
|
"eval_loss": 0.3615255355834961,
|
|
"eval_runtime": 31.0114,
|
|
"eval_samples_per_second": 16.961,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8877959360138349,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.0147431467403825,
|
|
"grad_norm": 0.657802939414978,
|
|
"learning_rate": 7.432774164674359e-06,
|
|
"loss": 0.2976385116577148,
|
|
"memory(GiB)": 36.59,
|
|
"step": 825,
|
|
"token_acc": 0.8940141675474071,
|
|
"train_speed(iter/s)": 0.117911
|
|
},
|
|
{
|
|
"epoch": 1.0208861245488752,
|
|
"grad_norm": 0.675591766834259,
|
|
"learning_rate": 7.404590936238535e-06,
|
|
"loss": 0.311181640625,
|
|
"memory(GiB)": 36.59,
|
|
"step": 830,
|
|
"token_acc": 0.8997530755324309,
|
|
"train_speed(iter/s)": 0.118145
|
|
},
|
|
{
|
|
"epoch": 1.0270291023573677,
|
|
"grad_norm": 0.6661099791526794,
|
|
"learning_rate": 7.376307970904408e-06,
|
|
"loss": 0.3044283866882324,
|
|
"memory(GiB)": 36.59,
|
|
"step": 835,
|
|
"token_acc": 0.8999576197242789,
|
|
"train_speed(iter/s)": 0.118312
|
|
},
|
|
{
|
|
"epoch": 1.0331720801658604,
|
|
"grad_norm": 0.6595695614814758,
|
|
"learning_rate": 7.34792644178429e-06,
|
|
"loss": 0.3037309408187866,
|
|
"memory(GiB)": 36.59,
|
|
"step": 840,
|
|
"token_acc": 0.9055141287284144,
|
|
"train_speed(iter/s)": 0.118457
|
|
},
|
|
{
|
|
"epoch": 1.0331720801658604,
|
|
"eval_loss": 0.35968878865242004,
|
|
"eval_runtime": 31.0067,
|
|
"eval_samples_per_second": 16.964,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8877025507998271,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.039315057974353,
|
|
"grad_norm": 0.7282394170761108,
|
|
"learning_rate": 7.319447526078696e-06,
|
|
"loss": 0.3085323333740234,
|
|
"memory(GiB)": 36.59,
|
|
"step": 845,
|
|
"token_acc": 0.8898290405833752,
|
|
"train_speed(iter/s)": 0.118005
|
|
},
|
|
{
|
|
"epoch": 1.0454580357828458,
|
|
"grad_norm": 0.6701980233192444,
|
|
"learning_rate": 7.290872405027508e-06,
|
|
"loss": 0.29195051193237304,
|
|
"memory(GiB)": 36.59,
|
|
"step": 850,
|
|
"token_acc": 0.9044647710888937,
|
|
"train_speed(iter/s)": 0.118164
|
|
},
|
|
{
|
|
"epoch": 1.0516010135913385,
|
|
"grad_norm": 0.6651575565338135,
|
|
"learning_rate": 7.262202263860989e-06,
|
|
"loss": 0.30650150775909424,
|
|
"memory(GiB)": 36.59,
|
|
"step": 855,
|
|
"token_acc": 0.8993040861428504,
|
|
"train_speed(iter/s)": 0.118324
|
|
},
|
|
{
|
|
"epoch": 1.057743991399831,
|
|
"grad_norm": 0.682246744632721,
|
|
"learning_rate": 7.233438291750615e-06,
|
|
"loss": 0.3102306842803955,
|
|
"memory(GiB)": 36.59,
|
|
"step": 860,
|
|
"token_acc": 0.9063239097279017,
|
|
"train_speed(iter/s)": 0.11848
|
|
},
|
|
{
|
|
"epoch": 1.057743991399831,
|
|
"eval_loss": 0.35930460691452026,
|
|
"eval_runtime": 31.0087,
|
|
"eval_samples_per_second": 16.963,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.887875486381323,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.0638869692083237,
|
|
"grad_norm": 0.7295219898223877,
|
|
"learning_rate": 7.204581681759752e-06,
|
|
"loss": 0.30730266571044923,
|
|
"memory(GiB)": 36.59,
|
|
"step": 865,
|
|
"token_acc": 0.8905181851880587,
|
|
"train_speed(iter/s)": 0.117999
|
|
},
|
|
{
|
|
"epoch": 1.0700299470168164,
|
|
"grad_norm": 0.6892926096916199,
|
|
"learning_rate": 7.175633630794176e-06,
|
|
"loss": 0.2974876403808594,
|
|
"memory(GiB)": 36.59,
|
|
"step": 870,
|
|
"token_acc": 0.9006297483247798,
|
|
"train_speed(iter/s)": 0.118168
|
|
},
|
|
{
|
|
"epoch": 1.0761729248253091,
|
|
"grad_norm": 0.6752432584762573,
|
|
"learning_rate": 7.146595339552423e-06,
|
|
"loss": 0.3102593421936035,
|
|
"memory(GiB)": 36.59,
|
|
"step": 875,
|
|
"token_acc": 0.9038279095421953,
|
|
"train_speed(iter/s)": 0.118364
|
|
},
|
|
{
|
|
"epoch": 1.0823159026338018,
|
|
"grad_norm": 0.674329400062561,
|
|
"learning_rate": 7.1174680124759856e-06,
|
|
"loss": 0.28625760078430174,
|
|
"memory(GiB)": 36.59,
|
|
"step": 880,
|
|
"token_acc": 0.9079884290164664,
|
|
"train_speed(iter/s)": 0.118523
|
|
},
|
|
{
|
|
"epoch": 1.0823159026338018,
|
|
"eval_loss": 0.36010968685150146,
|
|
"eval_runtime": 31.0269,
|
|
"eval_samples_per_second": 16.953,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8876299178555987,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.0884588804422943,
|
|
"grad_norm": 0.6883670091629028,
|
|
"learning_rate": 7.08825285769936e-06,
|
|
"loss": 0.3032073020935059,
|
|
"memory(GiB)": 36.59,
|
|
"step": 885,
|
|
"token_acc": 0.8932736033602344,
|
|
"train_speed(iter/s)": 0.118061
|
|
},
|
|
{
|
|
"epoch": 1.094601858250787,
|
|
"grad_norm": 0.671500027179718,
|
|
"learning_rate": 7.058951086999934e-06,
|
|
"loss": 0.3017904758453369,
|
|
"memory(GiB)": 36.59,
|
|
"step": 890,
|
|
"token_acc": 0.9018632618216911,
|
|
"train_speed(iter/s)": 0.118196
|
|
},
|
|
{
|
|
"epoch": 1.1007448360592798,
|
|
"grad_norm": 0.7209696173667908,
|
|
"learning_rate": 7.029563915747723e-06,
|
|
"loss": 0.31074273586273193,
|
|
"memory(GiB)": 36.59,
|
|
"step": 895,
|
|
"token_acc": 0.898548356982823,
|
|
"train_speed(iter/s)": 0.118358
|
|
},
|
|
{
|
|
"epoch": 1.1068878138677725,
|
|
"grad_norm": 0.624523937702179,
|
|
"learning_rate": 7.0000925628549595e-06,
|
|
"loss": 0.2956224918365479,
|
|
"memory(GiB)": 36.59,
|
|
"step": 900,
|
|
"token_acc": 0.9076877474540027,
|
|
"train_speed(iter/s)": 0.118515
|
|
},
|
|
{
|
|
"epoch": 1.1068878138677725,
|
|
"eval_loss": 0.3587914705276489,
|
|
"eval_runtime": 31.0433,
|
|
"eval_samples_per_second": 16.944,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8878443579766537,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.1130307916762652,
|
|
"grad_norm": 0.7052697539329529,
|
|
"learning_rate": 6.9705382507255405e-06,
|
|
"loss": 0.2872809648513794,
|
|
"memory(GiB)": 36.59,
|
|
"step": 905,
|
|
"token_acc": 0.8926222488296873,
|
|
"train_speed(iter/s)": 0.118076
|
|
},
|
|
{
|
|
"epoch": 1.1191737694847577,
|
|
"grad_norm": 0.7123196125030518,
|
|
"learning_rate": 6.940902205204321e-06,
|
|
"loss": 0.2964935302734375,
|
|
"memory(GiB)": 36.59,
|
|
"step": 910,
|
|
"token_acc": 0.9039498517120518,
|
|
"train_speed(iter/s)": 0.118226
|
|
},
|
|
{
|
|
"epoch": 1.1253167472932504,
|
|
"grad_norm": 0.660994291305542,
|
|
"learning_rate": 6.911185655526263e-06,
|
|
"loss": 0.302768611907959,
|
|
"memory(GiB)": 36.59,
|
|
"step": 915,
|
|
"token_acc": 0.9020544461398969,
|
|
"train_speed(iter/s)": 0.118393
|
|
},
|
|
{
|
|
"epoch": 1.131459725101743,
|
|
"grad_norm": 0.7210450768470764,
|
|
"learning_rate": 6.881389834265463e-06,
|
|
"loss": 0.3173034429550171,
|
|
"memory(GiB)": 36.59,
|
|
"step": 920,
|
|
"token_acc": 0.8982849864950921,
|
|
"train_speed(iter/s)": 0.118553
|
|
},
|
|
{
|
|
"epoch": 1.131459725101743,
|
|
"eval_loss": 0.3588680624961853,
|
|
"eval_runtime": 31.0113,
|
|
"eval_samples_per_second": 16.962,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8877405966277562,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.1376027029102358,
|
|
"grad_norm": 0.6697967648506165,
|
|
"learning_rate": 6.851515977284014e-06,
|
|
"loss": 0.299291205406189,
|
|
"memory(GiB)": 36.59,
|
|
"step": 925,
|
|
"token_acc": 0.8902243928864662,
|
|
"train_speed(iter/s)": 0.118081
|
|
},
|
|
{
|
|
"epoch": 1.1437456807187285,
|
|
"grad_norm": 0.7066377401351929,
|
|
"learning_rate": 6.821565323680759e-06,
|
|
"loss": 0.29554860591888427,
|
|
"memory(GiB)": 36.59,
|
|
"step": 930,
|
|
"token_acc": 0.9000831485587583,
|
|
"train_speed(iter/s)": 0.118223
|
|
},
|
|
{
|
|
"epoch": 1.149888658527221,
|
|
"grad_norm": 0.6386650204658508,
|
|
"learning_rate": 6.791539115739879e-06,
|
|
"loss": 0.3022310256958008,
|
|
"memory(GiB)": 36.59,
|
|
"step": 935,
|
|
"token_acc": 0.8924001814882032,
|
|
"train_speed(iter/s)": 0.118412
|
|
},
|
|
{
|
|
"epoch": 1.1560316363357137,
|
|
"grad_norm": 0.6704084873199463,
|
|
"learning_rate": 6.761438598879383e-06,
|
|
"loss": 0.28601846694946287,
|
|
"memory(GiB)": 36.59,
|
|
"step": 940,
|
|
"token_acc": 0.9012753677155092,
|
|
"train_speed(iter/s)": 0.118547
|
|
},
|
|
{
|
|
"epoch": 1.1560316363357137,
|
|
"eval_loss": 0.35880643129348755,
|
|
"eval_runtime": 31.0179,
|
|
"eval_samples_per_second": 16.958,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8880657155209685,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.1621746141442064,
|
|
"grad_norm": 0.6651415228843689,
|
|
"learning_rate": 6.731265021599437e-06,
|
|
"loss": 0.3218855381011963,
|
|
"memory(GiB)": 36.59,
|
|
"step": 945,
|
|
"token_acc": 0.8918404969109147,
|
|
"train_speed(iter/s)": 0.118102
|
|
},
|
|
{
|
|
"epoch": 1.1683175919526991,
|
|
"grad_norm": 0.6738328337669373,
|
|
"learning_rate": 6.7010196354305876e-06,
|
|
"loss": 0.30361137390136717,
|
|
"memory(GiB)": 36.59,
|
|
"step": 950,
|
|
"token_acc": 0.9092978421945045,
|
|
"train_speed(iter/s)": 0.118249
|
|
},
|
|
{
|
|
"epoch": 1.1744605697611918,
|
|
"grad_norm": 0.6776899099349976,
|
|
"learning_rate": 6.670703694881851e-06,
|
|
"loss": 0.29663915634155275,
|
|
"memory(GiB)": 36.59,
|
|
"step": 955,
|
|
"token_acc": 0.8984023842094978,
|
|
"train_speed(iter/s)": 0.118405
|
|
},
|
|
{
|
|
"epoch": 1.1806035475696843,
|
|
"grad_norm": 0.6939485669136047,
|
|
"learning_rate": 6.640318457388672e-06,
|
|
"loss": 0.3056649684906006,
|
|
"memory(GiB)": 36.59,
|
|
"step": 960,
|
|
"token_acc": 0.8867154116418194,
|
|
"train_speed(iter/s)": 0.118549
|
|
},
|
|
{
|
|
"epoch": 1.1806035475696843,
|
|
"eval_loss": 0.35902491211891174,
|
|
"eval_runtime": 31.0009,
|
|
"eval_samples_per_second": 16.967,
|
|
"eval_steps_per_second": 4.258,
|
|
"eval_token_acc": 0.8878408992650237,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.186746525378177,
|
|
"grad_norm": 0.7092224359512329,
|
|
"learning_rate": 6.609865183260777e-06,
|
|
"loss": 0.2987541198730469,
|
|
"memory(GiB)": 36.59,
|
|
"step": 965,
|
|
"token_acc": 0.8890386576114193,
|
|
"train_speed(iter/s)": 0.118089
|
|
},
|
|
{
|
|
"epoch": 1.1928895031866698,
|
|
"grad_norm": 0.7263514399528503,
|
|
"learning_rate": 6.579345135629896e-06,
|
|
"loss": 0.28489587306976316,
|
|
"memory(GiB)": 36.59,
|
|
"step": 970,
|
|
"token_acc": 0.8956198679571216,
|
|
"train_speed(iter/s)": 0.118237
|
|
},
|
|
{
|
|
"epoch": 1.1990324809951625,
|
|
"grad_norm": 0.6999565362930298,
|
|
"learning_rate": 6.548759580397363e-06,
|
|
"loss": 0.30396156311035155,
|
|
"memory(GiB)": 36.59,
|
|
"step": 975,
|
|
"token_acc": 0.8999096083844331,
|
|
"train_speed(iter/s)": 0.118377
|
|
},
|
|
{
|
|
"epoch": 1.2051754588036552,
|
|
"grad_norm": 0.6386498212814331,
|
|
"learning_rate": 6.518109786181628e-06,
|
|
"loss": 0.32303242683410643,
|
|
"memory(GiB)": 36.59,
|
|
"step": 980,
|
|
"token_acc": 0.8918318331799511,
|
|
"train_speed(iter/s)": 0.11851
|
|
},
|
|
{
|
|
"epoch": 1.2051754588036552,
|
|
"eval_loss": 0.3577713966369629,
|
|
"eval_runtime": 30.9949,
|
|
"eval_samples_per_second": 16.971,
|
|
"eval_steps_per_second": 4.259,
|
|
"eval_token_acc": 0.8879481193255512,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.2113184366121477,
|
|
"grad_norm": 0.6696978807449341,
|
|
"learning_rate": 6.487397024265616e-06,
|
|
"loss": 0.29286723136901854,
|
|
"memory(GiB)": 36.59,
|
|
"step": 985,
|
|
"token_acc": 0.8883067219587296,
|
|
"train_speed(iter/s)": 0.11806
|
|
},
|
|
{
|
|
"epoch": 1.2174614144206404,
|
|
"grad_norm": 0.6677629947662354,
|
|
"learning_rate": 6.456622568544012e-06,
|
|
"loss": 0.295971155166626,
|
|
"memory(GiB)": 36.59,
|
|
"step": 990,
|
|
"token_acc": 0.901066495199663,
|
|
"train_speed(iter/s)": 0.118215
|
|
},
|
|
{
|
|
"epoch": 1.223604392229133,
|
|
"grad_norm": 0.6924172639846802,
|
|
"learning_rate": 6.425787695470419e-06,
|
|
"loss": 0.2936640024185181,
|
|
"memory(GiB)": 36.59,
|
|
"step": 995,
|
|
"token_acc": 0.8968813591405991,
|
|
"train_speed(iter/s)": 0.118377
|
|
},
|
|
{
|
|
"epoch": 1.2297473700376258,
|
|
"grad_norm": 0.6816849112510681,
|
|
"learning_rate": 6.3948936840044096e-06,
|
|
"loss": 0.29815101623535156,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1000,
|
|
"token_acc": 0.9113140380746014,
|
|
"train_speed(iter/s)": 0.118511
|
|
},
|
|
{
|
|
"epoch": 1.2297473700376258,
|
|
"eval_loss": 0.35851019620895386,
|
|
"eval_runtime": 31.0715,
|
|
"eval_samples_per_second": 16.929,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8881279723303069,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.2358903478461185,
|
|
"grad_norm": 0.7491683959960938,
|
|
"learning_rate": 6.363941815558484e-06,
|
|
"loss": 0.305048394203186,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1005,
|
|
"token_acc": 0.8883380321029248,
|
|
"train_speed(iter/s)": 0.118078
|
|
},
|
|
{
|
|
"epoch": 1.242033325654611,
|
|
"grad_norm": 0.6767114400863647,
|
|
"learning_rate": 6.332933373944914e-06,
|
|
"loss": 0.2910877466201782,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1010,
|
|
"token_acc": 0.8970752230332523,
|
|
"train_speed(iter/s)": 0.118198
|
|
},
|
|
{
|
|
"epoch": 1.2481763034631037,
|
|
"grad_norm": 0.6579700112342834,
|
|
"learning_rate": 6.301869645322498e-06,
|
|
"loss": 0.2989434480667114,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1015,
|
|
"token_acc": 0.9020202767705173,
|
|
"train_speed(iter/s)": 0.118352
|
|
},
|
|
{
|
|
"epoch": 1.2543192812715964,
|
|
"grad_norm": 0.7496470808982849,
|
|
"learning_rate": 6.270751918143213e-06,
|
|
"loss": 0.3161623477935791,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1020,
|
|
"token_acc": 0.8931434478006202,
|
|
"train_speed(iter/s)": 0.118501
|
|
},
|
|
{
|
|
"epoch": 1.2543192812715964,
|
|
"eval_loss": 0.3574770390987396,
|
|
"eval_runtime": 31.0423,
|
|
"eval_samples_per_second": 16.945,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.888463467358409,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.2604622590800891,
|
|
"grad_norm": 0.6567991971969604,
|
|
"learning_rate": 6.239581483098767e-06,
|
|
"loss": 0.2918637752532959,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1025,
|
|
"token_acc": 0.8930598715558318,
|
|
"train_speed(iter/s)": 0.118037
|
|
},
|
|
{
|
|
"epoch": 1.2666052368885818,
|
|
"grad_norm": 0.7520761489868164,
|
|
"learning_rate": 6.208359633067077e-06,
|
|
"loss": 0.2961498022079468,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1030,
|
|
"token_acc": 0.9095238095238095,
|
|
"train_speed(iter/s)": 0.118175
|
|
},
|
|
{
|
|
"epoch": 1.2727482146970743,
|
|
"grad_norm": 0.7256974577903748,
|
|
"learning_rate": 6.177087663058626e-06,
|
|
"loss": 0.30830044746398927,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1035,
|
|
"token_acc": 0.9017879399034648,
|
|
"train_speed(iter/s)": 0.118311
|
|
},
|
|
{
|
|
"epoch": 1.278891192505567,
|
|
"grad_norm": 0.6479539275169373,
|
|
"learning_rate": 6.145766870162767e-06,
|
|
"loss": 0.2862563610076904,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1040,
|
|
"token_acc": 0.9018611343172747,
|
|
"train_speed(iter/s)": 0.118441
|
|
},
|
|
{
|
|
"epoch": 1.278891192505567,
|
|
"eval_loss": 0.3572877049446106,
|
|
"eval_runtime": 31.0406,
|
|
"eval_samples_per_second": 16.946,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8883147427583226,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.2850341703140598,
|
|
"grad_norm": 0.7319021224975586,
|
|
"learning_rate": 6.114398553493909e-06,
|
|
"loss": 0.3000927925109863,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1045,
|
|
"token_acc": 0.8926547069479344,
|
|
"train_speed(iter/s)": 0.118
|
|
},
|
|
{
|
|
"epoch": 1.2911771481225525,
|
|
"grad_norm": 0.705988883972168,
|
|
"learning_rate": 6.0829840141376385e-06,
|
|
"loss": 0.30697922706604003,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1050,
|
|
"token_acc": 0.901831032683459,
|
|
"train_speed(iter/s)": 0.118157
|
|
},
|
|
{
|
|
"epoch": 1.2973201259310452,
|
|
"grad_norm": 0.64214026927948,
|
|
"learning_rate": 6.051524555096754e-06,
|
|
"loss": 0.30261845588684083,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1055,
|
|
"token_acc": 0.902963066984974,
|
|
"train_speed(iter/s)": 0.118309
|
|
},
|
|
{
|
|
"epoch": 1.3034631037395377,
|
|
"grad_norm": 0.7394285798072815,
|
|
"learning_rate": 6.020021481237216e-06,
|
|
"loss": 0.30278654098510743,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1060,
|
|
"token_acc": 0.9020456426628828,
|
|
"train_speed(iter/s)": 0.118449
|
|
},
|
|
{
|
|
"epoch": 1.3034631037395377,
|
|
"eval_loss": 0.35663846135139465,
|
|
"eval_runtime": 31.0016,
|
|
"eval_samples_per_second": 16.967,
|
|
"eval_steps_per_second": 4.258,
|
|
"eval_token_acc": 0.8883424124513619,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.3096060815480304,
|
|
"grad_norm": 0.6863911151885986,
|
|
"learning_rate": 5.988476099234033e-06,
|
|
"loss": 0.2937177658081055,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1065,
|
|
"token_acc": 0.8901542316498898,
|
|
"train_speed(iter/s)": 0.1181
|
|
},
|
|
{
|
|
"epoch": 1.315749059356523,
|
|
"grad_norm": 0.654614269733429,
|
|
"learning_rate": 5.956889717517053e-06,
|
|
"loss": 0.3110340595245361,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1070,
|
|
"token_acc": 0.9028094153378892,
|
|
"train_speed(iter/s)": 0.118212
|
|
},
|
|
{
|
|
"epoch": 1.3218920371650158,
|
|
"grad_norm": 0.7234563827514648,
|
|
"learning_rate": 5.925263646216697e-06,
|
|
"loss": 0.31188764572143557,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1075,
|
|
"token_acc": 0.9096784327805578,
|
|
"train_speed(iter/s)": 0.118351
|
|
},
|
|
{
|
|
"epoch": 1.3280350149735085,
|
|
"grad_norm": 0.6865576505661011,
|
|
"learning_rate": 5.893599197109625e-06,
|
|
"loss": 0.302515435218811,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1080,
|
|
"token_acc": 0.8899835796387521,
|
|
"train_speed(iter/s)": 0.118487
|
|
},
|
|
{
|
|
"epoch": 1.3280350149735085,
|
|
"eval_loss": 0.35516050457954407,
|
|
"eval_runtime": 30.9944,
|
|
"eval_samples_per_second": 16.971,
|
|
"eval_steps_per_second": 4.259,
|
|
"eval_token_acc": 0.8885637699956767,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.334177992782001,
|
|
"grad_norm": 0.6132445335388184,
|
|
"learning_rate": 5.861897683564313e-06,
|
|
"loss": 0.3079413414001465,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1085,
|
|
"token_acc": 0.8899461794132038,
|
|
"train_speed(iter/s)": 0.118068
|
|
},
|
|
{
|
|
"epoch": 1.3403209705904937,
|
|
"grad_norm": 0.7110121250152588,
|
|
"learning_rate": 5.830160420486588e-06,
|
|
"loss": 0.29248368740081787,
|
|
"memory(GiB)": 36.59,
|
|
"step": 1090,
|
|
"token_acc": 0.905348378514747,
|
|
"train_speed(iter/s)": 0.118225
|
|
},
|
|
{
|
|
"epoch": 1.3464639483989864,
|
|
"grad_norm": 0.6436595916748047,
|
|
"learning_rate": 5.798388724265085e-06,
|
|
"loss": 0.3002151966094971,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1095,
|
|
"token_acc": 0.9053737339917971,
|
|
"train_speed(iter/s)": 0.118367
|
|
},
|
|
{
|
|
"epoch": 1.3526069262074791,
|
|
"grad_norm": 0.7013940215110779,
|
|
"learning_rate": 5.7665839127166475e-06,
|
|
"loss": 0.3010303020477295,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1100,
|
|
"token_acc": 0.9023475037752253,
|
|
"train_speed(iter/s)": 0.118479
|
|
},
|
|
{
|
|
"epoch": 1.3526069262074791,
|
|
"eval_loss": 0.3555811047554016,
|
|
"eval_runtime": 31.0333,
|
|
"eval_samples_per_second": 16.95,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8887712926934717,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.3587499040159718,
|
|
"grad_norm": 0.7001612186431885,
|
|
"learning_rate": 5.734747305031664e-06,
|
|
"loss": 0.3120265483856201,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1105,
|
|
"token_acc": 0.8886269689596821,
|
|
"train_speed(iter/s)": 0.118091
|
|
},
|
|
{
|
|
"epoch": 1.3648928818244643,
|
|
"grad_norm": 0.6804000735282898,
|
|
"learning_rate": 5.7028802217193565e-06,
|
|
"loss": 0.30517282485961916,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1110,
|
|
"token_acc": 0.8981199555362235,
|
|
"train_speed(iter/s)": 0.118215
|
|
},
|
|
{
|
|
"epoch": 1.371035859632957,
|
|
"grad_norm": 0.6867697834968567,
|
|
"learning_rate": 5.670983984553003e-06,
|
|
"loss": 0.3074041366577148,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1115,
|
|
"token_acc": 0.903482807952247,
|
|
"train_speed(iter/s)": 0.118338
|
|
},
|
|
{
|
|
"epoch": 1.3771788374414498,
|
|
"grad_norm": 0.7690563201904297,
|
|
"learning_rate": 5.63905991651512e-06,
|
|
"loss": 0.3027225971221924,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1120,
|
|
"token_acc": 0.8987023004673533,
|
|
"train_speed(iter/s)": 0.118449
|
|
},
|
|
{
|
|
"epoch": 1.3771788374414498,
|
|
"eval_loss": 0.3556562066078186,
|
|
"eval_runtime": 31.1119,
|
|
"eval_samples_per_second": 16.907,
|
|
"eval_steps_per_second": 4.243,
|
|
"eval_token_acc": 0.8888612191958496,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.3833218152499425,
|
|
"grad_norm": 0.6769737005233765,
|
|
"learning_rate": 5.607109341742579e-06,
|
|
"loss": 0.30417637825012206,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1125,
|
|
"token_acc": 0.8885960318346111,
|
|
"train_speed(iter/s)": 0.118061
|
|
},
|
|
{
|
|
"epoch": 1.3894647930584352,
|
|
"grad_norm": 0.6724239587783813,
|
|
"learning_rate": 5.575133585471697e-06,
|
|
"loss": 0.31278433799743655,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1130,
|
|
"token_acc": 0.8959036584253262,
|
|
"train_speed(iter/s)": 0.118168
|
|
},
|
|
{
|
|
"epoch": 1.3956077708669277,
|
|
"grad_norm": 0.7643016576766968,
|
|
"learning_rate": 5.543133973983254e-06,
|
|
"loss": 0.29112992286682127,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1135,
|
|
"token_acc": 0.9014400645633149,
|
|
"train_speed(iter/s)": 0.118301
|
|
},
|
|
{
|
|
"epoch": 1.4017507486754204,
|
|
"grad_norm": 0.6788151264190674,
|
|
"learning_rate": 5.511111834547496e-06,
|
|
"loss": 0.3165508508682251,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1140,
|
|
"token_acc": 0.903283467750516,
|
|
"train_speed(iter/s)": 0.118415
|
|
},
|
|
{
|
|
"epoch": 1.4017507486754204,
|
|
"eval_loss": 0.35399720072746277,
|
|
"eval_runtime": 31.0476,
|
|
"eval_samples_per_second": 16.942,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8891137051448336,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.407893726483913,
|
|
"grad_norm": 0.6638893485069275,
|
|
"learning_rate": 5.479068495369071e-06,
|
|
"loss": 0.2801161289215088,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1145,
|
|
"token_acc": 0.8925869894099848,
|
|
"train_speed(iter/s)": 0.118025
|
|
},
|
|
{
|
|
"epoch": 1.4140367042924058,
|
|
"grad_norm": 0.7107008099555969,
|
|
"learning_rate": 5.447005285531948e-06,
|
|
"loss": 0.29520745277404786,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1150,
|
|
"token_acc": 0.9020094269412057,
|
|
"train_speed(iter/s)": 0.118132
|
|
},
|
|
{
|
|
"epoch": 1.4201796821008985,
|
|
"grad_norm": 0.6262108087539673,
|
|
"learning_rate": 5.414923534944283e-06,
|
|
"loss": 0.28986170291900637,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1155,
|
|
"token_acc": 0.9047965292421047,
|
|
"train_speed(iter/s)": 0.11825
|
|
},
|
|
{
|
|
"epoch": 1.426322659909391,
|
|
"grad_norm": 0.7209280729293823,
|
|
"learning_rate": 5.38282457428326e-06,
|
|
"loss": 0.30995869636535645,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1160,
|
|
"token_acc": 0.9020344876192267,
|
|
"train_speed(iter/s)": 0.118366
|
|
},
|
|
{
|
|
"epoch": 1.426322659909391,
|
|
"eval_loss": 0.3549746870994568,
|
|
"eval_runtime": 31.0688,
|
|
"eval_samples_per_second": 16.93,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8894630350194552,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.4324656377178837,
|
|
"grad_norm": 0.6941882371902466,
|
|
"learning_rate": 5.350709734939898e-06,
|
|
"loss": 0.313739013671875,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1165,
|
|
"token_acc": 0.889407067409571,
|
|
"train_speed(iter/s)": 0.117998
|
|
},
|
|
{
|
|
"epoch": 1.4386086155263764,
|
|
"grad_norm": 0.6950980424880981,
|
|
"learning_rate": 5.318580348963826e-06,
|
|
"loss": 0.29497203826904295,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1170,
|
|
"token_acc": 0.9058259992665934,
|
|
"train_speed(iter/s)": 0.118116
|
|
},
|
|
{
|
|
"epoch": 1.4447515933348691,
|
|
"grad_norm": 0.6526186466217041,
|
|
"learning_rate": 5.286437749008031e-06,
|
|
"loss": 0.29609017372131347,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1175,
|
|
"token_acc": 0.9071177290528133,
|
|
"train_speed(iter/s)": 0.11824
|
|
},
|
|
{
|
|
"epoch": 1.4508945711433618,
|
|
"grad_norm": 0.6585668921470642,
|
|
"learning_rate": 5.2542832682735956e-06,
|
|
"loss": 0.2915393590927124,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1180,
|
|
"token_acc": 0.8964739593006288,
|
|
"train_speed(iter/s)": 0.118376
|
|
},
|
|
{
|
|
"epoch": 1.4508945711433618,
|
|
"eval_loss": 0.35392019152641296,
|
|
"eval_runtime": 31.07,
|
|
"eval_samples_per_second": 16.93,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.889134457414613,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.4570375489518543,
|
|
"grad_norm": 0.680291473865509,
|
|
"learning_rate": 5.222118240454376e-06,
|
|
"loss": 0.3221513509750366,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1185,
|
|
"token_acc": 0.8858566297847655,
|
|
"train_speed(iter/s)": 0.117989
|
|
},
|
|
{
|
|
"epoch": 1.463180526760347,
|
|
"grad_norm": 0.676287055015564,
|
|
"learning_rate": 5.18994399968171e-06,
|
|
"loss": 0.303191614151001,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1190,
|
|
"token_acc": 0.8928310930499115,
|
|
"train_speed(iter/s)": 0.118095
|
|
},
|
|
{
|
|
"epoch": 1.4693235045688398,
|
|
"grad_norm": 0.7134848237037659,
|
|
"learning_rate": 5.157761880469058e-06,
|
|
"loss": 0.30745644569396974,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1195,
|
|
"token_acc": 0.8987542686739455,
|
|
"train_speed(iter/s)": 0.118213
|
|
},
|
|
{
|
|
"epoch": 1.4754664823773325,
|
|
"grad_norm": 0.706149160861969,
|
|
"learning_rate": 5.125573217656664e-06,
|
|
"loss": 0.3102452278137207,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1200,
|
|
"token_acc": 0.9014028524666823,
|
|
"train_speed(iter/s)": 0.118318
|
|
},
|
|
{
|
|
"epoch": 1.4754664823773325,
|
|
"eval_loss": 0.35402196645736694,
|
|
"eval_runtime": 31.0702,
|
|
"eval_samples_per_second": 16.929,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8895391266753134,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.4816094601858252,
|
|
"grad_norm": 0.7066270112991333,
|
|
"learning_rate": 5.0933793463561855e-06,
|
|
"loss": 0.3033695936203003,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1205,
|
|
"token_acc": 0.8896138651714031,
|
|
"train_speed(iter/s)": 0.117945
|
|
},
|
|
{
|
|
"epoch": 1.4877524379943177,
|
|
"grad_norm": 0.6695776581764221,
|
|
"learning_rate": 5.061181601895317e-06,
|
|
"loss": 0.30724053382873534,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1210,
|
|
"token_acc": 0.9012793441808471,
|
|
"train_speed(iter/s)": 0.118065
|
|
},
|
|
{
|
|
"epoch": 1.4938954158028104,
|
|
"grad_norm": 0.7692334651947021,
|
|
"learning_rate": 5.028981319762399e-06,
|
|
"loss": 0.28596570491790774,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1215,
|
|
"token_acc": 0.8964816040858792,
|
|
"train_speed(iter/s)": 0.118187
|
|
},
|
|
{
|
|
"epoch": 1.500038393611303,
|
|
"grad_norm": 0.6707490086555481,
|
|
"learning_rate": 4.996779835551035e-06,
|
|
"loss": 0.2939592838287354,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1220,
|
|
"token_acc": 0.8994356329668192,
|
|
"train_speed(iter/s)": 0.118298
|
|
},
|
|
{
|
|
"epoch": 1.500038393611303,
|
|
"eval_loss": 0.35305002331733704,
|
|
"eval_runtime": 31.066,
|
|
"eval_samples_per_second": 16.932,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8896774751405102,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.5061813714197958,
|
|
"grad_norm": 0.7542144656181335,
|
|
"learning_rate": 4.964578484904679e-06,
|
|
"loss": 0.30585541725158694,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1225,
|
|
"token_acc": 0.8881905335110271,
|
|
"train_speed(iter/s)": 0.117949
|
|
},
|
|
{
|
|
"epoch": 1.5123243492282885,
|
|
"grad_norm": 0.6754580140113831,
|
|
"learning_rate": 4.932378603461253e-06,
|
|
"loss": 0.2997127056121826,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1230,
|
|
"token_acc": 0.9038497785317123,
|
|
"train_speed(iter/s)": 0.118065
|
|
},
|
|
{
|
|
"epoch": 1.518467327036781,
|
|
"grad_norm": 0.7103241682052612,
|
|
"learning_rate": 4.900181526797737e-06,
|
|
"loss": 0.29804291725158694,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1235,
|
|
"token_acc": 0.8995869901910171,
|
|
"train_speed(iter/s)": 0.118167
|
|
},
|
|
{
|
|
"epoch": 1.5246103048452737,
|
|
"grad_norm": 0.6416381001472473,
|
|
"learning_rate": 4.867988590374777e-06,
|
|
"loss": 0.2915628433227539,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1240,
|
|
"token_acc": 0.8995757044689388,
|
|
"train_speed(iter/s)": 0.118299
|
|
},
|
|
{
|
|
"epoch": 1.5246103048452737,
|
|
"eval_loss": 0.35335448384284973,
|
|
"eval_runtime": 31.1015,
|
|
"eval_samples_per_second": 16.912,
|
|
"eval_steps_per_second": 4.244,
|
|
"eval_token_acc": 0.8896774751405102,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.5307532826537664,
|
|
"grad_norm": 0.7514793872833252,
|
|
"learning_rate": 4.835801129481287e-06,
|
|
"loss": 0.305086350440979,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1245,
|
|
"token_acc": 0.8938343509704211,
|
|
"train_speed(iter/s)": 0.117954
|
|
},
|
|
{
|
|
"epoch": 1.5368962604622591,
|
|
"grad_norm": 0.712042510509491,
|
|
"learning_rate": 4.803620479179071e-06,
|
|
"loss": 0.30651469230651857,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1250,
|
|
"token_acc": 0.9019437191760952,
|
|
"train_speed(iter/s)": 0.118064
|
|
},
|
|
{
|
|
"epoch": 1.5430392382707518,
|
|
"grad_norm": 0.6950103640556335,
|
|
"learning_rate": 4.771447974247449e-06,
|
|
"loss": 0.29916160106658934,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1255,
|
|
"token_acc": 0.8986829014071162,
|
|
"train_speed(iter/s)": 0.118206
|
|
},
|
|
{
|
|
"epoch": 1.5491822160792443,
|
|
"grad_norm": 0.702800452709198,
|
|
"learning_rate": 4.7392849491278825e-06,
|
|
"loss": 0.3027307987213135,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1260,
|
|
"token_acc": 0.8973517128165512,
|
|
"train_speed(iter/s)": 0.118315
|
|
},
|
|
{
|
|
"epoch": 1.5491822160792443,
|
|
"eval_loss": 0.35245779156684875,
|
|
"eval_runtime": 31.0495,
|
|
"eval_samples_per_second": 16.941,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8897846952010376,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.555325193887737,
|
|
"grad_norm": 0.6939496397972107,
|
|
"learning_rate": 4.707132737868639e-06,
|
|
"loss": 0.30812973976135255,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1265,
|
|
"token_acc": 0.8929094774646575,
|
|
"train_speed(iter/s)": 0.117991
|
|
},
|
|
{
|
|
"epoch": 1.5614681716962298,
|
|
"grad_norm": 0.6996237635612488,
|
|
"learning_rate": 4.674992674069445e-06,
|
|
"loss": 0.3079190969467163,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1270,
|
|
"token_acc": 0.8922962411611463,
|
|
"train_speed(iter/s)": 0.118087
|
|
},
|
|
{
|
|
"epoch": 1.5676111495047225,
|
|
"grad_norm": 0.7096247673034668,
|
|
"learning_rate": 4.642866090826187e-06,
|
|
"loss": 0.29966809749603274,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1275,
|
|
"token_acc": 0.8995864625915011,
|
|
"train_speed(iter/s)": 0.118159
|
|
},
|
|
{
|
|
"epoch": 1.5737541273132152,
|
|
"grad_norm": 0.6891176104545593,
|
|
"learning_rate": 4.610754320675603e-06,
|
|
"loss": 0.28565430641174316,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1280,
|
|
"token_acc": 0.9035195544740737,
|
|
"train_speed(iter/s)": 0.118282
|
|
},
|
|
{
|
|
"epoch": 1.5737541273132152,
|
|
"eval_loss": 0.3529431223869324,
|
|
"eval_runtime": 31.0442,
|
|
"eval_samples_per_second": 16.944,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8897604842196282,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.5798971051217077,
|
|
"grad_norm": 0.6836899518966675,
|
|
"learning_rate": 4.578658695540018e-06,
|
|
"loss": 0.30156033039093016,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1285,
|
|
"token_acc": 0.8901772041128856,
|
|
"train_speed(iter/s)": 0.117956
|
|
},
|
|
{
|
|
"epoch": 1.5860400829302004,
|
|
"grad_norm": 0.6600014567375183,
|
|
"learning_rate": 4.5465805466721e-06,
|
|
"loss": 0.30488083362579343,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1290,
|
|
"token_acc": 0.9087979374798582,
|
|
"train_speed(iter/s)": 0.11807
|
|
},
|
|
{
|
|
"epoch": 1.592183060738693,
|
|
"grad_norm": 0.7213631272315979,
|
|
"learning_rate": 4.514521204599645e-06,
|
|
"loss": 0.30581624507904054,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1295,
|
|
"token_acc": 0.9020306055757139,
|
|
"train_speed(iter/s)": 0.118174
|
|
},
|
|
{
|
|
"epoch": 1.5983260385471858,
|
|
"grad_norm": 0.6365712285041809,
|
|
"learning_rate": 4.48248199907038e-06,
|
|
"loss": 0.2971078872680664,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1300,
|
|
"token_acc": 0.9063318669368791,
|
|
"train_speed(iter/s)": 0.118307
|
|
},
|
|
{
|
|
"epoch": 1.5983260385471858,
|
|
"eval_loss": 0.35123586654663086,
|
|
"eval_runtime": 31.0565,
|
|
"eval_samples_per_second": 16.937,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8903000432338953,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.6044690163556785,
|
|
"grad_norm": 0.7233961820602417,
|
|
"learning_rate": 4.450464258996822e-06,
|
|
"loss": 0.3078035831451416,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1305,
|
|
"token_acc": 0.8908178398170103,
|
|
"train_speed(iter/s)": 0.117996
|
|
},
|
|
{
|
|
"epoch": 1.610611994164171,
|
|
"grad_norm": 0.7506811022758484,
|
|
"learning_rate": 4.418469312401141e-06,
|
|
"loss": 0.29109845161437986,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1310,
|
|
"token_acc": 0.906337023704408,
|
|
"train_speed(iter/s)": 0.118097
|
|
},
|
|
{
|
|
"epoch": 1.6167549719726637,
|
|
"grad_norm": 0.7110884785652161,
|
|
"learning_rate": 4.386498486360095e-06,
|
|
"loss": 0.3077766180038452,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1315,
|
|
"token_acc": 0.8983554542610717,
|
|
"train_speed(iter/s)": 0.118213
|
|
},
|
|
{
|
|
"epoch": 1.6228979497811564,
|
|
"grad_norm": 0.6889677047729492,
|
|
"learning_rate": 4.354553106949972e-06,
|
|
"loss": 0.30059351921081545,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1320,
|
|
"token_acc": 0.90420160281651,
|
|
"train_speed(iter/s)": 0.118315
|
|
},
|
|
{
|
|
"epoch": 1.6228979497811564,
|
|
"eval_loss": 0.3506639003753662,
|
|
"eval_runtime": 31.0876,
|
|
"eval_samples_per_second": 16.92,
|
|
"eval_steps_per_second": 4.246,
|
|
"eval_token_acc": 0.8903450064850843,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.6290409275896491,
|
|
"grad_norm": 0.6659175753593445,
|
|
"learning_rate": 4.3226344991915936e-06,
|
|
"loss": 0.2960678577423096,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1325,
|
|
"token_acc": 0.8925680515759312,
|
|
"train_speed(iter/s)": 0.117967
|
|
},
|
|
{
|
|
"epoch": 1.6351839053981418,
|
|
"grad_norm": 0.6886357069015503,
|
|
"learning_rate": 4.290743986995353e-06,
|
|
"loss": 0.30909056663513185,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1330,
|
|
"token_acc": 0.9006650503792344,
|
|
"train_speed(iter/s)": 0.118082
|
|
},
|
|
{
|
|
"epoch": 1.6413268832066343,
|
|
"grad_norm": 0.7061545848846436,
|
|
"learning_rate": 4.258882893106308e-06,
|
|
"loss": 0.28565549850463867,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1335,
|
|
"token_acc": 0.9070018118019403,
|
|
"train_speed(iter/s)": 0.118171
|
|
},
|
|
{
|
|
"epoch": 1.647469861015127,
|
|
"grad_norm": 0.7113469243049622,
|
|
"learning_rate": 4.227052539049312e-06,
|
|
"loss": 0.28241825103759766,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1340,
|
|
"token_acc": 0.898852240585334,
|
|
"train_speed(iter/s)": 0.118285
|
|
},
|
|
{
|
|
"epoch": 1.647469861015127,
|
|
"eval_loss": 0.3508993089199066,
|
|
"eval_runtime": 31.0521,
|
|
"eval_samples_per_second": 16.939,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8900994379593601,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.6536128388236198,
|
|
"grad_norm": 0.663295567035675,
|
|
"learning_rate": 4.195254245074196e-06,
|
|
"loss": 0.2974137783050537,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1345,
|
|
"token_acc": 0.8932698844323589,
|
|
"train_speed(iter/s)": 0.117947
|
|
},
|
|
{
|
|
"epoch": 1.6597558166321125,
|
|
"grad_norm": 0.6674165725708008,
|
|
"learning_rate": 4.163489330101017e-06,
|
|
"loss": 0.3030970096588135,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1350,
|
|
"token_acc": 0.8978457754971743,
|
|
"train_speed(iter/s)": 0.118042
|
|
},
|
|
{
|
|
"epoch": 1.6658987944406052,
|
|
"grad_norm": 0.6563280820846558,
|
|
"learning_rate": 4.131759111665349e-06,
|
|
"loss": 0.2904500961303711,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1355,
|
|
"token_acc": 0.902543907296759,
|
|
"train_speed(iter/s)": 0.118117
|
|
},
|
|
{
|
|
"epoch": 1.6720417722490977,
|
|
"grad_norm": 0.6549026370048523,
|
|
"learning_rate": 4.100064905863628e-06,
|
|
"loss": 0.2979156970977783,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1360,
|
|
"token_acc": 0.8915877216849292,
|
|
"train_speed(iter/s)": 0.118213
|
|
},
|
|
{
|
|
"epoch": 1.6720417722490977,
|
|
"eval_loss": 0.3503533601760864,
|
|
"eval_runtime": 31.0554,
|
|
"eval_samples_per_second": 16.937,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8904902723735408,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.6781847500575904,
|
|
"grad_norm": 0.6918724179267883,
|
|
"learning_rate": 4.068408027298576e-06,
|
|
"loss": 0.2886175632476807,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1365,
|
|
"token_acc": 0.8957540263543192,
|
|
"train_speed(iter/s)": 0.117895
|
|
},
|
|
{
|
|
"epoch": 1.684327727866083,
|
|
"grad_norm": 0.6951196193695068,
|
|
"learning_rate": 4.036789789024659e-06,
|
|
"loss": 0.30408420562744143,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1370,
|
|
"token_acc": 0.9016488217746225,
|
|
"train_speed(iter/s)": 0.117988
|
|
},
|
|
{
|
|
"epoch": 1.6904707056745758,
|
|
"grad_norm": 0.7309929728507996,
|
|
"learning_rate": 4.00521150249364e-06,
|
|
"loss": 0.2967136144638062,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1375,
|
|
"token_acc": 0.9024064171122995,
|
|
"train_speed(iter/s)": 0.1181
|
|
},
|
|
{
|
|
"epoch": 1.6966136834830685,
|
|
"grad_norm": 0.7061511278152466,
|
|
"learning_rate": 3.973674477500172e-06,
|
|
"loss": 0.3006556749343872,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1380,
|
|
"token_acc": 0.9038457180411086,
|
|
"train_speed(iter/s)": 0.118226
|
|
},
|
|
{
|
|
"epoch": 1.6966136834830685,
|
|
"eval_loss": 0.3506544828414917,
|
|
"eval_runtime": 31.002,
|
|
"eval_samples_per_second": 16.967,
|
|
"eval_steps_per_second": 4.258,
|
|
"eval_token_acc": 0.8901928231733679,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.702756661291561,
|
|
"grad_norm": 0.696220338344574,
|
|
"learning_rate": 3.942180022127475e-06,
|
|
"loss": 0.2850822925567627,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1385,
|
|
"token_acc": 0.8949225591538171,
|
|
"train_speed(iter/s)": 0.117915
|
|
},
|
|
{
|
|
"epoch": 1.7088996391000537,
|
|
"grad_norm": 0.6707799434661865,
|
|
"learning_rate": 3.910729442693077e-06,
|
|
"loss": 0.30518031120300293,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1390,
|
|
"token_acc": 0.8971721087421103,
|
|
"train_speed(iter/s)": 0.118027
|
|
},
|
|
{
|
|
"epoch": 1.7150426169085464,
|
|
"grad_norm": 0.694172203540802,
|
|
"learning_rate": 3.8793240436946385e-06,
|
|
"loss": 0.29511513710021975,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1395,
|
|
"token_acc": 0.9010794140323825,
|
|
"train_speed(iter/s)": 0.118112
|
|
},
|
|
{
|
|
"epoch": 1.7211855947170391,
|
|
"grad_norm": 0.6791805624961853,
|
|
"learning_rate": 3.847965127755834e-06,
|
|
"loss": 0.2960803747177124,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1400,
|
|
"token_acc": 0.8956415132105685,
|
|
"train_speed(iter/s)": 0.11822
|
|
},
|
|
{
|
|
"epoch": 1.7211855947170391,
|
|
"eval_loss": 0.350666344165802,
|
|
"eval_runtime": 31.0507,
|
|
"eval_samples_per_second": 16.94,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8905456117596195,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.7273285725255318,
|
|
"grad_norm": 0.6747899651527405,
|
|
"learning_rate": 3.816653995572332e-06,
|
|
"loss": 0.290825629234314,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1405,
|
|
"token_acc": 0.891223331082264,
|
|
"train_speed(iter/s)": 0.117914
|
|
},
|
|
{
|
|
"epoch": 1.7334715503340243,
|
|
"grad_norm": 0.660038411617279,
|
|
"learning_rate": 3.7853919458578327e-06,
|
|
"loss": 0.28858532905578616,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1410,
|
|
"token_acc": 0.9013322410968354,
|
|
"train_speed(iter/s)": 0.118029
|
|
},
|
|
{
|
|
"epoch": 1.739614528142517,
|
|
"grad_norm": 0.6371601223945618,
|
|
"learning_rate": 3.7541802752902224e-06,
|
|
"loss": 0.28829474449157716,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1415,
|
|
"token_acc": 0.9037818893145325,
|
|
"train_speed(iter/s)": 0.118112
|
|
},
|
|
{
|
|
"epoch": 1.7457575059510098,
|
|
"grad_norm": 0.7338966131210327,
|
|
"learning_rate": 3.723020278457763e-06,
|
|
"loss": 0.2963329076766968,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1420,
|
|
"token_acc": 0.9052094407824792,
|
|
"train_speed(iter/s)": 0.118216
|
|
},
|
|
{
|
|
"epoch": 1.7457575059510098,
|
|
"eval_loss": 0.3507256507873535,
|
|
"eval_runtime": 31.0383,
|
|
"eval_samples_per_second": 16.947,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8900544747081712,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.7519004837595025,
|
|
"grad_norm": 0.6258969902992249,
|
|
"learning_rate": 3.6919132478054153e-06,
|
|
"loss": 0.29568450450897216,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1425,
|
|
"token_acc": 0.8909262230371559,
|
|
"train_speed(iter/s)": 0.117906
|
|
},
|
|
{
|
|
"epoch": 1.7580434615679952,
|
|
"grad_norm": 0.6673945784568787,
|
|
"learning_rate": 3.6608604735812226e-06,
|
|
"loss": 0.29297194480895994,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1430,
|
|
"token_acc": 0.9073745475193413,
|
|
"train_speed(iter/s)": 0.117999
|
|
},
|
|
{
|
|
"epoch": 1.7641864393764877,
|
|
"grad_norm": 0.6559710502624512,
|
|
"learning_rate": 3.629863243782799e-06,
|
|
"loss": 0.29749407768249514,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1435,
|
|
"token_acc": 0.9093345763896982,
|
|
"train_speed(iter/s)": 0.118115
|
|
},
|
|
{
|
|
"epoch": 1.7703294171849804,
|
|
"grad_norm": 0.6504038572311401,
|
|
"learning_rate": 3.5989228441039024e-06,
|
|
"loss": 0.29113216400146485,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1440,
|
|
"token_acc": 0.8930581191194346,
|
|
"train_speed(iter/s)": 0.118206
|
|
},
|
|
{
|
|
"epoch": 1.7703294171849804,
|
|
"eval_loss": 0.34917929768562317,
|
|
"eval_runtime": 31.0337,
|
|
"eval_samples_per_second": 16.949,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8902377864245569,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.776472394993473,
|
|
"grad_norm": 0.6400864720344543,
|
|
"learning_rate": 3.568040557881106e-06,
|
|
"loss": 0.2814110279083252,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1445,
|
|
"token_acc": 0.8906971833959715,
|
|
"train_speed(iter/s)": 0.117931
|
|
},
|
|
{
|
|
"epoch": 1.7826153728019658,
|
|
"grad_norm": 0.7064361572265625,
|
|
"learning_rate": 3.5372176660405717e-06,
|
|
"loss": 0.3039525270462036,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1450,
|
|
"token_acc": 0.9050828549515421,
|
|
"train_speed(iter/s)": 0.118013
|
|
},
|
|
{
|
|
"epoch": 1.7887583506104585,
|
|
"grad_norm": 0.6955869793891907,
|
|
"learning_rate": 3.506455447044923e-06,
|
|
"loss": 0.2821065425872803,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1455,
|
|
"token_acc": 0.9053625617102223,
|
|
"train_speed(iter/s)": 0.118116
|
|
},
|
|
{
|
|
"epoch": 1.794901328418951,
|
|
"grad_norm": 0.6877216696739197,
|
|
"learning_rate": 3.4757551768402074e-06,
|
|
"loss": 0.2811419010162354,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1460,
|
|
"token_acc": 0.9011031359892095,
|
|
"train_speed(iter/s)": 0.118215
|
|
},
|
|
{
|
|
"epoch": 1.794901328418951,
|
|
"eval_loss": 0.34925225377082825,
|
|
"eval_runtime": 31.0492,
|
|
"eval_samples_per_second": 16.941,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8903795936013835,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.8010443062274437,
|
|
"grad_norm": 0.6559416055679321,
|
|
"learning_rate": 3.4451181288029834e-06,
|
|
"loss": 0.2829850912094116,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1465,
|
|
"token_acc": 0.8958890676209237,
|
|
"train_speed(iter/s)": 0.117907
|
|
},
|
|
{
|
|
"epoch": 1.8071872840359364,
|
|
"grad_norm": 0.7104200720787048,
|
|
"learning_rate": 3.4145455736874957e-06,
|
|
"loss": 0.2918513059616089,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1470,
|
|
"token_acc": 0.9029460760822436,
|
|
"train_speed(iter/s)": 0.118008
|
|
},
|
|
{
|
|
"epoch": 1.8133302618444291,
|
|
"grad_norm": 0.7294064164161682,
|
|
"learning_rate": 3.3840387795729753e-06,
|
|
"loss": 0.30045604705810547,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1475,
|
|
"token_acc": 0.8996919108690979,
|
|
"train_speed(iter/s)": 0.118115
|
|
},
|
|
{
|
|
"epoch": 1.8194732396529218,
|
|
"grad_norm": 0.7393286824226379,
|
|
"learning_rate": 3.353599011811037e-06,
|
|
"loss": 0.3116471767425537,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1480,
|
|
"token_acc": 0.8992412297989751,
|
|
"train_speed(iter/s)": 0.118208
|
|
},
|
|
{
|
|
"epoch": 1.8194732396529218,
|
|
"eval_loss": 0.34843236207962036,
|
|
"eval_runtime": 31.0158,
|
|
"eval_samples_per_second": 16.959,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8908534370946822,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.8256162174614143,
|
|
"grad_norm": 0.7225602865219116,
|
|
"learning_rate": 3.323227532973193e-06,
|
|
"loss": 0.2964847326278687,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1485,
|
|
"token_acc": 0.8920995259023428,
|
|
"train_speed(iter/s)": 0.117914
|
|
},
|
|
{
|
|
"epoch": 1.831759195269907,
|
|
"grad_norm": 0.7169524431228638,
|
|
"learning_rate": 3.292925602798492e-06,
|
|
"loss": 0.2890679359436035,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1490,
|
|
"token_acc": 0.9052988882813924,
|
|
"train_speed(iter/s)": 0.118009
|
|
},
|
|
{
|
|
"epoch": 1.8379021730783998,
|
|
"grad_norm": 0.7271701097488403,
|
|
"learning_rate": 3.262694478141266e-06,
|
|
"loss": 0.30105009078979494,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1495,
|
|
"token_acc": 0.8908968566759589,
|
|
"train_speed(iter/s)": 0.118105
|
|
},
|
|
{
|
|
"epoch": 1.8440451508868925,
|
|
"grad_norm": 0.7436238527297974,
|
|
"learning_rate": 3.2325354129189923e-06,
|
|
"loss": 0.3033268451690674,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1500,
|
|
"token_acc": 0.9051588095396857,
|
|
"train_speed(iter/s)": 0.118206
|
|
},
|
|
{
|
|
"epoch": 1.8440451508868925,
|
|
"eval_loss": 0.3476485013961792,
|
|
"eval_runtime": 31.0233,
|
|
"eval_samples_per_second": 16.955,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8910609597924773,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.8501881286953852,
|
|
"grad_norm": 0.6778759956359863,
|
|
"learning_rate": 3.2024496580602892e-06,
|
|
"loss": 0.29907703399658203,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1505,
|
|
"token_acc": 0.8934337447015377,
|
|
"train_speed(iter/s)": 0.117911
|
|
},
|
|
{
|
|
"epoch": 1.8563311065038777,
|
|
"grad_norm": 0.6664173007011414,
|
|
"learning_rate": 3.172438461453032e-06,
|
|
"loss": 0.29923856258392334,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1510,
|
|
"token_acc": 0.8983641727004559,
|
|
"train_speed(iter/s)": 0.118019
|
|
},
|
|
{
|
|
"epoch": 1.8624740843123704,
|
|
"grad_norm": 0.7407649755477905,
|
|
"learning_rate": 3.142503067892594e-06,
|
|
"loss": 0.3053209066390991,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1515,
|
|
"token_acc": 0.8974559495588846,
|
|
"train_speed(iter/s)": 0.118102
|
|
},
|
|
{
|
|
"epoch": 1.868617062120863,
|
|
"grad_norm": 0.7822189927101135,
|
|
"learning_rate": 3.112644719030206e-06,
|
|
"loss": 0.2917191982269287,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1520,
|
|
"token_acc": 0.9052366138763197,
|
|
"train_speed(iter/s)": 0.118195
|
|
},
|
|
{
|
|
"epoch": 1.868617062120863,
|
|
"eval_loss": 0.3474676311016083,
|
|
"eval_runtime": 31.0192,
|
|
"eval_samples_per_second": 16.957,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8910090791180285,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.8747600399293558,
|
|
"grad_norm": 0.6843962669372559,
|
|
"learning_rate": 3.0828646533214657e-06,
|
|
"loss": 0.3129580497741699,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1525,
|
|
"token_acc": 0.8910584210937568,
|
|
"train_speed(iter/s)": 0.117907
|
|
},
|
|
{
|
|
"epoch": 1.8809030177378485,
|
|
"grad_norm": 0.6650720238685608,
|
|
"learning_rate": 3.053164105974964e-06,
|
|
"loss": 0.3007251024246216,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1530,
|
|
"token_acc": 0.9046979865771813,
|
|
"train_speed(iter/s)": 0.118012
|
|
},
|
|
{
|
|
"epoch": 1.887045995546341,
|
|
"grad_norm": 0.687574028968811,
|
|
"learning_rate": 3.0235443089010564e-06,
|
|
"loss": 0.2859373092651367,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1535,
|
|
"token_acc": 0.9096972925400097,
|
|
"train_speed(iter/s)": 0.118098
|
|
},
|
|
{
|
|
"epoch": 1.8931889733548337,
|
|
"grad_norm": 0.6390689611434937,
|
|
"learning_rate": 2.9940064906607607e-06,
|
|
"loss": 0.28398540019989016,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1540,
|
|
"token_acc": 0.9035791530035582,
|
|
"train_speed(iter/s)": 0.118191
|
|
},
|
|
{
|
|
"epoch": 1.8931889733548337,
|
|
"eval_loss": 0.3476438522338867,
|
|
"eval_runtime": 31.0517,
|
|
"eval_samples_per_second": 16.939,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8913341980112408,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.8993319511633264,
|
|
"grad_norm": 0.6599735617637634,
|
|
"learning_rate": 2.964551876414801e-06,
|
|
"loss": 0.27951204776763916,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1545,
|
|
"token_acc": 0.8958811522271253,
|
|
"train_speed(iter/s)": 0.117923
|
|
},
|
|
{
|
|
"epoch": 1.9054749289718191,
|
|
"grad_norm": 0.6753197312355042,
|
|
"learning_rate": 2.93518168787279e-06,
|
|
"loss": 0.2956626176834106,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1550,
|
|
"token_acc": 0.8987615726824576,
|
|
"train_speed(iter/s)": 0.118005
|
|
},
|
|
{
|
|
"epoch": 1.9116179067803118,
|
|
"grad_norm": 0.7011248469352722,
|
|
"learning_rate": 2.905897143242562e-06,
|
|
"loss": 0.2975893497467041,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1555,
|
|
"token_acc": 0.9092895928621318,
|
|
"train_speed(iter/s)": 0.118101
|
|
},
|
|
{
|
|
"epoch": 1.9177608845888043,
|
|
"grad_norm": 0.6635907292366028,
|
|
"learning_rate": 2.8766994571796336e-06,
|
|
"loss": 0.28919239044189454,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1560,
|
|
"token_acc": 0.9010686955756882,
|
|
"train_speed(iter/s)": 0.118185
|
|
},
|
|
{
|
|
"epoch": 1.9177608845888043,
|
|
"eval_loss": 0.3471442759037018,
|
|
"eval_runtime": 31.0546,
|
|
"eval_samples_per_second": 16.938,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8912892347600518,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.923903862397297,
|
|
"grad_norm": 0.7003067135810852,
|
|
"learning_rate": 2.8475898407368298e-06,
|
|
"loss": 0.3121751308441162,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1565,
|
|
"token_acc": 0.8906831756550552,
|
|
"train_speed(iter/s)": 0.11792
|
|
},
|
|
{
|
|
"epoch": 1.9300468402057898,
|
|
"grad_norm": 0.6917641162872314,
|
|
"learning_rate": 2.8185695013140474e-06,
|
|
"loss": 0.31047801971435546,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1570,
|
|
"token_acc": 0.8935967102364517,
|
|
"train_speed(iter/s)": 0.117987
|
|
},
|
|
{
|
|
"epoch": 1.9361898180142825,
|
|
"grad_norm": 0.717903196811676,
|
|
"learning_rate": 2.7896396426081844e-06,
|
|
"loss": 0.29785962104797364,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1575,
|
|
"token_acc": 0.9072703838075233,
|
|
"train_speed(iter/s)": 0.118079
|
|
},
|
|
{
|
|
"epoch": 1.9423327958227752,
|
|
"grad_norm": 0.7065854072570801,
|
|
"learning_rate": 2.7608014645632e-06,
|
|
"loss": 0.2994864463806152,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1580,
|
|
"token_acc": 0.8992320879224104,
|
|
"train_speed(iter/s)": 0.118176
|
|
},
|
|
{
|
|
"epoch": 1.9423327958227752,
|
|
"eval_loss": 0.34740638732910156,
|
|
"eval_runtime": 31.0367,
|
|
"eval_samples_per_second": 16.948,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8909641158668397,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.9484757736312677,
|
|
"grad_norm": 0.7280552387237549,
|
|
"learning_rate": 2.7320561633203567e-06,
|
|
"loss": 0.2979745864868164,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1585,
|
|
"token_acc": 0.8901521037274909,
|
|
"train_speed(iter/s)": 0.11791
|
|
},
|
|
{
|
|
"epoch": 1.9546187514397604,
|
|
"grad_norm": 0.6418682336807251,
|
|
"learning_rate": 2.703404931168594e-06,
|
|
"loss": 0.2907557010650635,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1590,
|
|
"token_acc": 0.8992377813256425,
|
|
"train_speed(iter/s)": 0.117995
|
|
},
|
|
{
|
|
"epoch": 1.960761729248253,
|
|
"grad_norm": 0.738042414188385,
|
|
"learning_rate": 2.6748489564950907e-06,
|
|
"loss": 0.29802637100219725,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1595,
|
|
"token_acc": 0.8980035246119306,
|
|
"train_speed(iter/s)": 0.118068
|
|
},
|
|
{
|
|
"epoch": 1.9669047070567458,
|
|
"grad_norm": 0.6280907988548279,
|
|
"learning_rate": 2.6463894237359556e-06,
|
|
"loss": 0.28393306732177737,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1600,
|
|
"token_acc": 0.9109865416676735,
|
|
"train_speed(iter/s)": 0.11816
|
|
},
|
|
{
|
|
"epoch": 1.9669047070567458,
|
|
"eval_loss": 0.34687539935112,
|
|
"eval_runtime": 31.0445,
|
|
"eval_samples_per_second": 16.943,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8911128404669261,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.9730476848652385,
|
|
"grad_norm": 0.7155392169952393,
|
|
"learning_rate": 2.618027513327116e-06,
|
|
"loss": 0.3036234378814697,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1605,
|
|
"token_acc": 0.8935712088588127,
|
|
"train_speed(iter/s)": 0.117891
|
|
},
|
|
{
|
|
"epoch": 1.979190662673731,
|
|
"grad_norm": 0.7185878157615662,
|
|
"learning_rate": 2.589764401655343e-06,
|
|
"loss": 0.30625033378601074,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1610,
|
|
"token_acc": 0.9087600373057938,
|
|
"train_speed(iter/s)": 0.117966
|
|
},
|
|
{
|
|
"epoch": 1.9853336404822237,
|
|
"grad_norm": 0.6735581159591675,
|
|
"learning_rate": 2.5616012610094702e-06,
|
|
"loss": 0.30725975036621095,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1615,
|
|
"token_acc": 0.8961660250130988,
|
|
"train_speed(iter/s)": 0.118045
|
|
},
|
|
{
|
|
"epoch": 1.9914766182907164,
|
|
"grad_norm": 0.7557063102722168,
|
|
"learning_rate": 2.533539259531757e-06,
|
|
"loss": 0.29239468574523925,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1620,
|
|
"token_acc": 0.8937711127034497,
|
|
"train_speed(iter/s)": 0.118131
|
|
},
|
|
{
|
|
"epoch": 1.9914766182907164,
|
|
"eval_loss": 0.3465479016304016,
|
|
"eval_runtime": 31.0613,
|
|
"eval_samples_per_second": 16.934,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8915348032857761,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.9976195960992091,
|
|
"grad_norm": 0.6903244853019714,
|
|
"learning_rate": 2.5055795611694435e-06,
|
|
"loss": 0.2919922351837158,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1625,
|
|
"token_acc": 0.8967221510883483,
|
|
"train_speed(iter/s)": 0.117886
|
|
},
|
|
{
|
|
"epoch": 2.004914382246794,
|
|
"grad_norm": 0.6568908095359802,
|
|
"learning_rate": 2.4777233256264743e-06,
|
|
"loss": 0.32158265113830564,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1630,
|
|
"token_acc": 0.9122267969438128,
|
|
"train_speed(iter/s)": 0.117945
|
|
},
|
|
{
|
|
"epoch": 2.011057360055287,
|
|
"grad_norm": 0.7132671475410461,
|
|
"learning_rate": 2.4499717083153975e-06,
|
|
"loss": 0.26807637214660646,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1635,
|
|
"token_acc": 0.9197771990740741,
|
|
"train_speed(iter/s)": 0.118027
|
|
},
|
|
{
|
|
"epoch": 2.0172003378637795,
|
|
"grad_norm": 0.6795634627342224,
|
|
"learning_rate": 2.4223258603094295e-06,
|
|
"loss": 0.2491468906402588,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1640,
|
|
"token_acc": 0.9240849211677818,
|
|
"train_speed(iter/s)": 0.118126
|
|
},
|
|
{
|
|
"epoch": 2.0172003378637795,
|
|
"eval_loss": 0.353736937046051,
|
|
"eval_runtime": 31.0562,
|
|
"eval_samples_per_second": 16.937,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8904072632944229,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 2.023343315672272,
|
|
"grad_norm": 0.6820616126060486,
|
|
"learning_rate": 2.3947869282947263e-06,
|
|
"loss": 0.24982304573059083,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1645,
|
|
"token_acc": 0.8986748783803705,
|
|
"train_speed(iter/s)": 0.117854
|
|
},
|
|
{
|
|
"epoch": 2.029486293480765,
|
|
"grad_norm": 0.7354549169540405,
|
|
"learning_rate": 2.3673560545228082e-06,
|
|
"loss": 0.25387675762176515,
|
|
"memory(GiB)": 39.06,
|
|
"step": 1650,
|
|
"token_acc": 0.9141678261286763,
|
|
"train_speed(iter/s)": 0.117936
|
|
},
|
|
{
|
|
"epoch": 2.0356292712892574,
|
|
"grad_norm": 0.6763687133789062,
|
|
"learning_rate": 2.3400343767631943e-06,
|
|
"loss": 0.25168399810791015,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1655,
|
|
"token_acc": 0.9232377049180328,
|
|
"train_speed(iter/s)": 0.118023
|
|
},
|
|
{
|
|
"epoch": 2.0417722490977503,
|
|
"grad_norm": 0.6416710019111633,
|
|
"learning_rate": 2.312823028256205e-06,
|
|
"loss": 0.2497392177581787,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1660,
|
|
"token_acc": 0.9226366364968939,
|
|
"train_speed(iter/s)": 0.118098
|
|
},
|
|
{
|
|
"epoch": 2.0417722490977503,
|
|
"eval_loss": 0.35414808988571167,
|
|
"eval_runtime": 31.0374,
|
|
"eval_samples_per_second": 16.947,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8909952442715089,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 2.047915226906243,
|
|
"grad_norm": 0.6878734827041626,
|
|
"learning_rate": 2.2857231376659517e-06,
|
|
"loss": 0.26041717529296876,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1665,
|
|
"token_acc": 0.895642282731377,
|
|
"train_speed(iter/s)": 0.117842
|
|
},
|
|
{
|
|
"epoch": 2.0540582047147353,
|
|
"grad_norm": 0.6756438612937927,
|
|
"learning_rate": 2.258735829033529e-06,
|
|
"loss": 0.2607592582702637,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1670,
|
|
"token_acc": 0.904909300316729,
|
|
"train_speed(iter/s)": 0.117933
|
|
},
|
|
{
|
|
"epoch": 2.0602011825232283,
|
|
"grad_norm": 0.6508097648620605,
|
|
"learning_rate": 2.231862221730394e-06,
|
|
"loss": 0.2445054054260254,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1675,
|
|
"token_acc": 0.9190018092758484,
|
|
"train_speed(iter/s)": 0.117998
|
|
},
|
|
{
|
|
"epoch": 2.0663441603317207,
|
|
"grad_norm": 0.6221520900726318,
|
|
"learning_rate": 2.2051034304119344e-06,
|
|
"loss": 0.2536668300628662,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1680,
|
|
"token_acc": 0.9074535753395931,
|
|
"train_speed(iter/s)": 0.118087
|
|
},
|
|
{
|
|
"epoch": 2.0663441603317207,
|
|
"eval_loss": 0.3554106652736664,
|
|
"eval_runtime": 31.0289,
|
|
"eval_samples_per_second": 16.952,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8906770428015565,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 2.0724871381402137,
|
|
"grad_norm": 0.6437965035438538,
|
|
"learning_rate": 2.1784605649712326e-06,
|
|
"loss": 0.2540877103805542,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1685,
|
|
"token_acc": 0.896763604572522,
|
|
"train_speed(iter/s)": 0.117846
|
|
},
|
|
{
|
|
"epoch": 2.078630115948706,
|
|
"grad_norm": 0.6842249631881714,
|
|
"learning_rate": 2.1519347304930317e-06,
|
|
"loss": 0.2614542007446289,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1690,
|
|
"token_acc": 0.9103810036765567,
|
|
"train_speed(iter/s)": 0.117925
|
|
},
|
|
{
|
|
"epoch": 2.0847730937571987,
|
|
"grad_norm": 0.6956413388252258,
|
|
"learning_rate": 2.1255270272079044e-06,
|
|
"loss": 0.2528813362121582,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1695,
|
|
"token_acc": 0.9163791495710556,
|
|
"train_speed(iter/s)": 0.118022
|
|
},
|
|
{
|
|
"epoch": 2.0909160715656916,
|
|
"grad_norm": 0.7066583037376404,
|
|
"learning_rate": 2.0992385504466075e-06,
|
|
"loss": 0.2548670291900635,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1700,
|
|
"token_acc": 0.9165421398684998,
|
|
"train_speed(iter/s)": 0.118107
|
|
},
|
|
{
|
|
"epoch": 2.0909160715656916,
|
|
"eval_loss": 0.35480257868766785,
|
|
"eval_runtime": 31.1108,
|
|
"eval_samples_per_second": 16.907,
|
|
"eval_steps_per_second": 4.243,
|
|
"eval_token_acc": 0.8904660613921315,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 2.097059049374184,
|
|
"grad_norm": 0.6234432458877563,
|
|
"learning_rate": 2.0730703905946612e-06,
|
|
"loss": 0.24052574634552001,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1705,
|
|
"token_acc": 0.8977333662447761,
|
|
"train_speed(iter/s)": 0.117854
|
|
},
|
|
{
|
|
"epoch": 2.103202027182677,
|
|
"grad_norm": 0.7239139080047607,
|
|
"learning_rate": 2.0470236330471125e-06,
|
|
"loss": 0.2701937437057495,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1710,
|
|
"token_acc": 0.9132731300051116,
|
|
"train_speed(iter/s)": 0.117927
|
|
},
|
|
{
|
|
"epoch": 2.1093450049911695,
|
|
"grad_norm": 0.7042427062988281,
|
|
"learning_rate": 2.0210993581635257e-06,
|
|
"loss": 0.2760786533355713,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1715,
|
|
"token_acc": 0.9138149259328708,
|
|
"train_speed(iter/s)": 0.118022
|
|
},
|
|
{
|
|
"epoch": 2.115487982799662,
|
|
"grad_norm": 0.6625633835792542,
|
|
"learning_rate": 1.9952986412231612e-06,
|
|
"loss": 0.2629417657852173,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1720,
|
|
"token_acc": 0.9162193754622009,
|
|
"train_speed(iter/s)": 0.118081
|
|
},
|
|
{
|
|
"epoch": 2.115487982799662,
|
|
"eval_loss": 0.35503000020980835,
|
|
"eval_runtime": 31.0455,
|
|
"eval_samples_per_second": 16.943,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8903346303501946,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 2.121630960608155,
|
|
"grad_norm": 0.698905885219574,
|
|
"learning_rate": 1.9696225523803803e-06,
|
|
"loss": 0.2582688808441162,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1725,
|
|
"token_acc": 0.8980166095055636,
|
|
"train_speed(iter/s)": 0.11783
|
|
},
|
|
{
|
|
"epoch": 2.1277739384166474,
|
|
"grad_norm": 0.6825575828552246,
|
|
"learning_rate": 1.944072156620261e-06,
|
|
"loss": 0.2485950469970703,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1730,
|
|
"token_acc": 0.9185884165422945,
|
|
"train_speed(iter/s)": 0.117919
|
|
},
|
|
{
|
|
"epoch": 2.1339169162251403,
|
|
"grad_norm": 0.656775176525116,
|
|
"learning_rate": 1.9186485137144217e-06,
|
|
"loss": 0.26242403984069823,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1735,
|
|
"token_acc": 0.9276958754348186,
|
|
"train_speed(iter/s)": 0.118001
|
|
},
|
|
{
|
|
"epoch": 2.140059894033633,
|
|
"grad_norm": 0.6787784099578857,
|
|
"learning_rate": 1.89335267817706e-06,
|
|
"loss": 0.2578416347503662,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1740,
|
|
"token_acc": 0.9204126213592233,
|
|
"train_speed(iter/s)": 0.118068
|
|
},
|
|
{
|
|
"epoch": 2.140059894033633,
|
|
"eval_loss": 0.35603559017181396,
|
|
"eval_runtime": 31.0997,
|
|
"eval_samples_per_second": 16.913,
|
|
"eval_steps_per_second": 4.244,
|
|
"eval_token_acc": 0.8905179420665802,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 2.1462028718421253,
|
|
"grad_norm": 0.705270528793335,
|
|
"learning_rate": 1.8681856992212211e-06,
|
|
"loss": 0.27148022651672366,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1745,
|
|
"token_acc": 0.8956282843498057,
|
|
"train_speed(iter/s)": 0.117819
|
|
},
|
|
{
|
|
"epoch": 2.1523458496506183,
|
|
"grad_norm": 0.6656559705734253,
|
|
"learning_rate": 1.8431486207152704e-06,
|
|
"loss": 0.251650071144104,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1750,
|
|
"token_acc": 0.9161503405192278,
|
|
"train_speed(iter/s)": 0.117892
|
|
},
|
|
{
|
|
"epoch": 2.1584888274591107,
|
|
"grad_norm": 0.6367560625076294,
|
|
"learning_rate": 1.8182424811396131e-06,
|
|
"loss": 0.24891986846923828,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1755,
|
|
"token_acc": 0.917142553869016,
|
|
"train_speed(iter/s)": 0.117962
|
|
},
|
|
{
|
|
"epoch": 2.1646318052676037,
|
|
"grad_norm": 0.7008864283561707,
|
|
"learning_rate": 1.7934683135435993e-06,
|
|
"loss": 0.25353493690490725,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1760,
|
|
"token_acc": 0.9114828452290961,
|
|
"train_speed(iter/s)": 0.118051
|
|
},
|
|
{
|
|
"epoch": 2.1646318052676037,
|
|
"eval_loss": 0.35659661889076233,
|
|
"eval_runtime": 31.0459,
|
|
"eval_samples_per_second": 16.943,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8904626026805015,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.170774783076096,
|
|
"grad_norm": 0.6810339093208313,
|
|
"learning_rate": 1.7688271455026867e-06,
|
|
"loss": 0.25748143196105955,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1765,
|
|
"token_acc": 0.8993780164502753,
|
|
"train_speed(iter/s)": 0.117817
|
|
},
|
|
{
|
|
"epoch": 2.1769177608845887,
|
|
"grad_norm": 0.701768696308136,
|
|
"learning_rate": 1.7443199990758168e-06,
|
|
"loss": 0.25628554821014404,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1770,
|
|
"token_acc": 0.9092464549396461,
|
|
"train_speed(iter/s)": 0.117899
|
|
},
|
|
{
|
|
"epoch": 2.1830607386930816,
|
|
"grad_norm": 0.6798021793365479,
|
|
"learning_rate": 1.7199478907630269e-06,
|
|
"loss": 0.25238001346588135,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1775,
|
|
"token_acc": 0.9152910102820488,
|
|
"train_speed(iter/s)": 0.117983
|
|
},
|
|
{
|
|
"epoch": 2.189203716501574,
|
|
"grad_norm": 0.7590020895004272,
|
|
"learning_rate": 1.6957118314632825e-06,
|
|
"loss": 0.26000936031341554,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1780,
|
|
"token_acc": 0.9114106063560148,
|
|
"train_speed(iter/s)": 0.118075
|
|
},
|
|
{
|
|
"epoch": 2.189203716501574,
|
|
"eval_loss": 0.3557458817958832,
|
|
"eval_runtime": 31.0956,
|
|
"eval_samples_per_second": 16.916,
|
|
"eval_steps_per_second": 4.245,
|
|
"eval_token_acc": 0.8905248594898401,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.195346694310067,
|
|
"grad_norm": 0.690200924873352,
|
|
"learning_rate": 1.6716128264325477e-06,
|
|
"loss": 0.26896276473999026,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1785,
|
|
"token_acc": 0.8972400913052501,
|
|
"train_speed(iter/s)": 0.117847
|
|
},
|
|
{
|
|
"epoch": 2.2014896721185595,
|
|
"grad_norm": 0.7046708464622498,
|
|
"learning_rate": 1.64765187524209e-06,
|
|
"loss": 0.2622739315032959,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1790,
|
|
"token_acc": 0.9040114613180515,
|
|
"train_speed(iter/s)": 0.117916
|
|
},
|
|
{
|
|
"epoch": 2.207632649927052,
|
|
"grad_norm": 0.6468427181243896,
|
|
"learning_rate": 1.6238299717370254e-06,
|
|
"loss": 0.25573272705078126,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1795,
|
|
"token_acc": 0.913803724588921,
|
|
"train_speed(iter/s)": 0.117988
|
|
},
|
|
{
|
|
"epoch": 2.213775627735545,
|
|
"grad_norm": 0.6906710863113403,
|
|
"learning_rate": 1.6001481039950872e-06,
|
|
"loss": 0.24774715900421143,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1800,
|
|
"token_acc": 0.9198941998866428,
|
|
"train_speed(iter/s)": 0.118059
|
|
},
|
|
{
|
|
"epoch": 2.213775627735545,
|
|
"eval_loss": 0.3556331396102905,
|
|
"eval_runtime": 31.0281,
|
|
"eval_samples_per_second": 16.952,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8904798962386511,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.2199186055440374,
|
|
"grad_norm": 0.67650306224823,
|
|
"learning_rate": 1.5766072542856525e-06,
|
|
"loss": 0.2552159070968628,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1805,
|
|
"token_acc": 0.8967117243311388,
|
|
"train_speed(iter/s)": 0.117823
|
|
},
|
|
{
|
|
"epoch": 2.2260615833525303,
|
|
"grad_norm": 0.6951079368591309,
|
|
"learning_rate": 1.5532083990289892e-06,
|
|
"loss": 0.25490808486938477,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1810,
|
|
"token_acc": 0.9191604784561341,
|
|
"train_speed(iter/s)": 0.117913
|
|
},
|
|
{
|
|
"epoch": 2.232204561161023,
|
|
"grad_norm": 0.6896148920059204,
|
|
"learning_rate": 1.5299525087557682e-06,
|
|
"loss": 0.2403803586959839,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1815,
|
|
"token_acc": 0.9143227478937136,
|
|
"train_speed(iter/s)": 0.117979
|
|
},
|
|
{
|
|
"epoch": 2.2383475389695153,
|
|
"grad_norm": 0.6858778595924377,
|
|
"learning_rate": 1.5068405480667975e-06,
|
|
"loss": 0.2647264003753662,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1820,
|
|
"token_acc": 0.9243344548061508,
|
|
"train_speed(iter/s)": 0.118051
|
|
},
|
|
{
|
|
"epoch": 2.2383475389695153,
|
|
"eval_loss": 0.35514572262763977,
|
|
"eval_runtime": 31.0442,
|
|
"eval_samples_per_second": 16.944,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8905214007782101,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.2444905167780083,
|
|
"grad_norm": 0.7021420001983643,
|
|
"learning_rate": 1.4838734755930168e-06,
|
|
"loss": 0.2488544464111328,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1825,
|
|
"token_acc": 0.8990035802096363,
|
|
"train_speed(iter/s)": 0.117822
|
|
},
|
|
{
|
|
"epoch": 2.2506334945865007,
|
|
"grad_norm": 0.7138723134994507,
|
|
"learning_rate": 1.461052243955739e-06,
|
|
"loss": 0.2516676902770996,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1830,
|
|
"token_acc": 0.9070056092612484,
|
|
"train_speed(iter/s)": 0.117885
|
|
},
|
|
{
|
|
"epoch": 2.2567764723949937,
|
|
"grad_norm": 0.6612991094589233,
|
|
"learning_rate": 1.4383777997271347e-06,
|
|
"loss": 0.25036053657531737,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1835,
|
|
"token_acc": 0.9232339162298808,
|
|
"train_speed(iter/s)": 0.11797
|
|
},
|
|
{
|
|
"epoch": 2.262919450203486,
|
|
"grad_norm": 0.670829176902771,
|
|
"learning_rate": 1.4158510833909688e-06,
|
|
"loss": 0.26495842933654784,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1840,
|
|
"token_acc": 0.9127685871838752,
|
|
"train_speed(iter/s)": 0.118042
|
|
},
|
|
{
|
|
"epoch": 2.262919450203486,
|
|
"eval_loss": 0.35496076941490173,
|
|
"eval_runtime": 31.0316,
|
|
"eval_samples_per_second": 16.95,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.890611327280588,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.2690624280119787,
|
|
"grad_norm": 0.6969290971755981,
|
|
"learning_rate": 1.3934730293035935e-06,
|
|
"loss": 0.2619413614273071,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1845,
|
|
"token_acc": 0.8992944915071285,
|
|
"train_speed(iter/s)": 0.117838
|
|
},
|
|
{
|
|
"epoch": 2.2752054058204716,
|
|
"grad_norm": 0.697259247303009,
|
|
"learning_rate": 1.3712445656551904e-06,
|
|
"loss": 0.26856374740600586,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1850,
|
|
"token_acc": 0.9039304347826087,
|
|
"train_speed(iter/s)": 0.117916
|
|
},
|
|
{
|
|
"epoch": 2.281348383628964,
|
|
"grad_norm": 0.7025954127311707,
|
|
"learning_rate": 1.349166614431282e-06,
|
|
"loss": 0.2570216655731201,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1855,
|
|
"token_acc": 0.9162639337494233,
|
|
"train_speed(iter/s)": 0.117981
|
|
},
|
|
{
|
|
"epoch": 2.287491361437457,
|
|
"grad_norm": 0.6871860027313232,
|
|
"learning_rate": 1.3272400913744744e-06,
|
|
"loss": 0.262271785736084,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1860,
|
|
"token_acc": 0.9138437528688148,
|
|
"train_speed(iter/s)": 0.118061
|
|
},
|
|
{
|
|
"epoch": 2.287491361437457,
|
|
"eval_loss": 0.35484763979911804,
|
|
"eval_runtime": 31.0168,
|
|
"eval_samples_per_second": 16.959,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8907669693039343,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.2936343392459495,
|
|
"grad_norm": 0.6811879873275757,
|
|
"learning_rate": 1.3054659059464836e-06,
|
|
"loss": 0.2392117500305176,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1865,
|
|
"token_acc": 0.901497755975368,
|
|
"train_speed(iter/s)": 0.117825
|
|
},
|
|
{
|
|
"epoch": 2.299777317054442,
|
|
"grad_norm": 0.7064546346664429,
|
|
"learning_rate": 1.2838449612904108e-06,
|
|
"loss": 0.266256046295166,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1870,
|
|
"token_acc": 0.9117101026954622,
|
|
"train_speed(iter/s)": 0.117915
|
|
},
|
|
{
|
|
"epoch": 2.305920294862935,
|
|
"grad_norm": 0.7244398593902588,
|
|
"learning_rate": 1.262378154193285e-06,
|
|
"loss": 0.23866605758666992,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1875,
|
|
"token_acc": 0.915842304335176,
|
|
"train_speed(iter/s)": 0.117981
|
|
},
|
|
{
|
|
"epoch": 2.3120632726714274,
|
|
"grad_norm": 0.7136631608009338,
|
|
"learning_rate": 1.2410663750488644e-06,
|
|
"loss": 0.25197710990905764,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1880,
|
|
"token_acc": 0.9191310820870271,
|
|
"train_speed(iter/s)": 0.118043
|
|
},
|
|
{
|
|
"epoch": 2.3120632726714274,
|
|
"eval_loss": 0.355129599571228,
|
|
"eval_runtime": 31.0906,
|
|
"eval_samples_per_second": 16.918,
|
|
"eval_steps_per_second": 4.246,
|
|
"eval_token_acc": 0.8907254647643753,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.3182062504799203,
|
|
"grad_norm": 0.6782585978507996,
|
|
"learning_rate": 1.2199105078207002e-06,
|
|
"loss": 0.2743240833282471,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1885,
|
|
"token_acc": 0.8939592652104051,
|
|
"train_speed(iter/s)": 0.117803
|
|
},
|
|
{
|
|
"epoch": 2.324349228288413,
|
|
"grad_norm": 0.6339967846870422,
|
|
"learning_rate": 1.1989114300054782e-06,
|
|
"loss": 0.25202603340148927,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1890,
|
|
"token_acc": 0.916531565897387,
|
|
"train_speed(iter/s)": 0.117882
|
|
},
|
|
{
|
|
"epoch": 2.3304922060969053,
|
|
"grad_norm": 0.6756547689437866,
|
|
"learning_rate": 1.1780700125966232e-06,
|
|
"loss": 0.2598109722137451,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1895,
|
|
"token_acc": 0.9081785893065719,
|
|
"train_speed(iter/s)": 0.117946
|
|
},
|
|
{
|
|
"epoch": 2.3366351839053983,
|
|
"grad_norm": 0.7056384086608887,
|
|
"learning_rate": 1.1573871200481634e-06,
|
|
"loss": 0.2566692352294922,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1900,
|
|
"token_acc": 0.9156997782187464,
|
|
"train_speed(iter/s)": 0.118011
|
|
},
|
|
{
|
|
"epoch": 2.3366351839053983,
|
|
"eval_loss": 0.35557088255882263,
|
|
"eval_runtime": 31.0333,
|
|
"eval_samples_per_second": 16.95,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8906389969736274,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.3427781617138907,
|
|
"grad_norm": 0.7157571911811829,
|
|
"learning_rate": 1.136863610238887e-06,
|
|
"loss": 0.25399596691131593,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1905,
|
|
"token_acc": 0.8955967995576062,
|
|
"train_speed(iter/s)": 0.117798
|
|
},
|
|
{
|
|
"epoch": 2.3489211395223837,
|
|
"grad_norm": 0.6849676370620728,
|
|
"learning_rate": 1.1165003344367465e-06,
|
|
"loss": 0.2500483512878418,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1910,
|
|
"token_acc": 0.9112139701241321,
|
|
"train_speed(iter/s)": 0.11788
|
|
},
|
|
{
|
|
"epoch": 2.355064117330876,
|
|
"grad_norm": 0.6843670010566711,
|
|
"learning_rate": 1.0962981372635629e-06,
|
|
"loss": 0.24124569892883302,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1915,
|
|
"token_acc": 0.9228162034548048,
|
|
"train_speed(iter/s)": 0.117963
|
|
},
|
|
{
|
|
"epoch": 2.3612070951393687,
|
|
"grad_norm": 0.6974015235900879,
|
|
"learning_rate": 1.0762578566599818e-06,
|
|
"loss": 0.24528083801269532,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1920,
|
|
"token_acc": 0.9175750441436139,
|
|
"train_speed(iter/s)": 0.118052
|
|
},
|
|
{
|
|
"epoch": 2.3612070951393687,
|
|
"eval_loss": 0.3551888167858124,
|
|
"eval_runtime": 30.9708,
|
|
"eval_samples_per_second": 16.984,
|
|
"eval_steps_per_second": 4.262,
|
|
"eval_token_acc": 0.8907427583225248,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.3673500729478616,
|
|
"grad_norm": 0.6731058359146118,
|
|
"learning_rate": 1.056380323850722e-06,
|
|
"loss": 0.24767663478851318,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1925,
|
|
"token_acc": 0.90198810396806,
|
|
"train_speed(iter/s)": 0.117814
|
|
},
|
|
{
|
|
"epoch": 2.373493050756354,
|
|
"grad_norm": 0.6461980938911438,
|
|
"learning_rate": 1.0366663633101015e-06,
|
|
"loss": 0.2535504102706909,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1930,
|
|
"token_acc": 0.9234430094966145,
|
|
"train_speed(iter/s)": 0.117879
|
|
},
|
|
{
|
|
"epoch": 2.379636028564847,
|
|
"grad_norm": 0.6973277926445007,
|
|
"learning_rate": 1.0171167927278369e-06,
|
|
"loss": 0.25800695419311526,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1935,
|
|
"token_acc": 0.9152892113208366,
|
|
"train_speed(iter/s)": 0.117936
|
|
},
|
|
{
|
|
"epoch": 2.3857790063733395,
|
|
"grad_norm": 0.6010280847549438,
|
|
"learning_rate": 9.977324229751245e-07,
|
|
"loss": 0.2460566520690918,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1940,
|
|
"token_acc": 0.9177397229965928,
|
|
"train_speed(iter/s)": 0.117997
|
|
},
|
|
{
|
|
"epoch": 2.3857790063733395,
|
|
"eval_loss": 0.35497036576271057,
|
|
"eval_runtime": 31.0056,
|
|
"eval_samples_per_second": 16.965,
|
|
"eval_steps_per_second": 4.257,
|
|
"eval_token_acc": 0.8907565931690445,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.391921984181832,
|
|
"grad_norm": 0.7224907875061035,
|
|
"learning_rate": 9.785140580710106e-07,
|
|
"loss": 0.24542105197906494,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1945,
|
|
"token_acc": 0.899276675757627,
|
|
"train_speed(iter/s)": 0.117779
|
|
},
|
|
{
|
|
"epoch": 2.398064961990325,
|
|
"grad_norm": 0.6951374411582947,
|
|
"learning_rate": 9.594624951490455e-07,
|
|
"loss": 0.2523444652557373,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1950,
|
|
"token_acc": 0.9187413638457249,
|
|
"train_speed(iter/s)": 0.11785
|
|
},
|
|
{
|
|
"epoch": 2.4042079397988174,
|
|
"grad_norm": 0.708865761756897,
|
|
"learning_rate": 9.405785244242166e-07,
|
|
"loss": 0.2396538734436035,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1955,
|
|
"token_acc": 0.9178324813918034,
|
|
"train_speed(iter/s)": 0.117923
|
|
},
|
|
{
|
|
"epoch": 2.4103509176073104,
|
|
"grad_norm": 0.6320639848709106,
|
|
"learning_rate": 9.218629291601699e-07,
|
|
"loss": 0.23296713829040527,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1960,
|
|
"token_acc": 0.9257631364964948,
|
|
"train_speed(iter/s)": 0.117998
|
|
},
|
|
{
|
|
"epoch": 2.4103509176073104,
|
|
"eval_loss": 0.3550316095352173,
|
|
"eval_runtime": 31.0148,
|
|
"eval_samples_per_second": 16.96,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8910021616947686,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.416493895415803,
|
|
"grad_norm": 0.6433020234107971,
|
|
"learning_rate": 9.033164856367271e-07,
|
|
"loss": 0.24781334400177002,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1965,
|
|
"token_acc": 0.8978867315004879,
|
|
"train_speed(iter/s)": 0.117779
|
|
},
|
|
{
|
|
"epoch": 2.4226368732242953,
|
|
"grad_norm": 0.7556272745132446,
|
|
"learning_rate": 8.849399631176825e-07,
|
|
"loss": 0.261240553855896,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1970,
|
|
"token_acc": 0.9180517884878411,
|
|
"train_speed(iter/s)": 0.117846
|
|
},
|
|
{
|
|
"epoch": 2.4287798510327883,
|
|
"grad_norm": 0.6567925214767456,
|
|
"learning_rate": 8.667341238189009e-07,
|
|
"loss": 0.24376273155212402,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1975,
|
|
"token_acc": 0.9204566085693536,
|
|
"train_speed(iter/s)": 0.117908
|
|
},
|
|
{
|
|
"epoch": 2.4349228288412808,
|
|
"grad_norm": 0.6730430722236633,
|
|
"learning_rate": 8.486997228767013e-07,
|
|
"loss": 0.26009833812713623,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1980,
|
|
"token_acc": 0.9134192822777164,
|
|
"train_speed(iter/s)": 0.117978
|
|
},
|
|
{
|
|
"epoch": 2.4349228288412808,
|
|
"eval_loss": 0.3539762794971466,
|
|
"eval_runtime": 31.0521,
|
|
"eval_samples_per_second": 16.939,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8909018590575011,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.4410658066497737,
|
|
"grad_norm": 0.7423481941223145,
|
|
"learning_rate": 8.308375083165299e-07,
|
|
"loss": 0.24584083557128905,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1985,
|
|
"token_acc": 0.8992385337347264,
|
|
"train_speed(iter/s)": 0.117767
|
|
},
|
|
{
|
|
"epoch": 2.447208784458266,
|
|
"grad_norm": 0.6721974015235901,
|
|
"learning_rate": 8.131482210219383e-07,
|
|
"loss": 0.251566219329834,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1990,
|
|
"token_acc": 0.9197381858694643,
|
|
"train_speed(iter/s)": 0.117832
|
|
},
|
|
{
|
|
"epoch": 2.4533517622667587,
|
|
"grad_norm": 0.6605408787727356,
|
|
"learning_rate": 7.956325947038585e-07,
|
|
"loss": 0.2555187702178955,
|
|
"memory(GiB)": 41.58,
|
|
"step": 1995,
|
|
"token_acc": 0.9162280042111596,
|
|
"train_speed(iter/s)": 0.117901
|
|
},
|
|
{
|
|
"epoch": 2.4594947400752516,
|
|
"grad_norm": 0.647904098033905,
|
|
"learning_rate": 7.782913558701572e-07,
|
|
"loss": 0.2506421089172363,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2000,
|
|
"token_acc": 0.9203814955936324,
|
|
"train_speed(iter/s)": 0.117965
|
|
},
|
|
{
|
|
"epoch": 2.4594947400752516,
|
|
"eval_loss": 0.3547162115573883,
|
|
"eval_runtime": 31.1783,
|
|
"eval_samples_per_second": 16.871,
|
|
"eval_steps_per_second": 4.234,
|
|
"eval_token_acc": 0.8908811067877216,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.465637717883744,
|
|
"grad_norm": 0.7217480540275574,
|
|
"learning_rate": 7.611252237955168e-07,
|
|
"loss": 0.24761755466461183,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2005,
|
|
"token_acc": 0.8972576188708813,
|
|
"train_speed(iter/s)": 0.117755
|
|
},
|
|
{
|
|
"epoch": 2.471780695692237,
|
|
"grad_norm": 0.6904724836349487,
|
|
"learning_rate": 7.44134910491589e-07,
|
|
"loss": 0.2681485414505005,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2010,
|
|
"token_acc": 0.9053991693585602,
|
|
"train_speed(iter/s)": 0.117834
|
|
},
|
|
{
|
|
"epoch": 2.4779236735007295,
|
|
"grad_norm": 0.6789990663528442,
|
|
"learning_rate": 7.273211206774711e-07,
|
|
"loss": 0.24847228527069093,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2015,
|
|
"token_acc": 0.9193213372105735,
|
|
"train_speed(iter/s)": 0.117908
|
|
},
|
|
{
|
|
"epoch": 2.484066651309222,
|
|
"grad_norm": 0.7324934601783752,
|
|
"learning_rate": 7.106845517504684e-07,
|
|
"loss": 0.24457526206970215,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2020,
|
|
"token_acc": 0.9162846862832077,
|
|
"train_speed(iter/s)": 0.117969
|
|
},
|
|
{
|
|
"epoch": 2.484066651309222,
|
|
"eval_loss": 0.3543083965778351,
|
|
"eval_runtime": 31.0241,
|
|
"eval_samples_per_second": 16.955,
|
|
"eval_steps_per_second": 4.255,
|
|
"eval_token_acc": 0.8908396022481626,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.490209629117715,
|
|
"grad_norm": 0.7012256383895874,
|
|
"learning_rate": 6.942258937571772e-07,
|
|
"loss": 0.25258448123931887,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2025,
|
|
"token_acc": 0.8976666927565725,
|
|
"train_speed(iter/s)": 0.11777
|
|
},
|
|
{
|
|
"epoch": 2.4963526069262074,
|
|
"grad_norm": 0.6754176020622253,
|
|
"learning_rate": 6.779458293648506e-07,
|
|
"loss": 0.2500795841217041,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2030,
|
|
"token_acc": 0.9177111716621253,
|
|
"train_speed(iter/s)": 0.117835
|
|
},
|
|
{
|
|
"epoch": 2.5024955847347004,
|
|
"grad_norm": 0.6942124962806702,
|
|
"learning_rate": 6.618450338330978e-07,
|
|
"loss": 0.245684814453125,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2035,
|
|
"token_acc": 0.9162501585690727,
|
|
"train_speed(iter/s)": 0.117915
|
|
},
|
|
{
|
|
"epoch": 2.508638562543193,
|
|
"grad_norm": 0.6740065813064575,
|
|
"learning_rate": 6.459241749858619e-07,
|
|
"loss": 0.25455806255340574,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2040,
|
|
"token_acc": 0.9220431950634214,
|
|
"train_speed(iter/s)": 0.117979
|
|
},
|
|
{
|
|
"epoch": 2.508638562543193,
|
|
"eval_loss": 0.35373052954673767,
|
|
"eval_runtime": 31.0821,
|
|
"eval_samples_per_second": 16.923,
|
|
"eval_steps_per_second": 4.247,
|
|
"eval_token_acc": 0.8911335927367056,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.5147815403516853,
|
|
"grad_norm": 0.6818024516105652,
|
|
"learning_rate": 6.301839131837284e-07,
|
|
"loss": 0.2483248233795166,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2045,
|
|
"token_acc": 0.9004994038258826,
|
|
"train_speed(iter/s)": 0.117768
|
|
},
|
|
{
|
|
"epoch": 2.5209245181601783,
|
|
"grad_norm": 0.6766259074211121,
|
|
"learning_rate": 6.146249012965349e-07,
|
|
"loss": 0.25524895191192626,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2050,
|
|
"token_acc": 0.9155308997100655,
|
|
"train_speed(iter/s)": 0.117834
|
|
},
|
|
{
|
|
"epoch": 2.5270674959686708,
|
|
"grad_norm": 0.6721575260162354,
|
|
"learning_rate": 5.992477846762896e-07,
|
|
"loss": 0.2647790193557739,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2055,
|
|
"token_acc": 0.9044405418966383,
|
|
"train_speed(iter/s)": 0.117893
|
|
},
|
|
{
|
|
"epoch": 2.5332104737771637,
|
|
"grad_norm": 0.7143027782440186,
|
|
"learning_rate": 5.840532011303996e-07,
|
|
"loss": 0.2634526491165161,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2060,
|
|
"token_acc": 0.9136083648221958,
|
|
"train_speed(iter/s)": 0.117955
|
|
},
|
|
{
|
|
"epoch": 2.5332104737771637,
|
|
"eval_loss": 0.35337749123573303,
|
|
"eval_runtime": 31.0323,
|
|
"eval_samples_per_second": 16.95,
|
|
"eval_steps_per_second": 4.254,
|
|
"eval_token_acc": 0.8909814094249892,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.539353451585656,
|
|
"grad_norm": 0.6832711100578308,
|
|
"learning_rate": 5.690417808952243e-07,
|
|
"loss": 0.2547764301300049,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2065,
|
|
"token_acc": 0.8971203129214999,
|
|
"train_speed(iter/s)": 0.117757
|
|
},
|
|
{
|
|
"epoch": 2.5454964293941487,
|
|
"grad_norm": 0.7033362984657288,
|
|
"learning_rate": 5.542141466099271e-07,
|
|
"loss": 0.26053800582885744,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2070,
|
|
"token_acc": 0.9055393728734732,
|
|
"train_speed(iter/s)": 0.117841
|
|
},
|
|
{
|
|
"epoch": 2.5516394072026416,
|
|
"grad_norm": 0.7116051912307739,
|
|
"learning_rate": 5.395709132906569e-07,
|
|
"loss": 0.25941154956817625,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2075,
|
|
"token_acc": 0.920958114777396,
|
|
"train_speed(iter/s)": 0.117919
|
|
},
|
|
{
|
|
"epoch": 2.557782385011134,
|
|
"grad_norm": 0.6814519166946411,
|
|
"learning_rate": 5.251126883050333e-07,
|
|
"loss": 0.26160635948181155,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2080,
|
|
"token_acc": 0.912257738587306,
|
|
"train_speed(iter/s)": 0.117989
|
|
},
|
|
{
|
|
"epoch": 2.557782385011134,
|
|
"eval_loss": 0.3543572723865509,
|
|
"eval_runtime": 31.0115,
|
|
"eval_samples_per_second": 16.961,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8910229139645482,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.563925362819627,
|
|
"grad_norm": 0.7511703372001648,
|
|
"learning_rate": 5.108400713469547e-07,
|
|
"loss": 0.24686145782470703,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2085,
|
|
"token_acc": 0.8980160383253489,
|
|
"train_speed(iter/s)": 0.117791
|
|
},
|
|
{
|
|
"epoch": 2.5700683406281195,
|
|
"grad_norm": 0.6902100443840027,
|
|
"learning_rate": 4.967536544117263e-07,
|
|
"loss": 0.26129970550537107,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2090,
|
|
"token_acc": 0.9143400153853115,
|
|
"train_speed(iter/s)": 0.117849
|
|
},
|
|
{
|
|
"epoch": 2.576211318436612,
|
|
"grad_norm": 0.759671688079834,
|
|
"learning_rate": 4.828540217715067e-07,
|
|
"loss": 0.27549381256103517,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2095,
|
|
"token_acc": 0.9109081247944131,
|
|
"train_speed(iter/s)": 0.117916
|
|
},
|
|
{
|
|
"epoch": 2.582354296245105,
|
|
"grad_norm": 0.6925843954086304,
|
|
"learning_rate": 4.6914174995106863e-07,
|
|
"loss": 0.25518312454223635,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2100,
|
|
"token_acc": 0.9096630452258998,
|
|
"train_speed(iter/s)": 0.117988
|
|
},
|
|
{
|
|
"epoch": 2.582354296245105,
|
|
"eval_loss": 0.3541419208049774,
|
|
"eval_runtime": 31.047,
|
|
"eval_samples_per_second": 16.942,
|
|
"eval_steps_per_second": 4.252,
|
|
"eval_token_acc": 0.8910021616947686,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.5884972740535974,
|
|
"grad_norm": 0.7308095693588257,
|
|
"learning_rate": 4.556174077038927e-07,
|
|
"loss": 0.2574288845062256,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2105,
|
|
"token_acc": 0.899001034002444,
|
|
"train_speed(iter/s)": 0.11778
|
|
},
|
|
{
|
|
"epoch": 2.5946402518620904,
|
|
"grad_norm": 0.6761147379875183,
|
|
"learning_rate": 4.422815559885696e-07,
|
|
"loss": 0.2425455093383789,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2110,
|
|
"token_acc": 0.9116659922401276,
|
|
"train_speed(iter/s)": 0.117842
|
|
},
|
|
{
|
|
"epoch": 2.600783229670583,
|
|
"grad_norm": 0.697441816329956,
|
|
"learning_rate": 4.2913474794554044e-07,
|
|
"loss": 0.2548621892929077,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2115,
|
|
"token_acc": 0.9114378356971362,
|
|
"train_speed(iter/s)": 0.11791
|
|
},
|
|
{
|
|
"epoch": 2.6069262074790753,
|
|
"grad_norm": 0.667349100112915,
|
|
"learning_rate": 4.161775288741454e-07,
|
|
"loss": 0.252597713470459,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2120,
|
|
"token_acc": 0.9123547788733769,
|
|
"train_speed(iter/s)": 0.117978
|
|
},
|
|
{
|
|
"epoch": 2.6069262074790753,
|
|
"eval_loss": 0.3542228639125824,
|
|
"eval_runtime": 31.0775,
|
|
"eval_samples_per_second": 16.925,
|
|
"eval_steps_per_second": 4.247,
|
|
"eval_token_acc": 0.8909537397319498,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.6130691852875683,
|
|
"grad_norm": 0.7039747834205627,
|
|
"learning_rate": 4.034104362100155e-07,
|
|
"loss": 0.25393052101135255,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2125,
|
|
"token_acc": 0.8992231097494255,
|
|
"train_speed(iter/s)": 0.117764
|
|
},
|
|
{
|
|
"epoch": 2.6192121630960608,
|
|
"grad_norm": 0.7111782431602478,
|
|
"learning_rate": 3.9083399950277156e-07,
|
|
"loss": 0.2592860221862793,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2130,
|
|
"token_acc": 0.9017042520227233,
|
|
"train_speed(iter/s)": 0.117842
|
|
},
|
|
{
|
|
"epoch": 2.6253551409045537,
|
|
"grad_norm": 0.7449079155921936,
|
|
"learning_rate": 3.7844874039406677e-07,
|
|
"loss": 0.23967378139495848,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2135,
|
|
"token_acc": 0.9237554343728797,
|
|
"train_speed(iter/s)": 0.11791
|
|
},
|
|
{
|
|
"epoch": 2.631498118713046,
|
|
"grad_norm": 0.6821849346160889,
|
|
"learning_rate": 3.6625517259594566e-07,
|
|
"loss": 0.273772144317627,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2140,
|
|
"token_acc": 0.9114792099290095,
|
|
"train_speed(iter/s)": 0.117984
|
|
},
|
|
{
|
|
"epoch": 2.631498118713046,
|
|
"eval_loss": 0.3543878495693207,
|
|
"eval_runtime": 31.0747,
|
|
"eval_samples_per_second": 16.927,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8910263726761781,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.6376410965215387,
|
|
"grad_norm": 0.7271039485931396,
|
|
"learning_rate": 3.5425380186953905e-07,
|
|
"loss": 0.2533170223236084,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2145,
|
|
"token_acc": 0.8992799581191373,
|
|
"train_speed(iter/s)": 0.117788
|
|
},
|
|
{
|
|
"epoch": 2.6437840743300316,
|
|
"grad_norm": 0.6954792737960815,
|
|
"learning_rate": 3.424451260040862e-07,
|
|
"loss": 0.2587547302246094,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2150,
|
|
"token_acc": 0.9252017450665703,
|
|
"train_speed(iter/s)": 0.117868
|
|
},
|
|
{
|
|
"epoch": 2.649927052138524,
|
|
"grad_norm": 0.6999133229255676,
|
|
"learning_rate": 3.3082963479628747e-07,
|
|
"loss": 0.2520002841949463,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2155,
|
|
"token_acc": 0.9169615355242726,
|
|
"train_speed(iter/s)": 0.117941
|
|
},
|
|
{
|
|
"epoch": 2.656070029947017,
|
|
"grad_norm": 0.6630998253822327,
|
|
"learning_rate": 3.194078100299863e-07,
|
|
"loss": 0.2589444160461426,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2160,
|
|
"token_acc": 0.9155829021582063,
|
|
"train_speed(iter/s)": 0.118006
|
|
},
|
|
{
|
|
"epoch": 2.656070029947017,
|
|
"eval_loss": 0.3538263440132141,
|
|
"eval_runtime": 31.0691,
|
|
"eval_samples_per_second": 16.93,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8909779507133593,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.6622130077555095,
|
|
"grad_norm": 0.6728103756904602,
|
|
"learning_rate": 3.0818012545618836e-07,
|
|
"loss": 0.243510103225708,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2165,
|
|
"token_acc": 0.89773630732402,
|
|
"train_speed(iter/s)": 0.117802
|
|
},
|
|
{
|
|
"epoch": 2.668355985564002,
|
|
"grad_norm": 0.6952410936355591,
|
|
"learning_rate": 2.9714704677341055e-07,
|
|
"loss": 0.2590247631072998,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2170,
|
|
"token_acc": 0.9167405790179891,
|
|
"train_speed(iter/s)": 0.117866
|
|
},
|
|
{
|
|
"epoch": 2.674498963372495,
|
|
"grad_norm": 0.6924260258674622,
|
|
"learning_rate": 2.8630903160836776e-07,
|
|
"loss": 0.25694501399993896,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2175,
|
|
"token_acc": 0.9082922132627271,
|
|
"train_speed(iter/s)": 0.11794
|
|
},
|
|
{
|
|
"epoch": 2.6806419411809874,
|
|
"grad_norm": 0.6898376941680908,
|
|
"learning_rate": 2.756665294969868e-07,
|
|
"loss": 0.2537565231323242,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2180,
|
|
"token_acc": 0.917653237630479,
|
|
"train_speed(iter/s)": 0.118015
|
|
},
|
|
{
|
|
"epoch": 2.6806419411809874,
|
|
"eval_loss": 0.35428422689437866,
|
|
"eval_runtime": 31.0905,
|
|
"eval_samples_per_second": 16.918,
|
|
"eval_steps_per_second": 4.246,
|
|
"eval_token_acc": 0.8910713359273671,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.6867849189894804,
|
|
"grad_norm": 0.6692034602165222,
|
|
"learning_rate": 2.6521998186576357e-07,
|
|
"loss": 0.24578571319580078,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2185,
|
|
"token_acc": 0.9007592006264257,
|
|
"train_speed(iter/s)": 0.117803
|
|
},
|
|
{
|
|
"epoch": 2.692927896797973,
|
|
"grad_norm": 0.6597223877906799,
|
|
"learning_rate": 2.549698220134517e-07,
|
|
"loss": 0.2445077896118164,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2190,
|
|
"token_acc": 0.921655840125781,
|
|
"train_speed(iter/s)": 0.117862
|
|
},
|
|
{
|
|
"epoch": 2.6990708746064653,
|
|
"grad_norm": 0.7004697322845459,
|
|
"learning_rate": 2.449164750930938e-07,
|
|
"loss": 0.24747202396392823,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2195,
|
|
"token_acc": 0.9170990796945369,
|
|
"train_speed(iter/s)": 0.117919
|
|
},
|
|
{
|
|
"epoch": 2.7052138524149583,
|
|
"grad_norm": 0.6603142619132996,
|
|
"learning_rate": 2.3506035809438553e-07,
|
|
"loss": 0.25233500003814696,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2200,
|
|
"token_acc": 0.9180474800634293,
|
|
"train_speed(iter/s)": 0.117989
|
|
},
|
|
{
|
|
"epoch": 2.7052138524149583,
|
|
"eval_loss": 0.35384565591812134,
|
|
"eval_runtime": 31.0643,
|
|
"eval_samples_per_second": 16.933,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8911958495460441,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.7113568302234508,
|
|
"grad_norm": 0.6453321576118469,
|
|
"learning_rate": 2.2540187982637628e-07,
|
|
"loss": 0.2474754571914673,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2205,
|
|
"token_acc": 0.8990364613669268,
|
|
"train_speed(iter/s)": 0.117783
|
|
},
|
|
{
|
|
"epoch": 2.7174998080319437,
|
|
"grad_norm": 0.6942773461341858,
|
|
"learning_rate": 2.1594144090051728e-07,
|
|
"loss": 0.25811138153076174,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2210,
|
|
"token_acc": 0.9148966602302796,
|
|
"train_speed(iter/s)": 0.117842
|
|
},
|
|
{
|
|
"epoch": 2.723642785840436,
|
|
"grad_norm": 0.687302827835083,
|
|
"learning_rate": 2.066794337140443e-07,
|
|
"loss": 0.25774784088134767,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2215,
|
|
"token_acc": 0.9122162054746883,
|
|
"train_speed(iter/s)": 0.117899
|
|
},
|
|
{
|
|
"epoch": 2.7297857636489287,
|
|
"grad_norm": 0.735578715801239,
|
|
"learning_rate": 1.9761624243370026e-07,
|
|
"loss": 0.26178154945373533,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2220,
|
|
"token_acc": 0.9086515587830224,
|
|
"train_speed(iter/s)": 0.117952
|
|
},
|
|
{
|
|
"epoch": 2.7297857636489287,
|
|
"eval_loss": 0.35361814498901367,
|
|
"eval_runtime": 31.0398,
|
|
"eval_samples_per_second": 16.946,
|
|
"eval_steps_per_second": 4.253,
|
|
"eval_token_acc": 0.8910920881971466,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.7359287414574216,
|
|
"grad_norm": 0.6561589241027832,
|
|
"learning_rate": 1.8875224297980332e-07,
|
|
"loss": 0.25756092071533204,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2225,
|
|
"token_acc": 0.8958707817534339,
|
|
"train_speed(iter/s)": 0.117769
|
|
},
|
|
{
|
|
"epoch": 2.742071719265914,
|
|
"grad_norm": 0.6671420335769653,
|
|
"learning_rate": 1.800878030106501e-07,
|
|
"loss": 0.24125266075134277,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2230,
|
|
"token_acc": 0.9233396163654507,
|
|
"train_speed(iter/s)": 0.117827
|
|
},
|
|
{
|
|
"epoch": 2.748214697074407,
|
|
"grad_norm": 0.7091180086135864,
|
|
"learning_rate": 1.7162328190727217e-07,
|
|
"loss": 0.25800223350524903,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2235,
|
|
"token_acc": 0.9130938866210961,
|
|
"train_speed(iter/s)": 0.117897
|
|
},
|
|
{
|
|
"epoch": 2.7543576748828995,
|
|
"grad_norm": 0.7402175068855286,
|
|
"learning_rate": 1.6335903075852478e-07,
|
|
"loss": 0.2690894365310669,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2240,
|
|
"token_acc": 0.9129178605539637,
|
|
"train_speed(iter/s)": 0.117956
|
|
},
|
|
{
|
|
"epoch": 2.7543576748828995,
|
|
"eval_loss": 0.35380449891090393,
|
|
"eval_runtime": 31.058,
|
|
"eval_samples_per_second": 16.936,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.891244271508863,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.760500652691392,
|
|
"grad_norm": 0.700340211391449,
|
|
"learning_rate": 1.552953923465267e-07,
|
|
"loss": 0.26177315711975097,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2245,
|
|
"token_acc": 0.8943636286526147,
|
|
"train_speed(iter/s)": 0.11777
|
|
},
|
|
{
|
|
"epoch": 2.766643630499885,
|
|
"grad_norm": 0.6342586278915405,
|
|
"learning_rate": 1.4743270113244278e-07,
|
|
"loss": 0.23961200714111328,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2250,
|
|
"token_acc": 0.9237576735224269,
|
|
"train_speed(iter/s)": 0.117824
|
|
},
|
|
{
|
|
"epoch": 2.7727866083083774,
|
|
"grad_norm": 0.627129077911377,
|
|
"learning_rate": 1.3977128324261068e-07,
|
|
"loss": 0.24526638984680177,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2255,
|
|
"token_acc": 0.9125838004176283,
|
|
"train_speed(iter/s)": 0.117896
|
|
},
|
|
{
|
|
"epoch": 2.7789295861168704,
|
|
"grad_norm": 0.6337400674819946,
|
|
"learning_rate": 1.3231145645501153e-07,
|
|
"loss": 0.2480980396270752,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2260,
|
|
"token_acc": 0.9186616671473897,
|
|
"train_speed(iter/s)": 0.117951
|
|
},
|
|
{
|
|
"epoch": 2.7789295861168704,
|
|
"eval_loss": 0.354061484336853,
|
|
"eval_runtime": 31.0534,
|
|
"eval_samples_per_second": 16.939,
|
|
"eval_steps_per_second": 4.251,
|
|
"eval_token_acc": 0.8911024643320363,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.785072563925363,
|
|
"grad_norm": 0.712088942527771,
|
|
"learning_rate": 1.2505353018609445e-07,
|
|
"loss": 0.2516076326370239,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2265,
|
|
"token_acc": 0.8994283331306145,
|
|
"train_speed(iter/s)": 0.117768
|
|
},
|
|
{
|
|
"epoch": 2.7912155417338553,
|
|
"grad_norm": 0.7046364545822144,
|
|
"learning_rate": 1.1799780547793682e-07,
|
|
"loss": 0.25043492317199706,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2270,
|
|
"token_acc": 0.9169777512318948,
|
|
"train_speed(iter/s)": 0.117833
|
|
},
|
|
{
|
|
"epoch": 2.7973585195423483,
|
|
"grad_norm": 0.6503071784973145,
|
|
"learning_rate": 1.111445749857626e-07,
|
|
"loss": 0.2525207757949829,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2275,
|
|
"token_acc": 0.9089755560343795,
|
|
"train_speed(iter/s)": 0.117899
|
|
},
|
|
{
|
|
"epoch": 2.8035014973508408,
|
|
"grad_norm": 0.7683473229408264,
|
|
"learning_rate": 1.0449412296580252e-07,
|
|
"loss": 0.2637613534927368,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2280,
|
|
"token_acc": 0.9091683159202835,
|
|
"train_speed(iter/s)": 0.117958
|
|
},
|
|
{
|
|
"epoch": 2.8035014973508408,
|
|
"eval_loss": 0.35387495160102844,
|
|
"eval_runtime": 31.012,
|
|
"eval_samples_per_second": 16.961,
|
|
"eval_steps_per_second": 4.256,
|
|
"eval_token_acc": 0.8911405101599654,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.8096444751593337,
|
|
"grad_norm": 0.7151490449905396,
|
|
"learning_rate": 9.804672526349979e-08,
|
|
"loss": 0.2488321304321289,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2285,
|
|
"token_acc": 0.8973117200307805,
|
|
"train_speed(iter/s)": 0.117778
|
|
},
|
|
{
|
|
"epoch": 2.815787452967826,
|
|
"grad_norm": 0.730139434337616,
|
|
"learning_rate": 9.180264930207405e-08,
|
|
"loss": 0.2607487678527832,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2290,
|
|
"token_acc": 0.9156902926894462,
|
|
"train_speed(iter/s)": 0.11785
|
|
},
|
|
{
|
|
"epoch": 2.8219304307763187,
|
|
"grad_norm": 0.6628730297088623,
|
|
"learning_rate": 8.576215407142652e-08,
|
|
"loss": 0.26926565170288086,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2295,
|
|
"token_acc": 0.9116337769619092,
|
|
"train_speed(iter/s)": 0.11791
|
|
},
|
|
{
|
|
"epoch": 2.8280734085848116,
|
|
"grad_norm": 0.6601608991622925,
|
|
"learning_rate": 7.992549011739903e-08,
|
|
"loss": 0.2524131774902344,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2300,
|
|
"token_acc": 0.9154497235075048,
|
|
"train_speed(iter/s)": 0.117965
|
|
},
|
|
{
|
|
"epoch": 2.8280734085848116,
|
|
"eval_loss": 0.35372012853622437,
|
|
"eval_runtime": 31.0582,
|
|
"eval_samples_per_second": 16.936,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.891157803718115,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.834216386393304,
|
|
"grad_norm": 0.7079156041145325,
|
|
"learning_rate": 7.42928995313802e-08,
|
|
"loss": 0.25153977870941163,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2305,
|
|
"token_acc": 0.8977190549519733,
|
|
"train_speed(iter/s)": 0.117776
|
|
},
|
|
{
|
|
"epoch": 2.840359364201797,
|
|
"grad_norm": 0.707416296005249,
|
|
"learning_rate": 6.886461594026394e-08,
|
|
"loss": 0.24887454509735107,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2310,
|
|
"token_acc": 0.9237657201262054,
|
|
"train_speed(iter/s)": 0.117827
|
|
},
|
|
{
|
|
"epoch": 2.8465023420102895,
|
|
"grad_norm": 0.6941429972648621,
|
|
"learning_rate": 6.364086449676233e-08,
|
|
"loss": 0.2661618947982788,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2315,
|
|
"token_acc": 0.9116836428999401,
|
|
"train_speed(iter/s)": 0.117875
|
|
},
|
|
{
|
|
"epoch": 2.852645319818782,
|
|
"grad_norm": 0.705603301525116,
|
|
"learning_rate": 5.862186187006347e-08,
|
|
"loss": 0.251740837097168,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2320,
|
|
"token_acc": 0.9094119805522429,
|
|
"train_speed(iter/s)": 0.117943
|
|
},
|
|
{
|
|
"epoch": 2.852645319818782,
|
|
"eval_loss": 0.35377010703086853,
|
|
"eval_runtime": 31.0667,
|
|
"eval_samples_per_second": 16.931,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8910367488110679,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.858788297627275,
|
|
"grad_norm": 0.6828641891479492,
|
|
"learning_rate": 5.3807816236846614e-08,
|
|
"loss": 0.26838877201080324,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2325,
|
|
"token_acc": 0.8946421677020814,
|
|
"train_speed(iter/s)": 0.117759
|
|
},
|
|
{
|
|
"epoch": 2.8649312754357674,
|
|
"grad_norm": 0.6699286699295044,
|
|
"learning_rate": 4.919892727264508e-08,
|
|
"loss": 0.2658334493637085,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2330,
|
|
"token_acc": 0.9121956642579211,
|
|
"train_speed(iter/s)": 0.117813
|
|
},
|
|
{
|
|
"epoch": 2.8710742532442604,
|
|
"grad_norm": 0.6932682394981384,
|
|
"learning_rate": 4.4795386143567375e-08,
|
|
"loss": 0.24600727558135987,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2335,
|
|
"token_acc": 0.918826454010682,
|
|
"train_speed(iter/s)": 0.117881
|
|
},
|
|
{
|
|
"epoch": 2.877217231052753,
|
|
"grad_norm": 0.6962621212005615,
|
|
"learning_rate": 4.0597375498365175e-08,
|
|
"loss": 0.2586866617202759,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2340,
|
|
"token_acc": 0.9217780343483908,
|
|
"train_speed(iter/s)": 0.117944
|
|
},
|
|
{
|
|
"epoch": 2.877217231052753,
|
|
"eval_loss": 0.35374194383621216,
|
|
"eval_runtime": 31.066,
|
|
"eval_samples_per_second": 16.932,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8911439688715953,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.8833602088612453,
|
|
"grad_norm": 0.6817741990089417,
|
|
"learning_rate": 3.6605069460858286e-08,
|
|
"loss": 0.2390669822692871,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2345,
|
|
"token_acc": 0.9005924037018727,
|
|
"train_speed(iter/s)": 0.117765
|
|
},
|
|
{
|
|
"epoch": 2.8895031866697383,
|
|
"grad_norm": 0.6809601783752441,
|
|
"learning_rate": 3.281863362271487e-08,
|
|
"loss": 0.24726104736328125,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2350,
|
|
"token_acc": 0.9243779025438414,
|
|
"train_speed(iter/s)": 0.117823
|
|
},
|
|
{
|
|
"epoch": 2.8956461644782308,
|
|
"grad_norm": 0.6868336200714111,
|
|
"learning_rate": 2.9238225036579693e-08,
|
|
"loss": 0.2603924036026001,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2355,
|
|
"token_acc": 0.9140520341253614,
|
|
"train_speed(iter/s)": 0.117884
|
|
},
|
|
{
|
|
"epoch": 2.9017891422867237,
|
|
"grad_norm": 0.6945005655288696,
|
|
"learning_rate": 2.5863992209560484e-08,
|
|
"loss": 0.2470933675765991,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2360,
|
|
"token_acc": 0.9241285200347351,
|
|
"train_speed(iter/s)": 0.117951
|
|
},
|
|
{
|
|
"epoch": 2.9017891422867237,
|
|
"eval_loss": 0.353762149810791,
|
|
"eval_runtime": 31.0637,
|
|
"eval_samples_per_second": 16.933,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.891199308257674,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.907932120095216,
|
|
"grad_norm": 0.6887286305427551,
|
|
"learning_rate": 2.269607509707006e-08,
|
|
"loss": 0.2686716318130493,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2365,
|
|
"token_acc": 0.8963972388465724,
|
|
"train_speed(iter/s)": 0.117755
|
|
},
|
|
{
|
|
"epoch": 2.9140750979037087,
|
|
"grad_norm": 0.6807404160499573,
|
|
"learning_rate": 1.97346050970193e-08,
|
|
"loss": 0.25454580783843994,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2370,
|
|
"token_acc": 0.9134818448123169,
|
|
"train_speed(iter/s)": 0.11783
|
|
},
|
|
{
|
|
"epoch": 2.9202180757122016,
|
|
"grad_norm": 0.6732537150382996,
|
|
"learning_rate": 1.69797050443693e-08,
|
|
"loss": 0.251677131652832,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2375,
|
|
"token_acc": 0.9128050937389459,
|
|
"train_speed(iter/s)": 0.117893
|
|
},
|
|
{
|
|
"epoch": 2.926361053520694,
|
|
"grad_norm": 0.701576292514801,
|
|
"learning_rate": 1.4431489206034321e-08,
|
|
"loss": 0.26529679298400877,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2380,
|
|
"token_acc": 0.915328677370581,
|
|
"train_speed(iter/s)": 0.117951
|
|
},
|
|
{
|
|
"epoch": 2.926361053520694,
|
|
"eval_loss": 0.3537040054798126,
|
|
"eval_runtime": 31.0696,
|
|
"eval_samples_per_second": 16.93,
|
|
"eval_steps_per_second": 4.249,
|
|
"eval_token_acc": 0.8912581063553826,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.932504031329187,
|
|
"grad_norm": 0.6693256497383118,
|
|
"learning_rate": 1.2090063276142261e-08,
|
|
"loss": 0.2500641107559204,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2385,
|
|
"token_acc": 0.8987698849300564,
|
|
"train_speed(iter/s)": 0.117778
|
|
},
|
|
{
|
|
"epoch": 2.9386470091376795,
|
|
"grad_norm": 0.726274847984314,
|
|
"learning_rate": 9.955524371653146e-09,
|
|
"loss": 0.2546469926834106,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2390,
|
|
"token_acc": 0.9140067149004587,
|
|
"train_speed(iter/s)": 0.117838
|
|
},
|
|
{
|
|
"epoch": 2.944789986946172,
|
|
"grad_norm": 0.6883347630500793,
|
|
"learning_rate": 8.02796102832848e-09,
|
|
"loss": 0.2519416570663452,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2395,
|
|
"token_acc": 0.9123212139777092,
|
|
"train_speed(iter/s)": 0.117903
|
|
},
|
|
{
|
|
"epoch": 2.950932964754665,
|
|
"grad_norm": 0.7407357692718506,
|
|
"learning_rate": 6.307453197059166e-09,
|
|
"loss": 0.25615706443786623,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2400,
|
|
"token_acc": 0.9138917665630704,
|
|
"train_speed(iter/s)": 0.117957
|
|
},
|
|
{
|
|
"epoch": 2.950932964754665,
|
|
"eval_loss": 0.35370346903800964,
|
|
"eval_runtime": 31.078,
|
|
"eval_samples_per_second": 16.925,
|
|
"eval_steps_per_second": 4.247,
|
|
"eval_token_acc": 0.8911854734111544,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.9570759425631574,
|
|
"grad_norm": 0.6578332781791687,
|
|
"learning_rate": 4.794072240550951e-09,
|
|
"loss": 0.2539684772491455,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2405,
|
|
"token_acc": 0.8979478357573546,
|
|
"train_speed(iter/s)": 0.117778
|
|
},
|
|
{
|
|
"epoch": 2.9632189203716504,
|
|
"grad_norm": 0.6638470888137817,
|
|
"learning_rate": 3.487880930363452e-09,
|
|
"loss": 0.24514734745025635,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2410,
|
|
"token_acc": 0.9181864403032916,
|
|
"train_speed(iter/s)": 0.117844
|
|
},
|
|
{
|
|
"epoch": 2.969361898180143,
|
|
"grad_norm": 0.7209091782569885,
|
|
"learning_rate": 2.3889334443055743e-09,
|
|
"loss": 0.24684855937957764,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2415,
|
|
"token_acc": 0.9140838085792214,
|
|
"train_speed(iter/s)": 0.117913
|
|
},
|
|
{
|
|
"epoch": 2.9755048759886353,
|
|
"grad_norm": 0.651500940322876,
|
|
"learning_rate": 1.4972753641906424e-09,
|
|
"loss": 0.24752352237701417,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2420,
|
|
"token_acc": 0.9205346018801677,
|
|
"train_speed(iter/s)": 0.117962
|
|
},
|
|
{
|
|
"epoch": 2.9755048759886353,
|
|
"eval_loss": 0.3538280427455902,
|
|
"eval_runtime": 31.0603,
|
|
"eval_samples_per_second": 16.935,
|
|
"eval_steps_per_second": 4.25,
|
|
"eval_token_acc": 0.8910817120622568,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 2.9816478537971283,
|
|
"grad_norm": 0.7020614147186279,
|
|
"learning_rate": 8.12943673943467e-10,
|
|
"loss": 0.2728489875793457,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2425,
|
|
"token_acc": 0.8962999446979123,
|
|
"train_speed(iter/s)": 0.117785
|
|
},
|
|
{
|
|
"epoch": 2.9877908316056208,
|
|
"grad_norm": 0.6406486630439758,
|
|
"learning_rate": 3.359667580682402e-10,
|
|
"loss": 0.24820823669433595,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2430,
|
|
"token_acc": 0.9153875671527245,
|
|
"train_speed(iter/s)": 0.11784
|
|
},
|
|
{
|
|
"epoch": 2.9939338094141137,
|
|
"grad_norm": 0.6937646269798279,
|
|
"learning_rate": 6.636440046892123e-11,
|
|
"loss": 0.253904914855957,
|
|
"memory(GiB)": 41.58,
|
|
"step": 2435,
|
|
"token_acc": 0.9180234572177958,
|
|
"train_speed(iter/s)": 0.117894
|
|
},
|
|
{
|
|
"epoch": 2.9988481916609078,
|
|
"eval_loss": 0.3536596894264221,
|
|
"eval_runtime": 31.0744,
|
|
"eval_samples_per_second": 16.927,
|
|
"eval_steps_per_second": 4.248,
|
|
"eval_token_acc": 0.8911750972762645,
|
|
"step": 2439
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2439,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.1644436512416727e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|