Files
qwen2.5vl-3b-caption-cot-7b/trainer_state.json
ModelHub XC 25a68e9759 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-caption-cot-7b
Source: Original Platform
2026-05-22 15:31:13 +08:00

6013 lines
171 KiB
JSON

{
"best_global_step": 1620,
"best_metric": 0.3465479,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v14-20250430-214816/checkpoint-1620",
"epoch": 2.9988481916609078,
"eval_steps": 20,
"global_step": 2439,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012285955616985335,
"grad_norm": 4.7140889167785645,
"learning_rate": 9.99999585221637e-06,
"loss": 0.7454355955123901,
"memory(GiB)": 28.92,
"step": 1,
"token_acc": 0.8220973782771536,
"train_speed(iter/s)": 0.064112
},
{
"epoch": 0.006142977808492667,
"grad_norm": 2.3402233123779297,
"learning_rate": 9.999896305753298e-06,
"loss": 0.6025638580322266,
"memory(GiB)": 28.92,
"step": 5,
"token_acc": 0.8154806964420893,
"train_speed(iter/s)": 0.1223
},
{
"epoch": 0.012285955616985334,
"grad_norm": 1.1033470630645752,
"learning_rate": 9.99958522731419e-06,
"loss": 0.4681520462036133,
"memory(GiB)": 28.96,
"step": 10,
"token_acc": 0.8509727902413654,
"train_speed(iter/s)": 0.134255
},
{
"epoch": 0.018428933425478,
"grad_norm": 1.1683531999588013,
"learning_rate": 9.999066777585496e-06,
"loss": 0.4340578556060791,
"memory(GiB)": 30.5,
"step": 15,
"token_acc": 0.8624967569989859,
"train_speed(iter/s)": 0.141354
},
{
"epoch": 0.024571911233970668,
"grad_norm": 0.8643156290054321,
"learning_rate": 9.998340978071314e-06,
"loss": 0.438944673538208,
"memory(GiB)": 30.5,
"step": 20,
"token_acc": 0.8628914650122352,
"train_speed(iter/s)": 0.145504
},
{
"epoch": 0.024571911233970668,
"eval_loss": 0.43904876708984375,
"eval_runtime": 31.0999,
"eval_samples_per_second": 16.913,
"eval_steps_per_second": 4.244,
"eval_token_acc": 0.8671750972762646,
"step": 20
},
{
"epoch": 0.030714889042463334,
"grad_norm": 0.882213830947876,
"learning_rate": 9.997407858876141e-06,
"loss": 0.4316856384277344,
"memory(GiB)": 32.21,
"step": 25,
"token_acc": 0.8673553096382113,
"train_speed(iter/s)": 0.118362
},
{
"epoch": 0.036857866850956,
"grad_norm": 0.876335859298706,
"learning_rate": 9.99626745870361e-06,
"loss": 0.4254283428192139,
"memory(GiB)": 32.21,
"step": 30,
"token_acc": 0.866745778634824,
"train_speed(iter/s)": 0.122938
},
{
"epoch": 0.043000844659448666,
"grad_norm": 0.8186553120613098,
"learning_rate": 9.994919824854899e-06,
"loss": 0.4170750617980957,
"memory(GiB)": 32.21,
"step": 35,
"token_acc": 0.8640802675585284,
"train_speed(iter/s)": 0.127141
},
{
"epoch": 0.049143822467941335,
"grad_norm": 0.8065207004547119,
"learning_rate": 9.993365013226757e-06,
"loss": 0.40838775634765623,
"memory(GiB)": 32.21,
"step": 40,
"token_acc": 0.8663708595604169,
"train_speed(iter/s)": 0.130143
},
{
"epoch": 0.049143822467941335,
"eval_loss": 0.41924959421157837,
"eval_runtime": 31.0376,
"eval_samples_per_second": 16.947,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8721037613488976,
"step": 40
},
{
"epoch": 0.055286800276434005,
"grad_norm": 0.7789999842643738,
"learning_rate": 9.991603088309195e-06,
"loss": 0.4384481906890869,
"memory(GiB)": 32.21,
"step": 45,
"token_acc": 0.8650371852302875,
"train_speed(iter/s)": 0.117241
},
{
"epoch": 0.06142977808492667,
"grad_norm": 0.7491472959518433,
"learning_rate": 9.989634123182798e-06,
"loss": 0.3983407497406006,
"memory(GiB)": 32.21,
"step": 50,
"token_acc": 0.8744787141615986,
"train_speed(iter/s)": 0.120332
},
{
"epoch": 0.06757275589341934,
"grad_norm": 0.8437614440917969,
"learning_rate": 9.987458199515714e-06,
"loss": 0.4000354290008545,
"memory(GiB)": 32.21,
"step": 55,
"token_acc": 0.8653561422291064,
"train_speed(iter/s)": 0.123396
},
{
"epoch": 0.073715733701912,
"grad_norm": 0.7674087285995483,
"learning_rate": 9.985075407560247e-06,
"loss": 0.4135420799255371,
"memory(GiB)": 32.21,
"step": 60,
"token_acc": 0.872202027931892,
"train_speed(iter/s)": 0.125154
},
{
"epoch": 0.073715733701912,
"eval_loss": 0.4098711311817169,
"eval_runtime": 31.0819,
"eval_samples_per_second": 16.923,
"eval_steps_per_second": 4.247,
"eval_token_acc": 0.8743692174664938,
"step": 60
},
{
"epoch": 0.07985871151040466,
"grad_norm": 0.8239404559135437,
"learning_rate": 9.982485846149125e-06,
"loss": 0.39459028244018557,
"memory(GiB)": 32.21,
"step": 65,
"token_acc": 0.8727861165617594,
"train_speed(iter/s)": 0.116909
},
{
"epoch": 0.08600168931889733,
"grad_norm": 0.8135547637939453,
"learning_rate": 9.979689622691393e-06,
"loss": 0.4003786087036133,
"memory(GiB)": 32.21,
"step": 70,
"token_acc": 0.8739415872132136,
"train_speed(iter/s)": 0.118714
},
{
"epoch": 0.09214466712739,
"grad_norm": 0.853965699672699,
"learning_rate": 9.976686853167967e-06,
"loss": 0.405532693862915,
"memory(GiB)": 32.21,
"step": 75,
"token_acc": 0.863868962219034,
"train_speed(iter/s)": 0.120582
},
{
"epoch": 0.09828764493588267,
"grad_norm": 0.7862138152122498,
"learning_rate": 9.973477662126818e-06,
"loss": 0.38930883407592776,
"memory(GiB)": 32.21,
"step": 80,
"token_acc": 0.8843768172126381,
"train_speed(iter/s)": 0.122421
},
{
"epoch": 0.09828764493588267,
"eval_loss": 0.4023858904838562,
"eval_runtime": 30.9765,
"eval_samples_per_second": 16.981,
"eval_steps_per_second": 4.261,
"eval_token_acc": 0.8762680501513186,
"step": 80
},
{
"epoch": 0.10443062274437534,
"grad_norm": 0.7761799097061157,
"learning_rate": 9.970062182677802e-06,
"loss": 0.3841962099075317,
"memory(GiB)": 32.21,
"step": 85,
"token_acc": 0.8720659317731335,
"train_speed(iter/s)": 0.116555
},
{
"epoch": 0.11057360055286801,
"grad_norm": 0.7647544145584106,
"learning_rate": 9.966440556487149e-06,
"loss": 0.40062150955200193,
"memory(GiB)": 32.21,
"step": 90,
"token_acc": 0.8734117200834439,
"train_speed(iter/s)": 0.11815
},
{
"epoch": 0.11671657836136066,
"grad_norm": 0.8558200597763062,
"learning_rate": 9.962612933771575e-06,
"loss": 0.41026945114135743,
"memory(GiB)": 32.21,
"step": 95,
"token_acc": 0.8802111051978002,
"train_speed(iter/s)": 0.119854
},
{
"epoch": 0.12285955616985333,
"grad_norm": 0.8282895088195801,
"learning_rate": 9.958579473292067e-06,
"loss": 0.40637502670288084,
"memory(GiB)": 32.21,
"step": 100,
"token_acc": 0.8726802284082797,
"train_speed(iter/s)": 0.121692
},
{
"epoch": 0.12285955616985333,
"eval_loss": 0.39839640259742737,
"eval_runtime": 31.0438,
"eval_samples_per_second": 16.944,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8775339386078685,
"step": 100
},
{
"epoch": 0.129002533978346,
"grad_norm": 0.8366308212280273,
"learning_rate": 9.95434034234728e-06,
"loss": 0.3811495780944824,
"memory(GiB)": 32.21,
"step": 105,
"token_acc": 0.875357573668792,
"train_speed(iter/s)": 0.117395
},
{
"epoch": 0.13514551178683867,
"grad_norm": 0.7479439377784729,
"learning_rate": 9.949895716766611e-06,
"loss": 0.38749701976776124,
"memory(GiB)": 32.21,
"step": 110,
"token_acc": 0.8701843549972431,
"train_speed(iter/s)": 0.118845
},
{
"epoch": 0.14128848959533133,
"grad_norm": 0.801934003829956,
"learning_rate": 9.945245780902899e-06,
"loss": 0.37144348621368406,
"memory(GiB)": 32.21,
"step": 115,
"token_acc": 0.8773385913426266,
"train_speed(iter/s)": 0.120098
},
{
"epoch": 0.147431467403824,
"grad_norm": 0.7849209308624268,
"learning_rate": 9.940390727624785e-06,
"loss": 0.4016891956329346,
"memory(GiB)": 32.21,
"step": 120,
"token_acc": 0.8671916991890818,
"train_speed(iter/s)": 0.121292
},
{
"epoch": 0.147431467403824,
"eval_loss": 0.39595848321914673,
"eval_runtime": 31.0135,
"eval_samples_per_second": 16.96,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.877976653696498,
"step": 120
},
{
"epoch": 0.15357444521231667,
"grad_norm": 0.7716063857078552,
"learning_rate": 9.935330758308706e-06,
"loss": 0.38781228065490725,
"memory(GiB)": 32.21,
"step": 125,
"token_acc": 0.8762705679981929,
"train_speed(iter/s)": 0.1175
},
{
"epoch": 0.15971742302080932,
"grad_norm": 0.7710253000259399,
"learning_rate": 9.93006608283054e-06,
"loss": 0.3876336574554443,
"memory(GiB)": 32.21,
"step": 130,
"token_acc": 0.8821086956521739,
"train_speed(iter/s)": 0.118454
},
{
"epoch": 0.165860400829302,
"grad_norm": 0.7821493744850159,
"learning_rate": 9.924596919556917e-06,
"loss": 0.40181121826171873,
"memory(GiB)": 32.21,
"step": 135,
"token_acc": 0.8626237623762376,
"train_speed(iter/s)": 0.119818
},
{
"epoch": 0.17200337863779466,
"grad_norm": 0.8226854205131531,
"learning_rate": 9.918923495336138e-06,
"loss": 0.39958484172821046,
"memory(GiB)": 32.21,
"step": 140,
"token_acc": 0.8556235746008882,
"train_speed(iter/s)": 0.120946
},
{
"epoch": 0.17200337863779466,
"eval_loss": 0.393728107213974,
"eval_runtime": 31.0073,
"eval_samples_per_second": 16.964,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8784504971897968,
"step": 140
},
{
"epoch": 0.17814635644628735,
"grad_norm": 0.7877047061920166,
"learning_rate": 9.913046045488787e-06,
"loss": 0.38108556270599364,
"memory(GiB)": 34.13,
"step": 145,
"token_acc": 0.8813046265713381,
"train_speed(iter/s)": 0.11771
},
{
"epoch": 0.18428933425478,
"grad_norm": 0.7512264251708984,
"learning_rate": 9.906964813797955e-06,
"loss": 0.3876554250717163,
"memory(GiB)": 34.13,
"step": 150,
"token_acc": 0.881988944871105,
"train_speed(iter/s)": 0.118688
},
{
"epoch": 0.19043231206327269,
"grad_norm": 0.7701375484466553,
"learning_rate": 9.900680052499138e-06,
"loss": 0.38112673759460447,
"memory(GiB)": 34.13,
"step": 155,
"token_acc": 0.8716818566661686,
"train_speed(iter/s)": 0.119662
},
{
"epoch": 0.19657528987176534,
"grad_norm": 0.7622193098068237,
"learning_rate": 9.894192022269773e-06,
"loss": 0.3982468843460083,
"memory(GiB)": 34.13,
"step": 160,
"token_acc": 0.8648266919817547,
"train_speed(iter/s)": 0.120545
},
{
"epoch": 0.19657528987176534,
"eval_loss": 0.39097315073013306,
"eval_runtime": 31.0177,
"eval_samples_per_second": 16.958,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8786718547341116,
"step": 160
},
{
"epoch": 0.202718267680258,
"grad_norm": 0.7747094035148621,
"learning_rate": 9.887500992218421e-06,
"loss": 0.3932340621948242,
"memory(GiB)": 34.13,
"step": 165,
"token_acc": 0.8735516505058284,
"train_speed(iter/s)": 0.117866
},
{
"epoch": 0.20886124548875068,
"grad_norm": 0.7225446701049805,
"learning_rate": 9.880607239873614e-06,
"loss": 0.3682489633560181,
"memory(GiB)": 34.13,
"step": 170,
"token_acc": 0.8780595564195458,
"train_speed(iter/s)": 0.118651
},
{
"epoch": 0.21500422329724334,
"grad_norm": 0.7513542771339417,
"learning_rate": 9.873511051172331e-06,
"loss": 0.37564697265625,
"memory(GiB)": 34.13,
"step": 175,
"token_acc": 0.8798945693728777,
"train_speed(iter/s)": 0.119494
},
{
"epoch": 0.22114720110573602,
"grad_norm": 0.7389309406280518,
"learning_rate": 9.866212720448149e-06,
"loss": 0.3957530498504639,
"memory(GiB)": 34.13,
"step": 180,
"token_acc": 0.8693595046908373,
"train_speed(iter/s)": 0.120172
},
{
"epoch": 0.22114720110573602,
"eval_loss": 0.3888963460922241,
"eval_runtime": 31.0263,
"eval_samples_per_second": 16.953,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8793705144833549,
"step": 180
},
{
"epoch": 0.22729017891422867,
"grad_norm": 0.8499953746795654,
"learning_rate": 9.85871255041903e-06,
"loss": 0.39398903846740724,
"memory(GiB)": 34.13,
"step": 185,
"token_acc": 0.8721618431945888,
"train_speed(iter/s)": 0.11765
},
{
"epoch": 0.23343315672272133,
"grad_norm": 0.7052657008171082,
"learning_rate": 9.85101085217477e-06,
"loss": 0.3804319381713867,
"memory(GiB)": 34.13,
"step": 190,
"token_acc": 0.8779494871039452,
"train_speed(iter/s)": 0.118504
},
{
"epoch": 0.239576134531214,
"grad_norm": 0.8443171977996826,
"learning_rate": 9.843107945164086e-06,
"loss": 0.3854555606842041,
"memory(GiB)": 34.13,
"step": 195,
"token_acc": 0.8738016136687233,
"train_speed(iter/s)": 0.119158
},
{
"epoch": 0.24571911233970667,
"grad_norm": 0.7444053292274475,
"learning_rate": 9.835004157181372e-06,
"loss": 0.3835892677307129,
"memory(GiB)": 34.13,
"step": 200,
"token_acc": 0.8789022648439094,
"train_speed(iter/s)": 0.119936
},
{
"epoch": 0.24571911233970667,
"eval_loss": 0.38609230518341064,
"eval_runtime": 30.9745,
"eval_samples_per_second": 16.982,
"eval_steps_per_second": 4.262,
"eval_token_acc": 0.880086467790748,
"step": 200
},
{
"epoch": 0.2518620901481993,
"grad_norm": 0.7656287550926208,
"learning_rate": 9.826699824353106e-06,
"loss": 0.3835402488708496,
"memory(GiB)": 34.13,
"step": 205,
"token_acc": 0.8772318628475851,
"train_speed(iter/s)": 0.117635
},
{
"epoch": 0.258005067956692,
"grad_norm": 0.7985251545906067,
"learning_rate": 9.818195291123903e-06,
"loss": 0.37469916343688964,
"memory(GiB)": 36.59,
"step": 210,
"token_acc": 0.8918794474675596,
"train_speed(iter/s)": 0.11841
},
{
"epoch": 0.2641480457651847,
"grad_norm": 0.7901045680046082,
"learning_rate": 9.80949091024223e-06,
"loss": 0.39004669189453123,
"memory(GiB)": 36.59,
"step": 215,
"token_acc": 0.8694972278822917,
"train_speed(iter/s)": 0.119102
},
{
"epoch": 0.27029102357367735,
"grad_norm": 0.7759472727775574,
"learning_rate": 9.800587042745774e-06,
"loss": 0.37646257877349854,
"memory(GiB)": 36.59,
"step": 220,
"token_acc": 0.8768733180258252,
"train_speed(iter/s)": 0.119681
},
{
"epoch": 0.27029102357367735,
"eval_loss": 0.38418954610824585,
"eval_runtime": 30.9751,
"eval_samples_per_second": 16.981,
"eval_steps_per_second": 4.261,
"eval_token_acc": 0.8808370082144401,
"step": 220
},
{
"epoch": 0.27643400138217,
"grad_norm": 0.7889726161956787,
"learning_rate": 9.791484057946465e-06,
"loss": 0.3830937385559082,
"memory(GiB)": 36.59,
"step": 225,
"token_acc": 0.8788355828537511,
"train_speed(iter/s)": 0.117815
},
{
"epoch": 0.28257697919066266,
"grad_norm": 0.8053146004676819,
"learning_rate": 9.782182333415168e-06,
"loss": 0.40045747756958006,
"memory(GiB)": 36.59,
"step": 230,
"token_acc": 0.8767751952143934,
"train_speed(iter/s)": 0.118387
},
{
"epoch": 0.2887199569991553,
"grad_norm": 0.7342280745506287,
"learning_rate": 9.772682254966009e-06,
"loss": 0.39071879386901853,
"memory(GiB)": 36.59,
"step": 235,
"token_acc": 0.8698379998127166,
"train_speed(iter/s)": 0.119097
},
{
"epoch": 0.294862934807648,
"grad_norm": 0.7769783139228821,
"learning_rate": 9.762984216640378e-06,
"loss": 0.38714871406555174,
"memory(GiB)": 36.59,
"step": 240,
"token_acc": 0.8766444973056945,
"train_speed(iter/s)": 0.119737
},
{
"epoch": 0.294862934807648,
"eval_loss": 0.38342124223709106,
"eval_runtime": 30.9908,
"eval_samples_per_second": 16.973,
"eval_steps_per_second": 4.259,
"eval_token_acc": 0.8811033290099438,
"step": 240
},
{
"epoch": 0.3010059126161407,
"grad_norm": 0.7775170803070068,
"learning_rate": 9.753088620690589e-06,
"loss": 0.36563289165496826,
"memory(GiB)": 36.59,
"step": 245,
"token_acc": 0.8821107213664786,
"train_speed(iter/s)": 0.117883
},
{
"epoch": 0.30714889042463334,
"grad_norm": 0.7627344131469727,
"learning_rate": 9.742995877563187e-06,
"loss": 0.3691666841506958,
"memory(GiB)": 36.59,
"step": 250,
"token_acc": 0.8684178043301157,
"train_speed(iter/s)": 0.11847
},
{
"epoch": 0.313291868233126,
"grad_norm": 0.730969250202179,
"learning_rate": 9.732706405881931e-06,
"loss": 0.37671756744384766,
"memory(GiB)": 36.59,
"step": 255,
"token_acc": 0.8784978880675819,
"train_speed(iter/s)": 0.118913
},
{
"epoch": 0.31943484604161865,
"grad_norm": 0.7510061860084534,
"learning_rate": 9.722220632430428e-06,
"loss": 0.36403095722198486,
"memory(GiB)": 36.59,
"step": 260,
"token_acc": 0.884961560097506,
"train_speed(iter/s)": 0.1194
},
{
"epoch": 0.31943484604161865,
"eval_loss": 0.3818422555923462,
"eval_runtime": 30.9337,
"eval_samples_per_second": 17.004,
"eval_steps_per_second": 4.267,
"eval_token_acc": 0.8810894941634241,
"step": 260
},
{
"epoch": 0.32557782385011136,
"grad_norm": 0.6700690984725952,
"learning_rate": 9.711538992134427e-06,
"loss": 0.37852253913879397,
"memory(GiB)": 36.59,
"step": 265,
"token_acc": 0.8780975219824141,
"train_speed(iter/s)": 0.117682
},
{
"epoch": 0.331720801658604,
"grad_norm": 0.7542963624000549,
"learning_rate": 9.700661928043787e-06,
"loss": 0.3520061016082764,
"memory(GiB)": 36.59,
"step": 270,
"token_acc": 0.8765217391304347,
"train_speed(iter/s)": 0.118172
},
{
"epoch": 0.33786377946709667,
"grad_norm": 0.6696748733520508,
"learning_rate": 9.689589891314094e-06,
"loss": 0.3755272150039673,
"memory(GiB)": 36.59,
"step": 275,
"token_acc": 0.8727695145026466,
"train_speed(iter/s)": 0.118608
},
{
"epoch": 0.3440067572755893,
"grad_norm": 0.7883334159851074,
"learning_rate": 9.678323341187956e-06,
"loss": 0.376280689239502,
"memory(GiB)": 36.59,
"step": 280,
"token_acc": 0.8781244037397443,
"train_speed(iter/s)": 0.119045
},
{
"epoch": 0.3440067572755893,
"eval_loss": 0.380220502614975,
"eval_runtime": 30.9631,
"eval_samples_per_second": 16.988,
"eval_steps_per_second": 4.263,
"eval_token_acc": 0.8814526588845655,
"step": 280
},
{
"epoch": 0.350149735084082,
"grad_norm": 0.7125808596611023,
"learning_rate": 9.666862744975938e-06,
"loss": 0.3811634063720703,
"memory(GiB)": 36.59,
"step": 285,
"token_acc": 0.881547675634566,
"train_speed(iter/s)": 0.117616
},
{
"epoch": 0.3562927128925747,
"grad_norm": 0.7022562623023987,
"learning_rate": 9.655208578037198e-06,
"loss": 0.36770806312561033,
"memory(GiB)": 36.59,
"step": 290,
"token_acc": 0.8775136241403108,
"train_speed(iter/s)": 0.118155
},
{
"epoch": 0.36243569070106735,
"grad_norm": 0.7109845280647278,
"learning_rate": 9.643361323759763e-06,
"loss": 0.36910414695739746,
"memory(GiB)": 36.59,
"step": 295,
"token_acc": 0.8801465983159751,
"train_speed(iter/s)": 0.118621
},
{
"epoch": 0.36857866850956,
"grad_norm": 0.7310053706169128,
"learning_rate": 9.631321473540476e-06,
"loss": 0.36344945430755615,
"memory(GiB)": 36.59,
"step": 300,
"token_acc": 0.8726629026286561,
"train_speed(iter/s)": 0.119086
},
{
"epoch": 0.36857866850956,
"eval_loss": 0.3780768811702728,
"eval_runtime": 31.0728,
"eval_samples_per_second": 16.928,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8825006485084306,
"step": 300
},
{
"epoch": 0.37472164631805266,
"grad_norm": 0.7264479994773865,
"learning_rate": 9.619089526764614e-06,
"loss": 0.380098819732666,
"memory(GiB)": 36.59,
"step": 305,
"token_acc": 0.8804112554112554,
"train_speed(iter/s)": 0.11773
},
{
"epoch": 0.38086462412654537,
"grad_norm": 0.8007322549819946,
"learning_rate": 9.60666599078518e-06,
"loss": 0.3628620862960815,
"memory(GiB)": 36.59,
"step": 310,
"token_acc": 0.8855827918881669,
"train_speed(iter/s)": 0.118139
},
{
"epoch": 0.387007601935038,
"grad_norm": 0.730522871017456,
"learning_rate": 9.59405138090186e-06,
"loss": 0.36655001640319823,
"memory(GiB)": 36.59,
"step": 315,
"token_acc": 0.8823326091250246,
"train_speed(iter/s)": 0.118659
},
{
"epoch": 0.3931505797435307,
"grad_norm": 0.7646607756614685,
"learning_rate": 9.581246220339636e-06,
"loss": 0.35800130367279054,
"memory(GiB)": 36.59,
"step": 320,
"token_acc": 0.8788769866274592,
"train_speed(iter/s)": 0.119038
},
{
"epoch": 0.3931505797435307,
"eval_loss": 0.37690821290016174,
"eval_runtime": 31.0274,
"eval_samples_per_second": 16.953,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8822689148292261,
"step": 320
},
{
"epoch": 0.39929355755202334,
"grad_norm": 0.7562268972396851,
"learning_rate": 9.568251040227101e-06,
"loss": 0.384972071647644,
"memory(GiB)": 36.59,
"step": 325,
"token_acc": 0.8815015713117225,
"train_speed(iter/s)": 0.117697
},
{
"epoch": 0.405436535360516,
"grad_norm": 0.7428621053695679,
"learning_rate": 9.555066379574423e-06,
"loss": 0.3597818613052368,
"memory(GiB)": 36.59,
"step": 330,
"token_acc": 0.889793055068397,
"train_speed(iter/s)": 0.118163
},
{
"epoch": 0.4115795131690087,
"grad_norm": 0.7479391098022461,
"learning_rate": 9.541692785250983e-06,
"loss": 0.3805227279663086,
"memory(GiB)": 36.59,
"step": 335,
"token_acc": 0.8907455632716049,
"train_speed(iter/s)": 0.118498
},
{
"epoch": 0.41772249097750136,
"grad_norm": 0.6682092547416687,
"learning_rate": 9.528130811962693e-06,
"loss": 0.37683632373809817,
"memory(GiB)": 36.59,
"step": 340,
"token_acc": 0.8722201102452005,
"train_speed(iter/s)": 0.118896
},
{
"epoch": 0.41772249097750136,
"eval_loss": 0.3755421042442322,
"eval_runtime": 31.0414,
"eval_samples_per_second": 16.945,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.88294336359706,
"step": 340
},
{
"epoch": 0.423865468785994,
"grad_norm": 0.7604427933692932,
"learning_rate": 9.514381022228997e-06,
"loss": 0.36464648246765136,
"memory(GiB)": 36.59,
"step": 345,
"token_acc": 0.8840508026994174,
"train_speed(iter/s)": 0.117605
},
{
"epoch": 0.43000844659448667,
"grad_norm": 0.7618926763534546,
"learning_rate": 9.50044398635953e-06,
"loss": 0.37844386100769045,
"memory(GiB)": 36.59,
"step": 350,
"token_acc": 0.8788168373151308,
"train_speed(iter/s)": 0.117953
},
{
"epoch": 0.4361514244029793,
"grad_norm": 0.6848899126052856,
"learning_rate": 9.486320282430469e-06,
"loss": 0.3681621551513672,
"memory(GiB)": 36.59,
"step": 355,
"token_acc": 0.8739398701268689,
"train_speed(iter/s)": 0.11841
},
{
"epoch": 0.44229440221147204,
"grad_norm": 0.7334110140800476,
"learning_rate": 9.472010496260545e-06,
"loss": 0.3769216060638428,
"memory(GiB)": 36.59,
"step": 360,
"token_acc": 0.8754503693028283,
"train_speed(iter/s)": 0.118855
},
{
"epoch": 0.44229440221147204,
"eval_loss": 0.3748551905155182,
"eval_runtime": 31.0629,
"eval_samples_per_second": 16.933,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8829156939040208,
"step": 360
},
{
"epoch": 0.4484373800199647,
"grad_norm": 0.6733468770980835,
"learning_rate": 9.45751522138676e-06,
"loss": 0.3699374198913574,
"memory(GiB)": 36.59,
"step": 365,
"token_acc": 0.8818029853755239,
"train_speed(iter/s)": 0.117632
},
{
"epoch": 0.45458035782845735,
"grad_norm": 0.6975194811820984,
"learning_rate": 9.44283505903976e-06,
"loss": 0.3686963081359863,
"memory(GiB)": 36.59,
"step": 370,
"token_acc": 0.8794543496470025,
"train_speed(iter/s)": 0.118021
},
{
"epoch": 0.46072333563695,
"grad_norm": 0.7434240579605103,
"learning_rate": 9.427970618118888e-06,
"loss": 0.38825435638427735,
"memory(GiB)": 36.59,
"step": 375,
"token_acc": 0.875475461545598,
"train_speed(iter/s)": 0.118433
},
{
"epoch": 0.46686631344544266,
"grad_norm": 0.7431550621986389,
"learning_rate": 9.412922515166952e-06,
"loss": 0.36851983070373534,
"memory(GiB)": 36.59,
"step": 380,
"token_acc": 0.8677917508307813,
"train_speed(iter/s)": 0.118763
},
{
"epoch": 0.46686631344544266,
"eval_loss": 0.37393027544021606,
"eval_runtime": 31.0461,
"eval_samples_per_second": 16.943,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8833099870298314,
"step": 380
},
{
"epoch": 0.47300929125393537,
"grad_norm": 0.7830028533935547,
"learning_rate": 9.39769137434463e-06,
"loss": 0.39449851512908934,
"memory(GiB)": 36.59,
"step": 385,
"token_acc": 0.8813338029015882,
"train_speed(iter/s)": 0.117631
},
{
"epoch": 0.479152269062428,
"grad_norm": 0.7592146992683411,
"learning_rate": 9.38227782740459e-06,
"loss": 0.3797061681747437,
"memory(GiB)": 36.59,
"step": 390,
"token_acc": 0.8725708251892791,
"train_speed(iter/s)": 0.118076
},
{
"epoch": 0.4852952468709207,
"grad_norm": 0.7044036388397217,
"learning_rate": 9.366682513665293e-06,
"loss": 0.34874444007873534,
"memory(GiB)": 36.59,
"step": 395,
"token_acc": 0.8924136680866491,
"train_speed(iter/s)": 0.118392
},
{
"epoch": 0.49143822467941334,
"grad_norm": 0.7327947020530701,
"learning_rate": 9.350906079984456e-06,
"loss": 0.3913299322128296,
"memory(GiB)": 36.59,
"step": 400,
"token_acc": 0.8793465520609494,
"train_speed(iter/s)": 0.118741
},
{
"epoch": 0.49143822467941334,
"eval_loss": 0.37309640645980835,
"eval_runtime": 31.0239,
"eval_samples_per_second": 16.955,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8834448767833982,
"step": 400
},
{
"epoch": 0.497581202487906,
"grad_norm": 0.714108407497406,
"learning_rate": 9.334949180732245e-06,
"loss": 0.3835240364074707,
"memory(GiB)": 36.59,
"step": 405,
"token_acc": 0.8806726886733547,
"train_speed(iter/s)": 0.117711
},
{
"epoch": 0.5037241802963986,
"grad_norm": 0.6842460632324219,
"learning_rate": 9.31881247776412e-06,
"loss": 0.34242706298828124,
"memory(GiB)": 36.59,
"step": 410,
"token_acc": 0.8898083315651744,
"train_speed(iter/s)": 0.118116
},
{
"epoch": 0.5098671581048914,
"grad_norm": 0.7109769582748413,
"learning_rate": 9.302496640393383e-06,
"loss": 0.3699876546859741,
"memory(GiB)": 36.59,
"step": 415,
"token_acc": 0.8834916327453641,
"train_speed(iter/s)": 0.118429
},
{
"epoch": 0.516010135913384,
"grad_norm": 0.72795569896698,
"learning_rate": 9.286002345363418e-06,
"loss": 0.36434710025787354,
"memory(GiB)": 36.59,
"step": 420,
"token_acc": 0.8838608737513539,
"train_speed(iter/s)": 0.118728
},
{
"epoch": 0.516010135913384,
"eval_loss": 0.37135639786720276,
"eval_runtime": 31.0313,
"eval_samples_per_second": 16.951,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8840812797233031,
"step": 420
},
{
"epoch": 0.5221531137218767,
"grad_norm": 0.6852810382843018,
"learning_rate": 9.26933027681963e-06,
"loss": 0.371048641204834,
"memory(GiB)": 36.59,
"step": 425,
"token_acc": 0.8814758591608687,
"train_speed(iter/s)": 0.117721
},
{
"epoch": 0.5282960915303694,
"grad_norm": 0.7316782474517822,
"learning_rate": 9.25248112628105e-06,
"loss": 0.3735438346862793,
"memory(GiB)": 36.59,
"step": 430,
"token_acc": 0.8826662287081789,
"train_speed(iter/s)": 0.117992
},
{
"epoch": 0.534439069338862,
"grad_norm": 0.7115728259086609,
"learning_rate": 9.235455592611667e-06,
"loss": 0.360302734375,
"memory(GiB)": 36.59,
"step": 435,
"token_acc": 0.8884788847888478,
"train_speed(iter/s)": 0.118327
},
{
"epoch": 0.5405820471473547,
"grad_norm": 0.6213703155517578,
"learning_rate": 9.218254381991438e-06,
"loss": 0.363602352142334,
"memory(GiB)": 36.59,
"step": 440,
"token_acc": 0.8796412181894034,
"train_speed(iter/s)": 0.118669
},
{
"epoch": 0.5405820471473547,
"eval_loss": 0.37061288952827454,
"eval_runtime": 31.0004,
"eval_samples_per_second": 16.968,
"eval_steps_per_second": 4.258,
"eval_token_acc": 0.8841919584954604,
"step": 440
},
{
"epoch": 0.5467250249558473,
"grad_norm": 0.6319580674171448,
"learning_rate": 9.200878207886995e-06,
"loss": 0.36367177963256836,
"memory(GiB)": 36.59,
"step": 445,
"token_acc": 0.880615405975304,
"train_speed(iter/s)": 0.11768
},
{
"epoch": 0.55286800276434,
"grad_norm": 0.7951823472976685,
"learning_rate": 9.183327791022048e-06,
"loss": 0.37214341163635256,
"memory(GiB)": 36.59,
"step": 450,
"token_acc": 0.88060522696011,
"train_speed(iter/s)": 0.118044
},
{
"epoch": 0.5590109805728327,
"grad_norm": 0.7379077076911926,
"learning_rate": 9.165603859347503e-06,
"loss": 0.3636307716369629,
"memory(GiB)": 36.59,
"step": 455,
"token_acc": 0.8860057913311012,
"train_speed(iter/s)": 0.118377
},
{
"epoch": 0.5651539583813253,
"grad_norm": 0.6838334798812866,
"learning_rate": 9.147707148011255e-06,
"loss": 0.36699528694152833,
"memory(GiB)": 36.59,
"step": 460,
"token_acc": 0.8731886687471273,
"train_speed(iter/s)": 0.118711
},
{
"epoch": 0.5651539583813253,
"eval_loss": 0.3706679344177246,
"eval_runtime": 31.0119,
"eval_samples_per_second": 16.961,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.883956766104626,
"step": 460
},
{
"epoch": 0.571296936189818,
"grad_norm": 0.7097205519676208,
"learning_rate": 9.129638399327707e-06,
"loss": 0.3835044622421265,
"memory(GiB)": 36.59,
"step": 465,
"token_acc": 0.8826179212466955,
"train_speed(iter/s)": 0.117847
},
{
"epoch": 0.5774399139983106,
"grad_norm": 0.7685003876686096,
"learning_rate": 9.111398362746969e-06,
"loss": 0.34739508628845217,
"memory(GiB)": 36.59,
"step": 470,
"token_acc": 0.8856424192063242,
"train_speed(iter/s)": 0.118092
},
{
"epoch": 0.5835828918068033,
"grad_norm": 0.684012234210968,
"learning_rate": 9.092987794823785e-06,
"loss": 0.35583484172821045,
"memory(GiB)": 36.59,
"step": 475,
"token_acc": 0.8870865428183053,
"train_speed(iter/s)": 0.118389
},
{
"epoch": 0.589725869615296,
"grad_norm": 0.7555577158927917,
"learning_rate": 9.074407459186144e-06,
"loss": 0.3742217540740967,
"memory(GiB)": 36.59,
"step": 480,
"token_acc": 0.8749733708902407,
"train_speed(iter/s)": 0.118715
},
{
"epoch": 0.589725869615296,
"eval_loss": 0.3698117733001709,
"eval_runtime": 31.0843,
"eval_samples_per_second": 16.922,
"eval_steps_per_second": 4.247,
"eval_token_acc": 0.8841297016861219,
"step": 480
},
{
"epoch": 0.5958688474237887,
"grad_norm": 0.7176510095596313,
"learning_rate": 9.055658126503605e-06,
"loss": 0.35448513031005857,
"memory(GiB)": 36.59,
"step": 485,
"token_acc": 0.882393450149208,
"train_speed(iter/s)": 0.117835
},
{
"epoch": 0.6020118252322814,
"grad_norm": 0.7371023297309875,
"learning_rate": 9.036740574455345e-06,
"loss": 0.35907247066497805,
"memory(GiB)": 36.59,
"step": 490,
"token_acc": 0.8887174366887537,
"train_speed(iter/s)": 0.118083
},
{
"epoch": 0.608154803040774,
"grad_norm": 0.6593868136405945,
"learning_rate": 9.017655587697885e-06,
"loss": 0.36144974231719973,
"memory(GiB)": 36.59,
"step": 495,
"token_acc": 0.8897585166019836,
"train_speed(iter/s)": 0.118377
},
{
"epoch": 0.6142977808492667,
"grad_norm": 0.7346932291984558,
"learning_rate": 8.998403957832553e-06,
"loss": 0.35957746505737304,
"memory(GiB)": 36.59,
"step": 500,
"token_acc": 0.8936918488180564,
"train_speed(iter/s)": 0.118644
},
{
"epoch": 0.6142977808492667,
"eval_loss": 0.36875346302986145,
"eval_runtime": 30.9564,
"eval_samples_per_second": 16.992,
"eval_steps_per_second": 4.264,
"eval_token_acc": 0.8846000864677908,
"step": 500
},
{
"epoch": 0.6204407586577594,
"grad_norm": 0.6690182089805603,
"learning_rate": 8.978986483372657e-06,
"loss": 0.36060357093811035,
"memory(GiB)": 36.59,
"step": 505,
"token_acc": 0.8834197325817438,
"train_speed(iter/s)": 0.117782
},
{
"epoch": 0.626583736466252,
"grad_norm": 0.6996055245399475,
"learning_rate": 8.959403969710346e-06,
"loss": 0.35636866092681885,
"memory(GiB)": 36.59,
"step": 510,
"token_acc": 0.8747046644744855,
"train_speed(iter/s)": 0.118099
},
{
"epoch": 0.6327267142747447,
"grad_norm": 0.7242439985275269,
"learning_rate": 8.939657229083223e-06,
"loss": 0.362790584564209,
"memory(GiB)": 36.59,
"step": 515,
"token_acc": 0.8795643179382369,
"train_speed(iter/s)": 0.11841
},
{
"epoch": 0.6388696920832373,
"grad_norm": 0.7438492178916931,
"learning_rate": 8.919747080540647e-06,
"loss": 0.36803131103515624,
"memory(GiB)": 36.59,
"step": 520,
"token_acc": 0.8868724794882492,
"train_speed(iter/s)": 0.118724
},
{
"epoch": 0.6388696920832373,
"eval_loss": 0.3668961226940155,
"eval_runtime": 31.0395,
"eval_samples_per_second": 16.946,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8850704712494596,
"step": 520
},
{
"epoch": 0.64501266989173,
"grad_norm": 0.7443042993545532,
"learning_rate": 8.899674349909759e-06,
"loss": 0.3723003387451172,
"memory(GiB)": 36.59,
"step": 525,
"token_acc": 0.8819261436583553,
"train_speed(iter/s)": 0.117947
},
{
"epoch": 0.6511556477002227,
"grad_norm": 0.7717169523239136,
"learning_rate": 8.879439869761233e-06,
"loss": 0.37207541465759275,
"memory(GiB)": 36.59,
"step": 530,
"token_acc": 0.8742153725911633,
"train_speed(iter/s)": 0.118275
},
{
"epoch": 0.6572986255087153,
"grad_norm": 0.7282743453979492,
"learning_rate": 8.859044479374737e-06,
"loss": 0.3790937900543213,
"memory(GiB)": 36.59,
"step": 535,
"token_acc": 0.8727581424267062,
"train_speed(iter/s)": 0.118594
},
{
"epoch": 0.663441603317208,
"grad_norm": 0.7114688158035278,
"learning_rate": 8.838489024704131e-06,
"loss": 0.3806754112243652,
"memory(GiB)": 36.59,
"step": 540,
"token_acc": 0.8710419328609594,
"train_speed(iter/s)": 0.1188
},
{
"epoch": 0.663441603317208,
"eval_loss": 0.3665069043636322,
"eval_runtime": 31.0202,
"eval_samples_per_second": 16.957,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8851638564634674,
"step": 540
},
{
"epoch": 0.6695845811257006,
"grad_norm": 0.7657713890075684,
"learning_rate": 8.817774358342367e-06,
"loss": 0.3505518913269043,
"memory(GiB)": 36.59,
"step": 545,
"token_acc": 0.8844506134759065,
"train_speed(iter/s)": 0.118004
},
{
"epoch": 0.6757275589341933,
"grad_norm": 0.7225794196128845,
"learning_rate": 8.796901339486136e-06,
"loss": 0.36959023475646974,
"memory(GiB)": 36.59,
"step": 550,
"token_acc": 0.8763546536336592,
"train_speed(iter/s)": 0.118273
},
{
"epoch": 0.681870536742686,
"grad_norm": 0.6392650604248047,
"learning_rate": 8.775870833900226e-06,
"loss": 0.35045757293701174,
"memory(GiB)": 36.59,
"step": 555,
"token_acc": 0.879759337041662,
"train_speed(iter/s)": 0.118527
},
{
"epoch": 0.6880135145511787,
"grad_norm": 0.7549835443496704,
"learning_rate": 8.75468371388161e-06,
"loss": 0.3724693775177002,
"memory(GiB)": 36.59,
"step": 560,
"token_acc": 0.8853696026829382,
"train_speed(iter/s)": 0.11871
},
{
"epoch": 0.6880135145511787,
"eval_loss": 0.3657075762748718,
"eval_runtime": 31.0165,
"eval_samples_per_second": 16.959,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8855408560311284,
"step": 560
},
{
"epoch": 0.6941564923596714,
"grad_norm": 0.7093605399131775,
"learning_rate": 8.733340858223268e-06,
"loss": 0.3566612720489502,
"memory(GiB)": 36.59,
"step": 565,
"token_acc": 0.8851378312772255,
"train_speed(iter/s)": 0.117959
},
{
"epoch": 0.700299470168164,
"grad_norm": 0.7799498438835144,
"learning_rate": 8.711843152177735e-06,
"loss": 0.35616464614868165,
"memory(GiB)": 36.59,
"step": 570,
"token_acc": 0.8828106906294872,
"train_speed(iter/s)": 0.118218
},
{
"epoch": 0.7064424479766567,
"grad_norm": 0.7166778445243835,
"learning_rate": 8.690191487420385e-06,
"loss": 0.36056735515594485,
"memory(GiB)": 36.59,
"step": 575,
"token_acc": 0.8801836905093237,
"train_speed(iter/s)": 0.118442
},
{
"epoch": 0.7125854257851494,
"grad_norm": 0.7225358486175537,
"learning_rate": 8.668386762012445e-06,
"loss": 0.3537228345870972,
"memory(GiB)": 36.59,
"step": 580,
"token_acc": 0.8771770513178728,
"train_speed(iter/s)": 0.118636
},
{
"epoch": 0.7125854257851494,
"eval_loss": 0.3655913472175598,
"eval_runtime": 31.0226,
"eval_samples_per_second": 16.955,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8858832684824903,
"step": 580
},
{
"epoch": 0.718728403593642,
"grad_norm": 0.707248866558075,
"learning_rate": 8.646429880363746e-06,
"loss": 0.35696611404418943,
"memory(GiB)": 36.59,
"step": 585,
"token_acc": 0.8862164894194702,
"train_speed(iter/s)": 0.117878
},
{
"epoch": 0.7248713814021347,
"grad_norm": 0.7443193197250366,
"learning_rate": 8.624321753195209e-06,
"loss": 0.3900872468948364,
"memory(GiB)": 36.59,
"step": 590,
"token_acc": 0.8764379646896419,
"train_speed(iter/s)": 0.118131
},
{
"epoch": 0.7310143592106274,
"grad_norm": 0.6623369455337524,
"learning_rate": 8.602063297501069e-06,
"loss": 0.36977558135986327,
"memory(GiB)": 36.59,
"step": 595,
"token_acc": 0.880887231518028,
"train_speed(iter/s)": 0.118362
},
{
"epoch": 0.73715733701912,
"grad_norm": 0.7453055381774902,
"learning_rate": 8.579655436510847e-06,
"loss": 0.35259857177734377,
"memory(GiB)": 36.59,
"step": 600,
"token_acc": 0.8738677315671569,
"train_speed(iter/s)": 0.118577
},
{
"epoch": 0.73715733701912,
"eval_loss": 0.3638327419757843,
"eval_runtime": 31.0054,
"eval_samples_per_second": 16.965,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.885855598789451,
"step": 600
},
{
"epoch": 0.7433003148276127,
"grad_norm": 0.7225296497344971,
"learning_rate": 8.557099099651046e-06,
"loss": 0.36968977451324464,
"memory(GiB)": 36.59,
"step": 605,
"token_acc": 0.8823149650444696,
"train_speed(iter/s)": 0.11788
},
{
"epoch": 0.7494432926361053,
"grad_norm": 0.6900773048400879,
"learning_rate": 8.534395222506614e-06,
"loss": 0.36718852519989015,
"memory(GiB)": 36.59,
"step": 610,
"token_acc": 0.8860133630289533,
"train_speed(iter/s)": 0.118141
},
{
"epoch": 0.755586270444598,
"grad_norm": 0.671517014503479,
"learning_rate": 8.511544746782124e-06,
"loss": 0.36435210704803467,
"memory(GiB)": 36.59,
"step": 615,
"token_acc": 0.8798159594739043,
"train_speed(iter/s)": 0.118359
},
{
"epoch": 0.7617292482530907,
"grad_norm": 0.6808713674545288,
"learning_rate": 8.488548620262722e-06,
"loss": 0.36147489547729494,
"memory(GiB)": 36.59,
"step": 620,
"token_acc": 0.8823460793691529,
"train_speed(iter/s)": 0.118579
},
{
"epoch": 0.7617292482530907,
"eval_loss": 0.3635016977787018,
"eval_runtime": 31.0094,
"eval_samples_per_second": 16.963,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8860146995244271,
"step": 620
},
{
"epoch": 0.7678722260615833,
"grad_norm": 0.7556000351905823,
"learning_rate": 8.465407796774816e-06,
"loss": 0.36651790142059326,
"memory(GiB)": 36.59,
"step": 625,
"token_acc": 0.8846928285600197,
"train_speed(iter/s)": 0.117873
},
{
"epoch": 0.774015203870076,
"grad_norm": 0.724098801612854,
"learning_rate": 8.442123236146509e-06,
"loss": 0.35537469387054443,
"memory(GiB)": 36.59,
"step": 630,
"token_acc": 0.8859382569251772,
"train_speed(iter/s)": 0.118118
},
{
"epoch": 0.7801581816785687,
"grad_norm": 0.728448748588562,
"learning_rate": 8.418695904167789e-06,
"loss": 0.3752614974975586,
"memory(GiB)": 36.59,
"step": 635,
"token_acc": 0.8905149297823024,
"train_speed(iter/s)": 0.118318
},
{
"epoch": 0.7863011594870614,
"grad_norm": 0.7735581994056702,
"learning_rate": 8.395126772550475e-06,
"loss": 0.3447936773300171,
"memory(GiB)": 36.59,
"step": 640,
"token_acc": 0.8823329283110571,
"train_speed(iter/s)": 0.118526
},
{
"epoch": 0.7863011594870614,
"eval_loss": 0.36254996061325073,
"eval_runtime": 31.0583,
"eval_samples_per_second": 16.936,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8862879377431907,
"step": 640
},
{
"epoch": 0.7924441372955541,
"grad_norm": 0.6083407402038574,
"learning_rate": 8.371416818887907e-06,
"loss": 0.3541299343109131,
"memory(GiB)": 36.59,
"step": 645,
"token_acc": 0.8867384523493496,
"train_speed(iter/s)": 0.117839
},
{
"epoch": 0.7985871151040467,
"grad_norm": 0.7006340622901917,
"learning_rate": 8.347567026614398e-06,
"loss": 0.36687259674072265,
"memory(GiB)": 36.59,
"step": 650,
"token_acc": 0.878874098160756,
"train_speed(iter/s)": 0.118045
},
{
"epoch": 0.8047300929125394,
"grad_norm": 0.7071450352668762,
"learning_rate": 8.323578384964444e-06,
"loss": 0.354215145111084,
"memory(GiB)": 36.59,
"step": 655,
"token_acc": 0.8844807747626809,
"train_speed(iter/s)": 0.118259
},
{
"epoch": 0.810873070721032,
"grad_norm": 0.6859620809555054,
"learning_rate": 8.299451888931696e-06,
"loss": 0.33744206428527834,
"memory(GiB)": 36.59,
"step": 660,
"token_acc": 0.8832839002687923,
"train_speed(iter/s)": 0.118483
},
{
"epoch": 0.810873070721032,
"eval_loss": 0.36208656430244446,
"eval_runtime": 31.005,
"eval_samples_per_second": 16.965,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8863259835711198,
"step": 660
},
{
"epoch": 0.8170160485295247,
"grad_norm": 0.6853975057601929,
"learning_rate": 8.275188539227687e-06,
"loss": 0.3501296043395996,
"memory(GiB)": 36.59,
"step": 665,
"token_acc": 0.8818506429867994,
"train_speed(iter/s)": 0.117792
},
{
"epoch": 0.8231590263380174,
"grad_norm": 0.672095775604248,
"learning_rate": 8.250789342240326e-06,
"loss": 0.3572331190109253,
"memory(GiB)": 36.59,
"step": 670,
"token_acc": 0.8840531998946537,
"train_speed(iter/s)": 0.118042
},
{
"epoch": 0.82930200414651,
"grad_norm": 0.6654704809188843,
"learning_rate": 8.22625530999215e-06,
"loss": 0.35687694549560545,
"memory(GiB)": 36.59,
"step": 675,
"token_acc": 0.8840721896461247,
"train_speed(iter/s)": 0.118263
},
{
"epoch": 0.8354449819550027,
"grad_norm": 0.6872120499610901,
"learning_rate": 8.201587460098362e-06,
"loss": 0.34873204231262206,
"memory(GiB)": 36.59,
"step": 680,
"token_acc": 0.884066094755313,
"train_speed(iter/s)": 0.118437
},
{
"epoch": 0.8354449819550027,
"eval_loss": 0.36098214983940125,
"eval_runtime": 31.0688,
"eval_samples_per_second": 16.93,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8864989191526157,
"step": 680
},
{
"epoch": 0.8415879597634953,
"grad_norm": 0.6905971765518188,
"learning_rate": 8.176786815724601e-06,
"loss": 0.3643667221069336,
"memory(GiB)": 36.59,
"step": 685,
"token_acc": 0.8811371118426906,
"train_speed(iter/s)": 0.117814
},
{
"epoch": 0.847730937571988,
"grad_norm": 0.688023567199707,
"learning_rate": 8.151854405544526e-06,
"loss": 0.369766902923584,
"memory(GiB)": 36.59,
"step": 690,
"token_acc": 0.8848363488998546,
"train_speed(iter/s)": 0.118018
},
{
"epoch": 0.8538739153804807,
"grad_norm": 0.6458128690719604,
"learning_rate": 8.12679126369713e-06,
"loss": 0.3629646301269531,
"memory(GiB)": 36.59,
"step": 695,
"token_acc": 0.8775533863525702,
"train_speed(iter/s)": 0.118233
},
{
"epoch": 0.8600168931889733,
"grad_norm": 0.6942622065544128,
"learning_rate": 8.101598429743862e-06,
"loss": 0.3692671298980713,
"memory(GiB)": 36.59,
"step": 700,
"token_acc": 0.8780482002236338,
"train_speed(iter/s)": 0.118437
},
{
"epoch": 0.8600168931889733,
"eval_loss": 0.35998860001564026,
"eval_runtime": 31.0268,
"eval_samples_per_second": 16.953,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8866441850410722,
"step": 700
},
{
"epoch": 0.866159870997466,
"grad_norm": 0.7322993278503418,
"learning_rate": 8.076276948625495e-06,
"loss": 0.36251187324523926,
"memory(GiB)": 36.59,
"step": 705,
"token_acc": 0.8850018575958389,
"train_speed(iter/s)": 0.117844
},
{
"epoch": 0.8723028488059587,
"grad_norm": 0.7000852823257446,
"learning_rate": 8.050827870618795e-06,
"loss": 0.352423095703125,
"memory(GiB)": 36.59,
"step": 710,
"token_acc": 0.8848280386093149,
"train_speed(iter/s)": 0.118064
},
{
"epoch": 0.8784458266144514,
"grad_norm": 0.7393072843551636,
"learning_rate": 8.02525225129295e-06,
"loss": 0.3464043140411377,
"memory(GiB)": 36.59,
"step": 715,
"token_acc": 0.8842617899915519,
"train_speed(iter/s)": 0.118282
},
{
"epoch": 0.8845888044229441,
"grad_norm": 0.676538348197937,
"learning_rate": 7.999551151465793e-06,
"loss": 0.3531349658966064,
"memory(GiB)": 36.59,
"step": 720,
"token_acc": 0.882141211070386,
"train_speed(iter/s)": 0.118479
},
{
"epoch": 0.8845888044229441,
"eval_loss": 0.3599785268306732,
"eval_runtime": 31.1371,
"eval_samples_per_second": 16.893,
"eval_steps_per_second": 4.239,
"eval_token_acc": 0.8867168179853004,
"step": 720
},
{
"epoch": 0.8907317822314367,
"grad_norm": 0.6606590151786804,
"learning_rate": 7.973725637159795e-06,
"loss": 0.3510305881500244,
"memory(GiB)": 36.59,
"step": 725,
"token_acc": 0.8858640888051448,
"train_speed(iter/s)": 0.117866
},
{
"epoch": 0.8968747600399294,
"grad_norm": 0.6910014748573303,
"learning_rate": 7.947776779557862e-06,
"loss": 0.34729857444763185,
"memory(GiB)": 36.59,
"step": 730,
"token_acc": 0.8849401138817985,
"train_speed(iter/s)": 0.11805
},
{
"epoch": 0.903017737848422,
"grad_norm": 0.715715765953064,
"learning_rate": 7.921705654958886e-06,
"loss": 0.37070040702819823,
"memory(GiB)": 36.59,
"step": 735,
"token_acc": 0.873466112894091,
"train_speed(iter/s)": 0.118238
},
{
"epoch": 0.9091607156569147,
"grad_norm": 0.6847560405731201,
"learning_rate": 7.895513344733124e-06,
"loss": 0.3388267993927002,
"memory(GiB)": 36.59,
"step": 740,
"token_acc": 0.892940483205657,
"train_speed(iter/s)": 0.118418
},
{
"epoch": 0.9091607156569147,
"eval_loss": 0.35883787274360657,
"eval_runtime": 31.0744,
"eval_samples_per_second": 16.927,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8870696065715521,
"step": 740
},
{
"epoch": 0.9153036934654074,
"grad_norm": 0.7038071155548096,
"learning_rate": 7.869200935277317e-06,
"loss": 0.3523221015930176,
"memory(GiB)": 36.59,
"step": 745,
"token_acc": 0.8841770158578834,
"train_speed(iter/s)": 0.117874
},
{
"epoch": 0.9214466712739,
"grad_norm": 0.7095304727554321,
"learning_rate": 7.842769517969665e-06,
"loss": 0.34724674224853513,
"memory(GiB)": 36.59,
"step": 750,
"token_acc": 0.8921830597616321,
"train_speed(iter/s)": 0.118073
},
{
"epoch": 0.9275896490823927,
"grad_norm": 0.7056006789207458,
"learning_rate": 7.816220189124527e-06,
"loss": 0.34354069232940676,
"memory(GiB)": 36.59,
"step": 755,
"token_acc": 0.8906672115144498,
"train_speed(iter/s)": 0.118273
},
{
"epoch": 0.9337326268908853,
"grad_norm": 0.6470732092857361,
"learning_rate": 7.789554049946966e-06,
"loss": 0.37253437042236326,
"memory(GiB)": 36.59,
"step": 760,
"token_acc": 0.8801472977363803,
"train_speed(iter/s)": 0.118474
},
{
"epoch": 0.9337326268908853,
"eval_loss": 0.3579709231853485,
"eval_runtime": 31.0126,
"eval_samples_per_second": 16.961,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8876368352788586,
"step": 760
},
{
"epoch": 0.939875604699378,
"grad_norm": 0.671111524105072,
"learning_rate": 7.762772206487066e-06,
"loss": 0.3589931011199951,
"memory(GiB)": 36.59,
"step": 765,
"token_acc": 0.8832086813686086,
"train_speed(iter/s)": 0.117907
},
{
"epoch": 0.9460185825078707,
"grad_norm": 0.7187632322311401,
"learning_rate": 7.735875769594063e-06,
"loss": 0.34763507843017577,
"memory(GiB)": 36.59,
"step": 770,
"token_acc": 0.8847997559593846,
"train_speed(iter/s)": 0.118076
},
{
"epoch": 0.9521615603163633,
"grad_norm": 0.7212729454040527,
"learning_rate": 7.70886585487026e-06,
"loss": 0.3598261833190918,
"memory(GiB)": 36.59,
"step": 775,
"token_acc": 0.8688440332679189,
"train_speed(iter/s)": 0.118251
},
{
"epoch": 0.958304538124856,
"grad_norm": 0.6621137261390686,
"learning_rate": 7.681743582624761e-06,
"loss": 0.35757567882537844,
"memory(GiB)": 36.59,
"step": 780,
"token_acc": 0.8785007468259896,
"train_speed(iter/s)": 0.118454
},
{
"epoch": 0.958304538124856,
"eval_loss": 0.3576439321041107,
"eval_runtime": 31.075,
"eval_samples_per_second": 16.927,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8877751837440553,
"step": 780
},
{
"epoch": 0.9644475159333487,
"grad_norm": 0.7074070572853088,
"learning_rate": 7.654510077827003e-06,
"loss": 0.3493576765060425,
"memory(GiB)": 36.59,
"step": 785,
"token_acc": 0.8852768310495931,
"train_speed(iter/s)": 0.117922
},
{
"epoch": 0.9705904937418414,
"grad_norm": 0.6370189189910889,
"learning_rate": 7.627166470060092e-06,
"loss": 0.3448970317840576,
"memory(GiB)": 36.59,
"step": 790,
"token_acc": 0.8896250845717751,
"train_speed(iter/s)": 0.118138
},
{
"epoch": 0.9767334715503341,
"grad_norm": 0.6875202655792236,
"learning_rate": 7.59971389347395e-06,
"loss": 0.36741271018981936,
"memory(GiB)": 36.59,
"step": 795,
"token_acc": 0.880575873679322,
"train_speed(iter/s)": 0.118312
},
{
"epoch": 0.9828764493588267,
"grad_norm": 0.7139670848846436,
"learning_rate": 7.572153486738281e-06,
"loss": 0.3554513692855835,
"memory(GiB)": 36.59,
"step": 800,
"token_acc": 0.8777580460748777,
"train_speed(iter/s)": 0.118491
},
{
"epoch": 0.9828764493588267,
"eval_loss": 0.3568785786628723,
"eval_runtime": 31.0006,
"eval_samples_per_second": 16.967,
"eval_steps_per_second": 4.258,
"eval_token_acc": 0.8877959360138349,
"step": 800
},
{
"epoch": 0.9890194271673194,
"grad_norm": 0.7183944582939148,
"learning_rate": 7.544486392995325e-06,
"loss": 0.3408940076828003,
"memory(GiB)": 36.59,
"step": 805,
"token_acc": 0.8823203099663748,
"train_speed(iter/s)": 0.117937
},
{
"epoch": 0.995162404975812,
"grad_norm": 0.7064708471298218,
"learning_rate": 7.516713759812465e-06,
"loss": 0.3436570167541504,
"memory(GiB)": 36.59,
"step": 810,
"token_acc": 0.8865785782162089,
"train_speed(iter/s)": 0.118112
},
{
"epoch": 1.002457191123397,
"grad_norm": 0.7077184915542603,
"learning_rate": 7.4888367391346085e-06,
"loss": 0.40673046112060546,
"memory(GiB)": 36.59,
"step": 815,
"token_acc": 0.8932987364620939,
"train_speed(iter/s)": 0.11823
},
{
"epoch": 1.0086001689318898,
"grad_norm": 0.6631501317024231,
"learning_rate": 7.460856487236421e-06,
"loss": 0.32202835083007814,
"memory(GiB)": 36.59,
"step": 820,
"token_acc": 0.8988542163968578,
"train_speed(iter/s)": 0.118434
},
{
"epoch": 1.0086001689318898,
"eval_loss": 0.3615255355834961,
"eval_runtime": 31.0114,
"eval_samples_per_second": 16.961,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8877959360138349,
"step": 820
},
{
"epoch": 1.0147431467403825,
"grad_norm": 0.657802939414978,
"learning_rate": 7.432774164674359e-06,
"loss": 0.2976385116577148,
"memory(GiB)": 36.59,
"step": 825,
"token_acc": 0.8940141675474071,
"train_speed(iter/s)": 0.117911
},
{
"epoch": 1.0208861245488752,
"grad_norm": 0.675591766834259,
"learning_rate": 7.404590936238535e-06,
"loss": 0.311181640625,
"memory(GiB)": 36.59,
"step": 830,
"token_acc": 0.8997530755324309,
"train_speed(iter/s)": 0.118145
},
{
"epoch": 1.0270291023573677,
"grad_norm": 0.6661099791526794,
"learning_rate": 7.376307970904408e-06,
"loss": 0.3044283866882324,
"memory(GiB)": 36.59,
"step": 835,
"token_acc": 0.8999576197242789,
"train_speed(iter/s)": 0.118312
},
{
"epoch": 1.0331720801658604,
"grad_norm": 0.6595695614814758,
"learning_rate": 7.34792644178429e-06,
"loss": 0.3037309408187866,
"memory(GiB)": 36.59,
"step": 840,
"token_acc": 0.9055141287284144,
"train_speed(iter/s)": 0.118457
},
{
"epoch": 1.0331720801658604,
"eval_loss": 0.35968878865242004,
"eval_runtime": 31.0067,
"eval_samples_per_second": 16.964,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8877025507998271,
"step": 840
},
{
"epoch": 1.039315057974353,
"grad_norm": 0.7282394170761108,
"learning_rate": 7.319447526078696e-06,
"loss": 0.3085323333740234,
"memory(GiB)": 36.59,
"step": 845,
"token_acc": 0.8898290405833752,
"train_speed(iter/s)": 0.118005
},
{
"epoch": 1.0454580357828458,
"grad_norm": 0.6701980233192444,
"learning_rate": 7.290872405027508e-06,
"loss": 0.29195051193237304,
"memory(GiB)": 36.59,
"step": 850,
"token_acc": 0.9044647710888937,
"train_speed(iter/s)": 0.118164
},
{
"epoch": 1.0516010135913385,
"grad_norm": 0.6651575565338135,
"learning_rate": 7.262202263860989e-06,
"loss": 0.30650150775909424,
"memory(GiB)": 36.59,
"step": 855,
"token_acc": 0.8993040861428504,
"train_speed(iter/s)": 0.118324
},
{
"epoch": 1.057743991399831,
"grad_norm": 0.682246744632721,
"learning_rate": 7.233438291750615e-06,
"loss": 0.3102306842803955,
"memory(GiB)": 36.59,
"step": 860,
"token_acc": 0.9063239097279017,
"train_speed(iter/s)": 0.11848
},
{
"epoch": 1.057743991399831,
"eval_loss": 0.35930460691452026,
"eval_runtime": 31.0087,
"eval_samples_per_second": 16.963,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.887875486381323,
"step": 860
},
{
"epoch": 1.0638869692083237,
"grad_norm": 0.7295219898223877,
"learning_rate": 7.204581681759752e-06,
"loss": 0.30730266571044923,
"memory(GiB)": 36.59,
"step": 865,
"token_acc": 0.8905181851880587,
"train_speed(iter/s)": 0.117999
},
{
"epoch": 1.0700299470168164,
"grad_norm": 0.6892926096916199,
"learning_rate": 7.175633630794176e-06,
"loss": 0.2974876403808594,
"memory(GiB)": 36.59,
"step": 870,
"token_acc": 0.9006297483247798,
"train_speed(iter/s)": 0.118168
},
{
"epoch": 1.0761729248253091,
"grad_norm": 0.6752432584762573,
"learning_rate": 7.146595339552423e-06,
"loss": 0.3102593421936035,
"memory(GiB)": 36.59,
"step": 875,
"token_acc": 0.9038279095421953,
"train_speed(iter/s)": 0.118364
},
{
"epoch": 1.0823159026338018,
"grad_norm": 0.674329400062561,
"learning_rate": 7.1174680124759856e-06,
"loss": 0.28625760078430174,
"memory(GiB)": 36.59,
"step": 880,
"token_acc": 0.9079884290164664,
"train_speed(iter/s)": 0.118523
},
{
"epoch": 1.0823159026338018,
"eval_loss": 0.36010968685150146,
"eval_runtime": 31.0269,
"eval_samples_per_second": 16.953,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8876299178555987,
"step": 880
},
{
"epoch": 1.0884588804422943,
"grad_norm": 0.6883670091629028,
"learning_rate": 7.08825285769936e-06,
"loss": 0.3032073020935059,
"memory(GiB)": 36.59,
"step": 885,
"token_acc": 0.8932736033602344,
"train_speed(iter/s)": 0.118061
},
{
"epoch": 1.094601858250787,
"grad_norm": 0.671500027179718,
"learning_rate": 7.058951086999934e-06,
"loss": 0.3017904758453369,
"memory(GiB)": 36.59,
"step": 890,
"token_acc": 0.9018632618216911,
"train_speed(iter/s)": 0.118196
},
{
"epoch": 1.1007448360592798,
"grad_norm": 0.7209696173667908,
"learning_rate": 7.029563915747723e-06,
"loss": 0.31074273586273193,
"memory(GiB)": 36.59,
"step": 895,
"token_acc": 0.898548356982823,
"train_speed(iter/s)": 0.118358
},
{
"epoch": 1.1068878138677725,
"grad_norm": 0.624523937702179,
"learning_rate": 7.0000925628549595e-06,
"loss": 0.2956224918365479,
"memory(GiB)": 36.59,
"step": 900,
"token_acc": 0.9076877474540027,
"train_speed(iter/s)": 0.118515
},
{
"epoch": 1.1068878138677725,
"eval_loss": 0.3587914705276489,
"eval_runtime": 31.0433,
"eval_samples_per_second": 16.944,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8878443579766537,
"step": 900
},
{
"epoch": 1.1130307916762652,
"grad_norm": 0.7052697539329529,
"learning_rate": 6.9705382507255405e-06,
"loss": 0.2872809648513794,
"memory(GiB)": 36.59,
"step": 905,
"token_acc": 0.8926222488296873,
"train_speed(iter/s)": 0.118076
},
{
"epoch": 1.1191737694847577,
"grad_norm": 0.7123196125030518,
"learning_rate": 6.940902205204321e-06,
"loss": 0.2964935302734375,
"memory(GiB)": 36.59,
"step": 910,
"token_acc": 0.9039498517120518,
"train_speed(iter/s)": 0.118226
},
{
"epoch": 1.1253167472932504,
"grad_norm": 0.660994291305542,
"learning_rate": 6.911185655526263e-06,
"loss": 0.302768611907959,
"memory(GiB)": 36.59,
"step": 915,
"token_acc": 0.9020544461398969,
"train_speed(iter/s)": 0.118393
},
{
"epoch": 1.131459725101743,
"grad_norm": 0.7210450768470764,
"learning_rate": 6.881389834265463e-06,
"loss": 0.3173034429550171,
"memory(GiB)": 36.59,
"step": 920,
"token_acc": 0.8982849864950921,
"train_speed(iter/s)": 0.118553
},
{
"epoch": 1.131459725101743,
"eval_loss": 0.3588680624961853,
"eval_runtime": 31.0113,
"eval_samples_per_second": 16.962,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8877405966277562,
"step": 920
},
{
"epoch": 1.1376027029102358,
"grad_norm": 0.6697967648506165,
"learning_rate": 6.851515977284014e-06,
"loss": 0.299291205406189,
"memory(GiB)": 36.59,
"step": 925,
"token_acc": 0.8902243928864662,
"train_speed(iter/s)": 0.118081
},
{
"epoch": 1.1437456807187285,
"grad_norm": 0.7066377401351929,
"learning_rate": 6.821565323680759e-06,
"loss": 0.29554860591888427,
"memory(GiB)": 36.59,
"step": 930,
"token_acc": 0.9000831485587583,
"train_speed(iter/s)": 0.118223
},
{
"epoch": 1.149888658527221,
"grad_norm": 0.6386650204658508,
"learning_rate": 6.791539115739879e-06,
"loss": 0.3022310256958008,
"memory(GiB)": 36.59,
"step": 935,
"token_acc": 0.8924001814882032,
"train_speed(iter/s)": 0.118412
},
{
"epoch": 1.1560316363357137,
"grad_norm": 0.6704084873199463,
"learning_rate": 6.761438598879383e-06,
"loss": 0.28601846694946287,
"memory(GiB)": 36.59,
"step": 940,
"token_acc": 0.9012753677155092,
"train_speed(iter/s)": 0.118547
},
{
"epoch": 1.1560316363357137,
"eval_loss": 0.35880643129348755,
"eval_runtime": 31.0179,
"eval_samples_per_second": 16.958,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8880657155209685,
"step": 940
},
{
"epoch": 1.1621746141442064,
"grad_norm": 0.6651415228843689,
"learning_rate": 6.731265021599437e-06,
"loss": 0.3218855381011963,
"memory(GiB)": 36.59,
"step": 945,
"token_acc": 0.8918404969109147,
"train_speed(iter/s)": 0.118102
},
{
"epoch": 1.1683175919526991,
"grad_norm": 0.6738328337669373,
"learning_rate": 6.7010196354305876e-06,
"loss": 0.30361137390136717,
"memory(GiB)": 36.59,
"step": 950,
"token_acc": 0.9092978421945045,
"train_speed(iter/s)": 0.118249
},
{
"epoch": 1.1744605697611918,
"grad_norm": 0.6776899099349976,
"learning_rate": 6.670703694881851e-06,
"loss": 0.29663915634155275,
"memory(GiB)": 36.59,
"step": 955,
"token_acc": 0.8984023842094978,
"train_speed(iter/s)": 0.118405
},
{
"epoch": 1.1806035475696843,
"grad_norm": 0.6939485669136047,
"learning_rate": 6.640318457388672e-06,
"loss": 0.3056649684906006,
"memory(GiB)": 36.59,
"step": 960,
"token_acc": 0.8867154116418194,
"train_speed(iter/s)": 0.118549
},
{
"epoch": 1.1806035475696843,
"eval_loss": 0.35902491211891174,
"eval_runtime": 31.0009,
"eval_samples_per_second": 16.967,
"eval_steps_per_second": 4.258,
"eval_token_acc": 0.8878408992650237,
"step": 960
},
{
"epoch": 1.186746525378177,
"grad_norm": 0.7092224359512329,
"learning_rate": 6.609865183260777e-06,
"loss": 0.2987541198730469,
"memory(GiB)": 36.59,
"step": 965,
"token_acc": 0.8890386576114193,
"train_speed(iter/s)": 0.118089
},
{
"epoch": 1.1928895031866698,
"grad_norm": 0.7263514399528503,
"learning_rate": 6.579345135629896e-06,
"loss": 0.28489587306976316,
"memory(GiB)": 36.59,
"step": 970,
"token_acc": 0.8956198679571216,
"train_speed(iter/s)": 0.118237
},
{
"epoch": 1.1990324809951625,
"grad_norm": 0.6999565362930298,
"learning_rate": 6.548759580397363e-06,
"loss": 0.30396156311035155,
"memory(GiB)": 36.59,
"step": 975,
"token_acc": 0.8999096083844331,
"train_speed(iter/s)": 0.118377
},
{
"epoch": 1.2051754588036552,
"grad_norm": 0.6386498212814331,
"learning_rate": 6.518109786181628e-06,
"loss": 0.32303242683410643,
"memory(GiB)": 36.59,
"step": 980,
"token_acc": 0.8918318331799511,
"train_speed(iter/s)": 0.11851
},
{
"epoch": 1.2051754588036552,
"eval_loss": 0.3577713966369629,
"eval_runtime": 30.9949,
"eval_samples_per_second": 16.971,
"eval_steps_per_second": 4.259,
"eval_token_acc": 0.8879481193255512,
"step": 980
},
{
"epoch": 1.2113184366121477,
"grad_norm": 0.6696978807449341,
"learning_rate": 6.487397024265616e-06,
"loss": 0.29286723136901854,
"memory(GiB)": 36.59,
"step": 985,
"token_acc": 0.8883067219587296,
"train_speed(iter/s)": 0.11806
},
{
"epoch": 1.2174614144206404,
"grad_norm": 0.6677629947662354,
"learning_rate": 6.456622568544012e-06,
"loss": 0.295971155166626,
"memory(GiB)": 36.59,
"step": 990,
"token_acc": 0.901066495199663,
"train_speed(iter/s)": 0.118215
},
{
"epoch": 1.223604392229133,
"grad_norm": 0.6924172639846802,
"learning_rate": 6.425787695470419e-06,
"loss": 0.2936640024185181,
"memory(GiB)": 36.59,
"step": 995,
"token_acc": 0.8968813591405991,
"train_speed(iter/s)": 0.118377
},
{
"epoch": 1.2297473700376258,
"grad_norm": 0.6816849112510681,
"learning_rate": 6.3948936840044096e-06,
"loss": 0.29815101623535156,
"memory(GiB)": 36.59,
"step": 1000,
"token_acc": 0.9113140380746014,
"train_speed(iter/s)": 0.118511
},
{
"epoch": 1.2297473700376258,
"eval_loss": 0.35851019620895386,
"eval_runtime": 31.0715,
"eval_samples_per_second": 16.929,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8881279723303069,
"step": 1000
},
{
"epoch": 1.2358903478461185,
"grad_norm": 0.7491683959960938,
"learning_rate": 6.363941815558484e-06,
"loss": 0.305048394203186,
"memory(GiB)": 36.59,
"step": 1005,
"token_acc": 0.8883380321029248,
"train_speed(iter/s)": 0.118078
},
{
"epoch": 1.242033325654611,
"grad_norm": 0.6767114400863647,
"learning_rate": 6.332933373944914e-06,
"loss": 0.2910877466201782,
"memory(GiB)": 36.59,
"step": 1010,
"token_acc": 0.8970752230332523,
"train_speed(iter/s)": 0.118198
},
{
"epoch": 1.2481763034631037,
"grad_norm": 0.6579700112342834,
"learning_rate": 6.301869645322498e-06,
"loss": 0.2989434480667114,
"memory(GiB)": 36.59,
"step": 1015,
"token_acc": 0.9020202767705173,
"train_speed(iter/s)": 0.118352
},
{
"epoch": 1.2543192812715964,
"grad_norm": 0.7496470808982849,
"learning_rate": 6.270751918143213e-06,
"loss": 0.3161623477935791,
"memory(GiB)": 36.59,
"step": 1020,
"token_acc": 0.8931434478006202,
"train_speed(iter/s)": 0.118501
},
{
"epoch": 1.2543192812715964,
"eval_loss": 0.3574770390987396,
"eval_runtime": 31.0423,
"eval_samples_per_second": 16.945,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.888463467358409,
"step": 1020
},
{
"epoch": 1.2604622590800891,
"grad_norm": 0.6567991971969604,
"learning_rate": 6.239581483098767e-06,
"loss": 0.2918637752532959,
"memory(GiB)": 36.59,
"step": 1025,
"token_acc": 0.8930598715558318,
"train_speed(iter/s)": 0.118037
},
{
"epoch": 1.2666052368885818,
"grad_norm": 0.7520761489868164,
"learning_rate": 6.208359633067077e-06,
"loss": 0.2961498022079468,
"memory(GiB)": 36.59,
"step": 1030,
"token_acc": 0.9095238095238095,
"train_speed(iter/s)": 0.118175
},
{
"epoch": 1.2727482146970743,
"grad_norm": 0.7256974577903748,
"learning_rate": 6.177087663058626e-06,
"loss": 0.30830044746398927,
"memory(GiB)": 36.59,
"step": 1035,
"token_acc": 0.9017879399034648,
"train_speed(iter/s)": 0.118311
},
{
"epoch": 1.278891192505567,
"grad_norm": 0.6479539275169373,
"learning_rate": 6.145766870162767e-06,
"loss": 0.2862563610076904,
"memory(GiB)": 36.59,
"step": 1040,
"token_acc": 0.9018611343172747,
"train_speed(iter/s)": 0.118441
},
{
"epoch": 1.278891192505567,
"eval_loss": 0.3572877049446106,
"eval_runtime": 31.0406,
"eval_samples_per_second": 16.946,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8883147427583226,
"step": 1040
},
{
"epoch": 1.2850341703140598,
"grad_norm": 0.7319021224975586,
"learning_rate": 6.114398553493909e-06,
"loss": 0.3000927925109863,
"memory(GiB)": 36.59,
"step": 1045,
"token_acc": 0.8926547069479344,
"train_speed(iter/s)": 0.118
},
{
"epoch": 1.2911771481225525,
"grad_norm": 0.705988883972168,
"learning_rate": 6.0829840141376385e-06,
"loss": 0.30697922706604003,
"memory(GiB)": 36.59,
"step": 1050,
"token_acc": 0.901831032683459,
"train_speed(iter/s)": 0.118157
},
{
"epoch": 1.2973201259310452,
"grad_norm": 0.64214026927948,
"learning_rate": 6.051524555096754e-06,
"loss": 0.30261845588684083,
"memory(GiB)": 36.59,
"step": 1055,
"token_acc": 0.902963066984974,
"train_speed(iter/s)": 0.118309
},
{
"epoch": 1.3034631037395377,
"grad_norm": 0.7394285798072815,
"learning_rate": 6.020021481237216e-06,
"loss": 0.30278654098510743,
"memory(GiB)": 36.59,
"step": 1060,
"token_acc": 0.9020456426628828,
"train_speed(iter/s)": 0.118449
},
{
"epoch": 1.3034631037395377,
"eval_loss": 0.35663846135139465,
"eval_runtime": 31.0016,
"eval_samples_per_second": 16.967,
"eval_steps_per_second": 4.258,
"eval_token_acc": 0.8883424124513619,
"step": 1060
},
{
"epoch": 1.3096060815480304,
"grad_norm": 0.6863911151885986,
"learning_rate": 5.988476099234033e-06,
"loss": 0.2937177658081055,
"memory(GiB)": 36.59,
"step": 1065,
"token_acc": 0.8901542316498898,
"train_speed(iter/s)": 0.1181
},
{
"epoch": 1.315749059356523,
"grad_norm": 0.654614269733429,
"learning_rate": 5.956889717517053e-06,
"loss": 0.3110340595245361,
"memory(GiB)": 36.59,
"step": 1070,
"token_acc": 0.9028094153378892,
"train_speed(iter/s)": 0.118212
},
{
"epoch": 1.3218920371650158,
"grad_norm": 0.7234563827514648,
"learning_rate": 5.925263646216697e-06,
"loss": 0.31188764572143557,
"memory(GiB)": 36.59,
"step": 1075,
"token_acc": 0.9096784327805578,
"train_speed(iter/s)": 0.118351
},
{
"epoch": 1.3280350149735085,
"grad_norm": 0.6865576505661011,
"learning_rate": 5.893599197109625e-06,
"loss": 0.302515435218811,
"memory(GiB)": 36.59,
"step": 1080,
"token_acc": 0.8899835796387521,
"train_speed(iter/s)": 0.118487
},
{
"epoch": 1.3280350149735085,
"eval_loss": 0.35516050457954407,
"eval_runtime": 30.9944,
"eval_samples_per_second": 16.971,
"eval_steps_per_second": 4.259,
"eval_token_acc": 0.8885637699956767,
"step": 1080
},
{
"epoch": 1.334177992782001,
"grad_norm": 0.6132445335388184,
"learning_rate": 5.861897683564313e-06,
"loss": 0.3079413414001465,
"memory(GiB)": 36.59,
"step": 1085,
"token_acc": 0.8899461794132038,
"train_speed(iter/s)": 0.118068
},
{
"epoch": 1.3403209705904937,
"grad_norm": 0.7110121250152588,
"learning_rate": 5.830160420486588e-06,
"loss": 0.29248368740081787,
"memory(GiB)": 36.59,
"step": 1090,
"token_acc": 0.905348378514747,
"train_speed(iter/s)": 0.118225
},
{
"epoch": 1.3464639483989864,
"grad_norm": 0.6436595916748047,
"learning_rate": 5.798388724265085e-06,
"loss": 0.3002151966094971,
"memory(GiB)": 39.06,
"step": 1095,
"token_acc": 0.9053737339917971,
"train_speed(iter/s)": 0.118367
},
{
"epoch": 1.3526069262074791,
"grad_norm": 0.7013940215110779,
"learning_rate": 5.7665839127166475e-06,
"loss": 0.3010303020477295,
"memory(GiB)": 39.06,
"step": 1100,
"token_acc": 0.9023475037752253,
"train_speed(iter/s)": 0.118479
},
{
"epoch": 1.3526069262074791,
"eval_loss": 0.3555811047554016,
"eval_runtime": 31.0333,
"eval_samples_per_second": 16.95,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8887712926934717,
"step": 1100
},
{
"epoch": 1.3587499040159718,
"grad_norm": 0.7001612186431885,
"learning_rate": 5.734747305031664e-06,
"loss": 0.3120265483856201,
"memory(GiB)": 39.06,
"step": 1105,
"token_acc": 0.8886269689596821,
"train_speed(iter/s)": 0.118091
},
{
"epoch": 1.3648928818244643,
"grad_norm": 0.6804000735282898,
"learning_rate": 5.7028802217193565e-06,
"loss": 0.30517282485961916,
"memory(GiB)": 39.06,
"step": 1110,
"token_acc": 0.8981199555362235,
"train_speed(iter/s)": 0.118215
},
{
"epoch": 1.371035859632957,
"grad_norm": 0.6867697834968567,
"learning_rate": 5.670983984553003e-06,
"loss": 0.3074041366577148,
"memory(GiB)": 39.06,
"step": 1115,
"token_acc": 0.903482807952247,
"train_speed(iter/s)": 0.118338
},
{
"epoch": 1.3771788374414498,
"grad_norm": 0.7690563201904297,
"learning_rate": 5.63905991651512e-06,
"loss": 0.3027225971221924,
"memory(GiB)": 39.06,
"step": 1120,
"token_acc": 0.8987023004673533,
"train_speed(iter/s)": 0.118449
},
{
"epoch": 1.3771788374414498,
"eval_loss": 0.3556562066078186,
"eval_runtime": 31.1119,
"eval_samples_per_second": 16.907,
"eval_steps_per_second": 4.243,
"eval_token_acc": 0.8888612191958496,
"step": 1120
},
{
"epoch": 1.3833218152499425,
"grad_norm": 0.6769737005233765,
"learning_rate": 5.607109341742579e-06,
"loss": 0.30417637825012206,
"memory(GiB)": 39.06,
"step": 1125,
"token_acc": 0.8885960318346111,
"train_speed(iter/s)": 0.118061
},
{
"epoch": 1.3894647930584352,
"grad_norm": 0.6724239587783813,
"learning_rate": 5.575133585471697e-06,
"loss": 0.31278433799743655,
"memory(GiB)": 39.06,
"step": 1130,
"token_acc": 0.8959036584253262,
"train_speed(iter/s)": 0.118168
},
{
"epoch": 1.3956077708669277,
"grad_norm": 0.7643016576766968,
"learning_rate": 5.543133973983254e-06,
"loss": 0.29112992286682127,
"memory(GiB)": 39.06,
"step": 1135,
"token_acc": 0.9014400645633149,
"train_speed(iter/s)": 0.118301
},
{
"epoch": 1.4017507486754204,
"grad_norm": 0.6788151264190674,
"learning_rate": 5.511111834547496e-06,
"loss": 0.3165508508682251,
"memory(GiB)": 39.06,
"step": 1140,
"token_acc": 0.903283467750516,
"train_speed(iter/s)": 0.118415
},
{
"epoch": 1.4017507486754204,
"eval_loss": 0.35399720072746277,
"eval_runtime": 31.0476,
"eval_samples_per_second": 16.942,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8891137051448336,
"step": 1140
},
{
"epoch": 1.407893726483913,
"grad_norm": 0.6638893485069275,
"learning_rate": 5.479068495369071e-06,
"loss": 0.2801161289215088,
"memory(GiB)": 39.06,
"step": 1145,
"token_acc": 0.8925869894099848,
"train_speed(iter/s)": 0.118025
},
{
"epoch": 1.4140367042924058,
"grad_norm": 0.7107008099555969,
"learning_rate": 5.447005285531948e-06,
"loss": 0.29520745277404786,
"memory(GiB)": 39.06,
"step": 1150,
"token_acc": 0.9020094269412057,
"train_speed(iter/s)": 0.118132
},
{
"epoch": 1.4201796821008985,
"grad_norm": 0.6262108087539673,
"learning_rate": 5.414923534944283e-06,
"loss": 0.28986170291900637,
"memory(GiB)": 39.06,
"step": 1155,
"token_acc": 0.9047965292421047,
"train_speed(iter/s)": 0.11825
},
{
"epoch": 1.426322659909391,
"grad_norm": 0.7209280729293823,
"learning_rate": 5.38282457428326e-06,
"loss": 0.30995869636535645,
"memory(GiB)": 39.06,
"step": 1160,
"token_acc": 0.9020344876192267,
"train_speed(iter/s)": 0.118366
},
{
"epoch": 1.426322659909391,
"eval_loss": 0.3549746870994568,
"eval_runtime": 31.0688,
"eval_samples_per_second": 16.93,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8894630350194552,
"step": 1160
},
{
"epoch": 1.4324656377178837,
"grad_norm": 0.6941882371902466,
"learning_rate": 5.350709734939898e-06,
"loss": 0.313739013671875,
"memory(GiB)": 39.06,
"step": 1165,
"token_acc": 0.889407067409571,
"train_speed(iter/s)": 0.117998
},
{
"epoch": 1.4386086155263764,
"grad_norm": 0.6950980424880981,
"learning_rate": 5.318580348963826e-06,
"loss": 0.29497203826904295,
"memory(GiB)": 39.06,
"step": 1170,
"token_acc": 0.9058259992665934,
"train_speed(iter/s)": 0.118116
},
{
"epoch": 1.4447515933348691,
"grad_norm": 0.6526186466217041,
"learning_rate": 5.286437749008031e-06,
"loss": 0.29609017372131347,
"memory(GiB)": 39.06,
"step": 1175,
"token_acc": 0.9071177290528133,
"train_speed(iter/s)": 0.11824
},
{
"epoch": 1.4508945711433618,
"grad_norm": 0.6585668921470642,
"learning_rate": 5.2542832682735956e-06,
"loss": 0.2915393590927124,
"memory(GiB)": 39.06,
"step": 1180,
"token_acc": 0.8964739593006288,
"train_speed(iter/s)": 0.118376
},
{
"epoch": 1.4508945711433618,
"eval_loss": 0.35392019152641296,
"eval_runtime": 31.07,
"eval_samples_per_second": 16.93,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.889134457414613,
"step": 1180
},
{
"epoch": 1.4570375489518543,
"grad_norm": 0.680291473865509,
"learning_rate": 5.222118240454376e-06,
"loss": 0.3221513509750366,
"memory(GiB)": 39.06,
"step": 1185,
"token_acc": 0.8858566297847655,
"train_speed(iter/s)": 0.117989
},
{
"epoch": 1.463180526760347,
"grad_norm": 0.676287055015564,
"learning_rate": 5.18994399968171e-06,
"loss": 0.303191614151001,
"memory(GiB)": 39.06,
"step": 1190,
"token_acc": 0.8928310930499115,
"train_speed(iter/s)": 0.118095
},
{
"epoch": 1.4693235045688398,
"grad_norm": 0.7134848237037659,
"learning_rate": 5.157761880469058e-06,
"loss": 0.30745644569396974,
"memory(GiB)": 39.06,
"step": 1195,
"token_acc": 0.8987542686739455,
"train_speed(iter/s)": 0.118213
},
{
"epoch": 1.4754664823773325,
"grad_norm": 0.706149160861969,
"learning_rate": 5.125573217656664e-06,
"loss": 0.3102452278137207,
"memory(GiB)": 39.06,
"step": 1200,
"token_acc": 0.9014028524666823,
"train_speed(iter/s)": 0.118318
},
{
"epoch": 1.4754664823773325,
"eval_loss": 0.35402196645736694,
"eval_runtime": 31.0702,
"eval_samples_per_second": 16.929,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8895391266753134,
"step": 1200
},
{
"epoch": 1.4816094601858252,
"grad_norm": 0.7066270112991333,
"learning_rate": 5.0933793463561855e-06,
"loss": 0.3033695936203003,
"memory(GiB)": 39.06,
"step": 1205,
"token_acc": 0.8896138651714031,
"train_speed(iter/s)": 0.117945
},
{
"epoch": 1.4877524379943177,
"grad_norm": 0.6695776581764221,
"learning_rate": 5.061181601895317e-06,
"loss": 0.30724053382873534,
"memory(GiB)": 39.06,
"step": 1210,
"token_acc": 0.9012793441808471,
"train_speed(iter/s)": 0.118065
},
{
"epoch": 1.4938954158028104,
"grad_norm": 0.7692334651947021,
"learning_rate": 5.028981319762399e-06,
"loss": 0.28596570491790774,
"memory(GiB)": 39.06,
"step": 1215,
"token_acc": 0.8964816040858792,
"train_speed(iter/s)": 0.118187
},
{
"epoch": 1.500038393611303,
"grad_norm": 0.6707490086555481,
"learning_rate": 4.996779835551035e-06,
"loss": 0.2939592838287354,
"memory(GiB)": 39.06,
"step": 1220,
"token_acc": 0.8994356329668192,
"train_speed(iter/s)": 0.118298
},
{
"epoch": 1.500038393611303,
"eval_loss": 0.35305002331733704,
"eval_runtime": 31.066,
"eval_samples_per_second": 16.932,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8896774751405102,
"step": 1220
},
{
"epoch": 1.5061813714197958,
"grad_norm": 0.7542144656181335,
"learning_rate": 4.964578484904679e-06,
"loss": 0.30585541725158694,
"memory(GiB)": 39.06,
"step": 1225,
"token_acc": 0.8881905335110271,
"train_speed(iter/s)": 0.117949
},
{
"epoch": 1.5123243492282885,
"grad_norm": 0.6754580140113831,
"learning_rate": 4.932378603461253e-06,
"loss": 0.2997127056121826,
"memory(GiB)": 39.06,
"step": 1230,
"token_acc": 0.9038497785317123,
"train_speed(iter/s)": 0.118065
},
{
"epoch": 1.518467327036781,
"grad_norm": 0.7103241682052612,
"learning_rate": 4.900181526797737e-06,
"loss": 0.29804291725158694,
"memory(GiB)": 39.06,
"step": 1235,
"token_acc": 0.8995869901910171,
"train_speed(iter/s)": 0.118167
},
{
"epoch": 1.5246103048452737,
"grad_norm": 0.6416381001472473,
"learning_rate": 4.867988590374777e-06,
"loss": 0.2915628433227539,
"memory(GiB)": 39.06,
"step": 1240,
"token_acc": 0.8995757044689388,
"train_speed(iter/s)": 0.118299
},
{
"epoch": 1.5246103048452737,
"eval_loss": 0.35335448384284973,
"eval_runtime": 31.1015,
"eval_samples_per_second": 16.912,
"eval_steps_per_second": 4.244,
"eval_token_acc": 0.8896774751405102,
"step": 1240
},
{
"epoch": 1.5307532826537664,
"grad_norm": 0.7514793872833252,
"learning_rate": 4.835801129481287e-06,
"loss": 0.305086350440979,
"memory(GiB)": 39.06,
"step": 1245,
"token_acc": 0.8938343509704211,
"train_speed(iter/s)": 0.117954
},
{
"epoch": 1.5368962604622591,
"grad_norm": 0.712042510509491,
"learning_rate": 4.803620479179071e-06,
"loss": 0.30651469230651857,
"memory(GiB)": 39.06,
"step": 1250,
"token_acc": 0.9019437191760952,
"train_speed(iter/s)": 0.118064
},
{
"epoch": 1.5430392382707518,
"grad_norm": 0.6950103640556335,
"learning_rate": 4.771447974247449e-06,
"loss": 0.29916160106658934,
"memory(GiB)": 39.06,
"step": 1255,
"token_acc": 0.8986829014071162,
"train_speed(iter/s)": 0.118206
},
{
"epoch": 1.5491822160792443,
"grad_norm": 0.702800452709198,
"learning_rate": 4.7392849491278825e-06,
"loss": 0.3027307987213135,
"memory(GiB)": 39.06,
"step": 1260,
"token_acc": 0.8973517128165512,
"train_speed(iter/s)": 0.118315
},
{
"epoch": 1.5491822160792443,
"eval_loss": 0.35245779156684875,
"eval_runtime": 31.0495,
"eval_samples_per_second": 16.941,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8897846952010376,
"step": 1260
},
{
"epoch": 1.555325193887737,
"grad_norm": 0.6939496397972107,
"learning_rate": 4.707132737868639e-06,
"loss": 0.30812973976135255,
"memory(GiB)": 39.06,
"step": 1265,
"token_acc": 0.8929094774646575,
"train_speed(iter/s)": 0.117991
},
{
"epoch": 1.5614681716962298,
"grad_norm": 0.6996237635612488,
"learning_rate": 4.674992674069445e-06,
"loss": 0.3079190969467163,
"memory(GiB)": 39.06,
"step": 1270,
"token_acc": 0.8922962411611463,
"train_speed(iter/s)": 0.118087
},
{
"epoch": 1.5676111495047225,
"grad_norm": 0.7096247673034668,
"learning_rate": 4.642866090826187e-06,
"loss": 0.29966809749603274,
"memory(GiB)": 39.06,
"step": 1275,
"token_acc": 0.8995864625915011,
"train_speed(iter/s)": 0.118159
},
{
"epoch": 1.5737541273132152,
"grad_norm": 0.6891176104545593,
"learning_rate": 4.610754320675603e-06,
"loss": 0.28565430641174316,
"memory(GiB)": 39.06,
"step": 1280,
"token_acc": 0.9035195544740737,
"train_speed(iter/s)": 0.118282
},
{
"epoch": 1.5737541273132152,
"eval_loss": 0.3529431223869324,
"eval_runtime": 31.0442,
"eval_samples_per_second": 16.944,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8897604842196282,
"step": 1280
},
{
"epoch": 1.5798971051217077,
"grad_norm": 0.6836899518966675,
"learning_rate": 4.578658695540018e-06,
"loss": 0.30156033039093016,
"memory(GiB)": 39.06,
"step": 1285,
"token_acc": 0.8901772041128856,
"train_speed(iter/s)": 0.117956
},
{
"epoch": 1.5860400829302004,
"grad_norm": 0.6600014567375183,
"learning_rate": 4.5465805466721e-06,
"loss": 0.30488083362579343,
"memory(GiB)": 39.06,
"step": 1290,
"token_acc": 0.9087979374798582,
"train_speed(iter/s)": 0.11807
},
{
"epoch": 1.592183060738693,
"grad_norm": 0.7213631272315979,
"learning_rate": 4.514521204599645e-06,
"loss": 0.30581624507904054,
"memory(GiB)": 39.06,
"step": 1295,
"token_acc": 0.9020306055757139,
"train_speed(iter/s)": 0.118174
},
{
"epoch": 1.5983260385471858,
"grad_norm": 0.6365712285041809,
"learning_rate": 4.48248199907038e-06,
"loss": 0.2971078872680664,
"memory(GiB)": 39.06,
"step": 1300,
"token_acc": 0.9063318669368791,
"train_speed(iter/s)": 0.118307
},
{
"epoch": 1.5983260385471858,
"eval_loss": 0.35123586654663086,
"eval_runtime": 31.0565,
"eval_samples_per_second": 16.937,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8903000432338953,
"step": 1300
},
{
"epoch": 1.6044690163556785,
"grad_norm": 0.7233961820602417,
"learning_rate": 4.450464258996822e-06,
"loss": 0.3078035831451416,
"memory(GiB)": 39.06,
"step": 1305,
"token_acc": 0.8908178398170103,
"train_speed(iter/s)": 0.117996
},
{
"epoch": 1.610611994164171,
"grad_norm": 0.7506811022758484,
"learning_rate": 4.418469312401141e-06,
"loss": 0.29109845161437986,
"memory(GiB)": 39.06,
"step": 1310,
"token_acc": 0.906337023704408,
"train_speed(iter/s)": 0.118097
},
{
"epoch": 1.6167549719726637,
"grad_norm": 0.7110884785652161,
"learning_rate": 4.386498486360095e-06,
"loss": 0.3077766180038452,
"memory(GiB)": 39.06,
"step": 1315,
"token_acc": 0.8983554542610717,
"train_speed(iter/s)": 0.118213
},
{
"epoch": 1.6228979497811564,
"grad_norm": 0.6889677047729492,
"learning_rate": 4.354553106949972e-06,
"loss": 0.30059351921081545,
"memory(GiB)": 39.06,
"step": 1320,
"token_acc": 0.90420160281651,
"train_speed(iter/s)": 0.118315
},
{
"epoch": 1.6228979497811564,
"eval_loss": 0.3506639003753662,
"eval_runtime": 31.0876,
"eval_samples_per_second": 16.92,
"eval_steps_per_second": 4.246,
"eval_token_acc": 0.8903450064850843,
"step": 1320
},
{
"epoch": 1.6290409275896491,
"grad_norm": 0.6659175753593445,
"learning_rate": 4.3226344991915936e-06,
"loss": 0.2960678577423096,
"memory(GiB)": 39.06,
"step": 1325,
"token_acc": 0.8925680515759312,
"train_speed(iter/s)": 0.117967
},
{
"epoch": 1.6351839053981418,
"grad_norm": 0.6886357069015503,
"learning_rate": 4.290743986995353e-06,
"loss": 0.30909056663513185,
"memory(GiB)": 39.06,
"step": 1330,
"token_acc": 0.9006650503792344,
"train_speed(iter/s)": 0.118082
},
{
"epoch": 1.6413268832066343,
"grad_norm": 0.7061545848846436,
"learning_rate": 4.258882893106308e-06,
"loss": 0.28565549850463867,
"memory(GiB)": 39.06,
"step": 1335,
"token_acc": 0.9070018118019403,
"train_speed(iter/s)": 0.118171
},
{
"epoch": 1.647469861015127,
"grad_norm": 0.7113469243049622,
"learning_rate": 4.227052539049312e-06,
"loss": 0.28241825103759766,
"memory(GiB)": 39.06,
"step": 1340,
"token_acc": 0.898852240585334,
"train_speed(iter/s)": 0.118285
},
{
"epoch": 1.647469861015127,
"eval_loss": 0.3508993089199066,
"eval_runtime": 31.0521,
"eval_samples_per_second": 16.939,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8900994379593601,
"step": 1340
},
{
"epoch": 1.6536128388236198,
"grad_norm": 0.663295567035675,
"learning_rate": 4.195254245074196e-06,
"loss": 0.2974137783050537,
"memory(GiB)": 39.06,
"step": 1345,
"token_acc": 0.8932698844323589,
"train_speed(iter/s)": 0.117947
},
{
"epoch": 1.6597558166321125,
"grad_norm": 0.6674165725708008,
"learning_rate": 4.163489330101017e-06,
"loss": 0.3030970096588135,
"memory(GiB)": 39.06,
"step": 1350,
"token_acc": 0.8978457754971743,
"train_speed(iter/s)": 0.118042
},
{
"epoch": 1.6658987944406052,
"grad_norm": 0.6563280820846558,
"learning_rate": 4.131759111665349e-06,
"loss": 0.2904500961303711,
"memory(GiB)": 39.06,
"step": 1355,
"token_acc": 0.902543907296759,
"train_speed(iter/s)": 0.118117
},
{
"epoch": 1.6720417722490977,
"grad_norm": 0.6549026370048523,
"learning_rate": 4.100064905863628e-06,
"loss": 0.2979156970977783,
"memory(GiB)": 39.06,
"step": 1360,
"token_acc": 0.8915877216849292,
"train_speed(iter/s)": 0.118213
},
{
"epoch": 1.6720417722490977,
"eval_loss": 0.3503533601760864,
"eval_runtime": 31.0554,
"eval_samples_per_second": 16.937,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8904902723735408,
"step": 1360
},
{
"epoch": 1.6781847500575904,
"grad_norm": 0.6918724179267883,
"learning_rate": 4.068408027298576e-06,
"loss": 0.2886175632476807,
"memory(GiB)": 39.06,
"step": 1365,
"token_acc": 0.8957540263543192,
"train_speed(iter/s)": 0.117895
},
{
"epoch": 1.684327727866083,
"grad_norm": 0.6951196193695068,
"learning_rate": 4.036789789024659e-06,
"loss": 0.30408420562744143,
"memory(GiB)": 39.06,
"step": 1370,
"token_acc": 0.9016488217746225,
"train_speed(iter/s)": 0.117988
},
{
"epoch": 1.6904707056745758,
"grad_norm": 0.7309929728507996,
"learning_rate": 4.00521150249364e-06,
"loss": 0.2967136144638062,
"memory(GiB)": 39.06,
"step": 1375,
"token_acc": 0.9024064171122995,
"train_speed(iter/s)": 0.1181
},
{
"epoch": 1.6966136834830685,
"grad_norm": 0.7061511278152466,
"learning_rate": 3.973674477500172e-06,
"loss": 0.3006556749343872,
"memory(GiB)": 39.06,
"step": 1380,
"token_acc": 0.9038457180411086,
"train_speed(iter/s)": 0.118226
},
{
"epoch": 1.6966136834830685,
"eval_loss": 0.3506544828414917,
"eval_runtime": 31.002,
"eval_samples_per_second": 16.967,
"eval_steps_per_second": 4.258,
"eval_token_acc": 0.8901928231733679,
"step": 1380
},
{
"epoch": 1.702756661291561,
"grad_norm": 0.696220338344574,
"learning_rate": 3.942180022127475e-06,
"loss": 0.2850822925567627,
"memory(GiB)": 39.06,
"step": 1385,
"token_acc": 0.8949225591538171,
"train_speed(iter/s)": 0.117915
},
{
"epoch": 1.7088996391000537,
"grad_norm": 0.6707799434661865,
"learning_rate": 3.910729442693077e-06,
"loss": 0.30518031120300293,
"memory(GiB)": 39.06,
"step": 1390,
"token_acc": 0.8971721087421103,
"train_speed(iter/s)": 0.118027
},
{
"epoch": 1.7150426169085464,
"grad_norm": 0.694172203540802,
"learning_rate": 3.8793240436946385e-06,
"loss": 0.29511513710021975,
"memory(GiB)": 39.06,
"step": 1395,
"token_acc": 0.9010794140323825,
"train_speed(iter/s)": 0.118112
},
{
"epoch": 1.7211855947170391,
"grad_norm": 0.6791805624961853,
"learning_rate": 3.847965127755834e-06,
"loss": 0.2960803747177124,
"memory(GiB)": 39.06,
"step": 1400,
"token_acc": 0.8956415132105685,
"train_speed(iter/s)": 0.11822
},
{
"epoch": 1.7211855947170391,
"eval_loss": 0.350666344165802,
"eval_runtime": 31.0507,
"eval_samples_per_second": 16.94,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8905456117596195,
"step": 1400
},
{
"epoch": 1.7273285725255318,
"grad_norm": 0.6747899651527405,
"learning_rate": 3.816653995572332e-06,
"loss": 0.290825629234314,
"memory(GiB)": 39.06,
"step": 1405,
"token_acc": 0.891223331082264,
"train_speed(iter/s)": 0.117914
},
{
"epoch": 1.7334715503340243,
"grad_norm": 0.660038411617279,
"learning_rate": 3.7853919458578327e-06,
"loss": 0.28858532905578616,
"memory(GiB)": 39.06,
"step": 1410,
"token_acc": 0.9013322410968354,
"train_speed(iter/s)": 0.118029
},
{
"epoch": 1.739614528142517,
"grad_norm": 0.6371601223945618,
"learning_rate": 3.7541802752902224e-06,
"loss": 0.28829474449157716,
"memory(GiB)": 39.06,
"step": 1415,
"token_acc": 0.9037818893145325,
"train_speed(iter/s)": 0.118112
},
{
"epoch": 1.7457575059510098,
"grad_norm": 0.7338966131210327,
"learning_rate": 3.723020278457763e-06,
"loss": 0.2963329076766968,
"memory(GiB)": 39.06,
"step": 1420,
"token_acc": 0.9052094407824792,
"train_speed(iter/s)": 0.118216
},
{
"epoch": 1.7457575059510098,
"eval_loss": 0.3507256507873535,
"eval_runtime": 31.0383,
"eval_samples_per_second": 16.947,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8900544747081712,
"step": 1420
},
{
"epoch": 1.7519004837595025,
"grad_norm": 0.6258969902992249,
"learning_rate": 3.6919132478054153e-06,
"loss": 0.29568450450897216,
"memory(GiB)": 39.06,
"step": 1425,
"token_acc": 0.8909262230371559,
"train_speed(iter/s)": 0.117906
},
{
"epoch": 1.7580434615679952,
"grad_norm": 0.6673945784568787,
"learning_rate": 3.6608604735812226e-06,
"loss": 0.29297194480895994,
"memory(GiB)": 39.06,
"step": 1430,
"token_acc": 0.9073745475193413,
"train_speed(iter/s)": 0.117999
},
{
"epoch": 1.7641864393764877,
"grad_norm": 0.6559710502624512,
"learning_rate": 3.629863243782799e-06,
"loss": 0.29749407768249514,
"memory(GiB)": 39.06,
"step": 1435,
"token_acc": 0.9093345763896982,
"train_speed(iter/s)": 0.118115
},
{
"epoch": 1.7703294171849804,
"grad_norm": 0.6504038572311401,
"learning_rate": 3.5989228441039024e-06,
"loss": 0.29113216400146485,
"memory(GiB)": 39.06,
"step": 1440,
"token_acc": 0.8930581191194346,
"train_speed(iter/s)": 0.118206
},
{
"epoch": 1.7703294171849804,
"eval_loss": 0.34917929768562317,
"eval_runtime": 31.0337,
"eval_samples_per_second": 16.949,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8902377864245569,
"step": 1440
},
{
"epoch": 1.776472394993473,
"grad_norm": 0.6400864720344543,
"learning_rate": 3.568040557881106e-06,
"loss": 0.2814110279083252,
"memory(GiB)": 39.06,
"step": 1445,
"token_acc": 0.8906971833959715,
"train_speed(iter/s)": 0.117931
},
{
"epoch": 1.7826153728019658,
"grad_norm": 0.7064361572265625,
"learning_rate": 3.5372176660405717e-06,
"loss": 0.3039525270462036,
"memory(GiB)": 39.06,
"step": 1450,
"token_acc": 0.9050828549515421,
"train_speed(iter/s)": 0.118013
},
{
"epoch": 1.7887583506104585,
"grad_norm": 0.6955869793891907,
"learning_rate": 3.506455447044923e-06,
"loss": 0.2821065425872803,
"memory(GiB)": 39.06,
"step": 1455,
"token_acc": 0.9053625617102223,
"train_speed(iter/s)": 0.118116
},
{
"epoch": 1.794901328418951,
"grad_norm": 0.6877216696739197,
"learning_rate": 3.4757551768402074e-06,
"loss": 0.2811419010162354,
"memory(GiB)": 39.06,
"step": 1460,
"token_acc": 0.9011031359892095,
"train_speed(iter/s)": 0.118215
},
{
"epoch": 1.794901328418951,
"eval_loss": 0.34925225377082825,
"eval_runtime": 31.0492,
"eval_samples_per_second": 16.941,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8903795936013835,
"step": 1460
},
{
"epoch": 1.8010443062274437,
"grad_norm": 0.6559416055679321,
"learning_rate": 3.4451181288029834e-06,
"loss": 0.2829850912094116,
"memory(GiB)": 39.06,
"step": 1465,
"token_acc": 0.8958890676209237,
"train_speed(iter/s)": 0.117907
},
{
"epoch": 1.8071872840359364,
"grad_norm": 0.7104200720787048,
"learning_rate": 3.4145455736874957e-06,
"loss": 0.2918513059616089,
"memory(GiB)": 39.06,
"step": 1470,
"token_acc": 0.9029460760822436,
"train_speed(iter/s)": 0.118008
},
{
"epoch": 1.8133302618444291,
"grad_norm": 0.7294064164161682,
"learning_rate": 3.3840387795729753e-06,
"loss": 0.30045604705810547,
"memory(GiB)": 39.06,
"step": 1475,
"token_acc": 0.8996919108690979,
"train_speed(iter/s)": 0.118115
},
{
"epoch": 1.8194732396529218,
"grad_norm": 0.7393286824226379,
"learning_rate": 3.353599011811037e-06,
"loss": 0.3116471767425537,
"memory(GiB)": 39.06,
"step": 1480,
"token_acc": 0.8992412297989751,
"train_speed(iter/s)": 0.118208
},
{
"epoch": 1.8194732396529218,
"eval_loss": 0.34843236207962036,
"eval_runtime": 31.0158,
"eval_samples_per_second": 16.959,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8908534370946822,
"step": 1480
},
{
"epoch": 1.8256162174614143,
"grad_norm": 0.7225602865219116,
"learning_rate": 3.323227532973193e-06,
"loss": 0.2964847326278687,
"memory(GiB)": 39.06,
"step": 1485,
"token_acc": 0.8920995259023428,
"train_speed(iter/s)": 0.117914
},
{
"epoch": 1.831759195269907,
"grad_norm": 0.7169524431228638,
"learning_rate": 3.292925602798492e-06,
"loss": 0.2890679359436035,
"memory(GiB)": 39.06,
"step": 1490,
"token_acc": 0.9052988882813924,
"train_speed(iter/s)": 0.118009
},
{
"epoch": 1.8379021730783998,
"grad_norm": 0.7271701097488403,
"learning_rate": 3.262694478141266e-06,
"loss": 0.30105009078979494,
"memory(GiB)": 39.06,
"step": 1495,
"token_acc": 0.8908968566759589,
"train_speed(iter/s)": 0.118105
},
{
"epoch": 1.8440451508868925,
"grad_norm": 0.7436238527297974,
"learning_rate": 3.2325354129189923e-06,
"loss": 0.3033268451690674,
"memory(GiB)": 39.06,
"step": 1500,
"token_acc": 0.9051588095396857,
"train_speed(iter/s)": 0.118206
},
{
"epoch": 1.8440451508868925,
"eval_loss": 0.3476485013961792,
"eval_runtime": 31.0233,
"eval_samples_per_second": 16.955,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8910609597924773,
"step": 1500
},
{
"epoch": 1.8501881286953852,
"grad_norm": 0.6778759956359863,
"learning_rate": 3.2024496580602892e-06,
"loss": 0.29907703399658203,
"memory(GiB)": 39.06,
"step": 1505,
"token_acc": 0.8934337447015377,
"train_speed(iter/s)": 0.117911
},
{
"epoch": 1.8563311065038777,
"grad_norm": 0.6664173007011414,
"learning_rate": 3.172438461453032e-06,
"loss": 0.29923856258392334,
"memory(GiB)": 39.06,
"step": 1510,
"token_acc": 0.8983641727004559,
"train_speed(iter/s)": 0.118019
},
{
"epoch": 1.8624740843123704,
"grad_norm": 0.7407649755477905,
"learning_rate": 3.142503067892594e-06,
"loss": 0.3053209066390991,
"memory(GiB)": 39.06,
"step": 1515,
"token_acc": 0.8974559495588846,
"train_speed(iter/s)": 0.118102
},
{
"epoch": 1.868617062120863,
"grad_norm": 0.7822189927101135,
"learning_rate": 3.112644719030206e-06,
"loss": 0.2917191982269287,
"memory(GiB)": 39.06,
"step": 1520,
"token_acc": 0.9052366138763197,
"train_speed(iter/s)": 0.118195
},
{
"epoch": 1.868617062120863,
"eval_loss": 0.3474676311016083,
"eval_runtime": 31.0192,
"eval_samples_per_second": 16.957,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8910090791180285,
"step": 1520
},
{
"epoch": 1.8747600399293558,
"grad_norm": 0.6843962669372559,
"learning_rate": 3.0828646533214657e-06,
"loss": 0.3129580497741699,
"memory(GiB)": 39.06,
"step": 1525,
"token_acc": 0.8910584210937568,
"train_speed(iter/s)": 0.117907
},
{
"epoch": 1.8809030177378485,
"grad_norm": 0.6650720238685608,
"learning_rate": 3.053164105974964e-06,
"loss": 0.3007251024246216,
"memory(GiB)": 39.06,
"step": 1530,
"token_acc": 0.9046979865771813,
"train_speed(iter/s)": 0.118012
},
{
"epoch": 1.887045995546341,
"grad_norm": 0.687574028968811,
"learning_rate": 3.0235443089010564e-06,
"loss": 0.2859373092651367,
"memory(GiB)": 39.06,
"step": 1535,
"token_acc": 0.9096972925400097,
"train_speed(iter/s)": 0.118098
},
{
"epoch": 1.8931889733548337,
"grad_norm": 0.6390689611434937,
"learning_rate": 2.9940064906607607e-06,
"loss": 0.28398540019989016,
"memory(GiB)": 39.06,
"step": 1540,
"token_acc": 0.9035791530035582,
"train_speed(iter/s)": 0.118191
},
{
"epoch": 1.8931889733548337,
"eval_loss": 0.3476438522338867,
"eval_runtime": 31.0517,
"eval_samples_per_second": 16.939,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8913341980112408,
"step": 1540
},
{
"epoch": 1.8993319511633264,
"grad_norm": 0.6599735617637634,
"learning_rate": 2.964551876414801e-06,
"loss": 0.27951204776763916,
"memory(GiB)": 39.06,
"step": 1545,
"token_acc": 0.8958811522271253,
"train_speed(iter/s)": 0.117923
},
{
"epoch": 1.9054749289718191,
"grad_norm": 0.6753197312355042,
"learning_rate": 2.93518168787279e-06,
"loss": 0.2956626176834106,
"memory(GiB)": 39.06,
"step": 1550,
"token_acc": 0.8987615726824576,
"train_speed(iter/s)": 0.118005
},
{
"epoch": 1.9116179067803118,
"grad_norm": 0.7011248469352722,
"learning_rate": 2.905897143242562e-06,
"loss": 0.2975893497467041,
"memory(GiB)": 39.06,
"step": 1555,
"token_acc": 0.9092895928621318,
"train_speed(iter/s)": 0.118101
},
{
"epoch": 1.9177608845888043,
"grad_norm": 0.6635907292366028,
"learning_rate": 2.8766994571796336e-06,
"loss": 0.28919239044189454,
"memory(GiB)": 39.06,
"step": 1560,
"token_acc": 0.9010686955756882,
"train_speed(iter/s)": 0.118185
},
{
"epoch": 1.9177608845888043,
"eval_loss": 0.3471442759037018,
"eval_runtime": 31.0546,
"eval_samples_per_second": 16.938,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8912892347600518,
"step": 1560
},
{
"epoch": 1.923903862397297,
"grad_norm": 0.7003067135810852,
"learning_rate": 2.8475898407368298e-06,
"loss": 0.3121751308441162,
"memory(GiB)": 39.06,
"step": 1565,
"token_acc": 0.8906831756550552,
"train_speed(iter/s)": 0.11792
},
{
"epoch": 1.9300468402057898,
"grad_norm": 0.6917641162872314,
"learning_rate": 2.8185695013140474e-06,
"loss": 0.31047801971435546,
"memory(GiB)": 39.06,
"step": 1570,
"token_acc": 0.8935967102364517,
"train_speed(iter/s)": 0.117987
},
{
"epoch": 1.9361898180142825,
"grad_norm": 0.717903196811676,
"learning_rate": 2.7896396426081844e-06,
"loss": 0.29785962104797364,
"memory(GiB)": 39.06,
"step": 1575,
"token_acc": 0.9072703838075233,
"train_speed(iter/s)": 0.118079
},
{
"epoch": 1.9423327958227752,
"grad_norm": 0.7065854072570801,
"learning_rate": 2.7608014645632e-06,
"loss": 0.2994864463806152,
"memory(GiB)": 39.06,
"step": 1580,
"token_acc": 0.8992320879224104,
"train_speed(iter/s)": 0.118176
},
{
"epoch": 1.9423327958227752,
"eval_loss": 0.34740638732910156,
"eval_runtime": 31.0367,
"eval_samples_per_second": 16.948,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8909641158668397,
"step": 1580
},
{
"epoch": 1.9484757736312677,
"grad_norm": 0.7280552387237549,
"learning_rate": 2.7320561633203567e-06,
"loss": 0.2979745864868164,
"memory(GiB)": 39.06,
"step": 1585,
"token_acc": 0.8901521037274909,
"train_speed(iter/s)": 0.11791
},
{
"epoch": 1.9546187514397604,
"grad_norm": 0.6418682336807251,
"learning_rate": 2.703404931168594e-06,
"loss": 0.2907557010650635,
"memory(GiB)": 39.06,
"step": 1590,
"token_acc": 0.8992377813256425,
"train_speed(iter/s)": 0.117995
},
{
"epoch": 1.960761729248253,
"grad_norm": 0.738042414188385,
"learning_rate": 2.6748489564950907e-06,
"loss": 0.29802637100219725,
"memory(GiB)": 39.06,
"step": 1595,
"token_acc": 0.8980035246119306,
"train_speed(iter/s)": 0.118068
},
{
"epoch": 1.9669047070567458,
"grad_norm": 0.6280907988548279,
"learning_rate": 2.6463894237359556e-06,
"loss": 0.28393306732177737,
"memory(GiB)": 39.06,
"step": 1600,
"token_acc": 0.9109865416676735,
"train_speed(iter/s)": 0.11816
},
{
"epoch": 1.9669047070567458,
"eval_loss": 0.34687539935112,
"eval_runtime": 31.0445,
"eval_samples_per_second": 16.943,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8911128404669261,
"step": 1600
},
{
"epoch": 1.9730476848652385,
"grad_norm": 0.7155392169952393,
"learning_rate": 2.618027513327116e-06,
"loss": 0.3036234378814697,
"memory(GiB)": 39.06,
"step": 1605,
"token_acc": 0.8935712088588127,
"train_speed(iter/s)": 0.117891
},
{
"epoch": 1.979190662673731,
"grad_norm": 0.7185878157615662,
"learning_rate": 2.589764401655343e-06,
"loss": 0.30625033378601074,
"memory(GiB)": 39.06,
"step": 1610,
"token_acc": 0.9087600373057938,
"train_speed(iter/s)": 0.117966
},
{
"epoch": 1.9853336404822237,
"grad_norm": 0.6735581159591675,
"learning_rate": 2.5616012610094702e-06,
"loss": 0.30725975036621095,
"memory(GiB)": 39.06,
"step": 1615,
"token_acc": 0.8961660250130988,
"train_speed(iter/s)": 0.118045
},
{
"epoch": 1.9914766182907164,
"grad_norm": 0.7557063102722168,
"learning_rate": 2.533539259531757e-06,
"loss": 0.29239468574523925,
"memory(GiB)": 39.06,
"step": 1620,
"token_acc": 0.8937711127034497,
"train_speed(iter/s)": 0.118131
},
{
"epoch": 1.9914766182907164,
"eval_loss": 0.3465479016304016,
"eval_runtime": 31.0613,
"eval_samples_per_second": 16.934,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8915348032857761,
"step": 1620
},
{
"epoch": 1.9976195960992091,
"grad_norm": 0.6903244853019714,
"learning_rate": 2.5055795611694435e-06,
"loss": 0.2919922351837158,
"memory(GiB)": 39.06,
"step": 1625,
"token_acc": 0.8967221510883483,
"train_speed(iter/s)": 0.117886
},
{
"epoch": 2.004914382246794,
"grad_norm": 0.6568908095359802,
"learning_rate": 2.4777233256264743e-06,
"loss": 0.32158265113830564,
"memory(GiB)": 39.06,
"step": 1630,
"token_acc": 0.9122267969438128,
"train_speed(iter/s)": 0.117945
},
{
"epoch": 2.011057360055287,
"grad_norm": 0.7132671475410461,
"learning_rate": 2.4499717083153975e-06,
"loss": 0.26807637214660646,
"memory(GiB)": 39.06,
"step": 1635,
"token_acc": 0.9197771990740741,
"train_speed(iter/s)": 0.118027
},
{
"epoch": 2.0172003378637795,
"grad_norm": 0.6795634627342224,
"learning_rate": 2.4223258603094295e-06,
"loss": 0.2491468906402588,
"memory(GiB)": 39.06,
"step": 1640,
"token_acc": 0.9240849211677818,
"train_speed(iter/s)": 0.118126
},
{
"epoch": 2.0172003378637795,
"eval_loss": 0.353736937046051,
"eval_runtime": 31.0562,
"eval_samples_per_second": 16.937,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8904072632944229,
"step": 1640
},
{
"epoch": 2.023343315672272,
"grad_norm": 0.6820616126060486,
"learning_rate": 2.3947869282947263e-06,
"loss": 0.24982304573059083,
"memory(GiB)": 39.06,
"step": 1645,
"token_acc": 0.8986748783803705,
"train_speed(iter/s)": 0.117854
},
{
"epoch": 2.029486293480765,
"grad_norm": 0.7354549169540405,
"learning_rate": 2.3673560545228082e-06,
"loss": 0.25387675762176515,
"memory(GiB)": 39.06,
"step": 1650,
"token_acc": 0.9141678261286763,
"train_speed(iter/s)": 0.117936
},
{
"epoch": 2.0356292712892574,
"grad_norm": 0.6763687133789062,
"learning_rate": 2.3400343767631943e-06,
"loss": 0.25168399810791015,
"memory(GiB)": 41.58,
"step": 1655,
"token_acc": 0.9232377049180328,
"train_speed(iter/s)": 0.118023
},
{
"epoch": 2.0417722490977503,
"grad_norm": 0.6416710019111633,
"learning_rate": 2.312823028256205e-06,
"loss": 0.2497392177581787,
"memory(GiB)": 41.58,
"step": 1660,
"token_acc": 0.9226366364968939,
"train_speed(iter/s)": 0.118098
},
{
"epoch": 2.0417722490977503,
"eval_loss": 0.35414808988571167,
"eval_runtime": 31.0374,
"eval_samples_per_second": 16.947,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8909952442715089,
"step": 1660
},
{
"epoch": 2.047915226906243,
"grad_norm": 0.6878734827041626,
"learning_rate": 2.2857231376659517e-06,
"loss": 0.26041717529296876,
"memory(GiB)": 41.58,
"step": 1665,
"token_acc": 0.895642282731377,
"train_speed(iter/s)": 0.117842
},
{
"epoch": 2.0540582047147353,
"grad_norm": 0.6756438612937927,
"learning_rate": 2.258735829033529e-06,
"loss": 0.2607592582702637,
"memory(GiB)": 41.58,
"step": 1670,
"token_acc": 0.904909300316729,
"train_speed(iter/s)": 0.117933
},
{
"epoch": 2.0602011825232283,
"grad_norm": 0.6508097648620605,
"learning_rate": 2.231862221730394e-06,
"loss": 0.2445054054260254,
"memory(GiB)": 41.58,
"step": 1675,
"token_acc": 0.9190018092758484,
"train_speed(iter/s)": 0.117998
},
{
"epoch": 2.0663441603317207,
"grad_norm": 0.6221520900726318,
"learning_rate": 2.2051034304119344e-06,
"loss": 0.2536668300628662,
"memory(GiB)": 41.58,
"step": 1680,
"token_acc": 0.9074535753395931,
"train_speed(iter/s)": 0.118087
},
{
"epoch": 2.0663441603317207,
"eval_loss": 0.3554106652736664,
"eval_runtime": 31.0289,
"eval_samples_per_second": 16.952,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8906770428015565,
"step": 1680
},
{
"epoch": 2.0724871381402137,
"grad_norm": 0.6437965035438538,
"learning_rate": 2.1784605649712326e-06,
"loss": 0.2540877103805542,
"memory(GiB)": 41.58,
"step": 1685,
"token_acc": 0.896763604572522,
"train_speed(iter/s)": 0.117846
},
{
"epoch": 2.078630115948706,
"grad_norm": 0.6842249631881714,
"learning_rate": 2.1519347304930317e-06,
"loss": 0.2614542007446289,
"memory(GiB)": 41.58,
"step": 1690,
"token_acc": 0.9103810036765567,
"train_speed(iter/s)": 0.117925
},
{
"epoch": 2.0847730937571987,
"grad_norm": 0.6956413388252258,
"learning_rate": 2.1255270272079044e-06,
"loss": 0.2528813362121582,
"memory(GiB)": 41.58,
"step": 1695,
"token_acc": 0.9163791495710556,
"train_speed(iter/s)": 0.118022
},
{
"epoch": 2.0909160715656916,
"grad_norm": 0.7066583037376404,
"learning_rate": 2.0992385504466075e-06,
"loss": 0.2548670291900635,
"memory(GiB)": 41.58,
"step": 1700,
"token_acc": 0.9165421398684998,
"train_speed(iter/s)": 0.118107
},
{
"epoch": 2.0909160715656916,
"eval_loss": 0.35480257868766785,
"eval_runtime": 31.1108,
"eval_samples_per_second": 16.907,
"eval_steps_per_second": 4.243,
"eval_token_acc": 0.8904660613921315,
"step": 1700
},
{
"epoch": 2.097059049374184,
"grad_norm": 0.6234432458877563,
"learning_rate": 2.0730703905946612e-06,
"loss": 0.24052574634552001,
"memory(GiB)": 41.58,
"step": 1705,
"token_acc": 0.8977333662447761,
"train_speed(iter/s)": 0.117854
},
{
"epoch": 2.103202027182677,
"grad_norm": 0.7239139080047607,
"learning_rate": 2.0470236330471125e-06,
"loss": 0.2701937437057495,
"memory(GiB)": 41.58,
"step": 1710,
"token_acc": 0.9132731300051116,
"train_speed(iter/s)": 0.117927
},
{
"epoch": 2.1093450049911695,
"grad_norm": 0.7042427062988281,
"learning_rate": 2.0210993581635257e-06,
"loss": 0.2760786533355713,
"memory(GiB)": 41.58,
"step": 1715,
"token_acc": 0.9138149259328708,
"train_speed(iter/s)": 0.118022
},
{
"epoch": 2.115487982799662,
"grad_norm": 0.6625633835792542,
"learning_rate": 1.9952986412231612e-06,
"loss": 0.2629417657852173,
"memory(GiB)": 41.58,
"step": 1720,
"token_acc": 0.9162193754622009,
"train_speed(iter/s)": 0.118081
},
{
"epoch": 2.115487982799662,
"eval_loss": 0.35503000020980835,
"eval_runtime": 31.0455,
"eval_samples_per_second": 16.943,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8903346303501946,
"step": 1720
},
{
"epoch": 2.121630960608155,
"grad_norm": 0.698905885219574,
"learning_rate": 1.9696225523803803e-06,
"loss": 0.2582688808441162,
"memory(GiB)": 41.58,
"step": 1725,
"token_acc": 0.8980166095055636,
"train_speed(iter/s)": 0.11783
},
{
"epoch": 2.1277739384166474,
"grad_norm": 0.6825575828552246,
"learning_rate": 1.944072156620261e-06,
"loss": 0.2485950469970703,
"memory(GiB)": 41.58,
"step": 1730,
"token_acc": 0.9185884165422945,
"train_speed(iter/s)": 0.117919
},
{
"epoch": 2.1339169162251403,
"grad_norm": 0.656775176525116,
"learning_rate": 1.9186485137144217e-06,
"loss": 0.26242403984069823,
"memory(GiB)": 41.58,
"step": 1735,
"token_acc": 0.9276958754348186,
"train_speed(iter/s)": 0.118001
},
{
"epoch": 2.140059894033633,
"grad_norm": 0.6787784099578857,
"learning_rate": 1.89335267817706e-06,
"loss": 0.2578416347503662,
"memory(GiB)": 41.58,
"step": 1740,
"token_acc": 0.9204126213592233,
"train_speed(iter/s)": 0.118068
},
{
"epoch": 2.140059894033633,
"eval_loss": 0.35603559017181396,
"eval_runtime": 31.0997,
"eval_samples_per_second": 16.913,
"eval_steps_per_second": 4.244,
"eval_token_acc": 0.8905179420665802,
"step": 1740
},
{
"epoch": 2.1462028718421253,
"grad_norm": 0.705270528793335,
"learning_rate": 1.8681856992212211e-06,
"loss": 0.27148022651672366,
"memory(GiB)": 41.58,
"step": 1745,
"token_acc": 0.8956282843498057,
"train_speed(iter/s)": 0.117819
},
{
"epoch": 2.1523458496506183,
"grad_norm": 0.6656559705734253,
"learning_rate": 1.8431486207152704e-06,
"loss": 0.251650071144104,
"memory(GiB)": 41.58,
"step": 1750,
"token_acc": 0.9161503405192278,
"train_speed(iter/s)": 0.117892
},
{
"epoch": 2.1584888274591107,
"grad_norm": 0.6367560625076294,
"learning_rate": 1.8182424811396131e-06,
"loss": 0.24891986846923828,
"memory(GiB)": 41.58,
"step": 1755,
"token_acc": 0.917142553869016,
"train_speed(iter/s)": 0.117962
},
{
"epoch": 2.1646318052676037,
"grad_norm": 0.7008864283561707,
"learning_rate": 1.7934683135435993e-06,
"loss": 0.25353493690490725,
"memory(GiB)": 41.58,
"step": 1760,
"token_acc": 0.9114828452290961,
"train_speed(iter/s)": 0.118051
},
{
"epoch": 2.1646318052676037,
"eval_loss": 0.35659661889076233,
"eval_runtime": 31.0459,
"eval_samples_per_second": 16.943,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8904626026805015,
"step": 1760
},
{
"epoch": 2.170774783076096,
"grad_norm": 0.6810339093208313,
"learning_rate": 1.7688271455026867e-06,
"loss": 0.25748143196105955,
"memory(GiB)": 41.58,
"step": 1765,
"token_acc": 0.8993780164502753,
"train_speed(iter/s)": 0.117817
},
{
"epoch": 2.1769177608845887,
"grad_norm": 0.701768696308136,
"learning_rate": 1.7443199990758168e-06,
"loss": 0.25628554821014404,
"memory(GiB)": 41.58,
"step": 1770,
"token_acc": 0.9092464549396461,
"train_speed(iter/s)": 0.117899
},
{
"epoch": 2.1830607386930816,
"grad_norm": 0.6798021793365479,
"learning_rate": 1.7199478907630269e-06,
"loss": 0.25238001346588135,
"memory(GiB)": 41.58,
"step": 1775,
"token_acc": 0.9152910102820488,
"train_speed(iter/s)": 0.117983
},
{
"epoch": 2.189203716501574,
"grad_norm": 0.7590020895004272,
"learning_rate": 1.6957118314632825e-06,
"loss": 0.26000936031341554,
"memory(GiB)": 41.58,
"step": 1780,
"token_acc": 0.9114106063560148,
"train_speed(iter/s)": 0.118075
},
{
"epoch": 2.189203716501574,
"eval_loss": 0.3557458817958832,
"eval_runtime": 31.0956,
"eval_samples_per_second": 16.916,
"eval_steps_per_second": 4.245,
"eval_token_acc": 0.8905248594898401,
"step": 1780
},
{
"epoch": 2.195346694310067,
"grad_norm": 0.690200924873352,
"learning_rate": 1.6716128264325477e-06,
"loss": 0.26896276473999026,
"memory(GiB)": 41.58,
"step": 1785,
"token_acc": 0.8972400913052501,
"train_speed(iter/s)": 0.117847
},
{
"epoch": 2.2014896721185595,
"grad_norm": 0.7046708464622498,
"learning_rate": 1.64765187524209e-06,
"loss": 0.2622739315032959,
"memory(GiB)": 41.58,
"step": 1790,
"token_acc": 0.9040114613180515,
"train_speed(iter/s)": 0.117916
},
{
"epoch": 2.207632649927052,
"grad_norm": 0.6468427181243896,
"learning_rate": 1.6238299717370254e-06,
"loss": 0.25573272705078126,
"memory(GiB)": 41.58,
"step": 1795,
"token_acc": 0.913803724588921,
"train_speed(iter/s)": 0.117988
},
{
"epoch": 2.213775627735545,
"grad_norm": 0.6906710863113403,
"learning_rate": 1.6001481039950872e-06,
"loss": 0.24774715900421143,
"memory(GiB)": 41.58,
"step": 1800,
"token_acc": 0.9198941998866428,
"train_speed(iter/s)": 0.118059
},
{
"epoch": 2.213775627735545,
"eval_loss": 0.3556331396102905,
"eval_runtime": 31.0281,
"eval_samples_per_second": 16.952,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8904798962386511,
"step": 1800
},
{
"epoch": 2.2199186055440374,
"grad_norm": 0.67650306224823,
"learning_rate": 1.5766072542856525e-06,
"loss": 0.2552159070968628,
"memory(GiB)": 41.58,
"step": 1805,
"token_acc": 0.8967117243311388,
"train_speed(iter/s)": 0.117823
},
{
"epoch": 2.2260615833525303,
"grad_norm": 0.6951079368591309,
"learning_rate": 1.5532083990289892e-06,
"loss": 0.25490808486938477,
"memory(GiB)": 41.58,
"step": 1810,
"token_acc": 0.9191604784561341,
"train_speed(iter/s)": 0.117913
},
{
"epoch": 2.232204561161023,
"grad_norm": 0.6896148920059204,
"learning_rate": 1.5299525087557682e-06,
"loss": 0.2403803586959839,
"memory(GiB)": 41.58,
"step": 1815,
"token_acc": 0.9143227478937136,
"train_speed(iter/s)": 0.117979
},
{
"epoch": 2.2383475389695153,
"grad_norm": 0.6858778595924377,
"learning_rate": 1.5068405480667975e-06,
"loss": 0.2647264003753662,
"memory(GiB)": 41.58,
"step": 1820,
"token_acc": 0.9243344548061508,
"train_speed(iter/s)": 0.118051
},
{
"epoch": 2.2383475389695153,
"eval_loss": 0.35514572262763977,
"eval_runtime": 31.0442,
"eval_samples_per_second": 16.944,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8905214007782101,
"step": 1820
},
{
"epoch": 2.2444905167780083,
"grad_norm": 0.7021420001983643,
"learning_rate": 1.4838734755930168e-06,
"loss": 0.2488544464111328,
"memory(GiB)": 41.58,
"step": 1825,
"token_acc": 0.8990035802096363,
"train_speed(iter/s)": 0.117822
},
{
"epoch": 2.2506334945865007,
"grad_norm": 0.7138723134994507,
"learning_rate": 1.461052243955739e-06,
"loss": 0.2516676902770996,
"memory(GiB)": 41.58,
"step": 1830,
"token_acc": 0.9070056092612484,
"train_speed(iter/s)": 0.117885
},
{
"epoch": 2.2567764723949937,
"grad_norm": 0.6612991094589233,
"learning_rate": 1.4383777997271347e-06,
"loss": 0.25036053657531737,
"memory(GiB)": 41.58,
"step": 1835,
"token_acc": 0.9232339162298808,
"train_speed(iter/s)": 0.11797
},
{
"epoch": 2.262919450203486,
"grad_norm": 0.670829176902771,
"learning_rate": 1.4158510833909688e-06,
"loss": 0.26495842933654784,
"memory(GiB)": 41.58,
"step": 1840,
"token_acc": 0.9127685871838752,
"train_speed(iter/s)": 0.118042
},
{
"epoch": 2.262919450203486,
"eval_loss": 0.35496076941490173,
"eval_runtime": 31.0316,
"eval_samples_per_second": 16.95,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.890611327280588,
"step": 1840
},
{
"epoch": 2.2690624280119787,
"grad_norm": 0.6969290971755981,
"learning_rate": 1.3934730293035935e-06,
"loss": 0.2619413614273071,
"memory(GiB)": 41.58,
"step": 1845,
"token_acc": 0.8992944915071285,
"train_speed(iter/s)": 0.117838
},
{
"epoch": 2.2752054058204716,
"grad_norm": 0.697259247303009,
"learning_rate": 1.3712445656551904e-06,
"loss": 0.26856374740600586,
"memory(GiB)": 41.58,
"step": 1850,
"token_acc": 0.9039304347826087,
"train_speed(iter/s)": 0.117916
},
{
"epoch": 2.281348383628964,
"grad_norm": 0.7025954127311707,
"learning_rate": 1.349166614431282e-06,
"loss": 0.2570216655731201,
"memory(GiB)": 41.58,
"step": 1855,
"token_acc": 0.9162639337494233,
"train_speed(iter/s)": 0.117981
},
{
"epoch": 2.287491361437457,
"grad_norm": 0.6871860027313232,
"learning_rate": 1.3272400913744744e-06,
"loss": 0.262271785736084,
"memory(GiB)": 41.58,
"step": 1860,
"token_acc": 0.9138437528688148,
"train_speed(iter/s)": 0.118061
},
{
"epoch": 2.287491361437457,
"eval_loss": 0.35484763979911804,
"eval_runtime": 31.0168,
"eval_samples_per_second": 16.959,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8907669693039343,
"step": 1860
},
{
"epoch": 2.2936343392459495,
"grad_norm": 0.6811879873275757,
"learning_rate": 1.3054659059464836e-06,
"loss": 0.2392117500305176,
"memory(GiB)": 41.58,
"step": 1865,
"token_acc": 0.901497755975368,
"train_speed(iter/s)": 0.117825
},
{
"epoch": 2.299777317054442,
"grad_norm": 0.7064546346664429,
"learning_rate": 1.2838449612904108e-06,
"loss": 0.266256046295166,
"memory(GiB)": 41.58,
"step": 1870,
"token_acc": 0.9117101026954622,
"train_speed(iter/s)": 0.117915
},
{
"epoch": 2.305920294862935,
"grad_norm": 0.7244398593902588,
"learning_rate": 1.262378154193285e-06,
"loss": 0.23866605758666992,
"memory(GiB)": 41.58,
"step": 1875,
"token_acc": 0.915842304335176,
"train_speed(iter/s)": 0.117981
},
{
"epoch": 2.3120632726714274,
"grad_norm": 0.7136631608009338,
"learning_rate": 1.2410663750488644e-06,
"loss": 0.25197710990905764,
"memory(GiB)": 41.58,
"step": 1880,
"token_acc": 0.9191310820870271,
"train_speed(iter/s)": 0.118043
},
{
"epoch": 2.3120632726714274,
"eval_loss": 0.355129599571228,
"eval_runtime": 31.0906,
"eval_samples_per_second": 16.918,
"eval_steps_per_second": 4.246,
"eval_token_acc": 0.8907254647643753,
"step": 1880
},
{
"epoch": 2.3182062504799203,
"grad_norm": 0.6782585978507996,
"learning_rate": 1.2199105078207002e-06,
"loss": 0.2743240833282471,
"memory(GiB)": 41.58,
"step": 1885,
"token_acc": 0.8939592652104051,
"train_speed(iter/s)": 0.117803
},
{
"epoch": 2.324349228288413,
"grad_norm": 0.6339967846870422,
"learning_rate": 1.1989114300054782e-06,
"loss": 0.25202603340148927,
"memory(GiB)": 41.58,
"step": 1890,
"token_acc": 0.916531565897387,
"train_speed(iter/s)": 0.117882
},
{
"epoch": 2.3304922060969053,
"grad_norm": 0.6756547689437866,
"learning_rate": 1.1780700125966232e-06,
"loss": 0.2598109722137451,
"memory(GiB)": 41.58,
"step": 1895,
"token_acc": 0.9081785893065719,
"train_speed(iter/s)": 0.117946
},
{
"epoch": 2.3366351839053983,
"grad_norm": 0.7056384086608887,
"learning_rate": 1.1573871200481634e-06,
"loss": 0.2566692352294922,
"memory(GiB)": 41.58,
"step": 1900,
"token_acc": 0.9156997782187464,
"train_speed(iter/s)": 0.118011
},
{
"epoch": 2.3366351839053983,
"eval_loss": 0.35557088255882263,
"eval_runtime": 31.0333,
"eval_samples_per_second": 16.95,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8906389969736274,
"step": 1900
},
{
"epoch": 2.3427781617138907,
"grad_norm": 0.7157571911811829,
"learning_rate": 1.136863610238887e-06,
"loss": 0.25399596691131593,
"memory(GiB)": 41.58,
"step": 1905,
"token_acc": 0.8955967995576062,
"train_speed(iter/s)": 0.117798
},
{
"epoch": 2.3489211395223837,
"grad_norm": 0.6849676370620728,
"learning_rate": 1.1165003344367465e-06,
"loss": 0.2500483512878418,
"memory(GiB)": 41.58,
"step": 1910,
"token_acc": 0.9112139701241321,
"train_speed(iter/s)": 0.11788
},
{
"epoch": 2.355064117330876,
"grad_norm": 0.6843670010566711,
"learning_rate": 1.0962981372635629e-06,
"loss": 0.24124569892883302,
"memory(GiB)": 41.58,
"step": 1915,
"token_acc": 0.9228162034548048,
"train_speed(iter/s)": 0.117963
},
{
"epoch": 2.3612070951393687,
"grad_norm": 0.6974015235900879,
"learning_rate": 1.0762578566599818e-06,
"loss": 0.24528083801269532,
"memory(GiB)": 41.58,
"step": 1920,
"token_acc": 0.9175750441436139,
"train_speed(iter/s)": 0.118052
},
{
"epoch": 2.3612070951393687,
"eval_loss": 0.3551888167858124,
"eval_runtime": 30.9708,
"eval_samples_per_second": 16.984,
"eval_steps_per_second": 4.262,
"eval_token_acc": 0.8907427583225248,
"step": 1920
},
{
"epoch": 2.3673500729478616,
"grad_norm": 0.6731058359146118,
"learning_rate": 1.056380323850722e-06,
"loss": 0.24767663478851318,
"memory(GiB)": 41.58,
"step": 1925,
"token_acc": 0.90198810396806,
"train_speed(iter/s)": 0.117814
},
{
"epoch": 2.373493050756354,
"grad_norm": 0.6461980938911438,
"learning_rate": 1.0366663633101015e-06,
"loss": 0.2535504102706909,
"memory(GiB)": 41.58,
"step": 1930,
"token_acc": 0.9234430094966145,
"train_speed(iter/s)": 0.117879
},
{
"epoch": 2.379636028564847,
"grad_norm": 0.6973277926445007,
"learning_rate": 1.0171167927278369e-06,
"loss": 0.25800695419311526,
"memory(GiB)": 41.58,
"step": 1935,
"token_acc": 0.9152892113208366,
"train_speed(iter/s)": 0.117936
},
{
"epoch": 2.3857790063733395,
"grad_norm": 0.6010280847549438,
"learning_rate": 9.977324229751245e-07,
"loss": 0.2460566520690918,
"memory(GiB)": 41.58,
"step": 1940,
"token_acc": 0.9177397229965928,
"train_speed(iter/s)": 0.117997
},
{
"epoch": 2.3857790063733395,
"eval_loss": 0.35497036576271057,
"eval_runtime": 31.0056,
"eval_samples_per_second": 16.965,
"eval_steps_per_second": 4.257,
"eval_token_acc": 0.8907565931690445,
"step": 1940
},
{
"epoch": 2.391921984181832,
"grad_norm": 0.7224907875061035,
"learning_rate": 9.785140580710106e-07,
"loss": 0.24542105197906494,
"memory(GiB)": 41.58,
"step": 1945,
"token_acc": 0.899276675757627,
"train_speed(iter/s)": 0.117779
},
{
"epoch": 2.398064961990325,
"grad_norm": 0.6951374411582947,
"learning_rate": 9.594624951490455e-07,
"loss": 0.2523444652557373,
"memory(GiB)": 41.58,
"step": 1950,
"token_acc": 0.9187413638457249,
"train_speed(iter/s)": 0.11785
},
{
"epoch": 2.4042079397988174,
"grad_norm": 0.708865761756897,
"learning_rate": 9.405785244242166e-07,
"loss": 0.2396538734436035,
"memory(GiB)": 41.58,
"step": 1955,
"token_acc": 0.9178324813918034,
"train_speed(iter/s)": 0.117923
},
{
"epoch": 2.4103509176073104,
"grad_norm": 0.6320639848709106,
"learning_rate": 9.218629291601699e-07,
"loss": 0.23296713829040527,
"memory(GiB)": 41.58,
"step": 1960,
"token_acc": 0.9257631364964948,
"train_speed(iter/s)": 0.117998
},
{
"epoch": 2.4103509176073104,
"eval_loss": 0.3550316095352173,
"eval_runtime": 31.0148,
"eval_samples_per_second": 16.96,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8910021616947686,
"step": 1960
},
{
"epoch": 2.416493895415803,
"grad_norm": 0.6433020234107971,
"learning_rate": 9.033164856367271e-07,
"loss": 0.24781334400177002,
"memory(GiB)": 41.58,
"step": 1965,
"token_acc": 0.8978867315004879,
"train_speed(iter/s)": 0.117779
},
{
"epoch": 2.4226368732242953,
"grad_norm": 0.7556272745132446,
"learning_rate": 8.849399631176825e-07,
"loss": 0.261240553855896,
"memory(GiB)": 41.58,
"step": 1970,
"token_acc": 0.9180517884878411,
"train_speed(iter/s)": 0.117846
},
{
"epoch": 2.4287798510327883,
"grad_norm": 0.6567925214767456,
"learning_rate": 8.667341238189009e-07,
"loss": 0.24376273155212402,
"memory(GiB)": 41.58,
"step": 1975,
"token_acc": 0.9204566085693536,
"train_speed(iter/s)": 0.117908
},
{
"epoch": 2.4349228288412808,
"grad_norm": 0.6730430722236633,
"learning_rate": 8.486997228767013e-07,
"loss": 0.26009833812713623,
"memory(GiB)": 41.58,
"step": 1980,
"token_acc": 0.9134192822777164,
"train_speed(iter/s)": 0.117978
},
{
"epoch": 2.4349228288412808,
"eval_loss": 0.3539762794971466,
"eval_runtime": 31.0521,
"eval_samples_per_second": 16.939,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8909018590575011,
"step": 1980
},
{
"epoch": 2.4410658066497737,
"grad_norm": 0.7423481941223145,
"learning_rate": 8.308375083165299e-07,
"loss": 0.24584083557128905,
"memory(GiB)": 41.58,
"step": 1985,
"token_acc": 0.8992385337347264,
"train_speed(iter/s)": 0.117767
},
{
"epoch": 2.447208784458266,
"grad_norm": 0.6721974015235901,
"learning_rate": 8.131482210219383e-07,
"loss": 0.251566219329834,
"memory(GiB)": 41.58,
"step": 1990,
"token_acc": 0.9197381858694643,
"train_speed(iter/s)": 0.117832
},
{
"epoch": 2.4533517622667587,
"grad_norm": 0.6605408787727356,
"learning_rate": 7.956325947038585e-07,
"loss": 0.2555187702178955,
"memory(GiB)": 41.58,
"step": 1995,
"token_acc": 0.9162280042111596,
"train_speed(iter/s)": 0.117901
},
{
"epoch": 2.4594947400752516,
"grad_norm": 0.647904098033905,
"learning_rate": 7.782913558701572e-07,
"loss": 0.2506421089172363,
"memory(GiB)": 41.58,
"step": 2000,
"token_acc": 0.9203814955936324,
"train_speed(iter/s)": 0.117965
},
{
"epoch": 2.4594947400752516,
"eval_loss": 0.3547162115573883,
"eval_runtime": 31.1783,
"eval_samples_per_second": 16.871,
"eval_steps_per_second": 4.234,
"eval_token_acc": 0.8908811067877216,
"step": 2000
},
{
"epoch": 2.465637717883744,
"grad_norm": 0.7217480540275574,
"learning_rate": 7.611252237955168e-07,
"loss": 0.24761755466461183,
"memory(GiB)": 41.58,
"step": 2005,
"token_acc": 0.8972576188708813,
"train_speed(iter/s)": 0.117755
},
{
"epoch": 2.471780695692237,
"grad_norm": 0.6904724836349487,
"learning_rate": 7.44134910491589e-07,
"loss": 0.2681485414505005,
"memory(GiB)": 41.58,
"step": 2010,
"token_acc": 0.9053991693585602,
"train_speed(iter/s)": 0.117834
},
{
"epoch": 2.4779236735007295,
"grad_norm": 0.6789990663528442,
"learning_rate": 7.273211206774711e-07,
"loss": 0.24847228527069093,
"memory(GiB)": 41.58,
"step": 2015,
"token_acc": 0.9193213372105735,
"train_speed(iter/s)": 0.117908
},
{
"epoch": 2.484066651309222,
"grad_norm": 0.7324934601783752,
"learning_rate": 7.106845517504684e-07,
"loss": 0.24457526206970215,
"memory(GiB)": 41.58,
"step": 2020,
"token_acc": 0.9162846862832077,
"train_speed(iter/s)": 0.117969
},
{
"epoch": 2.484066651309222,
"eval_loss": 0.3543083965778351,
"eval_runtime": 31.0241,
"eval_samples_per_second": 16.955,
"eval_steps_per_second": 4.255,
"eval_token_acc": 0.8908396022481626,
"step": 2020
},
{
"epoch": 2.490209629117715,
"grad_norm": 0.7012256383895874,
"learning_rate": 6.942258937571772e-07,
"loss": 0.25258448123931887,
"memory(GiB)": 41.58,
"step": 2025,
"token_acc": 0.8976666927565725,
"train_speed(iter/s)": 0.11777
},
{
"epoch": 2.4963526069262074,
"grad_norm": 0.6754176020622253,
"learning_rate": 6.779458293648506e-07,
"loss": 0.2500795841217041,
"memory(GiB)": 41.58,
"step": 2030,
"token_acc": 0.9177111716621253,
"train_speed(iter/s)": 0.117835
},
{
"epoch": 2.5024955847347004,
"grad_norm": 0.6942124962806702,
"learning_rate": 6.618450338330978e-07,
"loss": 0.245684814453125,
"memory(GiB)": 41.58,
"step": 2035,
"token_acc": 0.9162501585690727,
"train_speed(iter/s)": 0.117915
},
{
"epoch": 2.508638562543193,
"grad_norm": 0.6740065813064575,
"learning_rate": 6.459241749858619e-07,
"loss": 0.25455806255340574,
"memory(GiB)": 41.58,
"step": 2040,
"token_acc": 0.9220431950634214,
"train_speed(iter/s)": 0.117979
},
{
"epoch": 2.508638562543193,
"eval_loss": 0.35373052954673767,
"eval_runtime": 31.0821,
"eval_samples_per_second": 16.923,
"eval_steps_per_second": 4.247,
"eval_token_acc": 0.8911335927367056,
"step": 2040
},
{
"epoch": 2.5147815403516853,
"grad_norm": 0.6818024516105652,
"learning_rate": 6.301839131837284e-07,
"loss": 0.2483248233795166,
"memory(GiB)": 41.58,
"step": 2045,
"token_acc": 0.9004994038258826,
"train_speed(iter/s)": 0.117768
},
{
"epoch": 2.5209245181601783,
"grad_norm": 0.6766259074211121,
"learning_rate": 6.146249012965349e-07,
"loss": 0.25524895191192626,
"memory(GiB)": 41.58,
"step": 2050,
"token_acc": 0.9155308997100655,
"train_speed(iter/s)": 0.117834
},
{
"epoch": 2.5270674959686708,
"grad_norm": 0.6721575260162354,
"learning_rate": 5.992477846762896e-07,
"loss": 0.2647790193557739,
"memory(GiB)": 41.58,
"step": 2055,
"token_acc": 0.9044405418966383,
"train_speed(iter/s)": 0.117893
},
{
"epoch": 2.5332104737771637,
"grad_norm": 0.7143027782440186,
"learning_rate": 5.840532011303996e-07,
"loss": 0.2634526491165161,
"memory(GiB)": 41.58,
"step": 2060,
"token_acc": 0.9136083648221958,
"train_speed(iter/s)": 0.117955
},
{
"epoch": 2.5332104737771637,
"eval_loss": 0.35337749123573303,
"eval_runtime": 31.0323,
"eval_samples_per_second": 16.95,
"eval_steps_per_second": 4.254,
"eval_token_acc": 0.8909814094249892,
"step": 2060
},
{
"epoch": 2.539353451585656,
"grad_norm": 0.6832711100578308,
"learning_rate": 5.690417808952243e-07,
"loss": 0.2547764301300049,
"memory(GiB)": 41.58,
"step": 2065,
"token_acc": 0.8971203129214999,
"train_speed(iter/s)": 0.117757
},
{
"epoch": 2.5454964293941487,
"grad_norm": 0.7033362984657288,
"learning_rate": 5.542141466099271e-07,
"loss": 0.26053800582885744,
"memory(GiB)": 41.58,
"step": 2070,
"token_acc": 0.9055393728734732,
"train_speed(iter/s)": 0.117841
},
{
"epoch": 2.5516394072026416,
"grad_norm": 0.7116051912307739,
"learning_rate": 5.395709132906569e-07,
"loss": 0.25941154956817625,
"memory(GiB)": 41.58,
"step": 2075,
"token_acc": 0.920958114777396,
"train_speed(iter/s)": 0.117919
},
{
"epoch": 2.557782385011134,
"grad_norm": 0.6814519166946411,
"learning_rate": 5.251126883050333e-07,
"loss": 0.26160635948181155,
"memory(GiB)": 41.58,
"step": 2080,
"token_acc": 0.912257738587306,
"train_speed(iter/s)": 0.117989
},
{
"epoch": 2.557782385011134,
"eval_loss": 0.3543572723865509,
"eval_runtime": 31.0115,
"eval_samples_per_second": 16.961,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8910229139645482,
"step": 2080
},
{
"epoch": 2.563925362819627,
"grad_norm": 0.7511703372001648,
"learning_rate": 5.108400713469547e-07,
"loss": 0.24686145782470703,
"memory(GiB)": 41.58,
"step": 2085,
"token_acc": 0.8980160383253489,
"train_speed(iter/s)": 0.117791
},
{
"epoch": 2.5700683406281195,
"grad_norm": 0.6902100443840027,
"learning_rate": 4.967536544117263e-07,
"loss": 0.26129970550537107,
"memory(GiB)": 41.58,
"step": 2090,
"token_acc": 0.9143400153853115,
"train_speed(iter/s)": 0.117849
},
{
"epoch": 2.576211318436612,
"grad_norm": 0.759671688079834,
"learning_rate": 4.828540217715067e-07,
"loss": 0.27549381256103517,
"memory(GiB)": 41.58,
"step": 2095,
"token_acc": 0.9109081247944131,
"train_speed(iter/s)": 0.117916
},
{
"epoch": 2.582354296245105,
"grad_norm": 0.6925843954086304,
"learning_rate": 4.6914174995106863e-07,
"loss": 0.25518312454223635,
"memory(GiB)": 41.58,
"step": 2100,
"token_acc": 0.9096630452258998,
"train_speed(iter/s)": 0.117988
},
{
"epoch": 2.582354296245105,
"eval_loss": 0.3541419208049774,
"eval_runtime": 31.047,
"eval_samples_per_second": 16.942,
"eval_steps_per_second": 4.252,
"eval_token_acc": 0.8910021616947686,
"step": 2100
},
{
"epoch": 2.5884972740535974,
"grad_norm": 0.7308095693588257,
"learning_rate": 4.556174077038927e-07,
"loss": 0.2574288845062256,
"memory(GiB)": 41.58,
"step": 2105,
"token_acc": 0.899001034002444,
"train_speed(iter/s)": 0.11778
},
{
"epoch": 2.5946402518620904,
"grad_norm": 0.6761147379875183,
"learning_rate": 4.422815559885696e-07,
"loss": 0.2425455093383789,
"memory(GiB)": 41.58,
"step": 2110,
"token_acc": 0.9116659922401276,
"train_speed(iter/s)": 0.117842
},
{
"epoch": 2.600783229670583,
"grad_norm": 0.697441816329956,
"learning_rate": 4.2913474794554044e-07,
"loss": 0.2548621892929077,
"memory(GiB)": 41.58,
"step": 2115,
"token_acc": 0.9114378356971362,
"train_speed(iter/s)": 0.11791
},
{
"epoch": 2.6069262074790753,
"grad_norm": 0.667349100112915,
"learning_rate": 4.161775288741454e-07,
"loss": 0.252597713470459,
"memory(GiB)": 41.58,
"step": 2120,
"token_acc": 0.9123547788733769,
"train_speed(iter/s)": 0.117978
},
{
"epoch": 2.6069262074790753,
"eval_loss": 0.3542228639125824,
"eval_runtime": 31.0775,
"eval_samples_per_second": 16.925,
"eval_steps_per_second": 4.247,
"eval_token_acc": 0.8909537397319498,
"step": 2120
},
{
"epoch": 2.6130691852875683,
"grad_norm": 0.7039747834205627,
"learning_rate": 4.034104362100155e-07,
"loss": 0.25393052101135255,
"memory(GiB)": 41.58,
"step": 2125,
"token_acc": 0.8992231097494255,
"train_speed(iter/s)": 0.117764
},
{
"epoch": 2.6192121630960608,
"grad_norm": 0.7111782431602478,
"learning_rate": 3.9083399950277156e-07,
"loss": 0.2592860221862793,
"memory(GiB)": 41.58,
"step": 2130,
"token_acc": 0.9017042520227233,
"train_speed(iter/s)": 0.117842
},
{
"epoch": 2.6253551409045537,
"grad_norm": 0.7449079155921936,
"learning_rate": 3.7844874039406677e-07,
"loss": 0.23967378139495848,
"memory(GiB)": 41.58,
"step": 2135,
"token_acc": 0.9237554343728797,
"train_speed(iter/s)": 0.11791
},
{
"epoch": 2.631498118713046,
"grad_norm": 0.6821849346160889,
"learning_rate": 3.6625517259594566e-07,
"loss": 0.273772144317627,
"memory(GiB)": 41.58,
"step": 2140,
"token_acc": 0.9114792099290095,
"train_speed(iter/s)": 0.117984
},
{
"epoch": 2.631498118713046,
"eval_loss": 0.3543878495693207,
"eval_runtime": 31.0747,
"eval_samples_per_second": 16.927,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8910263726761781,
"step": 2140
},
{
"epoch": 2.6376410965215387,
"grad_norm": 0.7271039485931396,
"learning_rate": 3.5425380186953905e-07,
"loss": 0.2533170223236084,
"memory(GiB)": 41.58,
"step": 2145,
"token_acc": 0.8992799581191373,
"train_speed(iter/s)": 0.117788
},
{
"epoch": 2.6437840743300316,
"grad_norm": 0.6954792737960815,
"learning_rate": 3.424451260040862e-07,
"loss": 0.2587547302246094,
"memory(GiB)": 41.58,
"step": 2150,
"token_acc": 0.9252017450665703,
"train_speed(iter/s)": 0.117868
},
{
"epoch": 2.649927052138524,
"grad_norm": 0.6999133229255676,
"learning_rate": 3.3082963479628747e-07,
"loss": 0.2520002841949463,
"memory(GiB)": 41.58,
"step": 2155,
"token_acc": 0.9169615355242726,
"train_speed(iter/s)": 0.117941
},
{
"epoch": 2.656070029947017,
"grad_norm": 0.6630998253822327,
"learning_rate": 3.194078100299863e-07,
"loss": 0.2589444160461426,
"memory(GiB)": 41.58,
"step": 2160,
"token_acc": 0.9155829021582063,
"train_speed(iter/s)": 0.118006
},
{
"epoch": 2.656070029947017,
"eval_loss": 0.3538263440132141,
"eval_runtime": 31.0691,
"eval_samples_per_second": 16.93,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8909779507133593,
"step": 2160
},
{
"epoch": 2.6622130077555095,
"grad_norm": 0.6728103756904602,
"learning_rate": 3.0818012545618836e-07,
"loss": 0.243510103225708,
"memory(GiB)": 41.58,
"step": 2165,
"token_acc": 0.89773630732402,
"train_speed(iter/s)": 0.117802
},
{
"epoch": 2.668355985564002,
"grad_norm": 0.6952410936355591,
"learning_rate": 2.9714704677341055e-07,
"loss": 0.2590247631072998,
"memory(GiB)": 41.58,
"step": 2170,
"token_acc": 0.9167405790179891,
"train_speed(iter/s)": 0.117866
},
{
"epoch": 2.674498963372495,
"grad_norm": 0.6924260258674622,
"learning_rate": 2.8630903160836776e-07,
"loss": 0.25694501399993896,
"memory(GiB)": 41.58,
"step": 2175,
"token_acc": 0.9082922132627271,
"train_speed(iter/s)": 0.11794
},
{
"epoch": 2.6806419411809874,
"grad_norm": 0.6898376941680908,
"learning_rate": 2.756665294969868e-07,
"loss": 0.2537565231323242,
"memory(GiB)": 41.58,
"step": 2180,
"token_acc": 0.917653237630479,
"train_speed(iter/s)": 0.118015
},
{
"epoch": 2.6806419411809874,
"eval_loss": 0.35428422689437866,
"eval_runtime": 31.0905,
"eval_samples_per_second": 16.918,
"eval_steps_per_second": 4.246,
"eval_token_acc": 0.8910713359273671,
"step": 2180
},
{
"epoch": 2.6867849189894804,
"grad_norm": 0.6692034602165222,
"learning_rate": 2.6521998186576357e-07,
"loss": 0.24578571319580078,
"memory(GiB)": 41.58,
"step": 2185,
"token_acc": 0.9007592006264257,
"train_speed(iter/s)": 0.117803
},
{
"epoch": 2.692927896797973,
"grad_norm": 0.6597223877906799,
"learning_rate": 2.549698220134517e-07,
"loss": 0.2445077896118164,
"memory(GiB)": 41.58,
"step": 2190,
"token_acc": 0.921655840125781,
"train_speed(iter/s)": 0.117862
},
{
"epoch": 2.6990708746064653,
"grad_norm": 0.7004697322845459,
"learning_rate": 2.449164750930938e-07,
"loss": 0.24747202396392823,
"memory(GiB)": 41.58,
"step": 2195,
"token_acc": 0.9170990796945369,
"train_speed(iter/s)": 0.117919
},
{
"epoch": 2.7052138524149583,
"grad_norm": 0.6603142619132996,
"learning_rate": 2.3506035809438553e-07,
"loss": 0.25233500003814696,
"memory(GiB)": 41.58,
"step": 2200,
"token_acc": 0.9180474800634293,
"train_speed(iter/s)": 0.117989
},
{
"epoch": 2.7052138524149583,
"eval_loss": 0.35384565591812134,
"eval_runtime": 31.0643,
"eval_samples_per_second": 16.933,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8911958495460441,
"step": 2200
},
{
"epoch": 2.7113568302234508,
"grad_norm": 0.6453321576118469,
"learning_rate": 2.2540187982637628e-07,
"loss": 0.2474754571914673,
"memory(GiB)": 41.58,
"step": 2205,
"token_acc": 0.8990364613669268,
"train_speed(iter/s)": 0.117783
},
{
"epoch": 2.7174998080319437,
"grad_norm": 0.6942773461341858,
"learning_rate": 2.1594144090051728e-07,
"loss": 0.25811138153076174,
"memory(GiB)": 41.58,
"step": 2210,
"token_acc": 0.9148966602302796,
"train_speed(iter/s)": 0.117842
},
{
"epoch": 2.723642785840436,
"grad_norm": 0.687302827835083,
"learning_rate": 2.066794337140443e-07,
"loss": 0.25774784088134767,
"memory(GiB)": 41.58,
"step": 2215,
"token_acc": 0.9122162054746883,
"train_speed(iter/s)": 0.117899
},
{
"epoch": 2.7297857636489287,
"grad_norm": 0.735578715801239,
"learning_rate": 1.9761624243370026e-07,
"loss": 0.26178154945373533,
"memory(GiB)": 41.58,
"step": 2220,
"token_acc": 0.9086515587830224,
"train_speed(iter/s)": 0.117952
},
{
"epoch": 2.7297857636489287,
"eval_loss": 0.35361814498901367,
"eval_runtime": 31.0398,
"eval_samples_per_second": 16.946,
"eval_steps_per_second": 4.253,
"eval_token_acc": 0.8910920881971466,
"step": 2220
},
{
"epoch": 2.7359287414574216,
"grad_norm": 0.6561589241027832,
"learning_rate": 1.8875224297980332e-07,
"loss": 0.25756092071533204,
"memory(GiB)": 41.58,
"step": 2225,
"token_acc": 0.8958707817534339,
"train_speed(iter/s)": 0.117769
},
{
"epoch": 2.742071719265914,
"grad_norm": 0.6671420335769653,
"learning_rate": 1.800878030106501e-07,
"loss": 0.24125266075134277,
"memory(GiB)": 41.58,
"step": 2230,
"token_acc": 0.9233396163654507,
"train_speed(iter/s)": 0.117827
},
{
"epoch": 2.748214697074407,
"grad_norm": 0.7091180086135864,
"learning_rate": 1.7162328190727217e-07,
"loss": 0.25800223350524903,
"memory(GiB)": 41.58,
"step": 2235,
"token_acc": 0.9130938866210961,
"train_speed(iter/s)": 0.117897
},
{
"epoch": 2.7543576748828995,
"grad_norm": 0.7402175068855286,
"learning_rate": 1.6335903075852478e-07,
"loss": 0.2690894365310669,
"memory(GiB)": 41.58,
"step": 2240,
"token_acc": 0.9129178605539637,
"train_speed(iter/s)": 0.117956
},
{
"epoch": 2.7543576748828995,
"eval_loss": 0.35380449891090393,
"eval_runtime": 31.058,
"eval_samples_per_second": 16.936,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.891244271508863,
"step": 2240
},
{
"epoch": 2.760500652691392,
"grad_norm": 0.700340211391449,
"learning_rate": 1.552953923465267e-07,
"loss": 0.26177315711975097,
"memory(GiB)": 41.58,
"step": 2245,
"token_acc": 0.8943636286526147,
"train_speed(iter/s)": 0.11777
},
{
"epoch": 2.766643630499885,
"grad_norm": 0.6342586278915405,
"learning_rate": 1.4743270113244278e-07,
"loss": 0.23961200714111328,
"memory(GiB)": 41.58,
"step": 2250,
"token_acc": 0.9237576735224269,
"train_speed(iter/s)": 0.117824
},
{
"epoch": 2.7727866083083774,
"grad_norm": 0.627129077911377,
"learning_rate": 1.3977128324261068e-07,
"loss": 0.24526638984680177,
"memory(GiB)": 41.58,
"step": 2255,
"token_acc": 0.9125838004176283,
"train_speed(iter/s)": 0.117896
},
{
"epoch": 2.7789295861168704,
"grad_norm": 0.6337400674819946,
"learning_rate": 1.3231145645501153e-07,
"loss": 0.2480980396270752,
"memory(GiB)": 41.58,
"step": 2260,
"token_acc": 0.9186616671473897,
"train_speed(iter/s)": 0.117951
},
{
"epoch": 2.7789295861168704,
"eval_loss": 0.354061484336853,
"eval_runtime": 31.0534,
"eval_samples_per_second": 16.939,
"eval_steps_per_second": 4.251,
"eval_token_acc": 0.8911024643320363,
"step": 2260
},
{
"epoch": 2.785072563925363,
"grad_norm": 0.712088942527771,
"learning_rate": 1.2505353018609445e-07,
"loss": 0.2516076326370239,
"memory(GiB)": 41.58,
"step": 2265,
"token_acc": 0.8994283331306145,
"train_speed(iter/s)": 0.117768
},
{
"epoch": 2.7912155417338553,
"grad_norm": 0.7046364545822144,
"learning_rate": 1.1799780547793682e-07,
"loss": 0.25043492317199706,
"memory(GiB)": 41.58,
"step": 2270,
"token_acc": 0.9169777512318948,
"train_speed(iter/s)": 0.117833
},
{
"epoch": 2.7973585195423483,
"grad_norm": 0.6503071784973145,
"learning_rate": 1.111445749857626e-07,
"loss": 0.2525207757949829,
"memory(GiB)": 41.58,
"step": 2275,
"token_acc": 0.9089755560343795,
"train_speed(iter/s)": 0.117899
},
{
"epoch": 2.8035014973508408,
"grad_norm": 0.7683473229408264,
"learning_rate": 1.0449412296580252e-07,
"loss": 0.2637613534927368,
"memory(GiB)": 41.58,
"step": 2280,
"token_acc": 0.9091683159202835,
"train_speed(iter/s)": 0.117958
},
{
"epoch": 2.8035014973508408,
"eval_loss": 0.35387495160102844,
"eval_runtime": 31.012,
"eval_samples_per_second": 16.961,
"eval_steps_per_second": 4.256,
"eval_token_acc": 0.8911405101599654,
"step": 2280
},
{
"epoch": 2.8096444751593337,
"grad_norm": 0.7151490449905396,
"learning_rate": 9.804672526349979e-08,
"loss": 0.2488321304321289,
"memory(GiB)": 41.58,
"step": 2285,
"token_acc": 0.8973117200307805,
"train_speed(iter/s)": 0.117778
},
{
"epoch": 2.815787452967826,
"grad_norm": 0.730139434337616,
"learning_rate": 9.180264930207405e-08,
"loss": 0.2607487678527832,
"memory(GiB)": 41.58,
"step": 2290,
"token_acc": 0.9156902926894462,
"train_speed(iter/s)": 0.11785
},
{
"epoch": 2.8219304307763187,
"grad_norm": 0.6628730297088623,
"learning_rate": 8.576215407142652e-08,
"loss": 0.26926565170288086,
"memory(GiB)": 41.58,
"step": 2295,
"token_acc": 0.9116337769619092,
"train_speed(iter/s)": 0.11791
},
{
"epoch": 2.8280734085848116,
"grad_norm": 0.6601608991622925,
"learning_rate": 7.992549011739903e-08,
"loss": 0.2524131774902344,
"memory(GiB)": 41.58,
"step": 2300,
"token_acc": 0.9154497235075048,
"train_speed(iter/s)": 0.117965
},
{
"epoch": 2.8280734085848116,
"eval_loss": 0.35372012853622437,
"eval_runtime": 31.0582,
"eval_samples_per_second": 16.936,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.891157803718115,
"step": 2300
},
{
"epoch": 2.834216386393304,
"grad_norm": 0.7079156041145325,
"learning_rate": 7.42928995313802e-08,
"loss": 0.25153977870941163,
"memory(GiB)": 41.58,
"step": 2305,
"token_acc": 0.8977190549519733,
"train_speed(iter/s)": 0.117776
},
{
"epoch": 2.840359364201797,
"grad_norm": 0.707416296005249,
"learning_rate": 6.886461594026394e-08,
"loss": 0.24887454509735107,
"memory(GiB)": 41.58,
"step": 2310,
"token_acc": 0.9237657201262054,
"train_speed(iter/s)": 0.117827
},
{
"epoch": 2.8465023420102895,
"grad_norm": 0.6941429972648621,
"learning_rate": 6.364086449676233e-08,
"loss": 0.2661618947982788,
"memory(GiB)": 41.58,
"step": 2315,
"token_acc": 0.9116836428999401,
"train_speed(iter/s)": 0.117875
},
{
"epoch": 2.852645319818782,
"grad_norm": 0.705603301525116,
"learning_rate": 5.862186187006347e-08,
"loss": 0.251740837097168,
"memory(GiB)": 41.58,
"step": 2320,
"token_acc": 0.9094119805522429,
"train_speed(iter/s)": 0.117943
},
{
"epoch": 2.852645319818782,
"eval_loss": 0.35377010703086853,
"eval_runtime": 31.0667,
"eval_samples_per_second": 16.931,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8910367488110679,
"step": 2320
},
{
"epoch": 2.858788297627275,
"grad_norm": 0.6828641891479492,
"learning_rate": 5.3807816236846614e-08,
"loss": 0.26838877201080324,
"memory(GiB)": 41.58,
"step": 2325,
"token_acc": 0.8946421677020814,
"train_speed(iter/s)": 0.117759
},
{
"epoch": 2.8649312754357674,
"grad_norm": 0.6699286699295044,
"learning_rate": 4.919892727264508e-08,
"loss": 0.2658334493637085,
"memory(GiB)": 41.58,
"step": 2330,
"token_acc": 0.9121956642579211,
"train_speed(iter/s)": 0.117813
},
{
"epoch": 2.8710742532442604,
"grad_norm": 0.6932682394981384,
"learning_rate": 4.4795386143567375e-08,
"loss": 0.24600727558135987,
"memory(GiB)": 41.58,
"step": 2335,
"token_acc": 0.918826454010682,
"train_speed(iter/s)": 0.117881
},
{
"epoch": 2.877217231052753,
"grad_norm": 0.6962621212005615,
"learning_rate": 4.0597375498365175e-08,
"loss": 0.2586866617202759,
"memory(GiB)": 41.58,
"step": 2340,
"token_acc": 0.9217780343483908,
"train_speed(iter/s)": 0.117944
},
{
"epoch": 2.877217231052753,
"eval_loss": 0.35374194383621216,
"eval_runtime": 31.066,
"eval_samples_per_second": 16.932,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8911439688715953,
"step": 2340
},
{
"epoch": 2.8833602088612453,
"grad_norm": 0.6817741990089417,
"learning_rate": 3.6605069460858286e-08,
"loss": 0.2390669822692871,
"memory(GiB)": 41.58,
"step": 2345,
"token_acc": 0.9005924037018727,
"train_speed(iter/s)": 0.117765
},
{
"epoch": 2.8895031866697383,
"grad_norm": 0.6809601783752441,
"learning_rate": 3.281863362271487e-08,
"loss": 0.24726104736328125,
"memory(GiB)": 41.58,
"step": 2350,
"token_acc": 0.9243779025438414,
"train_speed(iter/s)": 0.117823
},
{
"epoch": 2.8956461644782308,
"grad_norm": 0.6868336200714111,
"learning_rate": 2.9238225036579693e-08,
"loss": 0.2603924036026001,
"memory(GiB)": 41.58,
"step": 2355,
"token_acc": 0.9140520341253614,
"train_speed(iter/s)": 0.117884
},
{
"epoch": 2.9017891422867237,
"grad_norm": 0.6945005655288696,
"learning_rate": 2.5863992209560484e-08,
"loss": 0.2470933675765991,
"memory(GiB)": 41.58,
"step": 2360,
"token_acc": 0.9241285200347351,
"train_speed(iter/s)": 0.117951
},
{
"epoch": 2.9017891422867237,
"eval_loss": 0.353762149810791,
"eval_runtime": 31.0637,
"eval_samples_per_second": 16.933,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.891199308257674,
"step": 2360
},
{
"epoch": 2.907932120095216,
"grad_norm": 0.6887286305427551,
"learning_rate": 2.269607509707006e-08,
"loss": 0.2686716318130493,
"memory(GiB)": 41.58,
"step": 2365,
"token_acc": 0.8963972388465724,
"train_speed(iter/s)": 0.117755
},
{
"epoch": 2.9140750979037087,
"grad_norm": 0.6807404160499573,
"learning_rate": 1.97346050970193e-08,
"loss": 0.25454580783843994,
"memory(GiB)": 41.58,
"step": 2370,
"token_acc": 0.9134818448123169,
"train_speed(iter/s)": 0.11783
},
{
"epoch": 2.9202180757122016,
"grad_norm": 0.6732537150382996,
"learning_rate": 1.69797050443693e-08,
"loss": 0.251677131652832,
"memory(GiB)": 41.58,
"step": 2375,
"token_acc": 0.9128050937389459,
"train_speed(iter/s)": 0.117893
},
{
"epoch": 2.926361053520694,
"grad_norm": 0.701576292514801,
"learning_rate": 1.4431489206034321e-08,
"loss": 0.26529679298400877,
"memory(GiB)": 41.58,
"step": 2380,
"token_acc": 0.915328677370581,
"train_speed(iter/s)": 0.117951
},
{
"epoch": 2.926361053520694,
"eval_loss": 0.3537040054798126,
"eval_runtime": 31.0696,
"eval_samples_per_second": 16.93,
"eval_steps_per_second": 4.249,
"eval_token_acc": 0.8912581063553826,
"step": 2380
},
{
"epoch": 2.932504031329187,
"grad_norm": 0.6693256497383118,
"learning_rate": 1.2090063276142261e-08,
"loss": 0.2500641107559204,
"memory(GiB)": 41.58,
"step": 2385,
"token_acc": 0.8987698849300564,
"train_speed(iter/s)": 0.117778
},
{
"epoch": 2.9386470091376795,
"grad_norm": 0.726274847984314,
"learning_rate": 9.955524371653146e-09,
"loss": 0.2546469926834106,
"memory(GiB)": 41.58,
"step": 2390,
"token_acc": 0.9140067149004587,
"train_speed(iter/s)": 0.117838
},
{
"epoch": 2.944789986946172,
"grad_norm": 0.6883347630500793,
"learning_rate": 8.02796102832848e-09,
"loss": 0.2519416570663452,
"memory(GiB)": 41.58,
"step": 2395,
"token_acc": 0.9123212139777092,
"train_speed(iter/s)": 0.117903
},
{
"epoch": 2.950932964754665,
"grad_norm": 0.7407357692718506,
"learning_rate": 6.307453197059166e-09,
"loss": 0.25615706443786623,
"memory(GiB)": 41.58,
"step": 2400,
"token_acc": 0.9138917665630704,
"train_speed(iter/s)": 0.117957
},
{
"epoch": 2.950932964754665,
"eval_loss": 0.35370346903800964,
"eval_runtime": 31.078,
"eval_samples_per_second": 16.925,
"eval_steps_per_second": 4.247,
"eval_token_acc": 0.8911854734111544,
"step": 2400
},
{
"epoch": 2.9570759425631574,
"grad_norm": 0.6578332781791687,
"learning_rate": 4.794072240550951e-09,
"loss": 0.2539684772491455,
"memory(GiB)": 41.58,
"step": 2405,
"token_acc": 0.8979478357573546,
"train_speed(iter/s)": 0.117778
},
{
"epoch": 2.9632189203716504,
"grad_norm": 0.6638470888137817,
"learning_rate": 3.487880930363452e-09,
"loss": 0.24514734745025635,
"memory(GiB)": 41.58,
"step": 2410,
"token_acc": 0.9181864403032916,
"train_speed(iter/s)": 0.117844
},
{
"epoch": 2.969361898180143,
"grad_norm": 0.7209091782569885,
"learning_rate": 2.3889334443055743e-09,
"loss": 0.24684855937957764,
"memory(GiB)": 41.58,
"step": 2415,
"token_acc": 0.9140838085792214,
"train_speed(iter/s)": 0.117913
},
{
"epoch": 2.9755048759886353,
"grad_norm": 0.651500940322876,
"learning_rate": 1.4972753641906424e-09,
"loss": 0.24752352237701417,
"memory(GiB)": 41.58,
"step": 2420,
"token_acc": 0.9205346018801677,
"train_speed(iter/s)": 0.117962
},
{
"epoch": 2.9755048759886353,
"eval_loss": 0.3538280427455902,
"eval_runtime": 31.0603,
"eval_samples_per_second": 16.935,
"eval_steps_per_second": 4.25,
"eval_token_acc": 0.8910817120622568,
"step": 2420
},
{
"epoch": 2.9816478537971283,
"grad_norm": 0.7020614147186279,
"learning_rate": 8.12943673943467e-10,
"loss": 0.2728489875793457,
"memory(GiB)": 41.58,
"step": 2425,
"token_acc": 0.8962999446979123,
"train_speed(iter/s)": 0.117785
},
{
"epoch": 2.9877908316056208,
"grad_norm": 0.6406486630439758,
"learning_rate": 3.359667580682402e-10,
"loss": 0.24820823669433595,
"memory(GiB)": 41.58,
"step": 2430,
"token_acc": 0.9153875671527245,
"train_speed(iter/s)": 0.11784
},
{
"epoch": 2.9939338094141137,
"grad_norm": 0.6937646269798279,
"learning_rate": 6.636440046892123e-11,
"loss": 0.253904914855957,
"memory(GiB)": 41.58,
"step": 2435,
"token_acc": 0.9180234572177958,
"train_speed(iter/s)": 0.117894
},
{
"epoch": 2.9988481916609078,
"eval_loss": 0.3536596894264221,
"eval_runtime": 31.0744,
"eval_samples_per_second": 16.927,
"eval_steps_per_second": 4.248,
"eval_token_acc": 0.8911750972762645,
"step": 2439
}
],
"logging_steps": 5,
"max_steps": 2439,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.1644436512416727e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}