Files
qwen2.5vl-3b-reflection-cot-7b/trainer_state.json

6013 lines
171 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": 1620,
"best_metric": 0.25625008,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v13-20250430-203547/checkpoint-1620",
"epoch": 2.9988481916609078,
"eval_steps": 20,
"global_step": 2439,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012285955616985335,
"grad_norm": 2.85188364982605,
"learning_rate": 9.99999585221637e-06,
"loss": 0.3927260637283325,
"memory(GiB)": 27.77,
"step": 1,
"token_acc": 0.9111180904522613,
"train_speed(iter/s)": 0.065127
},
{
"epoch": 0.006142977808492667,
"grad_norm": 2.0374207496643066,
"learning_rate": 9.999896305753298e-06,
"loss": 0.4172998070716858,
"memory(GiB)": 27.77,
"step": 5,
"token_acc": 0.8710935003515051,
"train_speed(iter/s)": 0.125986
},
{
"epoch": 0.012285955616985334,
"grad_norm": 1.0768085718154907,
"learning_rate": 9.99958522731419e-06,
"loss": 0.31686446666717527,
"memory(GiB)": 27.77,
"step": 10,
"token_acc": 0.9013287401574803,
"train_speed(iter/s)": 0.138597
},
{
"epoch": 0.018428933425478,
"grad_norm": 1.4088069200515747,
"learning_rate": 9.999066777585496e-06,
"loss": 0.31454758644104003,
"memory(GiB)": 27.77,
"step": 15,
"token_acc": 0.8979140839756373,
"train_speed(iter/s)": 0.146387
},
{
"epoch": 0.024571911233970668,
"grad_norm": 1.0749812126159668,
"learning_rate": 9.998340978071314e-06,
"loss": 0.31294023990631104,
"memory(GiB)": 27.77,
"step": 20,
"token_acc": 0.9126713473754597,
"train_speed(iter/s)": 0.15088
},
{
"epoch": 0.024571911233970668,
"eval_loss": 0.34731194376945496,
"eval_runtime": 30.1495,
"eval_samples_per_second": 17.446,
"eval_steps_per_second": 4.378,
"eval_token_acc": 0.9024643320363165,
"step": 20
},
{
"epoch": 0.030714889042463334,
"grad_norm": 1.2545477151870728,
"learning_rate": 9.997407858876141e-06,
"loss": 0.30952184200286864,
"memory(GiB)": 27.77,
"step": 25,
"token_acc": 0.8951860272094114,
"train_speed(iter/s)": 0.122954
},
{
"epoch": 0.036857866850956,
"grad_norm": 1.0299962759017944,
"learning_rate": 9.99626745870361e-06,
"loss": 0.28446354866027834,
"memory(GiB)": 27.77,
"step": 30,
"token_acc": 0.9173851303377625,
"train_speed(iter/s)": 0.127723
},
{
"epoch": 0.043000844659448666,
"grad_norm": 1.05655038356781,
"learning_rate": 9.994919824854899e-06,
"loss": 0.3267578125,
"memory(GiB)": 27.77,
"step": 35,
"token_acc": 0.8922404371584699,
"train_speed(iter/s)": 0.132153
},
{
"epoch": 0.049143822467941335,
"grad_norm": 1.0286122560501099,
"learning_rate": 9.993365013226757e-06,
"loss": 0.29858396053314207,
"memory(GiB)": 27.77,
"step": 40,
"token_acc": 0.8932590177726365,
"train_speed(iter/s)": 0.135375
},
{
"epoch": 0.049143822467941335,
"eval_loss": 0.32549846172332764,
"eval_runtime": 29.8986,
"eval_samples_per_second": 17.593,
"eval_steps_per_second": 4.415,
"eval_token_acc": 0.9053393860786857,
"step": 40
},
{
"epoch": 0.055286800276434005,
"grad_norm": 0.9242257475852966,
"learning_rate": 9.991603088309195e-06,
"loss": 0.2890357971191406,
"memory(GiB)": 27.77,
"step": 45,
"token_acc": 0.8943298969072165,
"train_speed(iter/s)": 0.121978
},
{
"epoch": 0.06142977808492667,
"grad_norm": 0.8638477325439453,
"learning_rate": 9.989634123182798e-06,
"loss": 0.2940737247467041,
"memory(GiB)": 27.77,
"step": 50,
"token_acc": 0.9095572243424023,
"train_speed(iter/s)": 0.125285
},
{
"epoch": 0.06757275589341934,
"grad_norm": 1.1406885385513306,
"learning_rate": 9.987458199515714e-06,
"loss": 0.2999709606170654,
"memory(GiB)": 27.77,
"step": 55,
"token_acc": 0.9033247521498603,
"train_speed(iter/s)": 0.128485
},
{
"epoch": 0.073715733701912,
"grad_norm": 1.0126744508743286,
"learning_rate": 9.985075407560247e-06,
"loss": 0.2821986675262451,
"memory(GiB)": 29.52,
"step": 60,
"token_acc": 0.9166214683694098,
"train_speed(iter/s)": 0.130304
},
{
"epoch": 0.073715733701912,
"eval_loss": 0.31559479236602783,
"eval_runtime": 29.8185,
"eval_samples_per_second": 17.64,
"eval_steps_per_second": 4.427,
"eval_token_acc": 0.9073641735120335,
"step": 60
},
{
"epoch": 0.07985871151040466,
"grad_norm": 0.9640651345252991,
"learning_rate": 9.982485846149125e-06,
"loss": 0.2909295320510864,
"memory(GiB)": 29.52,
"step": 65,
"token_acc": 0.9025612979673685,
"train_speed(iter/s)": 0.121841
},
{
"epoch": 0.08600168931889733,
"grad_norm": 1.0075881481170654,
"learning_rate": 9.979689622691393e-06,
"loss": 0.2951636791229248,
"memory(GiB)": 29.52,
"step": 70,
"token_acc": 0.9077655003069368,
"train_speed(iter/s)": 0.123796
},
{
"epoch": 0.09214466712739,
"grad_norm": 1.0915230512619019,
"learning_rate": 9.976686853167967e-06,
"loss": 0.28308849334716796,
"memory(GiB)": 29.52,
"step": 75,
"token_acc": 0.8968571616035693,
"train_speed(iter/s)": 0.125709
},
{
"epoch": 0.09828764493588267,
"grad_norm": 1.0471818447113037,
"learning_rate": 9.973477662126818e-06,
"loss": 0.2649773836135864,
"memory(GiB)": 29.52,
"step": 80,
"token_acc": 0.9152229480261289,
"train_speed(iter/s)": 0.127584
},
{
"epoch": 0.09828764493588267,
"eval_loss": 0.311291366815567,
"eval_runtime": 29.9425,
"eval_samples_per_second": 17.567,
"eval_steps_per_second": 4.408,
"eval_token_acc": 0.9086323677763366,
"step": 80
},
{
"epoch": 0.10443062274437534,
"grad_norm": 0.9933186769485474,
"learning_rate": 9.970062182677802e-06,
"loss": 0.27203946113586425,
"memory(GiB)": 29.52,
"step": 85,
"token_acc": 0.8999555278840167,
"train_speed(iter/s)": 0.121459
},
{
"epoch": 0.11057360055286801,
"grad_norm": 1.0888270139694214,
"learning_rate": 9.966440556487149e-06,
"loss": 0.27809457778930663,
"memory(GiB)": 29.52,
"step": 90,
"token_acc": 0.9105367793240556,
"train_speed(iter/s)": 0.123076
},
{
"epoch": 0.11671657836136066,
"grad_norm": 1.0842769145965576,
"learning_rate": 9.962612933771575e-06,
"loss": 0.30378289222717286,
"memory(GiB)": 29.52,
"step": 95,
"token_acc": 0.9065857885615252,
"train_speed(iter/s)": 0.124867
},
{
"epoch": 0.12285955616985333,
"grad_norm": 1.1600843667984009,
"learning_rate": 9.958579473292067e-06,
"loss": 0.2904845714569092,
"memory(GiB)": 29.52,
"step": 100,
"token_acc": 0.910148975791434,
"train_speed(iter/s)": 0.126774
},
{
"epoch": 0.12285955616985333,
"eval_loss": 0.3061583638191223,
"eval_runtime": 29.8822,
"eval_samples_per_second": 17.602,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.9093385214007782,
"step": 100
},
{
"epoch": 0.129002533978346,
"grad_norm": 1.0021681785583496,
"learning_rate": 9.95434034234728e-06,
"loss": 0.29146251678466795,
"memory(GiB)": 29.52,
"step": 105,
"token_acc": 0.9012319578712691,
"train_speed(iter/s)": 0.122309
},
{
"epoch": 0.13514551178683867,
"grad_norm": 0.9947881698608398,
"learning_rate": 9.949895716766611e-06,
"loss": 0.28587632179260253,
"memory(GiB)": 29.52,
"step": 110,
"token_acc": 0.9113877118644068,
"train_speed(iter/s)": 0.12384
},
{
"epoch": 0.14128848959533133,
"grad_norm": 0.9120563268661499,
"learning_rate": 9.945245780902899e-06,
"loss": 0.25429134368896483,
"memory(GiB)": 29.52,
"step": 115,
"token_acc": 0.9119403599818774,
"train_speed(iter/s)": 0.125132
},
{
"epoch": 0.147431467403824,
"grad_norm": 0.9895658493041992,
"learning_rate": 9.940390727624785e-06,
"loss": 0.29624483585357664,
"memory(GiB)": 29.52,
"step": 120,
"token_acc": 0.907488553000837,
"train_speed(iter/s)": 0.126349
},
{
"epoch": 0.147431467403824,
"eval_loss": 0.30357423424720764,
"eval_runtime": 29.8849,
"eval_samples_per_second": 17.601,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.9093457270500072,
"step": 120
},
{
"epoch": 0.15357444521231667,
"grad_norm": 1.0295214653015137,
"learning_rate": 9.935330758308706e-06,
"loss": 0.2756758689880371,
"memory(GiB)": 29.52,
"step": 125,
"token_acc": 0.9061426587736607,
"train_speed(iter/s)": 0.122406
},
{
"epoch": 0.15971742302080932,
"grad_norm": 0.9042087197303772,
"learning_rate": 9.93006608283054e-06,
"loss": 0.26598501205444336,
"memory(GiB)": 29.52,
"step": 130,
"token_acc": 0.9089727568107973,
"train_speed(iter/s)": 0.123416
},
{
"epoch": 0.165860400829302,
"grad_norm": 1.020095944404602,
"learning_rate": 9.924596919556917e-06,
"loss": 0.30240449905395506,
"memory(GiB)": 29.52,
"step": 135,
"token_acc": 0.8946991831137082,
"train_speed(iter/s)": 0.124801
},
{
"epoch": 0.17200337863779466,
"grad_norm": 0.8778018355369568,
"learning_rate": 9.918923495336138e-06,
"loss": 0.30482988357543944,
"memory(GiB)": 29.52,
"step": 140,
"token_acc": 0.896467782800934,
"train_speed(iter/s)": 0.125985
},
{
"epoch": 0.17200337863779466,
"eval_loss": 0.3020693063735962,
"eval_runtime": 29.8385,
"eval_samples_per_second": 17.628,
"eval_steps_per_second": 4.424,
"eval_token_acc": 0.9099077676898688,
"step": 140
},
{
"epoch": 0.17814635644628735,
"grad_norm": 0.888533890247345,
"learning_rate": 9.913046045488787e-06,
"loss": 0.28194656372070315,
"memory(GiB)": 29.52,
"step": 145,
"token_acc": 0.9068211113661646,
"train_speed(iter/s)": 0.122725
},
{
"epoch": 0.18428933425478,
"grad_norm": 0.9438475370407104,
"learning_rate": 9.906964813797955e-06,
"loss": 0.2703879356384277,
"memory(GiB)": 29.52,
"step": 150,
"token_acc": 0.9050589050589051,
"train_speed(iter/s)": 0.123734
},
{
"epoch": 0.19043231206327269,
"grad_norm": 1.0727230310440063,
"learning_rate": 9.900680052499138e-06,
"loss": 0.267763090133667,
"memory(GiB)": 29.52,
"step": 155,
"token_acc": 0.8978266300274794,
"train_speed(iter/s)": 0.124756
},
{
"epoch": 0.19657528987176534,
"grad_norm": 0.8617845773696899,
"learning_rate": 9.894192022269773e-06,
"loss": 0.2951368808746338,
"memory(GiB)": 29.52,
"step": 160,
"token_acc": 0.9030839367122553,
"train_speed(iter/s)": 0.1257
},
{
"epoch": 0.19657528987176534,
"eval_loss": 0.2993355393409729,
"eval_runtime": 29.9095,
"eval_samples_per_second": 17.586,
"eval_steps_per_second": 4.413,
"eval_token_acc": 0.9106355382619974,
"step": 160
},
{
"epoch": 0.202718267680258,
"grad_norm": 1.1480624675750732,
"learning_rate": 9.887500992218421e-06,
"loss": 0.30594232082366946,
"memory(GiB)": 29.52,
"step": 165,
"token_acc": 0.9010627678938407,
"train_speed(iter/s)": 0.122909
},
{
"epoch": 0.20886124548875068,
"grad_norm": 1.0773506164550781,
"learning_rate": 9.880607239873614e-06,
"loss": 0.2754403591156006,
"memory(GiB)": 29.52,
"step": 170,
"token_acc": 0.9078512396694215,
"train_speed(iter/s)": 0.123726
},
{
"epoch": 0.21500422329724334,
"grad_norm": 1.0344544649124146,
"learning_rate": 9.873511051172331e-06,
"loss": 0.27539350986480715,
"memory(GiB)": 29.52,
"step": 175,
"token_acc": 0.9078795220527504,
"train_speed(iter/s)": 0.124606
},
{
"epoch": 0.22114720110573602,
"grad_norm": 0.801381528377533,
"learning_rate": 9.866212720448149e-06,
"loss": 0.2653654098510742,
"memory(GiB)": 29.52,
"step": 180,
"token_acc": 0.9089000349935845,
"train_speed(iter/s)": 0.125295
},
{
"epoch": 0.22114720110573602,
"eval_loss": 0.29467687010765076,
"eval_runtime": 29.9602,
"eval_samples_per_second": 17.557,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.911284046692607,
"step": 180
},
{
"epoch": 0.22729017891422867,
"grad_norm": 0.7735671401023865,
"learning_rate": 9.85871255041903e-06,
"loss": 0.2625685691833496,
"memory(GiB)": 29.52,
"step": 185,
"token_acc": 0.9061491117110654,
"train_speed(iter/s)": 0.122679
},
{
"epoch": 0.23343315672272133,
"grad_norm": 0.8188781142234802,
"learning_rate": 9.85101085217477e-06,
"loss": 0.2741875171661377,
"memory(GiB)": 29.52,
"step": 190,
"token_acc": 0.9082752921732972,
"train_speed(iter/s)": 0.123561
},
{
"epoch": 0.239576134531214,
"grad_norm": 0.9473600387573242,
"learning_rate": 9.843107945164086e-06,
"loss": 0.2795043230056763,
"memory(GiB)": 29.52,
"step": 195,
"token_acc": 0.9222816722590006,
"train_speed(iter/s)": 0.124249
},
{
"epoch": 0.24571911233970667,
"grad_norm": 0.9036338329315186,
"learning_rate": 9.835004157181372e-06,
"loss": 0.2842700004577637,
"memory(GiB)": 29.52,
"step": 200,
"token_acc": 0.9156384193074958,
"train_speed(iter/s)": 0.125075
},
{
"epoch": 0.24571911233970667,
"eval_loss": 0.292435884475708,
"eval_runtime": 29.9157,
"eval_samples_per_second": 17.583,
"eval_steps_per_second": 4.412,
"eval_token_acc": 0.9119109381755296,
"step": 200
},
{
"epoch": 0.2518620901481993,
"grad_norm": 0.850283145904541,
"learning_rate": 9.826699824353106e-06,
"loss": 0.25057048797607423,
"memory(GiB)": 29.52,
"step": 205,
"token_acc": 0.9017097011526469,
"train_speed(iter/s)": 0.122679
},
{
"epoch": 0.258005067956692,
"grad_norm": 0.9866188168525696,
"learning_rate": 9.818195291123903e-06,
"loss": 0.2645299434661865,
"memory(GiB)": 31.97,
"step": 210,
"token_acc": 0.9247558634504632,
"train_speed(iter/s)": 0.123484
},
{
"epoch": 0.2641480457651847,
"grad_norm": 0.9569115042686462,
"learning_rate": 9.80949091024223e-06,
"loss": 0.26346535682678224,
"memory(GiB)": 31.97,
"step": 215,
"token_acc": 0.9060756912373298,
"train_speed(iter/s)": 0.1242
},
{
"epoch": 0.27029102357367735,
"grad_norm": 0.8597538471221924,
"learning_rate": 9.800587042745774e-06,
"loss": 0.24233598709106446,
"memory(GiB)": 31.97,
"step": 220,
"token_acc": 0.9219184958700315,
"train_speed(iter/s)": 0.124816
},
{
"epoch": 0.27029102357367735,
"eval_loss": 0.29206207394599915,
"eval_runtime": 29.9221,
"eval_samples_per_second": 17.579,
"eval_steps_per_second": 4.411,
"eval_token_acc": 0.9120910794062546,
"step": 220
},
{
"epoch": 0.27643400138217,
"grad_norm": 0.955256462097168,
"learning_rate": 9.791484057946465e-06,
"loss": 0.256744384765625,
"memory(GiB)": 31.97,
"step": 225,
"token_acc": 0.9052175977500594,
"train_speed(iter/s)": 0.122865
},
{
"epoch": 0.28257697919066266,
"grad_norm": 0.9161826968193054,
"learning_rate": 9.782182333415168e-06,
"loss": 0.25551562309265136,
"memory(GiB)": 31.97,
"step": 230,
"token_acc": 0.9160751966238251,
"train_speed(iter/s)": 0.123442
},
{
"epoch": 0.2887199569991553,
"grad_norm": 0.8681318759918213,
"learning_rate": 9.772682254966009e-06,
"loss": 0.27017927169799805,
"memory(GiB)": 31.97,
"step": 235,
"token_acc": 0.9036617262423714,
"train_speed(iter/s)": 0.124191
},
{
"epoch": 0.294862934807648,
"grad_norm": 1.02655029296875,
"learning_rate": 9.762984216640378e-06,
"loss": 0.2807133197784424,
"memory(GiB)": 31.97,
"step": 240,
"token_acc": 0.91136,
"train_speed(iter/s)": 0.12485
},
{
"epoch": 0.294862934807648,
"eval_loss": 0.2887011170387268,
"eval_runtime": 29.7971,
"eval_samples_per_second": 17.653,
"eval_steps_per_second": 4.43,
"eval_token_acc": 0.9119037325263006,
"step": 240
},
{
"epoch": 0.3010059126161407,
"grad_norm": 0.8672446012496948,
"learning_rate": 9.753088620690589e-06,
"loss": 0.25624737739562986,
"memory(GiB)": 31.97,
"step": 245,
"token_acc": 0.9064800901577761,
"train_speed(iter/s)": 0.122919
},
{
"epoch": 0.30714889042463334,
"grad_norm": 0.9362043142318726,
"learning_rate": 9.742995877563187e-06,
"loss": 0.2410278081893921,
"memory(GiB)": 31.97,
"step": 250,
"token_acc": 0.9145431429992814,
"train_speed(iter/s)": 0.123516
},
{
"epoch": 0.313291868233126,
"grad_norm": 0.8355256915092468,
"learning_rate": 9.732706405881931e-06,
"loss": 0.29171640872955323,
"memory(GiB)": 31.97,
"step": 255,
"token_acc": 0.9167446592065107,
"train_speed(iter/s)": 0.123982
},
{
"epoch": 0.31943484604161865,
"grad_norm": 0.9195040464401245,
"learning_rate": 9.722220632430428e-06,
"loss": 0.2701089859008789,
"memory(GiB)": 31.97,
"step": 260,
"token_acc": 0.914859208523592,
"train_speed(iter/s)": 0.124498
},
{
"epoch": 0.31943484604161865,
"eval_loss": 0.2895512878894806,
"eval_runtime": 29.7827,
"eval_samples_per_second": 17.661,
"eval_steps_per_second": 4.432,
"eval_token_acc": 0.9118460873324686,
"step": 260
},
{
"epoch": 0.32557782385011136,
"grad_norm": 0.7184414267539978,
"learning_rate": 9.711538992134427e-06,
"loss": 0.27772011756896975,
"memory(GiB)": 31.97,
"step": 265,
"token_acc": 0.9080251975547935,
"train_speed(iter/s)": 0.122689
},
{
"epoch": 0.331720801658604,
"grad_norm": 0.8985347151756287,
"learning_rate": 9.700661928043787e-06,
"loss": 0.2564595460891724,
"memory(GiB)": 31.97,
"step": 270,
"token_acc": 0.9087627174269773,
"train_speed(iter/s)": 0.123177
},
{
"epoch": 0.33786377946709667,
"grad_norm": 0.8065007925033569,
"learning_rate": 9.689589891314094e-06,
"loss": 0.25415422916412356,
"memory(GiB)": 31.97,
"step": 275,
"token_acc": 0.9148117934972614,
"train_speed(iter/s)": 0.123657
},
{
"epoch": 0.3440067572755893,
"grad_norm": 1.036281943321228,
"learning_rate": 9.678323341187956e-06,
"loss": 0.2695312023162842,
"memory(GiB)": 31.97,
"step": 280,
"token_acc": 0.9125853071055801,
"train_speed(iter/s)": 0.124107
},
{
"epoch": 0.3440067572755893,
"eval_loss": 0.28761476278305054,
"eval_runtime": 29.7589,
"eval_samples_per_second": 17.675,
"eval_steps_per_second": 4.436,
"eval_token_acc": 0.9123000432338954,
"step": 280
},
{
"epoch": 0.350149735084082,
"grad_norm": 0.9171528816223145,
"learning_rate": 9.666862744975938e-06,
"loss": 0.26874988079071044,
"memory(GiB)": 31.97,
"step": 285,
"token_acc": 0.9094196412588164,
"train_speed(iter/s)": 0.122607
},
{
"epoch": 0.3562927128925747,
"grad_norm": 0.8578206300735474,
"learning_rate": 9.655208578037198e-06,
"loss": 0.28213140964508054,
"memory(GiB)": 31.97,
"step": 290,
"token_acc": 0.9133274656042989,
"train_speed(iter/s)": 0.123162
},
{
"epoch": 0.36243569070106735,
"grad_norm": 0.9107432961463928,
"learning_rate": 9.643361323759763e-06,
"loss": 0.27148008346557617,
"memory(GiB)": 31.97,
"step": 295,
"token_acc": 0.9049124513618677,
"train_speed(iter/s)": 0.123652
},
{
"epoch": 0.36857866850956,
"grad_norm": 0.9925222396850586,
"learning_rate": 9.631321473540476e-06,
"loss": 0.2592118740081787,
"memory(GiB)": 31.97,
"step": 300,
"token_acc": 0.8945048023933239,
"train_speed(iter/s)": 0.124138
},
{
"epoch": 0.36857866850956,
"eval_loss": 0.28638342022895813,
"eval_runtime": 29.7767,
"eval_samples_per_second": 17.665,
"eval_steps_per_second": 4.433,
"eval_token_acc": 0.9123360714800404,
"step": 300
},
{
"epoch": 0.37472164631805266,
"grad_norm": 0.9066736102104187,
"learning_rate": 9.619089526764614e-06,
"loss": 0.26896276473999026,
"memory(GiB)": 31.97,
"step": 305,
"token_acc": 0.9059246028729954,
"train_speed(iter/s)": 0.12275
},
{
"epoch": 0.38086462412654537,
"grad_norm": 1.0033142566680908,
"learning_rate": 9.60666599078518e-06,
"loss": 0.2597354412078857,
"memory(GiB)": 31.97,
"step": 310,
"token_acc": 0.9248076074702221,
"train_speed(iter/s)": 0.123177
},
{
"epoch": 0.387007601935038,
"grad_norm": 0.7254430055618286,
"learning_rate": 9.59405138090186e-06,
"loss": 0.25493106842041013,
"memory(GiB)": 31.97,
"step": 315,
"token_acc": 0.9245887855378633,
"train_speed(iter/s)": 0.123726
},
{
"epoch": 0.3931505797435307,
"grad_norm": 0.9664581418037415,
"learning_rate": 9.581246220339636e-06,
"loss": 0.25707592964172366,
"memory(GiB)": 31.97,
"step": 320,
"token_acc": 0.9214029811137158,
"train_speed(iter/s)": 0.124122
},
{
"epoch": 0.3931505797435307,
"eval_loss": 0.2849082946777344,
"eval_runtime": 29.7736,
"eval_samples_per_second": 17.667,
"eval_steps_per_second": 4.433,
"eval_token_acc": 0.9127467934860931,
"step": 320
},
{
"epoch": 0.39929355755202334,
"grad_norm": 1.0068587064743042,
"learning_rate": 9.568251040227101e-06,
"loss": 0.26822853088378906,
"memory(GiB)": 31.97,
"step": 325,
"token_acc": 0.9102573583789381,
"train_speed(iter/s)": 0.122735
},
{
"epoch": 0.405436535360516,
"grad_norm": 0.9217173457145691,
"learning_rate": 9.555066379574423e-06,
"loss": 0.25938191413879397,
"memory(GiB)": 31.97,
"step": 330,
"token_acc": 0.91627231410767,
"train_speed(iter/s)": 0.123232
},
{
"epoch": 0.4115795131690087,
"grad_norm": 0.9511445760726929,
"learning_rate": 9.541692785250983e-06,
"loss": 0.2502701759338379,
"memory(GiB)": 31.97,
"step": 335,
"token_acc": 0.9267840101791963,
"train_speed(iter/s)": 0.123579
},
{
"epoch": 0.41772249097750136,
"grad_norm": 0.8599613904953003,
"learning_rate": 9.528130811962693e-06,
"loss": 0.2726857662200928,
"memory(GiB)": 31.97,
"step": 340,
"token_acc": 0.9137533709242461,
"train_speed(iter/s)": 0.124005
},
{
"epoch": 0.41772249097750136,
"eval_loss": 0.2847888171672821,
"eval_runtime": 29.6352,
"eval_samples_per_second": 17.749,
"eval_steps_per_second": 4.454,
"eval_token_acc": 0.9126459143968871,
"step": 340
},
{
"epoch": 0.423865468785994,
"grad_norm": 0.8748957514762878,
"learning_rate": 9.514381022228997e-06,
"loss": 0.2631422996520996,
"memory(GiB)": 31.97,
"step": 345,
"token_acc": 0.9046689686233298,
"train_speed(iter/s)": 0.122672
},
{
"epoch": 0.43000844659448667,
"grad_norm": 0.8299148082733154,
"learning_rate": 9.50044398635953e-06,
"loss": 0.25957283973693845,
"memory(GiB)": 31.97,
"step": 350,
"token_acc": 0.9137482867810388,
"train_speed(iter/s)": 0.123033
},
{
"epoch": 0.4361514244029793,
"grad_norm": 0.8662333488464355,
"learning_rate": 9.486320282430469e-06,
"loss": 0.25355665683746337,
"memory(GiB)": 31.97,
"step": 355,
"token_acc": 0.9042410061421469,
"train_speed(iter/s)": 0.123514
},
{
"epoch": 0.44229440221147204,
"grad_norm": 0.8986192345619202,
"learning_rate": 9.472010496260545e-06,
"loss": 0.27993130683898926,
"memory(GiB)": 31.97,
"step": 360,
"token_acc": 0.9059234866040574,
"train_speed(iter/s)": 0.12398
},
{
"epoch": 0.44229440221147204,
"eval_loss": 0.28364232182502747,
"eval_runtime": 29.7243,
"eval_samples_per_second": 17.696,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9125234183599943,
"step": 360
},
{
"epoch": 0.4484373800199647,
"grad_norm": 0.8662838339805603,
"learning_rate": 9.45751522138676e-06,
"loss": 0.2574014663696289,
"memory(GiB)": 31.97,
"step": 365,
"token_acc": 0.9092130002686006,
"train_speed(iter/s)": 0.122708
},
{
"epoch": 0.45458035782845735,
"grad_norm": 0.8905205130577087,
"learning_rate": 9.44283505903976e-06,
"loss": 0.2571540355682373,
"memory(GiB)": 31.97,
"step": 370,
"token_acc": 0.9236671451908322,
"train_speed(iter/s)": 0.12311
},
{
"epoch": 0.46072333563695,
"grad_norm": 0.8431711196899414,
"learning_rate": 9.427970618118888e-06,
"loss": 0.29239816665649415,
"memory(GiB)": 31.97,
"step": 375,
"token_acc": 0.910949410949411,
"train_speed(iter/s)": 0.123539
},
{
"epoch": 0.46686631344544266,
"grad_norm": 0.8874600529670715,
"learning_rate": 9.412922515166952e-06,
"loss": 0.2700673580169678,
"memory(GiB)": 31.97,
"step": 380,
"token_acc": 0.8923294784045315,
"train_speed(iter/s)": 0.123896
},
{
"epoch": 0.46686631344544266,
"eval_loss": 0.2839924693107605,
"eval_runtime": 29.8081,
"eval_samples_per_second": 17.646,
"eval_steps_per_second": 4.428,
"eval_token_acc": 0.9126891482922611,
"step": 380
},
{
"epoch": 0.47300929125393537,
"grad_norm": 0.9719629883766174,
"learning_rate": 9.39769137434463e-06,
"loss": 0.26788945198059083,
"memory(GiB)": 31.97,
"step": 385,
"token_acc": 0.9089390748674752,
"train_speed(iter/s)": 0.122706
},
{
"epoch": 0.479152269062428,
"grad_norm": 0.8490920066833496,
"learning_rate": 9.38227782740459e-06,
"loss": 0.26457748413085935,
"memory(GiB)": 31.97,
"step": 390,
"token_acc": 0.9092120695170061,
"train_speed(iter/s)": 0.12317
},
{
"epoch": 0.4852952468709207,
"grad_norm": 0.8913845419883728,
"learning_rate": 9.366682513665293e-06,
"loss": 0.2367623805999756,
"memory(GiB)": 31.97,
"step": 395,
"token_acc": 0.9159677595033221,
"train_speed(iter/s)": 0.123494
},
{
"epoch": 0.49143822467941334,
"grad_norm": 1.0622432231903076,
"learning_rate": 9.350906079984456e-06,
"loss": 0.29119043350219725,
"memory(GiB)": 31.97,
"step": 400,
"token_acc": 0.9054560355930219,
"train_speed(iter/s)": 0.123861
},
{
"epoch": 0.49143822467941334,
"eval_loss": 0.28296077251434326,
"eval_runtime": 29.8253,
"eval_samples_per_second": 17.636,
"eval_steps_per_second": 4.426,
"eval_token_acc": 0.9125594466061392,
"step": 400
},
{
"epoch": 0.497581202487906,
"grad_norm": 0.7695022225379944,
"learning_rate": 9.334949180732245e-06,
"loss": 0.27100481986999514,
"memory(GiB)": 31.97,
"step": 405,
"token_acc": 0.9032931397580931,
"train_speed(iter/s)": 0.122792
},
{
"epoch": 0.5037241802963986,
"grad_norm": 0.8153588175773621,
"learning_rate": 9.31881247776412e-06,
"loss": 0.24918160438537598,
"memory(GiB)": 31.97,
"step": 410,
"token_acc": 0.9269037635243568,
"train_speed(iter/s)": 0.123224
},
{
"epoch": 0.5098671581048914,
"grad_norm": 0.8573175668716431,
"learning_rate": 9.302496640393383e-06,
"loss": 0.2658379554748535,
"memory(GiB)": 31.97,
"step": 415,
"token_acc": 0.9111843654344243,
"train_speed(iter/s)": 0.12355
},
{
"epoch": 0.516010135913384,
"grad_norm": 0.8473800420761108,
"learning_rate": 9.286002345363418e-06,
"loss": 0.25906102657318114,
"memory(GiB)": 31.97,
"step": 420,
"token_acc": 0.9103914478855526,
"train_speed(iter/s)": 0.12386
},
{
"epoch": 0.516010135913384,
"eval_loss": 0.2819235920906067,
"eval_runtime": 29.7627,
"eval_samples_per_second": 17.673,
"eval_steps_per_second": 4.435,
"eval_token_acc": 0.9134313301628477,
"step": 420
},
{
"epoch": 0.5221531137218767,
"grad_norm": 0.8349065780639648,
"learning_rate": 9.26933027681963e-06,
"loss": 0.27934200763702394,
"memory(GiB)": 31.97,
"step": 425,
"token_acc": 0.9077418760931603,
"train_speed(iter/s)": 0.122817
},
{
"epoch": 0.5282960915303694,
"grad_norm": 0.7763445973396301,
"learning_rate": 9.25248112628105e-06,
"loss": 0.25652432441711426,
"memory(GiB)": 31.97,
"step": 430,
"token_acc": 0.9304649945266765,
"train_speed(iter/s)": 0.123084
},
{
"epoch": 0.534439069338862,
"grad_norm": 0.7395066022872925,
"learning_rate": 9.235455592611667e-06,
"loss": 0.2478388547897339,
"memory(GiB)": 31.97,
"step": 435,
"token_acc": 0.9145705869023341,
"train_speed(iter/s)": 0.123435
},
{
"epoch": 0.5405820471473547,
"grad_norm": 0.7695580124855042,
"learning_rate": 9.218254381991438e-06,
"loss": 0.26280052661895753,
"memory(GiB)": 31.97,
"step": 440,
"token_acc": 0.9116754512058777,
"train_speed(iter/s)": 0.123796
},
{
"epoch": 0.5405820471473547,
"eval_loss": 0.2799247205257416,
"eval_runtime": 29.6791,
"eval_samples_per_second": 17.723,
"eval_steps_per_second": 4.448,
"eval_token_acc": 0.9137844069750685,
"step": 440
},
{
"epoch": 0.5467250249558473,
"grad_norm": 0.8697651028633118,
"learning_rate": 9.200878207886995e-06,
"loss": 0.2713948726654053,
"memory(GiB)": 31.97,
"step": 445,
"token_acc": 0.9107910032853171,
"train_speed(iter/s)": 0.122757
},
{
"epoch": 0.55286800276434,
"grad_norm": 0.8758683204650879,
"learning_rate": 9.183327791022048e-06,
"loss": 0.28769237995147706,
"memory(GiB)": 31.97,
"step": 450,
"token_acc": 0.9117891241178913,
"train_speed(iter/s)": 0.123141
},
{
"epoch": 0.5590109805728327,
"grad_norm": 0.8500604033470154,
"learning_rate": 9.165603859347503e-06,
"loss": 0.28233935832977297,
"memory(GiB)": 31.97,
"step": 455,
"token_acc": 0.9127747252747253,
"train_speed(iter/s)": 0.123484
},
{
"epoch": 0.5651539583813253,
"grad_norm": 0.7201665639877319,
"learning_rate": 9.147707148011255e-06,
"loss": 0.26129984855651855,
"memory(GiB)": 31.97,
"step": 460,
"token_acc": 0.9233639048655371,
"train_speed(iter/s)": 0.123825
},
{
"epoch": 0.5651539583813253,
"eval_loss": 0.27780428528785706,
"eval_runtime": 29.6703,
"eval_samples_per_second": 17.728,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9140005764519383,
"step": 460
},
{
"epoch": 0.571296936189818,
"grad_norm": 0.9048472046852112,
"learning_rate": 9.129638399327707e-06,
"loss": 0.2747702360153198,
"memory(GiB)": 31.97,
"step": 465,
"token_acc": 0.9071566001433583,
"train_speed(iter/s)": 0.122931
},
{
"epoch": 0.5774399139983106,
"grad_norm": 0.920282244682312,
"learning_rate": 9.111398362746969e-06,
"loss": 0.25536236763000486,
"memory(GiB)": 31.97,
"step": 470,
"token_acc": 0.9256230196112015,
"train_speed(iter/s)": 0.123189
},
{
"epoch": 0.5835828918068033,
"grad_norm": 0.7103043794631958,
"learning_rate": 9.092987794823785e-06,
"loss": 0.25142607688903806,
"memory(GiB)": 31.97,
"step": 475,
"token_acc": 0.9101461736887361,
"train_speed(iter/s)": 0.1235
},
{
"epoch": 0.589725869615296,
"grad_norm": 0.962253987789154,
"learning_rate": 9.074407459186144e-06,
"loss": 0.27944207191467285,
"memory(GiB)": 31.97,
"step": 480,
"token_acc": 0.9034072816049251,
"train_speed(iter/s)": 0.123839
},
{
"epoch": 0.589725869615296,
"eval_loss": 0.2775828242301941,
"eval_runtime": 29.9011,
"eval_samples_per_second": 17.591,
"eval_steps_per_second": 4.415,
"eval_token_acc": 0.9140798385934573,
"step": 480
},
{
"epoch": 0.5958688474237887,
"grad_norm": 0.8453803658485413,
"learning_rate": 9.055658126503605e-06,
"loss": 0.25680568218231203,
"memory(GiB)": 31.97,
"step": 485,
"token_acc": 0.9067185532791576,
"train_speed(iter/s)": 0.122913
},
{
"epoch": 0.6020118252322814,
"grad_norm": 0.8572331666946411,
"learning_rate": 9.036740574455345e-06,
"loss": 0.24585814476013185,
"memory(GiB)": 31.97,
"step": 490,
"token_acc": 0.9118667917448405,
"train_speed(iter/s)": 0.123173
},
{
"epoch": 0.608154803040774,
"grad_norm": 0.764564573764801,
"learning_rate": 9.017655587697885e-06,
"loss": 0.2665162801742554,
"memory(GiB)": 31.97,
"step": 495,
"token_acc": 0.9117743676380872,
"train_speed(iter/s)": 0.12348
},
{
"epoch": 0.6142977808492667,
"grad_norm": 0.8492972254753113,
"learning_rate": 8.998403957832553e-06,
"loss": 0.24622914791107178,
"memory(GiB)": 31.97,
"step": 500,
"token_acc": 0.9276657659530243,
"train_speed(iter/s)": 0.12376
},
{
"epoch": 0.6142977808492667,
"eval_loss": 0.2776328921318054,
"eval_runtime": 29.8051,
"eval_samples_per_second": 17.648,
"eval_steps_per_second": 4.429,
"eval_token_acc": 0.9137772013258395,
"step": 500
},
{
"epoch": 0.6204407586577594,
"grad_norm": 0.8095591068267822,
"learning_rate": 8.978986483372657e-06,
"loss": 0.26657900810241697,
"memory(GiB)": 31.97,
"step": 505,
"token_acc": 0.9116968207877298,
"train_speed(iter/s)": 0.122844
},
{
"epoch": 0.626583736466252,
"grad_norm": 0.8510200381278992,
"learning_rate": 8.959403969710346e-06,
"loss": 0.2664052486419678,
"memory(GiB)": 31.97,
"step": 510,
"token_acc": 0.9214573689711811,
"train_speed(iter/s)": 0.123175
},
{
"epoch": 0.6327267142747447,
"grad_norm": 0.9017972946166992,
"learning_rate": 8.939657229083223e-06,
"loss": 0.27168979644775393,
"memory(GiB)": 31.97,
"step": 515,
"token_acc": 0.9179679028410324,
"train_speed(iter/s)": 0.123502
},
{
"epoch": 0.6388696920832373,
"grad_norm": 0.8968759179115295,
"learning_rate": 8.919747080540647e-06,
"loss": 0.2673780918121338,
"memory(GiB)": 31.97,
"step": 520,
"token_acc": 0.918267105457046,
"train_speed(iter/s)": 0.123831
},
{
"epoch": 0.6388696920832373,
"eval_loss": 0.27617883682250977,
"eval_runtime": 29.7042,
"eval_samples_per_second": 17.708,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9144112984579911,
"step": 520
},
{
"epoch": 0.64501266989173,
"grad_norm": 0.8835923075675964,
"learning_rate": 8.899674349909759e-06,
"loss": 0.25952877998352053,
"memory(GiB)": 31.97,
"step": 525,
"token_acc": 0.9108591693084734,
"train_speed(iter/s)": 0.123021
},
{
"epoch": 0.6511556477002227,
"grad_norm": 1.0378886461257935,
"learning_rate": 8.879439869761233e-06,
"loss": 0.27931737899780273,
"memory(GiB)": 31.97,
"step": 530,
"token_acc": 0.9016547678344126,
"train_speed(iter/s)": 0.123368
},
{
"epoch": 0.6572986255087153,
"grad_norm": 0.815498411655426,
"learning_rate": 8.859044479374737e-06,
"loss": 0.26363046169281007,
"memory(GiB)": 31.97,
"step": 535,
"token_acc": 0.9000225428313796,
"train_speed(iter/s)": 0.123703
},
{
"epoch": 0.663441603317208,
"grad_norm": 0.7910681962966919,
"learning_rate": 8.838489024704131e-06,
"loss": 0.25994918346405027,
"memory(GiB)": 31.97,
"step": 540,
"token_acc": 0.9154737238651347,
"train_speed(iter/s)": 0.123918
},
{
"epoch": 0.663441603317208,
"eval_loss": 0.27339863777160645,
"eval_runtime": 29.6984,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9149733390978527,
"step": 540
},
{
"epoch": 0.6695845811257006,
"grad_norm": 1.0020904541015625,
"learning_rate": 8.817774358342367e-06,
"loss": 0.25385727882385256,
"memory(GiB)": 31.97,
"step": 545,
"token_acc": 0.9148255452267112,
"train_speed(iter/s)": 0.123092
},
{
"epoch": 0.6757275589341933,
"grad_norm": 0.8850826621055603,
"learning_rate": 8.796901339486136e-06,
"loss": 0.258061146736145,
"memory(GiB)": 31.97,
"step": 550,
"token_acc": 0.9247646909183413,
"train_speed(iter/s)": 0.12338
},
{
"epoch": 0.681870536742686,
"grad_norm": 0.7173855304718018,
"learning_rate": 8.775870833900226e-06,
"loss": 0.2372835636138916,
"memory(GiB)": 31.97,
"step": 555,
"token_acc": 0.9205999329148498,
"train_speed(iter/s)": 0.12365
},
{
"epoch": 0.6880135145511787,
"grad_norm": 0.8591410517692566,
"learning_rate": 8.75468371388161e-06,
"loss": 0.2429880380630493,
"memory(GiB)": 31.97,
"step": 560,
"token_acc": 0.9182278006744099,
"train_speed(iter/s)": 0.123832
},
{
"epoch": 0.6880135145511787,
"eval_loss": 0.2735811173915863,
"eval_runtime": 29.6978,
"eval_samples_per_second": 17.712,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.914454532353365,
"step": 560
},
{
"epoch": 0.6941564923596714,
"grad_norm": 0.7596750259399414,
"learning_rate": 8.733340858223268e-06,
"loss": 0.27418644428253175,
"memory(GiB)": 31.97,
"step": 565,
"token_acc": 0.9034487711172151,
"train_speed(iter/s)": 0.123058
},
{
"epoch": 0.700299470168164,
"grad_norm": 0.9523453116416931,
"learning_rate": 8.711843152177735e-06,
"loss": 0.23236403465270997,
"memory(GiB)": 31.97,
"step": 570,
"token_acc": 0.9180918923916058,
"train_speed(iter/s)": 0.123331
},
{
"epoch": 0.7064424479766567,
"grad_norm": 0.7896727919578552,
"learning_rate": 8.690191487420385e-06,
"loss": 0.24450998306274413,
"memory(GiB)": 31.97,
"step": 575,
"token_acc": 0.9229566883477033,
"train_speed(iter/s)": 0.12356
},
{
"epoch": 0.7125854257851494,
"grad_norm": 0.9423844218254089,
"learning_rate": 8.668386762012445e-06,
"loss": 0.2612689256668091,
"memory(GiB)": 31.97,
"step": 580,
"token_acc": 0.9097009202453987,
"train_speed(iter/s)": 0.123753
},
{
"epoch": 0.7125854257851494,
"eval_loss": 0.2717309594154358,
"eval_runtime": 29.6986,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9148292261132728,
"step": 580
},
{
"epoch": 0.718728403593642,
"grad_norm": 0.8224099278450012,
"learning_rate": 8.646429880363746e-06,
"loss": 0.2574700117111206,
"memory(GiB)": 31.97,
"step": 585,
"token_acc": 0.9109293706792103,
"train_speed(iter/s)": 0.122969
},
{
"epoch": 0.7248713814021347,
"grad_norm": 0.8321871757507324,
"learning_rate": 8.624321753195209e-06,
"loss": 0.24323840141296388,
"memory(GiB)": 31.97,
"step": 590,
"token_acc": 0.9101112629318758,
"train_speed(iter/s)": 0.12323
},
{
"epoch": 0.7310143592106274,
"grad_norm": 0.7631216645240784,
"learning_rate": 8.602063297501069e-06,
"loss": 0.2646035194396973,
"memory(GiB)": 31.97,
"step": 595,
"token_acc": 0.9244347364071078,
"train_speed(iter/s)": 0.123473
},
{
"epoch": 0.73715733701912,
"grad_norm": 0.7845233678817749,
"learning_rate": 8.579655436510847e-06,
"loss": 0.24114649295806884,
"memory(GiB)": 31.97,
"step": 600,
"token_acc": 0.9176872685844488,
"train_speed(iter/s)": 0.123706
},
{
"epoch": 0.73715733701912,
"eval_loss": 0.27247732877731323,
"eval_runtime": 29.6948,
"eval_samples_per_second": 17.714,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9152687707162416,
"step": 600
},
{
"epoch": 0.7433003148276127,
"grad_norm": 0.8848146796226501,
"learning_rate": 8.557099099651046e-06,
"loss": 0.26951429843902586,
"memory(GiB)": 31.97,
"step": 605,
"token_acc": 0.9051814218282708,
"train_speed(iter/s)": 0.122981
},
{
"epoch": 0.7494432926361053,
"grad_norm": 0.7597134709358215,
"learning_rate": 8.534395222506614e-06,
"loss": 0.2650261163711548,
"memory(GiB)": 31.97,
"step": 610,
"token_acc": 0.9191399015223382,
"train_speed(iter/s)": 0.123257
},
{
"epoch": 0.755586270444598,
"grad_norm": 0.8406746983528137,
"learning_rate": 8.511544746782124e-06,
"loss": 0.266461181640625,
"memory(GiB)": 31.97,
"step": 615,
"token_acc": 0.9117747440273037,
"train_speed(iter/s)": 0.123491
},
{
"epoch": 0.7617292482530907,
"grad_norm": 0.7886612415313721,
"learning_rate": 8.488548620262722e-06,
"loss": 0.23856868743896484,
"memory(GiB)": 31.97,
"step": 620,
"token_acc": 0.9248648177219606,
"train_speed(iter/s)": 0.123722
},
{
"epoch": 0.7617292482530907,
"eval_loss": 0.27053606510162354,
"eval_runtime": 29.6607,
"eval_samples_per_second": 17.734,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.9157299322668973,
"step": 620
},
{
"epoch": 0.7678722260615833,
"grad_norm": 0.7711523771286011,
"learning_rate": 8.465407796774816e-06,
"loss": 0.23632125854492186,
"memory(GiB)": 31.97,
"step": 625,
"token_acc": 0.9121895174526754,
"train_speed(iter/s)": 0.122988
},
{
"epoch": 0.774015203870076,
"grad_norm": 0.8446414470672607,
"learning_rate": 8.442123236146509e-06,
"loss": 0.25997061729431153,
"memory(GiB)": 31.97,
"step": 630,
"token_acc": 0.922757768361582,
"train_speed(iter/s)": 0.123243
},
{
"epoch": 0.7801581816785687,
"grad_norm": 0.8071849942207336,
"learning_rate": 8.418695904167789e-06,
"loss": 0.2547910690307617,
"memory(GiB)": 31.97,
"step": 635,
"token_acc": 0.9215275839612987,
"train_speed(iter/s)": 0.123454
},
{
"epoch": 0.7863011594870614,
"grad_norm": 1.0132827758789062,
"learning_rate": 8.395126772550475e-06,
"loss": 0.2584752082824707,
"memory(GiB)": 31.97,
"step": 640,
"token_acc": 0.9119555143651529,
"train_speed(iter/s)": 0.123674
},
{
"epoch": 0.7863011594870614,
"eval_loss": 0.2703080475330353,
"eval_runtime": 29.6849,
"eval_samples_per_second": 17.719,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9151967142239515,
"step": 640
},
{
"epoch": 0.7924441372955541,
"grad_norm": 0.795028805732727,
"learning_rate": 8.371416818887907e-06,
"loss": 0.2689487934112549,
"memory(GiB)": 31.97,
"step": 645,
"token_acc": 0.906519600423793,
"train_speed(iter/s)": 0.122961
},
{
"epoch": 0.7985871151040467,
"grad_norm": 0.8361831903457642,
"learning_rate": 8.347567026614398e-06,
"loss": 0.25730276107788086,
"memory(GiB)": 31.97,
"step": 650,
"token_acc": 0.9204035220712578,
"train_speed(iter/s)": 0.123176
},
{
"epoch": 0.8047300929125394,
"grad_norm": 0.8233036994934082,
"learning_rate": 8.323578384964444e-06,
"loss": 0.2561511039733887,
"memory(GiB)": 31.97,
"step": 655,
"token_acc": 0.9101053936763794,
"train_speed(iter/s)": 0.123399
},
{
"epoch": 0.810873070721032,
"grad_norm": 0.8036513328552246,
"learning_rate": 8.299451888931696e-06,
"loss": 0.24744575023651122,
"memory(GiB)": 31.97,
"step": 660,
"token_acc": 0.9082344368103269,
"train_speed(iter/s)": 0.123638
},
{
"epoch": 0.810873070721032,
"eval_loss": 0.2683682441711426,
"eval_runtime": 29.6531,
"eval_samples_per_second": 17.738,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9163135898544459,
"step": 660
},
{
"epoch": 0.8170160485295247,
"grad_norm": 0.7272213697433472,
"learning_rate": 8.275188539227687e-06,
"loss": 0.23523108959197997,
"memory(GiB)": 31.97,
"step": 665,
"token_acc": 0.9134302376185917,
"train_speed(iter/s)": 0.122932
},
{
"epoch": 0.8231590263380174,
"grad_norm": 0.854129433631897,
"learning_rate": 8.250789342240326e-06,
"loss": 0.24518890380859376,
"memory(GiB)": 31.97,
"step": 670,
"token_acc": 0.9203224101479915,
"train_speed(iter/s)": 0.123198
},
{
"epoch": 0.82930200414651,
"grad_norm": 0.755439043045044,
"learning_rate": 8.22625530999215e-06,
"loss": 0.23708434104919435,
"memory(GiB)": 31.97,
"step": 675,
"token_acc": 0.9149347105009027,
"train_speed(iter/s)": 0.123435
},
{
"epoch": 0.8354449819550027,
"grad_norm": 0.7811703085899353,
"learning_rate": 8.201587460098362e-06,
"loss": 0.23157744407653807,
"memory(GiB)": 31.97,
"step": 680,
"token_acc": 0.9248452220726784,
"train_speed(iter/s)": 0.123616
},
{
"epoch": 0.8354449819550027,
"eval_loss": 0.26816263794898987,
"eval_runtime": 29.696,
"eval_samples_per_second": 17.713,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9156650814238363,
"step": 680
},
{
"epoch": 0.8415879597634953,
"grad_norm": 0.8753883242607117,
"learning_rate": 8.176786815724601e-06,
"loss": 0.26745316982269285,
"memory(GiB)": 31.97,
"step": 685,
"token_acc": 0.9049404582454025,
"train_speed(iter/s)": 0.122971
},
{
"epoch": 0.847730937571988,
"grad_norm": 0.699246346950531,
"learning_rate": 8.151854405544526e-06,
"loss": 0.2602883815765381,
"memory(GiB)": 31.97,
"step": 690,
"token_acc": 0.9025235288033923,
"train_speed(iter/s)": 0.12319
},
{
"epoch": 0.8538739153804807,
"grad_norm": 0.6880396604537964,
"learning_rate": 8.12679126369713e-06,
"loss": 0.27959246635437013,
"memory(GiB)": 31.97,
"step": 695,
"token_acc": 0.9131617782696919,
"train_speed(iter/s)": 0.123417
},
{
"epoch": 0.8600168931889733,
"grad_norm": 0.847633421421051,
"learning_rate": 8.101598429743862e-06,
"loss": 0.2776790142059326,
"memory(GiB)": 31.97,
"step": 700,
"token_acc": 0.9208461614857484,
"train_speed(iter/s)": 0.123628
},
{
"epoch": 0.8600168931889733,
"eval_loss": 0.26767873764038086,
"eval_runtime": 29.7471,
"eval_samples_per_second": 17.682,
"eval_steps_per_second": 4.437,
"eval_token_acc": 0.9162271220636979,
"step": 700
},
{
"epoch": 0.866159870997466,
"grad_norm": 0.796385645866394,
"learning_rate": 8.076276948625495e-06,
"loss": 0.2519699573516846,
"memory(GiB)": 31.97,
"step": 705,
"token_acc": 0.9091483105121683,
"train_speed(iter/s)": 0.123011
},
{
"epoch": 0.8723028488059587,
"grad_norm": 0.7697290182113647,
"learning_rate": 8.050827870618795e-06,
"loss": 0.2222222328186035,
"memory(GiB)": 31.97,
"step": 710,
"token_acc": 0.9183491244605387,
"train_speed(iter/s)": 0.123243
},
{
"epoch": 0.8784458266144514,
"grad_norm": 0.7783083319664001,
"learning_rate": 8.02525225129295e-06,
"loss": 0.2620779752731323,
"memory(GiB)": 31.97,
"step": 715,
"token_acc": 0.9140030018344544,
"train_speed(iter/s)": 0.123471
},
{
"epoch": 0.8845888044229441,
"grad_norm": 0.8889957666397095,
"learning_rate": 7.999551151465793e-06,
"loss": 0.2590866327285767,
"memory(GiB)": 31.97,
"step": 720,
"token_acc": 0.9240682856455879,
"train_speed(iter/s)": 0.123679
},
{
"epoch": 0.8845888044229441,
"eval_loss": 0.2673368752002716,
"eval_runtime": 29.7041,
"eval_samples_per_second": 17.708,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9157587548638132,
"step": 720
},
{
"epoch": 0.8907317822314367,
"grad_norm": 0.699228048324585,
"learning_rate": 7.973725637159795e-06,
"loss": 0.24339399337768555,
"memory(GiB)": 31.97,
"step": 725,
"token_acc": 0.9063403422456052,
"train_speed(iter/s)": 0.123046
},
{
"epoch": 0.8968747600399294,
"grad_norm": 0.8761767745018005,
"learning_rate": 7.947776779557862e-06,
"loss": 0.22902493476867675,
"memory(GiB)": 31.97,
"step": 730,
"token_acc": 0.929031261265901,
"train_speed(iter/s)": 0.123239
},
{
"epoch": 0.903017737848422,
"grad_norm": 0.7344342470169067,
"learning_rate": 7.921705654958886e-06,
"loss": 0.25461578369140625,
"memory(GiB)": 31.97,
"step": 735,
"token_acc": 0.910783754344105,
"train_speed(iter/s)": 0.123436
},
{
"epoch": 0.9091607156569147,
"grad_norm": 0.8068217635154724,
"learning_rate": 7.895513344733124e-06,
"loss": 0.23727846145629883,
"memory(GiB)": 31.97,
"step": 740,
"token_acc": 0.9222560975609756,
"train_speed(iter/s)": 0.123624
},
{
"epoch": 0.9091607156569147,
"eval_loss": 0.26663488149642944,
"eval_runtime": 29.6915,
"eval_samples_per_second": 17.715,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.916219916414469,
"step": 740
},
{
"epoch": 0.9153036934654074,
"grad_norm": 0.7529241442680359,
"learning_rate": 7.869200935277317e-06,
"loss": 0.2533961534500122,
"memory(GiB)": 31.97,
"step": 745,
"token_acc": 0.9121754667444574,
"train_speed(iter/s)": 0.123063
},
{
"epoch": 0.9214466712739,
"grad_norm": 0.8958796858787537,
"learning_rate": 7.842769517969665e-06,
"loss": 0.2638097286224365,
"memory(GiB)": 31.97,
"step": 750,
"token_acc": 0.9133777069466579,
"train_speed(iter/s)": 0.123274
},
{
"epoch": 0.9275896490823927,
"grad_norm": 0.8491634726524353,
"learning_rate": 7.816220189124527e-06,
"loss": 0.2510275363922119,
"memory(GiB)": 31.97,
"step": 755,
"token_acc": 0.9162462967411322,
"train_speed(iter/s)": 0.123478
},
{
"epoch": 0.9337326268908853,
"grad_norm": 0.8026111125946045,
"learning_rate": 7.789554049946966e-06,
"loss": 0.2663265228271484,
"memory(GiB)": 31.97,
"step": 760,
"token_acc": 0.9118450459399057,
"train_speed(iter/s)": 0.123691
},
{
"epoch": 0.9337326268908853,
"eval_loss": 0.265114426612854,
"eval_runtime": 29.6567,
"eval_samples_per_second": 17.736,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9164793197867128,
"step": 760
},
{
"epoch": 0.939875604699378,
"grad_norm": 0.7799587845802307,
"learning_rate": 7.762772206487066e-06,
"loss": 0.2516252756118774,
"memory(GiB)": 31.97,
"step": 765,
"token_acc": 0.9153250495227805,
"train_speed(iter/s)": 0.123108
},
{
"epoch": 0.9460185825078707,
"grad_norm": 0.7860950231552124,
"learning_rate": 7.735875769594063e-06,
"loss": 0.2252351760864258,
"memory(GiB)": 31.97,
"step": 770,
"token_acc": 0.9219084178777077,
"train_speed(iter/s)": 0.123281
},
{
"epoch": 0.9521615603163633,
"grad_norm": 0.7115477323532104,
"learning_rate": 7.70886585487026e-06,
"loss": 0.24201133251190185,
"memory(GiB)": 31.97,
"step": 775,
"token_acc": 0.9087906037805101,
"train_speed(iter/s)": 0.123461
},
{
"epoch": 0.958304538124856,
"grad_norm": 0.7322366833686829,
"learning_rate": 7.681743582624761e-06,
"loss": 0.24702987670898438,
"memory(GiB)": 31.97,
"step": 780,
"token_acc": 0.9231603262150568,
"train_speed(iter/s)": 0.123669
},
{
"epoch": 0.958304538124856,
"eval_loss": 0.2637149393558502,
"eval_runtime": 29.6465,
"eval_samples_per_second": 17.742,
"eval_steps_per_second": 4.452,
"eval_token_acc": 0.9160253638852861,
"step": 780
},
{
"epoch": 0.9644475159333487,
"grad_norm": 0.8311400413513184,
"learning_rate": 7.654510077827003e-06,
"loss": 0.26540687084198,
"memory(GiB)": 31.97,
"step": 785,
"token_acc": 0.9113840464870576,
"train_speed(iter/s)": 0.12312
},
{
"epoch": 0.9705904937418414,
"grad_norm": 0.8548195958137512,
"learning_rate": 7.627166470060092e-06,
"loss": 0.26256117820739744,
"memory(GiB)": 31.97,
"step": 790,
"token_acc": 0.9282326450438365,
"train_speed(iter/s)": 0.123344
},
{
"epoch": 0.9767334715503341,
"grad_norm": 0.822137176990509,
"learning_rate": 7.59971389347395e-06,
"loss": 0.2492506980895996,
"memory(GiB)": 31.97,
"step": 795,
"token_acc": 0.9176904773466712,
"train_speed(iter/s)": 0.123523
},
{
"epoch": 0.9828764493588267,
"grad_norm": 0.8581626415252686,
"learning_rate": 7.572153486738281e-06,
"loss": 0.23105947971343993,
"memory(GiB)": 31.97,
"step": 800,
"token_acc": 0.9199036089276493,
"train_speed(iter/s)": 0.123714
},
{
"epoch": 0.9828764493588267,
"eval_loss": 0.262928307056427,
"eval_runtime": 29.686,
"eval_samples_per_second": 17.719,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9163784406975068,
"step": 800
},
{
"epoch": 0.9890194271673194,
"grad_norm": 0.801688551902771,
"learning_rate": 7.544486392995325e-06,
"loss": 0.22394142150878907,
"memory(GiB)": 31.97,
"step": 805,
"token_acc": 0.9134662867996202,
"train_speed(iter/s)": 0.12313
},
{
"epoch": 0.995162404975812,
"grad_norm": 0.769991397857666,
"learning_rate": 7.516713759812465e-06,
"loss": 0.24088678359985352,
"memory(GiB)": 31.97,
"step": 810,
"token_acc": 0.9233073946152885,
"train_speed(iter/s)": 0.12331
},
{
"epoch": 1.002457191123397,
"grad_norm": 0.7465444207191467,
"learning_rate": 7.4888367391346085e-06,
"loss": 0.2843191623687744,
"memory(GiB)": 31.97,
"step": 815,
"token_acc": 0.9340755933196602,
"train_speed(iter/s)": 0.12343
},
{
"epoch": 1.0086001689318898,
"grad_norm": 0.7140413522720337,
"learning_rate": 7.460856487236421e-06,
"loss": 0.21777431964874266,
"memory(GiB)": 31.97,
"step": 820,
"token_acc": 0.9192456915997473,
"train_speed(iter/s)": 0.123641
},
{
"epoch": 1.0086001689318898,
"eval_loss": 0.26577290892601013,
"eval_runtime": 29.655,
"eval_samples_per_second": 17.737,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9173079694480473,
"step": 820
},
{
"epoch": 1.0147431467403825,
"grad_norm": 0.8933572769165039,
"learning_rate": 7.432774164674359e-06,
"loss": 0.18578357696533204,
"memory(GiB)": 31.97,
"step": 825,
"token_acc": 0.9199372635852016,
"train_speed(iter/s)": 0.123101
},
{
"epoch": 1.0208861245488752,
"grad_norm": 0.8847187161445618,
"learning_rate": 7.404590936238535e-06,
"loss": 0.19721906185150145,
"memory(GiB)": 31.97,
"step": 830,
"token_acc": 0.9378547338981656,
"train_speed(iter/s)": 0.123344
},
{
"epoch": 1.0270291023573677,
"grad_norm": 0.8109038472175598,
"learning_rate": 7.376307970904408e-06,
"loss": 0.209037446975708,
"memory(GiB)": 31.97,
"step": 835,
"token_acc": 0.9270859687294439,
"train_speed(iter/s)": 0.123517
},
{
"epoch": 1.0331720801658604,
"grad_norm": 0.7887623906135559,
"learning_rate": 7.34792644178429e-06,
"loss": 0.18892388343811034,
"memory(GiB)": 31.97,
"step": 840,
"token_acc": 0.940854053515372,
"train_speed(iter/s)": 0.123667
},
{
"epoch": 1.0331720801658604,
"eval_loss": 0.26770132780075073,
"eval_runtime": 29.6867,
"eval_samples_per_second": 17.718,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9165729932266897,
"step": 840
},
{
"epoch": 1.039315057974353,
"grad_norm": 0.8831640481948853,
"learning_rate": 7.319447526078696e-06,
"loss": 0.20574064254760743,
"memory(GiB)": 31.97,
"step": 845,
"token_acc": 0.9144320335497399,
"train_speed(iter/s)": 0.123197
},
{
"epoch": 1.0454580357828458,
"grad_norm": 0.7205056548118591,
"learning_rate": 7.290872405027508e-06,
"loss": 0.1763360857963562,
"memory(GiB)": 31.97,
"step": 850,
"token_acc": 0.941142747945729,
"train_speed(iter/s)": 0.123361
},
{
"epoch": 1.0516010135913385,
"grad_norm": 0.6685133576393127,
"learning_rate": 7.262202263860989e-06,
"loss": 0.18052310943603517,
"memory(GiB)": 31.97,
"step": 855,
"token_acc": 0.9395161290322581,
"train_speed(iter/s)": 0.123527
},
{
"epoch": 1.057743991399831,
"grad_norm": 0.6980032920837402,
"learning_rate": 7.233438291750615e-06,
"loss": 0.2039564609527588,
"memory(GiB)": 31.97,
"step": 860,
"token_acc": 0.9359017096052193,
"train_speed(iter/s)": 0.123688
},
{
"epoch": 1.057743991399831,
"eval_loss": 0.2684628963470459,
"eval_runtime": 29.667,
"eval_samples_per_second": 17.73,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9164360858913388,
"step": 860
},
{
"epoch": 1.0638869692083237,
"grad_norm": 0.9002748727798462,
"learning_rate": 7.204581681759752e-06,
"loss": 0.2119807004928589,
"memory(GiB)": 31.97,
"step": 865,
"token_acc": 0.9171671861932639,
"train_speed(iter/s)": 0.123189
},
{
"epoch": 1.0700299470168164,
"grad_norm": 0.7583820819854736,
"learning_rate": 7.175633630794176e-06,
"loss": 0.20298078060150146,
"memory(GiB)": 31.97,
"step": 870,
"token_acc": 0.938098510882016,
"train_speed(iter/s)": 0.123367
},
{
"epoch": 1.0761729248253091,
"grad_norm": 0.8472638726234436,
"learning_rate": 7.146595339552423e-06,
"loss": 0.19818198680877686,
"memory(GiB)": 31.97,
"step": 875,
"token_acc": 0.937833543813908,
"train_speed(iter/s)": 0.123568
},
{
"epoch": 1.0823159026338018,
"grad_norm": 0.8150883913040161,
"learning_rate": 7.1174680124759856e-06,
"loss": 0.17738423347473145,
"memory(GiB)": 31.97,
"step": 880,
"token_acc": 0.9374801246581441,
"train_speed(iter/s)": 0.123737
},
{
"epoch": 1.0823159026338018,
"eval_loss": 0.26935601234436035,
"eval_runtime": 29.5753,
"eval_samples_per_second": 17.785,
"eval_steps_per_second": 4.463,
"eval_token_acc": 0.916198299466782,
"step": 880
},
{
"epoch": 1.0884588804422943,
"grad_norm": 0.7381166815757751,
"learning_rate": 7.08825285769936e-06,
"loss": 0.18000258207321168,
"memory(GiB)": 31.97,
"step": 885,
"token_acc": 0.9197572365671506,
"train_speed(iter/s)": 0.12326
},
{
"epoch": 1.094601858250787,
"grad_norm": 0.7191819548606873,
"learning_rate": 7.058951086999934e-06,
"loss": 0.17380096912384033,
"memory(GiB)": 31.97,
"step": 890,
"token_acc": 0.9333728639965501,
"train_speed(iter/s)": 0.123399
},
{
"epoch": 1.1007448360592798,
"grad_norm": 0.8915813565254211,
"learning_rate": 7.029563915747723e-06,
"loss": 0.18771791458129883,
"memory(GiB)": 31.97,
"step": 895,
"token_acc": 0.9397812810680302,
"train_speed(iter/s)": 0.123569
},
{
"epoch": 1.1068878138677725,
"grad_norm": 0.7277628183364868,
"learning_rate": 7.0000925628549595e-06,
"loss": 0.20253748893737794,
"memory(GiB)": 31.97,
"step": 900,
"token_acc": 0.9236840782263335,
"train_speed(iter/s)": 0.123737
},
{
"epoch": 1.1068878138677725,
"eval_loss": 0.2681773900985718,
"eval_runtime": 29.6364,
"eval_samples_per_second": 17.748,
"eval_steps_per_second": 4.454,
"eval_token_acc": 0.916162271220637,
"step": 900
},
{
"epoch": 1.1130307916762652,
"grad_norm": 0.9438680410385132,
"learning_rate": 6.9705382507255405e-06,
"loss": 0.18554757833480834,
"memory(GiB)": 31.97,
"step": 905,
"token_acc": 0.9169849491620015,
"train_speed(iter/s)": 0.123281
},
{
"epoch": 1.1191737694847577,
"grad_norm": 0.8167145252227783,
"learning_rate": 6.940902205204321e-06,
"loss": 0.19586331844329835,
"memory(GiB)": 31.97,
"step": 910,
"token_acc": 0.935408560311284,
"train_speed(iter/s)": 0.123439
},
{
"epoch": 1.1253167472932504,
"grad_norm": 0.7680448889732361,
"learning_rate": 6.911185655526263e-06,
"loss": 0.2027712345123291,
"memory(GiB)": 31.97,
"step": 915,
"token_acc": 0.9350154026697961,
"train_speed(iter/s)": 0.123617
},
{
"epoch": 1.131459725101743,
"grad_norm": 0.7472254037857056,
"learning_rate": 6.881389834265463e-06,
"loss": 0.20426957607269286,
"memory(GiB)": 31.97,
"step": 920,
"token_acc": 0.9317039744175423,
"train_speed(iter/s)": 0.123791
},
{
"epoch": 1.131459725101743,
"eval_loss": 0.26660415530204773,
"eval_runtime": 29.6749,
"eval_samples_per_second": 17.725,
"eval_steps_per_second": 4.448,
"eval_token_acc": 0.9159677186914541,
"step": 920
},
{
"epoch": 1.1376027029102358,
"grad_norm": 0.7990177273750305,
"learning_rate": 6.851515977284014e-06,
"loss": 0.17569031715393066,
"memory(GiB)": 31.97,
"step": 925,
"token_acc": 0.9180286145399676,
"train_speed(iter/s)": 0.123301
},
{
"epoch": 1.1437456807187285,
"grad_norm": 0.7919936776161194,
"learning_rate": 6.821565323680759e-06,
"loss": 0.18091797828674316,
"memory(GiB)": 31.97,
"step": 930,
"token_acc": 0.9357635368079497,
"train_speed(iter/s)": 0.123449
},
{
"epoch": 1.149888658527221,
"grad_norm": 0.7052066922187805,
"learning_rate": 6.791539115739879e-06,
"loss": 0.20484356880187987,
"memory(GiB)": 31.97,
"step": 935,
"token_acc": 0.9249510662408571,
"train_speed(iter/s)": 0.123648
},
{
"epoch": 1.1560316363357137,
"grad_norm": 0.7116119861602783,
"learning_rate": 6.761438598879383e-06,
"loss": 0.18515671491622926,
"memory(GiB)": 31.97,
"step": 940,
"token_acc": 0.9454207808678322,
"train_speed(iter/s)": 0.123794
},
{
"epoch": 1.1560316363357137,
"eval_loss": 0.265466570854187,
"eval_runtime": 29.6874,
"eval_samples_per_second": 17.718,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9163063842052169,
"step": 940
},
{
"epoch": 1.1621746141442064,
"grad_norm": 0.8001610040664673,
"learning_rate": 6.731265021599437e-06,
"loss": 0.2151487112045288,
"memory(GiB)": 31.97,
"step": 945,
"token_acc": 0.9195844345210973,
"train_speed(iter/s)": 0.123335
},
{
"epoch": 1.1683175919526991,
"grad_norm": 0.703551709651947,
"learning_rate": 6.7010196354305876e-06,
"loss": 0.188127601146698,
"memory(GiB)": 31.97,
"step": 950,
"token_acc": 0.9344865159357123,
"train_speed(iter/s)": 0.123485
},
{
"epoch": 1.1744605697611918,
"grad_norm": 0.7591283917427063,
"learning_rate": 6.670703694881851e-06,
"loss": 0.19852180480957032,
"memory(GiB)": 31.97,
"step": 955,
"token_acc": 0.9347434962314612,
"train_speed(iter/s)": 0.12365
},
{
"epoch": 1.1806035475696843,
"grad_norm": 0.8347094058990479,
"learning_rate": 6.640318457388672e-06,
"loss": 0.1904957413673401,
"memory(GiB)": 31.97,
"step": 960,
"token_acc": 0.9374376643394686,
"train_speed(iter/s)": 0.123801
},
{
"epoch": 1.1806035475696843,
"eval_loss": 0.2668422758579254,
"eval_runtime": 29.65,
"eval_samples_per_second": 17.74,
"eval_steps_per_second": 4.452,
"eval_token_acc": 0.9165946101743767,
"step": 960
},
{
"epoch": 1.186746525378177,
"grad_norm": 0.8114694952964783,
"learning_rate": 6.609865183260777e-06,
"loss": 0.19182581901550294,
"memory(GiB)": 31.97,
"step": 965,
"token_acc": 0.9153690632426489,
"train_speed(iter/s)": 0.123322
},
{
"epoch": 1.1928895031866698,
"grad_norm": 0.8866662979125977,
"learning_rate": 6.579345135629896e-06,
"loss": 0.19811842441558838,
"memory(GiB)": 31.97,
"step": 970,
"token_acc": 0.9233720292959992,
"train_speed(iter/s)": 0.123477
},
{
"epoch": 1.1990324809951625,
"grad_norm": 0.7779002785682678,
"learning_rate": 6.548759580397363e-06,
"loss": 0.20236413478851317,
"memory(GiB)": 31.97,
"step": 975,
"token_acc": 0.9268359567816596,
"train_speed(iter/s)": 0.123623
},
{
"epoch": 1.2051754588036552,
"grad_norm": 0.6543186902999878,
"learning_rate": 6.518109786181628e-06,
"loss": 0.19884101152420045,
"memory(GiB)": 31.97,
"step": 980,
"token_acc": 0.9361254541977434,
"train_speed(iter/s)": 0.123766
},
{
"epoch": 1.2051754588036552,
"eval_loss": 0.26707014441490173,
"eval_runtime": 29.6219,
"eval_samples_per_second": 17.757,
"eval_steps_per_second": 4.456,
"eval_token_acc": 0.9165801988759187,
"step": 980
},
{
"epoch": 1.2113184366121477,
"grad_norm": 0.7516761422157288,
"learning_rate": 6.487397024265616e-06,
"loss": 0.2052464008331299,
"memory(GiB)": 31.97,
"step": 985,
"token_acc": 0.9144311222289313,
"train_speed(iter/s)": 0.123306
},
{
"epoch": 1.2174614144206404,
"grad_norm": 0.7994732856750488,
"learning_rate": 6.456622568544012e-06,
"loss": 0.19984896183013917,
"memory(GiB)": 31.97,
"step": 990,
"token_acc": 0.9331774440147496,
"train_speed(iter/s)": 0.123471
},
{
"epoch": 1.223604392229133,
"grad_norm": 0.8212082386016846,
"learning_rate": 6.425787695470419e-06,
"loss": 0.19194519519805908,
"memory(GiB)": 31.97,
"step": 995,
"token_acc": 0.9230769230769231,
"train_speed(iter/s)": 0.123636
},
{
"epoch": 1.2297473700376258,
"grad_norm": 0.7439980506896973,
"learning_rate": 6.3948936840044096e-06,
"loss": 0.20161755084991456,
"memory(GiB)": 31.97,
"step": 1000,
"token_acc": 0.9443154490422091,
"train_speed(iter/s)": 0.123777
},
{
"epoch": 1.2297473700376258,
"eval_loss": 0.26448243856430054,
"eval_runtime": 29.656,
"eval_samples_per_second": 17.737,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9172719412019023,
"step": 1000
},
{
"epoch": 1.2358903478461185,
"grad_norm": 0.9987765550613403,
"learning_rate": 6.363941815558484e-06,
"loss": 0.19967958927154542,
"memory(GiB)": 31.97,
"step": 1005,
"token_acc": 0.9190461073035912,
"train_speed(iter/s)": 0.12333
},
{
"epoch": 1.242033325654611,
"grad_norm": 0.8285346627235413,
"learning_rate": 6.332933373944914e-06,
"loss": 0.19167766571044922,
"memory(GiB)": 31.97,
"step": 1010,
"token_acc": 0.9248888888888889,
"train_speed(iter/s)": 0.123459
},
{
"epoch": 1.2481763034631037,
"grad_norm": 0.8778380751609802,
"learning_rate": 6.301869645322498e-06,
"loss": 0.20817289352416993,
"memory(GiB)": 31.97,
"step": 1015,
"token_acc": 0.9210802145631667,
"train_speed(iter/s)": 0.123619
},
{
"epoch": 1.2543192812715964,
"grad_norm": 0.764149010181427,
"learning_rate": 6.270751918143213e-06,
"loss": 0.2000873565673828,
"memory(GiB)": 31.97,
"step": 1020,
"token_acc": 0.9338627474220694,
"train_speed(iter/s)": 0.123776
},
{
"epoch": 1.2543192812715964,
"eval_loss": 0.2691567838191986,
"eval_runtime": 29.7546,
"eval_samples_per_second": 17.678,
"eval_steps_per_second": 4.436,
"eval_token_acc": 0.9163496181005909,
"step": 1020
},
{
"epoch": 1.2604622590800891,
"grad_norm": 0.8174912929534912,
"learning_rate": 6.239581483098767e-06,
"loss": 0.19734174013137817,
"memory(GiB)": 31.97,
"step": 1025,
"token_acc": 0.9185803052816426,
"train_speed(iter/s)": 0.123299
},
{
"epoch": 1.2666052368885818,
"grad_norm": 0.8845574259757996,
"learning_rate": 6.208359633067077e-06,
"loss": 0.18390114307403566,
"memory(GiB)": 31.97,
"step": 1030,
"token_acc": 0.9341327407655864,
"train_speed(iter/s)": 0.123442
},
{
"epoch": 1.2727482146970743,
"grad_norm": 0.9409281015396118,
"learning_rate": 6.177087663058626e-06,
"loss": 0.20083985328674317,
"memory(GiB)": 31.97,
"step": 1035,
"token_acc": 0.9444893687865671,
"train_speed(iter/s)": 0.123585
},
{
"epoch": 1.278891192505567,
"grad_norm": 0.7851516604423523,
"learning_rate": 6.145766870162767e-06,
"loss": 0.21141374111175537,
"memory(GiB)": 31.97,
"step": 1040,
"token_acc": 0.9148654159869495,
"train_speed(iter/s)": 0.123721
},
{
"epoch": 1.278891192505567,
"eval_loss": 0.2667810916900635,
"eval_runtime": 29.6926,
"eval_samples_per_second": 17.715,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9169765095835135,
"step": 1040
},
{
"epoch": 1.2850341703140598,
"grad_norm": 0.9521942138671875,
"learning_rate": 6.114398553493909e-06,
"loss": 0.1960476517677307,
"memory(GiB)": 31.97,
"step": 1045,
"token_acc": 0.9178016461816706,
"train_speed(iter/s)": 0.123266
},
{
"epoch": 1.2911771481225525,
"grad_norm": 0.8174967765808105,
"learning_rate": 6.0829840141376385e-06,
"loss": 0.20267832279205322,
"memory(GiB)": 31.97,
"step": 1050,
"token_acc": 0.9279712548369264,
"train_speed(iter/s)": 0.123426
},
{
"epoch": 1.2973201259310452,
"grad_norm": 0.8733800053596497,
"learning_rate": 6.051524555096754e-06,
"loss": 0.18992329835891725,
"memory(GiB)": 31.97,
"step": 1055,
"token_acc": 0.9288329960489544,
"train_speed(iter/s)": 0.123584
},
{
"epoch": 1.3034631037395377,
"grad_norm": 0.8741737008094788,
"learning_rate": 6.020021481237216e-06,
"loss": 0.2002291202545166,
"memory(GiB)": 31.97,
"step": 1060,
"token_acc": 0.9348597405477326,
"train_speed(iter/s)": 0.123729
},
{
"epoch": 1.3034631037395377,
"eval_loss": 0.26571905612945557,
"eval_runtime": 29.6815,
"eval_samples_per_second": 17.721,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9168756304943075,
"step": 1060
},
{
"epoch": 1.3096060815480304,
"grad_norm": 0.7780086398124695,
"learning_rate": 5.988476099234033e-06,
"loss": 0.20159559249877929,
"memory(GiB)": 31.97,
"step": 1065,
"token_acc": 0.9135952477386257,
"train_speed(iter/s)": 0.123364
},
{
"epoch": 1.315749059356523,
"grad_norm": 0.7431135177612305,
"learning_rate": 5.956889717517053e-06,
"loss": 0.1894887328147888,
"memory(GiB)": 31.97,
"step": 1070,
"token_acc": 0.9296808409887299,
"train_speed(iter/s)": 0.12348
},
{
"epoch": 1.3218920371650158,
"grad_norm": 0.72322016954422,
"learning_rate": 5.925263646216697e-06,
"loss": 0.17778899669647216,
"memory(GiB)": 31.97,
"step": 1075,
"token_acc": 0.9421357447673238,
"train_speed(iter/s)": 0.123618
},
{
"epoch": 1.3280350149735085,
"grad_norm": 0.9004871249198914,
"learning_rate": 5.893599197109625e-06,
"loss": 0.1900892972946167,
"memory(GiB)": 31.97,
"step": 1080,
"token_acc": 0.9286453541858326,
"train_speed(iter/s)": 0.123756
},
{
"epoch": 1.3280350149735085,
"eval_loss": 0.26534271240234375,
"eval_runtime": 29.6233,
"eval_samples_per_second": 17.756,
"eval_steps_per_second": 4.456,
"eval_token_acc": 0.9171062112696354,
"step": 1080
},
{
"epoch": 1.334177992782001,
"grad_norm": 0.657845675945282,
"learning_rate": 5.861897683564313e-06,
"loss": 0.18198509216308595,
"memory(GiB)": 31.97,
"step": 1085,
"token_acc": 0.9204705963413373,
"train_speed(iter/s)": 0.123316
},
{
"epoch": 1.3403209705904937,
"grad_norm": 0.8465989828109741,
"learning_rate": 5.830160420486588e-06,
"loss": 0.2053920269012451,
"memory(GiB)": 31.97,
"step": 1090,
"token_acc": 0.9380833375835161,
"train_speed(iter/s)": 0.123479
},
{
"epoch": 1.3464639483989864,
"grad_norm": 0.7130780816078186,
"learning_rate": 5.798388724265085e-06,
"loss": 0.18327146768569946,
"memory(GiB)": 31.97,
"step": 1095,
"token_acc": 0.9395675675675675,
"train_speed(iter/s)": 0.123624
},
{
"epoch": 1.3526069262074791,
"grad_norm": 0.800010085105896,
"learning_rate": 5.7665839127166475e-06,
"loss": 0.18803975582122803,
"memory(GiB)": 31.97,
"step": 1100,
"token_acc": 0.9407923378319547,
"train_speed(iter/s)": 0.123735
},
{
"epoch": 1.3526069262074791,
"eval_loss": 0.26394224166870117,
"eval_runtime": 29.7389,
"eval_samples_per_second": 17.687,
"eval_steps_per_second": 4.439,
"eval_token_acc": 0.9168684248450786,
"step": 1100
},
{
"epoch": 1.3587499040159718,
"grad_norm": 0.8300402164459229,
"learning_rate": 5.734747305031664e-06,
"loss": 0.21199843883514405,
"memory(GiB)": 31.97,
"step": 1105,
"token_acc": 0.9151312319249167,
"train_speed(iter/s)": 0.123328
},
{
"epoch": 1.3648928818244643,
"grad_norm": 0.7512418627738953,
"learning_rate": 5.7028802217193565e-06,
"loss": 0.18927464485168458,
"memory(GiB)": 31.97,
"step": 1110,
"token_acc": 0.9413873811065187,
"train_speed(iter/s)": 0.123452
},
{
"epoch": 1.371035859632957,
"grad_norm": 0.8453025221824646,
"learning_rate": 5.670983984553003e-06,
"loss": 0.20298895835876465,
"memory(GiB)": 31.97,
"step": 1115,
"token_acc": 0.9409311022678237,
"train_speed(iter/s)": 0.123572
},
{
"epoch": 1.3771788374414498,
"grad_norm": 0.8611200451850891,
"learning_rate": 5.63905991651512e-06,
"loss": 0.1865471839904785,
"memory(GiB)": 31.97,
"step": 1120,
"token_acc": 0.9321085791674028,
"train_speed(iter/s)": 0.123686
},
{
"epoch": 1.3771788374414498,
"eval_loss": 0.2642819583415985,
"eval_runtime": 29.6704,
"eval_samples_per_second": 17.728,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9167243118604986,
"step": 1120
},
{
"epoch": 1.3833218152499425,
"grad_norm": 0.7028310298919678,
"learning_rate": 5.607109341742579e-06,
"loss": 0.1868009090423584,
"memory(GiB)": 31.97,
"step": 1125,
"token_acc": 0.9161407676887668,
"train_speed(iter/s)": 0.123281
},
{
"epoch": 1.3894647930584352,
"grad_norm": 0.7893259525299072,
"learning_rate": 5.575133585471697e-06,
"loss": 0.18891712427139282,
"memory(GiB)": 31.97,
"step": 1130,
"token_acc": 0.9357982673267327,
"train_speed(iter/s)": 0.123383
},
{
"epoch": 1.3956077708669277,
"grad_norm": 0.8665569424629211,
"learning_rate": 5.543133973983254e-06,
"loss": 0.18693907260894777,
"memory(GiB)": 31.97,
"step": 1135,
"token_acc": 0.9333289413004809,
"train_speed(iter/s)": 0.123515
},
{
"epoch": 1.4017507486754204,
"grad_norm": 0.7769924998283386,
"learning_rate": 5.511111834547496e-06,
"loss": 0.18896095752716063,
"memory(GiB)": 31.97,
"step": 1140,
"token_acc": 0.9316990440949738,
"train_speed(iter/s)": 0.123635
},
{
"epoch": 1.4017507486754204,
"eval_loss": 0.2637878358364105,
"eval_runtime": 29.598,
"eval_samples_per_second": 17.771,
"eval_steps_per_second": 4.46,
"eval_token_acc": 0.9168612191958495,
"step": 1140
},
{
"epoch": 1.407893726483913,
"grad_norm": 0.7512997388839722,
"learning_rate": 5.479068495369071e-06,
"loss": 0.17823780775070192,
"memory(GiB)": 31.97,
"step": 1145,
"token_acc": 0.9169221157037702,
"train_speed(iter/s)": 0.123233
},
{
"epoch": 1.4140367042924058,
"grad_norm": 0.7769235372543335,
"learning_rate": 5.447005285531948e-06,
"loss": 0.18888635635375978,
"memory(GiB)": 31.97,
"step": 1150,
"token_acc": 0.9348544111255975,
"train_speed(iter/s)": 0.123343
},
{
"epoch": 1.4201796821008985,
"grad_norm": 0.6804760694503784,
"learning_rate": 5.414923534944283e-06,
"loss": 0.19998799562454223,
"memory(GiB)": 31.97,
"step": 1155,
"token_acc": 0.927479002131127,
"train_speed(iter/s)": 0.12347
},
{
"epoch": 1.426322659909391,
"grad_norm": 0.8411586284637451,
"learning_rate": 5.38282457428326e-06,
"loss": 0.18058542013168336,
"memory(GiB)": 31.97,
"step": 1160,
"token_acc": 0.9393255256102724,
"train_speed(iter/s)": 0.123592
},
{
"epoch": 1.426322659909391,
"eval_loss": 0.26423749327659607,
"eval_runtime": 29.6214,
"eval_samples_per_second": 17.757,
"eval_steps_per_second": 4.456,
"eval_token_acc": 0.9165369649805447,
"step": 1160
},
{
"epoch": 1.4324656377178837,
"grad_norm": 0.9211987257003784,
"learning_rate": 5.350709734939898e-06,
"loss": 0.19590919017791747,
"memory(GiB)": 31.97,
"step": 1165,
"token_acc": 0.9181291791405984,
"train_speed(iter/s)": 0.123222
},
{
"epoch": 1.4386086155263764,
"grad_norm": 0.8355826735496521,
"learning_rate": 5.318580348963826e-06,
"loss": 0.18302634954452515,
"memory(GiB)": 31.97,
"step": 1170,
"token_acc": 0.940089028541503,
"train_speed(iter/s)": 0.123344
},
{
"epoch": 1.4447515933348691,
"grad_norm": 0.8388906717300415,
"learning_rate": 5.286437749008031e-06,
"loss": 0.19118983745574952,
"memory(GiB)": 31.97,
"step": 1175,
"token_acc": 0.934627927660836,
"train_speed(iter/s)": 0.123476
},
{
"epoch": 1.4508945711433618,
"grad_norm": 0.9128098487854004,
"learning_rate": 5.2542832682735956e-06,
"loss": 0.20061683654785156,
"memory(GiB)": 31.97,
"step": 1180,
"token_acc": 0.9325906344410876,
"train_speed(iter/s)": 0.123611
},
{
"epoch": 1.4508945711433618,
"eval_loss": 0.26307445764541626,
"eval_runtime": 29.7678,
"eval_samples_per_second": 17.67,
"eval_steps_per_second": 4.434,
"eval_token_acc": 0.9170125378296584,
"step": 1180
},
{
"epoch": 1.4570375489518543,
"grad_norm": 0.7825379371643066,
"learning_rate": 5.222118240454376e-06,
"loss": 0.19818990230560302,
"memory(GiB)": 31.97,
"step": 1185,
"token_acc": 0.9137893551001047,
"train_speed(iter/s)": 0.123207
},
{
"epoch": 1.463180526760347,
"grad_norm": 0.748515784740448,
"learning_rate": 5.18994399968171e-06,
"loss": 0.19512221813201905,
"memory(GiB)": 31.97,
"step": 1190,
"token_acc": 0.9283811949976841,
"train_speed(iter/s)": 0.123314
},
{
"epoch": 1.4693235045688398,
"grad_norm": 0.9209436178207397,
"learning_rate": 5.157761880469058e-06,
"loss": 0.19263048171997071,
"memory(GiB)": 31.97,
"step": 1195,
"token_acc": 0.9410712406608439,
"train_speed(iter/s)": 0.123435
},
{
"epoch": 1.4754664823773325,
"grad_norm": 0.8622936606407166,
"learning_rate": 5.125573217656664e-06,
"loss": 0.1777910351753235,
"memory(GiB)": 31.97,
"step": 1200,
"token_acc": 0.938923185912357,
"train_speed(iter/s)": 0.123542
},
{
"epoch": 1.4754664823773325,
"eval_loss": 0.26356378197669983,
"eval_runtime": 29.7055,
"eval_samples_per_second": 17.707,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9174160541864822,
"step": 1200
},
{
"epoch": 1.4816094601858252,
"grad_norm": 0.8892216086387634,
"learning_rate": 5.0933793463561855e-06,
"loss": 0.189991557598114,
"memory(GiB)": 31.97,
"step": 1205,
"token_acc": 0.9184802373432864,
"train_speed(iter/s)": 0.12315
},
{
"epoch": 1.4877524379943177,
"grad_norm": 0.7723336219787598,
"learning_rate": 5.061181601895317e-06,
"loss": 0.19531933069229127,
"memory(GiB)": 31.97,
"step": 1210,
"token_acc": 0.9326246228990087,
"train_speed(iter/s)": 0.123271
},
{
"epoch": 1.4938954158028104,
"grad_norm": 0.8178744912147522,
"learning_rate": 5.028981319762399e-06,
"loss": 0.19836077690124512,
"memory(GiB)": 31.97,
"step": 1215,
"token_acc": 0.9317647058823529,
"train_speed(iter/s)": 0.123399
},
{
"epoch": 1.500038393611303,
"grad_norm": 0.8365611433982849,
"learning_rate": 4.996779835551035e-06,
"loss": 0.17670562267303466,
"memory(GiB)": 31.97,
"step": 1220,
"token_acc": 0.9215732593161283,
"train_speed(iter/s)": 0.123517
},
{
"epoch": 1.500038393611303,
"eval_loss": 0.26140737533569336,
"eval_runtime": 29.7037,
"eval_samples_per_second": 17.708,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9175601671710621,
"step": 1220
},
{
"epoch": 1.5061813714197958,
"grad_norm": 0.9357166290283203,
"learning_rate": 4.964578484904679e-06,
"loss": 0.19990785121917726,
"memory(GiB)": 31.97,
"step": 1225,
"token_acc": 0.9138523956723339,
"train_speed(iter/s)": 0.123153
},
{
"epoch": 1.5123243492282885,
"grad_norm": 0.7626463174819946,
"learning_rate": 4.932378603461253e-06,
"loss": 0.17721318006515502,
"memory(GiB)": 31.97,
"step": 1230,
"token_acc": 0.9358071645166264,
"train_speed(iter/s)": 0.123279
},
{
"epoch": 1.518467327036781,
"grad_norm": 0.7975868582725525,
"learning_rate": 4.900181526797737e-06,
"loss": 0.18672944307327272,
"memory(GiB)": 31.97,
"step": 1235,
"token_acc": 0.9301695649818517,
"train_speed(iter/s)": 0.123379
},
{
"epoch": 1.5246103048452737,
"grad_norm": 0.9394721984863281,
"learning_rate": 4.867988590374777e-06,
"loss": 0.21254873275756836,
"memory(GiB)": 31.97,
"step": 1240,
"token_acc": 0.9301865980329075,
"train_speed(iter/s)": 0.12352
},
{
"epoch": 1.5246103048452737,
"eval_loss": 0.2631884217262268,
"eval_runtime": 29.7103,
"eval_samples_per_second": 17.704,
"eval_steps_per_second": 4.443,
"eval_token_acc": 0.9173656146418793,
"step": 1240
},
{
"epoch": 1.5307532826537664,
"grad_norm": 0.8997290134429932,
"learning_rate": 4.835801129481287e-06,
"loss": 0.17868154048919677,
"memory(GiB)": 31.97,
"step": 1245,
"token_acc": 0.9191156488994138,
"train_speed(iter/s)": 0.123159
},
{
"epoch": 1.5368962604622591,
"grad_norm": 0.9004043340682983,
"learning_rate": 4.803620479179071e-06,
"loss": 0.2074437618255615,
"memory(GiB)": 31.97,
"step": 1250,
"token_acc": 0.9450173238739482,
"train_speed(iter/s)": 0.123271
},
{
"epoch": 1.5430392382707518,
"grad_norm": 0.7723172307014465,
"learning_rate": 4.771447974247449e-06,
"loss": 0.1962502956390381,
"memory(GiB)": 31.97,
"step": 1255,
"token_acc": 0.9253724029792239,
"train_speed(iter/s)": 0.123413
},
{
"epoch": 1.5491822160792443,
"grad_norm": 0.8553282618522644,
"learning_rate": 4.7392849491278825e-06,
"loss": 0.18894779682159424,
"memory(GiB)": 31.97,
"step": 1260,
"token_acc": 0.9323178471693323,
"train_speed(iter/s)": 0.123525
},
{
"epoch": 1.5491822160792443,
"eval_loss": 0.26209425926208496,
"eval_runtime": 29.6524,
"eval_samples_per_second": 17.739,
"eval_steps_per_second": 4.452,
"eval_token_acc": 0.9178844213863669,
"step": 1260
},
{
"epoch": 1.555325193887737,
"grad_norm": 0.886417806148529,
"learning_rate": 4.707132737868639e-06,
"loss": 0.2006976842880249,
"memory(GiB)": 31.97,
"step": 1265,
"token_acc": 0.9235658289984614,
"train_speed(iter/s)": 0.123184
},
{
"epoch": 1.5614681716962298,
"grad_norm": 0.798883318901062,
"learning_rate": 4.674992674069445e-06,
"loss": 0.17858563661575316,
"memory(GiB)": 31.97,
"step": 1270,
"token_acc": 0.9344567177637512,
"train_speed(iter/s)": 0.123284
},
{
"epoch": 1.5676111495047225,
"grad_norm": 0.8193321824073792,
"learning_rate": 4.642866090826187e-06,
"loss": 0.19088488817214966,
"memory(GiB)": 31.97,
"step": 1275,
"token_acc": 0.9263077510500191,
"train_speed(iter/s)": 0.123357
},
{
"epoch": 1.5737541273132152,
"grad_norm": 0.7805312275886536,
"learning_rate": 4.610754320675603e-06,
"loss": 0.19581155776977538,
"memory(GiB)": 31.97,
"step": 1280,
"token_acc": 0.9278596416834517,
"train_speed(iter/s)": 0.123486
},
{
"epoch": 1.5737541273132152,
"eval_loss": 0.26224827766418457,
"eval_runtime": 29.7067,
"eval_samples_per_second": 17.706,
"eval_steps_per_second": 4.443,
"eval_token_acc": 0.9181726473555267,
"step": 1280
},
{
"epoch": 1.5798971051217077,
"grad_norm": 0.8490621447563171,
"learning_rate": 4.578658695540018e-06,
"loss": 0.2049680233001709,
"memory(GiB)": 31.97,
"step": 1285,
"token_acc": 0.9203214434630984,
"train_speed(iter/s)": 0.123147
},
{
"epoch": 1.5860400829302004,
"grad_norm": 0.8018094301223755,
"learning_rate": 4.5465805466721e-06,
"loss": 0.21368024349212647,
"memory(GiB)": 31.97,
"step": 1290,
"token_acc": 0.9337745342459544,
"train_speed(iter/s)": 0.123269
},
{
"epoch": 1.592183060738693,
"grad_norm": 0.8416959047317505,
"learning_rate": 4.514521204599645e-06,
"loss": 0.19902560710906983,
"memory(GiB)": 31.97,
"step": 1295,
"token_acc": 0.9410267803045603,
"train_speed(iter/s)": 0.123379
},
{
"epoch": 1.5983260385471858,
"grad_norm": 0.7020901441574097,
"learning_rate": 4.48248199907038e-06,
"loss": 0.1922709822654724,
"memory(GiB)": 31.97,
"step": 1300,
"token_acc": 0.9301529196433843,
"train_speed(iter/s)": 0.123512
},
{
"epoch": 1.5983260385471858,
"eval_loss": 0.2603996694087982,
"eval_runtime": 29.664,
"eval_samples_per_second": 17.732,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.9183167603401067,
"step": 1300
},
{
"epoch": 1.6044690163556785,
"grad_norm": 0.9669449329376221,
"learning_rate": 4.450464258996822e-06,
"loss": 0.20973031520843505,
"memory(GiB)": 31.97,
"step": 1305,
"token_acc": 0.9143080561489166,
"train_speed(iter/s)": 0.123187
},
{
"epoch": 1.610611994164171,
"grad_norm": 0.8625094890594482,
"learning_rate": 4.418469312401141e-06,
"loss": 0.16759986877441407,
"memory(GiB)": 31.97,
"step": 1310,
"token_acc": 0.9306892935456192,
"train_speed(iter/s)": 0.123294
},
{
"epoch": 1.6167549719726637,
"grad_norm": 0.8282011151313782,
"learning_rate": 4.386498486360095e-06,
"loss": 0.20370192527770997,
"memory(GiB)": 31.97,
"step": 1315,
"token_acc": 0.9212595005428882,
"train_speed(iter/s)": 0.123412
},
{
"epoch": 1.6228979497811564,
"grad_norm": 0.8530144095420837,
"learning_rate": 4.354553106949972e-06,
"loss": 0.20181150436401368,
"memory(GiB)": 31.97,
"step": 1320,
"token_acc": 0.9427324788655577,
"train_speed(iter/s)": 0.12352
},
{
"epoch": 1.6228979497811564,
"eval_loss": 0.26066553592681885,
"eval_runtime": 29.7468,
"eval_samples_per_second": 17.683,
"eval_steps_per_second": 4.437,
"eval_token_acc": 0.9182663207955036,
"step": 1320
},
{
"epoch": 1.6290409275896491,
"grad_norm": 0.760425329208374,
"learning_rate": 4.3226344991915936e-06,
"loss": 0.18798611164093018,
"memory(GiB)": 31.97,
"step": 1325,
"token_acc": 0.9204100274028215,
"train_speed(iter/s)": 0.123163
},
{
"epoch": 1.6351839053981418,
"grad_norm": 0.8320059776306152,
"learning_rate": 4.290743986995353e-06,
"loss": 0.20692143440246583,
"memory(GiB)": 31.97,
"step": 1330,
"token_acc": 0.9179834090460202,
"train_speed(iter/s)": 0.123278
},
{
"epoch": 1.6413268832066343,
"grad_norm": 0.9049168229103088,
"learning_rate": 4.258882893106308e-06,
"loss": 0.18184820413589478,
"memory(GiB)": 31.97,
"step": 1335,
"token_acc": 0.9419431279620853,
"train_speed(iter/s)": 0.123369
},
{
"epoch": 1.647469861015127,
"grad_norm": 0.8740628361701965,
"learning_rate": 4.227052539049312e-06,
"loss": 0.1948437809944153,
"memory(GiB)": 31.97,
"step": 1340,
"token_acc": 0.9350572326671016,
"train_speed(iter/s)": 0.123484
},
{
"epoch": 1.647469861015127,
"eval_loss": 0.2610000967979431,
"eval_runtime": 29.6634,
"eval_samples_per_second": 17.732,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.9179132439832829,
"step": 1340
},
{
"epoch": 1.6536128388236198,
"grad_norm": 0.7111175656318665,
"learning_rate": 4.195254245074196e-06,
"loss": 0.17852287292480468,
"memory(GiB)": 31.97,
"step": 1345,
"token_acc": 0.9160269612432129,
"train_speed(iter/s)": 0.123131
},
{
"epoch": 1.6597558166321125,
"grad_norm": 0.7991046905517578,
"learning_rate": 4.163489330101017e-06,
"loss": 0.1986152410507202,
"memory(GiB)": 31.97,
"step": 1350,
"token_acc": 0.93740389861614,
"train_speed(iter/s)": 0.123232
},
{
"epoch": 1.6658987944406052,
"grad_norm": 0.8582780957221985,
"learning_rate": 4.131759111665349e-06,
"loss": 0.18987109661102294,
"memory(GiB)": 31.97,
"step": 1355,
"token_acc": 0.9441934490194065,
"train_speed(iter/s)": 0.12331
},
{
"epoch": 1.6720417722490977,
"grad_norm": 0.6932474970817566,
"learning_rate": 4.100064905863628e-06,
"loss": 0.19035787582397462,
"memory(GiB)": 31.97,
"step": 1360,
"token_acc": 0.9174038315725623,
"train_speed(iter/s)": 0.123414
},
{
"epoch": 1.6720417722490977,
"eval_loss": 0.26117271184921265,
"eval_runtime": 29.681,
"eval_samples_per_second": 17.722,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9178772157371379,
"step": 1360
},
{
"epoch": 1.6781847500575904,
"grad_norm": 0.8091769814491272,
"learning_rate": 4.068408027298576e-06,
"loss": 0.20030708312988282,
"memory(GiB)": 31.97,
"step": 1365,
"token_acc": 0.921018299777864,
"train_speed(iter/s)": 0.123084
},
{
"epoch": 1.684327727866083,
"grad_norm": 0.8656709790229797,
"learning_rate": 4.036789789024659e-06,
"loss": 0.17970023155212403,
"memory(GiB)": 31.97,
"step": 1370,
"token_acc": 0.9361972662458562,
"train_speed(iter/s)": 0.123185
},
{
"epoch": 1.6904707056745758,
"grad_norm": 0.8730081915855408,
"learning_rate": 4.00521150249364e-06,
"loss": 0.20007739067077637,
"memory(GiB)": 31.97,
"step": 1375,
"token_acc": 0.9322461977708231,
"train_speed(iter/s)": 0.123308
},
{
"epoch": 1.6966136834830685,
"grad_norm": 0.8743943572044373,
"learning_rate": 3.973674477500172e-06,
"loss": 0.19028009176254274,
"memory(GiB)": 31.97,
"step": 1380,
"token_acc": 0.9364455364455364,
"train_speed(iter/s)": 0.123436
},
{
"epoch": 1.6966136834830685,
"eval_loss": 0.2610381841659546,
"eval_runtime": 29.6856,
"eval_samples_per_second": 17.719,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9183167603401067,
"step": 1380
},
{
"epoch": 1.702756661291561,
"grad_norm": 0.804136335849762,
"learning_rate": 3.942180022127475e-06,
"loss": 0.16746077537536622,
"memory(GiB)": 31.97,
"step": 1385,
"token_acc": 0.9190062765437667,
"train_speed(iter/s)": 0.12311
},
{
"epoch": 1.7088996391000537,
"grad_norm": 0.7739353775978088,
"learning_rate": 3.910729442693077e-06,
"loss": 0.20771589279174804,
"memory(GiB)": 31.97,
"step": 1390,
"token_acc": 0.9368890897790836,
"train_speed(iter/s)": 0.123227
},
{
"epoch": 1.7150426169085464,
"grad_norm": 0.7843467593193054,
"learning_rate": 3.8793240436946385e-06,
"loss": 0.1791388511657715,
"memory(GiB)": 31.97,
"step": 1395,
"token_acc": 0.9298795912172417,
"train_speed(iter/s)": 0.123311
},
{
"epoch": 1.7211855947170391,
"grad_norm": 0.8047447800636292,
"learning_rate": 3.847965127755834e-06,
"loss": 0.1962286114692688,
"memory(GiB)": 31.97,
"step": 1400,
"token_acc": 0.9289617486338798,
"train_speed(iter/s)": 0.123422
},
{
"epoch": 1.7211855947170391,
"eval_loss": 0.25994521379470825,
"eval_runtime": 29.6994,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9187274823461594,
"step": 1400
},
{
"epoch": 1.7273285725255318,
"grad_norm": 0.7783213257789612,
"learning_rate": 3.816653995572332e-06,
"loss": 0.1934323787689209,
"memory(GiB)": 31.97,
"step": 1405,
"token_acc": 0.9191889097250214,
"train_speed(iter/s)": 0.123105
},
{
"epoch": 1.7334715503340243,
"grad_norm": 0.9164287447929382,
"learning_rate": 3.7853919458578327e-06,
"loss": 0.1951138973236084,
"memory(GiB)": 31.97,
"step": 1410,
"token_acc": 0.9345938875014865,
"train_speed(iter/s)": 0.123227
},
{
"epoch": 1.739614528142517,
"grad_norm": 0.6820860505104065,
"learning_rate": 3.7541802752902224e-06,
"loss": 0.1772141695022583,
"memory(GiB)": 31.97,
"step": 1415,
"token_acc": 0.9298903956901357,
"train_speed(iter/s)": 0.123313
},
{
"epoch": 1.7457575059510098,
"grad_norm": 0.900181770324707,
"learning_rate": 3.723020278457763e-06,
"loss": 0.1944177269935608,
"memory(GiB)": 31.97,
"step": 1420,
"token_acc": 0.9333839438223572,
"train_speed(iter/s)": 0.123418
},
{
"epoch": 1.7457575059510098,
"eval_loss": 0.2598145008087158,
"eval_runtime": 29.6524,
"eval_samples_per_second": 17.739,
"eval_steps_per_second": 4.452,
"eval_token_acc": 0.9181582360570687,
"step": 1420
},
{
"epoch": 1.7519004837595025,
"grad_norm": 0.8312418460845947,
"learning_rate": 3.6919132478054153e-06,
"loss": 0.2060741662979126,
"memory(GiB)": 31.97,
"step": 1425,
"token_acc": 0.9155994474106709,
"train_speed(iter/s)": 0.123098
},
{
"epoch": 1.7580434615679952,
"grad_norm": 0.8780198097229004,
"learning_rate": 3.6608604735812226e-06,
"loss": 0.1988367795944214,
"memory(GiB)": 31.97,
"step": 1430,
"token_acc": 0.9336537924095915,
"train_speed(iter/s)": 0.123197
},
{
"epoch": 1.7641864393764877,
"grad_norm": 0.8375983834266663,
"learning_rate": 3.629863243782799e-06,
"loss": 0.20454792976379393,
"memory(GiB)": 31.97,
"step": 1435,
"token_acc": 0.9179058065245661,
"train_speed(iter/s)": 0.123317
},
{
"epoch": 1.7703294171849804,
"grad_norm": 0.7774383425712585,
"learning_rate": 3.5989228441039024e-06,
"loss": 0.1952831268310547,
"memory(GiB)": 31.97,
"step": 1440,
"token_acc": 0.9264312326179357,
"train_speed(iter/s)": 0.123413
},
{
"epoch": 1.7703294171849804,
"eval_loss": 0.2601591646671295,
"eval_runtime": 29.6923,
"eval_samples_per_second": 17.715,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9185257241677475,
"step": 1440
},
{
"epoch": 1.776472394993473,
"grad_norm": 0.8040404915809631,
"learning_rate": 3.568040557881106e-06,
"loss": 0.18471212387084962,
"memory(GiB)": 31.97,
"step": 1445,
"token_acc": 0.9204298276599304,
"train_speed(iter/s)": 0.123124
},
{
"epoch": 1.7826153728019658,
"grad_norm": 0.7603934407234192,
"learning_rate": 3.5372176660405717e-06,
"loss": 0.19175269603729247,
"memory(GiB)": 31.97,
"step": 1450,
"token_acc": 0.9407587455914593,
"train_speed(iter/s)": 0.123205
},
{
"epoch": 1.7887583506104585,
"grad_norm": 0.8216592073440552,
"learning_rate": 3.506455447044923e-06,
"loss": 0.18449797630310058,
"memory(GiB)": 31.97,
"step": 1455,
"token_acc": 0.9232101076275152,
"train_speed(iter/s)": 0.12331
},
{
"epoch": 1.794901328418951,
"grad_norm": 0.8587454557418823,
"learning_rate": 3.4757551768402074e-06,
"loss": 0.1803336501121521,
"memory(GiB)": 31.97,
"step": 1460,
"token_acc": 0.9327302250057992,
"train_speed(iter/s)": 0.123414
},
{
"epoch": 1.794901328418951,
"eval_loss": 0.25998592376708984,
"eval_runtime": 29.6543,
"eval_samples_per_second": 17.738,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9187851275399913,
"step": 1460
},
{
"epoch": 1.8010443062274437,
"grad_norm": 0.6978159546852112,
"learning_rate": 3.4451181288029834e-06,
"loss": 0.17668429613113404,
"memory(GiB)": 31.97,
"step": 1465,
"token_acc": 0.9205462088038718,
"train_speed(iter/s)": 0.123097
},
{
"epoch": 1.8071872840359364,
"grad_norm": 0.7977909445762634,
"learning_rate": 3.4145455736874957e-06,
"loss": 0.20127489566802978,
"memory(GiB)": 31.97,
"step": 1470,
"token_acc": 0.9337199247164149,
"train_speed(iter/s)": 0.123204
},
{
"epoch": 1.8133302618444291,
"grad_norm": 0.847145140171051,
"learning_rate": 3.3840387795729753e-06,
"loss": 0.1935032606124878,
"memory(GiB)": 31.97,
"step": 1475,
"token_acc": 0.9318121092288784,
"train_speed(iter/s)": 0.123311
},
{
"epoch": 1.8194732396529218,
"grad_norm": 0.8690524697303772,
"learning_rate": 3.353599011811037e-06,
"loss": 0.19352041482925414,
"memory(GiB)": 31.97,
"step": 1480,
"token_acc": 0.9311466218110457,
"train_speed(iter/s)": 0.123409
},
{
"epoch": 1.8194732396529218,
"eval_loss": 0.259257048368454,
"eval_runtime": 29.6405,
"eval_samples_per_second": 17.746,
"eval_steps_per_second": 4.453,
"eval_token_acc": 0.9186266032569534,
"step": 1480
},
{
"epoch": 1.8256162174614143,
"grad_norm": 0.8610875010490417,
"learning_rate": 3.323227532973193e-06,
"loss": 0.18993620872497557,
"memory(GiB)": 31.97,
"step": 1485,
"token_acc": 0.9163555740842508,
"train_speed(iter/s)": 0.123098
},
{
"epoch": 1.831759195269907,
"grad_norm": 0.9433273673057556,
"learning_rate": 3.292925602798492e-06,
"loss": 0.1930912494659424,
"memory(GiB)": 31.97,
"step": 1490,
"token_acc": 0.9382497082847141,
"train_speed(iter/s)": 0.123201
},
{
"epoch": 1.8379021730783998,
"grad_norm": 0.7419607043266296,
"learning_rate": 3.262694478141266e-06,
"loss": 0.1879183053970337,
"memory(GiB)": 31.97,
"step": 1495,
"token_acc": 0.9291406527587432,
"train_speed(iter/s)": 0.123298
},
{
"epoch": 1.8440451508868925,
"grad_norm": 0.8275686502456665,
"learning_rate": 3.2325354129189923e-06,
"loss": 0.19919825792312623,
"memory(GiB)": 31.97,
"step": 1500,
"token_acc": 0.9369797252438589,
"train_speed(iter/s)": 0.123397
},
{
"epoch": 1.8440451508868925,
"eval_loss": 0.2576294541358948,
"eval_runtime": 29.692,
"eval_samples_per_second": 17.715,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9190301196137772,
"step": 1500
},
{
"epoch": 1.8501881286953852,
"grad_norm": 0.7494759559631348,
"learning_rate": 3.2024496580602892e-06,
"loss": 0.1704793930053711,
"memory(GiB)": 31.97,
"step": 1505,
"token_acc": 0.920035804863494,
"train_speed(iter/s)": 0.123086
},
{
"epoch": 1.8563311065038777,
"grad_norm": 0.7116587162017822,
"learning_rate": 3.172438461453032e-06,
"loss": 0.19869464635849,
"memory(GiB)": 31.97,
"step": 1510,
"token_acc": 0.9331588853693247,
"train_speed(iter/s)": 0.123201
},
{
"epoch": 1.8624740843123704,
"grad_norm": 0.8266251087188721,
"learning_rate": 3.142503067892594e-06,
"loss": 0.18929662704467773,
"memory(GiB)": 31.97,
"step": 1515,
"token_acc": 0.9361521750649076,
"train_speed(iter/s)": 0.123284
},
{
"epoch": 1.868617062120863,
"grad_norm": 0.8856237530708313,
"learning_rate": 3.112644719030206e-06,
"loss": 0.1765504837036133,
"memory(GiB)": 31.97,
"step": 1520,
"token_acc": 0.9446170019591915,
"train_speed(iter/s)": 0.123379
},
{
"epoch": 1.868617062120863,
"eval_loss": 0.2572629451751709,
"eval_runtime": 29.7297,
"eval_samples_per_second": 17.693,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.9187563049430754,
"step": 1520
},
{
"epoch": 1.8747600399293558,
"grad_norm": 0.7266517281532288,
"learning_rate": 3.0828646533214657e-06,
"loss": 0.18753888607025146,
"memory(GiB)": 31.97,
"step": 1525,
"token_acc": 0.921090387374462,
"train_speed(iter/s)": 0.123072
},
{
"epoch": 1.8809030177378485,
"grad_norm": 0.7010701298713684,
"learning_rate": 3.053164105974964e-06,
"loss": 0.18214144706726074,
"memory(GiB)": 31.97,
"step": 1530,
"token_acc": 0.9355687362479671,
"train_speed(iter/s)": 0.123181
},
{
"epoch": 1.887045995546341,
"grad_norm": 0.8133987188339233,
"learning_rate": 3.0235443089010564e-06,
"loss": 0.19535071849823,
"memory(GiB)": 31.97,
"step": 1535,
"token_acc": 0.9338049036944989,
"train_speed(iter/s)": 0.123268
},
{
"epoch": 1.8931889733548337,
"grad_norm": 0.7620673775672913,
"learning_rate": 2.9940064906607607e-06,
"loss": 0.19279036521911622,
"memory(GiB)": 31.97,
"step": 1540,
"token_acc": 0.9357547655847501,
"train_speed(iter/s)": 0.123366
},
{
"epoch": 1.8931889733548337,
"eval_loss": 0.2575133144855499,
"eval_runtime": 29.6891,
"eval_samples_per_second": 17.717,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9187923331892204,
"step": 1540
},
{
"epoch": 1.8993319511633264,
"grad_norm": 0.8368042707443237,
"learning_rate": 2.964551876414801e-06,
"loss": 0.186897873878479,
"memory(GiB)": 31.97,
"step": 1545,
"token_acc": 0.9212080946652063,
"train_speed(iter/s)": 0.123086
},
{
"epoch": 1.9054749289718191,
"grad_norm": 0.7964156270027161,
"learning_rate": 2.93518168787279e-06,
"loss": 0.1861191511154175,
"memory(GiB)": 31.97,
"step": 1550,
"token_acc": 0.9394473838918284,
"train_speed(iter/s)": 0.123163
},
{
"epoch": 1.9116179067803118,
"grad_norm": 0.9378722906112671,
"learning_rate": 2.905897143242562e-06,
"loss": 0.197173810005188,
"memory(GiB)": 31.97,
"step": 1555,
"token_acc": 0.9388659543467702,
"train_speed(iter/s)": 0.123262
},
{
"epoch": 1.9177608845888043,
"grad_norm": 0.8486573100090027,
"learning_rate": 2.8766994571796336e-06,
"loss": 0.18908753395080566,
"memory(GiB)": 31.97,
"step": 1560,
"token_acc": 0.9474116680361545,
"train_speed(iter/s)": 0.123349
},
{
"epoch": 1.9177608845888043,
"eval_loss": 0.25726109743118286,
"eval_runtime": 29.6999,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9191309987029831,
"step": 1560
},
{
"epoch": 1.923903862397297,
"grad_norm": 0.8716571927070618,
"learning_rate": 2.8475898407368298e-06,
"loss": 0.18810817003250122,
"memory(GiB)": 31.97,
"step": 1565,
"token_acc": 0.9179941342227845,
"train_speed(iter/s)": 0.123063
},
{
"epoch": 1.9300468402057898,
"grad_norm": 0.794176459312439,
"learning_rate": 2.8185695013140474e-06,
"loss": 0.17928617000579833,
"memory(GiB)": 31.97,
"step": 1570,
"token_acc": 0.9303826916366175,
"train_speed(iter/s)": 0.123136
},
{
"epoch": 1.9361898180142825,
"grad_norm": 0.7357900142669678,
"learning_rate": 2.7896396426081844e-06,
"loss": 0.18468384742736815,
"memory(GiB)": 31.97,
"step": 1575,
"token_acc": 0.9476946498477599,
"train_speed(iter/s)": 0.123228
},
{
"epoch": 1.9423327958227752,
"grad_norm": 0.8071329593658447,
"learning_rate": 2.7608014645632e-06,
"loss": 0.1790044903755188,
"memory(GiB)": 31.97,
"step": 1580,
"token_acc": 0.938118933832586,
"train_speed(iter/s)": 0.123325
},
{
"epoch": 1.9423327958227752,
"eval_loss": 0.25785306096076965,
"eval_runtime": 29.6989,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9192462890906471,
"step": 1580
},
{
"epoch": 1.9484757736312677,
"grad_norm": 0.8714718818664551,
"learning_rate": 2.7320561633203567e-06,
"loss": 0.19142614603042601,
"memory(GiB)": 31.97,
"step": 1585,
"token_acc": 0.9180685641538602,
"train_speed(iter/s)": 0.123048
},
{
"epoch": 1.9546187514397604,
"grad_norm": 0.7642494440078735,
"learning_rate": 2.703404931168594e-06,
"loss": 0.1714502215385437,
"memory(GiB)": 31.97,
"step": 1590,
"token_acc": 0.9387637940932576,
"train_speed(iter/s)": 0.123137
},
{
"epoch": 1.960761729248253,
"grad_norm": 0.7690022587776184,
"learning_rate": 2.6748489564950907e-06,
"loss": 0.1712334156036377,
"memory(GiB)": 31.97,
"step": 1595,
"token_acc": 0.9381139489194499,
"train_speed(iter/s)": 0.123216
},
{
"epoch": 1.9669047070567458,
"grad_norm": 0.789311945438385,
"learning_rate": 2.6463894237359556e-06,
"loss": 0.1898505687713623,
"memory(GiB)": 31.97,
"step": 1600,
"token_acc": 0.9361558383064971,
"train_speed(iter/s)": 0.123315
},
{
"epoch": 1.9669047070567458,
"eval_loss": 0.25641384720802307,
"eval_runtime": 29.7239,
"eval_samples_per_second": 17.696,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9191237930537541,
"step": 1600
},
{
"epoch": 1.9730476848652385,
"grad_norm": 0.8089138865470886,
"learning_rate": 2.618027513327116e-06,
"loss": 0.18109874725341796,
"memory(GiB)": 31.97,
"step": 1605,
"token_acc": 0.923689472311571,
"train_speed(iter/s)": 0.12303
},
{
"epoch": 1.979190662673731,
"grad_norm": 0.8698074221611023,
"learning_rate": 2.589764401655343e-06,
"loss": 0.183346688747406,
"memory(GiB)": 31.97,
"step": 1610,
"token_acc": 0.9436480028852222,
"train_speed(iter/s)": 0.123111
},
{
"epoch": 1.9853336404822237,
"grad_norm": 0.8340507745742798,
"learning_rate": 2.5616012610094702e-06,
"loss": 0.19840478897094727,
"memory(GiB)": 31.97,
"step": 1615,
"token_acc": 0.931529030765672,
"train_speed(iter/s)": 0.123193
},
{
"epoch": 1.9914766182907164,
"grad_norm": 0.9170531034469604,
"learning_rate": 2.533539259531757e-06,
"loss": 0.20222840309143067,
"memory(GiB)": 31.97,
"step": 1620,
"token_acc": 0.9312559145599567,
"train_speed(iter/s)": 0.123282
},
{
"epoch": 1.9914766182907164,
"eval_loss": 0.2562500834465027,
"eval_runtime": 29.7389,
"eval_samples_per_second": 17.687,
"eval_steps_per_second": 4.439,
"eval_token_acc": 0.91929672863525,
"step": 1620
},
{
"epoch": 1.9976195960992091,
"grad_norm": 0.8579339385032654,
"learning_rate": 2.5055795611694435e-06,
"loss": 0.17736260890960692,
"memory(GiB)": 31.97,
"step": 1625,
"token_acc": 0.9170028818443804,
"train_speed(iter/s)": 0.123032
},
{
"epoch": 2.004914382246794,
"grad_norm": 0.6278606057167053,
"learning_rate": 2.4777233256264743e-06,
"loss": 0.20010933876037598,
"memory(GiB)": 31.97,
"step": 1630,
"token_acc": 0.947276073094535,
"train_speed(iter/s)": 0.123086
},
{
"epoch": 2.011057360055287,
"grad_norm": 0.6573106050491333,
"learning_rate": 2.4499717083153975e-06,
"loss": 0.1415931224822998,
"memory(GiB)": 31.97,
"step": 1635,
"token_acc": 0.9564882032667876,
"train_speed(iter/s)": 0.123172
},
{
"epoch": 2.0172003378637795,
"grad_norm": 0.862938404083252,
"learning_rate": 2.4223258603094295e-06,
"loss": 0.16473679542541503,
"memory(GiB)": 31.97,
"step": 1640,
"token_acc": 0.9506443652316973,
"train_speed(iter/s)": 0.123277
},
{
"epoch": 2.0172003378637795,
"eval_loss": 0.2674296498298645,
"eval_runtime": 29.6424,
"eval_samples_per_second": 17.745,
"eval_steps_per_second": 4.453,
"eval_token_acc": 0.9185977806600375,
"step": 1640
},
{
"epoch": 2.023343315672272,
"grad_norm": 0.8443688750267029,
"learning_rate": 2.3947869282947263e-06,
"loss": 0.14469457864761354,
"memory(GiB)": 31.97,
"step": 1645,
"token_acc": 0.9298440219802724,
"train_speed(iter/s)": 0.122997
},
{
"epoch": 2.029486293480765,
"grad_norm": 0.8116291761398315,
"learning_rate": 2.3673560545228082e-06,
"loss": 0.14491933584213257,
"memory(GiB)": 31.97,
"step": 1650,
"token_acc": 0.9576881945413122,
"train_speed(iter/s)": 0.12308
},
{
"epoch": 2.0356292712892574,
"grad_norm": 0.6990134119987488,
"learning_rate": 2.3400343767631943e-06,
"loss": 0.1429425835609436,
"memory(GiB)": 31.97,
"step": 1655,
"token_acc": 0.9598897189612238,
"train_speed(iter/s)": 0.12317
},
{
"epoch": 2.0417722490977503,
"grad_norm": 0.7768262624740601,
"learning_rate": 2.312823028256205e-06,
"loss": 0.13332735300064086,
"memory(GiB)": 31.97,
"step": 1660,
"token_acc": 0.955598381190981,
"train_speed(iter/s)": 0.123249
},
{
"epoch": 2.0417722490977503,
"eval_loss": 0.2723671495914459,
"eval_runtime": 29.6708,
"eval_samples_per_second": 17.728,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9178267761925349,
"step": 1660
},
{
"epoch": 2.047915226906243,
"grad_norm": 0.8455916047096252,
"learning_rate": 2.2857231376659517e-06,
"loss": 0.13110907077789308,
"memory(GiB)": 31.97,
"step": 1665,
"token_acc": 0.9266902441777343,
"train_speed(iter/s)": 0.12298
},
{
"epoch": 2.0540582047147353,
"grad_norm": 0.8857413530349731,
"learning_rate": 2.258735829033529e-06,
"loss": 0.16349921226501465,
"memory(GiB)": 31.97,
"step": 1670,
"token_acc": 0.9470571801080275,
"train_speed(iter/s)": 0.123074
},
{
"epoch": 2.0602011825232283,
"grad_norm": 0.8224142789840698,
"learning_rate": 2.231862221730394e-06,
"loss": 0.1457624077796936,
"memory(GiB)": 31.97,
"step": 1675,
"token_acc": 0.9467787114845938,
"train_speed(iter/s)": 0.123144
},
{
"epoch": 2.0663441603317207,
"grad_norm": 0.7867154479026794,
"learning_rate": 2.2051034304119344e-06,
"loss": 0.13943665027618407,
"memory(GiB)": 31.97,
"step": 1680,
"token_acc": 0.9487998351704955,
"train_speed(iter/s)": 0.123235
},
{
"epoch": 2.0663441603317207,
"eval_loss": 0.27172932028770447,
"eval_runtime": 29.8181,
"eval_samples_per_second": 17.64,
"eval_steps_per_second": 4.427,
"eval_token_acc": 0.9178844213863669,
"step": 1680
},
{
"epoch": 2.0724871381402137,
"grad_norm": 0.7623206973075867,
"learning_rate": 2.1784605649712326e-06,
"loss": 0.14780081510543824,
"memory(GiB)": 31.97,
"step": 1685,
"token_acc": 0.9247235706580367,
"train_speed(iter/s)": 0.122983
},
{
"epoch": 2.078630115948706,
"grad_norm": 0.8290310502052307,
"learning_rate": 2.1519347304930317e-06,
"loss": 0.1389237880706787,
"memory(GiB)": 31.97,
"step": 1690,
"token_acc": 0.9470190895741557,
"train_speed(iter/s)": 0.123069
},
{
"epoch": 2.0847730937571987,
"grad_norm": 0.7647258639335632,
"learning_rate": 2.1255270272079044e-06,
"loss": 0.14199459552764893,
"memory(GiB)": 31.97,
"step": 1695,
"token_acc": 0.9419040287400564,
"train_speed(iter/s)": 0.123168
},
{
"epoch": 2.0909160715656916,
"grad_norm": 0.8620509505271912,
"learning_rate": 2.0992385504466075e-06,
"loss": 0.14582890272140503,
"memory(GiB)": 31.97,
"step": 1700,
"token_acc": 0.9481878509443593,
"train_speed(iter/s)": 0.123261
},
{
"epoch": 2.0909160715656916,
"eval_loss": 0.27268144488334656,
"eval_runtime": 29.6628,
"eval_samples_per_second": 17.733,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.9178772157371379,
"step": 1700
},
{
"epoch": 2.097059049374184,
"grad_norm": 0.7256646752357483,
"learning_rate": 2.0730703905946612e-06,
"loss": 0.14624775648117067,
"memory(GiB)": 31.97,
"step": 1705,
"token_acc": 0.9200135124990616,
"train_speed(iter/s)": 0.123001
},
{
"epoch": 2.103202027182677,
"grad_norm": 0.7388427257537842,
"learning_rate": 2.0470236330471125e-06,
"loss": 0.11770030260086059,
"memory(GiB)": 31.97,
"step": 1710,
"token_acc": 0.9668929503916449,
"train_speed(iter/s)": 0.123069
},
{
"epoch": 2.1093450049911695,
"grad_norm": 0.8730806112289429,
"learning_rate": 2.0210993581635257e-06,
"loss": 0.16097368001937867,
"memory(GiB)": 31.97,
"step": 1715,
"token_acc": 0.9433831352051436,
"train_speed(iter/s)": 0.123169
},
{
"epoch": 2.115487982799662,
"grad_norm": 0.7302968502044678,
"learning_rate": 1.9952986412231612e-06,
"loss": 0.1270466446876526,
"memory(GiB)": 31.97,
"step": 1720,
"token_acc": 0.9573284772123241,
"train_speed(iter/s)": 0.123225
},
{
"epoch": 2.115487982799662,
"eval_loss": 0.27457520365715027,
"eval_runtime": 29.6821,
"eval_samples_per_second": 17.721,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9178555987894509,
"step": 1720
},
{
"epoch": 2.121630960608155,
"grad_norm": 1.026114821434021,
"learning_rate": 1.9696225523803803e-06,
"loss": 0.1560563325881958,
"memory(GiB)": 31.97,
"step": 1725,
"token_acc": 0.9226966883434199,
"train_speed(iter/s)": 0.12297
},
{
"epoch": 2.1277739384166474,
"grad_norm": 0.8650787472724915,
"learning_rate": 1.944072156620261e-06,
"loss": 0.13898645639419555,
"memory(GiB)": 31.97,
"step": 1730,
"token_acc": 0.9579385943157581,
"train_speed(iter/s)": 0.12307
},
{
"epoch": 2.1339169162251403,
"grad_norm": 0.6428267955780029,
"learning_rate": 1.9186485137144217e-06,
"loss": 0.15046895742416383,
"memory(GiB)": 31.97,
"step": 1735,
"token_acc": 0.9573064770932069,
"train_speed(iter/s)": 0.123153
},
{
"epoch": 2.140059894033633,
"grad_norm": 0.7333597540855408,
"learning_rate": 1.89335267817706e-06,
"loss": 0.1286949872970581,
"memory(GiB)": 31.97,
"step": 1740,
"token_acc": 0.9547167656464138,
"train_speed(iter/s)": 0.123228
},
{
"epoch": 2.140059894033633,
"eval_loss": 0.2735785245895386,
"eval_runtime": 29.6624,
"eval_samples_per_second": 17.733,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.917639429312581,
"step": 1740
},
{
"epoch": 2.1462028718421253,
"grad_norm": 0.8134050369262695,
"learning_rate": 1.8681856992212211e-06,
"loss": 0.1448550343513489,
"memory(GiB)": 31.97,
"step": 1745,
"token_acc": 0.9220646406174626,
"train_speed(iter/s)": 0.122967
},
{
"epoch": 2.1523458496506183,
"grad_norm": 0.6919598579406738,
"learning_rate": 1.8431486207152704e-06,
"loss": 0.12585388422012328,
"memory(GiB)": 31.97,
"step": 1750,
"token_acc": 0.9565885062902368,
"train_speed(iter/s)": 0.12304
},
{
"epoch": 2.1584888274591107,
"grad_norm": 0.7137647271156311,
"learning_rate": 1.8182424811396131e-06,
"loss": 0.13218532800674437,
"memory(GiB)": 31.97,
"step": 1755,
"token_acc": 0.9553192383674499,
"train_speed(iter/s)": 0.123112
},
{
"epoch": 2.1646318052676037,
"grad_norm": 0.8254464864730835,
"learning_rate": 1.7934683135435993e-06,
"loss": 0.15681140422821044,
"memory(GiB)": 31.97,
"step": 1760,
"token_acc": 0.9500492764147518,
"train_speed(iter/s)": 0.123204
},
{
"epoch": 2.1646318052676037,
"eval_loss": 0.2743065655231476,
"eval_runtime": 29.7411,
"eval_samples_per_second": 17.686,
"eval_steps_per_second": 4.438,
"eval_token_acc": 0.91786280443868,
"step": 1760
},
{
"epoch": 2.170774783076096,
"grad_norm": 0.8115731477737427,
"learning_rate": 1.7688271455026867e-06,
"loss": 0.15295430421829223,
"memory(GiB)": 31.97,
"step": 1765,
"token_acc": 0.9211377831289036,
"train_speed(iter/s)": 0.122963
},
{
"epoch": 2.1769177608845887,
"grad_norm": 0.7977039813995361,
"learning_rate": 1.7443199990758168e-06,
"loss": 0.14479312896728516,
"memory(GiB)": 31.97,
"step": 1770,
"token_acc": 0.9533281533281533,
"train_speed(iter/s)": 0.123053
},
{
"epoch": 2.1830607386930816,
"grad_norm": 0.8783808350563049,
"learning_rate": 1.7199478907630269e-06,
"loss": 0.14664456844329835,
"memory(GiB)": 31.97,
"step": 1775,
"token_acc": 0.9566591882520905,
"train_speed(iter/s)": 0.123142
},
{
"epoch": 2.189203716501574,
"grad_norm": 0.737375795841217,
"learning_rate": 1.6957118314632825e-06,
"loss": 0.12802677154541015,
"memory(GiB)": 31.97,
"step": 1780,
"token_acc": 0.9474123975142305,
"train_speed(iter/s)": 0.12324
},
{
"epoch": 2.189203716501574,
"eval_loss": 0.27472689747810364,
"eval_runtime": 29.7215,
"eval_samples_per_second": 17.698,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9179997117740308,
"step": 1780
},
{
"epoch": 2.195346694310067,
"grad_norm": 0.9062975645065308,
"learning_rate": 1.6716128264325477e-06,
"loss": 0.1491732716560364,
"memory(GiB)": 31.97,
"step": 1785,
"token_acc": 0.9252631765812785,
"train_speed(iter/s)": 0.123004
},
{
"epoch": 2.2014896721185595,
"grad_norm": 0.9112216830253601,
"learning_rate": 1.64765187524209e-06,
"loss": 0.14550890922546386,
"memory(GiB)": 31.97,
"step": 1790,
"token_acc": 0.9464007023019898,
"train_speed(iter/s)": 0.123075
},
{
"epoch": 2.207632649927052,
"grad_norm": 0.822755753993988,
"learning_rate": 1.6238299717370254e-06,
"loss": 0.14908239841461182,
"memory(GiB)": 31.97,
"step": 1795,
"token_acc": 0.9550466874166296,
"train_speed(iter/s)": 0.123153
},
{
"epoch": 2.213775627735545,
"grad_norm": 0.9669148921966553,
"learning_rate": 1.6001481039950872e-06,
"loss": 0.14041876792907715,
"memory(GiB)": 31.97,
"step": 1800,
"token_acc": 0.9552882955460927,
"train_speed(iter/s)": 0.123227
},
{
"epoch": 2.213775627735545,
"eval_loss": 0.2743581235408783,
"eval_runtime": 29.714,
"eval_samples_per_second": 17.702,
"eval_steps_per_second": 4.442,
"eval_token_acc": 0.9181150021616947,
"step": 1800
},
{
"epoch": 2.2199186055440374,
"grad_norm": 0.7515302896499634,
"learning_rate": 1.5766072542856525e-06,
"loss": 0.1314539670944214,
"memory(GiB)": 31.97,
"step": 1805,
"token_acc": 0.9241629064430544,
"train_speed(iter/s)": 0.122981
},
{
"epoch": 2.2260615833525303,
"grad_norm": 0.8223626613616943,
"learning_rate": 1.5532083990289892e-06,
"loss": 0.1447986364364624,
"memory(GiB)": 31.97,
"step": 1810,
"token_acc": 0.9575009707900073,
"train_speed(iter/s)": 0.123073
},
{
"epoch": 2.232204561161023,
"grad_norm": 0.8387971520423889,
"learning_rate": 1.5299525087557682e-06,
"loss": 0.12721827030181884,
"memory(GiB)": 31.97,
"step": 1815,
"token_acc": 0.9589310504396112,
"train_speed(iter/s)": 0.123144
},
{
"epoch": 2.2383475389695153,
"grad_norm": 0.8764155507087708,
"learning_rate": 1.5068405480667975e-06,
"loss": 0.1495474696159363,
"memory(GiB)": 31.97,
"step": 1820,
"token_acc": 0.9530952884005915,
"train_speed(iter/s)": 0.123223
},
{
"epoch": 2.2383475389695153,
"eval_loss": 0.2754988670349121,
"eval_runtime": 29.5873,
"eval_samples_per_second": 17.778,
"eval_steps_per_second": 4.461,
"eval_token_acc": 0.9176898688571841,
"step": 1820
},
{
"epoch": 2.2444905167780083,
"grad_norm": 0.8443633317947388,
"learning_rate": 1.4838734755930168e-06,
"loss": 0.14514811038970948,
"memory(GiB)": 31.97,
"step": 1825,
"token_acc": 0.9260831823671497,
"train_speed(iter/s)": 0.122984
},
{
"epoch": 2.2506334945865007,
"grad_norm": 0.8468235731124878,
"learning_rate": 1.461052243955739e-06,
"loss": 0.14231607913970948,
"memory(GiB)": 31.97,
"step": 1830,
"token_acc": 0.9431918169819622,
"train_speed(iter/s)": 0.123052
},
{
"epoch": 2.2567764723949937,
"grad_norm": 0.8009449243545532,
"learning_rate": 1.4383777997271347e-06,
"loss": 0.13485580682754517,
"memory(GiB)": 31.97,
"step": 1835,
"token_acc": 0.954977119519756,
"train_speed(iter/s)": 0.123135
},
{
"epoch": 2.262919450203486,
"grad_norm": 0.8349820971488953,
"learning_rate": 1.4158510833909688e-06,
"loss": 0.1553872346878052,
"memory(GiB)": 31.97,
"step": 1840,
"token_acc": 0.9466980320156062,
"train_speed(iter/s)": 0.12321
},
{
"epoch": 2.262919450203486,
"eval_loss": 0.2755924463272095,
"eval_runtime": 29.7545,
"eval_samples_per_second": 17.678,
"eval_steps_per_second": 4.436,
"eval_token_acc": 0.9181582360570687,
"step": 1840
},
{
"epoch": 2.2690624280119787,
"grad_norm": 0.9547154903411865,
"learning_rate": 1.3934730293035935e-06,
"loss": 0.1530256986618042,
"memory(GiB)": 31.97,
"step": 1845,
"token_acc": 0.9236753100338219,
"train_speed(iter/s)": 0.122997
},
{
"epoch": 2.2752054058204716,
"grad_norm": 0.8379245400428772,
"learning_rate": 1.3712445656551904e-06,
"loss": 0.14752573966979982,
"memory(GiB)": 31.97,
"step": 1850,
"token_acc": 0.9471953309555793,
"train_speed(iter/s)": 0.123078
},
{
"epoch": 2.281348383628964,
"grad_norm": 0.7745286822319031,
"learning_rate": 1.349166614431282e-06,
"loss": 0.13339710235595703,
"memory(GiB)": 31.97,
"step": 1855,
"token_acc": 0.9574489743981269,
"train_speed(iter/s)": 0.123139
},
{
"epoch": 2.287491361437457,
"grad_norm": 0.7709481120109558,
"learning_rate": 1.3272400913744744e-06,
"loss": 0.13953914642333984,
"memory(GiB)": 31.97,
"step": 1860,
"token_acc": 0.9522856703093736,
"train_speed(iter/s)": 0.123224
},
{
"epoch": 2.287491361437457,
"eval_loss": 0.27358269691467285,
"eval_runtime": 29.7058,
"eval_samples_per_second": 17.707,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9181294134601528,
"step": 1860
},
{
"epoch": 2.2936343392459495,
"grad_norm": 0.9447105526924133,
"learning_rate": 1.3054659059464836e-06,
"loss": 0.1305554747581482,
"memory(GiB)": 31.97,
"step": 1865,
"token_acc": 0.9289236364999907,
"train_speed(iter/s)": 0.122979
},
{
"epoch": 2.299777317054442,
"grad_norm": 0.8118374347686768,
"learning_rate": 1.2838449612904108e-06,
"loss": 0.14541189670562743,
"memory(GiB)": 31.97,
"step": 1870,
"token_acc": 0.9568722866275464,
"train_speed(iter/s)": 0.123069
},
{
"epoch": 2.305920294862935,
"grad_norm": 0.7850742936134338,
"learning_rate": 1.262378154193285e-06,
"loss": 0.14573101997375487,
"memory(GiB)": 31.97,
"step": 1875,
"token_acc": 0.9437350591802227,
"train_speed(iter/s)": 0.123139
},
{
"epoch": 2.3120632726714274,
"grad_norm": 0.8586622476577759,
"learning_rate": 1.2410663750488644e-06,
"loss": 0.1231348991394043,
"memory(GiB)": 31.97,
"step": 1880,
"token_acc": 0.9528933210864716,
"train_speed(iter/s)": 0.123207
},
{
"epoch": 2.3120632726714274,
"eval_loss": 0.27450090646743774,
"eval_runtime": 29.7231,
"eval_samples_per_second": 17.697,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9181798530047557,
"step": 1880
},
{
"epoch": 2.3182062504799203,
"grad_norm": 0.7418249845504761,
"learning_rate": 1.2199105078207002e-06,
"loss": 0.15627479553222656,
"memory(GiB)": 31.97,
"step": 1885,
"token_acc": 0.9225045238007312,
"train_speed(iter/s)": 0.122962
},
{
"epoch": 2.324349228288413,
"grad_norm": 0.838173508644104,
"learning_rate": 1.1989114300054782e-06,
"loss": 0.14288971424102784,
"memory(GiB)": 31.97,
"step": 1890,
"token_acc": 0.94441322229602,
"train_speed(iter/s)": 0.123041
},
{
"epoch": 2.3304922060969053,
"grad_norm": 0.8450609445571899,
"learning_rate": 1.1780700125966232e-06,
"loss": 0.13255660533905028,
"memory(GiB)": 31.97,
"step": 1895,
"token_acc": 0.9575001607406931,
"train_speed(iter/s)": 0.123109
},
{
"epoch": 2.3366351839053983,
"grad_norm": 0.7407841086387634,
"learning_rate": 1.1573871200481634e-06,
"loss": 0.1363093614578247,
"memory(GiB)": 31.97,
"step": 1900,
"token_acc": 0.948611310292079,
"train_speed(iter/s)": 0.123174
},
{
"epoch": 2.3366351839053983,
"eval_loss": 0.27403974533081055,
"eval_runtime": 29.6863,
"eval_samples_per_second": 17.719,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9180429456694048,
"step": 1900
},
{
"epoch": 2.3427781617138907,
"grad_norm": 0.8412113189697266,
"learning_rate": 1.136863610238887e-06,
"loss": 0.151106858253479,
"memory(GiB)": 31.97,
"step": 1905,
"token_acc": 0.9202309459903064,
"train_speed(iter/s)": 0.122952
},
{
"epoch": 2.3489211395223837,
"grad_norm": 0.7621601819992065,
"learning_rate": 1.1165003344367465e-06,
"loss": 0.145496666431427,
"memory(GiB)": 31.97,
"step": 1910,
"token_acc": 0.9506583322250299,
"train_speed(iter/s)": 0.123038
},
{
"epoch": 2.355064117330876,
"grad_norm": 0.8177499175071716,
"learning_rate": 1.0962981372635629e-06,
"loss": 0.13563876152038573,
"memory(GiB)": 31.97,
"step": 1915,
"token_acc": 0.9514687814140511,
"train_speed(iter/s)": 0.123121
},
{
"epoch": 2.3612070951393687,
"grad_norm": 0.8387221693992615,
"learning_rate": 1.0762578566599818e-06,
"loss": 0.15051798820495604,
"memory(GiB)": 31.97,
"step": 1920,
"token_acc": 0.9480101984258952,
"train_speed(iter/s)": 0.123214
},
{
"epoch": 2.3612070951393687,
"eval_loss": 0.2752765119075775,
"eval_runtime": 29.73,
"eval_samples_per_second": 17.693,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.9179492722294279,
"step": 1920
},
{
"epoch": 2.3673500729478616,
"grad_norm": 0.9462500214576721,
"learning_rate": 1.056380323850722e-06,
"loss": 0.1329110622406006,
"memory(GiB)": 31.97,
"step": 1925,
"token_acc": 0.9277916379142559,
"train_speed(iter/s)": 0.122971
},
{
"epoch": 2.373493050756354,
"grad_norm": 0.6897282004356384,
"learning_rate": 1.0366663633101015e-06,
"loss": 0.14667117595672607,
"memory(GiB)": 31.97,
"step": 1930,
"token_acc": 0.9519438953214723,
"train_speed(iter/s)": 0.123037
},
{
"epoch": 2.379636028564847,
"grad_norm": 0.756504476070404,
"learning_rate": 1.0171167927278369e-06,
"loss": 0.15089083909988404,
"memory(GiB)": 31.97,
"step": 1935,
"token_acc": 0.9499266411093757,
"train_speed(iter/s)": 0.123095
},
{
"epoch": 2.3857790063733395,
"grad_norm": 0.620968222618103,
"learning_rate": 9.977324229751245e-07,
"loss": 0.13846428394317628,
"memory(GiB)": 31.97,
"step": 1940,
"token_acc": 0.9542850274450099,
"train_speed(iter/s)": 0.123159
},
{
"epoch": 2.3857790063733395,
"eval_loss": 0.273898720741272,
"eval_runtime": 29.7246,
"eval_samples_per_second": 17.696,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9181582360570687,
"step": 1940
},
{
"epoch": 2.391921984181832,
"grad_norm": 0.8139801621437073,
"learning_rate": 9.785140580710106e-07,
"loss": 0.1415960192680359,
"memory(GiB)": 31.97,
"step": 1945,
"token_acc": 0.9238499208097432,
"train_speed(iter/s)": 0.122929
},
{
"epoch": 2.398064961990325,
"grad_norm": 0.8904073238372803,
"learning_rate": 9.594624951490455e-07,
"loss": 0.15040233135223388,
"memory(GiB)": 31.97,
"step": 1950,
"token_acc": 0.9561746584516475,
"train_speed(iter/s)": 0.123001
},
{
"epoch": 2.4042079397988174,
"grad_norm": 1.0656641721725464,
"learning_rate": 9.405785244242166e-07,
"loss": 0.1426215648651123,
"memory(GiB)": 31.97,
"step": 1955,
"token_acc": 0.9464016327979412,
"train_speed(iter/s)": 0.123079
},
{
"epoch": 2.4103509176073104,
"grad_norm": 0.656574547290802,
"learning_rate": 9.218629291601699e-07,
"loss": 0.12366310358047486,
"memory(GiB)": 31.97,
"step": 1960,
"token_acc": 0.9601010101010101,
"train_speed(iter/s)": 0.123155
},
{
"epoch": 2.4103509176073104,
"eval_loss": 0.27443960309028625,
"eval_runtime": 29.6457,
"eval_samples_per_second": 17.743,
"eval_steps_per_second": 4.453,
"eval_token_acc": 0.9185473411154345,
"step": 1960
},
{
"epoch": 2.416493895415803,
"grad_norm": 0.7282077074050903,
"learning_rate": 9.033164856367271e-07,
"loss": 0.14160101413726806,
"memory(GiB)": 31.97,
"step": 1965,
"token_acc": 0.9257107472635547,
"train_speed(iter/s)": 0.122927
},
{
"epoch": 2.4226368732242953,
"grad_norm": 0.9724346995353699,
"learning_rate": 8.849399631176825e-07,
"loss": 0.13960802555084229,
"memory(GiB)": 31.97,
"step": 1970,
"token_acc": 0.957187156146844,
"train_speed(iter/s)": 0.122994
},
{
"epoch": 2.4287798510327883,
"grad_norm": 0.6904351711273193,
"learning_rate": 8.667341238189009e-07,
"loss": 0.13362197875976561,
"memory(GiB)": 31.97,
"step": 1975,
"token_acc": 0.9529874213836478,
"train_speed(iter/s)": 0.123061
},
{
"epoch": 2.4349228288412808,
"grad_norm": 0.8341466784477234,
"learning_rate": 8.486997228767013e-07,
"loss": 0.15857725143432616,
"memory(GiB)": 31.97,
"step": 1980,
"token_acc": 0.9431441341856106,
"train_speed(iter/s)": 0.123136
},
{
"epoch": 2.4349228288412808,
"eval_loss": 0.27409639954566956,
"eval_runtime": 29.6556,
"eval_samples_per_second": 17.737,
"eval_steps_per_second": 4.451,
"eval_token_acc": 0.9185329298169765,
"step": 1980
},
{
"epoch": 2.4410658066497737,
"grad_norm": 1.1063848733901978,
"learning_rate": 8.308375083165299e-07,
"loss": 0.15017662048339844,
"memory(GiB)": 31.97,
"step": 1985,
"token_acc": 0.9246744744307849,
"train_speed(iter/s)": 0.122915
},
{
"epoch": 2.447208784458266,
"grad_norm": 0.8507280945777893,
"learning_rate": 8.131482210219383e-07,
"loss": 0.1420647144317627,
"memory(GiB)": 31.97,
"step": 1990,
"token_acc": 0.9540951446787641,
"train_speed(iter/s)": 0.122981
},
{
"epoch": 2.4533517622667587,
"grad_norm": 0.7251470685005188,
"learning_rate": 7.956325947038585e-07,
"loss": 0.13122901916503907,
"memory(GiB)": 31.97,
"step": 1995,
"token_acc": 0.9569695888700616,
"train_speed(iter/s)": 0.123056
},
{
"epoch": 2.4594947400752516,
"grad_norm": 0.6275292038917542,
"learning_rate": 7.782913558701572e-07,
"loss": 0.13776025772094727,
"memory(GiB)": 31.97,
"step": 2000,
"token_acc": 0.9476885644768857,
"train_speed(iter/s)": 0.123119
},
{
"epoch": 2.4594947400752516,
"eval_loss": 0.2740454375743866,
"eval_runtime": 29.8131,
"eval_samples_per_second": 17.643,
"eval_steps_per_second": 4.428,
"eval_token_acc": 0.9183816111831676,
"step": 2000
},
{
"epoch": 2.465637717883744,
"grad_norm": 0.830629825592041,
"learning_rate": 7.611252237955168e-07,
"loss": 0.12884964942932128,
"memory(GiB)": 31.97,
"step": 2005,
"token_acc": 0.926461027233981,
"train_speed(iter/s)": 0.122895
},
{
"epoch": 2.471780695692237,
"grad_norm": 0.8567506670951843,
"learning_rate": 7.44134910491589e-07,
"loss": 0.15558898448944092,
"memory(GiB)": 31.97,
"step": 2010,
"token_acc": 0.9473684210526315,
"train_speed(iter/s)": 0.122982
},
{
"epoch": 2.4779236735007295,
"grad_norm": 0.9071369171142578,
"learning_rate": 7.273211206774711e-07,
"loss": 0.14407318830490112,
"memory(GiB)": 31.97,
"step": 2015,
"token_acc": 0.9545647558386412,
"train_speed(iter/s)": 0.123063
},
{
"epoch": 2.484066651309222,
"grad_norm": 0.8281042575836182,
"learning_rate": 7.106845517504684e-07,
"loss": 0.14147133827209474,
"memory(GiB)": 31.97,
"step": 2020,
"token_acc": 0.9535927353360435,
"train_speed(iter/s)": 0.123126
},
{
"epoch": 2.484066651309222,
"eval_loss": 0.2732316851615906,
"eval_runtime": 29.6919,
"eval_samples_per_second": 17.715,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9181510304078397,
"step": 2020
},
{
"epoch": 2.490209629117715,
"grad_norm": 0.8336676955223083,
"learning_rate": 6.942258937571772e-07,
"loss": 0.1445910692214966,
"memory(GiB)": 31.97,
"step": 2025,
"token_acc": 0.9194783843365841,
"train_speed(iter/s)": 0.122916
},
{
"epoch": 2.4963526069262074,
"grad_norm": 0.7258805632591248,
"learning_rate": 6.779458293648506e-07,
"loss": 0.13561407327651978,
"memory(GiB)": 31.97,
"step": 2030,
"token_acc": 0.9523143224939833,
"train_speed(iter/s)": 0.122985
},
{
"epoch": 2.5024955847347004,
"grad_norm": 0.8204253911972046,
"learning_rate": 6.618450338330978e-07,
"loss": 0.14749345779418946,
"memory(GiB)": 31.97,
"step": 2035,
"token_acc": 0.9511697728431429,
"train_speed(iter/s)": 0.12307
},
{
"epoch": 2.508638562543193,
"grad_norm": 0.755736768245697,
"learning_rate": 6.459241749858619e-07,
"loss": 0.1365538001060486,
"memory(GiB)": 31.97,
"step": 2040,
"token_acc": 0.94921875,
"train_speed(iter/s)": 0.123134
},
{
"epoch": 2.508638562543193,
"eval_loss": 0.2726115584373474,
"eval_runtime": 29.6337,
"eval_samples_per_second": 17.75,
"eval_steps_per_second": 4.454,
"eval_token_acc": 0.9181870586539848,
"step": 2040
},
{
"epoch": 2.5147815403516853,
"grad_norm": 0.7565985918045044,
"learning_rate": 6.301839131837284e-07,
"loss": 0.14346761703491212,
"memory(GiB)": 31.97,
"step": 2045,
"token_acc": 0.9241052727438303,
"train_speed(iter/s)": 0.122918
},
{
"epoch": 2.5209245181601783,
"grad_norm": 0.8396884202957153,
"learning_rate": 6.146249012965349e-07,
"loss": 0.13507163524627686,
"memory(GiB)": 31.97,
"step": 2050,
"token_acc": 0.9460710284016174,
"train_speed(iter/s)": 0.122992
},
{
"epoch": 2.5270674959686708,
"grad_norm": 0.7319416999816895,
"learning_rate": 5.992477846762896e-07,
"loss": 0.13839869499206542,
"memory(GiB)": 31.97,
"step": 2055,
"token_acc": 0.950456398185889,
"train_speed(iter/s)": 0.123056
},
{
"epoch": 2.5332104737771637,
"grad_norm": 0.7878042459487915,
"learning_rate": 5.840532011303996e-07,
"loss": 0.15459495782852173,
"memory(GiB)": 31.97,
"step": 2060,
"token_acc": 0.9486288752039581,
"train_speed(iter/s)": 0.12312
},
{
"epoch": 2.5332104737771637,
"eval_loss": 0.27472230792045593,
"eval_runtime": 29.5056,
"eval_samples_per_second": 17.827,
"eval_steps_per_second": 4.474,
"eval_token_acc": 0.9183744055339386,
"step": 2060
},
{
"epoch": 2.539353451585656,
"grad_norm": 0.8421174883842468,
"learning_rate": 5.690417808952243e-07,
"loss": 0.15741729736328125,
"memory(GiB)": 31.97,
"step": 2065,
"token_acc": 0.9253989855251763,
"train_speed(iter/s)": 0.122912
},
{
"epoch": 2.5454964293941487,
"grad_norm": 0.8520276546478271,
"learning_rate": 5.542141466099271e-07,
"loss": 0.1434725046157837,
"memory(GiB)": 31.97,
"step": 2070,
"token_acc": 0.9435738510115776,
"train_speed(iter/s)": 0.122999
},
{
"epoch": 2.5516394072026416,
"grad_norm": 0.8779215216636658,
"learning_rate": 5.395709132906569e-07,
"loss": 0.13479983806610107,
"memory(GiB)": 31.97,
"step": 2075,
"token_acc": 0.950151781434734,
"train_speed(iter/s)": 0.123082
},
{
"epoch": 2.557782385011134,
"grad_norm": 0.7466580867767334,
"learning_rate": 5.251126883050333e-07,
"loss": 0.13184006214141847,
"memory(GiB)": 31.97,
"step": 2080,
"token_acc": 0.954845163930699,
"train_speed(iter/s)": 0.12315
},
{
"epoch": 2.557782385011134,
"eval_loss": 0.27403029799461365,
"eval_runtime": 29.675,
"eval_samples_per_second": 17.725,
"eval_steps_per_second": 4.448,
"eval_token_acc": 0.9182735264447327,
"step": 2080
},
{
"epoch": 2.563925362819627,
"grad_norm": 0.9100626707077026,
"learning_rate": 5.108400713469547e-07,
"loss": 0.15517921447753907,
"memory(GiB)": 31.97,
"step": 2085,
"token_acc": 0.9215290970418331,
"train_speed(iter/s)": 0.122941
},
{
"epoch": 2.5700683406281195,
"grad_norm": 0.7965090870857239,
"learning_rate": 4.967536544117263e-07,
"loss": 0.1428399920463562,
"memory(GiB)": 31.97,
"step": 2090,
"token_acc": 0.9566871852266369,
"train_speed(iter/s)": 0.123002
},
{
"epoch": 2.576211318436612,
"grad_norm": 0.8939210772514343,
"learning_rate": 4.828540217715067e-07,
"loss": 0.15376098155975343,
"memory(GiB)": 31.97,
"step": 2095,
"token_acc": 0.9479303634355442,
"train_speed(iter/s)": 0.123068
},
{
"epoch": 2.582354296245105,
"grad_norm": 0.825840950012207,
"learning_rate": 4.6914174995106863e-07,
"loss": 0.14222912788391112,
"memory(GiB)": 31.97,
"step": 2100,
"token_acc": 0.9461206896551724,
"train_speed(iter/s)": 0.123144
},
{
"epoch": 2.582354296245105,
"eval_loss": 0.27377504110336304,
"eval_runtime": 29.7092,
"eval_samples_per_second": 17.705,
"eval_steps_per_second": 4.443,
"eval_token_acc": 0.9185257241677475,
"step": 2100
},
{
"epoch": 2.5884972740535974,
"grad_norm": 0.8890196681022644,
"learning_rate": 4.556174077038927e-07,
"loss": 0.14791591167449952,
"memory(GiB)": 31.97,
"step": 2105,
"token_acc": 0.9235944439194935,
"train_speed(iter/s)": 0.122918
},
{
"epoch": 2.5946402518620904,
"grad_norm": 0.7061293721199036,
"learning_rate": 4.422815559885696e-07,
"loss": 0.13169264793395996,
"memory(GiB)": 31.97,
"step": 2110,
"token_acc": 0.948949511019606,
"train_speed(iter/s)": 0.122982
},
{
"epoch": 2.600783229670583,
"grad_norm": 0.8161597847938538,
"learning_rate": 4.2913474794554044e-07,
"loss": 0.1341610074043274,
"memory(GiB)": 31.97,
"step": 2115,
"token_acc": 0.9497422680412371,
"train_speed(iter/s)": 0.123053
},
{
"epoch": 2.6069262074790753,
"grad_norm": 0.8730164766311646,
"learning_rate": 4.161775288741454e-07,
"loss": 0.15282490253448486,
"memory(GiB)": 31.97,
"step": 2120,
"token_acc": 0.943151087595532,
"train_speed(iter/s)": 0.123121
},
{
"epoch": 2.6069262074790753,
"eval_loss": 0.2740446925163269,
"eval_runtime": 29.6992,
"eval_samples_per_second": 17.711,
"eval_steps_per_second": 4.445,
"eval_token_acc": 0.9185833693615795,
"step": 2120
},
{
"epoch": 2.6130691852875683,
"grad_norm": 0.835308313369751,
"learning_rate": 4.034104362100155e-07,
"loss": 0.13589699268341066,
"memory(GiB)": 31.97,
"step": 2125,
"token_acc": 0.9286285805728917,
"train_speed(iter/s)": 0.1229
},
{
"epoch": 2.6192121630960608,
"grad_norm": 0.8869427442550659,
"learning_rate": 3.9083399950277156e-07,
"loss": 0.14998774528503417,
"memory(GiB)": 31.97,
"step": 2130,
"token_acc": 0.9448426301028358,
"train_speed(iter/s)": 0.122982
},
{
"epoch": 2.6253551409045537,
"grad_norm": 0.8314644694328308,
"learning_rate": 3.7844874039406677e-07,
"loss": 0.12793076038360596,
"memory(GiB)": 31.97,
"step": 2135,
"token_acc": 0.9614247859763555,
"train_speed(iter/s)": 0.123051
},
{
"epoch": 2.631498118713046,
"grad_norm": 0.8190094828605652,
"learning_rate": 3.6625517259594566e-07,
"loss": 0.14857040643692015,
"memory(GiB)": 31.97,
"step": 2140,
"token_acc": 0.9486269539501478,
"train_speed(iter/s)": 0.123129
},
{
"epoch": 2.631498118713046,
"eval_loss": 0.27396196126937866,
"eval_runtime": 29.662,
"eval_samples_per_second": 17.733,
"eval_steps_per_second": 4.45,
"eval_token_acc": 0.9183023490416486,
"step": 2140
},
{
"epoch": 2.6376410965215387,
"grad_norm": 0.8906332850456238,
"learning_rate": 3.5425380186953905e-07,
"loss": 0.15265541076660155,
"memory(GiB)": 31.97,
"step": 2145,
"token_acc": 0.9210909443851305,
"train_speed(iter/s)": 0.122924
},
{
"epoch": 2.6437840743300316,
"grad_norm": 0.8447383642196655,
"learning_rate": 3.424451260040862e-07,
"loss": 0.1445927381515503,
"memory(GiB)": 31.97,
"step": 2150,
"token_acc": 0.951250271798217,
"train_speed(iter/s)": 0.123006
},
{
"epoch": 2.649927052138524,
"grad_norm": 0.8714754581451416,
"learning_rate": 3.3082963479628747e-07,
"loss": 0.15293993949890136,
"memory(GiB)": 31.97,
"step": 2155,
"token_acc": 0.9348866900734121,
"train_speed(iter/s)": 0.123079
},
{
"epoch": 2.656070029947017,
"grad_norm": 0.7784111499786377,
"learning_rate": 3.194078100299863e-07,
"loss": 0.13703620433807373,
"memory(GiB)": 31.97,
"step": 2160,
"token_acc": 0.955746644295302,
"train_speed(iter/s)": 0.123143
},
{
"epoch": 2.656070029947017,
"eval_loss": 0.2733152210712433,
"eval_runtime": 29.7291,
"eval_samples_per_second": 17.693,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.9185329298169765,
"step": 2160
},
{
"epoch": 2.6622130077555095,
"grad_norm": 0.8391521573066711,
"learning_rate": 3.0818012545618836e-07,
"loss": 0.13625545501708985,
"memory(GiB)": 31.97,
"step": 2165,
"token_acc": 0.9254722933600564,
"train_speed(iter/s)": 0.12293
},
{
"epoch": 2.668355985564002,
"grad_norm": 0.8696740865707397,
"learning_rate": 2.9714704677341055e-07,
"loss": 0.15032825469970704,
"memory(GiB)": 31.97,
"step": 2170,
"token_acc": 0.9501612578109252,
"train_speed(iter/s)": 0.122998
},
{
"epoch": 2.674498963372495,
"grad_norm": 0.7542688250541687,
"learning_rate": 2.8630903160836776e-07,
"loss": 0.14371325969696044,
"memory(GiB)": 31.97,
"step": 2175,
"token_acc": 0.9407879649589506,
"train_speed(iter/s)": 0.123074
},
{
"epoch": 2.6806419411809874,
"grad_norm": 0.703037679195404,
"learning_rate": 2.756665294969868e-07,
"loss": 0.13071630001068116,
"memory(GiB)": 31.97,
"step": 2180,
"token_acc": 0.9524260355029586,
"train_speed(iter/s)": 0.123154
},
{
"epoch": 2.6806419411809874,
"eval_loss": 0.2734775245189667,
"eval_runtime": 29.4649,
"eval_samples_per_second": 17.852,
"eval_steps_per_second": 4.48,
"eval_token_acc": 0.9185833693615795,
"step": 2180
},
{
"epoch": 2.6867849189894804,
"grad_norm": 0.7227668166160583,
"learning_rate": 2.6521998186576357e-07,
"loss": 0.13176329135894777,
"memory(GiB)": 31.97,
"step": 2185,
"token_acc": 0.9272463413354781,
"train_speed(iter/s)": 0.122934
},
{
"epoch": 2.692927896797973,
"grad_norm": 0.812665581703186,
"learning_rate": 2.549698220134517e-07,
"loss": 0.14241292476654052,
"memory(GiB)": 31.97,
"step": 2190,
"token_acc": 0.9535809018567639,
"train_speed(iter/s)": 0.122998
},
{
"epoch": 2.6990708746064653,
"grad_norm": 0.8766520023345947,
"learning_rate": 2.449164750930938e-07,
"loss": 0.14588472843170167,
"memory(GiB)": 31.97,
"step": 2195,
"token_acc": 0.9549009533595936,
"train_speed(iter/s)": 0.123059
},
{
"epoch": 2.7052138524149583,
"grad_norm": 0.9236011505126953,
"learning_rate": 2.3506035809438553e-07,
"loss": 0.1432283639907837,
"memory(GiB)": 31.97,
"step": 2200,
"token_acc": 0.9595282766014473,
"train_speed(iter/s)": 0.123129
},
{
"epoch": 2.7052138524149583,
"eval_loss": 0.27359798550605774,
"eval_runtime": 29.6892,
"eval_samples_per_second": 17.717,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9185833693615795,
"step": 2200
},
{
"epoch": 2.7113568302234508,
"grad_norm": 0.7723605036735535,
"learning_rate": 2.2540187982637628e-07,
"loss": 0.1351910948753357,
"memory(GiB)": 31.97,
"step": 2205,
"token_acc": 0.9280636513015653,
"train_speed(iter/s)": 0.122914
},
{
"epoch": 2.7174998080319437,
"grad_norm": 0.7751135230064392,
"learning_rate": 2.1594144090051728e-07,
"loss": 0.14594308137893677,
"memory(GiB)": 31.97,
"step": 2210,
"token_acc": 0.9436165379373013,
"train_speed(iter/s)": 0.122972
},
{
"epoch": 2.723642785840436,
"grad_norm": 0.8251017928123474,
"learning_rate": 2.066794337140443e-07,
"loss": 0.13928499221801757,
"memory(GiB)": 31.97,
"step": 2215,
"token_acc": 0.9523616048755713,
"train_speed(iter/s)": 0.123029
},
{
"epoch": 2.7297857636489287,
"grad_norm": 0.805425763130188,
"learning_rate": 1.9761624243370026e-07,
"loss": 0.13413631916046143,
"memory(GiB)": 31.97,
"step": 2220,
"token_acc": 0.9549077181208053,
"train_speed(iter/s)": 0.123085
},
{
"epoch": 2.7297857636489287,
"eval_loss": 0.27328255772590637,
"eval_runtime": 29.7048,
"eval_samples_per_second": 17.708,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9185257241677475,
"step": 2220
},
{
"epoch": 2.7359287414574216,
"grad_norm": 0.720260739326477,
"learning_rate": 1.8875224297980332e-07,
"loss": 0.14869468212127684,
"memory(GiB)": 31.97,
"step": 2225,
"token_acc": 0.9208377041810281,
"train_speed(iter/s)": 0.122897
},
{
"epoch": 2.742071719265914,
"grad_norm": 0.7555059194564819,
"learning_rate": 1.800878030106501e-07,
"loss": 0.13696482181549072,
"memory(GiB)": 31.97,
"step": 2230,
"token_acc": 0.951278626898155,
"train_speed(iter/s)": 0.122958
},
{
"epoch": 2.748214697074407,
"grad_norm": 0.9698434472084045,
"learning_rate": 1.7162328190727217e-07,
"loss": 0.16066057682037355,
"memory(GiB)": 31.97,
"step": 2235,
"token_acc": 0.9414913717092833,
"train_speed(iter/s)": 0.123034
},
{
"epoch": 2.7543576748828995,
"grad_norm": 0.9093776345252991,
"learning_rate": 1.6335903075852478e-07,
"loss": 0.13771231174468995,
"memory(GiB)": 31.97,
"step": 2240,
"token_acc": 0.953091935104632,
"train_speed(iter/s)": 0.123095
},
{
"epoch": 2.7543576748828995,
"eval_loss": 0.2732333838939667,
"eval_runtime": 29.6848,
"eval_samples_per_second": 17.72,
"eval_steps_per_second": 4.447,
"eval_token_acc": 0.9184104337800836,
"step": 2240
},
{
"epoch": 2.760500652691392,
"grad_norm": 0.9624541997909546,
"learning_rate": 1.552953923465267e-07,
"loss": 0.1540065288543701,
"memory(GiB)": 31.97,
"step": 2245,
"token_acc": 0.9251303793194012,
"train_speed(iter/s)": 0.122899
},
{
"epoch": 2.766643630499885,
"grad_norm": 0.6570599675178528,
"learning_rate": 1.4743270113244278e-07,
"loss": 0.11645562648773193,
"memory(GiB)": 31.97,
"step": 2250,
"token_acc": 0.960995889387145,
"train_speed(iter/s)": 0.122957
},
{
"epoch": 2.7727866083083774,
"grad_norm": 0.8584187030792236,
"learning_rate": 1.3977128324261068e-07,
"loss": 0.1433710813522339,
"memory(GiB)": 31.97,
"step": 2255,
"token_acc": 0.9531076066790353,
"train_speed(iter/s)": 0.123031
},
{
"epoch": 2.7789295861168704,
"grad_norm": 0.8224872350692749,
"learning_rate": 1.3231145645501153e-07,
"loss": 0.14238922595977782,
"memory(GiB)": 31.97,
"step": 2260,
"token_acc": 0.9506010814215969,
"train_speed(iter/s)": 0.123085
},
{
"epoch": 2.7789295861168704,
"eval_loss": 0.2732416093349457,
"eval_runtime": 29.7221,
"eval_samples_per_second": 17.697,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9183599942354806,
"step": 2260
},
{
"epoch": 2.785072563925363,
"grad_norm": 0.9489020705223083,
"learning_rate": 1.2505353018609445e-07,
"loss": 0.14603989124298095,
"memory(GiB)": 31.97,
"step": 2265,
"token_acc": 0.9227581508884137,
"train_speed(iter/s)": 0.122896
},
{
"epoch": 2.7912155417338553,
"grad_norm": 0.736284077167511,
"learning_rate": 1.1799780547793682e-07,
"loss": 0.14218697547912598,
"memory(GiB)": 31.97,
"step": 2270,
"token_acc": 0.9579674123170395,
"train_speed(iter/s)": 0.122963
},
{
"epoch": 2.7973585195423483,
"grad_norm": 0.7859813570976257,
"learning_rate": 1.111445749857626e-07,
"loss": 0.1413131594657898,
"memory(GiB)": 31.97,
"step": 2275,
"token_acc": 0.9514263252470799,
"train_speed(iter/s)": 0.123032
},
{
"epoch": 2.8035014973508408,
"grad_norm": 1.0802668333053589,
"learning_rate": 1.0449412296580252e-07,
"loss": 0.1472024917602539,
"memory(GiB)": 31.97,
"step": 2280,
"token_acc": 0.9520010294685368,
"train_speed(iter/s)": 0.123092
},
{
"epoch": 2.8035014973508408,
"eval_loss": 0.273179292678833,
"eval_runtime": 29.7252,
"eval_samples_per_second": 17.695,
"eval_steps_per_second": 4.441,
"eval_token_acc": 0.9184392563769995,
"step": 2280
},
{
"epoch": 2.8096444751593337,
"grad_norm": 0.7576152682304382,
"learning_rate": 9.804672526349979e-08,
"loss": 0.14902775287628173,
"memory(GiB)": 31.97,
"step": 2285,
"token_acc": 0.9238161925601751,
"train_speed(iter/s)": 0.122903
},
{
"epoch": 2.815787452967826,
"grad_norm": 0.8574935793876648,
"learning_rate": 9.180264930207405e-08,
"loss": 0.1530381441116333,
"memory(GiB)": 31.97,
"step": 2290,
"token_acc": 0.9533666759284987,
"train_speed(iter/s)": 0.12298
},
{
"epoch": 2.8219304307763187,
"grad_norm": 0.7855550050735474,
"learning_rate": 8.576215407142652e-08,
"loss": 0.12575039863586426,
"memory(GiB)": 31.97,
"step": 2295,
"token_acc": 0.9574297591025192,
"train_speed(iter/s)": 0.123043
},
{
"epoch": 2.8280734085848116,
"grad_norm": 0.8387411236763,
"learning_rate": 7.992549011739903e-08,
"loss": 0.14488180875778198,
"memory(GiB)": 31.97,
"step": 2300,
"token_acc": 0.9483278379651436,
"train_speed(iter/s)": 0.123101
},
{
"epoch": 2.8280734085848116,
"eval_loss": 0.2733011543750763,
"eval_runtime": 29.6296,
"eval_samples_per_second": 17.753,
"eval_steps_per_second": 4.455,
"eval_token_acc": 0.9184680789739156,
"step": 2300
},
{
"epoch": 2.834216386393304,
"grad_norm": 0.9076153039932251,
"learning_rate": 7.42928995313802e-08,
"loss": 0.14017899036407472,
"memory(GiB)": 31.97,
"step": 2305,
"token_acc": 0.9183514619299471,
"train_speed(iter/s)": 0.122904
},
{
"epoch": 2.840359364201797,
"grad_norm": 0.8693546056747437,
"learning_rate": 6.886461594026394e-08,
"loss": 0.134627628326416,
"memory(GiB)": 31.97,
"step": 2310,
"token_acc": 0.9542199129335768,
"train_speed(iter/s)": 0.122955
},
{
"epoch": 2.8465023420102895,
"grad_norm": 0.6884361505508423,
"learning_rate": 6.364086449676233e-08,
"loss": 0.11727933883666992,
"memory(GiB)": 31.97,
"step": 2315,
"token_acc": 0.9606529928840519,
"train_speed(iter/s)": 0.123004
},
{
"epoch": 2.852645319818782,
"grad_norm": 0.8520733118057251,
"learning_rate": 5.862186187006347e-08,
"loss": 0.13235876560211182,
"memory(GiB)": 31.97,
"step": 2320,
"token_acc": 0.9552718507276136,
"train_speed(iter/s)": 0.12307
},
{
"epoch": 2.852645319818782,
"eval_loss": 0.2732396423816681,
"eval_runtime": 29.7321,
"eval_samples_per_second": 17.691,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.9184248450785416,
"step": 2320
},
{
"epoch": 2.858788297627275,
"grad_norm": 0.6756072640419006,
"learning_rate": 5.3807816236846614e-08,
"loss": 0.14616656303405762,
"memory(GiB)": 31.97,
"step": 2325,
"token_acc": 0.9212030774597392,
"train_speed(iter/s)": 0.122879
},
{
"epoch": 2.8649312754357674,
"grad_norm": 0.7540440559387207,
"learning_rate": 4.919892727264508e-08,
"loss": 0.1399930238723755,
"memory(GiB)": 31.97,
"step": 2330,
"token_acc": 0.9455614286419997,
"train_speed(iter/s)": 0.122935
},
{
"epoch": 2.8710742532442604,
"grad_norm": 0.7465505599975586,
"learning_rate": 4.4795386143567375e-08,
"loss": 0.14697123765945436,
"memory(GiB)": 31.97,
"step": 2335,
"token_acc": 0.9416914178521182,
"train_speed(iter/s)": 0.123005
},
{
"epoch": 2.877217231052753,
"grad_norm": 0.8006975650787354,
"learning_rate": 4.0597375498365175e-08,
"loss": 0.14045066833496095,
"memory(GiB)": 31.97,
"step": 2340,
"token_acc": 0.9556364912896573,
"train_speed(iter/s)": 0.123073
},
{
"epoch": 2.877217231052753,
"eval_loss": 0.2732827663421631,
"eval_runtime": 29.7807,
"eval_samples_per_second": 17.662,
"eval_steps_per_second": 4.432,
"eval_token_acc": 0.9184320507277706,
"step": 2340
},
{
"epoch": 2.8833602088612453,
"grad_norm": 0.7655653357505798,
"learning_rate": 3.6605069460858286e-08,
"loss": 0.14170855283737183,
"memory(GiB)": 31.97,
"step": 2345,
"token_acc": 0.9223746043924427,
"train_speed(iter/s)": 0.122886
},
{
"epoch": 2.8895031866697383,
"grad_norm": 0.7868750691413879,
"learning_rate": 3.281863362271487e-08,
"loss": 0.13158297538757324,
"memory(GiB)": 31.97,
"step": 2350,
"token_acc": 0.9610517504554631,
"train_speed(iter/s)": 0.122944
},
{
"epoch": 2.8956461644782308,
"grad_norm": 0.9070404171943665,
"learning_rate": 2.9238225036579693e-08,
"loss": 0.13984733819961548,
"memory(GiB)": 31.97,
"step": 2355,
"token_acc": 0.9568466078293356,
"train_speed(iter/s)": 0.123006
},
{
"epoch": 2.9017891422867237,
"grad_norm": 0.826079249382019,
"learning_rate": 2.5863992209560484e-08,
"loss": 0.1394752025604248,
"memory(GiB)": 31.97,
"step": 2360,
"token_acc": 0.9418652788455852,
"train_speed(iter/s)": 0.123076
},
{
"epoch": 2.9017891422867237,
"eval_loss": 0.27347302436828613,
"eval_runtime": 29.7339,
"eval_samples_per_second": 17.69,
"eval_steps_per_second": 4.439,
"eval_token_acc": 0.9184464620262286,
"step": 2360
},
{
"epoch": 2.907932120095216,
"grad_norm": 0.9018839001655579,
"learning_rate": 2.269607509707006e-08,
"loss": 0.1596289873123169,
"memory(GiB)": 31.97,
"step": 2365,
"token_acc": 0.9238603473227207,
"train_speed(iter/s)": 0.122868
},
{
"epoch": 2.9140750979037087,
"grad_norm": 0.8631731271743774,
"learning_rate": 1.97346050970193e-08,
"loss": 0.1415793776512146,
"memory(GiB)": 31.97,
"step": 2370,
"token_acc": 0.9460295790671217,
"train_speed(iter/s)": 0.122945
},
{
"epoch": 2.9202180757122016,
"grad_norm": 0.8872095942497253,
"learning_rate": 1.69797050443693e-08,
"loss": 0.13625437021255493,
"memory(GiB)": 31.97,
"step": 2375,
"token_acc": 0.9583434245580044,
"train_speed(iter/s)": 0.123012
},
{
"epoch": 2.926361053520694,
"grad_norm": 0.8124271035194397,
"learning_rate": 1.4431489206034321e-08,
"loss": 0.14173973798751832,
"memory(GiB)": 31.97,
"step": 2380,
"token_acc": 0.9510019878579488,
"train_speed(iter/s)": 0.123073
},
{
"epoch": 2.926361053520694,
"eval_loss": 0.27337023615837097,
"eval_runtime": 29.6435,
"eval_samples_per_second": 17.744,
"eval_steps_per_second": 4.453,
"eval_token_acc": 0.9184969015708315,
"step": 2380
},
{
"epoch": 2.932504031329187,
"grad_norm": 0.8951108455657959,
"learning_rate": 1.2090063276142261e-08,
"loss": 0.13466954231262207,
"memory(GiB)": 31.97,
"step": 2385,
"token_acc": 0.9219987812309567,
"train_speed(iter/s)": 0.122895
},
{
"epoch": 2.9386470091376795,
"grad_norm": 0.9628083109855652,
"learning_rate": 9.955524371653146e-09,
"loss": 0.15039776563644408,
"memory(GiB)": 31.97,
"step": 2390,
"token_acc": 0.9414807461204869,
"train_speed(iter/s)": 0.12296
},
{
"epoch": 2.944789986946172,
"grad_norm": 0.8701411485671997,
"learning_rate": 8.02796102832848e-09,
"loss": 0.13970096111297609,
"memory(GiB)": 31.97,
"step": 2395,
"token_acc": 0.9505520319473808,
"train_speed(iter/s)": 0.123027
},
{
"epoch": 2.950932964754665,
"grad_norm": 0.9468409419059753,
"learning_rate": 6.307453197059166e-09,
"loss": 0.14919402599334716,
"memory(GiB)": 31.97,
"step": 2400,
"token_acc": 0.9412962147887324,
"train_speed(iter/s)": 0.12308
},
{
"epoch": 2.950932964754665,
"eval_loss": 0.27350106835365295,
"eval_runtime": 29.668,
"eval_samples_per_second": 17.73,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9183167603401067,
"step": 2400
},
{
"epoch": 2.9570759425631574,
"grad_norm": 0.8505570292472839,
"learning_rate": 4.794072240550951e-09,
"loss": 0.1571817636489868,
"memory(GiB)": 31.97,
"step": 2405,
"token_acc": 0.920852764823451,
"train_speed(iter/s)": 0.122896
},
{
"epoch": 2.9632189203716504,
"grad_norm": 0.7646933197975159,
"learning_rate": 3.487880930363452e-09,
"loss": 0.13370524644851683,
"memory(GiB)": 31.97,
"step": 2410,
"token_acc": 0.9574316090263478,
"train_speed(iter/s)": 0.122964
},
{
"epoch": 2.969361898180143,
"grad_norm": 0.8396750092506409,
"learning_rate": 2.3889334443055743e-09,
"loss": 0.14388556480407716,
"memory(GiB)": 31.97,
"step": 2415,
"token_acc": 0.9585525888390827,
"train_speed(iter/s)": 0.123037
},
{
"epoch": 2.9755048759886353,
"grad_norm": 0.6261889338493347,
"learning_rate": 1.4972753641906424e-09,
"loss": 0.13296045064926149,
"memory(GiB)": 31.97,
"step": 2420,
"token_acc": 0.9545391609359856,
"train_speed(iter/s)": 0.123089
},
{
"epoch": 2.9755048759886353,
"eval_loss": 0.27353137731552124,
"eval_runtime": 29.6147,
"eval_samples_per_second": 17.761,
"eval_steps_per_second": 4.457,
"eval_token_acc": 0.9184320507277706,
"step": 2420
},
{
"epoch": 2.9816478537971283,
"grad_norm": 0.7684817910194397,
"learning_rate": 8.12943673943467e-10,
"loss": 0.1473867416381836,
"memory(GiB)": 31.97,
"step": 2425,
"token_acc": 0.9241485786940332,
"train_speed(iter/s)": 0.122905
},
{
"epoch": 2.9877908316056208,
"grad_norm": 0.7693585157394409,
"learning_rate": 3.359667580682402e-10,
"loss": 0.14533259868621826,
"memory(GiB)": 31.97,
"step": 2430,
"token_acc": 0.9479254868755292,
"train_speed(iter/s)": 0.122963
},
{
"epoch": 2.9939338094141137,
"grad_norm": 0.8009675145149231,
"learning_rate": 6.636440046892123e-11,
"loss": 0.12457112073898316,
"memory(GiB)": 31.97,
"step": 2435,
"token_acc": 0.9580943014806316,
"train_speed(iter/s)": 0.123019
},
{
"epoch": 2.9988481916609078,
"eval_loss": 0.2735154330730438,
"eval_runtime": 29.6742,
"eval_samples_per_second": 17.726,
"eval_steps_per_second": 4.448,
"eval_token_acc": 0.9182735264447327,
"step": 2439
}
],
"logging_steps": 5,
"max_steps": 2439,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.2467967701619835e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}