Files
qwen2.5vl-3b-32b-longest-25767/trainer_state.json
ModelHub XC 388eaee2b1 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-32b-longest-25767
Source: Original Platform
2026-05-20 13:36:35 +08:00

2965 lines
84 KiB
JSON

{
"best_global_step": 780,
"best_metric": 0.22540703,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v49-20250505-211427/checkpoint-780",
"epoch": 2.9984321103794294,
"eval_steps": 20,
"global_step": 1194,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002508623392913139,
"grad_norm": 2.4406964778900146,
"learning_rate": 9.999982692639099e-06,
"loss": 0.39261603355407715,
"memory(GiB)": 27.73,
"step": 1,
"token_acc": 0.8706896551724138,
"train_speed(iter/s)": 0.073021
},
{
"epoch": 0.012543116964565695,
"grad_norm": 1.4140573740005493,
"learning_rate": 9.999567321968297e-06,
"loss": 0.32130101323127747,
"memory(GiB)": 27.73,
"step": 5,
"token_acc": 0.8974483833268406,
"train_speed(iter/s)": 0.150893
},
{
"epoch": 0.02508623392913139,
"grad_norm": 0.9324946403503418,
"learning_rate": 9.998269362757298e-06,
"loss": 0.2845744609832764,
"memory(GiB)": 27.73,
"step": 10,
"token_acc": 0.9072619069023176,
"train_speed(iter/s)": 0.174486
},
{
"epoch": 0.03762935089369708,
"grad_norm": 0.8129357695579529,
"learning_rate": 9.996106347006378e-06,
"loss": 0.27310004234313967,
"memory(GiB)": 27.73,
"step": 15,
"token_acc": 0.9068501494128172,
"train_speed(iter/s)": 0.182334
},
{
"epoch": 0.05017246785826278,
"grad_norm": 0.8004717826843262,
"learning_rate": 9.993078649071297e-06,
"loss": 0.293326735496521,
"memory(GiB)": 27.73,
"step": 20,
"token_acc": 0.9049762744859472,
"train_speed(iter/s)": 0.189416
},
{
"epoch": 0.05017246785826278,
"eval_loss": 0.2858642339706421,
"eval_runtime": 10.0812,
"eval_samples_per_second": 25.493,
"eval_steps_per_second": 6.448,
"eval_token_acc": 0.9193595415242308,
"step": 20
},
{
"epoch": 0.06271558482282848,
"grad_norm": 0.7604618668556213,
"learning_rate": 9.989186792959408e-06,
"loss": 0.26241092681884765,
"memory(GiB)": 27.73,
"step": 25,
"token_acc": 0.9132264916006951,
"train_speed(iter/s)": 0.162946
},
{
"epoch": 0.07525870178739416,
"grad_norm": 0.6763395071029663,
"learning_rate": 9.984431452238968e-06,
"loss": 0.2643896102905273,
"memory(GiB)": 30.07,
"step": 30,
"token_acc": 0.9132476909865227,
"train_speed(iter/s)": 0.168084
},
{
"epoch": 0.08780181875195986,
"grad_norm": 0.7725923657417297,
"learning_rate": 9.97881344992256e-06,
"loss": 0.24953582286834716,
"memory(GiB)": 30.07,
"step": 35,
"token_acc": 0.9186898409484755,
"train_speed(iter/s)": 0.173361
},
{
"epoch": 0.10034493571652556,
"grad_norm": 0.6925674676895142,
"learning_rate": 9.97233375832466e-06,
"loss": 0.25593545436859133,
"memory(GiB)": 30.07,
"step": 40,
"token_acc": 0.9142301666698126,
"train_speed(iter/s)": 0.17571
},
{
"epoch": 0.10034493571652556,
"eval_loss": 0.27316194772720337,
"eval_runtime": 9.946,
"eval_samples_per_second": 25.84,
"eval_steps_per_second": 6.535,
"eval_token_acc": 0.9228584355519807,
"step": 40
},
{
"epoch": 0.11288805268109126,
"grad_norm": 0.6484134197235107,
"learning_rate": 9.964993498893349e-06,
"loss": 0.2584169626235962,
"memory(GiB)": 30.07,
"step": 45,
"token_acc": 0.9155336997396587,
"train_speed(iter/s)": 0.163839
},
{
"epoch": 0.12543116964565695,
"grad_norm": 0.6846535205841064,
"learning_rate": 9.95679394201623e-06,
"loss": 0.24747424125671386,
"memory(GiB)": 30.07,
"step": 50,
"token_acc": 0.9217108554277138,
"train_speed(iter/s)": 0.167611
},
{
"epoch": 0.13797428661022265,
"grad_norm": 0.6725199222564697,
"learning_rate": 9.947736506800554e-06,
"loss": 0.2658252716064453,
"memory(GiB)": 30.07,
"step": 55,
"token_acc": 0.9149937868631535,
"train_speed(iter/s)": 0.170799
},
{
"epoch": 0.15051740357478832,
"grad_norm": 0.6383991241455078,
"learning_rate": 9.93782276082762e-06,
"loss": 0.23914179801940919,
"memory(GiB)": 30.07,
"step": 60,
"token_acc": 0.9193507567193643,
"train_speed(iter/s)": 0.173334
},
{
"epoch": 0.15051740357478832,
"eval_loss": 0.26351168751716614,
"eval_runtime": 9.9368,
"eval_samples_per_second": 25.863,
"eval_steps_per_second": 6.541,
"eval_token_acc": 0.924492258194249,
"step": 60
},
{
"epoch": 0.16306052053935402,
"grad_norm": 0.6977497935295105,
"learning_rate": 9.927054419881462e-06,
"loss": 0.2521164894104004,
"memory(GiB)": 30.07,
"step": 65,
"token_acc": 0.9175230463277722,
"train_speed(iter/s)": 0.165312
},
{
"epoch": 0.17560363750391972,
"grad_norm": 0.6636829972267151,
"learning_rate": 9.915433347651909e-06,
"loss": 0.2421865940093994,
"memory(GiB)": 30.07,
"step": 70,
"token_acc": 0.9232868615930188,
"train_speed(iter/s)": 0.168227
},
{
"epoch": 0.18814675446848542,
"grad_norm": 0.715813398361206,
"learning_rate": 9.90296155541202e-06,
"loss": 0.2508160352706909,
"memory(GiB)": 30.07,
"step": 75,
"token_acc": 0.921933587812046,
"train_speed(iter/s)": 0.169862
},
{
"epoch": 0.20068987143305111,
"grad_norm": 0.762688934803009,
"learning_rate": 9.88964120167001e-06,
"loss": 0.2518954277038574,
"memory(GiB)": 30.07,
"step": 80,
"token_acc": 0.9140404864303723,
"train_speed(iter/s)": 0.171765
},
{
"epoch": 0.20068987143305111,
"eval_loss": 0.26037997007369995,
"eval_runtime": 9.9473,
"eval_samples_per_second": 25.836,
"eval_steps_per_second": 6.534,
"eval_token_acc": 0.9254122260205108,
"step": 80
},
{
"epoch": 0.2132329883976168,
"grad_norm": 0.7675485610961914,
"learning_rate": 9.875474591795648e-06,
"loss": 0.24610612392425538,
"memory(GiB)": 30.07,
"step": 85,
"token_acc": 0.9191244147424867,
"train_speed(iter/s)": 0.165845
},
{
"epoch": 0.2257761053621825,
"grad_norm": 0.6830568909645081,
"learning_rate": 9.860464177621286e-06,
"loss": 0.24553425312042237,
"memory(GiB)": 30.07,
"step": 90,
"token_acc": 0.9210544741632825,
"train_speed(iter/s)": 0.167772
},
{
"epoch": 0.2383192223267482,
"grad_norm": 0.6381848454475403,
"learning_rate": 9.84461255701751e-06,
"loss": 0.24247684478759765,
"memory(GiB)": 30.07,
"step": 95,
"token_acc": 0.9123848317331006,
"train_speed(iter/s)": 0.169012
},
{
"epoch": 0.2508623392913139,
"grad_norm": 0.6407138109207153,
"learning_rate": 9.827922473443518e-06,
"loss": 0.2540575504302979,
"memory(GiB)": 30.07,
"step": 100,
"token_acc": 0.9123342939481268,
"train_speed(iter/s)": 0.170603
},
{
"epoch": 0.2508623392913139,
"eval_loss": 0.255385160446167,
"eval_runtime": 9.9766,
"eval_samples_per_second": 25.76,
"eval_steps_per_second": 6.515,
"eval_token_acc": 0.9266941484013674,
"step": 100
},
{
"epoch": 0.2634054562558796,
"grad_norm": 0.6974446177482605,
"learning_rate": 9.810396815472316e-06,
"loss": 0.2443918228149414,
"memory(GiB)": 30.07,
"step": 105,
"token_acc": 0.9175226016453799,
"train_speed(iter/s)": 0.166259
},
{
"epoch": 0.2759485732204453,
"grad_norm": 0.6513930559158325,
"learning_rate": 9.79203861629078e-06,
"loss": 0.25510406494140625,
"memory(GiB)": 30.07,
"step": 110,
"token_acc": 0.9140450964411845,
"train_speed(iter/s)": 0.168031
},
{
"epoch": 0.288491690185011,
"grad_norm": 0.7416351437568665,
"learning_rate": 9.772851053174708e-06,
"loss": 0.24527263641357422,
"memory(GiB)": 30.07,
"step": 115,
"token_acc": 0.9193030038453679,
"train_speed(iter/s)": 0.169489
},
{
"epoch": 0.30103480714957664,
"grad_norm": 0.7076767086982727,
"learning_rate": 9.752837446938915e-06,
"loss": 0.24852099418640136,
"memory(GiB)": 30.07,
"step": 120,
"token_acc": 0.9153006496449615,
"train_speed(iter/s)": 0.170153
},
{
"epoch": 0.30103480714957664,
"eval_loss": 0.252126008272171,
"eval_runtime": 9.9461,
"eval_samples_per_second": 25.839,
"eval_steps_per_second": 6.535,
"eval_token_acc": 0.927654333400362,
"step": 120
},
{
"epoch": 0.31357792411414237,
"grad_norm": 0.6797928214073181,
"learning_rate": 9.732001261362503e-06,
"loss": 0.23887033462524415,
"memory(GiB)": 30.07,
"step": 125,
"token_acc": 0.9209902737430722,
"train_speed(iter/s)": 0.166589
},
{
"epoch": 0.32612104107870804,
"grad_norm": 0.7788575291633606,
"learning_rate": 9.710346102589376e-06,
"loss": 0.2535351276397705,
"memory(GiB)": 30.07,
"step": 130,
"token_acc": 0.9120982792920281,
"train_speed(iter/s)": 0.167787
},
{
"epoch": 0.33866415804327377,
"grad_norm": 0.6328480243682861,
"learning_rate": 9.687875718504126e-06,
"loss": 0.2357191562652588,
"memory(GiB)": 30.07,
"step": 135,
"token_acc": 0.9174817518248175,
"train_speed(iter/s)": 0.168973
},
{
"epoch": 0.35120727500783944,
"grad_norm": 0.6453744769096375,
"learning_rate": 9.664593998083374e-06,
"loss": 0.24335532188415526,
"memory(GiB)": 30.07,
"step": 140,
"token_acc": 0.9164061768834815,
"train_speed(iter/s)": 0.169985
},
{
"epoch": 0.35120727500783944,
"eval_loss": 0.24922603368759155,
"eval_runtime": 9.9665,
"eval_samples_per_second": 25.786,
"eval_steps_per_second": 6.522,
"eval_token_acc": 0.9285793283732153,
"step": 140
},
{
"epoch": 0.36375039197240516,
"grad_norm": 0.655521810054779,
"learning_rate": 9.640504970722708e-06,
"loss": 0.23469161987304688,
"memory(GiB)": 30.07,
"step": 145,
"token_acc": 0.9239321638563585,
"train_speed(iter/s)": 0.166594
},
{
"epoch": 0.37629350893697083,
"grad_norm": 0.656192421913147,
"learning_rate": 9.615612805539305e-06,
"loss": 0.23534941673278809,
"memory(GiB)": 30.07,
"step": 150,
"token_acc": 0.9101899504070365,
"train_speed(iter/s)": 0.16793
},
{
"epoch": 0.38883662590153656,
"grad_norm": 0.6794695258140564,
"learning_rate": 9.589921810650379e-06,
"loss": 0.24691348075866698,
"memory(GiB)": 30.07,
"step": 155,
"token_acc": 0.9117996509857795,
"train_speed(iter/s)": 0.168976
},
{
"epoch": 0.40137974286610223,
"grad_norm": 0.6336193680763245,
"learning_rate": 9.563436432427571e-06,
"loss": 0.23817820549011232,
"memory(GiB)": 30.07,
"step": 160,
"token_acc": 0.9163882846488502,
"train_speed(iter/s)": 0.170021
},
{
"epoch": 0.40137974286610223,
"eval_loss": 0.24645844101905823,
"eval_runtime": 9.937,
"eval_samples_per_second": 25.863,
"eval_steps_per_second": 6.541,
"eval_token_acc": 0.9283782425095516,
"step": 160
},
{
"epoch": 0.4139228598306679,
"grad_norm": 0.6395715475082397,
"learning_rate": 9.536161254727407e-06,
"loss": 0.23914387226104736,
"memory(GiB)": 30.07,
"step": 165,
"token_acc": 0.9195228213727245,
"train_speed(iter/s)": 0.167509
},
{
"epoch": 0.4264659767952336,
"grad_norm": 0.6767882704734802,
"learning_rate": 9.508100998097971e-06,
"loss": 0.2324080467224121,
"memory(GiB)": 30.07,
"step": 170,
"token_acc": 0.9136844562004045,
"train_speed(iter/s)": 0.168597
},
{
"epoch": 0.4390090937597993,
"grad_norm": 0.6208192706108093,
"learning_rate": 9.479260518961904e-06,
"loss": 0.23578665256500245,
"memory(GiB)": 30.07,
"step": 175,
"token_acc": 0.9154970653640042,
"train_speed(iter/s)": 0.169348
},
{
"epoch": 0.451552210724365,
"grad_norm": 0.6273934841156006,
"learning_rate": 9.449644808775902e-06,
"loss": 0.23413596153259278,
"memory(GiB)": 30.07,
"step": 180,
"token_acc": 0.9301044600520517,
"train_speed(iter/s)": 0.170195
},
{
"epoch": 0.451552210724365,
"eval_loss": 0.2454940527677536,
"eval_runtime": 9.965,
"eval_samples_per_second": 25.79,
"eval_steps_per_second": 6.523,
"eval_token_acc": 0.9287401970641463,
"step": 180
},
{
"epoch": 0.4640953276889307,
"grad_norm": 0.6492425203323364,
"learning_rate": 9.419258993166846e-06,
"loss": 0.2424703598022461,
"memory(GiB)": 30.07,
"step": 185,
"token_acc": 0.9212295968281757,
"train_speed(iter/s)": 0.167631
},
{
"epoch": 0.4766384446534964,
"grad_norm": 0.6422290205955505,
"learning_rate": 9.388108331044687e-06,
"loss": 0.23424482345581055,
"memory(GiB)": 30.07,
"step": 190,
"token_acc": 0.9254705767559033,
"train_speed(iter/s)": 0.168154
},
{
"epoch": 0.4891815616180621,
"grad_norm": 0.6608842015266418,
"learning_rate": 9.356198213692297e-06,
"loss": 0.23567054271697999,
"memory(GiB)": 30.07,
"step": 195,
"token_acc": 0.9198833160816787,
"train_speed(iter/s)": 0.168995
},
{
"epoch": 0.5017246785826278,
"grad_norm": 0.675005316734314,
"learning_rate": 9.323534163832387e-06,
"loss": 0.24276134967803956,
"memory(GiB)": 30.07,
"step": 200,
"token_acc": 0.913303071968056,
"train_speed(iter/s)": 0.170071
},
{
"epoch": 0.5017246785826278,
"eval_loss": 0.24412870407104492,
"eval_runtime": 9.9709,
"eval_samples_per_second": 25.775,
"eval_steps_per_second": 6.519,
"eval_token_acc": 0.9292730746028555,
"step": 200
},
{
"epoch": 0.5142677955471935,
"grad_norm": 0.6404038071632385,
"learning_rate": 9.290121834671669e-06,
"loss": 0.23353495597839355,
"memory(GiB)": 30.07,
"step": 205,
"token_acc": 0.9230231647999404,
"train_speed(iter/s)": 0.167939
},
{
"epoch": 0.5268109125117592,
"grad_norm": 0.6592434048652649,
"learning_rate": 9.255967008922475e-06,
"loss": 0.21883893013000488,
"memory(GiB)": 30.07,
"step": 210,
"token_acc": 0.9253728456196685,
"train_speed(iter/s)": 0.168578
},
{
"epoch": 0.5393540294763248,
"grad_norm": 0.656053900718689,
"learning_rate": 9.221075597801912e-06,
"loss": 0.2320107936859131,
"memory(GiB)": 30.07,
"step": 215,
"token_acc": 0.9213286713286714,
"train_speed(iter/s)": 0.169087
},
{
"epoch": 0.5518971464408906,
"grad_norm": 0.64938884973526,
"learning_rate": 9.18545364000882e-06,
"loss": 0.23771371841430664,
"memory(GiB)": 30.07,
"step": 220,
"token_acc": 0.9130793725675198,
"train_speed(iter/s)": 0.169997
},
{
"epoch": 0.5518971464408906,
"eval_loss": 0.24264991283416748,
"eval_runtime": 9.9448,
"eval_samples_per_second": 25.843,
"eval_steps_per_second": 6.536,
"eval_token_acc": 0.929810979288156,
"step": 220
},
{
"epoch": 0.5644402634054563,
"grad_norm": 0.6256290674209595,
"learning_rate": 9.14910730067863e-06,
"loss": 0.23107373714447021,
"memory(GiB)": 30.07,
"step": 225,
"token_acc": 0.9247168944864042,
"train_speed(iter/s)": 0.168098
},
{
"epoch": 0.576983380370022,
"grad_norm": 0.5621991753578186,
"learning_rate": 9.112042870316365e-06,
"loss": 0.21704797744750975,
"memory(GiB)": 30.07,
"step": 230,
"token_acc": 0.9269190993704302,
"train_speed(iter/s)": 0.168845
},
{
"epoch": 0.5895264973345876,
"grad_norm": 0.6810638308525085,
"learning_rate": 9.074266763707937e-06,
"loss": 0.2278088092803955,
"memory(GiB)": 30.07,
"step": 235,
"token_acc": 0.9134312189271274,
"train_speed(iter/s)": 0.169551
},
{
"epoch": 0.6020696142991533,
"grad_norm": 0.5773187279701233,
"learning_rate": 9.035785518809928e-06,
"loss": 0.21931402683258056,
"memory(GiB)": 30.07,
"step": 240,
"token_acc": 0.9246131941148552,
"train_speed(iter/s)": 0.170025
},
{
"epoch": 0.6020696142991533,
"eval_loss": 0.24013011157512665,
"eval_runtime": 9.9847,
"eval_samples_per_second": 25.739,
"eval_steps_per_second": 6.51,
"eval_token_acc": 0.9308616529257994,
"step": 240
},
{
"epoch": 0.6146127312637191,
"grad_norm": 0.6531880497932434,
"learning_rate": 8.996605795618054e-06,
"loss": 0.24535005092620848,
"memory(GiB)": 30.07,
"step": 245,
"token_acc": 0.915537267670059,
"train_speed(iter/s)": 0.168114
},
{
"epoch": 0.6271558482282847,
"grad_norm": 0.6779309511184692,
"learning_rate": 8.956734375014525e-06,
"loss": 0.23181967735290526,
"memory(GiB)": 30.07,
"step": 250,
"token_acc": 0.9155568096313017,
"train_speed(iter/s)": 0.168895
},
{
"epoch": 0.6396989651928504,
"grad_norm": 0.6643815040588379,
"learning_rate": 8.916178157594453e-06,
"loss": 0.23725414276123047,
"memory(GiB)": 30.07,
"step": 255,
"token_acc": 0.9135585175809833,
"train_speed(iter/s)": 0.169436
},
{
"epoch": 0.6522420821574161,
"grad_norm": 0.5990136861801147,
"learning_rate": 8.87494416247157e-06,
"loss": 0.22699012756347656,
"memory(GiB)": 30.07,
"step": 260,
"token_acc": 0.924126221001221,
"train_speed(iter/s)": 0.170075
},
{
"epoch": 0.6522420821574161,
"eval_loss": 0.23895950615406036,
"eval_runtime": 9.971,
"eval_samples_per_second": 25.775,
"eval_steps_per_second": 6.519,
"eval_token_acc": 0.9309320329780817,
"step": 260
},
{
"epoch": 0.6647851991219819,
"grad_norm": 0.647604763507843,
"learning_rate": 8.833039526063414e-06,
"loss": 0.22294692993164061,
"memory(GiB)": 30.07,
"step": 265,
"token_acc": 0.923023885742428,
"train_speed(iter/s)": 0.168429
},
{
"epoch": 0.6773283160865475,
"grad_norm": 0.7019293904304504,
"learning_rate": 8.790471500856229e-06,
"loss": 0.22493109703063965,
"memory(GiB)": 30.07,
"step": 270,
"token_acc": 0.9217673989150925,
"train_speed(iter/s)": 0.168763
},
{
"epoch": 0.6898714330511132,
"grad_norm": 0.709414541721344,
"learning_rate": 8.747247454149754e-06,
"loss": 0.23138487339019775,
"memory(GiB)": 30.07,
"step": 275,
"token_acc": 0.9201797011093793,
"train_speed(iter/s)": 0.169186
},
{
"epoch": 0.7024145500156789,
"grad_norm": 0.5952314138412476,
"learning_rate": 8.703374866782172e-06,
"loss": 0.2214064598083496,
"memory(GiB)": 30.07,
"step": 280,
"token_acc": 0.9243394229601485,
"train_speed(iter/s)": 0.169579
},
{
"epoch": 0.7024145500156789,
"eval_loss": 0.2380242794752121,
"eval_runtime": 9.9399,
"eval_samples_per_second": 25.855,
"eval_steps_per_second": 6.539,
"eval_token_acc": 0.9309119243917152,
"step": 280
},
{
"epoch": 0.7149576669802445,
"grad_norm": 0.6550840735435486,
"learning_rate": 8.658861331835384e-06,
"loss": 0.22828481197357178,
"memory(GiB)": 30.07,
"step": 285,
"token_acc": 0.9233320082850388,
"train_speed(iter/s)": 0.16832
},
{
"epoch": 0.7275007839448103,
"grad_norm": 0.6396917700767517,
"learning_rate": 8.613714553320863e-06,
"loss": 0.22759134769439698,
"memory(GiB)": 30.07,
"step": 290,
"token_acc": 0.9214041461850823,
"train_speed(iter/s)": 0.168862
},
{
"epoch": 0.740043900909376,
"grad_norm": 0.6359645128250122,
"learning_rate": 8.567942344846311e-06,
"loss": 0.2300776481628418,
"memory(GiB)": 30.07,
"step": 295,
"token_acc": 0.9242182277352745,
"train_speed(iter/s)": 0.169333
},
{
"epoch": 0.7525870178739417,
"grad_norm": 0.6625407338142395,
"learning_rate": 8.521552628263362e-06,
"loss": 0.23114292621612548,
"memory(GiB)": 30.07,
"step": 300,
"token_acc": 0.9189751431314676,
"train_speed(iter/s)": 0.169808
},
{
"epoch": 0.7525870178739417,
"eval_loss": 0.2363082766532898,
"eval_runtime": 9.9749,
"eval_samples_per_second": 25.765,
"eval_steps_per_second": 6.516,
"eval_token_acc": 0.9315302634224814,
"step": 300
},
{
"epoch": 0.7651301348385073,
"grad_norm": 0.6202587485313416,
"learning_rate": 8.474553432296517e-06,
"loss": 0.224021315574646,
"memory(GiB)": 30.07,
"step": 305,
"token_acc": 0.9276043893401739,
"train_speed(iter/s)": 0.168359
},
{
"epoch": 0.7776732518030731,
"grad_norm": 0.6691203713417053,
"learning_rate": 8.426952891153617e-06,
"loss": 0.23445448875427247,
"memory(GiB)": 30.07,
"step": 310,
"token_acc": 0.923713052741816,
"train_speed(iter/s)": 0.168781
},
{
"epoch": 0.7902163687676388,
"grad_norm": 0.6087518334388733,
"learning_rate": 8.378759243118044e-06,
"loss": 0.22397913932800292,
"memory(GiB)": 30.07,
"step": 315,
"token_acc": 0.9215067830325622,
"train_speed(iter/s)": 0.169206
},
{
"epoch": 0.8027594857322045,
"grad_norm": 0.6450164914131165,
"learning_rate": 8.329980829122907e-06,
"loss": 0.2312875509262085,
"memory(GiB)": 30.07,
"step": 320,
"token_acc": 0.9272111639559235,
"train_speed(iter/s)": 0.169675
},
{
"epoch": 0.8027594857322045,
"eval_loss": 0.23446869850158691,
"eval_runtime": 9.9713,
"eval_samples_per_second": 25.774,
"eval_steps_per_second": 6.519,
"eval_token_acc": 0.931907299416851,
"step": 320
},
{
"epoch": 0.8153026026967701,
"grad_norm": 0.6574311852455139,
"learning_rate": 8.280626091307466e-06,
"loss": 0.21696841716766357,
"memory(GiB)": 30.07,
"step": 325,
"token_acc": 0.9266242271024336,
"train_speed(iter/s)": 0.168386
},
{
"epoch": 0.8278457196613358,
"grad_norm": 0.6543525457382202,
"learning_rate": 8.23070357155605e-06,
"loss": 0.23127243518829346,
"memory(GiB)": 30.07,
"step": 330,
"token_acc": 0.9156461739292596,
"train_speed(iter/s)": 0.168765
},
{
"epoch": 0.8403888366259016,
"grad_norm": 0.6864754557609558,
"learning_rate": 8.18022191001969e-06,
"loss": 0.23221104145050048,
"memory(GiB)": 30.07,
"step": 335,
"token_acc": 0.9158222112374361,
"train_speed(iter/s)": 0.169317
},
{
"epoch": 0.8529319535904673,
"grad_norm": 0.6478604674339294,
"learning_rate": 8.129189843620766e-06,
"loss": 0.21692075729370117,
"memory(GiB)": 30.07,
"step": 340,
"token_acc": 0.92678130982976,
"train_speed(iter/s)": 0.169761
},
{
"epoch": 0.8529319535904673,
"eval_loss": 0.23404286801815033,
"eval_runtime": 9.9838,
"eval_samples_per_second": 25.742,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.9316659963804544,
"step": 340
},
{
"epoch": 0.8654750705550329,
"grad_norm": 0.6460584402084351,
"learning_rate": 8.077616204540897e-06,
"loss": 0.21975212097167968,
"memory(GiB)": 30.07,
"step": 345,
"token_acc": 0.926279602750191,
"train_speed(iter/s)": 0.168486
},
{
"epoch": 0.8780181875195986,
"grad_norm": 0.6245233416557312,
"learning_rate": 8.02550991869234e-06,
"loss": 0.2209392309188843,
"memory(GiB)": 30.07,
"step": 350,
"token_acc": 0.9232239957902122,
"train_speed(iter/s)": 0.168953
},
{
"epoch": 0.8905613044841643,
"grad_norm": 0.6148731708526611,
"learning_rate": 7.972880004173175e-06,
"loss": 0.22880539894104004,
"memory(GiB)": 30.07,
"step": 355,
"token_acc": 0.9230032848427968,
"train_speed(iter/s)": 0.169373
},
{
"epoch": 0.90310442144873,
"grad_norm": 0.5976830124855042,
"learning_rate": 7.919735569706533e-06,
"loss": 0.2258004665374756,
"memory(GiB)": 30.07,
"step": 360,
"token_acc": 0.9294882944307232,
"train_speed(iter/s)": 0.169737
},
{
"epoch": 0.90310442144873,
"eval_loss": 0.23216013610363007,
"eval_runtime": 9.9402,
"eval_samples_per_second": 25.855,
"eval_steps_per_second": 6.539,
"eval_token_acc": 0.9317363764327368,
"step": 360
},
{
"epoch": 0.9156475384132957,
"grad_norm": 0.6738146543502808,
"learning_rate": 7.86608581306413e-06,
"loss": 0.23008251190185547,
"memory(GiB)": 30.07,
"step": 365,
"token_acc": 0.9237889028684008,
"train_speed(iter/s)": 0.168422
},
{
"epoch": 0.9281906553778614,
"grad_norm": 0.5968795418739319,
"learning_rate": 7.811940019474414e-06,
"loss": 0.2311033248901367,
"memory(GiB)": 30.07,
"step": 370,
"token_acc": 0.9211925456821934,
"train_speed(iter/s)": 0.168654
},
{
"epoch": 0.940733772342427,
"grad_norm": 0.6431145668029785,
"learning_rate": 7.757307560015539e-06,
"loss": 0.21920247077941896,
"memory(GiB)": 30.07,
"step": 375,
"token_acc": 0.9226723579404703,
"train_speed(iter/s)": 0.168986
},
{
"epoch": 0.9532768893069928,
"grad_norm": 0.6081040501594543,
"learning_rate": 7.702197889993515e-06,
"loss": 0.23370542526245117,
"memory(GiB)": 30.07,
"step": 380,
"token_acc": 0.9235495603658321,
"train_speed(iter/s)": 0.169411
},
{
"epoch": 0.9532768893069928,
"eval_loss": 0.23192763328552246,
"eval_runtime": 9.9828,
"eval_samples_per_second": 25.744,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.9322843354112206,
"step": 380
},
{
"epoch": 0.9658200062715585,
"grad_norm": 0.6549849510192871,
"learning_rate": 7.646620547305765e-06,
"loss": 0.22933628559112548,
"memory(GiB)": 30.07,
"step": 385,
"token_acc": 0.9272846380609236,
"train_speed(iter/s)": 0.168384
},
{
"epoch": 0.9783631232361242,
"grad_norm": 0.5421332120895386,
"learning_rate": 7.590585150790388e-06,
"loss": 0.2162912368774414,
"memory(GiB)": 30.07,
"step": 390,
"token_acc": 0.9282690665907798,
"train_speed(iter/s)": 0.168866
},
{
"epoch": 0.9909062402006898,
"grad_norm": 0.5929440259933472,
"learning_rate": 7.5341013985614064e-06,
"loss": 0.22078533172607423,
"memory(GiB)": 30.07,
"step": 395,
"token_acc": 0.9202857714192223,
"train_speed(iter/s)": 0.169287
},
{
"epoch": 1.0050172467858263,
"grad_norm": 0.6037130951881409,
"learning_rate": 7.47717906633032e-06,
"loss": 0.253579306602478,
"memory(GiB)": 30.07,
"step": 400,
"token_acc": 0.9251826086956522,
"train_speed(iter/s)": 0.169497
},
{
"epoch": 1.0050172467858263,
"eval_loss": 0.23065434396266937,
"eval_runtime": 9.9745,
"eval_samples_per_second": 25.766,
"eval_steps_per_second": 6.517,
"eval_token_acc": 0.9330032173738186,
"step": 400
},
{
"epoch": 1.017560363750392,
"grad_norm": 0.6310690641403198,
"learning_rate": 7.419828005714195e-06,
"loss": 0.17385544776916503,
"memory(GiB)": 30.07,
"step": 405,
"token_acc": 0.9351318726588173,
"train_speed(iter/s)": 0.168293
},
{
"epoch": 1.0301034807149576,
"grad_norm": 0.6506795883178711,
"learning_rate": 7.362058142530639e-06,
"loss": 0.16791077852249145,
"memory(GiB)": 30.07,
"step": 410,
"token_acc": 0.9320981703907922,
"train_speed(iter/s)": 0.16871
},
{
"epoch": 1.0426465976795234,
"grad_norm": 0.5465015769004822,
"learning_rate": 7.303879475079931e-06,
"loss": 0.15868284702301025,
"memory(GiB)": 32.5,
"step": 415,
"token_acc": 0.942933207765865,
"train_speed(iter/s)": 0.168987
},
{
"epoch": 1.055189714644089,
"grad_norm": 0.6573348045349121,
"learning_rate": 7.245302072414602e-06,
"loss": 0.16553893089294433,
"memory(GiB)": 32.5,
"step": 420,
"token_acc": 0.9483149060876516,
"train_speed(iter/s)": 0.169449
},
{
"epoch": 1.055189714644089,
"eval_loss": 0.23652049899101257,
"eval_runtime": 9.9746,
"eval_samples_per_second": 25.765,
"eval_steps_per_second": 6.517,
"eval_token_acc": 0.9325306655942087,
"step": 420
},
{
"epoch": 1.0677328316086547,
"grad_norm": 0.6287339925765991,
"learning_rate": 7.1863360725967615e-06,
"loss": 0.15552424192428588,
"memory(GiB)": 32.5,
"step": 425,
"token_acc": 0.9385299885841407,
"train_speed(iter/s)": 0.168478
},
{
"epoch": 1.0802759485732205,
"grad_norm": 0.6632113456726074,
"learning_rate": 7.126991680943508e-06,
"loss": 0.17083898782730103,
"memory(GiB)": 32.5,
"step": 430,
"token_acc": 0.9388098703940791,
"train_speed(iter/s)": 0.168867
},
{
"epoch": 1.092819065537786,
"grad_norm": 0.6978201270103455,
"learning_rate": 7.067279168260671e-06,
"loss": 0.17238540649414064,
"memory(GiB)": 32.5,
"step": 435,
"token_acc": 0.941155504865096,
"train_speed(iter/s)": 0.169229
},
{
"epoch": 1.1053621825023519,
"grad_norm": 0.6574686765670776,
"learning_rate": 7.007208869065232e-06,
"loss": 0.164797842502594,
"memory(GiB)": 32.5,
"step": 440,
"token_acc": 0.9414418794608996,
"train_speed(iter/s)": 0.169579
},
{
"epoch": 1.1053621825023519,
"eval_loss": 0.23527124524116516,
"eval_runtime": 9.9855,
"eval_samples_per_second": 25.737,
"eval_steps_per_second": 6.509,
"eval_token_acc": 0.932812185803338,
"step": 440
},
{
"epoch": 1.1179052994669174,
"grad_norm": 0.6073827743530273,
"learning_rate": 6.946791179796718e-06,
"loss": 0.16431469917297364,
"memory(GiB)": 32.5,
"step": 445,
"token_acc": 0.936977573407634,
"train_speed(iter/s)": 0.168619
},
{
"epoch": 1.1304484164314832,
"grad_norm": 0.6841042041778564,
"learning_rate": 6.886036557017881e-06,
"loss": 0.1685694932937622,
"memory(GiB)": 32.5,
"step": 450,
"token_acc": 0.9406548805236119,
"train_speed(iter/s)": 0.168995
},
{
"epoch": 1.142991533396049,
"grad_norm": 0.6076449751853943,
"learning_rate": 6.824955515604957e-06,
"loss": 0.15892113447189332,
"memory(GiB)": 32.5,
"step": 455,
"token_acc": 0.9472979086195497,
"train_speed(iter/s)": 0.16932
},
{
"epoch": 1.1555346503606145,
"grad_norm": 0.6569898128509521,
"learning_rate": 6.76355862692786e-06,
"loss": 0.1675378680229187,
"memory(GiB)": 32.5,
"step": 460,
"token_acc": 0.9412598483175024,
"train_speed(iter/s)": 0.169705
},
{
"epoch": 1.1555346503606145,
"eval_loss": 0.23534773290157318,
"eval_runtime": 9.9809,
"eval_samples_per_second": 25.749,
"eval_steps_per_second": 6.512,
"eval_token_acc": 0.9333752262215966,
"step": 460
},
{
"epoch": 1.1680777673251803,
"grad_norm": 0.6509740948677063,
"learning_rate": 6.701856517020565e-06,
"loss": 0.17353179454803466,
"memory(GiB)": 32.5,
"step": 465,
"token_acc": 0.9360858431432503,
"train_speed(iter/s)": 0.168849
},
{
"epoch": 1.180620884289746,
"grad_norm": 0.6769475340843201,
"learning_rate": 6.639859864742058e-06,
"loss": 0.16680521965026857,
"memory(GiB)": 32.5,
"step": 470,
"token_acc": 0.945840546350372,
"train_speed(iter/s)": 0.1692
},
{
"epoch": 1.1931640012543117,
"grad_norm": 0.6421222686767578,
"learning_rate": 6.5775793999281345e-06,
"loss": 0.1688302278518677,
"memory(GiB)": 34.95,
"step": 475,
"token_acc": 0.9395874540830743,
"train_speed(iter/s)": 0.16954
},
{
"epoch": 1.2057071182188774,
"grad_norm": 0.6938676834106445,
"learning_rate": 6.515025901534364e-06,
"loss": 0.1676286816596985,
"memory(GiB)": 34.95,
"step": 480,
"token_acc": 0.9379853728417402,
"train_speed(iter/s)": 0.169878
},
{
"epoch": 1.2057071182188774,
"eval_loss": 0.2359960675239563,
"eval_runtime": 9.9789,
"eval_samples_per_second": 25.754,
"eval_steps_per_second": 6.514,
"eval_token_acc": 0.9330987331590589,
"step": 480
},
{
"epoch": 1.218250235183443,
"grad_norm": 0.65184086561203,
"learning_rate": 6.452210195770571e-06,
"loss": 0.1703261137008667,
"memory(GiB)": 34.95,
"step": 485,
"token_acc": 0.9328010713917791,
"train_speed(iter/s)": 0.169027
},
{
"epoch": 1.2307933521480088,
"grad_norm": 0.6309488415718079,
"learning_rate": 6.389143154227128e-06,
"loss": 0.17036676406860352,
"memory(GiB)": 34.95,
"step": 490,
"token_acc": 0.9409641272467474,
"train_speed(iter/s)": 0.169349
},
{
"epoch": 1.2433364691125746,
"grad_norm": 0.6250006556510925,
"learning_rate": 6.325835691993394e-06,
"loss": 0.17421271800994872,
"memory(GiB)": 34.95,
"step": 495,
"token_acc": 0.9379951431187548,
"train_speed(iter/s)": 0.16968
},
{
"epoch": 1.2558795860771401,
"grad_norm": 0.6726417541503906,
"learning_rate": 6.2622987657686305e-06,
"loss": 0.17579824924468995,
"memory(GiB)": 34.95,
"step": 500,
"token_acc": 0.9332128799122411,
"train_speed(iter/s)": 0.169898
},
{
"epoch": 1.2558795860771401,
"eval_loss": 0.2346230149269104,
"eval_runtime": 9.958,
"eval_samples_per_second": 25.809,
"eval_steps_per_second": 6.527,
"eval_token_acc": 0.9335712849386688,
"step": 500
},
{
"epoch": 1.268422703041706,
"grad_norm": 0.6843208074569702,
"learning_rate": 6.198543371965711e-06,
"loss": 0.16942257881164552,
"memory(GiB)": 34.95,
"step": 505,
"token_acc": 0.9333019066627884,
"train_speed(iter/s)": 0.168992
},
{
"epoch": 1.2809658200062715,
"grad_norm": 0.6464666128158569,
"learning_rate": 6.134580544807951e-06,
"loss": 0.16836194992065429,
"memory(GiB)": 34.95,
"step": 510,
"token_acc": 0.9348090386953423,
"train_speed(iter/s)": 0.169263
},
{
"epoch": 1.2935089369708372,
"grad_norm": 0.636613667011261,
"learning_rate": 6.070421354419418e-06,
"loss": 0.17016284465789794,
"memory(GiB)": 34.95,
"step": 515,
"token_acc": 0.9389155662264906,
"train_speed(iter/s)": 0.169454
},
{
"epoch": 1.306052053935403,
"grad_norm": 0.6690927743911743,
"learning_rate": 6.006076904908996e-06,
"loss": 0.1702873706817627,
"memory(GiB)": 34.95,
"step": 520,
"token_acc": 0.9357901608213163,
"train_speed(iter/s)": 0.16974
},
{
"epoch": 1.306052053935403,
"eval_loss": 0.23393017053604126,
"eval_runtime": 9.9533,
"eval_samples_per_second": 25.82,
"eval_steps_per_second": 6.53,
"eval_token_acc": 0.9333802533681882,
"step": 520
},
{
"epoch": 1.3185951708999686,
"grad_norm": 0.5869135856628418,
"learning_rate": 5.9415583324485895e-06,
"loss": 0.15791409015655516,
"memory(GiB)": 34.95,
"step": 525,
"token_acc": 0.9386744758379915,
"train_speed(iter/s)": 0.168968
},
{
"epoch": 1.3311382878645344,
"grad_norm": 0.644124448299408,
"learning_rate": 5.876876803345777e-06,
"loss": 0.16019464731216432,
"memory(GiB)": 34.95,
"step": 530,
"token_acc": 0.9399966226992138,
"train_speed(iter/s)": 0.169268
},
{
"epoch": 1.3436814048291001,
"grad_norm": 0.6242665648460388,
"learning_rate": 5.812043512111237e-06,
"loss": 0.16639323234558107,
"memory(GiB)": 34.95,
"step": 535,
"token_acc": 0.9414278117034347,
"train_speed(iter/s)": 0.169643
},
{
"epoch": 1.3562245217936657,
"grad_norm": 0.6533536314964294,
"learning_rate": 5.747069679521306e-06,
"loss": 0.16934127807617189,
"memory(GiB)": 34.95,
"step": 540,
"token_acc": 0.9394062777613266,
"train_speed(iter/s)": 0.169883
},
{
"epoch": 1.3562245217936657,
"eval_loss": 0.23251816630363464,
"eval_runtime": 9.9789,
"eval_samples_per_second": 25.754,
"eval_steps_per_second": 6.514,
"eval_token_acc": 0.9339081037603056,
"step": 540
},
{
"epoch": 1.3687676387582315,
"grad_norm": 0.6235964894294739,
"learning_rate": 5.681966550675981e-06,
"loss": 0.15981945991516114,
"memory(GiB)": 34.95,
"step": 545,
"token_acc": 0.9390275276562902,
"train_speed(iter/s)": 0.169143
},
{
"epoch": 1.381310755722797,
"grad_norm": 0.6405583620071411,
"learning_rate": 5.616745393052725e-06,
"loss": 0.1589187502861023,
"memory(GiB)": 34.95,
"step": 550,
"token_acc": 0.9394911105629661,
"train_speed(iter/s)": 0.169459
},
{
"epoch": 1.3938538726873628,
"grad_norm": 0.6054794192314148,
"learning_rate": 5.551417494556376e-06,
"loss": 0.1589406132698059,
"memory(GiB)": 34.95,
"step": 555,
"token_acc": 0.9447133523511548,
"train_speed(iter/s)": 0.169755
},
{
"epoch": 1.4063969896519284,
"grad_norm": 0.6477380394935608,
"learning_rate": 5.4859941615655495e-06,
"loss": 0.16248714923858643,
"memory(GiB)": 34.95,
"step": 560,
"token_acc": 0.9376850526480911,
"train_speed(iter/s)": 0.170048
},
{
"epoch": 1.4063969896519284,
"eval_loss": 0.2313276082277298,
"eval_runtime": 9.9434,
"eval_samples_per_second": 25.846,
"eval_steps_per_second": 6.537,
"eval_token_acc": 0.934089081037603,
"step": 560
},
{
"epoch": 1.4189401066164942,
"grad_norm": 0.6257679462432861,
"learning_rate": 5.4204867169758265e-06,
"loss": 0.1701244592666626,
"memory(GiB)": 34.95,
"step": 565,
"token_acc": 0.9315096587690685,
"train_speed(iter/s)": 0.169354
},
{
"epoch": 1.43148322358106,
"grad_norm": 0.6648498773574829,
"learning_rate": 5.35490649824008e-06,
"loss": 0.16337137222290038,
"memory(GiB)": 34.95,
"step": 570,
"token_acc": 0.9441849071789757,
"train_speed(iter/s)": 0.169609
},
{
"epoch": 1.4440263405456255,
"grad_norm": 0.6349947452545166,
"learning_rate": 5.289264855406295e-06,
"loss": 0.1652446985244751,
"memory(GiB)": 34.95,
"step": 575,
"token_acc": 0.9389820592823713,
"train_speed(iter/s)": 0.169876
},
{
"epoch": 1.4565694575101913,
"grad_norm": 0.6542990803718567,
"learning_rate": 5.223573149153197e-06,
"loss": 0.17841705083847045,
"memory(GiB)": 34.95,
"step": 580,
"token_acc": 0.9361828435737608,
"train_speed(iter/s)": 0.170157
},
{
"epoch": 1.4565694575101913,
"eval_loss": 0.23112896084785461,
"eval_runtime": 9.9829,
"eval_samples_per_second": 25.744,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.9340991353307863,
"step": 580
},
{
"epoch": 1.469112574474757,
"grad_norm": 0.6102667450904846,
"learning_rate": 5.157842748824053e-06,
"loss": 0.16528806686401368,
"memory(GiB)": 34.95,
"step": 585,
"token_acc": 0.9365076170735143,
"train_speed(iter/s)": 0.169488
},
{
"epoch": 1.4816556914393226,
"grad_norm": 0.6301048994064331,
"learning_rate": 5.092085030458957e-06,
"loss": 0.16155061721801758,
"memory(GiB)": 34.95,
"step": 590,
"token_acc": 0.9446101777707996,
"train_speed(iter/s)": 0.169721
},
{
"epoch": 1.4941988084038884,
"grad_norm": 0.6364830732345581,
"learning_rate": 5.026311374825969e-06,
"loss": 0.16691150665283203,
"memory(GiB)": 34.95,
"step": 595,
"token_acc": 0.9467816983326871,
"train_speed(iter/s)": 0.169943
},
{
"epoch": 1.5067419253684542,
"grad_norm": 0.7235390543937683,
"learning_rate": 4.960533165451435e-06,
"loss": 0.16880112886428833,
"memory(GiB)": 34.95,
"step": 600,
"token_acc": 0.9358625682365141,
"train_speed(iter/s)": 0.170106
},
{
"epoch": 1.5067419253684542,
"eval_loss": 0.23094019293785095,
"eval_runtime": 9.9784,
"eval_samples_per_second": 25.756,
"eval_steps_per_second": 6.514,
"eval_token_acc": 0.9340790267444199,
"step": 600
},
{
"epoch": 1.5192850423330198,
"grad_norm": 0.6465805172920227,
"learning_rate": 4.894761786649815e-06,
"loss": 0.16632287502288817,
"memory(GiB)": 34.95,
"step": 605,
"token_acc": 0.9343266943374441,
"train_speed(iter/s)": 0.169351
},
{
"epoch": 1.5318281592975853,
"grad_norm": 0.6529747247695923,
"learning_rate": 4.829008621553401e-06,
"loss": 0.16232678890228272,
"memory(GiB)": 34.95,
"step": 610,
"token_acc": 0.9372756540724568,
"train_speed(iter/s)": 0.169563
},
{
"epoch": 1.544371276262151,
"grad_norm": 0.6062551736831665,
"learning_rate": 4.763285050142211e-06,
"loss": 0.1610184907913208,
"memory(GiB)": 34.95,
"step": 615,
"token_acc": 0.9430264444742746,
"train_speed(iter/s)": 0.169769
},
{
"epoch": 1.5569143932267169,
"grad_norm": 0.6321557760238647,
"learning_rate": 4.697602447274454e-06,
"loss": 0.16829713582992553,
"memory(GiB)": 34.95,
"step": 620,
"token_acc": 0.9414636993230099,
"train_speed(iter/s)": 0.169953
},
{
"epoch": 1.5569143932267169,
"eval_loss": 0.22995001077651978,
"eval_runtime": 9.9733,
"eval_samples_per_second": 25.769,
"eval_steps_per_second": 6.517,
"eval_token_acc": 0.9340689724512367,
"step": 620
},
{
"epoch": 1.5694575101912824,
"grad_norm": 0.6242859363555908,
"learning_rate": 4.631972180717859e-06,
"loss": 0.169819974899292,
"memory(GiB)": 34.95,
"step": 625,
"token_acc": 0.9353122957152196,
"train_speed(iter/s)": 0.169207
},
{
"epoch": 1.5820006271558482,
"grad_norm": 0.6639277935028076,
"learning_rate": 4.566405609182247e-06,
"loss": 0.17650117874145507,
"memory(GiB)": 34.95,
"step": 630,
"token_acc": 0.9426657289854536,
"train_speed(iter/s)": 0.169405
},
{
"epoch": 1.594543744120414,
"grad_norm": 0.5965340733528137,
"learning_rate": 4.500914080353666e-06,
"loss": 0.16074283123016359,
"memory(GiB)": 34.95,
"step": 635,
"token_acc": 0.9436321558637268,
"train_speed(iter/s)": 0.169633
},
{
"epoch": 1.6070868610849796,
"grad_norm": 0.6135890483856201,
"learning_rate": 4.435508928930431e-06,
"loss": 0.17277932167053223,
"memory(GiB)": 34.95,
"step": 640,
"token_acc": 0.9343614580678052,
"train_speed(iter/s)": 0.169805
},
{
"epoch": 1.6070868610849796,
"eval_loss": 0.2301892191171646,
"eval_runtime": 9.9309,
"eval_samples_per_second": 25.879,
"eval_steps_per_second": 6.545,
"eval_token_acc": 0.9346370400160868,
"step": 640
},
{
"epoch": 1.6196299780495453,
"grad_norm": 0.6135047674179077,
"learning_rate": 4.3702014746614135e-06,
"loss": 0.16275949478149415,
"memory(GiB)": 34.95,
"step": 645,
"token_acc": 0.9382904296349448,
"train_speed(iter/s)": 0.169112
},
{
"epoch": 1.6321730950141111,
"grad_norm": 0.6757416725158691,
"learning_rate": 4.305003020386922e-06,
"loss": 0.16928246021270751,
"memory(GiB)": 34.95,
"step": 650,
"token_acc": 0.9348640286598274,
"train_speed(iter/s)": 0.169328
},
{
"epoch": 1.6447162119786767,
"grad_norm": 0.6586278676986694,
"learning_rate": 4.239924850082501e-06,
"loss": 0.15818471908569337,
"memory(GiB)": 34.95,
"step": 655,
"token_acc": 0.9452125117275318,
"train_speed(iter/s)": 0.169486
},
{
"epoch": 1.6572593289432422,
"grad_norm": 0.7142418026924133,
"learning_rate": 4.1749782269060045e-06,
"loss": 0.1626511335372925,
"memory(GiB)": 34.95,
"step": 660,
"token_acc": 0.9346642123840067,
"train_speed(iter/s)": 0.169729
},
{
"epoch": 1.6572593289432422,
"eval_loss": 0.22976796329021454,
"eval_runtime": 9.9836,
"eval_samples_per_second": 25.742,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.9347878544138347,
"step": 660
},
{
"epoch": 1.6698024459078082,
"grad_norm": 0.6405680775642395,
"learning_rate": 4.110174391248268e-06,
"loss": 0.1630636215209961,
"memory(GiB)": 34.95,
"step": 665,
"token_acc": 0.9356640277041026,
"train_speed(iter/s)": 0.169025
},
{
"epoch": 1.6823455628723738,
"grad_norm": 0.6509523391723633,
"learning_rate": 4.045524558787712e-06,
"loss": 0.17556746006011964,
"memory(GiB)": 34.95,
"step": 670,
"token_acc": 0.9399725004910626,
"train_speed(iter/s)": 0.169222
},
{
"epoch": 1.6948886798369394,
"grad_norm": 0.6420454978942871,
"learning_rate": 3.9810399185492406e-06,
"loss": 0.16325095891952515,
"memory(GiB)": 34.95,
"step": 675,
"token_acc": 0.9350154972645768,
"train_speed(iter/s)": 0.16944
},
{
"epoch": 1.7074317968015051,
"grad_norm": 0.6421252489089966,
"learning_rate": 3.916731630967741e-06,
"loss": 0.17528104782104492,
"memory(GiB)": 34.95,
"step": 680,
"token_acc": 0.9365230616994374,
"train_speed(iter/s)": 0.169645
},
{
"epoch": 1.7074317968015051,
"eval_loss": 0.2283635139465332,
"eval_runtime": 9.9363,
"eval_samples_per_second": 25.865,
"eval_steps_per_second": 6.542,
"eval_token_acc": 0.9346672028956364,
"step": 680
},
{
"epoch": 1.719974913766071,
"grad_norm": 0.6265820264816284,
"learning_rate": 3.852610825956529e-06,
"loss": 0.16770663261413574,
"memory(GiB)": 34.95,
"step": 685,
"token_acc": 0.9386050786166663,
"train_speed(iter/s)": 0.169004
},
{
"epoch": 1.7325180307306365,
"grad_norm": 0.6280970573425293,
"learning_rate": 3.788688600981085e-06,
"loss": 0.1680266261100769,
"memory(GiB)": 34.95,
"step": 690,
"token_acc": 0.9436317194937884,
"train_speed(iter/s)": 0.169183
},
{
"epoch": 1.7450611476952023,
"grad_norm": 0.575031578540802,
"learning_rate": 3.7249760191384055e-06,
"loss": 0.16007229089736938,
"memory(GiB)": 34.95,
"step": 695,
"token_acc": 0.9408609064687402,
"train_speed(iter/s)": 0.169405
},
{
"epoch": 1.757604264659768,
"grad_norm": 0.6248459219932556,
"learning_rate": 3.6614841072422913e-06,
"loss": 0.16597646474838257,
"memory(GiB)": 34.95,
"step": 700,
"token_acc": 0.9334997820314668,
"train_speed(iter/s)": 0.169658
},
{
"epoch": 1.757604264659768,
"eval_loss": 0.227211132645607,
"eval_runtime": 9.9535,
"eval_samples_per_second": 25.82,
"eval_steps_per_second": 6.53,
"eval_token_acc": 0.9347778001206515,
"step": 700
},
{
"epoch": 1.7701473816243336,
"grad_norm": 0.6405999064445496,
"learning_rate": 3.5982238539149287e-06,
"loss": 0.16680790185928346,
"memory(GiB)": 34.95,
"step": 705,
"token_acc": 0.9360328466797351,
"train_speed(iter/s)": 0.169094
},
{
"epoch": 1.7826904985888994,
"grad_norm": 0.6649206280708313,
"learning_rate": 3.535206207685079e-06,
"loss": 0.1820515751838684,
"memory(GiB)": 34.95,
"step": 710,
"token_acc": 0.9366235113407408,
"train_speed(iter/s)": 0.169367
},
{
"epoch": 1.7952336155534652,
"grad_norm": 0.6017094254493713,
"learning_rate": 3.472442075093192e-06,
"loss": 0.1508460283279419,
"memory(GiB)": 34.95,
"step": 715,
"token_acc": 0.9486971106461709,
"train_speed(iter/s)": 0.169534
},
{
"epoch": 1.8077767325180307,
"grad_norm": 0.5928083062171936,
"learning_rate": 3.4099423188038094e-06,
"loss": 0.16222984790802003,
"memory(GiB)": 34.95,
"step": 720,
"token_acc": 0.9467024477514842,
"train_speed(iter/s)": 0.169693
},
{
"epoch": 1.8077767325180307,
"eval_loss": 0.2278689295053482,
"eval_runtime": 9.9852,
"eval_samples_per_second": 25.738,
"eval_steps_per_second": 6.51,
"eval_token_acc": 0.9349235873718078,
"step": 720
},
{
"epoch": 1.8203198494825963,
"grad_norm": 0.6321941018104553,
"learning_rate": 3.347717755725547e-06,
"loss": 0.17015450000762938,
"memory(GiB)": 34.95,
"step": 725,
"token_acc": 0.9337387521012558,
"train_speed(iter/s)": 0.169142
},
{
"epoch": 1.832862966447162,
"grad_norm": 0.6680959463119507,
"learning_rate": 3.2857791551389907e-06,
"loss": 0.16979444026947021,
"memory(GiB)": 34.95,
"step": 730,
"token_acc": 0.9440408017179671,
"train_speed(iter/s)": 0.1693
},
{
"epoch": 1.8454060834117278,
"grad_norm": 0.6347929835319519,
"learning_rate": 3.224137236832859e-06,
"loss": 0.16566884517669678,
"memory(GiB)": 34.95,
"step": 735,
"token_acc": 0.9421418181073162,
"train_speed(iter/s)": 0.169501
},
{
"epoch": 1.8579492003762934,
"grad_norm": 0.6764352321624756,
"learning_rate": 3.1628026692487053e-06,
"loss": 0.1652566075325012,
"memory(GiB)": 34.95,
"step": 740,
"token_acc": 0.9433130787598766,
"train_speed(iter/s)": 0.169711
},
{
"epoch": 1.8579492003762934,
"eval_loss": 0.22732892632484436,
"eval_runtime": 9.9558,
"eval_samples_per_second": 25.814,
"eval_steps_per_second": 6.529,
"eval_token_acc": 0.9350542931831892,
"step": 740
},
{
"epoch": 1.8704923173408592,
"grad_norm": 0.6389771103858948,
"learning_rate": 3.1017860676345184e-06,
"loss": 0.15687326192855836,
"memory(GiB)": 34.95,
"step": 745,
"token_acc": 0.9414018945533932,
"train_speed(iter/s)": 0.169168
},
{
"epoch": 1.883035434305425,
"grad_norm": 0.6998502612113953,
"learning_rate": 3.0410979922075344e-06,
"loss": 0.17107654809951783,
"memory(GiB)": 34.95,
"step": 750,
"token_acc": 0.9427076541922024,
"train_speed(iter/s)": 0.169327
},
{
"epoch": 1.8955785512699905,
"grad_norm": 0.6121336817741394,
"learning_rate": 2.980748946326564e-06,
"loss": 0.16890095472335814,
"memory(GiB)": 34.95,
"step": 755,
"token_acc": 0.9396650021625447,
"train_speed(iter/s)": 0.169512
},
{
"epoch": 1.9081216682345563,
"grad_norm": 0.5630024075508118,
"learning_rate": 2.920749374674161e-06,
"loss": 0.16135737895965577,
"memory(GiB)": 34.95,
"step": 760,
"token_acc": 0.9455813142757539,
"train_speed(iter/s)": 0.169692
},
{
"epoch": 1.9081216682345563,
"eval_loss": 0.2258785516023636,
"eval_runtime": 9.9483,
"eval_samples_per_second": 25.833,
"eval_steps_per_second": 6.534,
"eval_token_acc": 0.9357782022923788,
"step": 760
},
{
"epoch": 1.920664785199122,
"grad_norm": 0.6215230226516724,
"learning_rate": 2.861109661448952e-06,
"loss": 0.160076367855072,
"memory(GiB)": 34.95,
"step": 765,
"token_acc": 0.938600821420109,
"train_speed(iter/s)": 0.169164
},
{
"epoch": 1.9332079021636877,
"grad_norm": 0.6377970576286316,
"learning_rate": 2.8018401285684284e-06,
"loss": 0.16507962942123414,
"memory(GiB)": 34.95,
"step": 770,
"token_acc": 0.9362514029180696,
"train_speed(iter/s)": 0.169343
},
{
"epoch": 1.9457510191282532,
"grad_norm": 0.6416298747062683,
"learning_rate": 2.7429510338825206e-06,
"loss": 0.1676865577697754,
"memory(GiB)": 34.95,
"step": 775,
"token_acc": 0.9356394574884725,
"train_speed(iter/s)": 0.169514
},
{
"epoch": 1.9582941360928192,
"grad_norm": 0.6064321398735046,
"learning_rate": 2.6844525693982614e-06,
"loss": 0.1615642786026001,
"memory(GiB)": 34.95,
"step": 780,
"token_acc": 0.9434515921396388,
"train_speed(iter/s)": 0.169698
},
{
"epoch": 1.9582941360928192,
"eval_loss": 0.22540703415870667,
"eval_runtime": 9.9865,
"eval_samples_per_second": 25.735,
"eval_steps_per_second": 6.509,
"eval_token_acc": 0.9356072793082646,
"step": 780
},
{
"epoch": 1.9708372530573848,
"grad_norm": 0.6212838888168335,
"learning_rate": 2.6263548595158374e-06,
"loss": 0.16903696060180665,
"memory(GiB)": 34.95,
"step": 785,
"token_acc": 0.9373140403756506,
"train_speed(iter/s)": 0.16916
},
{
"epoch": 1.9833803700219503,
"grad_norm": 0.6316173076629639,
"learning_rate": 2.568667959276351e-06,
"loss": 0.1633455991744995,
"memory(GiB)": 34.95,
"step": 790,
"token_acc": 0.9499691904033075,
"train_speed(iter/s)": 0.169327
},
{
"epoch": 1.9959234869865161,
"grad_norm": 0.5879420638084412,
"learning_rate": 2.5114018526215843e-06,
"loss": 0.15602803230285645,
"memory(GiB)": 34.95,
"step": 795,
"token_acc": 0.9389440475085831,
"train_speed(iter/s)": 0.169519
},
{
"epoch": 2.0100344935716525,
"grad_norm": 0.5628567337989807,
"learning_rate": 2.454566450666061e-06,
"loss": 0.1572946071624756,
"memory(GiB)": 34.95,
"step": 800,
"token_acc": 0.9571394981693594,
"train_speed(iter/s)": 0.169704
},
{
"epoch": 2.0100344935716525,
"eval_loss": 0.22719430923461914,
"eval_runtime": 9.9565,
"eval_samples_per_second": 25.812,
"eval_steps_per_second": 6.528,
"eval_token_acc": 0.9355921978684898,
"step": 800
},
{
"epoch": 2.022577610536218,
"grad_norm": 0.6104578971862793,
"learning_rate": 2.398171589981721e-06,
"loss": 0.1239326000213623,
"memory(GiB)": 34.95,
"step": 805,
"token_acc": 0.9446134994383744,
"train_speed(iter/s)": 0.169099
},
{
"epoch": 2.035120727500784,
"grad_norm": 0.5852713584899902,
"learning_rate": 2.3422270308954936e-06,
"loss": 0.12712430953979492,
"memory(GiB)": 34.95,
"step": 810,
"token_acc": 0.9523950262830121,
"train_speed(iter/s)": 0.169261
},
{
"epoch": 2.0476638444653497,
"grad_norm": 0.6348543763160706,
"learning_rate": 2.286742455800059e-06,
"loss": 0.12253003120422364,
"memory(GiB)": 34.95,
"step": 815,
"token_acc": 0.9575001424257962,
"train_speed(iter/s)": 0.169477
},
{
"epoch": 2.060206961429915,
"grad_norm": 0.6192832589149475,
"learning_rate": 2.2317274674781158e-06,
"loss": 0.12359896898269654,
"memory(GiB)": 34.95,
"step": 820,
"token_acc": 0.952242789995938,
"train_speed(iter/s)": 0.169661
},
{
"epoch": 2.060206961429915,
"eval_loss": 0.24132946133613586,
"eval_runtime": 9.9829,
"eval_samples_per_second": 25.744,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.934958777397949,
"step": 820
},
{
"epoch": 2.072750078394481,
"grad_norm": 0.6246756315231323,
"learning_rate": 2.1771915874404094e-06,
"loss": 0.1322195291519165,
"memory(GiB)": 34.95,
"step": 825,
"token_acc": 0.9414432054743698,
"train_speed(iter/s)": 0.169161
},
{
"epoch": 2.085293195359047,
"grad_norm": 0.5839347243309021,
"learning_rate": 2.1231442542778317e-06,
"loss": 0.11952453851699829,
"memory(GiB)": 34.95,
"step": 830,
"token_acc": 0.956975505857295,
"train_speed(iter/s)": 0.169338
},
{
"epoch": 2.0978363123236123,
"grad_norm": 0.6020109057426453,
"learning_rate": 2.0695948220278756e-06,
"loss": 0.12150832414627075,
"memory(GiB)": 34.95,
"step": 835,
"token_acc": 0.950202699878798,
"train_speed(iter/s)": 0.169516
},
{
"epoch": 2.110379429288178,
"grad_norm": 0.6176694631576538,
"learning_rate": 2.0165525585557205e-06,
"loss": 0.12181558609008789,
"memory(GiB)": 34.95,
"step": 840,
"token_acc": 0.9584100732944936,
"train_speed(iter/s)": 0.169655
},
{
"epoch": 2.110379429288178,
"eval_loss": 0.23956826329231262,
"eval_runtime": 9.9764,
"eval_samples_per_second": 25.761,
"eval_steps_per_second": 6.515,
"eval_token_acc": 0.9351900261411623,
"step": 840
},
{
"epoch": 2.122922546252744,
"grad_norm": 0.6029666066169739,
"learning_rate": 1.964026643950226e-06,
"loss": 0.11940534114837646,
"memory(GiB)": 34.95,
"step": 845,
"token_acc": 0.9457718501702611,
"train_speed(iter/s)": 0.16911
},
{
"epoch": 2.1354656632173095,
"grad_norm": 0.5822896957397461,
"learning_rate": 1.9120261689351317e-06,
"loss": 0.11883677244186401,
"memory(GiB)": 34.95,
"step": 850,
"token_acc": 0.9609223300970874,
"train_speed(iter/s)": 0.169285
},
{
"epoch": 2.148008780181875,
"grad_norm": 0.6058652997016907,
"learning_rate": 1.860560133295708e-06,
"loss": 0.12740614414215087,
"memory(GiB)": 34.95,
"step": 855,
"token_acc": 0.9549561469832148,
"train_speed(iter/s)": 0.169475
},
{
"epoch": 2.160551897146441,
"grad_norm": 0.5672310590744019,
"learning_rate": 1.8096374443211545e-06,
"loss": 0.12559156417846679,
"memory(GiB)": 34.95,
"step": 860,
"token_acc": 0.9539432293401429,
"train_speed(iter/s)": 0.169653
},
{
"epoch": 2.160551897146441,
"eval_loss": 0.23939980566501617,
"eval_runtime": 9.9791,
"eval_samples_per_second": 25.754,
"eval_steps_per_second": 6.514,
"eval_token_acc": 0.9351900261411623,
"step": 860
},
{
"epoch": 2.1730950141110066,
"grad_norm": 0.6905380487442017,
"learning_rate": 1.7592669152630082e-06,
"loss": 0.12502384185791016,
"memory(GiB)": 34.95,
"step": 865,
"token_acc": 0.9430737514131808,
"train_speed(iter/s)": 0.16916
},
{
"epoch": 2.185638131075572,
"grad_norm": 0.604882001876831,
"learning_rate": 1.7094572638098122e-06,
"loss": 0.13246217966079712,
"memory(GiB)": 34.95,
"step": 870,
"token_acc": 0.9538787052672268,
"train_speed(iter/s)": 0.169321
},
{
"epoch": 2.198181248040138,
"grad_norm": 0.6221954226493835,
"learning_rate": 1.6602171105783488e-06,
"loss": 0.12281397581100464,
"memory(GiB)": 34.95,
"step": 875,
"token_acc": 0.9516503156133547,
"train_speed(iter/s)": 0.169488
},
{
"epoch": 2.2107243650047037,
"grad_norm": 0.5445839166641235,
"learning_rate": 1.61155497762165e-06,
"loss": 0.11812053918838501,
"memory(GiB)": 34.95,
"step": 880,
"token_acc": 0.9578427802726868,
"train_speed(iter/s)": 0.169626
},
{
"epoch": 2.2107243650047037,
"eval_loss": 0.23973500728607178,
"eval_runtime": 9.9832,
"eval_samples_per_second": 25.743,
"eval_steps_per_second": 6.511,
"eval_token_acc": 0.9351347275286548,
"step": 880
},
{
"epoch": 2.2232674819692693,
"grad_norm": 0.6164165735244751,
"learning_rate": 1.5634792869540782e-06,
"loss": 0.11963331699371338,
"memory(GiB)": 34.95,
"step": 885,
"token_acc": 0.9436205250131545,
"train_speed(iter/s)": 0.169137
},
{
"epoch": 2.235810598933835,
"grad_norm": 0.606275200843811,
"learning_rate": 1.5159983590937183e-06,
"loss": 0.12453606128692626,
"memory(GiB)": 34.95,
"step": 890,
"token_acc": 0.9526249104831157,
"train_speed(iter/s)": 0.169305
},
{
"epoch": 2.248353715898401,
"grad_norm": 0.6022824048995972,
"learning_rate": 1.4691204116223357e-06,
"loss": 0.11552423238754272,
"memory(GiB)": 34.95,
"step": 895,
"token_acc": 0.9613866135340565,
"train_speed(iter/s)": 0.169507
},
{
"epoch": 2.2608968328629664,
"grad_norm": 0.6075533032417297,
"learning_rate": 1.4228535577631442e-06,
"loss": 0.12762036323547363,
"memory(GiB)": 34.95,
"step": 900,
"token_acc": 0.9527419384954348,
"train_speed(iter/s)": 0.169672
},
{
"epoch": 2.2608968328629664,
"eval_loss": 0.24023577570915222,
"eval_runtime": 9.9811,
"eval_samples_per_second": 25.749,
"eval_steps_per_second": 6.512,
"eval_token_acc": 0.9352000804343454,
"step": 900
},
{
"epoch": 2.273439949827532,
"grad_norm": 0.6038256883621216,
"learning_rate": 1.3772058049766491e-06,
"loss": 0.12403825521469117,
"memory(GiB)": 34.95,
"step": 905,
"token_acc": 0.9445350568832248,
"train_speed(iter/s)": 0.169145
},
{
"epoch": 2.285983066792098,
"grad_norm": 0.6491556763648987,
"learning_rate": 1.3321850535747822e-06,
"loss": 0.12173200845718384,
"memory(GiB)": 34.95,
"step": 910,
"token_acc": 0.9588150821120849,
"train_speed(iter/s)": 0.169343
},
{
"epoch": 2.2985261837566635,
"grad_norm": 0.548174262046814,
"learning_rate": 1.2877990953535841e-06,
"loss": 0.12104053497314453,
"memory(GiB)": 34.95,
"step": 915,
"token_acc": 0.958676718877986,
"train_speed(iter/s)": 0.169527
},
{
"epoch": 2.311069300721229,
"grad_norm": 0.5731512904167175,
"learning_rate": 1.2440556122446701e-06,
"loss": 0.12762261629104615,
"memory(GiB)": 34.95,
"step": 920,
"token_acc": 0.9535161617972158,
"train_speed(iter/s)": 0.169669
},
{
"epoch": 2.311069300721229,
"eval_loss": 0.2390112429857254,
"eval_runtime": 9.9533,
"eval_samples_per_second": 25.821,
"eval_steps_per_second": 6.53,
"eval_token_acc": 0.9355570078423486,
"step": 920
},
{
"epoch": 2.323612417685795,
"grad_norm": 0.634443998336792,
"learning_rate": 1.2009621749857103e-06,
"loss": 0.12285526990890502,
"memory(GiB)": 34.95,
"step": 925,
"token_acc": 0.9447460370728625,
"train_speed(iter/s)": 0.169211
},
{
"epoch": 2.3361555346503606,
"grad_norm": 0.674192488193512,
"learning_rate": 1.1585262418101468e-06,
"loss": 0.13117657899856566,
"memory(GiB)": 34.95,
"step": 930,
"token_acc": 0.9548805986574227,
"train_speed(iter/s)": 0.16934
},
{
"epoch": 2.348698651614926,
"grad_norm": 0.6701561808586121,
"learning_rate": 1.1167551571563967e-06,
"loss": 0.12773873805999755,
"memory(GiB)": 34.95,
"step": 935,
"token_acc": 0.9571292006765203,
"train_speed(iter/s)": 0.169553
},
{
"epoch": 2.361241768579492,
"grad_norm": 0.5688639879226685,
"learning_rate": 1.0756561503967366e-06,
"loss": 0.12773098945617675,
"memory(GiB)": 34.95,
"step": 940,
"token_acc": 0.9547074376365099,
"train_speed(iter/s)": 0.16967
},
{
"epoch": 2.361241768579492,
"eval_loss": 0.23946641385555267,
"eval_runtime": 9.9907,
"eval_samples_per_second": 25.724,
"eval_steps_per_second": 6.506,
"eval_token_acc": 0.9353056505127689,
"step": 940
},
{
"epoch": 2.3737848855440578,
"grad_norm": 0.6829060316085815,
"learning_rate": 1.0352363345861067e-06,
"loss": 0.1251779556274414,
"memory(GiB)": 34.95,
"step": 945,
"token_acc": 0.9461535568551496,
"train_speed(iter/s)": 0.169209
},
{
"epoch": 2.3863280025086233,
"grad_norm": 0.6119195222854614,
"learning_rate": 9.955027052310445e-07,
"loss": 0.12672061920166017,
"memory(GiB)": 34.95,
"step": 950,
"token_acc": 0.9541979451343965,
"train_speed(iter/s)": 0.169349
},
{
"epoch": 2.3988711194731893,
"grad_norm": 0.6136831045150757,
"learning_rate": 9.564621390789692e-07,
"loss": 0.12832672595977784,
"memory(GiB)": 34.95,
"step": 955,
"token_acc": 0.9493431077797455,
"train_speed(iter/s)": 0.169478
},
{
"epoch": 2.411414236437755,
"grad_norm": 0.6445709466934204,
"learning_rate": 9.181213929280047e-07,
"loss": 0.12906695604324342,
"memory(GiB)": 34.95,
"step": 960,
"token_acc": 0.9479911420436571,
"train_speed(iter/s)": 0.16967
},
{
"epoch": 2.411414236437755,
"eval_loss": 0.23977875709533691,
"eval_runtime": 9.9861,
"eval_samples_per_second": 25.736,
"eval_steps_per_second": 6.509,
"eval_token_acc": 0.9355368992559823,
"step": 960
},
{
"epoch": 2.4239573534023204,
"grad_norm": 0.5981658101081848,
"learning_rate": 8.804871024575851e-07,
"loss": 0.12087714672088623,
"memory(GiB)": 34.95,
"step": 965,
"token_acc": 0.9459313171146616,
"train_speed(iter/s)": 0.169225
},
{
"epoch": 2.436500470366886,
"grad_norm": 0.591122567653656,
"learning_rate": 8.435657810799991e-07,
"loss": 0.11974387168884278,
"memory(GiB)": 34.95,
"step": 970,
"token_acc": 0.9559715418707722,
"train_speed(iter/s)": 0.169402
},
{
"epoch": 2.449043587331452,
"grad_norm": 0.6105548143386841,
"learning_rate": 8.073638188131128e-07,
"loss": 0.12425668239593506,
"memory(GiB)": 34.95,
"step": 975,
"token_acc": 0.9544568733678918,
"train_speed(iter/s)": 0.169513
},
{
"epoch": 2.4615867042960176,
"grad_norm": 0.5976568460464478,
"learning_rate": 7.71887481174437e-07,
"loss": 0.12979369163513182,
"memory(GiB)": 34.95,
"step": 980,
"token_acc": 0.9534985244556201,
"train_speed(iter/s)": 0.16963
},
{
"epoch": 2.4615867042960176,
"eval_loss": 0.23947912454605103,
"eval_runtime": 9.9815,
"eval_samples_per_second": 25.748,
"eval_steps_per_second": 6.512,
"eval_token_acc": 0.9354866277900663,
"step": 980
},
{
"epoch": 2.474129821260583,
"grad_norm": 0.6130661964416504,
"learning_rate": 7.371429080967468e-07,
"loss": 0.12366334199905396,
"memory(GiB)": 34.95,
"step": 985,
"token_acc": 0.9456251029644058,
"train_speed(iter/s)": 0.169181
},
{
"epoch": 2.486672938225149,
"grad_norm": 0.611749529838562,
"learning_rate": 7.031361128654402e-07,
"loss": 0.11961600780487061,
"memory(GiB)": 34.95,
"step": 990,
"token_acc": 0.9553609289884855,
"train_speed(iter/s)": 0.169374
},
{
"epoch": 2.4992160551897147,
"grad_norm": 0.6357402205467224,
"learning_rate": 6.698729810778065e-07,
"loss": 0.12684570550918578,
"memory(GiB)": 34.95,
"step": 995,
"token_acc": 0.9534441273571709,
"train_speed(iter/s)": 0.169506
},
{
"epoch": 2.5117591721542802,
"grad_norm": 0.5895079970359802,
"learning_rate": 6.373592696244024e-07,
"loss": 0.12313053607940674,
"memory(GiB)": 34.95,
"step": 1000,
"token_acc": 0.9565139198618804,
"train_speed(iter/s)": 0.169623
},
{
"epoch": 2.5117591721542802,
"eval_loss": 0.23871561884880066,
"eval_runtime": 9.9765,
"eval_samples_per_second": 25.76,
"eval_steps_per_second": 6.515,
"eval_token_acc": 0.935416247737784,
"step": 1000
},
{
"epoch": 2.524302289118846,
"grad_norm": 0.6320595145225525,
"learning_rate": 6.056006056926978e-07,
"loss": 0.12712349891662597,
"memory(GiB)": 34.95,
"step": 1005,
"token_acc": 0.9447087643998138,
"train_speed(iter/s)": 0.169177
},
{
"epoch": 2.536845406083412,
"grad_norm": 0.611575186252594,
"learning_rate": 5.746024857931732e-07,
"loss": 0.12986292839050292,
"memory(GiB)": 34.95,
"step": 1010,
"token_acc": 0.9510800508259212,
"train_speed(iter/s)": 0.169271
},
{
"epoch": 2.5493885230479774,
"grad_norm": 0.6201998591423035,
"learning_rate": 5.443702748080288e-07,
"loss": 0.12274014949798584,
"memory(GiB)": 34.95,
"step": 1015,
"token_acc": 0.955253177824786,
"train_speed(iter/s)": 0.169445
},
{
"epoch": 2.561931640012543,
"grad_norm": 0.6165274977684021,
"learning_rate": 5.149092050626825e-07,
"loss": 0.1297899603843689,
"memory(GiB)": 34.95,
"step": 1020,
"token_acc": 0.9506936125816299,
"train_speed(iter/s)": 0.169553
},
{
"epoch": 2.561931640012543,
"eval_loss": 0.23904787003993988,
"eval_runtime": 10.067,
"eval_samples_per_second": 25.529,
"eval_steps_per_second": 6.457,
"eval_token_acc": 0.935441383470742,
"step": 1020
},
{
"epoch": 2.574474756977109,
"grad_norm": 0.6250675916671753,
"learning_rate": 4.862243754202023e-07,
"loss": 0.12486759424209595,
"memory(GiB)": 34.95,
"step": 1025,
"token_acc": 0.9427952415499746,
"train_speed(iter/s)": 0.169143
},
{
"epoch": 2.5870178739416745,
"grad_norm": 0.5898808240890503,
"learning_rate": 4.5832075039884014e-07,
"loss": 0.11898901462554931,
"memory(GiB)": 34.95,
"step": 1030,
"token_acc": 0.9599557987792043,
"train_speed(iter/s)": 0.169245
},
{
"epoch": 2.59956099090624,
"grad_norm": 0.6565684080123901,
"learning_rate": 4.3120315931281633e-07,
"loss": 0.12741444110870362,
"memory(GiB)": 34.95,
"step": 1035,
"token_acc": 0.9581729932512134,
"train_speed(iter/s)": 0.169389
},
{
"epoch": 2.612104107870806,
"grad_norm": 0.5875664949417114,
"learning_rate": 4.048762954365054e-07,
"loss": 0.12610654830932616,
"memory(GiB)": 34.95,
"step": 1040,
"token_acc": 0.9550036071318149,
"train_speed(iter/s)": 0.169544
},
{
"epoch": 2.612104107870806,
"eval_loss": 0.23876284062862396,
"eval_runtime": 9.9698,
"eval_samples_per_second": 25.778,
"eval_steps_per_second": 6.52,
"eval_token_acc": 0.9356223607480394,
"step": 1040
},
{
"epoch": 2.6246472248353716,
"grad_norm": 0.6185052990913391,
"learning_rate": 3.793447151921642e-07,
"loss": 0.11991071701049805,
"memory(GiB)": 34.95,
"step": 1045,
"token_acc": 0.9459399138299112,
"train_speed(iter/s)": 0.169124
},
{
"epoch": 2.637190341799937,
"grad_norm": 0.6107172966003418,
"learning_rate": 3.546128373613472e-07,
"loss": 0.11918728351593018,
"memory(GiB)": 34.95,
"step": 1050,
"token_acc": 0.9534668113226157,
"train_speed(iter/s)": 0.169318
},
{
"epoch": 2.649733458764503,
"grad_norm": 0.6262523531913757,
"learning_rate": 3.30684942320143e-07,
"loss": 0.11955299377441406,
"memory(GiB)": 34.95,
"step": 1055,
"token_acc": 0.9587094529959127,
"train_speed(iter/s)": 0.169507
},
{
"epoch": 2.6622765757290687,
"grad_norm": 0.5960172414779663,
"learning_rate": 3.0756517129836296e-07,
"loss": 0.12312361001968383,
"memory(GiB)": 34.95,
"step": 1060,
"token_acc": 0.9537392406006633,
"train_speed(iter/s)": 0.169614
},
{
"epoch": 2.6622765757290687,
"eval_loss": 0.2386154979467392,
"eval_runtime": 9.9782,
"eval_samples_per_second": 25.756,
"eval_steps_per_second": 6.514,
"eval_token_acc": 0.9356575507741806,
"step": 1060
},
{
"epoch": 2.6748196926936343,
"grad_norm": 0.6013411283493042,
"learning_rate": 2.8525752566281485e-07,
"loss": 0.11395795345306396,
"memory(GiB)": 34.95,
"step": 1065,
"token_acc": 0.947853377091845,
"train_speed(iter/s)": 0.169208
},
{
"epoch": 2.6873628096582003,
"grad_norm": 0.6079533696174622,
"learning_rate": 2.637658662247805e-07,
"loss": 0.12184674739837646,
"memory(GiB)": 34.95,
"step": 1070,
"token_acc": 0.9590189382179447,
"train_speed(iter/s)": 0.169294
},
{
"epoch": 2.699905926622766,
"grad_norm": 0.5974398851394653,
"learning_rate": 2.430939125718218e-07,
"loss": 0.12283775806427003,
"memory(GiB)": 34.95,
"step": 1075,
"token_acc": 0.9571679809383332,
"train_speed(iter/s)": 0.169407
},
{
"epoch": 2.7124490435873314,
"grad_norm": 0.6196507811546326,
"learning_rate": 2.232452424240261e-07,
"loss": 0.12050046920776367,
"memory(GiB)": 34.95,
"step": 1080,
"token_acc": 0.9597835852963006,
"train_speed(iter/s)": 0.169553
},
{
"epoch": 2.7124490435873314,
"eval_loss": 0.23875917494297028,
"eval_runtime": 9.9675,
"eval_samples_per_second": 25.784,
"eval_steps_per_second": 6.521,
"eval_token_acc": 0.9355821435753067,
"step": 1080
},
{
"epoch": 2.7249921605518974,
"grad_norm": 0.6392203569412231,
"learning_rate": 2.042232910148051e-07,
"loss": 0.11989054679870606,
"memory(GiB)": 34.95,
"step": 1085,
"token_acc": 0.9460828818275003,
"train_speed(iter/s)": 0.16913
},
{
"epoch": 2.737535277516463,
"grad_norm": 0.6039083003997803,
"learning_rate": 1.860313504963579e-07,
"loss": 0.11684960126876831,
"memory(GiB)": 34.95,
"step": 1090,
"token_acc": 0.9531301093630782,
"train_speed(iter/s)": 0.169284
},
{
"epoch": 2.7500783944810285,
"grad_norm": 0.6016117930412292,
"learning_rate": 1.6867256936989097e-07,
"loss": 0.12414079904556274,
"memory(GiB)": 34.95,
"step": 1095,
"token_acc": 0.9582522047875387,
"train_speed(iter/s)": 0.169393
},
{
"epoch": 2.762621511445594,
"grad_norm": 0.609747588634491,
"learning_rate": 1.521499519407038e-07,
"loss": 0.11737879514694213,
"memory(GiB)": 34.95,
"step": 1100,
"token_acc": 0.95666478832276,
"train_speed(iter/s)": 0.169525
},
{
"epoch": 2.762621511445594,
"eval_loss": 0.2390962839126587,
"eval_runtime": 9.9713,
"eval_samples_per_second": 25.774,
"eval_steps_per_second": 6.519,
"eval_token_acc": 0.9355821435753067,
"step": 1100
},
{
"epoch": 2.7751646284101597,
"grad_norm": 0.540483832359314,
"learning_rate": 1.364663577982317e-07,
"loss": 0.11655213832855224,
"memory(GiB)": 34.95,
"step": 1105,
"token_acc": 0.947412632708209,
"train_speed(iter/s)": 0.169129
},
{
"epoch": 2.7877077453747257,
"grad_norm": 0.6210323572158813,
"learning_rate": 1.2162450132113202e-07,
"loss": 0.12057442665100097,
"memory(GiB)": 34.95,
"step": 1110,
"token_acc": 0.9559064846811715,
"train_speed(iter/s)": 0.169327
},
{
"epoch": 2.800250862339291,
"grad_norm": 0.6889768242835999,
"learning_rate": 1.07626951207504e-07,
"loss": 0.12380859851837159,
"memory(GiB)": 34.95,
"step": 1115,
"token_acc": 0.9528777568254488,
"train_speed(iter/s)": 0.169457
},
{
"epoch": 2.8127939793038568,
"grad_norm": 0.5808271169662476,
"learning_rate": 9.447613003032042e-08,
"loss": 0.12284276485443116,
"memory(GiB)": 34.95,
"step": 1120,
"token_acc": 0.9562657695542472,
"train_speed(iter/s)": 0.169618
},
{
"epoch": 2.8127939793038568,
"eval_loss": 0.23907452821731567,
"eval_runtime": 9.9798,
"eval_samples_per_second": 25.752,
"eval_steps_per_second": 6.513,
"eval_token_acc": 0.9356977679469133,
"step": 1120
},
{
"epoch": 2.8253370962684228,
"grad_norm": 0.6014649868011475,
"learning_rate": 8.217431381815078e-08,
"loss": 0.12331821918487548,
"memory(GiB)": 34.95,
"step": 1125,
"token_acc": 0.9462024372046755,
"train_speed(iter/s)": 0.169228
},
{
"epoch": 2.8378802132329883,
"grad_norm": 0.6275858283042908,
"learning_rate": 7.072363166124363e-08,
"loss": 0.12597228288650514,
"memory(GiB)": 34.95,
"step": 1130,
"token_acc": 0.9567274137262505,
"train_speed(iter/s)": 0.169368
},
{
"epoch": 2.850423330197554,
"grad_norm": 0.6735210418701172,
"learning_rate": 6.012606534304688e-08,
"loss": 0.12442328929901122,
"memory(GiB)": 34.95,
"step": 1135,
"token_acc": 0.9592088998763906,
"train_speed(iter/s)": 0.169498
},
{
"epoch": 2.86296644716212,
"grad_norm": 0.6021392345428467,
"learning_rate": 5.038344899721437e-08,
"loss": 0.13345457315444947,
"memory(GiB)": 34.95,
"step": 1140,
"token_acc": 0.9546434206981488,
"train_speed(iter/s)": 0.169641
},
{
"epoch": 2.86296644716212,
"eval_loss": 0.23904505372047424,
"eval_runtime": 9.9416,
"eval_samples_per_second": 25.851,
"eval_steps_per_second": 6.538,
"eval_token_acc": 0.9355821435753067,
"step": 1140
},
{
"epoch": 2.8755095641266855,
"grad_norm": 0.6207495927810669,
"learning_rate": 4.149746879017147e-08,
"loss": 0.12285821437835694,
"memory(GiB)": 34.95,
"step": 1145,
"token_acc": 0.9433157837334047,
"train_speed(iter/s)": 0.169265
},
{
"epoch": 2.888052681091251,
"grad_norm": 0.5906082391738892,
"learning_rate": 3.3469662629289635e-08,
"loss": 0.1214226484298706,
"memory(GiB)": 34.95,
"step": 1150,
"token_acc": 0.9550792604937793,
"train_speed(iter/s)": 0.169398
},
{
"epoch": 2.900595798055817,
"grad_norm": 0.6598130464553833,
"learning_rate": 2.630141989671542e-08,
"loss": 0.12527458667755126,
"memory(GiB)": 34.95,
"step": 1155,
"token_acc": 0.9544634286811288,
"train_speed(iter/s)": 0.169521
},
{
"epoch": 2.9131389150203826,
"grad_norm": 0.6021043062210083,
"learning_rate": 1.999398120891116e-08,
"loss": 0.1194157361984253,
"memory(GiB)": 34.95,
"step": 1160,
"token_acc": 0.9597829112162538,
"train_speed(iter/s)": 0.169653
},
{
"epoch": 2.9131389150203826,
"eval_loss": 0.23905108869075775,
"eval_runtime": 9.9613,
"eval_samples_per_second": 25.8,
"eval_steps_per_second": 6.525,
"eval_token_acc": 0.935652523627589,
"step": 1160
},
{
"epoch": 2.925682031984948,
"grad_norm": 0.5893052816390991,
"learning_rate": 1.4548438201939518e-08,
"loss": 0.11504523754119873,
"memory(GiB)": 34.95,
"step": 1165,
"token_acc": 0.9459238731463808,
"train_speed(iter/s)": 0.16932
},
{
"epoch": 2.938225148949514,
"grad_norm": 0.6343664526939392,
"learning_rate": 9.965733342532925e-09,
"loss": 0.1275886058807373,
"memory(GiB)": 34.95,
"step": 1170,
"token_acc": 0.9558923448588253,
"train_speed(iter/s)": 0.169447
},
{
"epoch": 2.9507682659140797,
"grad_norm": 0.6318545341491699,
"learning_rate": 6.246659764979068e-09,
"loss": 0.12401323318481446,
"memory(GiB)": 34.95,
"step": 1175,
"token_acc": 0.9594298632943586,
"train_speed(iter/s)": 0.169549
},
{
"epoch": 2.9633113828786453,
"grad_norm": 0.6376116275787354,
"learning_rate": 3.3918611338507046e-09,
"loss": 0.11943215131759644,
"memory(GiB)": 34.95,
"step": 1180,
"token_acc": 0.9601843185254518,
"train_speed(iter/s)": 0.169695
},
{
"epoch": 2.9633113828786453,
"eval_loss": 0.2389203906059265,
"eval_runtime": 9.9873,
"eval_samples_per_second": 25.733,
"eval_steps_per_second": 6.508,
"eval_token_acc": 0.935702795093505,
"step": 1180
},
{
"epoch": 2.9758544998432113,
"grad_norm": 0.5881339311599731,
"learning_rate": 1.4018315326103094e-09,
"loss": 0.11981034278869629,
"memory(GiB)": 34.95,
"step": 1185,
"token_acc": 0.9440527542267837,
"train_speed(iter/s)": 0.169309
},
{
"epoch": 2.988397616807777,
"grad_norm": 0.6789658069610596,
"learning_rate": 2.7691537809293454e-10,
"loss": 0.1259629726409912,
"memory(GiB)": 34.95,
"step": 1190,
"token_acc": 0.9503406881712312,
"train_speed(iter/s)": 0.169392
},
{
"epoch": 2.9984321103794294,
"eval_loss": 0.23875285685062408,
"eval_runtime": 9.984,
"eval_samples_per_second": 25.741,
"eval_steps_per_second": 6.51,
"eval_token_acc": 0.9355570078423486,
"step": 1194
}
],
"logging_steps": 5,
"max_steps": 1194,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.502521518607827e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}