2965 lines
84 KiB
JSON
2965 lines
84 KiB
JSON
{
|
|
"best_global_step": 780,
|
|
"best_metric": 0.22540703,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v49-20250505-211427/checkpoint-780",
|
|
"epoch": 2.9984321103794294,
|
|
"eval_steps": 20,
|
|
"global_step": 1194,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.002508623392913139,
|
|
"grad_norm": 2.4406964778900146,
|
|
"learning_rate": 9.999982692639099e-06,
|
|
"loss": 0.39261603355407715,
|
|
"memory(GiB)": 27.73,
|
|
"step": 1,
|
|
"token_acc": 0.8706896551724138,
|
|
"train_speed(iter/s)": 0.073021
|
|
},
|
|
{
|
|
"epoch": 0.012543116964565695,
|
|
"grad_norm": 1.4140573740005493,
|
|
"learning_rate": 9.999567321968297e-06,
|
|
"loss": 0.32130101323127747,
|
|
"memory(GiB)": 27.73,
|
|
"step": 5,
|
|
"token_acc": 0.8974483833268406,
|
|
"train_speed(iter/s)": 0.150893
|
|
},
|
|
{
|
|
"epoch": 0.02508623392913139,
|
|
"grad_norm": 0.9324946403503418,
|
|
"learning_rate": 9.998269362757298e-06,
|
|
"loss": 0.2845744609832764,
|
|
"memory(GiB)": 27.73,
|
|
"step": 10,
|
|
"token_acc": 0.9072619069023176,
|
|
"train_speed(iter/s)": 0.174486
|
|
},
|
|
{
|
|
"epoch": 0.03762935089369708,
|
|
"grad_norm": 0.8129357695579529,
|
|
"learning_rate": 9.996106347006378e-06,
|
|
"loss": 0.27310004234313967,
|
|
"memory(GiB)": 27.73,
|
|
"step": 15,
|
|
"token_acc": 0.9068501494128172,
|
|
"train_speed(iter/s)": 0.182334
|
|
},
|
|
{
|
|
"epoch": 0.05017246785826278,
|
|
"grad_norm": 0.8004717826843262,
|
|
"learning_rate": 9.993078649071297e-06,
|
|
"loss": 0.293326735496521,
|
|
"memory(GiB)": 27.73,
|
|
"step": 20,
|
|
"token_acc": 0.9049762744859472,
|
|
"train_speed(iter/s)": 0.189416
|
|
},
|
|
{
|
|
"epoch": 0.05017246785826278,
|
|
"eval_loss": 0.2858642339706421,
|
|
"eval_runtime": 10.0812,
|
|
"eval_samples_per_second": 25.493,
|
|
"eval_steps_per_second": 6.448,
|
|
"eval_token_acc": 0.9193595415242308,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.06271558482282848,
|
|
"grad_norm": 0.7604618668556213,
|
|
"learning_rate": 9.989186792959408e-06,
|
|
"loss": 0.26241092681884765,
|
|
"memory(GiB)": 27.73,
|
|
"step": 25,
|
|
"token_acc": 0.9132264916006951,
|
|
"train_speed(iter/s)": 0.162946
|
|
},
|
|
{
|
|
"epoch": 0.07525870178739416,
|
|
"grad_norm": 0.6763395071029663,
|
|
"learning_rate": 9.984431452238968e-06,
|
|
"loss": 0.2643896102905273,
|
|
"memory(GiB)": 30.07,
|
|
"step": 30,
|
|
"token_acc": 0.9132476909865227,
|
|
"train_speed(iter/s)": 0.168084
|
|
},
|
|
{
|
|
"epoch": 0.08780181875195986,
|
|
"grad_norm": 0.7725923657417297,
|
|
"learning_rate": 9.97881344992256e-06,
|
|
"loss": 0.24953582286834716,
|
|
"memory(GiB)": 30.07,
|
|
"step": 35,
|
|
"token_acc": 0.9186898409484755,
|
|
"train_speed(iter/s)": 0.173361
|
|
},
|
|
{
|
|
"epoch": 0.10034493571652556,
|
|
"grad_norm": 0.6925674676895142,
|
|
"learning_rate": 9.97233375832466e-06,
|
|
"loss": 0.25593545436859133,
|
|
"memory(GiB)": 30.07,
|
|
"step": 40,
|
|
"token_acc": 0.9142301666698126,
|
|
"train_speed(iter/s)": 0.17571
|
|
},
|
|
{
|
|
"epoch": 0.10034493571652556,
|
|
"eval_loss": 0.27316194772720337,
|
|
"eval_runtime": 9.946,
|
|
"eval_samples_per_second": 25.84,
|
|
"eval_steps_per_second": 6.535,
|
|
"eval_token_acc": 0.9228584355519807,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.11288805268109126,
|
|
"grad_norm": 0.6484134197235107,
|
|
"learning_rate": 9.964993498893349e-06,
|
|
"loss": 0.2584169626235962,
|
|
"memory(GiB)": 30.07,
|
|
"step": 45,
|
|
"token_acc": 0.9155336997396587,
|
|
"train_speed(iter/s)": 0.163839
|
|
},
|
|
{
|
|
"epoch": 0.12543116964565695,
|
|
"grad_norm": 0.6846535205841064,
|
|
"learning_rate": 9.95679394201623e-06,
|
|
"loss": 0.24747424125671386,
|
|
"memory(GiB)": 30.07,
|
|
"step": 50,
|
|
"token_acc": 0.9217108554277138,
|
|
"train_speed(iter/s)": 0.167611
|
|
},
|
|
{
|
|
"epoch": 0.13797428661022265,
|
|
"grad_norm": 0.6725199222564697,
|
|
"learning_rate": 9.947736506800554e-06,
|
|
"loss": 0.2658252716064453,
|
|
"memory(GiB)": 30.07,
|
|
"step": 55,
|
|
"token_acc": 0.9149937868631535,
|
|
"train_speed(iter/s)": 0.170799
|
|
},
|
|
{
|
|
"epoch": 0.15051740357478832,
|
|
"grad_norm": 0.6383991241455078,
|
|
"learning_rate": 9.93782276082762e-06,
|
|
"loss": 0.23914179801940919,
|
|
"memory(GiB)": 30.07,
|
|
"step": 60,
|
|
"token_acc": 0.9193507567193643,
|
|
"train_speed(iter/s)": 0.173334
|
|
},
|
|
{
|
|
"epoch": 0.15051740357478832,
|
|
"eval_loss": 0.26351168751716614,
|
|
"eval_runtime": 9.9368,
|
|
"eval_samples_per_second": 25.863,
|
|
"eval_steps_per_second": 6.541,
|
|
"eval_token_acc": 0.924492258194249,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.16306052053935402,
|
|
"grad_norm": 0.6977497935295105,
|
|
"learning_rate": 9.927054419881462e-06,
|
|
"loss": 0.2521164894104004,
|
|
"memory(GiB)": 30.07,
|
|
"step": 65,
|
|
"token_acc": 0.9175230463277722,
|
|
"train_speed(iter/s)": 0.165312
|
|
},
|
|
{
|
|
"epoch": 0.17560363750391972,
|
|
"grad_norm": 0.6636829972267151,
|
|
"learning_rate": 9.915433347651909e-06,
|
|
"loss": 0.2421865940093994,
|
|
"memory(GiB)": 30.07,
|
|
"step": 70,
|
|
"token_acc": 0.9232868615930188,
|
|
"train_speed(iter/s)": 0.168227
|
|
},
|
|
{
|
|
"epoch": 0.18814675446848542,
|
|
"grad_norm": 0.715813398361206,
|
|
"learning_rate": 9.90296155541202e-06,
|
|
"loss": 0.2508160352706909,
|
|
"memory(GiB)": 30.07,
|
|
"step": 75,
|
|
"token_acc": 0.921933587812046,
|
|
"train_speed(iter/s)": 0.169862
|
|
},
|
|
{
|
|
"epoch": 0.20068987143305111,
|
|
"grad_norm": 0.762688934803009,
|
|
"learning_rate": 9.88964120167001e-06,
|
|
"loss": 0.2518954277038574,
|
|
"memory(GiB)": 30.07,
|
|
"step": 80,
|
|
"token_acc": 0.9140404864303723,
|
|
"train_speed(iter/s)": 0.171765
|
|
},
|
|
{
|
|
"epoch": 0.20068987143305111,
|
|
"eval_loss": 0.26037997007369995,
|
|
"eval_runtime": 9.9473,
|
|
"eval_samples_per_second": 25.836,
|
|
"eval_steps_per_second": 6.534,
|
|
"eval_token_acc": 0.9254122260205108,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.2132329883976168,
|
|
"grad_norm": 0.7675485610961914,
|
|
"learning_rate": 9.875474591795648e-06,
|
|
"loss": 0.24610612392425538,
|
|
"memory(GiB)": 30.07,
|
|
"step": 85,
|
|
"token_acc": 0.9191244147424867,
|
|
"train_speed(iter/s)": 0.165845
|
|
},
|
|
{
|
|
"epoch": 0.2257761053621825,
|
|
"grad_norm": 0.6830568909645081,
|
|
"learning_rate": 9.860464177621286e-06,
|
|
"loss": 0.24553425312042237,
|
|
"memory(GiB)": 30.07,
|
|
"step": 90,
|
|
"token_acc": 0.9210544741632825,
|
|
"train_speed(iter/s)": 0.167772
|
|
},
|
|
{
|
|
"epoch": 0.2383192223267482,
|
|
"grad_norm": 0.6381848454475403,
|
|
"learning_rate": 9.84461255701751e-06,
|
|
"loss": 0.24247684478759765,
|
|
"memory(GiB)": 30.07,
|
|
"step": 95,
|
|
"token_acc": 0.9123848317331006,
|
|
"train_speed(iter/s)": 0.169012
|
|
},
|
|
{
|
|
"epoch": 0.2508623392913139,
|
|
"grad_norm": 0.6407138109207153,
|
|
"learning_rate": 9.827922473443518e-06,
|
|
"loss": 0.2540575504302979,
|
|
"memory(GiB)": 30.07,
|
|
"step": 100,
|
|
"token_acc": 0.9123342939481268,
|
|
"train_speed(iter/s)": 0.170603
|
|
},
|
|
{
|
|
"epoch": 0.2508623392913139,
|
|
"eval_loss": 0.255385160446167,
|
|
"eval_runtime": 9.9766,
|
|
"eval_samples_per_second": 25.76,
|
|
"eval_steps_per_second": 6.515,
|
|
"eval_token_acc": 0.9266941484013674,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.2634054562558796,
|
|
"grad_norm": 0.6974446177482605,
|
|
"learning_rate": 9.810396815472316e-06,
|
|
"loss": 0.2443918228149414,
|
|
"memory(GiB)": 30.07,
|
|
"step": 105,
|
|
"token_acc": 0.9175226016453799,
|
|
"train_speed(iter/s)": 0.166259
|
|
},
|
|
{
|
|
"epoch": 0.2759485732204453,
|
|
"grad_norm": 0.6513930559158325,
|
|
"learning_rate": 9.79203861629078e-06,
|
|
"loss": 0.25510406494140625,
|
|
"memory(GiB)": 30.07,
|
|
"step": 110,
|
|
"token_acc": 0.9140450964411845,
|
|
"train_speed(iter/s)": 0.168031
|
|
},
|
|
{
|
|
"epoch": 0.288491690185011,
|
|
"grad_norm": 0.7416351437568665,
|
|
"learning_rate": 9.772851053174708e-06,
|
|
"loss": 0.24527263641357422,
|
|
"memory(GiB)": 30.07,
|
|
"step": 115,
|
|
"token_acc": 0.9193030038453679,
|
|
"train_speed(iter/s)": 0.169489
|
|
},
|
|
{
|
|
"epoch": 0.30103480714957664,
|
|
"grad_norm": 0.7076767086982727,
|
|
"learning_rate": 9.752837446938915e-06,
|
|
"loss": 0.24852099418640136,
|
|
"memory(GiB)": 30.07,
|
|
"step": 120,
|
|
"token_acc": 0.9153006496449615,
|
|
"train_speed(iter/s)": 0.170153
|
|
},
|
|
{
|
|
"epoch": 0.30103480714957664,
|
|
"eval_loss": 0.252126008272171,
|
|
"eval_runtime": 9.9461,
|
|
"eval_samples_per_second": 25.839,
|
|
"eval_steps_per_second": 6.535,
|
|
"eval_token_acc": 0.927654333400362,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.31357792411414237,
|
|
"grad_norm": 0.6797928214073181,
|
|
"learning_rate": 9.732001261362503e-06,
|
|
"loss": 0.23887033462524415,
|
|
"memory(GiB)": 30.07,
|
|
"step": 125,
|
|
"token_acc": 0.9209902737430722,
|
|
"train_speed(iter/s)": 0.166589
|
|
},
|
|
{
|
|
"epoch": 0.32612104107870804,
|
|
"grad_norm": 0.7788575291633606,
|
|
"learning_rate": 9.710346102589376e-06,
|
|
"loss": 0.2535351276397705,
|
|
"memory(GiB)": 30.07,
|
|
"step": 130,
|
|
"token_acc": 0.9120982792920281,
|
|
"train_speed(iter/s)": 0.167787
|
|
},
|
|
{
|
|
"epoch": 0.33866415804327377,
|
|
"grad_norm": 0.6328480243682861,
|
|
"learning_rate": 9.687875718504126e-06,
|
|
"loss": 0.2357191562652588,
|
|
"memory(GiB)": 30.07,
|
|
"step": 135,
|
|
"token_acc": 0.9174817518248175,
|
|
"train_speed(iter/s)": 0.168973
|
|
},
|
|
{
|
|
"epoch": 0.35120727500783944,
|
|
"grad_norm": 0.6453744769096375,
|
|
"learning_rate": 9.664593998083374e-06,
|
|
"loss": 0.24335532188415526,
|
|
"memory(GiB)": 30.07,
|
|
"step": 140,
|
|
"token_acc": 0.9164061768834815,
|
|
"train_speed(iter/s)": 0.169985
|
|
},
|
|
{
|
|
"epoch": 0.35120727500783944,
|
|
"eval_loss": 0.24922603368759155,
|
|
"eval_runtime": 9.9665,
|
|
"eval_samples_per_second": 25.786,
|
|
"eval_steps_per_second": 6.522,
|
|
"eval_token_acc": 0.9285793283732153,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.36375039197240516,
|
|
"grad_norm": 0.655521810054779,
|
|
"learning_rate": 9.640504970722708e-06,
|
|
"loss": 0.23469161987304688,
|
|
"memory(GiB)": 30.07,
|
|
"step": 145,
|
|
"token_acc": 0.9239321638563585,
|
|
"train_speed(iter/s)": 0.166594
|
|
},
|
|
{
|
|
"epoch": 0.37629350893697083,
|
|
"grad_norm": 0.656192421913147,
|
|
"learning_rate": 9.615612805539305e-06,
|
|
"loss": 0.23534941673278809,
|
|
"memory(GiB)": 30.07,
|
|
"step": 150,
|
|
"token_acc": 0.9101899504070365,
|
|
"train_speed(iter/s)": 0.16793
|
|
},
|
|
{
|
|
"epoch": 0.38883662590153656,
|
|
"grad_norm": 0.6794695258140564,
|
|
"learning_rate": 9.589921810650379e-06,
|
|
"loss": 0.24691348075866698,
|
|
"memory(GiB)": 30.07,
|
|
"step": 155,
|
|
"token_acc": 0.9117996509857795,
|
|
"train_speed(iter/s)": 0.168976
|
|
},
|
|
{
|
|
"epoch": 0.40137974286610223,
|
|
"grad_norm": 0.6336193680763245,
|
|
"learning_rate": 9.563436432427571e-06,
|
|
"loss": 0.23817820549011232,
|
|
"memory(GiB)": 30.07,
|
|
"step": 160,
|
|
"token_acc": 0.9163882846488502,
|
|
"train_speed(iter/s)": 0.170021
|
|
},
|
|
{
|
|
"epoch": 0.40137974286610223,
|
|
"eval_loss": 0.24645844101905823,
|
|
"eval_runtime": 9.937,
|
|
"eval_samples_per_second": 25.863,
|
|
"eval_steps_per_second": 6.541,
|
|
"eval_token_acc": 0.9283782425095516,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.4139228598306679,
|
|
"grad_norm": 0.6395715475082397,
|
|
"learning_rate": 9.536161254727407e-06,
|
|
"loss": 0.23914387226104736,
|
|
"memory(GiB)": 30.07,
|
|
"step": 165,
|
|
"token_acc": 0.9195228213727245,
|
|
"train_speed(iter/s)": 0.167509
|
|
},
|
|
{
|
|
"epoch": 0.4264659767952336,
|
|
"grad_norm": 0.6767882704734802,
|
|
"learning_rate": 9.508100998097971e-06,
|
|
"loss": 0.2324080467224121,
|
|
"memory(GiB)": 30.07,
|
|
"step": 170,
|
|
"token_acc": 0.9136844562004045,
|
|
"train_speed(iter/s)": 0.168597
|
|
},
|
|
{
|
|
"epoch": 0.4390090937597993,
|
|
"grad_norm": 0.6208192706108093,
|
|
"learning_rate": 9.479260518961904e-06,
|
|
"loss": 0.23578665256500245,
|
|
"memory(GiB)": 30.07,
|
|
"step": 175,
|
|
"token_acc": 0.9154970653640042,
|
|
"train_speed(iter/s)": 0.169348
|
|
},
|
|
{
|
|
"epoch": 0.451552210724365,
|
|
"grad_norm": 0.6273934841156006,
|
|
"learning_rate": 9.449644808775902e-06,
|
|
"loss": 0.23413596153259278,
|
|
"memory(GiB)": 30.07,
|
|
"step": 180,
|
|
"token_acc": 0.9301044600520517,
|
|
"train_speed(iter/s)": 0.170195
|
|
},
|
|
{
|
|
"epoch": 0.451552210724365,
|
|
"eval_loss": 0.2454940527677536,
|
|
"eval_runtime": 9.965,
|
|
"eval_samples_per_second": 25.79,
|
|
"eval_steps_per_second": 6.523,
|
|
"eval_token_acc": 0.9287401970641463,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.4640953276889307,
|
|
"grad_norm": 0.6492425203323364,
|
|
"learning_rate": 9.419258993166846e-06,
|
|
"loss": 0.2424703598022461,
|
|
"memory(GiB)": 30.07,
|
|
"step": 185,
|
|
"token_acc": 0.9212295968281757,
|
|
"train_speed(iter/s)": 0.167631
|
|
},
|
|
{
|
|
"epoch": 0.4766384446534964,
|
|
"grad_norm": 0.6422290205955505,
|
|
"learning_rate": 9.388108331044687e-06,
|
|
"loss": 0.23424482345581055,
|
|
"memory(GiB)": 30.07,
|
|
"step": 190,
|
|
"token_acc": 0.9254705767559033,
|
|
"train_speed(iter/s)": 0.168154
|
|
},
|
|
{
|
|
"epoch": 0.4891815616180621,
|
|
"grad_norm": 0.6608842015266418,
|
|
"learning_rate": 9.356198213692297e-06,
|
|
"loss": 0.23567054271697999,
|
|
"memory(GiB)": 30.07,
|
|
"step": 195,
|
|
"token_acc": 0.9198833160816787,
|
|
"train_speed(iter/s)": 0.168995
|
|
},
|
|
{
|
|
"epoch": 0.5017246785826278,
|
|
"grad_norm": 0.675005316734314,
|
|
"learning_rate": 9.323534163832387e-06,
|
|
"loss": 0.24276134967803956,
|
|
"memory(GiB)": 30.07,
|
|
"step": 200,
|
|
"token_acc": 0.913303071968056,
|
|
"train_speed(iter/s)": 0.170071
|
|
},
|
|
{
|
|
"epoch": 0.5017246785826278,
|
|
"eval_loss": 0.24412870407104492,
|
|
"eval_runtime": 9.9709,
|
|
"eval_samples_per_second": 25.775,
|
|
"eval_steps_per_second": 6.519,
|
|
"eval_token_acc": 0.9292730746028555,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.5142677955471935,
|
|
"grad_norm": 0.6404038071632385,
|
|
"learning_rate": 9.290121834671669e-06,
|
|
"loss": 0.23353495597839355,
|
|
"memory(GiB)": 30.07,
|
|
"step": 205,
|
|
"token_acc": 0.9230231647999404,
|
|
"train_speed(iter/s)": 0.167939
|
|
},
|
|
{
|
|
"epoch": 0.5268109125117592,
|
|
"grad_norm": 0.6592434048652649,
|
|
"learning_rate": 9.255967008922475e-06,
|
|
"loss": 0.21883893013000488,
|
|
"memory(GiB)": 30.07,
|
|
"step": 210,
|
|
"token_acc": 0.9253728456196685,
|
|
"train_speed(iter/s)": 0.168578
|
|
},
|
|
{
|
|
"epoch": 0.5393540294763248,
|
|
"grad_norm": 0.656053900718689,
|
|
"learning_rate": 9.221075597801912e-06,
|
|
"loss": 0.2320107936859131,
|
|
"memory(GiB)": 30.07,
|
|
"step": 215,
|
|
"token_acc": 0.9213286713286714,
|
|
"train_speed(iter/s)": 0.169087
|
|
},
|
|
{
|
|
"epoch": 0.5518971464408906,
|
|
"grad_norm": 0.64938884973526,
|
|
"learning_rate": 9.18545364000882e-06,
|
|
"loss": 0.23771371841430664,
|
|
"memory(GiB)": 30.07,
|
|
"step": 220,
|
|
"token_acc": 0.9130793725675198,
|
|
"train_speed(iter/s)": 0.169997
|
|
},
|
|
{
|
|
"epoch": 0.5518971464408906,
|
|
"eval_loss": 0.24264991283416748,
|
|
"eval_runtime": 9.9448,
|
|
"eval_samples_per_second": 25.843,
|
|
"eval_steps_per_second": 6.536,
|
|
"eval_token_acc": 0.929810979288156,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.5644402634054563,
|
|
"grad_norm": 0.6256290674209595,
|
|
"learning_rate": 9.14910730067863e-06,
|
|
"loss": 0.23107373714447021,
|
|
"memory(GiB)": 30.07,
|
|
"step": 225,
|
|
"token_acc": 0.9247168944864042,
|
|
"train_speed(iter/s)": 0.168098
|
|
},
|
|
{
|
|
"epoch": 0.576983380370022,
|
|
"grad_norm": 0.5621991753578186,
|
|
"learning_rate": 9.112042870316365e-06,
|
|
"loss": 0.21704797744750975,
|
|
"memory(GiB)": 30.07,
|
|
"step": 230,
|
|
"token_acc": 0.9269190993704302,
|
|
"train_speed(iter/s)": 0.168845
|
|
},
|
|
{
|
|
"epoch": 0.5895264973345876,
|
|
"grad_norm": 0.6810638308525085,
|
|
"learning_rate": 9.074266763707937e-06,
|
|
"loss": 0.2278088092803955,
|
|
"memory(GiB)": 30.07,
|
|
"step": 235,
|
|
"token_acc": 0.9134312189271274,
|
|
"train_speed(iter/s)": 0.169551
|
|
},
|
|
{
|
|
"epoch": 0.6020696142991533,
|
|
"grad_norm": 0.5773187279701233,
|
|
"learning_rate": 9.035785518809928e-06,
|
|
"loss": 0.21931402683258056,
|
|
"memory(GiB)": 30.07,
|
|
"step": 240,
|
|
"token_acc": 0.9246131941148552,
|
|
"train_speed(iter/s)": 0.170025
|
|
},
|
|
{
|
|
"epoch": 0.6020696142991533,
|
|
"eval_loss": 0.24013011157512665,
|
|
"eval_runtime": 9.9847,
|
|
"eval_samples_per_second": 25.739,
|
|
"eval_steps_per_second": 6.51,
|
|
"eval_token_acc": 0.9308616529257994,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.6146127312637191,
|
|
"grad_norm": 0.6531880497932434,
|
|
"learning_rate": 8.996605795618054e-06,
|
|
"loss": 0.24535005092620848,
|
|
"memory(GiB)": 30.07,
|
|
"step": 245,
|
|
"token_acc": 0.915537267670059,
|
|
"train_speed(iter/s)": 0.168114
|
|
},
|
|
{
|
|
"epoch": 0.6271558482282847,
|
|
"grad_norm": 0.6779309511184692,
|
|
"learning_rate": 8.956734375014525e-06,
|
|
"loss": 0.23181967735290526,
|
|
"memory(GiB)": 30.07,
|
|
"step": 250,
|
|
"token_acc": 0.9155568096313017,
|
|
"train_speed(iter/s)": 0.168895
|
|
},
|
|
{
|
|
"epoch": 0.6396989651928504,
|
|
"grad_norm": 0.6643815040588379,
|
|
"learning_rate": 8.916178157594453e-06,
|
|
"loss": 0.23725414276123047,
|
|
"memory(GiB)": 30.07,
|
|
"step": 255,
|
|
"token_acc": 0.9135585175809833,
|
|
"train_speed(iter/s)": 0.169436
|
|
},
|
|
{
|
|
"epoch": 0.6522420821574161,
|
|
"grad_norm": 0.5990136861801147,
|
|
"learning_rate": 8.87494416247157e-06,
|
|
"loss": 0.22699012756347656,
|
|
"memory(GiB)": 30.07,
|
|
"step": 260,
|
|
"token_acc": 0.924126221001221,
|
|
"train_speed(iter/s)": 0.170075
|
|
},
|
|
{
|
|
"epoch": 0.6522420821574161,
|
|
"eval_loss": 0.23895950615406036,
|
|
"eval_runtime": 9.971,
|
|
"eval_samples_per_second": 25.775,
|
|
"eval_steps_per_second": 6.519,
|
|
"eval_token_acc": 0.9309320329780817,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.6647851991219819,
|
|
"grad_norm": 0.647604763507843,
|
|
"learning_rate": 8.833039526063414e-06,
|
|
"loss": 0.22294692993164061,
|
|
"memory(GiB)": 30.07,
|
|
"step": 265,
|
|
"token_acc": 0.923023885742428,
|
|
"train_speed(iter/s)": 0.168429
|
|
},
|
|
{
|
|
"epoch": 0.6773283160865475,
|
|
"grad_norm": 0.7019293904304504,
|
|
"learning_rate": 8.790471500856229e-06,
|
|
"loss": 0.22493109703063965,
|
|
"memory(GiB)": 30.07,
|
|
"step": 270,
|
|
"token_acc": 0.9217673989150925,
|
|
"train_speed(iter/s)": 0.168763
|
|
},
|
|
{
|
|
"epoch": 0.6898714330511132,
|
|
"grad_norm": 0.709414541721344,
|
|
"learning_rate": 8.747247454149754e-06,
|
|
"loss": 0.23138487339019775,
|
|
"memory(GiB)": 30.07,
|
|
"step": 275,
|
|
"token_acc": 0.9201797011093793,
|
|
"train_speed(iter/s)": 0.169186
|
|
},
|
|
{
|
|
"epoch": 0.7024145500156789,
|
|
"grad_norm": 0.5952314138412476,
|
|
"learning_rate": 8.703374866782172e-06,
|
|
"loss": 0.2214064598083496,
|
|
"memory(GiB)": 30.07,
|
|
"step": 280,
|
|
"token_acc": 0.9243394229601485,
|
|
"train_speed(iter/s)": 0.169579
|
|
},
|
|
{
|
|
"epoch": 0.7024145500156789,
|
|
"eval_loss": 0.2380242794752121,
|
|
"eval_runtime": 9.9399,
|
|
"eval_samples_per_second": 25.855,
|
|
"eval_steps_per_second": 6.539,
|
|
"eval_token_acc": 0.9309119243917152,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.7149576669802445,
|
|
"grad_norm": 0.6550840735435486,
|
|
"learning_rate": 8.658861331835384e-06,
|
|
"loss": 0.22828481197357178,
|
|
"memory(GiB)": 30.07,
|
|
"step": 285,
|
|
"token_acc": 0.9233320082850388,
|
|
"train_speed(iter/s)": 0.16832
|
|
},
|
|
{
|
|
"epoch": 0.7275007839448103,
|
|
"grad_norm": 0.6396917700767517,
|
|
"learning_rate": 8.613714553320863e-06,
|
|
"loss": 0.22759134769439698,
|
|
"memory(GiB)": 30.07,
|
|
"step": 290,
|
|
"token_acc": 0.9214041461850823,
|
|
"train_speed(iter/s)": 0.168862
|
|
},
|
|
{
|
|
"epoch": 0.740043900909376,
|
|
"grad_norm": 0.6359645128250122,
|
|
"learning_rate": 8.567942344846311e-06,
|
|
"loss": 0.2300776481628418,
|
|
"memory(GiB)": 30.07,
|
|
"step": 295,
|
|
"token_acc": 0.9242182277352745,
|
|
"train_speed(iter/s)": 0.169333
|
|
},
|
|
{
|
|
"epoch": 0.7525870178739417,
|
|
"grad_norm": 0.6625407338142395,
|
|
"learning_rate": 8.521552628263362e-06,
|
|
"loss": 0.23114292621612548,
|
|
"memory(GiB)": 30.07,
|
|
"step": 300,
|
|
"token_acc": 0.9189751431314676,
|
|
"train_speed(iter/s)": 0.169808
|
|
},
|
|
{
|
|
"epoch": 0.7525870178739417,
|
|
"eval_loss": 0.2363082766532898,
|
|
"eval_runtime": 9.9749,
|
|
"eval_samples_per_second": 25.765,
|
|
"eval_steps_per_second": 6.516,
|
|
"eval_token_acc": 0.9315302634224814,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.7651301348385073,
|
|
"grad_norm": 0.6202587485313416,
|
|
"learning_rate": 8.474553432296517e-06,
|
|
"loss": 0.224021315574646,
|
|
"memory(GiB)": 30.07,
|
|
"step": 305,
|
|
"token_acc": 0.9276043893401739,
|
|
"train_speed(iter/s)": 0.168359
|
|
},
|
|
{
|
|
"epoch": 0.7776732518030731,
|
|
"grad_norm": 0.6691203713417053,
|
|
"learning_rate": 8.426952891153617e-06,
|
|
"loss": 0.23445448875427247,
|
|
"memory(GiB)": 30.07,
|
|
"step": 310,
|
|
"token_acc": 0.923713052741816,
|
|
"train_speed(iter/s)": 0.168781
|
|
},
|
|
{
|
|
"epoch": 0.7902163687676388,
|
|
"grad_norm": 0.6087518334388733,
|
|
"learning_rate": 8.378759243118044e-06,
|
|
"loss": 0.22397913932800292,
|
|
"memory(GiB)": 30.07,
|
|
"step": 315,
|
|
"token_acc": 0.9215067830325622,
|
|
"train_speed(iter/s)": 0.169206
|
|
},
|
|
{
|
|
"epoch": 0.8027594857322045,
|
|
"grad_norm": 0.6450164914131165,
|
|
"learning_rate": 8.329980829122907e-06,
|
|
"loss": 0.2312875509262085,
|
|
"memory(GiB)": 30.07,
|
|
"step": 320,
|
|
"token_acc": 0.9272111639559235,
|
|
"train_speed(iter/s)": 0.169675
|
|
},
|
|
{
|
|
"epoch": 0.8027594857322045,
|
|
"eval_loss": 0.23446869850158691,
|
|
"eval_runtime": 9.9713,
|
|
"eval_samples_per_second": 25.774,
|
|
"eval_steps_per_second": 6.519,
|
|
"eval_token_acc": 0.931907299416851,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.8153026026967701,
|
|
"grad_norm": 0.6574311852455139,
|
|
"learning_rate": 8.280626091307466e-06,
|
|
"loss": 0.21696841716766357,
|
|
"memory(GiB)": 30.07,
|
|
"step": 325,
|
|
"token_acc": 0.9266242271024336,
|
|
"train_speed(iter/s)": 0.168386
|
|
},
|
|
{
|
|
"epoch": 0.8278457196613358,
|
|
"grad_norm": 0.6543525457382202,
|
|
"learning_rate": 8.23070357155605e-06,
|
|
"loss": 0.23127243518829346,
|
|
"memory(GiB)": 30.07,
|
|
"step": 330,
|
|
"token_acc": 0.9156461739292596,
|
|
"train_speed(iter/s)": 0.168765
|
|
},
|
|
{
|
|
"epoch": 0.8403888366259016,
|
|
"grad_norm": 0.6864754557609558,
|
|
"learning_rate": 8.18022191001969e-06,
|
|
"loss": 0.23221104145050048,
|
|
"memory(GiB)": 30.07,
|
|
"step": 335,
|
|
"token_acc": 0.9158222112374361,
|
|
"train_speed(iter/s)": 0.169317
|
|
},
|
|
{
|
|
"epoch": 0.8529319535904673,
|
|
"grad_norm": 0.6478604674339294,
|
|
"learning_rate": 8.129189843620766e-06,
|
|
"loss": 0.21692075729370117,
|
|
"memory(GiB)": 30.07,
|
|
"step": 340,
|
|
"token_acc": 0.92678130982976,
|
|
"train_speed(iter/s)": 0.169761
|
|
},
|
|
{
|
|
"epoch": 0.8529319535904673,
|
|
"eval_loss": 0.23404286801815033,
|
|
"eval_runtime": 9.9838,
|
|
"eval_samples_per_second": 25.742,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.9316659963804544,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.8654750705550329,
|
|
"grad_norm": 0.6460584402084351,
|
|
"learning_rate": 8.077616204540897e-06,
|
|
"loss": 0.21975212097167968,
|
|
"memory(GiB)": 30.07,
|
|
"step": 345,
|
|
"token_acc": 0.926279602750191,
|
|
"train_speed(iter/s)": 0.168486
|
|
},
|
|
{
|
|
"epoch": 0.8780181875195986,
|
|
"grad_norm": 0.6245233416557312,
|
|
"learning_rate": 8.02550991869234e-06,
|
|
"loss": 0.2209392309188843,
|
|
"memory(GiB)": 30.07,
|
|
"step": 350,
|
|
"token_acc": 0.9232239957902122,
|
|
"train_speed(iter/s)": 0.168953
|
|
},
|
|
{
|
|
"epoch": 0.8905613044841643,
|
|
"grad_norm": 0.6148731708526611,
|
|
"learning_rate": 7.972880004173175e-06,
|
|
"loss": 0.22880539894104004,
|
|
"memory(GiB)": 30.07,
|
|
"step": 355,
|
|
"token_acc": 0.9230032848427968,
|
|
"train_speed(iter/s)": 0.169373
|
|
},
|
|
{
|
|
"epoch": 0.90310442144873,
|
|
"grad_norm": 0.5976830124855042,
|
|
"learning_rate": 7.919735569706533e-06,
|
|
"loss": 0.2258004665374756,
|
|
"memory(GiB)": 30.07,
|
|
"step": 360,
|
|
"token_acc": 0.9294882944307232,
|
|
"train_speed(iter/s)": 0.169737
|
|
},
|
|
{
|
|
"epoch": 0.90310442144873,
|
|
"eval_loss": 0.23216013610363007,
|
|
"eval_runtime": 9.9402,
|
|
"eval_samples_per_second": 25.855,
|
|
"eval_steps_per_second": 6.539,
|
|
"eval_token_acc": 0.9317363764327368,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.9156475384132957,
|
|
"grad_norm": 0.6738146543502808,
|
|
"learning_rate": 7.86608581306413e-06,
|
|
"loss": 0.23008251190185547,
|
|
"memory(GiB)": 30.07,
|
|
"step": 365,
|
|
"token_acc": 0.9237889028684008,
|
|
"train_speed(iter/s)": 0.168422
|
|
},
|
|
{
|
|
"epoch": 0.9281906553778614,
|
|
"grad_norm": 0.5968795418739319,
|
|
"learning_rate": 7.811940019474414e-06,
|
|
"loss": 0.2311033248901367,
|
|
"memory(GiB)": 30.07,
|
|
"step": 370,
|
|
"token_acc": 0.9211925456821934,
|
|
"train_speed(iter/s)": 0.168654
|
|
},
|
|
{
|
|
"epoch": 0.940733772342427,
|
|
"grad_norm": 0.6431145668029785,
|
|
"learning_rate": 7.757307560015539e-06,
|
|
"loss": 0.21920247077941896,
|
|
"memory(GiB)": 30.07,
|
|
"step": 375,
|
|
"token_acc": 0.9226723579404703,
|
|
"train_speed(iter/s)": 0.168986
|
|
},
|
|
{
|
|
"epoch": 0.9532768893069928,
|
|
"grad_norm": 0.6081040501594543,
|
|
"learning_rate": 7.702197889993515e-06,
|
|
"loss": 0.23370542526245117,
|
|
"memory(GiB)": 30.07,
|
|
"step": 380,
|
|
"token_acc": 0.9235495603658321,
|
|
"train_speed(iter/s)": 0.169411
|
|
},
|
|
{
|
|
"epoch": 0.9532768893069928,
|
|
"eval_loss": 0.23192763328552246,
|
|
"eval_runtime": 9.9828,
|
|
"eval_samples_per_second": 25.744,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.9322843354112206,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.9658200062715585,
|
|
"grad_norm": 0.6549849510192871,
|
|
"learning_rate": 7.646620547305765e-06,
|
|
"loss": 0.22933628559112548,
|
|
"memory(GiB)": 30.07,
|
|
"step": 385,
|
|
"token_acc": 0.9272846380609236,
|
|
"train_speed(iter/s)": 0.168384
|
|
},
|
|
{
|
|
"epoch": 0.9783631232361242,
|
|
"grad_norm": 0.5421332120895386,
|
|
"learning_rate": 7.590585150790388e-06,
|
|
"loss": 0.2162912368774414,
|
|
"memory(GiB)": 30.07,
|
|
"step": 390,
|
|
"token_acc": 0.9282690665907798,
|
|
"train_speed(iter/s)": 0.168866
|
|
},
|
|
{
|
|
"epoch": 0.9909062402006898,
|
|
"grad_norm": 0.5929440259933472,
|
|
"learning_rate": 7.5341013985614064e-06,
|
|
"loss": 0.22078533172607423,
|
|
"memory(GiB)": 30.07,
|
|
"step": 395,
|
|
"token_acc": 0.9202857714192223,
|
|
"train_speed(iter/s)": 0.169287
|
|
},
|
|
{
|
|
"epoch": 1.0050172467858263,
|
|
"grad_norm": 0.6037130951881409,
|
|
"learning_rate": 7.47717906633032e-06,
|
|
"loss": 0.253579306602478,
|
|
"memory(GiB)": 30.07,
|
|
"step": 400,
|
|
"token_acc": 0.9251826086956522,
|
|
"train_speed(iter/s)": 0.169497
|
|
},
|
|
{
|
|
"epoch": 1.0050172467858263,
|
|
"eval_loss": 0.23065434396266937,
|
|
"eval_runtime": 9.9745,
|
|
"eval_samples_per_second": 25.766,
|
|
"eval_steps_per_second": 6.517,
|
|
"eval_token_acc": 0.9330032173738186,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.017560363750392,
|
|
"grad_norm": 0.6310690641403198,
|
|
"learning_rate": 7.419828005714195e-06,
|
|
"loss": 0.17385544776916503,
|
|
"memory(GiB)": 30.07,
|
|
"step": 405,
|
|
"token_acc": 0.9351318726588173,
|
|
"train_speed(iter/s)": 0.168293
|
|
},
|
|
{
|
|
"epoch": 1.0301034807149576,
|
|
"grad_norm": 0.6506795883178711,
|
|
"learning_rate": 7.362058142530639e-06,
|
|
"loss": 0.16791077852249145,
|
|
"memory(GiB)": 30.07,
|
|
"step": 410,
|
|
"token_acc": 0.9320981703907922,
|
|
"train_speed(iter/s)": 0.16871
|
|
},
|
|
{
|
|
"epoch": 1.0426465976795234,
|
|
"grad_norm": 0.5465015769004822,
|
|
"learning_rate": 7.303879475079931e-06,
|
|
"loss": 0.15868284702301025,
|
|
"memory(GiB)": 32.5,
|
|
"step": 415,
|
|
"token_acc": 0.942933207765865,
|
|
"train_speed(iter/s)": 0.168987
|
|
},
|
|
{
|
|
"epoch": 1.055189714644089,
|
|
"grad_norm": 0.6573348045349121,
|
|
"learning_rate": 7.245302072414602e-06,
|
|
"loss": 0.16553893089294433,
|
|
"memory(GiB)": 32.5,
|
|
"step": 420,
|
|
"token_acc": 0.9483149060876516,
|
|
"train_speed(iter/s)": 0.169449
|
|
},
|
|
{
|
|
"epoch": 1.055189714644089,
|
|
"eval_loss": 0.23652049899101257,
|
|
"eval_runtime": 9.9746,
|
|
"eval_samples_per_second": 25.765,
|
|
"eval_steps_per_second": 6.517,
|
|
"eval_token_acc": 0.9325306655942087,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.0677328316086547,
|
|
"grad_norm": 0.6287339925765991,
|
|
"learning_rate": 7.1863360725967615e-06,
|
|
"loss": 0.15552424192428588,
|
|
"memory(GiB)": 32.5,
|
|
"step": 425,
|
|
"token_acc": 0.9385299885841407,
|
|
"train_speed(iter/s)": 0.168478
|
|
},
|
|
{
|
|
"epoch": 1.0802759485732205,
|
|
"grad_norm": 0.6632113456726074,
|
|
"learning_rate": 7.126991680943508e-06,
|
|
"loss": 0.17083898782730103,
|
|
"memory(GiB)": 32.5,
|
|
"step": 430,
|
|
"token_acc": 0.9388098703940791,
|
|
"train_speed(iter/s)": 0.168867
|
|
},
|
|
{
|
|
"epoch": 1.092819065537786,
|
|
"grad_norm": 0.6978201270103455,
|
|
"learning_rate": 7.067279168260671e-06,
|
|
"loss": 0.17238540649414064,
|
|
"memory(GiB)": 32.5,
|
|
"step": 435,
|
|
"token_acc": 0.941155504865096,
|
|
"train_speed(iter/s)": 0.169229
|
|
},
|
|
{
|
|
"epoch": 1.1053621825023519,
|
|
"grad_norm": 0.6574686765670776,
|
|
"learning_rate": 7.007208869065232e-06,
|
|
"loss": 0.164797842502594,
|
|
"memory(GiB)": 32.5,
|
|
"step": 440,
|
|
"token_acc": 0.9414418794608996,
|
|
"train_speed(iter/s)": 0.169579
|
|
},
|
|
{
|
|
"epoch": 1.1053621825023519,
|
|
"eval_loss": 0.23527124524116516,
|
|
"eval_runtime": 9.9855,
|
|
"eval_samples_per_second": 25.737,
|
|
"eval_steps_per_second": 6.509,
|
|
"eval_token_acc": 0.932812185803338,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.1179052994669174,
|
|
"grad_norm": 0.6073827743530273,
|
|
"learning_rate": 6.946791179796718e-06,
|
|
"loss": 0.16431469917297364,
|
|
"memory(GiB)": 32.5,
|
|
"step": 445,
|
|
"token_acc": 0.936977573407634,
|
|
"train_speed(iter/s)": 0.168619
|
|
},
|
|
{
|
|
"epoch": 1.1304484164314832,
|
|
"grad_norm": 0.6841042041778564,
|
|
"learning_rate": 6.886036557017881e-06,
|
|
"loss": 0.1685694932937622,
|
|
"memory(GiB)": 32.5,
|
|
"step": 450,
|
|
"token_acc": 0.9406548805236119,
|
|
"train_speed(iter/s)": 0.168995
|
|
},
|
|
{
|
|
"epoch": 1.142991533396049,
|
|
"grad_norm": 0.6076449751853943,
|
|
"learning_rate": 6.824955515604957e-06,
|
|
"loss": 0.15892113447189332,
|
|
"memory(GiB)": 32.5,
|
|
"step": 455,
|
|
"token_acc": 0.9472979086195497,
|
|
"train_speed(iter/s)": 0.16932
|
|
},
|
|
{
|
|
"epoch": 1.1555346503606145,
|
|
"grad_norm": 0.6569898128509521,
|
|
"learning_rate": 6.76355862692786e-06,
|
|
"loss": 0.1675378680229187,
|
|
"memory(GiB)": 32.5,
|
|
"step": 460,
|
|
"token_acc": 0.9412598483175024,
|
|
"train_speed(iter/s)": 0.169705
|
|
},
|
|
{
|
|
"epoch": 1.1555346503606145,
|
|
"eval_loss": 0.23534773290157318,
|
|
"eval_runtime": 9.9809,
|
|
"eval_samples_per_second": 25.749,
|
|
"eval_steps_per_second": 6.512,
|
|
"eval_token_acc": 0.9333752262215966,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.1680777673251803,
|
|
"grad_norm": 0.6509740948677063,
|
|
"learning_rate": 6.701856517020565e-06,
|
|
"loss": 0.17353179454803466,
|
|
"memory(GiB)": 32.5,
|
|
"step": 465,
|
|
"token_acc": 0.9360858431432503,
|
|
"train_speed(iter/s)": 0.168849
|
|
},
|
|
{
|
|
"epoch": 1.180620884289746,
|
|
"grad_norm": 0.6769475340843201,
|
|
"learning_rate": 6.639859864742058e-06,
|
|
"loss": 0.16680521965026857,
|
|
"memory(GiB)": 32.5,
|
|
"step": 470,
|
|
"token_acc": 0.945840546350372,
|
|
"train_speed(iter/s)": 0.1692
|
|
},
|
|
{
|
|
"epoch": 1.1931640012543117,
|
|
"grad_norm": 0.6421222686767578,
|
|
"learning_rate": 6.5775793999281345e-06,
|
|
"loss": 0.1688302278518677,
|
|
"memory(GiB)": 34.95,
|
|
"step": 475,
|
|
"token_acc": 0.9395874540830743,
|
|
"train_speed(iter/s)": 0.16954
|
|
},
|
|
{
|
|
"epoch": 1.2057071182188774,
|
|
"grad_norm": 0.6938676834106445,
|
|
"learning_rate": 6.515025901534364e-06,
|
|
"loss": 0.1676286816596985,
|
|
"memory(GiB)": 34.95,
|
|
"step": 480,
|
|
"token_acc": 0.9379853728417402,
|
|
"train_speed(iter/s)": 0.169878
|
|
},
|
|
{
|
|
"epoch": 1.2057071182188774,
|
|
"eval_loss": 0.2359960675239563,
|
|
"eval_runtime": 9.9789,
|
|
"eval_samples_per_second": 25.754,
|
|
"eval_steps_per_second": 6.514,
|
|
"eval_token_acc": 0.9330987331590589,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 1.218250235183443,
|
|
"grad_norm": 0.65184086561203,
|
|
"learning_rate": 6.452210195770571e-06,
|
|
"loss": 0.1703261137008667,
|
|
"memory(GiB)": 34.95,
|
|
"step": 485,
|
|
"token_acc": 0.9328010713917791,
|
|
"train_speed(iter/s)": 0.169027
|
|
},
|
|
{
|
|
"epoch": 1.2307933521480088,
|
|
"grad_norm": 0.6309488415718079,
|
|
"learning_rate": 6.389143154227128e-06,
|
|
"loss": 0.17036676406860352,
|
|
"memory(GiB)": 34.95,
|
|
"step": 490,
|
|
"token_acc": 0.9409641272467474,
|
|
"train_speed(iter/s)": 0.169349
|
|
},
|
|
{
|
|
"epoch": 1.2433364691125746,
|
|
"grad_norm": 0.6250006556510925,
|
|
"learning_rate": 6.325835691993394e-06,
|
|
"loss": 0.17421271800994872,
|
|
"memory(GiB)": 34.95,
|
|
"step": 495,
|
|
"token_acc": 0.9379951431187548,
|
|
"train_speed(iter/s)": 0.16968
|
|
},
|
|
{
|
|
"epoch": 1.2558795860771401,
|
|
"grad_norm": 0.6726417541503906,
|
|
"learning_rate": 6.2622987657686305e-06,
|
|
"loss": 0.17579824924468995,
|
|
"memory(GiB)": 34.95,
|
|
"step": 500,
|
|
"token_acc": 0.9332128799122411,
|
|
"train_speed(iter/s)": 0.169898
|
|
},
|
|
{
|
|
"epoch": 1.2558795860771401,
|
|
"eval_loss": 0.2346230149269104,
|
|
"eval_runtime": 9.958,
|
|
"eval_samples_per_second": 25.809,
|
|
"eval_steps_per_second": 6.527,
|
|
"eval_token_acc": 0.9335712849386688,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.268422703041706,
|
|
"grad_norm": 0.6843208074569702,
|
|
"learning_rate": 6.198543371965711e-06,
|
|
"loss": 0.16942257881164552,
|
|
"memory(GiB)": 34.95,
|
|
"step": 505,
|
|
"token_acc": 0.9333019066627884,
|
|
"train_speed(iter/s)": 0.168992
|
|
},
|
|
{
|
|
"epoch": 1.2809658200062715,
|
|
"grad_norm": 0.6464666128158569,
|
|
"learning_rate": 6.134580544807951e-06,
|
|
"loss": 0.16836194992065429,
|
|
"memory(GiB)": 34.95,
|
|
"step": 510,
|
|
"token_acc": 0.9348090386953423,
|
|
"train_speed(iter/s)": 0.169263
|
|
},
|
|
{
|
|
"epoch": 1.2935089369708372,
|
|
"grad_norm": 0.636613667011261,
|
|
"learning_rate": 6.070421354419418e-06,
|
|
"loss": 0.17016284465789794,
|
|
"memory(GiB)": 34.95,
|
|
"step": 515,
|
|
"token_acc": 0.9389155662264906,
|
|
"train_speed(iter/s)": 0.169454
|
|
},
|
|
{
|
|
"epoch": 1.306052053935403,
|
|
"grad_norm": 0.6690927743911743,
|
|
"learning_rate": 6.006076904908996e-06,
|
|
"loss": 0.1702873706817627,
|
|
"memory(GiB)": 34.95,
|
|
"step": 520,
|
|
"token_acc": 0.9357901608213163,
|
|
"train_speed(iter/s)": 0.16974
|
|
},
|
|
{
|
|
"epoch": 1.306052053935403,
|
|
"eval_loss": 0.23393017053604126,
|
|
"eval_runtime": 9.9533,
|
|
"eval_samples_per_second": 25.82,
|
|
"eval_steps_per_second": 6.53,
|
|
"eval_token_acc": 0.9333802533681882,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.3185951708999686,
|
|
"grad_norm": 0.5869135856628418,
|
|
"learning_rate": 5.9415583324485895e-06,
|
|
"loss": 0.15791409015655516,
|
|
"memory(GiB)": 34.95,
|
|
"step": 525,
|
|
"token_acc": 0.9386744758379915,
|
|
"train_speed(iter/s)": 0.168968
|
|
},
|
|
{
|
|
"epoch": 1.3311382878645344,
|
|
"grad_norm": 0.644124448299408,
|
|
"learning_rate": 5.876876803345777e-06,
|
|
"loss": 0.16019464731216432,
|
|
"memory(GiB)": 34.95,
|
|
"step": 530,
|
|
"token_acc": 0.9399966226992138,
|
|
"train_speed(iter/s)": 0.169268
|
|
},
|
|
{
|
|
"epoch": 1.3436814048291001,
|
|
"grad_norm": 0.6242665648460388,
|
|
"learning_rate": 5.812043512111237e-06,
|
|
"loss": 0.16639323234558107,
|
|
"memory(GiB)": 34.95,
|
|
"step": 535,
|
|
"token_acc": 0.9414278117034347,
|
|
"train_speed(iter/s)": 0.169643
|
|
},
|
|
{
|
|
"epoch": 1.3562245217936657,
|
|
"grad_norm": 0.6533536314964294,
|
|
"learning_rate": 5.747069679521306e-06,
|
|
"loss": 0.16934127807617189,
|
|
"memory(GiB)": 34.95,
|
|
"step": 540,
|
|
"token_acc": 0.9394062777613266,
|
|
"train_speed(iter/s)": 0.169883
|
|
},
|
|
{
|
|
"epoch": 1.3562245217936657,
|
|
"eval_loss": 0.23251816630363464,
|
|
"eval_runtime": 9.9789,
|
|
"eval_samples_per_second": 25.754,
|
|
"eval_steps_per_second": 6.514,
|
|
"eval_token_acc": 0.9339081037603056,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.3687676387582315,
|
|
"grad_norm": 0.6235964894294739,
|
|
"learning_rate": 5.681966550675981e-06,
|
|
"loss": 0.15981945991516114,
|
|
"memory(GiB)": 34.95,
|
|
"step": 545,
|
|
"token_acc": 0.9390275276562902,
|
|
"train_speed(iter/s)": 0.169143
|
|
},
|
|
{
|
|
"epoch": 1.381310755722797,
|
|
"grad_norm": 0.6405583620071411,
|
|
"learning_rate": 5.616745393052725e-06,
|
|
"loss": 0.1589187502861023,
|
|
"memory(GiB)": 34.95,
|
|
"step": 550,
|
|
"token_acc": 0.9394911105629661,
|
|
"train_speed(iter/s)": 0.169459
|
|
},
|
|
{
|
|
"epoch": 1.3938538726873628,
|
|
"grad_norm": 0.6054794192314148,
|
|
"learning_rate": 5.551417494556376e-06,
|
|
"loss": 0.1589406132698059,
|
|
"memory(GiB)": 34.95,
|
|
"step": 555,
|
|
"token_acc": 0.9447133523511548,
|
|
"train_speed(iter/s)": 0.169755
|
|
},
|
|
{
|
|
"epoch": 1.4063969896519284,
|
|
"grad_norm": 0.6477380394935608,
|
|
"learning_rate": 5.4859941615655495e-06,
|
|
"loss": 0.16248714923858643,
|
|
"memory(GiB)": 34.95,
|
|
"step": 560,
|
|
"token_acc": 0.9376850526480911,
|
|
"train_speed(iter/s)": 0.170048
|
|
},
|
|
{
|
|
"epoch": 1.4063969896519284,
|
|
"eval_loss": 0.2313276082277298,
|
|
"eval_runtime": 9.9434,
|
|
"eval_samples_per_second": 25.846,
|
|
"eval_steps_per_second": 6.537,
|
|
"eval_token_acc": 0.934089081037603,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.4189401066164942,
|
|
"grad_norm": 0.6257679462432861,
|
|
"learning_rate": 5.4204867169758265e-06,
|
|
"loss": 0.1701244592666626,
|
|
"memory(GiB)": 34.95,
|
|
"step": 565,
|
|
"token_acc": 0.9315096587690685,
|
|
"train_speed(iter/s)": 0.169354
|
|
},
|
|
{
|
|
"epoch": 1.43148322358106,
|
|
"grad_norm": 0.6648498773574829,
|
|
"learning_rate": 5.35490649824008e-06,
|
|
"loss": 0.16337137222290038,
|
|
"memory(GiB)": 34.95,
|
|
"step": 570,
|
|
"token_acc": 0.9441849071789757,
|
|
"train_speed(iter/s)": 0.169609
|
|
},
|
|
{
|
|
"epoch": 1.4440263405456255,
|
|
"grad_norm": 0.6349947452545166,
|
|
"learning_rate": 5.289264855406295e-06,
|
|
"loss": 0.1652446985244751,
|
|
"memory(GiB)": 34.95,
|
|
"step": 575,
|
|
"token_acc": 0.9389820592823713,
|
|
"train_speed(iter/s)": 0.169876
|
|
},
|
|
{
|
|
"epoch": 1.4565694575101913,
|
|
"grad_norm": 0.6542990803718567,
|
|
"learning_rate": 5.223573149153197e-06,
|
|
"loss": 0.17841705083847045,
|
|
"memory(GiB)": 34.95,
|
|
"step": 580,
|
|
"token_acc": 0.9361828435737608,
|
|
"train_speed(iter/s)": 0.170157
|
|
},
|
|
{
|
|
"epoch": 1.4565694575101913,
|
|
"eval_loss": 0.23112896084785461,
|
|
"eval_runtime": 9.9829,
|
|
"eval_samples_per_second": 25.744,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.9340991353307863,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.469112574474757,
|
|
"grad_norm": 0.6102667450904846,
|
|
"learning_rate": 5.157842748824053e-06,
|
|
"loss": 0.16528806686401368,
|
|
"memory(GiB)": 34.95,
|
|
"step": 585,
|
|
"token_acc": 0.9365076170735143,
|
|
"train_speed(iter/s)": 0.169488
|
|
},
|
|
{
|
|
"epoch": 1.4816556914393226,
|
|
"grad_norm": 0.6301048994064331,
|
|
"learning_rate": 5.092085030458957e-06,
|
|
"loss": 0.16155061721801758,
|
|
"memory(GiB)": 34.95,
|
|
"step": 590,
|
|
"token_acc": 0.9446101777707996,
|
|
"train_speed(iter/s)": 0.169721
|
|
},
|
|
{
|
|
"epoch": 1.4941988084038884,
|
|
"grad_norm": 0.6364830732345581,
|
|
"learning_rate": 5.026311374825969e-06,
|
|
"loss": 0.16691150665283203,
|
|
"memory(GiB)": 34.95,
|
|
"step": 595,
|
|
"token_acc": 0.9467816983326871,
|
|
"train_speed(iter/s)": 0.169943
|
|
},
|
|
{
|
|
"epoch": 1.5067419253684542,
|
|
"grad_norm": 0.7235390543937683,
|
|
"learning_rate": 4.960533165451435e-06,
|
|
"loss": 0.16880112886428833,
|
|
"memory(GiB)": 34.95,
|
|
"step": 600,
|
|
"token_acc": 0.9358625682365141,
|
|
"train_speed(iter/s)": 0.170106
|
|
},
|
|
{
|
|
"epoch": 1.5067419253684542,
|
|
"eval_loss": 0.23094019293785095,
|
|
"eval_runtime": 9.9784,
|
|
"eval_samples_per_second": 25.756,
|
|
"eval_steps_per_second": 6.514,
|
|
"eval_token_acc": 0.9340790267444199,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.5192850423330198,
|
|
"grad_norm": 0.6465805172920227,
|
|
"learning_rate": 4.894761786649815e-06,
|
|
"loss": 0.16632287502288817,
|
|
"memory(GiB)": 34.95,
|
|
"step": 605,
|
|
"token_acc": 0.9343266943374441,
|
|
"train_speed(iter/s)": 0.169351
|
|
},
|
|
{
|
|
"epoch": 1.5318281592975853,
|
|
"grad_norm": 0.6529747247695923,
|
|
"learning_rate": 4.829008621553401e-06,
|
|
"loss": 0.16232678890228272,
|
|
"memory(GiB)": 34.95,
|
|
"step": 610,
|
|
"token_acc": 0.9372756540724568,
|
|
"train_speed(iter/s)": 0.169563
|
|
},
|
|
{
|
|
"epoch": 1.544371276262151,
|
|
"grad_norm": 0.6062551736831665,
|
|
"learning_rate": 4.763285050142211e-06,
|
|
"loss": 0.1610184907913208,
|
|
"memory(GiB)": 34.95,
|
|
"step": 615,
|
|
"token_acc": 0.9430264444742746,
|
|
"train_speed(iter/s)": 0.169769
|
|
},
|
|
{
|
|
"epoch": 1.5569143932267169,
|
|
"grad_norm": 0.6321557760238647,
|
|
"learning_rate": 4.697602447274454e-06,
|
|
"loss": 0.16829713582992553,
|
|
"memory(GiB)": 34.95,
|
|
"step": 620,
|
|
"token_acc": 0.9414636993230099,
|
|
"train_speed(iter/s)": 0.169953
|
|
},
|
|
{
|
|
"epoch": 1.5569143932267169,
|
|
"eval_loss": 0.22995001077651978,
|
|
"eval_runtime": 9.9733,
|
|
"eval_samples_per_second": 25.769,
|
|
"eval_steps_per_second": 6.517,
|
|
"eval_token_acc": 0.9340689724512367,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.5694575101912824,
|
|
"grad_norm": 0.6242859363555908,
|
|
"learning_rate": 4.631972180717859e-06,
|
|
"loss": 0.169819974899292,
|
|
"memory(GiB)": 34.95,
|
|
"step": 625,
|
|
"token_acc": 0.9353122957152196,
|
|
"train_speed(iter/s)": 0.169207
|
|
},
|
|
{
|
|
"epoch": 1.5820006271558482,
|
|
"grad_norm": 0.6639277935028076,
|
|
"learning_rate": 4.566405609182247e-06,
|
|
"loss": 0.17650117874145507,
|
|
"memory(GiB)": 34.95,
|
|
"step": 630,
|
|
"token_acc": 0.9426657289854536,
|
|
"train_speed(iter/s)": 0.169405
|
|
},
|
|
{
|
|
"epoch": 1.594543744120414,
|
|
"grad_norm": 0.5965340733528137,
|
|
"learning_rate": 4.500914080353666e-06,
|
|
"loss": 0.16074283123016359,
|
|
"memory(GiB)": 34.95,
|
|
"step": 635,
|
|
"token_acc": 0.9436321558637268,
|
|
"train_speed(iter/s)": 0.169633
|
|
},
|
|
{
|
|
"epoch": 1.6070868610849796,
|
|
"grad_norm": 0.6135890483856201,
|
|
"learning_rate": 4.435508928930431e-06,
|
|
"loss": 0.17277932167053223,
|
|
"memory(GiB)": 34.95,
|
|
"step": 640,
|
|
"token_acc": 0.9343614580678052,
|
|
"train_speed(iter/s)": 0.169805
|
|
},
|
|
{
|
|
"epoch": 1.6070868610849796,
|
|
"eval_loss": 0.2301892191171646,
|
|
"eval_runtime": 9.9309,
|
|
"eval_samples_per_second": 25.879,
|
|
"eval_steps_per_second": 6.545,
|
|
"eval_token_acc": 0.9346370400160868,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.6196299780495453,
|
|
"grad_norm": 0.6135047674179077,
|
|
"learning_rate": 4.3702014746614135e-06,
|
|
"loss": 0.16275949478149415,
|
|
"memory(GiB)": 34.95,
|
|
"step": 645,
|
|
"token_acc": 0.9382904296349448,
|
|
"train_speed(iter/s)": 0.169112
|
|
},
|
|
{
|
|
"epoch": 1.6321730950141111,
|
|
"grad_norm": 0.6757416725158691,
|
|
"learning_rate": 4.305003020386922e-06,
|
|
"loss": 0.16928246021270751,
|
|
"memory(GiB)": 34.95,
|
|
"step": 650,
|
|
"token_acc": 0.9348640286598274,
|
|
"train_speed(iter/s)": 0.169328
|
|
},
|
|
{
|
|
"epoch": 1.6447162119786767,
|
|
"grad_norm": 0.6586278676986694,
|
|
"learning_rate": 4.239924850082501e-06,
|
|
"loss": 0.15818471908569337,
|
|
"memory(GiB)": 34.95,
|
|
"step": 655,
|
|
"token_acc": 0.9452125117275318,
|
|
"train_speed(iter/s)": 0.169486
|
|
},
|
|
{
|
|
"epoch": 1.6572593289432422,
|
|
"grad_norm": 0.7142418026924133,
|
|
"learning_rate": 4.1749782269060045e-06,
|
|
"loss": 0.1626511335372925,
|
|
"memory(GiB)": 34.95,
|
|
"step": 660,
|
|
"token_acc": 0.9346642123840067,
|
|
"train_speed(iter/s)": 0.169729
|
|
},
|
|
{
|
|
"epoch": 1.6572593289432422,
|
|
"eval_loss": 0.22976796329021454,
|
|
"eval_runtime": 9.9836,
|
|
"eval_samples_per_second": 25.742,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.9347878544138347,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.6698024459078082,
|
|
"grad_norm": 0.6405680775642395,
|
|
"learning_rate": 4.110174391248268e-06,
|
|
"loss": 0.1630636215209961,
|
|
"memory(GiB)": 34.95,
|
|
"step": 665,
|
|
"token_acc": 0.9356640277041026,
|
|
"train_speed(iter/s)": 0.169025
|
|
},
|
|
{
|
|
"epoch": 1.6823455628723738,
|
|
"grad_norm": 0.6509523391723633,
|
|
"learning_rate": 4.045524558787712e-06,
|
|
"loss": 0.17556746006011964,
|
|
"memory(GiB)": 34.95,
|
|
"step": 670,
|
|
"token_acc": 0.9399725004910626,
|
|
"train_speed(iter/s)": 0.169222
|
|
},
|
|
{
|
|
"epoch": 1.6948886798369394,
|
|
"grad_norm": 0.6420454978942871,
|
|
"learning_rate": 3.9810399185492406e-06,
|
|
"loss": 0.16325095891952515,
|
|
"memory(GiB)": 34.95,
|
|
"step": 675,
|
|
"token_acc": 0.9350154972645768,
|
|
"train_speed(iter/s)": 0.16944
|
|
},
|
|
{
|
|
"epoch": 1.7074317968015051,
|
|
"grad_norm": 0.6421252489089966,
|
|
"learning_rate": 3.916731630967741e-06,
|
|
"loss": 0.17528104782104492,
|
|
"memory(GiB)": 34.95,
|
|
"step": 680,
|
|
"token_acc": 0.9365230616994374,
|
|
"train_speed(iter/s)": 0.169645
|
|
},
|
|
{
|
|
"epoch": 1.7074317968015051,
|
|
"eval_loss": 0.2283635139465332,
|
|
"eval_runtime": 9.9363,
|
|
"eval_samples_per_second": 25.865,
|
|
"eval_steps_per_second": 6.542,
|
|
"eval_token_acc": 0.9346672028956364,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.719974913766071,
|
|
"grad_norm": 0.6265820264816284,
|
|
"learning_rate": 3.852610825956529e-06,
|
|
"loss": 0.16770663261413574,
|
|
"memory(GiB)": 34.95,
|
|
"step": 685,
|
|
"token_acc": 0.9386050786166663,
|
|
"train_speed(iter/s)": 0.169004
|
|
},
|
|
{
|
|
"epoch": 1.7325180307306365,
|
|
"grad_norm": 0.6280970573425293,
|
|
"learning_rate": 3.788688600981085e-06,
|
|
"loss": 0.1680266261100769,
|
|
"memory(GiB)": 34.95,
|
|
"step": 690,
|
|
"token_acc": 0.9436317194937884,
|
|
"train_speed(iter/s)": 0.169183
|
|
},
|
|
{
|
|
"epoch": 1.7450611476952023,
|
|
"grad_norm": 0.575031578540802,
|
|
"learning_rate": 3.7249760191384055e-06,
|
|
"loss": 0.16007229089736938,
|
|
"memory(GiB)": 34.95,
|
|
"step": 695,
|
|
"token_acc": 0.9408609064687402,
|
|
"train_speed(iter/s)": 0.169405
|
|
},
|
|
{
|
|
"epoch": 1.757604264659768,
|
|
"grad_norm": 0.6248459219932556,
|
|
"learning_rate": 3.6614841072422913e-06,
|
|
"loss": 0.16597646474838257,
|
|
"memory(GiB)": 34.95,
|
|
"step": 700,
|
|
"token_acc": 0.9334997820314668,
|
|
"train_speed(iter/s)": 0.169658
|
|
},
|
|
{
|
|
"epoch": 1.757604264659768,
|
|
"eval_loss": 0.227211132645607,
|
|
"eval_runtime": 9.9535,
|
|
"eval_samples_per_second": 25.82,
|
|
"eval_steps_per_second": 6.53,
|
|
"eval_token_acc": 0.9347778001206515,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.7701473816243336,
|
|
"grad_norm": 0.6405999064445496,
|
|
"learning_rate": 3.5982238539149287e-06,
|
|
"loss": 0.16680790185928346,
|
|
"memory(GiB)": 34.95,
|
|
"step": 705,
|
|
"token_acc": 0.9360328466797351,
|
|
"train_speed(iter/s)": 0.169094
|
|
},
|
|
{
|
|
"epoch": 1.7826904985888994,
|
|
"grad_norm": 0.6649206280708313,
|
|
"learning_rate": 3.535206207685079e-06,
|
|
"loss": 0.1820515751838684,
|
|
"memory(GiB)": 34.95,
|
|
"step": 710,
|
|
"token_acc": 0.9366235113407408,
|
|
"train_speed(iter/s)": 0.169367
|
|
},
|
|
{
|
|
"epoch": 1.7952336155534652,
|
|
"grad_norm": 0.6017094254493713,
|
|
"learning_rate": 3.472442075093192e-06,
|
|
"loss": 0.1508460283279419,
|
|
"memory(GiB)": 34.95,
|
|
"step": 715,
|
|
"token_acc": 0.9486971106461709,
|
|
"train_speed(iter/s)": 0.169534
|
|
},
|
|
{
|
|
"epoch": 1.8077767325180307,
|
|
"grad_norm": 0.5928083062171936,
|
|
"learning_rate": 3.4099423188038094e-06,
|
|
"loss": 0.16222984790802003,
|
|
"memory(GiB)": 34.95,
|
|
"step": 720,
|
|
"token_acc": 0.9467024477514842,
|
|
"train_speed(iter/s)": 0.169693
|
|
},
|
|
{
|
|
"epoch": 1.8077767325180307,
|
|
"eval_loss": 0.2278689295053482,
|
|
"eval_runtime": 9.9852,
|
|
"eval_samples_per_second": 25.738,
|
|
"eval_steps_per_second": 6.51,
|
|
"eval_token_acc": 0.9349235873718078,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.8203198494825963,
|
|
"grad_norm": 0.6321941018104553,
|
|
"learning_rate": 3.347717755725547e-06,
|
|
"loss": 0.17015450000762938,
|
|
"memory(GiB)": 34.95,
|
|
"step": 725,
|
|
"token_acc": 0.9337387521012558,
|
|
"train_speed(iter/s)": 0.169142
|
|
},
|
|
{
|
|
"epoch": 1.832862966447162,
|
|
"grad_norm": 0.6680959463119507,
|
|
"learning_rate": 3.2857791551389907e-06,
|
|
"loss": 0.16979444026947021,
|
|
"memory(GiB)": 34.95,
|
|
"step": 730,
|
|
"token_acc": 0.9440408017179671,
|
|
"train_speed(iter/s)": 0.1693
|
|
},
|
|
{
|
|
"epoch": 1.8454060834117278,
|
|
"grad_norm": 0.6347929835319519,
|
|
"learning_rate": 3.224137236832859e-06,
|
|
"loss": 0.16566884517669678,
|
|
"memory(GiB)": 34.95,
|
|
"step": 735,
|
|
"token_acc": 0.9421418181073162,
|
|
"train_speed(iter/s)": 0.169501
|
|
},
|
|
{
|
|
"epoch": 1.8579492003762934,
|
|
"grad_norm": 0.6764352321624756,
|
|
"learning_rate": 3.1628026692487053e-06,
|
|
"loss": 0.1652566075325012,
|
|
"memory(GiB)": 34.95,
|
|
"step": 740,
|
|
"token_acc": 0.9433130787598766,
|
|
"train_speed(iter/s)": 0.169711
|
|
},
|
|
{
|
|
"epoch": 1.8579492003762934,
|
|
"eval_loss": 0.22732892632484436,
|
|
"eval_runtime": 9.9558,
|
|
"eval_samples_per_second": 25.814,
|
|
"eval_steps_per_second": 6.529,
|
|
"eval_token_acc": 0.9350542931831892,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.8704923173408592,
|
|
"grad_norm": 0.6389771103858948,
|
|
"learning_rate": 3.1017860676345184e-06,
|
|
"loss": 0.15687326192855836,
|
|
"memory(GiB)": 34.95,
|
|
"step": 745,
|
|
"token_acc": 0.9414018945533932,
|
|
"train_speed(iter/s)": 0.169168
|
|
},
|
|
{
|
|
"epoch": 1.883035434305425,
|
|
"grad_norm": 0.6998502612113953,
|
|
"learning_rate": 3.0410979922075344e-06,
|
|
"loss": 0.17107654809951783,
|
|
"memory(GiB)": 34.95,
|
|
"step": 750,
|
|
"token_acc": 0.9427076541922024,
|
|
"train_speed(iter/s)": 0.169327
|
|
},
|
|
{
|
|
"epoch": 1.8955785512699905,
|
|
"grad_norm": 0.6121336817741394,
|
|
"learning_rate": 2.980748946326564e-06,
|
|
"loss": 0.16890095472335814,
|
|
"memory(GiB)": 34.95,
|
|
"step": 755,
|
|
"token_acc": 0.9396650021625447,
|
|
"train_speed(iter/s)": 0.169512
|
|
},
|
|
{
|
|
"epoch": 1.9081216682345563,
|
|
"grad_norm": 0.5630024075508118,
|
|
"learning_rate": 2.920749374674161e-06,
|
|
"loss": 0.16135737895965577,
|
|
"memory(GiB)": 34.95,
|
|
"step": 760,
|
|
"token_acc": 0.9455813142757539,
|
|
"train_speed(iter/s)": 0.169692
|
|
},
|
|
{
|
|
"epoch": 1.9081216682345563,
|
|
"eval_loss": 0.2258785516023636,
|
|
"eval_runtime": 9.9483,
|
|
"eval_samples_per_second": 25.833,
|
|
"eval_steps_per_second": 6.534,
|
|
"eval_token_acc": 0.9357782022923788,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.920664785199122,
|
|
"grad_norm": 0.6215230226516724,
|
|
"learning_rate": 2.861109661448952e-06,
|
|
"loss": 0.160076367855072,
|
|
"memory(GiB)": 34.95,
|
|
"step": 765,
|
|
"token_acc": 0.938600821420109,
|
|
"train_speed(iter/s)": 0.169164
|
|
},
|
|
{
|
|
"epoch": 1.9332079021636877,
|
|
"grad_norm": 0.6377970576286316,
|
|
"learning_rate": 2.8018401285684284e-06,
|
|
"loss": 0.16507962942123414,
|
|
"memory(GiB)": 34.95,
|
|
"step": 770,
|
|
"token_acc": 0.9362514029180696,
|
|
"train_speed(iter/s)": 0.169343
|
|
},
|
|
{
|
|
"epoch": 1.9457510191282532,
|
|
"grad_norm": 0.6416298747062683,
|
|
"learning_rate": 2.7429510338825206e-06,
|
|
"loss": 0.1676865577697754,
|
|
"memory(GiB)": 34.95,
|
|
"step": 775,
|
|
"token_acc": 0.9356394574884725,
|
|
"train_speed(iter/s)": 0.169514
|
|
},
|
|
{
|
|
"epoch": 1.9582941360928192,
|
|
"grad_norm": 0.6064321398735046,
|
|
"learning_rate": 2.6844525693982614e-06,
|
|
"loss": 0.1615642786026001,
|
|
"memory(GiB)": 34.95,
|
|
"step": 780,
|
|
"token_acc": 0.9434515921396388,
|
|
"train_speed(iter/s)": 0.169698
|
|
},
|
|
{
|
|
"epoch": 1.9582941360928192,
|
|
"eval_loss": 0.22540703415870667,
|
|
"eval_runtime": 9.9865,
|
|
"eval_samples_per_second": 25.735,
|
|
"eval_steps_per_second": 6.509,
|
|
"eval_token_acc": 0.9356072793082646,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.9708372530573848,
|
|
"grad_norm": 0.6212838888168335,
|
|
"learning_rate": 2.6263548595158374e-06,
|
|
"loss": 0.16903696060180665,
|
|
"memory(GiB)": 34.95,
|
|
"step": 785,
|
|
"token_acc": 0.9373140403756506,
|
|
"train_speed(iter/s)": 0.16916
|
|
},
|
|
{
|
|
"epoch": 1.9833803700219503,
|
|
"grad_norm": 0.6316173076629639,
|
|
"learning_rate": 2.568667959276351e-06,
|
|
"loss": 0.1633455991744995,
|
|
"memory(GiB)": 34.95,
|
|
"step": 790,
|
|
"token_acc": 0.9499691904033075,
|
|
"train_speed(iter/s)": 0.169327
|
|
},
|
|
{
|
|
"epoch": 1.9959234869865161,
|
|
"grad_norm": 0.5879420638084412,
|
|
"learning_rate": 2.5114018526215843e-06,
|
|
"loss": 0.15602803230285645,
|
|
"memory(GiB)": 34.95,
|
|
"step": 795,
|
|
"token_acc": 0.9389440475085831,
|
|
"train_speed(iter/s)": 0.169519
|
|
},
|
|
{
|
|
"epoch": 2.0100344935716525,
|
|
"grad_norm": 0.5628567337989807,
|
|
"learning_rate": 2.454566450666061e-06,
|
|
"loss": 0.1572946071624756,
|
|
"memory(GiB)": 34.95,
|
|
"step": 800,
|
|
"token_acc": 0.9571394981693594,
|
|
"train_speed(iter/s)": 0.169704
|
|
},
|
|
{
|
|
"epoch": 2.0100344935716525,
|
|
"eval_loss": 0.22719430923461914,
|
|
"eval_runtime": 9.9565,
|
|
"eval_samples_per_second": 25.812,
|
|
"eval_steps_per_second": 6.528,
|
|
"eval_token_acc": 0.9355921978684898,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 2.022577610536218,
|
|
"grad_norm": 0.6104578971862793,
|
|
"learning_rate": 2.398171589981721e-06,
|
|
"loss": 0.1239326000213623,
|
|
"memory(GiB)": 34.95,
|
|
"step": 805,
|
|
"token_acc": 0.9446134994383744,
|
|
"train_speed(iter/s)": 0.169099
|
|
},
|
|
{
|
|
"epoch": 2.035120727500784,
|
|
"grad_norm": 0.5852713584899902,
|
|
"learning_rate": 2.3422270308954936e-06,
|
|
"loss": 0.12712430953979492,
|
|
"memory(GiB)": 34.95,
|
|
"step": 810,
|
|
"token_acc": 0.9523950262830121,
|
|
"train_speed(iter/s)": 0.169261
|
|
},
|
|
{
|
|
"epoch": 2.0476638444653497,
|
|
"grad_norm": 0.6348543763160706,
|
|
"learning_rate": 2.286742455800059e-06,
|
|
"loss": 0.12253003120422364,
|
|
"memory(GiB)": 34.95,
|
|
"step": 815,
|
|
"token_acc": 0.9575001424257962,
|
|
"train_speed(iter/s)": 0.169477
|
|
},
|
|
{
|
|
"epoch": 2.060206961429915,
|
|
"grad_norm": 0.6192832589149475,
|
|
"learning_rate": 2.2317274674781158e-06,
|
|
"loss": 0.12359896898269654,
|
|
"memory(GiB)": 34.95,
|
|
"step": 820,
|
|
"token_acc": 0.952242789995938,
|
|
"train_speed(iter/s)": 0.169661
|
|
},
|
|
{
|
|
"epoch": 2.060206961429915,
|
|
"eval_loss": 0.24132946133613586,
|
|
"eval_runtime": 9.9829,
|
|
"eval_samples_per_second": 25.744,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.934958777397949,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 2.072750078394481,
|
|
"grad_norm": 0.6246756315231323,
|
|
"learning_rate": 2.1771915874404094e-06,
|
|
"loss": 0.1322195291519165,
|
|
"memory(GiB)": 34.95,
|
|
"step": 825,
|
|
"token_acc": 0.9414432054743698,
|
|
"train_speed(iter/s)": 0.169161
|
|
},
|
|
{
|
|
"epoch": 2.085293195359047,
|
|
"grad_norm": 0.5839347243309021,
|
|
"learning_rate": 2.1231442542778317e-06,
|
|
"loss": 0.11952453851699829,
|
|
"memory(GiB)": 34.95,
|
|
"step": 830,
|
|
"token_acc": 0.956975505857295,
|
|
"train_speed(iter/s)": 0.169338
|
|
},
|
|
{
|
|
"epoch": 2.0978363123236123,
|
|
"grad_norm": 0.6020109057426453,
|
|
"learning_rate": 2.0695948220278756e-06,
|
|
"loss": 0.12150832414627075,
|
|
"memory(GiB)": 34.95,
|
|
"step": 835,
|
|
"token_acc": 0.950202699878798,
|
|
"train_speed(iter/s)": 0.169516
|
|
},
|
|
{
|
|
"epoch": 2.110379429288178,
|
|
"grad_norm": 0.6176694631576538,
|
|
"learning_rate": 2.0165525585557205e-06,
|
|
"loss": 0.12181558609008789,
|
|
"memory(GiB)": 34.95,
|
|
"step": 840,
|
|
"token_acc": 0.9584100732944936,
|
|
"train_speed(iter/s)": 0.169655
|
|
},
|
|
{
|
|
"epoch": 2.110379429288178,
|
|
"eval_loss": 0.23956826329231262,
|
|
"eval_runtime": 9.9764,
|
|
"eval_samples_per_second": 25.761,
|
|
"eval_steps_per_second": 6.515,
|
|
"eval_token_acc": 0.9351900261411623,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 2.122922546252744,
|
|
"grad_norm": 0.6029666066169739,
|
|
"learning_rate": 1.964026643950226e-06,
|
|
"loss": 0.11940534114837646,
|
|
"memory(GiB)": 34.95,
|
|
"step": 845,
|
|
"token_acc": 0.9457718501702611,
|
|
"train_speed(iter/s)": 0.16911
|
|
},
|
|
{
|
|
"epoch": 2.1354656632173095,
|
|
"grad_norm": 0.5822896957397461,
|
|
"learning_rate": 1.9120261689351317e-06,
|
|
"loss": 0.11883677244186401,
|
|
"memory(GiB)": 34.95,
|
|
"step": 850,
|
|
"token_acc": 0.9609223300970874,
|
|
"train_speed(iter/s)": 0.169285
|
|
},
|
|
{
|
|
"epoch": 2.148008780181875,
|
|
"grad_norm": 0.6058652997016907,
|
|
"learning_rate": 1.860560133295708e-06,
|
|
"loss": 0.12740614414215087,
|
|
"memory(GiB)": 34.95,
|
|
"step": 855,
|
|
"token_acc": 0.9549561469832148,
|
|
"train_speed(iter/s)": 0.169475
|
|
},
|
|
{
|
|
"epoch": 2.160551897146441,
|
|
"grad_norm": 0.5672310590744019,
|
|
"learning_rate": 1.8096374443211545e-06,
|
|
"loss": 0.12559156417846679,
|
|
"memory(GiB)": 34.95,
|
|
"step": 860,
|
|
"token_acc": 0.9539432293401429,
|
|
"train_speed(iter/s)": 0.169653
|
|
},
|
|
{
|
|
"epoch": 2.160551897146441,
|
|
"eval_loss": 0.23939980566501617,
|
|
"eval_runtime": 9.9791,
|
|
"eval_samples_per_second": 25.754,
|
|
"eval_steps_per_second": 6.514,
|
|
"eval_token_acc": 0.9351900261411623,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 2.1730950141110066,
|
|
"grad_norm": 0.6905380487442017,
|
|
"learning_rate": 1.7592669152630082e-06,
|
|
"loss": 0.12502384185791016,
|
|
"memory(GiB)": 34.95,
|
|
"step": 865,
|
|
"token_acc": 0.9430737514131808,
|
|
"train_speed(iter/s)": 0.16916
|
|
},
|
|
{
|
|
"epoch": 2.185638131075572,
|
|
"grad_norm": 0.604882001876831,
|
|
"learning_rate": 1.7094572638098122e-06,
|
|
"loss": 0.13246217966079712,
|
|
"memory(GiB)": 34.95,
|
|
"step": 870,
|
|
"token_acc": 0.9538787052672268,
|
|
"train_speed(iter/s)": 0.169321
|
|
},
|
|
{
|
|
"epoch": 2.198181248040138,
|
|
"grad_norm": 0.6221954226493835,
|
|
"learning_rate": 1.6602171105783488e-06,
|
|
"loss": 0.12281397581100464,
|
|
"memory(GiB)": 34.95,
|
|
"step": 875,
|
|
"token_acc": 0.9516503156133547,
|
|
"train_speed(iter/s)": 0.169488
|
|
},
|
|
{
|
|
"epoch": 2.2107243650047037,
|
|
"grad_norm": 0.5445839166641235,
|
|
"learning_rate": 1.61155497762165e-06,
|
|
"loss": 0.11812053918838501,
|
|
"memory(GiB)": 34.95,
|
|
"step": 880,
|
|
"token_acc": 0.9578427802726868,
|
|
"train_speed(iter/s)": 0.169626
|
|
},
|
|
{
|
|
"epoch": 2.2107243650047037,
|
|
"eval_loss": 0.23973500728607178,
|
|
"eval_runtime": 9.9832,
|
|
"eval_samples_per_second": 25.743,
|
|
"eval_steps_per_second": 6.511,
|
|
"eval_token_acc": 0.9351347275286548,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 2.2232674819692693,
|
|
"grad_norm": 0.6164165735244751,
|
|
"learning_rate": 1.5634792869540782e-06,
|
|
"loss": 0.11963331699371338,
|
|
"memory(GiB)": 34.95,
|
|
"step": 885,
|
|
"token_acc": 0.9436205250131545,
|
|
"train_speed(iter/s)": 0.169137
|
|
},
|
|
{
|
|
"epoch": 2.235810598933835,
|
|
"grad_norm": 0.606275200843811,
|
|
"learning_rate": 1.5159983590937183e-06,
|
|
"loss": 0.12453606128692626,
|
|
"memory(GiB)": 34.95,
|
|
"step": 890,
|
|
"token_acc": 0.9526249104831157,
|
|
"train_speed(iter/s)": 0.169305
|
|
},
|
|
{
|
|
"epoch": 2.248353715898401,
|
|
"grad_norm": 0.6022824048995972,
|
|
"learning_rate": 1.4691204116223357e-06,
|
|
"loss": 0.11552423238754272,
|
|
"memory(GiB)": 34.95,
|
|
"step": 895,
|
|
"token_acc": 0.9613866135340565,
|
|
"train_speed(iter/s)": 0.169507
|
|
},
|
|
{
|
|
"epoch": 2.2608968328629664,
|
|
"grad_norm": 0.6075533032417297,
|
|
"learning_rate": 1.4228535577631442e-06,
|
|
"loss": 0.12762036323547363,
|
|
"memory(GiB)": 34.95,
|
|
"step": 900,
|
|
"token_acc": 0.9527419384954348,
|
|
"train_speed(iter/s)": 0.169672
|
|
},
|
|
{
|
|
"epoch": 2.2608968328629664,
|
|
"eval_loss": 0.24023577570915222,
|
|
"eval_runtime": 9.9811,
|
|
"eval_samples_per_second": 25.749,
|
|
"eval_steps_per_second": 6.512,
|
|
"eval_token_acc": 0.9352000804343454,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 2.273439949827532,
|
|
"grad_norm": 0.6038256883621216,
|
|
"learning_rate": 1.3772058049766491e-06,
|
|
"loss": 0.12403825521469117,
|
|
"memory(GiB)": 34.95,
|
|
"step": 905,
|
|
"token_acc": 0.9445350568832248,
|
|
"train_speed(iter/s)": 0.169145
|
|
},
|
|
{
|
|
"epoch": 2.285983066792098,
|
|
"grad_norm": 0.6491556763648987,
|
|
"learning_rate": 1.3321850535747822e-06,
|
|
"loss": 0.12173200845718384,
|
|
"memory(GiB)": 34.95,
|
|
"step": 910,
|
|
"token_acc": 0.9588150821120849,
|
|
"train_speed(iter/s)": 0.169343
|
|
},
|
|
{
|
|
"epoch": 2.2985261837566635,
|
|
"grad_norm": 0.548174262046814,
|
|
"learning_rate": 1.2877990953535841e-06,
|
|
"loss": 0.12104053497314453,
|
|
"memory(GiB)": 34.95,
|
|
"step": 915,
|
|
"token_acc": 0.958676718877986,
|
|
"train_speed(iter/s)": 0.169527
|
|
},
|
|
{
|
|
"epoch": 2.311069300721229,
|
|
"grad_norm": 0.5731512904167175,
|
|
"learning_rate": 1.2440556122446701e-06,
|
|
"loss": 0.12762261629104615,
|
|
"memory(GiB)": 34.95,
|
|
"step": 920,
|
|
"token_acc": 0.9535161617972158,
|
|
"train_speed(iter/s)": 0.169669
|
|
},
|
|
{
|
|
"epoch": 2.311069300721229,
|
|
"eval_loss": 0.2390112429857254,
|
|
"eval_runtime": 9.9533,
|
|
"eval_samples_per_second": 25.821,
|
|
"eval_steps_per_second": 6.53,
|
|
"eval_token_acc": 0.9355570078423486,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 2.323612417685795,
|
|
"grad_norm": 0.634443998336792,
|
|
"learning_rate": 1.2009621749857103e-06,
|
|
"loss": 0.12285526990890502,
|
|
"memory(GiB)": 34.95,
|
|
"step": 925,
|
|
"token_acc": 0.9447460370728625,
|
|
"train_speed(iter/s)": 0.169211
|
|
},
|
|
{
|
|
"epoch": 2.3361555346503606,
|
|
"grad_norm": 0.674192488193512,
|
|
"learning_rate": 1.1585262418101468e-06,
|
|
"loss": 0.13117657899856566,
|
|
"memory(GiB)": 34.95,
|
|
"step": 930,
|
|
"token_acc": 0.9548805986574227,
|
|
"train_speed(iter/s)": 0.16934
|
|
},
|
|
{
|
|
"epoch": 2.348698651614926,
|
|
"grad_norm": 0.6701561808586121,
|
|
"learning_rate": 1.1167551571563967e-06,
|
|
"loss": 0.12773873805999755,
|
|
"memory(GiB)": 34.95,
|
|
"step": 935,
|
|
"token_acc": 0.9571292006765203,
|
|
"train_speed(iter/s)": 0.169553
|
|
},
|
|
{
|
|
"epoch": 2.361241768579492,
|
|
"grad_norm": 0.5688639879226685,
|
|
"learning_rate": 1.0756561503967366e-06,
|
|
"loss": 0.12773098945617675,
|
|
"memory(GiB)": 34.95,
|
|
"step": 940,
|
|
"token_acc": 0.9547074376365099,
|
|
"train_speed(iter/s)": 0.16967
|
|
},
|
|
{
|
|
"epoch": 2.361241768579492,
|
|
"eval_loss": 0.23946641385555267,
|
|
"eval_runtime": 9.9907,
|
|
"eval_samples_per_second": 25.724,
|
|
"eval_steps_per_second": 6.506,
|
|
"eval_token_acc": 0.9353056505127689,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 2.3737848855440578,
|
|
"grad_norm": 0.6829060316085815,
|
|
"learning_rate": 1.0352363345861067e-06,
|
|
"loss": 0.1251779556274414,
|
|
"memory(GiB)": 34.95,
|
|
"step": 945,
|
|
"token_acc": 0.9461535568551496,
|
|
"train_speed(iter/s)": 0.169209
|
|
},
|
|
{
|
|
"epoch": 2.3863280025086233,
|
|
"grad_norm": 0.6119195222854614,
|
|
"learning_rate": 9.955027052310445e-07,
|
|
"loss": 0.12672061920166017,
|
|
"memory(GiB)": 34.95,
|
|
"step": 950,
|
|
"token_acc": 0.9541979451343965,
|
|
"train_speed(iter/s)": 0.169349
|
|
},
|
|
{
|
|
"epoch": 2.3988711194731893,
|
|
"grad_norm": 0.6136831045150757,
|
|
"learning_rate": 9.564621390789692e-07,
|
|
"loss": 0.12832672595977784,
|
|
"memory(GiB)": 34.95,
|
|
"step": 955,
|
|
"token_acc": 0.9493431077797455,
|
|
"train_speed(iter/s)": 0.169478
|
|
},
|
|
{
|
|
"epoch": 2.411414236437755,
|
|
"grad_norm": 0.6445709466934204,
|
|
"learning_rate": 9.181213929280047e-07,
|
|
"loss": 0.12906695604324342,
|
|
"memory(GiB)": 34.95,
|
|
"step": 960,
|
|
"token_acc": 0.9479911420436571,
|
|
"train_speed(iter/s)": 0.16967
|
|
},
|
|
{
|
|
"epoch": 2.411414236437755,
|
|
"eval_loss": 0.23977875709533691,
|
|
"eval_runtime": 9.9861,
|
|
"eval_samples_per_second": 25.736,
|
|
"eval_steps_per_second": 6.509,
|
|
"eval_token_acc": 0.9355368992559823,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 2.4239573534023204,
|
|
"grad_norm": 0.5981658101081848,
|
|
"learning_rate": 8.804871024575851e-07,
|
|
"loss": 0.12087714672088623,
|
|
"memory(GiB)": 34.95,
|
|
"step": 965,
|
|
"token_acc": 0.9459313171146616,
|
|
"train_speed(iter/s)": 0.169225
|
|
},
|
|
{
|
|
"epoch": 2.436500470366886,
|
|
"grad_norm": 0.591122567653656,
|
|
"learning_rate": 8.435657810799991e-07,
|
|
"loss": 0.11974387168884278,
|
|
"memory(GiB)": 34.95,
|
|
"step": 970,
|
|
"token_acc": 0.9559715418707722,
|
|
"train_speed(iter/s)": 0.169402
|
|
},
|
|
{
|
|
"epoch": 2.449043587331452,
|
|
"grad_norm": 0.6105548143386841,
|
|
"learning_rate": 8.073638188131128e-07,
|
|
"loss": 0.12425668239593506,
|
|
"memory(GiB)": 34.95,
|
|
"step": 975,
|
|
"token_acc": 0.9544568733678918,
|
|
"train_speed(iter/s)": 0.169513
|
|
},
|
|
{
|
|
"epoch": 2.4615867042960176,
|
|
"grad_norm": 0.5976568460464478,
|
|
"learning_rate": 7.71887481174437e-07,
|
|
"loss": 0.12979369163513182,
|
|
"memory(GiB)": 34.95,
|
|
"step": 980,
|
|
"token_acc": 0.9534985244556201,
|
|
"train_speed(iter/s)": 0.16963
|
|
},
|
|
{
|
|
"epoch": 2.4615867042960176,
|
|
"eval_loss": 0.23947912454605103,
|
|
"eval_runtime": 9.9815,
|
|
"eval_samples_per_second": 25.748,
|
|
"eval_steps_per_second": 6.512,
|
|
"eval_token_acc": 0.9354866277900663,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 2.474129821260583,
|
|
"grad_norm": 0.6130661964416504,
|
|
"learning_rate": 7.371429080967468e-07,
|
|
"loss": 0.12366334199905396,
|
|
"memory(GiB)": 34.95,
|
|
"step": 985,
|
|
"token_acc": 0.9456251029644058,
|
|
"train_speed(iter/s)": 0.169181
|
|
},
|
|
{
|
|
"epoch": 2.486672938225149,
|
|
"grad_norm": 0.611749529838562,
|
|
"learning_rate": 7.031361128654402e-07,
|
|
"loss": 0.11961600780487061,
|
|
"memory(GiB)": 34.95,
|
|
"step": 990,
|
|
"token_acc": 0.9553609289884855,
|
|
"train_speed(iter/s)": 0.169374
|
|
},
|
|
{
|
|
"epoch": 2.4992160551897147,
|
|
"grad_norm": 0.6357402205467224,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.12684570550918578,
|
|
"memory(GiB)": 34.95,
|
|
"step": 995,
|
|
"token_acc": 0.9534441273571709,
|
|
"train_speed(iter/s)": 0.169506
|
|
},
|
|
{
|
|
"epoch": 2.5117591721542802,
|
|
"grad_norm": 0.5895079970359802,
|
|
"learning_rate": 6.373592696244024e-07,
|
|
"loss": 0.12313053607940674,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1000,
|
|
"token_acc": 0.9565139198618804,
|
|
"train_speed(iter/s)": 0.169623
|
|
},
|
|
{
|
|
"epoch": 2.5117591721542802,
|
|
"eval_loss": 0.23871561884880066,
|
|
"eval_runtime": 9.9765,
|
|
"eval_samples_per_second": 25.76,
|
|
"eval_steps_per_second": 6.515,
|
|
"eval_token_acc": 0.935416247737784,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 2.524302289118846,
|
|
"grad_norm": 0.6320595145225525,
|
|
"learning_rate": 6.056006056926978e-07,
|
|
"loss": 0.12712349891662597,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1005,
|
|
"token_acc": 0.9447087643998138,
|
|
"train_speed(iter/s)": 0.169177
|
|
},
|
|
{
|
|
"epoch": 2.536845406083412,
|
|
"grad_norm": 0.611575186252594,
|
|
"learning_rate": 5.746024857931732e-07,
|
|
"loss": 0.12986292839050292,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1010,
|
|
"token_acc": 0.9510800508259212,
|
|
"train_speed(iter/s)": 0.169271
|
|
},
|
|
{
|
|
"epoch": 2.5493885230479774,
|
|
"grad_norm": 0.6201998591423035,
|
|
"learning_rate": 5.443702748080288e-07,
|
|
"loss": 0.12274014949798584,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1015,
|
|
"token_acc": 0.955253177824786,
|
|
"train_speed(iter/s)": 0.169445
|
|
},
|
|
{
|
|
"epoch": 2.561931640012543,
|
|
"grad_norm": 0.6165274977684021,
|
|
"learning_rate": 5.149092050626825e-07,
|
|
"loss": 0.1297899603843689,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1020,
|
|
"token_acc": 0.9506936125816299,
|
|
"train_speed(iter/s)": 0.169553
|
|
},
|
|
{
|
|
"epoch": 2.561931640012543,
|
|
"eval_loss": 0.23904787003993988,
|
|
"eval_runtime": 10.067,
|
|
"eval_samples_per_second": 25.529,
|
|
"eval_steps_per_second": 6.457,
|
|
"eval_token_acc": 0.935441383470742,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 2.574474756977109,
|
|
"grad_norm": 0.6250675916671753,
|
|
"learning_rate": 4.862243754202023e-07,
|
|
"loss": 0.12486759424209595,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1025,
|
|
"token_acc": 0.9427952415499746,
|
|
"train_speed(iter/s)": 0.169143
|
|
},
|
|
{
|
|
"epoch": 2.5870178739416745,
|
|
"grad_norm": 0.5898808240890503,
|
|
"learning_rate": 4.5832075039884014e-07,
|
|
"loss": 0.11898901462554931,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1030,
|
|
"token_acc": 0.9599557987792043,
|
|
"train_speed(iter/s)": 0.169245
|
|
},
|
|
{
|
|
"epoch": 2.59956099090624,
|
|
"grad_norm": 0.6565684080123901,
|
|
"learning_rate": 4.3120315931281633e-07,
|
|
"loss": 0.12741444110870362,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1035,
|
|
"token_acc": 0.9581729932512134,
|
|
"train_speed(iter/s)": 0.169389
|
|
},
|
|
{
|
|
"epoch": 2.612104107870806,
|
|
"grad_norm": 0.5875664949417114,
|
|
"learning_rate": 4.048762954365054e-07,
|
|
"loss": 0.12610654830932616,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1040,
|
|
"token_acc": 0.9550036071318149,
|
|
"train_speed(iter/s)": 0.169544
|
|
},
|
|
{
|
|
"epoch": 2.612104107870806,
|
|
"eval_loss": 0.23876284062862396,
|
|
"eval_runtime": 9.9698,
|
|
"eval_samples_per_second": 25.778,
|
|
"eval_steps_per_second": 6.52,
|
|
"eval_token_acc": 0.9356223607480394,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 2.6246472248353716,
|
|
"grad_norm": 0.6185052990913391,
|
|
"learning_rate": 3.793447151921642e-07,
|
|
"loss": 0.11991071701049805,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1045,
|
|
"token_acc": 0.9459399138299112,
|
|
"train_speed(iter/s)": 0.169124
|
|
},
|
|
{
|
|
"epoch": 2.637190341799937,
|
|
"grad_norm": 0.6107172966003418,
|
|
"learning_rate": 3.546128373613472e-07,
|
|
"loss": 0.11918728351593018,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1050,
|
|
"token_acc": 0.9534668113226157,
|
|
"train_speed(iter/s)": 0.169318
|
|
},
|
|
{
|
|
"epoch": 2.649733458764503,
|
|
"grad_norm": 0.6262523531913757,
|
|
"learning_rate": 3.30684942320143e-07,
|
|
"loss": 0.11955299377441406,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1055,
|
|
"token_acc": 0.9587094529959127,
|
|
"train_speed(iter/s)": 0.169507
|
|
},
|
|
{
|
|
"epoch": 2.6622765757290687,
|
|
"grad_norm": 0.5960172414779663,
|
|
"learning_rate": 3.0756517129836296e-07,
|
|
"loss": 0.12312361001968383,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1060,
|
|
"token_acc": 0.9537392406006633,
|
|
"train_speed(iter/s)": 0.169614
|
|
},
|
|
{
|
|
"epoch": 2.6622765757290687,
|
|
"eval_loss": 0.2386154979467392,
|
|
"eval_runtime": 9.9782,
|
|
"eval_samples_per_second": 25.756,
|
|
"eval_steps_per_second": 6.514,
|
|
"eval_token_acc": 0.9356575507741806,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 2.6748196926936343,
|
|
"grad_norm": 0.6013411283493042,
|
|
"learning_rate": 2.8525752566281485e-07,
|
|
"loss": 0.11395795345306396,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1065,
|
|
"token_acc": 0.947853377091845,
|
|
"train_speed(iter/s)": 0.169208
|
|
},
|
|
{
|
|
"epoch": 2.6873628096582003,
|
|
"grad_norm": 0.6079533696174622,
|
|
"learning_rate": 2.637658662247805e-07,
|
|
"loss": 0.12184674739837646,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1070,
|
|
"token_acc": 0.9590189382179447,
|
|
"train_speed(iter/s)": 0.169294
|
|
},
|
|
{
|
|
"epoch": 2.699905926622766,
|
|
"grad_norm": 0.5974398851394653,
|
|
"learning_rate": 2.430939125718218e-07,
|
|
"loss": 0.12283775806427003,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1075,
|
|
"token_acc": 0.9571679809383332,
|
|
"train_speed(iter/s)": 0.169407
|
|
},
|
|
{
|
|
"epoch": 2.7124490435873314,
|
|
"grad_norm": 0.6196507811546326,
|
|
"learning_rate": 2.232452424240261e-07,
|
|
"loss": 0.12050046920776367,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1080,
|
|
"token_acc": 0.9597835852963006,
|
|
"train_speed(iter/s)": 0.169553
|
|
},
|
|
{
|
|
"epoch": 2.7124490435873314,
|
|
"eval_loss": 0.23875917494297028,
|
|
"eval_runtime": 9.9675,
|
|
"eval_samples_per_second": 25.784,
|
|
"eval_steps_per_second": 6.521,
|
|
"eval_token_acc": 0.9355821435753067,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 2.7249921605518974,
|
|
"grad_norm": 0.6392203569412231,
|
|
"learning_rate": 2.042232910148051e-07,
|
|
"loss": 0.11989054679870606,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1085,
|
|
"token_acc": 0.9460828818275003,
|
|
"train_speed(iter/s)": 0.16913
|
|
},
|
|
{
|
|
"epoch": 2.737535277516463,
|
|
"grad_norm": 0.6039083003997803,
|
|
"learning_rate": 1.860313504963579e-07,
|
|
"loss": 0.11684960126876831,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1090,
|
|
"token_acc": 0.9531301093630782,
|
|
"train_speed(iter/s)": 0.169284
|
|
},
|
|
{
|
|
"epoch": 2.7500783944810285,
|
|
"grad_norm": 0.6016117930412292,
|
|
"learning_rate": 1.6867256936989097e-07,
|
|
"loss": 0.12414079904556274,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1095,
|
|
"token_acc": 0.9582522047875387,
|
|
"train_speed(iter/s)": 0.169393
|
|
},
|
|
{
|
|
"epoch": 2.762621511445594,
|
|
"grad_norm": 0.609747588634491,
|
|
"learning_rate": 1.521499519407038e-07,
|
|
"loss": 0.11737879514694213,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1100,
|
|
"token_acc": 0.95666478832276,
|
|
"train_speed(iter/s)": 0.169525
|
|
},
|
|
{
|
|
"epoch": 2.762621511445594,
|
|
"eval_loss": 0.2390962839126587,
|
|
"eval_runtime": 9.9713,
|
|
"eval_samples_per_second": 25.774,
|
|
"eval_steps_per_second": 6.519,
|
|
"eval_token_acc": 0.9355821435753067,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.7751646284101597,
|
|
"grad_norm": 0.540483832359314,
|
|
"learning_rate": 1.364663577982317e-07,
|
|
"loss": 0.11655213832855224,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1105,
|
|
"token_acc": 0.947412632708209,
|
|
"train_speed(iter/s)": 0.169129
|
|
},
|
|
{
|
|
"epoch": 2.7877077453747257,
|
|
"grad_norm": 0.6210323572158813,
|
|
"learning_rate": 1.2162450132113202e-07,
|
|
"loss": 0.12057442665100097,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1110,
|
|
"token_acc": 0.9559064846811715,
|
|
"train_speed(iter/s)": 0.169327
|
|
},
|
|
{
|
|
"epoch": 2.800250862339291,
|
|
"grad_norm": 0.6889768242835999,
|
|
"learning_rate": 1.07626951207504e-07,
|
|
"loss": 0.12380859851837159,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1115,
|
|
"token_acc": 0.9528777568254488,
|
|
"train_speed(iter/s)": 0.169457
|
|
},
|
|
{
|
|
"epoch": 2.8127939793038568,
|
|
"grad_norm": 0.5808271169662476,
|
|
"learning_rate": 9.447613003032042e-08,
|
|
"loss": 0.12284276485443116,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1120,
|
|
"token_acc": 0.9562657695542472,
|
|
"train_speed(iter/s)": 0.169618
|
|
},
|
|
{
|
|
"epoch": 2.8127939793038568,
|
|
"eval_loss": 0.23907452821731567,
|
|
"eval_runtime": 9.9798,
|
|
"eval_samples_per_second": 25.752,
|
|
"eval_steps_per_second": 6.513,
|
|
"eval_token_acc": 0.9356977679469133,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.8253370962684228,
|
|
"grad_norm": 0.6014649868011475,
|
|
"learning_rate": 8.217431381815078e-08,
|
|
"loss": 0.12331821918487548,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1125,
|
|
"token_acc": 0.9462024372046755,
|
|
"train_speed(iter/s)": 0.169228
|
|
},
|
|
{
|
|
"epoch": 2.8378802132329883,
|
|
"grad_norm": 0.6275858283042908,
|
|
"learning_rate": 7.072363166124363e-08,
|
|
"loss": 0.12597228288650514,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1130,
|
|
"token_acc": 0.9567274137262505,
|
|
"train_speed(iter/s)": 0.169368
|
|
},
|
|
{
|
|
"epoch": 2.850423330197554,
|
|
"grad_norm": 0.6735210418701172,
|
|
"learning_rate": 6.012606534304688e-08,
|
|
"loss": 0.12442328929901122,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1135,
|
|
"token_acc": 0.9592088998763906,
|
|
"train_speed(iter/s)": 0.169498
|
|
},
|
|
{
|
|
"epoch": 2.86296644716212,
|
|
"grad_norm": 0.6021392345428467,
|
|
"learning_rate": 5.038344899721437e-08,
|
|
"loss": 0.13345457315444947,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1140,
|
|
"token_acc": 0.9546434206981488,
|
|
"train_speed(iter/s)": 0.169641
|
|
},
|
|
{
|
|
"epoch": 2.86296644716212,
|
|
"eval_loss": 0.23904505372047424,
|
|
"eval_runtime": 9.9416,
|
|
"eval_samples_per_second": 25.851,
|
|
"eval_steps_per_second": 6.538,
|
|
"eval_token_acc": 0.9355821435753067,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.8755095641266855,
|
|
"grad_norm": 0.6207495927810669,
|
|
"learning_rate": 4.149746879017147e-08,
|
|
"loss": 0.12285821437835694,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1145,
|
|
"token_acc": 0.9433157837334047,
|
|
"train_speed(iter/s)": 0.169265
|
|
},
|
|
{
|
|
"epoch": 2.888052681091251,
|
|
"grad_norm": 0.5906082391738892,
|
|
"learning_rate": 3.3469662629289635e-08,
|
|
"loss": 0.1214226484298706,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1150,
|
|
"token_acc": 0.9550792604937793,
|
|
"train_speed(iter/s)": 0.169398
|
|
},
|
|
{
|
|
"epoch": 2.900595798055817,
|
|
"grad_norm": 0.6598130464553833,
|
|
"learning_rate": 2.630141989671542e-08,
|
|
"loss": 0.12527458667755126,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1155,
|
|
"token_acc": 0.9544634286811288,
|
|
"train_speed(iter/s)": 0.169521
|
|
},
|
|
{
|
|
"epoch": 2.9131389150203826,
|
|
"grad_norm": 0.6021043062210083,
|
|
"learning_rate": 1.999398120891116e-08,
|
|
"loss": 0.1194157361984253,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1160,
|
|
"token_acc": 0.9597829112162538,
|
|
"train_speed(iter/s)": 0.169653
|
|
},
|
|
{
|
|
"epoch": 2.9131389150203826,
|
|
"eval_loss": 0.23905108869075775,
|
|
"eval_runtime": 9.9613,
|
|
"eval_samples_per_second": 25.8,
|
|
"eval_steps_per_second": 6.525,
|
|
"eval_token_acc": 0.935652523627589,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.925682031984948,
|
|
"grad_norm": 0.5893052816390991,
|
|
"learning_rate": 1.4548438201939518e-08,
|
|
"loss": 0.11504523754119873,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1165,
|
|
"token_acc": 0.9459238731463808,
|
|
"train_speed(iter/s)": 0.16932
|
|
},
|
|
{
|
|
"epoch": 2.938225148949514,
|
|
"grad_norm": 0.6343664526939392,
|
|
"learning_rate": 9.965733342532925e-09,
|
|
"loss": 0.1275886058807373,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1170,
|
|
"token_acc": 0.9558923448588253,
|
|
"train_speed(iter/s)": 0.169447
|
|
},
|
|
{
|
|
"epoch": 2.9507682659140797,
|
|
"grad_norm": 0.6318545341491699,
|
|
"learning_rate": 6.246659764979068e-09,
|
|
"loss": 0.12401323318481446,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1175,
|
|
"token_acc": 0.9594298632943586,
|
|
"train_speed(iter/s)": 0.169549
|
|
},
|
|
{
|
|
"epoch": 2.9633113828786453,
|
|
"grad_norm": 0.6376116275787354,
|
|
"learning_rate": 3.3918611338507046e-09,
|
|
"loss": 0.11943215131759644,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1180,
|
|
"token_acc": 0.9601843185254518,
|
|
"train_speed(iter/s)": 0.169695
|
|
},
|
|
{
|
|
"epoch": 2.9633113828786453,
|
|
"eval_loss": 0.2389203906059265,
|
|
"eval_runtime": 9.9873,
|
|
"eval_samples_per_second": 25.733,
|
|
"eval_steps_per_second": 6.508,
|
|
"eval_token_acc": 0.935702795093505,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.9758544998432113,
|
|
"grad_norm": 0.5881339311599731,
|
|
"learning_rate": 1.4018315326103094e-09,
|
|
"loss": 0.11981034278869629,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1185,
|
|
"token_acc": 0.9440527542267837,
|
|
"train_speed(iter/s)": 0.169309
|
|
},
|
|
{
|
|
"epoch": 2.988397616807777,
|
|
"grad_norm": 0.6789658069610596,
|
|
"learning_rate": 2.7691537809293454e-10,
|
|
"loss": 0.1259629726409912,
|
|
"memory(GiB)": 34.95,
|
|
"step": 1190,
|
|
"token_acc": 0.9503406881712312,
|
|
"train_speed(iter/s)": 0.169392
|
|
},
|
|
{
|
|
"epoch": 2.9984321103794294,
|
|
"eval_loss": 0.23875285685062408,
|
|
"eval_runtime": 9.984,
|
|
"eval_samples_per_second": 25.741,
|
|
"eval_steps_per_second": 6.51,
|
|
"eval_token_acc": 0.9355570078423486,
|
|
"step": 1194
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 1194,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.502521518607827e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|