{ "best_global_step": 780, "best_metric": 0.22540703, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v49-20250505-211427/checkpoint-780", "epoch": 2.9984321103794294, "eval_steps": 20, "global_step": 1194, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002508623392913139, "grad_norm": 2.4406964778900146, "learning_rate": 9.999982692639099e-06, "loss": 0.39261603355407715, "memory(GiB)": 27.73, "step": 1, "token_acc": 0.8706896551724138, "train_speed(iter/s)": 0.073021 }, { "epoch": 0.012543116964565695, "grad_norm": 1.4140573740005493, "learning_rate": 9.999567321968297e-06, "loss": 0.32130101323127747, "memory(GiB)": 27.73, "step": 5, "token_acc": 0.8974483833268406, "train_speed(iter/s)": 0.150893 }, { "epoch": 0.02508623392913139, "grad_norm": 0.9324946403503418, "learning_rate": 9.998269362757298e-06, "loss": 0.2845744609832764, "memory(GiB)": 27.73, "step": 10, "token_acc": 0.9072619069023176, "train_speed(iter/s)": 0.174486 }, { "epoch": 0.03762935089369708, "grad_norm": 0.8129357695579529, "learning_rate": 9.996106347006378e-06, "loss": 0.27310004234313967, "memory(GiB)": 27.73, "step": 15, "token_acc": 0.9068501494128172, "train_speed(iter/s)": 0.182334 }, { "epoch": 0.05017246785826278, "grad_norm": 0.8004717826843262, "learning_rate": 9.993078649071297e-06, "loss": 0.293326735496521, "memory(GiB)": 27.73, "step": 20, "token_acc": 0.9049762744859472, "train_speed(iter/s)": 0.189416 }, { "epoch": 0.05017246785826278, "eval_loss": 0.2858642339706421, "eval_runtime": 10.0812, "eval_samples_per_second": 25.493, "eval_steps_per_second": 6.448, "eval_token_acc": 0.9193595415242308, "step": 20 }, { "epoch": 0.06271558482282848, "grad_norm": 0.7604618668556213, "learning_rate": 9.989186792959408e-06, "loss": 0.26241092681884765, "memory(GiB)": 27.73, "step": 25, "token_acc": 0.9132264916006951, "train_speed(iter/s)": 0.162946 }, { "epoch": 0.07525870178739416, "grad_norm": 0.6763395071029663, "learning_rate": 9.984431452238968e-06, "loss": 0.2643896102905273, "memory(GiB)": 30.07, "step": 30, "token_acc": 0.9132476909865227, "train_speed(iter/s)": 0.168084 }, { "epoch": 0.08780181875195986, "grad_norm": 0.7725923657417297, "learning_rate": 9.97881344992256e-06, "loss": 0.24953582286834716, "memory(GiB)": 30.07, "step": 35, "token_acc": 0.9186898409484755, "train_speed(iter/s)": 0.173361 }, { "epoch": 0.10034493571652556, "grad_norm": 0.6925674676895142, "learning_rate": 9.97233375832466e-06, "loss": 0.25593545436859133, "memory(GiB)": 30.07, "step": 40, "token_acc": 0.9142301666698126, "train_speed(iter/s)": 0.17571 }, { "epoch": 0.10034493571652556, "eval_loss": 0.27316194772720337, "eval_runtime": 9.946, "eval_samples_per_second": 25.84, "eval_steps_per_second": 6.535, "eval_token_acc": 0.9228584355519807, "step": 40 }, { "epoch": 0.11288805268109126, "grad_norm": 0.6484134197235107, "learning_rate": 9.964993498893349e-06, "loss": 0.2584169626235962, "memory(GiB)": 30.07, "step": 45, "token_acc": 0.9155336997396587, "train_speed(iter/s)": 0.163839 }, { "epoch": 0.12543116964565695, "grad_norm": 0.6846535205841064, "learning_rate": 9.95679394201623e-06, "loss": 0.24747424125671386, "memory(GiB)": 30.07, "step": 50, "token_acc": 0.9217108554277138, "train_speed(iter/s)": 0.167611 }, { "epoch": 0.13797428661022265, "grad_norm": 0.6725199222564697, "learning_rate": 9.947736506800554e-06, "loss": 0.2658252716064453, "memory(GiB)": 30.07, "step": 55, "token_acc": 0.9149937868631535, "train_speed(iter/s)": 0.170799 }, { "epoch": 0.15051740357478832, "grad_norm": 0.6383991241455078, "learning_rate": 9.93782276082762e-06, "loss": 0.23914179801940919, "memory(GiB)": 30.07, "step": 60, "token_acc": 0.9193507567193643, "train_speed(iter/s)": 0.173334 }, { "epoch": 0.15051740357478832, "eval_loss": 0.26351168751716614, "eval_runtime": 9.9368, "eval_samples_per_second": 25.863, "eval_steps_per_second": 6.541, "eval_token_acc": 0.924492258194249, "step": 60 }, { "epoch": 0.16306052053935402, "grad_norm": 0.6977497935295105, "learning_rate": 9.927054419881462e-06, "loss": 0.2521164894104004, "memory(GiB)": 30.07, "step": 65, "token_acc": 0.9175230463277722, "train_speed(iter/s)": 0.165312 }, { "epoch": 0.17560363750391972, "grad_norm": 0.6636829972267151, "learning_rate": 9.915433347651909e-06, "loss": 0.2421865940093994, "memory(GiB)": 30.07, "step": 70, "token_acc": 0.9232868615930188, "train_speed(iter/s)": 0.168227 }, { "epoch": 0.18814675446848542, "grad_norm": 0.715813398361206, "learning_rate": 9.90296155541202e-06, "loss": 0.2508160352706909, "memory(GiB)": 30.07, "step": 75, "token_acc": 0.921933587812046, "train_speed(iter/s)": 0.169862 }, { "epoch": 0.20068987143305111, "grad_norm": 0.762688934803009, "learning_rate": 9.88964120167001e-06, "loss": 0.2518954277038574, "memory(GiB)": 30.07, "step": 80, "token_acc": 0.9140404864303723, "train_speed(iter/s)": 0.171765 }, { "epoch": 0.20068987143305111, "eval_loss": 0.26037997007369995, "eval_runtime": 9.9473, "eval_samples_per_second": 25.836, "eval_steps_per_second": 6.534, "eval_token_acc": 0.9254122260205108, "step": 80 }, { "epoch": 0.2132329883976168, "grad_norm": 0.7675485610961914, "learning_rate": 9.875474591795648e-06, "loss": 0.24610612392425538, "memory(GiB)": 30.07, "step": 85, "token_acc": 0.9191244147424867, "train_speed(iter/s)": 0.165845 }, { "epoch": 0.2257761053621825, "grad_norm": 0.6830568909645081, "learning_rate": 9.860464177621286e-06, "loss": 0.24553425312042237, "memory(GiB)": 30.07, "step": 90, "token_acc": 0.9210544741632825, "train_speed(iter/s)": 0.167772 }, { "epoch": 0.2383192223267482, "grad_norm": 0.6381848454475403, "learning_rate": 9.84461255701751e-06, "loss": 0.24247684478759765, "memory(GiB)": 30.07, "step": 95, "token_acc": 0.9123848317331006, "train_speed(iter/s)": 0.169012 }, { "epoch": 0.2508623392913139, "grad_norm": 0.6407138109207153, "learning_rate": 9.827922473443518e-06, "loss": 0.2540575504302979, "memory(GiB)": 30.07, "step": 100, "token_acc": 0.9123342939481268, "train_speed(iter/s)": 0.170603 }, { "epoch": 0.2508623392913139, "eval_loss": 0.255385160446167, "eval_runtime": 9.9766, "eval_samples_per_second": 25.76, "eval_steps_per_second": 6.515, "eval_token_acc": 0.9266941484013674, "step": 100 }, { "epoch": 0.2634054562558796, "grad_norm": 0.6974446177482605, "learning_rate": 9.810396815472316e-06, "loss": 0.2443918228149414, "memory(GiB)": 30.07, "step": 105, "token_acc": 0.9175226016453799, "train_speed(iter/s)": 0.166259 }, { "epoch": 0.2759485732204453, "grad_norm": 0.6513930559158325, "learning_rate": 9.79203861629078e-06, "loss": 0.25510406494140625, "memory(GiB)": 30.07, "step": 110, "token_acc": 0.9140450964411845, "train_speed(iter/s)": 0.168031 }, { "epoch": 0.288491690185011, "grad_norm": 0.7416351437568665, "learning_rate": 9.772851053174708e-06, "loss": 0.24527263641357422, "memory(GiB)": 30.07, "step": 115, "token_acc": 0.9193030038453679, "train_speed(iter/s)": 0.169489 }, { "epoch": 0.30103480714957664, "grad_norm": 0.7076767086982727, "learning_rate": 9.752837446938915e-06, "loss": 0.24852099418640136, "memory(GiB)": 30.07, "step": 120, "token_acc": 0.9153006496449615, "train_speed(iter/s)": 0.170153 }, { "epoch": 0.30103480714957664, "eval_loss": 0.252126008272171, "eval_runtime": 9.9461, "eval_samples_per_second": 25.839, "eval_steps_per_second": 6.535, "eval_token_acc": 0.927654333400362, "step": 120 }, { "epoch": 0.31357792411414237, "grad_norm": 0.6797928214073181, "learning_rate": 9.732001261362503e-06, "loss": 0.23887033462524415, "memory(GiB)": 30.07, "step": 125, "token_acc": 0.9209902737430722, "train_speed(iter/s)": 0.166589 }, { "epoch": 0.32612104107870804, "grad_norm": 0.7788575291633606, "learning_rate": 9.710346102589376e-06, "loss": 0.2535351276397705, "memory(GiB)": 30.07, "step": 130, "token_acc": 0.9120982792920281, "train_speed(iter/s)": 0.167787 }, { "epoch": 0.33866415804327377, "grad_norm": 0.6328480243682861, "learning_rate": 9.687875718504126e-06, "loss": 0.2357191562652588, "memory(GiB)": 30.07, "step": 135, "token_acc": 0.9174817518248175, "train_speed(iter/s)": 0.168973 }, { "epoch": 0.35120727500783944, "grad_norm": 0.6453744769096375, "learning_rate": 9.664593998083374e-06, "loss": 0.24335532188415526, "memory(GiB)": 30.07, "step": 140, "token_acc": 0.9164061768834815, "train_speed(iter/s)": 0.169985 }, { "epoch": 0.35120727500783944, "eval_loss": 0.24922603368759155, "eval_runtime": 9.9665, "eval_samples_per_second": 25.786, "eval_steps_per_second": 6.522, "eval_token_acc": 0.9285793283732153, "step": 140 }, { "epoch": 0.36375039197240516, "grad_norm": 0.655521810054779, "learning_rate": 9.640504970722708e-06, "loss": 0.23469161987304688, "memory(GiB)": 30.07, "step": 145, "token_acc": 0.9239321638563585, "train_speed(iter/s)": 0.166594 }, { "epoch": 0.37629350893697083, "grad_norm": 0.656192421913147, "learning_rate": 9.615612805539305e-06, "loss": 0.23534941673278809, "memory(GiB)": 30.07, "step": 150, "token_acc": 0.9101899504070365, "train_speed(iter/s)": 0.16793 }, { "epoch": 0.38883662590153656, "grad_norm": 0.6794695258140564, "learning_rate": 9.589921810650379e-06, "loss": 0.24691348075866698, "memory(GiB)": 30.07, "step": 155, "token_acc": 0.9117996509857795, "train_speed(iter/s)": 0.168976 }, { "epoch": 0.40137974286610223, "grad_norm": 0.6336193680763245, "learning_rate": 9.563436432427571e-06, "loss": 0.23817820549011232, "memory(GiB)": 30.07, "step": 160, "token_acc": 0.9163882846488502, "train_speed(iter/s)": 0.170021 }, { "epoch": 0.40137974286610223, "eval_loss": 0.24645844101905823, "eval_runtime": 9.937, "eval_samples_per_second": 25.863, "eval_steps_per_second": 6.541, "eval_token_acc": 0.9283782425095516, "step": 160 }, { "epoch": 0.4139228598306679, "grad_norm": 0.6395715475082397, "learning_rate": 9.536161254727407e-06, "loss": 0.23914387226104736, "memory(GiB)": 30.07, "step": 165, "token_acc": 0.9195228213727245, "train_speed(iter/s)": 0.167509 }, { "epoch": 0.4264659767952336, "grad_norm": 0.6767882704734802, "learning_rate": 9.508100998097971e-06, "loss": 0.2324080467224121, "memory(GiB)": 30.07, "step": 170, "token_acc": 0.9136844562004045, "train_speed(iter/s)": 0.168597 }, { "epoch": 0.4390090937597993, "grad_norm": 0.6208192706108093, "learning_rate": 9.479260518961904e-06, "loss": 0.23578665256500245, "memory(GiB)": 30.07, "step": 175, "token_acc": 0.9154970653640042, "train_speed(iter/s)": 0.169348 }, { "epoch": 0.451552210724365, "grad_norm": 0.6273934841156006, "learning_rate": 9.449644808775902e-06, "loss": 0.23413596153259278, "memory(GiB)": 30.07, "step": 180, "token_acc": 0.9301044600520517, "train_speed(iter/s)": 0.170195 }, { "epoch": 0.451552210724365, "eval_loss": 0.2454940527677536, "eval_runtime": 9.965, "eval_samples_per_second": 25.79, "eval_steps_per_second": 6.523, "eval_token_acc": 0.9287401970641463, "step": 180 }, { "epoch": 0.4640953276889307, "grad_norm": 0.6492425203323364, "learning_rate": 9.419258993166846e-06, "loss": 0.2424703598022461, "memory(GiB)": 30.07, "step": 185, "token_acc": 0.9212295968281757, "train_speed(iter/s)": 0.167631 }, { "epoch": 0.4766384446534964, "grad_norm": 0.6422290205955505, "learning_rate": 9.388108331044687e-06, "loss": 0.23424482345581055, "memory(GiB)": 30.07, "step": 190, "token_acc": 0.9254705767559033, "train_speed(iter/s)": 0.168154 }, { "epoch": 0.4891815616180621, "grad_norm": 0.6608842015266418, "learning_rate": 9.356198213692297e-06, "loss": 0.23567054271697999, "memory(GiB)": 30.07, "step": 195, "token_acc": 0.9198833160816787, "train_speed(iter/s)": 0.168995 }, { "epoch": 0.5017246785826278, "grad_norm": 0.675005316734314, "learning_rate": 9.323534163832387e-06, "loss": 0.24276134967803956, "memory(GiB)": 30.07, "step": 200, "token_acc": 0.913303071968056, "train_speed(iter/s)": 0.170071 }, { "epoch": 0.5017246785826278, "eval_loss": 0.24412870407104492, "eval_runtime": 9.9709, "eval_samples_per_second": 25.775, "eval_steps_per_second": 6.519, "eval_token_acc": 0.9292730746028555, "step": 200 }, { "epoch": 0.5142677955471935, "grad_norm": 0.6404038071632385, "learning_rate": 9.290121834671669e-06, "loss": 0.23353495597839355, "memory(GiB)": 30.07, "step": 205, "token_acc": 0.9230231647999404, "train_speed(iter/s)": 0.167939 }, { "epoch": 0.5268109125117592, "grad_norm": 0.6592434048652649, "learning_rate": 9.255967008922475e-06, "loss": 0.21883893013000488, "memory(GiB)": 30.07, "step": 210, "token_acc": 0.9253728456196685, "train_speed(iter/s)": 0.168578 }, { "epoch": 0.5393540294763248, "grad_norm": 0.656053900718689, "learning_rate": 9.221075597801912e-06, "loss": 0.2320107936859131, "memory(GiB)": 30.07, "step": 215, "token_acc": 0.9213286713286714, "train_speed(iter/s)": 0.169087 }, { "epoch": 0.5518971464408906, "grad_norm": 0.64938884973526, "learning_rate": 9.18545364000882e-06, "loss": 0.23771371841430664, "memory(GiB)": 30.07, "step": 220, "token_acc": 0.9130793725675198, "train_speed(iter/s)": 0.169997 }, { "epoch": 0.5518971464408906, "eval_loss": 0.24264991283416748, "eval_runtime": 9.9448, "eval_samples_per_second": 25.843, "eval_steps_per_second": 6.536, "eval_token_acc": 0.929810979288156, "step": 220 }, { "epoch": 0.5644402634054563, "grad_norm": 0.6256290674209595, "learning_rate": 9.14910730067863e-06, "loss": 0.23107373714447021, "memory(GiB)": 30.07, "step": 225, "token_acc": 0.9247168944864042, "train_speed(iter/s)": 0.168098 }, { "epoch": 0.576983380370022, "grad_norm": 0.5621991753578186, "learning_rate": 9.112042870316365e-06, "loss": 0.21704797744750975, "memory(GiB)": 30.07, "step": 230, "token_acc": 0.9269190993704302, "train_speed(iter/s)": 0.168845 }, { "epoch": 0.5895264973345876, "grad_norm": 0.6810638308525085, "learning_rate": 9.074266763707937e-06, "loss": 0.2278088092803955, "memory(GiB)": 30.07, "step": 235, "token_acc": 0.9134312189271274, "train_speed(iter/s)": 0.169551 }, { "epoch": 0.6020696142991533, "grad_norm": 0.5773187279701233, "learning_rate": 9.035785518809928e-06, "loss": 0.21931402683258056, "memory(GiB)": 30.07, "step": 240, "token_acc": 0.9246131941148552, "train_speed(iter/s)": 0.170025 }, { "epoch": 0.6020696142991533, "eval_loss": 0.24013011157512665, "eval_runtime": 9.9847, "eval_samples_per_second": 25.739, "eval_steps_per_second": 6.51, "eval_token_acc": 0.9308616529257994, "step": 240 }, { "epoch": 0.6146127312637191, "grad_norm": 0.6531880497932434, "learning_rate": 8.996605795618054e-06, "loss": 0.24535005092620848, "memory(GiB)": 30.07, "step": 245, "token_acc": 0.915537267670059, "train_speed(iter/s)": 0.168114 }, { "epoch": 0.6271558482282847, "grad_norm": 0.6779309511184692, "learning_rate": 8.956734375014525e-06, "loss": 0.23181967735290526, "memory(GiB)": 30.07, "step": 250, "token_acc": 0.9155568096313017, "train_speed(iter/s)": 0.168895 }, { "epoch": 0.6396989651928504, "grad_norm": 0.6643815040588379, "learning_rate": 8.916178157594453e-06, "loss": 0.23725414276123047, "memory(GiB)": 30.07, "step": 255, "token_acc": 0.9135585175809833, "train_speed(iter/s)": 0.169436 }, { "epoch": 0.6522420821574161, "grad_norm": 0.5990136861801147, "learning_rate": 8.87494416247157e-06, "loss": 0.22699012756347656, "memory(GiB)": 30.07, "step": 260, "token_acc": 0.924126221001221, "train_speed(iter/s)": 0.170075 }, { "epoch": 0.6522420821574161, "eval_loss": 0.23895950615406036, "eval_runtime": 9.971, "eval_samples_per_second": 25.775, "eval_steps_per_second": 6.519, "eval_token_acc": 0.9309320329780817, "step": 260 }, { "epoch": 0.6647851991219819, "grad_norm": 0.647604763507843, "learning_rate": 8.833039526063414e-06, "loss": 0.22294692993164061, "memory(GiB)": 30.07, "step": 265, "token_acc": 0.923023885742428, "train_speed(iter/s)": 0.168429 }, { "epoch": 0.6773283160865475, "grad_norm": 0.7019293904304504, "learning_rate": 8.790471500856229e-06, "loss": 0.22493109703063965, "memory(GiB)": 30.07, "step": 270, "token_acc": 0.9217673989150925, "train_speed(iter/s)": 0.168763 }, { "epoch": 0.6898714330511132, "grad_norm": 0.709414541721344, "learning_rate": 8.747247454149754e-06, "loss": 0.23138487339019775, "memory(GiB)": 30.07, "step": 275, "token_acc": 0.9201797011093793, "train_speed(iter/s)": 0.169186 }, { "epoch": 0.7024145500156789, "grad_norm": 0.5952314138412476, "learning_rate": 8.703374866782172e-06, "loss": 0.2214064598083496, "memory(GiB)": 30.07, "step": 280, "token_acc": 0.9243394229601485, "train_speed(iter/s)": 0.169579 }, { "epoch": 0.7024145500156789, "eval_loss": 0.2380242794752121, "eval_runtime": 9.9399, "eval_samples_per_second": 25.855, "eval_steps_per_second": 6.539, "eval_token_acc": 0.9309119243917152, "step": 280 }, { "epoch": 0.7149576669802445, "grad_norm": 0.6550840735435486, "learning_rate": 8.658861331835384e-06, "loss": 0.22828481197357178, "memory(GiB)": 30.07, "step": 285, "token_acc": 0.9233320082850388, "train_speed(iter/s)": 0.16832 }, { "epoch": 0.7275007839448103, "grad_norm": 0.6396917700767517, "learning_rate": 8.613714553320863e-06, "loss": 0.22759134769439698, "memory(GiB)": 30.07, "step": 290, "token_acc": 0.9214041461850823, "train_speed(iter/s)": 0.168862 }, { "epoch": 0.740043900909376, "grad_norm": 0.6359645128250122, "learning_rate": 8.567942344846311e-06, "loss": 0.2300776481628418, "memory(GiB)": 30.07, "step": 295, "token_acc": 0.9242182277352745, "train_speed(iter/s)": 0.169333 }, { "epoch": 0.7525870178739417, "grad_norm": 0.6625407338142395, "learning_rate": 8.521552628263362e-06, "loss": 0.23114292621612548, "memory(GiB)": 30.07, "step": 300, "token_acc": 0.9189751431314676, "train_speed(iter/s)": 0.169808 }, { "epoch": 0.7525870178739417, "eval_loss": 0.2363082766532898, "eval_runtime": 9.9749, "eval_samples_per_second": 25.765, "eval_steps_per_second": 6.516, "eval_token_acc": 0.9315302634224814, "step": 300 }, { "epoch": 0.7651301348385073, "grad_norm": 0.6202587485313416, "learning_rate": 8.474553432296517e-06, "loss": 0.224021315574646, "memory(GiB)": 30.07, "step": 305, "token_acc": 0.9276043893401739, "train_speed(iter/s)": 0.168359 }, { "epoch": 0.7776732518030731, "grad_norm": 0.6691203713417053, "learning_rate": 8.426952891153617e-06, "loss": 0.23445448875427247, "memory(GiB)": 30.07, "step": 310, "token_acc": 0.923713052741816, "train_speed(iter/s)": 0.168781 }, { "epoch": 0.7902163687676388, "grad_norm": 0.6087518334388733, "learning_rate": 8.378759243118044e-06, "loss": 0.22397913932800292, "memory(GiB)": 30.07, "step": 315, "token_acc": 0.9215067830325622, "train_speed(iter/s)": 0.169206 }, { "epoch": 0.8027594857322045, "grad_norm": 0.6450164914131165, "learning_rate": 8.329980829122907e-06, "loss": 0.2312875509262085, "memory(GiB)": 30.07, "step": 320, "token_acc": 0.9272111639559235, "train_speed(iter/s)": 0.169675 }, { "epoch": 0.8027594857322045, "eval_loss": 0.23446869850158691, "eval_runtime": 9.9713, "eval_samples_per_second": 25.774, "eval_steps_per_second": 6.519, "eval_token_acc": 0.931907299416851, "step": 320 }, { "epoch": 0.8153026026967701, "grad_norm": 0.6574311852455139, "learning_rate": 8.280626091307466e-06, "loss": 0.21696841716766357, "memory(GiB)": 30.07, "step": 325, "token_acc": 0.9266242271024336, "train_speed(iter/s)": 0.168386 }, { "epoch": 0.8278457196613358, "grad_norm": 0.6543525457382202, "learning_rate": 8.23070357155605e-06, "loss": 0.23127243518829346, "memory(GiB)": 30.07, "step": 330, "token_acc": 0.9156461739292596, "train_speed(iter/s)": 0.168765 }, { "epoch": 0.8403888366259016, "grad_norm": 0.6864754557609558, "learning_rate": 8.18022191001969e-06, "loss": 0.23221104145050048, "memory(GiB)": 30.07, "step": 335, "token_acc": 0.9158222112374361, "train_speed(iter/s)": 0.169317 }, { "epoch": 0.8529319535904673, "grad_norm": 0.6478604674339294, "learning_rate": 8.129189843620766e-06, "loss": 0.21692075729370117, "memory(GiB)": 30.07, "step": 340, "token_acc": 0.92678130982976, "train_speed(iter/s)": 0.169761 }, { "epoch": 0.8529319535904673, "eval_loss": 0.23404286801815033, "eval_runtime": 9.9838, "eval_samples_per_second": 25.742, "eval_steps_per_second": 6.511, "eval_token_acc": 0.9316659963804544, "step": 340 }, { "epoch": 0.8654750705550329, "grad_norm": 0.6460584402084351, "learning_rate": 8.077616204540897e-06, "loss": 0.21975212097167968, "memory(GiB)": 30.07, "step": 345, "token_acc": 0.926279602750191, "train_speed(iter/s)": 0.168486 }, { "epoch": 0.8780181875195986, "grad_norm": 0.6245233416557312, "learning_rate": 8.02550991869234e-06, "loss": 0.2209392309188843, "memory(GiB)": 30.07, "step": 350, "token_acc": 0.9232239957902122, "train_speed(iter/s)": 0.168953 }, { "epoch": 0.8905613044841643, "grad_norm": 0.6148731708526611, "learning_rate": 7.972880004173175e-06, "loss": 0.22880539894104004, "memory(GiB)": 30.07, "step": 355, "token_acc": 0.9230032848427968, "train_speed(iter/s)": 0.169373 }, { "epoch": 0.90310442144873, "grad_norm": 0.5976830124855042, "learning_rate": 7.919735569706533e-06, "loss": 0.2258004665374756, "memory(GiB)": 30.07, "step": 360, "token_acc": 0.9294882944307232, "train_speed(iter/s)": 0.169737 }, { "epoch": 0.90310442144873, "eval_loss": 0.23216013610363007, "eval_runtime": 9.9402, "eval_samples_per_second": 25.855, "eval_steps_per_second": 6.539, "eval_token_acc": 0.9317363764327368, "step": 360 }, { "epoch": 0.9156475384132957, "grad_norm": 0.6738146543502808, "learning_rate": 7.86608581306413e-06, "loss": 0.23008251190185547, "memory(GiB)": 30.07, "step": 365, "token_acc": 0.9237889028684008, "train_speed(iter/s)": 0.168422 }, { "epoch": 0.9281906553778614, "grad_norm": 0.5968795418739319, "learning_rate": 7.811940019474414e-06, "loss": 0.2311033248901367, "memory(GiB)": 30.07, "step": 370, "token_acc": 0.9211925456821934, "train_speed(iter/s)": 0.168654 }, { "epoch": 0.940733772342427, "grad_norm": 0.6431145668029785, "learning_rate": 7.757307560015539e-06, "loss": 0.21920247077941896, "memory(GiB)": 30.07, "step": 375, "token_acc": 0.9226723579404703, "train_speed(iter/s)": 0.168986 }, { "epoch": 0.9532768893069928, "grad_norm": 0.6081040501594543, "learning_rate": 7.702197889993515e-06, "loss": 0.23370542526245117, "memory(GiB)": 30.07, "step": 380, "token_acc": 0.9235495603658321, "train_speed(iter/s)": 0.169411 }, { "epoch": 0.9532768893069928, "eval_loss": 0.23192763328552246, "eval_runtime": 9.9828, "eval_samples_per_second": 25.744, "eval_steps_per_second": 6.511, "eval_token_acc": 0.9322843354112206, "step": 380 }, { "epoch": 0.9658200062715585, "grad_norm": 0.6549849510192871, "learning_rate": 7.646620547305765e-06, "loss": 0.22933628559112548, "memory(GiB)": 30.07, "step": 385, "token_acc": 0.9272846380609236, "train_speed(iter/s)": 0.168384 }, { "epoch": 0.9783631232361242, "grad_norm": 0.5421332120895386, "learning_rate": 7.590585150790388e-06, "loss": 0.2162912368774414, "memory(GiB)": 30.07, "step": 390, "token_acc": 0.9282690665907798, "train_speed(iter/s)": 0.168866 }, { "epoch": 0.9909062402006898, "grad_norm": 0.5929440259933472, "learning_rate": 7.5341013985614064e-06, "loss": 0.22078533172607423, "memory(GiB)": 30.07, "step": 395, "token_acc": 0.9202857714192223, "train_speed(iter/s)": 0.169287 }, { "epoch": 1.0050172467858263, "grad_norm": 0.6037130951881409, "learning_rate": 7.47717906633032e-06, "loss": 0.253579306602478, "memory(GiB)": 30.07, "step": 400, "token_acc": 0.9251826086956522, "train_speed(iter/s)": 0.169497 }, { "epoch": 1.0050172467858263, "eval_loss": 0.23065434396266937, "eval_runtime": 9.9745, "eval_samples_per_second": 25.766, "eval_steps_per_second": 6.517, "eval_token_acc": 0.9330032173738186, "step": 400 }, { "epoch": 1.017560363750392, "grad_norm": 0.6310690641403198, "learning_rate": 7.419828005714195e-06, "loss": 0.17385544776916503, "memory(GiB)": 30.07, "step": 405, "token_acc": 0.9351318726588173, "train_speed(iter/s)": 0.168293 }, { "epoch": 1.0301034807149576, "grad_norm": 0.6506795883178711, "learning_rate": 7.362058142530639e-06, "loss": 0.16791077852249145, "memory(GiB)": 30.07, "step": 410, "token_acc": 0.9320981703907922, "train_speed(iter/s)": 0.16871 }, { "epoch": 1.0426465976795234, "grad_norm": 0.5465015769004822, "learning_rate": 7.303879475079931e-06, "loss": 0.15868284702301025, "memory(GiB)": 32.5, "step": 415, "token_acc": 0.942933207765865, "train_speed(iter/s)": 0.168987 }, { "epoch": 1.055189714644089, "grad_norm": 0.6573348045349121, "learning_rate": 7.245302072414602e-06, "loss": 0.16553893089294433, "memory(GiB)": 32.5, "step": 420, "token_acc": 0.9483149060876516, "train_speed(iter/s)": 0.169449 }, { "epoch": 1.055189714644089, "eval_loss": 0.23652049899101257, "eval_runtime": 9.9746, "eval_samples_per_second": 25.765, "eval_steps_per_second": 6.517, "eval_token_acc": 0.9325306655942087, "step": 420 }, { "epoch": 1.0677328316086547, "grad_norm": 0.6287339925765991, "learning_rate": 7.1863360725967615e-06, "loss": 0.15552424192428588, "memory(GiB)": 32.5, "step": 425, "token_acc": 0.9385299885841407, "train_speed(iter/s)": 0.168478 }, { "epoch": 1.0802759485732205, "grad_norm": 0.6632113456726074, "learning_rate": 7.126991680943508e-06, "loss": 0.17083898782730103, "memory(GiB)": 32.5, "step": 430, "token_acc": 0.9388098703940791, "train_speed(iter/s)": 0.168867 }, { "epoch": 1.092819065537786, "grad_norm": 0.6978201270103455, "learning_rate": 7.067279168260671e-06, "loss": 0.17238540649414064, "memory(GiB)": 32.5, "step": 435, "token_acc": 0.941155504865096, "train_speed(iter/s)": 0.169229 }, { "epoch": 1.1053621825023519, "grad_norm": 0.6574686765670776, "learning_rate": 7.007208869065232e-06, "loss": 0.164797842502594, "memory(GiB)": 32.5, "step": 440, "token_acc": 0.9414418794608996, "train_speed(iter/s)": 0.169579 }, { "epoch": 1.1053621825023519, "eval_loss": 0.23527124524116516, "eval_runtime": 9.9855, "eval_samples_per_second": 25.737, "eval_steps_per_second": 6.509, "eval_token_acc": 0.932812185803338, "step": 440 }, { "epoch": 1.1179052994669174, "grad_norm": 0.6073827743530273, "learning_rate": 6.946791179796718e-06, "loss": 0.16431469917297364, "memory(GiB)": 32.5, "step": 445, "token_acc": 0.936977573407634, "train_speed(iter/s)": 0.168619 }, { "epoch": 1.1304484164314832, "grad_norm": 0.6841042041778564, "learning_rate": 6.886036557017881e-06, "loss": 0.1685694932937622, "memory(GiB)": 32.5, "step": 450, "token_acc": 0.9406548805236119, "train_speed(iter/s)": 0.168995 }, { "epoch": 1.142991533396049, "grad_norm": 0.6076449751853943, "learning_rate": 6.824955515604957e-06, "loss": 0.15892113447189332, "memory(GiB)": 32.5, "step": 455, "token_acc": 0.9472979086195497, "train_speed(iter/s)": 0.16932 }, { "epoch": 1.1555346503606145, "grad_norm": 0.6569898128509521, "learning_rate": 6.76355862692786e-06, "loss": 0.1675378680229187, "memory(GiB)": 32.5, "step": 460, "token_acc": 0.9412598483175024, "train_speed(iter/s)": 0.169705 }, { "epoch": 1.1555346503606145, "eval_loss": 0.23534773290157318, "eval_runtime": 9.9809, "eval_samples_per_second": 25.749, "eval_steps_per_second": 6.512, "eval_token_acc": 0.9333752262215966, "step": 460 }, { "epoch": 1.1680777673251803, "grad_norm": 0.6509740948677063, "learning_rate": 6.701856517020565e-06, "loss": 0.17353179454803466, "memory(GiB)": 32.5, "step": 465, "token_acc": 0.9360858431432503, "train_speed(iter/s)": 0.168849 }, { "epoch": 1.180620884289746, "grad_norm": 0.6769475340843201, "learning_rate": 6.639859864742058e-06, "loss": 0.16680521965026857, "memory(GiB)": 32.5, "step": 470, "token_acc": 0.945840546350372, "train_speed(iter/s)": 0.1692 }, { "epoch": 1.1931640012543117, "grad_norm": 0.6421222686767578, "learning_rate": 6.5775793999281345e-06, "loss": 0.1688302278518677, "memory(GiB)": 34.95, "step": 475, "token_acc": 0.9395874540830743, "train_speed(iter/s)": 0.16954 }, { "epoch": 1.2057071182188774, "grad_norm": 0.6938676834106445, "learning_rate": 6.515025901534364e-06, "loss": 0.1676286816596985, "memory(GiB)": 34.95, "step": 480, "token_acc": 0.9379853728417402, "train_speed(iter/s)": 0.169878 }, { "epoch": 1.2057071182188774, "eval_loss": 0.2359960675239563, "eval_runtime": 9.9789, "eval_samples_per_second": 25.754, "eval_steps_per_second": 6.514, "eval_token_acc": 0.9330987331590589, "step": 480 }, { "epoch": 1.218250235183443, "grad_norm": 0.65184086561203, "learning_rate": 6.452210195770571e-06, "loss": 0.1703261137008667, "memory(GiB)": 34.95, "step": 485, "token_acc": 0.9328010713917791, "train_speed(iter/s)": 0.169027 }, { "epoch": 1.2307933521480088, "grad_norm": 0.6309488415718079, "learning_rate": 6.389143154227128e-06, "loss": 0.17036676406860352, "memory(GiB)": 34.95, "step": 490, "token_acc": 0.9409641272467474, "train_speed(iter/s)": 0.169349 }, { "epoch": 1.2433364691125746, "grad_norm": 0.6250006556510925, "learning_rate": 6.325835691993394e-06, "loss": 0.17421271800994872, "memory(GiB)": 34.95, "step": 495, "token_acc": 0.9379951431187548, "train_speed(iter/s)": 0.16968 }, { "epoch": 1.2558795860771401, "grad_norm": 0.6726417541503906, "learning_rate": 6.2622987657686305e-06, "loss": 0.17579824924468995, "memory(GiB)": 34.95, "step": 500, "token_acc": 0.9332128799122411, "train_speed(iter/s)": 0.169898 }, { "epoch": 1.2558795860771401, "eval_loss": 0.2346230149269104, "eval_runtime": 9.958, "eval_samples_per_second": 25.809, "eval_steps_per_second": 6.527, "eval_token_acc": 0.9335712849386688, "step": 500 }, { "epoch": 1.268422703041706, "grad_norm": 0.6843208074569702, "learning_rate": 6.198543371965711e-06, "loss": 0.16942257881164552, "memory(GiB)": 34.95, "step": 505, "token_acc": 0.9333019066627884, "train_speed(iter/s)": 0.168992 }, { "epoch": 1.2809658200062715, "grad_norm": 0.6464666128158569, "learning_rate": 6.134580544807951e-06, "loss": 0.16836194992065429, "memory(GiB)": 34.95, "step": 510, "token_acc": 0.9348090386953423, "train_speed(iter/s)": 0.169263 }, { "epoch": 1.2935089369708372, "grad_norm": 0.636613667011261, "learning_rate": 6.070421354419418e-06, "loss": 0.17016284465789794, "memory(GiB)": 34.95, "step": 515, "token_acc": 0.9389155662264906, "train_speed(iter/s)": 0.169454 }, { "epoch": 1.306052053935403, "grad_norm": 0.6690927743911743, "learning_rate": 6.006076904908996e-06, "loss": 0.1702873706817627, "memory(GiB)": 34.95, "step": 520, "token_acc": 0.9357901608213163, "train_speed(iter/s)": 0.16974 }, { "epoch": 1.306052053935403, "eval_loss": 0.23393017053604126, "eval_runtime": 9.9533, "eval_samples_per_second": 25.82, "eval_steps_per_second": 6.53, "eval_token_acc": 0.9333802533681882, "step": 520 }, { "epoch": 1.3185951708999686, "grad_norm": 0.5869135856628418, "learning_rate": 5.9415583324485895e-06, "loss": 0.15791409015655516, "memory(GiB)": 34.95, "step": 525, "token_acc": 0.9386744758379915, "train_speed(iter/s)": 0.168968 }, { "epoch": 1.3311382878645344, "grad_norm": 0.644124448299408, "learning_rate": 5.876876803345777e-06, "loss": 0.16019464731216432, "memory(GiB)": 34.95, "step": 530, "token_acc": 0.9399966226992138, "train_speed(iter/s)": 0.169268 }, { "epoch": 1.3436814048291001, "grad_norm": 0.6242665648460388, "learning_rate": 5.812043512111237e-06, "loss": 0.16639323234558107, "memory(GiB)": 34.95, "step": 535, "token_acc": 0.9414278117034347, "train_speed(iter/s)": 0.169643 }, { "epoch": 1.3562245217936657, "grad_norm": 0.6533536314964294, "learning_rate": 5.747069679521306e-06, "loss": 0.16934127807617189, "memory(GiB)": 34.95, "step": 540, "token_acc": 0.9394062777613266, "train_speed(iter/s)": 0.169883 }, { "epoch": 1.3562245217936657, "eval_loss": 0.23251816630363464, "eval_runtime": 9.9789, "eval_samples_per_second": 25.754, "eval_steps_per_second": 6.514, "eval_token_acc": 0.9339081037603056, "step": 540 }, { "epoch": 1.3687676387582315, "grad_norm": 0.6235964894294739, "learning_rate": 5.681966550675981e-06, "loss": 0.15981945991516114, "memory(GiB)": 34.95, "step": 545, "token_acc": 0.9390275276562902, "train_speed(iter/s)": 0.169143 }, { "epoch": 1.381310755722797, "grad_norm": 0.6405583620071411, "learning_rate": 5.616745393052725e-06, "loss": 0.1589187502861023, "memory(GiB)": 34.95, "step": 550, "token_acc": 0.9394911105629661, "train_speed(iter/s)": 0.169459 }, { "epoch": 1.3938538726873628, "grad_norm": 0.6054794192314148, "learning_rate": 5.551417494556376e-06, "loss": 0.1589406132698059, "memory(GiB)": 34.95, "step": 555, "token_acc": 0.9447133523511548, "train_speed(iter/s)": 0.169755 }, { "epoch": 1.4063969896519284, "grad_norm": 0.6477380394935608, "learning_rate": 5.4859941615655495e-06, "loss": 0.16248714923858643, "memory(GiB)": 34.95, "step": 560, "token_acc": 0.9376850526480911, "train_speed(iter/s)": 0.170048 }, { "epoch": 1.4063969896519284, "eval_loss": 0.2313276082277298, "eval_runtime": 9.9434, "eval_samples_per_second": 25.846, "eval_steps_per_second": 6.537, "eval_token_acc": 0.934089081037603, "step": 560 }, { "epoch": 1.4189401066164942, "grad_norm": 0.6257679462432861, "learning_rate": 5.4204867169758265e-06, "loss": 0.1701244592666626, "memory(GiB)": 34.95, "step": 565, "token_acc": 0.9315096587690685, "train_speed(iter/s)": 0.169354 }, { "epoch": 1.43148322358106, "grad_norm": 0.6648498773574829, "learning_rate": 5.35490649824008e-06, "loss": 0.16337137222290038, "memory(GiB)": 34.95, "step": 570, "token_acc": 0.9441849071789757, "train_speed(iter/s)": 0.169609 }, { "epoch": 1.4440263405456255, "grad_norm": 0.6349947452545166, "learning_rate": 5.289264855406295e-06, "loss": 0.1652446985244751, "memory(GiB)": 34.95, "step": 575, "token_acc": 0.9389820592823713, "train_speed(iter/s)": 0.169876 }, { "epoch": 1.4565694575101913, "grad_norm": 0.6542990803718567, "learning_rate": 5.223573149153197e-06, "loss": 0.17841705083847045, "memory(GiB)": 34.95, "step": 580, "token_acc": 0.9361828435737608, "train_speed(iter/s)": 0.170157 }, { "epoch": 1.4565694575101913, "eval_loss": 0.23112896084785461, "eval_runtime": 9.9829, "eval_samples_per_second": 25.744, "eval_steps_per_second": 6.511, "eval_token_acc": 0.9340991353307863, "step": 580 }, { "epoch": 1.469112574474757, "grad_norm": 0.6102667450904846, "learning_rate": 5.157842748824053e-06, "loss": 0.16528806686401368, "memory(GiB)": 34.95, "step": 585, "token_acc": 0.9365076170735143, "train_speed(iter/s)": 0.169488 }, { "epoch": 1.4816556914393226, "grad_norm": 0.6301048994064331, "learning_rate": 5.092085030458957e-06, "loss": 0.16155061721801758, "memory(GiB)": 34.95, "step": 590, "token_acc": 0.9446101777707996, "train_speed(iter/s)": 0.169721 }, { "epoch": 1.4941988084038884, "grad_norm": 0.6364830732345581, "learning_rate": 5.026311374825969e-06, "loss": 0.16691150665283203, "memory(GiB)": 34.95, "step": 595, "token_acc": 0.9467816983326871, "train_speed(iter/s)": 0.169943 }, { "epoch": 1.5067419253684542, "grad_norm": 0.7235390543937683, "learning_rate": 4.960533165451435e-06, "loss": 0.16880112886428833, "memory(GiB)": 34.95, "step": 600, "token_acc": 0.9358625682365141, "train_speed(iter/s)": 0.170106 }, { "epoch": 1.5067419253684542, "eval_loss": 0.23094019293785095, "eval_runtime": 9.9784, "eval_samples_per_second": 25.756, "eval_steps_per_second": 6.514, "eval_token_acc": 0.9340790267444199, "step": 600 }, { "epoch": 1.5192850423330198, "grad_norm": 0.6465805172920227, "learning_rate": 4.894761786649815e-06, "loss": 0.16632287502288817, "memory(GiB)": 34.95, "step": 605, "token_acc": 0.9343266943374441, "train_speed(iter/s)": 0.169351 }, { "epoch": 1.5318281592975853, "grad_norm": 0.6529747247695923, "learning_rate": 4.829008621553401e-06, "loss": 0.16232678890228272, "memory(GiB)": 34.95, "step": 610, "token_acc": 0.9372756540724568, "train_speed(iter/s)": 0.169563 }, { "epoch": 1.544371276262151, "grad_norm": 0.6062551736831665, "learning_rate": 4.763285050142211e-06, "loss": 0.1610184907913208, "memory(GiB)": 34.95, "step": 615, "token_acc": 0.9430264444742746, "train_speed(iter/s)": 0.169769 }, { "epoch": 1.5569143932267169, "grad_norm": 0.6321557760238647, "learning_rate": 4.697602447274454e-06, "loss": 0.16829713582992553, "memory(GiB)": 34.95, "step": 620, "token_acc": 0.9414636993230099, "train_speed(iter/s)": 0.169953 }, { "epoch": 1.5569143932267169, "eval_loss": 0.22995001077651978, "eval_runtime": 9.9733, "eval_samples_per_second": 25.769, "eval_steps_per_second": 6.517, "eval_token_acc": 0.9340689724512367, "step": 620 }, { "epoch": 1.5694575101912824, "grad_norm": 0.6242859363555908, "learning_rate": 4.631972180717859e-06, "loss": 0.169819974899292, "memory(GiB)": 34.95, "step": 625, "token_acc": 0.9353122957152196, "train_speed(iter/s)": 0.169207 }, { "epoch": 1.5820006271558482, "grad_norm": 0.6639277935028076, "learning_rate": 4.566405609182247e-06, "loss": 0.17650117874145507, "memory(GiB)": 34.95, "step": 630, "token_acc": 0.9426657289854536, "train_speed(iter/s)": 0.169405 }, { "epoch": 1.594543744120414, "grad_norm": 0.5965340733528137, "learning_rate": 4.500914080353666e-06, "loss": 0.16074283123016359, "memory(GiB)": 34.95, "step": 635, "token_acc": 0.9436321558637268, "train_speed(iter/s)": 0.169633 }, { "epoch": 1.6070868610849796, "grad_norm": 0.6135890483856201, "learning_rate": 4.435508928930431e-06, "loss": 0.17277932167053223, "memory(GiB)": 34.95, "step": 640, "token_acc": 0.9343614580678052, "train_speed(iter/s)": 0.169805 }, { "epoch": 1.6070868610849796, "eval_loss": 0.2301892191171646, "eval_runtime": 9.9309, "eval_samples_per_second": 25.879, "eval_steps_per_second": 6.545, "eval_token_acc": 0.9346370400160868, "step": 640 }, { "epoch": 1.6196299780495453, "grad_norm": 0.6135047674179077, "learning_rate": 4.3702014746614135e-06, "loss": 0.16275949478149415, "memory(GiB)": 34.95, "step": 645, "token_acc": 0.9382904296349448, "train_speed(iter/s)": 0.169112 }, { "epoch": 1.6321730950141111, "grad_norm": 0.6757416725158691, "learning_rate": 4.305003020386922e-06, "loss": 0.16928246021270751, "memory(GiB)": 34.95, "step": 650, "token_acc": 0.9348640286598274, "train_speed(iter/s)": 0.169328 }, { "epoch": 1.6447162119786767, "grad_norm": 0.6586278676986694, "learning_rate": 4.239924850082501e-06, "loss": 0.15818471908569337, "memory(GiB)": 34.95, "step": 655, "token_acc": 0.9452125117275318, "train_speed(iter/s)": 0.169486 }, { "epoch": 1.6572593289432422, "grad_norm": 0.7142418026924133, "learning_rate": 4.1749782269060045e-06, "loss": 0.1626511335372925, "memory(GiB)": 34.95, "step": 660, "token_acc": 0.9346642123840067, "train_speed(iter/s)": 0.169729 }, { "epoch": 1.6572593289432422, "eval_loss": 0.22976796329021454, "eval_runtime": 9.9836, "eval_samples_per_second": 25.742, "eval_steps_per_second": 6.511, "eval_token_acc": 0.9347878544138347, "step": 660 }, { "epoch": 1.6698024459078082, "grad_norm": 0.6405680775642395, "learning_rate": 4.110174391248268e-06, "loss": 0.1630636215209961, "memory(GiB)": 34.95, "step": 665, "token_acc": 0.9356640277041026, "train_speed(iter/s)": 0.169025 }, { "epoch": 1.6823455628723738, "grad_norm": 0.6509523391723633, "learning_rate": 4.045524558787712e-06, "loss": 0.17556746006011964, "memory(GiB)": 34.95, "step": 670, "token_acc": 0.9399725004910626, "train_speed(iter/s)": 0.169222 }, { "epoch": 1.6948886798369394, "grad_norm": 0.6420454978942871, "learning_rate": 3.9810399185492406e-06, "loss": 0.16325095891952515, "memory(GiB)": 34.95, "step": 675, "token_acc": 0.9350154972645768, "train_speed(iter/s)": 0.16944 }, { "epoch": 1.7074317968015051, "grad_norm": 0.6421252489089966, "learning_rate": 3.916731630967741e-06, "loss": 0.17528104782104492, "memory(GiB)": 34.95, "step": 680, "token_acc": 0.9365230616994374, "train_speed(iter/s)": 0.169645 }, { "epoch": 1.7074317968015051, "eval_loss": 0.2283635139465332, "eval_runtime": 9.9363, "eval_samples_per_second": 25.865, "eval_steps_per_second": 6.542, "eval_token_acc": 0.9346672028956364, "step": 680 }, { "epoch": 1.719974913766071, "grad_norm": 0.6265820264816284, "learning_rate": 3.852610825956529e-06, "loss": 0.16770663261413574, "memory(GiB)": 34.95, "step": 685, "token_acc": 0.9386050786166663, "train_speed(iter/s)": 0.169004 }, { "epoch": 1.7325180307306365, "grad_norm": 0.6280970573425293, "learning_rate": 3.788688600981085e-06, "loss": 0.1680266261100769, "memory(GiB)": 34.95, "step": 690, "token_acc": 0.9436317194937884, "train_speed(iter/s)": 0.169183 }, { "epoch": 1.7450611476952023, "grad_norm": 0.575031578540802, "learning_rate": 3.7249760191384055e-06, "loss": 0.16007229089736938, "memory(GiB)": 34.95, "step": 695, "token_acc": 0.9408609064687402, "train_speed(iter/s)": 0.169405 }, { "epoch": 1.757604264659768, "grad_norm": 0.6248459219932556, "learning_rate": 3.6614841072422913e-06, "loss": 0.16597646474838257, "memory(GiB)": 34.95, "step": 700, "token_acc": 0.9334997820314668, "train_speed(iter/s)": 0.169658 }, { "epoch": 1.757604264659768, "eval_loss": 0.227211132645607, "eval_runtime": 9.9535, "eval_samples_per_second": 25.82, "eval_steps_per_second": 6.53, "eval_token_acc": 0.9347778001206515, "step": 700 }, { "epoch": 1.7701473816243336, "grad_norm": 0.6405999064445496, "learning_rate": 3.5982238539149287e-06, "loss": 0.16680790185928346, "memory(GiB)": 34.95, "step": 705, "token_acc": 0.9360328466797351, "train_speed(iter/s)": 0.169094 }, { "epoch": 1.7826904985888994, "grad_norm": 0.6649206280708313, "learning_rate": 3.535206207685079e-06, "loss": 0.1820515751838684, "memory(GiB)": 34.95, "step": 710, "token_acc": 0.9366235113407408, "train_speed(iter/s)": 0.169367 }, { "epoch": 1.7952336155534652, "grad_norm": 0.6017094254493713, "learning_rate": 3.472442075093192e-06, "loss": 0.1508460283279419, "memory(GiB)": 34.95, "step": 715, "token_acc": 0.9486971106461709, "train_speed(iter/s)": 0.169534 }, { "epoch": 1.8077767325180307, "grad_norm": 0.5928083062171936, "learning_rate": 3.4099423188038094e-06, "loss": 0.16222984790802003, "memory(GiB)": 34.95, "step": 720, "token_acc": 0.9467024477514842, "train_speed(iter/s)": 0.169693 }, { "epoch": 1.8077767325180307, "eval_loss": 0.2278689295053482, "eval_runtime": 9.9852, "eval_samples_per_second": 25.738, "eval_steps_per_second": 6.51, "eval_token_acc": 0.9349235873718078, "step": 720 }, { "epoch": 1.8203198494825963, "grad_norm": 0.6321941018104553, "learning_rate": 3.347717755725547e-06, "loss": 0.17015450000762938, "memory(GiB)": 34.95, "step": 725, "token_acc": 0.9337387521012558, "train_speed(iter/s)": 0.169142 }, { "epoch": 1.832862966447162, "grad_norm": 0.6680959463119507, "learning_rate": 3.2857791551389907e-06, "loss": 0.16979444026947021, "memory(GiB)": 34.95, "step": 730, "token_acc": 0.9440408017179671, "train_speed(iter/s)": 0.1693 }, { "epoch": 1.8454060834117278, "grad_norm": 0.6347929835319519, "learning_rate": 3.224137236832859e-06, "loss": 0.16566884517669678, "memory(GiB)": 34.95, "step": 735, "token_acc": 0.9421418181073162, "train_speed(iter/s)": 0.169501 }, { "epoch": 1.8579492003762934, "grad_norm": 0.6764352321624756, "learning_rate": 3.1628026692487053e-06, "loss": 0.1652566075325012, "memory(GiB)": 34.95, "step": 740, "token_acc": 0.9433130787598766, "train_speed(iter/s)": 0.169711 }, { "epoch": 1.8579492003762934, "eval_loss": 0.22732892632484436, "eval_runtime": 9.9558, "eval_samples_per_second": 25.814, "eval_steps_per_second": 6.529, "eval_token_acc": 0.9350542931831892, "step": 740 }, { "epoch": 1.8704923173408592, "grad_norm": 0.6389771103858948, "learning_rate": 3.1017860676345184e-06, "loss": 0.15687326192855836, "memory(GiB)": 34.95, "step": 745, "token_acc": 0.9414018945533932, "train_speed(iter/s)": 0.169168 }, { "epoch": 1.883035434305425, "grad_norm": 0.6998502612113953, "learning_rate": 3.0410979922075344e-06, "loss": 0.17107654809951783, "memory(GiB)": 34.95, "step": 750, "token_acc": 0.9427076541922024, "train_speed(iter/s)": 0.169327 }, { "epoch": 1.8955785512699905, "grad_norm": 0.6121336817741394, "learning_rate": 2.980748946326564e-06, "loss": 0.16890095472335814, "memory(GiB)": 34.95, "step": 755, "token_acc": 0.9396650021625447, "train_speed(iter/s)": 0.169512 }, { "epoch": 1.9081216682345563, "grad_norm": 0.5630024075508118, "learning_rate": 2.920749374674161e-06, "loss": 0.16135737895965577, "memory(GiB)": 34.95, "step": 760, "token_acc": 0.9455813142757539, "train_speed(iter/s)": 0.169692 }, { "epoch": 1.9081216682345563, "eval_loss": 0.2258785516023636, "eval_runtime": 9.9483, "eval_samples_per_second": 25.833, "eval_steps_per_second": 6.534, "eval_token_acc": 0.9357782022923788, "step": 760 }, { "epoch": 1.920664785199122, "grad_norm": 0.6215230226516724, "learning_rate": 2.861109661448952e-06, "loss": 0.160076367855072, "memory(GiB)": 34.95, "step": 765, "token_acc": 0.938600821420109, "train_speed(iter/s)": 0.169164 }, { "epoch": 1.9332079021636877, "grad_norm": 0.6377970576286316, "learning_rate": 2.8018401285684284e-06, "loss": 0.16507962942123414, "memory(GiB)": 34.95, "step": 770, "token_acc": 0.9362514029180696, "train_speed(iter/s)": 0.169343 }, { "epoch": 1.9457510191282532, "grad_norm": 0.6416298747062683, "learning_rate": 2.7429510338825206e-06, "loss": 0.1676865577697754, "memory(GiB)": 34.95, "step": 775, "token_acc": 0.9356394574884725, "train_speed(iter/s)": 0.169514 }, { "epoch": 1.9582941360928192, "grad_norm": 0.6064321398735046, "learning_rate": 2.6844525693982614e-06, "loss": 0.1615642786026001, "memory(GiB)": 34.95, "step": 780, "token_acc": 0.9434515921396388, "train_speed(iter/s)": 0.169698 }, { "epoch": 1.9582941360928192, "eval_loss": 0.22540703415870667, "eval_runtime": 9.9865, "eval_samples_per_second": 25.735, "eval_steps_per_second": 6.509, "eval_token_acc": 0.9356072793082646, "step": 780 }, { "epoch": 1.9708372530573848, "grad_norm": 0.6212838888168335, "learning_rate": 2.6263548595158374e-06, "loss": 0.16903696060180665, "memory(GiB)": 34.95, "step": 785, "token_acc": 0.9373140403756506, "train_speed(iter/s)": 0.16916 }, { "epoch": 1.9833803700219503, "grad_norm": 0.6316173076629639, "learning_rate": 2.568667959276351e-06, "loss": 0.1633455991744995, "memory(GiB)": 34.95, "step": 790, "token_acc": 0.9499691904033075, "train_speed(iter/s)": 0.169327 }, { "epoch": 1.9959234869865161, "grad_norm": 0.5879420638084412, "learning_rate": 2.5114018526215843e-06, "loss": 0.15602803230285645, "memory(GiB)": 34.95, "step": 795, "token_acc": 0.9389440475085831, "train_speed(iter/s)": 0.169519 }, { "epoch": 2.0100344935716525, "grad_norm": 0.5628567337989807, "learning_rate": 2.454566450666061e-06, "loss": 0.1572946071624756, "memory(GiB)": 34.95, "step": 800, "token_acc": 0.9571394981693594, "train_speed(iter/s)": 0.169704 }, { "epoch": 2.0100344935716525, "eval_loss": 0.22719430923461914, "eval_runtime": 9.9565, "eval_samples_per_second": 25.812, "eval_steps_per_second": 6.528, "eval_token_acc": 0.9355921978684898, "step": 800 }, { "epoch": 2.022577610536218, "grad_norm": 0.6104578971862793, "learning_rate": 2.398171589981721e-06, "loss": 0.1239326000213623, "memory(GiB)": 34.95, "step": 805, "token_acc": 0.9446134994383744, "train_speed(iter/s)": 0.169099 }, { "epoch": 2.035120727500784, "grad_norm": 0.5852713584899902, "learning_rate": 2.3422270308954936e-06, "loss": 0.12712430953979492, "memory(GiB)": 34.95, "step": 810, "token_acc": 0.9523950262830121, "train_speed(iter/s)": 0.169261 }, { "epoch": 2.0476638444653497, "grad_norm": 0.6348543763160706, "learning_rate": 2.286742455800059e-06, "loss": 0.12253003120422364, "memory(GiB)": 34.95, "step": 815, "token_acc": 0.9575001424257962, "train_speed(iter/s)": 0.169477 }, { "epoch": 2.060206961429915, "grad_norm": 0.6192832589149475, "learning_rate": 2.2317274674781158e-06, "loss": 0.12359896898269654, "memory(GiB)": 34.95, "step": 820, "token_acc": 0.952242789995938, "train_speed(iter/s)": 0.169661 }, { "epoch": 2.060206961429915, "eval_loss": 0.24132946133613586, "eval_runtime": 9.9829, "eval_samples_per_second": 25.744, "eval_steps_per_second": 6.511, "eval_token_acc": 0.934958777397949, "step": 820 }, { "epoch": 2.072750078394481, "grad_norm": 0.6246756315231323, "learning_rate": 2.1771915874404094e-06, "loss": 0.1322195291519165, "memory(GiB)": 34.95, "step": 825, "token_acc": 0.9414432054743698, "train_speed(iter/s)": 0.169161 }, { "epoch": 2.085293195359047, "grad_norm": 0.5839347243309021, "learning_rate": 2.1231442542778317e-06, "loss": 0.11952453851699829, "memory(GiB)": 34.95, "step": 830, "token_acc": 0.956975505857295, "train_speed(iter/s)": 0.169338 }, { "epoch": 2.0978363123236123, "grad_norm": 0.6020109057426453, "learning_rate": 2.0695948220278756e-06, "loss": 0.12150832414627075, "memory(GiB)": 34.95, "step": 835, "token_acc": 0.950202699878798, "train_speed(iter/s)": 0.169516 }, { "epoch": 2.110379429288178, "grad_norm": 0.6176694631576538, "learning_rate": 2.0165525585557205e-06, "loss": 0.12181558609008789, "memory(GiB)": 34.95, "step": 840, "token_acc": 0.9584100732944936, "train_speed(iter/s)": 0.169655 }, { "epoch": 2.110379429288178, "eval_loss": 0.23956826329231262, "eval_runtime": 9.9764, "eval_samples_per_second": 25.761, "eval_steps_per_second": 6.515, "eval_token_acc": 0.9351900261411623, "step": 840 }, { "epoch": 2.122922546252744, "grad_norm": 0.6029666066169739, "learning_rate": 1.964026643950226e-06, "loss": 0.11940534114837646, "memory(GiB)": 34.95, "step": 845, "token_acc": 0.9457718501702611, "train_speed(iter/s)": 0.16911 }, { "epoch": 2.1354656632173095, "grad_norm": 0.5822896957397461, "learning_rate": 1.9120261689351317e-06, "loss": 0.11883677244186401, "memory(GiB)": 34.95, "step": 850, "token_acc": 0.9609223300970874, "train_speed(iter/s)": 0.169285 }, { "epoch": 2.148008780181875, "grad_norm": 0.6058652997016907, "learning_rate": 1.860560133295708e-06, "loss": 0.12740614414215087, "memory(GiB)": 34.95, "step": 855, "token_acc": 0.9549561469832148, "train_speed(iter/s)": 0.169475 }, { "epoch": 2.160551897146441, "grad_norm": 0.5672310590744019, "learning_rate": 1.8096374443211545e-06, "loss": 0.12559156417846679, "memory(GiB)": 34.95, "step": 860, "token_acc": 0.9539432293401429, "train_speed(iter/s)": 0.169653 }, { "epoch": 2.160551897146441, "eval_loss": 0.23939980566501617, "eval_runtime": 9.9791, "eval_samples_per_second": 25.754, "eval_steps_per_second": 6.514, "eval_token_acc": 0.9351900261411623, "step": 860 }, { "epoch": 2.1730950141110066, "grad_norm": 0.6905380487442017, "learning_rate": 1.7592669152630082e-06, "loss": 0.12502384185791016, "memory(GiB)": 34.95, "step": 865, "token_acc": 0.9430737514131808, "train_speed(iter/s)": 0.16916 }, { "epoch": 2.185638131075572, "grad_norm": 0.604882001876831, "learning_rate": 1.7094572638098122e-06, "loss": 0.13246217966079712, "memory(GiB)": 34.95, "step": 870, "token_acc": 0.9538787052672268, "train_speed(iter/s)": 0.169321 }, { "epoch": 2.198181248040138, "grad_norm": 0.6221954226493835, "learning_rate": 1.6602171105783488e-06, "loss": 0.12281397581100464, "memory(GiB)": 34.95, "step": 875, "token_acc": 0.9516503156133547, "train_speed(iter/s)": 0.169488 }, { "epoch": 2.2107243650047037, "grad_norm": 0.5445839166641235, "learning_rate": 1.61155497762165e-06, "loss": 0.11812053918838501, "memory(GiB)": 34.95, "step": 880, "token_acc": 0.9578427802726868, "train_speed(iter/s)": 0.169626 }, { "epoch": 2.2107243650047037, "eval_loss": 0.23973500728607178, "eval_runtime": 9.9832, "eval_samples_per_second": 25.743, "eval_steps_per_second": 6.511, "eval_token_acc": 0.9351347275286548, "step": 880 }, { "epoch": 2.2232674819692693, "grad_norm": 0.6164165735244751, "learning_rate": 1.5634792869540782e-06, "loss": 0.11963331699371338, "memory(GiB)": 34.95, "step": 885, "token_acc": 0.9436205250131545, "train_speed(iter/s)": 0.169137 }, { "epoch": 2.235810598933835, "grad_norm": 0.606275200843811, "learning_rate": 1.5159983590937183e-06, "loss": 0.12453606128692626, "memory(GiB)": 34.95, "step": 890, "token_acc": 0.9526249104831157, "train_speed(iter/s)": 0.169305 }, { "epoch": 2.248353715898401, "grad_norm": 0.6022824048995972, "learning_rate": 1.4691204116223357e-06, "loss": 0.11552423238754272, "memory(GiB)": 34.95, "step": 895, "token_acc": 0.9613866135340565, "train_speed(iter/s)": 0.169507 }, { "epoch": 2.2608968328629664, "grad_norm": 0.6075533032417297, "learning_rate": 1.4228535577631442e-06, "loss": 0.12762036323547363, "memory(GiB)": 34.95, "step": 900, "token_acc": 0.9527419384954348, "train_speed(iter/s)": 0.169672 }, { "epoch": 2.2608968328629664, "eval_loss": 0.24023577570915222, "eval_runtime": 9.9811, "eval_samples_per_second": 25.749, "eval_steps_per_second": 6.512, "eval_token_acc": 0.9352000804343454, "step": 900 }, { "epoch": 2.273439949827532, "grad_norm": 0.6038256883621216, "learning_rate": 1.3772058049766491e-06, "loss": 0.12403825521469117, "memory(GiB)": 34.95, "step": 905, "token_acc": 0.9445350568832248, "train_speed(iter/s)": 0.169145 }, { "epoch": 2.285983066792098, "grad_norm": 0.6491556763648987, "learning_rate": 1.3321850535747822e-06, "loss": 0.12173200845718384, "memory(GiB)": 34.95, "step": 910, "token_acc": 0.9588150821120849, "train_speed(iter/s)": 0.169343 }, { "epoch": 2.2985261837566635, "grad_norm": 0.548174262046814, "learning_rate": 1.2877990953535841e-06, "loss": 0.12104053497314453, "memory(GiB)": 34.95, "step": 915, "token_acc": 0.958676718877986, "train_speed(iter/s)": 0.169527 }, { "epoch": 2.311069300721229, "grad_norm": 0.5731512904167175, "learning_rate": 1.2440556122446701e-06, "loss": 0.12762261629104615, "memory(GiB)": 34.95, "step": 920, "token_acc": 0.9535161617972158, "train_speed(iter/s)": 0.169669 }, { "epoch": 2.311069300721229, "eval_loss": 0.2390112429857254, "eval_runtime": 9.9533, "eval_samples_per_second": 25.821, "eval_steps_per_second": 6.53, "eval_token_acc": 0.9355570078423486, "step": 920 }, { "epoch": 2.323612417685795, "grad_norm": 0.634443998336792, "learning_rate": 1.2009621749857103e-06, "loss": 0.12285526990890502, "memory(GiB)": 34.95, "step": 925, "token_acc": 0.9447460370728625, "train_speed(iter/s)": 0.169211 }, { "epoch": 2.3361555346503606, "grad_norm": 0.674192488193512, "learning_rate": 1.1585262418101468e-06, "loss": 0.13117657899856566, "memory(GiB)": 34.95, "step": 930, "token_acc": 0.9548805986574227, "train_speed(iter/s)": 0.16934 }, { "epoch": 2.348698651614926, "grad_norm": 0.6701561808586121, "learning_rate": 1.1167551571563967e-06, "loss": 0.12773873805999755, "memory(GiB)": 34.95, "step": 935, "token_acc": 0.9571292006765203, "train_speed(iter/s)": 0.169553 }, { "epoch": 2.361241768579492, "grad_norm": 0.5688639879226685, "learning_rate": 1.0756561503967366e-06, "loss": 0.12773098945617675, "memory(GiB)": 34.95, "step": 940, "token_acc": 0.9547074376365099, "train_speed(iter/s)": 0.16967 }, { "epoch": 2.361241768579492, "eval_loss": 0.23946641385555267, "eval_runtime": 9.9907, "eval_samples_per_second": 25.724, "eval_steps_per_second": 6.506, "eval_token_acc": 0.9353056505127689, "step": 940 }, { "epoch": 2.3737848855440578, "grad_norm": 0.6829060316085815, "learning_rate": 1.0352363345861067e-06, "loss": 0.1251779556274414, "memory(GiB)": 34.95, "step": 945, "token_acc": 0.9461535568551496, "train_speed(iter/s)": 0.169209 }, { "epoch": 2.3863280025086233, "grad_norm": 0.6119195222854614, "learning_rate": 9.955027052310445e-07, "loss": 0.12672061920166017, "memory(GiB)": 34.95, "step": 950, "token_acc": 0.9541979451343965, "train_speed(iter/s)": 0.169349 }, { "epoch": 2.3988711194731893, "grad_norm": 0.6136831045150757, "learning_rate": 9.564621390789692e-07, "loss": 0.12832672595977784, "memory(GiB)": 34.95, "step": 955, "token_acc": 0.9493431077797455, "train_speed(iter/s)": 0.169478 }, { "epoch": 2.411414236437755, "grad_norm": 0.6445709466934204, "learning_rate": 9.181213929280047e-07, "loss": 0.12906695604324342, "memory(GiB)": 34.95, "step": 960, "token_acc": 0.9479911420436571, "train_speed(iter/s)": 0.16967 }, { "epoch": 2.411414236437755, "eval_loss": 0.23977875709533691, "eval_runtime": 9.9861, "eval_samples_per_second": 25.736, "eval_steps_per_second": 6.509, "eval_token_acc": 0.9355368992559823, "step": 960 }, { "epoch": 2.4239573534023204, "grad_norm": 0.5981658101081848, "learning_rate": 8.804871024575851e-07, "loss": 0.12087714672088623, "memory(GiB)": 34.95, "step": 965, "token_acc": 0.9459313171146616, "train_speed(iter/s)": 0.169225 }, { "epoch": 2.436500470366886, "grad_norm": 0.591122567653656, "learning_rate": 8.435657810799991e-07, "loss": 0.11974387168884278, "memory(GiB)": 34.95, "step": 970, "token_acc": 0.9559715418707722, "train_speed(iter/s)": 0.169402 }, { "epoch": 2.449043587331452, "grad_norm": 0.6105548143386841, "learning_rate": 8.073638188131128e-07, "loss": 0.12425668239593506, "memory(GiB)": 34.95, "step": 975, "token_acc": 0.9544568733678918, "train_speed(iter/s)": 0.169513 }, { "epoch": 2.4615867042960176, "grad_norm": 0.5976568460464478, "learning_rate": 7.71887481174437e-07, "loss": 0.12979369163513182, "memory(GiB)": 34.95, "step": 980, "token_acc": 0.9534985244556201, "train_speed(iter/s)": 0.16963 }, { "epoch": 2.4615867042960176, "eval_loss": 0.23947912454605103, "eval_runtime": 9.9815, "eval_samples_per_second": 25.748, "eval_steps_per_second": 6.512, "eval_token_acc": 0.9354866277900663, "step": 980 }, { "epoch": 2.474129821260583, "grad_norm": 0.6130661964416504, "learning_rate": 7.371429080967468e-07, "loss": 0.12366334199905396, "memory(GiB)": 34.95, "step": 985, "token_acc": 0.9456251029644058, "train_speed(iter/s)": 0.169181 }, { "epoch": 2.486672938225149, "grad_norm": 0.611749529838562, "learning_rate": 7.031361128654402e-07, "loss": 0.11961600780487061, "memory(GiB)": 34.95, "step": 990, "token_acc": 0.9553609289884855, "train_speed(iter/s)": 0.169374 }, { "epoch": 2.4992160551897147, "grad_norm": 0.6357402205467224, "learning_rate": 6.698729810778065e-07, "loss": 0.12684570550918578, "memory(GiB)": 34.95, "step": 995, "token_acc": 0.9534441273571709, "train_speed(iter/s)": 0.169506 }, { "epoch": 2.5117591721542802, "grad_norm": 0.5895079970359802, "learning_rate": 6.373592696244024e-07, "loss": 0.12313053607940674, "memory(GiB)": 34.95, "step": 1000, "token_acc": 0.9565139198618804, "train_speed(iter/s)": 0.169623 }, { "epoch": 2.5117591721542802, "eval_loss": 0.23871561884880066, "eval_runtime": 9.9765, "eval_samples_per_second": 25.76, "eval_steps_per_second": 6.515, "eval_token_acc": 0.935416247737784, "step": 1000 }, { "epoch": 2.524302289118846, "grad_norm": 0.6320595145225525, "learning_rate": 6.056006056926978e-07, "loss": 0.12712349891662597, "memory(GiB)": 34.95, "step": 1005, "token_acc": 0.9447087643998138, "train_speed(iter/s)": 0.169177 }, { "epoch": 2.536845406083412, "grad_norm": 0.611575186252594, "learning_rate": 5.746024857931732e-07, "loss": 0.12986292839050292, "memory(GiB)": 34.95, "step": 1010, "token_acc": 0.9510800508259212, "train_speed(iter/s)": 0.169271 }, { "epoch": 2.5493885230479774, "grad_norm": 0.6201998591423035, "learning_rate": 5.443702748080288e-07, "loss": 0.12274014949798584, "memory(GiB)": 34.95, "step": 1015, "token_acc": 0.955253177824786, "train_speed(iter/s)": 0.169445 }, { "epoch": 2.561931640012543, "grad_norm": 0.6165274977684021, "learning_rate": 5.149092050626825e-07, "loss": 0.1297899603843689, "memory(GiB)": 34.95, "step": 1020, "token_acc": 0.9506936125816299, "train_speed(iter/s)": 0.169553 }, { "epoch": 2.561931640012543, "eval_loss": 0.23904787003993988, "eval_runtime": 10.067, "eval_samples_per_second": 25.529, "eval_steps_per_second": 6.457, "eval_token_acc": 0.935441383470742, "step": 1020 }, { "epoch": 2.574474756977109, "grad_norm": 0.6250675916671753, "learning_rate": 4.862243754202023e-07, "loss": 0.12486759424209595, "memory(GiB)": 34.95, "step": 1025, "token_acc": 0.9427952415499746, "train_speed(iter/s)": 0.169143 }, { "epoch": 2.5870178739416745, "grad_norm": 0.5898808240890503, "learning_rate": 4.5832075039884014e-07, "loss": 0.11898901462554931, "memory(GiB)": 34.95, "step": 1030, "token_acc": 0.9599557987792043, "train_speed(iter/s)": 0.169245 }, { "epoch": 2.59956099090624, "grad_norm": 0.6565684080123901, "learning_rate": 4.3120315931281633e-07, "loss": 0.12741444110870362, "memory(GiB)": 34.95, "step": 1035, "token_acc": 0.9581729932512134, "train_speed(iter/s)": 0.169389 }, { "epoch": 2.612104107870806, "grad_norm": 0.5875664949417114, "learning_rate": 4.048762954365054e-07, "loss": 0.12610654830932616, "memory(GiB)": 34.95, "step": 1040, "token_acc": 0.9550036071318149, "train_speed(iter/s)": 0.169544 }, { "epoch": 2.612104107870806, "eval_loss": 0.23876284062862396, "eval_runtime": 9.9698, "eval_samples_per_second": 25.778, "eval_steps_per_second": 6.52, "eval_token_acc": 0.9356223607480394, "step": 1040 }, { "epoch": 2.6246472248353716, "grad_norm": 0.6185052990913391, "learning_rate": 3.793447151921642e-07, "loss": 0.11991071701049805, "memory(GiB)": 34.95, "step": 1045, "token_acc": 0.9459399138299112, "train_speed(iter/s)": 0.169124 }, { "epoch": 2.637190341799937, "grad_norm": 0.6107172966003418, "learning_rate": 3.546128373613472e-07, "loss": 0.11918728351593018, "memory(GiB)": 34.95, "step": 1050, "token_acc": 0.9534668113226157, "train_speed(iter/s)": 0.169318 }, { "epoch": 2.649733458764503, "grad_norm": 0.6262523531913757, "learning_rate": 3.30684942320143e-07, "loss": 0.11955299377441406, "memory(GiB)": 34.95, "step": 1055, "token_acc": 0.9587094529959127, "train_speed(iter/s)": 0.169507 }, { "epoch": 2.6622765757290687, "grad_norm": 0.5960172414779663, "learning_rate": 3.0756517129836296e-07, "loss": 0.12312361001968383, "memory(GiB)": 34.95, "step": 1060, "token_acc": 0.9537392406006633, "train_speed(iter/s)": 0.169614 }, { "epoch": 2.6622765757290687, "eval_loss": 0.2386154979467392, "eval_runtime": 9.9782, "eval_samples_per_second": 25.756, "eval_steps_per_second": 6.514, "eval_token_acc": 0.9356575507741806, "step": 1060 }, { "epoch": 2.6748196926936343, "grad_norm": 0.6013411283493042, "learning_rate": 2.8525752566281485e-07, "loss": 0.11395795345306396, "memory(GiB)": 34.95, "step": 1065, "token_acc": 0.947853377091845, "train_speed(iter/s)": 0.169208 }, { "epoch": 2.6873628096582003, "grad_norm": 0.6079533696174622, "learning_rate": 2.637658662247805e-07, "loss": 0.12184674739837646, "memory(GiB)": 34.95, "step": 1070, "token_acc": 0.9590189382179447, "train_speed(iter/s)": 0.169294 }, { "epoch": 2.699905926622766, "grad_norm": 0.5974398851394653, "learning_rate": 2.430939125718218e-07, "loss": 0.12283775806427003, "memory(GiB)": 34.95, "step": 1075, "token_acc": 0.9571679809383332, "train_speed(iter/s)": 0.169407 }, { "epoch": 2.7124490435873314, "grad_norm": 0.6196507811546326, "learning_rate": 2.232452424240261e-07, "loss": 0.12050046920776367, "memory(GiB)": 34.95, "step": 1080, "token_acc": 0.9597835852963006, "train_speed(iter/s)": 0.169553 }, { "epoch": 2.7124490435873314, "eval_loss": 0.23875917494297028, "eval_runtime": 9.9675, "eval_samples_per_second": 25.784, "eval_steps_per_second": 6.521, "eval_token_acc": 0.9355821435753067, "step": 1080 }, { "epoch": 2.7249921605518974, "grad_norm": 0.6392203569412231, "learning_rate": 2.042232910148051e-07, "loss": 0.11989054679870606, "memory(GiB)": 34.95, "step": 1085, "token_acc": 0.9460828818275003, "train_speed(iter/s)": 0.16913 }, { "epoch": 2.737535277516463, "grad_norm": 0.6039083003997803, "learning_rate": 1.860313504963579e-07, "loss": 0.11684960126876831, "memory(GiB)": 34.95, "step": 1090, "token_acc": 0.9531301093630782, "train_speed(iter/s)": 0.169284 }, { "epoch": 2.7500783944810285, "grad_norm": 0.6016117930412292, "learning_rate": 1.6867256936989097e-07, "loss": 0.12414079904556274, "memory(GiB)": 34.95, "step": 1095, "token_acc": 0.9582522047875387, "train_speed(iter/s)": 0.169393 }, { "epoch": 2.762621511445594, "grad_norm": 0.609747588634491, "learning_rate": 1.521499519407038e-07, "loss": 0.11737879514694213, "memory(GiB)": 34.95, "step": 1100, "token_acc": 0.95666478832276, "train_speed(iter/s)": 0.169525 }, { "epoch": 2.762621511445594, "eval_loss": 0.2390962839126587, "eval_runtime": 9.9713, "eval_samples_per_second": 25.774, "eval_steps_per_second": 6.519, "eval_token_acc": 0.9355821435753067, "step": 1100 }, { "epoch": 2.7751646284101597, "grad_norm": 0.540483832359314, "learning_rate": 1.364663577982317e-07, "loss": 0.11655213832855224, "memory(GiB)": 34.95, "step": 1105, "token_acc": 0.947412632708209, "train_speed(iter/s)": 0.169129 }, { "epoch": 2.7877077453747257, "grad_norm": 0.6210323572158813, "learning_rate": 1.2162450132113202e-07, "loss": 0.12057442665100097, "memory(GiB)": 34.95, "step": 1110, "token_acc": 0.9559064846811715, "train_speed(iter/s)": 0.169327 }, { "epoch": 2.800250862339291, "grad_norm": 0.6889768242835999, "learning_rate": 1.07626951207504e-07, "loss": 0.12380859851837159, "memory(GiB)": 34.95, "step": 1115, "token_acc": 0.9528777568254488, "train_speed(iter/s)": 0.169457 }, { "epoch": 2.8127939793038568, "grad_norm": 0.5808271169662476, "learning_rate": 9.447613003032042e-08, "loss": 0.12284276485443116, "memory(GiB)": 34.95, "step": 1120, "token_acc": 0.9562657695542472, "train_speed(iter/s)": 0.169618 }, { "epoch": 2.8127939793038568, "eval_loss": 0.23907452821731567, "eval_runtime": 9.9798, "eval_samples_per_second": 25.752, "eval_steps_per_second": 6.513, "eval_token_acc": 0.9356977679469133, "step": 1120 }, { "epoch": 2.8253370962684228, "grad_norm": 0.6014649868011475, "learning_rate": 8.217431381815078e-08, "loss": 0.12331821918487548, "memory(GiB)": 34.95, "step": 1125, "token_acc": 0.9462024372046755, "train_speed(iter/s)": 0.169228 }, { "epoch": 2.8378802132329883, "grad_norm": 0.6275858283042908, "learning_rate": 7.072363166124363e-08, "loss": 0.12597228288650514, "memory(GiB)": 34.95, "step": 1130, "token_acc": 0.9567274137262505, "train_speed(iter/s)": 0.169368 }, { "epoch": 2.850423330197554, "grad_norm": 0.6735210418701172, "learning_rate": 6.012606534304688e-08, "loss": 0.12442328929901122, "memory(GiB)": 34.95, "step": 1135, "token_acc": 0.9592088998763906, "train_speed(iter/s)": 0.169498 }, { "epoch": 2.86296644716212, "grad_norm": 0.6021392345428467, "learning_rate": 5.038344899721437e-08, "loss": 0.13345457315444947, "memory(GiB)": 34.95, "step": 1140, "token_acc": 0.9546434206981488, "train_speed(iter/s)": 0.169641 }, { "epoch": 2.86296644716212, "eval_loss": 0.23904505372047424, "eval_runtime": 9.9416, "eval_samples_per_second": 25.851, "eval_steps_per_second": 6.538, "eval_token_acc": 0.9355821435753067, "step": 1140 }, { "epoch": 2.8755095641266855, "grad_norm": 0.6207495927810669, "learning_rate": 4.149746879017147e-08, "loss": 0.12285821437835694, "memory(GiB)": 34.95, "step": 1145, "token_acc": 0.9433157837334047, "train_speed(iter/s)": 0.169265 }, { "epoch": 2.888052681091251, "grad_norm": 0.5906082391738892, "learning_rate": 3.3469662629289635e-08, "loss": 0.1214226484298706, "memory(GiB)": 34.95, "step": 1150, "token_acc": 0.9550792604937793, "train_speed(iter/s)": 0.169398 }, { "epoch": 2.900595798055817, "grad_norm": 0.6598130464553833, "learning_rate": 2.630141989671542e-08, "loss": 0.12527458667755126, "memory(GiB)": 34.95, "step": 1155, "token_acc": 0.9544634286811288, "train_speed(iter/s)": 0.169521 }, { "epoch": 2.9131389150203826, "grad_norm": 0.6021043062210083, "learning_rate": 1.999398120891116e-08, "loss": 0.1194157361984253, "memory(GiB)": 34.95, "step": 1160, "token_acc": 0.9597829112162538, "train_speed(iter/s)": 0.169653 }, { "epoch": 2.9131389150203826, "eval_loss": 0.23905108869075775, "eval_runtime": 9.9613, "eval_samples_per_second": 25.8, "eval_steps_per_second": 6.525, "eval_token_acc": 0.935652523627589, "step": 1160 }, { "epoch": 2.925682031984948, "grad_norm": 0.5893052816390991, "learning_rate": 1.4548438201939518e-08, "loss": 0.11504523754119873, "memory(GiB)": 34.95, "step": 1165, "token_acc": 0.9459238731463808, "train_speed(iter/s)": 0.16932 }, { "epoch": 2.938225148949514, "grad_norm": 0.6343664526939392, "learning_rate": 9.965733342532925e-09, "loss": 0.1275886058807373, "memory(GiB)": 34.95, "step": 1170, "token_acc": 0.9558923448588253, "train_speed(iter/s)": 0.169447 }, { "epoch": 2.9507682659140797, "grad_norm": 0.6318545341491699, "learning_rate": 6.246659764979068e-09, "loss": 0.12401323318481446, "memory(GiB)": 34.95, "step": 1175, "token_acc": 0.9594298632943586, "train_speed(iter/s)": 0.169549 }, { "epoch": 2.9633113828786453, "grad_norm": 0.6376116275787354, "learning_rate": 3.3918611338507046e-09, "loss": 0.11943215131759644, "memory(GiB)": 34.95, "step": 1180, "token_acc": 0.9601843185254518, "train_speed(iter/s)": 0.169695 }, { "epoch": 2.9633113828786453, "eval_loss": 0.2389203906059265, "eval_runtime": 9.9873, "eval_samples_per_second": 25.733, "eval_steps_per_second": 6.508, "eval_token_acc": 0.935702795093505, "step": 1180 }, { "epoch": 2.9758544998432113, "grad_norm": 0.5881339311599731, "learning_rate": 1.4018315326103094e-09, "loss": 0.11981034278869629, "memory(GiB)": 34.95, "step": 1185, "token_acc": 0.9440527542267837, "train_speed(iter/s)": 0.169309 }, { "epoch": 2.988397616807777, "grad_norm": 0.6789658069610596, "learning_rate": 2.7691537809293454e-10, "loss": 0.1259629726409912, "memory(GiB)": 34.95, "step": 1190, "token_acc": 0.9503406881712312, "train_speed(iter/s)": 0.169392 }, { "epoch": 2.9984321103794294, "eval_loss": 0.23875285685062408, "eval_runtime": 9.984, "eval_samples_per_second": 25.741, "eval_steps_per_second": 6.51, "eval_token_acc": 0.9355570078423486, "step": 1194 } ], "logging_steps": 5, "max_steps": 1194, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.502521518607827e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }