{ "best_global_step": 980, "best_metric": 0.19523989, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v28-20250504-001458/checkpoint-980", "epoch": 1.9872038515139998, "eval_steps": 20, "global_step": 980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002027112631445585, "grad_norm": 2.521756172180176, "learning_rate": 9.999988720152121e-06, "loss": 0.36801594495773315, "memory(GiB)": 31.83, "step": 1, "token_acc": 0.8936768843476973, "train_speed(iter/s)": 0.061871 }, { "epoch": 0.010135563157227924, "grad_norm": 1.4690369367599487, "learning_rate": 9.999718006347703e-06, "loss": 0.29584014415740967, "memory(GiB)": 31.83, "step": 5, "token_acc": 0.9045181572689532, "train_speed(iter/s)": 0.11262 }, { "epoch": 0.020271126314455847, "grad_norm": 1.016095519065857, "learning_rate": 9.998872057198983e-06, "loss": 0.2859709024429321, "memory(GiB)": 31.83, "step": 10, "token_acc": 0.8981923761356186, "train_speed(iter/s)": 0.125063 }, { "epoch": 0.03040668947168377, "grad_norm": 0.8137410879135132, "learning_rate": 9.997462247974751e-06, "loss": 0.26002681255340576, "memory(GiB)": 31.83, "step": 15, "token_acc": 0.907031992397846, "train_speed(iter/s)": 0.130865 }, { "epoch": 0.040542252628911694, "grad_norm": 0.8540328145027161, "learning_rate": 9.995488737697912e-06, "loss": 0.24060523509979248, "memory(GiB)": 31.84, "step": 20, "token_acc": 0.9225532754538279, "train_speed(iter/s)": 0.133177 }, { "epoch": 0.040542252628911694, "eval_loss": 0.2592654228210449, "eval_runtime": 21.9072, "eval_samples_per_second": 14.516, "eval_steps_per_second": 3.652, "eval_token_acc": 0.9178476377490845, "step": 20 }, { "epoch": 0.05067781578613962, "grad_norm": 0.7922812104225159, "learning_rate": 9.992951748975412e-06, "loss": 0.24358665943145752, "memory(GiB)": 31.84, "step": 25, "token_acc": 0.9152290582047404, "train_speed(iter/s)": 0.114782 }, { "epoch": 0.06081337894336754, "grad_norm": 0.8630368709564209, "learning_rate": 9.98985156797314e-06, "loss": 0.23777904510498046, "memory(GiB)": 31.84, "step": 30, "token_acc": 0.9226734218674036, "train_speed(iter/s)": 0.118815 }, { "epoch": 0.07094894210059546, "grad_norm": 0.783414900302887, "learning_rate": 9.98618854438364e-06, "loss": 0.2334528923034668, "memory(GiB)": 31.84, "step": 35, "token_acc": 0.9277159363111174, "train_speed(iter/s)": 0.122415 }, { "epoch": 0.08108450525782339, "grad_norm": 0.8003538846969604, "learning_rate": 9.98196309138667e-06, "loss": 0.24934375286102295, "memory(GiB)": 31.84, "step": 40, "token_acc": 0.9100693397648477, "train_speed(iter/s)": 0.124779 }, { "epoch": 0.08108450525782339, "eval_loss": 0.24275335669517517, "eval_runtime": 21.8352, "eval_samples_per_second": 14.564, "eval_steps_per_second": 3.664, "eval_token_acc": 0.9218243456030826, "step": 40 }, { "epoch": 0.09122006841505131, "grad_norm": 0.8252556324005127, "learning_rate": 9.977175685602601e-06, "loss": 0.22385673522949218, "memory(GiB)": 31.84, "step": 45, "token_acc": 0.9232757395226219, "train_speed(iter/s)": 0.115407 }, { "epoch": 0.10135563157227924, "grad_norm": 0.7236311435699463, "learning_rate": 9.971826867038652e-06, "loss": 0.23449177742004396, "memory(GiB)": 31.84, "step": 50, "token_acc": 0.9244473590731481, "train_speed(iter/s)": 0.117372 }, { "epoch": 0.11149119472950716, "grad_norm": 0.7773425579071045, "learning_rate": 9.965917239027972e-06, "loss": 0.22451424598693848, "memory(GiB)": 31.84, "step": 55, "token_acc": 0.9317552435764683, "train_speed(iter/s)": 0.118883 }, { "epoch": 0.12162675788673508, "grad_norm": 0.7654985785484314, "learning_rate": 9.959447468161598e-06, "loss": 0.22769575119018554, "memory(GiB)": 31.84, "step": 60, "token_acc": 0.9208154305996638, "train_speed(iter/s)": 0.120828 }, { "epoch": 0.12162675788673508, "eval_loss": 0.23627901077270508, "eval_runtime": 21.6502, "eval_samples_per_second": 14.688, "eval_steps_per_second": 3.695, "eval_token_acc": 0.9226332722007126, "step": 60 }, { "epoch": 0.13176232104396302, "grad_norm": 0.8141059279441833, "learning_rate": 9.952418284213256e-06, "loss": 0.21704885959625245, "memory(GiB)": 31.84, "step": 65, "token_acc": 0.9179818989115771, "train_speed(iter/s)": 0.114505 }, { "epoch": 0.14189788420119093, "grad_norm": 0.6942949891090393, "learning_rate": 9.94483048005705e-06, "loss": 0.23090925216674804, "memory(GiB)": 31.84, "step": 70, "token_acc": 0.9222755382207283, "train_speed(iter/s)": 0.115729 }, { "epoch": 0.15203344735841884, "grad_norm": 0.7940172553062439, "learning_rate": 9.936684911578019e-06, "loss": 0.22984471321105956, "memory(GiB)": 31.84, "step": 75, "token_acc": 0.9262917933130699, "train_speed(iter/s)": 0.117067 }, { "epoch": 0.16216901051564678, "grad_norm": 0.7496793270111084, "learning_rate": 9.927982497575606e-06, "loss": 0.21851859092712403, "memory(GiB)": 31.84, "step": 80, "token_acc": 0.9258955118221719, "train_speed(iter/s)": 0.117832 }, { "epoch": 0.16216901051564678, "eval_loss": 0.231533482670784, "eval_runtime": 21.8888, "eval_samples_per_second": 14.528, "eval_steps_per_second": 3.655, "eval_token_acc": 0.9239361997739945, "step": 80 }, { "epoch": 0.1723045736728747, "grad_norm": 0.7115150690078735, "learning_rate": 9.918724219660013e-06, "loss": 0.2375343084335327, "memory(GiB)": 31.84, "step": 85, "token_acc": 0.9190241791529182, "train_speed(iter/s)": 0.113889 }, { "epoch": 0.18244013683010263, "grad_norm": 0.7307594418525696, "learning_rate": 9.908911122141486e-06, "loss": 0.21745119094848633, "memory(GiB)": 31.84, "step": 90, "token_acc": 0.9248731238527157, "train_speed(iter/s)": 0.11504 }, { "epoch": 0.19257569998733054, "grad_norm": 0.7595335841178894, "learning_rate": 9.898544311912507e-06, "loss": 0.23666539192199706, "memory(GiB)": 31.84, "step": 95, "token_acc": 0.9154884783300083, "train_speed(iter/s)": 0.116125 }, { "epoch": 0.20271126314455848, "grad_norm": 0.7457159161567688, "learning_rate": 9.887624958322945e-06, "loss": 0.22108936309814453, "memory(GiB)": 31.84, "step": 100, "token_acc": 0.9207408691631993, "train_speed(iter/s)": 0.117134 }, { "epoch": 0.20271126314455848, "eval_loss": 0.22748854756355286, "eval_runtime": 21.8753, "eval_samples_per_second": 14.537, "eval_steps_per_second": 3.657, "eval_token_acc": 0.9248686266155376, "step": 100 }, { "epoch": 0.2128468263017864, "grad_norm": 0.715534508228302, "learning_rate": 9.876154293048163e-06, "loss": 0.2108386516571045, "memory(GiB)": 31.84, "step": 105, "token_acc": 0.9273328473234743, "train_speed(iter/s)": 0.113574 }, { "epoch": 0.22298238945901433, "grad_norm": 0.7702275514602661, "learning_rate": 9.864133609950077e-06, "loss": 0.22842142581939698, "memory(GiB)": 31.84, "step": 110, "token_acc": 0.9225603164746199, "train_speed(iter/s)": 0.114631 }, { "epoch": 0.23311795261624224, "grad_norm": 0.7723402976989746, "learning_rate": 9.851564264931219e-06, "loss": 0.22526850700378417, "memory(GiB)": 31.84, "step": 115, "token_acc": 0.9270424907786587, "train_speed(iter/s)": 0.115388 }, { "epoch": 0.24325351577347015, "grad_norm": 0.7186813950538635, "learning_rate": 9.838447675781795e-06, "loss": 0.21562654972076417, "memory(GiB)": 31.84, "step": 120, "token_acc": 0.9274811342327921, "train_speed(iter/s)": 0.116096 }, { "epoch": 0.24325351577347015, "eval_loss": 0.22359323501586914, "eval_runtime": 21.7276, "eval_samples_per_second": 14.636, "eval_steps_per_second": 3.682, "eval_token_acc": 0.925652853164385, "step": 120 }, { "epoch": 0.25338907893069806, "grad_norm": 0.7766680717468262, "learning_rate": 9.824785322019753e-06, "loss": 0.2237046480178833, "memory(GiB)": 31.84, "step": 125, "token_acc": 0.9236945219863242, "train_speed(iter/s)": 0.113406 }, { "epoch": 0.26352464208792603, "grad_norm": 0.7407037019729614, "learning_rate": 9.81057874472391e-06, "loss": 0.219429874420166, "memory(GiB)": 31.84, "step": 130, "token_acc": 0.9289204018740761, "train_speed(iter/s)": 0.114239 }, { "epoch": 0.27366020524515394, "grad_norm": 0.7638006210327148, "learning_rate": 9.795829546360113e-06, "loss": 0.2275376319885254, "memory(GiB)": 31.84, "step": 135, "token_acc": 0.9208956514221492, "train_speed(iter/s)": 0.114967 }, { "epoch": 0.28379576840238185, "grad_norm": 0.7686223387718201, "learning_rate": 9.78053939060049e-06, "loss": 0.22452235221862793, "memory(GiB)": 31.84, "step": 140, "token_acc": 0.9189745452344199, "train_speed(iter/s)": 0.11575 }, { "epoch": 0.28379576840238185, "eval_loss": 0.22123830020427704, "eval_runtime": 21.7578, "eval_samples_per_second": 14.615, "eval_steps_per_second": 3.677, "eval_token_acc": 0.9263012294449282, "step": 140 }, { "epoch": 0.29393133155960977, "grad_norm": 0.6931352019309998, "learning_rate": 9.764710002135784e-06, "loss": 0.2003002643585205, "memory(GiB)": 31.84, "step": 145, "token_acc": 0.9311943836379415, "train_speed(iter/s)": 0.113318 }, { "epoch": 0.3040668947168377, "grad_norm": 0.800798773765564, "learning_rate": 9.748343166480823e-06, "loss": 0.20239923000335694, "memory(GiB)": 31.84, "step": 150, "token_acc": 0.9229381308508969, "train_speed(iter/s)": 0.113936 }, { "epoch": 0.31420245787406564, "grad_norm": 0.7786034345626831, "learning_rate": 9.731440729773114e-06, "loss": 0.22815029621124266, "memory(GiB)": 31.84, "step": 155, "token_acc": 0.9227938277122053, "train_speed(iter/s)": 0.114778 }, { "epoch": 0.32433802103129356, "grad_norm": 0.7402083277702332, "learning_rate": 9.714004598564599e-06, "loss": 0.19810022115707399, "memory(GiB)": 31.84, "step": 160, "token_acc": 0.9393871049589806, "train_speed(iter/s)": 0.115347 }, { "epoch": 0.32433802103129356, "eval_loss": 0.21936483681201935, "eval_runtime": 21.7335, "eval_samples_per_second": 14.632, "eval_steps_per_second": 3.681, "eval_token_acc": 0.9270422309084061, "step": 160 }, { "epoch": 0.33447358418852147, "grad_norm": 0.7241997718811035, "learning_rate": 9.696036739606606e-06, "loss": 0.21367735862731935, "memory(GiB)": 31.84, "step": 165, "token_acc": 0.9242776974538682, "train_speed(iter/s)": 0.113487 }, { "epoch": 0.3446091473457494, "grad_norm": 0.7917912006378174, "learning_rate": 9.677539179628005e-06, "loss": 0.23486363887786865, "memory(GiB)": 31.84, "step": 170, "token_acc": 0.9263079934401947, "train_speed(iter/s)": 0.114247 }, { "epoch": 0.35474471050297735, "grad_norm": 0.7224828004837036, "learning_rate": 9.658514005106596e-06, "loss": 0.22237610816955566, "memory(GiB)": 31.84, "step": 175, "token_acc": 0.9191534928283597, "train_speed(iter/s)": 0.114924 }, { "epoch": 0.36488027366020526, "grad_norm": 0.8869006037712097, "learning_rate": 9.638963362033756e-06, "loss": 0.2130965232849121, "memory(GiB)": 31.84, "step": 180, "token_acc": 0.9328146634139909, "train_speed(iter/s)": 0.115499 }, { "epoch": 0.36488027366020526, "eval_loss": 0.21775159239768982, "eval_runtime": 21.827, "eval_samples_per_second": 14.569, "eval_steps_per_second": 3.665, "eval_token_acc": 0.9271039810303625, "step": 180 }, { "epoch": 0.37501583681743317, "grad_norm": 0.7315487861633301, "learning_rate": 9.618889455672384e-06, "loss": 0.20544004440307617, "memory(GiB)": 31.84, "step": 185, "token_acc": 0.9269179022705522, "train_speed(iter/s)": 0.113738 }, { "epoch": 0.3851513999746611, "grad_norm": 0.7636704444885254, "learning_rate": 9.598294550308149e-06, "loss": 0.21708755493164061, "memory(GiB)": 31.84, "step": 190, "token_acc": 0.9192093078475603, "train_speed(iter/s)": 0.114239 }, { "epoch": 0.395286963131889, "grad_norm": 0.753697395324707, "learning_rate": 9.577180968994081e-06, "loss": 0.20737123489379883, "memory(GiB)": 31.84, "step": 195, "token_acc": 0.9441376276819314, "train_speed(iter/s)": 0.114771 }, { "epoch": 0.40542252628911696, "grad_norm": 0.670136570930481, "learning_rate": 9.55555109328855e-06, "loss": 0.22084522247314453, "memory(GiB)": 31.84, "step": 200, "token_acc": 0.9222005816280249, "train_speed(iter/s)": 0.115258 }, { "epoch": 0.40542252628911696, "eval_loss": 0.2161727398633957, "eval_runtime": 21.793, "eval_samples_per_second": 14.592, "eval_steps_per_second": 3.671, "eval_token_acc": 0.9274559567255145, "step": 200 }, { "epoch": 0.41555808944634487, "grad_norm": 0.7045581936836243, "learning_rate": 9.533407362986606e-06, "loss": 0.20071265697479249, "memory(GiB)": 31.84, "step": 205, "token_acc": 0.926633272095267, "train_speed(iter/s)": 0.11359 }, { "epoch": 0.4256936526035728, "grad_norm": 0.7426798343658447, "learning_rate": 9.51075227584481e-06, "loss": 0.2052762269973755, "memory(GiB)": 31.84, "step": 210, "token_acc": 0.9334708885538184, "train_speed(iter/s)": 0.114162 }, { "epoch": 0.4358292157608007, "grad_norm": 0.7467470169067383, "learning_rate": 9.487588387299465e-06, "loss": 0.2220769166946411, "memory(GiB)": 31.84, "step": 215, "token_acc": 0.9223140495867769, "train_speed(iter/s)": 0.114807 }, { "epoch": 0.44596477891802866, "grad_norm": 0.7024182677268982, "learning_rate": 9.463918310178385e-06, "loss": 0.21398956775665284, "memory(GiB)": 31.84, "step": 220, "token_acc": 0.9318319475799791, "train_speed(iter/s)": 0.115301 }, { "epoch": 0.44596477891802866, "eval_loss": 0.21440377831459045, "eval_runtime": 21.7402, "eval_samples_per_second": 14.627, "eval_steps_per_second": 3.68, "eval_token_acc": 0.9279561327133621, "step": 220 }, { "epoch": 0.45610034207525657, "grad_norm": 0.6610811948776245, "learning_rate": 9.439744714406167e-06, "loss": 0.20560271739959718, "memory(GiB)": 31.84, "step": 225, "token_acc": 0.9351413448427652, "train_speed(iter/s)": 0.113954 }, { "epoch": 0.4662359052324845, "grad_norm": 0.6910359859466553, "learning_rate": 9.415070326703039e-06, "loss": 0.22013638019561768, "memory(GiB)": 31.84, "step": 230, "token_acc": 0.9156021643331288, "train_speed(iter/s)": 0.114456 }, { "epoch": 0.4763714683897124, "grad_norm": 0.7237235903739929, "learning_rate": 9.38989793027728e-06, "loss": 0.2018293857574463, "memory(GiB)": 31.84, "step": 235, "token_acc": 0.9305291447985599, "train_speed(iter/s)": 0.114857 }, { "epoch": 0.4865070315469403, "grad_norm": 0.5631291270256042, "learning_rate": 9.364230364511296e-06, "loss": 0.20158102512359619, "memory(GiB)": 31.84, "step": 240, "token_acc": 0.9311977864098334, "train_speed(iter/s)": 0.115295 }, { "epoch": 0.4865070315469403, "eval_loss": 0.21314272284507751, "eval_runtime": 21.8121, "eval_samples_per_second": 14.579, "eval_steps_per_second": 3.668, "eval_token_acc": 0.9278943825914057, "step": 240 }, { "epoch": 0.49664259470416827, "grad_norm": 0.7533831596374512, "learning_rate": 9.338070524641329e-06, "loss": 0.22155873775482177, "memory(GiB)": 31.84, "step": 245, "token_acc": 0.9251282051282051, "train_speed(iter/s)": 0.114158 }, { "epoch": 0.5067781578613961, "grad_norm": 0.6942651271820068, "learning_rate": 9.3114213614309e-06, "loss": 0.203749680519104, "memory(GiB)": 31.84, "step": 250, "token_acc": 0.9276428102429416, "train_speed(iter/s)": 0.114514 }, { "epoch": 0.5169137210186241, "grad_norm": 0.7373226284980774, "learning_rate": 9.284285880837947e-06, "loss": 0.204945707321167, "memory(GiB)": 31.84, "step": 255, "token_acc": 0.9326470962002795, "train_speed(iter/s)": 0.114981 }, { "epoch": 0.5270492841758521, "grad_norm": 0.7045229077339172, "learning_rate": 9.256667143675789e-06, "loss": 0.21190428733825684, "memory(GiB)": 31.84, "step": 260, "token_acc": 0.9289341566160771, "train_speed(iter/s)": 0.115383 }, { "epoch": 0.5270492841758521, "eval_loss": 0.21087971329689026, "eval_runtime": 21.8346, "eval_samples_per_second": 14.564, "eval_steps_per_second": 3.664, "eval_token_acc": 0.9285427588719488, "step": 260 }, { "epoch": 0.5371848473330799, "grad_norm": 0.6742910742759705, "learning_rate": 9.228568265267845e-06, "loss": 0.20208969116210937, "memory(GiB)": 31.84, "step": 265, "token_acc": 0.9311459533033191, "train_speed(iter/s)": 0.11409 }, { "epoch": 0.5473204104903079, "grad_norm": 0.7655450701713562, "learning_rate": 9.199992415096261e-06, "loss": 0.22049922943115235, "memory(GiB)": 31.84, "step": 270, "token_acc": 0.9257197105762817, "train_speed(iter/s)": 0.114477 }, { "epoch": 0.5574559736475357, "grad_norm": 0.7306473255157471, "learning_rate": 9.170942816444376e-06, "loss": 0.21227545738220216, "memory(GiB)": 31.84, "step": 275, "token_acc": 0.9255821189648914, "train_speed(iter/s)": 0.114761 }, { "epoch": 0.5675915368047637, "grad_norm": 0.6912848353385925, "learning_rate": 9.141422746033158e-06, "loss": 0.2025892972946167, "memory(GiB)": 31.84, "step": 280, "token_acc": 0.9356907206914177, "train_speed(iter/s)": 0.115104 }, { "epoch": 0.5675915368047637, "eval_loss": 0.21015885472297668, "eval_runtime": 21.7959, "eval_samples_per_second": 14.59, "eval_steps_per_second": 3.67, "eval_token_acc": 0.9283204584329054, "step": 280 }, { "epoch": 0.5777270999619917, "grad_norm": 0.6503239274024963, "learning_rate": 9.111435533651595e-06, "loss": 0.20708324909210205, "memory(GiB)": 31.84, "step": 285, "token_acc": 0.9302453853948871, "train_speed(iter/s)": 0.11406 }, { "epoch": 0.5878626631192195, "grad_norm": 0.6230775117874146, "learning_rate": 9.08098456178111e-06, "loss": 0.21010513305664064, "memory(GiB)": 31.84, "step": 290, "token_acc": 0.9251024310297733, "train_speed(iter/s)": 0.114468 }, { "epoch": 0.5979982262764475, "grad_norm": 0.5951421856880188, "learning_rate": 9.050073265214006e-06, "loss": 0.20655345916748047, "memory(GiB)": 31.84, "step": 295, "token_acc": 0.9383279364144834, "train_speed(iter/s)": 0.114773 }, { "epoch": 0.6081337894336754, "grad_norm": 0.6480836868286133, "learning_rate": 9.01870513066605e-06, "loss": 0.19019179344177245, "memory(GiB)": 31.84, "step": 300, "token_acc": 0.9367692102809961, "train_speed(iter/s)": 0.115106 }, { "epoch": 0.6081337894336754, "eval_loss": 0.20902292430400848, "eval_runtime": 21.7988, "eval_samples_per_second": 14.588, "eval_steps_per_second": 3.67, "eval_token_acc": 0.9290552848841876, "step": 300 }, { "epoch": 0.6182693525909033, "grad_norm": 0.5634115934371948, "learning_rate": 8.986883696383174e-06, "loss": 0.18940932750701905, "memory(GiB)": 31.84, "step": 305, "token_acc": 0.9292788671890537, "train_speed(iter/s)": 0.114033 }, { "epoch": 0.6284049157481313, "grad_norm": 0.7795764803886414, "learning_rate": 8.95461255174237e-06, "loss": 0.21010336875915528, "memory(GiB)": 31.84, "step": 310, "token_acc": 0.922672088989893, "train_speed(iter/s)": 0.114389 }, { "epoch": 0.6385404789053591, "grad_norm": 0.6814590692520142, "learning_rate": 8.921895336846814e-06, "loss": 0.21159787178039552, "memory(GiB)": 31.84, "step": 315, "token_acc": 0.9389688078621279, "train_speed(iter/s)": 0.114683 }, { "epoch": 0.6486760420625871, "grad_norm": 0.6690118908882141, "learning_rate": 8.888735742115268e-06, "loss": 0.1964055299758911, "memory(GiB)": 31.84, "step": 320, "token_acc": 0.9258384996498146, "train_speed(iter/s)": 0.114995 }, { "epoch": 0.6486760420625871, "eval_loss": 0.20818451046943665, "eval_runtime": 21.7939, "eval_samples_per_second": 14.591, "eval_steps_per_second": 3.671, "eval_token_acc": 0.929296110359818, "step": 320 }, { "epoch": 0.6588116052198151, "grad_norm": 0.6912645697593689, "learning_rate": 8.855137507865831e-06, "loss": 0.19853589534759522, "memory(GiB)": 31.84, "step": 325, "token_acc": 0.9269768893593345, "train_speed(iter/s)": 0.114013 }, { "epoch": 0.6689471683770429, "grad_norm": 0.6451709866523743, "learning_rate": 8.821104423894015e-06, "loss": 0.20868840217590331, "memory(GiB)": 31.84, "step": 330, "token_acc": 0.9279026926085749, "train_speed(iter/s)": 0.114452 }, { "epoch": 0.6790827315342709, "grad_norm": 0.6472577452659607, "learning_rate": 8.786640329045279e-06, "loss": 0.20419092178344728, "memory(GiB)": 31.84, "step": 335, "token_acc": 0.9282166508987701, "train_speed(iter/s)": 0.114692 }, { "epoch": 0.6892182946914988, "grad_norm": 0.6527739763259888, "learning_rate": 8.751749110782013e-06, "loss": 0.20285537242889404, "memory(GiB)": 31.84, "step": 340, "token_acc": 0.9387946219536667, "train_speed(iter/s)": 0.114992 }, { "epoch": 0.6892182946914988, "eval_loss": 0.20785662531852722, "eval_runtime": 21.816, "eval_samples_per_second": 14.576, "eval_steps_per_second": 3.667, "eval_token_acc": 0.9291232100183398, "step": 340 }, { "epoch": 0.6993538578487267, "grad_norm": 0.7979351878166199, "learning_rate": 8.716434704745047e-06, "loss": 0.21715948581695557, "memory(GiB)": 31.84, "step": 345, "token_acc": 0.9273517017828201, "train_speed(iter/s)": 0.114123 }, { "epoch": 0.7094894210059547, "grad_norm": 0.7549222707748413, "learning_rate": 8.680701094309716e-06, "loss": 0.19147870540618897, "memory(GiB)": 31.84, "step": 350, "token_acc": 0.9354483972067961, "train_speed(iter/s)": 0.114431 }, { "epoch": 0.7196249841631825, "grad_norm": 0.6661515831947327, "learning_rate": 8.644552310136547e-06, "loss": 0.19406793117523194, "memory(GiB)": 31.84, "step": 355, "token_acc": 0.9296775620817432, "train_speed(iter/s)": 0.114715 }, { "epoch": 0.7297605473204105, "grad_norm": 0.7383010983467102, "learning_rate": 8.60799242971661e-06, "loss": 0.19579734802246093, "memory(GiB)": 31.84, "step": 360, "token_acc": 0.92644679160905, "train_speed(iter/s)": 0.115022 }, { "epoch": 0.7297605473204105, "eval_loss": 0.20642724633216858, "eval_runtime": 21.8294, "eval_samples_per_second": 14.568, "eval_steps_per_second": 3.665, "eval_token_acc": 0.9297221862013177, "step": 360 }, { "epoch": 0.7398961104776384, "grad_norm": 0.763937771320343, "learning_rate": 8.571025576911587e-06, "loss": 0.19617555141448975, "memory(GiB)": 31.84, "step": 365, "token_acc": 0.9303160729846056, "train_speed(iter/s)": 0.114163 }, { "epoch": 0.7500316736348663, "grad_norm": 0.7166250348091125, "learning_rate": 8.533655921488612e-06, "loss": 0.21308059692382814, "memory(GiB)": 31.84, "step": 370, "token_acc": 0.9230729085120818, "train_speed(iter/s)": 0.114429 }, { "epoch": 0.7601672367920943, "grad_norm": 0.765160858631134, "learning_rate": 8.495887678649933e-06, "loss": 0.2089679002761841, "memory(GiB)": 31.84, "step": 375, "token_acc": 0.9291193218752438, "train_speed(iter/s)": 0.114729 }, { "epoch": 0.7703027999493222, "grad_norm": 0.6891535520553589, "learning_rate": 8.457725108557447e-06, "loss": 0.20386552810668945, "memory(GiB)": 31.84, "step": 380, "token_acc": 0.930313113680995, "train_speed(iter/s)": 0.114967 }, { "epoch": 0.7703027999493222, "eval_loss": 0.20559069514274597, "eval_runtime": 21.7942, "eval_samples_per_second": 14.591, "eval_steps_per_second": 3.671, "eval_token_acc": 0.9297468862501004, "step": 380 }, { "epoch": 0.7804383631065501, "grad_norm": 0.7166603803634644, "learning_rate": 8.41917251585216e-06, "loss": 0.19558722972869874, "memory(GiB)": 31.84, "step": 385, "token_acc": 0.9298326573239745, "train_speed(iter/s)": 0.114165 }, { "epoch": 0.790573926263778, "grad_norm": 0.7598738074302673, "learning_rate": 8.380234249168642e-06, "loss": 0.1924947738647461, "memory(GiB)": 31.84, "step": 390, "token_acc": 0.9257823458899742, "train_speed(iter/s)": 0.11441 }, { "epoch": 0.800709489421006, "grad_norm": 0.7150676846504211, "learning_rate": 8.340914700644507e-06, "loss": 0.20020556449890137, "memory(GiB)": 31.84, "step": 395, "token_acc": 0.9267301944153489, "train_speed(iter/s)": 0.114769 }, { "epoch": 0.8108450525782339, "grad_norm": 0.6519859433174133, "learning_rate": 8.301218305424994e-06, "loss": 0.19768332242965697, "memory(GiB)": 31.84, "step": 400, "token_acc": 0.923796935692706, "train_speed(iter/s)": 0.115018 }, { "epoch": 0.8108450525782339, "eval_loss": 0.20375800132751465, "eval_runtime": 21.8998, "eval_samples_per_second": 14.521, "eval_steps_per_second": 3.653, "eval_token_acc": 0.9307348882014042, "step": 400 }, { "epoch": 0.8209806157354618, "grad_norm": 0.7148239612579346, "learning_rate": 8.261149541162693e-06, "loss": 0.20124292373657227, "memory(GiB)": 31.84, "step": 405, "token_acc": 0.9281172612274948, "train_speed(iter/s)": 0.114267 }, { "epoch": 0.8311161788926897, "grad_norm": 0.6164042353630066, "learning_rate": 8.22071292751247e-06, "loss": 0.18608367443084717, "memory(GiB)": 31.84, "step": 410, "token_acc": 0.9256482433822335, "train_speed(iter/s)": 0.11456 }, { "epoch": 0.8412517420499176, "grad_norm": 0.722022294998169, "learning_rate": 8.179913025621676e-06, "loss": 0.19683008193969725, "memory(GiB)": 31.84, "step": 415, "token_acc": 0.9332604223628362, "train_speed(iter/s)": 0.114887 }, { "epoch": 0.8513873052071456, "grad_norm": 0.6409715414047241, "learning_rate": 8.138754437615652e-06, "loss": 0.20776214599609374, "memory(GiB)": 31.84, "step": 420, "token_acc": 0.9207910620264543, "train_speed(iter/s)": 0.115145 }, { "epoch": 0.8513873052071456, "eval_loss": 0.202426478266716, "eval_runtime": 21.7226, "eval_samples_per_second": 14.639, "eval_steps_per_second": 3.683, "eval_token_acc": 0.9311918391038823, "step": 420 }, { "epoch": 0.8615228683643735, "grad_norm": 0.6910145282745361, "learning_rate": 8.097241806078616e-06, "loss": 0.2034536838531494, "memory(GiB)": 31.84, "step": 425, "token_acc": 0.9296462086074857, "train_speed(iter/s)": 0.114436 }, { "epoch": 0.8716584315216014, "grad_norm": 0.7244237661361694, "learning_rate": 8.055379813530002e-06, "loss": 0.20410799980163574, "memory(GiB)": 31.84, "step": 430, "token_acc": 0.9354806441932579, "train_speed(iter/s)": 0.114688 }, { "epoch": 0.8817939946788294, "grad_norm": 0.6427024602890015, "learning_rate": 8.013173181896283e-06, "loss": 0.1945526719093323, "memory(GiB)": 31.84, "step": 435, "token_acc": 0.9243187942908474, "train_speed(iter/s)": 0.114942 }, { "epoch": 0.8919295578360573, "grad_norm": 0.6878598928451538, "learning_rate": 7.970626671978336e-06, "loss": 0.19251216650009156, "memory(GiB)": 31.84, "step": 440, "token_acc": 0.9332632422148793, "train_speed(iter/s)": 0.115122 }, { "epoch": 0.8919295578360573, "eval_loss": 0.20254245400428772, "eval_runtime": 21.8009, "eval_samples_per_second": 14.587, "eval_steps_per_second": 3.67, "eval_token_acc": 0.9308954385184911, "step": 440 }, { "epoch": 0.9020651209932852, "grad_norm": 0.679250955581665, "learning_rate": 7.927745082914453e-06, "loss": 0.19701287746429444, "memory(GiB)": 31.84, "step": 445, "token_acc": 0.9282149553325245, "train_speed(iter/s)": 0.1144 }, { "epoch": 0.9122006841505131, "grad_norm": 0.7318421602249146, "learning_rate": 7.884533251639e-06, "loss": 0.1975053071975708, "memory(GiB)": 31.84, "step": 450, "token_acc": 0.9288036746490503, "train_speed(iter/s)": 0.114655 }, { "epoch": 0.922336247307741, "grad_norm": 0.7219140529632568, "learning_rate": 7.840996052336827e-06, "loss": 0.22328760623931884, "memory(GiB)": 31.84, "step": 455, "token_acc": 0.9224597234555373, "train_speed(iter/s)": 0.114914 }, { "epoch": 0.932471810464969, "grad_norm": 0.6490547060966492, "learning_rate": 7.797138395893471e-06, "loss": 0.20440030097961426, "memory(GiB)": 31.84, "step": 460, "token_acc": 0.9310429568465546, "train_speed(iter/s)": 0.115177 }, { "epoch": 0.932471810464969, "eval_loss": 0.2018972933292389, "eval_runtime": 21.7035, "eval_samples_per_second": 14.652, "eval_steps_per_second": 3.686, "eval_token_acc": 0.9313091643355995, "step": 460 }, { "epoch": 0.9426073736221969, "grad_norm": 0.6232012510299683, "learning_rate": 7.75296522934122e-06, "loss": 0.19165756702423095, "memory(GiB)": 31.84, "step": 465, "token_acc": 0.932452931203712, "train_speed(iter/s)": 0.114511 }, { "epoch": 0.9527429367794248, "grad_norm": 0.638760507106781, "learning_rate": 7.708481535301101e-06, "loss": 0.180339515209198, "memory(GiB)": 31.84, "step": 470, "token_acc": 0.9387325860490436, "train_speed(iter/s)": 0.114716 }, { "epoch": 0.9628784999366528, "grad_norm": 0.7850983142852783, "learning_rate": 7.663692331420857e-06, "loss": 0.20282835960388185, "memory(GiB)": 31.84, "step": 475, "token_acc": 0.9294787942210982, "train_speed(iter/s)": 0.114974 }, { "epoch": 0.9730140630938806, "grad_norm": 0.7070600390434265, "learning_rate": 7.6186026698089584e-06, "loss": 0.20282812118530275, "memory(GiB)": 31.84, "step": 480, "token_acc": 0.9263814944005875, "train_speed(iter/s)": 0.115173 }, { "epoch": 0.9730140630938806, "eval_loss": 0.2006615400314331, "eval_runtime": 21.7993, "eval_samples_per_second": 14.588, "eval_steps_per_second": 3.67, "eval_token_acc": 0.931636439981969, "step": 480 }, { "epoch": 0.9831496262511086, "grad_norm": 0.6108932495117188, "learning_rate": 7.5732176364647515e-06, "loss": 0.18763720989227295, "memory(GiB)": 31.84, "step": 485, "token_acc": 0.9314041294819665, "train_speed(iter/s)": 0.114551 }, { "epoch": 0.9932851894083365, "grad_norm": 0.7140085101127625, "learning_rate": 7.527542350704759e-06, "loss": 0.20563454627990724, "memory(GiB)": 31.84, "step": 490, "token_acc": 0.9261781443826331, "train_speed(iter/s)": 0.114801 }, { "epoch": 1.0040542252628912, "grad_norm": 0.6436912417411804, "learning_rate": 7.481581964585245e-06, "loss": 0.2002246856689453, "memory(GiB)": 31.84, "step": 495, "token_acc": 0.9414402391501628, "train_speed(iter/s)": 0.114919 }, { "epoch": 1.0141897884201192, "grad_norm": 0.6124146580696106, "learning_rate": 7.435341662321063e-06, "loss": 0.14853100776672362, "memory(GiB)": 31.84, "step": 500, "token_acc": 0.9471966205837173, "train_speed(iter/s)": 0.115165 }, { "epoch": 1.0141897884201192, "eval_loss": 0.20782382786273956, "eval_runtime": 21.7236, "eval_samples_per_second": 14.638, "eval_steps_per_second": 3.683, "eval_token_acc": 0.9316426149941646, "step": 500 }, { "epoch": 1.024325351577347, "grad_norm": 0.753773033618927, "learning_rate": 7.388826659700902e-06, "loss": 0.13870035409927367, "memory(GiB)": 31.84, "step": 505, "token_acc": 0.9419344271245667, "train_speed(iter/s)": 0.114525 }, { "epoch": 1.034460914734575, "grad_norm": 0.6324412226676941, "learning_rate": 7.342042203498952e-06, "loss": 0.14452635049819945, "memory(GiB)": 31.84, "step": 510, "token_acc": 0.9496112148490845, "train_speed(iter/s)": 0.114737 }, { "epoch": 1.0445964778918029, "grad_norm": 0.568839430809021, "learning_rate": 7.2949935708830825e-06, "loss": 0.13303987979888915, "memory(GiB)": 31.84, "step": 515, "token_acc": 0.9554845690143747, "train_speed(iter/s)": 0.114956 }, { "epoch": 1.0547320410490308, "grad_norm": 0.7018365859985352, "learning_rate": 7.247686068819592e-06, "loss": 0.1495302438735962, "memory(GiB)": 31.84, "step": 520, "token_acc": 0.9394168981676347, "train_speed(iter/s)": 0.115106 }, { "epoch": 1.0547320410490308, "eval_loss": 0.20550589263439178, "eval_runtime": 21.8398, "eval_samples_per_second": 14.561, "eval_steps_per_second": 3.663, "eval_token_acc": 0.9316549650185559, "step": 520 }, { "epoch": 1.0648676042062588, "grad_norm": 0.6878096461296082, "learning_rate": 7.200125033474599e-06, "loss": 0.1441951036453247, "memory(GiB)": 31.84, "step": 525, "token_acc": 0.9410242560514515, "train_speed(iter/s)": 0.114529 }, { "epoch": 1.0750031673634866, "grad_norm": 0.5951861143112183, "learning_rate": 7.152315829612124e-06, "loss": 0.141739821434021, "memory(GiB)": 31.84, "step": 530, "token_acc": 0.9511068989817988, "train_speed(iter/s)": 0.11475 }, { "epoch": 1.0851387305207145, "grad_norm": 0.7104572057723999, "learning_rate": 7.104263849988976e-06, "loss": 0.13958663940429689, "memory(GiB)": 31.84, "step": 535, "token_acc": 0.9464745976287302, "train_speed(iter/s)": 0.114952 }, { "epoch": 1.0952742936779425, "grad_norm": 0.7287134528160095, "learning_rate": 7.055974514746446e-06, "loss": 0.1433733582496643, "memory(GiB)": 31.84, "step": 540, "token_acc": 0.9533915015281474, "train_speed(iter/s)": 0.11514 }, { "epoch": 1.0952742936779425, "eval_loss": 0.20595994591712952, "eval_runtime": 21.8259, "eval_samples_per_second": 14.57, "eval_steps_per_second": 3.665, "eval_token_acc": 0.9320995658966427, "step": 540 }, { "epoch": 1.1054098568351705, "grad_norm": 0.7750824093818665, "learning_rate": 7.007453270798937e-06, "loss": 0.14028260707855225, "memory(GiB)": 31.84, "step": 545, "token_acc": 0.9427210537187573, "train_speed(iter/s)": 0.114614 }, { "epoch": 1.1155454199923984, "grad_norm": 0.6719576120376587, "learning_rate": 6.95870559121957e-06, "loss": 0.1443098783493042, "memory(GiB)": 31.84, "step": 550, "token_acc": 0.9419209173821336, "train_speed(iter/s)": 0.114811 }, { "epoch": 1.1256809831496262, "grad_norm": 0.6712535619735718, "learning_rate": 6.909736974622827e-06, "loss": 0.1412811279296875, "memory(GiB)": 31.84, "step": 555, "token_acc": 0.9537117362746313, "train_speed(iter/s)": 0.114963 }, { "epoch": 1.1358165463068541, "grad_norm": 0.7601253390312195, "learning_rate": 6.860552944544325e-06, "loss": 0.1436458110809326, "memory(GiB)": 31.84, "step": 560, "token_acc": 0.9485312439436308, "train_speed(iter/s)": 0.11514 }, { "epoch": 1.1358165463068541, "eval_loss": 0.20563021302223206, "eval_runtime": 21.7336, "eval_samples_per_second": 14.632, "eval_steps_per_second": 3.681, "eval_token_acc": 0.9317969902990558, "step": 560 }, { "epoch": 1.145952109464082, "grad_norm": 0.7179679870605469, "learning_rate": 6.811159048817773e-06, "loss": 0.1442504644393921, "memory(GiB)": 31.84, "step": 565, "token_acc": 0.9380280917370789, "train_speed(iter/s)": 0.114635 }, { "epoch": 1.15608767262131, "grad_norm": 0.7179620265960693, "learning_rate": 6.7615608589491935e-06, "loss": 0.13410391807556152, "memory(GiB)": 31.84, "step": 570, "token_acc": 0.9558909767726473, "train_speed(iter/s)": 0.114806 }, { "epoch": 1.166223235778538, "grad_norm": 0.6108378171920776, "learning_rate": 6.711763969488472e-06, "loss": 0.13897714614868165, "memory(GiB)": 31.84, "step": 575, "token_acc": 0.9451023932889218, "train_speed(iter/s)": 0.114992 }, { "epoch": 1.1763587989357658, "grad_norm": 0.6707165837287903, "learning_rate": 6.6617739973982985e-06, "loss": 0.14145419597625733, "memory(GiB)": 31.84, "step": 580, "token_acc": 0.9474374323014652, "train_speed(iter/s)": 0.115192 }, { "epoch": 1.1763587989357658, "eval_loss": 0.20474562048912048, "eval_runtime": 21.8012, "eval_samples_per_second": 14.586, "eval_steps_per_second": 3.67, "eval_token_acc": 0.9320069407137079, "step": 580 }, { "epoch": 1.1864943620929937, "grad_norm": 0.7210031747817993, "learning_rate": 6.6115965814206e-06, "loss": 0.1450878858566284, "memory(GiB)": 31.84, "step": 585, "token_acc": 0.9400503909192447, "train_speed(iter/s)": 0.114706 }, { "epoch": 1.1966299252502217, "grad_norm": 0.6648117899894714, "learning_rate": 6.561237381440491e-06, "loss": 0.1425154447555542, "memory(GiB)": 31.84, "step": 590, "token_acc": 0.9425449072407813, "train_speed(iter/s)": 0.114844 }, { "epoch": 1.2067654884074497, "grad_norm": 0.7065860629081726, "learning_rate": 6.510702077847864e-06, "loss": 0.14567887783050537, "memory(GiB)": 31.84, "step": 595, "token_acc": 0.9487148690847946, "train_speed(iter/s)": 0.115008 }, { "epoch": 1.2169010515646776, "grad_norm": 0.6811427474021912, "learning_rate": 6.459996370896653e-06, "loss": 0.1396160125732422, "memory(GiB)": 31.84, "step": 600, "token_acc": 0.9510217472815898, "train_speed(iter/s)": 0.11519 }, { "epoch": 1.2169010515646776, "eval_loss": 0.204533189535141, "eval_runtime": 21.8513, "eval_samples_per_second": 14.553, "eval_steps_per_second": 3.661, "eval_token_acc": 0.9319143155307732, "step": 600 }, { "epoch": 1.2270366147219054, "grad_norm": 0.6900855302810669, "learning_rate": 6.409125980061852e-06, "loss": 0.15422054529190063, "memory(GiB)": 31.84, "step": 605, "token_acc": 0.938386855862584, "train_speed(iter/s)": 0.114682 }, { "epoch": 1.2371721778791334, "grad_norm": 0.7017642259597778, "learning_rate": 6.358096643394387e-06, "loss": 0.14031555652618408, "memory(GiB)": 31.84, "step": 610, "token_acc": 0.9535683058461724, "train_speed(iter/s)": 0.114824 }, { "epoch": 1.2473077410363613, "grad_norm": 0.6713739037513733, "learning_rate": 6.306914116873863e-06, "loss": 0.14935390949249266, "memory(GiB)": 31.84, "step": 615, "token_acc": 0.9406998158379374, "train_speed(iter/s)": 0.114993 }, { "epoch": 1.2574433041935893, "grad_norm": 0.6819109916687012, "learning_rate": 6.255584173759319e-06, "loss": 0.1377212643623352, "memory(GiB)": 31.84, "step": 620, "token_acc": 0.9501051424201874, "train_speed(iter/s)": 0.115142 }, { "epoch": 1.2574433041935893, "eval_loss": 0.2050146609544754, "eval_runtime": 21.8467, "eval_samples_per_second": 14.556, "eval_steps_per_second": 3.662, "eval_token_acc": 0.9317105401283168, "step": 620 }, { "epoch": 1.2675788673508173, "grad_norm": 0.7029967308044434, "learning_rate": 6.2041126039380065e-06, "loss": 0.14089449644088745, "memory(GiB)": 31.84, "step": 625, "token_acc": 0.9445310259703018, "train_speed(iter/s)": 0.114661 }, { "epoch": 1.2777144305080452, "grad_norm": 0.7382521629333496, "learning_rate": 6.152505213272308e-06, "loss": 0.15079411268234252, "memory(GiB)": 31.84, "step": 630, "token_acc": 0.9498072379057331, "train_speed(iter/s)": 0.114839 }, { "epoch": 1.287849993665273, "grad_norm": 0.7125928997993469, "learning_rate": 6.100767822944856e-06, "loss": 0.13632259368896485, "memory(GiB)": 31.84, "step": 635, "token_acc": 0.9514033264033264, "train_speed(iter/s)": 0.115011 }, { "epoch": 1.297985556822501, "grad_norm": 0.7053549885749817, "learning_rate": 6.048906268801915e-06, "loss": 0.1401703953742981, "memory(GiB)": 31.84, "step": 640, "token_acc": 0.9504383760540571, "train_speed(iter/s)": 0.115182 }, { "epoch": 1.297985556822501, "eval_loss": 0.20237460732460022, "eval_runtime": 21.7848, "eval_samples_per_second": 14.597, "eval_steps_per_second": 3.672, "eval_token_acc": 0.9321427909820121, "step": 640 }, { "epoch": 1.308121119979729, "grad_norm": 0.7114176750183105, "learning_rate": 5.9969264006951135e-06, "loss": 0.1354852318763733, "memory(GiB)": 31.84, "step": 645, "token_acc": 0.9398549466401315, "train_speed(iter/s)": 0.114762 }, { "epoch": 1.3182566831369569, "grad_norm": 0.6910070776939392, "learning_rate": 5.944834081821589e-06, "loss": 0.1504984140396118, "memory(GiB)": 31.84, "step": 650, "token_acc": 0.946494742867472, "train_speed(iter/s)": 0.114975 }, { "epoch": 1.3283922462941846, "grad_norm": 0.7024128437042236, "learning_rate": 5.892635188062647e-06, "loss": 0.1456763982772827, "memory(GiB)": 31.84, "step": 655, "token_acc": 0.9487756192189828, "train_speed(iter/s)": 0.115131 }, { "epoch": 1.3385278094514126, "grad_norm": 0.6766383647918701, "learning_rate": 5.8403356073209636e-06, "loss": 0.13979326486587523, "memory(GiB)": 31.84, "step": 660, "token_acc": 0.9429099638530946, "train_speed(iter/s)": 0.115294 }, { "epoch": 1.3385278094514126, "eval_loss": 0.2033311426639557, "eval_runtime": 21.7863, "eval_samples_per_second": 14.596, "eval_steps_per_second": 3.672, "eval_token_acc": 0.9320625158234688, "step": 660 }, { "epoch": 1.3486633726086406, "grad_norm": 0.704328179359436, "learning_rate": 5.787941238856456e-06, "loss": 0.14371912479400634, "memory(GiB)": 31.84, "step": 665, "token_acc": 0.9368773059543715, "train_speed(iter/s)": 0.114861 }, { "epoch": 1.3587989357658685, "grad_norm": 0.6802431344985962, "learning_rate": 5.735457992620851e-06, "loss": 0.1401059865951538, "memory(GiB)": 31.84, "step": 670, "token_acc": 0.9508861724428956, "train_speed(iter/s)": 0.115032 }, { "epoch": 1.3689344989230965, "grad_norm": 0.6354742646217346, "learning_rate": 5.682891788591066e-06, "loss": 0.13931301832199097, "memory(GiB)": 31.84, "step": 675, "token_acc": 0.9524576521435666, "train_speed(iter/s)": 0.115144 }, { "epoch": 1.3790700620803245, "grad_norm": 0.6422320604324341, "learning_rate": 5.630248556101448e-06, "loss": 0.14210424423217774, "memory(GiB)": 31.84, "step": 680, "token_acc": 0.9509293240203622, "train_speed(iter/s)": 0.115296 }, { "epoch": 1.3790700620803245, "eval_loss": 0.20212741196155548, "eval_runtime": 21.7681, "eval_samples_per_second": 14.609, "eval_steps_per_second": 3.675, "eval_token_acc": 0.9325071167015555, "step": 680 }, { "epoch": 1.3892056252375522, "grad_norm": 0.629282534122467, "learning_rate": 5.5775342331749525e-06, "loss": 0.13471572399139403, "memory(GiB)": 31.84, "step": 685, "token_acc": 0.9387646219305616, "train_speed(iter/s)": 0.11486 }, { "epoch": 1.3993411883947802, "grad_norm": 0.6799649000167847, "learning_rate": 5.5247547658533604e-06, "loss": 0.146738600730896, "memory(GiB)": 31.84, "step": 690, "token_acc": 0.9462739584104196, "train_speed(iter/s)": 0.11501 }, { "epoch": 1.4094767515520081, "grad_norm": 0.7268623113632202, "learning_rate": 5.471916107526577e-06, "loss": 0.14002842903137208, "memory(GiB)": 31.84, "step": 695, "token_acc": 0.9498395231545163, "train_speed(iter/s)": 0.115121 }, { "epoch": 1.419612314709236, "grad_norm": 0.6405200362205505, "learning_rate": 5.419024218261098e-06, "loss": 0.1415479898452759, "memory(GiB)": 31.84, "step": 700, "token_acc": 0.9470242347639721, "train_speed(iter/s)": 0.11524 }, { "epoch": 1.419612314709236, "eval_loss": 0.20304131507873535, "eval_runtime": 21.7388, "eval_samples_per_second": 14.628, "eval_steps_per_second": 3.68, "eval_token_acc": 0.9326059168966858, "step": 700 }, { "epoch": 1.4297478778664638, "grad_norm": 0.7136779427528381, "learning_rate": 5.366085064127734e-06, "loss": 0.14102948904037477, "memory(GiB)": 31.84, "step": 705, "token_acc": 0.9402442511130172, "train_speed(iter/s)": 0.114822 }, { "epoch": 1.4398834410236918, "grad_norm": 0.7035164833068848, "learning_rate": 5.313104616528656e-06, "loss": 0.15270428657531737, "memory(GiB)": 31.84, "step": 710, "token_acc": 0.9444076404990615, "train_speed(iter/s)": 0.114935 }, { "epoch": 1.4500190041809198, "grad_norm": 0.7185565233230591, "learning_rate": 5.260088851523833e-06, "loss": 0.14676375389099122, "memory(GiB)": 31.84, "step": 715, "token_acc": 0.9527322682405617, "train_speed(iter/s)": 0.11512 }, { "epoch": 1.4601545673381477, "grad_norm": 0.712411105632782, "learning_rate": 5.207043749156945e-06, "loss": 0.13786702156066893, "memory(GiB)": 31.84, "step": 720, "token_acc": 0.9517344563892275, "train_speed(iter/s)": 0.115272 }, { "epoch": 1.4601545673381477, "eval_loss": 0.20220014452934265, "eval_runtime": 21.8005, "eval_samples_per_second": 14.587, "eval_steps_per_second": 3.67, "eval_token_acc": 0.9331431429577074, "step": 720 }, { "epoch": 1.4702901304953757, "grad_norm": 0.7098857760429382, "learning_rate": 5.153975292780852e-06, "loss": 0.13870218992233277, "memory(GiB)": 31.84, "step": 725, "token_acc": 0.9459776516453177, "train_speed(iter/s)": 0.114836 }, { "epoch": 1.4804256936526037, "grad_norm": 0.6736304759979248, "learning_rate": 5.10088946838269e-06, "loss": 0.15405884981155396, "memory(GiB)": 31.84, "step": 730, "token_acc": 0.943327239488117, "train_speed(iter/s)": 0.115018 }, { "epoch": 1.4905612568098314, "grad_norm": 0.6198113560676575, "learning_rate": 5.0477922639086594e-06, "loss": 0.12916960716247558, "memory(GiB)": 31.84, "step": 735, "token_acc": 0.9533303842264914, "train_speed(iter/s)": 0.115172 }, { "epoch": 1.5006968199670594, "grad_norm": 0.6400096416473389, "learning_rate": 4.99468966858861e-06, "loss": 0.13272554874420167, "memory(GiB)": 31.84, "step": 740, "token_acc": 0.9574656362456492, "train_speed(iter/s)": 0.115287 }, { "epoch": 1.5006968199670594, "eval_loss": 0.20150884985923767, "eval_runtime": 21.7413, "eval_samples_per_second": 14.627, "eval_steps_per_second": 3.68, "eval_token_acc": 0.9324885916649686, "step": 740 }, { "epoch": 1.5108323831242874, "grad_norm": 0.6690772771835327, "learning_rate": 4.941587672260461e-06, "loss": 0.1432182550430298, "memory(GiB)": 31.84, "step": 745, "token_acc": 0.9370182365106223, "train_speed(iter/s)": 0.114917 }, { "epoch": 1.520967946281515, "grad_norm": 0.6275672912597656, "learning_rate": 4.888492264694566e-06, "loss": 0.1296250343322754, "memory(GiB)": 31.84, "step": 750, "token_acc": 0.9563844971453667, "train_speed(iter/s)": 0.115024 }, { "epoch": 1.531103509438743, "grad_norm": 0.7383422255516052, "learning_rate": 4.8354094349180885e-06, "loss": 0.13504064083099365, "memory(GiB)": 31.84, "step": 755, "token_acc": 0.951349361424971, "train_speed(iter/s)": 0.115159 }, { "epoch": 1.541239072595971, "grad_norm": 0.7565678358078003, "learning_rate": 4.782345170539441e-06, "loss": 0.13883064985275267, "memory(GiB)": 31.84, "step": 760, "token_acc": 0.9438551468491588, "train_speed(iter/s)": 0.115291 }, { "epoch": 1.541239072595971, "eval_loss": 0.20081333816051483, "eval_runtime": 21.7245, "eval_samples_per_second": 14.638, "eval_steps_per_second": 3.682, "eval_token_acc": 0.9331740180186856, "step": 760 }, { "epoch": 1.551374635753199, "grad_norm": 0.7568323016166687, "learning_rate": 4.729305457072913e-06, "loss": 0.14828368425369262, "memory(GiB)": 31.84, "step": 765, "token_acc": 0.9404283971948368, "train_speed(iter/s)": 0.114963 }, { "epoch": 1.561510198910427, "grad_norm": 0.6327131390571594, "learning_rate": 4.676296277263513e-06, "loss": 0.13124030828475952, "memory(GiB)": 31.84, "step": 770, "token_acc": 0.948938511889246, "train_speed(iter/s)": 0.115091 }, { "epoch": 1.571645762067655, "grad_norm": 0.6433929204940796, "learning_rate": 4.6233236104121266e-06, "loss": 0.134158194065094, "memory(GiB)": 31.84, "step": 775, "token_acc": 0.952758534346048, "train_speed(iter/s)": 0.115205 }, { "epoch": 1.581781325224883, "grad_norm": 0.6848548054695129, "learning_rate": 4.570393431701074e-06, "loss": 0.14014556407928466, "memory(GiB)": 31.84, "step": 780, "token_acc": 0.9504519678850148, "train_speed(iter/s)": 0.115355 }, { "epoch": 1.581781325224883, "eval_loss": 0.20110902190208435, "eval_runtime": 21.6337, "eval_samples_per_second": 14.699, "eval_steps_per_second": 3.698, "eval_token_acc": 0.9332048930796638, "step": 780 }, { "epoch": 1.5919168883821109, "grad_norm": 0.7065548896789551, "learning_rate": 4.517511711520121e-06, "loss": 0.1434975743293762, "memory(GiB)": 31.84, "step": 785, "token_acc": 0.9380875156072849, "train_speed(iter/s)": 0.114991 }, { "epoch": 1.6020524515393386, "grad_norm": 0.6895316243171692, "learning_rate": 4.46468441479303e-06, "loss": 0.14089118242263793, "memory(GiB)": 31.84, "step": 790, "token_acc": 0.9526078624333143, "train_speed(iter/s)": 0.115135 }, { "epoch": 1.6121880146965666, "grad_norm": 0.6403608918190002, "learning_rate": 4.411917500304741e-06, "loss": 0.13246408700942994, "memory(GiB)": 31.84, "step": 795, "token_acc": 0.9556548295096035, "train_speed(iter/s)": 0.11525 }, { "epoch": 1.6223235778537946, "grad_norm": 0.7208017706871033, "learning_rate": 4.359216920029227e-06, "loss": 0.1482730746269226, "memory(GiB)": 31.84, "step": 800, "token_acc": 0.9493780648247817, "train_speed(iter/s)": 0.115389 }, { "epoch": 1.6223235778537946, "eval_loss": 0.19986069202423096, "eval_runtime": 21.7303, "eval_samples_per_second": 14.634, "eval_steps_per_second": 3.681, "eval_token_acc": 0.9332419431528377, "step": 800 }, { "epoch": 1.6324591410110223, "grad_norm": 0.7288877964019775, "learning_rate": 4.306588618458134e-06, "loss": 0.14321244955062867, "memory(GiB)": 31.84, "step": 805, "token_acc": 0.9386183036920914, "train_speed(iter/s)": 0.11503 }, { "epoch": 1.6425947041682503, "grad_norm": 0.7239351868629456, "learning_rate": 4.254038531930253e-06, "loss": 0.15064480304718017, "memory(GiB)": 31.84, "step": 810, "token_acc": 0.9507855081756973, "train_speed(iter/s)": 0.115159 }, { "epoch": 1.6527302673254782, "grad_norm": 0.6510659456253052, "learning_rate": 4.201572587961911e-06, "loss": 0.13342173099517823, "memory(GiB)": 31.84, "step": 815, "token_acc": 0.9515407052300228, "train_speed(iter/s)": 0.115262 }, { "epoch": 1.6628658304827062, "grad_norm": 0.76472008228302, "learning_rate": 4.149196704578375e-06, "loss": 0.14209251403808593, "memory(GiB)": 31.84, "step": 820, "token_acc": 0.9562061456569079, "train_speed(iter/s)": 0.115391 }, { "epoch": 1.6628658304827062, "eval_loss": 0.19960026443004608, "eval_runtime": 21.6668, "eval_samples_per_second": 14.677, "eval_steps_per_second": 3.692, "eval_token_acc": 0.9334951186528594, "step": 820 }, { "epoch": 1.6730013936399342, "grad_norm": 0.6803078055381775, "learning_rate": 4.096916789646305e-06, "loss": 0.1310012936592102, "memory(GiB)": 31.84, "step": 825, "token_acc": 0.9461966384760458, "train_speed(iter/s)": 0.115013 }, { "epoch": 1.6831369567971621, "grad_norm": 0.6794367432594299, "learning_rate": 4.04473874020736e-06, "loss": 0.14054393768310547, "memory(GiB)": 31.84, "step": 830, "token_acc": 0.9499732318125037, "train_speed(iter/s)": 0.115168 }, { "epoch": 1.69327251995439, "grad_norm": 0.6680985689163208, "learning_rate": 3.992668441813036e-06, "loss": 0.15022470951080322, "memory(GiB)": 31.84, "step": 835, "token_acc": 0.9472336065573771, "train_speed(iter/s)": 0.115303 }, { "epoch": 1.7034080831116178, "grad_norm": 0.707535982131958, "learning_rate": 3.940711767860776e-06, "loss": 0.14012532234191893, "memory(GiB)": 31.84, "step": 840, "token_acc": 0.9501387137452711, "train_speed(iter/s)": 0.115467 }, { "epoch": 1.7034080831116178, "eval_loss": 0.19959832727909088, "eval_runtime": 21.6673, "eval_samples_per_second": 14.676, "eval_steps_per_second": 3.692, "eval_token_acc": 0.9337853442260549, "step": 840 }, { "epoch": 1.7135436462688458, "grad_norm": 0.6823382377624512, "learning_rate": 3.888874578931482e-06, "loss": 0.14827988147735596, "memory(GiB)": 31.84, "step": 845, "token_acc": 0.9361747509472786, "train_speed(iter/s)": 0.115101 }, { "epoch": 1.7236792094260738, "grad_norm": 0.695568859577179, "learning_rate": 3.8371627221284495e-06, "loss": 0.14526791572570802, "memory(GiB)": 31.84, "step": 850, "token_acc": 0.9438457505679502, "train_speed(iter/s)": 0.115237 }, { "epoch": 1.7338147725833015, "grad_norm": 0.6451678276062012, "learning_rate": 3.7855820304178202e-06, "loss": 0.13392380475997925, "memory(GiB)": 31.84, "step": 855, "token_acc": 0.9532031010915026, "train_speed(iter/s)": 0.115329 }, { "epoch": 1.7439503357405295, "grad_norm": 0.6347934007644653, "learning_rate": 3.7341383219706535e-06, "loss": 0.137128746509552, "memory(GiB)": 31.84, "step": 860, "token_acc": 0.9556541717810719, "train_speed(iter/s)": 0.115435 }, { "epoch": 1.7439503357405295, "eval_loss": 0.19803106784820557, "eval_runtime": 21.589, "eval_samples_per_second": 14.73, "eval_steps_per_second": 3.706, "eval_token_acc": 0.9337729942016636, "step": 860 }, { "epoch": 1.7540858988977575, "grad_norm": 0.7886172533035278, "learning_rate": 3.6828373995066434e-06, "loss": 0.13904647827148436, "memory(GiB)": 31.84, "step": 865, "token_acc": 0.9446668643732701, "train_speed(iter/s)": 0.115085 }, { "epoch": 1.7642214620549854, "grad_norm": 0.6498300433158875, "learning_rate": 3.6316850496395863e-06, "loss": 0.1318161129951477, "memory(GiB)": 31.84, "step": 870, "token_acc": 0.9486192100341072, "train_speed(iter/s)": 0.115204 }, { "epoch": 1.7743570252122134, "grad_norm": 0.6739187240600586, "learning_rate": 3.5806870422246675e-06, "loss": 0.13507769107818604, "memory(GiB)": 31.84, "step": 875, "token_acc": 0.9506144334128633, "train_speed(iter/s)": 0.115288 }, { "epoch": 1.7844925883694414, "grad_norm": 0.6173581480979919, "learning_rate": 3.5298491297076332e-06, "loss": 0.1330319046974182, "memory(GiB)": 31.84, "step": 880, "token_acc": 0.9496503128779513, "train_speed(iter/s)": 0.115413 }, { "epoch": 1.7844925883694414, "eval_loss": 0.1983470916748047, "eval_runtime": 21.6381, "eval_samples_per_second": 14.696, "eval_steps_per_second": 3.697, "eval_token_acc": 0.9336927190431201, "step": 880 }, { "epoch": 1.7946281515266693, "grad_norm": 0.7845348715782166, "learning_rate": 3.479177046475935e-06, "loss": 0.14459047317504883, "memory(GiB)": 31.84, "step": 885, "token_acc": 0.9400699857249334, "train_speed(iter/s)": 0.115073 }, { "epoch": 1.804763714683897, "grad_norm": 0.677846372127533, "learning_rate": 3.428676508211902e-06, "loss": 0.1446584701538086, "memory(GiB)": 31.84, "step": 890, "token_acc": 0.9527403560028134, "train_speed(iter/s)": 0.115198 }, { "epoch": 1.814899277841125, "grad_norm": 0.6514244079589844, "learning_rate": 3.3783532112480243e-06, "loss": 0.14350442886352538, "memory(GiB)": 31.84, "step": 895, "token_acc": 0.9489065777752235, "train_speed(iter/s)": 0.115319 }, { "epoch": 1.825034840998353, "grad_norm": 0.6789301633834839, "learning_rate": 3.328212831924424e-06, "loss": 0.13438014984130858, "memory(GiB)": 31.84, "step": 900, "token_acc": 0.9504659558391403, "train_speed(iter/s)": 0.1154 }, { "epoch": 1.825034840998353, "eval_loss": 0.19804814457893372, "eval_runtime": 21.7224, "eval_samples_per_second": 14.639, "eval_steps_per_second": 3.683, "eval_token_acc": 0.9340385197260764, "step": 900 }, { "epoch": 1.8351704041555807, "grad_norm": 0.7078403830528259, "learning_rate": 3.2782610259485816e-06, "loss": 0.14103929996490477, "memory(GiB)": 31.84, "step": 905, "token_acc": 0.9467157966221059, "train_speed(iter/s)": 0.115049 }, { "epoch": 1.8453059673128087, "grad_norm": 0.6451456546783447, "learning_rate": 3.228503427757374e-06, "loss": 0.1354345202445984, "memory(GiB)": 31.84, "step": 910, "token_acc": 0.949082312268036, "train_speed(iter/s)": 0.115184 }, { "epoch": 1.8554415304700367, "grad_norm": 0.7488725185394287, "learning_rate": 3.178945649881543e-06, "loss": 0.1395805835723877, "memory(GiB)": 31.84, "step": 915, "token_acc": 0.9481235027947831, "train_speed(iter/s)": 0.115318 }, { "epoch": 1.8655770936272646, "grad_norm": 0.6281618475914001, "learning_rate": 3.1295932823125984e-06, "loss": 0.13758153915405275, "memory(GiB)": 31.84, "step": 920, "token_acc": 0.952330743618202, "train_speed(iter/s)": 0.115431 }, { "epoch": 1.8655770936272646, "eval_loss": 0.19679398834705353, "eval_runtime": 21.7247, "eval_samples_per_second": 14.638, "eval_steps_per_second": 3.682, "eval_token_acc": 0.9344151954700111, "step": 920 }, { "epoch": 1.8757126567844926, "grad_norm": 0.678552508354187, "learning_rate": 3.0804518918722953e-06, "loss": 0.14409549236297609, "memory(GiB)": 31.84, "step": 925, "token_acc": 0.9402150206553255, "train_speed(iter/s)": 0.115117 }, { "epoch": 1.8858482199417206, "grad_norm": 0.654852569103241, "learning_rate": 3.0315270215847015e-06, "loss": 0.14367053508758545, "memory(GiB)": 31.84, "step": 930, "token_acc": 0.9497569353477134, "train_speed(iter/s)": 0.115224 }, { "epoch": 1.8959837830989485, "grad_norm": 0.7289896011352539, "learning_rate": 2.982824190050958e-06, "loss": 0.1398463487625122, "memory(GiB)": 31.84, "step": 935, "token_acc": 0.9556035338495136, "train_speed(iter/s)": 0.115346 }, { "epoch": 1.9061193462561765, "grad_norm": 0.7101432681083679, "learning_rate": 2.934348890826804e-06, "loss": 0.14695172309875487, "memory(GiB)": 31.84, "step": 940, "token_acc": 0.9451462090193687, "train_speed(iter/s)": 0.115476 }, { "epoch": 1.9061193462561765, "eval_loss": 0.196446493268013, "eval_runtime": 21.7433, "eval_samples_per_second": 14.625, "eval_steps_per_second": 3.679, "eval_token_acc": 0.9346374959090544, "step": 940 }, { "epoch": 1.9162549094134043, "grad_norm": 0.6972203850746155, "learning_rate": 2.8861065918029085e-06, "loss": 0.13769317865371705, "memory(GiB)": 31.84, "step": 945, "token_acc": 0.9454437011741591, "train_speed(iter/s)": 0.11518 }, { "epoch": 1.9263904725706322, "grad_norm": 0.633342981338501, "learning_rate": 2.83810273458811e-06, "loss": 0.13131552934646606, "memory(GiB)": 31.84, "step": 950, "token_acc": 0.9490581763627272, "train_speed(iter/s)": 0.115298 }, { "epoch": 1.93652603572786, "grad_norm": 0.7772082686424255, "learning_rate": 2.790342733895618e-06, "loss": 0.14086905717849732, "memory(GiB)": 31.84, "step": 955, "token_acc": 0.949326951839665, "train_speed(iter/s)": 0.115399 }, { "epoch": 1.946661598885088, "grad_norm": 0.6924586892127991, "learning_rate": 2.742831976932242e-06, "loss": 0.13804731369018555, "memory(GiB)": 31.84, "step": 960, "token_acc": 0.954203088394589, "train_speed(iter/s)": 0.115508 }, { "epoch": 1.946661598885088, "eval_loss": 0.1961846649646759, "eval_runtime": 21.6189, "eval_samples_per_second": 14.709, "eval_steps_per_second": 3.7, "eval_token_acc": 0.9348597963480978, "step": 960 }, { "epoch": 1.956797162042316, "grad_norm": 0.6690623760223389, "learning_rate": 2.6955758227907335e-06, "loss": 0.13871316909790038, "memory(GiB)": 31.84, "step": 965, "token_acc": 0.9416814206770749, "train_speed(iter/s)": 0.115192 }, { "epoch": 1.9669327251995439, "grad_norm": 0.7014771103858948, "learning_rate": 2.648579601845295e-06, "loss": 0.1412734031677246, "memory(GiB)": 31.84, "step": 970, "token_acc": 0.9565555225449642, "train_speed(iter/s)": 0.115273 }, { "epoch": 1.9770682883567718, "grad_norm": 0.6217651963233948, "learning_rate": 2.6018486151503213e-06, "loss": 0.14024865627288818, "memory(GiB)": 31.84, "step": 975, "token_acc": 0.9587345601209982, "train_speed(iter/s)": 0.115361 }, { "epoch": 1.9872038515139998, "grad_norm": 0.6598934531211853, "learning_rate": 2.5553881338424553e-06, "loss": 0.13222227096557618, "memory(GiB)": 31.84, "step": 980, "token_acc": 0.9532787321172154, "train_speed(iter/s)": 0.115455 }, { "epoch": 1.9872038515139998, "eval_loss": 0.19523988664150238, "eval_runtime": 21.7763, "eval_samples_per_second": 14.603, "eval_steps_per_second": 3.674, "eval_token_acc": 0.9347609961529674, "step": 980 } ], "logging_steps": 5, "max_steps": 1479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3311229017634898e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }