{ "best_global_step": 760, "best_metric": 0.22517732, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v36-20250515-204543/checkpoint-760", "epoch": 0.9533516268130146, "eval_steps": 20, "global_step": 760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012544100352802822, "grad_norm": 0.7007026672363281, "learning_rate": 9.999995684008912e-06, "loss": 0.09371452033519745, "memory(GiB)": 30.15, "step": 1, "token_acc": 0.9615550755939525, "train_speed(iter/s)": 0.062663 }, { "epoch": 0.006272050176401411, "grad_norm": 0.7769243121147156, "learning_rate": 9.999892100595329e-06, "loss": 0.10849708318710327, "memory(GiB)": 30.19, "step": 5, "token_acc": 0.9562576748199528, "train_speed(iter/s)": 0.122173 }, { "epoch": 0.012544100352802822, "grad_norm": 0.7894852757453918, "learning_rate": 9.999568407038233e-06, "loss": 0.12320096492767334, "memory(GiB)": 30.19, "step": 10, "token_acc": 0.955855880061259, "train_speed(iter/s)": 0.136645 }, { "epoch": 0.018816150529204233, "grad_norm": 0.8094897866249084, "learning_rate": 9.999028933299243e-06, "loss": 0.11489032506942749, "memory(GiB)": 30.19, "step": 15, "token_acc": 0.9583607506645961, "train_speed(iter/s)": 0.144552 }, { "epoch": 0.025088200705605645, "grad_norm": 0.8118980526924133, "learning_rate": 9.99827370266192e-06, "loss": 0.11607390642166138, "memory(GiB)": 30.19, "step": 20, "token_acc": 0.9584569732937686, "train_speed(iter/s)": 0.146159 }, { "epoch": 0.025088200705605645, "eval_loss": 0.23747889697551727, "eval_runtime": 29.1116, "eval_samples_per_second": 17.691, "eval_steps_per_second": 4.431, "eval_token_acc": 0.9248609195450487, "step": 20 }, { "epoch": 0.03136025088200706, "grad_norm": 0.7518147826194763, "learning_rate": 9.99730274772184e-06, "loss": 0.12006251811981201, "memory(GiB)": 30.19, "step": 25, "token_acc": 0.9398439645614428, "train_speed(iter/s)": 0.119433 }, { "epoch": 0.037632301058408466, "grad_norm": 0.7859554886817932, "learning_rate": 9.996116110385186e-06, "loss": 0.12473204135894775, "memory(GiB)": 30.19, "step": 30, "token_acc": 0.9547159567642268, "train_speed(iter/s)": 0.124931 }, { "epoch": 0.04390435123480988, "grad_norm": 0.7734975814819336, "learning_rate": 9.99471384186694e-06, "loss": 0.11890232563018799, "memory(GiB)": 30.2, "step": 35, "token_acc": 0.9632012432012432, "train_speed(iter/s)": 0.128582 }, { "epoch": 0.05017640141121129, "grad_norm": 0.7775484323501587, "learning_rate": 9.99309600268868e-06, "loss": 0.11513264179229736, "memory(GiB)": 30.2, "step": 40, "token_acc": 0.9620845390377802, "train_speed(iter/s)": 0.130866 }, { "epoch": 0.05017640141121129, "eval_loss": 0.23900838196277618, "eval_runtime": 29.1592, "eval_samples_per_second": 17.662, "eval_steps_per_second": 4.424, "eval_token_acc": 0.9247099957657495, "step": 40 }, { "epoch": 0.0564484515876127, "grad_norm": 0.7852229475975037, "learning_rate": 9.991262662675962e-06, "loss": 0.1213950753211975, "memory(GiB)": 30.2, "step": 45, "token_acc": 0.9422833912915708, "train_speed(iter/s)": 0.119133 }, { "epoch": 0.06272050176401411, "grad_norm": 0.7627941370010376, "learning_rate": 9.9892139009553e-06, "loss": 0.12013821601867676, "memory(GiB)": 30.2, "step": 50, "token_acc": 0.9524904419431597, "train_speed(iter/s)": 0.122203 }, { "epoch": 0.06899255194041552, "grad_norm": 0.7667288184165955, "learning_rate": 9.986949805950763e-06, "loss": 0.12547953128814698, "memory(GiB)": 30.2, "step": 55, "token_acc": 0.9569144662104125, "train_speed(iter/s)": 0.124401 }, { "epoch": 0.07526460211681693, "grad_norm": 0.8220178484916687, "learning_rate": 9.984470475380154e-06, "loss": 0.12330178022384644, "memory(GiB)": 30.2, "step": 60, "token_acc": 0.9609616164135824, "train_speed(iter/s)": 0.12684 }, { "epoch": 0.07526460211681693, "eval_loss": 0.2401651293039322, "eval_runtime": 28.9982, "eval_samples_per_second": 17.76, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9250453819419698, "step": 60 }, { "epoch": 0.08153665229321834, "grad_norm": 0.7496252059936523, "learning_rate": 9.982332112912999e-06, "loss": 0.12913516759872437, "memory(GiB)": 31.66, "step": 65, "token_acc": 0.9413472329138108, "train_speed(iter/s)": 0.119192 }, { "epoch": 0.08780870246961976, "grad_norm": 0.742638349533081, "learning_rate": 9.979465634221514e-06, "loss": 0.11686735153198242, "memory(GiB)": 31.66, "step": 70, "token_acc": 0.9583432768541352, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.09408075264602117, "grad_norm": 0.6979886889457703, "learning_rate": 9.976384242979025e-06, "loss": 0.11433117389678955, "memory(GiB)": 31.66, "step": 75, "token_acc": 0.9620859246922897, "train_speed(iter/s)": 0.123121 }, { "epoch": 0.10035280282242258, "grad_norm": 0.772607684135437, "learning_rate": 9.973088072177646e-06, "loss": 0.11932685375213622, "memory(GiB)": 31.66, "step": 80, "token_acc": 0.9525376807136265, "train_speed(iter/s)": 0.124671 }, { "epoch": 0.10035280282242258, "eval_loss": 0.2368009090423584, "eval_runtime": 28.9145, "eval_samples_per_second": 17.811, "eval_steps_per_second": 4.461, "eval_token_acc": 0.9248148039458184, "step": 80 }, { "epoch": 0.10662485299882399, "grad_norm": 0.6924039721488953, "learning_rate": 9.96957726407932e-06, "loss": 0.11466219425201415, "memory(GiB)": 31.66, "step": 85, "token_acc": 0.9409575111971916, "train_speed(iter/s)": 0.119139 }, { "epoch": 0.1128969031752254, "grad_norm": 0.7015154957771301, "learning_rate": 9.965851970209695e-06, "loss": 0.11789379119873047, "memory(GiB)": 31.66, "step": 90, "token_acc": 0.9585812037424941, "train_speed(iter/s)": 0.120316 }, { "epoch": 0.11916895335162682, "grad_norm": 0.7769283652305603, "learning_rate": 9.96191235135156e-06, "loss": 0.12155482769012452, "memory(GiB)": 31.66, "step": 95, "token_acc": 0.9554308702096125, "train_speed(iter/s)": 0.122053 }, { "epoch": 0.12544100352802823, "grad_norm": 0.7704362869262695, "learning_rate": 9.957758577537933e-06, "loss": 0.13259472846984863, "memory(GiB)": 31.66, "step": 100, "token_acc": 0.9502720633165537, "train_speed(iter/s)": 0.123484 }, { "epoch": 0.12544100352802823, "eval_loss": 0.23640382289886475, "eval_runtime": 29.2359, "eval_samples_per_second": 17.615, "eval_steps_per_second": 4.412, "eval_token_acc": 0.9245884182768697, "step": 100 }, { "epoch": 0.13171305370442962, "grad_norm": 0.7656592726707458, "learning_rate": 9.953390828044698e-06, "loss": 0.1214489221572876, "memory(GiB)": 31.66, "step": 105, "token_acc": 0.94163746105919, "train_speed(iter/s)": 0.119212 }, { "epoch": 0.13798510388083104, "grad_norm": 0.6987672448158264, "learning_rate": 9.948809291382886e-06, "loss": 0.12167651653289795, "memory(GiB)": 31.66, "step": 110, "token_acc": 0.9554091191158653, "train_speed(iter/s)": 0.120455 }, { "epoch": 0.14425715405723247, "grad_norm": 0.7173567414283752, "learning_rate": 9.944014165290526e-06, "loss": 0.12870512008666993, "memory(GiB)": 31.66, "step": 115, "token_acc": 0.9555927368478797, "train_speed(iter/s)": 0.121851 }, { "epoch": 0.15052920423363386, "grad_norm": 0.733219563961029, "learning_rate": 9.939005656724122e-06, "loss": 0.12763895988464355, "memory(GiB)": 31.66, "step": 120, "token_acc": 0.9537378141994336, "train_speed(iter/s)": 0.123038 }, { "epoch": 0.15052920423363386, "eval_loss": 0.2358570694923401, "eval_runtime": 29.0168, "eval_samples_per_second": 17.748, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9245213410416256, "step": 120 }, { "epoch": 0.15680125441003528, "grad_norm": 0.765933096408844, "learning_rate": 9.933783981849704e-06, "loss": 0.12144865989685058, "memory(GiB)": 31.66, "step": 125, "token_acc": 0.9409262529390531, "train_speed(iter/s)": 0.119399 }, { "epoch": 0.16307330458643668, "grad_norm": 0.6981678009033203, "learning_rate": 9.928349366033525e-06, "loss": 0.12389117479324341, "memory(GiB)": 31.66, "step": 130, "token_acc": 0.9555003388299074, "train_speed(iter/s)": 0.120494 }, { "epoch": 0.1693453547628381, "grad_norm": 0.7008967399597168, "learning_rate": 9.923848513216085e-06, "loss": 0.12482867240905762, "memory(GiB)": 31.66, "step": 135, "token_acc": 0.9500779220779221, "train_speed(iter/s)": 0.121483 }, { "epoch": 0.17561740493923952, "grad_norm": 0.7450129985809326, "learning_rate": 9.918031200957224e-06, "loss": 0.1304723024368286, "memory(GiB)": 31.66, "step": 140, "token_acc": 0.9544462545722805, "train_speed(iter/s)": 0.122651 }, { "epoch": 0.17561740493923952, "eval_loss": 0.23741304874420166, "eval_runtime": 29.1676, "eval_samples_per_second": 17.657, "eval_steps_per_second": 4.423, "eval_token_acc": 0.9250328049603616, "step": 140 }, { "epoch": 0.18188945511564092, "grad_norm": 0.7213057279586792, "learning_rate": 9.912001627642868e-06, "loss": 0.12079639434814453, "memory(GiB)": 31.66, "step": 145, "token_acc": 0.9405716060888475, "train_speed(iter/s)": 0.119397 }, { "epoch": 0.18816150529204234, "grad_norm": 0.6824830174446106, "learning_rate": 9.905760053507967e-06, "loss": 0.11286978721618653, "memory(GiB)": 31.66, "step": 150, "token_acc": 0.9568509120833905, "train_speed(iter/s)": 0.120378 }, { "epoch": 0.19443355546844374, "grad_norm": 0.7245369553565979, "learning_rate": 9.899306747937377e-06, "loss": 0.12503495216369628, "memory(GiB)": 31.66, "step": 155, "token_acc": 0.9548737472705896, "train_speed(iter/s)": 0.121472 }, { "epoch": 0.20070560564484516, "grad_norm": 0.7653631567955017, "learning_rate": 9.892641989454225e-06, "loss": 0.1246172308921814, "memory(GiB)": 31.66, "step": 160, "token_acc": 0.9559546167897296, "train_speed(iter/s)": 0.122509 }, { "epoch": 0.20070560564484516, "eval_loss": 0.23944813013076782, "eval_runtime": 29.28, "eval_samples_per_second": 17.589, "eval_steps_per_second": 4.406, "eval_token_acc": 0.9244165328615568, "step": 160 }, { "epoch": 0.20697765582124658, "grad_norm": 0.6885049939155579, "learning_rate": 9.885766065707903e-06, "loss": 0.12521634101867676, "memory(GiB)": 31.66, "step": 165, "token_acc": 0.9397906012239098, "train_speed(iter/s)": 0.119705 }, { "epoch": 0.21324970599764798, "grad_norm": 0.7737838625907898, "learning_rate": 9.878679273461643e-06, "loss": 0.12545130252838135, "memory(GiB)": 31.66, "step": 170, "token_acc": 0.9525804833426775, "train_speed(iter/s)": 0.120457 }, { "epoch": 0.2195217561740494, "grad_norm": 0.7562989592552185, "learning_rate": 9.871381918579706e-06, "loss": 0.11616495847702027, "memory(GiB)": 31.66, "step": 175, "token_acc": 0.9607603010588085, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.2257938063504508, "grad_norm": 0.8050908446311951, "learning_rate": 9.863874316014197e-06, "loss": 0.11883351802825928, "memory(GiB)": 31.66, "step": 180, "token_acc": 0.9569015887148382, "train_speed(iter/s)": 0.122115 }, { "epoch": 0.2257938063504508, "eval_loss": 0.2381161004304886, "eval_runtime": 29.157, "eval_samples_per_second": 17.663, "eval_steps_per_second": 4.424, "eval_token_acc": 0.9248860735082651, "step": 180 }, { "epoch": 0.23206585652685222, "grad_norm": 0.7181304097175598, "learning_rate": 9.856156789791454e-06, "loss": 0.12097489833831787, "memory(GiB)": 31.66, "step": 185, "token_acc": 0.9404164162611713, "train_speed(iter/s)": 0.119715 }, { "epoch": 0.23833790670325364, "grad_norm": 0.7890848517417908, "learning_rate": 9.848229672998066e-06, "loss": 0.1189950704574585, "memory(GiB)": 31.66, "step": 190, "token_acc": 0.9567714631197098, "train_speed(iter/s)": 0.120621 }, { "epoch": 0.24460995687965503, "grad_norm": 0.7079647183418274, "learning_rate": 9.840093307766511e-06, "loss": 0.12529479265213012, "memory(GiB)": 31.66, "step": 195, "token_acc": 0.954030785285677, "train_speed(iter/s)": 0.12131 }, { "epoch": 0.25088200705605646, "grad_norm": 0.7202178835868835, "learning_rate": 9.831748045260374e-06, "loss": 0.12180191278457642, "memory(GiB)": 31.66, "step": 200, "token_acc": 0.9546836066920402, "train_speed(iter/s)": 0.121964 }, { "epoch": 0.25088200705605646, "eval_loss": 0.23432905972003937, "eval_runtime": 29.0534, "eval_samples_per_second": 17.726, "eval_steps_per_second": 4.44, "eval_token_acc": 0.9248776888538597, "step": 200 }, { "epoch": 0.2571540572324579, "grad_norm": 0.7178977727890015, "learning_rate": 9.823194245659197e-06, "loss": 0.12807730436325074, "memory(GiB)": 31.66, "step": 205, "token_acc": 0.9399802586519479, "train_speed(iter/s)": 0.119811 }, { "epoch": 0.26342610740885924, "grad_norm": 0.7141036987304688, "learning_rate": 9.814432278142934e-06, "loss": 0.11557638645172119, "memory(GiB)": 31.66, "step": 210, "token_acc": 0.9552243011722272, "train_speed(iter/s)": 0.120373 }, { "epoch": 0.26969815758526067, "grad_norm": 0.8078102469444275, "learning_rate": 9.805462520876015e-06, "loss": 0.1150855302810669, "memory(GiB)": 33.77, "step": 215, "token_acc": 0.9593783736285808, "train_speed(iter/s)": 0.121017 }, { "epoch": 0.2759702077616621, "grad_norm": 0.7020242214202881, "learning_rate": 9.79628536099103e-06, "loss": 0.1237363338470459, "memory(GiB)": 33.77, "step": 220, "token_acc": 0.9599538638985006, "train_speed(iter/s)": 0.121813 }, { "epoch": 0.2759702077616621, "eval_loss": 0.23757396638393402, "eval_runtime": 29.0264, "eval_samples_per_second": 17.742, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9250914975412001, "step": 220 }, { "epoch": 0.2822422579380635, "grad_norm": 0.8055130839347839, "learning_rate": 9.786901194572012e-06, "loss": 0.1192856788635254, "memory(GiB)": 33.77, "step": 225, "token_acc": 0.9404110409842614, "train_speed(iter/s)": 0.119956 }, { "epoch": 0.28851430811446493, "grad_norm": 0.8203707337379456, "learning_rate": 9.777310426637349e-06, "loss": 0.11806493997573853, "memory(GiB)": 33.77, "step": 230, "token_acc": 0.9590930586937103, "train_speed(iter/s)": 0.120564 }, { "epoch": 0.2947863582908663, "grad_norm": 0.7641969919204712, "learning_rate": 9.767513471122305e-06, "loss": 0.11997225284576415, "memory(GiB)": 33.77, "step": 235, "token_acc": 0.9602041571122124, "train_speed(iter/s)": 0.121067 }, { "epoch": 0.3010584084672677, "grad_norm": 0.733931839466095, "learning_rate": 9.757510750861143e-06, "loss": 0.12144792079925537, "memory(GiB)": 33.77, "step": 240, "token_acc": 0.9531961770923274, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.3010584084672677, "eval_loss": 0.23732979595661163, "eval_runtime": 29.0284, "eval_samples_per_second": 17.741, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9247980346370074, "step": 240 }, { "epoch": 0.30733045864366915, "grad_norm": 0.7051452398300171, "learning_rate": 9.749360713849587e-06, "loss": 0.12806930541992187, "memory(GiB)": 33.77, "step": 245, "token_acc": 0.9387967295240796, "train_speed(iter/s)": 0.119787 }, { "epoch": 0.31360250882007057, "grad_norm": 0.7828952074050903, "learning_rate": 9.741079488650608e-06, "loss": 0.13568118810653687, "memory(GiB)": 33.77, "step": 250, "token_acc": 0.948076923076923, "train_speed(iter/s)": 0.12055 }, { "epoch": 0.319874558996472, "grad_norm": 0.7669305205345154, "learning_rate": 9.730543822588614e-06, "loss": 0.12099459171295165, "memory(GiB)": 33.77, "step": 255, "token_acc": 0.9579619299557192, "train_speed(iter/s)": 0.121138 }, { "epoch": 0.32614660917287336, "grad_norm": 0.6992693543434143, "learning_rate": 9.71980398738173e-06, "loss": 0.12393572330474853, "memory(GiB)": 33.77, "step": 260, "token_acc": 0.9492717094266536, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.32614660917287336, "eval_loss": 0.23609744012355804, "eval_runtime": 29.0643, "eval_samples_per_second": 17.719, "eval_steps_per_second": 4.438, "eval_token_acc": 0.9247644960193854, "step": 260 }, { "epoch": 0.3324186593492748, "grad_norm": 0.7092508673667908, "learning_rate": 9.708860446558685e-06, "loss": 0.12540948390960693, "memory(GiB)": 33.77, "step": 265, "token_acc": 0.9381209283387623, "train_speed(iter/s)": 0.119928 }, { "epoch": 0.3386907095256762, "grad_norm": 0.7747306823730469, "learning_rate": 9.6977136724401e-06, "loss": 0.1322183132171631, "memory(GiB)": 33.77, "step": 270, "token_acc": 0.948199121522694, "train_speed(iter/s)": 0.120479 }, { "epoch": 0.3449627597020776, "grad_norm": 0.7813765406608582, "learning_rate": 9.686364146118085e-06, "loss": 0.12453765869140625, "memory(GiB)": 33.77, "step": 275, "token_acc": 0.9574612482015366, "train_speed(iter/s)": 0.121031 }, { "epoch": 0.35123480987847905, "grad_norm": 0.8077899217605591, "learning_rate": 9.674812357435497e-06, "loss": 0.13067824840545655, "memory(GiB)": 33.77, "step": 280, "token_acc": 0.9562002982107356, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.35123480987847905, "eval_loss": 0.23406127095222473, "eval_runtime": 28.8857, "eval_samples_per_second": 17.829, "eval_steps_per_second": 4.466, "eval_token_acc": 0.925296921574135, "step": 280 }, { "epoch": 0.3575068600548804, "grad_norm": 0.7608367204666138, "learning_rate": 9.663058804964784e-06, "loss": 0.12904319763183594, "memory(GiB)": 33.77, "step": 285, "token_acc": 0.9395176026312584, "train_speed(iter/s)": 0.120021 }, { "epoch": 0.36377891023128184, "grad_norm": 0.7974592447280884, "learning_rate": 9.65110399598647e-06, "loss": 0.11571755409240722, "memory(GiB)": 33.77, "step": 290, "token_acc": 0.9629809560823941, "train_speed(iter/s)": 0.120511 }, { "epoch": 0.37005096040768326, "grad_norm": 0.807405948638916, "learning_rate": 9.638948446467268e-06, "loss": 0.12567424774169922, "memory(GiB)": 33.77, "step": 295, "token_acc": 0.9559097936770272, "train_speed(iter/s)": 0.121048 }, { "epoch": 0.3763230105840847, "grad_norm": 0.8031049966812134, "learning_rate": 9.626592681037797e-06, "loss": 0.12862168550491332, "memory(GiB)": 33.77, "step": 300, "token_acc": 0.9541518224171006, "train_speed(iter/s)": 0.121608 }, { "epoch": 0.3763230105840847, "eval_loss": 0.239148810505867, "eval_runtime": 29.0314, "eval_samples_per_second": 17.739, "eval_steps_per_second": 4.443, "eval_token_acc": 0.9250160356515505, "step": 300 }, { "epoch": 0.3825950607604861, "grad_norm": 0.7160354256629944, "learning_rate": 9.614037232969952e-06, "loss": 0.11383086442947388, "memory(GiB)": 33.77, "step": 305, "token_acc": 0.9417116516042943, "train_speed(iter/s)": 0.120136 }, { "epoch": 0.3888671109368875, "grad_norm": 0.7129721641540527, "learning_rate": 9.601282644153882e-06, "loss": 0.12448391914367676, "memory(GiB)": 33.77, "step": 310, "token_acc": 0.9534784033888903, "train_speed(iter/s)": 0.120638 }, { "epoch": 0.3951391611132889, "grad_norm": 0.6943992972373962, "learning_rate": 9.5883294650746e-06, "loss": 0.12468962669372559, "memory(GiB)": 33.77, "step": 315, "token_acc": 0.9542487486461961, "train_speed(iter/s)": 0.12098 }, { "epoch": 0.4014112112896903, "grad_norm": 0.6942281126976013, "learning_rate": 9.575178254788235e-06, "loss": 0.12767086029052735, "memory(GiB)": 33.77, "step": 320, "token_acc": 0.9511481009569767, "train_speed(iter/s)": 0.121441 }, { "epoch": 0.4014112112896903, "eval_loss": 0.23587600886821747, "eval_runtime": 29.0365, "eval_samples_per_second": 17.736, "eval_steps_per_second": 4.443, "eval_token_acc": 0.9246219568944917, "step": 320 }, { "epoch": 0.40768326146609174, "grad_norm": 0.7060673236846924, "learning_rate": 9.56182958089789e-06, "loss": 0.1355045199394226, "memory(GiB)": 33.77, "step": 325, "token_acc": 0.9405469567818154, "train_speed(iter/s)": 0.120115 }, { "epoch": 0.41395531164249316, "grad_norm": 0.8162744045257568, "learning_rate": 9.548284019529149e-06, "loss": 0.13120698928833008, "memory(GiB)": 33.77, "step": 330, "token_acc": 0.9542016095898688, "train_speed(iter/s)": 0.120527 }, { "epoch": 0.42022736181889453, "grad_norm": 0.7768563032150269, "learning_rate": 9.534542155305217e-06, "loss": 0.12495183944702148, "memory(GiB)": 33.77, "step": 335, "token_acc": 0.955458468751665, "train_speed(iter/s)": 0.121049 }, { "epoch": 0.42649941199529595, "grad_norm": 0.7437924146652222, "learning_rate": 9.520604581321682e-06, "loss": 0.12085769176483155, "memory(GiB)": 33.77, "step": 340, "token_acc": 0.9580750533707143, "train_speed(iter/s)": 0.121396 }, { "epoch": 0.42649941199529595, "eval_loss": 0.23598669469356537, "eval_runtime": 29.1207, "eval_samples_per_second": 17.685, "eval_steps_per_second": 4.43, "eval_token_acc": 0.9251292284860249, "step": 340 }, { "epoch": 0.4327714621716974, "grad_norm": 0.7635094523429871, "learning_rate": 9.506471899120917e-06, "loss": 0.12304807901382446, "memory(GiB)": 33.77, "step": 345, "token_acc": 0.9426115423821846, "train_speed(iter/s)": 0.120096 }, { "epoch": 0.4390435123480988, "grad_norm": 0.7481803297996521, "learning_rate": 9.49214471866612e-06, "loss": 0.1286768436431885, "memory(GiB)": 33.77, "step": 350, "token_acc": 0.9525481515405949, "train_speed(iter/s)": 0.120576 }, { "epoch": 0.4453155625245002, "grad_norm": 0.7514587640762329, "learning_rate": 9.477623658314988e-06, "loss": 0.13611133098602296, "memory(GiB)": 33.77, "step": 355, "token_acc": 0.9467895891385546, "train_speed(iter/s)": 0.121066 }, { "epoch": 0.4515876127009016, "grad_norm": 0.6806954741477966, "learning_rate": 9.462909344793028e-06, "loss": 0.12503905296325685, "memory(GiB)": 33.77, "step": 360, "token_acc": 0.9559137034194594, "train_speed(iter/s)": 0.121424 }, { "epoch": 0.4515876127009016, "eval_loss": 0.23415741324424744, "eval_runtime": 29.1302, "eval_samples_per_second": 17.679, "eval_steps_per_second": 4.428, "eval_token_acc": 0.9256910003311939, "step": 360 }, { "epoch": 0.457859662877303, "grad_norm": 0.7245866060256958, "learning_rate": 9.448002413166509e-06, "loss": 0.11684945821762086, "memory(GiB)": 33.77, "step": 365, "token_acc": 0.9405347148691575, "train_speed(iter/s)": 0.12022 }, { "epoch": 0.46413171305370443, "grad_norm": 0.7275366187095642, "learning_rate": 9.43290350681505e-06, "loss": 0.12942945957183838, "memory(GiB)": 33.77, "step": 370, "token_acc": 0.948218290555694, "train_speed(iter/s)": 0.120576 }, { "epoch": 0.47040376323010585, "grad_norm": 0.9837439060211182, "learning_rate": 9.41761327740385e-06, "loss": 0.13003346920013428, "memory(GiB)": 33.77, "step": 375, "token_acc": 0.957800478604328, "train_speed(iter/s)": 0.121019 }, { "epoch": 0.4766758134065073, "grad_norm": 0.7750929594039917, "learning_rate": 9.402132384855573e-06, "loss": 0.12979254722595215, "memory(GiB)": 33.77, "step": 380, "token_acc": 0.9498875140607425, "train_speed(iter/s)": 0.121423 }, { "epoch": 0.4766758134065073, "eval_loss": 0.2338828444480896, "eval_runtime": 29.2086, "eval_samples_per_second": 17.632, "eval_steps_per_second": 4.417, "eval_token_acc": 0.9252759599381213, "step": 380 }, { "epoch": 0.48294786358290864, "grad_norm": 0.6944046020507812, "learning_rate": 9.389610842080394e-06, "loss": 0.12626748085021972, "memory(GiB)": 33.77, "step": 385, "token_acc": 0.9413088592055652, "train_speed(iter/s)": 0.120293 }, { "epoch": 0.48921991375931007, "grad_norm": 0.7166000008583069, "learning_rate": 9.373788445138972e-06, "loss": 0.12364100217819214, "memory(GiB)": 33.77, "step": 390, "token_acc": 0.9516249135684273, "train_speed(iter/s)": 0.120761 }, { "epoch": 0.4954919639357115, "grad_norm": 0.6891298294067383, "learning_rate": 9.357777276529793e-06, "loss": 0.11418641805648803, "memory(GiB)": 36.04, "step": 395, "token_acc": 0.9607581283065386, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.5017640141121129, "grad_norm": 0.8014844059944153, "learning_rate": 9.341578027291085e-06, "loss": 0.12451854944229127, "memory(GiB)": 36.04, "step": 400, "token_acc": 0.9554823405376911, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.5017640141121129, "eval_loss": 0.2350710779428482, "eval_runtime": 28.9464, "eval_samples_per_second": 17.792, "eval_steps_per_second": 4.457, "eval_token_acc": 0.9253011139013377, "step": 400 }, { "epoch": 0.5080360642885143, "grad_norm": 0.7108286619186401, "learning_rate": 9.325191396578589e-06, "loss": 0.12221509218215942, "memory(GiB)": 36.04, "step": 405, "token_acc": 0.9413378371462204, "train_speed(iter/s)": 0.12034 }, { "epoch": 0.5143081144649158, "grad_norm": 0.7911909222602844, "learning_rate": 9.308618091635382e-06, "loss": 0.12177256345748902, "memory(GiB)": 36.04, "step": 410, "token_acc": 0.959347706235673, "train_speed(iter/s)": 0.120651 }, { "epoch": 0.5205801646413172, "grad_norm": 1.8789387941360474, "learning_rate": 9.291858827761359e-06, "loss": 0.1333709716796875, "memory(GiB)": 36.04, "step": 415, "token_acc": 0.95194391673133, "train_speed(iter/s)": 0.120981 }, { "epoch": 0.5268522148177185, "grad_norm": 0.7984176278114319, "learning_rate": 9.274914328282359e-06, "loss": 0.12819453477859497, "memory(GiB)": 36.04, "step": 420, "token_acc": 0.957134979829933, "train_speed(iter/s)": 0.121312 }, { "epoch": 0.5268522148177185, "eval_loss": 0.23512502014636993, "eval_runtime": 29.1452, "eval_samples_per_second": 17.67, "eval_steps_per_second": 4.426, "eval_token_acc": 0.9250202279787533, "step": 420 }, { "epoch": 0.5331242649941199, "grad_norm": 0.7520173788070679, "learning_rate": 9.257785324518943e-06, "loss": 0.12105765342712402, "memory(GiB)": 36.04, "step": 425, "token_acc": 0.9403581723767339, "train_speed(iter/s)": 0.120363 }, { "epoch": 0.5393963151705213, "grad_norm": 0.8471489548683167, "learning_rate": 9.240472555754835e-06, "loss": 0.12356100082397461, "memory(GiB)": 36.04, "step": 430, "token_acc": 0.9564814136828489, "train_speed(iter/s)": 0.120684 }, { "epoch": 0.5456683653469228, "grad_norm": 0.7935863733291626, "learning_rate": 9.222976769205013e-06, "loss": 0.12740910053253174, "memory(GiB)": 38.32, "step": 435, "token_acc": 0.9569268406943757, "train_speed(iter/s)": 0.120949 }, { "epoch": 0.5519404155233242, "grad_norm": 0.7470819354057312, "learning_rate": 9.205298719983458e-06, "loss": 0.12629660367965698, "memory(GiB)": 38.32, "step": 440, "token_acc": 0.9542381848107219, "train_speed(iter/s)": 0.121266 }, { "epoch": 0.5519404155233242, "eval_loss": 0.2329576462507248, "eval_runtime": 29.0573, "eval_samples_per_second": 17.724, "eval_steps_per_second": 4.44, "eval_token_acc": 0.925007650997145, "step": 440 }, { "epoch": 0.5582124656997256, "grad_norm": 0.8528454899787903, "learning_rate": 9.187439171070563e-06, "loss": 0.11683663129806518, "memory(GiB)": 38.32, "step": 445, "token_acc": 0.9422364773256167, "train_speed(iter/s)": 0.12028 }, { "epoch": 0.564484515876127, "grad_norm": 0.6902926564216614, "learning_rate": 9.173021369887053e-06, "loss": 0.1320955276489258, "memory(GiB)": 38.32, "step": 450, "token_acc": 0.9549962232889062, "train_speed(iter/s)": 0.12058 }, { "epoch": 0.5707565660525284, "grad_norm": 0.7285463213920593, "learning_rate": 9.154837069223594e-06, "loss": 0.12488093376159667, "memory(GiB)": 38.32, "step": 455, "token_acc": 0.9579794738443663, "train_speed(iter/s)": 0.120802 }, { "epoch": 0.5770286162289299, "grad_norm": 0.6829497218132019, "learning_rate": 9.136473446781624e-06, "loss": 0.12886552810668944, "memory(GiB)": 38.32, "step": 460, "token_acc": 0.9550450619099832, "train_speed(iter/s)": 0.121049 }, { "epoch": 0.5770286162289299, "eval_loss": 0.2350022941827774, "eval_runtime": 29.2036, "eval_samples_per_second": 17.635, "eval_steps_per_second": 4.417, "eval_token_acc": 0.9253220755373516, "step": 460 }, { "epoch": 0.5833006664053313, "grad_norm": 0.793786346912384, "learning_rate": 9.11793129513072e-06, "loss": 0.1309070110321045, "memory(GiB)": 38.32, "step": 465, "token_acc": 0.9386901904304689, "train_speed(iter/s)": 0.12018 }, { "epoch": 0.5895727165817326, "grad_norm": 0.6860336661338806, "learning_rate": 9.102969570306243e-06, "loss": 0.13614410161972046, "memory(GiB)": 38.32, "step": 470, "token_acc": 0.950530035335689, "train_speed(iter/s)": 0.120521 }, { "epoch": 0.595844766758134, "grad_norm": 0.7473175525665283, "learning_rate": 9.084108087927778e-06, "loss": 0.13468925952911376, "memory(GiB)": 38.32, "step": 475, "token_acc": 0.9512150026413101, "train_speed(iter/s)": 0.120848 }, { "epoch": 0.6021168169345354, "grad_norm": 0.705289900302887, "learning_rate": 9.065070336416794e-06, "loss": 0.12688368558883667, "memory(GiB)": 38.32, "step": 480, "token_acc": 0.9514988814317673, "train_speed(iter/s)": 0.121127 }, { "epoch": 0.6021168169345354, "eval_loss": 0.23424042761325836, "eval_runtime": 28.918, "eval_samples_per_second": 17.809, "eval_steps_per_second": 4.461, "eval_token_acc": 0.9258838473825205, "step": 480 }, { "epoch": 0.6083888671109369, "grad_norm": 0.7309315204620361, "learning_rate": 9.045857137438114e-06, "loss": 0.12572396993637086, "memory(GiB)": 38.32, "step": 485, "token_acc": 0.9416618199382905, "train_speed(iter/s)": 0.120198 }, { "epoch": 0.6146609172873383, "grad_norm": 0.7343481779098511, "learning_rate": 9.02646932022883e-06, "loss": 0.12929785251617432, "memory(GiB)": 38.32, "step": 490, "token_acc": 0.9553018035624546, "train_speed(iter/s)": 0.120514 }, { "epoch": 0.6209329674637397, "grad_norm": 0.7487764954566956, "learning_rate": 9.006907721562515e-06, "loss": 0.12204375267028808, "memory(GiB)": 38.32, "step": 495, "token_acc": 0.9591731423020884, "train_speed(iter/s)": 0.120819 }, { "epoch": 0.6272050176401411, "grad_norm": 0.7962038516998291, "learning_rate": 8.987173185713113e-06, "loss": 0.12226212024688721, "memory(GiB)": 38.32, "step": 500, "token_acc": 0.9564000589188393, "train_speed(iter/s)": 0.121101 }, { "epoch": 0.6272050176401411, "eval_loss": 0.23693928122520447, "eval_runtime": 29.1127, "eval_samples_per_second": 17.69, "eval_steps_per_second": 4.431, "eval_token_acc": 0.9255736151695168, "step": 500 }, { "epoch": 0.6334770678165426, "grad_norm": 0.7448955178260803, "learning_rate": 8.967266564418485e-06, "loss": 0.12553646564483642, "memory(GiB)": 38.32, "step": 505, "token_acc": 0.9397639899675178, "train_speed(iter/s)": 0.120264 }, { "epoch": 0.639749117992944, "grad_norm": 0.6869771480560303, "learning_rate": 8.947188716843668e-06, "loss": 0.12530720233917236, "memory(GiB)": 38.32, "step": 510, "token_acc": 0.9531347241388641, "train_speed(iter/s)": 0.120511 }, { "epoch": 0.6460211681693454, "grad_norm": 0.7007948756217957, "learning_rate": 8.926940509543786e-06, "loss": 0.12557142972946167, "memory(GiB)": 38.32, "step": 515, "token_acc": 0.9570901871809416, "train_speed(iter/s)": 0.120789 }, { "epoch": 0.6522932183457467, "grad_norm": 0.6679887771606445, "learning_rate": 8.906522816426642e-06, "loss": 0.11763076782226563, "memory(GiB)": 38.32, "step": 520, "token_acc": 0.9630898229846002, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.6522932183457467, "eval_loss": 0.23494519293308258, "eval_runtime": 29.1208, "eval_samples_per_second": 17.685, "eval_steps_per_second": 4.43, "eval_token_acc": 0.9250202279787533, "step": 520 }, { "epoch": 0.6585652685221481, "grad_norm": 1.0034900903701782, "learning_rate": 8.885936518715009e-06, "loss": 0.12190806865692139, "memory(GiB)": 38.32, "step": 525, "token_acc": 0.9413621144839724, "train_speed(iter/s)": 0.12028 }, { "epoch": 0.6648373186985496, "grad_norm": 0.7202277779579163, "learning_rate": 8.865182504908593e-06, "loss": 0.12205361127853394, "memory(GiB)": 38.32, "step": 530, "token_acc": 0.9566966466480848, "train_speed(iter/s)": 0.120583 }, { "epoch": 0.671109368874951, "grad_norm": 0.7922447323799133, "learning_rate": 8.84426167074569e-06, "loss": 0.12360981702804566, "memory(GiB)": 38.32, "step": 535, "token_acc": 0.9545589899350843, "train_speed(iter/s)": 0.120816 }, { "epoch": 0.6773814190513524, "grad_norm": 0.7317930459976196, "learning_rate": 8.823174919164517e-06, "loss": 0.12647807598114014, "memory(GiB)": 38.32, "step": 540, "token_acc": 0.9562937062937062, "train_speed(iter/s)": 0.121078 }, { "epoch": 0.6773814190513524, "eval_loss": 0.234180748462677, "eval_runtime": 29.282, "eval_samples_per_second": 17.588, "eval_steps_per_second": 4.405, "eval_token_acc": 0.9257161542944103, "step": 540 }, { "epoch": 0.6836534692277538, "grad_norm": 0.7069205045700073, "learning_rate": 8.801923160264254e-06, "loss": 0.12029304504394531, "memory(GiB)": 38.32, "step": 545, "token_acc": 0.940614257111556, "train_speed(iter/s)": 0.120293 }, { "epoch": 0.6899255194041553, "grad_norm": 0.7268481850624084, "learning_rate": 8.78050731126575e-06, "loss": 0.12312864065170288, "memory(GiB)": 38.32, "step": 550, "token_acc": 0.9642680054543201, "train_speed(iter/s)": 0.12052 }, { "epoch": 0.6961975695805567, "grad_norm": 0.7602020502090454, "learning_rate": 8.758928296471955e-06, "loss": 0.12826888561248778, "memory(GiB)": 38.32, "step": 555, "token_acc": 0.9557154631332023, "train_speed(iter/s)": 0.120805 }, { "epoch": 0.7024696197569581, "grad_norm": 0.7089629173278809, "learning_rate": 8.737187047228004e-06, "loss": 0.12195276021957398, "memory(GiB)": 38.32, "step": 560, "token_acc": 0.9564936463493431, "train_speed(iter/s)": 0.121045 }, { "epoch": 0.7024696197569581, "eval_loss": 0.23239342868328094, "eval_runtime": 28.8986, "eval_samples_per_second": 17.821, "eval_steps_per_second": 4.464, "eval_token_acc": 0.9253933450997983, "step": 560 }, { "epoch": 0.7087416699333595, "grad_norm": 0.7802727818489075, "learning_rate": 8.715284501881039e-06, "loss": 0.12478115558624267, "memory(GiB)": 38.32, "step": 565, "token_acc": 0.9394610632417493, "train_speed(iter/s)": 0.120318 }, { "epoch": 0.7150137201097608, "grad_norm": 0.692456841468811, "learning_rate": 8.693221605739697e-06, "loss": 0.12183520793914795, "memory(GiB)": 38.32, "step": 570, "token_acc": 0.9573177580590813, "train_speed(iter/s)": 0.120604 }, { "epoch": 0.7212857702861623, "grad_norm": 0.783674955368042, "learning_rate": 8.670999311033328e-06, "loss": 0.1260378837585449, "memory(GiB)": 40.76, "step": 575, "token_acc": 0.958084188606277, "train_speed(iter/s)": 0.12084 }, { "epoch": 0.7275578204625637, "grad_norm": 0.7612254023551941, "learning_rate": 8.648618576870877e-06, "loss": 0.12205030918121337, "memory(GiB)": 40.76, "step": 580, "token_acc": 0.9522026264517598, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.7275578204625637, "eval_loss": 0.23236523568630219, "eval_runtime": 29.1132, "eval_samples_per_second": 17.69, "eval_steps_per_second": 4.431, "eval_token_acc": 0.9257538852392352, "step": 580 }, { "epoch": 0.7338298706389651, "grad_norm": 0.6874219179153442, "learning_rate": 8.626080369199499e-06, "loss": 0.12317302227020263, "memory(GiB)": 40.76, "step": 585, "token_acc": 0.9398445420750253, "train_speed(iter/s)": 0.120379 }, { "epoch": 0.7401019208153665, "grad_norm": 0.7761655449867249, "learning_rate": 8.603385660762872e-06, "loss": 0.1282115697860718, "memory(GiB)": 40.76, "step": 590, "token_acc": 0.9520723436322532, "train_speed(iter/s)": 0.120652 }, { "epoch": 0.7463739709917679, "grad_norm": 0.6982787251472473, "learning_rate": 8.58053543105921e-06, "loss": 0.1281890869140625, "memory(GiB)": 40.76, "step": 595, "token_acc": 0.953644096279635, "train_speed(iter/s)": 0.120874 }, { "epoch": 0.7526460211681694, "grad_norm": 0.7546159625053406, "learning_rate": 8.55753066629898e-06, "loss": 0.12751117944717408, "memory(GiB)": 40.76, "step": 600, "token_acc": 0.9540350393157677, "train_speed(iter/s)": 0.121112 }, { "epoch": 0.7526460211681694, "eval_loss": 0.23002442717552185, "eval_runtime": 29.1415, "eval_samples_per_second": 17.672, "eval_steps_per_second": 4.427, "eval_token_acc": 0.9261940795955242, "step": 600 }, { "epoch": 0.7589180713445708, "grad_norm": 0.7351900935173035, "learning_rate": 8.534372359362357e-06, "loss": 0.1303678870201111, "memory(GiB)": 40.76, "step": 605, "token_acc": 0.9409314468422133, "train_speed(iter/s)": 0.120394 }, { "epoch": 0.7651901215209722, "grad_norm": 0.8682727217674255, "learning_rate": 8.51106150975635e-06, "loss": 0.1233241081237793, "memory(GiB)": 40.76, "step": 610, "token_acc": 0.95544310046902, "train_speed(iter/s)": 0.120622 }, { "epoch": 0.7714621716973736, "grad_norm": 0.7701956629753113, "learning_rate": 8.487599123571675e-06, "loss": 0.11557955741882324, "memory(GiB)": 40.76, "step": 615, "token_acc": 0.9595501699938976, "train_speed(iter/s)": 0.120911 }, { "epoch": 0.777734221873775, "grad_norm": 0.763529360294342, "learning_rate": 8.463986213439337e-06, "loss": 0.12450950145721436, "memory(GiB)": 40.76, "step": 620, "token_acc": 0.9594680177327423, "train_speed(iter/s)": 0.121153 }, { "epoch": 0.777734221873775, "eval_loss": 0.2307971715927124, "eval_runtime": 28.9628, "eval_samples_per_second": 17.781, "eval_steps_per_second": 4.454, "eval_token_acc": 0.9257371159304242, "step": 620 }, { "epoch": 0.7840062720501764, "grad_norm": 0.743877649307251, "learning_rate": 8.440223798486913e-06, "loss": 0.13349132537841796, "memory(GiB)": 40.76, "step": 625, "token_acc": 0.9376087341521601, "train_speed(iter/s)": 0.120442 }, { "epoch": 0.7902783222265778, "grad_norm": 0.7353401184082031, "learning_rate": 8.416312904294572e-06, "loss": 0.13025209903717042, "memory(GiB)": 40.76, "step": 630, "token_acc": 0.960784808848038, "train_speed(iter/s)": 0.120683 }, { "epoch": 0.7965503724029792, "grad_norm": 0.7505218982696533, "learning_rate": 8.397077977170049e-06, "loss": 0.13371331691741944, "memory(GiB)": 40.76, "step": 635, "token_acc": 0.9515476784822766, "train_speed(iter/s)": 0.120921 }, { "epoch": 0.8028224225793806, "grad_norm": 0.8705490231513977, "learning_rate": 8.372902425234847e-06, "loss": 0.12443286180496216, "memory(GiB)": 40.76, "step": 640, "token_acc": 0.9559957659156207, "train_speed(iter/s)": 0.12116 }, { "epoch": 0.8028224225793806, "eval_loss": 0.23105858266353607, "eval_runtime": 29.1773, "eval_samples_per_second": 17.651, "eval_steps_per_second": 4.421, "eval_token_acc": 0.9266720048966381, "step": 640 }, { "epoch": 0.8090944727557821, "grad_norm": 0.720313549041748, "learning_rate": 8.348581299634171e-06, "loss": 0.12005361318588256, "memory(GiB)": 40.76, "step": 645, "token_acc": 0.942721820579713, "train_speed(iter/s)": 0.120467 }, { "epoch": 0.8153665229321835, "grad_norm": 0.7976186275482178, "learning_rate": 8.324115650062005e-06, "loss": 0.1226189136505127, "memory(GiB)": 40.76, "step": 650, "token_acc": 0.95751953125, "train_speed(iter/s)": 0.120671 }, { "epoch": 0.8216385731085849, "grad_norm": 0.7320578694343567, "learning_rate": 8.29950653244996e-06, "loss": 0.12214083671569824, "memory(GiB)": 40.76, "step": 655, "token_acc": 0.9560769335697722, "train_speed(iter/s)": 0.120871 }, { "epoch": 0.8279106232849863, "grad_norm": 0.714158833026886, "learning_rate": 8.27475500892169e-06, "loss": 0.13046940565109252, "memory(GiB)": 40.76, "step": 660, "token_acc": 0.9564310899892687, "train_speed(iter/s)": 0.121118 }, { "epoch": 0.8279106232849863, "eval_loss": 0.2298216074705124, "eval_runtime": 29.0928, "eval_samples_per_second": 17.702, "eval_steps_per_second": 4.434, "eval_token_acc": 0.9264120806100674, "step": 660 }, { "epoch": 0.8341826734613876, "grad_norm": 0.70967036485672, "learning_rate": 8.249862147747062e-06, "loss": 0.12797050476074218, "memory(GiB)": 40.76, "step": 665, "token_acc": 0.940926979466161, "train_speed(iter/s)": 0.120422 }, { "epoch": 0.8404547236377891, "grad_norm": 0.6546662449836731, "learning_rate": 8.224829023296032e-06, "loss": 0.12179737091064453, "memory(GiB)": 40.76, "step": 670, "token_acc": 0.9526879044300647, "train_speed(iter/s)": 0.12064 }, { "epoch": 0.8467267738141905, "grad_norm": 0.7337839007377625, "learning_rate": 8.199656715992292e-06, "loss": 0.13117530345916747, "memory(GiB)": 40.76, "step": 675, "token_acc": 0.9478685921294229, "train_speed(iter/s)": 0.120857 }, { "epoch": 0.8529988239905919, "grad_norm": 0.7467445731163025, "learning_rate": 8.179419388376196e-06, "loss": 0.13929787874221802, "memory(GiB)": 40.76, "step": 680, "token_acc": 0.946326665465249, "train_speed(iter/s)": 0.121055 }, { "epoch": 0.8529988239905919, "eval_loss": 0.22663576900959015, "eval_runtime": 29.1409, "eval_samples_per_second": 17.673, "eval_steps_per_second": 4.427, "eval_token_acc": 0.9265671967165693, "step": 680 }, { "epoch": 0.8592708741669933, "grad_norm": 0.6901698708534241, "learning_rate": 8.153999293750005e-06, "loss": 0.1212563157081604, "memory(GiB)": 40.76, "step": 685, "token_acc": 0.9411858718235576, "train_speed(iter/s)": 0.120382 }, { "epoch": 0.8655429243433947, "grad_norm": 0.7482547163963318, "learning_rate": 8.128443073265364e-06, "loss": 0.13035836219787597, "memory(GiB)": 40.76, "step": 690, "token_acc": 0.9523483030510799, "train_speed(iter/s)": 0.120598 }, { "epoch": 0.8718149745197962, "grad_norm": 0.7634561657905579, "learning_rate": 8.102751829922664e-06, "loss": 0.13618214130401612, "memory(GiB)": 40.76, "step": 695, "token_acc": 0.9506073092564165, "train_speed(iter/s)": 0.1208 }, { "epoch": 0.8780870246961976, "grad_norm": 0.743306040763855, "learning_rate": 8.082102363728494e-06, "loss": 0.12926363945007324, "memory(GiB)": 40.76, "step": 700, "token_acc": 0.9271781534460338, "train_speed(iter/s)": 0.121029 }, { "epoch": 0.8780870246961976, "eval_loss": 0.22914662957191467, "eval_runtime": 28.9342, "eval_samples_per_second": 17.799, "eval_steps_per_second": 4.458, "eval_token_acc": 0.9267055435142602, "step": 700 }, { "epoch": 0.884359074872599, "grad_norm": 0.774067759513855, "learning_rate": 8.056170877373277e-06, "loss": 0.12883291244506836, "memory(GiB)": 40.76, "step": 705, "token_acc": 0.941404062515909, "train_speed(iter/s)": 0.120445 }, { "epoch": 0.8906311250490004, "grad_norm": 0.731858491897583, "learning_rate": 8.030107487410766e-06, "loss": 0.1272268772125244, "memory(GiB)": 40.76, "step": 710, "token_acc": 0.9559863699726366, "train_speed(iter/s)": 0.120655 }, { "epoch": 0.8969031752254017, "grad_norm": 0.7816020846366882, "learning_rate": 8.003913318730662e-06, "loss": 0.12550874948501586, "memory(GiB)": 40.76, "step": 715, "token_acc": 0.9573273273273273, "train_speed(iter/s)": 0.120859 }, { "epoch": 0.9031752254018032, "grad_norm": 0.7145897150039673, "learning_rate": 7.97758950186705e-06, "loss": 0.11747034788131713, "memory(GiB)": 40.76, "step": 720, "token_acc": 0.9536420703541395, "train_speed(iter/s)": 0.121052 }, { "epoch": 0.9031752254018032, "eval_loss": 0.2287711501121521, "eval_runtime": 28.9287, "eval_samples_per_second": 17.802, "eval_steps_per_second": 4.459, "eval_token_acc": 0.9270073910728585, "step": 720 }, { "epoch": 0.9094472755782046, "grad_norm": 0.7315598130226135, "learning_rate": 7.951137172949595e-06, "loss": 0.1277442455291748, "memory(GiB)": 40.76, "step": 725, "token_acc": 0.9404945141684427, "train_speed(iter/s)": 0.12046 }, { "epoch": 0.915719325754606, "grad_norm": 0.7869780659675598, "learning_rate": 7.924557473654516e-06, "loss": 0.13705768585205078, "memory(GiB)": 40.77, "step": 730, "token_acc": 0.9484856989768581, "train_speed(iter/s)": 0.120675 }, { "epoch": 0.9219913759310074, "grad_norm": 0.738673985004425, "learning_rate": 7.897851551155306e-06, "loss": 0.12492038011550903, "memory(GiB)": 40.77, "step": 735, "token_acc": 0.9554785841007012, "train_speed(iter/s)": 0.120899 }, { "epoch": 0.9282634261074089, "grad_norm": 0.6920596361160278, "learning_rate": 7.871020558073217e-06, "loss": 0.12350271940231324, "memory(GiB)": 40.77, "step": 740, "token_acc": 0.9585155697561742, "train_speed(iter/s)": 0.121065 }, { "epoch": 0.9282634261074089, "eval_loss": 0.22647298872470856, "eval_runtime": 29.274, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.407, "eval_token_acc": 0.9271583148521576, "step": 740 }, { "epoch": 0.9345354762838103, "grad_norm": 0.7176641821861267, "learning_rate": 7.849466490796728e-06, "loss": 0.12098994255065917, "memory(GiB)": 40.77, "step": 745, "token_acc": 0.9415065810170385, "train_speed(iter/s)": 0.120467 }, { "epoch": 0.9408075264602117, "grad_norm": 0.7007727026939392, "learning_rate": 7.822413292469593e-06, "loss": 0.12603325843811036, "memory(GiB)": 40.77, "step": 750, "token_acc": 0.955721036803666, "train_speed(iter/s)": 0.120684 }, { "epoch": 0.9470795766366131, "grad_norm": 0.7308924198150635, "learning_rate": 7.79523827945686e-06, "loss": 0.1311476469039917, "memory(GiB)": 40.77, "step": 755, "token_acc": 0.9497907949790795, "train_speed(iter/s)": 0.120874 }, { "epoch": 0.9533516268130146, "grad_norm": 0.701979398727417, "learning_rate": 7.767942624625625e-06, "loss": 0.12925295829772948, "memory(GiB)": 40.77, "step": 760, "token_acc": 0.9502816180235535, "train_speed(iter/s)": 0.121076 }, { "epoch": 0.9533516268130146, "eval_loss": 0.2251773178577423, "eval_runtime": 29.2786, "eval_samples_per_second": 17.59, "eval_steps_per_second": 4.406, "eval_token_acc": 0.9271918534697796, "step": 760 } ], "logging_steps": 5, "max_steps": 2391, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.199964448180142e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }