{ "best_global_step": 1620, "best_metric": 0.3465479, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v14-20250430-214816/checkpoint-1620", "epoch": 2.9988481916609078, "eval_steps": 20, "global_step": 2439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012285955616985335, "grad_norm": 4.7140889167785645, "learning_rate": 9.99999585221637e-06, "loss": 0.7454355955123901, "memory(GiB)": 28.92, "step": 1, "token_acc": 0.8220973782771536, "train_speed(iter/s)": 0.064112 }, { "epoch": 0.006142977808492667, "grad_norm": 2.3402233123779297, "learning_rate": 9.999896305753298e-06, "loss": 0.6025638580322266, "memory(GiB)": 28.92, "step": 5, "token_acc": 0.8154806964420893, "train_speed(iter/s)": 0.1223 }, { "epoch": 0.012285955616985334, "grad_norm": 1.1033470630645752, "learning_rate": 9.99958522731419e-06, "loss": 0.4681520462036133, "memory(GiB)": 28.96, "step": 10, "token_acc": 0.8509727902413654, "train_speed(iter/s)": 0.134255 }, { "epoch": 0.018428933425478, "grad_norm": 1.1683531999588013, "learning_rate": 9.999066777585496e-06, "loss": 0.4340578556060791, "memory(GiB)": 30.5, "step": 15, "token_acc": 0.8624967569989859, "train_speed(iter/s)": 0.141354 }, { "epoch": 0.024571911233970668, "grad_norm": 0.8643156290054321, "learning_rate": 9.998340978071314e-06, "loss": 0.438944673538208, "memory(GiB)": 30.5, "step": 20, "token_acc": 0.8628914650122352, "train_speed(iter/s)": 0.145504 }, { "epoch": 0.024571911233970668, "eval_loss": 0.43904876708984375, "eval_runtime": 31.0999, "eval_samples_per_second": 16.913, "eval_steps_per_second": 4.244, "eval_token_acc": 0.8671750972762646, "step": 20 }, { "epoch": 0.030714889042463334, "grad_norm": 0.882213830947876, "learning_rate": 9.997407858876141e-06, "loss": 0.4316856384277344, "memory(GiB)": 32.21, "step": 25, "token_acc": 0.8673553096382113, "train_speed(iter/s)": 0.118362 }, { "epoch": 0.036857866850956, "grad_norm": 0.876335859298706, "learning_rate": 9.99626745870361e-06, "loss": 0.4254283428192139, "memory(GiB)": 32.21, "step": 30, "token_acc": 0.866745778634824, "train_speed(iter/s)": 0.122938 }, { "epoch": 0.043000844659448666, "grad_norm": 0.8186553120613098, "learning_rate": 9.994919824854899e-06, "loss": 0.4170750617980957, "memory(GiB)": 32.21, "step": 35, "token_acc": 0.8640802675585284, "train_speed(iter/s)": 0.127141 }, { "epoch": 0.049143822467941335, "grad_norm": 0.8065207004547119, "learning_rate": 9.993365013226757e-06, "loss": 0.40838775634765623, "memory(GiB)": 32.21, "step": 40, "token_acc": 0.8663708595604169, "train_speed(iter/s)": 0.130143 }, { "epoch": 0.049143822467941335, "eval_loss": 0.41924959421157837, "eval_runtime": 31.0376, "eval_samples_per_second": 16.947, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8721037613488976, "step": 40 }, { "epoch": 0.055286800276434005, "grad_norm": 0.7789999842643738, "learning_rate": 9.991603088309195e-06, "loss": 0.4384481906890869, "memory(GiB)": 32.21, "step": 45, "token_acc": 0.8650371852302875, "train_speed(iter/s)": 0.117241 }, { "epoch": 0.06142977808492667, "grad_norm": 0.7491472959518433, "learning_rate": 9.989634123182798e-06, "loss": 0.3983407497406006, "memory(GiB)": 32.21, "step": 50, "token_acc": 0.8744787141615986, "train_speed(iter/s)": 0.120332 }, { "epoch": 0.06757275589341934, "grad_norm": 0.8437614440917969, "learning_rate": 9.987458199515714e-06, "loss": 0.4000354290008545, "memory(GiB)": 32.21, "step": 55, "token_acc": 0.8653561422291064, "train_speed(iter/s)": 0.123396 }, { "epoch": 0.073715733701912, "grad_norm": 0.7674087285995483, "learning_rate": 9.985075407560247e-06, "loss": 0.4135420799255371, "memory(GiB)": 32.21, "step": 60, "token_acc": 0.872202027931892, "train_speed(iter/s)": 0.125154 }, { "epoch": 0.073715733701912, "eval_loss": 0.4098711311817169, "eval_runtime": 31.0819, "eval_samples_per_second": 16.923, "eval_steps_per_second": 4.247, "eval_token_acc": 0.8743692174664938, "step": 60 }, { "epoch": 0.07985871151040466, "grad_norm": 0.8239404559135437, "learning_rate": 9.982485846149125e-06, "loss": 0.39459028244018557, "memory(GiB)": 32.21, "step": 65, "token_acc": 0.8727861165617594, "train_speed(iter/s)": 0.116909 }, { "epoch": 0.08600168931889733, "grad_norm": 0.8135547637939453, "learning_rate": 9.979689622691393e-06, "loss": 0.4003786087036133, "memory(GiB)": 32.21, "step": 70, "token_acc": 0.8739415872132136, "train_speed(iter/s)": 0.118714 }, { "epoch": 0.09214466712739, "grad_norm": 0.853965699672699, "learning_rate": 9.976686853167967e-06, "loss": 0.405532693862915, "memory(GiB)": 32.21, "step": 75, "token_acc": 0.863868962219034, "train_speed(iter/s)": 0.120582 }, { "epoch": 0.09828764493588267, "grad_norm": 0.7862138152122498, "learning_rate": 9.973477662126818e-06, "loss": 0.38930883407592776, "memory(GiB)": 32.21, "step": 80, "token_acc": 0.8843768172126381, "train_speed(iter/s)": 0.122421 }, { "epoch": 0.09828764493588267, "eval_loss": 0.4023858904838562, "eval_runtime": 30.9765, "eval_samples_per_second": 16.981, "eval_steps_per_second": 4.261, "eval_token_acc": 0.8762680501513186, "step": 80 }, { "epoch": 0.10443062274437534, "grad_norm": 0.7761799097061157, "learning_rate": 9.970062182677802e-06, "loss": 0.3841962099075317, "memory(GiB)": 32.21, "step": 85, "token_acc": 0.8720659317731335, "train_speed(iter/s)": 0.116555 }, { "epoch": 0.11057360055286801, "grad_norm": 0.7647544145584106, "learning_rate": 9.966440556487149e-06, "loss": 0.40062150955200193, "memory(GiB)": 32.21, "step": 90, "token_acc": 0.8734117200834439, "train_speed(iter/s)": 0.11815 }, { "epoch": 0.11671657836136066, "grad_norm": 0.8558200597763062, "learning_rate": 9.962612933771575e-06, "loss": 0.41026945114135743, "memory(GiB)": 32.21, "step": 95, "token_acc": 0.8802111051978002, "train_speed(iter/s)": 0.119854 }, { "epoch": 0.12285955616985333, "grad_norm": 0.8282895088195801, "learning_rate": 9.958579473292067e-06, "loss": 0.40637502670288084, "memory(GiB)": 32.21, "step": 100, "token_acc": 0.8726802284082797, "train_speed(iter/s)": 0.121692 }, { "epoch": 0.12285955616985333, "eval_loss": 0.39839640259742737, "eval_runtime": 31.0438, "eval_samples_per_second": 16.944, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8775339386078685, "step": 100 }, { "epoch": 0.129002533978346, "grad_norm": 0.8366308212280273, "learning_rate": 9.95434034234728e-06, "loss": 0.3811495780944824, "memory(GiB)": 32.21, "step": 105, "token_acc": 0.875357573668792, "train_speed(iter/s)": 0.117395 }, { "epoch": 0.13514551178683867, "grad_norm": 0.7479439377784729, "learning_rate": 9.949895716766611e-06, "loss": 0.38749701976776124, "memory(GiB)": 32.21, "step": 110, "token_acc": 0.8701843549972431, "train_speed(iter/s)": 0.118845 }, { "epoch": 0.14128848959533133, "grad_norm": 0.801934003829956, "learning_rate": 9.945245780902899e-06, "loss": 0.37144348621368406, "memory(GiB)": 32.21, "step": 115, "token_acc": 0.8773385913426266, "train_speed(iter/s)": 0.120098 }, { "epoch": 0.147431467403824, "grad_norm": 0.7849209308624268, "learning_rate": 9.940390727624785e-06, "loss": 0.4016891956329346, "memory(GiB)": 32.21, "step": 120, "token_acc": 0.8671916991890818, "train_speed(iter/s)": 0.121292 }, { "epoch": 0.147431467403824, "eval_loss": 0.39595848321914673, "eval_runtime": 31.0135, "eval_samples_per_second": 16.96, "eval_steps_per_second": 4.256, "eval_token_acc": 0.877976653696498, "step": 120 }, { "epoch": 0.15357444521231667, "grad_norm": 0.7716063857078552, "learning_rate": 9.935330758308706e-06, "loss": 0.38781228065490725, "memory(GiB)": 32.21, "step": 125, "token_acc": 0.8762705679981929, "train_speed(iter/s)": 0.1175 }, { "epoch": 0.15971742302080932, "grad_norm": 0.7710253000259399, "learning_rate": 9.93006608283054e-06, "loss": 0.3876336574554443, "memory(GiB)": 32.21, "step": 130, "token_acc": 0.8821086956521739, "train_speed(iter/s)": 0.118454 }, { "epoch": 0.165860400829302, "grad_norm": 0.7821493744850159, "learning_rate": 9.924596919556917e-06, "loss": 0.40181121826171873, "memory(GiB)": 32.21, "step": 135, "token_acc": 0.8626237623762376, "train_speed(iter/s)": 0.119818 }, { "epoch": 0.17200337863779466, "grad_norm": 0.8226854205131531, "learning_rate": 9.918923495336138e-06, "loss": 0.39958484172821046, "memory(GiB)": 32.21, "step": 140, "token_acc": 0.8556235746008882, "train_speed(iter/s)": 0.120946 }, { "epoch": 0.17200337863779466, "eval_loss": 0.393728107213974, "eval_runtime": 31.0073, "eval_samples_per_second": 16.964, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8784504971897968, "step": 140 }, { "epoch": 0.17814635644628735, "grad_norm": 0.7877047061920166, "learning_rate": 9.913046045488787e-06, "loss": 0.38108556270599364, "memory(GiB)": 34.13, "step": 145, "token_acc": 0.8813046265713381, "train_speed(iter/s)": 0.11771 }, { "epoch": 0.18428933425478, "grad_norm": 0.7512264251708984, "learning_rate": 9.906964813797955e-06, "loss": 0.3876554250717163, "memory(GiB)": 34.13, "step": 150, "token_acc": 0.881988944871105, "train_speed(iter/s)": 0.118688 }, { "epoch": 0.19043231206327269, "grad_norm": 0.7701375484466553, "learning_rate": 9.900680052499138e-06, "loss": 0.38112673759460447, "memory(GiB)": 34.13, "step": 155, "token_acc": 0.8716818566661686, "train_speed(iter/s)": 0.119662 }, { "epoch": 0.19657528987176534, "grad_norm": 0.7622193098068237, "learning_rate": 9.894192022269773e-06, "loss": 0.3982468843460083, "memory(GiB)": 34.13, "step": 160, "token_acc": 0.8648266919817547, "train_speed(iter/s)": 0.120545 }, { "epoch": 0.19657528987176534, "eval_loss": 0.39097315073013306, "eval_runtime": 31.0177, "eval_samples_per_second": 16.958, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8786718547341116, "step": 160 }, { "epoch": 0.202718267680258, "grad_norm": 0.7747094035148621, "learning_rate": 9.887500992218421e-06, "loss": 0.3932340621948242, "memory(GiB)": 34.13, "step": 165, "token_acc": 0.8735516505058284, "train_speed(iter/s)": 0.117866 }, { "epoch": 0.20886124548875068, "grad_norm": 0.7225446701049805, "learning_rate": 9.880607239873614e-06, "loss": 0.3682489633560181, "memory(GiB)": 34.13, "step": 170, "token_acc": 0.8780595564195458, "train_speed(iter/s)": 0.118651 }, { "epoch": 0.21500422329724334, "grad_norm": 0.7513542771339417, "learning_rate": 9.873511051172331e-06, "loss": 0.37564697265625, "memory(GiB)": 34.13, "step": 175, "token_acc": 0.8798945693728777, "train_speed(iter/s)": 0.119494 }, { "epoch": 0.22114720110573602, "grad_norm": 0.7389309406280518, "learning_rate": 9.866212720448149e-06, "loss": 0.3957530498504639, "memory(GiB)": 34.13, "step": 180, "token_acc": 0.8693595046908373, "train_speed(iter/s)": 0.120172 }, { "epoch": 0.22114720110573602, "eval_loss": 0.3888963460922241, "eval_runtime": 31.0263, "eval_samples_per_second": 16.953, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8793705144833549, "step": 180 }, { "epoch": 0.22729017891422867, "grad_norm": 0.8499953746795654, "learning_rate": 9.85871255041903e-06, "loss": 0.39398903846740724, "memory(GiB)": 34.13, "step": 185, "token_acc": 0.8721618431945888, "train_speed(iter/s)": 0.11765 }, { "epoch": 0.23343315672272133, "grad_norm": 0.7052657008171082, "learning_rate": 9.85101085217477e-06, "loss": 0.3804319381713867, "memory(GiB)": 34.13, "step": 190, "token_acc": 0.8779494871039452, "train_speed(iter/s)": 0.118504 }, { "epoch": 0.239576134531214, "grad_norm": 0.8443171977996826, "learning_rate": 9.843107945164086e-06, "loss": 0.3854555606842041, "memory(GiB)": 34.13, "step": 195, "token_acc": 0.8738016136687233, "train_speed(iter/s)": 0.119158 }, { "epoch": 0.24571911233970667, "grad_norm": 0.7444053292274475, "learning_rate": 9.835004157181372e-06, "loss": 0.3835892677307129, "memory(GiB)": 34.13, "step": 200, "token_acc": 0.8789022648439094, "train_speed(iter/s)": 0.119936 }, { "epoch": 0.24571911233970667, "eval_loss": 0.38609230518341064, "eval_runtime": 30.9745, "eval_samples_per_second": 16.982, "eval_steps_per_second": 4.262, "eval_token_acc": 0.880086467790748, "step": 200 }, { "epoch": 0.2518620901481993, "grad_norm": 0.7656287550926208, "learning_rate": 9.826699824353106e-06, "loss": 0.3835402488708496, "memory(GiB)": 34.13, "step": 205, "token_acc": 0.8772318628475851, "train_speed(iter/s)": 0.117635 }, { "epoch": 0.258005067956692, "grad_norm": 0.7985251545906067, "learning_rate": 9.818195291123903e-06, "loss": 0.37469916343688964, "memory(GiB)": 36.59, "step": 210, "token_acc": 0.8918794474675596, "train_speed(iter/s)": 0.11841 }, { "epoch": 0.2641480457651847, "grad_norm": 0.7901045680046082, "learning_rate": 9.80949091024223e-06, "loss": 0.39004669189453123, "memory(GiB)": 36.59, "step": 215, "token_acc": 0.8694972278822917, "train_speed(iter/s)": 0.119102 }, { "epoch": 0.27029102357367735, "grad_norm": 0.7759472727775574, "learning_rate": 9.800587042745774e-06, "loss": 0.37646257877349854, "memory(GiB)": 36.59, "step": 220, "token_acc": 0.8768733180258252, "train_speed(iter/s)": 0.119681 }, { "epoch": 0.27029102357367735, "eval_loss": 0.38418954610824585, "eval_runtime": 30.9751, "eval_samples_per_second": 16.981, "eval_steps_per_second": 4.261, "eval_token_acc": 0.8808370082144401, "step": 220 }, { "epoch": 0.27643400138217, "grad_norm": 0.7889726161956787, "learning_rate": 9.791484057946465e-06, "loss": 0.3830937385559082, "memory(GiB)": 36.59, "step": 225, "token_acc": 0.8788355828537511, "train_speed(iter/s)": 0.117815 }, { "epoch": 0.28257697919066266, "grad_norm": 0.8053146004676819, "learning_rate": 9.782182333415168e-06, "loss": 0.40045747756958006, "memory(GiB)": 36.59, "step": 230, "token_acc": 0.8767751952143934, "train_speed(iter/s)": 0.118387 }, { "epoch": 0.2887199569991553, "grad_norm": 0.7342280745506287, "learning_rate": 9.772682254966009e-06, "loss": 0.39071879386901853, "memory(GiB)": 36.59, "step": 235, "token_acc": 0.8698379998127166, "train_speed(iter/s)": 0.119097 }, { "epoch": 0.294862934807648, "grad_norm": 0.7769783139228821, "learning_rate": 9.762984216640378e-06, "loss": 0.38714871406555174, "memory(GiB)": 36.59, "step": 240, "token_acc": 0.8766444973056945, "train_speed(iter/s)": 0.119737 }, { "epoch": 0.294862934807648, "eval_loss": 0.38342124223709106, "eval_runtime": 30.9908, "eval_samples_per_second": 16.973, "eval_steps_per_second": 4.259, "eval_token_acc": 0.8811033290099438, "step": 240 }, { "epoch": 0.3010059126161407, "grad_norm": 0.7775170803070068, "learning_rate": 9.753088620690589e-06, "loss": 0.36563289165496826, "memory(GiB)": 36.59, "step": 245, "token_acc": 0.8821107213664786, "train_speed(iter/s)": 0.117883 }, { "epoch": 0.30714889042463334, "grad_norm": 0.7627344131469727, "learning_rate": 9.742995877563187e-06, "loss": 0.3691666841506958, "memory(GiB)": 36.59, "step": 250, "token_acc": 0.8684178043301157, "train_speed(iter/s)": 0.11847 }, { "epoch": 0.313291868233126, "grad_norm": 0.730969250202179, "learning_rate": 9.732706405881931e-06, "loss": 0.37671756744384766, "memory(GiB)": 36.59, "step": 255, "token_acc": 0.8784978880675819, "train_speed(iter/s)": 0.118913 }, { "epoch": 0.31943484604161865, "grad_norm": 0.7510061860084534, "learning_rate": 9.722220632430428e-06, "loss": 0.36403095722198486, "memory(GiB)": 36.59, "step": 260, "token_acc": 0.884961560097506, "train_speed(iter/s)": 0.1194 }, { "epoch": 0.31943484604161865, "eval_loss": 0.3818422555923462, "eval_runtime": 30.9337, "eval_samples_per_second": 17.004, "eval_steps_per_second": 4.267, "eval_token_acc": 0.8810894941634241, "step": 260 }, { "epoch": 0.32557782385011136, "grad_norm": 0.6700690984725952, "learning_rate": 9.711538992134427e-06, "loss": 0.37852253913879397, "memory(GiB)": 36.59, "step": 265, "token_acc": 0.8780975219824141, "train_speed(iter/s)": 0.117682 }, { "epoch": 0.331720801658604, "grad_norm": 0.7542963624000549, "learning_rate": 9.700661928043787e-06, "loss": 0.3520061016082764, "memory(GiB)": 36.59, "step": 270, "token_acc": 0.8765217391304347, "train_speed(iter/s)": 0.118172 }, { "epoch": 0.33786377946709667, "grad_norm": 0.6696748733520508, "learning_rate": 9.689589891314094e-06, "loss": 0.3755272150039673, "memory(GiB)": 36.59, "step": 275, "token_acc": 0.8727695145026466, "train_speed(iter/s)": 0.118608 }, { "epoch": 0.3440067572755893, "grad_norm": 0.7883334159851074, "learning_rate": 9.678323341187956e-06, "loss": 0.376280689239502, "memory(GiB)": 36.59, "step": 280, "token_acc": 0.8781244037397443, "train_speed(iter/s)": 0.119045 }, { "epoch": 0.3440067572755893, "eval_loss": 0.380220502614975, "eval_runtime": 30.9631, "eval_samples_per_second": 16.988, "eval_steps_per_second": 4.263, "eval_token_acc": 0.8814526588845655, "step": 280 }, { "epoch": 0.350149735084082, "grad_norm": 0.7125808596611023, "learning_rate": 9.666862744975938e-06, "loss": 0.3811634063720703, "memory(GiB)": 36.59, "step": 285, "token_acc": 0.881547675634566, "train_speed(iter/s)": 0.117616 }, { "epoch": 0.3562927128925747, "grad_norm": 0.7022562623023987, "learning_rate": 9.655208578037198e-06, "loss": 0.36770806312561033, "memory(GiB)": 36.59, "step": 290, "token_acc": 0.8775136241403108, "train_speed(iter/s)": 0.118155 }, { "epoch": 0.36243569070106735, "grad_norm": 0.7109845280647278, "learning_rate": 9.643361323759763e-06, "loss": 0.36910414695739746, "memory(GiB)": 36.59, "step": 295, "token_acc": 0.8801465983159751, "train_speed(iter/s)": 0.118621 }, { "epoch": 0.36857866850956, "grad_norm": 0.7310053706169128, "learning_rate": 9.631321473540476e-06, "loss": 0.36344945430755615, "memory(GiB)": 36.59, "step": 300, "token_acc": 0.8726629026286561, "train_speed(iter/s)": 0.119086 }, { "epoch": 0.36857866850956, "eval_loss": 0.3780768811702728, "eval_runtime": 31.0728, "eval_samples_per_second": 16.928, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8825006485084306, "step": 300 }, { "epoch": 0.37472164631805266, "grad_norm": 0.7264479994773865, "learning_rate": 9.619089526764614e-06, "loss": 0.380098819732666, "memory(GiB)": 36.59, "step": 305, "token_acc": 0.8804112554112554, "train_speed(iter/s)": 0.11773 }, { "epoch": 0.38086462412654537, "grad_norm": 0.8007322549819946, "learning_rate": 9.60666599078518e-06, "loss": 0.3628620862960815, "memory(GiB)": 36.59, "step": 310, "token_acc": 0.8855827918881669, "train_speed(iter/s)": 0.118139 }, { "epoch": 0.387007601935038, "grad_norm": 0.730522871017456, "learning_rate": 9.59405138090186e-06, "loss": 0.36655001640319823, "memory(GiB)": 36.59, "step": 315, "token_acc": 0.8823326091250246, "train_speed(iter/s)": 0.118659 }, { "epoch": 0.3931505797435307, "grad_norm": 0.7646607756614685, "learning_rate": 9.581246220339636e-06, "loss": 0.35800130367279054, "memory(GiB)": 36.59, "step": 320, "token_acc": 0.8788769866274592, "train_speed(iter/s)": 0.119038 }, { "epoch": 0.3931505797435307, "eval_loss": 0.37690821290016174, "eval_runtime": 31.0274, "eval_samples_per_second": 16.953, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8822689148292261, "step": 320 }, { "epoch": 0.39929355755202334, "grad_norm": 0.7562268972396851, "learning_rate": 9.568251040227101e-06, "loss": 0.384972071647644, "memory(GiB)": 36.59, "step": 325, "token_acc": 0.8815015713117225, "train_speed(iter/s)": 0.117697 }, { "epoch": 0.405436535360516, "grad_norm": 0.7428621053695679, "learning_rate": 9.555066379574423e-06, "loss": 0.3597818613052368, "memory(GiB)": 36.59, "step": 330, "token_acc": 0.889793055068397, "train_speed(iter/s)": 0.118163 }, { "epoch": 0.4115795131690087, "grad_norm": 0.7479391098022461, "learning_rate": 9.541692785250983e-06, "loss": 0.3805227279663086, "memory(GiB)": 36.59, "step": 335, "token_acc": 0.8907455632716049, "train_speed(iter/s)": 0.118498 }, { "epoch": 0.41772249097750136, "grad_norm": 0.6682092547416687, "learning_rate": 9.528130811962693e-06, "loss": 0.37683632373809817, "memory(GiB)": 36.59, "step": 340, "token_acc": 0.8722201102452005, "train_speed(iter/s)": 0.118896 }, { "epoch": 0.41772249097750136, "eval_loss": 0.3755421042442322, "eval_runtime": 31.0414, "eval_samples_per_second": 16.945, "eval_steps_per_second": 4.252, "eval_token_acc": 0.88294336359706, "step": 340 }, { "epoch": 0.423865468785994, "grad_norm": 0.7604427933692932, "learning_rate": 9.514381022228997e-06, "loss": 0.36464648246765136, "memory(GiB)": 36.59, "step": 345, "token_acc": 0.8840508026994174, "train_speed(iter/s)": 0.117605 }, { "epoch": 0.43000844659448667, "grad_norm": 0.7618926763534546, "learning_rate": 9.50044398635953e-06, "loss": 0.37844386100769045, "memory(GiB)": 36.59, "step": 350, "token_acc": 0.8788168373151308, "train_speed(iter/s)": 0.117953 }, { "epoch": 0.4361514244029793, "grad_norm": 0.6848899126052856, "learning_rate": 9.486320282430469e-06, "loss": 0.3681621551513672, "memory(GiB)": 36.59, "step": 355, "token_acc": 0.8739398701268689, "train_speed(iter/s)": 0.11841 }, { "epoch": 0.44229440221147204, "grad_norm": 0.7334110140800476, "learning_rate": 9.472010496260545e-06, "loss": 0.3769216060638428, "memory(GiB)": 36.59, "step": 360, "token_acc": 0.8754503693028283, "train_speed(iter/s)": 0.118855 }, { "epoch": 0.44229440221147204, "eval_loss": 0.3748551905155182, "eval_runtime": 31.0629, "eval_samples_per_second": 16.933, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8829156939040208, "step": 360 }, { "epoch": 0.4484373800199647, "grad_norm": 0.6733468770980835, "learning_rate": 9.45751522138676e-06, "loss": 0.3699374198913574, "memory(GiB)": 36.59, "step": 365, "token_acc": 0.8818029853755239, "train_speed(iter/s)": 0.117632 }, { "epoch": 0.45458035782845735, "grad_norm": 0.6975194811820984, "learning_rate": 9.44283505903976e-06, "loss": 0.3686963081359863, "memory(GiB)": 36.59, "step": 370, "token_acc": 0.8794543496470025, "train_speed(iter/s)": 0.118021 }, { "epoch": 0.46072333563695, "grad_norm": 0.7434240579605103, "learning_rate": 9.427970618118888e-06, "loss": 0.38825435638427735, "memory(GiB)": 36.59, "step": 375, "token_acc": 0.875475461545598, "train_speed(iter/s)": 0.118433 }, { "epoch": 0.46686631344544266, "grad_norm": 0.7431550621986389, "learning_rate": 9.412922515166952e-06, "loss": 0.36851983070373534, "memory(GiB)": 36.59, "step": 380, "token_acc": 0.8677917508307813, "train_speed(iter/s)": 0.118763 }, { "epoch": 0.46686631344544266, "eval_loss": 0.37393027544021606, "eval_runtime": 31.0461, "eval_samples_per_second": 16.943, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8833099870298314, "step": 380 }, { "epoch": 0.47300929125393537, "grad_norm": 0.7830028533935547, "learning_rate": 9.39769137434463e-06, "loss": 0.39449851512908934, "memory(GiB)": 36.59, "step": 385, "token_acc": 0.8813338029015882, "train_speed(iter/s)": 0.117631 }, { "epoch": 0.479152269062428, "grad_norm": 0.7592146992683411, "learning_rate": 9.38227782740459e-06, "loss": 0.3797061681747437, "memory(GiB)": 36.59, "step": 390, "token_acc": 0.8725708251892791, "train_speed(iter/s)": 0.118076 }, { "epoch": 0.4852952468709207, "grad_norm": 0.7044036388397217, "learning_rate": 9.366682513665293e-06, "loss": 0.34874444007873534, "memory(GiB)": 36.59, "step": 395, "token_acc": 0.8924136680866491, "train_speed(iter/s)": 0.118392 }, { "epoch": 0.49143822467941334, "grad_norm": 0.7327947020530701, "learning_rate": 9.350906079984456e-06, "loss": 0.3913299322128296, "memory(GiB)": 36.59, "step": 400, "token_acc": 0.8793465520609494, "train_speed(iter/s)": 0.118741 }, { "epoch": 0.49143822467941334, "eval_loss": 0.37309640645980835, "eval_runtime": 31.0239, "eval_samples_per_second": 16.955, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8834448767833982, "step": 400 }, { "epoch": 0.497581202487906, "grad_norm": 0.714108407497406, "learning_rate": 9.334949180732245e-06, "loss": 0.3835240364074707, "memory(GiB)": 36.59, "step": 405, "token_acc": 0.8806726886733547, "train_speed(iter/s)": 0.117711 }, { "epoch": 0.5037241802963986, "grad_norm": 0.6842460632324219, "learning_rate": 9.31881247776412e-06, "loss": 0.34242706298828124, "memory(GiB)": 36.59, "step": 410, "token_acc": 0.8898083315651744, "train_speed(iter/s)": 0.118116 }, { "epoch": 0.5098671581048914, "grad_norm": 0.7109769582748413, "learning_rate": 9.302496640393383e-06, "loss": 0.3699876546859741, "memory(GiB)": 36.59, "step": 415, "token_acc": 0.8834916327453641, "train_speed(iter/s)": 0.118429 }, { "epoch": 0.516010135913384, "grad_norm": 0.72795569896698, "learning_rate": 9.286002345363418e-06, "loss": 0.36434710025787354, "memory(GiB)": 36.59, "step": 420, "token_acc": 0.8838608737513539, "train_speed(iter/s)": 0.118728 }, { "epoch": 0.516010135913384, "eval_loss": 0.37135639786720276, "eval_runtime": 31.0313, "eval_samples_per_second": 16.951, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8840812797233031, "step": 420 }, { "epoch": 0.5221531137218767, "grad_norm": 0.6852810382843018, "learning_rate": 9.26933027681963e-06, "loss": 0.371048641204834, "memory(GiB)": 36.59, "step": 425, "token_acc": 0.8814758591608687, "train_speed(iter/s)": 0.117721 }, { "epoch": 0.5282960915303694, "grad_norm": 0.7316782474517822, "learning_rate": 9.25248112628105e-06, "loss": 0.3735438346862793, "memory(GiB)": 36.59, "step": 430, "token_acc": 0.8826662287081789, "train_speed(iter/s)": 0.117992 }, { "epoch": 0.534439069338862, "grad_norm": 0.7115728259086609, "learning_rate": 9.235455592611667e-06, "loss": 0.360302734375, "memory(GiB)": 36.59, "step": 435, "token_acc": 0.8884788847888478, "train_speed(iter/s)": 0.118327 }, { "epoch": 0.5405820471473547, "grad_norm": 0.6213703155517578, "learning_rate": 9.218254381991438e-06, "loss": 0.363602352142334, "memory(GiB)": 36.59, "step": 440, "token_acc": 0.8796412181894034, "train_speed(iter/s)": 0.118669 }, { "epoch": 0.5405820471473547, "eval_loss": 0.37061288952827454, "eval_runtime": 31.0004, "eval_samples_per_second": 16.968, "eval_steps_per_second": 4.258, "eval_token_acc": 0.8841919584954604, "step": 440 }, { "epoch": 0.5467250249558473, "grad_norm": 0.6319580674171448, "learning_rate": 9.200878207886995e-06, "loss": 0.36367177963256836, "memory(GiB)": 36.59, "step": 445, "token_acc": 0.880615405975304, "train_speed(iter/s)": 0.11768 }, { "epoch": 0.55286800276434, "grad_norm": 0.7951823472976685, "learning_rate": 9.183327791022048e-06, "loss": 0.37214341163635256, "memory(GiB)": 36.59, "step": 450, "token_acc": 0.88060522696011, "train_speed(iter/s)": 0.118044 }, { "epoch": 0.5590109805728327, "grad_norm": 0.7379077076911926, "learning_rate": 9.165603859347503e-06, "loss": 0.3636307716369629, "memory(GiB)": 36.59, "step": 455, "token_acc": 0.8860057913311012, "train_speed(iter/s)": 0.118377 }, { "epoch": 0.5651539583813253, "grad_norm": 0.6838334798812866, "learning_rate": 9.147707148011255e-06, "loss": 0.36699528694152833, "memory(GiB)": 36.59, "step": 460, "token_acc": 0.8731886687471273, "train_speed(iter/s)": 0.118711 }, { "epoch": 0.5651539583813253, "eval_loss": 0.3706679344177246, "eval_runtime": 31.0119, "eval_samples_per_second": 16.961, "eval_steps_per_second": 4.256, "eval_token_acc": 0.883956766104626, "step": 460 }, { "epoch": 0.571296936189818, "grad_norm": 0.7097205519676208, "learning_rate": 9.129638399327707e-06, "loss": 0.3835044622421265, "memory(GiB)": 36.59, "step": 465, "token_acc": 0.8826179212466955, "train_speed(iter/s)": 0.117847 }, { "epoch": 0.5774399139983106, "grad_norm": 0.7685003876686096, "learning_rate": 9.111398362746969e-06, "loss": 0.34739508628845217, "memory(GiB)": 36.59, "step": 470, "token_acc": 0.8856424192063242, "train_speed(iter/s)": 0.118092 }, { "epoch": 0.5835828918068033, "grad_norm": 0.684012234210968, "learning_rate": 9.092987794823785e-06, "loss": 0.35583484172821045, "memory(GiB)": 36.59, "step": 475, "token_acc": 0.8870865428183053, "train_speed(iter/s)": 0.118389 }, { "epoch": 0.589725869615296, "grad_norm": 0.7555577158927917, "learning_rate": 9.074407459186144e-06, "loss": 0.3742217540740967, "memory(GiB)": 36.59, "step": 480, "token_acc": 0.8749733708902407, "train_speed(iter/s)": 0.118715 }, { "epoch": 0.589725869615296, "eval_loss": 0.3698117733001709, "eval_runtime": 31.0843, "eval_samples_per_second": 16.922, "eval_steps_per_second": 4.247, "eval_token_acc": 0.8841297016861219, "step": 480 }, { "epoch": 0.5958688474237887, "grad_norm": 0.7176510095596313, "learning_rate": 9.055658126503605e-06, "loss": 0.35448513031005857, "memory(GiB)": 36.59, "step": 485, "token_acc": 0.882393450149208, "train_speed(iter/s)": 0.117835 }, { "epoch": 0.6020118252322814, "grad_norm": 0.7371023297309875, "learning_rate": 9.036740574455345e-06, "loss": 0.35907247066497805, "memory(GiB)": 36.59, "step": 490, "token_acc": 0.8887174366887537, "train_speed(iter/s)": 0.118083 }, { "epoch": 0.608154803040774, "grad_norm": 0.6593868136405945, "learning_rate": 9.017655587697885e-06, "loss": 0.36144974231719973, "memory(GiB)": 36.59, "step": 495, "token_acc": 0.8897585166019836, "train_speed(iter/s)": 0.118377 }, { "epoch": 0.6142977808492667, "grad_norm": 0.7346932291984558, "learning_rate": 8.998403957832553e-06, "loss": 0.35957746505737304, "memory(GiB)": 36.59, "step": 500, "token_acc": 0.8936918488180564, "train_speed(iter/s)": 0.118644 }, { "epoch": 0.6142977808492667, "eval_loss": 0.36875346302986145, "eval_runtime": 30.9564, "eval_samples_per_second": 16.992, "eval_steps_per_second": 4.264, "eval_token_acc": 0.8846000864677908, "step": 500 }, { "epoch": 0.6204407586577594, "grad_norm": 0.6690182089805603, "learning_rate": 8.978986483372657e-06, "loss": 0.36060357093811035, "memory(GiB)": 36.59, "step": 505, "token_acc": 0.8834197325817438, "train_speed(iter/s)": 0.117782 }, { "epoch": 0.626583736466252, "grad_norm": 0.6996055245399475, "learning_rate": 8.959403969710346e-06, "loss": 0.35636866092681885, "memory(GiB)": 36.59, "step": 510, "token_acc": 0.8747046644744855, "train_speed(iter/s)": 0.118099 }, { "epoch": 0.6327267142747447, "grad_norm": 0.7242439985275269, "learning_rate": 8.939657229083223e-06, "loss": 0.362790584564209, "memory(GiB)": 36.59, "step": 515, "token_acc": 0.8795643179382369, "train_speed(iter/s)": 0.11841 }, { "epoch": 0.6388696920832373, "grad_norm": 0.7438492178916931, "learning_rate": 8.919747080540647e-06, "loss": 0.36803131103515624, "memory(GiB)": 36.59, "step": 520, "token_acc": 0.8868724794882492, "train_speed(iter/s)": 0.118724 }, { "epoch": 0.6388696920832373, "eval_loss": 0.3668961226940155, "eval_runtime": 31.0395, "eval_samples_per_second": 16.946, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8850704712494596, "step": 520 }, { "epoch": 0.64501266989173, "grad_norm": 0.7443042993545532, "learning_rate": 8.899674349909759e-06, "loss": 0.3723003387451172, "memory(GiB)": 36.59, "step": 525, "token_acc": 0.8819261436583553, "train_speed(iter/s)": 0.117947 }, { "epoch": 0.6511556477002227, "grad_norm": 0.7717169523239136, "learning_rate": 8.879439869761233e-06, "loss": 0.37207541465759275, "memory(GiB)": 36.59, "step": 530, "token_acc": 0.8742153725911633, "train_speed(iter/s)": 0.118275 }, { "epoch": 0.6572986255087153, "grad_norm": 0.7282743453979492, "learning_rate": 8.859044479374737e-06, "loss": 0.3790937900543213, "memory(GiB)": 36.59, "step": 535, "token_acc": 0.8727581424267062, "train_speed(iter/s)": 0.118594 }, { "epoch": 0.663441603317208, "grad_norm": 0.7114688158035278, "learning_rate": 8.838489024704131e-06, "loss": 0.3806754112243652, "memory(GiB)": 36.59, "step": 540, "token_acc": 0.8710419328609594, "train_speed(iter/s)": 0.1188 }, { "epoch": 0.663441603317208, "eval_loss": 0.3665069043636322, "eval_runtime": 31.0202, "eval_samples_per_second": 16.957, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8851638564634674, "step": 540 }, { "epoch": 0.6695845811257006, "grad_norm": 0.7657713890075684, "learning_rate": 8.817774358342367e-06, "loss": 0.3505518913269043, "memory(GiB)": 36.59, "step": 545, "token_acc": 0.8844506134759065, "train_speed(iter/s)": 0.118004 }, { "epoch": 0.6757275589341933, "grad_norm": 0.7225794196128845, "learning_rate": 8.796901339486136e-06, "loss": 0.36959023475646974, "memory(GiB)": 36.59, "step": 550, "token_acc": 0.8763546536336592, "train_speed(iter/s)": 0.118273 }, { "epoch": 0.681870536742686, "grad_norm": 0.6392650604248047, "learning_rate": 8.775870833900226e-06, "loss": 0.35045757293701174, "memory(GiB)": 36.59, "step": 555, "token_acc": 0.879759337041662, "train_speed(iter/s)": 0.118527 }, { "epoch": 0.6880135145511787, "grad_norm": 0.7549835443496704, "learning_rate": 8.75468371388161e-06, "loss": 0.3724693775177002, "memory(GiB)": 36.59, "step": 560, "token_acc": 0.8853696026829382, "train_speed(iter/s)": 0.11871 }, { "epoch": 0.6880135145511787, "eval_loss": 0.3657075762748718, "eval_runtime": 31.0165, "eval_samples_per_second": 16.959, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8855408560311284, "step": 560 }, { "epoch": 0.6941564923596714, "grad_norm": 0.7093605399131775, "learning_rate": 8.733340858223268e-06, "loss": 0.3566612720489502, "memory(GiB)": 36.59, "step": 565, "token_acc": 0.8851378312772255, "train_speed(iter/s)": 0.117959 }, { "epoch": 0.700299470168164, "grad_norm": 0.7799498438835144, "learning_rate": 8.711843152177735e-06, "loss": 0.35616464614868165, "memory(GiB)": 36.59, "step": 570, "token_acc": 0.8828106906294872, "train_speed(iter/s)": 0.118218 }, { "epoch": 0.7064424479766567, "grad_norm": 0.7166778445243835, "learning_rate": 8.690191487420385e-06, "loss": 0.36056735515594485, "memory(GiB)": 36.59, "step": 575, "token_acc": 0.8801836905093237, "train_speed(iter/s)": 0.118442 }, { "epoch": 0.7125854257851494, "grad_norm": 0.7225358486175537, "learning_rate": 8.668386762012445e-06, "loss": 0.3537228345870972, "memory(GiB)": 36.59, "step": 580, "token_acc": 0.8771770513178728, "train_speed(iter/s)": 0.118636 }, { "epoch": 0.7125854257851494, "eval_loss": 0.3655913472175598, "eval_runtime": 31.0226, "eval_samples_per_second": 16.955, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8858832684824903, "step": 580 }, { "epoch": 0.718728403593642, "grad_norm": 0.707248866558075, "learning_rate": 8.646429880363746e-06, "loss": 0.35696611404418943, "memory(GiB)": 36.59, "step": 585, "token_acc": 0.8862164894194702, "train_speed(iter/s)": 0.117878 }, { "epoch": 0.7248713814021347, "grad_norm": 0.7443193197250366, "learning_rate": 8.624321753195209e-06, "loss": 0.3900872468948364, "memory(GiB)": 36.59, "step": 590, "token_acc": 0.8764379646896419, "train_speed(iter/s)": 0.118131 }, { "epoch": 0.7310143592106274, "grad_norm": 0.6623369455337524, "learning_rate": 8.602063297501069e-06, "loss": 0.36977558135986327, "memory(GiB)": 36.59, "step": 595, "token_acc": 0.880887231518028, "train_speed(iter/s)": 0.118362 }, { "epoch": 0.73715733701912, "grad_norm": 0.7453055381774902, "learning_rate": 8.579655436510847e-06, "loss": 0.35259857177734377, "memory(GiB)": 36.59, "step": 600, "token_acc": 0.8738677315671569, "train_speed(iter/s)": 0.118577 }, { "epoch": 0.73715733701912, "eval_loss": 0.3638327419757843, "eval_runtime": 31.0054, "eval_samples_per_second": 16.965, "eval_steps_per_second": 4.257, "eval_token_acc": 0.885855598789451, "step": 600 }, { "epoch": 0.7433003148276127, "grad_norm": 0.7225296497344971, "learning_rate": 8.557099099651046e-06, "loss": 0.36968977451324464, "memory(GiB)": 36.59, "step": 605, "token_acc": 0.8823149650444696, "train_speed(iter/s)": 0.11788 }, { "epoch": 0.7494432926361053, "grad_norm": 0.6900773048400879, "learning_rate": 8.534395222506614e-06, "loss": 0.36718852519989015, "memory(GiB)": 36.59, "step": 610, "token_acc": 0.8860133630289533, "train_speed(iter/s)": 0.118141 }, { "epoch": 0.755586270444598, "grad_norm": 0.671517014503479, "learning_rate": 8.511544746782124e-06, "loss": 0.36435210704803467, "memory(GiB)": 36.59, "step": 615, "token_acc": 0.8798159594739043, "train_speed(iter/s)": 0.118359 }, { "epoch": 0.7617292482530907, "grad_norm": 0.6808713674545288, "learning_rate": 8.488548620262722e-06, "loss": 0.36147489547729494, "memory(GiB)": 36.59, "step": 620, "token_acc": 0.8823460793691529, "train_speed(iter/s)": 0.118579 }, { "epoch": 0.7617292482530907, "eval_loss": 0.3635016977787018, "eval_runtime": 31.0094, "eval_samples_per_second": 16.963, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8860146995244271, "step": 620 }, { "epoch": 0.7678722260615833, "grad_norm": 0.7556000351905823, "learning_rate": 8.465407796774816e-06, "loss": 0.36651790142059326, "memory(GiB)": 36.59, "step": 625, "token_acc": 0.8846928285600197, "train_speed(iter/s)": 0.117873 }, { "epoch": 0.774015203870076, "grad_norm": 0.724098801612854, "learning_rate": 8.442123236146509e-06, "loss": 0.35537469387054443, "memory(GiB)": 36.59, "step": 630, "token_acc": 0.8859382569251772, "train_speed(iter/s)": 0.118118 }, { "epoch": 0.7801581816785687, "grad_norm": 0.728448748588562, "learning_rate": 8.418695904167789e-06, "loss": 0.3752614974975586, "memory(GiB)": 36.59, "step": 635, "token_acc": 0.8905149297823024, "train_speed(iter/s)": 0.118318 }, { "epoch": 0.7863011594870614, "grad_norm": 0.7735581994056702, "learning_rate": 8.395126772550475e-06, "loss": 0.3447936773300171, "memory(GiB)": 36.59, "step": 640, "token_acc": 0.8823329283110571, "train_speed(iter/s)": 0.118526 }, { "epoch": 0.7863011594870614, "eval_loss": 0.36254996061325073, "eval_runtime": 31.0583, "eval_samples_per_second": 16.936, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8862879377431907, "step": 640 }, { "epoch": 0.7924441372955541, "grad_norm": 0.6083407402038574, "learning_rate": 8.371416818887907e-06, "loss": 0.3541299343109131, "memory(GiB)": 36.59, "step": 645, "token_acc": 0.8867384523493496, "train_speed(iter/s)": 0.117839 }, { "epoch": 0.7985871151040467, "grad_norm": 0.7006340622901917, "learning_rate": 8.347567026614398e-06, "loss": 0.36687259674072265, "memory(GiB)": 36.59, "step": 650, "token_acc": 0.878874098160756, "train_speed(iter/s)": 0.118045 }, { "epoch": 0.8047300929125394, "grad_norm": 0.7071450352668762, "learning_rate": 8.323578384964444e-06, "loss": 0.354215145111084, "memory(GiB)": 36.59, "step": 655, "token_acc": 0.8844807747626809, "train_speed(iter/s)": 0.118259 }, { "epoch": 0.810873070721032, "grad_norm": 0.6859620809555054, "learning_rate": 8.299451888931696e-06, "loss": 0.33744206428527834, "memory(GiB)": 36.59, "step": 660, "token_acc": 0.8832839002687923, "train_speed(iter/s)": 0.118483 }, { "epoch": 0.810873070721032, "eval_loss": 0.36208656430244446, "eval_runtime": 31.005, "eval_samples_per_second": 16.965, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8863259835711198, "step": 660 }, { "epoch": 0.8170160485295247, "grad_norm": 0.6853975057601929, "learning_rate": 8.275188539227687e-06, "loss": 0.3501296043395996, "memory(GiB)": 36.59, "step": 665, "token_acc": 0.8818506429867994, "train_speed(iter/s)": 0.117792 }, { "epoch": 0.8231590263380174, "grad_norm": 0.672095775604248, "learning_rate": 8.250789342240326e-06, "loss": 0.3572331190109253, "memory(GiB)": 36.59, "step": 670, "token_acc": 0.8840531998946537, "train_speed(iter/s)": 0.118042 }, { "epoch": 0.82930200414651, "grad_norm": 0.6654704809188843, "learning_rate": 8.22625530999215e-06, "loss": 0.35687694549560545, "memory(GiB)": 36.59, "step": 675, "token_acc": 0.8840721896461247, "train_speed(iter/s)": 0.118263 }, { "epoch": 0.8354449819550027, "grad_norm": 0.6872120499610901, "learning_rate": 8.201587460098362e-06, "loss": 0.34873204231262206, "memory(GiB)": 36.59, "step": 680, "token_acc": 0.884066094755313, "train_speed(iter/s)": 0.118437 }, { "epoch": 0.8354449819550027, "eval_loss": 0.36098214983940125, "eval_runtime": 31.0688, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8864989191526157, "step": 680 }, { "epoch": 0.8415879597634953, "grad_norm": 0.6905971765518188, "learning_rate": 8.176786815724601e-06, "loss": 0.3643667221069336, "memory(GiB)": 36.59, "step": 685, "token_acc": 0.8811371118426906, "train_speed(iter/s)": 0.117814 }, { "epoch": 0.847730937571988, "grad_norm": 0.688023567199707, "learning_rate": 8.151854405544526e-06, "loss": 0.369766902923584, "memory(GiB)": 36.59, "step": 690, "token_acc": 0.8848363488998546, "train_speed(iter/s)": 0.118018 }, { "epoch": 0.8538739153804807, "grad_norm": 0.6458128690719604, "learning_rate": 8.12679126369713e-06, "loss": 0.3629646301269531, "memory(GiB)": 36.59, "step": 695, "token_acc": 0.8775533863525702, "train_speed(iter/s)": 0.118233 }, { "epoch": 0.8600168931889733, "grad_norm": 0.6942622065544128, "learning_rate": 8.101598429743862e-06, "loss": 0.3692671298980713, "memory(GiB)": 36.59, "step": 700, "token_acc": 0.8780482002236338, "train_speed(iter/s)": 0.118437 }, { "epoch": 0.8600168931889733, "eval_loss": 0.35998860001564026, "eval_runtime": 31.0268, "eval_samples_per_second": 16.953, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8866441850410722, "step": 700 }, { "epoch": 0.866159870997466, "grad_norm": 0.7322993278503418, "learning_rate": 8.076276948625495e-06, "loss": 0.36251187324523926, "memory(GiB)": 36.59, "step": 705, "token_acc": 0.8850018575958389, "train_speed(iter/s)": 0.117844 }, { "epoch": 0.8723028488059587, "grad_norm": 0.7000852823257446, "learning_rate": 8.050827870618795e-06, "loss": 0.352423095703125, "memory(GiB)": 36.59, "step": 710, "token_acc": 0.8848280386093149, "train_speed(iter/s)": 0.118064 }, { "epoch": 0.8784458266144514, "grad_norm": 0.7393072843551636, "learning_rate": 8.02525225129295e-06, "loss": 0.3464043140411377, "memory(GiB)": 36.59, "step": 715, "token_acc": 0.8842617899915519, "train_speed(iter/s)": 0.118282 }, { "epoch": 0.8845888044229441, "grad_norm": 0.676538348197937, "learning_rate": 7.999551151465793e-06, "loss": 0.3531349658966064, "memory(GiB)": 36.59, "step": 720, "token_acc": 0.882141211070386, "train_speed(iter/s)": 0.118479 }, { "epoch": 0.8845888044229441, "eval_loss": 0.3599785268306732, "eval_runtime": 31.1371, "eval_samples_per_second": 16.893, "eval_steps_per_second": 4.239, "eval_token_acc": 0.8867168179853004, "step": 720 }, { "epoch": 0.8907317822314367, "grad_norm": 0.6606590151786804, "learning_rate": 7.973725637159795e-06, "loss": 0.3510305881500244, "memory(GiB)": 36.59, "step": 725, "token_acc": 0.8858640888051448, "train_speed(iter/s)": 0.117866 }, { "epoch": 0.8968747600399294, "grad_norm": 0.6910014748573303, "learning_rate": 7.947776779557862e-06, "loss": 0.34729857444763185, "memory(GiB)": 36.59, "step": 730, "token_acc": 0.8849401138817985, "train_speed(iter/s)": 0.11805 }, { "epoch": 0.903017737848422, "grad_norm": 0.715715765953064, "learning_rate": 7.921705654958886e-06, "loss": 0.37070040702819823, "memory(GiB)": 36.59, "step": 735, "token_acc": 0.873466112894091, "train_speed(iter/s)": 0.118238 }, { "epoch": 0.9091607156569147, "grad_norm": 0.6847560405731201, "learning_rate": 7.895513344733124e-06, "loss": 0.3388267993927002, "memory(GiB)": 36.59, "step": 740, "token_acc": 0.892940483205657, "train_speed(iter/s)": 0.118418 }, { "epoch": 0.9091607156569147, "eval_loss": 0.35883787274360657, "eval_runtime": 31.0744, "eval_samples_per_second": 16.927, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8870696065715521, "step": 740 }, { "epoch": 0.9153036934654074, "grad_norm": 0.7038071155548096, "learning_rate": 7.869200935277317e-06, "loss": 0.3523221015930176, "memory(GiB)": 36.59, "step": 745, "token_acc": 0.8841770158578834, "train_speed(iter/s)": 0.117874 }, { "epoch": 0.9214466712739, "grad_norm": 0.7095304727554321, "learning_rate": 7.842769517969665e-06, "loss": 0.34724674224853513, "memory(GiB)": 36.59, "step": 750, "token_acc": 0.8921830597616321, "train_speed(iter/s)": 0.118073 }, { "epoch": 0.9275896490823927, "grad_norm": 0.7056006789207458, "learning_rate": 7.816220189124527e-06, "loss": 0.34354069232940676, "memory(GiB)": 36.59, "step": 755, "token_acc": 0.8906672115144498, "train_speed(iter/s)": 0.118273 }, { "epoch": 0.9337326268908853, "grad_norm": 0.6470732092857361, "learning_rate": 7.789554049946966e-06, "loss": 0.37253437042236326, "memory(GiB)": 36.59, "step": 760, "token_acc": 0.8801472977363803, "train_speed(iter/s)": 0.118474 }, { "epoch": 0.9337326268908853, "eval_loss": 0.3579709231853485, "eval_runtime": 31.0126, "eval_samples_per_second": 16.961, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8876368352788586, "step": 760 }, { "epoch": 0.939875604699378, "grad_norm": 0.671111524105072, "learning_rate": 7.762772206487066e-06, "loss": 0.3589931011199951, "memory(GiB)": 36.59, "step": 765, "token_acc": 0.8832086813686086, "train_speed(iter/s)": 0.117907 }, { "epoch": 0.9460185825078707, "grad_norm": 0.7187632322311401, "learning_rate": 7.735875769594063e-06, "loss": 0.34763507843017577, "memory(GiB)": 36.59, "step": 770, "token_acc": 0.8847997559593846, "train_speed(iter/s)": 0.118076 }, { "epoch": 0.9521615603163633, "grad_norm": 0.7212729454040527, "learning_rate": 7.70886585487026e-06, "loss": 0.3598261833190918, "memory(GiB)": 36.59, "step": 775, "token_acc": 0.8688440332679189, "train_speed(iter/s)": 0.118251 }, { "epoch": 0.958304538124856, "grad_norm": 0.6621137261390686, "learning_rate": 7.681743582624761e-06, "loss": 0.35757567882537844, "memory(GiB)": 36.59, "step": 780, "token_acc": 0.8785007468259896, "train_speed(iter/s)": 0.118454 }, { "epoch": 0.958304538124856, "eval_loss": 0.3576439321041107, "eval_runtime": 31.075, "eval_samples_per_second": 16.927, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8877751837440553, "step": 780 }, { "epoch": 0.9644475159333487, "grad_norm": 0.7074070572853088, "learning_rate": 7.654510077827003e-06, "loss": 0.3493576765060425, "memory(GiB)": 36.59, "step": 785, "token_acc": 0.8852768310495931, "train_speed(iter/s)": 0.117922 }, { "epoch": 0.9705904937418414, "grad_norm": 0.6370189189910889, "learning_rate": 7.627166470060092e-06, "loss": 0.3448970317840576, "memory(GiB)": 36.59, "step": 790, "token_acc": 0.8896250845717751, "train_speed(iter/s)": 0.118138 }, { "epoch": 0.9767334715503341, "grad_norm": 0.6875202655792236, "learning_rate": 7.59971389347395e-06, "loss": 0.36741271018981936, "memory(GiB)": 36.59, "step": 795, "token_acc": 0.880575873679322, "train_speed(iter/s)": 0.118312 }, { "epoch": 0.9828764493588267, "grad_norm": 0.7139670848846436, "learning_rate": 7.572153486738281e-06, "loss": 0.3554513692855835, "memory(GiB)": 36.59, "step": 800, "token_acc": 0.8777580460748777, "train_speed(iter/s)": 0.118491 }, { "epoch": 0.9828764493588267, "eval_loss": 0.3568785786628723, "eval_runtime": 31.0006, "eval_samples_per_second": 16.967, "eval_steps_per_second": 4.258, "eval_token_acc": 0.8877959360138349, "step": 800 }, { "epoch": 0.9890194271673194, "grad_norm": 0.7183944582939148, "learning_rate": 7.544486392995325e-06, "loss": 0.3408940076828003, "memory(GiB)": 36.59, "step": 805, "token_acc": 0.8823203099663748, "train_speed(iter/s)": 0.117937 }, { "epoch": 0.995162404975812, "grad_norm": 0.7064708471298218, "learning_rate": 7.516713759812465e-06, "loss": 0.3436570167541504, "memory(GiB)": 36.59, "step": 810, "token_acc": 0.8865785782162089, "train_speed(iter/s)": 0.118112 }, { "epoch": 1.002457191123397, "grad_norm": 0.7077184915542603, "learning_rate": 7.4888367391346085e-06, "loss": 0.40673046112060546, "memory(GiB)": 36.59, "step": 815, "token_acc": 0.8932987364620939, "train_speed(iter/s)": 0.11823 }, { "epoch": 1.0086001689318898, "grad_norm": 0.6631501317024231, "learning_rate": 7.460856487236421e-06, "loss": 0.32202835083007814, "memory(GiB)": 36.59, "step": 820, "token_acc": 0.8988542163968578, "train_speed(iter/s)": 0.118434 }, { "epoch": 1.0086001689318898, "eval_loss": 0.3615255355834961, "eval_runtime": 31.0114, "eval_samples_per_second": 16.961, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8877959360138349, "step": 820 }, { "epoch": 1.0147431467403825, "grad_norm": 0.657802939414978, "learning_rate": 7.432774164674359e-06, "loss": 0.2976385116577148, "memory(GiB)": 36.59, "step": 825, "token_acc": 0.8940141675474071, "train_speed(iter/s)": 0.117911 }, { "epoch": 1.0208861245488752, "grad_norm": 0.675591766834259, "learning_rate": 7.404590936238535e-06, "loss": 0.311181640625, "memory(GiB)": 36.59, "step": 830, "token_acc": 0.8997530755324309, "train_speed(iter/s)": 0.118145 }, { "epoch": 1.0270291023573677, "grad_norm": 0.6661099791526794, "learning_rate": 7.376307970904408e-06, "loss": 0.3044283866882324, "memory(GiB)": 36.59, "step": 835, "token_acc": 0.8999576197242789, "train_speed(iter/s)": 0.118312 }, { "epoch": 1.0331720801658604, "grad_norm": 0.6595695614814758, "learning_rate": 7.34792644178429e-06, "loss": 0.3037309408187866, "memory(GiB)": 36.59, "step": 840, "token_acc": 0.9055141287284144, "train_speed(iter/s)": 0.118457 }, { "epoch": 1.0331720801658604, "eval_loss": 0.35968878865242004, "eval_runtime": 31.0067, "eval_samples_per_second": 16.964, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8877025507998271, "step": 840 }, { "epoch": 1.039315057974353, "grad_norm": 0.7282394170761108, "learning_rate": 7.319447526078696e-06, "loss": 0.3085323333740234, "memory(GiB)": 36.59, "step": 845, "token_acc": 0.8898290405833752, "train_speed(iter/s)": 0.118005 }, { "epoch": 1.0454580357828458, "grad_norm": 0.6701980233192444, "learning_rate": 7.290872405027508e-06, "loss": 0.29195051193237304, "memory(GiB)": 36.59, "step": 850, "token_acc": 0.9044647710888937, "train_speed(iter/s)": 0.118164 }, { "epoch": 1.0516010135913385, "grad_norm": 0.6651575565338135, "learning_rate": 7.262202263860989e-06, "loss": 0.30650150775909424, "memory(GiB)": 36.59, "step": 855, "token_acc": 0.8993040861428504, "train_speed(iter/s)": 0.118324 }, { "epoch": 1.057743991399831, "grad_norm": 0.682246744632721, "learning_rate": 7.233438291750615e-06, "loss": 0.3102306842803955, "memory(GiB)": 36.59, "step": 860, "token_acc": 0.9063239097279017, "train_speed(iter/s)": 0.11848 }, { "epoch": 1.057743991399831, "eval_loss": 0.35930460691452026, "eval_runtime": 31.0087, "eval_samples_per_second": 16.963, "eval_steps_per_second": 4.257, "eval_token_acc": 0.887875486381323, "step": 860 }, { "epoch": 1.0638869692083237, "grad_norm": 0.7295219898223877, "learning_rate": 7.204581681759752e-06, "loss": 0.30730266571044923, "memory(GiB)": 36.59, "step": 865, "token_acc": 0.8905181851880587, "train_speed(iter/s)": 0.117999 }, { "epoch": 1.0700299470168164, "grad_norm": 0.6892926096916199, "learning_rate": 7.175633630794176e-06, "loss": 0.2974876403808594, "memory(GiB)": 36.59, "step": 870, "token_acc": 0.9006297483247798, "train_speed(iter/s)": 0.118168 }, { "epoch": 1.0761729248253091, "grad_norm": 0.6752432584762573, "learning_rate": 7.146595339552423e-06, "loss": 0.3102593421936035, "memory(GiB)": 36.59, "step": 875, "token_acc": 0.9038279095421953, "train_speed(iter/s)": 0.118364 }, { "epoch": 1.0823159026338018, "grad_norm": 0.674329400062561, "learning_rate": 7.1174680124759856e-06, "loss": 0.28625760078430174, "memory(GiB)": 36.59, "step": 880, "token_acc": 0.9079884290164664, "train_speed(iter/s)": 0.118523 }, { "epoch": 1.0823159026338018, "eval_loss": 0.36010968685150146, "eval_runtime": 31.0269, "eval_samples_per_second": 16.953, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8876299178555987, "step": 880 }, { "epoch": 1.0884588804422943, "grad_norm": 0.6883670091629028, "learning_rate": 7.08825285769936e-06, "loss": 0.3032073020935059, "memory(GiB)": 36.59, "step": 885, "token_acc": 0.8932736033602344, "train_speed(iter/s)": 0.118061 }, { "epoch": 1.094601858250787, "grad_norm": 0.671500027179718, "learning_rate": 7.058951086999934e-06, "loss": 0.3017904758453369, "memory(GiB)": 36.59, "step": 890, "token_acc": 0.9018632618216911, "train_speed(iter/s)": 0.118196 }, { "epoch": 1.1007448360592798, "grad_norm": 0.7209696173667908, "learning_rate": 7.029563915747723e-06, "loss": 0.31074273586273193, "memory(GiB)": 36.59, "step": 895, "token_acc": 0.898548356982823, "train_speed(iter/s)": 0.118358 }, { "epoch": 1.1068878138677725, "grad_norm": 0.624523937702179, "learning_rate": 7.0000925628549595e-06, "loss": 0.2956224918365479, "memory(GiB)": 36.59, "step": 900, "token_acc": 0.9076877474540027, "train_speed(iter/s)": 0.118515 }, { "epoch": 1.1068878138677725, "eval_loss": 0.3587914705276489, "eval_runtime": 31.0433, "eval_samples_per_second": 16.944, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8878443579766537, "step": 900 }, { "epoch": 1.1130307916762652, "grad_norm": 0.7052697539329529, "learning_rate": 6.9705382507255405e-06, "loss": 0.2872809648513794, "memory(GiB)": 36.59, "step": 905, "token_acc": 0.8926222488296873, "train_speed(iter/s)": 0.118076 }, { "epoch": 1.1191737694847577, "grad_norm": 0.7123196125030518, "learning_rate": 6.940902205204321e-06, "loss": 0.2964935302734375, "memory(GiB)": 36.59, "step": 910, "token_acc": 0.9039498517120518, "train_speed(iter/s)": 0.118226 }, { "epoch": 1.1253167472932504, "grad_norm": 0.660994291305542, "learning_rate": 6.911185655526263e-06, "loss": 0.302768611907959, "memory(GiB)": 36.59, "step": 915, "token_acc": 0.9020544461398969, "train_speed(iter/s)": 0.118393 }, { "epoch": 1.131459725101743, "grad_norm": 0.7210450768470764, "learning_rate": 6.881389834265463e-06, "loss": 0.3173034429550171, "memory(GiB)": 36.59, "step": 920, "token_acc": 0.8982849864950921, "train_speed(iter/s)": 0.118553 }, { "epoch": 1.131459725101743, "eval_loss": 0.3588680624961853, "eval_runtime": 31.0113, "eval_samples_per_second": 16.962, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8877405966277562, "step": 920 }, { "epoch": 1.1376027029102358, "grad_norm": 0.6697967648506165, "learning_rate": 6.851515977284014e-06, "loss": 0.299291205406189, "memory(GiB)": 36.59, "step": 925, "token_acc": 0.8902243928864662, "train_speed(iter/s)": 0.118081 }, { "epoch": 1.1437456807187285, "grad_norm": 0.7066377401351929, "learning_rate": 6.821565323680759e-06, "loss": 0.29554860591888427, "memory(GiB)": 36.59, "step": 930, "token_acc": 0.9000831485587583, "train_speed(iter/s)": 0.118223 }, { "epoch": 1.149888658527221, "grad_norm": 0.6386650204658508, "learning_rate": 6.791539115739879e-06, "loss": 0.3022310256958008, "memory(GiB)": 36.59, "step": 935, "token_acc": 0.8924001814882032, "train_speed(iter/s)": 0.118412 }, { "epoch": 1.1560316363357137, "grad_norm": 0.6704084873199463, "learning_rate": 6.761438598879383e-06, "loss": 0.28601846694946287, "memory(GiB)": 36.59, "step": 940, "token_acc": 0.9012753677155092, "train_speed(iter/s)": 0.118547 }, { "epoch": 1.1560316363357137, "eval_loss": 0.35880643129348755, "eval_runtime": 31.0179, "eval_samples_per_second": 16.958, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8880657155209685, "step": 940 }, { "epoch": 1.1621746141442064, "grad_norm": 0.6651415228843689, "learning_rate": 6.731265021599437e-06, "loss": 0.3218855381011963, "memory(GiB)": 36.59, "step": 945, "token_acc": 0.8918404969109147, "train_speed(iter/s)": 0.118102 }, { "epoch": 1.1683175919526991, "grad_norm": 0.6738328337669373, "learning_rate": 6.7010196354305876e-06, "loss": 0.30361137390136717, "memory(GiB)": 36.59, "step": 950, "token_acc": 0.9092978421945045, "train_speed(iter/s)": 0.118249 }, { "epoch": 1.1744605697611918, "grad_norm": 0.6776899099349976, "learning_rate": 6.670703694881851e-06, "loss": 0.29663915634155275, "memory(GiB)": 36.59, "step": 955, "token_acc": 0.8984023842094978, "train_speed(iter/s)": 0.118405 }, { "epoch": 1.1806035475696843, "grad_norm": 0.6939485669136047, "learning_rate": 6.640318457388672e-06, "loss": 0.3056649684906006, "memory(GiB)": 36.59, "step": 960, "token_acc": 0.8867154116418194, "train_speed(iter/s)": 0.118549 }, { "epoch": 1.1806035475696843, "eval_loss": 0.35902491211891174, "eval_runtime": 31.0009, "eval_samples_per_second": 16.967, "eval_steps_per_second": 4.258, "eval_token_acc": 0.8878408992650237, "step": 960 }, { "epoch": 1.186746525378177, "grad_norm": 0.7092224359512329, "learning_rate": 6.609865183260777e-06, "loss": 0.2987541198730469, "memory(GiB)": 36.59, "step": 965, "token_acc": 0.8890386576114193, "train_speed(iter/s)": 0.118089 }, { "epoch": 1.1928895031866698, "grad_norm": 0.7263514399528503, "learning_rate": 6.579345135629896e-06, "loss": 0.28489587306976316, "memory(GiB)": 36.59, "step": 970, "token_acc": 0.8956198679571216, "train_speed(iter/s)": 0.118237 }, { "epoch": 1.1990324809951625, "grad_norm": 0.6999565362930298, "learning_rate": 6.548759580397363e-06, "loss": 0.30396156311035155, "memory(GiB)": 36.59, "step": 975, "token_acc": 0.8999096083844331, "train_speed(iter/s)": 0.118377 }, { "epoch": 1.2051754588036552, "grad_norm": 0.6386498212814331, "learning_rate": 6.518109786181628e-06, "loss": 0.32303242683410643, "memory(GiB)": 36.59, "step": 980, "token_acc": 0.8918318331799511, "train_speed(iter/s)": 0.11851 }, { "epoch": 1.2051754588036552, "eval_loss": 0.3577713966369629, "eval_runtime": 30.9949, "eval_samples_per_second": 16.971, "eval_steps_per_second": 4.259, "eval_token_acc": 0.8879481193255512, "step": 980 }, { "epoch": 1.2113184366121477, "grad_norm": 0.6696978807449341, "learning_rate": 6.487397024265616e-06, "loss": 0.29286723136901854, "memory(GiB)": 36.59, "step": 985, "token_acc": 0.8883067219587296, "train_speed(iter/s)": 0.11806 }, { "epoch": 1.2174614144206404, "grad_norm": 0.6677629947662354, "learning_rate": 6.456622568544012e-06, "loss": 0.295971155166626, "memory(GiB)": 36.59, "step": 990, "token_acc": 0.901066495199663, "train_speed(iter/s)": 0.118215 }, { "epoch": 1.223604392229133, "grad_norm": 0.6924172639846802, "learning_rate": 6.425787695470419e-06, "loss": 0.2936640024185181, "memory(GiB)": 36.59, "step": 995, "token_acc": 0.8968813591405991, "train_speed(iter/s)": 0.118377 }, { "epoch": 1.2297473700376258, "grad_norm": 0.6816849112510681, "learning_rate": 6.3948936840044096e-06, "loss": 0.29815101623535156, "memory(GiB)": 36.59, "step": 1000, "token_acc": 0.9113140380746014, "train_speed(iter/s)": 0.118511 }, { "epoch": 1.2297473700376258, "eval_loss": 0.35851019620895386, "eval_runtime": 31.0715, "eval_samples_per_second": 16.929, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8881279723303069, "step": 1000 }, { "epoch": 1.2358903478461185, "grad_norm": 0.7491683959960938, "learning_rate": 6.363941815558484e-06, "loss": 0.305048394203186, "memory(GiB)": 36.59, "step": 1005, "token_acc": 0.8883380321029248, "train_speed(iter/s)": 0.118078 }, { "epoch": 1.242033325654611, "grad_norm": 0.6767114400863647, "learning_rate": 6.332933373944914e-06, "loss": 0.2910877466201782, "memory(GiB)": 36.59, "step": 1010, "token_acc": 0.8970752230332523, "train_speed(iter/s)": 0.118198 }, { "epoch": 1.2481763034631037, "grad_norm": 0.6579700112342834, "learning_rate": 6.301869645322498e-06, "loss": 0.2989434480667114, "memory(GiB)": 36.59, "step": 1015, "token_acc": 0.9020202767705173, "train_speed(iter/s)": 0.118352 }, { "epoch": 1.2543192812715964, "grad_norm": 0.7496470808982849, "learning_rate": 6.270751918143213e-06, "loss": 0.3161623477935791, "memory(GiB)": 36.59, "step": 1020, "token_acc": 0.8931434478006202, "train_speed(iter/s)": 0.118501 }, { "epoch": 1.2543192812715964, "eval_loss": 0.3574770390987396, "eval_runtime": 31.0423, "eval_samples_per_second": 16.945, "eval_steps_per_second": 4.252, "eval_token_acc": 0.888463467358409, "step": 1020 }, { "epoch": 1.2604622590800891, "grad_norm": 0.6567991971969604, "learning_rate": 6.239581483098767e-06, "loss": 0.2918637752532959, "memory(GiB)": 36.59, "step": 1025, "token_acc": 0.8930598715558318, "train_speed(iter/s)": 0.118037 }, { "epoch": 1.2666052368885818, "grad_norm": 0.7520761489868164, "learning_rate": 6.208359633067077e-06, "loss": 0.2961498022079468, "memory(GiB)": 36.59, "step": 1030, "token_acc": 0.9095238095238095, "train_speed(iter/s)": 0.118175 }, { "epoch": 1.2727482146970743, "grad_norm": 0.7256974577903748, "learning_rate": 6.177087663058626e-06, "loss": 0.30830044746398927, "memory(GiB)": 36.59, "step": 1035, "token_acc": 0.9017879399034648, "train_speed(iter/s)": 0.118311 }, { "epoch": 1.278891192505567, "grad_norm": 0.6479539275169373, "learning_rate": 6.145766870162767e-06, "loss": 0.2862563610076904, "memory(GiB)": 36.59, "step": 1040, "token_acc": 0.9018611343172747, "train_speed(iter/s)": 0.118441 }, { "epoch": 1.278891192505567, "eval_loss": 0.3572877049446106, "eval_runtime": 31.0406, "eval_samples_per_second": 16.946, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8883147427583226, "step": 1040 }, { "epoch": 1.2850341703140598, "grad_norm": 0.7319021224975586, "learning_rate": 6.114398553493909e-06, "loss": 0.3000927925109863, "memory(GiB)": 36.59, "step": 1045, "token_acc": 0.8926547069479344, "train_speed(iter/s)": 0.118 }, { "epoch": 1.2911771481225525, "grad_norm": 0.705988883972168, "learning_rate": 6.0829840141376385e-06, "loss": 0.30697922706604003, "memory(GiB)": 36.59, "step": 1050, "token_acc": 0.901831032683459, "train_speed(iter/s)": 0.118157 }, { "epoch": 1.2973201259310452, "grad_norm": 0.64214026927948, "learning_rate": 6.051524555096754e-06, "loss": 0.30261845588684083, "memory(GiB)": 36.59, "step": 1055, "token_acc": 0.902963066984974, "train_speed(iter/s)": 0.118309 }, { "epoch": 1.3034631037395377, "grad_norm": 0.7394285798072815, "learning_rate": 6.020021481237216e-06, "loss": 0.30278654098510743, "memory(GiB)": 36.59, "step": 1060, "token_acc": 0.9020456426628828, "train_speed(iter/s)": 0.118449 }, { "epoch": 1.3034631037395377, "eval_loss": 0.35663846135139465, "eval_runtime": 31.0016, "eval_samples_per_second": 16.967, "eval_steps_per_second": 4.258, "eval_token_acc": 0.8883424124513619, "step": 1060 }, { "epoch": 1.3096060815480304, "grad_norm": 0.6863911151885986, "learning_rate": 5.988476099234033e-06, "loss": 0.2937177658081055, "memory(GiB)": 36.59, "step": 1065, "token_acc": 0.8901542316498898, "train_speed(iter/s)": 0.1181 }, { "epoch": 1.315749059356523, "grad_norm": 0.654614269733429, "learning_rate": 5.956889717517053e-06, "loss": 0.3110340595245361, "memory(GiB)": 36.59, "step": 1070, "token_acc": 0.9028094153378892, "train_speed(iter/s)": 0.118212 }, { "epoch": 1.3218920371650158, "grad_norm": 0.7234563827514648, "learning_rate": 5.925263646216697e-06, "loss": 0.31188764572143557, "memory(GiB)": 36.59, "step": 1075, "token_acc": 0.9096784327805578, "train_speed(iter/s)": 0.118351 }, { "epoch": 1.3280350149735085, "grad_norm": 0.6865576505661011, "learning_rate": 5.893599197109625e-06, "loss": 0.302515435218811, "memory(GiB)": 36.59, "step": 1080, "token_acc": 0.8899835796387521, "train_speed(iter/s)": 0.118487 }, { "epoch": 1.3280350149735085, "eval_loss": 0.35516050457954407, "eval_runtime": 30.9944, "eval_samples_per_second": 16.971, "eval_steps_per_second": 4.259, "eval_token_acc": 0.8885637699956767, "step": 1080 }, { "epoch": 1.334177992782001, "grad_norm": 0.6132445335388184, "learning_rate": 5.861897683564313e-06, "loss": 0.3079413414001465, "memory(GiB)": 36.59, "step": 1085, "token_acc": 0.8899461794132038, "train_speed(iter/s)": 0.118068 }, { "epoch": 1.3403209705904937, "grad_norm": 0.7110121250152588, "learning_rate": 5.830160420486588e-06, "loss": 0.29248368740081787, "memory(GiB)": 36.59, "step": 1090, "token_acc": 0.905348378514747, "train_speed(iter/s)": 0.118225 }, { "epoch": 1.3464639483989864, "grad_norm": 0.6436595916748047, "learning_rate": 5.798388724265085e-06, "loss": 0.3002151966094971, "memory(GiB)": 39.06, "step": 1095, "token_acc": 0.9053737339917971, "train_speed(iter/s)": 0.118367 }, { "epoch": 1.3526069262074791, "grad_norm": 0.7013940215110779, "learning_rate": 5.7665839127166475e-06, "loss": 0.3010303020477295, "memory(GiB)": 39.06, "step": 1100, "token_acc": 0.9023475037752253, "train_speed(iter/s)": 0.118479 }, { "epoch": 1.3526069262074791, "eval_loss": 0.3555811047554016, "eval_runtime": 31.0333, "eval_samples_per_second": 16.95, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8887712926934717, "step": 1100 }, { "epoch": 1.3587499040159718, "grad_norm": 0.7001612186431885, "learning_rate": 5.734747305031664e-06, "loss": 0.3120265483856201, "memory(GiB)": 39.06, "step": 1105, "token_acc": 0.8886269689596821, "train_speed(iter/s)": 0.118091 }, { "epoch": 1.3648928818244643, "grad_norm": 0.6804000735282898, "learning_rate": 5.7028802217193565e-06, "loss": 0.30517282485961916, "memory(GiB)": 39.06, "step": 1110, "token_acc": 0.8981199555362235, "train_speed(iter/s)": 0.118215 }, { "epoch": 1.371035859632957, "grad_norm": 0.6867697834968567, "learning_rate": 5.670983984553003e-06, "loss": 0.3074041366577148, "memory(GiB)": 39.06, "step": 1115, "token_acc": 0.903482807952247, "train_speed(iter/s)": 0.118338 }, { "epoch": 1.3771788374414498, "grad_norm": 0.7690563201904297, "learning_rate": 5.63905991651512e-06, "loss": 0.3027225971221924, "memory(GiB)": 39.06, "step": 1120, "token_acc": 0.8987023004673533, "train_speed(iter/s)": 0.118449 }, { "epoch": 1.3771788374414498, "eval_loss": 0.3556562066078186, "eval_runtime": 31.1119, "eval_samples_per_second": 16.907, "eval_steps_per_second": 4.243, "eval_token_acc": 0.8888612191958496, "step": 1120 }, { "epoch": 1.3833218152499425, "grad_norm": 0.6769737005233765, "learning_rate": 5.607109341742579e-06, "loss": 0.30417637825012206, "memory(GiB)": 39.06, "step": 1125, "token_acc": 0.8885960318346111, "train_speed(iter/s)": 0.118061 }, { "epoch": 1.3894647930584352, "grad_norm": 0.6724239587783813, "learning_rate": 5.575133585471697e-06, "loss": 0.31278433799743655, "memory(GiB)": 39.06, "step": 1130, "token_acc": 0.8959036584253262, "train_speed(iter/s)": 0.118168 }, { "epoch": 1.3956077708669277, "grad_norm": 0.7643016576766968, "learning_rate": 5.543133973983254e-06, "loss": 0.29112992286682127, "memory(GiB)": 39.06, "step": 1135, "token_acc": 0.9014400645633149, "train_speed(iter/s)": 0.118301 }, { "epoch": 1.4017507486754204, "grad_norm": 0.6788151264190674, "learning_rate": 5.511111834547496e-06, "loss": 0.3165508508682251, "memory(GiB)": 39.06, "step": 1140, "token_acc": 0.903283467750516, "train_speed(iter/s)": 0.118415 }, { "epoch": 1.4017507486754204, "eval_loss": 0.35399720072746277, "eval_runtime": 31.0476, "eval_samples_per_second": 16.942, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8891137051448336, "step": 1140 }, { "epoch": 1.407893726483913, "grad_norm": 0.6638893485069275, "learning_rate": 5.479068495369071e-06, "loss": 0.2801161289215088, "memory(GiB)": 39.06, "step": 1145, "token_acc": 0.8925869894099848, "train_speed(iter/s)": 0.118025 }, { "epoch": 1.4140367042924058, "grad_norm": 0.7107008099555969, "learning_rate": 5.447005285531948e-06, "loss": 0.29520745277404786, "memory(GiB)": 39.06, "step": 1150, "token_acc": 0.9020094269412057, "train_speed(iter/s)": 0.118132 }, { "epoch": 1.4201796821008985, "grad_norm": 0.6262108087539673, "learning_rate": 5.414923534944283e-06, "loss": 0.28986170291900637, "memory(GiB)": 39.06, "step": 1155, "token_acc": 0.9047965292421047, "train_speed(iter/s)": 0.11825 }, { "epoch": 1.426322659909391, "grad_norm": 0.7209280729293823, "learning_rate": 5.38282457428326e-06, "loss": 0.30995869636535645, "memory(GiB)": 39.06, "step": 1160, "token_acc": 0.9020344876192267, "train_speed(iter/s)": 0.118366 }, { "epoch": 1.426322659909391, "eval_loss": 0.3549746870994568, "eval_runtime": 31.0688, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8894630350194552, "step": 1160 }, { "epoch": 1.4324656377178837, "grad_norm": 0.6941882371902466, "learning_rate": 5.350709734939898e-06, "loss": 0.313739013671875, "memory(GiB)": 39.06, "step": 1165, "token_acc": 0.889407067409571, "train_speed(iter/s)": 0.117998 }, { "epoch": 1.4386086155263764, "grad_norm": 0.6950980424880981, "learning_rate": 5.318580348963826e-06, "loss": 0.29497203826904295, "memory(GiB)": 39.06, "step": 1170, "token_acc": 0.9058259992665934, "train_speed(iter/s)": 0.118116 }, { "epoch": 1.4447515933348691, "grad_norm": 0.6526186466217041, "learning_rate": 5.286437749008031e-06, "loss": 0.29609017372131347, "memory(GiB)": 39.06, "step": 1175, "token_acc": 0.9071177290528133, "train_speed(iter/s)": 0.11824 }, { "epoch": 1.4508945711433618, "grad_norm": 0.6585668921470642, "learning_rate": 5.2542832682735956e-06, "loss": 0.2915393590927124, "memory(GiB)": 39.06, "step": 1180, "token_acc": 0.8964739593006288, "train_speed(iter/s)": 0.118376 }, { "epoch": 1.4508945711433618, "eval_loss": 0.35392019152641296, "eval_runtime": 31.07, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.248, "eval_token_acc": 0.889134457414613, "step": 1180 }, { "epoch": 1.4570375489518543, "grad_norm": 0.680291473865509, "learning_rate": 5.222118240454376e-06, "loss": 0.3221513509750366, "memory(GiB)": 39.06, "step": 1185, "token_acc": 0.8858566297847655, "train_speed(iter/s)": 0.117989 }, { "epoch": 1.463180526760347, "grad_norm": 0.676287055015564, "learning_rate": 5.18994399968171e-06, "loss": 0.303191614151001, "memory(GiB)": 39.06, "step": 1190, "token_acc": 0.8928310930499115, "train_speed(iter/s)": 0.118095 }, { "epoch": 1.4693235045688398, "grad_norm": 0.7134848237037659, "learning_rate": 5.157761880469058e-06, "loss": 0.30745644569396974, "memory(GiB)": 39.06, "step": 1195, "token_acc": 0.8987542686739455, "train_speed(iter/s)": 0.118213 }, { "epoch": 1.4754664823773325, "grad_norm": 0.706149160861969, "learning_rate": 5.125573217656664e-06, "loss": 0.3102452278137207, "memory(GiB)": 39.06, "step": 1200, "token_acc": 0.9014028524666823, "train_speed(iter/s)": 0.118318 }, { "epoch": 1.4754664823773325, "eval_loss": 0.35402196645736694, "eval_runtime": 31.0702, "eval_samples_per_second": 16.929, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8895391266753134, "step": 1200 }, { "epoch": 1.4816094601858252, "grad_norm": 0.7066270112991333, "learning_rate": 5.0933793463561855e-06, "loss": 0.3033695936203003, "memory(GiB)": 39.06, "step": 1205, "token_acc": 0.8896138651714031, "train_speed(iter/s)": 0.117945 }, { "epoch": 1.4877524379943177, "grad_norm": 0.6695776581764221, "learning_rate": 5.061181601895317e-06, "loss": 0.30724053382873534, "memory(GiB)": 39.06, "step": 1210, "token_acc": 0.9012793441808471, "train_speed(iter/s)": 0.118065 }, { "epoch": 1.4938954158028104, "grad_norm": 0.7692334651947021, "learning_rate": 5.028981319762399e-06, "loss": 0.28596570491790774, "memory(GiB)": 39.06, "step": 1215, "token_acc": 0.8964816040858792, "train_speed(iter/s)": 0.118187 }, { "epoch": 1.500038393611303, "grad_norm": 0.6707490086555481, "learning_rate": 4.996779835551035e-06, "loss": 0.2939592838287354, "memory(GiB)": 39.06, "step": 1220, "token_acc": 0.8994356329668192, "train_speed(iter/s)": 0.118298 }, { "epoch": 1.500038393611303, "eval_loss": 0.35305002331733704, "eval_runtime": 31.066, "eval_samples_per_second": 16.932, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8896774751405102, "step": 1220 }, { "epoch": 1.5061813714197958, "grad_norm": 0.7542144656181335, "learning_rate": 4.964578484904679e-06, "loss": 0.30585541725158694, "memory(GiB)": 39.06, "step": 1225, "token_acc": 0.8881905335110271, "train_speed(iter/s)": 0.117949 }, { "epoch": 1.5123243492282885, "grad_norm": 0.6754580140113831, "learning_rate": 4.932378603461253e-06, "loss": 0.2997127056121826, "memory(GiB)": 39.06, "step": 1230, "token_acc": 0.9038497785317123, "train_speed(iter/s)": 0.118065 }, { "epoch": 1.518467327036781, "grad_norm": 0.7103241682052612, "learning_rate": 4.900181526797737e-06, "loss": 0.29804291725158694, "memory(GiB)": 39.06, "step": 1235, "token_acc": 0.8995869901910171, "train_speed(iter/s)": 0.118167 }, { "epoch": 1.5246103048452737, "grad_norm": 0.6416381001472473, "learning_rate": 4.867988590374777e-06, "loss": 0.2915628433227539, "memory(GiB)": 39.06, "step": 1240, "token_acc": 0.8995757044689388, "train_speed(iter/s)": 0.118299 }, { "epoch": 1.5246103048452737, "eval_loss": 0.35335448384284973, "eval_runtime": 31.1015, "eval_samples_per_second": 16.912, "eval_steps_per_second": 4.244, "eval_token_acc": 0.8896774751405102, "step": 1240 }, { "epoch": 1.5307532826537664, "grad_norm": 0.7514793872833252, "learning_rate": 4.835801129481287e-06, "loss": 0.305086350440979, "memory(GiB)": 39.06, "step": 1245, "token_acc": 0.8938343509704211, "train_speed(iter/s)": 0.117954 }, { "epoch": 1.5368962604622591, "grad_norm": 0.712042510509491, "learning_rate": 4.803620479179071e-06, "loss": 0.30651469230651857, "memory(GiB)": 39.06, "step": 1250, "token_acc": 0.9019437191760952, "train_speed(iter/s)": 0.118064 }, { "epoch": 1.5430392382707518, "grad_norm": 0.6950103640556335, "learning_rate": 4.771447974247449e-06, "loss": 0.29916160106658934, "memory(GiB)": 39.06, "step": 1255, "token_acc": 0.8986829014071162, "train_speed(iter/s)": 0.118206 }, { "epoch": 1.5491822160792443, "grad_norm": 0.702800452709198, "learning_rate": 4.7392849491278825e-06, "loss": 0.3027307987213135, "memory(GiB)": 39.06, "step": 1260, "token_acc": 0.8973517128165512, "train_speed(iter/s)": 0.118315 }, { "epoch": 1.5491822160792443, "eval_loss": 0.35245779156684875, "eval_runtime": 31.0495, "eval_samples_per_second": 16.941, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8897846952010376, "step": 1260 }, { "epoch": 1.555325193887737, "grad_norm": 0.6939496397972107, "learning_rate": 4.707132737868639e-06, "loss": 0.30812973976135255, "memory(GiB)": 39.06, "step": 1265, "token_acc": 0.8929094774646575, "train_speed(iter/s)": 0.117991 }, { "epoch": 1.5614681716962298, "grad_norm": 0.6996237635612488, "learning_rate": 4.674992674069445e-06, "loss": 0.3079190969467163, "memory(GiB)": 39.06, "step": 1270, "token_acc": 0.8922962411611463, "train_speed(iter/s)": 0.118087 }, { "epoch": 1.5676111495047225, "grad_norm": 0.7096247673034668, "learning_rate": 4.642866090826187e-06, "loss": 0.29966809749603274, "memory(GiB)": 39.06, "step": 1275, "token_acc": 0.8995864625915011, "train_speed(iter/s)": 0.118159 }, { "epoch": 1.5737541273132152, "grad_norm": 0.6891176104545593, "learning_rate": 4.610754320675603e-06, "loss": 0.28565430641174316, "memory(GiB)": 39.06, "step": 1280, "token_acc": 0.9035195544740737, "train_speed(iter/s)": 0.118282 }, { "epoch": 1.5737541273132152, "eval_loss": 0.3529431223869324, "eval_runtime": 31.0442, "eval_samples_per_second": 16.944, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8897604842196282, "step": 1280 }, { "epoch": 1.5798971051217077, "grad_norm": 0.6836899518966675, "learning_rate": 4.578658695540018e-06, "loss": 0.30156033039093016, "memory(GiB)": 39.06, "step": 1285, "token_acc": 0.8901772041128856, "train_speed(iter/s)": 0.117956 }, { "epoch": 1.5860400829302004, "grad_norm": 0.6600014567375183, "learning_rate": 4.5465805466721e-06, "loss": 0.30488083362579343, "memory(GiB)": 39.06, "step": 1290, "token_acc": 0.9087979374798582, "train_speed(iter/s)": 0.11807 }, { "epoch": 1.592183060738693, "grad_norm": 0.7213631272315979, "learning_rate": 4.514521204599645e-06, "loss": 0.30581624507904054, "memory(GiB)": 39.06, "step": 1295, "token_acc": 0.9020306055757139, "train_speed(iter/s)": 0.118174 }, { "epoch": 1.5983260385471858, "grad_norm": 0.6365712285041809, "learning_rate": 4.48248199907038e-06, "loss": 0.2971078872680664, "memory(GiB)": 39.06, "step": 1300, "token_acc": 0.9063318669368791, "train_speed(iter/s)": 0.118307 }, { "epoch": 1.5983260385471858, "eval_loss": 0.35123586654663086, "eval_runtime": 31.0565, "eval_samples_per_second": 16.937, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8903000432338953, "step": 1300 }, { "epoch": 1.6044690163556785, "grad_norm": 0.7233961820602417, "learning_rate": 4.450464258996822e-06, "loss": 0.3078035831451416, "memory(GiB)": 39.06, "step": 1305, "token_acc": 0.8908178398170103, "train_speed(iter/s)": 0.117996 }, { "epoch": 1.610611994164171, "grad_norm": 0.7506811022758484, "learning_rate": 4.418469312401141e-06, "loss": 0.29109845161437986, "memory(GiB)": 39.06, "step": 1310, "token_acc": 0.906337023704408, "train_speed(iter/s)": 0.118097 }, { "epoch": 1.6167549719726637, "grad_norm": 0.7110884785652161, "learning_rate": 4.386498486360095e-06, "loss": 0.3077766180038452, "memory(GiB)": 39.06, "step": 1315, "token_acc": 0.8983554542610717, "train_speed(iter/s)": 0.118213 }, { "epoch": 1.6228979497811564, "grad_norm": 0.6889677047729492, "learning_rate": 4.354553106949972e-06, "loss": 0.30059351921081545, "memory(GiB)": 39.06, "step": 1320, "token_acc": 0.90420160281651, "train_speed(iter/s)": 0.118315 }, { "epoch": 1.6228979497811564, "eval_loss": 0.3506639003753662, "eval_runtime": 31.0876, "eval_samples_per_second": 16.92, "eval_steps_per_second": 4.246, "eval_token_acc": 0.8903450064850843, "step": 1320 }, { "epoch": 1.6290409275896491, "grad_norm": 0.6659175753593445, "learning_rate": 4.3226344991915936e-06, "loss": 0.2960678577423096, "memory(GiB)": 39.06, "step": 1325, "token_acc": 0.8925680515759312, "train_speed(iter/s)": 0.117967 }, { "epoch": 1.6351839053981418, "grad_norm": 0.6886357069015503, "learning_rate": 4.290743986995353e-06, "loss": 0.30909056663513185, "memory(GiB)": 39.06, "step": 1330, "token_acc": 0.9006650503792344, "train_speed(iter/s)": 0.118082 }, { "epoch": 1.6413268832066343, "grad_norm": 0.7061545848846436, "learning_rate": 4.258882893106308e-06, "loss": 0.28565549850463867, "memory(GiB)": 39.06, "step": 1335, "token_acc": 0.9070018118019403, "train_speed(iter/s)": 0.118171 }, { "epoch": 1.647469861015127, "grad_norm": 0.7113469243049622, "learning_rate": 4.227052539049312e-06, "loss": 0.28241825103759766, "memory(GiB)": 39.06, "step": 1340, "token_acc": 0.898852240585334, "train_speed(iter/s)": 0.118285 }, { "epoch": 1.647469861015127, "eval_loss": 0.3508993089199066, "eval_runtime": 31.0521, "eval_samples_per_second": 16.939, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8900994379593601, "step": 1340 }, { "epoch": 1.6536128388236198, "grad_norm": 0.663295567035675, "learning_rate": 4.195254245074196e-06, "loss": 0.2974137783050537, "memory(GiB)": 39.06, "step": 1345, "token_acc": 0.8932698844323589, "train_speed(iter/s)": 0.117947 }, { "epoch": 1.6597558166321125, "grad_norm": 0.6674165725708008, "learning_rate": 4.163489330101017e-06, "loss": 0.3030970096588135, "memory(GiB)": 39.06, "step": 1350, "token_acc": 0.8978457754971743, "train_speed(iter/s)": 0.118042 }, { "epoch": 1.6658987944406052, "grad_norm": 0.6563280820846558, "learning_rate": 4.131759111665349e-06, "loss": 0.2904500961303711, "memory(GiB)": 39.06, "step": 1355, "token_acc": 0.902543907296759, "train_speed(iter/s)": 0.118117 }, { "epoch": 1.6720417722490977, "grad_norm": 0.6549026370048523, "learning_rate": 4.100064905863628e-06, "loss": 0.2979156970977783, "memory(GiB)": 39.06, "step": 1360, "token_acc": 0.8915877216849292, "train_speed(iter/s)": 0.118213 }, { "epoch": 1.6720417722490977, "eval_loss": 0.3503533601760864, "eval_runtime": 31.0554, "eval_samples_per_second": 16.937, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8904902723735408, "step": 1360 }, { "epoch": 1.6781847500575904, "grad_norm": 0.6918724179267883, "learning_rate": 4.068408027298576e-06, "loss": 0.2886175632476807, "memory(GiB)": 39.06, "step": 1365, "token_acc": 0.8957540263543192, "train_speed(iter/s)": 0.117895 }, { "epoch": 1.684327727866083, "grad_norm": 0.6951196193695068, "learning_rate": 4.036789789024659e-06, "loss": 0.30408420562744143, "memory(GiB)": 39.06, "step": 1370, "token_acc": 0.9016488217746225, "train_speed(iter/s)": 0.117988 }, { "epoch": 1.6904707056745758, "grad_norm": 0.7309929728507996, "learning_rate": 4.00521150249364e-06, "loss": 0.2967136144638062, "memory(GiB)": 39.06, "step": 1375, "token_acc": 0.9024064171122995, "train_speed(iter/s)": 0.1181 }, { "epoch": 1.6966136834830685, "grad_norm": 0.7061511278152466, "learning_rate": 3.973674477500172e-06, "loss": 0.3006556749343872, "memory(GiB)": 39.06, "step": 1380, "token_acc": 0.9038457180411086, "train_speed(iter/s)": 0.118226 }, { "epoch": 1.6966136834830685, "eval_loss": 0.3506544828414917, "eval_runtime": 31.002, "eval_samples_per_second": 16.967, "eval_steps_per_second": 4.258, "eval_token_acc": 0.8901928231733679, "step": 1380 }, { "epoch": 1.702756661291561, "grad_norm": 0.696220338344574, "learning_rate": 3.942180022127475e-06, "loss": 0.2850822925567627, "memory(GiB)": 39.06, "step": 1385, "token_acc": 0.8949225591538171, "train_speed(iter/s)": 0.117915 }, { "epoch": 1.7088996391000537, "grad_norm": 0.6707799434661865, "learning_rate": 3.910729442693077e-06, "loss": 0.30518031120300293, "memory(GiB)": 39.06, "step": 1390, "token_acc": 0.8971721087421103, "train_speed(iter/s)": 0.118027 }, { "epoch": 1.7150426169085464, "grad_norm": 0.694172203540802, "learning_rate": 3.8793240436946385e-06, "loss": 0.29511513710021975, "memory(GiB)": 39.06, "step": 1395, "token_acc": 0.9010794140323825, "train_speed(iter/s)": 0.118112 }, { "epoch": 1.7211855947170391, "grad_norm": 0.6791805624961853, "learning_rate": 3.847965127755834e-06, "loss": 0.2960803747177124, "memory(GiB)": 39.06, "step": 1400, "token_acc": 0.8956415132105685, "train_speed(iter/s)": 0.11822 }, { "epoch": 1.7211855947170391, "eval_loss": 0.350666344165802, "eval_runtime": 31.0507, "eval_samples_per_second": 16.94, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8905456117596195, "step": 1400 }, { "epoch": 1.7273285725255318, "grad_norm": 0.6747899651527405, "learning_rate": 3.816653995572332e-06, "loss": 0.290825629234314, "memory(GiB)": 39.06, "step": 1405, "token_acc": 0.891223331082264, "train_speed(iter/s)": 0.117914 }, { "epoch": 1.7334715503340243, "grad_norm": 0.660038411617279, "learning_rate": 3.7853919458578327e-06, "loss": 0.28858532905578616, "memory(GiB)": 39.06, "step": 1410, "token_acc": 0.9013322410968354, "train_speed(iter/s)": 0.118029 }, { "epoch": 1.739614528142517, "grad_norm": 0.6371601223945618, "learning_rate": 3.7541802752902224e-06, "loss": 0.28829474449157716, "memory(GiB)": 39.06, "step": 1415, "token_acc": 0.9037818893145325, "train_speed(iter/s)": 0.118112 }, { "epoch": 1.7457575059510098, "grad_norm": 0.7338966131210327, "learning_rate": 3.723020278457763e-06, "loss": 0.2963329076766968, "memory(GiB)": 39.06, "step": 1420, "token_acc": 0.9052094407824792, "train_speed(iter/s)": 0.118216 }, { "epoch": 1.7457575059510098, "eval_loss": 0.3507256507873535, "eval_runtime": 31.0383, "eval_samples_per_second": 16.947, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8900544747081712, "step": 1420 }, { "epoch": 1.7519004837595025, "grad_norm": 0.6258969902992249, "learning_rate": 3.6919132478054153e-06, "loss": 0.29568450450897216, "memory(GiB)": 39.06, "step": 1425, "token_acc": 0.8909262230371559, "train_speed(iter/s)": 0.117906 }, { "epoch": 1.7580434615679952, "grad_norm": 0.6673945784568787, "learning_rate": 3.6608604735812226e-06, "loss": 0.29297194480895994, "memory(GiB)": 39.06, "step": 1430, "token_acc": 0.9073745475193413, "train_speed(iter/s)": 0.117999 }, { "epoch": 1.7641864393764877, "grad_norm": 0.6559710502624512, "learning_rate": 3.629863243782799e-06, "loss": 0.29749407768249514, "memory(GiB)": 39.06, "step": 1435, "token_acc": 0.9093345763896982, "train_speed(iter/s)": 0.118115 }, { "epoch": 1.7703294171849804, "grad_norm": 0.6504038572311401, "learning_rate": 3.5989228441039024e-06, "loss": 0.29113216400146485, "memory(GiB)": 39.06, "step": 1440, "token_acc": 0.8930581191194346, "train_speed(iter/s)": 0.118206 }, { "epoch": 1.7703294171849804, "eval_loss": 0.34917929768562317, "eval_runtime": 31.0337, "eval_samples_per_second": 16.949, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8902377864245569, "step": 1440 }, { "epoch": 1.776472394993473, "grad_norm": 0.6400864720344543, "learning_rate": 3.568040557881106e-06, "loss": 0.2814110279083252, "memory(GiB)": 39.06, "step": 1445, "token_acc": 0.8906971833959715, "train_speed(iter/s)": 0.117931 }, { "epoch": 1.7826153728019658, "grad_norm": 0.7064361572265625, "learning_rate": 3.5372176660405717e-06, "loss": 0.3039525270462036, "memory(GiB)": 39.06, "step": 1450, "token_acc": 0.9050828549515421, "train_speed(iter/s)": 0.118013 }, { "epoch": 1.7887583506104585, "grad_norm": 0.6955869793891907, "learning_rate": 3.506455447044923e-06, "loss": 0.2821065425872803, "memory(GiB)": 39.06, "step": 1455, "token_acc": 0.9053625617102223, "train_speed(iter/s)": 0.118116 }, { "epoch": 1.794901328418951, "grad_norm": 0.6877216696739197, "learning_rate": 3.4757551768402074e-06, "loss": 0.2811419010162354, "memory(GiB)": 39.06, "step": 1460, "token_acc": 0.9011031359892095, "train_speed(iter/s)": 0.118215 }, { "epoch": 1.794901328418951, "eval_loss": 0.34925225377082825, "eval_runtime": 31.0492, "eval_samples_per_second": 16.941, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8903795936013835, "step": 1460 }, { "epoch": 1.8010443062274437, "grad_norm": 0.6559416055679321, "learning_rate": 3.4451181288029834e-06, "loss": 0.2829850912094116, "memory(GiB)": 39.06, "step": 1465, "token_acc": 0.8958890676209237, "train_speed(iter/s)": 0.117907 }, { "epoch": 1.8071872840359364, "grad_norm": 0.7104200720787048, "learning_rate": 3.4145455736874957e-06, "loss": 0.2918513059616089, "memory(GiB)": 39.06, "step": 1470, "token_acc": 0.9029460760822436, "train_speed(iter/s)": 0.118008 }, { "epoch": 1.8133302618444291, "grad_norm": 0.7294064164161682, "learning_rate": 3.3840387795729753e-06, "loss": 0.30045604705810547, "memory(GiB)": 39.06, "step": 1475, "token_acc": 0.8996919108690979, "train_speed(iter/s)": 0.118115 }, { "epoch": 1.8194732396529218, "grad_norm": 0.7393286824226379, "learning_rate": 3.353599011811037e-06, "loss": 0.3116471767425537, "memory(GiB)": 39.06, "step": 1480, "token_acc": 0.8992412297989751, "train_speed(iter/s)": 0.118208 }, { "epoch": 1.8194732396529218, "eval_loss": 0.34843236207962036, "eval_runtime": 31.0158, "eval_samples_per_second": 16.959, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8908534370946822, "step": 1480 }, { "epoch": 1.8256162174614143, "grad_norm": 0.7225602865219116, "learning_rate": 3.323227532973193e-06, "loss": 0.2964847326278687, "memory(GiB)": 39.06, "step": 1485, "token_acc": 0.8920995259023428, "train_speed(iter/s)": 0.117914 }, { "epoch": 1.831759195269907, "grad_norm": 0.7169524431228638, "learning_rate": 3.292925602798492e-06, "loss": 0.2890679359436035, "memory(GiB)": 39.06, "step": 1490, "token_acc": 0.9052988882813924, "train_speed(iter/s)": 0.118009 }, { "epoch": 1.8379021730783998, "grad_norm": 0.7271701097488403, "learning_rate": 3.262694478141266e-06, "loss": 0.30105009078979494, "memory(GiB)": 39.06, "step": 1495, "token_acc": 0.8908968566759589, "train_speed(iter/s)": 0.118105 }, { "epoch": 1.8440451508868925, "grad_norm": 0.7436238527297974, "learning_rate": 3.2325354129189923e-06, "loss": 0.3033268451690674, "memory(GiB)": 39.06, "step": 1500, "token_acc": 0.9051588095396857, "train_speed(iter/s)": 0.118206 }, { "epoch": 1.8440451508868925, "eval_loss": 0.3476485013961792, "eval_runtime": 31.0233, "eval_samples_per_second": 16.955, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8910609597924773, "step": 1500 }, { "epoch": 1.8501881286953852, "grad_norm": 0.6778759956359863, "learning_rate": 3.2024496580602892e-06, "loss": 0.29907703399658203, "memory(GiB)": 39.06, "step": 1505, "token_acc": 0.8934337447015377, "train_speed(iter/s)": 0.117911 }, { "epoch": 1.8563311065038777, "grad_norm": 0.6664173007011414, "learning_rate": 3.172438461453032e-06, "loss": 0.29923856258392334, "memory(GiB)": 39.06, "step": 1510, "token_acc": 0.8983641727004559, "train_speed(iter/s)": 0.118019 }, { "epoch": 1.8624740843123704, "grad_norm": 0.7407649755477905, "learning_rate": 3.142503067892594e-06, "loss": 0.3053209066390991, "memory(GiB)": 39.06, "step": 1515, "token_acc": 0.8974559495588846, "train_speed(iter/s)": 0.118102 }, { "epoch": 1.868617062120863, "grad_norm": 0.7822189927101135, "learning_rate": 3.112644719030206e-06, "loss": 0.2917191982269287, "memory(GiB)": 39.06, "step": 1520, "token_acc": 0.9052366138763197, "train_speed(iter/s)": 0.118195 }, { "epoch": 1.868617062120863, "eval_loss": 0.3474676311016083, "eval_runtime": 31.0192, "eval_samples_per_second": 16.957, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8910090791180285, "step": 1520 }, { "epoch": 1.8747600399293558, "grad_norm": 0.6843962669372559, "learning_rate": 3.0828646533214657e-06, "loss": 0.3129580497741699, "memory(GiB)": 39.06, "step": 1525, "token_acc": 0.8910584210937568, "train_speed(iter/s)": 0.117907 }, { "epoch": 1.8809030177378485, "grad_norm": 0.6650720238685608, "learning_rate": 3.053164105974964e-06, "loss": 0.3007251024246216, "memory(GiB)": 39.06, "step": 1530, "token_acc": 0.9046979865771813, "train_speed(iter/s)": 0.118012 }, { "epoch": 1.887045995546341, "grad_norm": 0.687574028968811, "learning_rate": 3.0235443089010564e-06, "loss": 0.2859373092651367, "memory(GiB)": 39.06, "step": 1535, "token_acc": 0.9096972925400097, "train_speed(iter/s)": 0.118098 }, { "epoch": 1.8931889733548337, "grad_norm": 0.6390689611434937, "learning_rate": 2.9940064906607607e-06, "loss": 0.28398540019989016, "memory(GiB)": 39.06, "step": 1540, "token_acc": 0.9035791530035582, "train_speed(iter/s)": 0.118191 }, { "epoch": 1.8931889733548337, "eval_loss": 0.3476438522338867, "eval_runtime": 31.0517, "eval_samples_per_second": 16.939, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8913341980112408, "step": 1540 }, { "epoch": 1.8993319511633264, "grad_norm": 0.6599735617637634, "learning_rate": 2.964551876414801e-06, "loss": 0.27951204776763916, "memory(GiB)": 39.06, "step": 1545, "token_acc": 0.8958811522271253, "train_speed(iter/s)": 0.117923 }, { "epoch": 1.9054749289718191, "grad_norm": 0.6753197312355042, "learning_rate": 2.93518168787279e-06, "loss": 0.2956626176834106, "memory(GiB)": 39.06, "step": 1550, "token_acc": 0.8987615726824576, "train_speed(iter/s)": 0.118005 }, { "epoch": 1.9116179067803118, "grad_norm": 0.7011248469352722, "learning_rate": 2.905897143242562e-06, "loss": 0.2975893497467041, "memory(GiB)": 39.06, "step": 1555, "token_acc": 0.9092895928621318, "train_speed(iter/s)": 0.118101 }, { "epoch": 1.9177608845888043, "grad_norm": 0.6635907292366028, "learning_rate": 2.8766994571796336e-06, "loss": 0.28919239044189454, "memory(GiB)": 39.06, "step": 1560, "token_acc": 0.9010686955756882, "train_speed(iter/s)": 0.118185 }, { "epoch": 1.9177608845888043, "eval_loss": 0.3471442759037018, "eval_runtime": 31.0546, "eval_samples_per_second": 16.938, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8912892347600518, "step": 1560 }, { "epoch": 1.923903862397297, "grad_norm": 0.7003067135810852, "learning_rate": 2.8475898407368298e-06, "loss": 0.3121751308441162, "memory(GiB)": 39.06, "step": 1565, "token_acc": 0.8906831756550552, "train_speed(iter/s)": 0.11792 }, { "epoch": 1.9300468402057898, "grad_norm": 0.6917641162872314, "learning_rate": 2.8185695013140474e-06, "loss": 0.31047801971435546, "memory(GiB)": 39.06, "step": 1570, "token_acc": 0.8935967102364517, "train_speed(iter/s)": 0.117987 }, { "epoch": 1.9361898180142825, "grad_norm": 0.717903196811676, "learning_rate": 2.7896396426081844e-06, "loss": 0.29785962104797364, "memory(GiB)": 39.06, "step": 1575, "token_acc": 0.9072703838075233, "train_speed(iter/s)": 0.118079 }, { "epoch": 1.9423327958227752, "grad_norm": 0.7065854072570801, "learning_rate": 2.7608014645632e-06, "loss": 0.2994864463806152, "memory(GiB)": 39.06, "step": 1580, "token_acc": 0.8992320879224104, "train_speed(iter/s)": 0.118176 }, { "epoch": 1.9423327958227752, "eval_loss": 0.34740638732910156, "eval_runtime": 31.0367, "eval_samples_per_second": 16.948, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8909641158668397, "step": 1580 }, { "epoch": 1.9484757736312677, "grad_norm": 0.7280552387237549, "learning_rate": 2.7320561633203567e-06, "loss": 0.2979745864868164, "memory(GiB)": 39.06, "step": 1585, "token_acc": 0.8901521037274909, "train_speed(iter/s)": 0.11791 }, { "epoch": 1.9546187514397604, "grad_norm": 0.6418682336807251, "learning_rate": 2.703404931168594e-06, "loss": 0.2907557010650635, "memory(GiB)": 39.06, "step": 1590, "token_acc": 0.8992377813256425, "train_speed(iter/s)": 0.117995 }, { "epoch": 1.960761729248253, "grad_norm": 0.738042414188385, "learning_rate": 2.6748489564950907e-06, "loss": 0.29802637100219725, "memory(GiB)": 39.06, "step": 1595, "token_acc": 0.8980035246119306, "train_speed(iter/s)": 0.118068 }, { "epoch": 1.9669047070567458, "grad_norm": 0.6280907988548279, "learning_rate": 2.6463894237359556e-06, "loss": 0.28393306732177737, "memory(GiB)": 39.06, "step": 1600, "token_acc": 0.9109865416676735, "train_speed(iter/s)": 0.11816 }, { "epoch": 1.9669047070567458, "eval_loss": 0.34687539935112, "eval_runtime": 31.0445, "eval_samples_per_second": 16.943, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8911128404669261, "step": 1600 }, { "epoch": 1.9730476848652385, "grad_norm": 0.7155392169952393, "learning_rate": 2.618027513327116e-06, "loss": 0.3036234378814697, "memory(GiB)": 39.06, "step": 1605, "token_acc": 0.8935712088588127, "train_speed(iter/s)": 0.117891 }, { "epoch": 1.979190662673731, "grad_norm": 0.7185878157615662, "learning_rate": 2.589764401655343e-06, "loss": 0.30625033378601074, "memory(GiB)": 39.06, "step": 1610, "token_acc": 0.9087600373057938, "train_speed(iter/s)": 0.117966 }, { "epoch": 1.9853336404822237, "grad_norm": 0.6735581159591675, "learning_rate": 2.5616012610094702e-06, "loss": 0.30725975036621095, "memory(GiB)": 39.06, "step": 1615, "token_acc": 0.8961660250130988, "train_speed(iter/s)": 0.118045 }, { "epoch": 1.9914766182907164, "grad_norm": 0.7557063102722168, "learning_rate": 2.533539259531757e-06, "loss": 0.29239468574523925, "memory(GiB)": 39.06, "step": 1620, "token_acc": 0.8937711127034497, "train_speed(iter/s)": 0.118131 }, { "epoch": 1.9914766182907164, "eval_loss": 0.3465479016304016, "eval_runtime": 31.0613, "eval_samples_per_second": 16.934, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8915348032857761, "step": 1620 }, { "epoch": 1.9976195960992091, "grad_norm": 0.6903244853019714, "learning_rate": 2.5055795611694435e-06, "loss": 0.2919922351837158, "memory(GiB)": 39.06, "step": 1625, "token_acc": 0.8967221510883483, "train_speed(iter/s)": 0.117886 }, { "epoch": 2.004914382246794, "grad_norm": 0.6568908095359802, "learning_rate": 2.4777233256264743e-06, "loss": 0.32158265113830564, "memory(GiB)": 39.06, "step": 1630, "token_acc": 0.9122267969438128, "train_speed(iter/s)": 0.117945 }, { "epoch": 2.011057360055287, "grad_norm": 0.7132671475410461, "learning_rate": 2.4499717083153975e-06, "loss": 0.26807637214660646, "memory(GiB)": 39.06, "step": 1635, "token_acc": 0.9197771990740741, "train_speed(iter/s)": 0.118027 }, { "epoch": 2.0172003378637795, "grad_norm": 0.6795634627342224, "learning_rate": 2.4223258603094295e-06, "loss": 0.2491468906402588, "memory(GiB)": 39.06, "step": 1640, "token_acc": 0.9240849211677818, "train_speed(iter/s)": 0.118126 }, { "epoch": 2.0172003378637795, "eval_loss": 0.353736937046051, "eval_runtime": 31.0562, "eval_samples_per_second": 16.937, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8904072632944229, "step": 1640 }, { "epoch": 2.023343315672272, "grad_norm": 0.6820616126060486, "learning_rate": 2.3947869282947263e-06, "loss": 0.24982304573059083, "memory(GiB)": 39.06, "step": 1645, "token_acc": 0.8986748783803705, "train_speed(iter/s)": 0.117854 }, { "epoch": 2.029486293480765, "grad_norm": 0.7354549169540405, "learning_rate": 2.3673560545228082e-06, "loss": 0.25387675762176515, "memory(GiB)": 39.06, "step": 1650, "token_acc": 0.9141678261286763, "train_speed(iter/s)": 0.117936 }, { "epoch": 2.0356292712892574, "grad_norm": 0.6763687133789062, "learning_rate": 2.3400343767631943e-06, "loss": 0.25168399810791015, "memory(GiB)": 41.58, "step": 1655, "token_acc": 0.9232377049180328, "train_speed(iter/s)": 0.118023 }, { "epoch": 2.0417722490977503, "grad_norm": 0.6416710019111633, "learning_rate": 2.312823028256205e-06, "loss": 0.2497392177581787, "memory(GiB)": 41.58, "step": 1660, "token_acc": 0.9226366364968939, "train_speed(iter/s)": 0.118098 }, { "epoch": 2.0417722490977503, "eval_loss": 0.35414808988571167, "eval_runtime": 31.0374, "eval_samples_per_second": 16.947, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8909952442715089, "step": 1660 }, { "epoch": 2.047915226906243, "grad_norm": 0.6878734827041626, "learning_rate": 2.2857231376659517e-06, "loss": 0.26041717529296876, "memory(GiB)": 41.58, "step": 1665, "token_acc": 0.895642282731377, "train_speed(iter/s)": 0.117842 }, { "epoch": 2.0540582047147353, "grad_norm": 0.6756438612937927, "learning_rate": 2.258735829033529e-06, "loss": 0.2607592582702637, "memory(GiB)": 41.58, "step": 1670, "token_acc": 0.904909300316729, "train_speed(iter/s)": 0.117933 }, { "epoch": 2.0602011825232283, "grad_norm": 0.6508097648620605, "learning_rate": 2.231862221730394e-06, "loss": 0.2445054054260254, "memory(GiB)": 41.58, "step": 1675, "token_acc": 0.9190018092758484, "train_speed(iter/s)": 0.117998 }, { "epoch": 2.0663441603317207, "grad_norm": 0.6221520900726318, "learning_rate": 2.2051034304119344e-06, "loss": 0.2536668300628662, "memory(GiB)": 41.58, "step": 1680, "token_acc": 0.9074535753395931, "train_speed(iter/s)": 0.118087 }, { "epoch": 2.0663441603317207, "eval_loss": 0.3554106652736664, "eval_runtime": 31.0289, "eval_samples_per_second": 16.952, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8906770428015565, "step": 1680 }, { "epoch": 2.0724871381402137, "grad_norm": 0.6437965035438538, "learning_rate": 2.1784605649712326e-06, "loss": 0.2540877103805542, "memory(GiB)": 41.58, "step": 1685, "token_acc": 0.896763604572522, "train_speed(iter/s)": 0.117846 }, { "epoch": 2.078630115948706, "grad_norm": 0.6842249631881714, "learning_rate": 2.1519347304930317e-06, "loss": 0.2614542007446289, "memory(GiB)": 41.58, "step": 1690, "token_acc": 0.9103810036765567, "train_speed(iter/s)": 0.117925 }, { "epoch": 2.0847730937571987, "grad_norm": 0.6956413388252258, "learning_rate": 2.1255270272079044e-06, "loss": 0.2528813362121582, "memory(GiB)": 41.58, "step": 1695, "token_acc": 0.9163791495710556, "train_speed(iter/s)": 0.118022 }, { "epoch": 2.0909160715656916, "grad_norm": 0.7066583037376404, "learning_rate": 2.0992385504466075e-06, "loss": 0.2548670291900635, "memory(GiB)": 41.58, "step": 1700, "token_acc": 0.9165421398684998, "train_speed(iter/s)": 0.118107 }, { "epoch": 2.0909160715656916, "eval_loss": 0.35480257868766785, "eval_runtime": 31.1108, "eval_samples_per_second": 16.907, "eval_steps_per_second": 4.243, "eval_token_acc": 0.8904660613921315, "step": 1700 }, { "epoch": 2.097059049374184, "grad_norm": 0.6234432458877563, "learning_rate": 2.0730703905946612e-06, "loss": 0.24052574634552001, "memory(GiB)": 41.58, "step": 1705, "token_acc": 0.8977333662447761, "train_speed(iter/s)": 0.117854 }, { "epoch": 2.103202027182677, "grad_norm": 0.7239139080047607, "learning_rate": 2.0470236330471125e-06, "loss": 0.2701937437057495, "memory(GiB)": 41.58, "step": 1710, "token_acc": 0.9132731300051116, "train_speed(iter/s)": 0.117927 }, { "epoch": 2.1093450049911695, "grad_norm": 0.7042427062988281, "learning_rate": 2.0210993581635257e-06, "loss": 0.2760786533355713, "memory(GiB)": 41.58, "step": 1715, "token_acc": 0.9138149259328708, "train_speed(iter/s)": 0.118022 }, { "epoch": 2.115487982799662, "grad_norm": 0.6625633835792542, "learning_rate": 1.9952986412231612e-06, "loss": 0.2629417657852173, "memory(GiB)": 41.58, "step": 1720, "token_acc": 0.9162193754622009, "train_speed(iter/s)": 0.118081 }, { "epoch": 2.115487982799662, "eval_loss": 0.35503000020980835, "eval_runtime": 31.0455, "eval_samples_per_second": 16.943, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8903346303501946, "step": 1720 }, { "epoch": 2.121630960608155, "grad_norm": 0.698905885219574, "learning_rate": 1.9696225523803803e-06, "loss": 0.2582688808441162, "memory(GiB)": 41.58, "step": 1725, "token_acc": 0.8980166095055636, "train_speed(iter/s)": 0.11783 }, { "epoch": 2.1277739384166474, "grad_norm": 0.6825575828552246, "learning_rate": 1.944072156620261e-06, "loss": 0.2485950469970703, "memory(GiB)": 41.58, "step": 1730, "token_acc": 0.9185884165422945, "train_speed(iter/s)": 0.117919 }, { "epoch": 2.1339169162251403, "grad_norm": 0.656775176525116, "learning_rate": 1.9186485137144217e-06, "loss": 0.26242403984069823, "memory(GiB)": 41.58, "step": 1735, "token_acc": 0.9276958754348186, "train_speed(iter/s)": 0.118001 }, { "epoch": 2.140059894033633, "grad_norm": 0.6787784099578857, "learning_rate": 1.89335267817706e-06, "loss": 0.2578416347503662, "memory(GiB)": 41.58, "step": 1740, "token_acc": 0.9204126213592233, "train_speed(iter/s)": 0.118068 }, { "epoch": 2.140059894033633, "eval_loss": 0.35603559017181396, "eval_runtime": 31.0997, "eval_samples_per_second": 16.913, "eval_steps_per_second": 4.244, "eval_token_acc": 0.8905179420665802, "step": 1740 }, { "epoch": 2.1462028718421253, "grad_norm": 0.705270528793335, "learning_rate": 1.8681856992212211e-06, "loss": 0.27148022651672366, "memory(GiB)": 41.58, "step": 1745, "token_acc": 0.8956282843498057, "train_speed(iter/s)": 0.117819 }, { "epoch": 2.1523458496506183, "grad_norm": 0.6656559705734253, "learning_rate": 1.8431486207152704e-06, "loss": 0.251650071144104, "memory(GiB)": 41.58, "step": 1750, "token_acc": 0.9161503405192278, "train_speed(iter/s)": 0.117892 }, { "epoch": 2.1584888274591107, "grad_norm": 0.6367560625076294, "learning_rate": 1.8182424811396131e-06, "loss": 0.24891986846923828, "memory(GiB)": 41.58, "step": 1755, "token_acc": 0.917142553869016, "train_speed(iter/s)": 0.117962 }, { "epoch": 2.1646318052676037, "grad_norm": 0.7008864283561707, "learning_rate": 1.7934683135435993e-06, "loss": 0.25353493690490725, "memory(GiB)": 41.58, "step": 1760, "token_acc": 0.9114828452290961, "train_speed(iter/s)": 0.118051 }, { "epoch": 2.1646318052676037, "eval_loss": 0.35659661889076233, "eval_runtime": 31.0459, "eval_samples_per_second": 16.943, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8904626026805015, "step": 1760 }, { "epoch": 2.170774783076096, "grad_norm": 0.6810339093208313, "learning_rate": 1.7688271455026867e-06, "loss": 0.25748143196105955, "memory(GiB)": 41.58, "step": 1765, "token_acc": 0.8993780164502753, "train_speed(iter/s)": 0.117817 }, { "epoch": 2.1769177608845887, "grad_norm": 0.701768696308136, "learning_rate": 1.7443199990758168e-06, "loss": 0.25628554821014404, "memory(GiB)": 41.58, "step": 1770, "token_acc": 0.9092464549396461, "train_speed(iter/s)": 0.117899 }, { "epoch": 2.1830607386930816, "grad_norm": 0.6798021793365479, "learning_rate": 1.7199478907630269e-06, "loss": 0.25238001346588135, "memory(GiB)": 41.58, "step": 1775, "token_acc": 0.9152910102820488, "train_speed(iter/s)": 0.117983 }, { "epoch": 2.189203716501574, "grad_norm": 0.7590020895004272, "learning_rate": 1.6957118314632825e-06, "loss": 0.26000936031341554, "memory(GiB)": 41.58, "step": 1780, "token_acc": 0.9114106063560148, "train_speed(iter/s)": 0.118075 }, { "epoch": 2.189203716501574, "eval_loss": 0.3557458817958832, "eval_runtime": 31.0956, "eval_samples_per_second": 16.916, "eval_steps_per_second": 4.245, "eval_token_acc": 0.8905248594898401, "step": 1780 }, { "epoch": 2.195346694310067, "grad_norm": 0.690200924873352, "learning_rate": 1.6716128264325477e-06, "loss": 0.26896276473999026, "memory(GiB)": 41.58, "step": 1785, "token_acc": 0.8972400913052501, "train_speed(iter/s)": 0.117847 }, { "epoch": 2.2014896721185595, "grad_norm": 0.7046708464622498, "learning_rate": 1.64765187524209e-06, "loss": 0.2622739315032959, "memory(GiB)": 41.58, "step": 1790, "token_acc": 0.9040114613180515, "train_speed(iter/s)": 0.117916 }, { "epoch": 2.207632649927052, "grad_norm": 0.6468427181243896, "learning_rate": 1.6238299717370254e-06, "loss": 0.25573272705078126, "memory(GiB)": 41.58, "step": 1795, "token_acc": 0.913803724588921, "train_speed(iter/s)": 0.117988 }, { "epoch": 2.213775627735545, "grad_norm": 0.6906710863113403, "learning_rate": 1.6001481039950872e-06, "loss": 0.24774715900421143, "memory(GiB)": 41.58, "step": 1800, "token_acc": 0.9198941998866428, "train_speed(iter/s)": 0.118059 }, { "epoch": 2.213775627735545, "eval_loss": 0.3556331396102905, "eval_runtime": 31.0281, "eval_samples_per_second": 16.952, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8904798962386511, "step": 1800 }, { "epoch": 2.2199186055440374, "grad_norm": 0.67650306224823, "learning_rate": 1.5766072542856525e-06, "loss": 0.2552159070968628, "memory(GiB)": 41.58, "step": 1805, "token_acc": 0.8967117243311388, "train_speed(iter/s)": 0.117823 }, { "epoch": 2.2260615833525303, "grad_norm": 0.6951079368591309, "learning_rate": 1.5532083990289892e-06, "loss": 0.25490808486938477, "memory(GiB)": 41.58, "step": 1810, "token_acc": 0.9191604784561341, "train_speed(iter/s)": 0.117913 }, { "epoch": 2.232204561161023, "grad_norm": 0.6896148920059204, "learning_rate": 1.5299525087557682e-06, "loss": 0.2403803586959839, "memory(GiB)": 41.58, "step": 1815, "token_acc": 0.9143227478937136, "train_speed(iter/s)": 0.117979 }, { "epoch": 2.2383475389695153, "grad_norm": 0.6858778595924377, "learning_rate": 1.5068405480667975e-06, "loss": 0.2647264003753662, "memory(GiB)": 41.58, "step": 1820, "token_acc": 0.9243344548061508, "train_speed(iter/s)": 0.118051 }, { "epoch": 2.2383475389695153, "eval_loss": 0.35514572262763977, "eval_runtime": 31.0442, "eval_samples_per_second": 16.944, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8905214007782101, "step": 1820 }, { "epoch": 2.2444905167780083, "grad_norm": 0.7021420001983643, "learning_rate": 1.4838734755930168e-06, "loss": 0.2488544464111328, "memory(GiB)": 41.58, "step": 1825, "token_acc": 0.8990035802096363, "train_speed(iter/s)": 0.117822 }, { "epoch": 2.2506334945865007, "grad_norm": 0.7138723134994507, "learning_rate": 1.461052243955739e-06, "loss": 0.2516676902770996, "memory(GiB)": 41.58, "step": 1830, "token_acc": 0.9070056092612484, "train_speed(iter/s)": 0.117885 }, { "epoch": 2.2567764723949937, "grad_norm": 0.6612991094589233, "learning_rate": 1.4383777997271347e-06, "loss": 0.25036053657531737, "memory(GiB)": 41.58, "step": 1835, "token_acc": 0.9232339162298808, "train_speed(iter/s)": 0.11797 }, { "epoch": 2.262919450203486, "grad_norm": 0.670829176902771, "learning_rate": 1.4158510833909688e-06, "loss": 0.26495842933654784, "memory(GiB)": 41.58, "step": 1840, "token_acc": 0.9127685871838752, "train_speed(iter/s)": 0.118042 }, { "epoch": 2.262919450203486, "eval_loss": 0.35496076941490173, "eval_runtime": 31.0316, "eval_samples_per_second": 16.95, "eval_steps_per_second": 4.254, "eval_token_acc": 0.890611327280588, "step": 1840 }, { "epoch": 2.2690624280119787, "grad_norm": 0.6969290971755981, "learning_rate": 1.3934730293035935e-06, "loss": 0.2619413614273071, "memory(GiB)": 41.58, "step": 1845, "token_acc": 0.8992944915071285, "train_speed(iter/s)": 0.117838 }, { "epoch": 2.2752054058204716, "grad_norm": 0.697259247303009, "learning_rate": 1.3712445656551904e-06, "loss": 0.26856374740600586, "memory(GiB)": 41.58, "step": 1850, "token_acc": 0.9039304347826087, "train_speed(iter/s)": 0.117916 }, { "epoch": 2.281348383628964, "grad_norm": 0.7025954127311707, "learning_rate": 1.349166614431282e-06, "loss": 0.2570216655731201, "memory(GiB)": 41.58, "step": 1855, "token_acc": 0.9162639337494233, "train_speed(iter/s)": 0.117981 }, { "epoch": 2.287491361437457, "grad_norm": 0.6871860027313232, "learning_rate": 1.3272400913744744e-06, "loss": 0.262271785736084, "memory(GiB)": 41.58, "step": 1860, "token_acc": 0.9138437528688148, "train_speed(iter/s)": 0.118061 }, { "epoch": 2.287491361437457, "eval_loss": 0.35484763979911804, "eval_runtime": 31.0168, "eval_samples_per_second": 16.959, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8907669693039343, "step": 1860 }, { "epoch": 2.2936343392459495, "grad_norm": 0.6811879873275757, "learning_rate": 1.3054659059464836e-06, "loss": 0.2392117500305176, "memory(GiB)": 41.58, "step": 1865, "token_acc": 0.901497755975368, "train_speed(iter/s)": 0.117825 }, { "epoch": 2.299777317054442, "grad_norm": 0.7064546346664429, "learning_rate": 1.2838449612904108e-06, "loss": 0.266256046295166, "memory(GiB)": 41.58, "step": 1870, "token_acc": 0.9117101026954622, "train_speed(iter/s)": 0.117915 }, { "epoch": 2.305920294862935, "grad_norm": 0.7244398593902588, "learning_rate": 1.262378154193285e-06, "loss": 0.23866605758666992, "memory(GiB)": 41.58, "step": 1875, "token_acc": 0.915842304335176, "train_speed(iter/s)": 0.117981 }, { "epoch": 2.3120632726714274, "grad_norm": 0.7136631608009338, "learning_rate": 1.2410663750488644e-06, "loss": 0.25197710990905764, "memory(GiB)": 41.58, "step": 1880, "token_acc": 0.9191310820870271, "train_speed(iter/s)": 0.118043 }, { "epoch": 2.3120632726714274, "eval_loss": 0.355129599571228, "eval_runtime": 31.0906, "eval_samples_per_second": 16.918, "eval_steps_per_second": 4.246, "eval_token_acc": 0.8907254647643753, "step": 1880 }, { "epoch": 2.3182062504799203, "grad_norm": 0.6782585978507996, "learning_rate": 1.2199105078207002e-06, "loss": 0.2743240833282471, "memory(GiB)": 41.58, "step": 1885, "token_acc": 0.8939592652104051, "train_speed(iter/s)": 0.117803 }, { "epoch": 2.324349228288413, "grad_norm": 0.6339967846870422, "learning_rate": 1.1989114300054782e-06, "loss": 0.25202603340148927, "memory(GiB)": 41.58, "step": 1890, "token_acc": 0.916531565897387, "train_speed(iter/s)": 0.117882 }, { "epoch": 2.3304922060969053, "grad_norm": 0.6756547689437866, "learning_rate": 1.1780700125966232e-06, "loss": 0.2598109722137451, "memory(GiB)": 41.58, "step": 1895, "token_acc": 0.9081785893065719, "train_speed(iter/s)": 0.117946 }, { "epoch": 2.3366351839053983, "grad_norm": 0.7056384086608887, "learning_rate": 1.1573871200481634e-06, "loss": 0.2566692352294922, "memory(GiB)": 41.58, "step": 1900, "token_acc": 0.9156997782187464, "train_speed(iter/s)": 0.118011 }, { "epoch": 2.3366351839053983, "eval_loss": 0.35557088255882263, "eval_runtime": 31.0333, "eval_samples_per_second": 16.95, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8906389969736274, "step": 1900 }, { "epoch": 2.3427781617138907, "grad_norm": 0.7157571911811829, "learning_rate": 1.136863610238887e-06, "loss": 0.25399596691131593, "memory(GiB)": 41.58, "step": 1905, "token_acc": 0.8955967995576062, "train_speed(iter/s)": 0.117798 }, { "epoch": 2.3489211395223837, "grad_norm": 0.6849676370620728, "learning_rate": 1.1165003344367465e-06, "loss": 0.2500483512878418, "memory(GiB)": 41.58, "step": 1910, "token_acc": 0.9112139701241321, "train_speed(iter/s)": 0.11788 }, { "epoch": 2.355064117330876, "grad_norm": 0.6843670010566711, "learning_rate": 1.0962981372635629e-06, "loss": 0.24124569892883302, "memory(GiB)": 41.58, "step": 1915, "token_acc": 0.9228162034548048, "train_speed(iter/s)": 0.117963 }, { "epoch": 2.3612070951393687, "grad_norm": 0.6974015235900879, "learning_rate": 1.0762578566599818e-06, "loss": 0.24528083801269532, "memory(GiB)": 41.58, "step": 1920, "token_acc": 0.9175750441436139, "train_speed(iter/s)": 0.118052 }, { "epoch": 2.3612070951393687, "eval_loss": 0.3551888167858124, "eval_runtime": 30.9708, "eval_samples_per_second": 16.984, "eval_steps_per_second": 4.262, "eval_token_acc": 0.8907427583225248, "step": 1920 }, { "epoch": 2.3673500729478616, "grad_norm": 0.6731058359146118, "learning_rate": 1.056380323850722e-06, "loss": 0.24767663478851318, "memory(GiB)": 41.58, "step": 1925, "token_acc": 0.90198810396806, "train_speed(iter/s)": 0.117814 }, { "epoch": 2.373493050756354, "grad_norm": 0.6461980938911438, "learning_rate": 1.0366663633101015e-06, "loss": 0.2535504102706909, "memory(GiB)": 41.58, "step": 1930, "token_acc": 0.9234430094966145, "train_speed(iter/s)": 0.117879 }, { "epoch": 2.379636028564847, "grad_norm": 0.6973277926445007, "learning_rate": 1.0171167927278369e-06, "loss": 0.25800695419311526, "memory(GiB)": 41.58, "step": 1935, "token_acc": 0.9152892113208366, "train_speed(iter/s)": 0.117936 }, { "epoch": 2.3857790063733395, "grad_norm": 0.6010280847549438, "learning_rate": 9.977324229751245e-07, "loss": 0.2460566520690918, "memory(GiB)": 41.58, "step": 1940, "token_acc": 0.9177397229965928, "train_speed(iter/s)": 0.117997 }, { "epoch": 2.3857790063733395, "eval_loss": 0.35497036576271057, "eval_runtime": 31.0056, "eval_samples_per_second": 16.965, "eval_steps_per_second": 4.257, "eval_token_acc": 0.8907565931690445, "step": 1940 }, { "epoch": 2.391921984181832, "grad_norm": 0.7224907875061035, "learning_rate": 9.785140580710106e-07, "loss": 0.24542105197906494, "memory(GiB)": 41.58, "step": 1945, "token_acc": 0.899276675757627, "train_speed(iter/s)": 0.117779 }, { "epoch": 2.398064961990325, "grad_norm": 0.6951374411582947, "learning_rate": 9.594624951490455e-07, "loss": 0.2523444652557373, "memory(GiB)": 41.58, "step": 1950, "token_acc": 0.9187413638457249, "train_speed(iter/s)": 0.11785 }, { "epoch": 2.4042079397988174, "grad_norm": 0.708865761756897, "learning_rate": 9.405785244242166e-07, "loss": 0.2396538734436035, "memory(GiB)": 41.58, "step": 1955, "token_acc": 0.9178324813918034, "train_speed(iter/s)": 0.117923 }, { "epoch": 2.4103509176073104, "grad_norm": 0.6320639848709106, "learning_rate": 9.218629291601699e-07, "loss": 0.23296713829040527, "memory(GiB)": 41.58, "step": 1960, "token_acc": 0.9257631364964948, "train_speed(iter/s)": 0.117998 }, { "epoch": 2.4103509176073104, "eval_loss": 0.3550316095352173, "eval_runtime": 31.0148, "eval_samples_per_second": 16.96, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8910021616947686, "step": 1960 }, { "epoch": 2.416493895415803, "grad_norm": 0.6433020234107971, "learning_rate": 9.033164856367271e-07, "loss": 0.24781334400177002, "memory(GiB)": 41.58, "step": 1965, "token_acc": 0.8978867315004879, "train_speed(iter/s)": 0.117779 }, { "epoch": 2.4226368732242953, "grad_norm": 0.7556272745132446, "learning_rate": 8.849399631176825e-07, "loss": 0.261240553855896, "memory(GiB)": 41.58, "step": 1970, "token_acc": 0.9180517884878411, "train_speed(iter/s)": 0.117846 }, { "epoch": 2.4287798510327883, "grad_norm": 0.6567925214767456, "learning_rate": 8.667341238189009e-07, "loss": 0.24376273155212402, "memory(GiB)": 41.58, "step": 1975, "token_acc": 0.9204566085693536, "train_speed(iter/s)": 0.117908 }, { "epoch": 2.4349228288412808, "grad_norm": 0.6730430722236633, "learning_rate": 8.486997228767013e-07, "loss": 0.26009833812713623, "memory(GiB)": 41.58, "step": 1980, "token_acc": 0.9134192822777164, "train_speed(iter/s)": 0.117978 }, { "epoch": 2.4349228288412808, "eval_loss": 0.3539762794971466, "eval_runtime": 31.0521, "eval_samples_per_second": 16.939, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8909018590575011, "step": 1980 }, { "epoch": 2.4410658066497737, "grad_norm": 0.7423481941223145, "learning_rate": 8.308375083165299e-07, "loss": 0.24584083557128905, "memory(GiB)": 41.58, "step": 1985, "token_acc": 0.8992385337347264, "train_speed(iter/s)": 0.117767 }, { "epoch": 2.447208784458266, "grad_norm": 0.6721974015235901, "learning_rate": 8.131482210219383e-07, "loss": 0.251566219329834, "memory(GiB)": 41.58, "step": 1990, "token_acc": 0.9197381858694643, "train_speed(iter/s)": 0.117832 }, { "epoch": 2.4533517622667587, "grad_norm": 0.6605408787727356, "learning_rate": 7.956325947038585e-07, "loss": 0.2555187702178955, "memory(GiB)": 41.58, "step": 1995, "token_acc": 0.9162280042111596, "train_speed(iter/s)": 0.117901 }, { "epoch": 2.4594947400752516, "grad_norm": 0.647904098033905, "learning_rate": 7.782913558701572e-07, "loss": 0.2506421089172363, "memory(GiB)": 41.58, "step": 2000, "token_acc": 0.9203814955936324, "train_speed(iter/s)": 0.117965 }, { "epoch": 2.4594947400752516, "eval_loss": 0.3547162115573883, "eval_runtime": 31.1783, "eval_samples_per_second": 16.871, "eval_steps_per_second": 4.234, "eval_token_acc": 0.8908811067877216, "step": 2000 }, { "epoch": 2.465637717883744, "grad_norm": 0.7217480540275574, "learning_rate": 7.611252237955168e-07, "loss": 0.24761755466461183, "memory(GiB)": 41.58, "step": 2005, "token_acc": 0.8972576188708813, "train_speed(iter/s)": 0.117755 }, { "epoch": 2.471780695692237, "grad_norm": 0.6904724836349487, "learning_rate": 7.44134910491589e-07, "loss": 0.2681485414505005, "memory(GiB)": 41.58, "step": 2010, "token_acc": 0.9053991693585602, "train_speed(iter/s)": 0.117834 }, { "epoch": 2.4779236735007295, "grad_norm": 0.6789990663528442, "learning_rate": 7.273211206774711e-07, "loss": 0.24847228527069093, "memory(GiB)": 41.58, "step": 2015, "token_acc": 0.9193213372105735, "train_speed(iter/s)": 0.117908 }, { "epoch": 2.484066651309222, "grad_norm": 0.7324934601783752, "learning_rate": 7.106845517504684e-07, "loss": 0.24457526206970215, "memory(GiB)": 41.58, "step": 2020, "token_acc": 0.9162846862832077, "train_speed(iter/s)": 0.117969 }, { "epoch": 2.484066651309222, "eval_loss": 0.3543083965778351, "eval_runtime": 31.0241, "eval_samples_per_second": 16.955, "eval_steps_per_second": 4.255, "eval_token_acc": 0.8908396022481626, "step": 2020 }, { "epoch": 2.490209629117715, "grad_norm": 0.7012256383895874, "learning_rate": 6.942258937571772e-07, "loss": 0.25258448123931887, "memory(GiB)": 41.58, "step": 2025, "token_acc": 0.8976666927565725, "train_speed(iter/s)": 0.11777 }, { "epoch": 2.4963526069262074, "grad_norm": 0.6754176020622253, "learning_rate": 6.779458293648506e-07, "loss": 0.2500795841217041, "memory(GiB)": 41.58, "step": 2030, "token_acc": 0.9177111716621253, "train_speed(iter/s)": 0.117835 }, { "epoch": 2.5024955847347004, "grad_norm": 0.6942124962806702, "learning_rate": 6.618450338330978e-07, "loss": 0.245684814453125, "memory(GiB)": 41.58, "step": 2035, "token_acc": 0.9162501585690727, "train_speed(iter/s)": 0.117915 }, { "epoch": 2.508638562543193, "grad_norm": 0.6740065813064575, "learning_rate": 6.459241749858619e-07, "loss": 0.25455806255340574, "memory(GiB)": 41.58, "step": 2040, "token_acc": 0.9220431950634214, "train_speed(iter/s)": 0.117979 }, { "epoch": 2.508638562543193, "eval_loss": 0.35373052954673767, "eval_runtime": 31.0821, "eval_samples_per_second": 16.923, "eval_steps_per_second": 4.247, "eval_token_acc": 0.8911335927367056, "step": 2040 }, { "epoch": 2.5147815403516853, "grad_norm": 0.6818024516105652, "learning_rate": 6.301839131837284e-07, "loss": 0.2483248233795166, "memory(GiB)": 41.58, "step": 2045, "token_acc": 0.9004994038258826, "train_speed(iter/s)": 0.117768 }, { "epoch": 2.5209245181601783, "grad_norm": 0.6766259074211121, "learning_rate": 6.146249012965349e-07, "loss": 0.25524895191192626, "memory(GiB)": 41.58, "step": 2050, "token_acc": 0.9155308997100655, "train_speed(iter/s)": 0.117834 }, { "epoch": 2.5270674959686708, "grad_norm": 0.6721575260162354, "learning_rate": 5.992477846762896e-07, "loss": 0.2647790193557739, "memory(GiB)": 41.58, "step": 2055, "token_acc": 0.9044405418966383, "train_speed(iter/s)": 0.117893 }, { "epoch": 2.5332104737771637, "grad_norm": 0.7143027782440186, "learning_rate": 5.840532011303996e-07, "loss": 0.2634526491165161, "memory(GiB)": 41.58, "step": 2060, "token_acc": 0.9136083648221958, "train_speed(iter/s)": 0.117955 }, { "epoch": 2.5332104737771637, "eval_loss": 0.35337749123573303, "eval_runtime": 31.0323, "eval_samples_per_second": 16.95, "eval_steps_per_second": 4.254, "eval_token_acc": 0.8909814094249892, "step": 2060 }, { "epoch": 2.539353451585656, "grad_norm": 0.6832711100578308, "learning_rate": 5.690417808952243e-07, "loss": 0.2547764301300049, "memory(GiB)": 41.58, "step": 2065, "token_acc": 0.8971203129214999, "train_speed(iter/s)": 0.117757 }, { "epoch": 2.5454964293941487, "grad_norm": 0.7033362984657288, "learning_rate": 5.542141466099271e-07, "loss": 0.26053800582885744, "memory(GiB)": 41.58, "step": 2070, "token_acc": 0.9055393728734732, "train_speed(iter/s)": 0.117841 }, { "epoch": 2.5516394072026416, "grad_norm": 0.7116051912307739, "learning_rate": 5.395709132906569e-07, "loss": 0.25941154956817625, "memory(GiB)": 41.58, "step": 2075, "token_acc": 0.920958114777396, "train_speed(iter/s)": 0.117919 }, { "epoch": 2.557782385011134, "grad_norm": 0.6814519166946411, "learning_rate": 5.251126883050333e-07, "loss": 0.26160635948181155, "memory(GiB)": 41.58, "step": 2080, "token_acc": 0.912257738587306, "train_speed(iter/s)": 0.117989 }, { "epoch": 2.557782385011134, "eval_loss": 0.3543572723865509, "eval_runtime": 31.0115, "eval_samples_per_second": 16.961, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8910229139645482, "step": 2080 }, { "epoch": 2.563925362819627, "grad_norm": 0.7511703372001648, "learning_rate": 5.108400713469547e-07, "loss": 0.24686145782470703, "memory(GiB)": 41.58, "step": 2085, "token_acc": 0.8980160383253489, "train_speed(iter/s)": 0.117791 }, { "epoch": 2.5700683406281195, "grad_norm": 0.6902100443840027, "learning_rate": 4.967536544117263e-07, "loss": 0.26129970550537107, "memory(GiB)": 41.58, "step": 2090, "token_acc": 0.9143400153853115, "train_speed(iter/s)": 0.117849 }, { "epoch": 2.576211318436612, "grad_norm": 0.759671688079834, "learning_rate": 4.828540217715067e-07, "loss": 0.27549381256103517, "memory(GiB)": 41.58, "step": 2095, "token_acc": 0.9109081247944131, "train_speed(iter/s)": 0.117916 }, { "epoch": 2.582354296245105, "grad_norm": 0.6925843954086304, "learning_rate": 4.6914174995106863e-07, "loss": 0.25518312454223635, "memory(GiB)": 41.58, "step": 2100, "token_acc": 0.9096630452258998, "train_speed(iter/s)": 0.117988 }, { "epoch": 2.582354296245105, "eval_loss": 0.3541419208049774, "eval_runtime": 31.047, "eval_samples_per_second": 16.942, "eval_steps_per_second": 4.252, "eval_token_acc": 0.8910021616947686, "step": 2100 }, { "epoch": 2.5884972740535974, "grad_norm": 0.7308095693588257, "learning_rate": 4.556174077038927e-07, "loss": 0.2574288845062256, "memory(GiB)": 41.58, "step": 2105, "token_acc": 0.899001034002444, "train_speed(iter/s)": 0.11778 }, { "epoch": 2.5946402518620904, "grad_norm": 0.6761147379875183, "learning_rate": 4.422815559885696e-07, "loss": 0.2425455093383789, "memory(GiB)": 41.58, "step": 2110, "token_acc": 0.9116659922401276, "train_speed(iter/s)": 0.117842 }, { "epoch": 2.600783229670583, "grad_norm": 0.697441816329956, "learning_rate": 4.2913474794554044e-07, "loss": 0.2548621892929077, "memory(GiB)": 41.58, "step": 2115, "token_acc": 0.9114378356971362, "train_speed(iter/s)": 0.11791 }, { "epoch": 2.6069262074790753, "grad_norm": 0.667349100112915, "learning_rate": 4.161775288741454e-07, "loss": 0.252597713470459, "memory(GiB)": 41.58, "step": 2120, "token_acc": 0.9123547788733769, "train_speed(iter/s)": 0.117978 }, { "epoch": 2.6069262074790753, "eval_loss": 0.3542228639125824, "eval_runtime": 31.0775, "eval_samples_per_second": 16.925, "eval_steps_per_second": 4.247, "eval_token_acc": 0.8909537397319498, "step": 2120 }, { "epoch": 2.6130691852875683, "grad_norm": 0.7039747834205627, "learning_rate": 4.034104362100155e-07, "loss": 0.25393052101135255, "memory(GiB)": 41.58, "step": 2125, "token_acc": 0.8992231097494255, "train_speed(iter/s)": 0.117764 }, { "epoch": 2.6192121630960608, "grad_norm": 0.7111782431602478, "learning_rate": 3.9083399950277156e-07, "loss": 0.2592860221862793, "memory(GiB)": 41.58, "step": 2130, "token_acc": 0.9017042520227233, "train_speed(iter/s)": 0.117842 }, { "epoch": 2.6253551409045537, "grad_norm": 0.7449079155921936, "learning_rate": 3.7844874039406677e-07, "loss": 0.23967378139495848, "memory(GiB)": 41.58, "step": 2135, "token_acc": 0.9237554343728797, "train_speed(iter/s)": 0.11791 }, { "epoch": 2.631498118713046, "grad_norm": 0.6821849346160889, "learning_rate": 3.6625517259594566e-07, "loss": 0.273772144317627, "memory(GiB)": 41.58, "step": 2140, "token_acc": 0.9114792099290095, "train_speed(iter/s)": 0.117984 }, { "epoch": 2.631498118713046, "eval_loss": 0.3543878495693207, "eval_runtime": 31.0747, "eval_samples_per_second": 16.927, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8910263726761781, "step": 2140 }, { "epoch": 2.6376410965215387, "grad_norm": 0.7271039485931396, "learning_rate": 3.5425380186953905e-07, "loss": 0.2533170223236084, "memory(GiB)": 41.58, "step": 2145, "token_acc": 0.8992799581191373, "train_speed(iter/s)": 0.117788 }, { "epoch": 2.6437840743300316, "grad_norm": 0.6954792737960815, "learning_rate": 3.424451260040862e-07, "loss": 0.2587547302246094, "memory(GiB)": 41.58, "step": 2150, "token_acc": 0.9252017450665703, "train_speed(iter/s)": 0.117868 }, { "epoch": 2.649927052138524, "grad_norm": 0.6999133229255676, "learning_rate": 3.3082963479628747e-07, "loss": 0.2520002841949463, "memory(GiB)": 41.58, "step": 2155, "token_acc": 0.9169615355242726, "train_speed(iter/s)": 0.117941 }, { "epoch": 2.656070029947017, "grad_norm": 0.6630998253822327, "learning_rate": 3.194078100299863e-07, "loss": 0.2589444160461426, "memory(GiB)": 41.58, "step": 2160, "token_acc": 0.9155829021582063, "train_speed(iter/s)": 0.118006 }, { "epoch": 2.656070029947017, "eval_loss": 0.3538263440132141, "eval_runtime": 31.0691, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8909779507133593, "step": 2160 }, { "epoch": 2.6622130077555095, "grad_norm": 0.6728103756904602, "learning_rate": 3.0818012545618836e-07, "loss": 0.243510103225708, "memory(GiB)": 41.58, "step": 2165, "token_acc": 0.89773630732402, "train_speed(iter/s)": 0.117802 }, { "epoch": 2.668355985564002, "grad_norm": 0.6952410936355591, "learning_rate": 2.9714704677341055e-07, "loss": 0.2590247631072998, "memory(GiB)": 41.58, "step": 2170, "token_acc": 0.9167405790179891, "train_speed(iter/s)": 0.117866 }, { "epoch": 2.674498963372495, "grad_norm": 0.6924260258674622, "learning_rate": 2.8630903160836776e-07, "loss": 0.25694501399993896, "memory(GiB)": 41.58, "step": 2175, "token_acc": 0.9082922132627271, "train_speed(iter/s)": 0.11794 }, { "epoch": 2.6806419411809874, "grad_norm": 0.6898376941680908, "learning_rate": 2.756665294969868e-07, "loss": 0.2537565231323242, "memory(GiB)": 41.58, "step": 2180, "token_acc": 0.917653237630479, "train_speed(iter/s)": 0.118015 }, { "epoch": 2.6806419411809874, "eval_loss": 0.35428422689437866, "eval_runtime": 31.0905, "eval_samples_per_second": 16.918, "eval_steps_per_second": 4.246, "eval_token_acc": 0.8910713359273671, "step": 2180 }, { "epoch": 2.6867849189894804, "grad_norm": 0.6692034602165222, "learning_rate": 2.6521998186576357e-07, "loss": 0.24578571319580078, "memory(GiB)": 41.58, "step": 2185, "token_acc": 0.9007592006264257, "train_speed(iter/s)": 0.117803 }, { "epoch": 2.692927896797973, "grad_norm": 0.6597223877906799, "learning_rate": 2.549698220134517e-07, "loss": 0.2445077896118164, "memory(GiB)": 41.58, "step": 2190, "token_acc": 0.921655840125781, "train_speed(iter/s)": 0.117862 }, { "epoch": 2.6990708746064653, "grad_norm": 0.7004697322845459, "learning_rate": 2.449164750930938e-07, "loss": 0.24747202396392823, "memory(GiB)": 41.58, "step": 2195, "token_acc": 0.9170990796945369, "train_speed(iter/s)": 0.117919 }, { "epoch": 2.7052138524149583, "grad_norm": 0.6603142619132996, "learning_rate": 2.3506035809438553e-07, "loss": 0.25233500003814696, "memory(GiB)": 41.58, "step": 2200, "token_acc": 0.9180474800634293, "train_speed(iter/s)": 0.117989 }, { "epoch": 2.7052138524149583, "eval_loss": 0.35384565591812134, "eval_runtime": 31.0643, "eval_samples_per_second": 16.933, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8911958495460441, "step": 2200 }, { "epoch": 2.7113568302234508, "grad_norm": 0.6453321576118469, "learning_rate": 2.2540187982637628e-07, "loss": 0.2474754571914673, "memory(GiB)": 41.58, "step": 2205, "token_acc": 0.8990364613669268, "train_speed(iter/s)": 0.117783 }, { "epoch": 2.7174998080319437, "grad_norm": 0.6942773461341858, "learning_rate": 2.1594144090051728e-07, "loss": 0.25811138153076174, "memory(GiB)": 41.58, "step": 2210, "token_acc": 0.9148966602302796, "train_speed(iter/s)": 0.117842 }, { "epoch": 2.723642785840436, "grad_norm": 0.687302827835083, "learning_rate": 2.066794337140443e-07, "loss": 0.25774784088134767, "memory(GiB)": 41.58, "step": 2215, "token_acc": 0.9122162054746883, "train_speed(iter/s)": 0.117899 }, { "epoch": 2.7297857636489287, "grad_norm": 0.735578715801239, "learning_rate": 1.9761624243370026e-07, "loss": 0.26178154945373533, "memory(GiB)": 41.58, "step": 2220, "token_acc": 0.9086515587830224, "train_speed(iter/s)": 0.117952 }, { "epoch": 2.7297857636489287, "eval_loss": 0.35361814498901367, "eval_runtime": 31.0398, "eval_samples_per_second": 16.946, "eval_steps_per_second": 4.253, "eval_token_acc": 0.8910920881971466, "step": 2220 }, { "epoch": 2.7359287414574216, "grad_norm": 0.6561589241027832, "learning_rate": 1.8875224297980332e-07, "loss": 0.25756092071533204, "memory(GiB)": 41.58, "step": 2225, "token_acc": 0.8958707817534339, "train_speed(iter/s)": 0.117769 }, { "epoch": 2.742071719265914, "grad_norm": 0.6671420335769653, "learning_rate": 1.800878030106501e-07, "loss": 0.24125266075134277, "memory(GiB)": 41.58, "step": 2230, "token_acc": 0.9233396163654507, "train_speed(iter/s)": 0.117827 }, { "epoch": 2.748214697074407, "grad_norm": 0.7091180086135864, "learning_rate": 1.7162328190727217e-07, "loss": 0.25800223350524903, "memory(GiB)": 41.58, "step": 2235, "token_acc": 0.9130938866210961, "train_speed(iter/s)": 0.117897 }, { "epoch": 2.7543576748828995, "grad_norm": 0.7402175068855286, "learning_rate": 1.6335903075852478e-07, "loss": 0.2690894365310669, "memory(GiB)": 41.58, "step": 2240, "token_acc": 0.9129178605539637, "train_speed(iter/s)": 0.117956 }, { "epoch": 2.7543576748828995, "eval_loss": 0.35380449891090393, "eval_runtime": 31.058, "eval_samples_per_second": 16.936, "eval_steps_per_second": 4.25, "eval_token_acc": 0.891244271508863, "step": 2240 }, { "epoch": 2.760500652691392, "grad_norm": 0.700340211391449, "learning_rate": 1.552953923465267e-07, "loss": 0.26177315711975097, "memory(GiB)": 41.58, "step": 2245, "token_acc": 0.8943636286526147, "train_speed(iter/s)": 0.11777 }, { "epoch": 2.766643630499885, "grad_norm": 0.6342586278915405, "learning_rate": 1.4743270113244278e-07, "loss": 0.23961200714111328, "memory(GiB)": 41.58, "step": 2250, "token_acc": 0.9237576735224269, "train_speed(iter/s)": 0.117824 }, { "epoch": 2.7727866083083774, "grad_norm": 0.627129077911377, "learning_rate": 1.3977128324261068e-07, "loss": 0.24526638984680177, "memory(GiB)": 41.58, "step": 2255, "token_acc": 0.9125838004176283, "train_speed(iter/s)": 0.117896 }, { "epoch": 2.7789295861168704, "grad_norm": 0.6337400674819946, "learning_rate": 1.3231145645501153e-07, "loss": 0.2480980396270752, "memory(GiB)": 41.58, "step": 2260, "token_acc": 0.9186616671473897, "train_speed(iter/s)": 0.117951 }, { "epoch": 2.7789295861168704, "eval_loss": 0.354061484336853, "eval_runtime": 31.0534, "eval_samples_per_second": 16.939, "eval_steps_per_second": 4.251, "eval_token_acc": 0.8911024643320363, "step": 2260 }, { "epoch": 2.785072563925363, "grad_norm": 0.712088942527771, "learning_rate": 1.2505353018609445e-07, "loss": 0.2516076326370239, "memory(GiB)": 41.58, "step": 2265, "token_acc": 0.8994283331306145, "train_speed(iter/s)": 0.117768 }, { "epoch": 2.7912155417338553, "grad_norm": 0.7046364545822144, "learning_rate": 1.1799780547793682e-07, "loss": 0.25043492317199706, "memory(GiB)": 41.58, "step": 2270, "token_acc": 0.9169777512318948, "train_speed(iter/s)": 0.117833 }, { "epoch": 2.7973585195423483, "grad_norm": 0.6503071784973145, "learning_rate": 1.111445749857626e-07, "loss": 0.2525207757949829, "memory(GiB)": 41.58, "step": 2275, "token_acc": 0.9089755560343795, "train_speed(iter/s)": 0.117899 }, { "epoch": 2.8035014973508408, "grad_norm": 0.7683473229408264, "learning_rate": 1.0449412296580252e-07, "loss": 0.2637613534927368, "memory(GiB)": 41.58, "step": 2280, "token_acc": 0.9091683159202835, "train_speed(iter/s)": 0.117958 }, { "epoch": 2.8035014973508408, "eval_loss": 0.35387495160102844, "eval_runtime": 31.012, "eval_samples_per_second": 16.961, "eval_steps_per_second": 4.256, "eval_token_acc": 0.8911405101599654, "step": 2280 }, { "epoch": 2.8096444751593337, "grad_norm": 0.7151490449905396, "learning_rate": 9.804672526349979e-08, "loss": 0.2488321304321289, "memory(GiB)": 41.58, "step": 2285, "token_acc": 0.8973117200307805, "train_speed(iter/s)": 0.117778 }, { "epoch": 2.815787452967826, "grad_norm": 0.730139434337616, "learning_rate": 9.180264930207405e-08, "loss": 0.2607487678527832, "memory(GiB)": 41.58, "step": 2290, "token_acc": 0.9156902926894462, "train_speed(iter/s)": 0.11785 }, { "epoch": 2.8219304307763187, "grad_norm": 0.6628730297088623, "learning_rate": 8.576215407142652e-08, "loss": 0.26926565170288086, "memory(GiB)": 41.58, "step": 2295, "token_acc": 0.9116337769619092, "train_speed(iter/s)": 0.11791 }, { "epoch": 2.8280734085848116, "grad_norm": 0.6601608991622925, "learning_rate": 7.992549011739903e-08, "loss": 0.2524131774902344, "memory(GiB)": 41.58, "step": 2300, "token_acc": 0.9154497235075048, "train_speed(iter/s)": 0.117965 }, { "epoch": 2.8280734085848116, "eval_loss": 0.35372012853622437, "eval_runtime": 31.0582, "eval_samples_per_second": 16.936, "eval_steps_per_second": 4.25, "eval_token_acc": 0.891157803718115, "step": 2300 }, { "epoch": 2.834216386393304, "grad_norm": 0.7079156041145325, "learning_rate": 7.42928995313802e-08, "loss": 0.25153977870941163, "memory(GiB)": 41.58, "step": 2305, "token_acc": 0.8977190549519733, "train_speed(iter/s)": 0.117776 }, { "epoch": 2.840359364201797, "grad_norm": 0.707416296005249, "learning_rate": 6.886461594026394e-08, "loss": 0.24887454509735107, "memory(GiB)": 41.58, "step": 2310, "token_acc": 0.9237657201262054, "train_speed(iter/s)": 0.117827 }, { "epoch": 2.8465023420102895, "grad_norm": 0.6941429972648621, "learning_rate": 6.364086449676233e-08, "loss": 0.2661618947982788, "memory(GiB)": 41.58, "step": 2315, "token_acc": 0.9116836428999401, "train_speed(iter/s)": 0.117875 }, { "epoch": 2.852645319818782, "grad_norm": 0.705603301525116, "learning_rate": 5.862186187006347e-08, "loss": 0.251740837097168, "memory(GiB)": 41.58, "step": 2320, "token_acc": 0.9094119805522429, "train_speed(iter/s)": 0.117943 }, { "epoch": 2.852645319818782, "eval_loss": 0.35377010703086853, "eval_runtime": 31.0667, "eval_samples_per_second": 16.931, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8910367488110679, "step": 2320 }, { "epoch": 2.858788297627275, "grad_norm": 0.6828641891479492, "learning_rate": 5.3807816236846614e-08, "loss": 0.26838877201080324, "memory(GiB)": 41.58, "step": 2325, "token_acc": 0.8946421677020814, "train_speed(iter/s)": 0.117759 }, { "epoch": 2.8649312754357674, "grad_norm": 0.6699286699295044, "learning_rate": 4.919892727264508e-08, "loss": 0.2658334493637085, "memory(GiB)": 41.58, "step": 2330, "token_acc": 0.9121956642579211, "train_speed(iter/s)": 0.117813 }, { "epoch": 2.8710742532442604, "grad_norm": 0.6932682394981384, "learning_rate": 4.4795386143567375e-08, "loss": 0.24600727558135987, "memory(GiB)": 41.58, "step": 2335, "token_acc": 0.918826454010682, "train_speed(iter/s)": 0.117881 }, { "epoch": 2.877217231052753, "grad_norm": 0.6962621212005615, "learning_rate": 4.0597375498365175e-08, "loss": 0.2586866617202759, "memory(GiB)": 41.58, "step": 2340, "token_acc": 0.9217780343483908, "train_speed(iter/s)": 0.117944 }, { "epoch": 2.877217231052753, "eval_loss": 0.35374194383621216, "eval_runtime": 31.066, "eval_samples_per_second": 16.932, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8911439688715953, "step": 2340 }, { "epoch": 2.8833602088612453, "grad_norm": 0.6817741990089417, "learning_rate": 3.6605069460858286e-08, "loss": 0.2390669822692871, "memory(GiB)": 41.58, "step": 2345, "token_acc": 0.9005924037018727, "train_speed(iter/s)": 0.117765 }, { "epoch": 2.8895031866697383, "grad_norm": 0.6809601783752441, "learning_rate": 3.281863362271487e-08, "loss": 0.24726104736328125, "memory(GiB)": 41.58, "step": 2350, "token_acc": 0.9243779025438414, "train_speed(iter/s)": 0.117823 }, { "epoch": 2.8956461644782308, "grad_norm": 0.6868336200714111, "learning_rate": 2.9238225036579693e-08, "loss": 0.2603924036026001, "memory(GiB)": 41.58, "step": 2355, "token_acc": 0.9140520341253614, "train_speed(iter/s)": 0.117884 }, { "epoch": 2.9017891422867237, "grad_norm": 0.6945005655288696, "learning_rate": 2.5863992209560484e-08, "loss": 0.2470933675765991, "memory(GiB)": 41.58, "step": 2360, "token_acc": 0.9241285200347351, "train_speed(iter/s)": 0.117951 }, { "epoch": 2.9017891422867237, "eval_loss": 0.353762149810791, "eval_runtime": 31.0637, "eval_samples_per_second": 16.933, "eval_steps_per_second": 4.249, "eval_token_acc": 0.891199308257674, "step": 2360 }, { "epoch": 2.907932120095216, "grad_norm": 0.6887286305427551, "learning_rate": 2.269607509707006e-08, "loss": 0.2686716318130493, "memory(GiB)": 41.58, "step": 2365, "token_acc": 0.8963972388465724, "train_speed(iter/s)": 0.117755 }, { "epoch": 2.9140750979037087, "grad_norm": 0.6807404160499573, "learning_rate": 1.97346050970193e-08, "loss": 0.25454580783843994, "memory(GiB)": 41.58, "step": 2370, "token_acc": 0.9134818448123169, "train_speed(iter/s)": 0.11783 }, { "epoch": 2.9202180757122016, "grad_norm": 0.6732537150382996, "learning_rate": 1.69797050443693e-08, "loss": 0.251677131652832, "memory(GiB)": 41.58, "step": 2375, "token_acc": 0.9128050937389459, "train_speed(iter/s)": 0.117893 }, { "epoch": 2.926361053520694, "grad_norm": 0.701576292514801, "learning_rate": 1.4431489206034321e-08, "loss": 0.26529679298400877, "memory(GiB)": 41.58, "step": 2380, "token_acc": 0.915328677370581, "train_speed(iter/s)": 0.117951 }, { "epoch": 2.926361053520694, "eval_loss": 0.3537040054798126, "eval_runtime": 31.0696, "eval_samples_per_second": 16.93, "eval_steps_per_second": 4.249, "eval_token_acc": 0.8912581063553826, "step": 2380 }, { "epoch": 2.932504031329187, "grad_norm": 0.6693256497383118, "learning_rate": 1.2090063276142261e-08, "loss": 0.2500641107559204, "memory(GiB)": 41.58, "step": 2385, "token_acc": 0.8987698849300564, "train_speed(iter/s)": 0.117778 }, { "epoch": 2.9386470091376795, "grad_norm": 0.726274847984314, "learning_rate": 9.955524371653146e-09, "loss": 0.2546469926834106, "memory(GiB)": 41.58, "step": 2390, "token_acc": 0.9140067149004587, "train_speed(iter/s)": 0.117838 }, { "epoch": 2.944789986946172, "grad_norm": 0.6883347630500793, "learning_rate": 8.02796102832848e-09, "loss": 0.2519416570663452, "memory(GiB)": 41.58, "step": 2395, "token_acc": 0.9123212139777092, "train_speed(iter/s)": 0.117903 }, { "epoch": 2.950932964754665, "grad_norm": 0.7407357692718506, "learning_rate": 6.307453197059166e-09, "loss": 0.25615706443786623, "memory(GiB)": 41.58, "step": 2400, "token_acc": 0.9138917665630704, "train_speed(iter/s)": 0.117957 }, { "epoch": 2.950932964754665, "eval_loss": 0.35370346903800964, "eval_runtime": 31.078, "eval_samples_per_second": 16.925, "eval_steps_per_second": 4.247, "eval_token_acc": 0.8911854734111544, "step": 2400 }, { "epoch": 2.9570759425631574, "grad_norm": 0.6578332781791687, "learning_rate": 4.794072240550951e-09, "loss": 0.2539684772491455, "memory(GiB)": 41.58, "step": 2405, "token_acc": 0.8979478357573546, "train_speed(iter/s)": 0.117778 }, { "epoch": 2.9632189203716504, "grad_norm": 0.6638470888137817, "learning_rate": 3.487880930363452e-09, "loss": 0.24514734745025635, "memory(GiB)": 41.58, "step": 2410, "token_acc": 0.9181864403032916, "train_speed(iter/s)": 0.117844 }, { "epoch": 2.969361898180143, "grad_norm": 0.7209091782569885, "learning_rate": 2.3889334443055743e-09, "loss": 0.24684855937957764, "memory(GiB)": 41.58, "step": 2415, "token_acc": 0.9140838085792214, "train_speed(iter/s)": 0.117913 }, { "epoch": 2.9755048759886353, "grad_norm": 0.651500940322876, "learning_rate": 1.4972753641906424e-09, "loss": 0.24752352237701417, "memory(GiB)": 41.58, "step": 2420, "token_acc": 0.9205346018801677, "train_speed(iter/s)": 0.117962 }, { "epoch": 2.9755048759886353, "eval_loss": 0.3538280427455902, "eval_runtime": 31.0603, "eval_samples_per_second": 16.935, "eval_steps_per_second": 4.25, "eval_token_acc": 0.8910817120622568, "step": 2420 }, { "epoch": 2.9816478537971283, "grad_norm": 0.7020614147186279, "learning_rate": 8.12943673943467e-10, "loss": 0.2728489875793457, "memory(GiB)": 41.58, "step": 2425, "token_acc": 0.8962999446979123, "train_speed(iter/s)": 0.117785 }, { "epoch": 2.9877908316056208, "grad_norm": 0.6406486630439758, "learning_rate": 3.359667580682402e-10, "loss": 0.24820823669433595, "memory(GiB)": 41.58, "step": 2430, "token_acc": 0.9153875671527245, "train_speed(iter/s)": 0.11784 }, { "epoch": 2.9939338094141137, "grad_norm": 0.6937646269798279, "learning_rate": 6.636440046892123e-11, "loss": 0.253904914855957, "memory(GiB)": 41.58, "step": 2435, "token_acc": 0.9180234572177958, "train_speed(iter/s)": 0.117894 }, { "epoch": 2.9988481916609078, "eval_loss": 0.3536596894264221, "eval_runtime": 31.0744, "eval_samples_per_second": 16.927, "eval_steps_per_second": 4.248, "eval_token_acc": 0.8911750972762645, "step": 2439 } ], "logging_steps": 5, "max_steps": 2439, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1644436512416727e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }