{ "best_global_step": 1620, "best_metric": 0.25625008, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v13-20250430-203547/checkpoint-1620", "epoch": 2.9988481916609078, "eval_steps": 20, "global_step": 2439, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012285955616985335, "grad_norm": 2.85188364982605, "learning_rate": 9.99999585221637e-06, "loss": 0.3927260637283325, "memory(GiB)": 27.77, "step": 1, "token_acc": 0.9111180904522613, "train_speed(iter/s)": 0.065127 }, { "epoch": 0.006142977808492667, "grad_norm": 2.0374207496643066, "learning_rate": 9.999896305753298e-06, "loss": 0.4172998070716858, "memory(GiB)": 27.77, "step": 5, "token_acc": 0.8710935003515051, "train_speed(iter/s)": 0.125986 }, { "epoch": 0.012285955616985334, "grad_norm": 1.0768085718154907, "learning_rate": 9.99958522731419e-06, "loss": 0.31686446666717527, "memory(GiB)": 27.77, "step": 10, "token_acc": 0.9013287401574803, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.018428933425478, "grad_norm": 1.4088069200515747, "learning_rate": 9.999066777585496e-06, "loss": 0.31454758644104003, "memory(GiB)": 27.77, "step": 15, "token_acc": 0.8979140839756373, "train_speed(iter/s)": 0.146387 }, { "epoch": 0.024571911233970668, "grad_norm": 1.0749812126159668, "learning_rate": 9.998340978071314e-06, "loss": 0.31294023990631104, "memory(GiB)": 27.77, "step": 20, "token_acc": 0.9126713473754597, "train_speed(iter/s)": 0.15088 }, { "epoch": 0.024571911233970668, "eval_loss": 0.34731194376945496, "eval_runtime": 30.1495, "eval_samples_per_second": 17.446, "eval_steps_per_second": 4.378, "eval_token_acc": 0.9024643320363165, "step": 20 }, { "epoch": 0.030714889042463334, "grad_norm": 1.2545477151870728, "learning_rate": 9.997407858876141e-06, "loss": 0.30952184200286864, "memory(GiB)": 27.77, "step": 25, "token_acc": 0.8951860272094114, "train_speed(iter/s)": 0.122954 }, { "epoch": 0.036857866850956, "grad_norm": 1.0299962759017944, "learning_rate": 9.99626745870361e-06, "loss": 0.28446354866027834, "memory(GiB)": 27.77, "step": 30, "token_acc": 0.9173851303377625, "train_speed(iter/s)": 0.127723 }, { "epoch": 0.043000844659448666, "grad_norm": 1.05655038356781, "learning_rate": 9.994919824854899e-06, "loss": 0.3267578125, "memory(GiB)": 27.77, "step": 35, "token_acc": 0.8922404371584699, "train_speed(iter/s)": 0.132153 }, { "epoch": 0.049143822467941335, "grad_norm": 1.0286122560501099, "learning_rate": 9.993365013226757e-06, "loss": 0.29858396053314207, "memory(GiB)": 27.77, "step": 40, "token_acc": 0.8932590177726365, "train_speed(iter/s)": 0.135375 }, { "epoch": 0.049143822467941335, "eval_loss": 0.32549846172332764, "eval_runtime": 29.8986, "eval_samples_per_second": 17.593, "eval_steps_per_second": 4.415, "eval_token_acc": 0.9053393860786857, "step": 40 }, { "epoch": 0.055286800276434005, "grad_norm": 0.9242257475852966, "learning_rate": 9.991603088309195e-06, "loss": 0.2890357971191406, "memory(GiB)": 27.77, "step": 45, "token_acc": 0.8943298969072165, "train_speed(iter/s)": 0.121978 }, { "epoch": 0.06142977808492667, "grad_norm": 0.8638477325439453, "learning_rate": 9.989634123182798e-06, "loss": 0.2940737247467041, "memory(GiB)": 27.77, "step": 50, "token_acc": 0.9095572243424023, "train_speed(iter/s)": 0.125285 }, { "epoch": 0.06757275589341934, "grad_norm": 1.1406885385513306, "learning_rate": 9.987458199515714e-06, "loss": 0.2999709606170654, "memory(GiB)": 27.77, "step": 55, "token_acc": 0.9033247521498603, "train_speed(iter/s)": 0.128485 }, { "epoch": 0.073715733701912, "grad_norm": 1.0126744508743286, "learning_rate": 9.985075407560247e-06, "loss": 0.2821986675262451, "memory(GiB)": 29.52, "step": 60, "token_acc": 0.9166214683694098, "train_speed(iter/s)": 0.130304 }, { "epoch": 0.073715733701912, "eval_loss": 0.31559479236602783, "eval_runtime": 29.8185, "eval_samples_per_second": 17.64, "eval_steps_per_second": 4.427, "eval_token_acc": 0.9073641735120335, "step": 60 }, { "epoch": 0.07985871151040466, "grad_norm": 0.9640651345252991, "learning_rate": 9.982485846149125e-06, "loss": 0.2909295320510864, "memory(GiB)": 29.52, "step": 65, "token_acc": 0.9025612979673685, "train_speed(iter/s)": 0.121841 }, { "epoch": 0.08600168931889733, "grad_norm": 1.0075881481170654, "learning_rate": 9.979689622691393e-06, "loss": 0.2951636791229248, "memory(GiB)": 29.52, "step": 70, "token_acc": 0.9077655003069368, "train_speed(iter/s)": 0.123796 }, { "epoch": 0.09214466712739, "grad_norm": 1.0915230512619019, "learning_rate": 9.976686853167967e-06, "loss": 0.28308849334716796, "memory(GiB)": 29.52, "step": 75, "token_acc": 0.8968571616035693, "train_speed(iter/s)": 0.125709 }, { "epoch": 0.09828764493588267, "grad_norm": 1.0471818447113037, "learning_rate": 9.973477662126818e-06, "loss": 0.2649773836135864, "memory(GiB)": 29.52, "step": 80, "token_acc": 0.9152229480261289, "train_speed(iter/s)": 0.127584 }, { "epoch": 0.09828764493588267, "eval_loss": 0.311291366815567, "eval_runtime": 29.9425, "eval_samples_per_second": 17.567, "eval_steps_per_second": 4.408, "eval_token_acc": 0.9086323677763366, "step": 80 }, { "epoch": 0.10443062274437534, "grad_norm": 0.9933186769485474, "learning_rate": 9.970062182677802e-06, "loss": 0.27203946113586425, "memory(GiB)": 29.52, "step": 85, "token_acc": 0.8999555278840167, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.11057360055286801, "grad_norm": 1.0888270139694214, "learning_rate": 9.966440556487149e-06, "loss": 0.27809457778930663, "memory(GiB)": 29.52, "step": 90, "token_acc": 0.9105367793240556, "train_speed(iter/s)": 0.123076 }, { "epoch": 0.11671657836136066, "grad_norm": 1.0842769145965576, "learning_rate": 9.962612933771575e-06, "loss": 0.30378289222717286, "memory(GiB)": 29.52, "step": 95, "token_acc": 0.9065857885615252, "train_speed(iter/s)": 0.124867 }, { "epoch": 0.12285955616985333, "grad_norm": 1.1600843667984009, "learning_rate": 9.958579473292067e-06, "loss": 0.2904845714569092, "memory(GiB)": 29.52, "step": 100, "token_acc": 0.910148975791434, "train_speed(iter/s)": 0.126774 }, { "epoch": 0.12285955616985333, "eval_loss": 0.3061583638191223, "eval_runtime": 29.8822, "eval_samples_per_second": 17.602, "eval_steps_per_second": 4.417, "eval_token_acc": 0.9093385214007782, "step": 100 }, { "epoch": 0.129002533978346, "grad_norm": 1.0021681785583496, "learning_rate": 9.95434034234728e-06, "loss": 0.29146251678466795, "memory(GiB)": 29.52, "step": 105, "token_acc": 0.9012319578712691, "train_speed(iter/s)": 0.122309 }, { "epoch": 0.13514551178683867, "grad_norm": 0.9947881698608398, "learning_rate": 9.949895716766611e-06, "loss": 0.28587632179260253, "memory(GiB)": 29.52, "step": 110, "token_acc": 0.9113877118644068, "train_speed(iter/s)": 0.12384 }, { "epoch": 0.14128848959533133, "grad_norm": 0.9120563268661499, "learning_rate": 9.945245780902899e-06, "loss": 0.25429134368896483, "memory(GiB)": 29.52, "step": 115, "token_acc": 0.9119403599818774, "train_speed(iter/s)": 0.125132 }, { "epoch": 0.147431467403824, "grad_norm": 0.9895658493041992, "learning_rate": 9.940390727624785e-06, "loss": 0.29624483585357664, "memory(GiB)": 29.52, "step": 120, "token_acc": 0.907488553000837, "train_speed(iter/s)": 0.126349 }, { "epoch": 0.147431467403824, "eval_loss": 0.30357423424720764, "eval_runtime": 29.8849, "eval_samples_per_second": 17.601, "eval_steps_per_second": 4.417, "eval_token_acc": 0.9093457270500072, "step": 120 }, { "epoch": 0.15357444521231667, "grad_norm": 1.0295214653015137, "learning_rate": 9.935330758308706e-06, "loss": 0.2756758689880371, "memory(GiB)": 29.52, "step": 125, "token_acc": 0.9061426587736607, "train_speed(iter/s)": 0.122406 }, { "epoch": 0.15971742302080932, "grad_norm": 0.9042087197303772, "learning_rate": 9.93006608283054e-06, "loss": 0.26598501205444336, "memory(GiB)": 29.52, "step": 130, "token_acc": 0.9089727568107973, "train_speed(iter/s)": 0.123416 }, { "epoch": 0.165860400829302, "grad_norm": 1.020095944404602, "learning_rate": 9.924596919556917e-06, "loss": 0.30240449905395506, "memory(GiB)": 29.52, "step": 135, "token_acc": 0.8946991831137082, "train_speed(iter/s)": 0.124801 }, { "epoch": 0.17200337863779466, "grad_norm": 0.8778018355369568, "learning_rate": 9.918923495336138e-06, "loss": 0.30482988357543944, "memory(GiB)": 29.52, "step": 140, "token_acc": 0.896467782800934, "train_speed(iter/s)": 0.125985 }, { "epoch": 0.17200337863779466, "eval_loss": 0.3020693063735962, "eval_runtime": 29.8385, "eval_samples_per_second": 17.628, "eval_steps_per_second": 4.424, "eval_token_acc": 0.9099077676898688, "step": 140 }, { "epoch": 0.17814635644628735, "grad_norm": 0.888533890247345, "learning_rate": 9.913046045488787e-06, "loss": 0.28194656372070315, "memory(GiB)": 29.52, "step": 145, "token_acc": 0.9068211113661646, "train_speed(iter/s)": 0.122725 }, { "epoch": 0.18428933425478, "grad_norm": 0.9438475370407104, "learning_rate": 9.906964813797955e-06, "loss": 0.2703879356384277, "memory(GiB)": 29.52, "step": 150, "token_acc": 0.9050589050589051, "train_speed(iter/s)": 0.123734 }, { "epoch": 0.19043231206327269, "grad_norm": 1.0727230310440063, "learning_rate": 9.900680052499138e-06, "loss": 0.267763090133667, "memory(GiB)": 29.52, "step": 155, "token_acc": 0.8978266300274794, "train_speed(iter/s)": 0.124756 }, { "epoch": 0.19657528987176534, "grad_norm": 0.8617845773696899, "learning_rate": 9.894192022269773e-06, "loss": 0.2951368808746338, "memory(GiB)": 29.52, "step": 160, "token_acc": 0.9030839367122553, "train_speed(iter/s)": 0.1257 }, { "epoch": 0.19657528987176534, "eval_loss": 0.2993355393409729, "eval_runtime": 29.9095, "eval_samples_per_second": 17.586, "eval_steps_per_second": 4.413, "eval_token_acc": 0.9106355382619974, "step": 160 }, { "epoch": 0.202718267680258, "grad_norm": 1.1480624675750732, "learning_rate": 9.887500992218421e-06, "loss": 0.30594232082366946, "memory(GiB)": 29.52, "step": 165, "token_acc": 0.9010627678938407, "train_speed(iter/s)": 0.122909 }, { "epoch": 0.20886124548875068, "grad_norm": 1.0773506164550781, "learning_rate": 9.880607239873614e-06, "loss": 0.2754403591156006, "memory(GiB)": 29.52, "step": 170, "token_acc": 0.9078512396694215, "train_speed(iter/s)": 0.123726 }, { "epoch": 0.21500422329724334, "grad_norm": 1.0344544649124146, "learning_rate": 9.873511051172331e-06, "loss": 0.27539350986480715, "memory(GiB)": 29.52, "step": 175, "token_acc": 0.9078795220527504, "train_speed(iter/s)": 0.124606 }, { "epoch": 0.22114720110573602, "grad_norm": 0.801381528377533, "learning_rate": 9.866212720448149e-06, "loss": 0.2653654098510742, "memory(GiB)": 29.52, "step": 180, "token_acc": 0.9089000349935845, "train_speed(iter/s)": 0.125295 }, { "epoch": 0.22114720110573602, "eval_loss": 0.29467687010765076, "eval_runtime": 29.9602, "eval_samples_per_second": 17.557, "eval_steps_per_second": 4.406, "eval_token_acc": 0.911284046692607, "step": 180 }, { "epoch": 0.22729017891422867, "grad_norm": 0.7735671401023865, "learning_rate": 9.85871255041903e-06, "loss": 0.2625685691833496, "memory(GiB)": 29.52, "step": 185, "token_acc": 0.9061491117110654, "train_speed(iter/s)": 0.122679 }, { "epoch": 0.23343315672272133, "grad_norm": 0.8188781142234802, "learning_rate": 9.85101085217477e-06, "loss": 0.2741875171661377, "memory(GiB)": 29.52, "step": 190, "token_acc": 0.9082752921732972, "train_speed(iter/s)": 0.123561 }, { "epoch": 0.239576134531214, "grad_norm": 0.9473600387573242, "learning_rate": 9.843107945164086e-06, "loss": 0.2795043230056763, "memory(GiB)": 29.52, "step": 195, "token_acc": 0.9222816722590006, "train_speed(iter/s)": 0.124249 }, { "epoch": 0.24571911233970667, "grad_norm": 0.9036338329315186, "learning_rate": 9.835004157181372e-06, "loss": 0.2842700004577637, "memory(GiB)": 29.52, "step": 200, "token_acc": 0.9156384193074958, "train_speed(iter/s)": 0.125075 }, { "epoch": 0.24571911233970667, "eval_loss": 0.292435884475708, "eval_runtime": 29.9157, "eval_samples_per_second": 17.583, "eval_steps_per_second": 4.412, "eval_token_acc": 0.9119109381755296, "step": 200 }, { "epoch": 0.2518620901481993, "grad_norm": 0.850283145904541, "learning_rate": 9.826699824353106e-06, "loss": 0.25057048797607423, "memory(GiB)": 29.52, "step": 205, "token_acc": 0.9017097011526469, "train_speed(iter/s)": 0.122679 }, { "epoch": 0.258005067956692, "grad_norm": 0.9866188168525696, "learning_rate": 9.818195291123903e-06, "loss": 0.2645299434661865, "memory(GiB)": 31.97, "step": 210, "token_acc": 0.9247558634504632, "train_speed(iter/s)": 0.123484 }, { "epoch": 0.2641480457651847, "grad_norm": 0.9569115042686462, "learning_rate": 9.80949091024223e-06, "loss": 0.26346535682678224, "memory(GiB)": 31.97, "step": 215, "token_acc": 0.9060756912373298, "train_speed(iter/s)": 0.1242 }, { "epoch": 0.27029102357367735, "grad_norm": 0.8597538471221924, "learning_rate": 9.800587042745774e-06, "loss": 0.24233598709106446, "memory(GiB)": 31.97, "step": 220, "token_acc": 0.9219184958700315, "train_speed(iter/s)": 0.124816 }, { "epoch": 0.27029102357367735, "eval_loss": 0.29206207394599915, "eval_runtime": 29.9221, "eval_samples_per_second": 17.579, "eval_steps_per_second": 4.411, "eval_token_acc": 0.9120910794062546, "step": 220 }, { "epoch": 0.27643400138217, "grad_norm": 0.955256462097168, "learning_rate": 9.791484057946465e-06, "loss": 0.256744384765625, "memory(GiB)": 31.97, "step": 225, "token_acc": 0.9052175977500594, "train_speed(iter/s)": 0.122865 }, { "epoch": 0.28257697919066266, "grad_norm": 0.9161826968193054, "learning_rate": 9.782182333415168e-06, "loss": 0.25551562309265136, "memory(GiB)": 31.97, "step": 230, "token_acc": 0.9160751966238251, "train_speed(iter/s)": 0.123442 }, { "epoch": 0.2887199569991553, "grad_norm": 0.8681318759918213, "learning_rate": 9.772682254966009e-06, "loss": 0.27017927169799805, "memory(GiB)": 31.97, "step": 235, "token_acc": 0.9036617262423714, "train_speed(iter/s)": 0.124191 }, { "epoch": 0.294862934807648, "grad_norm": 1.02655029296875, "learning_rate": 9.762984216640378e-06, "loss": 0.2807133197784424, "memory(GiB)": 31.97, "step": 240, "token_acc": 0.91136, "train_speed(iter/s)": 0.12485 }, { "epoch": 0.294862934807648, "eval_loss": 0.2887011170387268, "eval_runtime": 29.7971, "eval_samples_per_second": 17.653, "eval_steps_per_second": 4.43, "eval_token_acc": 0.9119037325263006, "step": 240 }, { "epoch": 0.3010059126161407, "grad_norm": 0.8672446012496948, "learning_rate": 9.753088620690589e-06, "loss": 0.25624737739562986, "memory(GiB)": 31.97, "step": 245, "token_acc": 0.9064800901577761, "train_speed(iter/s)": 0.122919 }, { "epoch": 0.30714889042463334, "grad_norm": 0.9362043142318726, "learning_rate": 9.742995877563187e-06, "loss": 0.2410278081893921, "memory(GiB)": 31.97, "step": 250, "token_acc": 0.9145431429992814, "train_speed(iter/s)": 0.123516 }, { "epoch": 0.313291868233126, "grad_norm": 0.8355256915092468, "learning_rate": 9.732706405881931e-06, "loss": 0.29171640872955323, "memory(GiB)": 31.97, "step": 255, "token_acc": 0.9167446592065107, "train_speed(iter/s)": 0.123982 }, { "epoch": 0.31943484604161865, "grad_norm": 0.9195040464401245, "learning_rate": 9.722220632430428e-06, "loss": 0.2701089859008789, "memory(GiB)": 31.97, "step": 260, "token_acc": 0.914859208523592, "train_speed(iter/s)": 0.124498 }, { "epoch": 0.31943484604161865, "eval_loss": 0.2895512878894806, "eval_runtime": 29.7827, "eval_samples_per_second": 17.661, "eval_steps_per_second": 4.432, "eval_token_acc": 0.9118460873324686, "step": 260 }, { "epoch": 0.32557782385011136, "grad_norm": 0.7184414267539978, "learning_rate": 9.711538992134427e-06, "loss": 0.27772011756896975, "memory(GiB)": 31.97, "step": 265, "token_acc": 0.9080251975547935, "train_speed(iter/s)": 0.122689 }, { "epoch": 0.331720801658604, "grad_norm": 0.8985347151756287, "learning_rate": 9.700661928043787e-06, "loss": 0.2564595460891724, "memory(GiB)": 31.97, "step": 270, "token_acc": 0.9087627174269773, "train_speed(iter/s)": 0.123177 }, { "epoch": 0.33786377946709667, "grad_norm": 0.8065007925033569, "learning_rate": 9.689589891314094e-06, "loss": 0.25415422916412356, "memory(GiB)": 31.97, "step": 275, "token_acc": 0.9148117934972614, "train_speed(iter/s)": 0.123657 }, { "epoch": 0.3440067572755893, "grad_norm": 1.036281943321228, "learning_rate": 9.678323341187956e-06, "loss": 0.2695312023162842, "memory(GiB)": 31.97, "step": 280, "token_acc": 0.9125853071055801, "train_speed(iter/s)": 0.124107 }, { "epoch": 0.3440067572755893, "eval_loss": 0.28761476278305054, "eval_runtime": 29.7589, "eval_samples_per_second": 17.675, "eval_steps_per_second": 4.436, "eval_token_acc": 0.9123000432338954, "step": 280 }, { "epoch": 0.350149735084082, "grad_norm": 0.9171528816223145, "learning_rate": 9.666862744975938e-06, "loss": 0.26874988079071044, "memory(GiB)": 31.97, "step": 285, "token_acc": 0.9094196412588164, "train_speed(iter/s)": 0.122607 }, { "epoch": 0.3562927128925747, "grad_norm": 0.8578206300735474, "learning_rate": 9.655208578037198e-06, "loss": 0.28213140964508054, "memory(GiB)": 31.97, "step": 290, "token_acc": 0.9133274656042989, "train_speed(iter/s)": 0.123162 }, { "epoch": 0.36243569070106735, "grad_norm": 0.9107432961463928, "learning_rate": 9.643361323759763e-06, "loss": 0.27148008346557617, "memory(GiB)": 31.97, "step": 295, "token_acc": 0.9049124513618677, "train_speed(iter/s)": 0.123652 }, { "epoch": 0.36857866850956, "grad_norm": 0.9925222396850586, "learning_rate": 9.631321473540476e-06, "loss": 0.2592118740081787, "memory(GiB)": 31.97, "step": 300, "token_acc": 0.8945048023933239, "train_speed(iter/s)": 0.124138 }, { "epoch": 0.36857866850956, "eval_loss": 0.28638342022895813, "eval_runtime": 29.7767, "eval_samples_per_second": 17.665, "eval_steps_per_second": 4.433, "eval_token_acc": 0.9123360714800404, "step": 300 }, { "epoch": 0.37472164631805266, "grad_norm": 0.9066736102104187, "learning_rate": 9.619089526764614e-06, "loss": 0.26896276473999026, "memory(GiB)": 31.97, "step": 305, "token_acc": 0.9059246028729954, "train_speed(iter/s)": 0.12275 }, { "epoch": 0.38086462412654537, "grad_norm": 1.0033142566680908, "learning_rate": 9.60666599078518e-06, "loss": 0.2597354412078857, "memory(GiB)": 31.97, "step": 310, "token_acc": 0.9248076074702221, "train_speed(iter/s)": 0.123177 }, { "epoch": 0.387007601935038, "grad_norm": 0.7254430055618286, "learning_rate": 9.59405138090186e-06, "loss": 0.25493106842041013, "memory(GiB)": 31.97, "step": 315, "token_acc": 0.9245887855378633, "train_speed(iter/s)": 0.123726 }, { "epoch": 0.3931505797435307, "grad_norm": 0.9664581418037415, "learning_rate": 9.581246220339636e-06, "loss": 0.25707592964172366, "memory(GiB)": 31.97, "step": 320, "token_acc": 0.9214029811137158, "train_speed(iter/s)": 0.124122 }, { "epoch": 0.3931505797435307, "eval_loss": 0.2849082946777344, "eval_runtime": 29.7736, "eval_samples_per_second": 17.667, "eval_steps_per_second": 4.433, "eval_token_acc": 0.9127467934860931, "step": 320 }, { "epoch": 0.39929355755202334, "grad_norm": 1.0068587064743042, "learning_rate": 9.568251040227101e-06, "loss": 0.26822853088378906, "memory(GiB)": 31.97, "step": 325, "token_acc": 0.9102573583789381, "train_speed(iter/s)": 0.122735 }, { "epoch": 0.405436535360516, "grad_norm": 0.9217173457145691, "learning_rate": 9.555066379574423e-06, "loss": 0.25938191413879397, "memory(GiB)": 31.97, "step": 330, "token_acc": 0.91627231410767, "train_speed(iter/s)": 0.123232 }, { "epoch": 0.4115795131690087, "grad_norm": 0.9511445760726929, "learning_rate": 9.541692785250983e-06, "loss": 0.2502701759338379, "memory(GiB)": 31.97, "step": 335, "token_acc": 0.9267840101791963, "train_speed(iter/s)": 0.123579 }, { "epoch": 0.41772249097750136, "grad_norm": 0.8599613904953003, "learning_rate": 9.528130811962693e-06, "loss": 0.2726857662200928, "memory(GiB)": 31.97, "step": 340, "token_acc": 0.9137533709242461, "train_speed(iter/s)": 0.124005 }, { "epoch": 0.41772249097750136, "eval_loss": 0.2847888171672821, "eval_runtime": 29.6352, "eval_samples_per_second": 17.749, "eval_steps_per_second": 4.454, "eval_token_acc": 0.9126459143968871, "step": 340 }, { "epoch": 0.423865468785994, "grad_norm": 0.8748957514762878, "learning_rate": 9.514381022228997e-06, "loss": 0.2631422996520996, "memory(GiB)": 31.97, "step": 345, "token_acc": 0.9046689686233298, "train_speed(iter/s)": 0.122672 }, { "epoch": 0.43000844659448667, "grad_norm": 0.8299148082733154, "learning_rate": 9.50044398635953e-06, "loss": 0.25957283973693845, "memory(GiB)": 31.97, "step": 350, "token_acc": 0.9137482867810388, "train_speed(iter/s)": 0.123033 }, { "epoch": 0.4361514244029793, "grad_norm": 0.8662333488464355, "learning_rate": 9.486320282430469e-06, "loss": 0.25355665683746337, "memory(GiB)": 31.97, "step": 355, "token_acc": 0.9042410061421469, "train_speed(iter/s)": 0.123514 }, { "epoch": 0.44229440221147204, "grad_norm": 0.8986192345619202, "learning_rate": 9.472010496260545e-06, "loss": 0.27993130683898926, "memory(GiB)": 31.97, "step": 360, "token_acc": 0.9059234866040574, "train_speed(iter/s)": 0.12398 }, { "epoch": 0.44229440221147204, "eval_loss": 0.28364232182502747, "eval_runtime": 29.7243, "eval_samples_per_second": 17.696, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9125234183599943, "step": 360 }, { "epoch": 0.4484373800199647, "grad_norm": 0.8662838339805603, "learning_rate": 9.45751522138676e-06, "loss": 0.2574014663696289, "memory(GiB)": 31.97, "step": 365, "token_acc": 0.9092130002686006, "train_speed(iter/s)": 0.122708 }, { "epoch": 0.45458035782845735, "grad_norm": 0.8905205130577087, "learning_rate": 9.44283505903976e-06, "loss": 0.2571540355682373, "memory(GiB)": 31.97, "step": 370, "token_acc": 0.9236671451908322, "train_speed(iter/s)": 0.12311 }, { "epoch": 0.46072333563695, "grad_norm": 0.8431711196899414, "learning_rate": 9.427970618118888e-06, "loss": 0.29239816665649415, "memory(GiB)": 31.97, "step": 375, "token_acc": 0.910949410949411, "train_speed(iter/s)": 0.123539 }, { "epoch": 0.46686631344544266, "grad_norm": 0.8874600529670715, "learning_rate": 9.412922515166952e-06, "loss": 0.2700673580169678, "memory(GiB)": 31.97, "step": 380, "token_acc": 0.8923294784045315, "train_speed(iter/s)": 0.123896 }, { "epoch": 0.46686631344544266, "eval_loss": 0.2839924693107605, "eval_runtime": 29.8081, "eval_samples_per_second": 17.646, "eval_steps_per_second": 4.428, "eval_token_acc": 0.9126891482922611, "step": 380 }, { "epoch": 0.47300929125393537, "grad_norm": 0.9719629883766174, "learning_rate": 9.39769137434463e-06, "loss": 0.26788945198059083, "memory(GiB)": 31.97, "step": 385, "token_acc": 0.9089390748674752, "train_speed(iter/s)": 0.122706 }, { "epoch": 0.479152269062428, "grad_norm": 0.8490920066833496, "learning_rate": 9.38227782740459e-06, "loss": 0.26457748413085935, "memory(GiB)": 31.97, "step": 390, "token_acc": 0.9092120695170061, "train_speed(iter/s)": 0.12317 }, { "epoch": 0.4852952468709207, "grad_norm": 0.8913845419883728, "learning_rate": 9.366682513665293e-06, "loss": 0.2367623805999756, "memory(GiB)": 31.97, "step": 395, "token_acc": 0.9159677595033221, "train_speed(iter/s)": 0.123494 }, { "epoch": 0.49143822467941334, "grad_norm": 1.0622432231903076, "learning_rate": 9.350906079984456e-06, "loss": 0.29119043350219725, "memory(GiB)": 31.97, "step": 400, "token_acc": 0.9054560355930219, "train_speed(iter/s)": 0.123861 }, { "epoch": 0.49143822467941334, "eval_loss": 0.28296077251434326, "eval_runtime": 29.8253, "eval_samples_per_second": 17.636, "eval_steps_per_second": 4.426, "eval_token_acc": 0.9125594466061392, "step": 400 }, { "epoch": 0.497581202487906, "grad_norm": 0.7695022225379944, "learning_rate": 9.334949180732245e-06, "loss": 0.27100481986999514, "memory(GiB)": 31.97, "step": 405, "token_acc": 0.9032931397580931, "train_speed(iter/s)": 0.122792 }, { "epoch": 0.5037241802963986, "grad_norm": 0.8153588175773621, "learning_rate": 9.31881247776412e-06, "loss": 0.24918160438537598, "memory(GiB)": 31.97, "step": 410, "token_acc": 0.9269037635243568, "train_speed(iter/s)": 0.123224 }, { "epoch": 0.5098671581048914, "grad_norm": 0.8573175668716431, "learning_rate": 9.302496640393383e-06, "loss": 0.2658379554748535, "memory(GiB)": 31.97, "step": 415, "token_acc": 0.9111843654344243, "train_speed(iter/s)": 0.12355 }, { "epoch": 0.516010135913384, "grad_norm": 0.8473800420761108, "learning_rate": 9.286002345363418e-06, "loss": 0.25906102657318114, "memory(GiB)": 31.97, "step": 420, "token_acc": 0.9103914478855526, "train_speed(iter/s)": 0.12386 }, { "epoch": 0.516010135913384, "eval_loss": 0.2819235920906067, "eval_runtime": 29.7627, "eval_samples_per_second": 17.673, "eval_steps_per_second": 4.435, "eval_token_acc": 0.9134313301628477, "step": 420 }, { "epoch": 0.5221531137218767, "grad_norm": 0.8349065780639648, "learning_rate": 9.26933027681963e-06, "loss": 0.27934200763702394, "memory(GiB)": 31.97, "step": 425, "token_acc": 0.9077418760931603, "train_speed(iter/s)": 0.122817 }, { "epoch": 0.5282960915303694, "grad_norm": 0.7763445973396301, "learning_rate": 9.25248112628105e-06, "loss": 0.25652432441711426, "memory(GiB)": 31.97, "step": 430, "token_acc": 0.9304649945266765, "train_speed(iter/s)": 0.123084 }, { "epoch": 0.534439069338862, "grad_norm": 0.7395066022872925, "learning_rate": 9.235455592611667e-06, "loss": 0.2478388547897339, "memory(GiB)": 31.97, "step": 435, "token_acc": 0.9145705869023341, "train_speed(iter/s)": 0.123435 }, { "epoch": 0.5405820471473547, "grad_norm": 0.7695580124855042, "learning_rate": 9.218254381991438e-06, "loss": 0.26280052661895753, "memory(GiB)": 31.97, "step": 440, "token_acc": 0.9116754512058777, "train_speed(iter/s)": 0.123796 }, { "epoch": 0.5405820471473547, "eval_loss": 0.2799247205257416, "eval_runtime": 29.6791, "eval_samples_per_second": 17.723, "eval_steps_per_second": 4.448, "eval_token_acc": 0.9137844069750685, "step": 440 }, { "epoch": 0.5467250249558473, "grad_norm": 0.8697651028633118, "learning_rate": 9.200878207886995e-06, "loss": 0.2713948726654053, "memory(GiB)": 31.97, "step": 445, "token_acc": 0.9107910032853171, "train_speed(iter/s)": 0.122757 }, { "epoch": 0.55286800276434, "grad_norm": 0.8758683204650879, "learning_rate": 9.183327791022048e-06, "loss": 0.28769237995147706, "memory(GiB)": 31.97, "step": 450, "token_acc": 0.9117891241178913, "train_speed(iter/s)": 0.123141 }, { "epoch": 0.5590109805728327, "grad_norm": 0.8500604033470154, "learning_rate": 9.165603859347503e-06, "loss": 0.28233935832977297, "memory(GiB)": 31.97, "step": 455, "token_acc": 0.9127747252747253, "train_speed(iter/s)": 0.123484 }, { "epoch": 0.5651539583813253, "grad_norm": 0.7201665639877319, "learning_rate": 9.147707148011255e-06, "loss": 0.26129984855651855, "memory(GiB)": 31.97, "step": 460, "token_acc": 0.9233639048655371, "train_speed(iter/s)": 0.123825 }, { "epoch": 0.5651539583813253, "eval_loss": 0.27780428528785706, "eval_runtime": 29.6703, "eval_samples_per_second": 17.728, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9140005764519383, "step": 460 }, { "epoch": 0.571296936189818, "grad_norm": 0.9048472046852112, "learning_rate": 9.129638399327707e-06, "loss": 0.2747702360153198, "memory(GiB)": 31.97, "step": 465, "token_acc": 0.9071566001433583, "train_speed(iter/s)": 0.122931 }, { "epoch": 0.5774399139983106, "grad_norm": 0.920282244682312, "learning_rate": 9.111398362746969e-06, "loss": 0.25536236763000486, "memory(GiB)": 31.97, "step": 470, "token_acc": 0.9256230196112015, "train_speed(iter/s)": 0.123189 }, { "epoch": 0.5835828918068033, "grad_norm": 0.7103043794631958, "learning_rate": 9.092987794823785e-06, "loss": 0.25142607688903806, "memory(GiB)": 31.97, "step": 475, "token_acc": 0.9101461736887361, "train_speed(iter/s)": 0.1235 }, { "epoch": 0.589725869615296, "grad_norm": 0.962253987789154, "learning_rate": 9.074407459186144e-06, "loss": 0.27944207191467285, "memory(GiB)": 31.97, "step": 480, "token_acc": 0.9034072816049251, "train_speed(iter/s)": 0.123839 }, { "epoch": 0.589725869615296, "eval_loss": 0.2775828242301941, "eval_runtime": 29.9011, "eval_samples_per_second": 17.591, "eval_steps_per_second": 4.415, "eval_token_acc": 0.9140798385934573, "step": 480 }, { "epoch": 0.5958688474237887, "grad_norm": 0.8453803658485413, "learning_rate": 9.055658126503605e-06, "loss": 0.25680568218231203, "memory(GiB)": 31.97, "step": 485, "token_acc": 0.9067185532791576, "train_speed(iter/s)": 0.122913 }, { "epoch": 0.6020118252322814, "grad_norm": 0.8572331666946411, "learning_rate": 9.036740574455345e-06, "loss": 0.24585814476013185, "memory(GiB)": 31.97, "step": 490, "token_acc": 0.9118667917448405, "train_speed(iter/s)": 0.123173 }, { "epoch": 0.608154803040774, "grad_norm": 0.764564573764801, "learning_rate": 9.017655587697885e-06, "loss": 0.2665162801742554, "memory(GiB)": 31.97, "step": 495, "token_acc": 0.9117743676380872, "train_speed(iter/s)": 0.12348 }, { "epoch": 0.6142977808492667, "grad_norm": 0.8492972254753113, "learning_rate": 8.998403957832553e-06, "loss": 0.24622914791107178, "memory(GiB)": 31.97, "step": 500, "token_acc": 0.9276657659530243, "train_speed(iter/s)": 0.12376 }, { "epoch": 0.6142977808492667, "eval_loss": 0.2776328921318054, "eval_runtime": 29.8051, "eval_samples_per_second": 17.648, "eval_steps_per_second": 4.429, "eval_token_acc": 0.9137772013258395, "step": 500 }, { "epoch": 0.6204407586577594, "grad_norm": 0.8095591068267822, "learning_rate": 8.978986483372657e-06, "loss": 0.26657900810241697, "memory(GiB)": 31.97, "step": 505, "token_acc": 0.9116968207877298, "train_speed(iter/s)": 0.122844 }, { "epoch": 0.626583736466252, "grad_norm": 0.8510200381278992, "learning_rate": 8.959403969710346e-06, "loss": 0.2664052486419678, "memory(GiB)": 31.97, "step": 510, "token_acc": 0.9214573689711811, "train_speed(iter/s)": 0.123175 }, { "epoch": 0.6327267142747447, "grad_norm": 0.9017972946166992, "learning_rate": 8.939657229083223e-06, "loss": 0.27168979644775393, "memory(GiB)": 31.97, "step": 515, "token_acc": 0.9179679028410324, "train_speed(iter/s)": 0.123502 }, { "epoch": 0.6388696920832373, "grad_norm": 0.8968759179115295, "learning_rate": 8.919747080540647e-06, "loss": 0.2673780918121338, "memory(GiB)": 31.97, "step": 520, "token_acc": 0.918267105457046, "train_speed(iter/s)": 0.123831 }, { "epoch": 0.6388696920832373, "eval_loss": 0.27617883682250977, "eval_runtime": 29.7042, "eval_samples_per_second": 17.708, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9144112984579911, "step": 520 }, { "epoch": 0.64501266989173, "grad_norm": 0.8835923075675964, "learning_rate": 8.899674349909759e-06, "loss": 0.25952877998352053, "memory(GiB)": 31.97, "step": 525, "token_acc": 0.9108591693084734, "train_speed(iter/s)": 0.123021 }, { "epoch": 0.6511556477002227, "grad_norm": 1.0378886461257935, "learning_rate": 8.879439869761233e-06, "loss": 0.27931737899780273, "memory(GiB)": 31.97, "step": 530, "token_acc": 0.9016547678344126, "train_speed(iter/s)": 0.123368 }, { "epoch": 0.6572986255087153, "grad_norm": 0.815498411655426, "learning_rate": 8.859044479374737e-06, "loss": 0.26363046169281007, "memory(GiB)": 31.97, "step": 535, "token_acc": 0.9000225428313796, "train_speed(iter/s)": 0.123703 }, { "epoch": 0.663441603317208, "grad_norm": 0.7910681962966919, "learning_rate": 8.838489024704131e-06, "loss": 0.25994918346405027, "memory(GiB)": 31.97, "step": 540, "token_acc": 0.9154737238651347, "train_speed(iter/s)": 0.123918 }, { "epoch": 0.663441603317208, "eval_loss": 0.27339863777160645, "eval_runtime": 29.6984, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9149733390978527, "step": 540 }, { "epoch": 0.6695845811257006, "grad_norm": 1.0020904541015625, "learning_rate": 8.817774358342367e-06, "loss": 0.25385727882385256, "memory(GiB)": 31.97, "step": 545, "token_acc": 0.9148255452267112, "train_speed(iter/s)": 0.123092 }, { "epoch": 0.6757275589341933, "grad_norm": 0.8850826621055603, "learning_rate": 8.796901339486136e-06, "loss": 0.258061146736145, "memory(GiB)": 31.97, "step": 550, "token_acc": 0.9247646909183413, "train_speed(iter/s)": 0.12338 }, { "epoch": 0.681870536742686, "grad_norm": 0.7173855304718018, "learning_rate": 8.775870833900226e-06, "loss": 0.2372835636138916, "memory(GiB)": 31.97, "step": 555, "token_acc": 0.9205999329148498, "train_speed(iter/s)": 0.12365 }, { "epoch": 0.6880135145511787, "grad_norm": 0.8591410517692566, "learning_rate": 8.75468371388161e-06, "loss": 0.2429880380630493, "memory(GiB)": 31.97, "step": 560, "token_acc": 0.9182278006744099, "train_speed(iter/s)": 0.123832 }, { "epoch": 0.6880135145511787, "eval_loss": 0.2735811173915863, "eval_runtime": 29.6978, "eval_samples_per_second": 17.712, "eval_steps_per_second": 4.445, "eval_token_acc": 0.914454532353365, "step": 560 }, { "epoch": 0.6941564923596714, "grad_norm": 0.7596750259399414, "learning_rate": 8.733340858223268e-06, "loss": 0.27418644428253175, "memory(GiB)": 31.97, "step": 565, "token_acc": 0.9034487711172151, "train_speed(iter/s)": 0.123058 }, { "epoch": 0.700299470168164, "grad_norm": 0.9523453116416931, "learning_rate": 8.711843152177735e-06, "loss": 0.23236403465270997, "memory(GiB)": 31.97, "step": 570, "token_acc": 0.9180918923916058, "train_speed(iter/s)": 0.123331 }, { "epoch": 0.7064424479766567, "grad_norm": 0.7896727919578552, "learning_rate": 8.690191487420385e-06, "loss": 0.24450998306274413, "memory(GiB)": 31.97, "step": 575, "token_acc": 0.9229566883477033, "train_speed(iter/s)": 0.12356 }, { "epoch": 0.7125854257851494, "grad_norm": 0.9423844218254089, "learning_rate": 8.668386762012445e-06, "loss": 0.2612689256668091, "memory(GiB)": 31.97, "step": 580, "token_acc": 0.9097009202453987, "train_speed(iter/s)": 0.123753 }, { "epoch": 0.7125854257851494, "eval_loss": 0.2717309594154358, "eval_runtime": 29.6986, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9148292261132728, "step": 580 }, { "epoch": 0.718728403593642, "grad_norm": 0.8224099278450012, "learning_rate": 8.646429880363746e-06, "loss": 0.2574700117111206, "memory(GiB)": 31.97, "step": 585, "token_acc": 0.9109293706792103, "train_speed(iter/s)": 0.122969 }, { "epoch": 0.7248713814021347, "grad_norm": 0.8321871757507324, "learning_rate": 8.624321753195209e-06, "loss": 0.24323840141296388, "memory(GiB)": 31.97, "step": 590, "token_acc": 0.9101112629318758, "train_speed(iter/s)": 0.12323 }, { "epoch": 0.7310143592106274, "grad_norm": 0.7631216645240784, "learning_rate": 8.602063297501069e-06, "loss": 0.2646035194396973, "memory(GiB)": 31.97, "step": 595, "token_acc": 0.9244347364071078, "train_speed(iter/s)": 0.123473 }, { "epoch": 0.73715733701912, "grad_norm": 0.7845233678817749, "learning_rate": 8.579655436510847e-06, "loss": 0.24114649295806884, "memory(GiB)": 31.97, "step": 600, "token_acc": 0.9176872685844488, "train_speed(iter/s)": 0.123706 }, { "epoch": 0.73715733701912, "eval_loss": 0.27247732877731323, "eval_runtime": 29.6948, "eval_samples_per_second": 17.714, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9152687707162416, "step": 600 }, { "epoch": 0.7433003148276127, "grad_norm": 0.8848146796226501, "learning_rate": 8.557099099651046e-06, "loss": 0.26951429843902586, "memory(GiB)": 31.97, "step": 605, "token_acc": 0.9051814218282708, "train_speed(iter/s)": 0.122981 }, { "epoch": 0.7494432926361053, "grad_norm": 0.7597134709358215, "learning_rate": 8.534395222506614e-06, "loss": 0.2650261163711548, "memory(GiB)": 31.97, "step": 610, "token_acc": 0.9191399015223382, "train_speed(iter/s)": 0.123257 }, { "epoch": 0.755586270444598, "grad_norm": 0.8406746983528137, "learning_rate": 8.511544746782124e-06, "loss": 0.266461181640625, "memory(GiB)": 31.97, "step": 615, "token_acc": 0.9117747440273037, "train_speed(iter/s)": 0.123491 }, { "epoch": 0.7617292482530907, "grad_norm": 0.7886612415313721, "learning_rate": 8.488548620262722e-06, "loss": 0.23856868743896484, "memory(GiB)": 31.97, "step": 620, "token_acc": 0.9248648177219606, "train_speed(iter/s)": 0.123722 }, { "epoch": 0.7617292482530907, "eval_loss": 0.27053606510162354, "eval_runtime": 29.6607, "eval_samples_per_second": 17.734, "eval_steps_per_second": 4.45, "eval_token_acc": 0.9157299322668973, "step": 620 }, { "epoch": 0.7678722260615833, "grad_norm": 0.7711523771286011, "learning_rate": 8.465407796774816e-06, "loss": 0.23632125854492186, "memory(GiB)": 31.97, "step": 625, "token_acc": 0.9121895174526754, "train_speed(iter/s)": 0.122988 }, { "epoch": 0.774015203870076, "grad_norm": 0.8446414470672607, "learning_rate": 8.442123236146509e-06, "loss": 0.25997061729431153, "memory(GiB)": 31.97, "step": 630, "token_acc": 0.922757768361582, "train_speed(iter/s)": 0.123243 }, { "epoch": 0.7801581816785687, "grad_norm": 0.8071849942207336, "learning_rate": 8.418695904167789e-06, "loss": 0.2547910690307617, "memory(GiB)": 31.97, "step": 635, "token_acc": 0.9215275839612987, "train_speed(iter/s)": 0.123454 }, { "epoch": 0.7863011594870614, "grad_norm": 1.0132827758789062, "learning_rate": 8.395126772550475e-06, "loss": 0.2584752082824707, "memory(GiB)": 31.97, "step": 640, "token_acc": 0.9119555143651529, "train_speed(iter/s)": 0.123674 }, { "epoch": 0.7863011594870614, "eval_loss": 0.2703080475330353, "eval_runtime": 29.6849, "eval_samples_per_second": 17.719, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9151967142239515, "step": 640 }, { "epoch": 0.7924441372955541, "grad_norm": 0.795028805732727, "learning_rate": 8.371416818887907e-06, "loss": 0.2689487934112549, "memory(GiB)": 31.97, "step": 645, "token_acc": 0.906519600423793, "train_speed(iter/s)": 0.122961 }, { "epoch": 0.7985871151040467, "grad_norm": 0.8361831903457642, "learning_rate": 8.347567026614398e-06, "loss": 0.25730276107788086, "memory(GiB)": 31.97, "step": 650, "token_acc": 0.9204035220712578, "train_speed(iter/s)": 0.123176 }, { "epoch": 0.8047300929125394, "grad_norm": 0.8233036994934082, "learning_rate": 8.323578384964444e-06, "loss": 0.2561511039733887, "memory(GiB)": 31.97, "step": 655, "token_acc": 0.9101053936763794, "train_speed(iter/s)": 0.123399 }, { "epoch": 0.810873070721032, "grad_norm": 0.8036513328552246, "learning_rate": 8.299451888931696e-06, "loss": 0.24744575023651122, "memory(GiB)": 31.97, "step": 660, "token_acc": 0.9082344368103269, "train_speed(iter/s)": 0.123638 }, { "epoch": 0.810873070721032, "eval_loss": 0.2683682441711426, "eval_runtime": 29.6531, "eval_samples_per_second": 17.738, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9163135898544459, "step": 660 }, { "epoch": 0.8170160485295247, "grad_norm": 0.7272213697433472, "learning_rate": 8.275188539227687e-06, "loss": 0.23523108959197997, "memory(GiB)": 31.97, "step": 665, "token_acc": 0.9134302376185917, "train_speed(iter/s)": 0.122932 }, { "epoch": 0.8231590263380174, "grad_norm": 0.854129433631897, "learning_rate": 8.250789342240326e-06, "loss": 0.24518890380859376, "memory(GiB)": 31.97, "step": 670, "token_acc": 0.9203224101479915, "train_speed(iter/s)": 0.123198 }, { "epoch": 0.82930200414651, "grad_norm": 0.755439043045044, "learning_rate": 8.22625530999215e-06, "loss": 0.23708434104919435, "memory(GiB)": 31.97, "step": 675, "token_acc": 0.9149347105009027, "train_speed(iter/s)": 0.123435 }, { "epoch": 0.8354449819550027, "grad_norm": 0.7811703085899353, "learning_rate": 8.201587460098362e-06, "loss": 0.23157744407653807, "memory(GiB)": 31.97, "step": 680, "token_acc": 0.9248452220726784, "train_speed(iter/s)": 0.123616 }, { "epoch": 0.8354449819550027, "eval_loss": 0.26816263794898987, "eval_runtime": 29.696, "eval_samples_per_second": 17.713, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9156650814238363, "step": 680 }, { "epoch": 0.8415879597634953, "grad_norm": 0.8753883242607117, "learning_rate": 8.176786815724601e-06, "loss": 0.26745316982269285, "memory(GiB)": 31.97, "step": 685, "token_acc": 0.9049404582454025, "train_speed(iter/s)": 0.122971 }, { "epoch": 0.847730937571988, "grad_norm": 0.699246346950531, "learning_rate": 8.151854405544526e-06, "loss": 0.2602883815765381, "memory(GiB)": 31.97, "step": 690, "token_acc": 0.9025235288033923, "train_speed(iter/s)": 0.12319 }, { "epoch": 0.8538739153804807, "grad_norm": 0.6880396604537964, "learning_rate": 8.12679126369713e-06, "loss": 0.27959246635437013, "memory(GiB)": 31.97, "step": 695, "token_acc": 0.9131617782696919, "train_speed(iter/s)": 0.123417 }, { "epoch": 0.8600168931889733, "grad_norm": 0.847633421421051, "learning_rate": 8.101598429743862e-06, "loss": 0.2776790142059326, "memory(GiB)": 31.97, "step": 700, "token_acc": 0.9208461614857484, "train_speed(iter/s)": 0.123628 }, { "epoch": 0.8600168931889733, "eval_loss": 0.26767873764038086, "eval_runtime": 29.7471, "eval_samples_per_second": 17.682, "eval_steps_per_second": 4.437, "eval_token_acc": 0.9162271220636979, "step": 700 }, { "epoch": 0.866159870997466, "grad_norm": 0.796385645866394, "learning_rate": 8.076276948625495e-06, "loss": 0.2519699573516846, "memory(GiB)": 31.97, "step": 705, "token_acc": 0.9091483105121683, "train_speed(iter/s)": 0.123011 }, { "epoch": 0.8723028488059587, "grad_norm": 0.7697290182113647, "learning_rate": 8.050827870618795e-06, "loss": 0.2222222328186035, "memory(GiB)": 31.97, "step": 710, "token_acc": 0.9183491244605387, "train_speed(iter/s)": 0.123243 }, { "epoch": 0.8784458266144514, "grad_norm": 0.7783083319664001, "learning_rate": 8.02525225129295e-06, "loss": 0.2620779752731323, "memory(GiB)": 31.97, "step": 715, "token_acc": 0.9140030018344544, "train_speed(iter/s)": 0.123471 }, { "epoch": 0.8845888044229441, "grad_norm": 0.8889957666397095, "learning_rate": 7.999551151465793e-06, "loss": 0.2590866327285767, "memory(GiB)": 31.97, "step": 720, "token_acc": 0.9240682856455879, "train_speed(iter/s)": 0.123679 }, { "epoch": 0.8845888044229441, "eval_loss": 0.2673368752002716, "eval_runtime": 29.7041, "eval_samples_per_second": 17.708, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9157587548638132, "step": 720 }, { "epoch": 0.8907317822314367, "grad_norm": 0.699228048324585, "learning_rate": 7.973725637159795e-06, "loss": 0.24339399337768555, "memory(GiB)": 31.97, "step": 725, "token_acc": 0.9063403422456052, "train_speed(iter/s)": 0.123046 }, { "epoch": 0.8968747600399294, "grad_norm": 0.8761767745018005, "learning_rate": 7.947776779557862e-06, "loss": 0.22902493476867675, "memory(GiB)": 31.97, "step": 730, "token_acc": 0.929031261265901, "train_speed(iter/s)": 0.123239 }, { "epoch": 0.903017737848422, "grad_norm": 0.7344342470169067, "learning_rate": 7.921705654958886e-06, "loss": 0.25461578369140625, "memory(GiB)": 31.97, "step": 735, "token_acc": 0.910783754344105, "train_speed(iter/s)": 0.123436 }, { "epoch": 0.9091607156569147, "grad_norm": 0.8068217635154724, "learning_rate": 7.895513344733124e-06, "loss": 0.23727846145629883, "memory(GiB)": 31.97, "step": 740, "token_acc": 0.9222560975609756, "train_speed(iter/s)": 0.123624 }, { "epoch": 0.9091607156569147, "eval_loss": 0.26663488149642944, "eval_runtime": 29.6915, "eval_samples_per_second": 17.715, "eval_steps_per_second": 4.446, "eval_token_acc": 0.916219916414469, "step": 740 }, { "epoch": 0.9153036934654074, "grad_norm": 0.7529241442680359, "learning_rate": 7.869200935277317e-06, "loss": 0.2533961534500122, "memory(GiB)": 31.97, "step": 745, "token_acc": 0.9121754667444574, "train_speed(iter/s)": 0.123063 }, { "epoch": 0.9214466712739, "grad_norm": 0.8958796858787537, "learning_rate": 7.842769517969665e-06, "loss": 0.2638097286224365, "memory(GiB)": 31.97, "step": 750, "token_acc": 0.9133777069466579, "train_speed(iter/s)": 0.123274 }, { "epoch": 0.9275896490823927, "grad_norm": 0.8491634726524353, "learning_rate": 7.816220189124527e-06, "loss": 0.2510275363922119, "memory(GiB)": 31.97, "step": 755, "token_acc": 0.9162462967411322, "train_speed(iter/s)": 0.123478 }, { "epoch": 0.9337326268908853, "grad_norm": 0.8026111125946045, "learning_rate": 7.789554049946966e-06, "loss": 0.2663265228271484, "memory(GiB)": 31.97, "step": 760, "token_acc": 0.9118450459399057, "train_speed(iter/s)": 0.123691 }, { "epoch": 0.9337326268908853, "eval_loss": 0.265114426612854, "eval_runtime": 29.6567, "eval_samples_per_second": 17.736, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9164793197867128, "step": 760 }, { "epoch": 0.939875604699378, "grad_norm": 0.7799587845802307, "learning_rate": 7.762772206487066e-06, "loss": 0.2516252756118774, "memory(GiB)": 31.97, "step": 765, "token_acc": 0.9153250495227805, "train_speed(iter/s)": 0.123108 }, { "epoch": 0.9460185825078707, "grad_norm": 0.7860950231552124, "learning_rate": 7.735875769594063e-06, "loss": 0.2252351760864258, "memory(GiB)": 31.97, "step": 770, "token_acc": 0.9219084178777077, "train_speed(iter/s)": 0.123281 }, { "epoch": 0.9521615603163633, "grad_norm": 0.7115477323532104, "learning_rate": 7.70886585487026e-06, "loss": 0.24201133251190185, "memory(GiB)": 31.97, "step": 775, "token_acc": 0.9087906037805101, "train_speed(iter/s)": 0.123461 }, { "epoch": 0.958304538124856, "grad_norm": 0.7322366833686829, "learning_rate": 7.681743582624761e-06, "loss": 0.24702987670898438, "memory(GiB)": 31.97, "step": 780, "token_acc": 0.9231603262150568, "train_speed(iter/s)": 0.123669 }, { "epoch": 0.958304538124856, "eval_loss": 0.2637149393558502, "eval_runtime": 29.6465, "eval_samples_per_second": 17.742, "eval_steps_per_second": 4.452, "eval_token_acc": 0.9160253638852861, "step": 780 }, { "epoch": 0.9644475159333487, "grad_norm": 0.8311400413513184, "learning_rate": 7.654510077827003e-06, "loss": 0.26540687084198, "memory(GiB)": 31.97, "step": 785, "token_acc": 0.9113840464870576, "train_speed(iter/s)": 0.12312 }, { "epoch": 0.9705904937418414, "grad_norm": 0.8548195958137512, "learning_rate": 7.627166470060092e-06, "loss": 0.26256117820739744, "memory(GiB)": 31.97, "step": 790, "token_acc": 0.9282326450438365, "train_speed(iter/s)": 0.123344 }, { "epoch": 0.9767334715503341, "grad_norm": 0.822137176990509, "learning_rate": 7.59971389347395e-06, "loss": 0.2492506980895996, "memory(GiB)": 31.97, "step": 795, "token_acc": 0.9176904773466712, "train_speed(iter/s)": 0.123523 }, { "epoch": 0.9828764493588267, "grad_norm": 0.8581626415252686, "learning_rate": 7.572153486738281e-06, "loss": 0.23105947971343993, "memory(GiB)": 31.97, "step": 800, "token_acc": 0.9199036089276493, "train_speed(iter/s)": 0.123714 }, { "epoch": 0.9828764493588267, "eval_loss": 0.262928307056427, "eval_runtime": 29.686, "eval_samples_per_second": 17.719, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9163784406975068, "step": 800 }, { "epoch": 0.9890194271673194, "grad_norm": 0.801688551902771, "learning_rate": 7.544486392995325e-06, "loss": 0.22394142150878907, "memory(GiB)": 31.97, "step": 805, "token_acc": 0.9134662867996202, "train_speed(iter/s)": 0.12313 }, { "epoch": 0.995162404975812, "grad_norm": 0.769991397857666, "learning_rate": 7.516713759812465e-06, "loss": 0.24088678359985352, "memory(GiB)": 31.97, "step": 810, "token_acc": 0.9233073946152885, "train_speed(iter/s)": 0.12331 }, { "epoch": 1.002457191123397, "grad_norm": 0.7465444207191467, "learning_rate": 7.4888367391346085e-06, "loss": 0.2843191623687744, "memory(GiB)": 31.97, "step": 815, "token_acc": 0.9340755933196602, "train_speed(iter/s)": 0.12343 }, { "epoch": 1.0086001689318898, "grad_norm": 0.7140413522720337, "learning_rate": 7.460856487236421e-06, "loss": 0.21777431964874266, "memory(GiB)": 31.97, "step": 820, "token_acc": 0.9192456915997473, "train_speed(iter/s)": 0.123641 }, { "epoch": 1.0086001689318898, "eval_loss": 0.26577290892601013, "eval_runtime": 29.655, "eval_samples_per_second": 17.737, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9173079694480473, "step": 820 }, { "epoch": 1.0147431467403825, "grad_norm": 0.8933572769165039, "learning_rate": 7.432774164674359e-06, "loss": 0.18578357696533204, "memory(GiB)": 31.97, "step": 825, "token_acc": 0.9199372635852016, "train_speed(iter/s)": 0.123101 }, { "epoch": 1.0208861245488752, "grad_norm": 0.8847187161445618, "learning_rate": 7.404590936238535e-06, "loss": 0.19721906185150145, "memory(GiB)": 31.97, "step": 830, "token_acc": 0.9378547338981656, "train_speed(iter/s)": 0.123344 }, { "epoch": 1.0270291023573677, "grad_norm": 0.8109038472175598, "learning_rate": 7.376307970904408e-06, "loss": 0.209037446975708, "memory(GiB)": 31.97, "step": 835, "token_acc": 0.9270859687294439, "train_speed(iter/s)": 0.123517 }, { "epoch": 1.0331720801658604, "grad_norm": 0.7887623906135559, "learning_rate": 7.34792644178429e-06, "loss": 0.18892388343811034, "memory(GiB)": 31.97, "step": 840, "token_acc": 0.940854053515372, "train_speed(iter/s)": 0.123667 }, { "epoch": 1.0331720801658604, "eval_loss": 0.26770132780075073, "eval_runtime": 29.6867, "eval_samples_per_second": 17.718, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9165729932266897, "step": 840 }, { "epoch": 1.039315057974353, "grad_norm": 0.8831640481948853, "learning_rate": 7.319447526078696e-06, "loss": 0.20574064254760743, "memory(GiB)": 31.97, "step": 845, "token_acc": 0.9144320335497399, "train_speed(iter/s)": 0.123197 }, { "epoch": 1.0454580357828458, "grad_norm": 0.7205056548118591, "learning_rate": 7.290872405027508e-06, "loss": 0.1763360857963562, "memory(GiB)": 31.97, "step": 850, "token_acc": 0.941142747945729, "train_speed(iter/s)": 0.123361 }, { "epoch": 1.0516010135913385, "grad_norm": 0.6685133576393127, "learning_rate": 7.262202263860989e-06, "loss": 0.18052310943603517, "memory(GiB)": 31.97, "step": 855, "token_acc": 0.9395161290322581, "train_speed(iter/s)": 0.123527 }, { "epoch": 1.057743991399831, "grad_norm": 0.6980032920837402, "learning_rate": 7.233438291750615e-06, "loss": 0.2039564609527588, "memory(GiB)": 31.97, "step": 860, "token_acc": 0.9359017096052193, "train_speed(iter/s)": 0.123688 }, { "epoch": 1.057743991399831, "eval_loss": 0.2684628963470459, "eval_runtime": 29.667, "eval_samples_per_second": 17.73, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9164360858913388, "step": 860 }, { "epoch": 1.0638869692083237, "grad_norm": 0.9002748727798462, "learning_rate": 7.204581681759752e-06, "loss": 0.2119807004928589, "memory(GiB)": 31.97, "step": 865, "token_acc": 0.9171671861932639, "train_speed(iter/s)": 0.123189 }, { "epoch": 1.0700299470168164, "grad_norm": 0.7583820819854736, "learning_rate": 7.175633630794176e-06, "loss": 0.20298078060150146, "memory(GiB)": 31.97, "step": 870, "token_acc": 0.938098510882016, "train_speed(iter/s)": 0.123367 }, { "epoch": 1.0761729248253091, "grad_norm": 0.8472638726234436, "learning_rate": 7.146595339552423e-06, "loss": 0.19818198680877686, "memory(GiB)": 31.97, "step": 875, "token_acc": 0.937833543813908, "train_speed(iter/s)": 0.123568 }, { "epoch": 1.0823159026338018, "grad_norm": 0.8150883913040161, "learning_rate": 7.1174680124759856e-06, "loss": 0.17738423347473145, "memory(GiB)": 31.97, "step": 880, "token_acc": 0.9374801246581441, "train_speed(iter/s)": 0.123737 }, { "epoch": 1.0823159026338018, "eval_loss": 0.26935601234436035, "eval_runtime": 29.5753, "eval_samples_per_second": 17.785, "eval_steps_per_second": 4.463, "eval_token_acc": 0.916198299466782, "step": 880 }, { "epoch": 1.0884588804422943, "grad_norm": 0.7381166815757751, "learning_rate": 7.08825285769936e-06, "loss": 0.18000258207321168, "memory(GiB)": 31.97, "step": 885, "token_acc": 0.9197572365671506, "train_speed(iter/s)": 0.12326 }, { "epoch": 1.094601858250787, "grad_norm": 0.7191819548606873, "learning_rate": 7.058951086999934e-06, "loss": 0.17380096912384033, "memory(GiB)": 31.97, "step": 890, "token_acc": 0.9333728639965501, "train_speed(iter/s)": 0.123399 }, { "epoch": 1.1007448360592798, "grad_norm": 0.8915813565254211, "learning_rate": 7.029563915747723e-06, "loss": 0.18771791458129883, "memory(GiB)": 31.97, "step": 895, "token_acc": 0.9397812810680302, "train_speed(iter/s)": 0.123569 }, { "epoch": 1.1068878138677725, "grad_norm": 0.7277628183364868, "learning_rate": 7.0000925628549595e-06, "loss": 0.20253748893737794, "memory(GiB)": 31.97, "step": 900, "token_acc": 0.9236840782263335, "train_speed(iter/s)": 0.123737 }, { "epoch": 1.1068878138677725, "eval_loss": 0.2681773900985718, "eval_runtime": 29.6364, "eval_samples_per_second": 17.748, "eval_steps_per_second": 4.454, "eval_token_acc": 0.916162271220637, "step": 900 }, { "epoch": 1.1130307916762652, "grad_norm": 0.9438680410385132, "learning_rate": 6.9705382507255405e-06, "loss": 0.18554757833480834, "memory(GiB)": 31.97, "step": 905, "token_acc": 0.9169849491620015, "train_speed(iter/s)": 0.123281 }, { "epoch": 1.1191737694847577, "grad_norm": 0.8167145252227783, "learning_rate": 6.940902205204321e-06, "loss": 0.19586331844329835, "memory(GiB)": 31.97, "step": 910, "token_acc": 0.935408560311284, "train_speed(iter/s)": 0.123439 }, { "epoch": 1.1253167472932504, "grad_norm": 0.7680448889732361, "learning_rate": 6.911185655526263e-06, "loss": 0.2027712345123291, "memory(GiB)": 31.97, "step": 915, "token_acc": 0.9350154026697961, "train_speed(iter/s)": 0.123617 }, { "epoch": 1.131459725101743, "grad_norm": 0.7472254037857056, "learning_rate": 6.881389834265463e-06, "loss": 0.20426957607269286, "memory(GiB)": 31.97, "step": 920, "token_acc": 0.9317039744175423, "train_speed(iter/s)": 0.123791 }, { "epoch": 1.131459725101743, "eval_loss": 0.26660415530204773, "eval_runtime": 29.6749, "eval_samples_per_second": 17.725, "eval_steps_per_second": 4.448, "eval_token_acc": 0.9159677186914541, "step": 920 }, { "epoch": 1.1376027029102358, "grad_norm": 0.7990177273750305, "learning_rate": 6.851515977284014e-06, "loss": 0.17569031715393066, "memory(GiB)": 31.97, "step": 925, "token_acc": 0.9180286145399676, "train_speed(iter/s)": 0.123301 }, { "epoch": 1.1437456807187285, "grad_norm": 0.7919936776161194, "learning_rate": 6.821565323680759e-06, "loss": 0.18091797828674316, "memory(GiB)": 31.97, "step": 930, "token_acc": 0.9357635368079497, "train_speed(iter/s)": 0.123449 }, { "epoch": 1.149888658527221, "grad_norm": 0.7052066922187805, "learning_rate": 6.791539115739879e-06, "loss": 0.20484356880187987, "memory(GiB)": 31.97, "step": 935, "token_acc": 0.9249510662408571, "train_speed(iter/s)": 0.123648 }, { "epoch": 1.1560316363357137, "grad_norm": 0.7116119861602783, "learning_rate": 6.761438598879383e-06, "loss": 0.18515671491622926, "memory(GiB)": 31.97, "step": 940, "token_acc": 0.9454207808678322, "train_speed(iter/s)": 0.123794 }, { "epoch": 1.1560316363357137, "eval_loss": 0.265466570854187, "eval_runtime": 29.6874, "eval_samples_per_second": 17.718, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9163063842052169, "step": 940 }, { "epoch": 1.1621746141442064, "grad_norm": 0.8001610040664673, "learning_rate": 6.731265021599437e-06, "loss": 0.2151487112045288, "memory(GiB)": 31.97, "step": 945, "token_acc": 0.9195844345210973, "train_speed(iter/s)": 0.123335 }, { "epoch": 1.1683175919526991, "grad_norm": 0.703551709651947, "learning_rate": 6.7010196354305876e-06, "loss": 0.188127601146698, "memory(GiB)": 31.97, "step": 950, "token_acc": 0.9344865159357123, "train_speed(iter/s)": 0.123485 }, { "epoch": 1.1744605697611918, "grad_norm": 0.7591283917427063, "learning_rate": 6.670703694881851e-06, "loss": 0.19852180480957032, "memory(GiB)": 31.97, "step": 955, "token_acc": 0.9347434962314612, "train_speed(iter/s)": 0.12365 }, { "epoch": 1.1806035475696843, "grad_norm": 0.8347094058990479, "learning_rate": 6.640318457388672e-06, "loss": 0.1904957413673401, "memory(GiB)": 31.97, "step": 960, "token_acc": 0.9374376643394686, "train_speed(iter/s)": 0.123801 }, { "epoch": 1.1806035475696843, "eval_loss": 0.2668422758579254, "eval_runtime": 29.65, "eval_samples_per_second": 17.74, "eval_steps_per_second": 4.452, "eval_token_acc": 0.9165946101743767, "step": 960 }, { "epoch": 1.186746525378177, "grad_norm": 0.8114694952964783, "learning_rate": 6.609865183260777e-06, "loss": 0.19182581901550294, "memory(GiB)": 31.97, "step": 965, "token_acc": 0.9153690632426489, "train_speed(iter/s)": 0.123322 }, { "epoch": 1.1928895031866698, "grad_norm": 0.8866662979125977, "learning_rate": 6.579345135629896e-06, "loss": 0.19811842441558838, "memory(GiB)": 31.97, "step": 970, "token_acc": 0.9233720292959992, "train_speed(iter/s)": 0.123477 }, { "epoch": 1.1990324809951625, "grad_norm": 0.7779002785682678, "learning_rate": 6.548759580397363e-06, "loss": 0.20236413478851317, "memory(GiB)": 31.97, "step": 975, "token_acc": 0.9268359567816596, "train_speed(iter/s)": 0.123623 }, { "epoch": 1.2051754588036552, "grad_norm": 0.6543186902999878, "learning_rate": 6.518109786181628e-06, "loss": 0.19884101152420045, "memory(GiB)": 31.97, "step": 980, "token_acc": 0.9361254541977434, "train_speed(iter/s)": 0.123766 }, { "epoch": 1.2051754588036552, "eval_loss": 0.26707014441490173, "eval_runtime": 29.6219, "eval_samples_per_second": 17.757, "eval_steps_per_second": 4.456, "eval_token_acc": 0.9165801988759187, "step": 980 }, { "epoch": 1.2113184366121477, "grad_norm": 0.7516761422157288, "learning_rate": 6.487397024265616e-06, "loss": 0.2052464008331299, "memory(GiB)": 31.97, "step": 985, "token_acc": 0.9144311222289313, "train_speed(iter/s)": 0.123306 }, { "epoch": 1.2174614144206404, "grad_norm": 0.7994732856750488, "learning_rate": 6.456622568544012e-06, "loss": 0.19984896183013917, "memory(GiB)": 31.97, "step": 990, "token_acc": 0.9331774440147496, "train_speed(iter/s)": 0.123471 }, { "epoch": 1.223604392229133, "grad_norm": 0.8212082386016846, "learning_rate": 6.425787695470419e-06, "loss": 0.19194519519805908, "memory(GiB)": 31.97, "step": 995, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.123636 }, { "epoch": 1.2297473700376258, "grad_norm": 0.7439980506896973, "learning_rate": 6.3948936840044096e-06, "loss": 0.20161755084991456, "memory(GiB)": 31.97, "step": 1000, "token_acc": 0.9443154490422091, "train_speed(iter/s)": 0.123777 }, { "epoch": 1.2297473700376258, "eval_loss": 0.26448243856430054, "eval_runtime": 29.656, "eval_samples_per_second": 17.737, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9172719412019023, "step": 1000 }, { "epoch": 1.2358903478461185, "grad_norm": 0.9987765550613403, "learning_rate": 6.363941815558484e-06, "loss": 0.19967958927154542, "memory(GiB)": 31.97, "step": 1005, "token_acc": 0.9190461073035912, "train_speed(iter/s)": 0.12333 }, { "epoch": 1.242033325654611, "grad_norm": 0.8285346627235413, "learning_rate": 6.332933373944914e-06, "loss": 0.19167766571044922, "memory(GiB)": 31.97, "step": 1010, "token_acc": 0.9248888888888889, "train_speed(iter/s)": 0.123459 }, { "epoch": 1.2481763034631037, "grad_norm": 0.8778380751609802, "learning_rate": 6.301869645322498e-06, "loss": 0.20817289352416993, "memory(GiB)": 31.97, "step": 1015, "token_acc": 0.9210802145631667, "train_speed(iter/s)": 0.123619 }, { "epoch": 1.2543192812715964, "grad_norm": 0.764149010181427, "learning_rate": 6.270751918143213e-06, "loss": 0.2000873565673828, "memory(GiB)": 31.97, "step": 1020, "token_acc": 0.9338627474220694, "train_speed(iter/s)": 0.123776 }, { "epoch": 1.2543192812715964, "eval_loss": 0.2691567838191986, "eval_runtime": 29.7546, "eval_samples_per_second": 17.678, "eval_steps_per_second": 4.436, "eval_token_acc": 0.9163496181005909, "step": 1020 }, { "epoch": 1.2604622590800891, "grad_norm": 0.8174912929534912, "learning_rate": 6.239581483098767e-06, "loss": 0.19734174013137817, "memory(GiB)": 31.97, "step": 1025, "token_acc": 0.9185803052816426, "train_speed(iter/s)": 0.123299 }, { "epoch": 1.2666052368885818, "grad_norm": 0.8845574259757996, "learning_rate": 6.208359633067077e-06, "loss": 0.18390114307403566, "memory(GiB)": 31.97, "step": 1030, "token_acc": 0.9341327407655864, "train_speed(iter/s)": 0.123442 }, { "epoch": 1.2727482146970743, "grad_norm": 0.9409281015396118, "learning_rate": 6.177087663058626e-06, "loss": 0.20083985328674317, "memory(GiB)": 31.97, "step": 1035, "token_acc": 0.9444893687865671, "train_speed(iter/s)": 0.123585 }, { "epoch": 1.278891192505567, "grad_norm": 0.7851516604423523, "learning_rate": 6.145766870162767e-06, "loss": 0.21141374111175537, "memory(GiB)": 31.97, "step": 1040, "token_acc": 0.9148654159869495, "train_speed(iter/s)": 0.123721 }, { "epoch": 1.278891192505567, "eval_loss": 0.2667810916900635, "eval_runtime": 29.6926, "eval_samples_per_second": 17.715, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9169765095835135, "step": 1040 }, { "epoch": 1.2850341703140598, "grad_norm": 0.9521942138671875, "learning_rate": 6.114398553493909e-06, "loss": 0.1960476517677307, "memory(GiB)": 31.97, "step": 1045, "token_acc": 0.9178016461816706, "train_speed(iter/s)": 0.123266 }, { "epoch": 1.2911771481225525, "grad_norm": 0.8174967765808105, "learning_rate": 6.0829840141376385e-06, "loss": 0.20267832279205322, "memory(GiB)": 31.97, "step": 1050, "token_acc": 0.9279712548369264, "train_speed(iter/s)": 0.123426 }, { "epoch": 1.2973201259310452, "grad_norm": 0.8733800053596497, "learning_rate": 6.051524555096754e-06, "loss": 0.18992329835891725, "memory(GiB)": 31.97, "step": 1055, "token_acc": 0.9288329960489544, "train_speed(iter/s)": 0.123584 }, { "epoch": 1.3034631037395377, "grad_norm": 0.8741737008094788, "learning_rate": 6.020021481237216e-06, "loss": 0.2002291202545166, "memory(GiB)": 31.97, "step": 1060, "token_acc": 0.9348597405477326, "train_speed(iter/s)": 0.123729 }, { "epoch": 1.3034631037395377, "eval_loss": 0.26571905612945557, "eval_runtime": 29.6815, "eval_samples_per_second": 17.721, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9168756304943075, "step": 1060 }, { "epoch": 1.3096060815480304, "grad_norm": 0.7780086398124695, "learning_rate": 5.988476099234033e-06, "loss": 0.20159559249877929, "memory(GiB)": 31.97, "step": 1065, "token_acc": 0.9135952477386257, "train_speed(iter/s)": 0.123364 }, { "epoch": 1.315749059356523, "grad_norm": 0.7431135177612305, "learning_rate": 5.956889717517053e-06, "loss": 0.1894887328147888, "memory(GiB)": 31.97, "step": 1070, "token_acc": 0.9296808409887299, "train_speed(iter/s)": 0.12348 }, { "epoch": 1.3218920371650158, "grad_norm": 0.72322016954422, "learning_rate": 5.925263646216697e-06, "loss": 0.17778899669647216, "memory(GiB)": 31.97, "step": 1075, "token_acc": 0.9421357447673238, "train_speed(iter/s)": 0.123618 }, { "epoch": 1.3280350149735085, "grad_norm": 0.9004871249198914, "learning_rate": 5.893599197109625e-06, "loss": 0.1900892972946167, "memory(GiB)": 31.97, "step": 1080, "token_acc": 0.9286453541858326, "train_speed(iter/s)": 0.123756 }, { "epoch": 1.3280350149735085, "eval_loss": 0.26534271240234375, "eval_runtime": 29.6233, "eval_samples_per_second": 17.756, "eval_steps_per_second": 4.456, "eval_token_acc": 0.9171062112696354, "step": 1080 }, { "epoch": 1.334177992782001, "grad_norm": 0.657845675945282, "learning_rate": 5.861897683564313e-06, "loss": 0.18198509216308595, "memory(GiB)": 31.97, "step": 1085, "token_acc": 0.9204705963413373, "train_speed(iter/s)": 0.123316 }, { "epoch": 1.3403209705904937, "grad_norm": 0.8465989828109741, "learning_rate": 5.830160420486588e-06, "loss": 0.2053920269012451, "memory(GiB)": 31.97, "step": 1090, "token_acc": 0.9380833375835161, "train_speed(iter/s)": 0.123479 }, { "epoch": 1.3464639483989864, "grad_norm": 0.7130780816078186, "learning_rate": 5.798388724265085e-06, "loss": 0.18327146768569946, "memory(GiB)": 31.97, "step": 1095, "token_acc": 0.9395675675675675, "train_speed(iter/s)": 0.123624 }, { "epoch": 1.3526069262074791, "grad_norm": 0.800010085105896, "learning_rate": 5.7665839127166475e-06, "loss": 0.18803975582122803, "memory(GiB)": 31.97, "step": 1100, "token_acc": 0.9407923378319547, "train_speed(iter/s)": 0.123735 }, { "epoch": 1.3526069262074791, "eval_loss": 0.26394224166870117, "eval_runtime": 29.7389, "eval_samples_per_second": 17.687, "eval_steps_per_second": 4.439, "eval_token_acc": 0.9168684248450786, "step": 1100 }, { "epoch": 1.3587499040159718, "grad_norm": 0.8300402164459229, "learning_rate": 5.734747305031664e-06, "loss": 0.21199843883514405, "memory(GiB)": 31.97, "step": 1105, "token_acc": 0.9151312319249167, "train_speed(iter/s)": 0.123328 }, { "epoch": 1.3648928818244643, "grad_norm": 0.7512418627738953, "learning_rate": 5.7028802217193565e-06, "loss": 0.18927464485168458, "memory(GiB)": 31.97, "step": 1110, "token_acc": 0.9413873811065187, "train_speed(iter/s)": 0.123452 }, { "epoch": 1.371035859632957, "grad_norm": 0.8453025221824646, "learning_rate": 5.670983984553003e-06, "loss": 0.20298895835876465, "memory(GiB)": 31.97, "step": 1115, "token_acc": 0.9409311022678237, "train_speed(iter/s)": 0.123572 }, { "epoch": 1.3771788374414498, "grad_norm": 0.8611200451850891, "learning_rate": 5.63905991651512e-06, "loss": 0.1865471839904785, "memory(GiB)": 31.97, "step": 1120, "token_acc": 0.9321085791674028, "train_speed(iter/s)": 0.123686 }, { "epoch": 1.3771788374414498, "eval_loss": 0.2642819583415985, "eval_runtime": 29.6704, "eval_samples_per_second": 17.728, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9167243118604986, "step": 1120 }, { "epoch": 1.3833218152499425, "grad_norm": 0.7028310298919678, "learning_rate": 5.607109341742579e-06, "loss": 0.1868009090423584, "memory(GiB)": 31.97, "step": 1125, "token_acc": 0.9161407676887668, "train_speed(iter/s)": 0.123281 }, { "epoch": 1.3894647930584352, "grad_norm": 0.7893259525299072, "learning_rate": 5.575133585471697e-06, "loss": 0.18891712427139282, "memory(GiB)": 31.97, "step": 1130, "token_acc": 0.9357982673267327, "train_speed(iter/s)": 0.123383 }, { "epoch": 1.3956077708669277, "grad_norm": 0.8665569424629211, "learning_rate": 5.543133973983254e-06, "loss": 0.18693907260894777, "memory(GiB)": 31.97, "step": 1135, "token_acc": 0.9333289413004809, "train_speed(iter/s)": 0.123515 }, { "epoch": 1.4017507486754204, "grad_norm": 0.7769924998283386, "learning_rate": 5.511111834547496e-06, "loss": 0.18896095752716063, "memory(GiB)": 31.97, "step": 1140, "token_acc": 0.9316990440949738, "train_speed(iter/s)": 0.123635 }, { "epoch": 1.4017507486754204, "eval_loss": 0.2637878358364105, "eval_runtime": 29.598, "eval_samples_per_second": 17.771, "eval_steps_per_second": 4.46, "eval_token_acc": 0.9168612191958495, "step": 1140 }, { "epoch": 1.407893726483913, "grad_norm": 0.7512997388839722, "learning_rate": 5.479068495369071e-06, "loss": 0.17823780775070192, "memory(GiB)": 31.97, "step": 1145, "token_acc": 0.9169221157037702, "train_speed(iter/s)": 0.123233 }, { "epoch": 1.4140367042924058, "grad_norm": 0.7769235372543335, "learning_rate": 5.447005285531948e-06, "loss": 0.18888635635375978, "memory(GiB)": 31.97, "step": 1150, "token_acc": 0.9348544111255975, "train_speed(iter/s)": 0.123343 }, { "epoch": 1.4201796821008985, "grad_norm": 0.6804760694503784, "learning_rate": 5.414923534944283e-06, "loss": 0.19998799562454223, "memory(GiB)": 31.97, "step": 1155, "token_acc": 0.927479002131127, "train_speed(iter/s)": 0.12347 }, { "epoch": 1.426322659909391, "grad_norm": 0.8411586284637451, "learning_rate": 5.38282457428326e-06, "loss": 0.18058542013168336, "memory(GiB)": 31.97, "step": 1160, "token_acc": 0.9393255256102724, "train_speed(iter/s)": 0.123592 }, { "epoch": 1.426322659909391, "eval_loss": 0.26423749327659607, "eval_runtime": 29.6214, "eval_samples_per_second": 17.757, "eval_steps_per_second": 4.456, "eval_token_acc": 0.9165369649805447, "step": 1160 }, { "epoch": 1.4324656377178837, "grad_norm": 0.9211987257003784, "learning_rate": 5.350709734939898e-06, "loss": 0.19590919017791747, "memory(GiB)": 31.97, "step": 1165, "token_acc": 0.9181291791405984, "train_speed(iter/s)": 0.123222 }, { "epoch": 1.4386086155263764, "grad_norm": 0.8355826735496521, "learning_rate": 5.318580348963826e-06, "loss": 0.18302634954452515, "memory(GiB)": 31.97, "step": 1170, "token_acc": 0.940089028541503, "train_speed(iter/s)": 0.123344 }, { "epoch": 1.4447515933348691, "grad_norm": 0.8388906717300415, "learning_rate": 5.286437749008031e-06, "loss": 0.19118983745574952, "memory(GiB)": 31.97, "step": 1175, "token_acc": 0.934627927660836, "train_speed(iter/s)": 0.123476 }, { "epoch": 1.4508945711433618, "grad_norm": 0.9128098487854004, "learning_rate": 5.2542832682735956e-06, "loss": 0.20061683654785156, "memory(GiB)": 31.97, "step": 1180, "token_acc": 0.9325906344410876, "train_speed(iter/s)": 0.123611 }, { "epoch": 1.4508945711433618, "eval_loss": 0.26307445764541626, "eval_runtime": 29.7678, "eval_samples_per_second": 17.67, "eval_steps_per_second": 4.434, "eval_token_acc": 0.9170125378296584, "step": 1180 }, { "epoch": 1.4570375489518543, "grad_norm": 0.7825379371643066, "learning_rate": 5.222118240454376e-06, "loss": 0.19818990230560302, "memory(GiB)": 31.97, "step": 1185, "token_acc": 0.9137893551001047, "train_speed(iter/s)": 0.123207 }, { "epoch": 1.463180526760347, "grad_norm": 0.748515784740448, "learning_rate": 5.18994399968171e-06, "loss": 0.19512221813201905, "memory(GiB)": 31.97, "step": 1190, "token_acc": 0.9283811949976841, "train_speed(iter/s)": 0.123314 }, { "epoch": 1.4693235045688398, "grad_norm": 0.9209436178207397, "learning_rate": 5.157761880469058e-06, "loss": 0.19263048171997071, "memory(GiB)": 31.97, "step": 1195, "token_acc": 0.9410712406608439, "train_speed(iter/s)": 0.123435 }, { "epoch": 1.4754664823773325, "grad_norm": 0.8622936606407166, "learning_rate": 5.125573217656664e-06, "loss": 0.1777910351753235, "memory(GiB)": 31.97, "step": 1200, "token_acc": 0.938923185912357, "train_speed(iter/s)": 0.123542 }, { "epoch": 1.4754664823773325, "eval_loss": 0.26356378197669983, "eval_runtime": 29.7055, "eval_samples_per_second": 17.707, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9174160541864822, "step": 1200 }, { "epoch": 1.4816094601858252, "grad_norm": 0.8892216086387634, "learning_rate": 5.0933793463561855e-06, "loss": 0.189991557598114, "memory(GiB)": 31.97, "step": 1205, "token_acc": 0.9184802373432864, "train_speed(iter/s)": 0.12315 }, { "epoch": 1.4877524379943177, "grad_norm": 0.7723336219787598, "learning_rate": 5.061181601895317e-06, "loss": 0.19531933069229127, "memory(GiB)": 31.97, "step": 1210, "token_acc": 0.9326246228990087, "train_speed(iter/s)": 0.123271 }, { "epoch": 1.4938954158028104, "grad_norm": 0.8178744912147522, "learning_rate": 5.028981319762399e-06, "loss": 0.19836077690124512, "memory(GiB)": 31.97, "step": 1215, "token_acc": 0.9317647058823529, "train_speed(iter/s)": 0.123399 }, { "epoch": 1.500038393611303, "grad_norm": 0.8365611433982849, "learning_rate": 4.996779835551035e-06, "loss": 0.17670562267303466, "memory(GiB)": 31.97, "step": 1220, "token_acc": 0.9215732593161283, "train_speed(iter/s)": 0.123517 }, { "epoch": 1.500038393611303, "eval_loss": 0.26140737533569336, "eval_runtime": 29.7037, "eval_samples_per_second": 17.708, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9175601671710621, "step": 1220 }, { "epoch": 1.5061813714197958, "grad_norm": 0.9357166290283203, "learning_rate": 4.964578484904679e-06, "loss": 0.19990785121917726, "memory(GiB)": 31.97, "step": 1225, "token_acc": 0.9138523956723339, "train_speed(iter/s)": 0.123153 }, { "epoch": 1.5123243492282885, "grad_norm": 0.7626463174819946, "learning_rate": 4.932378603461253e-06, "loss": 0.17721318006515502, "memory(GiB)": 31.97, "step": 1230, "token_acc": 0.9358071645166264, "train_speed(iter/s)": 0.123279 }, { "epoch": 1.518467327036781, "grad_norm": 0.7975868582725525, "learning_rate": 4.900181526797737e-06, "loss": 0.18672944307327272, "memory(GiB)": 31.97, "step": 1235, "token_acc": 0.9301695649818517, "train_speed(iter/s)": 0.123379 }, { "epoch": 1.5246103048452737, "grad_norm": 0.9394721984863281, "learning_rate": 4.867988590374777e-06, "loss": 0.21254873275756836, "memory(GiB)": 31.97, "step": 1240, "token_acc": 0.9301865980329075, "train_speed(iter/s)": 0.12352 }, { "epoch": 1.5246103048452737, "eval_loss": 0.2631884217262268, "eval_runtime": 29.7103, "eval_samples_per_second": 17.704, "eval_steps_per_second": 4.443, "eval_token_acc": 0.9173656146418793, "step": 1240 }, { "epoch": 1.5307532826537664, "grad_norm": 0.8997290134429932, "learning_rate": 4.835801129481287e-06, "loss": 0.17868154048919677, "memory(GiB)": 31.97, "step": 1245, "token_acc": 0.9191156488994138, "train_speed(iter/s)": 0.123159 }, { "epoch": 1.5368962604622591, "grad_norm": 0.9004043340682983, "learning_rate": 4.803620479179071e-06, "loss": 0.2074437618255615, "memory(GiB)": 31.97, "step": 1250, "token_acc": 0.9450173238739482, "train_speed(iter/s)": 0.123271 }, { "epoch": 1.5430392382707518, "grad_norm": 0.7723172307014465, "learning_rate": 4.771447974247449e-06, "loss": 0.1962502956390381, "memory(GiB)": 31.97, "step": 1255, "token_acc": 0.9253724029792239, "train_speed(iter/s)": 0.123413 }, { "epoch": 1.5491822160792443, "grad_norm": 0.8553282618522644, "learning_rate": 4.7392849491278825e-06, "loss": 0.18894779682159424, "memory(GiB)": 31.97, "step": 1260, "token_acc": 0.9323178471693323, "train_speed(iter/s)": 0.123525 }, { "epoch": 1.5491822160792443, "eval_loss": 0.26209425926208496, "eval_runtime": 29.6524, "eval_samples_per_second": 17.739, "eval_steps_per_second": 4.452, "eval_token_acc": 0.9178844213863669, "step": 1260 }, { "epoch": 1.555325193887737, "grad_norm": 0.886417806148529, "learning_rate": 4.707132737868639e-06, "loss": 0.2006976842880249, "memory(GiB)": 31.97, "step": 1265, "token_acc": 0.9235658289984614, "train_speed(iter/s)": 0.123184 }, { "epoch": 1.5614681716962298, "grad_norm": 0.798883318901062, "learning_rate": 4.674992674069445e-06, "loss": 0.17858563661575316, "memory(GiB)": 31.97, "step": 1270, "token_acc": 0.9344567177637512, "train_speed(iter/s)": 0.123284 }, { "epoch": 1.5676111495047225, "grad_norm": 0.8193321824073792, "learning_rate": 4.642866090826187e-06, "loss": 0.19088488817214966, "memory(GiB)": 31.97, "step": 1275, "token_acc": 0.9263077510500191, "train_speed(iter/s)": 0.123357 }, { "epoch": 1.5737541273132152, "grad_norm": 0.7805312275886536, "learning_rate": 4.610754320675603e-06, "loss": 0.19581155776977538, "memory(GiB)": 31.97, "step": 1280, "token_acc": 0.9278596416834517, "train_speed(iter/s)": 0.123486 }, { "epoch": 1.5737541273132152, "eval_loss": 0.26224827766418457, "eval_runtime": 29.7067, "eval_samples_per_second": 17.706, "eval_steps_per_second": 4.443, "eval_token_acc": 0.9181726473555267, "step": 1280 }, { "epoch": 1.5798971051217077, "grad_norm": 0.8490621447563171, "learning_rate": 4.578658695540018e-06, "loss": 0.2049680233001709, "memory(GiB)": 31.97, "step": 1285, "token_acc": 0.9203214434630984, "train_speed(iter/s)": 0.123147 }, { "epoch": 1.5860400829302004, "grad_norm": 0.8018094301223755, "learning_rate": 4.5465805466721e-06, "loss": 0.21368024349212647, "memory(GiB)": 31.97, "step": 1290, "token_acc": 0.9337745342459544, "train_speed(iter/s)": 0.123269 }, { "epoch": 1.592183060738693, "grad_norm": 0.8416959047317505, "learning_rate": 4.514521204599645e-06, "loss": 0.19902560710906983, "memory(GiB)": 31.97, "step": 1295, "token_acc": 0.9410267803045603, "train_speed(iter/s)": 0.123379 }, { "epoch": 1.5983260385471858, "grad_norm": 0.7020901441574097, "learning_rate": 4.48248199907038e-06, "loss": 0.1922709822654724, "memory(GiB)": 31.97, "step": 1300, "token_acc": 0.9301529196433843, "train_speed(iter/s)": 0.123512 }, { "epoch": 1.5983260385471858, "eval_loss": 0.2603996694087982, "eval_runtime": 29.664, "eval_samples_per_second": 17.732, "eval_steps_per_second": 4.45, "eval_token_acc": 0.9183167603401067, "step": 1300 }, { "epoch": 1.6044690163556785, "grad_norm": 0.9669449329376221, "learning_rate": 4.450464258996822e-06, "loss": 0.20973031520843505, "memory(GiB)": 31.97, "step": 1305, "token_acc": 0.9143080561489166, "train_speed(iter/s)": 0.123187 }, { "epoch": 1.610611994164171, "grad_norm": 0.8625094890594482, "learning_rate": 4.418469312401141e-06, "loss": 0.16759986877441407, "memory(GiB)": 31.97, "step": 1310, "token_acc": 0.9306892935456192, "train_speed(iter/s)": 0.123294 }, { "epoch": 1.6167549719726637, "grad_norm": 0.8282011151313782, "learning_rate": 4.386498486360095e-06, "loss": 0.20370192527770997, "memory(GiB)": 31.97, "step": 1315, "token_acc": 0.9212595005428882, "train_speed(iter/s)": 0.123412 }, { "epoch": 1.6228979497811564, "grad_norm": 0.8530144095420837, "learning_rate": 4.354553106949972e-06, "loss": 0.20181150436401368, "memory(GiB)": 31.97, "step": 1320, "token_acc": 0.9427324788655577, "train_speed(iter/s)": 0.12352 }, { "epoch": 1.6228979497811564, "eval_loss": 0.26066553592681885, "eval_runtime": 29.7468, "eval_samples_per_second": 17.683, "eval_steps_per_second": 4.437, "eval_token_acc": 0.9182663207955036, "step": 1320 }, { "epoch": 1.6290409275896491, "grad_norm": 0.760425329208374, "learning_rate": 4.3226344991915936e-06, "loss": 0.18798611164093018, "memory(GiB)": 31.97, "step": 1325, "token_acc": 0.9204100274028215, "train_speed(iter/s)": 0.123163 }, { "epoch": 1.6351839053981418, "grad_norm": 0.8320059776306152, "learning_rate": 4.290743986995353e-06, "loss": 0.20692143440246583, "memory(GiB)": 31.97, "step": 1330, "token_acc": 0.9179834090460202, "train_speed(iter/s)": 0.123278 }, { "epoch": 1.6413268832066343, "grad_norm": 0.9049168229103088, "learning_rate": 4.258882893106308e-06, "loss": 0.18184820413589478, "memory(GiB)": 31.97, "step": 1335, "token_acc": 0.9419431279620853, "train_speed(iter/s)": 0.123369 }, { "epoch": 1.647469861015127, "grad_norm": 0.8740628361701965, "learning_rate": 4.227052539049312e-06, "loss": 0.1948437809944153, "memory(GiB)": 31.97, "step": 1340, "token_acc": 0.9350572326671016, "train_speed(iter/s)": 0.123484 }, { "epoch": 1.647469861015127, "eval_loss": 0.2610000967979431, "eval_runtime": 29.6634, "eval_samples_per_second": 17.732, "eval_steps_per_second": 4.45, "eval_token_acc": 0.9179132439832829, "step": 1340 }, { "epoch": 1.6536128388236198, "grad_norm": 0.7111175656318665, "learning_rate": 4.195254245074196e-06, "loss": 0.17852287292480468, "memory(GiB)": 31.97, "step": 1345, "token_acc": 0.9160269612432129, "train_speed(iter/s)": 0.123131 }, { "epoch": 1.6597558166321125, "grad_norm": 0.7991046905517578, "learning_rate": 4.163489330101017e-06, "loss": 0.1986152410507202, "memory(GiB)": 31.97, "step": 1350, "token_acc": 0.93740389861614, "train_speed(iter/s)": 0.123232 }, { "epoch": 1.6658987944406052, "grad_norm": 0.8582780957221985, "learning_rate": 4.131759111665349e-06, "loss": 0.18987109661102294, "memory(GiB)": 31.97, "step": 1355, "token_acc": 0.9441934490194065, "train_speed(iter/s)": 0.12331 }, { "epoch": 1.6720417722490977, "grad_norm": 0.6932474970817566, "learning_rate": 4.100064905863628e-06, "loss": 0.19035787582397462, "memory(GiB)": 31.97, "step": 1360, "token_acc": 0.9174038315725623, "train_speed(iter/s)": 0.123414 }, { "epoch": 1.6720417722490977, "eval_loss": 0.26117271184921265, "eval_runtime": 29.681, "eval_samples_per_second": 17.722, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9178772157371379, "step": 1360 }, { "epoch": 1.6781847500575904, "grad_norm": 0.8091769814491272, "learning_rate": 4.068408027298576e-06, "loss": 0.20030708312988282, "memory(GiB)": 31.97, "step": 1365, "token_acc": 0.921018299777864, "train_speed(iter/s)": 0.123084 }, { "epoch": 1.684327727866083, "grad_norm": 0.8656709790229797, "learning_rate": 4.036789789024659e-06, "loss": 0.17970023155212403, "memory(GiB)": 31.97, "step": 1370, "token_acc": 0.9361972662458562, "train_speed(iter/s)": 0.123185 }, { "epoch": 1.6904707056745758, "grad_norm": 0.8730081915855408, "learning_rate": 4.00521150249364e-06, "loss": 0.20007739067077637, "memory(GiB)": 31.97, "step": 1375, "token_acc": 0.9322461977708231, "train_speed(iter/s)": 0.123308 }, { "epoch": 1.6966136834830685, "grad_norm": 0.8743943572044373, "learning_rate": 3.973674477500172e-06, "loss": 0.19028009176254274, "memory(GiB)": 31.97, "step": 1380, "token_acc": 0.9364455364455364, "train_speed(iter/s)": 0.123436 }, { "epoch": 1.6966136834830685, "eval_loss": 0.2610381841659546, "eval_runtime": 29.6856, "eval_samples_per_second": 17.719, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9183167603401067, "step": 1380 }, { "epoch": 1.702756661291561, "grad_norm": 0.804136335849762, "learning_rate": 3.942180022127475e-06, "loss": 0.16746077537536622, "memory(GiB)": 31.97, "step": 1385, "token_acc": 0.9190062765437667, "train_speed(iter/s)": 0.12311 }, { "epoch": 1.7088996391000537, "grad_norm": 0.7739353775978088, "learning_rate": 3.910729442693077e-06, "loss": 0.20771589279174804, "memory(GiB)": 31.97, "step": 1390, "token_acc": 0.9368890897790836, "train_speed(iter/s)": 0.123227 }, { "epoch": 1.7150426169085464, "grad_norm": 0.7843467593193054, "learning_rate": 3.8793240436946385e-06, "loss": 0.1791388511657715, "memory(GiB)": 31.97, "step": 1395, "token_acc": 0.9298795912172417, "train_speed(iter/s)": 0.123311 }, { "epoch": 1.7211855947170391, "grad_norm": 0.8047447800636292, "learning_rate": 3.847965127755834e-06, "loss": 0.1962286114692688, "memory(GiB)": 31.97, "step": 1400, "token_acc": 0.9289617486338798, "train_speed(iter/s)": 0.123422 }, { "epoch": 1.7211855947170391, "eval_loss": 0.25994521379470825, "eval_runtime": 29.6994, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9187274823461594, "step": 1400 }, { "epoch": 1.7273285725255318, "grad_norm": 0.7783213257789612, "learning_rate": 3.816653995572332e-06, "loss": 0.1934323787689209, "memory(GiB)": 31.97, "step": 1405, "token_acc": 0.9191889097250214, "train_speed(iter/s)": 0.123105 }, { "epoch": 1.7334715503340243, "grad_norm": 0.9164287447929382, "learning_rate": 3.7853919458578327e-06, "loss": 0.1951138973236084, "memory(GiB)": 31.97, "step": 1410, "token_acc": 0.9345938875014865, "train_speed(iter/s)": 0.123227 }, { "epoch": 1.739614528142517, "grad_norm": 0.6820860505104065, "learning_rate": 3.7541802752902224e-06, "loss": 0.1772141695022583, "memory(GiB)": 31.97, "step": 1415, "token_acc": 0.9298903956901357, "train_speed(iter/s)": 0.123313 }, { "epoch": 1.7457575059510098, "grad_norm": 0.900181770324707, "learning_rate": 3.723020278457763e-06, "loss": 0.1944177269935608, "memory(GiB)": 31.97, "step": 1420, "token_acc": 0.9333839438223572, "train_speed(iter/s)": 0.123418 }, { "epoch": 1.7457575059510098, "eval_loss": 0.2598145008087158, "eval_runtime": 29.6524, "eval_samples_per_second": 17.739, "eval_steps_per_second": 4.452, "eval_token_acc": 0.9181582360570687, "step": 1420 }, { "epoch": 1.7519004837595025, "grad_norm": 0.8312418460845947, "learning_rate": 3.6919132478054153e-06, "loss": 0.2060741662979126, "memory(GiB)": 31.97, "step": 1425, "token_acc": 0.9155994474106709, "train_speed(iter/s)": 0.123098 }, { "epoch": 1.7580434615679952, "grad_norm": 0.8780198097229004, "learning_rate": 3.6608604735812226e-06, "loss": 0.1988367795944214, "memory(GiB)": 31.97, "step": 1430, "token_acc": 0.9336537924095915, "train_speed(iter/s)": 0.123197 }, { "epoch": 1.7641864393764877, "grad_norm": 0.8375983834266663, "learning_rate": 3.629863243782799e-06, "loss": 0.20454792976379393, "memory(GiB)": 31.97, "step": 1435, "token_acc": 0.9179058065245661, "train_speed(iter/s)": 0.123317 }, { "epoch": 1.7703294171849804, "grad_norm": 0.7774383425712585, "learning_rate": 3.5989228441039024e-06, "loss": 0.1952831268310547, "memory(GiB)": 31.97, "step": 1440, "token_acc": 0.9264312326179357, "train_speed(iter/s)": 0.123413 }, { "epoch": 1.7703294171849804, "eval_loss": 0.2601591646671295, "eval_runtime": 29.6923, "eval_samples_per_second": 17.715, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9185257241677475, "step": 1440 }, { "epoch": 1.776472394993473, "grad_norm": 0.8040404915809631, "learning_rate": 3.568040557881106e-06, "loss": 0.18471212387084962, "memory(GiB)": 31.97, "step": 1445, "token_acc": 0.9204298276599304, "train_speed(iter/s)": 0.123124 }, { "epoch": 1.7826153728019658, "grad_norm": 0.7603934407234192, "learning_rate": 3.5372176660405717e-06, "loss": 0.19175269603729247, "memory(GiB)": 31.97, "step": 1450, "token_acc": 0.9407587455914593, "train_speed(iter/s)": 0.123205 }, { "epoch": 1.7887583506104585, "grad_norm": 0.8216592073440552, "learning_rate": 3.506455447044923e-06, "loss": 0.18449797630310058, "memory(GiB)": 31.97, "step": 1455, "token_acc": 0.9232101076275152, "train_speed(iter/s)": 0.12331 }, { "epoch": 1.794901328418951, "grad_norm": 0.8587454557418823, "learning_rate": 3.4757551768402074e-06, "loss": 0.1803336501121521, "memory(GiB)": 31.97, "step": 1460, "token_acc": 0.9327302250057992, "train_speed(iter/s)": 0.123414 }, { "epoch": 1.794901328418951, "eval_loss": 0.25998592376708984, "eval_runtime": 29.6543, "eval_samples_per_second": 17.738, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9187851275399913, "step": 1460 }, { "epoch": 1.8010443062274437, "grad_norm": 0.6978159546852112, "learning_rate": 3.4451181288029834e-06, "loss": 0.17668429613113404, "memory(GiB)": 31.97, "step": 1465, "token_acc": 0.9205462088038718, "train_speed(iter/s)": 0.123097 }, { "epoch": 1.8071872840359364, "grad_norm": 0.7977909445762634, "learning_rate": 3.4145455736874957e-06, "loss": 0.20127489566802978, "memory(GiB)": 31.97, "step": 1470, "token_acc": 0.9337199247164149, "train_speed(iter/s)": 0.123204 }, { "epoch": 1.8133302618444291, "grad_norm": 0.847145140171051, "learning_rate": 3.3840387795729753e-06, "loss": 0.1935032606124878, "memory(GiB)": 31.97, "step": 1475, "token_acc": 0.9318121092288784, "train_speed(iter/s)": 0.123311 }, { "epoch": 1.8194732396529218, "grad_norm": 0.8690524697303772, "learning_rate": 3.353599011811037e-06, "loss": 0.19352041482925414, "memory(GiB)": 31.97, "step": 1480, "token_acc": 0.9311466218110457, "train_speed(iter/s)": 0.123409 }, { "epoch": 1.8194732396529218, "eval_loss": 0.259257048368454, "eval_runtime": 29.6405, "eval_samples_per_second": 17.746, "eval_steps_per_second": 4.453, "eval_token_acc": 0.9186266032569534, "step": 1480 }, { "epoch": 1.8256162174614143, "grad_norm": 0.8610875010490417, "learning_rate": 3.323227532973193e-06, "loss": 0.18993620872497557, "memory(GiB)": 31.97, "step": 1485, "token_acc": 0.9163555740842508, "train_speed(iter/s)": 0.123098 }, { "epoch": 1.831759195269907, "grad_norm": 0.9433273673057556, "learning_rate": 3.292925602798492e-06, "loss": 0.1930912494659424, "memory(GiB)": 31.97, "step": 1490, "token_acc": 0.9382497082847141, "train_speed(iter/s)": 0.123201 }, { "epoch": 1.8379021730783998, "grad_norm": 0.7419607043266296, "learning_rate": 3.262694478141266e-06, "loss": 0.1879183053970337, "memory(GiB)": 31.97, "step": 1495, "token_acc": 0.9291406527587432, "train_speed(iter/s)": 0.123298 }, { "epoch": 1.8440451508868925, "grad_norm": 0.8275686502456665, "learning_rate": 3.2325354129189923e-06, "loss": 0.19919825792312623, "memory(GiB)": 31.97, "step": 1500, "token_acc": 0.9369797252438589, "train_speed(iter/s)": 0.123397 }, { "epoch": 1.8440451508868925, "eval_loss": 0.2576294541358948, "eval_runtime": 29.692, "eval_samples_per_second": 17.715, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9190301196137772, "step": 1500 }, { "epoch": 1.8501881286953852, "grad_norm": 0.7494759559631348, "learning_rate": 3.2024496580602892e-06, "loss": 0.1704793930053711, "memory(GiB)": 31.97, "step": 1505, "token_acc": 0.920035804863494, "train_speed(iter/s)": 0.123086 }, { "epoch": 1.8563311065038777, "grad_norm": 0.7116587162017822, "learning_rate": 3.172438461453032e-06, "loss": 0.19869464635849, "memory(GiB)": 31.97, "step": 1510, "token_acc": 0.9331588853693247, "train_speed(iter/s)": 0.123201 }, { "epoch": 1.8624740843123704, "grad_norm": 0.8266251087188721, "learning_rate": 3.142503067892594e-06, "loss": 0.18929662704467773, "memory(GiB)": 31.97, "step": 1515, "token_acc": 0.9361521750649076, "train_speed(iter/s)": 0.123284 }, { "epoch": 1.868617062120863, "grad_norm": 0.8856237530708313, "learning_rate": 3.112644719030206e-06, "loss": 0.1765504837036133, "memory(GiB)": 31.97, "step": 1520, "token_acc": 0.9446170019591915, "train_speed(iter/s)": 0.123379 }, { "epoch": 1.868617062120863, "eval_loss": 0.2572629451751709, "eval_runtime": 29.7297, "eval_samples_per_second": 17.693, "eval_steps_per_second": 4.44, "eval_token_acc": 0.9187563049430754, "step": 1520 }, { "epoch": 1.8747600399293558, "grad_norm": 0.7266517281532288, "learning_rate": 3.0828646533214657e-06, "loss": 0.18753888607025146, "memory(GiB)": 31.97, "step": 1525, "token_acc": 0.921090387374462, "train_speed(iter/s)": 0.123072 }, { "epoch": 1.8809030177378485, "grad_norm": 0.7010701298713684, "learning_rate": 3.053164105974964e-06, "loss": 0.18214144706726074, "memory(GiB)": 31.97, "step": 1530, "token_acc": 0.9355687362479671, "train_speed(iter/s)": 0.123181 }, { "epoch": 1.887045995546341, "grad_norm": 0.8133987188339233, "learning_rate": 3.0235443089010564e-06, "loss": 0.19535071849823, "memory(GiB)": 31.97, "step": 1535, "token_acc": 0.9338049036944989, "train_speed(iter/s)": 0.123268 }, { "epoch": 1.8931889733548337, "grad_norm": 0.7620673775672913, "learning_rate": 2.9940064906607607e-06, "loss": 0.19279036521911622, "memory(GiB)": 31.97, "step": 1540, "token_acc": 0.9357547655847501, "train_speed(iter/s)": 0.123366 }, { "epoch": 1.8931889733548337, "eval_loss": 0.2575133144855499, "eval_runtime": 29.6891, "eval_samples_per_second": 17.717, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9187923331892204, "step": 1540 }, { "epoch": 1.8993319511633264, "grad_norm": 0.8368042707443237, "learning_rate": 2.964551876414801e-06, "loss": 0.186897873878479, "memory(GiB)": 31.97, "step": 1545, "token_acc": 0.9212080946652063, "train_speed(iter/s)": 0.123086 }, { "epoch": 1.9054749289718191, "grad_norm": 0.7964156270027161, "learning_rate": 2.93518168787279e-06, "loss": 0.1861191511154175, "memory(GiB)": 31.97, "step": 1550, "token_acc": 0.9394473838918284, "train_speed(iter/s)": 0.123163 }, { "epoch": 1.9116179067803118, "grad_norm": 0.9378722906112671, "learning_rate": 2.905897143242562e-06, "loss": 0.197173810005188, "memory(GiB)": 31.97, "step": 1555, "token_acc": 0.9388659543467702, "train_speed(iter/s)": 0.123262 }, { "epoch": 1.9177608845888043, "grad_norm": 0.8486573100090027, "learning_rate": 2.8766994571796336e-06, "loss": 0.18908753395080566, "memory(GiB)": 31.97, "step": 1560, "token_acc": 0.9474116680361545, "train_speed(iter/s)": 0.123349 }, { "epoch": 1.9177608845888043, "eval_loss": 0.25726109743118286, "eval_runtime": 29.6999, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9191309987029831, "step": 1560 }, { "epoch": 1.923903862397297, "grad_norm": 0.8716571927070618, "learning_rate": 2.8475898407368298e-06, "loss": 0.18810817003250122, "memory(GiB)": 31.97, "step": 1565, "token_acc": 0.9179941342227845, "train_speed(iter/s)": 0.123063 }, { "epoch": 1.9300468402057898, "grad_norm": 0.794176459312439, "learning_rate": 2.8185695013140474e-06, "loss": 0.17928617000579833, "memory(GiB)": 31.97, "step": 1570, "token_acc": 0.9303826916366175, "train_speed(iter/s)": 0.123136 }, { "epoch": 1.9361898180142825, "grad_norm": 0.7357900142669678, "learning_rate": 2.7896396426081844e-06, "loss": 0.18468384742736815, "memory(GiB)": 31.97, "step": 1575, "token_acc": 0.9476946498477599, "train_speed(iter/s)": 0.123228 }, { "epoch": 1.9423327958227752, "grad_norm": 0.8071329593658447, "learning_rate": 2.7608014645632e-06, "loss": 0.1790044903755188, "memory(GiB)": 31.97, "step": 1580, "token_acc": 0.938118933832586, "train_speed(iter/s)": 0.123325 }, { "epoch": 1.9423327958227752, "eval_loss": 0.25785306096076965, "eval_runtime": 29.6989, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9192462890906471, "step": 1580 }, { "epoch": 1.9484757736312677, "grad_norm": 0.8714718818664551, "learning_rate": 2.7320561633203567e-06, "loss": 0.19142614603042601, "memory(GiB)": 31.97, "step": 1585, "token_acc": 0.9180685641538602, "train_speed(iter/s)": 0.123048 }, { "epoch": 1.9546187514397604, "grad_norm": 0.7642494440078735, "learning_rate": 2.703404931168594e-06, "loss": 0.1714502215385437, "memory(GiB)": 31.97, "step": 1590, "token_acc": 0.9387637940932576, "train_speed(iter/s)": 0.123137 }, { "epoch": 1.960761729248253, "grad_norm": 0.7690022587776184, "learning_rate": 2.6748489564950907e-06, "loss": 0.1712334156036377, "memory(GiB)": 31.97, "step": 1595, "token_acc": 0.9381139489194499, "train_speed(iter/s)": 0.123216 }, { "epoch": 1.9669047070567458, "grad_norm": 0.789311945438385, "learning_rate": 2.6463894237359556e-06, "loss": 0.1898505687713623, "memory(GiB)": 31.97, "step": 1600, "token_acc": 0.9361558383064971, "train_speed(iter/s)": 0.123315 }, { "epoch": 1.9669047070567458, "eval_loss": 0.25641384720802307, "eval_runtime": 29.7239, "eval_samples_per_second": 17.696, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9191237930537541, "step": 1600 }, { "epoch": 1.9730476848652385, "grad_norm": 0.8089138865470886, "learning_rate": 2.618027513327116e-06, "loss": 0.18109874725341796, "memory(GiB)": 31.97, "step": 1605, "token_acc": 0.923689472311571, "train_speed(iter/s)": 0.12303 }, { "epoch": 1.979190662673731, "grad_norm": 0.8698074221611023, "learning_rate": 2.589764401655343e-06, "loss": 0.183346688747406, "memory(GiB)": 31.97, "step": 1610, "token_acc": 0.9436480028852222, "train_speed(iter/s)": 0.123111 }, { "epoch": 1.9853336404822237, "grad_norm": 0.8340507745742798, "learning_rate": 2.5616012610094702e-06, "loss": 0.19840478897094727, "memory(GiB)": 31.97, "step": 1615, "token_acc": 0.931529030765672, "train_speed(iter/s)": 0.123193 }, { "epoch": 1.9914766182907164, "grad_norm": 0.9170531034469604, "learning_rate": 2.533539259531757e-06, "loss": 0.20222840309143067, "memory(GiB)": 31.97, "step": 1620, "token_acc": 0.9312559145599567, "train_speed(iter/s)": 0.123282 }, { "epoch": 1.9914766182907164, "eval_loss": 0.2562500834465027, "eval_runtime": 29.7389, "eval_samples_per_second": 17.687, "eval_steps_per_second": 4.439, "eval_token_acc": 0.91929672863525, "step": 1620 }, { "epoch": 1.9976195960992091, "grad_norm": 0.8579339385032654, "learning_rate": 2.5055795611694435e-06, "loss": 0.17736260890960692, "memory(GiB)": 31.97, "step": 1625, "token_acc": 0.9170028818443804, "train_speed(iter/s)": 0.123032 }, { "epoch": 2.004914382246794, "grad_norm": 0.6278606057167053, "learning_rate": 2.4777233256264743e-06, "loss": 0.20010933876037598, "memory(GiB)": 31.97, "step": 1630, "token_acc": 0.947276073094535, "train_speed(iter/s)": 0.123086 }, { "epoch": 2.011057360055287, "grad_norm": 0.6573106050491333, "learning_rate": 2.4499717083153975e-06, "loss": 0.1415931224822998, "memory(GiB)": 31.97, "step": 1635, "token_acc": 0.9564882032667876, "train_speed(iter/s)": 0.123172 }, { "epoch": 2.0172003378637795, "grad_norm": 0.862938404083252, "learning_rate": 2.4223258603094295e-06, "loss": 0.16473679542541503, "memory(GiB)": 31.97, "step": 1640, "token_acc": 0.9506443652316973, "train_speed(iter/s)": 0.123277 }, { "epoch": 2.0172003378637795, "eval_loss": 0.2674296498298645, "eval_runtime": 29.6424, "eval_samples_per_second": 17.745, "eval_steps_per_second": 4.453, "eval_token_acc": 0.9185977806600375, "step": 1640 }, { "epoch": 2.023343315672272, "grad_norm": 0.8443688750267029, "learning_rate": 2.3947869282947263e-06, "loss": 0.14469457864761354, "memory(GiB)": 31.97, "step": 1645, "token_acc": 0.9298440219802724, "train_speed(iter/s)": 0.122997 }, { "epoch": 2.029486293480765, "grad_norm": 0.8116291761398315, "learning_rate": 2.3673560545228082e-06, "loss": 0.14491933584213257, "memory(GiB)": 31.97, "step": 1650, "token_acc": 0.9576881945413122, "train_speed(iter/s)": 0.12308 }, { "epoch": 2.0356292712892574, "grad_norm": 0.6990134119987488, "learning_rate": 2.3400343767631943e-06, "loss": 0.1429425835609436, "memory(GiB)": 31.97, "step": 1655, "token_acc": 0.9598897189612238, "train_speed(iter/s)": 0.12317 }, { "epoch": 2.0417722490977503, "grad_norm": 0.7768262624740601, "learning_rate": 2.312823028256205e-06, "loss": 0.13332735300064086, "memory(GiB)": 31.97, "step": 1660, "token_acc": 0.955598381190981, "train_speed(iter/s)": 0.123249 }, { "epoch": 2.0417722490977503, "eval_loss": 0.2723671495914459, "eval_runtime": 29.6708, "eval_samples_per_second": 17.728, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9178267761925349, "step": 1660 }, { "epoch": 2.047915226906243, "grad_norm": 0.8455916047096252, "learning_rate": 2.2857231376659517e-06, "loss": 0.13110907077789308, "memory(GiB)": 31.97, "step": 1665, "token_acc": 0.9266902441777343, "train_speed(iter/s)": 0.12298 }, { "epoch": 2.0540582047147353, "grad_norm": 0.8857413530349731, "learning_rate": 2.258735829033529e-06, "loss": 0.16349921226501465, "memory(GiB)": 31.97, "step": 1670, "token_acc": 0.9470571801080275, "train_speed(iter/s)": 0.123074 }, { "epoch": 2.0602011825232283, "grad_norm": 0.8224142789840698, "learning_rate": 2.231862221730394e-06, "loss": 0.1457624077796936, "memory(GiB)": 31.97, "step": 1675, "token_acc": 0.9467787114845938, "train_speed(iter/s)": 0.123144 }, { "epoch": 2.0663441603317207, "grad_norm": 0.7867154479026794, "learning_rate": 2.2051034304119344e-06, "loss": 0.13943665027618407, "memory(GiB)": 31.97, "step": 1680, "token_acc": 0.9487998351704955, "train_speed(iter/s)": 0.123235 }, { "epoch": 2.0663441603317207, "eval_loss": 0.27172932028770447, "eval_runtime": 29.8181, "eval_samples_per_second": 17.64, "eval_steps_per_second": 4.427, "eval_token_acc": 0.9178844213863669, "step": 1680 }, { "epoch": 2.0724871381402137, "grad_norm": 0.7623206973075867, "learning_rate": 2.1784605649712326e-06, "loss": 0.14780081510543824, "memory(GiB)": 31.97, "step": 1685, "token_acc": 0.9247235706580367, "train_speed(iter/s)": 0.122983 }, { "epoch": 2.078630115948706, "grad_norm": 0.8290310502052307, "learning_rate": 2.1519347304930317e-06, "loss": 0.1389237880706787, "memory(GiB)": 31.97, "step": 1690, "token_acc": 0.9470190895741557, "train_speed(iter/s)": 0.123069 }, { "epoch": 2.0847730937571987, "grad_norm": 0.7647258639335632, "learning_rate": 2.1255270272079044e-06, "loss": 0.14199459552764893, "memory(GiB)": 31.97, "step": 1695, "token_acc": 0.9419040287400564, "train_speed(iter/s)": 0.123168 }, { "epoch": 2.0909160715656916, "grad_norm": 0.8620509505271912, "learning_rate": 2.0992385504466075e-06, "loss": 0.14582890272140503, "memory(GiB)": 31.97, "step": 1700, "token_acc": 0.9481878509443593, "train_speed(iter/s)": 0.123261 }, { "epoch": 2.0909160715656916, "eval_loss": 0.27268144488334656, "eval_runtime": 29.6628, "eval_samples_per_second": 17.733, "eval_steps_per_second": 4.45, "eval_token_acc": 0.9178772157371379, "step": 1700 }, { "epoch": 2.097059049374184, "grad_norm": 0.7256646752357483, "learning_rate": 2.0730703905946612e-06, "loss": 0.14624775648117067, "memory(GiB)": 31.97, "step": 1705, "token_acc": 0.9200135124990616, "train_speed(iter/s)": 0.123001 }, { "epoch": 2.103202027182677, "grad_norm": 0.7388427257537842, "learning_rate": 2.0470236330471125e-06, "loss": 0.11770030260086059, "memory(GiB)": 31.97, "step": 1710, "token_acc": 0.9668929503916449, "train_speed(iter/s)": 0.123069 }, { "epoch": 2.1093450049911695, "grad_norm": 0.8730806112289429, "learning_rate": 2.0210993581635257e-06, "loss": 0.16097368001937867, "memory(GiB)": 31.97, "step": 1715, "token_acc": 0.9433831352051436, "train_speed(iter/s)": 0.123169 }, { "epoch": 2.115487982799662, "grad_norm": 0.7302968502044678, "learning_rate": 1.9952986412231612e-06, "loss": 0.1270466446876526, "memory(GiB)": 31.97, "step": 1720, "token_acc": 0.9573284772123241, "train_speed(iter/s)": 0.123225 }, { "epoch": 2.115487982799662, "eval_loss": 0.27457520365715027, "eval_runtime": 29.6821, "eval_samples_per_second": 17.721, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9178555987894509, "step": 1720 }, { "epoch": 2.121630960608155, "grad_norm": 1.026114821434021, "learning_rate": 1.9696225523803803e-06, "loss": 0.1560563325881958, "memory(GiB)": 31.97, "step": 1725, "token_acc": 0.9226966883434199, "train_speed(iter/s)": 0.12297 }, { "epoch": 2.1277739384166474, "grad_norm": 0.8650787472724915, "learning_rate": 1.944072156620261e-06, "loss": 0.13898645639419555, "memory(GiB)": 31.97, "step": 1730, "token_acc": 0.9579385943157581, "train_speed(iter/s)": 0.12307 }, { "epoch": 2.1339169162251403, "grad_norm": 0.6428267955780029, "learning_rate": 1.9186485137144217e-06, "loss": 0.15046895742416383, "memory(GiB)": 31.97, "step": 1735, "token_acc": 0.9573064770932069, "train_speed(iter/s)": 0.123153 }, { "epoch": 2.140059894033633, "grad_norm": 0.7333597540855408, "learning_rate": 1.89335267817706e-06, "loss": 0.1286949872970581, "memory(GiB)": 31.97, "step": 1740, "token_acc": 0.9547167656464138, "train_speed(iter/s)": 0.123228 }, { "epoch": 2.140059894033633, "eval_loss": 0.2735785245895386, "eval_runtime": 29.6624, "eval_samples_per_second": 17.733, "eval_steps_per_second": 4.45, "eval_token_acc": 0.917639429312581, "step": 1740 }, { "epoch": 2.1462028718421253, "grad_norm": 0.8134050369262695, "learning_rate": 1.8681856992212211e-06, "loss": 0.1448550343513489, "memory(GiB)": 31.97, "step": 1745, "token_acc": 0.9220646406174626, "train_speed(iter/s)": 0.122967 }, { "epoch": 2.1523458496506183, "grad_norm": 0.6919598579406738, "learning_rate": 1.8431486207152704e-06, "loss": 0.12585388422012328, "memory(GiB)": 31.97, "step": 1750, "token_acc": 0.9565885062902368, "train_speed(iter/s)": 0.12304 }, { "epoch": 2.1584888274591107, "grad_norm": 0.7137647271156311, "learning_rate": 1.8182424811396131e-06, "loss": 0.13218532800674437, "memory(GiB)": 31.97, "step": 1755, "token_acc": 0.9553192383674499, "train_speed(iter/s)": 0.123112 }, { "epoch": 2.1646318052676037, "grad_norm": 0.8254464864730835, "learning_rate": 1.7934683135435993e-06, "loss": 0.15681140422821044, "memory(GiB)": 31.97, "step": 1760, "token_acc": 0.9500492764147518, "train_speed(iter/s)": 0.123204 }, { "epoch": 2.1646318052676037, "eval_loss": 0.2743065655231476, "eval_runtime": 29.7411, "eval_samples_per_second": 17.686, "eval_steps_per_second": 4.438, "eval_token_acc": 0.91786280443868, "step": 1760 }, { "epoch": 2.170774783076096, "grad_norm": 0.8115731477737427, "learning_rate": 1.7688271455026867e-06, "loss": 0.15295430421829223, "memory(GiB)": 31.97, "step": 1765, "token_acc": 0.9211377831289036, "train_speed(iter/s)": 0.122963 }, { "epoch": 2.1769177608845887, "grad_norm": 0.7977039813995361, "learning_rate": 1.7443199990758168e-06, "loss": 0.14479312896728516, "memory(GiB)": 31.97, "step": 1770, "token_acc": 0.9533281533281533, "train_speed(iter/s)": 0.123053 }, { "epoch": 2.1830607386930816, "grad_norm": 0.8783808350563049, "learning_rate": 1.7199478907630269e-06, "loss": 0.14664456844329835, "memory(GiB)": 31.97, "step": 1775, "token_acc": 0.9566591882520905, "train_speed(iter/s)": 0.123142 }, { "epoch": 2.189203716501574, "grad_norm": 0.737375795841217, "learning_rate": 1.6957118314632825e-06, "loss": 0.12802677154541015, "memory(GiB)": 31.97, "step": 1780, "token_acc": 0.9474123975142305, "train_speed(iter/s)": 0.12324 }, { "epoch": 2.189203716501574, "eval_loss": 0.27472689747810364, "eval_runtime": 29.7215, "eval_samples_per_second": 17.698, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9179997117740308, "step": 1780 }, { "epoch": 2.195346694310067, "grad_norm": 0.9062975645065308, "learning_rate": 1.6716128264325477e-06, "loss": 0.1491732716560364, "memory(GiB)": 31.97, "step": 1785, "token_acc": 0.9252631765812785, "train_speed(iter/s)": 0.123004 }, { "epoch": 2.2014896721185595, "grad_norm": 0.9112216830253601, "learning_rate": 1.64765187524209e-06, "loss": 0.14550890922546386, "memory(GiB)": 31.97, "step": 1790, "token_acc": 0.9464007023019898, "train_speed(iter/s)": 0.123075 }, { "epoch": 2.207632649927052, "grad_norm": 0.822755753993988, "learning_rate": 1.6238299717370254e-06, "loss": 0.14908239841461182, "memory(GiB)": 31.97, "step": 1795, "token_acc": 0.9550466874166296, "train_speed(iter/s)": 0.123153 }, { "epoch": 2.213775627735545, "grad_norm": 0.9669148921966553, "learning_rate": 1.6001481039950872e-06, "loss": 0.14041876792907715, "memory(GiB)": 31.97, "step": 1800, "token_acc": 0.9552882955460927, "train_speed(iter/s)": 0.123227 }, { "epoch": 2.213775627735545, "eval_loss": 0.2743581235408783, "eval_runtime": 29.714, "eval_samples_per_second": 17.702, "eval_steps_per_second": 4.442, "eval_token_acc": 0.9181150021616947, "step": 1800 }, { "epoch": 2.2199186055440374, "grad_norm": 0.7515302896499634, "learning_rate": 1.5766072542856525e-06, "loss": 0.1314539670944214, "memory(GiB)": 31.97, "step": 1805, "token_acc": 0.9241629064430544, "train_speed(iter/s)": 0.122981 }, { "epoch": 2.2260615833525303, "grad_norm": 0.8223626613616943, "learning_rate": 1.5532083990289892e-06, "loss": 0.1447986364364624, "memory(GiB)": 31.97, "step": 1810, "token_acc": 0.9575009707900073, "train_speed(iter/s)": 0.123073 }, { "epoch": 2.232204561161023, "grad_norm": 0.8387971520423889, "learning_rate": 1.5299525087557682e-06, "loss": 0.12721827030181884, "memory(GiB)": 31.97, "step": 1815, "token_acc": 0.9589310504396112, "train_speed(iter/s)": 0.123144 }, { "epoch": 2.2383475389695153, "grad_norm": 0.8764155507087708, "learning_rate": 1.5068405480667975e-06, "loss": 0.1495474696159363, "memory(GiB)": 31.97, "step": 1820, "token_acc": 0.9530952884005915, "train_speed(iter/s)": 0.123223 }, { "epoch": 2.2383475389695153, "eval_loss": 0.2754988670349121, "eval_runtime": 29.5873, "eval_samples_per_second": 17.778, "eval_steps_per_second": 4.461, "eval_token_acc": 0.9176898688571841, "step": 1820 }, { "epoch": 2.2444905167780083, "grad_norm": 0.8443633317947388, "learning_rate": 1.4838734755930168e-06, "loss": 0.14514811038970948, "memory(GiB)": 31.97, "step": 1825, "token_acc": 0.9260831823671497, "train_speed(iter/s)": 0.122984 }, { "epoch": 2.2506334945865007, "grad_norm": 0.8468235731124878, "learning_rate": 1.461052243955739e-06, "loss": 0.14231607913970948, "memory(GiB)": 31.97, "step": 1830, "token_acc": 0.9431918169819622, "train_speed(iter/s)": 0.123052 }, { "epoch": 2.2567764723949937, "grad_norm": 0.8009449243545532, "learning_rate": 1.4383777997271347e-06, "loss": 0.13485580682754517, "memory(GiB)": 31.97, "step": 1835, "token_acc": 0.954977119519756, "train_speed(iter/s)": 0.123135 }, { "epoch": 2.262919450203486, "grad_norm": 0.8349820971488953, "learning_rate": 1.4158510833909688e-06, "loss": 0.1553872346878052, "memory(GiB)": 31.97, "step": 1840, "token_acc": 0.9466980320156062, "train_speed(iter/s)": 0.12321 }, { "epoch": 2.262919450203486, "eval_loss": 0.2755924463272095, "eval_runtime": 29.7545, "eval_samples_per_second": 17.678, "eval_steps_per_second": 4.436, "eval_token_acc": 0.9181582360570687, "step": 1840 }, { "epoch": 2.2690624280119787, "grad_norm": 0.9547154903411865, "learning_rate": 1.3934730293035935e-06, "loss": 0.1530256986618042, "memory(GiB)": 31.97, "step": 1845, "token_acc": 0.9236753100338219, "train_speed(iter/s)": 0.122997 }, { "epoch": 2.2752054058204716, "grad_norm": 0.8379245400428772, "learning_rate": 1.3712445656551904e-06, "loss": 0.14752573966979982, "memory(GiB)": 31.97, "step": 1850, "token_acc": 0.9471953309555793, "train_speed(iter/s)": 0.123078 }, { "epoch": 2.281348383628964, "grad_norm": 0.7745286822319031, "learning_rate": 1.349166614431282e-06, "loss": 0.13339710235595703, "memory(GiB)": 31.97, "step": 1855, "token_acc": 0.9574489743981269, "train_speed(iter/s)": 0.123139 }, { "epoch": 2.287491361437457, "grad_norm": 0.7709481120109558, "learning_rate": 1.3272400913744744e-06, "loss": 0.13953914642333984, "memory(GiB)": 31.97, "step": 1860, "token_acc": 0.9522856703093736, "train_speed(iter/s)": 0.123224 }, { "epoch": 2.287491361437457, "eval_loss": 0.27358269691467285, "eval_runtime": 29.7058, "eval_samples_per_second": 17.707, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9181294134601528, "step": 1860 }, { "epoch": 2.2936343392459495, "grad_norm": 0.9447105526924133, "learning_rate": 1.3054659059464836e-06, "loss": 0.1305554747581482, "memory(GiB)": 31.97, "step": 1865, "token_acc": 0.9289236364999907, "train_speed(iter/s)": 0.122979 }, { "epoch": 2.299777317054442, "grad_norm": 0.8118374347686768, "learning_rate": 1.2838449612904108e-06, "loss": 0.14541189670562743, "memory(GiB)": 31.97, "step": 1870, "token_acc": 0.9568722866275464, "train_speed(iter/s)": 0.123069 }, { "epoch": 2.305920294862935, "grad_norm": 0.7850742936134338, "learning_rate": 1.262378154193285e-06, "loss": 0.14573101997375487, "memory(GiB)": 31.97, "step": 1875, "token_acc": 0.9437350591802227, "train_speed(iter/s)": 0.123139 }, { "epoch": 2.3120632726714274, "grad_norm": 0.8586622476577759, "learning_rate": 1.2410663750488644e-06, "loss": 0.1231348991394043, "memory(GiB)": 31.97, "step": 1880, "token_acc": 0.9528933210864716, "train_speed(iter/s)": 0.123207 }, { "epoch": 2.3120632726714274, "eval_loss": 0.27450090646743774, "eval_runtime": 29.7231, "eval_samples_per_second": 17.697, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9181798530047557, "step": 1880 }, { "epoch": 2.3182062504799203, "grad_norm": 0.7418249845504761, "learning_rate": 1.2199105078207002e-06, "loss": 0.15627479553222656, "memory(GiB)": 31.97, "step": 1885, "token_acc": 0.9225045238007312, "train_speed(iter/s)": 0.122962 }, { "epoch": 2.324349228288413, "grad_norm": 0.838173508644104, "learning_rate": 1.1989114300054782e-06, "loss": 0.14288971424102784, "memory(GiB)": 31.97, "step": 1890, "token_acc": 0.94441322229602, "train_speed(iter/s)": 0.123041 }, { "epoch": 2.3304922060969053, "grad_norm": 0.8450609445571899, "learning_rate": 1.1780700125966232e-06, "loss": 0.13255660533905028, "memory(GiB)": 31.97, "step": 1895, "token_acc": 0.9575001607406931, "train_speed(iter/s)": 0.123109 }, { "epoch": 2.3366351839053983, "grad_norm": 0.7407841086387634, "learning_rate": 1.1573871200481634e-06, "loss": 0.1363093614578247, "memory(GiB)": 31.97, "step": 1900, "token_acc": 0.948611310292079, "train_speed(iter/s)": 0.123174 }, { "epoch": 2.3366351839053983, "eval_loss": 0.27403974533081055, "eval_runtime": 29.6863, "eval_samples_per_second": 17.719, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9180429456694048, "step": 1900 }, { "epoch": 2.3427781617138907, "grad_norm": 0.8412113189697266, "learning_rate": 1.136863610238887e-06, "loss": 0.151106858253479, "memory(GiB)": 31.97, "step": 1905, "token_acc": 0.9202309459903064, "train_speed(iter/s)": 0.122952 }, { "epoch": 2.3489211395223837, "grad_norm": 0.7621601819992065, "learning_rate": 1.1165003344367465e-06, "loss": 0.145496666431427, "memory(GiB)": 31.97, "step": 1910, "token_acc": 0.9506583322250299, "train_speed(iter/s)": 0.123038 }, { "epoch": 2.355064117330876, "grad_norm": 0.8177499175071716, "learning_rate": 1.0962981372635629e-06, "loss": 0.13563876152038573, "memory(GiB)": 31.97, "step": 1915, "token_acc": 0.9514687814140511, "train_speed(iter/s)": 0.123121 }, { "epoch": 2.3612070951393687, "grad_norm": 0.8387221693992615, "learning_rate": 1.0762578566599818e-06, "loss": 0.15051798820495604, "memory(GiB)": 31.97, "step": 1920, "token_acc": 0.9480101984258952, "train_speed(iter/s)": 0.123214 }, { "epoch": 2.3612070951393687, "eval_loss": 0.2752765119075775, "eval_runtime": 29.73, "eval_samples_per_second": 17.693, "eval_steps_per_second": 4.44, "eval_token_acc": 0.9179492722294279, "step": 1920 }, { "epoch": 2.3673500729478616, "grad_norm": 0.9462500214576721, "learning_rate": 1.056380323850722e-06, "loss": 0.1329110622406006, "memory(GiB)": 31.97, "step": 1925, "token_acc": 0.9277916379142559, "train_speed(iter/s)": 0.122971 }, { "epoch": 2.373493050756354, "grad_norm": 0.6897282004356384, "learning_rate": 1.0366663633101015e-06, "loss": 0.14667117595672607, "memory(GiB)": 31.97, "step": 1930, "token_acc": 0.9519438953214723, "train_speed(iter/s)": 0.123037 }, { "epoch": 2.379636028564847, "grad_norm": 0.756504476070404, "learning_rate": 1.0171167927278369e-06, "loss": 0.15089083909988404, "memory(GiB)": 31.97, "step": 1935, "token_acc": 0.9499266411093757, "train_speed(iter/s)": 0.123095 }, { "epoch": 2.3857790063733395, "grad_norm": 0.620968222618103, "learning_rate": 9.977324229751245e-07, "loss": 0.13846428394317628, "memory(GiB)": 31.97, "step": 1940, "token_acc": 0.9542850274450099, "train_speed(iter/s)": 0.123159 }, { "epoch": 2.3857790063733395, "eval_loss": 0.273898720741272, "eval_runtime": 29.7246, "eval_samples_per_second": 17.696, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9181582360570687, "step": 1940 }, { "epoch": 2.391921984181832, "grad_norm": 0.8139801621437073, "learning_rate": 9.785140580710106e-07, "loss": 0.1415960192680359, "memory(GiB)": 31.97, "step": 1945, "token_acc": 0.9238499208097432, "train_speed(iter/s)": 0.122929 }, { "epoch": 2.398064961990325, "grad_norm": 0.8904073238372803, "learning_rate": 9.594624951490455e-07, "loss": 0.15040233135223388, "memory(GiB)": 31.97, "step": 1950, "token_acc": 0.9561746584516475, "train_speed(iter/s)": 0.123001 }, { "epoch": 2.4042079397988174, "grad_norm": 1.0656641721725464, "learning_rate": 9.405785244242166e-07, "loss": 0.1426215648651123, "memory(GiB)": 31.97, "step": 1955, "token_acc": 0.9464016327979412, "train_speed(iter/s)": 0.123079 }, { "epoch": 2.4103509176073104, "grad_norm": 0.656574547290802, "learning_rate": 9.218629291601699e-07, "loss": 0.12366310358047486, "memory(GiB)": 31.97, "step": 1960, "token_acc": 0.9601010101010101, "train_speed(iter/s)": 0.123155 }, { "epoch": 2.4103509176073104, "eval_loss": 0.27443960309028625, "eval_runtime": 29.6457, "eval_samples_per_second": 17.743, "eval_steps_per_second": 4.453, "eval_token_acc": 0.9185473411154345, "step": 1960 }, { "epoch": 2.416493895415803, "grad_norm": 0.7282077074050903, "learning_rate": 9.033164856367271e-07, "loss": 0.14160101413726806, "memory(GiB)": 31.97, "step": 1965, "token_acc": 0.9257107472635547, "train_speed(iter/s)": 0.122927 }, { "epoch": 2.4226368732242953, "grad_norm": 0.9724346995353699, "learning_rate": 8.849399631176825e-07, "loss": 0.13960802555084229, "memory(GiB)": 31.97, "step": 1970, "token_acc": 0.957187156146844, "train_speed(iter/s)": 0.122994 }, { "epoch": 2.4287798510327883, "grad_norm": 0.6904351711273193, "learning_rate": 8.667341238189009e-07, "loss": 0.13362197875976561, "memory(GiB)": 31.97, "step": 1975, "token_acc": 0.9529874213836478, "train_speed(iter/s)": 0.123061 }, { "epoch": 2.4349228288412808, "grad_norm": 0.8341466784477234, "learning_rate": 8.486997228767013e-07, "loss": 0.15857725143432616, "memory(GiB)": 31.97, "step": 1980, "token_acc": 0.9431441341856106, "train_speed(iter/s)": 0.123136 }, { "epoch": 2.4349228288412808, "eval_loss": 0.27409639954566956, "eval_runtime": 29.6556, "eval_samples_per_second": 17.737, "eval_steps_per_second": 4.451, "eval_token_acc": 0.9185329298169765, "step": 1980 }, { "epoch": 2.4410658066497737, "grad_norm": 1.1063848733901978, "learning_rate": 8.308375083165299e-07, "loss": 0.15017662048339844, "memory(GiB)": 31.97, "step": 1985, "token_acc": 0.9246744744307849, "train_speed(iter/s)": 0.122915 }, { "epoch": 2.447208784458266, "grad_norm": 0.8507280945777893, "learning_rate": 8.131482210219383e-07, "loss": 0.1420647144317627, "memory(GiB)": 31.97, "step": 1990, "token_acc": 0.9540951446787641, "train_speed(iter/s)": 0.122981 }, { "epoch": 2.4533517622667587, "grad_norm": 0.7251470685005188, "learning_rate": 7.956325947038585e-07, "loss": 0.13122901916503907, "memory(GiB)": 31.97, "step": 1995, "token_acc": 0.9569695888700616, "train_speed(iter/s)": 0.123056 }, { "epoch": 2.4594947400752516, "grad_norm": 0.6275292038917542, "learning_rate": 7.782913558701572e-07, "loss": 0.13776025772094727, "memory(GiB)": 31.97, "step": 2000, "token_acc": 0.9476885644768857, "train_speed(iter/s)": 0.123119 }, { "epoch": 2.4594947400752516, "eval_loss": 0.2740454375743866, "eval_runtime": 29.8131, "eval_samples_per_second": 17.643, "eval_steps_per_second": 4.428, "eval_token_acc": 0.9183816111831676, "step": 2000 }, { "epoch": 2.465637717883744, "grad_norm": 0.830629825592041, "learning_rate": 7.611252237955168e-07, "loss": 0.12884964942932128, "memory(GiB)": 31.97, "step": 2005, "token_acc": 0.926461027233981, "train_speed(iter/s)": 0.122895 }, { "epoch": 2.471780695692237, "grad_norm": 0.8567506670951843, "learning_rate": 7.44134910491589e-07, "loss": 0.15558898448944092, "memory(GiB)": 31.97, "step": 2010, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.122982 }, { "epoch": 2.4779236735007295, "grad_norm": 0.9071369171142578, "learning_rate": 7.273211206774711e-07, "loss": 0.14407318830490112, "memory(GiB)": 31.97, "step": 2015, "token_acc": 0.9545647558386412, "train_speed(iter/s)": 0.123063 }, { "epoch": 2.484066651309222, "grad_norm": 0.8281042575836182, "learning_rate": 7.106845517504684e-07, "loss": 0.14147133827209474, "memory(GiB)": 31.97, "step": 2020, "token_acc": 0.9535927353360435, "train_speed(iter/s)": 0.123126 }, { "epoch": 2.484066651309222, "eval_loss": 0.2732316851615906, "eval_runtime": 29.6919, "eval_samples_per_second": 17.715, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9181510304078397, "step": 2020 }, { "epoch": 2.490209629117715, "grad_norm": 0.8336676955223083, "learning_rate": 6.942258937571772e-07, "loss": 0.1445910692214966, "memory(GiB)": 31.97, "step": 2025, "token_acc": 0.9194783843365841, "train_speed(iter/s)": 0.122916 }, { "epoch": 2.4963526069262074, "grad_norm": 0.7258805632591248, "learning_rate": 6.779458293648506e-07, "loss": 0.13561407327651978, "memory(GiB)": 31.97, "step": 2030, "token_acc": 0.9523143224939833, "train_speed(iter/s)": 0.122985 }, { "epoch": 2.5024955847347004, "grad_norm": 0.8204253911972046, "learning_rate": 6.618450338330978e-07, "loss": 0.14749345779418946, "memory(GiB)": 31.97, "step": 2035, "token_acc": 0.9511697728431429, "train_speed(iter/s)": 0.12307 }, { "epoch": 2.508638562543193, "grad_norm": 0.755736768245697, "learning_rate": 6.459241749858619e-07, "loss": 0.1365538001060486, "memory(GiB)": 31.97, "step": 2040, "token_acc": 0.94921875, "train_speed(iter/s)": 0.123134 }, { "epoch": 2.508638562543193, "eval_loss": 0.2726115584373474, "eval_runtime": 29.6337, "eval_samples_per_second": 17.75, "eval_steps_per_second": 4.454, "eval_token_acc": 0.9181870586539848, "step": 2040 }, { "epoch": 2.5147815403516853, "grad_norm": 0.7565985918045044, "learning_rate": 6.301839131837284e-07, "loss": 0.14346761703491212, "memory(GiB)": 31.97, "step": 2045, "token_acc": 0.9241052727438303, "train_speed(iter/s)": 0.122918 }, { "epoch": 2.5209245181601783, "grad_norm": 0.8396884202957153, "learning_rate": 6.146249012965349e-07, "loss": 0.13507163524627686, "memory(GiB)": 31.97, "step": 2050, "token_acc": 0.9460710284016174, "train_speed(iter/s)": 0.122992 }, { "epoch": 2.5270674959686708, "grad_norm": 0.7319416999816895, "learning_rate": 5.992477846762896e-07, "loss": 0.13839869499206542, "memory(GiB)": 31.97, "step": 2055, "token_acc": 0.950456398185889, "train_speed(iter/s)": 0.123056 }, { "epoch": 2.5332104737771637, "grad_norm": 0.7878042459487915, "learning_rate": 5.840532011303996e-07, "loss": 0.15459495782852173, "memory(GiB)": 31.97, "step": 2060, "token_acc": 0.9486288752039581, "train_speed(iter/s)": 0.12312 }, { "epoch": 2.5332104737771637, "eval_loss": 0.27472230792045593, "eval_runtime": 29.5056, "eval_samples_per_second": 17.827, "eval_steps_per_second": 4.474, "eval_token_acc": 0.9183744055339386, "step": 2060 }, { "epoch": 2.539353451585656, "grad_norm": 0.8421174883842468, "learning_rate": 5.690417808952243e-07, "loss": 0.15741729736328125, "memory(GiB)": 31.97, "step": 2065, "token_acc": 0.9253989855251763, "train_speed(iter/s)": 0.122912 }, { "epoch": 2.5454964293941487, "grad_norm": 0.8520276546478271, "learning_rate": 5.542141466099271e-07, "loss": 0.1434725046157837, "memory(GiB)": 31.97, "step": 2070, "token_acc": 0.9435738510115776, "train_speed(iter/s)": 0.122999 }, { "epoch": 2.5516394072026416, "grad_norm": 0.8779215216636658, "learning_rate": 5.395709132906569e-07, "loss": 0.13479983806610107, "memory(GiB)": 31.97, "step": 2075, "token_acc": 0.950151781434734, "train_speed(iter/s)": 0.123082 }, { "epoch": 2.557782385011134, "grad_norm": 0.7466580867767334, "learning_rate": 5.251126883050333e-07, "loss": 0.13184006214141847, "memory(GiB)": 31.97, "step": 2080, "token_acc": 0.954845163930699, "train_speed(iter/s)": 0.12315 }, { "epoch": 2.557782385011134, "eval_loss": 0.27403029799461365, "eval_runtime": 29.675, "eval_samples_per_second": 17.725, "eval_steps_per_second": 4.448, "eval_token_acc": 0.9182735264447327, "step": 2080 }, { "epoch": 2.563925362819627, "grad_norm": 0.9100626707077026, "learning_rate": 5.108400713469547e-07, "loss": 0.15517921447753907, "memory(GiB)": 31.97, "step": 2085, "token_acc": 0.9215290970418331, "train_speed(iter/s)": 0.122941 }, { "epoch": 2.5700683406281195, "grad_norm": 0.7965090870857239, "learning_rate": 4.967536544117263e-07, "loss": 0.1428399920463562, "memory(GiB)": 31.97, "step": 2090, "token_acc": 0.9566871852266369, "train_speed(iter/s)": 0.123002 }, { "epoch": 2.576211318436612, "grad_norm": 0.8939210772514343, "learning_rate": 4.828540217715067e-07, "loss": 0.15376098155975343, "memory(GiB)": 31.97, "step": 2095, "token_acc": 0.9479303634355442, "train_speed(iter/s)": 0.123068 }, { "epoch": 2.582354296245105, "grad_norm": 0.825840950012207, "learning_rate": 4.6914174995106863e-07, "loss": 0.14222912788391112, "memory(GiB)": 31.97, "step": 2100, "token_acc": 0.9461206896551724, "train_speed(iter/s)": 0.123144 }, { "epoch": 2.582354296245105, "eval_loss": 0.27377504110336304, "eval_runtime": 29.7092, "eval_samples_per_second": 17.705, "eval_steps_per_second": 4.443, "eval_token_acc": 0.9185257241677475, "step": 2100 }, { "epoch": 2.5884972740535974, "grad_norm": 0.8890196681022644, "learning_rate": 4.556174077038927e-07, "loss": 0.14791591167449952, "memory(GiB)": 31.97, "step": 2105, "token_acc": 0.9235944439194935, "train_speed(iter/s)": 0.122918 }, { "epoch": 2.5946402518620904, "grad_norm": 0.7061293721199036, "learning_rate": 4.422815559885696e-07, "loss": 0.13169264793395996, "memory(GiB)": 31.97, "step": 2110, "token_acc": 0.948949511019606, "train_speed(iter/s)": 0.122982 }, { "epoch": 2.600783229670583, "grad_norm": 0.8161597847938538, "learning_rate": 4.2913474794554044e-07, "loss": 0.1341610074043274, "memory(GiB)": 31.97, "step": 2115, "token_acc": 0.9497422680412371, "train_speed(iter/s)": 0.123053 }, { "epoch": 2.6069262074790753, "grad_norm": 0.8730164766311646, "learning_rate": 4.161775288741454e-07, "loss": 0.15282490253448486, "memory(GiB)": 31.97, "step": 2120, "token_acc": 0.943151087595532, "train_speed(iter/s)": 0.123121 }, { "epoch": 2.6069262074790753, "eval_loss": 0.2740446925163269, "eval_runtime": 29.6992, "eval_samples_per_second": 17.711, "eval_steps_per_second": 4.445, "eval_token_acc": 0.9185833693615795, "step": 2120 }, { "epoch": 2.6130691852875683, "grad_norm": 0.835308313369751, "learning_rate": 4.034104362100155e-07, "loss": 0.13589699268341066, "memory(GiB)": 31.97, "step": 2125, "token_acc": 0.9286285805728917, "train_speed(iter/s)": 0.1229 }, { "epoch": 2.6192121630960608, "grad_norm": 0.8869427442550659, "learning_rate": 3.9083399950277156e-07, "loss": 0.14998774528503417, "memory(GiB)": 31.97, "step": 2130, "token_acc": 0.9448426301028358, "train_speed(iter/s)": 0.122982 }, { "epoch": 2.6253551409045537, "grad_norm": 0.8314644694328308, "learning_rate": 3.7844874039406677e-07, "loss": 0.12793076038360596, "memory(GiB)": 31.97, "step": 2135, "token_acc": 0.9614247859763555, "train_speed(iter/s)": 0.123051 }, { "epoch": 2.631498118713046, "grad_norm": 0.8190094828605652, "learning_rate": 3.6625517259594566e-07, "loss": 0.14857040643692015, "memory(GiB)": 31.97, "step": 2140, "token_acc": 0.9486269539501478, "train_speed(iter/s)": 0.123129 }, { "epoch": 2.631498118713046, "eval_loss": 0.27396196126937866, "eval_runtime": 29.662, "eval_samples_per_second": 17.733, "eval_steps_per_second": 4.45, "eval_token_acc": 0.9183023490416486, "step": 2140 }, { "epoch": 2.6376410965215387, "grad_norm": 0.8906332850456238, "learning_rate": 3.5425380186953905e-07, "loss": 0.15265541076660155, "memory(GiB)": 31.97, "step": 2145, "token_acc": 0.9210909443851305, "train_speed(iter/s)": 0.122924 }, { "epoch": 2.6437840743300316, "grad_norm": 0.8447383642196655, "learning_rate": 3.424451260040862e-07, "loss": 0.1445927381515503, "memory(GiB)": 31.97, "step": 2150, "token_acc": 0.951250271798217, "train_speed(iter/s)": 0.123006 }, { "epoch": 2.649927052138524, "grad_norm": 0.8714754581451416, "learning_rate": 3.3082963479628747e-07, "loss": 0.15293993949890136, "memory(GiB)": 31.97, "step": 2155, "token_acc": 0.9348866900734121, "train_speed(iter/s)": 0.123079 }, { "epoch": 2.656070029947017, "grad_norm": 0.7784111499786377, "learning_rate": 3.194078100299863e-07, "loss": 0.13703620433807373, "memory(GiB)": 31.97, "step": 2160, "token_acc": 0.955746644295302, "train_speed(iter/s)": 0.123143 }, { "epoch": 2.656070029947017, "eval_loss": 0.2733152210712433, "eval_runtime": 29.7291, "eval_samples_per_second": 17.693, "eval_steps_per_second": 4.44, "eval_token_acc": 0.9185329298169765, "step": 2160 }, { "epoch": 2.6622130077555095, "grad_norm": 0.8391521573066711, "learning_rate": 3.0818012545618836e-07, "loss": 0.13625545501708985, "memory(GiB)": 31.97, "step": 2165, "token_acc": 0.9254722933600564, "train_speed(iter/s)": 0.12293 }, { "epoch": 2.668355985564002, "grad_norm": 0.8696740865707397, "learning_rate": 2.9714704677341055e-07, "loss": 0.15032825469970704, "memory(GiB)": 31.97, "step": 2170, "token_acc": 0.9501612578109252, "train_speed(iter/s)": 0.122998 }, { "epoch": 2.674498963372495, "grad_norm": 0.7542688250541687, "learning_rate": 2.8630903160836776e-07, "loss": 0.14371325969696044, "memory(GiB)": 31.97, "step": 2175, "token_acc": 0.9407879649589506, "train_speed(iter/s)": 0.123074 }, { "epoch": 2.6806419411809874, "grad_norm": 0.703037679195404, "learning_rate": 2.756665294969868e-07, "loss": 0.13071630001068116, "memory(GiB)": 31.97, "step": 2180, "token_acc": 0.9524260355029586, "train_speed(iter/s)": 0.123154 }, { "epoch": 2.6806419411809874, "eval_loss": 0.2734775245189667, "eval_runtime": 29.4649, "eval_samples_per_second": 17.852, "eval_steps_per_second": 4.48, "eval_token_acc": 0.9185833693615795, "step": 2180 }, { "epoch": 2.6867849189894804, "grad_norm": 0.7227668166160583, "learning_rate": 2.6521998186576357e-07, "loss": 0.13176329135894777, "memory(GiB)": 31.97, "step": 2185, "token_acc": 0.9272463413354781, "train_speed(iter/s)": 0.122934 }, { "epoch": 2.692927896797973, "grad_norm": 0.812665581703186, "learning_rate": 2.549698220134517e-07, "loss": 0.14241292476654052, "memory(GiB)": 31.97, "step": 2190, "token_acc": 0.9535809018567639, "train_speed(iter/s)": 0.122998 }, { "epoch": 2.6990708746064653, "grad_norm": 0.8766520023345947, "learning_rate": 2.449164750930938e-07, "loss": 0.14588472843170167, "memory(GiB)": 31.97, "step": 2195, "token_acc": 0.9549009533595936, "train_speed(iter/s)": 0.123059 }, { "epoch": 2.7052138524149583, "grad_norm": 0.9236011505126953, "learning_rate": 2.3506035809438553e-07, "loss": 0.1432283639907837, "memory(GiB)": 31.97, "step": 2200, "token_acc": 0.9595282766014473, "train_speed(iter/s)": 0.123129 }, { "epoch": 2.7052138524149583, "eval_loss": 0.27359798550605774, "eval_runtime": 29.6892, "eval_samples_per_second": 17.717, "eval_steps_per_second": 4.446, "eval_token_acc": 0.9185833693615795, "step": 2200 }, { "epoch": 2.7113568302234508, "grad_norm": 0.7723605036735535, "learning_rate": 2.2540187982637628e-07, "loss": 0.1351910948753357, "memory(GiB)": 31.97, "step": 2205, "token_acc": 0.9280636513015653, "train_speed(iter/s)": 0.122914 }, { "epoch": 2.7174998080319437, "grad_norm": 0.7751135230064392, "learning_rate": 2.1594144090051728e-07, "loss": 0.14594308137893677, "memory(GiB)": 31.97, "step": 2210, "token_acc": 0.9436165379373013, "train_speed(iter/s)": 0.122972 }, { "epoch": 2.723642785840436, "grad_norm": 0.8251017928123474, "learning_rate": 2.066794337140443e-07, "loss": 0.13928499221801757, "memory(GiB)": 31.97, "step": 2215, "token_acc": 0.9523616048755713, "train_speed(iter/s)": 0.123029 }, { "epoch": 2.7297857636489287, "grad_norm": 0.805425763130188, "learning_rate": 1.9761624243370026e-07, "loss": 0.13413631916046143, "memory(GiB)": 31.97, "step": 2220, "token_acc": 0.9549077181208053, "train_speed(iter/s)": 0.123085 }, { "epoch": 2.7297857636489287, "eval_loss": 0.27328255772590637, "eval_runtime": 29.7048, "eval_samples_per_second": 17.708, "eval_steps_per_second": 4.444, "eval_token_acc": 0.9185257241677475, "step": 2220 }, { "epoch": 2.7359287414574216, "grad_norm": 0.720260739326477, "learning_rate": 1.8875224297980332e-07, "loss": 0.14869468212127684, "memory(GiB)": 31.97, "step": 2225, "token_acc": 0.9208377041810281, "train_speed(iter/s)": 0.122897 }, { "epoch": 2.742071719265914, "grad_norm": 0.7555059194564819, "learning_rate": 1.800878030106501e-07, "loss": 0.13696482181549072, "memory(GiB)": 31.97, "step": 2230, "token_acc": 0.951278626898155, "train_speed(iter/s)": 0.122958 }, { "epoch": 2.748214697074407, "grad_norm": 0.9698434472084045, "learning_rate": 1.7162328190727217e-07, "loss": 0.16066057682037355, "memory(GiB)": 31.97, "step": 2235, "token_acc": 0.9414913717092833, "train_speed(iter/s)": 0.123034 }, { "epoch": 2.7543576748828995, "grad_norm": 0.9093776345252991, "learning_rate": 1.6335903075852478e-07, "loss": 0.13771231174468995, "memory(GiB)": 31.97, "step": 2240, "token_acc": 0.953091935104632, "train_speed(iter/s)": 0.123095 }, { "epoch": 2.7543576748828995, "eval_loss": 0.2732333838939667, "eval_runtime": 29.6848, "eval_samples_per_second": 17.72, "eval_steps_per_second": 4.447, "eval_token_acc": 0.9184104337800836, "step": 2240 }, { "epoch": 2.760500652691392, "grad_norm": 0.9624541997909546, "learning_rate": 1.552953923465267e-07, "loss": 0.1540065288543701, "memory(GiB)": 31.97, "step": 2245, "token_acc": 0.9251303793194012, "train_speed(iter/s)": 0.122899 }, { "epoch": 2.766643630499885, "grad_norm": 0.6570599675178528, "learning_rate": 1.4743270113244278e-07, "loss": 0.11645562648773193, "memory(GiB)": 31.97, "step": 2250, "token_acc": 0.960995889387145, "train_speed(iter/s)": 0.122957 }, { "epoch": 2.7727866083083774, "grad_norm": 0.8584187030792236, "learning_rate": 1.3977128324261068e-07, "loss": 0.1433710813522339, "memory(GiB)": 31.97, "step": 2255, "token_acc": 0.9531076066790353, "train_speed(iter/s)": 0.123031 }, { "epoch": 2.7789295861168704, "grad_norm": 0.8224872350692749, "learning_rate": 1.3231145645501153e-07, "loss": 0.14238922595977782, "memory(GiB)": 31.97, "step": 2260, "token_acc": 0.9506010814215969, "train_speed(iter/s)": 0.123085 }, { "epoch": 2.7789295861168704, "eval_loss": 0.2732416093349457, "eval_runtime": 29.7221, "eval_samples_per_second": 17.697, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9183599942354806, "step": 2260 }, { "epoch": 2.785072563925363, "grad_norm": 0.9489020705223083, "learning_rate": 1.2505353018609445e-07, "loss": 0.14603989124298095, "memory(GiB)": 31.97, "step": 2265, "token_acc": 0.9227581508884137, "train_speed(iter/s)": 0.122896 }, { "epoch": 2.7912155417338553, "grad_norm": 0.736284077167511, "learning_rate": 1.1799780547793682e-07, "loss": 0.14218697547912598, "memory(GiB)": 31.97, "step": 2270, "token_acc": 0.9579674123170395, "train_speed(iter/s)": 0.122963 }, { "epoch": 2.7973585195423483, "grad_norm": 0.7859813570976257, "learning_rate": 1.111445749857626e-07, "loss": 0.1413131594657898, "memory(GiB)": 31.97, "step": 2275, "token_acc": 0.9514263252470799, "train_speed(iter/s)": 0.123032 }, { "epoch": 2.8035014973508408, "grad_norm": 1.0802668333053589, "learning_rate": 1.0449412296580252e-07, "loss": 0.1472024917602539, "memory(GiB)": 31.97, "step": 2280, "token_acc": 0.9520010294685368, "train_speed(iter/s)": 0.123092 }, { "epoch": 2.8035014973508408, "eval_loss": 0.273179292678833, "eval_runtime": 29.7252, "eval_samples_per_second": 17.695, "eval_steps_per_second": 4.441, "eval_token_acc": 0.9184392563769995, "step": 2280 }, { "epoch": 2.8096444751593337, "grad_norm": 0.7576152682304382, "learning_rate": 9.804672526349979e-08, "loss": 0.14902775287628173, "memory(GiB)": 31.97, "step": 2285, "token_acc": 0.9238161925601751, "train_speed(iter/s)": 0.122903 }, { "epoch": 2.815787452967826, "grad_norm": 0.8574935793876648, "learning_rate": 9.180264930207405e-08, "loss": 0.1530381441116333, "memory(GiB)": 31.97, "step": 2290, "token_acc": 0.9533666759284987, "train_speed(iter/s)": 0.12298 }, { "epoch": 2.8219304307763187, "grad_norm": 0.7855550050735474, "learning_rate": 8.576215407142652e-08, "loss": 0.12575039863586426, "memory(GiB)": 31.97, "step": 2295, "token_acc": 0.9574297591025192, "train_speed(iter/s)": 0.123043 }, { "epoch": 2.8280734085848116, "grad_norm": 0.8387411236763, "learning_rate": 7.992549011739903e-08, "loss": 0.14488180875778198, "memory(GiB)": 31.97, "step": 2300, "token_acc": 0.9483278379651436, "train_speed(iter/s)": 0.123101 }, { "epoch": 2.8280734085848116, "eval_loss": 0.2733011543750763, "eval_runtime": 29.6296, "eval_samples_per_second": 17.753, "eval_steps_per_second": 4.455, "eval_token_acc": 0.9184680789739156, "step": 2300 }, { "epoch": 2.834216386393304, "grad_norm": 0.9076153039932251, "learning_rate": 7.42928995313802e-08, "loss": 0.14017899036407472, "memory(GiB)": 31.97, "step": 2305, "token_acc": 0.9183514619299471, "train_speed(iter/s)": 0.122904 }, { "epoch": 2.840359364201797, "grad_norm": 0.8693546056747437, "learning_rate": 6.886461594026394e-08, "loss": 0.134627628326416, "memory(GiB)": 31.97, "step": 2310, "token_acc": 0.9542199129335768, "train_speed(iter/s)": 0.122955 }, { "epoch": 2.8465023420102895, "grad_norm": 0.6884361505508423, "learning_rate": 6.364086449676233e-08, "loss": 0.11727933883666992, "memory(GiB)": 31.97, "step": 2315, "token_acc": 0.9606529928840519, "train_speed(iter/s)": 0.123004 }, { "epoch": 2.852645319818782, "grad_norm": 0.8520733118057251, "learning_rate": 5.862186187006347e-08, "loss": 0.13235876560211182, "memory(GiB)": 31.97, "step": 2320, "token_acc": 0.9552718507276136, "train_speed(iter/s)": 0.12307 }, { "epoch": 2.852645319818782, "eval_loss": 0.2732396423816681, "eval_runtime": 29.7321, "eval_samples_per_second": 17.691, "eval_steps_per_second": 4.44, "eval_token_acc": 0.9184248450785416, "step": 2320 }, { "epoch": 2.858788297627275, "grad_norm": 0.6756072640419006, "learning_rate": 5.3807816236846614e-08, "loss": 0.14616656303405762, "memory(GiB)": 31.97, "step": 2325, "token_acc": 0.9212030774597392, "train_speed(iter/s)": 0.122879 }, { "epoch": 2.8649312754357674, "grad_norm": 0.7540440559387207, "learning_rate": 4.919892727264508e-08, "loss": 0.1399930238723755, "memory(GiB)": 31.97, "step": 2330, "token_acc": 0.9455614286419997, "train_speed(iter/s)": 0.122935 }, { "epoch": 2.8710742532442604, "grad_norm": 0.7465505599975586, "learning_rate": 4.4795386143567375e-08, "loss": 0.14697123765945436, "memory(GiB)": 31.97, "step": 2335, "token_acc": 0.9416914178521182, "train_speed(iter/s)": 0.123005 }, { "epoch": 2.877217231052753, "grad_norm": 0.8006975650787354, "learning_rate": 4.0597375498365175e-08, "loss": 0.14045066833496095, "memory(GiB)": 31.97, "step": 2340, "token_acc": 0.9556364912896573, "train_speed(iter/s)": 0.123073 }, { "epoch": 2.877217231052753, "eval_loss": 0.2732827663421631, "eval_runtime": 29.7807, "eval_samples_per_second": 17.662, "eval_steps_per_second": 4.432, "eval_token_acc": 0.9184320507277706, "step": 2340 }, { "epoch": 2.8833602088612453, "grad_norm": 0.7655653357505798, "learning_rate": 3.6605069460858286e-08, "loss": 0.14170855283737183, "memory(GiB)": 31.97, "step": 2345, "token_acc": 0.9223746043924427, "train_speed(iter/s)": 0.122886 }, { "epoch": 2.8895031866697383, "grad_norm": 0.7868750691413879, "learning_rate": 3.281863362271487e-08, "loss": 0.13158297538757324, "memory(GiB)": 31.97, "step": 2350, "token_acc": 0.9610517504554631, "train_speed(iter/s)": 0.122944 }, { "epoch": 2.8956461644782308, "grad_norm": 0.9070404171943665, "learning_rate": 2.9238225036579693e-08, "loss": 0.13984733819961548, "memory(GiB)": 31.97, "step": 2355, "token_acc": 0.9568466078293356, "train_speed(iter/s)": 0.123006 }, { "epoch": 2.9017891422867237, "grad_norm": 0.826079249382019, "learning_rate": 2.5863992209560484e-08, "loss": 0.1394752025604248, "memory(GiB)": 31.97, "step": 2360, "token_acc": 0.9418652788455852, "train_speed(iter/s)": 0.123076 }, { "epoch": 2.9017891422867237, "eval_loss": 0.27347302436828613, "eval_runtime": 29.7339, "eval_samples_per_second": 17.69, "eval_steps_per_second": 4.439, "eval_token_acc": 0.9184464620262286, "step": 2360 }, { "epoch": 2.907932120095216, "grad_norm": 0.9018839001655579, "learning_rate": 2.269607509707006e-08, "loss": 0.1596289873123169, "memory(GiB)": 31.97, "step": 2365, "token_acc": 0.9238603473227207, "train_speed(iter/s)": 0.122868 }, { "epoch": 2.9140750979037087, "grad_norm": 0.8631731271743774, "learning_rate": 1.97346050970193e-08, "loss": 0.1415793776512146, "memory(GiB)": 31.97, "step": 2370, "token_acc": 0.9460295790671217, "train_speed(iter/s)": 0.122945 }, { "epoch": 2.9202180757122016, "grad_norm": 0.8872095942497253, "learning_rate": 1.69797050443693e-08, "loss": 0.13625437021255493, "memory(GiB)": 31.97, "step": 2375, "token_acc": 0.9583434245580044, "train_speed(iter/s)": 0.123012 }, { "epoch": 2.926361053520694, "grad_norm": 0.8124271035194397, "learning_rate": 1.4431489206034321e-08, "loss": 0.14173973798751832, "memory(GiB)": 31.97, "step": 2380, "token_acc": 0.9510019878579488, "train_speed(iter/s)": 0.123073 }, { "epoch": 2.926361053520694, "eval_loss": 0.27337023615837097, "eval_runtime": 29.6435, "eval_samples_per_second": 17.744, "eval_steps_per_second": 4.453, "eval_token_acc": 0.9184969015708315, "step": 2380 }, { "epoch": 2.932504031329187, "grad_norm": 0.8951108455657959, "learning_rate": 1.2090063276142261e-08, "loss": 0.13466954231262207, "memory(GiB)": 31.97, "step": 2385, "token_acc": 0.9219987812309567, "train_speed(iter/s)": 0.122895 }, { "epoch": 2.9386470091376795, "grad_norm": 0.9628083109855652, "learning_rate": 9.955524371653146e-09, "loss": 0.15039776563644408, "memory(GiB)": 31.97, "step": 2390, "token_acc": 0.9414807461204869, "train_speed(iter/s)": 0.12296 }, { "epoch": 2.944789986946172, "grad_norm": 0.8701411485671997, "learning_rate": 8.02796102832848e-09, "loss": 0.13970096111297609, "memory(GiB)": 31.97, "step": 2395, "token_acc": 0.9505520319473808, "train_speed(iter/s)": 0.123027 }, { "epoch": 2.950932964754665, "grad_norm": 0.9468409419059753, "learning_rate": 6.307453197059166e-09, "loss": 0.14919402599334716, "memory(GiB)": 31.97, "step": 2400, "token_acc": 0.9412962147887324, "train_speed(iter/s)": 0.12308 }, { "epoch": 2.950932964754665, "eval_loss": 0.27350106835365295, "eval_runtime": 29.668, "eval_samples_per_second": 17.73, "eval_steps_per_second": 4.449, "eval_token_acc": 0.9183167603401067, "step": 2400 }, { "epoch": 2.9570759425631574, "grad_norm": 0.8505570292472839, "learning_rate": 4.794072240550951e-09, "loss": 0.1571817636489868, "memory(GiB)": 31.97, "step": 2405, "token_acc": 0.920852764823451, "train_speed(iter/s)": 0.122896 }, { "epoch": 2.9632189203716504, "grad_norm": 0.7646933197975159, "learning_rate": 3.487880930363452e-09, "loss": 0.13370524644851683, "memory(GiB)": 31.97, "step": 2410, "token_acc": 0.9574316090263478, "train_speed(iter/s)": 0.122964 }, { "epoch": 2.969361898180143, "grad_norm": 0.8396750092506409, "learning_rate": 2.3889334443055743e-09, "loss": 0.14388556480407716, "memory(GiB)": 31.97, "step": 2415, "token_acc": 0.9585525888390827, "train_speed(iter/s)": 0.123037 }, { "epoch": 2.9755048759886353, "grad_norm": 0.6261889338493347, "learning_rate": 1.4972753641906424e-09, "loss": 0.13296045064926149, "memory(GiB)": 31.97, "step": 2420, "token_acc": 0.9545391609359856, "train_speed(iter/s)": 0.123089 }, { "epoch": 2.9755048759886353, "eval_loss": 0.27353137731552124, "eval_runtime": 29.6147, "eval_samples_per_second": 17.761, "eval_steps_per_second": 4.457, "eval_token_acc": 0.9184320507277706, "step": 2420 }, { "epoch": 2.9816478537971283, "grad_norm": 0.7684817910194397, "learning_rate": 8.12943673943467e-10, "loss": 0.1473867416381836, "memory(GiB)": 31.97, "step": 2425, "token_acc": 0.9241485786940332, "train_speed(iter/s)": 0.122905 }, { "epoch": 2.9877908316056208, "grad_norm": 0.7693585157394409, "learning_rate": 3.359667580682402e-10, "loss": 0.14533259868621826, "memory(GiB)": 31.97, "step": 2430, "token_acc": 0.9479254868755292, "train_speed(iter/s)": 0.122963 }, { "epoch": 2.9939338094141137, "grad_norm": 0.8009675145149231, "learning_rate": 6.636440046892123e-11, "loss": 0.12457112073898316, "memory(GiB)": 31.97, "step": 2435, "token_acc": 0.9580943014806316, "train_speed(iter/s)": 0.123019 }, { "epoch": 2.9988481916609078, "eval_loss": 0.2735154330730438, "eval_runtime": 29.6742, "eval_samples_per_second": 17.726, "eval_steps_per_second": 4.448, "eval_token_acc": 0.9182735264447327, "step": 2439 } ], "logging_steps": 5, "max_steps": 2439, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2467967701619835e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }