{ "best_global_step": 460, "best_metric": 0.24467714, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v29-20250507-134003/checkpoint-460", "epoch": 2.9911123081066524, "eval_steps": 20, "global_step": 696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0043091839482897925, "grad_norm": 2.422614812850952, "learning_rate": 9.99994906450425e-06, "loss": 0.3783050775527954, "memory(GiB)": 29.06, "step": 1, "token_acc": 0.8827366746221161, "train_speed(iter/s)": 0.066016 }, { "epoch": 0.02154591974144896, "grad_norm": 1.8293063640594482, "learning_rate": 9.99872666449397e-06, "loss": 0.304108202457428, "memory(GiB)": 29.06, "step": 5, "token_acc": 0.9021085311428756, "train_speed(iter/s)": 0.123613 }, { "epoch": 0.04309183948289792, "grad_norm": 1.0181312561035156, "learning_rate": 9.994907306529203e-06, "loss": 0.30131869316101073, "memory(GiB)": 29.06, "step": 10, "token_acc": 0.9080832657474993, "train_speed(iter/s)": 0.141649 }, { "epoch": 0.06463775922434689, "grad_norm": 0.8247085213661194, "learning_rate": 9.988543871435342e-06, "loss": 0.29039506912231444, "memory(GiB)": 29.07, "step": 15, "token_acc": 0.9026237111961776, "train_speed(iter/s)": 0.145623 }, { "epoch": 0.08618367896579585, "grad_norm": 0.8313817977905273, "learning_rate": 9.979639600327522e-06, "loss": 0.2841599941253662, "memory(GiB)": 29.07, "step": 20, "token_acc": 0.9051915284043123, "train_speed(iter/s)": 0.148055 }, { "epoch": 0.08618367896579585, "eval_loss": 0.30637428164482117, "eval_runtime": 9.2534, "eval_samples_per_second": 16.21, "eval_steps_per_second": 4.107, "eval_token_acc": 0.8990793096302436, "step": 20 }, { "epoch": 0.10772959870724481, "grad_norm": 0.8578795790672302, "learning_rate": 9.96819902845557e-06, "loss": 0.26584060192108155, "memory(GiB)": 29.07, "step": 25, "token_acc": 0.9061398699976505, "train_speed(iter/s)": 0.133467 }, { "epoch": 0.12927551844869378, "grad_norm": 0.8536065220832825, "learning_rate": 9.954227982894034e-06, "loss": 0.2721074342727661, "memory(GiB)": 29.07, "step": 30, "token_acc": 0.9100939031401429, "train_speed(iter/s)": 0.137239 }, { "epoch": 0.15082143819014274, "grad_norm": 0.8567067384719849, "learning_rate": 9.937733579574263e-06, "loss": 0.26833133697509765, "memory(GiB)": 29.07, "step": 35, "token_acc": 0.9014427903508017, "train_speed(iter/s)": 0.139069 }, { "epoch": 0.1723673579315917, "grad_norm": 0.8759805560112, "learning_rate": 9.918724219660013e-06, "loss": 0.2696810483932495, "memory(GiB)": 29.07, "step": 40, "token_acc": 0.9073056300268096, "train_speed(iter/s)": 0.141475 }, { "epoch": 0.1723673579315917, "eval_loss": 0.28540194034576416, "eval_runtime": 9.1496, "eval_samples_per_second": 16.394, "eval_steps_per_second": 4.153, "eval_token_acc": 0.9062731282749638, "step": 40 }, { "epoch": 0.19391327767304067, "grad_norm": 0.8179587125778198, "learning_rate": 9.897209585268459e-06, "loss": 0.26938886642456056, "memory(GiB)": 29.07, "step": 45, "token_acc": 0.9068100358422939, "train_speed(iter/s)": 0.134153 }, { "epoch": 0.21545919741448963, "grad_norm": 0.7597087621688843, "learning_rate": 9.873200634538746e-06, "loss": 0.2661460876464844, "memory(GiB)": 29.07, "step": 50, "token_acc": 0.9265829903627394, "train_speed(iter/s)": 0.135778 }, { "epoch": 0.23700511715593858, "grad_norm": 0.9001930952072144, "learning_rate": 9.846709596050646e-06, "loss": 0.2637378692626953, "memory(GiB)": 29.07, "step": 55, "token_acc": 0.9097354466352211, "train_speed(iter/s)": 0.137425 }, { "epoch": 0.25855103689738757, "grad_norm": 0.894802451133728, "learning_rate": 9.817749962596115e-06, "loss": 0.26340594291687014, "memory(GiB)": 29.07, "step": 60, "token_acc": 0.9115004961612785, "train_speed(iter/s)": 0.139172 }, { "epoch": 0.25855103689738757, "eval_loss": 0.2748047709465027, "eval_runtime": 9.1341, "eval_samples_per_second": 16.422, "eval_steps_per_second": 4.16, "eval_token_acc": 0.9068652121140354, "step": 60 }, { "epoch": 0.28009695663883655, "grad_norm": 0.7363941669464111, "learning_rate": 9.786336484306966e-06, "loss": 0.27098889350891114, "memory(GiB)": 29.07, "step": 65, "token_acc": 0.9083590733590734, "train_speed(iter/s)": 0.135246 }, { "epoch": 0.3016428763802855, "grad_norm": 0.8381310105323792, "learning_rate": 9.752485161142103e-06, "loss": 0.2478638172149658, "memory(GiB)": 29.07, "step": 70, "token_acc": 0.9232435033686237, "train_speed(iter/s)": 0.136407 }, { "epoch": 0.32318879612173446, "grad_norm": 0.7714306116104126, "learning_rate": 9.716213234738216e-06, "loss": 0.2461942672729492, "memory(GiB)": 29.07, "step": 75, "token_acc": 0.9149177216982468, "train_speed(iter/s)": 0.137517 }, { "epoch": 0.3447347158631834, "grad_norm": 0.9231055974960327, "learning_rate": 9.677539179628005e-06, "loss": 0.24781365394592286, "memory(GiB)": 29.07, "step": 80, "token_acc": 0.9261125903385318, "train_speed(iter/s)": 0.138742 }, { "epoch": 0.3447347158631834, "eval_loss": 0.26845118403434753, "eval_runtime": 9.1426, "eval_samples_per_second": 16.407, "eval_steps_per_second": 4.156, "eval_token_acc": 0.9084194321915984, "step": 80 }, { "epoch": 0.36628063560463237, "grad_norm": 0.7188398241996765, "learning_rate": 9.636482693830488e-06, "loss": 0.26428771018981934, "memory(GiB)": 29.07, "step": 85, "token_acc": 0.9042584492261823, "train_speed(iter/s)": 0.135179 }, { "epoch": 0.38782655534608135, "grad_norm": 0.8020254969596863, "learning_rate": 9.59306468881811e-06, "loss": 0.2622120141983032, "memory(GiB)": 29.07, "step": 90, "token_acc": 0.9098044980155814, "train_speed(iter/s)": 0.13707 }, { "epoch": 0.4093724750875303, "grad_norm": 0.8326025605201721, "learning_rate": 9.547307278865823e-06, "loss": 0.2394162893295288, "memory(GiB)": 29.07, "step": 95, "token_acc": 0.9164833305127771, "train_speed(iter/s)": 0.138039 }, { "epoch": 0.43091839482897926, "grad_norm": 0.7730870842933655, "learning_rate": 9.499233769787534e-06, "loss": 0.24491536617279053, "memory(GiB)": 29.07, "step": 100, "token_acc": 0.9174321989744152, "train_speed(iter/s)": 0.139304 }, { "epoch": 0.43091839482897926, "eval_loss": 0.26416531205177307, "eval_runtime": 9.1324, "eval_samples_per_second": 16.425, "eval_steps_per_second": 4.161, "eval_token_acc": 0.9095295893898576, "step": 100 }, { "epoch": 0.45246431457042824, "grad_norm": 0.860744833946228, "learning_rate": 9.448868647065644e-06, "loss": 0.25905332565307615, "memory(GiB)": 29.07, "step": 105, "token_acc": 0.9042013222435488, "train_speed(iter/s)": 0.136744 }, { "epoch": 0.47401023431187717, "grad_norm": 0.7342467904090881, "learning_rate": 9.396237563379761e-06, "loss": 0.23780291080474852, "memory(GiB)": 29.07, "step": 110, "token_acc": 0.9195750302763405, "train_speed(iter/s)": 0.137488 }, { "epoch": 0.49555615405332615, "grad_norm": 0.8625522255897522, "learning_rate": 9.341367325540921e-06, "loss": 0.23777966499328612, "memory(GiB)": 29.07, "step": 115, "token_acc": 0.9236284378674467, "train_speed(iter/s)": 0.138134 }, { "epoch": 0.5171020737947751, "grad_norm": 0.7858663201332092, "learning_rate": 9.284285880837947e-06, "loss": 0.23805389404296876, "memory(GiB)": 29.07, "step": 120, "token_acc": 0.9197558545092177, "train_speed(iter/s)": 0.138839 }, { "epoch": 0.5171020737947751, "eval_loss": 0.26037347316741943, "eval_runtime": 9.1916, "eval_samples_per_second": 16.319, "eval_steps_per_second": 4.134, "eval_token_acc": 0.9109209864116758, "step": 120 }, { "epoch": 0.5386479935362241, "grad_norm": 0.69041508436203, "learning_rate": 9.225022302802951e-06, "loss": 0.23021812438964845, "memory(GiB)": 29.07, "step": 125, "token_acc": 0.9175774697625143, "train_speed(iter/s)": 0.136057 }, { "epoch": 0.5601939132776731, "grad_norm": 0.8726988434791565, "learning_rate": 9.163606776403182e-06, "loss": 0.24287738800048828, "memory(GiB)": 29.07, "step": 130, "token_acc": 0.9174245368571058, "train_speed(iter/s)": 0.136884 }, { "epoch": 0.581739833019122, "grad_norm": 0.782781720161438, "learning_rate": 9.100070582666796e-06, "loss": 0.2355494499206543, "memory(GiB)": 29.07, "step": 135, "token_acc": 0.9218754381537254, "train_speed(iter/s)": 0.137519 }, { "epoch": 0.603285752760571, "grad_norm": 0.8618416786193848, "learning_rate": 9.034446082750352e-06, "loss": 0.26097152233123777, "memory(GiB)": 29.07, "step": 140, "token_acc": 0.9200457337339467, "train_speed(iter/s)": 0.137992 }, { "epoch": 0.603285752760571, "eval_loss": 0.2570641040802002, "eval_runtime": 9.1422, "eval_samples_per_second": 16.407, "eval_steps_per_second": 4.157, "eval_token_acc": 0.9119867373220047, "step": 140 }, { "epoch": 0.6248316725020199, "grad_norm": 0.7354313135147095, "learning_rate": 8.966766701456177e-06, "loss": 0.2450572967529297, "memory(GiB)": 29.07, "step": 145, "token_acc": 0.9104867095521301, "train_speed(iter/s)": 0.135924 }, { "epoch": 0.6463775922434689, "grad_norm": 0.7455778121948242, "learning_rate": 8.897066910207958e-06, "loss": 0.24456512928009033, "memory(GiB)": 29.07, "step": 150, "token_acc": 0.9091944119638131, "train_speed(iter/s)": 0.136598 }, { "epoch": 0.6679235119849178, "grad_norm": 0.6894455552101135, "learning_rate": 8.825382209493284e-06, "loss": 0.22335777282714844, "memory(GiB)": 29.07, "step": 155, "token_acc": 0.9254186825455899, "train_speed(iter/s)": 0.137109 }, { "epoch": 0.6894694317263668, "grad_norm": 0.8090242743492126, "learning_rate": 8.751749110782013e-06, "loss": 0.22939071655273438, "memory(GiB)": 29.07, "step": 160, "token_acc": 0.917912822144448, "train_speed(iter/s)": 0.137664 }, { "epoch": 0.6894694317263668, "eval_loss": 0.2559308111667633, "eval_runtime": 9.1265, "eval_samples_per_second": 16.436, "eval_steps_per_second": 4.164, "eval_token_acc": 0.9122235708576334, "step": 160 }, { "epoch": 0.7110153514678158, "grad_norm": 0.6877977252006531, "learning_rate": 8.676205117929752e-06, "loss": 0.2350329875946045, "memory(GiB)": 29.07, "step": 165, "token_acc": 0.9184605472599106, "train_speed(iter/s)": 0.13599 }, { "epoch": 0.7325612712092647, "grad_norm": 0.7360235452651978, "learning_rate": 8.598788708075844e-06, "loss": 0.23023662567138672, "memory(GiB)": 29.07, "step": 170, "token_acc": 0.9058568071626164, "train_speed(iter/s)": 0.136418 }, { "epoch": 0.7541071909507137, "grad_norm": 0.8410085439682007, "learning_rate": 8.51953931204566e-06, "loss": 0.23642911911010742, "memory(GiB)": 29.07, "step": 175, "token_acc": 0.9052932094269817, "train_speed(iter/s)": 0.137007 }, { "epoch": 0.7756531106921627, "grad_norm": 0.7055257558822632, "learning_rate": 8.438497294267117e-06, "loss": 0.21671390533447266, "memory(GiB)": 29.07, "step": 180, "token_acc": 0.9202059202059202, "train_speed(iter/s)": 0.137438 }, { "epoch": 0.7756531106921627, "eval_loss": 0.251537024974823, "eval_runtime": 9.1508, "eval_samples_per_second": 16.392, "eval_steps_per_second": 4.153, "eval_token_acc": 0.9133337280558927, "step": 180 }, { "epoch": 0.7971990304336116, "grad_norm": 0.7327454686164856, "learning_rate": 8.3557039322117e-06, "loss": 0.23624320030212403, "memory(GiB)": 29.07, "step": 185, "token_acc": 0.9212678936605317, "train_speed(iter/s)": 0.135745 }, { "epoch": 0.8187449501750605, "grad_norm": 0.7439467906951904, "learning_rate": 8.27120139537044e-06, "loss": 0.226143741607666, "memory(GiB)": 29.07, "step": 190, "token_acc": 0.9260711777101381, "train_speed(iter/s)": 0.136185 }, { "epoch": 0.8402908699165096, "grad_norm": 0.7658076286315918, "learning_rate": 8.18503272377554e-06, "loss": 0.23765263557434083, "memory(GiB)": 29.07, "step": 195, "token_acc": 0.9223176899233237, "train_speed(iter/s)": 0.136807 }, { "epoch": 0.8618367896579585, "grad_norm": 0.739122211933136, "learning_rate": 8.097241806078616e-06, "loss": 0.2310422420501709, "memory(GiB)": 29.07, "step": 200, "token_acc": 0.9296302733841532, "train_speed(iter/s)": 0.13714 }, { "epoch": 0.8618367896579585, "eval_loss": 0.24832946062088013, "eval_runtime": 9.1389, "eval_samples_per_second": 16.413, "eval_steps_per_second": 4.158, "eval_token_acc": 0.9134077385357766, "step": 200 }, { "epoch": 0.8833827093994074, "grad_norm": 0.7651655077934265, "learning_rate": 8.007873357196716e-06, "loss": 0.24373788833618165, "memory(GiB)": 29.07, "step": 205, "token_acc": 0.9188066080938974, "train_speed(iter/s)": 0.135947 }, { "epoch": 0.9049286291408565, "grad_norm": 0.7528461217880249, "learning_rate": 7.916972895537471e-06, "loss": 0.23199746608734131, "memory(GiB)": 29.07, "step": 210, "token_acc": 0.922004793261512, "train_speed(iter/s)": 0.136394 }, { "epoch": 0.9264745488823054, "grad_norm": 0.8405919075012207, "learning_rate": 7.824586719815019e-06, "loss": 0.2173825740814209, "memory(GiB)": 29.07, "step": 215, "token_acc": 0.9274689356403538, "train_speed(iter/s)": 0.136811 }, { "epoch": 0.9480204686237543, "grad_norm": 0.7239152193069458, "learning_rate": 7.730761885468486e-06, "loss": 0.22583813667297364, "memory(GiB)": 29.07, "step": 220, "token_acc": 0.9232335730673059, "train_speed(iter/s)": 0.137394 }, { "epoch": 0.9480204686237543, "eval_loss": 0.2480185180902481, "eval_runtime": 9.1514, "eval_samples_per_second": 16.391, "eval_steps_per_second": 4.152, "eval_token_acc": 0.9135261553035909, "step": 220 }, { "epoch": 0.9695663883652034, "grad_norm": 0.7927827835083008, "learning_rate": 7.635546180695039e-06, "loss": 0.24525394439697265, "memory(GiB)": 31.51, "step": 225, "token_acc": 0.9190034762456547, "train_speed(iter/s)": 0.136277 }, { "epoch": 0.9911123081066523, "grad_norm": 0.7584970593452454, "learning_rate": 7.538988102109728e-06, "loss": 0.24703338146209716, "memory(GiB)": 31.51, "step": 230, "token_acc": 0.9166930084197822, "train_speed(iter/s)": 0.136884 }, { "epoch": 1.0086183678965797, "grad_norm": 0.6898382306098938, "learning_rate": 7.441136830044495e-06, "loss": 0.19301869869232177, "memory(GiB)": 31.51, "step": 235, "token_acc": 0.9391968052558693, "train_speed(iter/s)": 0.137591 }, { "epoch": 1.0301642876380286, "grad_norm": 0.8064629435539246, "learning_rate": 7.342042203498952e-06, "loss": 0.16187149286270142, "memory(GiB)": 31.51, "step": 240, "token_acc": 0.9393010954707055, "train_speed(iter/s)": 0.137991 }, { "epoch": 1.0301642876380286, "eval_loss": 0.2533319890499115, "eval_runtime": 9.1306, "eval_samples_per_second": 16.428, "eval_steps_per_second": 4.162, "eval_token_acc": 0.914014624470825, "step": 240 }, { "epoch": 1.0517102073794775, "grad_norm": 0.8042640089988708, "learning_rate": 7.241754694755674e-06, "loss": 0.16961312294006348, "memory(GiB)": 31.51, "step": 245, "token_acc": 0.9325969259837942, "train_speed(iter/s)": 0.13689 }, { "epoch": 1.0732561271209264, "grad_norm": 0.8370431661605835, "learning_rate": 7.140325383672938e-06, "loss": 0.1677647829055786, "memory(GiB)": 31.51, "step": 250, "token_acc": 0.9420376456528234, "train_speed(iter/s)": 0.13733 }, { "epoch": 1.0948020468623754, "grad_norm": 0.7853599190711975, "learning_rate": 7.037805931668006e-06, "loss": 0.16614892482757568, "memory(GiB)": 31.51, "step": 255, "token_acc": 0.9391651270517043, "train_speed(iter/s)": 0.137609 }, { "epoch": 1.1163479666038243, "grad_norm": 0.6807184815406799, "learning_rate": 6.934248555404197e-06, "loss": 0.1581436276435852, "memory(GiB)": 31.51, "step": 260, "token_acc": 0.9458589779605179, "train_speed(iter/s)": 0.137947 }, { "epoch": 1.1163479666038243, "eval_loss": 0.2524171471595764, "eval_runtime": 9.156, "eval_samples_per_second": 16.383, "eval_steps_per_second": 4.15, "eval_token_acc": 0.9150063649012701, "step": 260 }, { "epoch": 1.1378938863452734, "grad_norm": 0.7507938146591187, "learning_rate": 6.8297060001951545e-06, "loss": 0.16150083541870117, "memory(GiB)": 31.51, "step": 265, "token_acc": 0.9309385421629347, "train_speed(iter/s)": 0.137099 }, { "epoch": 1.1594398060867224, "grad_norm": 0.7291717529296875, "learning_rate": 6.724231513139853e-06, "loss": 0.16564717292785644, "memory(GiB)": 31.51, "step": 270, "token_acc": 0.9417148494231771, "train_speed(iter/s)": 0.137339 }, { "epoch": 1.1809857258281713, "grad_norm": 0.7378965616226196, "learning_rate": 6.617878816002032e-06, "loss": 0.1618717670440674, "memory(GiB)": 31.51, "step": 275, "token_acc": 0.9485524256651017, "train_speed(iter/s)": 0.137622 }, { "epoch": 1.2025316455696202, "grad_norm": 0.8035087585449219, "learning_rate": 6.510702077847864e-06, "loss": 0.1574448823928833, "memory(GiB)": 31.51, "step": 280, "token_acc": 0.9411747078213965, "train_speed(iter/s)": 0.138001 }, { "epoch": 1.2025316455696202, "eval_loss": 0.24995924532413483, "eval_runtime": 9.1411, "eval_samples_per_second": 16.409, "eval_steps_per_second": 4.157, "eval_token_acc": 0.9145474999259895, "step": 280 }, { "epoch": 1.2240775653110691, "grad_norm": 0.7014583349227905, "learning_rate": 6.402755887455792e-06, "loss": 0.1643718123435974, "memory(GiB)": 31.51, "step": 285, "token_acc": 0.9339765241569784, "train_speed(iter/s)": 0.137005 }, { "epoch": 1.2456234850525183, "grad_norm": 0.7766486406326294, "learning_rate": 6.294095225512604e-06, "loss": 0.16814930438995362, "memory(GiB)": 31.51, "step": 290, "token_acc": 0.9360814298463542, "train_speed(iter/s)": 0.137309 }, { "epoch": 1.2671694047939672, "grad_norm": 0.7775722146034241, "learning_rate": 6.184775436609885e-06, "loss": 0.1682277202606201, "memory(GiB)": 31.51, "step": 295, "token_acc": 0.9411593528110813, "train_speed(iter/s)": 0.137689 }, { "epoch": 1.2887153245354162, "grad_norm": 0.7489521503448486, "learning_rate": 6.074852201055121e-06, "loss": 0.16042615175247193, "memory(GiB)": 31.51, "step": 300, "token_acc": 0.9415187229598687, "train_speed(iter/s)": 0.138077 }, { "epoch": 1.2887153245354162, "eval_loss": 0.25172922015190125, "eval_runtime": 9.1377, "eval_samples_per_second": 16.415, "eval_steps_per_second": 4.159, "eval_token_acc": 0.9150211669972468, "step": 300 }, { "epoch": 1.310261244276865, "grad_norm": 0.792445182800293, "learning_rate": 5.964381506511823e-06, "loss": 0.16529514789581298, "memory(GiB)": 31.51, "step": 305, "token_acc": 0.9390098282355103, "train_speed(iter/s)": 0.137303 }, { "epoch": 1.331807164018314, "grad_norm": 0.7657850980758667, "learning_rate": 5.853419619483083e-06, "loss": 0.16101518869400025, "memory(GiB)": 31.51, "step": 310, "token_acc": 0.9423482091553342, "train_speed(iter/s)": 0.137569 }, { "epoch": 1.353353083759763, "grad_norm": 0.7221185564994812, "learning_rate": 5.742023056653131e-06, "loss": 0.16527080535888672, "memory(GiB)": 31.51, "step": 315, "token_acc": 0.9436092441929018, "train_speed(iter/s)": 0.13783 }, { "epoch": 1.3748990035012119, "grad_norm": 0.7651124596595764, "learning_rate": 5.630248556101448e-06, "loss": 0.16076445579528809, "memory(GiB)": 31.51, "step": 320, "token_acc": 0.941539852342926, "train_speed(iter/s)": 0.138138 }, { "epoch": 1.3748990035012119, "eval_loss": 0.2519991993904114, "eval_runtime": 9.1625, "eval_samples_per_second": 16.371, "eval_steps_per_second": 4.147, "eval_token_acc": 0.9146955208857575, "step": 320 }, { "epoch": 1.3964449232426608, "grad_norm": 0.6687130331993103, "learning_rate": 5.51815304840412e-06, "loss": 0.16071187257766723, "memory(GiB)": 31.51, "step": 325, "token_acc": 0.93119978263401, "train_speed(iter/s)": 0.137342 }, { "epoch": 1.41799084298411, "grad_norm": 0.7091411352157593, "learning_rate": 5.405793627637157e-06, "loss": 0.15800976753234863, "memory(GiB)": 31.51, "step": 330, "token_acc": 0.9493767600253226, "train_speed(iter/s)": 0.137567 }, { "epoch": 1.4395367627255589, "grad_norm": 0.7872418761253357, "learning_rate": 5.293227522296517e-06, "loss": 0.16303786039352416, "memory(GiB)": 31.51, "step": 335, "token_acc": 0.9474813818783616, "train_speed(iter/s)": 0.137773 }, { "epoch": 1.4610826824670078, "grad_norm": 0.696894109249115, "learning_rate": 5.180512066149682e-06, "loss": 0.1651884913444519, "memory(GiB)": 31.51, "step": 340, "token_acc": 0.9437182487584908, "train_speed(iter/s)": 0.138053 }, { "epoch": 1.4610826824670078, "eval_loss": 0.2488545924425125, "eval_runtime": 9.1671, "eval_samples_per_second": 16.363, "eval_steps_per_second": 4.145, "eval_token_acc": 0.9151395837650611, "step": 340 }, { "epoch": 1.4826286022084567, "grad_norm": 0.7297951579093933, "learning_rate": 5.06770466903361e-06, "loss": 0.15690993070602416, "memory(GiB)": 31.51, "step": 345, "token_acc": 0.9340376019152534, "train_speed(iter/s)": 0.137212 }, { "epoch": 1.5041745219499059, "grad_norm": 0.7707265019416809, "learning_rate": 4.954862787613937e-06, "loss": 0.15354007482528687, "memory(GiB)": 31.51, "step": 350, "token_acc": 0.9396699063799393, "train_speed(iter/s)": 0.13744 }, { "epoch": 1.5257204416913548, "grad_norm": 0.7526496052742004, "learning_rate": 4.842043896120332e-06, "loss": 0.16020708084106444, "memory(GiB)": 31.51, "step": 355, "token_acc": 0.9479154768703598, "train_speed(iter/s)": 0.137715 }, { "epoch": 1.5472663614328037, "grad_norm": 0.7758511900901794, "learning_rate": 4.729305457072913e-06, "loss": 0.16963763236999513, "memory(GiB)": 31.51, "step": 360, "token_acc": 0.9411747908278363, "train_speed(iter/s)": 0.138029 }, { "epoch": 1.5472663614328037, "eval_loss": 0.24962776899337769, "eval_runtime": 9.1377, "eval_samples_per_second": 16.415, "eval_steps_per_second": 4.159, "eval_token_acc": 0.9158204801799935, "step": 360 }, { "epoch": 1.5688122811742526, "grad_norm": 0.7562235593795776, "learning_rate": 4.616704892014613e-06, "loss": 0.1591555118560791, "memory(GiB)": 31.51, "step": 365, "token_acc": 0.9352809509107749, "train_speed(iter/s)": 0.137302 }, { "epoch": 1.5903582009157016, "grad_norm": 0.7587376236915588, "learning_rate": 4.504299552264428e-06, "loss": 0.15684648752212524, "memory(GiB)": 31.51, "step": 370, "token_acc": 0.9416271651313239, "train_speed(iter/s)": 0.137471 }, { "epoch": 1.6119041206571505, "grad_norm": 0.8137562870979309, "learning_rate": 4.392146689706426e-06, "loss": 0.1647357702255249, "memory(GiB)": 31.51, "step": 375, "token_acc": 0.9458877409154104, "train_speed(iter/s)": 0.137806 }, { "epoch": 1.6334500403985994, "grad_norm": 0.7551019191741943, "learning_rate": 4.280303427629404e-06, "loss": 0.15853278636932372, "memory(GiB)": 31.51, "step": 380, "token_acc": 0.9448746907604604, "train_speed(iter/s)": 0.138095 }, { "epoch": 1.6334500403985994, "eval_loss": 0.25000789761543274, "eval_runtime": 9.1322, "eval_samples_per_second": 16.425, "eval_steps_per_second": 4.161, "eval_token_acc": 0.9155096361644809, "step": 380 }, { "epoch": 1.6549959601400483, "grad_norm": 0.7977014780044556, "learning_rate": 4.168826731632052e-06, "loss": 0.15717003345489503, "memory(GiB)": 31.51, "step": 385, "token_acc": 0.9312666413084824, "train_speed(iter/s)": 0.137411 }, { "epoch": 1.6765418798814973, "grad_norm": 0.7832633852958679, "learning_rate": 4.057773380608411e-06, "loss": 0.1634294271469116, "memory(GiB)": 31.51, "step": 390, "token_acc": 0.9465973781793492, "train_speed(iter/s)": 0.137768 }, { "epoch": 1.6980877996229464, "grad_norm": 0.8293562531471252, "learning_rate": 3.947199937828447e-06, "loss": 0.16505708694458007, "memory(GiB)": 31.51, "step": 395, "token_acc": 0.9415725074644342, "train_speed(iter/s)": 0.138088 }, { "epoch": 1.7196337193643954, "grad_norm": 0.7886548042297363, "learning_rate": 3.8371627221284495e-06, "loss": 0.1561971426010132, "memory(GiB)": 31.51, "step": 400, "token_acc": 0.9470925236321971, "train_speed(iter/s)": 0.138285 }, { "epoch": 1.7196337193643954, "eval_loss": 0.24771690368652344, "eval_runtime": 9.1277, "eval_samples_per_second": 16.434, "eval_steps_per_second": 4.163, "eval_token_acc": 0.915642855028272, "step": 400 }, { "epoch": 1.7411796391058443, "grad_norm": 0.7254658937454224, "learning_rate": 3.727717779225912e-06, "loss": 0.1556318521499634, "memory(GiB)": 31.51, "step": 405, "token_acc": 0.9359159282917783, "train_speed(iter/s)": 0.137626 }, { "epoch": 1.7627255588472934, "grad_norm": 0.7896953225135803, "learning_rate": 3.6189208531735354e-06, "loss": 0.16613179445266724, "memory(GiB)": 31.51, "step": 410, "token_acc": 0.9352341759749168, "train_speed(iter/s)": 0.137957 }, { "epoch": 1.7842714785887424, "grad_norm": 0.6848239898681641, "learning_rate": 3.510827357966876e-06, "loss": 0.1551806092262268, "memory(GiB)": 31.51, "step": 415, "token_acc": 0.9506668360218469, "train_speed(iter/s)": 0.138211 }, { "epoch": 1.8058173983301913, "grad_norm": 0.7046887874603271, "learning_rate": 3.403492349320101e-06, "loss": 0.15121963024139404, "memory(GiB)": 31.51, "step": 420, "token_acc": 0.9443215339233039, "train_speed(iter/s)": 0.138347 }, { "epoch": 1.8058173983301913, "eval_loss": 0.24637845158576965, "eval_runtime": 9.144, "eval_samples_per_second": 16.404, "eval_steps_per_second": 4.156, "eval_token_acc": 0.9157020634121792, "step": 420 }, { "epoch": 1.8273633180716402, "grad_norm": 0.7264747619628906, "learning_rate": 3.29697049662423e-06, "loss": 0.1486160159111023, "memory(GiB)": 31.51, "step": 425, "token_acc": 0.9378997513712539, "train_speed(iter/s)": 0.137609 }, { "epoch": 1.8489092378130891, "grad_norm": 0.6881827116012573, "learning_rate": 3.191316055102146e-06, "loss": 0.14999903440475465, "memory(GiB)": 31.51, "step": 430, "token_acc": 0.9458710676835081, "train_speed(iter/s)": 0.13782 }, { "epoch": 1.870455157554538, "grad_norm": 0.7096033096313477, "learning_rate": 3.0865828381745515e-06, "loss": 0.15066919326782227, "memory(GiB)": 31.51, "step": 435, "token_acc": 0.9486315094650982, "train_speed(iter/s)": 0.137979 }, { "epoch": 1.892001077295987, "grad_norm": 0.7479064464569092, "learning_rate": 2.982824190050958e-06, "loss": 0.165749990940094, "memory(GiB)": 31.51, "step": 440, "token_acc": 0.9481958622195534, "train_speed(iter/s)": 0.138187 }, { "epoch": 1.892001077295987, "eval_loss": 0.24523746967315674, "eval_runtime": 9.1328, "eval_samples_per_second": 16.424, "eval_steps_per_second": 4.161, "eval_token_acc": 0.9161757304834365, "step": 440 }, { "epoch": 1.913546997037436, "grad_norm": 0.7331624031066895, "learning_rate": 2.8800929585596506e-06, "loss": 0.15496289730072021, "memory(GiB)": 31.51, "step": 445, "token_acc": 0.9352829677768751, "train_speed(iter/s)": 0.137542 }, { "epoch": 1.9350929167788848, "grad_norm": 0.6734929084777832, "learning_rate": 2.778441468230483e-06, "loss": 0.1523799180984497, "memory(GiB)": 31.51, "step": 450, "token_acc": 0.9479633806554332, "train_speed(iter/s)": 0.1377 }, { "epoch": 1.956638836520334, "grad_norm": 0.7542054057121277, "learning_rate": 2.6779214936442056e-06, "loss": 0.16172744035720826, "memory(GiB)": 31.51, "step": 455, "token_acc": 0.935499950154521, "train_speed(iter/s)": 0.137884 }, { "epoch": 1.978184756261783, "grad_norm": 0.7129687070846558, "learning_rate": 2.5785842330619038e-06, "loss": 0.15356701612472534, "memory(GiB)": 31.51, "step": 460, "token_acc": 0.941601546088564, "train_speed(iter/s)": 0.13804 }, { "epoch": 1.978184756261783, "eval_loss": 0.24467714130878448, "eval_runtime": 9.1316, "eval_samples_per_second": 16.426, "eval_steps_per_second": 4.161, "eval_token_acc": 0.9170934604339974, "step": 460 }, { "epoch": 1.9997306760032318, "grad_norm": 0.7861402630805969, "learning_rate": 2.480480282347961e-06, "loss": 0.15792056322097778, "memory(GiB)": 31.51, "step": 465, "token_acc": 0.9337420552337027, "train_speed(iter/s)": 0.137533 }, { "epoch": 2.0172367357931593, "grad_norm": 0.6826748847961426, "learning_rate": 2.383659609199873e-06, "loss": 0.14240689277648927, "memory(GiB)": 31.51, "step": 470, "token_acc": 0.9578195371952166, "train_speed(iter/s)": 0.137988 }, { "epoch": 2.0387826555346082, "grad_norm": 0.6501537561416626, "learning_rate": 2.2881715276979705e-06, "loss": 0.10814023017883301, "memory(GiB)": 31.51, "step": 475, "token_acc": 0.9586633663366336, "train_speed(iter/s)": 0.138103 }, { "epoch": 2.060328575276057, "grad_norm": 0.6575304269790649, "learning_rate": 2.1940646731880887e-06, "loss": 0.1118842363357544, "memory(GiB)": 31.51, "step": 480, "token_acc": 0.9698543524895563, "train_speed(iter/s)": 0.138265 }, { "epoch": 2.060328575276057, "eval_loss": 0.26591211557388306, "eval_runtime": 9.1643, "eval_samples_per_second": 16.368, "eval_steps_per_second": 4.147, "eval_token_acc": 0.9162201367713668, "step": 480 }, { "epoch": 2.081874495017506, "grad_norm": 0.765779972076416, "learning_rate": 2.101386977509907e-06, "loss": 0.12155743837356567, "memory(GiB)": 31.51, "step": 485, "token_acc": 0.946289860026969, "train_speed(iter/s)": 0.137773 }, { "epoch": 2.103420414758955, "grad_norm": 0.7146125435829163, "learning_rate": 2.010185644583641e-06, "loss": 0.11463183164596558, "memory(GiB)": 31.51, "step": 490, "token_acc": 0.9624759934997784, "train_speed(iter/s)": 0.137939 }, { "epoch": 2.124966334500404, "grad_norm": 0.6777431964874268, "learning_rate": 1.920507126367448e-06, "loss": 0.10685477256774903, "memory(GiB)": 31.51, "step": 495, "token_acc": 0.9612281857095818, "train_speed(iter/s)": 0.138102 }, { "epoch": 2.146512254241853, "grad_norm": 0.7272450923919678, "learning_rate": 1.8323970991978823e-06, "loss": 0.10419889688491821, "memory(GiB)": 31.51, "step": 500, "token_acc": 0.9610325296357052, "train_speed(iter/s)": 0.138212 }, { "epoch": 2.146512254241853, "eval_loss": 0.26627564430236816, "eval_runtime": 9.1679, "eval_samples_per_second": 16.361, "eval_steps_per_second": 4.145, "eval_token_acc": 0.9159833032357382, "step": 500 }, { "epoch": 2.168058173983302, "grad_norm": 0.7035279273986816, "learning_rate": 1.7459004405253544e-06, "loss": 0.1082868218421936, "memory(GiB)": 31.51, "step": 505, "token_acc": 0.9475257941268758, "train_speed(iter/s)": 0.137652 }, { "epoch": 2.1896040937247507, "grad_norm": 0.601047158241272, "learning_rate": 1.6610612060565235e-06, "loss": 0.09674398303031921, "memory(GiB)": 31.51, "step": 510, "token_acc": 0.9646692233940556, "train_speed(iter/s)": 0.137835 }, { "epoch": 2.2111500134661997, "grad_norm": 0.7340168356895447, "learning_rate": 1.5779226073152071e-06, "loss": 0.1145021677017212, "memory(GiB)": 31.51, "step": 515, "token_acc": 0.9572068592615479, "train_speed(iter/s)": 0.138109 }, { "epoch": 2.2326959332076486, "grad_norm": 0.7059099078178406, "learning_rate": 1.4965269896332884e-06, "loss": 0.1138340711593628, "memory(GiB)": 31.51, "step": 520, "token_acc": 0.9643348939686037, "train_speed(iter/s)": 0.138301 }, { "epoch": 2.2326959332076486, "eval_loss": 0.26528117060661316, "eval_runtime": 9.1705, "eval_samples_per_second": 16.357, "eval_steps_per_second": 4.144, "eval_token_acc": 0.9159536990437847, "step": 520 }, { "epoch": 2.254241852949098, "grad_norm": 0.7410480976104736, "learning_rate": 1.4169158105827768e-06, "loss": 0.11105086803436279, "memory(GiB)": 31.51, "step": 525, "token_acc": 0.9514605435256503, "train_speed(iter/s)": 0.137827 }, { "epoch": 2.275787772690547, "grad_norm": 0.6698100566864014, "learning_rate": 1.3391296188600594e-06, "loss": 0.10843292474746705, "memory(GiB)": 31.51, "step": 530, "token_acc": 0.9629225092250923, "train_speed(iter/s)": 0.137941 }, { "epoch": 2.297333692431996, "grad_norm": 0.6693587303161621, "learning_rate": 1.2632080336330532e-06, "loss": 0.11362366676330567, "memory(GiB)": 31.51, "step": 535, "token_acc": 0.9621976353183642, "train_speed(iter/s)": 0.138116 }, { "epoch": 2.3188796121734447, "grad_norm": 0.6858277320861816, "learning_rate": 1.1891897243618184e-06, "loss": 0.10754673480987549, "memory(GiB)": 31.51, "step": 540, "token_acc": 0.9638930030070464, "train_speed(iter/s)": 0.138279 }, { "epoch": 2.3188796121734447, "eval_loss": 0.2651301622390747, "eval_runtime": 9.1447, "eval_samples_per_second": 16.403, "eval_steps_per_second": 4.155, "eval_token_acc": 0.91620533467539, "step": 540 }, { "epoch": 2.3404255319148937, "grad_norm": 0.7329075932502747, "learning_rate": 1.1171123911028692e-06, "loss": 0.10752699375152588, "memory(GiB)": 31.51, "step": 545, "token_acc": 0.946654961925566, "train_speed(iter/s)": 0.137811 }, { "epoch": 2.3619714516563426, "grad_norm": 0.7588092684745789, "learning_rate": 1.047012745307255e-06, "loss": 0.10413261651992797, "memory(GiB)": 31.51, "step": 550, "token_acc": 0.9638513608403786, "train_speed(iter/s)": 0.137959 }, { "epoch": 2.3835173713977915, "grad_norm": 0.6776463389396667, "learning_rate": 9.789264911221546e-07, "loss": 0.11203373670578003, "memory(GiB)": 31.51, "step": 555, "token_acc": 0.9539418840061927, "train_speed(iter/s)": 0.138182 }, { "epoch": 2.4050632911392404, "grad_norm": 0.6233177185058594, "learning_rate": 9.128883072055411e-07, "loss": 0.10640518665313721, "memory(GiB)": 31.51, "step": 560, "token_acc": 0.9605239362389232, "train_speed(iter/s)": 0.138441 }, { "epoch": 2.4050632911392404, "eval_loss": 0.26431551575660706, "eval_runtime": 9.1559, "eval_samples_per_second": 16.383, "eval_steps_per_second": 4.15, "eval_token_acc": 0.9158056780840167, "step": 560 }, { "epoch": 2.4266092108806894, "grad_norm": 0.6891351938247681, "learning_rate": 8.489318290631454e-07, "loss": 0.11017493009567261, "memory(GiB)": 31.51, "step": 565, "token_acc": 0.9501483222252186, "train_speed(iter/s)": 0.137996 }, { "epoch": 2.4481551306221383, "grad_norm": 0.685417890548706, "learning_rate": 7.870896319167548e-07, "loss": 0.10502817630767822, "memory(GiB)": 31.51, "step": 570, "token_acc": 0.9675666865866247, "train_speed(iter/s)": 0.138123 }, { "epoch": 2.4697010503635872, "grad_norm": 0.8273110389709473, "learning_rate": 7.273932141125256e-07, "loss": 0.11376097202301025, "memory(GiB)": 31.51, "step": 575, "token_acc": 0.9588286984389538, "train_speed(iter/s)": 0.138286 }, { "epoch": 2.4912469701050366, "grad_norm": 0.7995973825454712, "learning_rate": 6.698729810778065e-07, "loss": 0.1191399335861206, "memory(GiB)": 31.51, "step": 580, "token_acc": 0.9582757592998997, "train_speed(iter/s)": 0.138394 }, { "epoch": 2.4912469701050366, "eval_loss": 0.26525548100471497, "eval_runtime": 9.1367, "eval_samples_per_second": 16.417, "eval_steps_per_second": 4.159, "eval_token_acc": 0.9161461262914828, "step": 580 }, { "epoch": 2.5127928898464855, "grad_norm": 0.6976614594459534, "learning_rate": 6.145582298346153e-07, "loss": 0.10850718021392822, "memory(GiB)": 31.51, "step": 585, "token_acc": 0.9468696569536905, "train_speed(iter/s)": 0.137863 }, { "epoch": 2.5343388095879344, "grad_norm": 0.7127689123153687, "learning_rate": 5.614771340776559e-07, "loss": 0.1049992561340332, "memory(GiB)": 31.51, "step": 590, "token_acc": 0.9636775106082037, "train_speed(iter/s)": 0.137998 }, { "epoch": 2.5558847293293834, "grad_norm": 0.7365370392799377, "learning_rate": 5.106567298245008e-07, "loss": 0.11682652235031128, "memory(GiB)": 31.51, "step": 595, "token_acc": 0.9585528403681371, "train_speed(iter/s)": 0.138108 }, { "epoch": 2.5774306490708323, "grad_norm": 0.6995398998260498, "learning_rate": 4.6212290164521554e-07, "loss": 0.10941903591156006, "memory(GiB)": 31.51, "step": 600, "token_acc": 0.9588730068630993, "train_speed(iter/s)": 0.138228 }, { "epoch": 2.5774306490708323, "eval_loss": 0.2648448944091797, "eval_runtime": 9.1555, "eval_samples_per_second": 16.384, "eval_steps_per_second": 4.15, "eval_token_acc": 0.91620533467539, "step": 600 }, { "epoch": 2.5989765688122812, "grad_norm": 0.6558308601379395, "learning_rate": 4.159003694784647e-07, "loss": 0.09994454979896546, "memory(GiB)": 31.51, "step": 605, "token_acc": 0.9488222044057573, "train_speed(iter/s)": 0.137823 }, { "epoch": 2.62052248855373, "grad_norm": 0.5623044371604919, "learning_rate": 3.7201267604080436e-07, "loss": 0.10503623485565186, "memory(GiB)": 31.51, "step": 610, "token_acc": 0.9605492530908896, "train_speed(iter/s)": 0.137935 }, { "epoch": 2.642068408295179, "grad_norm": 0.7411386966705322, "learning_rate": 3.3048217483556743e-07, "loss": 0.10335917472839355, "memory(GiB)": 31.51, "step": 615, "token_acc": 0.9596229517824632, "train_speed(iter/s)": 0.138049 }, { "epoch": 2.663614328036628, "grad_norm": 0.7703331112861633, "learning_rate": 2.9133001876746004e-07, "loss": 0.11330341100692749, "memory(GiB)": 31.51, "step": 620, "token_acc": 0.9627358888545153, "train_speed(iter/s)": 0.138195 }, { "epoch": 2.663614328036628, "eval_loss": 0.2645653188228607, "eval_runtime": 9.1574, "eval_samples_per_second": 16.38, "eval_steps_per_second": 4.15, "eval_token_acc": 0.9161461262914828, "step": 620 }, { "epoch": 2.685160247778077, "grad_norm": 0.7208820581436157, "learning_rate": 2.545761493686666e-07, "loss": 0.10512195825576783, "memory(GiB)": 31.51, "step": 625, "token_acc": 0.9453959214438257, "train_speed(iter/s)": 0.137691 }, { "epoch": 2.706706167519526, "grad_norm": 0.792917013168335, "learning_rate": 2.2023928664194229e-07, "loss": 0.10448248386383056, "memory(GiB)": 31.51, "step": 630, "token_acc": 0.9634454263743831, "train_speed(iter/s)": 0.137865 }, { "epoch": 2.728252087260975, "grad_norm": 0.7074964046478271, "learning_rate": 1.8833691952587829e-07, "loss": 0.10274065732955932, "memory(GiB)": 31.51, "step": 635, "token_acc": 0.9614498168320434, "train_speed(iter/s)": 0.137973 }, { "epoch": 2.7497980070024237, "grad_norm": 0.695501446723938, "learning_rate": 1.5888529698718347e-07, "loss": 0.111275053024292, "memory(GiB)": 31.51, "step": 640, "token_acc": 0.9627026215729437, "train_speed(iter/s)": 0.13808 }, { "epoch": 2.7497980070024237, "eval_loss": 0.26474642753601074, "eval_runtime": 9.1444, "eval_samples_per_second": 16.403, "eval_steps_per_second": 4.156, "eval_token_acc": 0.9162349388673436, "step": 640 }, { "epoch": 2.7713439267438726, "grad_norm": 0.7010114192962646, "learning_rate": 1.3189941974453502e-07, "loss": 0.11862779855728149, "memory(GiB)": 31.51, "step": 645, "token_acc": 0.9463942439720986, "train_speed(iter/s)": 0.13766 }, { "epoch": 2.7928898464853216, "grad_norm": 0.7053817510604858, "learning_rate": 1.0739303262819301e-07, "loss": 0.10773177146911621, "memory(GiB)": 31.51, "step": 650, "token_acc": 0.9672267425750056, "train_speed(iter/s)": 0.137765 }, { "epoch": 2.814435766226771, "grad_norm": 0.7381494641304016, "learning_rate": 8.537861757929422e-08, "loss": 0.10787509679794312, "memory(GiB)": 31.51, "step": 655, "token_acc": 0.9632690990902866, "train_speed(iter/s)": 0.137917 }, { "epoch": 2.83598168596822, "grad_norm": 0.7262890934944153, "learning_rate": 6.58673872923693e-08, "loss": 0.11206209659576416, "memory(GiB)": 31.51, "step": 660, "token_acc": 0.965990990990991, "train_speed(iter/s)": 0.138079 }, { "epoch": 2.83598168596822, "eval_loss": 0.2648203372955322, "eval_runtime": 9.1399, "eval_samples_per_second": 16.412, "eval_steps_per_second": 4.158, "eval_token_acc": 0.916279345155274, "step": 660 }, { "epoch": 2.857527605709669, "grad_norm": 0.7241911888122559, "learning_rate": 4.88692795043344e-08, "loss": 0.10918653011322021, "memory(GiB)": 31.51, "step": 665, "token_acc": 0.9504457917261055, "train_speed(iter/s)": 0.137668 }, { "epoch": 2.8790735254511177, "grad_norm": 0.7945267558097839, "learning_rate": 3.439295193286174e-08, "loss": 0.11153676509857177, "memory(GiB)": 31.51, "step": 670, "token_acc": 0.9583095218657305, "train_speed(iter/s)": 0.13784 }, { "epoch": 2.9006194451925666, "grad_norm": 0.7374799847602844, "learning_rate": 2.2445777866709208e-08, "loss": 0.10855717658996582, "memory(GiB)": 31.51, "step": 675, "token_acc": 0.9629715143294179, "train_speed(iter/s)": 0.13794 }, { "epoch": 2.9221653649340156, "grad_norm": 0.6748504042625427, "learning_rate": 1.3033842410251074e-08, "loss": 0.11381592750549316, "memory(GiB)": 31.51, "step": 680, "token_acc": 0.9596367864459332, "train_speed(iter/s)": 0.138055 }, { "epoch": 2.9221653649340156, "eval_loss": 0.26483333110809326, "eval_runtime": 9.1492, "eval_samples_per_second": 16.395, "eval_steps_per_second": 4.153, "eval_token_acc": 0.9160425116196453, "step": 680 }, { "epoch": 2.9437112846754645, "grad_norm": 0.7286980748176575, "learning_rate": 6.16193938412557e-09, "loss": 0.10510704517364503, "memory(GiB)": 31.51, "step": 685, "token_acc": 0.9509096674461929, "train_speed(iter/s)": 0.137639 }, { "epoch": 2.9652572044169134, "grad_norm": 0.6892649531364441, "learning_rate": 1.8335688835802169e-09, "loss": 0.105083167552948, "memory(GiB)": 31.51, "step": 690, "token_acc": 0.9589703497799398, "train_speed(iter/s)": 0.137777 }, { "epoch": 2.9868031241583624, "grad_norm": 0.7908564209938049, "learning_rate": 5.093549575119205e-11, "loss": 0.10409483909606934, "memory(GiB)": 31.51, "step": 695, "token_acc": 0.9638513775207209, "train_speed(iter/s)": 0.137894 }, { "epoch": 2.9911123081066524, "eval_loss": 0.26468953490257263, "eval_runtime": 9.1426, "eval_samples_per_second": 16.407, "eval_steps_per_second": 4.156, "eval_token_acc": 0.9161757304834365, "step": 696 } ], "logging_steps": 5, "max_steps": 696, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.392166013990339e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }