{ "best_global_step": 1580, "best_metric": 0.33898818, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v54-20250507-020216/checkpoint-1580", "epoch": 2.9972559780478245, "eval_steps": 20, "global_step": 2391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012544100352802822, "grad_norm": 3.029660940170288, "learning_rate": 9.999995684008912e-06, "loss": 0.7123785018920898, "memory(GiB)": 28.82, "step": 1, "token_acc": 0.8081615027528878, "train_speed(iter/s)": 0.06477 }, { "epoch": 0.006272050176401411, "grad_norm": 2.1236259937286377, "learning_rate": 9.999892100595329e-06, "loss": 0.6517069339752197, "memory(GiB)": 28.86, "step": 5, "token_acc": 0.8096913375373382, "train_speed(iter/s)": 0.12466 }, { "epoch": 0.012544100352802822, "grad_norm": 1.261985182762146, "learning_rate": 9.999568407038233e-06, "loss": 0.5820259571075439, "memory(GiB)": 28.86, "step": 10, "token_acc": 0.8246924192768496, "train_speed(iter/s)": 0.139298 }, { "epoch": 0.018816150529204233, "grad_norm": 1.0761600732803345, "learning_rate": 9.999028933299243e-06, "loss": 0.5411366939544677, "memory(GiB)": 28.86, "step": 15, "token_acc": 0.8337762808199353, "train_speed(iter/s)": 0.145935 }, { "epoch": 0.025088200705605645, "grad_norm": 1.0089085102081299, "learning_rate": 9.99827370266192e-06, "loss": 0.512534236907959, "memory(GiB)": 28.86, "step": 20, "token_acc": 0.8508319467554076, "train_speed(iter/s)": 0.147072 }, { "epoch": 0.025088200705605645, "eval_loss": 0.49907156825065613, "eval_runtime": 29.7712, "eval_samples_per_second": 17.299, "eval_steps_per_second": 4.333, "eval_token_acc": 0.8417260457708107, "step": 20 }, { "epoch": 0.03136025088200706, "grad_norm": 0.9683756232261658, "learning_rate": 9.99730274772184e-06, "loss": 0.509831714630127, "memory(GiB)": 28.86, "step": 25, "token_acc": 0.8483637399068481, "train_speed(iter/s)": 0.120173 }, { "epoch": 0.037632301058408466, "grad_norm": 1.0160096883773804, "learning_rate": 9.996116110385186e-06, "loss": 0.5112733364105224, "memory(GiB)": 28.86, "step": 30, "token_acc": 0.8454686767499815, "train_speed(iter/s)": 0.125825 }, { "epoch": 0.04390435123480988, "grad_norm": 0.9916045665740967, "learning_rate": 9.99471384186694e-06, "loss": 0.5009718418121338, "memory(GiB)": 28.87, "step": 35, "token_acc": 0.8552517091361094, "train_speed(iter/s)": 0.129686 }, { "epoch": 0.05017640141121129, "grad_norm": 0.9175627827644348, "learning_rate": 9.99309600268868e-06, "loss": 0.47254362106323244, "memory(GiB)": 28.87, "step": 40, "token_acc": 0.8560227119102338, "train_speed(iter/s)": 0.131824 }, { "epoch": 0.05017640141121129, "eval_loss": 0.46525266766548157, "eval_runtime": 29.5949, "eval_samples_per_second": 17.402, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8489353675915835, "step": 40 }, { "epoch": 0.0564484515876127, "grad_norm": 0.9363399744033813, "learning_rate": 9.991262662675962e-06, "loss": 0.49153480529785154, "memory(GiB)": 28.87, "step": 45, "token_acc": 0.8566000039174975, "train_speed(iter/s)": 0.120002 }, { "epoch": 0.06272050176401411, "grad_norm": 0.9040453433990479, "learning_rate": 9.9892139009553e-06, "loss": 0.47645087242126466, "memory(GiB)": 28.87, "step": 50, "token_acc": 0.8460839347767977, "train_speed(iter/s)": 0.1231 }, { "epoch": 0.06899255194041552, "grad_norm": 0.9906909465789795, "learning_rate": 9.986949805950763e-06, "loss": 0.48256454467773435, "memory(GiB)": 28.87, "step": 55, "token_acc": 0.8567946374162096, "train_speed(iter/s)": 0.12528 }, { "epoch": 0.07526460211681693, "grad_norm": 0.9595295190811157, "learning_rate": 9.984470475380154e-06, "loss": 0.47391643524169924, "memory(GiB)": 28.87, "step": 60, "token_acc": 0.8629193166230203, "train_speed(iter/s)": 0.12768 }, { "epoch": 0.07526460211681693, "eval_loss": 0.4490402638912201, "eval_runtime": 29.6498, "eval_samples_per_second": 17.369, "eval_steps_per_second": 4.351, "eval_token_acc": 0.8536633414368346, "step": 60 }, { "epoch": 0.08153665229321834, "grad_norm": 0.9027647972106934, "learning_rate": 9.981776016250789e-06, "loss": 0.4695126533508301, "memory(GiB)": 28.87, "step": 65, "token_acc": 0.8616360365706656, "train_speed(iter/s)": 0.119908 }, { "epoch": 0.08780870246961976, "grad_norm": 0.9021025896072388, "learning_rate": 9.97886654485488e-06, "loss": 0.44661579132080076, "memory(GiB)": 28.87, "step": 70, "token_acc": 0.8703178432256183, "train_speed(iter/s)": 0.121955 }, { "epoch": 0.09408075264602117, "grad_norm": 0.9017606973648071, "learning_rate": 9.975742186764526e-06, "loss": 0.4442440509796143, "memory(GiB)": 28.87, "step": 75, "token_acc": 0.8678062233322926, "train_speed(iter/s)": 0.123859 }, { "epoch": 0.10035280282242258, "grad_norm": 0.9587047696113586, "learning_rate": 9.972403076826272e-06, "loss": 0.454923677444458, "memory(GiB)": 28.87, "step": 80, "token_acc": 0.849014240457663, "train_speed(iter/s)": 0.125488 }, { "epoch": 0.10035280282242258, "eval_loss": 0.4374794065952301, "eval_runtime": 29.5479, "eval_samples_per_second": 17.429, "eval_steps_per_second": 4.366, "eval_token_acc": 0.8562285187358538, "step": 80 }, { "epoch": 0.10662485299882399, "grad_norm": 0.8854019641876221, "learning_rate": 9.96884935915531e-06, "loss": 0.4319493770599365, "memory(GiB)": 28.87, "step": 85, "token_acc": 0.8624362103394719, "train_speed(iter/s)": 0.119824 }, { "epoch": 0.1128969031752254, "grad_norm": 0.8231946229934692, "learning_rate": 9.965081187129248e-06, "loss": 0.43582682609558104, "memory(GiB)": 28.87, "step": 90, "token_acc": 0.8703445195153275, "train_speed(iter/s)": 0.120848 }, { "epoch": 0.11916895335162682, "grad_norm": 0.9398965239524841, "learning_rate": 9.961098723381495e-06, "loss": 0.4444568634033203, "memory(GiB)": 28.87, "step": 95, "token_acc": 0.8672030063249265, "train_speed(iter/s)": 0.122608 }, { "epoch": 0.12544100352802823, "grad_norm": 0.8782020807266235, "learning_rate": 9.956902139794236e-06, "loss": 0.4650153636932373, "memory(GiB)": 28.87, "step": 100, "token_acc": 0.8458562410914274, "train_speed(iter/s)": 0.12399 }, { "epoch": 0.12544100352802823, "eval_loss": 0.4288901388645172, "eval_runtime": 29.5859, "eval_samples_per_second": 17.407, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8588523765613212, "step": 100 }, { "epoch": 0.13171305370442962, "grad_norm": 0.885040283203125, "learning_rate": 9.95249161749102e-06, "loss": 0.45799875259399414, "memory(GiB)": 28.87, "step": 105, "token_acc": 0.8647499099695356, "train_speed(iter/s)": 0.119708 }, { "epoch": 0.13798510388083104, "grad_norm": 0.8790634274482727, "learning_rate": 9.94786734682894e-06, "loss": 0.4435451030731201, "memory(GiB)": 28.87, "step": 110, "token_acc": 0.8620516456892986, "train_speed(iter/s)": 0.120941 }, { "epoch": 0.14425715405723247, "grad_norm": 0.8384956121444702, "learning_rate": 9.943029527390415e-06, "loss": 0.45367069244384767, "memory(GiB)": 28.87, "step": 115, "token_acc": 0.8639205792014041, "train_speed(iter/s)": 0.122387 }, { "epoch": 0.15052920423363386, "grad_norm": 0.828059196472168, "learning_rate": 9.93797836797458e-06, "loss": 0.4475994110107422, "memory(GiB)": 28.87, "step": 120, "token_acc": 0.8620637823483729, "train_speed(iter/s)": 0.123587 }, { "epoch": 0.15052920423363386, "eval_loss": 0.4217882454395294, "eval_runtime": 29.5603, "eval_samples_per_second": 17.422, "eval_steps_per_second": 4.364, "eval_token_acc": 0.8602690921284265, "step": 120 }, { "epoch": 0.15680125441003528, "grad_norm": 0.9089007377624512, "learning_rate": 9.932714086588276e-06, "loss": 0.4421473503112793, "memory(GiB)": 28.87, "step": 125, "token_acc": 0.8651899104582431, "train_speed(iter/s)": 0.119877 }, { "epoch": 0.16307330458643668, "grad_norm": 0.8998169898986816, "learning_rate": 9.92723691043663e-06, "loss": 0.4212520599365234, "memory(GiB)": 28.87, "step": 130, "token_acc": 0.8652097803376814, "train_speed(iter/s)": 0.120973 }, { "epoch": 0.1693453547628381, "grad_norm": 0.8805875778198242, "learning_rate": 9.921547075913261e-06, "loss": 0.439087438583374, "memory(GiB)": 28.87, "step": 135, "token_acc": 0.8655743664312422, "train_speed(iter/s)": 0.121977 }, { "epoch": 0.17561740493923952, "grad_norm": 0.8694149851799011, "learning_rate": 9.915644828590074e-06, "loss": 0.4487740516662598, "memory(GiB)": 28.87, "step": 140, "token_acc": 0.8618538324420677, "train_speed(iter/s)": 0.123124 }, { "epoch": 0.17561740493923952, "eval_loss": 0.4171818792819977, "eval_runtime": 29.6071, "eval_samples_per_second": 17.394, "eval_steps_per_second": 4.357, "eval_token_acc": 0.86238159108056, "step": 140 }, { "epoch": 0.18188945511564092, "grad_norm": 0.8099656701087952, "learning_rate": 9.909530423206657e-06, "loss": 0.43574037551879885, "memory(GiB)": 28.87, "step": 145, "token_acc": 0.8709998032902297, "train_speed(iter/s)": 0.119783 }, { "epoch": 0.18816150529204234, "grad_norm": 0.811008632183075, "learning_rate": 9.903204123659288e-06, "loss": 0.42326993942260743, "memory(GiB)": 28.87, "step": 150, "token_acc": 0.8682702065220372, "train_speed(iter/s)": 0.120804 }, { "epoch": 0.19443355546844374, "grad_norm": 0.8804604411125183, "learning_rate": 9.896666202989553e-06, "loss": 0.43947763442993165, "memory(GiB)": 28.87, "step": 155, "token_acc": 0.8630830791616532, "train_speed(iter/s)": 0.121921 }, { "epoch": 0.20070560564484516, "grad_norm": 0.8700482845306396, "learning_rate": 9.889916943372549e-06, "loss": 0.43802604675292967, "memory(GiB)": 28.87, "step": 160, "token_acc": 0.8609460606528282, "train_speed(iter/s)": 0.12293 }, { "epoch": 0.20070560564484516, "eval_loss": 0.4114477336406708, "eval_runtime": 29.5534, "eval_samples_per_second": 17.426, "eval_steps_per_second": 4.365, "eval_token_acc": 0.8631695867214352, "step": 160 }, { "epoch": 0.20697765582124658, "grad_norm": 0.7978352308273315, "learning_rate": 9.882956636104714e-06, "loss": 0.43416438102722166, "memory(GiB)": 28.87, "step": 165, "token_acc": 0.8689747827683861, "train_speed(iter/s)": 0.120122 }, { "epoch": 0.21324970599764798, "grad_norm": 0.8909711837768555, "learning_rate": 9.875785581591253e-06, "loss": 0.43141732215881345, "memory(GiB)": 28.87, "step": 170, "token_acc": 0.8619692543743177, "train_speed(iter/s)": 0.120876 }, { "epoch": 0.2195217561740494, "grad_norm": 0.8957451581954956, "learning_rate": 9.868404089333171e-06, "loss": 0.42152652740478513, "memory(GiB)": 28.87, "step": 175, "token_acc": 0.8754910463751849, "train_speed(iter/s)": 0.121784 }, { "epoch": 0.2257938063504508, "grad_norm": 0.9475491046905518, "learning_rate": 9.860812477913915e-06, "loss": 0.417528772354126, "memory(GiB)": 28.87, "step": 180, "token_acc": 0.8698917851281053, "train_speed(iter/s)": 0.122636 }, { "epoch": 0.2257938063504508, "eval_loss": 0.4081062376499176, "eval_runtime": 29.6207, "eval_samples_per_second": 17.387, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8642426020622014, "step": 180 }, { "epoch": 0.23206585652685222, "grad_norm": 0.8352420926094055, "learning_rate": 9.853011074985628e-06, "loss": 0.4175262928009033, "memory(GiB)": 28.87, "step": 185, "token_acc": 0.8727626873915867, "train_speed(iter/s)": 0.120174 }, { "epoch": 0.23833790670325364, "grad_norm": 0.9205328822135925, "learning_rate": 9.845000217255e-06, "loss": 0.4254606246948242, "memory(GiB)": 28.87, "step": 190, "token_acc": 0.8669568108320489, "train_speed(iter/s)": 0.121076 }, { "epoch": 0.24460995687965503, "grad_norm": 0.8192192316055298, "learning_rate": 9.836780250468744e-06, "loss": 0.41744155883789064, "memory(GiB)": 28.87, "step": 195, "token_acc": 0.8699167905678588, "train_speed(iter/s)": 0.121801 }, { "epoch": 0.25088200705605646, "grad_norm": 0.8817274570465088, "learning_rate": 9.82835152939867e-06, "loss": 0.4215375900268555, "memory(GiB)": 28.87, "step": 200, "token_acc": 0.8657595006023436, "train_speed(iter/s)": 0.122488 }, { "epoch": 0.25088200705605646, "eval_loss": 0.403290718793869, "eval_runtime": 29.5998, "eval_samples_per_second": 17.399, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8656928493587057, "step": 200 }, { "epoch": 0.2571540572324579, "grad_norm": 0.8728285431861877, "learning_rate": 9.81971441782637e-06, "loss": 0.4273026466369629, "memory(GiB)": 28.87, "step": 205, "token_acc": 0.8701400706898809, "train_speed(iter/s)": 0.120321 }, { "epoch": 0.26342610740885924, "grad_norm": 0.840922474861145, "learning_rate": 9.810869288527528e-06, "loss": 0.41336374282836913, "memory(GiB)": 28.87, "step": 210, "token_acc": 0.8631279407207055, "train_speed(iter/s)": 0.120898 }, { "epoch": 0.26969815758526067, "grad_norm": 0.9264822602272034, "learning_rate": 9.801816523255811e-06, "loss": 0.40021047592163084, "memory(GiB)": 28.87, "step": 215, "token_acc": 0.8800351361525913, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.2759702077616621, "grad_norm": 0.833281934261322, "learning_rate": 9.792556512726419e-06, "loss": 0.4237551689147949, "memory(GiB)": 28.87, "step": 220, "token_acc": 0.8799363351171803, "train_speed(iter/s)": 0.122367 }, { "epoch": 0.2759702077616621, "eval_loss": 0.40029406547546387, "eval_runtime": 29.5811, "eval_samples_per_second": 17.41, "eval_steps_per_second": 4.361, "eval_token_acc": 0.8666275463157013, "step": 220 }, { "epoch": 0.2822422579380635, "grad_norm": 0.9009267687797546, "learning_rate": 9.783089656599196e-06, "loss": 0.41417922973632815, "memory(GiB)": 28.87, "step": 225, "token_acc": 0.8714769221553299, "train_speed(iter/s)": 0.120484 }, { "epoch": 0.28851430811446493, "grad_norm": 0.8666077256202698, "learning_rate": 9.773416363461401e-06, "loss": 0.4124805450439453, "memory(GiB)": 28.87, "step": 230, "token_acc": 0.880201765447667, "train_speed(iter/s)": 0.121099 }, { "epoch": 0.2947863582908663, "grad_norm": 0.8758551478385925, "learning_rate": 9.763537050810064e-06, "loss": 0.41759481430053713, "memory(GiB)": 28.87, "step": 235, "token_acc": 0.8732158264513743, "train_speed(iter/s)": 0.121632 }, { "epoch": 0.3010584084672677, "grad_norm": 0.8822108507156372, "learning_rate": 9.753452145033961e-06, "loss": 0.42021803855895995, "memory(GiB)": 28.87, "step": 240, "token_acc": 0.8624877221180463, "train_speed(iter/s)": 0.122082 }, { "epoch": 0.3010584084672677, "eval_loss": 0.3969193994998932, "eval_runtime": 29.8072, "eval_samples_per_second": 17.278, "eval_steps_per_second": 4.328, "eval_token_acc": 0.8671514795875598, "step": 240 }, { "epoch": 0.30733045864366915, "grad_norm": 0.8353932499885559, "learning_rate": 9.743162081395227e-06, "loss": 0.4134369850158691, "memory(GiB)": 28.87, "step": 245, "token_acc": 0.8730592226495686, "train_speed(iter/s)": 0.120284 }, { "epoch": 0.31360250882007057, "grad_norm": 0.861675500869751, "learning_rate": 9.73266730401056e-06, "loss": 0.42804179191589353, "memory(GiB)": 28.87, "step": 250, "token_acc": 0.8642180774748924, "train_speed(iter/s)": 0.121032 }, { "epoch": 0.319874558996472, "grad_norm": 0.86508709192276, "learning_rate": 9.72196826583205e-06, "loss": 0.4076427459716797, "memory(GiB)": 28.87, "step": 255, "token_acc": 0.8697714532125916, "train_speed(iter/s)": 0.121618 }, { "epoch": 0.32614660917287336, "grad_norm": 0.8428534865379333, "learning_rate": 9.711065428627638e-06, "loss": 0.41555137634277345, "memory(GiB)": 28.87, "step": 260, "token_acc": 0.8662227763482505, "train_speed(iter/s)": 0.122077 }, { "epoch": 0.32614660917287336, "eval_loss": 0.39386531710624695, "eval_runtime": 29.7323, "eval_samples_per_second": 17.321, "eval_steps_per_second": 4.339, "eval_token_acc": 0.8679436666946098, "step": 260 }, { "epoch": 0.3324186593492748, "grad_norm": 0.8486117124557495, "learning_rate": 9.699959262961182e-06, "loss": 0.422438907623291, "memory(GiB)": 28.87, "step": 265, "token_acc": 0.8720271109166218, "train_speed(iter/s)": 0.120433 }, { "epoch": 0.3386907095256762, "grad_norm": 0.8571962714195251, "learning_rate": 9.688650248172145e-06, "loss": 0.43577041625976565, "memory(GiB)": 28.87, "step": 270, "token_acc": 0.8602295350743647, "train_speed(iter/s)": 0.120989 }, { "epoch": 0.3449627597020776, "grad_norm": 0.8868110775947571, "learning_rate": 9.677138872354916e-06, "loss": 0.41460485458374025, "memory(GiB)": 28.87, "step": 275, "token_acc": 0.8727983282221077, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.35123480987847905, "grad_norm": 0.9915756583213806, "learning_rate": 9.665425632337731e-06, "loss": 0.4305459976196289, "memory(GiB)": 28.87, "step": 280, "token_acc": 0.8665983351969189, "train_speed(iter/s)": 0.12212 }, { "epoch": 0.35123480987847905, "eval_loss": 0.3917335569858551, "eval_runtime": 29.562, "eval_samples_per_second": 17.421, "eval_steps_per_second": 4.364, "eval_token_acc": 0.8687568111325341, "step": 280 }, { "epoch": 0.3575068600548804, "grad_norm": 0.9129208922386169, "learning_rate": 9.653511033661242e-06, "loss": 0.4195101261138916, "memory(GiB)": 28.87, "step": 285, "token_acc": 0.8746003876873738, "train_speed(iter/s)": 0.12054 }, { "epoch": 0.36377891023128184, "grad_norm": 0.8719012141227722, "learning_rate": 9.641395590556689e-06, "loss": 0.3962116241455078, "memory(GiB)": 28.87, "step": 290, "token_acc": 0.8869061113450141, "train_speed(iter/s)": 0.121042 }, { "epoch": 0.37005096040768326, "grad_norm": 0.8595064878463745, "learning_rate": 9.629079825923712e-06, "loss": 0.40920305252075195, "memory(GiB)": 28.87, "step": 295, "token_acc": 0.8752032411973238, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.3763230105840847, "grad_norm": 0.8926518559455872, "learning_rate": 9.616564271307779e-06, "loss": 0.42294983863830565, "memory(GiB)": 28.87, "step": 300, "token_acc": 0.8639302937308199, "train_speed(iter/s)": 0.12214 }, { "epoch": 0.3763230105840847, "eval_loss": 0.3890170454978943, "eval_runtime": 29.5793, "eval_samples_per_second": 17.411, "eval_steps_per_second": 4.361, "eval_token_acc": 0.8693939139911141, "step": 300 }, { "epoch": 0.3825950607604861, "grad_norm": 0.8123087286949158, "learning_rate": 9.603849466877249e-06, "loss": 0.39454007148742676, "memory(GiB)": 28.87, "step": 305, "token_acc": 0.8777894566623544, "train_speed(iter/s)": 0.120651 }, { "epoch": 0.3888671109368875, "grad_norm": 0.824553370475769, "learning_rate": 9.59093596140005e-06, "loss": 0.3974800109863281, "memory(GiB)": 28.87, "step": 310, "token_acc": 0.8730650935309837, "train_speed(iter/s)": 0.121172 }, { "epoch": 0.3951391611132889, "grad_norm": 0.8097919821739197, "learning_rate": 9.577824312220006e-06, "loss": 0.40521669387817383, "memory(GiB)": 28.87, "step": 315, "token_acc": 0.8752670549328339, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.4014112112896903, "grad_norm": 0.7727741003036499, "learning_rate": 9.564515085232772e-06, "loss": 0.4013851165771484, "memory(GiB)": 28.87, "step": 320, "token_acc": 0.868708497086326, "train_speed(iter/s)": 0.121981 }, { "epoch": 0.4014112112896903, "eval_loss": 0.3878418207168579, "eval_runtime": 29.6547, "eval_samples_per_second": 17.367, "eval_steps_per_second": 4.35, "eval_token_acc": 0.8700855059099674, "step": 320 }, { "epoch": 0.40768326146609174, "grad_norm": 0.8198765516281128, "learning_rate": 9.55100885486142e-06, "loss": 0.4199061393737793, "memory(GiB)": 28.87, "step": 325, "token_acc": 0.8758526603001364, "train_speed(iter/s)": 0.120641 }, { "epoch": 0.41395531164249316, "grad_norm": 0.9562544226646423, "learning_rate": 9.537306204031628e-06, "loss": 0.4178496837615967, "memory(GiB)": 28.87, "step": 330, "token_acc": 0.8739065294089218, "train_speed(iter/s)": 0.121059 }, { "epoch": 0.42022736181889453, "grad_norm": 0.8651145100593567, "learning_rate": 9.523407724146548e-06, "loss": 0.414202356338501, "memory(GiB)": 28.87, "step": 335, "token_acc": 0.866993368311716, "train_speed(iter/s)": 0.121595 }, { "epoch": 0.42649941199529595, "grad_norm": 0.8332494497299194, "learning_rate": 9.509314015061263e-06, "loss": 0.3904601812362671, "memory(GiB)": 28.87, "step": 340, "token_acc": 0.8766263821033685, "train_speed(iter/s)": 0.121943 }, { "epoch": 0.42649941199529595, "eval_loss": 0.3836223781108856, "eval_runtime": 29.5735, "eval_samples_per_second": 17.414, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8714351580182748, "step": 340 }, { "epoch": 0.4327714621716974, "grad_norm": 0.7933188080787659, "learning_rate": 9.495025685056898e-06, "loss": 0.4055050849914551, "memory(GiB)": 28.87, "step": 345, "token_acc": 0.8755267232424593, "train_speed(iter/s)": 0.120637 }, { "epoch": 0.4390435123480988, "grad_norm": 0.8231353759765625, "learning_rate": 9.480543350814376e-06, "loss": 0.41367053985595703, "memory(GiB)": 28.87, "step": 350, "token_acc": 0.8694767367912289, "train_speed(iter/s)": 0.121138 }, { "epoch": 0.4453155625245002, "grad_norm": 0.7924429178237915, "learning_rate": 9.465867637387793e-06, "loss": 0.4196880340576172, "memory(GiB)": 28.87, "step": 355, "token_acc": 0.8645996976613214, "train_speed(iter/s)": 0.121637 }, { "epoch": 0.4515876127009016, "grad_norm": 0.7936041951179504, "learning_rate": 9.450999178177445e-06, "loss": 0.40215320587158204, "memory(GiB)": 28.87, "step": 360, "token_acc": 0.8788266393792104, "train_speed(iter/s)": 0.122002 }, { "epoch": 0.4515876127009016, "eval_loss": 0.3816666603088379, "eval_runtime": 29.619, "eval_samples_per_second": 17.387, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8717076033196413, "step": 360 }, { "epoch": 0.457859662877303, "grad_norm": 0.8175886869430542, "learning_rate": 9.435938614902494e-06, "loss": 0.38351328372955323, "memory(GiB)": 28.87, "step": 365, "token_acc": 0.8787878787878788, "train_speed(iter/s)": 0.120772 }, { "epoch": 0.46413171305370443, "grad_norm": 0.8586422801017761, "learning_rate": 9.42068659757326e-06, "loss": 0.4199483394622803, "memory(GiB)": 28.87, "step": 370, "token_acc": 0.8653846153846154, "train_speed(iter/s)": 0.121125 }, { "epoch": 0.47040376323010585, "grad_norm": 0.8419802784919739, "learning_rate": 9.405243784463181e-06, "loss": 0.4090768337249756, "memory(GiB)": 28.87, "step": 375, "token_acc": 0.8812298983661392, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.4766758134065073, "grad_norm": 0.9342820048332214, "learning_rate": 9.389610842080394e-06, "loss": 0.414335823059082, "memory(GiB)": 28.87, "step": 380, "token_acc": 0.8696789068211213, "train_speed(iter/s)": 0.121985 }, { "epoch": 0.4766758134065073, "eval_loss": 0.3803957402706146, "eval_runtime": 29.5699, "eval_samples_per_second": 17.416, "eval_steps_per_second": 4.363, "eval_token_acc": 0.8713722860256518, "step": 380 }, { "epoch": 0.48294786358290864, "grad_norm": 0.8180081844329834, "learning_rate": 9.373788445138972e-06, "loss": 0.39729149341583253, "memory(GiB)": 28.87, "step": 385, "token_acc": 0.8782972920319502, "train_speed(iter/s)": 0.120827 }, { "epoch": 0.48921991375931007, "grad_norm": 0.8141390085220337, "learning_rate": 9.357777276529793e-06, "loss": 0.3939579963684082, "memory(GiB)": 28.87, "step": 390, "token_acc": 0.8719189555691457, "train_speed(iter/s)": 0.121285 }, { "epoch": 0.4954919639357115, "grad_norm": 0.79034024477005, "learning_rate": 9.341578027291085e-06, "loss": 0.3828037977218628, "memory(GiB)": 28.87, "step": 395, "token_acc": 0.8823363286264442, "train_speed(iter/s)": 0.121636 }, { "epoch": 0.5017640141121129, "grad_norm": 0.903527021408081, "learning_rate": 9.325191396578589e-06, "loss": 0.4000723838806152, "memory(GiB)": 28.87, "step": 400, "token_acc": 0.8762913767657601, "train_speed(iter/s)": 0.122009 }, { "epoch": 0.5017640141121129, "eval_loss": 0.3788001239299774, "eval_runtime": 29.5859, "eval_samples_per_second": 17.407, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8721686645988767, "step": 400 }, { "epoch": 0.5080360642885143, "grad_norm": 0.8032602667808533, "learning_rate": 9.308618091635382e-06, "loss": 0.38360297679901123, "memory(GiB)": 28.87, "step": 405, "token_acc": 0.8814102236094765, "train_speed(iter/s)": 0.120867 }, { "epoch": 0.5143081144649158, "grad_norm": 0.8667065501213074, "learning_rate": 9.291858827761359e-06, "loss": 0.39394588470458985, "memory(GiB)": 28.87, "step": 410, "token_acc": 0.8806277558598282, "train_speed(iter/s)": 0.121171 }, { "epoch": 0.5205801646413172, "grad_norm": 1.8045368194580078, "learning_rate": 9.274914328282359e-06, "loss": 0.41473889350891113, "memory(GiB)": 28.87, "step": 415, "token_acc": 0.8681345140319431, "train_speed(iter/s)": 0.121492 }, { "epoch": 0.5268522148177185, "grad_norm": 0.8981844782829285, "learning_rate": 9.257785324518943e-06, "loss": 0.39594154357910155, "memory(GiB)": 28.87, "step": 420, "token_acc": 0.8816446146703807, "train_speed(iter/s)": 0.12182 }, { "epoch": 0.5268522148177185, "eval_loss": 0.3774873614311218, "eval_runtime": 29.5939, "eval_samples_per_second": 17.402, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8723111744488222, "step": 420 }, { "epoch": 0.5331242649941199, "grad_norm": 0.8493431806564331, "learning_rate": 9.240472555754835e-06, "loss": 0.3900270462036133, "memory(GiB)": 28.87, "step": 425, "token_acc": 0.876733513421235, "train_speed(iter/s)": 0.120876 }, { "epoch": 0.5393963151705213, "grad_norm": 0.9211533069610596, "learning_rate": 9.222976769205013e-06, "loss": 0.39029178619384763, "memory(GiB)": 28.87, "step": 430, "token_acc": 0.87612563145179, "train_speed(iter/s)": 0.121195 }, { "epoch": 0.5456683653469228, "grad_norm": 0.8063600659370422, "learning_rate": 9.205298719983458e-06, "loss": 0.40296125411987305, "memory(GiB)": 31.15, "step": 435, "token_acc": 0.8771728947642344, "train_speed(iter/s)": 0.12146 }, { "epoch": 0.5519404155233242, "grad_norm": 0.8496724367141724, "learning_rate": 9.187439171070563e-06, "loss": 0.4008660316467285, "memory(GiB)": 31.15, "step": 440, "token_acc": 0.8774796485144151, "train_speed(iter/s)": 0.121775 }, { "epoch": 0.5519404155233242, "eval_loss": 0.3754238188266754, "eval_runtime": 29.6083, "eval_samples_per_second": 17.394, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8736524436247799, "step": 440 }, { "epoch": 0.5582124656997256, "grad_norm": 0.8935557007789612, "learning_rate": 9.169398893280208e-06, "loss": 0.38558981418609617, "memory(GiB)": 31.15, "step": 445, "token_acc": 0.8804487471295762, "train_speed(iter/s)": 0.12076 }, { "epoch": 0.564484515876127, "grad_norm": 0.7438162565231323, "learning_rate": 9.151178665226486e-06, "loss": 0.39479656219482423, "memory(GiB)": 31.15, "step": 450, "token_acc": 0.8771318584933242, "train_speed(iter/s)": 0.121072 }, { "epoch": 0.5707565660525284, "grad_norm": 0.7340822815895081, "learning_rate": 9.132779273290103e-06, "loss": 0.39113516807556153, "memory(GiB)": 31.15, "step": 455, "token_acc": 0.8820633384040935, "train_speed(iter/s)": 0.12129 }, { "epoch": 0.5770286162289299, "grad_norm": 0.8130801320075989, "learning_rate": 9.114201511584428e-06, "loss": 0.40251779556274414, "memory(GiB)": 31.15, "step": 460, "token_acc": 0.8760628910636933, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.5770286162289299, "eval_loss": 0.3735375702381134, "eval_runtime": 29.5965, "eval_samples_per_second": 17.401, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8740087182496438, "step": 460 }, { "epoch": 0.5833006664053313, "grad_norm": 0.9230681657791138, "learning_rate": 9.095446181921237e-06, "loss": 0.4032279014587402, "memory(GiB)": 31.15, "step": 465, "token_acc": 0.8776148954041838, "train_speed(iter/s)": 0.120694 }, { "epoch": 0.5895727165817326, "grad_norm": 0.7464693188667297, "learning_rate": 9.07651409377609e-06, "loss": 0.38982985019683836, "memory(GiB)": 31.15, "step": 470, "token_acc": 0.871431801480979, "train_speed(iter/s)": 0.12105 }, { "epoch": 0.595844766758134, "grad_norm": 0.8642957210540771, "learning_rate": 9.057406064253404e-06, "loss": 0.4086627006530762, "memory(GiB)": 31.15, "step": 475, "token_acc": 0.8692161419818297, "train_speed(iter/s)": 0.121393 }, { "epoch": 0.6021168169345354, "grad_norm": 0.7960444092750549, "learning_rate": 9.038122918051184e-06, "loss": 0.3928786516189575, "memory(GiB)": 31.15, "step": 480, "token_acc": 0.8757158196134575, "train_speed(iter/s)": 0.121671 }, { "epoch": 0.6021168169345354, "eval_loss": 0.3725303113460541, "eval_runtime": 29.61, "eval_samples_per_second": 17.393, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8741512280995892, "step": 480 }, { "epoch": 0.6083888671109369, "grad_norm": 0.8392713665962219, "learning_rate": 9.018665487425426e-06, "loss": 0.37983551025390627, "memory(GiB)": 31.15, "step": 485, "token_acc": 0.8818607401567042, "train_speed(iter/s)": 0.120742 }, { "epoch": 0.6146609172873383, "grad_norm": 0.8359962701797485, "learning_rate": 8.999034612154204e-06, "loss": 0.3970278263092041, "memory(GiB)": 31.15, "step": 490, "token_acc": 0.8752512282268871, "train_speed(iter/s)": 0.121067 }, { "epoch": 0.6209329674637397, "grad_norm": 0.8340564966201782, "learning_rate": 8.979231139501417e-06, "loss": 0.3811976909637451, "memory(GiB)": 31.15, "step": 495, "token_acc": 0.8809906820044313, "train_speed(iter/s)": 0.121367 }, { "epoch": 0.6272050176401411, "grad_norm": 0.9611604809761047, "learning_rate": 8.95925592418023e-06, "loss": 0.3964669227600098, "memory(GiB)": 31.15, "step": 500, "token_acc": 0.8726217824114979, "train_speed(iter/s)": 0.121645 }, { "epoch": 0.6272050176401411, "eval_loss": 0.371460884809494, "eval_runtime": 29.5898, "eval_samples_per_second": 17.405, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8745955235141253, "step": 500 }, { "epoch": 0.6334770678165426, "grad_norm": 0.8176589608192444, "learning_rate": 8.939109828316184e-06, "loss": 0.38572733402252196, "memory(GiB)": 31.15, "step": 505, "token_acc": 0.8809286455710512, "train_speed(iter/s)": 0.120811 }, { "epoch": 0.639749117992944, "grad_norm": 0.7897095084190369, "learning_rate": 8.918793721409973e-06, "loss": 0.3885223150253296, "memory(GiB)": 31.15, "step": 510, "token_acc": 0.8733564201071371, "train_speed(iter/s)": 0.121065 }, { "epoch": 0.6460211681693454, "grad_norm": 0.8051208257675171, "learning_rate": 8.898308480299937e-06, "loss": 0.3946079254150391, "memory(GiB)": 31.15, "step": 515, "token_acc": 0.8788746774056323, "train_speed(iter/s)": 0.121332 }, { "epoch": 0.6522932183457467, "grad_norm": 0.7898780703544617, "learning_rate": 8.877654989124202e-06, "loss": 0.38194358348846436, "memory(GiB)": 31.15, "step": 520, "token_acc": 0.889278570841439, "train_speed(iter/s)": 0.121638 }, { "epoch": 0.6522932183457467, "eval_loss": 0.3701505661010742, "eval_runtime": 29.5916, "eval_samples_per_second": 17.404, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8744613965965294, "step": 520 }, { "epoch": 0.6585652685221481, "grad_norm": 0.7575268745422363, "learning_rate": 8.856834139282531e-06, "loss": 0.3784614086151123, "memory(GiB)": 31.15, "step": 525, "token_acc": 0.8807604793446934, "train_speed(iter/s)": 0.120829 }, { "epoch": 0.6648373186985496, "grad_norm": 0.8083406686782837, "learning_rate": 8.835846829397843e-06, "loss": 0.38791258335113527, "memory(GiB)": 31.15, "step": 530, "token_acc": 0.8846142795575348, "train_speed(iter/s)": 0.121137 }, { "epoch": 0.671109368874951, "grad_norm": 0.8856268525123596, "learning_rate": 8.814693965277435e-06, "loss": 0.38564338684082033, "memory(GiB)": 31.15, "step": 535, "token_acc": 0.879522377393324, "train_speed(iter/s)": 0.121379 }, { "epoch": 0.6773814190513524, "grad_norm": 0.7782846093177795, "learning_rate": 8.793376459873888e-06, "loss": 0.39195048809051514, "memory(GiB)": 31.15, "step": 540, "token_acc": 0.8783795212990894, "train_speed(iter/s)": 0.121643 }, { "epoch": 0.6773814190513524, "eval_loss": 0.36819902062416077, "eval_runtime": 29.5691, "eval_samples_per_second": 17.417, "eval_steps_per_second": 4.363, "eval_token_acc": 0.8753080727638528, "step": 540 }, { "epoch": 0.6836534692277538, "grad_norm": 0.8303295373916626, "learning_rate": 8.771895233245655e-06, "loss": 0.3857764720916748, "memory(GiB)": 31.15, "step": 545, "token_acc": 0.8770407440630754, "train_speed(iter/s)": 0.120864 }, { "epoch": 0.6899255194041553, "grad_norm": 0.8560570478439331, "learning_rate": 8.750251212517364e-06, "loss": 0.3794244289398193, "memory(GiB)": 31.15, "step": 550, "token_acc": 0.8898940454798935, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.6961975695805567, "grad_norm": 0.7564458250999451, "learning_rate": 8.728445331839796e-06, "loss": 0.3893013000488281, "memory(GiB)": 31.15, "step": 555, "token_acc": 0.8829551217038539, "train_speed(iter/s)": 0.121376 }, { "epoch": 0.7024696197569581, "grad_norm": 0.7831209897994995, "learning_rate": 8.706478532349567e-06, "loss": 0.38511061668395996, "memory(GiB)": 31.15, "step": 560, "token_acc": 0.8776614357621598, "train_speed(iter/s)": 0.121619 }, { "epoch": 0.7024696197569581, "eval_loss": 0.36745068430900574, "eval_runtime": 29.5839, "eval_samples_per_second": 17.408, "eval_steps_per_second": 4.36, "eval_token_acc": 0.875936792690083, "step": 560 }, { "epoch": 0.7087416699333595, "grad_norm": 0.9281031489372253, "learning_rate": 8.684351762128511e-06, "loss": 0.3907186508178711, "memory(GiB)": 31.15, "step": 565, "token_acc": 0.8817858594067453, "train_speed(iter/s)": 0.120888 }, { "epoch": 0.7150137201097608, "grad_norm": 0.8066033720970154, "learning_rate": 8.662065976162765e-06, "loss": 0.3858931541442871, "memory(GiB)": 31.15, "step": 570, "token_acc": 0.8792498385693029, "train_speed(iter/s)": 0.121186 }, { "epoch": 0.7212857702861623, "grad_norm": 0.8844251036643982, "learning_rate": 8.639622136301541e-06, "loss": 0.3788000583648682, "memory(GiB)": 33.59, "step": 575, "token_acc": 0.8837105552746631, "train_speed(iter/s)": 0.121421 }, { "epoch": 0.7275578204625637, "grad_norm": 0.8435778021812439, "learning_rate": 8.617021211215629e-06, "loss": 0.37533106803894045, "memory(GiB)": 33.59, "step": 580, "token_acc": 0.8742189278757369, "train_speed(iter/s)": 0.12169 }, { "epoch": 0.7275578204625637, "eval_loss": 0.36586904525756836, "eval_runtime": 29.5932, "eval_samples_per_second": 17.403, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8756224327269679, "step": 580 }, { "epoch": 0.7338298706389651, "grad_norm": 0.7818266153335571, "learning_rate": 8.594264176355565e-06, "loss": 0.37725415229797366, "memory(GiB)": 33.59, "step": 585, "token_acc": 0.8777972097460154, "train_speed(iter/s)": 0.120955 }, { "epoch": 0.7401019208153665, "grad_norm": 0.8796353936195374, "learning_rate": 8.571352013909558e-06, "loss": 0.400989294052124, "memory(GiB)": 33.59, "step": 590, "token_acc": 0.8727272727272727, "train_speed(iter/s)": 0.121228 }, { "epoch": 0.7463739709917679, "grad_norm": 0.7808417677879333, "learning_rate": 8.548285712761084e-06, "loss": 0.3853422164916992, "memory(GiB)": 33.59, "step": 595, "token_acc": 0.8805738658394726, "train_speed(iter/s)": 0.121437 }, { "epoch": 0.7526460211681694, "grad_norm": 0.833470344543457, "learning_rate": 8.525066268446208e-06, "loss": 0.37978854179382326, "memory(GiB)": 33.59, "step": 600, "token_acc": 0.8784342932803707, "train_speed(iter/s)": 0.121678 }, { "epoch": 0.7526460211681694, "eval_loss": 0.3638017177581787, "eval_runtime": 29.6016, "eval_samples_per_second": 17.398, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8765571296839635, "step": 600 }, { "epoch": 0.7589180713445708, "grad_norm": 0.80388343334198, "learning_rate": 8.501694683110615e-06, "loss": 0.39281136989593507, "memory(GiB)": 33.59, "step": 605, "token_acc": 0.8803214107664786, "train_speed(iter/s)": 0.120954 }, { "epoch": 0.7651901215209722, "grad_norm": 0.8746615052223206, "learning_rate": 8.478171965466366e-06, "loss": 0.38366003036499025, "memory(GiB)": 33.59, "step": 610, "token_acc": 0.8762894230294654, "train_speed(iter/s)": 0.121177 }, { "epoch": 0.7714621716973736, "grad_norm": 0.9113203287124634, "learning_rate": 8.454499130748352e-06, "loss": 0.3745842933654785, "memory(GiB)": 33.59, "step": 615, "token_acc": 0.8812096914763814, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.777734221873775, "grad_norm": 0.8363653421401978, "learning_rate": 8.43067720067048e-06, "loss": 0.37692205905914306, "memory(GiB)": 33.59, "step": 620, "token_acc": 0.8883228276199522, "train_speed(iter/s)": 0.121722 }, { "epoch": 0.777734221873775, "eval_loss": 0.36300593614578247, "eval_runtime": 29.8369, "eval_samples_per_second": 17.261, "eval_steps_per_second": 4.324, "eval_token_acc": 0.8767834688574063, "step": 620 }, { "epoch": 0.7840062720501764, "grad_norm": 0.8239129781723022, "learning_rate": 8.40670720338158e-06, "loss": 0.3925849437713623, "memory(GiB)": 33.59, "step": 625, "token_acc": 0.8787143407663546, "train_speed(iter/s)": 0.120995 }, { "epoch": 0.7902783222265778, "grad_norm": 0.8488283157348633, "learning_rate": 8.382590173421029e-06, "loss": 0.3935189485549927, "memory(GiB)": 33.59, "step": 630, "token_acc": 0.8936352849099952, "train_speed(iter/s)": 0.121252 }, { "epoch": 0.7965503724029792, "grad_norm": 0.8239266276359558, "learning_rate": 8.358327151674095e-06, "loss": 0.39822547435760497, "memory(GiB)": 33.59, "step": 635, "token_acc": 0.869016123396396, "train_speed(iter/s)": 0.121499 }, { "epoch": 0.8028224225793806, "grad_norm": 0.8977949619293213, "learning_rate": 8.33391918532702e-06, "loss": 0.3884063720703125, "memory(GiB)": 33.59, "step": 640, "token_acc": 0.8792178010735076, "train_speed(iter/s)": 0.121738 }, { "epoch": 0.8028224225793806, "eval_loss": 0.36090487241744995, "eval_runtime": 29.6252, "eval_samples_per_second": 17.384, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8771942325425434, "step": 640 }, { "epoch": 0.8090944727557821, "grad_norm": 0.7422317266464233, "learning_rate": 8.309367327821819e-06, "loss": 0.36748080253601073, "memory(GiB)": 33.59, "step": 645, "token_acc": 0.882223291626564, "train_speed(iter/s)": 0.12104 }, { "epoch": 0.8153665229321835, "grad_norm": 0.8495545387268066, "learning_rate": 8.284672638810813e-06, "loss": 0.37848606109619143, "memory(GiB)": 33.59, "step": 650, "token_acc": 0.8850858133346354, "train_speed(iter/s)": 0.121243 }, { "epoch": 0.8216385731085849, "grad_norm": 0.7722618579864502, "learning_rate": 8.259836184110904e-06, "loss": 0.36829509735107424, "memory(GiB)": 33.59, "step": 655, "token_acc": 0.875583076461442, "train_speed(iter/s)": 0.121449 }, { "epoch": 0.8279106232849863, "grad_norm": 0.7796041965484619, "learning_rate": 8.234859035657557e-06, "loss": 0.40014114379882815, "memory(GiB)": 33.59, "step": 660, "token_acc": 0.8759235981236778, "train_speed(iter/s)": 0.121697 }, { "epoch": 0.8279106232849863, "eval_loss": 0.3596991300582886, "eval_runtime": 29.7406, "eval_samples_per_second": 17.316, "eval_steps_per_second": 4.338, "eval_token_acc": 0.8776930170173527, "step": 660 }, { "epoch": 0.8341826734613876, "grad_norm": 0.7468088269233704, "learning_rate": 8.209742271458556e-06, "loss": 0.38162546157836913, "memory(GiB)": 33.59, "step": 665, "token_acc": 0.8829189729162207, "train_speed(iter/s)": 0.12099 }, { "epoch": 0.8404547236377891, "grad_norm": 0.7364327311515808, "learning_rate": 8.18448697554746e-06, "loss": 0.36592922210693357, "memory(GiB)": 33.59, "step": 670, "token_acc": 0.8770340880816123, "train_speed(iter/s)": 0.121214 }, { "epoch": 0.8467267738141905, "grad_norm": 0.7761643528938293, "learning_rate": 8.159094237936828e-06, "loss": 0.38688228130340574, "memory(GiB)": 33.59, "step": 675, "token_acc": 0.8751209347324551, "train_speed(iter/s)": 0.12144 }, { "epoch": 0.8529988239905919, "grad_norm": 0.8629448413848877, "learning_rate": 8.133565154571169e-06, "loss": 0.3881547451019287, "memory(GiB)": 33.59, "step": 680, "token_acc": 0.8676196768574689, "train_speed(iter/s)": 0.121629 }, { "epoch": 0.8529988239905919, "eval_loss": 0.3583786189556122, "eval_runtime": 29.6285, "eval_samples_per_second": 17.382, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8773115935954396, "step": 680 }, { "epoch": 0.8592708741669933, "grad_norm": 0.7593112587928772, "learning_rate": 8.107900827279638e-06, "loss": 0.37020533084869384, "memory(GiB)": 33.59, "step": 685, "token_acc": 0.884691054718319, "train_speed(iter/s)": 0.120936 }, { "epoch": 0.8655429243433947, "grad_norm": 0.797288179397583, "learning_rate": 8.082102363728494e-06, "loss": 0.3854295492172241, "memory(GiB)": 33.59, "step": 690, "token_acc": 0.8761994516792323, "train_speed(iter/s)": 0.121156 }, { "epoch": 0.8718149745197962, "grad_norm": 0.7850540280342102, "learning_rate": 8.056170877373277e-06, "loss": 0.40497736930847167, "memory(GiB)": 33.59, "step": 695, "token_acc": 0.8737006516938655, "train_speed(iter/s)": 0.121359 }, { "epoch": 0.8780870246961976, "grad_norm": 0.7562994956970215, "learning_rate": 8.030107487410766e-06, "loss": 0.37325115203857423, "memory(GiB)": 33.59, "step": 700, "token_acc": 0.8829526314234213, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.8780870246961976, "eval_loss": 0.3569120168685913, "eval_runtime": 29.6346, "eval_samples_per_second": 17.378, "eval_steps_per_second": 4.353, "eval_token_acc": 0.878032525777517, "step": 700 }, { "epoch": 0.884359074872599, "grad_norm": 0.8399495482444763, "learning_rate": 8.003913318730662e-06, "loss": 0.3845979690551758, "memory(GiB)": 33.59, "step": 705, "token_acc": 0.8839846493683643, "train_speed(iter/s)": 0.12098 }, { "epoch": 0.8906311250490004, "grad_norm": 0.8614782691001892, "learning_rate": 7.97758950186705e-06, "loss": 0.3775152683258057, "memory(GiB)": 33.59, "step": 710, "token_acc": 0.8816499315970161, "train_speed(iter/s)": 0.121192 }, { "epoch": 0.8969031752254017, "grad_norm": 0.8226941823959351, "learning_rate": 7.951137172949595e-06, "loss": 0.37194027900695803, "memory(GiB)": 33.59, "step": 715, "token_acc": 0.8872406234047026, "train_speed(iter/s)": 0.121408 }, { "epoch": 0.9031752254018032, "grad_norm": 0.7586838006973267, "learning_rate": 7.924557473654516e-06, "loss": 0.37108325958251953, "memory(GiB)": 33.59, "step": 720, "token_acc": 0.87804646976623, "train_speed(iter/s)": 0.121604 }, { "epoch": 0.9031752254018032, "eval_loss": 0.3558177351951599, "eval_runtime": 29.6063, "eval_samples_per_second": 17.395, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8782337161539107, "step": 720 }, { "epoch": 0.9094472755782046, "grad_norm": 0.7763127088546753, "learning_rate": 7.897851551155306e-06, "loss": 0.378222918510437, "memory(GiB)": 33.59, "step": 725, "token_acc": 0.8826250789141414, "train_speed(iter/s)": 0.121006 }, { "epoch": 0.915719325754606, "grad_norm": 0.8064581751823425, "learning_rate": 7.871020558073217e-06, "loss": 0.3942488431930542, "memory(GiB)": 33.6, "step": 730, "token_acc": 0.8696306499336802, "train_speed(iter/s)": 0.121223 }, { "epoch": 0.9219913759310074, "grad_norm": 0.8451412320137024, "learning_rate": 7.844065652427523e-06, "loss": 0.37436461448669434, "memory(GiB)": 33.6, "step": 735, "token_acc": 0.8797443405788941, "train_speed(iter/s)": 0.121448 }, { "epoch": 0.9282634261074089, "grad_norm": 0.7629019021987915, "learning_rate": 7.816987997585535e-06, "loss": 0.3741127967834473, "memory(GiB)": 33.6, "step": 740, "token_acc": 0.8834049015500628, "train_speed(iter/s)": 0.121613 }, { "epoch": 0.9282634261074089, "eval_loss": 0.35523363947868347, "eval_runtime": 29.6401, "eval_samples_per_second": 17.375, "eval_steps_per_second": 4.352, "eval_token_acc": 0.8790091373962612, "step": 740 }, { "epoch": 0.9345354762838103, "grad_norm": 0.8186476230621338, "learning_rate": 7.789788762212384e-06, "loss": 0.3544290542602539, "memory(GiB)": 33.6, "step": 745, "token_acc": 0.8867604833554854, "train_speed(iter/s)": 0.121009 }, { "epoch": 0.9408075264602117, "grad_norm": 0.7892398238182068, "learning_rate": 7.762469120220595e-06, "loss": 0.3707085609436035, "memory(GiB)": 33.6, "step": 750, "token_acc": 0.8836383423547269, "train_speed(iter/s)": 0.121228 }, { "epoch": 0.9470795766366131, "grad_norm": 0.7983216047286987, "learning_rate": 7.73503025071941e-06, "loss": 0.38054685592651366, "memory(GiB)": 33.6, "step": 755, "token_acc": 0.8786164633787048, "train_speed(iter/s)": 0.121423 }, { "epoch": 0.9533516268130146, "grad_norm": 0.7637438178062439, "learning_rate": 7.7074733379639e-06, "loss": 0.3841462373733521, "memory(GiB)": 33.6, "step": 760, "token_acc": 0.8768013924795863, "train_speed(iter/s)": 0.12163 }, { "epoch": 0.9533516268130146, "eval_loss": 0.35423436760902405, "eval_runtime": 29.6267, "eval_samples_per_second": 17.383, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8791390728476821, "step": 760 }, { "epoch": 0.9596236769894159, "grad_norm": 0.7450261116027832, "learning_rate": 7.679799571303861e-06, "loss": 0.3811268091201782, "memory(GiB)": 33.6, "step": 765, "token_acc": 0.8854130493498475, "train_speed(iter/s)": 0.121106 }, { "epoch": 0.9658957271658173, "grad_norm": 0.8632524013519287, "learning_rate": 7.65201014513247e-06, "loss": 0.38339235782623293, "memory(GiB)": 33.6, "step": 770, "token_acc": 0.876136081450211, "train_speed(iter/s)": 0.121281 }, { "epoch": 0.9721677773422187, "grad_norm": 0.7771950364112854, "learning_rate": 7.62410625883474e-06, "loss": 0.37677223682403566, "memory(GiB)": 33.6, "step": 775, "token_acc": 0.8823102678571428, "train_speed(iter/s)": 0.121452 }, { "epoch": 0.9784398275186201, "grad_norm": 0.8017596006393433, "learning_rate": 7.596089116735765e-06, "loss": 0.37508654594421387, "memory(GiB)": 33.6, "step": 780, "token_acc": 0.8823698425468337, "train_speed(iter/s)": 0.121653 }, { "epoch": 0.9784398275186201, "eval_loss": 0.3527185022830963, "eval_runtime": 29.6007, "eval_samples_per_second": 17.398, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8791348813815073, "step": 780 }, { "epoch": 0.9847118776950216, "grad_norm": 0.8020228147506714, "learning_rate": 7.567959928048723e-06, "loss": 0.38076558113098147, "memory(GiB)": 33.6, "step": 785, "token_acc": 0.8822499299523676, "train_speed(iter/s)": 0.121074 }, { "epoch": 0.990983927871423, "grad_norm": 0.8349906802177429, "learning_rate": 7.5397199068227e-06, "loss": 0.3815056324005127, "memory(GiB)": 33.6, "step": 790, "token_acc": 0.8813588951692792, "train_speed(iter/s)": 0.121236 }, { "epoch": 0.9972559780478244, "grad_norm": 0.7655360102653503, "learning_rate": 7.511370271890286e-06, "loss": 0.37683281898498533, "memory(GiB)": 33.6, "step": 795, "token_acc": 0.8849736151561464, "train_speed(iter/s)": 0.121426 }, { "epoch": 1.0025088200705605, "grad_norm": 0.781859815120697, "learning_rate": 7.482912246814975e-06, "loss": 0.33410110473632815, "memory(GiB)": 33.6, "step": 800, "token_acc": 0.8970368853657823, "train_speed(iter/s)": 0.121715 }, { "epoch": 1.0025088200705605, "eval_loss": 0.35208848118782043, "eval_runtime": 29.6155, "eval_samples_per_second": 17.39, "eval_steps_per_second": 4.356, "eval_token_acc": 0.8798600050297594, "step": 800 }, { "epoch": 1.008780870246962, "grad_norm": 0.7701341509819031, "learning_rate": 7.454347059838351e-06, "loss": 0.3262555360794067, "memory(GiB)": 33.6, "step": 805, "token_acc": 0.8892221089920901, "train_speed(iter/s)": 0.121208 }, { "epoch": 1.0150529204233634, "grad_norm": 0.8458274602890015, "learning_rate": 7.425675943827084e-06, "loss": 0.3318117618560791, "memory(GiB)": 33.6, "step": 810, "token_acc": 0.8916776012730674, "train_speed(iter/s)": 0.12141 }, { "epoch": 1.021324970599765, "grad_norm": 0.9188606142997742, "learning_rate": 7.3969001362197135e-06, "loss": 0.31556293964385984, "memory(GiB)": 33.6, "step": 815, "token_acc": 0.8948849424712356, "train_speed(iter/s)": 0.121599 }, { "epoch": 1.0275970207761662, "grad_norm": 0.8285843729972839, "learning_rate": 7.3680208789732385e-06, "loss": 0.3075234413146973, "memory(GiB)": 33.6, "step": 820, "token_acc": 0.8996916527955904, "train_speed(iter/s)": 0.12177 }, { "epoch": 1.0275970207761662, "eval_loss": 0.3578624427318573, "eval_runtime": 29.734, "eval_samples_per_second": 17.32, "eval_steps_per_second": 4.338, "eval_token_acc": 0.8788289043507419, "step": 820 }, { "epoch": 1.0338690709525675, "grad_norm": 0.7456257939338684, "learning_rate": 7.339039418509532e-06, "loss": 0.3122047185897827, "memory(GiB)": 33.6, "step": 825, "token_acc": 0.8912523923331079, "train_speed(iter/s)": 0.121242 }, { "epoch": 1.040141121128969, "grad_norm": 0.8746250867843628, "learning_rate": 7.309957005661521e-06, "loss": 0.30740058422088623, "memory(GiB)": 33.6, "step": 830, "token_acc": 0.903616077429762, "train_speed(iter/s)": 0.121448 }, { "epoch": 1.0464131713053704, "grad_norm": 0.8537876009941101, "learning_rate": 7.280774895619219e-06, "loss": 0.3157168388366699, "memory(GiB)": 33.6, "step": 835, "token_acc": 0.8915479475195676, "train_speed(iter/s)": 0.121643 }, { "epoch": 1.052685221481772, "grad_norm": 0.8017281293869019, "learning_rate": 7.25149434787555e-06, "loss": 0.31319799423217776, "memory(GiB)": 33.6, "step": 840, "token_acc": 0.8969325908377688, "train_speed(iter/s)": 0.121836 }, { "epoch": 1.052685221481772, "eval_loss": 0.3565090596675873, "eval_runtime": 29.6075, "eval_samples_per_second": 17.394, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8790636264565345, "step": 840 }, { "epoch": 1.0589572716581732, "grad_norm": 0.7407189011573792, "learning_rate": 7.2221166261719755e-06, "loss": 0.3067447662353516, "memory(GiB)": 33.6, "step": 845, "token_acc": 0.8904468233651385, "train_speed(iter/s)": 0.1213 }, { "epoch": 1.0652293218345747, "grad_norm": 0.7648513913154602, "learning_rate": 7.192642998443975e-06, "loss": 0.31682767868041994, "memory(GiB)": 33.6, "step": 850, "token_acc": 0.9042325428194994, "train_speed(iter/s)": 0.12146 }, { "epoch": 1.071501372010976, "grad_norm": 0.8380671739578247, "learning_rate": 7.163074736766299e-06, "loss": 0.3035914421081543, "memory(GiB)": 33.6, "step": 855, "token_acc": 0.900502677303027, "train_speed(iter/s)": 0.121596 }, { "epoch": 1.0777734221873776, "grad_norm": 0.8171470761299133, "learning_rate": 7.133413117298081e-06, "loss": 0.30316686630249023, "memory(GiB)": 33.6, "step": 860, "token_acc": 0.9034346601631681, "train_speed(iter/s)": 0.121766 }, { "epoch": 1.0777734221873776, "eval_loss": 0.3562227189540863, "eval_runtime": 29.6044, "eval_samples_per_second": 17.396, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8794701986754967, "step": 860 }, { "epoch": 1.084045472363779, "grad_norm": 0.7644967436790466, "learning_rate": 7.103659420227755e-06, "loss": 0.32071871757507325, "memory(GiB)": 33.6, "step": 865, "token_acc": 0.8916257634980457, "train_speed(iter/s)": 0.121253 }, { "epoch": 1.0903175225401802, "grad_norm": 0.8376749753952026, "learning_rate": 7.0738149297178005e-06, "loss": 0.31877703666687013, "memory(GiB)": 33.6, "step": 870, "token_acc": 0.9103868211748601, "train_speed(iter/s)": 0.121402 }, { "epoch": 1.0965895727165818, "grad_norm": 0.7798628211021423, "learning_rate": 7.04388093384932e-06, "loss": 0.3112868547439575, "memory(GiB)": 33.6, "step": 875, "token_acc": 0.9032720088899864, "train_speed(iter/s)": 0.121542 }, { "epoch": 1.102861622892983, "grad_norm": 0.8039925694465637, "learning_rate": 7.013858724566449e-06, "loss": 0.32082467079162597, "memory(GiB)": 33.6, "step": 880, "token_acc": 0.9032403958710227, "train_speed(iter/s)": 0.121699 }, { "epoch": 1.102861622892983, "eval_loss": 0.3564984202384949, "eval_runtime": 29.624, "eval_samples_per_second": 17.385, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8794660072093218, "step": 880 }, { "epoch": 1.1091336730693846, "grad_norm": 0.8847902417182922, "learning_rate": 6.983749597620588e-06, "loss": 0.3243894577026367, "memory(GiB)": 33.6, "step": 885, "token_acc": 0.8885523807680771, "train_speed(iter/s)": 0.121222 }, { "epoch": 1.115405723245786, "grad_norm": 0.8933838605880737, "learning_rate": 6.9535548525144894e-06, "loss": 0.3157766580581665, "memory(GiB)": 33.6, "step": 890, "token_acc": 0.8964195078892199, "train_speed(iter/s)": 0.121388 }, { "epoch": 1.1216777734221874, "grad_norm": 0.7823595404624939, "learning_rate": 6.923275792446159e-06, "loss": 0.310500955581665, "memory(GiB)": 33.6, "step": 895, "token_acc": 0.897285512497697, "train_speed(iter/s)": 0.121527 }, { "epoch": 1.1279498235985888, "grad_norm": 0.7890828251838684, "learning_rate": 6.8929137242526216e-06, "loss": 0.31655497550964357, "memory(GiB)": 33.6, "step": 900, "token_acc": 0.9005792271681345, "train_speed(iter/s)": 0.121653 }, { "epoch": 1.1279498235985888, "eval_loss": 0.35559797286987305, "eval_runtime": 29.5743, "eval_samples_per_second": 17.414, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8799396428870819, "step": 900 }, { "epoch": 1.1342218737749903, "grad_norm": 0.7184568047523499, "learning_rate": 6.862469958353506e-06, "loss": 0.31396899223327634, "memory(GiB)": 33.6, "step": 905, "token_acc": 0.8933355593966059, "train_speed(iter/s)": 0.121163 }, { "epoch": 1.1404939239513916, "grad_norm": 0.9552344679832458, "learning_rate": 6.8319458086945026e-06, "loss": 0.33000473976135253, "memory(GiB)": 33.6, "step": 910, "token_acc": 0.9004522625621548, "train_speed(iter/s)": 0.121332 }, { "epoch": 1.146765974127793, "grad_norm": 0.8863224387168884, "learning_rate": 6.801342592690641e-06, "loss": 0.318299388885498, "memory(GiB)": 33.6, "step": 915, "token_acc": 0.900812626514507, "train_speed(iter/s)": 0.121495 }, { "epoch": 1.1530380243041944, "grad_norm": 0.7559683918952942, "learning_rate": 6.770661631169434e-06, "loss": 0.3138300895690918, "memory(GiB)": 33.6, "step": 920, "token_acc": 0.8954110250189142, "train_speed(iter/s)": 0.121644 }, { "epoch": 1.1530380243041944, "eval_loss": 0.354932963848114, "eval_runtime": 29.566, "eval_samples_per_second": 17.419, "eval_steps_per_second": 4.363, "eval_token_acc": 0.8799270684885573, "step": 920 }, { "epoch": 1.1593100744805958, "grad_norm": 0.7694470286369324, "learning_rate": 6.739904248313879e-06, "loss": 0.31344189643859866, "memory(GiB)": 33.6, "step": 925, "token_acc": 0.8902944897178665, "train_speed(iter/s)": 0.121164 }, { "epoch": 1.1655821246569973, "grad_norm": 0.863102912902832, "learning_rate": 6.709071771605292e-06, "loss": 0.3148585557937622, "memory(GiB)": 33.6, "step": 930, "token_acc": 0.89984285587215, "train_speed(iter/s)": 0.1213 }, { "epoch": 1.1718541748333986, "grad_norm": 0.851425290107727, "learning_rate": 6.678165531766029e-06, "loss": 0.31011836528778075, "memory(GiB)": 33.6, "step": 935, "token_acc": 0.9080040472893812, "train_speed(iter/s)": 0.121449 }, { "epoch": 1.1781262250098001, "grad_norm": 0.837813138961792, "learning_rate": 6.647186862702038e-06, "loss": 0.30878582000732424, "memory(GiB)": 33.6, "step": 940, "token_acc": 0.9038740191923886, "train_speed(iter/s)": 0.121586 }, { "epoch": 1.1781262250098001, "eval_loss": 0.3556138277053833, "eval_runtime": 29.559, "eval_samples_per_second": 17.423, "eval_steps_per_second": 4.364, "eval_token_acc": 0.879683963450415, "step": 940 }, { "epoch": 1.1843982751862014, "grad_norm": 0.8195896148681641, "learning_rate": 6.616137101445301e-06, "loss": 0.314269495010376, "memory(GiB)": 33.6, "step": 945, "token_acc": 0.8931946066461408, "train_speed(iter/s)": 0.121118 }, { "epoch": 1.190670325362603, "grad_norm": 0.8756702542304993, "learning_rate": 6.58501758809612e-06, "loss": 0.33217945098876955, "memory(GiB)": 33.6, "step": 950, "token_acc": 0.8988422076495874, "train_speed(iter/s)": 0.121284 }, { "epoch": 1.1969423755390043, "grad_norm": 0.8291054964065552, "learning_rate": 6.55382966576528e-06, "loss": 0.31435232162475585, "memory(GiB)": 33.6, "step": 955, "token_acc": 0.9005618808221204, "train_speed(iter/s)": 0.121434 }, { "epoch": 1.2032144257154056, "grad_norm": 0.8235255479812622, "learning_rate": 6.522574680516081e-06, "loss": 0.3093531608581543, "memory(GiB)": 33.6, "step": 960, "token_acc": 0.9002794452494307, "train_speed(iter/s)": 0.121606 }, { "epoch": 1.2032144257154056, "eval_loss": 0.3561866879463196, "eval_runtime": 29.5597, "eval_samples_per_second": 17.422, "eval_steps_per_second": 4.364, "eval_token_acc": 0.8802288540531478, "step": 960 }, { "epoch": 1.2094864758918071, "grad_norm": 0.8153970837593079, "learning_rate": 6.491253981306245e-06, "loss": 0.325747013092041, "memory(GiB)": 33.6, "step": 965, "token_acc": 0.8883892481810832, "train_speed(iter/s)": 0.121177 }, { "epoch": 1.2157585260682087, "grad_norm": 0.8184377551078796, "learning_rate": 6.459868919929691e-06, "loss": 0.3134697675704956, "memory(GiB)": 33.6, "step": 970, "token_acc": 0.8953274158432424, "train_speed(iter/s)": 0.121304 }, { "epoch": 1.22203057624461, "grad_norm": 0.8619184494018555, "learning_rate": 6.428420850958194e-06, "loss": 0.3030562162399292, "memory(GiB)": 33.6, "step": 975, "token_acc": 0.9094885815374718, "train_speed(iter/s)": 0.121432 }, { "epoch": 1.2283026264210113, "grad_norm": 0.7978628873825073, "learning_rate": 6.3969111316829215e-06, "loss": 0.319288444519043, "memory(GiB)": 33.6, "step": 980, "token_acc": 0.8982630272952854, "train_speed(iter/s)": 0.121593 }, { "epoch": 1.2283026264210113, "eval_loss": 0.35557761788368225, "eval_runtime": 29.5767, "eval_samples_per_second": 17.412, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8800444295414536, "step": 980 }, { "epoch": 1.2345746765974128, "grad_norm": 0.7945271730422974, "learning_rate": 6.365341122055857e-06, "loss": 0.31643688678741455, "memory(GiB)": 33.6, "step": 985, "token_acc": 0.8898873699940827, "train_speed(iter/s)": 0.121182 }, { "epoch": 1.2408467267738141, "grad_norm": 0.7574790716171265, "learning_rate": 6.333712184631093e-06, "loss": 0.3020521879196167, "memory(GiB)": 33.6, "step": 990, "token_acc": 0.9032726908234351, "train_speed(iter/s)": 0.121316 }, { "epoch": 1.2471187769502157, "grad_norm": 0.8387673497200012, "learning_rate": 6.302025684506042e-06, "loss": 0.3192462682723999, "memory(GiB)": 33.6, "step": 995, "token_acc": 0.8955223880597015, "train_speed(iter/s)": 0.121466 }, { "epoch": 1.253390827126617, "grad_norm": 0.7831020951271057, "learning_rate": 6.2702829892625e-06, "loss": 0.31763949394226076, "memory(GiB)": 33.6, "step": 1000, "token_acc": 0.9040835976438605, "train_speed(iter/s)": 0.121627 }, { "epoch": 1.253390827126617, "eval_loss": 0.3534168303012848, "eval_runtime": 29.5641, "eval_samples_per_second": 17.42, "eval_steps_per_second": 4.363, "eval_token_acc": 0.880287534579596, "step": 1000 }, { "epoch": 1.2596628773030183, "grad_norm": 0.8562179803848267, "learning_rate": 6.238485468907637e-06, "loss": 0.317755913734436, "memory(GiB)": 33.6, "step": 1005, "token_acc": 0.893742948563819, "train_speed(iter/s)": 0.121201 }, { "epoch": 1.2659349274794198, "grad_norm": 0.7360610365867615, "learning_rate": 6.2066344958148596e-06, "loss": 0.3083625555038452, "memory(GiB)": 33.6, "step": 1010, "token_acc": 0.9031221682181398, "train_speed(iter/s)": 0.121357 }, { "epoch": 1.2722069776558214, "grad_norm": 0.7288538217544556, "learning_rate": 6.174731444664579e-06, "loss": 0.3151975154876709, "memory(GiB)": 33.6, "step": 1015, "token_acc": 0.8976653802994226, "train_speed(iter/s)": 0.121523 }, { "epoch": 1.2784790278322227, "grad_norm": 0.7573474645614624, "learning_rate": 6.14277769238489e-06, "loss": 0.30403614044189453, "memory(GiB)": 33.6, "step": 1020, "token_acc": 0.9034773205850549, "train_speed(iter/s)": 0.121659 }, { "epoch": 1.2784790278322227, "eval_loss": 0.35168829560279846, "eval_runtime": 29.6625, "eval_samples_per_second": 17.362, "eval_steps_per_second": 4.349, "eval_token_acc": 0.8805432140162629, "step": 1020 }, { "epoch": 1.284751078008624, "grad_norm": 0.7631447315216064, "learning_rate": 6.110774618092128e-06, "loss": 0.302550745010376, "memory(GiB)": 33.6, "step": 1025, "token_acc": 0.8947289080950706, "train_speed(iter/s)": 0.121234 }, { "epoch": 1.2910231281850255, "grad_norm": 0.7580868601799011, "learning_rate": 6.07872360303136e-06, "loss": 0.3163439273834229, "memory(GiB)": 33.6, "step": 1030, "token_acc": 0.8989989733059548, "train_speed(iter/s)": 0.121408 }, { "epoch": 1.2972951783614268, "grad_norm": 0.8098512291908264, "learning_rate": 6.046626030516766e-06, "loss": 0.31558966636657715, "memory(GiB)": 33.6, "step": 1035, "token_acc": 0.902613864848326, "train_speed(iter/s)": 0.121551 }, { "epoch": 1.3035672285378284, "grad_norm": 0.7808286547660828, "learning_rate": 6.0144832858719256e-06, "loss": 0.31145410537719725, "memory(GiB)": 33.6, "step": 1040, "token_acc": 0.9054388771350431, "train_speed(iter/s)": 0.121678 }, { "epoch": 1.3035672285378284, "eval_loss": 0.351917564868927, "eval_runtime": 29.5693, "eval_samples_per_second": 17.417, "eval_steps_per_second": 4.363, "eval_token_acc": 0.8806480006706345, "step": 1040 }, { "epoch": 1.3098392787142297, "grad_norm": 0.8395740389823914, "learning_rate": 5.982296756370052e-06, "loss": 0.30791757106781004, "memory(GiB)": 33.6, "step": 1045, "token_acc": 0.8919198395135772, "train_speed(iter/s)": 0.121267 }, { "epoch": 1.3161113288906312, "grad_norm": 0.8778785467147827, "learning_rate": 5.950067831174086e-06, "loss": 0.3176340341567993, "memory(GiB)": 33.6, "step": 1050, "token_acc": 0.8968351513289292, "train_speed(iter/s)": 0.121396 }, { "epoch": 1.3223833790670325, "grad_norm": 0.8171632289886475, "learning_rate": 5.917797901276771e-06, "loss": 0.3169762134552002, "memory(GiB)": 33.6, "step": 1055, "token_acc": 0.8947547773205065, "train_speed(iter/s)": 0.121546 }, { "epoch": 1.328655429243434, "grad_norm": 0.852591335773468, "learning_rate": 5.885488359440592e-06, "loss": 0.30669825077056884, "memory(GiB)": 33.6, "step": 1060, "token_acc": 0.897007142047387, "train_speed(iter/s)": 0.121675 }, { "epoch": 1.328655429243434, "eval_loss": 0.35168930888175964, "eval_runtime": 29.6924, "eval_samples_per_second": 17.345, "eval_steps_per_second": 4.345, "eval_token_acc": 0.8810084667616732, "step": 1060 }, { "epoch": 1.3349274794198354, "grad_norm": 0.7796351909637451, "learning_rate": 5.853140600137684e-06, "loss": 0.3120392322540283, "memory(GiB)": 33.6, "step": 1065, "token_acc": 0.8927770755874566, "train_speed(iter/s)": 0.121274 }, { "epoch": 1.3411995295962367, "grad_norm": 0.7424667477607727, "learning_rate": 5.8207560194896325e-06, "loss": 0.3261461973190308, "memory(GiB)": 33.6, "step": 1070, "token_acc": 0.8903715475668567, "train_speed(iter/s)": 0.121404 }, { "epoch": 1.3474715797726382, "grad_norm": 0.7519833445549011, "learning_rate": 5.78833601520723e-06, "loss": 0.3177935123443604, "memory(GiB)": 33.6, "step": 1075, "token_acc": 0.9002484697897097, "train_speed(iter/s)": 0.121541 }, { "epoch": 1.3537436299490395, "grad_norm": 0.7985767722129822, "learning_rate": 5.755881986530137e-06, "loss": 0.3214226722717285, "memory(GiB)": 33.6, "step": 1080, "token_acc": 0.8970797820315775, "train_speed(iter/s)": 0.121665 }, { "epoch": 1.3537436299490395, "eval_loss": 0.3515044152736664, "eval_runtime": 29.5934, "eval_samples_per_second": 17.403, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8810922960851706, "step": 1080 }, { "epoch": 1.360015680125441, "grad_norm": 0.8925988674163818, "learning_rate": 5.723395334166506e-06, "loss": 0.3182457447052002, "memory(GiB)": 33.6, "step": 1085, "token_acc": 0.891329215282544, "train_speed(iter/s)": 0.121279 }, { "epoch": 1.3662877303018424, "grad_norm": 0.8478845953941345, "learning_rate": 5.6908774602325165e-06, "loss": 0.301633358001709, "memory(GiB)": 33.6, "step": 1090, "token_acc": 0.9010009298255209, "train_speed(iter/s)": 0.121401 }, { "epoch": 1.372559780478244, "grad_norm": 0.7945632934570312, "learning_rate": 5.6583297681918615e-06, "loss": 0.3118173122406006, "memory(GiB)": 33.6, "step": 1095, "token_acc": 0.8985079483127266, "train_speed(iter/s)": 0.121529 }, { "epoch": 1.3788318306546452, "grad_norm": 0.8674613833427429, "learning_rate": 5.625753662795183e-06, "loss": 0.31327056884765625, "memory(GiB)": 33.6, "step": 1100, "token_acc": 0.8926878892390144, "train_speed(iter/s)": 0.121662 }, { "epoch": 1.3788318306546452, "eval_loss": 0.3502074182033539, "eval_runtime": 29.7159, "eval_samples_per_second": 17.331, "eval_steps_per_second": 4.341, "eval_token_acc": 0.8811845083410177, "step": 1100 }, { "epoch": 1.3851038808310467, "grad_norm": 0.9332753419876099, "learning_rate": 5.59315055001943e-06, "loss": 0.3266937255859375, "memory(GiB)": 33.6, "step": 1105, "token_acc": 0.8881552483640253, "train_speed(iter/s)": 0.121281 }, { "epoch": 1.391375931007448, "grad_norm": 0.8074430823326111, "learning_rate": 5.5605218370071836e-06, "loss": 0.30109169483184817, "memory(GiB)": 33.6, "step": 1110, "token_acc": 0.9126673532440782, "train_speed(iter/s)": 0.121389 }, { "epoch": 1.3976479811838494, "grad_norm": 0.7786163687705994, "learning_rate": 5.5278689320059305e-06, "loss": 0.32388741970062257, "memory(GiB)": 33.6, "step": 1115, "token_acc": 0.8926039631593636, "train_speed(iter/s)": 0.12155 }, { "epoch": 1.403920031360251, "grad_norm": 0.8224254250526428, "learning_rate": 5.4951932443072764e-06, "loss": 0.3238008260726929, "memory(GiB)": 33.6, "step": 1120, "token_acc": 0.8939509836918276, "train_speed(iter/s)": 0.121686 }, { "epoch": 1.403920031360251, "eval_loss": 0.3496204912662506, "eval_runtime": 29.6525, "eval_samples_per_second": 17.368, "eval_steps_per_second": 4.35, "eval_token_acc": 0.8808827227764272, "step": 1120 }, { "epoch": 1.4101920815366524, "grad_norm": 0.8136359453201294, "learning_rate": 5.462496184186118e-06, "loss": 0.31591062545776366, "memory(GiB)": 33.6, "step": 1125, "token_acc": 0.8925674700533988, "train_speed(iter/s)": 0.121313 }, { "epoch": 1.4164641317130537, "grad_norm": 0.7958024740219116, "learning_rate": 5.429779162839787e-06, "loss": 0.32611215114593506, "memory(GiB)": 33.6, "step": 1130, "token_acc": 0.8966544669669669, "train_speed(iter/s)": 0.121454 }, { "epoch": 1.422736181889455, "grad_norm": 0.7814271450042725, "learning_rate": 5.397043592327129e-06, "loss": 0.31618432998657225, "memory(GiB)": 33.6, "step": 1135, "token_acc": 0.8981076808629347, "train_speed(iter/s)": 0.12158 }, { "epoch": 1.4290082320658566, "grad_norm": 0.8769139647483826, "learning_rate": 5.364290885507577e-06, "loss": 0.3024888277053833, "memory(GiB)": 33.6, "step": 1140, "token_acc": 0.9077086992829787, "train_speed(iter/s)": 0.121677 }, { "epoch": 1.4290082320658566, "eval_loss": 0.3490462005138397, "eval_runtime": 29.6376, "eval_samples_per_second": 17.377, "eval_steps_per_second": 4.353, "eval_token_acc": 0.8812515717998156, "step": 1140 }, { "epoch": 1.435280282242258, "grad_norm": 0.8784323930740356, "learning_rate": 5.3315224559801555e-06, "loss": 0.30409352779388427, "memory(GiB)": 33.6, "step": 1145, "token_acc": 0.8937964910867968, "train_speed(iter/s)": 0.12129 }, { "epoch": 1.4415523324186594, "grad_norm": 0.8287903070449829, "learning_rate": 5.2987397180224795e-06, "loss": 0.3141587972640991, "memory(GiB)": 33.6, "step": 1150, "token_acc": 0.9003217129898212, "train_speed(iter/s)": 0.121389 }, { "epoch": 1.4478243825950607, "grad_norm": 0.8509834408760071, "learning_rate": 5.265944086529714e-06, "loss": 0.30988848209381104, "memory(GiB)": 33.6, "step": 1155, "token_acc": 0.896488090168356, "train_speed(iter/s)": 0.12152 }, { "epoch": 1.454096432771462, "grad_norm": 0.8002197742462158, "learning_rate": 5.233136976953504e-06, "loss": 0.3149235725402832, "memory(GiB)": 33.6, "step": 1160, "token_acc": 0.9005892084869243, "train_speed(iter/s)": 0.121672 }, { "epoch": 1.454096432771462, "eval_loss": 0.3489372730255127, "eval_runtime": 29.6441, "eval_samples_per_second": 17.373, "eval_steps_per_second": 4.352, "eval_token_acc": 0.8816916757481768, "step": 1160 }, { "epoch": 1.4603684829478636, "grad_norm": 0.7909538149833679, "learning_rate": 5.200319805240884e-06, "loss": 0.3111138343811035, "memory(GiB)": 33.6, "step": 1165, "token_acc": 0.8955117952818873, "train_speed(iter/s)": 0.1213 }, { "epoch": 1.4666405331242651, "grad_norm": 0.8317378163337708, "learning_rate": 5.167493987773175e-06, "loss": 0.31470346450805664, "memory(GiB)": 33.6, "step": 1170, "token_acc": 0.8882736915724443, "train_speed(iter/s)": 0.121439 }, { "epoch": 1.4729125833006664, "grad_norm": 0.8626337647438049, "learning_rate": 5.134660941304838e-06, "loss": 0.3050379276275635, "memory(GiB)": 33.6, "step": 1175, "token_acc": 0.9051131601005867, "train_speed(iter/s)": 0.121566 }, { "epoch": 1.4791846334770677, "grad_norm": 0.8486148118972778, "learning_rate": 5.10182208290234e-06, "loss": 0.3151291608810425, "memory(GiB)": 33.6, "step": 1180, "token_acc": 0.9041969040589388, "train_speed(iter/s)": 0.121677 }, { "epoch": 1.4791846334770677, "eval_loss": 0.3478534519672394, "eval_runtime": 29.59, "eval_samples_per_second": 17.405, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8822281834185598, "step": 1180 }, { "epoch": 1.4854566836534693, "grad_norm": 0.7371572852134705, "learning_rate": 5.068978829882992e-06, "loss": 0.31096959114074707, "memory(GiB)": 33.6, "step": 1185, "token_acc": 0.8926875593542261, "train_speed(iter/s)": 0.121251 }, { "epoch": 1.4917287338298706, "grad_norm": 0.804434597492218, "learning_rate": 5.036132599753771e-06, "loss": 0.32340445518493655, "memory(GiB)": 33.6, "step": 1190, "token_acc": 0.8985925658607001, "train_speed(iter/s)": 0.121393 }, { "epoch": 1.4980007840062721, "grad_norm": 0.8279677033424377, "learning_rate": 5.003284810150152e-06, "loss": 0.305421781539917, "memory(GiB)": 33.6, "step": 1195, "token_acc": 0.8999231444883619, "train_speed(iter/s)": 0.121518 }, { "epoch": 1.5042728341826734, "grad_norm": 0.8119995594024658, "learning_rate": 4.970436878774907e-06, "loss": 0.32050256729125975, "memory(GiB)": 33.6, "step": 1200, "token_acc": 0.8954298771779492, "train_speed(iter/s)": 0.121643 }, { "epoch": 1.5042728341826734, "eval_loss": 0.34642741084098816, "eval_runtime": 29.5744, "eval_samples_per_second": 17.414, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8823287786067566, "step": 1200 }, { "epoch": 1.5105448843590747, "grad_norm": 0.8389795422554016, "learning_rate": 4.937590223336936e-06, "loss": 0.311386251449585, "memory(GiB)": 33.6, "step": 1205, "token_acc": 0.8925981666438637, "train_speed(iter/s)": 0.121306 }, { "epoch": 1.5168169345354763, "grad_norm": 0.8051816821098328, "learning_rate": 4.904746261490062e-06, "loss": 0.3099170684814453, "memory(GiB)": 33.6, "step": 1210, "token_acc": 0.8959899446472166, "train_speed(iter/s)": 0.121422 }, { "epoch": 1.5230889847118778, "grad_norm": 0.7875924110412598, "learning_rate": 4.87190641077186e-06, "loss": 0.30568342208862304, "memory(GiB)": 33.6, "step": 1215, "token_acc": 0.901078617984657, "train_speed(iter/s)": 0.12152 }, { "epoch": 1.5293610348882791, "grad_norm": 0.8027485609054565, "learning_rate": 4.8390720885424665e-06, "loss": 0.30870785713195803, "memory(GiB)": 33.6, "step": 1220, "token_acc": 0.9042371803028375, "train_speed(iter/s)": 0.12164 }, { "epoch": 1.5293610348882791, "eval_loss": 0.34620022773742676, "eval_runtime": 29.5969, "eval_samples_per_second": 17.4, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8822030346215106, "step": 1220 }, { "epoch": 1.5356330850646804, "grad_norm": 0.7887035608291626, "learning_rate": 4.806244711923408e-06, "loss": 0.31411142349243165, "memory(GiB)": 33.6, "step": 1225, "token_acc": 0.8938585311454517, "train_speed(iter/s)": 0.121287 }, { "epoch": 1.541905135241082, "grad_norm": 0.8030281066894531, "learning_rate": 4.773425697736445e-06, "loss": 0.3045094728469849, "memory(GiB)": 33.6, "step": 1230, "token_acc": 0.9042907034758024, "train_speed(iter/s)": 0.121403 }, { "epoch": 1.5481771854174835, "grad_norm": 0.829979658126831, "learning_rate": 4.7406164624424135e-06, "loss": 0.30724167823791504, "memory(GiB)": 33.6, "step": 1235, "token_acc": 0.8976526045138559, "train_speed(iter/s)": 0.121507 }, { "epoch": 1.5544492355938848, "grad_norm": 0.8427159190177917, "learning_rate": 4.707818422080094e-06, "loss": 0.3048734664916992, "memory(GiB)": 33.6, "step": 1240, "token_acc": 0.9048282007016762, "train_speed(iter/s)": 0.12164 }, { "epoch": 1.5544492355938848, "eval_loss": 0.34649136662483215, "eval_runtime": 29.5723, "eval_samples_per_second": 17.415, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8821317796965379, "step": 1240 }, { "epoch": 1.5607212857702861, "grad_norm": 0.8154223561286926, "learning_rate": 4.675032992205099e-06, "loss": 0.31010706424713136, "memory(GiB)": 33.6, "step": 1245, "token_acc": 0.8929249172868086, "train_speed(iter/s)": 0.121305 }, { "epoch": 1.5669933359466874, "grad_norm": 0.8454766273498535, "learning_rate": 4.642261587828778e-06, "loss": 0.3093379735946655, "memory(GiB)": 33.6, "step": 1250, "token_acc": 0.8965885145768318, "train_speed(iter/s)": 0.121428 }, { "epoch": 1.573265386123089, "grad_norm": 0.8195229768753052, "learning_rate": 4.609505623357135e-06, "loss": 0.29369077682495115, "memory(GiB)": 33.6, "step": 1255, "token_acc": 0.9066529450935287, "train_speed(iter/s)": 0.121553 }, { "epoch": 1.5795374362994905, "grad_norm": 0.8140600919723511, "learning_rate": 4.576766512529799e-06, "loss": 0.3222052574157715, "memory(GiB)": 33.6, "step": 1260, "token_acc": 0.8947929354445798, "train_speed(iter/s)": 0.121668 }, { "epoch": 1.5795374362994905, "eval_loss": 0.3448590338230133, "eval_runtime": 29.5768, "eval_samples_per_second": 17.412, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8820186101098164, "step": 1260 }, { "epoch": 1.5858094864758918, "grad_norm": 0.8057063221931458, "learning_rate": 4.544045668358999e-06, "loss": 0.3125570774078369, "memory(GiB)": 33.6, "step": 1265, "token_acc": 0.892243011525279, "train_speed(iter/s)": 0.12132 }, { "epoch": 1.5920815366522931, "grad_norm": 0.813529372215271, "learning_rate": 4.511344503068574e-06, "loss": 0.3144998550415039, "memory(GiB)": 33.6, "step": 1270, "token_acc": 0.8892766751032369, "train_speed(iter/s)": 0.121413 }, { "epoch": 1.5983535868286947, "grad_norm": 0.8066157698631287, "learning_rate": 4.478664428033031e-06, "loss": 0.30913031101226807, "memory(GiB)": 33.6, "step": 1275, "token_acc": 0.8998527245949927, "train_speed(iter/s)": 0.121518 }, { "epoch": 1.6046256370050962, "grad_norm": 0.7944806218147278, "learning_rate": 4.446006853716628e-06, "loss": 0.31250030994415284, "memory(GiB)": 33.6, "step": 1280, "token_acc": 0.9023781249048446, "train_speed(iter/s)": 0.121625 }, { "epoch": 1.6046256370050962, "eval_loss": 0.34460246562957764, "eval_runtime": 29.653, "eval_samples_per_second": 17.368, "eval_steps_per_second": 4.35, "eval_token_acc": 0.8823539274038058, "step": 1280 }, { "epoch": 1.6108976871814975, "grad_norm": 0.7362424731254578, "learning_rate": 4.413373189612497e-06, "loss": 0.3013019561767578, "memory(GiB)": 33.6, "step": 1285, "token_acc": 0.8930150309460654, "train_speed(iter/s)": 0.121286 }, { "epoch": 1.6171697373578988, "grad_norm": 0.7636338472366333, "learning_rate": 4.380764844181806e-06, "loss": 0.30982949733734133, "memory(GiB)": 33.6, "step": 1290, "token_acc": 0.8940976689137035, "train_speed(iter/s)": 0.12142 }, { "epoch": 1.6234417875343001, "grad_norm": 0.8665372729301453, "learning_rate": 4.34818322479298e-06, "loss": 0.3103508949279785, "memory(GiB)": 33.6, "step": 1295, "token_acc": 0.8969226252435476, "train_speed(iter/s)": 0.121525 }, { "epoch": 1.6297138377107017, "grad_norm": 0.8369340896606445, "learning_rate": 4.315629737660956e-06, "loss": 0.30051441192626954, "memory(GiB)": 33.6, "step": 1300, "token_acc": 0.9011551155115511, "train_speed(iter/s)": 0.121614 }, { "epoch": 1.6297138377107017, "eval_loss": 0.34451213479042053, "eval_runtime": 29.6214, "eval_samples_per_second": 17.386, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8825928409757733, "step": 1300 }, { "epoch": 1.6359858878871032, "grad_norm": 0.8098254799842834, "learning_rate": 4.283105787786482e-06, "loss": 0.2908606052398682, "memory(GiB)": 33.6, "step": 1305, "token_acc": 0.8952707220970546, "train_speed(iter/s)": 0.121269 }, { "epoch": 1.6422579380635045, "grad_norm": 0.7809374332427979, "learning_rate": 4.250612778895492e-06, "loss": 0.31494917869567873, "memory(GiB)": 33.6, "step": 1310, "token_acc": 0.8949442815249267, "train_speed(iter/s)": 0.121381 }, { "epoch": 1.6485299882399058, "grad_norm": 0.8695399165153503, "learning_rate": 4.218152113378513e-06, "loss": 0.3110328674316406, "memory(GiB)": 33.6, "step": 1315, "token_acc": 0.9081429854890964, "train_speed(iter/s)": 0.121498 }, { "epoch": 1.6548020384163074, "grad_norm": 0.7448384165763855, "learning_rate": 4.185725192230136e-06, "loss": 0.3125450849533081, "memory(GiB)": 33.6, "step": 1320, "token_acc": 0.8948348068869242, "train_speed(iter/s)": 0.121599 }, { "epoch": 1.6548020384163074, "eval_loss": 0.34412243962287903, "eval_runtime": 29.6115, "eval_samples_per_second": 17.392, "eval_steps_per_second": 4.356, "eval_token_acc": 0.8831041998491073, "step": 1320 }, { "epoch": 1.6610740885927089, "grad_norm": 0.8303554058074951, "learning_rate": 4.1533334149885594e-06, "loss": 0.30773005485534666, "memory(GiB)": 33.6, "step": 1325, "token_acc": 0.8920632614516095, "train_speed(iter/s)": 0.121277 }, { "epoch": 1.6673461387691102, "grad_norm": 0.8903198838233948, "learning_rate": 4.120978179675172e-06, "loss": 0.3075371265411377, "memory(GiB)": 33.6, "step": 1330, "token_acc": 0.8935736892803082, "train_speed(iter/s)": 0.121374 }, { "epoch": 1.6736181889455115, "grad_norm": 0.7909395694732666, "learning_rate": 4.088660882734228e-06, "loss": 0.3146337985992432, "memory(GiB)": 33.6, "step": 1335, "token_acc": 0.9018489263354982, "train_speed(iter/s)": 0.121483 }, { "epoch": 1.6798902391219128, "grad_norm": 0.789057195186615, "learning_rate": 4.056382918972565e-06, "loss": 0.30276224613189695, "memory(GiB)": 33.6, "step": 1340, "token_acc": 0.8988961986291943, "train_speed(iter/s)": 0.121581 }, { "epoch": 1.6798902391219128, "eval_loss": 0.34376105666160583, "eval_runtime": 29.7014, "eval_samples_per_second": 17.339, "eval_steps_per_second": 4.343, "eval_token_acc": 0.8829281582697628, "step": 1340 }, { "epoch": 1.6861622892983144, "grad_norm": 0.7871639132499695, "learning_rate": 4.024145681499416e-06, "loss": 0.2980226993560791, "memory(GiB)": 33.6, "step": 1345, "token_acc": 0.8948409478211474, "train_speed(iter/s)": 0.121283 }, { "epoch": 1.6924343394747159, "grad_norm": 0.7809557318687439, "learning_rate": 3.991950561666269e-06, "loss": 0.29270751476287843, "memory(GiB)": 33.6, "step": 1350, "token_acc": 0.9090764878513075, "train_speed(iter/s)": 0.121377 }, { "epoch": 1.6987063896511172, "grad_norm": 0.8027935028076172, "learning_rate": 3.959798949006831e-06, "loss": 0.29990406036376954, "memory(GiB)": 33.6, "step": 1355, "token_acc": 0.9105977304119384, "train_speed(iter/s)": 0.121483 }, { "epoch": 1.7049784398275185, "grad_norm": 0.7330254912376404, "learning_rate": 3.927692231177053e-06, "loss": 0.31405091285705566, "memory(GiB)": 33.6, "step": 1360, "token_acc": 0.9093888419489464, "train_speed(iter/s)": 0.121592 }, { "epoch": 1.7049784398275185, "eval_loss": 0.34284746646881104, "eval_runtime": 29.5951, "eval_samples_per_second": 17.402, "eval_steps_per_second": 4.359, "eval_token_acc": 0.883364070751949, "step": 1360 }, { "epoch": 1.71125049000392, "grad_norm": 0.7684708833694458, "learning_rate": 3.895631793895223e-06, "loss": 0.3107592582702637, "memory(GiB)": 33.6, "step": 1365, "token_acc": 0.8940206880820383, "train_speed(iter/s)": 0.121282 }, { "epoch": 1.7175225401803216, "grad_norm": 0.8445452451705933, "learning_rate": 3.863619020882184e-06, "loss": 0.32090349197387696, "memory(GiB)": 33.6, "step": 1370, "token_acc": 0.9026219178795533, "train_speed(iter/s)": 0.121395 }, { "epoch": 1.7237945903567229, "grad_norm": 0.8261458873748779, "learning_rate": 3.831655293801596e-06, "loss": 0.31141784191131594, "memory(GiB)": 33.6, "step": 1375, "token_acc": 0.9064525633470831, "train_speed(iter/s)": 0.121509 }, { "epoch": 1.7300666405331242, "grad_norm": 0.7714701890945435, "learning_rate": 3.7997419922003077e-06, "loss": 0.31881678104400635, "memory(GiB)": 33.6, "step": 1380, "token_acc": 0.8884819878472222, "train_speed(iter/s)": 0.121611 }, { "epoch": 1.7300666405331242, "eval_loss": 0.3424231708049774, "eval_runtime": 29.6736, "eval_samples_per_second": 17.355, "eval_steps_per_second": 4.347, "eval_token_acc": 0.8831922206387794, "step": 1380 }, { "epoch": 1.7363386907095255, "grad_norm": 0.7543424367904663, "learning_rate": 3.7678804934488146e-06, "loss": 0.31963376998901366, "memory(GiB)": 33.6, "step": 1385, "token_acc": 0.8950204631510191, "train_speed(iter/s)": 0.121307 }, { "epoch": 1.742610740885927, "grad_norm": 0.7479931116104126, "learning_rate": 3.736072172681818e-06, "loss": 0.3031449794769287, "memory(GiB)": 33.6, "step": 1390, "token_acc": 0.9012068207358186, "train_speed(iter/s)": 0.121432 }, { "epoch": 1.7488827910623286, "grad_norm": 0.7986470460891724, "learning_rate": 3.704318402738867e-06, "loss": 0.3046679973602295, "memory(GiB)": 33.6, "step": 1395, "token_acc": 0.9063746108046102, "train_speed(iter/s)": 0.121546 }, { "epoch": 1.75515484123873, "grad_norm": 0.8084174990653992, "learning_rate": 3.672620554105111e-06, "loss": 0.2944044589996338, "memory(GiB)": 33.6, "step": 1400, "token_acc": 0.9039976484420928, "train_speed(iter/s)": 0.12164 }, { "epoch": 1.75515484123873, "eval_loss": 0.34153878688812256, "eval_runtime": 29.599, "eval_samples_per_second": 17.399, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8834227512783972, "step": 1400 }, { "epoch": 1.7614268914151312, "grad_norm": 0.8045998215675354, "learning_rate": 3.6409799948521473e-06, "loss": 0.3039552688598633, "memory(GiB)": 33.6, "step": 1405, "token_acc": 0.8954536924550851, "train_speed(iter/s)": 0.121314 }, { "epoch": 1.7676989415915327, "grad_norm": 0.8704200983047485, "learning_rate": 3.6093980905789824e-06, "loss": 0.31539764404296877, "memory(GiB)": 33.6, "step": 1410, "token_acc": 0.8984341080783814, "train_speed(iter/s)": 0.121425 }, { "epoch": 1.7739709917679343, "grad_norm": 0.8571969270706177, "learning_rate": 3.577876204353079e-06, "loss": 0.32133939266204836, "memory(GiB)": 33.6, "step": 1415, "token_acc": 0.8925621987755634, "train_speed(iter/s)": 0.121539 }, { "epoch": 1.7802430419443356, "grad_norm": 0.800134003162384, "learning_rate": 3.5464156966515426e-06, "loss": 0.2933182716369629, "memory(GiB)": 33.6, "step": 1420, "token_acc": 0.9109921166146066, "train_speed(iter/s)": 0.121629 }, { "epoch": 1.7802430419443356, "eval_loss": 0.3414349853992462, "eval_runtime": 29.7108, "eval_samples_per_second": 17.334, "eval_steps_per_second": 4.342, "eval_token_acc": 0.8835065806018946, "step": 1420 }, { "epoch": 1.786515092120737, "grad_norm": 0.8272744417190552, "learning_rate": 3.515017925302396e-06, "loss": 0.2981221675872803, "memory(GiB)": 33.6, "step": 1425, "token_acc": 0.894788244695019, "train_speed(iter/s)": 0.121322 }, { "epoch": 1.7927871422971384, "grad_norm": 0.8108351826667786, "learning_rate": 3.48368424542597e-06, "loss": 0.32099928855896, "memory(GiB)": 33.6, "step": 1430, "token_acc": 0.8884463309687173, "train_speed(iter/s)": 0.12142 }, { "epoch": 1.7990591924735397, "grad_norm": 0.8740379810333252, "learning_rate": 3.4524160093764288e-06, "loss": 0.28867833614349364, "memory(GiB)": 33.6, "step": 1435, "token_acc": 0.9071488053295753, "train_speed(iter/s)": 0.121496 }, { "epoch": 1.8053312426499413, "grad_norm": 0.7525773644447327, "learning_rate": 3.421214566683395e-06, "loss": 0.3096869230270386, "memory(GiB)": 33.6, "step": 1440, "token_acc": 0.8992541967025018, "train_speed(iter/s)": 0.121595 }, { "epoch": 1.8053312426499413, "eval_loss": 0.3419208526611328, "eval_runtime": 29.7092, "eval_samples_per_second": 17.335, "eval_steps_per_second": 4.342, "eval_token_acc": 0.8835065806018946, "step": 1440 }, { "epoch": 1.8116032928263426, "grad_norm": 0.7440472841262817, "learning_rate": 3.390081263993702e-06, "loss": 0.30480227470397947, "memory(GiB)": 33.6, "step": 1445, "token_acc": 0.8953731301707568, "train_speed(iter/s)": 0.121312 }, { "epoch": 1.817875343002744, "grad_norm": 0.7330334186553955, "learning_rate": 3.3590174450132828e-06, "loss": 0.3220362186431885, "memory(GiB)": 33.6, "step": 1450, "token_acc": 0.8987148014440434, "train_speed(iter/s)": 0.121421 }, { "epoch": 1.8241473931791454, "grad_norm": 0.9180962443351746, "learning_rate": 3.3280244504491664e-06, "loss": 0.3133381366729736, "memory(GiB)": 33.6, "step": 1455, "token_acc": 0.9044320549642395, "train_speed(iter/s)": 0.121527 }, { "epoch": 1.830419443355547, "grad_norm": 0.8578007817268372, "learning_rate": 3.297103617951618e-06, "loss": 0.31406660079956056, "memory(GiB)": 33.6, "step": 1460, "token_acc": 0.9081761962692619, "train_speed(iter/s)": 0.121624 }, { "epoch": 1.830419443355547, "eval_loss": 0.34057924151420593, "eval_runtime": 29.6166, "eval_samples_per_second": 17.389, "eval_steps_per_second": 4.356, "eval_token_acc": 0.8835023891357197, "step": 1460 }, { "epoch": 1.8366914935319483, "grad_norm": 0.8259463310241699, "learning_rate": 3.2662562820564043e-06, "loss": 0.3026223659515381, "memory(GiB)": 33.6, "step": 1465, "token_acc": 0.8945344631088648, "train_speed(iter/s)": 0.121317 }, { "epoch": 1.8429635437083496, "grad_norm": 0.848983108997345, "learning_rate": 3.2354837741271994e-06, "loss": 0.30731768608093263, "memory(GiB)": 33.6, "step": 1470, "token_acc": 0.8996287038609945, "train_speed(iter/s)": 0.121417 }, { "epoch": 1.8492355938847511, "grad_norm": 0.7451069951057434, "learning_rate": 3.2047874222981134e-06, "loss": 0.3043700933456421, "memory(GiB)": 33.6, "step": 1475, "token_acc": 0.9003040283759818, "train_speed(iter/s)": 0.121511 }, { "epoch": 1.8555076440611527, "grad_norm": 0.8326135277748108, "learning_rate": 3.174168551416384e-06, "loss": 0.3095861434936523, "memory(GiB)": 33.6, "step": 1480, "token_acc": 0.9073784192512802, "train_speed(iter/s)": 0.121591 }, { "epoch": 1.8555076440611527, "eval_loss": 0.3408661484718323, "eval_runtime": 29.6597, "eval_samples_per_second": 17.364, "eval_steps_per_second": 4.349, "eval_token_acc": 0.8834227512783972, "step": 1480 }, { "epoch": 1.861779694237554, "grad_norm": 0.8090599775314331, "learning_rate": 3.1436284829851883e-06, "loss": 0.3018056392669678, "memory(GiB)": 33.6, "step": 1485, "token_acc": 0.8960189466561408, "train_speed(iter/s)": 0.121294 }, { "epoch": 1.8680517444139553, "grad_norm": 0.8168737292289734, "learning_rate": 3.113168535106604e-06, "loss": 0.3135341167449951, "memory(GiB)": 33.6, "step": 1490, "token_acc": 0.8999463724814218, "train_speed(iter/s)": 0.1214 }, { "epoch": 1.8743237945903566, "grad_norm": 0.8455703258514404, "learning_rate": 3.08279002242473e-06, "loss": 0.31365869045257566, "memory(GiB)": 33.6, "step": 1495, "token_acc": 0.8966787576513262, "train_speed(iter/s)": 0.121495 }, { "epoch": 1.8805958447667581, "grad_norm": 0.7874972820281982, "learning_rate": 3.0524942560689387e-06, "loss": 0.2999868392944336, "memory(GiB)": 33.6, "step": 1500, "token_acc": 0.9022604710944704, "train_speed(iter/s)": 0.121584 }, { "epoch": 1.8805958447667581, "eval_loss": 0.34014302492141724, "eval_runtime": 29.589, "eval_samples_per_second": 17.405, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8837999832341353, "step": 1500 }, { "epoch": 1.8868678949431597, "grad_norm": 0.7840932607650757, "learning_rate": 3.0222825435972948e-06, "loss": 0.30387725830078127, "memory(GiB)": 33.6, "step": 1505, "token_acc": 0.896817482063844, "train_speed(iter/s)": 0.121288 }, { "epoch": 1.893139945119561, "grad_norm": 0.7794051766395569, "learning_rate": 2.99215618894011e-06, "loss": 0.2972860813140869, "memory(GiB)": 33.6, "step": 1510, "token_acc": 0.9043185860382392, "train_speed(iter/s)": 0.121375 }, { "epoch": 1.8994119952959623, "grad_norm": 0.8160467147827148, "learning_rate": 2.9621164923436774e-06, "loss": 0.28430678844451907, "memory(GiB)": 33.6, "step": 1515, "token_acc": 0.9079036497141464, "train_speed(iter/s)": 0.121466 }, { "epoch": 1.9056840454723638, "grad_norm": 0.7849772572517395, "learning_rate": 2.9321647503141525e-06, "loss": 0.30244333744049073, "memory(GiB)": 33.6, "step": 1520, "token_acc": 0.9014301372723003, "train_speed(iter/s)": 0.121546 }, { "epoch": 1.9056840454723638, "eval_loss": 0.34012025594711304, "eval_runtime": 29.6307, "eval_samples_per_second": 17.381, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8839005784223322, "step": 1520 }, { "epoch": 1.9119560956487653, "grad_norm": 0.7242955565452576, "learning_rate": 2.902302255561585e-06, "loss": 0.31361520290374756, "memory(GiB)": 33.6, "step": 1525, "token_acc": 0.8939074979877407, "train_speed(iter/s)": 0.121264 }, { "epoch": 1.9182281458251667, "grad_norm": 0.7752643823623657, "learning_rate": 2.87253029694414e-06, "loss": 0.3019782304763794, "memory(GiB)": 33.6, "step": 1530, "token_acc": 0.8990245948922335, "train_speed(iter/s)": 0.121338 }, { "epoch": 1.924500196001568, "grad_norm": 0.9198828935623169, "learning_rate": 2.8428501594124602e-06, "loss": 0.2983893871307373, "memory(GiB)": 33.6, "step": 1535, "token_acc": 0.9083565129904717, "train_speed(iter/s)": 0.121439 }, { "epoch": 1.9307722461779693, "grad_norm": 0.808824896812439, "learning_rate": 2.813263123954214e-06, "loss": 0.30156128406524657, "memory(GiB)": 33.6, "step": 1540, "token_acc": 0.9017935200148356, "train_speed(iter/s)": 0.12152 }, { "epoch": 1.9307722461779693, "eval_loss": 0.3396177589893341, "eval_runtime": 29.634, "eval_samples_per_second": 17.379, "eval_steps_per_second": 4.353, "eval_token_acc": 0.8845460642132618, "step": 1540 }, { "epoch": 1.9370442963543708, "grad_norm": 0.7807343006134033, "learning_rate": 2.7837704675388045e-06, "loss": 0.2953279972076416, "memory(GiB)": 33.6, "step": 1545, "token_acc": 0.8962438270065346, "train_speed(iter/s)": 0.121237 }, { "epoch": 1.9433163465307723, "grad_norm": 0.7839388251304626, "learning_rate": 2.7543734630622622e-06, "loss": 0.3038333415985107, "memory(GiB)": 33.6, "step": 1550, "token_acc": 0.9053210378601294, "train_speed(iter/s)": 0.121329 }, { "epoch": 1.9495883967071737, "grad_norm": 0.7545835375785828, "learning_rate": 2.7250733792922997e-06, "loss": 0.29517788887023927, "memory(GiB)": 33.6, "step": 1555, "token_acc": 0.90642349335544, "train_speed(iter/s)": 0.1214 }, { "epoch": 1.955860446883575, "grad_norm": 0.7737696170806885, "learning_rate": 2.6958714808135546e-06, "loss": 0.295018744468689, "memory(GiB)": 33.6, "step": 1560, "token_acc": 0.9136314827175901, "train_speed(iter/s)": 0.121508 }, { "epoch": 1.955860446883575, "eval_loss": 0.33957967162132263, "eval_runtime": 29.6912, "eval_samples_per_second": 17.345, "eval_steps_per_second": 4.345, "eval_token_acc": 0.8840179394752284, "step": 1560 }, { "epoch": 1.9621324970599765, "grad_norm": 0.7262492179870605, "learning_rate": 2.6667690279730096e-06, "loss": 0.30216593742370607, "memory(GiB)": 33.6, "step": 1565, "token_acc": 0.8968756421421372, "train_speed(iter/s)": 0.121231 }, { "epoch": 1.968404547236378, "grad_norm": 0.7777485847473145, "learning_rate": 2.6377672768256003e-06, "loss": 0.2954871654510498, "memory(GiB)": 33.6, "step": 1570, "token_acc": 0.9041269349045146, "train_speed(iter/s)": 0.121291 }, { "epoch": 1.9746765974127793, "grad_norm": 0.8558617234230042, "learning_rate": 2.608867479080001e-06, "loss": 0.2946753025054932, "memory(GiB)": 33.6, "step": 1575, "token_acc": 0.8997012032625373, "train_speed(iter/s)": 0.121379 }, { "epoch": 1.9809486475891807, "grad_norm": 0.7917863726615906, "learning_rate": 2.5800708820446002e-06, "loss": 0.3050684928894043, "memory(GiB)": 33.6, "step": 1580, "token_acc": 0.9030535237431789, "train_speed(iter/s)": 0.121479 }, { "epoch": 1.9809486475891807, "eval_loss": 0.33898818492889404, "eval_runtime": 29.6993, "eval_samples_per_second": 17.34, "eval_steps_per_second": 4.344, "eval_token_acc": 0.8843700226339174, "step": 1580 }, { "epoch": 1.987220697765582, "grad_norm": 0.8041856288909912, "learning_rate": 2.551378728573668e-06, "loss": 0.2989157438278198, "memory(GiB)": 33.6, "step": 1585, "token_acc": 0.8997509598422746, "train_speed(iter/s)": 0.12119 }, { "epoch": 1.9934927479419835, "grad_norm": 0.8586787581443787, "learning_rate": 2.5227922570137143e-06, "loss": 0.30647430419921873, "memory(GiB)": 33.6, "step": 1590, "token_acc": 0.8982128790862545, "train_speed(iter/s)": 0.121292 }, { "epoch": 1.999764798118385, "grad_norm": 0.7968061566352844, "learning_rate": 2.4943127011500483e-06, "loss": 0.31006150245666503, "memory(GiB)": 33.6, "step": 1595, "token_acc": 0.9085869784317678, "train_speed(iter/s)": 0.121394 }, { "epoch": 2.005017640141121, "grad_norm": 0.7677069306373596, "learning_rate": 2.465941290153514e-06, "loss": 0.2801233768463135, "memory(GiB)": 33.6, "step": 1600, "token_acc": 0.9198317869301115, "train_speed(iter/s)": 0.121535 }, { "epoch": 2.005017640141121, "eval_loss": 0.3399621546268463, "eval_runtime": 29.7379, "eval_samples_per_second": 17.318, "eval_steps_per_second": 4.338, "eval_token_acc": 0.8842107469192724, "step": 1600 }, { "epoch": 2.0112896903175224, "grad_norm": 0.805497944355011, "learning_rate": 2.4376792485274577e-06, "loss": 0.2575787782669067, "memory(GiB)": 33.6, "step": 1605, "token_acc": 0.9027501640006056, "train_speed(iter/s)": 0.121259 }, { "epoch": 2.017561740493924, "grad_norm": 0.7615222930908203, "learning_rate": 2.409527796054863e-06, "loss": 0.25977578163146975, "memory(GiB)": 33.6, "step": 1610, "token_acc": 0.9162343277129269, "train_speed(iter/s)": 0.121356 }, { "epoch": 2.0238337906703254, "grad_norm": 0.8287392854690552, "learning_rate": 2.38148814774572e-06, "loss": 0.24034299850463867, "memory(GiB)": 33.6, "step": 1615, "token_acc": 0.9182042343338152, "train_speed(iter/s)": 0.121426 }, { "epoch": 2.0301058408467267, "grad_norm": 0.8738728761672974, "learning_rate": 2.353561513784566e-06, "loss": 0.2571218252182007, "memory(GiB)": 33.6, "step": 1620, "token_acc": 0.9214568880079287, "train_speed(iter/s)": 0.121521 }, { "epoch": 2.0301058408467267, "eval_loss": 0.3534950017929077, "eval_runtime": 29.6483, "eval_samples_per_second": 17.37, "eval_steps_per_second": 4.351, "eval_token_acc": 0.883016179059435, "step": 1620 }, { "epoch": 2.036377891023128, "grad_norm": 0.8269490599632263, "learning_rate": 2.325749099478277e-06, "loss": 0.2555586814880371, "memory(GiB)": 33.6, "step": 1625, "token_acc": 0.9017842820882989, "train_speed(iter/s)": 0.121252 }, { "epoch": 2.04264994119953, "grad_norm": 0.802456259727478, "learning_rate": 2.29805210520403e-06, "loss": 0.25757761001586915, "memory(GiB)": 33.6, "step": 1630, "token_acc": 0.9153420162034663, "train_speed(iter/s)": 0.121337 }, { "epoch": 2.048921991375931, "grad_norm": 0.8305613398551941, "learning_rate": 2.270471726357501e-06, "loss": 0.24194817543029784, "memory(GiB)": 33.6, "step": 1635, "token_acc": 0.9273302172119879, "train_speed(iter/s)": 0.121409 }, { "epoch": 2.0551940415523324, "grad_norm": 0.8397504687309265, "learning_rate": 2.243009153301276e-06, "loss": 0.25193355083465574, "memory(GiB)": 33.6, "step": 1640, "token_acc": 0.918960244648318, "train_speed(iter/s)": 0.121491 }, { "epoch": 2.0551940415523324, "eval_loss": 0.3526792824268341, "eval_runtime": 29.596, "eval_samples_per_second": 17.401, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8830622851873585, "step": 1640 }, { "epoch": 2.0614660917287337, "grad_norm": 0.7059065103530884, "learning_rate": 2.215665571313468e-06, "loss": 0.25691723823547363, "memory(GiB)": 33.6, "step": 1645, "token_acc": 0.9024566051703649, "train_speed(iter/s)": 0.121247 }, { "epoch": 2.067738141905135, "grad_norm": 0.8032014966011047, "learning_rate": 2.188442160536562e-06, "loss": 0.25560142993927004, "memory(GiB)": 33.6, "step": 1650, "token_acc": 0.9098803496167228, "train_speed(iter/s)": 0.121319 }, { "epoch": 2.074010192081537, "grad_norm": 0.7990818619728088, "learning_rate": 2.1613400959264845e-06, "loss": 0.24714956283569336, "memory(GiB)": 33.6, "step": 1655, "token_acc": 0.9181002989040186, "train_speed(iter/s)": 0.121389 }, { "epoch": 2.080282242257938, "grad_norm": 0.8864375948905945, "learning_rate": 2.1343605472018954e-06, "loss": 0.2497623920440674, "memory(GiB)": 33.6, "step": 1660, "token_acc": 0.9190709535476774, "train_speed(iter/s)": 0.121476 }, { "epoch": 2.080282242257938, "eval_loss": 0.3539391756057739, "eval_runtime": 29.6226, "eval_samples_per_second": 17.385, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8831544974432056, "step": 1660 }, { "epoch": 2.0865542924343394, "grad_norm": 0.8491289019584656, "learning_rate": 2.1075046787936842e-06, "loss": 0.26420676708221436, "memory(GiB)": 33.6, "step": 1665, "token_acc": 0.8965886626349976, "train_speed(iter/s)": 0.121214 }, { "epoch": 2.0928263426107407, "grad_norm": 0.6870825886726379, "learning_rate": 2.0807736497947436e-06, "loss": 0.25780699253082273, "memory(GiB)": 33.6, "step": 1670, "token_acc": 0.9150844173816772, "train_speed(iter/s)": 0.121287 }, { "epoch": 2.0990983927871425, "grad_norm": 0.7348774075508118, "learning_rate": 2.0541686139099164e-06, "loss": 0.24700713157653809, "memory(GiB)": 33.6, "step": 1675, "token_acc": 0.9142644537189816, "train_speed(iter/s)": 0.121368 }, { "epoch": 2.105370442963544, "grad_norm": 0.789790153503418, "learning_rate": 2.0276907194062167e-06, "loss": 0.25561089515686036, "memory(GiB)": 33.6, "step": 1680, "token_acc": 0.9232616718411341, "train_speed(iter/s)": 0.121454 }, { "epoch": 2.105370442963544, "eval_loss": 0.3535526394844055, "eval_runtime": 29.6041, "eval_samples_per_second": 17.396, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8829072009388884, "step": 1680 }, { "epoch": 2.111642493139945, "grad_norm": 0.825501024723053, "learning_rate": 2.0013411090632638e-06, "loss": 0.2573189973831177, "memory(GiB)": 33.6, "step": 1685, "token_acc": 0.8988425094093394, "train_speed(iter/s)": 0.1212 }, { "epoch": 2.1179145433163464, "grad_norm": 0.7919580340385437, "learning_rate": 1.9751209201239696e-06, "loss": 0.24840922355651857, "memory(GiB)": 33.6, "step": 1690, "token_acc": 0.9239367219917013, "train_speed(iter/s)": 0.121296 }, { "epoch": 2.1241865934927477, "grad_norm": 0.7574432492256165, "learning_rate": 1.9490312842454425e-06, "loss": 0.24667706489562988, "memory(GiB)": 33.6, "step": 1695, "token_acc": 0.9196868524061709, "train_speed(iter/s)": 0.12138 }, { "epoch": 2.1304586436691495, "grad_norm": 0.7842095494270325, "learning_rate": 1.9230733274501525e-06, "loss": 0.25333414077758787, "memory(GiB)": 33.6, "step": 1700, "token_acc": 0.9171972811047527, "train_speed(iter/s)": 0.12145 }, { "epoch": 2.1304586436691495, "eval_loss": 0.3539462685585022, "eval_runtime": 29.5964, "eval_samples_per_second": 17.401, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8834395171430967, "step": 1700 }, { "epoch": 2.136730693845551, "grad_norm": 0.8125796914100647, "learning_rate": 1.8972481700773388e-06, "loss": 0.25501580238342286, "memory(GiB)": 33.6, "step": 1705, "token_acc": 0.9002077731605964, "train_speed(iter/s)": 0.1212 }, { "epoch": 2.143002744021952, "grad_norm": 0.8007329702377319, "learning_rate": 1.8715569267346368e-06, "loss": 0.25573315620422366, "memory(GiB)": 33.6, "step": 1710, "token_acc": 0.9234607218683651, "train_speed(iter/s)": 0.121277 }, { "epoch": 2.1492747941983534, "grad_norm": 0.7575182318687439, "learning_rate": 1.846000706249997e-06, "loss": 0.2531334400177002, "memory(GiB)": 33.6, "step": 1715, "token_acc": 0.9233944058674252, "train_speed(iter/s)": 0.121348 }, { "epoch": 2.155546844374755, "grad_norm": 0.9734401702880859, "learning_rate": 1.8205806116238055e-06, "loss": 0.248917818069458, "memory(GiB)": 33.6, "step": 1720, "token_acc": 0.9278663414080671, "train_speed(iter/s)": 0.121411 }, { "epoch": 2.155546844374755, "eval_loss": 0.3528047800064087, "eval_runtime": 29.5743, "eval_samples_per_second": 17.414, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8836658563165395, "step": 1720 }, { "epoch": 2.1618188945511565, "grad_norm": 0.7814000844955444, "learning_rate": 1.7952977399812988e-06, "loss": 0.25339622497558595, "memory(GiB)": 33.6, "step": 1725, "token_acc": 0.9013212593534945, "train_speed(iter/s)": 0.121154 }, { "epoch": 2.168090944727558, "grad_norm": 0.7855998277664185, "learning_rate": 1.7701531825251888e-06, "loss": 0.256337571144104, "memory(GiB)": 33.6, "step": 1730, "token_acc": 0.9186892875503964, "train_speed(iter/s)": 0.121253 }, { "epoch": 2.174362994903959, "grad_norm": 0.8425953388214111, "learning_rate": 1.7451480244885938e-06, "loss": 0.25911998748779297, "memory(GiB)": 33.6, "step": 1735, "token_acc": 0.9183445487671889, "train_speed(iter/s)": 0.121337 }, { "epoch": 2.1806350450803604, "grad_norm": 0.8361583948135376, "learning_rate": 1.720283345088178e-06, "loss": 0.259613561630249, "memory(GiB)": 33.6, "step": 1740, "token_acc": 0.9204680395053146, "train_speed(iter/s)": 0.121423 }, { "epoch": 2.1806350450803604, "eval_loss": 0.353371798992157, "eval_runtime": 29.6124, "eval_samples_per_second": 17.391, "eval_steps_per_second": 4.356, "eval_token_acc": 0.883364070751949, "step": 1740 }, { "epoch": 2.186907095256762, "grad_norm": 0.8060917258262634, "learning_rate": 1.695560217477582e-06, "loss": 0.24777050018310548, "memory(GiB)": 33.6, "step": 1745, "token_acc": 0.9022448606112365, "train_speed(iter/s)": 0.121162 }, { "epoch": 2.1931791454331635, "grad_norm": 0.7034837603569031, "learning_rate": 1.6709797087011066e-06, "loss": 0.24974467754364013, "memory(GiB)": 33.6, "step": 1750, "token_acc": 0.9183000028447075, "train_speed(iter/s)": 0.121239 }, { "epoch": 2.199451195609565, "grad_norm": 0.7496914267539978, "learning_rate": 1.6465428796476584e-06, "loss": 0.24403119087219238, "memory(GiB)": 33.6, "step": 1755, "token_acc": 0.9174810328140123, "train_speed(iter/s)": 0.121305 }, { "epoch": 2.205723245785966, "grad_norm": 0.9179002046585083, "learning_rate": 1.6222507850049602e-06, "loss": 0.2549721717834473, "memory(GiB)": 33.6, "step": 1760, "token_acc": 0.9200168800112534, "train_speed(iter/s)": 0.121396 }, { "epoch": 2.205723245785966, "eval_loss": 0.3538911044597626, "eval_runtime": 29.6392, "eval_samples_per_second": 17.376, "eval_steps_per_second": 4.352, "eval_token_acc": 0.8833808366166485, "step": 1760 }, { "epoch": 2.211995295962368, "grad_norm": 0.794003963470459, "learning_rate": 1.598104473214031e-06, "loss": 0.25570311546325686, "memory(GiB)": 33.6, "step": 1765, "token_acc": 0.8993348159524498, "train_speed(iter/s)": 0.121141 }, { "epoch": 2.218267346138769, "grad_norm": 0.7644505500793457, "learning_rate": 1.5741049864239383e-06, "loss": 0.24341793060302735, "memory(GiB)": 33.6, "step": 1770, "token_acc": 0.9168759872594971, "train_speed(iter/s)": 0.12122 }, { "epoch": 2.2245393963151705, "grad_norm": 0.8791477680206299, "learning_rate": 1.550253360446815e-06, "loss": 0.2522608757019043, "memory(GiB)": 33.6, "step": 1775, "token_acc": 0.9258004677796596, "train_speed(iter/s)": 0.121309 }, { "epoch": 2.230811446491572, "grad_norm": 0.8298718333244324, "learning_rate": 1.5265506247131617e-06, "loss": 0.2546123266220093, "memory(GiB)": 33.6, "step": 1780, "token_acc": 0.9118973044798785, "train_speed(iter/s)": 0.121394 }, { "epoch": 2.230811446491572, "eval_loss": 0.35324588418006897, "eval_runtime": 29.6796, "eval_samples_per_second": 17.352, "eval_steps_per_second": 4.346, "eval_token_acc": 0.8832844328946265, "step": 1780 }, { "epoch": 2.2370834966679736, "grad_norm": 0.816360592842102, "learning_rate": 1.5029978022274067e-06, "loss": 0.2614459991455078, "memory(GiB)": 33.6, "step": 1785, "token_acc": 0.9004699229406026, "train_speed(iter/s)": 0.121137 }, { "epoch": 2.243355546844375, "grad_norm": 0.8427248597145081, "learning_rate": 1.47959590952376e-06, "loss": 0.25159344673156736, "memory(GiB)": 33.6, "step": 1790, "token_acc": 0.9224594190787307, "train_speed(iter/s)": 0.121218 }, { "epoch": 2.249627597020776, "grad_norm": 0.7986406683921814, "learning_rate": 1.4563459566223358e-06, "loss": 0.24192914962768555, "memory(GiB)": 33.6, "step": 1795, "token_acc": 0.9266775696302905, "train_speed(iter/s)": 0.121301 }, { "epoch": 2.2558996471971775, "grad_norm": 0.7850765585899353, "learning_rate": 1.4332489469855698e-06, "loss": 0.24650468826293945, "memory(GiB)": 33.6, "step": 1800, "token_acc": 0.9252590346221885, "train_speed(iter/s)": 0.121371 }, { "epoch": 2.2558996471971775, "eval_loss": 0.35429683327674866, "eval_runtime": 29.682, "eval_samples_per_second": 17.351, "eval_steps_per_second": 4.346, "eval_token_acc": 0.8836155587224411, "step": 1800 }, { "epoch": 2.262171697373579, "grad_norm": 0.7986750602722168, "learning_rate": 1.4103058774748923e-06, "loss": 0.2553676128387451, "memory(GiB)": 33.6, "step": 1805, "token_acc": 0.8993731709045985, "train_speed(iter/s)": 0.121131 }, { "epoch": 2.2684437475499806, "grad_norm": 0.7992218732833862, "learning_rate": 1.3875177383077233e-06, "loss": 0.2504106521606445, "memory(GiB)": 33.6, "step": 1810, "token_acc": 0.9165525002446423, "train_speed(iter/s)": 0.121197 }, { "epoch": 2.274715797726382, "grad_norm": 0.7687636017799377, "learning_rate": 1.3648855130147216e-06, "loss": 0.2536652088165283, "memory(GiB)": 33.6, "step": 1815, "token_acc": 0.9203490718321227, "train_speed(iter/s)": 0.121289 }, { "epoch": 2.280987847902783, "grad_norm": 0.8168686628341675, "learning_rate": 1.3424101783973403e-06, "loss": 0.24730236530303956, "memory(GiB)": 33.6, "step": 1820, "token_acc": 0.9207464126371638, "train_speed(iter/s)": 0.121365 }, { "epoch": 2.280987847902783, "eval_loss": 0.35338255763053894, "eval_runtime": 29.6568, "eval_samples_per_second": 17.365, "eval_steps_per_second": 4.35, "eval_token_acc": 0.8838293234973594, "step": 1820 }, { "epoch": 2.2872598980791845, "grad_norm": 0.8269901871681213, "learning_rate": 1.3200927044856714e-06, "loss": 0.2572518825531006, "memory(GiB)": 33.6, "step": 1825, "token_acc": 0.9020789813985874, "train_speed(iter/s)": 0.121125 }, { "epoch": 2.293531948255586, "grad_norm": 0.8186900019645691, "learning_rate": 1.2979340544965745e-06, "loss": 0.2439603328704834, "memory(GiB)": 33.6, "step": 1830, "token_acc": 0.9237335485473057, "train_speed(iter/s)": 0.121206 }, { "epoch": 2.2998039984319876, "grad_norm": 0.7761522531509399, "learning_rate": 1.2759351847921053e-06, "loss": 0.2594336748123169, "memory(GiB)": 33.6, "step": 1835, "token_acc": 0.9104045438093382, "train_speed(iter/s)": 0.121282 }, { "epoch": 2.306076048608389, "grad_norm": 0.7789030075073242, "learning_rate": 1.25409704483824e-06, "loss": 0.25881831645965575, "memory(GiB)": 33.6, "step": 1840, "token_acc": 0.9176125295446872, "train_speed(iter/s)": 0.121357 }, { "epoch": 2.306076048608389, "eval_loss": 0.3527611494064331, "eval_runtime": 29.6253, "eval_samples_per_second": 17.384, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8836407075194903, "step": 1840 }, { "epoch": 2.31234809878479, "grad_norm": 0.8218780755996704, "learning_rate": 1.232420577163902e-06, "loss": 0.25008678436279297, "memory(GiB)": 33.6, "step": 1845, "token_acc": 0.9028911896854854, "train_speed(iter/s)": 0.121116 }, { "epoch": 2.3186201489611915, "grad_norm": 0.7720741629600525, "learning_rate": 1.2109067173202731e-06, "loss": 0.2578773021697998, "memory(GiB)": 33.6, "step": 1850, "token_acc": 0.9163940481215703, "train_speed(iter/s)": 0.121195 }, { "epoch": 2.3248921991375933, "grad_norm": 0.9268919229507446, "learning_rate": 1.1895563938404203e-06, "loss": 0.25402810573577883, "memory(GiB)": 33.6, "step": 1855, "token_acc": 0.9201602408716287, "train_speed(iter/s)": 0.121273 }, { "epoch": 2.3311642493139946, "grad_norm": 0.8763542771339417, "learning_rate": 1.1683705281992202e-06, "loss": 0.25608100891113283, "memory(GiB)": 33.6, "step": 1860, "token_acc": 0.9185731132075472, "train_speed(iter/s)": 0.121349 }, { "epoch": 2.3311642493139946, "eval_loss": 0.35326558351516724, "eval_runtime": 29.6773, "eval_samples_per_second": 17.353, "eval_steps_per_second": 4.347, "eval_token_acc": 0.883716153910638, "step": 1860 }, { "epoch": 2.337436299490396, "grad_norm": 0.777973473072052, "learning_rate": 1.1473500347735927e-06, "loss": 0.26839523315429686, "memory(GiB)": 33.6, "step": 1865, "token_acc": 0.9000454114673827, "train_speed(iter/s)": 0.121142 }, { "epoch": 2.343708349666797, "grad_norm": 0.8818052411079407, "learning_rate": 1.1264958208030224e-06, "loss": 0.25702362060546874, "memory(GiB)": 33.6, "step": 1870, "token_acc": 0.9185548071034905, "train_speed(iter/s)": 0.121226 }, { "epoch": 2.349980399843199, "grad_norm": 0.8378356099128723, "learning_rate": 1.105808786350423e-06, "loss": 0.26531405448913575, "memory(GiB)": 33.6, "step": 1875, "token_acc": 0.9187210090165217, "train_speed(iter/s)": 0.121307 }, { "epoch": 2.3562524500196003, "grad_norm": 0.8721848130226135, "learning_rate": 1.085289824263273e-06, "loss": 0.2487030506134033, "memory(GiB)": 33.6, "step": 1880, "token_acc": 0.9182288299935358, "train_speed(iter/s)": 0.121392 }, { "epoch": 2.3562524500196003, "eval_loss": 0.3520548939704895, "eval_runtime": 29.802, "eval_samples_per_second": 17.281, "eval_steps_per_second": 4.329, "eval_token_acc": 0.8838251320311845, "step": 1880 }, { "epoch": 2.3625245001960016, "grad_norm": 0.8557198643684387, "learning_rate": 1.0649398201350907e-06, "loss": 0.2521216869354248, "memory(GiB)": 33.6, "step": 1885, "token_acc": 0.901105316416284, "train_speed(iter/s)": 0.121149 }, { "epoch": 2.368796550372403, "grad_norm": 0.8094416856765747, "learning_rate": 1.044759652267207e-06, "loss": 0.2516944408416748, "memory(GiB)": 33.6, "step": 1890, "token_acc": 0.9250567064872222, "train_speed(iter/s)": 0.12121 }, { "epoch": 2.375068600548804, "grad_norm": 0.7657244205474854, "learning_rate": 1.024750191630864e-06, "loss": 0.26051204204559325, "memory(GiB)": 33.6, "step": 1895, "token_acc": 0.919248217757615, "train_speed(iter/s)": 0.121289 }, { "epoch": 2.381340650725206, "grad_norm": 0.7817269563674927, "learning_rate": 1.0049123018296158e-06, "loss": 0.253748345375061, "memory(GiB)": 33.6, "step": 1900, "token_acc": 0.9162441497659907, "train_speed(iter/s)": 0.121364 }, { "epoch": 2.381340650725206, "eval_loss": 0.3525380492210388, "eval_runtime": 29.5922, "eval_samples_per_second": 17.403, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8839634504149552, "step": 1900 }, { "epoch": 2.3876127009016073, "grad_norm": 0.7803794145584106, "learning_rate": 9.852468390620624e-07, "loss": 0.2489931583404541, "memory(GiB)": 33.6, "step": 1905, "token_acc": 0.903682999338265, "train_speed(iter/s)": 0.121132 }, { "epoch": 2.3938847510780086, "grad_norm": 0.7914659976959229, "learning_rate": 9.65754652084896e-07, "loss": 0.2515411853790283, "memory(GiB)": 33.6, "step": 1910, "token_acc": 0.9195726949858777, "train_speed(iter/s)": 0.121205 }, { "epoch": 2.40015680125441, "grad_norm": 0.7828955054283142, "learning_rate": 9.464365821762611e-07, "loss": 0.25966334342956543, "memory(GiB)": 33.6, "step": 1915, "token_acc": 0.9172705476684356, "train_speed(iter/s)": 0.121274 }, { "epoch": 2.406428851430811, "grad_norm": 0.8471463918685913, "learning_rate": 9.272934630994579e-07, "loss": 0.26224753856658933, "memory(GiB)": 33.6, "step": 1920, "token_acc": 0.9160864785635764, "train_speed(iter/s)": 0.121357 }, { "epoch": 2.406428851430811, "eval_loss": 0.35224801301956177, "eval_runtime": 29.5629, "eval_samples_per_second": 17.42, "eval_steps_per_second": 4.364, "eval_token_acc": 0.8840053650767038, "step": 1920 }, { "epoch": 2.412700901607213, "grad_norm": 0.8474516272544861, "learning_rate": 9.083261210669458e-07, "loss": 0.24689688682556152, "memory(GiB)": 33.6, "step": 1925, "token_acc": 0.9013123514853926, "train_speed(iter/s)": 0.121128 }, { "epoch": 2.4189729517836143, "grad_norm": 0.8423922061920166, "learning_rate": 8.895353747046903e-07, "loss": 0.2583484649658203, "memory(GiB)": 33.6, "step": 1930, "token_acc": 0.9094146095182419, "train_speed(iter/s)": 0.121212 }, { "epoch": 2.4252450019600156, "grad_norm": 0.8378574848175049, "learning_rate": 8.70922035016829e-07, "loss": 0.25626001358032224, "memory(GiB)": 33.6, "step": 1935, "token_acc": 0.9183032429679099, "train_speed(iter/s)": 0.121293 }, { "epoch": 2.4315170521364173, "grad_norm": 0.8666955828666687, "learning_rate": 8.524869053506718e-07, "loss": 0.25580859184265137, "memory(GiB)": 33.6, "step": 1940, "token_acc": 0.9219803161348047, "train_speed(iter/s)": 0.121373 }, { "epoch": 2.4315170521364173, "eval_loss": 0.35265490412712097, "eval_runtime": 29.5852, "eval_samples_per_second": 17.407, "eval_steps_per_second": 4.36, "eval_token_acc": 0.883653281918015, "step": 1940 }, { "epoch": 2.4377891023128186, "grad_norm": 0.7587525248527527, "learning_rate": 8.342307813620254e-07, "loss": 0.25416107177734376, "memory(GiB)": 33.6, "step": 1945, "token_acc": 0.9025201951373665, "train_speed(iter/s)": 0.121167 }, { "epoch": 2.44406115248922, "grad_norm": 0.7803459763526917, "learning_rate": 8.161544509808522e-07, "loss": 0.25071403980255125, "memory(GiB)": 33.6, "step": 1950, "token_acc": 0.9182450116792841, "train_speed(iter/s)": 0.121237 }, { "epoch": 2.4503332026656213, "grad_norm": 0.8933445811271667, "learning_rate": 7.982586943772663e-07, "loss": 0.2513444900512695, "memory(GiB)": 33.6, "step": 1955, "token_acc": 0.9193114259516697, "train_speed(iter/s)": 0.121306 }, { "epoch": 2.4566052528420226, "grad_norm": 0.8411695957183838, "learning_rate": 7.805442839278643e-07, "loss": 0.25985763072967527, "memory(GiB)": 33.6, "step": 1960, "token_acc": 0.9140053688790151, "train_speed(iter/s)": 0.121373 }, { "epoch": 2.4566052528420226, "eval_loss": 0.3529517650604248, "eval_runtime": 29.6072, "eval_samples_per_second": 17.394, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8836448989856652, "step": 1960 }, { "epoch": 2.4628773030184243, "grad_norm": 0.7706183791160583, "learning_rate": 7.630119841823808e-07, "loss": 0.24757421016693115, "memory(GiB)": 33.6, "step": 1965, "token_acc": 0.9010213793035783, "train_speed(iter/s)": 0.12116 }, { "epoch": 2.4691493531948256, "grad_norm": 0.7962974905967712, "learning_rate": 7.456625518306976e-07, "loss": 0.25676703453063965, "memory(GiB)": 33.6, "step": 1970, "token_acc": 0.9233753637245393, "train_speed(iter/s)": 0.121238 }, { "epoch": 2.475421403371227, "grad_norm": 0.7932366132736206, "learning_rate": 7.284967356701839e-07, "loss": 0.25052144527435305, "memory(GiB)": 33.6, "step": 1975, "token_acc": 0.9187351328989555, "train_speed(iter/s)": 0.121285 }, { "epoch": 2.4816934535476283, "grad_norm": 0.8126112222671509, "learning_rate": 7.115152765733768e-07, "loss": 0.2548501968383789, "memory(GiB)": 33.6, "step": 1980, "token_acc": 0.9198236175051306, "train_speed(iter/s)": 0.121368 }, { "epoch": 2.4816934535476283, "eval_loss": 0.352863073348999, "eval_runtime": 29.594, "eval_samples_per_second": 17.402, "eval_steps_per_second": 4.359, "eval_token_acc": 0.8838125576326599, "step": 1980 }, { "epoch": 2.4879655037240296, "grad_norm": 0.8036893606185913, "learning_rate": 6.94718907456009e-07, "loss": 0.26411118507385256, "memory(GiB)": 33.6, "step": 1985, "token_acc": 0.9022120803784831, "train_speed(iter/s)": 0.121147 }, { "epoch": 2.4942375539004313, "grad_norm": 0.8398657441139221, "learning_rate": 6.781083532453702e-07, "loss": 0.24961705207824708, "memory(GiB)": 33.6, "step": 1990, "token_acc": 0.9132801067953688, "train_speed(iter/s)": 0.121219 }, { "epoch": 2.5005096040768326, "grad_norm": 0.840071976184845, "learning_rate": 6.61684330849025e-07, "loss": 0.25878229141235354, "memory(GiB)": 33.6, "step": 1995, "token_acc": 0.9152943072831314, "train_speed(iter/s)": 0.121292 }, { "epoch": 2.506781654253234, "grad_norm": 0.7767475247383118, "learning_rate": 6.454475491238682e-07, "loss": 0.2700009346008301, "memory(GiB)": 33.6, "step": 2000, "token_acc": 0.9162267615331651, "train_speed(iter/s)": 0.121371 }, { "epoch": 2.506781654253234, "eval_loss": 0.3521389663219452, "eval_runtime": 29.5782, "eval_samples_per_second": 17.411, "eval_steps_per_second": 4.361, "eval_token_acc": 0.883841897895884, "step": 2000 }, { "epoch": 2.5130537044296353, "grad_norm": 0.7912172675132751, "learning_rate": 6.293987088455355e-07, "loss": 0.25088133811950686, "memory(GiB)": 33.6, "step": 2005, "token_acc": 0.9012983750990204, "train_speed(iter/s)": 0.121148 }, { "epoch": 2.5193257546060366, "grad_norm": 0.8571103811264038, "learning_rate": 6.135385026781476e-07, "loss": 0.25229463577270506, "memory(GiB)": 33.6, "step": 2010, "token_acc": 0.9172969454855578, "train_speed(iter/s)": 0.121222 }, { "epoch": 2.5255978047824383, "grad_norm": 0.823409914970398, "learning_rate": 5.978676151444285e-07, "loss": 0.2578453540802002, "memory(GiB)": 33.6, "step": 2015, "token_acc": 0.921449302499766, "train_speed(iter/s)": 0.121294 }, { "epoch": 2.5318698549588396, "grad_norm": 0.737509548664093, "learning_rate": 5.823867225961516e-07, "loss": 0.26110265254974363, "memory(GiB)": 33.6, "step": 2020, "token_acc": 0.9162979877265591, "train_speed(iter/s)": 0.121359 }, { "epoch": 2.5318698549588396, "eval_loss": 0.35181859135627747, "eval_runtime": 29.6024, "eval_samples_per_second": 17.397, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8837580685723866, "step": 2020 }, { "epoch": 2.538141905135241, "grad_norm": 0.9168305397033691, "learning_rate": 5.670964931849521e-07, "loss": 0.2577165365219116, "memory(GiB)": 33.6, "step": 2025, "token_acc": 0.8999222873752308, "train_speed(iter/s)": 0.121158 }, { "epoch": 2.5444139553116427, "grad_norm": 0.8547112345695496, "learning_rate": 5.519975868334914e-07, "loss": 0.23379290103912354, "memory(GiB)": 33.6, "step": 2030, "token_acc": 0.9258048932965816, "train_speed(iter/s)": 0.121231 }, { "epoch": 2.550686005488044, "grad_norm": 0.855364203453064, "learning_rate": 5.370906552069721e-07, "loss": 0.26704959869384765, "memory(GiB)": 33.6, "step": 2035, "token_acc": 0.9188056010965678, "train_speed(iter/s)": 0.121316 }, { "epoch": 2.5569580556644453, "grad_norm": 0.8302978873252869, "learning_rate": 5.22376341685013e-07, "loss": 0.25648543834686277, "memory(GiB)": 33.6, "step": 2040, "token_acc": 0.923139132403843, "train_speed(iter/s)": 0.121378 }, { "epoch": 2.5569580556644453, "eval_loss": 0.35207509994506836, "eval_runtime": 29.5872, "eval_samples_per_second": 17.406, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8838083661664851, "step": 2040 }, { "epoch": 2.5632301058408467, "grad_norm": 0.7551521062850952, "learning_rate": 5.07855281333881e-07, "loss": 0.25759091377258303, "memory(GiB)": 33.6, "step": 2045, "token_acc": 0.8994057400542669, "train_speed(iter/s)": 0.121167 }, { "epoch": 2.569502156017248, "grad_norm": 0.7979341745376587, "learning_rate": 4.935281008790843e-07, "loss": 0.2503528594970703, "memory(GiB)": 33.6, "step": 2050, "token_acc": 0.9201623815967523, "train_speed(iter/s)": 0.121241 }, { "epoch": 2.5757742061936497, "grad_norm": 0.7783675193786621, "learning_rate": 4.793954186783195e-07, "loss": 0.26355133056640623, "memory(GiB)": 33.6, "step": 2055, "token_acc": 0.9151320361362058, "train_speed(iter/s)": 0.121321 }, { "epoch": 2.582046256370051, "grad_norm": 0.8197569847106934, "learning_rate": 4.6545784469478386e-07, "loss": 0.24934740066528321, "memory(GiB)": 33.6, "step": 2060, "token_acc": 0.9148257180318136, "train_speed(iter/s)": 0.121394 }, { "epoch": 2.582046256370051, "eval_loss": 0.35226860642433167, "eval_runtime": 29.6044, "eval_samples_per_second": 17.396, "eval_steps_per_second": 4.357, "eval_token_acc": 0.883938301617906, "step": 2060 }, { "epoch": 2.5883183065464523, "grad_norm": 0.7964560389518738, "learning_rate": 4.5171598047085153e-07, "loss": 0.26240172386169436, "memory(GiB)": 33.6, "step": 2065, "token_acc": 0.9007787850191121, "train_speed(iter/s)": 0.12119 }, { "epoch": 2.5945903567228537, "grad_norm": 0.8210738301277161, "learning_rate": 4.381704191021119e-07, "loss": 0.26297893524169924, "memory(GiB)": 33.6, "step": 2070, "token_acc": 0.9183645562696567, "train_speed(iter/s)": 0.121265 }, { "epoch": 2.600862406899255, "grad_norm": 0.8720937371253967, "learning_rate": 4.248217452117653e-07, "loss": 0.25324339866638185, "memory(GiB)": 33.6, "step": 2075, "token_acc": 0.9215979250655465, "train_speed(iter/s)": 0.121329 }, { "epoch": 2.6071344570756567, "grad_norm": 0.8678158521652222, "learning_rate": 4.1167053492540023e-07, "loss": 0.26408936977386477, "memory(GiB)": 33.6, "step": 2080, "token_acc": 0.9133987390087365, "train_speed(iter/s)": 0.121395 }, { "epoch": 2.6071344570756567, "eval_loss": 0.3519818186759949, "eval_runtime": 29.602, "eval_samples_per_second": 17.397, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8838083661664851, "step": 2080 }, { "epoch": 2.613406507252058, "grad_norm": 0.8411008715629578, "learning_rate": 3.987173558461199e-07, "loss": 0.25940699577331544, "memory(GiB)": 33.6, "step": 2085, "token_acc": 0.8995980691159541, "train_speed(iter/s)": 0.121193 }, { "epoch": 2.6196785574284593, "grad_norm": 0.8319599032402039, "learning_rate": 3.8596276703004974e-07, "loss": 0.25266613960266116, "memory(GiB)": 33.6, "step": 2090, "token_acc": 0.9181341472959224, "train_speed(iter/s)": 0.121258 }, { "epoch": 2.625950607604861, "grad_norm": 0.7988993525505066, "learning_rate": 3.7340731896220393e-07, "loss": 0.25626089572906496, "memory(GiB)": 33.6, "step": 2095, "token_acc": 0.9169616764475279, "train_speed(iter/s)": 0.121326 }, { "epoch": 2.6322226577812624, "grad_norm": 0.9199852347373962, "learning_rate": 3.6105155353273305e-07, "loss": 0.25374295711517336, "memory(GiB)": 33.6, "step": 2100, "token_acc": 0.9187081757346524, "train_speed(iter/s)": 0.121375 }, { "epoch": 2.6322226577812624, "eval_loss": 0.35189294815063477, "eval_runtime": 29.6062, "eval_samples_per_second": 17.395, "eval_steps_per_second": 4.357, "eval_token_acc": 0.8839299186855563, "step": 2100 }, { "epoch": 2.6384947079576637, "grad_norm": 0.8447644114494324, "learning_rate": 3.488960040135303e-07, "loss": 0.24541032314300537, "memory(GiB)": 33.6, "step": 2105, "token_acc": 0.9017500930900579, "train_speed(iter/s)": 0.12117 }, { "epoch": 2.644766758134065, "grad_norm": 0.7984169125556946, "learning_rate": 3.369411950352175e-07, "loss": 0.23687467575073243, "memory(GiB)": 33.6, "step": 2110, "token_acc": 0.9224370308107012, "train_speed(iter/s)": 0.121237 }, { "epoch": 2.6510388083104663, "grad_norm": 0.7522621154785156, "learning_rate": 3.251876425645051e-07, "loss": 0.2515209197998047, "memory(GiB)": 33.6, "step": 2115, "token_acc": 0.9186267773138115, "train_speed(iter/s)": 0.121299 }, { "epoch": 2.657310858486868, "grad_norm": 0.9188127517700195, "learning_rate": 3.136358538819162e-07, "loss": 0.2667581081390381, "memory(GiB)": 33.6, "step": 2120, "token_acc": 0.9170892494929006, "train_speed(iter/s)": 0.121378 }, { "epoch": 2.657310858486868, "eval_loss": 0.3520144820213318, "eval_runtime": 29.6961, "eval_samples_per_second": 17.342, "eval_steps_per_second": 4.344, "eval_token_acc": 0.8841101517310755, "step": 2120 }, { "epoch": 2.6635829086632694, "grad_norm": 0.8419914841651917, "learning_rate": 3.0228632755990197e-07, "loss": 0.252849817276001, "memory(GiB)": 33.6, "step": 2125, "token_acc": 0.9014873358640861, "train_speed(iter/s)": 0.121179 }, { "epoch": 2.6698549588396707, "grad_norm": 0.8139102458953857, "learning_rate": 2.911395534413147e-07, "loss": 0.25453083515167235, "memory(GiB)": 33.6, "step": 2130, "token_acc": 0.9150135319885048, "train_speed(iter/s)": 0.121246 }, { "epoch": 2.676127009016072, "grad_norm": 0.8062320351600647, "learning_rate": 2.8019601261827123e-07, "loss": 0.24791936874389647, "memory(GiB)": 33.6, "step": 2135, "token_acc": 0.9251222952492422, "train_speed(iter/s)": 0.121311 }, { "epoch": 2.6823990591924733, "grad_norm": 0.8251351714134216, "learning_rate": 2.694561774113863e-07, "loss": 0.2614239931106567, "memory(GiB)": 33.6, "step": 2140, "token_acc": 0.9208586832863431, "train_speed(iter/s)": 0.121385 }, { "epoch": 2.6823990591924733, "eval_loss": 0.3521276116371155, "eval_runtime": 29.7027, "eval_samples_per_second": 17.339, "eval_steps_per_second": 4.343, "eval_token_acc": 0.8841185346634253, "step": 2140 }, { "epoch": 2.688671109368875, "grad_norm": 0.9006824493408203, "learning_rate": 2.5892051134939256e-07, "loss": 0.24248223304748534, "memory(GiB)": 33.6, "step": 2145, "token_acc": 0.9014429392662344, "train_speed(iter/s)": 0.12118 }, { "epoch": 2.6949431595452764, "grad_norm": 0.9092527627944946, "learning_rate": 2.485894691491253e-07, "loss": 0.2548917293548584, "memory(GiB)": 33.6, "step": 2150, "token_acc": 0.9183499893684882, "train_speed(iter/s)": 0.12125 }, { "epoch": 2.7012152097216777, "grad_norm": 0.8024119734764099, "learning_rate": 2.384634966959076e-07, "loss": 0.252050518989563, "memory(GiB)": 33.6, "step": 2155, "token_acc": 0.9186300234774203, "train_speed(iter/s)": 0.121319 }, { "epoch": 2.707487259898079, "grad_norm": 0.857513964176178, "learning_rate": 2.2854303102429808e-07, "loss": 0.24875540733337403, "memory(GiB)": 33.6, "step": 2160, "token_acc": 0.9268544278078209, "train_speed(iter/s)": 0.121379 }, { "epoch": 2.707487259898079, "eval_loss": 0.35207265615463257, "eval_runtime": 29.6934, "eval_samples_per_second": 17.344, "eval_steps_per_second": 4.344, "eval_token_acc": 0.8840137480090535, "step": 2160 }, { "epoch": 2.7137593100744803, "grad_norm": 0.8570399284362793, "learning_rate": 2.1882850029923463e-07, "loss": 0.2508984088897705, "memory(GiB)": 33.6, "step": 2165, "token_acc": 0.9019046776207754, "train_speed(iter/s)": 0.121193 }, { "epoch": 2.720031360250882, "grad_norm": 0.8015756607055664, "learning_rate": 2.093203237975483e-07, "loss": 0.2542969465255737, "memory(GiB)": 33.6, "step": 2170, "token_acc": 0.9189912422964645, "train_speed(iter/s)": 0.121256 }, { "epoch": 2.7263034104272834, "grad_norm": 0.7729219794273376, "learning_rate": 2.0001891188987265e-07, "loss": 0.247190523147583, "memory(GiB)": 33.6, "step": 2175, "token_acc": 0.9275373459399903, "train_speed(iter/s)": 0.121308 }, { "epoch": 2.7325754606036847, "grad_norm": 0.7974966764450073, "learning_rate": 1.9092466602293247e-07, "loss": 0.25701584815979006, "memory(GiB)": 33.6, "step": 2180, "token_acc": 0.9192098092643052, "train_speed(iter/s)": 0.121375 }, { "epoch": 2.7325754606036847, "eval_loss": 0.35181429982185364, "eval_runtime": 29.5889, "eval_samples_per_second": 17.405, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8839634504149552, "step": 2180 }, { "epoch": 2.7388475107800865, "grad_norm": 0.859959065914154, "learning_rate": 1.8203797870221197e-07, "loss": 0.24590330123901366, "memory(GiB)": 33.6, "step": 2185, "token_acc": 0.8985490934392011, "train_speed(iter/s)": 0.121162 }, { "epoch": 2.745119560956488, "grad_norm": 0.7980506420135498, "learning_rate": 1.7335923347502003e-07, "loss": 0.2525080442428589, "memory(GiB)": 33.6, "step": 2190, "token_acc": 0.9187770535847555, "train_speed(iter/s)": 0.121227 }, { "epoch": 2.751391611132889, "grad_norm": 0.9112074971199036, "learning_rate": 1.6488880491393467e-07, "loss": 0.2503790855407715, "memory(GiB)": 33.6, "step": 2195, "token_acc": 0.9191731649956678, "train_speed(iter/s)": 0.121291 }, { "epoch": 2.7576636613092904, "grad_norm": 0.8652266263961792, "learning_rate": 1.5662705860063465e-07, "loss": 0.2473994016647339, "memory(GiB)": 33.6, "step": 2200, "token_acc": 0.9257844886063864, "train_speed(iter/s)": 0.121351 }, { "epoch": 2.7576636613092904, "eval_loss": 0.3517080843448639, "eval_runtime": 29.6123, "eval_samples_per_second": 17.391, "eval_steps_per_second": 4.356, "eval_token_acc": 0.8841855981222232, "step": 2200 }, { "epoch": 2.7639357114856917, "grad_norm": 0.7372477054595947, "learning_rate": 1.485743511101234e-07, "loss": 0.2570472717285156, "memory(GiB)": 33.6, "step": 2205, "token_acc": 0.9023636113783152, "train_speed(iter/s)": 0.121155 }, { "epoch": 2.7702077616620935, "grad_norm": 0.8067111968994141, "learning_rate": 1.4073102999534017e-07, "loss": 0.25244066715240476, "memory(GiB)": 33.6, "step": 2210, "token_acc": 0.9208304949204751, "train_speed(iter/s)": 0.121222 }, { "epoch": 2.776479811838495, "grad_norm": 0.840241551399231, "learning_rate": 1.3309743377215468e-07, "loss": 0.24581263065338135, "memory(GiB)": 33.6, "step": 2215, "token_acc": 0.9247633420222253, "train_speed(iter/s)": 0.121277 }, { "epoch": 2.782751862014896, "grad_norm": 0.8023516535758972, "learning_rate": 1.2567389190476287e-07, "loss": 0.26343064308166503, "memory(GiB)": 33.6, "step": 2220, "token_acc": 0.9156072196662401, "train_speed(iter/s)": 0.121348 }, { "epoch": 2.782751862014896, "eval_loss": 0.3517560660839081, "eval_runtime": 29.6264, "eval_samples_per_second": 17.383, "eval_steps_per_second": 4.354, "eval_token_acc": 0.8841227261296002, "step": 2220 }, { "epoch": 2.7890239121912974, "grad_norm": 0.8527393937110901, "learning_rate": 1.1846072479146431e-07, "loss": 0.23856933116912843, "memory(GiB)": 33.6, "step": 2225, "token_acc": 0.9016640429579982, "train_speed(iter/s)": 0.121138 }, { "epoch": 2.7952959623676987, "grad_norm": 0.796775221824646, "learning_rate": 1.114582437508327e-07, "loss": 0.2579585790634155, "memory(GiB)": 33.6, "step": 2230, "token_acc": 0.9188932252576804, "train_speed(iter/s)": 0.121209 }, { "epoch": 2.8015680125441005, "grad_norm": 0.8311977982521057, "learning_rate": 1.0466675100828383e-07, "loss": 0.2482445240020752, "memory(GiB)": 33.6, "step": 2235, "token_acc": 0.9148045991260421, "train_speed(iter/s)": 0.121265 }, { "epoch": 2.807840062720502, "grad_norm": 0.7720575928688049, "learning_rate": 9.808653968302607e-08, "loss": 0.2451555013656616, "memory(GiB)": 33.6, "step": 2240, "token_acc": 0.9182748574416694, "train_speed(iter/s)": 0.121331 }, { "epoch": 2.807840062720502, "eval_loss": 0.35162004828453064, "eval_runtime": 29.7076, "eval_samples_per_second": 17.336, "eval_steps_per_second": 4.342, "eval_token_acc": 0.88396764188113, "step": 2240 }, { "epoch": 2.814112112896903, "grad_norm": 0.7808555364608765, "learning_rate": 9.17178937754143e-08, "loss": 0.25096561908721926, "memory(GiB)": 33.6, "step": 2245, "token_acc": 0.9010792691911006, "train_speed(iter/s)": 0.121135 }, { "epoch": 2.820384163073305, "grad_norm": 0.7035224437713623, "learning_rate": 8.556108815468756e-08, "loss": 0.24024505615234376, "memory(GiB)": 33.6, "step": 2250, "token_acc": 0.9196050096339113, "train_speed(iter/s)": 0.121197 }, { "epoch": 2.8266562132497057, "grad_norm": 0.8584672808647156, "learning_rate": 7.961638854711296e-08, "loss": 0.2527903079986572, "memory(GiB)": 33.6, "step": 2255, "token_acc": 0.9181211708645337, "train_speed(iter/s)": 0.121251 }, { "epoch": 2.8329282634261075, "grad_norm": 0.8620744943618774, "learning_rate": 7.388405152450706e-08, "loss": 0.25180099010467527, "memory(GiB)": 33.6, "step": 2260, "token_acc": 0.9159815615644157, "train_speed(iter/s)": 0.121319 }, { "epoch": 2.8329282634261075, "eval_loss": 0.35167694091796875, "eval_runtime": 29.7193, "eval_samples_per_second": 17.329, "eval_steps_per_second": 4.341, "eval_token_acc": 0.8842610445133707, "step": 2260 }, { "epoch": 2.839200313602509, "grad_norm": 0.8185881972312927, "learning_rate": 6.836432449317255e-08, "loss": 0.2555046081542969, "memory(GiB)": 33.6, "step": 2265, "token_acc": 0.9007627158298629, "train_speed(iter/s)": 0.121123 }, { "epoch": 2.84547236377891, "grad_norm": 0.8594284653663635, "learning_rate": 6.305744568321281e-08, "loss": 0.243331241607666, "memory(GiB)": 33.6, "step": 2270, "token_acc": 0.919937106918239, "train_speed(iter/s)": 0.121187 }, { "epoch": 2.851744413955312, "grad_norm": 0.7847645282745361, "learning_rate": 5.7963644138254175e-08, "loss": 0.25553407669067385, "memory(GiB)": 33.6, "step": 2275, "token_acc": 0.9147093923774218, "train_speed(iter/s)": 0.12125 }, { "epoch": 2.858016464131713, "grad_norm": 0.7746605277061462, "learning_rate": 5.308313970555812e-08, "loss": 0.256377649307251, "memory(GiB)": 33.6, "step": 2280, "token_acc": 0.9144332047137345, "train_speed(iter/s)": 0.121321 }, { "epoch": 2.858016464131713, "eval_loss": 0.3516838848590851, "eval_runtime": 29.6227, "eval_samples_per_second": 17.385, "eval_steps_per_second": 4.355, "eval_token_acc": 0.8841688322575236, "step": 2280 }, { "epoch": 2.8642885143081145, "grad_norm": 0.8503162860870361, "learning_rate": 4.841614302653341e-08, "loss": 0.24309797286987306, "memory(GiB)": 33.6, "step": 2285, "token_acc": 0.901774899142082, "train_speed(iter/s)": 0.12112 }, { "epoch": 2.870560564484516, "grad_norm": 0.8309412002563477, "learning_rate": 4.396285552764557e-08, "loss": 0.25277886390686033, "memory(GiB)": 33.6, "step": 2290, "token_acc": 0.918742246397557, "train_speed(iter/s)": 0.121195 }, { "epoch": 2.876832614660917, "grad_norm": 0.8796764612197876, "learning_rate": 3.9723469411723226e-08, "loss": 0.2501843929290771, "memory(GiB)": 33.6, "step": 2295, "token_acc": 0.9242152837104197, "train_speed(iter/s)": 0.12126 }, { "epoch": 2.883104664837319, "grad_norm": 0.764290452003479, "learning_rate": 3.5698167649660384e-08, "loss": 0.23828110694885254, "memory(GiB)": 33.6, "step": 2300, "token_acc": 0.9215037112190955, "train_speed(iter/s)": 0.121321 }, { "epoch": 2.883104664837319, "eval_loss": 0.3518492877483368, "eval_runtime": 29.5991, "eval_samples_per_second": 17.399, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8841688322575236, "step": 2300 }, { "epoch": 2.88937671501372, "grad_norm": 0.893683671951294, "learning_rate": 3.188712397252325e-08, "loss": 0.25220484733581544, "memory(GiB)": 33.6, "step": 2305, "token_acc": 0.900192213902516, "train_speed(iter/s)": 0.121135 }, { "epoch": 2.8956487651901215, "grad_norm": 0.7913616299629211, "learning_rate": 2.8290502864049553e-08, "loss": 0.23975701332092286, "memory(GiB)": 33.6, "step": 2310, "token_acc": 0.9262284488329413, "train_speed(iter/s)": 0.121196 }, { "epoch": 2.901920815366523, "grad_norm": 0.8090708255767822, "learning_rate": 2.4908459553549257e-08, "loss": 0.25728065967559816, "memory(GiB)": 33.6, "step": 2315, "token_acc": 0.921334886001471, "train_speed(iter/s)": 0.121264 }, { "epoch": 2.908192865542924, "grad_norm": 0.8814035058021545, "learning_rate": 2.174114000920713e-08, "loss": 0.25480012893676757, "memory(GiB)": 33.6, "step": 2320, "token_acc": 0.9189424911420006, "train_speed(iter/s)": 0.121328 }, { "epoch": 2.908192865542924, "eval_loss": 0.351841539144516, "eval_runtime": 29.651, "eval_samples_per_second": 17.369, "eval_steps_per_second": 4.351, "eval_token_acc": 0.8842400871824965, "step": 2320 }, { "epoch": 2.914464915719326, "grad_norm": 0.798668622970581, "learning_rate": 1.878868093177999e-08, "loss": 0.24860291481018065, "memory(GiB)": 33.6, "step": 2325, "token_acc": 0.9021842640764242, "train_speed(iter/s)": 0.121153 }, { "epoch": 2.920736965895727, "grad_norm": 0.8112826347351074, "learning_rate": 1.6051209748698116e-08, "loss": 0.25038626194000246, "memory(GiB)": 33.6, "step": 2330, "token_acc": 0.9164307381193124, "train_speed(iter/s)": 0.121208 }, { "epoch": 2.9270090160721285, "grad_norm": 0.8192731738090515, "learning_rate": 1.3528844608566848e-08, "loss": 0.253769063949585, "memory(GiB)": 33.6, "step": 2335, "token_acc": 0.9176401557582546, "train_speed(iter/s)": 0.121264 }, { "epoch": 2.9332810662485302, "grad_norm": 0.8607764840126038, "learning_rate": 1.1221694376064018e-08, "loss": 0.26034162044525144, "memory(GiB)": 33.6, "step": 2340, "token_acc": 0.9166121154136664, "train_speed(iter/s)": 0.12133 }, { "epoch": 2.9332810662485302, "eval_loss": 0.3517746925354004, "eval_runtime": 29.5861, "eval_samples_per_second": 17.407, "eval_steps_per_second": 4.36, "eval_token_acc": 0.8842400871824965, "step": 2340 }, { "epoch": 2.9395531164249316, "grad_norm": 0.8527613878250122, "learning_rate": 9.129858627244802e-09, "loss": 0.2517427921295166, "memory(GiB)": 33.6, "step": 2345, "token_acc": 0.9014389514175809, "train_speed(iter/s)": 0.121151 }, { "epoch": 2.945825166601333, "grad_norm": 0.9257199764251709, "learning_rate": 7.25342764524184e-09, "loss": 0.24849720001220704, "memory(GiB)": 33.6, "step": 2350, "token_acc": 0.9203306319162112, "train_speed(iter/s)": 0.121212 }, { "epoch": 2.952097216777734, "grad_norm": 0.8115556240081787, "learning_rate": 5.592482416369449e-09, "loss": 0.26071810722351074, "memory(GiB)": 33.6, "step": 2355, "token_acc": 0.9158378541289933, "train_speed(iter/s)": 0.121275 }, { "epoch": 2.9583692669541355, "grad_norm": 0.7887356281280518, "learning_rate": 4.147094626628656e-09, "loss": 0.2557636260986328, "memory(GiB)": 33.6, "step": 2360, "token_acc": 0.9162988480999031, "train_speed(iter/s)": 0.121332 }, { "epoch": 2.9583692669541355, "eval_loss": 0.3517840504646301, "eval_runtime": 29.6869, "eval_samples_per_second": 17.348, "eval_steps_per_second": 4.345, "eval_token_acc": 0.8840933858663761, "step": 2360 }, { "epoch": 2.9646413171305372, "grad_norm": 0.7621276378631592, "learning_rate": 2.9173266586113303e-09, "loss": 0.2530327320098877, "memory(GiB)": 33.6, "step": 2365, "token_acc": 0.9022793086399823, "train_speed(iter/s)": 0.121129 }, { "epoch": 2.9709133673069386, "grad_norm": 0.8078117370605469, "learning_rate": 1.9032315888106724e-09, "loss": 0.2542110919952393, "memory(GiB)": 33.6, "step": 2370, "token_acc": 0.919013704543002, "train_speed(iter/s)": 0.121185 }, { "epoch": 2.97718541748334, "grad_norm": 0.847828209400177, "learning_rate": 1.1048531853286027e-09, "loss": 0.26172566413879395, "memory(GiB)": 33.6, "step": 2375, "token_acc": 0.9156356655290102, "train_speed(iter/s)": 0.12124 }, { "epoch": 2.983457467659741, "grad_norm": 0.7999985814094543, "learning_rate": 5.222259059867174e-10, "loss": 0.2618349552154541, "memory(GiB)": 33.6, "step": 2380, "token_acc": 0.9119521576775136, "train_speed(iter/s)": 0.121301 }, { "epoch": 2.983457467659741, "eval_loss": 0.35182681679725647, "eval_runtime": 29.5769, "eval_samples_per_second": 17.412, "eval_steps_per_second": 4.362, "eval_token_acc": 0.8840472797384525, "step": 2380 }, { "epoch": 2.9897295178361425, "grad_norm": 0.864345371723175, "learning_rate": 1.5537489683914442e-10, "loss": 0.25390353202819826, "memory(GiB)": 33.6, "step": 2385, "token_acc": 0.9013812207837177, "train_speed(iter/s)": 0.121125 }, { "epoch": 2.9960015680125442, "grad_norm": 0.7659916281700134, "learning_rate": 4.315991088965632e-12, "loss": 0.25308642387390134, "memory(GiB)": 33.6, "step": 2390, "token_acc": 0.9178361065117668, "train_speed(iter/s)": 0.121184 }, { "epoch": 2.9972559780478245, "eval_loss": 0.35187748074531555, "eval_runtime": 29.5575, "eval_samples_per_second": 17.424, "eval_steps_per_second": 4.364, "eval_token_acc": 0.8842107469192724, "step": 2391 } ], "logging_steps": 5, "max_steps": 2391, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.893313222448775e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }