{ "best_global_step": 1560, "best_metric": 0.31897813, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v53-20250506-203614/checkpoint-1560", "epoch": 2.9972559780478245, "eval_steps": 20, "global_step": 2391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012544100352802822, "grad_norm": 3.00976824760437, "learning_rate": 9.999995684008912e-06, "loss": 0.6037987470626831, "memory(GiB)": 28.87, "step": 1, "token_acc": 0.8419128400116993, "train_speed(iter/s)": 0.06443 }, { "epoch": 0.006272050176401411, "grad_norm": 1.8649096488952637, "learning_rate": 9.999892100595329e-06, "loss": 0.5593533515930176, "memory(GiB)": 28.87, "step": 5, "token_acc": 0.830772646536412, "train_speed(iter/s)": 0.125347 }, { "epoch": 0.012544100352802822, "grad_norm": 1.3044856786727905, "learning_rate": 9.999568407038233e-06, "loss": 0.49286112785339353, "memory(GiB)": 28.87, "step": 10, "token_acc": 0.8464868234234858, "train_speed(iter/s)": 0.140401 }, { "epoch": 0.018816150529204233, "grad_norm": 1.1055748462677002, "learning_rate": 9.999028933299243e-06, "loss": 0.45445590019226073, "memory(GiB)": 28.87, "step": 15, "token_acc": 0.852595056694209, "train_speed(iter/s)": 0.14752 }, { "epoch": 0.025088200705605645, "grad_norm": 1.1405771970748901, "learning_rate": 9.99827370266192e-06, "loss": 0.42907133102416994, "memory(GiB)": 28.87, "step": 20, "token_acc": 0.8665471789701654, "train_speed(iter/s)": 0.148788 }, { "epoch": 0.025088200705605645, "eval_loss": 0.44562071561813354, "eval_runtime": 29.4389, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8629940758096486, "step": 20 }, { "epoch": 0.03136025088200706, "grad_norm": 1.0768901109695435, "learning_rate": 9.99730274772184e-06, "loss": 0.44150071144104003, "memory(GiB)": 28.87, "step": 25, "token_acc": 0.8656667620254199, "train_speed(iter/s)": 0.121315 }, { "epoch": 0.037632301058408466, "grad_norm": 1.097219705581665, "learning_rate": 9.996116110385186e-06, "loss": 0.43617844581604004, "memory(GiB)": 28.87, "step": 30, "token_acc": 0.8606751454471655, "train_speed(iter/s)": 0.127058 }, { "epoch": 0.04390435123480988, "grad_norm": 1.0356194972991943, "learning_rate": 9.99471384186694e-06, "loss": 0.42529120445251467, "memory(GiB)": 28.87, "step": 35, "token_acc": 0.8749200767263428, "train_speed(iter/s)": 0.130997 }, { "epoch": 0.05017640141121129, "grad_norm": 1.0370515584945679, "learning_rate": 9.99309600268868e-06, "loss": 0.3916675567626953, "memory(GiB)": 28.87, "step": 40, "token_acc": 0.8746708984022794, "train_speed(iter/s)": 0.133319 }, { "epoch": 0.05017640141121129, "eval_loss": 0.4175032675266266, "eval_runtime": 29.2644, "eval_samples_per_second": 17.598, "eval_steps_per_second": 4.408, "eval_token_acc": 0.869223636797717, "step": 40 }, { "epoch": 0.0564484515876127, "grad_norm": 1.0402519702911377, "learning_rate": 9.991262662675962e-06, "loss": 0.4160133361816406, "memory(GiB)": 28.87, "step": 45, "token_acc": 0.8742820425397991, "train_speed(iter/s)": 0.121167 }, { "epoch": 0.06272050176401411, "grad_norm": 1.0216219425201416, "learning_rate": 9.9892139009553e-06, "loss": 0.4001720428466797, "memory(GiB)": 28.87, "step": 50, "token_acc": 0.8626985512305813, "train_speed(iter/s)": 0.124367 }, { "epoch": 0.06899255194041552, "grad_norm": 1.0897234678268433, "learning_rate": 9.986949805950763e-06, "loss": 0.4109466075897217, "memory(GiB)": 28.87, "step": 55, "token_acc": 0.8756929944560443, "train_speed(iter/s)": 0.126442 }, { "epoch": 0.07526460211681693, "grad_norm": 1.0888633728027344, "learning_rate": 9.984470475380154e-06, "loss": 0.4020622730255127, "memory(GiB)": 28.87, "step": 60, "token_acc": 0.880744833231021, "train_speed(iter/s)": 0.128837 }, { "epoch": 0.07526460211681693, "eval_loss": 0.4036850929260254, "eval_runtime": 29.3294, "eval_samples_per_second": 17.559, "eval_steps_per_second": 4.398, "eval_token_acc": 0.8717332282118051, "step": 60 }, { "epoch": 0.08153665229321834, "grad_norm": 0.957584023475647, "learning_rate": 9.981776016250789e-06, "loss": 0.39967339038848876, "memory(GiB)": 28.87, "step": 65, "token_acc": 0.8750420639371845, "train_speed(iter/s)": 0.121004 }, { "epoch": 0.08780870246961976, "grad_norm": 0.9902129769325256, "learning_rate": 9.97886654485488e-06, "loss": 0.3779956817626953, "memory(GiB)": 28.87, "step": 70, "token_acc": 0.8874758475320569, "train_speed(iter/s)": 0.12309 }, { "epoch": 0.09408075264602117, "grad_norm": 0.9632574915885925, "learning_rate": 9.975742186764526e-06, "loss": 0.3755610704421997, "memory(GiB)": 28.87, "step": 75, "token_acc": 0.8887904599659284, "train_speed(iter/s)": 0.125047 }, { "epoch": 0.10035280282242258, "grad_norm": 1.083781123161316, "learning_rate": 9.972403076826272e-06, "loss": 0.3894859790802002, "memory(GiB)": 28.87, "step": 80, "token_acc": 0.8634405980822363, "train_speed(iter/s)": 0.126695 }, { "epoch": 0.10035280282242258, "eval_loss": 0.3936529755592346, "eval_runtime": 29.4382, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8746037121965032, "step": 80 }, { "epoch": 0.10662485299882399, "grad_norm": 1.0563101768493652, "learning_rate": 9.96884935915531e-06, "loss": 0.36060025691986086, "memory(GiB)": 28.87, "step": 85, "token_acc": 0.8797673608319262, "train_speed(iter/s)": 0.120902 }, { "epoch": 0.1128969031752254, "grad_norm": 0.9367106556892395, "learning_rate": 9.965081187129248e-06, "loss": 0.3710139274597168, "memory(GiB)": 28.87, "step": 90, "token_acc": 0.8800344362928582, "train_speed(iter/s)": 0.122022 }, { "epoch": 0.11916895335162682, "grad_norm": 1.0503040552139282, "learning_rate": 9.961098723381495e-06, "loss": 0.3768899917602539, "memory(GiB)": 28.87, "step": 95, "token_acc": 0.8798389919495975, "train_speed(iter/s)": 0.123839 }, { "epoch": 0.12544100352802823, "grad_norm": 0.9691568613052368, "learning_rate": 9.956902139794236e-06, "loss": 0.40593662261962893, "memory(GiB)": 28.87, "step": 100, "token_acc": 0.8564825788101446, "train_speed(iter/s)": 0.125265 }, { "epoch": 0.12544100352802823, "eval_loss": 0.3885510563850403, "eval_runtime": 29.3787, "eval_samples_per_second": 17.53, "eval_steps_per_second": 4.391, "eval_token_acc": 0.8755364806866953, "step": 100 }, { "epoch": 0.13171305370442962, "grad_norm": 0.912711501121521, "learning_rate": 9.95249161749102e-06, "loss": 0.3844183921813965, "memory(GiB)": 28.87, "step": 105, "token_acc": 0.8794101344470655, "train_speed(iter/s)": 0.120949 }, { "epoch": 0.13798510388083104, "grad_norm": 0.9416125416755676, "learning_rate": 9.94786734682894e-06, "loss": 0.37710275650024416, "memory(GiB)": 28.87, "step": 110, "token_acc": 0.8741283259963379, "train_speed(iter/s)": 0.122239 }, { "epoch": 0.14425715405723247, "grad_norm": 0.9355840086936951, "learning_rate": 9.943029527390415e-06, "loss": 0.3904699802398682, "memory(GiB)": 28.87, "step": 115, "token_acc": 0.8733137387896601, "train_speed(iter/s)": 0.123713 }, { "epoch": 0.15052920423363386, "grad_norm": 0.8927087783813477, "learning_rate": 9.93797836797458e-06, "loss": 0.3807236909866333, "memory(GiB)": 28.87, "step": 120, "token_acc": 0.8784389647442429, "train_speed(iter/s)": 0.124943 }, { "epoch": 0.15052920423363386, "eval_loss": 0.3819100260734558, "eval_runtime": 29.4329, "eval_samples_per_second": 17.497, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8767468588465873, "step": 120 }, { "epoch": 0.15680125441003528, "grad_norm": 1.0235486030578613, "learning_rate": 9.932714086588276e-06, "loss": 0.3710296630859375, "memory(GiB)": 28.87, "step": 125, "token_acc": 0.8803818429768167, "train_speed(iter/s)": 0.121182 }, { "epoch": 0.16307330458643668, "grad_norm": 0.949593722820282, "learning_rate": 9.92723691043663e-06, "loss": 0.3603027820587158, "memory(GiB)": 28.87, "step": 130, "token_acc": 0.8798780487804878, "train_speed(iter/s)": 0.122278 }, { "epoch": 0.1693453547628381, "grad_norm": 0.9754647612571716, "learning_rate": 9.921547075913261e-06, "loss": 0.3779932737350464, "memory(GiB)": 28.87, "step": 135, "token_acc": 0.8775437975579543, "train_speed(iter/s)": 0.123298 }, { "epoch": 0.17561740493923952, "grad_norm": 0.8868964910507202, "learning_rate": 9.915644828590074e-06, "loss": 0.38963828086853025, "memory(GiB)": 28.87, "step": 140, "token_acc": 0.8774046938055872, "train_speed(iter/s)": 0.124471 }, { "epoch": 0.17561740493923952, "eval_loss": 0.3778168857097626, "eval_runtime": 29.4005, "eval_samples_per_second": 17.517, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8770189163228933, "step": 140 }, { "epoch": 0.18188945511564092, "grad_norm": 0.9047612547874451, "learning_rate": 9.909530423206657e-06, "loss": 0.38094215393066405, "memory(GiB)": 28.87, "step": 145, "token_acc": 0.8818703968341212, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.18816150529204234, "grad_norm": 0.8378849625587463, "learning_rate": 9.903204123659288e-06, "loss": 0.3542912483215332, "memory(GiB)": 28.87, "step": 150, "token_acc": 0.8816124572509642, "train_speed(iter/s)": 0.122126 }, { "epoch": 0.19443355546844374, "grad_norm": 0.9955030679702759, "learning_rate": 9.896666202989553e-06, "loss": 0.37951111793518066, "memory(GiB)": 28.87, "step": 155, "token_acc": 0.8759518001034967, "train_speed(iter/s)": 0.123261 }, { "epoch": 0.20070560564484516, "grad_norm": 0.9958081841468811, "learning_rate": 9.889916943372549e-06, "loss": 0.3788171291351318, "memory(GiB)": 28.87, "step": 160, "token_acc": 0.8731931436649585, "train_speed(iter/s)": 0.124288 }, { "epoch": 0.20070560564484516, "eval_loss": 0.3737770915031433, "eval_runtime": 29.3525, "eval_samples_per_second": 17.545, "eval_steps_per_second": 4.395, "eval_token_acc": 0.8785124563458795, "step": 160 }, { "epoch": 0.20697765582124658, "grad_norm": 0.8732254505157471, "learning_rate": 9.882956636104714e-06, "loss": 0.36996870040893554, "memory(GiB)": 28.87, "step": 165, "token_acc": 0.8824279750554038, "train_speed(iter/s)": 0.121446 }, { "epoch": 0.21324970599764798, "grad_norm": 0.9481439590454102, "learning_rate": 9.875785581591253e-06, "loss": 0.36848177909851076, "memory(GiB)": 28.87, "step": 170, "token_acc": 0.8760355937404112, "train_speed(iter/s)": 0.122195 }, { "epoch": 0.2195217561740494, "grad_norm": 1.0382639169692993, "learning_rate": 9.868404089333171e-06, "loss": 0.3516302347183228, "memory(GiB)": 28.87, "step": 175, "token_acc": 0.8920562652894808, "train_speed(iter/s)": 0.123108 }, { "epoch": 0.2257938063504508, "grad_norm": 0.973171591758728, "learning_rate": 9.860812477913915e-06, "loss": 0.36341152191162107, "memory(GiB)": 28.87, "step": 180, "token_acc": 0.8771131832309909, "train_speed(iter/s)": 0.123961 }, { "epoch": 0.2257938063504508, "eval_loss": 0.37089061737060547, "eval_runtime": 29.3996, "eval_samples_per_second": 17.517, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8789066620768534, "step": 180 }, { "epoch": 0.23206585652685222, "grad_norm": 0.8672347068786621, "learning_rate": 9.853011074985628e-06, "loss": 0.35667340755462645, "memory(GiB)": 28.87, "step": 185, "token_acc": 0.8844639006185514, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.23833790670325364, "grad_norm": 1.0073734521865845, "learning_rate": 9.845000217255e-06, "loss": 0.36460084915161134, "memory(GiB)": 28.87, "step": 190, "token_acc": 0.8793251008680768, "train_speed(iter/s)": 0.122381 }, { "epoch": 0.24460995687965503, "grad_norm": 0.859307050704956, "learning_rate": 9.836780250468744e-06, "loss": 0.3675198554992676, "memory(GiB)": 28.87, "step": 195, "token_acc": 0.8756779751143252, "train_speed(iter/s)": 0.123104 }, { "epoch": 0.25088200705605646, "grad_norm": 0.8690401911735535, "learning_rate": 9.82835152939867e-06, "loss": 0.3573744773864746, "memory(GiB)": 28.87, "step": 200, "token_acc": 0.8790942326344848, "train_speed(iter/s)": 0.123813 }, { "epoch": 0.25088200705605646, "eval_loss": 0.367218554019928, "eval_runtime": 29.4154, "eval_samples_per_second": 17.508, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8802725016517775, "step": 200 }, { "epoch": 0.2571540572324579, "grad_norm": 0.9202991127967834, "learning_rate": 9.81971441782637e-06, "loss": 0.3682036161422729, "memory(GiB)": 28.87, "step": 205, "token_acc": 0.8823869363618037, "train_speed(iter/s)": 0.121607 }, { "epoch": 0.26342610740885924, "grad_norm": 1.1039172410964966, "learning_rate": 9.810869288527528e-06, "loss": 0.3587596893310547, "memory(GiB)": 28.87, "step": 210, "token_acc": 0.8775054019819686, "train_speed(iter/s)": 0.122214 }, { "epoch": 0.26969815758526067, "grad_norm": 1.0208649635314941, "learning_rate": 9.801816523255811e-06, "loss": 0.3483666181564331, "memory(GiB)": 28.87, "step": 215, "token_acc": 0.8888964740417108, "train_speed(iter/s)": 0.12288 }, { "epoch": 0.2759702077616621, "grad_norm": 0.9139247536659241, "learning_rate": 9.792556512726419e-06, "loss": 0.363814640045166, "memory(GiB)": 28.87, "step": 220, "token_acc": 0.8944489591798462, "train_speed(iter/s)": 0.123698 }, { "epoch": 0.2759702077616621, "eval_loss": 0.3650023639202118, "eval_runtime": 29.425, "eval_samples_per_second": 17.502, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8810831218873015, "step": 220 }, { "epoch": 0.2822422579380635, "grad_norm": 0.9822434186935425, "learning_rate": 9.783089656599196e-06, "loss": 0.3573582172393799, "memory(GiB)": 28.87, "step": 225, "token_acc": 0.8843033462786615, "train_speed(iter/s)": 0.121775 }, { "epoch": 0.28851430811446493, "grad_norm": 0.933601438999176, "learning_rate": 9.773416363461401e-06, "loss": 0.34882917404174807, "memory(GiB)": 28.87, "step": 230, "token_acc": 0.8913467127081268, "train_speed(iter/s)": 0.122386 }, { "epoch": 0.2947863582908663, "grad_norm": 0.921463668346405, "learning_rate": 9.763537050810064e-06, "loss": 0.357175350189209, "memory(GiB)": 28.87, "step": 235, "token_acc": 0.8878133772142096, "train_speed(iter/s)": 0.122933 }, { "epoch": 0.3010584084672677, "grad_norm": 0.863202691078186, "learning_rate": 9.753452145033961e-06, "loss": 0.36011409759521484, "memory(GiB)": 28.87, "step": 240, "token_acc": 0.8802597806642296, "train_speed(iter/s)": 0.123406 }, { "epoch": 0.3010584084672677, "eval_loss": 0.3631041347980499, "eval_runtime": 29.4014, "eval_samples_per_second": 17.516, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8814828798116696, "step": 240 }, { "epoch": 0.30733045864366915, "grad_norm": 0.9408513307571411, "learning_rate": 9.743162081395227e-06, "loss": 0.3614873647689819, "memory(GiB)": 28.87, "step": 245, "token_acc": 0.8843432165774723, "train_speed(iter/s)": 0.121612 }, { "epoch": 0.31360250882007057, "grad_norm": 0.9375274777412415, "learning_rate": 9.73266730401056e-06, "loss": 0.3697765588760376, "memory(GiB)": 28.87, "step": 250, "token_acc": 0.8756347101174687, "train_speed(iter/s)": 0.12238 }, { "epoch": 0.319874558996472, "grad_norm": 0.9580682516098022, "learning_rate": 9.72196826583205e-06, "loss": 0.3556410312652588, "memory(GiB)": 28.87, "step": 255, "token_acc": 0.8826956988422909, "train_speed(iter/s)": 0.122971 }, { "epoch": 0.32614660917287336, "grad_norm": 0.9287164211273193, "learning_rate": 9.711065428627638e-06, "loss": 0.35933010578155516, "memory(GiB)": 28.87, "step": 260, "token_acc": 0.8740019687192387, "train_speed(iter/s)": 0.123433 }, { "epoch": 0.32614660917287336, "eval_loss": 0.3610526919364929, "eval_runtime": 29.4208, "eval_samples_per_second": 17.505, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8819714728303416, "step": 260 }, { "epoch": 0.3324186593492748, "grad_norm": 0.8553582429885864, "learning_rate": 9.699959262961182e-06, "loss": 0.36598858833312986, "memory(GiB)": 28.87, "step": 265, "token_acc": 0.8825705413797759, "train_speed(iter/s)": 0.121765 }, { "epoch": 0.3386907095256762, "grad_norm": 0.8980154395103455, "learning_rate": 9.688650248172145e-06, "loss": 0.3778824329376221, "memory(GiB)": 28.87, "step": 270, "token_acc": 0.8675095993484079, "train_speed(iter/s)": 0.122329 }, { "epoch": 0.3449627597020776, "grad_norm": 0.8882062435150146, "learning_rate": 9.677138872354916e-06, "loss": 0.35373764038085936, "memory(GiB)": 28.87, "step": 275, "token_acc": 0.8847052060503718, "train_speed(iter/s)": 0.122902 }, { "epoch": 0.35123480987847905, "grad_norm": 1.0147747993469238, "learning_rate": 9.665425632337731e-06, "loss": 0.37800116539001466, "memory(GiB)": 28.87, "step": 280, "token_acc": 0.8738335199701381, "train_speed(iter/s)": 0.123477 }, { "epoch": 0.35123480987847905, "eval_loss": 0.3590083718299866, "eval_runtime": 29.4336, "eval_samples_per_second": 17.497, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8824434092688317, "step": 280 }, { "epoch": 0.3575068600548804, "grad_norm": 0.9124870300292969, "learning_rate": 9.653511033661242e-06, "loss": 0.3612790584564209, "memory(GiB)": 28.87, "step": 285, "token_acc": 0.8852118028240377, "train_speed(iter/s)": 0.121866 }, { "epoch": 0.36377891023128184, "grad_norm": 0.8596437573432922, "learning_rate": 9.641395590556689e-06, "loss": 0.3433859825134277, "memory(GiB)": 28.87, "step": 290, "token_acc": 0.9010951837286989, "train_speed(iter/s)": 0.122379 }, { "epoch": 0.37005096040768326, "grad_norm": 0.9107604622840881, "learning_rate": 9.629079825923712e-06, "loss": 0.3564929962158203, "memory(GiB)": 28.87, "step": 295, "token_acc": 0.8857908060314782, "train_speed(iter/s)": 0.122912 }, { "epoch": 0.3763230105840847, "grad_norm": 0.9795000553131104, "learning_rate": 9.616564271307779e-06, "loss": 0.3729372024536133, "memory(GiB)": 28.87, "step": 300, "token_acc": 0.8755715825867133, "train_speed(iter/s)": 0.123491 }, { "epoch": 0.3763230105840847, "eval_loss": 0.3561505973339081, "eval_runtime": 29.4641, "eval_samples_per_second": 17.479, "eval_steps_per_second": 4.378, "eval_token_acc": 0.8826766013913797, "step": 300 }, { "epoch": 0.3825950607604861, "grad_norm": 0.8801693916320801, "learning_rate": 9.603849466877249e-06, "loss": 0.33815276622772217, "memory(GiB)": 28.87, "step": 305, "token_acc": 0.8876053110467076, "train_speed(iter/s)": 0.121969 }, { "epoch": 0.3888671109368875, "grad_norm": 0.8597840666770935, "learning_rate": 9.59093596140005e-06, "loss": 0.35080363750457766, "memory(GiB)": 28.87, "step": 310, "token_acc": 0.8818184958717656, "train_speed(iter/s)": 0.122478 }, { "epoch": 0.3951391611132889, "grad_norm": 0.8145344853401184, "learning_rate": 9.577824312220006e-06, "loss": 0.3441263914108276, "memory(GiB)": 28.87, "step": 315, "token_acc": 0.8895818188774535, "train_speed(iter/s)": 0.122834 }, { "epoch": 0.4014112112896903, "grad_norm": 0.850265383720398, "learning_rate": 9.564515085232772e-06, "loss": 0.34675819873809816, "memory(GiB)": 28.87, "step": 320, "token_acc": 0.8802672898561309, "train_speed(iter/s)": 0.123297 }, { "epoch": 0.4014112112896903, "eval_loss": 0.35628461837768555, "eval_runtime": 29.2861, "eval_samples_per_second": 17.585, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8831707466034457, "step": 320 }, { "epoch": 0.40768326146609174, "grad_norm": 0.8921403288841248, "learning_rate": 9.55100885486142e-06, "loss": 0.3695497989654541, "memory(GiB)": 28.87, "step": 325, "token_acc": 0.8874945370543798, "train_speed(iter/s)": 0.121931 }, { "epoch": 0.41395531164249316, "grad_norm": 1.006880760192871, "learning_rate": 9.537306204031628e-06, "loss": 0.3622285842895508, "memory(GiB)": 28.87, "step": 330, "token_acc": 0.8859732356116873, "train_speed(iter/s)": 0.122367 }, { "epoch": 0.42022736181889453, "grad_norm": 0.9347439408302307, "learning_rate": 9.523407724146548e-06, "loss": 0.35402708053588866, "memory(GiB)": 28.87, "step": 335, "token_acc": 0.879896406604079, "train_speed(iter/s)": 0.122904 }, { "epoch": 0.42649941199529595, "grad_norm": 0.8348590135574341, "learning_rate": 9.509314015061263e-06, "loss": 0.33470354080200193, "memory(GiB)": 28.87, "step": 340, "token_acc": 0.888100010191256, "train_speed(iter/s)": 0.123264 }, { "epoch": 0.42649941199529595, "eval_loss": 0.35398781299591064, "eval_runtime": 29.3885, "eval_samples_per_second": 17.524, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8837870400701797, "step": 340 }, { "epoch": 0.4327714621716974, "grad_norm": 0.843921422958374, "learning_rate": 9.495025685056898e-06, "loss": 0.343002462387085, "memory(GiB)": 28.87, "step": 345, "token_acc": 0.8873276256453827, "train_speed(iter/s)": 0.121931 }, { "epoch": 0.4390435123480988, "grad_norm": 0.8519654273986816, "learning_rate": 9.480543350814376e-06, "loss": 0.35919780731201173, "memory(GiB)": 28.87, "step": 350, "token_acc": 0.881159420289855, "train_speed(iter/s)": 0.122444 }, { "epoch": 0.4453155625245002, "grad_norm": 0.8579279184341431, "learning_rate": 9.465867637387793e-06, "loss": 0.37248964309692384, "memory(GiB)": 28.87, "step": 355, "token_acc": 0.8757786153540964, "train_speed(iter/s)": 0.122942 }, { "epoch": 0.4515876127009016, "grad_norm": 0.789768636226654, "learning_rate": 9.450999178177445e-06, "loss": 0.3402097702026367, "memory(GiB)": 28.87, "step": 360, "token_acc": 0.8902190332326284, "train_speed(iter/s)": 0.123304 }, { "epoch": 0.4515876127009016, "eval_loss": 0.35195934772491455, "eval_runtime": 29.3974, "eval_samples_per_second": 17.519, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8847198085603718, "step": 360 }, { "epoch": 0.457859662877303, "grad_norm": 0.8919506669044495, "learning_rate": 9.435938614902494e-06, "loss": 0.32901549339294434, "memory(GiB)": 28.87, "step": 365, "token_acc": 0.8874121986962454, "train_speed(iter/s)": 0.122069 }, { "epoch": 0.46413171305370443, "grad_norm": 0.881273627281189, "learning_rate": 9.42068659757326e-06, "loss": 0.3595102548599243, "memory(GiB)": 28.87, "step": 370, "token_acc": 0.8718952901164588, "train_speed(iter/s)": 0.122435 }, { "epoch": 0.47040376323010585, "grad_norm": 0.900892972946167, "learning_rate": 9.405243784463181e-06, "loss": 0.34714303016662595, "memory(GiB)": 28.87, "step": 375, "token_acc": 0.893684034176579, "train_speed(iter/s)": 0.122895 }, { "epoch": 0.4766758134065073, "grad_norm": 0.9816926717758179, "learning_rate": 9.389610842080394e-06, "loss": 0.3555105209350586, "memory(GiB)": 28.87, "step": 380, "token_acc": 0.880410447761194, "train_speed(iter/s)": 0.123308 }, { "epoch": 0.4766758134065073, "eval_loss": 0.34997043013572693, "eval_runtime": 29.336, "eval_samples_per_second": 17.555, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8844144379237018, "step": 380 }, { "epoch": 0.48294786358290864, "grad_norm": 0.8801642060279846, "learning_rate": 9.373788445138972e-06, "loss": 0.3438990592956543, "memory(GiB)": 28.87, "step": 385, "token_acc": 0.888115044818469, "train_speed(iter/s)": 0.122133 }, { "epoch": 0.48921991375931007, "grad_norm": 0.8868786692619324, "learning_rate": 9.357777276529793e-06, "loss": 0.3474756956100464, "memory(GiB)": 28.87, "step": 390, "token_acc": 0.8811514138256887, "train_speed(iter/s)": 0.122604 }, { "epoch": 0.4954919639357115, "grad_norm": 0.8451352119445801, "learning_rate": 9.341578027291085e-06, "loss": 0.32396225929260253, "memory(GiB)": 28.87, "step": 395, "token_acc": 0.8982433222180206, "train_speed(iter/s)": 0.122966 }, { "epoch": 0.5017640141121129, "grad_norm": 0.9279198050498962, "learning_rate": 9.325191396578589e-06, "loss": 0.35566129684448244, "memory(GiB)": 28.87, "step": 400, "token_acc": 0.883422080227192, "train_speed(iter/s)": 0.123361 }, { "epoch": 0.5017640141121129, "eval_loss": 0.3484845459461212, "eval_runtime": 29.3966, "eval_samples_per_second": 17.519, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8852472669328018, "step": 400 }, { "epoch": 0.5080360642885143, "grad_norm": 0.8177010416984558, "learning_rate": 9.308618091635382e-06, "loss": 0.3363117933273315, "memory(GiB)": 28.87, "step": 405, "token_acc": 0.8922487570299128, "train_speed(iter/s)": 0.122214 }, { "epoch": 0.5143081144649158, "grad_norm": 0.9669262170791626, "learning_rate": 9.291858827761359e-06, "loss": 0.34157965183258054, "memory(GiB)": 28.87, "step": 410, "token_acc": 0.8911739502999143, "train_speed(iter/s)": 0.12252 }, { "epoch": 0.5205801646413172, "grad_norm": 1.5560276508331299, "learning_rate": 9.274914328282359e-06, "loss": 0.36170029640197754, "memory(GiB)": 28.87, "step": 415, "token_acc": 0.8824226748572045, "train_speed(iter/s)": 0.122859 }, { "epoch": 0.5268522148177185, "grad_norm": 0.9071317911148071, "learning_rate": 9.257785324518943e-06, "loss": 0.3503504753112793, "memory(GiB)": 28.87, "step": 420, "token_acc": 0.8918499451111027, "train_speed(iter/s)": 0.123193 }, { "epoch": 0.5268522148177185, "eval_loss": 0.34762412309646606, "eval_runtime": 29.3255, "eval_samples_per_second": 17.562, "eval_steps_per_second": 4.399, "eval_token_acc": 0.8854193849280159, "step": 420 }, { "epoch": 0.5331242649941199, "grad_norm": 0.8516839146614075, "learning_rate": 9.240472555754835e-06, "loss": 0.33733839988708497, "memory(GiB)": 28.87, "step": 425, "token_acc": 0.8860463239447987, "train_speed(iter/s)": 0.122243 }, { "epoch": 0.5393963151705213, "grad_norm": 1.0013768672943115, "learning_rate": 9.222976769205013e-06, "loss": 0.34144785404205324, "memory(GiB)": 28.87, "step": 430, "token_acc": 0.8886329988484948, "train_speed(iter/s)": 0.122568 }, { "epoch": 0.5456683653469228, "grad_norm": 0.8237244486808777, "learning_rate": 9.205298719983458e-06, "loss": 0.34563274383544923, "memory(GiB)": 28.87, "step": 435, "token_acc": 0.889361264442692, "train_speed(iter/s)": 0.122842 }, { "epoch": 0.5519404155233242, "grad_norm": 0.8497081995010376, "learning_rate": 9.187439171070563e-06, "loss": 0.3556859016418457, "memory(GiB)": 28.87, "step": 440, "token_acc": 0.8856649117871639, "train_speed(iter/s)": 0.123161 }, { "epoch": 0.5519404155233242, "eval_loss": 0.3459347188472748, "eval_runtime": 29.345, "eval_samples_per_second": 17.55, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8852472669328018, "step": 440 }, { "epoch": 0.5582124656997256, "grad_norm": 1.3792065382003784, "learning_rate": 9.169398893280208e-06, "loss": 0.3321459054946899, "memory(GiB)": 28.87, "step": 445, "token_acc": 0.8902999810652168, "train_speed(iter/s)": 0.122142 }, { "epoch": 0.564484515876127, "grad_norm": 0.788133978843689, "learning_rate": 9.151178665226486e-06, "loss": 0.3413043260574341, "memory(GiB)": 28.87, "step": 450, "token_acc": 0.8891621577307756, "train_speed(iter/s)": 0.122465 }, { "epoch": 0.5707565660525284, "grad_norm": 0.7931389808654785, "learning_rate": 9.132779273290103e-06, "loss": 0.3318148612976074, "memory(GiB)": 28.87, "step": 455, "token_acc": 0.8939530523203404, "train_speed(iter/s)": 0.122682 }, { "epoch": 0.5770286162289299, "grad_norm": 0.8171889781951904, "learning_rate": 9.114201511584428e-06, "loss": 0.3486793994903564, "memory(GiB)": 28.87, "step": 460, "token_acc": 0.8884144810017581, "train_speed(iter/s)": 0.122934 }, { "epoch": 0.5770286162289299, "eval_loss": 0.3437627851963043, "eval_runtime": 29.3507, "eval_samples_per_second": 17.546, "eval_steps_per_second": 4.395, "eval_token_acc": 0.8861966920031759, "step": 460 }, { "epoch": 0.5833006664053313, "grad_norm": 0.9594098329544067, "learning_rate": 9.095446181921237e-06, "loss": 0.35313169956207274, "memory(GiB)": 28.87, "step": 465, "token_acc": 0.8879731787460262, "train_speed(iter/s)": 0.122073 }, { "epoch": 0.5895727165817326, "grad_norm": 0.7837137579917908, "learning_rate": 9.07651409377609e-06, "loss": 0.344076943397522, "memory(GiB)": 28.87, "step": 470, "token_acc": 0.8796075311588438, "train_speed(iter/s)": 0.122438 }, { "epoch": 0.595844766758134, "grad_norm": 0.8649491667747498, "learning_rate": 9.057406064253404e-06, "loss": 0.356815505027771, "memory(GiB)": 28.87, "step": 475, "token_acc": 0.8810882595743154, "train_speed(iter/s)": 0.12278 }, { "epoch": 0.6021168169345354, "grad_norm": 0.8207665681838989, "learning_rate": 9.038122918051184e-06, "loss": 0.33744909763336184, "memory(GiB)": 28.87, "step": 480, "token_acc": 0.8888801540819936, "train_speed(iter/s)": 0.123064 }, { "epoch": 0.6021168169345354, "eval_loss": 0.3429865837097168, "eval_runtime": 29.3492, "eval_samples_per_second": 17.547, "eval_steps_per_second": 4.395, "eval_token_acc": 0.8863299446446319, "step": 480 }, { "epoch": 0.6083888671109369, "grad_norm": 0.8997246623039246, "learning_rate": 9.018665487425426e-06, "loss": 0.34094789028167727, "memory(GiB)": 28.87, "step": 485, "token_acc": 0.8909048190883935, "train_speed(iter/s)": 0.122119 }, { "epoch": 0.6146609172873383, "grad_norm": 0.8832601308822632, "learning_rate": 8.999034612154204e-06, "loss": 0.34579076766967776, "memory(GiB)": 28.87, "step": 490, "token_acc": 0.889549670855775, "train_speed(iter/s)": 0.122448 }, { "epoch": 0.6209329674637397, "grad_norm": 0.9071618914604187, "learning_rate": 8.979231139501417e-06, "loss": 0.33304905891418457, "memory(GiB)": 28.87, "step": 495, "token_acc": 0.892831455286502, "train_speed(iter/s)": 0.122757 }, { "epoch": 0.6272050176401411, "grad_norm": 1.053952693939209, "learning_rate": 8.95925592418023e-06, "loss": 0.3448563814163208, "memory(GiB)": 28.87, "step": 500, "token_acc": 0.876147859922179, "train_speed(iter/s)": 0.123039 }, { "epoch": 0.6272050176401411, "eval_loss": 0.3429498076438904, "eval_runtime": 29.3653, "eval_samples_per_second": 17.538, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8868574030170618, "step": 500 }, { "epoch": 0.6334770678165426, "grad_norm": 0.9261718392372131, "learning_rate": 8.939109828316184e-06, "loss": 0.3366121292114258, "memory(GiB)": 28.87, "step": 505, "token_acc": 0.8902018439384988, "train_speed(iter/s)": 0.122191 }, { "epoch": 0.639749117992944, "grad_norm": 0.8245139122009277, "learning_rate": 8.918793721409973e-06, "loss": 0.3344353914260864, "memory(GiB)": 28.87, "step": 510, "token_acc": 0.8851031265942716, "train_speed(iter/s)": 0.122447 }, { "epoch": 0.6460211681693454, "grad_norm": 0.7967011332511902, "learning_rate": 8.898308480299937e-06, "loss": 0.33919148445129393, "memory(GiB)": 28.87, "step": 515, "token_acc": 0.889465313541351, "train_speed(iter/s)": 0.122726 }, { "epoch": 0.6522932183457467, "grad_norm": 0.7725489139556885, "learning_rate": 8.877654989124202e-06, "loss": 0.32682027816772463, "memory(GiB)": 28.87, "step": 520, "token_acc": 0.9023533671252716, "train_speed(iter/s)": 0.123046 }, { "epoch": 0.6522932183457467, "eval_loss": 0.3419317901134491, "eval_runtime": 29.3667, "eval_samples_per_second": 17.537, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8866020021209379, "step": 520 }, { "epoch": 0.6585652685221481, "grad_norm": 0.8528004884719849, "learning_rate": 8.856834139282531e-06, "loss": 0.32720084190368653, "memory(GiB)": 28.87, "step": 525, "token_acc": 0.8922637415946852, "train_speed(iter/s)": 0.122236 }, { "epoch": 0.6648373186985496, "grad_norm": 0.8266517519950867, "learning_rate": 8.835846829397843e-06, "loss": 0.33587045669555665, "memory(GiB)": 28.87, "step": 530, "token_acc": 0.8928849794009576, "train_speed(iter/s)": 0.122551 }, { "epoch": 0.671109368874951, "grad_norm": 0.9410281181335449, "learning_rate": 8.814693965277435e-06, "loss": 0.334349536895752, "memory(GiB)": 28.87, "step": 535, "token_acc": 0.8876769459036773, "train_speed(iter/s)": 0.122795 }, { "epoch": 0.6773814190513524, "grad_norm": 0.9425005912780762, "learning_rate": 8.793376459873888e-06, "loss": 0.34319519996643066, "memory(GiB)": 28.87, "step": 540, "token_acc": 0.893324486711637, "train_speed(iter/s)": 0.123071 }, { "epoch": 0.6773814190513524, "eval_loss": 0.3407135009765625, "eval_runtime": 29.3625, "eval_samples_per_second": 17.539, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8873793091960979, "step": 540 }, { "epoch": 0.6836534692277538, "grad_norm": 0.8792872428894043, "learning_rate": 8.771895233245655e-06, "loss": 0.3311576843261719, "memory(GiB)": 28.87, "step": 545, "token_acc": 0.8878104806138933, "train_speed(iter/s)": 0.122276 }, { "epoch": 0.6899255194041553, "grad_norm": 0.8647459149360657, "learning_rate": 8.750251212517364e-06, "loss": 0.3273744583129883, "memory(GiB)": 28.87, "step": 550, "token_acc": 0.9019584187879519, "train_speed(iter/s)": 0.122511 }, { "epoch": 0.6961975695805567, "grad_norm": 0.8187234401702881, "learning_rate": 8.728445331839796e-06, "loss": 0.34240922927856443, "memory(GiB)": 28.87, "step": 555, "token_acc": 0.8956161234709794, "train_speed(iter/s)": 0.122795 }, { "epoch": 0.7024696197569581, "grad_norm": 0.819709837436676, "learning_rate": 8.706478532349567e-06, "loss": 0.3307104825973511, "memory(GiB)": 28.87, "step": 560, "token_acc": 0.8898898898898899, "train_speed(iter/s)": 0.123049 }, { "epoch": 0.7024696197569581, "eval_loss": 0.33920466899871826, "eval_runtime": 29.2564, "eval_samples_per_second": 17.603, "eval_steps_per_second": 4.409, "eval_token_acc": 0.887334891648946, "step": 560 }, { "epoch": 0.7087416699333595, "grad_norm": 0.9903004765510559, "learning_rate": 8.684351762128511e-06, "loss": 0.3422609806060791, "memory(GiB)": 28.87, "step": 565, "token_acc": 0.8903627722134206, "train_speed(iter/s)": 0.122313 }, { "epoch": 0.7150137201097608, "grad_norm": 0.8508421778678894, "learning_rate": 8.662065976162765e-06, "loss": 0.33224852085113527, "memory(GiB)": 28.87, "step": 570, "token_acc": 0.8952832987244141, "train_speed(iter/s)": 0.122616 }, { "epoch": 0.7212857702861623, "grad_norm": 0.9183920621871948, "learning_rate": 8.639622136301541e-06, "loss": 0.33959968090057374, "memory(GiB)": 30.83, "step": 575, "token_acc": 0.8883200380997738, "train_speed(iter/s)": 0.122861 }, { "epoch": 0.7275578204625637, "grad_norm": 0.8977269530296326, "learning_rate": 8.617021211215629e-06, "loss": 0.3228691339492798, "memory(GiB)": 30.83, "step": 580, "token_acc": 0.8865359042553191, "train_speed(iter/s)": 0.123129 }, { "epoch": 0.7275578204625637, "eval_loss": 0.3381815254688263, "eval_runtime": 29.3858, "eval_samples_per_second": 17.525, "eval_steps_per_second": 4.39, "eval_token_acc": 0.887751306153496, "step": 580 }, { "epoch": 0.7338298706389651, "grad_norm": 0.8259280323982239, "learning_rate": 8.594264176355565e-06, "loss": 0.3299635648727417, "memory(GiB)": 30.83, "step": 585, "token_acc": 0.8886926504234737, "train_speed(iter/s)": 0.122386 }, { "epoch": 0.7401019208153665, "grad_norm": 0.9581537246704102, "learning_rate": 8.571352013909558e-06, "loss": 0.3483741283416748, "memory(GiB)": 30.83, "step": 590, "token_acc": 0.8793361921695778, "train_speed(iter/s)": 0.122675 }, { "epoch": 0.7463739709917679, "grad_norm": 0.8338929414749146, "learning_rate": 8.548285712761084e-06, "loss": 0.3371764898300171, "memory(GiB)": 30.83, "step": 595, "token_acc": 0.887261212985838, "train_speed(iter/s)": 0.122898 }, { "epoch": 0.7526460211681694, "grad_norm": 0.9058257341384888, "learning_rate": 8.525066268446208e-06, "loss": 0.3369316816329956, "memory(GiB)": 30.83, "step": 600, "token_acc": 0.8893781157890907, "train_speed(iter/s)": 0.123152 }, { "epoch": 0.7526460211681694, "eval_loss": 0.33716732263565063, "eval_runtime": 29.3779, "eval_samples_per_second": 17.53, "eval_steps_per_second": 4.391, "eval_token_acc": 0.8880511245967719, "step": 600 }, { "epoch": 0.7589180713445708, "grad_norm": 0.8647992610931396, "learning_rate": 8.501694683110615e-06, "loss": 0.3460502862930298, "memory(GiB)": 30.83, "step": 605, "token_acc": 0.8899021439489947, "train_speed(iter/s)": 0.122422 }, { "epoch": 0.7651901215209722, "grad_norm": 0.9482452273368835, "learning_rate": 8.478171965466366e-06, "loss": 0.3319687843322754, "memory(GiB)": 30.83, "step": 610, "token_acc": 0.8881137119384495, "train_speed(iter/s)": 0.122651 }, { "epoch": 0.7714621716973736, "grad_norm": 0.9784294366836548, "learning_rate": 8.454499130748352e-06, "loss": 0.3246720790863037, "memory(GiB)": 30.83, "step": 615, "token_acc": 0.8938534900311643, "train_speed(iter/s)": 0.122956 }, { "epoch": 0.777734221873775, "grad_norm": 0.8589239716529846, "learning_rate": 8.43067720067048e-06, "loss": 0.33240423202514646, "memory(GiB)": 30.83, "step": 620, "token_acc": 0.8954405877094104, "train_speed(iter/s)": 0.123205 }, { "epoch": 0.777734221873775, "eval_loss": 0.33614107966423035, "eval_runtime": 29.3659, "eval_samples_per_second": 17.537, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8878901109883459, "step": 620 }, { "epoch": 0.7840062720501764, "grad_norm": 0.9298199415206909, "learning_rate": 8.40670720338158e-06, "loss": 0.3492927312850952, "memory(GiB)": 30.83, "step": 625, "token_acc": 0.8875478688841884, "train_speed(iter/s)": 0.122475 }, { "epoch": 0.7902783222265778, "grad_norm": 0.9355754256248474, "learning_rate": 8.382590173421029e-06, "loss": 0.34650092124938964, "memory(GiB)": 30.83, "step": 630, "token_acc": 0.8980740024851581, "train_speed(iter/s)": 0.122741 }, { "epoch": 0.7965503724029792, "grad_norm": 0.8517131209373474, "learning_rate": 8.358327151674095e-06, "loss": 0.3493072986602783, "memory(GiB)": 30.83, "step": 635, "token_acc": 0.8794758648566713, "train_speed(iter/s)": 0.122993 }, { "epoch": 0.8028224225793806, "grad_norm": 0.9389694333076477, "learning_rate": 8.33391918532702e-06, "loss": 0.34041428565979004, "memory(GiB)": 30.83, "step": 640, "token_acc": 0.8885475048956716, "train_speed(iter/s)": 0.123237 }, { "epoch": 0.8028224225793806, "eval_loss": 0.3339459300041199, "eval_runtime": 29.3874, "eval_samples_per_second": 17.525, "eval_steps_per_second": 4.39, "eval_token_acc": 0.888389808393806, "step": 640 }, { "epoch": 0.8090944727557821, "grad_norm": 0.8025544285774231, "learning_rate": 8.309367327821819e-06, "loss": 0.319812536239624, "memory(GiB)": 30.83, "step": 645, "token_acc": 0.8937441216095986, "train_speed(iter/s)": 0.122531 }, { "epoch": 0.8153665229321835, "grad_norm": 0.9150540232658386, "learning_rate": 8.284672638810813e-06, "loss": 0.32463486194610597, "memory(GiB)": 30.83, "step": 650, "token_acc": 0.8960157054582905, "train_speed(iter/s)": 0.122736 }, { "epoch": 0.8216385731085849, "grad_norm": 0.7610167264938354, "learning_rate": 8.259836184110904e-06, "loss": 0.3184787750244141, "memory(GiB)": 30.83, "step": 655, "token_acc": 0.8911636749168281, "train_speed(iter/s)": 0.122944 }, { "epoch": 0.8279106232849863, "grad_norm": 0.8336440920829773, "learning_rate": 8.234859035657557e-06, "loss": 0.34786210060119627, "memory(GiB)": 30.83, "step": 660, "token_acc": 0.8883749488334015, "train_speed(iter/s)": 0.123201 }, { "epoch": 0.8279106232849863, "eval_loss": 0.33380812406539917, "eval_runtime": 29.4213, "eval_samples_per_second": 17.504, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8884841956815039, "step": 660 }, { "epoch": 0.8341826734613876, "grad_norm": 0.7955749034881592, "learning_rate": 8.209742271458556e-06, "loss": 0.33576648235321044, "memory(GiB)": 30.83, "step": 665, "token_acc": 0.8916429159969202, "train_speed(iter/s)": 0.122484 }, { "epoch": 0.8404547236377891, "grad_norm": 0.8152847290039062, "learning_rate": 8.18448697554746e-06, "loss": 0.3175548553466797, "memory(GiB)": 30.83, "step": 670, "token_acc": 0.8854783940342233, "train_speed(iter/s)": 0.122708 }, { "epoch": 0.8467267738141905, "grad_norm": 0.8721585869789124, "learning_rate": 8.159094237936828e-06, "loss": 0.3407304763793945, "memory(GiB)": 30.83, "step": 675, "token_acc": 0.8833216654998833, "train_speed(iter/s)": 0.122938 }, { "epoch": 0.8529988239905919, "grad_norm": 0.9121716022491455, "learning_rate": 8.133565154571169e-06, "loss": 0.3379061222076416, "memory(GiB)": 30.83, "step": 680, "token_acc": 0.8774592247819455, "train_speed(iter/s)": 0.123134 }, { "epoch": 0.8529988239905919, "eval_loss": 0.3322893977165222, "eval_runtime": 29.3785, "eval_samples_per_second": 17.53, "eval_steps_per_second": 4.391, "eval_token_acc": 0.888578582969202, "step": 680 }, { "epoch": 0.8592708741669933, "grad_norm": 0.7398496866226196, "learning_rate": 8.107900827279638e-06, "loss": 0.3155172109603882, "memory(GiB)": 30.83, "step": 685, "token_acc": 0.895827825075353, "train_speed(iter/s)": 0.122438 }, { "epoch": 0.8655429243433947, "grad_norm": 0.8711411952972412, "learning_rate": 8.082102363728494e-06, "loss": 0.3454484462738037, "memory(GiB)": 30.83, "step": 690, "token_acc": 0.8851303206450403, "train_speed(iter/s)": 0.122657 }, { "epoch": 0.8718149745197962, "grad_norm": 0.8494858741760254, "learning_rate": 8.056170877373277e-06, "loss": 0.36029322147369386, "memory(GiB)": 30.83, "step": 695, "token_acc": 0.8825764562659092, "train_speed(iter/s)": 0.122862 }, { "epoch": 0.8780870246961976, "grad_norm": 0.7830603122711182, "learning_rate": 8.030107487410766e-06, "loss": 0.32322983741760253, "memory(GiB)": 30.83, "step": 700, "token_acc": 0.8960877431026685, "train_speed(iter/s)": 0.123089 }, { "epoch": 0.8780870246961976, "eval_loss": 0.3324893116950989, "eval_runtime": 29.4344, "eval_samples_per_second": 17.497, "eval_steps_per_second": 4.383, "eval_token_acc": 0.888356495233442, "step": 700 }, { "epoch": 0.884359074872599, "grad_norm": 0.848610520362854, "learning_rate": 8.003913318730662e-06, "loss": 0.3297175645828247, "memory(GiB)": 30.83, "step": 705, "token_acc": 0.8931060983995578, "train_speed(iter/s)": 0.122471 }, { "epoch": 0.8906311250490004, "grad_norm": 0.8601354956626892, "learning_rate": 7.97758950186705e-06, "loss": 0.33080010414123534, "memory(GiB)": 30.83, "step": 710, "token_acc": 0.8949430800932338, "train_speed(iter/s)": 0.122686 }, { "epoch": 0.8969031752254017, "grad_norm": 0.9265198111534119, "learning_rate": 7.951137172949595e-06, "loss": 0.3245250225067139, "memory(GiB)": 30.83, "step": 715, "token_acc": 0.895298551874628, "train_speed(iter/s)": 0.122903 }, { "epoch": 0.9031752254018032, "grad_norm": 0.8287230730056763, "learning_rate": 7.924557473654516e-06, "loss": 0.3210673570632935, "memory(GiB)": 30.83, "step": 720, "token_acc": 0.8865173220523668, "train_speed(iter/s)": 0.123104 }, { "epoch": 0.9031752254018032, "eval_loss": 0.3312128484249115, "eval_runtime": 29.3758, "eval_samples_per_second": 17.531, "eval_steps_per_second": 4.391, "eval_token_acc": 0.888828431671932, "step": 720 }, { "epoch": 0.9094472755782046, "grad_norm": 0.8217957615852356, "learning_rate": 7.897851551155306e-06, "loss": 0.3300743579864502, "memory(GiB)": 30.83, "step": 725, "token_acc": 0.890940345904522, "train_speed(iter/s)": 0.122496 }, { "epoch": 0.915719325754606, "grad_norm": 0.8754700422286987, "learning_rate": 7.871020558073217e-06, "loss": 0.3481910228729248, "memory(GiB)": 30.83, "step": 730, "token_acc": 0.8795096810140688, "train_speed(iter/s)": 0.122719 }, { "epoch": 0.9219913759310074, "grad_norm": 0.9567843079566956, "learning_rate": 7.844065652427523e-06, "loss": 0.32534041404724123, "memory(GiB)": 30.83, "step": 735, "token_acc": 0.8889229120416443, "train_speed(iter/s)": 0.122947 }, { "epoch": 0.9282634261074089, "grad_norm": 0.861889660358429, "learning_rate": 7.816987997585535e-06, "loss": 0.3210756778717041, "memory(GiB)": 30.83, "step": 740, "token_acc": 0.8954586487049511, "train_speed(iter/s)": 0.12311 }, { "epoch": 0.9282634261074089, "eval_loss": 0.33026477694511414, "eval_runtime": 29.443, "eval_samples_per_second": 17.491, "eval_steps_per_second": 4.381, "eval_token_acc": 0.889078280374662, "step": 740 }, { "epoch": 0.9345354762838103, "grad_norm": 0.8503465056419373, "learning_rate": 7.789788762212384e-06, "loss": 0.3078526735305786, "memory(GiB)": 30.83, "step": 745, "token_acc": 0.8963680387409201, "train_speed(iter/s)": 0.122495 }, { "epoch": 0.9408075264602117, "grad_norm": 0.7904318571090698, "learning_rate": 7.762469120220595e-06, "loss": 0.32522361278533934, "memory(GiB)": 30.83, "step": 750, "token_acc": 0.8939921307506054, "train_speed(iter/s)": 0.122717 }, { "epoch": 0.9470795766366131, "grad_norm": 0.8881962895393372, "learning_rate": 7.73503025071941e-06, "loss": 0.33283185958862305, "memory(GiB)": 30.83, "step": 755, "token_acc": 0.888671875, "train_speed(iter/s)": 0.122911 }, { "epoch": 0.9533516268130146, "grad_norm": 0.794685959815979, "learning_rate": 7.7074733379639e-06, "loss": 0.33060617446899415, "memory(GiB)": 30.83, "step": 760, "token_acc": 0.8894294111685416, "train_speed(iter/s)": 0.123122 }, { "epoch": 0.9533516268130146, "eval_loss": 0.32935982942581177, "eval_runtime": 29.5219, "eval_samples_per_second": 17.445, "eval_steps_per_second": 4.37, "eval_token_acc": 0.889405859784908, "step": 760 }, { "epoch": 0.9596236769894159, "grad_norm": 0.8581651449203491, "learning_rate": 7.679799571303861e-06, "loss": 0.3343641996383667, "memory(GiB)": 30.83, "step": 765, "token_acc": 0.8940541099253211, "train_speed(iter/s)": 0.122584 }, { "epoch": 0.9658957271658173, "grad_norm": 0.9159526824951172, "learning_rate": 7.65201014513247e-06, "loss": 0.3283867359161377, "memory(GiB)": 30.83, "step": 770, "token_acc": 0.8896209236881311, "train_speed(iter/s)": 0.122761 }, { "epoch": 0.9721677773422187, "grad_norm": 0.8125607371330261, "learning_rate": 7.62410625883474e-06, "loss": 0.33158369064331056, "memory(GiB)": 30.83, "step": 775, "token_acc": 0.8984477961634207, "train_speed(iter/s)": 0.122932 }, { "epoch": 0.9784398275186201, "grad_norm": 0.8378052711486816, "learning_rate": 7.596089116735765e-06, "loss": 0.32932515144348146, "memory(GiB)": 30.83, "step": 780, "token_acc": 0.889555958314454, "train_speed(iter/s)": 0.123137 }, { "epoch": 0.9784398275186201, "eval_loss": 0.3282354474067688, "eval_runtime": 29.4374, "eval_samples_per_second": 17.495, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8892281895963, "step": 780 }, { "epoch": 0.9847118776950216, "grad_norm": 0.829319417476654, "learning_rate": 7.567959928048723e-06, "loss": 0.32491297721862794, "memory(GiB)": 30.83, "step": 785, "token_acc": 0.8919960394979796, "train_speed(iter/s)": 0.122552 }, { "epoch": 0.990983927871423, "grad_norm": 0.9168446660041809, "learning_rate": 7.5397199068227e-06, "loss": 0.33011536598205565, "memory(GiB)": 30.83, "step": 790, "token_acc": 0.8893950507663205, "train_speed(iter/s)": 0.122716 }, { "epoch": 0.9972559780478244, "grad_norm": 0.8191807866096497, "learning_rate": 7.511370271890286e-06, "loss": 0.32829596996307375, "memory(GiB)": 30.83, "step": 795, "token_acc": 0.896822042039734, "train_speed(iter/s)": 0.122911 }, { "epoch": 1.0025088200705605, "grad_norm": 0.80043625831604, "learning_rate": 7.482912246814975e-06, "loss": 0.2824315071105957, "memory(GiB)": 30.83, "step": 800, "token_acc": 0.9048456348395011, "train_speed(iter/s)": 0.123208 }, { "epoch": 1.0025088200705605, "eval_loss": 0.3272770643234253, "eval_runtime": 29.4466, "eval_samples_per_second": 17.489, "eval_steps_per_second": 4.381, "eval_token_acc": 0.8895391124263641, "step": 800 }, { "epoch": 1.008780870246962, "grad_norm": 0.8259254693984985, "learning_rate": 7.454347059838351e-06, "loss": 0.26992073059082033, "memory(GiB)": 30.83, "step": 805, "token_acc": 0.8975824979114453, "train_speed(iter/s)": 0.122691 }, { "epoch": 1.0150529204233634, "grad_norm": 0.8588547110557556, "learning_rate": 7.425675943827084e-06, "loss": 0.2826483726501465, "memory(GiB)": 30.83, "step": 810, "token_acc": 0.9070769230769231, "train_speed(iter/s)": 0.122895 }, { "epoch": 1.021324970599765, "grad_norm": 0.9410291910171509, "learning_rate": 7.3969001362197135e-06, "loss": 0.2646550416946411, "memory(GiB)": 30.83, "step": 815, "token_acc": 0.9091513589892777, "train_speed(iter/s)": 0.123089 }, { "epoch": 1.0275970207761662, "grad_norm": 0.7985222339630127, "learning_rate": 7.3680208789732385e-06, "loss": 0.2572730779647827, "memory(GiB)": 30.83, "step": 820, "token_acc": 0.9100758396533044, "train_speed(iter/s)": 0.123265 }, { "epoch": 1.0275970207761662, "eval_loss": 0.3340509831905365, "eval_runtime": 29.3123, "eval_samples_per_second": 17.569, "eval_steps_per_second": 4.401, "eval_token_acc": 0.8894946948792121, "step": 820 }, { "epoch": 1.0338690709525675, "grad_norm": 0.7672378420829773, "learning_rate": 7.339039418509532e-06, "loss": 0.26021647453308105, "memory(GiB)": 30.83, "step": 825, "token_acc": 0.9020274970593072, "train_speed(iter/s)": 0.122729 }, { "epoch": 1.040141121128969, "grad_norm": 0.9672802090644836, "learning_rate": 7.309957005661521e-06, "loss": 0.25925168991088865, "memory(GiB)": 30.83, "step": 830, "token_acc": 0.9167668481719822, "train_speed(iter/s)": 0.122939 }, { "epoch": 1.0464131713053704, "grad_norm": 0.862086832523346, "learning_rate": 7.280774895619219e-06, "loss": 0.2655090570449829, "memory(GiB)": 30.83, "step": 835, "token_acc": 0.9072164948453608, "train_speed(iter/s)": 0.123142 }, { "epoch": 1.052685221481772, "grad_norm": 0.8724802732467651, "learning_rate": 7.25149434787555e-06, "loss": 0.2601273536682129, "memory(GiB)": 30.83, "step": 840, "token_acc": 0.9072434197102632, "train_speed(iter/s)": 0.123337 }, { "epoch": 1.052685221481772, "eval_loss": 0.33454829454421997, "eval_runtime": 29.4084, "eval_samples_per_second": 17.512, "eval_steps_per_second": 4.387, "eval_token_acc": 0.8896834694546081, "step": 840 }, { "epoch": 1.0589572716581732, "grad_norm": 0.8254806399345398, "learning_rate": 7.2221166261719755e-06, "loss": 0.2561455726623535, "memory(GiB)": 30.83, "step": 845, "token_acc": 0.9006843577219007, "train_speed(iter/s)": 0.122791 }, { "epoch": 1.0652293218345747, "grad_norm": 0.7920213341712952, "learning_rate": 7.192642998443975e-06, "loss": 0.25534210205078123, "memory(GiB)": 30.83, "step": 850, "token_acc": 0.9210035842293907, "train_speed(iter/s)": 0.122955 }, { "epoch": 1.071501372010976, "grad_norm": 0.8850580453872681, "learning_rate": 7.163074736766299e-06, "loss": 0.2532507419586182, "memory(GiB)": 30.83, "step": 855, "token_acc": 0.9125168236877523, "train_speed(iter/s)": 0.123087 }, { "epoch": 1.0777734221873776, "grad_norm": 0.8684042096138, "learning_rate": 7.133413117298081e-06, "loss": 0.2542534828186035, "memory(GiB)": 30.83, "step": 860, "token_acc": 0.9134933617377411, "train_speed(iter/s)": 0.123259 }, { "epoch": 1.0777734221873776, "eval_loss": 0.334031879901886, "eval_runtime": 29.396, "eval_samples_per_second": 17.519, "eval_steps_per_second": 4.388, "eval_token_acc": 0.889344785657574, "step": 860 }, { "epoch": 1.084045472363779, "grad_norm": 0.8075751066207886, "learning_rate": 7.103659420227755e-06, "loss": 0.2629993438720703, "memory(GiB)": 30.83, "step": 865, "token_acc": 0.9003039997875985, "train_speed(iter/s)": 0.122739 }, { "epoch": 1.0903175225401802, "grad_norm": 0.9507380723953247, "learning_rate": 7.0738149297178005e-06, "loss": 0.2679288387298584, "memory(GiB)": 32.99, "step": 870, "token_acc": 0.9241651993945572, "train_speed(iter/s)": 0.122888 }, { "epoch": 1.0965895727165818, "grad_norm": 0.8346853852272034, "learning_rate": 7.04388093384932e-06, "loss": 0.24904875755310057, "memory(GiB)": 32.99, "step": 875, "token_acc": 0.9159838773622307, "train_speed(iter/s)": 0.123027 }, { "epoch": 1.102861622892983, "grad_norm": 0.8962292075157166, "learning_rate": 7.013858724566449e-06, "loss": 0.26425485610961913, "memory(GiB)": 32.99, "step": 880, "token_acc": 0.9163976759199484, "train_speed(iter/s)": 0.123183 }, { "epoch": 1.102861622892983, "eval_loss": 0.33524197340011597, "eval_runtime": 29.4091, "eval_samples_per_second": 17.512, "eval_steps_per_second": 4.386, "eval_token_acc": 0.88925595056327, "step": 880 }, { "epoch": 1.1091336730693846, "grad_norm": 0.9459292888641357, "learning_rate": 6.983749597620588e-06, "loss": 0.26885480880737306, "memory(GiB)": 32.99, "step": 885, "token_acc": 0.8969475538971807, "train_speed(iter/s)": 0.122693 }, { "epoch": 1.115405723245786, "grad_norm": 0.9416642785072327, "learning_rate": 6.9535548525144894e-06, "loss": 0.26124646663665774, "memory(GiB)": 32.99, "step": 890, "token_acc": 0.9064468321600593, "train_speed(iter/s)": 0.122861 }, { "epoch": 1.1216777734221874, "grad_norm": 0.8303311467170715, "learning_rate": 6.923275792446159e-06, "loss": 0.25772600173950194, "memory(GiB)": 32.99, "step": 895, "token_acc": 0.9122112744306251, "train_speed(iter/s)": 0.123004 }, { "epoch": 1.1279498235985888, "grad_norm": 0.8063751459121704, "learning_rate": 6.8929137242526216e-06, "loss": 0.2566836833953857, "memory(GiB)": 32.99, "step": 900, "token_acc": 0.9133560897668676, "train_speed(iter/s)": 0.123136 }, { "epoch": 1.1279498235985888, "eval_loss": 0.3343105614185333, "eval_runtime": 29.4826, "eval_samples_per_second": 17.468, "eval_steps_per_second": 4.375, "eval_token_acc": 0.889039415020904, "step": 900 }, { "epoch": 1.1342218737749903, "grad_norm": 0.7772172689437866, "learning_rate": 6.862469958353506e-06, "loss": 0.25952978134155275, "memory(GiB)": 32.99, "step": 905, "token_acc": 0.9005278010033445, "train_speed(iter/s)": 0.122633 }, { "epoch": 1.1404939239513916, "grad_norm": 0.9848916530609131, "learning_rate": 6.8319458086945026e-06, "loss": 0.2750791788101196, "memory(GiB)": 32.99, "step": 910, "token_acc": 0.9121159843407869, "train_speed(iter/s)": 0.122802 }, { "epoch": 1.146765974127793, "grad_norm": 0.9113922119140625, "learning_rate": 6.801342592690641e-06, "loss": 0.2661754131317139, "memory(GiB)": 32.99, "step": 915, "token_acc": 0.9091255477233758, "train_speed(iter/s)": 0.122967 }, { "epoch": 1.1530380243041944, "grad_norm": 0.8235742449760437, "learning_rate": 6.770661631169434e-06, "loss": 0.2528377532958984, "memory(GiB)": 32.99, "step": 920, "token_acc": 0.9075825218827996, "train_speed(iter/s)": 0.123121 }, { "epoch": 1.1530380243041944, "eval_loss": 0.3342270255088806, "eval_runtime": 29.4268, "eval_samples_per_second": 17.501, "eval_steps_per_second": 4.384, "eval_token_acc": 0.889300368110422, "step": 920 }, { "epoch": 1.1593100744805958, "grad_norm": 0.8287192583084106, "learning_rate": 6.739904248313879e-06, "loss": 0.2636830806732178, "memory(GiB)": 32.99, "step": 925, "token_acc": 0.8985745469847197, "train_speed(iter/s)": 0.122627 }, { "epoch": 1.1655821246569973, "grad_norm": 0.9481701850891113, "learning_rate": 6.709071771605292e-06, "loss": 0.26240465641021726, "memory(GiB)": 32.99, "step": 930, "token_acc": 0.9111058712567791, "train_speed(iter/s)": 0.122762 }, { "epoch": 1.1718541748333986, "grad_norm": 0.9033907651901245, "learning_rate": 6.678165531766029e-06, "loss": 0.2581218719482422, "memory(GiB)": 32.99, "step": 935, "token_acc": 0.9199569773090001, "train_speed(iter/s)": 0.122912 }, { "epoch": 1.1781262250098001, "grad_norm": 0.8978760838508606, "learning_rate": 6.647186862702038e-06, "loss": 0.2560389995574951, "memory(GiB)": 32.99, "step": 940, "token_acc": 0.9132201156577118, "train_speed(iter/s)": 0.123049 }, { "epoch": 1.1781262250098001, "eval_loss": 0.33342665433883667, "eval_runtime": 29.3912, "eval_samples_per_second": 17.522, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8889505799266, "step": 940 }, { "epoch": 1.1843982751862014, "grad_norm": 0.8684834837913513, "learning_rate": 6.616137101445301e-06, "loss": 0.2619821548461914, "memory(GiB)": 32.99, "step": 945, "token_acc": 0.9023693647540983, "train_speed(iter/s)": 0.122572 }, { "epoch": 1.190670325362603, "grad_norm": 0.8880748152732849, "learning_rate": 6.58501758809612e-06, "loss": 0.2793832778930664, "memory(GiB)": 32.99, "step": 950, "token_acc": 0.9074586794194144, "train_speed(iter/s)": 0.122739 }, { "epoch": 1.1969423755390043, "grad_norm": 0.9005119204521179, "learning_rate": 6.55382966576528e-06, "loss": 0.26382246017456057, "memory(GiB)": 32.99, "step": 955, "token_acc": 0.9095440156260524, "train_speed(iter/s)": 0.122892 }, { "epoch": 1.2032144257154056, "grad_norm": 0.917253851890564, "learning_rate": 6.522574680516081e-06, "loss": 0.26546216011047363, "memory(GiB)": 32.99, "step": 960, "token_acc": 0.9125179553655888, "train_speed(iter/s)": 0.123068 }, { "epoch": 1.2032144257154056, "eval_loss": 0.33402466773986816, "eval_runtime": 29.402, "eval_samples_per_second": 17.516, "eval_steps_per_second": 4.387, "eval_token_acc": 0.889317024690604, "step": 960 }, { "epoch": 1.2094864758918071, "grad_norm": 0.8855717778205872, "learning_rate": 6.491253981306245e-06, "loss": 0.27286221981048586, "memory(GiB)": 32.99, "step": 965, "token_acc": 0.896218792922356, "train_speed(iter/s)": 0.122634 }, { "epoch": 1.2157585260682087, "grad_norm": 0.8950490951538086, "learning_rate": 6.459868919929691e-06, "loss": 0.2556123733520508, "memory(GiB)": 32.99, "step": 970, "token_acc": 0.9105622119815668, "train_speed(iter/s)": 0.122762 }, { "epoch": 1.22203057624461, "grad_norm": 0.8709114193916321, "learning_rate": 6.428420850958194e-06, "loss": 0.25667073726654055, "memory(GiB)": 32.99, "step": 975, "token_acc": 0.9179039301310044, "train_speed(iter/s)": 0.122889 }, { "epoch": 1.2283026264210113, "grad_norm": 0.8599227070808411, "learning_rate": 6.3969111316829215e-06, "loss": 0.2665587902069092, "memory(GiB)": 32.99, "step": 980, "token_acc": 0.908648175626831, "train_speed(iter/s)": 0.123052 }, { "epoch": 1.2283026264210113, "eval_loss": 0.3336588442325592, "eval_runtime": 29.4102, "eval_samples_per_second": 17.511, "eval_steps_per_second": 4.386, "eval_token_acc": 0.89000549667146, "step": 980 }, { "epoch": 1.2345746765974128, "grad_norm": 0.8702272772789001, "learning_rate": 6.365341122055857e-06, "loss": 0.264133620262146, "memory(GiB)": 32.99, "step": 985, "token_acc": 0.8992360079744176, "train_speed(iter/s)": 0.122637 }, { "epoch": 1.2408467267738141, "grad_norm": 0.7991276979446411, "learning_rate": 6.333712184631093e-06, "loss": 0.2504168272018433, "memory(GiB)": 32.99, "step": 990, "token_acc": 0.9176470588235294, "train_speed(iter/s)": 0.122773 }, { "epoch": 1.2471187769502157, "grad_norm": 0.8343760371208191, "learning_rate": 6.302025684506042e-06, "loss": 0.26856470108032227, "memory(GiB)": 32.99, "step": 995, "token_acc": 0.9059094987822074, "train_speed(iter/s)": 0.122922 }, { "epoch": 1.253390827126617, "grad_norm": 0.9037004709243774, "learning_rate": 6.2702829892625e-06, "loss": 0.262753963470459, "memory(GiB)": 32.99, "step": 1000, "token_acc": 0.9124660313086079, "train_speed(iter/s)": 0.123082 }, { "epoch": 1.253390827126617, "eval_loss": 0.3312474489212036, "eval_runtime": 29.3973, "eval_samples_per_second": 17.519, "eval_steps_per_second": 4.388, "eval_token_acc": 0.890455224336374, "step": 1000 }, { "epoch": 1.2596628773030183, "grad_norm": 0.9027026891708374, "learning_rate": 6.238485468907637e-06, "loss": 0.2612313747406006, "memory(GiB)": 32.99, "step": 1005, "token_acc": 0.9030250326107212, "train_speed(iter/s)": 0.122646 }, { "epoch": 1.2659349274794198, "grad_norm": 0.8399356603622437, "learning_rate": 6.2066344958148596e-06, "loss": 0.2536637306213379, "memory(GiB)": 32.99, "step": 1010, "token_acc": 0.9170757737459979, "train_speed(iter/s)": 0.122809 }, { "epoch": 1.2722069776558214, "grad_norm": 0.7805183529853821, "learning_rate": 6.174731444664579e-06, "loss": 0.2619153022766113, "memory(GiB)": 32.99, "step": 1015, "token_acc": 0.9090187590187591, "train_speed(iter/s)": 0.122976 }, { "epoch": 1.2784790278322227, "grad_norm": 0.8158499598503113, "learning_rate": 6.14277769238489e-06, "loss": 0.25308961868286134, "memory(GiB)": 32.99, "step": 1020, "token_acc": 0.9159010077059869, "train_speed(iter/s)": 0.123114 }, { "epoch": 1.2784790278322227, "eval_loss": 0.3310454487800598, "eval_runtime": 29.4178, "eval_samples_per_second": 17.506, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8902220322138261, "step": 1020 }, { "epoch": 1.284751078008624, "grad_norm": 0.8155426979064941, "learning_rate": 6.110774618092128e-06, "loss": 0.24832696914672853, "memory(GiB)": 32.99, "step": 1025, "token_acc": 0.9044705444194677, "train_speed(iter/s)": 0.12268 }, { "epoch": 1.2910231281850255, "grad_norm": 0.825543999671936, "learning_rate": 6.07872360303136e-06, "loss": 0.26351313591003417, "memory(GiB)": 32.99, "step": 1030, "token_acc": 0.9093544316747946, "train_speed(iter/s)": 0.122857 }, { "epoch": 1.2972951783614268, "grad_norm": 0.8312624096870422, "learning_rate": 6.046626030516766e-06, "loss": 0.26206340789794924, "memory(GiB)": 32.99, "step": 1035, "token_acc": 0.9150324332478503, "train_speed(iter/s)": 0.123001 }, { "epoch": 1.3035672285378284, "grad_norm": 0.813248336315155, "learning_rate": 6.0144832858719256e-06, "loss": 0.25995168685913084, "memory(GiB)": 32.99, "step": 1040, "token_acc": 0.9156251058496765, "train_speed(iter/s)": 0.12313 }, { "epoch": 1.3035672285378284, "eval_loss": 0.3311326503753662, "eval_runtime": 29.4093, "eval_samples_per_second": 17.511, "eval_steps_per_second": 4.386, "eval_token_acc": 0.8905884769778301, "step": 1040 }, { "epoch": 1.3098392787142297, "grad_norm": 0.8660376071929932, "learning_rate": 5.982296756370052e-06, "loss": 0.2568789482116699, "memory(GiB)": 32.99, "step": 1045, "token_acc": 0.8992794915347104, "train_speed(iter/s)": 0.122708 }, { "epoch": 1.3161113288906312, "grad_norm": 0.9326941967010498, "learning_rate": 5.950067831174086e-06, "loss": 0.2631781816482544, "memory(GiB)": 32.99, "step": 1050, "token_acc": 0.9101537421861801, "train_speed(iter/s)": 0.122837 }, { "epoch": 1.3223833790670325, "grad_norm": 0.9141260981559753, "learning_rate": 5.917797901276771e-06, "loss": 0.2625840187072754, "memory(GiB)": 32.99, "step": 1055, "token_acc": 0.9048415211127316, "train_speed(iter/s)": 0.122993 }, { "epoch": 1.328655429243434, "grad_norm": 0.8678974509239197, "learning_rate": 5.885488359440592e-06, "loss": 0.25161261558532716, "memory(GiB)": 32.99, "step": 1060, "token_acc": 0.9070050606902325, "train_speed(iter/s)": 0.12312 }, { "epoch": 1.328655429243434, "eval_loss": 0.3307149112224579, "eval_runtime": 29.4299, "eval_samples_per_second": 17.499, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8903774936288581, "step": 1060 }, { "epoch": 1.3349274794198354, "grad_norm": 0.8765535354614258, "learning_rate": 5.853140600137684e-06, "loss": 0.25869274139404297, "memory(GiB)": 32.99, "step": 1065, "token_acc": 0.9016284233900814, "train_speed(iter/s)": 0.122714 }, { "epoch": 1.3411995295962367, "grad_norm": 0.8286167979240417, "learning_rate": 5.8207560194896325e-06, "loss": 0.2691312551498413, "memory(GiB)": 32.99, "step": 1070, "token_acc": 0.9035621198957429, "train_speed(iter/s)": 0.122843 }, { "epoch": 1.3474715797726382, "grad_norm": 0.8201180100440979, "learning_rate": 5.78833601520723e-06, "loss": 0.2646843433380127, "memory(GiB)": 32.99, "step": 1075, "token_acc": 0.9125431530494822, "train_speed(iter/s)": 0.122981 }, { "epoch": 1.3537436299490395, "grad_norm": 0.8124395608901978, "learning_rate": 5.755881986530137e-06, "loss": 0.2646932125091553, "memory(GiB)": 32.99, "step": 1080, "token_acc": 0.909264075607165, "train_speed(iter/s)": 0.123106 }, { "epoch": 1.3537436299490395, "eval_loss": 0.3297117352485657, "eval_runtime": 29.4873, "eval_samples_per_second": 17.465, "eval_steps_per_second": 4.375, "eval_token_acc": 0.8906551032985581, "step": 1080 }, { "epoch": 1.360015680125441, "grad_norm": 0.9377442598342896, "learning_rate": 5.723395334166506e-06, "loss": 0.26891088485717773, "memory(GiB)": 32.99, "step": 1085, "token_acc": 0.9008568241735956, "train_speed(iter/s)": 0.12271 }, { "epoch": 1.3662877303018424, "grad_norm": 0.8344196081161499, "learning_rate": 5.6908774602325165e-06, "loss": 0.2543730974197388, "memory(GiB)": 32.99, "step": 1090, "token_acc": 0.9094074127801552, "train_speed(iter/s)": 0.122831 }, { "epoch": 1.372559780478244, "grad_norm": 0.8761776685714722, "learning_rate": 5.6583297681918615e-06, "loss": 0.25609617233276366, "memory(GiB)": 32.99, "step": 1095, "token_acc": 0.9128849780012571, "train_speed(iter/s)": 0.122967 }, { "epoch": 1.3788318306546452, "grad_norm": 0.9113081097602844, "learning_rate": 5.625753662795183e-06, "loss": 0.2611519813537598, "memory(GiB)": 32.99, "step": 1100, "token_acc": 0.9040675364543361, "train_speed(iter/s)": 0.123104 }, { "epoch": 1.3788318306546452, "eval_loss": 0.3294164836406708, "eval_runtime": 29.4555, "eval_samples_per_second": 17.484, "eval_steps_per_second": 4.379, "eval_token_acc": 0.8907772515532261, "step": 1100 }, { "epoch": 1.3851038808310467, "grad_norm": 0.959723711013794, "learning_rate": 5.59315055001943e-06, "loss": 0.26945905685424804, "memory(GiB)": 32.99, "step": 1105, "token_acc": 0.8986453766114783, "train_speed(iter/s)": 0.122721 }, { "epoch": 1.391375931007448, "grad_norm": 0.831580400466919, "learning_rate": 5.5605218370071836e-06, "loss": 0.2433305263519287, "memory(GiB)": 35.31, "step": 1110, "token_acc": 0.9251693967502138, "train_speed(iter/s)": 0.122835 }, { "epoch": 1.3976479811838494, "grad_norm": 0.8291701078414917, "learning_rate": 5.5278689320059305e-06, "loss": 0.26767911911010744, "memory(GiB)": 35.31, "step": 1115, "token_acc": 0.9057218870611159, "train_speed(iter/s)": 0.123 }, { "epoch": 1.403920031360251, "grad_norm": 0.95394366979599, "learning_rate": 5.4951932443072764e-06, "loss": 0.2736950159072876, "memory(GiB)": 35.31, "step": 1120, "token_acc": 0.9039156015418949, "train_speed(iter/s)": 0.123138 }, { "epoch": 1.403920031360251, "eval_loss": 0.3296511769294739, "eval_runtime": 29.4223, "eval_samples_per_second": 17.504, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8906273423315881, "step": 1120 }, { "epoch": 1.4101920815366524, "grad_norm": 0.8989368081092834, "learning_rate": 5.462496184186118e-06, "loss": 0.2623956918716431, "memory(GiB)": 35.31, "step": 1125, "token_acc": 0.902071147885406, "train_speed(iter/s)": 0.122763 }, { "epoch": 1.4164641317130537, "grad_norm": 0.8569273352622986, "learning_rate": 5.429779162839787e-06, "loss": 0.2738351345062256, "memory(GiB)": 35.31, "step": 1130, "token_acc": 0.9097424759540801, "train_speed(iter/s)": 0.122909 }, { "epoch": 1.422736181889455, "grad_norm": 0.8919984698295593, "learning_rate": 5.397043592327129e-06, "loss": 0.264469051361084, "memory(GiB)": 35.31, "step": 1135, "token_acc": 0.9089981447124305, "train_speed(iter/s)": 0.12304 }, { "epoch": 1.4290082320658566, "grad_norm": 0.9191476702690125, "learning_rate": 5.364290885507577e-06, "loss": 0.25141263008117676, "memory(GiB)": 35.31, "step": 1140, "token_acc": 0.915509841073431, "train_speed(iter/s)": 0.123134 }, { "epoch": 1.4290082320658566, "eval_loss": 0.3284692168235779, "eval_runtime": 29.3072, "eval_samples_per_second": 17.572, "eval_steps_per_second": 4.402, "eval_token_acc": 0.8913990972133541, "step": 1140 }, { "epoch": 1.435280282242258, "grad_norm": 0.8672423362731934, "learning_rate": 5.3315224559801555e-06, "loss": 0.25262012481689455, "memory(GiB)": 35.31, "step": 1145, "token_acc": 0.9027614571092832, "train_speed(iter/s)": 0.122738 }, { "epoch": 1.4415523324186594, "grad_norm": 0.9336539506912231, "learning_rate": 5.2987397180224795e-06, "loss": 0.2663726806640625, "memory(GiB)": 35.31, "step": 1150, "token_acc": 0.9097083349054824, "train_speed(iter/s)": 0.122841 }, { "epoch": 1.4478243825950607, "grad_norm": 0.9010604619979858, "learning_rate": 5.265944086529714e-06, "loss": 0.2540728569030762, "memory(GiB)": 35.31, "step": 1155, "token_acc": 0.9103166186941373, "train_speed(iter/s)": 0.122975 }, { "epoch": 1.454096432771462, "grad_norm": 0.8994977474212646, "learning_rate": 5.233136976953504e-06, "loss": 0.27104973793029785, "memory(GiB)": 35.31, "step": 1160, "token_acc": 0.9070164133277738, "train_speed(iter/s)": 0.123129 }, { "epoch": 1.454096432771462, "eval_loss": 0.32739755511283875, "eval_runtime": 29.4285, "eval_samples_per_second": 17.5, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8911048309634721, "step": 1160 }, { "epoch": 1.4603684829478636, "grad_norm": 0.9086834788322449, "learning_rate": 5.200319805240884e-06, "loss": 0.2572296380996704, "memory(GiB)": 35.31, "step": 1165, "token_acc": 0.9042424885024718, "train_speed(iter/s)": 0.122748 }, { "epoch": 1.4666405331242651, "grad_norm": 0.9285444617271423, "learning_rate": 5.167493987773175e-06, "loss": 0.2664067268371582, "memory(GiB)": 35.31, "step": 1170, "token_acc": 0.8990601360128372, "train_speed(iter/s)": 0.122891 }, { "epoch": 1.4729125833006664, "grad_norm": 0.8944875597953796, "learning_rate": 5.134660941304838e-06, "loss": 0.25232925415039065, "memory(GiB)": 35.31, "step": 1175, "token_acc": 0.9148593474922276, "train_speed(iter/s)": 0.123017 }, { "epoch": 1.4791846334770677, "grad_norm": 0.8905362486839294, "learning_rate": 5.10182208290234e-06, "loss": 0.2574918746948242, "memory(GiB)": 35.31, "step": 1180, "token_acc": 0.9164562602109015, "train_speed(iter/s)": 0.123133 }, { "epoch": 1.4791846334770677, "eval_loss": 0.32784461975097656, "eval_runtime": 29.4674, "eval_samples_per_second": 17.477, "eval_steps_per_second": 4.378, "eval_token_acc": 0.8912214270247462, "step": 1180 }, { "epoch": 1.4854566836534693, "grad_norm": 0.8255454897880554, "learning_rate": 5.068978829882992e-06, "loss": 0.26115126609802247, "memory(GiB)": 35.31, "step": 1185, "token_acc": 0.9016378309948306, "train_speed(iter/s)": 0.122699 }, { "epoch": 1.4917287338298706, "grad_norm": 0.844142496585846, "learning_rate": 5.036132599753771e-06, "loss": 0.26899421215057373, "memory(GiB)": 35.31, "step": 1190, "token_acc": 0.9105432414675473, "train_speed(iter/s)": 0.122845 }, { "epoch": 1.4980007840062721, "grad_norm": 0.8452991247177124, "learning_rate": 5.003284810150152e-06, "loss": 0.24796614646911622, "memory(GiB)": 35.31, "step": 1195, "token_acc": 0.9108413875777328, "train_speed(iter/s)": 0.122976 }, { "epoch": 1.5042728341826734, "grad_norm": 0.8760294914245605, "learning_rate": 4.970436878774907e-06, "loss": 0.2594925880432129, "memory(GiB)": 35.31, "step": 1200, "token_acc": 0.9037070354960831, "train_speed(iter/s)": 0.123102 }, { "epoch": 1.5042728341826734, "eval_loss": 0.3267907202243805, "eval_runtime": 29.2019, "eval_samples_per_second": 17.636, "eval_steps_per_second": 4.418, "eval_token_acc": 0.8919376599725721, "step": 1200 }, { "epoch": 1.5105448843590747, "grad_norm": 0.9031227231025696, "learning_rate": 4.937590223336936e-06, "loss": 0.26340641975402834, "memory(GiB)": 35.31, "step": 1205, "token_acc": 0.900339087116085, "train_speed(iter/s)": 0.122762 }, { "epoch": 1.5168169345354763, "grad_norm": 0.8399583697319031, "learning_rate": 4.904746261490062e-06, "loss": 0.2580922365188599, "memory(GiB)": 35.31, "step": 1210, "token_acc": 0.9061312895701433, "train_speed(iter/s)": 0.122878 }, { "epoch": 1.5230889847118778, "grad_norm": 0.8265976309776306, "learning_rate": 4.87190641077186e-06, "loss": 0.25390305519104006, "memory(GiB)": 35.31, "step": 1215, "token_acc": 0.9108635097493036, "train_speed(iter/s)": 0.12298 }, { "epoch": 1.5293610348882791, "grad_norm": 0.864519476890564, "learning_rate": 4.8390720885424665e-06, "loss": 0.2515955686569214, "memory(GiB)": 35.31, "step": 1220, "token_acc": 0.9193281845468956, "train_speed(iter/s)": 0.1231 }, { "epoch": 1.5293610348882791, "eval_loss": 0.32665589451789856, "eval_runtime": 29.401, "eval_samples_per_second": 17.516, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8917100200434182, "step": 1220 }, { "epoch": 1.5356330850646804, "grad_norm": 0.8479579091072083, "learning_rate": 4.806244711923408e-06, "loss": 0.2633438348770142, "memory(GiB)": 35.31, "step": 1225, "token_acc": 0.9037422372771998, "train_speed(iter/s)": 0.122739 }, { "epoch": 1.541905135241082, "grad_norm": 0.8362464904785156, "learning_rate": 4.773425697736445e-06, "loss": 0.2546710968017578, "memory(GiB)": 35.31, "step": 1230, "token_acc": 0.9130922134210133, "train_speed(iter/s)": 0.122859 }, { "epoch": 1.5481771854174835, "grad_norm": 0.8684584498405457, "learning_rate": 4.7406164624424135e-06, "loss": 0.24759359359741212, "memory(GiB)": 35.31, "step": 1235, "token_acc": 0.9112730806608358, "train_speed(iter/s)": 0.122966 }, { "epoch": 1.5544492355938848, "grad_norm": 0.8719802498817444, "learning_rate": 4.707818422080094e-06, "loss": 0.25945463180541994, "memory(GiB)": 35.31, "step": 1240, "token_acc": 0.9120634224755224, "train_speed(iter/s)": 0.123098 }, { "epoch": 1.5544492355938848, "eval_loss": 0.32659924030303955, "eval_runtime": 29.4283, "eval_samples_per_second": 17.5, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8916378415292961, "step": 1240 }, { "epoch": 1.5607212857702861, "grad_norm": 0.868956983089447, "learning_rate": 4.675032992205099e-06, "loss": 0.2547459125518799, "memory(GiB)": 35.31, "step": 1245, "token_acc": 0.9023946449205106, "train_speed(iter/s)": 0.122755 }, { "epoch": 1.5669933359466874, "grad_norm": 0.8359695672988892, "learning_rate": 4.642261587828778e-06, "loss": 0.2538146495819092, "memory(GiB)": 35.31, "step": 1250, "token_acc": 0.9125770129594222, "train_speed(iter/s)": 0.122873 }, { "epoch": 1.573265386123089, "grad_norm": 0.8868972063064575, "learning_rate": 4.609505623357135e-06, "loss": 0.2526993751525879, "memory(GiB)": 35.31, "step": 1255, "token_acc": 0.9158266736275652, "train_speed(iter/s)": 0.122999 }, { "epoch": 1.5795374362994905, "grad_norm": 0.8684692978858948, "learning_rate": 4.576766512529799e-06, "loss": 0.27338666915893556, "memory(GiB)": 35.31, "step": 1260, "token_acc": 0.9070495871130456, "train_speed(iter/s)": 0.123113 }, { "epoch": 1.5795374362994905, "eval_loss": 0.3250684440135956, "eval_runtime": 29.4388, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8918099595245101, "step": 1260 }, { "epoch": 1.5858094864758918, "grad_norm": 0.8290318250656128, "learning_rate": 4.544045668358999e-06, "loss": 0.25896754264831545, "memory(GiB)": 35.31, "step": 1265, "token_acc": 0.9011410774976416, "train_speed(iter/s)": 0.122758 }, { "epoch": 1.5920815366522931, "grad_norm": 0.8665211796760559, "learning_rate": 4.511344503068574e-06, "loss": 0.2611932039260864, "memory(GiB)": 35.31, "step": 1270, "token_acc": 0.904277129727819, "train_speed(iter/s)": 0.122849 }, { "epoch": 1.5983535868286947, "grad_norm": 0.8333092331886292, "learning_rate": 4.478664428033031e-06, "loss": 0.2565239429473877, "memory(GiB)": 35.31, "step": 1275, "token_acc": 0.9113872754288802, "train_speed(iter/s)": 0.122955 }, { "epoch": 1.6046256370050962, "grad_norm": 0.873248815536499, "learning_rate": 4.446006853716628e-06, "loss": 0.25569474697113037, "memory(GiB)": 35.31, "step": 1280, "token_acc": 0.9105040810467167, "train_speed(iter/s)": 0.123062 }, { "epoch": 1.6046256370050962, "eval_loss": 0.32411840558052063, "eval_runtime": 29.4423, "eval_samples_per_second": 17.492, "eval_steps_per_second": 4.381, "eval_token_acc": 0.8917821985575402, "step": 1280 }, { "epoch": 1.6108976871814975, "grad_norm": 0.7883860468864441, "learning_rate": 4.413373189612497e-06, "loss": 0.24754109382629394, "memory(GiB)": 35.31, "step": 1285, "token_acc": 0.9016164438235881, "train_speed(iter/s)": 0.122717 }, { "epoch": 1.6171697373578988, "grad_norm": 0.8996075987815857, "learning_rate": 4.380764844181806e-06, "loss": 0.25697779655456543, "memory(GiB)": 35.31, "step": 1290, "token_acc": 0.9108724003020111, "train_speed(iter/s)": 0.122851 }, { "epoch": 1.6234417875343001, "grad_norm": 0.9099392890930176, "learning_rate": 4.34818322479298e-06, "loss": 0.25770959854125974, "memory(GiB)": 35.31, "step": 1295, "token_acc": 0.9080447672174858, "train_speed(iter/s)": 0.122959 }, { "epoch": 1.6297138377107017, "grad_norm": 0.929043173789978, "learning_rate": 4.315629737660956e-06, "loss": 0.24951376914978027, "memory(GiB)": 35.31, "step": 1300, "token_acc": 0.9104897729405141, "train_speed(iter/s)": 0.12305 }, { "epoch": 1.6297138377107017, "eval_loss": 0.3247908353805542, "eval_runtime": 29.4232, "eval_samples_per_second": 17.503, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8921819564819081, "step": 1300 }, { "epoch": 1.6359858878871032, "grad_norm": 0.8205110430717468, "learning_rate": 4.283105787786482e-06, "loss": 0.24354634284973145, "memory(GiB)": 35.31, "step": 1305, "token_acc": 0.9043088899776798, "train_speed(iter/s)": 0.122699 }, { "epoch": 1.6422579380635045, "grad_norm": 0.8134395480155945, "learning_rate": 4.250612778895492e-06, "loss": 0.2616206169128418, "memory(GiB)": 35.31, "step": 1310, "token_acc": 0.9057426853207865, "train_speed(iter/s)": 0.122811 }, { "epoch": 1.6485299882399058, "grad_norm": 0.9198798537254333, "learning_rate": 4.218152113378513e-06, "loss": 0.2624333381652832, "memory(GiB)": 35.31, "step": 1315, "token_acc": 0.919613921375229, "train_speed(iter/s)": 0.122931 }, { "epoch": 1.6548020384163074, "grad_norm": 0.8332286477088928, "learning_rate": 4.185725192230136e-06, "loss": 0.25506982803344724, "memory(GiB)": 35.31, "step": 1320, "token_acc": 0.9053922590507957, "train_speed(iter/s)": 0.123033 }, { "epoch": 1.6548020384163074, "eval_loss": 0.3236985206604004, "eval_runtime": 29.3395, "eval_samples_per_second": 17.553, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8926538929203982, "step": 1320 }, { "epoch": 1.6610740885927089, "grad_norm": 0.890203058719635, "learning_rate": 4.1533334149885594e-06, "loss": 0.2616013526916504, "memory(GiB)": 35.31, "step": 1325, "token_acc": 0.9023847464146382, "train_speed(iter/s)": 0.122706 }, { "epoch": 1.6673461387691102, "grad_norm": 0.9320402145385742, "learning_rate": 4.120978179675172e-06, "loss": 0.24896547794342042, "memory(GiB)": 35.31, "step": 1330, "token_acc": 0.9081224549156486, "train_speed(iter/s)": 0.122805 }, { "epoch": 1.6736181889455115, "grad_norm": 0.8774133324623108, "learning_rate": 4.088660882734228e-06, "loss": 0.265717077255249, "memory(GiB)": 35.31, "step": 1335, "token_acc": 0.9120258587117377, "train_speed(iter/s)": 0.122911 }, { "epoch": 1.6798902391219128, "grad_norm": 0.8072150945663452, "learning_rate": 4.056382918972565e-06, "loss": 0.25207223892211916, "memory(GiB)": 35.31, "step": 1340, "token_acc": 0.9112906186891867, "train_speed(iter/s)": 0.123008 }, { "epoch": 1.6798902391219128, "eval_loss": 0.32341495156288147, "eval_runtime": 29.3437, "eval_samples_per_second": 17.551, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8927760411750663, "step": 1340 }, { "epoch": 1.6861622892983144, "grad_norm": 0.8317608833312988, "learning_rate": 4.024145681499416e-06, "loss": 0.2493518829345703, "memory(GiB)": 35.31, "step": 1345, "token_acc": 0.9048487057965731, "train_speed(iter/s)": 0.122706 }, { "epoch": 1.6924343394747159, "grad_norm": 0.8192688822746277, "learning_rate": 3.991950561666269e-06, "loss": 0.2477407217025757, "memory(GiB)": 35.31, "step": 1350, "token_acc": 0.9176169257219045, "train_speed(iter/s)": 0.122803 }, { "epoch": 1.6987063896511172, "grad_norm": 0.8379951119422913, "learning_rate": 3.959798949006831e-06, "loss": 0.2488119125366211, "memory(GiB)": 35.31, "step": 1355, "token_acc": 0.92136861119966, "train_speed(iter/s)": 0.122913 }, { "epoch": 1.7049784398275185, "grad_norm": 0.7797974348068237, "learning_rate": 3.927692231177053e-06, "loss": 0.2600484609603882, "memory(GiB)": 35.31, "step": 1360, "token_acc": 0.9198253158649198, "train_speed(iter/s)": 0.123022 }, { "epoch": 1.7049784398275185, "eval_loss": 0.3236111104488373, "eval_runtime": 29.4455, "eval_samples_per_second": 17.49, "eval_steps_per_second": 4.381, "eval_token_acc": 0.8926094753732462, "step": 1360 }, { "epoch": 1.71125049000392, "grad_norm": 0.818781316280365, "learning_rate": 3.895631793895223e-06, "loss": 0.2504476547241211, "memory(GiB)": 35.31, "step": 1365, "token_acc": 0.9028884527745216, "train_speed(iter/s)": 0.122706 }, { "epoch": 1.7175225401803216, "grad_norm": 0.8820050954818726, "learning_rate": 3.863619020882184e-06, "loss": 0.2579957008361816, "memory(GiB)": 35.31, "step": 1370, "token_acc": 0.9132163908813294, "train_speed(iter/s)": 0.12282 }, { "epoch": 1.7237945903567229, "grad_norm": 0.8338585495948792, "learning_rate": 3.831655293801596e-06, "loss": 0.26077022552490237, "memory(GiB)": 35.31, "step": 1375, "token_acc": 0.9174921383647798, "train_speed(iter/s)": 0.122933 }, { "epoch": 1.7300666405331242, "grad_norm": 0.8168034553527832, "learning_rate": 3.7997419922003077e-06, "loss": 0.263335132598877, "memory(GiB)": 35.31, "step": 1380, "token_acc": 0.8992103531476201, "train_speed(iter/s)": 0.123038 }, { "epoch": 1.7300666405331242, "eval_loss": 0.3226911127567291, "eval_runtime": 29.1494, "eval_samples_per_second": 17.668, "eval_steps_per_second": 4.425, "eval_token_acc": 0.8926761016939742, "step": 1380 }, { "epoch": 1.7363386907095255, "grad_norm": 0.78660649061203, "learning_rate": 3.7678804934488146e-06, "loss": 0.2630495071411133, "memory(GiB)": 35.31, "step": 1385, "token_acc": 0.90369452426021, "train_speed(iter/s)": 0.122732 }, { "epoch": 1.742610740885927, "grad_norm": 0.8045564889907837, "learning_rate": 3.736072172681818e-06, "loss": 0.252573823928833, "memory(GiB)": 35.31, "step": 1390, "token_acc": 0.9139742579458892, "train_speed(iter/s)": 0.122859 }, { "epoch": 1.7488827910623286, "grad_norm": 0.8700997233390808, "learning_rate": 3.704318402738867e-06, "loss": 0.251971435546875, "memory(GiB)": 35.31, "step": 1395, "token_acc": 0.91973076509971, "train_speed(iter/s)": 0.122973 }, { "epoch": 1.75515484123873, "grad_norm": 0.8267092704772949, "learning_rate": 3.672620554105111e-06, "loss": 0.24774155616760254, "memory(GiB)": 35.31, "step": 1400, "token_acc": 0.9149910336144073, "train_speed(iter/s)": 0.123067 }, { "epoch": 1.75515484123873, "eval_loss": 0.320892870426178, "eval_runtime": 29.2095, "eval_samples_per_second": 17.631, "eval_steps_per_second": 4.416, "eval_token_acc": 0.8924984315053662, "step": 1400 }, { "epoch": 1.7614268914151312, "grad_norm": 0.8506399393081665, "learning_rate": 3.6409799948521473e-06, "loss": 0.24930577278137206, "memory(GiB)": 35.31, "step": 1405, "token_acc": 0.9041733294885574, "train_speed(iter/s)": 0.122739 }, { "epoch": 1.7676989415915327, "grad_norm": 0.9828294515609741, "learning_rate": 3.6093980905789824e-06, "loss": 0.2731804132461548, "memory(GiB)": 35.31, "step": 1410, "token_acc": 0.9054926688444883, "train_speed(iter/s)": 0.122848 }, { "epoch": 1.7739709917679343, "grad_norm": 0.8984673023223877, "learning_rate": 3.577876204353079e-06, "loss": 0.2694148778915405, "memory(GiB)": 35.31, "step": 1415, "token_acc": 0.9082330415754923, "train_speed(iter/s)": 0.122964 }, { "epoch": 1.7802430419443356, "grad_norm": 0.8837220668792725, "learning_rate": 3.5464156966515426e-06, "loss": 0.2497929334640503, "memory(GiB)": 35.31, "step": 1420, "token_acc": 0.9207889421363222, "train_speed(iter/s)": 0.123051 }, { "epoch": 1.7802430419443356, "eval_loss": 0.32131004333496094, "eval_runtime": 29.3893, "eval_samples_per_second": 17.523, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8924318051846382, "step": 1420 }, { "epoch": 1.786515092120737, "grad_norm": 0.8857332468032837, "learning_rate": 3.515017925302396e-06, "loss": 0.24897024631500245, "memory(GiB)": 35.31, "step": 1425, "token_acc": 0.9022955373286864, "train_speed(iter/s)": 0.122741 }, { "epoch": 1.7927871422971384, "grad_norm": 0.9256414175033569, "learning_rate": 3.48368424542597e-06, "loss": 0.2715806484222412, "memory(GiB)": 35.31, "step": 1430, "token_acc": 0.897119341563786, "train_speed(iter/s)": 0.122841 }, { "epoch": 1.7990591924735397, "grad_norm": 0.9175465703010559, "learning_rate": 3.4524160093764288e-06, "loss": 0.23990106582641602, "memory(GiB)": 35.31, "step": 1435, "token_acc": 0.9154644289198463, "train_speed(iter/s)": 0.122916 }, { "epoch": 1.8053312426499413, "grad_norm": 0.8272521495819092, "learning_rate": 3.421214566683395e-06, "loss": 0.2521500587463379, "memory(GiB)": 35.31, "step": 1440, "token_acc": 0.9128348930707119, "train_speed(iter/s)": 0.123014 }, { "epoch": 1.8053312426499413, "eval_loss": 0.32204827666282654, "eval_runtime": 29.4038, "eval_samples_per_second": 17.515, "eval_steps_per_second": 4.387, "eval_token_acc": 0.8927871455618542, "step": 1440 }, { "epoch": 1.8116032928263426, "grad_norm": 0.8805301189422607, "learning_rate": 3.390081263993702e-06, "loss": 0.25586814880371095, "memory(GiB)": 35.31, "step": 1445, "token_acc": 0.9027470478558111, "train_speed(iter/s)": 0.122726 }, { "epoch": 1.817875343002744, "grad_norm": 0.792771577835083, "learning_rate": 3.3590174450132828e-06, "loss": 0.2642062187194824, "memory(GiB)": 35.31, "step": 1450, "token_acc": 0.9100382735230589, "train_speed(iter/s)": 0.122835 }, { "epoch": 1.8241473931791454, "grad_norm": 0.8667325973510742, "learning_rate": 3.3280244504491664e-06, "loss": 0.26262435913085935, "memory(GiB)": 35.31, "step": 1455, "token_acc": 0.9134121762938738, "train_speed(iter/s)": 0.12294 }, { "epoch": 1.830419443355547, "grad_norm": 0.914749026298523, "learning_rate": 3.297103617951618e-06, "loss": 0.25986638069152834, "memory(GiB)": 35.31, "step": 1460, "token_acc": 0.9224172317510969, "train_speed(iter/s)": 0.123038 }, { "epoch": 1.830419443355547, "eval_loss": 0.32037732005119324, "eval_runtime": 29.4247, "eval_samples_per_second": 17.502, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8929370547834922, "step": 1460 }, { "epoch": 1.8366914935319483, "grad_norm": 0.8536245226860046, "learning_rate": 3.2662562820564043e-06, "loss": 0.25100226402282716, "memory(GiB)": 35.31, "step": 1465, "token_acc": 0.9041838910977119, "train_speed(iter/s)": 0.122724 }, { "epoch": 1.8429635437083496, "grad_norm": 0.9133491516113281, "learning_rate": 3.2354837741271994e-06, "loss": 0.2528833866119385, "memory(GiB)": 35.31, "step": 1470, "token_acc": 0.910944418742978, "train_speed(iter/s)": 0.122825 }, { "epoch": 1.8492355938847511, "grad_norm": 0.8379256725311279, "learning_rate": 3.2047874222981134e-06, "loss": 0.24852747917175294, "memory(GiB)": 35.31, "step": 1475, "token_acc": 0.91071612747963, "train_speed(iter/s)": 0.122921 }, { "epoch": 1.8555076440611527, "grad_norm": 0.8636755347251892, "learning_rate": 3.174168551416384e-06, "loss": 0.2513826131820679, "memory(GiB)": 35.31, "step": 1480, "token_acc": 0.9178916270355959, "train_speed(iter/s)": 0.123 }, { "epoch": 1.8555076440611527, "eval_loss": 0.3209190368652344, "eval_runtime": 29.4485, "eval_samples_per_second": 17.488, "eval_steps_per_second": 4.381, "eval_token_acc": 0.8929148460099162, "step": 1480 }, { "epoch": 1.861779694237554, "grad_norm": 0.884774923324585, "learning_rate": 3.1436284829851883e-06, "loss": 0.2535923719406128, "memory(GiB)": 35.31, "step": 1485, "token_acc": 0.9044768032509313, "train_speed(iter/s)": 0.122697 }, { "epoch": 1.8680517444139553, "grad_norm": 0.9002357721328735, "learning_rate": 3.113168535106604e-06, "loss": 0.2631272792816162, "memory(GiB)": 35.31, "step": 1490, "token_acc": 0.9088035186585114, "train_speed(iter/s)": 0.122806 }, { "epoch": 1.8743237945903566, "grad_norm": 0.9321665167808533, "learning_rate": 3.08279002242473e-06, "loss": 0.25894691944122317, "memory(GiB)": 35.31, "step": 1495, "token_acc": 0.9117967604945695, "train_speed(iter/s)": 0.122898 }, { "epoch": 1.8805958447667581, "grad_norm": 0.8132336139678955, "learning_rate": 3.0524942560689387e-06, "loss": 0.24980921745300294, "memory(GiB)": 35.31, "step": 1500, "token_acc": 0.9154492980830898, "train_speed(iter/s)": 0.122985 }, { "epoch": 1.8805958447667581, "eval_loss": 0.32020601630210876, "eval_runtime": 29.4077, "eval_samples_per_second": 17.512, "eval_steps_per_second": 4.387, "eval_token_acc": 0.8928315631090062, "step": 1500 }, { "epoch": 1.8868678949431597, "grad_norm": 0.8028678297996521, "learning_rate": 3.0222825435972948e-06, "loss": 0.24374105930328369, "memory(GiB)": 35.31, "step": 1505, "token_acc": 0.9061514997458058, "train_speed(iter/s)": 0.122682 }, { "epoch": 1.893139945119561, "grad_norm": 0.8144044876098633, "learning_rate": 2.99215618894011e-06, "loss": 0.24561538696289062, "memory(GiB)": 35.31, "step": 1510, "token_acc": 0.9156626506024096, "train_speed(iter/s)": 0.122769 }, { "epoch": 1.8994119952959623, "grad_norm": 0.8524841666221619, "learning_rate": 2.9621164923436774e-06, "loss": 0.23612394332885742, "memory(GiB)": 35.31, "step": 1515, "token_acc": 0.9185016451531258, "train_speed(iter/s)": 0.12286 }, { "epoch": 1.9056840454723638, "grad_norm": 0.7969034910202026, "learning_rate": 2.9321647503141525e-06, "loss": 0.25468385219573975, "memory(GiB)": 35.31, "step": 1520, "token_acc": 0.9106673673147662, "train_speed(iter/s)": 0.122942 }, { "epoch": 1.9056840454723638, "eval_loss": 0.3204187750816345, "eval_runtime": 29.5394, "eval_samples_per_second": 17.434, "eval_steps_per_second": 4.367, "eval_token_acc": 0.8928926372363403, "step": 1520 }, { "epoch": 1.9119560956487653, "grad_norm": 0.7709624171257019, "learning_rate": 2.902302255561585e-06, "loss": 0.25543718338012694, "memory(GiB)": 35.31, "step": 1525, "token_acc": 0.9032785309104894, "train_speed(iter/s)": 0.122654 }, { "epoch": 1.9182281458251667, "grad_norm": 0.8021811246871948, "learning_rate": 2.87253029694414e-06, "loss": 0.25137279033660886, "memory(GiB)": 35.31, "step": 1530, "token_acc": 0.9112309955860716, "train_speed(iter/s)": 0.122731 }, { "epoch": 1.924500196001568, "grad_norm": 0.9843617081642151, "learning_rate": 2.8428501594124602e-06, "loss": 0.24772090911865235, "memory(GiB)": 35.31, "step": 1535, "token_acc": 0.9226571027600187, "train_speed(iter/s)": 0.122835 }, { "epoch": 1.9307722461779693, "grad_norm": 0.8464626669883728, "learning_rate": 2.813263123954214e-06, "loss": 0.24349329471588135, "memory(GiB)": 35.31, "step": 1540, "token_acc": 0.9155610012252757, "train_speed(iter/s)": 0.122913 }, { "epoch": 1.9307722461779693, "eval_loss": 0.3201349973678589, "eval_runtime": 29.3804, "eval_samples_per_second": 17.529, "eval_steps_per_second": 4.391, "eval_token_acc": 0.8934423043823463, "step": 1540 }, { "epoch": 1.9370442963543708, "grad_norm": 0.8658434152603149, "learning_rate": 2.7837704675388045e-06, "loss": 0.24450139999389647, "memory(GiB)": 35.31, "step": 1545, "token_acc": 0.904251637293009, "train_speed(iter/s)": 0.122623 }, { "epoch": 1.9433163465307723, "grad_norm": 0.8747855424880981, "learning_rate": 2.7543734630622622e-06, "loss": 0.2556029796600342, "memory(GiB)": 35.31, "step": 1550, "token_acc": 0.9159713945172825, "train_speed(iter/s)": 0.122715 }, { "epoch": 1.9495883967071737, "grad_norm": 0.7818464040756226, "learning_rate": 2.7250733792922997e-06, "loss": 0.2455909252166748, "memory(GiB)": 35.31, "step": 1555, "token_acc": 0.9172009090600726, "train_speed(iter/s)": 0.122785 }, { "epoch": 1.955860446883575, "grad_norm": 0.840918242931366, "learning_rate": 2.6958714808135546e-06, "loss": 0.24802937507629394, "memory(GiB)": 35.31, "step": 1560, "token_acc": 0.9247232757953402, "train_speed(iter/s)": 0.122896 }, { "epoch": 1.955860446883575, "eval_loss": 0.31897813081741333, "eval_runtime": 29.4222, "eval_samples_per_second": 17.504, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8934034390285882, "step": 1560 }, { "epoch": 1.9621324970599765, "grad_norm": 0.7543806433677673, "learning_rate": 2.6667690279730096e-06, "loss": 0.24353570938110353, "memory(GiB)": 35.31, "step": 1565, "token_acc": 0.9054516586629185, "train_speed(iter/s)": 0.122615 }, { "epoch": 1.968404547236378, "grad_norm": 0.8479281663894653, "learning_rate": 2.6377672768256003e-06, "loss": 0.23759632110595702, "memory(GiB)": 35.31, "step": 1570, "token_acc": 0.9129114454256693, "train_speed(iter/s)": 0.122681 }, { "epoch": 1.9746765974127793, "grad_norm": 0.8568554520606995, "learning_rate": 2.608867479080001e-06, "loss": 0.2510775089263916, "memory(GiB)": 35.31, "step": 1575, "token_acc": 0.90967994588629, "train_speed(iter/s)": 0.122771 }, { "epoch": 1.9809486475891807, "grad_norm": 0.8321829438209534, "learning_rate": 2.5800708820446002e-06, "loss": 0.2509664297103882, "memory(GiB)": 35.31, "step": 1580, "token_acc": 0.9128670788253478, "train_speed(iter/s)": 0.122872 }, { "epoch": 1.9809486475891807, "eval_loss": 0.31922805309295654, "eval_runtime": 29.3844, "eval_samples_per_second": 17.526, "eval_steps_per_second": 4.39, "eval_token_acc": 0.8935255872832563, "step": 1580 }, { "epoch": 1.987220697765582, "grad_norm": 0.8347560167312622, "learning_rate": 2.551378728573668e-06, "loss": 0.24763202667236328, "memory(GiB)": 35.31, "step": 1585, "token_acc": 0.9060148795763103, "train_speed(iter/s)": 0.122581 }, { "epoch": 1.9934927479419835, "grad_norm": 0.899139404296875, "learning_rate": 2.5227922570137143e-06, "loss": 0.25942072868347166, "memory(GiB)": 35.31, "step": 1590, "token_acc": 0.9087091290870913, "train_speed(iter/s)": 0.122684 }, { "epoch": 1.999764798118385, "grad_norm": 0.811363935470581, "learning_rate": 2.4943127011500483e-06, "loss": 0.25793845653533937, "memory(GiB)": 35.31, "step": 1595, "token_acc": 0.9161427702291625, "train_speed(iter/s)": 0.122787 }, { "epoch": 2.005017640141121, "grad_norm": 0.8322923183441162, "learning_rate": 2.465941290153514e-06, "loss": 0.2291651725769043, "memory(GiB)": 35.31, "step": 1600, "token_acc": 0.9299772392555897, "train_speed(iter/s)": 0.122933 }, { "epoch": 2.005017640141121, "eval_loss": 0.32035958766937256, "eval_runtime": 29.3301, "eval_samples_per_second": 17.559, "eval_steps_per_second": 4.398, "eval_token_acc": 0.8932868429673142, "step": 1600 }, { "epoch": 2.0112896903175224, "grad_norm": 0.8588864207267761, "learning_rate": 2.4376792485274577e-06, "loss": 0.2054748058319092, "memory(GiB)": 35.31, "step": 1605, "token_acc": 0.9124336719729088, "train_speed(iter/s)": 0.122653 }, { "epoch": 2.017561740493924, "grad_norm": 0.7657065987586975, "learning_rate": 2.409527796054863e-06, "loss": 0.2039170742034912, "memory(GiB)": 35.31, "step": 1610, "token_acc": 0.929469197420671, "train_speed(iter/s)": 0.122752 }, { "epoch": 2.0238337906703254, "grad_norm": 0.8790497183799744, "learning_rate": 2.38148814774572e-06, "loss": 0.18966434001922608, "memory(GiB)": 35.31, "step": 1615, "token_acc": 0.9288194444444444, "train_speed(iter/s)": 0.122823 }, { "epoch": 2.0301058408467267, "grad_norm": 0.9316287040710449, "learning_rate": 2.353561513784566e-06, "loss": 0.2005706548690796, "memory(GiB)": 35.31, "step": 1620, "token_acc": 0.9332470414201184, "train_speed(iter/s)": 0.12292 }, { "epoch": 2.0301058408467267, "eval_loss": 0.3415764570236206, "eval_runtime": 29.383, "eval_samples_per_second": 17.527, "eval_steps_per_second": 4.39, "eval_token_acc": 0.8920264950668761, "step": 1620 }, { "epoch": 2.036377891023128, "grad_norm": 0.9010135531425476, "learning_rate": 2.325749099478277e-06, "loss": 0.20035338401794434, "memory(GiB)": 35.31, "step": 1625, "token_acc": 0.9104487073829418, "train_speed(iter/s)": 0.122648 }, { "epoch": 2.04264994119953, "grad_norm": 0.8294356465339661, "learning_rate": 2.29805210520403e-06, "loss": 0.19484407901763917, "memory(GiB)": 35.31, "step": 1630, "token_acc": 0.9329941210850208, "train_speed(iter/s)": 0.122735 }, { "epoch": 2.048921991375931, "grad_norm": 0.8767816424369812, "learning_rate": 2.270471726357501e-06, "loss": 0.19034754037857055, "memory(GiB)": 35.31, "step": 1635, "token_acc": 0.9369187962217178, "train_speed(iter/s)": 0.122807 }, { "epoch": 2.0551940415523324, "grad_norm": 0.9023549556732178, "learning_rate": 2.243009153301276e-06, "loss": 0.19750189781188965, "memory(GiB)": 35.31, "step": 1640, "token_acc": 0.930916058130287, "train_speed(iter/s)": 0.122894 }, { "epoch": 2.0551940415523324, "eval_loss": 0.3384056091308594, "eval_runtime": 29.3915, "eval_samples_per_second": 17.522, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8916600503028721, "step": 1640 }, { "epoch": 2.0614660917287337, "grad_norm": 0.744107723236084, "learning_rate": 2.215665571313468e-06, "loss": 0.19824939966201782, "memory(GiB)": 35.31, "step": 1645, "token_acc": 0.9114719614180904, "train_speed(iter/s)": 0.122646 }, { "epoch": 2.067738141905135, "grad_norm": 0.8998700380325317, "learning_rate": 2.188442160536562e-06, "loss": 0.19962520599365235, "memory(GiB)": 35.31, "step": 1650, "token_acc": 0.9246565057436744, "train_speed(iter/s)": 0.122714 }, { "epoch": 2.074010192081537, "grad_norm": 0.8231090307235718, "learning_rate": 2.1613400959264845e-06, "loss": 0.1893744945526123, "memory(GiB)": 35.31, "step": 1655, "token_acc": 0.9325602140945585, "train_speed(iter/s)": 0.122783 }, { "epoch": 2.080282242257938, "grad_norm": 1.0551718473434448, "learning_rate": 2.1343605472018954e-06, "loss": 0.19394491910934447, "memory(GiB)": 35.31, "step": 1660, "token_acc": 0.9273977016432177, "train_speed(iter/s)": 0.122871 }, { "epoch": 2.080282242257938, "eval_loss": 0.34014904499053955, "eval_runtime": 29.2198, "eval_samples_per_second": 17.625, "eval_steps_per_second": 4.415, "eval_token_acc": 0.8916989156566302, "step": 1660 }, { "epoch": 2.0865542924343394, "grad_norm": 0.9032732844352722, "learning_rate": 2.1075046787936842e-06, "loss": 0.20794956684112548, "memory(GiB)": 35.31, "step": 1665, "token_acc": 0.9062622258157065, "train_speed(iter/s)": 0.122603 }, { "epoch": 2.0928263426107407, "grad_norm": 0.7685064673423767, "learning_rate": 2.0807736497947436e-06, "loss": 0.19878649711608887, "memory(GiB)": 35.31, "step": 1670, "token_acc": 0.9290486321054007, "train_speed(iter/s)": 0.122682 }, { "epoch": 2.0990983927871425, "grad_norm": 0.7682995200157166, "learning_rate": 2.0541686139099164e-06, "loss": 0.19356679916381836, "memory(GiB)": 35.31, "step": 1675, "token_acc": 0.922647425159988, "train_speed(iter/s)": 0.122764 }, { "epoch": 2.105370442963544, "grad_norm": 0.855737030506134, "learning_rate": 2.0276907194062167e-06, "loss": 0.20157759189605712, "memory(GiB)": 35.31, "step": 1680, "token_acc": 0.9333961552958915, "train_speed(iter/s)": 0.122849 }, { "epoch": 2.105370442963544, "eval_loss": 0.3408814072608948, "eval_runtime": 29.4394, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8919487643593602, "step": 1680 }, { "epoch": 2.111642493139945, "grad_norm": 0.8940383791923523, "learning_rate": 2.0013411090632638e-06, "loss": 0.1950603485107422, "memory(GiB)": 35.31, "step": 1685, "token_acc": 0.9098292220113852, "train_speed(iter/s)": 0.122586 }, { "epoch": 2.1179145433163464, "grad_norm": 0.8400514721870422, "learning_rate": 1.9751209201239696e-06, "loss": 0.19134198427200316, "memory(GiB)": 35.31, "step": 1690, "token_acc": 0.9370006770480704, "train_speed(iter/s)": 0.122686 }, { "epoch": 2.1241865934927477, "grad_norm": 0.7992594838142395, "learning_rate": 1.9490312842454425e-06, "loss": 0.18724431991577148, "memory(GiB)": 35.31, "step": 1695, "token_acc": 0.9336970593674866, "train_speed(iter/s)": 0.12277 }, { "epoch": 2.1304586436691495, "grad_norm": 0.8192344903945923, "learning_rate": 1.9230733274501525e-06, "loss": 0.19678905010223388, "memory(GiB)": 35.31, "step": 1700, "token_acc": 0.931975970358646, "train_speed(iter/s)": 0.122839 }, { "epoch": 2.1304586436691495, "eval_loss": 0.3409229516983032, "eval_runtime": 29.4398, "eval_samples_per_second": 17.493, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8916822590764482, "step": 1700 }, { "epoch": 2.136730693845551, "grad_norm": 0.8598861694335938, "learning_rate": 1.8972481700773388e-06, "loss": 0.2011383056640625, "memory(GiB)": 35.31, "step": 1705, "token_acc": 0.9084407844836927, "train_speed(iter/s)": 0.122585 }, { "epoch": 2.143002744021952, "grad_norm": 0.851450502872467, "learning_rate": 1.8715569267346368e-06, "loss": 0.2008237361907959, "memory(GiB)": 35.31, "step": 1710, "token_acc": 0.934470600804872, "train_speed(iter/s)": 0.122661 }, { "epoch": 2.1492747941983534, "grad_norm": 0.8260136246681213, "learning_rate": 1.846000706249997e-06, "loss": 0.19412180185317993, "memory(GiB)": 35.31, "step": 1715, "token_acc": 0.934304410514252, "train_speed(iter/s)": 0.122733 }, { "epoch": 2.155546844374755, "grad_norm": 0.9584734439849854, "learning_rate": 1.8205806116238055e-06, "loss": 0.19354283809661865, "memory(GiB)": 35.31, "step": 1720, "token_acc": 0.9366102705522844, "train_speed(iter/s)": 0.122798 }, { "epoch": 2.155546844374755, "eval_loss": 0.34049588441848755, "eval_runtime": 29.3984, "eval_samples_per_second": 17.518, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8918266161046922, "step": 1720 }, { "epoch": 2.1618188945511565, "grad_norm": 0.8262621164321899, "learning_rate": 1.7952977399812988e-06, "loss": 0.19691638946533202, "memory(GiB)": 35.31, "step": 1725, "token_acc": 0.9087991263467711, "train_speed(iter/s)": 0.122536 }, { "epoch": 2.168090944727558, "grad_norm": 0.8243085145950317, "learning_rate": 1.7701531825251888e-06, "loss": 0.20423364639282227, "memory(GiB)": 35.31, "step": 1730, "token_acc": 0.9323758228605625, "train_speed(iter/s)": 0.122637 }, { "epoch": 2.174362994903959, "grad_norm": 0.8588405847549438, "learning_rate": 1.7451480244885938e-06, "loss": 0.20565853118896485, "memory(GiB)": 35.31, "step": 1735, "token_acc": 0.9293734801888142, "train_speed(iter/s)": 0.122722 }, { "epoch": 2.1806350450803604, "grad_norm": 0.8922966122627258, "learning_rate": 1.720283345088178e-06, "loss": 0.20658740997314454, "memory(GiB)": 35.31, "step": 1740, "token_acc": 0.9308320373250388, "train_speed(iter/s)": 0.122814 }, { "epoch": 2.1806350450803604, "eval_loss": 0.33896124362945557, "eval_runtime": 29.4645, "eval_samples_per_second": 17.479, "eval_steps_per_second": 4.378, "eval_token_acc": 0.8921264345479681, "step": 1740 }, { "epoch": 2.186907095256762, "grad_norm": 0.8128474354743958, "learning_rate": 1.695560217477582e-06, "loss": 0.19282236099243164, "memory(GiB)": 35.31, "step": 1745, "token_acc": 0.9095768254522502, "train_speed(iter/s)": 0.122545 }, { "epoch": 2.1931791454331635, "grad_norm": 0.7443967461585999, "learning_rate": 1.6709797087011066e-06, "loss": 0.1943533182144165, "memory(GiB)": 35.31, "step": 1750, "token_acc": 0.9320897479117504, "train_speed(iter/s)": 0.122626 }, { "epoch": 2.199451195609565, "grad_norm": 0.7897108793258667, "learning_rate": 1.6465428796476584e-06, "loss": 0.1893579602241516, "memory(GiB)": 35.31, "step": 1755, "token_acc": 0.9279509242867141, "train_speed(iter/s)": 0.122698 }, { "epoch": 2.205723245785966, "grad_norm": 0.9222955703735352, "learning_rate": 1.6222507850049602e-06, "loss": 0.20297529697418212, "memory(GiB)": 35.31, "step": 1760, "token_acc": 0.9288448547624409, "train_speed(iter/s)": 0.122793 }, { "epoch": 2.205723245785966, "eval_loss": 0.34091004729270935, "eval_runtime": 29.4544, "eval_samples_per_second": 17.485, "eval_steps_per_second": 4.38, "eval_token_acc": 0.8917488853971761, "step": 1760 }, { "epoch": 2.211995295962368, "grad_norm": 0.814859926700592, "learning_rate": 1.598104473214031e-06, "loss": 0.20043063163757324, "memory(GiB)": 35.31, "step": 1765, "token_acc": 0.9068337921265734, "train_speed(iter/s)": 0.122535 }, { "epoch": 2.218267346138769, "grad_norm": 0.8000169396400452, "learning_rate": 1.5741049864239383e-06, "loss": 0.19192855358123778, "memory(GiB)": 35.31, "step": 1770, "token_acc": 0.9305805924968539, "train_speed(iter/s)": 0.122614 }, { "epoch": 2.2245393963151705, "grad_norm": 0.9338832497596741, "learning_rate": 1.550253360446815e-06, "loss": 0.1987203598022461, "memory(GiB)": 35.31, "step": 1775, "token_acc": 0.938174715909091, "train_speed(iter/s)": 0.122706 }, { "epoch": 2.230811446491572, "grad_norm": 0.8646144866943359, "learning_rate": 1.5265506247131617e-06, "loss": 0.19849686622619628, "memory(GiB)": 35.31, "step": 1780, "token_acc": 0.9281764423845293, "train_speed(iter/s)": 0.122792 }, { "epoch": 2.230811446491572, "eval_loss": 0.3393869996070862, "eval_runtime": 29.4398, "eval_samples_per_second": 17.493, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8924151486044561, "step": 1780 }, { "epoch": 2.2370834966679736, "grad_norm": 0.865871787071228, "learning_rate": 1.5029978022274067e-06, "loss": 0.21043546199798585, "memory(GiB)": 35.31, "step": 1785, "token_acc": 0.9090308690081499, "train_speed(iter/s)": 0.122528 }, { "epoch": 2.243355546844375, "grad_norm": 0.8752254247665405, "learning_rate": 1.47959590952376e-06, "loss": 0.19589321613311766, "memory(GiB)": 35.31, "step": 1790, "token_acc": 0.9334750193974969, "train_speed(iter/s)": 0.122614 }, { "epoch": 2.249627597020776, "grad_norm": 0.8532087802886963, "learning_rate": 1.4563459566223358e-06, "loss": 0.19277225732803344, "memory(GiB)": 35.31, "step": 1795, "token_acc": 0.9376329562721594, "train_speed(iter/s)": 0.122703 }, { "epoch": 2.2558996471971775, "grad_norm": 0.8500798344612122, "learning_rate": 1.4332489469855698e-06, "loss": 0.19048466682434081, "memory(GiB)": 35.31, "step": 1800, "token_acc": 0.9341178420485622, "train_speed(iter/s)": 0.122772 }, { "epoch": 2.2558996471971775, "eval_loss": 0.3418830335140228, "eval_runtime": 29.4345, "eval_samples_per_second": 17.496, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8921042257743922, "step": 1800 }, { "epoch": 2.262171697373579, "grad_norm": 0.9023928046226501, "learning_rate": 1.4103058774748923e-06, "loss": 0.20214588642120362, "memory(GiB)": 35.31, "step": 1805, "token_acc": 0.9073493650364767, "train_speed(iter/s)": 0.122533 }, { "epoch": 2.2684437475499806, "grad_norm": 0.874862790107727, "learning_rate": 1.3875177383077233e-06, "loss": 0.19704325199127198, "memory(GiB)": 35.31, "step": 1810, "token_acc": 0.9309168859008087, "train_speed(iter/s)": 0.122602 }, { "epoch": 2.274715797726382, "grad_norm": 0.8751354217529297, "learning_rate": 1.3648855130147216e-06, "loss": 0.1942400574684143, "memory(GiB)": 35.31, "step": 1815, "token_acc": 0.9344223881096643, "train_speed(iter/s)": 0.122696 }, { "epoch": 2.280987847902783, "grad_norm": 0.8818191885948181, "learning_rate": 1.3424101783973403e-06, "loss": 0.19853044748306276, "memory(GiB)": 35.31, "step": 1820, "token_acc": 0.9334029143751756, "train_speed(iter/s)": 0.122774 }, { "epoch": 2.280987847902783, "eval_loss": 0.34076765179634094, "eval_runtime": 29.4137, "eval_samples_per_second": 17.509, "eval_steps_per_second": 4.386, "eval_token_acc": 0.8922596871894242, "step": 1820 }, { "epoch": 2.2872598980791845, "grad_norm": 0.8647897839546204, "learning_rate": 1.3200927044856714e-06, "loss": 0.20430846214294435, "memory(GiB)": 35.31, "step": 1825, "token_acc": 0.9105302740430922, "train_speed(iter/s)": 0.122528 }, { "epoch": 2.293531948255586, "grad_norm": 0.8771964907646179, "learning_rate": 1.2979340544965745e-06, "loss": 0.19436899423599244, "memory(GiB)": 35.31, "step": 1830, "token_acc": 0.9332179074944348, "train_speed(iter/s)": 0.122614 }, { "epoch": 2.2998039984319876, "grad_norm": 0.8589149117469788, "learning_rate": 1.2759351847921053e-06, "loss": 0.20428872108459473, "memory(GiB)": 35.31, "step": 1835, "token_acc": 0.9258984534361904, "train_speed(iter/s)": 0.122693 }, { "epoch": 2.306076048608389, "grad_norm": 0.8475139737129211, "learning_rate": 1.25409704483824e-06, "loss": 0.20542593002319337, "memory(GiB)": 35.31, "step": 1840, "token_acc": 0.9302525044599973, "train_speed(iter/s)": 0.122772 }, { "epoch": 2.306076048608389, "eval_loss": 0.3399674594402313, "eval_runtime": 29.4357, "eval_samples_per_second": 17.496, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8921819564819081, "step": 1840 }, { "epoch": 2.31234809878479, "grad_norm": 0.8466213345527649, "learning_rate": 1.232420577163902e-06, "loss": 0.1930600881576538, "memory(GiB)": 35.31, "step": 1845, "token_acc": 0.9117772908211558, "train_speed(iter/s)": 0.122528 }, { "epoch": 2.3186201489611915, "grad_norm": 0.8181382417678833, "learning_rate": 1.2109067173202731e-06, "loss": 0.20191946029663085, "memory(GiB)": 35.31, "step": 1850, "token_acc": 0.9309912619968486, "train_speed(iter/s)": 0.122613 }, { "epoch": 2.3248921991375933, "grad_norm": 0.9908396601676941, "learning_rate": 1.1895563938404203e-06, "loss": 0.20410680770874023, "memory(GiB)": 35.31, "step": 1855, "token_acc": 0.9304864290928396, "train_speed(iter/s)": 0.122689 }, { "epoch": 2.3311642493139946, "grad_norm": 0.9089633822441101, "learning_rate": 1.1683705281992202e-06, "loss": 0.19948985576629638, "memory(GiB)": 35.31, "step": 1860, "token_acc": 0.9321098763499766, "train_speed(iter/s)": 0.122765 }, { "epoch": 2.3311642493139946, "eval_loss": 0.3399353623390198, "eval_runtime": 29.1818, "eval_samples_per_second": 17.648, "eval_steps_per_second": 4.421, "eval_token_acc": 0.8923929398308802, "step": 1860 }, { "epoch": 2.337436299490396, "grad_norm": 0.8357407450675964, "learning_rate": 1.1473500347735927e-06, "loss": 0.21080975532531737, "memory(GiB)": 35.31, "step": 1865, "token_acc": 0.9067027184646176, "train_speed(iter/s)": 0.122557 }, { "epoch": 2.343708349666797, "grad_norm": 0.9168654084205627, "learning_rate": 1.1264958208030224e-06, "loss": 0.20281777381896973, "memory(GiB)": 35.31, "step": 1870, "token_acc": 0.9303914590747331, "train_speed(iter/s)": 0.122644 }, { "epoch": 2.349980399843199, "grad_norm": 0.870764970779419, "learning_rate": 1.105808786350423e-06, "loss": 0.20570874214172363, "memory(GiB)": 35.31, "step": 1875, "token_acc": 0.9311500593533971, "train_speed(iter/s)": 0.122727 }, { "epoch": 2.3562524500196003, "grad_norm": 0.9401370286941528, "learning_rate": 1.085289824263273e-06, "loss": 0.19900286197662354, "memory(GiB)": 35.31, "step": 1880, "token_acc": 0.9296601826391011, "train_speed(iter/s)": 0.122811 }, { "epoch": 2.3562524500196003, "eval_loss": 0.33901476860046387, "eval_runtime": 29.2027, "eval_samples_per_second": 17.635, "eval_steps_per_second": 4.417, "eval_token_acc": 0.8928149065288242, "step": 1880 }, { "epoch": 2.3625245001960016, "grad_norm": 0.8954982161521912, "learning_rate": 1.0649398201350907e-06, "loss": 0.19722338914871215, "memory(GiB)": 35.31, "step": 1885, "token_acc": 0.9103969754253308, "train_speed(iter/s)": 0.12257 }, { "epoch": 2.368796550372403, "grad_norm": 0.8362477421760559, "learning_rate": 1.044759652267207e-06, "loss": 0.1907820224761963, "memory(GiB)": 35.31, "step": 1890, "token_acc": 0.9338316722037652, "train_speed(iter/s)": 0.122633 }, { "epoch": 2.375068600548804, "grad_norm": 0.8261101245880127, "learning_rate": 1.024750191630864e-06, "loss": 0.20354986190795898, "memory(GiB)": 35.31, "step": 1895, "token_acc": 0.9313385826771654, "train_speed(iter/s)": 0.122717 }, { "epoch": 2.381340650725206, "grad_norm": 0.8243074417114258, "learning_rate": 1.0049123018296158e-06, "loss": 0.1990055799484253, "memory(GiB)": 35.31, "step": 1900, "token_acc": 0.931945788964182, "train_speed(iter/s)": 0.122795 }, { "epoch": 2.381340650725206, "eval_loss": 0.3398912847042084, "eval_runtime": 29.3886, "eval_samples_per_second": 17.524, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8929537113636742, "step": 1900 }, { "epoch": 2.3876127009016073, "grad_norm": 0.8291247487068176, "learning_rate": 9.852468390620624e-07, "loss": 0.19908733367919923, "memory(GiB)": 35.31, "step": 1905, "token_acc": 0.9119964458846742, "train_speed(iter/s)": 0.122563 }, { "epoch": 2.3938847510780086, "grad_norm": 0.9071137309074402, "learning_rate": 9.65754652084896e-07, "loss": 0.20061676502227782, "memory(GiB)": 35.31, "step": 1910, "token_acc": 0.9315043133770224, "train_speed(iter/s)": 0.122633 }, { "epoch": 2.40015680125441, "grad_norm": 0.8123340606689453, "learning_rate": 9.464365821762611e-07, "loss": 0.2007359504699707, "memory(GiB)": 35.31, "step": 1915, "token_acc": 0.9310011111509373, "train_speed(iter/s)": 0.122704 }, { "epoch": 2.406428851430811, "grad_norm": 0.9131314754486084, "learning_rate": 9.272934630994579e-07, "loss": 0.2020493984222412, "memory(GiB)": 35.31, "step": 1920, "token_acc": 0.9327723569957926, "train_speed(iter/s)": 0.122789 }, { "epoch": 2.406428851430811, "eval_loss": 0.3394792377948761, "eval_runtime": 29.4324, "eval_samples_per_second": 17.498, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8929315025900982, "step": 1920 }, { "epoch": 2.412700901607213, "grad_norm": 0.8938679099082947, "learning_rate": 9.083261210669458e-07, "loss": 0.19544891119003296, "memory(GiB)": 35.31, "step": 1925, "token_acc": 0.9099344547105741, "train_speed(iter/s)": 0.122556 }, { "epoch": 2.4189729517836143, "grad_norm": 0.843400239944458, "learning_rate": 8.895353747046903e-07, "loss": 0.20389878749847412, "memory(GiB)": 35.31, "step": 1930, "token_acc": 0.9224234943914038, "train_speed(iter/s)": 0.122642 }, { "epoch": 2.4252450019600156, "grad_norm": 0.9117215871810913, "learning_rate": 8.70922035016829e-07, "loss": 0.20622677803039552, "memory(GiB)": 35.31, "step": 1935, "token_acc": 0.9284166479862512, "train_speed(iter/s)": 0.122721 }, { "epoch": 2.4315170521364173, "grad_norm": 0.9367381930351257, "learning_rate": 8.524869053506718e-07, "loss": 0.20433897972106935, "memory(GiB)": 35.31, "step": 1940, "token_acc": 0.9362286970863112, "train_speed(iter/s)": 0.122802 }, { "epoch": 2.4315170521364173, "eval_loss": 0.33946192264556885, "eval_runtime": 29.2561, "eval_samples_per_second": 17.603, "eval_steps_per_second": 4.409, "eval_token_acc": 0.8925650578260942, "step": 1940 }, { "epoch": 2.4377891023128186, "grad_norm": 0.8426703214645386, "learning_rate": 8.342307813620254e-07, "loss": 0.1967821955680847, "memory(GiB)": 35.31, "step": 1945, "token_acc": 0.9112145208413204, "train_speed(iter/s)": 0.122596 }, { "epoch": 2.44406115248922, "grad_norm": 0.845736563205719, "learning_rate": 8.161544509808522e-07, "loss": 0.1979314088821411, "memory(GiB)": 35.31, "step": 1950, "token_acc": 0.9301829610506229, "train_speed(iter/s)": 0.122667 }, { "epoch": 2.4503332026656213, "grad_norm": 0.9228907823562622, "learning_rate": 7.982586943772663e-07, "loss": 0.19447792768478395, "memory(GiB)": 35.31, "step": 1955, "token_acc": 0.9315416813363063, "train_speed(iter/s)": 0.122739 }, { "epoch": 2.4566052528420226, "grad_norm": 0.8696323037147522, "learning_rate": 7.805442839278643e-07, "loss": 0.2015920639038086, "memory(GiB)": 35.31, "step": 1960, "token_acc": 0.9276550395540435, "train_speed(iter/s)": 0.12281 }, { "epoch": 2.4566052528420226, "eval_loss": 0.3402659595012665, "eval_runtime": 29.1749, "eval_samples_per_second": 17.652, "eval_steps_per_second": 4.422, "eval_token_acc": 0.8927316236279143, "step": 1960 }, { "epoch": 2.4628773030184243, "grad_norm": 0.8550149202346802, "learning_rate": 7.630119841823808e-07, "loss": 0.19550955295562744, "memory(GiB)": 35.31, "step": 1965, "token_acc": 0.9109677753600663, "train_speed(iter/s)": 0.122595 }, { "epoch": 2.4691493531948256, "grad_norm": 0.8550127148628235, "learning_rate": 7.456625518306976e-07, "loss": 0.20470118522644043, "memory(GiB)": 35.31, "step": 1970, "token_acc": 0.9361644784969007, "train_speed(iter/s)": 0.122672 }, { "epoch": 2.475421403371227, "grad_norm": 0.8760136365890503, "learning_rate": 7.284967356701839e-07, "loss": 0.19109119176864625, "memory(GiB)": 35.31, "step": 1975, "token_acc": 0.9317175239755885, "train_speed(iter/s)": 0.122722 }, { "epoch": 2.4816934535476283, "grad_norm": 0.8244690299034119, "learning_rate": 7.115152765733768e-07, "loss": 0.19548358917236328, "memory(GiB)": 35.31, "step": 1980, "token_acc": 0.9295848857777276, "train_speed(iter/s)": 0.122806 }, { "epoch": 2.4816934535476283, "eval_loss": 0.33928003907203674, "eval_runtime": 29.3453, "eval_samples_per_second": 17.55, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8926039231798522, "step": 1980 }, { "epoch": 2.4879655037240296, "grad_norm": 0.8801511526107788, "learning_rate": 6.94718907456009e-07, "loss": 0.20571486949920653, "memory(GiB)": 35.31, "step": 1985, "token_acc": 0.9119239420411153, "train_speed(iter/s)": 0.122584 }, { "epoch": 2.4942375539004313, "grad_norm": 0.8793259263038635, "learning_rate": 6.781083532453702e-07, "loss": 0.19008111953735352, "memory(GiB)": 35.31, "step": 1990, "token_acc": 0.9300288504988008, "train_speed(iter/s)": 0.122656 }, { "epoch": 2.5005096040768326, "grad_norm": 0.8719813823699951, "learning_rate": 6.61684330849025e-07, "loss": 0.20494444370269777, "memory(GiB)": 35.31, "step": 1995, "token_acc": 0.9264506459210621, "train_speed(iter/s)": 0.122733 }, { "epoch": 2.506781654253234, "grad_norm": 0.8206390738487244, "learning_rate": 6.454475491238682e-07, "loss": 0.21190786361694336, "memory(GiB)": 35.31, "step": 2000, "token_acc": 0.9282438731017892, "train_speed(iter/s)": 0.122814 }, { "epoch": 2.506781654253234, "eval_loss": 0.3389175236225128, "eval_runtime": 29.4243, "eval_samples_per_second": 17.503, "eval_steps_per_second": 4.384, "eval_token_acc": 0.8926705495005802, "step": 2000 }, { "epoch": 2.5130537044296353, "grad_norm": 0.8637955188751221, "learning_rate": 6.293987088455355e-07, "loss": 0.18972909450531006, "memory(GiB)": 35.31, "step": 2005, "token_acc": 0.9111111111111111, "train_speed(iter/s)": 0.122588 }, { "epoch": 2.5193257546060366, "grad_norm": 0.8693656325340271, "learning_rate": 6.135385026781476e-07, "loss": 0.19535259008407593, "memory(GiB)": 35.31, "step": 2010, "token_acc": 0.9307692307692308, "train_speed(iter/s)": 0.122665 }, { "epoch": 2.5255978047824383, "grad_norm": 0.8843157887458801, "learning_rate": 5.978676151444285e-07, "loss": 0.19748587608337403, "memory(GiB)": 35.31, "step": 2015, "token_acc": 0.932608875299562, "train_speed(iter/s)": 0.122743 }, { "epoch": 2.5318698549588396, "grad_norm": 0.7771234512329102, "learning_rate": 5.823867225961516e-07, "loss": 0.20223774909973144, "memory(GiB)": 35.31, "step": 2020, "token_acc": 0.9318809052241956, "train_speed(iter/s)": 0.122809 }, { "epoch": 2.5318698549588396, "eval_loss": 0.3387446999549866, "eval_runtime": 29.4213, "eval_samples_per_second": 17.504, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8927593845948842, "step": 2020 }, { "epoch": 2.538141905135241, "grad_norm": 0.8923653364181519, "learning_rate": 5.670964931849521e-07, "loss": 0.20152680873870848, "memory(GiB)": 35.31, "step": 2025, "token_acc": 0.9079145999568686, "train_speed(iter/s)": 0.122607 }, { "epoch": 2.5444139553116427, "grad_norm": 0.9194443225860596, "learning_rate": 5.519975868334914e-07, "loss": 0.18218059539794923, "memory(GiB)": 35.31, "step": 2030, "token_acc": 0.9371556217423679, "train_speed(iter/s)": 0.122682 }, { "epoch": 2.550686005488044, "grad_norm": 0.9134296774864197, "learning_rate": 5.370906552069721e-07, "loss": 0.21519947052001953, "memory(GiB)": 35.31, "step": 2035, "token_acc": 0.9308896388310476, "train_speed(iter/s)": 0.122768 }, { "epoch": 2.5569580556644453, "grad_norm": 0.8432523608207703, "learning_rate": 5.22376341685013e-07, "loss": 0.19406379461288453, "memory(GiB)": 35.31, "step": 2040, "token_acc": 0.9379203093476799, "train_speed(iter/s)": 0.122832 }, { "epoch": 2.5569580556644453, "eval_loss": 0.3386911153793335, "eval_runtime": 29.3737, "eval_samples_per_second": 17.533, "eval_steps_per_second": 4.392, "eval_token_acc": 0.8926372363402162, "step": 2040 }, { "epoch": 2.5632301058408467, "grad_norm": 0.8082075715065002, "learning_rate": 5.07855281333881e-07, "loss": 0.20029840469360352, "memory(GiB)": 35.31, "step": 2045, "token_acc": 0.9093570973901973, "train_speed(iter/s)": 0.122621 }, { "epoch": 2.569502156017248, "grad_norm": 0.8321412801742554, "learning_rate": 4.935281008790843e-07, "loss": 0.19831552505493164, "memory(GiB)": 35.31, "step": 2050, "token_acc": 0.9315964443159644, "train_speed(iter/s)": 0.122696 }, { "epoch": 2.5757742061936497, "grad_norm": 0.836764395236969, "learning_rate": 4.793954186783195e-07, "loss": 0.20497620105743408, "memory(GiB)": 35.31, "step": 2055, "token_acc": 0.9296713578417996, "train_speed(iter/s)": 0.122776 }, { "epoch": 2.582046256370051, "grad_norm": 0.8858775496482849, "learning_rate": 4.6545784469478386e-07, "loss": 0.1925274133682251, "memory(GiB)": 35.31, "step": 2060, "token_acc": 0.9290921363482957, "train_speed(iter/s)": 0.122853 }, { "epoch": 2.582046256370051, "eval_loss": 0.33907607197761536, "eval_runtime": 29.2969, "eval_samples_per_second": 17.579, "eval_steps_per_second": 4.403, "eval_token_acc": 0.8925872665996702, "step": 2060 }, { "epoch": 2.5883183065464523, "grad_norm": 0.8445903062820435, "learning_rate": 4.5171598047085153e-07, "loss": 0.2032531976699829, "memory(GiB)": 35.31, "step": 2065, "token_acc": 0.9114080767856231, "train_speed(iter/s)": 0.122651 }, { "epoch": 2.5945903567228537, "grad_norm": 0.8861658573150635, "learning_rate": 4.381704191021119e-07, "loss": 0.20241761207580566, "memory(GiB)": 35.31, "step": 2070, "token_acc": 0.9290917036929128, "train_speed(iter/s)": 0.12273 }, { "epoch": 2.600862406899255, "grad_norm": 0.9306016564369202, "learning_rate": 4.248217452117653e-07, "loss": 0.19754456281661986, "memory(GiB)": 35.31, "step": 2075, "token_acc": 0.9342051643018561, "train_speed(iter/s)": 0.122795 }, { "epoch": 2.6071344570756567, "grad_norm": 0.8931999206542969, "learning_rate": 4.1167053492540023e-07, "loss": 0.20364174842834473, "memory(GiB)": 35.31, "step": 2080, "token_acc": 0.927598729005901, "train_speed(iter/s)": 0.122864 }, { "epoch": 2.6071344570756567, "eval_loss": 0.3389338552951813, "eval_runtime": 29.4394, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8927205192411262, "step": 2080 }, { "epoch": 2.613406507252058, "grad_norm": 0.9079094529151917, "learning_rate": 3.987173558461199e-07, "loss": 0.2029134750366211, "memory(GiB)": 35.31, "step": 2085, "token_acc": 0.9080656826765958, "train_speed(iter/s)": 0.122659 }, { "epoch": 2.6196785574284593, "grad_norm": 0.8731334209442139, "learning_rate": 3.8596276703004974e-07, "loss": 0.1946258783340454, "memory(GiB)": 35.31, "step": 2090, "token_acc": 0.9326196473551638, "train_speed(iter/s)": 0.122729 }, { "epoch": 2.625950607604861, "grad_norm": 0.858302652835846, "learning_rate": 3.7340731896220393e-07, "loss": 0.19946659803390504, "memory(GiB)": 35.31, "step": 2095, "token_acc": 0.9289610347192736, "train_speed(iter/s)": 0.1228 }, { "epoch": 2.6322226577812624, "grad_norm": 0.9710678458213806, "learning_rate": 3.6105155353273305e-07, "loss": 0.19640454053878784, "memory(GiB)": 35.31, "step": 2100, "token_acc": 0.9349016126645954, "train_speed(iter/s)": 0.122853 }, { "epoch": 2.6322226577812624, "eval_loss": 0.33849242329597473, "eval_runtime": 29.4901, "eval_samples_per_second": 17.464, "eval_steps_per_second": 4.374, "eval_token_acc": 0.8928537718825822, "step": 2100 }, { "epoch": 2.6384947079576637, "grad_norm": 0.9348801970481873, "learning_rate": 3.488960040135303e-07, "loss": 0.19354541301727296, "memory(GiB)": 35.31, "step": 2105, "token_acc": 0.9099158577870177, "train_speed(iter/s)": 0.122646 }, { "epoch": 2.644766758134065, "grad_norm": 0.8546217679977417, "learning_rate": 3.369411950352175e-07, "loss": 0.19183013439178467, "memory(GiB)": 35.31, "step": 2110, "token_acc": 0.9336448231183007, "train_speed(iter/s)": 0.122714 }, { "epoch": 2.6510388083104663, "grad_norm": 0.8211365938186646, "learning_rate": 3.251876425645051e-07, "loss": 0.1970944881439209, "memory(GiB)": 35.31, "step": 2115, "token_acc": 0.9344840840001397, "train_speed(iter/s)": 0.122778 }, { "epoch": 2.657310858486868, "grad_norm": 0.9623558521270752, "learning_rate": 3.136358538819162e-07, "loss": 0.21173155307769775, "memory(GiB)": 35.31, "step": 2120, "token_acc": 0.9270025343675601, "train_speed(iter/s)": 0.122858 }, { "epoch": 2.657310858486868, "eval_loss": 0.33901315927505493, "eval_runtime": 29.3435, "eval_samples_per_second": 17.551, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8928426674957942, "step": 2120 }, { "epoch": 2.6635829086632694, "grad_norm": 0.8673403263092041, "learning_rate": 3.0228632755990197e-07, "loss": 0.1995314836502075, "memory(GiB)": 35.31, "step": 2125, "token_acc": 0.9100443616846486, "train_speed(iter/s)": 0.122656 }, { "epoch": 2.6698549588396707, "grad_norm": 0.8678857684135437, "learning_rate": 2.911395534413147e-07, "loss": 0.20220797061920165, "memory(GiB)": 35.31, "step": 2130, "token_acc": 0.9269149418341139, "train_speed(iter/s)": 0.122725 }, { "epoch": 2.676127009016072, "grad_norm": 0.8663479089736938, "learning_rate": 2.8019601261827123e-07, "loss": 0.19415628910064697, "memory(GiB)": 35.31, "step": 2135, "token_acc": 0.9364371994839921, "train_speed(iter/s)": 0.122789 }, { "epoch": 2.6823990591924733, "grad_norm": 0.9166416525840759, "learning_rate": 2.694561774113863e-07, "loss": 0.2000800371170044, "memory(GiB)": 35.31, "step": 2140, "token_acc": 0.9343491222259026, "train_speed(iter/s)": 0.122866 }, { "epoch": 2.6823990591924733, "eval_loss": 0.3389025926589966, "eval_runtime": 29.1862, "eval_samples_per_second": 17.645, "eval_steps_per_second": 4.42, "eval_token_acc": 0.8927649367882782, "step": 2140 }, { "epoch": 2.688671109368875, "grad_norm": 0.924248993396759, "learning_rate": 2.5892051134939256e-07, "loss": 0.19067001342773438, "memory(GiB)": 35.31, "step": 2145, "token_acc": 0.9105434393007803, "train_speed(iter/s)": 0.122659 }, { "epoch": 2.6949431595452764, "grad_norm": 0.9541825652122498, "learning_rate": 2.485894691491253e-07, "loss": 0.19531885385513306, "memory(GiB)": 35.31, "step": 2150, "token_acc": 0.9327190236696796, "train_speed(iter/s)": 0.122731 }, { "epoch": 2.7012152097216777, "grad_norm": 0.7837492227554321, "learning_rate": 2.384634966959076e-07, "loss": 0.1978324294090271, "memory(GiB)": 35.31, "step": 2155, "token_acc": 0.9313808767588728, "train_speed(iter/s)": 0.122802 }, { "epoch": 2.707487259898079, "grad_norm": 0.9158398509025574, "learning_rate": 2.2854303102429808e-07, "loss": 0.19039928913116455, "memory(GiB)": 35.31, "step": 2160, "token_acc": 0.9376352705410822, "train_speed(iter/s)": 0.122861 }, { "epoch": 2.707487259898079, "eval_loss": 0.33926960825920105, "eval_runtime": 29.5457, "eval_samples_per_second": 17.431, "eval_steps_per_second": 4.366, "eval_token_acc": 0.8928482196891883, "step": 2160 }, { "epoch": 2.7137593100744803, "grad_norm": 0.9085947871208191, "learning_rate": 2.1882850029923463e-07, "loss": 0.1978399395942688, "memory(GiB)": 35.31, "step": 2165, "token_acc": 0.9107124038360207, "train_speed(iter/s)": 0.122668 }, { "epoch": 2.720031360250882, "grad_norm": 0.8751495480537415, "learning_rate": 2.093203237975483e-07, "loss": 0.199626362323761, "memory(GiB)": 35.31, "step": 2170, "token_acc": 0.9323327305605786, "train_speed(iter/s)": 0.122732 }, { "epoch": 2.7263034104272834, "grad_norm": 0.8074597716331482, "learning_rate": 2.0001891188987265e-07, "loss": 0.19364542961120607, "memory(GiB)": 35.31, "step": 2175, "token_acc": 0.9388682499668303, "train_speed(iter/s)": 0.122783 }, { "epoch": 2.7325754606036847, "grad_norm": 0.8650562763214111, "learning_rate": 1.9092466602293247e-07, "loss": 0.20354480743408204, "memory(GiB)": 35.31, "step": 2180, "token_acc": 0.9326430478389495, "train_speed(iter/s)": 0.122852 }, { "epoch": 2.7325754606036847, "eval_loss": 0.3389414846897125, "eval_runtime": 29.3624, "eval_samples_per_second": 17.539, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8927538324014902, "step": 2180 }, { "epoch": 2.7388475107800865, "grad_norm": 0.9609673619270325, "learning_rate": 1.8203797870221197e-07, "loss": 0.19662023782730104, "memory(GiB)": 35.31, "step": 2185, "token_acc": 0.9078222548659567, "train_speed(iter/s)": 0.122635 }, { "epoch": 2.745119560956488, "grad_norm": 0.8336656093597412, "learning_rate": 1.7335923347502003e-07, "loss": 0.19498822689056397, "memory(GiB)": 35.31, "step": 2190, "token_acc": 0.9327082366973692, "train_speed(iter/s)": 0.122704 }, { "epoch": 2.751391611132889, "grad_norm": 0.9428432583808899, "learning_rate": 1.6488880491393467e-07, "loss": 0.19512466192245484, "memory(GiB)": 35.31, "step": 2195, "token_acc": 0.9311785670394495, "train_speed(iter/s)": 0.12277 }, { "epoch": 2.7576636613092904, "grad_norm": 0.8996681571006775, "learning_rate": 1.5662705860063465e-07, "loss": 0.19020618200302125, "memory(GiB)": 35.31, "step": 2200, "token_acc": 0.9422429845480382, "train_speed(iter/s)": 0.122831 }, { "epoch": 2.7576636613092904, "eval_loss": 0.3384985029697418, "eval_runtime": 29.4137, "eval_samples_per_second": 17.509, "eval_steps_per_second": 4.386, "eval_token_acc": 0.8928870850429462, "step": 2200 }, { "epoch": 2.7639357114856917, "grad_norm": 0.8269793391227722, "learning_rate": 1.485743511101234e-07, "loss": 0.20108513832092284, "memory(GiB)": 35.31, "step": 2205, "token_acc": 0.9105911137718317, "train_speed(iter/s)": 0.122632 }, { "epoch": 2.7702077616620935, "grad_norm": 0.8052636384963989, "learning_rate": 1.4073102999534017e-07, "loss": 0.19754087924957275, "memory(GiB)": 35.31, "step": 2210, "token_acc": 0.9333692597867508, "train_speed(iter/s)": 0.122702 }, { "epoch": 2.776479811838495, "grad_norm": 0.8950125575065613, "learning_rate": 1.3309743377215468e-07, "loss": 0.19114834070205688, "memory(GiB)": 35.31, "step": 2215, "token_acc": 0.9366596409622793, "train_speed(iter/s)": 0.122756 }, { "epoch": 2.782751862014896, "grad_norm": 0.8540958166122437, "learning_rate": 1.2567389190476287e-07, "loss": 0.2070404052734375, "memory(GiB)": 35.31, "step": 2220, "token_acc": 0.9272875816993464, "train_speed(iter/s)": 0.122829 }, { "epoch": 2.782751862014896, "eval_loss": 0.33857327699661255, "eval_runtime": 29.3528, "eval_samples_per_second": 17.545, "eval_steps_per_second": 4.395, "eval_token_acc": 0.8927593845948842, "step": 2220 }, { "epoch": 2.7890239121912974, "grad_norm": 0.8678739666938782, "learning_rate": 1.1846072479146431e-07, "loss": 0.18475788831710815, "memory(GiB)": 35.31, "step": 2225, "token_acc": 0.91176622304756, "train_speed(iter/s)": 0.122618 }, { "epoch": 2.7952959623676987, "grad_norm": 0.9150513410568237, "learning_rate": 1.114582437508327e-07, "loss": 0.20294113159179689, "memory(GiB)": 35.31, "step": 2230, "token_acc": 0.9297203746436813, "train_speed(iter/s)": 0.12269 }, { "epoch": 2.8015680125441005, "grad_norm": 0.9211878776550293, "learning_rate": 1.0466675100828383e-07, "loss": 0.19586080312728882, "memory(GiB)": 35.31, "step": 2235, "token_acc": 0.926397298076527, "train_speed(iter/s)": 0.122748 }, { "epoch": 2.807840062720502, "grad_norm": 0.8305221199989319, "learning_rate": 9.808653968302607e-08, "loss": 0.18916590213775636, "memory(GiB)": 35.31, "step": 2240, "token_acc": 0.9314254859611231, "train_speed(iter/s)": 0.122814 }, { "epoch": 2.807840062720502, "eval_loss": 0.3386108875274658, "eval_runtime": 29.4168, "eval_samples_per_second": 17.507, "eval_steps_per_second": 4.385, "eval_token_acc": 0.8929481591702803, "step": 2240 }, { "epoch": 2.814112112896903, "grad_norm": 0.8349918127059937, "learning_rate": 9.17178937754143e-08, "loss": 0.19523937702178956, "memory(GiB)": 35.31, "step": 2245, "token_acc": 0.9112593067402988, "train_speed(iter/s)": 0.122615 }, { "epoch": 2.820384163073305, "grad_norm": 0.7576460838317871, "learning_rate": 8.556108815468756e-08, "loss": 0.18766900300979614, "memory(GiB)": 35.31, "step": 2250, "token_acc": 0.9345958040143973, "train_speed(iter/s)": 0.122681 }, { "epoch": 2.8266562132497057, "grad_norm": 0.8792471289634705, "learning_rate": 7.961638854711296e-08, "loss": 0.19756540060043334, "memory(GiB)": 35.31, "step": 2255, "token_acc": 0.9329730890352133, "train_speed(iter/s)": 0.122737 }, { "epoch": 2.8329282634261075, "grad_norm": 0.9662355780601501, "learning_rate": 7.388405152450706e-08, "loss": 0.19980876445770263, "memory(GiB)": 35.31, "step": 2260, "token_acc": 0.9275946077635212, "train_speed(iter/s)": 0.122811 }, { "epoch": 2.8329282634261075, "eval_loss": 0.3388676047325134, "eval_runtime": 29.3235, "eval_samples_per_second": 17.563, "eval_steps_per_second": 4.399, "eval_token_acc": 0.8927760411750663, "step": 2260 }, { "epoch": 2.839200313602509, "grad_norm": 0.8288848400115967, "learning_rate": 6.836432449317255e-08, "loss": 0.19607880115509033, "memory(GiB)": 35.31, "step": 2265, "token_acc": 0.9086215087640337, "train_speed(iter/s)": 0.122615 }, { "epoch": 2.84547236377891, "grad_norm": 0.8882344961166382, "learning_rate": 6.305744568321281e-08, "loss": 0.19206061363220214, "memory(GiB)": 35.31, "step": 2270, "token_acc": 0.9301113088095336, "train_speed(iter/s)": 0.122679 }, { "epoch": 2.851744413955312, "grad_norm": 0.8858575820922852, "learning_rate": 5.7963644138254175e-08, "loss": 0.1968652129173279, "memory(GiB)": 35.31, "step": 2275, "token_acc": 0.9304818328139447, "train_speed(iter/s)": 0.122744 }, { "epoch": 2.858016464131713, "grad_norm": 0.8176801800727844, "learning_rate": 5.308313970555812e-08, "loss": 0.2044682025909424, "memory(GiB)": 35.31, "step": 2280, "token_acc": 0.9269594335344797, "train_speed(iter/s)": 0.122817 }, { "epoch": 2.858016464131713, "eval_loss": 0.3388592600822449, "eval_runtime": 29.4082, "eval_samples_per_second": 17.512, "eval_steps_per_second": 4.387, "eval_token_acc": 0.8928815328495522, "step": 2280 }, { "epoch": 2.8642885143081145, "grad_norm": 0.9048309922218323, "learning_rate": 4.841614302653341e-08, "loss": 0.18790122270584106, "memory(GiB)": 35.31, "step": 2285, "token_acc": 0.9097248097088412, "train_speed(iter/s)": 0.122613 }, { "epoch": 2.870560564484516, "grad_norm": 0.911831259727478, "learning_rate": 4.396285552764557e-08, "loss": 0.19619462490081788, "memory(GiB)": 35.31, "step": 2290, "token_acc": 0.9307579243353783, "train_speed(iter/s)": 0.122691 }, { "epoch": 2.876832614660917, "grad_norm": 0.9140892624855042, "learning_rate": 3.9723469411723226e-08, "loss": 0.1958878755569458, "memory(GiB)": 35.31, "step": 2295, "token_acc": 0.9350311098249168, "train_speed(iter/s)": 0.122756 }, { "epoch": 2.883104664837319, "grad_norm": 0.8358930945396423, "learning_rate": 3.5698167649660384e-08, "loss": 0.18899658918380738, "memory(GiB)": 35.31, "step": 2300, "token_acc": 0.933035064746108, "train_speed(iter/s)": 0.122816 }, { "epoch": 2.883104664837319, "eval_loss": 0.3388690948486328, "eval_runtime": 29.3534, "eval_samples_per_second": 17.545, "eval_steps_per_second": 4.395, "eval_token_acc": 0.8928926372363403, "step": 2300 }, { "epoch": 2.88937671501372, "grad_norm": 0.902189314365387, "learning_rate": 3.188712397252325e-08, "loss": 0.19353920221328735, "memory(GiB)": 35.31, "step": 2305, "token_acc": 0.9086479591836735, "train_speed(iter/s)": 0.122622 }, { "epoch": 2.8956487651901215, "grad_norm": 0.9097030758857727, "learning_rate": 2.8290502864049553e-08, "loss": 0.1854855537414551, "memory(GiB)": 35.31, "step": 2310, "token_acc": 0.9400766855552917, "train_speed(iter/s)": 0.122682 }, { "epoch": 2.901920815366523, "grad_norm": 0.7989513278007507, "learning_rate": 2.4908459553549257e-08, "loss": 0.1963629961013794, "memory(GiB)": 35.31, "step": 2315, "token_acc": 0.9324489715346046, "train_speed(iter/s)": 0.122749 }, { "epoch": 2.908192865542924, "grad_norm": 0.9992919564247131, "learning_rate": 2.174114000920713e-08, "loss": 0.20053634643554688, "memory(GiB)": 35.31, "step": 2320, "token_acc": 0.9333678449933441, "train_speed(iter/s)": 0.122817 }, { "epoch": 2.908192865542924, "eval_loss": 0.3389608860015869, "eval_runtime": 29.2024, "eval_samples_per_second": 17.636, "eval_steps_per_second": 4.417, "eval_token_acc": 0.8928149065288242, "step": 2320 }, { "epoch": 2.914464915719326, "grad_norm": 0.8522620797157288, "learning_rate": 1.878868093177999e-08, "loss": 0.19264475107192994, "memory(GiB)": 35.31, "step": 2325, "token_acc": 0.9097824352359168, "train_speed(iter/s)": 0.122641 }, { "epoch": 2.920736965895727, "grad_norm": 0.8912385106086731, "learning_rate": 1.6051209748698116e-08, "loss": 0.1977926969528198, "memory(GiB)": 35.31, "step": 2330, "token_acc": 0.9306182995077877, "train_speed(iter/s)": 0.122698 }, { "epoch": 2.9270090160721285, "grad_norm": 0.8228368759155273, "learning_rate": 1.3528844608566848e-08, "loss": 0.19549624919891356, "memory(GiB)": 35.31, "step": 2335, "token_acc": 0.9288313505948216, "train_speed(iter/s)": 0.122753 }, { "epoch": 2.9332810662485302, "grad_norm": 0.9116573929786682, "learning_rate": 1.1221694376064018e-08, "loss": 0.2056950092315674, "memory(GiB)": 35.31, "step": 2340, "token_acc": 0.9298293985869378, "train_speed(iter/s)": 0.122821 }, { "epoch": 2.9332810662485302, "eval_loss": 0.33883053064346313, "eval_runtime": 29.3994, "eval_samples_per_second": 17.517, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8927760411750663, "step": 2340 }, { "epoch": 2.9395531164249316, "grad_norm": 0.9035621881484985, "learning_rate": 9.129858627244802e-09, "loss": 0.1941608190536499, "memory(GiB)": 35.31, "step": 2345, "token_acc": 0.9095595779446971, "train_speed(iter/s)": 0.122637 }, { "epoch": 2.945825166601333, "grad_norm": 1.024281620979309, "learning_rate": 7.25342764524184e-09, "loss": 0.19218976497650148, "memory(GiB)": 35.31, "step": 2350, "token_acc": 0.9338138445777843, "train_speed(iter/s)": 0.122699 }, { "epoch": 2.952097216777734, "grad_norm": 0.8680444955825806, "learning_rate": 5.592482416369449e-09, "loss": 0.1976174831390381, "memory(GiB)": 35.31, "step": 2355, "token_acc": 0.9297104920949969, "train_speed(iter/s)": 0.122764 }, { "epoch": 2.9583692669541355, "grad_norm": 0.8660714626312256, "learning_rate": 4.147094626628656e-09, "loss": 0.19633731842041016, "memory(GiB)": 35.31, "step": 2360, "token_acc": 0.9261772040487017, "train_speed(iter/s)": 0.122824 }, { "epoch": 2.9583692669541355, "eval_loss": 0.33882075548171997, "eval_runtime": 29.461, "eval_samples_per_second": 17.481, "eval_steps_per_second": 4.379, "eval_token_acc": 0.8928426674957942, "step": 2360 }, { "epoch": 2.9646413171305372, "grad_norm": 0.8004366159439087, "learning_rate": 2.9173266586113303e-09, "loss": 0.2010037899017334, "memory(GiB)": 35.31, "step": 2365, "token_acc": 0.9107686175513187, "train_speed(iter/s)": 0.122617 }, { "epoch": 2.9709133673069386, "grad_norm": 0.8887699842453003, "learning_rate": 1.9032315888106724e-09, "loss": 0.19576088190078736, "memory(GiB)": 35.31, "step": 2370, "token_acc": 0.9332804385458742, "train_speed(iter/s)": 0.122676 }, { "epoch": 2.97718541748334, "grad_norm": 0.8810137510299683, "learning_rate": 1.1048531853286027e-09, "loss": 0.20437698364257811, "memory(GiB)": 35.31, "step": 2375, "token_acc": 0.92782014674522, "train_speed(iter/s)": 0.122731 }, { "epoch": 2.983457467659741, "grad_norm": 0.8733575940132141, "learning_rate": 5.222259059867174e-10, "loss": 0.2047698974609375, "memory(GiB)": 35.31, "step": 2380, "token_acc": 0.9247718926286298, "train_speed(iter/s)": 0.122792 }, { "epoch": 2.983457467659741, "eval_loss": 0.3388313353061676, "eval_runtime": 29.4983, "eval_samples_per_second": 17.459, "eval_steps_per_second": 4.373, "eval_token_acc": 0.8928260109156122, "step": 2380 }, { "epoch": 2.9897295178361425, "grad_norm": 0.8730230927467346, "learning_rate": 1.5537489683914442e-10, "loss": 0.1937323570251465, "memory(GiB)": 35.31, "step": 2385, "token_acc": 0.9117496217657769, "train_speed(iter/s)": 0.122611 }, { "epoch": 2.9960015680125442, "grad_norm": 0.8275863528251648, "learning_rate": 4.315991088965632e-12, "loss": 0.19883054494857788, "memory(GiB)": 35.31, "step": 2390, "token_acc": 0.9306170220547895, "train_speed(iter/s)": 0.122671 }, { "epoch": 2.9972559780478245, "eval_loss": 0.3387167751789093, "eval_runtime": 29.4848, "eval_samples_per_second": 17.467, "eval_steps_per_second": 4.375, "eval_token_acc": 0.8929092938165222, "step": 2391 } ], "logging_steps": 5, "max_steps": 2391, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.529076327726711e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }