{ "best_global_step": 180, "best_metric": 0.27287108, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v32-20250504-043500/checkpoint-180", "epoch": 2.9732620320855614, "eval_steps": 20, "global_step": 279, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0106951871657754, "grad_norm": 3.2918701171875, "learning_rate": 9.999683023724021e-06, "loss": 0.2989569902420044, "memory(GiB)": 28.98, "step": 1, "token_acc": 0.9019468186134852, "train_speed(iter/s)": 0.075727 }, { "epoch": 0.053475935828877004, "grad_norm": 1.4581928253173828, "learning_rate": 9.992077602401358e-06, "loss": 0.26982036232948303, "memory(GiB)": 28.98, "step": 5, "token_acc": 0.9124155874528606, "train_speed(iter/s)": 0.162223 }, { "epoch": 0.10695187165775401, "grad_norm": 0.8813036680221558, "learning_rate": 9.968335515358916e-06, "loss": 0.26494245529174804, "memory(GiB)": 28.98, "step": 10, "token_acc": 0.9064403726266386, "train_speed(iter/s)": 0.196471 }, { "epoch": 0.16042780748663102, "grad_norm": 0.8264817595481873, "learning_rate": 9.92884897657402e-06, "loss": 0.27532644271850587, "memory(GiB)": 28.98, "step": 15, "token_acc": 0.9036378177940428, "train_speed(iter/s)": 0.206122 }, { "epoch": 0.21390374331550802, "grad_norm": 0.7590827345848083, "learning_rate": 9.873743117270691e-06, "loss": 0.24748692512512208, "memory(GiB)": 28.98, "step": 20, "token_acc": 0.9165032561067131, "train_speed(iter/s)": 0.213242 }, { "epoch": 0.21390374331550802, "eval_loss": 0.29857584834098816, "eval_runtime": 1.6521, "eval_samples_per_second": 36.318, "eval_steps_per_second": 9.079, "eval_token_acc": 0.9033078880407125, "step": 20 }, { "epoch": 0.26737967914438504, "grad_norm": 0.7544880509376526, "learning_rate": 9.803192565659898e-06, "loss": 0.2472740650177002, "memory(GiB)": 28.98, "step": 25, "token_acc": 0.9127769919849128, "train_speed(iter/s)": 0.193549 }, { "epoch": 0.32085561497326204, "grad_norm": 0.7806485891342163, "learning_rate": 9.717420893549902e-06, "loss": 0.2667980670928955, "memory(GiB)": 28.98, "step": 30, "token_acc": 0.908313332992902, "train_speed(iter/s)": 0.199526 }, { "epoch": 0.37433155080213903, "grad_norm": 0.7090319395065308, "learning_rate": 9.616699907856368e-06, "loss": 0.23824496269226075, "memory(GiB)": 28.98, "step": 35, "token_acc": 0.9139585630821934, "train_speed(iter/s)": 0.202496 }, { "epoch": 0.42780748663101603, "grad_norm": 0.6566728949546814, "learning_rate": 9.501348789257373e-06, "loss": 0.24109985828399658, "memory(GiB)": 28.98, "step": 40, "token_acc": 0.9166758030917662, "train_speed(iter/s)": 0.20511 }, { "epoch": 0.42780748663101603, "eval_loss": 0.2843839228153229, "eval_runtime": 1.6484, "eval_samples_per_second": 36.398, "eval_steps_per_second": 9.1, "eval_token_acc": 0.9075595065545785, "step": 40 }, { "epoch": 0.48128342245989303, "grad_norm": 0.6919281482696533, "learning_rate": 9.371733080722911e-06, "loss": 0.24000308513641358, "memory(GiB)": 28.98, "step": 45, "token_acc": 0.9120257943391221, "train_speed(iter/s)": 0.195603 }, { "epoch": 0.5347593582887701, "grad_norm": 0.6628720164299011, "learning_rate": 9.228263529124199e-06, "loss": 0.225927734375, "memory(GiB)": 28.98, "step": 50, "token_acc": 0.922932112394543, "train_speed(iter/s)": 0.199738 }, { "epoch": 0.5882352941176471, "grad_norm": 0.7601417899131775, "learning_rate": 9.071394783593664e-06, "loss": 0.24698638916015625, "memory(GiB)": 28.98, "step": 55, "token_acc": 0.916304375460809, "train_speed(iter/s)": 0.202796 }, { "epoch": 0.6417112299465241, "grad_norm": 0.7333827018737793, "learning_rate": 8.90162395476046e-06, "loss": 0.24123883247375488, "memory(GiB)": 28.98, "step": 60, "token_acc": 0.9169203180670583, "train_speed(iter/s)": 0.205478 }, { "epoch": 0.6417112299465241, "eval_loss": 0.2792617380619049, "eval_runtime": 1.6412, "eval_samples_per_second": 36.559, "eval_steps_per_second": 9.14, "eval_token_acc": 0.908719038876542, "step": 60 }, { "epoch": 0.6951871657754011, "grad_norm": 0.732627809047699, "learning_rate": 8.719489039427256e-06, "loss": 0.2210240602493286, "memory(GiB)": 28.98, "step": 65, "token_acc": 0.9183851177518306, "train_speed(iter/s)": 0.19878 }, { "epoch": 0.7486631016042781, "grad_norm": 0.7144444584846497, "learning_rate": 8.525567215680397e-06, "loss": 0.24620118141174316, "memory(GiB)": 28.98, "step": 70, "token_acc": 0.9128896697452457, "train_speed(iter/s)": 0.20038 }, { "epoch": 0.8021390374331551, "grad_norm": 0.736126184463501, "learning_rate": 8.320473013836197e-06, "loss": 0.23789706230163574, "memory(GiB)": 28.98, "step": 75, "token_acc": 0.9134095303360337, "train_speed(iter/s)": 0.202414 }, { "epoch": 0.8556149732620321, "grad_norm": 0.7036953568458557, "learning_rate": 8.104856369019525e-06, "loss": 0.23406553268432617, "memory(GiB)": 28.98, "step": 80, "token_acc": 0.9200627693460746, "train_speed(iter/s)": 0.204729 }, { "epoch": 0.8556149732620321, "eval_loss": 0.2767316699028015, "eval_runtime": 1.6471, "eval_samples_per_second": 36.427, "eval_steps_per_second": 9.107, "eval_token_acc": 0.9091699681128611, "step": 80 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6711894273757935, "learning_rate": 7.879400561546033e-06, "loss": 0.23753652572631836, "memory(GiB)": 28.98, "step": 85, "token_acc": 0.9143226902311286, "train_speed(iter/s)": 0.200371 }, { "epoch": 0.9625668449197861, "grad_norm": 0.6814318895339966, "learning_rate": 7.644820051634813e-06, "loss": 0.23459360599517823, "memory(GiB)": 28.98, "step": 90, "token_acc": 0.9142770409116383, "train_speed(iter/s)": 0.201764 }, { "epoch": 1.0106951871657754, "grad_norm": 1.021850824356079, "learning_rate": 7.401858215313228e-06, "loss": 0.21953530311584474, "memory(GiB)": 28.98, "step": 95, "token_acc": 0.9268375978563548, "train_speed(iter/s)": 0.203547 }, { "epoch": 1.0641711229946524, "grad_norm": 0.7571138739585876, "learning_rate": 7.151284988688731e-06, "loss": 0.19227520227432252, "memory(GiB)": 28.98, "step": 100, "token_acc": 0.9299330505442838, "train_speed(iter/s)": 0.205218 }, { "epoch": 1.0641711229946524, "eval_loss": 0.27827945351600647, "eval_runtime": 1.6406, "eval_samples_per_second": 36.572, "eval_steps_per_second": 9.143, "eval_token_acc": 0.9097497342738429, "step": 100 }, { "epoch": 1.1176470588235294, "grad_norm": 0.6977977156639099, "learning_rate": 6.893894428052881e-06, "loss": 0.18528327941894532, "memory(GiB)": 28.98, "step": 105, "token_acc": 0.9313939048472141, "train_speed(iter/s)": 0.200929 }, { "epoch": 1.1711229946524064, "grad_norm": 0.7503034472465515, "learning_rate": 6.6305021935494755e-06, "loss": 0.191499924659729, "memory(GiB)": 28.98, "step": 110, "token_acc": 0.934816576879125, "train_speed(iter/s)": 0.202263 }, { "epoch": 1.2245989304812834, "grad_norm": 0.6970927715301514, "learning_rate": 6.361942964380967e-06, "loss": 0.18341017961502076, "memory(GiB)": 28.98, "step": 115, "token_acc": 0.9350552403702598, "train_speed(iter/s)": 0.203567 }, { "epoch": 1.2780748663101604, "grad_norm": 0.7112876176834106, "learning_rate": 6.089067793744258e-06, "loss": 0.19445158243179322, "memory(GiB)": 28.98, "step": 120, "token_acc": 0.9335015519281871, "train_speed(iter/s)": 0.204703 }, { "epoch": 1.2780748663101604, "eval_loss": 0.27691978216171265, "eval_runtime": 1.6606, "eval_samples_per_second": 36.132, "eval_steps_per_second": 9.033, "eval_token_acc": 0.9094920604245177, "step": 120 }, { "epoch": 1.3315508021390374, "grad_norm": 0.6548081636428833, "learning_rate": 5.8127414118779825e-06, "loss": 0.18807239532470704, "memory(GiB)": 31.29, "step": 125, "token_acc": 0.9327750242123853, "train_speed(iter/s)": 0.200918 }, { "epoch": 1.3850267379679144, "grad_norm": 0.709028422832489, "learning_rate": 5.533839485767795e-06, "loss": 0.19655026197433473, "memory(GiB)": 31.29, "step": 130, "token_acc": 0.9308182054862607, "train_speed(iter/s)": 0.201887 }, { "epoch": 1.4385026737967914, "grad_norm": 0.6588287949562073, "learning_rate": 5.253245844193564e-06, "loss": 0.19113950729370116, "memory(GiB)": 31.29, "step": 135, "token_acc": 0.9270080346573307, "train_speed(iter/s)": 0.202901 }, { "epoch": 1.4919786096256684, "grad_norm": 0.6656479239463806, "learning_rate": 4.971849676912172e-06, "loss": 0.18891613483428954, "memory(GiB)": 31.29, "step": 140, "token_acc": 0.9305257651059378, "train_speed(iter/s)": 0.203847 }, { "epoch": 1.4919786096256684, "eval_loss": 0.2746458649635315, "eval_runtime": 1.6467, "eval_samples_per_second": 36.436, "eval_steps_per_second": 9.109, "eval_token_acc": 0.9111991496762972, "step": 140 }, { "epoch": 1.5454545454545454, "grad_norm": 0.7020911574363708, "learning_rate": 4.6905427168515914e-06, "loss": 0.19171638488769532, "memory(GiB)": 31.29, "step": 145, "token_acc": 0.9305895351590245, "train_speed(iter/s)": 0.201182 }, { "epoch": 1.5989304812834224, "grad_norm": 0.6727572083473206, "learning_rate": 4.410216414245771e-06, "loss": 0.1821829557418823, "memory(GiB)": 31.29, "step": 150, "token_acc": 0.9352090736503919, "train_speed(iter/s)": 0.202227 }, { "epoch": 1.6524064171122994, "grad_norm": 0.6589164733886719, "learning_rate": 4.131759111665349e-06, "loss": 0.18441460132598878, "memory(GiB)": 31.29, "step": 155, "token_acc": 0.9374578346368156, "train_speed(iter/s)": 0.203318 }, { "epoch": 1.7058823529411766, "grad_norm": 0.6176323890686035, "learning_rate": 3.856053228896442e-06, "loss": 0.18946645259857178, "memory(GiB)": 31.29, "step": 160, "token_acc": 0.9367611881372071, "train_speed(iter/s)": 0.20408 }, { "epoch": 1.7058823529411766, "eval_loss": 0.2751389443874359, "eval_runtime": 1.6421, "eval_samples_per_second": 36.539, "eval_steps_per_second": 9.135, "eval_token_acc": 0.9112957773697942, "step": 160 }, { "epoch": 1.7593582887700534, "grad_norm": 0.6360734701156616, "learning_rate": 3.58397246658848e-06, "loss": 0.1823675274848938, "memory(GiB)": 31.29, "step": 165, "token_acc": 0.9278697615463836, "train_speed(iter/s)": 0.201592 }, { "epoch": 1.8128342245989306, "grad_norm": 0.5981405973434448, "learning_rate": 3.316379037532644e-06, "loss": 0.18013572692871094, "memory(GiB)": 31.29, "step": 170, "token_acc": 0.9407218114408998, "train_speed(iter/s)": 0.202459 }, { "epoch": 1.8663101604278074, "grad_norm": 0.5807086825370789, "learning_rate": 3.0541209343448373e-06, "loss": 0.1835346221923828, "memory(GiB)": 31.29, "step": 175, "token_acc": 0.9373540226163772, "train_speed(iter/s)": 0.203227 }, { "epoch": 1.9197860962566846, "grad_norm": 0.610285758972168, "learning_rate": 2.7980292422118282e-06, "loss": 0.18963263034820557, "memory(GiB)": 31.29, "step": 180, "token_acc": 0.9329708446611044, "train_speed(iter/s)": 0.204063 }, { "epoch": 1.9197860962566846, "eval_loss": 0.27287107706069946, "eval_runtime": 1.6399, "eval_samples_per_second": 36.589, "eval_steps_per_second": 9.147, "eval_token_acc": 0.911682288143782, "step": 180 }, { "epoch": 1.9732620320855614, "grad_norm": 0.6412176489830017, "learning_rate": 2.548915505216333e-06, "loss": 0.18783329725265502, "memory(GiB)": 31.29, "step": 185, "token_acc": 0.9265865937289413, "train_speed(iter/s)": 0.201898 }, { "epoch": 2.021390374331551, "grad_norm": 0.607214629650116, "learning_rate": 2.307569154587056e-06, "loss": 0.1662315845489502, "memory(GiB)": 31.29, "step": 190, "token_acc": 0.9465564026359995, "train_speed(iter/s)": 0.203071 }, { "epoch": 2.0748663101604277, "grad_norm": 0.6007011532783508, "learning_rate": 2.074755007023461e-06, "loss": 0.16532043218612671, "memory(GiB)": 31.29, "step": 195, "token_acc": 0.9450870631362545, "train_speed(iter/s)": 0.203942 }, { "epoch": 2.128342245989305, "grad_norm": 0.6896679997444153, "learning_rate": 1.8512108410229878e-06, "loss": 0.15121257305145264, "memory(GiB)": 31.29, "step": 200, "token_acc": 0.9484533555566449, "train_speed(iter/s)": 0.204504 }, { "epoch": 2.128342245989305, "eval_loss": 0.28094714879989624, "eval_runtime": 1.6463, "eval_samples_per_second": 36.446, "eval_steps_per_second": 9.111, "eval_token_acc": 0.9109736850581377, "step": 200 }, { "epoch": 2.1818181818181817, "grad_norm": 0.6233195662498474, "learning_rate": 1.6376450588911985e-06, "loss": 0.15310670137405397, "memory(GiB)": 31.29, "step": 205, "token_acc": 0.9403647217565523, "train_speed(iter/s)": 0.202351 }, { "epoch": 2.235294117647059, "grad_norm": 0.6323373913764954, "learning_rate": 1.434734441843899e-06, "loss": 0.15562598705291747, "memory(GiB)": 31.29, "step": 210, "token_acc": 0.9448852085089503, "train_speed(iter/s)": 0.202913 }, { "epoch": 2.2887700534759357, "grad_norm": 0.6409267783164978, "learning_rate": 1.2431220053151832e-06, "loss": 0.15542089939117432, "memory(GiB)": 31.29, "step": 215, "token_acc": 0.9450054780164817, "train_speed(iter/s)": 0.203493 }, { "epoch": 2.342245989304813, "grad_norm": 0.6448594331741333, "learning_rate": 1.063414961267859e-06, "loss": 0.1522960662841797, "memory(GiB)": 31.29, "step": 220, "token_acc": 0.9481132075471698, "train_speed(iter/s)": 0.204302 }, { "epoch": 2.342245989304813, "eval_loss": 0.2818092703819275, "eval_runtime": 1.643, "eval_samples_per_second": 36.519, "eval_steps_per_second": 9.13, "eval_token_acc": 0.9113924050632911, "step": 220 }, { "epoch": 2.3957219251336896, "grad_norm": 0.6113874316215515, "learning_rate": 8.961827939636198e-07, "loss": 0.16363799571990967, "memory(GiB)": 31.29, "step": 225, "token_acc": 0.9390907965842993, "train_speed(iter/s)": 0.202503 }, { "epoch": 2.449197860962567, "grad_norm": 0.597212016582489, "learning_rate": 7.41955455290726e-07, "loss": 0.15171511173248292, "memory(GiB)": 31.29, "step": 230, "token_acc": 0.9467608786903596, "train_speed(iter/s)": 0.20326 }, { "epoch": 2.502673796791444, "grad_norm": 0.6323869228363037, "learning_rate": 6.012216853682001e-07, "loss": 0.16323232650756836, "memory(GiB)": 31.29, "step": 235, "token_acc": 0.9391786687427014, "train_speed(iter/s)": 0.20378 }, { "epoch": 2.556149732620321, "grad_norm": 0.6109181642532349, "learning_rate": 4.7442746374839363e-07, "loss": 0.1464900016784668, "memory(GiB)": 31.3, "step": 240, "token_acc": 0.9483738659414637, "train_speed(iter/s)": 0.20435 }, { "epoch": 2.556149732620321, "eval_loss": 0.28033456206321716, "eval_runtime": 1.6424, "eval_samples_per_second": 36.532, "eval_steps_per_second": 9.133, "eval_token_acc": 0.9115856604502851, "step": 240 }, { "epoch": 2.6096256684491976, "grad_norm": 0.6514647006988525, "learning_rate": 3.619745961260623e-07, "loss": 0.1541598081588745, "memory(GiB)": 31.3, "step": 245, "token_acc": 0.9415382075569038, "train_speed(iter/s)": 0.202522 }, { "epoch": 2.663101604278075, "grad_norm": 0.5899693965911865, "learning_rate": 2.6421944103256657e-07, "loss": 0.15795296430587769, "memory(GiB)": 31.3, "step": 250, "token_acc": 0.947255862532017, "train_speed(iter/s)": 0.203017 }, { "epoch": 2.716577540106952, "grad_norm": 0.612455427646637, "learning_rate": 1.814717805502958e-07, "loss": 0.15344234704971313, "memory(GiB)": 31.3, "step": 255, "token_acc": 0.9460515010284584, "train_speed(iter/s)": 0.203605 }, { "epoch": 2.770053475935829, "grad_norm": 0.6128495931625366, "learning_rate": 1.1399383862592928e-07, "loss": 0.1595083236694336, "memory(GiB)": 31.3, "step": 260, "token_acc": 0.9440190249702735, "train_speed(iter/s)": 0.20408 }, { "epoch": 2.770053475935829, "eval_loss": 0.2802920639514923, "eval_runtime": 1.6389, "eval_samples_per_second": 36.609, "eval_steps_per_second": 9.152, "eval_token_acc": 0.9115856604502851, "step": 260 }, { "epoch": 2.8235294117647056, "grad_norm": 0.5782672166824341, "learning_rate": 6.199945009349173e-08, "loss": 0.15760741233825684, "memory(GiB)": 31.3, "step": 265, "token_acc": 0.9367169337749707, "train_speed(iter/s)": 0.202464 }, { "epoch": 2.877005347593583, "grad_norm": 0.6260784864425659, "learning_rate": 2.5653383040524228e-08, "loss": 0.14205594062805177, "memory(GiB)": 31.3, "step": 270, "token_acc": 0.9525445321564256, "train_speed(iter/s)": 0.202893 }, { "epoch": 2.93048128342246, "grad_norm": 0.6263572573661804, "learning_rate": 5.0708166647628345e-09, "loss": 0.1594037890434265, "memory(GiB)": 31.3, "step": 275, "token_acc": 0.9494285781334335, "train_speed(iter/s)": 0.203622 }, { "epoch": 2.9732620320855614, "eval_loss": 0.27989062666893005, "eval_runtime": 1.6446, "eval_samples_per_second": 36.483, "eval_steps_per_second": 9.121, "eval_token_acc": 0.9118433342996103, "step": 279 } ], "logging_steps": 5, "max_steps": 279, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8413396385162854e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }