{ "best_global_step": 300, "best_metric": 0.40253255, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v8-20250507-004645/checkpoint-300", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 4.932199478149414, "learning_rate": 9.999884400986087e-06, "loss": 0.7780591249465942, "memory(GiB)": 27.73, "step": 1, "token_acc": 0.782099343955014, "train_speed(iter/s)": 0.064891 }, { "epoch": 0.03232323232323232, "grad_norm": 2.3600621223449707, "learning_rate": 9.997110291906109e-06, "loss": 0.6091042757034302, "memory(GiB)": 27.73, "step": 5, "token_acc": 0.8179287124866458, "train_speed(iter/s)": 0.118621 }, { "epoch": 0.06464646464646465, "grad_norm": 1.088510274887085, "learning_rate": 9.988444507789584e-06, "loss": 0.4719734191894531, "memory(GiB)": 27.73, "step": 10, "token_acc": 0.8583190394511149, "train_speed(iter/s)": 0.135341 }, { "epoch": 0.09696969696969697, "grad_norm": 1.0002374649047852, "learning_rate": 9.97401266428502e-06, "loss": 0.47036895751953123, "memory(GiB)": 27.73, "step": 15, "token_acc": 0.8504078264405482, "train_speed(iter/s)": 0.137962 }, { "epoch": 0.1292929292929293, "grad_norm": 0.9563055038452148, "learning_rate": 9.953831442918418e-06, "loss": 0.42792816162109376, "memory(GiB)": 27.73, "step": 20, "token_acc": 0.863187115610118, "train_speed(iter/s)": 0.141664 }, { "epoch": 0.1292929292929293, "eval_loss": 0.4551742970943451, "eval_runtime": 5.4465, "eval_samples_per_second": 18.36, "eval_steps_per_second": 4.59, "eval_token_acc": 0.8559020470633251, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 0.8774688243865967, "learning_rate": 9.927924170825266e-06, "loss": 0.41231346130371094, "memory(GiB)": 27.73, "step": 25, "token_acc": 0.8627429786160803, "train_speed(iter/s)": 0.130766 }, { "epoch": 0.19393939393939394, "grad_norm": 0.7832381129264832, "learning_rate": 9.896320793787106e-06, "loss": 0.4305295467376709, "memory(GiB)": 27.73, "step": 30, "token_acc": 0.8626980747248807, "train_speed(iter/s)": 0.135526 }, { "epoch": 0.22626262626262628, "grad_norm": 0.7605119943618774, "learning_rate": 9.859057841617709e-06, "loss": 0.40700688362121584, "memory(GiB)": 27.77, "step": 35, "token_acc": 0.8702584217812644, "train_speed(iter/s)": 0.137556 }, { "epoch": 0.2585858585858586, "grad_norm": 0.7823914289474487, "learning_rate": 9.816178385938867e-06, "loss": 0.40500674247741697, "memory(GiB)": 27.77, "step": 40, "token_acc": 0.8749971213412246, "train_speed(iter/s)": 0.139743 }, { "epoch": 0.2585858585858586, "eval_loss": 0.4355735778808594, "eval_runtime": 5.436, "eval_samples_per_second": 18.396, "eval_steps_per_second": 4.599, "eval_token_acc": 0.8621580256361201, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 0.8007071018218994, "learning_rate": 9.767731990394638e-06, "loss": 0.41349210739135744, "memory(GiB)": 27.77, "step": 45, "token_acc": 0.8671768894761958, "train_speed(iter/s)": 0.134577 }, { "epoch": 0.32323232323232326, "grad_norm": 0.8084492683410645, "learning_rate": 9.71377465336155e-06, "loss": 0.41720309257507326, "memory(GiB)": 27.77, "step": 50, "token_acc": 0.8574223526534605, "train_speed(iter/s)": 0.136609 }, { "epoch": 0.35555555555555557, "grad_norm": 0.757235050201416, "learning_rate": 9.654368743221022e-06, "loss": 0.41148929595947265, "memory(GiB)": 27.77, "step": 55, "token_acc": 0.8688445445767622, "train_speed(iter/s)": 0.138016 }, { "epoch": 0.3878787878787879, "grad_norm": 0.787464439868927, "learning_rate": 9.589582926268798e-06, "loss": 0.40866241455078123, "memory(GiB)": 30.15, "step": 60, "token_acc": 0.8828734404289198, "train_speed(iter/s)": 0.139782 }, { "epoch": 0.3878787878787879, "eval_loss": 0.4253135919570923, "eval_runtime": 5.4547, "eval_samples_per_second": 18.333, "eval_steps_per_second": 4.583, "eval_token_acc": 0.8650277405777693, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.8023079633712769, "learning_rate": 9.519492087344724e-06, "loss": 0.3891183137893677, "memory(GiB)": 30.15, "step": 65, "token_acc": 0.8755363232975173, "train_speed(iter/s)": 0.135184 }, { "epoch": 0.45252525252525255, "grad_norm": 0.762795090675354, "learning_rate": 9.444177243274619e-06, "loss": 0.4177716255187988, "memory(GiB)": 30.15, "step": 70, "token_acc": 0.8719830172135309, "train_speed(iter/s)": 0.137146 }, { "epoch": 0.48484848484848486, "grad_norm": 0.7374395728111267, "learning_rate": 9.363725449224281e-06, "loss": 0.3992452621459961, "memory(GiB)": 30.15, "step": 75, "token_acc": 0.872343302756429, "train_speed(iter/s)": 0.138292 }, { "epoch": 0.5171717171717172, "grad_norm": 0.7449392080307007, "learning_rate": 9.278229698073889e-06, "loss": 0.39937677383422854, "memory(GiB)": 30.15, "step": 80, "token_acc": 0.8710408988995696, "train_speed(iter/s)": 0.138969 }, { "epoch": 0.5171717171717172, "eval_loss": 0.4182729721069336, "eval_runtime": 5.4623, "eval_samples_per_second": 18.307, "eval_steps_per_second": 4.577, "eval_token_acc": 0.8654868949684331, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.7495951652526855, "learning_rate": 9.187788812929074e-06, "loss": 0.39512038230895996, "memory(GiB)": 30.15, "step": 85, "token_acc": 0.870334291390984, "train_speed(iter/s)": 0.136059 }, { "epoch": 0.5818181818181818, "grad_norm": 0.8733298182487488, "learning_rate": 9.092507332892968e-06, "loss": 0.4132417678833008, "memory(GiB)": 30.15, "step": 90, "token_acc": 0.8729202391435325, "train_speed(iter/s)": 0.137339 }, { "epoch": 0.6141414141414141, "grad_norm": 0.8373153805732727, "learning_rate": 8.992495392231195e-06, "loss": 0.40344934463500975, "memory(GiB)": 30.15, "step": 95, "token_acc": 0.8812556053811659, "train_speed(iter/s)": 0.138502 }, { "epoch": 0.6464646464646465, "grad_norm": 0.8512130379676819, "learning_rate": 8.88786859306952e-06, "loss": 0.3863351821899414, "memory(GiB)": 30.16, "step": 100, "token_acc": 0.8727646779553727, "train_speed(iter/s)": 0.139249 }, { "epoch": 0.6464646464646465, "eval_loss": 0.4132169485092163, "eval_runtime": 5.4403, "eval_samples_per_second": 18.381, "eval_steps_per_second": 4.595, "eval_token_acc": 0.8673235125310885, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 0.8247293829917908, "learning_rate": 8.778747871771293e-06, "loss": 0.40260896682739256, "memory(GiB)": 30.16, "step": 105, "token_acc": 0.8619686556852478, "train_speed(iter/s)": 0.137282 }, { "epoch": 0.7111111111111111, "grad_norm": 0.7707865834236145, "learning_rate": 8.665259359149132e-06, "loss": 0.3850682020187378, "memory(GiB)": 30.16, "step": 110, "token_acc": 0.8824787229538045, "train_speed(iter/s)": 0.138008 }, { "epoch": 0.7434343434343434, "grad_norm": 0.7147298455238342, "learning_rate": 8.547534234672435e-06, "loss": 0.37834107875823975, "memory(GiB)": 30.16, "step": 115, "token_acc": 0.8782488780852655, "train_speed(iter/s)": 0.138792 }, { "epoch": 0.7757575757575758, "grad_norm": 0.7929354906082153, "learning_rate": 8.425708574839221e-06, "loss": 0.40454673767089844, "memory(GiB)": 30.16, "step": 120, "token_acc": 0.8664209147790658, "train_speed(iter/s)": 0.139398 }, { "epoch": 0.7757575757575758, "eval_loss": 0.40885430574417114, "eval_runtime": 5.4542, "eval_samples_per_second": 18.334, "eval_steps_per_second": 4.584, "eval_token_acc": 0.8693323129902429, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.7925952076911926, "learning_rate": 8.299923195887599e-06, "loss": 0.39439709186553956, "memory(GiB)": 30.16, "step": 125, "token_acc": 0.8721447484554281, "train_speed(iter/s)": 0.137539 }, { "epoch": 0.8404040404040404, "grad_norm": 0.7893044352531433, "learning_rate": 8.170323491028625e-06, "loss": 0.39348788261413575, "memory(GiB)": 30.16, "step": 130, "token_acc": 0.872299544278852, "train_speed(iter/s)": 0.137807 }, { "epoch": 0.8727272727272727, "grad_norm": 0.7297965884208679, "learning_rate": 8.03705926238874e-06, "loss": 0.390042781829834, "memory(GiB)": 30.16, "step": 135, "token_acc": 0.872897976215314, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.9050505050505051, "grad_norm": 0.7339494824409485, "learning_rate": 7.900284547855992e-06, "loss": 0.3968710660934448, "memory(GiB)": 30.16, "step": 140, "token_acc": 0.8762483817273904, "train_speed(iter/s)": 0.138796 }, { "epoch": 0.9050505050505051, "eval_loss": 0.40561485290527344, "eval_runtime": 5.454, "eval_samples_per_second": 18.335, "eval_steps_per_second": 4.584, "eval_token_acc": 0.8705184618327912, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.7347233891487122, "learning_rate": 7.760157443030234e-06, "loss": 0.3751286506652832, "memory(GiB)": 30.16, "step": 145, "token_acc": 0.8804492278895648, "train_speed(iter/s)": 0.137132 }, { "epoch": 0.9696969696969697, "grad_norm": 0.7385802268981934, "learning_rate": 7.616839918483061e-06, "loss": 0.38750033378601073, "memory(GiB)": 30.16, "step": 150, "token_acc": 0.8667628785284477, "train_speed(iter/s)": 0.137631 }, { "epoch": 1.0, "grad_norm": 0.7280504107475281, "learning_rate": 7.470497632538743e-06, "loss": 0.38422205448150637, "memory(GiB)": 30.16, "step": 155, "token_acc": 0.8743071565213126, "train_speed(iter/s)": 0.138389 }, { "epoch": 1.0323232323232323, "grad_norm": 0.7759126424789429, "learning_rate": 7.321299739792553e-06, "loss": 0.33709375858306884, "memory(GiB)": 30.16, "step": 160, "token_acc": 0.8903214253738025, "train_speed(iter/s)": 0.138965 }, { "epoch": 1.0323232323232323, "eval_loss": 0.41121506690979004, "eval_runtime": 5.4481, "eval_samples_per_second": 18.355, "eval_steps_per_second": 4.589, "eval_token_acc": 0.8699062559785727, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 0.7367027997970581, "learning_rate": 7.169418695587791e-06, "loss": 0.3059047222137451, "memory(GiB)": 30.16, "step": 165, "token_acc": 0.893117110476366, "train_speed(iter/s)": 0.13755 }, { "epoch": 1.096969696969697, "grad_norm": 0.7874158024787903, "learning_rate": 7.015030056677559e-06, "loss": 0.3194535255432129, "memory(GiB)": 30.16, "step": 170, "token_acc": 0.8963855982498197, "train_speed(iter/s)": 0.13837 }, { "epoch": 1.1292929292929292, "grad_norm": 0.8298231959342957, "learning_rate": 6.858312278301638e-06, "loss": 0.32886972427368166, "memory(GiB)": 30.16, "step": 175, "token_acc": 0.8890347381744879, "train_speed(iter/s)": 0.138796 }, { "epoch": 1.1616161616161615, "grad_norm": 0.7421779632568359, "learning_rate": 6.699446507913083e-06, "loss": 0.3223016977310181, "memory(GiB)": 30.16, "step": 180, "token_acc": 0.8996364289240989, "train_speed(iter/s)": 0.139126 }, { "epoch": 1.1616161616161615, "eval_loss": 0.4112629294395447, "eval_runtime": 5.4548, "eval_samples_per_second": 18.333, "eval_steps_per_second": 4.583, "eval_token_acc": 0.8689114214654677, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.6949385404586792, "learning_rate": 6.53861637579291e-06, "loss": 0.3096341609954834, "memory(GiB)": 30.16, "step": 185, "token_acc": 0.8867440022985204, "train_speed(iter/s)": 0.137864 }, { "epoch": 1.2262626262626264, "grad_norm": 0.7675971984863281, "learning_rate": 6.376007782794926e-06, "loss": 0.3296669483184814, "memory(GiB)": 30.16, "step": 190, "token_acc": 0.8872481430414091, "train_speed(iter/s)": 0.138534 }, { "epoch": 1.2585858585858585, "grad_norm": 0.6753478646278381, "learning_rate": 6.211808685466063e-06, "loss": 0.31036269664764404, "memory(GiB)": 30.16, "step": 195, "token_acc": 0.8989660334986432, "train_speed(iter/s)": 0.139226 }, { "epoch": 1.290909090909091, "grad_norm": 0.7082095742225647, "learning_rate": 6.046208878790543e-06, "loss": 0.3189213752746582, "memory(GiB)": 30.16, "step": 200, "token_acc": 0.893559169826382, "train_speed(iter/s)": 0.139505 }, { "epoch": 1.290909090909091, "eval_loss": 0.4101768136024475, "eval_runtime": 5.4502, "eval_samples_per_second": 18.348, "eval_steps_per_second": 4.587, "eval_token_acc": 0.8691792615266883, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.7023712992668152, "learning_rate": 5.879399776809047e-06, "loss": 0.3078160285949707, "memory(GiB)": 30.16, "step": 205, "token_acc": 0.8920373624341071, "train_speed(iter/s)": 0.138308 }, { "epoch": 1.3555555555555556, "grad_norm": 0.7248120307922363, "learning_rate": 5.711574191366427e-06, "loss": 0.326322340965271, "memory(GiB)": 30.16, "step": 210, "token_acc": 0.888576901881544, "train_speed(iter/s)": 0.138642 }, { "epoch": 1.387878787878788, "grad_norm": 0.7424785494804382, "learning_rate": 5.542926109243727e-06, "loss": 0.3178426504135132, "memory(GiB)": 30.16, "step": 215, "token_acc": 0.8996045025859446, "train_speed(iter/s)": 0.138982 }, { "epoch": 1.4202020202020202, "grad_norm": 0.7585700154304504, "learning_rate": 5.373650467932122e-06, "loss": 0.31358323097229, "memory(GiB)": 30.16, "step": 220, "token_acc": 0.8893666839273251, "train_speed(iter/s)": 0.139322 }, { "epoch": 1.4202020202020202, "eval_loss": 0.4098529815673828, "eval_runtime": 5.4526, "eval_samples_per_second": 18.34, "eval_steps_per_second": 4.585, "eval_token_acc": 0.8699062559785727, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 0.7577831149101257, "learning_rate": 5.2039429303079294e-06, "loss": 0.3181041717529297, "memory(GiB)": 30.16, "step": 225, "token_acc": 0.8940511833475905, "train_speed(iter/s)": 0.138553 }, { "epoch": 1.4848484848484849, "grad_norm": 0.8157357573509216, "learning_rate": 5.033999658469174e-06, "loss": 0.3100062370300293, "memory(GiB)": 30.16, "step": 230, "token_acc": 0.8942779905384095, "train_speed(iter/s)": 0.138826 }, { "epoch": 1.5171717171717172, "grad_norm": 0.7473869919776917, "learning_rate": 4.864017086995112e-06, "loss": 0.3215769290924072, "memory(GiB)": 30.16, "step": 235, "token_acc": 0.8864230396902226, "train_speed(iter/s)": 0.139172 }, { "epoch": 1.5494949494949495, "grad_norm": 0.7379017472267151, "learning_rate": 4.694191695890788e-06, "loss": 0.32453505992889403, "memory(GiB)": 30.16, "step": 240, "token_acc": 0.9024658286970259, "train_speed(iter/s)": 0.139554 }, { "epoch": 1.5494949494949495, "eval_loss": 0.406717449426651, "eval_runtime": 5.4667, "eval_samples_per_second": 18.293, "eval_steps_per_second": 4.573, "eval_token_acc": 0.8711880619858428, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.7182732224464417, "learning_rate": 4.524719783479088e-06, "loss": 0.3010763645172119, "memory(GiB)": 30.16, "step": 245, "token_acc": 0.8939800233960227, "train_speed(iter/s)": 0.138563 }, { "epoch": 1.614141414141414, "grad_norm": 0.7385874390602112, "learning_rate": 4.355797239502807e-06, "loss": 0.30601317882537843, "memory(GiB)": 30.16, "step": 250, "token_acc": 0.9005994116476079, "train_speed(iter/s)": 0.138773 }, { "epoch": 1.6464646464646466, "grad_norm": 0.7460725903511047, "learning_rate": 4.187619318698971e-06, "loss": 0.32054686546325684, "memory(GiB)": 30.16, "step": 255, "token_acc": 0.8981558249490219, "train_speed(iter/s)": 0.139197 }, { "epoch": 1.6787878787878787, "grad_norm": 0.7663230299949646, "learning_rate": 4.020380415107167e-06, "loss": 0.32004489898681643, "memory(GiB)": 30.16, "step": 260, "token_acc": 0.899984937490586, "train_speed(iter/s)": 0.139402 }, { "epoch": 1.6787878787878787, "eval_loss": 0.406484454870224, "eval_runtime": 5.4651, "eval_samples_per_second": 18.298, "eval_steps_per_second": 4.574, "eval_token_acc": 0.8711306676870098, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.7412447929382324, "learning_rate": 3.854273837372724e-06, "loss": 0.3273331642150879, "memory(GiB)": 30.16, "step": 265, "token_acc": 0.8869161986402602, "train_speed(iter/s)": 0.138596 }, { "epoch": 1.7434343434343433, "grad_norm": 0.773398756980896, "learning_rate": 3.689491585304491e-06, "loss": 0.3207144498825073, "memory(GiB)": 30.16, "step": 270, "token_acc": 0.8838720231835285, "train_speed(iter/s)": 0.138842 }, { "epoch": 1.7757575757575759, "grad_norm": 0.737702488899231, "learning_rate": 3.526224127945479e-06, "loss": 0.32349045276641847, "memory(GiB)": 30.16, "step": 275, "token_acc": 0.899477893067213, "train_speed(iter/s)": 0.139328 }, { "epoch": 1.808080808080808, "grad_norm": 0.7224950194358826, "learning_rate": 3.3646601834128924e-06, "loss": 0.30070719718933103, "memory(GiB)": 30.16, "step": 280, "token_acc": 0.8971495671394364, "train_speed(iter/s)": 0.139562 }, { "epoch": 1.808080808080808, "eval_loss": 0.40459758043289185, "eval_runtime": 5.4432, "eval_samples_per_second": 18.371, "eval_steps_per_second": 4.593, "eval_token_acc": 0.8716854792423953, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.6978333592414856, "learning_rate": 3.204986500762006e-06, "loss": 0.31296162605285643, "memory(GiB)": 30.16, "step": 285, "token_acc": 0.8931119696495075, "train_speed(iter/s)": 0.138664 }, { "epoch": 1.8727272727272726, "grad_norm": 0.7149765491485596, "learning_rate": 3.0473876441260786e-06, "loss": 0.2978228569030762, "memory(GiB)": 30.16, "step": 290, "token_acc": 0.9099453551912569, "train_speed(iter/s)": 0.138912 }, { "epoch": 1.905050505050505, "grad_norm": 0.7401219010353088, "learning_rate": 2.8920457793817507e-06, "loss": 0.3145498752593994, "memory(GiB)": 30.16, "step": 295, "token_acc": 0.8971085419769723, "train_speed(iter/s)": 0.139171 }, { "epoch": 1.9373737373737374, "grad_norm": 0.7960948348045349, "learning_rate": 2.7391404635865725e-06, "loss": 0.31858437061309813, "memory(GiB)": 30.16, "step": 300, "token_acc": 0.8927697189483228, "train_speed(iter/s)": 0.139487 }, { "epoch": 1.9373737373737374, "eval_loss": 0.40253254771232605, "eval_runtime": 5.4598, "eval_samples_per_second": 18.316, "eval_steps_per_second": 4.579, "eval_token_acc": 0.871608953510618, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.7060583233833313, "learning_rate": 2.5888484374320033e-06, "loss": 0.3089438438415527, "memory(GiB)": 30.16, "step": 305, "token_acc": 0.8951569409988135, "train_speed(iter/s)": 0.138796 }, { "epoch": 2.0, "grad_norm": 0.8663066625595093, "learning_rate": 2.4413434209518137e-06, "loss": 0.30643525123596194, "memory(GiB)": 30.16, "step": 310, "token_acc": 0.9002104614752836, "train_speed(iter/s)": 0.139084 }, { "epoch": 2.0323232323232325, "grad_norm": 0.6777763366699219, "learning_rate": 2.296795912722014e-06, "loss": 0.2622525691986084, "memory(GiB)": 30.16, "step": 315, "token_acc": 0.9182915057915058, "train_speed(iter/s)": 0.139164 }, { "epoch": 2.0646464646464646, "grad_norm": 0.7604569792747498, "learning_rate": 2.1553729927843894e-06, "loss": 0.2744235277175903, "memory(GiB)": 30.16, "step": 320, "token_acc": 0.9147708067912951, "train_speed(iter/s)": 0.139363 }, { "epoch": 2.0646464646464646, "eval_loss": 0.41404902935028076, "eval_runtime": 5.4464, "eval_samples_per_second": 18.361, "eval_steps_per_second": 4.59, "eval_token_acc": 0.8715898220776737, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.7027745246887207, "learning_rate": 2.017238129521506e-06, "loss": 0.2601346492767334, "memory(GiB)": 30.16, "step": 325, "token_acc": 0.9096833050957904, "train_speed(iter/s)": 0.138685 }, { "epoch": 2.1292929292929292, "grad_norm": 0.7389653325080872, "learning_rate": 1.8825509907063328e-06, "loss": 0.26451470851898196, "memory(GiB)": 30.16, "step": 330, "token_acc": 0.9258255445505091, "train_speed(iter/s)": 0.138988 }, { "epoch": 2.1616161616161618, "grad_norm": 0.750593364238739, "learning_rate": 1.7514672589449378e-06, "loss": 0.283371901512146, "memory(GiB)": 30.16, "step": 335, "token_acc": 0.904814352497736, "train_speed(iter/s)": 0.139218 }, { "epoch": 2.193939393939394, "grad_norm": 0.6902281641960144, "learning_rate": 1.6241384517255854e-06, "loss": 0.2589299440383911, "memory(GiB)": 30.16, "step": 340, "token_acc": 0.9170253055603375, "train_speed(iter/s)": 0.139418 }, { "epoch": 2.193939393939394, "eval_loss": 0.4144185781478882, "eval_runtime": 5.4505, "eval_samples_per_second": 18.347, "eval_steps_per_second": 4.587, "eval_token_acc": 0.8716472163765066, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.6674037575721741, "learning_rate": 1.500711746282192e-06, "loss": 0.2723775148391724, "memory(GiB)": 30.16, "step": 345, "token_acc": 0.9019237534484993, "train_speed(iter/s)": 0.138831 }, { "epoch": 2.2585858585858585, "grad_norm": 0.7093120217323303, "learning_rate": 1.3813298094746491e-06, "loss": 0.2645721912384033, "memory(GiB)": 30.16, "step": 350, "token_acc": 0.9119130680746748, "train_speed(iter/s)": 0.138979 }, { "epoch": 2.290909090909091, "grad_norm": 0.6906498074531555, "learning_rate": 1.2661306328825818e-06, "loss": 0.259444522857666, "memory(GiB)": 30.16, "step": 355, "token_acc": 0.9145142038672714, "train_speed(iter/s)": 0.139182 }, { "epoch": 2.323232323232323, "grad_norm": 0.7055203318595886, "learning_rate": 1.1552473733031893e-06, "loss": 0.25058302879333494, "memory(GiB)": 30.16, "step": 360, "token_acc": 0.9134637201070926, "train_speed(iter/s)": 0.139508 }, { "epoch": 2.323232323232323, "eval_loss": 0.41360363364219666, "eval_runtime": 5.4444, "eval_samples_per_second": 18.367, "eval_steps_per_second": 4.592, "eval_token_acc": 0.8721828964989478, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.7174535393714905, "learning_rate": 1.0488081988375493e-06, "loss": 0.26172120571136476, "memory(GiB)": 30.16, "step": 365, "token_acc": 0.9057994175722708, "train_speed(iter/s)": 0.139028 }, { "epoch": 2.3878787878787877, "grad_norm": 0.7143301367759705, "learning_rate": 9.469361407432431e-07, "loss": 0.2703177213668823, "memory(GiB)": 30.16, "step": 370, "token_acc": 0.9194112781795432, "train_speed(iter/s)": 0.139208 }, { "epoch": 2.4202020202020202, "grad_norm": 0.7012506127357483, "learning_rate": 8.497489512245971e-07, "loss": 0.27690658569335935, "memory(GiB)": 30.16, "step": 375, "token_acc": 0.9169483450919897, "train_speed(iter/s)": 0.139394 }, { "epoch": 2.4525252525252528, "grad_norm": 0.7648996114730835, "learning_rate": 7.573589673248833e-07, "loss": 0.26938657760620116, "memory(GiB)": 30.16, "step": 380, "token_acc": 0.9042763382008948, "train_speed(iter/s)": 0.139668 }, { "epoch": 2.4525252525252528, "eval_loss": 0.41534754633903503, "eval_runtime": 5.5436, "eval_samples_per_second": 18.039, "eval_steps_per_second": 4.51, "eval_token_acc": 0.871436770614119, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.6768928170204163, "learning_rate": 6.698729810778065e-07, "loss": 0.2653059482574463, "memory(GiB)": 30.16, "step": 385, "token_acc": 0.9062684911242603, "train_speed(iter/s)": 0.139087 }, { "epoch": 2.517171717171717, "grad_norm": 0.6828300952911377, "learning_rate": 5.873921160683943e-07, "loss": 0.27868268489837644, "memory(GiB)": 30.16, "step": 390, "token_acc": 0.9041146306155998, "train_speed(iter/s)": 0.139361 }, { "epoch": 2.5494949494949495, "grad_norm": 0.6979082822799683, "learning_rate": 5.100117105459279e-07, "loss": 0.24405245780944823, "memory(GiB)": 30.16, "step": 395, "token_acc": 0.9200107009095773, "train_speed(iter/s)": 0.139546 }, { "epoch": 2.581818181818182, "grad_norm": 0.6200575828552246, "learning_rate": 4.3782120722406565e-07, "loss": 0.2625063419342041, "memory(GiB)": 30.16, "step": 400, "token_acc": 0.9186572124972302, "train_speed(iter/s)": 0.139799 }, { "epoch": 2.581818181818182, "eval_loss": 0.41504132747650146, "eval_runtime": 5.4443, "eval_samples_per_second": 18.368, "eval_steps_per_second": 4.592, "eval_token_acc": 0.871723742108284, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.7500612139701843, "learning_rate": 3.709040498955102e-07, "loss": 0.26823058128356936, "memory(GiB)": 30.16, "step": 405, "token_acc": 0.8981710236522072, "train_speed(iter/s)": 0.139221 }, { "epoch": 2.6464646464646466, "grad_norm": 0.8057283163070679, "learning_rate": 3.0933758698072023e-07, "loss": 0.27291839122772216, "memory(GiB)": 30.16, "step": 410, "token_acc": 0.9183253730661121, "train_speed(iter/s)": 0.139402 }, { "epoch": 2.6787878787878787, "grad_norm": 0.7092121243476868, "learning_rate": 2.531929821221768e-07, "loss": 0.28043303489685056, "memory(GiB)": 30.16, "step": 415, "token_acc": 0.9004696220894495, "train_speed(iter/s)": 0.13956 }, { "epoch": 2.7111111111111112, "grad_norm": 0.7463679909706116, "learning_rate": 2.0253513192751374e-07, "loss": 0.26267204284667967, "memory(GiB)": 30.16, "step": 420, "token_acc": 0.9165323480546532, "train_speed(iter/s)": 0.139835 }, { "epoch": 2.7111111111111112, "eval_loss": 0.41523492336273193, "eval_runtime": 5.4494, "eval_samples_per_second": 18.351, "eval_steps_per_second": 4.588, "eval_token_acc": 0.8714559020470634, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.6929903626441956, "learning_rate": 1.5742259095662126e-07, "loss": 0.26644191741943357, "memory(GiB)": 30.16, "step": 425, "token_acc": 0.8992937483651583, "train_speed(iter/s)": 0.139333 }, { "epoch": 2.775757575757576, "grad_norm": 0.707722008228302, "learning_rate": 1.1790750403941231e-07, "loss": 0.266437292098999, "memory(GiB)": 30.16, "step": 430, "token_acc": 0.9212307137056753, "train_speed(iter/s)": 0.139491 }, { "epoch": 2.808080808080808, "grad_norm": 0.7085736393928528, "learning_rate": 8.403554600248498e-08, "loss": 0.25338120460510255, "memory(GiB)": 30.16, "step": 435, "token_acc": 0.9183108895950982, "train_speed(iter/s)": 0.139672 }, { "epoch": 2.8404040404040405, "grad_norm": 0.7079160213470459, "learning_rate": 5.584586887435739e-08, "loss": 0.26110315322875977, "memory(GiB)": 30.16, "step": 440, "token_acc": 0.9123924065558306, "train_speed(iter/s)": 0.139879 }, { "epoch": 2.8404040404040405, "eval_loss": 0.4152638614177704, "eval_runtime": 5.4468, "eval_samples_per_second": 18.359, "eval_steps_per_second": 4.59, "eval_token_acc": 0.8718959250047829, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.7511946558952332, "learning_rate": 3.337105663029361e-08, "loss": 0.2701514482498169, "memory(GiB)": 30.16, "step": 445, "token_acc": 0.9035763569457221, "train_speed(iter/s)": 0.13938 }, { "epoch": 2.905050505050505, "grad_norm": 0.7157277464866638, "learning_rate": 1.6637087529033925e-08, "loss": 0.25613832473754883, "memory(GiB)": 30.16, "step": 450, "token_acc": 0.9136720727064674, "train_speed(iter/s)": 0.139627 }, { "epoch": 2.937373737373737, "grad_norm": 0.7009196281433105, "learning_rate": 5.6633040849601865e-09, "loss": 0.25980963706970217, "memory(GiB)": 30.16, "step": 455, "token_acc": 0.9089919103920349, "train_speed(iter/s)": 0.139745 }, { "epoch": 2.9696969696969697, "grad_norm": 0.7183138728141785, "learning_rate": 4.623907104084335e-10, "loss": 0.27978599071502686, "memory(GiB)": 30.16, "step": 460, "token_acc": 0.9131394658753709, "train_speed(iter/s)": 0.140016 }, { "epoch": 2.9696969696969697, "eval_loss": 0.41502535343170166, "eval_runtime": 5.4392, "eval_samples_per_second": 18.385, "eval_steps_per_second": 4.596, "eval_token_acc": 0.8717046106753396, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.41519150137901306, "eval_runtime": 5.4401, "eval_samples_per_second": 18.382, "eval_steps_per_second": 4.596, "eval_token_acc": 0.871608953510618, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.033815544329667e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }