{ "best_global_step": 120, "best_metric": 0.33126009, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v7-20250507-004227/checkpoint-120", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 2.4505362510681152, "learning_rate": 9.999884400986087e-06, "loss": 0.4081788659095764, "memory(GiB)": 27.77, "step": 1, "token_acc": 0.8560397131825703, "train_speed(iter/s)": 0.065308 }, { "epoch": 0.03232323232323232, "grad_norm": 1.3398605585098267, "learning_rate": 9.997110291906109e-06, "loss": 0.3790343999862671, "memory(GiB)": 27.77, "step": 5, "token_acc": 0.8759903354497949, "train_speed(iter/s)": 0.120195 }, { "epoch": 0.06464646464646465, "grad_norm": 1.0211882591247559, "learning_rate": 9.988444507789584e-06, "loss": 0.3569159030914307, "memory(GiB)": 27.77, "step": 10, "token_acc": 0.8904844941361507, "train_speed(iter/s)": 0.137392 }, { "epoch": 0.09696969696969697, "grad_norm": 1.0586270093917847, "learning_rate": 9.97401266428502e-06, "loss": 0.36738641262054444, "memory(GiB)": 27.77, "step": 15, "token_acc": 0.8821489760952925, "train_speed(iter/s)": 0.140251 }, { "epoch": 0.1292929292929293, "grad_norm": 1.1482552289962769, "learning_rate": 9.953831442918418e-06, "loss": 0.3260908842086792, "memory(GiB)": 27.77, "step": 20, "token_acc": 0.8923615160349854, "train_speed(iter/s)": 0.144117 }, { "epoch": 0.1292929292929293, "eval_loss": 0.3649641275405884, "eval_runtime": 5.3926, "eval_samples_per_second": 18.544, "eval_steps_per_second": 4.636, "eval_token_acc": 0.8829422873787651, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 1.0274701118469238, "learning_rate": 9.927924170825266e-06, "loss": 0.32098817825317383, "memory(GiB)": 27.77, "step": 25, "token_acc": 0.8865429663420047, "train_speed(iter/s)": 0.132926 }, { "epoch": 0.19393939393939394, "grad_norm": 0.9113245010375977, "learning_rate": 9.896320793787106e-06, "loss": 0.35467684268951416, "memory(GiB)": 27.77, "step": 30, "token_acc": 0.8852310260970564, "train_speed(iter/s)": 0.137888 }, { "epoch": 0.22626262626262628, "grad_norm": 0.9023920893669128, "learning_rate": 9.859057841617709e-06, "loss": 0.3223384380340576, "memory(GiB)": 27.77, "step": 35, "token_acc": 0.8949514563106796, "train_speed(iter/s)": 0.140106 }, { "epoch": 0.2585858585858586, "grad_norm": 0.9127278923988342, "learning_rate": 9.816178385938867e-06, "loss": 0.3180943489074707, "memory(GiB)": 27.77, "step": 40, "token_acc": 0.90066669149689, "train_speed(iter/s)": 0.142488 }, { "epoch": 0.2585858585858586, "eval_loss": 0.3500390648841858, "eval_runtime": 5.3821, "eval_samples_per_second": 18.58, "eval_steps_per_second": 4.645, "eval_token_acc": 0.8875516148650812, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 0.947066068649292, "learning_rate": 9.767731990394638e-06, "loss": 0.33401944637298586, "memory(GiB)": 27.77, "step": 45, "token_acc": 0.8919813402256824, "train_speed(iter/s)": 0.136919 }, { "epoch": 0.32323232323232326, "grad_norm": 2.0620720386505127, "learning_rate": 9.71377465336155e-06, "loss": 0.3351354837417603, "memory(GiB)": 27.77, "step": 50, "token_acc": 0.8765755647073505, "train_speed(iter/s)": 0.139127 }, { "epoch": 0.35555555555555557, "grad_norm": 0.863703191280365, "learning_rate": 9.654368743221022e-06, "loss": 0.3273132801055908, "memory(GiB)": 27.77, "step": 55, "token_acc": 0.8912860949877706, "train_speed(iter/s)": 0.140614 }, { "epoch": 0.3878787878787879, "grad_norm": 0.890646755695343, "learning_rate": 9.589582926268798e-06, "loss": 0.3155367374420166, "memory(GiB)": 30.08, "step": 60, "token_acc": 0.9143575243480992, "train_speed(iter/s)": 0.142452 }, { "epoch": 0.3878787878787879, "eval_loss": 0.3454614281654358, "eval_runtime": 5.3711, "eval_samples_per_second": 18.618, "eval_steps_per_second": 4.655, "eval_token_acc": 0.8889920297045549, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.8996425271034241, "learning_rate": 9.519492087344724e-06, "loss": 0.2981250762939453, "memory(GiB)": 30.08, "step": 65, "token_acc": 0.9051183738056113, "train_speed(iter/s)": 0.137618 }, { "epoch": 0.45252525252525255, "grad_norm": 0.9426372647285461, "learning_rate": 9.444177243274619e-06, "loss": 0.3414067029953003, "memory(GiB)": 30.08, "step": 70, "token_acc": 0.8901309721453606, "train_speed(iter/s)": 0.139706 }, { "epoch": 0.48484848484848486, "grad_norm": 0.8184367418289185, "learning_rate": 9.363725449224281e-06, "loss": 0.32115802764892576, "memory(GiB)": 30.08, "step": 75, "token_acc": 0.8985269424515341, "train_speed(iter/s)": 0.14097 }, { "epoch": 0.5171717171717172, "grad_norm": 0.9321162104606628, "learning_rate": 9.278229698073889e-06, "loss": 0.31497313976287844, "memory(GiB)": 30.08, "step": 80, "token_acc": 0.9010191988622896, "train_speed(iter/s)": 0.141741 }, { "epoch": 0.5171717171717172, "eval_loss": 0.33993807435035706, "eval_runtime": 5.3957, "eval_samples_per_second": 18.533, "eval_steps_per_second": 4.633, "eval_token_acc": 0.8895361864216894, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.8877705931663513, "learning_rate": 9.187788812929074e-06, "loss": 0.32355318069458006, "memory(GiB)": 30.08, "step": 85, "token_acc": 0.8971333885666943, "train_speed(iter/s)": 0.138684 }, { "epoch": 0.5818181818181818, "grad_norm": 1.019900918006897, "learning_rate": 9.092507332892968e-06, "loss": 0.33187189102172854, "memory(GiB)": 30.08, "step": 90, "token_acc": 0.8973049754299754, "train_speed(iter/s)": 0.140038 }, { "epoch": 0.6141414141414141, "grad_norm": 1.0134016275405884, "learning_rate": 8.992495392231195e-06, "loss": 0.3340008020401001, "memory(GiB)": 30.08, "step": 95, "token_acc": 0.9059227157818707, "train_speed(iter/s)": 0.141282 }, { "epoch": 0.6464646464646465, "grad_norm": 0.9215405583381653, "learning_rate": 8.88786859306952e-06, "loss": 0.306801438331604, "memory(GiB)": 30.08, "step": 100, "token_acc": 0.8903882234088613, "train_speed(iter/s)": 0.142097 }, { "epoch": 0.6464646464646465, "eval_loss": 0.3363126516342163, "eval_runtime": 5.4132, "eval_samples_per_second": 18.473, "eval_steps_per_second": 4.618, "eval_token_acc": 0.8896002048589994, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 0.9498901963233948, "learning_rate": 8.778747871771293e-06, "loss": 0.31042842864990233, "memory(GiB)": 30.08, "step": 105, "token_acc": 0.889660103071286, "train_speed(iter/s)": 0.139949 }, { "epoch": 0.7111111111111111, "grad_norm": 0.8483244180679321, "learning_rate": 8.665259359149132e-06, "loss": 0.3191797733306885, "memory(GiB)": 30.08, "step": 110, "token_acc": 0.909381808278867, "train_speed(iter/s)": 0.140731 }, { "epoch": 0.7434343434343434, "grad_norm": 0.7640553116798401, "learning_rate": 8.547534234672435e-06, "loss": 0.2995746374130249, "memory(GiB)": 30.08, "step": 115, "token_acc": 0.8994715117849015, "train_speed(iter/s)": 0.141551 }, { "epoch": 0.7757575757575758, "grad_norm": 0.9591003656387329, "learning_rate": 8.425708574839221e-06, "loss": 0.32628965377807617, "memory(GiB)": 30.08, "step": 120, "token_acc": 0.8891077731264877, "train_speed(iter/s)": 0.142186 }, { "epoch": 0.7757575757575758, "eval_loss": 0.331260085105896, "eval_runtime": 5.3939, "eval_samples_per_second": 18.539, "eval_steps_per_second": 4.635, "eval_token_acc": 0.8917128132902276, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.9353374242782593, "learning_rate": 8.299923195887599e-06, "loss": 0.3271709680557251, "memory(GiB)": 30.08, "step": 125, "token_acc": 0.8903564002694234, "train_speed(iter/s)": 0.140229 }, { "epoch": 0.8404040404040404, "grad_norm": 0.9060182571411133, "learning_rate": 8.170323491028625e-06, "loss": 0.3163918018341064, "memory(GiB)": 30.08, "step": 130, "token_acc": 0.8912760416666666, "train_speed(iter/s)": 0.140553 }, { "epoch": 0.8727272727272727, "grad_norm": 0.8269082903862, "learning_rate": 8.03705926238874e-06, "loss": 0.3141618251800537, "memory(GiB)": 30.08, "step": 135, "token_acc": 0.8898337308583083, "train_speed(iter/s)": 0.141211 }, { "epoch": 0.9050505050505051, "grad_norm": 0.8577111959457397, "learning_rate": 7.900284547855992e-06, "loss": 0.3134615898132324, "memory(GiB)": 30.08, "step": 140, "token_acc": 0.9006930194742344, "train_speed(iter/s)": 0.141613 }, { "epoch": 0.9050505050505051, "eval_loss": 0.3313320279121399, "eval_runtime": 5.3884, "eval_samples_per_second": 18.558, "eval_steps_per_second": 4.64, "eval_token_acc": 0.8926410806312218, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.8400760293006897, "learning_rate": 7.760157443030234e-06, "loss": 0.2992702007293701, "memory(GiB)": 30.08, "step": 145, "token_acc": 0.9021267154765301, "train_speed(iter/s)": 0.139829 }, { "epoch": 0.9696969696969697, "grad_norm": 0.8579837679862976, "learning_rate": 7.616839918483061e-06, "loss": 0.32117404937744143, "memory(GiB)": 30.08, "step": 150, "token_acc": 0.8848251385041551, "train_speed(iter/s)": 0.140369 }, { "epoch": 1.0, "grad_norm": 0.8645791411399841, "learning_rate": 7.470497632538743e-06, "loss": 0.3043407440185547, "memory(GiB)": 30.08, "step": 155, "token_acc": 0.903212915601023, "train_speed(iter/s)": 0.141142 }, { "epoch": 1.0323232323232323, "grad_norm": 0.8486159443855286, "learning_rate": 7.321299739792553e-06, "loss": 0.2472972869873047, "memory(GiB)": 30.08, "step": 160, "token_acc": 0.9189923065319052, "train_speed(iter/s)": 0.14175 }, { "epoch": 1.0323232323232323, "eval_loss": 0.33611831068992615, "eval_runtime": 5.3694, "eval_samples_per_second": 18.624, "eval_steps_per_second": 4.656, "eval_token_acc": 0.8918088409461925, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 0.8444439768791199, "learning_rate": 7.169418695587791e-06, "loss": 0.22124772071838378, "memory(GiB)": 30.08, "step": 165, "token_acc": 0.92039636166496, "train_speed(iter/s)": 0.14026 }, { "epoch": 1.096969696969697, "grad_norm": 0.8758794069290161, "learning_rate": 7.015030056677559e-06, "loss": 0.231048059463501, "memory(GiB)": 30.08, "step": 170, "token_acc": 0.927355278093076, "train_speed(iter/s)": 0.141102 }, { "epoch": 1.1292929292929292, "grad_norm": 0.9414038062095642, "learning_rate": 6.858312278301638e-06, "loss": 0.2431964874267578, "memory(GiB)": 30.08, "step": 175, "token_acc": 0.914981199287552, "train_speed(iter/s)": 0.141563 }, { "epoch": 1.1616161616161615, "grad_norm": 0.8615570664405823, "learning_rate": 6.699446507913083e-06, "loss": 0.22901198863983155, "memory(GiB)": 30.08, "step": 180, "token_acc": 0.9300724249884048, "train_speed(iter/s)": 0.141935 }, { "epoch": 1.1616161616161615, "eval_loss": 0.34168365597724915, "eval_runtime": 5.3971, "eval_samples_per_second": 18.528, "eval_steps_per_second": 4.632, "eval_token_acc": 0.8907205275119234, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.8244655132293701, "learning_rate": 6.53861637579291e-06, "loss": 0.2308629035949707, "memory(GiB)": 30.08, "step": 185, "token_acc": 0.9109640722038423, "train_speed(iter/s)": 0.140603 }, { "epoch": 1.2262626262626264, "grad_norm": 0.9139054417610168, "learning_rate": 6.376007782794926e-06, "loss": 0.2585730791091919, "memory(GiB)": 30.08, "step": 190, "token_acc": 0.9028991841491841, "train_speed(iter/s)": 0.141319 }, { "epoch": 1.2585858585858585, "grad_norm": 0.7501769065856934, "learning_rate": 6.211808685466063e-06, "loss": 0.2274195671081543, "memory(GiB)": 30.08, "step": 195, "token_acc": 0.9299425265767627, "train_speed(iter/s)": 0.142053 }, { "epoch": 1.290909090909091, "grad_norm": 0.8027601838111877, "learning_rate": 6.046208878790543e-06, "loss": 0.2291938304901123, "memory(GiB)": 30.08, "step": 200, "token_acc": 0.9253513490971267, "train_speed(iter/s)": 0.142337 }, { "epoch": 1.290909090909091, "eval_loss": 0.34100720286369324, "eval_runtime": 5.3678, "eval_samples_per_second": 18.63, "eval_steps_per_second": 4.657, "eval_token_acc": 0.8911046381357831, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.8584316372871399, "learning_rate": 5.879399776809047e-06, "loss": 0.21250443458557128, "memory(GiB)": 30.08, "step": 205, "token_acc": 0.9207754541291406, "train_speed(iter/s)": 0.141123 }, { "epoch": 1.3555555555555556, "grad_norm": 0.8164386749267578, "learning_rate": 5.711574191366427e-06, "loss": 0.23753111362457274, "memory(GiB)": 30.08, "step": 210, "token_acc": 0.9153866525423728, "train_speed(iter/s)": 0.141492 }, { "epoch": 1.387878787878788, "grad_norm": 0.8197464346885681, "learning_rate": 5.542926109243727e-06, "loss": 0.2262495279312134, "memory(GiB)": 30.08, "step": 215, "token_acc": 0.9298754093424173, "train_speed(iter/s)": 0.141871 }, { "epoch": 1.4202020202020202, "grad_norm": 0.8861284255981445, "learning_rate": 5.373650467932122e-06, "loss": 0.23235108852386474, "memory(GiB)": 30.08, "step": 220, "token_acc": 0.916040434865535, "train_speed(iter/s)": 0.142257 }, { "epoch": 1.4202020202020202, "eval_loss": 0.34229058027267456, "eval_runtime": 5.3952, "eval_samples_per_second": 18.535, "eval_steps_per_second": 4.634, "eval_token_acc": 0.8919368778208124, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 0.8957362771034241, "learning_rate": 5.2039429303079294e-06, "loss": 0.2363661289215088, "memory(GiB)": 30.08, "step": 225, "token_acc": 0.9184887277670782, "train_speed(iter/s)": 0.141447 }, { "epoch": 1.4848484848484849, "grad_norm": 0.8692964911460876, "learning_rate": 5.033999658469174e-06, "loss": 0.22849671840667723, "memory(GiB)": 30.08, "step": 230, "token_acc": 0.9200207931085698, "train_speed(iter/s)": 0.141745 }, { "epoch": 1.5171717171717172, "grad_norm": 0.8732675909996033, "learning_rate": 4.864017086995112e-06, "loss": 0.22888550758361817, "memory(GiB)": 30.08, "step": 235, "token_acc": 0.9242218099360956, "train_speed(iter/s)": 0.142114 }, { "epoch": 1.5494949494949495, "grad_norm": 0.8548147082328796, "learning_rate": 4.694191695890788e-06, "loss": 0.24225883483886718, "memory(GiB)": 30.08, "step": 240, "token_acc": 0.9354398726983405, "train_speed(iter/s)": 0.142537 }, { "epoch": 1.5494949494949495, "eval_loss": 0.3384065330028534, "eval_runtime": 5.3801, "eval_samples_per_second": 18.587, "eval_steps_per_second": 4.647, "eval_token_acc": 0.8922569700073621, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.8377289772033691, "learning_rate": 4.524719783479088e-06, "loss": 0.20645420551300048, "memory(GiB)": 30.08, "step": 245, "token_acc": 0.9174505252870755, "train_speed(iter/s)": 0.14148 }, { "epoch": 1.614141414141414, "grad_norm": 0.779121994972229, "learning_rate": 4.355797239502807e-06, "loss": 0.2250507354736328, "memory(GiB)": 30.08, "step": 250, "token_acc": 0.9264376661536309, "train_speed(iter/s)": 0.141711 }, { "epoch": 1.6464646464646466, "grad_norm": 0.8410191535949707, "learning_rate": 4.187619318698971e-06, "loss": 0.2303227186203003, "memory(GiB)": 30.08, "step": 255, "token_acc": 0.9288690903865497, "train_speed(iter/s)": 0.142159 }, { "epoch": 1.6787878787878787, "grad_norm": 0.8751044273376465, "learning_rate": 4.020380415107167e-06, "loss": 0.24010176658630372, "memory(GiB)": 30.08, "step": 260, "token_acc": 0.9276785345930113, "train_speed(iter/s)": 0.142379 }, { "epoch": 1.6787878787878787, "eval_loss": 0.33762192726135254, "eval_runtime": 5.3885, "eval_samples_per_second": 18.558, "eval_steps_per_second": 4.639, "eval_token_acc": 0.8926090714125668, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.7685917019844055, "learning_rate": 3.854273837372724e-06, "loss": 0.25291612148284914, "memory(GiB)": 30.08, "step": 265, "token_acc": 0.9064826915478832, "train_speed(iter/s)": 0.141534 }, { "epoch": 1.7434343434343433, "grad_norm": 0.8771150708198547, "learning_rate": 3.689491585304491e-06, "loss": 0.23574538230895997, "memory(GiB)": 30.08, "step": 270, "token_acc": 0.9073804876022761, "train_speed(iter/s)": 0.141801 }, { "epoch": 1.7757575757575759, "grad_norm": 0.8586969375610352, "learning_rate": 3.526224127945479e-06, "loss": 0.24325270652770997, "memory(GiB)": 30.08, "step": 275, "token_acc": 0.9250533832744076, "train_speed(iter/s)": 0.142307 }, { "epoch": 1.808080808080808, "grad_norm": 0.8120052814483643, "learning_rate": 3.3646601834128924e-06, "loss": 0.2067141056060791, "memory(GiB)": 30.08, "step": 280, "token_acc": 0.9247845178077736, "train_speed(iter/s)": 0.142568 }, { "epoch": 1.808080808080808, "eval_loss": 0.3372032642364502, "eval_runtime": 5.3852, "eval_samples_per_second": 18.569, "eval_steps_per_second": 4.642, "eval_token_acc": 0.8925130437566019, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.7563571929931641, "learning_rate": 3.204986500762006e-06, "loss": 0.22141036987304688, "memory(GiB)": 30.08, "step": 285, "token_acc": 0.9158564914393874, "train_speed(iter/s)": 0.141564 }, { "epoch": 1.8727272727272726, "grad_norm": 0.840555727481842, "learning_rate": 3.0473876441260786e-06, "loss": 0.22226524353027344, "memory(GiB)": 30.08, "step": 290, "token_acc": 0.9322453534191164, "train_speed(iter/s)": 0.14182 }, { "epoch": 1.905050505050505, "grad_norm": 0.8599358797073364, "learning_rate": 2.8920457793817507e-06, "loss": 0.22878422737121581, "memory(GiB)": 30.08, "step": 295, "token_acc": 0.9275855327468231, "train_speed(iter/s)": 0.142089 }, { "epoch": 1.9373737373737374, "grad_norm": 0.9196203947067261, "learning_rate": 2.7391404635865725e-06, "loss": 0.23831405639648437, "memory(GiB)": 30.08, "step": 300, "token_acc": 0.9162388743213797, "train_speed(iter/s)": 0.142402 }, { "epoch": 1.9373737373737374, "eval_loss": 0.33528250455856323, "eval_runtime": 5.3837, "eval_samples_per_second": 18.575, "eval_steps_per_second": 4.644, "eval_token_acc": 0.8929931820364265, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.757847785949707, "learning_rate": 2.5888484374320033e-06, "loss": 0.2106797695159912, "memory(GiB)": 30.08, "step": 305, "token_acc": 0.9235401079083078, "train_speed(iter/s)": 0.141615 }, { "epoch": 2.0, "grad_norm": 0.967450737953186, "learning_rate": 2.4413434209518137e-06, "loss": 0.21637356281280518, "memory(GiB)": 30.08, "step": 310, "token_acc": 0.9329970868298622, "train_speed(iter/s)": 0.141903 }, { "epoch": 2.0323232323232325, "grad_norm": 0.7503668665885925, "learning_rate": 2.296795912722014e-06, "loss": 0.16243449449539185, "memory(GiB)": 30.08, "step": 315, "token_acc": 0.9508763656370353, "train_speed(iter/s)": 0.141997 }, { "epoch": 2.0646464646464646, "grad_norm": 0.8131990432739258, "learning_rate": 2.1553729927843894e-06, "loss": 0.17449368238449098, "memory(GiB)": 30.08, "step": 320, "token_acc": 0.9495816440955749, "train_speed(iter/s)": 0.142202 }, { "epoch": 2.0646464646464646, "eval_loss": 0.3504800796508789, "eval_runtime": 5.4562, "eval_samples_per_second": 18.328, "eval_steps_per_second": 4.582, "eval_token_acc": 0.8927691175058416, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.8142232894897461, "learning_rate": 2.017238129521506e-06, "loss": 0.16946163177490234, "memory(GiB)": 30.08, "step": 325, "token_acc": 0.9349265764468759, "train_speed(iter/s)": 0.141472 }, { "epoch": 2.1292929292929292, "grad_norm": 0.8298311829566956, "learning_rate": 1.8825509907063328e-06, "loss": 0.1755598545074463, "memory(GiB)": 30.08, "step": 330, "token_acc": 0.9531752999707346, "train_speed(iter/s)": 0.141802 }, { "epoch": 2.1616161616161618, "grad_norm": 0.7940059304237366, "learning_rate": 1.7514672589449378e-06, "loss": 0.1952407479286194, "memory(GiB)": 30.08, "step": 335, "token_acc": 0.9343925770825635, "train_speed(iter/s)": 0.142047 }, { "epoch": 2.193939393939394, "grad_norm": 0.7858513593673706, "learning_rate": 1.6241384517255854e-06, "loss": 0.16918621063232422, "memory(GiB)": 30.08, "step": 340, "token_acc": 0.9412830735773831, "train_speed(iter/s)": 0.142253 }, { "epoch": 2.193939393939394, "eval_loss": 0.3576539158821106, "eval_runtime": 5.351, "eval_samples_per_second": 18.688, "eval_steps_per_second": 4.672, "eval_token_acc": 0.8910406196984731, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.7290251851081848, "learning_rate": 1.500711746282192e-06, "loss": 0.1872728943824768, "memory(GiB)": 30.08, "step": 345, "token_acc": 0.9292867611138251, "train_speed(iter/s)": 0.141644 }, { "epoch": 2.2585858585858585, "grad_norm": 0.7997108101844788, "learning_rate": 1.3813298094746491e-06, "loss": 0.16540231704711914, "memory(GiB)": 30.08, "step": 350, "token_acc": 0.9447741310403294, "train_speed(iter/s)": 0.141801 }, { "epoch": 2.290909090909091, "grad_norm": 0.7840582728385925, "learning_rate": 1.2661306328825818e-06, "loss": 0.17242782115936278, "memory(GiB)": 30.08, "step": 355, "token_acc": 0.9399465492847037, "train_speed(iter/s)": 0.142023 }, { "epoch": 2.323232323232323, "grad_norm": 0.7512005567550659, "learning_rate": 1.1552473733031893e-06, "loss": 0.1620992064476013, "memory(GiB)": 30.08, "step": 360, "token_acc": 0.9435792877983619, "train_speed(iter/s)": 0.142359 }, { "epoch": 2.323232323232323, "eval_loss": 0.36009594798088074, "eval_runtime": 5.3832, "eval_samples_per_second": 18.576, "eval_steps_per_second": 4.644, "eval_token_acc": 0.8920008962581223, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.7732217311859131, "learning_rate": 1.0488081988375493e-06, "loss": 0.16843740940093993, "memory(GiB)": 30.08, "step": 365, "token_acc": 0.9334714548802947, "train_speed(iter/s)": 0.141824 }, { "epoch": 2.3878787878787877, "grad_norm": 0.7981094121932983, "learning_rate": 9.469361407432431e-07, "loss": 0.1794123411178589, "memory(GiB)": 30.08, "step": 370, "token_acc": 0.9482818106541541, "train_speed(iter/s)": 0.142017 }, { "epoch": 2.4202020202020202, "grad_norm": 0.7665418982505798, "learning_rate": 8.497489512245971e-07, "loss": 0.1843852996826172, "memory(GiB)": 30.08, "step": 375, "token_acc": 0.9573796089286348, "train_speed(iter/s)": 0.142235 }, { "epoch": 2.4525252525252528, "grad_norm": 0.9355995655059814, "learning_rate": 7.573589673248833e-07, "loss": 0.17202303409576417, "memory(GiB)": 30.08, "step": 380, "token_acc": 0.9362966839881864, "train_speed(iter/s)": 0.142515 }, { "epoch": 2.4525252525252528, "eval_loss": 0.36116844415664673, "eval_runtime": 5.3783, "eval_samples_per_second": 18.593, "eval_steps_per_second": 4.648, "eval_token_acc": 0.8914567395409878, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.7312317490577698, "learning_rate": 6.698729810778065e-07, "loss": 0.17411458492279053, "memory(GiB)": 30.08, "step": 385, "token_acc": 0.9338178444410082, "train_speed(iter/s)": 0.141893 }, { "epoch": 2.517171717171717, "grad_norm": 0.7563744187355042, "learning_rate": 5.873921160683943e-07, "loss": 0.1915157437324524, "memory(GiB)": 30.08, "step": 390, "token_acc": 0.9295483460559797, "train_speed(iter/s)": 0.142171 }, { "epoch": 2.5494949494949495, "grad_norm": 0.7823712229728699, "learning_rate": 5.100117105459279e-07, "loss": 0.15321061611175538, "memory(GiB)": 30.08, "step": 395, "token_acc": 0.9472502392696753, "train_speed(iter/s)": 0.142376 }, { "epoch": 2.581818181818182, "grad_norm": 0.6383055448532104, "learning_rate": 4.3782120722406565e-07, "loss": 0.16857578754425048, "memory(GiB)": 30.08, "step": 400, "token_acc": 0.9525385172164202, "train_speed(iter/s)": 0.142658 }, { "epoch": 2.581818181818182, "eval_loss": 0.36100760102272034, "eval_runtime": 5.3454, "eval_samples_per_second": 18.708, "eval_steps_per_second": 4.677, "eval_token_acc": 0.891264684229058, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.8574426174163818, "learning_rate": 3.709040498955102e-07, "loss": 0.18224529027938843, "memory(GiB)": 30.08, "step": 405, "token_acc": 0.9255623050402233, "train_speed(iter/s)": 0.142038 }, { "epoch": 2.6464646464646466, "grad_norm": 0.9696727395057678, "learning_rate": 3.0933758698072023e-07, "loss": 0.18939828872680664, "memory(GiB)": 30.08, "step": 410, "token_acc": 0.9416907375312922, "train_speed(iter/s)": 0.142235 }, { "epoch": 2.6787878787878787, "grad_norm": 0.7818398475646973, "learning_rate": 2.531929821221768e-07, "loss": 0.19069280624389648, "memory(GiB)": 30.08, "step": 415, "token_acc": 0.9258034817542685, "train_speed(iter/s)": 0.142409 }, { "epoch": 2.7111111111111112, "grad_norm": 0.8981226086616516, "learning_rate": 2.0253513192751374e-07, "loss": 0.17302310466766357, "memory(GiB)": 30.08, "step": 420, "token_acc": 0.950883135736753, "train_speed(iter/s)": 0.142691 }, { "epoch": 2.7111111111111112, "eval_loss": 0.3606036305427551, "eval_runtime": 5.3784, "eval_samples_per_second": 18.593, "eval_steps_per_second": 4.648, "eval_token_acc": 0.8917128132902276, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.754592776298523, "learning_rate": 1.5742259095662126e-07, "loss": 0.16562799215316773, "memory(GiB)": 30.08, "step": 425, "token_acc": 0.9294097342078012, "train_speed(iter/s)": 0.14213 }, { "epoch": 2.775757575757576, "grad_norm": 0.811010479927063, "learning_rate": 1.1790750403941231e-07, "loss": 0.17516304254531861, "memory(GiB)": 30.08, "step": 430, "token_acc": 0.953036002149382, "train_speed(iter/s)": 0.142302 }, { "epoch": 2.808080808080808, "grad_norm": 0.8035722374916077, "learning_rate": 8.403554600248498e-08, "loss": 0.16143158674240113, "memory(GiB)": 30.08, "step": 435, "token_acc": 0.9470889436753271, "train_speed(iter/s)": 0.142493 }, { "epoch": 2.8404040404040405, "grad_norm": 0.7885386347770691, "learning_rate": 5.584586887435739e-08, "loss": 0.16893348693847657, "memory(GiB)": 30.08, "step": 440, "token_acc": 0.946100607044813, "train_speed(iter/s)": 0.142701 }, { "epoch": 2.8404040404040405, "eval_loss": 0.36063292622566223, "eval_runtime": 5.3864, "eval_samples_per_second": 18.565, "eval_steps_per_second": 4.641, "eval_token_acc": 0.8917448225088825, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.8363362550735474, "learning_rate": 3.337105663029361e-08, "loss": 0.166959547996521, "memory(GiB)": 30.08, "step": 445, "token_acc": 0.9339094103124109, "train_speed(iter/s)": 0.142143 }, { "epoch": 2.905050505050505, "grad_norm": 0.817148745059967, "learning_rate": 1.6637087529033925e-08, "loss": 0.16920559406280516, "memory(GiB)": 30.08, "step": 450, "token_acc": 0.9441476444876153, "train_speed(iter/s)": 0.142396 }, { "epoch": 2.937373737373737, "grad_norm": 0.7608515620231628, "learning_rate": 5.6633040849601865e-09, "loss": 0.16781603097915648, "memory(GiB)": 30.08, "step": 455, "token_acc": 0.9337727971874313, "train_speed(iter/s)": 0.142524 }, { "epoch": 2.9696969696969697, "grad_norm": 0.8431264162063599, "learning_rate": 4.623907104084335e-10, "loss": 0.2008026123046875, "memory(GiB)": 30.08, "step": 460, "token_acc": 0.9438367531683766, "train_speed(iter/s)": 0.142801 }, { "epoch": 2.9696969696969697, "eval_loss": 0.36077243089675903, "eval_runtime": 5.3777, "eval_samples_per_second": 18.595, "eval_steps_per_second": 4.649, "eval_token_acc": 0.8913927211036778, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.3604045808315277, "eval_runtime": 5.3956, "eval_samples_per_second": 18.534, "eval_steps_per_second": 4.633, "eval_token_acc": 0.8918728593835025, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.754364855085957e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }