{ "best_global_step": 300, "best_metric": 0.21289518, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v32-20250511-155741/checkpoint-300", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 2.4983460903167725, "learning_rate": 9.999884400986087e-06, "loss": 0.4511958062648773, "memory(GiB)": 29.0, "step": 1, "token_acc": 0.8814259881167656, "train_speed(iter/s)": 0.064483 }, { "epoch": 0.03232323232323232, "grad_norm": 1.3926482200622559, "learning_rate": 9.997110291906109e-06, "loss": 0.36116155982017517, "memory(GiB)": 29.0, "step": 5, "token_acc": 0.8813427587993634, "train_speed(iter/s)": 0.117455 }, { "epoch": 0.06464646464646465, "grad_norm": 0.9170165657997131, "learning_rate": 9.988444507789584e-06, "loss": 0.2866232395172119, "memory(GiB)": 29.01, "step": 10, "token_acc": 0.9120523855610239, "train_speed(iter/s)": 0.133259 }, { "epoch": 0.09696969696969697, "grad_norm": 1.0082355737686157, "learning_rate": 9.97401266428502e-06, "loss": 0.28241963386535646, "memory(GiB)": 29.01, "step": 15, "token_acc": 0.9087824080358354, "train_speed(iter/s)": 0.138372 }, { "epoch": 0.1292929292929293, "grad_norm": 0.8785488605499268, "learning_rate": 9.953831442918418e-06, "loss": 0.2537196159362793, "memory(GiB)": 29.01, "step": 20, "token_acc": 0.9221852133546644, "train_speed(iter/s)": 0.141695 }, { "epoch": 0.1292929292929293, "eval_loss": 0.2639790177345276, "eval_runtime": 4.917, "eval_samples_per_second": 20.337, "eval_steps_per_second": 5.084, "eval_token_acc": 0.9243485416476181, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 0.8217573761940002, "learning_rate": 9.927924170825266e-06, "loss": 0.25768203735351564, "memory(GiB)": 35.4, "step": 25, "token_acc": 0.9051673457803726, "train_speed(iter/s)": 0.130395 }, { "epoch": 0.19393939393939394, "grad_norm": 0.7097320556640625, "learning_rate": 9.896320793787106e-06, "loss": 0.2483672618865967, "memory(GiB)": 35.4, "step": 30, "token_acc": 0.909970182164424, "train_speed(iter/s)": 0.135436 }, { "epoch": 0.22626262626262628, "grad_norm": 0.836563229560852, "learning_rate": 9.859057841617709e-06, "loss": 0.2459421157836914, "memory(GiB)": 35.4, "step": 35, "token_acc": 0.9186036076460559, "train_speed(iter/s)": 0.137299 }, { "epoch": 0.2585858585858586, "grad_norm": 0.8180928230285645, "learning_rate": 9.816178385938867e-06, "loss": 0.24172163009643555, "memory(GiB)": 35.4, "step": 40, "token_acc": 0.9263462681936868, "train_speed(iter/s)": 0.138678 }, { "epoch": 0.2585858585858586, "eval_loss": 0.24495410919189453, "eval_runtime": 4.9047, "eval_samples_per_second": 20.389, "eval_steps_per_second": 5.097, "eval_token_acc": 0.9283167230501966, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 0.7037041187286377, "learning_rate": 9.767731990394638e-06, "loss": 0.23539552688598633, "memory(GiB)": 35.4, "step": 45, "token_acc": 0.923866090712743, "train_speed(iter/s)": 0.133364 }, { "epoch": 0.32323232323232326, "grad_norm": 0.7791242003440857, "learning_rate": 9.71377465336155e-06, "loss": 0.24135751724243165, "memory(GiB)": 35.4, "step": 50, "token_acc": 0.9219366605869673, "train_speed(iter/s)": 0.13559 }, { "epoch": 0.35555555555555557, "grad_norm": 0.8093725442886353, "learning_rate": 9.654368743221022e-06, "loss": 0.22368321418762208, "memory(GiB)": 35.4, "step": 55, "token_acc": 0.9367819177493826, "train_speed(iter/s)": 0.136789 }, { "epoch": 0.3878787878787879, "grad_norm": 0.793940007686615, "learning_rate": 9.589582926268798e-06, "loss": 0.253676700592041, "memory(GiB)": 35.4, "step": 60, "token_acc": 0.9199327545541226, "train_speed(iter/s)": 0.138407 }, { "epoch": 0.3878787878787879, "eval_loss": 0.23592451214790344, "eval_runtime": 4.9128, "eval_samples_per_second": 20.355, "eval_steps_per_second": 5.089, "eval_token_acc": 0.9300722318734571, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.6124956011772156, "learning_rate": 9.519492087344724e-06, "loss": 0.22318036556243898, "memory(GiB)": 35.4, "step": 65, "token_acc": 0.9210595135801547, "train_speed(iter/s)": 0.133844 }, { "epoch": 0.45252525252525255, "grad_norm": 0.7477027773857117, "learning_rate": 9.444177243274619e-06, "loss": 0.2359461307525635, "memory(GiB)": 35.4, "step": 70, "token_acc": 0.9285559703511335, "train_speed(iter/s)": 0.13606 }, { "epoch": 0.48484848484848486, "grad_norm": 0.6800923347473145, "learning_rate": 9.363725449224281e-06, "loss": 0.2285386562347412, "memory(GiB)": 35.4, "step": 75, "token_acc": 0.9188793644156387, "train_speed(iter/s)": 0.137382 }, { "epoch": 0.5171717171717172, "grad_norm": 0.6734771132469177, "learning_rate": 9.278229698073889e-06, "loss": 0.21797473430633546, "memory(GiB)": 35.4, "step": 80, "token_acc": 0.9257270821968936, "train_speed(iter/s)": 0.137993 }, { "epoch": 0.5171717171717172, "eval_loss": 0.22674760222434998, "eval_runtime": 4.9057, "eval_samples_per_second": 20.384, "eval_steps_per_second": 5.096, "eval_token_acc": 0.9334003840175551, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.7561268210411072, "learning_rate": 9.187788812929074e-06, "loss": 0.21897151470184326, "memory(GiB)": 35.4, "step": 85, "token_acc": 0.9210582145281738, "train_speed(iter/s)": 0.134965 }, { "epoch": 0.5818181818181818, "grad_norm": 0.8261750936508179, "learning_rate": 9.092507332892968e-06, "loss": 0.22996132373809813, "memory(GiB)": 35.4, "step": 90, "token_acc": 0.9239816972180894, "train_speed(iter/s)": 0.135853 }, { "epoch": 0.6141414141414141, "grad_norm": 0.7424522638320923, "learning_rate": 8.992495392231195e-06, "loss": 0.2230750799179077, "memory(GiB)": 35.4, "step": 95, "token_acc": 0.9186349499873918, "train_speed(iter/s)": 0.13676 }, { "epoch": 0.6464646464646465, "grad_norm": 0.7229611873626709, "learning_rate": 8.88786859306952e-06, "loss": 0.22269039154052733, "memory(GiB)": 35.4, "step": 100, "token_acc": 0.9253269004084542, "train_speed(iter/s)": 0.13764 }, { "epoch": 0.6464646464646465, "eval_loss": 0.22388949990272522, "eval_runtime": 4.9194, "eval_samples_per_second": 20.328, "eval_steps_per_second": 5.082, "eval_token_acc": 0.9335466764194935, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 0.7450099587440491, "learning_rate": 8.778747871771293e-06, "loss": 0.2219161033630371, "memory(GiB)": 35.4, "step": 105, "token_acc": 0.9172185430463576, "train_speed(iter/s)": 0.135528 }, { "epoch": 0.7111111111111111, "grad_norm": 0.7880620360374451, "learning_rate": 8.665259359149132e-06, "loss": 0.2219111680984497, "memory(GiB)": 35.4, "step": 110, "token_acc": 0.9298043728423475, "train_speed(iter/s)": 0.136321 }, { "epoch": 0.7434343434343434, "grad_norm": 0.6852765679359436, "learning_rate": 8.547534234672435e-06, "loss": 0.21125171184539795, "memory(GiB)": 35.4, "step": 115, "token_acc": 0.9242475103502293, "train_speed(iter/s)": 0.137112 }, { "epoch": 0.7757575757575758, "grad_norm": 0.7938790917396545, "learning_rate": 8.425708574839221e-06, "loss": 0.20452361106872557, "memory(GiB)": 35.4, "step": 120, "token_acc": 0.9356995325578867, "train_speed(iter/s)": 0.137738 }, { "epoch": 0.7757575757575758, "eval_loss": 0.22211778163909912, "eval_runtime": 4.9147, "eval_samples_per_second": 20.347, "eval_steps_per_second": 5.087, "eval_token_acc": 0.9335649629697358, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.8188007473945618, "learning_rate": 8.299923195887599e-06, "loss": 0.2093752384185791, "memory(GiB)": 35.4, "step": 125, "token_acc": 0.9252643368325139, "train_speed(iter/s)": 0.135892 }, { "epoch": 0.8404040404040404, "grad_norm": 0.7376716732978821, "learning_rate": 8.170323491028625e-06, "loss": 0.22786922454833985, "memory(GiB)": 35.4, "step": 130, "token_acc": 0.9265276699567484, "train_speed(iter/s)": 0.136311 }, { "epoch": 0.8727272727272727, "grad_norm": 0.7271324396133423, "learning_rate": 8.03705926238874e-06, "loss": 0.22320261001586914, "memory(GiB)": 35.4, "step": 135, "token_acc": 0.9232753388630386, "train_speed(iter/s)": 0.136955 }, { "epoch": 0.9050505050505051, "grad_norm": 0.6261648535728455, "learning_rate": 7.900284547855992e-06, "loss": 0.20459423065185547, "memory(GiB)": 35.4, "step": 140, "token_acc": 0.9249030499947595, "train_speed(iter/s)": 0.137413 }, { "epoch": 0.9050505050505051, "eval_loss": 0.2182023972272873, "eval_runtime": 4.9305, "eval_samples_per_second": 20.282, "eval_steps_per_second": 5.07, "eval_token_acc": 0.9343512846301545, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.7701309323310852, "learning_rate": 7.760157443030234e-06, "loss": 0.21836166381835936, "memory(GiB)": 35.4, "step": 145, "token_acc": 0.9190240230822304, "train_speed(iter/s)": 0.13576 }, { "epoch": 0.9696969696969697, "grad_norm": 0.7041613459587097, "learning_rate": 7.616839918483061e-06, "loss": 0.20981380939483643, "memory(GiB)": 35.4, "step": 150, "token_acc": 0.936579955636217, "train_speed(iter/s)": 0.136283 }, { "epoch": 1.0, "grad_norm": 0.7106350660324097, "learning_rate": 7.470497632538743e-06, "loss": 0.21157641410827638, "memory(GiB)": 35.4, "step": 155, "token_acc": 0.9277830690795003, "train_speed(iter/s)": 0.13695 }, { "epoch": 1.0323232323232323, "grad_norm": 0.6274145245552063, "learning_rate": 7.321299739792553e-06, "loss": 0.15708084106445314, "memory(GiB)": 35.4, "step": 160, "token_acc": 0.9426217376288915, "train_speed(iter/s)": 0.137434 }, { "epoch": 1.0323232323232323, "eval_loss": 0.21975626051425934, "eval_runtime": 4.9158, "eval_samples_per_second": 20.343, "eval_steps_per_second": 5.086, "eval_token_acc": 0.9343878577306391, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 0.774198591709137, "learning_rate": 7.169418695587791e-06, "loss": 0.15303026437759398, "memory(GiB)": 35.4, "step": 165, "token_acc": 0.9383086204390173, "train_speed(iter/s)": 0.136042 }, { "epoch": 1.096969696969697, "grad_norm": 0.9729277491569519, "learning_rate": 7.015030056677559e-06, "loss": 0.16362838745117186, "memory(GiB)": 35.4, "step": 170, "token_acc": 0.9423035053342043, "train_speed(iter/s)": 0.136853 }, { "epoch": 1.1292929292929292, "grad_norm": 0.6922506093978882, "learning_rate": 6.858312278301638e-06, "loss": 0.14241609573364258, "memory(GiB)": 35.4, "step": 175, "token_acc": 0.9497638260185003, "train_speed(iter/s)": 0.137229 }, { "epoch": 1.1616161616161615, "grad_norm": 0.7330082058906555, "learning_rate": 6.699446507913083e-06, "loss": 0.14002810716629027, "memory(GiB)": 35.4, "step": 180, "token_acc": 0.9511152364076167, "train_speed(iter/s)": 0.137547 }, { "epoch": 1.1616161616161615, "eval_loss": 0.2188844084739685, "eval_runtime": 4.9164, "eval_samples_per_second": 20.34, "eval_steps_per_second": 5.085, "eval_token_acc": 0.9342232787784585, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.6539108157157898, "learning_rate": 6.53861637579291e-06, "loss": 0.14178130626678467, "memory(GiB)": 35.4, "step": 185, "token_acc": 0.9393139351120201, "train_speed(iter/s)": 0.13623 }, { "epoch": 1.2262626262626264, "grad_norm": 0.6944511532783508, "learning_rate": 6.376007782794926e-06, "loss": 0.14827605485916137, "memory(GiB)": 35.4, "step": 190, "token_acc": 0.953679121068877, "train_speed(iter/s)": 0.136568 }, { "epoch": 1.2585858585858585, "grad_norm": 0.771194577217102, "learning_rate": 6.211808685466063e-06, "loss": 0.15898674726486206, "memory(GiB)": 35.4, "step": 195, "token_acc": 0.943248080364875, "train_speed(iter/s)": 0.137259 }, { "epoch": 1.290909090909091, "grad_norm": 0.6921746134757996, "learning_rate": 6.046208878790543e-06, "loss": 0.15377380847930908, "memory(GiB)": 35.4, "step": 200, "token_acc": 0.9407217751767861, "train_speed(iter/s)": 0.137707 }, { "epoch": 1.290909090909091, "eval_loss": 0.21775808930397034, "eval_runtime": 4.8976, "eval_samples_per_second": 20.418, "eval_steps_per_second": 5.105, "eval_token_acc": 0.93543019109445, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.6446962952613831, "learning_rate": 5.879399776809047e-06, "loss": 0.14692131280899048, "memory(GiB)": 35.4, "step": 205, "token_acc": 0.9425448637471383, "train_speed(iter/s)": 0.136659 }, { "epoch": 1.3555555555555556, "grad_norm": 0.7306222319602966, "learning_rate": 5.711574191366427e-06, "loss": 0.15312260389328003, "memory(GiB)": 35.4, "step": 210, "token_acc": 0.9499927420525476, "train_speed(iter/s)": 0.137029 }, { "epoch": 1.387878787878788, "grad_norm": 0.6448764801025391, "learning_rate": 5.542926109243727e-06, "loss": 0.13940632343292236, "memory(GiB)": 35.4, "step": 215, "token_acc": 0.9516881492881878, "train_speed(iter/s)": 0.137465 }, { "epoch": 1.4202020202020202, "grad_norm": 0.6932432055473328, "learning_rate": 5.373650467932122e-06, "loss": 0.15438802242279054, "memory(GiB)": 35.4, "step": 220, "token_acc": 0.9442491210447012, "train_speed(iter/s)": 0.137794 }, { "epoch": 1.4202020202020202, "eval_loss": 0.21947798132896423, "eval_runtime": 4.9113, "eval_samples_per_second": 20.361, "eval_steps_per_second": 5.09, "eval_token_acc": 0.9346804425345159, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 0.6570079922676086, "learning_rate": 5.2039429303079294e-06, "loss": 0.15822865962982177, "memory(GiB)": 35.4, "step": 225, "token_acc": 0.9367170212395239, "train_speed(iter/s)": 0.136817 }, { "epoch": 1.4848484848484849, "grad_norm": 0.6642510294914246, "learning_rate": 5.033999658469174e-06, "loss": 0.15136797428131105, "memory(GiB)": 35.4, "step": 230, "token_acc": 0.9503748661192432, "train_speed(iter/s)": 0.137201 }, { "epoch": 1.5171717171717172, "grad_norm": 0.7496922016143799, "learning_rate": 4.864017086995112e-06, "loss": 0.14471328258514404, "memory(GiB)": 35.4, "step": 235, "token_acc": 0.9505234475835365, "train_speed(iter/s)": 0.137524 }, { "epoch": 1.5494949494949495, "grad_norm": 0.7266538739204407, "learning_rate": 4.694191695890788e-06, "loss": 0.1494928002357483, "memory(GiB)": 35.4, "step": 240, "token_acc": 0.9422356427892412, "train_speed(iter/s)": 0.137886 }, { "epoch": 1.5494949494949495, "eval_loss": 0.21711412072181702, "eval_runtime": 4.9366, "eval_samples_per_second": 20.257, "eval_steps_per_second": 5.064, "eval_token_acc": 0.9350644600896041, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.7479135394096375, "learning_rate": 4.524719783479088e-06, "loss": 0.1447455883026123, "memory(GiB)": 35.4, "step": 245, "token_acc": 0.939549201495775, "train_speed(iter/s)": 0.136926 }, { "epoch": 1.614141414141414, "grad_norm": 0.749000072479248, "learning_rate": 4.355797239502807e-06, "loss": 0.14387867450714112, "memory(GiB)": 35.4, "step": 250, "token_acc": 0.9542463385992511, "train_speed(iter/s)": 0.136999 }, { "epoch": 1.6464646464646466, "grad_norm": 0.7617182731628418, "learning_rate": 4.187619318698971e-06, "loss": 0.14646867513656617, "memory(GiB)": 35.4, "step": 255, "token_acc": 0.9474541561947315, "train_speed(iter/s)": 0.137388 }, { "epoch": 1.6787878787878787, "grad_norm": 0.6883800029754639, "learning_rate": 4.020380415107167e-06, "loss": 0.1463113784790039, "memory(GiB)": 35.4, "step": 260, "token_acc": 0.9495336172965978, "train_speed(iter/s)": 0.137618 }, { "epoch": 1.6787878787878787, "eval_loss": 0.21431542932987213, "eval_runtime": 4.9045, "eval_samples_per_second": 20.39, "eval_steps_per_second": 5.097, "eval_token_acc": 0.93678339581238, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.6667433977127075, "learning_rate": 3.854273837372724e-06, "loss": 0.1535871744155884, "memory(GiB)": 35.4, "step": 265, "token_acc": 0.9473451928299838, "train_speed(iter/s)": 0.136826 }, { "epoch": 1.7434343434343433, "grad_norm": 0.6924517154693604, "learning_rate": 3.689491585304491e-06, "loss": 0.14744930267333983, "memory(GiB)": 35.4, "step": 270, "token_acc": 0.9449676204236436, "train_speed(iter/s)": 0.137097 }, { "epoch": 1.7757575757575759, "grad_norm": 0.6516327857971191, "learning_rate": 3.526224127945479e-06, "loss": 0.1559753894805908, "memory(GiB)": 35.4, "step": 275, "token_acc": 0.9424614287896532, "train_speed(iter/s)": 0.137539 }, { "epoch": 1.808080808080808, "grad_norm": 0.6028838753700256, "learning_rate": 3.3646601834128924e-06, "loss": 0.1387632369995117, "memory(GiB)": 35.4, "step": 280, "token_acc": 0.94836721764672, "train_speed(iter/s)": 0.137773 }, { "epoch": 1.808080808080808, "eval_loss": 0.2140393853187561, "eval_runtime": 4.9027, "eval_samples_per_second": 20.397, "eval_steps_per_second": 5.099, "eval_token_acc": 0.9365639572094724, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.6998237371444702, "learning_rate": 3.204986500762006e-06, "loss": 0.1372049331665039, "memory(GiB)": 35.4, "step": 285, "token_acc": 0.9367344291458962, "train_speed(iter/s)": 0.136971 }, { "epoch": 1.8727272727272726, "grad_norm": 0.6459011435508728, "learning_rate": 3.0473876441260786e-06, "loss": 0.14626307487487794, "memory(GiB)": 35.4, "step": 290, "token_acc": 0.9516988818583687, "train_speed(iter/s)": 0.137206 }, { "epoch": 1.905050505050505, "grad_norm": 0.7556573748588562, "learning_rate": 2.8920457793817507e-06, "loss": 0.14524000883102417, "memory(GiB)": 35.4, "step": 295, "token_acc": 0.9526069161871997, "train_speed(iter/s)": 0.137397 }, { "epoch": 1.9373737373737374, "grad_norm": 0.7367635369300842, "learning_rate": 2.7391404635865725e-06, "loss": 0.15229568481445313, "memory(GiB)": 35.4, "step": 300, "token_acc": 0.9469880723162711, "train_speed(iter/s)": 0.137634 }, { "epoch": 1.9373737373737374, "eval_loss": 0.21289518475532532, "eval_runtime": 4.928, "eval_samples_per_second": 20.292, "eval_steps_per_second": 5.073, "eval_token_acc": 0.9364908110085033, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.6437100172042847, "learning_rate": 2.5888484374320033e-06, "loss": 0.1357938528060913, "memory(GiB)": 35.4, "step": 305, "token_acc": 0.9442405000496081, "train_speed(iter/s)": 0.136908 }, { "epoch": 2.0, "grad_norm": 0.8284677863121033, "learning_rate": 2.4413434209518137e-06, "loss": 0.15702880620956422, "memory(GiB)": 35.4, "step": 310, "token_acc": 0.9496246977167803, "train_speed(iter/s)": 0.137317 }, { "epoch": 2.0323232323232325, "grad_norm": 0.5962865948677063, "learning_rate": 2.296795912722014e-06, "loss": 0.10595057010650635, "memory(GiB)": 35.4, "step": 315, "token_acc": 0.9668544137877333, "train_speed(iter/s)": 0.137506 }, { "epoch": 2.0646464646464646, "grad_norm": 0.659416675567627, "learning_rate": 2.1553729927843894e-06, "loss": 0.10077614784240722, "memory(GiB)": 35.4, "step": 320, "token_acc": 0.9683417974178226, "train_speed(iter/s)": 0.13772 }, { "epoch": 2.0646464646464646, "eval_loss": 0.22252832353115082, "eval_runtime": 4.9207, "eval_samples_per_second": 20.322, "eval_steps_per_second": 5.081, "eval_token_acc": 0.9355033372954192, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.7271497249603271, "learning_rate": 2.017238129521506e-06, "loss": 0.10290584564208985, "memory(GiB)": 35.4, "step": 325, "token_acc": 0.9533099941945359, "train_speed(iter/s)": 0.137044 }, { "epoch": 2.1292929292929292, "grad_norm": 0.8144314885139465, "learning_rate": 1.8825509907063328e-06, "loss": 0.10785359144210815, "memory(GiB)": 35.4, "step": 330, "token_acc": 0.9629136358979613, "train_speed(iter/s)": 0.137367 }, { "epoch": 2.1616161616161618, "grad_norm": 0.6300661563873291, "learning_rate": 1.7514672589449378e-06, "loss": 0.0975375771522522, "memory(GiB)": 35.4, "step": 335, "token_acc": 0.9635617067708034, "train_speed(iter/s)": 0.137455 }, { "epoch": 2.193939393939394, "grad_norm": 0.7295346856117249, "learning_rate": 1.6241384517255854e-06, "loss": 0.10815587043762206, "memory(GiB)": 35.4, "step": 340, "token_acc": 0.9572717202088938, "train_speed(iter/s)": 0.137684 }, { "epoch": 2.193939393939394, "eval_loss": 0.2339620590209961, "eval_runtime": 4.9038, "eval_samples_per_second": 20.392, "eval_steps_per_second": 5.098, "eval_token_acc": 0.9357044893480845, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.641219437122345, "learning_rate": 1.500711746282192e-06, "loss": 0.10359236001968383, "memory(GiB)": 35.4, "step": 345, "token_acc": 0.9507107042642255, "train_speed(iter/s)": 0.137115 }, { "epoch": 2.2585858585858585, "grad_norm": 0.671101450920105, "learning_rate": 1.3813298094746491e-06, "loss": 0.10023324489593506, "memory(GiB)": 35.4, "step": 350, "token_acc": 0.9665440259416314, "train_speed(iter/s)": 0.137251 }, { "epoch": 2.290909090909091, "grad_norm": 0.6462791562080383, "learning_rate": 1.2661306328825818e-06, "loss": 0.10422945022583008, "memory(GiB)": 35.4, "step": 355, "token_acc": 0.9596824570536179, "train_speed(iter/s)": 0.137541 }, { "epoch": 2.323232323232323, "grad_norm": 0.6095359921455383, "learning_rate": 1.1552473733031893e-06, "loss": 0.10793395042419433, "memory(GiB)": 35.4, "step": 360, "token_acc": 0.9589668637246134, "train_speed(iter/s)": 0.137864 }, { "epoch": 2.323232323232323, "eval_loss": 0.2318117916584015, "eval_runtime": 4.9037, "eval_samples_per_second": 20.393, "eval_steps_per_second": 5.098, "eval_token_acc": 0.9356496296973575, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.6510639786720276, "learning_rate": 1.0488081988375493e-06, "loss": 0.10193748474121093, "memory(GiB)": 35.4, "step": 365, "token_acc": 0.9518641538926091, "train_speed(iter/s)": 0.137469 }, { "epoch": 2.3878787878787877, "grad_norm": 0.6623317003250122, "learning_rate": 9.469361407432431e-07, "loss": 0.10095088481903076, "memory(GiB)": 35.4, "step": 370, "token_acc": 0.9654876015836633, "train_speed(iter/s)": 0.137592 }, { "epoch": 2.4202020202020202, "grad_norm": 4.409001350402832, "learning_rate": 8.497489512245971e-07, "loss": 0.11350960731506347, "memory(GiB)": 35.4, "step": 375, "token_acc": 0.9622961334066337, "train_speed(iter/s)": 0.137743 }, { "epoch": 2.4525252525252528, "grad_norm": 0.712914228439331, "learning_rate": 7.573589673248833e-07, "loss": 0.10257253646850586, "memory(GiB)": 35.4, "step": 380, "token_acc": 0.9649572411487786, "train_speed(iter/s)": 0.137953 }, { "epoch": 2.4525252525252528, "eval_loss": 0.2319175750017166, "eval_runtime": 4.9058, "eval_samples_per_second": 20.384, "eval_steps_per_second": 5.096, "eval_token_acc": 0.935923927950992, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.5886191129684448, "learning_rate": 6.698729810778065e-07, "loss": 0.09679580926895141, "memory(GiB)": 35.4, "step": 385, "token_acc": 0.9530538972153801, "train_speed(iter/s)": 0.137382 }, { "epoch": 2.517171717171717, "grad_norm": 0.648043692111969, "learning_rate": 5.873921160683943e-07, "loss": 0.10484771728515625, "memory(GiB)": 35.4, "step": 390, "token_acc": 0.9592218765019858, "train_speed(iter/s)": 0.137666 }, { "epoch": 2.5494949494949495, "grad_norm": 0.6488398909568787, "learning_rate": 5.100117105459279e-07, "loss": 0.10683284997940064, "memory(GiB)": 35.4, "step": 395, "token_acc": 0.9655297765422859, "train_speed(iter/s)": 0.137785 }, { "epoch": 2.581818181818182, "grad_norm": 0.6164813041687012, "learning_rate": 4.3782120722406565e-07, "loss": 0.09545568823814392, "memory(GiB)": 35.4, "step": 400, "token_acc": 0.964151079309434, "train_speed(iter/s)": 0.138017 }, { "epoch": 2.581818181818182, "eval_loss": 0.2318730354309082, "eval_runtime": 4.9271, "eval_samples_per_second": 20.296, "eval_steps_per_second": 5.074, "eval_token_acc": 0.935795922099296, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.6586740612983704, "learning_rate": 3.709040498955102e-07, "loss": 0.09898868799209595, "memory(GiB)": 35.4, "step": 405, "token_acc": 0.9504973446851556, "train_speed(iter/s)": 0.137464 }, { "epoch": 2.6464646464646466, "grad_norm": 0.7148135304450989, "learning_rate": 3.0933758698072023e-07, "loss": 0.10967177152633667, "memory(GiB)": 35.4, "step": 410, "token_acc": 0.9612623965810093, "train_speed(iter/s)": 0.137658 }, { "epoch": 2.6787878787878787, "grad_norm": 0.694380521774292, "learning_rate": 2.531929821221768e-07, "loss": 0.11133409738540649, "memory(GiB)": 35.4, "step": 415, "token_acc": 0.9621429914828448, "train_speed(iter/s)": 0.137903 }, { "epoch": 2.7111111111111112, "grad_norm": 0.6905390620231628, "learning_rate": 2.0253513192751374e-07, "loss": 0.09970238208770751, "memory(GiB)": 35.4, "step": 420, "token_acc": 0.9661263829919112, "train_speed(iter/s)": 0.138101 }, { "epoch": 2.7111111111111112, "eval_loss": 0.23147569596767426, "eval_runtime": 4.9087, "eval_samples_per_second": 20.372, "eval_steps_per_second": 5.093, "eval_token_acc": 0.9359605010514767, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.7609266042709351, "learning_rate": 1.5742259095662126e-07, "loss": 0.10753922462463379, "memory(GiB)": 35.4, "step": 425, "token_acc": 0.9499499081649692, "train_speed(iter/s)": 0.13755 }, { "epoch": 2.775757575757576, "grad_norm": 0.6554747223854065, "learning_rate": 1.1790750403941231e-07, "loss": 0.10583784580230712, "memory(GiB)": 35.4, "step": 430, "token_acc": 0.9655550844287033, "train_speed(iter/s)": 0.137741 }, { "epoch": 2.808080808080808, "grad_norm": 0.6224953532218933, "learning_rate": 8.403554600248498e-08, "loss": 0.09819064140319825, "memory(GiB)": 35.4, "step": 435, "token_acc": 0.9677975086138352, "train_speed(iter/s)": 0.137915 }, { "epoch": 2.8404040404040405, "grad_norm": 0.6320119500160217, "learning_rate": 5.584586887435739e-08, "loss": 0.10492353439331055, "memory(GiB)": 35.4, "step": 440, "token_acc": 0.964147038991839, "train_speed(iter/s)": 0.138113 }, { "epoch": 2.8404040404040405, "eval_loss": 0.23168671131134033, "eval_runtime": 4.9097, "eval_samples_per_second": 20.368, "eval_steps_per_second": 5.092, "eval_token_acc": 0.9358507817500229, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.6035221219062805, "learning_rate": 3.337105663029361e-08, "loss": 0.09671036005020142, "memory(GiB)": 35.4, "step": 445, "token_acc": 0.950904653367489, "train_speed(iter/s)": 0.137622 }, { "epoch": 2.905050505050505, "grad_norm": 0.640915036201477, "learning_rate": 1.6637087529033925e-08, "loss": 0.10660991668701172, "memory(GiB)": 35.4, "step": 450, "token_acc": 0.9602548048129798, "train_speed(iter/s)": 0.137845 }, { "epoch": 2.937373737373737, "grad_norm": 0.6130684614181519, "learning_rate": 5.6633040849601865e-09, "loss": 0.09656901955604554, "memory(GiB)": 35.4, "step": 455, "token_acc": 0.9638375350140056, "train_speed(iter/s)": 0.137992 }, { "epoch": 2.9696969696969697, "grad_norm": 0.6519441604614258, "learning_rate": 4.623907104084335e-10, "loss": 0.1028173565864563, "memory(GiB)": 35.4, "step": 460, "token_acc": 0.9615221849669585, "train_speed(iter/s)": 0.138235 }, { "epoch": 2.9696969696969697, "eval_loss": 0.23168207705020905, "eval_runtime": 4.9089, "eval_samples_per_second": 20.371, "eval_steps_per_second": 5.093, "eval_token_acc": 0.9358690683002652, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.23158639669418335, "eval_runtime": 4.9243, "eval_samples_per_second": 20.307, "eval_steps_per_second": 5.077, "eval_token_acc": 0.935923927950992, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.7147979377043046e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }