{ "best_global_step": 300, "best_metric": 0.67163587, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v21-20250507-064807/checkpoint-300", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 5.7837233543396, "learning_rate": 9.999884400986087e-06, "loss": 1.0828938484191895, "memory(GiB)": 27.73, "step": 1, "token_acc": 0.7079992873686086, "train_speed(iter/s)": 0.069407 }, { "epoch": 0.03232323232323232, "grad_norm": 2.584890842437744, "learning_rate": 9.997110291906109e-06, "loss": 0.8132728338241577, "memory(GiB)": 27.77, "step": 5, "token_acc": 0.7766760462727232, "train_speed(iter/s)": 0.127059 }, { "epoch": 0.06464646464646465, "grad_norm": 1.4381581544876099, "learning_rate": 9.988444507789584e-06, "loss": 0.7008798599243165, "memory(GiB)": 27.77, "step": 10, "token_acc": 0.8064654179148698, "train_speed(iter/s)": 0.144995 }, { "epoch": 0.09696969696969697, "grad_norm": 1.4875842332839966, "learning_rate": 9.97401266428502e-06, "loss": 0.6918133735656739, "memory(GiB)": 27.77, "step": 15, "token_acc": 0.7917820548324563, "train_speed(iter/s)": 0.148104 }, { "epoch": 0.1292929292929293, "grad_norm": 1.2409731149673462, "learning_rate": 9.953831442918418e-06, "loss": 0.6582849025726318, "memory(GiB)": 27.77, "step": 20, "token_acc": 0.8092561024264626, "train_speed(iter/s)": 0.1504 }, { "epoch": 0.1292929292929293, "eval_loss": 0.722944438457489, "eval_runtime": 4.5169, "eval_samples_per_second": 22.139, "eval_steps_per_second": 5.535, "eval_token_acc": 0.8038043327122556, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 1.244113802909851, "learning_rate": 9.927924170825266e-06, "loss": 0.6381969451904297, "memory(GiB)": 27.77, "step": 25, "token_acc": 0.8135619641465316, "train_speed(iter/s)": 0.138906 }, { "epoch": 0.19393939393939394, "grad_norm": 1.034762978553772, "learning_rate": 9.896320793787106e-06, "loss": 0.6519847869873047, "memory(GiB)": 27.77, "step": 30, "token_acc": 0.8079350766456267, "train_speed(iter/s)": 0.144048 }, { "epoch": 0.22626262626262628, "grad_norm": 1.1062158346176147, "learning_rate": 9.859057841617709e-06, "loss": 0.6522578716278076, "memory(GiB)": 27.77, "step": 35, "token_acc": 0.7871428029296801, "train_speed(iter/s)": 0.146527 }, { "epoch": 0.2585858585858586, "grad_norm": 1.048275113105774, "learning_rate": 9.816178385938867e-06, "loss": 0.6380832672119141, "memory(GiB)": 30.0, "step": 40, "token_acc": 0.8310100032268474, "train_speed(iter/s)": 0.147965 }, { "epoch": 0.2585858585858586, "eval_loss": 0.7094467878341675, "eval_runtime": 4.5026, "eval_samples_per_second": 22.209, "eval_steps_per_second": 5.552, "eval_token_acc": 0.8064740398787508, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 1.228413462638855, "learning_rate": 9.767731990394638e-06, "loss": 0.6573184967041016, "memory(GiB)": 30.0, "step": 45, "token_acc": 0.793196216263126, "train_speed(iter/s)": 0.142174 }, { "epoch": 0.32323232323232326, "grad_norm": 1.2188494205474854, "learning_rate": 9.71377465336155e-06, "loss": 0.6832234382629394, "memory(GiB)": 30.0, "step": 50, "token_acc": 0.7763722873389811, "train_speed(iter/s)": 0.14464 }, { "epoch": 0.35555555555555557, "grad_norm": 1.0591987371444702, "learning_rate": 9.654368743221022e-06, "loss": 0.653898048400879, "memory(GiB)": 30.0, "step": 55, "token_acc": 0.8127191799298624, "train_speed(iter/s)": 0.146174 }, { "epoch": 0.3878787878787879, "grad_norm": 1.0762509107589722, "learning_rate": 9.589582926268798e-06, "loss": 0.612049913406372, "memory(GiB)": 30.0, "step": 60, "token_acc": 0.8167217591261857, "train_speed(iter/s)": 0.147559 }, { "epoch": 0.3878787878787879, "eval_loss": 0.6970872282981873, "eval_runtime": 4.4596, "eval_samples_per_second": 22.423, "eval_steps_per_second": 5.606, "eval_token_acc": 0.8075586084151395, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 1.0156168937683105, "learning_rate": 9.519492087344724e-06, "loss": 0.6183786392211914, "memory(GiB)": 30.0, "step": 65, "token_acc": 0.82515202980332, "train_speed(iter/s)": 0.142388 }, { "epoch": 0.45252525252525255, "grad_norm": 1.1835277080535889, "learning_rate": 9.444177243274619e-06, "loss": 0.6588045597076416, "memory(GiB)": 30.0, "step": 70, "token_acc": 0.8151584404952757, "train_speed(iter/s)": 0.144193 }, { "epoch": 0.48484848484848486, "grad_norm": 1.1693778038024902, "learning_rate": 9.363725449224281e-06, "loss": 0.6500480651855469, "memory(GiB)": 30.0, "step": 75, "token_acc": 0.8076034754555119, "train_speed(iter/s)": 0.145372 }, { "epoch": 0.5171717171717172, "grad_norm": 1.0537551641464233, "learning_rate": 9.278229698073889e-06, "loss": 0.6718258380889892, "memory(GiB)": 30.0, "step": 80, "token_acc": 0.7944381259859853, "train_speed(iter/s)": 0.14633 }, { "epoch": 0.5171717171717172, "eval_loss": 0.6917301416397095, "eval_runtime": 4.4847, "eval_samples_per_second": 22.298, "eval_steps_per_second": 5.575, "eval_token_acc": 0.8087266053004811, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 1.139643669128418, "learning_rate": 9.187788812929074e-06, "loss": 0.6676198005676269, "memory(GiB)": 30.0, "step": 85, "token_acc": 0.8008274744669155, "train_speed(iter/s)": 0.143323 }, { "epoch": 0.5818181818181818, "grad_norm": 1.1069749593734741, "learning_rate": 9.092507332892968e-06, "loss": 0.6722775459289551, "memory(GiB)": 30.0, "step": 90, "token_acc": 0.8005652779928759, "train_speed(iter/s)": 0.144592 }, { "epoch": 0.6141414141414141, "grad_norm": 1.1481822729110718, "learning_rate": 8.992495392231195e-06, "loss": 0.6242180824279785, "memory(GiB)": 30.0, "step": 95, "token_acc": 0.8088621855050695, "train_speed(iter/s)": 0.145728 }, { "epoch": 0.6464646464646465, "grad_norm": 1.0265405178070068, "learning_rate": 8.88786859306952e-06, "loss": 0.6294228076934815, "memory(GiB)": 30.0, "step": 100, "token_acc": 0.797233893557423, "train_speed(iter/s)": 0.146555 }, { "epoch": 0.6464646464646465, "eval_loss": 0.6827206611633301, "eval_runtime": 4.4877, "eval_samples_per_second": 22.283, "eval_steps_per_second": 5.571, "eval_token_acc": 0.8118412636613921, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 1.1638389825820923, "learning_rate": 8.778747871771293e-06, "loss": 0.6758254051208497, "memory(GiB)": 30.0, "step": 105, "token_acc": 0.8088623640012675, "train_speed(iter/s)": 0.144045 }, { "epoch": 0.7111111111111111, "grad_norm": 1.2460920810699463, "learning_rate": 8.665259359149132e-06, "loss": 0.6744856834411621, "memory(GiB)": 30.0, "step": 110, "token_acc": 0.8033372194695424, "train_speed(iter/s)": 0.145146 }, { "epoch": 0.7434343434343434, "grad_norm": 1.1937848329544067, "learning_rate": 8.547534234672435e-06, "loss": 0.6776030540466309, "memory(GiB)": 30.0, "step": 115, "token_acc": 0.7960356428441535, "train_speed(iter/s)": 0.145963 }, { "epoch": 0.7757575757575758, "grad_norm": 1.2813752889633179, "learning_rate": 8.425708574839221e-06, "loss": 0.6523926734924317, "memory(GiB)": 30.0, "step": 120, "token_acc": 0.8113255093959248, "train_speed(iter/s)": 0.146806 }, { "epoch": 0.7757575757575758, "eval_loss": 0.6801063418388367, "eval_runtime": 4.5179, "eval_samples_per_second": 22.134, "eval_steps_per_second": 5.533, "eval_token_acc": 0.8103673628298896, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 1.2211530208587646, "learning_rate": 8.299923195887599e-06, "loss": 0.6863309383392334, "memory(GiB)": 30.0, "step": 125, "token_acc": 0.7986384909941853, "train_speed(iter/s)": 0.144918 }, { "epoch": 0.8404040404040404, "grad_norm": 1.0644088983535767, "learning_rate": 8.170323491028625e-06, "loss": 0.6193663597106933, "memory(GiB)": 30.0, "step": 130, "token_acc": 0.8070464504820333, "train_speed(iter/s)": 0.145439 }, { "epoch": 0.8727272727272727, "grad_norm": 1.0717216730117798, "learning_rate": 8.03705926238874e-06, "loss": 0.6962187767028809, "memory(GiB)": 30.0, "step": 135, "token_acc": 0.7882992561955905, "train_speed(iter/s)": 0.146161 }, { "epoch": 0.9050505050505051, "grad_norm": 1.0318809747695923, "learning_rate": 7.900284547855992e-06, "loss": 0.6141955375671386, "memory(GiB)": 30.0, "step": 140, "token_acc": 0.8250999478532939, "train_speed(iter/s)": 0.146718 }, { "epoch": 0.9050505050505051, "eval_loss": 0.6769556403160095, "eval_runtime": 4.5137, "eval_samples_per_second": 22.155, "eval_steps_per_second": 5.539, "eval_token_acc": 0.812314024305459, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 1.1931071281433105, "learning_rate": 7.760157443030234e-06, "loss": 0.6433291435241699, "memory(GiB)": 30.0, "step": 145, "token_acc": 0.8096512634810674, "train_speed(iter/s)": 0.145106 }, { "epoch": 0.9696969696969697, "grad_norm": 0.9276081323623657, "learning_rate": 7.616839918483061e-06, "loss": 0.6246243000030518, "memory(GiB)": 30.0, "step": 150, "token_acc": 0.8373285914577848, "train_speed(iter/s)": 0.145752 }, { "epoch": 1.0, "grad_norm": 1.0196563005447388, "learning_rate": 7.470497632538743e-06, "loss": 0.6316683769226075, "memory(GiB)": 30.0, "step": 155, "token_acc": 0.8189631162217006, "train_speed(iter/s)": 0.14654 }, { "epoch": 1.0323232323232323, "grad_norm": 1.0861464738845825, "learning_rate": 7.321299739792553e-06, "loss": 0.574582290649414, "memory(GiB)": 30.0, "step": 160, "token_acc": 0.822840260798696, "train_speed(iter/s)": 0.147158 }, { "epoch": 1.0323232323232323, "eval_loss": 0.6779691576957703, "eval_runtime": 4.5159, "eval_samples_per_second": 22.144, "eval_steps_per_second": 5.536, "eval_token_acc": 0.8116465975138352, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 1.1801550388336182, "learning_rate": 7.169418695587791e-06, "loss": 0.5374558448791504, "memory(GiB)": 30.0, "step": 165, "token_acc": 0.8396995365190986, "train_speed(iter/s)": 0.145626 }, { "epoch": 1.096969696969697, "grad_norm": 1.0841394662857056, "learning_rate": 7.015030056677559e-06, "loss": 0.5700150489807129, "memory(GiB)": 30.0, "step": 170, "token_acc": 0.8313138512710858, "train_speed(iter/s)": 0.146379 }, { "epoch": 1.1292929292929292, "grad_norm": 1.0907129049301147, "learning_rate": 6.858312278301638e-06, "loss": 0.530540657043457, "memory(GiB)": 30.0, "step": 175, "token_acc": 0.8248816768086545, "train_speed(iter/s)": 0.14678 }, { "epoch": 1.1616161616161615, "grad_norm": 1.0117602348327637, "learning_rate": 6.699446507913083e-06, "loss": 0.5229566097259521, "memory(GiB)": 30.0, "step": 180, "token_acc": 0.8368724855693546, "train_speed(iter/s)": 0.147192 }, { "epoch": 1.1616161616161615, "eval_loss": 0.6804619431495667, "eval_runtime": 4.4977, "eval_samples_per_second": 22.233, "eval_steps_per_second": 5.558, "eval_token_acc": 0.8114797408159292, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 1.050369381904602, "learning_rate": 6.53861637579291e-06, "loss": 0.548884916305542, "memory(GiB)": 30.0, "step": 185, "token_acc": 0.8174426020408163, "train_speed(iter/s)": 0.1459 }, { "epoch": 1.2262626262626264, "grad_norm": 1.1832817792892456, "learning_rate": 6.376007782794926e-06, "loss": 0.5427236557006836, "memory(GiB)": 30.0, "step": 190, "token_acc": 0.8312639081497726, "train_speed(iter/s)": 0.146405 }, { "epoch": 1.2585858585858585, "grad_norm": 1.0814838409423828, "learning_rate": 6.211808685466063e-06, "loss": 0.5683661937713623, "memory(GiB)": 30.0, "step": 195, "token_acc": 0.8363592434074278, "train_speed(iter/s)": 0.147077 }, { "epoch": 1.290909090909091, "grad_norm": 0.9832177758216858, "learning_rate": 6.046208878790543e-06, "loss": 0.5114535808563232, "memory(GiB)": 30.0, "step": 200, "token_acc": 0.8398058252427184, "train_speed(iter/s)": 0.14749 }, { "epoch": 1.290909090909091, "eval_loss": 0.6795041561126709, "eval_runtime": 4.5398, "eval_samples_per_second": 22.027, "eval_steps_per_second": 5.507, "eval_token_acc": 0.812147167607553, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 1.0313045978546143, "learning_rate": 5.879399776809047e-06, "loss": 0.5399807453155517, "memory(GiB)": 30.0, "step": 205, "token_acc": 0.8344659940404622, "train_speed(iter/s)": 0.14634 }, { "epoch": 1.3555555555555556, "grad_norm": 1.0118134021759033, "learning_rate": 5.711574191366427e-06, "loss": 0.5527422904968262, "memory(GiB)": 30.0, "step": 210, "token_acc": 0.8313429020123443, "train_speed(iter/s)": 0.146686 }, { "epoch": 1.387878787878788, "grad_norm": 1.0724433660507202, "learning_rate": 5.542926109243727e-06, "loss": 0.5539234161376954, "memory(GiB)": 30.0, "step": 215, "token_acc": 0.8117005197773436, "train_speed(iter/s)": 0.147158 }, { "epoch": 1.4202020202020202, "grad_norm": 1.086374282836914, "learning_rate": 5.373650467932122e-06, "loss": 0.5297107219696044, "memory(GiB)": 30.0, "step": 220, "token_acc": 0.8333011097792042, "train_speed(iter/s)": 0.147561 }, { "epoch": 1.4202020202020202, "eval_loss": 0.6780869960784912, "eval_runtime": 4.4783, "eval_samples_per_second": 22.33, "eval_steps_per_second": 5.582, "eval_token_acc": 0.8137323062376596, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 1.1475560665130615, "learning_rate": 5.2039429303079294e-06, "loss": 0.5562318801879883, "memory(GiB)": 30.0, "step": 225, "token_acc": 0.820589226025445, "train_speed(iter/s)": 0.146647 }, { "epoch": 1.4848484848484849, "grad_norm": 1.0739235877990723, "learning_rate": 5.033999658469174e-06, "loss": 0.5656192779541016, "memory(GiB)": 30.0, "step": 230, "token_acc": 0.8434569629111267, "train_speed(iter/s)": 0.147011 }, { "epoch": 1.5171717171717172, "grad_norm": 1.153382420539856, "learning_rate": 4.864017086995112e-06, "loss": 0.5392692565917969, "memory(GiB)": 30.0, "step": 235, "token_acc": 0.8344013490725126, "train_speed(iter/s)": 0.147492 }, { "epoch": 1.5494949494949495, "grad_norm": 1.0742793083190918, "learning_rate": 4.694191695890788e-06, "loss": 0.5323070049285888, "memory(GiB)": 30.0, "step": 240, "token_acc": 0.8339429680501642, "train_speed(iter/s)": 0.147851 }, { "epoch": 1.5494949494949495, "eval_loss": 0.6787331104278564, "eval_runtime": 4.4769, "eval_samples_per_second": 22.337, "eval_steps_per_second": 5.584, "eval_token_acc": 0.8122027865068551, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 1.012741208076477, "learning_rate": 4.524719783479088e-06, "loss": 0.5413528919219971, "memory(GiB)": 30.0, "step": 245, "token_acc": 0.8438339287914254, "train_speed(iter/s)": 0.14685 }, { "epoch": 1.614141414141414, "grad_norm": 1.0688731670379639, "learning_rate": 4.355797239502807e-06, "loss": 0.5247974395751953, "memory(GiB)": 30.0, "step": 250, "token_acc": 0.8370563375806298, "train_speed(iter/s)": 0.147062 }, { "epoch": 1.6464646464646466, "grad_norm": 1.195318341255188, "learning_rate": 4.187619318698971e-06, "loss": 0.5625959873199463, "memory(GiB)": 30.0, "step": 255, "token_acc": 0.8187680020947892, "train_speed(iter/s)": 0.147527 }, { "epoch": 1.6787878787878787, "grad_norm": 1.0666542053222656, "learning_rate": 4.020380415107167e-06, "loss": 0.5226840972900391, "memory(GiB)": 30.0, "step": 260, "token_acc": 0.8682154605263158, "train_speed(iter/s)": 0.147705 }, { "epoch": 1.6787878787878787, "eval_loss": 0.6767453551292419, "eval_runtime": 4.4986, "eval_samples_per_second": 22.229, "eval_steps_per_second": 5.557, "eval_token_acc": 0.8135098306404516, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 1.1024754047393799, "learning_rate": 3.854273837372724e-06, "loss": 0.5334303379058838, "memory(GiB)": 30.0, "step": 265, "token_acc": 0.8305059560662721, "train_speed(iter/s)": 0.146899 }, { "epoch": 1.7434343434343433, "grad_norm": 1.0549464225769043, "learning_rate": 3.689491585304491e-06, "loss": 0.5367157936096192, "memory(GiB)": 30.0, "step": 270, "token_acc": 0.8195051514205433, "train_speed(iter/s)": 0.14719 }, { "epoch": 1.7757575757575759, "grad_norm": 1.1271840333938599, "learning_rate": 3.526224127945479e-06, "loss": 0.5592126369476318, "memory(GiB)": 30.0, "step": 275, "token_acc": 0.8336587028601531, "train_speed(iter/s)": 0.147572 }, { "epoch": 1.808080808080808, "grad_norm": 1.0035786628723145, "learning_rate": 3.3646601834128924e-06, "loss": 0.5336441040039063, "memory(GiB)": 30.0, "step": 280, "token_acc": 0.8376280205561832, "train_speed(iter/s)": 0.147801 }, { "epoch": 1.808080808080808, "eval_loss": 0.675382137298584, "eval_runtime": 4.4852, "eval_samples_per_second": 22.296, "eval_steps_per_second": 5.574, "eval_token_acc": 0.8133151644928946, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 1.0249357223510742, "learning_rate": 3.204986500762006e-06, "loss": 0.5767297267913818, "memory(GiB)": 30.0, "step": 285, "token_acc": 0.8033832987162484, "train_speed(iter/s)": 0.146928 }, { "epoch": 1.8727272727272726, "grad_norm": 0.9631988406181335, "learning_rate": 3.0473876441260786e-06, "loss": 0.5411409854888916, "memory(GiB)": 30.0, "step": 290, "token_acc": 0.8262143620505396, "train_speed(iter/s)": 0.147203 }, { "epoch": 1.905050505050505, "grad_norm": 1.028135895729065, "learning_rate": 2.8920457793817507e-06, "loss": 0.5459909439086914, "memory(GiB)": 30.0, "step": 295, "token_acc": 0.8455647944260032, "train_speed(iter/s)": 0.147451 }, { "epoch": 1.9373737373737374, "grad_norm": 1.0845921039581299, "learning_rate": 2.7391404635865725e-06, "loss": 0.5368780612945556, "memory(GiB)": 30.0, "step": 300, "token_acc": 0.8320271783191137, "train_speed(iter/s)": 0.147671 }, { "epoch": 1.9373737373737374, "eval_loss": 0.6716358661651611, "eval_runtime": 4.5215, "eval_samples_per_second": 22.116, "eval_steps_per_second": 5.529, "eval_token_acc": 0.8139547818348675, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.9843314290046692, "learning_rate": 2.5888484374320033e-06, "loss": 0.5163120269775391, "memory(GiB)": 30.0, "step": 305, "token_acc": 0.8328818151032849, "train_speed(iter/s)": 0.146859 }, { "epoch": 2.0, "grad_norm": 1.1701740026474, "learning_rate": 2.4413434209518137e-06, "loss": 0.5329459190368653, "memory(GiB)": 30.0, "step": 310, "token_acc": 0.8684262230663435, "train_speed(iter/s)": 0.147224 }, { "epoch": 2.0323232323232325, "grad_norm": 0.9886131286621094, "learning_rate": 2.296795912722014e-06, "loss": 0.47854862213134763, "memory(GiB)": 30.0, "step": 315, "token_acc": 0.8571065805702677, "train_speed(iter/s)": 0.147399 }, { "epoch": 2.0646464646464646, "grad_norm": 1.1649657487869263, "learning_rate": 2.1553729927843894e-06, "loss": 0.46472911834716796, "memory(GiB)": 30.0, "step": 320, "token_acc": 0.8510946618102064, "train_speed(iter/s)": 0.147612 }, { "epoch": 2.0646464646464646, "eval_loss": 0.683362603187561, "eval_runtime": 4.5092, "eval_samples_per_second": 22.177, "eval_steps_per_second": 5.544, "eval_token_acc": 0.812425262104063, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 1.0194660425186157, "learning_rate": 2.017238129521506e-06, "loss": 0.4674004077911377, "memory(GiB)": 30.0, "step": 325, "token_acc": 0.8521572339577521, "train_speed(iter/s)": 0.146897 }, { "epoch": 2.1292929292929292, "grad_norm": 1.0168867111206055, "learning_rate": 1.8825509907063328e-06, "loss": 0.4946479320526123, "memory(GiB)": 30.0, "step": 330, "token_acc": 0.846690244227946, "train_speed(iter/s)": 0.147164 }, { "epoch": 2.1616161616161618, "grad_norm": 1.0326777696609497, "learning_rate": 1.7514672589449378e-06, "loss": 0.48072013854980467, "memory(GiB)": 30.0, "step": 335, "token_acc": 0.8457147012835897, "train_speed(iter/s)": 0.147447 }, { "epoch": 2.193939393939394, "grad_norm": 0.9925091862678528, "learning_rate": 1.6241384517255854e-06, "loss": 0.4736426830291748, "memory(GiB)": 30.0, "step": 340, "token_acc": 0.8668122952098611, "train_speed(iter/s)": 0.147682 }, { "epoch": 2.193939393939394, "eval_loss": 0.687374472618103, "eval_runtime": 4.4755, "eval_samples_per_second": 22.344, "eval_steps_per_second": 5.586, "eval_token_acc": 0.8125643093523179, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 1.008774995803833, "learning_rate": 1.500711746282192e-06, "loss": 0.5085729598999024, "memory(GiB)": 30.0, "step": 345, "token_acc": 0.8404898047254289, "train_speed(iter/s)": 0.147118 }, { "epoch": 2.2585858585858585, "grad_norm": 0.9545453786849976, "learning_rate": 1.3813298094746491e-06, "loss": 0.49334096908569336, "memory(GiB)": 30.0, "step": 350, "token_acc": 0.8572497735061252, "train_speed(iter/s)": 0.147268 }, { "epoch": 2.290909090909091, "grad_norm": 1.0316177606582642, "learning_rate": 1.2661306328825818e-06, "loss": 0.48065829277038574, "memory(GiB)": 30.0, "step": 355, "token_acc": 0.8579491647410887, "train_speed(iter/s)": 0.147552 }, { "epoch": 2.323232323232323, "grad_norm": 0.9947900772094727, "learning_rate": 1.1552473733031893e-06, "loss": 0.4793752670288086, "memory(GiB)": 30.0, "step": 360, "token_acc": 0.8398426718189346, "train_speed(iter/s)": 0.147791 }, { "epoch": 2.323232323232323, "eval_loss": 0.6864572167396545, "eval_runtime": 4.5103, "eval_samples_per_second": 22.171, "eval_steps_per_second": 5.543, "eval_token_acc": 0.8118968825606941, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.9976386427879333, "learning_rate": 1.0488081988375493e-06, "loss": 0.46926078796386717, "memory(GiB)": 30.0, "step": 365, "token_acc": 0.8339437138994715, "train_speed(iter/s)": 0.147274 }, { "epoch": 2.3878787878787877, "grad_norm": 0.9950555562973022, "learning_rate": 9.469361407432431e-07, "loss": 0.46298680305480955, "memory(GiB)": 30.0, "step": 370, "token_acc": 0.8648511440693332, "train_speed(iter/s)": 0.14743 }, { "epoch": 2.4202020202020202, "grad_norm": 0.9489296078681946, "learning_rate": 8.497489512245971e-07, "loss": 0.4750084400177002, "memory(GiB)": 30.0, "step": 375, "token_acc": 0.8613484960635219, "train_speed(iter/s)": 0.147589 }, { "epoch": 2.4525252525252528, "grad_norm": 1.0904533863067627, "learning_rate": 7.573589673248833e-07, "loss": 0.49940977096557615, "memory(GiB)": 30.0, "step": 380, "token_acc": 0.8456219466366027, "train_speed(iter/s)": 0.147838 }, { "epoch": 2.4525252525252528, "eval_loss": 0.6869800090789795, "eval_runtime": 4.4689, "eval_samples_per_second": 22.377, "eval_steps_per_second": 5.594, "eval_token_acc": 0.812230595956506, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.9920545816421509, "learning_rate": 6.698729810778065e-07, "loss": 0.4398204803466797, "memory(GiB)": 30.0, "step": 385, "token_acc": 0.8489904129398532, "train_speed(iter/s)": 0.147202 }, { "epoch": 2.517171717171717, "grad_norm": 0.9930100440979004, "learning_rate": 5.873921160683943e-07, "loss": 0.4805948257446289, "memory(GiB)": 30.0, "step": 390, "token_acc": 0.8426801497549115, "train_speed(iter/s)": 0.14741 }, { "epoch": 2.5494949494949495, "grad_norm": 1.0105656385421753, "learning_rate": 5.100117105459279e-07, "loss": 0.4643260478973389, "memory(GiB)": 30.0, "step": 395, "token_acc": 0.8793709396854699, "train_speed(iter/s)": 0.14759 }, { "epoch": 2.581818181818182, "grad_norm": 0.949529230594635, "learning_rate": 4.3782120722406565e-07, "loss": 0.4940080165863037, "memory(GiB)": 30.0, "step": 400, "token_acc": 0.8489606206997511, "train_speed(iter/s)": 0.147848 }, { "epoch": 2.581818181818182, "eval_loss": 0.6872583627700806, "eval_runtime": 4.4941, "eval_samples_per_second": 22.251, "eval_steps_per_second": 5.563, "eval_token_acc": 0.812397452654412, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 1.0164008140563965, "learning_rate": 3.709040498955102e-07, "loss": 0.4730886936187744, "memory(GiB)": 30.0, "step": 405, "token_acc": 0.8358856213579076, "train_speed(iter/s)": 0.147331 }, { "epoch": 2.6464646464646466, "grad_norm": 0.9890522956848145, "learning_rate": 3.0933758698072023e-07, "loss": 0.4806517124176025, "memory(GiB)": 30.0, "step": 410, "token_acc": 0.8606986899563319, "train_speed(iter/s)": 0.147503 }, { "epoch": 2.6787878787878787, "grad_norm": 1.0946931838989258, "learning_rate": 2.531929821221768e-07, "loss": 0.48343396186828613, "memory(GiB)": 30.0, "step": 415, "token_acc": 0.8560399806064223, "train_speed(iter/s)": 0.147741 }, { "epoch": 2.7111111111111112, "grad_norm": 0.9739342331886292, "learning_rate": 2.0253513192751374e-07, "loss": 0.4568051338195801, "memory(GiB)": 30.0, "step": 420, "token_acc": 0.8513886113886114, "train_speed(iter/s)": 0.147934 }, { "epoch": 2.7111111111111112, "eval_loss": 0.6869549751281738, "eval_runtime": 4.4891, "eval_samples_per_second": 22.276, "eval_steps_per_second": 5.569, "eval_token_acc": 0.8125643093523179, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 1.036109447479248, "learning_rate": 1.5742259095662126e-07, "loss": 0.4853102684020996, "memory(GiB)": 30.0, "step": 425, "token_acc": 0.8378954181386694, "train_speed(iter/s)": 0.147362 }, { "epoch": 2.775757575757576, "grad_norm": 1.012856125831604, "learning_rate": 1.1790750403941231e-07, "loss": 0.4725470542907715, "memory(GiB)": 30.0, "step": 430, "token_acc": 0.8562256448320653, "train_speed(iter/s)": 0.147546 }, { "epoch": 2.808080808080808, "grad_norm": 0.9843412041664124, "learning_rate": 8.403554600248498e-08, "loss": 0.47121171951293944, "memory(GiB)": 30.0, "step": 435, "token_acc": 0.8520417505951291, "train_speed(iter/s)": 0.14775 }, { "epoch": 2.8404040404040405, "grad_norm": 1.0212868452072144, "learning_rate": 5.584586887435739e-08, "loss": 0.47542705535888674, "memory(GiB)": 30.0, "step": 440, "token_acc": 0.8456783799474098, "train_speed(iter/s)": 0.147955 }, { "epoch": 2.8404040404040405, "eval_loss": 0.6869864463806152, "eval_runtime": 4.4679, "eval_samples_per_second": 22.382, "eval_steps_per_second": 5.595, "eval_token_acc": 0.812230595956506, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.9807941317558289, "learning_rate": 3.337105663029361e-08, "loss": 0.46687631607055663, "memory(GiB)": 30.0, "step": 445, "token_acc": 0.8307434410089937, "train_speed(iter/s)": 0.147443 }, { "epoch": 2.905050505050505, "grad_norm": 0.9818356037139893, "learning_rate": 1.6637087529033925e-08, "loss": 0.4775029182434082, "memory(GiB)": 30.0, "step": 450, "token_acc": 0.8397012044747847, "train_speed(iter/s)": 0.147671 }, { "epoch": 2.937373737373737, "grad_norm": 0.9716631174087524, "learning_rate": 5.6633040849601865e-09, "loss": 0.5018224716186523, "memory(GiB)": 30.0, "step": 455, "token_acc": 0.8534569498346989, "train_speed(iter/s)": 0.147853 }, { "epoch": 2.9696969696969697, "grad_norm": 1.012851357460022, "learning_rate": 4.623907104084335e-10, "loss": 0.48965134620666506, "memory(GiB)": 30.0, "step": 460, "token_acc": 0.8447179410444411, "train_speed(iter/s)": 0.148014 }, { "epoch": 2.9696969696969697, "eval_loss": 0.6869931817054749, "eval_runtime": 4.4888, "eval_samples_per_second": 22.278, "eval_steps_per_second": 5.569, "eval_token_acc": 0.8127033566005729, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.6868388652801514, "eval_runtime": 4.4905, "eval_samples_per_second": 22.269, "eval_steps_per_second": 5.567, "eval_token_acc": 0.8126199282516199, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.7722341067862835e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }