{ "best_global_step": 260, "best_metric": 0.33951408, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v13-20250507-015956/checkpoint-260", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 2.9768006801605225, "learning_rate": 9.999884400986087e-06, "loss": 0.4710537791252136, "memory(GiB)": 27.77, "step": 1, "token_acc": 0.8540840602696272, "train_speed(iter/s)": 0.065349 }, { "epoch": 0.03232323232323232, "grad_norm": 1.8765902519226074, "learning_rate": 9.997110291906109e-06, "loss": 0.3920785188674927, "memory(GiB)": 27.77, "step": 5, "token_acc": 0.8758898589657488, "train_speed(iter/s)": 0.119626 }, { "epoch": 0.06464646464646465, "grad_norm": 1.293035626411438, "learning_rate": 9.988444507789584e-06, "loss": 0.3347900867462158, "memory(GiB)": 27.77, "step": 10, "token_acc": 0.886203631273416, "train_speed(iter/s)": 0.137285 }, { "epoch": 0.09696969696969697, "grad_norm": 1.2047252655029297, "learning_rate": 9.97401266428502e-06, "loss": 0.2941281318664551, "memory(GiB)": 27.77, "step": 15, "token_acc": 0.9130237482982907, "train_speed(iter/s)": 0.139757 }, { "epoch": 0.1292929292929293, "grad_norm": 1.105458378791809, "learning_rate": 9.953831442918418e-06, "loss": 0.30655100345611574, "memory(GiB)": 27.77, "step": 20, "token_acc": 0.8922364378410602, "train_speed(iter/s)": 0.143912 }, { "epoch": 0.1292929292929293, "eval_loss": 0.4002552032470703, "eval_runtime": 5.4131, "eval_samples_per_second": 18.474, "eval_steps_per_second": 4.618, "eval_token_acc": 0.8938616739334873, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 1.041798710823059, "learning_rate": 9.927924170825266e-06, "loss": 0.29069504737854, "memory(GiB)": 29.53, "step": 25, "token_acc": 0.8984301348526458, "train_speed(iter/s)": 0.13295 }, { "epoch": 0.19393939393939394, "grad_norm": 1.0589144229888916, "learning_rate": 9.896320793787106e-06, "loss": 0.33169257640838623, "memory(GiB)": 29.53, "step": 30, "token_acc": 0.8989100590731341, "train_speed(iter/s)": 0.138041 }, { "epoch": 0.22626262626262628, "grad_norm": 0.905042290687561, "learning_rate": 9.859057841617709e-06, "loss": 0.29272122383117677, "memory(GiB)": 29.53, "step": 35, "token_acc": 0.9075644898907739, "train_speed(iter/s)": 0.140237 }, { "epoch": 0.2585858585858586, "grad_norm": 0.8976985812187195, "learning_rate": 9.816178385938867e-06, "loss": 0.29633958339691163, "memory(GiB)": 29.53, "step": 40, "token_acc": 0.9119097956307258, "train_speed(iter/s)": 0.142425 }, { "epoch": 0.2585858585858586, "eval_loss": 0.375478595495224, "eval_runtime": 5.311, "eval_samples_per_second": 18.829, "eval_steps_per_second": 4.707, "eval_token_acc": 0.8989911943233307, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 1.0213873386383057, "learning_rate": 9.767731990394638e-06, "loss": 0.2878244400024414, "memory(GiB)": 29.53, "step": 45, "token_acc": 0.8998546863647372, "train_speed(iter/s)": 0.137223 }, { "epoch": 0.32323232323232326, "grad_norm": 0.9807785749435425, "learning_rate": 9.71377465336155e-06, "loss": 0.30373663902282716, "memory(GiB)": 29.53, "step": 50, "token_acc": 0.9004981024667932, "train_speed(iter/s)": 0.139351 }, { "epoch": 0.35555555555555557, "grad_norm": 0.87921142578125, "learning_rate": 9.654368743221022e-06, "loss": 0.2892775535583496, "memory(GiB)": 29.53, "step": 55, "token_acc": 0.8972517176764522, "train_speed(iter/s)": 0.140773 }, { "epoch": 0.3878787878787879, "grad_norm": 1.0269665718078613, "learning_rate": 9.589582926268798e-06, "loss": 0.2767331600189209, "memory(GiB)": 29.53, "step": 60, "token_acc": 0.917551472624873, "train_speed(iter/s)": 0.142633 }, { "epoch": 0.3878787878787879, "eval_loss": 0.36320608854293823, "eval_runtime": 5.3096, "eval_samples_per_second": 18.834, "eval_steps_per_second": 4.708, "eval_token_acc": 0.9018979225442421, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.93445885181427, "learning_rate": 9.519492087344724e-06, "loss": 0.26028871536254883, "memory(GiB)": 29.53, "step": 65, "token_acc": 0.9259884974838246, "train_speed(iter/s)": 0.137874 }, { "epoch": 0.45252525252525255, "grad_norm": 0.9748146533966064, "learning_rate": 9.444177243274619e-06, "loss": 0.29547710418701173, "memory(GiB)": 29.53, "step": 70, "token_acc": 0.8966111739669199, "train_speed(iter/s)": 0.140001 }, { "epoch": 0.48484848484848486, "grad_norm": 0.937225341796875, "learning_rate": 9.363725449224281e-06, "loss": 0.2684732675552368, "memory(GiB)": 29.53, "step": 75, "token_acc": 0.9166046511627907, "train_speed(iter/s)": 0.141231 }, { "epoch": 0.5171717171717172, "grad_norm": 1.026237964630127, "learning_rate": 9.278229698073889e-06, "loss": 0.2696544647216797, "memory(GiB)": 29.53, "step": 80, "token_acc": 0.9159072741806554, "train_speed(iter/s)": 0.142009 }, { "epoch": 0.5171717171717172, "eval_loss": 0.35313570499420166, "eval_runtime": 5.3322, "eval_samples_per_second": 18.754, "eval_steps_per_second": 4.689, "eval_token_acc": 0.9009575104727708, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.9791724681854248, "learning_rate": 9.187788812929074e-06, "loss": 0.2900824546813965, "memory(GiB)": 29.53, "step": 85, "token_acc": 0.9096507542880761, "train_speed(iter/s)": 0.139042 }, { "epoch": 0.5818181818181818, "grad_norm": 1.288805603981018, "learning_rate": 9.092507332892968e-06, "loss": 0.2910241365432739, "memory(GiB)": 31.36, "step": 90, "token_acc": 0.9108959132712102, "train_speed(iter/s)": 0.14038 }, { "epoch": 0.6141414141414141, "grad_norm": 1.055824875831604, "learning_rate": 8.992495392231195e-06, "loss": 0.2932537794113159, "memory(GiB)": 31.36, "step": 95, "token_acc": 0.904834793623984, "train_speed(iter/s)": 0.141584 }, { "epoch": 0.6464646464646465, "grad_norm": 1.0129783153533936, "learning_rate": 8.88786859306952e-06, "loss": 0.26900548934936525, "memory(GiB)": 31.36, "step": 100, "token_acc": 0.8921682782362172, "train_speed(iter/s)": 0.142349 }, { "epoch": 0.6464646464646465, "eval_loss": 0.3519335985183716, "eval_runtime": 5.3292, "eval_samples_per_second": 18.764, "eval_steps_per_second": 4.691, "eval_token_acc": 0.9018979225442421, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 1.1157113313674927, "learning_rate": 8.778747871771293e-06, "loss": 0.2843419790267944, "memory(GiB)": 31.36, "step": 105, "token_acc": 0.9077419118455281, "train_speed(iter/s)": 0.140295 }, { "epoch": 0.7111111111111111, "grad_norm": 0.9619855880737305, "learning_rate": 8.665259359149132e-06, "loss": 0.2657431125640869, "memory(GiB)": 31.36, "step": 110, "token_acc": 0.9188039238860655, "train_speed(iter/s)": 0.141014 }, { "epoch": 0.7434343434343434, "grad_norm": 0.9229025840759277, "learning_rate": 8.547534234672435e-06, "loss": 0.28671417236328123, "memory(GiB)": 31.36, "step": 115, "token_acc": 0.9083705301168564, "train_speed(iter/s)": 0.141923 }, { "epoch": 0.7757575757575758, "grad_norm": 1.197439432144165, "learning_rate": 8.425708574839221e-06, "loss": 0.26473350524902345, "memory(GiB)": 31.36, "step": 120, "token_acc": 0.9178892300693734, "train_speed(iter/s)": 0.14256 }, { "epoch": 0.7757575757575758, "eval_loss": 0.3486604392528534, "eval_runtime": 5.3218, "eval_samples_per_second": 18.791, "eval_steps_per_second": 4.698, "eval_token_acc": 0.9009147644695221, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.9874864220619202, "learning_rate": 8.299923195887599e-06, "loss": 0.30171942710876465, "memory(GiB)": 31.36, "step": 125, "token_acc": 0.9013058472507459, "train_speed(iter/s)": 0.140695 }, { "epoch": 0.8404040404040404, "grad_norm": 0.916023850440979, "learning_rate": 8.170323491028625e-06, "loss": 0.2660797119140625, "memory(GiB)": 31.36, "step": 130, "token_acc": 0.9089231715442571, "train_speed(iter/s)": 0.141028 }, { "epoch": 0.8727272727272727, "grad_norm": 0.9316055774688721, "learning_rate": 8.03705926238874e-06, "loss": 0.26981799602508544, "memory(GiB)": 31.36, "step": 135, "token_acc": 0.9050517346317711, "train_speed(iter/s)": 0.141684 }, { "epoch": 0.9050505050505051, "grad_norm": 0.991523802280426, "learning_rate": 7.900284547855992e-06, "loss": 0.30231451988220215, "memory(GiB)": 31.36, "step": 140, "token_acc": 0.9078060346831559, "train_speed(iter/s)": 0.142094 }, { "epoch": 0.9050505050505051, "eval_loss": 0.34019821882247925, "eval_runtime": 5.3029, "eval_samples_per_second": 18.857, "eval_steps_per_second": 4.714, "eval_token_acc": 0.9021543985637342, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.9023342132568359, "learning_rate": 7.760157443030234e-06, "loss": 0.26686046123504636, "memory(GiB)": 31.36, "step": 145, "token_acc": 0.9070432898663288, "train_speed(iter/s)": 0.140367 }, { "epoch": 0.9696969696969697, "grad_norm": 0.8874338269233704, "learning_rate": 7.616839918483061e-06, "loss": 0.287930965423584, "memory(GiB)": 31.36, "step": 150, "token_acc": 0.9119464202274026, "train_speed(iter/s)": 0.140934 }, { "epoch": 1.0, "grad_norm": 0.9795826077461243, "learning_rate": 7.470497632538743e-06, "loss": 0.267154598236084, "memory(GiB)": 31.36, "step": 155, "token_acc": 0.9206813096488424, "train_speed(iter/s)": 0.141745 }, { "epoch": 1.0323232323232323, "grad_norm": 1.0346843004226685, "learning_rate": 7.321299739792553e-06, "loss": 0.210396146774292, "memory(GiB)": 31.36, "step": 160, "token_acc": 0.9283128167994207, "train_speed(iter/s)": 0.142337 }, { "epoch": 1.0323232323232323, "eval_loss": 0.3432323932647705, "eval_runtime": 5.3155, "eval_samples_per_second": 18.813, "eval_steps_per_second": 4.703, "eval_token_acc": 0.90360776267419, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 1.1043457984924316, "learning_rate": 7.169418695587791e-06, "loss": 0.18866196870803834, "memory(GiB)": 31.36, "step": 165, "token_acc": 0.9338293722459005, "train_speed(iter/s)": 0.140897 }, { "epoch": 1.096969696969697, "grad_norm": 1.0049176216125488, "learning_rate": 7.015030056677559e-06, "loss": 0.19572091102600098, "memory(GiB)": 31.36, "step": 170, "token_acc": 0.9378101525153654, "train_speed(iter/s)": 0.141758 }, { "epoch": 1.1292929292929292, "grad_norm": 1.0148508548736572, "learning_rate": 6.858312278301638e-06, "loss": 0.20892024040222168, "memory(GiB)": 31.36, "step": 175, "token_acc": 0.9264176417641764, "train_speed(iter/s)": 0.142192 }, { "epoch": 1.1616161616161615, "grad_norm": 0.8647620677947998, "learning_rate": 6.699446507913083e-06, "loss": 0.190657377243042, "memory(GiB)": 31.36, "step": 180, "token_acc": 0.929062185462343, "train_speed(iter/s)": 0.142547 }, { "epoch": 1.1616161616161615, "eval_loss": 0.3503071963787079, "eval_runtime": 5.314, "eval_samples_per_second": 18.818, "eval_steps_per_second": 4.705, "eval_token_acc": 0.9053176028041379, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.734727680683136, "learning_rate": 6.53861637579291e-06, "loss": 0.19445569515228273, "memory(GiB)": 31.36, "step": 185, "token_acc": 0.9243174259416815, "train_speed(iter/s)": 0.141275 }, { "epoch": 1.2262626262626264, "grad_norm": 1.0062605142593384, "learning_rate": 6.376007782794926e-06, "loss": 0.23203377723693847, "memory(GiB)": 31.36, "step": 190, "token_acc": 0.9284152664126429, "train_speed(iter/s)": 0.142023 }, { "epoch": 1.2585858585858585, "grad_norm": 0.8467244505882263, "learning_rate": 6.211808685466063e-06, "loss": 0.20841593742370607, "memory(GiB)": 31.36, "step": 195, "token_acc": 0.9305149796643181, "train_speed(iter/s)": 0.14276 }, { "epoch": 1.290909090909091, "grad_norm": 0.8289315700531006, "learning_rate": 6.046208878790543e-06, "loss": 0.20221335887908937, "memory(GiB)": 31.36, "step": 200, "token_acc": 0.9386574826174541, "train_speed(iter/s)": 0.143051 }, { "epoch": 1.290909090909091, "eval_loss": 0.3498072922229767, "eval_runtime": 5.3519, "eval_samples_per_second": 18.685, "eval_steps_per_second": 4.671, "eval_token_acc": 0.9033512866546978, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.9581440091133118, "learning_rate": 5.879399776809047e-06, "loss": 0.1989992380142212, "memory(GiB)": 31.36, "step": 205, "token_acc": 0.9317071351311029, "train_speed(iter/s)": 0.14184 }, { "epoch": 1.3555555555555556, "grad_norm": 0.905442476272583, "learning_rate": 5.711574191366427e-06, "loss": 0.20228266716003418, "memory(GiB)": 31.36, "step": 210, "token_acc": 0.9324250349735418, "train_speed(iter/s)": 0.142185 }, { "epoch": 1.387878787878788, "grad_norm": 0.8766697645187378, "learning_rate": 5.542926109243727e-06, "loss": 0.18625075817108155, "memory(GiB)": 31.36, "step": 215, "token_acc": 0.9340369393139841, "train_speed(iter/s)": 0.142509 }, { "epoch": 1.4202020202020202, "grad_norm": 0.8826522827148438, "learning_rate": 5.373650467932122e-06, "loss": 0.18408771753311157, "memory(GiB)": 31.36, "step": 220, "token_acc": 0.9386043390740326, "train_speed(iter/s)": 0.14291 }, { "epoch": 1.4202020202020202, "eval_loss": 0.3430667817592621, "eval_runtime": 5.3043, "eval_samples_per_second": 18.853, "eval_steps_per_second": 4.713, "eval_token_acc": 0.9047619047619048, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 1.0129814147949219, "learning_rate": 5.2039429303079294e-06, "loss": 0.20485594272613525, "memory(GiB)": 31.36, "step": 225, "token_acc": 0.9242819843342036, "train_speed(iter/s)": 0.142107 }, { "epoch": 1.4848484848484849, "grad_norm": 0.9322662353515625, "learning_rate": 5.033999658469174e-06, "loss": 0.1999491572380066, "memory(GiB)": 31.36, "step": 230, "token_acc": 0.935610103166133, "train_speed(iter/s)": 0.142426 }, { "epoch": 1.5171717171717172, "grad_norm": 0.9451385140419006, "learning_rate": 4.864017086995112e-06, "loss": 0.20325605869293212, "memory(GiB)": 31.36, "step": 235, "token_acc": 0.9395159286317778, "train_speed(iter/s)": 0.142786 }, { "epoch": 1.5494949494949495, "grad_norm": 0.7493119835853577, "learning_rate": 4.694191695890788e-06, "loss": 0.17417298555374144, "memory(GiB)": 31.36, "step": 240, "token_acc": 0.9434940634843713, "train_speed(iter/s)": 0.14319 }, { "epoch": 1.5494949494949495, "eval_loss": 0.3442021608352661, "eval_runtime": 5.3253, "eval_samples_per_second": 18.778, "eval_steps_per_second": 4.695, "eval_token_acc": 0.9033512866546978, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.8448814749717712, "learning_rate": 4.524719783479088e-06, "loss": 0.17924880981445312, "memory(GiB)": 31.36, "step": 245, "token_acc": 0.9297393970362801, "train_speed(iter/s)": 0.142181 }, { "epoch": 1.614141414141414, "grad_norm": 0.8817701935768127, "learning_rate": 4.355797239502807e-06, "loss": 0.1808495044708252, "memory(GiB)": 31.37, "step": 250, "token_acc": 0.932560963270262, "train_speed(iter/s)": 0.142404 }, { "epoch": 1.6464646464646466, "grad_norm": 0.892765462398529, "learning_rate": 4.187619318698971e-06, "loss": 0.1975640058517456, "memory(GiB)": 31.37, "step": 255, "token_acc": 0.9292730844793713, "train_speed(iter/s)": 0.142833 }, { "epoch": 1.6787878787878787, "grad_norm": 0.8926272392272949, "learning_rate": 4.020380415107167e-06, "loss": 0.19108818769454955, "memory(GiB)": 31.37, "step": 260, "token_acc": 0.9380416838629798, "train_speed(iter/s)": 0.14303 }, { "epoch": 1.6787878787878787, "eval_loss": 0.3395140767097473, "eval_runtime": 5.3138, "eval_samples_per_second": 18.819, "eval_steps_per_second": 4.705, "eval_token_acc": 0.904719158758656, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.9119723439216614, "learning_rate": 3.854273837372724e-06, "loss": 0.1968384265899658, "memory(GiB)": 31.37, "step": 265, "token_acc": 0.9235020131049183, "train_speed(iter/s)": 0.14221 }, { "epoch": 1.7434343434343433, "grad_norm": 0.8273991942405701, "learning_rate": 3.689491585304491e-06, "loss": 0.1967773199081421, "memory(GiB)": 31.37, "step": 270, "token_acc": 0.9207749251850685, "train_speed(iter/s)": 0.142463 }, { "epoch": 1.7757575757575759, "grad_norm": 1.0739576816558838, "learning_rate": 3.526224127945479e-06, "loss": 0.19146767854690552, "memory(GiB)": 31.37, "step": 275, "token_acc": 0.9385914241279716, "train_speed(iter/s)": 0.142947 }, { "epoch": 1.808080808080808, "grad_norm": 0.7444007992744446, "learning_rate": 3.3646601834128924e-06, "loss": 0.18657138347625732, "memory(GiB)": 31.37, "step": 280, "token_acc": 0.9347743648084945, "train_speed(iter/s)": 0.143177 }, { "epoch": 1.808080808080808, "eval_loss": 0.33954280614852905, "eval_runtime": 5.3308, "eval_samples_per_second": 18.759, "eval_steps_per_second": 4.69, "eval_token_acc": 0.9033512866546978, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.854715883731842, "learning_rate": 3.204986500762006e-06, "loss": 0.17565726041793822, "memory(GiB)": 31.37, "step": 285, "token_acc": 0.9216834543608964, "train_speed(iter/s)": 0.142226 }, { "epoch": 1.8727272727272726, "grad_norm": 0.9426065683364868, "learning_rate": 3.0473876441260786e-06, "loss": 0.18239200115203857, "memory(GiB)": 31.37, "step": 290, "token_acc": 0.939308718134809, "train_speed(iter/s)": 0.142498 }, { "epoch": 1.905050505050505, "grad_norm": 0.7922055721282959, "learning_rate": 2.8920457793817507e-06, "loss": 0.19457708597183226, "memory(GiB)": 31.37, "step": 295, "token_acc": 0.9396199182102478, "train_speed(iter/s)": 0.142765 }, { "epoch": 1.9373737373737374, "grad_norm": 1.0278714895248413, "learning_rate": 2.7391404635865725e-06, "loss": 0.1904490351676941, "memory(GiB)": 31.37, "step": 300, "token_acc": 0.9375734430082256, "train_speed(iter/s)": 0.143068 }, { "epoch": 1.9373737373737374, "eval_loss": 0.34051987528800964, "eval_runtime": 5.283, "eval_samples_per_second": 18.929, "eval_steps_per_second": 4.732, "eval_token_acc": 0.9058733008463709, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.7814855575561523, "learning_rate": 2.5888484374320033e-06, "loss": 0.18078551292419434, "memory(GiB)": 31.37, "step": 305, "token_acc": 0.9328905860866678, "train_speed(iter/s)": 0.142327 }, { "epoch": 2.0, "grad_norm": 1.068848967552185, "learning_rate": 2.4413434209518137e-06, "loss": 0.1775040626525879, "memory(GiB)": 31.37, "step": 310, "token_acc": 0.9349484536082474, "train_speed(iter/s)": 0.142606 }, { "epoch": 2.0323232323232325, "grad_norm": 0.8913955092430115, "learning_rate": 2.296795912722014e-06, "loss": 0.13691856861114501, "memory(GiB)": 31.37, "step": 315, "token_acc": 0.9597564204395023, "train_speed(iter/s)": 0.142686 }, { "epoch": 2.0646464646464646, "grad_norm": 0.7020771503448486, "learning_rate": 2.1553729927843894e-06, "loss": 0.14630917310714722, "memory(GiB)": 31.37, "step": 320, "token_acc": 0.9601399342732959, "train_speed(iter/s)": 0.142875 }, { "epoch": 2.0646464646464646, "eval_loss": 0.3507172465324402, "eval_runtime": 5.3226, "eval_samples_per_second": 18.788, "eval_steps_per_second": 4.697, "eval_token_acc": 0.9062152688723605, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.7276548743247986, "learning_rate": 2.017238129521506e-06, "loss": 0.14279915094375611, "memory(GiB)": 31.37, "step": 325, "token_acc": 0.9453313981615868, "train_speed(iter/s)": 0.142166 }, { "epoch": 2.1292929292929292, "grad_norm": 0.9800724387168884, "learning_rate": 1.8825509907063328e-06, "loss": 0.14380053281784058, "memory(GiB)": 31.37, "step": 330, "token_acc": 0.9582820773386105, "train_speed(iter/s)": 0.142491 }, { "epoch": 2.1616161616161618, "grad_norm": 0.764988899230957, "learning_rate": 1.7514672589449378e-06, "loss": 0.1322154998779297, "memory(GiB)": 31.37, "step": 335, "token_acc": 0.9540802213001384, "train_speed(iter/s)": 0.142717 }, { "epoch": 2.193939393939394, "grad_norm": 0.8425831198692322, "learning_rate": 1.6241384517255854e-06, "loss": 0.1310911536216736, "memory(GiB)": 31.37, "step": 340, "token_acc": 0.9542607139305956, "train_speed(iter/s)": 0.142908 }, { "epoch": 2.193939393939394, "eval_loss": 0.3661825954914093, "eval_runtime": 5.3319, "eval_samples_per_second": 18.755, "eval_steps_per_second": 4.689, "eval_token_acc": 0.9046764127554073, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.8031105399131775, "learning_rate": 1.500711746282192e-06, "loss": 0.15711712837219238, "memory(GiB)": 31.37, "step": 345, "token_acc": 0.9412384531628675, "train_speed(iter/s)": 0.142295 }, { "epoch": 2.2585858585858585, "grad_norm": 0.7701563835144043, "learning_rate": 1.3813298094746491e-06, "loss": 0.1371659517288208, "memory(GiB)": 31.37, "step": 350, "token_acc": 0.9590588494599956, "train_speed(iter/s)": 0.142428 }, { "epoch": 2.290909090909091, "grad_norm": 0.7400316596031189, "learning_rate": 1.2661306328825818e-06, "loss": 0.13530057668685913, "memory(GiB)": 31.37, "step": 355, "token_acc": 0.9558357869007278, "train_speed(iter/s)": 0.142645 }, { "epoch": 2.323232323232323, "grad_norm": 0.8807271718978882, "learning_rate": 1.1552473733031893e-06, "loss": 0.13264925479888917, "memory(GiB)": 31.37, "step": 360, "token_acc": 0.9480800203441553, "train_speed(iter/s)": 0.142961 }, { "epoch": 2.323232323232323, "eval_loss": 0.3667658269405365, "eval_runtime": 5.3077, "eval_samples_per_second": 18.841, "eval_steps_per_second": 4.71, "eval_token_acc": 0.9054458408138839, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.8146414160728455, "learning_rate": 1.0488081988375493e-06, "loss": 0.14928791522979737, "memory(GiB)": 31.37, "step": 365, "token_acc": 0.9331647539389224, "train_speed(iter/s)": 0.142423 }, { "epoch": 2.3878787878787877, "grad_norm": 0.8976642489433289, "learning_rate": 9.469361407432431e-07, "loss": 0.14599543809890747, "memory(GiB)": 31.37, "step": 370, "token_acc": 0.9549465467503818, "train_speed(iter/s)": 0.142603 }, { "epoch": 2.4202020202020202, "grad_norm": 0.7792761921882629, "learning_rate": 8.497489512245971e-07, "loss": 0.13710694313049315, "memory(GiB)": 31.37, "step": 375, "token_acc": 0.9578516805975458, "train_speed(iter/s)": 0.142793 }, { "epoch": 2.4525252525252528, "grad_norm": 0.9567272663116455, "learning_rate": 7.573589673248833e-07, "loss": 0.13858846426010132, "memory(GiB)": 31.37, "step": 380, "token_acc": 0.9512100926879505, "train_speed(iter/s)": 0.143064 }, { "epoch": 2.4525252525252528, "eval_loss": 0.3684777021408081, "eval_runtime": 5.3389, "eval_samples_per_second": 18.73, "eval_steps_per_second": 4.683, "eval_token_acc": 0.905018380781397, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.7511286735534668, "learning_rate": 6.698729810778065e-07, "loss": 0.14136791229248047, "memory(GiB)": 31.37, "step": 385, "token_acc": 0.9423156301596037, "train_speed(iter/s)": 0.14245 }, { "epoch": 2.517171717171717, "grad_norm": 0.8342220783233643, "learning_rate": 5.873921160683943e-07, "loss": 0.13304708003997803, "memory(GiB)": 31.37, "step": 390, "token_acc": 0.9584492790285859, "train_speed(iter/s)": 0.142715 }, { "epoch": 2.5494949494949495, "grad_norm": 0.7826300263404846, "learning_rate": 5.100117105459279e-07, "loss": 0.1284404754638672, "memory(GiB)": 31.37, "step": 395, "token_acc": 0.9555216985304958, "train_speed(iter/s)": 0.142902 }, { "epoch": 2.581818181818182, "grad_norm": 0.7116090655326843, "learning_rate": 4.3782120722406565e-07, "loss": 0.15817636251449585, "memory(GiB)": 31.37, "step": 400, "token_acc": 0.9559717504022885, "train_speed(iter/s)": 0.143144 }, { "epoch": 2.581818181818182, "eval_loss": 0.3664516508579254, "eval_runtime": 5.3264, "eval_samples_per_second": 18.774, "eval_steps_per_second": 4.694, "eval_token_acc": 0.9054885868171326, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.7377658486366272, "learning_rate": 3.709040498955102e-07, "loss": 0.13814414739608766, "memory(GiB)": 31.37, "step": 405, "token_acc": 0.9434749034749035, "train_speed(iter/s)": 0.142529 }, { "epoch": 2.6464646464646466, "grad_norm": 0.9466362595558167, "learning_rate": 3.0933758698072023e-07, "loss": 0.13764555454254152, "memory(GiB)": 31.37, "step": 410, "token_acc": 0.9556354916067147, "train_speed(iter/s)": 0.142696 }, { "epoch": 2.6787878787878787, "grad_norm": 0.8730055093765259, "learning_rate": 2.531929821221768e-07, "loss": 0.1323538064956665, "memory(GiB)": 31.37, "step": 415, "token_acc": 0.951093389819949, "train_speed(iter/s)": 0.142879 }, { "epoch": 2.7111111111111112, "grad_norm": 0.8799780011177063, "learning_rate": 2.0253513192751374e-07, "loss": 0.1415894865989685, "memory(GiB)": 31.37, "step": 420, "token_acc": 0.957655213984328, "train_speed(iter/s)": 0.143168 }, { "epoch": 2.7111111111111112, "eval_loss": 0.3659995198249817, "eval_runtime": 5.3257, "eval_samples_per_second": 18.777, "eval_steps_per_second": 4.694, "eval_token_acc": 0.9056595708301274, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.9417216777801514, "learning_rate": 1.5742259095662126e-07, "loss": 0.13300987482070922, "memory(GiB)": 31.37, "step": 425, "token_acc": 0.9425528606965174, "train_speed(iter/s)": 0.142605 }, { "epoch": 2.775757575757576, "grad_norm": 0.7449456453323364, "learning_rate": 1.1790750403941231e-07, "loss": 0.13196516036987305, "memory(GiB)": 31.37, "step": 430, "token_acc": 0.9565181855333061, "train_speed(iter/s)": 0.142783 }, { "epoch": 2.808080808080808, "grad_norm": 0.807500422000885, "learning_rate": 8.403554600248498e-08, "loss": 0.12061362266540528, "memory(GiB)": 31.37, "step": 435, "token_acc": 0.9600038504115127, "train_speed(iter/s)": 0.142955 }, { "epoch": 2.8404040404040405, "grad_norm": 0.8164981007575989, "learning_rate": 5.584586887435739e-08, "loss": 0.1402422547340393, "memory(GiB)": 31.37, "step": 440, "token_acc": 0.9568223268439581, "train_speed(iter/s)": 0.143151 }, { "epoch": 2.8404040404040405, "eval_loss": 0.3657556176185608, "eval_runtime": 5.3232, "eval_samples_per_second": 18.786, "eval_steps_per_second": 4.696, "eval_token_acc": 0.9058733008463709, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.8275712728500366, "learning_rate": 3.337105663029361e-08, "loss": 0.13185644149780273, "memory(GiB)": 31.37, "step": 445, "token_acc": 0.9404505730973607, "train_speed(iter/s)": 0.142606 }, { "epoch": 2.905050505050505, "grad_norm": 0.8103200197219849, "learning_rate": 1.6637087529033925e-08, "loss": 0.12856653928756714, "memory(GiB)": 31.37, "step": 450, "token_acc": 0.9603827178950516, "train_speed(iter/s)": 0.14286 }, { "epoch": 2.937373737373737, "grad_norm": 0.7664045691490173, "learning_rate": 5.6633040849601865e-09, "loss": 0.12633774280548096, "memory(GiB)": 31.37, "step": 455, "token_acc": 0.9499862901014533, "train_speed(iter/s)": 0.142986 }, { "epoch": 2.9696969696969697, "grad_norm": 0.8392007350921631, "learning_rate": 4.623907104084335e-10, "loss": 0.15011647939682007, "memory(GiB)": 31.37, "step": 460, "token_acc": 0.9552918794432762, "train_speed(iter/s)": 0.143271 }, { "epoch": 2.9696969696969697, "eval_loss": 0.36587560176849365, "eval_runtime": 5.3221, "eval_samples_per_second": 18.79, "eval_steps_per_second": 4.697, "eval_token_acc": 0.9063007608788578, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.36588189005851746, "eval_runtime": 5.3146, "eval_samples_per_second": 18.816, "eval_steps_per_second": 4.704, "eval_token_acc": 0.9059587928528683, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.302620480776438e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }