{ "best_global_step": 300, "best_metric": 0.22917783, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v27-20250507-113338/checkpoint-300", "epoch": 2.9826262626262627, "eval_steps": 20, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006464646464646465, "grad_norm": 2.5242867469787598, "learning_rate": 9.999884400986087e-06, "loss": 0.39474862813949585, "memory(GiB)": 28.84, "step": 1, "token_acc": 0.8908829863603733, "train_speed(iter/s)": 0.064293 }, { "epoch": 0.03232323232323232, "grad_norm": 1.5711474418640137, "learning_rate": 9.997110291906109e-06, "loss": 0.3434034585952759, "memory(GiB)": 30.62, "step": 5, "token_acc": 0.8817614172656647, "train_speed(iter/s)": 0.119566 }, { "epoch": 0.06464646464646465, "grad_norm": 0.9338254928588867, "learning_rate": 9.988444507789584e-06, "loss": 0.2892385244369507, "memory(GiB)": 30.62, "step": 10, "token_acc": 0.9103395025620628, "train_speed(iter/s)": 0.1364 }, { "epoch": 0.09696969696969697, "grad_norm": 0.9935480952262878, "learning_rate": 9.97401266428502e-06, "loss": 0.30152087211608886, "memory(GiB)": 32.44, "step": 15, "token_acc": 0.9016964442328413, "train_speed(iter/s)": 0.141481 }, { "epoch": 0.1292929292929293, "grad_norm": 0.8869792819023132, "learning_rate": 9.953831442918418e-06, "loss": 0.2820048570632935, "memory(GiB)": 32.44, "step": 20, "token_acc": 0.9094200925673837, "train_speed(iter/s)": 0.144923 }, { "epoch": 0.1292929292929293, "eval_loss": 0.2714148759841919, "eval_runtime": 4.8691, "eval_samples_per_second": 20.538, "eval_steps_per_second": 5.134, "eval_token_acc": 0.9161811841070817, "step": 20 }, { "epoch": 0.16161616161616163, "grad_norm": 0.9623355865478516, "learning_rate": 9.927924170825266e-06, "loss": 0.28305883407592775, "memory(GiB)": 32.44, "step": 25, "token_acc": 0.8970619818736308, "train_speed(iter/s)": 0.133319 }, { "epoch": 0.19393939393939394, "grad_norm": 0.806711733341217, "learning_rate": 9.896320793787106e-06, "loss": 0.2747792720794678, "memory(GiB)": 32.44, "step": 30, "token_acc": 0.9068442528293171, "train_speed(iter/s)": 0.138408 }, { "epoch": 0.22626262626262628, "grad_norm": 0.8571366667747498, "learning_rate": 9.859057841617709e-06, "loss": 0.2719248294830322, "memory(GiB)": 32.44, "step": 35, "token_acc": 0.9210450095580143, "train_speed(iter/s)": 0.14047 }, { "epoch": 0.2585858585858586, "grad_norm": 0.8261837363243103, "learning_rate": 9.816178385938867e-06, "loss": 0.2738617420196533, "memory(GiB)": 32.45, "step": 40, "token_acc": 0.9066808952792833, "train_speed(iter/s)": 0.142017 }, { "epoch": 0.2585858585858586, "eval_loss": 0.2558521330356598, "eval_runtime": 4.8838, "eval_samples_per_second": 20.476, "eval_steps_per_second": 5.119, "eval_token_acc": 0.9202935069647429, "step": 40 }, { "epoch": 0.2909090909090909, "grad_norm": 0.7319702506065369, "learning_rate": 9.767731990394638e-06, "loss": 0.2567479133605957, "memory(GiB)": 32.45, "step": 45, "token_acc": 0.9167146310579026, "train_speed(iter/s)": 0.136503 }, { "epoch": 0.32323232323232326, "grad_norm": 0.8507645130157471, "learning_rate": 9.71377465336155e-06, "loss": 0.261569881439209, "memory(GiB)": 32.45, "step": 50, "token_acc": 0.9125896733273807, "train_speed(iter/s)": 0.138728 }, { "epoch": 0.35555555555555557, "grad_norm": 0.8376278877258301, "learning_rate": 9.654368743221022e-06, "loss": 0.24444923400878907, "memory(GiB)": 32.45, "step": 55, "token_acc": 0.9274953450318795, "train_speed(iter/s)": 0.139921 }, { "epoch": 0.3878787878787879, "grad_norm": 0.8305687308311462, "learning_rate": 9.589582926268798e-06, "loss": 0.26804823875427247, "memory(GiB)": 34.77, "step": 60, "token_acc": 0.9223425512494758, "train_speed(iter/s)": 0.14136 }, { "epoch": 0.3878787878787879, "eval_loss": 0.2471594363451004, "eval_runtime": 4.836, "eval_samples_per_second": 20.678, "eval_steps_per_second": 5.17, "eval_token_acc": 0.9224706190658576, "step": 60 }, { "epoch": 0.4202020202020202, "grad_norm": 0.6730025410652161, "learning_rate": 9.519492087344724e-06, "loss": 0.24588844776153565, "memory(GiB)": 34.77, "step": 65, "token_acc": 0.9117032737506321, "train_speed(iter/s)": 0.136683 }, { "epoch": 0.45252525252525255, "grad_norm": 0.7964938282966614, "learning_rate": 9.444177243274619e-06, "loss": 0.2592522859573364, "memory(GiB)": 34.77, "step": 70, "token_acc": 0.9183477688849907, "train_speed(iter/s)": 0.138829 }, { "epoch": 0.48484848484848486, "grad_norm": 0.6614187359809875, "learning_rate": 9.363725449224281e-06, "loss": 0.2513019561767578, "memory(GiB)": 34.77, "step": 75, "token_acc": 0.9213027816690014, "train_speed(iter/s)": 0.140268 }, { "epoch": 0.5171717171717172, "grad_norm": 0.7636239528656006, "learning_rate": 9.278229698073889e-06, "loss": 0.2455005168914795, "memory(GiB)": 34.77, "step": 80, "token_acc": 0.9128787878787878, "train_speed(iter/s)": 0.140852 }, { "epoch": 0.5171717171717172, "eval_loss": 0.24173545837402344, "eval_runtime": 4.8448, "eval_samples_per_second": 20.641, "eval_steps_per_second": 5.16, "eval_token_acc": 0.9233374322172274, "step": 80 }, { "epoch": 0.5494949494949495, "grad_norm": 0.8166657090187073, "learning_rate": 9.187788812929074e-06, "loss": 0.2590561628341675, "memory(GiB)": 34.77, "step": 85, "token_acc": 0.9130182349905385, "train_speed(iter/s)": 0.137729 }, { "epoch": 0.5818181818181818, "grad_norm": 0.8091973066329956, "learning_rate": 9.092507332892968e-06, "loss": 0.2490919589996338, "memory(GiB)": 34.77, "step": 90, "token_acc": 0.910330508950115, "train_speed(iter/s)": 0.13867 }, { "epoch": 0.6141414141414141, "grad_norm": 0.8563810586929321, "learning_rate": 8.992495392231195e-06, "loss": 0.2534335613250732, "memory(GiB)": 34.77, "step": 95, "token_acc": 0.9122233688797157, "train_speed(iter/s)": 0.139651 }, { "epoch": 0.6464646464646465, "grad_norm": 0.7787972092628479, "learning_rate": 8.88786859306952e-06, "loss": 0.24485716819763184, "memory(GiB)": 34.77, "step": 100, "token_acc": 0.9189008559751176, "train_speed(iter/s)": 0.140544 }, { "epoch": 0.6464646464646465, "eval_loss": 0.23870104551315308, "eval_runtime": 4.8543, "eval_samples_per_second": 20.6, "eval_steps_per_second": 5.15, "eval_token_acc": 0.9255750196544843, "step": 100 }, { "epoch": 0.6787878787878788, "grad_norm": 0.7570570707321167, "learning_rate": 8.778747871771293e-06, "loss": 0.25060880184173584, "memory(GiB)": 34.77, "step": 105, "token_acc": 0.9076021696824148, "train_speed(iter/s)": 0.138513 }, { "epoch": 0.7111111111111111, "grad_norm": 0.7018725872039795, "learning_rate": 8.665259359149132e-06, "loss": 0.2399357795715332, "memory(GiB)": 34.77, "step": 110, "token_acc": 0.927743086529884, "train_speed(iter/s)": 0.139328 }, { "epoch": 0.7434343434343434, "grad_norm": 0.7108844518661499, "learning_rate": 8.547534234672435e-06, "loss": 0.23419642448425293, "memory(GiB)": 34.77, "step": 115, "token_acc": 0.9277124928693667, "train_speed(iter/s)": 0.140095 }, { "epoch": 0.7757575757575758, "grad_norm": 0.8351752161979675, "learning_rate": 8.425708574839221e-06, "loss": 0.24019112586975097, "memory(GiB)": 34.77, "step": 120, "token_acc": 0.9238991888760139, "train_speed(iter/s)": 0.140762 }, { "epoch": 0.7757575757575758, "eval_loss": 0.23568643629550934, "eval_runtime": 4.8562, "eval_samples_per_second": 20.592, "eval_steps_per_second": 5.148, "eval_token_acc": 0.9252323260830124, "step": 120 }, { "epoch": 0.8080808080808081, "grad_norm": 0.7820432186126709, "learning_rate": 8.299923195887599e-06, "loss": 0.23381190299987792, "memory(GiB)": 34.77, "step": 125, "token_acc": 0.9166274910577414, "train_speed(iter/s)": 0.138826 }, { "epoch": 0.8404040404040404, "grad_norm": 0.8153200149536133, "learning_rate": 8.170323491028625e-06, "loss": 0.25104479789733886, "memory(GiB)": 34.77, "step": 130, "token_acc": 0.9234165067178502, "train_speed(iter/s)": 0.139183 }, { "epoch": 0.8727272727272727, "grad_norm": 0.8061110973358154, "learning_rate": 8.03705926238874e-06, "loss": 0.24108409881591797, "memory(GiB)": 34.77, "step": 135, "token_acc": 0.9225523279137268, "train_speed(iter/s)": 0.139787 }, { "epoch": 0.9050505050505051, "grad_norm": 0.7487571835517883, "learning_rate": 7.900284547855992e-06, "loss": 0.23796701431274414, "memory(GiB)": 34.77, "step": 140, "token_acc": 0.9192467460537247, "train_speed(iter/s)": 0.140229 }, { "epoch": 0.9050505050505051, "eval_loss": 0.2325511872768402, "eval_runtime": 4.8347, "eval_samples_per_second": 20.684, "eval_steps_per_second": 5.171, "eval_token_acc": 0.9265224665873768, "step": 140 }, { "epoch": 0.9373737373737374, "grad_norm": 0.7421184182167053, "learning_rate": 7.760157443030234e-06, "loss": 0.22932517528533936, "memory(GiB)": 34.77, "step": 145, "token_acc": 0.9112203397203071, "train_speed(iter/s)": 0.13847 }, { "epoch": 0.9696969696969697, "grad_norm": 0.7295236587524414, "learning_rate": 7.616839918483061e-06, "loss": 0.233046817779541, "memory(GiB)": 34.77, "step": 150, "token_acc": 0.9231597652253514, "train_speed(iter/s)": 0.138991 }, { "epoch": 1.0, "grad_norm": 0.7331147789955139, "learning_rate": 7.470497632538743e-06, "loss": 0.23622214794158936, "memory(GiB)": 34.77, "step": 155, "token_acc": 0.923393272448806, "train_speed(iter/s)": 0.139619 }, { "epoch": 1.0323232323232323, "grad_norm": 0.6679208874702454, "learning_rate": 7.321299739792553e-06, "loss": 0.17297937870025634, "memory(GiB)": 34.77, "step": 160, "token_acc": 0.9432674199623352, "train_speed(iter/s)": 0.140142 }, { "epoch": 1.0323232323232323, "eval_loss": 0.23158761858940125, "eval_runtime": 4.8528, "eval_samples_per_second": 20.607, "eval_steps_per_second": 5.152, "eval_token_acc": 0.9269256354949906, "step": 160 }, { "epoch": 1.0646464646464646, "grad_norm": 0.8160377144813538, "learning_rate": 7.169418695587791e-06, "loss": 0.16782424449920655, "memory(GiB)": 34.77, "step": 165, "token_acc": 0.9344088433847973, "train_speed(iter/s)": 0.138724 }, { "epoch": 1.096969696969697, "grad_norm": 0.8471182584762573, "learning_rate": 7.015030056677559e-06, "loss": 0.16909420490264893, "memory(GiB)": 34.77, "step": 170, "token_acc": 0.9429876289177185, "train_speed(iter/s)": 0.139504 }, { "epoch": 1.1292929292929292, "grad_norm": 0.7286836504936218, "learning_rate": 6.858312278301638e-06, "loss": 0.1667182445526123, "memory(GiB)": 34.77, "step": 175, "token_acc": 0.9418443002780352, "train_speed(iter/s)": 0.139935 }, { "epoch": 1.1616161616161615, "grad_norm": 0.7752698063850403, "learning_rate": 6.699446507913083e-06, "loss": 0.15690959692001344, "memory(GiB)": 34.77, "step": 180, "token_acc": 0.9501004865665327, "train_speed(iter/s)": 0.14024 }, { "epoch": 1.1616161616161615, "eval_loss": 0.23583181202411652, "eval_runtime": 4.8506, "eval_samples_per_second": 20.616, "eval_steps_per_second": 5.154, "eval_token_acc": 0.9267442094865644, "step": 180 }, { "epoch": 1.1939393939393939, "grad_norm": 0.7071201205253601, "learning_rate": 6.53861637579291e-06, "loss": 0.15962274074554444, "memory(GiB)": 34.77, "step": 185, "token_acc": 0.9369357151160538, "train_speed(iter/s)": 0.138875 }, { "epoch": 1.2262626262626264, "grad_norm": 0.7520214319229126, "learning_rate": 6.376007782794926e-06, "loss": 0.15966968536376952, "memory(GiB)": 34.77, "step": 190, "token_acc": 0.9476890003582945, "train_speed(iter/s)": 0.139235 }, { "epoch": 1.2585858585858585, "grad_norm": 0.7770646214485168, "learning_rate": 6.211808685466063e-06, "loss": 0.17346657514572145, "memory(GiB)": 34.77, "step": 195, "token_acc": 0.937822677420255, "train_speed(iter/s)": 0.139941 }, { "epoch": 1.290909090909091, "grad_norm": 0.7723908424377441, "learning_rate": 6.046208878790543e-06, "loss": 0.1594362735748291, "memory(GiB)": 34.77, "step": 200, "token_acc": 0.9459411057384808, "train_speed(iter/s)": 0.140377 }, { "epoch": 1.290909090909091, "eval_loss": 0.23655511438846588, "eval_runtime": 4.8375, "eval_samples_per_second": 20.672, "eval_steps_per_second": 5.168, "eval_token_acc": 0.9270062692765134, "step": 200 }, { "epoch": 1.3232323232323233, "grad_norm": 0.7364096641540527, "learning_rate": 5.879399776809047e-06, "loss": 0.16425321102142335, "memory(GiB)": 34.77, "step": 205, "token_acc": 0.9377912867274569, "train_speed(iter/s)": 0.139168 }, { "epoch": 1.3555555555555556, "grad_norm": 0.772078275680542, "learning_rate": 5.711574191366427e-06, "loss": 0.16698684692382812, "memory(GiB)": 34.77, "step": 210, "token_acc": 0.9447016139121731, "train_speed(iter/s)": 0.139538 }, { "epoch": 1.387878787878788, "grad_norm": 0.6973662376403809, "learning_rate": 5.542926109243727e-06, "loss": 0.15273804664611818, "memory(GiB)": 34.77, "step": 215, "token_acc": 0.9457073269738178, "train_speed(iter/s)": 0.140027 }, { "epoch": 1.4202020202020202, "grad_norm": 0.7803521156311035, "learning_rate": 5.373650467932122e-06, "loss": 0.17012779712677, "memory(GiB)": 34.77, "step": 220, "token_acc": 0.9377423694832089, "train_speed(iter/s)": 0.14035 }, { "epoch": 1.4202020202020202, "eval_loss": 0.2345450520515442, "eval_runtime": 4.8264, "eval_samples_per_second": 20.719, "eval_steps_per_second": 5.18, "eval_token_acc": 0.9277722902009797, "step": 220 }, { "epoch": 1.4525252525252526, "grad_norm": 0.694709300994873, "learning_rate": 5.2039429303079294e-06, "loss": 0.16966450214385986, "memory(GiB)": 34.77, "step": 225, "token_acc": 0.9322694965253717, "train_speed(iter/s)": 0.139252 }, { "epoch": 1.4848484848484849, "grad_norm": 0.7459155917167664, "learning_rate": 5.033999658469174e-06, "loss": 0.17206931114196777, "memory(GiB)": 34.77, "step": 230, "token_acc": 0.9417377303357386, "train_speed(iter/s)": 0.13963 }, { "epoch": 1.5171717171717172, "grad_norm": 0.7466315627098083, "learning_rate": 4.864017086995112e-06, "loss": 0.15746488571166992, "memory(GiB)": 34.77, "step": 235, "token_acc": 0.9459481252519822, "train_speed(iter/s)": 0.139941 }, { "epoch": 1.5494949494949495, "grad_norm": 0.7919719815254211, "learning_rate": 4.694191695890788e-06, "loss": 0.1569303512573242, "memory(GiB)": 34.77, "step": 240, "token_acc": 0.941917082024835, "train_speed(iter/s)": 0.140297 }, { "epoch": 1.5494949494949495, "eval_loss": 0.23328742384910583, "eval_runtime": 4.8307, "eval_samples_per_second": 20.701, "eval_steps_per_second": 5.175, "eval_token_acc": 0.9284576773439233, "step": 240 }, { "epoch": 1.5818181818181818, "grad_norm": 0.8104657530784607, "learning_rate": 4.524719783479088e-06, "loss": 0.15921467542648315, "memory(GiB)": 34.77, "step": 245, "token_acc": 0.9333036905291241, "train_speed(iter/s)": 0.139163 }, { "epoch": 1.614141414141414, "grad_norm": 0.8245537877082825, "learning_rate": 4.355797239502807e-06, "loss": 0.16331541538238525, "memory(GiB)": 34.77, "step": 250, "token_acc": 0.9451396561913816, "train_speed(iter/s)": 0.139254 }, { "epoch": 1.6464646464646466, "grad_norm": 0.7749842405319214, "learning_rate": 4.187619318698971e-06, "loss": 0.15697014331817627, "memory(GiB)": 34.77, "step": 255, "token_acc": 0.9451475779917865, "train_speed(iter/s)": 0.139673 }, { "epoch": 1.6787878787878787, "grad_norm": 0.8173830509185791, "learning_rate": 4.020380415107167e-06, "loss": 0.16766272783279418, "memory(GiB)": 34.77, "step": 260, "token_acc": 0.9469431879605132, "train_speed(iter/s)": 0.139915 }, { "epoch": 1.6787878787878787, "eval_loss": 0.2292548418045044, "eval_runtime": 4.8388, "eval_samples_per_second": 20.666, "eval_steps_per_second": 5.167, "eval_token_acc": 0.9289213215876791, "step": 260 }, { "epoch": 1.7111111111111112, "grad_norm": 0.7277234196662903, "learning_rate": 3.854273837372724e-06, "loss": 0.16253018379211426, "memory(GiB)": 34.77, "step": 265, "token_acc": 0.9426872469635628, "train_speed(iter/s)": 0.138974 }, { "epoch": 1.7434343434343433, "grad_norm": 0.8412746787071228, "learning_rate": 3.689491585304491e-06, "loss": 0.16776057481765747, "memory(GiB)": 34.77, "step": 270, "token_acc": 0.9380598276153456, "train_speed(iter/s)": 0.139245 }, { "epoch": 1.7757575757575759, "grad_norm": 0.7006183862686157, "learning_rate": 3.526224127945479e-06, "loss": 0.15875219106674193, "memory(GiB)": 34.77, "step": 275, "token_acc": 0.9433733748578773, "train_speed(iter/s)": 0.13972 }, { "epoch": 1.808080808080808, "grad_norm": 0.7234155535697937, "learning_rate": 3.3646601834128924e-06, "loss": 0.159059476852417, "memory(GiB)": 34.77, "step": 280, "token_acc": 0.9385077213505401, "train_speed(iter/s)": 0.139973 }, { "epoch": 1.808080808080808, "eval_loss": 0.22943958640098572, "eval_runtime": 4.8423, "eval_samples_per_second": 20.652, "eval_steps_per_second": 5.163, "eval_token_acc": 0.9282157759993549, "step": 280 }, { "epoch": 1.8404040404040405, "grad_norm": 0.7152555584907532, "learning_rate": 3.204986500762006e-06, "loss": 0.1508580207824707, "memory(GiB)": 34.77, "step": 285, "token_acc": 0.9341365308729054, "train_speed(iter/s)": 0.138995 }, { "epoch": 1.8727272727272726, "grad_norm": 0.7166336178779602, "learning_rate": 3.0473876441260786e-06, "loss": 0.16788345575332642, "memory(GiB)": 34.77, "step": 290, "token_acc": 0.9407490363579539, "train_speed(iter/s)": 0.139247 }, { "epoch": 1.905050505050505, "grad_norm": 0.7887818813323975, "learning_rate": 2.8920457793817507e-06, "loss": 0.15700163841247558, "memory(GiB)": 34.77, "step": 295, "token_acc": 0.9426423803879983, "train_speed(iter/s)": 0.139471 }, { "epoch": 1.9373737373737374, "grad_norm": 0.8042952418327332, "learning_rate": 2.7391404635865725e-06, "loss": 0.16809990406036376, "memory(GiB)": 34.77, "step": 300, "token_acc": 0.944771353933029, "train_speed(iter/s)": 0.139742 }, { "epoch": 1.9373737373737374, "eval_loss": 0.2291778326034546, "eval_runtime": 4.8337, "eval_samples_per_second": 20.688, "eval_steps_per_second": 5.172, "eval_token_acc": 0.9292035398230089, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 0.6989196538925171, "learning_rate": 2.5888484374320033e-06, "loss": 0.1462658405303955, "memory(GiB)": 34.77, "step": 305, "token_acc": 0.9413777899090852, "train_speed(iter/s)": 0.138859 }, { "epoch": 2.0, "grad_norm": 0.9214490056037903, "learning_rate": 2.4413434209518137e-06, "loss": 0.17060282230377197, "memory(GiB)": 34.77, "step": 310, "token_acc": 0.9452530120481928, "train_speed(iter/s)": 0.139266 }, { "epoch": 2.0323232323232325, "grad_norm": 0.6455035209655762, "learning_rate": 2.296795912722014e-06, "loss": 0.12579550743103027, "memory(GiB)": 34.77, "step": 315, "token_acc": 0.9598925994294345, "train_speed(iter/s)": 0.139482 }, { "epoch": 2.0646464646464646, "grad_norm": 0.6761350631713867, "learning_rate": 2.1553729927843894e-06, "loss": 0.10464283227920532, "memory(GiB)": 34.77, "step": 320, "token_acc": 0.9628479377702958, "train_speed(iter/s)": 0.139728 }, { "epoch": 2.0646464646464646, "eval_loss": 0.23895612359046936, "eval_runtime": 4.8116, "eval_samples_per_second": 20.783, "eval_steps_per_second": 5.196, "eval_token_acc": 0.9286391033523494, "step": 320 }, { "epoch": 2.096969696969697, "grad_norm": 0.7690797448158264, "learning_rate": 2.017238129521506e-06, "loss": 0.10620735883712769, "memory(GiB)": 34.77, "step": 325, "token_acc": 0.951885791453651, "train_speed(iter/s)": 0.138943 }, { "epoch": 2.1292929292929292, "grad_norm": 0.8825842142105103, "learning_rate": 1.8825509907063328e-06, "loss": 0.11970834732055664, "memory(GiB)": 34.77, "step": 330, "token_acc": 0.9611402417348027, "train_speed(iter/s)": 0.139288 }, { "epoch": 2.1616161616161618, "grad_norm": 0.7779159545898438, "learning_rate": 1.7514672589449378e-06, "loss": 0.1070137619972229, "memory(GiB)": 34.77, "step": 335, "token_acc": 0.9595128097438052, "train_speed(iter/s)": 0.139418 }, { "epoch": 2.193939393939394, "grad_norm": 0.7191023826599121, "learning_rate": 1.6241384517255854e-06, "loss": 0.11621193885803223, "memory(GiB)": 34.77, "step": 340, "token_acc": 0.9595157410042165, "train_speed(iter/s)": 0.139652 }, { "epoch": 2.193939393939394, "eval_loss": 0.2503757178783417, "eval_runtime": 4.8011, "eval_samples_per_second": 20.829, "eval_steps_per_second": 5.207, "eval_token_acc": 0.928538311125446, "step": 340 }, { "epoch": 2.2262626262626264, "grad_norm": 0.6950616240501404, "learning_rate": 1.500711746282192e-06, "loss": 0.11175984144210815, "memory(GiB)": 34.77, "step": 345, "token_acc": 0.9465600862223819, "train_speed(iter/s)": 0.138949 }, { "epoch": 2.2585858585858585, "grad_norm": 0.7209317088127136, "learning_rate": 1.3813298094746491e-06, "loss": 0.11107317209243775, "memory(GiB)": 34.77, "step": 350, "token_acc": 0.9588838612368024, "train_speed(iter/s)": 0.139108 }, { "epoch": 2.290909090909091, "grad_norm": 0.7023849487304688, "learning_rate": 1.2661306328825818e-06, "loss": 0.1061722993850708, "memory(GiB)": 34.77, "step": 355, "token_acc": 0.9638245595692404, "train_speed(iter/s)": 0.139405 }, { "epoch": 2.323232323232323, "grad_norm": 0.6932234764099121, "learning_rate": 1.1552473733031893e-06, "loss": 0.11119704246520996, "memory(GiB)": 34.77, "step": 360, "token_acc": 0.9628568099732029, "train_speed(iter/s)": 0.139754 }, { "epoch": 2.323232323232323, "eval_loss": 0.2477826327085495, "eval_runtime": 4.8417, "eval_samples_per_second": 20.654, "eval_steps_per_second": 5.163, "eval_token_acc": 0.9284173604531618, "step": 360 }, { "epoch": 2.3555555555555556, "grad_norm": 0.7261531352996826, "learning_rate": 1.0488081988375493e-06, "loss": 0.11782848834991455, "memory(GiB)": 34.77, "step": 365, "token_acc": 0.9477818154288743, "train_speed(iter/s)": 0.139242 }, { "epoch": 2.3878787878787877, "grad_norm": 0.7011246681213379, "learning_rate": 9.469361407432431e-07, "loss": 0.10731152296066285, "memory(GiB)": 34.77, "step": 370, "token_acc": 0.9617250245182495, "train_speed(iter/s)": 0.139376 }, { "epoch": 2.4202020202020202, "grad_norm": 0.7193347811698914, "learning_rate": 8.497489512245971e-07, "loss": 0.11734654903411865, "memory(GiB)": 34.77, "step": 375, "token_acc": 0.9643068481359944, "train_speed(iter/s)": 0.139563 }, { "epoch": 2.4525252525252528, "grad_norm": 0.8027297258377075, "learning_rate": 7.573589673248833e-07, "loss": 0.1112905502319336, "memory(GiB)": 34.77, "step": 380, "token_acc": 0.964820230517805, "train_speed(iter/s)": 0.139778 }, { "epoch": 2.4525252525252528, "eval_loss": 0.24963097274303436, "eval_runtime": 4.8179, "eval_samples_per_second": 20.756, "eval_steps_per_second": 5.189, "eval_token_acc": 0.9288810046969178, "step": 380 }, { "epoch": 2.484848484848485, "grad_norm": 0.7203919291496277, "learning_rate": 6.698729810778065e-07, "loss": 0.10394268035888672, "memory(GiB)": 34.77, "step": 385, "token_acc": 0.9475781400629834, "train_speed(iter/s)": 0.139104 }, { "epoch": 2.517171717171717, "grad_norm": 0.71819007396698, "learning_rate": 5.873921160683943e-07, "loss": 0.10946273803710938, "memory(GiB)": 34.77, "step": 390, "token_acc": 0.9592940980604345, "train_speed(iter/s)": 0.139407 }, { "epoch": 2.5494949494949495, "grad_norm": 0.7178409695625305, "learning_rate": 5.100117105459279e-07, "loss": 0.1130871295928955, "memory(GiB)": 34.77, "step": 395, "token_acc": 0.9668103880477289, "train_speed(iter/s)": 0.139538 }, { "epoch": 2.581818181818182, "grad_norm": 0.67863929271698, "learning_rate": 4.3782120722406565e-07, "loss": 0.10182794332504272, "memory(GiB)": 34.77, "step": 400, "token_acc": 0.9624655998369177, "train_speed(iter/s)": 0.139808 }, { "epoch": 2.581818181818182, "eval_loss": 0.24975040555000305, "eval_runtime": 4.8388, "eval_samples_per_second": 20.666, "eval_steps_per_second": 5.167, "eval_token_acc": 0.9285786280162074, "step": 400 }, { "epoch": 2.614141414141414, "grad_norm": 0.652603805065155, "learning_rate": 3.709040498955102e-07, "loss": 0.1031190037727356, "memory(GiB)": 34.77, "step": 405, "token_acc": 0.9463408184402476, "train_speed(iter/s)": 0.139139 }, { "epoch": 2.6464646464646466, "grad_norm": 1.0421696901321411, "learning_rate": 3.0933758698072023e-07, "loss": 0.121562659740448, "memory(GiB)": 34.77, "step": 410, "token_acc": 0.9582254445019851, "train_speed(iter/s)": 0.139341 }, { "epoch": 2.6787878787878787, "grad_norm": 0.7723399996757507, "learning_rate": 2.531929821221768e-07, "loss": 0.11757031679153443, "memory(GiB)": 34.77, "step": 415, "token_acc": 0.9626831890454366, "train_speed(iter/s)": 0.139573 }, { "epoch": 2.7111111111111112, "grad_norm": 0.6900373101234436, "learning_rate": 2.0253513192751374e-07, "loss": 0.11266238689422607, "memory(GiB)": 34.77, "step": 420, "token_acc": 0.9586481947942905, "train_speed(iter/s)": 0.139767 }, { "epoch": 2.7111111111111112, "eval_loss": 0.2500038743019104, "eval_runtime": 4.8315, "eval_samples_per_second": 20.697, "eval_steps_per_second": 5.174, "eval_token_acc": 0.9289616384784405, "step": 420 }, { "epoch": 2.7434343434343433, "grad_norm": 0.7542772889137268, "learning_rate": 1.5742259095662126e-07, "loss": 0.11297458410263062, "memory(GiB)": 34.77, "step": 425, "token_acc": 0.9452227294191721, "train_speed(iter/s)": 0.139122 }, { "epoch": 2.775757575757576, "grad_norm": 0.7564345598220825, "learning_rate": 1.1790750403941231e-07, "loss": 0.1145315408706665, "memory(GiB)": 34.77, "step": 430, "token_acc": 0.9606825351304846, "train_speed(iter/s)": 0.139315 }, { "epoch": 2.808080808080808, "grad_norm": 0.7472628355026245, "learning_rate": 8.403554600248498e-08, "loss": 0.10037808418273926, "memory(GiB)": 34.77, "step": 435, "token_acc": 0.968122471719594, "train_speed(iter/s)": 0.139508 }, { "epoch": 2.8404040404040405, "grad_norm": 0.7140054702758789, "learning_rate": 5.584586887435739e-08, "loss": 0.1066713809967041, "memory(GiB)": 34.77, "step": 440, "token_acc": 0.9640665162880974, "train_speed(iter/s)": 0.139714 }, { "epoch": 2.8404040404040405, "eval_loss": 0.2495991587638855, "eval_runtime": 4.8171, "eval_samples_per_second": 20.76, "eval_steps_per_second": 5.19, "eval_token_acc": 0.9289414800330599, "step": 440 }, { "epoch": 2.8727272727272726, "grad_norm": 0.6888604164123535, "learning_rate": 3.337105663029361e-08, "loss": 0.11188592910766601, "memory(GiB)": 34.77, "step": 445, "token_acc": 0.9492009251349155, "train_speed(iter/s)": 0.13915 }, { "epoch": 2.905050505050505, "grad_norm": 0.7185996770858765, "learning_rate": 1.6637087529033925e-08, "loss": 0.11582531929016113, "memory(GiB)": 34.77, "step": 450, "token_acc": 0.9591944327288064, "train_speed(iter/s)": 0.139392 }, { "epoch": 2.937373737373737, "grad_norm": 0.6511121988296509, "learning_rate": 5.6633040849601865e-09, "loss": 0.10450353622436523, "memory(GiB)": 34.77, "step": 455, "token_acc": 0.9614834408486059, "train_speed(iter/s)": 0.139543 }, { "epoch": 2.9696969696969697, "grad_norm": 0.6926988363265991, "learning_rate": 4.623907104084335e-10, "loss": 0.11017694473266601, "memory(GiB)": 34.77, "step": 460, "token_acc": 0.9601264597715982, "train_speed(iter/s)": 0.139788 }, { "epoch": 2.9696969696969697, "eval_loss": 0.24949033558368683, "eval_runtime": 4.8501, "eval_samples_per_second": 20.618, "eval_steps_per_second": 5.155, "eval_token_acc": 0.9285786280162074, "step": 460 }, { "epoch": 2.9826262626262627, "eval_loss": 0.2496582567691803, "eval_runtime": 4.861, "eval_samples_per_second": 20.572, "eval_steps_per_second": 5.143, "eval_token_acc": 0.929062430705344, "step": 462 } ], "logging_steps": 5, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.5801089900517786e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }