{ "best_global_step": 1860, "best_metric": 0.1756638, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v30-20250504-002959/checkpoint-1860", "epoch": 2.997061180870959, "eval_steps": 20, "global_step": 2805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010686615014694097, "grad_norm": 2.5240726470947266, "learning_rate": 9.999996864014995e-06, "loss": 0.36646389961242676, "memory(GiB)": 28.83, "step": 1, "token_acc": 0.898885535175296, "train_speed(iter/s)": 0.068736 }, { "epoch": 0.0053433075073470475, "grad_norm": 2.2844929695129395, "learning_rate": 9.99992160057155e-06, "loss": 0.32597002387046814, "memory(GiB)": 28.87, "step": 5, "token_acc": 0.8913895678642254, "train_speed(iter/s)": 0.125487 }, { "epoch": 0.010686615014694095, "grad_norm": 1.130465030670166, "learning_rate": 9.999686404744782e-06, "loss": 0.27506489753723146, "memory(GiB)": 28.87, "step": 10, "token_acc": 0.9082294264339152, "train_speed(iter/s)": 0.138754 }, { "epoch": 0.016029922522041145, "grad_norm": 1.0269818305969238, "learning_rate": 9.999294419895389e-06, "loss": 0.27271237373352053, "memory(GiB)": 28.88, "step": 15, "token_acc": 0.9203989829845491, "train_speed(iter/s)": 0.144155 }, { "epoch": 0.02137323002938819, "grad_norm": 0.9383953213691711, "learning_rate": 9.998745658315924e-06, "loss": 0.2601430892944336, "memory(GiB)": 28.88, "step": 20, "token_acc": 0.9118547099521022, "train_speed(iter/s)": 0.144659 }, { "epoch": 0.02137323002938819, "eval_loss": 0.26998990774154663, "eval_runtime": 39.5781, "eval_samples_per_second": 15.261, "eval_steps_per_second": 3.815, "eval_token_acc": 0.9141692937871282, "step": 20 }, { "epoch": 0.026716537536735238, "grad_norm": 0.8839966058731079, "learning_rate": 9.998040137215423e-06, "loss": 0.2755439758300781, "memory(GiB)": 28.88, "step": 25, "token_acc": 0.9063694193980396, "train_speed(iter/s)": 0.11355 }, { "epoch": 0.03205984504408229, "grad_norm": 1.0895779132843018, "learning_rate": 9.99717787871887e-06, "loss": 0.2630652666091919, "memory(GiB)": 28.88, "step": 30, "token_acc": 0.9156098058549522, "train_speed(iter/s)": 0.118275 }, { "epoch": 0.037403152551429335, "grad_norm": 0.9162330031394958, "learning_rate": 9.99615890986649e-06, "loss": 0.25594387054443357, "memory(GiB)": 28.88, "step": 35, "token_acc": 0.9182685889734348, "train_speed(iter/s)": 0.12251 }, { "epoch": 0.04274646005877638, "grad_norm": 0.7786328792572021, "learning_rate": 9.994983262612916e-06, "loss": 0.26515631675720214, "memory(GiB)": 28.88, "step": 40, "token_acc": 0.9209860093271153, "train_speed(iter/s)": 0.125479 }, { "epoch": 0.04274646005877638, "eval_loss": 0.2504235804080963, "eval_runtime": 39.1128, "eval_samples_per_second": 15.443, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9184799159321453, "step": 40 }, { "epoch": 0.04808976756612343, "grad_norm": 0.9364318251609802, "learning_rate": 9.993650973826177e-06, "loss": 0.2555875062942505, "memory(GiB)": 28.88, "step": 45, "token_acc": 0.9150358901081727, "train_speed(iter/s)": 0.111936 }, { "epoch": 0.053433075073470476, "grad_norm": 0.8948811888694763, "learning_rate": 9.992162085286543e-06, "loss": 0.24805829524993897, "memory(GiB)": 28.88, "step": 50, "token_acc": 0.9186546283208815, "train_speed(iter/s)": 0.115376 }, { "epoch": 0.05877638258081753, "grad_norm": 0.9033949375152588, "learning_rate": 9.990516643685222e-06, "loss": 0.24738097190856934, "memory(GiB)": 28.88, "step": 55, "token_acc": 0.9198283024093049, "train_speed(iter/s)": 0.117724 }, { "epoch": 0.06411969008816458, "grad_norm": 0.8253108859062195, "learning_rate": 9.988714700622882e-06, "loss": 0.24202115535736085, "memory(GiB)": 28.88, "step": 60, "token_acc": 0.9179127157792366, "train_speed(iter/s)": 0.12054 }, { "epoch": 0.06411969008816458, "eval_loss": 0.24108153581619263, "eval_runtime": 39.139, "eval_samples_per_second": 15.432, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9215252310793712, "step": 60 }, { "epoch": 0.06946299759551162, "grad_norm": 0.9196599125862122, "learning_rate": 9.986756312608048e-06, "loss": 0.2574702262878418, "memory(GiB)": 28.88, "step": 65, "token_acc": 0.9133585236043591, "train_speed(iter/s)": 0.112242 }, { "epoch": 0.07480630510285867, "grad_norm": 0.8361815810203552, "learning_rate": 9.98464154105532e-06, "loss": 0.24776794910430908, "memory(GiB)": 28.88, "step": 70, "token_acc": 0.9202816026503778, "train_speed(iter/s)": 0.114497 }, { "epoch": 0.08014961261020571, "grad_norm": 0.794195294380188, "learning_rate": 9.982370452283451e-06, "loss": 0.23735420703887938, "memory(GiB)": 28.88, "step": 75, "token_acc": 0.9199817957916153, "train_speed(iter/s)": 0.116585 }, { "epoch": 0.08549292011755276, "grad_norm": 0.9916167855262756, "learning_rate": 9.979943117513265e-06, "loss": 0.23866429328918456, "memory(GiB)": 28.88, "step": 80, "token_acc": 0.9108762520703525, "train_speed(iter/s)": 0.118446 }, { "epoch": 0.08549292011755276, "eval_loss": 0.2345978170633316, "eval_runtime": 39.1184, "eval_samples_per_second": 15.44, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9226018143215595, "step": 80 }, { "epoch": 0.09083622762489982, "grad_norm": 0.8545597195625305, "learning_rate": 9.977359612865424e-06, "loss": 0.24537258148193358, "memory(GiB)": 28.88, "step": 85, "token_acc": 0.9151461409910985, "train_speed(iter/s)": 0.112121 }, { "epoch": 0.09617953513224686, "grad_norm": 0.8722995519638062, "learning_rate": 9.974620019358046e-06, "loss": 0.2280275344848633, "memory(GiB)": 28.88, "step": 90, "token_acc": 0.9271924248000804, "train_speed(iter/s)": 0.113659 }, { "epoch": 0.10152284263959391, "grad_norm": 0.8093506097793579, "learning_rate": 9.971724422904154e-06, "loss": 0.22236292362213134, "memory(GiB)": 28.88, "step": 95, "token_acc": 0.9146961880194898, "train_speed(iter/s)": 0.115242 }, { "epoch": 0.10686615014694095, "grad_norm": 0.8567050695419312, "learning_rate": 9.968672914308995e-06, "loss": 0.2444392681121826, "memory(GiB)": 28.88, "step": 100, "token_acc": 0.9146307217073856, "train_speed(iter/s)": 0.116624 }, { "epoch": 0.10686615014694095, "eval_loss": 0.2300056368112564, "eval_runtime": 39.1847, "eval_samples_per_second": 15.414, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9230350211241931, "step": 100 }, { "epoch": 0.112209457654288, "grad_norm": 0.812283992767334, "learning_rate": 9.965465589267176e-06, "loss": 0.23651769161224365, "memory(GiB)": 28.88, "step": 105, "token_acc": 0.9196928564126446, "train_speed(iter/s)": 0.111612 }, { "epoch": 0.11755276516163506, "grad_norm": 0.7877810597419739, "learning_rate": 9.96210254835968e-06, "loss": 0.24004974365234374, "memory(GiB)": 28.88, "step": 110, "token_acc": 0.9218568537014301, "train_speed(iter/s)": 0.112971 }, { "epoch": 0.1228960726689821, "grad_norm": 0.7748284935951233, "learning_rate": 9.9585838970507e-06, "loss": 0.22294034957885742, "memory(GiB)": 28.88, "step": 115, "token_acc": 0.9274852331280633, "train_speed(iter/s)": 0.114143 }, { "epoch": 0.12823938017632916, "grad_norm": 0.8219108581542969, "learning_rate": 9.954909745684339e-06, "loss": 0.22394099235534667, "memory(GiB)": 28.88, "step": 120, "token_acc": 0.9172609252148474, "train_speed(iter/s)": 0.115495 }, { "epoch": 0.12823938017632916, "eval_loss": 0.227636456489563, "eval_runtime": 39.1617, "eval_samples_per_second": 15.423, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9239829290784705, "step": 120 }, { "epoch": 0.1335826876836762, "grad_norm": 0.8432355523109436, "learning_rate": 9.951080209481138e-06, "loss": 0.2368373155593872, "memory(GiB)": 28.88, "step": 125, "token_acc": 0.9205502826063773, "train_speed(iter/s)": 0.111608 }, { "epoch": 0.13892599519102325, "grad_norm": 0.916549801826477, "learning_rate": 9.947095408534483e-06, "loss": 0.25100035667419435, "memory(GiB)": 28.88, "step": 130, "token_acc": 0.9245168060115764, "train_speed(iter/s)": 0.113003 }, { "epoch": 0.1442693026983703, "grad_norm": 0.81369948387146, "learning_rate": 9.94295546780682e-06, "loss": 0.2301029920578003, "memory(GiB)": 28.88, "step": 135, "token_acc": 0.9171662305832478, "train_speed(iter/s)": 0.113963 }, { "epoch": 0.14961261020571734, "grad_norm": 0.9230105876922607, "learning_rate": 9.93866051712574e-06, "loss": 0.23561997413635255, "memory(GiB)": 28.88, "step": 140, "token_acc": 0.9211832729821282, "train_speed(iter/s)": 0.115079 }, { "epoch": 0.14961261020571734, "eval_loss": 0.22445102035999298, "eval_runtime": 39.1558, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9246691972806622, "step": 140 }, { "epoch": 0.15495591771306438, "grad_norm": 0.7951855063438416, "learning_rate": 9.934210691179918e-06, "loss": 0.241453218460083, "memory(GiB)": 28.88, "step": 145, "token_acc": 0.9198347485752214, "train_speed(iter/s)": 0.111598 }, { "epoch": 0.16029922522041143, "grad_norm": 0.8992099165916443, "learning_rate": 9.929606129514875e-06, "loss": 0.24097092151641847, "memory(GiB)": 28.88, "step": 150, "token_acc": 0.9258129774449712, "train_speed(iter/s)": 0.112709 }, { "epoch": 0.16564253272775847, "grad_norm": 0.8297993540763855, "learning_rate": 9.924846976528618e-06, "loss": 0.2204681158065796, "memory(GiB)": 28.88, "step": 155, "token_acc": 0.9230016313213704, "train_speed(iter/s)": 0.113601 }, { "epoch": 0.17098584023510552, "grad_norm": 0.8261107802391052, "learning_rate": 9.919933381467088e-06, "loss": 0.22699880599975586, "memory(GiB)": 28.88, "step": 160, "token_acc": 0.928885791828651, "train_speed(iter/s)": 0.114559 }, { "epoch": 0.17098584023510552, "eval_loss": 0.22084873914718628, "eval_runtime": 39.2244, "eval_samples_per_second": 15.399, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9256943104076862, "step": 160 }, { "epoch": 0.1763291477424526, "grad_norm": 0.8418117761611938, "learning_rate": 9.91486549841951e-06, "loss": 0.22405543327331542, "memory(GiB)": 28.88, "step": 165, "token_acc": 0.9225325105463167, "train_speed(iter/s)": 0.111753 }, { "epoch": 0.18167245524979964, "grad_norm": 0.8489673733711243, "learning_rate": 9.909643486313533e-06, "loss": 0.23412735462188722, "memory(GiB)": 28.88, "step": 170, "token_acc": 0.9189576348674184, "train_speed(iter/s)": 0.112758 }, { "epoch": 0.18701576275714668, "grad_norm": 0.7228992581367493, "learning_rate": 9.904267508910269e-06, "loss": 0.2174379587173462, "memory(GiB)": 28.88, "step": 175, "token_acc": 0.9282256367080615, "train_speed(iter/s)": 0.113726 }, { "epoch": 0.19235907026449373, "grad_norm": 0.8382998704910278, "learning_rate": 9.898737734799134e-06, "loss": 0.22646231651306153, "memory(GiB)": 28.88, "step": 180, "token_acc": 0.9216335902903067, "train_speed(iter/s)": 0.114665 }, { "epoch": 0.19235907026449373, "eval_loss": 0.21828743815422058, "eval_runtime": 39.1773, "eval_samples_per_second": 15.417, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9263333976709773, "step": 180 }, { "epoch": 0.19770237777184077, "grad_norm": 0.7243833541870117, "learning_rate": 9.89305433739258e-06, "loss": 0.21673502922058105, "memory(GiB)": 28.88, "step": 185, "token_acc": 0.9237459546925566, "train_speed(iter/s)": 0.112007 }, { "epoch": 0.20304568527918782, "grad_norm": 0.891444981098175, "learning_rate": 9.887217494920655e-06, "loss": 0.2255467414855957, "memory(GiB)": 28.88, "step": 190, "token_acc": 0.9196933155753932, "train_speed(iter/s)": 0.112921 }, { "epoch": 0.20838899278653486, "grad_norm": 0.8923943042755127, "learning_rate": 9.881227390425404e-06, "loss": 0.23066916465759277, "memory(GiB)": 28.88, "step": 195, "token_acc": 0.9164964514453868, "train_speed(iter/s)": 0.113717 }, { "epoch": 0.2137323002938819, "grad_norm": 0.8573595881462097, "learning_rate": 9.875084211755127e-06, "loss": 0.22467041015625, "memory(GiB)": 28.88, "step": 200, "token_acc": 0.9248140544542792, "train_speed(iter/s)": 0.114571 }, { "epoch": 0.2137323002938819, "eval_loss": 0.2157127410173416, "eval_runtime": 39.2065, "eval_samples_per_second": 15.406, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9273456432692101, "step": 200 }, { "epoch": 0.21907560780122895, "grad_norm": 0.776364266872406, "learning_rate": 9.868788151558513e-06, "loss": 0.2205509662628174, "memory(GiB)": 28.88, "step": 205, "token_acc": 0.9281972298447889, "train_speed(iter/s)": 0.112238 }, { "epoch": 0.224418915308576, "grad_norm": 0.7892180681228638, "learning_rate": 9.862339407278564e-06, "loss": 0.2204576015472412, "memory(GiB)": 28.88, "step": 210, "token_acc": 0.9257216072890344, "train_speed(iter/s)": 0.113008 }, { "epoch": 0.22976222281592307, "grad_norm": 0.6594001054763794, "learning_rate": 9.855738181146427e-06, "loss": 0.2142805814743042, "memory(GiB)": 28.88, "step": 215, "token_acc": 0.9388168011006501, "train_speed(iter/s)": 0.113727 }, { "epoch": 0.2351055303232701, "grad_norm": 0.8227058053016663, "learning_rate": 9.848984680175049e-06, "loss": 0.2226466417312622, "memory(GiB)": 28.88, "step": 220, "token_acc": 0.9279371197521067, "train_speed(iter/s)": 0.114316 }, { "epoch": 0.2351055303232701, "eval_loss": 0.21347786486148834, "eval_runtime": 39.2014, "eval_samples_per_second": 15.408, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9271354736322889, "step": 220 }, { "epoch": 0.24044883783061716, "grad_norm": 0.7295470237731934, "learning_rate": 9.84207911615267e-06, "loss": 0.21721224784851073, "memory(GiB)": 28.88, "step": 225, "token_acc": 0.9288023754536456, "train_speed(iter/s)": 0.112132 }, { "epoch": 0.2457921453379642, "grad_norm": 0.8412714600563049, "learning_rate": 9.835021705636201e-06, "loss": 0.22250523567199706, "memory(GiB)": 28.88, "step": 230, "token_acc": 0.9202628419788969, "train_speed(iter/s)": 0.112833 }, { "epoch": 0.25113545284531125, "grad_norm": 0.8208211064338684, "learning_rate": 9.827812669944423e-06, "loss": 0.2162861108779907, "memory(GiB)": 28.88, "step": 235, "token_acc": 0.9284959688629414, "train_speed(iter/s)": 0.113507 }, { "epoch": 0.2564787603526583, "grad_norm": 0.849270224571228, "learning_rate": 9.82045223515105e-06, "loss": 0.23803000450134276, "memory(GiB)": 28.88, "step": 240, "token_acc": 0.9182260788924586, "train_speed(iter/s)": 0.114162 }, { "epoch": 0.2564787603526583, "eval_loss": 0.21100114285945892, "eval_runtime": 39.173, "eval_samples_per_second": 15.419, "eval_steps_per_second": 3.855, "eval_token_acc": 0.9279032361834909, "step": 240 }, { "epoch": 0.26182206786000534, "grad_norm": 0.787993311882019, "learning_rate": 9.812940632077629e-06, "loss": 0.22567553520202638, "memory(GiB)": 28.88, "step": 245, "token_acc": 0.9222485073786189, "train_speed(iter/s)": 0.112214 }, { "epoch": 0.2671653753673524, "grad_norm": 0.8460186123847961, "learning_rate": 9.805278096286318e-06, "loss": 0.23234963417053223, "memory(GiB)": 28.88, "step": 250, "token_acc": 0.9275632677484074, "train_speed(iter/s)": 0.112833 }, { "epoch": 0.2725086828746994, "grad_norm": 0.8697513937950134, "learning_rate": 9.797464868072489e-06, "loss": 0.21605942249298096, "memory(GiB)": 28.88, "step": 255, "token_acc": 0.9228022233953358, "train_speed(iter/s)": 0.11345 }, { "epoch": 0.2778519903820465, "grad_norm": 0.7353094816207886, "learning_rate": 9.789501192457188e-06, "loss": 0.20451416969299316, "memory(GiB)": 28.88, "step": 260, "token_acc": 0.9241443920719032, "train_speed(iter/s)": 0.11402 }, { "epoch": 0.2778519903820465, "eval_loss": 0.20931538939476013, "eval_runtime": 39.2202, "eval_samples_per_second": 15.4, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9285251667417272, "step": 260 }, { "epoch": 0.2831952978893935, "grad_norm": 0.7424066066741943, "learning_rate": 9.781387319179465e-06, "loss": 0.21903395652770996, "memory(GiB)": 28.88, "step": 265, "token_acc": 0.9232737136934036, "train_speed(iter/s)": 0.112272 }, { "epoch": 0.2885386053967406, "grad_norm": 0.7803475856781006, "learning_rate": 9.773123502688532e-06, "loss": 0.20421533584594725, "memory(GiB)": 28.88, "step": 270, "token_acc": 0.9321742863275381, "train_speed(iter/s)": 0.112863 }, { "epoch": 0.2938819129040876, "grad_norm": 0.9278019666671753, "learning_rate": 9.764710002135784e-06, "loss": 0.2212052345275879, "memory(GiB)": 28.88, "step": 275, "token_acc": 0.9205871656693225, "train_speed(iter/s)": 0.113471 }, { "epoch": 0.2992252204114347, "grad_norm": 0.8120574355125427, "learning_rate": 9.756147081366673e-06, "loss": 0.22279133796691894, "memory(GiB)": 28.88, "step": 280, "token_acc": 0.9252955870108243, "train_speed(iter/s)": 0.114018 }, { "epoch": 0.2992252204114347, "eval_loss": 0.20953305065631866, "eval_runtime": 39.1588, "eval_samples_per_second": 15.424, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9285980827382101, "step": 280 }, { "epoch": 0.30456852791878175, "grad_norm": 0.7851824164390564, "learning_rate": 9.747435008912438e-06, "loss": 0.2276832103729248, "memory(GiB)": 28.88, "step": 285, "token_acc": 0.9250812487968384, "train_speed(iter/s)": 0.11243 }, { "epoch": 0.30991183542612877, "grad_norm": 0.7576037049293518, "learning_rate": 9.73857405798168e-06, "loss": 0.21578505039215087, "memory(GiB)": 28.88, "step": 290, "token_acc": 0.9281283843654622, "train_speed(iter/s)": 0.113065 }, { "epoch": 0.31525514293347584, "grad_norm": 0.7560254335403442, "learning_rate": 9.729564506451791e-06, "loss": 0.2222808837890625, "memory(GiB)": 28.88, "step": 295, "token_acc": 0.9222295127257658, "train_speed(iter/s)": 0.11355 }, { "epoch": 0.32059845044082286, "grad_norm": 0.7463079690933228, "learning_rate": 9.720406636860252e-06, "loss": 0.2202146530151367, "memory(GiB)": 28.88, "step": 300, "token_acc": 0.9305221834918271, "train_speed(iter/s)": 0.11405 }, { "epoch": 0.32059845044082286, "eval_loss": 0.20838645100593567, "eval_runtime": 39.2025, "eval_samples_per_second": 15.407, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9288683008428231, "step": 300 }, { "epoch": 0.32594175794816993, "grad_norm": 0.7550280690193176, "learning_rate": 9.711100736395758e-06, "loss": 0.22840361595153807, "memory(GiB)": 28.88, "step": 305, "token_acc": 0.9224960254372019, "train_speed(iter/s)": 0.112523 }, { "epoch": 0.33128506545551695, "grad_norm": 0.8339446187019348, "learning_rate": 9.70164709688922e-06, "loss": 0.23481903076171876, "memory(GiB)": 28.88, "step": 310, "token_acc": 0.9225257780131461, "train_speed(iter/s)": 0.113092 }, { "epoch": 0.336628372962864, "grad_norm": 0.7605867385864258, "learning_rate": 9.69204601480461e-06, "loss": 0.20481536388397217, "memory(GiB)": 28.88, "step": 315, "token_acc": 0.925222651655184, "train_speed(iter/s)": 0.113604 }, { "epoch": 0.34197168047021104, "grad_norm": 0.7798689007759094, "learning_rate": 9.682297791229668e-06, "loss": 0.21814496517181398, "memory(GiB)": 28.88, "step": 320, "token_acc": 0.9192225342713344, "train_speed(iter/s)": 0.114156 }, { "epoch": 0.34197168047021104, "eval_loss": 0.20693515241146088, "eval_runtime": 39.111, "eval_samples_per_second": 15.443, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9290999163610628, "step": 320 }, { "epoch": 0.3473149879775581, "grad_norm": 0.7268094420433044, "learning_rate": 9.67240273186646e-06, "loss": 0.21229307651519774, "memory(GiB)": 28.88, "step": 325, "token_acc": 0.9255569407555965, "train_speed(iter/s)": 0.112688 }, { "epoch": 0.3526582954849052, "grad_norm": 0.8207147121429443, "learning_rate": 9.66236114702178e-06, "loss": 0.23093049526214598, "memory(GiB)": 28.88, "step": 330, "token_acc": 0.9176510638297872, "train_speed(iter/s)": 0.113194 }, { "epoch": 0.3580016029922522, "grad_norm": 0.7545695304870605, "learning_rate": 9.652173351597435e-06, "loss": 0.22740473747253417, "memory(GiB)": 28.88, "step": 335, "token_acc": 0.9171260565968394, "train_speed(iter/s)": 0.113721 }, { "epoch": 0.36334491049959927, "grad_norm": 0.7324105501174927, "learning_rate": 9.641839665080363e-06, "loss": 0.2098468780517578, "memory(GiB)": 28.88, "step": 340, "token_acc": 0.9380478206427763, "train_speed(iter/s)": 0.114179 }, { "epoch": 0.36334491049959927, "eval_loss": 0.20602919161319733, "eval_runtime": 39.2173, "eval_samples_per_second": 15.401, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9290613137746896, "step": 340 }, { "epoch": 0.3686882180069463, "grad_norm": 0.8628275394439697, "learning_rate": 9.631360411532609e-06, "loss": 0.20752103328704835, "memory(GiB)": 31.36, "step": 345, "token_acc": 0.927929963287207, "train_speed(iter/s)": 0.112824 }, { "epoch": 0.37403152551429336, "grad_norm": 0.8375836610794067, "learning_rate": 9.620735919581168e-06, "loss": 0.21789727210998536, "memory(GiB)": 31.36, "step": 350, "token_acc": 0.9248612142427778, "train_speed(iter/s)": 0.113287 }, { "epoch": 0.3793748330216404, "grad_norm": 0.7989551424980164, "learning_rate": 9.609966522407678e-06, "loss": 0.21387200355529784, "memory(GiB)": 31.36, "step": 355, "token_acc": 0.9233053363875456, "train_speed(iter/s)": 0.11376 }, { "epoch": 0.38471814052898745, "grad_norm": 0.8270795941352844, "learning_rate": 9.599052557737973e-06, "loss": 0.2108246088027954, "memory(GiB)": 31.36, "step": 360, "token_acc": 0.9243235094595613, "train_speed(iter/s)": 0.114225 }, { "epoch": 0.38471814052898745, "eval_loss": 0.2033926397562027, "eval_runtime": 39.2398, "eval_samples_per_second": 15.393, "eval_steps_per_second": 3.848, "eval_token_acc": 0.9299148598511656, "step": 360 }, { "epoch": 0.39006144803633447, "grad_norm": 0.7160355448722839, "learning_rate": 9.58799436783149e-06, "loss": 0.19975266456604004, "memory(GiB)": 31.36, "step": 365, "token_acc": 0.9291021882882269, "train_speed(iter/s)": 0.112953 }, { "epoch": 0.39540475554368154, "grad_norm": 0.7318681478500366, "learning_rate": 9.576792299470537e-06, "loss": 0.2046557903289795, "memory(GiB)": 31.36, "step": 370, "token_acc": 0.9262238367064521, "train_speed(iter/s)": 0.113418 }, { "epoch": 0.4007480630510286, "grad_norm": 0.741698682308197, "learning_rate": 9.565446703949417e-06, "loss": 0.20282254219055176, "memory(GiB)": 31.36, "step": 375, "token_acc": 0.9226779000139257, "train_speed(iter/s)": 0.113823 }, { "epoch": 0.40609137055837563, "grad_norm": 0.8014733791351318, "learning_rate": 9.55395793706341e-06, "loss": 0.20639050006866455, "memory(GiB)": 31.36, "step": 380, "token_acc": 0.9284898178641038, "train_speed(iter/s)": 0.11425 }, { "epoch": 0.40609137055837563, "eval_loss": 0.20332778990268707, "eval_runtime": 39.1548, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9302279697184156, "step": 380 }, { "epoch": 0.4114346780657227, "grad_norm": 0.7752350568771362, "learning_rate": 9.542326359097619e-06, "loss": 0.21643948554992676, "memory(GiB)": 31.36, "step": 385, "token_acc": 0.9229331980070123, "train_speed(iter/s)": 0.113068 }, { "epoch": 0.4167779855730697, "grad_norm": 0.7383193373680115, "learning_rate": 9.530552334815672e-06, "loss": 0.23035106658935547, "memory(GiB)": 31.36, "step": 390, "token_acc": 0.9227077075527204, "train_speed(iter/s)": 0.113539 }, { "epoch": 0.4221212930804168, "grad_norm": 0.826757550239563, "learning_rate": 9.518636233448276e-06, "loss": 0.2027421474456787, "memory(GiB)": 31.36, "step": 395, "token_acc": 0.9212801395939086, "train_speed(iter/s)": 0.113879 }, { "epoch": 0.4274646005877638, "grad_norm": 0.8301543593406677, "learning_rate": 9.506578428681648e-06, "loss": 0.2126997232437134, "memory(GiB)": 31.36, "step": 400, "token_acc": 0.9254076176645323, "train_speed(iter/s)": 0.114274 }, { "epoch": 0.4274646005877638, "eval_loss": 0.20262883603572845, "eval_runtime": 39.1295, "eval_samples_per_second": 15.436, "eval_steps_per_second": 3.859, "eval_token_acc": 0.930107872783032, "step": 400 }, { "epoch": 0.4328079080951109, "grad_norm": 0.7817525863647461, "learning_rate": 9.494379298645788e-06, "loss": 0.20061790943145752, "memory(GiB)": 31.36, "step": 405, "token_acc": 0.9269484626175747, "train_speed(iter/s)": 0.113056 }, { "epoch": 0.4381512156024579, "grad_norm": 0.7769330143928528, "learning_rate": 9.482039225902623e-06, "loss": 0.20687649250030518, "memory(GiB)": 31.36, "step": 410, "token_acc": 0.9282657499649222, "train_speed(iter/s)": 0.113426 }, { "epoch": 0.443494523109805, "grad_norm": 0.7487837076187134, "learning_rate": 9.469558597434018e-06, "loss": 0.20723772048950195, "memory(GiB)": 31.36, "step": 415, "token_acc": 0.9278717406624384, "train_speed(iter/s)": 0.113801 }, { "epoch": 0.448837830617152, "grad_norm": 0.7757744789123535, "learning_rate": 9.456937804629623e-06, "loss": 0.2050936698913574, "memory(GiB)": 31.36, "step": 420, "token_acc": 0.9330446275414077, "train_speed(iter/s)": 0.114158 }, { "epoch": 0.448837830617152, "eval_loss": 0.2025863230228424, "eval_runtime": 39.128, "eval_samples_per_second": 15.437, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9305839713483025, "step": 420 }, { "epoch": 0.45418113812449906, "grad_norm": 0.7080808877944946, "learning_rate": 9.444177243274619e-06, "loss": 0.1936201810836792, "memory(GiB)": 31.36, "step": 425, "token_acc": 0.9299704947798456, "train_speed(iter/s)": 0.113084 }, { "epoch": 0.45952444563184613, "grad_norm": 0.647037148475647, "learning_rate": 9.43127731353729e-06, "loss": 0.1949155330657959, "memory(GiB)": 31.36, "step": 430, "token_acc": 0.9309410335482665, "train_speed(iter/s)": 0.113375 }, { "epoch": 0.46486775313919315, "grad_norm": 0.8139774203300476, "learning_rate": 9.418238419956484e-06, "loss": 0.21164326667785643, "memory(GiB)": 31.36, "step": 435, "token_acc": 0.926740549360892, "train_speed(iter/s)": 0.113761 }, { "epoch": 0.4702110606465402, "grad_norm": 0.7926304340362549, "learning_rate": 9.405060971428924e-06, "loss": 0.21458048820495607, "memory(GiB)": 31.36, "step": 440, "token_acc": 0.9295489624421197, "train_speed(iter/s)": 0.11413 }, { "epoch": 0.4702110606465402, "eval_loss": 0.20057949423789978, "eval_runtime": 39.1388, "eval_samples_per_second": 15.432, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9312059019065388, "step": 440 }, { "epoch": 0.47555436815388724, "grad_norm": 0.7835922837257385, "learning_rate": 9.391745381196382e-06, "loss": 0.20983607769012452, "memory(GiB)": 31.36, "step": 445, "token_acc": 0.9275723362435434, "train_speed(iter/s)": 0.113097 }, { "epoch": 0.4808976756612343, "grad_norm": 0.7004543542861938, "learning_rate": 9.378292066832723e-06, "loss": 0.21295685768127443, "memory(GiB)": 31.36, "step": 450, "token_acc": 0.9304546811262561, "train_speed(iter/s)": 0.113495 }, { "epoch": 0.48624098316858133, "grad_norm": 0.7315278053283691, "learning_rate": 9.364701450230813e-06, "loss": 0.19759070873260498, "memory(GiB)": 31.36, "step": 455, "token_acc": 0.9323673009514272, "train_speed(iter/s)": 0.113833 }, { "epoch": 0.4915842906759284, "grad_norm": 0.8716449737548828, "learning_rate": 9.350973957589278e-06, "loss": 0.21045897006988526, "memory(GiB)": 31.36, "step": 460, "token_acc": 0.928, "train_speed(iter/s)": 0.114182 }, { "epoch": 0.4915842906759284, "eval_loss": 0.20007076859474182, "eval_runtime": 39.0359, "eval_samples_per_second": 15.473, "eval_steps_per_second": 3.868, "eval_token_acc": 0.9313774689570868, "step": 460 }, { "epoch": 0.4969275981832754, "grad_norm": 0.8024523258209229, "learning_rate": 9.33711001939915e-06, "loss": 0.22184643745422364, "memory(GiB)": 31.36, "step": 465, "token_acc": 0.9268073439852581, "train_speed(iter/s)": 0.113198 }, { "epoch": 0.5022709056906225, "grad_norm": 0.7982778549194336, "learning_rate": 9.32311007043036e-06, "loss": 0.2020054817199707, "memory(GiB)": 31.36, "step": 470, "token_acc": 0.9326877815883263, "train_speed(iter/s)": 0.113499 }, { "epoch": 0.5076142131979695, "grad_norm": 0.7509546279907227, "learning_rate": 9.30897454971811e-06, "loss": 0.20111818313598634, "memory(GiB)": 31.36, "step": 475, "token_acc": 0.9371612406895828, "train_speed(iter/s)": 0.113839 }, { "epoch": 0.5129575207053166, "grad_norm": 0.7862565517425537, "learning_rate": 9.294703900549096e-06, "loss": 0.20246634483337403, "memory(GiB)": 31.36, "step": 480, "token_acc": 0.9213386062277757, "train_speed(iter/s)": 0.114112 }, { "epoch": 0.5129575207053166, "eval_loss": 0.19960449635982513, "eval_runtime": 39.1156, "eval_samples_per_second": 15.441, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9314846983636793, "step": 480 }, { "epoch": 0.5183008282126637, "grad_norm": 0.7772889733314514, "learning_rate": 9.280298570447612e-06, "loss": 0.20028786659240722, "memory(GiB)": 31.36, "step": 485, "token_acc": 0.9264812360063717, "train_speed(iter/s)": 0.113159 }, { "epoch": 0.5236441357200107, "grad_norm": 0.6927475929260254, "learning_rate": 9.265759011161519e-06, "loss": 0.19862596988677977, "memory(GiB)": 31.36, "step": 490, "token_acc": 0.9293699036323202, "train_speed(iter/s)": 0.113461 }, { "epoch": 0.5289874432273577, "grad_norm": 0.7092183828353882, "learning_rate": 9.251085678648072e-06, "loss": 0.20681967735290527, "memory(GiB)": 31.36, "step": 495, "token_acc": 0.9285693844260653, "train_speed(iter/s)": 0.113757 }, { "epoch": 0.5343307507347048, "grad_norm": 0.7293869853019714, "learning_rate": 9.236279033059622e-06, "loss": 0.21075177192687988, "memory(GiB)": 31.36, "step": 500, "token_acc": 0.9218673562093788, "train_speed(iter/s)": 0.114071 }, { "epoch": 0.5343307507347048, "eval_loss": 0.1984926164150238, "eval_runtime": 39.2663, "eval_samples_per_second": 15.382, "eval_steps_per_second": 3.846, "eval_token_acc": 0.9314375174247785, "step": 500 }, { "epoch": 0.5396740582420518, "grad_norm": 0.8831562995910645, "learning_rate": 9.221339538729191e-06, "loss": 0.21746454238891602, "memory(GiB)": 31.36, "step": 505, "token_acc": 0.9285247936980231, "train_speed(iter/s)": 0.113113 }, { "epoch": 0.5450173657493989, "grad_norm": 0.7981932163238525, "learning_rate": 9.206267664155906e-06, "loss": 0.2134779691696167, "memory(GiB)": 31.36, "step": 510, "token_acc": 0.9249437382154371, "train_speed(iter/s)": 0.113434 }, { "epoch": 0.550360673256746, "grad_norm": 0.7087724208831787, "learning_rate": 9.191063881990308e-06, "loss": 0.2046900749206543, "memory(GiB)": 31.36, "step": 515, "token_acc": 0.9302552606840697, "train_speed(iter/s)": 0.113724 }, { "epoch": 0.555703980764093, "grad_norm": 0.7404990196228027, "learning_rate": 9.17572866901953e-06, "loss": 0.2034606456756592, "memory(GiB)": 31.36, "step": 520, "token_acc": 0.9364186566673245, "train_speed(iter/s)": 0.113991 }, { "epoch": 0.555703980764093, "eval_loss": 0.19815480709075928, "eval_runtime": 39.1389, "eval_samples_per_second": 15.432, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9321194964507067, "step": 520 }, { "epoch": 0.56104728827144, "grad_norm": 0.7702794671058655, "learning_rate": 9.160262506152343e-06, "loss": 0.20876009464263917, "memory(GiB)": 31.36, "step": 525, "token_acc": 0.9260557562023464, "train_speed(iter/s)": 0.113137 }, { "epoch": 0.566390595778787, "grad_norm": 0.7707433104515076, "learning_rate": 9.14466587840408e-06, "loss": 0.2181908369064331, "memory(GiB)": 31.36, "step": 530, "token_acc": 0.9201792801423293, "train_speed(iter/s)": 0.113398 }, { "epoch": 0.5717339032861342, "grad_norm": 0.7392153143882751, "learning_rate": 9.12893927488142e-06, "loss": 0.2074495792388916, "memory(GiB)": 31.36, "step": 535, "token_acc": 0.9264089572777882, "train_speed(iter/s)": 0.113707 }, { "epoch": 0.5770772107934812, "grad_norm": 0.6916921138763428, "learning_rate": 9.113083188767057e-06, "loss": 0.192901611328125, "memory(GiB)": 31.36, "step": 540, "token_acc": 0.9276745841693095, "train_speed(iter/s)": 0.114011 }, { "epoch": 0.5770772107934812, "eval_loss": 0.1964673101902008, "eval_runtime": 39.2785, "eval_samples_per_second": 15.377, "eval_steps_per_second": 3.844, "eval_token_acc": 0.9323039310300457, "step": 540 }, { "epoch": 0.5824205183008282, "grad_norm": 0.7404196262359619, "learning_rate": 9.097098117304223e-06, "loss": 0.19912216663360596, "memory(GiB)": 31.36, "step": 545, "token_acc": 0.9267058907942886, "train_speed(iter/s)": 0.113126 }, { "epoch": 0.5877638258081752, "grad_norm": 0.8616334795951843, "learning_rate": 9.08098456178111e-06, "loss": 0.22115416526794435, "memory(GiB)": 31.36, "step": 550, "token_acc": 0.9247433468141067, "train_speed(iter/s)": 0.113428 }, { "epoch": 0.5931071333155223, "grad_norm": 0.6713617444038391, "learning_rate": 9.064743027515127e-06, "loss": 0.20437276363372803, "memory(GiB)": 31.36, "step": 555, "token_acc": 0.9308613364233295, "train_speed(iter/s)": 0.113724 }, { "epoch": 0.5984504408228694, "grad_norm": 0.7443351745605469, "learning_rate": 9.048374023837086e-06, "loss": 0.21196300983428956, "memory(GiB)": 31.36, "step": 560, "token_acc": 0.9288690476190476, "train_speed(iter/s)": 0.113984 }, { "epoch": 0.5984504408228694, "eval_loss": 0.1953776627779007, "eval_runtime": 39.1331, "eval_samples_per_second": 15.435, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9320208453966415, "step": 560 }, { "epoch": 0.6037937483302164, "grad_norm": 0.7687556147575378, "learning_rate": 9.03187806407519e-06, "loss": 0.20800457000732422, "memory(GiB)": 31.36, "step": 565, "token_acc": 0.9276673510367078, "train_speed(iter/s)": 0.113135 }, { "epoch": 0.6091370558375635, "grad_norm": 0.7980899214744568, "learning_rate": 9.015255665538972e-06, "loss": 0.2199338912963867, "memory(GiB)": 31.36, "step": 570, "token_acc": 0.9232478287214704, "train_speed(iter/s)": 0.113444 }, { "epoch": 0.6144803633449105, "grad_norm": 0.7510707974433899, "learning_rate": 8.998507349503048e-06, "loss": 0.19736554622650146, "memory(GiB)": 31.36, "step": 575, "token_acc": 0.9285859751544362, "train_speed(iter/s)": 0.113724 }, { "epoch": 0.6198236708522575, "grad_norm": 0.8514485955238342, "learning_rate": 8.981633641190779e-06, "loss": 0.21196255683898926, "memory(GiB)": 31.36, "step": 580, "token_acc": 0.9259536976286438, "train_speed(iter/s)": 0.113995 }, { "epoch": 0.6198236708522575, "eval_loss": 0.19566793739795685, "eval_runtime": 39.1561, "eval_samples_per_second": 15.425, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9323511119689464, "step": 580 }, { "epoch": 0.6251669783596046, "grad_norm": 0.7959375381469727, "learning_rate": 8.964635069757803e-06, "loss": 0.20478439331054688, "memory(GiB)": 31.36, "step": 585, "token_acc": 0.9299914280010803, "train_speed(iter/s)": 0.113198 }, { "epoch": 0.6305102858669517, "grad_norm": 0.6840202808380127, "learning_rate": 8.94751216827543e-06, "loss": 0.20479016304016112, "memory(GiB)": 31.36, "step": 590, "token_acc": 0.930996839988509, "train_speed(iter/s)": 0.113473 }, { "epoch": 0.6358535933742987, "grad_norm": 0.7271625399589539, "learning_rate": 8.930265473713939e-06, "loss": 0.18621822595596313, "memory(GiB)": 31.36, "step": 595, "token_acc": 0.9302919832373684, "train_speed(iter/s)": 0.113714 }, { "epoch": 0.6411969008816457, "grad_norm": 0.7664585113525391, "learning_rate": 8.912895526925726e-06, "loss": 0.20015628337860109, "memory(GiB)": 31.36, "step": 600, "token_acc": 0.9368948705623841, "train_speed(iter/s)": 0.113966 }, { "epoch": 0.6411969008816457, "eval_loss": 0.19422343373298645, "eval_runtime": 39.1797, "eval_samples_per_second": 15.416, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9324712089043299, "step": 600 }, { "epoch": 0.6465402083889927, "grad_norm": 0.7383084297180176, "learning_rate": 8.895402872628352e-06, "loss": 0.20294442176818847, "memory(GiB)": 31.36, "step": 605, "token_acc": 0.9330581241743725, "train_speed(iter/s)": 0.113194 }, { "epoch": 0.6518835158963399, "grad_norm": 0.7372247576713562, "learning_rate": 8.87778805938746e-06, "loss": 0.20596041679382324, "memory(GiB)": 31.36, "step": 610, "token_acc": 0.9245759105003952, "train_speed(iter/s)": 0.11346 }, { "epoch": 0.6572268234036869, "grad_norm": 0.7220420241355896, "learning_rate": 8.86005163959956e-06, "loss": 0.20566608905792236, "memory(GiB)": 31.36, "step": 615, "token_acc": 0.9246050420168067, "train_speed(iter/s)": 0.113732 }, { "epoch": 0.6625701309110339, "grad_norm": 0.687303900718689, "learning_rate": 8.842194169474727e-06, "loss": 0.20316667556762696, "memory(GiB)": 31.36, "step": 620, "token_acc": 0.929285897482871, "train_speed(iter/s)": 0.114 }, { "epoch": 0.6625701309110339, "eval_loss": 0.19414789974689484, "eval_runtime": 39.2021, "eval_samples_per_second": 15.407, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9324969439619121, "step": 620 }, { "epoch": 0.667913438418381, "grad_norm": 0.7889045476913452, "learning_rate": 8.824216209019139e-06, "loss": 0.2020263195037842, "memory(GiB)": 31.36, "step": 625, "token_acc": 0.9281722400366468, "train_speed(iter/s)": 0.113212 }, { "epoch": 0.673256745925728, "grad_norm": 0.7296798229217529, "learning_rate": 8.806118322017525e-06, "loss": 0.19929354190826415, "memory(GiB)": 31.36, "step": 630, "token_acc": 0.9315762957365074, "train_speed(iter/s)": 0.113443 }, { "epoch": 0.6786000534330751, "grad_norm": 0.7287123799324036, "learning_rate": 8.787901076015487e-06, "loss": 0.20486984252929688, "memory(GiB)": 31.36, "step": 635, "token_acc": 0.9216136550589525, "train_speed(iter/s)": 0.113669 }, { "epoch": 0.6839433609404221, "grad_norm": 0.7702438831329346, "learning_rate": 8.769565042301692e-06, "loss": 0.21484546661376952, "memory(GiB)": 31.36, "step": 640, "token_acc": 0.9197191255248299, "train_speed(iter/s)": 0.113928 }, { "epoch": 0.6839433609404221, "eval_loss": 0.19368213415145874, "eval_runtime": 39.232, "eval_samples_per_second": 15.396, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9326599326599326, "step": 640 }, { "epoch": 0.6892866684477692, "grad_norm": 0.7215872406959534, "learning_rate": 8.751110795889966e-06, "loss": 0.20535902976989745, "memory(GiB)": 31.36, "step": 645, "token_acc": 0.9300116508547923, "train_speed(iter/s)": 0.113175 }, { "epoch": 0.6946299759551162, "grad_norm": 0.7084954380989075, "learning_rate": 8.732538915501257e-06, "loss": 0.19380364418029786, "memory(GiB)": 31.36, "step": 650, "token_acc": 0.933890160921023, "train_speed(iter/s)": 0.113399 }, { "epoch": 0.6999732834624632, "grad_norm": 0.7394425272941589, "learning_rate": 8.71384998354549e-06, "loss": 0.19607152938842773, "memory(GiB)": 31.36, "step": 655, "token_acc": 0.9408127208480566, "train_speed(iter/s)": 0.113636 }, { "epoch": 0.7053165909698104, "grad_norm": 0.6766705513000488, "learning_rate": 8.695044586103297e-06, "loss": 0.19869532585144042, "memory(GiB)": 31.36, "step": 660, "token_acc": 0.9315438606795688, "train_speed(iter/s)": 0.113856 }, { "epoch": 0.7053165909698104, "eval_loss": 0.19304147362709045, "eval_runtime": 39.1551, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9326256192498231, "step": 660 }, { "epoch": 0.7106598984771574, "grad_norm": 0.8370521664619446, "learning_rate": 8.676123312907641e-06, "loss": 0.20741744041442872, "memory(GiB)": 31.36, "step": 665, "token_acc": 0.9293741677762982, "train_speed(iter/s)": 0.113158 }, { "epoch": 0.7160032059845044, "grad_norm": 0.6481726765632629, "learning_rate": 8.657086757325328e-06, "loss": 0.19325138330459596, "memory(GiB)": 31.36, "step": 670, "token_acc": 0.9333951984673412, "train_speed(iter/s)": 0.113374 }, { "epoch": 0.7213465134918514, "grad_norm": 0.7585648894309998, "learning_rate": 8.637935516338384e-06, "loss": 0.20734424591064454, "memory(GiB)": 31.36, "step": 675, "token_acc": 0.929390127754606, "train_speed(iter/s)": 0.113613 }, { "epoch": 0.7266898209991985, "grad_norm": 0.6115338206291199, "learning_rate": 8.61867019052535e-06, "loss": 0.1957784414291382, "memory(GiB)": 31.36, "step": 680, "token_acc": 0.9327852845758922, "train_speed(iter/s)": 0.113851 }, { "epoch": 0.7266898209991985, "eval_loss": 0.1919844150543213, "eval_runtime": 39.1768, "eval_samples_per_second": 15.417, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9334705869737717, "step": 680 }, { "epoch": 0.7320331285065456, "grad_norm": 0.763317883014679, "learning_rate": 8.599291384042442e-06, "loss": 0.1990307092666626, "memory(GiB)": 31.36, "step": 685, "token_acc": 0.9309296501645834, "train_speed(iter/s)": 0.113214 }, { "epoch": 0.7373764360138926, "grad_norm": 0.6819867491722107, "learning_rate": 8.579799704604597e-06, "loss": 0.19830925464630128, "memory(GiB)": 31.36, "step": 690, "token_acc": 0.9335458059266292, "train_speed(iter/s)": 0.113398 }, { "epoch": 0.7427197435212396, "grad_norm": 0.7253126502037048, "learning_rate": 8.560195763466428e-06, "loss": 0.20094099044799804, "memory(GiB)": 31.36, "step": 695, "token_acc": 0.9241353978300181, "train_speed(iter/s)": 0.113667 }, { "epoch": 0.7480630510285867, "grad_norm": 0.75684654712677, "learning_rate": 8.540480175403045e-06, "loss": 0.20439500808715821, "memory(GiB)": 31.36, "step": 700, "token_acc": 0.9336929261002508, "train_speed(iter/s)": 0.113873 }, { "epoch": 0.7480630510285867, "eval_loss": 0.19040465354919434, "eval_runtime": 39.1788, "eval_samples_per_second": 15.416, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9335263462651998, "step": 700 }, { "epoch": 0.7534063585359337, "grad_norm": 0.695967972278595, "learning_rate": 8.520653558690785e-06, "loss": 0.2008056879043579, "memory(GiB)": 31.36, "step": 705, "token_acc": 0.9287990912399502, "train_speed(iter/s)": 0.113225 }, { "epoch": 0.7587496660432808, "grad_norm": 0.7082876563072205, "learning_rate": 8.500716535087815e-06, "loss": 0.19893609285354613, "memory(GiB)": 31.36, "step": 710, "token_acc": 0.9335433493079527, "train_speed(iter/s)": 0.113434 }, { "epoch": 0.7640929735506279, "grad_norm": 0.6951336860656738, "learning_rate": 8.480669729814635e-06, "loss": 0.20382363796234132, "memory(GiB)": 31.36, "step": 715, "token_acc": 0.9272279597838725, "train_speed(iter/s)": 0.113684 }, { "epoch": 0.7694362810579749, "grad_norm": 0.690929651260376, "learning_rate": 8.460513771534475e-06, "loss": 0.20613670349121094, "memory(GiB)": 31.36, "step": 720, "token_acc": 0.9379507848960543, "train_speed(iter/s)": 0.113902 }, { "epoch": 0.7694362810579749, "eval_loss": 0.19106899201869965, "eval_runtime": 39.2423, "eval_samples_per_second": 15.392, "eval_steps_per_second": 3.848, "eval_token_acc": 0.9331446095777306, "step": 720 }, { "epoch": 0.7747795885653219, "grad_norm": 0.6783177256584167, "learning_rate": 8.440249292333583e-06, "loss": 0.1977448582649231, "memory(GiB)": 31.36, "step": 725, "token_acc": 0.9308449330614417, "train_speed(iter/s)": 0.11326 }, { "epoch": 0.7801228960726689, "grad_norm": 0.6677674055099487, "learning_rate": 8.41987692770139e-06, "loss": 0.21048130989074706, "memory(GiB)": 31.36, "step": 730, "token_acc": 0.9202530400865319, "train_speed(iter/s)": 0.113463 }, { "epoch": 0.7854662035800161, "grad_norm": 0.8552298545837402, "learning_rate": 8.399397316510596e-06, "loss": 0.20974290370941162, "memory(GiB)": 31.36, "step": 735, "token_acc": 0.9332773814519513, "train_speed(iter/s)": 0.113682 }, { "epoch": 0.7908095110873631, "grad_norm": 0.7160708904266357, "learning_rate": 8.378811100997122e-06, "loss": 0.20636558532714844, "memory(GiB)": 31.36, "step": 740, "token_acc": 0.9329161099782013, "train_speed(iter/s)": 0.113884 }, { "epoch": 0.7908095110873631, "eval_loss": 0.19004826247692108, "eval_runtime": 39.1971, "eval_samples_per_second": 15.409, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9335435029702546, "step": 740 }, { "epoch": 0.7961528185947101, "grad_norm": 0.7204393148422241, "learning_rate": 8.358118926739984e-06, "loss": 0.20534658432006836, "memory(GiB)": 31.36, "step": 745, "token_acc": 0.9301107044622764, "train_speed(iter/s)": 0.113248 }, { "epoch": 0.8014961261020572, "grad_norm": 0.7450360059738159, "learning_rate": 8.337321442641036e-06, "loss": 0.20312881469726562, "memory(GiB)": 31.36, "step": 750, "token_acc": 0.9307398246008545, "train_speed(iter/s)": 0.113436 }, { "epoch": 0.8068394336094042, "grad_norm": 0.7013337016105652, "learning_rate": 8.316419300904622e-06, "loss": 0.20806705951690674, "memory(GiB)": 31.36, "step": 755, "token_acc": 0.9265781630994311, "train_speed(iter/s)": 0.113667 }, { "epoch": 0.8121827411167513, "grad_norm": 0.7928115129470825, "learning_rate": 8.295413157017127e-06, "loss": 0.20586895942687988, "memory(GiB)": 31.36, "step": 760, "token_acc": 0.9280571524250711, "train_speed(iter/s)": 0.113891 }, { "epoch": 0.8121827411167513, "eval_loss": 0.1891859769821167, "eval_runtime": 39.1442, "eval_samples_per_second": 15.43, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9337751184884943, "step": 760 }, { "epoch": 0.8175260486240983, "grad_norm": 0.7362833023071289, "learning_rate": 8.274303669726427e-06, "loss": 0.21117730140686036, "memory(GiB)": 31.36, "step": 765, "token_acc": 0.9295638706201881, "train_speed(iter/s)": 0.113261 }, { "epoch": 0.8228693561314454, "grad_norm": 0.8272237777709961, "learning_rate": 8.25309150102121e-06, "loss": 0.20895204544067383, "memory(GiB)": 31.36, "step": 770, "token_acc": 0.925374677002584, "train_speed(iter/s)": 0.113483 }, { "epoch": 0.8282126636387924, "grad_norm": 0.7175537943840027, "learning_rate": 8.231777316110245e-06, "loss": 0.1944166898727417, "memory(GiB)": 31.36, "step": 775, "token_acc": 0.9321589002543703, "train_speed(iter/s)": 0.11366 }, { "epoch": 0.8335559711461394, "grad_norm": 0.7809334993362427, "learning_rate": 8.210361783401491e-06, "loss": 0.19996525049209596, "memory(GiB)": 31.36, "step": 780, "token_acc": 0.9270649417354178, "train_speed(iter/s)": 0.113842 }, { "epoch": 0.8335559711461394, "eval_loss": 0.18875150382518768, "eval_runtime": 39.1112, "eval_samples_per_second": 15.443, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9345214351583778, "step": 780 }, { "epoch": 0.8388992786534865, "grad_norm": 0.7965995669364929, "learning_rate": 8.188845574481162e-06, "loss": 0.20428986549377443, "memory(GiB)": 31.36, "step": 785, "token_acc": 0.9295484112938198, "train_speed(iter/s)": 0.113243 }, { "epoch": 0.8442425861608336, "grad_norm": 0.8420124053955078, "learning_rate": 8.167229364092648e-06, "loss": 0.2018270969390869, "memory(GiB)": 31.36, "step": 790, "token_acc": 0.9257297598661941, "train_speed(iter/s)": 0.113454 }, { "epoch": 0.8495858936681806, "grad_norm": 0.7496061325073242, "learning_rate": 8.145513830115367e-06, "loss": 0.18817675113677979, "memory(GiB)": 31.36, "step": 795, "token_acc": 0.9294179964245514, "train_speed(iter/s)": 0.113632 }, { "epoch": 0.8549292011755276, "grad_norm": 0.7280667424201965, "learning_rate": 8.1236996535435e-06, "loss": 0.20691485404968263, "memory(GiB)": 31.36, "step": 800, "token_acc": 0.9246175682069074, "train_speed(iter/s)": 0.113856 }, { "epoch": 0.8549292011755276, "eval_loss": 0.18890005350112915, "eval_runtime": 39.0966, "eval_samples_per_second": 15.449, "eval_steps_per_second": 3.862, "eval_token_acc": 0.9341997469386004, "step": 800 }, { "epoch": 0.8602725086828747, "grad_norm": 0.7484925389289856, "learning_rate": 8.101787518464634e-06, "loss": 0.20772714614868165, "memory(GiB)": 31.36, "step": 805, "token_acc": 0.9296667323028454, "train_speed(iter/s)": 0.113284 }, { "epoch": 0.8656158161902218, "grad_norm": 0.791478157043457, "learning_rate": 8.079778112038318e-06, "loss": 0.20092449188232422, "memory(GiB)": 31.36, "step": 810, "token_acc": 0.9342826902722987, "train_speed(iter/s)": 0.113469 }, { "epoch": 0.8709591236975688, "grad_norm": 0.7884016633033752, "learning_rate": 8.057672124474508e-06, "loss": 0.19559590816497802, "memory(GiB)": 31.36, "step": 815, "token_acc": 0.9313420307089644, "train_speed(iter/s)": 0.113673 }, { "epoch": 0.8763024312049158, "grad_norm": 0.7414833307266235, "learning_rate": 8.035470249011916e-06, "loss": 0.21486268043518067, "memory(GiB)": 31.36, "step": 820, "token_acc": 0.9156693981017509, "train_speed(iter/s)": 0.113866 }, { "epoch": 0.8763024312049158, "eval_loss": 0.18826377391815186, "eval_runtime": 39.1249, "eval_samples_per_second": 15.438, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9340796500032169, "step": 820 }, { "epoch": 0.8816457387122629, "grad_norm": 0.6968944072723389, "learning_rate": 8.013173181896283e-06, "loss": 0.195112144947052, "memory(GiB)": 31.36, "step": 825, "token_acc": 0.9297919928427645, "train_speed(iter/s)": 0.113289 }, { "epoch": 0.88698904621961, "grad_norm": 0.6487668752670288, "learning_rate": 7.990781622358535e-06, "loss": 0.20295815467834472, "memory(GiB)": 31.36, "step": 830, "token_acc": 0.9273416807127378, "train_speed(iter/s)": 0.113496 }, { "epoch": 0.892332353726957, "grad_norm": 0.7101380825042725, "learning_rate": 7.968296272592862e-06, "loss": 0.2020167589187622, "memory(GiB)": 31.36, "step": 835, "token_acc": 0.9326549210206562, "train_speed(iter/s)": 0.113672 }, { "epoch": 0.897675661234304, "grad_norm": 0.7018981575965881, "learning_rate": 7.945717837734688e-06, "loss": 0.21045067310333251, "memory(GiB)": 31.36, "step": 840, "token_acc": 0.9254527494237734, "train_speed(iter/s)": 0.113897 }, { "epoch": 0.897675661234304, "eval_loss": 0.18678458034992218, "eval_runtime": 39.117, "eval_samples_per_second": 15.441, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9345257243346415, "step": 840 }, { "epoch": 0.9030189687416511, "grad_norm": 0.7442994713783264, "learning_rate": 7.923047025838573e-06, "loss": 0.18977639675140381, "memory(GiB)": 31.36, "step": 845, "token_acc": 0.9316706328119585, "train_speed(iter/s)": 0.11334 }, { "epoch": 0.9083622762489981, "grad_norm": 0.5974848866462708, "learning_rate": 7.900284547855992e-06, "loss": 0.19339005947113036, "memory(GiB)": 31.36, "step": 850, "token_acc": 0.9319930430681249, "train_speed(iter/s)": 0.113541 }, { "epoch": 0.9137055837563451, "grad_norm": 0.7044218182563782, "learning_rate": 7.87743111761305e-06, "loss": 0.19252583980560303, "memory(GiB)": 31.36, "step": 855, "token_acc": 0.9317780249983639, "train_speed(iter/s)": 0.113714 }, { "epoch": 0.9190488912636923, "grad_norm": 0.6662729978561401, "learning_rate": 7.8544874517881e-06, "loss": 0.18919335603713988, "memory(GiB)": 31.36, "step": 860, "token_acc": 0.9358071036673404, "train_speed(iter/s)": 0.113874 }, { "epoch": 0.9190488912636923, "eval_loss": 0.18598179519176483, "eval_runtime": 39.1151, "eval_samples_per_second": 15.442, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9348645692594737, "step": 860 }, { "epoch": 0.9243921987710393, "grad_norm": 0.7112472057342529, "learning_rate": 7.831454269889251e-06, "loss": 0.2195812225341797, "memory(GiB)": 31.36, "step": 865, "token_acc": 0.929244681467741, "train_speed(iter/s)": 0.113345 }, { "epoch": 0.9297355062783863, "grad_norm": 0.6910869479179382, "learning_rate": 7.808332294231824e-06, "loss": 0.19954900741577147, "memory(GiB)": 31.36, "step": 870, "token_acc": 0.931801259053679, "train_speed(iter/s)": 0.113528 }, { "epoch": 0.9350788137857333, "grad_norm": 0.681770384311676, "learning_rate": 7.785122249915688e-06, "loss": 0.18991070985794067, "memory(GiB)": 31.36, "step": 875, "token_acc": 0.9295779137975007, "train_speed(iter/s)": 0.113699 }, { "epoch": 0.9404221212930804, "grad_norm": 0.8212052583694458, "learning_rate": 7.76182486480253e-06, "loss": 0.1996673822402954, "memory(GiB)": 31.36, "step": 880, "token_acc": 0.9402507040697995, "train_speed(iter/s)": 0.113869 }, { "epoch": 0.9404221212930804, "eval_loss": 0.1853218525648117, "eval_runtime": 39.2228, "eval_samples_per_second": 15.399, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9349203285509018, "step": 880 }, { "epoch": 0.9457654288004275, "grad_norm": 0.7549769878387451, "learning_rate": 7.738440869493018e-06, "loss": 0.19690234661102296, "memory(GiB)": 31.36, "step": 885, "token_acc": 0.9322219434878789, "train_speed(iter/s)": 0.113325 }, { "epoch": 0.9511087363077745, "grad_norm": 0.7041129469871521, "learning_rate": 7.714970997303898e-06, "loss": 0.19316442012786866, "memory(GiB)": 31.36, "step": 890, "token_acc": 0.9402204546877936, "train_speed(iter/s)": 0.113488 }, { "epoch": 0.9564520438151216, "grad_norm": 0.6991894841194153, "learning_rate": 7.691415984244998e-06, "loss": 0.19888077974319457, "memory(GiB)": 31.36, "step": 895, "token_acc": 0.9314758549356779, "train_speed(iter/s)": 0.113654 }, { "epoch": 0.9617953513224686, "grad_norm": 0.6569207906723022, "learning_rate": 7.667776568996143e-06, "loss": 0.19370880126953124, "memory(GiB)": 31.36, "step": 900, "token_acc": 0.9230841325877097, "train_speed(iter/s)": 0.113808 }, { "epoch": 0.9617953513224686, "eval_loss": 0.1854468584060669, "eval_runtime": 39.1165, "eval_samples_per_second": 15.441, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9353878487636449, "step": 900 }, { "epoch": 0.9671386588298156, "grad_norm": 0.6145524382591248, "learning_rate": 7.64405349288399e-06, "loss": 0.18694071769714354, "memory(GiB)": 31.36, "step": 905, "token_acc": 0.9349944519517045, "train_speed(iter/s)": 0.113288 }, { "epoch": 0.9724819663371627, "grad_norm": 0.6918651461601257, "learning_rate": 7.62024749985878e-06, "loss": 0.1850353240966797, "memory(GiB)": 31.36, "step": 910, "token_acc": 0.9334631974398219, "train_speed(iter/s)": 0.113436 }, { "epoch": 0.9778252738445098, "grad_norm": 0.6467410922050476, "learning_rate": 7.596359336471015e-06, "loss": 0.1928159236907959, "memory(GiB)": 31.36, "step": 915, "token_acc": 0.9306192268217585, "train_speed(iter/s)": 0.113579 }, { "epoch": 0.9831685813518568, "grad_norm": 0.6539848446846008, "learning_rate": 7.572389751848037e-06, "loss": 0.19003190994262695, "memory(GiB)": 31.36, "step": 920, "token_acc": 0.9286221470836855, "train_speed(iter/s)": 0.113727 }, { "epoch": 0.9831685813518568, "eval_loss": 0.1848677396774292, "eval_runtime": 39.1075, "eval_samples_per_second": 15.445, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9352377275944155, "step": 920 }, { "epoch": 0.9885118888592038, "grad_norm": 0.630643904209137, "learning_rate": 7.548339497670538e-06, "loss": 0.19637407064437867, "memory(GiB)": 31.36, "step": 925, "token_acc": 0.934642791292936, "train_speed(iter/s)": 0.113199 }, { "epoch": 0.9938551963665508, "grad_norm": 0.7896732091903687, "learning_rate": 7.524209328148995e-06, "loss": 0.1935054898262024, "memory(GiB)": 31.36, "step": 930, "token_acc": 0.9335429563394583, "train_speed(iter/s)": 0.113379 }, { "epoch": 0.999198503873898, "grad_norm": 0.8091479539871216, "learning_rate": 7.500000000000001e-06, "loss": 0.1939959168434143, "memory(GiB)": 31.36, "step": 935, "token_acc": 0.9332101204512141, "train_speed(iter/s)": 0.113561 }, { "epoch": 1.0042746460058776, "grad_norm": 0.7063812017440796, "learning_rate": 7.4757122724225575e-06, "loss": 0.15370899438858032, "memory(GiB)": 31.36, "step": 940, "token_acc": 0.9470076897358742, "train_speed(iter/s)": 0.113758 }, { "epoch": 1.0042746460058776, "eval_loss": 0.1846647411584854, "eval_runtime": 39.1772, "eval_samples_per_second": 15.417, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9355808616955114, "step": 940 }, { "epoch": 1.0096179535132246, "grad_norm": 0.7912789583206177, "learning_rate": 7.451346907074245e-06, "loss": 0.14589121341705322, "memory(GiB)": 31.36, "step": 945, "token_acc": 0.9395386832162834, "train_speed(iter/s)": 0.113271 }, { "epoch": 1.0149612610205718, "grad_norm": 0.773916482925415, "learning_rate": 7.426904668047352e-06, "loss": 0.14080936908721925, "memory(GiB)": 31.36, "step": 950, "token_acc": 0.9501985945615643, "train_speed(iter/s)": 0.113448 }, { "epoch": 1.0203045685279188, "grad_norm": 0.7244420647621155, "learning_rate": 7.40238632184491e-06, "loss": 0.1430067539215088, "memory(GiB)": 31.36, "step": 955, "token_acc": 0.9496945399007255, "train_speed(iter/s)": 0.113636 }, { "epoch": 1.0256478760352659, "grad_norm": 0.7013021111488342, "learning_rate": 7.377792637356644e-06, "loss": 0.13634157180786133, "memory(GiB)": 31.36, "step": 960, "token_acc": 0.9495622671230802, "train_speed(iter/s)": 0.113799 }, { "epoch": 1.0256478760352659, "eval_loss": 0.18845246732234955, "eval_runtime": 39.1642, "eval_samples_per_second": 15.422, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9350661605438676, "step": 960 }, { "epoch": 1.0309911835426129, "grad_norm": 0.6317921876907349, "learning_rate": 7.35312438583488e-06, "loss": 0.14405910968780516, "memory(GiB)": 31.36, "step": 965, "token_acc": 0.94003444829566, "train_speed(iter/s)": 0.113324 }, { "epoch": 1.03633449104996, "grad_norm": 0.7653928399085999, "learning_rate": 7.3283823408703466e-06, "loss": 0.14201946258544923, "memory(GiB)": 31.36, "step": 970, "token_acc": 0.9429385599110369, "train_speed(iter/s)": 0.113486 }, { "epoch": 1.041677798557307, "grad_norm": 0.7575430870056152, "learning_rate": 7.303567278367918e-06, "loss": 0.15218265056610109, "memory(GiB)": 31.36, "step": 975, "token_acc": 0.9438988818667963, "train_speed(iter/s)": 0.113648 }, { "epoch": 1.047021106064654, "grad_norm": 0.6947309970855713, "learning_rate": 7.278679976522279e-06, "loss": 0.14258232116699218, "memory(GiB)": 31.36, "step": 980, "token_acc": 0.9484339445857315, "train_speed(iter/s)": 0.113835 }, { "epoch": 1.047021106064654, "eval_loss": 0.18912462890148163, "eval_runtime": 39.1355, "eval_samples_per_second": 15.434, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9351605224216689, "step": 980 }, { "epoch": 1.0523644135720012, "grad_norm": 0.6447970271110535, "learning_rate": 7.253721215793528e-06, "loss": 0.13303806781768798, "memory(GiB)": 31.36, "step": 985, "token_acc": 0.9368344110205229, "train_speed(iter/s)": 0.113351 }, { "epoch": 1.0577077210793482, "grad_norm": 0.7357504367828369, "learning_rate": 7.2286917788826926e-06, "loss": 0.14264590740203859, "memory(GiB)": 31.36, "step": 990, "token_acc": 0.9517513105612875, "train_speed(iter/s)": 0.113523 }, { "epoch": 1.0630510285866952, "grad_norm": 0.7251073718070984, "learning_rate": 7.203592450707193e-06, "loss": 0.14065431356430053, "memory(GiB)": 31.36, "step": 995, "token_acc": 0.9541761579347001, "train_speed(iter/s)": 0.113657 }, { "epoch": 1.0683943360940422, "grad_norm": 0.6575713753700256, "learning_rate": 7.178424018376224e-06, "loss": 0.13455284833908082, "memory(GiB)": 31.36, "step": 1000, "token_acc": 0.952753960692989, "train_speed(iter/s)": 0.113814 }, { "epoch": 1.0683943360940422, "eval_loss": 0.18993094563484192, "eval_runtime": 39.1132, "eval_samples_per_second": 15.442, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9351004739539771, "step": 1000 }, { "epoch": 1.0737376436013892, "grad_norm": 0.6939108371734619, "learning_rate": 7.153187271166071e-06, "loss": 0.1378490924835205, "memory(GiB)": 31.36, "step": 1005, "token_acc": 0.9382960940547167, "train_speed(iter/s)": 0.113358 }, { "epoch": 1.0790809511087363, "grad_norm": 0.6724715828895569, "learning_rate": 7.127883000495353e-06, "loss": 0.14932271242141723, "memory(GiB)": 31.36, "step": 1010, "token_acc": 0.9499566799514816, "train_speed(iter/s)": 0.113511 }, { "epoch": 1.0844242586160833, "grad_norm": 0.731438159942627, "learning_rate": 7.102511999900213e-06, "loss": 0.13644077777862548, "memory(GiB)": 31.36, "step": 1015, "token_acc": 0.9460200277757473, "train_speed(iter/s)": 0.113649 }, { "epoch": 1.0897675661234305, "grad_norm": 0.6680863499641418, "learning_rate": 7.0770750650094335e-06, "loss": 0.13693207502365112, "memory(GiB)": 31.36, "step": 1020, "token_acc": 0.9469487672670047, "train_speed(iter/s)": 0.113805 }, { "epoch": 1.0897675661234305, "eval_loss": 0.18932494521141052, "eval_runtime": 39.1534, "eval_samples_per_second": 15.427, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9353063544146347, "step": 1020 }, { "epoch": 1.0951108736307775, "grad_norm": 0.6814751029014587, "learning_rate": 7.051572993519474e-06, "loss": 0.13657076358795167, "memory(GiB)": 31.36, "step": 1025, "token_acc": 0.9386909315096644, "train_speed(iter/s)": 0.113321 }, { "epoch": 1.1004541811381245, "grad_norm": 0.6972615718841553, "learning_rate": 7.026006585169467e-06, "loss": 0.14217867851257324, "memory(GiB)": 31.36, "step": 1030, "token_acc": 0.9473362948896199, "train_speed(iter/s)": 0.113472 }, { "epoch": 1.1057974886454716, "grad_norm": 0.7882171869277954, "learning_rate": 7.0003766417161335e-06, "loss": 0.13929877281188965, "memory(GiB)": 31.36, "step": 1035, "token_acc": 0.9491739520659509, "train_speed(iter/s)": 0.113632 }, { "epoch": 1.1111407961528186, "grad_norm": 0.7138720154762268, "learning_rate": 6.974683966908642e-06, "loss": 0.1398939847946167, "memory(GiB)": 31.36, "step": 1040, "token_acc": 0.9491174031512853, "train_speed(iter/s)": 0.113768 }, { "epoch": 1.1111407961528186, "eval_loss": 0.18828535079956055, "eval_runtime": 39.1885, "eval_samples_per_second": 15.413, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9354607647601278, "step": 1040 }, { "epoch": 1.1164841036601656, "grad_norm": 0.7279144525527954, "learning_rate": 6.948929366463397e-06, "loss": 0.15247514247894287, "memory(GiB)": 31.36, "step": 1045, "token_acc": 0.9380235654449663, "train_speed(iter/s)": 0.113327 }, { "epoch": 1.1218274111675126, "grad_norm": 0.7380113005638123, "learning_rate": 6.923113648038784e-06, "loss": 0.14748337268829345, "memory(GiB)": 31.36, "step": 1050, "token_acc": 0.9475613194248661, "train_speed(iter/s)": 0.11348 }, { "epoch": 1.1271707186748596, "grad_norm": 0.7649953961372375, "learning_rate": 6.897237621209831e-06, "loss": 0.14567428827285767, "memory(GiB)": 31.36, "step": 1055, "token_acc": 0.9500533120085299, "train_speed(iter/s)": 0.113609 }, { "epoch": 1.1325140261822069, "grad_norm": 0.7318655252456665, "learning_rate": 6.87130209744282e-06, "loss": 0.13384032249450684, "memory(GiB)": 31.36, "step": 1060, "token_acc": 0.9565458338766463, "train_speed(iter/s)": 0.113739 }, { "epoch": 1.1325140261822069, "eval_loss": 0.18837569653987885, "eval_runtime": 39.2359, "eval_samples_per_second": 15.394, "eval_steps_per_second": 3.849, "eval_token_acc": 0.935216281713097, "step": 1060 }, { "epoch": 1.1378573336895539, "grad_norm": 0.7364778518676758, "learning_rate": 6.845307890069851e-06, "loss": 0.1373004674911499, "memory(GiB)": 31.36, "step": 1065, "token_acc": 0.9390180878552972, "train_speed(iter/s)": 0.113291 }, { "epoch": 1.143200641196901, "grad_norm": 0.685958743095398, "learning_rate": 6.8192558142633215e-06, "loss": 0.13763891458511351, "memory(GiB)": 31.36, "step": 1070, "token_acc": 0.9479466974181562, "train_speed(iter/s)": 0.113441 }, { "epoch": 1.148543948704248, "grad_norm": 0.6921528577804565, "learning_rate": 6.7931466870103735e-06, "loss": 0.1474214553833008, "memory(GiB)": 31.36, "step": 1075, "token_acc": 0.9462754416778593, "train_speed(iter/s)": 0.11358 }, { "epoch": 1.153887256211595, "grad_norm": 0.6673567891120911, "learning_rate": 6.766981327087271e-06, "loss": 0.134868586063385, "memory(GiB)": 31.36, "step": 1080, "token_acc": 0.9541073453445886, "train_speed(iter/s)": 0.113692 }, { "epoch": 1.153887256211595, "eval_loss": 0.1878366470336914, "eval_runtime": 39.1901, "eval_samples_per_second": 15.412, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9354607647601278, "step": 1080 }, { "epoch": 1.159230563718942, "grad_norm": 0.8000864386558533, "learning_rate": 6.740760555033715e-06, "loss": 0.14174835681915282, "memory(GiB)": 31.36, "step": 1085, "token_acc": 0.9395288542253296, "train_speed(iter/s)": 0.113249 }, { "epoch": 1.1645738712262892, "grad_norm": 0.6853452920913696, "learning_rate": 6.714485193127126e-06, "loss": 0.14102463722229003, "memory(GiB)": 31.36, "step": 1090, "token_acc": 0.9465929419417791, "train_speed(iter/s)": 0.11338 }, { "epoch": 1.1699171787336362, "grad_norm": 0.747848629951477, "learning_rate": 6.688156065356845e-06, "loss": 0.14443647861480713, "memory(GiB)": 31.36, "step": 1095, "token_acc": 0.9509245187436677, "train_speed(iter/s)": 0.113501 }, { "epoch": 1.1752604862409832, "grad_norm": 0.7009637355804443, "learning_rate": 6.6617739973982985e-06, "loss": 0.1462648630142212, "memory(GiB)": 31.36, "step": 1100, "token_acc": 0.9474907617117654, "train_speed(iter/s)": 0.113649 }, { "epoch": 1.1752604862409832, "eval_loss": 0.1868010312318802, "eval_runtime": 39.2141, "eval_samples_per_second": 15.403, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9356494885157306, "step": 1100 }, { "epoch": 1.1806037937483302, "grad_norm": 0.7919987440109253, "learning_rate": 6.635339816587109e-06, "loss": 0.14761772155761718, "memory(GiB)": 31.36, "step": 1105, "token_acc": 0.9380447931623158, "train_speed(iter/s)": 0.113238 }, { "epoch": 1.1859471012556773, "grad_norm": 0.7107034921646118, "learning_rate": 6.60885435189314e-06, "loss": 0.15119514465332032, "memory(GiB)": 31.36, "step": 1110, "token_acc": 0.9427783975326465, "train_speed(iter/s)": 0.11339 }, { "epoch": 1.1912904087630243, "grad_norm": 0.7467309236526489, "learning_rate": 6.582318433894513e-06, "loss": 0.13204342126846313, "memory(GiB)": 31.36, "step": 1115, "token_acc": 0.9524959742351047, "train_speed(iter/s)": 0.113534 }, { "epoch": 1.1966337162703713, "grad_norm": 0.7935456037521362, "learning_rate": 6.555732894751548e-06, "loss": 0.1459757924079895, "memory(GiB)": 31.36, "step": 1120, "token_acc": 0.9519983083104251, "train_speed(iter/s)": 0.113678 }, { "epoch": 1.1966337162703713, "eval_loss": 0.18683308362960815, "eval_runtime": 39.1575, "eval_samples_per_second": 15.425, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9356194642818847, "step": 1120 }, { "epoch": 1.2019770237777183, "grad_norm": 0.7466067671775818, "learning_rate": 6.529098568180672e-06, "loss": 0.14143054485321044, "memory(GiB)": 31.36, "step": 1125, "token_acc": 0.939937276954688, "train_speed(iter/s)": 0.11325 }, { "epoch": 1.2073203312850656, "grad_norm": 0.7526156306266785, "learning_rate": 6.502416289428282e-06, "loss": 0.14170231819152831, "memory(GiB)": 31.36, "step": 1130, "token_acc": 0.9475191453761503, "train_speed(iter/s)": 0.113376 }, { "epoch": 1.2126636387924126, "grad_norm": 0.7008678913116455, "learning_rate": 6.475686895244534e-06, "loss": 0.14561245441436768, "memory(GiB)": 31.36, "step": 1135, "token_acc": 0.9469662033072007, "train_speed(iter/s)": 0.113525 }, { "epoch": 1.2180069462997596, "grad_norm": 0.7882223129272461, "learning_rate": 6.448911223857124e-06, "loss": 0.14698657989501954, "memory(GiB)": 31.36, "step": 1140, "token_acc": 0.9527493782812931, "train_speed(iter/s)": 0.113658 }, { "epoch": 1.2180069462997596, "eval_loss": 0.1868022382259369, "eval_runtime": 39.1631, "eval_samples_per_second": 15.423, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9359154174440799, "step": 1140 }, { "epoch": 1.2233502538071066, "grad_norm": 0.7010296583175659, "learning_rate": 6.422090114944982e-06, "loss": 0.14752376079559326, "memory(GiB)": 31.36, "step": 1145, "token_acc": 0.937754062131431, "train_speed(iter/s)": 0.113221 }, { "epoch": 1.2286935613144536, "grad_norm": 0.710648238658905, "learning_rate": 6.3952244096119535e-06, "loss": 0.13726551532745362, "memory(GiB)": 31.36, "step": 1150, "token_acc": 0.9516319057474925, "train_speed(iter/s)": 0.113373 }, { "epoch": 1.2340368688218006, "grad_norm": 0.7825097441673279, "learning_rate": 6.368314950360416e-06, "loss": 0.151510751247406, "memory(GiB)": 31.36, "step": 1155, "token_acc": 0.9457324403228576, "train_speed(iter/s)": 0.113517 }, { "epoch": 1.2393801763291477, "grad_norm": 0.7237306833267212, "learning_rate": 6.341362581064856e-06, "loss": 0.14253956079483032, "memory(GiB)": 31.36, "step": 1160, "token_acc": 0.9461406518010291, "train_speed(iter/s)": 0.113677 }, { "epoch": 1.2393801763291477, "eval_loss": 0.18586544692516327, "eval_runtime": 39.1092, "eval_samples_per_second": 15.444, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9360355143794634, "step": 1160 }, { "epoch": 1.244723483836495, "grad_norm": 0.7146082520484924, "learning_rate": 6.314368146945418e-06, "loss": 0.14136313199996947, "memory(GiB)": 31.36, "step": 1165, "token_acc": 0.9388746238483078, "train_speed(iter/s)": 0.113271 }, { "epoch": 1.250066791343842, "grad_norm": 0.7276601195335388, "learning_rate": 6.28733249454138e-06, "loss": 0.1453978180885315, "memory(GiB)": 31.36, "step": 1170, "token_acc": 0.9472019757845913, "train_speed(iter/s)": 0.113427 }, { "epoch": 1.255410098851189, "grad_norm": 0.7507435083389282, "learning_rate": 6.260256471684622e-06, "loss": 0.14081387519836425, "memory(GiB)": 31.36, "step": 1175, "token_acc": 0.9456987966162278, "train_speed(iter/s)": 0.113564 }, { "epoch": 1.260753406358536, "grad_norm": 0.6047825217247009, "learning_rate": 6.233140927473033e-06, "loss": 0.1298896551132202, "memory(GiB)": 31.36, "step": 1180, "token_acc": 0.950142074581832, "train_speed(iter/s)": 0.113686 }, { "epoch": 1.260753406358536, "eval_loss": 0.186279758810997, "eval_runtime": 39.1529, "eval_samples_per_second": 15.427, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9361170087284737, "step": 1180 }, { "epoch": 1.266096713865883, "grad_norm": 0.7231855988502502, "learning_rate": 6.205986712243876e-06, "loss": 0.13684126138687133, "memory(GiB)": 31.36, "step": 1185, "token_acc": 0.939052757793765, "train_speed(iter/s)": 0.113247 }, { "epoch": 1.27144002137323, "grad_norm": 0.7016168832778931, "learning_rate": 6.178794677547138e-06, "loss": 0.15314276218414308, "memory(GiB)": 31.36, "step": 1190, "token_acc": 0.9389803557822904, "train_speed(iter/s)": 0.113385 }, { "epoch": 1.276783328880577, "grad_norm": 0.7309826612472534, "learning_rate": 6.151565676118805e-06, "loss": 0.13780862092971802, "memory(GiB)": 31.36, "step": 1195, "token_acc": 0.9577650445215613, "train_speed(iter/s)": 0.113507 }, { "epoch": 1.282126636387924, "grad_norm": 0.7305301427841187, "learning_rate": 6.124300561854139e-06, "loss": 0.13783036470413207, "memory(GiB)": 31.36, "step": 1200, "token_acc": 0.9519318638739628, "train_speed(iter/s)": 0.113619 }, { "epoch": 1.282126636387924, "eval_loss": 0.1861124336719513, "eval_runtime": 39.1734, "eval_samples_per_second": 15.419, "eval_steps_per_second": 3.855, "eval_token_acc": 0.9362371056638572, "step": 1200 }, { "epoch": 1.2874699438952713, "grad_norm": 0.7063933610916138, "learning_rate": 6.097000189780893e-06, "loss": 0.1543891429901123, "memory(GiB)": 31.36, "step": 1205, "token_acc": 0.9369793792821915, "train_speed(iter/s)": 0.113257 }, { "epoch": 1.2928132514026183, "grad_norm": 0.7241778373718262, "learning_rate": 6.0696654160324875e-06, "loss": 0.13728095293045045, "memory(GiB)": 31.36, "step": 1210, "token_acc": 0.9575441100155683, "train_speed(iter/s)": 0.113381 }, { "epoch": 1.2981565589099653, "grad_norm": 0.7719012498855591, "learning_rate": 6.042297097821184e-06, "loss": 0.15218913555145264, "memory(GiB)": 31.36, "step": 1215, "token_acc": 0.949528983015884, "train_speed(iter/s)": 0.113494 }, { "epoch": 1.3034998664173123, "grad_norm": 0.6969226002693176, "learning_rate": 6.014896093411181e-06, "loss": 0.13368651866912842, "memory(GiB)": 31.36, "step": 1220, "token_acc": 0.9476698598847714, "train_speed(iter/s)": 0.113599 }, { "epoch": 1.3034998664173123, "eval_loss": 0.1850433051586151, "eval_runtime": 39.2188, "eval_samples_per_second": 15.401, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9363186000128675, "step": 1220 }, { "epoch": 1.3088431739246593, "grad_norm": 0.6775336861610413, "learning_rate": 5.987463262091715e-06, "loss": 0.139385187625885, "memory(GiB)": 31.36, "step": 1225, "token_acc": 0.9377400468384075, "train_speed(iter/s)": 0.113206 }, { "epoch": 1.3141864814320063, "grad_norm": 0.7808154821395874, "learning_rate": 5.959999464150101e-06, "loss": 0.1481320381164551, "memory(GiB)": 31.36, "step": 1230, "token_acc": 0.9476690131491566, "train_speed(iter/s)": 0.113336 }, { "epoch": 1.3195297889393536, "grad_norm": 0.7226517796516418, "learning_rate": 5.932505560844766e-06, "loss": 0.14821076393127441, "memory(GiB)": 31.36, "step": 1235, "token_acc": 0.9464622560620092, "train_speed(iter/s)": 0.113466 }, { "epoch": 1.3248730964467006, "grad_norm": 0.7376400828361511, "learning_rate": 5.904982414378233e-06, "loss": 0.13770921230316163, "memory(GiB)": 31.36, "step": 1240, "token_acc": 0.9496229260935143, "train_speed(iter/s)": 0.113615 }, { "epoch": 1.3248730964467006, "eval_loss": 0.18510028719902039, "eval_runtime": 39.1646, "eval_samples_per_second": 15.422, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9362156597825387, "step": 1240 }, { "epoch": 1.3302164039540476, "grad_norm": 0.7447443008422852, "learning_rate": 5.877430887870081e-06, "loss": 0.14754925966262816, "memory(GiB)": 31.36, "step": 1245, "token_acc": 0.9386804566572503, "train_speed(iter/s)": 0.113234 }, { "epoch": 1.3355597114613946, "grad_norm": 0.7104332447052002, "learning_rate": 5.849851845329884e-06, "loss": 0.1406762719154358, "memory(GiB)": 31.36, "step": 1250, "token_acc": 0.9495687504455057, "train_speed(iter/s)": 0.113347 }, { "epoch": 1.3409030189687416, "grad_norm": 0.7238494753837585, "learning_rate": 5.822246151630109e-06, "loss": 0.13662366867065429, "memory(GiB)": 31.36, "step": 1255, "token_acc": 0.9507098934354979, "train_speed(iter/s)": 0.113485 }, { "epoch": 1.3462463264760887, "grad_norm": 0.7518407106399536, "learning_rate": 5.794614672479e-06, "loss": 0.14233107566833497, "memory(GiB)": 31.36, "step": 1260, "token_acc": 0.9495046143399488, "train_speed(iter/s)": 0.1136 }, { "epoch": 1.3462463264760887, "eval_loss": 0.18543414771556854, "eval_runtime": 39.1743, "eval_samples_per_second": 15.418, "eval_steps_per_second": 3.855, "eval_token_acc": 0.9364129618906689, "step": 1260 }, { "epoch": 1.3515896339834357, "grad_norm": 0.7268742322921753, "learning_rate": 5.766958274393428e-06, "loss": 0.14559613466262816, "memory(GiB)": 31.36, "step": 1265, "token_acc": 0.9380886914433095, "train_speed(iter/s)": 0.113217 }, { "epoch": 1.3569329414907827, "grad_norm": 0.6919171810150146, "learning_rate": 5.739277824671711e-06, "loss": 0.1417681932449341, "memory(GiB)": 31.36, "step": 1270, "token_acc": 0.9501291664041334, "train_speed(iter/s)": 0.113322 }, { "epoch": 1.3622762489981297, "grad_norm": 0.7078922390937805, "learning_rate": 5.711574191366427e-06, "loss": 0.14929780960083008, "memory(GiB)": 31.36, "step": 1275, "token_acc": 0.9459644322845417, "train_speed(iter/s)": 0.113449 }, { "epoch": 1.367619556505477, "grad_norm": 0.768913745880127, "learning_rate": 5.683848243257181e-06, "loss": 0.14540610313415528, "memory(GiB)": 31.36, "step": 1280, "token_acc": 0.9463650228774784, "train_speed(iter/s)": 0.113575 }, { "epoch": 1.367619556505477, "eval_loss": 0.18363338708877563, "eval_runtime": 38.9429, "eval_samples_per_second": 15.51, "eval_steps_per_second": 3.877, "eval_token_acc": 0.9366574449376998, "step": 1280 }, { "epoch": 1.372962864012824, "grad_norm": 0.7159769535064697, "learning_rate": 5.656100849823366e-06, "loss": 0.13922522068023682, "memory(GiB)": 31.36, "step": 1285, "token_acc": 0.9396986067671311, "train_speed(iter/s)": 0.113203 }, { "epoch": 1.378306171520171, "grad_norm": 0.6261381506919861, "learning_rate": 5.628332881216899e-06, "loss": 0.13264775276184082, "memory(GiB)": 31.36, "step": 1290, "token_acc": 0.956436461236709, "train_speed(iter/s)": 0.113332 }, { "epoch": 1.383649479027518, "grad_norm": 0.790125846862793, "learning_rate": 5.600545208234927e-06, "loss": 0.1441697359085083, "memory(GiB)": 31.36, "step": 1295, "token_acc": 0.9488245412844036, "train_speed(iter/s)": 0.113461 }, { "epoch": 1.388992786534865, "grad_norm": 0.7451324462890625, "learning_rate": 5.57273870229252e-06, "loss": 0.13834784030914307, "memory(GiB)": 31.36, "step": 1300, "token_acc": 0.9505032488215059, "train_speed(iter/s)": 0.11358 }, { "epoch": 1.388992786534865, "eval_loss": 0.18352170288562775, "eval_runtime": 39.1528, "eval_samples_per_second": 15.427, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9367603851680285, "step": 1300 }, { "epoch": 1.3943360940422123, "grad_norm": 0.7569957375526428, "learning_rate": 5.544914235395347e-06, "loss": 0.15216903686523436, "memory(GiB)": 31.36, "step": 1305, "token_acc": 0.9388893760687508, "train_speed(iter/s)": 0.113241 }, { "epoch": 1.3996794015495593, "grad_norm": 0.7055838108062744, "learning_rate": 5.517072680112332e-06, "loss": 0.13284831047058104, "memory(GiB)": 31.36, "step": 1310, "token_acc": 0.9498824853520474, "train_speed(iter/s)": 0.113338 }, { "epoch": 1.4050227090569063, "grad_norm": 0.6958155035972595, "learning_rate": 5.4892149095482815e-06, "loss": 0.136586332321167, "memory(GiB)": 31.36, "step": 1315, "token_acc": 0.9489724944672779, "train_speed(iter/s)": 0.113435 }, { "epoch": 1.4103660165642533, "grad_norm": 0.6948981285095215, "learning_rate": 5.46134179731651e-06, "loss": 0.14353724718093872, "memory(GiB)": 31.36, "step": 1320, "token_acc": 0.9483981258705838, "train_speed(iter/s)": 0.11357 }, { "epoch": 1.4103660165642533, "eval_loss": 0.1832687258720398, "eval_runtime": 39.1038, "eval_samples_per_second": 15.446, "eval_steps_per_second": 3.862, "eval_token_acc": 0.9369748439812134, "step": 1320 }, { "epoch": 1.4157093240716003, "grad_norm": 0.6944836378097534, "learning_rate": 5.4334542175114495e-06, "loss": 0.1423251748085022, "memory(GiB)": 31.36, "step": 1325, "token_acc": 0.9394761855681909, "train_speed(iter/s)": 0.11322 }, { "epoch": 1.4210526315789473, "grad_norm": 0.6609280705451965, "learning_rate": 5.40555304468122e-06, "loss": 0.13840043544769287, "memory(GiB)": 31.36, "step": 1330, "token_acc": 0.9507696104136463, "train_speed(iter/s)": 0.113316 }, { "epoch": 1.4263959390862944, "grad_norm": 0.7256921529769897, "learning_rate": 5.377639153800229e-06, "loss": 0.1384860634803772, "memory(GiB)": 31.36, "step": 1335, "token_acc": 0.9545629784656056, "train_speed(iter/s)": 0.113429 }, { "epoch": 1.4317392465936414, "grad_norm": 0.7888199687004089, "learning_rate": 5.34971342024171e-06, "loss": 0.14113259315490723, "memory(GiB)": 31.36, "step": 1340, "token_acc": 0.954348504280911, "train_speed(iter/s)": 0.113523 }, { "epoch": 1.4317392465936414, "eval_loss": 0.184078186750412, "eval_runtime": 39.1268, "eval_samples_per_second": 15.437, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9369877115100045, "step": 1340 }, { "epoch": 1.4370825541009884, "grad_norm": 0.7354726791381836, "learning_rate": 5.321776719750283e-06, "loss": 0.1384582042694092, "memory(GiB)": 31.36, "step": 1345, "token_acc": 0.9407986188960137, "train_speed(iter/s)": 0.113178 }, { "epoch": 1.4424258616083356, "grad_norm": 0.6982942819595337, "learning_rate": 5.29382992841449e-06, "loss": 0.14179346561431885, "memory(GiB)": 31.36, "step": 1350, "token_acc": 0.9446530872056015, "train_speed(iter/s)": 0.113289 }, { "epoch": 1.4477691691156827, "grad_norm": 0.6804877519607544, "learning_rate": 5.265873922639315e-06, "loss": 0.13716717958450317, "memory(GiB)": 31.36, "step": 1355, "token_acc": 0.9538438661710037, "train_speed(iter/s)": 0.113405 }, { "epoch": 1.4531124766230297, "grad_norm": 0.7978929877281189, "learning_rate": 5.237909579118713e-06, "loss": 0.1416216015815735, "memory(GiB)": 31.36, "step": 1360, "token_acc": 0.9511662976866557, "train_speed(iter/s)": 0.113536 }, { "epoch": 1.4531124766230297, "eval_loss": 0.1834552139043808, "eval_runtime": 39.1688, "eval_samples_per_second": 15.42, "eval_steps_per_second": 3.855, "eval_token_acc": 0.9368547470458298, "step": 1360 }, { "epoch": 1.4584557841303767, "grad_norm": 0.721110999584198, "learning_rate": 5.209937774808098e-06, "loss": 0.13820960521697997, "memory(GiB)": 31.36, "step": 1365, "token_acc": 0.9396422402036548, "train_speed(iter/s)": 0.113179 }, { "epoch": 1.4637990916377237, "grad_norm": 0.7276850938796997, "learning_rate": 5.181959386896862e-06, "loss": 0.14571261405944824, "memory(GiB)": 31.36, "step": 1370, "token_acc": 0.9458817568637385, "train_speed(iter/s)": 0.113285 }, { "epoch": 1.4691423991450707, "grad_norm": 0.6762943267822266, "learning_rate": 5.153975292780852e-06, "loss": 0.14370789527893066, "memory(GiB)": 31.36, "step": 1375, "token_acc": 0.9516790861044546, "train_speed(iter/s)": 0.113408 }, { "epoch": 1.474485706652418, "grad_norm": 0.7835574746131897, "learning_rate": 5.125986370034862e-06, "loss": 0.14546499252319336, "memory(GiB)": 31.36, "step": 1380, "token_acc": 0.9492231661229368, "train_speed(iter/s)": 0.113557 }, { "epoch": 1.474485706652418, "eval_loss": 0.18329627811908722, "eval_runtime": 39.2746, "eval_samples_per_second": 15.379, "eval_steps_per_second": 3.845, "eval_token_acc": 0.9371549893842888, "step": 1380 }, { "epoch": 1.479829014159765, "grad_norm": 0.7045428156852722, "learning_rate": 5.097993496385112e-06, "loss": 0.14536089897155763, "memory(GiB)": 31.36, "step": 1385, "token_acc": 0.9408298818336975, "train_speed(iter/s)": 0.113213 }, { "epoch": 1.485172321667112, "grad_norm": 0.7122427225112915, "learning_rate": 5.069997549681718e-06, "loss": 0.1389164924621582, "memory(GiB)": 31.36, "step": 1390, "token_acc": 0.949877300613497, "train_speed(iter/s)": 0.113318 }, { "epoch": 1.490515629174459, "grad_norm": 0.7316491007804871, "learning_rate": 5.041999407871168e-06, "loss": 0.14822676181793212, "memory(GiB)": 31.36, "step": 1395, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.113431 }, { "epoch": 1.495858936681806, "grad_norm": 0.7621902227401733, "learning_rate": 5.01399994896879e-06, "loss": 0.14519236087799073, "memory(GiB)": 31.36, "step": 1400, "token_acc": 0.9512387720856776, "train_speed(iter/s)": 0.113547 }, { "epoch": 1.495858936681806, "eval_loss": 0.1818138211965561, "eval_runtime": 39.3086, "eval_samples_per_second": 15.366, "eval_steps_per_second": 3.841, "eval_token_acc": 0.9370348924489051, "step": 1400 }, { "epoch": 1.501202244189153, "grad_norm": 0.7017737627029419, "learning_rate": 4.986000051031212e-06, "loss": 0.13984346389770508, "memory(GiB)": 31.36, "step": 1405, "token_acc": 0.9408215177889058, "train_speed(iter/s)": 0.113226 }, { "epoch": 1.5065455516965, "grad_norm": 0.7313562035560608, "learning_rate": 4.958000592128834e-06, "loss": 0.13321598768234252, "memory(GiB)": 31.36, "step": 1410, "token_acc": 0.9527582267752515, "train_speed(iter/s)": 0.113333 }, { "epoch": 1.511888859203847, "grad_norm": 0.7831226587295532, "learning_rate": 4.930002450318282e-06, "loss": 0.1345110058784485, "memory(GiB)": 31.36, "step": 1415, "token_acc": 0.9496566716124357, "train_speed(iter/s)": 0.113428 }, { "epoch": 1.517232166711194, "grad_norm": 0.701524019241333, "learning_rate": 4.9020065036148885e-06, "loss": 0.14322736263275146, "memory(GiB)": 31.36, "step": 1420, "token_acc": 0.9406170170956966, "train_speed(iter/s)": 0.113546 }, { "epoch": 1.517232166711194, "eval_loss": 0.18205159902572632, "eval_runtime": 39.1325, "eval_samples_per_second": 15.435, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9375109910141757, "step": 1420 }, { "epoch": 1.5225754742185411, "grad_norm": 0.6394919157028198, "learning_rate": 4.874013629965138e-06, "loss": 0.13802753686904906, "memory(GiB)": 31.36, "step": 1425, "token_acc": 0.9402941276057832, "train_speed(iter/s)": 0.113221 }, { "epoch": 1.5279187817258884, "grad_norm": 0.7372197508811951, "learning_rate": 4.846024707219149e-06, "loss": 0.1441117525100708, "memory(GiB)": 31.36, "step": 1430, "token_acc": 0.9448832927481142, "train_speed(iter/s)": 0.113341 }, { "epoch": 1.5332620892332354, "grad_norm": 0.6687447428703308, "learning_rate": 4.818040613103139e-06, "loss": 0.13662933111190795, "memory(GiB)": 31.36, "step": 1435, "token_acc": 0.949387670379852, "train_speed(iter/s)": 0.113439 }, { "epoch": 1.5386053967405824, "grad_norm": 0.7553586363792419, "learning_rate": 4.790062225191902e-06, "loss": 0.15836725234985352, "memory(GiB)": 31.36, "step": 1440, "token_acc": 0.9384995877988458, "train_speed(iter/s)": 0.113564 }, { "epoch": 1.5386053967405824, "eval_loss": 0.18115834891796112, "eval_runtime": 39.2778, "eval_samples_per_second": 15.378, "eval_steps_per_second": 3.844, "eval_token_acc": 0.9376825580647237, "step": 1440 }, { "epoch": 1.5439487042479296, "grad_norm": 0.6660886406898499, "learning_rate": 4.762090420881289e-06, "loss": 0.1435617685317993, "memory(GiB)": 31.36, "step": 1445, "token_acc": 0.9411263893262424, "train_speed(iter/s)": 0.113249 }, { "epoch": 1.5492920117552766, "grad_norm": 0.6773468255996704, "learning_rate": 4.734126077360685e-06, "loss": 0.13354458808898925, "memory(GiB)": 31.36, "step": 1450, "token_acc": 0.9558268311099924, "train_speed(iter/s)": 0.113351 }, { "epoch": 1.5546353192626237, "grad_norm": 0.7248560786247253, "learning_rate": 4.706170071585513e-06, "loss": 0.1327458381652832, "memory(GiB)": 31.36, "step": 1455, "token_acc": 0.955091649694501, "train_speed(iter/s)": 0.113455 }, { "epoch": 1.5599786267699707, "grad_norm": 0.7063668966293335, "learning_rate": 4.678223280249718e-06, "loss": 0.12768800258636476, "memory(GiB)": 31.36, "step": 1460, "token_acc": 0.949375866851595, "train_speed(iter/s)": 0.113557 }, { "epoch": 1.5599786267699707, "eval_loss": 0.18129871785640717, "eval_runtime": 39.1068, "eval_samples_per_second": 15.445, "eval_steps_per_second": 3.861, "eval_token_acc": 0.9380900298097751, "step": 1460 }, { "epoch": 1.5653219342773177, "grad_norm": 0.8191858530044556, "learning_rate": 4.650286579758291e-06, "loss": 0.13946748971939088, "memory(GiB)": 31.36, "step": 1465, "token_acc": 0.9430553548200608, "train_speed(iter/s)": 0.113238 }, { "epoch": 1.5706652417846647, "grad_norm": 0.7221189141273499, "learning_rate": 4.622360846199772e-06, "loss": 0.13773694038391113, "memory(GiB)": 31.36, "step": 1470, "token_acc": 0.9456267929815197, "train_speed(iter/s)": 0.113346 }, { "epoch": 1.5760085492920117, "grad_norm": 0.715904176235199, "learning_rate": 4.594446955318781e-06, "loss": 0.13796852827072142, "memory(GiB)": 31.36, "step": 1475, "token_acc": 0.9508182349503215, "train_speed(iter/s)": 0.113453 }, { "epoch": 1.5813518567993587, "grad_norm": 0.7929291129112244, "learning_rate": 4.566545782488554e-06, "loss": 0.14087553024291993, "memory(GiB)": 31.36, "step": 1480, "token_acc": 0.9462326623398016, "train_speed(iter/s)": 0.113554 }, { "epoch": 1.5813518567993587, "eval_loss": 0.18039724230766296, "eval_runtime": 39.124, "eval_samples_per_second": 15.438, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9378841493491175, "step": 1480 }, { "epoch": 1.5866951643067058, "grad_norm": 0.7181767225265503, "learning_rate": 4.53865820268349e-06, "loss": 0.14211044311523438, "memory(GiB)": 31.36, "step": 1485, "token_acc": 0.9420699925539836, "train_speed(iter/s)": 0.113263 }, { "epoch": 1.5920384718140528, "grad_norm": 0.6460716724395752, "learning_rate": 4.510785090451719e-06, "loss": 0.13882654905319214, "memory(GiB)": 31.36, "step": 1490, "token_acc": 0.9472210254200776, "train_speed(iter/s)": 0.11337 }, { "epoch": 1.5973817793213998, "grad_norm": 0.7372190952301025, "learning_rate": 4.482927319887669e-06, "loss": 0.14314990043640136, "memory(GiB)": 31.36, "step": 1495, "token_acc": 0.9522527490349996, "train_speed(iter/s)": 0.113469 }, { "epoch": 1.602725086828747, "grad_norm": 0.7231627702713013, "learning_rate": 4.455085764604653e-06, "loss": 0.14776058197021485, "memory(GiB)": 31.36, "step": 1500, "token_acc": 0.9497078741203028, "train_speed(iter/s)": 0.113578 }, { "epoch": 1.602725086828747, "eval_loss": 0.1816372275352478, "eval_runtime": 39.1733, "eval_samples_per_second": 15.419, "eval_steps_per_second": 3.855, "eval_token_acc": 0.9376225095970319, "step": 1500 }, { "epoch": 1.608068394336094, "grad_norm": 0.7648904323577881, "learning_rate": 4.427261297707482e-06, "loss": 0.14478824138641358, "memory(GiB)": 31.36, "step": 1505, "token_acc": 0.9387511308018835, "train_speed(iter/s)": 0.113249 }, { "epoch": 1.613411701843441, "grad_norm": 0.6977179646492004, "learning_rate": 4.399454791765076e-06, "loss": 0.14400074481964112, "memory(GiB)": 31.36, "step": 1510, "token_acc": 0.9406537812945908, "train_speed(iter/s)": 0.113346 }, { "epoch": 1.618755009350788, "grad_norm": 0.6725999116897583, "learning_rate": 4.371667118783101e-06, "loss": 0.14132678508758545, "memory(GiB)": 31.36, "step": 1515, "token_acc": 0.9514303482587064, "train_speed(iter/s)": 0.113465 }, { "epoch": 1.6240983168581353, "grad_norm": 0.7282098531723022, "learning_rate": 4.343899150176635e-06, "loss": 0.1421644926071167, "memory(GiB)": 31.36, "step": 1520, "token_acc": 0.9474935470724086, "train_speed(iter/s)": 0.113561 }, { "epoch": 1.6240983168581353, "eval_loss": 0.18081353604793549, "eval_runtime": 39.1617, "eval_samples_per_second": 15.423, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9381457891012032, "step": 1520 }, { "epoch": 1.6294416243654823, "grad_norm": 0.7503587603569031, "learning_rate": 4.316151756742821e-06, "loss": 0.13501241207122802, "memory(GiB)": 31.36, "step": 1525, "token_acc": 0.9416415493274551, "train_speed(iter/s)": 0.113245 }, { "epoch": 1.6347849318728294, "grad_norm": 0.7951456308364868, "learning_rate": 4.2884258086335755e-06, "loss": 0.14272716045379638, "memory(GiB)": 31.36, "step": 1530, "token_acc": 0.9456025411951559, "train_speed(iter/s)": 0.113354 }, { "epoch": 1.6401282393801764, "grad_norm": 0.728985607624054, "learning_rate": 4.26072217532829e-06, "loss": 0.1494640588760376, "memory(GiB)": 31.36, "step": 1535, "token_acc": 0.9476955108993974, "train_speed(iter/s)": 0.113456 }, { "epoch": 1.6454715468875234, "grad_norm": 0.7554564476013184, "learning_rate": 4.233041725606573e-06, "loss": 0.14563045501708985, "memory(GiB)": 31.36, "step": 1540, "token_acc": 0.9460028304967885, "train_speed(iter/s)": 0.113557 }, { "epoch": 1.6454715468875234, "eval_loss": 0.18041342496871948, "eval_runtime": 39.1831, "eval_samples_per_second": 15.415, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9377854982950524, "step": 1540 }, { "epoch": 1.6508148543948704, "grad_norm": 0.7042314410209656, "learning_rate": 4.205385327521002e-06, "loss": 0.15157747268676758, "memory(GiB)": 31.36, "step": 1545, "token_acc": 0.9420924680785825, "train_speed(iter/s)": 0.113252 }, { "epoch": 1.6561581619022174, "grad_norm": 0.7284954786300659, "learning_rate": 4.177753848369892e-06, "loss": 0.13592784404754638, "memory(GiB)": 31.36, "step": 1550, "token_acc": 0.9485677708433349, "train_speed(iter/s)": 0.113347 }, { "epoch": 1.6615014694095644, "grad_norm": 0.7814568877220154, "learning_rate": 4.1501481546701185e-06, "loss": 0.13980913162231445, "memory(GiB)": 31.36, "step": 1555, "token_acc": 0.9546170365068003, "train_speed(iter/s)": 0.113447 }, { "epoch": 1.6668447769169115, "grad_norm": 0.7283998727798462, "learning_rate": 4.12256911212992e-06, "loss": 0.13263875246047974, "memory(GiB)": 31.36, "step": 1560, "token_acc": 0.9517357901112563, "train_speed(iter/s)": 0.113556 }, { "epoch": 1.6668447769169115, "eval_loss": 0.17983108758926392, "eval_runtime": 39.1475, "eval_samples_per_second": 15.429, "eval_steps_per_second": 3.857, "eval_token_acc": 0.938132921572412, "step": 1560 }, { "epoch": 1.6721880844242585, "grad_norm": 0.7712817192077637, "learning_rate": 4.095017585621767e-06, "loss": 0.1369832158088684, "memory(GiB)": 31.36, "step": 1565, "token_acc": 0.9403439916887915, "train_speed(iter/s)": 0.113246 }, { "epoch": 1.6775313919316055, "grad_norm": 0.7404097318649292, "learning_rate": 4.067494439155236e-06, "loss": 0.14037466049194336, "memory(GiB)": 31.36, "step": 1570, "token_acc": 0.9485796116828188, "train_speed(iter/s)": 0.113344 }, { "epoch": 1.6828746994389527, "grad_norm": 0.7791383862495422, "learning_rate": 4.0400005358499e-06, "loss": 0.1371939778327942, "memory(GiB)": 31.36, "step": 1575, "token_acc": 0.9578335949764522, "train_speed(iter/s)": 0.113426 }, { "epoch": 1.6882180069462998, "grad_norm": 0.7537409663200378, "learning_rate": 4.012536737908288e-06, "loss": 0.1379605770111084, "memory(GiB)": 31.36, "step": 1580, "token_acc": 0.9421140684410646, "train_speed(iter/s)": 0.113529 }, { "epoch": 1.6882180069462998, "eval_loss": 0.1801919788122177, "eval_runtime": 39.1291, "eval_samples_per_second": 15.436, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9378026550001072, "step": 1580 }, { "epoch": 1.6935613144536468, "grad_norm": 0.759289562702179, "learning_rate": 3.985103906588821e-06, "loss": 0.1377565622329712, "memory(GiB)": 31.36, "step": 1585, "token_acc": 0.9409732617857719, "train_speed(iter/s)": 0.113222 }, { "epoch": 1.698904621960994, "grad_norm": 0.6593434810638428, "learning_rate": 3.957702902178816e-06, "loss": 0.13015660047531127, "memory(GiB)": 31.36, "step": 1590, "token_acc": 0.9526614173228346, "train_speed(iter/s)": 0.11332 }, { "epoch": 1.704247929468341, "grad_norm": 0.7041330337524414, "learning_rate": 3.930334583967514e-06, "loss": 0.14423298835754395, "memory(GiB)": 31.36, "step": 1595, "token_acc": 0.9480859417602792, "train_speed(iter/s)": 0.113416 }, { "epoch": 1.709591236975688, "grad_norm": 0.6841444969177246, "learning_rate": 3.902999810219109e-06, "loss": 0.13458824157714844, "memory(GiB)": 31.36, "step": 1600, "token_acc": 0.9575135610614279, "train_speed(iter/s)": 0.113518 }, { "epoch": 1.709591236975688, "eval_loss": 0.17973561584949493, "eval_runtime": 39.1386, "eval_samples_per_second": 15.432, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9382272834502133, "step": 1600 }, { "epoch": 1.714934544483035, "grad_norm": 0.8064398765563965, "learning_rate": 3.875699438145862e-06, "loss": 0.13843204975128173, "memory(GiB)": 31.36, "step": 1605, "token_acc": 0.941116183732414, "train_speed(iter/s)": 0.113215 }, { "epoch": 1.720277851990382, "grad_norm": 0.7203328609466553, "learning_rate": 3.8484343238811976e-06, "loss": 0.14074230194091797, "memory(GiB)": 31.36, "step": 1610, "token_acc": 0.9515015593790518, "train_speed(iter/s)": 0.113297 }, { "epoch": 1.725621159497729, "grad_norm": 0.7621043920516968, "learning_rate": 3.821205322452863e-06, "loss": 0.13528130054473878, "memory(GiB)": 31.36, "step": 1615, "token_acc": 0.9499534380726367, "train_speed(iter/s)": 0.11339 }, { "epoch": 1.7309644670050761, "grad_norm": 0.6963400840759277, "learning_rate": 3.794013287756125e-06, "loss": 0.13751909732818604, "memory(GiB)": 31.36, "step": 1620, "token_acc": 0.9522881588161861, "train_speed(iter/s)": 0.113481 }, { "epoch": 1.7309644670050761, "eval_loss": 0.17938879132270813, "eval_runtime": 39.133, "eval_samples_per_second": 15.435, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9384074288532888, "step": 1620 }, { "epoch": 1.7363077745124231, "grad_norm": 0.7448264956474304, "learning_rate": 3.766859072526969e-06, "loss": 0.129533851146698, "memory(GiB)": 31.36, "step": 1625, "token_acc": 0.9424371772034192, "train_speed(iter/s)": 0.113206 }, { "epoch": 1.7416510820197701, "grad_norm": 0.7266005873680115, "learning_rate": 3.7397435283153795e-06, "loss": 0.1342164993286133, "memory(GiB)": 31.36, "step": 1630, "token_acc": 0.9512910597946584, "train_speed(iter/s)": 0.113278 }, { "epoch": 1.7469943895271172, "grad_norm": 0.7142292857170105, "learning_rate": 3.712667505458622e-06, "loss": 0.14621845483779908, "memory(GiB)": 31.36, "step": 1635, "token_acc": 0.9502799032760277, "train_speed(iter/s)": 0.113376 }, { "epoch": 1.7523376970344642, "grad_norm": 0.6820980906486511, "learning_rate": 3.685631853054583e-06, "loss": 0.14358122348785402, "memory(GiB)": 31.36, "step": 1640, "token_acc": 0.9519375470278405, "train_speed(iter/s)": 0.113482 }, { "epoch": 1.7523376970344642, "eval_loss": 0.17889852821826935, "eval_runtime": 39.0969, "eval_samples_per_second": 15.449, "eval_steps_per_second": 3.862, "eval_token_acc": 0.9381586566299942, "step": 1640 }, { "epoch": 1.7576810045418114, "grad_norm": 0.7781234979629517, "learning_rate": 3.658637418935146e-06, "loss": 0.13783912658691405, "memory(GiB)": 31.36, "step": 1645, "token_acc": 0.9402854612580077, "train_speed(iter/s)": 0.113198 }, { "epoch": 1.7630243120491584, "grad_norm": 0.7067362666130066, "learning_rate": 3.6316850496395863e-06, "loss": 0.138936448097229, "memory(GiB)": 31.36, "step": 1650, "token_acc": 0.9473799468168309, "train_speed(iter/s)": 0.113308 }, { "epoch": 1.7683676195565055, "grad_norm": 0.7398570775985718, "learning_rate": 3.6047755903880478e-06, "loss": 0.14049469232559203, "memory(GiB)": 31.36, "step": 1655, "token_acc": 0.9463533798334994, "train_speed(iter/s)": 0.113405 }, { "epoch": 1.7737109270638525, "grad_norm": 0.8484781980514526, "learning_rate": 3.577909885055019e-06, "loss": 0.1409994840621948, "memory(GiB)": 31.36, "step": 1660, "token_acc": 0.9477940181350656, "train_speed(iter/s)": 0.113503 }, { "epoch": 1.7737109270638525, "eval_loss": 0.1786307543516159, "eval_runtime": 39.1475, "eval_samples_per_second": 15.429, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9385661283750456, "step": 1660 }, { "epoch": 1.7790542345711997, "grad_norm": 0.7654802799224854, "learning_rate": 3.5510887761428764e-06, "loss": 0.13482067584991456, "memory(GiB)": 31.36, "step": 1665, "token_acc": 0.9418829437383344, "train_speed(iter/s)": 0.113194 }, { "epoch": 1.7843975420785467, "grad_norm": 0.7164176106452942, "learning_rate": 3.524313104755468e-06, "loss": 0.13556787967681885, "memory(GiB)": 31.36, "step": 1670, "token_acc": 0.9485995797016571, "train_speed(iter/s)": 0.113282 }, { "epoch": 1.7897408495858937, "grad_norm": 0.7502966523170471, "learning_rate": 3.4975837105717203e-06, "loss": 0.13461077213287354, "memory(GiB)": 31.36, "step": 1675, "token_acc": 0.9522437216961712, "train_speed(iter/s)": 0.113382 }, { "epoch": 1.7950841570932408, "grad_norm": 0.7419613003730774, "learning_rate": 3.4709014318193298e-06, "loss": 0.1423276662826538, "memory(GiB)": 31.36, "step": 1680, "token_acc": 0.9488505932943192, "train_speed(iter/s)": 0.113481 }, { "epoch": 1.7950841570932408, "eval_loss": 0.1787741631269455, "eval_runtime": 39.2394, "eval_samples_per_second": 15.393, "eval_steps_per_second": 3.848, "eval_token_acc": 0.9386218876664737, "step": 1680 }, { "epoch": 1.8004274646005878, "grad_norm": 0.7690842151641846, "learning_rate": 3.4442671052484545e-06, "loss": 0.1434476137161255, "memory(GiB)": 31.36, "step": 1685, "token_acc": 0.9390579069378391, "train_speed(iter/s)": 0.113206 }, { "epoch": 1.8057707721079348, "grad_norm": 0.7701799273490906, "learning_rate": 3.4176815661054884e-06, "loss": 0.13522175550460816, "memory(GiB)": 31.36, "step": 1690, "token_acc": 0.9531173213642171, "train_speed(iter/s)": 0.113311 }, { "epoch": 1.8111140796152818, "grad_norm": 0.7921913862228394, "learning_rate": 3.3911456481068613e-06, "loss": 0.13670728206634522, "memory(GiB)": 31.36, "step": 1695, "token_acc": 0.9481580510992276, "train_speed(iter/s)": 0.113396 }, { "epoch": 1.8164573871226288, "grad_norm": 0.8159610629081726, "learning_rate": 3.3646601834128924e-06, "loss": 0.141351580619812, "memory(GiB)": 31.36, "step": 1700, "token_acc": 0.9491429380932144, "train_speed(iter/s)": 0.113491 }, { "epoch": 1.8164573871226288, "eval_loss": 0.17854392528533936, "eval_runtime": 39.1874, "eval_samples_per_second": 15.413, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9385875742563641, "step": 1700 }, { "epoch": 1.8218006946299758, "grad_norm": 0.7273773550987244, "learning_rate": 3.3382260026017027e-06, "loss": 0.1371569514274597, "memory(GiB)": 31.36, "step": 1705, "token_acc": 0.9418744245136776, "train_speed(iter/s)": 0.113203 }, { "epoch": 1.8271440021373229, "grad_norm": 0.7564848065376282, "learning_rate": 3.311843934643157e-06, "loss": 0.12437918186187744, "memory(GiB)": 31.36, "step": 1710, "token_acc": 0.9571033210332104, "train_speed(iter/s)": 0.113276 }, { "epoch": 1.83248730964467, "grad_norm": 0.7761486172676086, "learning_rate": 3.2855148068728753e-06, "loss": 0.14540971517562867, "memory(GiB)": 31.36, "step": 1715, "token_acc": 0.9477377595488337, "train_speed(iter/s)": 0.113367 }, { "epoch": 1.8378306171520171, "grad_norm": 0.7440487742424011, "learning_rate": 3.2592394449662867e-06, "loss": 0.13129628896713258, "memory(GiB)": 31.36, "step": 1720, "token_acc": 0.950944535784988, "train_speed(iter/s)": 0.113447 }, { "epoch": 1.8378306171520171, "eval_loss": 0.17832355201244354, "eval_runtime": 39.1412, "eval_samples_per_second": 15.431, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9387977438932853, "step": 1720 }, { "epoch": 1.8431739246593641, "grad_norm": 0.7634470462799072, "learning_rate": 3.233018672912731e-06, "loss": 0.14330395460128784, "memory(GiB)": 31.36, "step": 1725, "token_acc": 0.9432163730078285, "train_speed(iter/s)": 0.113192 }, { "epoch": 1.8485172321667112, "grad_norm": 0.7993137836456299, "learning_rate": 3.2068533129896273e-06, "loss": 0.1466256022453308, "memory(GiB)": 31.36, "step": 1730, "token_acc": 0.950088022429419, "train_speed(iter/s)": 0.113299 }, { "epoch": 1.8538605396740584, "grad_norm": 0.7558074593544006, "learning_rate": 3.1807441857366798e-06, "loss": 0.13543074131011962, "memory(GiB)": 31.36, "step": 1735, "token_acc": 0.9463475916166196, "train_speed(iter/s)": 0.1134 }, { "epoch": 1.8592038471814054, "grad_norm": 0.7222068309783936, "learning_rate": 3.1546921099301507e-06, "loss": 0.13885715007781982, "memory(GiB)": 31.36, "step": 1740, "token_acc": 0.9468914264999093, "train_speed(iter/s)": 0.113485 }, { "epoch": 1.8592038471814054, "eval_loss": 0.1776435226202011, "eval_runtime": 39.1268, "eval_samples_per_second": 15.437, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9389221300049325, "step": 1740 }, { "epoch": 1.8645471546887524, "grad_norm": 0.7544873356819153, "learning_rate": 3.1286979025571817e-06, "loss": 0.1199462890625, "memory(GiB)": 31.36, "step": 1745, "token_acc": 0.942448560630803, "train_speed(iter/s)": 0.113208 }, { "epoch": 1.8698904621960994, "grad_norm": 0.6838610768318176, "learning_rate": 3.1027623787901706e-06, "loss": 0.13257505893707275, "memory(GiB)": 31.36, "step": 1750, "token_acc": 0.953539454854062, "train_speed(iter/s)": 0.113284 }, { "epoch": 1.8752337697034465, "grad_norm": 0.7307048439979553, "learning_rate": 3.076886351961217e-06, "loss": 0.14348651170730592, "memory(GiB)": 31.36, "step": 1755, "token_acc": 0.9522128203567298, "train_speed(iter/s)": 0.113384 }, { "epoch": 1.8805770772107935, "grad_norm": 0.7680135369300842, "learning_rate": 3.0510706335366034e-06, "loss": 0.14112248420715331, "memory(GiB)": 31.36, "step": 1760, "token_acc": 0.950036469730124, "train_speed(iter/s)": 0.11347 }, { "epoch": 1.8805770772107935, "eval_loss": 0.17695675790309906, "eval_runtime": 39.1442, "eval_samples_per_second": 15.43, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9390207810589977, "step": 1760 }, { "epoch": 1.8859203847181405, "grad_norm": 0.6436514258384705, "learning_rate": 3.02531603309136e-06, "loss": 0.13258137702941894, "memory(GiB)": 31.36, "step": 1765, "token_acc": 0.9418506940943162, "train_speed(iter/s)": 0.113198 }, { "epoch": 1.8912636922254875, "grad_norm": 0.783997654914856, "learning_rate": 2.9996233582838686e-06, "loss": 0.137099552154541, "memory(GiB)": 31.36, "step": 1770, "token_acc": 0.9559140509228075, "train_speed(iter/s)": 0.11329 }, { "epoch": 1.8966069997328345, "grad_norm": 0.6852074861526489, "learning_rate": 2.973993414830534e-06, "loss": 0.1261454463005066, "memory(GiB)": 31.36, "step": 1775, "token_acc": 0.9524192587295912, "train_speed(iter/s)": 0.113369 }, { "epoch": 1.9019503072401815, "grad_norm": 0.6981728076934814, "learning_rate": 2.948427006480528e-06, "loss": 0.1357527494430542, "memory(GiB)": 31.36, "step": 1780, "token_acc": 0.9534122629704496, "train_speed(iter/s)": 0.11347 }, { "epoch": 1.9019503072401815, "eval_loss": 0.17668980360031128, "eval_runtime": 39.1514, "eval_samples_per_second": 15.427, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9394754337429496, "step": 1780 }, { "epoch": 1.9072936147475286, "grad_norm": 0.6986772418022156, "learning_rate": 2.9229249349905686e-06, "loss": 0.1431878089904785, "memory(GiB)": 31.36, "step": 1785, "token_acc": 0.9415058402368222, "train_speed(iter/s)": 0.113218 }, { "epoch": 1.9126369222548758, "grad_norm": 0.6814377903938293, "learning_rate": 2.897488000099788e-06, "loss": 0.13923795223236085, "memory(GiB)": 31.36, "step": 1790, "token_acc": 0.9478648950197789, "train_speed(iter/s)": 0.113294 }, { "epoch": 1.9179802297622228, "grad_norm": 0.7366232872009277, "learning_rate": 2.8721169995046503e-06, "loss": 0.13349125385284424, "memory(GiB)": 31.36, "step": 1795, "token_acc": 0.9429411168541604, "train_speed(iter/s)": 0.113372 }, { "epoch": 1.9233235372695698, "grad_norm": 0.7991761565208435, "learning_rate": 2.846812728833931e-06, "loss": 0.13615771532058715, "memory(GiB)": 31.36, "step": 1800, "token_acc": 0.9488062932585297, "train_speed(iter/s)": 0.113453 }, { "epoch": 1.9233235372695698, "eval_loss": 0.17635242640972137, "eval_runtime": 39.1984, "eval_samples_per_second": 15.409, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9390679619978983, "step": 1800 }, { "epoch": 1.928666844776917, "grad_norm": 0.7395577430725098, "learning_rate": 2.8215759816237748e-06, "loss": 0.1406429648399353, "memory(GiB)": 31.36, "step": 1805, "token_acc": 0.9421045579401058, "train_speed(iter/s)": 0.113193 }, { "epoch": 1.934010152284264, "grad_norm": 0.7564118504524231, "learning_rate": 2.796407549292809e-06, "loss": 0.13832550048828124, "memory(GiB)": 31.36, "step": 1810, "token_acc": 0.956355867541584, "train_speed(iter/s)": 0.113289 }, { "epoch": 1.9393534597916111, "grad_norm": 0.6458448767662048, "learning_rate": 2.771308221117309e-06, "loss": 0.13285930156707765, "memory(GiB)": 31.36, "step": 1815, "token_acc": 0.950381946877847, "train_speed(iter/s)": 0.113379 }, { "epoch": 1.9446967672989581, "grad_norm": 0.7599870562553406, "learning_rate": 2.7462787842064753e-06, "loss": 0.131211256980896, "memory(GiB)": 31.36, "step": 1820, "token_acc": 0.9533532132424537, "train_speed(iter/s)": 0.113463 }, { "epoch": 1.9446967672989581, "eval_loss": 0.17678451538085938, "eval_runtime": 39.1954, "eval_samples_per_second": 15.41, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9391751914044908, "step": 1820 }, { "epoch": 1.9500400748063051, "grad_norm": 0.7510969042778015, "learning_rate": 2.7213200234777215e-06, "loss": 0.151234769821167, "memory(GiB)": 31.36, "step": 1825, "token_acc": 0.9407468474954135, "train_speed(iter/s)": 0.113216 }, { "epoch": 1.9553833823136522, "grad_norm": 0.7278915047645569, "learning_rate": 2.696432721632082e-06, "loss": 0.13203661441802977, "memory(GiB)": 31.36, "step": 1830, "token_acc": 0.9518988171303122, "train_speed(iter/s)": 0.113284 }, { "epoch": 1.9607266898209992, "grad_norm": 0.7572088241577148, "learning_rate": 2.671617659129655e-06, "loss": 0.14195291996002196, "memory(GiB)": 31.36, "step": 1835, "token_acc": 0.9429287939813056, "train_speed(iter/s)": 0.113373 }, { "epoch": 1.9660699973283462, "grad_norm": 0.7470288276672363, "learning_rate": 2.646875614165121e-06, "loss": 0.1265857696533203, "memory(GiB)": 31.36, "step": 1840, "token_acc": 0.9504980895196506, "train_speed(iter/s)": 0.113448 }, { "epoch": 1.9660699973283462, "eval_loss": 0.17609840631484985, "eval_runtime": 39.2428, "eval_samples_per_second": 15.391, "eval_steps_per_second": 3.848, "eval_token_acc": 0.9394668553904223, "step": 1840 }, { "epoch": 1.9714133048356932, "grad_norm": 0.6941691637039185, "learning_rate": 2.6222073626433587e-06, "loss": 0.13350989818572997, "memory(GiB)": 31.36, "step": 1845, "token_acc": 0.941098463918665, "train_speed(iter/s)": 0.113192 }, { "epoch": 1.9767566123430402, "grad_norm": 0.7226253747940063, "learning_rate": 2.597613678155092e-06, "loss": 0.13629913330078125, "memory(GiB)": 31.36, "step": 1850, "token_acc": 0.9471854356964505, "train_speed(iter/s)": 0.113272 }, { "epoch": 1.9820999198503872, "grad_norm": 0.7369644641876221, "learning_rate": 2.573095331952646e-06, "loss": 0.1342089891433716, "memory(GiB)": 31.36, "step": 1855, "token_acc": 0.9550353716804326, "train_speed(iter/s)": 0.113379 }, { "epoch": 1.9874432273577345, "grad_norm": 0.71446293592453, "learning_rate": 2.5486530929257574e-06, "loss": 0.13259618282318114, "memory(GiB)": 31.36, "step": 1860, "token_acc": 0.9522027151471942, "train_speed(iter/s)": 0.113466 }, { "epoch": 1.9874432273577345, "eval_loss": 0.1756637990474701, "eval_runtime": 39.2304, "eval_samples_per_second": 15.396, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9394625662141586, "step": 1860 }, { "epoch": 1.9927865348650815, "grad_norm": 0.7780827879905701, "learning_rate": 2.5242877275774446e-06, "loss": 0.1336849570274353, "memory(GiB)": 31.36, "step": 1865, "token_acc": 0.9416884046261456, "train_speed(iter/s)": 0.113224 }, { "epoch": 1.9981298423724285, "grad_norm": 0.6946832537651062, "learning_rate": 2.5000000000000015e-06, "loss": 0.14616479873657226, "memory(GiB)": 31.36, "step": 1870, "token_acc": 0.9527469722324645, "train_speed(iter/s)": 0.113319 }, { "epoch": 2.003205984504408, "grad_norm": 0.6446176171302795, "learning_rate": 2.475790671851007e-06, "loss": 0.11167683601379394, "memory(GiB)": 31.36, "step": 1875, "token_acc": 0.9654894371875504, "train_speed(iter/s)": 0.113423 }, { "epoch": 2.008549292011755, "grad_norm": 0.6247937083244324, "learning_rate": 2.4516605023294626e-06, "loss": 0.10328346490859985, "memory(GiB)": 31.36, "step": 1880, "token_acc": 0.966084815624222, "train_speed(iter/s)": 0.11351 }, { "epoch": 2.008549292011755, "eval_loss": 0.17980773746967316, "eval_runtime": 39.1557, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9395440605631689, "step": 1880 }, { "epoch": 2.013892599519102, "grad_norm": 0.6584553718566895, "learning_rate": 2.4276102481519655e-06, "loss": 0.10475772619247437, "memory(GiB)": 31.36, "step": 1885, "token_acc": 0.94671126227415, "train_speed(iter/s)": 0.113271 }, { "epoch": 2.019235907026449, "grad_norm": 0.6800820827484131, "learning_rate": 2.403640663528986e-06, "loss": 0.09606801271438599, "memory(GiB)": 31.36, "step": 1890, "token_acc": 0.9638240304639744, "train_speed(iter/s)": 0.113359 }, { "epoch": 2.0245792145337966, "grad_norm": 0.6699435710906982, "learning_rate": 2.379752500141222e-06, "loss": 0.09734945297241211, "memory(GiB)": 31.36, "step": 1895, "token_acc": 0.9634120335110434, "train_speed(iter/s)": 0.113444 }, { "epoch": 2.0299225220411437, "grad_norm": 0.7269750833511353, "learning_rate": 2.355946507116012e-06, "loss": 0.0947374939918518, "memory(GiB)": 31.36, "step": 1900, "token_acc": 0.9667849182180642, "train_speed(iter/s)": 0.113522 }, { "epoch": 2.0299225220411437, "eval_loss": 0.19244976341724396, "eval_runtime": 39.1428, "eval_samples_per_second": 15.431, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9386519119003195, "step": 1900 }, { "epoch": 2.0352658295484907, "grad_norm": 0.6670167446136475, "learning_rate": 2.332223431003859e-06, "loss": 0.09205610752105713, "memory(GiB)": 31.36, "step": 1905, "token_acc": 0.9472729334391654, "train_speed(iter/s)": 0.11327 }, { "epoch": 2.0406091370558377, "grad_norm": 0.8095514178276062, "learning_rate": 2.3085840157550036e-06, "loss": 0.09566161036491394, "memory(GiB)": 31.36, "step": 1910, "token_acc": 0.9658257869771453, "train_speed(iter/s)": 0.113366 }, { "epoch": 2.0459524445631847, "grad_norm": 0.7360612154006958, "learning_rate": 2.2850290026961032e-06, "loss": 0.10009205341339111, "memory(GiB)": 31.36, "step": 1915, "token_acc": 0.9632088520055325, "train_speed(iter/s)": 0.113459 }, { "epoch": 2.0512957520705317, "grad_norm": 0.8155742883682251, "learning_rate": 2.2615591305069846e-06, "loss": 0.09215841293334961, "memory(GiB)": 31.36, "step": 1920, "token_acc": 0.968572231196011, "train_speed(iter/s)": 0.113542 }, { "epoch": 2.0512957520705317, "eval_loss": 0.19029787182807922, "eval_runtime": 39.1166, "eval_samples_per_second": 15.441, "eval_steps_per_second": 3.86, "eval_token_acc": 0.9387848763644941, "step": 1920 }, { "epoch": 2.0566390595778787, "grad_norm": 0.7107601761817932, "learning_rate": 2.238175135197471e-06, "loss": 0.09783934354782105, "memory(GiB)": 31.36, "step": 1925, "token_acc": 0.9484311762913965, "train_speed(iter/s)": 0.113297 }, { "epoch": 2.0619823670852258, "grad_norm": 0.7164185643196106, "learning_rate": 2.2148777500843125e-06, "loss": 0.10058900117874145, "memory(GiB)": 31.36, "step": 1930, "token_acc": 0.9623356362825941, "train_speed(iter/s)": 0.113372 }, { "epoch": 2.0673256745925728, "grad_norm": 0.7500390410423279, "learning_rate": 2.1916677057681786e-06, "loss": 0.10025620460510254, "memory(GiB)": 31.36, "step": 1935, "token_acc": 0.9640934730056406, "train_speed(iter/s)": 0.113457 }, { "epoch": 2.07266898209992, "grad_norm": 0.7709511518478394, "learning_rate": 2.1685457301107506e-06, "loss": 0.10047693252563476, "memory(GiB)": 31.36, "step": 1940, "token_acc": 0.9648505046059044, "train_speed(iter/s)": 0.113539 }, { "epoch": 2.07266898209992, "eval_loss": 0.19032922387123108, "eval_runtime": 39.188, "eval_samples_per_second": 15.413, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9384202963820798, "step": 1940 }, { "epoch": 2.078012289607267, "grad_norm": 0.6654588580131531, "learning_rate": 2.145512548211902e-06, "loss": 0.09436342120170593, "memory(GiB)": 31.36, "step": 1945, "token_acc": 0.9468971724468446, "train_speed(iter/s)": 0.113306 }, { "epoch": 2.083355597114614, "grad_norm": 0.7399053573608398, "learning_rate": 2.1225688823869494e-06, "loss": 0.09469984173774719, "memory(GiB)": 31.36, "step": 1950, "token_acc": 0.9699618029029794, "train_speed(iter/s)": 0.113384 }, { "epoch": 2.088698904621961, "grad_norm": 0.7095410227775574, "learning_rate": 2.09971545214401e-06, "loss": 0.09509609937667847, "memory(GiB)": 31.36, "step": 1955, "token_acc": 0.9642617302694969, "train_speed(iter/s)": 0.113448 }, { "epoch": 2.094042212129308, "grad_norm": 0.684771716594696, "learning_rate": 2.0769529741614297e-06, "loss": 0.09686210751533508, "memory(GiB)": 31.36, "step": 1960, "token_acc": 0.9665719778485553, "train_speed(iter/s)": 0.113535 }, { "epoch": 2.094042212129308, "eval_loss": 0.19170063734054565, "eval_runtime": 39.1596, "eval_samples_per_second": 15.424, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9384932123785626, "step": 1960 }, { "epoch": 2.0993855196366553, "grad_norm": 0.6467424631118774, "learning_rate": 2.054282162265313e-06, "loss": 0.09031983613967895, "memory(GiB)": 31.36, "step": 1965, "token_acc": 0.9467923722623636, "train_speed(iter/s)": 0.113303 }, { "epoch": 2.1047288271440023, "grad_norm": 0.6366815567016602, "learning_rate": 2.0317037274071412e-06, "loss": 0.08445571660995484, "memory(GiB)": 31.36, "step": 1970, "token_acc": 0.9674427290836654, "train_speed(iter/s)": 0.113383 }, { "epoch": 2.1100721346513494, "grad_norm": 0.6350346207618713, "learning_rate": 2.009218377641466e-06, "loss": 0.0988619565963745, "memory(GiB)": 31.36, "step": 1975, "token_acc": 0.9623879433545537, "train_speed(iter/s)": 0.113457 }, { "epoch": 2.1154154421586964, "grad_norm": 0.6696274280548096, "learning_rate": 1.9868268181037186e-06, "loss": 0.09375531673431396, "memory(GiB)": 31.36, "step": 1980, "token_acc": 0.9656578045525053, "train_speed(iter/s)": 0.113523 }, { "epoch": 2.1154154421586964, "eval_loss": 0.19252096116542816, "eval_runtime": 39.2604, "eval_samples_per_second": 15.384, "eval_steps_per_second": 3.846, "eval_token_acc": 0.9383945613244976, "step": 1980 }, { "epoch": 2.1207587496660434, "grad_norm": 0.7472742199897766, "learning_rate": 1.964529750988086e-06, "loss": 0.09353007674217224, "memory(GiB)": 31.36, "step": 1985, "token_acc": 0.9438841714662963, "train_speed(iter/s)": 0.113283 }, { "epoch": 2.1261020571733904, "grad_norm": 0.7385168671607971, "learning_rate": 1.9423278755254933e-06, "loss": 0.09612951874732971, "memory(GiB)": 31.36, "step": 1990, "token_acc": 0.962575335363878, "train_speed(iter/s)": 0.113363 }, { "epoch": 2.1314453646807374, "grad_norm": 0.769904613494873, "learning_rate": 1.9202218879616824e-06, "loss": 0.09348126649856567, "memory(GiB)": 31.36, "step": 1995, "token_acc": 0.9666374287325356, "train_speed(iter/s)": 0.113436 }, { "epoch": 2.1367886721880844, "grad_norm": 0.7597969770431519, "learning_rate": 1.8982124815353665e-06, "loss": 0.09418823719024658, "memory(GiB)": 31.37, "step": 2000, "token_acc": 0.9634146341463414, "train_speed(iter/s)": 0.113511 }, { "epoch": 2.1367886721880844, "eval_loss": 0.1915539652109146, "eval_runtime": 39.2063, "eval_samples_per_second": 15.406, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9383216453280148, "step": 2000 }, { "epoch": 2.1421319796954315, "grad_norm": 0.7184767127037048, "learning_rate": 1.8763003464565022e-06, "loss": 0.0999064326286316, "memory(GiB)": 31.37, "step": 2005, "token_acc": 0.9464714514407684, "train_speed(iter/s)": 0.113276 }, { "epoch": 2.1474752872027785, "grad_norm": 0.6783972382545471, "learning_rate": 1.854486169884635e-06, "loss": 0.09837267994880676, "memory(GiB)": 31.37, "step": 2010, "token_acc": 0.9645275422436406, "train_speed(iter/s)": 0.113352 }, { "epoch": 2.1528185947101255, "grad_norm": 0.746529221534729, "learning_rate": 1.8327706359073526e-06, "loss": 0.09520338773727417, "memory(GiB)": 31.37, "step": 2015, "token_acc": 0.9642478360532348, "train_speed(iter/s)": 0.113417 }, { "epoch": 2.1581619022174725, "grad_norm": 0.6819494366645813, "learning_rate": 1.8111544255188402e-06, "loss": 0.09425632357597351, "memory(GiB)": 31.37, "step": 2020, "token_acc": 0.9684003992977867, "train_speed(iter/s)": 0.113509 }, { "epoch": 2.1581619022174725, "eval_loss": 0.1915895640850067, "eval_runtime": 39.2397, "eval_samples_per_second": 15.393, "eval_steps_per_second": 3.848, "eval_token_acc": 0.9385103690836175, "step": 2020 }, { "epoch": 2.1635052097248195, "grad_norm": 0.7457813024520874, "learning_rate": 1.7896382165985094e-06, "loss": 0.09581427574157715, "memory(GiB)": 31.37, "step": 2025, "token_acc": 0.9465357277296783, "train_speed(iter/s)": 0.113279 }, { "epoch": 2.1688485172321665, "grad_norm": 0.6739791631698608, "learning_rate": 1.768222683889757e-06, "loss": 0.09893481731414795, "memory(GiB)": 31.37, "step": 2030, "token_acc": 0.9669017905588714, "train_speed(iter/s)": 0.113367 }, { "epoch": 2.1741918247395136, "grad_norm": 0.6932037472724915, "learning_rate": 1.746908498978791e-06, "loss": 0.0985231876373291, "memory(GiB)": 31.37, "step": 2035, "token_acc": 0.9686975154919789, "train_speed(iter/s)": 0.113426 }, { "epoch": 2.179535132246861, "grad_norm": 0.7460088133811951, "learning_rate": 1.7256963302735752e-06, "loss": 0.09795750975608826, "memory(GiB)": 31.37, "step": 2040, "token_acc": 0.964236412083855, "train_speed(iter/s)": 0.113512 }, { "epoch": 2.179535132246861, "eval_loss": 0.19136284291744232, "eval_runtime": 39.234, "eval_samples_per_second": 15.395, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9384631881447169, "step": 2040 }, { "epoch": 2.184878439754208, "grad_norm": 0.7117043137550354, "learning_rate": 1.7045868429828745e-06, "loss": 0.09070048332214356, "memory(GiB)": 31.37, "step": 2045, "token_acc": 0.9479196430533882, "train_speed(iter/s)": 0.113281 }, { "epoch": 2.190221747261555, "grad_norm": 0.652874767780304, "learning_rate": 1.6835806990953802e-06, "loss": 0.09067975282669068, "memory(GiB)": 31.37, "step": 2050, "token_acc": 0.9657613864524992, "train_speed(iter/s)": 0.113343 }, { "epoch": 2.195565054768902, "grad_norm": 0.6861611604690552, "learning_rate": 1.6626785573589667e-06, "loss": 0.09590352177619935, "memory(GiB)": 31.37, "step": 2055, "token_acc": 0.9615629645359949, "train_speed(iter/s)": 0.113419 }, { "epoch": 2.200908362276249, "grad_norm": 0.6657945513725281, "learning_rate": 1.6418810732600177e-06, "loss": 0.0890547752380371, "memory(GiB)": 31.37, "step": 2060, "token_acc": 0.968454143363673, "train_speed(iter/s)": 0.113499 }, { "epoch": 2.200908362276249, "eval_loss": 0.19242902100086212, "eval_runtime": 39.2335, "eval_samples_per_second": 15.395, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9385103690836175, "step": 2060 }, { "epoch": 2.206251669783596, "grad_norm": 0.7557697892189026, "learning_rate": 1.6211888990028785e-06, "loss": 0.09434689283370971, "memory(GiB)": 31.37, "step": 2065, "token_acc": 0.9467688912388791, "train_speed(iter/s)": 0.113268 }, { "epoch": 2.211594977290943, "grad_norm": 0.7953412532806396, "learning_rate": 1.6006026834894068e-06, "loss": 0.10123894214630128, "memory(GiB)": 31.37, "step": 2070, "token_acc": 0.9602344454463481, "train_speed(iter/s)": 0.113338 }, { "epoch": 2.21693828479829, "grad_norm": 0.720357358455658, "learning_rate": 1.5801230722986104e-06, "loss": 0.09082142114639283, "memory(GiB)": 31.37, "step": 2075, "token_acc": 0.9670389539634977, "train_speed(iter/s)": 0.113403 }, { "epoch": 2.222281592305637, "grad_norm": 0.6819139122962952, "learning_rate": 1.5597507076664187e-06, "loss": 0.088398015499115, "memory(GiB)": 31.37, "step": 2080, "token_acc": 0.9697791553661371, "train_speed(iter/s)": 0.11347 }, { "epoch": 2.222281592305637, "eval_loss": 0.1921994835138321, "eval_runtime": 39.1975, "eval_samples_per_second": 15.409, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9386047309614188, "step": 2080 }, { "epoch": 2.227624899812984, "grad_norm": 0.7003424167633057, "learning_rate": 1.5394862284655266e-06, "loss": 0.09183210134506226, "memory(GiB)": 31.37, "step": 2085, "token_acc": 0.9471144387997177, "train_speed(iter/s)": 0.113237 }, { "epoch": 2.232968207320331, "grad_norm": 0.735797107219696, "learning_rate": 1.5193302701853674e-06, "loss": 0.09793744683265686, "memory(GiB)": 31.37, "step": 2090, "token_acc": 0.9629603963826617, "train_speed(iter/s)": 0.113318 }, { "epoch": 2.238311514827678, "grad_norm": 0.7035617828369141, "learning_rate": 1.499283464912188e-06, "loss": 0.09379619359970093, "memory(GiB)": 31.37, "step": 2095, "token_acc": 0.9692680087017802, "train_speed(iter/s)": 0.113392 }, { "epoch": 2.2436548223350252, "grad_norm": 0.653683602809906, "learning_rate": 1.4793464413092161e-06, "loss": 0.08412163257598877, "memory(GiB)": 31.37, "step": 2100, "token_acc": 0.9694610598455018, "train_speed(iter/s)": 0.113463 }, { "epoch": 2.2436548223350252, "eval_loss": 0.19128654897212982, "eval_runtime": 39.2107, "eval_samples_per_second": 15.404, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9386562010765832, "step": 2100 }, { "epoch": 2.2489981298423722, "grad_norm": 0.720621645450592, "learning_rate": 1.459519824596956e-06, "loss": 0.09899259209632874, "memory(GiB)": 31.37, "step": 2105, "token_acc": 0.9451508803713978, "train_speed(iter/s)": 0.113243 }, { "epoch": 2.2543414373497193, "grad_norm": 0.6509672403335571, "learning_rate": 1.4398042365335745e-06, "loss": 0.09483298659324646, "memory(GiB)": 31.37, "step": 2110, "token_acc": 0.9613455205736621, "train_speed(iter/s)": 0.113303 }, { "epoch": 2.2596847448570667, "grad_norm": 0.7314789891242981, "learning_rate": 1.4202002953954042e-06, "loss": 0.0946409523487091, "memory(GiB)": 31.37, "step": 2115, "token_acc": 0.9664519265832017, "train_speed(iter/s)": 0.11336 }, { "epoch": 2.2650280523644137, "grad_norm": 0.7073846459388733, "learning_rate": 1.4007086159575595e-06, "loss": 0.0916548490524292, "memory(GiB)": 31.37, "step": 2120, "token_acc": 0.967791956065256, "train_speed(iter/s)": 0.113428 }, { "epoch": 2.2650280523644137, "eval_loss": 0.1914122849702835, "eval_runtime": 39.2346, "eval_samples_per_second": 15.395, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9387119603680113, "step": 2120 }, { "epoch": 2.2703713598717608, "grad_norm": 0.744266152381897, "learning_rate": 1.3813298094746491e-06, "loss": 0.10223530530929566, "memory(GiB)": 31.37, "step": 2125, "token_acc": 0.9450423116125821, "train_speed(iter/s)": 0.11321 }, { "epoch": 2.2757146673791078, "grad_norm": 0.7458162307739258, "learning_rate": 1.362064483661617e-06, "loss": 0.0993034303188324, "memory(GiB)": 31.37, "step": 2130, "token_acc": 0.9647883245497191, "train_speed(iter/s)": 0.113282 }, { "epoch": 2.281057974886455, "grad_norm": 0.7311099767684937, "learning_rate": 1.3429132426746743e-06, "loss": 0.08791648149490357, "memory(GiB)": 31.37, "step": 2135, "token_acc": 0.9718640093786636, "train_speed(iter/s)": 0.11335 }, { "epoch": 2.286401282393802, "grad_norm": 0.6718652248382568, "learning_rate": 1.3238766870923592e-06, "loss": 0.09885276556015014, "memory(GiB)": 31.37, "step": 2140, "token_acc": 0.9679231605654223, "train_speed(iter/s)": 0.113429 }, { "epoch": 2.286401282393802, "eval_loss": 0.1916726678609848, "eval_runtime": 39.2123, "eval_samples_per_second": 15.403, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9388191897746038, "step": 2140 }, { "epoch": 2.291744589901149, "grad_norm": 0.7811794877052307, "learning_rate": 1.3049554138967052e-06, "loss": 0.09513717889785767, "memory(GiB)": 31.37, "step": 2145, "token_acc": 0.9481464839361942, "train_speed(iter/s)": 0.113213 }, { "epoch": 2.297087897408496, "grad_norm": 0.7876958847045898, "learning_rate": 1.286150016454511e-06, "loss": 0.09975624084472656, "memory(GiB)": 31.37, "step": 2150, "token_acc": 0.9664041395578892, "train_speed(iter/s)": 0.113297 }, { "epoch": 2.302431204915843, "grad_norm": 0.6765829920768738, "learning_rate": 1.267461084498744e-06, "loss": 0.09555368423461914, "memory(GiB)": 31.37, "step": 2155, "token_acc": 0.9689418005736669, "train_speed(iter/s)": 0.113362 }, { "epoch": 2.30777451242319, "grad_norm": 0.7572174668312073, "learning_rate": 1.2488892041100364e-06, "loss": 0.09787599444389343, "memory(GiB)": 31.37, "step": 2160, "token_acc": 0.9644865820343885, "train_speed(iter/s)": 0.113441 }, { "epoch": 2.30777451242319, "eval_loss": 0.19186273217201233, "eval_runtime": 39.232, "eval_samples_per_second": 15.396, "eval_steps_per_second": 3.849, "eval_token_acc": 0.9388191897746038, "step": 2160 }, { "epoch": 2.313117819930537, "grad_norm": 0.7487595677375793, "learning_rate": 1.2304349576983094e-06, "loss": 0.09345256090164185, "memory(GiB)": 31.37, "step": 2165, "token_acc": 0.9464117197681211, "train_speed(iter/s)": 0.113226 }, { "epoch": 2.318461127437884, "grad_norm": 0.7937645316123962, "learning_rate": 1.2120989239845149e-06, "loss": 0.09632455110549927, "memory(GiB)": 31.37, "step": 2170, "token_acc": 0.9689201004033055, "train_speed(iter/s)": 0.113299 }, { "epoch": 2.323804434945231, "grad_norm": 0.7212129831314087, "learning_rate": 1.1938816779824753e-06, "loss": 0.09377689361572265, "memory(GiB)": 31.37, "step": 2175, "token_acc": 0.9685556654016647, "train_speed(iter/s)": 0.113374 }, { "epoch": 2.3291477424525784, "grad_norm": 0.6610186696052551, "learning_rate": 1.1757837909808628e-06, "loss": 0.09794212579727173, "memory(GiB)": 31.37, "step": 2180, "token_acc": 0.9649737302977233, "train_speed(iter/s)": 0.113446 }, { "epoch": 2.3291477424525784, "eval_loss": 0.1913762092590332, "eval_runtime": 39.2194, "eval_samples_per_second": 15.401, "eval_steps_per_second": 3.85, "eval_token_acc": 0.9388835274185593, "step": 2180 }, { "epoch": 2.334491049959925, "grad_norm": 0.6436466574668884, "learning_rate": 1.157805830525275e-06, "loss": 0.08866640329360961, "memory(GiB)": 31.37, "step": 2185, "token_acc": 0.9485147195950446, "train_speed(iter/s)": 0.113237 }, { "epoch": 2.3398343574672724, "grad_norm": 0.7666484713554382, "learning_rate": 1.1399483604004403e-06, "loss": 0.08878711462020875, "memory(GiB)": 31.37, "step": 2190, "token_acc": 0.9614538598512622, "train_speed(iter/s)": 0.113307 }, { "epoch": 2.3451776649746194, "grad_norm": 0.7943670153617859, "learning_rate": 1.1222119406125426e-06, "loss": 0.09242654442787171, "memory(GiB)": 31.37, "step": 2195, "token_acc": 0.96523288032722, "train_speed(iter/s)": 0.113373 }, { "epoch": 2.3505209724819665, "grad_norm": 0.755363941192627, "learning_rate": 1.1045971273716476e-06, "loss": 0.10125420093536378, "memory(GiB)": 31.37, "step": 2200, "token_acc": 0.964818502602802, "train_speed(iter/s)": 0.113435 }, { "epoch": 2.3505209724819665, "eval_loss": 0.19072869420051575, "eval_runtime": 39.1742, "eval_samples_per_second": 15.418, "eval_steps_per_second": 3.855, "eval_token_acc": 0.938943575886251, "step": 2200 }, { "epoch": 2.3558642799893135, "grad_norm": 0.7719199657440186, "learning_rate": 1.0871044730742752e-06, "loss": 0.0994708001613617, "memory(GiB)": 31.37, "step": 2205, "token_acc": 0.9452918677716055, "train_speed(iter/s)": 0.113231 }, { "epoch": 2.3612075874966605, "grad_norm": 0.7336219549179077, "learning_rate": 1.0697345262860638e-06, "loss": 0.09733407497406006, "memory(GiB)": 31.37, "step": 2210, "token_acc": 0.962336711024295, "train_speed(iter/s)": 0.1133 }, { "epoch": 2.3665508950040075, "grad_norm": 0.7070329189300537, "learning_rate": 1.0524878317245713e-06, "loss": 0.08725832104682922, "memory(GiB)": 31.37, "step": 2215, "token_acc": 0.9673966575828217, "train_speed(iter/s)": 0.113371 }, { "epoch": 2.3718942025113545, "grad_norm": 0.7620383501052856, "learning_rate": 1.0353649302421982e-06, "loss": 0.08947555422782898, "memory(GiB)": 31.37, "step": 2220, "token_acc": 0.9708626514987136, "train_speed(iter/s)": 0.113435 }, { "epoch": 2.3718942025113545, "eval_loss": 0.19121414422988892, "eval_runtime": 39.1416, "eval_samples_per_second": 15.431, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9391365888181175, "step": 2220 }, { "epoch": 2.3772375100187015, "grad_norm": 0.6480873227119446, "learning_rate": 1.0183663588092214e-06, "loss": 0.0861007571220398, "memory(GiB)": 31.37, "step": 2225, "token_acc": 0.9472103487064117, "train_speed(iter/s)": 0.11322 }, { "epoch": 2.3825808175260486, "grad_norm": 0.7264376878738403, "learning_rate": 1.0014926504969535e-06, "loss": 0.09383871555328369, "memory(GiB)": 31.37, "step": 2230, "token_acc": 0.968945743273048, "train_speed(iter/s)": 0.113288 }, { "epoch": 2.3879241250333956, "grad_norm": 0.6731327772140503, "learning_rate": 9.847443344610296e-07, "loss": 0.09176123142242432, "memory(GiB)": 31.37, "step": 2235, "token_acc": 0.9693463125322331, "train_speed(iter/s)": 0.113364 }, { "epoch": 2.3932674325407426, "grad_norm": 0.8005653023719788, "learning_rate": 9.681219359248106e-07, "loss": 0.09590315818786621, "memory(GiB)": 31.37, "step": 2240, "token_acc": 0.9636749520427738, "train_speed(iter/s)": 0.113431 }, { "epoch": 2.3932674325407426, "eval_loss": 0.19097845256328583, "eval_runtime": 39.2123, "eval_samples_per_second": 15.403, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9388963949473503, "step": 2240 }, { "epoch": 2.3986107400480896, "grad_norm": 0.7299765348434448, "learning_rate": 9.516259761629148e-07, "loss": 0.08505445718765259, "memory(GiB)": 31.37, "step": 2245, "token_acc": 0.944954128440367, "train_speed(iter/s)": 0.113227 }, { "epoch": 2.4039540475554366, "grad_norm": 0.7002612948417664, "learning_rate": 9.352569724848715e-07, "loss": 0.08956900835037232, "memory(GiB)": 31.37, "step": 2250, "token_acc": 0.9705892762342948, "train_speed(iter/s)": 0.113285 }, { "epoch": 2.409297355062784, "grad_norm": 0.7428932785987854, "learning_rate": 9.190154382188921e-07, "loss": 0.09664742350578308, "memory(GiB)": 31.37, "step": 2255, "token_acc": 0.9705281875658588, "train_speed(iter/s)": 0.113348 }, { "epoch": 2.414640662570131, "grad_norm": 0.6886446475982666, "learning_rate": 9.029018826957775e-07, "loss": 0.09689427018165589, "memory(GiB)": 31.37, "step": 2260, "token_acc": 0.960169941582581, "train_speed(iter/s)": 0.113411 }, { "epoch": 2.414640662570131, "eval_loss": 0.1908179074525833, "eval_runtime": 39.2091, "eval_samples_per_second": 15.405, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9391408779943812, "step": 2260 }, { "epoch": 2.419983970077478, "grad_norm": 0.7271323800086975, "learning_rate": 8.86916811232944e-07, "loss": 0.09017704725265503, "memory(GiB)": 31.37, "step": 2265, "token_acc": 0.9487460692263789, "train_speed(iter/s)": 0.113206 }, { "epoch": 2.425327277584825, "grad_norm": 0.7703067064285278, "learning_rate": 8.710607251185799e-07, "loss": 0.09243172407150269, "memory(GiB)": 31.37, "step": 2270, "token_acc": 0.9636782618237683, "train_speed(iter/s)": 0.113273 }, { "epoch": 2.430670585092172, "grad_norm": 0.7155824899673462, "learning_rate": 8.553341215959215e-07, "loss": 0.09737263917922974, "memory(GiB)": 31.37, "step": 2275, "token_acc": 0.9632417350679687, "train_speed(iter/s)": 0.113346 }, { "epoch": 2.436013892599519, "grad_norm": 0.7440068125724792, "learning_rate": 8.397374938476594e-07, "loss": 0.09268940091133118, "memory(GiB)": 31.37, "step": 2280, "token_acc": 0.9701119443538746, "train_speed(iter/s)": 0.11341 }, { "epoch": 2.436013892599519, "eval_loss": 0.190630704164505, "eval_runtime": 39.2105, "eval_samples_per_second": 15.404, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9391837697570181, "step": 2280 }, { "epoch": 2.441357200106866, "grad_norm": 0.7325111031532288, "learning_rate": 8.242713309804729e-07, "loss": 0.09075909256935119, "memory(GiB)": 31.37, "step": 2285, "token_acc": 0.947367209794716, "train_speed(iter/s)": 0.113205 }, { "epoch": 2.446700507614213, "grad_norm": 0.757805585861206, "learning_rate": 8.089361180096927e-07, "loss": 0.09486221075057984, "memory(GiB)": 31.37, "step": 2290, "token_acc": 0.9661316211878009, "train_speed(iter/s)": 0.113267 }, { "epoch": 2.45204381512156, "grad_norm": 0.7468327283859253, "learning_rate": 7.937323358440935e-07, "loss": 0.09470909833908081, "memory(GiB)": 31.37, "step": 2295, "token_acc": 0.9667186525265128, "train_speed(iter/s)": 0.113336 }, { "epoch": 2.4573871226289072, "grad_norm": 0.7450569868087769, "learning_rate": 7.786604612708093e-07, "loss": 0.09495973587036133, "memory(GiB)": 31.37, "step": 2300, "token_acc": 0.9614005227645736, "train_speed(iter/s)": 0.113403 }, { "epoch": 2.4573871226289072, "eval_loss": 0.19077461957931519, "eval_runtime": 39.2014, "eval_samples_per_second": 15.408, "eval_steps_per_second": 3.852, "eval_token_acc": 0.9391537455231723, "step": 2300 }, { "epoch": 2.4627304301362543, "grad_norm": 0.6540588736534119, "learning_rate": 7.637209669403789e-07, "loss": 0.0869560956954956, "memory(GiB)": 31.37, "step": 2305, "token_acc": 0.9489500731715618, "train_speed(iter/s)": 0.113205 }, { "epoch": 2.4680737376436013, "grad_norm": 0.738757848739624, "learning_rate": 7.489143213519301e-07, "loss": 0.09310966730117798, "memory(GiB)": 31.37, "step": 2310, "token_acc": 0.9699513919575784, "train_speed(iter/s)": 0.113277 }, { "epoch": 2.4734170451509483, "grad_norm": 0.6581406593322754, "learning_rate": 7.342409888384816e-07, "loss": 0.08957692980766296, "memory(GiB)": 31.37, "step": 2315, "token_acc": 0.9666096166887331, "train_speed(iter/s)": 0.113338 }, { "epoch": 2.4787603526582953, "grad_norm": 0.6974185705184937, "learning_rate": 7.197014295523879e-07, "loss": 0.08715896606445313, "memory(GiB)": 31.37, "step": 2320, "token_acc": 0.9711248792133235, "train_speed(iter/s)": 0.113406 }, { "epoch": 2.4787603526582953, "eval_loss": 0.19092287123203278, "eval_runtime": 39.2106, "eval_samples_per_second": 15.404, "eval_steps_per_second": 3.851, "eval_token_acc": 0.9391966372858093, "step": 2320 }, { "epoch": 2.4841036601656423, "grad_norm": 0.7533718347549438, "learning_rate": 7.052960994509056e-07, "loss": 0.09223737120628357, "memory(GiB)": 31.37, "step": 2325, "token_acc": 0.9484453382745063, "train_speed(iter/s)": 0.113199 }, { "epoch": 2.48944696767299, "grad_norm": 0.7056859731674194, "learning_rate": 6.910254502818914e-07, "loss": 0.08803938627243042, "memory(GiB)": 31.37, "step": 2330, "token_acc": 0.9710507958653053, "train_speed(iter/s)": 0.113258 }, { "epoch": 2.494790275180337, "grad_norm": 0.7328131198883057, "learning_rate": 6.768899295696413e-07, "loss": 0.09180974960327148, "memory(GiB)": 31.37, "step": 2335, "token_acc": 0.9653558350581117, "train_speed(iter/s)": 0.113316 }, { "epoch": 2.500133582687684, "grad_norm": 0.7516103982925415, "learning_rate": 6.628899806008515e-07, "loss": 0.09033479690551757, "memory(GiB)": 31.37, "step": 2340, "token_acc": 0.9666970260959094, "train_speed(iter/s)": 0.113382 }, { "epoch": 2.500133582687684, "eval_loss": 0.19086231291294098, "eval_runtime": 39.1889, "eval_samples_per_second": 15.413, "eval_steps_per_second": 3.853, "eval_token_acc": 0.939299577516138, "step": 2340 }, { "epoch": 2.505476890195031, "grad_norm": 0.8012656569480896, "learning_rate": 6.490260424107231e-07, "loss": 0.09262714982032776, "memory(GiB)": 31.37, "step": 2345, "token_acc": 0.9465704224528235, "train_speed(iter/s)": 0.113185 }, { "epoch": 2.510820197702378, "grad_norm": 0.6409561634063721, "learning_rate": 6.352985497691883e-07, "loss": 0.09137773513793945, "memory(GiB)": 31.37, "step": 2350, "token_acc": 0.9675587467362924, "train_speed(iter/s)": 0.113254 }, { "epoch": 2.516163505209725, "grad_norm": 0.790011465549469, "learning_rate": 6.217079331672777e-07, "loss": 0.10272359848022461, "memory(GiB)": 31.37, "step": 2355, "token_acc": 0.9673232662587969, "train_speed(iter/s)": 0.113325 }, { "epoch": 2.521506812717072, "grad_norm": 0.7116812467575073, "learning_rate": 6.082546188036204e-07, "loss": 0.09815052747726441, "memory(GiB)": 31.37, "step": 2360, "token_acc": 0.9669859985261606, "train_speed(iter/s)": 0.113389 }, { "epoch": 2.521506812717072, "eval_loss": 0.19036073982715607, "eval_runtime": 39.1911, "eval_samples_per_second": 15.412, "eval_steps_per_second": 3.853, "eval_token_acc": 0.9394539878616311, "step": 2360 }, { "epoch": 2.526850120224419, "grad_norm": 0.755511462688446, "learning_rate": 5.949390285710777e-07, "loss": 0.09070051908493042, "memory(GiB)": 31.37, "step": 2365, "token_acc": 0.9475465313028765, "train_speed(iter/s)": 0.113185 }, { "epoch": 2.532193427731766, "grad_norm": 0.694186806678772, "learning_rate": 5.817615800435167e-07, "loss": 0.09250964522361756, "memory(GiB)": 31.37, "step": 2370, "token_acc": 0.9697894963718658, "train_speed(iter/s)": 0.113247 }, { "epoch": 2.537536735239113, "grad_norm": 0.7912867665290833, "learning_rate": 5.687226864627115e-07, "loss": 0.10275306701660156, "memory(GiB)": 31.37, "step": 2375, "token_acc": 0.9677234207772039, "train_speed(iter/s)": 0.113304 }, { "epoch": 2.54288004274646, "grad_norm": 0.7408064007759094, "learning_rate": 5.558227567253832e-07, "loss": 0.09581139087677001, "memory(GiB)": 31.37, "step": 2380, "token_acc": 0.9702766420961533, "train_speed(iter/s)": 0.113383 }, { "epoch": 2.54288004274646, "eval_loss": 0.19033583998680115, "eval_runtime": 39.1446, "eval_samples_per_second": 15.43, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9392781316348195, "step": 2380 }, { "epoch": 2.548223350253807, "grad_norm": 0.6932405829429626, "learning_rate": 5.430621953703785e-07, "loss": 0.09184646606445312, "memory(GiB)": 31.37, "step": 2385, "token_acc": 0.9487018329752709, "train_speed(iter/s)": 0.113172 }, { "epoch": 2.553566657761154, "grad_norm": 0.6893176436424255, "learning_rate": 5.304414025659832e-07, "loss": 0.08671947121620179, "memory(GiB)": 31.37, "step": 2390, "token_acc": 0.9663333126281135, "train_speed(iter/s)": 0.113228 }, { "epoch": 2.5589099652685015, "grad_norm": 0.6595478057861328, "learning_rate": 5.179607740973764e-07, "loss": 0.09736074805259705, "memory(GiB)": 31.37, "step": 2395, "token_acc": 0.9659873313136876, "train_speed(iter/s)": 0.113281 }, { "epoch": 2.564253272775848, "grad_norm": 0.738300085067749, "learning_rate": 5.056207013542131e-07, "loss": 0.09036798477172851, "memory(GiB)": 31.37, "step": 2400, "token_acc": 0.9698333820520316, "train_speed(iter/s)": 0.113341 }, { "epoch": 2.564253272775848, "eval_loss": 0.1903306394815445, "eval_runtime": 39.1763, "eval_samples_per_second": 15.417, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9393596259838298, "step": 2400 }, { "epoch": 2.5695965802831955, "grad_norm": 0.6590713858604431, "learning_rate": 4.934215713183527e-07, "loss": 0.08946118354797364, "memory(GiB)": 31.37, "step": 2405, "token_acc": 0.9487092468155388, "train_speed(iter/s)": 0.113141 }, { "epoch": 2.5749398877905425, "grad_norm": 0.7545654773712158, "learning_rate": 4.813637665517251e-07, "loss": 0.08997320532798767, "memory(GiB)": 31.37, "step": 2410, "token_acc": 0.9712623356584911, "train_speed(iter/s)": 0.113199 }, { "epoch": 2.5802831952978895, "grad_norm": 0.722476601600647, "learning_rate": 4.6944766518432936e-07, "loss": 0.09507122039794921, "memory(GiB)": 31.37, "step": 2415, "token_acc": 0.9640161909989023, "train_speed(iter/s)": 0.113266 }, { "epoch": 2.5856265028052365, "grad_norm": 0.7068451046943665, "learning_rate": 4.576736409023813e-07, "loss": 0.08914280533790589, "memory(GiB)": 31.37, "step": 2420, "token_acc": 0.9624791076849609, "train_speed(iter/s)": 0.113323 }, { "epoch": 2.5856265028052365, "eval_loss": 0.18999898433685303, "eval_runtime": 39.1317, "eval_samples_per_second": 15.435, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9394068069227305, "step": 2420 }, { "epoch": 2.5909698103125836, "grad_norm": 0.7488506436347961, "learning_rate": 4.460420629365919e-07, "loss": 0.10495038032531738, "memory(GiB)": 31.37, "step": 2425, "token_acc": 0.9451158336373586, "train_speed(iter/s)": 0.113141 }, { "epoch": 2.5963131178199306, "grad_norm": 0.7361159324645996, "learning_rate": 4.3455329605058436e-07, "loss": 0.09409030675888061, "memory(GiB)": 31.37, "step": 2430, "token_acc": 0.9666012515273817, "train_speed(iter/s)": 0.113204 }, { "epoch": 2.6016564253272776, "grad_norm": 0.7987903356552124, "learning_rate": 4.232077005294638e-07, "loss": 0.09288793802261353, "memory(GiB)": 31.37, "step": 2435, "token_acc": 0.9625309550299681, "train_speed(iter/s)": 0.11326 }, { "epoch": 2.6069997328346246, "grad_norm": 0.7318440675735474, "learning_rate": 4.120056321685101e-07, "loss": 0.09065854549407959, "memory(GiB)": 31.37, "step": 2440, "token_acc": 0.9711286089238845, "train_speed(iter/s)": 0.113321 }, { "epoch": 2.6069997328346246, "eval_loss": 0.19047002494335175, "eval_runtime": 39.1424, "eval_samples_per_second": 15.431, "eval_steps_per_second": 3.858, "eval_token_acc": 0.939428252804049, "step": 2440 }, { "epoch": 2.6123430403419716, "grad_norm": 0.7801464200019836, "learning_rate": 4.009474422620269e-07, "loss": 0.09015793800354004, "memory(GiB)": 31.37, "step": 2445, "token_acc": 0.9462027912208955, "train_speed(iter/s)": 0.113137 }, { "epoch": 2.6176863478493186, "grad_norm": 0.7523171305656433, "learning_rate": 3.900334775923237e-07, "loss": 0.08972238302230835, "memory(GiB)": 31.37, "step": 2450, "token_acc": 0.964319157867545, "train_speed(iter/s)": 0.113199 }, { "epoch": 2.6230296553566657, "grad_norm": 0.7075212001800537, "learning_rate": 3.7926408041883355e-07, "loss": 0.09695062637329102, "memory(GiB)": 31.37, "step": 2455, "token_acc": 0.9670872765509989, "train_speed(iter/s)": 0.113263 }, { "epoch": 2.6283729628640127, "grad_norm": 0.7174405455589294, "learning_rate": 3.6863958846739213e-07, "loss": 0.09820109009742736, "memory(GiB)": 31.37, "step": 2460, "token_acc": 0.9666613545816733, "train_speed(iter/s)": 0.113326 }, { "epoch": 2.6283729628640127, "eval_loss": 0.1904931664466858, "eval_runtime": 38.991, "eval_samples_per_second": 15.491, "eval_steps_per_second": 3.873, "eval_token_acc": 0.9393724935126209, "step": 2460 }, { "epoch": 2.6337162703713597, "grad_norm": 0.7205042243003845, "learning_rate": 3.581603349196372e-07, "loss": 0.09016447067260742, "memory(GiB)": 31.37, "step": 2465, "token_acc": 0.9470618527207186, "train_speed(iter/s)": 0.113146 }, { "epoch": 2.639059577878707, "grad_norm": 0.7430869340896606, "learning_rate": 3.4782664840256387e-07, "loss": 0.09322860240936279, "memory(GiB)": 31.37, "step": 2470, "token_acc": 0.9666258078894584, "train_speed(iter/s)": 0.113215 }, { "epoch": 2.6444028853860537, "grad_norm": 0.7449864149093628, "learning_rate": 3.3763885297822153e-07, "loss": 0.09292680621147156, "memory(GiB)": 31.37, "step": 2475, "token_acc": 0.965542892849704, "train_speed(iter/s)": 0.113277 }, { "epoch": 2.649746192893401, "grad_norm": 0.6502243280410767, "learning_rate": 3.275972681335421e-07, "loss": 0.09386556148529053, "memory(GiB)": 31.37, "step": 2480, "token_acc": 0.9635287435353204, "train_speed(iter/s)": 0.113346 }, { "epoch": 2.649746192893401, "eval_loss": 0.19051562249660492, "eval_runtime": 38.9188, "eval_samples_per_second": 15.519, "eval_steps_per_second": 3.88, "eval_token_acc": 0.9393296017499839, "step": 2480 }, { "epoch": 2.655089500400748, "grad_norm": 0.6717422604560852, "learning_rate": 3.1770220877033243e-07, "loss": 0.08921995162963867, "memory(GiB)": 31.37, "step": 2485, "token_acc": 0.9476881346660132, "train_speed(iter/s)": 0.113146 }, { "epoch": 2.660432807908095, "grad_norm": 0.7039415240287781, "learning_rate": 3.0795398519539113e-07, "loss": 0.08925142884254456, "memory(GiB)": 31.37, "step": 2490, "token_acc": 0.971012390099527, "train_speed(iter/s)": 0.113211 }, { "epoch": 2.6657761154154422, "grad_norm": 0.7549923062324524, "learning_rate": 2.9835290311078123e-07, "loss": 0.09008611440658569, "memory(GiB)": 31.37, "step": 2495, "token_acc": 0.9727889176682867, "train_speed(iter/s)": 0.113272 }, { "epoch": 2.6711194229227893, "grad_norm": 0.6972574591636658, "learning_rate": 2.888992636042437e-07, "loss": 0.08815158009529114, "memory(GiB)": 31.37, "step": 2500, "token_acc": 0.9709897080314667, "train_speed(iter/s)": 0.113333 }, { "epoch": 2.6711194229227893, "eval_loss": 0.1904427856206894, "eval_runtime": 38.99, "eval_samples_per_second": 15.491, "eval_steps_per_second": 3.873, "eval_token_acc": 0.9394454095091038, "step": 2500 }, { "epoch": 2.6764627304301363, "grad_norm": 0.7152836322784424, "learning_rate": 2.7959336313974847e-07, "loss": 0.09650709629058837, "memory(GiB)": 31.37, "step": 2505, "token_acc": 0.9477473770829048, "train_speed(iter/s)": 0.113148 }, { "epoch": 2.6818060379374833, "grad_norm": 0.6270228028297424, "learning_rate": 2.704354935482095e-07, "loss": 0.08850882053375245, "memory(GiB)": 31.37, "step": 2510, "token_acc": 0.9648082862758556, "train_speed(iter/s)": 0.113203 }, { "epoch": 2.6871493454448303, "grad_norm": 0.7035424709320068, "learning_rate": 2.6142594201832183e-07, "loss": 0.10063533782958985, "memory(GiB)": 31.37, "step": 2515, "token_acc": 0.9673570595099183, "train_speed(iter/s)": 0.113268 }, { "epoch": 2.6924926529521773, "grad_norm": 0.6758410334587097, "learning_rate": 2.525649910875627e-07, "loss": 0.08648205399513245, "memory(GiB)": 31.37, "step": 2520, "token_acc": 0.970028145397928, "train_speed(iter/s)": 0.113337 }, { "epoch": 2.6924926529521773, "eval_loss": 0.19026722013950348, "eval_runtime": 39.0306, "eval_samples_per_second": 15.475, "eval_steps_per_second": 3.869, "eval_token_acc": 0.9392909991636106, "step": 2520 }, { "epoch": 2.6978359604595243, "grad_norm": 0.7509530186653137, "learning_rate": 2.438529186333288e-07, "loss": 0.09757347106933593, "memory(GiB)": 31.37, "step": 2525, "token_acc": 0.9475406925782507, "train_speed(iter/s)": 0.11316 }, { "epoch": 2.7031792679668714, "grad_norm": 0.7634799480438232, "learning_rate": 2.3528999786421758e-07, "loss": 0.09107044339179993, "memory(GiB)": 31.37, "step": 2530, "token_acc": 0.969339403512032, "train_speed(iter/s)": 0.113213 }, { "epoch": 2.708522575474219, "grad_norm": 0.6541688442230225, "learning_rate": 2.2687649731146844e-07, "loss": 0.0989515721797943, "memory(GiB)": 31.37, "step": 2535, "token_acc": 0.9692385434102896, "train_speed(iter/s)": 0.113272 }, { "epoch": 2.7138658829815654, "grad_norm": 0.7726590037345886, "learning_rate": 2.1861268082053466e-07, "loss": 0.09174089431762696, "memory(GiB)": 31.37, "step": 2540, "token_acc": 0.963183540877098, "train_speed(iter/s)": 0.113328 }, { "epoch": 2.7138658829815654, "eval_loss": 0.1902547925710678, "eval_runtime": 39.0546, "eval_samples_per_second": 15.466, "eval_steps_per_second": 3.866, "eval_token_acc": 0.9391923481095455, "step": 2540 }, { "epoch": 2.719209190488913, "grad_norm": 0.7492479681968689, "learning_rate": 2.104988075428127e-07, "loss": 0.10171656608581543, "memory(GiB)": 31.37, "step": 2545, "token_acc": 0.947412690936007, "train_speed(iter/s)": 0.113141 }, { "epoch": 2.7245524979962594, "grad_norm": 0.7487272620201111, "learning_rate": 2.0253513192751374e-07, "loss": 0.10030395984649658, "memory(GiB)": 31.37, "step": 2550, "token_acc": 0.9660484246469037, "train_speed(iter/s)": 0.113213 }, { "epoch": 2.729895805503607, "grad_norm": 0.7023541927337646, "learning_rate": 1.947219037136827e-07, "loss": 0.09708930253982544, "memory(GiB)": 31.37, "step": 2555, "token_acc": 0.9649252477742315, "train_speed(iter/s)": 0.113282 }, { "epoch": 2.735239113010954, "grad_norm": 0.7202330231666565, "learning_rate": 1.8705936792237255e-07, "loss": 0.08675633072853088, "memory(GiB)": 31.37, "step": 2560, "token_acc": 0.9641549104720565, "train_speed(iter/s)": 0.113325 }, { "epoch": 2.735239113010954, "eval_loss": 0.1902298927307129, "eval_runtime": 39.1479, "eval_samples_per_second": 15.429, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9393167342211928, "step": 2560 }, { "epoch": 2.740582420518301, "grad_norm": 0.6873902082443237, "learning_rate": 1.7954776484895188e-07, "loss": 0.0941142499446869, "memory(GiB)": 31.37, "step": 2565, "token_acc": 0.947600952293746, "train_speed(iter/s)": 0.113141 }, { "epoch": 2.745925728025648, "grad_norm": 0.7027167081832886, "learning_rate": 1.7218733005557707e-07, "loss": 0.08929444551467895, "memory(GiB)": 31.37, "step": 2570, "token_acc": 0.9652069391202651, "train_speed(iter/s)": 0.1132 }, { "epoch": 2.751269035532995, "grad_norm": 0.7328935265541077, "learning_rate": 1.6497829436380009e-07, "loss": 0.09518647193908691, "memory(GiB)": 31.37, "step": 2575, "token_acc": 0.9699323199822478, "train_speed(iter/s)": 0.11326 }, { "epoch": 2.756612343040342, "grad_norm": 0.7890388369560242, "learning_rate": 1.5792088384733174e-07, "loss": 0.09950923323631286, "memory(GiB)": 31.37, "step": 2580, "token_acc": 0.962556860615113, "train_speed(iter/s)": 0.113323 }, { "epoch": 2.756612343040342, "eval_loss": 0.19023241102695465, "eval_runtime": 39.1648, "eval_samples_per_second": 15.422, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9394196744515216, "step": 2580 }, { "epoch": 2.761955650547689, "grad_norm": 0.7428569793701172, "learning_rate": 1.510153198249531e-07, "loss": 0.08879505395889283, "memory(GiB)": 31.37, "step": 2585, "token_acc": 0.9496286055977968, "train_speed(iter/s)": 0.113151 }, { "epoch": 2.767298958055036, "grad_norm": 0.7776227593421936, "learning_rate": 1.4426181885357215e-07, "loss": 0.09481008052825927, "memory(GiB)": 31.37, "step": 2590, "token_acc": 0.9610127994545511, "train_speed(iter/s)": 0.113206 }, { "epoch": 2.772642265562383, "grad_norm": 0.6838895082473755, "learning_rate": 1.376605927214364e-07, "loss": 0.0857117772102356, "memory(GiB)": 31.37, "step": 2595, "token_acc": 0.9700140999530001, "train_speed(iter/s)": 0.113255 }, { "epoch": 2.77798557306973, "grad_norm": 0.911618173122406, "learning_rate": 1.312118484414876e-07, "loss": 0.09858800768852234, "memory(GiB)": 31.37, "step": 2600, "token_acc": 0.9638269804901757, "train_speed(iter/s)": 0.113319 }, { "epoch": 2.77798557306973, "eval_loss": 0.1901472806930542, "eval_runtime": 39.1574, "eval_samples_per_second": 15.425, "eval_steps_per_second": 3.856, "eval_token_acc": 0.9393510476313024, "step": 2600 }, { "epoch": 2.783328880577077, "grad_norm": 0.7218348979949951, "learning_rate": 1.2491578824487204e-07, "loss": 0.09123666286468506, "memory(GiB)": 31.37, "step": 2605, "token_acc": 0.9471299437601038, "train_speed(iter/s)": 0.113146 }, { "epoch": 2.7886721880844245, "grad_norm": 0.7690821290016174, "learning_rate": 1.1877260957459835e-07, "loss": 0.09849429726600648, "memory(GiB)": 31.37, "step": 2610, "token_acc": 0.9662001494666758, "train_speed(iter/s)": 0.113204 }, { "epoch": 2.794015495591771, "grad_norm": 0.700734555721283, "learning_rate": 1.1278250507934518e-07, "loss": 0.09033372402191162, "memory(GiB)": 31.37, "step": 2615, "token_acc": 0.967641649881776, "train_speed(iter/s)": 0.113271 }, { "epoch": 2.7993588030991186, "grad_norm": 0.7667319178581238, "learning_rate": 1.0694566260742001e-07, "loss": 0.08968676328659057, "memory(GiB)": 31.37, "step": 2620, "token_acc": 0.9692407567701249, "train_speed(iter/s)": 0.11334 }, { "epoch": 2.7993588030991186, "eval_loss": 0.19011949002742767, "eval_runtime": 39.1765, "eval_samples_per_second": 15.417, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9393338909262476, "step": 2620 }, { "epoch": 2.804702110606465, "grad_norm": 0.6720722913742065, "learning_rate": 1.0126226520086823e-07, "loss": 0.09349075555801392, "memory(GiB)": 31.37, "step": 2625, "token_acc": 0.9490298548952021, "train_speed(iter/s)": 0.113158 }, { "epoch": 2.8100454181138126, "grad_norm": 0.7081384658813477, "learning_rate": 9.573249108973281e-08, "loss": 0.09482257962226867, "memory(GiB)": 31.37, "step": 2630, "token_acc": 0.9653012204622176, "train_speed(iter/s)": 0.113226 }, { "epoch": 2.8153887256211596, "grad_norm": 0.8206889629364014, "learning_rate": 9.035651368646647e-08, "loss": 0.08651464581489562, "memory(GiB)": 31.37, "step": 2635, "token_acc": 0.965938566552901, "train_speed(iter/s)": 0.113281 }, { "epoch": 2.8207320331285066, "grad_norm": 0.7286149263381958, "learning_rate": 8.513450158049109e-08, "loss": 0.09981081485748292, "memory(GiB)": 31.37, "step": 2640, "token_acc": 0.9647688936806594, "train_speed(iter/s)": 0.113351 }, { "epoch": 2.8207320331285066, "eval_loss": 0.19009515643119812, "eval_runtime": 39.1536, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9394239636277852, "step": 2640 }, { "epoch": 2.8260753406358536, "grad_norm": 0.6493250131607056, "learning_rate": 8.006661853291298e-08, "loss": 0.09253804683685303, "memory(GiB)": 31.37, "step": 2645, "token_acc": 0.9492279023135687, "train_speed(iter/s)": 0.113182 }, { "epoch": 2.8314186481432007, "grad_norm": 0.7164784669876099, "learning_rate": 7.515302347138486e-08, "loss": 0.10055129528045655, "memory(GiB)": 31.37, "step": 2650, "token_acc": 0.9693806782717248, "train_speed(iter/s)": 0.113245 }, { "epoch": 2.8367619556505477, "grad_norm": 0.7498785853385925, "learning_rate": 7.03938704851248e-08, "loss": 0.09636443257331848, "memory(GiB)": 31.37, "step": 2655, "token_acc": 0.9618243584137769, "train_speed(iter/s)": 0.113298 }, { "epoch": 2.8421052631578947, "grad_norm": 0.7123965620994568, "learning_rate": 6.578930882008283e-08, "loss": 0.09678665399551392, "memory(GiB)": 31.37, "step": 2660, "token_acc": 0.9638537936625904, "train_speed(iter/s)": 0.113365 }, { "epoch": 2.8421052631578947, "eval_loss": 0.18998001515865326, "eval_runtime": 39.1365, "eval_samples_per_second": 15.433, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9393296017499839, "step": 2660 }, { "epoch": 2.8474485706652417, "grad_norm": 0.7025579214096069, "learning_rate": 6.133948287426028e-08, "loss": 0.09522255659103393, "memory(GiB)": 31.37, "step": 2665, "token_acc": 0.9474585290238993, "train_speed(iter/s)": 0.113185 }, { "epoch": 2.8527918781725887, "grad_norm": 0.7414459586143494, "learning_rate": 5.704453219318118e-08, "loss": 0.09207627773284913, "memory(GiB)": 31.37, "step": 2670, "token_acc": 0.9669233407114132, "train_speed(iter/s)": 0.113245 }, { "epoch": 2.8581351856799357, "grad_norm": 0.6940200328826904, "learning_rate": 5.2904591465516855e-08, "loss": 0.0982110857963562, "memory(GiB)": 31.37, "step": 2675, "token_acc": 0.9648942786069652, "train_speed(iter/s)": 0.113311 }, { "epoch": 2.8634784931872828, "grad_norm": 0.5920035243034363, "learning_rate": 4.891979051886153e-08, "loss": 0.08439633250236511, "memory(GiB)": 31.37, "step": 2680, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.113369 }, { "epoch": 2.8634784931872828, "eval_loss": 0.19001102447509766, "eval_runtime": 39.056, "eval_samples_per_second": 15.465, "eval_steps_per_second": 3.866, "eval_token_acc": 0.9394068069227305, "step": 2680 }, { "epoch": 2.86882180069463, "grad_norm": 0.7572726607322693, "learning_rate": 4.509025431566283e-08, "loss": 0.09407066106796265, "memory(GiB)": 31.37, "step": 2685, "token_acc": 0.9469456798144891, "train_speed(iter/s)": 0.113207 }, { "epoch": 2.874165108201977, "grad_norm": 0.7466961741447449, "learning_rate": 4.141610294930043e-08, "loss": 0.08772618174552918, "memory(GiB)": 31.37, "step": 2690, "token_acc": 0.9621488738657316, "train_speed(iter/s)": 0.113262 }, { "epoch": 2.8795084157093243, "grad_norm": 0.7221233248710632, "learning_rate": 3.7897451640321326e-08, "loss": 0.09155750274658203, "memory(GiB)": 31.37, "step": 2695, "token_acc": 0.9711238770396626, "train_speed(iter/s)": 0.113319 }, { "epoch": 2.8848517232166713, "grad_norm": 0.690424919128418, "learning_rate": 3.4534410732825485e-08, "loss": 0.09065448045730591, "memory(GiB)": 31.37, "step": 2700, "token_acc": 0.9694952336302547, "train_speed(iter/s)": 0.113377 }, { "epoch": 2.8848517232166713, "eval_loss": 0.18985731899738312, "eval_runtime": 39.1204, "eval_samples_per_second": 15.44, "eval_steps_per_second": 3.86, "eval_token_acc": 0.939342469278775, "step": 2700 }, { "epoch": 2.8901950307240183, "grad_norm": 0.7742385864257812, "learning_rate": 3.1327085691006954e-08, "loss": 0.09011354446411132, "memory(GiB)": 31.37, "step": 2705, "token_acc": 0.9484952623546592, "train_speed(iter/s)": 0.113201 }, { "epoch": 2.8955383382313653, "grad_norm": 0.67616868019104, "learning_rate": 2.8275577095846495e-08, "loss": 0.08765259981155396, "memory(GiB)": 31.37, "step": 2710, "token_acc": 0.9678389470704784, "train_speed(iter/s)": 0.113251 }, { "epoch": 2.9008816457387123, "grad_norm": 0.6910288333892822, "learning_rate": 2.5379980641955792e-08, "loss": 0.09436768293380737, "memory(GiB)": 31.37, "step": 2715, "token_acc": 0.9657377798081316, "train_speed(iter/s)": 0.113309 }, { "epoch": 2.9062249532460593, "grad_norm": 0.7219937443733215, "learning_rate": 2.264038713457706e-08, "loss": 0.09317046403884888, "memory(GiB)": 31.37, "step": 2720, "token_acc": 0.9689827817804372, "train_speed(iter/s)": 0.113372 }, { "epoch": 2.9062249532460593, "eval_loss": 0.18988655507564545, "eval_runtime": 39.1328, "eval_samples_per_second": 15.435, "eval_steps_per_second": 3.859, "eval_token_acc": 0.9394668553904223, "step": 2720 }, { "epoch": 2.9115682607534064, "grad_norm": 0.7260240316390991, "learning_rate": 2.0056882486736982e-08, "loss": 0.09241507053375245, "memory(GiB)": 31.37, "step": 2725, "token_acc": 0.9490058931065415, "train_speed(iter/s)": 0.113201 }, { "epoch": 2.9169115682607534, "grad_norm": 0.6595222353935242, "learning_rate": 1.762954771655001e-08, "loss": 0.08784698247909546, "memory(GiB)": 31.37, "step": 2730, "token_acc": 0.967156078213068, "train_speed(iter/s)": 0.11326 }, { "epoch": 2.9222548757681004, "grad_norm": 0.7033255100250244, "learning_rate": 1.5358458944680356e-08, "loss": 0.08944010734558105, "memory(GiB)": 31.37, "step": 2735, "token_acc": 0.9681864301377934, "train_speed(iter/s)": 0.113312 }, { "epoch": 2.9275981832754474, "grad_norm": 0.684511661529541, "learning_rate": 1.3243687391952809e-08, "loss": 0.08554937839508056, "memory(GiB)": 31.37, "step": 2740, "token_acc": 0.9701811147059805, "train_speed(iter/s)": 0.11336 }, { "epoch": 2.9275981832754474, "eval_loss": 0.18995320796966553, "eval_runtime": 39.1422, "eval_samples_per_second": 15.431, "eval_steps_per_second": 3.858, "eval_token_acc": 0.9393767826888846, "step": 2740 }, { "epoch": 2.9329414907827944, "grad_norm": 0.6943471431732178, "learning_rate": 1.1285299377118974e-08, "loss": 0.08585541248321533, "memory(GiB)": 31.37, "step": 2745, "token_acc": 0.9478389939459105, "train_speed(iter/s)": 0.113182 }, { "epoch": 2.9382847982901414, "grad_norm": 0.778913676738739, "learning_rate": 9.48335631477948e-09, "loss": 0.09440468549728394, "memory(GiB)": 31.37, "step": 2750, "token_acc": 0.9696560591449694, "train_speed(iter/s)": 0.113235 }, { "epoch": 2.9436281057974885, "grad_norm": 0.6813073754310608, "learning_rate": 7.837914713457184e-09, "loss": 0.09091969132423401, "memory(GiB)": 31.37, "step": 2755, "token_acc": 0.9660938225731538, "train_speed(iter/s)": 0.113288 }, { "epoch": 2.948971413304836, "grad_norm": 0.8302053213119507, "learning_rate": 6.349026173824713e-09, "loss": 0.09277503490447998, "memory(GiB)": 31.37, "step": 2760, "token_acc": 0.9680151152198118, "train_speed(iter/s)": 0.113333 }, { "epoch": 2.948971413304836, "eval_loss": 0.1899597942829132, "eval_runtime": 39.1837, "eval_samples_per_second": 15.415, "eval_steps_per_second": 3.854, "eval_token_acc": 0.9393553368075661, "step": 2760 }, { "epoch": 2.9543147208121825, "grad_norm": 0.7870346307754517, "learning_rate": 5.016737387085191e-09, "loss": 0.09893054962158203, "memory(GiB)": 31.37, "step": 2765, "token_acc": 0.946137875713679, "train_speed(iter/s)": 0.113161 }, { "epoch": 2.95965802831953, "grad_norm": 0.7045755386352539, "learning_rate": 3.841090133511749e-09, "loss": 0.08883514404296874, "memory(GiB)": 31.37, "step": 2770, "token_acc": 0.9683565527543212, "train_speed(iter/s)": 0.113206 }, { "epoch": 2.965001335826877, "grad_norm": 0.6870289444923401, "learning_rate": 2.8221212811324616e-09, "loss": 0.09689734578132629, "memory(GiB)": 31.37, "step": 2775, "token_acc": 0.9711121335611936, "train_speed(iter/s)": 0.113254 }, { "epoch": 2.970344643334224, "grad_norm": 0.7232000827789307, "learning_rate": 1.959862784577937e-09, "loss": 0.09915404319763184, "memory(GiB)": 31.37, "step": 2780, "token_acc": 0.9624814043439452, "train_speed(iter/s)": 0.113315 }, { "epoch": 2.970344643334224, "eval_loss": 0.1898777186870575, "eval_runtime": 39.1537, "eval_samples_per_second": 15.426, "eval_steps_per_second": 3.857, "eval_token_acc": 0.9394239636277852, "step": 2780 }, { "epoch": 2.975687950841571, "grad_norm": 0.6913635730743408, "learning_rate": 1.2543416840771206e-09, "loss": 0.09539123177528382, "memory(GiB)": 31.37, "step": 2785, "token_acc": 0.9471013034632325, "train_speed(iter/s)": 0.113152 }, { "epoch": 2.981031258348918, "grad_norm": 0.704073429107666, "learning_rate": 7.055801046113031e-10, "loss": 0.08701257705688477, "memory(GiB)": 31.37, "step": 2790, "token_acc": 0.969090176051606, "train_speed(iter/s)": 0.113208 }, { "epoch": 2.986374565856265, "grad_norm": 0.7093097567558289, "learning_rate": 3.1359525521801326e-10, "loss": 0.08475543856620789, "memory(GiB)": 31.37, "step": 2795, "token_acc": 0.9701114312493491, "train_speed(iter/s)": 0.113264 }, { "epoch": 2.991717873363612, "grad_norm": 0.6521144509315491, "learning_rate": 7.839942845144777e-11, "loss": 0.09691762924194336, "memory(GiB)": 31.37, "step": 2800, "token_acc": 0.9690315134805886, "train_speed(iter/s)": 0.113325 }, { "epoch": 2.991717873363612, "eval_loss": 0.19001290202140808, "eval_runtime": 39.1732, "eval_samples_per_second": 15.419, "eval_steps_per_second": 3.855, "eval_token_acc": 0.939398228570203, "step": 2800 }, { "epoch": 2.997061180870959, "grad_norm": 0.6936495900154114, "learning_rate": 0.0, "loss": 0.09210923910140992, "memory(GiB)": 31.37, "step": 2805, "token_acc": 0.947245220359309, "train_speed(iter/s)": 0.11316 }, { "epoch": 2.997061180870959, "eval_loss": 0.18997597694396973, "eval_runtime": 38.9833, "eval_samples_per_second": 15.494, "eval_steps_per_second": 3.873, "eval_token_acc": 0.9393338909262476, "step": 2805 } ], "logging_steps": 5, "max_steps": 2805, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1732509215136154e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }