{ "best_global_step": 480, "best_metric": 0.2286583, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v47-20250505-200714/checkpoint-480", "epoch": 2.9908544551868306, "eval_steps": 20, "global_step": 717, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004180820486020381, "grad_norm": 2.620922088623047, "learning_rate": 9.999952004474853e-06, "loss": 0.34187427163124084, "memory(GiB)": 29.13, "step": 1, "token_acc": 0.9049815498154982, "train_speed(iter/s)": 0.073614 }, { "epoch": 0.020904102430101906, "grad_norm": 1.5743770599365234, "learning_rate": 9.998800157942083e-06, "loss": 0.2900405824184418, "memory(GiB)": 29.13, "step": 5, "token_acc": 0.8957104981995172, "train_speed(iter/s)": 0.151098 }, { "epoch": 0.04180820486020381, "grad_norm": 0.8994374871253967, "learning_rate": 9.995201207616718e-06, "loss": 0.2705298900604248, "memory(GiB)": 29.14, "step": 10, "token_acc": 0.9024558145491803, "train_speed(iter/s)": 0.180178 }, { "epoch": 0.06271230729030572, "grad_norm": 0.6387772560119629, "learning_rate": 9.98920487629269e-06, "loss": 0.24947943687438964, "memory(GiB)": 32.63, "step": 15, "token_acc": 0.9174802221848174, "train_speed(iter/s)": 0.188578 }, { "epoch": 0.08361640972040763, "grad_norm": 0.6275189518928528, "learning_rate": 9.980814041830203e-06, "loss": 0.25027036666870117, "memory(GiB)": 34.7, "step": 20, "token_acc": 0.9166679292737989, "train_speed(iter/s)": 0.191459 }, { "epoch": 0.08361640972040763, "eval_loss": 0.28380751609802246, "eval_runtime": 6.7011, "eval_samples_per_second": 22.981, "eval_steps_per_second": 5.82, "eval_token_acc": 0.9153551646138229, "step": 20 }, { "epoch": 0.10452051215050953, "grad_norm": 0.6861765384674072, "learning_rate": 9.970032731299697e-06, "loss": 0.24285974502563476, "memory(GiB)": 34.7, "step": 25, "token_acc": 0.9173268332317692, "train_speed(iter/s)": 0.170029 }, { "epoch": 0.12542461458061144, "grad_norm": 0.6327997446060181, "learning_rate": 9.956866119049095e-06, "loss": 0.2500872850418091, "memory(GiB)": 34.7, "step": 30, "token_acc": 0.9172865583004255, "train_speed(iter/s)": 0.175321 }, { "epoch": 0.14632871701071334, "grad_norm": 0.6613020300865173, "learning_rate": 9.941320524220455e-06, "loss": 0.24076151847839355, "memory(GiB)": 34.7, "step": 35, "token_acc": 0.9165492852219258, "train_speed(iter/s)": 0.179249 }, { "epoch": 0.16723281944081525, "grad_norm": 0.5987035036087036, "learning_rate": 9.92340340771717e-06, "loss": 0.23788681030273437, "memory(GiB)": 37.02, "step": 40, "token_acc": 0.9233054502142574, "train_speed(iter/s)": 0.182322 }, { "epoch": 0.16723281944081525, "eval_loss": 0.2673507630825043, "eval_runtime": 6.6176, "eval_samples_per_second": 23.271, "eval_steps_per_second": 5.893, "eval_token_acc": 0.9202628066277145, "step": 40 }, { "epoch": 0.18813692187091716, "grad_norm": 0.5674816370010376, "learning_rate": 9.903123368623216e-06, "loss": 0.2267207145690918, "memory(GiB)": 37.02, "step": 45, "token_acc": 0.9245033510654285, "train_speed(iter/s)": 0.172104 }, { "epoch": 0.20904102430101906, "grad_norm": 0.5934615731239319, "learning_rate": 9.88049014007613e-06, "loss": 0.23341102600097657, "memory(GiB)": 37.02, "step": 50, "token_acc": 0.9186244567132621, "train_speed(iter/s)": 0.175846 }, { "epoch": 0.22994512673112097, "grad_norm": 0.6319158673286438, "learning_rate": 9.855514584595719e-06, "loss": 0.24078943729400634, "memory(GiB)": 37.02, "step": 55, "token_acc": 0.9141281922363426, "train_speed(iter/s)": 0.17881 }, { "epoch": 0.2508492291612229, "grad_norm": 0.6614541411399841, "learning_rate": 9.828208688870736e-06, "loss": 0.2285898208618164, "memory(GiB)": 37.02, "step": 60, "token_acc": 0.919986967045243, "train_speed(iter/s)": 0.181102 }, { "epoch": 0.2508492291612229, "eval_loss": 0.2608849108219147, "eval_runtime": 6.6441, "eval_samples_per_second": 23.178, "eval_steps_per_second": 5.87, "eval_token_acc": 0.9218693799905684, "step": 60 }, { "epoch": 0.2717533315913248, "grad_norm": 0.6391910314559937, "learning_rate": 9.79858555800603e-06, "loss": 0.23735857009887695, "memory(GiB)": 37.02, "step": 65, "token_acc": 0.9195915366053212, "train_speed(iter/s)": 0.173657 }, { "epoch": 0.2926574340214267, "grad_norm": 0.6433473825454712, "learning_rate": 9.766659409232918e-06, "loss": 0.22774579524993896, "memory(GiB)": 37.02, "step": 70, "token_acc": 0.9207077029819454, "train_speed(iter/s)": 0.17624 }, { "epoch": 0.3135615364515286, "grad_norm": 0.5562565326690674, "learning_rate": 9.732445565085823e-06, "loss": 0.22526907920837402, "memory(GiB)": 37.02, "step": 75, "token_acc": 0.9271212368745452, "train_speed(iter/s)": 0.177663 }, { "epoch": 0.3344656388816305, "grad_norm": 0.7328993678092957, "learning_rate": 9.69596044604841e-06, "loss": 0.23299379348754884, "memory(GiB)": 37.02, "step": 80, "token_acc": 0.917067597792331, "train_speed(iter/s)": 0.180448 }, { "epoch": 0.3344656388816305, "eval_loss": 0.25337016582489014, "eval_runtime": 6.6465, "eval_samples_per_second": 23.17, "eval_steps_per_second": 5.868, "eval_token_acc": 0.9238915842731654, "step": 80 }, { "epoch": 0.35536974131173243, "grad_norm": 0.6519869565963745, "learning_rate": 9.657221562672803e-06, "loss": 0.22955694198608398, "memory(GiB)": 37.02, "step": 85, "token_acc": 0.9178025442310218, "train_speed(iter/s)": 0.174696 }, { "epoch": 0.3762738437418343, "grad_norm": 0.5939056277275085, "learning_rate": 9.616247507175624e-06, "loss": 0.22620651721954346, "memory(GiB)": 37.02, "step": 90, "token_acc": 0.9214102974096579, "train_speed(iter/s)": 0.176915 }, { "epoch": 0.39717794617193625, "grad_norm": 0.6213501691818237, "learning_rate": 9.573057944514897e-06, "loss": 0.2274672269821167, "memory(GiB)": 37.02, "step": 95, "token_acc": 0.9245137714423773, "train_speed(iter/s)": 0.178294 }, { "epoch": 0.4180820486020381, "grad_norm": 0.6670387387275696, "learning_rate": 9.527673602952123e-06, "loss": 0.23140230178833007, "memory(GiB)": 37.02, "step": 100, "token_acc": 0.9132270680440643, "train_speed(iter/s)": 0.179262 }, { "epoch": 0.4180820486020381, "eval_loss": 0.25054118037223816, "eval_runtime": 6.6295, "eval_samples_per_second": 23.23, "eval_steps_per_second": 5.883, "eval_token_acc": 0.9250425622047622, "step": 100 }, { "epoch": 0.43898615103214006, "grad_norm": 0.6463162899017334, "learning_rate": 9.48011626410401e-06, "loss": 0.22332818508148194, "memory(GiB)": 37.02, "step": 105, "token_acc": 0.9278207538447314, "train_speed(iter/s)": 0.174957 }, { "epoch": 0.45989025346224194, "grad_norm": 0.5779640674591064, "learning_rate": 9.430408752488687e-06, "loss": 0.2219472885131836, "memory(GiB)": 37.02, "step": 110, "token_acc": 0.9219851633866716, "train_speed(iter/s)": 0.176134 }, { "epoch": 0.48079435589234387, "grad_norm": 0.6310390830039978, "learning_rate": 9.378574924571362e-06, "loss": 0.22848432064056395, "memory(GiB)": 37.02, "step": 115, "token_acc": 0.9266341377642974, "train_speed(iter/s)": 0.177157 }, { "epoch": 0.5016984583224458, "grad_norm": 0.5928042531013489, "learning_rate": 9.324639657314742e-06, "loss": 0.23035595417022706, "memory(GiB)": 37.02, "step": 120, "token_acc": 0.9220481773335896, "train_speed(iter/s)": 0.178639 }, { "epoch": 0.5016984583224458, "eval_loss": 0.24868212640285492, "eval_runtime": 6.6274, "eval_samples_per_second": 23.237, "eval_steps_per_second": 5.885, "eval_token_acc": 0.9254182286129916, "step": 120 }, { "epoch": 0.5226025607525477, "grad_norm": 0.5700287222862244, "learning_rate": 9.268628836239646e-06, "loss": 0.22175629138946534, "memory(GiB)": 37.02, "step": 125, "token_acc": 0.9286114910407667, "train_speed(iter/s)": 0.175138 }, { "epoch": 0.5435066631826496, "grad_norm": 0.6334052681922913, "learning_rate": 9.21056934300161e-06, "loss": 0.2261972188949585, "memory(GiB)": 37.02, "step": 130, "token_acc": 0.9182782377470237, "train_speed(iter/s)": 0.176545 }, { "epoch": 0.5644107656127515, "grad_norm": 0.6031065583229065, "learning_rate": 9.150489042489368e-06, "loss": 0.2102799892425537, "memory(GiB)": 37.02, "step": 135, "token_acc": 0.9315734630079889, "train_speed(iter/s)": 0.177648 }, { "epoch": 0.5853148680428534, "grad_norm": 0.593549907207489, "learning_rate": 9.088416769451485e-06, "loss": 0.21703071594238282, "memory(GiB)": 37.02, "step": 140, "token_acc": 0.9275403061439635, "train_speed(iter/s)": 0.178909 }, { "epoch": 0.5853148680428534, "eval_loss": 0.24534018337726593, "eval_runtime": 6.6365, "eval_samples_per_second": 23.205, "eval_steps_per_second": 5.877, "eval_token_acc": 0.9266331497630105, "step": 140 }, { "epoch": 0.6062189704729554, "grad_norm": 0.6485511660575867, "learning_rate": 9.02438231465749e-06, "loss": 0.2273317813873291, "memory(GiB)": 37.02, "step": 145, "token_acc": 0.9251848272032739, "train_speed(iter/s)": 0.175995 }, { "epoch": 0.6271230729030572, "grad_norm": 0.6085463166236877, "learning_rate": 8.958416410600188e-06, "loss": 0.22932357788085939, "memory(GiB)": 37.02, "step": 150, "token_acc": 0.9163140495867769, "train_speed(iter/s)": 0.177285 }, { "epoch": 0.6480271753331591, "grad_norm": 0.5881887078285217, "learning_rate": 8.890550716746013e-06, "loss": 0.2232443571090698, "memory(GiB)": 37.02, "step": 155, "token_acc": 0.920258257982589, "train_speed(iter/s)": 0.17828 }, { "epoch": 0.668931277763261, "grad_norm": 0.5788638591766357, "learning_rate": 8.820817804340471e-06, "loss": 0.20956034660339357, "memory(GiB)": 37.02, "step": 160, "token_acc": 0.9304798016625346, "train_speed(iter/s)": 0.179214 }, { "epoch": 0.668931277763261, "eval_loss": 0.24245667457580566, "eval_runtime": 6.5715, "eval_samples_per_second": 23.435, "eval_steps_per_second": 5.935, "eval_token_acc": 0.927040787780451, "step": 160 }, { "epoch": 0.689835380193363, "grad_norm": 0.6712965369224548, "learning_rate": 8.749251140776016e-06, "loss": 0.2295995235443115, "memory(GiB)": 37.02, "step": 165, "token_acc": 0.9216801551445813, "train_speed(iter/s)": 0.176683 }, { "epoch": 0.7107394826234649, "grad_norm": 0.590825617313385, "learning_rate": 8.675885073529802e-06, "loss": 0.21770024299621582, "memory(GiB)": 37.02, "step": 170, "token_acc": 0.9224579044445766, "train_speed(iter/s)": 0.177781 }, { "epoch": 0.7316435850535667, "grad_norm": 0.6172182559967041, "learning_rate": 8.600754813679072e-06, "loss": 0.21406757831573486, "memory(GiB)": 37.02, "step": 175, "token_acc": 0.9237590553351814, "train_speed(iter/s)": 0.178707 }, { "epoch": 0.7525476874836686, "grad_norm": 0.5739086866378784, "learning_rate": 8.52389641900206e-06, "loss": 0.20645816326141359, "memory(GiB)": 37.02, "step": 180, "token_acc": 0.926357608155873, "train_speed(iter/s)": 0.179338 }, { "epoch": 0.7525476874836686, "eval_loss": 0.24040701985359192, "eval_runtime": 6.6295, "eval_samples_per_second": 23.229, "eval_steps_per_second": 5.883, "eval_token_acc": 0.9270647664873592, "step": 180 }, { "epoch": 0.7734517899137706, "grad_norm": 0.5748027563095093, "learning_rate": 8.445346776672546e-06, "loss": 0.21495263576507567, "memory(GiB)": 37.02, "step": 185, "token_acc": 0.9272353992083938, "train_speed(iter/s)": 0.177187 }, { "epoch": 0.7943558923438725, "grad_norm": 0.6537684798240662, "learning_rate": 8.365143585556326e-06, "loss": 0.22100327014923096, "memory(GiB)": 37.02, "step": 190, "token_acc": 0.9197183562004408, "train_speed(iter/s)": 0.177962 }, { "epoch": 0.8152599947739744, "grad_norm": 0.6247681379318237, "learning_rate": 8.283325338118154e-06, "loss": 0.21339471340179444, "memory(GiB)": 37.02, "step": 195, "token_acc": 0.9236349420094696, "train_speed(iter/s)": 0.178683 }, { "epoch": 0.8361640972040763, "grad_norm": 0.6301010847091675, "learning_rate": 8.199931301947782e-06, "loss": 0.2191436767578125, "memory(GiB)": 37.02, "step": 200, "token_acc": 0.9171661150371232, "train_speed(iter/s)": 0.179195 }, { "epoch": 0.8361640972040763, "eval_loss": 0.2372375875711441, "eval_runtime": 6.6381, "eval_samples_per_second": 23.199, "eval_steps_per_second": 5.875, "eval_token_acc": 0.9280638792752036, "step": 200 }, { "epoch": 0.8570681996341782, "grad_norm": 0.5749327540397644, "learning_rate": 8.115001500914e-06, "loss": 0.21314010620117188, "memory(GiB)": 37.02, "step": 205, "token_acc": 0.9257877237051938, "train_speed(iter/s)": 0.176853 }, { "epoch": 0.8779723020642801, "grad_norm": 0.6665855050086975, "learning_rate": 8.028576695955711e-06, "loss": 0.20737431049346924, "memory(GiB)": 37.02, "step": 210, "token_acc": 0.9287075750122971, "train_speed(iter/s)": 0.177713 }, { "epoch": 0.898876404494382, "grad_norm": 0.5684797763824463, "learning_rate": 7.940698365519246e-06, "loss": 0.21522219181060792, "memory(GiB)": 37.02, "step": 215, "token_acc": 0.9235592334740796, "train_speed(iter/s)": 0.178152 }, { "epoch": 0.9197805069244839, "grad_norm": 0.6096109747886658, "learning_rate": 7.851408685651342e-06, "loss": 0.21898245811462402, "memory(GiB)": 37.02, "step": 220, "token_acc": 0.9227710824734293, "train_speed(iter/s)": 0.178957 }, { "epoch": 0.9197805069244839, "eval_loss": 0.23681528866291046, "eval_runtime": 6.6341, "eval_samples_per_second": 23.213, "eval_steps_per_second": 5.879, "eval_token_acc": 0.9285674321202771, "step": 220 }, { "epoch": 0.9406846093545859, "grad_norm": 0.55597323179245, "learning_rate": 7.7607505097573e-06, "loss": 0.21175863742828369, "memory(GiB)": 37.02, "step": 225, "token_acc": 0.9328776189582448, "train_speed(iter/s)": 0.17712 }, { "epoch": 0.9615887117846877, "grad_norm": 0.6526412963867188, "learning_rate": 7.668767348034044e-06, "loss": 0.21146607398986816, "memory(GiB)": 37.02, "step": 230, "token_acc": 0.920968304305235, "train_speed(iter/s)": 0.177568 }, { "epoch": 0.9824928142147896, "grad_norm": 0.5566428303718567, "learning_rate": 7.5755033465880024e-06, "loss": 0.21480951309204102, "memory(GiB)": 37.02, "step": 235, "token_acc": 0.9206400893147881, "train_speed(iter/s)": 0.178101 }, { "epoch": 1.0, "grad_norm": 0.6223005056381226, "learning_rate": 7.481003266247745e-06, "loss": 0.21818625926971436, "memory(GiB)": 37.02, "step": 240, "token_acc": 0.9244112717911505, "train_speed(iter/s)": 0.179289 }, { "epoch": 1.0, "eval_loss": 0.23499608039855957, "eval_runtime": 6.6316, "eval_samples_per_second": 23.222, "eval_steps_per_second": 5.881, "eval_token_acc": 0.9285194747064607, "step": 240 }, { "epoch": 1.020904102430102, "grad_norm": 0.5871774554252625, "learning_rate": 7.385312461081616e-06, "loss": 0.1686471700668335, "memory(GiB)": 37.02, "step": 245, "token_acc": 0.9376223265624841, "train_speed(iter/s)": 0.177674 }, { "epoch": 1.0418082048602038, "grad_norm": 0.6004992723464966, "learning_rate": 7.288476856630656e-06, "loss": 0.16282508373260499, "memory(GiB)": 37.02, "step": 250, "token_acc": 0.9393488548772574, "train_speed(iter/s)": 0.17831 }, { "epoch": 1.0627123072903057, "grad_norm": 0.6309487223625183, "learning_rate": 7.190542927867234e-06, "loss": 0.1637326955795288, "memory(GiB)": 37.02, "step": 255, "token_acc": 0.945661712279932, "train_speed(iter/s)": 0.178866 }, { "epoch": 1.0836164097204075, "grad_norm": 0.5814361572265625, "learning_rate": 7.091557676890001e-06, "loss": 0.15928541421890258, "memory(GiB)": 37.02, "step": 260, "token_acc": 0.9451221362463966, "train_speed(iter/s)": 0.179402 }, { "epoch": 1.0836164097204075, "eval_loss": 0.23942023515701294, "eval_runtime": 6.6351, "eval_samples_per_second": 23.21, "eval_steps_per_second": 5.878, "eval_token_acc": 0.9281597941028367, "step": 260 }, { "epoch": 1.1045205121505095, "grad_norm": 0.5660094022750854, "learning_rate": 6.991568610365851e-06, "loss": 0.15568481683731078, "memory(GiB)": 37.02, "step": 265, "token_acc": 0.9408398229323706, "train_speed(iter/s)": 0.177849 }, { "epoch": 1.1254246145806115, "grad_norm": 0.5609026551246643, "learning_rate": 6.890623716729724e-06, "loss": 0.16430522203445436, "memory(GiB)": 37.02, "step": 270, "token_acc": 0.9378496054183171, "train_speed(iter/s)": 0.178387 }, { "epoch": 1.1463287170107133, "grad_norm": 0.579684317111969, "learning_rate": 6.788771443153183e-06, "loss": 0.15910866260528564, "memory(GiB)": 37.02, "step": 275, "token_acc": 0.9425582560761714, "train_speed(iter/s)": 0.178707 }, { "epoch": 1.1672328194408153, "grad_norm": 0.5673295855522156, "learning_rate": 6.686060672292847e-06, "loss": 0.15805959701538086, "memory(GiB)": 37.02, "step": 280, "token_acc": 0.9461089281816197, "train_speed(iter/s)": 0.179432 }, { "epoch": 1.1672328194408153, "eval_loss": 0.23961253464221954, "eval_runtime": 6.6282, "eval_samples_per_second": 23.234, "eval_steps_per_second": 5.884, "eval_token_acc": 0.9277281773784879, "step": 280 }, { "epoch": 1.1881369218709172, "grad_norm": 0.5638378858566284, "learning_rate": 6.5825406988297815e-06, "loss": 0.160142982006073, "memory(GiB)": 37.02, "step": 285, "token_acc": 0.9402033005473476, "train_speed(iter/s)": 0.178002 }, { "epoch": 1.209041024301019, "grad_norm": 0.5472784638404846, "learning_rate": 6.478261205811188e-06, "loss": 0.15631693601608276, "memory(GiB)": 37.02, "step": 290, "token_acc": 0.9425503107883562, "train_speed(iter/s)": 0.178558 }, { "epoch": 1.229945126731121, "grad_norm": 0.5741646885871887, "learning_rate": 6.373272240805668e-06, "loss": 0.16377530097961426, "memory(GiB)": 37.02, "step": 295, "token_acc": 0.9453790125312598, "train_speed(iter/s)": 0.179076 }, { "epoch": 1.250849229161223, "grad_norm": 0.5998528599739075, "learning_rate": 6.267624191883551e-06, "loss": 0.1654489278793335, "memory(GiB)": 37.02, "step": 300, "token_acc": 0.9433800623052959, "train_speed(iter/s)": 0.179541 }, { "epoch": 1.250849229161223, "eval_loss": 0.23872551321983337, "eval_runtime": 6.6305, "eval_samples_per_second": 23.226, "eval_steps_per_second": 5.882, "eval_token_acc": 0.9284555314880386, "step": 300 }, { "epoch": 1.2717533315913248, "grad_norm": 0.5820715427398682, "learning_rate": 6.161367763433812e-06, "loss": 0.1659400701522827, "memory(GiB)": 37.02, "step": 305, "token_acc": 0.9422344370459437, "train_speed(iter/s)": 0.178162 }, { "epoch": 1.2926574340214267, "grad_norm": 0.5959991216659546, "learning_rate": 6.054553951829163e-06, "loss": 0.16145311594009398, "memory(GiB)": 37.02, "step": 310, "token_acc": 0.9404544902175234, "train_speed(iter/s)": 0.178528 }, { "epoch": 1.3135615364515285, "grad_norm": 0.6270994544029236, "learning_rate": 5.947234020951015e-06, "loss": 0.16696672439575194, "memory(GiB)": 37.02, "step": 315, "token_acc": 0.9387243092362137, "train_speed(iter/s)": 0.178987 }, { "epoch": 1.3344656388816305, "grad_norm": 0.5847781896591187, "learning_rate": 5.839459477586056e-06, "loss": 0.1612384557723999, "memory(GiB)": 37.02, "step": 320, "token_acc": 0.9422280984134284, "train_speed(iter/s)": 0.179441 }, { "epoch": 1.3344656388816305, "eval_loss": 0.23678933084011078, "eval_runtime": 6.6342, "eval_samples_per_second": 23.213, "eval_steps_per_second": 5.879, "eval_token_acc": 0.9287432759709378, "step": 320 }, { "epoch": 1.3553697413117325, "grad_norm": 0.5601792931556702, "learning_rate": 5.731282046706247e-06, "loss": 0.16321887969970703, "memory(GiB)": 37.02, "step": 325, "token_acc": 0.938570500852342, "train_speed(iter/s)": 0.17826 }, { "epoch": 1.3762738437418343, "grad_norm": 0.5960825085639954, "learning_rate": 5.622753646644102e-06, "loss": 0.16715322732925414, "memory(GiB)": 37.02, "step": 330, "token_acc": 0.9422235465708954, "train_speed(iter/s)": 0.178716 }, { "epoch": 1.3971779461719362, "grad_norm": 0.5615048408508301, "learning_rate": 5.513926364175172e-06, "loss": 0.16500518321990967, "memory(GiB)": 37.02, "step": 335, "token_acc": 0.9453201823653863, "train_speed(iter/s)": 0.179021 }, { "epoch": 1.418082048602038, "grad_norm": 0.5556049942970276, "learning_rate": 5.404852429519678e-06, "loss": 0.15454906225204468, "memory(GiB)": 37.02, "step": 340, "token_acc": 0.9407398356599567, "train_speed(iter/s)": 0.179344 }, { "epoch": 1.418082048602038, "eval_loss": 0.23516136407852173, "eval_runtime": 6.6178, "eval_samples_per_second": 23.271, "eval_steps_per_second": 5.893, "eval_token_acc": 0.9299502042186538, "step": 340 }, { "epoch": 1.43898615103214, "grad_norm": 0.5906144976615906, "learning_rate": 5.295584191275308e-06, "loss": 0.16890814304351806, "memory(GiB)": 37.02, "step": 345, "token_acc": 0.9373257274217237, "train_speed(iter/s)": 0.17811 }, { "epoch": 1.459890253462242, "grad_norm": 0.6000419855117798, "learning_rate": 5.1861740912932e-06, "loss": 0.16450071334838867, "memory(GiB)": 37.02, "step": 350, "token_acc": 0.9405447723714142, "train_speed(iter/s)": 0.178429 }, { "epoch": 1.480794355892344, "grad_norm": 0.5739107131958008, "learning_rate": 5.07667463950916e-06, "loss": 0.1568189740180969, "memory(GiB)": 37.02, "step": 355, "token_acc": 0.9423155065170506, "train_speed(iter/s)": 0.178939 }, { "epoch": 1.5016984583224458, "grad_norm": 0.5783458948135376, "learning_rate": 4.967138388742218e-06, "loss": 0.16183936595916748, "memory(GiB)": 37.02, "step": 360, "token_acc": 0.9435463668380956, "train_speed(iter/s)": 0.179308 }, { "epoch": 1.5016984583224458, "eval_loss": 0.23441138863563538, "eval_runtime": 6.6338, "eval_samples_per_second": 23.214, "eval_steps_per_second": 5.879, "eval_token_acc": 0.9301819983854337, "step": 360 }, { "epoch": 1.5226025607525477, "grad_norm": 0.509042501449585, "learning_rate": 4.8576179094725855e-06, "loss": 0.16039080619812013, "memory(GiB)": 37.02, "step": 365, "token_acc": 0.9386532058855117, "train_speed(iter/s)": 0.178194 }, { "epoch": 1.5435066631826495, "grad_norm": 0.5684186220169067, "learning_rate": 4.748165764611157e-06, "loss": 0.16779780387878418, "memory(GiB)": 37.02, "step": 370, "token_acc": 0.9417494999465592, "train_speed(iter/s)": 0.178626 }, { "epoch": 1.5644107656127515, "grad_norm": 0.5707104802131653, "learning_rate": 4.6388344842726266e-06, "loss": 0.16627538204193115, "memory(GiB)": 37.02, "step": 375, "token_acc": 0.938589794484991, "train_speed(iter/s)": 0.179043 }, { "epoch": 1.5853148680428535, "grad_norm": 0.5481753945350647, "learning_rate": 4.529676540564351e-06, "loss": 0.15947449207305908, "memory(GiB)": 37.02, "step": 380, "token_acc": 0.940670141008185, "train_speed(iter/s)": 0.179357 }, { "epoch": 1.5853148680428535, "eval_loss": 0.23353615403175354, "eval_runtime": 6.6285, "eval_samples_per_second": 23.233, "eval_steps_per_second": 5.884, "eval_token_acc": 0.9297983390749015, "step": 380 }, { "epoch": 1.6062189704729555, "grad_norm": 0.5632703900337219, "learning_rate": 4.420744322403058e-06, "loss": 0.15636355876922609, "memory(GiB)": 37.02, "step": 385, "token_acc": 0.9417008449869052, "train_speed(iter/s)": 0.178382 }, { "epoch": 1.6271230729030572, "grad_norm": 0.5890342593193054, "learning_rate": 4.312090110371473e-06, "loss": 0.15623259544372559, "memory(GiB)": 37.02, "step": 390, "token_acc": 0.9422333549803631, "train_speed(iter/s)": 0.178811 }, { "epoch": 1.648027175333159, "grad_norm": 0.5722407102584839, "learning_rate": 4.203766051626939e-06, "loss": 0.16071751117706298, "memory(GiB)": 37.02, "step": 395, "token_acc": 0.9453681710213777, "train_speed(iter/s)": 0.179205 }, { "epoch": 1.668931277763261, "grad_norm": 0.611022412776947, "learning_rate": 4.095824134874087e-06, "loss": 0.16332566738128662, "memory(GiB)": 37.02, "step": 400, "token_acc": 0.937804167388531, "train_speed(iter/s)": 0.179585 }, { "epoch": 1.668931277763261, "eval_loss": 0.23261623084545135, "eval_runtime": 6.6176, "eval_samples_per_second": 23.271, "eval_steps_per_second": 5.893, "eval_token_acc": 0.9303818209430026, "step": 400 }, { "epoch": 1.689835380193363, "grad_norm": 0.5979709029197693, "learning_rate": 3.988316165413528e-06, "loss": 0.1527752161026001, "memory(GiB)": 37.02, "step": 405, "token_acc": 0.940948516358103, "train_speed(iter/s)": 0.178456 }, { "epoch": 1.710739482623465, "grad_norm": 0.549656093120575, "learning_rate": 3.881293740278588e-06, "loss": 0.15298218727111818, "memory(GiB)": 39.46, "step": 410, "token_acc": 0.9467923967923968, "train_speed(iter/s)": 0.178833 }, { "epoch": 1.7316435850535667, "grad_norm": 0.5743902921676636, "learning_rate": 3.774808223471996e-06, "loss": 0.15700061321258546, "memory(GiB)": 39.46, "step": 415, "token_acc": 0.9460204535349044, "train_speed(iter/s)": 0.179167 }, { "epoch": 1.7525476874836685, "grad_norm": 0.5933798551559448, "learning_rate": 3.6689107213144025e-06, "loss": 0.16367003917694092, "memory(GiB)": 39.46, "step": 420, "token_acc": 0.9469271826676363, "train_speed(iter/s)": 0.179611 }, { "epoch": 1.7525476874836685, "eval_loss": 0.23134766519069672, "eval_runtime": 6.6251, "eval_samples_per_second": 23.245, "eval_steps_per_second": 5.887, "eval_token_acc": 0.9307654802535349, "step": 420 }, { "epoch": 1.7734517899137705, "grad_norm": 0.5866169929504395, "learning_rate": 3.5636520579165704e-06, "loss": 0.15368299484252929, "memory(GiB)": 39.46, "step": 425, "token_acc": 0.9376751889917608, "train_speed(iter/s)": 0.178672 }, { "epoch": 1.7943558923438725, "grad_norm": 0.6050538420677185, "learning_rate": 3.4590827507870257e-06, "loss": 0.1597348690032959, "memory(GiB)": 39.46, "step": 430, "token_acc": 0.9424976034327972, "train_speed(iter/s)": 0.179078 }, { "epoch": 1.8152599947739745, "grad_norm": 0.5516080856323242, "learning_rate": 3.3552529865868323e-06, "loss": 0.15672740936279297, "memory(GiB)": 39.46, "step": 435, "token_acc": 0.9391331914315475, "train_speed(iter/s)": 0.179453 }, { "epoch": 1.8361640972040763, "grad_norm": 0.5520040392875671, "learning_rate": 3.252212597043167e-06, "loss": 0.154363751411438, "memory(GiB)": 39.46, "step": 440, "token_acc": 0.9407710731669628, "train_speed(iter/s)": 0.179753 }, { "epoch": 1.8361640972040763, "eval_loss": 0.230976402759552, "eval_runtime": 6.6289, "eval_samples_per_second": 23.231, "eval_steps_per_second": 5.883, "eval_token_acc": 0.9307734731558376, "step": 440 }, { "epoch": 1.8570681996341782, "grad_norm": 0.5343540906906128, "learning_rate": 3.1500110350332492e-06, "loss": 0.1531538486480713, "memory(GiB)": 39.46, "step": 445, "token_acc": 0.9422152799196107, "train_speed(iter/s)": 0.178932 }, { "epoch": 1.87797230206428, "grad_norm": 0.5729184746742249, "learning_rate": 3.048697350850073e-06, "loss": 0.16055722236633302, "memory(GiB)": 39.46, "step": 450, "token_acc": 0.9512691733405728, "train_speed(iter/s)": 0.179136 }, { "epoch": 1.898876404494382, "grad_norm": 0.5333752036094666, "learning_rate": 2.9483201686613626e-06, "loss": 0.15494089126586913, "memory(GiB)": 39.46, "step": 455, "token_acc": 0.9471734021540102, "train_speed(iter/s)": 0.179584 }, { "epoch": 1.919780506924484, "grad_norm": 0.8302111625671387, "learning_rate": 2.8489276631730633e-06, "loss": 0.15890274047851563, "memory(GiB)": 39.46, "step": 460, "token_acc": 0.9424079706996089, "train_speed(iter/s)": 0.179933 }, { "epoch": 1.919780506924484, "eval_loss": 0.22922886908054352, "eval_runtime": 6.6337, "eval_samples_per_second": 23.215, "eval_steps_per_second": 5.879, "eval_token_acc": 0.9315727633861132, "step": 460 }, { "epoch": 1.940684609354586, "grad_norm": 0.5802881717681885, "learning_rate": 2.750567536508504e-06, "loss": 0.1534783959388733, "memory(GiB)": 39.46, "step": 465, "token_acc": 0.9404760675375639, "train_speed(iter/s)": 0.178985 }, { "epoch": 1.9615887117846877, "grad_norm": 0.5230849385261536, "learning_rate": 2.653286995314398e-06, "loss": 0.15316032171249389, "memory(GiB)": 39.46, "step": 470, "token_acc": 0.940865764070184, "train_speed(iter/s)": 0.179339 }, { "epoch": 1.9824928142147895, "grad_norm": 0.5382450222969055, "learning_rate": 2.5571327281046486e-06, "loss": 0.15808116197586058, "memory(GiB)": 39.46, "step": 475, "token_acc": 0.9414967066105865, "train_speed(iter/s)": 0.179724 }, { "epoch": 2.0, "grad_norm": 0.5938447117805481, "learning_rate": 2.46215088285279e-06, "loss": 0.16262369155883788, "memory(GiB)": 39.46, "step": 480, "token_acc": 0.9486793264834041, "train_speed(iter/s)": 0.180254 }, { "epoch": 2.0, "eval_loss": 0.2286583036184311, "eval_runtime": 6.6325, "eval_samples_per_second": 23.219, "eval_steps_per_second": 5.88, "eval_token_acc": 0.9318205433574985, "step": 480 }, { "epoch": 2.020904102430102, "grad_norm": 0.536767303943634, "learning_rate": 2.3683870448438905e-06, "loss": 0.12537214756011963, "memory(GiB)": 39.46, "step": 485, "token_acc": 0.9521131994289179, "train_speed(iter/s)": 0.179418 }, { "epoch": 2.041808204860204, "grad_norm": 0.5739563703536987, "learning_rate": 2.2758862147964933e-06, "loss": 0.12507247924804688, "memory(GiB)": 39.46, "step": 490, "token_acc": 0.9547663049958268, "train_speed(iter/s)": 0.179658 }, { "epoch": 2.0627123072903055, "grad_norm": 0.6101587414741516, "learning_rate": 2.1846927872651135e-06, "loss": 0.1226424217224121, "memory(GiB)": 39.46, "step": 495, "token_acc": 0.9567438898659713, "train_speed(iter/s)": 0.180059 }, { "epoch": 2.0836164097204075, "grad_norm": 0.5811767578125, "learning_rate": 2.0948505293336506e-06, "loss": 0.12014656066894532, "memory(GiB)": 39.46, "step": 500, "token_acc": 0.9582742281109468, "train_speed(iter/s)": 0.180402 }, { "epoch": 2.0836164097204075, "eval_loss": 0.24429188668727875, "eval_runtime": 6.6294, "eval_samples_per_second": 23.23, "eval_steps_per_second": 5.883, "eval_token_acc": 0.9308134376673514, "step": 500 }, { "epoch": 2.1045205121505095, "grad_norm": 0.610534131526947, "learning_rate": 2.0064025596099663e-06, "loss": 0.11917252540588379, "memory(GiB)": 39.46, "step": 505, "token_acc": 0.9507537400408873, "train_speed(iter/s)": 0.179597 }, { "epoch": 2.1254246145806115, "grad_norm": 0.5680590867996216, "learning_rate": 1.919391327531663e-06, "loss": 0.12412093877792359, "memory(GiB)": 39.46, "step": 510, "token_acc": 0.9561542850973921, "train_speed(iter/s)": 0.179885 }, { "epoch": 2.1463287170107135, "grad_norm": 0.5825695395469666, "learning_rate": 1.8338585929930424e-06, "loss": 0.11532289981842041, "memory(GiB)": 39.46, "step": 515, "token_acc": 0.9611214610756498, "train_speed(iter/s)": 0.180163 }, { "epoch": 2.167232819440815, "grad_norm": 0.5703282952308655, "learning_rate": 1.7498454063029984e-06, "loss": 0.11896244287490845, "memory(GiB)": 39.46, "step": 520, "token_acc": 0.9602545823350432, "train_speed(iter/s)": 0.180409 }, { "epoch": 2.167232819440815, "eval_loss": 0.24228492379188538, "eval_runtime": 6.5759, "eval_samples_per_second": 23.419, "eval_steps_per_second": 5.931, "eval_token_acc": 0.9311811111732782, "step": 520 }, { "epoch": 2.188136921870917, "grad_norm": 0.5464326739311218, "learning_rate": 1.667392088483456e-06, "loss": 0.12099326848983764, "memory(GiB)": 39.46, "step": 525, "token_acc": 0.9483148644281589, "train_speed(iter/s)": 0.179639 }, { "epoch": 2.209041024301019, "grad_norm": 0.5550944805145264, "learning_rate": 1.5865382119178258e-06, "loss": 0.11996636390686036, "memory(GiB)": 39.46, "step": 530, "token_acc": 0.9563651862085183, "train_speed(iter/s)": 0.179841 }, { "epoch": 2.229945126731121, "grad_norm": 0.5549576282501221, "learning_rate": 1.507322581358771e-06, "loss": 0.11995522975921631, "memory(GiB)": 39.46, "step": 535, "token_acc": 0.9548897519016594, "train_speed(iter/s)": 0.180134 }, { "epoch": 2.250849229161223, "grad_norm": 0.5435655117034912, "learning_rate": 1.4297832153043657e-06, "loss": 0.11738158464431762, "memory(GiB)": 39.46, "step": 540, "token_acc": 0.9565079135650791, "train_speed(iter/s)": 0.180343 }, { "epoch": 2.250849229161223, "eval_loss": 0.24256764352321625, "eval_runtime": 6.6338, "eval_samples_per_second": 23.214, "eval_steps_per_second": 5.879, "eval_token_acc": 0.9310052673226176, "step": 540 }, { "epoch": 2.271753331591325, "grad_norm": 0.5776016116142273, "learning_rate": 1.353957327751621e-06, "loss": 0.12343101501464844, "memory(GiB)": 39.46, "step": 545, "token_acc": 0.9484948703433345, "train_speed(iter/s)": 0.179597 }, { "epoch": 2.2926574340214265, "grad_norm": 0.5709097385406494, "learning_rate": 1.2798813103361291e-06, "loss": 0.11971625089645385, "memory(GiB)": 39.46, "step": 550, "token_acc": 0.9561414627962986, "train_speed(iter/s)": 0.179863 }, { "epoch": 2.3135615364515285, "grad_norm": 0.5969334840774536, "learning_rate": 1.2075907148663579e-06, "loss": 0.12062759399414062, "memory(GiB)": 39.46, "step": 555, "token_acc": 0.9569148857062122, "train_speed(iter/s)": 0.180073 }, { "epoch": 2.3344656388816305, "grad_norm": 0.5327123403549194, "learning_rate": 1.1371202362610412e-06, "loss": 0.11765568256378174, "memory(GiB)": 39.46, "step": 560, "token_acc": 0.9581596210457106, "train_speed(iter/s)": 0.180333 }, { "epoch": 2.3344656388816305, "eval_loss": 0.2422230988740921, "eval_runtime": 6.5972, "eval_samples_per_second": 23.343, "eval_steps_per_second": 5.912, "eval_token_acc": 0.9311651253686726, "step": 560 }, { "epoch": 2.3553697413117325, "grad_norm": 0.5562822818756104, "learning_rate": 1.06850369589781e-06, "loss": 0.11786850690841674, "memory(GiB)": 39.46, "step": 565, "token_acc": 0.9489760711168842, "train_speed(iter/s)": 0.179625 }, { "epoch": 2.3762738437418345, "grad_norm": 0.5818617939949036, "learning_rate": 1.0017740253810608e-06, "loss": 0.12321114540100098, "memory(GiB)": 39.46, "step": 570, "token_acc": 0.9529802240354032, "train_speed(iter/s)": 0.179844 }, { "epoch": 2.3971779461719365, "grad_norm": 0.5551012754440308, "learning_rate": 9.369632507368736e-07, "loss": 0.13383883237838745, "memory(GiB)": 39.46, "step": 575, "token_acc": 0.9536231884057971, "train_speed(iter/s)": 0.180115 }, { "epoch": 2.418082048602038, "grad_norm": 0.5434304475784302, "learning_rate": 8.741024770425394e-07, "loss": 0.12519620656967162, "memory(GiB)": 39.46, "step": 580, "token_acc": 0.9545193961398816, "train_speed(iter/s)": 0.180327 }, { "epoch": 2.418082048602038, "eval_loss": 0.24199490249156952, "eval_runtime": 6.6585, "eval_samples_per_second": 23.128, "eval_steps_per_second": 5.857, "eval_token_acc": 0.9310612176387368, "step": 580 }, { "epoch": 2.43898615103214, "grad_norm": 0.5505854487419128, "learning_rate": 8.132218734980852e-07, "loss": 0.12351593971252442, "memory(GiB)": 39.46, "step": 585, "token_acc": 0.9496205882650123, "train_speed(iter/s)": 0.17969 }, { "epoch": 2.459890253462242, "grad_norm": 0.5361783504486084, "learning_rate": 7.543506589469674e-07, "loss": 0.12893223762512207, "memory(GiB)": 39.46, "step": 590, "token_acc": 0.9550055365804208, "train_speed(iter/s)": 0.179827 }, { "epoch": 2.480794355892344, "grad_norm": 0.570577085018158, "learning_rate": 6.975170878528765e-07, "loss": 0.12303224802017212, "memory(GiB)": 39.46, "step": 595, "token_acc": 0.9543349832348714, "train_speed(iter/s)": 0.180173 }, { "epoch": 2.501698458322446, "grad_norm": 0.5742475390434265, "learning_rate": 6.427484367393699e-07, "loss": 0.1256563663482666, "memory(GiB)": 39.46, "step": 600, "token_acc": 0.954024779864412, "train_speed(iter/s)": 0.180362 }, { "epoch": 2.501698458322446, "eval_loss": 0.24162955582141876, "eval_runtime": 6.6424, "eval_samples_per_second": 23.184, "eval_steps_per_second": 5.871, "eval_token_acc": 0.9309413241041955, "step": 600 }, { "epoch": 2.5226025607525475, "grad_norm": 0.5502005219459534, "learning_rate": 5.900709910988739e-07, "loss": 0.1203322172164917, "memory(GiB)": 39.46, "step": 605, "token_acc": 0.945725693054572, "train_speed(iter/s)": 0.179668 }, { "epoch": 2.5435066631826495, "grad_norm": 0.550845742225647, "learning_rate": 5.395100327773018e-07, "loss": 0.11989848613739014, "memory(GiB)": 39.46, "step": 610, "token_acc": 0.9564310580152438, "train_speed(iter/s)": 0.179957 }, { "epoch": 2.5644107656127515, "grad_norm": 0.5418065190315247, "learning_rate": 4.91089827840367e-07, "loss": 0.12273094654083253, "memory(GiB)": 39.46, "step": 615, "token_acc": 0.9586797908520233, "train_speed(iter/s)": 0.180256 }, { "epoch": 2.5853148680428535, "grad_norm": 0.5306729674339294, "learning_rate": 4.4483361492740184e-07, "loss": 0.11489535570144653, "memory(GiB)": 39.46, "step": 620, "token_acc": 0.9587957254418413, "train_speed(iter/s)": 0.180471 }, { "epoch": 2.5853148680428535, "eval_loss": 0.24175922572612762, "eval_runtime": 6.6237, "eval_samples_per_second": 23.25, "eval_steps_per_second": 5.888, "eval_token_acc": 0.9314368840469663, "step": 620 }, { "epoch": 2.6062189704729555, "grad_norm": 0.5641535520553589, "learning_rate": 4.007635940982857e-07, "loss": 0.11712099313735962, "memory(GiB)": 39.46, "step": 625, "token_acc": 0.9532489285186463, "train_speed(iter/s)": 0.179823 }, { "epoch": 2.627123072903057, "grad_norm": 0.6171642541885376, "learning_rate": 3.589009161788104e-07, "loss": 0.1301506519317627, "memory(GiB)": 39.46, "step": 630, "token_acc": 0.9547707464948845, "train_speed(iter/s)": 0.180046 }, { "epoch": 2.648027175333159, "grad_norm": 0.5656234622001648, "learning_rate": 3.192656726096277e-07, "loss": 0.1133504033088684, "memory(GiB)": 39.46, "step": 635, "token_acc": 0.9620012398557258, "train_speed(iter/s)": 0.180284 }, { "epoch": 2.668931277763261, "grad_norm": 0.5739580988883972, "learning_rate": 2.818768858036208e-07, "loss": 0.11660020351409912, "memory(GiB)": 39.46, "step": 640, "token_acc": 0.9527535918095915, "train_speed(iter/s)": 0.180505 }, { "epoch": 2.668931277763261, "eval_loss": 0.24187050759792328, "eval_runtime": 6.6525, "eval_samples_per_second": 23.149, "eval_steps_per_second": 5.862, "eval_token_acc": 0.9315088201676911, "step": 640 }, { "epoch": 2.689835380193363, "grad_norm": 0.5692787170410156, "learning_rate": 2.467525000163523e-07, "loss": 0.12488572597503662, "memory(GiB)": 39.46, "step": 645, "token_acc": 0.949108626758746, "train_speed(iter/s)": 0.179804 }, { "epoch": 2.710739482623465, "grad_norm": 0.5694913864135742, "learning_rate": 2.139093727339503e-07, "loss": 0.1269749402999878, "memory(GiB)": 39.46, "step": 650, "token_acc": 0.9520699186238631, "train_speed(iter/s)": 0.180029 }, { "epoch": 2.7316435850535665, "grad_norm": 0.5761524438858032, "learning_rate": 1.8336326658258797e-07, "loss": 0.12404175996780395, "memory(GiB)": 39.46, "step": 655, "token_acc": 0.9545447343821791, "train_speed(iter/s)": 0.180226 }, { "epoch": 2.7525476874836685, "grad_norm": 0.5228170156478882, "learning_rate": 1.551288417634106e-07, "loss": 0.11508429050445557, "memory(GiB)": 39.46, "step": 660, "token_acc": 0.961899334176617, "train_speed(iter/s)": 0.180412 }, { "epoch": 2.7525476874836685, "eval_loss": 0.24170467257499695, "eval_runtime": 6.6488, "eval_samples_per_second": 23.162, "eval_steps_per_second": 5.866, "eval_token_acc": 0.9315327988745994, "step": 660 }, { "epoch": 2.7734517899137705, "grad_norm": 0.5596388578414917, "learning_rate": 1.292196490165698e-07, "loss": 0.12398817539215087, "memory(GiB)": 39.46, "step": 665, "token_acc": 0.9481295480168085, "train_speed(iter/s)": 0.179823 }, { "epoch": 2.7943558923438725, "grad_norm": 0.5801319479942322, "learning_rate": 1.0564812311772422e-07, "loss": 0.1263979434967041, "memory(GiB)": 39.46, "step": 670, "token_acc": 0.9553416631792909, "train_speed(iter/s)": 0.180051 }, { "epoch": 2.8152599947739745, "grad_norm": 0.6298143863677979, "learning_rate": 8.442557691013042e-08, "loss": 0.12353664636611938, "memory(GiB)": 39.46, "step": 675, "token_acc": 0.955449425153873, "train_speed(iter/s)": 0.180251 }, { "epoch": 2.836164097204076, "grad_norm": 0.5088171362876892, "learning_rate": 6.556219587519397e-08, "loss": 0.11518588066101074, "memory(GiB)": 39.46, "step": 680, "token_acc": 0.9564935905736113, "train_speed(iter/s)": 0.180447 }, { "epoch": 2.836164097204076, "eval_loss": 0.2416183054447174, "eval_runtime": 6.639, "eval_samples_per_second": 23.196, "eval_steps_per_second": 5.874, "eval_token_acc": 0.9316287137022324, "step": 680 }, { "epoch": 2.8570681996341785, "grad_norm": 0.5331613421440125, "learning_rate": 4.906703324408402e-08, "loss": 0.11869626045227051, "memory(GiB)": 39.46, "step": 685, "token_acc": 0.9486922250089573, "train_speed(iter/s)": 0.179804 }, { "epoch": 2.87797230206428, "grad_norm": 0.5424652099609375, "learning_rate": 3.494800565275125e-08, "loss": 0.11893690824508667, "memory(GiB)": 39.46, "step": 690, "token_acc": 0.958352824231084, "train_speed(iter/s)": 0.179959 }, { "epoch": 2.898876404494382, "grad_norm": 0.5787132978439331, "learning_rate": 2.321188934244323e-08, "loss": 0.12508745193481446, "memory(GiB)": 39.46, "step": 695, "token_acc": 0.9538041333143876, "train_speed(iter/s)": 0.180164 }, { "epoch": 2.919780506924484, "grad_norm": 0.5377312302589417, "learning_rate": 1.3864316907539754e-08, "loss": 0.11967707872390747, "memory(GiB)": 39.46, "step": 700, "token_acc": 0.9583007797539715, "train_speed(iter/s)": 0.180364 }, { "epoch": 2.919780506924484, "eval_loss": 0.24175819754600525, "eval_runtime": 6.6511, "eval_samples_per_second": 23.154, "eval_steps_per_second": 5.864, "eval_token_acc": 0.9318525149667095, "step": 700 }, { "epoch": 2.940684609354586, "grad_norm": 0.5400473475456238, "learning_rate": 6.9097745922580564e-09, "loss": 0.11984028816223144, "memory(GiB)": 39.46, "step": 705, "token_acc": 0.950023827776972, "train_speed(iter/s)": 0.179708 }, { "epoch": 2.961588711784688, "grad_norm": 0.5357044339179993, "learning_rate": 2.3516001375439856e-09, "loss": 0.1183488130569458, "memory(GiB)": 39.46, "step": 710, "token_acc": 0.9582627468029924, "train_speed(iter/s)": 0.179902 }, { "epoch": 2.9824928142147895, "grad_norm": 0.5505788922309875, "learning_rate": 1.919811791650794e-10, "loss": 0.12027006149291992, "memory(GiB)": 39.46, "step": 715, "token_acc": 0.9548030559208467, "train_speed(iter/s)": 0.180076 }, { "epoch": 2.9908544551868306, "eval_loss": 0.24157196283340454, "eval_runtime": 6.6398, "eval_samples_per_second": 23.193, "eval_steps_per_second": 5.874, "eval_token_acc": 0.9317246285298655, "step": 717 } ], "logging_steps": 5, "max_steps": 717, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0098725192109916e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }