Files
qwen2.5vl-3b-32b-longest-15460/trainer_state.json
ModelHub XC 873af72fa4 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-32b-longest-15460
Source: Original Platform
2026-05-22 03:54:13 +08:00

1799 lines
51 KiB
JSON

{
"best_global_step": 480,
"best_metric": 0.2286583,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v47-20250505-200714/checkpoint-480",
"epoch": 2.9908544551868306,
"eval_steps": 20,
"global_step": 717,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004180820486020381,
"grad_norm": 2.620922088623047,
"learning_rate": 9.999952004474853e-06,
"loss": 0.34187427163124084,
"memory(GiB)": 29.13,
"step": 1,
"token_acc": 0.9049815498154982,
"train_speed(iter/s)": 0.073614
},
{
"epoch": 0.020904102430101906,
"grad_norm": 1.5743770599365234,
"learning_rate": 9.998800157942083e-06,
"loss": 0.2900405824184418,
"memory(GiB)": 29.13,
"step": 5,
"token_acc": 0.8957104981995172,
"train_speed(iter/s)": 0.151098
},
{
"epoch": 0.04180820486020381,
"grad_norm": 0.8994374871253967,
"learning_rate": 9.995201207616718e-06,
"loss": 0.2705298900604248,
"memory(GiB)": 29.14,
"step": 10,
"token_acc": 0.9024558145491803,
"train_speed(iter/s)": 0.180178
},
{
"epoch": 0.06271230729030572,
"grad_norm": 0.6387772560119629,
"learning_rate": 9.98920487629269e-06,
"loss": 0.24947943687438964,
"memory(GiB)": 32.63,
"step": 15,
"token_acc": 0.9174802221848174,
"train_speed(iter/s)": 0.188578
},
{
"epoch": 0.08361640972040763,
"grad_norm": 0.6275189518928528,
"learning_rate": 9.980814041830203e-06,
"loss": 0.25027036666870117,
"memory(GiB)": 34.7,
"step": 20,
"token_acc": 0.9166679292737989,
"train_speed(iter/s)": 0.191459
},
{
"epoch": 0.08361640972040763,
"eval_loss": 0.28380751609802246,
"eval_runtime": 6.7011,
"eval_samples_per_second": 22.981,
"eval_steps_per_second": 5.82,
"eval_token_acc": 0.9153551646138229,
"step": 20
},
{
"epoch": 0.10452051215050953,
"grad_norm": 0.6861765384674072,
"learning_rate": 9.970032731299697e-06,
"loss": 0.24285974502563476,
"memory(GiB)": 34.7,
"step": 25,
"token_acc": 0.9173268332317692,
"train_speed(iter/s)": 0.170029
},
{
"epoch": 0.12542461458061144,
"grad_norm": 0.6327997446060181,
"learning_rate": 9.956866119049095e-06,
"loss": 0.2500872850418091,
"memory(GiB)": 34.7,
"step": 30,
"token_acc": 0.9172865583004255,
"train_speed(iter/s)": 0.175321
},
{
"epoch": 0.14632871701071334,
"grad_norm": 0.6613020300865173,
"learning_rate": 9.941320524220455e-06,
"loss": 0.24076151847839355,
"memory(GiB)": 34.7,
"step": 35,
"token_acc": 0.9165492852219258,
"train_speed(iter/s)": 0.179249
},
{
"epoch": 0.16723281944081525,
"grad_norm": 0.5987035036087036,
"learning_rate": 9.92340340771717e-06,
"loss": 0.23788681030273437,
"memory(GiB)": 37.02,
"step": 40,
"token_acc": 0.9233054502142574,
"train_speed(iter/s)": 0.182322
},
{
"epoch": 0.16723281944081525,
"eval_loss": 0.2673507630825043,
"eval_runtime": 6.6176,
"eval_samples_per_second": 23.271,
"eval_steps_per_second": 5.893,
"eval_token_acc": 0.9202628066277145,
"step": 40
},
{
"epoch": 0.18813692187091716,
"grad_norm": 0.5674816370010376,
"learning_rate": 9.903123368623216e-06,
"loss": 0.2267207145690918,
"memory(GiB)": 37.02,
"step": 45,
"token_acc": 0.9245033510654285,
"train_speed(iter/s)": 0.172104
},
{
"epoch": 0.20904102430101906,
"grad_norm": 0.5934615731239319,
"learning_rate": 9.88049014007613e-06,
"loss": 0.23341102600097657,
"memory(GiB)": 37.02,
"step": 50,
"token_acc": 0.9186244567132621,
"train_speed(iter/s)": 0.175846
},
{
"epoch": 0.22994512673112097,
"grad_norm": 0.6319158673286438,
"learning_rate": 9.855514584595719e-06,
"loss": 0.24078943729400634,
"memory(GiB)": 37.02,
"step": 55,
"token_acc": 0.9141281922363426,
"train_speed(iter/s)": 0.17881
},
{
"epoch": 0.2508492291612229,
"grad_norm": 0.6614541411399841,
"learning_rate": 9.828208688870736e-06,
"loss": 0.2285898208618164,
"memory(GiB)": 37.02,
"step": 60,
"token_acc": 0.919986967045243,
"train_speed(iter/s)": 0.181102
},
{
"epoch": 0.2508492291612229,
"eval_loss": 0.2608849108219147,
"eval_runtime": 6.6441,
"eval_samples_per_second": 23.178,
"eval_steps_per_second": 5.87,
"eval_token_acc": 0.9218693799905684,
"step": 60
},
{
"epoch": 0.2717533315913248,
"grad_norm": 0.6391910314559937,
"learning_rate": 9.79858555800603e-06,
"loss": 0.23735857009887695,
"memory(GiB)": 37.02,
"step": 65,
"token_acc": 0.9195915366053212,
"train_speed(iter/s)": 0.173657
},
{
"epoch": 0.2926574340214267,
"grad_norm": 0.6433473825454712,
"learning_rate": 9.766659409232918e-06,
"loss": 0.22774579524993896,
"memory(GiB)": 37.02,
"step": 70,
"token_acc": 0.9207077029819454,
"train_speed(iter/s)": 0.17624
},
{
"epoch": 0.3135615364515286,
"grad_norm": 0.5562565326690674,
"learning_rate": 9.732445565085823e-06,
"loss": 0.22526907920837402,
"memory(GiB)": 37.02,
"step": 75,
"token_acc": 0.9271212368745452,
"train_speed(iter/s)": 0.177663
},
{
"epoch": 0.3344656388816305,
"grad_norm": 0.7328993678092957,
"learning_rate": 9.69596044604841e-06,
"loss": 0.23299379348754884,
"memory(GiB)": 37.02,
"step": 80,
"token_acc": 0.917067597792331,
"train_speed(iter/s)": 0.180448
},
{
"epoch": 0.3344656388816305,
"eval_loss": 0.25337016582489014,
"eval_runtime": 6.6465,
"eval_samples_per_second": 23.17,
"eval_steps_per_second": 5.868,
"eval_token_acc": 0.9238915842731654,
"step": 80
},
{
"epoch": 0.35536974131173243,
"grad_norm": 0.6519869565963745,
"learning_rate": 9.657221562672803e-06,
"loss": 0.22955694198608398,
"memory(GiB)": 37.02,
"step": 85,
"token_acc": 0.9178025442310218,
"train_speed(iter/s)": 0.174696
},
{
"epoch": 0.3762738437418343,
"grad_norm": 0.5939056277275085,
"learning_rate": 9.616247507175624e-06,
"loss": 0.22620651721954346,
"memory(GiB)": 37.02,
"step": 90,
"token_acc": 0.9214102974096579,
"train_speed(iter/s)": 0.176915
},
{
"epoch": 0.39717794617193625,
"grad_norm": 0.6213501691818237,
"learning_rate": 9.573057944514897e-06,
"loss": 0.2274672269821167,
"memory(GiB)": 37.02,
"step": 95,
"token_acc": 0.9245137714423773,
"train_speed(iter/s)": 0.178294
},
{
"epoch": 0.4180820486020381,
"grad_norm": 0.6670387387275696,
"learning_rate": 9.527673602952123e-06,
"loss": 0.23140230178833007,
"memory(GiB)": 37.02,
"step": 100,
"token_acc": 0.9132270680440643,
"train_speed(iter/s)": 0.179262
},
{
"epoch": 0.4180820486020381,
"eval_loss": 0.25054118037223816,
"eval_runtime": 6.6295,
"eval_samples_per_second": 23.23,
"eval_steps_per_second": 5.883,
"eval_token_acc": 0.9250425622047622,
"step": 100
},
{
"epoch": 0.43898615103214006,
"grad_norm": 0.6463162899017334,
"learning_rate": 9.48011626410401e-06,
"loss": 0.22332818508148194,
"memory(GiB)": 37.02,
"step": 105,
"token_acc": 0.9278207538447314,
"train_speed(iter/s)": 0.174957
},
{
"epoch": 0.45989025346224194,
"grad_norm": 0.5779640674591064,
"learning_rate": 9.430408752488687e-06,
"loss": 0.2219472885131836,
"memory(GiB)": 37.02,
"step": 110,
"token_acc": 0.9219851633866716,
"train_speed(iter/s)": 0.176134
},
{
"epoch": 0.48079435589234387,
"grad_norm": 0.6310390830039978,
"learning_rate": 9.378574924571362e-06,
"loss": 0.22848432064056395,
"memory(GiB)": 37.02,
"step": 115,
"token_acc": 0.9266341377642974,
"train_speed(iter/s)": 0.177157
},
{
"epoch": 0.5016984583224458,
"grad_norm": 0.5928042531013489,
"learning_rate": 9.324639657314742e-06,
"loss": 0.23035595417022706,
"memory(GiB)": 37.02,
"step": 120,
"token_acc": 0.9220481773335896,
"train_speed(iter/s)": 0.178639
},
{
"epoch": 0.5016984583224458,
"eval_loss": 0.24868212640285492,
"eval_runtime": 6.6274,
"eval_samples_per_second": 23.237,
"eval_steps_per_second": 5.885,
"eval_token_acc": 0.9254182286129916,
"step": 120
},
{
"epoch": 0.5226025607525477,
"grad_norm": 0.5700287222862244,
"learning_rate": 9.268628836239646e-06,
"loss": 0.22175629138946534,
"memory(GiB)": 37.02,
"step": 125,
"token_acc": 0.9286114910407667,
"train_speed(iter/s)": 0.175138
},
{
"epoch": 0.5435066631826496,
"grad_norm": 0.6334052681922913,
"learning_rate": 9.21056934300161e-06,
"loss": 0.2261972188949585,
"memory(GiB)": 37.02,
"step": 130,
"token_acc": 0.9182782377470237,
"train_speed(iter/s)": 0.176545
},
{
"epoch": 0.5644107656127515,
"grad_norm": 0.6031065583229065,
"learning_rate": 9.150489042489368e-06,
"loss": 0.2102799892425537,
"memory(GiB)": 37.02,
"step": 135,
"token_acc": 0.9315734630079889,
"train_speed(iter/s)": 0.177648
},
{
"epoch": 0.5853148680428534,
"grad_norm": 0.593549907207489,
"learning_rate": 9.088416769451485e-06,
"loss": 0.21703071594238282,
"memory(GiB)": 37.02,
"step": 140,
"token_acc": 0.9275403061439635,
"train_speed(iter/s)": 0.178909
},
{
"epoch": 0.5853148680428534,
"eval_loss": 0.24534018337726593,
"eval_runtime": 6.6365,
"eval_samples_per_second": 23.205,
"eval_steps_per_second": 5.877,
"eval_token_acc": 0.9266331497630105,
"step": 140
},
{
"epoch": 0.6062189704729554,
"grad_norm": 0.6485511660575867,
"learning_rate": 9.02438231465749e-06,
"loss": 0.2273317813873291,
"memory(GiB)": 37.02,
"step": 145,
"token_acc": 0.9251848272032739,
"train_speed(iter/s)": 0.175995
},
{
"epoch": 0.6271230729030572,
"grad_norm": 0.6085463166236877,
"learning_rate": 8.958416410600188e-06,
"loss": 0.22932357788085939,
"memory(GiB)": 37.02,
"step": 150,
"token_acc": 0.9163140495867769,
"train_speed(iter/s)": 0.177285
},
{
"epoch": 0.6480271753331591,
"grad_norm": 0.5881887078285217,
"learning_rate": 8.890550716746013e-06,
"loss": 0.2232443571090698,
"memory(GiB)": 37.02,
"step": 155,
"token_acc": 0.920258257982589,
"train_speed(iter/s)": 0.17828
},
{
"epoch": 0.668931277763261,
"grad_norm": 0.5788638591766357,
"learning_rate": 8.820817804340471e-06,
"loss": 0.20956034660339357,
"memory(GiB)": 37.02,
"step": 160,
"token_acc": 0.9304798016625346,
"train_speed(iter/s)": 0.179214
},
{
"epoch": 0.668931277763261,
"eval_loss": 0.24245667457580566,
"eval_runtime": 6.5715,
"eval_samples_per_second": 23.435,
"eval_steps_per_second": 5.935,
"eval_token_acc": 0.927040787780451,
"step": 160
},
{
"epoch": 0.689835380193363,
"grad_norm": 0.6712965369224548,
"learning_rate": 8.749251140776016e-06,
"loss": 0.2295995235443115,
"memory(GiB)": 37.02,
"step": 165,
"token_acc": 0.9216801551445813,
"train_speed(iter/s)": 0.176683
},
{
"epoch": 0.7107394826234649,
"grad_norm": 0.590825617313385,
"learning_rate": 8.675885073529802e-06,
"loss": 0.21770024299621582,
"memory(GiB)": 37.02,
"step": 170,
"token_acc": 0.9224579044445766,
"train_speed(iter/s)": 0.177781
},
{
"epoch": 0.7316435850535667,
"grad_norm": 0.6172182559967041,
"learning_rate": 8.600754813679072e-06,
"loss": 0.21406757831573486,
"memory(GiB)": 37.02,
"step": 175,
"token_acc": 0.9237590553351814,
"train_speed(iter/s)": 0.178707
},
{
"epoch": 0.7525476874836686,
"grad_norm": 0.5739086866378784,
"learning_rate": 8.52389641900206e-06,
"loss": 0.20645816326141359,
"memory(GiB)": 37.02,
"step": 180,
"token_acc": 0.926357608155873,
"train_speed(iter/s)": 0.179338
},
{
"epoch": 0.7525476874836686,
"eval_loss": 0.24040701985359192,
"eval_runtime": 6.6295,
"eval_samples_per_second": 23.229,
"eval_steps_per_second": 5.883,
"eval_token_acc": 0.9270647664873592,
"step": 180
},
{
"epoch": 0.7734517899137706,
"grad_norm": 0.5748027563095093,
"learning_rate": 8.445346776672546e-06,
"loss": 0.21495263576507567,
"memory(GiB)": 37.02,
"step": 185,
"token_acc": 0.9272353992083938,
"train_speed(iter/s)": 0.177187
},
{
"epoch": 0.7943558923438725,
"grad_norm": 0.6537684798240662,
"learning_rate": 8.365143585556326e-06,
"loss": 0.22100327014923096,
"memory(GiB)": 37.02,
"step": 190,
"token_acc": 0.9197183562004408,
"train_speed(iter/s)": 0.177962
},
{
"epoch": 0.8152599947739744,
"grad_norm": 0.6247681379318237,
"learning_rate": 8.283325338118154e-06,
"loss": 0.21339471340179444,
"memory(GiB)": 37.02,
"step": 195,
"token_acc": 0.9236349420094696,
"train_speed(iter/s)": 0.178683
},
{
"epoch": 0.8361640972040763,
"grad_norm": 0.6301010847091675,
"learning_rate": 8.199931301947782e-06,
"loss": 0.2191436767578125,
"memory(GiB)": 37.02,
"step": 200,
"token_acc": 0.9171661150371232,
"train_speed(iter/s)": 0.179195
},
{
"epoch": 0.8361640972040763,
"eval_loss": 0.2372375875711441,
"eval_runtime": 6.6381,
"eval_samples_per_second": 23.199,
"eval_steps_per_second": 5.875,
"eval_token_acc": 0.9280638792752036,
"step": 200
},
{
"epoch": 0.8570681996341782,
"grad_norm": 0.5749327540397644,
"learning_rate": 8.115001500914e-06,
"loss": 0.21314010620117188,
"memory(GiB)": 37.02,
"step": 205,
"token_acc": 0.9257877237051938,
"train_speed(iter/s)": 0.176853
},
{
"epoch": 0.8779723020642801,
"grad_norm": 0.6665855050086975,
"learning_rate": 8.028576695955711e-06,
"loss": 0.20737431049346924,
"memory(GiB)": 37.02,
"step": 210,
"token_acc": 0.9287075750122971,
"train_speed(iter/s)": 0.177713
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.5684797763824463,
"learning_rate": 7.940698365519246e-06,
"loss": 0.21522219181060792,
"memory(GiB)": 37.02,
"step": 215,
"token_acc": 0.9235592334740796,
"train_speed(iter/s)": 0.178152
},
{
"epoch": 0.9197805069244839,
"grad_norm": 0.6096109747886658,
"learning_rate": 7.851408685651342e-06,
"loss": 0.21898245811462402,
"memory(GiB)": 37.02,
"step": 220,
"token_acc": 0.9227710824734293,
"train_speed(iter/s)": 0.178957
},
{
"epoch": 0.9197805069244839,
"eval_loss": 0.23681528866291046,
"eval_runtime": 6.6341,
"eval_samples_per_second": 23.213,
"eval_steps_per_second": 5.879,
"eval_token_acc": 0.9285674321202771,
"step": 220
},
{
"epoch": 0.9406846093545859,
"grad_norm": 0.55597323179245,
"learning_rate": 7.7607505097573e-06,
"loss": 0.21175863742828369,
"memory(GiB)": 37.02,
"step": 225,
"token_acc": 0.9328776189582448,
"train_speed(iter/s)": 0.17712
},
{
"epoch": 0.9615887117846877,
"grad_norm": 0.6526412963867188,
"learning_rate": 7.668767348034044e-06,
"loss": 0.21146607398986816,
"memory(GiB)": 37.02,
"step": 230,
"token_acc": 0.920968304305235,
"train_speed(iter/s)": 0.177568
},
{
"epoch": 0.9824928142147896,
"grad_norm": 0.5566428303718567,
"learning_rate": 7.5755033465880024e-06,
"loss": 0.21480951309204102,
"memory(GiB)": 37.02,
"step": 235,
"token_acc": 0.9206400893147881,
"train_speed(iter/s)": 0.178101
},
{
"epoch": 1.0,
"grad_norm": 0.6223005056381226,
"learning_rate": 7.481003266247745e-06,
"loss": 0.21818625926971436,
"memory(GiB)": 37.02,
"step": 240,
"token_acc": 0.9244112717911505,
"train_speed(iter/s)": 0.179289
},
{
"epoch": 1.0,
"eval_loss": 0.23499608039855957,
"eval_runtime": 6.6316,
"eval_samples_per_second": 23.222,
"eval_steps_per_second": 5.881,
"eval_token_acc": 0.9285194747064607,
"step": 240
},
{
"epoch": 1.020904102430102,
"grad_norm": 0.5871774554252625,
"learning_rate": 7.385312461081616e-06,
"loss": 0.1686471700668335,
"memory(GiB)": 37.02,
"step": 245,
"token_acc": 0.9376223265624841,
"train_speed(iter/s)": 0.177674
},
{
"epoch": 1.0418082048602038,
"grad_norm": 0.6004992723464966,
"learning_rate": 7.288476856630656e-06,
"loss": 0.16282508373260499,
"memory(GiB)": 37.02,
"step": 250,
"token_acc": 0.9393488548772574,
"train_speed(iter/s)": 0.17831
},
{
"epoch": 1.0627123072903057,
"grad_norm": 0.6309487223625183,
"learning_rate": 7.190542927867234e-06,
"loss": 0.1637326955795288,
"memory(GiB)": 37.02,
"step": 255,
"token_acc": 0.945661712279932,
"train_speed(iter/s)": 0.178866
},
{
"epoch": 1.0836164097204075,
"grad_norm": 0.5814361572265625,
"learning_rate": 7.091557676890001e-06,
"loss": 0.15928541421890258,
"memory(GiB)": 37.02,
"step": 260,
"token_acc": 0.9451221362463966,
"train_speed(iter/s)": 0.179402
},
{
"epoch": 1.0836164097204075,
"eval_loss": 0.23942023515701294,
"eval_runtime": 6.6351,
"eval_samples_per_second": 23.21,
"eval_steps_per_second": 5.878,
"eval_token_acc": 0.9281597941028367,
"step": 260
},
{
"epoch": 1.1045205121505095,
"grad_norm": 0.5660094022750854,
"learning_rate": 6.991568610365851e-06,
"loss": 0.15568481683731078,
"memory(GiB)": 37.02,
"step": 265,
"token_acc": 0.9408398229323706,
"train_speed(iter/s)": 0.177849
},
{
"epoch": 1.1254246145806115,
"grad_norm": 0.5609026551246643,
"learning_rate": 6.890623716729724e-06,
"loss": 0.16430522203445436,
"memory(GiB)": 37.02,
"step": 270,
"token_acc": 0.9378496054183171,
"train_speed(iter/s)": 0.178387
},
{
"epoch": 1.1463287170107133,
"grad_norm": 0.579684317111969,
"learning_rate": 6.788771443153183e-06,
"loss": 0.15910866260528564,
"memory(GiB)": 37.02,
"step": 275,
"token_acc": 0.9425582560761714,
"train_speed(iter/s)": 0.178707
},
{
"epoch": 1.1672328194408153,
"grad_norm": 0.5673295855522156,
"learning_rate": 6.686060672292847e-06,
"loss": 0.15805959701538086,
"memory(GiB)": 37.02,
"step": 280,
"token_acc": 0.9461089281816197,
"train_speed(iter/s)": 0.179432
},
{
"epoch": 1.1672328194408153,
"eval_loss": 0.23961253464221954,
"eval_runtime": 6.6282,
"eval_samples_per_second": 23.234,
"eval_steps_per_second": 5.884,
"eval_token_acc": 0.9277281773784879,
"step": 280
},
{
"epoch": 1.1881369218709172,
"grad_norm": 0.5638378858566284,
"learning_rate": 6.5825406988297815e-06,
"loss": 0.160142982006073,
"memory(GiB)": 37.02,
"step": 285,
"token_acc": 0.9402033005473476,
"train_speed(iter/s)": 0.178002
},
{
"epoch": 1.209041024301019,
"grad_norm": 0.5472784638404846,
"learning_rate": 6.478261205811188e-06,
"loss": 0.15631693601608276,
"memory(GiB)": 37.02,
"step": 290,
"token_acc": 0.9425503107883562,
"train_speed(iter/s)": 0.178558
},
{
"epoch": 1.229945126731121,
"grad_norm": 0.5741646885871887,
"learning_rate": 6.373272240805668e-06,
"loss": 0.16377530097961426,
"memory(GiB)": 37.02,
"step": 295,
"token_acc": 0.9453790125312598,
"train_speed(iter/s)": 0.179076
},
{
"epoch": 1.250849229161223,
"grad_norm": 0.5998528599739075,
"learning_rate": 6.267624191883551e-06,
"loss": 0.1654489278793335,
"memory(GiB)": 37.02,
"step": 300,
"token_acc": 0.9433800623052959,
"train_speed(iter/s)": 0.179541
},
{
"epoch": 1.250849229161223,
"eval_loss": 0.23872551321983337,
"eval_runtime": 6.6305,
"eval_samples_per_second": 23.226,
"eval_steps_per_second": 5.882,
"eval_token_acc": 0.9284555314880386,
"step": 300
},
{
"epoch": 1.2717533315913248,
"grad_norm": 0.5820715427398682,
"learning_rate": 6.161367763433812e-06,
"loss": 0.1659400701522827,
"memory(GiB)": 37.02,
"step": 305,
"token_acc": 0.9422344370459437,
"train_speed(iter/s)": 0.178162
},
{
"epoch": 1.2926574340214267,
"grad_norm": 0.5959991216659546,
"learning_rate": 6.054553951829163e-06,
"loss": 0.16145311594009398,
"memory(GiB)": 37.02,
"step": 310,
"token_acc": 0.9404544902175234,
"train_speed(iter/s)": 0.178528
},
{
"epoch": 1.3135615364515285,
"grad_norm": 0.6270994544029236,
"learning_rate": 5.947234020951015e-06,
"loss": 0.16696672439575194,
"memory(GiB)": 37.02,
"step": 315,
"token_acc": 0.9387243092362137,
"train_speed(iter/s)": 0.178987
},
{
"epoch": 1.3344656388816305,
"grad_norm": 0.5847781896591187,
"learning_rate": 5.839459477586056e-06,
"loss": 0.1612384557723999,
"memory(GiB)": 37.02,
"step": 320,
"token_acc": 0.9422280984134284,
"train_speed(iter/s)": 0.179441
},
{
"epoch": 1.3344656388816305,
"eval_loss": 0.23678933084011078,
"eval_runtime": 6.6342,
"eval_samples_per_second": 23.213,
"eval_steps_per_second": 5.879,
"eval_token_acc": 0.9287432759709378,
"step": 320
},
{
"epoch": 1.3553697413117325,
"grad_norm": 0.5601792931556702,
"learning_rate": 5.731282046706247e-06,
"loss": 0.16321887969970703,
"memory(GiB)": 37.02,
"step": 325,
"token_acc": 0.938570500852342,
"train_speed(iter/s)": 0.17826
},
{
"epoch": 1.3762738437418343,
"grad_norm": 0.5960825085639954,
"learning_rate": 5.622753646644102e-06,
"loss": 0.16715322732925414,
"memory(GiB)": 37.02,
"step": 330,
"token_acc": 0.9422235465708954,
"train_speed(iter/s)": 0.178716
},
{
"epoch": 1.3971779461719362,
"grad_norm": 0.5615048408508301,
"learning_rate": 5.513926364175172e-06,
"loss": 0.16500518321990967,
"memory(GiB)": 37.02,
"step": 335,
"token_acc": 0.9453201823653863,
"train_speed(iter/s)": 0.179021
},
{
"epoch": 1.418082048602038,
"grad_norm": 0.5556049942970276,
"learning_rate": 5.404852429519678e-06,
"loss": 0.15454906225204468,
"memory(GiB)": 37.02,
"step": 340,
"token_acc": 0.9407398356599567,
"train_speed(iter/s)": 0.179344
},
{
"epoch": 1.418082048602038,
"eval_loss": 0.23516136407852173,
"eval_runtime": 6.6178,
"eval_samples_per_second": 23.271,
"eval_steps_per_second": 5.893,
"eval_token_acc": 0.9299502042186538,
"step": 340
},
{
"epoch": 1.43898615103214,
"grad_norm": 0.5906144976615906,
"learning_rate": 5.295584191275308e-06,
"loss": 0.16890814304351806,
"memory(GiB)": 37.02,
"step": 345,
"token_acc": 0.9373257274217237,
"train_speed(iter/s)": 0.17811
},
{
"epoch": 1.459890253462242,
"grad_norm": 0.6000419855117798,
"learning_rate": 5.1861740912932e-06,
"loss": 0.16450071334838867,
"memory(GiB)": 37.02,
"step": 350,
"token_acc": 0.9405447723714142,
"train_speed(iter/s)": 0.178429
},
{
"epoch": 1.480794355892344,
"grad_norm": 0.5739107131958008,
"learning_rate": 5.07667463950916e-06,
"loss": 0.1568189740180969,
"memory(GiB)": 37.02,
"step": 355,
"token_acc": 0.9423155065170506,
"train_speed(iter/s)": 0.178939
},
{
"epoch": 1.5016984583224458,
"grad_norm": 0.5783458948135376,
"learning_rate": 4.967138388742218e-06,
"loss": 0.16183936595916748,
"memory(GiB)": 37.02,
"step": 360,
"token_acc": 0.9435463668380956,
"train_speed(iter/s)": 0.179308
},
{
"epoch": 1.5016984583224458,
"eval_loss": 0.23441138863563538,
"eval_runtime": 6.6338,
"eval_samples_per_second": 23.214,
"eval_steps_per_second": 5.879,
"eval_token_acc": 0.9301819983854337,
"step": 360
},
{
"epoch": 1.5226025607525477,
"grad_norm": 0.509042501449585,
"learning_rate": 4.8576179094725855e-06,
"loss": 0.16039080619812013,
"memory(GiB)": 37.02,
"step": 365,
"token_acc": 0.9386532058855117,
"train_speed(iter/s)": 0.178194
},
{
"epoch": 1.5435066631826495,
"grad_norm": 0.5684186220169067,
"learning_rate": 4.748165764611157e-06,
"loss": 0.16779780387878418,
"memory(GiB)": 37.02,
"step": 370,
"token_acc": 0.9417494999465592,
"train_speed(iter/s)": 0.178626
},
{
"epoch": 1.5644107656127515,
"grad_norm": 0.5707104802131653,
"learning_rate": 4.6388344842726266e-06,
"loss": 0.16627538204193115,
"memory(GiB)": 37.02,
"step": 375,
"token_acc": 0.938589794484991,
"train_speed(iter/s)": 0.179043
},
{
"epoch": 1.5853148680428535,
"grad_norm": 0.5481753945350647,
"learning_rate": 4.529676540564351e-06,
"loss": 0.15947449207305908,
"memory(GiB)": 37.02,
"step": 380,
"token_acc": 0.940670141008185,
"train_speed(iter/s)": 0.179357
},
{
"epoch": 1.5853148680428535,
"eval_loss": 0.23353615403175354,
"eval_runtime": 6.6285,
"eval_samples_per_second": 23.233,
"eval_steps_per_second": 5.884,
"eval_token_acc": 0.9297983390749015,
"step": 380
},
{
"epoch": 1.6062189704729555,
"grad_norm": 0.5632703900337219,
"learning_rate": 4.420744322403058e-06,
"loss": 0.15636355876922609,
"memory(GiB)": 37.02,
"step": 385,
"token_acc": 0.9417008449869052,
"train_speed(iter/s)": 0.178382
},
{
"epoch": 1.6271230729030572,
"grad_norm": 0.5890342593193054,
"learning_rate": 4.312090110371473e-06,
"loss": 0.15623259544372559,
"memory(GiB)": 37.02,
"step": 390,
"token_acc": 0.9422333549803631,
"train_speed(iter/s)": 0.178811
},
{
"epoch": 1.648027175333159,
"grad_norm": 0.5722407102584839,
"learning_rate": 4.203766051626939e-06,
"loss": 0.16071751117706298,
"memory(GiB)": 37.02,
"step": 395,
"token_acc": 0.9453681710213777,
"train_speed(iter/s)": 0.179205
},
{
"epoch": 1.668931277763261,
"grad_norm": 0.611022412776947,
"learning_rate": 4.095824134874087e-06,
"loss": 0.16332566738128662,
"memory(GiB)": 37.02,
"step": 400,
"token_acc": 0.937804167388531,
"train_speed(iter/s)": 0.179585
},
{
"epoch": 1.668931277763261,
"eval_loss": 0.23261623084545135,
"eval_runtime": 6.6176,
"eval_samples_per_second": 23.271,
"eval_steps_per_second": 5.893,
"eval_token_acc": 0.9303818209430026,
"step": 400
},
{
"epoch": 1.689835380193363,
"grad_norm": 0.5979709029197693,
"learning_rate": 3.988316165413528e-06,
"loss": 0.1527752161026001,
"memory(GiB)": 37.02,
"step": 405,
"token_acc": 0.940948516358103,
"train_speed(iter/s)": 0.178456
},
{
"epoch": 1.710739482623465,
"grad_norm": 0.549656093120575,
"learning_rate": 3.881293740278588e-06,
"loss": 0.15298218727111818,
"memory(GiB)": 39.46,
"step": 410,
"token_acc": 0.9467923967923968,
"train_speed(iter/s)": 0.178833
},
{
"epoch": 1.7316435850535667,
"grad_norm": 0.5743902921676636,
"learning_rate": 3.774808223471996e-06,
"loss": 0.15700061321258546,
"memory(GiB)": 39.46,
"step": 415,
"token_acc": 0.9460204535349044,
"train_speed(iter/s)": 0.179167
},
{
"epoch": 1.7525476874836685,
"grad_norm": 0.5933798551559448,
"learning_rate": 3.6689107213144025e-06,
"loss": 0.16367003917694092,
"memory(GiB)": 39.46,
"step": 420,
"token_acc": 0.9469271826676363,
"train_speed(iter/s)": 0.179611
},
{
"epoch": 1.7525476874836685,
"eval_loss": 0.23134766519069672,
"eval_runtime": 6.6251,
"eval_samples_per_second": 23.245,
"eval_steps_per_second": 5.887,
"eval_token_acc": 0.9307654802535349,
"step": 420
},
{
"epoch": 1.7734517899137705,
"grad_norm": 0.5866169929504395,
"learning_rate": 3.5636520579165704e-06,
"loss": 0.15368299484252929,
"memory(GiB)": 39.46,
"step": 425,
"token_acc": 0.9376751889917608,
"train_speed(iter/s)": 0.178672
},
{
"epoch": 1.7943558923438725,
"grad_norm": 0.6050538420677185,
"learning_rate": 3.4590827507870257e-06,
"loss": 0.1597348690032959,
"memory(GiB)": 39.46,
"step": 430,
"token_acc": 0.9424976034327972,
"train_speed(iter/s)": 0.179078
},
{
"epoch": 1.8152599947739745,
"grad_norm": 0.5516080856323242,
"learning_rate": 3.3552529865868323e-06,
"loss": 0.15672740936279297,
"memory(GiB)": 39.46,
"step": 435,
"token_acc": 0.9391331914315475,
"train_speed(iter/s)": 0.179453
},
{
"epoch": 1.8361640972040763,
"grad_norm": 0.5520040392875671,
"learning_rate": 3.252212597043167e-06,
"loss": 0.154363751411438,
"memory(GiB)": 39.46,
"step": 440,
"token_acc": 0.9407710731669628,
"train_speed(iter/s)": 0.179753
},
{
"epoch": 1.8361640972040763,
"eval_loss": 0.230976402759552,
"eval_runtime": 6.6289,
"eval_samples_per_second": 23.231,
"eval_steps_per_second": 5.883,
"eval_token_acc": 0.9307734731558376,
"step": 440
},
{
"epoch": 1.8570681996341782,
"grad_norm": 0.5343540906906128,
"learning_rate": 3.1500110350332492e-06,
"loss": 0.1531538486480713,
"memory(GiB)": 39.46,
"step": 445,
"token_acc": 0.9422152799196107,
"train_speed(iter/s)": 0.178932
},
{
"epoch": 1.87797230206428,
"grad_norm": 0.5729184746742249,
"learning_rate": 3.048697350850073e-06,
"loss": 0.16055722236633302,
"memory(GiB)": 39.46,
"step": 450,
"token_acc": 0.9512691733405728,
"train_speed(iter/s)": 0.179136
},
{
"epoch": 1.898876404494382,
"grad_norm": 0.5333752036094666,
"learning_rate": 2.9483201686613626e-06,
"loss": 0.15494089126586913,
"memory(GiB)": 39.46,
"step": 455,
"token_acc": 0.9471734021540102,
"train_speed(iter/s)": 0.179584
},
{
"epoch": 1.919780506924484,
"grad_norm": 0.8302111625671387,
"learning_rate": 2.8489276631730633e-06,
"loss": 0.15890274047851563,
"memory(GiB)": 39.46,
"step": 460,
"token_acc": 0.9424079706996089,
"train_speed(iter/s)": 0.179933
},
{
"epoch": 1.919780506924484,
"eval_loss": 0.22922886908054352,
"eval_runtime": 6.6337,
"eval_samples_per_second": 23.215,
"eval_steps_per_second": 5.879,
"eval_token_acc": 0.9315727633861132,
"step": 460
},
{
"epoch": 1.940684609354586,
"grad_norm": 0.5802881717681885,
"learning_rate": 2.750567536508504e-06,
"loss": 0.1534783959388733,
"memory(GiB)": 39.46,
"step": 465,
"token_acc": 0.9404760675375639,
"train_speed(iter/s)": 0.178985
},
{
"epoch": 1.9615887117846877,
"grad_norm": 0.5230849385261536,
"learning_rate": 2.653286995314398e-06,
"loss": 0.15316032171249389,
"memory(GiB)": 39.46,
"step": 470,
"token_acc": 0.940865764070184,
"train_speed(iter/s)": 0.179339
},
{
"epoch": 1.9824928142147895,
"grad_norm": 0.5382450222969055,
"learning_rate": 2.5571327281046486e-06,
"loss": 0.15808116197586058,
"memory(GiB)": 39.46,
"step": 475,
"token_acc": 0.9414967066105865,
"train_speed(iter/s)": 0.179724
},
{
"epoch": 2.0,
"grad_norm": 0.5938447117805481,
"learning_rate": 2.46215088285279e-06,
"loss": 0.16262369155883788,
"memory(GiB)": 39.46,
"step": 480,
"token_acc": 0.9486793264834041,
"train_speed(iter/s)": 0.180254
},
{
"epoch": 2.0,
"eval_loss": 0.2286583036184311,
"eval_runtime": 6.6325,
"eval_samples_per_second": 23.219,
"eval_steps_per_second": 5.88,
"eval_token_acc": 0.9318205433574985,
"step": 480
},
{
"epoch": 2.020904102430102,
"grad_norm": 0.536767303943634,
"learning_rate": 2.3683870448438905e-06,
"loss": 0.12537214756011963,
"memory(GiB)": 39.46,
"step": 485,
"token_acc": 0.9521131994289179,
"train_speed(iter/s)": 0.179418
},
{
"epoch": 2.041808204860204,
"grad_norm": 0.5739563703536987,
"learning_rate": 2.2758862147964933e-06,
"loss": 0.12507247924804688,
"memory(GiB)": 39.46,
"step": 490,
"token_acc": 0.9547663049958268,
"train_speed(iter/s)": 0.179658
},
{
"epoch": 2.0627123072903055,
"grad_norm": 0.6101587414741516,
"learning_rate": 2.1846927872651135e-06,
"loss": 0.1226424217224121,
"memory(GiB)": 39.46,
"step": 495,
"token_acc": 0.9567438898659713,
"train_speed(iter/s)": 0.180059
},
{
"epoch": 2.0836164097204075,
"grad_norm": 0.5811767578125,
"learning_rate": 2.0948505293336506e-06,
"loss": 0.12014656066894532,
"memory(GiB)": 39.46,
"step": 500,
"token_acc": 0.9582742281109468,
"train_speed(iter/s)": 0.180402
},
{
"epoch": 2.0836164097204075,
"eval_loss": 0.24429188668727875,
"eval_runtime": 6.6294,
"eval_samples_per_second": 23.23,
"eval_steps_per_second": 5.883,
"eval_token_acc": 0.9308134376673514,
"step": 500
},
{
"epoch": 2.1045205121505095,
"grad_norm": 0.610534131526947,
"learning_rate": 2.0064025596099663e-06,
"loss": 0.11917252540588379,
"memory(GiB)": 39.46,
"step": 505,
"token_acc": 0.9507537400408873,
"train_speed(iter/s)": 0.179597
},
{
"epoch": 2.1254246145806115,
"grad_norm": 0.5680590867996216,
"learning_rate": 1.919391327531663e-06,
"loss": 0.12412093877792359,
"memory(GiB)": 39.46,
"step": 510,
"token_acc": 0.9561542850973921,
"train_speed(iter/s)": 0.179885
},
{
"epoch": 2.1463287170107135,
"grad_norm": 0.5825695395469666,
"learning_rate": 1.8338585929930424e-06,
"loss": 0.11532289981842041,
"memory(GiB)": 39.46,
"step": 515,
"token_acc": 0.9611214610756498,
"train_speed(iter/s)": 0.180163
},
{
"epoch": 2.167232819440815,
"grad_norm": 0.5703282952308655,
"learning_rate": 1.7498454063029984e-06,
"loss": 0.11896244287490845,
"memory(GiB)": 39.46,
"step": 520,
"token_acc": 0.9602545823350432,
"train_speed(iter/s)": 0.180409
},
{
"epoch": 2.167232819440815,
"eval_loss": 0.24228492379188538,
"eval_runtime": 6.5759,
"eval_samples_per_second": 23.419,
"eval_steps_per_second": 5.931,
"eval_token_acc": 0.9311811111732782,
"step": 520
},
{
"epoch": 2.188136921870917,
"grad_norm": 0.5464326739311218,
"learning_rate": 1.667392088483456e-06,
"loss": 0.12099326848983764,
"memory(GiB)": 39.46,
"step": 525,
"token_acc": 0.9483148644281589,
"train_speed(iter/s)": 0.179639
},
{
"epoch": 2.209041024301019,
"grad_norm": 0.5550944805145264,
"learning_rate": 1.5865382119178258e-06,
"loss": 0.11996636390686036,
"memory(GiB)": 39.46,
"step": 530,
"token_acc": 0.9563651862085183,
"train_speed(iter/s)": 0.179841
},
{
"epoch": 2.229945126731121,
"grad_norm": 0.5549576282501221,
"learning_rate": 1.507322581358771e-06,
"loss": 0.11995522975921631,
"memory(GiB)": 39.46,
"step": 535,
"token_acc": 0.9548897519016594,
"train_speed(iter/s)": 0.180134
},
{
"epoch": 2.250849229161223,
"grad_norm": 0.5435655117034912,
"learning_rate": 1.4297832153043657e-06,
"loss": 0.11738158464431762,
"memory(GiB)": 39.46,
"step": 540,
"token_acc": 0.9565079135650791,
"train_speed(iter/s)": 0.180343
},
{
"epoch": 2.250849229161223,
"eval_loss": 0.24256764352321625,
"eval_runtime": 6.6338,
"eval_samples_per_second": 23.214,
"eval_steps_per_second": 5.879,
"eval_token_acc": 0.9310052673226176,
"step": 540
},
{
"epoch": 2.271753331591325,
"grad_norm": 0.5776016116142273,
"learning_rate": 1.353957327751621e-06,
"loss": 0.12343101501464844,
"memory(GiB)": 39.46,
"step": 545,
"token_acc": 0.9484948703433345,
"train_speed(iter/s)": 0.179597
},
{
"epoch": 2.2926574340214265,
"grad_norm": 0.5709097385406494,
"learning_rate": 1.2798813103361291e-06,
"loss": 0.11971625089645385,
"memory(GiB)": 39.46,
"step": 550,
"token_acc": 0.9561414627962986,
"train_speed(iter/s)": 0.179863
},
{
"epoch": 2.3135615364515285,
"grad_norm": 0.5969334840774536,
"learning_rate": 1.2075907148663579e-06,
"loss": 0.12062759399414062,
"memory(GiB)": 39.46,
"step": 555,
"token_acc": 0.9569148857062122,
"train_speed(iter/s)": 0.180073
},
{
"epoch": 2.3344656388816305,
"grad_norm": 0.5327123403549194,
"learning_rate": 1.1371202362610412e-06,
"loss": 0.11765568256378174,
"memory(GiB)": 39.46,
"step": 560,
"token_acc": 0.9581596210457106,
"train_speed(iter/s)": 0.180333
},
{
"epoch": 2.3344656388816305,
"eval_loss": 0.2422230988740921,
"eval_runtime": 6.5972,
"eval_samples_per_second": 23.343,
"eval_steps_per_second": 5.912,
"eval_token_acc": 0.9311651253686726,
"step": 560
},
{
"epoch": 2.3553697413117325,
"grad_norm": 0.5562822818756104,
"learning_rate": 1.06850369589781e-06,
"loss": 0.11786850690841674,
"memory(GiB)": 39.46,
"step": 565,
"token_acc": 0.9489760711168842,
"train_speed(iter/s)": 0.179625
},
{
"epoch": 2.3762738437418345,
"grad_norm": 0.5818617939949036,
"learning_rate": 1.0017740253810608e-06,
"loss": 0.12321114540100098,
"memory(GiB)": 39.46,
"step": 570,
"token_acc": 0.9529802240354032,
"train_speed(iter/s)": 0.179844
},
{
"epoch": 2.3971779461719365,
"grad_norm": 0.5551012754440308,
"learning_rate": 9.369632507368736e-07,
"loss": 0.13383883237838745,
"memory(GiB)": 39.46,
"step": 575,
"token_acc": 0.9536231884057971,
"train_speed(iter/s)": 0.180115
},
{
"epoch": 2.418082048602038,
"grad_norm": 0.5434304475784302,
"learning_rate": 8.741024770425394e-07,
"loss": 0.12519620656967162,
"memory(GiB)": 39.46,
"step": 580,
"token_acc": 0.9545193961398816,
"train_speed(iter/s)": 0.180327
},
{
"epoch": 2.418082048602038,
"eval_loss": 0.24199490249156952,
"eval_runtime": 6.6585,
"eval_samples_per_second": 23.128,
"eval_steps_per_second": 5.857,
"eval_token_acc": 0.9310612176387368,
"step": 580
},
{
"epoch": 2.43898615103214,
"grad_norm": 0.5505854487419128,
"learning_rate": 8.132218734980852e-07,
"loss": 0.12351593971252442,
"memory(GiB)": 39.46,
"step": 585,
"token_acc": 0.9496205882650123,
"train_speed(iter/s)": 0.17969
},
{
"epoch": 2.459890253462242,
"grad_norm": 0.5361783504486084,
"learning_rate": 7.543506589469674e-07,
"loss": 0.12893223762512207,
"memory(GiB)": 39.46,
"step": 590,
"token_acc": 0.9550055365804208,
"train_speed(iter/s)": 0.179827
},
{
"epoch": 2.480794355892344,
"grad_norm": 0.570577085018158,
"learning_rate": 6.975170878528765e-07,
"loss": 0.12303224802017212,
"memory(GiB)": 39.46,
"step": 595,
"token_acc": 0.9543349832348714,
"train_speed(iter/s)": 0.180173
},
{
"epoch": 2.501698458322446,
"grad_norm": 0.5742475390434265,
"learning_rate": 6.427484367393699e-07,
"loss": 0.1256563663482666,
"memory(GiB)": 39.46,
"step": 600,
"token_acc": 0.954024779864412,
"train_speed(iter/s)": 0.180362
},
{
"epoch": 2.501698458322446,
"eval_loss": 0.24162955582141876,
"eval_runtime": 6.6424,
"eval_samples_per_second": 23.184,
"eval_steps_per_second": 5.871,
"eval_token_acc": 0.9309413241041955,
"step": 600
},
{
"epoch": 2.5226025607525475,
"grad_norm": 0.5502005219459534,
"learning_rate": 5.900709910988739e-07,
"loss": 0.1203322172164917,
"memory(GiB)": 39.46,
"step": 605,
"token_acc": 0.945725693054572,
"train_speed(iter/s)": 0.179668
},
{
"epoch": 2.5435066631826495,
"grad_norm": 0.550845742225647,
"learning_rate": 5.395100327773018e-07,
"loss": 0.11989848613739014,
"memory(GiB)": 39.46,
"step": 610,
"token_acc": 0.9564310580152438,
"train_speed(iter/s)": 0.179957
},
{
"epoch": 2.5644107656127515,
"grad_norm": 0.5418065190315247,
"learning_rate": 4.91089827840367e-07,
"loss": 0.12273094654083253,
"memory(GiB)": 39.46,
"step": 615,
"token_acc": 0.9586797908520233,
"train_speed(iter/s)": 0.180256
},
{
"epoch": 2.5853148680428535,
"grad_norm": 0.5306729674339294,
"learning_rate": 4.4483361492740184e-07,
"loss": 0.11489535570144653,
"memory(GiB)": 39.46,
"step": 620,
"token_acc": 0.9587957254418413,
"train_speed(iter/s)": 0.180471
},
{
"epoch": 2.5853148680428535,
"eval_loss": 0.24175922572612762,
"eval_runtime": 6.6237,
"eval_samples_per_second": 23.25,
"eval_steps_per_second": 5.888,
"eval_token_acc": 0.9314368840469663,
"step": 620
},
{
"epoch": 2.6062189704729555,
"grad_norm": 0.5641535520553589,
"learning_rate": 4.007635940982857e-07,
"loss": 0.11712099313735962,
"memory(GiB)": 39.46,
"step": 625,
"token_acc": 0.9532489285186463,
"train_speed(iter/s)": 0.179823
},
{
"epoch": 2.627123072903057,
"grad_norm": 0.6171642541885376,
"learning_rate": 3.589009161788104e-07,
"loss": 0.1301506519317627,
"memory(GiB)": 39.46,
"step": 630,
"token_acc": 0.9547707464948845,
"train_speed(iter/s)": 0.180046
},
{
"epoch": 2.648027175333159,
"grad_norm": 0.5656234622001648,
"learning_rate": 3.192656726096277e-07,
"loss": 0.1133504033088684,
"memory(GiB)": 39.46,
"step": 635,
"token_acc": 0.9620012398557258,
"train_speed(iter/s)": 0.180284
},
{
"epoch": 2.668931277763261,
"grad_norm": 0.5739580988883972,
"learning_rate": 2.818768858036208e-07,
"loss": 0.11660020351409912,
"memory(GiB)": 39.46,
"step": 640,
"token_acc": 0.9527535918095915,
"train_speed(iter/s)": 0.180505
},
{
"epoch": 2.668931277763261,
"eval_loss": 0.24187050759792328,
"eval_runtime": 6.6525,
"eval_samples_per_second": 23.149,
"eval_steps_per_second": 5.862,
"eval_token_acc": 0.9315088201676911,
"step": 640
},
{
"epoch": 2.689835380193363,
"grad_norm": 0.5692787170410156,
"learning_rate": 2.467525000163523e-07,
"loss": 0.12488572597503662,
"memory(GiB)": 39.46,
"step": 645,
"token_acc": 0.949108626758746,
"train_speed(iter/s)": 0.179804
},
{
"epoch": 2.710739482623465,
"grad_norm": 0.5694913864135742,
"learning_rate": 2.139093727339503e-07,
"loss": 0.1269749402999878,
"memory(GiB)": 39.46,
"step": 650,
"token_acc": 0.9520699186238631,
"train_speed(iter/s)": 0.180029
},
{
"epoch": 2.7316435850535665,
"grad_norm": 0.5761524438858032,
"learning_rate": 1.8336326658258797e-07,
"loss": 0.12404175996780395,
"memory(GiB)": 39.46,
"step": 655,
"token_acc": 0.9545447343821791,
"train_speed(iter/s)": 0.180226
},
{
"epoch": 2.7525476874836685,
"grad_norm": 0.5228170156478882,
"learning_rate": 1.551288417634106e-07,
"loss": 0.11508429050445557,
"memory(GiB)": 39.46,
"step": 660,
"token_acc": 0.961899334176617,
"train_speed(iter/s)": 0.180412
},
{
"epoch": 2.7525476874836685,
"eval_loss": 0.24170467257499695,
"eval_runtime": 6.6488,
"eval_samples_per_second": 23.162,
"eval_steps_per_second": 5.866,
"eval_token_acc": 0.9315327988745994,
"step": 660
},
{
"epoch": 2.7734517899137705,
"grad_norm": 0.5596388578414917,
"learning_rate": 1.292196490165698e-07,
"loss": 0.12398817539215087,
"memory(GiB)": 39.46,
"step": 665,
"token_acc": 0.9481295480168085,
"train_speed(iter/s)": 0.179823
},
{
"epoch": 2.7943558923438725,
"grad_norm": 0.5801319479942322,
"learning_rate": 1.0564812311772422e-07,
"loss": 0.1263979434967041,
"memory(GiB)": 39.46,
"step": 670,
"token_acc": 0.9553416631792909,
"train_speed(iter/s)": 0.180051
},
{
"epoch": 2.8152599947739745,
"grad_norm": 0.6298143863677979,
"learning_rate": 8.442557691013042e-08,
"loss": 0.12353664636611938,
"memory(GiB)": 39.46,
"step": 675,
"token_acc": 0.955449425153873,
"train_speed(iter/s)": 0.180251
},
{
"epoch": 2.836164097204076,
"grad_norm": 0.5088171362876892,
"learning_rate": 6.556219587519397e-08,
"loss": 0.11518588066101074,
"memory(GiB)": 39.46,
"step": 680,
"token_acc": 0.9564935905736113,
"train_speed(iter/s)": 0.180447
},
{
"epoch": 2.836164097204076,
"eval_loss": 0.2416183054447174,
"eval_runtime": 6.639,
"eval_samples_per_second": 23.196,
"eval_steps_per_second": 5.874,
"eval_token_acc": 0.9316287137022324,
"step": 680
},
{
"epoch": 2.8570681996341785,
"grad_norm": 0.5331613421440125,
"learning_rate": 4.906703324408402e-08,
"loss": 0.11869626045227051,
"memory(GiB)": 39.46,
"step": 685,
"token_acc": 0.9486922250089573,
"train_speed(iter/s)": 0.179804
},
{
"epoch": 2.87797230206428,
"grad_norm": 0.5424652099609375,
"learning_rate": 3.494800565275125e-08,
"loss": 0.11893690824508667,
"memory(GiB)": 39.46,
"step": 690,
"token_acc": 0.958352824231084,
"train_speed(iter/s)": 0.179959
},
{
"epoch": 2.898876404494382,
"grad_norm": 0.5787132978439331,
"learning_rate": 2.321188934244323e-08,
"loss": 0.12508745193481446,
"memory(GiB)": 39.46,
"step": 695,
"token_acc": 0.9538041333143876,
"train_speed(iter/s)": 0.180164
},
{
"epoch": 2.919780506924484,
"grad_norm": 0.5377312302589417,
"learning_rate": 1.3864316907539754e-08,
"loss": 0.11967707872390747,
"memory(GiB)": 39.46,
"step": 700,
"token_acc": 0.9583007797539715,
"train_speed(iter/s)": 0.180364
},
{
"epoch": 2.919780506924484,
"eval_loss": 0.24175819754600525,
"eval_runtime": 6.6511,
"eval_samples_per_second": 23.154,
"eval_steps_per_second": 5.864,
"eval_token_acc": 0.9318525149667095,
"step": 700
},
{
"epoch": 2.940684609354586,
"grad_norm": 0.5400473475456238,
"learning_rate": 6.9097745922580564e-09,
"loss": 0.11984028816223144,
"memory(GiB)": 39.46,
"step": 705,
"token_acc": 0.950023827776972,
"train_speed(iter/s)": 0.179708
},
{
"epoch": 2.961588711784688,
"grad_norm": 0.5357044339179993,
"learning_rate": 2.3516001375439856e-09,
"loss": 0.1183488130569458,
"memory(GiB)": 39.46,
"step": 710,
"token_acc": 0.9582627468029924,
"train_speed(iter/s)": 0.179902
},
{
"epoch": 2.9824928142147895,
"grad_norm": 0.5505788922309875,
"learning_rate": 1.919811791650794e-10,
"loss": 0.12027006149291992,
"memory(GiB)": 39.46,
"step": 715,
"token_acc": 0.9548030559208467,
"train_speed(iter/s)": 0.180076
},
{
"epoch": 2.9908544551868306,
"eval_loss": 0.24157196283340454,
"eval_runtime": 6.6398,
"eval_samples_per_second": 23.193,
"eval_steps_per_second": 5.874,
"eval_token_acc": 0.9317246285298655,
"step": 717
}
],
"logging_steps": 5,
"max_steps": 717,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0098725192109916e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}