1799 lines
51 KiB
JSON
1799 lines
51 KiB
JSON
{
|
|
"best_global_step": 480,
|
|
"best_metric": 0.2286583,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v47-20250505-200714/checkpoint-480",
|
|
"epoch": 2.9908544551868306,
|
|
"eval_steps": 20,
|
|
"global_step": 717,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.004180820486020381,
|
|
"grad_norm": 2.620922088623047,
|
|
"learning_rate": 9.999952004474853e-06,
|
|
"loss": 0.34187427163124084,
|
|
"memory(GiB)": 29.13,
|
|
"step": 1,
|
|
"token_acc": 0.9049815498154982,
|
|
"train_speed(iter/s)": 0.073614
|
|
},
|
|
{
|
|
"epoch": 0.020904102430101906,
|
|
"grad_norm": 1.5743770599365234,
|
|
"learning_rate": 9.998800157942083e-06,
|
|
"loss": 0.2900405824184418,
|
|
"memory(GiB)": 29.13,
|
|
"step": 5,
|
|
"token_acc": 0.8957104981995172,
|
|
"train_speed(iter/s)": 0.151098
|
|
},
|
|
{
|
|
"epoch": 0.04180820486020381,
|
|
"grad_norm": 0.8994374871253967,
|
|
"learning_rate": 9.995201207616718e-06,
|
|
"loss": 0.2705298900604248,
|
|
"memory(GiB)": 29.14,
|
|
"step": 10,
|
|
"token_acc": 0.9024558145491803,
|
|
"train_speed(iter/s)": 0.180178
|
|
},
|
|
{
|
|
"epoch": 0.06271230729030572,
|
|
"grad_norm": 0.6387772560119629,
|
|
"learning_rate": 9.98920487629269e-06,
|
|
"loss": 0.24947943687438964,
|
|
"memory(GiB)": 32.63,
|
|
"step": 15,
|
|
"token_acc": 0.9174802221848174,
|
|
"train_speed(iter/s)": 0.188578
|
|
},
|
|
{
|
|
"epoch": 0.08361640972040763,
|
|
"grad_norm": 0.6275189518928528,
|
|
"learning_rate": 9.980814041830203e-06,
|
|
"loss": 0.25027036666870117,
|
|
"memory(GiB)": 34.7,
|
|
"step": 20,
|
|
"token_acc": 0.9166679292737989,
|
|
"train_speed(iter/s)": 0.191459
|
|
},
|
|
{
|
|
"epoch": 0.08361640972040763,
|
|
"eval_loss": 0.28380751609802246,
|
|
"eval_runtime": 6.7011,
|
|
"eval_samples_per_second": 22.981,
|
|
"eval_steps_per_second": 5.82,
|
|
"eval_token_acc": 0.9153551646138229,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.10452051215050953,
|
|
"grad_norm": 0.6861765384674072,
|
|
"learning_rate": 9.970032731299697e-06,
|
|
"loss": 0.24285974502563476,
|
|
"memory(GiB)": 34.7,
|
|
"step": 25,
|
|
"token_acc": 0.9173268332317692,
|
|
"train_speed(iter/s)": 0.170029
|
|
},
|
|
{
|
|
"epoch": 0.12542461458061144,
|
|
"grad_norm": 0.6327997446060181,
|
|
"learning_rate": 9.956866119049095e-06,
|
|
"loss": 0.2500872850418091,
|
|
"memory(GiB)": 34.7,
|
|
"step": 30,
|
|
"token_acc": 0.9172865583004255,
|
|
"train_speed(iter/s)": 0.175321
|
|
},
|
|
{
|
|
"epoch": 0.14632871701071334,
|
|
"grad_norm": 0.6613020300865173,
|
|
"learning_rate": 9.941320524220455e-06,
|
|
"loss": 0.24076151847839355,
|
|
"memory(GiB)": 34.7,
|
|
"step": 35,
|
|
"token_acc": 0.9165492852219258,
|
|
"train_speed(iter/s)": 0.179249
|
|
},
|
|
{
|
|
"epoch": 0.16723281944081525,
|
|
"grad_norm": 0.5987035036087036,
|
|
"learning_rate": 9.92340340771717e-06,
|
|
"loss": 0.23788681030273437,
|
|
"memory(GiB)": 37.02,
|
|
"step": 40,
|
|
"token_acc": 0.9233054502142574,
|
|
"train_speed(iter/s)": 0.182322
|
|
},
|
|
{
|
|
"epoch": 0.16723281944081525,
|
|
"eval_loss": 0.2673507630825043,
|
|
"eval_runtime": 6.6176,
|
|
"eval_samples_per_second": 23.271,
|
|
"eval_steps_per_second": 5.893,
|
|
"eval_token_acc": 0.9202628066277145,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.18813692187091716,
|
|
"grad_norm": 0.5674816370010376,
|
|
"learning_rate": 9.903123368623216e-06,
|
|
"loss": 0.2267207145690918,
|
|
"memory(GiB)": 37.02,
|
|
"step": 45,
|
|
"token_acc": 0.9245033510654285,
|
|
"train_speed(iter/s)": 0.172104
|
|
},
|
|
{
|
|
"epoch": 0.20904102430101906,
|
|
"grad_norm": 0.5934615731239319,
|
|
"learning_rate": 9.88049014007613e-06,
|
|
"loss": 0.23341102600097657,
|
|
"memory(GiB)": 37.02,
|
|
"step": 50,
|
|
"token_acc": 0.9186244567132621,
|
|
"train_speed(iter/s)": 0.175846
|
|
},
|
|
{
|
|
"epoch": 0.22994512673112097,
|
|
"grad_norm": 0.6319158673286438,
|
|
"learning_rate": 9.855514584595719e-06,
|
|
"loss": 0.24078943729400634,
|
|
"memory(GiB)": 37.02,
|
|
"step": 55,
|
|
"token_acc": 0.9141281922363426,
|
|
"train_speed(iter/s)": 0.17881
|
|
},
|
|
{
|
|
"epoch": 0.2508492291612229,
|
|
"grad_norm": 0.6614541411399841,
|
|
"learning_rate": 9.828208688870736e-06,
|
|
"loss": 0.2285898208618164,
|
|
"memory(GiB)": 37.02,
|
|
"step": 60,
|
|
"token_acc": 0.919986967045243,
|
|
"train_speed(iter/s)": 0.181102
|
|
},
|
|
{
|
|
"epoch": 0.2508492291612229,
|
|
"eval_loss": 0.2608849108219147,
|
|
"eval_runtime": 6.6441,
|
|
"eval_samples_per_second": 23.178,
|
|
"eval_steps_per_second": 5.87,
|
|
"eval_token_acc": 0.9218693799905684,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.2717533315913248,
|
|
"grad_norm": 0.6391910314559937,
|
|
"learning_rate": 9.79858555800603e-06,
|
|
"loss": 0.23735857009887695,
|
|
"memory(GiB)": 37.02,
|
|
"step": 65,
|
|
"token_acc": 0.9195915366053212,
|
|
"train_speed(iter/s)": 0.173657
|
|
},
|
|
{
|
|
"epoch": 0.2926574340214267,
|
|
"grad_norm": 0.6433473825454712,
|
|
"learning_rate": 9.766659409232918e-06,
|
|
"loss": 0.22774579524993896,
|
|
"memory(GiB)": 37.02,
|
|
"step": 70,
|
|
"token_acc": 0.9207077029819454,
|
|
"train_speed(iter/s)": 0.17624
|
|
},
|
|
{
|
|
"epoch": 0.3135615364515286,
|
|
"grad_norm": 0.5562565326690674,
|
|
"learning_rate": 9.732445565085823e-06,
|
|
"loss": 0.22526907920837402,
|
|
"memory(GiB)": 37.02,
|
|
"step": 75,
|
|
"token_acc": 0.9271212368745452,
|
|
"train_speed(iter/s)": 0.177663
|
|
},
|
|
{
|
|
"epoch": 0.3344656388816305,
|
|
"grad_norm": 0.7328993678092957,
|
|
"learning_rate": 9.69596044604841e-06,
|
|
"loss": 0.23299379348754884,
|
|
"memory(GiB)": 37.02,
|
|
"step": 80,
|
|
"token_acc": 0.917067597792331,
|
|
"train_speed(iter/s)": 0.180448
|
|
},
|
|
{
|
|
"epoch": 0.3344656388816305,
|
|
"eval_loss": 0.25337016582489014,
|
|
"eval_runtime": 6.6465,
|
|
"eval_samples_per_second": 23.17,
|
|
"eval_steps_per_second": 5.868,
|
|
"eval_token_acc": 0.9238915842731654,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.35536974131173243,
|
|
"grad_norm": 0.6519869565963745,
|
|
"learning_rate": 9.657221562672803e-06,
|
|
"loss": 0.22955694198608398,
|
|
"memory(GiB)": 37.02,
|
|
"step": 85,
|
|
"token_acc": 0.9178025442310218,
|
|
"train_speed(iter/s)": 0.174696
|
|
},
|
|
{
|
|
"epoch": 0.3762738437418343,
|
|
"grad_norm": 0.5939056277275085,
|
|
"learning_rate": 9.616247507175624e-06,
|
|
"loss": 0.22620651721954346,
|
|
"memory(GiB)": 37.02,
|
|
"step": 90,
|
|
"token_acc": 0.9214102974096579,
|
|
"train_speed(iter/s)": 0.176915
|
|
},
|
|
{
|
|
"epoch": 0.39717794617193625,
|
|
"grad_norm": 0.6213501691818237,
|
|
"learning_rate": 9.573057944514897e-06,
|
|
"loss": 0.2274672269821167,
|
|
"memory(GiB)": 37.02,
|
|
"step": 95,
|
|
"token_acc": 0.9245137714423773,
|
|
"train_speed(iter/s)": 0.178294
|
|
},
|
|
{
|
|
"epoch": 0.4180820486020381,
|
|
"grad_norm": 0.6670387387275696,
|
|
"learning_rate": 9.527673602952123e-06,
|
|
"loss": 0.23140230178833007,
|
|
"memory(GiB)": 37.02,
|
|
"step": 100,
|
|
"token_acc": 0.9132270680440643,
|
|
"train_speed(iter/s)": 0.179262
|
|
},
|
|
{
|
|
"epoch": 0.4180820486020381,
|
|
"eval_loss": 0.25054118037223816,
|
|
"eval_runtime": 6.6295,
|
|
"eval_samples_per_second": 23.23,
|
|
"eval_steps_per_second": 5.883,
|
|
"eval_token_acc": 0.9250425622047622,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.43898615103214006,
|
|
"grad_norm": 0.6463162899017334,
|
|
"learning_rate": 9.48011626410401e-06,
|
|
"loss": 0.22332818508148194,
|
|
"memory(GiB)": 37.02,
|
|
"step": 105,
|
|
"token_acc": 0.9278207538447314,
|
|
"train_speed(iter/s)": 0.174957
|
|
},
|
|
{
|
|
"epoch": 0.45989025346224194,
|
|
"grad_norm": 0.5779640674591064,
|
|
"learning_rate": 9.430408752488687e-06,
|
|
"loss": 0.2219472885131836,
|
|
"memory(GiB)": 37.02,
|
|
"step": 110,
|
|
"token_acc": 0.9219851633866716,
|
|
"train_speed(iter/s)": 0.176134
|
|
},
|
|
{
|
|
"epoch": 0.48079435589234387,
|
|
"grad_norm": 0.6310390830039978,
|
|
"learning_rate": 9.378574924571362e-06,
|
|
"loss": 0.22848432064056395,
|
|
"memory(GiB)": 37.02,
|
|
"step": 115,
|
|
"token_acc": 0.9266341377642974,
|
|
"train_speed(iter/s)": 0.177157
|
|
},
|
|
{
|
|
"epoch": 0.5016984583224458,
|
|
"grad_norm": 0.5928042531013489,
|
|
"learning_rate": 9.324639657314742e-06,
|
|
"loss": 0.23035595417022706,
|
|
"memory(GiB)": 37.02,
|
|
"step": 120,
|
|
"token_acc": 0.9220481773335896,
|
|
"train_speed(iter/s)": 0.178639
|
|
},
|
|
{
|
|
"epoch": 0.5016984583224458,
|
|
"eval_loss": 0.24868212640285492,
|
|
"eval_runtime": 6.6274,
|
|
"eval_samples_per_second": 23.237,
|
|
"eval_steps_per_second": 5.885,
|
|
"eval_token_acc": 0.9254182286129916,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5226025607525477,
|
|
"grad_norm": 0.5700287222862244,
|
|
"learning_rate": 9.268628836239646e-06,
|
|
"loss": 0.22175629138946534,
|
|
"memory(GiB)": 37.02,
|
|
"step": 125,
|
|
"token_acc": 0.9286114910407667,
|
|
"train_speed(iter/s)": 0.175138
|
|
},
|
|
{
|
|
"epoch": 0.5435066631826496,
|
|
"grad_norm": 0.6334052681922913,
|
|
"learning_rate": 9.21056934300161e-06,
|
|
"loss": 0.2261972188949585,
|
|
"memory(GiB)": 37.02,
|
|
"step": 130,
|
|
"token_acc": 0.9182782377470237,
|
|
"train_speed(iter/s)": 0.176545
|
|
},
|
|
{
|
|
"epoch": 0.5644107656127515,
|
|
"grad_norm": 0.6031065583229065,
|
|
"learning_rate": 9.150489042489368e-06,
|
|
"loss": 0.2102799892425537,
|
|
"memory(GiB)": 37.02,
|
|
"step": 135,
|
|
"token_acc": 0.9315734630079889,
|
|
"train_speed(iter/s)": 0.177648
|
|
},
|
|
{
|
|
"epoch": 0.5853148680428534,
|
|
"grad_norm": 0.593549907207489,
|
|
"learning_rate": 9.088416769451485e-06,
|
|
"loss": 0.21703071594238282,
|
|
"memory(GiB)": 37.02,
|
|
"step": 140,
|
|
"token_acc": 0.9275403061439635,
|
|
"train_speed(iter/s)": 0.178909
|
|
},
|
|
{
|
|
"epoch": 0.5853148680428534,
|
|
"eval_loss": 0.24534018337726593,
|
|
"eval_runtime": 6.6365,
|
|
"eval_samples_per_second": 23.205,
|
|
"eval_steps_per_second": 5.877,
|
|
"eval_token_acc": 0.9266331497630105,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.6062189704729554,
|
|
"grad_norm": 0.6485511660575867,
|
|
"learning_rate": 9.02438231465749e-06,
|
|
"loss": 0.2273317813873291,
|
|
"memory(GiB)": 37.02,
|
|
"step": 145,
|
|
"token_acc": 0.9251848272032739,
|
|
"train_speed(iter/s)": 0.175995
|
|
},
|
|
{
|
|
"epoch": 0.6271230729030572,
|
|
"grad_norm": 0.6085463166236877,
|
|
"learning_rate": 8.958416410600188e-06,
|
|
"loss": 0.22932357788085939,
|
|
"memory(GiB)": 37.02,
|
|
"step": 150,
|
|
"token_acc": 0.9163140495867769,
|
|
"train_speed(iter/s)": 0.177285
|
|
},
|
|
{
|
|
"epoch": 0.6480271753331591,
|
|
"grad_norm": 0.5881887078285217,
|
|
"learning_rate": 8.890550716746013e-06,
|
|
"loss": 0.2232443571090698,
|
|
"memory(GiB)": 37.02,
|
|
"step": 155,
|
|
"token_acc": 0.920258257982589,
|
|
"train_speed(iter/s)": 0.17828
|
|
},
|
|
{
|
|
"epoch": 0.668931277763261,
|
|
"grad_norm": 0.5788638591766357,
|
|
"learning_rate": 8.820817804340471e-06,
|
|
"loss": 0.20956034660339357,
|
|
"memory(GiB)": 37.02,
|
|
"step": 160,
|
|
"token_acc": 0.9304798016625346,
|
|
"train_speed(iter/s)": 0.179214
|
|
},
|
|
{
|
|
"epoch": 0.668931277763261,
|
|
"eval_loss": 0.24245667457580566,
|
|
"eval_runtime": 6.5715,
|
|
"eval_samples_per_second": 23.435,
|
|
"eval_steps_per_second": 5.935,
|
|
"eval_token_acc": 0.927040787780451,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.689835380193363,
|
|
"grad_norm": 0.6712965369224548,
|
|
"learning_rate": 8.749251140776016e-06,
|
|
"loss": 0.2295995235443115,
|
|
"memory(GiB)": 37.02,
|
|
"step": 165,
|
|
"token_acc": 0.9216801551445813,
|
|
"train_speed(iter/s)": 0.176683
|
|
},
|
|
{
|
|
"epoch": 0.7107394826234649,
|
|
"grad_norm": 0.590825617313385,
|
|
"learning_rate": 8.675885073529802e-06,
|
|
"loss": 0.21770024299621582,
|
|
"memory(GiB)": 37.02,
|
|
"step": 170,
|
|
"token_acc": 0.9224579044445766,
|
|
"train_speed(iter/s)": 0.177781
|
|
},
|
|
{
|
|
"epoch": 0.7316435850535667,
|
|
"grad_norm": 0.6172182559967041,
|
|
"learning_rate": 8.600754813679072e-06,
|
|
"loss": 0.21406757831573486,
|
|
"memory(GiB)": 37.02,
|
|
"step": 175,
|
|
"token_acc": 0.9237590553351814,
|
|
"train_speed(iter/s)": 0.178707
|
|
},
|
|
{
|
|
"epoch": 0.7525476874836686,
|
|
"grad_norm": 0.5739086866378784,
|
|
"learning_rate": 8.52389641900206e-06,
|
|
"loss": 0.20645816326141359,
|
|
"memory(GiB)": 37.02,
|
|
"step": 180,
|
|
"token_acc": 0.926357608155873,
|
|
"train_speed(iter/s)": 0.179338
|
|
},
|
|
{
|
|
"epoch": 0.7525476874836686,
|
|
"eval_loss": 0.24040701985359192,
|
|
"eval_runtime": 6.6295,
|
|
"eval_samples_per_second": 23.229,
|
|
"eval_steps_per_second": 5.883,
|
|
"eval_token_acc": 0.9270647664873592,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.7734517899137706,
|
|
"grad_norm": 0.5748027563095093,
|
|
"learning_rate": 8.445346776672546e-06,
|
|
"loss": 0.21495263576507567,
|
|
"memory(GiB)": 37.02,
|
|
"step": 185,
|
|
"token_acc": 0.9272353992083938,
|
|
"train_speed(iter/s)": 0.177187
|
|
},
|
|
{
|
|
"epoch": 0.7943558923438725,
|
|
"grad_norm": 0.6537684798240662,
|
|
"learning_rate": 8.365143585556326e-06,
|
|
"loss": 0.22100327014923096,
|
|
"memory(GiB)": 37.02,
|
|
"step": 190,
|
|
"token_acc": 0.9197183562004408,
|
|
"train_speed(iter/s)": 0.177962
|
|
},
|
|
{
|
|
"epoch": 0.8152599947739744,
|
|
"grad_norm": 0.6247681379318237,
|
|
"learning_rate": 8.283325338118154e-06,
|
|
"loss": 0.21339471340179444,
|
|
"memory(GiB)": 37.02,
|
|
"step": 195,
|
|
"token_acc": 0.9236349420094696,
|
|
"train_speed(iter/s)": 0.178683
|
|
},
|
|
{
|
|
"epoch": 0.8361640972040763,
|
|
"grad_norm": 0.6301010847091675,
|
|
"learning_rate": 8.199931301947782e-06,
|
|
"loss": 0.2191436767578125,
|
|
"memory(GiB)": 37.02,
|
|
"step": 200,
|
|
"token_acc": 0.9171661150371232,
|
|
"train_speed(iter/s)": 0.179195
|
|
},
|
|
{
|
|
"epoch": 0.8361640972040763,
|
|
"eval_loss": 0.2372375875711441,
|
|
"eval_runtime": 6.6381,
|
|
"eval_samples_per_second": 23.199,
|
|
"eval_steps_per_second": 5.875,
|
|
"eval_token_acc": 0.9280638792752036,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.8570681996341782,
|
|
"grad_norm": 0.5749327540397644,
|
|
"learning_rate": 8.115001500914e-06,
|
|
"loss": 0.21314010620117188,
|
|
"memory(GiB)": 37.02,
|
|
"step": 205,
|
|
"token_acc": 0.9257877237051938,
|
|
"train_speed(iter/s)": 0.176853
|
|
},
|
|
{
|
|
"epoch": 0.8779723020642801,
|
|
"grad_norm": 0.6665855050086975,
|
|
"learning_rate": 8.028576695955711e-06,
|
|
"loss": 0.20737431049346924,
|
|
"memory(GiB)": 37.02,
|
|
"step": 210,
|
|
"token_acc": 0.9287075750122971,
|
|
"train_speed(iter/s)": 0.177713
|
|
},
|
|
{
|
|
"epoch": 0.898876404494382,
|
|
"grad_norm": 0.5684797763824463,
|
|
"learning_rate": 7.940698365519246e-06,
|
|
"loss": 0.21522219181060792,
|
|
"memory(GiB)": 37.02,
|
|
"step": 215,
|
|
"token_acc": 0.9235592334740796,
|
|
"train_speed(iter/s)": 0.178152
|
|
},
|
|
{
|
|
"epoch": 0.9197805069244839,
|
|
"grad_norm": 0.6096109747886658,
|
|
"learning_rate": 7.851408685651342e-06,
|
|
"loss": 0.21898245811462402,
|
|
"memory(GiB)": 37.02,
|
|
"step": 220,
|
|
"token_acc": 0.9227710824734293,
|
|
"train_speed(iter/s)": 0.178957
|
|
},
|
|
{
|
|
"epoch": 0.9197805069244839,
|
|
"eval_loss": 0.23681528866291046,
|
|
"eval_runtime": 6.6341,
|
|
"eval_samples_per_second": 23.213,
|
|
"eval_steps_per_second": 5.879,
|
|
"eval_token_acc": 0.9285674321202771,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.9406846093545859,
|
|
"grad_norm": 0.55597323179245,
|
|
"learning_rate": 7.7607505097573e-06,
|
|
"loss": 0.21175863742828369,
|
|
"memory(GiB)": 37.02,
|
|
"step": 225,
|
|
"token_acc": 0.9328776189582448,
|
|
"train_speed(iter/s)": 0.17712
|
|
},
|
|
{
|
|
"epoch": 0.9615887117846877,
|
|
"grad_norm": 0.6526412963867188,
|
|
"learning_rate": 7.668767348034044e-06,
|
|
"loss": 0.21146607398986816,
|
|
"memory(GiB)": 37.02,
|
|
"step": 230,
|
|
"token_acc": 0.920968304305235,
|
|
"train_speed(iter/s)": 0.177568
|
|
},
|
|
{
|
|
"epoch": 0.9824928142147896,
|
|
"grad_norm": 0.5566428303718567,
|
|
"learning_rate": 7.5755033465880024e-06,
|
|
"loss": 0.21480951309204102,
|
|
"memory(GiB)": 37.02,
|
|
"step": 235,
|
|
"token_acc": 0.9206400893147881,
|
|
"train_speed(iter/s)": 0.178101
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.6223005056381226,
|
|
"learning_rate": 7.481003266247745e-06,
|
|
"loss": 0.21818625926971436,
|
|
"memory(GiB)": 37.02,
|
|
"step": 240,
|
|
"token_acc": 0.9244112717911505,
|
|
"train_speed(iter/s)": 0.179289
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"eval_loss": 0.23499608039855957,
|
|
"eval_runtime": 6.6316,
|
|
"eval_samples_per_second": 23.222,
|
|
"eval_steps_per_second": 5.881,
|
|
"eval_token_acc": 0.9285194747064607,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.020904102430102,
|
|
"grad_norm": 0.5871774554252625,
|
|
"learning_rate": 7.385312461081616e-06,
|
|
"loss": 0.1686471700668335,
|
|
"memory(GiB)": 37.02,
|
|
"step": 245,
|
|
"token_acc": 0.9376223265624841,
|
|
"train_speed(iter/s)": 0.177674
|
|
},
|
|
{
|
|
"epoch": 1.0418082048602038,
|
|
"grad_norm": 0.6004992723464966,
|
|
"learning_rate": 7.288476856630656e-06,
|
|
"loss": 0.16282508373260499,
|
|
"memory(GiB)": 37.02,
|
|
"step": 250,
|
|
"token_acc": 0.9393488548772574,
|
|
"train_speed(iter/s)": 0.17831
|
|
},
|
|
{
|
|
"epoch": 1.0627123072903057,
|
|
"grad_norm": 0.6309487223625183,
|
|
"learning_rate": 7.190542927867234e-06,
|
|
"loss": 0.1637326955795288,
|
|
"memory(GiB)": 37.02,
|
|
"step": 255,
|
|
"token_acc": 0.945661712279932,
|
|
"train_speed(iter/s)": 0.178866
|
|
},
|
|
{
|
|
"epoch": 1.0836164097204075,
|
|
"grad_norm": 0.5814361572265625,
|
|
"learning_rate": 7.091557676890001e-06,
|
|
"loss": 0.15928541421890258,
|
|
"memory(GiB)": 37.02,
|
|
"step": 260,
|
|
"token_acc": 0.9451221362463966,
|
|
"train_speed(iter/s)": 0.179402
|
|
},
|
|
{
|
|
"epoch": 1.0836164097204075,
|
|
"eval_loss": 0.23942023515701294,
|
|
"eval_runtime": 6.6351,
|
|
"eval_samples_per_second": 23.21,
|
|
"eval_steps_per_second": 5.878,
|
|
"eval_token_acc": 0.9281597941028367,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.1045205121505095,
|
|
"grad_norm": 0.5660094022750854,
|
|
"learning_rate": 6.991568610365851e-06,
|
|
"loss": 0.15568481683731078,
|
|
"memory(GiB)": 37.02,
|
|
"step": 265,
|
|
"token_acc": 0.9408398229323706,
|
|
"train_speed(iter/s)": 0.177849
|
|
},
|
|
{
|
|
"epoch": 1.1254246145806115,
|
|
"grad_norm": 0.5609026551246643,
|
|
"learning_rate": 6.890623716729724e-06,
|
|
"loss": 0.16430522203445436,
|
|
"memory(GiB)": 37.02,
|
|
"step": 270,
|
|
"token_acc": 0.9378496054183171,
|
|
"train_speed(iter/s)": 0.178387
|
|
},
|
|
{
|
|
"epoch": 1.1463287170107133,
|
|
"grad_norm": 0.579684317111969,
|
|
"learning_rate": 6.788771443153183e-06,
|
|
"loss": 0.15910866260528564,
|
|
"memory(GiB)": 37.02,
|
|
"step": 275,
|
|
"token_acc": 0.9425582560761714,
|
|
"train_speed(iter/s)": 0.178707
|
|
},
|
|
{
|
|
"epoch": 1.1672328194408153,
|
|
"grad_norm": 0.5673295855522156,
|
|
"learning_rate": 6.686060672292847e-06,
|
|
"loss": 0.15805959701538086,
|
|
"memory(GiB)": 37.02,
|
|
"step": 280,
|
|
"token_acc": 0.9461089281816197,
|
|
"train_speed(iter/s)": 0.179432
|
|
},
|
|
{
|
|
"epoch": 1.1672328194408153,
|
|
"eval_loss": 0.23961253464221954,
|
|
"eval_runtime": 6.6282,
|
|
"eval_samples_per_second": 23.234,
|
|
"eval_steps_per_second": 5.884,
|
|
"eval_token_acc": 0.9277281773784879,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.1881369218709172,
|
|
"grad_norm": 0.5638378858566284,
|
|
"learning_rate": 6.5825406988297815e-06,
|
|
"loss": 0.160142982006073,
|
|
"memory(GiB)": 37.02,
|
|
"step": 285,
|
|
"token_acc": 0.9402033005473476,
|
|
"train_speed(iter/s)": 0.178002
|
|
},
|
|
{
|
|
"epoch": 1.209041024301019,
|
|
"grad_norm": 0.5472784638404846,
|
|
"learning_rate": 6.478261205811188e-06,
|
|
"loss": 0.15631693601608276,
|
|
"memory(GiB)": 37.02,
|
|
"step": 290,
|
|
"token_acc": 0.9425503107883562,
|
|
"train_speed(iter/s)": 0.178558
|
|
},
|
|
{
|
|
"epoch": 1.229945126731121,
|
|
"grad_norm": 0.5741646885871887,
|
|
"learning_rate": 6.373272240805668e-06,
|
|
"loss": 0.16377530097961426,
|
|
"memory(GiB)": 37.02,
|
|
"step": 295,
|
|
"token_acc": 0.9453790125312598,
|
|
"train_speed(iter/s)": 0.179076
|
|
},
|
|
{
|
|
"epoch": 1.250849229161223,
|
|
"grad_norm": 0.5998528599739075,
|
|
"learning_rate": 6.267624191883551e-06,
|
|
"loss": 0.1654489278793335,
|
|
"memory(GiB)": 37.02,
|
|
"step": 300,
|
|
"token_acc": 0.9433800623052959,
|
|
"train_speed(iter/s)": 0.179541
|
|
},
|
|
{
|
|
"epoch": 1.250849229161223,
|
|
"eval_loss": 0.23872551321983337,
|
|
"eval_runtime": 6.6305,
|
|
"eval_samples_per_second": 23.226,
|
|
"eval_steps_per_second": 5.882,
|
|
"eval_token_acc": 0.9284555314880386,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.2717533315913248,
|
|
"grad_norm": 0.5820715427398682,
|
|
"learning_rate": 6.161367763433812e-06,
|
|
"loss": 0.1659400701522827,
|
|
"memory(GiB)": 37.02,
|
|
"step": 305,
|
|
"token_acc": 0.9422344370459437,
|
|
"train_speed(iter/s)": 0.178162
|
|
},
|
|
{
|
|
"epoch": 1.2926574340214267,
|
|
"grad_norm": 0.5959991216659546,
|
|
"learning_rate": 6.054553951829163e-06,
|
|
"loss": 0.16145311594009398,
|
|
"memory(GiB)": 37.02,
|
|
"step": 310,
|
|
"token_acc": 0.9404544902175234,
|
|
"train_speed(iter/s)": 0.178528
|
|
},
|
|
{
|
|
"epoch": 1.3135615364515285,
|
|
"grad_norm": 0.6270994544029236,
|
|
"learning_rate": 5.947234020951015e-06,
|
|
"loss": 0.16696672439575194,
|
|
"memory(GiB)": 37.02,
|
|
"step": 315,
|
|
"token_acc": 0.9387243092362137,
|
|
"train_speed(iter/s)": 0.178987
|
|
},
|
|
{
|
|
"epoch": 1.3344656388816305,
|
|
"grad_norm": 0.5847781896591187,
|
|
"learning_rate": 5.839459477586056e-06,
|
|
"loss": 0.1612384557723999,
|
|
"memory(GiB)": 37.02,
|
|
"step": 320,
|
|
"token_acc": 0.9422280984134284,
|
|
"train_speed(iter/s)": 0.179441
|
|
},
|
|
{
|
|
"epoch": 1.3344656388816305,
|
|
"eval_loss": 0.23678933084011078,
|
|
"eval_runtime": 6.6342,
|
|
"eval_samples_per_second": 23.213,
|
|
"eval_steps_per_second": 5.879,
|
|
"eval_token_acc": 0.9287432759709378,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.3553697413117325,
|
|
"grad_norm": 0.5601792931556702,
|
|
"learning_rate": 5.731282046706247e-06,
|
|
"loss": 0.16321887969970703,
|
|
"memory(GiB)": 37.02,
|
|
"step": 325,
|
|
"token_acc": 0.938570500852342,
|
|
"train_speed(iter/s)": 0.17826
|
|
},
|
|
{
|
|
"epoch": 1.3762738437418343,
|
|
"grad_norm": 0.5960825085639954,
|
|
"learning_rate": 5.622753646644102e-06,
|
|
"loss": 0.16715322732925414,
|
|
"memory(GiB)": 37.02,
|
|
"step": 330,
|
|
"token_acc": 0.9422235465708954,
|
|
"train_speed(iter/s)": 0.178716
|
|
},
|
|
{
|
|
"epoch": 1.3971779461719362,
|
|
"grad_norm": 0.5615048408508301,
|
|
"learning_rate": 5.513926364175172e-06,
|
|
"loss": 0.16500518321990967,
|
|
"memory(GiB)": 37.02,
|
|
"step": 335,
|
|
"token_acc": 0.9453201823653863,
|
|
"train_speed(iter/s)": 0.179021
|
|
},
|
|
{
|
|
"epoch": 1.418082048602038,
|
|
"grad_norm": 0.5556049942970276,
|
|
"learning_rate": 5.404852429519678e-06,
|
|
"loss": 0.15454906225204468,
|
|
"memory(GiB)": 37.02,
|
|
"step": 340,
|
|
"token_acc": 0.9407398356599567,
|
|
"train_speed(iter/s)": 0.179344
|
|
},
|
|
{
|
|
"epoch": 1.418082048602038,
|
|
"eval_loss": 0.23516136407852173,
|
|
"eval_runtime": 6.6178,
|
|
"eval_samples_per_second": 23.271,
|
|
"eval_steps_per_second": 5.893,
|
|
"eval_token_acc": 0.9299502042186538,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.43898615103214,
|
|
"grad_norm": 0.5906144976615906,
|
|
"learning_rate": 5.295584191275308e-06,
|
|
"loss": 0.16890814304351806,
|
|
"memory(GiB)": 37.02,
|
|
"step": 345,
|
|
"token_acc": 0.9373257274217237,
|
|
"train_speed(iter/s)": 0.17811
|
|
},
|
|
{
|
|
"epoch": 1.459890253462242,
|
|
"grad_norm": 0.6000419855117798,
|
|
"learning_rate": 5.1861740912932e-06,
|
|
"loss": 0.16450071334838867,
|
|
"memory(GiB)": 37.02,
|
|
"step": 350,
|
|
"token_acc": 0.9405447723714142,
|
|
"train_speed(iter/s)": 0.178429
|
|
},
|
|
{
|
|
"epoch": 1.480794355892344,
|
|
"grad_norm": 0.5739107131958008,
|
|
"learning_rate": 5.07667463950916e-06,
|
|
"loss": 0.1568189740180969,
|
|
"memory(GiB)": 37.02,
|
|
"step": 355,
|
|
"token_acc": 0.9423155065170506,
|
|
"train_speed(iter/s)": 0.178939
|
|
},
|
|
{
|
|
"epoch": 1.5016984583224458,
|
|
"grad_norm": 0.5783458948135376,
|
|
"learning_rate": 4.967138388742218e-06,
|
|
"loss": 0.16183936595916748,
|
|
"memory(GiB)": 37.02,
|
|
"step": 360,
|
|
"token_acc": 0.9435463668380956,
|
|
"train_speed(iter/s)": 0.179308
|
|
},
|
|
{
|
|
"epoch": 1.5016984583224458,
|
|
"eval_loss": 0.23441138863563538,
|
|
"eval_runtime": 6.6338,
|
|
"eval_samples_per_second": 23.214,
|
|
"eval_steps_per_second": 5.879,
|
|
"eval_token_acc": 0.9301819983854337,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.5226025607525477,
|
|
"grad_norm": 0.509042501449585,
|
|
"learning_rate": 4.8576179094725855e-06,
|
|
"loss": 0.16039080619812013,
|
|
"memory(GiB)": 37.02,
|
|
"step": 365,
|
|
"token_acc": 0.9386532058855117,
|
|
"train_speed(iter/s)": 0.178194
|
|
},
|
|
{
|
|
"epoch": 1.5435066631826495,
|
|
"grad_norm": 0.5684186220169067,
|
|
"learning_rate": 4.748165764611157e-06,
|
|
"loss": 0.16779780387878418,
|
|
"memory(GiB)": 37.02,
|
|
"step": 370,
|
|
"token_acc": 0.9417494999465592,
|
|
"train_speed(iter/s)": 0.178626
|
|
},
|
|
{
|
|
"epoch": 1.5644107656127515,
|
|
"grad_norm": 0.5707104802131653,
|
|
"learning_rate": 4.6388344842726266e-06,
|
|
"loss": 0.16627538204193115,
|
|
"memory(GiB)": 37.02,
|
|
"step": 375,
|
|
"token_acc": 0.938589794484991,
|
|
"train_speed(iter/s)": 0.179043
|
|
},
|
|
{
|
|
"epoch": 1.5853148680428535,
|
|
"grad_norm": 0.5481753945350647,
|
|
"learning_rate": 4.529676540564351e-06,
|
|
"loss": 0.15947449207305908,
|
|
"memory(GiB)": 37.02,
|
|
"step": 380,
|
|
"token_acc": 0.940670141008185,
|
|
"train_speed(iter/s)": 0.179357
|
|
},
|
|
{
|
|
"epoch": 1.5853148680428535,
|
|
"eval_loss": 0.23353615403175354,
|
|
"eval_runtime": 6.6285,
|
|
"eval_samples_per_second": 23.233,
|
|
"eval_steps_per_second": 5.884,
|
|
"eval_token_acc": 0.9297983390749015,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.6062189704729555,
|
|
"grad_norm": 0.5632703900337219,
|
|
"learning_rate": 4.420744322403058e-06,
|
|
"loss": 0.15636355876922609,
|
|
"memory(GiB)": 37.02,
|
|
"step": 385,
|
|
"token_acc": 0.9417008449869052,
|
|
"train_speed(iter/s)": 0.178382
|
|
},
|
|
{
|
|
"epoch": 1.6271230729030572,
|
|
"grad_norm": 0.5890342593193054,
|
|
"learning_rate": 4.312090110371473e-06,
|
|
"loss": 0.15623259544372559,
|
|
"memory(GiB)": 37.02,
|
|
"step": 390,
|
|
"token_acc": 0.9422333549803631,
|
|
"train_speed(iter/s)": 0.178811
|
|
},
|
|
{
|
|
"epoch": 1.648027175333159,
|
|
"grad_norm": 0.5722407102584839,
|
|
"learning_rate": 4.203766051626939e-06,
|
|
"loss": 0.16071751117706298,
|
|
"memory(GiB)": 37.02,
|
|
"step": 395,
|
|
"token_acc": 0.9453681710213777,
|
|
"train_speed(iter/s)": 0.179205
|
|
},
|
|
{
|
|
"epoch": 1.668931277763261,
|
|
"grad_norm": 0.611022412776947,
|
|
"learning_rate": 4.095824134874087e-06,
|
|
"loss": 0.16332566738128662,
|
|
"memory(GiB)": 37.02,
|
|
"step": 400,
|
|
"token_acc": 0.937804167388531,
|
|
"train_speed(iter/s)": 0.179585
|
|
},
|
|
{
|
|
"epoch": 1.668931277763261,
|
|
"eval_loss": 0.23261623084545135,
|
|
"eval_runtime": 6.6176,
|
|
"eval_samples_per_second": 23.271,
|
|
"eval_steps_per_second": 5.893,
|
|
"eval_token_acc": 0.9303818209430026,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.689835380193363,
|
|
"grad_norm": 0.5979709029197693,
|
|
"learning_rate": 3.988316165413528e-06,
|
|
"loss": 0.1527752161026001,
|
|
"memory(GiB)": 37.02,
|
|
"step": 405,
|
|
"token_acc": 0.940948516358103,
|
|
"train_speed(iter/s)": 0.178456
|
|
},
|
|
{
|
|
"epoch": 1.710739482623465,
|
|
"grad_norm": 0.549656093120575,
|
|
"learning_rate": 3.881293740278588e-06,
|
|
"loss": 0.15298218727111818,
|
|
"memory(GiB)": 39.46,
|
|
"step": 410,
|
|
"token_acc": 0.9467923967923968,
|
|
"train_speed(iter/s)": 0.178833
|
|
},
|
|
{
|
|
"epoch": 1.7316435850535667,
|
|
"grad_norm": 0.5743902921676636,
|
|
"learning_rate": 3.774808223471996e-06,
|
|
"loss": 0.15700061321258546,
|
|
"memory(GiB)": 39.46,
|
|
"step": 415,
|
|
"token_acc": 0.9460204535349044,
|
|
"train_speed(iter/s)": 0.179167
|
|
},
|
|
{
|
|
"epoch": 1.7525476874836685,
|
|
"grad_norm": 0.5933798551559448,
|
|
"learning_rate": 3.6689107213144025e-06,
|
|
"loss": 0.16367003917694092,
|
|
"memory(GiB)": 39.46,
|
|
"step": 420,
|
|
"token_acc": 0.9469271826676363,
|
|
"train_speed(iter/s)": 0.179611
|
|
},
|
|
{
|
|
"epoch": 1.7525476874836685,
|
|
"eval_loss": 0.23134766519069672,
|
|
"eval_runtime": 6.6251,
|
|
"eval_samples_per_second": 23.245,
|
|
"eval_steps_per_second": 5.887,
|
|
"eval_token_acc": 0.9307654802535349,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.7734517899137705,
|
|
"grad_norm": 0.5866169929504395,
|
|
"learning_rate": 3.5636520579165704e-06,
|
|
"loss": 0.15368299484252929,
|
|
"memory(GiB)": 39.46,
|
|
"step": 425,
|
|
"token_acc": 0.9376751889917608,
|
|
"train_speed(iter/s)": 0.178672
|
|
},
|
|
{
|
|
"epoch": 1.7943558923438725,
|
|
"grad_norm": 0.6050538420677185,
|
|
"learning_rate": 3.4590827507870257e-06,
|
|
"loss": 0.1597348690032959,
|
|
"memory(GiB)": 39.46,
|
|
"step": 430,
|
|
"token_acc": 0.9424976034327972,
|
|
"train_speed(iter/s)": 0.179078
|
|
},
|
|
{
|
|
"epoch": 1.8152599947739745,
|
|
"grad_norm": 0.5516080856323242,
|
|
"learning_rate": 3.3552529865868323e-06,
|
|
"loss": 0.15672740936279297,
|
|
"memory(GiB)": 39.46,
|
|
"step": 435,
|
|
"token_acc": 0.9391331914315475,
|
|
"train_speed(iter/s)": 0.179453
|
|
},
|
|
{
|
|
"epoch": 1.8361640972040763,
|
|
"grad_norm": 0.5520040392875671,
|
|
"learning_rate": 3.252212597043167e-06,
|
|
"loss": 0.154363751411438,
|
|
"memory(GiB)": 39.46,
|
|
"step": 440,
|
|
"token_acc": 0.9407710731669628,
|
|
"train_speed(iter/s)": 0.179753
|
|
},
|
|
{
|
|
"epoch": 1.8361640972040763,
|
|
"eval_loss": 0.230976402759552,
|
|
"eval_runtime": 6.6289,
|
|
"eval_samples_per_second": 23.231,
|
|
"eval_steps_per_second": 5.883,
|
|
"eval_token_acc": 0.9307734731558376,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.8570681996341782,
|
|
"grad_norm": 0.5343540906906128,
|
|
"learning_rate": 3.1500110350332492e-06,
|
|
"loss": 0.1531538486480713,
|
|
"memory(GiB)": 39.46,
|
|
"step": 445,
|
|
"token_acc": 0.9422152799196107,
|
|
"train_speed(iter/s)": 0.178932
|
|
},
|
|
{
|
|
"epoch": 1.87797230206428,
|
|
"grad_norm": 0.5729184746742249,
|
|
"learning_rate": 3.048697350850073e-06,
|
|
"loss": 0.16055722236633302,
|
|
"memory(GiB)": 39.46,
|
|
"step": 450,
|
|
"token_acc": 0.9512691733405728,
|
|
"train_speed(iter/s)": 0.179136
|
|
},
|
|
{
|
|
"epoch": 1.898876404494382,
|
|
"grad_norm": 0.5333752036094666,
|
|
"learning_rate": 2.9483201686613626e-06,
|
|
"loss": 0.15494089126586913,
|
|
"memory(GiB)": 39.46,
|
|
"step": 455,
|
|
"token_acc": 0.9471734021540102,
|
|
"train_speed(iter/s)": 0.179584
|
|
},
|
|
{
|
|
"epoch": 1.919780506924484,
|
|
"grad_norm": 0.8302111625671387,
|
|
"learning_rate": 2.8489276631730633e-06,
|
|
"loss": 0.15890274047851563,
|
|
"memory(GiB)": 39.46,
|
|
"step": 460,
|
|
"token_acc": 0.9424079706996089,
|
|
"train_speed(iter/s)": 0.179933
|
|
},
|
|
{
|
|
"epoch": 1.919780506924484,
|
|
"eval_loss": 0.22922886908054352,
|
|
"eval_runtime": 6.6337,
|
|
"eval_samples_per_second": 23.215,
|
|
"eval_steps_per_second": 5.879,
|
|
"eval_token_acc": 0.9315727633861132,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.940684609354586,
|
|
"grad_norm": 0.5802881717681885,
|
|
"learning_rate": 2.750567536508504e-06,
|
|
"loss": 0.1534783959388733,
|
|
"memory(GiB)": 39.46,
|
|
"step": 465,
|
|
"token_acc": 0.9404760675375639,
|
|
"train_speed(iter/s)": 0.178985
|
|
},
|
|
{
|
|
"epoch": 1.9615887117846877,
|
|
"grad_norm": 0.5230849385261536,
|
|
"learning_rate": 2.653286995314398e-06,
|
|
"loss": 0.15316032171249389,
|
|
"memory(GiB)": 39.46,
|
|
"step": 470,
|
|
"token_acc": 0.940865764070184,
|
|
"train_speed(iter/s)": 0.179339
|
|
},
|
|
{
|
|
"epoch": 1.9824928142147895,
|
|
"grad_norm": 0.5382450222969055,
|
|
"learning_rate": 2.5571327281046486e-06,
|
|
"loss": 0.15808116197586058,
|
|
"memory(GiB)": 39.46,
|
|
"step": 475,
|
|
"token_acc": 0.9414967066105865,
|
|
"train_speed(iter/s)": 0.179724
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.5938447117805481,
|
|
"learning_rate": 2.46215088285279e-06,
|
|
"loss": 0.16262369155883788,
|
|
"memory(GiB)": 39.46,
|
|
"step": 480,
|
|
"token_acc": 0.9486793264834041,
|
|
"train_speed(iter/s)": 0.180254
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"eval_loss": 0.2286583036184311,
|
|
"eval_runtime": 6.6325,
|
|
"eval_samples_per_second": 23.219,
|
|
"eval_steps_per_second": 5.88,
|
|
"eval_token_acc": 0.9318205433574985,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.020904102430102,
|
|
"grad_norm": 0.536767303943634,
|
|
"learning_rate": 2.3683870448438905e-06,
|
|
"loss": 0.12537214756011963,
|
|
"memory(GiB)": 39.46,
|
|
"step": 485,
|
|
"token_acc": 0.9521131994289179,
|
|
"train_speed(iter/s)": 0.179418
|
|
},
|
|
{
|
|
"epoch": 2.041808204860204,
|
|
"grad_norm": 0.5739563703536987,
|
|
"learning_rate": 2.2758862147964933e-06,
|
|
"loss": 0.12507247924804688,
|
|
"memory(GiB)": 39.46,
|
|
"step": 490,
|
|
"token_acc": 0.9547663049958268,
|
|
"train_speed(iter/s)": 0.179658
|
|
},
|
|
{
|
|
"epoch": 2.0627123072903055,
|
|
"grad_norm": 0.6101587414741516,
|
|
"learning_rate": 2.1846927872651135e-06,
|
|
"loss": 0.1226424217224121,
|
|
"memory(GiB)": 39.46,
|
|
"step": 495,
|
|
"token_acc": 0.9567438898659713,
|
|
"train_speed(iter/s)": 0.180059
|
|
},
|
|
{
|
|
"epoch": 2.0836164097204075,
|
|
"grad_norm": 0.5811767578125,
|
|
"learning_rate": 2.0948505293336506e-06,
|
|
"loss": 0.12014656066894532,
|
|
"memory(GiB)": 39.46,
|
|
"step": 500,
|
|
"token_acc": 0.9582742281109468,
|
|
"train_speed(iter/s)": 0.180402
|
|
},
|
|
{
|
|
"epoch": 2.0836164097204075,
|
|
"eval_loss": 0.24429188668727875,
|
|
"eval_runtime": 6.6294,
|
|
"eval_samples_per_second": 23.23,
|
|
"eval_steps_per_second": 5.883,
|
|
"eval_token_acc": 0.9308134376673514,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.1045205121505095,
|
|
"grad_norm": 0.610534131526947,
|
|
"learning_rate": 2.0064025596099663e-06,
|
|
"loss": 0.11917252540588379,
|
|
"memory(GiB)": 39.46,
|
|
"step": 505,
|
|
"token_acc": 0.9507537400408873,
|
|
"train_speed(iter/s)": 0.179597
|
|
},
|
|
{
|
|
"epoch": 2.1254246145806115,
|
|
"grad_norm": 0.5680590867996216,
|
|
"learning_rate": 1.919391327531663e-06,
|
|
"loss": 0.12412093877792359,
|
|
"memory(GiB)": 39.46,
|
|
"step": 510,
|
|
"token_acc": 0.9561542850973921,
|
|
"train_speed(iter/s)": 0.179885
|
|
},
|
|
{
|
|
"epoch": 2.1463287170107135,
|
|
"grad_norm": 0.5825695395469666,
|
|
"learning_rate": 1.8338585929930424e-06,
|
|
"loss": 0.11532289981842041,
|
|
"memory(GiB)": 39.46,
|
|
"step": 515,
|
|
"token_acc": 0.9611214610756498,
|
|
"train_speed(iter/s)": 0.180163
|
|
},
|
|
{
|
|
"epoch": 2.167232819440815,
|
|
"grad_norm": 0.5703282952308655,
|
|
"learning_rate": 1.7498454063029984e-06,
|
|
"loss": 0.11896244287490845,
|
|
"memory(GiB)": 39.46,
|
|
"step": 520,
|
|
"token_acc": 0.9602545823350432,
|
|
"train_speed(iter/s)": 0.180409
|
|
},
|
|
{
|
|
"epoch": 2.167232819440815,
|
|
"eval_loss": 0.24228492379188538,
|
|
"eval_runtime": 6.5759,
|
|
"eval_samples_per_second": 23.419,
|
|
"eval_steps_per_second": 5.931,
|
|
"eval_token_acc": 0.9311811111732782,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.188136921870917,
|
|
"grad_norm": 0.5464326739311218,
|
|
"learning_rate": 1.667392088483456e-06,
|
|
"loss": 0.12099326848983764,
|
|
"memory(GiB)": 39.46,
|
|
"step": 525,
|
|
"token_acc": 0.9483148644281589,
|
|
"train_speed(iter/s)": 0.179639
|
|
},
|
|
{
|
|
"epoch": 2.209041024301019,
|
|
"grad_norm": 0.5550944805145264,
|
|
"learning_rate": 1.5865382119178258e-06,
|
|
"loss": 0.11996636390686036,
|
|
"memory(GiB)": 39.46,
|
|
"step": 530,
|
|
"token_acc": 0.9563651862085183,
|
|
"train_speed(iter/s)": 0.179841
|
|
},
|
|
{
|
|
"epoch": 2.229945126731121,
|
|
"grad_norm": 0.5549576282501221,
|
|
"learning_rate": 1.507322581358771e-06,
|
|
"loss": 0.11995522975921631,
|
|
"memory(GiB)": 39.46,
|
|
"step": 535,
|
|
"token_acc": 0.9548897519016594,
|
|
"train_speed(iter/s)": 0.180134
|
|
},
|
|
{
|
|
"epoch": 2.250849229161223,
|
|
"grad_norm": 0.5435655117034912,
|
|
"learning_rate": 1.4297832153043657e-06,
|
|
"loss": 0.11738158464431762,
|
|
"memory(GiB)": 39.46,
|
|
"step": 540,
|
|
"token_acc": 0.9565079135650791,
|
|
"train_speed(iter/s)": 0.180343
|
|
},
|
|
{
|
|
"epoch": 2.250849229161223,
|
|
"eval_loss": 0.24256764352321625,
|
|
"eval_runtime": 6.6338,
|
|
"eval_samples_per_second": 23.214,
|
|
"eval_steps_per_second": 5.879,
|
|
"eval_token_acc": 0.9310052673226176,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.271753331591325,
|
|
"grad_norm": 0.5776016116142273,
|
|
"learning_rate": 1.353957327751621e-06,
|
|
"loss": 0.12343101501464844,
|
|
"memory(GiB)": 39.46,
|
|
"step": 545,
|
|
"token_acc": 0.9484948703433345,
|
|
"train_speed(iter/s)": 0.179597
|
|
},
|
|
{
|
|
"epoch": 2.2926574340214265,
|
|
"grad_norm": 0.5709097385406494,
|
|
"learning_rate": 1.2798813103361291e-06,
|
|
"loss": 0.11971625089645385,
|
|
"memory(GiB)": 39.46,
|
|
"step": 550,
|
|
"token_acc": 0.9561414627962986,
|
|
"train_speed(iter/s)": 0.179863
|
|
},
|
|
{
|
|
"epoch": 2.3135615364515285,
|
|
"grad_norm": 0.5969334840774536,
|
|
"learning_rate": 1.2075907148663579e-06,
|
|
"loss": 0.12062759399414062,
|
|
"memory(GiB)": 39.46,
|
|
"step": 555,
|
|
"token_acc": 0.9569148857062122,
|
|
"train_speed(iter/s)": 0.180073
|
|
},
|
|
{
|
|
"epoch": 2.3344656388816305,
|
|
"grad_norm": 0.5327123403549194,
|
|
"learning_rate": 1.1371202362610412e-06,
|
|
"loss": 0.11765568256378174,
|
|
"memory(GiB)": 39.46,
|
|
"step": 560,
|
|
"token_acc": 0.9581596210457106,
|
|
"train_speed(iter/s)": 0.180333
|
|
},
|
|
{
|
|
"epoch": 2.3344656388816305,
|
|
"eval_loss": 0.2422230988740921,
|
|
"eval_runtime": 6.5972,
|
|
"eval_samples_per_second": 23.343,
|
|
"eval_steps_per_second": 5.912,
|
|
"eval_token_acc": 0.9311651253686726,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.3553697413117325,
|
|
"grad_norm": 0.5562822818756104,
|
|
"learning_rate": 1.06850369589781e-06,
|
|
"loss": 0.11786850690841674,
|
|
"memory(GiB)": 39.46,
|
|
"step": 565,
|
|
"token_acc": 0.9489760711168842,
|
|
"train_speed(iter/s)": 0.179625
|
|
},
|
|
{
|
|
"epoch": 2.3762738437418345,
|
|
"grad_norm": 0.5818617939949036,
|
|
"learning_rate": 1.0017740253810608e-06,
|
|
"loss": 0.12321114540100098,
|
|
"memory(GiB)": 39.46,
|
|
"step": 570,
|
|
"token_acc": 0.9529802240354032,
|
|
"train_speed(iter/s)": 0.179844
|
|
},
|
|
{
|
|
"epoch": 2.3971779461719365,
|
|
"grad_norm": 0.5551012754440308,
|
|
"learning_rate": 9.369632507368736e-07,
|
|
"loss": 0.13383883237838745,
|
|
"memory(GiB)": 39.46,
|
|
"step": 575,
|
|
"token_acc": 0.9536231884057971,
|
|
"train_speed(iter/s)": 0.180115
|
|
},
|
|
{
|
|
"epoch": 2.418082048602038,
|
|
"grad_norm": 0.5434304475784302,
|
|
"learning_rate": 8.741024770425394e-07,
|
|
"loss": 0.12519620656967162,
|
|
"memory(GiB)": 39.46,
|
|
"step": 580,
|
|
"token_acc": 0.9545193961398816,
|
|
"train_speed(iter/s)": 0.180327
|
|
},
|
|
{
|
|
"epoch": 2.418082048602038,
|
|
"eval_loss": 0.24199490249156952,
|
|
"eval_runtime": 6.6585,
|
|
"eval_samples_per_second": 23.128,
|
|
"eval_steps_per_second": 5.857,
|
|
"eval_token_acc": 0.9310612176387368,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.43898615103214,
|
|
"grad_norm": 0.5505854487419128,
|
|
"learning_rate": 8.132218734980852e-07,
|
|
"loss": 0.12351593971252442,
|
|
"memory(GiB)": 39.46,
|
|
"step": 585,
|
|
"token_acc": 0.9496205882650123,
|
|
"train_speed(iter/s)": 0.17969
|
|
},
|
|
{
|
|
"epoch": 2.459890253462242,
|
|
"grad_norm": 0.5361783504486084,
|
|
"learning_rate": 7.543506589469674e-07,
|
|
"loss": 0.12893223762512207,
|
|
"memory(GiB)": 39.46,
|
|
"step": 590,
|
|
"token_acc": 0.9550055365804208,
|
|
"train_speed(iter/s)": 0.179827
|
|
},
|
|
{
|
|
"epoch": 2.480794355892344,
|
|
"grad_norm": 0.570577085018158,
|
|
"learning_rate": 6.975170878528765e-07,
|
|
"loss": 0.12303224802017212,
|
|
"memory(GiB)": 39.46,
|
|
"step": 595,
|
|
"token_acc": 0.9543349832348714,
|
|
"train_speed(iter/s)": 0.180173
|
|
},
|
|
{
|
|
"epoch": 2.501698458322446,
|
|
"grad_norm": 0.5742475390434265,
|
|
"learning_rate": 6.427484367393699e-07,
|
|
"loss": 0.1256563663482666,
|
|
"memory(GiB)": 39.46,
|
|
"step": 600,
|
|
"token_acc": 0.954024779864412,
|
|
"train_speed(iter/s)": 0.180362
|
|
},
|
|
{
|
|
"epoch": 2.501698458322446,
|
|
"eval_loss": 0.24162955582141876,
|
|
"eval_runtime": 6.6424,
|
|
"eval_samples_per_second": 23.184,
|
|
"eval_steps_per_second": 5.871,
|
|
"eval_token_acc": 0.9309413241041955,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.5226025607525475,
|
|
"grad_norm": 0.5502005219459534,
|
|
"learning_rate": 5.900709910988739e-07,
|
|
"loss": 0.1203322172164917,
|
|
"memory(GiB)": 39.46,
|
|
"step": 605,
|
|
"token_acc": 0.945725693054572,
|
|
"train_speed(iter/s)": 0.179668
|
|
},
|
|
{
|
|
"epoch": 2.5435066631826495,
|
|
"grad_norm": 0.550845742225647,
|
|
"learning_rate": 5.395100327773018e-07,
|
|
"loss": 0.11989848613739014,
|
|
"memory(GiB)": 39.46,
|
|
"step": 610,
|
|
"token_acc": 0.9564310580152438,
|
|
"train_speed(iter/s)": 0.179957
|
|
},
|
|
{
|
|
"epoch": 2.5644107656127515,
|
|
"grad_norm": 0.5418065190315247,
|
|
"learning_rate": 4.91089827840367e-07,
|
|
"loss": 0.12273094654083253,
|
|
"memory(GiB)": 39.46,
|
|
"step": 615,
|
|
"token_acc": 0.9586797908520233,
|
|
"train_speed(iter/s)": 0.180256
|
|
},
|
|
{
|
|
"epoch": 2.5853148680428535,
|
|
"grad_norm": 0.5306729674339294,
|
|
"learning_rate": 4.4483361492740184e-07,
|
|
"loss": 0.11489535570144653,
|
|
"memory(GiB)": 39.46,
|
|
"step": 620,
|
|
"token_acc": 0.9587957254418413,
|
|
"train_speed(iter/s)": 0.180471
|
|
},
|
|
{
|
|
"epoch": 2.5853148680428535,
|
|
"eval_loss": 0.24175922572612762,
|
|
"eval_runtime": 6.6237,
|
|
"eval_samples_per_second": 23.25,
|
|
"eval_steps_per_second": 5.888,
|
|
"eval_token_acc": 0.9314368840469663,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.6062189704729555,
|
|
"grad_norm": 0.5641535520553589,
|
|
"learning_rate": 4.007635940982857e-07,
|
|
"loss": 0.11712099313735962,
|
|
"memory(GiB)": 39.46,
|
|
"step": 625,
|
|
"token_acc": 0.9532489285186463,
|
|
"train_speed(iter/s)": 0.179823
|
|
},
|
|
{
|
|
"epoch": 2.627123072903057,
|
|
"grad_norm": 0.6171642541885376,
|
|
"learning_rate": 3.589009161788104e-07,
|
|
"loss": 0.1301506519317627,
|
|
"memory(GiB)": 39.46,
|
|
"step": 630,
|
|
"token_acc": 0.9547707464948845,
|
|
"train_speed(iter/s)": 0.180046
|
|
},
|
|
{
|
|
"epoch": 2.648027175333159,
|
|
"grad_norm": 0.5656234622001648,
|
|
"learning_rate": 3.192656726096277e-07,
|
|
"loss": 0.1133504033088684,
|
|
"memory(GiB)": 39.46,
|
|
"step": 635,
|
|
"token_acc": 0.9620012398557258,
|
|
"train_speed(iter/s)": 0.180284
|
|
},
|
|
{
|
|
"epoch": 2.668931277763261,
|
|
"grad_norm": 0.5739580988883972,
|
|
"learning_rate": 2.818768858036208e-07,
|
|
"loss": 0.11660020351409912,
|
|
"memory(GiB)": 39.46,
|
|
"step": 640,
|
|
"token_acc": 0.9527535918095915,
|
|
"train_speed(iter/s)": 0.180505
|
|
},
|
|
{
|
|
"epoch": 2.668931277763261,
|
|
"eval_loss": 0.24187050759792328,
|
|
"eval_runtime": 6.6525,
|
|
"eval_samples_per_second": 23.149,
|
|
"eval_steps_per_second": 5.862,
|
|
"eval_token_acc": 0.9315088201676911,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.689835380193363,
|
|
"grad_norm": 0.5692787170410156,
|
|
"learning_rate": 2.467525000163523e-07,
|
|
"loss": 0.12488572597503662,
|
|
"memory(GiB)": 39.46,
|
|
"step": 645,
|
|
"token_acc": 0.949108626758746,
|
|
"train_speed(iter/s)": 0.179804
|
|
},
|
|
{
|
|
"epoch": 2.710739482623465,
|
|
"grad_norm": 0.5694913864135742,
|
|
"learning_rate": 2.139093727339503e-07,
|
|
"loss": 0.1269749402999878,
|
|
"memory(GiB)": 39.46,
|
|
"step": 650,
|
|
"token_acc": 0.9520699186238631,
|
|
"train_speed(iter/s)": 0.180029
|
|
},
|
|
{
|
|
"epoch": 2.7316435850535665,
|
|
"grad_norm": 0.5761524438858032,
|
|
"learning_rate": 1.8336326658258797e-07,
|
|
"loss": 0.12404175996780395,
|
|
"memory(GiB)": 39.46,
|
|
"step": 655,
|
|
"token_acc": 0.9545447343821791,
|
|
"train_speed(iter/s)": 0.180226
|
|
},
|
|
{
|
|
"epoch": 2.7525476874836685,
|
|
"grad_norm": 0.5228170156478882,
|
|
"learning_rate": 1.551288417634106e-07,
|
|
"loss": 0.11508429050445557,
|
|
"memory(GiB)": 39.46,
|
|
"step": 660,
|
|
"token_acc": 0.961899334176617,
|
|
"train_speed(iter/s)": 0.180412
|
|
},
|
|
{
|
|
"epoch": 2.7525476874836685,
|
|
"eval_loss": 0.24170467257499695,
|
|
"eval_runtime": 6.6488,
|
|
"eval_samples_per_second": 23.162,
|
|
"eval_steps_per_second": 5.866,
|
|
"eval_token_acc": 0.9315327988745994,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.7734517899137705,
|
|
"grad_norm": 0.5596388578414917,
|
|
"learning_rate": 1.292196490165698e-07,
|
|
"loss": 0.12398817539215087,
|
|
"memory(GiB)": 39.46,
|
|
"step": 665,
|
|
"token_acc": 0.9481295480168085,
|
|
"train_speed(iter/s)": 0.179823
|
|
},
|
|
{
|
|
"epoch": 2.7943558923438725,
|
|
"grad_norm": 0.5801319479942322,
|
|
"learning_rate": 1.0564812311772422e-07,
|
|
"loss": 0.1263979434967041,
|
|
"memory(GiB)": 39.46,
|
|
"step": 670,
|
|
"token_acc": 0.9553416631792909,
|
|
"train_speed(iter/s)": 0.180051
|
|
},
|
|
{
|
|
"epoch": 2.8152599947739745,
|
|
"grad_norm": 0.6298143863677979,
|
|
"learning_rate": 8.442557691013042e-08,
|
|
"loss": 0.12353664636611938,
|
|
"memory(GiB)": 39.46,
|
|
"step": 675,
|
|
"token_acc": 0.955449425153873,
|
|
"train_speed(iter/s)": 0.180251
|
|
},
|
|
{
|
|
"epoch": 2.836164097204076,
|
|
"grad_norm": 0.5088171362876892,
|
|
"learning_rate": 6.556219587519397e-08,
|
|
"loss": 0.11518588066101074,
|
|
"memory(GiB)": 39.46,
|
|
"step": 680,
|
|
"token_acc": 0.9564935905736113,
|
|
"train_speed(iter/s)": 0.180447
|
|
},
|
|
{
|
|
"epoch": 2.836164097204076,
|
|
"eval_loss": 0.2416183054447174,
|
|
"eval_runtime": 6.639,
|
|
"eval_samples_per_second": 23.196,
|
|
"eval_steps_per_second": 5.874,
|
|
"eval_token_acc": 0.9316287137022324,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.8570681996341785,
|
|
"grad_norm": 0.5331613421440125,
|
|
"learning_rate": 4.906703324408402e-08,
|
|
"loss": 0.11869626045227051,
|
|
"memory(GiB)": 39.46,
|
|
"step": 685,
|
|
"token_acc": 0.9486922250089573,
|
|
"train_speed(iter/s)": 0.179804
|
|
},
|
|
{
|
|
"epoch": 2.87797230206428,
|
|
"grad_norm": 0.5424652099609375,
|
|
"learning_rate": 3.494800565275125e-08,
|
|
"loss": 0.11893690824508667,
|
|
"memory(GiB)": 39.46,
|
|
"step": 690,
|
|
"token_acc": 0.958352824231084,
|
|
"train_speed(iter/s)": 0.179959
|
|
},
|
|
{
|
|
"epoch": 2.898876404494382,
|
|
"grad_norm": 0.5787132978439331,
|
|
"learning_rate": 2.321188934244323e-08,
|
|
"loss": 0.12508745193481446,
|
|
"memory(GiB)": 39.46,
|
|
"step": 695,
|
|
"token_acc": 0.9538041333143876,
|
|
"train_speed(iter/s)": 0.180164
|
|
},
|
|
{
|
|
"epoch": 2.919780506924484,
|
|
"grad_norm": 0.5377312302589417,
|
|
"learning_rate": 1.3864316907539754e-08,
|
|
"loss": 0.11967707872390747,
|
|
"memory(GiB)": 39.46,
|
|
"step": 700,
|
|
"token_acc": 0.9583007797539715,
|
|
"train_speed(iter/s)": 0.180364
|
|
},
|
|
{
|
|
"epoch": 2.919780506924484,
|
|
"eval_loss": 0.24175819754600525,
|
|
"eval_runtime": 6.6511,
|
|
"eval_samples_per_second": 23.154,
|
|
"eval_steps_per_second": 5.864,
|
|
"eval_token_acc": 0.9318525149667095,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 2.940684609354586,
|
|
"grad_norm": 0.5400473475456238,
|
|
"learning_rate": 6.9097745922580564e-09,
|
|
"loss": 0.11984028816223144,
|
|
"memory(GiB)": 39.46,
|
|
"step": 705,
|
|
"token_acc": 0.950023827776972,
|
|
"train_speed(iter/s)": 0.179708
|
|
},
|
|
{
|
|
"epoch": 2.961588711784688,
|
|
"grad_norm": 0.5357044339179993,
|
|
"learning_rate": 2.3516001375439856e-09,
|
|
"loss": 0.1183488130569458,
|
|
"memory(GiB)": 39.46,
|
|
"step": 710,
|
|
"token_acc": 0.9582627468029924,
|
|
"train_speed(iter/s)": 0.179902
|
|
},
|
|
{
|
|
"epoch": 2.9824928142147895,
|
|
"grad_norm": 0.5505788922309875,
|
|
"learning_rate": 1.919811791650794e-10,
|
|
"loss": 0.12027006149291992,
|
|
"memory(GiB)": 39.46,
|
|
"step": 715,
|
|
"token_acc": 0.9548030559208467,
|
|
"train_speed(iter/s)": 0.180076
|
|
},
|
|
{
|
|
"epoch": 2.9908544551868306,
|
|
"eval_loss": 0.24157196283340454,
|
|
"eval_runtime": 6.6398,
|
|
"eval_samples_per_second": 23.193,
|
|
"eval_steps_per_second": 5.874,
|
|
"eval_token_acc": 0.9317246285298655,
|
|
"step": 717
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 717,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.0098725192109916e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|