5905 lines
168 KiB
JSON
5905 lines
168 KiB
JSON
{
|
|
"best_global_step": 1560,
|
|
"best_metric": 0.31897813,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v53-20250506-203614/checkpoint-1560",
|
|
"epoch": 2.9972559780478245,
|
|
"eval_steps": 20,
|
|
"global_step": 2391,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0012544100352802822,
|
|
"grad_norm": 3.00976824760437,
|
|
"learning_rate": 9.999995684008912e-06,
|
|
"loss": 0.6037987470626831,
|
|
"memory(GiB)": 28.87,
|
|
"step": 1,
|
|
"token_acc": 0.8419128400116993,
|
|
"train_speed(iter/s)": 0.06443
|
|
},
|
|
{
|
|
"epoch": 0.006272050176401411,
|
|
"grad_norm": 1.8649096488952637,
|
|
"learning_rate": 9.999892100595329e-06,
|
|
"loss": 0.5593533515930176,
|
|
"memory(GiB)": 28.87,
|
|
"step": 5,
|
|
"token_acc": 0.830772646536412,
|
|
"train_speed(iter/s)": 0.125347
|
|
},
|
|
{
|
|
"epoch": 0.012544100352802822,
|
|
"grad_norm": 1.3044856786727905,
|
|
"learning_rate": 9.999568407038233e-06,
|
|
"loss": 0.49286112785339353,
|
|
"memory(GiB)": 28.87,
|
|
"step": 10,
|
|
"token_acc": 0.8464868234234858,
|
|
"train_speed(iter/s)": 0.140401
|
|
},
|
|
{
|
|
"epoch": 0.018816150529204233,
|
|
"grad_norm": 1.1055748462677002,
|
|
"learning_rate": 9.999028933299243e-06,
|
|
"loss": 0.45445590019226073,
|
|
"memory(GiB)": 28.87,
|
|
"step": 15,
|
|
"token_acc": 0.852595056694209,
|
|
"train_speed(iter/s)": 0.14752
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"grad_norm": 1.1405771970748901,
|
|
"learning_rate": 9.99827370266192e-06,
|
|
"loss": 0.42907133102416994,
|
|
"memory(GiB)": 28.87,
|
|
"step": 20,
|
|
"token_acc": 0.8665471789701654,
|
|
"train_speed(iter/s)": 0.148788
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"eval_loss": 0.44562071561813354,
|
|
"eval_runtime": 29.4389,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8629940758096486,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.03136025088200706,
|
|
"grad_norm": 1.0768901109695435,
|
|
"learning_rate": 9.99730274772184e-06,
|
|
"loss": 0.44150071144104003,
|
|
"memory(GiB)": 28.87,
|
|
"step": 25,
|
|
"token_acc": 0.8656667620254199,
|
|
"train_speed(iter/s)": 0.121315
|
|
},
|
|
{
|
|
"epoch": 0.037632301058408466,
|
|
"grad_norm": 1.097219705581665,
|
|
"learning_rate": 9.996116110385186e-06,
|
|
"loss": 0.43617844581604004,
|
|
"memory(GiB)": 28.87,
|
|
"step": 30,
|
|
"token_acc": 0.8606751454471655,
|
|
"train_speed(iter/s)": 0.127058
|
|
},
|
|
{
|
|
"epoch": 0.04390435123480988,
|
|
"grad_norm": 1.0356194972991943,
|
|
"learning_rate": 9.99471384186694e-06,
|
|
"loss": 0.42529120445251467,
|
|
"memory(GiB)": 28.87,
|
|
"step": 35,
|
|
"token_acc": 0.8749200767263428,
|
|
"train_speed(iter/s)": 0.130997
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"grad_norm": 1.0370515584945679,
|
|
"learning_rate": 9.99309600268868e-06,
|
|
"loss": 0.3916675567626953,
|
|
"memory(GiB)": 28.87,
|
|
"step": 40,
|
|
"token_acc": 0.8746708984022794,
|
|
"train_speed(iter/s)": 0.133319
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"eval_loss": 0.4175032675266266,
|
|
"eval_runtime": 29.2644,
|
|
"eval_samples_per_second": 17.598,
|
|
"eval_steps_per_second": 4.408,
|
|
"eval_token_acc": 0.869223636797717,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0564484515876127,
|
|
"grad_norm": 1.0402519702911377,
|
|
"learning_rate": 9.991262662675962e-06,
|
|
"loss": 0.4160133361816406,
|
|
"memory(GiB)": 28.87,
|
|
"step": 45,
|
|
"token_acc": 0.8742820425397991,
|
|
"train_speed(iter/s)": 0.121167
|
|
},
|
|
{
|
|
"epoch": 0.06272050176401411,
|
|
"grad_norm": 1.0216219425201416,
|
|
"learning_rate": 9.9892139009553e-06,
|
|
"loss": 0.4001720428466797,
|
|
"memory(GiB)": 28.87,
|
|
"step": 50,
|
|
"token_acc": 0.8626985512305813,
|
|
"train_speed(iter/s)": 0.124367
|
|
},
|
|
{
|
|
"epoch": 0.06899255194041552,
|
|
"grad_norm": 1.0897234678268433,
|
|
"learning_rate": 9.986949805950763e-06,
|
|
"loss": 0.4109466075897217,
|
|
"memory(GiB)": 28.87,
|
|
"step": 55,
|
|
"token_acc": 0.8756929944560443,
|
|
"train_speed(iter/s)": 0.126442
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"grad_norm": 1.0888633728027344,
|
|
"learning_rate": 9.984470475380154e-06,
|
|
"loss": 0.4020622730255127,
|
|
"memory(GiB)": 28.87,
|
|
"step": 60,
|
|
"token_acc": 0.880744833231021,
|
|
"train_speed(iter/s)": 0.128837
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"eval_loss": 0.4036850929260254,
|
|
"eval_runtime": 29.3294,
|
|
"eval_samples_per_second": 17.559,
|
|
"eval_steps_per_second": 4.398,
|
|
"eval_token_acc": 0.8717332282118051,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.08153665229321834,
|
|
"grad_norm": 0.957584023475647,
|
|
"learning_rate": 9.981776016250789e-06,
|
|
"loss": 0.39967339038848876,
|
|
"memory(GiB)": 28.87,
|
|
"step": 65,
|
|
"token_acc": 0.8750420639371845,
|
|
"train_speed(iter/s)": 0.121004
|
|
},
|
|
{
|
|
"epoch": 0.08780870246961976,
|
|
"grad_norm": 0.9902129769325256,
|
|
"learning_rate": 9.97886654485488e-06,
|
|
"loss": 0.3779956817626953,
|
|
"memory(GiB)": 28.87,
|
|
"step": 70,
|
|
"token_acc": 0.8874758475320569,
|
|
"train_speed(iter/s)": 0.12309
|
|
},
|
|
{
|
|
"epoch": 0.09408075264602117,
|
|
"grad_norm": 0.9632574915885925,
|
|
"learning_rate": 9.975742186764526e-06,
|
|
"loss": 0.3755610704421997,
|
|
"memory(GiB)": 28.87,
|
|
"step": 75,
|
|
"token_acc": 0.8887904599659284,
|
|
"train_speed(iter/s)": 0.125047
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"grad_norm": 1.083781123161316,
|
|
"learning_rate": 9.972403076826272e-06,
|
|
"loss": 0.3894859790802002,
|
|
"memory(GiB)": 28.87,
|
|
"step": 80,
|
|
"token_acc": 0.8634405980822363,
|
|
"train_speed(iter/s)": 0.126695
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"eval_loss": 0.3936529755592346,
|
|
"eval_runtime": 29.4382,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8746037121965032,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.10662485299882399,
|
|
"grad_norm": 1.0563101768493652,
|
|
"learning_rate": 9.96884935915531e-06,
|
|
"loss": 0.36060025691986086,
|
|
"memory(GiB)": 28.87,
|
|
"step": 85,
|
|
"token_acc": 0.8797673608319262,
|
|
"train_speed(iter/s)": 0.120902
|
|
},
|
|
{
|
|
"epoch": 0.1128969031752254,
|
|
"grad_norm": 0.9367106556892395,
|
|
"learning_rate": 9.965081187129248e-06,
|
|
"loss": 0.3710139274597168,
|
|
"memory(GiB)": 28.87,
|
|
"step": 90,
|
|
"token_acc": 0.8800344362928582,
|
|
"train_speed(iter/s)": 0.122022
|
|
},
|
|
{
|
|
"epoch": 0.11916895335162682,
|
|
"grad_norm": 1.0503040552139282,
|
|
"learning_rate": 9.961098723381495e-06,
|
|
"loss": 0.3768899917602539,
|
|
"memory(GiB)": 28.87,
|
|
"step": 95,
|
|
"token_acc": 0.8798389919495975,
|
|
"train_speed(iter/s)": 0.123839
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"grad_norm": 0.9691568613052368,
|
|
"learning_rate": 9.956902139794236e-06,
|
|
"loss": 0.40593662261962893,
|
|
"memory(GiB)": 28.87,
|
|
"step": 100,
|
|
"token_acc": 0.8564825788101446,
|
|
"train_speed(iter/s)": 0.125265
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"eval_loss": 0.3885510563850403,
|
|
"eval_runtime": 29.3787,
|
|
"eval_samples_per_second": 17.53,
|
|
"eval_steps_per_second": 4.391,
|
|
"eval_token_acc": 0.8755364806866953,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.13171305370442962,
|
|
"grad_norm": 0.912711501121521,
|
|
"learning_rate": 9.95249161749102e-06,
|
|
"loss": 0.3844183921813965,
|
|
"memory(GiB)": 28.87,
|
|
"step": 105,
|
|
"token_acc": 0.8794101344470655,
|
|
"train_speed(iter/s)": 0.120949
|
|
},
|
|
{
|
|
"epoch": 0.13798510388083104,
|
|
"grad_norm": 0.9416125416755676,
|
|
"learning_rate": 9.94786734682894e-06,
|
|
"loss": 0.37710275650024416,
|
|
"memory(GiB)": 28.87,
|
|
"step": 110,
|
|
"token_acc": 0.8741283259963379,
|
|
"train_speed(iter/s)": 0.122239
|
|
},
|
|
{
|
|
"epoch": 0.14425715405723247,
|
|
"grad_norm": 0.9355840086936951,
|
|
"learning_rate": 9.943029527390415e-06,
|
|
"loss": 0.3904699802398682,
|
|
"memory(GiB)": 28.87,
|
|
"step": 115,
|
|
"token_acc": 0.8733137387896601,
|
|
"train_speed(iter/s)": 0.123713
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"grad_norm": 0.8927087783813477,
|
|
"learning_rate": 9.93797836797458e-06,
|
|
"loss": 0.3807236909866333,
|
|
"memory(GiB)": 28.87,
|
|
"step": 120,
|
|
"token_acc": 0.8784389647442429,
|
|
"train_speed(iter/s)": 0.124943
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"eval_loss": 0.3819100260734558,
|
|
"eval_runtime": 29.4329,
|
|
"eval_samples_per_second": 17.497,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8767468588465873,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.15680125441003528,
|
|
"grad_norm": 1.0235486030578613,
|
|
"learning_rate": 9.932714086588276e-06,
|
|
"loss": 0.3710296630859375,
|
|
"memory(GiB)": 28.87,
|
|
"step": 125,
|
|
"token_acc": 0.8803818429768167,
|
|
"train_speed(iter/s)": 0.121182
|
|
},
|
|
{
|
|
"epoch": 0.16307330458643668,
|
|
"grad_norm": 0.949593722820282,
|
|
"learning_rate": 9.92723691043663e-06,
|
|
"loss": 0.3603027820587158,
|
|
"memory(GiB)": 28.87,
|
|
"step": 130,
|
|
"token_acc": 0.8798780487804878,
|
|
"train_speed(iter/s)": 0.122278
|
|
},
|
|
{
|
|
"epoch": 0.1693453547628381,
|
|
"grad_norm": 0.9754647612571716,
|
|
"learning_rate": 9.921547075913261e-06,
|
|
"loss": 0.3779932737350464,
|
|
"memory(GiB)": 28.87,
|
|
"step": 135,
|
|
"token_acc": 0.8775437975579543,
|
|
"train_speed(iter/s)": 0.123298
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"grad_norm": 0.8868964910507202,
|
|
"learning_rate": 9.915644828590074e-06,
|
|
"loss": 0.38963828086853025,
|
|
"memory(GiB)": 28.87,
|
|
"step": 140,
|
|
"token_acc": 0.8774046938055872,
|
|
"train_speed(iter/s)": 0.124471
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"eval_loss": 0.3778168857097626,
|
|
"eval_runtime": 29.4005,
|
|
"eval_samples_per_second": 17.517,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8770189163228933,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.18188945511564092,
|
|
"grad_norm": 0.9047612547874451,
|
|
"learning_rate": 9.909530423206657e-06,
|
|
"loss": 0.38094215393066405,
|
|
"memory(GiB)": 28.87,
|
|
"step": 145,
|
|
"token_acc": 0.8818703968341212,
|
|
"train_speed(iter/s)": 0.121105
|
|
},
|
|
{
|
|
"epoch": 0.18816150529204234,
|
|
"grad_norm": 0.8378849625587463,
|
|
"learning_rate": 9.903204123659288e-06,
|
|
"loss": 0.3542912483215332,
|
|
"memory(GiB)": 28.87,
|
|
"step": 150,
|
|
"token_acc": 0.8816124572509642,
|
|
"train_speed(iter/s)": 0.122126
|
|
},
|
|
{
|
|
"epoch": 0.19443355546844374,
|
|
"grad_norm": 0.9955030679702759,
|
|
"learning_rate": 9.896666202989553e-06,
|
|
"loss": 0.37951111793518066,
|
|
"memory(GiB)": 28.87,
|
|
"step": 155,
|
|
"token_acc": 0.8759518001034967,
|
|
"train_speed(iter/s)": 0.123261
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"grad_norm": 0.9958081841468811,
|
|
"learning_rate": 9.889916943372549e-06,
|
|
"loss": 0.3788171291351318,
|
|
"memory(GiB)": 28.87,
|
|
"step": 160,
|
|
"token_acc": 0.8731931436649585,
|
|
"train_speed(iter/s)": 0.124288
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"eval_loss": 0.3737770915031433,
|
|
"eval_runtime": 29.3525,
|
|
"eval_samples_per_second": 17.545,
|
|
"eval_steps_per_second": 4.395,
|
|
"eval_token_acc": 0.8785124563458795,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.20697765582124658,
|
|
"grad_norm": 0.8732254505157471,
|
|
"learning_rate": 9.882956636104714e-06,
|
|
"loss": 0.36996870040893554,
|
|
"memory(GiB)": 28.87,
|
|
"step": 165,
|
|
"token_acc": 0.8824279750554038,
|
|
"train_speed(iter/s)": 0.121446
|
|
},
|
|
{
|
|
"epoch": 0.21324970599764798,
|
|
"grad_norm": 0.9481439590454102,
|
|
"learning_rate": 9.875785581591253e-06,
|
|
"loss": 0.36848177909851076,
|
|
"memory(GiB)": 28.87,
|
|
"step": 170,
|
|
"token_acc": 0.8760355937404112,
|
|
"train_speed(iter/s)": 0.122195
|
|
},
|
|
{
|
|
"epoch": 0.2195217561740494,
|
|
"grad_norm": 1.0382639169692993,
|
|
"learning_rate": 9.868404089333171e-06,
|
|
"loss": 0.3516302347183228,
|
|
"memory(GiB)": 28.87,
|
|
"step": 175,
|
|
"token_acc": 0.8920562652894808,
|
|
"train_speed(iter/s)": 0.123108
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"grad_norm": 0.973171591758728,
|
|
"learning_rate": 9.860812477913915e-06,
|
|
"loss": 0.36341152191162107,
|
|
"memory(GiB)": 28.87,
|
|
"step": 180,
|
|
"token_acc": 0.8771131832309909,
|
|
"train_speed(iter/s)": 0.123961
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"eval_loss": 0.37089061737060547,
|
|
"eval_runtime": 29.3996,
|
|
"eval_samples_per_second": 17.517,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8789066620768534,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.23206585652685222,
|
|
"grad_norm": 0.8672347068786621,
|
|
"learning_rate": 9.853011074985628e-06,
|
|
"loss": 0.35667340755462645,
|
|
"memory(GiB)": 28.87,
|
|
"step": 185,
|
|
"token_acc": 0.8844639006185514,
|
|
"train_speed(iter/s)": 0.121474
|
|
},
|
|
{
|
|
"epoch": 0.23833790670325364,
|
|
"grad_norm": 1.0073734521865845,
|
|
"learning_rate": 9.845000217255e-06,
|
|
"loss": 0.36460084915161134,
|
|
"memory(GiB)": 28.87,
|
|
"step": 190,
|
|
"token_acc": 0.8793251008680768,
|
|
"train_speed(iter/s)": 0.122381
|
|
},
|
|
{
|
|
"epoch": 0.24460995687965503,
|
|
"grad_norm": 0.859307050704956,
|
|
"learning_rate": 9.836780250468744e-06,
|
|
"loss": 0.3675198554992676,
|
|
"memory(GiB)": 28.87,
|
|
"step": 195,
|
|
"token_acc": 0.8756779751143252,
|
|
"train_speed(iter/s)": 0.123104
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"grad_norm": 0.8690401911735535,
|
|
"learning_rate": 9.82835152939867e-06,
|
|
"loss": 0.3573744773864746,
|
|
"memory(GiB)": 28.87,
|
|
"step": 200,
|
|
"token_acc": 0.8790942326344848,
|
|
"train_speed(iter/s)": 0.123813
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"eval_loss": 0.367218554019928,
|
|
"eval_runtime": 29.4154,
|
|
"eval_samples_per_second": 17.508,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8802725016517775,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.2571540572324579,
|
|
"grad_norm": 0.9202991127967834,
|
|
"learning_rate": 9.81971441782637e-06,
|
|
"loss": 0.3682036161422729,
|
|
"memory(GiB)": 28.87,
|
|
"step": 205,
|
|
"token_acc": 0.8823869363618037,
|
|
"train_speed(iter/s)": 0.121607
|
|
},
|
|
{
|
|
"epoch": 0.26342610740885924,
|
|
"grad_norm": 1.1039172410964966,
|
|
"learning_rate": 9.810869288527528e-06,
|
|
"loss": 0.3587596893310547,
|
|
"memory(GiB)": 28.87,
|
|
"step": 210,
|
|
"token_acc": 0.8775054019819686,
|
|
"train_speed(iter/s)": 0.122214
|
|
},
|
|
{
|
|
"epoch": 0.26969815758526067,
|
|
"grad_norm": 1.0208649635314941,
|
|
"learning_rate": 9.801816523255811e-06,
|
|
"loss": 0.3483666181564331,
|
|
"memory(GiB)": 28.87,
|
|
"step": 215,
|
|
"token_acc": 0.8888964740417108,
|
|
"train_speed(iter/s)": 0.12288
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"grad_norm": 0.9139247536659241,
|
|
"learning_rate": 9.792556512726419e-06,
|
|
"loss": 0.363814640045166,
|
|
"memory(GiB)": 28.87,
|
|
"step": 220,
|
|
"token_acc": 0.8944489591798462,
|
|
"train_speed(iter/s)": 0.123698
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"eval_loss": 0.3650023639202118,
|
|
"eval_runtime": 29.425,
|
|
"eval_samples_per_second": 17.502,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8810831218873015,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.2822422579380635,
|
|
"grad_norm": 0.9822434186935425,
|
|
"learning_rate": 9.783089656599196e-06,
|
|
"loss": 0.3573582172393799,
|
|
"memory(GiB)": 28.87,
|
|
"step": 225,
|
|
"token_acc": 0.8843033462786615,
|
|
"train_speed(iter/s)": 0.121775
|
|
},
|
|
{
|
|
"epoch": 0.28851430811446493,
|
|
"grad_norm": 0.933601438999176,
|
|
"learning_rate": 9.773416363461401e-06,
|
|
"loss": 0.34882917404174807,
|
|
"memory(GiB)": 28.87,
|
|
"step": 230,
|
|
"token_acc": 0.8913467127081268,
|
|
"train_speed(iter/s)": 0.122386
|
|
},
|
|
{
|
|
"epoch": 0.2947863582908663,
|
|
"grad_norm": 0.921463668346405,
|
|
"learning_rate": 9.763537050810064e-06,
|
|
"loss": 0.357175350189209,
|
|
"memory(GiB)": 28.87,
|
|
"step": 235,
|
|
"token_acc": 0.8878133772142096,
|
|
"train_speed(iter/s)": 0.122933
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"grad_norm": 0.863202691078186,
|
|
"learning_rate": 9.753452145033961e-06,
|
|
"loss": 0.36011409759521484,
|
|
"memory(GiB)": 28.87,
|
|
"step": 240,
|
|
"token_acc": 0.8802597806642296,
|
|
"train_speed(iter/s)": 0.123406
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"eval_loss": 0.3631041347980499,
|
|
"eval_runtime": 29.4014,
|
|
"eval_samples_per_second": 17.516,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8814828798116696,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.30733045864366915,
|
|
"grad_norm": 0.9408513307571411,
|
|
"learning_rate": 9.743162081395227e-06,
|
|
"loss": 0.3614873647689819,
|
|
"memory(GiB)": 28.87,
|
|
"step": 245,
|
|
"token_acc": 0.8843432165774723,
|
|
"train_speed(iter/s)": 0.121612
|
|
},
|
|
{
|
|
"epoch": 0.31360250882007057,
|
|
"grad_norm": 0.9375274777412415,
|
|
"learning_rate": 9.73266730401056e-06,
|
|
"loss": 0.3697765588760376,
|
|
"memory(GiB)": 28.87,
|
|
"step": 250,
|
|
"token_acc": 0.8756347101174687,
|
|
"train_speed(iter/s)": 0.12238
|
|
},
|
|
{
|
|
"epoch": 0.319874558996472,
|
|
"grad_norm": 0.9580682516098022,
|
|
"learning_rate": 9.72196826583205e-06,
|
|
"loss": 0.3556410312652588,
|
|
"memory(GiB)": 28.87,
|
|
"step": 255,
|
|
"token_acc": 0.8826956988422909,
|
|
"train_speed(iter/s)": 0.122971
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"grad_norm": 0.9287164211273193,
|
|
"learning_rate": 9.711065428627638e-06,
|
|
"loss": 0.35933010578155516,
|
|
"memory(GiB)": 28.87,
|
|
"step": 260,
|
|
"token_acc": 0.8740019687192387,
|
|
"train_speed(iter/s)": 0.123433
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"eval_loss": 0.3610526919364929,
|
|
"eval_runtime": 29.4208,
|
|
"eval_samples_per_second": 17.505,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8819714728303416,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.3324186593492748,
|
|
"grad_norm": 0.8553582429885864,
|
|
"learning_rate": 9.699959262961182e-06,
|
|
"loss": 0.36598858833312986,
|
|
"memory(GiB)": 28.87,
|
|
"step": 265,
|
|
"token_acc": 0.8825705413797759,
|
|
"train_speed(iter/s)": 0.121765
|
|
},
|
|
{
|
|
"epoch": 0.3386907095256762,
|
|
"grad_norm": 0.8980154395103455,
|
|
"learning_rate": 9.688650248172145e-06,
|
|
"loss": 0.3778824329376221,
|
|
"memory(GiB)": 28.87,
|
|
"step": 270,
|
|
"token_acc": 0.8675095993484079,
|
|
"train_speed(iter/s)": 0.122329
|
|
},
|
|
{
|
|
"epoch": 0.3449627597020776,
|
|
"grad_norm": 0.8882062435150146,
|
|
"learning_rate": 9.677138872354916e-06,
|
|
"loss": 0.35373764038085936,
|
|
"memory(GiB)": 28.87,
|
|
"step": 275,
|
|
"token_acc": 0.8847052060503718,
|
|
"train_speed(iter/s)": 0.122902
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"grad_norm": 1.0147747993469238,
|
|
"learning_rate": 9.665425632337731e-06,
|
|
"loss": 0.37800116539001466,
|
|
"memory(GiB)": 28.87,
|
|
"step": 280,
|
|
"token_acc": 0.8738335199701381,
|
|
"train_speed(iter/s)": 0.123477
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"eval_loss": 0.3590083718299866,
|
|
"eval_runtime": 29.4336,
|
|
"eval_samples_per_second": 17.497,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8824434092688317,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.3575068600548804,
|
|
"grad_norm": 0.9124870300292969,
|
|
"learning_rate": 9.653511033661242e-06,
|
|
"loss": 0.3612790584564209,
|
|
"memory(GiB)": 28.87,
|
|
"step": 285,
|
|
"token_acc": 0.8852118028240377,
|
|
"train_speed(iter/s)": 0.121866
|
|
},
|
|
{
|
|
"epoch": 0.36377891023128184,
|
|
"grad_norm": 0.8596437573432922,
|
|
"learning_rate": 9.641395590556689e-06,
|
|
"loss": 0.3433859825134277,
|
|
"memory(GiB)": 28.87,
|
|
"step": 290,
|
|
"token_acc": 0.9010951837286989,
|
|
"train_speed(iter/s)": 0.122379
|
|
},
|
|
{
|
|
"epoch": 0.37005096040768326,
|
|
"grad_norm": 0.9107604622840881,
|
|
"learning_rate": 9.629079825923712e-06,
|
|
"loss": 0.3564929962158203,
|
|
"memory(GiB)": 28.87,
|
|
"step": 295,
|
|
"token_acc": 0.8857908060314782,
|
|
"train_speed(iter/s)": 0.122912
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"grad_norm": 0.9795000553131104,
|
|
"learning_rate": 9.616564271307779e-06,
|
|
"loss": 0.3729372024536133,
|
|
"memory(GiB)": 28.87,
|
|
"step": 300,
|
|
"token_acc": 0.8755715825867133,
|
|
"train_speed(iter/s)": 0.123491
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"eval_loss": 0.3561505973339081,
|
|
"eval_runtime": 29.4641,
|
|
"eval_samples_per_second": 17.479,
|
|
"eval_steps_per_second": 4.378,
|
|
"eval_token_acc": 0.8826766013913797,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.3825950607604861,
|
|
"grad_norm": 0.8801693916320801,
|
|
"learning_rate": 9.603849466877249e-06,
|
|
"loss": 0.33815276622772217,
|
|
"memory(GiB)": 28.87,
|
|
"step": 305,
|
|
"token_acc": 0.8876053110467076,
|
|
"train_speed(iter/s)": 0.121969
|
|
},
|
|
{
|
|
"epoch": 0.3888671109368875,
|
|
"grad_norm": 0.8597840666770935,
|
|
"learning_rate": 9.59093596140005e-06,
|
|
"loss": 0.35080363750457766,
|
|
"memory(GiB)": 28.87,
|
|
"step": 310,
|
|
"token_acc": 0.8818184958717656,
|
|
"train_speed(iter/s)": 0.122478
|
|
},
|
|
{
|
|
"epoch": 0.3951391611132889,
|
|
"grad_norm": 0.8145344853401184,
|
|
"learning_rate": 9.577824312220006e-06,
|
|
"loss": 0.3441263914108276,
|
|
"memory(GiB)": 28.87,
|
|
"step": 315,
|
|
"token_acc": 0.8895818188774535,
|
|
"train_speed(iter/s)": 0.122834
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"grad_norm": 0.850265383720398,
|
|
"learning_rate": 9.564515085232772e-06,
|
|
"loss": 0.34675819873809816,
|
|
"memory(GiB)": 28.87,
|
|
"step": 320,
|
|
"token_acc": 0.8802672898561309,
|
|
"train_speed(iter/s)": 0.123297
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"eval_loss": 0.35628461837768555,
|
|
"eval_runtime": 29.2861,
|
|
"eval_samples_per_second": 17.585,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8831707466034457,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.40768326146609174,
|
|
"grad_norm": 0.8921403288841248,
|
|
"learning_rate": 9.55100885486142e-06,
|
|
"loss": 0.3695497989654541,
|
|
"memory(GiB)": 28.87,
|
|
"step": 325,
|
|
"token_acc": 0.8874945370543798,
|
|
"train_speed(iter/s)": 0.121931
|
|
},
|
|
{
|
|
"epoch": 0.41395531164249316,
|
|
"grad_norm": 1.006880760192871,
|
|
"learning_rate": 9.537306204031628e-06,
|
|
"loss": 0.3622285842895508,
|
|
"memory(GiB)": 28.87,
|
|
"step": 330,
|
|
"token_acc": 0.8859732356116873,
|
|
"train_speed(iter/s)": 0.122367
|
|
},
|
|
{
|
|
"epoch": 0.42022736181889453,
|
|
"grad_norm": 0.9347439408302307,
|
|
"learning_rate": 9.523407724146548e-06,
|
|
"loss": 0.35402708053588866,
|
|
"memory(GiB)": 28.87,
|
|
"step": 335,
|
|
"token_acc": 0.879896406604079,
|
|
"train_speed(iter/s)": 0.122904
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"grad_norm": 0.8348590135574341,
|
|
"learning_rate": 9.509314015061263e-06,
|
|
"loss": 0.33470354080200193,
|
|
"memory(GiB)": 28.87,
|
|
"step": 340,
|
|
"token_acc": 0.888100010191256,
|
|
"train_speed(iter/s)": 0.123264
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"eval_loss": 0.35398781299591064,
|
|
"eval_runtime": 29.3885,
|
|
"eval_samples_per_second": 17.524,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8837870400701797,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.4327714621716974,
|
|
"grad_norm": 0.843921422958374,
|
|
"learning_rate": 9.495025685056898e-06,
|
|
"loss": 0.343002462387085,
|
|
"memory(GiB)": 28.87,
|
|
"step": 345,
|
|
"token_acc": 0.8873276256453827,
|
|
"train_speed(iter/s)": 0.121931
|
|
},
|
|
{
|
|
"epoch": 0.4390435123480988,
|
|
"grad_norm": 0.8519654273986816,
|
|
"learning_rate": 9.480543350814376e-06,
|
|
"loss": 0.35919780731201173,
|
|
"memory(GiB)": 28.87,
|
|
"step": 350,
|
|
"token_acc": 0.881159420289855,
|
|
"train_speed(iter/s)": 0.122444
|
|
},
|
|
{
|
|
"epoch": 0.4453155625245002,
|
|
"grad_norm": 0.8579279184341431,
|
|
"learning_rate": 9.465867637387793e-06,
|
|
"loss": 0.37248964309692384,
|
|
"memory(GiB)": 28.87,
|
|
"step": 355,
|
|
"token_acc": 0.8757786153540964,
|
|
"train_speed(iter/s)": 0.122942
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"grad_norm": 0.789768636226654,
|
|
"learning_rate": 9.450999178177445e-06,
|
|
"loss": 0.3402097702026367,
|
|
"memory(GiB)": 28.87,
|
|
"step": 360,
|
|
"token_acc": 0.8902190332326284,
|
|
"train_speed(iter/s)": 0.123304
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"eval_loss": 0.35195934772491455,
|
|
"eval_runtime": 29.3974,
|
|
"eval_samples_per_second": 17.519,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8847198085603718,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.457859662877303,
|
|
"grad_norm": 0.8919506669044495,
|
|
"learning_rate": 9.435938614902494e-06,
|
|
"loss": 0.32901549339294434,
|
|
"memory(GiB)": 28.87,
|
|
"step": 365,
|
|
"token_acc": 0.8874121986962454,
|
|
"train_speed(iter/s)": 0.122069
|
|
},
|
|
{
|
|
"epoch": 0.46413171305370443,
|
|
"grad_norm": 0.881273627281189,
|
|
"learning_rate": 9.42068659757326e-06,
|
|
"loss": 0.3595102548599243,
|
|
"memory(GiB)": 28.87,
|
|
"step": 370,
|
|
"token_acc": 0.8718952901164588,
|
|
"train_speed(iter/s)": 0.122435
|
|
},
|
|
{
|
|
"epoch": 0.47040376323010585,
|
|
"grad_norm": 0.900892972946167,
|
|
"learning_rate": 9.405243784463181e-06,
|
|
"loss": 0.34714303016662595,
|
|
"memory(GiB)": 28.87,
|
|
"step": 375,
|
|
"token_acc": 0.893684034176579,
|
|
"train_speed(iter/s)": 0.122895
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"grad_norm": 0.9816926717758179,
|
|
"learning_rate": 9.389610842080394e-06,
|
|
"loss": 0.3555105209350586,
|
|
"memory(GiB)": 28.87,
|
|
"step": 380,
|
|
"token_acc": 0.880410447761194,
|
|
"train_speed(iter/s)": 0.123308
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"eval_loss": 0.34997043013572693,
|
|
"eval_runtime": 29.336,
|
|
"eval_samples_per_second": 17.555,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8844144379237018,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.48294786358290864,
|
|
"grad_norm": 0.8801642060279846,
|
|
"learning_rate": 9.373788445138972e-06,
|
|
"loss": 0.3438990592956543,
|
|
"memory(GiB)": 28.87,
|
|
"step": 385,
|
|
"token_acc": 0.888115044818469,
|
|
"train_speed(iter/s)": 0.122133
|
|
},
|
|
{
|
|
"epoch": 0.48921991375931007,
|
|
"grad_norm": 0.8868786692619324,
|
|
"learning_rate": 9.357777276529793e-06,
|
|
"loss": 0.3474756956100464,
|
|
"memory(GiB)": 28.87,
|
|
"step": 390,
|
|
"token_acc": 0.8811514138256887,
|
|
"train_speed(iter/s)": 0.122604
|
|
},
|
|
{
|
|
"epoch": 0.4954919639357115,
|
|
"grad_norm": 0.8451352119445801,
|
|
"learning_rate": 9.341578027291085e-06,
|
|
"loss": 0.32396225929260253,
|
|
"memory(GiB)": 28.87,
|
|
"step": 395,
|
|
"token_acc": 0.8982433222180206,
|
|
"train_speed(iter/s)": 0.122966
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"grad_norm": 0.9279198050498962,
|
|
"learning_rate": 9.325191396578589e-06,
|
|
"loss": 0.35566129684448244,
|
|
"memory(GiB)": 28.87,
|
|
"step": 400,
|
|
"token_acc": 0.883422080227192,
|
|
"train_speed(iter/s)": 0.123361
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"eval_loss": 0.3484845459461212,
|
|
"eval_runtime": 29.3966,
|
|
"eval_samples_per_second": 17.519,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8852472669328018,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.5080360642885143,
|
|
"grad_norm": 0.8177010416984558,
|
|
"learning_rate": 9.308618091635382e-06,
|
|
"loss": 0.3363117933273315,
|
|
"memory(GiB)": 28.87,
|
|
"step": 405,
|
|
"token_acc": 0.8922487570299128,
|
|
"train_speed(iter/s)": 0.122214
|
|
},
|
|
{
|
|
"epoch": 0.5143081144649158,
|
|
"grad_norm": 0.9669262170791626,
|
|
"learning_rate": 9.291858827761359e-06,
|
|
"loss": 0.34157965183258054,
|
|
"memory(GiB)": 28.87,
|
|
"step": 410,
|
|
"token_acc": 0.8911739502999143,
|
|
"train_speed(iter/s)": 0.12252
|
|
},
|
|
{
|
|
"epoch": 0.5205801646413172,
|
|
"grad_norm": 1.5560276508331299,
|
|
"learning_rate": 9.274914328282359e-06,
|
|
"loss": 0.36170029640197754,
|
|
"memory(GiB)": 28.87,
|
|
"step": 415,
|
|
"token_acc": 0.8824226748572045,
|
|
"train_speed(iter/s)": 0.122859
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"grad_norm": 0.9071317911148071,
|
|
"learning_rate": 9.257785324518943e-06,
|
|
"loss": 0.3503504753112793,
|
|
"memory(GiB)": 28.87,
|
|
"step": 420,
|
|
"token_acc": 0.8918499451111027,
|
|
"train_speed(iter/s)": 0.123193
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"eval_loss": 0.34762412309646606,
|
|
"eval_runtime": 29.3255,
|
|
"eval_samples_per_second": 17.562,
|
|
"eval_steps_per_second": 4.399,
|
|
"eval_token_acc": 0.8854193849280159,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.5331242649941199,
|
|
"grad_norm": 0.8516839146614075,
|
|
"learning_rate": 9.240472555754835e-06,
|
|
"loss": 0.33733839988708497,
|
|
"memory(GiB)": 28.87,
|
|
"step": 425,
|
|
"token_acc": 0.8860463239447987,
|
|
"train_speed(iter/s)": 0.122243
|
|
},
|
|
{
|
|
"epoch": 0.5393963151705213,
|
|
"grad_norm": 1.0013768672943115,
|
|
"learning_rate": 9.222976769205013e-06,
|
|
"loss": 0.34144785404205324,
|
|
"memory(GiB)": 28.87,
|
|
"step": 430,
|
|
"token_acc": 0.8886329988484948,
|
|
"train_speed(iter/s)": 0.122568
|
|
},
|
|
{
|
|
"epoch": 0.5456683653469228,
|
|
"grad_norm": 0.8237244486808777,
|
|
"learning_rate": 9.205298719983458e-06,
|
|
"loss": 0.34563274383544923,
|
|
"memory(GiB)": 28.87,
|
|
"step": 435,
|
|
"token_acc": 0.889361264442692,
|
|
"train_speed(iter/s)": 0.122842
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"grad_norm": 0.8497081995010376,
|
|
"learning_rate": 9.187439171070563e-06,
|
|
"loss": 0.3556859016418457,
|
|
"memory(GiB)": 28.87,
|
|
"step": 440,
|
|
"token_acc": 0.8856649117871639,
|
|
"train_speed(iter/s)": 0.123161
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"eval_loss": 0.3459347188472748,
|
|
"eval_runtime": 29.345,
|
|
"eval_samples_per_second": 17.55,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8852472669328018,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.5582124656997256,
|
|
"grad_norm": 1.3792065382003784,
|
|
"learning_rate": 9.169398893280208e-06,
|
|
"loss": 0.3321459054946899,
|
|
"memory(GiB)": 28.87,
|
|
"step": 445,
|
|
"token_acc": 0.8902999810652168,
|
|
"train_speed(iter/s)": 0.122142
|
|
},
|
|
{
|
|
"epoch": 0.564484515876127,
|
|
"grad_norm": 0.788133978843689,
|
|
"learning_rate": 9.151178665226486e-06,
|
|
"loss": 0.3413043260574341,
|
|
"memory(GiB)": 28.87,
|
|
"step": 450,
|
|
"token_acc": 0.8891621577307756,
|
|
"train_speed(iter/s)": 0.122465
|
|
},
|
|
{
|
|
"epoch": 0.5707565660525284,
|
|
"grad_norm": 0.7931389808654785,
|
|
"learning_rate": 9.132779273290103e-06,
|
|
"loss": 0.3318148612976074,
|
|
"memory(GiB)": 28.87,
|
|
"step": 455,
|
|
"token_acc": 0.8939530523203404,
|
|
"train_speed(iter/s)": 0.122682
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"grad_norm": 0.8171889781951904,
|
|
"learning_rate": 9.114201511584428e-06,
|
|
"loss": 0.3486793994903564,
|
|
"memory(GiB)": 28.87,
|
|
"step": 460,
|
|
"token_acc": 0.8884144810017581,
|
|
"train_speed(iter/s)": 0.122934
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"eval_loss": 0.3437627851963043,
|
|
"eval_runtime": 29.3507,
|
|
"eval_samples_per_second": 17.546,
|
|
"eval_steps_per_second": 4.395,
|
|
"eval_token_acc": 0.8861966920031759,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.5833006664053313,
|
|
"grad_norm": 0.9594098329544067,
|
|
"learning_rate": 9.095446181921237e-06,
|
|
"loss": 0.35313169956207274,
|
|
"memory(GiB)": 28.87,
|
|
"step": 465,
|
|
"token_acc": 0.8879731787460262,
|
|
"train_speed(iter/s)": 0.122073
|
|
},
|
|
{
|
|
"epoch": 0.5895727165817326,
|
|
"grad_norm": 0.7837137579917908,
|
|
"learning_rate": 9.07651409377609e-06,
|
|
"loss": 0.344076943397522,
|
|
"memory(GiB)": 28.87,
|
|
"step": 470,
|
|
"token_acc": 0.8796075311588438,
|
|
"train_speed(iter/s)": 0.122438
|
|
},
|
|
{
|
|
"epoch": 0.595844766758134,
|
|
"grad_norm": 0.8649491667747498,
|
|
"learning_rate": 9.057406064253404e-06,
|
|
"loss": 0.356815505027771,
|
|
"memory(GiB)": 28.87,
|
|
"step": 475,
|
|
"token_acc": 0.8810882595743154,
|
|
"train_speed(iter/s)": 0.12278
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"grad_norm": 0.8207665681838989,
|
|
"learning_rate": 9.038122918051184e-06,
|
|
"loss": 0.33744909763336184,
|
|
"memory(GiB)": 28.87,
|
|
"step": 480,
|
|
"token_acc": 0.8888801540819936,
|
|
"train_speed(iter/s)": 0.123064
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"eval_loss": 0.3429865837097168,
|
|
"eval_runtime": 29.3492,
|
|
"eval_samples_per_second": 17.547,
|
|
"eval_steps_per_second": 4.395,
|
|
"eval_token_acc": 0.8863299446446319,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.6083888671109369,
|
|
"grad_norm": 0.8997246623039246,
|
|
"learning_rate": 9.018665487425426e-06,
|
|
"loss": 0.34094789028167727,
|
|
"memory(GiB)": 28.87,
|
|
"step": 485,
|
|
"token_acc": 0.8909048190883935,
|
|
"train_speed(iter/s)": 0.122119
|
|
},
|
|
{
|
|
"epoch": 0.6146609172873383,
|
|
"grad_norm": 0.8832601308822632,
|
|
"learning_rate": 8.999034612154204e-06,
|
|
"loss": 0.34579076766967776,
|
|
"memory(GiB)": 28.87,
|
|
"step": 490,
|
|
"token_acc": 0.889549670855775,
|
|
"train_speed(iter/s)": 0.122448
|
|
},
|
|
{
|
|
"epoch": 0.6209329674637397,
|
|
"grad_norm": 0.9071618914604187,
|
|
"learning_rate": 8.979231139501417e-06,
|
|
"loss": 0.33304905891418457,
|
|
"memory(GiB)": 28.87,
|
|
"step": 495,
|
|
"token_acc": 0.892831455286502,
|
|
"train_speed(iter/s)": 0.122757
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"grad_norm": 1.053952693939209,
|
|
"learning_rate": 8.95925592418023e-06,
|
|
"loss": 0.3448563814163208,
|
|
"memory(GiB)": 28.87,
|
|
"step": 500,
|
|
"token_acc": 0.876147859922179,
|
|
"train_speed(iter/s)": 0.123039
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"eval_loss": 0.3429498076438904,
|
|
"eval_runtime": 29.3653,
|
|
"eval_samples_per_second": 17.538,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8868574030170618,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.6334770678165426,
|
|
"grad_norm": 0.9261718392372131,
|
|
"learning_rate": 8.939109828316184e-06,
|
|
"loss": 0.3366121292114258,
|
|
"memory(GiB)": 28.87,
|
|
"step": 505,
|
|
"token_acc": 0.8902018439384988,
|
|
"train_speed(iter/s)": 0.122191
|
|
},
|
|
{
|
|
"epoch": 0.639749117992944,
|
|
"grad_norm": 0.8245139122009277,
|
|
"learning_rate": 8.918793721409973e-06,
|
|
"loss": 0.3344353914260864,
|
|
"memory(GiB)": 28.87,
|
|
"step": 510,
|
|
"token_acc": 0.8851031265942716,
|
|
"train_speed(iter/s)": 0.122447
|
|
},
|
|
{
|
|
"epoch": 0.6460211681693454,
|
|
"grad_norm": 0.7967011332511902,
|
|
"learning_rate": 8.898308480299937e-06,
|
|
"loss": 0.33919148445129393,
|
|
"memory(GiB)": 28.87,
|
|
"step": 515,
|
|
"token_acc": 0.889465313541351,
|
|
"train_speed(iter/s)": 0.122726
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"grad_norm": 0.7725489139556885,
|
|
"learning_rate": 8.877654989124202e-06,
|
|
"loss": 0.32682027816772463,
|
|
"memory(GiB)": 28.87,
|
|
"step": 520,
|
|
"token_acc": 0.9023533671252716,
|
|
"train_speed(iter/s)": 0.123046
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"eval_loss": 0.3419317901134491,
|
|
"eval_runtime": 29.3667,
|
|
"eval_samples_per_second": 17.537,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8866020021209379,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.6585652685221481,
|
|
"grad_norm": 0.8528004884719849,
|
|
"learning_rate": 8.856834139282531e-06,
|
|
"loss": 0.32720084190368653,
|
|
"memory(GiB)": 28.87,
|
|
"step": 525,
|
|
"token_acc": 0.8922637415946852,
|
|
"train_speed(iter/s)": 0.122236
|
|
},
|
|
{
|
|
"epoch": 0.6648373186985496,
|
|
"grad_norm": 0.8266517519950867,
|
|
"learning_rate": 8.835846829397843e-06,
|
|
"loss": 0.33587045669555665,
|
|
"memory(GiB)": 28.87,
|
|
"step": 530,
|
|
"token_acc": 0.8928849794009576,
|
|
"train_speed(iter/s)": 0.122551
|
|
},
|
|
{
|
|
"epoch": 0.671109368874951,
|
|
"grad_norm": 0.9410281181335449,
|
|
"learning_rate": 8.814693965277435e-06,
|
|
"loss": 0.334349536895752,
|
|
"memory(GiB)": 28.87,
|
|
"step": 535,
|
|
"token_acc": 0.8876769459036773,
|
|
"train_speed(iter/s)": 0.122795
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"grad_norm": 0.9425005912780762,
|
|
"learning_rate": 8.793376459873888e-06,
|
|
"loss": 0.34319519996643066,
|
|
"memory(GiB)": 28.87,
|
|
"step": 540,
|
|
"token_acc": 0.893324486711637,
|
|
"train_speed(iter/s)": 0.123071
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"eval_loss": 0.3407135009765625,
|
|
"eval_runtime": 29.3625,
|
|
"eval_samples_per_second": 17.539,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8873793091960979,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6836534692277538,
|
|
"grad_norm": 0.8792872428894043,
|
|
"learning_rate": 8.771895233245655e-06,
|
|
"loss": 0.3311576843261719,
|
|
"memory(GiB)": 28.87,
|
|
"step": 545,
|
|
"token_acc": 0.8878104806138933,
|
|
"train_speed(iter/s)": 0.122276
|
|
},
|
|
{
|
|
"epoch": 0.6899255194041553,
|
|
"grad_norm": 0.8647459149360657,
|
|
"learning_rate": 8.750251212517364e-06,
|
|
"loss": 0.3273744583129883,
|
|
"memory(GiB)": 28.87,
|
|
"step": 550,
|
|
"token_acc": 0.9019584187879519,
|
|
"train_speed(iter/s)": 0.122511
|
|
},
|
|
{
|
|
"epoch": 0.6961975695805567,
|
|
"grad_norm": 0.8187234401702881,
|
|
"learning_rate": 8.728445331839796e-06,
|
|
"loss": 0.34240922927856443,
|
|
"memory(GiB)": 28.87,
|
|
"step": 555,
|
|
"token_acc": 0.8956161234709794,
|
|
"train_speed(iter/s)": 0.122795
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"grad_norm": 0.819709837436676,
|
|
"learning_rate": 8.706478532349567e-06,
|
|
"loss": 0.3307104825973511,
|
|
"memory(GiB)": 28.87,
|
|
"step": 560,
|
|
"token_acc": 0.8898898898898899,
|
|
"train_speed(iter/s)": 0.123049
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"eval_loss": 0.33920466899871826,
|
|
"eval_runtime": 29.2564,
|
|
"eval_samples_per_second": 17.603,
|
|
"eval_steps_per_second": 4.409,
|
|
"eval_token_acc": 0.887334891648946,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.7087416699333595,
|
|
"grad_norm": 0.9903004765510559,
|
|
"learning_rate": 8.684351762128511e-06,
|
|
"loss": 0.3422609806060791,
|
|
"memory(GiB)": 28.87,
|
|
"step": 565,
|
|
"token_acc": 0.8903627722134206,
|
|
"train_speed(iter/s)": 0.122313
|
|
},
|
|
{
|
|
"epoch": 0.7150137201097608,
|
|
"grad_norm": 0.8508421778678894,
|
|
"learning_rate": 8.662065976162765e-06,
|
|
"loss": 0.33224852085113527,
|
|
"memory(GiB)": 28.87,
|
|
"step": 570,
|
|
"token_acc": 0.8952832987244141,
|
|
"train_speed(iter/s)": 0.122616
|
|
},
|
|
{
|
|
"epoch": 0.7212857702861623,
|
|
"grad_norm": 0.9183920621871948,
|
|
"learning_rate": 8.639622136301541e-06,
|
|
"loss": 0.33959968090057374,
|
|
"memory(GiB)": 30.83,
|
|
"step": 575,
|
|
"token_acc": 0.8883200380997738,
|
|
"train_speed(iter/s)": 0.122861
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"grad_norm": 0.8977269530296326,
|
|
"learning_rate": 8.617021211215629e-06,
|
|
"loss": 0.3228691339492798,
|
|
"memory(GiB)": 30.83,
|
|
"step": 580,
|
|
"token_acc": 0.8865359042553191,
|
|
"train_speed(iter/s)": 0.123129
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"eval_loss": 0.3381815254688263,
|
|
"eval_runtime": 29.3858,
|
|
"eval_samples_per_second": 17.525,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.887751306153496,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.7338298706389651,
|
|
"grad_norm": 0.8259280323982239,
|
|
"learning_rate": 8.594264176355565e-06,
|
|
"loss": 0.3299635648727417,
|
|
"memory(GiB)": 30.83,
|
|
"step": 585,
|
|
"token_acc": 0.8886926504234737,
|
|
"train_speed(iter/s)": 0.122386
|
|
},
|
|
{
|
|
"epoch": 0.7401019208153665,
|
|
"grad_norm": 0.9581537246704102,
|
|
"learning_rate": 8.571352013909558e-06,
|
|
"loss": 0.3483741283416748,
|
|
"memory(GiB)": 30.83,
|
|
"step": 590,
|
|
"token_acc": 0.8793361921695778,
|
|
"train_speed(iter/s)": 0.122675
|
|
},
|
|
{
|
|
"epoch": 0.7463739709917679,
|
|
"grad_norm": 0.8338929414749146,
|
|
"learning_rate": 8.548285712761084e-06,
|
|
"loss": 0.3371764898300171,
|
|
"memory(GiB)": 30.83,
|
|
"step": 595,
|
|
"token_acc": 0.887261212985838,
|
|
"train_speed(iter/s)": 0.122898
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"grad_norm": 0.9058257341384888,
|
|
"learning_rate": 8.525066268446208e-06,
|
|
"loss": 0.3369316816329956,
|
|
"memory(GiB)": 30.83,
|
|
"step": 600,
|
|
"token_acc": 0.8893781157890907,
|
|
"train_speed(iter/s)": 0.123152
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"eval_loss": 0.33716732263565063,
|
|
"eval_runtime": 29.3779,
|
|
"eval_samples_per_second": 17.53,
|
|
"eval_steps_per_second": 4.391,
|
|
"eval_token_acc": 0.8880511245967719,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7589180713445708,
|
|
"grad_norm": 0.8647992610931396,
|
|
"learning_rate": 8.501694683110615e-06,
|
|
"loss": 0.3460502862930298,
|
|
"memory(GiB)": 30.83,
|
|
"step": 605,
|
|
"token_acc": 0.8899021439489947,
|
|
"train_speed(iter/s)": 0.122422
|
|
},
|
|
{
|
|
"epoch": 0.7651901215209722,
|
|
"grad_norm": 0.9482452273368835,
|
|
"learning_rate": 8.478171965466366e-06,
|
|
"loss": 0.3319687843322754,
|
|
"memory(GiB)": 30.83,
|
|
"step": 610,
|
|
"token_acc": 0.8881137119384495,
|
|
"train_speed(iter/s)": 0.122651
|
|
},
|
|
{
|
|
"epoch": 0.7714621716973736,
|
|
"grad_norm": 0.9784294366836548,
|
|
"learning_rate": 8.454499130748352e-06,
|
|
"loss": 0.3246720790863037,
|
|
"memory(GiB)": 30.83,
|
|
"step": 615,
|
|
"token_acc": 0.8938534900311643,
|
|
"train_speed(iter/s)": 0.122956
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"grad_norm": 0.8589239716529846,
|
|
"learning_rate": 8.43067720067048e-06,
|
|
"loss": 0.33240423202514646,
|
|
"memory(GiB)": 30.83,
|
|
"step": 620,
|
|
"token_acc": 0.8954405877094104,
|
|
"train_speed(iter/s)": 0.123205
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"eval_loss": 0.33614107966423035,
|
|
"eval_runtime": 29.3659,
|
|
"eval_samples_per_second": 17.537,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8878901109883459,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.7840062720501764,
|
|
"grad_norm": 0.9298199415206909,
|
|
"learning_rate": 8.40670720338158e-06,
|
|
"loss": 0.3492927312850952,
|
|
"memory(GiB)": 30.83,
|
|
"step": 625,
|
|
"token_acc": 0.8875478688841884,
|
|
"train_speed(iter/s)": 0.122475
|
|
},
|
|
{
|
|
"epoch": 0.7902783222265778,
|
|
"grad_norm": 0.9355754256248474,
|
|
"learning_rate": 8.382590173421029e-06,
|
|
"loss": 0.34650092124938964,
|
|
"memory(GiB)": 30.83,
|
|
"step": 630,
|
|
"token_acc": 0.8980740024851581,
|
|
"train_speed(iter/s)": 0.122741
|
|
},
|
|
{
|
|
"epoch": 0.7965503724029792,
|
|
"grad_norm": 0.8517131209373474,
|
|
"learning_rate": 8.358327151674095e-06,
|
|
"loss": 0.3493072986602783,
|
|
"memory(GiB)": 30.83,
|
|
"step": 635,
|
|
"token_acc": 0.8794758648566713,
|
|
"train_speed(iter/s)": 0.122993
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"grad_norm": 0.9389694333076477,
|
|
"learning_rate": 8.33391918532702e-06,
|
|
"loss": 0.34041428565979004,
|
|
"memory(GiB)": 30.83,
|
|
"step": 640,
|
|
"token_acc": 0.8885475048956716,
|
|
"train_speed(iter/s)": 0.123237
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"eval_loss": 0.3339459300041199,
|
|
"eval_runtime": 29.3874,
|
|
"eval_samples_per_second": 17.525,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.888389808393806,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.8090944727557821,
|
|
"grad_norm": 0.8025544285774231,
|
|
"learning_rate": 8.309367327821819e-06,
|
|
"loss": 0.319812536239624,
|
|
"memory(GiB)": 30.83,
|
|
"step": 645,
|
|
"token_acc": 0.8937441216095986,
|
|
"train_speed(iter/s)": 0.122531
|
|
},
|
|
{
|
|
"epoch": 0.8153665229321835,
|
|
"grad_norm": 0.9150540232658386,
|
|
"learning_rate": 8.284672638810813e-06,
|
|
"loss": 0.32463486194610597,
|
|
"memory(GiB)": 30.83,
|
|
"step": 650,
|
|
"token_acc": 0.8960157054582905,
|
|
"train_speed(iter/s)": 0.122736
|
|
},
|
|
{
|
|
"epoch": 0.8216385731085849,
|
|
"grad_norm": 0.7610167264938354,
|
|
"learning_rate": 8.259836184110904e-06,
|
|
"loss": 0.3184787750244141,
|
|
"memory(GiB)": 30.83,
|
|
"step": 655,
|
|
"token_acc": 0.8911636749168281,
|
|
"train_speed(iter/s)": 0.122944
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"grad_norm": 0.8336440920829773,
|
|
"learning_rate": 8.234859035657557e-06,
|
|
"loss": 0.34786210060119627,
|
|
"memory(GiB)": 30.83,
|
|
"step": 660,
|
|
"token_acc": 0.8883749488334015,
|
|
"train_speed(iter/s)": 0.123201
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"eval_loss": 0.33380812406539917,
|
|
"eval_runtime": 29.4213,
|
|
"eval_samples_per_second": 17.504,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8884841956815039,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.8341826734613876,
|
|
"grad_norm": 0.7955749034881592,
|
|
"learning_rate": 8.209742271458556e-06,
|
|
"loss": 0.33576648235321044,
|
|
"memory(GiB)": 30.83,
|
|
"step": 665,
|
|
"token_acc": 0.8916429159969202,
|
|
"train_speed(iter/s)": 0.122484
|
|
},
|
|
{
|
|
"epoch": 0.8404547236377891,
|
|
"grad_norm": 0.8152847290039062,
|
|
"learning_rate": 8.18448697554746e-06,
|
|
"loss": 0.3175548553466797,
|
|
"memory(GiB)": 30.83,
|
|
"step": 670,
|
|
"token_acc": 0.8854783940342233,
|
|
"train_speed(iter/s)": 0.122708
|
|
},
|
|
{
|
|
"epoch": 0.8467267738141905,
|
|
"grad_norm": 0.8721585869789124,
|
|
"learning_rate": 8.159094237936828e-06,
|
|
"loss": 0.3407304763793945,
|
|
"memory(GiB)": 30.83,
|
|
"step": 675,
|
|
"token_acc": 0.8833216654998833,
|
|
"train_speed(iter/s)": 0.122938
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"grad_norm": 0.9121716022491455,
|
|
"learning_rate": 8.133565154571169e-06,
|
|
"loss": 0.3379061222076416,
|
|
"memory(GiB)": 30.83,
|
|
"step": 680,
|
|
"token_acc": 0.8774592247819455,
|
|
"train_speed(iter/s)": 0.123134
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"eval_loss": 0.3322893977165222,
|
|
"eval_runtime": 29.3785,
|
|
"eval_samples_per_second": 17.53,
|
|
"eval_steps_per_second": 4.391,
|
|
"eval_token_acc": 0.888578582969202,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.8592708741669933,
|
|
"grad_norm": 0.7398496866226196,
|
|
"learning_rate": 8.107900827279638e-06,
|
|
"loss": 0.3155172109603882,
|
|
"memory(GiB)": 30.83,
|
|
"step": 685,
|
|
"token_acc": 0.895827825075353,
|
|
"train_speed(iter/s)": 0.122438
|
|
},
|
|
{
|
|
"epoch": 0.8655429243433947,
|
|
"grad_norm": 0.8711411952972412,
|
|
"learning_rate": 8.082102363728494e-06,
|
|
"loss": 0.3454484462738037,
|
|
"memory(GiB)": 30.83,
|
|
"step": 690,
|
|
"token_acc": 0.8851303206450403,
|
|
"train_speed(iter/s)": 0.122657
|
|
},
|
|
{
|
|
"epoch": 0.8718149745197962,
|
|
"grad_norm": 0.8494858741760254,
|
|
"learning_rate": 8.056170877373277e-06,
|
|
"loss": 0.36029322147369386,
|
|
"memory(GiB)": 30.83,
|
|
"step": 695,
|
|
"token_acc": 0.8825764562659092,
|
|
"train_speed(iter/s)": 0.122862
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"grad_norm": 0.7830603122711182,
|
|
"learning_rate": 8.030107487410766e-06,
|
|
"loss": 0.32322983741760253,
|
|
"memory(GiB)": 30.83,
|
|
"step": 700,
|
|
"token_acc": 0.8960877431026685,
|
|
"train_speed(iter/s)": 0.123089
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"eval_loss": 0.3324893116950989,
|
|
"eval_runtime": 29.4344,
|
|
"eval_samples_per_second": 17.497,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.888356495233442,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.884359074872599,
|
|
"grad_norm": 0.848610520362854,
|
|
"learning_rate": 8.003913318730662e-06,
|
|
"loss": 0.3297175645828247,
|
|
"memory(GiB)": 30.83,
|
|
"step": 705,
|
|
"token_acc": 0.8931060983995578,
|
|
"train_speed(iter/s)": 0.122471
|
|
},
|
|
{
|
|
"epoch": 0.8906311250490004,
|
|
"grad_norm": 0.8601354956626892,
|
|
"learning_rate": 7.97758950186705e-06,
|
|
"loss": 0.33080010414123534,
|
|
"memory(GiB)": 30.83,
|
|
"step": 710,
|
|
"token_acc": 0.8949430800932338,
|
|
"train_speed(iter/s)": 0.122686
|
|
},
|
|
{
|
|
"epoch": 0.8969031752254017,
|
|
"grad_norm": 0.9265198111534119,
|
|
"learning_rate": 7.951137172949595e-06,
|
|
"loss": 0.3245250225067139,
|
|
"memory(GiB)": 30.83,
|
|
"step": 715,
|
|
"token_acc": 0.895298551874628,
|
|
"train_speed(iter/s)": 0.122903
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"grad_norm": 0.8287230730056763,
|
|
"learning_rate": 7.924557473654516e-06,
|
|
"loss": 0.3210673570632935,
|
|
"memory(GiB)": 30.83,
|
|
"step": 720,
|
|
"token_acc": 0.8865173220523668,
|
|
"train_speed(iter/s)": 0.123104
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"eval_loss": 0.3312128484249115,
|
|
"eval_runtime": 29.3758,
|
|
"eval_samples_per_second": 17.531,
|
|
"eval_steps_per_second": 4.391,
|
|
"eval_token_acc": 0.888828431671932,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.9094472755782046,
|
|
"grad_norm": 0.8217957615852356,
|
|
"learning_rate": 7.897851551155306e-06,
|
|
"loss": 0.3300743579864502,
|
|
"memory(GiB)": 30.83,
|
|
"step": 725,
|
|
"token_acc": 0.890940345904522,
|
|
"train_speed(iter/s)": 0.122496
|
|
},
|
|
{
|
|
"epoch": 0.915719325754606,
|
|
"grad_norm": 0.8754700422286987,
|
|
"learning_rate": 7.871020558073217e-06,
|
|
"loss": 0.3481910228729248,
|
|
"memory(GiB)": 30.83,
|
|
"step": 730,
|
|
"token_acc": 0.8795096810140688,
|
|
"train_speed(iter/s)": 0.122719
|
|
},
|
|
{
|
|
"epoch": 0.9219913759310074,
|
|
"grad_norm": 0.9567843079566956,
|
|
"learning_rate": 7.844065652427523e-06,
|
|
"loss": 0.32534041404724123,
|
|
"memory(GiB)": 30.83,
|
|
"step": 735,
|
|
"token_acc": 0.8889229120416443,
|
|
"train_speed(iter/s)": 0.122947
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"grad_norm": 0.861889660358429,
|
|
"learning_rate": 7.816987997585535e-06,
|
|
"loss": 0.3210756778717041,
|
|
"memory(GiB)": 30.83,
|
|
"step": 740,
|
|
"token_acc": 0.8954586487049511,
|
|
"train_speed(iter/s)": 0.12311
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"eval_loss": 0.33026477694511414,
|
|
"eval_runtime": 29.443,
|
|
"eval_samples_per_second": 17.491,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.889078280374662,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.9345354762838103,
|
|
"grad_norm": 0.8503465056419373,
|
|
"learning_rate": 7.789788762212384e-06,
|
|
"loss": 0.3078526735305786,
|
|
"memory(GiB)": 30.83,
|
|
"step": 745,
|
|
"token_acc": 0.8963680387409201,
|
|
"train_speed(iter/s)": 0.122495
|
|
},
|
|
{
|
|
"epoch": 0.9408075264602117,
|
|
"grad_norm": 0.7904318571090698,
|
|
"learning_rate": 7.762469120220595e-06,
|
|
"loss": 0.32522361278533934,
|
|
"memory(GiB)": 30.83,
|
|
"step": 750,
|
|
"token_acc": 0.8939921307506054,
|
|
"train_speed(iter/s)": 0.122717
|
|
},
|
|
{
|
|
"epoch": 0.9470795766366131,
|
|
"grad_norm": 0.8881962895393372,
|
|
"learning_rate": 7.73503025071941e-06,
|
|
"loss": 0.33283185958862305,
|
|
"memory(GiB)": 30.83,
|
|
"step": 755,
|
|
"token_acc": 0.888671875,
|
|
"train_speed(iter/s)": 0.122911
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"grad_norm": 0.794685959815979,
|
|
"learning_rate": 7.7074733379639e-06,
|
|
"loss": 0.33060617446899415,
|
|
"memory(GiB)": 30.83,
|
|
"step": 760,
|
|
"token_acc": 0.8894294111685416,
|
|
"train_speed(iter/s)": 0.123122
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"eval_loss": 0.32935982942581177,
|
|
"eval_runtime": 29.5219,
|
|
"eval_samples_per_second": 17.445,
|
|
"eval_steps_per_second": 4.37,
|
|
"eval_token_acc": 0.889405859784908,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.9596236769894159,
|
|
"grad_norm": 0.8581651449203491,
|
|
"learning_rate": 7.679799571303861e-06,
|
|
"loss": 0.3343641996383667,
|
|
"memory(GiB)": 30.83,
|
|
"step": 765,
|
|
"token_acc": 0.8940541099253211,
|
|
"train_speed(iter/s)": 0.122584
|
|
},
|
|
{
|
|
"epoch": 0.9658957271658173,
|
|
"grad_norm": 0.9159526824951172,
|
|
"learning_rate": 7.65201014513247e-06,
|
|
"loss": 0.3283867359161377,
|
|
"memory(GiB)": 30.83,
|
|
"step": 770,
|
|
"token_acc": 0.8896209236881311,
|
|
"train_speed(iter/s)": 0.122761
|
|
},
|
|
{
|
|
"epoch": 0.9721677773422187,
|
|
"grad_norm": 0.8125607371330261,
|
|
"learning_rate": 7.62410625883474e-06,
|
|
"loss": 0.33158369064331056,
|
|
"memory(GiB)": 30.83,
|
|
"step": 775,
|
|
"token_acc": 0.8984477961634207,
|
|
"train_speed(iter/s)": 0.122932
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"grad_norm": 0.8378052711486816,
|
|
"learning_rate": 7.596089116735765e-06,
|
|
"loss": 0.32932515144348146,
|
|
"memory(GiB)": 30.83,
|
|
"step": 780,
|
|
"token_acc": 0.889555958314454,
|
|
"train_speed(iter/s)": 0.123137
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"eval_loss": 0.3282354474067688,
|
|
"eval_runtime": 29.4374,
|
|
"eval_samples_per_second": 17.495,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8892281895963,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.9847118776950216,
|
|
"grad_norm": 0.829319417476654,
|
|
"learning_rate": 7.567959928048723e-06,
|
|
"loss": 0.32491297721862794,
|
|
"memory(GiB)": 30.83,
|
|
"step": 785,
|
|
"token_acc": 0.8919960394979796,
|
|
"train_speed(iter/s)": 0.122552
|
|
},
|
|
{
|
|
"epoch": 0.990983927871423,
|
|
"grad_norm": 0.9168446660041809,
|
|
"learning_rate": 7.5397199068227e-06,
|
|
"loss": 0.33011536598205565,
|
|
"memory(GiB)": 30.83,
|
|
"step": 790,
|
|
"token_acc": 0.8893950507663205,
|
|
"train_speed(iter/s)": 0.122716
|
|
},
|
|
{
|
|
"epoch": 0.9972559780478244,
|
|
"grad_norm": 0.8191807866096497,
|
|
"learning_rate": 7.511370271890286e-06,
|
|
"loss": 0.32829596996307375,
|
|
"memory(GiB)": 30.83,
|
|
"step": 795,
|
|
"token_acc": 0.896822042039734,
|
|
"train_speed(iter/s)": 0.122911
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"grad_norm": 0.80043625831604,
|
|
"learning_rate": 7.482912246814975e-06,
|
|
"loss": 0.2824315071105957,
|
|
"memory(GiB)": 30.83,
|
|
"step": 800,
|
|
"token_acc": 0.9048456348395011,
|
|
"train_speed(iter/s)": 0.123208
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"eval_loss": 0.3272770643234253,
|
|
"eval_runtime": 29.4466,
|
|
"eval_samples_per_second": 17.489,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.8895391124263641,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.008780870246962,
|
|
"grad_norm": 0.8259254693984985,
|
|
"learning_rate": 7.454347059838351e-06,
|
|
"loss": 0.26992073059082033,
|
|
"memory(GiB)": 30.83,
|
|
"step": 805,
|
|
"token_acc": 0.8975824979114453,
|
|
"train_speed(iter/s)": 0.122691
|
|
},
|
|
{
|
|
"epoch": 1.0150529204233634,
|
|
"grad_norm": 0.8588547110557556,
|
|
"learning_rate": 7.425675943827084e-06,
|
|
"loss": 0.2826483726501465,
|
|
"memory(GiB)": 30.83,
|
|
"step": 810,
|
|
"token_acc": 0.9070769230769231,
|
|
"train_speed(iter/s)": 0.122895
|
|
},
|
|
{
|
|
"epoch": 1.021324970599765,
|
|
"grad_norm": 0.9410291910171509,
|
|
"learning_rate": 7.3969001362197135e-06,
|
|
"loss": 0.2646550416946411,
|
|
"memory(GiB)": 30.83,
|
|
"step": 815,
|
|
"token_acc": 0.9091513589892777,
|
|
"train_speed(iter/s)": 0.123089
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"grad_norm": 0.7985222339630127,
|
|
"learning_rate": 7.3680208789732385e-06,
|
|
"loss": 0.2572730779647827,
|
|
"memory(GiB)": 30.83,
|
|
"step": 820,
|
|
"token_acc": 0.9100758396533044,
|
|
"train_speed(iter/s)": 0.123265
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"eval_loss": 0.3340509831905365,
|
|
"eval_runtime": 29.3123,
|
|
"eval_samples_per_second": 17.569,
|
|
"eval_steps_per_second": 4.401,
|
|
"eval_token_acc": 0.8894946948792121,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.0338690709525675,
|
|
"grad_norm": 0.7672378420829773,
|
|
"learning_rate": 7.339039418509532e-06,
|
|
"loss": 0.26021647453308105,
|
|
"memory(GiB)": 30.83,
|
|
"step": 825,
|
|
"token_acc": 0.9020274970593072,
|
|
"train_speed(iter/s)": 0.122729
|
|
},
|
|
{
|
|
"epoch": 1.040141121128969,
|
|
"grad_norm": 0.9672802090644836,
|
|
"learning_rate": 7.309957005661521e-06,
|
|
"loss": 0.25925168991088865,
|
|
"memory(GiB)": 30.83,
|
|
"step": 830,
|
|
"token_acc": 0.9167668481719822,
|
|
"train_speed(iter/s)": 0.122939
|
|
},
|
|
{
|
|
"epoch": 1.0464131713053704,
|
|
"grad_norm": 0.862086832523346,
|
|
"learning_rate": 7.280774895619219e-06,
|
|
"loss": 0.2655090570449829,
|
|
"memory(GiB)": 30.83,
|
|
"step": 835,
|
|
"token_acc": 0.9072164948453608,
|
|
"train_speed(iter/s)": 0.123142
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"grad_norm": 0.8724802732467651,
|
|
"learning_rate": 7.25149434787555e-06,
|
|
"loss": 0.2601273536682129,
|
|
"memory(GiB)": 30.83,
|
|
"step": 840,
|
|
"token_acc": 0.9072434197102632,
|
|
"train_speed(iter/s)": 0.123337
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"eval_loss": 0.33454829454421997,
|
|
"eval_runtime": 29.4084,
|
|
"eval_samples_per_second": 17.512,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.8896834694546081,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.0589572716581732,
|
|
"grad_norm": 0.8254806399345398,
|
|
"learning_rate": 7.2221166261719755e-06,
|
|
"loss": 0.2561455726623535,
|
|
"memory(GiB)": 30.83,
|
|
"step": 845,
|
|
"token_acc": 0.9006843577219007,
|
|
"train_speed(iter/s)": 0.122791
|
|
},
|
|
{
|
|
"epoch": 1.0652293218345747,
|
|
"grad_norm": 0.7920213341712952,
|
|
"learning_rate": 7.192642998443975e-06,
|
|
"loss": 0.25534210205078123,
|
|
"memory(GiB)": 30.83,
|
|
"step": 850,
|
|
"token_acc": 0.9210035842293907,
|
|
"train_speed(iter/s)": 0.122955
|
|
},
|
|
{
|
|
"epoch": 1.071501372010976,
|
|
"grad_norm": 0.8850580453872681,
|
|
"learning_rate": 7.163074736766299e-06,
|
|
"loss": 0.2532507419586182,
|
|
"memory(GiB)": 30.83,
|
|
"step": 855,
|
|
"token_acc": 0.9125168236877523,
|
|
"train_speed(iter/s)": 0.123087
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"grad_norm": 0.8684042096138,
|
|
"learning_rate": 7.133413117298081e-06,
|
|
"loss": 0.2542534828186035,
|
|
"memory(GiB)": 30.83,
|
|
"step": 860,
|
|
"token_acc": 0.9134933617377411,
|
|
"train_speed(iter/s)": 0.123259
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"eval_loss": 0.334031879901886,
|
|
"eval_runtime": 29.396,
|
|
"eval_samples_per_second": 17.519,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.889344785657574,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.084045472363779,
|
|
"grad_norm": 0.8075751066207886,
|
|
"learning_rate": 7.103659420227755e-06,
|
|
"loss": 0.2629993438720703,
|
|
"memory(GiB)": 30.83,
|
|
"step": 865,
|
|
"token_acc": 0.9003039997875985,
|
|
"train_speed(iter/s)": 0.122739
|
|
},
|
|
{
|
|
"epoch": 1.0903175225401802,
|
|
"grad_norm": 0.9507380723953247,
|
|
"learning_rate": 7.0738149297178005e-06,
|
|
"loss": 0.2679288387298584,
|
|
"memory(GiB)": 32.99,
|
|
"step": 870,
|
|
"token_acc": 0.9241651993945572,
|
|
"train_speed(iter/s)": 0.122888
|
|
},
|
|
{
|
|
"epoch": 1.0965895727165818,
|
|
"grad_norm": 0.8346853852272034,
|
|
"learning_rate": 7.04388093384932e-06,
|
|
"loss": 0.24904875755310057,
|
|
"memory(GiB)": 32.99,
|
|
"step": 875,
|
|
"token_acc": 0.9159838773622307,
|
|
"train_speed(iter/s)": 0.123027
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"grad_norm": 0.8962292075157166,
|
|
"learning_rate": 7.013858724566449e-06,
|
|
"loss": 0.26425485610961913,
|
|
"memory(GiB)": 32.99,
|
|
"step": 880,
|
|
"token_acc": 0.9163976759199484,
|
|
"train_speed(iter/s)": 0.123183
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"eval_loss": 0.33524197340011597,
|
|
"eval_runtime": 29.4091,
|
|
"eval_samples_per_second": 17.512,
|
|
"eval_steps_per_second": 4.386,
|
|
"eval_token_acc": 0.88925595056327,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.1091336730693846,
|
|
"grad_norm": 0.9459292888641357,
|
|
"learning_rate": 6.983749597620588e-06,
|
|
"loss": 0.26885480880737306,
|
|
"memory(GiB)": 32.99,
|
|
"step": 885,
|
|
"token_acc": 0.8969475538971807,
|
|
"train_speed(iter/s)": 0.122693
|
|
},
|
|
{
|
|
"epoch": 1.115405723245786,
|
|
"grad_norm": 0.9416642785072327,
|
|
"learning_rate": 6.9535548525144894e-06,
|
|
"loss": 0.26124646663665774,
|
|
"memory(GiB)": 32.99,
|
|
"step": 890,
|
|
"token_acc": 0.9064468321600593,
|
|
"train_speed(iter/s)": 0.122861
|
|
},
|
|
{
|
|
"epoch": 1.1216777734221874,
|
|
"grad_norm": 0.8303311467170715,
|
|
"learning_rate": 6.923275792446159e-06,
|
|
"loss": 0.25772600173950194,
|
|
"memory(GiB)": 32.99,
|
|
"step": 895,
|
|
"token_acc": 0.9122112744306251,
|
|
"train_speed(iter/s)": 0.123004
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"grad_norm": 0.8063751459121704,
|
|
"learning_rate": 6.8929137242526216e-06,
|
|
"loss": 0.2566836833953857,
|
|
"memory(GiB)": 32.99,
|
|
"step": 900,
|
|
"token_acc": 0.9133560897668676,
|
|
"train_speed(iter/s)": 0.123136
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"eval_loss": 0.3343105614185333,
|
|
"eval_runtime": 29.4826,
|
|
"eval_samples_per_second": 17.468,
|
|
"eval_steps_per_second": 4.375,
|
|
"eval_token_acc": 0.889039415020904,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.1342218737749903,
|
|
"grad_norm": 0.7772172689437866,
|
|
"learning_rate": 6.862469958353506e-06,
|
|
"loss": 0.25952978134155275,
|
|
"memory(GiB)": 32.99,
|
|
"step": 905,
|
|
"token_acc": 0.9005278010033445,
|
|
"train_speed(iter/s)": 0.122633
|
|
},
|
|
{
|
|
"epoch": 1.1404939239513916,
|
|
"grad_norm": 0.9848916530609131,
|
|
"learning_rate": 6.8319458086945026e-06,
|
|
"loss": 0.2750791788101196,
|
|
"memory(GiB)": 32.99,
|
|
"step": 910,
|
|
"token_acc": 0.9121159843407869,
|
|
"train_speed(iter/s)": 0.122802
|
|
},
|
|
{
|
|
"epoch": 1.146765974127793,
|
|
"grad_norm": 0.9113922119140625,
|
|
"learning_rate": 6.801342592690641e-06,
|
|
"loss": 0.2661754131317139,
|
|
"memory(GiB)": 32.99,
|
|
"step": 915,
|
|
"token_acc": 0.9091255477233758,
|
|
"train_speed(iter/s)": 0.122967
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"grad_norm": 0.8235742449760437,
|
|
"learning_rate": 6.770661631169434e-06,
|
|
"loss": 0.2528377532958984,
|
|
"memory(GiB)": 32.99,
|
|
"step": 920,
|
|
"token_acc": 0.9075825218827996,
|
|
"train_speed(iter/s)": 0.123121
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"eval_loss": 0.3342270255088806,
|
|
"eval_runtime": 29.4268,
|
|
"eval_samples_per_second": 17.501,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.889300368110422,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.1593100744805958,
|
|
"grad_norm": 0.8287192583084106,
|
|
"learning_rate": 6.739904248313879e-06,
|
|
"loss": 0.2636830806732178,
|
|
"memory(GiB)": 32.99,
|
|
"step": 925,
|
|
"token_acc": 0.8985745469847197,
|
|
"train_speed(iter/s)": 0.122627
|
|
},
|
|
{
|
|
"epoch": 1.1655821246569973,
|
|
"grad_norm": 0.9481701850891113,
|
|
"learning_rate": 6.709071771605292e-06,
|
|
"loss": 0.26240465641021726,
|
|
"memory(GiB)": 32.99,
|
|
"step": 930,
|
|
"token_acc": 0.9111058712567791,
|
|
"train_speed(iter/s)": 0.122762
|
|
},
|
|
{
|
|
"epoch": 1.1718541748333986,
|
|
"grad_norm": 0.9033907651901245,
|
|
"learning_rate": 6.678165531766029e-06,
|
|
"loss": 0.2581218719482422,
|
|
"memory(GiB)": 32.99,
|
|
"step": 935,
|
|
"token_acc": 0.9199569773090001,
|
|
"train_speed(iter/s)": 0.122912
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"grad_norm": 0.8978760838508606,
|
|
"learning_rate": 6.647186862702038e-06,
|
|
"loss": 0.2560389995574951,
|
|
"memory(GiB)": 32.99,
|
|
"step": 940,
|
|
"token_acc": 0.9132201156577118,
|
|
"train_speed(iter/s)": 0.123049
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"eval_loss": 0.33342665433883667,
|
|
"eval_runtime": 29.3912,
|
|
"eval_samples_per_second": 17.522,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8889505799266,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.1843982751862014,
|
|
"grad_norm": 0.8684834837913513,
|
|
"learning_rate": 6.616137101445301e-06,
|
|
"loss": 0.2619821548461914,
|
|
"memory(GiB)": 32.99,
|
|
"step": 945,
|
|
"token_acc": 0.9023693647540983,
|
|
"train_speed(iter/s)": 0.122572
|
|
},
|
|
{
|
|
"epoch": 1.190670325362603,
|
|
"grad_norm": 0.8880748152732849,
|
|
"learning_rate": 6.58501758809612e-06,
|
|
"loss": 0.2793832778930664,
|
|
"memory(GiB)": 32.99,
|
|
"step": 950,
|
|
"token_acc": 0.9074586794194144,
|
|
"train_speed(iter/s)": 0.122739
|
|
},
|
|
{
|
|
"epoch": 1.1969423755390043,
|
|
"grad_norm": 0.9005119204521179,
|
|
"learning_rate": 6.55382966576528e-06,
|
|
"loss": 0.26382246017456057,
|
|
"memory(GiB)": 32.99,
|
|
"step": 955,
|
|
"token_acc": 0.9095440156260524,
|
|
"train_speed(iter/s)": 0.122892
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"grad_norm": 0.917253851890564,
|
|
"learning_rate": 6.522574680516081e-06,
|
|
"loss": 0.26546216011047363,
|
|
"memory(GiB)": 32.99,
|
|
"step": 960,
|
|
"token_acc": 0.9125179553655888,
|
|
"train_speed(iter/s)": 0.123068
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"eval_loss": 0.33402466773986816,
|
|
"eval_runtime": 29.402,
|
|
"eval_samples_per_second": 17.516,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.889317024690604,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.2094864758918071,
|
|
"grad_norm": 0.8855717778205872,
|
|
"learning_rate": 6.491253981306245e-06,
|
|
"loss": 0.27286221981048586,
|
|
"memory(GiB)": 32.99,
|
|
"step": 965,
|
|
"token_acc": 0.896218792922356,
|
|
"train_speed(iter/s)": 0.122634
|
|
},
|
|
{
|
|
"epoch": 1.2157585260682087,
|
|
"grad_norm": 0.8950490951538086,
|
|
"learning_rate": 6.459868919929691e-06,
|
|
"loss": 0.2556123733520508,
|
|
"memory(GiB)": 32.99,
|
|
"step": 970,
|
|
"token_acc": 0.9105622119815668,
|
|
"train_speed(iter/s)": 0.122762
|
|
},
|
|
{
|
|
"epoch": 1.22203057624461,
|
|
"grad_norm": 0.8709114193916321,
|
|
"learning_rate": 6.428420850958194e-06,
|
|
"loss": 0.25667073726654055,
|
|
"memory(GiB)": 32.99,
|
|
"step": 975,
|
|
"token_acc": 0.9179039301310044,
|
|
"train_speed(iter/s)": 0.122889
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"grad_norm": 0.8599227070808411,
|
|
"learning_rate": 6.3969111316829215e-06,
|
|
"loss": 0.2665587902069092,
|
|
"memory(GiB)": 32.99,
|
|
"step": 980,
|
|
"token_acc": 0.908648175626831,
|
|
"train_speed(iter/s)": 0.123052
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"eval_loss": 0.3336588442325592,
|
|
"eval_runtime": 29.4102,
|
|
"eval_samples_per_second": 17.511,
|
|
"eval_steps_per_second": 4.386,
|
|
"eval_token_acc": 0.89000549667146,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.2345746765974128,
|
|
"grad_norm": 0.8702272772789001,
|
|
"learning_rate": 6.365341122055857e-06,
|
|
"loss": 0.264133620262146,
|
|
"memory(GiB)": 32.99,
|
|
"step": 985,
|
|
"token_acc": 0.8992360079744176,
|
|
"train_speed(iter/s)": 0.122637
|
|
},
|
|
{
|
|
"epoch": 1.2408467267738141,
|
|
"grad_norm": 0.7991276979446411,
|
|
"learning_rate": 6.333712184631093e-06,
|
|
"loss": 0.2504168272018433,
|
|
"memory(GiB)": 32.99,
|
|
"step": 990,
|
|
"token_acc": 0.9176470588235294,
|
|
"train_speed(iter/s)": 0.122773
|
|
},
|
|
{
|
|
"epoch": 1.2471187769502157,
|
|
"grad_norm": 0.8343760371208191,
|
|
"learning_rate": 6.302025684506042e-06,
|
|
"loss": 0.26856470108032227,
|
|
"memory(GiB)": 32.99,
|
|
"step": 995,
|
|
"token_acc": 0.9059094987822074,
|
|
"train_speed(iter/s)": 0.122922
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"grad_norm": 0.9037004709243774,
|
|
"learning_rate": 6.2702829892625e-06,
|
|
"loss": 0.262753963470459,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1000,
|
|
"token_acc": 0.9124660313086079,
|
|
"train_speed(iter/s)": 0.123082
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"eval_loss": 0.3312474489212036,
|
|
"eval_runtime": 29.3973,
|
|
"eval_samples_per_second": 17.519,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.890455224336374,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.2596628773030183,
|
|
"grad_norm": 0.9027026891708374,
|
|
"learning_rate": 6.238485468907637e-06,
|
|
"loss": 0.2612313747406006,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1005,
|
|
"token_acc": 0.9030250326107212,
|
|
"train_speed(iter/s)": 0.122646
|
|
},
|
|
{
|
|
"epoch": 1.2659349274794198,
|
|
"grad_norm": 0.8399356603622437,
|
|
"learning_rate": 6.2066344958148596e-06,
|
|
"loss": 0.2536637306213379,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1010,
|
|
"token_acc": 0.9170757737459979,
|
|
"train_speed(iter/s)": 0.122809
|
|
},
|
|
{
|
|
"epoch": 1.2722069776558214,
|
|
"grad_norm": 0.7805183529853821,
|
|
"learning_rate": 6.174731444664579e-06,
|
|
"loss": 0.2619153022766113,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1015,
|
|
"token_acc": 0.9090187590187591,
|
|
"train_speed(iter/s)": 0.122976
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"grad_norm": 0.8158499598503113,
|
|
"learning_rate": 6.14277769238489e-06,
|
|
"loss": 0.25308961868286134,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1020,
|
|
"token_acc": 0.9159010077059869,
|
|
"train_speed(iter/s)": 0.123114
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"eval_loss": 0.3310454487800598,
|
|
"eval_runtime": 29.4178,
|
|
"eval_samples_per_second": 17.506,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8902220322138261,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.284751078008624,
|
|
"grad_norm": 0.8155426979064941,
|
|
"learning_rate": 6.110774618092128e-06,
|
|
"loss": 0.24832696914672853,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1025,
|
|
"token_acc": 0.9044705444194677,
|
|
"train_speed(iter/s)": 0.12268
|
|
},
|
|
{
|
|
"epoch": 1.2910231281850255,
|
|
"grad_norm": 0.825543999671936,
|
|
"learning_rate": 6.07872360303136e-06,
|
|
"loss": 0.26351313591003417,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1030,
|
|
"token_acc": 0.9093544316747946,
|
|
"train_speed(iter/s)": 0.122857
|
|
},
|
|
{
|
|
"epoch": 1.2972951783614268,
|
|
"grad_norm": 0.8312624096870422,
|
|
"learning_rate": 6.046626030516766e-06,
|
|
"loss": 0.26206340789794924,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1035,
|
|
"token_acc": 0.9150324332478503,
|
|
"train_speed(iter/s)": 0.123001
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"grad_norm": 0.813248336315155,
|
|
"learning_rate": 6.0144832858719256e-06,
|
|
"loss": 0.25995168685913084,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1040,
|
|
"token_acc": 0.9156251058496765,
|
|
"train_speed(iter/s)": 0.12313
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"eval_loss": 0.3311326503753662,
|
|
"eval_runtime": 29.4093,
|
|
"eval_samples_per_second": 17.511,
|
|
"eval_steps_per_second": 4.386,
|
|
"eval_token_acc": 0.8905884769778301,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.3098392787142297,
|
|
"grad_norm": 0.8660376071929932,
|
|
"learning_rate": 5.982296756370052e-06,
|
|
"loss": 0.2568789482116699,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1045,
|
|
"token_acc": 0.8992794915347104,
|
|
"train_speed(iter/s)": 0.122708
|
|
},
|
|
{
|
|
"epoch": 1.3161113288906312,
|
|
"grad_norm": 0.9326941967010498,
|
|
"learning_rate": 5.950067831174086e-06,
|
|
"loss": 0.2631781816482544,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1050,
|
|
"token_acc": 0.9101537421861801,
|
|
"train_speed(iter/s)": 0.122837
|
|
},
|
|
{
|
|
"epoch": 1.3223833790670325,
|
|
"grad_norm": 0.9141260981559753,
|
|
"learning_rate": 5.917797901276771e-06,
|
|
"loss": 0.2625840187072754,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1055,
|
|
"token_acc": 0.9048415211127316,
|
|
"train_speed(iter/s)": 0.122993
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"grad_norm": 0.8678974509239197,
|
|
"learning_rate": 5.885488359440592e-06,
|
|
"loss": 0.25161261558532716,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1060,
|
|
"token_acc": 0.9070050606902325,
|
|
"train_speed(iter/s)": 0.12312
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"eval_loss": 0.3307149112224579,
|
|
"eval_runtime": 29.4299,
|
|
"eval_samples_per_second": 17.499,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8903774936288581,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.3349274794198354,
|
|
"grad_norm": 0.8765535354614258,
|
|
"learning_rate": 5.853140600137684e-06,
|
|
"loss": 0.25869274139404297,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1065,
|
|
"token_acc": 0.9016284233900814,
|
|
"train_speed(iter/s)": 0.122714
|
|
},
|
|
{
|
|
"epoch": 1.3411995295962367,
|
|
"grad_norm": 0.8286167979240417,
|
|
"learning_rate": 5.8207560194896325e-06,
|
|
"loss": 0.2691312551498413,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1070,
|
|
"token_acc": 0.9035621198957429,
|
|
"train_speed(iter/s)": 0.122843
|
|
},
|
|
{
|
|
"epoch": 1.3474715797726382,
|
|
"grad_norm": 0.8201180100440979,
|
|
"learning_rate": 5.78833601520723e-06,
|
|
"loss": 0.2646843433380127,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1075,
|
|
"token_acc": 0.9125431530494822,
|
|
"train_speed(iter/s)": 0.122981
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"grad_norm": 0.8124395608901978,
|
|
"learning_rate": 5.755881986530137e-06,
|
|
"loss": 0.2646932125091553,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1080,
|
|
"token_acc": 0.909264075607165,
|
|
"train_speed(iter/s)": 0.123106
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"eval_loss": 0.3297117352485657,
|
|
"eval_runtime": 29.4873,
|
|
"eval_samples_per_second": 17.465,
|
|
"eval_steps_per_second": 4.375,
|
|
"eval_token_acc": 0.8906551032985581,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.360015680125441,
|
|
"grad_norm": 0.9377442598342896,
|
|
"learning_rate": 5.723395334166506e-06,
|
|
"loss": 0.26891088485717773,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1085,
|
|
"token_acc": 0.9008568241735956,
|
|
"train_speed(iter/s)": 0.12271
|
|
},
|
|
{
|
|
"epoch": 1.3662877303018424,
|
|
"grad_norm": 0.8344196081161499,
|
|
"learning_rate": 5.6908774602325165e-06,
|
|
"loss": 0.2543730974197388,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1090,
|
|
"token_acc": 0.9094074127801552,
|
|
"train_speed(iter/s)": 0.122831
|
|
},
|
|
{
|
|
"epoch": 1.372559780478244,
|
|
"grad_norm": 0.8761776685714722,
|
|
"learning_rate": 5.6583297681918615e-06,
|
|
"loss": 0.25609617233276366,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1095,
|
|
"token_acc": 0.9128849780012571,
|
|
"train_speed(iter/s)": 0.122967
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"grad_norm": 0.9113081097602844,
|
|
"learning_rate": 5.625753662795183e-06,
|
|
"loss": 0.2611519813537598,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1100,
|
|
"token_acc": 0.9040675364543361,
|
|
"train_speed(iter/s)": 0.123104
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"eval_loss": 0.3294164836406708,
|
|
"eval_runtime": 29.4555,
|
|
"eval_samples_per_second": 17.484,
|
|
"eval_steps_per_second": 4.379,
|
|
"eval_token_acc": 0.8907772515532261,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.3851038808310467,
|
|
"grad_norm": 0.959723711013794,
|
|
"learning_rate": 5.59315055001943e-06,
|
|
"loss": 0.26945905685424804,
|
|
"memory(GiB)": 32.99,
|
|
"step": 1105,
|
|
"token_acc": 0.8986453766114783,
|
|
"train_speed(iter/s)": 0.122721
|
|
},
|
|
{
|
|
"epoch": 1.391375931007448,
|
|
"grad_norm": 0.831580400466919,
|
|
"learning_rate": 5.5605218370071836e-06,
|
|
"loss": 0.2433305263519287,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1110,
|
|
"token_acc": 0.9251693967502138,
|
|
"train_speed(iter/s)": 0.122835
|
|
},
|
|
{
|
|
"epoch": 1.3976479811838494,
|
|
"grad_norm": 0.8291701078414917,
|
|
"learning_rate": 5.5278689320059305e-06,
|
|
"loss": 0.26767911911010744,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1115,
|
|
"token_acc": 0.9057218870611159,
|
|
"train_speed(iter/s)": 0.123
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"grad_norm": 0.95394366979599,
|
|
"learning_rate": 5.4951932443072764e-06,
|
|
"loss": 0.2736950159072876,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1120,
|
|
"token_acc": 0.9039156015418949,
|
|
"train_speed(iter/s)": 0.123138
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"eval_loss": 0.3296511769294739,
|
|
"eval_runtime": 29.4223,
|
|
"eval_samples_per_second": 17.504,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8906273423315881,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.4101920815366524,
|
|
"grad_norm": 0.8989368081092834,
|
|
"learning_rate": 5.462496184186118e-06,
|
|
"loss": 0.2623956918716431,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1125,
|
|
"token_acc": 0.902071147885406,
|
|
"train_speed(iter/s)": 0.122763
|
|
},
|
|
{
|
|
"epoch": 1.4164641317130537,
|
|
"grad_norm": 0.8569273352622986,
|
|
"learning_rate": 5.429779162839787e-06,
|
|
"loss": 0.2738351345062256,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1130,
|
|
"token_acc": 0.9097424759540801,
|
|
"train_speed(iter/s)": 0.122909
|
|
},
|
|
{
|
|
"epoch": 1.422736181889455,
|
|
"grad_norm": 0.8919984698295593,
|
|
"learning_rate": 5.397043592327129e-06,
|
|
"loss": 0.264469051361084,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1135,
|
|
"token_acc": 0.9089981447124305,
|
|
"train_speed(iter/s)": 0.12304
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"grad_norm": 0.9191476702690125,
|
|
"learning_rate": 5.364290885507577e-06,
|
|
"loss": 0.25141263008117676,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1140,
|
|
"token_acc": 0.915509841073431,
|
|
"train_speed(iter/s)": 0.123134
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"eval_loss": 0.3284692168235779,
|
|
"eval_runtime": 29.3072,
|
|
"eval_samples_per_second": 17.572,
|
|
"eval_steps_per_second": 4.402,
|
|
"eval_token_acc": 0.8913990972133541,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.435280282242258,
|
|
"grad_norm": 0.8672423362731934,
|
|
"learning_rate": 5.3315224559801555e-06,
|
|
"loss": 0.25262012481689455,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1145,
|
|
"token_acc": 0.9027614571092832,
|
|
"train_speed(iter/s)": 0.122738
|
|
},
|
|
{
|
|
"epoch": 1.4415523324186594,
|
|
"grad_norm": 0.9336539506912231,
|
|
"learning_rate": 5.2987397180224795e-06,
|
|
"loss": 0.2663726806640625,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1150,
|
|
"token_acc": 0.9097083349054824,
|
|
"train_speed(iter/s)": 0.122841
|
|
},
|
|
{
|
|
"epoch": 1.4478243825950607,
|
|
"grad_norm": 0.9010604619979858,
|
|
"learning_rate": 5.265944086529714e-06,
|
|
"loss": 0.2540728569030762,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1155,
|
|
"token_acc": 0.9103166186941373,
|
|
"train_speed(iter/s)": 0.122975
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"grad_norm": 0.8994977474212646,
|
|
"learning_rate": 5.233136976953504e-06,
|
|
"loss": 0.27104973793029785,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1160,
|
|
"token_acc": 0.9070164133277738,
|
|
"train_speed(iter/s)": 0.123129
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"eval_loss": 0.32739755511283875,
|
|
"eval_runtime": 29.4285,
|
|
"eval_samples_per_second": 17.5,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8911048309634721,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.4603684829478636,
|
|
"grad_norm": 0.9086834788322449,
|
|
"learning_rate": 5.200319805240884e-06,
|
|
"loss": 0.2572296380996704,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1165,
|
|
"token_acc": 0.9042424885024718,
|
|
"train_speed(iter/s)": 0.122748
|
|
},
|
|
{
|
|
"epoch": 1.4666405331242651,
|
|
"grad_norm": 0.9285444617271423,
|
|
"learning_rate": 5.167493987773175e-06,
|
|
"loss": 0.2664067268371582,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1170,
|
|
"token_acc": 0.8990601360128372,
|
|
"train_speed(iter/s)": 0.122891
|
|
},
|
|
{
|
|
"epoch": 1.4729125833006664,
|
|
"grad_norm": 0.8944875597953796,
|
|
"learning_rate": 5.134660941304838e-06,
|
|
"loss": 0.25232925415039065,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1175,
|
|
"token_acc": 0.9148593474922276,
|
|
"train_speed(iter/s)": 0.123017
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"grad_norm": 0.8905362486839294,
|
|
"learning_rate": 5.10182208290234e-06,
|
|
"loss": 0.2574918746948242,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1180,
|
|
"token_acc": 0.9164562602109015,
|
|
"train_speed(iter/s)": 0.123133
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"eval_loss": 0.32784461975097656,
|
|
"eval_runtime": 29.4674,
|
|
"eval_samples_per_second": 17.477,
|
|
"eval_steps_per_second": 4.378,
|
|
"eval_token_acc": 0.8912214270247462,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.4854566836534693,
|
|
"grad_norm": 0.8255454897880554,
|
|
"learning_rate": 5.068978829882992e-06,
|
|
"loss": 0.26115126609802247,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1185,
|
|
"token_acc": 0.9016378309948306,
|
|
"train_speed(iter/s)": 0.122699
|
|
},
|
|
{
|
|
"epoch": 1.4917287338298706,
|
|
"grad_norm": 0.844142496585846,
|
|
"learning_rate": 5.036132599753771e-06,
|
|
"loss": 0.26899421215057373,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1190,
|
|
"token_acc": 0.9105432414675473,
|
|
"train_speed(iter/s)": 0.122845
|
|
},
|
|
{
|
|
"epoch": 1.4980007840062721,
|
|
"grad_norm": 0.8452991247177124,
|
|
"learning_rate": 5.003284810150152e-06,
|
|
"loss": 0.24796614646911622,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1195,
|
|
"token_acc": 0.9108413875777328,
|
|
"train_speed(iter/s)": 0.122976
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"grad_norm": 0.8760294914245605,
|
|
"learning_rate": 4.970436878774907e-06,
|
|
"loss": 0.2594925880432129,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1200,
|
|
"token_acc": 0.9037070354960831,
|
|
"train_speed(iter/s)": 0.123102
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"eval_loss": 0.3267907202243805,
|
|
"eval_runtime": 29.2019,
|
|
"eval_samples_per_second": 17.636,
|
|
"eval_steps_per_second": 4.418,
|
|
"eval_token_acc": 0.8919376599725721,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.5105448843590747,
|
|
"grad_norm": 0.9031227231025696,
|
|
"learning_rate": 4.937590223336936e-06,
|
|
"loss": 0.26340641975402834,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1205,
|
|
"token_acc": 0.900339087116085,
|
|
"train_speed(iter/s)": 0.122762
|
|
},
|
|
{
|
|
"epoch": 1.5168169345354763,
|
|
"grad_norm": 0.8399583697319031,
|
|
"learning_rate": 4.904746261490062e-06,
|
|
"loss": 0.2580922365188599,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1210,
|
|
"token_acc": 0.9061312895701433,
|
|
"train_speed(iter/s)": 0.122878
|
|
},
|
|
{
|
|
"epoch": 1.5230889847118778,
|
|
"grad_norm": 0.8265976309776306,
|
|
"learning_rate": 4.87190641077186e-06,
|
|
"loss": 0.25390305519104006,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1215,
|
|
"token_acc": 0.9108635097493036,
|
|
"train_speed(iter/s)": 0.12298
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"grad_norm": 0.864519476890564,
|
|
"learning_rate": 4.8390720885424665e-06,
|
|
"loss": 0.2515955686569214,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1220,
|
|
"token_acc": 0.9193281845468956,
|
|
"train_speed(iter/s)": 0.1231
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"eval_loss": 0.32665589451789856,
|
|
"eval_runtime": 29.401,
|
|
"eval_samples_per_second": 17.516,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8917100200434182,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.5356330850646804,
|
|
"grad_norm": 0.8479579091072083,
|
|
"learning_rate": 4.806244711923408e-06,
|
|
"loss": 0.2633438348770142,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1225,
|
|
"token_acc": 0.9037422372771998,
|
|
"train_speed(iter/s)": 0.122739
|
|
},
|
|
{
|
|
"epoch": 1.541905135241082,
|
|
"grad_norm": 0.8362464904785156,
|
|
"learning_rate": 4.773425697736445e-06,
|
|
"loss": 0.2546710968017578,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1230,
|
|
"token_acc": 0.9130922134210133,
|
|
"train_speed(iter/s)": 0.122859
|
|
},
|
|
{
|
|
"epoch": 1.5481771854174835,
|
|
"grad_norm": 0.8684584498405457,
|
|
"learning_rate": 4.7406164624424135e-06,
|
|
"loss": 0.24759359359741212,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1235,
|
|
"token_acc": 0.9112730806608358,
|
|
"train_speed(iter/s)": 0.122966
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"grad_norm": 0.8719802498817444,
|
|
"learning_rate": 4.707818422080094e-06,
|
|
"loss": 0.25945463180541994,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1240,
|
|
"token_acc": 0.9120634224755224,
|
|
"train_speed(iter/s)": 0.123098
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"eval_loss": 0.32659924030303955,
|
|
"eval_runtime": 29.4283,
|
|
"eval_samples_per_second": 17.5,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8916378415292961,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.5607212857702861,
|
|
"grad_norm": 0.868956983089447,
|
|
"learning_rate": 4.675032992205099e-06,
|
|
"loss": 0.2547459125518799,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1245,
|
|
"token_acc": 0.9023946449205106,
|
|
"train_speed(iter/s)": 0.122755
|
|
},
|
|
{
|
|
"epoch": 1.5669933359466874,
|
|
"grad_norm": 0.8359695672988892,
|
|
"learning_rate": 4.642261587828778e-06,
|
|
"loss": 0.2538146495819092,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1250,
|
|
"token_acc": 0.9125770129594222,
|
|
"train_speed(iter/s)": 0.122873
|
|
},
|
|
{
|
|
"epoch": 1.573265386123089,
|
|
"grad_norm": 0.8868972063064575,
|
|
"learning_rate": 4.609505623357135e-06,
|
|
"loss": 0.2526993751525879,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1255,
|
|
"token_acc": 0.9158266736275652,
|
|
"train_speed(iter/s)": 0.122999
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"grad_norm": 0.8684692978858948,
|
|
"learning_rate": 4.576766512529799e-06,
|
|
"loss": 0.27338666915893556,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1260,
|
|
"token_acc": 0.9070495871130456,
|
|
"train_speed(iter/s)": 0.123113
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"eval_loss": 0.3250684440135956,
|
|
"eval_runtime": 29.4388,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8918099595245101,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.5858094864758918,
|
|
"grad_norm": 0.8290318250656128,
|
|
"learning_rate": 4.544045668358999e-06,
|
|
"loss": 0.25896754264831545,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1265,
|
|
"token_acc": 0.9011410774976416,
|
|
"train_speed(iter/s)": 0.122758
|
|
},
|
|
{
|
|
"epoch": 1.5920815366522931,
|
|
"grad_norm": 0.8665211796760559,
|
|
"learning_rate": 4.511344503068574e-06,
|
|
"loss": 0.2611932039260864,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1270,
|
|
"token_acc": 0.904277129727819,
|
|
"train_speed(iter/s)": 0.122849
|
|
},
|
|
{
|
|
"epoch": 1.5983535868286947,
|
|
"grad_norm": 0.8333092331886292,
|
|
"learning_rate": 4.478664428033031e-06,
|
|
"loss": 0.2565239429473877,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1275,
|
|
"token_acc": 0.9113872754288802,
|
|
"train_speed(iter/s)": 0.122955
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"grad_norm": 0.873248815536499,
|
|
"learning_rate": 4.446006853716628e-06,
|
|
"loss": 0.25569474697113037,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1280,
|
|
"token_acc": 0.9105040810467167,
|
|
"train_speed(iter/s)": 0.123062
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"eval_loss": 0.32411840558052063,
|
|
"eval_runtime": 29.4423,
|
|
"eval_samples_per_second": 17.492,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.8917821985575402,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.6108976871814975,
|
|
"grad_norm": 0.7883860468864441,
|
|
"learning_rate": 4.413373189612497e-06,
|
|
"loss": 0.24754109382629394,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1285,
|
|
"token_acc": 0.9016164438235881,
|
|
"train_speed(iter/s)": 0.122717
|
|
},
|
|
{
|
|
"epoch": 1.6171697373578988,
|
|
"grad_norm": 0.8996075987815857,
|
|
"learning_rate": 4.380764844181806e-06,
|
|
"loss": 0.25697779655456543,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1290,
|
|
"token_acc": 0.9108724003020111,
|
|
"train_speed(iter/s)": 0.122851
|
|
},
|
|
{
|
|
"epoch": 1.6234417875343001,
|
|
"grad_norm": 0.9099392890930176,
|
|
"learning_rate": 4.34818322479298e-06,
|
|
"loss": 0.25770959854125974,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1295,
|
|
"token_acc": 0.9080447672174858,
|
|
"train_speed(iter/s)": 0.122959
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"grad_norm": 0.929043173789978,
|
|
"learning_rate": 4.315629737660956e-06,
|
|
"loss": 0.24951376914978027,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1300,
|
|
"token_acc": 0.9104897729405141,
|
|
"train_speed(iter/s)": 0.12305
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"eval_loss": 0.3247908353805542,
|
|
"eval_runtime": 29.4232,
|
|
"eval_samples_per_second": 17.503,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8921819564819081,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.6359858878871032,
|
|
"grad_norm": 0.8205110430717468,
|
|
"learning_rate": 4.283105787786482e-06,
|
|
"loss": 0.24354634284973145,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1305,
|
|
"token_acc": 0.9043088899776798,
|
|
"train_speed(iter/s)": 0.122699
|
|
},
|
|
{
|
|
"epoch": 1.6422579380635045,
|
|
"grad_norm": 0.8134395480155945,
|
|
"learning_rate": 4.250612778895492e-06,
|
|
"loss": 0.2616206169128418,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1310,
|
|
"token_acc": 0.9057426853207865,
|
|
"train_speed(iter/s)": 0.122811
|
|
},
|
|
{
|
|
"epoch": 1.6485299882399058,
|
|
"grad_norm": 0.9198798537254333,
|
|
"learning_rate": 4.218152113378513e-06,
|
|
"loss": 0.2624333381652832,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1315,
|
|
"token_acc": 0.919613921375229,
|
|
"train_speed(iter/s)": 0.122931
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"grad_norm": 0.8332286477088928,
|
|
"learning_rate": 4.185725192230136e-06,
|
|
"loss": 0.25506982803344724,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1320,
|
|
"token_acc": 0.9053922590507957,
|
|
"train_speed(iter/s)": 0.123033
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"eval_loss": 0.3236985206604004,
|
|
"eval_runtime": 29.3395,
|
|
"eval_samples_per_second": 17.553,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8926538929203982,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.6610740885927089,
|
|
"grad_norm": 0.890203058719635,
|
|
"learning_rate": 4.1533334149885594e-06,
|
|
"loss": 0.2616013526916504,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1325,
|
|
"token_acc": 0.9023847464146382,
|
|
"train_speed(iter/s)": 0.122706
|
|
},
|
|
{
|
|
"epoch": 1.6673461387691102,
|
|
"grad_norm": 0.9320402145385742,
|
|
"learning_rate": 4.120978179675172e-06,
|
|
"loss": 0.24896547794342042,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1330,
|
|
"token_acc": 0.9081224549156486,
|
|
"train_speed(iter/s)": 0.122805
|
|
},
|
|
{
|
|
"epoch": 1.6736181889455115,
|
|
"grad_norm": 0.8774133324623108,
|
|
"learning_rate": 4.088660882734228e-06,
|
|
"loss": 0.265717077255249,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1335,
|
|
"token_acc": 0.9120258587117377,
|
|
"train_speed(iter/s)": 0.122911
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"grad_norm": 0.8072150945663452,
|
|
"learning_rate": 4.056382918972565e-06,
|
|
"loss": 0.25207223892211916,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1340,
|
|
"token_acc": 0.9112906186891867,
|
|
"train_speed(iter/s)": 0.123008
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"eval_loss": 0.32341495156288147,
|
|
"eval_runtime": 29.3437,
|
|
"eval_samples_per_second": 17.551,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8927760411750663,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.6861622892983144,
|
|
"grad_norm": 0.8317608833312988,
|
|
"learning_rate": 4.024145681499416e-06,
|
|
"loss": 0.2493518829345703,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1345,
|
|
"token_acc": 0.9048487057965731,
|
|
"train_speed(iter/s)": 0.122706
|
|
},
|
|
{
|
|
"epoch": 1.6924343394747159,
|
|
"grad_norm": 0.8192688822746277,
|
|
"learning_rate": 3.991950561666269e-06,
|
|
"loss": 0.2477407217025757,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1350,
|
|
"token_acc": 0.9176169257219045,
|
|
"train_speed(iter/s)": 0.122803
|
|
},
|
|
{
|
|
"epoch": 1.6987063896511172,
|
|
"grad_norm": 0.8379951119422913,
|
|
"learning_rate": 3.959798949006831e-06,
|
|
"loss": 0.2488119125366211,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1355,
|
|
"token_acc": 0.92136861119966,
|
|
"train_speed(iter/s)": 0.122913
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"grad_norm": 0.7797974348068237,
|
|
"learning_rate": 3.927692231177053e-06,
|
|
"loss": 0.2600484609603882,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1360,
|
|
"token_acc": 0.9198253158649198,
|
|
"train_speed(iter/s)": 0.123022
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"eval_loss": 0.3236111104488373,
|
|
"eval_runtime": 29.4455,
|
|
"eval_samples_per_second": 17.49,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.8926094753732462,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.71125049000392,
|
|
"grad_norm": 0.818781316280365,
|
|
"learning_rate": 3.895631793895223e-06,
|
|
"loss": 0.2504476547241211,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1365,
|
|
"token_acc": 0.9028884527745216,
|
|
"train_speed(iter/s)": 0.122706
|
|
},
|
|
{
|
|
"epoch": 1.7175225401803216,
|
|
"grad_norm": 0.8820050954818726,
|
|
"learning_rate": 3.863619020882184e-06,
|
|
"loss": 0.2579957008361816,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1370,
|
|
"token_acc": 0.9132163908813294,
|
|
"train_speed(iter/s)": 0.12282
|
|
},
|
|
{
|
|
"epoch": 1.7237945903567229,
|
|
"grad_norm": 0.8338585495948792,
|
|
"learning_rate": 3.831655293801596e-06,
|
|
"loss": 0.26077022552490237,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1375,
|
|
"token_acc": 0.9174921383647798,
|
|
"train_speed(iter/s)": 0.122933
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"grad_norm": 0.8168034553527832,
|
|
"learning_rate": 3.7997419922003077e-06,
|
|
"loss": 0.263335132598877,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1380,
|
|
"token_acc": 0.8992103531476201,
|
|
"train_speed(iter/s)": 0.123038
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"eval_loss": 0.3226911127567291,
|
|
"eval_runtime": 29.1494,
|
|
"eval_samples_per_second": 17.668,
|
|
"eval_steps_per_second": 4.425,
|
|
"eval_token_acc": 0.8926761016939742,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.7363386907095255,
|
|
"grad_norm": 0.78660649061203,
|
|
"learning_rate": 3.7678804934488146e-06,
|
|
"loss": 0.2630495071411133,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1385,
|
|
"token_acc": 0.90369452426021,
|
|
"train_speed(iter/s)": 0.122732
|
|
},
|
|
{
|
|
"epoch": 1.742610740885927,
|
|
"grad_norm": 0.8045564889907837,
|
|
"learning_rate": 3.736072172681818e-06,
|
|
"loss": 0.252573823928833,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1390,
|
|
"token_acc": 0.9139742579458892,
|
|
"train_speed(iter/s)": 0.122859
|
|
},
|
|
{
|
|
"epoch": 1.7488827910623286,
|
|
"grad_norm": 0.8700997233390808,
|
|
"learning_rate": 3.704318402738867e-06,
|
|
"loss": 0.251971435546875,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1395,
|
|
"token_acc": 0.91973076509971,
|
|
"train_speed(iter/s)": 0.122973
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"grad_norm": 0.8267092704772949,
|
|
"learning_rate": 3.672620554105111e-06,
|
|
"loss": 0.24774155616760254,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1400,
|
|
"token_acc": 0.9149910336144073,
|
|
"train_speed(iter/s)": 0.123067
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"eval_loss": 0.320892870426178,
|
|
"eval_runtime": 29.2095,
|
|
"eval_samples_per_second": 17.631,
|
|
"eval_steps_per_second": 4.416,
|
|
"eval_token_acc": 0.8924984315053662,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.7614268914151312,
|
|
"grad_norm": 0.8506399393081665,
|
|
"learning_rate": 3.6409799948521473e-06,
|
|
"loss": 0.24930577278137206,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1405,
|
|
"token_acc": 0.9041733294885574,
|
|
"train_speed(iter/s)": 0.122739
|
|
},
|
|
{
|
|
"epoch": 1.7676989415915327,
|
|
"grad_norm": 0.9828294515609741,
|
|
"learning_rate": 3.6093980905789824e-06,
|
|
"loss": 0.2731804132461548,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1410,
|
|
"token_acc": 0.9054926688444883,
|
|
"train_speed(iter/s)": 0.122848
|
|
},
|
|
{
|
|
"epoch": 1.7739709917679343,
|
|
"grad_norm": 0.8984673023223877,
|
|
"learning_rate": 3.577876204353079e-06,
|
|
"loss": 0.2694148778915405,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1415,
|
|
"token_acc": 0.9082330415754923,
|
|
"train_speed(iter/s)": 0.122964
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"grad_norm": 0.8837220668792725,
|
|
"learning_rate": 3.5464156966515426e-06,
|
|
"loss": 0.2497929334640503,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1420,
|
|
"token_acc": 0.9207889421363222,
|
|
"train_speed(iter/s)": 0.123051
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"eval_loss": 0.32131004333496094,
|
|
"eval_runtime": 29.3893,
|
|
"eval_samples_per_second": 17.523,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8924318051846382,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.786515092120737,
|
|
"grad_norm": 0.8857332468032837,
|
|
"learning_rate": 3.515017925302396e-06,
|
|
"loss": 0.24897024631500245,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1425,
|
|
"token_acc": 0.9022955373286864,
|
|
"train_speed(iter/s)": 0.122741
|
|
},
|
|
{
|
|
"epoch": 1.7927871422971384,
|
|
"grad_norm": 0.9256414175033569,
|
|
"learning_rate": 3.48368424542597e-06,
|
|
"loss": 0.2715806484222412,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1430,
|
|
"token_acc": 0.897119341563786,
|
|
"train_speed(iter/s)": 0.122841
|
|
},
|
|
{
|
|
"epoch": 1.7990591924735397,
|
|
"grad_norm": 0.9175465703010559,
|
|
"learning_rate": 3.4524160093764288e-06,
|
|
"loss": 0.23990106582641602,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1435,
|
|
"token_acc": 0.9154644289198463,
|
|
"train_speed(iter/s)": 0.122916
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"grad_norm": 0.8272521495819092,
|
|
"learning_rate": 3.421214566683395e-06,
|
|
"loss": 0.2521500587463379,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1440,
|
|
"token_acc": 0.9128348930707119,
|
|
"train_speed(iter/s)": 0.123014
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"eval_loss": 0.32204827666282654,
|
|
"eval_runtime": 29.4038,
|
|
"eval_samples_per_second": 17.515,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.8927871455618542,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.8116032928263426,
|
|
"grad_norm": 0.8805301189422607,
|
|
"learning_rate": 3.390081263993702e-06,
|
|
"loss": 0.25586814880371095,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1445,
|
|
"token_acc": 0.9027470478558111,
|
|
"train_speed(iter/s)": 0.122726
|
|
},
|
|
{
|
|
"epoch": 1.817875343002744,
|
|
"grad_norm": 0.792771577835083,
|
|
"learning_rate": 3.3590174450132828e-06,
|
|
"loss": 0.2642062187194824,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1450,
|
|
"token_acc": 0.9100382735230589,
|
|
"train_speed(iter/s)": 0.122835
|
|
},
|
|
{
|
|
"epoch": 1.8241473931791454,
|
|
"grad_norm": 0.8667325973510742,
|
|
"learning_rate": 3.3280244504491664e-06,
|
|
"loss": 0.26262435913085935,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1455,
|
|
"token_acc": 0.9134121762938738,
|
|
"train_speed(iter/s)": 0.12294
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"grad_norm": 0.914749026298523,
|
|
"learning_rate": 3.297103617951618e-06,
|
|
"loss": 0.25986638069152834,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1460,
|
|
"token_acc": 0.9224172317510969,
|
|
"train_speed(iter/s)": 0.123038
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"eval_loss": 0.32037732005119324,
|
|
"eval_runtime": 29.4247,
|
|
"eval_samples_per_second": 17.502,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8929370547834922,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.8366914935319483,
|
|
"grad_norm": 0.8536245226860046,
|
|
"learning_rate": 3.2662562820564043e-06,
|
|
"loss": 0.25100226402282716,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1465,
|
|
"token_acc": 0.9041838910977119,
|
|
"train_speed(iter/s)": 0.122724
|
|
},
|
|
{
|
|
"epoch": 1.8429635437083496,
|
|
"grad_norm": 0.9133491516113281,
|
|
"learning_rate": 3.2354837741271994e-06,
|
|
"loss": 0.2528833866119385,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1470,
|
|
"token_acc": 0.910944418742978,
|
|
"train_speed(iter/s)": 0.122825
|
|
},
|
|
{
|
|
"epoch": 1.8492355938847511,
|
|
"grad_norm": 0.8379256725311279,
|
|
"learning_rate": 3.2047874222981134e-06,
|
|
"loss": 0.24852747917175294,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1475,
|
|
"token_acc": 0.91071612747963,
|
|
"train_speed(iter/s)": 0.122921
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"grad_norm": 0.8636755347251892,
|
|
"learning_rate": 3.174168551416384e-06,
|
|
"loss": 0.2513826131820679,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1480,
|
|
"token_acc": 0.9178916270355959,
|
|
"train_speed(iter/s)": 0.123
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"eval_loss": 0.3209190368652344,
|
|
"eval_runtime": 29.4485,
|
|
"eval_samples_per_second": 17.488,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.8929148460099162,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.861779694237554,
|
|
"grad_norm": 0.884774923324585,
|
|
"learning_rate": 3.1436284829851883e-06,
|
|
"loss": 0.2535923719406128,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1485,
|
|
"token_acc": 0.9044768032509313,
|
|
"train_speed(iter/s)": 0.122697
|
|
},
|
|
{
|
|
"epoch": 1.8680517444139553,
|
|
"grad_norm": 0.9002357721328735,
|
|
"learning_rate": 3.113168535106604e-06,
|
|
"loss": 0.2631272792816162,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1490,
|
|
"token_acc": 0.9088035186585114,
|
|
"train_speed(iter/s)": 0.122806
|
|
},
|
|
{
|
|
"epoch": 1.8743237945903566,
|
|
"grad_norm": 0.9321665167808533,
|
|
"learning_rate": 3.08279002242473e-06,
|
|
"loss": 0.25894691944122317,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1495,
|
|
"token_acc": 0.9117967604945695,
|
|
"train_speed(iter/s)": 0.122898
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"grad_norm": 0.8132336139678955,
|
|
"learning_rate": 3.0524942560689387e-06,
|
|
"loss": 0.24980921745300294,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1500,
|
|
"token_acc": 0.9154492980830898,
|
|
"train_speed(iter/s)": 0.122985
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"eval_loss": 0.32020601630210876,
|
|
"eval_runtime": 29.4077,
|
|
"eval_samples_per_second": 17.512,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.8928315631090062,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.8868678949431597,
|
|
"grad_norm": 0.8028678297996521,
|
|
"learning_rate": 3.0222825435972948e-06,
|
|
"loss": 0.24374105930328369,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1505,
|
|
"token_acc": 0.9061514997458058,
|
|
"train_speed(iter/s)": 0.122682
|
|
},
|
|
{
|
|
"epoch": 1.893139945119561,
|
|
"grad_norm": 0.8144044876098633,
|
|
"learning_rate": 2.99215618894011e-06,
|
|
"loss": 0.24561538696289062,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1510,
|
|
"token_acc": 0.9156626506024096,
|
|
"train_speed(iter/s)": 0.122769
|
|
},
|
|
{
|
|
"epoch": 1.8994119952959623,
|
|
"grad_norm": 0.8524841666221619,
|
|
"learning_rate": 2.9621164923436774e-06,
|
|
"loss": 0.23612394332885742,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1515,
|
|
"token_acc": 0.9185016451531258,
|
|
"train_speed(iter/s)": 0.12286
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"grad_norm": 0.7969034910202026,
|
|
"learning_rate": 2.9321647503141525e-06,
|
|
"loss": 0.25468385219573975,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1520,
|
|
"token_acc": 0.9106673673147662,
|
|
"train_speed(iter/s)": 0.122942
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"eval_loss": 0.3204187750816345,
|
|
"eval_runtime": 29.5394,
|
|
"eval_samples_per_second": 17.434,
|
|
"eval_steps_per_second": 4.367,
|
|
"eval_token_acc": 0.8928926372363403,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.9119560956487653,
|
|
"grad_norm": 0.7709624171257019,
|
|
"learning_rate": 2.902302255561585e-06,
|
|
"loss": 0.25543718338012694,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1525,
|
|
"token_acc": 0.9032785309104894,
|
|
"train_speed(iter/s)": 0.122654
|
|
},
|
|
{
|
|
"epoch": 1.9182281458251667,
|
|
"grad_norm": 0.8021811246871948,
|
|
"learning_rate": 2.87253029694414e-06,
|
|
"loss": 0.25137279033660886,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1530,
|
|
"token_acc": 0.9112309955860716,
|
|
"train_speed(iter/s)": 0.122731
|
|
},
|
|
{
|
|
"epoch": 1.924500196001568,
|
|
"grad_norm": 0.9843617081642151,
|
|
"learning_rate": 2.8428501594124602e-06,
|
|
"loss": 0.24772090911865235,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1535,
|
|
"token_acc": 0.9226571027600187,
|
|
"train_speed(iter/s)": 0.122835
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"grad_norm": 0.8464626669883728,
|
|
"learning_rate": 2.813263123954214e-06,
|
|
"loss": 0.24349329471588135,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1540,
|
|
"token_acc": 0.9155610012252757,
|
|
"train_speed(iter/s)": 0.122913
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"eval_loss": 0.3201349973678589,
|
|
"eval_runtime": 29.3804,
|
|
"eval_samples_per_second": 17.529,
|
|
"eval_steps_per_second": 4.391,
|
|
"eval_token_acc": 0.8934423043823463,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.9370442963543708,
|
|
"grad_norm": 0.8658434152603149,
|
|
"learning_rate": 2.7837704675388045e-06,
|
|
"loss": 0.24450139999389647,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1545,
|
|
"token_acc": 0.904251637293009,
|
|
"train_speed(iter/s)": 0.122623
|
|
},
|
|
{
|
|
"epoch": 1.9433163465307723,
|
|
"grad_norm": 0.8747855424880981,
|
|
"learning_rate": 2.7543734630622622e-06,
|
|
"loss": 0.2556029796600342,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1550,
|
|
"token_acc": 0.9159713945172825,
|
|
"train_speed(iter/s)": 0.122715
|
|
},
|
|
{
|
|
"epoch": 1.9495883967071737,
|
|
"grad_norm": 0.7818464040756226,
|
|
"learning_rate": 2.7250733792922997e-06,
|
|
"loss": 0.2455909252166748,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1555,
|
|
"token_acc": 0.9172009090600726,
|
|
"train_speed(iter/s)": 0.122785
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"grad_norm": 0.840918242931366,
|
|
"learning_rate": 2.6958714808135546e-06,
|
|
"loss": 0.24802937507629394,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1560,
|
|
"token_acc": 0.9247232757953402,
|
|
"train_speed(iter/s)": 0.122896
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"eval_loss": 0.31897813081741333,
|
|
"eval_runtime": 29.4222,
|
|
"eval_samples_per_second": 17.504,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8934034390285882,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.9621324970599765,
|
|
"grad_norm": 0.7543806433677673,
|
|
"learning_rate": 2.6667690279730096e-06,
|
|
"loss": 0.24353570938110353,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1565,
|
|
"token_acc": 0.9054516586629185,
|
|
"train_speed(iter/s)": 0.122615
|
|
},
|
|
{
|
|
"epoch": 1.968404547236378,
|
|
"grad_norm": 0.8479281663894653,
|
|
"learning_rate": 2.6377672768256003e-06,
|
|
"loss": 0.23759632110595702,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1570,
|
|
"token_acc": 0.9129114454256693,
|
|
"train_speed(iter/s)": 0.122681
|
|
},
|
|
{
|
|
"epoch": 1.9746765974127793,
|
|
"grad_norm": 0.8568554520606995,
|
|
"learning_rate": 2.608867479080001e-06,
|
|
"loss": 0.2510775089263916,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1575,
|
|
"token_acc": 0.90967994588629,
|
|
"train_speed(iter/s)": 0.122771
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"grad_norm": 0.8321829438209534,
|
|
"learning_rate": 2.5800708820446002e-06,
|
|
"loss": 0.2509664297103882,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1580,
|
|
"token_acc": 0.9128670788253478,
|
|
"train_speed(iter/s)": 0.122872
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"eval_loss": 0.31922805309295654,
|
|
"eval_runtime": 29.3844,
|
|
"eval_samples_per_second": 17.526,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.8935255872832563,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.987220697765582,
|
|
"grad_norm": 0.8347560167312622,
|
|
"learning_rate": 2.551378728573668e-06,
|
|
"loss": 0.24763202667236328,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1585,
|
|
"token_acc": 0.9060148795763103,
|
|
"train_speed(iter/s)": 0.122581
|
|
},
|
|
{
|
|
"epoch": 1.9934927479419835,
|
|
"grad_norm": 0.899139404296875,
|
|
"learning_rate": 2.5227922570137143e-06,
|
|
"loss": 0.25942072868347166,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1590,
|
|
"token_acc": 0.9087091290870913,
|
|
"train_speed(iter/s)": 0.122684
|
|
},
|
|
{
|
|
"epoch": 1.999764798118385,
|
|
"grad_norm": 0.811363935470581,
|
|
"learning_rate": 2.4943127011500483e-06,
|
|
"loss": 0.25793845653533937,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1595,
|
|
"token_acc": 0.9161427702291625,
|
|
"train_speed(iter/s)": 0.122787
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"grad_norm": 0.8322923183441162,
|
|
"learning_rate": 2.465941290153514e-06,
|
|
"loss": 0.2291651725769043,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1600,
|
|
"token_acc": 0.9299772392555897,
|
|
"train_speed(iter/s)": 0.122933
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"eval_loss": 0.32035958766937256,
|
|
"eval_runtime": 29.3301,
|
|
"eval_samples_per_second": 17.559,
|
|
"eval_steps_per_second": 4.398,
|
|
"eval_token_acc": 0.8932868429673142,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 2.0112896903175224,
|
|
"grad_norm": 0.8588864207267761,
|
|
"learning_rate": 2.4376792485274577e-06,
|
|
"loss": 0.2054748058319092,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1605,
|
|
"token_acc": 0.9124336719729088,
|
|
"train_speed(iter/s)": 0.122653
|
|
},
|
|
{
|
|
"epoch": 2.017561740493924,
|
|
"grad_norm": 0.7657065987586975,
|
|
"learning_rate": 2.409527796054863e-06,
|
|
"loss": 0.2039170742034912,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1610,
|
|
"token_acc": 0.929469197420671,
|
|
"train_speed(iter/s)": 0.122752
|
|
},
|
|
{
|
|
"epoch": 2.0238337906703254,
|
|
"grad_norm": 0.8790497183799744,
|
|
"learning_rate": 2.38148814774572e-06,
|
|
"loss": 0.18966434001922608,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1615,
|
|
"token_acc": 0.9288194444444444,
|
|
"train_speed(iter/s)": 0.122823
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"grad_norm": 0.9316287040710449,
|
|
"learning_rate": 2.353561513784566e-06,
|
|
"loss": 0.2005706548690796,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1620,
|
|
"token_acc": 0.9332470414201184,
|
|
"train_speed(iter/s)": 0.12292
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"eval_loss": 0.3415764570236206,
|
|
"eval_runtime": 29.383,
|
|
"eval_samples_per_second": 17.527,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.8920264950668761,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 2.036377891023128,
|
|
"grad_norm": 0.9010135531425476,
|
|
"learning_rate": 2.325749099478277e-06,
|
|
"loss": 0.20035338401794434,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1625,
|
|
"token_acc": 0.9104487073829418,
|
|
"train_speed(iter/s)": 0.122648
|
|
},
|
|
{
|
|
"epoch": 2.04264994119953,
|
|
"grad_norm": 0.8294356465339661,
|
|
"learning_rate": 2.29805210520403e-06,
|
|
"loss": 0.19484407901763917,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1630,
|
|
"token_acc": 0.9329941210850208,
|
|
"train_speed(iter/s)": 0.122735
|
|
},
|
|
{
|
|
"epoch": 2.048921991375931,
|
|
"grad_norm": 0.8767816424369812,
|
|
"learning_rate": 2.270471726357501e-06,
|
|
"loss": 0.19034754037857055,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1635,
|
|
"token_acc": 0.9369187962217178,
|
|
"train_speed(iter/s)": 0.122807
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"grad_norm": 0.9023549556732178,
|
|
"learning_rate": 2.243009153301276e-06,
|
|
"loss": 0.19750189781188965,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1640,
|
|
"token_acc": 0.930916058130287,
|
|
"train_speed(iter/s)": 0.122894
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"eval_loss": 0.3384056091308594,
|
|
"eval_runtime": 29.3915,
|
|
"eval_samples_per_second": 17.522,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8916600503028721,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 2.0614660917287337,
|
|
"grad_norm": 0.744107723236084,
|
|
"learning_rate": 2.215665571313468e-06,
|
|
"loss": 0.19824939966201782,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1645,
|
|
"token_acc": 0.9114719614180904,
|
|
"train_speed(iter/s)": 0.122646
|
|
},
|
|
{
|
|
"epoch": 2.067738141905135,
|
|
"grad_norm": 0.8998700380325317,
|
|
"learning_rate": 2.188442160536562e-06,
|
|
"loss": 0.19962520599365235,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1650,
|
|
"token_acc": 0.9246565057436744,
|
|
"train_speed(iter/s)": 0.122714
|
|
},
|
|
{
|
|
"epoch": 2.074010192081537,
|
|
"grad_norm": 0.8231090307235718,
|
|
"learning_rate": 2.1613400959264845e-06,
|
|
"loss": 0.1893744945526123,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1655,
|
|
"token_acc": 0.9325602140945585,
|
|
"train_speed(iter/s)": 0.122783
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"grad_norm": 1.0551718473434448,
|
|
"learning_rate": 2.1343605472018954e-06,
|
|
"loss": 0.19394491910934447,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1660,
|
|
"token_acc": 0.9273977016432177,
|
|
"train_speed(iter/s)": 0.122871
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"eval_loss": 0.34014904499053955,
|
|
"eval_runtime": 29.2198,
|
|
"eval_samples_per_second": 17.625,
|
|
"eval_steps_per_second": 4.415,
|
|
"eval_token_acc": 0.8916989156566302,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 2.0865542924343394,
|
|
"grad_norm": 0.9032732844352722,
|
|
"learning_rate": 2.1075046787936842e-06,
|
|
"loss": 0.20794956684112548,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1665,
|
|
"token_acc": 0.9062622258157065,
|
|
"train_speed(iter/s)": 0.122603
|
|
},
|
|
{
|
|
"epoch": 2.0928263426107407,
|
|
"grad_norm": 0.7685064673423767,
|
|
"learning_rate": 2.0807736497947436e-06,
|
|
"loss": 0.19878649711608887,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1670,
|
|
"token_acc": 0.9290486321054007,
|
|
"train_speed(iter/s)": 0.122682
|
|
},
|
|
{
|
|
"epoch": 2.0990983927871425,
|
|
"grad_norm": 0.7682995200157166,
|
|
"learning_rate": 2.0541686139099164e-06,
|
|
"loss": 0.19356679916381836,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1675,
|
|
"token_acc": 0.922647425159988,
|
|
"train_speed(iter/s)": 0.122764
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"grad_norm": 0.855737030506134,
|
|
"learning_rate": 2.0276907194062167e-06,
|
|
"loss": 0.20157759189605712,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1680,
|
|
"token_acc": 0.9333961552958915,
|
|
"train_speed(iter/s)": 0.122849
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"eval_loss": 0.3408814072608948,
|
|
"eval_runtime": 29.4394,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8919487643593602,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 2.111642493139945,
|
|
"grad_norm": 0.8940383791923523,
|
|
"learning_rate": 2.0013411090632638e-06,
|
|
"loss": 0.1950603485107422,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1685,
|
|
"token_acc": 0.9098292220113852,
|
|
"train_speed(iter/s)": 0.122586
|
|
},
|
|
{
|
|
"epoch": 2.1179145433163464,
|
|
"grad_norm": 0.8400514721870422,
|
|
"learning_rate": 1.9751209201239696e-06,
|
|
"loss": 0.19134198427200316,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1690,
|
|
"token_acc": 0.9370006770480704,
|
|
"train_speed(iter/s)": 0.122686
|
|
},
|
|
{
|
|
"epoch": 2.1241865934927477,
|
|
"grad_norm": 0.7992594838142395,
|
|
"learning_rate": 1.9490312842454425e-06,
|
|
"loss": 0.18724431991577148,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1695,
|
|
"token_acc": 0.9336970593674866,
|
|
"train_speed(iter/s)": 0.12277
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"grad_norm": 0.8192344903945923,
|
|
"learning_rate": 1.9230733274501525e-06,
|
|
"loss": 0.19678905010223388,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1700,
|
|
"token_acc": 0.931975970358646,
|
|
"train_speed(iter/s)": 0.122839
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"eval_loss": 0.3409229516983032,
|
|
"eval_runtime": 29.4398,
|
|
"eval_samples_per_second": 17.493,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8916822590764482,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 2.136730693845551,
|
|
"grad_norm": 0.8598861694335938,
|
|
"learning_rate": 1.8972481700773388e-06,
|
|
"loss": 0.2011383056640625,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1705,
|
|
"token_acc": 0.9084407844836927,
|
|
"train_speed(iter/s)": 0.122585
|
|
},
|
|
{
|
|
"epoch": 2.143002744021952,
|
|
"grad_norm": 0.851450502872467,
|
|
"learning_rate": 1.8715569267346368e-06,
|
|
"loss": 0.2008237361907959,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1710,
|
|
"token_acc": 0.934470600804872,
|
|
"train_speed(iter/s)": 0.122661
|
|
},
|
|
{
|
|
"epoch": 2.1492747941983534,
|
|
"grad_norm": 0.8260136246681213,
|
|
"learning_rate": 1.846000706249997e-06,
|
|
"loss": 0.19412180185317993,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1715,
|
|
"token_acc": 0.934304410514252,
|
|
"train_speed(iter/s)": 0.122733
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"grad_norm": 0.9584734439849854,
|
|
"learning_rate": 1.8205806116238055e-06,
|
|
"loss": 0.19354283809661865,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1720,
|
|
"token_acc": 0.9366102705522844,
|
|
"train_speed(iter/s)": 0.122798
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"eval_loss": 0.34049588441848755,
|
|
"eval_runtime": 29.3984,
|
|
"eval_samples_per_second": 17.518,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8918266161046922,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 2.1618188945511565,
|
|
"grad_norm": 0.8262621164321899,
|
|
"learning_rate": 1.7952977399812988e-06,
|
|
"loss": 0.19691638946533202,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1725,
|
|
"token_acc": 0.9087991263467711,
|
|
"train_speed(iter/s)": 0.122536
|
|
},
|
|
{
|
|
"epoch": 2.168090944727558,
|
|
"grad_norm": 0.8243085145950317,
|
|
"learning_rate": 1.7701531825251888e-06,
|
|
"loss": 0.20423364639282227,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1730,
|
|
"token_acc": 0.9323758228605625,
|
|
"train_speed(iter/s)": 0.122637
|
|
},
|
|
{
|
|
"epoch": 2.174362994903959,
|
|
"grad_norm": 0.8588405847549438,
|
|
"learning_rate": 1.7451480244885938e-06,
|
|
"loss": 0.20565853118896485,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1735,
|
|
"token_acc": 0.9293734801888142,
|
|
"train_speed(iter/s)": 0.122722
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"grad_norm": 0.8922966122627258,
|
|
"learning_rate": 1.720283345088178e-06,
|
|
"loss": 0.20658740997314454,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1740,
|
|
"token_acc": 0.9308320373250388,
|
|
"train_speed(iter/s)": 0.122814
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"eval_loss": 0.33896124362945557,
|
|
"eval_runtime": 29.4645,
|
|
"eval_samples_per_second": 17.479,
|
|
"eval_steps_per_second": 4.378,
|
|
"eval_token_acc": 0.8921264345479681,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 2.186907095256762,
|
|
"grad_norm": 0.8128474354743958,
|
|
"learning_rate": 1.695560217477582e-06,
|
|
"loss": 0.19282236099243164,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1745,
|
|
"token_acc": 0.9095768254522502,
|
|
"train_speed(iter/s)": 0.122545
|
|
},
|
|
{
|
|
"epoch": 2.1931791454331635,
|
|
"grad_norm": 0.7443967461585999,
|
|
"learning_rate": 1.6709797087011066e-06,
|
|
"loss": 0.1943533182144165,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1750,
|
|
"token_acc": 0.9320897479117504,
|
|
"train_speed(iter/s)": 0.122626
|
|
},
|
|
{
|
|
"epoch": 2.199451195609565,
|
|
"grad_norm": 0.7897108793258667,
|
|
"learning_rate": 1.6465428796476584e-06,
|
|
"loss": 0.1893579602241516,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1755,
|
|
"token_acc": 0.9279509242867141,
|
|
"train_speed(iter/s)": 0.122698
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"grad_norm": 0.9222955703735352,
|
|
"learning_rate": 1.6222507850049602e-06,
|
|
"loss": 0.20297529697418212,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1760,
|
|
"token_acc": 0.9288448547624409,
|
|
"train_speed(iter/s)": 0.122793
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"eval_loss": 0.34091004729270935,
|
|
"eval_runtime": 29.4544,
|
|
"eval_samples_per_second": 17.485,
|
|
"eval_steps_per_second": 4.38,
|
|
"eval_token_acc": 0.8917488853971761,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.211995295962368,
|
|
"grad_norm": 0.814859926700592,
|
|
"learning_rate": 1.598104473214031e-06,
|
|
"loss": 0.20043063163757324,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1765,
|
|
"token_acc": 0.9068337921265734,
|
|
"train_speed(iter/s)": 0.122535
|
|
},
|
|
{
|
|
"epoch": 2.218267346138769,
|
|
"grad_norm": 0.8000169396400452,
|
|
"learning_rate": 1.5741049864239383e-06,
|
|
"loss": 0.19192855358123778,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1770,
|
|
"token_acc": 0.9305805924968539,
|
|
"train_speed(iter/s)": 0.122614
|
|
},
|
|
{
|
|
"epoch": 2.2245393963151705,
|
|
"grad_norm": 0.9338832497596741,
|
|
"learning_rate": 1.550253360446815e-06,
|
|
"loss": 0.1987203598022461,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1775,
|
|
"token_acc": 0.938174715909091,
|
|
"train_speed(iter/s)": 0.122706
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"grad_norm": 0.8646144866943359,
|
|
"learning_rate": 1.5265506247131617e-06,
|
|
"loss": 0.19849686622619628,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1780,
|
|
"token_acc": 0.9281764423845293,
|
|
"train_speed(iter/s)": 0.122792
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"eval_loss": 0.3393869996070862,
|
|
"eval_runtime": 29.4398,
|
|
"eval_samples_per_second": 17.493,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8924151486044561,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.2370834966679736,
|
|
"grad_norm": 0.865871787071228,
|
|
"learning_rate": 1.5029978022274067e-06,
|
|
"loss": 0.21043546199798585,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1785,
|
|
"token_acc": 0.9090308690081499,
|
|
"train_speed(iter/s)": 0.122528
|
|
},
|
|
{
|
|
"epoch": 2.243355546844375,
|
|
"grad_norm": 0.8752254247665405,
|
|
"learning_rate": 1.47959590952376e-06,
|
|
"loss": 0.19589321613311766,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1790,
|
|
"token_acc": 0.9334750193974969,
|
|
"train_speed(iter/s)": 0.122614
|
|
},
|
|
{
|
|
"epoch": 2.249627597020776,
|
|
"grad_norm": 0.8532087802886963,
|
|
"learning_rate": 1.4563459566223358e-06,
|
|
"loss": 0.19277225732803344,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1795,
|
|
"token_acc": 0.9376329562721594,
|
|
"train_speed(iter/s)": 0.122703
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"grad_norm": 0.8500798344612122,
|
|
"learning_rate": 1.4332489469855698e-06,
|
|
"loss": 0.19048466682434081,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1800,
|
|
"token_acc": 0.9341178420485622,
|
|
"train_speed(iter/s)": 0.122772
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"eval_loss": 0.3418830335140228,
|
|
"eval_runtime": 29.4345,
|
|
"eval_samples_per_second": 17.496,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8921042257743922,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.262171697373579,
|
|
"grad_norm": 0.9023928046226501,
|
|
"learning_rate": 1.4103058774748923e-06,
|
|
"loss": 0.20214588642120362,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1805,
|
|
"token_acc": 0.9073493650364767,
|
|
"train_speed(iter/s)": 0.122533
|
|
},
|
|
{
|
|
"epoch": 2.2684437475499806,
|
|
"grad_norm": 0.874862790107727,
|
|
"learning_rate": 1.3875177383077233e-06,
|
|
"loss": 0.19704325199127198,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1810,
|
|
"token_acc": 0.9309168859008087,
|
|
"train_speed(iter/s)": 0.122602
|
|
},
|
|
{
|
|
"epoch": 2.274715797726382,
|
|
"grad_norm": 0.8751354217529297,
|
|
"learning_rate": 1.3648855130147216e-06,
|
|
"loss": 0.1942400574684143,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1815,
|
|
"token_acc": 0.9344223881096643,
|
|
"train_speed(iter/s)": 0.122696
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"grad_norm": 0.8818191885948181,
|
|
"learning_rate": 1.3424101783973403e-06,
|
|
"loss": 0.19853044748306276,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1820,
|
|
"token_acc": 0.9334029143751756,
|
|
"train_speed(iter/s)": 0.122774
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"eval_loss": 0.34076765179634094,
|
|
"eval_runtime": 29.4137,
|
|
"eval_samples_per_second": 17.509,
|
|
"eval_steps_per_second": 4.386,
|
|
"eval_token_acc": 0.8922596871894242,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.2872598980791845,
|
|
"grad_norm": 0.8647897839546204,
|
|
"learning_rate": 1.3200927044856714e-06,
|
|
"loss": 0.20430846214294435,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1825,
|
|
"token_acc": 0.9105302740430922,
|
|
"train_speed(iter/s)": 0.122528
|
|
},
|
|
{
|
|
"epoch": 2.293531948255586,
|
|
"grad_norm": 0.8771964907646179,
|
|
"learning_rate": 1.2979340544965745e-06,
|
|
"loss": 0.19436899423599244,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1830,
|
|
"token_acc": 0.9332179074944348,
|
|
"train_speed(iter/s)": 0.122614
|
|
},
|
|
{
|
|
"epoch": 2.2998039984319876,
|
|
"grad_norm": 0.8589149117469788,
|
|
"learning_rate": 1.2759351847921053e-06,
|
|
"loss": 0.20428872108459473,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1835,
|
|
"token_acc": 0.9258984534361904,
|
|
"train_speed(iter/s)": 0.122693
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"grad_norm": 0.8475139737129211,
|
|
"learning_rate": 1.25409704483824e-06,
|
|
"loss": 0.20542593002319337,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1840,
|
|
"token_acc": 0.9302525044599973,
|
|
"train_speed(iter/s)": 0.122772
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"eval_loss": 0.3399674594402313,
|
|
"eval_runtime": 29.4357,
|
|
"eval_samples_per_second": 17.496,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8921819564819081,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.31234809878479,
|
|
"grad_norm": 0.8466213345527649,
|
|
"learning_rate": 1.232420577163902e-06,
|
|
"loss": 0.1930600881576538,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1845,
|
|
"token_acc": 0.9117772908211558,
|
|
"train_speed(iter/s)": 0.122528
|
|
},
|
|
{
|
|
"epoch": 2.3186201489611915,
|
|
"grad_norm": 0.8181382417678833,
|
|
"learning_rate": 1.2109067173202731e-06,
|
|
"loss": 0.20191946029663085,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1850,
|
|
"token_acc": 0.9309912619968486,
|
|
"train_speed(iter/s)": 0.122613
|
|
},
|
|
{
|
|
"epoch": 2.3248921991375933,
|
|
"grad_norm": 0.9908396601676941,
|
|
"learning_rate": 1.1895563938404203e-06,
|
|
"loss": 0.20410680770874023,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1855,
|
|
"token_acc": 0.9304864290928396,
|
|
"train_speed(iter/s)": 0.122689
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"grad_norm": 0.9089633822441101,
|
|
"learning_rate": 1.1683705281992202e-06,
|
|
"loss": 0.19948985576629638,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1860,
|
|
"token_acc": 0.9321098763499766,
|
|
"train_speed(iter/s)": 0.122765
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"eval_loss": 0.3399353623390198,
|
|
"eval_runtime": 29.1818,
|
|
"eval_samples_per_second": 17.648,
|
|
"eval_steps_per_second": 4.421,
|
|
"eval_token_acc": 0.8923929398308802,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.337436299490396,
|
|
"grad_norm": 0.8357407450675964,
|
|
"learning_rate": 1.1473500347735927e-06,
|
|
"loss": 0.21080975532531737,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1865,
|
|
"token_acc": 0.9067027184646176,
|
|
"train_speed(iter/s)": 0.122557
|
|
},
|
|
{
|
|
"epoch": 2.343708349666797,
|
|
"grad_norm": 0.9168654084205627,
|
|
"learning_rate": 1.1264958208030224e-06,
|
|
"loss": 0.20281777381896973,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1870,
|
|
"token_acc": 0.9303914590747331,
|
|
"train_speed(iter/s)": 0.122644
|
|
},
|
|
{
|
|
"epoch": 2.349980399843199,
|
|
"grad_norm": 0.870764970779419,
|
|
"learning_rate": 1.105808786350423e-06,
|
|
"loss": 0.20570874214172363,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1875,
|
|
"token_acc": 0.9311500593533971,
|
|
"train_speed(iter/s)": 0.122727
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"grad_norm": 0.9401370286941528,
|
|
"learning_rate": 1.085289824263273e-06,
|
|
"loss": 0.19900286197662354,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1880,
|
|
"token_acc": 0.9296601826391011,
|
|
"train_speed(iter/s)": 0.122811
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"eval_loss": 0.33901476860046387,
|
|
"eval_runtime": 29.2027,
|
|
"eval_samples_per_second": 17.635,
|
|
"eval_steps_per_second": 4.417,
|
|
"eval_token_acc": 0.8928149065288242,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.3625245001960016,
|
|
"grad_norm": 0.8954982161521912,
|
|
"learning_rate": 1.0649398201350907e-06,
|
|
"loss": 0.19722338914871215,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1885,
|
|
"token_acc": 0.9103969754253308,
|
|
"train_speed(iter/s)": 0.12257
|
|
},
|
|
{
|
|
"epoch": 2.368796550372403,
|
|
"grad_norm": 0.8362477421760559,
|
|
"learning_rate": 1.044759652267207e-06,
|
|
"loss": 0.1907820224761963,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1890,
|
|
"token_acc": 0.9338316722037652,
|
|
"train_speed(iter/s)": 0.122633
|
|
},
|
|
{
|
|
"epoch": 2.375068600548804,
|
|
"grad_norm": 0.8261101245880127,
|
|
"learning_rate": 1.024750191630864e-06,
|
|
"loss": 0.20354986190795898,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1895,
|
|
"token_acc": 0.9313385826771654,
|
|
"train_speed(iter/s)": 0.122717
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"grad_norm": 0.8243074417114258,
|
|
"learning_rate": 1.0049123018296158e-06,
|
|
"loss": 0.1990055799484253,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1900,
|
|
"token_acc": 0.931945788964182,
|
|
"train_speed(iter/s)": 0.122795
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"eval_loss": 0.3398912847042084,
|
|
"eval_runtime": 29.3886,
|
|
"eval_samples_per_second": 17.524,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8929537113636742,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.3876127009016073,
|
|
"grad_norm": 0.8291247487068176,
|
|
"learning_rate": 9.852468390620624e-07,
|
|
"loss": 0.19908733367919923,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1905,
|
|
"token_acc": 0.9119964458846742,
|
|
"train_speed(iter/s)": 0.122563
|
|
},
|
|
{
|
|
"epoch": 2.3938847510780086,
|
|
"grad_norm": 0.9071137309074402,
|
|
"learning_rate": 9.65754652084896e-07,
|
|
"loss": 0.20061676502227782,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1910,
|
|
"token_acc": 0.9315043133770224,
|
|
"train_speed(iter/s)": 0.122633
|
|
},
|
|
{
|
|
"epoch": 2.40015680125441,
|
|
"grad_norm": 0.8123340606689453,
|
|
"learning_rate": 9.464365821762611e-07,
|
|
"loss": 0.2007359504699707,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1915,
|
|
"token_acc": 0.9310011111509373,
|
|
"train_speed(iter/s)": 0.122704
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"grad_norm": 0.9131314754486084,
|
|
"learning_rate": 9.272934630994579e-07,
|
|
"loss": 0.2020493984222412,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1920,
|
|
"token_acc": 0.9327723569957926,
|
|
"train_speed(iter/s)": 0.122789
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"eval_loss": 0.3394792377948761,
|
|
"eval_runtime": 29.4324,
|
|
"eval_samples_per_second": 17.498,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8929315025900982,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.412700901607213,
|
|
"grad_norm": 0.8938679099082947,
|
|
"learning_rate": 9.083261210669458e-07,
|
|
"loss": 0.19544891119003296,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1925,
|
|
"token_acc": 0.9099344547105741,
|
|
"train_speed(iter/s)": 0.122556
|
|
},
|
|
{
|
|
"epoch": 2.4189729517836143,
|
|
"grad_norm": 0.843400239944458,
|
|
"learning_rate": 8.895353747046903e-07,
|
|
"loss": 0.20389878749847412,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1930,
|
|
"token_acc": 0.9224234943914038,
|
|
"train_speed(iter/s)": 0.122642
|
|
},
|
|
{
|
|
"epoch": 2.4252450019600156,
|
|
"grad_norm": 0.9117215871810913,
|
|
"learning_rate": 8.70922035016829e-07,
|
|
"loss": 0.20622677803039552,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1935,
|
|
"token_acc": 0.9284166479862512,
|
|
"train_speed(iter/s)": 0.122721
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"grad_norm": 0.9367381930351257,
|
|
"learning_rate": 8.524869053506718e-07,
|
|
"loss": 0.20433897972106935,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1940,
|
|
"token_acc": 0.9362286970863112,
|
|
"train_speed(iter/s)": 0.122802
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"eval_loss": 0.33946192264556885,
|
|
"eval_runtime": 29.2561,
|
|
"eval_samples_per_second": 17.603,
|
|
"eval_steps_per_second": 4.409,
|
|
"eval_token_acc": 0.8925650578260942,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.4377891023128186,
|
|
"grad_norm": 0.8426703214645386,
|
|
"learning_rate": 8.342307813620254e-07,
|
|
"loss": 0.1967821955680847,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1945,
|
|
"token_acc": 0.9112145208413204,
|
|
"train_speed(iter/s)": 0.122596
|
|
},
|
|
{
|
|
"epoch": 2.44406115248922,
|
|
"grad_norm": 0.845736563205719,
|
|
"learning_rate": 8.161544509808522e-07,
|
|
"loss": 0.1979314088821411,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1950,
|
|
"token_acc": 0.9301829610506229,
|
|
"train_speed(iter/s)": 0.122667
|
|
},
|
|
{
|
|
"epoch": 2.4503332026656213,
|
|
"grad_norm": 0.9228907823562622,
|
|
"learning_rate": 7.982586943772663e-07,
|
|
"loss": 0.19447792768478395,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1955,
|
|
"token_acc": 0.9315416813363063,
|
|
"train_speed(iter/s)": 0.122739
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"grad_norm": 0.8696323037147522,
|
|
"learning_rate": 7.805442839278643e-07,
|
|
"loss": 0.2015920639038086,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1960,
|
|
"token_acc": 0.9276550395540435,
|
|
"train_speed(iter/s)": 0.12281
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"eval_loss": 0.3402659595012665,
|
|
"eval_runtime": 29.1749,
|
|
"eval_samples_per_second": 17.652,
|
|
"eval_steps_per_second": 4.422,
|
|
"eval_token_acc": 0.8927316236279143,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.4628773030184243,
|
|
"grad_norm": 0.8550149202346802,
|
|
"learning_rate": 7.630119841823808e-07,
|
|
"loss": 0.19550955295562744,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1965,
|
|
"token_acc": 0.9109677753600663,
|
|
"train_speed(iter/s)": 0.122595
|
|
},
|
|
{
|
|
"epoch": 2.4691493531948256,
|
|
"grad_norm": 0.8550127148628235,
|
|
"learning_rate": 7.456625518306976e-07,
|
|
"loss": 0.20470118522644043,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1970,
|
|
"token_acc": 0.9361644784969007,
|
|
"train_speed(iter/s)": 0.122672
|
|
},
|
|
{
|
|
"epoch": 2.475421403371227,
|
|
"grad_norm": 0.8760136365890503,
|
|
"learning_rate": 7.284967356701839e-07,
|
|
"loss": 0.19109119176864625,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1975,
|
|
"token_acc": 0.9317175239755885,
|
|
"train_speed(iter/s)": 0.122722
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"grad_norm": 0.8244690299034119,
|
|
"learning_rate": 7.115152765733768e-07,
|
|
"loss": 0.19548358917236328,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1980,
|
|
"token_acc": 0.9295848857777276,
|
|
"train_speed(iter/s)": 0.122806
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"eval_loss": 0.33928003907203674,
|
|
"eval_runtime": 29.3453,
|
|
"eval_samples_per_second": 17.55,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8926039231798522,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.4879655037240296,
|
|
"grad_norm": 0.8801511526107788,
|
|
"learning_rate": 6.94718907456009e-07,
|
|
"loss": 0.20571486949920653,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1985,
|
|
"token_acc": 0.9119239420411153,
|
|
"train_speed(iter/s)": 0.122584
|
|
},
|
|
{
|
|
"epoch": 2.4942375539004313,
|
|
"grad_norm": 0.8793259263038635,
|
|
"learning_rate": 6.781083532453702e-07,
|
|
"loss": 0.19008111953735352,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1990,
|
|
"token_acc": 0.9300288504988008,
|
|
"train_speed(iter/s)": 0.122656
|
|
},
|
|
{
|
|
"epoch": 2.5005096040768326,
|
|
"grad_norm": 0.8719813823699951,
|
|
"learning_rate": 6.61684330849025e-07,
|
|
"loss": 0.20494444370269777,
|
|
"memory(GiB)": 35.31,
|
|
"step": 1995,
|
|
"token_acc": 0.9264506459210621,
|
|
"train_speed(iter/s)": 0.122733
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"grad_norm": 0.8206390738487244,
|
|
"learning_rate": 6.454475491238682e-07,
|
|
"loss": 0.21190786361694336,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2000,
|
|
"token_acc": 0.9282438731017892,
|
|
"train_speed(iter/s)": 0.122814
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"eval_loss": 0.3389175236225128,
|
|
"eval_runtime": 29.4243,
|
|
"eval_samples_per_second": 17.503,
|
|
"eval_steps_per_second": 4.384,
|
|
"eval_token_acc": 0.8926705495005802,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.5130537044296353,
|
|
"grad_norm": 0.8637955188751221,
|
|
"learning_rate": 6.293987088455355e-07,
|
|
"loss": 0.18972909450531006,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2005,
|
|
"token_acc": 0.9111111111111111,
|
|
"train_speed(iter/s)": 0.122588
|
|
},
|
|
{
|
|
"epoch": 2.5193257546060366,
|
|
"grad_norm": 0.8693656325340271,
|
|
"learning_rate": 6.135385026781476e-07,
|
|
"loss": 0.19535259008407593,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2010,
|
|
"token_acc": 0.9307692307692308,
|
|
"train_speed(iter/s)": 0.122665
|
|
},
|
|
{
|
|
"epoch": 2.5255978047824383,
|
|
"grad_norm": 0.8843157887458801,
|
|
"learning_rate": 5.978676151444285e-07,
|
|
"loss": 0.19748587608337403,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2015,
|
|
"token_acc": 0.932608875299562,
|
|
"train_speed(iter/s)": 0.122743
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"grad_norm": 0.7771234512329102,
|
|
"learning_rate": 5.823867225961516e-07,
|
|
"loss": 0.20223774909973144,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2020,
|
|
"token_acc": 0.9318809052241956,
|
|
"train_speed(iter/s)": 0.122809
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"eval_loss": 0.3387446999549866,
|
|
"eval_runtime": 29.4213,
|
|
"eval_samples_per_second": 17.504,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8927593845948842,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.538141905135241,
|
|
"grad_norm": 0.8923653364181519,
|
|
"learning_rate": 5.670964931849521e-07,
|
|
"loss": 0.20152680873870848,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2025,
|
|
"token_acc": 0.9079145999568686,
|
|
"train_speed(iter/s)": 0.122607
|
|
},
|
|
{
|
|
"epoch": 2.5444139553116427,
|
|
"grad_norm": 0.9194443225860596,
|
|
"learning_rate": 5.519975868334914e-07,
|
|
"loss": 0.18218059539794923,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2030,
|
|
"token_acc": 0.9371556217423679,
|
|
"train_speed(iter/s)": 0.122682
|
|
},
|
|
{
|
|
"epoch": 2.550686005488044,
|
|
"grad_norm": 0.9134296774864197,
|
|
"learning_rate": 5.370906552069721e-07,
|
|
"loss": 0.21519947052001953,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2035,
|
|
"token_acc": 0.9308896388310476,
|
|
"train_speed(iter/s)": 0.122768
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"grad_norm": 0.8432523608207703,
|
|
"learning_rate": 5.22376341685013e-07,
|
|
"loss": 0.19406379461288453,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2040,
|
|
"token_acc": 0.9379203093476799,
|
|
"train_speed(iter/s)": 0.122832
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"eval_loss": 0.3386911153793335,
|
|
"eval_runtime": 29.3737,
|
|
"eval_samples_per_second": 17.533,
|
|
"eval_steps_per_second": 4.392,
|
|
"eval_token_acc": 0.8926372363402162,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.5632301058408467,
|
|
"grad_norm": 0.8082075715065002,
|
|
"learning_rate": 5.07855281333881e-07,
|
|
"loss": 0.20029840469360352,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2045,
|
|
"token_acc": 0.9093570973901973,
|
|
"train_speed(iter/s)": 0.122621
|
|
},
|
|
{
|
|
"epoch": 2.569502156017248,
|
|
"grad_norm": 0.8321412801742554,
|
|
"learning_rate": 4.935281008790843e-07,
|
|
"loss": 0.19831552505493164,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2050,
|
|
"token_acc": 0.9315964443159644,
|
|
"train_speed(iter/s)": 0.122696
|
|
},
|
|
{
|
|
"epoch": 2.5757742061936497,
|
|
"grad_norm": 0.836764395236969,
|
|
"learning_rate": 4.793954186783195e-07,
|
|
"loss": 0.20497620105743408,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2055,
|
|
"token_acc": 0.9296713578417996,
|
|
"train_speed(iter/s)": 0.122776
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"grad_norm": 0.8858775496482849,
|
|
"learning_rate": 4.6545784469478386e-07,
|
|
"loss": 0.1925274133682251,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2060,
|
|
"token_acc": 0.9290921363482957,
|
|
"train_speed(iter/s)": 0.122853
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"eval_loss": 0.33907607197761536,
|
|
"eval_runtime": 29.2969,
|
|
"eval_samples_per_second": 17.579,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.8925872665996702,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.5883183065464523,
|
|
"grad_norm": 0.8445903062820435,
|
|
"learning_rate": 4.5171598047085153e-07,
|
|
"loss": 0.2032531976699829,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2065,
|
|
"token_acc": 0.9114080767856231,
|
|
"train_speed(iter/s)": 0.122651
|
|
},
|
|
{
|
|
"epoch": 2.5945903567228537,
|
|
"grad_norm": 0.8861658573150635,
|
|
"learning_rate": 4.381704191021119e-07,
|
|
"loss": 0.20241761207580566,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2070,
|
|
"token_acc": 0.9290917036929128,
|
|
"train_speed(iter/s)": 0.12273
|
|
},
|
|
{
|
|
"epoch": 2.600862406899255,
|
|
"grad_norm": 0.9306016564369202,
|
|
"learning_rate": 4.248217452117653e-07,
|
|
"loss": 0.19754456281661986,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2075,
|
|
"token_acc": 0.9342051643018561,
|
|
"train_speed(iter/s)": 0.122795
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"grad_norm": 0.8931999206542969,
|
|
"learning_rate": 4.1167053492540023e-07,
|
|
"loss": 0.20364174842834473,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2080,
|
|
"token_acc": 0.927598729005901,
|
|
"train_speed(iter/s)": 0.122864
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"eval_loss": 0.3389338552951813,
|
|
"eval_runtime": 29.4394,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8927205192411262,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.613406507252058,
|
|
"grad_norm": 0.9079094529151917,
|
|
"learning_rate": 3.987173558461199e-07,
|
|
"loss": 0.2029134750366211,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2085,
|
|
"token_acc": 0.9080656826765958,
|
|
"train_speed(iter/s)": 0.122659
|
|
},
|
|
{
|
|
"epoch": 2.6196785574284593,
|
|
"grad_norm": 0.8731334209442139,
|
|
"learning_rate": 3.8596276703004974e-07,
|
|
"loss": 0.1946258783340454,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2090,
|
|
"token_acc": 0.9326196473551638,
|
|
"train_speed(iter/s)": 0.122729
|
|
},
|
|
{
|
|
"epoch": 2.625950607604861,
|
|
"grad_norm": 0.858302652835846,
|
|
"learning_rate": 3.7340731896220393e-07,
|
|
"loss": 0.19946659803390504,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2095,
|
|
"token_acc": 0.9289610347192736,
|
|
"train_speed(iter/s)": 0.1228
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"grad_norm": 0.9710678458213806,
|
|
"learning_rate": 3.6105155353273305e-07,
|
|
"loss": 0.19640454053878784,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2100,
|
|
"token_acc": 0.9349016126645954,
|
|
"train_speed(iter/s)": 0.122853
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"eval_loss": 0.33849242329597473,
|
|
"eval_runtime": 29.4901,
|
|
"eval_samples_per_second": 17.464,
|
|
"eval_steps_per_second": 4.374,
|
|
"eval_token_acc": 0.8928537718825822,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.6384947079576637,
|
|
"grad_norm": 0.9348801970481873,
|
|
"learning_rate": 3.488960040135303e-07,
|
|
"loss": 0.19354541301727296,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2105,
|
|
"token_acc": 0.9099158577870177,
|
|
"train_speed(iter/s)": 0.122646
|
|
},
|
|
{
|
|
"epoch": 2.644766758134065,
|
|
"grad_norm": 0.8546217679977417,
|
|
"learning_rate": 3.369411950352175e-07,
|
|
"loss": 0.19183013439178467,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2110,
|
|
"token_acc": 0.9336448231183007,
|
|
"train_speed(iter/s)": 0.122714
|
|
},
|
|
{
|
|
"epoch": 2.6510388083104663,
|
|
"grad_norm": 0.8211365938186646,
|
|
"learning_rate": 3.251876425645051e-07,
|
|
"loss": 0.1970944881439209,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2115,
|
|
"token_acc": 0.9344840840001397,
|
|
"train_speed(iter/s)": 0.122778
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"grad_norm": 0.9623558521270752,
|
|
"learning_rate": 3.136358538819162e-07,
|
|
"loss": 0.21173155307769775,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2120,
|
|
"token_acc": 0.9270025343675601,
|
|
"train_speed(iter/s)": 0.122858
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"eval_loss": 0.33901315927505493,
|
|
"eval_runtime": 29.3435,
|
|
"eval_samples_per_second": 17.551,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8928426674957942,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.6635829086632694,
|
|
"grad_norm": 0.8673403263092041,
|
|
"learning_rate": 3.0228632755990197e-07,
|
|
"loss": 0.1995314836502075,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2125,
|
|
"token_acc": 0.9100443616846486,
|
|
"train_speed(iter/s)": 0.122656
|
|
},
|
|
{
|
|
"epoch": 2.6698549588396707,
|
|
"grad_norm": 0.8678857684135437,
|
|
"learning_rate": 2.911395534413147e-07,
|
|
"loss": 0.20220797061920165,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2130,
|
|
"token_acc": 0.9269149418341139,
|
|
"train_speed(iter/s)": 0.122725
|
|
},
|
|
{
|
|
"epoch": 2.676127009016072,
|
|
"grad_norm": 0.8663479089736938,
|
|
"learning_rate": 2.8019601261827123e-07,
|
|
"loss": 0.19415628910064697,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2135,
|
|
"token_acc": 0.9364371994839921,
|
|
"train_speed(iter/s)": 0.122789
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"grad_norm": 0.9166416525840759,
|
|
"learning_rate": 2.694561774113863e-07,
|
|
"loss": 0.2000800371170044,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2140,
|
|
"token_acc": 0.9343491222259026,
|
|
"train_speed(iter/s)": 0.122866
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"eval_loss": 0.3389025926589966,
|
|
"eval_runtime": 29.1862,
|
|
"eval_samples_per_second": 17.645,
|
|
"eval_steps_per_second": 4.42,
|
|
"eval_token_acc": 0.8927649367882782,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.688671109368875,
|
|
"grad_norm": 0.924248993396759,
|
|
"learning_rate": 2.5892051134939256e-07,
|
|
"loss": 0.19067001342773438,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2145,
|
|
"token_acc": 0.9105434393007803,
|
|
"train_speed(iter/s)": 0.122659
|
|
},
|
|
{
|
|
"epoch": 2.6949431595452764,
|
|
"grad_norm": 0.9541825652122498,
|
|
"learning_rate": 2.485894691491253e-07,
|
|
"loss": 0.19531885385513306,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2150,
|
|
"token_acc": 0.9327190236696796,
|
|
"train_speed(iter/s)": 0.122731
|
|
},
|
|
{
|
|
"epoch": 2.7012152097216777,
|
|
"grad_norm": 0.7837492227554321,
|
|
"learning_rate": 2.384634966959076e-07,
|
|
"loss": 0.1978324294090271,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2155,
|
|
"token_acc": 0.9313808767588728,
|
|
"train_speed(iter/s)": 0.122802
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"grad_norm": 0.9158398509025574,
|
|
"learning_rate": 2.2854303102429808e-07,
|
|
"loss": 0.19039928913116455,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2160,
|
|
"token_acc": 0.9376352705410822,
|
|
"train_speed(iter/s)": 0.122861
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"eval_loss": 0.33926960825920105,
|
|
"eval_runtime": 29.5457,
|
|
"eval_samples_per_second": 17.431,
|
|
"eval_steps_per_second": 4.366,
|
|
"eval_token_acc": 0.8928482196891883,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.7137593100744803,
|
|
"grad_norm": 0.9085947871208191,
|
|
"learning_rate": 2.1882850029923463e-07,
|
|
"loss": 0.1978399395942688,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2165,
|
|
"token_acc": 0.9107124038360207,
|
|
"train_speed(iter/s)": 0.122668
|
|
},
|
|
{
|
|
"epoch": 2.720031360250882,
|
|
"grad_norm": 0.8751495480537415,
|
|
"learning_rate": 2.093203237975483e-07,
|
|
"loss": 0.199626362323761,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2170,
|
|
"token_acc": 0.9323327305605786,
|
|
"train_speed(iter/s)": 0.122732
|
|
},
|
|
{
|
|
"epoch": 2.7263034104272834,
|
|
"grad_norm": 0.8074597716331482,
|
|
"learning_rate": 2.0001891188987265e-07,
|
|
"loss": 0.19364542961120607,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2175,
|
|
"token_acc": 0.9388682499668303,
|
|
"train_speed(iter/s)": 0.122783
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"grad_norm": 0.8650562763214111,
|
|
"learning_rate": 1.9092466602293247e-07,
|
|
"loss": 0.20354480743408204,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2180,
|
|
"token_acc": 0.9326430478389495,
|
|
"train_speed(iter/s)": 0.122852
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"eval_loss": 0.3389414846897125,
|
|
"eval_runtime": 29.3624,
|
|
"eval_samples_per_second": 17.539,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8927538324014902,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.7388475107800865,
|
|
"grad_norm": 0.9609673619270325,
|
|
"learning_rate": 1.8203797870221197e-07,
|
|
"loss": 0.19662023782730104,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2185,
|
|
"token_acc": 0.9078222548659567,
|
|
"train_speed(iter/s)": 0.122635
|
|
},
|
|
{
|
|
"epoch": 2.745119560956488,
|
|
"grad_norm": 0.8336656093597412,
|
|
"learning_rate": 1.7335923347502003e-07,
|
|
"loss": 0.19498822689056397,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2190,
|
|
"token_acc": 0.9327082366973692,
|
|
"train_speed(iter/s)": 0.122704
|
|
},
|
|
{
|
|
"epoch": 2.751391611132889,
|
|
"grad_norm": 0.9428432583808899,
|
|
"learning_rate": 1.6488880491393467e-07,
|
|
"loss": 0.19512466192245484,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2195,
|
|
"token_acc": 0.9311785670394495,
|
|
"train_speed(iter/s)": 0.12277
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"grad_norm": 0.8996681571006775,
|
|
"learning_rate": 1.5662705860063465e-07,
|
|
"loss": 0.19020618200302125,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2200,
|
|
"token_acc": 0.9422429845480382,
|
|
"train_speed(iter/s)": 0.122831
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"eval_loss": 0.3384985029697418,
|
|
"eval_runtime": 29.4137,
|
|
"eval_samples_per_second": 17.509,
|
|
"eval_steps_per_second": 4.386,
|
|
"eval_token_acc": 0.8928870850429462,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.7639357114856917,
|
|
"grad_norm": 0.8269793391227722,
|
|
"learning_rate": 1.485743511101234e-07,
|
|
"loss": 0.20108513832092284,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2205,
|
|
"token_acc": 0.9105911137718317,
|
|
"train_speed(iter/s)": 0.122632
|
|
},
|
|
{
|
|
"epoch": 2.7702077616620935,
|
|
"grad_norm": 0.8052636384963989,
|
|
"learning_rate": 1.4073102999534017e-07,
|
|
"loss": 0.19754087924957275,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2210,
|
|
"token_acc": 0.9333692597867508,
|
|
"train_speed(iter/s)": 0.122702
|
|
},
|
|
{
|
|
"epoch": 2.776479811838495,
|
|
"grad_norm": 0.8950125575065613,
|
|
"learning_rate": 1.3309743377215468e-07,
|
|
"loss": 0.19114834070205688,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2215,
|
|
"token_acc": 0.9366596409622793,
|
|
"train_speed(iter/s)": 0.122756
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"grad_norm": 0.8540958166122437,
|
|
"learning_rate": 1.2567389190476287e-07,
|
|
"loss": 0.2070404052734375,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2220,
|
|
"token_acc": 0.9272875816993464,
|
|
"train_speed(iter/s)": 0.122829
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"eval_loss": 0.33857327699661255,
|
|
"eval_runtime": 29.3528,
|
|
"eval_samples_per_second": 17.545,
|
|
"eval_steps_per_second": 4.395,
|
|
"eval_token_acc": 0.8927593845948842,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.7890239121912974,
|
|
"grad_norm": 0.8678739666938782,
|
|
"learning_rate": 1.1846072479146431e-07,
|
|
"loss": 0.18475788831710815,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2225,
|
|
"token_acc": 0.91176622304756,
|
|
"train_speed(iter/s)": 0.122618
|
|
},
|
|
{
|
|
"epoch": 2.7952959623676987,
|
|
"grad_norm": 0.9150513410568237,
|
|
"learning_rate": 1.114582437508327e-07,
|
|
"loss": 0.20294113159179689,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2230,
|
|
"token_acc": 0.9297203746436813,
|
|
"train_speed(iter/s)": 0.12269
|
|
},
|
|
{
|
|
"epoch": 2.8015680125441005,
|
|
"grad_norm": 0.9211878776550293,
|
|
"learning_rate": 1.0466675100828383e-07,
|
|
"loss": 0.19586080312728882,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2235,
|
|
"token_acc": 0.926397298076527,
|
|
"train_speed(iter/s)": 0.122748
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"grad_norm": 0.8305221199989319,
|
|
"learning_rate": 9.808653968302607e-08,
|
|
"loss": 0.18916590213775636,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2240,
|
|
"token_acc": 0.9314254859611231,
|
|
"train_speed(iter/s)": 0.122814
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"eval_loss": 0.3386108875274658,
|
|
"eval_runtime": 29.4168,
|
|
"eval_samples_per_second": 17.507,
|
|
"eval_steps_per_second": 4.385,
|
|
"eval_token_acc": 0.8929481591702803,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.814112112896903,
|
|
"grad_norm": 0.8349918127059937,
|
|
"learning_rate": 9.17178937754143e-08,
|
|
"loss": 0.19523937702178956,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2245,
|
|
"token_acc": 0.9112593067402988,
|
|
"train_speed(iter/s)": 0.122615
|
|
},
|
|
{
|
|
"epoch": 2.820384163073305,
|
|
"grad_norm": 0.7576460838317871,
|
|
"learning_rate": 8.556108815468756e-08,
|
|
"loss": 0.18766900300979614,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2250,
|
|
"token_acc": 0.9345958040143973,
|
|
"train_speed(iter/s)": 0.122681
|
|
},
|
|
{
|
|
"epoch": 2.8266562132497057,
|
|
"grad_norm": 0.8792471289634705,
|
|
"learning_rate": 7.961638854711296e-08,
|
|
"loss": 0.19756540060043334,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2255,
|
|
"token_acc": 0.9329730890352133,
|
|
"train_speed(iter/s)": 0.122737
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"grad_norm": 0.9662355780601501,
|
|
"learning_rate": 7.388405152450706e-08,
|
|
"loss": 0.19980876445770263,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2260,
|
|
"token_acc": 0.9275946077635212,
|
|
"train_speed(iter/s)": 0.122811
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"eval_loss": 0.3388676047325134,
|
|
"eval_runtime": 29.3235,
|
|
"eval_samples_per_second": 17.563,
|
|
"eval_steps_per_second": 4.399,
|
|
"eval_token_acc": 0.8927760411750663,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.839200313602509,
|
|
"grad_norm": 0.8288848400115967,
|
|
"learning_rate": 6.836432449317255e-08,
|
|
"loss": 0.19607880115509033,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2265,
|
|
"token_acc": 0.9086215087640337,
|
|
"train_speed(iter/s)": 0.122615
|
|
},
|
|
{
|
|
"epoch": 2.84547236377891,
|
|
"grad_norm": 0.8882344961166382,
|
|
"learning_rate": 6.305744568321281e-08,
|
|
"loss": 0.19206061363220214,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2270,
|
|
"token_acc": 0.9301113088095336,
|
|
"train_speed(iter/s)": 0.122679
|
|
},
|
|
{
|
|
"epoch": 2.851744413955312,
|
|
"grad_norm": 0.8858575820922852,
|
|
"learning_rate": 5.7963644138254175e-08,
|
|
"loss": 0.1968652129173279,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2275,
|
|
"token_acc": 0.9304818328139447,
|
|
"train_speed(iter/s)": 0.122744
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"grad_norm": 0.8176801800727844,
|
|
"learning_rate": 5.308313970555812e-08,
|
|
"loss": 0.2044682025909424,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2280,
|
|
"token_acc": 0.9269594335344797,
|
|
"train_speed(iter/s)": 0.122817
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"eval_loss": 0.3388592600822449,
|
|
"eval_runtime": 29.4082,
|
|
"eval_samples_per_second": 17.512,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.8928815328495522,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.8642885143081145,
|
|
"grad_norm": 0.9048309922218323,
|
|
"learning_rate": 4.841614302653341e-08,
|
|
"loss": 0.18790122270584106,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2285,
|
|
"token_acc": 0.9097248097088412,
|
|
"train_speed(iter/s)": 0.122613
|
|
},
|
|
{
|
|
"epoch": 2.870560564484516,
|
|
"grad_norm": 0.911831259727478,
|
|
"learning_rate": 4.396285552764557e-08,
|
|
"loss": 0.19619462490081788,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2290,
|
|
"token_acc": 0.9307579243353783,
|
|
"train_speed(iter/s)": 0.122691
|
|
},
|
|
{
|
|
"epoch": 2.876832614660917,
|
|
"grad_norm": 0.9140892624855042,
|
|
"learning_rate": 3.9723469411723226e-08,
|
|
"loss": 0.1958878755569458,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2295,
|
|
"token_acc": 0.9350311098249168,
|
|
"train_speed(iter/s)": 0.122756
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"grad_norm": 0.8358930945396423,
|
|
"learning_rate": 3.5698167649660384e-08,
|
|
"loss": 0.18899658918380738,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2300,
|
|
"token_acc": 0.933035064746108,
|
|
"train_speed(iter/s)": 0.122816
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"eval_loss": 0.3388690948486328,
|
|
"eval_runtime": 29.3534,
|
|
"eval_samples_per_second": 17.545,
|
|
"eval_steps_per_second": 4.395,
|
|
"eval_token_acc": 0.8928926372363403,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.88937671501372,
|
|
"grad_norm": 0.902189314365387,
|
|
"learning_rate": 3.188712397252325e-08,
|
|
"loss": 0.19353920221328735,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2305,
|
|
"token_acc": 0.9086479591836735,
|
|
"train_speed(iter/s)": 0.122622
|
|
},
|
|
{
|
|
"epoch": 2.8956487651901215,
|
|
"grad_norm": 0.9097030758857727,
|
|
"learning_rate": 2.8290502864049553e-08,
|
|
"loss": 0.1854855537414551,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2310,
|
|
"token_acc": 0.9400766855552917,
|
|
"train_speed(iter/s)": 0.122682
|
|
},
|
|
{
|
|
"epoch": 2.901920815366523,
|
|
"grad_norm": 0.7989513278007507,
|
|
"learning_rate": 2.4908459553549257e-08,
|
|
"loss": 0.1963629961013794,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2315,
|
|
"token_acc": 0.9324489715346046,
|
|
"train_speed(iter/s)": 0.122749
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"grad_norm": 0.9992919564247131,
|
|
"learning_rate": 2.174114000920713e-08,
|
|
"loss": 0.20053634643554688,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2320,
|
|
"token_acc": 0.9333678449933441,
|
|
"train_speed(iter/s)": 0.122817
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"eval_loss": 0.3389608860015869,
|
|
"eval_runtime": 29.2024,
|
|
"eval_samples_per_second": 17.636,
|
|
"eval_steps_per_second": 4.417,
|
|
"eval_token_acc": 0.8928149065288242,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.914464915719326,
|
|
"grad_norm": 0.8522620797157288,
|
|
"learning_rate": 1.878868093177999e-08,
|
|
"loss": 0.19264475107192994,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2325,
|
|
"token_acc": 0.9097824352359168,
|
|
"train_speed(iter/s)": 0.122641
|
|
},
|
|
{
|
|
"epoch": 2.920736965895727,
|
|
"grad_norm": 0.8912385106086731,
|
|
"learning_rate": 1.6051209748698116e-08,
|
|
"loss": 0.1977926969528198,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2330,
|
|
"token_acc": 0.9306182995077877,
|
|
"train_speed(iter/s)": 0.122698
|
|
},
|
|
{
|
|
"epoch": 2.9270090160721285,
|
|
"grad_norm": 0.8228368759155273,
|
|
"learning_rate": 1.3528844608566848e-08,
|
|
"loss": 0.19549624919891356,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2335,
|
|
"token_acc": 0.9288313505948216,
|
|
"train_speed(iter/s)": 0.122753
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"grad_norm": 0.9116573929786682,
|
|
"learning_rate": 1.1221694376064018e-08,
|
|
"loss": 0.2056950092315674,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2340,
|
|
"token_acc": 0.9298293985869378,
|
|
"train_speed(iter/s)": 0.122821
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"eval_loss": 0.33883053064346313,
|
|
"eval_runtime": 29.3994,
|
|
"eval_samples_per_second": 17.517,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8927760411750663,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.9395531164249316,
|
|
"grad_norm": 0.9035621881484985,
|
|
"learning_rate": 9.129858627244802e-09,
|
|
"loss": 0.1941608190536499,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2345,
|
|
"token_acc": 0.9095595779446971,
|
|
"train_speed(iter/s)": 0.122637
|
|
},
|
|
{
|
|
"epoch": 2.945825166601333,
|
|
"grad_norm": 1.024281620979309,
|
|
"learning_rate": 7.25342764524184e-09,
|
|
"loss": 0.19218976497650148,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2350,
|
|
"token_acc": 0.9338138445777843,
|
|
"train_speed(iter/s)": 0.122699
|
|
},
|
|
{
|
|
"epoch": 2.952097216777734,
|
|
"grad_norm": 0.8680444955825806,
|
|
"learning_rate": 5.592482416369449e-09,
|
|
"loss": 0.1976174831390381,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2355,
|
|
"token_acc": 0.9297104920949969,
|
|
"train_speed(iter/s)": 0.122764
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"grad_norm": 0.8660714626312256,
|
|
"learning_rate": 4.147094626628656e-09,
|
|
"loss": 0.19633731842041016,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2360,
|
|
"token_acc": 0.9261772040487017,
|
|
"train_speed(iter/s)": 0.122824
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"eval_loss": 0.33882075548171997,
|
|
"eval_runtime": 29.461,
|
|
"eval_samples_per_second": 17.481,
|
|
"eval_steps_per_second": 4.379,
|
|
"eval_token_acc": 0.8928426674957942,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.9646413171305372,
|
|
"grad_norm": 0.8004366159439087,
|
|
"learning_rate": 2.9173266586113303e-09,
|
|
"loss": 0.2010037899017334,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2365,
|
|
"token_acc": 0.9107686175513187,
|
|
"train_speed(iter/s)": 0.122617
|
|
},
|
|
{
|
|
"epoch": 2.9709133673069386,
|
|
"grad_norm": 0.8887699842453003,
|
|
"learning_rate": 1.9032315888106724e-09,
|
|
"loss": 0.19576088190078736,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2370,
|
|
"token_acc": 0.9332804385458742,
|
|
"train_speed(iter/s)": 0.122676
|
|
},
|
|
{
|
|
"epoch": 2.97718541748334,
|
|
"grad_norm": 0.8810137510299683,
|
|
"learning_rate": 1.1048531853286027e-09,
|
|
"loss": 0.20437698364257811,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2375,
|
|
"token_acc": 0.92782014674522,
|
|
"train_speed(iter/s)": 0.122731
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"grad_norm": 0.8733575940132141,
|
|
"learning_rate": 5.222259059867174e-10,
|
|
"loss": 0.2047698974609375,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2380,
|
|
"token_acc": 0.9247718926286298,
|
|
"train_speed(iter/s)": 0.122792
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"eval_loss": 0.3388313353061676,
|
|
"eval_runtime": 29.4983,
|
|
"eval_samples_per_second": 17.459,
|
|
"eval_steps_per_second": 4.373,
|
|
"eval_token_acc": 0.8928260109156122,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.9897295178361425,
|
|
"grad_norm": 0.8730230927467346,
|
|
"learning_rate": 1.5537489683914442e-10,
|
|
"loss": 0.1937323570251465,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2385,
|
|
"token_acc": 0.9117496217657769,
|
|
"train_speed(iter/s)": 0.122611
|
|
},
|
|
{
|
|
"epoch": 2.9960015680125442,
|
|
"grad_norm": 0.8275863528251648,
|
|
"learning_rate": 4.315991088965632e-12,
|
|
"loss": 0.19883054494857788,
|
|
"memory(GiB)": 35.31,
|
|
"step": 2390,
|
|
"token_acc": 0.9306170220547895,
|
|
"train_speed(iter/s)": 0.122671
|
|
},
|
|
{
|
|
"epoch": 2.9972559780478245,
|
|
"eval_loss": 0.3387167751789093,
|
|
"eval_runtime": 29.4848,
|
|
"eval_samples_per_second": 17.467,
|
|
"eval_steps_per_second": 4.375,
|
|
"eval_token_acc": 0.8929092938165222,
|
|
"step": 2391
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2391,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.529076327726711e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|