5905 lines
167 KiB
JSON
5905 lines
167 KiB
JSON
{
|
|
"best_global_step": 1580,
|
|
"best_metric": 0.33898818,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v54-20250507-020216/checkpoint-1580",
|
|
"epoch": 2.9972559780478245,
|
|
"eval_steps": 20,
|
|
"global_step": 2391,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0012544100352802822,
|
|
"grad_norm": 3.029660940170288,
|
|
"learning_rate": 9.999995684008912e-06,
|
|
"loss": 0.7123785018920898,
|
|
"memory(GiB)": 28.82,
|
|
"step": 1,
|
|
"token_acc": 0.8081615027528878,
|
|
"train_speed(iter/s)": 0.06477
|
|
},
|
|
{
|
|
"epoch": 0.006272050176401411,
|
|
"grad_norm": 2.1236259937286377,
|
|
"learning_rate": 9.999892100595329e-06,
|
|
"loss": 0.6517069339752197,
|
|
"memory(GiB)": 28.86,
|
|
"step": 5,
|
|
"token_acc": 0.8096913375373382,
|
|
"train_speed(iter/s)": 0.12466
|
|
},
|
|
{
|
|
"epoch": 0.012544100352802822,
|
|
"grad_norm": 1.261985182762146,
|
|
"learning_rate": 9.999568407038233e-06,
|
|
"loss": 0.5820259571075439,
|
|
"memory(GiB)": 28.86,
|
|
"step": 10,
|
|
"token_acc": 0.8246924192768496,
|
|
"train_speed(iter/s)": 0.139298
|
|
},
|
|
{
|
|
"epoch": 0.018816150529204233,
|
|
"grad_norm": 1.0761600732803345,
|
|
"learning_rate": 9.999028933299243e-06,
|
|
"loss": 0.5411366939544677,
|
|
"memory(GiB)": 28.86,
|
|
"step": 15,
|
|
"token_acc": 0.8337762808199353,
|
|
"train_speed(iter/s)": 0.145935
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"grad_norm": 1.0089085102081299,
|
|
"learning_rate": 9.99827370266192e-06,
|
|
"loss": 0.512534236907959,
|
|
"memory(GiB)": 28.86,
|
|
"step": 20,
|
|
"token_acc": 0.8508319467554076,
|
|
"train_speed(iter/s)": 0.147072
|
|
},
|
|
{
|
|
"epoch": 0.025088200705605645,
|
|
"eval_loss": 0.49907156825065613,
|
|
"eval_runtime": 29.7712,
|
|
"eval_samples_per_second": 17.299,
|
|
"eval_steps_per_second": 4.333,
|
|
"eval_token_acc": 0.8417260457708107,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.03136025088200706,
|
|
"grad_norm": 0.9683756232261658,
|
|
"learning_rate": 9.99730274772184e-06,
|
|
"loss": 0.509831714630127,
|
|
"memory(GiB)": 28.86,
|
|
"step": 25,
|
|
"token_acc": 0.8483637399068481,
|
|
"train_speed(iter/s)": 0.120173
|
|
},
|
|
{
|
|
"epoch": 0.037632301058408466,
|
|
"grad_norm": 1.0160096883773804,
|
|
"learning_rate": 9.996116110385186e-06,
|
|
"loss": 0.5112733364105224,
|
|
"memory(GiB)": 28.86,
|
|
"step": 30,
|
|
"token_acc": 0.8454686767499815,
|
|
"train_speed(iter/s)": 0.125825
|
|
},
|
|
{
|
|
"epoch": 0.04390435123480988,
|
|
"grad_norm": 0.9916045665740967,
|
|
"learning_rate": 9.99471384186694e-06,
|
|
"loss": 0.5009718418121338,
|
|
"memory(GiB)": 28.87,
|
|
"step": 35,
|
|
"token_acc": 0.8552517091361094,
|
|
"train_speed(iter/s)": 0.129686
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"grad_norm": 0.9175627827644348,
|
|
"learning_rate": 9.99309600268868e-06,
|
|
"loss": 0.47254362106323244,
|
|
"memory(GiB)": 28.87,
|
|
"step": 40,
|
|
"token_acc": 0.8560227119102338,
|
|
"train_speed(iter/s)": 0.131824
|
|
},
|
|
{
|
|
"epoch": 0.05017640141121129,
|
|
"eval_loss": 0.46525266766548157,
|
|
"eval_runtime": 29.5949,
|
|
"eval_samples_per_second": 17.402,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8489353675915835,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0564484515876127,
|
|
"grad_norm": 0.9363399744033813,
|
|
"learning_rate": 9.991262662675962e-06,
|
|
"loss": 0.49153480529785154,
|
|
"memory(GiB)": 28.87,
|
|
"step": 45,
|
|
"token_acc": 0.8566000039174975,
|
|
"train_speed(iter/s)": 0.120002
|
|
},
|
|
{
|
|
"epoch": 0.06272050176401411,
|
|
"grad_norm": 0.9040453433990479,
|
|
"learning_rate": 9.9892139009553e-06,
|
|
"loss": 0.47645087242126466,
|
|
"memory(GiB)": 28.87,
|
|
"step": 50,
|
|
"token_acc": 0.8460839347767977,
|
|
"train_speed(iter/s)": 0.1231
|
|
},
|
|
{
|
|
"epoch": 0.06899255194041552,
|
|
"grad_norm": 0.9906909465789795,
|
|
"learning_rate": 9.986949805950763e-06,
|
|
"loss": 0.48256454467773435,
|
|
"memory(GiB)": 28.87,
|
|
"step": 55,
|
|
"token_acc": 0.8567946374162096,
|
|
"train_speed(iter/s)": 0.12528
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"grad_norm": 0.9595295190811157,
|
|
"learning_rate": 9.984470475380154e-06,
|
|
"loss": 0.47391643524169924,
|
|
"memory(GiB)": 28.87,
|
|
"step": 60,
|
|
"token_acc": 0.8629193166230203,
|
|
"train_speed(iter/s)": 0.12768
|
|
},
|
|
{
|
|
"epoch": 0.07526460211681693,
|
|
"eval_loss": 0.4490402638912201,
|
|
"eval_runtime": 29.6498,
|
|
"eval_samples_per_second": 17.369,
|
|
"eval_steps_per_second": 4.351,
|
|
"eval_token_acc": 0.8536633414368346,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.08153665229321834,
|
|
"grad_norm": 0.9027647972106934,
|
|
"learning_rate": 9.981776016250789e-06,
|
|
"loss": 0.4695126533508301,
|
|
"memory(GiB)": 28.87,
|
|
"step": 65,
|
|
"token_acc": 0.8616360365706656,
|
|
"train_speed(iter/s)": 0.119908
|
|
},
|
|
{
|
|
"epoch": 0.08780870246961976,
|
|
"grad_norm": 0.9021025896072388,
|
|
"learning_rate": 9.97886654485488e-06,
|
|
"loss": 0.44661579132080076,
|
|
"memory(GiB)": 28.87,
|
|
"step": 70,
|
|
"token_acc": 0.8703178432256183,
|
|
"train_speed(iter/s)": 0.121955
|
|
},
|
|
{
|
|
"epoch": 0.09408075264602117,
|
|
"grad_norm": 0.9017606973648071,
|
|
"learning_rate": 9.975742186764526e-06,
|
|
"loss": 0.4442440509796143,
|
|
"memory(GiB)": 28.87,
|
|
"step": 75,
|
|
"token_acc": 0.8678062233322926,
|
|
"train_speed(iter/s)": 0.123859
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"grad_norm": 0.9587047696113586,
|
|
"learning_rate": 9.972403076826272e-06,
|
|
"loss": 0.454923677444458,
|
|
"memory(GiB)": 28.87,
|
|
"step": 80,
|
|
"token_acc": 0.849014240457663,
|
|
"train_speed(iter/s)": 0.125488
|
|
},
|
|
{
|
|
"epoch": 0.10035280282242258,
|
|
"eval_loss": 0.4374794065952301,
|
|
"eval_runtime": 29.5479,
|
|
"eval_samples_per_second": 17.429,
|
|
"eval_steps_per_second": 4.366,
|
|
"eval_token_acc": 0.8562285187358538,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.10662485299882399,
|
|
"grad_norm": 0.8854019641876221,
|
|
"learning_rate": 9.96884935915531e-06,
|
|
"loss": 0.4319493770599365,
|
|
"memory(GiB)": 28.87,
|
|
"step": 85,
|
|
"token_acc": 0.8624362103394719,
|
|
"train_speed(iter/s)": 0.119824
|
|
},
|
|
{
|
|
"epoch": 0.1128969031752254,
|
|
"grad_norm": 0.8231946229934692,
|
|
"learning_rate": 9.965081187129248e-06,
|
|
"loss": 0.43582682609558104,
|
|
"memory(GiB)": 28.87,
|
|
"step": 90,
|
|
"token_acc": 0.8703445195153275,
|
|
"train_speed(iter/s)": 0.120848
|
|
},
|
|
{
|
|
"epoch": 0.11916895335162682,
|
|
"grad_norm": 0.9398965239524841,
|
|
"learning_rate": 9.961098723381495e-06,
|
|
"loss": 0.4444568634033203,
|
|
"memory(GiB)": 28.87,
|
|
"step": 95,
|
|
"token_acc": 0.8672030063249265,
|
|
"train_speed(iter/s)": 0.122608
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"grad_norm": 0.8782020807266235,
|
|
"learning_rate": 9.956902139794236e-06,
|
|
"loss": 0.4650153636932373,
|
|
"memory(GiB)": 28.87,
|
|
"step": 100,
|
|
"token_acc": 0.8458562410914274,
|
|
"train_speed(iter/s)": 0.12399
|
|
},
|
|
{
|
|
"epoch": 0.12544100352802823,
|
|
"eval_loss": 0.4288901388645172,
|
|
"eval_runtime": 29.5859,
|
|
"eval_samples_per_second": 17.407,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8588523765613212,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.13171305370442962,
|
|
"grad_norm": 0.885040283203125,
|
|
"learning_rate": 9.95249161749102e-06,
|
|
"loss": 0.45799875259399414,
|
|
"memory(GiB)": 28.87,
|
|
"step": 105,
|
|
"token_acc": 0.8647499099695356,
|
|
"train_speed(iter/s)": 0.119708
|
|
},
|
|
{
|
|
"epoch": 0.13798510388083104,
|
|
"grad_norm": 0.8790634274482727,
|
|
"learning_rate": 9.94786734682894e-06,
|
|
"loss": 0.4435451030731201,
|
|
"memory(GiB)": 28.87,
|
|
"step": 110,
|
|
"token_acc": 0.8620516456892986,
|
|
"train_speed(iter/s)": 0.120941
|
|
},
|
|
{
|
|
"epoch": 0.14425715405723247,
|
|
"grad_norm": 0.8384956121444702,
|
|
"learning_rate": 9.943029527390415e-06,
|
|
"loss": 0.45367069244384767,
|
|
"memory(GiB)": 28.87,
|
|
"step": 115,
|
|
"token_acc": 0.8639205792014041,
|
|
"train_speed(iter/s)": 0.122387
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"grad_norm": 0.828059196472168,
|
|
"learning_rate": 9.93797836797458e-06,
|
|
"loss": 0.4475994110107422,
|
|
"memory(GiB)": 28.87,
|
|
"step": 120,
|
|
"token_acc": 0.8620637823483729,
|
|
"train_speed(iter/s)": 0.123587
|
|
},
|
|
{
|
|
"epoch": 0.15052920423363386,
|
|
"eval_loss": 0.4217882454395294,
|
|
"eval_runtime": 29.5603,
|
|
"eval_samples_per_second": 17.422,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.8602690921284265,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.15680125441003528,
|
|
"grad_norm": 0.9089007377624512,
|
|
"learning_rate": 9.932714086588276e-06,
|
|
"loss": 0.4421473503112793,
|
|
"memory(GiB)": 28.87,
|
|
"step": 125,
|
|
"token_acc": 0.8651899104582431,
|
|
"train_speed(iter/s)": 0.119877
|
|
},
|
|
{
|
|
"epoch": 0.16307330458643668,
|
|
"grad_norm": 0.8998169898986816,
|
|
"learning_rate": 9.92723691043663e-06,
|
|
"loss": 0.4212520599365234,
|
|
"memory(GiB)": 28.87,
|
|
"step": 130,
|
|
"token_acc": 0.8652097803376814,
|
|
"train_speed(iter/s)": 0.120973
|
|
},
|
|
{
|
|
"epoch": 0.1693453547628381,
|
|
"grad_norm": 0.8805875778198242,
|
|
"learning_rate": 9.921547075913261e-06,
|
|
"loss": 0.439087438583374,
|
|
"memory(GiB)": 28.87,
|
|
"step": 135,
|
|
"token_acc": 0.8655743664312422,
|
|
"train_speed(iter/s)": 0.121977
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"grad_norm": 0.8694149851799011,
|
|
"learning_rate": 9.915644828590074e-06,
|
|
"loss": 0.4487740516662598,
|
|
"memory(GiB)": 28.87,
|
|
"step": 140,
|
|
"token_acc": 0.8618538324420677,
|
|
"train_speed(iter/s)": 0.123124
|
|
},
|
|
{
|
|
"epoch": 0.17561740493923952,
|
|
"eval_loss": 0.4171818792819977,
|
|
"eval_runtime": 29.6071,
|
|
"eval_samples_per_second": 17.394,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.86238159108056,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.18188945511564092,
|
|
"grad_norm": 0.8099656701087952,
|
|
"learning_rate": 9.909530423206657e-06,
|
|
"loss": 0.43574037551879885,
|
|
"memory(GiB)": 28.87,
|
|
"step": 145,
|
|
"token_acc": 0.8709998032902297,
|
|
"train_speed(iter/s)": 0.119783
|
|
},
|
|
{
|
|
"epoch": 0.18816150529204234,
|
|
"grad_norm": 0.811008632183075,
|
|
"learning_rate": 9.903204123659288e-06,
|
|
"loss": 0.42326993942260743,
|
|
"memory(GiB)": 28.87,
|
|
"step": 150,
|
|
"token_acc": 0.8682702065220372,
|
|
"train_speed(iter/s)": 0.120804
|
|
},
|
|
{
|
|
"epoch": 0.19443355546844374,
|
|
"grad_norm": 0.8804604411125183,
|
|
"learning_rate": 9.896666202989553e-06,
|
|
"loss": 0.43947763442993165,
|
|
"memory(GiB)": 28.87,
|
|
"step": 155,
|
|
"token_acc": 0.8630830791616532,
|
|
"train_speed(iter/s)": 0.121921
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"grad_norm": 0.8700482845306396,
|
|
"learning_rate": 9.889916943372549e-06,
|
|
"loss": 0.43802604675292967,
|
|
"memory(GiB)": 28.87,
|
|
"step": 160,
|
|
"token_acc": 0.8609460606528282,
|
|
"train_speed(iter/s)": 0.12293
|
|
},
|
|
{
|
|
"epoch": 0.20070560564484516,
|
|
"eval_loss": 0.4114477336406708,
|
|
"eval_runtime": 29.5534,
|
|
"eval_samples_per_second": 17.426,
|
|
"eval_steps_per_second": 4.365,
|
|
"eval_token_acc": 0.8631695867214352,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.20697765582124658,
|
|
"grad_norm": 0.7978352308273315,
|
|
"learning_rate": 9.882956636104714e-06,
|
|
"loss": 0.43416438102722166,
|
|
"memory(GiB)": 28.87,
|
|
"step": 165,
|
|
"token_acc": 0.8689747827683861,
|
|
"train_speed(iter/s)": 0.120122
|
|
},
|
|
{
|
|
"epoch": 0.21324970599764798,
|
|
"grad_norm": 0.8909711837768555,
|
|
"learning_rate": 9.875785581591253e-06,
|
|
"loss": 0.43141732215881345,
|
|
"memory(GiB)": 28.87,
|
|
"step": 170,
|
|
"token_acc": 0.8619692543743177,
|
|
"train_speed(iter/s)": 0.120876
|
|
},
|
|
{
|
|
"epoch": 0.2195217561740494,
|
|
"grad_norm": 0.8957451581954956,
|
|
"learning_rate": 9.868404089333171e-06,
|
|
"loss": 0.42152652740478513,
|
|
"memory(GiB)": 28.87,
|
|
"step": 175,
|
|
"token_acc": 0.8754910463751849,
|
|
"train_speed(iter/s)": 0.121784
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"grad_norm": 0.9475491046905518,
|
|
"learning_rate": 9.860812477913915e-06,
|
|
"loss": 0.417528772354126,
|
|
"memory(GiB)": 28.87,
|
|
"step": 180,
|
|
"token_acc": 0.8698917851281053,
|
|
"train_speed(iter/s)": 0.122636
|
|
},
|
|
{
|
|
"epoch": 0.2257938063504508,
|
|
"eval_loss": 0.4081062376499176,
|
|
"eval_runtime": 29.6207,
|
|
"eval_samples_per_second": 17.387,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8642426020622014,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.23206585652685222,
|
|
"grad_norm": 0.8352420926094055,
|
|
"learning_rate": 9.853011074985628e-06,
|
|
"loss": 0.4175262928009033,
|
|
"memory(GiB)": 28.87,
|
|
"step": 185,
|
|
"token_acc": 0.8727626873915867,
|
|
"train_speed(iter/s)": 0.120174
|
|
},
|
|
{
|
|
"epoch": 0.23833790670325364,
|
|
"grad_norm": 0.9205328822135925,
|
|
"learning_rate": 9.845000217255e-06,
|
|
"loss": 0.4254606246948242,
|
|
"memory(GiB)": 28.87,
|
|
"step": 190,
|
|
"token_acc": 0.8669568108320489,
|
|
"train_speed(iter/s)": 0.121076
|
|
},
|
|
{
|
|
"epoch": 0.24460995687965503,
|
|
"grad_norm": 0.8192192316055298,
|
|
"learning_rate": 9.836780250468744e-06,
|
|
"loss": 0.41744155883789064,
|
|
"memory(GiB)": 28.87,
|
|
"step": 195,
|
|
"token_acc": 0.8699167905678588,
|
|
"train_speed(iter/s)": 0.121801
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"grad_norm": 0.8817274570465088,
|
|
"learning_rate": 9.82835152939867e-06,
|
|
"loss": 0.4215375900268555,
|
|
"memory(GiB)": 28.87,
|
|
"step": 200,
|
|
"token_acc": 0.8657595006023436,
|
|
"train_speed(iter/s)": 0.122488
|
|
},
|
|
{
|
|
"epoch": 0.25088200705605646,
|
|
"eval_loss": 0.403290718793869,
|
|
"eval_runtime": 29.5998,
|
|
"eval_samples_per_second": 17.399,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8656928493587057,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.2571540572324579,
|
|
"grad_norm": 0.8728285431861877,
|
|
"learning_rate": 9.81971441782637e-06,
|
|
"loss": 0.4273026466369629,
|
|
"memory(GiB)": 28.87,
|
|
"step": 205,
|
|
"token_acc": 0.8701400706898809,
|
|
"train_speed(iter/s)": 0.120321
|
|
},
|
|
{
|
|
"epoch": 0.26342610740885924,
|
|
"grad_norm": 0.840922474861145,
|
|
"learning_rate": 9.810869288527528e-06,
|
|
"loss": 0.41336374282836913,
|
|
"memory(GiB)": 28.87,
|
|
"step": 210,
|
|
"token_acc": 0.8631279407207055,
|
|
"train_speed(iter/s)": 0.120898
|
|
},
|
|
{
|
|
"epoch": 0.26969815758526067,
|
|
"grad_norm": 0.9264822602272034,
|
|
"learning_rate": 9.801816523255811e-06,
|
|
"loss": 0.40021047592163084,
|
|
"memory(GiB)": 28.87,
|
|
"step": 215,
|
|
"token_acc": 0.8800351361525913,
|
|
"train_speed(iter/s)": 0.121557
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"grad_norm": 0.833281934261322,
|
|
"learning_rate": 9.792556512726419e-06,
|
|
"loss": 0.4237551689147949,
|
|
"memory(GiB)": 28.87,
|
|
"step": 220,
|
|
"token_acc": 0.8799363351171803,
|
|
"train_speed(iter/s)": 0.122367
|
|
},
|
|
{
|
|
"epoch": 0.2759702077616621,
|
|
"eval_loss": 0.40029406547546387,
|
|
"eval_runtime": 29.5811,
|
|
"eval_samples_per_second": 17.41,
|
|
"eval_steps_per_second": 4.361,
|
|
"eval_token_acc": 0.8666275463157013,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.2822422579380635,
|
|
"grad_norm": 0.9009267687797546,
|
|
"learning_rate": 9.783089656599196e-06,
|
|
"loss": 0.41417922973632815,
|
|
"memory(GiB)": 28.87,
|
|
"step": 225,
|
|
"token_acc": 0.8714769221553299,
|
|
"train_speed(iter/s)": 0.120484
|
|
},
|
|
{
|
|
"epoch": 0.28851430811446493,
|
|
"grad_norm": 0.8666077256202698,
|
|
"learning_rate": 9.773416363461401e-06,
|
|
"loss": 0.4124805450439453,
|
|
"memory(GiB)": 28.87,
|
|
"step": 230,
|
|
"token_acc": 0.880201765447667,
|
|
"train_speed(iter/s)": 0.121099
|
|
},
|
|
{
|
|
"epoch": 0.2947863582908663,
|
|
"grad_norm": 0.8758551478385925,
|
|
"learning_rate": 9.763537050810064e-06,
|
|
"loss": 0.41759481430053713,
|
|
"memory(GiB)": 28.87,
|
|
"step": 235,
|
|
"token_acc": 0.8732158264513743,
|
|
"train_speed(iter/s)": 0.121632
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"grad_norm": 0.8822108507156372,
|
|
"learning_rate": 9.753452145033961e-06,
|
|
"loss": 0.42021803855895995,
|
|
"memory(GiB)": 28.87,
|
|
"step": 240,
|
|
"token_acc": 0.8624877221180463,
|
|
"train_speed(iter/s)": 0.122082
|
|
},
|
|
{
|
|
"epoch": 0.3010584084672677,
|
|
"eval_loss": 0.3969193994998932,
|
|
"eval_runtime": 29.8072,
|
|
"eval_samples_per_second": 17.278,
|
|
"eval_steps_per_second": 4.328,
|
|
"eval_token_acc": 0.8671514795875598,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.30733045864366915,
|
|
"grad_norm": 0.8353932499885559,
|
|
"learning_rate": 9.743162081395227e-06,
|
|
"loss": 0.4134369850158691,
|
|
"memory(GiB)": 28.87,
|
|
"step": 245,
|
|
"token_acc": 0.8730592226495686,
|
|
"train_speed(iter/s)": 0.120284
|
|
},
|
|
{
|
|
"epoch": 0.31360250882007057,
|
|
"grad_norm": 0.861675500869751,
|
|
"learning_rate": 9.73266730401056e-06,
|
|
"loss": 0.42804179191589353,
|
|
"memory(GiB)": 28.87,
|
|
"step": 250,
|
|
"token_acc": 0.8642180774748924,
|
|
"train_speed(iter/s)": 0.121032
|
|
},
|
|
{
|
|
"epoch": 0.319874558996472,
|
|
"grad_norm": 0.86508709192276,
|
|
"learning_rate": 9.72196826583205e-06,
|
|
"loss": 0.4076427459716797,
|
|
"memory(GiB)": 28.87,
|
|
"step": 255,
|
|
"token_acc": 0.8697714532125916,
|
|
"train_speed(iter/s)": 0.121618
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"grad_norm": 0.8428534865379333,
|
|
"learning_rate": 9.711065428627638e-06,
|
|
"loss": 0.41555137634277345,
|
|
"memory(GiB)": 28.87,
|
|
"step": 260,
|
|
"token_acc": 0.8662227763482505,
|
|
"train_speed(iter/s)": 0.122077
|
|
},
|
|
{
|
|
"epoch": 0.32614660917287336,
|
|
"eval_loss": 0.39386531710624695,
|
|
"eval_runtime": 29.7323,
|
|
"eval_samples_per_second": 17.321,
|
|
"eval_steps_per_second": 4.339,
|
|
"eval_token_acc": 0.8679436666946098,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.3324186593492748,
|
|
"grad_norm": 0.8486117124557495,
|
|
"learning_rate": 9.699959262961182e-06,
|
|
"loss": 0.422438907623291,
|
|
"memory(GiB)": 28.87,
|
|
"step": 265,
|
|
"token_acc": 0.8720271109166218,
|
|
"train_speed(iter/s)": 0.120433
|
|
},
|
|
{
|
|
"epoch": 0.3386907095256762,
|
|
"grad_norm": 0.8571962714195251,
|
|
"learning_rate": 9.688650248172145e-06,
|
|
"loss": 0.43577041625976565,
|
|
"memory(GiB)": 28.87,
|
|
"step": 270,
|
|
"token_acc": 0.8602295350743647,
|
|
"train_speed(iter/s)": 0.120989
|
|
},
|
|
{
|
|
"epoch": 0.3449627597020776,
|
|
"grad_norm": 0.8868110775947571,
|
|
"learning_rate": 9.677138872354916e-06,
|
|
"loss": 0.41460485458374025,
|
|
"memory(GiB)": 28.87,
|
|
"step": 275,
|
|
"token_acc": 0.8727983282221077,
|
|
"train_speed(iter/s)": 0.121551
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"grad_norm": 0.9915756583213806,
|
|
"learning_rate": 9.665425632337731e-06,
|
|
"loss": 0.4305459976196289,
|
|
"memory(GiB)": 28.87,
|
|
"step": 280,
|
|
"token_acc": 0.8665983351969189,
|
|
"train_speed(iter/s)": 0.12212
|
|
},
|
|
{
|
|
"epoch": 0.35123480987847905,
|
|
"eval_loss": 0.3917335569858551,
|
|
"eval_runtime": 29.562,
|
|
"eval_samples_per_second": 17.421,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.8687568111325341,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.3575068600548804,
|
|
"grad_norm": 0.9129208922386169,
|
|
"learning_rate": 9.653511033661242e-06,
|
|
"loss": 0.4195101261138916,
|
|
"memory(GiB)": 28.87,
|
|
"step": 285,
|
|
"token_acc": 0.8746003876873738,
|
|
"train_speed(iter/s)": 0.12054
|
|
},
|
|
{
|
|
"epoch": 0.36377891023128184,
|
|
"grad_norm": 0.8719012141227722,
|
|
"learning_rate": 9.641395590556689e-06,
|
|
"loss": 0.3962116241455078,
|
|
"memory(GiB)": 28.87,
|
|
"step": 290,
|
|
"token_acc": 0.8869061113450141,
|
|
"train_speed(iter/s)": 0.121042
|
|
},
|
|
{
|
|
"epoch": 0.37005096040768326,
|
|
"grad_norm": 0.8595064878463745,
|
|
"learning_rate": 9.629079825923712e-06,
|
|
"loss": 0.40920305252075195,
|
|
"memory(GiB)": 28.87,
|
|
"step": 295,
|
|
"token_acc": 0.8752032411973238,
|
|
"train_speed(iter/s)": 0.121573
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"grad_norm": 0.8926518559455872,
|
|
"learning_rate": 9.616564271307779e-06,
|
|
"loss": 0.42294983863830565,
|
|
"memory(GiB)": 28.87,
|
|
"step": 300,
|
|
"token_acc": 0.8639302937308199,
|
|
"train_speed(iter/s)": 0.12214
|
|
},
|
|
{
|
|
"epoch": 0.3763230105840847,
|
|
"eval_loss": 0.3890170454978943,
|
|
"eval_runtime": 29.5793,
|
|
"eval_samples_per_second": 17.411,
|
|
"eval_steps_per_second": 4.361,
|
|
"eval_token_acc": 0.8693939139911141,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.3825950607604861,
|
|
"grad_norm": 0.8123087286949158,
|
|
"learning_rate": 9.603849466877249e-06,
|
|
"loss": 0.39454007148742676,
|
|
"memory(GiB)": 28.87,
|
|
"step": 305,
|
|
"token_acc": 0.8777894566623544,
|
|
"train_speed(iter/s)": 0.120651
|
|
},
|
|
{
|
|
"epoch": 0.3888671109368875,
|
|
"grad_norm": 0.824553370475769,
|
|
"learning_rate": 9.59093596140005e-06,
|
|
"loss": 0.3974800109863281,
|
|
"memory(GiB)": 28.87,
|
|
"step": 310,
|
|
"token_acc": 0.8730650935309837,
|
|
"train_speed(iter/s)": 0.121172
|
|
},
|
|
{
|
|
"epoch": 0.3951391611132889,
|
|
"grad_norm": 0.8097919821739197,
|
|
"learning_rate": 9.577824312220006e-06,
|
|
"loss": 0.40521669387817383,
|
|
"memory(GiB)": 28.87,
|
|
"step": 315,
|
|
"token_acc": 0.8752670549328339,
|
|
"train_speed(iter/s)": 0.121519
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"grad_norm": 0.7727741003036499,
|
|
"learning_rate": 9.564515085232772e-06,
|
|
"loss": 0.4013851165771484,
|
|
"memory(GiB)": 28.87,
|
|
"step": 320,
|
|
"token_acc": 0.868708497086326,
|
|
"train_speed(iter/s)": 0.121981
|
|
},
|
|
{
|
|
"epoch": 0.4014112112896903,
|
|
"eval_loss": 0.3878418207168579,
|
|
"eval_runtime": 29.6547,
|
|
"eval_samples_per_second": 17.367,
|
|
"eval_steps_per_second": 4.35,
|
|
"eval_token_acc": 0.8700855059099674,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.40768326146609174,
|
|
"grad_norm": 0.8198765516281128,
|
|
"learning_rate": 9.55100885486142e-06,
|
|
"loss": 0.4199061393737793,
|
|
"memory(GiB)": 28.87,
|
|
"step": 325,
|
|
"token_acc": 0.8758526603001364,
|
|
"train_speed(iter/s)": 0.120641
|
|
},
|
|
{
|
|
"epoch": 0.41395531164249316,
|
|
"grad_norm": 0.9562544226646423,
|
|
"learning_rate": 9.537306204031628e-06,
|
|
"loss": 0.4178496837615967,
|
|
"memory(GiB)": 28.87,
|
|
"step": 330,
|
|
"token_acc": 0.8739065294089218,
|
|
"train_speed(iter/s)": 0.121059
|
|
},
|
|
{
|
|
"epoch": 0.42022736181889453,
|
|
"grad_norm": 0.8651145100593567,
|
|
"learning_rate": 9.523407724146548e-06,
|
|
"loss": 0.414202356338501,
|
|
"memory(GiB)": 28.87,
|
|
"step": 335,
|
|
"token_acc": 0.866993368311716,
|
|
"train_speed(iter/s)": 0.121595
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"grad_norm": 0.8332494497299194,
|
|
"learning_rate": 9.509314015061263e-06,
|
|
"loss": 0.3904601812362671,
|
|
"memory(GiB)": 28.87,
|
|
"step": 340,
|
|
"token_acc": 0.8766263821033685,
|
|
"train_speed(iter/s)": 0.121943
|
|
},
|
|
{
|
|
"epoch": 0.42649941199529595,
|
|
"eval_loss": 0.3836223781108856,
|
|
"eval_runtime": 29.5735,
|
|
"eval_samples_per_second": 17.414,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8714351580182748,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.4327714621716974,
|
|
"grad_norm": 0.7933188080787659,
|
|
"learning_rate": 9.495025685056898e-06,
|
|
"loss": 0.4055050849914551,
|
|
"memory(GiB)": 28.87,
|
|
"step": 345,
|
|
"token_acc": 0.8755267232424593,
|
|
"train_speed(iter/s)": 0.120637
|
|
},
|
|
{
|
|
"epoch": 0.4390435123480988,
|
|
"grad_norm": 0.8231353759765625,
|
|
"learning_rate": 9.480543350814376e-06,
|
|
"loss": 0.41367053985595703,
|
|
"memory(GiB)": 28.87,
|
|
"step": 350,
|
|
"token_acc": 0.8694767367912289,
|
|
"train_speed(iter/s)": 0.121138
|
|
},
|
|
{
|
|
"epoch": 0.4453155625245002,
|
|
"grad_norm": 0.7924429178237915,
|
|
"learning_rate": 9.465867637387793e-06,
|
|
"loss": 0.4196880340576172,
|
|
"memory(GiB)": 28.87,
|
|
"step": 355,
|
|
"token_acc": 0.8645996976613214,
|
|
"train_speed(iter/s)": 0.121637
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"grad_norm": 0.7936041951179504,
|
|
"learning_rate": 9.450999178177445e-06,
|
|
"loss": 0.40215320587158204,
|
|
"memory(GiB)": 28.87,
|
|
"step": 360,
|
|
"token_acc": 0.8788266393792104,
|
|
"train_speed(iter/s)": 0.122002
|
|
},
|
|
{
|
|
"epoch": 0.4515876127009016,
|
|
"eval_loss": 0.3816666603088379,
|
|
"eval_runtime": 29.619,
|
|
"eval_samples_per_second": 17.387,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8717076033196413,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.457859662877303,
|
|
"grad_norm": 0.8175886869430542,
|
|
"learning_rate": 9.435938614902494e-06,
|
|
"loss": 0.38351328372955323,
|
|
"memory(GiB)": 28.87,
|
|
"step": 365,
|
|
"token_acc": 0.8787878787878788,
|
|
"train_speed(iter/s)": 0.120772
|
|
},
|
|
{
|
|
"epoch": 0.46413171305370443,
|
|
"grad_norm": 0.8586422801017761,
|
|
"learning_rate": 9.42068659757326e-06,
|
|
"loss": 0.4199483394622803,
|
|
"memory(GiB)": 28.87,
|
|
"step": 370,
|
|
"token_acc": 0.8653846153846154,
|
|
"train_speed(iter/s)": 0.121125
|
|
},
|
|
{
|
|
"epoch": 0.47040376323010585,
|
|
"grad_norm": 0.8419802784919739,
|
|
"learning_rate": 9.405243784463181e-06,
|
|
"loss": 0.4090768337249756,
|
|
"memory(GiB)": 28.87,
|
|
"step": 375,
|
|
"token_acc": 0.8812298983661392,
|
|
"train_speed(iter/s)": 0.121579
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"grad_norm": 0.9342820048332214,
|
|
"learning_rate": 9.389610842080394e-06,
|
|
"loss": 0.414335823059082,
|
|
"memory(GiB)": 28.87,
|
|
"step": 380,
|
|
"token_acc": 0.8696789068211213,
|
|
"train_speed(iter/s)": 0.121985
|
|
},
|
|
{
|
|
"epoch": 0.4766758134065073,
|
|
"eval_loss": 0.3803957402706146,
|
|
"eval_runtime": 29.5699,
|
|
"eval_samples_per_second": 17.416,
|
|
"eval_steps_per_second": 4.363,
|
|
"eval_token_acc": 0.8713722860256518,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.48294786358290864,
|
|
"grad_norm": 0.8180081844329834,
|
|
"learning_rate": 9.373788445138972e-06,
|
|
"loss": 0.39729149341583253,
|
|
"memory(GiB)": 28.87,
|
|
"step": 385,
|
|
"token_acc": 0.8782972920319502,
|
|
"train_speed(iter/s)": 0.120827
|
|
},
|
|
{
|
|
"epoch": 0.48921991375931007,
|
|
"grad_norm": 0.8141390085220337,
|
|
"learning_rate": 9.357777276529793e-06,
|
|
"loss": 0.3939579963684082,
|
|
"memory(GiB)": 28.87,
|
|
"step": 390,
|
|
"token_acc": 0.8719189555691457,
|
|
"train_speed(iter/s)": 0.121285
|
|
},
|
|
{
|
|
"epoch": 0.4954919639357115,
|
|
"grad_norm": 0.79034024477005,
|
|
"learning_rate": 9.341578027291085e-06,
|
|
"loss": 0.3828037977218628,
|
|
"memory(GiB)": 28.87,
|
|
"step": 395,
|
|
"token_acc": 0.8823363286264442,
|
|
"train_speed(iter/s)": 0.121636
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"grad_norm": 0.903527021408081,
|
|
"learning_rate": 9.325191396578589e-06,
|
|
"loss": 0.4000723838806152,
|
|
"memory(GiB)": 28.87,
|
|
"step": 400,
|
|
"token_acc": 0.8762913767657601,
|
|
"train_speed(iter/s)": 0.122009
|
|
},
|
|
{
|
|
"epoch": 0.5017640141121129,
|
|
"eval_loss": 0.3788001239299774,
|
|
"eval_runtime": 29.5859,
|
|
"eval_samples_per_second": 17.407,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8721686645988767,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.5080360642885143,
|
|
"grad_norm": 0.8032602667808533,
|
|
"learning_rate": 9.308618091635382e-06,
|
|
"loss": 0.38360297679901123,
|
|
"memory(GiB)": 28.87,
|
|
"step": 405,
|
|
"token_acc": 0.8814102236094765,
|
|
"train_speed(iter/s)": 0.120867
|
|
},
|
|
{
|
|
"epoch": 0.5143081144649158,
|
|
"grad_norm": 0.8667065501213074,
|
|
"learning_rate": 9.291858827761359e-06,
|
|
"loss": 0.39394588470458985,
|
|
"memory(GiB)": 28.87,
|
|
"step": 410,
|
|
"token_acc": 0.8806277558598282,
|
|
"train_speed(iter/s)": 0.121171
|
|
},
|
|
{
|
|
"epoch": 0.5205801646413172,
|
|
"grad_norm": 1.8045368194580078,
|
|
"learning_rate": 9.274914328282359e-06,
|
|
"loss": 0.41473889350891113,
|
|
"memory(GiB)": 28.87,
|
|
"step": 415,
|
|
"token_acc": 0.8681345140319431,
|
|
"train_speed(iter/s)": 0.121492
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"grad_norm": 0.8981844782829285,
|
|
"learning_rate": 9.257785324518943e-06,
|
|
"loss": 0.39594154357910155,
|
|
"memory(GiB)": 28.87,
|
|
"step": 420,
|
|
"token_acc": 0.8816446146703807,
|
|
"train_speed(iter/s)": 0.12182
|
|
},
|
|
{
|
|
"epoch": 0.5268522148177185,
|
|
"eval_loss": 0.3774873614311218,
|
|
"eval_runtime": 29.5939,
|
|
"eval_samples_per_second": 17.402,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8723111744488222,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.5331242649941199,
|
|
"grad_norm": 0.8493431806564331,
|
|
"learning_rate": 9.240472555754835e-06,
|
|
"loss": 0.3900270462036133,
|
|
"memory(GiB)": 28.87,
|
|
"step": 425,
|
|
"token_acc": 0.876733513421235,
|
|
"train_speed(iter/s)": 0.120876
|
|
},
|
|
{
|
|
"epoch": 0.5393963151705213,
|
|
"grad_norm": 0.9211533069610596,
|
|
"learning_rate": 9.222976769205013e-06,
|
|
"loss": 0.39029178619384763,
|
|
"memory(GiB)": 28.87,
|
|
"step": 430,
|
|
"token_acc": 0.87612563145179,
|
|
"train_speed(iter/s)": 0.121195
|
|
},
|
|
{
|
|
"epoch": 0.5456683653469228,
|
|
"grad_norm": 0.8063600659370422,
|
|
"learning_rate": 9.205298719983458e-06,
|
|
"loss": 0.40296125411987305,
|
|
"memory(GiB)": 31.15,
|
|
"step": 435,
|
|
"token_acc": 0.8771728947642344,
|
|
"train_speed(iter/s)": 0.12146
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"grad_norm": 0.8496724367141724,
|
|
"learning_rate": 9.187439171070563e-06,
|
|
"loss": 0.4008660316467285,
|
|
"memory(GiB)": 31.15,
|
|
"step": 440,
|
|
"token_acc": 0.8774796485144151,
|
|
"train_speed(iter/s)": 0.121775
|
|
},
|
|
{
|
|
"epoch": 0.5519404155233242,
|
|
"eval_loss": 0.3754238188266754,
|
|
"eval_runtime": 29.6083,
|
|
"eval_samples_per_second": 17.394,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8736524436247799,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.5582124656997256,
|
|
"grad_norm": 0.8935557007789612,
|
|
"learning_rate": 9.169398893280208e-06,
|
|
"loss": 0.38558981418609617,
|
|
"memory(GiB)": 31.15,
|
|
"step": 445,
|
|
"token_acc": 0.8804487471295762,
|
|
"train_speed(iter/s)": 0.12076
|
|
},
|
|
{
|
|
"epoch": 0.564484515876127,
|
|
"grad_norm": 0.7438162565231323,
|
|
"learning_rate": 9.151178665226486e-06,
|
|
"loss": 0.39479656219482423,
|
|
"memory(GiB)": 31.15,
|
|
"step": 450,
|
|
"token_acc": 0.8771318584933242,
|
|
"train_speed(iter/s)": 0.121072
|
|
},
|
|
{
|
|
"epoch": 0.5707565660525284,
|
|
"grad_norm": 0.7340822815895081,
|
|
"learning_rate": 9.132779273290103e-06,
|
|
"loss": 0.39113516807556153,
|
|
"memory(GiB)": 31.15,
|
|
"step": 455,
|
|
"token_acc": 0.8820633384040935,
|
|
"train_speed(iter/s)": 0.12129
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"grad_norm": 0.8130801320075989,
|
|
"learning_rate": 9.114201511584428e-06,
|
|
"loss": 0.40251779556274414,
|
|
"memory(GiB)": 31.15,
|
|
"step": 460,
|
|
"token_acc": 0.8760628910636933,
|
|
"train_speed(iter/s)": 0.121548
|
|
},
|
|
{
|
|
"epoch": 0.5770286162289299,
|
|
"eval_loss": 0.3735375702381134,
|
|
"eval_runtime": 29.5965,
|
|
"eval_samples_per_second": 17.401,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8740087182496438,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.5833006664053313,
|
|
"grad_norm": 0.9230681657791138,
|
|
"learning_rate": 9.095446181921237e-06,
|
|
"loss": 0.4032279014587402,
|
|
"memory(GiB)": 31.15,
|
|
"step": 465,
|
|
"token_acc": 0.8776148954041838,
|
|
"train_speed(iter/s)": 0.120694
|
|
},
|
|
{
|
|
"epoch": 0.5895727165817326,
|
|
"grad_norm": 0.7464693188667297,
|
|
"learning_rate": 9.07651409377609e-06,
|
|
"loss": 0.38982985019683836,
|
|
"memory(GiB)": 31.15,
|
|
"step": 470,
|
|
"token_acc": 0.871431801480979,
|
|
"train_speed(iter/s)": 0.12105
|
|
},
|
|
{
|
|
"epoch": 0.595844766758134,
|
|
"grad_norm": 0.8642957210540771,
|
|
"learning_rate": 9.057406064253404e-06,
|
|
"loss": 0.4086627006530762,
|
|
"memory(GiB)": 31.15,
|
|
"step": 475,
|
|
"token_acc": 0.8692161419818297,
|
|
"train_speed(iter/s)": 0.121393
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"grad_norm": 0.7960444092750549,
|
|
"learning_rate": 9.038122918051184e-06,
|
|
"loss": 0.3928786516189575,
|
|
"memory(GiB)": 31.15,
|
|
"step": 480,
|
|
"token_acc": 0.8757158196134575,
|
|
"train_speed(iter/s)": 0.121671
|
|
},
|
|
{
|
|
"epoch": 0.6021168169345354,
|
|
"eval_loss": 0.3725303113460541,
|
|
"eval_runtime": 29.61,
|
|
"eval_samples_per_second": 17.393,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8741512280995892,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.6083888671109369,
|
|
"grad_norm": 0.8392713665962219,
|
|
"learning_rate": 9.018665487425426e-06,
|
|
"loss": 0.37983551025390627,
|
|
"memory(GiB)": 31.15,
|
|
"step": 485,
|
|
"token_acc": 0.8818607401567042,
|
|
"train_speed(iter/s)": 0.120742
|
|
},
|
|
{
|
|
"epoch": 0.6146609172873383,
|
|
"grad_norm": 0.8359962701797485,
|
|
"learning_rate": 8.999034612154204e-06,
|
|
"loss": 0.3970278263092041,
|
|
"memory(GiB)": 31.15,
|
|
"step": 490,
|
|
"token_acc": 0.8752512282268871,
|
|
"train_speed(iter/s)": 0.121067
|
|
},
|
|
{
|
|
"epoch": 0.6209329674637397,
|
|
"grad_norm": 0.8340564966201782,
|
|
"learning_rate": 8.979231139501417e-06,
|
|
"loss": 0.3811976909637451,
|
|
"memory(GiB)": 31.15,
|
|
"step": 495,
|
|
"token_acc": 0.8809906820044313,
|
|
"train_speed(iter/s)": 0.121367
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"grad_norm": 0.9611604809761047,
|
|
"learning_rate": 8.95925592418023e-06,
|
|
"loss": 0.3964669227600098,
|
|
"memory(GiB)": 31.15,
|
|
"step": 500,
|
|
"token_acc": 0.8726217824114979,
|
|
"train_speed(iter/s)": 0.121645
|
|
},
|
|
{
|
|
"epoch": 0.6272050176401411,
|
|
"eval_loss": 0.371460884809494,
|
|
"eval_runtime": 29.5898,
|
|
"eval_samples_per_second": 17.405,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8745955235141253,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.6334770678165426,
|
|
"grad_norm": 0.8176589608192444,
|
|
"learning_rate": 8.939109828316184e-06,
|
|
"loss": 0.38572733402252196,
|
|
"memory(GiB)": 31.15,
|
|
"step": 505,
|
|
"token_acc": 0.8809286455710512,
|
|
"train_speed(iter/s)": 0.120811
|
|
},
|
|
{
|
|
"epoch": 0.639749117992944,
|
|
"grad_norm": 0.7897095084190369,
|
|
"learning_rate": 8.918793721409973e-06,
|
|
"loss": 0.3885223150253296,
|
|
"memory(GiB)": 31.15,
|
|
"step": 510,
|
|
"token_acc": 0.8733564201071371,
|
|
"train_speed(iter/s)": 0.121065
|
|
},
|
|
{
|
|
"epoch": 0.6460211681693454,
|
|
"grad_norm": 0.8051208257675171,
|
|
"learning_rate": 8.898308480299937e-06,
|
|
"loss": 0.3946079254150391,
|
|
"memory(GiB)": 31.15,
|
|
"step": 515,
|
|
"token_acc": 0.8788746774056323,
|
|
"train_speed(iter/s)": 0.121332
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"grad_norm": 0.7898780703544617,
|
|
"learning_rate": 8.877654989124202e-06,
|
|
"loss": 0.38194358348846436,
|
|
"memory(GiB)": 31.15,
|
|
"step": 520,
|
|
"token_acc": 0.889278570841439,
|
|
"train_speed(iter/s)": 0.121638
|
|
},
|
|
{
|
|
"epoch": 0.6522932183457467,
|
|
"eval_loss": 0.3701505661010742,
|
|
"eval_runtime": 29.5916,
|
|
"eval_samples_per_second": 17.404,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8744613965965294,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.6585652685221481,
|
|
"grad_norm": 0.7575268745422363,
|
|
"learning_rate": 8.856834139282531e-06,
|
|
"loss": 0.3784614086151123,
|
|
"memory(GiB)": 31.15,
|
|
"step": 525,
|
|
"token_acc": 0.8807604793446934,
|
|
"train_speed(iter/s)": 0.120829
|
|
},
|
|
{
|
|
"epoch": 0.6648373186985496,
|
|
"grad_norm": 0.8083406686782837,
|
|
"learning_rate": 8.835846829397843e-06,
|
|
"loss": 0.38791258335113527,
|
|
"memory(GiB)": 31.15,
|
|
"step": 530,
|
|
"token_acc": 0.8846142795575348,
|
|
"train_speed(iter/s)": 0.121137
|
|
},
|
|
{
|
|
"epoch": 0.671109368874951,
|
|
"grad_norm": 0.8856268525123596,
|
|
"learning_rate": 8.814693965277435e-06,
|
|
"loss": 0.38564338684082033,
|
|
"memory(GiB)": 31.15,
|
|
"step": 535,
|
|
"token_acc": 0.879522377393324,
|
|
"train_speed(iter/s)": 0.121379
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"grad_norm": 0.7782846093177795,
|
|
"learning_rate": 8.793376459873888e-06,
|
|
"loss": 0.39195048809051514,
|
|
"memory(GiB)": 31.15,
|
|
"step": 540,
|
|
"token_acc": 0.8783795212990894,
|
|
"train_speed(iter/s)": 0.121643
|
|
},
|
|
{
|
|
"epoch": 0.6773814190513524,
|
|
"eval_loss": 0.36819902062416077,
|
|
"eval_runtime": 29.5691,
|
|
"eval_samples_per_second": 17.417,
|
|
"eval_steps_per_second": 4.363,
|
|
"eval_token_acc": 0.8753080727638528,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6836534692277538,
|
|
"grad_norm": 0.8303295373916626,
|
|
"learning_rate": 8.771895233245655e-06,
|
|
"loss": 0.3857764720916748,
|
|
"memory(GiB)": 31.15,
|
|
"step": 545,
|
|
"token_acc": 0.8770407440630754,
|
|
"train_speed(iter/s)": 0.120864
|
|
},
|
|
{
|
|
"epoch": 0.6899255194041553,
|
|
"grad_norm": 0.8560570478439331,
|
|
"learning_rate": 8.750251212517364e-06,
|
|
"loss": 0.3794244289398193,
|
|
"memory(GiB)": 31.15,
|
|
"step": 550,
|
|
"token_acc": 0.8898940454798935,
|
|
"train_speed(iter/s)": 0.121097
|
|
},
|
|
{
|
|
"epoch": 0.6961975695805567,
|
|
"grad_norm": 0.7564458250999451,
|
|
"learning_rate": 8.728445331839796e-06,
|
|
"loss": 0.3893013000488281,
|
|
"memory(GiB)": 31.15,
|
|
"step": 555,
|
|
"token_acc": 0.8829551217038539,
|
|
"train_speed(iter/s)": 0.121376
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"grad_norm": 0.7831209897994995,
|
|
"learning_rate": 8.706478532349567e-06,
|
|
"loss": 0.38511061668395996,
|
|
"memory(GiB)": 31.15,
|
|
"step": 560,
|
|
"token_acc": 0.8776614357621598,
|
|
"train_speed(iter/s)": 0.121619
|
|
},
|
|
{
|
|
"epoch": 0.7024696197569581,
|
|
"eval_loss": 0.36745068430900574,
|
|
"eval_runtime": 29.5839,
|
|
"eval_samples_per_second": 17.408,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.875936792690083,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.7087416699333595,
|
|
"grad_norm": 0.9281031489372253,
|
|
"learning_rate": 8.684351762128511e-06,
|
|
"loss": 0.3907186508178711,
|
|
"memory(GiB)": 31.15,
|
|
"step": 565,
|
|
"token_acc": 0.8817858594067453,
|
|
"train_speed(iter/s)": 0.120888
|
|
},
|
|
{
|
|
"epoch": 0.7150137201097608,
|
|
"grad_norm": 0.8066033720970154,
|
|
"learning_rate": 8.662065976162765e-06,
|
|
"loss": 0.3858931541442871,
|
|
"memory(GiB)": 31.15,
|
|
"step": 570,
|
|
"token_acc": 0.8792498385693029,
|
|
"train_speed(iter/s)": 0.121186
|
|
},
|
|
{
|
|
"epoch": 0.7212857702861623,
|
|
"grad_norm": 0.8844251036643982,
|
|
"learning_rate": 8.639622136301541e-06,
|
|
"loss": 0.3788000583648682,
|
|
"memory(GiB)": 33.59,
|
|
"step": 575,
|
|
"token_acc": 0.8837105552746631,
|
|
"train_speed(iter/s)": 0.121421
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"grad_norm": 0.8435778021812439,
|
|
"learning_rate": 8.617021211215629e-06,
|
|
"loss": 0.37533106803894045,
|
|
"memory(GiB)": 33.59,
|
|
"step": 580,
|
|
"token_acc": 0.8742189278757369,
|
|
"train_speed(iter/s)": 0.12169
|
|
},
|
|
{
|
|
"epoch": 0.7275578204625637,
|
|
"eval_loss": 0.36586904525756836,
|
|
"eval_runtime": 29.5932,
|
|
"eval_samples_per_second": 17.403,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8756224327269679,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.7338298706389651,
|
|
"grad_norm": 0.7818266153335571,
|
|
"learning_rate": 8.594264176355565e-06,
|
|
"loss": 0.37725415229797366,
|
|
"memory(GiB)": 33.59,
|
|
"step": 585,
|
|
"token_acc": 0.8777972097460154,
|
|
"train_speed(iter/s)": 0.120955
|
|
},
|
|
{
|
|
"epoch": 0.7401019208153665,
|
|
"grad_norm": 0.8796353936195374,
|
|
"learning_rate": 8.571352013909558e-06,
|
|
"loss": 0.400989294052124,
|
|
"memory(GiB)": 33.59,
|
|
"step": 590,
|
|
"token_acc": 0.8727272727272727,
|
|
"train_speed(iter/s)": 0.121228
|
|
},
|
|
{
|
|
"epoch": 0.7463739709917679,
|
|
"grad_norm": 0.7808417677879333,
|
|
"learning_rate": 8.548285712761084e-06,
|
|
"loss": 0.3853422164916992,
|
|
"memory(GiB)": 33.59,
|
|
"step": 595,
|
|
"token_acc": 0.8805738658394726,
|
|
"train_speed(iter/s)": 0.121437
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"grad_norm": 0.833470344543457,
|
|
"learning_rate": 8.525066268446208e-06,
|
|
"loss": 0.37978854179382326,
|
|
"memory(GiB)": 33.59,
|
|
"step": 600,
|
|
"token_acc": 0.8784342932803707,
|
|
"train_speed(iter/s)": 0.121678
|
|
},
|
|
{
|
|
"epoch": 0.7526460211681694,
|
|
"eval_loss": 0.3638017177581787,
|
|
"eval_runtime": 29.6016,
|
|
"eval_samples_per_second": 17.398,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8765571296839635,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7589180713445708,
|
|
"grad_norm": 0.80388343334198,
|
|
"learning_rate": 8.501694683110615e-06,
|
|
"loss": 0.39281136989593507,
|
|
"memory(GiB)": 33.59,
|
|
"step": 605,
|
|
"token_acc": 0.8803214107664786,
|
|
"train_speed(iter/s)": 0.120954
|
|
},
|
|
{
|
|
"epoch": 0.7651901215209722,
|
|
"grad_norm": 0.8746615052223206,
|
|
"learning_rate": 8.478171965466366e-06,
|
|
"loss": 0.38366003036499025,
|
|
"memory(GiB)": 33.59,
|
|
"step": 610,
|
|
"token_acc": 0.8762894230294654,
|
|
"train_speed(iter/s)": 0.121177
|
|
},
|
|
{
|
|
"epoch": 0.7714621716973736,
|
|
"grad_norm": 0.9113203287124634,
|
|
"learning_rate": 8.454499130748352e-06,
|
|
"loss": 0.3745842933654785,
|
|
"memory(GiB)": 33.59,
|
|
"step": 615,
|
|
"token_acc": 0.8812096914763814,
|
|
"train_speed(iter/s)": 0.121476
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"grad_norm": 0.8363653421401978,
|
|
"learning_rate": 8.43067720067048e-06,
|
|
"loss": 0.37692205905914306,
|
|
"memory(GiB)": 33.59,
|
|
"step": 620,
|
|
"token_acc": 0.8883228276199522,
|
|
"train_speed(iter/s)": 0.121722
|
|
},
|
|
{
|
|
"epoch": 0.777734221873775,
|
|
"eval_loss": 0.36300593614578247,
|
|
"eval_runtime": 29.8369,
|
|
"eval_samples_per_second": 17.261,
|
|
"eval_steps_per_second": 4.324,
|
|
"eval_token_acc": 0.8767834688574063,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.7840062720501764,
|
|
"grad_norm": 0.8239129781723022,
|
|
"learning_rate": 8.40670720338158e-06,
|
|
"loss": 0.3925849437713623,
|
|
"memory(GiB)": 33.59,
|
|
"step": 625,
|
|
"token_acc": 0.8787143407663546,
|
|
"train_speed(iter/s)": 0.120995
|
|
},
|
|
{
|
|
"epoch": 0.7902783222265778,
|
|
"grad_norm": 0.8488283157348633,
|
|
"learning_rate": 8.382590173421029e-06,
|
|
"loss": 0.3935189485549927,
|
|
"memory(GiB)": 33.59,
|
|
"step": 630,
|
|
"token_acc": 0.8936352849099952,
|
|
"train_speed(iter/s)": 0.121252
|
|
},
|
|
{
|
|
"epoch": 0.7965503724029792,
|
|
"grad_norm": 0.8239266276359558,
|
|
"learning_rate": 8.358327151674095e-06,
|
|
"loss": 0.39822547435760497,
|
|
"memory(GiB)": 33.59,
|
|
"step": 635,
|
|
"token_acc": 0.869016123396396,
|
|
"train_speed(iter/s)": 0.121499
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"grad_norm": 0.8977949619293213,
|
|
"learning_rate": 8.33391918532702e-06,
|
|
"loss": 0.3884063720703125,
|
|
"memory(GiB)": 33.59,
|
|
"step": 640,
|
|
"token_acc": 0.8792178010735076,
|
|
"train_speed(iter/s)": 0.121738
|
|
},
|
|
{
|
|
"epoch": 0.8028224225793806,
|
|
"eval_loss": 0.36090487241744995,
|
|
"eval_runtime": 29.6252,
|
|
"eval_samples_per_second": 17.384,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8771942325425434,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.8090944727557821,
|
|
"grad_norm": 0.7422317266464233,
|
|
"learning_rate": 8.309367327821819e-06,
|
|
"loss": 0.36748080253601073,
|
|
"memory(GiB)": 33.59,
|
|
"step": 645,
|
|
"token_acc": 0.882223291626564,
|
|
"train_speed(iter/s)": 0.12104
|
|
},
|
|
{
|
|
"epoch": 0.8153665229321835,
|
|
"grad_norm": 0.8495545387268066,
|
|
"learning_rate": 8.284672638810813e-06,
|
|
"loss": 0.37848606109619143,
|
|
"memory(GiB)": 33.59,
|
|
"step": 650,
|
|
"token_acc": 0.8850858133346354,
|
|
"train_speed(iter/s)": 0.121243
|
|
},
|
|
{
|
|
"epoch": 0.8216385731085849,
|
|
"grad_norm": 0.7722618579864502,
|
|
"learning_rate": 8.259836184110904e-06,
|
|
"loss": 0.36829509735107424,
|
|
"memory(GiB)": 33.59,
|
|
"step": 655,
|
|
"token_acc": 0.875583076461442,
|
|
"train_speed(iter/s)": 0.121449
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"grad_norm": 0.7796041965484619,
|
|
"learning_rate": 8.234859035657557e-06,
|
|
"loss": 0.40014114379882815,
|
|
"memory(GiB)": 33.59,
|
|
"step": 660,
|
|
"token_acc": 0.8759235981236778,
|
|
"train_speed(iter/s)": 0.121697
|
|
},
|
|
{
|
|
"epoch": 0.8279106232849863,
|
|
"eval_loss": 0.3596991300582886,
|
|
"eval_runtime": 29.7406,
|
|
"eval_samples_per_second": 17.316,
|
|
"eval_steps_per_second": 4.338,
|
|
"eval_token_acc": 0.8776930170173527,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.8341826734613876,
|
|
"grad_norm": 0.7468088269233704,
|
|
"learning_rate": 8.209742271458556e-06,
|
|
"loss": 0.38162546157836913,
|
|
"memory(GiB)": 33.59,
|
|
"step": 665,
|
|
"token_acc": 0.8829189729162207,
|
|
"train_speed(iter/s)": 0.12099
|
|
},
|
|
{
|
|
"epoch": 0.8404547236377891,
|
|
"grad_norm": 0.7364327311515808,
|
|
"learning_rate": 8.18448697554746e-06,
|
|
"loss": 0.36592922210693357,
|
|
"memory(GiB)": 33.59,
|
|
"step": 670,
|
|
"token_acc": 0.8770340880816123,
|
|
"train_speed(iter/s)": 0.121214
|
|
},
|
|
{
|
|
"epoch": 0.8467267738141905,
|
|
"grad_norm": 0.7761643528938293,
|
|
"learning_rate": 8.159094237936828e-06,
|
|
"loss": 0.38688228130340574,
|
|
"memory(GiB)": 33.59,
|
|
"step": 675,
|
|
"token_acc": 0.8751209347324551,
|
|
"train_speed(iter/s)": 0.12144
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"grad_norm": 0.8629448413848877,
|
|
"learning_rate": 8.133565154571169e-06,
|
|
"loss": 0.3881547451019287,
|
|
"memory(GiB)": 33.59,
|
|
"step": 680,
|
|
"token_acc": 0.8676196768574689,
|
|
"train_speed(iter/s)": 0.121629
|
|
},
|
|
{
|
|
"epoch": 0.8529988239905919,
|
|
"eval_loss": 0.3583786189556122,
|
|
"eval_runtime": 29.6285,
|
|
"eval_samples_per_second": 17.382,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8773115935954396,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.8592708741669933,
|
|
"grad_norm": 0.7593112587928772,
|
|
"learning_rate": 8.107900827279638e-06,
|
|
"loss": 0.37020533084869384,
|
|
"memory(GiB)": 33.59,
|
|
"step": 685,
|
|
"token_acc": 0.884691054718319,
|
|
"train_speed(iter/s)": 0.120936
|
|
},
|
|
{
|
|
"epoch": 0.8655429243433947,
|
|
"grad_norm": 0.797288179397583,
|
|
"learning_rate": 8.082102363728494e-06,
|
|
"loss": 0.3854295492172241,
|
|
"memory(GiB)": 33.59,
|
|
"step": 690,
|
|
"token_acc": 0.8761994516792323,
|
|
"train_speed(iter/s)": 0.121156
|
|
},
|
|
{
|
|
"epoch": 0.8718149745197962,
|
|
"grad_norm": 0.7850540280342102,
|
|
"learning_rate": 8.056170877373277e-06,
|
|
"loss": 0.40497736930847167,
|
|
"memory(GiB)": 33.59,
|
|
"step": 695,
|
|
"token_acc": 0.8737006516938655,
|
|
"train_speed(iter/s)": 0.121359
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"grad_norm": 0.7562994956970215,
|
|
"learning_rate": 8.030107487410766e-06,
|
|
"loss": 0.37325115203857423,
|
|
"memory(GiB)": 33.59,
|
|
"step": 700,
|
|
"token_acc": 0.8829526314234213,
|
|
"train_speed(iter/s)": 0.121586
|
|
},
|
|
{
|
|
"epoch": 0.8780870246961976,
|
|
"eval_loss": 0.3569120168685913,
|
|
"eval_runtime": 29.6346,
|
|
"eval_samples_per_second": 17.378,
|
|
"eval_steps_per_second": 4.353,
|
|
"eval_token_acc": 0.878032525777517,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.884359074872599,
|
|
"grad_norm": 0.8399495482444763,
|
|
"learning_rate": 8.003913318730662e-06,
|
|
"loss": 0.3845979690551758,
|
|
"memory(GiB)": 33.59,
|
|
"step": 705,
|
|
"token_acc": 0.8839846493683643,
|
|
"train_speed(iter/s)": 0.12098
|
|
},
|
|
{
|
|
"epoch": 0.8906311250490004,
|
|
"grad_norm": 0.8614782691001892,
|
|
"learning_rate": 7.97758950186705e-06,
|
|
"loss": 0.3775152683258057,
|
|
"memory(GiB)": 33.59,
|
|
"step": 710,
|
|
"token_acc": 0.8816499315970161,
|
|
"train_speed(iter/s)": 0.121192
|
|
},
|
|
{
|
|
"epoch": 0.8969031752254017,
|
|
"grad_norm": 0.8226941823959351,
|
|
"learning_rate": 7.951137172949595e-06,
|
|
"loss": 0.37194027900695803,
|
|
"memory(GiB)": 33.59,
|
|
"step": 715,
|
|
"token_acc": 0.8872406234047026,
|
|
"train_speed(iter/s)": 0.121408
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"grad_norm": 0.7586838006973267,
|
|
"learning_rate": 7.924557473654516e-06,
|
|
"loss": 0.37108325958251953,
|
|
"memory(GiB)": 33.59,
|
|
"step": 720,
|
|
"token_acc": 0.87804646976623,
|
|
"train_speed(iter/s)": 0.121604
|
|
},
|
|
{
|
|
"epoch": 0.9031752254018032,
|
|
"eval_loss": 0.3558177351951599,
|
|
"eval_runtime": 29.6063,
|
|
"eval_samples_per_second": 17.395,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8782337161539107,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.9094472755782046,
|
|
"grad_norm": 0.7763127088546753,
|
|
"learning_rate": 7.897851551155306e-06,
|
|
"loss": 0.378222918510437,
|
|
"memory(GiB)": 33.59,
|
|
"step": 725,
|
|
"token_acc": 0.8826250789141414,
|
|
"train_speed(iter/s)": 0.121006
|
|
},
|
|
{
|
|
"epoch": 0.915719325754606,
|
|
"grad_norm": 0.8064581751823425,
|
|
"learning_rate": 7.871020558073217e-06,
|
|
"loss": 0.3942488431930542,
|
|
"memory(GiB)": 33.6,
|
|
"step": 730,
|
|
"token_acc": 0.8696306499336802,
|
|
"train_speed(iter/s)": 0.121223
|
|
},
|
|
{
|
|
"epoch": 0.9219913759310074,
|
|
"grad_norm": 0.8451412320137024,
|
|
"learning_rate": 7.844065652427523e-06,
|
|
"loss": 0.37436461448669434,
|
|
"memory(GiB)": 33.6,
|
|
"step": 735,
|
|
"token_acc": 0.8797443405788941,
|
|
"train_speed(iter/s)": 0.121448
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"grad_norm": 0.7629019021987915,
|
|
"learning_rate": 7.816987997585535e-06,
|
|
"loss": 0.3741127967834473,
|
|
"memory(GiB)": 33.6,
|
|
"step": 740,
|
|
"token_acc": 0.8834049015500628,
|
|
"train_speed(iter/s)": 0.121613
|
|
},
|
|
{
|
|
"epoch": 0.9282634261074089,
|
|
"eval_loss": 0.35523363947868347,
|
|
"eval_runtime": 29.6401,
|
|
"eval_samples_per_second": 17.375,
|
|
"eval_steps_per_second": 4.352,
|
|
"eval_token_acc": 0.8790091373962612,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.9345354762838103,
|
|
"grad_norm": 0.8186476230621338,
|
|
"learning_rate": 7.789788762212384e-06,
|
|
"loss": 0.3544290542602539,
|
|
"memory(GiB)": 33.6,
|
|
"step": 745,
|
|
"token_acc": 0.8867604833554854,
|
|
"train_speed(iter/s)": 0.121009
|
|
},
|
|
{
|
|
"epoch": 0.9408075264602117,
|
|
"grad_norm": 0.7892398238182068,
|
|
"learning_rate": 7.762469120220595e-06,
|
|
"loss": 0.3707085609436035,
|
|
"memory(GiB)": 33.6,
|
|
"step": 750,
|
|
"token_acc": 0.8836383423547269,
|
|
"train_speed(iter/s)": 0.121228
|
|
},
|
|
{
|
|
"epoch": 0.9470795766366131,
|
|
"grad_norm": 0.7983216047286987,
|
|
"learning_rate": 7.73503025071941e-06,
|
|
"loss": 0.38054685592651366,
|
|
"memory(GiB)": 33.6,
|
|
"step": 755,
|
|
"token_acc": 0.8786164633787048,
|
|
"train_speed(iter/s)": 0.121423
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"grad_norm": 0.7637438178062439,
|
|
"learning_rate": 7.7074733379639e-06,
|
|
"loss": 0.3841462373733521,
|
|
"memory(GiB)": 33.6,
|
|
"step": 760,
|
|
"token_acc": 0.8768013924795863,
|
|
"train_speed(iter/s)": 0.12163
|
|
},
|
|
{
|
|
"epoch": 0.9533516268130146,
|
|
"eval_loss": 0.35423436760902405,
|
|
"eval_runtime": 29.6267,
|
|
"eval_samples_per_second": 17.383,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8791390728476821,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.9596236769894159,
|
|
"grad_norm": 0.7450261116027832,
|
|
"learning_rate": 7.679799571303861e-06,
|
|
"loss": 0.3811268091201782,
|
|
"memory(GiB)": 33.6,
|
|
"step": 765,
|
|
"token_acc": 0.8854130493498475,
|
|
"train_speed(iter/s)": 0.121106
|
|
},
|
|
{
|
|
"epoch": 0.9658957271658173,
|
|
"grad_norm": 0.8632524013519287,
|
|
"learning_rate": 7.65201014513247e-06,
|
|
"loss": 0.38339235782623293,
|
|
"memory(GiB)": 33.6,
|
|
"step": 770,
|
|
"token_acc": 0.876136081450211,
|
|
"train_speed(iter/s)": 0.121281
|
|
},
|
|
{
|
|
"epoch": 0.9721677773422187,
|
|
"grad_norm": 0.7771950364112854,
|
|
"learning_rate": 7.62410625883474e-06,
|
|
"loss": 0.37677223682403566,
|
|
"memory(GiB)": 33.6,
|
|
"step": 775,
|
|
"token_acc": 0.8823102678571428,
|
|
"train_speed(iter/s)": 0.121452
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"grad_norm": 0.8017596006393433,
|
|
"learning_rate": 7.596089116735765e-06,
|
|
"loss": 0.37508654594421387,
|
|
"memory(GiB)": 33.6,
|
|
"step": 780,
|
|
"token_acc": 0.8823698425468337,
|
|
"train_speed(iter/s)": 0.121653
|
|
},
|
|
{
|
|
"epoch": 0.9784398275186201,
|
|
"eval_loss": 0.3527185022830963,
|
|
"eval_runtime": 29.6007,
|
|
"eval_samples_per_second": 17.398,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8791348813815073,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.9847118776950216,
|
|
"grad_norm": 0.8020228147506714,
|
|
"learning_rate": 7.567959928048723e-06,
|
|
"loss": 0.38076558113098147,
|
|
"memory(GiB)": 33.6,
|
|
"step": 785,
|
|
"token_acc": 0.8822499299523676,
|
|
"train_speed(iter/s)": 0.121074
|
|
},
|
|
{
|
|
"epoch": 0.990983927871423,
|
|
"grad_norm": 0.8349906802177429,
|
|
"learning_rate": 7.5397199068227e-06,
|
|
"loss": 0.3815056324005127,
|
|
"memory(GiB)": 33.6,
|
|
"step": 790,
|
|
"token_acc": 0.8813588951692792,
|
|
"train_speed(iter/s)": 0.121236
|
|
},
|
|
{
|
|
"epoch": 0.9972559780478244,
|
|
"grad_norm": 0.7655360102653503,
|
|
"learning_rate": 7.511370271890286e-06,
|
|
"loss": 0.37683281898498533,
|
|
"memory(GiB)": 33.6,
|
|
"step": 795,
|
|
"token_acc": 0.8849736151561464,
|
|
"train_speed(iter/s)": 0.121426
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"grad_norm": 0.781859815120697,
|
|
"learning_rate": 7.482912246814975e-06,
|
|
"loss": 0.33410110473632815,
|
|
"memory(GiB)": 33.6,
|
|
"step": 800,
|
|
"token_acc": 0.8970368853657823,
|
|
"train_speed(iter/s)": 0.121715
|
|
},
|
|
{
|
|
"epoch": 1.0025088200705605,
|
|
"eval_loss": 0.35208848118782043,
|
|
"eval_runtime": 29.6155,
|
|
"eval_samples_per_second": 17.39,
|
|
"eval_steps_per_second": 4.356,
|
|
"eval_token_acc": 0.8798600050297594,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.008780870246962,
|
|
"grad_norm": 0.7701341509819031,
|
|
"learning_rate": 7.454347059838351e-06,
|
|
"loss": 0.3262555360794067,
|
|
"memory(GiB)": 33.6,
|
|
"step": 805,
|
|
"token_acc": 0.8892221089920901,
|
|
"train_speed(iter/s)": 0.121208
|
|
},
|
|
{
|
|
"epoch": 1.0150529204233634,
|
|
"grad_norm": 0.8458274602890015,
|
|
"learning_rate": 7.425675943827084e-06,
|
|
"loss": 0.3318117618560791,
|
|
"memory(GiB)": 33.6,
|
|
"step": 810,
|
|
"token_acc": 0.8916776012730674,
|
|
"train_speed(iter/s)": 0.12141
|
|
},
|
|
{
|
|
"epoch": 1.021324970599765,
|
|
"grad_norm": 0.9188606142997742,
|
|
"learning_rate": 7.3969001362197135e-06,
|
|
"loss": 0.31556293964385984,
|
|
"memory(GiB)": 33.6,
|
|
"step": 815,
|
|
"token_acc": 0.8948849424712356,
|
|
"train_speed(iter/s)": 0.121599
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"grad_norm": 0.8285843729972839,
|
|
"learning_rate": 7.3680208789732385e-06,
|
|
"loss": 0.3075234413146973,
|
|
"memory(GiB)": 33.6,
|
|
"step": 820,
|
|
"token_acc": 0.8996916527955904,
|
|
"train_speed(iter/s)": 0.12177
|
|
},
|
|
{
|
|
"epoch": 1.0275970207761662,
|
|
"eval_loss": 0.3578624427318573,
|
|
"eval_runtime": 29.734,
|
|
"eval_samples_per_second": 17.32,
|
|
"eval_steps_per_second": 4.338,
|
|
"eval_token_acc": 0.8788289043507419,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.0338690709525675,
|
|
"grad_norm": 0.7456257939338684,
|
|
"learning_rate": 7.339039418509532e-06,
|
|
"loss": 0.3122047185897827,
|
|
"memory(GiB)": 33.6,
|
|
"step": 825,
|
|
"token_acc": 0.8912523923331079,
|
|
"train_speed(iter/s)": 0.121242
|
|
},
|
|
{
|
|
"epoch": 1.040141121128969,
|
|
"grad_norm": 0.8746250867843628,
|
|
"learning_rate": 7.309957005661521e-06,
|
|
"loss": 0.30740058422088623,
|
|
"memory(GiB)": 33.6,
|
|
"step": 830,
|
|
"token_acc": 0.903616077429762,
|
|
"train_speed(iter/s)": 0.121448
|
|
},
|
|
{
|
|
"epoch": 1.0464131713053704,
|
|
"grad_norm": 0.8537876009941101,
|
|
"learning_rate": 7.280774895619219e-06,
|
|
"loss": 0.3157168388366699,
|
|
"memory(GiB)": 33.6,
|
|
"step": 835,
|
|
"token_acc": 0.8915479475195676,
|
|
"train_speed(iter/s)": 0.121643
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"grad_norm": 0.8017281293869019,
|
|
"learning_rate": 7.25149434787555e-06,
|
|
"loss": 0.31319799423217776,
|
|
"memory(GiB)": 33.6,
|
|
"step": 840,
|
|
"token_acc": 0.8969325908377688,
|
|
"train_speed(iter/s)": 0.121836
|
|
},
|
|
{
|
|
"epoch": 1.052685221481772,
|
|
"eval_loss": 0.3565090596675873,
|
|
"eval_runtime": 29.6075,
|
|
"eval_samples_per_second": 17.394,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8790636264565345,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.0589572716581732,
|
|
"grad_norm": 0.7407189011573792,
|
|
"learning_rate": 7.2221166261719755e-06,
|
|
"loss": 0.3067447662353516,
|
|
"memory(GiB)": 33.6,
|
|
"step": 845,
|
|
"token_acc": 0.8904468233651385,
|
|
"train_speed(iter/s)": 0.1213
|
|
},
|
|
{
|
|
"epoch": 1.0652293218345747,
|
|
"grad_norm": 0.7648513913154602,
|
|
"learning_rate": 7.192642998443975e-06,
|
|
"loss": 0.31682767868041994,
|
|
"memory(GiB)": 33.6,
|
|
"step": 850,
|
|
"token_acc": 0.9042325428194994,
|
|
"train_speed(iter/s)": 0.12146
|
|
},
|
|
{
|
|
"epoch": 1.071501372010976,
|
|
"grad_norm": 0.8380671739578247,
|
|
"learning_rate": 7.163074736766299e-06,
|
|
"loss": 0.3035914421081543,
|
|
"memory(GiB)": 33.6,
|
|
"step": 855,
|
|
"token_acc": 0.900502677303027,
|
|
"train_speed(iter/s)": 0.121596
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"grad_norm": 0.8171470761299133,
|
|
"learning_rate": 7.133413117298081e-06,
|
|
"loss": 0.30316686630249023,
|
|
"memory(GiB)": 33.6,
|
|
"step": 860,
|
|
"token_acc": 0.9034346601631681,
|
|
"train_speed(iter/s)": 0.121766
|
|
},
|
|
{
|
|
"epoch": 1.0777734221873776,
|
|
"eval_loss": 0.3562227189540863,
|
|
"eval_runtime": 29.6044,
|
|
"eval_samples_per_second": 17.396,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8794701986754967,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.084045472363779,
|
|
"grad_norm": 0.7644967436790466,
|
|
"learning_rate": 7.103659420227755e-06,
|
|
"loss": 0.32071871757507325,
|
|
"memory(GiB)": 33.6,
|
|
"step": 865,
|
|
"token_acc": 0.8916257634980457,
|
|
"train_speed(iter/s)": 0.121253
|
|
},
|
|
{
|
|
"epoch": 1.0903175225401802,
|
|
"grad_norm": 0.8376749753952026,
|
|
"learning_rate": 7.0738149297178005e-06,
|
|
"loss": 0.31877703666687013,
|
|
"memory(GiB)": 33.6,
|
|
"step": 870,
|
|
"token_acc": 0.9103868211748601,
|
|
"train_speed(iter/s)": 0.121402
|
|
},
|
|
{
|
|
"epoch": 1.0965895727165818,
|
|
"grad_norm": 0.7798628211021423,
|
|
"learning_rate": 7.04388093384932e-06,
|
|
"loss": 0.3112868547439575,
|
|
"memory(GiB)": 33.6,
|
|
"step": 875,
|
|
"token_acc": 0.9032720088899864,
|
|
"train_speed(iter/s)": 0.121542
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"grad_norm": 0.8039925694465637,
|
|
"learning_rate": 7.013858724566449e-06,
|
|
"loss": 0.32082467079162597,
|
|
"memory(GiB)": 33.6,
|
|
"step": 880,
|
|
"token_acc": 0.9032403958710227,
|
|
"train_speed(iter/s)": 0.121699
|
|
},
|
|
{
|
|
"epoch": 1.102861622892983,
|
|
"eval_loss": 0.3564984202384949,
|
|
"eval_runtime": 29.624,
|
|
"eval_samples_per_second": 17.385,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8794660072093218,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.1091336730693846,
|
|
"grad_norm": 0.8847902417182922,
|
|
"learning_rate": 6.983749597620588e-06,
|
|
"loss": 0.3243894577026367,
|
|
"memory(GiB)": 33.6,
|
|
"step": 885,
|
|
"token_acc": 0.8885523807680771,
|
|
"train_speed(iter/s)": 0.121222
|
|
},
|
|
{
|
|
"epoch": 1.115405723245786,
|
|
"grad_norm": 0.8933838605880737,
|
|
"learning_rate": 6.9535548525144894e-06,
|
|
"loss": 0.3157766580581665,
|
|
"memory(GiB)": 33.6,
|
|
"step": 890,
|
|
"token_acc": 0.8964195078892199,
|
|
"train_speed(iter/s)": 0.121388
|
|
},
|
|
{
|
|
"epoch": 1.1216777734221874,
|
|
"grad_norm": 0.7823595404624939,
|
|
"learning_rate": 6.923275792446159e-06,
|
|
"loss": 0.310500955581665,
|
|
"memory(GiB)": 33.6,
|
|
"step": 895,
|
|
"token_acc": 0.897285512497697,
|
|
"train_speed(iter/s)": 0.121527
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"grad_norm": 0.7890828251838684,
|
|
"learning_rate": 6.8929137242526216e-06,
|
|
"loss": 0.31655497550964357,
|
|
"memory(GiB)": 33.6,
|
|
"step": 900,
|
|
"token_acc": 0.9005792271681345,
|
|
"train_speed(iter/s)": 0.121653
|
|
},
|
|
{
|
|
"epoch": 1.1279498235985888,
|
|
"eval_loss": 0.35559797286987305,
|
|
"eval_runtime": 29.5743,
|
|
"eval_samples_per_second": 17.414,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8799396428870819,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.1342218737749903,
|
|
"grad_norm": 0.7184568047523499,
|
|
"learning_rate": 6.862469958353506e-06,
|
|
"loss": 0.31396899223327634,
|
|
"memory(GiB)": 33.6,
|
|
"step": 905,
|
|
"token_acc": 0.8933355593966059,
|
|
"train_speed(iter/s)": 0.121163
|
|
},
|
|
{
|
|
"epoch": 1.1404939239513916,
|
|
"grad_norm": 0.9552344679832458,
|
|
"learning_rate": 6.8319458086945026e-06,
|
|
"loss": 0.33000473976135253,
|
|
"memory(GiB)": 33.6,
|
|
"step": 910,
|
|
"token_acc": 0.9004522625621548,
|
|
"train_speed(iter/s)": 0.121332
|
|
},
|
|
{
|
|
"epoch": 1.146765974127793,
|
|
"grad_norm": 0.8863224387168884,
|
|
"learning_rate": 6.801342592690641e-06,
|
|
"loss": 0.318299388885498,
|
|
"memory(GiB)": 33.6,
|
|
"step": 915,
|
|
"token_acc": 0.900812626514507,
|
|
"train_speed(iter/s)": 0.121495
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"grad_norm": 0.7559683918952942,
|
|
"learning_rate": 6.770661631169434e-06,
|
|
"loss": 0.3138300895690918,
|
|
"memory(GiB)": 33.6,
|
|
"step": 920,
|
|
"token_acc": 0.8954110250189142,
|
|
"train_speed(iter/s)": 0.121644
|
|
},
|
|
{
|
|
"epoch": 1.1530380243041944,
|
|
"eval_loss": 0.354932963848114,
|
|
"eval_runtime": 29.566,
|
|
"eval_samples_per_second": 17.419,
|
|
"eval_steps_per_second": 4.363,
|
|
"eval_token_acc": 0.8799270684885573,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.1593100744805958,
|
|
"grad_norm": 0.7694470286369324,
|
|
"learning_rate": 6.739904248313879e-06,
|
|
"loss": 0.31344189643859866,
|
|
"memory(GiB)": 33.6,
|
|
"step": 925,
|
|
"token_acc": 0.8902944897178665,
|
|
"train_speed(iter/s)": 0.121164
|
|
},
|
|
{
|
|
"epoch": 1.1655821246569973,
|
|
"grad_norm": 0.863102912902832,
|
|
"learning_rate": 6.709071771605292e-06,
|
|
"loss": 0.3148585557937622,
|
|
"memory(GiB)": 33.6,
|
|
"step": 930,
|
|
"token_acc": 0.89984285587215,
|
|
"train_speed(iter/s)": 0.1213
|
|
},
|
|
{
|
|
"epoch": 1.1718541748333986,
|
|
"grad_norm": 0.851425290107727,
|
|
"learning_rate": 6.678165531766029e-06,
|
|
"loss": 0.31011836528778075,
|
|
"memory(GiB)": 33.6,
|
|
"step": 935,
|
|
"token_acc": 0.9080040472893812,
|
|
"train_speed(iter/s)": 0.121449
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"grad_norm": 0.837813138961792,
|
|
"learning_rate": 6.647186862702038e-06,
|
|
"loss": 0.30878582000732424,
|
|
"memory(GiB)": 33.6,
|
|
"step": 940,
|
|
"token_acc": 0.9038740191923886,
|
|
"train_speed(iter/s)": 0.121586
|
|
},
|
|
{
|
|
"epoch": 1.1781262250098001,
|
|
"eval_loss": 0.3556138277053833,
|
|
"eval_runtime": 29.559,
|
|
"eval_samples_per_second": 17.423,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.879683963450415,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.1843982751862014,
|
|
"grad_norm": 0.8195896148681641,
|
|
"learning_rate": 6.616137101445301e-06,
|
|
"loss": 0.314269495010376,
|
|
"memory(GiB)": 33.6,
|
|
"step": 945,
|
|
"token_acc": 0.8931946066461408,
|
|
"train_speed(iter/s)": 0.121118
|
|
},
|
|
{
|
|
"epoch": 1.190670325362603,
|
|
"grad_norm": 0.8756702542304993,
|
|
"learning_rate": 6.58501758809612e-06,
|
|
"loss": 0.33217945098876955,
|
|
"memory(GiB)": 33.6,
|
|
"step": 950,
|
|
"token_acc": 0.8988422076495874,
|
|
"train_speed(iter/s)": 0.121284
|
|
},
|
|
{
|
|
"epoch": 1.1969423755390043,
|
|
"grad_norm": 0.8291054964065552,
|
|
"learning_rate": 6.55382966576528e-06,
|
|
"loss": 0.31435232162475585,
|
|
"memory(GiB)": 33.6,
|
|
"step": 955,
|
|
"token_acc": 0.9005618808221204,
|
|
"train_speed(iter/s)": 0.121434
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"grad_norm": 0.8235255479812622,
|
|
"learning_rate": 6.522574680516081e-06,
|
|
"loss": 0.3093531608581543,
|
|
"memory(GiB)": 33.6,
|
|
"step": 960,
|
|
"token_acc": 0.9002794452494307,
|
|
"train_speed(iter/s)": 0.121606
|
|
},
|
|
{
|
|
"epoch": 1.2032144257154056,
|
|
"eval_loss": 0.3561866879463196,
|
|
"eval_runtime": 29.5597,
|
|
"eval_samples_per_second": 17.422,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.8802288540531478,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.2094864758918071,
|
|
"grad_norm": 0.8153970837593079,
|
|
"learning_rate": 6.491253981306245e-06,
|
|
"loss": 0.325747013092041,
|
|
"memory(GiB)": 33.6,
|
|
"step": 965,
|
|
"token_acc": 0.8883892481810832,
|
|
"train_speed(iter/s)": 0.121177
|
|
},
|
|
{
|
|
"epoch": 1.2157585260682087,
|
|
"grad_norm": 0.8184377551078796,
|
|
"learning_rate": 6.459868919929691e-06,
|
|
"loss": 0.3134697675704956,
|
|
"memory(GiB)": 33.6,
|
|
"step": 970,
|
|
"token_acc": 0.8953274158432424,
|
|
"train_speed(iter/s)": 0.121304
|
|
},
|
|
{
|
|
"epoch": 1.22203057624461,
|
|
"grad_norm": 0.8619184494018555,
|
|
"learning_rate": 6.428420850958194e-06,
|
|
"loss": 0.3030562162399292,
|
|
"memory(GiB)": 33.6,
|
|
"step": 975,
|
|
"token_acc": 0.9094885815374718,
|
|
"train_speed(iter/s)": 0.121432
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"grad_norm": 0.7978628873825073,
|
|
"learning_rate": 6.3969111316829215e-06,
|
|
"loss": 0.319288444519043,
|
|
"memory(GiB)": 33.6,
|
|
"step": 980,
|
|
"token_acc": 0.8982630272952854,
|
|
"train_speed(iter/s)": 0.121593
|
|
},
|
|
{
|
|
"epoch": 1.2283026264210113,
|
|
"eval_loss": 0.35557761788368225,
|
|
"eval_runtime": 29.5767,
|
|
"eval_samples_per_second": 17.412,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8800444295414536,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.2345746765974128,
|
|
"grad_norm": 0.7945271730422974,
|
|
"learning_rate": 6.365341122055857e-06,
|
|
"loss": 0.31643688678741455,
|
|
"memory(GiB)": 33.6,
|
|
"step": 985,
|
|
"token_acc": 0.8898873699940827,
|
|
"train_speed(iter/s)": 0.121182
|
|
},
|
|
{
|
|
"epoch": 1.2408467267738141,
|
|
"grad_norm": 0.7574790716171265,
|
|
"learning_rate": 6.333712184631093e-06,
|
|
"loss": 0.3020521879196167,
|
|
"memory(GiB)": 33.6,
|
|
"step": 990,
|
|
"token_acc": 0.9032726908234351,
|
|
"train_speed(iter/s)": 0.121316
|
|
},
|
|
{
|
|
"epoch": 1.2471187769502157,
|
|
"grad_norm": 0.8387673497200012,
|
|
"learning_rate": 6.302025684506042e-06,
|
|
"loss": 0.3192462682723999,
|
|
"memory(GiB)": 33.6,
|
|
"step": 995,
|
|
"token_acc": 0.8955223880597015,
|
|
"train_speed(iter/s)": 0.121466
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"grad_norm": 0.7831020951271057,
|
|
"learning_rate": 6.2702829892625e-06,
|
|
"loss": 0.31763949394226076,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1000,
|
|
"token_acc": 0.9040835976438605,
|
|
"train_speed(iter/s)": 0.121627
|
|
},
|
|
{
|
|
"epoch": 1.253390827126617,
|
|
"eval_loss": 0.3534168303012848,
|
|
"eval_runtime": 29.5641,
|
|
"eval_samples_per_second": 17.42,
|
|
"eval_steps_per_second": 4.363,
|
|
"eval_token_acc": 0.880287534579596,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.2596628773030183,
|
|
"grad_norm": 0.8562179803848267,
|
|
"learning_rate": 6.238485468907637e-06,
|
|
"loss": 0.317755913734436,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1005,
|
|
"token_acc": 0.893742948563819,
|
|
"train_speed(iter/s)": 0.121201
|
|
},
|
|
{
|
|
"epoch": 1.2659349274794198,
|
|
"grad_norm": 0.7360610365867615,
|
|
"learning_rate": 6.2066344958148596e-06,
|
|
"loss": 0.3083625555038452,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1010,
|
|
"token_acc": 0.9031221682181398,
|
|
"train_speed(iter/s)": 0.121357
|
|
},
|
|
{
|
|
"epoch": 1.2722069776558214,
|
|
"grad_norm": 0.7288538217544556,
|
|
"learning_rate": 6.174731444664579e-06,
|
|
"loss": 0.3151975154876709,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1015,
|
|
"token_acc": 0.8976653802994226,
|
|
"train_speed(iter/s)": 0.121523
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"grad_norm": 0.7573474645614624,
|
|
"learning_rate": 6.14277769238489e-06,
|
|
"loss": 0.30403614044189453,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1020,
|
|
"token_acc": 0.9034773205850549,
|
|
"train_speed(iter/s)": 0.121659
|
|
},
|
|
{
|
|
"epoch": 1.2784790278322227,
|
|
"eval_loss": 0.35168829560279846,
|
|
"eval_runtime": 29.6625,
|
|
"eval_samples_per_second": 17.362,
|
|
"eval_steps_per_second": 4.349,
|
|
"eval_token_acc": 0.8805432140162629,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.284751078008624,
|
|
"grad_norm": 0.7631447315216064,
|
|
"learning_rate": 6.110774618092128e-06,
|
|
"loss": 0.302550745010376,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1025,
|
|
"token_acc": 0.8947289080950706,
|
|
"train_speed(iter/s)": 0.121234
|
|
},
|
|
{
|
|
"epoch": 1.2910231281850255,
|
|
"grad_norm": 0.7580868601799011,
|
|
"learning_rate": 6.07872360303136e-06,
|
|
"loss": 0.3163439273834229,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1030,
|
|
"token_acc": 0.8989989733059548,
|
|
"train_speed(iter/s)": 0.121408
|
|
},
|
|
{
|
|
"epoch": 1.2972951783614268,
|
|
"grad_norm": 0.8098512291908264,
|
|
"learning_rate": 6.046626030516766e-06,
|
|
"loss": 0.31558966636657715,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1035,
|
|
"token_acc": 0.902613864848326,
|
|
"train_speed(iter/s)": 0.121551
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"grad_norm": 0.7808286547660828,
|
|
"learning_rate": 6.0144832858719256e-06,
|
|
"loss": 0.31145410537719725,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1040,
|
|
"token_acc": 0.9054388771350431,
|
|
"train_speed(iter/s)": 0.121678
|
|
},
|
|
{
|
|
"epoch": 1.3035672285378284,
|
|
"eval_loss": 0.351917564868927,
|
|
"eval_runtime": 29.5693,
|
|
"eval_samples_per_second": 17.417,
|
|
"eval_steps_per_second": 4.363,
|
|
"eval_token_acc": 0.8806480006706345,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.3098392787142297,
|
|
"grad_norm": 0.8395740389823914,
|
|
"learning_rate": 5.982296756370052e-06,
|
|
"loss": 0.30791757106781004,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1045,
|
|
"token_acc": 0.8919198395135772,
|
|
"train_speed(iter/s)": 0.121267
|
|
},
|
|
{
|
|
"epoch": 1.3161113288906312,
|
|
"grad_norm": 0.8778785467147827,
|
|
"learning_rate": 5.950067831174086e-06,
|
|
"loss": 0.3176340341567993,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1050,
|
|
"token_acc": 0.8968351513289292,
|
|
"train_speed(iter/s)": 0.121396
|
|
},
|
|
{
|
|
"epoch": 1.3223833790670325,
|
|
"grad_norm": 0.8171632289886475,
|
|
"learning_rate": 5.917797901276771e-06,
|
|
"loss": 0.3169762134552002,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1055,
|
|
"token_acc": 0.8947547773205065,
|
|
"train_speed(iter/s)": 0.121546
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"grad_norm": 0.852591335773468,
|
|
"learning_rate": 5.885488359440592e-06,
|
|
"loss": 0.30669825077056884,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1060,
|
|
"token_acc": 0.897007142047387,
|
|
"train_speed(iter/s)": 0.121675
|
|
},
|
|
{
|
|
"epoch": 1.328655429243434,
|
|
"eval_loss": 0.35168930888175964,
|
|
"eval_runtime": 29.6924,
|
|
"eval_samples_per_second": 17.345,
|
|
"eval_steps_per_second": 4.345,
|
|
"eval_token_acc": 0.8810084667616732,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.3349274794198354,
|
|
"grad_norm": 0.7796351909637451,
|
|
"learning_rate": 5.853140600137684e-06,
|
|
"loss": 0.3120392322540283,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1065,
|
|
"token_acc": 0.8927770755874566,
|
|
"train_speed(iter/s)": 0.121274
|
|
},
|
|
{
|
|
"epoch": 1.3411995295962367,
|
|
"grad_norm": 0.7424667477607727,
|
|
"learning_rate": 5.8207560194896325e-06,
|
|
"loss": 0.3261461973190308,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1070,
|
|
"token_acc": 0.8903715475668567,
|
|
"train_speed(iter/s)": 0.121404
|
|
},
|
|
{
|
|
"epoch": 1.3474715797726382,
|
|
"grad_norm": 0.7519833445549011,
|
|
"learning_rate": 5.78833601520723e-06,
|
|
"loss": 0.3177935123443604,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1075,
|
|
"token_acc": 0.9002484697897097,
|
|
"train_speed(iter/s)": 0.121541
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"grad_norm": 0.7985767722129822,
|
|
"learning_rate": 5.755881986530137e-06,
|
|
"loss": 0.3214226722717285,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1080,
|
|
"token_acc": 0.8970797820315775,
|
|
"train_speed(iter/s)": 0.121665
|
|
},
|
|
{
|
|
"epoch": 1.3537436299490395,
|
|
"eval_loss": 0.3515044152736664,
|
|
"eval_runtime": 29.5934,
|
|
"eval_samples_per_second": 17.403,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8810922960851706,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.360015680125441,
|
|
"grad_norm": 0.8925988674163818,
|
|
"learning_rate": 5.723395334166506e-06,
|
|
"loss": 0.3182457447052002,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1085,
|
|
"token_acc": 0.891329215282544,
|
|
"train_speed(iter/s)": 0.121279
|
|
},
|
|
{
|
|
"epoch": 1.3662877303018424,
|
|
"grad_norm": 0.8478845953941345,
|
|
"learning_rate": 5.6908774602325165e-06,
|
|
"loss": 0.301633358001709,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1090,
|
|
"token_acc": 0.9010009298255209,
|
|
"train_speed(iter/s)": 0.121401
|
|
},
|
|
{
|
|
"epoch": 1.372559780478244,
|
|
"grad_norm": 0.7945632934570312,
|
|
"learning_rate": 5.6583297681918615e-06,
|
|
"loss": 0.3118173122406006,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1095,
|
|
"token_acc": 0.8985079483127266,
|
|
"train_speed(iter/s)": 0.121529
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"grad_norm": 0.8674613833427429,
|
|
"learning_rate": 5.625753662795183e-06,
|
|
"loss": 0.31327056884765625,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1100,
|
|
"token_acc": 0.8926878892390144,
|
|
"train_speed(iter/s)": 0.121662
|
|
},
|
|
{
|
|
"epoch": 1.3788318306546452,
|
|
"eval_loss": 0.3502074182033539,
|
|
"eval_runtime": 29.7159,
|
|
"eval_samples_per_second": 17.331,
|
|
"eval_steps_per_second": 4.341,
|
|
"eval_token_acc": 0.8811845083410177,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.3851038808310467,
|
|
"grad_norm": 0.9332753419876099,
|
|
"learning_rate": 5.59315055001943e-06,
|
|
"loss": 0.3266937255859375,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1105,
|
|
"token_acc": 0.8881552483640253,
|
|
"train_speed(iter/s)": 0.121281
|
|
},
|
|
{
|
|
"epoch": 1.391375931007448,
|
|
"grad_norm": 0.8074430823326111,
|
|
"learning_rate": 5.5605218370071836e-06,
|
|
"loss": 0.30109169483184817,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1110,
|
|
"token_acc": 0.9126673532440782,
|
|
"train_speed(iter/s)": 0.121389
|
|
},
|
|
{
|
|
"epoch": 1.3976479811838494,
|
|
"grad_norm": 0.7786163687705994,
|
|
"learning_rate": 5.5278689320059305e-06,
|
|
"loss": 0.32388741970062257,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1115,
|
|
"token_acc": 0.8926039631593636,
|
|
"train_speed(iter/s)": 0.12155
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"grad_norm": 0.8224254250526428,
|
|
"learning_rate": 5.4951932443072764e-06,
|
|
"loss": 0.3238008260726929,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1120,
|
|
"token_acc": 0.8939509836918276,
|
|
"train_speed(iter/s)": 0.121686
|
|
},
|
|
{
|
|
"epoch": 1.403920031360251,
|
|
"eval_loss": 0.3496204912662506,
|
|
"eval_runtime": 29.6525,
|
|
"eval_samples_per_second": 17.368,
|
|
"eval_steps_per_second": 4.35,
|
|
"eval_token_acc": 0.8808827227764272,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.4101920815366524,
|
|
"grad_norm": 0.8136359453201294,
|
|
"learning_rate": 5.462496184186118e-06,
|
|
"loss": 0.31591062545776366,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1125,
|
|
"token_acc": 0.8925674700533988,
|
|
"train_speed(iter/s)": 0.121313
|
|
},
|
|
{
|
|
"epoch": 1.4164641317130537,
|
|
"grad_norm": 0.7958024740219116,
|
|
"learning_rate": 5.429779162839787e-06,
|
|
"loss": 0.32611215114593506,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1130,
|
|
"token_acc": 0.8966544669669669,
|
|
"train_speed(iter/s)": 0.121454
|
|
},
|
|
{
|
|
"epoch": 1.422736181889455,
|
|
"grad_norm": 0.7814271450042725,
|
|
"learning_rate": 5.397043592327129e-06,
|
|
"loss": 0.31618432998657225,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1135,
|
|
"token_acc": 0.8981076808629347,
|
|
"train_speed(iter/s)": 0.12158
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"grad_norm": 0.8769139647483826,
|
|
"learning_rate": 5.364290885507577e-06,
|
|
"loss": 0.3024888277053833,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1140,
|
|
"token_acc": 0.9077086992829787,
|
|
"train_speed(iter/s)": 0.121677
|
|
},
|
|
{
|
|
"epoch": 1.4290082320658566,
|
|
"eval_loss": 0.3490462005138397,
|
|
"eval_runtime": 29.6376,
|
|
"eval_samples_per_second": 17.377,
|
|
"eval_steps_per_second": 4.353,
|
|
"eval_token_acc": 0.8812515717998156,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.435280282242258,
|
|
"grad_norm": 0.8784323930740356,
|
|
"learning_rate": 5.3315224559801555e-06,
|
|
"loss": 0.30409352779388427,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1145,
|
|
"token_acc": 0.8937964910867968,
|
|
"train_speed(iter/s)": 0.12129
|
|
},
|
|
{
|
|
"epoch": 1.4415523324186594,
|
|
"grad_norm": 0.8287903070449829,
|
|
"learning_rate": 5.2987397180224795e-06,
|
|
"loss": 0.3141587972640991,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1150,
|
|
"token_acc": 0.9003217129898212,
|
|
"train_speed(iter/s)": 0.121389
|
|
},
|
|
{
|
|
"epoch": 1.4478243825950607,
|
|
"grad_norm": 0.8509834408760071,
|
|
"learning_rate": 5.265944086529714e-06,
|
|
"loss": 0.30988848209381104,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1155,
|
|
"token_acc": 0.896488090168356,
|
|
"train_speed(iter/s)": 0.12152
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"grad_norm": 0.8002197742462158,
|
|
"learning_rate": 5.233136976953504e-06,
|
|
"loss": 0.3149235725402832,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1160,
|
|
"token_acc": 0.9005892084869243,
|
|
"train_speed(iter/s)": 0.121672
|
|
},
|
|
{
|
|
"epoch": 1.454096432771462,
|
|
"eval_loss": 0.3489372730255127,
|
|
"eval_runtime": 29.6441,
|
|
"eval_samples_per_second": 17.373,
|
|
"eval_steps_per_second": 4.352,
|
|
"eval_token_acc": 0.8816916757481768,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.4603684829478636,
|
|
"grad_norm": 0.7909538149833679,
|
|
"learning_rate": 5.200319805240884e-06,
|
|
"loss": 0.3111138343811035,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1165,
|
|
"token_acc": 0.8955117952818873,
|
|
"train_speed(iter/s)": 0.1213
|
|
},
|
|
{
|
|
"epoch": 1.4666405331242651,
|
|
"grad_norm": 0.8317378163337708,
|
|
"learning_rate": 5.167493987773175e-06,
|
|
"loss": 0.31470346450805664,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1170,
|
|
"token_acc": 0.8882736915724443,
|
|
"train_speed(iter/s)": 0.121439
|
|
},
|
|
{
|
|
"epoch": 1.4729125833006664,
|
|
"grad_norm": 0.8626337647438049,
|
|
"learning_rate": 5.134660941304838e-06,
|
|
"loss": 0.3050379276275635,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1175,
|
|
"token_acc": 0.9051131601005867,
|
|
"train_speed(iter/s)": 0.121566
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"grad_norm": 0.8486148118972778,
|
|
"learning_rate": 5.10182208290234e-06,
|
|
"loss": 0.3151291608810425,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1180,
|
|
"token_acc": 0.9041969040589388,
|
|
"train_speed(iter/s)": 0.121677
|
|
},
|
|
{
|
|
"epoch": 1.4791846334770677,
|
|
"eval_loss": 0.3478534519672394,
|
|
"eval_runtime": 29.59,
|
|
"eval_samples_per_second": 17.405,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8822281834185598,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.4854566836534693,
|
|
"grad_norm": 0.7371572852134705,
|
|
"learning_rate": 5.068978829882992e-06,
|
|
"loss": 0.31096959114074707,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1185,
|
|
"token_acc": 0.8926875593542261,
|
|
"train_speed(iter/s)": 0.121251
|
|
},
|
|
{
|
|
"epoch": 1.4917287338298706,
|
|
"grad_norm": 0.804434597492218,
|
|
"learning_rate": 5.036132599753771e-06,
|
|
"loss": 0.32340445518493655,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1190,
|
|
"token_acc": 0.8985925658607001,
|
|
"train_speed(iter/s)": 0.121393
|
|
},
|
|
{
|
|
"epoch": 1.4980007840062721,
|
|
"grad_norm": 0.8279677033424377,
|
|
"learning_rate": 5.003284810150152e-06,
|
|
"loss": 0.305421781539917,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1195,
|
|
"token_acc": 0.8999231444883619,
|
|
"train_speed(iter/s)": 0.121518
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"grad_norm": 0.8119995594024658,
|
|
"learning_rate": 4.970436878774907e-06,
|
|
"loss": 0.32050256729125975,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1200,
|
|
"token_acc": 0.8954298771779492,
|
|
"train_speed(iter/s)": 0.121643
|
|
},
|
|
{
|
|
"epoch": 1.5042728341826734,
|
|
"eval_loss": 0.34642741084098816,
|
|
"eval_runtime": 29.5744,
|
|
"eval_samples_per_second": 17.414,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8823287786067566,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.5105448843590747,
|
|
"grad_norm": 0.8389795422554016,
|
|
"learning_rate": 4.937590223336936e-06,
|
|
"loss": 0.311386251449585,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1205,
|
|
"token_acc": 0.8925981666438637,
|
|
"train_speed(iter/s)": 0.121306
|
|
},
|
|
{
|
|
"epoch": 1.5168169345354763,
|
|
"grad_norm": 0.8051816821098328,
|
|
"learning_rate": 4.904746261490062e-06,
|
|
"loss": 0.3099170684814453,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1210,
|
|
"token_acc": 0.8959899446472166,
|
|
"train_speed(iter/s)": 0.121422
|
|
},
|
|
{
|
|
"epoch": 1.5230889847118778,
|
|
"grad_norm": 0.7875924110412598,
|
|
"learning_rate": 4.87190641077186e-06,
|
|
"loss": 0.30568342208862304,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1215,
|
|
"token_acc": 0.901078617984657,
|
|
"train_speed(iter/s)": 0.12152
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"grad_norm": 0.8027485609054565,
|
|
"learning_rate": 4.8390720885424665e-06,
|
|
"loss": 0.30870785713195803,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1220,
|
|
"token_acc": 0.9042371803028375,
|
|
"train_speed(iter/s)": 0.12164
|
|
},
|
|
{
|
|
"epoch": 1.5293610348882791,
|
|
"eval_loss": 0.34620022773742676,
|
|
"eval_runtime": 29.5969,
|
|
"eval_samples_per_second": 17.4,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8822030346215106,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.5356330850646804,
|
|
"grad_norm": 0.7887035608291626,
|
|
"learning_rate": 4.806244711923408e-06,
|
|
"loss": 0.31411142349243165,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1225,
|
|
"token_acc": 0.8938585311454517,
|
|
"train_speed(iter/s)": 0.121287
|
|
},
|
|
{
|
|
"epoch": 1.541905135241082,
|
|
"grad_norm": 0.8030281066894531,
|
|
"learning_rate": 4.773425697736445e-06,
|
|
"loss": 0.3045094728469849,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1230,
|
|
"token_acc": 0.9042907034758024,
|
|
"train_speed(iter/s)": 0.121403
|
|
},
|
|
{
|
|
"epoch": 1.5481771854174835,
|
|
"grad_norm": 0.829979658126831,
|
|
"learning_rate": 4.7406164624424135e-06,
|
|
"loss": 0.30724167823791504,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1235,
|
|
"token_acc": 0.8976526045138559,
|
|
"train_speed(iter/s)": 0.121507
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"grad_norm": 0.8427159190177917,
|
|
"learning_rate": 4.707818422080094e-06,
|
|
"loss": 0.3048734664916992,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1240,
|
|
"token_acc": 0.9048282007016762,
|
|
"train_speed(iter/s)": 0.12164
|
|
},
|
|
{
|
|
"epoch": 1.5544492355938848,
|
|
"eval_loss": 0.34649136662483215,
|
|
"eval_runtime": 29.5723,
|
|
"eval_samples_per_second": 17.415,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8821317796965379,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.5607212857702861,
|
|
"grad_norm": 0.8154223561286926,
|
|
"learning_rate": 4.675032992205099e-06,
|
|
"loss": 0.31010706424713136,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1245,
|
|
"token_acc": 0.8929249172868086,
|
|
"train_speed(iter/s)": 0.121305
|
|
},
|
|
{
|
|
"epoch": 1.5669933359466874,
|
|
"grad_norm": 0.8454766273498535,
|
|
"learning_rate": 4.642261587828778e-06,
|
|
"loss": 0.3093379735946655,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1250,
|
|
"token_acc": 0.8965885145768318,
|
|
"train_speed(iter/s)": 0.121428
|
|
},
|
|
{
|
|
"epoch": 1.573265386123089,
|
|
"grad_norm": 0.8195229768753052,
|
|
"learning_rate": 4.609505623357135e-06,
|
|
"loss": 0.29369077682495115,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1255,
|
|
"token_acc": 0.9066529450935287,
|
|
"train_speed(iter/s)": 0.121553
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"grad_norm": 0.8140600919723511,
|
|
"learning_rate": 4.576766512529799e-06,
|
|
"loss": 0.3222052574157715,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1260,
|
|
"token_acc": 0.8947929354445798,
|
|
"train_speed(iter/s)": 0.121668
|
|
},
|
|
{
|
|
"epoch": 1.5795374362994905,
|
|
"eval_loss": 0.3448590338230133,
|
|
"eval_runtime": 29.5768,
|
|
"eval_samples_per_second": 17.412,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8820186101098164,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.5858094864758918,
|
|
"grad_norm": 0.8057063221931458,
|
|
"learning_rate": 4.544045668358999e-06,
|
|
"loss": 0.3125570774078369,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1265,
|
|
"token_acc": 0.892243011525279,
|
|
"train_speed(iter/s)": 0.12132
|
|
},
|
|
{
|
|
"epoch": 1.5920815366522931,
|
|
"grad_norm": 0.813529372215271,
|
|
"learning_rate": 4.511344503068574e-06,
|
|
"loss": 0.3144998550415039,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1270,
|
|
"token_acc": 0.8892766751032369,
|
|
"train_speed(iter/s)": 0.121413
|
|
},
|
|
{
|
|
"epoch": 1.5983535868286947,
|
|
"grad_norm": 0.8066157698631287,
|
|
"learning_rate": 4.478664428033031e-06,
|
|
"loss": 0.30913031101226807,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1275,
|
|
"token_acc": 0.8998527245949927,
|
|
"train_speed(iter/s)": 0.121518
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"grad_norm": 0.7944806218147278,
|
|
"learning_rate": 4.446006853716628e-06,
|
|
"loss": 0.31250030994415284,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1280,
|
|
"token_acc": 0.9023781249048446,
|
|
"train_speed(iter/s)": 0.121625
|
|
},
|
|
{
|
|
"epoch": 1.6046256370050962,
|
|
"eval_loss": 0.34460246562957764,
|
|
"eval_runtime": 29.653,
|
|
"eval_samples_per_second": 17.368,
|
|
"eval_steps_per_second": 4.35,
|
|
"eval_token_acc": 0.8823539274038058,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.6108976871814975,
|
|
"grad_norm": 0.7362424731254578,
|
|
"learning_rate": 4.413373189612497e-06,
|
|
"loss": 0.3013019561767578,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1285,
|
|
"token_acc": 0.8930150309460654,
|
|
"train_speed(iter/s)": 0.121286
|
|
},
|
|
{
|
|
"epoch": 1.6171697373578988,
|
|
"grad_norm": 0.7636338472366333,
|
|
"learning_rate": 4.380764844181806e-06,
|
|
"loss": 0.30982949733734133,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1290,
|
|
"token_acc": 0.8940976689137035,
|
|
"train_speed(iter/s)": 0.12142
|
|
},
|
|
{
|
|
"epoch": 1.6234417875343001,
|
|
"grad_norm": 0.8665372729301453,
|
|
"learning_rate": 4.34818322479298e-06,
|
|
"loss": 0.3103508949279785,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1295,
|
|
"token_acc": 0.8969226252435476,
|
|
"train_speed(iter/s)": 0.121525
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"grad_norm": 0.8369340896606445,
|
|
"learning_rate": 4.315629737660956e-06,
|
|
"loss": 0.30051441192626954,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1300,
|
|
"token_acc": 0.9011551155115511,
|
|
"train_speed(iter/s)": 0.121614
|
|
},
|
|
{
|
|
"epoch": 1.6297138377107017,
|
|
"eval_loss": 0.34451213479042053,
|
|
"eval_runtime": 29.6214,
|
|
"eval_samples_per_second": 17.386,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8825928409757733,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.6359858878871032,
|
|
"grad_norm": 0.8098254799842834,
|
|
"learning_rate": 4.283105787786482e-06,
|
|
"loss": 0.2908606052398682,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1305,
|
|
"token_acc": 0.8952707220970546,
|
|
"train_speed(iter/s)": 0.121269
|
|
},
|
|
{
|
|
"epoch": 1.6422579380635045,
|
|
"grad_norm": 0.7809374332427979,
|
|
"learning_rate": 4.250612778895492e-06,
|
|
"loss": 0.31494917869567873,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1310,
|
|
"token_acc": 0.8949442815249267,
|
|
"train_speed(iter/s)": 0.121381
|
|
},
|
|
{
|
|
"epoch": 1.6485299882399058,
|
|
"grad_norm": 0.8695399165153503,
|
|
"learning_rate": 4.218152113378513e-06,
|
|
"loss": 0.3110328674316406,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1315,
|
|
"token_acc": 0.9081429854890964,
|
|
"train_speed(iter/s)": 0.121498
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"grad_norm": 0.7448384165763855,
|
|
"learning_rate": 4.185725192230136e-06,
|
|
"loss": 0.3125450849533081,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1320,
|
|
"token_acc": 0.8948348068869242,
|
|
"train_speed(iter/s)": 0.121599
|
|
},
|
|
{
|
|
"epoch": 1.6548020384163074,
|
|
"eval_loss": 0.34412243962287903,
|
|
"eval_runtime": 29.6115,
|
|
"eval_samples_per_second": 17.392,
|
|
"eval_steps_per_second": 4.356,
|
|
"eval_token_acc": 0.8831041998491073,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.6610740885927089,
|
|
"grad_norm": 0.8303554058074951,
|
|
"learning_rate": 4.1533334149885594e-06,
|
|
"loss": 0.30773005485534666,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1325,
|
|
"token_acc": 0.8920632614516095,
|
|
"train_speed(iter/s)": 0.121277
|
|
},
|
|
{
|
|
"epoch": 1.6673461387691102,
|
|
"grad_norm": 0.8903198838233948,
|
|
"learning_rate": 4.120978179675172e-06,
|
|
"loss": 0.3075371265411377,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1330,
|
|
"token_acc": 0.8935736892803082,
|
|
"train_speed(iter/s)": 0.121374
|
|
},
|
|
{
|
|
"epoch": 1.6736181889455115,
|
|
"grad_norm": 0.7909395694732666,
|
|
"learning_rate": 4.088660882734228e-06,
|
|
"loss": 0.3146337985992432,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1335,
|
|
"token_acc": 0.9018489263354982,
|
|
"train_speed(iter/s)": 0.121483
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"grad_norm": 0.789057195186615,
|
|
"learning_rate": 4.056382918972565e-06,
|
|
"loss": 0.30276224613189695,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1340,
|
|
"token_acc": 0.8988961986291943,
|
|
"train_speed(iter/s)": 0.121581
|
|
},
|
|
{
|
|
"epoch": 1.6798902391219128,
|
|
"eval_loss": 0.34376105666160583,
|
|
"eval_runtime": 29.7014,
|
|
"eval_samples_per_second": 17.339,
|
|
"eval_steps_per_second": 4.343,
|
|
"eval_token_acc": 0.8829281582697628,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.6861622892983144,
|
|
"grad_norm": 0.7871639132499695,
|
|
"learning_rate": 4.024145681499416e-06,
|
|
"loss": 0.2980226993560791,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1345,
|
|
"token_acc": 0.8948409478211474,
|
|
"train_speed(iter/s)": 0.121283
|
|
},
|
|
{
|
|
"epoch": 1.6924343394747159,
|
|
"grad_norm": 0.7809557318687439,
|
|
"learning_rate": 3.991950561666269e-06,
|
|
"loss": 0.29270751476287843,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1350,
|
|
"token_acc": 0.9090764878513075,
|
|
"train_speed(iter/s)": 0.121377
|
|
},
|
|
{
|
|
"epoch": 1.6987063896511172,
|
|
"grad_norm": 0.8027935028076172,
|
|
"learning_rate": 3.959798949006831e-06,
|
|
"loss": 0.29990406036376954,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1355,
|
|
"token_acc": 0.9105977304119384,
|
|
"train_speed(iter/s)": 0.121483
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"grad_norm": 0.7330254912376404,
|
|
"learning_rate": 3.927692231177053e-06,
|
|
"loss": 0.31405091285705566,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1360,
|
|
"token_acc": 0.9093888419489464,
|
|
"train_speed(iter/s)": 0.121592
|
|
},
|
|
{
|
|
"epoch": 1.7049784398275185,
|
|
"eval_loss": 0.34284746646881104,
|
|
"eval_runtime": 29.5951,
|
|
"eval_samples_per_second": 17.402,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.883364070751949,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.71125049000392,
|
|
"grad_norm": 0.7684708833694458,
|
|
"learning_rate": 3.895631793895223e-06,
|
|
"loss": 0.3107592582702637,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1365,
|
|
"token_acc": 0.8940206880820383,
|
|
"train_speed(iter/s)": 0.121282
|
|
},
|
|
{
|
|
"epoch": 1.7175225401803216,
|
|
"grad_norm": 0.8445452451705933,
|
|
"learning_rate": 3.863619020882184e-06,
|
|
"loss": 0.32090349197387696,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1370,
|
|
"token_acc": 0.9026219178795533,
|
|
"train_speed(iter/s)": 0.121395
|
|
},
|
|
{
|
|
"epoch": 1.7237945903567229,
|
|
"grad_norm": 0.8261458873748779,
|
|
"learning_rate": 3.831655293801596e-06,
|
|
"loss": 0.31141784191131594,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1375,
|
|
"token_acc": 0.9064525633470831,
|
|
"train_speed(iter/s)": 0.121509
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"grad_norm": 0.7714701890945435,
|
|
"learning_rate": 3.7997419922003077e-06,
|
|
"loss": 0.31881678104400635,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1380,
|
|
"token_acc": 0.8884819878472222,
|
|
"train_speed(iter/s)": 0.121611
|
|
},
|
|
{
|
|
"epoch": 1.7300666405331242,
|
|
"eval_loss": 0.3424231708049774,
|
|
"eval_runtime": 29.6736,
|
|
"eval_samples_per_second": 17.355,
|
|
"eval_steps_per_second": 4.347,
|
|
"eval_token_acc": 0.8831922206387794,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.7363386907095255,
|
|
"grad_norm": 0.7543424367904663,
|
|
"learning_rate": 3.7678804934488146e-06,
|
|
"loss": 0.31963376998901366,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1385,
|
|
"token_acc": 0.8950204631510191,
|
|
"train_speed(iter/s)": 0.121307
|
|
},
|
|
{
|
|
"epoch": 1.742610740885927,
|
|
"grad_norm": 0.7479931116104126,
|
|
"learning_rate": 3.736072172681818e-06,
|
|
"loss": 0.3031449794769287,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1390,
|
|
"token_acc": 0.9012068207358186,
|
|
"train_speed(iter/s)": 0.121432
|
|
},
|
|
{
|
|
"epoch": 1.7488827910623286,
|
|
"grad_norm": 0.7986470460891724,
|
|
"learning_rate": 3.704318402738867e-06,
|
|
"loss": 0.3046679973602295,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1395,
|
|
"token_acc": 0.9063746108046102,
|
|
"train_speed(iter/s)": 0.121546
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"grad_norm": 0.8084174990653992,
|
|
"learning_rate": 3.672620554105111e-06,
|
|
"loss": 0.2944044589996338,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1400,
|
|
"token_acc": 0.9039976484420928,
|
|
"train_speed(iter/s)": 0.12164
|
|
},
|
|
{
|
|
"epoch": 1.75515484123873,
|
|
"eval_loss": 0.34153878688812256,
|
|
"eval_runtime": 29.599,
|
|
"eval_samples_per_second": 17.399,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8834227512783972,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.7614268914151312,
|
|
"grad_norm": 0.8045998215675354,
|
|
"learning_rate": 3.6409799948521473e-06,
|
|
"loss": 0.3039552688598633,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1405,
|
|
"token_acc": 0.8954536924550851,
|
|
"train_speed(iter/s)": 0.121314
|
|
},
|
|
{
|
|
"epoch": 1.7676989415915327,
|
|
"grad_norm": 0.8704200983047485,
|
|
"learning_rate": 3.6093980905789824e-06,
|
|
"loss": 0.31539764404296877,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1410,
|
|
"token_acc": 0.8984341080783814,
|
|
"train_speed(iter/s)": 0.121425
|
|
},
|
|
{
|
|
"epoch": 1.7739709917679343,
|
|
"grad_norm": 0.8571969270706177,
|
|
"learning_rate": 3.577876204353079e-06,
|
|
"loss": 0.32133939266204836,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1415,
|
|
"token_acc": 0.8925621987755634,
|
|
"train_speed(iter/s)": 0.121539
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"grad_norm": 0.800134003162384,
|
|
"learning_rate": 3.5464156966515426e-06,
|
|
"loss": 0.2933182716369629,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1420,
|
|
"token_acc": 0.9109921166146066,
|
|
"train_speed(iter/s)": 0.121629
|
|
},
|
|
{
|
|
"epoch": 1.7802430419443356,
|
|
"eval_loss": 0.3414349853992462,
|
|
"eval_runtime": 29.7108,
|
|
"eval_samples_per_second": 17.334,
|
|
"eval_steps_per_second": 4.342,
|
|
"eval_token_acc": 0.8835065806018946,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.786515092120737,
|
|
"grad_norm": 0.8272744417190552,
|
|
"learning_rate": 3.515017925302396e-06,
|
|
"loss": 0.2981221675872803,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1425,
|
|
"token_acc": 0.894788244695019,
|
|
"train_speed(iter/s)": 0.121322
|
|
},
|
|
{
|
|
"epoch": 1.7927871422971384,
|
|
"grad_norm": 0.8108351826667786,
|
|
"learning_rate": 3.48368424542597e-06,
|
|
"loss": 0.32099928855896,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1430,
|
|
"token_acc": 0.8884463309687173,
|
|
"train_speed(iter/s)": 0.12142
|
|
},
|
|
{
|
|
"epoch": 1.7990591924735397,
|
|
"grad_norm": 0.8740379810333252,
|
|
"learning_rate": 3.4524160093764288e-06,
|
|
"loss": 0.28867833614349364,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1435,
|
|
"token_acc": 0.9071488053295753,
|
|
"train_speed(iter/s)": 0.121496
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"grad_norm": 0.7525773644447327,
|
|
"learning_rate": 3.421214566683395e-06,
|
|
"loss": 0.3096869230270386,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1440,
|
|
"token_acc": 0.8992541967025018,
|
|
"train_speed(iter/s)": 0.121595
|
|
},
|
|
{
|
|
"epoch": 1.8053312426499413,
|
|
"eval_loss": 0.3419208526611328,
|
|
"eval_runtime": 29.7092,
|
|
"eval_samples_per_second": 17.335,
|
|
"eval_steps_per_second": 4.342,
|
|
"eval_token_acc": 0.8835065806018946,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.8116032928263426,
|
|
"grad_norm": 0.7440472841262817,
|
|
"learning_rate": 3.390081263993702e-06,
|
|
"loss": 0.30480227470397947,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1445,
|
|
"token_acc": 0.8953731301707568,
|
|
"train_speed(iter/s)": 0.121312
|
|
},
|
|
{
|
|
"epoch": 1.817875343002744,
|
|
"grad_norm": 0.7330334186553955,
|
|
"learning_rate": 3.3590174450132828e-06,
|
|
"loss": 0.3220362186431885,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1450,
|
|
"token_acc": 0.8987148014440434,
|
|
"train_speed(iter/s)": 0.121421
|
|
},
|
|
{
|
|
"epoch": 1.8241473931791454,
|
|
"grad_norm": 0.9180962443351746,
|
|
"learning_rate": 3.3280244504491664e-06,
|
|
"loss": 0.3133381366729736,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1455,
|
|
"token_acc": 0.9044320549642395,
|
|
"train_speed(iter/s)": 0.121527
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"grad_norm": 0.8578007817268372,
|
|
"learning_rate": 3.297103617951618e-06,
|
|
"loss": 0.31406660079956056,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1460,
|
|
"token_acc": 0.9081761962692619,
|
|
"train_speed(iter/s)": 0.121624
|
|
},
|
|
{
|
|
"epoch": 1.830419443355547,
|
|
"eval_loss": 0.34057924151420593,
|
|
"eval_runtime": 29.6166,
|
|
"eval_samples_per_second": 17.389,
|
|
"eval_steps_per_second": 4.356,
|
|
"eval_token_acc": 0.8835023891357197,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.8366914935319483,
|
|
"grad_norm": 0.8259463310241699,
|
|
"learning_rate": 3.2662562820564043e-06,
|
|
"loss": 0.3026223659515381,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1465,
|
|
"token_acc": 0.8945344631088648,
|
|
"train_speed(iter/s)": 0.121317
|
|
},
|
|
{
|
|
"epoch": 1.8429635437083496,
|
|
"grad_norm": 0.848983108997345,
|
|
"learning_rate": 3.2354837741271994e-06,
|
|
"loss": 0.30731768608093263,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1470,
|
|
"token_acc": 0.8996287038609945,
|
|
"train_speed(iter/s)": 0.121417
|
|
},
|
|
{
|
|
"epoch": 1.8492355938847511,
|
|
"grad_norm": 0.7451069951057434,
|
|
"learning_rate": 3.2047874222981134e-06,
|
|
"loss": 0.3043700933456421,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1475,
|
|
"token_acc": 0.9003040283759818,
|
|
"train_speed(iter/s)": 0.121511
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"grad_norm": 0.8326135277748108,
|
|
"learning_rate": 3.174168551416384e-06,
|
|
"loss": 0.3095861434936523,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1480,
|
|
"token_acc": 0.9073784192512802,
|
|
"train_speed(iter/s)": 0.121591
|
|
},
|
|
{
|
|
"epoch": 1.8555076440611527,
|
|
"eval_loss": 0.3408661484718323,
|
|
"eval_runtime": 29.6597,
|
|
"eval_samples_per_second": 17.364,
|
|
"eval_steps_per_second": 4.349,
|
|
"eval_token_acc": 0.8834227512783972,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.861779694237554,
|
|
"grad_norm": 0.8090599775314331,
|
|
"learning_rate": 3.1436284829851883e-06,
|
|
"loss": 0.3018056392669678,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1485,
|
|
"token_acc": 0.8960189466561408,
|
|
"train_speed(iter/s)": 0.121294
|
|
},
|
|
{
|
|
"epoch": 1.8680517444139553,
|
|
"grad_norm": 0.8168737292289734,
|
|
"learning_rate": 3.113168535106604e-06,
|
|
"loss": 0.3135341167449951,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1490,
|
|
"token_acc": 0.8999463724814218,
|
|
"train_speed(iter/s)": 0.1214
|
|
},
|
|
{
|
|
"epoch": 1.8743237945903566,
|
|
"grad_norm": 0.8455703258514404,
|
|
"learning_rate": 3.08279002242473e-06,
|
|
"loss": 0.31365869045257566,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1495,
|
|
"token_acc": 0.8966787576513262,
|
|
"train_speed(iter/s)": 0.121495
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"grad_norm": 0.7874972820281982,
|
|
"learning_rate": 3.0524942560689387e-06,
|
|
"loss": 0.2999868392944336,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1500,
|
|
"token_acc": 0.9022604710944704,
|
|
"train_speed(iter/s)": 0.121584
|
|
},
|
|
{
|
|
"epoch": 1.8805958447667581,
|
|
"eval_loss": 0.34014302492141724,
|
|
"eval_runtime": 29.589,
|
|
"eval_samples_per_second": 17.405,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8837999832341353,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.8868678949431597,
|
|
"grad_norm": 0.7840932607650757,
|
|
"learning_rate": 3.0222825435972948e-06,
|
|
"loss": 0.30387725830078127,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1505,
|
|
"token_acc": 0.896817482063844,
|
|
"train_speed(iter/s)": 0.121288
|
|
},
|
|
{
|
|
"epoch": 1.893139945119561,
|
|
"grad_norm": 0.7794051766395569,
|
|
"learning_rate": 2.99215618894011e-06,
|
|
"loss": 0.2972860813140869,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1510,
|
|
"token_acc": 0.9043185860382392,
|
|
"train_speed(iter/s)": 0.121375
|
|
},
|
|
{
|
|
"epoch": 1.8994119952959623,
|
|
"grad_norm": 0.8160467147827148,
|
|
"learning_rate": 2.9621164923436774e-06,
|
|
"loss": 0.28430678844451907,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1515,
|
|
"token_acc": 0.9079036497141464,
|
|
"train_speed(iter/s)": 0.121466
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"grad_norm": 0.7849772572517395,
|
|
"learning_rate": 2.9321647503141525e-06,
|
|
"loss": 0.30244333744049073,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1520,
|
|
"token_acc": 0.9014301372723003,
|
|
"train_speed(iter/s)": 0.121546
|
|
},
|
|
{
|
|
"epoch": 1.9056840454723638,
|
|
"eval_loss": 0.34012025594711304,
|
|
"eval_runtime": 29.6307,
|
|
"eval_samples_per_second": 17.381,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8839005784223322,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.9119560956487653,
|
|
"grad_norm": 0.7242955565452576,
|
|
"learning_rate": 2.902302255561585e-06,
|
|
"loss": 0.31361520290374756,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1525,
|
|
"token_acc": 0.8939074979877407,
|
|
"train_speed(iter/s)": 0.121264
|
|
},
|
|
{
|
|
"epoch": 1.9182281458251667,
|
|
"grad_norm": 0.7752643823623657,
|
|
"learning_rate": 2.87253029694414e-06,
|
|
"loss": 0.3019782304763794,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1530,
|
|
"token_acc": 0.8990245948922335,
|
|
"train_speed(iter/s)": 0.121338
|
|
},
|
|
{
|
|
"epoch": 1.924500196001568,
|
|
"grad_norm": 0.9198828935623169,
|
|
"learning_rate": 2.8428501594124602e-06,
|
|
"loss": 0.2983893871307373,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1535,
|
|
"token_acc": 0.9083565129904717,
|
|
"train_speed(iter/s)": 0.121439
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"grad_norm": 0.808824896812439,
|
|
"learning_rate": 2.813263123954214e-06,
|
|
"loss": 0.30156128406524657,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1540,
|
|
"token_acc": 0.9017935200148356,
|
|
"train_speed(iter/s)": 0.12152
|
|
},
|
|
{
|
|
"epoch": 1.9307722461779693,
|
|
"eval_loss": 0.3396177589893341,
|
|
"eval_runtime": 29.634,
|
|
"eval_samples_per_second": 17.379,
|
|
"eval_steps_per_second": 4.353,
|
|
"eval_token_acc": 0.8845460642132618,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.9370442963543708,
|
|
"grad_norm": 0.7807343006134033,
|
|
"learning_rate": 2.7837704675388045e-06,
|
|
"loss": 0.2953279972076416,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1545,
|
|
"token_acc": 0.8962438270065346,
|
|
"train_speed(iter/s)": 0.121237
|
|
},
|
|
{
|
|
"epoch": 1.9433163465307723,
|
|
"grad_norm": 0.7839388251304626,
|
|
"learning_rate": 2.7543734630622622e-06,
|
|
"loss": 0.3038333415985107,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1550,
|
|
"token_acc": 0.9053210378601294,
|
|
"train_speed(iter/s)": 0.121329
|
|
},
|
|
{
|
|
"epoch": 1.9495883967071737,
|
|
"grad_norm": 0.7545835375785828,
|
|
"learning_rate": 2.7250733792922997e-06,
|
|
"loss": 0.29517788887023927,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1555,
|
|
"token_acc": 0.90642349335544,
|
|
"train_speed(iter/s)": 0.1214
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"grad_norm": 0.7737696170806885,
|
|
"learning_rate": 2.6958714808135546e-06,
|
|
"loss": 0.295018744468689,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1560,
|
|
"token_acc": 0.9136314827175901,
|
|
"train_speed(iter/s)": 0.121508
|
|
},
|
|
{
|
|
"epoch": 1.955860446883575,
|
|
"eval_loss": 0.33957967162132263,
|
|
"eval_runtime": 29.6912,
|
|
"eval_samples_per_second": 17.345,
|
|
"eval_steps_per_second": 4.345,
|
|
"eval_token_acc": 0.8840179394752284,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.9621324970599765,
|
|
"grad_norm": 0.7262492179870605,
|
|
"learning_rate": 2.6667690279730096e-06,
|
|
"loss": 0.30216593742370607,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1565,
|
|
"token_acc": 0.8968756421421372,
|
|
"train_speed(iter/s)": 0.121231
|
|
},
|
|
{
|
|
"epoch": 1.968404547236378,
|
|
"grad_norm": 0.7777485847473145,
|
|
"learning_rate": 2.6377672768256003e-06,
|
|
"loss": 0.2954871654510498,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1570,
|
|
"token_acc": 0.9041269349045146,
|
|
"train_speed(iter/s)": 0.121291
|
|
},
|
|
{
|
|
"epoch": 1.9746765974127793,
|
|
"grad_norm": 0.8558617234230042,
|
|
"learning_rate": 2.608867479080001e-06,
|
|
"loss": 0.2946753025054932,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1575,
|
|
"token_acc": 0.8997012032625373,
|
|
"train_speed(iter/s)": 0.121379
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"grad_norm": 0.7917863726615906,
|
|
"learning_rate": 2.5800708820446002e-06,
|
|
"loss": 0.3050684928894043,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1580,
|
|
"token_acc": 0.9030535237431789,
|
|
"train_speed(iter/s)": 0.121479
|
|
},
|
|
{
|
|
"epoch": 1.9809486475891807,
|
|
"eval_loss": 0.33898818492889404,
|
|
"eval_runtime": 29.6993,
|
|
"eval_samples_per_second": 17.34,
|
|
"eval_steps_per_second": 4.344,
|
|
"eval_token_acc": 0.8843700226339174,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.987220697765582,
|
|
"grad_norm": 0.8041856288909912,
|
|
"learning_rate": 2.551378728573668e-06,
|
|
"loss": 0.2989157438278198,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1585,
|
|
"token_acc": 0.8997509598422746,
|
|
"train_speed(iter/s)": 0.12119
|
|
},
|
|
{
|
|
"epoch": 1.9934927479419835,
|
|
"grad_norm": 0.8586787581443787,
|
|
"learning_rate": 2.5227922570137143e-06,
|
|
"loss": 0.30647430419921873,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1590,
|
|
"token_acc": 0.8982128790862545,
|
|
"train_speed(iter/s)": 0.121292
|
|
},
|
|
{
|
|
"epoch": 1.999764798118385,
|
|
"grad_norm": 0.7968061566352844,
|
|
"learning_rate": 2.4943127011500483e-06,
|
|
"loss": 0.31006150245666503,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1595,
|
|
"token_acc": 0.9085869784317678,
|
|
"train_speed(iter/s)": 0.121394
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"grad_norm": 0.7677069306373596,
|
|
"learning_rate": 2.465941290153514e-06,
|
|
"loss": 0.2801233768463135,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1600,
|
|
"token_acc": 0.9198317869301115,
|
|
"train_speed(iter/s)": 0.121535
|
|
},
|
|
{
|
|
"epoch": 2.005017640141121,
|
|
"eval_loss": 0.3399621546268463,
|
|
"eval_runtime": 29.7379,
|
|
"eval_samples_per_second": 17.318,
|
|
"eval_steps_per_second": 4.338,
|
|
"eval_token_acc": 0.8842107469192724,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 2.0112896903175224,
|
|
"grad_norm": 0.805497944355011,
|
|
"learning_rate": 2.4376792485274577e-06,
|
|
"loss": 0.2575787782669067,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1605,
|
|
"token_acc": 0.9027501640006056,
|
|
"train_speed(iter/s)": 0.121259
|
|
},
|
|
{
|
|
"epoch": 2.017561740493924,
|
|
"grad_norm": 0.7615222930908203,
|
|
"learning_rate": 2.409527796054863e-06,
|
|
"loss": 0.25977578163146975,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1610,
|
|
"token_acc": 0.9162343277129269,
|
|
"train_speed(iter/s)": 0.121356
|
|
},
|
|
{
|
|
"epoch": 2.0238337906703254,
|
|
"grad_norm": 0.8287392854690552,
|
|
"learning_rate": 2.38148814774572e-06,
|
|
"loss": 0.24034299850463867,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1615,
|
|
"token_acc": 0.9182042343338152,
|
|
"train_speed(iter/s)": 0.121426
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"grad_norm": 0.8738728761672974,
|
|
"learning_rate": 2.353561513784566e-06,
|
|
"loss": 0.2571218252182007,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1620,
|
|
"token_acc": 0.9214568880079287,
|
|
"train_speed(iter/s)": 0.121521
|
|
},
|
|
{
|
|
"epoch": 2.0301058408467267,
|
|
"eval_loss": 0.3534950017929077,
|
|
"eval_runtime": 29.6483,
|
|
"eval_samples_per_second": 17.37,
|
|
"eval_steps_per_second": 4.351,
|
|
"eval_token_acc": 0.883016179059435,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 2.036377891023128,
|
|
"grad_norm": 0.8269490599632263,
|
|
"learning_rate": 2.325749099478277e-06,
|
|
"loss": 0.2555586814880371,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1625,
|
|
"token_acc": 0.9017842820882989,
|
|
"train_speed(iter/s)": 0.121252
|
|
},
|
|
{
|
|
"epoch": 2.04264994119953,
|
|
"grad_norm": 0.802456259727478,
|
|
"learning_rate": 2.29805210520403e-06,
|
|
"loss": 0.25757761001586915,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1630,
|
|
"token_acc": 0.9153420162034663,
|
|
"train_speed(iter/s)": 0.121337
|
|
},
|
|
{
|
|
"epoch": 2.048921991375931,
|
|
"grad_norm": 0.8305613398551941,
|
|
"learning_rate": 2.270471726357501e-06,
|
|
"loss": 0.24194817543029784,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1635,
|
|
"token_acc": 0.9273302172119879,
|
|
"train_speed(iter/s)": 0.121409
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"grad_norm": 0.8397504687309265,
|
|
"learning_rate": 2.243009153301276e-06,
|
|
"loss": 0.25193355083465574,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1640,
|
|
"token_acc": 0.918960244648318,
|
|
"train_speed(iter/s)": 0.121491
|
|
},
|
|
{
|
|
"epoch": 2.0551940415523324,
|
|
"eval_loss": 0.3526792824268341,
|
|
"eval_runtime": 29.596,
|
|
"eval_samples_per_second": 17.401,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8830622851873585,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 2.0614660917287337,
|
|
"grad_norm": 0.7059065103530884,
|
|
"learning_rate": 2.215665571313468e-06,
|
|
"loss": 0.25691723823547363,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1645,
|
|
"token_acc": 0.9024566051703649,
|
|
"train_speed(iter/s)": 0.121247
|
|
},
|
|
{
|
|
"epoch": 2.067738141905135,
|
|
"grad_norm": 0.8032014966011047,
|
|
"learning_rate": 2.188442160536562e-06,
|
|
"loss": 0.25560142993927004,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1650,
|
|
"token_acc": 0.9098803496167228,
|
|
"train_speed(iter/s)": 0.121319
|
|
},
|
|
{
|
|
"epoch": 2.074010192081537,
|
|
"grad_norm": 0.7990818619728088,
|
|
"learning_rate": 2.1613400959264845e-06,
|
|
"loss": 0.24714956283569336,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1655,
|
|
"token_acc": 0.9181002989040186,
|
|
"train_speed(iter/s)": 0.121389
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"grad_norm": 0.8864375948905945,
|
|
"learning_rate": 2.1343605472018954e-06,
|
|
"loss": 0.2497623920440674,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1660,
|
|
"token_acc": 0.9190709535476774,
|
|
"train_speed(iter/s)": 0.121476
|
|
},
|
|
{
|
|
"epoch": 2.080282242257938,
|
|
"eval_loss": 0.3539391756057739,
|
|
"eval_runtime": 29.6226,
|
|
"eval_samples_per_second": 17.385,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8831544974432056,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 2.0865542924343394,
|
|
"grad_norm": 0.8491289019584656,
|
|
"learning_rate": 2.1075046787936842e-06,
|
|
"loss": 0.26420676708221436,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1665,
|
|
"token_acc": 0.8965886626349976,
|
|
"train_speed(iter/s)": 0.121214
|
|
},
|
|
{
|
|
"epoch": 2.0928263426107407,
|
|
"grad_norm": 0.6870825886726379,
|
|
"learning_rate": 2.0807736497947436e-06,
|
|
"loss": 0.25780699253082273,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1670,
|
|
"token_acc": 0.9150844173816772,
|
|
"train_speed(iter/s)": 0.121287
|
|
},
|
|
{
|
|
"epoch": 2.0990983927871425,
|
|
"grad_norm": 0.7348774075508118,
|
|
"learning_rate": 2.0541686139099164e-06,
|
|
"loss": 0.24700713157653809,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1675,
|
|
"token_acc": 0.9142644537189816,
|
|
"train_speed(iter/s)": 0.121368
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"grad_norm": 0.789790153503418,
|
|
"learning_rate": 2.0276907194062167e-06,
|
|
"loss": 0.25561089515686036,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1680,
|
|
"token_acc": 0.9232616718411341,
|
|
"train_speed(iter/s)": 0.121454
|
|
},
|
|
{
|
|
"epoch": 2.105370442963544,
|
|
"eval_loss": 0.3535526394844055,
|
|
"eval_runtime": 29.6041,
|
|
"eval_samples_per_second": 17.396,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8829072009388884,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 2.111642493139945,
|
|
"grad_norm": 0.825501024723053,
|
|
"learning_rate": 2.0013411090632638e-06,
|
|
"loss": 0.2573189973831177,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1685,
|
|
"token_acc": 0.8988425094093394,
|
|
"train_speed(iter/s)": 0.1212
|
|
},
|
|
{
|
|
"epoch": 2.1179145433163464,
|
|
"grad_norm": 0.7919580340385437,
|
|
"learning_rate": 1.9751209201239696e-06,
|
|
"loss": 0.24840922355651857,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1690,
|
|
"token_acc": 0.9239367219917013,
|
|
"train_speed(iter/s)": 0.121296
|
|
},
|
|
{
|
|
"epoch": 2.1241865934927477,
|
|
"grad_norm": 0.7574432492256165,
|
|
"learning_rate": 1.9490312842454425e-06,
|
|
"loss": 0.24667706489562988,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1695,
|
|
"token_acc": 0.9196868524061709,
|
|
"train_speed(iter/s)": 0.12138
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"grad_norm": 0.7842095494270325,
|
|
"learning_rate": 1.9230733274501525e-06,
|
|
"loss": 0.25333414077758787,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1700,
|
|
"token_acc": 0.9171972811047527,
|
|
"train_speed(iter/s)": 0.12145
|
|
},
|
|
{
|
|
"epoch": 2.1304586436691495,
|
|
"eval_loss": 0.3539462685585022,
|
|
"eval_runtime": 29.5964,
|
|
"eval_samples_per_second": 17.401,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8834395171430967,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 2.136730693845551,
|
|
"grad_norm": 0.8125796914100647,
|
|
"learning_rate": 1.8972481700773388e-06,
|
|
"loss": 0.25501580238342286,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1705,
|
|
"token_acc": 0.9002077731605964,
|
|
"train_speed(iter/s)": 0.1212
|
|
},
|
|
{
|
|
"epoch": 2.143002744021952,
|
|
"grad_norm": 0.8007329702377319,
|
|
"learning_rate": 1.8715569267346368e-06,
|
|
"loss": 0.25573315620422366,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1710,
|
|
"token_acc": 0.9234607218683651,
|
|
"train_speed(iter/s)": 0.121277
|
|
},
|
|
{
|
|
"epoch": 2.1492747941983534,
|
|
"grad_norm": 0.7575182318687439,
|
|
"learning_rate": 1.846000706249997e-06,
|
|
"loss": 0.2531334400177002,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1715,
|
|
"token_acc": 0.9233944058674252,
|
|
"train_speed(iter/s)": 0.121348
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"grad_norm": 0.9734401702880859,
|
|
"learning_rate": 1.8205806116238055e-06,
|
|
"loss": 0.248917818069458,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1720,
|
|
"token_acc": 0.9278663414080671,
|
|
"train_speed(iter/s)": 0.121411
|
|
},
|
|
{
|
|
"epoch": 2.155546844374755,
|
|
"eval_loss": 0.3528047800064087,
|
|
"eval_runtime": 29.5743,
|
|
"eval_samples_per_second": 17.414,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8836658563165395,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 2.1618188945511565,
|
|
"grad_norm": 0.7814000844955444,
|
|
"learning_rate": 1.7952977399812988e-06,
|
|
"loss": 0.25339622497558595,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1725,
|
|
"token_acc": 0.9013212593534945,
|
|
"train_speed(iter/s)": 0.121154
|
|
},
|
|
{
|
|
"epoch": 2.168090944727558,
|
|
"grad_norm": 0.7855998277664185,
|
|
"learning_rate": 1.7701531825251888e-06,
|
|
"loss": 0.256337571144104,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1730,
|
|
"token_acc": 0.9186892875503964,
|
|
"train_speed(iter/s)": 0.121253
|
|
},
|
|
{
|
|
"epoch": 2.174362994903959,
|
|
"grad_norm": 0.8425953388214111,
|
|
"learning_rate": 1.7451480244885938e-06,
|
|
"loss": 0.25911998748779297,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1735,
|
|
"token_acc": 0.9183445487671889,
|
|
"train_speed(iter/s)": 0.121337
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"grad_norm": 0.8361583948135376,
|
|
"learning_rate": 1.720283345088178e-06,
|
|
"loss": 0.259613561630249,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1740,
|
|
"token_acc": 0.9204680395053146,
|
|
"train_speed(iter/s)": 0.121423
|
|
},
|
|
{
|
|
"epoch": 2.1806350450803604,
|
|
"eval_loss": 0.353371798992157,
|
|
"eval_runtime": 29.6124,
|
|
"eval_samples_per_second": 17.391,
|
|
"eval_steps_per_second": 4.356,
|
|
"eval_token_acc": 0.883364070751949,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 2.186907095256762,
|
|
"grad_norm": 0.8060917258262634,
|
|
"learning_rate": 1.695560217477582e-06,
|
|
"loss": 0.24777050018310548,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1745,
|
|
"token_acc": 0.9022448606112365,
|
|
"train_speed(iter/s)": 0.121162
|
|
},
|
|
{
|
|
"epoch": 2.1931791454331635,
|
|
"grad_norm": 0.7034837603569031,
|
|
"learning_rate": 1.6709797087011066e-06,
|
|
"loss": 0.24974467754364013,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1750,
|
|
"token_acc": 0.9183000028447075,
|
|
"train_speed(iter/s)": 0.121239
|
|
},
|
|
{
|
|
"epoch": 2.199451195609565,
|
|
"grad_norm": 0.7496914267539978,
|
|
"learning_rate": 1.6465428796476584e-06,
|
|
"loss": 0.24403119087219238,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1755,
|
|
"token_acc": 0.9174810328140123,
|
|
"train_speed(iter/s)": 0.121305
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"grad_norm": 0.9179002046585083,
|
|
"learning_rate": 1.6222507850049602e-06,
|
|
"loss": 0.2549721717834473,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1760,
|
|
"token_acc": 0.9200168800112534,
|
|
"train_speed(iter/s)": 0.121396
|
|
},
|
|
{
|
|
"epoch": 2.205723245785966,
|
|
"eval_loss": 0.3538911044597626,
|
|
"eval_runtime": 29.6392,
|
|
"eval_samples_per_second": 17.376,
|
|
"eval_steps_per_second": 4.352,
|
|
"eval_token_acc": 0.8833808366166485,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.211995295962368,
|
|
"grad_norm": 0.794003963470459,
|
|
"learning_rate": 1.598104473214031e-06,
|
|
"loss": 0.25570311546325686,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1765,
|
|
"token_acc": 0.8993348159524498,
|
|
"train_speed(iter/s)": 0.121141
|
|
},
|
|
{
|
|
"epoch": 2.218267346138769,
|
|
"grad_norm": 0.7644505500793457,
|
|
"learning_rate": 1.5741049864239383e-06,
|
|
"loss": 0.24341793060302735,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1770,
|
|
"token_acc": 0.9168759872594971,
|
|
"train_speed(iter/s)": 0.12122
|
|
},
|
|
{
|
|
"epoch": 2.2245393963151705,
|
|
"grad_norm": 0.8791477680206299,
|
|
"learning_rate": 1.550253360446815e-06,
|
|
"loss": 0.2522608757019043,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1775,
|
|
"token_acc": 0.9258004677796596,
|
|
"train_speed(iter/s)": 0.121309
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"grad_norm": 0.8298718333244324,
|
|
"learning_rate": 1.5265506247131617e-06,
|
|
"loss": 0.2546123266220093,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1780,
|
|
"token_acc": 0.9118973044798785,
|
|
"train_speed(iter/s)": 0.121394
|
|
},
|
|
{
|
|
"epoch": 2.230811446491572,
|
|
"eval_loss": 0.35324588418006897,
|
|
"eval_runtime": 29.6796,
|
|
"eval_samples_per_second": 17.352,
|
|
"eval_steps_per_second": 4.346,
|
|
"eval_token_acc": 0.8832844328946265,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.2370834966679736,
|
|
"grad_norm": 0.816360592842102,
|
|
"learning_rate": 1.5029978022274067e-06,
|
|
"loss": 0.2614459991455078,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1785,
|
|
"token_acc": 0.9004699229406026,
|
|
"train_speed(iter/s)": 0.121137
|
|
},
|
|
{
|
|
"epoch": 2.243355546844375,
|
|
"grad_norm": 0.8427248597145081,
|
|
"learning_rate": 1.47959590952376e-06,
|
|
"loss": 0.25159344673156736,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1790,
|
|
"token_acc": 0.9224594190787307,
|
|
"train_speed(iter/s)": 0.121218
|
|
},
|
|
{
|
|
"epoch": 2.249627597020776,
|
|
"grad_norm": 0.7986406683921814,
|
|
"learning_rate": 1.4563459566223358e-06,
|
|
"loss": 0.24192914962768555,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1795,
|
|
"token_acc": 0.9266775696302905,
|
|
"train_speed(iter/s)": 0.121301
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"grad_norm": 0.7850765585899353,
|
|
"learning_rate": 1.4332489469855698e-06,
|
|
"loss": 0.24650468826293945,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1800,
|
|
"token_acc": 0.9252590346221885,
|
|
"train_speed(iter/s)": 0.121371
|
|
},
|
|
{
|
|
"epoch": 2.2558996471971775,
|
|
"eval_loss": 0.35429683327674866,
|
|
"eval_runtime": 29.682,
|
|
"eval_samples_per_second": 17.351,
|
|
"eval_steps_per_second": 4.346,
|
|
"eval_token_acc": 0.8836155587224411,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.262171697373579,
|
|
"grad_norm": 0.7986750602722168,
|
|
"learning_rate": 1.4103058774748923e-06,
|
|
"loss": 0.2553676128387451,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1805,
|
|
"token_acc": 0.8993731709045985,
|
|
"train_speed(iter/s)": 0.121131
|
|
},
|
|
{
|
|
"epoch": 2.2684437475499806,
|
|
"grad_norm": 0.7992218732833862,
|
|
"learning_rate": 1.3875177383077233e-06,
|
|
"loss": 0.2504106521606445,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1810,
|
|
"token_acc": 0.9165525002446423,
|
|
"train_speed(iter/s)": 0.121197
|
|
},
|
|
{
|
|
"epoch": 2.274715797726382,
|
|
"grad_norm": 0.7687636017799377,
|
|
"learning_rate": 1.3648855130147216e-06,
|
|
"loss": 0.2536652088165283,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1815,
|
|
"token_acc": 0.9203490718321227,
|
|
"train_speed(iter/s)": 0.121289
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"grad_norm": 0.8168686628341675,
|
|
"learning_rate": 1.3424101783973403e-06,
|
|
"loss": 0.24730236530303956,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1820,
|
|
"token_acc": 0.9207464126371638,
|
|
"train_speed(iter/s)": 0.121365
|
|
},
|
|
{
|
|
"epoch": 2.280987847902783,
|
|
"eval_loss": 0.35338255763053894,
|
|
"eval_runtime": 29.6568,
|
|
"eval_samples_per_second": 17.365,
|
|
"eval_steps_per_second": 4.35,
|
|
"eval_token_acc": 0.8838293234973594,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.2872598980791845,
|
|
"grad_norm": 0.8269901871681213,
|
|
"learning_rate": 1.3200927044856714e-06,
|
|
"loss": 0.2572518825531006,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1825,
|
|
"token_acc": 0.9020789813985874,
|
|
"train_speed(iter/s)": 0.121125
|
|
},
|
|
{
|
|
"epoch": 2.293531948255586,
|
|
"grad_norm": 0.8186900019645691,
|
|
"learning_rate": 1.2979340544965745e-06,
|
|
"loss": 0.2439603328704834,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1830,
|
|
"token_acc": 0.9237335485473057,
|
|
"train_speed(iter/s)": 0.121206
|
|
},
|
|
{
|
|
"epoch": 2.2998039984319876,
|
|
"grad_norm": 0.7761522531509399,
|
|
"learning_rate": 1.2759351847921053e-06,
|
|
"loss": 0.2594336748123169,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1835,
|
|
"token_acc": 0.9104045438093382,
|
|
"train_speed(iter/s)": 0.121282
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"grad_norm": 0.7789030075073242,
|
|
"learning_rate": 1.25409704483824e-06,
|
|
"loss": 0.25881831645965575,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1840,
|
|
"token_acc": 0.9176125295446872,
|
|
"train_speed(iter/s)": 0.121357
|
|
},
|
|
{
|
|
"epoch": 2.306076048608389,
|
|
"eval_loss": 0.3527611494064331,
|
|
"eval_runtime": 29.6253,
|
|
"eval_samples_per_second": 17.384,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8836407075194903,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.31234809878479,
|
|
"grad_norm": 0.8218780755996704,
|
|
"learning_rate": 1.232420577163902e-06,
|
|
"loss": 0.25008678436279297,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1845,
|
|
"token_acc": 0.9028911896854854,
|
|
"train_speed(iter/s)": 0.121116
|
|
},
|
|
{
|
|
"epoch": 2.3186201489611915,
|
|
"grad_norm": 0.7720741629600525,
|
|
"learning_rate": 1.2109067173202731e-06,
|
|
"loss": 0.2578773021697998,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1850,
|
|
"token_acc": 0.9163940481215703,
|
|
"train_speed(iter/s)": 0.121195
|
|
},
|
|
{
|
|
"epoch": 2.3248921991375933,
|
|
"grad_norm": 0.9268919229507446,
|
|
"learning_rate": 1.1895563938404203e-06,
|
|
"loss": 0.25402810573577883,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1855,
|
|
"token_acc": 0.9201602408716287,
|
|
"train_speed(iter/s)": 0.121273
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"grad_norm": 0.8763542771339417,
|
|
"learning_rate": 1.1683705281992202e-06,
|
|
"loss": 0.25608100891113283,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1860,
|
|
"token_acc": 0.9185731132075472,
|
|
"train_speed(iter/s)": 0.121349
|
|
},
|
|
{
|
|
"epoch": 2.3311642493139946,
|
|
"eval_loss": 0.35326558351516724,
|
|
"eval_runtime": 29.6773,
|
|
"eval_samples_per_second": 17.353,
|
|
"eval_steps_per_second": 4.347,
|
|
"eval_token_acc": 0.883716153910638,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.337436299490396,
|
|
"grad_norm": 0.777973473072052,
|
|
"learning_rate": 1.1473500347735927e-06,
|
|
"loss": 0.26839523315429686,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1865,
|
|
"token_acc": 0.9000454114673827,
|
|
"train_speed(iter/s)": 0.121142
|
|
},
|
|
{
|
|
"epoch": 2.343708349666797,
|
|
"grad_norm": 0.8818052411079407,
|
|
"learning_rate": 1.1264958208030224e-06,
|
|
"loss": 0.25702362060546874,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1870,
|
|
"token_acc": 0.9185548071034905,
|
|
"train_speed(iter/s)": 0.121226
|
|
},
|
|
{
|
|
"epoch": 2.349980399843199,
|
|
"grad_norm": 0.8378356099128723,
|
|
"learning_rate": 1.105808786350423e-06,
|
|
"loss": 0.26531405448913575,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1875,
|
|
"token_acc": 0.9187210090165217,
|
|
"train_speed(iter/s)": 0.121307
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"grad_norm": 0.8721848130226135,
|
|
"learning_rate": 1.085289824263273e-06,
|
|
"loss": 0.2487030506134033,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1880,
|
|
"token_acc": 0.9182288299935358,
|
|
"train_speed(iter/s)": 0.121392
|
|
},
|
|
{
|
|
"epoch": 2.3562524500196003,
|
|
"eval_loss": 0.3520548939704895,
|
|
"eval_runtime": 29.802,
|
|
"eval_samples_per_second": 17.281,
|
|
"eval_steps_per_second": 4.329,
|
|
"eval_token_acc": 0.8838251320311845,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.3625245001960016,
|
|
"grad_norm": 0.8557198643684387,
|
|
"learning_rate": 1.0649398201350907e-06,
|
|
"loss": 0.2521216869354248,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1885,
|
|
"token_acc": 0.901105316416284,
|
|
"train_speed(iter/s)": 0.121149
|
|
},
|
|
{
|
|
"epoch": 2.368796550372403,
|
|
"grad_norm": 0.8094416856765747,
|
|
"learning_rate": 1.044759652267207e-06,
|
|
"loss": 0.2516944408416748,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1890,
|
|
"token_acc": 0.9250567064872222,
|
|
"train_speed(iter/s)": 0.12121
|
|
},
|
|
{
|
|
"epoch": 2.375068600548804,
|
|
"grad_norm": 0.7657244205474854,
|
|
"learning_rate": 1.024750191630864e-06,
|
|
"loss": 0.26051204204559325,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1895,
|
|
"token_acc": 0.919248217757615,
|
|
"train_speed(iter/s)": 0.121289
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"grad_norm": 0.7817269563674927,
|
|
"learning_rate": 1.0049123018296158e-06,
|
|
"loss": 0.253748345375061,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1900,
|
|
"token_acc": 0.9162441497659907,
|
|
"train_speed(iter/s)": 0.121364
|
|
},
|
|
{
|
|
"epoch": 2.381340650725206,
|
|
"eval_loss": 0.3525380492210388,
|
|
"eval_runtime": 29.5922,
|
|
"eval_samples_per_second": 17.403,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8839634504149552,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.3876127009016073,
|
|
"grad_norm": 0.7803794145584106,
|
|
"learning_rate": 9.852468390620624e-07,
|
|
"loss": 0.2489931583404541,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1905,
|
|
"token_acc": 0.903682999338265,
|
|
"train_speed(iter/s)": 0.121132
|
|
},
|
|
{
|
|
"epoch": 2.3938847510780086,
|
|
"grad_norm": 0.7914659976959229,
|
|
"learning_rate": 9.65754652084896e-07,
|
|
"loss": 0.2515411853790283,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1910,
|
|
"token_acc": 0.9195726949858777,
|
|
"train_speed(iter/s)": 0.121205
|
|
},
|
|
{
|
|
"epoch": 2.40015680125441,
|
|
"grad_norm": 0.7828955054283142,
|
|
"learning_rate": 9.464365821762611e-07,
|
|
"loss": 0.25966334342956543,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1915,
|
|
"token_acc": 0.9172705476684356,
|
|
"train_speed(iter/s)": 0.121274
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"grad_norm": 0.8471463918685913,
|
|
"learning_rate": 9.272934630994579e-07,
|
|
"loss": 0.26224753856658933,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1920,
|
|
"token_acc": 0.9160864785635764,
|
|
"train_speed(iter/s)": 0.121357
|
|
},
|
|
{
|
|
"epoch": 2.406428851430811,
|
|
"eval_loss": 0.35224801301956177,
|
|
"eval_runtime": 29.5629,
|
|
"eval_samples_per_second": 17.42,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.8840053650767038,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.412700901607213,
|
|
"grad_norm": 0.8474516272544861,
|
|
"learning_rate": 9.083261210669458e-07,
|
|
"loss": 0.24689688682556152,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1925,
|
|
"token_acc": 0.9013123514853926,
|
|
"train_speed(iter/s)": 0.121128
|
|
},
|
|
{
|
|
"epoch": 2.4189729517836143,
|
|
"grad_norm": 0.8423922061920166,
|
|
"learning_rate": 8.895353747046903e-07,
|
|
"loss": 0.2583484649658203,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1930,
|
|
"token_acc": 0.9094146095182419,
|
|
"train_speed(iter/s)": 0.121212
|
|
},
|
|
{
|
|
"epoch": 2.4252450019600156,
|
|
"grad_norm": 0.8378574848175049,
|
|
"learning_rate": 8.70922035016829e-07,
|
|
"loss": 0.25626001358032224,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1935,
|
|
"token_acc": 0.9183032429679099,
|
|
"train_speed(iter/s)": 0.121293
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"grad_norm": 0.8666955828666687,
|
|
"learning_rate": 8.524869053506718e-07,
|
|
"loss": 0.25580859184265137,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1940,
|
|
"token_acc": 0.9219803161348047,
|
|
"train_speed(iter/s)": 0.121373
|
|
},
|
|
{
|
|
"epoch": 2.4315170521364173,
|
|
"eval_loss": 0.35265490412712097,
|
|
"eval_runtime": 29.5852,
|
|
"eval_samples_per_second": 17.407,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.883653281918015,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.4377891023128186,
|
|
"grad_norm": 0.7587525248527527,
|
|
"learning_rate": 8.342307813620254e-07,
|
|
"loss": 0.25416107177734376,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1945,
|
|
"token_acc": 0.9025201951373665,
|
|
"train_speed(iter/s)": 0.121167
|
|
},
|
|
{
|
|
"epoch": 2.44406115248922,
|
|
"grad_norm": 0.7803459763526917,
|
|
"learning_rate": 8.161544509808522e-07,
|
|
"loss": 0.25071403980255125,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1950,
|
|
"token_acc": 0.9182450116792841,
|
|
"train_speed(iter/s)": 0.121237
|
|
},
|
|
{
|
|
"epoch": 2.4503332026656213,
|
|
"grad_norm": 0.8933445811271667,
|
|
"learning_rate": 7.982586943772663e-07,
|
|
"loss": 0.2513444900512695,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1955,
|
|
"token_acc": 0.9193114259516697,
|
|
"train_speed(iter/s)": 0.121306
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"grad_norm": 0.8411695957183838,
|
|
"learning_rate": 7.805442839278643e-07,
|
|
"loss": 0.25985763072967527,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1960,
|
|
"token_acc": 0.9140053688790151,
|
|
"train_speed(iter/s)": 0.121373
|
|
},
|
|
{
|
|
"epoch": 2.4566052528420226,
|
|
"eval_loss": 0.3529517650604248,
|
|
"eval_runtime": 29.6072,
|
|
"eval_samples_per_second": 17.394,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8836448989856652,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.4628773030184243,
|
|
"grad_norm": 0.7706183791160583,
|
|
"learning_rate": 7.630119841823808e-07,
|
|
"loss": 0.24757421016693115,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1965,
|
|
"token_acc": 0.9010213793035783,
|
|
"train_speed(iter/s)": 0.12116
|
|
},
|
|
{
|
|
"epoch": 2.4691493531948256,
|
|
"grad_norm": 0.7962974905967712,
|
|
"learning_rate": 7.456625518306976e-07,
|
|
"loss": 0.25676703453063965,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1970,
|
|
"token_acc": 0.9233753637245393,
|
|
"train_speed(iter/s)": 0.121238
|
|
},
|
|
{
|
|
"epoch": 2.475421403371227,
|
|
"grad_norm": 0.7932366132736206,
|
|
"learning_rate": 7.284967356701839e-07,
|
|
"loss": 0.25052144527435305,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1975,
|
|
"token_acc": 0.9187351328989555,
|
|
"train_speed(iter/s)": 0.121285
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"grad_norm": 0.8126112222671509,
|
|
"learning_rate": 7.115152765733768e-07,
|
|
"loss": 0.2548501968383789,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1980,
|
|
"token_acc": 0.9198236175051306,
|
|
"train_speed(iter/s)": 0.121368
|
|
},
|
|
{
|
|
"epoch": 2.4816934535476283,
|
|
"eval_loss": 0.352863073348999,
|
|
"eval_runtime": 29.594,
|
|
"eval_samples_per_second": 17.402,
|
|
"eval_steps_per_second": 4.359,
|
|
"eval_token_acc": 0.8838125576326599,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.4879655037240296,
|
|
"grad_norm": 0.8036893606185913,
|
|
"learning_rate": 6.94718907456009e-07,
|
|
"loss": 0.26411118507385256,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1985,
|
|
"token_acc": 0.9022120803784831,
|
|
"train_speed(iter/s)": 0.121147
|
|
},
|
|
{
|
|
"epoch": 2.4942375539004313,
|
|
"grad_norm": 0.8398657441139221,
|
|
"learning_rate": 6.781083532453702e-07,
|
|
"loss": 0.24961705207824708,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1990,
|
|
"token_acc": 0.9132801067953688,
|
|
"train_speed(iter/s)": 0.121219
|
|
},
|
|
{
|
|
"epoch": 2.5005096040768326,
|
|
"grad_norm": 0.840071976184845,
|
|
"learning_rate": 6.61684330849025e-07,
|
|
"loss": 0.25878229141235354,
|
|
"memory(GiB)": 33.6,
|
|
"step": 1995,
|
|
"token_acc": 0.9152943072831314,
|
|
"train_speed(iter/s)": 0.121292
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"grad_norm": 0.7767475247383118,
|
|
"learning_rate": 6.454475491238682e-07,
|
|
"loss": 0.2700009346008301,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2000,
|
|
"token_acc": 0.9162267615331651,
|
|
"train_speed(iter/s)": 0.121371
|
|
},
|
|
{
|
|
"epoch": 2.506781654253234,
|
|
"eval_loss": 0.3521389663219452,
|
|
"eval_runtime": 29.5782,
|
|
"eval_samples_per_second": 17.411,
|
|
"eval_steps_per_second": 4.361,
|
|
"eval_token_acc": 0.883841897895884,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.5130537044296353,
|
|
"grad_norm": 0.7912172675132751,
|
|
"learning_rate": 6.293987088455355e-07,
|
|
"loss": 0.25088133811950686,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2005,
|
|
"token_acc": 0.9012983750990204,
|
|
"train_speed(iter/s)": 0.121148
|
|
},
|
|
{
|
|
"epoch": 2.5193257546060366,
|
|
"grad_norm": 0.8571103811264038,
|
|
"learning_rate": 6.135385026781476e-07,
|
|
"loss": 0.25229463577270506,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2010,
|
|
"token_acc": 0.9172969454855578,
|
|
"train_speed(iter/s)": 0.121222
|
|
},
|
|
{
|
|
"epoch": 2.5255978047824383,
|
|
"grad_norm": 0.823409914970398,
|
|
"learning_rate": 5.978676151444285e-07,
|
|
"loss": 0.2578453540802002,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2015,
|
|
"token_acc": 0.921449302499766,
|
|
"train_speed(iter/s)": 0.121294
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"grad_norm": 0.737509548664093,
|
|
"learning_rate": 5.823867225961516e-07,
|
|
"loss": 0.26110265254974363,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2020,
|
|
"token_acc": 0.9162979877265591,
|
|
"train_speed(iter/s)": 0.121359
|
|
},
|
|
{
|
|
"epoch": 2.5318698549588396,
|
|
"eval_loss": 0.35181859135627747,
|
|
"eval_runtime": 29.6024,
|
|
"eval_samples_per_second": 17.397,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8837580685723866,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.538141905135241,
|
|
"grad_norm": 0.9168305397033691,
|
|
"learning_rate": 5.670964931849521e-07,
|
|
"loss": 0.2577165365219116,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2025,
|
|
"token_acc": 0.8999222873752308,
|
|
"train_speed(iter/s)": 0.121158
|
|
},
|
|
{
|
|
"epoch": 2.5444139553116427,
|
|
"grad_norm": 0.8547112345695496,
|
|
"learning_rate": 5.519975868334914e-07,
|
|
"loss": 0.23379290103912354,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2030,
|
|
"token_acc": 0.9258048932965816,
|
|
"train_speed(iter/s)": 0.121231
|
|
},
|
|
{
|
|
"epoch": 2.550686005488044,
|
|
"grad_norm": 0.855364203453064,
|
|
"learning_rate": 5.370906552069721e-07,
|
|
"loss": 0.26704959869384765,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2035,
|
|
"token_acc": 0.9188056010965678,
|
|
"train_speed(iter/s)": 0.121316
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"grad_norm": 0.8302978873252869,
|
|
"learning_rate": 5.22376341685013e-07,
|
|
"loss": 0.25648543834686277,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2040,
|
|
"token_acc": 0.923139132403843,
|
|
"train_speed(iter/s)": 0.121378
|
|
},
|
|
{
|
|
"epoch": 2.5569580556644453,
|
|
"eval_loss": 0.35207509994506836,
|
|
"eval_runtime": 29.5872,
|
|
"eval_samples_per_second": 17.406,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8838083661664851,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.5632301058408467,
|
|
"grad_norm": 0.7551521062850952,
|
|
"learning_rate": 5.07855281333881e-07,
|
|
"loss": 0.25759091377258303,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2045,
|
|
"token_acc": 0.8994057400542669,
|
|
"train_speed(iter/s)": 0.121167
|
|
},
|
|
{
|
|
"epoch": 2.569502156017248,
|
|
"grad_norm": 0.7979341745376587,
|
|
"learning_rate": 4.935281008790843e-07,
|
|
"loss": 0.2503528594970703,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2050,
|
|
"token_acc": 0.9201623815967523,
|
|
"train_speed(iter/s)": 0.121241
|
|
},
|
|
{
|
|
"epoch": 2.5757742061936497,
|
|
"grad_norm": 0.7783675193786621,
|
|
"learning_rate": 4.793954186783195e-07,
|
|
"loss": 0.26355133056640623,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2055,
|
|
"token_acc": 0.9151320361362058,
|
|
"train_speed(iter/s)": 0.121321
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"grad_norm": 0.8197569847106934,
|
|
"learning_rate": 4.6545784469478386e-07,
|
|
"loss": 0.24934740066528321,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2060,
|
|
"token_acc": 0.9148257180318136,
|
|
"train_speed(iter/s)": 0.121394
|
|
},
|
|
{
|
|
"epoch": 2.582046256370051,
|
|
"eval_loss": 0.35226860642433167,
|
|
"eval_runtime": 29.6044,
|
|
"eval_samples_per_second": 17.396,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.883938301617906,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.5883183065464523,
|
|
"grad_norm": 0.7964560389518738,
|
|
"learning_rate": 4.5171598047085153e-07,
|
|
"loss": 0.26240172386169436,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2065,
|
|
"token_acc": 0.9007787850191121,
|
|
"train_speed(iter/s)": 0.12119
|
|
},
|
|
{
|
|
"epoch": 2.5945903567228537,
|
|
"grad_norm": 0.8210738301277161,
|
|
"learning_rate": 4.381704191021119e-07,
|
|
"loss": 0.26297893524169924,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2070,
|
|
"token_acc": 0.9183645562696567,
|
|
"train_speed(iter/s)": 0.121265
|
|
},
|
|
{
|
|
"epoch": 2.600862406899255,
|
|
"grad_norm": 0.8720937371253967,
|
|
"learning_rate": 4.248217452117653e-07,
|
|
"loss": 0.25324339866638185,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2075,
|
|
"token_acc": 0.9215979250655465,
|
|
"train_speed(iter/s)": 0.121329
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"grad_norm": 0.8678158521652222,
|
|
"learning_rate": 4.1167053492540023e-07,
|
|
"loss": 0.26408936977386477,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2080,
|
|
"token_acc": 0.9133987390087365,
|
|
"train_speed(iter/s)": 0.121395
|
|
},
|
|
{
|
|
"epoch": 2.6071344570756567,
|
|
"eval_loss": 0.3519818186759949,
|
|
"eval_runtime": 29.602,
|
|
"eval_samples_per_second": 17.397,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8838083661664851,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.613406507252058,
|
|
"grad_norm": 0.8411008715629578,
|
|
"learning_rate": 3.987173558461199e-07,
|
|
"loss": 0.25940699577331544,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2085,
|
|
"token_acc": 0.8995980691159541,
|
|
"train_speed(iter/s)": 0.121193
|
|
},
|
|
{
|
|
"epoch": 2.6196785574284593,
|
|
"grad_norm": 0.8319599032402039,
|
|
"learning_rate": 3.8596276703004974e-07,
|
|
"loss": 0.25266613960266116,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2090,
|
|
"token_acc": 0.9181341472959224,
|
|
"train_speed(iter/s)": 0.121258
|
|
},
|
|
{
|
|
"epoch": 2.625950607604861,
|
|
"grad_norm": 0.7988993525505066,
|
|
"learning_rate": 3.7340731896220393e-07,
|
|
"loss": 0.25626089572906496,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2095,
|
|
"token_acc": 0.9169616764475279,
|
|
"train_speed(iter/s)": 0.121326
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"grad_norm": 0.9199852347373962,
|
|
"learning_rate": 3.6105155353273305e-07,
|
|
"loss": 0.25374295711517336,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2100,
|
|
"token_acc": 0.9187081757346524,
|
|
"train_speed(iter/s)": 0.121375
|
|
},
|
|
{
|
|
"epoch": 2.6322226577812624,
|
|
"eval_loss": 0.35189294815063477,
|
|
"eval_runtime": 29.6062,
|
|
"eval_samples_per_second": 17.395,
|
|
"eval_steps_per_second": 4.357,
|
|
"eval_token_acc": 0.8839299186855563,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.6384947079576637,
|
|
"grad_norm": 0.8447644114494324,
|
|
"learning_rate": 3.488960040135303e-07,
|
|
"loss": 0.24541032314300537,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2105,
|
|
"token_acc": 0.9017500930900579,
|
|
"train_speed(iter/s)": 0.12117
|
|
},
|
|
{
|
|
"epoch": 2.644766758134065,
|
|
"grad_norm": 0.7984169125556946,
|
|
"learning_rate": 3.369411950352175e-07,
|
|
"loss": 0.23687467575073243,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2110,
|
|
"token_acc": 0.9224370308107012,
|
|
"train_speed(iter/s)": 0.121237
|
|
},
|
|
{
|
|
"epoch": 2.6510388083104663,
|
|
"grad_norm": 0.7522621154785156,
|
|
"learning_rate": 3.251876425645051e-07,
|
|
"loss": 0.2515209197998047,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2115,
|
|
"token_acc": 0.9186267773138115,
|
|
"train_speed(iter/s)": 0.121299
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"grad_norm": 0.9188127517700195,
|
|
"learning_rate": 3.136358538819162e-07,
|
|
"loss": 0.2667581081390381,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2120,
|
|
"token_acc": 0.9170892494929006,
|
|
"train_speed(iter/s)": 0.121378
|
|
},
|
|
{
|
|
"epoch": 2.657310858486868,
|
|
"eval_loss": 0.3520144820213318,
|
|
"eval_runtime": 29.6961,
|
|
"eval_samples_per_second": 17.342,
|
|
"eval_steps_per_second": 4.344,
|
|
"eval_token_acc": 0.8841101517310755,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.6635829086632694,
|
|
"grad_norm": 0.8419914841651917,
|
|
"learning_rate": 3.0228632755990197e-07,
|
|
"loss": 0.252849817276001,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2125,
|
|
"token_acc": 0.9014873358640861,
|
|
"train_speed(iter/s)": 0.121179
|
|
},
|
|
{
|
|
"epoch": 2.6698549588396707,
|
|
"grad_norm": 0.8139102458953857,
|
|
"learning_rate": 2.911395534413147e-07,
|
|
"loss": 0.25453083515167235,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2130,
|
|
"token_acc": 0.9150135319885048,
|
|
"train_speed(iter/s)": 0.121246
|
|
},
|
|
{
|
|
"epoch": 2.676127009016072,
|
|
"grad_norm": 0.8062320351600647,
|
|
"learning_rate": 2.8019601261827123e-07,
|
|
"loss": 0.24791936874389647,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2135,
|
|
"token_acc": 0.9251222952492422,
|
|
"train_speed(iter/s)": 0.121311
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"grad_norm": 0.8251351714134216,
|
|
"learning_rate": 2.694561774113863e-07,
|
|
"loss": 0.2614239931106567,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2140,
|
|
"token_acc": 0.9208586832863431,
|
|
"train_speed(iter/s)": 0.121385
|
|
},
|
|
{
|
|
"epoch": 2.6823990591924733,
|
|
"eval_loss": 0.3521276116371155,
|
|
"eval_runtime": 29.7027,
|
|
"eval_samples_per_second": 17.339,
|
|
"eval_steps_per_second": 4.343,
|
|
"eval_token_acc": 0.8841185346634253,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.688671109368875,
|
|
"grad_norm": 0.9006824493408203,
|
|
"learning_rate": 2.5892051134939256e-07,
|
|
"loss": 0.24248223304748534,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2145,
|
|
"token_acc": 0.9014429392662344,
|
|
"train_speed(iter/s)": 0.12118
|
|
},
|
|
{
|
|
"epoch": 2.6949431595452764,
|
|
"grad_norm": 0.9092527627944946,
|
|
"learning_rate": 2.485894691491253e-07,
|
|
"loss": 0.2548917293548584,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2150,
|
|
"token_acc": 0.9183499893684882,
|
|
"train_speed(iter/s)": 0.12125
|
|
},
|
|
{
|
|
"epoch": 2.7012152097216777,
|
|
"grad_norm": 0.8024119734764099,
|
|
"learning_rate": 2.384634966959076e-07,
|
|
"loss": 0.252050518989563,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2155,
|
|
"token_acc": 0.9186300234774203,
|
|
"train_speed(iter/s)": 0.121319
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"grad_norm": 0.857513964176178,
|
|
"learning_rate": 2.2854303102429808e-07,
|
|
"loss": 0.24875540733337403,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2160,
|
|
"token_acc": 0.9268544278078209,
|
|
"train_speed(iter/s)": 0.121379
|
|
},
|
|
{
|
|
"epoch": 2.707487259898079,
|
|
"eval_loss": 0.35207265615463257,
|
|
"eval_runtime": 29.6934,
|
|
"eval_samples_per_second": 17.344,
|
|
"eval_steps_per_second": 4.344,
|
|
"eval_token_acc": 0.8840137480090535,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.7137593100744803,
|
|
"grad_norm": 0.8570399284362793,
|
|
"learning_rate": 2.1882850029923463e-07,
|
|
"loss": 0.2508984088897705,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2165,
|
|
"token_acc": 0.9019046776207754,
|
|
"train_speed(iter/s)": 0.121193
|
|
},
|
|
{
|
|
"epoch": 2.720031360250882,
|
|
"grad_norm": 0.8015756607055664,
|
|
"learning_rate": 2.093203237975483e-07,
|
|
"loss": 0.2542969465255737,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2170,
|
|
"token_acc": 0.9189912422964645,
|
|
"train_speed(iter/s)": 0.121256
|
|
},
|
|
{
|
|
"epoch": 2.7263034104272834,
|
|
"grad_norm": 0.7729219794273376,
|
|
"learning_rate": 2.0001891188987265e-07,
|
|
"loss": 0.247190523147583,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2175,
|
|
"token_acc": 0.9275373459399903,
|
|
"train_speed(iter/s)": 0.121308
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"grad_norm": 0.7974966764450073,
|
|
"learning_rate": 1.9092466602293247e-07,
|
|
"loss": 0.25701584815979006,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2180,
|
|
"token_acc": 0.9192098092643052,
|
|
"train_speed(iter/s)": 0.121375
|
|
},
|
|
{
|
|
"epoch": 2.7325754606036847,
|
|
"eval_loss": 0.35181429982185364,
|
|
"eval_runtime": 29.5889,
|
|
"eval_samples_per_second": 17.405,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8839634504149552,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.7388475107800865,
|
|
"grad_norm": 0.859959065914154,
|
|
"learning_rate": 1.8203797870221197e-07,
|
|
"loss": 0.24590330123901366,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2185,
|
|
"token_acc": 0.8985490934392011,
|
|
"train_speed(iter/s)": 0.121162
|
|
},
|
|
{
|
|
"epoch": 2.745119560956488,
|
|
"grad_norm": 0.7980506420135498,
|
|
"learning_rate": 1.7335923347502003e-07,
|
|
"loss": 0.2525080442428589,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2190,
|
|
"token_acc": 0.9187770535847555,
|
|
"train_speed(iter/s)": 0.121227
|
|
},
|
|
{
|
|
"epoch": 2.751391611132889,
|
|
"grad_norm": 0.9112074971199036,
|
|
"learning_rate": 1.6488880491393467e-07,
|
|
"loss": 0.2503790855407715,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2195,
|
|
"token_acc": 0.9191731649956678,
|
|
"train_speed(iter/s)": 0.121291
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"grad_norm": 0.8652266263961792,
|
|
"learning_rate": 1.5662705860063465e-07,
|
|
"loss": 0.2473994016647339,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2200,
|
|
"token_acc": 0.9257844886063864,
|
|
"train_speed(iter/s)": 0.121351
|
|
},
|
|
{
|
|
"epoch": 2.7576636613092904,
|
|
"eval_loss": 0.3517080843448639,
|
|
"eval_runtime": 29.6123,
|
|
"eval_samples_per_second": 17.391,
|
|
"eval_steps_per_second": 4.356,
|
|
"eval_token_acc": 0.8841855981222232,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.7639357114856917,
|
|
"grad_norm": 0.7372477054595947,
|
|
"learning_rate": 1.485743511101234e-07,
|
|
"loss": 0.2570472717285156,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2205,
|
|
"token_acc": 0.9023636113783152,
|
|
"train_speed(iter/s)": 0.121155
|
|
},
|
|
{
|
|
"epoch": 2.7702077616620935,
|
|
"grad_norm": 0.8067111968994141,
|
|
"learning_rate": 1.4073102999534017e-07,
|
|
"loss": 0.25244066715240476,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2210,
|
|
"token_acc": 0.9208304949204751,
|
|
"train_speed(iter/s)": 0.121222
|
|
},
|
|
{
|
|
"epoch": 2.776479811838495,
|
|
"grad_norm": 0.840241551399231,
|
|
"learning_rate": 1.3309743377215468e-07,
|
|
"loss": 0.24581263065338135,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2215,
|
|
"token_acc": 0.9247633420222253,
|
|
"train_speed(iter/s)": 0.121277
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"grad_norm": 0.8023516535758972,
|
|
"learning_rate": 1.2567389190476287e-07,
|
|
"loss": 0.26343064308166503,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2220,
|
|
"token_acc": 0.9156072196662401,
|
|
"train_speed(iter/s)": 0.121348
|
|
},
|
|
{
|
|
"epoch": 2.782751862014896,
|
|
"eval_loss": 0.3517560660839081,
|
|
"eval_runtime": 29.6264,
|
|
"eval_samples_per_second": 17.383,
|
|
"eval_steps_per_second": 4.354,
|
|
"eval_token_acc": 0.8841227261296002,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.7890239121912974,
|
|
"grad_norm": 0.8527393937110901,
|
|
"learning_rate": 1.1846072479146431e-07,
|
|
"loss": 0.23856933116912843,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2225,
|
|
"token_acc": 0.9016640429579982,
|
|
"train_speed(iter/s)": 0.121138
|
|
},
|
|
{
|
|
"epoch": 2.7952959623676987,
|
|
"grad_norm": 0.796775221824646,
|
|
"learning_rate": 1.114582437508327e-07,
|
|
"loss": 0.2579585790634155,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2230,
|
|
"token_acc": 0.9188932252576804,
|
|
"train_speed(iter/s)": 0.121209
|
|
},
|
|
{
|
|
"epoch": 2.8015680125441005,
|
|
"grad_norm": 0.8311977982521057,
|
|
"learning_rate": 1.0466675100828383e-07,
|
|
"loss": 0.2482445240020752,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2235,
|
|
"token_acc": 0.9148045991260421,
|
|
"train_speed(iter/s)": 0.121265
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"grad_norm": 0.7720575928688049,
|
|
"learning_rate": 9.808653968302607e-08,
|
|
"loss": 0.2451555013656616,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2240,
|
|
"token_acc": 0.9182748574416694,
|
|
"train_speed(iter/s)": 0.121331
|
|
},
|
|
{
|
|
"epoch": 2.807840062720502,
|
|
"eval_loss": 0.35162004828453064,
|
|
"eval_runtime": 29.7076,
|
|
"eval_samples_per_second": 17.336,
|
|
"eval_steps_per_second": 4.342,
|
|
"eval_token_acc": 0.88396764188113,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.814112112896903,
|
|
"grad_norm": 0.7808555364608765,
|
|
"learning_rate": 9.17178937754143e-08,
|
|
"loss": 0.25096561908721926,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2245,
|
|
"token_acc": 0.9010792691911006,
|
|
"train_speed(iter/s)": 0.121135
|
|
},
|
|
{
|
|
"epoch": 2.820384163073305,
|
|
"grad_norm": 0.7035224437713623,
|
|
"learning_rate": 8.556108815468756e-08,
|
|
"loss": 0.24024505615234376,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2250,
|
|
"token_acc": 0.9196050096339113,
|
|
"train_speed(iter/s)": 0.121197
|
|
},
|
|
{
|
|
"epoch": 2.8266562132497057,
|
|
"grad_norm": 0.8584672808647156,
|
|
"learning_rate": 7.961638854711296e-08,
|
|
"loss": 0.2527903079986572,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2255,
|
|
"token_acc": 0.9181211708645337,
|
|
"train_speed(iter/s)": 0.121251
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"grad_norm": 0.8620744943618774,
|
|
"learning_rate": 7.388405152450706e-08,
|
|
"loss": 0.25180099010467527,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2260,
|
|
"token_acc": 0.9159815615644157,
|
|
"train_speed(iter/s)": 0.121319
|
|
},
|
|
{
|
|
"epoch": 2.8329282634261075,
|
|
"eval_loss": 0.35167694091796875,
|
|
"eval_runtime": 29.7193,
|
|
"eval_samples_per_second": 17.329,
|
|
"eval_steps_per_second": 4.341,
|
|
"eval_token_acc": 0.8842610445133707,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.839200313602509,
|
|
"grad_norm": 0.8185881972312927,
|
|
"learning_rate": 6.836432449317255e-08,
|
|
"loss": 0.2555046081542969,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2265,
|
|
"token_acc": 0.9007627158298629,
|
|
"train_speed(iter/s)": 0.121123
|
|
},
|
|
{
|
|
"epoch": 2.84547236377891,
|
|
"grad_norm": 0.8594284653663635,
|
|
"learning_rate": 6.305744568321281e-08,
|
|
"loss": 0.243331241607666,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2270,
|
|
"token_acc": 0.919937106918239,
|
|
"train_speed(iter/s)": 0.121187
|
|
},
|
|
{
|
|
"epoch": 2.851744413955312,
|
|
"grad_norm": 0.7847645282745361,
|
|
"learning_rate": 5.7963644138254175e-08,
|
|
"loss": 0.25553407669067385,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2275,
|
|
"token_acc": 0.9147093923774218,
|
|
"train_speed(iter/s)": 0.12125
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"grad_norm": 0.7746605277061462,
|
|
"learning_rate": 5.308313970555812e-08,
|
|
"loss": 0.256377649307251,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2280,
|
|
"token_acc": 0.9144332047137345,
|
|
"train_speed(iter/s)": 0.121321
|
|
},
|
|
{
|
|
"epoch": 2.858016464131713,
|
|
"eval_loss": 0.3516838848590851,
|
|
"eval_runtime": 29.6227,
|
|
"eval_samples_per_second": 17.385,
|
|
"eval_steps_per_second": 4.355,
|
|
"eval_token_acc": 0.8841688322575236,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.8642885143081145,
|
|
"grad_norm": 0.8503162860870361,
|
|
"learning_rate": 4.841614302653341e-08,
|
|
"loss": 0.24309797286987306,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2285,
|
|
"token_acc": 0.901774899142082,
|
|
"train_speed(iter/s)": 0.12112
|
|
},
|
|
{
|
|
"epoch": 2.870560564484516,
|
|
"grad_norm": 0.8309412002563477,
|
|
"learning_rate": 4.396285552764557e-08,
|
|
"loss": 0.25277886390686033,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2290,
|
|
"token_acc": 0.918742246397557,
|
|
"train_speed(iter/s)": 0.121195
|
|
},
|
|
{
|
|
"epoch": 2.876832614660917,
|
|
"grad_norm": 0.8796764612197876,
|
|
"learning_rate": 3.9723469411723226e-08,
|
|
"loss": 0.2501843929290771,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2295,
|
|
"token_acc": 0.9242152837104197,
|
|
"train_speed(iter/s)": 0.12126
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"grad_norm": 0.764290452003479,
|
|
"learning_rate": 3.5698167649660384e-08,
|
|
"loss": 0.23828110694885254,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2300,
|
|
"token_acc": 0.9215037112190955,
|
|
"train_speed(iter/s)": 0.121321
|
|
},
|
|
{
|
|
"epoch": 2.883104664837319,
|
|
"eval_loss": 0.3518492877483368,
|
|
"eval_runtime": 29.5991,
|
|
"eval_samples_per_second": 17.399,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8841688322575236,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.88937671501372,
|
|
"grad_norm": 0.893683671951294,
|
|
"learning_rate": 3.188712397252325e-08,
|
|
"loss": 0.25220484733581544,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2305,
|
|
"token_acc": 0.900192213902516,
|
|
"train_speed(iter/s)": 0.121135
|
|
},
|
|
{
|
|
"epoch": 2.8956487651901215,
|
|
"grad_norm": 0.7913616299629211,
|
|
"learning_rate": 2.8290502864049553e-08,
|
|
"loss": 0.23975701332092286,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2310,
|
|
"token_acc": 0.9262284488329413,
|
|
"train_speed(iter/s)": 0.121196
|
|
},
|
|
{
|
|
"epoch": 2.901920815366523,
|
|
"grad_norm": 0.8090708255767822,
|
|
"learning_rate": 2.4908459553549257e-08,
|
|
"loss": 0.25728065967559816,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2315,
|
|
"token_acc": 0.921334886001471,
|
|
"train_speed(iter/s)": 0.121264
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"grad_norm": 0.8814035058021545,
|
|
"learning_rate": 2.174114000920713e-08,
|
|
"loss": 0.25480012893676757,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2320,
|
|
"token_acc": 0.9189424911420006,
|
|
"train_speed(iter/s)": 0.121328
|
|
},
|
|
{
|
|
"epoch": 2.908192865542924,
|
|
"eval_loss": 0.351841539144516,
|
|
"eval_runtime": 29.651,
|
|
"eval_samples_per_second": 17.369,
|
|
"eval_steps_per_second": 4.351,
|
|
"eval_token_acc": 0.8842400871824965,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.914464915719326,
|
|
"grad_norm": 0.798668622970581,
|
|
"learning_rate": 1.878868093177999e-08,
|
|
"loss": 0.24860291481018065,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2325,
|
|
"token_acc": 0.9021842640764242,
|
|
"train_speed(iter/s)": 0.121153
|
|
},
|
|
{
|
|
"epoch": 2.920736965895727,
|
|
"grad_norm": 0.8112826347351074,
|
|
"learning_rate": 1.6051209748698116e-08,
|
|
"loss": 0.25038626194000246,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2330,
|
|
"token_acc": 0.9164307381193124,
|
|
"train_speed(iter/s)": 0.121208
|
|
},
|
|
{
|
|
"epoch": 2.9270090160721285,
|
|
"grad_norm": 0.8192731738090515,
|
|
"learning_rate": 1.3528844608566848e-08,
|
|
"loss": 0.253769063949585,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2335,
|
|
"token_acc": 0.9176401557582546,
|
|
"train_speed(iter/s)": 0.121264
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"grad_norm": 0.8607764840126038,
|
|
"learning_rate": 1.1221694376064018e-08,
|
|
"loss": 0.26034162044525144,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2340,
|
|
"token_acc": 0.9166121154136664,
|
|
"train_speed(iter/s)": 0.12133
|
|
},
|
|
{
|
|
"epoch": 2.9332810662485302,
|
|
"eval_loss": 0.3517746925354004,
|
|
"eval_runtime": 29.5861,
|
|
"eval_samples_per_second": 17.407,
|
|
"eval_steps_per_second": 4.36,
|
|
"eval_token_acc": 0.8842400871824965,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.9395531164249316,
|
|
"grad_norm": 0.8527613878250122,
|
|
"learning_rate": 9.129858627244802e-09,
|
|
"loss": 0.2517427921295166,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2345,
|
|
"token_acc": 0.9014389514175809,
|
|
"train_speed(iter/s)": 0.121151
|
|
},
|
|
{
|
|
"epoch": 2.945825166601333,
|
|
"grad_norm": 0.9257199764251709,
|
|
"learning_rate": 7.25342764524184e-09,
|
|
"loss": 0.24849720001220704,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2350,
|
|
"token_acc": 0.9203306319162112,
|
|
"train_speed(iter/s)": 0.121212
|
|
},
|
|
{
|
|
"epoch": 2.952097216777734,
|
|
"grad_norm": 0.8115556240081787,
|
|
"learning_rate": 5.592482416369449e-09,
|
|
"loss": 0.26071810722351074,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2355,
|
|
"token_acc": 0.9158378541289933,
|
|
"train_speed(iter/s)": 0.121275
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"grad_norm": 0.7887356281280518,
|
|
"learning_rate": 4.147094626628656e-09,
|
|
"loss": 0.2557636260986328,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2360,
|
|
"token_acc": 0.9162988480999031,
|
|
"train_speed(iter/s)": 0.121332
|
|
},
|
|
{
|
|
"epoch": 2.9583692669541355,
|
|
"eval_loss": 0.3517840504646301,
|
|
"eval_runtime": 29.6869,
|
|
"eval_samples_per_second": 17.348,
|
|
"eval_steps_per_second": 4.345,
|
|
"eval_token_acc": 0.8840933858663761,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.9646413171305372,
|
|
"grad_norm": 0.7621276378631592,
|
|
"learning_rate": 2.9173266586113303e-09,
|
|
"loss": 0.2530327320098877,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2365,
|
|
"token_acc": 0.9022793086399823,
|
|
"train_speed(iter/s)": 0.121129
|
|
},
|
|
{
|
|
"epoch": 2.9709133673069386,
|
|
"grad_norm": 0.8078117370605469,
|
|
"learning_rate": 1.9032315888106724e-09,
|
|
"loss": 0.2542110919952393,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2370,
|
|
"token_acc": 0.919013704543002,
|
|
"train_speed(iter/s)": 0.121185
|
|
},
|
|
{
|
|
"epoch": 2.97718541748334,
|
|
"grad_norm": 0.847828209400177,
|
|
"learning_rate": 1.1048531853286027e-09,
|
|
"loss": 0.26172566413879395,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2375,
|
|
"token_acc": 0.9156356655290102,
|
|
"train_speed(iter/s)": 0.12124
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"grad_norm": 0.7999985814094543,
|
|
"learning_rate": 5.222259059867174e-10,
|
|
"loss": 0.2618349552154541,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2380,
|
|
"token_acc": 0.9119521576775136,
|
|
"train_speed(iter/s)": 0.121301
|
|
},
|
|
{
|
|
"epoch": 2.983457467659741,
|
|
"eval_loss": 0.35182681679725647,
|
|
"eval_runtime": 29.5769,
|
|
"eval_samples_per_second": 17.412,
|
|
"eval_steps_per_second": 4.362,
|
|
"eval_token_acc": 0.8840472797384525,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.9897295178361425,
|
|
"grad_norm": 0.864345371723175,
|
|
"learning_rate": 1.5537489683914442e-10,
|
|
"loss": 0.25390353202819826,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2385,
|
|
"token_acc": 0.9013812207837177,
|
|
"train_speed(iter/s)": 0.121125
|
|
},
|
|
{
|
|
"epoch": 2.9960015680125442,
|
|
"grad_norm": 0.7659916281700134,
|
|
"learning_rate": 4.315991088965632e-12,
|
|
"loss": 0.25308642387390134,
|
|
"memory(GiB)": 33.6,
|
|
"step": 2390,
|
|
"token_acc": 0.9178361065117668,
|
|
"train_speed(iter/s)": 0.121184
|
|
},
|
|
{
|
|
"epoch": 2.9972559780478245,
|
|
"eval_loss": 0.35187748074531555,
|
|
"eval_runtime": 29.5575,
|
|
"eval_samples_per_second": 17.424,
|
|
"eval_steps_per_second": 4.364,
|
|
"eval_token_acc": 0.8842107469192724,
|
|
"step": 2391
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2391,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.893313222448775e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|