6013 lines
171 KiB
JSON
6013 lines
171 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": 1620,
|
||
|
|
"best_metric": 0.25625008,
|
||
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v13-20250430-203547/checkpoint-1620",
|
||
|
|
"epoch": 2.9988481916609078,
|
||
|
|
"eval_steps": 20,
|
||
|
|
"global_step": 2439,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0012285955616985335,
|
||
|
|
"grad_norm": 2.85188364982605,
|
||
|
|
"learning_rate": 9.99999585221637e-06,
|
||
|
|
"loss": 0.3927260637283325,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 1,
|
||
|
|
"token_acc": 0.9111180904522613,
|
||
|
|
"train_speed(iter/s)": 0.065127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.006142977808492667,
|
||
|
|
"grad_norm": 2.0374207496643066,
|
||
|
|
"learning_rate": 9.999896305753298e-06,
|
||
|
|
"loss": 0.4172998070716858,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 5,
|
||
|
|
"token_acc": 0.8710935003515051,
|
||
|
|
"train_speed(iter/s)": 0.125986
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012285955616985334,
|
||
|
|
"grad_norm": 1.0768085718154907,
|
||
|
|
"learning_rate": 9.99958522731419e-06,
|
||
|
|
"loss": 0.31686446666717527,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 10,
|
||
|
|
"token_acc": 0.9013287401574803,
|
||
|
|
"train_speed(iter/s)": 0.138597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.018428933425478,
|
||
|
|
"grad_norm": 1.4088069200515747,
|
||
|
|
"learning_rate": 9.999066777585496e-06,
|
||
|
|
"loss": 0.31454758644104003,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 15,
|
||
|
|
"token_acc": 0.8979140839756373,
|
||
|
|
"train_speed(iter/s)": 0.146387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024571911233970668,
|
||
|
|
"grad_norm": 1.0749812126159668,
|
||
|
|
"learning_rate": 9.998340978071314e-06,
|
||
|
|
"loss": 0.31294023990631104,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 20,
|
||
|
|
"token_acc": 0.9126713473754597,
|
||
|
|
"train_speed(iter/s)": 0.15088
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024571911233970668,
|
||
|
|
"eval_loss": 0.34731194376945496,
|
||
|
|
"eval_runtime": 30.1495,
|
||
|
|
"eval_samples_per_second": 17.446,
|
||
|
|
"eval_steps_per_second": 4.378,
|
||
|
|
"eval_token_acc": 0.9024643320363165,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.030714889042463334,
|
||
|
|
"grad_norm": 1.2545477151870728,
|
||
|
|
"learning_rate": 9.997407858876141e-06,
|
||
|
|
"loss": 0.30952184200286864,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 25,
|
||
|
|
"token_acc": 0.8951860272094114,
|
||
|
|
"train_speed(iter/s)": 0.122954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.036857866850956,
|
||
|
|
"grad_norm": 1.0299962759017944,
|
||
|
|
"learning_rate": 9.99626745870361e-06,
|
||
|
|
"loss": 0.28446354866027834,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 30,
|
||
|
|
"token_acc": 0.9173851303377625,
|
||
|
|
"train_speed(iter/s)": 0.127723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.043000844659448666,
|
||
|
|
"grad_norm": 1.05655038356781,
|
||
|
|
"learning_rate": 9.994919824854899e-06,
|
||
|
|
"loss": 0.3267578125,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 35,
|
||
|
|
"token_acc": 0.8922404371584699,
|
||
|
|
"train_speed(iter/s)": 0.132153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049143822467941335,
|
||
|
|
"grad_norm": 1.0286122560501099,
|
||
|
|
"learning_rate": 9.993365013226757e-06,
|
||
|
|
"loss": 0.29858396053314207,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 40,
|
||
|
|
"token_acc": 0.8932590177726365,
|
||
|
|
"train_speed(iter/s)": 0.135375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049143822467941335,
|
||
|
|
"eval_loss": 0.32549846172332764,
|
||
|
|
"eval_runtime": 29.8986,
|
||
|
|
"eval_samples_per_second": 17.593,
|
||
|
|
"eval_steps_per_second": 4.415,
|
||
|
|
"eval_token_acc": 0.9053393860786857,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.055286800276434005,
|
||
|
|
"grad_norm": 0.9242257475852966,
|
||
|
|
"learning_rate": 9.991603088309195e-06,
|
||
|
|
"loss": 0.2890357971191406,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 45,
|
||
|
|
"token_acc": 0.8943298969072165,
|
||
|
|
"train_speed(iter/s)": 0.121978
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06142977808492667,
|
||
|
|
"grad_norm": 0.8638477325439453,
|
||
|
|
"learning_rate": 9.989634123182798e-06,
|
||
|
|
"loss": 0.2940737247467041,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 50,
|
||
|
|
"token_acc": 0.9095572243424023,
|
||
|
|
"train_speed(iter/s)": 0.125285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06757275589341934,
|
||
|
|
"grad_norm": 1.1406885385513306,
|
||
|
|
"learning_rate": 9.987458199515714e-06,
|
||
|
|
"loss": 0.2999709606170654,
|
||
|
|
"memory(GiB)": 27.77,
|
||
|
|
"step": 55,
|
||
|
|
"token_acc": 0.9033247521498603,
|
||
|
|
"train_speed(iter/s)": 0.128485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.073715733701912,
|
||
|
|
"grad_norm": 1.0126744508743286,
|
||
|
|
"learning_rate": 9.985075407560247e-06,
|
||
|
|
"loss": 0.2821986675262451,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 60,
|
||
|
|
"token_acc": 0.9166214683694098,
|
||
|
|
"train_speed(iter/s)": 0.130304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.073715733701912,
|
||
|
|
"eval_loss": 0.31559479236602783,
|
||
|
|
"eval_runtime": 29.8185,
|
||
|
|
"eval_samples_per_second": 17.64,
|
||
|
|
"eval_steps_per_second": 4.427,
|
||
|
|
"eval_token_acc": 0.9073641735120335,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07985871151040466,
|
||
|
|
"grad_norm": 0.9640651345252991,
|
||
|
|
"learning_rate": 9.982485846149125e-06,
|
||
|
|
"loss": 0.2909295320510864,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 65,
|
||
|
|
"token_acc": 0.9025612979673685,
|
||
|
|
"train_speed(iter/s)": 0.121841
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08600168931889733,
|
||
|
|
"grad_norm": 1.0075881481170654,
|
||
|
|
"learning_rate": 9.979689622691393e-06,
|
||
|
|
"loss": 0.2951636791229248,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 70,
|
||
|
|
"token_acc": 0.9077655003069368,
|
||
|
|
"train_speed(iter/s)": 0.123796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09214466712739,
|
||
|
|
"grad_norm": 1.0915230512619019,
|
||
|
|
"learning_rate": 9.976686853167967e-06,
|
||
|
|
"loss": 0.28308849334716796,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 75,
|
||
|
|
"token_acc": 0.8968571616035693,
|
||
|
|
"train_speed(iter/s)": 0.125709
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09828764493588267,
|
||
|
|
"grad_norm": 1.0471818447113037,
|
||
|
|
"learning_rate": 9.973477662126818e-06,
|
||
|
|
"loss": 0.2649773836135864,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 80,
|
||
|
|
"token_acc": 0.9152229480261289,
|
||
|
|
"train_speed(iter/s)": 0.127584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09828764493588267,
|
||
|
|
"eval_loss": 0.311291366815567,
|
||
|
|
"eval_runtime": 29.9425,
|
||
|
|
"eval_samples_per_second": 17.567,
|
||
|
|
"eval_steps_per_second": 4.408,
|
||
|
|
"eval_token_acc": 0.9086323677763366,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10443062274437534,
|
||
|
|
"grad_norm": 0.9933186769485474,
|
||
|
|
"learning_rate": 9.970062182677802e-06,
|
||
|
|
"loss": 0.27203946113586425,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 85,
|
||
|
|
"token_acc": 0.8999555278840167,
|
||
|
|
"train_speed(iter/s)": 0.121459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11057360055286801,
|
||
|
|
"grad_norm": 1.0888270139694214,
|
||
|
|
"learning_rate": 9.966440556487149e-06,
|
||
|
|
"loss": 0.27809457778930663,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 90,
|
||
|
|
"token_acc": 0.9105367793240556,
|
||
|
|
"train_speed(iter/s)": 0.123076
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11671657836136066,
|
||
|
|
"grad_norm": 1.0842769145965576,
|
||
|
|
"learning_rate": 9.962612933771575e-06,
|
||
|
|
"loss": 0.30378289222717286,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 95,
|
||
|
|
"token_acc": 0.9065857885615252,
|
||
|
|
"train_speed(iter/s)": 0.124867
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12285955616985333,
|
||
|
|
"grad_norm": 1.1600843667984009,
|
||
|
|
"learning_rate": 9.958579473292067e-06,
|
||
|
|
"loss": 0.2904845714569092,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 100,
|
||
|
|
"token_acc": 0.910148975791434,
|
||
|
|
"train_speed(iter/s)": 0.126774
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12285955616985333,
|
||
|
|
"eval_loss": 0.3061583638191223,
|
||
|
|
"eval_runtime": 29.8822,
|
||
|
|
"eval_samples_per_second": 17.602,
|
||
|
|
"eval_steps_per_second": 4.417,
|
||
|
|
"eval_token_acc": 0.9093385214007782,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.129002533978346,
|
||
|
|
"grad_norm": 1.0021681785583496,
|
||
|
|
"learning_rate": 9.95434034234728e-06,
|
||
|
|
"loss": 0.29146251678466795,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 105,
|
||
|
|
"token_acc": 0.9012319578712691,
|
||
|
|
"train_speed(iter/s)": 0.122309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13514551178683867,
|
||
|
|
"grad_norm": 0.9947881698608398,
|
||
|
|
"learning_rate": 9.949895716766611e-06,
|
||
|
|
"loss": 0.28587632179260253,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 110,
|
||
|
|
"token_acc": 0.9113877118644068,
|
||
|
|
"train_speed(iter/s)": 0.12384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14128848959533133,
|
||
|
|
"grad_norm": 0.9120563268661499,
|
||
|
|
"learning_rate": 9.945245780902899e-06,
|
||
|
|
"loss": 0.25429134368896483,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 115,
|
||
|
|
"token_acc": 0.9119403599818774,
|
||
|
|
"train_speed(iter/s)": 0.125132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.147431467403824,
|
||
|
|
"grad_norm": 0.9895658493041992,
|
||
|
|
"learning_rate": 9.940390727624785e-06,
|
||
|
|
"loss": 0.29624483585357664,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 120,
|
||
|
|
"token_acc": 0.907488553000837,
|
||
|
|
"train_speed(iter/s)": 0.126349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.147431467403824,
|
||
|
|
"eval_loss": 0.30357423424720764,
|
||
|
|
"eval_runtime": 29.8849,
|
||
|
|
"eval_samples_per_second": 17.601,
|
||
|
|
"eval_steps_per_second": 4.417,
|
||
|
|
"eval_token_acc": 0.9093457270500072,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15357444521231667,
|
||
|
|
"grad_norm": 1.0295214653015137,
|
||
|
|
"learning_rate": 9.935330758308706e-06,
|
||
|
|
"loss": 0.2756758689880371,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 125,
|
||
|
|
"token_acc": 0.9061426587736607,
|
||
|
|
"train_speed(iter/s)": 0.122406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15971742302080932,
|
||
|
|
"grad_norm": 0.9042087197303772,
|
||
|
|
"learning_rate": 9.93006608283054e-06,
|
||
|
|
"loss": 0.26598501205444336,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 130,
|
||
|
|
"token_acc": 0.9089727568107973,
|
||
|
|
"train_speed(iter/s)": 0.123416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.165860400829302,
|
||
|
|
"grad_norm": 1.020095944404602,
|
||
|
|
"learning_rate": 9.924596919556917e-06,
|
||
|
|
"loss": 0.30240449905395506,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 135,
|
||
|
|
"token_acc": 0.8946991831137082,
|
||
|
|
"train_speed(iter/s)": 0.124801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17200337863779466,
|
||
|
|
"grad_norm": 0.8778018355369568,
|
||
|
|
"learning_rate": 9.918923495336138e-06,
|
||
|
|
"loss": 0.30482988357543944,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 140,
|
||
|
|
"token_acc": 0.896467782800934,
|
||
|
|
"train_speed(iter/s)": 0.125985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17200337863779466,
|
||
|
|
"eval_loss": 0.3020693063735962,
|
||
|
|
"eval_runtime": 29.8385,
|
||
|
|
"eval_samples_per_second": 17.628,
|
||
|
|
"eval_steps_per_second": 4.424,
|
||
|
|
"eval_token_acc": 0.9099077676898688,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17814635644628735,
|
||
|
|
"grad_norm": 0.888533890247345,
|
||
|
|
"learning_rate": 9.913046045488787e-06,
|
||
|
|
"loss": 0.28194656372070315,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 145,
|
||
|
|
"token_acc": 0.9068211113661646,
|
||
|
|
"train_speed(iter/s)": 0.122725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18428933425478,
|
||
|
|
"grad_norm": 0.9438475370407104,
|
||
|
|
"learning_rate": 9.906964813797955e-06,
|
||
|
|
"loss": 0.2703879356384277,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 150,
|
||
|
|
"token_acc": 0.9050589050589051,
|
||
|
|
"train_speed(iter/s)": 0.123734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19043231206327269,
|
||
|
|
"grad_norm": 1.0727230310440063,
|
||
|
|
"learning_rate": 9.900680052499138e-06,
|
||
|
|
"loss": 0.267763090133667,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 155,
|
||
|
|
"token_acc": 0.8978266300274794,
|
||
|
|
"train_speed(iter/s)": 0.124756
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19657528987176534,
|
||
|
|
"grad_norm": 0.8617845773696899,
|
||
|
|
"learning_rate": 9.894192022269773e-06,
|
||
|
|
"loss": 0.2951368808746338,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 160,
|
||
|
|
"token_acc": 0.9030839367122553,
|
||
|
|
"train_speed(iter/s)": 0.1257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19657528987176534,
|
||
|
|
"eval_loss": 0.2993355393409729,
|
||
|
|
"eval_runtime": 29.9095,
|
||
|
|
"eval_samples_per_second": 17.586,
|
||
|
|
"eval_steps_per_second": 4.413,
|
||
|
|
"eval_token_acc": 0.9106355382619974,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.202718267680258,
|
||
|
|
"grad_norm": 1.1480624675750732,
|
||
|
|
"learning_rate": 9.887500992218421e-06,
|
||
|
|
"loss": 0.30594232082366946,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 165,
|
||
|
|
"token_acc": 0.9010627678938407,
|
||
|
|
"train_speed(iter/s)": 0.122909
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20886124548875068,
|
||
|
|
"grad_norm": 1.0773506164550781,
|
||
|
|
"learning_rate": 9.880607239873614e-06,
|
||
|
|
"loss": 0.2754403591156006,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 170,
|
||
|
|
"token_acc": 0.9078512396694215,
|
||
|
|
"train_speed(iter/s)": 0.123726
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21500422329724334,
|
||
|
|
"grad_norm": 1.0344544649124146,
|
||
|
|
"learning_rate": 9.873511051172331e-06,
|
||
|
|
"loss": 0.27539350986480715,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 175,
|
||
|
|
"token_acc": 0.9078795220527504,
|
||
|
|
"train_speed(iter/s)": 0.124606
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22114720110573602,
|
||
|
|
"grad_norm": 0.801381528377533,
|
||
|
|
"learning_rate": 9.866212720448149e-06,
|
||
|
|
"loss": 0.2653654098510742,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 180,
|
||
|
|
"token_acc": 0.9089000349935845,
|
||
|
|
"train_speed(iter/s)": 0.125295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22114720110573602,
|
||
|
|
"eval_loss": 0.29467687010765076,
|
||
|
|
"eval_runtime": 29.9602,
|
||
|
|
"eval_samples_per_second": 17.557,
|
||
|
|
"eval_steps_per_second": 4.406,
|
||
|
|
"eval_token_acc": 0.911284046692607,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22729017891422867,
|
||
|
|
"grad_norm": 0.7735671401023865,
|
||
|
|
"learning_rate": 9.85871255041903e-06,
|
||
|
|
"loss": 0.2625685691833496,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 185,
|
||
|
|
"token_acc": 0.9061491117110654,
|
||
|
|
"train_speed(iter/s)": 0.122679
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23343315672272133,
|
||
|
|
"grad_norm": 0.8188781142234802,
|
||
|
|
"learning_rate": 9.85101085217477e-06,
|
||
|
|
"loss": 0.2741875171661377,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 190,
|
||
|
|
"token_acc": 0.9082752921732972,
|
||
|
|
"train_speed(iter/s)": 0.123561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.239576134531214,
|
||
|
|
"grad_norm": 0.9473600387573242,
|
||
|
|
"learning_rate": 9.843107945164086e-06,
|
||
|
|
"loss": 0.2795043230056763,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 195,
|
||
|
|
"token_acc": 0.9222816722590006,
|
||
|
|
"train_speed(iter/s)": 0.124249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24571911233970667,
|
||
|
|
"grad_norm": 0.9036338329315186,
|
||
|
|
"learning_rate": 9.835004157181372e-06,
|
||
|
|
"loss": 0.2842700004577637,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 200,
|
||
|
|
"token_acc": 0.9156384193074958,
|
||
|
|
"train_speed(iter/s)": 0.125075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24571911233970667,
|
||
|
|
"eval_loss": 0.292435884475708,
|
||
|
|
"eval_runtime": 29.9157,
|
||
|
|
"eval_samples_per_second": 17.583,
|
||
|
|
"eval_steps_per_second": 4.412,
|
||
|
|
"eval_token_acc": 0.9119109381755296,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2518620901481993,
|
||
|
|
"grad_norm": 0.850283145904541,
|
||
|
|
"learning_rate": 9.826699824353106e-06,
|
||
|
|
"loss": 0.25057048797607423,
|
||
|
|
"memory(GiB)": 29.52,
|
||
|
|
"step": 205,
|
||
|
|
"token_acc": 0.9017097011526469,
|
||
|
|
"train_speed(iter/s)": 0.122679
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.258005067956692,
|
||
|
|
"grad_norm": 0.9866188168525696,
|
||
|
|
"learning_rate": 9.818195291123903e-06,
|
||
|
|
"loss": 0.2645299434661865,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 210,
|
||
|
|
"token_acc": 0.9247558634504632,
|
||
|
|
"train_speed(iter/s)": 0.123484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2641480457651847,
|
||
|
|
"grad_norm": 0.9569115042686462,
|
||
|
|
"learning_rate": 9.80949091024223e-06,
|
||
|
|
"loss": 0.26346535682678224,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 215,
|
||
|
|
"token_acc": 0.9060756912373298,
|
||
|
|
"train_speed(iter/s)": 0.1242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27029102357367735,
|
||
|
|
"grad_norm": 0.8597538471221924,
|
||
|
|
"learning_rate": 9.800587042745774e-06,
|
||
|
|
"loss": 0.24233598709106446,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 220,
|
||
|
|
"token_acc": 0.9219184958700315,
|
||
|
|
"train_speed(iter/s)": 0.124816
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27029102357367735,
|
||
|
|
"eval_loss": 0.29206207394599915,
|
||
|
|
"eval_runtime": 29.9221,
|
||
|
|
"eval_samples_per_second": 17.579,
|
||
|
|
"eval_steps_per_second": 4.411,
|
||
|
|
"eval_token_acc": 0.9120910794062546,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27643400138217,
|
||
|
|
"grad_norm": 0.955256462097168,
|
||
|
|
"learning_rate": 9.791484057946465e-06,
|
||
|
|
"loss": 0.256744384765625,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 225,
|
||
|
|
"token_acc": 0.9052175977500594,
|
||
|
|
"train_speed(iter/s)": 0.122865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28257697919066266,
|
||
|
|
"grad_norm": 0.9161826968193054,
|
||
|
|
"learning_rate": 9.782182333415168e-06,
|
||
|
|
"loss": 0.25551562309265136,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 230,
|
||
|
|
"token_acc": 0.9160751966238251,
|
||
|
|
"train_speed(iter/s)": 0.123442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2887199569991553,
|
||
|
|
"grad_norm": 0.8681318759918213,
|
||
|
|
"learning_rate": 9.772682254966009e-06,
|
||
|
|
"loss": 0.27017927169799805,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 235,
|
||
|
|
"token_acc": 0.9036617262423714,
|
||
|
|
"train_speed(iter/s)": 0.124191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.294862934807648,
|
||
|
|
"grad_norm": 1.02655029296875,
|
||
|
|
"learning_rate": 9.762984216640378e-06,
|
||
|
|
"loss": 0.2807133197784424,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 240,
|
||
|
|
"token_acc": 0.91136,
|
||
|
|
"train_speed(iter/s)": 0.12485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.294862934807648,
|
||
|
|
"eval_loss": 0.2887011170387268,
|
||
|
|
"eval_runtime": 29.7971,
|
||
|
|
"eval_samples_per_second": 17.653,
|
||
|
|
"eval_steps_per_second": 4.43,
|
||
|
|
"eval_token_acc": 0.9119037325263006,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3010059126161407,
|
||
|
|
"grad_norm": 0.8672446012496948,
|
||
|
|
"learning_rate": 9.753088620690589e-06,
|
||
|
|
"loss": 0.25624737739562986,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 245,
|
||
|
|
"token_acc": 0.9064800901577761,
|
||
|
|
"train_speed(iter/s)": 0.122919
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30714889042463334,
|
||
|
|
"grad_norm": 0.9362043142318726,
|
||
|
|
"learning_rate": 9.742995877563187e-06,
|
||
|
|
"loss": 0.2410278081893921,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 250,
|
||
|
|
"token_acc": 0.9145431429992814,
|
||
|
|
"train_speed(iter/s)": 0.123516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.313291868233126,
|
||
|
|
"grad_norm": 0.8355256915092468,
|
||
|
|
"learning_rate": 9.732706405881931e-06,
|
||
|
|
"loss": 0.29171640872955323,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 255,
|
||
|
|
"token_acc": 0.9167446592065107,
|
||
|
|
"train_speed(iter/s)": 0.123982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31943484604161865,
|
||
|
|
"grad_norm": 0.9195040464401245,
|
||
|
|
"learning_rate": 9.722220632430428e-06,
|
||
|
|
"loss": 0.2701089859008789,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 260,
|
||
|
|
"token_acc": 0.914859208523592,
|
||
|
|
"train_speed(iter/s)": 0.124498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31943484604161865,
|
||
|
|
"eval_loss": 0.2895512878894806,
|
||
|
|
"eval_runtime": 29.7827,
|
||
|
|
"eval_samples_per_second": 17.661,
|
||
|
|
"eval_steps_per_second": 4.432,
|
||
|
|
"eval_token_acc": 0.9118460873324686,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32557782385011136,
|
||
|
|
"grad_norm": 0.7184414267539978,
|
||
|
|
"learning_rate": 9.711538992134427e-06,
|
||
|
|
"loss": 0.27772011756896975,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 265,
|
||
|
|
"token_acc": 0.9080251975547935,
|
||
|
|
"train_speed(iter/s)": 0.122689
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.331720801658604,
|
||
|
|
"grad_norm": 0.8985347151756287,
|
||
|
|
"learning_rate": 9.700661928043787e-06,
|
||
|
|
"loss": 0.2564595460891724,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 270,
|
||
|
|
"token_acc": 0.9087627174269773,
|
||
|
|
"train_speed(iter/s)": 0.123177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33786377946709667,
|
||
|
|
"grad_norm": 0.8065007925033569,
|
||
|
|
"learning_rate": 9.689589891314094e-06,
|
||
|
|
"loss": 0.25415422916412356,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 275,
|
||
|
|
"token_acc": 0.9148117934972614,
|
||
|
|
"train_speed(iter/s)": 0.123657
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3440067572755893,
|
||
|
|
"grad_norm": 1.036281943321228,
|
||
|
|
"learning_rate": 9.678323341187956e-06,
|
||
|
|
"loss": 0.2695312023162842,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 280,
|
||
|
|
"token_acc": 0.9125853071055801,
|
||
|
|
"train_speed(iter/s)": 0.124107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3440067572755893,
|
||
|
|
"eval_loss": 0.28761476278305054,
|
||
|
|
"eval_runtime": 29.7589,
|
||
|
|
"eval_samples_per_second": 17.675,
|
||
|
|
"eval_steps_per_second": 4.436,
|
||
|
|
"eval_token_acc": 0.9123000432338954,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.350149735084082,
|
||
|
|
"grad_norm": 0.9171528816223145,
|
||
|
|
"learning_rate": 9.666862744975938e-06,
|
||
|
|
"loss": 0.26874988079071044,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 285,
|
||
|
|
"token_acc": 0.9094196412588164,
|
||
|
|
"train_speed(iter/s)": 0.122607
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3562927128925747,
|
||
|
|
"grad_norm": 0.8578206300735474,
|
||
|
|
"learning_rate": 9.655208578037198e-06,
|
||
|
|
"loss": 0.28213140964508054,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 290,
|
||
|
|
"token_acc": 0.9133274656042989,
|
||
|
|
"train_speed(iter/s)": 0.123162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36243569070106735,
|
||
|
|
"grad_norm": 0.9107432961463928,
|
||
|
|
"learning_rate": 9.643361323759763e-06,
|
||
|
|
"loss": 0.27148008346557617,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 295,
|
||
|
|
"token_acc": 0.9049124513618677,
|
||
|
|
"train_speed(iter/s)": 0.123652
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36857866850956,
|
||
|
|
"grad_norm": 0.9925222396850586,
|
||
|
|
"learning_rate": 9.631321473540476e-06,
|
||
|
|
"loss": 0.2592118740081787,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 300,
|
||
|
|
"token_acc": 0.8945048023933239,
|
||
|
|
"train_speed(iter/s)": 0.124138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36857866850956,
|
||
|
|
"eval_loss": 0.28638342022895813,
|
||
|
|
"eval_runtime": 29.7767,
|
||
|
|
"eval_samples_per_second": 17.665,
|
||
|
|
"eval_steps_per_second": 4.433,
|
||
|
|
"eval_token_acc": 0.9123360714800404,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37472164631805266,
|
||
|
|
"grad_norm": 0.9066736102104187,
|
||
|
|
"learning_rate": 9.619089526764614e-06,
|
||
|
|
"loss": 0.26896276473999026,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 305,
|
||
|
|
"token_acc": 0.9059246028729954,
|
||
|
|
"train_speed(iter/s)": 0.12275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38086462412654537,
|
||
|
|
"grad_norm": 1.0033142566680908,
|
||
|
|
"learning_rate": 9.60666599078518e-06,
|
||
|
|
"loss": 0.2597354412078857,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 310,
|
||
|
|
"token_acc": 0.9248076074702221,
|
||
|
|
"train_speed(iter/s)": 0.123177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.387007601935038,
|
||
|
|
"grad_norm": 0.7254430055618286,
|
||
|
|
"learning_rate": 9.59405138090186e-06,
|
||
|
|
"loss": 0.25493106842041013,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 315,
|
||
|
|
"token_acc": 0.9245887855378633,
|
||
|
|
"train_speed(iter/s)": 0.123726
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3931505797435307,
|
||
|
|
"grad_norm": 0.9664581418037415,
|
||
|
|
"learning_rate": 9.581246220339636e-06,
|
||
|
|
"loss": 0.25707592964172366,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 320,
|
||
|
|
"token_acc": 0.9214029811137158,
|
||
|
|
"train_speed(iter/s)": 0.124122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3931505797435307,
|
||
|
|
"eval_loss": 0.2849082946777344,
|
||
|
|
"eval_runtime": 29.7736,
|
||
|
|
"eval_samples_per_second": 17.667,
|
||
|
|
"eval_steps_per_second": 4.433,
|
||
|
|
"eval_token_acc": 0.9127467934860931,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39929355755202334,
|
||
|
|
"grad_norm": 1.0068587064743042,
|
||
|
|
"learning_rate": 9.568251040227101e-06,
|
||
|
|
"loss": 0.26822853088378906,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 325,
|
||
|
|
"token_acc": 0.9102573583789381,
|
||
|
|
"train_speed(iter/s)": 0.122735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.405436535360516,
|
||
|
|
"grad_norm": 0.9217173457145691,
|
||
|
|
"learning_rate": 9.555066379574423e-06,
|
||
|
|
"loss": 0.25938191413879397,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 330,
|
||
|
|
"token_acc": 0.91627231410767,
|
||
|
|
"train_speed(iter/s)": 0.123232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4115795131690087,
|
||
|
|
"grad_norm": 0.9511445760726929,
|
||
|
|
"learning_rate": 9.541692785250983e-06,
|
||
|
|
"loss": 0.2502701759338379,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 335,
|
||
|
|
"token_acc": 0.9267840101791963,
|
||
|
|
"train_speed(iter/s)": 0.123579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41772249097750136,
|
||
|
|
"grad_norm": 0.8599613904953003,
|
||
|
|
"learning_rate": 9.528130811962693e-06,
|
||
|
|
"loss": 0.2726857662200928,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 340,
|
||
|
|
"token_acc": 0.9137533709242461,
|
||
|
|
"train_speed(iter/s)": 0.124005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41772249097750136,
|
||
|
|
"eval_loss": 0.2847888171672821,
|
||
|
|
"eval_runtime": 29.6352,
|
||
|
|
"eval_samples_per_second": 17.749,
|
||
|
|
"eval_steps_per_second": 4.454,
|
||
|
|
"eval_token_acc": 0.9126459143968871,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.423865468785994,
|
||
|
|
"grad_norm": 0.8748957514762878,
|
||
|
|
"learning_rate": 9.514381022228997e-06,
|
||
|
|
"loss": 0.2631422996520996,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 345,
|
||
|
|
"token_acc": 0.9046689686233298,
|
||
|
|
"train_speed(iter/s)": 0.122672
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43000844659448667,
|
||
|
|
"grad_norm": 0.8299148082733154,
|
||
|
|
"learning_rate": 9.50044398635953e-06,
|
||
|
|
"loss": 0.25957283973693845,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 350,
|
||
|
|
"token_acc": 0.9137482867810388,
|
||
|
|
"train_speed(iter/s)": 0.123033
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4361514244029793,
|
||
|
|
"grad_norm": 0.8662333488464355,
|
||
|
|
"learning_rate": 9.486320282430469e-06,
|
||
|
|
"loss": 0.25355665683746337,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 355,
|
||
|
|
"token_acc": 0.9042410061421469,
|
||
|
|
"train_speed(iter/s)": 0.123514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44229440221147204,
|
||
|
|
"grad_norm": 0.8986192345619202,
|
||
|
|
"learning_rate": 9.472010496260545e-06,
|
||
|
|
"loss": 0.27993130683898926,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 360,
|
||
|
|
"token_acc": 0.9059234866040574,
|
||
|
|
"train_speed(iter/s)": 0.12398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44229440221147204,
|
||
|
|
"eval_loss": 0.28364232182502747,
|
||
|
|
"eval_runtime": 29.7243,
|
||
|
|
"eval_samples_per_second": 17.696,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9125234183599943,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4484373800199647,
|
||
|
|
"grad_norm": 0.8662838339805603,
|
||
|
|
"learning_rate": 9.45751522138676e-06,
|
||
|
|
"loss": 0.2574014663696289,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 365,
|
||
|
|
"token_acc": 0.9092130002686006,
|
||
|
|
"train_speed(iter/s)": 0.122708
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45458035782845735,
|
||
|
|
"grad_norm": 0.8905205130577087,
|
||
|
|
"learning_rate": 9.44283505903976e-06,
|
||
|
|
"loss": 0.2571540355682373,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 370,
|
||
|
|
"token_acc": 0.9236671451908322,
|
||
|
|
"train_speed(iter/s)": 0.12311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46072333563695,
|
||
|
|
"grad_norm": 0.8431711196899414,
|
||
|
|
"learning_rate": 9.427970618118888e-06,
|
||
|
|
"loss": 0.29239816665649415,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 375,
|
||
|
|
"token_acc": 0.910949410949411,
|
||
|
|
"train_speed(iter/s)": 0.123539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46686631344544266,
|
||
|
|
"grad_norm": 0.8874600529670715,
|
||
|
|
"learning_rate": 9.412922515166952e-06,
|
||
|
|
"loss": 0.2700673580169678,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 380,
|
||
|
|
"token_acc": 0.8923294784045315,
|
||
|
|
"train_speed(iter/s)": 0.123896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46686631344544266,
|
||
|
|
"eval_loss": 0.2839924693107605,
|
||
|
|
"eval_runtime": 29.8081,
|
||
|
|
"eval_samples_per_second": 17.646,
|
||
|
|
"eval_steps_per_second": 4.428,
|
||
|
|
"eval_token_acc": 0.9126891482922611,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47300929125393537,
|
||
|
|
"grad_norm": 0.9719629883766174,
|
||
|
|
"learning_rate": 9.39769137434463e-06,
|
||
|
|
"loss": 0.26788945198059083,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 385,
|
||
|
|
"token_acc": 0.9089390748674752,
|
||
|
|
"train_speed(iter/s)": 0.122706
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.479152269062428,
|
||
|
|
"grad_norm": 0.8490920066833496,
|
||
|
|
"learning_rate": 9.38227782740459e-06,
|
||
|
|
"loss": 0.26457748413085935,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 390,
|
||
|
|
"token_acc": 0.9092120695170061,
|
||
|
|
"train_speed(iter/s)": 0.12317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4852952468709207,
|
||
|
|
"grad_norm": 0.8913845419883728,
|
||
|
|
"learning_rate": 9.366682513665293e-06,
|
||
|
|
"loss": 0.2367623805999756,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 395,
|
||
|
|
"token_acc": 0.9159677595033221,
|
||
|
|
"train_speed(iter/s)": 0.123494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49143822467941334,
|
||
|
|
"grad_norm": 1.0622432231903076,
|
||
|
|
"learning_rate": 9.350906079984456e-06,
|
||
|
|
"loss": 0.29119043350219725,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 400,
|
||
|
|
"token_acc": 0.9054560355930219,
|
||
|
|
"train_speed(iter/s)": 0.123861
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49143822467941334,
|
||
|
|
"eval_loss": 0.28296077251434326,
|
||
|
|
"eval_runtime": 29.8253,
|
||
|
|
"eval_samples_per_second": 17.636,
|
||
|
|
"eval_steps_per_second": 4.426,
|
||
|
|
"eval_token_acc": 0.9125594466061392,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.497581202487906,
|
||
|
|
"grad_norm": 0.7695022225379944,
|
||
|
|
"learning_rate": 9.334949180732245e-06,
|
||
|
|
"loss": 0.27100481986999514,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 405,
|
||
|
|
"token_acc": 0.9032931397580931,
|
||
|
|
"train_speed(iter/s)": 0.122792
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5037241802963986,
|
||
|
|
"grad_norm": 0.8153588175773621,
|
||
|
|
"learning_rate": 9.31881247776412e-06,
|
||
|
|
"loss": 0.24918160438537598,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 410,
|
||
|
|
"token_acc": 0.9269037635243568,
|
||
|
|
"train_speed(iter/s)": 0.123224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5098671581048914,
|
||
|
|
"grad_norm": 0.8573175668716431,
|
||
|
|
"learning_rate": 9.302496640393383e-06,
|
||
|
|
"loss": 0.2658379554748535,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 415,
|
||
|
|
"token_acc": 0.9111843654344243,
|
||
|
|
"train_speed(iter/s)": 0.12355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.516010135913384,
|
||
|
|
"grad_norm": 0.8473800420761108,
|
||
|
|
"learning_rate": 9.286002345363418e-06,
|
||
|
|
"loss": 0.25906102657318114,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 420,
|
||
|
|
"token_acc": 0.9103914478855526,
|
||
|
|
"train_speed(iter/s)": 0.12386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.516010135913384,
|
||
|
|
"eval_loss": 0.2819235920906067,
|
||
|
|
"eval_runtime": 29.7627,
|
||
|
|
"eval_samples_per_second": 17.673,
|
||
|
|
"eval_steps_per_second": 4.435,
|
||
|
|
"eval_token_acc": 0.9134313301628477,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5221531137218767,
|
||
|
|
"grad_norm": 0.8349065780639648,
|
||
|
|
"learning_rate": 9.26933027681963e-06,
|
||
|
|
"loss": 0.27934200763702394,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 425,
|
||
|
|
"token_acc": 0.9077418760931603,
|
||
|
|
"train_speed(iter/s)": 0.122817
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5282960915303694,
|
||
|
|
"grad_norm": 0.7763445973396301,
|
||
|
|
"learning_rate": 9.25248112628105e-06,
|
||
|
|
"loss": 0.25652432441711426,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 430,
|
||
|
|
"token_acc": 0.9304649945266765,
|
||
|
|
"train_speed(iter/s)": 0.123084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.534439069338862,
|
||
|
|
"grad_norm": 0.7395066022872925,
|
||
|
|
"learning_rate": 9.235455592611667e-06,
|
||
|
|
"loss": 0.2478388547897339,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 435,
|
||
|
|
"token_acc": 0.9145705869023341,
|
||
|
|
"train_speed(iter/s)": 0.123435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5405820471473547,
|
||
|
|
"grad_norm": 0.7695580124855042,
|
||
|
|
"learning_rate": 9.218254381991438e-06,
|
||
|
|
"loss": 0.26280052661895753,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 440,
|
||
|
|
"token_acc": 0.9116754512058777,
|
||
|
|
"train_speed(iter/s)": 0.123796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5405820471473547,
|
||
|
|
"eval_loss": 0.2799247205257416,
|
||
|
|
"eval_runtime": 29.6791,
|
||
|
|
"eval_samples_per_second": 17.723,
|
||
|
|
"eval_steps_per_second": 4.448,
|
||
|
|
"eval_token_acc": 0.9137844069750685,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5467250249558473,
|
||
|
|
"grad_norm": 0.8697651028633118,
|
||
|
|
"learning_rate": 9.200878207886995e-06,
|
||
|
|
"loss": 0.2713948726654053,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 445,
|
||
|
|
"token_acc": 0.9107910032853171,
|
||
|
|
"train_speed(iter/s)": 0.122757
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.55286800276434,
|
||
|
|
"grad_norm": 0.8758683204650879,
|
||
|
|
"learning_rate": 9.183327791022048e-06,
|
||
|
|
"loss": 0.28769237995147706,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 450,
|
||
|
|
"token_acc": 0.9117891241178913,
|
||
|
|
"train_speed(iter/s)": 0.123141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5590109805728327,
|
||
|
|
"grad_norm": 0.8500604033470154,
|
||
|
|
"learning_rate": 9.165603859347503e-06,
|
||
|
|
"loss": 0.28233935832977297,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 455,
|
||
|
|
"token_acc": 0.9127747252747253,
|
||
|
|
"train_speed(iter/s)": 0.123484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5651539583813253,
|
||
|
|
"grad_norm": 0.7201665639877319,
|
||
|
|
"learning_rate": 9.147707148011255e-06,
|
||
|
|
"loss": 0.26129984855651855,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 460,
|
||
|
|
"token_acc": 0.9233639048655371,
|
||
|
|
"train_speed(iter/s)": 0.123825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5651539583813253,
|
||
|
|
"eval_loss": 0.27780428528785706,
|
||
|
|
"eval_runtime": 29.6703,
|
||
|
|
"eval_samples_per_second": 17.728,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9140005764519383,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.571296936189818,
|
||
|
|
"grad_norm": 0.9048472046852112,
|
||
|
|
"learning_rate": 9.129638399327707e-06,
|
||
|
|
"loss": 0.2747702360153198,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 465,
|
||
|
|
"token_acc": 0.9071566001433583,
|
||
|
|
"train_speed(iter/s)": 0.122931
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5774399139983106,
|
||
|
|
"grad_norm": 0.920282244682312,
|
||
|
|
"learning_rate": 9.111398362746969e-06,
|
||
|
|
"loss": 0.25536236763000486,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 470,
|
||
|
|
"token_acc": 0.9256230196112015,
|
||
|
|
"train_speed(iter/s)": 0.123189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5835828918068033,
|
||
|
|
"grad_norm": 0.7103043794631958,
|
||
|
|
"learning_rate": 9.092987794823785e-06,
|
||
|
|
"loss": 0.25142607688903806,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 475,
|
||
|
|
"token_acc": 0.9101461736887361,
|
||
|
|
"train_speed(iter/s)": 0.1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.589725869615296,
|
||
|
|
"grad_norm": 0.962253987789154,
|
||
|
|
"learning_rate": 9.074407459186144e-06,
|
||
|
|
"loss": 0.27944207191467285,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 480,
|
||
|
|
"token_acc": 0.9034072816049251,
|
||
|
|
"train_speed(iter/s)": 0.123839
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.589725869615296,
|
||
|
|
"eval_loss": 0.2775828242301941,
|
||
|
|
"eval_runtime": 29.9011,
|
||
|
|
"eval_samples_per_second": 17.591,
|
||
|
|
"eval_steps_per_second": 4.415,
|
||
|
|
"eval_token_acc": 0.9140798385934573,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5958688474237887,
|
||
|
|
"grad_norm": 0.8453803658485413,
|
||
|
|
"learning_rate": 9.055658126503605e-06,
|
||
|
|
"loss": 0.25680568218231203,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 485,
|
||
|
|
"token_acc": 0.9067185532791576,
|
||
|
|
"train_speed(iter/s)": 0.122913
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6020118252322814,
|
||
|
|
"grad_norm": 0.8572331666946411,
|
||
|
|
"learning_rate": 9.036740574455345e-06,
|
||
|
|
"loss": 0.24585814476013185,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 490,
|
||
|
|
"token_acc": 0.9118667917448405,
|
||
|
|
"train_speed(iter/s)": 0.123173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.608154803040774,
|
||
|
|
"grad_norm": 0.764564573764801,
|
||
|
|
"learning_rate": 9.017655587697885e-06,
|
||
|
|
"loss": 0.2665162801742554,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 495,
|
||
|
|
"token_acc": 0.9117743676380872,
|
||
|
|
"train_speed(iter/s)": 0.12348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6142977808492667,
|
||
|
|
"grad_norm": 0.8492972254753113,
|
||
|
|
"learning_rate": 8.998403957832553e-06,
|
||
|
|
"loss": 0.24622914791107178,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 500,
|
||
|
|
"token_acc": 0.9276657659530243,
|
||
|
|
"train_speed(iter/s)": 0.12376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6142977808492667,
|
||
|
|
"eval_loss": 0.2776328921318054,
|
||
|
|
"eval_runtime": 29.8051,
|
||
|
|
"eval_samples_per_second": 17.648,
|
||
|
|
"eval_steps_per_second": 4.429,
|
||
|
|
"eval_token_acc": 0.9137772013258395,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6204407586577594,
|
||
|
|
"grad_norm": 0.8095591068267822,
|
||
|
|
"learning_rate": 8.978986483372657e-06,
|
||
|
|
"loss": 0.26657900810241697,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 505,
|
||
|
|
"token_acc": 0.9116968207877298,
|
||
|
|
"train_speed(iter/s)": 0.122844
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.626583736466252,
|
||
|
|
"grad_norm": 0.8510200381278992,
|
||
|
|
"learning_rate": 8.959403969710346e-06,
|
||
|
|
"loss": 0.2664052486419678,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 510,
|
||
|
|
"token_acc": 0.9214573689711811,
|
||
|
|
"train_speed(iter/s)": 0.123175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6327267142747447,
|
||
|
|
"grad_norm": 0.9017972946166992,
|
||
|
|
"learning_rate": 8.939657229083223e-06,
|
||
|
|
"loss": 0.27168979644775393,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 515,
|
||
|
|
"token_acc": 0.9179679028410324,
|
||
|
|
"train_speed(iter/s)": 0.123502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6388696920832373,
|
||
|
|
"grad_norm": 0.8968759179115295,
|
||
|
|
"learning_rate": 8.919747080540647e-06,
|
||
|
|
"loss": 0.2673780918121338,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 520,
|
||
|
|
"token_acc": 0.918267105457046,
|
||
|
|
"train_speed(iter/s)": 0.123831
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6388696920832373,
|
||
|
|
"eval_loss": 0.27617883682250977,
|
||
|
|
"eval_runtime": 29.7042,
|
||
|
|
"eval_samples_per_second": 17.708,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9144112984579911,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.64501266989173,
|
||
|
|
"grad_norm": 0.8835923075675964,
|
||
|
|
"learning_rate": 8.899674349909759e-06,
|
||
|
|
"loss": 0.25952877998352053,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 525,
|
||
|
|
"token_acc": 0.9108591693084734,
|
||
|
|
"train_speed(iter/s)": 0.123021
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6511556477002227,
|
||
|
|
"grad_norm": 1.0378886461257935,
|
||
|
|
"learning_rate": 8.879439869761233e-06,
|
||
|
|
"loss": 0.27931737899780273,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 530,
|
||
|
|
"token_acc": 0.9016547678344126,
|
||
|
|
"train_speed(iter/s)": 0.123368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6572986255087153,
|
||
|
|
"grad_norm": 0.815498411655426,
|
||
|
|
"learning_rate": 8.859044479374737e-06,
|
||
|
|
"loss": 0.26363046169281007,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 535,
|
||
|
|
"token_acc": 0.9000225428313796,
|
||
|
|
"train_speed(iter/s)": 0.123703
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.663441603317208,
|
||
|
|
"grad_norm": 0.7910681962966919,
|
||
|
|
"learning_rate": 8.838489024704131e-06,
|
||
|
|
"loss": 0.25994918346405027,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 540,
|
||
|
|
"token_acc": 0.9154737238651347,
|
||
|
|
"train_speed(iter/s)": 0.123918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.663441603317208,
|
||
|
|
"eval_loss": 0.27339863777160645,
|
||
|
|
"eval_runtime": 29.6984,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9149733390978527,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6695845811257006,
|
||
|
|
"grad_norm": 1.0020904541015625,
|
||
|
|
"learning_rate": 8.817774358342367e-06,
|
||
|
|
"loss": 0.25385727882385256,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 545,
|
||
|
|
"token_acc": 0.9148255452267112,
|
||
|
|
"train_speed(iter/s)": 0.123092
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6757275589341933,
|
||
|
|
"grad_norm": 0.8850826621055603,
|
||
|
|
"learning_rate": 8.796901339486136e-06,
|
||
|
|
"loss": 0.258061146736145,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 550,
|
||
|
|
"token_acc": 0.9247646909183413,
|
||
|
|
"train_speed(iter/s)": 0.12338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.681870536742686,
|
||
|
|
"grad_norm": 0.7173855304718018,
|
||
|
|
"learning_rate": 8.775870833900226e-06,
|
||
|
|
"loss": 0.2372835636138916,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 555,
|
||
|
|
"token_acc": 0.9205999329148498,
|
||
|
|
"train_speed(iter/s)": 0.12365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6880135145511787,
|
||
|
|
"grad_norm": 0.8591410517692566,
|
||
|
|
"learning_rate": 8.75468371388161e-06,
|
||
|
|
"loss": 0.2429880380630493,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 560,
|
||
|
|
"token_acc": 0.9182278006744099,
|
||
|
|
"train_speed(iter/s)": 0.123832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6880135145511787,
|
||
|
|
"eval_loss": 0.2735811173915863,
|
||
|
|
"eval_runtime": 29.6978,
|
||
|
|
"eval_samples_per_second": 17.712,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.914454532353365,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6941564923596714,
|
||
|
|
"grad_norm": 0.7596750259399414,
|
||
|
|
"learning_rate": 8.733340858223268e-06,
|
||
|
|
"loss": 0.27418644428253175,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 565,
|
||
|
|
"token_acc": 0.9034487711172151,
|
||
|
|
"train_speed(iter/s)": 0.123058
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.700299470168164,
|
||
|
|
"grad_norm": 0.9523453116416931,
|
||
|
|
"learning_rate": 8.711843152177735e-06,
|
||
|
|
"loss": 0.23236403465270997,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 570,
|
||
|
|
"token_acc": 0.9180918923916058,
|
||
|
|
"train_speed(iter/s)": 0.123331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7064424479766567,
|
||
|
|
"grad_norm": 0.7896727919578552,
|
||
|
|
"learning_rate": 8.690191487420385e-06,
|
||
|
|
"loss": 0.24450998306274413,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 575,
|
||
|
|
"token_acc": 0.9229566883477033,
|
||
|
|
"train_speed(iter/s)": 0.12356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7125854257851494,
|
||
|
|
"grad_norm": 0.9423844218254089,
|
||
|
|
"learning_rate": 8.668386762012445e-06,
|
||
|
|
"loss": 0.2612689256668091,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 580,
|
||
|
|
"token_acc": 0.9097009202453987,
|
||
|
|
"train_speed(iter/s)": 0.123753
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7125854257851494,
|
||
|
|
"eval_loss": 0.2717309594154358,
|
||
|
|
"eval_runtime": 29.6986,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9148292261132728,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.718728403593642,
|
||
|
|
"grad_norm": 0.8224099278450012,
|
||
|
|
"learning_rate": 8.646429880363746e-06,
|
||
|
|
"loss": 0.2574700117111206,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 585,
|
||
|
|
"token_acc": 0.9109293706792103,
|
||
|
|
"train_speed(iter/s)": 0.122969
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7248713814021347,
|
||
|
|
"grad_norm": 0.8321871757507324,
|
||
|
|
"learning_rate": 8.624321753195209e-06,
|
||
|
|
"loss": 0.24323840141296388,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 590,
|
||
|
|
"token_acc": 0.9101112629318758,
|
||
|
|
"train_speed(iter/s)": 0.12323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7310143592106274,
|
||
|
|
"grad_norm": 0.7631216645240784,
|
||
|
|
"learning_rate": 8.602063297501069e-06,
|
||
|
|
"loss": 0.2646035194396973,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 595,
|
||
|
|
"token_acc": 0.9244347364071078,
|
||
|
|
"train_speed(iter/s)": 0.123473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.73715733701912,
|
||
|
|
"grad_norm": 0.7845233678817749,
|
||
|
|
"learning_rate": 8.579655436510847e-06,
|
||
|
|
"loss": 0.24114649295806884,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 600,
|
||
|
|
"token_acc": 0.9176872685844488,
|
||
|
|
"train_speed(iter/s)": 0.123706
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.73715733701912,
|
||
|
|
"eval_loss": 0.27247732877731323,
|
||
|
|
"eval_runtime": 29.6948,
|
||
|
|
"eval_samples_per_second": 17.714,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9152687707162416,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7433003148276127,
|
||
|
|
"grad_norm": 0.8848146796226501,
|
||
|
|
"learning_rate": 8.557099099651046e-06,
|
||
|
|
"loss": 0.26951429843902586,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 605,
|
||
|
|
"token_acc": 0.9051814218282708,
|
||
|
|
"train_speed(iter/s)": 0.122981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7494432926361053,
|
||
|
|
"grad_norm": 0.7597134709358215,
|
||
|
|
"learning_rate": 8.534395222506614e-06,
|
||
|
|
"loss": 0.2650261163711548,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 610,
|
||
|
|
"token_acc": 0.9191399015223382,
|
||
|
|
"train_speed(iter/s)": 0.123257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.755586270444598,
|
||
|
|
"grad_norm": 0.8406746983528137,
|
||
|
|
"learning_rate": 8.511544746782124e-06,
|
||
|
|
"loss": 0.266461181640625,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 615,
|
||
|
|
"token_acc": 0.9117747440273037,
|
||
|
|
"train_speed(iter/s)": 0.123491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7617292482530907,
|
||
|
|
"grad_norm": 0.7886612415313721,
|
||
|
|
"learning_rate": 8.488548620262722e-06,
|
||
|
|
"loss": 0.23856868743896484,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 620,
|
||
|
|
"token_acc": 0.9248648177219606,
|
||
|
|
"train_speed(iter/s)": 0.123722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7617292482530907,
|
||
|
|
"eval_loss": 0.27053606510162354,
|
||
|
|
"eval_runtime": 29.6607,
|
||
|
|
"eval_samples_per_second": 17.734,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.9157299322668973,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7678722260615833,
|
||
|
|
"grad_norm": 0.7711523771286011,
|
||
|
|
"learning_rate": 8.465407796774816e-06,
|
||
|
|
"loss": 0.23632125854492186,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 625,
|
||
|
|
"token_acc": 0.9121895174526754,
|
||
|
|
"train_speed(iter/s)": 0.122988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.774015203870076,
|
||
|
|
"grad_norm": 0.8446414470672607,
|
||
|
|
"learning_rate": 8.442123236146509e-06,
|
||
|
|
"loss": 0.25997061729431153,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 630,
|
||
|
|
"token_acc": 0.922757768361582,
|
||
|
|
"train_speed(iter/s)": 0.123243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7801581816785687,
|
||
|
|
"grad_norm": 0.8071849942207336,
|
||
|
|
"learning_rate": 8.418695904167789e-06,
|
||
|
|
"loss": 0.2547910690307617,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 635,
|
||
|
|
"token_acc": 0.9215275839612987,
|
||
|
|
"train_speed(iter/s)": 0.123454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7863011594870614,
|
||
|
|
"grad_norm": 1.0132827758789062,
|
||
|
|
"learning_rate": 8.395126772550475e-06,
|
||
|
|
"loss": 0.2584752082824707,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 640,
|
||
|
|
"token_acc": 0.9119555143651529,
|
||
|
|
"train_speed(iter/s)": 0.123674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7863011594870614,
|
||
|
|
"eval_loss": 0.2703080475330353,
|
||
|
|
"eval_runtime": 29.6849,
|
||
|
|
"eval_samples_per_second": 17.719,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9151967142239515,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7924441372955541,
|
||
|
|
"grad_norm": 0.795028805732727,
|
||
|
|
"learning_rate": 8.371416818887907e-06,
|
||
|
|
"loss": 0.2689487934112549,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 645,
|
||
|
|
"token_acc": 0.906519600423793,
|
||
|
|
"train_speed(iter/s)": 0.122961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7985871151040467,
|
||
|
|
"grad_norm": 0.8361831903457642,
|
||
|
|
"learning_rate": 8.347567026614398e-06,
|
||
|
|
"loss": 0.25730276107788086,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 650,
|
||
|
|
"token_acc": 0.9204035220712578,
|
||
|
|
"train_speed(iter/s)": 0.123176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8047300929125394,
|
||
|
|
"grad_norm": 0.8233036994934082,
|
||
|
|
"learning_rate": 8.323578384964444e-06,
|
||
|
|
"loss": 0.2561511039733887,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 655,
|
||
|
|
"token_acc": 0.9101053936763794,
|
||
|
|
"train_speed(iter/s)": 0.123399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.810873070721032,
|
||
|
|
"grad_norm": 0.8036513328552246,
|
||
|
|
"learning_rate": 8.299451888931696e-06,
|
||
|
|
"loss": 0.24744575023651122,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 660,
|
||
|
|
"token_acc": 0.9082344368103269,
|
||
|
|
"train_speed(iter/s)": 0.123638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.810873070721032,
|
||
|
|
"eval_loss": 0.2683682441711426,
|
||
|
|
"eval_runtime": 29.6531,
|
||
|
|
"eval_samples_per_second": 17.738,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9163135898544459,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8170160485295247,
|
||
|
|
"grad_norm": 0.7272213697433472,
|
||
|
|
"learning_rate": 8.275188539227687e-06,
|
||
|
|
"loss": 0.23523108959197997,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 665,
|
||
|
|
"token_acc": 0.9134302376185917,
|
||
|
|
"train_speed(iter/s)": 0.122932
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8231590263380174,
|
||
|
|
"grad_norm": 0.854129433631897,
|
||
|
|
"learning_rate": 8.250789342240326e-06,
|
||
|
|
"loss": 0.24518890380859376,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 670,
|
||
|
|
"token_acc": 0.9203224101479915,
|
||
|
|
"train_speed(iter/s)": 0.123198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.82930200414651,
|
||
|
|
"grad_norm": 0.755439043045044,
|
||
|
|
"learning_rate": 8.22625530999215e-06,
|
||
|
|
"loss": 0.23708434104919435,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 675,
|
||
|
|
"token_acc": 0.9149347105009027,
|
||
|
|
"train_speed(iter/s)": 0.123435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8354449819550027,
|
||
|
|
"grad_norm": 0.7811703085899353,
|
||
|
|
"learning_rate": 8.201587460098362e-06,
|
||
|
|
"loss": 0.23157744407653807,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 680,
|
||
|
|
"token_acc": 0.9248452220726784,
|
||
|
|
"train_speed(iter/s)": 0.123616
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8354449819550027,
|
||
|
|
"eval_loss": 0.26816263794898987,
|
||
|
|
"eval_runtime": 29.696,
|
||
|
|
"eval_samples_per_second": 17.713,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9156650814238363,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8415879597634953,
|
||
|
|
"grad_norm": 0.8753883242607117,
|
||
|
|
"learning_rate": 8.176786815724601e-06,
|
||
|
|
"loss": 0.26745316982269285,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 685,
|
||
|
|
"token_acc": 0.9049404582454025,
|
||
|
|
"train_speed(iter/s)": 0.122971
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.847730937571988,
|
||
|
|
"grad_norm": 0.699246346950531,
|
||
|
|
"learning_rate": 8.151854405544526e-06,
|
||
|
|
"loss": 0.2602883815765381,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 690,
|
||
|
|
"token_acc": 0.9025235288033923,
|
||
|
|
"train_speed(iter/s)": 0.12319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8538739153804807,
|
||
|
|
"grad_norm": 0.6880396604537964,
|
||
|
|
"learning_rate": 8.12679126369713e-06,
|
||
|
|
"loss": 0.27959246635437013,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 695,
|
||
|
|
"token_acc": 0.9131617782696919,
|
||
|
|
"train_speed(iter/s)": 0.123417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8600168931889733,
|
||
|
|
"grad_norm": 0.847633421421051,
|
||
|
|
"learning_rate": 8.101598429743862e-06,
|
||
|
|
"loss": 0.2776790142059326,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 700,
|
||
|
|
"token_acc": 0.9208461614857484,
|
||
|
|
"train_speed(iter/s)": 0.123628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8600168931889733,
|
||
|
|
"eval_loss": 0.26767873764038086,
|
||
|
|
"eval_runtime": 29.7471,
|
||
|
|
"eval_samples_per_second": 17.682,
|
||
|
|
"eval_steps_per_second": 4.437,
|
||
|
|
"eval_token_acc": 0.9162271220636979,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.866159870997466,
|
||
|
|
"grad_norm": 0.796385645866394,
|
||
|
|
"learning_rate": 8.076276948625495e-06,
|
||
|
|
"loss": 0.2519699573516846,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 705,
|
||
|
|
"token_acc": 0.9091483105121683,
|
||
|
|
"train_speed(iter/s)": 0.123011
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8723028488059587,
|
||
|
|
"grad_norm": 0.7697290182113647,
|
||
|
|
"learning_rate": 8.050827870618795e-06,
|
||
|
|
"loss": 0.2222222328186035,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 710,
|
||
|
|
"token_acc": 0.9183491244605387,
|
||
|
|
"train_speed(iter/s)": 0.123243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8784458266144514,
|
||
|
|
"grad_norm": 0.7783083319664001,
|
||
|
|
"learning_rate": 8.02525225129295e-06,
|
||
|
|
"loss": 0.2620779752731323,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 715,
|
||
|
|
"token_acc": 0.9140030018344544,
|
||
|
|
"train_speed(iter/s)": 0.123471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8845888044229441,
|
||
|
|
"grad_norm": 0.8889957666397095,
|
||
|
|
"learning_rate": 7.999551151465793e-06,
|
||
|
|
"loss": 0.2590866327285767,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 720,
|
||
|
|
"token_acc": 0.9240682856455879,
|
||
|
|
"train_speed(iter/s)": 0.123679
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8845888044229441,
|
||
|
|
"eval_loss": 0.2673368752002716,
|
||
|
|
"eval_runtime": 29.7041,
|
||
|
|
"eval_samples_per_second": 17.708,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9157587548638132,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8907317822314367,
|
||
|
|
"grad_norm": 0.699228048324585,
|
||
|
|
"learning_rate": 7.973725637159795e-06,
|
||
|
|
"loss": 0.24339399337768555,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 725,
|
||
|
|
"token_acc": 0.9063403422456052,
|
||
|
|
"train_speed(iter/s)": 0.123046
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8968747600399294,
|
||
|
|
"grad_norm": 0.8761767745018005,
|
||
|
|
"learning_rate": 7.947776779557862e-06,
|
||
|
|
"loss": 0.22902493476867675,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 730,
|
||
|
|
"token_acc": 0.929031261265901,
|
||
|
|
"train_speed(iter/s)": 0.123239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.903017737848422,
|
||
|
|
"grad_norm": 0.7344342470169067,
|
||
|
|
"learning_rate": 7.921705654958886e-06,
|
||
|
|
"loss": 0.25461578369140625,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 735,
|
||
|
|
"token_acc": 0.910783754344105,
|
||
|
|
"train_speed(iter/s)": 0.123436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9091607156569147,
|
||
|
|
"grad_norm": 0.8068217635154724,
|
||
|
|
"learning_rate": 7.895513344733124e-06,
|
||
|
|
"loss": 0.23727846145629883,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 740,
|
||
|
|
"token_acc": 0.9222560975609756,
|
||
|
|
"train_speed(iter/s)": 0.123624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9091607156569147,
|
||
|
|
"eval_loss": 0.26663488149642944,
|
||
|
|
"eval_runtime": 29.6915,
|
||
|
|
"eval_samples_per_second": 17.715,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.916219916414469,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9153036934654074,
|
||
|
|
"grad_norm": 0.7529241442680359,
|
||
|
|
"learning_rate": 7.869200935277317e-06,
|
||
|
|
"loss": 0.2533961534500122,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 745,
|
||
|
|
"token_acc": 0.9121754667444574,
|
||
|
|
"train_speed(iter/s)": 0.123063
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9214466712739,
|
||
|
|
"grad_norm": 0.8958796858787537,
|
||
|
|
"learning_rate": 7.842769517969665e-06,
|
||
|
|
"loss": 0.2638097286224365,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 750,
|
||
|
|
"token_acc": 0.9133777069466579,
|
||
|
|
"train_speed(iter/s)": 0.123274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9275896490823927,
|
||
|
|
"grad_norm": 0.8491634726524353,
|
||
|
|
"learning_rate": 7.816220189124527e-06,
|
||
|
|
"loss": 0.2510275363922119,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 755,
|
||
|
|
"token_acc": 0.9162462967411322,
|
||
|
|
"train_speed(iter/s)": 0.123478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9337326268908853,
|
||
|
|
"grad_norm": 0.8026111125946045,
|
||
|
|
"learning_rate": 7.789554049946966e-06,
|
||
|
|
"loss": 0.2663265228271484,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 760,
|
||
|
|
"token_acc": 0.9118450459399057,
|
||
|
|
"train_speed(iter/s)": 0.123691
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9337326268908853,
|
||
|
|
"eval_loss": 0.265114426612854,
|
||
|
|
"eval_runtime": 29.6567,
|
||
|
|
"eval_samples_per_second": 17.736,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9164793197867128,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.939875604699378,
|
||
|
|
"grad_norm": 0.7799587845802307,
|
||
|
|
"learning_rate": 7.762772206487066e-06,
|
||
|
|
"loss": 0.2516252756118774,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 765,
|
||
|
|
"token_acc": 0.9153250495227805,
|
||
|
|
"train_speed(iter/s)": 0.123108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9460185825078707,
|
||
|
|
"grad_norm": 0.7860950231552124,
|
||
|
|
"learning_rate": 7.735875769594063e-06,
|
||
|
|
"loss": 0.2252351760864258,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 770,
|
||
|
|
"token_acc": 0.9219084178777077,
|
||
|
|
"train_speed(iter/s)": 0.123281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9521615603163633,
|
||
|
|
"grad_norm": 0.7115477323532104,
|
||
|
|
"learning_rate": 7.70886585487026e-06,
|
||
|
|
"loss": 0.24201133251190185,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 775,
|
||
|
|
"token_acc": 0.9087906037805101,
|
||
|
|
"train_speed(iter/s)": 0.123461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.958304538124856,
|
||
|
|
"grad_norm": 0.7322366833686829,
|
||
|
|
"learning_rate": 7.681743582624761e-06,
|
||
|
|
"loss": 0.24702987670898438,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 780,
|
||
|
|
"token_acc": 0.9231603262150568,
|
||
|
|
"train_speed(iter/s)": 0.123669
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.958304538124856,
|
||
|
|
"eval_loss": 0.2637149393558502,
|
||
|
|
"eval_runtime": 29.6465,
|
||
|
|
"eval_samples_per_second": 17.742,
|
||
|
|
"eval_steps_per_second": 4.452,
|
||
|
|
"eval_token_acc": 0.9160253638852861,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9644475159333487,
|
||
|
|
"grad_norm": 0.8311400413513184,
|
||
|
|
"learning_rate": 7.654510077827003e-06,
|
||
|
|
"loss": 0.26540687084198,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 785,
|
||
|
|
"token_acc": 0.9113840464870576,
|
||
|
|
"train_speed(iter/s)": 0.12312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9705904937418414,
|
||
|
|
"grad_norm": 0.8548195958137512,
|
||
|
|
"learning_rate": 7.627166470060092e-06,
|
||
|
|
"loss": 0.26256117820739744,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 790,
|
||
|
|
"token_acc": 0.9282326450438365,
|
||
|
|
"train_speed(iter/s)": 0.123344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9767334715503341,
|
||
|
|
"grad_norm": 0.822137176990509,
|
||
|
|
"learning_rate": 7.59971389347395e-06,
|
||
|
|
"loss": 0.2492506980895996,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 795,
|
||
|
|
"token_acc": 0.9176904773466712,
|
||
|
|
"train_speed(iter/s)": 0.123523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9828764493588267,
|
||
|
|
"grad_norm": 0.8581626415252686,
|
||
|
|
"learning_rate": 7.572153486738281e-06,
|
||
|
|
"loss": 0.23105947971343993,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 800,
|
||
|
|
"token_acc": 0.9199036089276493,
|
||
|
|
"train_speed(iter/s)": 0.123714
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9828764493588267,
|
||
|
|
"eval_loss": 0.262928307056427,
|
||
|
|
"eval_runtime": 29.686,
|
||
|
|
"eval_samples_per_second": 17.719,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9163784406975068,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9890194271673194,
|
||
|
|
"grad_norm": 0.801688551902771,
|
||
|
|
"learning_rate": 7.544486392995325e-06,
|
||
|
|
"loss": 0.22394142150878907,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 805,
|
||
|
|
"token_acc": 0.9134662867996202,
|
||
|
|
"train_speed(iter/s)": 0.12313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.995162404975812,
|
||
|
|
"grad_norm": 0.769991397857666,
|
||
|
|
"learning_rate": 7.516713759812465e-06,
|
||
|
|
"loss": 0.24088678359985352,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 810,
|
||
|
|
"token_acc": 0.9233073946152885,
|
||
|
|
"train_speed(iter/s)": 0.12331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.002457191123397,
|
||
|
|
"grad_norm": 0.7465444207191467,
|
||
|
|
"learning_rate": 7.4888367391346085e-06,
|
||
|
|
"loss": 0.2843191623687744,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 815,
|
||
|
|
"token_acc": 0.9340755933196602,
|
||
|
|
"train_speed(iter/s)": 0.12343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0086001689318898,
|
||
|
|
"grad_norm": 0.7140413522720337,
|
||
|
|
"learning_rate": 7.460856487236421e-06,
|
||
|
|
"loss": 0.21777431964874266,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 820,
|
||
|
|
"token_acc": 0.9192456915997473,
|
||
|
|
"train_speed(iter/s)": 0.123641
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0086001689318898,
|
||
|
|
"eval_loss": 0.26577290892601013,
|
||
|
|
"eval_runtime": 29.655,
|
||
|
|
"eval_samples_per_second": 17.737,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9173079694480473,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0147431467403825,
|
||
|
|
"grad_norm": 0.8933572769165039,
|
||
|
|
"learning_rate": 7.432774164674359e-06,
|
||
|
|
"loss": 0.18578357696533204,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 825,
|
||
|
|
"token_acc": 0.9199372635852016,
|
||
|
|
"train_speed(iter/s)": 0.123101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0208861245488752,
|
||
|
|
"grad_norm": 0.8847187161445618,
|
||
|
|
"learning_rate": 7.404590936238535e-06,
|
||
|
|
"loss": 0.19721906185150145,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 830,
|
||
|
|
"token_acc": 0.9378547338981656,
|
||
|
|
"train_speed(iter/s)": 0.123344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0270291023573677,
|
||
|
|
"grad_norm": 0.8109038472175598,
|
||
|
|
"learning_rate": 7.376307970904408e-06,
|
||
|
|
"loss": 0.209037446975708,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 835,
|
||
|
|
"token_acc": 0.9270859687294439,
|
||
|
|
"train_speed(iter/s)": 0.123517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0331720801658604,
|
||
|
|
"grad_norm": 0.7887623906135559,
|
||
|
|
"learning_rate": 7.34792644178429e-06,
|
||
|
|
"loss": 0.18892388343811034,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 840,
|
||
|
|
"token_acc": 0.940854053515372,
|
||
|
|
"train_speed(iter/s)": 0.123667
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0331720801658604,
|
||
|
|
"eval_loss": 0.26770132780075073,
|
||
|
|
"eval_runtime": 29.6867,
|
||
|
|
"eval_samples_per_second": 17.718,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9165729932266897,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.039315057974353,
|
||
|
|
"grad_norm": 0.8831640481948853,
|
||
|
|
"learning_rate": 7.319447526078696e-06,
|
||
|
|
"loss": 0.20574064254760743,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 845,
|
||
|
|
"token_acc": 0.9144320335497399,
|
||
|
|
"train_speed(iter/s)": 0.123197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0454580357828458,
|
||
|
|
"grad_norm": 0.7205056548118591,
|
||
|
|
"learning_rate": 7.290872405027508e-06,
|
||
|
|
"loss": 0.1763360857963562,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 850,
|
||
|
|
"token_acc": 0.941142747945729,
|
||
|
|
"train_speed(iter/s)": 0.123361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0516010135913385,
|
||
|
|
"grad_norm": 0.6685133576393127,
|
||
|
|
"learning_rate": 7.262202263860989e-06,
|
||
|
|
"loss": 0.18052310943603517,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 855,
|
||
|
|
"token_acc": 0.9395161290322581,
|
||
|
|
"train_speed(iter/s)": 0.123527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.057743991399831,
|
||
|
|
"grad_norm": 0.6980032920837402,
|
||
|
|
"learning_rate": 7.233438291750615e-06,
|
||
|
|
"loss": 0.2039564609527588,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 860,
|
||
|
|
"token_acc": 0.9359017096052193,
|
||
|
|
"train_speed(iter/s)": 0.123688
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.057743991399831,
|
||
|
|
"eval_loss": 0.2684628963470459,
|
||
|
|
"eval_runtime": 29.667,
|
||
|
|
"eval_samples_per_second": 17.73,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9164360858913388,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0638869692083237,
|
||
|
|
"grad_norm": 0.9002748727798462,
|
||
|
|
"learning_rate": 7.204581681759752e-06,
|
||
|
|
"loss": 0.2119807004928589,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 865,
|
||
|
|
"token_acc": 0.9171671861932639,
|
||
|
|
"train_speed(iter/s)": 0.123189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0700299470168164,
|
||
|
|
"grad_norm": 0.7583820819854736,
|
||
|
|
"learning_rate": 7.175633630794176e-06,
|
||
|
|
"loss": 0.20298078060150146,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 870,
|
||
|
|
"token_acc": 0.938098510882016,
|
||
|
|
"train_speed(iter/s)": 0.123367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0761729248253091,
|
||
|
|
"grad_norm": 0.8472638726234436,
|
||
|
|
"learning_rate": 7.146595339552423e-06,
|
||
|
|
"loss": 0.19818198680877686,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 875,
|
||
|
|
"token_acc": 0.937833543813908,
|
||
|
|
"train_speed(iter/s)": 0.123568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0823159026338018,
|
||
|
|
"grad_norm": 0.8150883913040161,
|
||
|
|
"learning_rate": 7.1174680124759856e-06,
|
||
|
|
"loss": 0.17738423347473145,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 880,
|
||
|
|
"token_acc": 0.9374801246581441,
|
||
|
|
"train_speed(iter/s)": 0.123737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0823159026338018,
|
||
|
|
"eval_loss": 0.26935601234436035,
|
||
|
|
"eval_runtime": 29.5753,
|
||
|
|
"eval_samples_per_second": 17.785,
|
||
|
|
"eval_steps_per_second": 4.463,
|
||
|
|
"eval_token_acc": 0.916198299466782,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0884588804422943,
|
||
|
|
"grad_norm": 0.7381166815757751,
|
||
|
|
"learning_rate": 7.08825285769936e-06,
|
||
|
|
"loss": 0.18000258207321168,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 885,
|
||
|
|
"token_acc": 0.9197572365671506,
|
||
|
|
"train_speed(iter/s)": 0.12326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.094601858250787,
|
||
|
|
"grad_norm": 0.7191819548606873,
|
||
|
|
"learning_rate": 7.058951086999934e-06,
|
||
|
|
"loss": 0.17380096912384033,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 890,
|
||
|
|
"token_acc": 0.9333728639965501,
|
||
|
|
"train_speed(iter/s)": 0.123399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1007448360592798,
|
||
|
|
"grad_norm": 0.8915813565254211,
|
||
|
|
"learning_rate": 7.029563915747723e-06,
|
||
|
|
"loss": 0.18771791458129883,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 895,
|
||
|
|
"token_acc": 0.9397812810680302,
|
||
|
|
"train_speed(iter/s)": 0.123569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1068878138677725,
|
||
|
|
"grad_norm": 0.7277628183364868,
|
||
|
|
"learning_rate": 7.0000925628549595e-06,
|
||
|
|
"loss": 0.20253748893737794,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 900,
|
||
|
|
"token_acc": 0.9236840782263335,
|
||
|
|
"train_speed(iter/s)": 0.123737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1068878138677725,
|
||
|
|
"eval_loss": 0.2681773900985718,
|
||
|
|
"eval_runtime": 29.6364,
|
||
|
|
"eval_samples_per_second": 17.748,
|
||
|
|
"eval_steps_per_second": 4.454,
|
||
|
|
"eval_token_acc": 0.916162271220637,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1130307916762652,
|
||
|
|
"grad_norm": 0.9438680410385132,
|
||
|
|
"learning_rate": 6.9705382507255405e-06,
|
||
|
|
"loss": 0.18554757833480834,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 905,
|
||
|
|
"token_acc": 0.9169849491620015,
|
||
|
|
"train_speed(iter/s)": 0.123281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1191737694847577,
|
||
|
|
"grad_norm": 0.8167145252227783,
|
||
|
|
"learning_rate": 6.940902205204321e-06,
|
||
|
|
"loss": 0.19586331844329835,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 910,
|
||
|
|
"token_acc": 0.935408560311284,
|
||
|
|
"train_speed(iter/s)": 0.123439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1253167472932504,
|
||
|
|
"grad_norm": 0.7680448889732361,
|
||
|
|
"learning_rate": 6.911185655526263e-06,
|
||
|
|
"loss": 0.2027712345123291,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 915,
|
||
|
|
"token_acc": 0.9350154026697961,
|
||
|
|
"train_speed(iter/s)": 0.123617
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.131459725101743,
|
||
|
|
"grad_norm": 0.7472254037857056,
|
||
|
|
"learning_rate": 6.881389834265463e-06,
|
||
|
|
"loss": 0.20426957607269286,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 920,
|
||
|
|
"token_acc": 0.9317039744175423,
|
||
|
|
"train_speed(iter/s)": 0.123791
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.131459725101743,
|
||
|
|
"eval_loss": 0.26660415530204773,
|
||
|
|
"eval_runtime": 29.6749,
|
||
|
|
"eval_samples_per_second": 17.725,
|
||
|
|
"eval_steps_per_second": 4.448,
|
||
|
|
"eval_token_acc": 0.9159677186914541,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1376027029102358,
|
||
|
|
"grad_norm": 0.7990177273750305,
|
||
|
|
"learning_rate": 6.851515977284014e-06,
|
||
|
|
"loss": 0.17569031715393066,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 925,
|
||
|
|
"token_acc": 0.9180286145399676,
|
||
|
|
"train_speed(iter/s)": 0.123301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1437456807187285,
|
||
|
|
"grad_norm": 0.7919936776161194,
|
||
|
|
"learning_rate": 6.821565323680759e-06,
|
||
|
|
"loss": 0.18091797828674316,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 930,
|
||
|
|
"token_acc": 0.9357635368079497,
|
||
|
|
"train_speed(iter/s)": 0.123449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.149888658527221,
|
||
|
|
"grad_norm": 0.7052066922187805,
|
||
|
|
"learning_rate": 6.791539115739879e-06,
|
||
|
|
"loss": 0.20484356880187987,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 935,
|
||
|
|
"token_acc": 0.9249510662408571,
|
||
|
|
"train_speed(iter/s)": 0.123648
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1560316363357137,
|
||
|
|
"grad_norm": 0.7116119861602783,
|
||
|
|
"learning_rate": 6.761438598879383e-06,
|
||
|
|
"loss": 0.18515671491622926,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 940,
|
||
|
|
"token_acc": 0.9454207808678322,
|
||
|
|
"train_speed(iter/s)": 0.123794
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1560316363357137,
|
||
|
|
"eval_loss": 0.265466570854187,
|
||
|
|
"eval_runtime": 29.6874,
|
||
|
|
"eval_samples_per_second": 17.718,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9163063842052169,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1621746141442064,
|
||
|
|
"grad_norm": 0.8001610040664673,
|
||
|
|
"learning_rate": 6.731265021599437e-06,
|
||
|
|
"loss": 0.2151487112045288,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 945,
|
||
|
|
"token_acc": 0.9195844345210973,
|
||
|
|
"train_speed(iter/s)": 0.123335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1683175919526991,
|
||
|
|
"grad_norm": 0.703551709651947,
|
||
|
|
"learning_rate": 6.7010196354305876e-06,
|
||
|
|
"loss": 0.188127601146698,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 950,
|
||
|
|
"token_acc": 0.9344865159357123,
|
||
|
|
"train_speed(iter/s)": 0.123485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1744605697611918,
|
||
|
|
"grad_norm": 0.7591283917427063,
|
||
|
|
"learning_rate": 6.670703694881851e-06,
|
||
|
|
"loss": 0.19852180480957032,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 955,
|
||
|
|
"token_acc": 0.9347434962314612,
|
||
|
|
"train_speed(iter/s)": 0.12365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1806035475696843,
|
||
|
|
"grad_norm": 0.8347094058990479,
|
||
|
|
"learning_rate": 6.640318457388672e-06,
|
||
|
|
"loss": 0.1904957413673401,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 960,
|
||
|
|
"token_acc": 0.9374376643394686,
|
||
|
|
"train_speed(iter/s)": 0.123801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1806035475696843,
|
||
|
|
"eval_loss": 0.2668422758579254,
|
||
|
|
"eval_runtime": 29.65,
|
||
|
|
"eval_samples_per_second": 17.74,
|
||
|
|
"eval_steps_per_second": 4.452,
|
||
|
|
"eval_token_acc": 0.9165946101743767,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.186746525378177,
|
||
|
|
"grad_norm": 0.8114694952964783,
|
||
|
|
"learning_rate": 6.609865183260777e-06,
|
||
|
|
"loss": 0.19182581901550294,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 965,
|
||
|
|
"token_acc": 0.9153690632426489,
|
||
|
|
"train_speed(iter/s)": 0.123322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1928895031866698,
|
||
|
|
"grad_norm": 0.8866662979125977,
|
||
|
|
"learning_rate": 6.579345135629896e-06,
|
||
|
|
"loss": 0.19811842441558838,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 970,
|
||
|
|
"token_acc": 0.9233720292959992,
|
||
|
|
"train_speed(iter/s)": 0.123477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1990324809951625,
|
||
|
|
"grad_norm": 0.7779002785682678,
|
||
|
|
"learning_rate": 6.548759580397363e-06,
|
||
|
|
"loss": 0.20236413478851317,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 975,
|
||
|
|
"token_acc": 0.9268359567816596,
|
||
|
|
"train_speed(iter/s)": 0.123623
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2051754588036552,
|
||
|
|
"grad_norm": 0.6543186902999878,
|
||
|
|
"learning_rate": 6.518109786181628e-06,
|
||
|
|
"loss": 0.19884101152420045,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 980,
|
||
|
|
"token_acc": 0.9361254541977434,
|
||
|
|
"train_speed(iter/s)": 0.123766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2051754588036552,
|
||
|
|
"eval_loss": 0.26707014441490173,
|
||
|
|
"eval_runtime": 29.6219,
|
||
|
|
"eval_samples_per_second": 17.757,
|
||
|
|
"eval_steps_per_second": 4.456,
|
||
|
|
"eval_token_acc": 0.9165801988759187,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2113184366121477,
|
||
|
|
"grad_norm": 0.7516761422157288,
|
||
|
|
"learning_rate": 6.487397024265616e-06,
|
||
|
|
"loss": 0.2052464008331299,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 985,
|
||
|
|
"token_acc": 0.9144311222289313,
|
||
|
|
"train_speed(iter/s)": 0.123306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2174614144206404,
|
||
|
|
"grad_norm": 0.7994732856750488,
|
||
|
|
"learning_rate": 6.456622568544012e-06,
|
||
|
|
"loss": 0.19984896183013917,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 990,
|
||
|
|
"token_acc": 0.9331774440147496,
|
||
|
|
"train_speed(iter/s)": 0.123471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.223604392229133,
|
||
|
|
"grad_norm": 0.8212082386016846,
|
||
|
|
"learning_rate": 6.425787695470419e-06,
|
||
|
|
"loss": 0.19194519519805908,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 995,
|
||
|
|
"token_acc": 0.9230769230769231,
|
||
|
|
"train_speed(iter/s)": 0.123636
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2297473700376258,
|
||
|
|
"grad_norm": 0.7439980506896973,
|
||
|
|
"learning_rate": 6.3948936840044096e-06,
|
||
|
|
"loss": 0.20161755084991456,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1000,
|
||
|
|
"token_acc": 0.9443154490422091,
|
||
|
|
"train_speed(iter/s)": 0.123777
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2297473700376258,
|
||
|
|
"eval_loss": 0.26448243856430054,
|
||
|
|
"eval_runtime": 29.656,
|
||
|
|
"eval_samples_per_second": 17.737,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9172719412019023,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2358903478461185,
|
||
|
|
"grad_norm": 0.9987765550613403,
|
||
|
|
"learning_rate": 6.363941815558484e-06,
|
||
|
|
"loss": 0.19967958927154542,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1005,
|
||
|
|
"token_acc": 0.9190461073035912,
|
||
|
|
"train_speed(iter/s)": 0.12333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.242033325654611,
|
||
|
|
"grad_norm": 0.8285346627235413,
|
||
|
|
"learning_rate": 6.332933373944914e-06,
|
||
|
|
"loss": 0.19167766571044922,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1010,
|
||
|
|
"token_acc": 0.9248888888888889,
|
||
|
|
"train_speed(iter/s)": 0.123459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2481763034631037,
|
||
|
|
"grad_norm": 0.8778380751609802,
|
||
|
|
"learning_rate": 6.301869645322498e-06,
|
||
|
|
"loss": 0.20817289352416993,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1015,
|
||
|
|
"token_acc": 0.9210802145631667,
|
||
|
|
"train_speed(iter/s)": 0.123619
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2543192812715964,
|
||
|
|
"grad_norm": 0.764149010181427,
|
||
|
|
"learning_rate": 6.270751918143213e-06,
|
||
|
|
"loss": 0.2000873565673828,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1020,
|
||
|
|
"token_acc": 0.9338627474220694,
|
||
|
|
"train_speed(iter/s)": 0.123776
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2543192812715964,
|
||
|
|
"eval_loss": 0.2691567838191986,
|
||
|
|
"eval_runtime": 29.7546,
|
||
|
|
"eval_samples_per_second": 17.678,
|
||
|
|
"eval_steps_per_second": 4.436,
|
||
|
|
"eval_token_acc": 0.9163496181005909,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2604622590800891,
|
||
|
|
"grad_norm": 0.8174912929534912,
|
||
|
|
"learning_rate": 6.239581483098767e-06,
|
||
|
|
"loss": 0.19734174013137817,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1025,
|
||
|
|
"token_acc": 0.9185803052816426,
|
||
|
|
"train_speed(iter/s)": 0.123299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2666052368885818,
|
||
|
|
"grad_norm": 0.8845574259757996,
|
||
|
|
"learning_rate": 6.208359633067077e-06,
|
||
|
|
"loss": 0.18390114307403566,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1030,
|
||
|
|
"token_acc": 0.9341327407655864,
|
||
|
|
"train_speed(iter/s)": 0.123442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2727482146970743,
|
||
|
|
"grad_norm": 0.9409281015396118,
|
||
|
|
"learning_rate": 6.177087663058626e-06,
|
||
|
|
"loss": 0.20083985328674317,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1035,
|
||
|
|
"token_acc": 0.9444893687865671,
|
||
|
|
"train_speed(iter/s)": 0.123585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.278891192505567,
|
||
|
|
"grad_norm": 0.7851516604423523,
|
||
|
|
"learning_rate": 6.145766870162767e-06,
|
||
|
|
"loss": 0.21141374111175537,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1040,
|
||
|
|
"token_acc": 0.9148654159869495,
|
||
|
|
"train_speed(iter/s)": 0.123721
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.278891192505567,
|
||
|
|
"eval_loss": 0.2667810916900635,
|
||
|
|
"eval_runtime": 29.6926,
|
||
|
|
"eval_samples_per_second": 17.715,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9169765095835135,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2850341703140598,
|
||
|
|
"grad_norm": 0.9521942138671875,
|
||
|
|
"learning_rate": 6.114398553493909e-06,
|
||
|
|
"loss": 0.1960476517677307,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1045,
|
||
|
|
"token_acc": 0.9178016461816706,
|
||
|
|
"train_speed(iter/s)": 0.123266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2911771481225525,
|
||
|
|
"grad_norm": 0.8174967765808105,
|
||
|
|
"learning_rate": 6.0829840141376385e-06,
|
||
|
|
"loss": 0.20267832279205322,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1050,
|
||
|
|
"token_acc": 0.9279712548369264,
|
||
|
|
"train_speed(iter/s)": 0.123426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2973201259310452,
|
||
|
|
"grad_norm": 0.8733800053596497,
|
||
|
|
"learning_rate": 6.051524555096754e-06,
|
||
|
|
"loss": 0.18992329835891725,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1055,
|
||
|
|
"token_acc": 0.9288329960489544,
|
||
|
|
"train_speed(iter/s)": 0.123584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3034631037395377,
|
||
|
|
"grad_norm": 0.8741737008094788,
|
||
|
|
"learning_rate": 6.020021481237216e-06,
|
||
|
|
"loss": 0.2002291202545166,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1060,
|
||
|
|
"token_acc": 0.9348597405477326,
|
||
|
|
"train_speed(iter/s)": 0.123729
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3034631037395377,
|
||
|
|
"eval_loss": 0.26571905612945557,
|
||
|
|
"eval_runtime": 29.6815,
|
||
|
|
"eval_samples_per_second": 17.721,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9168756304943075,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3096060815480304,
|
||
|
|
"grad_norm": 0.7780086398124695,
|
||
|
|
"learning_rate": 5.988476099234033e-06,
|
||
|
|
"loss": 0.20159559249877929,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1065,
|
||
|
|
"token_acc": 0.9135952477386257,
|
||
|
|
"train_speed(iter/s)": 0.123364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.315749059356523,
|
||
|
|
"grad_norm": 0.7431135177612305,
|
||
|
|
"learning_rate": 5.956889717517053e-06,
|
||
|
|
"loss": 0.1894887328147888,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1070,
|
||
|
|
"token_acc": 0.9296808409887299,
|
||
|
|
"train_speed(iter/s)": 0.12348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3218920371650158,
|
||
|
|
"grad_norm": 0.72322016954422,
|
||
|
|
"learning_rate": 5.925263646216697e-06,
|
||
|
|
"loss": 0.17778899669647216,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1075,
|
||
|
|
"token_acc": 0.9421357447673238,
|
||
|
|
"train_speed(iter/s)": 0.123618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3280350149735085,
|
||
|
|
"grad_norm": 0.9004871249198914,
|
||
|
|
"learning_rate": 5.893599197109625e-06,
|
||
|
|
"loss": 0.1900892972946167,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1080,
|
||
|
|
"token_acc": 0.9286453541858326,
|
||
|
|
"train_speed(iter/s)": 0.123756
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3280350149735085,
|
||
|
|
"eval_loss": 0.26534271240234375,
|
||
|
|
"eval_runtime": 29.6233,
|
||
|
|
"eval_samples_per_second": 17.756,
|
||
|
|
"eval_steps_per_second": 4.456,
|
||
|
|
"eval_token_acc": 0.9171062112696354,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.334177992782001,
|
||
|
|
"grad_norm": 0.657845675945282,
|
||
|
|
"learning_rate": 5.861897683564313e-06,
|
||
|
|
"loss": 0.18198509216308595,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1085,
|
||
|
|
"token_acc": 0.9204705963413373,
|
||
|
|
"train_speed(iter/s)": 0.123316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3403209705904937,
|
||
|
|
"grad_norm": 0.8465989828109741,
|
||
|
|
"learning_rate": 5.830160420486588e-06,
|
||
|
|
"loss": 0.2053920269012451,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1090,
|
||
|
|
"token_acc": 0.9380833375835161,
|
||
|
|
"train_speed(iter/s)": 0.123479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3464639483989864,
|
||
|
|
"grad_norm": 0.7130780816078186,
|
||
|
|
"learning_rate": 5.798388724265085e-06,
|
||
|
|
"loss": 0.18327146768569946,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1095,
|
||
|
|
"token_acc": 0.9395675675675675,
|
||
|
|
"train_speed(iter/s)": 0.123624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3526069262074791,
|
||
|
|
"grad_norm": 0.800010085105896,
|
||
|
|
"learning_rate": 5.7665839127166475e-06,
|
||
|
|
"loss": 0.18803975582122803,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1100,
|
||
|
|
"token_acc": 0.9407923378319547,
|
||
|
|
"train_speed(iter/s)": 0.123735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3526069262074791,
|
||
|
|
"eval_loss": 0.26394224166870117,
|
||
|
|
"eval_runtime": 29.7389,
|
||
|
|
"eval_samples_per_second": 17.687,
|
||
|
|
"eval_steps_per_second": 4.439,
|
||
|
|
"eval_token_acc": 0.9168684248450786,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3587499040159718,
|
||
|
|
"grad_norm": 0.8300402164459229,
|
||
|
|
"learning_rate": 5.734747305031664e-06,
|
||
|
|
"loss": 0.21199843883514405,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1105,
|
||
|
|
"token_acc": 0.9151312319249167,
|
||
|
|
"train_speed(iter/s)": 0.123328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3648928818244643,
|
||
|
|
"grad_norm": 0.7512418627738953,
|
||
|
|
"learning_rate": 5.7028802217193565e-06,
|
||
|
|
"loss": 0.18927464485168458,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1110,
|
||
|
|
"token_acc": 0.9413873811065187,
|
||
|
|
"train_speed(iter/s)": 0.123452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.371035859632957,
|
||
|
|
"grad_norm": 0.8453025221824646,
|
||
|
|
"learning_rate": 5.670983984553003e-06,
|
||
|
|
"loss": 0.20298895835876465,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1115,
|
||
|
|
"token_acc": 0.9409311022678237,
|
||
|
|
"train_speed(iter/s)": 0.123572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3771788374414498,
|
||
|
|
"grad_norm": 0.8611200451850891,
|
||
|
|
"learning_rate": 5.63905991651512e-06,
|
||
|
|
"loss": 0.1865471839904785,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1120,
|
||
|
|
"token_acc": 0.9321085791674028,
|
||
|
|
"train_speed(iter/s)": 0.123686
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3771788374414498,
|
||
|
|
"eval_loss": 0.2642819583415985,
|
||
|
|
"eval_runtime": 29.6704,
|
||
|
|
"eval_samples_per_second": 17.728,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9167243118604986,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3833218152499425,
|
||
|
|
"grad_norm": 0.7028310298919678,
|
||
|
|
"learning_rate": 5.607109341742579e-06,
|
||
|
|
"loss": 0.1868009090423584,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1125,
|
||
|
|
"token_acc": 0.9161407676887668,
|
||
|
|
"train_speed(iter/s)": 0.123281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3894647930584352,
|
||
|
|
"grad_norm": 0.7893259525299072,
|
||
|
|
"learning_rate": 5.575133585471697e-06,
|
||
|
|
"loss": 0.18891712427139282,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1130,
|
||
|
|
"token_acc": 0.9357982673267327,
|
||
|
|
"train_speed(iter/s)": 0.123383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3956077708669277,
|
||
|
|
"grad_norm": 0.8665569424629211,
|
||
|
|
"learning_rate": 5.543133973983254e-06,
|
||
|
|
"loss": 0.18693907260894777,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1135,
|
||
|
|
"token_acc": 0.9333289413004809,
|
||
|
|
"train_speed(iter/s)": 0.123515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4017507486754204,
|
||
|
|
"grad_norm": 0.7769924998283386,
|
||
|
|
"learning_rate": 5.511111834547496e-06,
|
||
|
|
"loss": 0.18896095752716063,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1140,
|
||
|
|
"token_acc": 0.9316990440949738,
|
||
|
|
"train_speed(iter/s)": 0.123635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4017507486754204,
|
||
|
|
"eval_loss": 0.2637878358364105,
|
||
|
|
"eval_runtime": 29.598,
|
||
|
|
"eval_samples_per_second": 17.771,
|
||
|
|
"eval_steps_per_second": 4.46,
|
||
|
|
"eval_token_acc": 0.9168612191958495,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.407893726483913,
|
||
|
|
"grad_norm": 0.7512997388839722,
|
||
|
|
"learning_rate": 5.479068495369071e-06,
|
||
|
|
"loss": 0.17823780775070192,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1145,
|
||
|
|
"token_acc": 0.9169221157037702,
|
||
|
|
"train_speed(iter/s)": 0.123233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4140367042924058,
|
||
|
|
"grad_norm": 0.7769235372543335,
|
||
|
|
"learning_rate": 5.447005285531948e-06,
|
||
|
|
"loss": 0.18888635635375978,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1150,
|
||
|
|
"token_acc": 0.9348544111255975,
|
||
|
|
"train_speed(iter/s)": 0.123343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4201796821008985,
|
||
|
|
"grad_norm": 0.6804760694503784,
|
||
|
|
"learning_rate": 5.414923534944283e-06,
|
||
|
|
"loss": 0.19998799562454223,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1155,
|
||
|
|
"token_acc": 0.927479002131127,
|
||
|
|
"train_speed(iter/s)": 0.12347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.426322659909391,
|
||
|
|
"grad_norm": 0.8411586284637451,
|
||
|
|
"learning_rate": 5.38282457428326e-06,
|
||
|
|
"loss": 0.18058542013168336,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1160,
|
||
|
|
"token_acc": 0.9393255256102724,
|
||
|
|
"train_speed(iter/s)": 0.123592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.426322659909391,
|
||
|
|
"eval_loss": 0.26423749327659607,
|
||
|
|
"eval_runtime": 29.6214,
|
||
|
|
"eval_samples_per_second": 17.757,
|
||
|
|
"eval_steps_per_second": 4.456,
|
||
|
|
"eval_token_acc": 0.9165369649805447,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4324656377178837,
|
||
|
|
"grad_norm": 0.9211987257003784,
|
||
|
|
"learning_rate": 5.350709734939898e-06,
|
||
|
|
"loss": 0.19590919017791747,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1165,
|
||
|
|
"token_acc": 0.9181291791405984,
|
||
|
|
"train_speed(iter/s)": 0.123222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4386086155263764,
|
||
|
|
"grad_norm": 0.8355826735496521,
|
||
|
|
"learning_rate": 5.318580348963826e-06,
|
||
|
|
"loss": 0.18302634954452515,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1170,
|
||
|
|
"token_acc": 0.940089028541503,
|
||
|
|
"train_speed(iter/s)": 0.123344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4447515933348691,
|
||
|
|
"grad_norm": 0.8388906717300415,
|
||
|
|
"learning_rate": 5.286437749008031e-06,
|
||
|
|
"loss": 0.19118983745574952,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1175,
|
||
|
|
"token_acc": 0.934627927660836,
|
||
|
|
"train_speed(iter/s)": 0.123476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4508945711433618,
|
||
|
|
"grad_norm": 0.9128098487854004,
|
||
|
|
"learning_rate": 5.2542832682735956e-06,
|
||
|
|
"loss": 0.20061683654785156,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1180,
|
||
|
|
"token_acc": 0.9325906344410876,
|
||
|
|
"train_speed(iter/s)": 0.123611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4508945711433618,
|
||
|
|
"eval_loss": 0.26307445764541626,
|
||
|
|
"eval_runtime": 29.7678,
|
||
|
|
"eval_samples_per_second": 17.67,
|
||
|
|
"eval_steps_per_second": 4.434,
|
||
|
|
"eval_token_acc": 0.9170125378296584,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4570375489518543,
|
||
|
|
"grad_norm": 0.7825379371643066,
|
||
|
|
"learning_rate": 5.222118240454376e-06,
|
||
|
|
"loss": 0.19818990230560302,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1185,
|
||
|
|
"token_acc": 0.9137893551001047,
|
||
|
|
"train_speed(iter/s)": 0.123207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.463180526760347,
|
||
|
|
"grad_norm": 0.748515784740448,
|
||
|
|
"learning_rate": 5.18994399968171e-06,
|
||
|
|
"loss": 0.19512221813201905,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1190,
|
||
|
|
"token_acc": 0.9283811949976841,
|
||
|
|
"train_speed(iter/s)": 0.123314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4693235045688398,
|
||
|
|
"grad_norm": 0.9209436178207397,
|
||
|
|
"learning_rate": 5.157761880469058e-06,
|
||
|
|
"loss": 0.19263048171997071,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1195,
|
||
|
|
"token_acc": 0.9410712406608439,
|
||
|
|
"train_speed(iter/s)": 0.123435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4754664823773325,
|
||
|
|
"grad_norm": 0.8622936606407166,
|
||
|
|
"learning_rate": 5.125573217656664e-06,
|
||
|
|
"loss": 0.1777910351753235,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1200,
|
||
|
|
"token_acc": 0.938923185912357,
|
||
|
|
"train_speed(iter/s)": 0.123542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4754664823773325,
|
||
|
|
"eval_loss": 0.26356378197669983,
|
||
|
|
"eval_runtime": 29.7055,
|
||
|
|
"eval_samples_per_second": 17.707,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9174160541864822,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4816094601858252,
|
||
|
|
"grad_norm": 0.8892216086387634,
|
||
|
|
"learning_rate": 5.0933793463561855e-06,
|
||
|
|
"loss": 0.189991557598114,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1205,
|
||
|
|
"token_acc": 0.9184802373432864,
|
||
|
|
"train_speed(iter/s)": 0.12315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4877524379943177,
|
||
|
|
"grad_norm": 0.7723336219787598,
|
||
|
|
"learning_rate": 5.061181601895317e-06,
|
||
|
|
"loss": 0.19531933069229127,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1210,
|
||
|
|
"token_acc": 0.9326246228990087,
|
||
|
|
"train_speed(iter/s)": 0.123271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4938954158028104,
|
||
|
|
"grad_norm": 0.8178744912147522,
|
||
|
|
"learning_rate": 5.028981319762399e-06,
|
||
|
|
"loss": 0.19836077690124512,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1215,
|
||
|
|
"token_acc": 0.9317647058823529,
|
||
|
|
"train_speed(iter/s)": 0.123399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.500038393611303,
|
||
|
|
"grad_norm": 0.8365611433982849,
|
||
|
|
"learning_rate": 4.996779835551035e-06,
|
||
|
|
"loss": 0.17670562267303466,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1220,
|
||
|
|
"token_acc": 0.9215732593161283,
|
||
|
|
"train_speed(iter/s)": 0.123517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.500038393611303,
|
||
|
|
"eval_loss": 0.26140737533569336,
|
||
|
|
"eval_runtime": 29.7037,
|
||
|
|
"eval_samples_per_second": 17.708,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9175601671710621,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5061813714197958,
|
||
|
|
"grad_norm": 0.9357166290283203,
|
||
|
|
"learning_rate": 4.964578484904679e-06,
|
||
|
|
"loss": 0.19990785121917726,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1225,
|
||
|
|
"token_acc": 0.9138523956723339,
|
||
|
|
"train_speed(iter/s)": 0.123153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5123243492282885,
|
||
|
|
"grad_norm": 0.7626463174819946,
|
||
|
|
"learning_rate": 4.932378603461253e-06,
|
||
|
|
"loss": 0.17721318006515502,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1230,
|
||
|
|
"token_acc": 0.9358071645166264,
|
||
|
|
"train_speed(iter/s)": 0.123279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.518467327036781,
|
||
|
|
"grad_norm": 0.7975868582725525,
|
||
|
|
"learning_rate": 4.900181526797737e-06,
|
||
|
|
"loss": 0.18672944307327272,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1235,
|
||
|
|
"token_acc": 0.9301695649818517,
|
||
|
|
"train_speed(iter/s)": 0.123379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5246103048452737,
|
||
|
|
"grad_norm": 0.9394721984863281,
|
||
|
|
"learning_rate": 4.867988590374777e-06,
|
||
|
|
"loss": 0.21254873275756836,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1240,
|
||
|
|
"token_acc": 0.9301865980329075,
|
||
|
|
"train_speed(iter/s)": 0.12352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5246103048452737,
|
||
|
|
"eval_loss": 0.2631884217262268,
|
||
|
|
"eval_runtime": 29.7103,
|
||
|
|
"eval_samples_per_second": 17.704,
|
||
|
|
"eval_steps_per_second": 4.443,
|
||
|
|
"eval_token_acc": 0.9173656146418793,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5307532826537664,
|
||
|
|
"grad_norm": 0.8997290134429932,
|
||
|
|
"learning_rate": 4.835801129481287e-06,
|
||
|
|
"loss": 0.17868154048919677,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1245,
|
||
|
|
"token_acc": 0.9191156488994138,
|
||
|
|
"train_speed(iter/s)": 0.123159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5368962604622591,
|
||
|
|
"grad_norm": 0.9004043340682983,
|
||
|
|
"learning_rate": 4.803620479179071e-06,
|
||
|
|
"loss": 0.2074437618255615,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1250,
|
||
|
|
"token_acc": 0.9450173238739482,
|
||
|
|
"train_speed(iter/s)": 0.123271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5430392382707518,
|
||
|
|
"grad_norm": 0.7723172307014465,
|
||
|
|
"learning_rate": 4.771447974247449e-06,
|
||
|
|
"loss": 0.1962502956390381,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1255,
|
||
|
|
"token_acc": 0.9253724029792239,
|
||
|
|
"train_speed(iter/s)": 0.123413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5491822160792443,
|
||
|
|
"grad_norm": 0.8553282618522644,
|
||
|
|
"learning_rate": 4.7392849491278825e-06,
|
||
|
|
"loss": 0.18894779682159424,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1260,
|
||
|
|
"token_acc": 0.9323178471693323,
|
||
|
|
"train_speed(iter/s)": 0.123525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5491822160792443,
|
||
|
|
"eval_loss": 0.26209425926208496,
|
||
|
|
"eval_runtime": 29.6524,
|
||
|
|
"eval_samples_per_second": 17.739,
|
||
|
|
"eval_steps_per_second": 4.452,
|
||
|
|
"eval_token_acc": 0.9178844213863669,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.555325193887737,
|
||
|
|
"grad_norm": 0.886417806148529,
|
||
|
|
"learning_rate": 4.707132737868639e-06,
|
||
|
|
"loss": 0.2006976842880249,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1265,
|
||
|
|
"token_acc": 0.9235658289984614,
|
||
|
|
"train_speed(iter/s)": 0.123184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5614681716962298,
|
||
|
|
"grad_norm": 0.798883318901062,
|
||
|
|
"learning_rate": 4.674992674069445e-06,
|
||
|
|
"loss": 0.17858563661575316,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1270,
|
||
|
|
"token_acc": 0.9344567177637512,
|
||
|
|
"train_speed(iter/s)": 0.123284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5676111495047225,
|
||
|
|
"grad_norm": 0.8193321824073792,
|
||
|
|
"learning_rate": 4.642866090826187e-06,
|
||
|
|
"loss": 0.19088488817214966,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1275,
|
||
|
|
"token_acc": 0.9263077510500191,
|
||
|
|
"train_speed(iter/s)": 0.123357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5737541273132152,
|
||
|
|
"grad_norm": 0.7805312275886536,
|
||
|
|
"learning_rate": 4.610754320675603e-06,
|
||
|
|
"loss": 0.19581155776977538,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1280,
|
||
|
|
"token_acc": 0.9278596416834517,
|
||
|
|
"train_speed(iter/s)": 0.123486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5737541273132152,
|
||
|
|
"eval_loss": 0.26224827766418457,
|
||
|
|
"eval_runtime": 29.7067,
|
||
|
|
"eval_samples_per_second": 17.706,
|
||
|
|
"eval_steps_per_second": 4.443,
|
||
|
|
"eval_token_acc": 0.9181726473555267,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5798971051217077,
|
||
|
|
"grad_norm": 0.8490621447563171,
|
||
|
|
"learning_rate": 4.578658695540018e-06,
|
||
|
|
"loss": 0.2049680233001709,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1285,
|
||
|
|
"token_acc": 0.9203214434630984,
|
||
|
|
"train_speed(iter/s)": 0.123147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5860400829302004,
|
||
|
|
"grad_norm": 0.8018094301223755,
|
||
|
|
"learning_rate": 4.5465805466721e-06,
|
||
|
|
"loss": 0.21368024349212647,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1290,
|
||
|
|
"token_acc": 0.9337745342459544,
|
||
|
|
"train_speed(iter/s)": 0.123269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.592183060738693,
|
||
|
|
"grad_norm": 0.8416959047317505,
|
||
|
|
"learning_rate": 4.514521204599645e-06,
|
||
|
|
"loss": 0.19902560710906983,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1295,
|
||
|
|
"token_acc": 0.9410267803045603,
|
||
|
|
"train_speed(iter/s)": 0.123379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5983260385471858,
|
||
|
|
"grad_norm": 0.7020901441574097,
|
||
|
|
"learning_rate": 4.48248199907038e-06,
|
||
|
|
"loss": 0.1922709822654724,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1300,
|
||
|
|
"token_acc": 0.9301529196433843,
|
||
|
|
"train_speed(iter/s)": 0.123512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5983260385471858,
|
||
|
|
"eval_loss": 0.2603996694087982,
|
||
|
|
"eval_runtime": 29.664,
|
||
|
|
"eval_samples_per_second": 17.732,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.9183167603401067,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6044690163556785,
|
||
|
|
"grad_norm": 0.9669449329376221,
|
||
|
|
"learning_rate": 4.450464258996822e-06,
|
||
|
|
"loss": 0.20973031520843505,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1305,
|
||
|
|
"token_acc": 0.9143080561489166,
|
||
|
|
"train_speed(iter/s)": 0.123187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.610611994164171,
|
||
|
|
"grad_norm": 0.8625094890594482,
|
||
|
|
"learning_rate": 4.418469312401141e-06,
|
||
|
|
"loss": 0.16759986877441407,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1310,
|
||
|
|
"token_acc": 0.9306892935456192,
|
||
|
|
"train_speed(iter/s)": 0.123294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6167549719726637,
|
||
|
|
"grad_norm": 0.8282011151313782,
|
||
|
|
"learning_rate": 4.386498486360095e-06,
|
||
|
|
"loss": 0.20370192527770997,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1315,
|
||
|
|
"token_acc": 0.9212595005428882,
|
||
|
|
"train_speed(iter/s)": 0.123412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6228979497811564,
|
||
|
|
"grad_norm": 0.8530144095420837,
|
||
|
|
"learning_rate": 4.354553106949972e-06,
|
||
|
|
"loss": 0.20181150436401368,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1320,
|
||
|
|
"token_acc": 0.9427324788655577,
|
||
|
|
"train_speed(iter/s)": 0.12352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6228979497811564,
|
||
|
|
"eval_loss": 0.26066553592681885,
|
||
|
|
"eval_runtime": 29.7468,
|
||
|
|
"eval_samples_per_second": 17.683,
|
||
|
|
"eval_steps_per_second": 4.437,
|
||
|
|
"eval_token_acc": 0.9182663207955036,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6290409275896491,
|
||
|
|
"grad_norm": 0.760425329208374,
|
||
|
|
"learning_rate": 4.3226344991915936e-06,
|
||
|
|
"loss": 0.18798611164093018,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1325,
|
||
|
|
"token_acc": 0.9204100274028215,
|
||
|
|
"train_speed(iter/s)": 0.123163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6351839053981418,
|
||
|
|
"grad_norm": 0.8320059776306152,
|
||
|
|
"learning_rate": 4.290743986995353e-06,
|
||
|
|
"loss": 0.20692143440246583,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1330,
|
||
|
|
"token_acc": 0.9179834090460202,
|
||
|
|
"train_speed(iter/s)": 0.123278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6413268832066343,
|
||
|
|
"grad_norm": 0.9049168229103088,
|
||
|
|
"learning_rate": 4.258882893106308e-06,
|
||
|
|
"loss": 0.18184820413589478,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1335,
|
||
|
|
"token_acc": 0.9419431279620853,
|
||
|
|
"train_speed(iter/s)": 0.123369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.647469861015127,
|
||
|
|
"grad_norm": 0.8740628361701965,
|
||
|
|
"learning_rate": 4.227052539049312e-06,
|
||
|
|
"loss": 0.1948437809944153,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1340,
|
||
|
|
"token_acc": 0.9350572326671016,
|
||
|
|
"train_speed(iter/s)": 0.123484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.647469861015127,
|
||
|
|
"eval_loss": 0.2610000967979431,
|
||
|
|
"eval_runtime": 29.6634,
|
||
|
|
"eval_samples_per_second": 17.732,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.9179132439832829,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6536128388236198,
|
||
|
|
"grad_norm": 0.7111175656318665,
|
||
|
|
"learning_rate": 4.195254245074196e-06,
|
||
|
|
"loss": 0.17852287292480468,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1345,
|
||
|
|
"token_acc": 0.9160269612432129,
|
||
|
|
"train_speed(iter/s)": 0.123131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6597558166321125,
|
||
|
|
"grad_norm": 0.7991046905517578,
|
||
|
|
"learning_rate": 4.163489330101017e-06,
|
||
|
|
"loss": 0.1986152410507202,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1350,
|
||
|
|
"token_acc": 0.93740389861614,
|
||
|
|
"train_speed(iter/s)": 0.123232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6658987944406052,
|
||
|
|
"grad_norm": 0.8582780957221985,
|
||
|
|
"learning_rate": 4.131759111665349e-06,
|
||
|
|
"loss": 0.18987109661102294,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1355,
|
||
|
|
"token_acc": 0.9441934490194065,
|
||
|
|
"train_speed(iter/s)": 0.12331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6720417722490977,
|
||
|
|
"grad_norm": 0.6932474970817566,
|
||
|
|
"learning_rate": 4.100064905863628e-06,
|
||
|
|
"loss": 0.19035787582397462,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1360,
|
||
|
|
"token_acc": 0.9174038315725623,
|
||
|
|
"train_speed(iter/s)": 0.123414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6720417722490977,
|
||
|
|
"eval_loss": 0.26117271184921265,
|
||
|
|
"eval_runtime": 29.681,
|
||
|
|
"eval_samples_per_second": 17.722,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9178772157371379,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6781847500575904,
|
||
|
|
"grad_norm": 0.8091769814491272,
|
||
|
|
"learning_rate": 4.068408027298576e-06,
|
||
|
|
"loss": 0.20030708312988282,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1365,
|
||
|
|
"token_acc": 0.921018299777864,
|
||
|
|
"train_speed(iter/s)": 0.123084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.684327727866083,
|
||
|
|
"grad_norm": 0.8656709790229797,
|
||
|
|
"learning_rate": 4.036789789024659e-06,
|
||
|
|
"loss": 0.17970023155212403,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1370,
|
||
|
|
"token_acc": 0.9361972662458562,
|
||
|
|
"train_speed(iter/s)": 0.123185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6904707056745758,
|
||
|
|
"grad_norm": 0.8730081915855408,
|
||
|
|
"learning_rate": 4.00521150249364e-06,
|
||
|
|
"loss": 0.20007739067077637,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1375,
|
||
|
|
"token_acc": 0.9322461977708231,
|
||
|
|
"train_speed(iter/s)": 0.123308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6966136834830685,
|
||
|
|
"grad_norm": 0.8743943572044373,
|
||
|
|
"learning_rate": 3.973674477500172e-06,
|
||
|
|
"loss": 0.19028009176254274,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1380,
|
||
|
|
"token_acc": 0.9364455364455364,
|
||
|
|
"train_speed(iter/s)": 0.123436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6966136834830685,
|
||
|
|
"eval_loss": 0.2610381841659546,
|
||
|
|
"eval_runtime": 29.6856,
|
||
|
|
"eval_samples_per_second": 17.719,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9183167603401067,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.702756661291561,
|
||
|
|
"grad_norm": 0.804136335849762,
|
||
|
|
"learning_rate": 3.942180022127475e-06,
|
||
|
|
"loss": 0.16746077537536622,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1385,
|
||
|
|
"token_acc": 0.9190062765437667,
|
||
|
|
"train_speed(iter/s)": 0.12311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7088996391000537,
|
||
|
|
"grad_norm": 0.7739353775978088,
|
||
|
|
"learning_rate": 3.910729442693077e-06,
|
||
|
|
"loss": 0.20771589279174804,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1390,
|
||
|
|
"token_acc": 0.9368890897790836,
|
||
|
|
"train_speed(iter/s)": 0.123227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7150426169085464,
|
||
|
|
"grad_norm": 0.7843467593193054,
|
||
|
|
"learning_rate": 3.8793240436946385e-06,
|
||
|
|
"loss": 0.1791388511657715,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1395,
|
||
|
|
"token_acc": 0.9298795912172417,
|
||
|
|
"train_speed(iter/s)": 0.123311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7211855947170391,
|
||
|
|
"grad_norm": 0.8047447800636292,
|
||
|
|
"learning_rate": 3.847965127755834e-06,
|
||
|
|
"loss": 0.1962286114692688,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1400,
|
||
|
|
"token_acc": 0.9289617486338798,
|
||
|
|
"train_speed(iter/s)": 0.123422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7211855947170391,
|
||
|
|
"eval_loss": 0.25994521379470825,
|
||
|
|
"eval_runtime": 29.6994,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9187274823461594,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7273285725255318,
|
||
|
|
"grad_norm": 0.7783213257789612,
|
||
|
|
"learning_rate": 3.816653995572332e-06,
|
||
|
|
"loss": 0.1934323787689209,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1405,
|
||
|
|
"token_acc": 0.9191889097250214,
|
||
|
|
"train_speed(iter/s)": 0.123105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7334715503340243,
|
||
|
|
"grad_norm": 0.9164287447929382,
|
||
|
|
"learning_rate": 3.7853919458578327e-06,
|
||
|
|
"loss": 0.1951138973236084,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1410,
|
||
|
|
"token_acc": 0.9345938875014865,
|
||
|
|
"train_speed(iter/s)": 0.123227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.739614528142517,
|
||
|
|
"grad_norm": 0.6820860505104065,
|
||
|
|
"learning_rate": 3.7541802752902224e-06,
|
||
|
|
"loss": 0.1772141695022583,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1415,
|
||
|
|
"token_acc": 0.9298903956901357,
|
||
|
|
"train_speed(iter/s)": 0.123313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7457575059510098,
|
||
|
|
"grad_norm": 0.900181770324707,
|
||
|
|
"learning_rate": 3.723020278457763e-06,
|
||
|
|
"loss": 0.1944177269935608,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1420,
|
||
|
|
"token_acc": 0.9333839438223572,
|
||
|
|
"train_speed(iter/s)": 0.123418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7457575059510098,
|
||
|
|
"eval_loss": 0.2598145008087158,
|
||
|
|
"eval_runtime": 29.6524,
|
||
|
|
"eval_samples_per_second": 17.739,
|
||
|
|
"eval_steps_per_second": 4.452,
|
||
|
|
"eval_token_acc": 0.9181582360570687,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7519004837595025,
|
||
|
|
"grad_norm": 0.8312418460845947,
|
||
|
|
"learning_rate": 3.6919132478054153e-06,
|
||
|
|
"loss": 0.2060741662979126,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1425,
|
||
|
|
"token_acc": 0.9155994474106709,
|
||
|
|
"train_speed(iter/s)": 0.123098
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7580434615679952,
|
||
|
|
"grad_norm": 0.8780198097229004,
|
||
|
|
"learning_rate": 3.6608604735812226e-06,
|
||
|
|
"loss": 0.1988367795944214,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1430,
|
||
|
|
"token_acc": 0.9336537924095915,
|
||
|
|
"train_speed(iter/s)": 0.123197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7641864393764877,
|
||
|
|
"grad_norm": 0.8375983834266663,
|
||
|
|
"learning_rate": 3.629863243782799e-06,
|
||
|
|
"loss": 0.20454792976379393,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1435,
|
||
|
|
"token_acc": 0.9179058065245661,
|
||
|
|
"train_speed(iter/s)": 0.123317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7703294171849804,
|
||
|
|
"grad_norm": 0.7774383425712585,
|
||
|
|
"learning_rate": 3.5989228441039024e-06,
|
||
|
|
"loss": 0.1952831268310547,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1440,
|
||
|
|
"token_acc": 0.9264312326179357,
|
||
|
|
"train_speed(iter/s)": 0.123413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7703294171849804,
|
||
|
|
"eval_loss": 0.2601591646671295,
|
||
|
|
"eval_runtime": 29.6923,
|
||
|
|
"eval_samples_per_second": 17.715,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9185257241677475,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.776472394993473,
|
||
|
|
"grad_norm": 0.8040404915809631,
|
||
|
|
"learning_rate": 3.568040557881106e-06,
|
||
|
|
"loss": 0.18471212387084962,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1445,
|
||
|
|
"token_acc": 0.9204298276599304,
|
||
|
|
"train_speed(iter/s)": 0.123124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7826153728019658,
|
||
|
|
"grad_norm": 0.7603934407234192,
|
||
|
|
"learning_rate": 3.5372176660405717e-06,
|
||
|
|
"loss": 0.19175269603729247,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1450,
|
||
|
|
"token_acc": 0.9407587455914593,
|
||
|
|
"train_speed(iter/s)": 0.123205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7887583506104585,
|
||
|
|
"grad_norm": 0.8216592073440552,
|
||
|
|
"learning_rate": 3.506455447044923e-06,
|
||
|
|
"loss": 0.18449797630310058,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1455,
|
||
|
|
"token_acc": 0.9232101076275152,
|
||
|
|
"train_speed(iter/s)": 0.12331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.794901328418951,
|
||
|
|
"grad_norm": 0.8587454557418823,
|
||
|
|
"learning_rate": 3.4757551768402074e-06,
|
||
|
|
"loss": 0.1803336501121521,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1460,
|
||
|
|
"token_acc": 0.9327302250057992,
|
||
|
|
"train_speed(iter/s)": 0.123414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.794901328418951,
|
||
|
|
"eval_loss": 0.25998592376708984,
|
||
|
|
"eval_runtime": 29.6543,
|
||
|
|
"eval_samples_per_second": 17.738,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9187851275399913,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8010443062274437,
|
||
|
|
"grad_norm": 0.6978159546852112,
|
||
|
|
"learning_rate": 3.4451181288029834e-06,
|
||
|
|
"loss": 0.17668429613113404,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1465,
|
||
|
|
"token_acc": 0.9205462088038718,
|
||
|
|
"train_speed(iter/s)": 0.123097
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8071872840359364,
|
||
|
|
"grad_norm": 0.7977909445762634,
|
||
|
|
"learning_rate": 3.4145455736874957e-06,
|
||
|
|
"loss": 0.20127489566802978,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1470,
|
||
|
|
"token_acc": 0.9337199247164149,
|
||
|
|
"train_speed(iter/s)": 0.123204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8133302618444291,
|
||
|
|
"grad_norm": 0.847145140171051,
|
||
|
|
"learning_rate": 3.3840387795729753e-06,
|
||
|
|
"loss": 0.1935032606124878,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1475,
|
||
|
|
"token_acc": 0.9318121092288784,
|
||
|
|
"train_speed(iter/s)": 0.123311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8194732396529218,
|
||
|
|
"grad_norm": 0.8690524697303772,
|
||
|
|
"learning_rate": 3.353599011811037e-06,
|
||
|
|
"loss": 0.19352041482925414,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1480,
|
||
|
|
"token_acc": 0.9311466218110457,
|
||
|
|
"train_speed(iter/s)": 0.123409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8194732396529218,
|
||
|
|
"eval_loss": 0.259257048368454,
|
||
|
|
"eval_runtime": 29.6405,
|
||
|
|
"eval_samples_per_second": 17.746,
|
||
|
|
"eval_steps_per_second": 4.453,
|
||
|
|
"eval_token_acc": 0.9186266032569534,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8256162174614143,
|
||
|
|
"grad_norm": 0.8610875010490417,
|
||
|
|
"learning_rate": 3.323227532973193e-06,
|
||
|
|
"loss": 0.18993620872497557,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1485,
|
||
|
|
"token_acc": 0.9163555740842508,
|
||
|
|
"train_speed(iter/s)": 0.123098
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.831759195269907,
|
||
|
|
"grad_norm": 0.9433273673057556,
|
||
|
|
"learning_rate": 3.292925602798492e-06,
|
||
|
|
"loss": 0.1930912494659424,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1490,
|
||
|
|
"token_acc": 0.9382497082847141,
|
||
|
|
"train_speed(iter/s)": 0.123201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8379021730783998,
|
||
|
|
"grad_norm": 0.7419607043266296,
|
||
|
|
"learning_rate": 3.262694478141266e-06,
|
||
|
|
"loss": 0.1879183053970337,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1495,
|
||
|
|
"token_acc": 0.9291406527587432,
|
||
|
|
"train_speed(iter/s)": 0.123298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8440451508868925,
|
||
|
|
"grad_norm": 0.8275686502456665,
|
||
|
|
"learning_rate": 3.2325354129189923e-06,
|
||
|
|
"loss": 0.19919825792312623,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1500,
|
||
|
|
"token_acc": 0.9369797252438589,
|
||
|
|
"train_speed(iter/s)": 0.123397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8440451508868925,
|
||
|
|
"eval_loss": 0.2576294541358948,
|
||
|
|
"eval_runtime": 29.692,
|
||
|
|
"eval_samples_per_second": 17.715,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9190301196137772,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8501881286953852,
|
||
|
|
"grad_norm": 0.7494759559631348,
|
||
|
|
"learning_rate": 3.2024496580602892e-06,
|
||
|
|
"loss": 0.1704793930053711,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1505,
|
||
|
|
"token_acc": 0.920035804863494,
|
||
|
|
"train_speed(iter/s)": 0.123086
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8563311065038777,
|
||
|
|
"grad_norm": 0.7116587162017822,
|
||
|
|
"learning_rate": 3.172438461453032e-06,
|
||
|
|
"loss": 0.19869464635849,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1510,
|
||
|
|
"token_acc": 0.9331588853693247,
|
||
|
|
"train_speed(iter/s)": 0.123201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8624740843123704,
|
||
|
|
"grad_norm": 0.8266251087188721,
|
||
|
|
"learning_rate": 3.142503067892594e-06,
|
||
|
|
"loss": 0.18929662704467773,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1515,
|
||
|
|
"token_acc": 0.9361521750649076,
|
||
|
|
"train_speed(iter/s)": 0.123284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.868617062120863,
|
||
|
|
"grad_norm": 0.8856237530708313,
|
||
|
|
"learning_rate": 3.112644719030206e-06,
|
||
|
|
"loss": 0.1765504837036133,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1520,
|
||
|
|
"token_acc": 0.9446170019591915,
|
||
|
|
"train_speed(iter/s)": 0.123379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.868617062120863,
|
||
|
|
"eval_loss": 0.2572629451751709,
|
||
|
|
"eval_runtime": 29.7297,
|
||
|
|
"eval_samples_per_second": 17.693,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.9187563049430754,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8747600399293558,
|
||
|
|
"grad_norm": 0.7266517281532288,
|
||
|
|
"learning_rate": 3.0828646533214657e-06,
|
||
|
|
"loss": 0.18753888607025146,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1525,
|
||
|
|
"token_acc": 0.921090387374462,
|
||
|
|
"train_speed(iter/s)": 0.123072
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8809030177378485,
|
||
|
|
"grad_norm": 0.7010701298713684,
|
||
|
|
"learning_rate": 3.053164105974964e-06,
|
||
|
|
"loss": 0.18214144706726074,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1530,
|
||
|
|
"token_acc": 0.9355687362479671,
|
||
|
|
"train_speed(iter/s)": 0.123181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.887045995546341,
|
||
|
|
"grad_norm": 0.8133987188339233,
|
||
|
|
"learning_rate": 3.0235443089010564e-06,
|
||
|
|
"loss": 0.19535071849823,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1535,
|
||
|
|
"token_acc": 0.9338049036944989,
|
||
|
|
"train_speed(iter/s)": 0.123268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8931889733548337,
|
||
|
|
"grad_norm": 0.7620673775672913,
|
||
|
|
"learning_rate": 2.9940064906607607e-06,
|
||
|
|
"loss": 0.19279036521911622,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1540,
|
||
|
|
"token_acc": 0.9357547655847501,
|
||
|
|
"train_speed(iter/s)": 0.123366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8931889733548337,
|
||
|
|
"eval_loss": 0.2575133144855499,
|
||
|
|
"eval_runtime": 29.6891,
|
||
|
|
"eval_samples_per_second": 17.717,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9187923331892204,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8993319511633264,
|
||
|
|
"grad_norm": 0.8368042707443237,
|
||
|
|
"learning_rate": 2.964551876414801e-06,
|
||
|
|
"loss": 0.186897873878479,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1545,
|
||
|
|
"token_acc": 0.9212080946652063,
|
||
|
|
"train_speed(iter/s)": 0.123086
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9054749289718191,
|
||
|
|
"grad_norm": 0.7964156270027161,
|
||
|
|
"learning_rate": 2.93518168787279e-06,
|
||
|
|
"loss": 0.1861191511154175,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1550,
|
||
|
|
"token_acc": 0.9394473838918284,
|
||
|
|
"train_speed(iter/s)": 0.123163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9116179067803118,
|
||
|
|
"grad_norm": 0.9378722906112671,
|
||
|
|
"learning_rate": 2.905897143242562e-06,
|
||
|
|
"loss": 0.197173810005188,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1555,
|
||
|
|
"token_acc": 0.9388659543467702,
|
||
|
|
"train_speed(iter/s)": 0.123262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9177608845888043,
|
||
|
|
"grad_norm": 0.8486573100090027,
|
||
|
|
"learning_rate": 2.8766994571796336e-06,
|
||
|
|
"loss": 0.18908753395080566,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1560,
|
||
|
|
"token_acc": 0.9474116680361545,
|
||
|
|
"train_speed(iter/s)": 0.123349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9177608845888043,
|
||
|
|
"eval_loss": 0.25726109743118286,
|
||
|
|
"eval_runtime": 29.6999,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9191309987029831,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.923903862397297,
|
||
|
|
"grad_norm": 0.8716571927070618,
|
||
|
|
"learning_rate": 2.8475898407368298e-06,
|
||
|
|
"loss": 0.18810817003250122,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1565,
|
||
|
|
"token_acc": 0.9179941342227845,
|
||
|
|
"train_speed(iter/s)": 0.123063
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9300468402057898,
|
||
|
|
"grad_norm": 0.794176459312439,
|
||
|
|
"learning_rate": 2.8185695013140474e-06,
|
||
|
|
"loss": 0.17928617000579833,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1570,
|
||
|
|
"token_acc": 0.9303826916366175,
|
||
|
|
"train_speed(iter/s)": 0.123136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9361898180142825,
|
||
|
|
"grad_norm": 0.7357900142669678,
|
||
|
|
"learning_rate": 2.7896396426081844e-06,
|
||
|
|
"loss": 0.18468384742736815,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1575,
|
||
|
|
"token_acc": 0.9476946498477599,
|
||
|
|
"train_speed(iter/s)": 0.123228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9423327958227752,
|
||
|
|
"grad_norm": 0.8071329593658447,
|
||
|
|
"learning_rate": 2.7608014645632e-06,
|
||
|
|
"loss": 0.1790044903755188,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1580,
|
||
|
|
"token_acc": 0.938118933832586,
|
||
|
|
"train_speed(iter/s)": 0.123325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9423327958227752,
|
||
|
|
"eval_loss": 0.25785306096076965,
|
||
|
|
"eval_runtime": 29.6989,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9192462890906471,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9484757736312677,
|
||
|
|
"grad_norm": 0.8714718818664551,
|
||
|
|
"learning_rate": 2.7320561633203567e-06,
|
||
|
|
"loss": 0.19142614603042601,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1585,
|
||
|
|
"token_acc": 0.9180685641538602,
|
||
|
|
"train_speed(iter/s)": 0.123048
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9546187514397604,
|
||
|
|
"grad_norm": 0.7642494440078735,
|
||
|
|
"learning_rate": 2.703404931168594e-06,
|
||
|
|
"loss": 0.1714502215385437,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1590,
|
||
|
|
"token_acc": 0.9387637940932576,
|
||
|
|
"train_speed(iter/s)": 0.123137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.960761729248253,
|
||
|
|
"grad_norm": 0.7690022587776184,
|
||
|
|
"learning_rate": 2.6748489564950907e-06,
|
||
|
|
"loss": 0.1712334156036377,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1595,
|
||
|
|
"token_acc": 0.9381139489194499,
|
||
|
|
"train_speed(iter/s)": 0.123216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9669047070567458,
|
||
|
|
"grad_norm": 0.789311945438385,
|
||
|
|
"learning_rate": 2.6463894237359556e-06,
|
||
|
|
"loss": 0.1898505687713623,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1600,
|
||
|
|
"token_acc": 0.9361558383064971,
|
||
|
|
"train_speed(iter/s)": 0.123315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9669047070567458,
|
||
|
|
"eval_loss": 0.25641384720802307,
|
||
|
|
"eval_runtime": 29.7239,
|
||
|
|
"eval_samples_per_second": 17.696,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9191237930537541,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9730476848652385,
|
||
|
|
"grad_norm": 0.8089138865470886,
|
||
|
|
"learning_rate": 2.618027513327116e-06,
|
||
|
|
"loss": 0.18109874725341796,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1605,
|
||
|
|
"token_acc": 0.923689472311571,
|
||
|
|
"train_speed(iter/s)": 0.12303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.979190662673731,
|
||
|
|
"grad_norm": 0.8698074221611023,
|
||
|
|
"learning_rate": 2.589764401655343e-06,
|
||
|
|
"loss": 0.183346688747406,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1610,
|
||
|
|
"token_acc": 0.9436480028852222,
|
||
|
|
"train_speed(iter/s)": 0.123111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9853336404822237,
|
||
|
|
"grad_norm": 0.8340507745742798,
|
||
|
|
"learning_rate": 2.5616012610094702e-06,
|
||
|
|
"loss": 0.19840478897094727,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1615,
|
||
|
|
"token_acc": 0.931529030765672,
|
||
|
|
"train_speed(iter/s)": 0.123193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9914766182907164,
|
||
|
|
"grad_norm": 0.9170531034469604,
|
||
|
|
"learning_rate": 2.533539259531757e-06,
|
||
|
|
"loss": 0.20222840309143067,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1620,
|
||
|
|
"token_acc": 0.9312559145599567,
|
||
|
|
"train_speed(iter/s)": 0.123282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9914766182907164,
|
||
|
|
"eval_loss": 0.2562500834465027,
|
||
|
|
"eval_runtime": 29.7389,
|
||
|
|
"eval_samples_per_second": 17.687,
|
||
|
|
"eval_steps_per_second": 4.439,
|
||
|
|
"eval_token_acc": 0.91929672863525,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9976195960992091,
|
||
|
|
"grad_norm": 0.8579339385032654,
|
||
|
|
"learning_rate": 2.5055795611694435e-06,
|
||
|
|
"loss": 0.17736260890960692,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1625,
|
||
|
|
"token_acc": 0.9170028818443804,
|
||
|
|
"train_speed(iter/s)": 0.123032
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.004914382246794,
|
||
|
|
"grad_norm": 0.6278606057167053,
|
||
|
|
"learning_rate": 2.4777233256264743e-06,
|
||
|
|
"loss": 0.20010933876037598,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1630,
|
||
|
|
"token_acc": 0.947276073094535,
|
||
|
|
"train_speed(iter/s)": 0.123086
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.011057360055287,
|
||
|
|
"grad_norm": 0.6573106050491333,
|
||
|
|
"learning_rate": 2.4499717083153975e-06,
|
||
|
|
"loss": 0.1415931224822998,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1635,
|
||
|
|
"token_acc": 0.9564882032667876,
|
||
|
|
"train_speed(iter/s)": 0.123172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0172003378637795,
|
||
|
|
"grad_norm": 0.862938404083252,
|
||
|
|
"learning_rate": 2.4223258603094295e-06,
|
||
|
|
"loss": 0.16473679542541503,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1640,
|
||
|
|
"token_acc": 0.9506443652316973,
|
||
|
|
"train_speed(iter/s)": 0.123277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0172003378637795,
|
||
|
|
"eval_loss": 0.2674296498298645,
|
||
|
|
"eval_runtime": 29.6424,
|
||
|
|
"eval_samples_per_second": 17.745,
|
||
|
|
"eval_steps_per_second": 4.453,
|
||
|
|
"eval_token_acc": 0.9185977806600375,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.023343315672272,
|
||
|
|
"grad_norm": 0.8443688750267029,
|
||
|
|
"learning_rate": 2.3947869282947263e-06,
|
||
|
|
"loss": 0.14469457864761354,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1645,
|
||
|
|
"token_acc": 0.9298440219802724,
|
||
|
|
"train_speed(iter/s)": 0.122997
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.029486293480765,
|
||
|
|
"grad_norm": 0.8116291761398315,
|
||
|
|
"learning_rate": 2.3673560545228082e-06,
|
||
|
|
"loss": 0.14491933584213257,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1650,
|
||
|
|
"token_acc": 0.9576881945413122,
|
||
|
|
"train_speed(iter/s)": 0.12308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0356292712892574,
|
||
|
|
"grad_norm": 0.6990134119987488,
|
||
|
|
"learning_rate": 2.3400343767631943e-06,
|
||
|
|
"loss": 0.1429425835609436,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1655,
|
||
|
|
"token_acc": 0.9598897189612238,
|
||
|
|
"train_speed(iter/s)": 0.12317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0417722490977503,
|
||
|
|
"grad_norm": 0.7768262624740601,
|
||
|
|
"learning_rate": 2.312823028256205e-06,
|
||
|
|
"loss": 0.13332735300064086,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1660,
|
||
|
|
"token_acc": 0.955598381190981,
|
||
|
|
"train_speed(iter/s)": 0.123249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0417722490977503,
|
||
|
|
"eval_loss": 0.2723671495914459,
|
||
|
|
"eval_runtime": 29.6708,
|
||
|
|
"eval_samples_per_second": 17.728,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9178267761925349,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.047915226906243,
|
||
|
|
"grad_norm": 0.8455916047096252,
|
||
|
|
"learning_rate": 2.2857231376659517e-06,
|
||
|
|
"loss": 0.13110907077789308,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1665,
|
||
|
|
"token_acc": 0.9266902441777343,
|
||
|
|
"train_speed(iter/s)": 0.12298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0540582047147353,
|
||
|
|
"grad_norm": 0.8857413530349731,
|
||
|
|
"learning_rate": 2.258735829033529e-06,
|
||
|
|
"loss": 0.16349921226501465,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1670,
|
||
|
|
"token_acc": 0.9470571801080275,
|
||
|
|
"train_speed(iter/s)": 0.123074
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0602011825232283,
|
||
|
|
"grad_norm": 0.8224142789840698,
|
||
|
|
"learning_rate": 2.231862221730394e-06,
|
||
|
|
"loss": 0.1457624077796936,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1675,
|
||
|
|
"token_acc": 0.9467787114845938,
|
||
|
|
"train_speed(iter/s)": 0.123144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0663441603317207,
|
||
|
|
"grad_norm": 0.7867154479026794,
|
||
|
|
"learning_rate": 2.2051034304119344e-06,
|
||
|
|
"loss": 0.13943665027618407,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1680,
|
||
|
|
"token_acc": 0.9487998351704955,
|
||
|
|
"train_speed(iter/s)": 0.123235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0663441603317207,
|
||
|
|
"eval_loss": 0.27172932028770447,
|
||
|
|
"eval_runtime": 29.8181,
|
||
|
|
"eval_samples_per_second": 17.64,
|
||
|
|
"eval_steps_per_second": 4.427,
|
||
|
|
"eval_token_acc": 0.9178844213863669,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0724871381402137,
|
||
|
|
"grad_norm": 0.7623206973075867,
|
||
|
|
"learning_rate": 2.1784605649712326e-06,
|
||
|
|
"loss": 0.14780081510543824,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1685,
|
||
|
|
"token_acc": 0.9247235706580367,
|
||
|
|
"train_speed(iter/s)": 0.122983
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.078630115948706,
|
||
|
|
"grad_norm": 0.8290310502052307,
|
||
|
|
"learning_rate": 2.1519347304930317e-06,
|
||
|
|
"loss": 0.1389237880706787,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1690,
|
||
|
|
"token_acc": 0.9470190895741557,
|
||
|
|
"train_speed(iter/s)": 0.123069
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0847730937571987,
|
||
|
|
"grad_norm": 0.7647258639335632,
|
||
|
|
"learning_rate": 2.1255270272079044e-06,
|
||
|
|
"loss": 0.14199459552764893,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1695,
|
||
|
|
"token_acc": 0.9419040287400564,
|
||
|
|
"train_speed(iter/s)": 0.123168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0909160715656916,
|
||
|
|
"grad_norm": 0.8620509505271912,
|
||
|
|
"learning_rate": 2.0992385504466075e-06,
|
||
|
|
"loss": 0.14582890272140503,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1700,
|
||
|
|
"token_acc": 0.9481878509443593,
|
||
|
|
"train_speed(iter/s)": 0.123261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0909160715656916,
|
||
|
|
"eval_loss": 0.27268144488334656,
|
||
|
|
"eval_runtime": 29.6628,
|
||
|
|
"eval_samples_per_second": 17.733,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.9178772157371379,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.097059049374184,
|
||
|
|
"grad_norm": 0.7256646752357483,
|
||
|
|
"learning_rate": 2.0730703905946612e-06,
|
||
|
|
"loss": 0.14624775648117067,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1705,
|
||
|
|
"token_acc": 0.9200135124990616,
|
||
|
|
"train_speed(iter/s)": 0.123001
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.103202027182677,
|
||
|
|
"grad_norm": 0.7388427257537842,
|
||
|
|
"learning_rate": 2.0470236330471125e-06,
|
||
|
|
"loss": 0.11770030260086059,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1710,
|
||
|
|
"token_acc": 0.9668929503916449,
|
||
|
|
"train_speed(iter/s)": 0.123069
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1093450049911695,
|
||
|
|
"grad_norm": 0.8730806112289429,
|
||
|
|
"learning_rate": 2.0210993581635257e-06,
|
||
|
|
"loss": 0.16097368001937867,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1715,
|
||
|
|
"token_acc": 0.9433831352051436,
|
||
|
|
"train_speed(iter/s)": 0.123169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.115487982799662,
|
||
|
|
"grad_norm": 0.7302968502044678,
|
||
|
|
"learning_rate": 1.9952986412231612e-06,
|
||
|
|
"loss": 0.1270466446876526,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1720,
|
||
|
|
"token_acc": 0.9573284772123241,
|
||
|
|
"train_speed(iter/s)": 0.123225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.115487982799662,
|
||
|
|
"eval_loss": 0.27457520365715027,
|
||
|
|
"eval_runtime": 29.6821,
|
||
|
|
"eval_samples_per_second": 17.721,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9178555987894509,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.121630960608155,
|
||
|
|
"grad_norm": 1.026114821434021,
|
||
|
|
"learning_rate": 1.9696225523803803e-06,
|
||
|
|
"loss": 0.1560563325881958,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1725,
|
||
|
|
"token_acc": 0.9226966883434199,
|
||
|
|
"train_speed(iter/s)": 0.12297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1277739384166474,
|
||
|
|
"grad_norm": 0.8650787472724915,
|
||
|
|
"learning_rate": 1.944072156620261e-06,
|
||
|
|
"loss": 0.13898645639419555,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1730,
|
||
|
|
"token_acc": 0.9579385943157581,
|
||
|
|
"train_speed(iter/s)": 0.12307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1339169162251403,
|
||
|
|
"grad_norm": 0.6428267955780029,
|
||
|
|
"learning_rate": 1.9186485137144217e-06,
|
||
|
|
"loss": 0.15046895742416383,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1735,
|
||
|
|
"token_acc": 0.9573064770932069,
|
||
|
|
"train_speed(iter/s)": 0.123153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.140059894033633,
|
||
|
|
"grad_norm": 0.7333597540855408,
|
||
|
|
"learning_rate": 1.89335267817706e-06,
|
||
|
|
"loss": 0.1286949872970581,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1740,
|
||
|
|
"token_acc": 0.9547167656464138,
|
||
|
|
"train_speed(iter/s)": 0.123228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.140059894033633,
|
||
|
|
"eval_loss": 0.2735785245895386,
|
||
|
|
"eval_runtime": 29.6624,
|
||
|
|
"eval_samples_per_second": 17.733,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.917639429312581,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1462028718421253,
|
||
|
|
"grad_norm": 0.8134050369262695,
|
||
|
|
"learning_rate": 1.8681856992212211e-06,
|
||
|
|
"loss": 0.1448550343513489,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1745,
|
||
|
|
"token_acc": 0.9220646406174626,
|
||
|
|
"train_speed(iter/s)": 0.122967
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1523458496506183,
|
||
|
|
"grad_norm": 0.6919598579406738,
|
||
|
|
"learning_rate": 1.8431486207152704e-06,
|
||
|
|
"loss": 0.12585388422012328,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1750,
|
||
|
|
"token_acc": 0.9565885062902368,
|
||
|
|
"train_speed(iter/s)": 0.12304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1584888274591107,
|
||
|
|
"grad_norm": 0.7137647271156311,
|
||
|
|
"learning_rate": 1.8182424811396131e-06,
|
||
|
|
"loss": 0.13218532800674437,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1755,
|
||
|
|
"token_acc": 0.9553192383674499,
|
||
|
|
"train_speed(iter/s)": 0.123112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1646318052676037,
|
||
|
|
"grad_norm": 0.8254464864730835,
|
||
|
|
"learning_rate": 1.7934683135435993e-06,
|
||
|
|
"loss": 0.15681140422821044,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1760,
|
||
|
|
"token_acc": 0.9500492764147518,
|
||
|
|
"train_speed(iter/s)": 0.123204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1646318052676037,
|
||
|
|
"eval_loss": 0.2743065655231476,
|
||
|
|
"eval_runtime": 29.7411,
|
||
|
|
"eval_samples_per_second": 17.686,
|
||
|
|
"eval_steps_per_second": 4.438,
|
||
|
|
"eval_token_acc": 0.91786280443868,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.170774783076096,
|
||
|
|
"grad_norm": 0.8115731477737427,
|
||
|
|
"learning_rate": 1.7688271455026867e-06,
|
||
|
|
"loss": 0.15295430421829223,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1765,
|
||
|
|
"token_acc": 0.9211377831289036,
|
||
|
|
"train_speed(iter/s)": 0.122963
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1769177608845887,
|
||
|
|
"grad_norm": 0.7977039813995361,
|
||
|
|
"learning_rate": 1.7443199990758168e-06,
|
||
|
|
"loss": 0.14479312896728516,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1770,
|
||
|
|
"token_acc": 0.9533281533281533,
|
||
|
|
"train_speed(iter/s)": 0.123053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1830607386930816,
|
||
|
|
"grad_norm": 0.8783808350563049,
|
||
|
|
"learning_rate": 1.7199478907630269e-06,
|
||
|
|
"loss": 0.14664456844329835,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1775,
|
||
|
|
"token_acc": 0.9566591882520905,
|
||
|
|
"train_speed(iter/s)": 0.123142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.189203716501574,
|
||
|
|
"grad_norm": 0.737375795841217,
|
||
|
|
"learning_rate": 1.6957118314632825e-06,
|
||
|
|
"loss": 0.12802677154541015,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1780,
|
||
|
|
"token_acc": 0.9474123975142305,
|
||
|
|
"train_speed(iter/s)": 0.12324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.189203716501574,
|
||
|
|
"eval_loss": 0.27472689747810364,
|
||
|
|
"eval_runtime": 29.7215,
|
||
|
|
"eval_samples_per_second": 17.698,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9179997117740308,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.195346694310067,
|
||
|
|
"grad_norm": 0.9062975645065308,
|
||
|
|
"learning_rate": 1.6716128264325477e-06,
|
||
|
|
"loss": 0.1491732716560364,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1785,
|
||
|
|
"token_acc": 0.9252631765812785,
|
||
|
|
"train_speed(iter/s)": 0.123004
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2014896721185595,
|
||
|
|
"grad_norm": 0.9112216830253601,
|
||
|
|
"learning_rate": 1.64765187524209e-06,
|
||
|
|
"loss": 0.14550890922546386,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1790,
|
||
|
|
"token_acc": 0.9464007023019898,
|
||
|
|
"train_speed(iter/s)": 0.123075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.207632649927052,
|
||
|
|
"grad_norm": 0.822755753993988,
|
||
|
|
"learning_rate": 1.6238299717370254e-06,
|
||
|
|
"loss": 0.14908239841461182,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1795,
|
||
|
|
"token_acc": 0.9550466874166296,
|
||
|
|
"train_speed(iter/s)": 0.123153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.213775627735545,
|
||
|
|
"grad_norm": 0.9669148921966553,
|
||
|
|
"learning_rate": 1.6001481039950872e-06,
|
||
|
|
"loss": 0.14041876792907715,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1800,
|
||
|
|
"token_acc": 0.9552882955460927,
|
||
|
|
"train_speed(iter/s)": 0.123227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.213775627735545,
|
||
|
|
"eval_loss": 0.2743581235408783,
|
||
|
|
"eval_runtime": 29.714,
|
||
|
|
"eval_samples_per_second": 17.702,
|
||
|
|
"eval_steps_per_second": 4.442,
|
||
|
|
"eval_token_acc": 0.9181150021616947,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2199186055440374,
|
||
|
|
"grad_norm": 0.7515302896499634,
|
||
|
|
"learning_rate": 1.5766072542856525e-06,
|
||
|
|
"loss": 0.1314539670944214,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1805,
|
||
|
|
"token_acc": 0.9241629064430544,
|
||
|
|
"train_speed(iter/s)": 0.122981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2260615833525303,
|
||
|
|
"grad_norm": 0.8223626613616943,
|
||
|
|
"learning_rate": 1.5532083990289892e-06,
|
||
|
|
"loss": 0.1447986364364624,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1810,
|
||
|
|
"token_acc": 0.9575009707900073,
|
||
|
|
"train_speed(iter/s)": 0.123073
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.232204561161023,
|
||
|
|
"grad_norm": 0.8387971520423889,
|
||
|
|
"learning_rate": 1.5299525087557682e-06,
|
||
|
|
"loss": 0.12721827030181884,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1815,
|
||
|
|
"token_acc": 0.9589310504396112,
|
||
|
|
"train_speed(iter/s)": 0.123144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2383475389695153,
|
||
|
|
"grad_norm": 0.8764155507087708,
|
||
|
|
"learning_rate": 1.5068405480667975e-06,
|
||
|
|
"loss": 0.1495474696159363,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1820,
|
||
|
|
"token_acc": 0.9530952884005915,
|
||
|
|
"train_speed(iter/s)": 0.123223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2383475389695153,
|
||
|
|
"eval_loss": 0.2754988670349121,
|
||
|
|
"eval_runtime": 29.5873,
|
||
|
|
"eval_samples_per_second": 17.778,
|
||
|
|
"eval_steps_per_second": 4.461,
|
||
|
|
"eval_token_acc": 0.9176898688571841,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2444905167780083,
|
||
|
|
"grad_norm": 0.8443633317947388,
|
||
|
|
"learning_rate": 1.4838734755930168e-06,
|
||
|
|
"loss": 0.14514811038970948,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1825,
|
||
|
|
"token_acc": 0.9260831823671497,
|
||
|
|
"train_speed(iter/s)": 0.122984
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2506334945865007,
|
||
|
|
"grad_norm": 0.8468235731124878,
|
||
|
|
"learning_rate": 1.461052243955739e-06,
|
||
|
|
"loss": 0.14231607913970948,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1830,
|
||
|
|
"token_acc": 0.9431918169819622,
|
||
|
|
"train_speed(iter/s)": 0.123052
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2567764723949937,
|
||
|
|
"grad_norm": 0.8009449243545532,
|
||
|
|
"learning_rate": 1.4383777997271347e-06,
|
||
|
|
"loss": 0.13485580682754517,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1835,
|
||
|
|
"token_acc": 0.954977119519756,
|
||
|
|
"train_speed(iter/s)": 0.123135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.262919450203486,
|
||
|
|
"grad_norm": 0.8349820971488953,
|
||
|
|
"learning_rate": 1.4158510833909688e-06,
|
||
|
|
"loss": 0.1553872346878052,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1840,
|
||
|
|
"token_acc": 0.9466980320156062,
|
||
|
|
"train_speed(iter/s)": 0.12321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.262919450203486,
|
||
|
|
"eval_loss": 0.2755924463272095,
|
||
|
|
"eval_runtime": 29.7545,
|
||
|
|
"eval_samples_per_second": 17.678,
|
||
|
|
"eval_steps_per_second": 4.436,
|
||
|
|
"eval_token_acc": 0.9181582360570687,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2690624280119787,
|
||
|
|
"grad_norm": 0.9547154903411865,
|
||
|
|
"learning_rate": 1.3934730293035935e-06,
|
||
|
|
"loss": 0.1530256986618042,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1845,
|
||
|
|
"token_acc": 0.9236753100338219,
|
||
|
|
"train_speed(iter/s)": 0.122997
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2752054058204716,
|
||
|
|
"grad_norm": 0.8379245400428772,
|
||
|
|
"learning_rate": 1.3712445656551904e-06,
|
||
|
|
"loss": 0.14752573966979982,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1850,
|
||
|
|
"token_acc": 0.9471953309555793,
|
||
|
|
"train_speed(iter/s)": 0.123078
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.281348383628964,
|
||
|
|
"grad_norm": 0.7745286822319031,
|
||
|
|
"learning_rate": 1.349166614431282e-06,
|
||
|
|
"loss": 0.13339710235595703,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1855,
|
||
|
|
"token_acc": 0.9574489743981269,
|
||
|
|
"train_speed(iter/s)": 0.123139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.287491361437457,
|
||
|
|
"grad_norm": 0.7709481120109558,
|
||
|
|
"learning_rate": 1.3272400913744744e-06,
|
||
|
|
"loss": 0.13953914642333984,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1860,
|
||
|
|
"token_acc": 0.9522856703093736,
|
||
|
|
"train_speed(iter/s)": 0.123224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.287491361437457,
|
||
|
|
"eval_loss": 0.27358269691467285,
|
||
|
|
"eval_runtime": 29.7058,
|
||
|
|
"eval_samples_per_second": 17.707,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9181294134601528,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2936343392459495,
|
||
|
|
"grad_norm": 0.9447105526924133,
|
||
|
|
"learning_rate": 1.3054659059464836e-06,
|
||
|
|
"loss": 0.1305554747581482,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1865,
|
||
|
|
"token_acc": 0.9289236364999907,
|
||
|
|
"train_speed(iter/s)": 0.122979
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.299777317054442,
|
||
|
|
"grad_norm": 0.8118374347686768,
|
||
|
|
"learning_rate": 1.2838449612904108e-06,
|
||
|
|
"loss": 0.14541189670562743,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1870,
|
||
|
|
"token_acc": 0.9568722866275464,
|
||
|
|
"train_speed(iter/s)": 0.123069
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.305920294862935,
|
||
|
|
"grad_norm": 0.7850742936134338,
|
||
|
|
"learning_rate": 1.262378154193285e-06,
|
||
|
|
"loss": 0.14573101997375487,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1875,
|
||
|
|
"token_acc": 0.9437350591802227,
|
||
|
|
"train_speed(iter/s)": 0.123139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3120632726714274,
|
||
|
|
"grad_norm": 0.8586622476577759,
|
||
|
|
"learning_rate": 1.2410663750488644e-06,
|
||
|
|
"loss": 0.1231348991394043,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1880,
|
||
|
|
"token_acc": 0.9528933210864716,
|
||
|
|
"train_speed(iter/s)": 0.123207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3120632726714274,
|
||
|
|
"eval_loss": 0.27450090646743774,
|
||
|
|
"eval_runtime": 29.7231,
|
||
|
|
"eval_samples_per_second": 17.697,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9181798530047557,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3182062504799203,
|
||
|
|
"grad_norm": 0.7418249845504761,
|
||
|
|
"learning_rate": 1.2199105078207002e-06,
|
||
|
|
"loss": 0.15627479553222656,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1885,
|
||
|
|
"token_acc": 0.9225045238007312,
|
||
|
|
"train_speed(iter/s)": 0.122962
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.324349228288413,
|
||
|
|
"grad_norm": 0.838173508644104,
|
||
|
|
"learning_rate": 1.1989114300054782e-06,
|
||
|
|
"loss": 0.14288971424102784,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1890,
|
||
|
|
"token_acc": 0.94441322229602,
|
||
|
|
"train_speed(iter/s)": 0.123041
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3304922060969053,
|
||
|
|
"grad_norm": 0.8450609445571899,
|
||
|
|
"learning_rate": 1.1780700125966232e-06,
|
||
|
|
"loss": 0.13255660533905028,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1895,
|
||
|
|
"token_acc": 0.9575001607406931,
|
||
|
|
"train_speed(iter/s)": 0.123109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3366351839053983,
|
||
|
|
"grad_norm": 0.7407841086387634,
|
||
|
|
"learning_rate": 1.1573871200481634e-06,
|
||
|
|
"loss": 0.1363093614578247,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1900,
|
||
|
|
"token_acc": 0.948611310292079,
|
||
|
|
"train_speed(iter/s)": 0.123174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3366351839053983,
|
||
|
|
"eval_loss": 0.27403974533081055,
|
||
|
|
"eval_runtime": 29.6863,
|
||
|
|
"eval_samples_per_second": 17.719,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9180429456694048,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3427781617138907,
|
||
|
|
"grad_norm": 0.8412113189697266,
|
||
|
|
"learning_rate": 1.136863610238887e-06,
|
||
|
|
"loss": 0.151106858253479,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1905,
|
||
|
|
"token_acc": 0.9202309459903064,
|
||
|
|
"train_speed(iter/s)": 0.122952
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3489211395223837,
|
||
|
|
"grad_norm": 0.7621601819992065,
|
||
|
|
"learning_rate": 1.1165003344367465e-06,
|
||
|
|
"loss": 0.145496666431427,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1910,
|
||
|
|
"token_acc": 0.9506583322250299,
|
||
|
|
"train_speed(iter/s)": 0.123038
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.355064117330876,
|
||
|
|
"grad_norm": 0.8177499175071716,
|
||
|
|
"learning_rate": 1.0962981372635629e-06,
|
||
|
|
"loss": 0.13563876152038573,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1915,
|
||
|
|
"token_acc": 0.9514687814140511,
|
||
|
|
"train_speed(iter/s)": 0.123121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3612070951393687,
|
||
|
|
"grad_norm": 0.8387221693992615,
|
||
|
|
"learning_rate": 1.0762578566599818e-06,
|
||
|
|
"loss": 0.15051798820495604,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1920,
|
||
|
|
"token_acc": 0.9480101984258952,
|
||
|
|
"train_speed(iter/s)": 0.123214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3612070951393687,
|
||
|
|
"eval_loss": 0.2752765119075775,
|
||
|
|
"eval_runtime": 29.73,
|
||
|
|
"eval_samples_per_second": 17.693,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.9179492722294279,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3673500729478616,
|
||
|
|
"grad_norm": 0.9462500214576721,
|
||
|
|
"learning_rate": 1.056380323850722e-06,
|
||
|
|
"loss": 0.1329110622406006,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1925,
|
||
|
|
"token_acc": 0.9277916379142559,
|
||
|
|
"train_speed(iter/s)": 0.122971
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.373493050756354,
|
||
|
|
"grad_norm": 0.6897282004356384,
|
||
|
|
"learning_rate": 1.0366663633101015e-06,
|
||
|
|
"loss": 0.14667117595672607,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1930,
|
||
|
|
"token_acc": 0.9519438953214723,
|
||
|
|
"train_speed(iter/s)": 0.123037
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.379636028564847,
|
||
|
|
"grad_norm": 0.756504476070404,
|
||
|
|
"learning_rate": 1.0171167927278369e-06,
|
||
|
|
"loss": 0.15089083909988404,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1935,
|
||
|
|
"token_acc": 0.9499266411093757,
|
||
|
|
"train_speed(iter/s)": 0.123095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3857790063733395,
|
||
|
|
"grad_norm": 0.620968222618103,
|
||
|
|
"learning_rate": 9.977324229751245e-07,
|
||
|
|
"loss": 0.13846428394317628,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1940,
|
||
|
|
"token_acc": 0.9542850274450099,
|
||
|
|
"train_speed(iter/s)": 0.123159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3857790063733395,
|
||
|
|
"eval_loss": 0.273898720741272,
|
||
|
|
"eval_runtime": 29.7246,
|
||
|
|
"eval_samples_per_second": 17.696,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9181582360570687,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.391921984181832,
|
||
|
|
"grad_norm": 0.8139801621437073,
|
||
|
|
"learning_rate": 9.785140580710106e-07,
|
||
|
|
"loss": 0.1415960192680359,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1945,
|
||
|
|
"token_acc": 0.9238499208097432,
|
||
|
|
"train_speed(iter/s)": 0.122929
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.398064961990325,
|
||
|
|
"grad_norm": 0.8904073238372803,
|
||
|
|
"learning_rate": 9.594624951490455e-07,
|
||
|
|
"loss": 0.15040233135223388,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1950,
|
||
|
|
"token_acc": 0.9561746584516475,
|
||
|
|
"train_speed(iter/s)": 0.123001
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4042079397988174,
|
||
|
|
"grad_norm": 1.0656641721725464,
|
||
|
|
"learning_rate": 9.405785244242166e-07,
|
||
|
|
"loss": 0.1426215648651123,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1955,
|
||
|
|
"token_acc": 0.9464016327979412,
|
||
|
|
"train_speed(iter/s)": 0.123079
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4103509176073104,
|
||
|
|
"grad_norm": 0.656574547290802,
|
||
|
|
"learning_rate": 9.218629291601699e-07,
|
||
|
|
"loss": 0.12366310358047486,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1960,
|
||
|
|
"token_acc": 0.9601010101010101,
|
||
|
|
"train_speed(iter/s)": 0.123155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4103509176073104,
|
||
|
|
"eval_loss": 0.27443960309028625,
|
||
|
|
"eval_runtime": 29.6457,
|
||
|
|
"eval_samples_per_second": 17.743,
|
||
|
|
"eval_steps_per_second": 4.453,
|
||
|
|
"eval_token_acc": 0.9185473411154345,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.416493895415803,
|
||
|
|
"grad_norm": 0.7282077074050903,
|
||
|
|
"learning_rate": 9.033164856367271e-07,
|
||
|
|
"loss": 0.14160101413726806,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1965,
|
||
|
|
"token_acc": 0.9257107472635547,
|
||
|
|
"train_speed(iter/s)": 0.122927
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4226368732242953,
|
||
|
|
"grad_norm": 0.9724346995353699,
|
||
|
|
"learning_rate": 8.849399631176825e-07,
|
||
|
|
"loss": 0.13960802555084229,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1970,
|
||
|
|
"token_acc": 0.957187156146844,
|
||
|
|
"train_speed(iter/s)": 0.122994
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4287798510327883,
|
||
|
|
"grad_norm": 0.6904351711273193,
|
||
|
|
"learning_rate": 8.667341238189009e-07,
|
||
|
|
"loss": 0.13362197875976561,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1975,
|
||
|
|
"token_acc": 0.9529874213836478,
|
||
|
|
"train_speed(iter/s)": 0.123061
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4349228288412808,
|
||
|
|
"grad_norm": 0.8341466784477234,
|
||
|
|
"learning_rate": 8.486997228767013e-07,
|
||
|
|
"loss": 0.15857725143432616,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1980,
|
||
|
|
"token_acc": 0.9431441341856106,
|
||
|
|
"train_speed(iter/s)": 0.123136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4349228288412808,
|
||
|
|
"eval_loss": 0.27409639954566956,
|
||
|
|
"eval_runtime": 29.6556,
|
||
|
|
"eval_samples_per_second": 17.737,
|
||
|
|
"eval_steps_per_second": 4.451,
|
||
|
|
"eval_token_acc": 0.9185329298169765,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4410658066497737,
|
||
|
|
"grad_norm": 1.1063848733901978,
|
||
|
|
"learning_rate": 8.308375083165299e-07,
|
||
|
|
"loss": 0.15017662048339844,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1985,
|
||
|
|
"token_acc": 0.9246744744307849,
|
||
|
|
"train_speed(iter/s)": 0.122915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.447208784458266,
|
||
|
|
"grad_norm": 0.8507280945777893,
|
||
|
|
"learning_rate": 8.131482210219383e-07,
|
||
|
|
"loss": 0.1420647144317627,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1990,
|
||
|
|
"token_acc": 0.9540951446787641,
|
||
|
|
"train_speed(iter/s)": 0.122981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4533517622667587,
|
||
|
|
"grad_norm": 0.7251470685005188,
|
||
|
|
"learning_rate": 7.956325947038585e-07,
|
||
|
|
"loss": 0.13122901916503907,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 1995,
|
||
|
|
"token_acc": 0.9569695888700616,
|
||
|
|
"train_speed(iter/s)": 0.123056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4594947400752516,
|
||
|
|
"grad_norm": 0.6275292038917542,
|
||
|
|
"learning_rate": 7.782913558701572e-07,
|
||
|
|
"loss": 0.13776025772094727,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2000,
|
||
|
|
"token_acc": 0.9476885644768857,
|
||
|
|
"train_speed(iter/s)": 0.123119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4594947400752516,
|
||
|
|
"eval_loss": 0.2740454375743866,
|
||
|
|
"eval_runtime": 29.8131,
|
||
|
|
"eval_samples_per_second": 17.643,
|
||
|
|
"eval_steps_per_second": 4.428,
|
||
|
|
"eval_token_acc": 0.9183816111831676,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.465637717883744,
|
||
|
|
"grad_norm": 0.830629825592041,
|
||
|
|
"learning_rate": 7.611252237955168e-07,
|
||
|
|
"loss": 0.12884964942932128,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2005,
|
||
|
|
"token_acc": 0.926461027233981,
|
||
|
|
"train_speed(iter/s)": 0.122895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.471780695692237,
|
||
|
|
"grad_norm": 0.8567506670951843,
|
||
|
|
"learning_rate": 7.44134910491589e-07,
|
||
|
|
"loss": 0.15558898448944092,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2010,
|
||
|
|
"token_acc": 0.9473684210526315,
|
||
|
|
"train_speed(iter/s)": 0.122982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4779236735007295,
|
||
|
|
"grad_norm": 0.9071369171142578,
|
||
|
|
"learning_rate": 7.273211206774711e-07,
|
||
|
|
"loss": 0.14407318830490112,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2015,
|
||
|
|
"token_acc": 0.9545647558386412,
|
||
|
|
"train_speed(iter/s)": 0.123063
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.484066651309222,
|
||
|
|
"grad_norm": 0.8281042575836182,
|
||
|
|
"learning_rate": 7.106845517504684e-07,
|
||
|
|
"loss": 0.14147133827209474,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2020,
|
||
|
|
"token_acc": 0.9535927353360435,
|
||
|
|
"train_speed(iter/s)": 0.123126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.484066651309222,
|
||
|
|
"eval_loss": 0.2732316851615906,
|
||
|
|
"eval_runtime": 29.6919,
|
||
|
|
"eval_samples_per_second": 17.715,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9181510304078397,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.490209629117715,
|
||
|
|
"grad_norm": 0.8336676955223083,
|
||
|
|
"learning_rate": 6.942258937571772e-07,
|
||
|
|
"loss": 0.1445910692214966,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2025,
|
||
|
|
"token_acc": 0.9194783843365841,
|
||
|
|
"train_speed(iter/s)": 0.122916
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4963526069262074,
|
||
|
|
"grad_norm": 0.7258805632591248,
|
||
|
|
"learning_rate": 6.779458293648506e-07,
|
||
|
|
"loss": 0.13561407327651978,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2030,
|
||
|
|
"token_acc": 0.9523143224939833,
|
||
|
|
"train_speed(iter/s)": 0.122985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5024955847347004,
|
||
|
|
"grad_norm": 0.8204253911972046,
|
||
|
|
"learning_rate": 6.618450338330978e-07,
|
||
|
|
"loss": 0.14749345779418946,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2035,
|
||
|
|
"token_acc": 0.9511697728431429,
|
||
|
|
"train_speed(iter/s)": 0.12307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.508638562543193,
|
||
|
|
"grad_norm": 0.755736768245697,
|
||
|
|
"learning_rate": 6.459241749858619e-07,
|
||
|
|
"loss": 0.1365538001060486,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2040,
|
||
|
|
"token_acc": 0.94921875,
|
||
|
|
"train_speed(iter/s)": 0.123134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.508638562543193,
|
||
|
|
"eval_loss": 0.2726115584373474,
|
||
|
|
"eval_runtime": 29.6337,
|
||
|
|
"eval_samples_per_second": 17.75,
|
||
|
|
"eval_steps_per_second": 4.454,
|
||
|
|
"eval_token_acc": 0.9181870586539848,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5147815403516853,
|
||
|
|
"grad_norm": 0.7565985918045044,
|
||
|
|
"learning_rate": 6.301839131837284e-07,
|
||
|
|
"loss": 0.14346761703491212,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2045,
|
||
|
|
"token_acc": 0.9241052727438303,
|
||
|
|
"train_speed(iter/s)": 0.122918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5209245181601783,
|
||
|
|
"grad_norm": 0.8396884202957153,
|
||
|
|
"learning_rate": 6.146249012965349e-07,
|
||
|
|
"loss": 0.13507163524627686,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2050,
|
||
|
|
"token_acc": 0.9460710284016174,
|
||
|
|
"train_speed(iter/s)": 0.122992
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5270674959686708,
|
||
|
|
"grad_norm": 0.7319416999816895,
|
||
|
|
"learning_rate": 5.992477846762896e-07,
|
||
|
|
"loss": 0.13839869499206542,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2055,
|
||
|
|
"token_acc": 0.950456398185889,
|
||
|
|
"train_speed(iter/s)": 0.123056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5332104737771637,
|
||
|
|
"grad_norm": 0.7878042459487915,
|
||
|
|
"learning_rate": 5.840532011303996e-07,
|
||
|
|
"loss": 0.15459495782852173,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2060,
|
||
|
|
"token_acc": 0.9486288752039581,
|
||
|
|
"train_speed(iter/s)": 0.12312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5332104737771637,
|
||
|
|
"eval_loss": 0.27472230792045593,
|
||
|
|
"eval_runtime": 29.5056,
|
||
|
|
"eval_samples_per_second": 17.827,
|
||
|
|
"eval_steps_per_second": 4.474,
|
||
|
|
"eval_token_acc": 0.9183744055339386,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.539353451585656,
|
||
|
|
"grad_norm": 0.8421174883842468,
|
||
|
|
"learning_rate": 5.690417808952243e-07,
|
||
|
|
"loss": 0.15741729736328125,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2065,
|
||
|
|
"token_acc": 0.9253989855251763,
|
||
|
|
"train_speed(iter/s)": 0.122912
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5454964293941487,
|
||
|
|
"grad_norm": 0.8520276546478271,
|
||
|
|
"learning_rate": 5.542141466099271e-07,
|
||
|
|
"loss": 0.1434725046157837,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2070,
|
||
|
|
"token_acc": 0.9435738510115776,
|
||
|
|
"train_speed(iter/s)": 0.122999
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5516394072026416,
|
||
|
|
"grad_norm": 0.8779215216636658,
|
||
|
|
"learning_rate": 5.395709132906569e-07,
|
||
|
|
"loss": 0.13479983806610107,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2075,
|
||
|
|
"token_acc": 0.950151781434734,
|
||
|
|
"train_speed(iter/s)": 0.123082
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.557782385011134,
|
||
|
|
"grad_norm": 0.7466580867767334,
|
||
|
|
"learning_rate": 5.251126883050333e-07,
|
||
|
|
"loss": 0.13184006214141847,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2080,
|
||
|
|
"token_acc": 0.954845163930699,
|
||
|
|
"train_speed(iter/s)": 0.12315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.557782385011134,
|
||
|
|
"eval_loss": 0.27403029799461365,
|
||
|
|
"eval_runtime": 29.675,
|
||
|
|
"eval_samples_per_second": 17.725,
|
||
|
|
"eval_steps_per_second": 4.448,
|
||
|
|
"eval_token_acc": 0.9182735264447327,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.563925362819627,
|
||
|
|
"grad_norm": 0.9100626707077026,
|
||
|
|
"learning_rate": 5.108400713469547e-07,
|
||
|
|
"loss": 0.15517921447753907,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2085,
|
||
|
|
"token_acc": 0.9215290970418331,
|
||
|
|
"train_speed(iter/s)": 0.122941
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5700683406281195,
|
||
|
|
"grad_norm": 0.7965090870857239,
|
||
|
|
"learning_rate": 4.967536544117263e-07,
|
||
|
|
"loss": 0.1428399920463562,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2090,
|
||
|
|
"token_acc": 0.9566871852266369,
|
||
|
|
"train_speed(iter/s)": 0.123002
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.576211318436612,
|
||
|
|
"grad_norm": 0.8939210772514343,
|
||
|
|
"learning_rate": 4.828540217715067e-07,
|
||
|
|
"loss": 0.15376098155975343,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2095,
|
||
|
|
"token_acc": 0.9479303634355442,
|
||
|
|
"train_speed(iter/s)": 0.123068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.582354296245105,
|
||
|
|
"grad_norm": 0.825840950012207,
|
||
|
|
"learning_rate": 4.6914174995106863e-07,
|
||
|
|
"loss": 0.14222912788391112,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2100,
|
||
|
|
"token_acc": 0.9461206896551724,
|
||
|
|
"train_speed(iter/s)": 0.123144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.582354296245105,
|
||
|
|
"eval_loss": 0.27377504110336304,
|
||
|
|
"eval_runtime": 29.7092,
|
||
|
|
"eval_samples_per_second": 17.705,
|
||
|
|
"eval_steps_per_second": 4.443,
|
||
|
|
"eval_token_acc": 0.9185257241677475,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5884972740535974,
|
||
|
|
"grad_norm": 0.8890196681022644,
|
||
|
|
"learning_rate": 4.556174077038927e-07,
|
||
|
|
"loss": 0.14791591167449952,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2105,
|
||
|
|
"token_acc": 0.9235944439194935,
|
||
|
|
"train_speed(iter/s)": 0.122918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5946402518620904,
|
||
|
|
"grad_norm": 0.7061293721199036,
|
||
|
|
"learning_rate": 4.422815559885696e-07,
|
||
|
|
"loss": 0.13169264793395996,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2110,
|
||
|
|
"token_acc": 0.948949511019606,
|
||
|
|
"train_speed(iter/s)": 0.122982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.600783229670583,
|
||
|
|
"grad_norm": 0.8161597847938538,
|
||
|
|
"learning_rate": 4.2913474794554044e-07,
|
||
|
|
"loss": 0.1341610074043274,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2115,
|
||
|
|
"token_acc": 0.9497422680412371,
|
||
|
|
"train_speed(iter/s)": 0.123053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6069262074790753,
|
||
|
|
"grad_norm": 0.8730164766311646,
|
||
|
|
"learning_rate": 4.161775288741454e-07,
|
||
|
|
"loss": 0.15282490253448486,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2120,
|
||
|
|
"token_acc": 0.943151087595532,
|
||
|
|
"train_speed(iter/s)": 0.123121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6069262074790753,
|
||
|
|
"eval_loss": 0.2740446925163269,
|
||
|
|
"eval_runtime": 29.6992,
|
||
|
|
"eval_samples_per_second": 17.711,
|
||
|
|
"eval_steps_per_second": 4.445,
|
||
|
|
"eval_token_acc": 0.9185833693615795,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6130691852875683,
|
||
|
|
"grad_norm": 0.835308313369751,
|
||
|
|
"learning_rate": 4.034104362100155e-07,
|
||
|
|
"loss": 0.13589699268341066,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2125,
|
||
|
|
"token_acc": 0.9286285805728917,
|
||
|
|
"train_speed(iter/s)": 0.1229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6192121630960608,
|
||
|
|
"grad_norm": 0.8869427442550659,
|
||
|
|
"learning_rate": 3.9083399950277156e-07,
|
||
|
|
"loss": 0.14998774528503417,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2130,
|
||
|
|
"token_acc": 0.9448426301028358,
|
||
|
|
"train_speed(iter/s)": 0.122982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6253551409045537,
|
||
|
|
"grad_norm": 0.8314644694328308,
|
||
|
|
"learning_rate": 3.7844874039406677e-07,
|
||
|
|
"loss": 0.12793076038360596,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2135,
|
||
|
|
"token_acc": 0.9614247859763555,
|
||
|
|
"train_speed(iter/s)": 0.123051
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.631498118713046,
|
||
|
|
"grad_norm": 0.8190094828605652,
|
||
|
|
"learning_rate": 3.6625517259594566e-07,
|
||
|
|
"loss": 0.14857040643692015,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2140,
|
||
|
|
"token_acc": 0.9486269539501478,
|
||
|
|
"train_speed(iter/s)": 0.123129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.631498118713046,
|
||
|
|
"eval_loss": 0.27396196126937866,
|
||
|
|
"eval_runtime": 29.662,
|
||
|
|
"eval_samples_per_second": 17.733,
|
||
|
|
"eval_steps_per_second": 4.45,
|
||
|
|
"eval_token_acc": 0.9183023490416486,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6376410965215387,
|
||
|
|
"grad_norm": 0.8906332850456238,
|
||
|
|
"learning_rate": 3.5425380186953905e-07,
|
||
|
|
"loss": 0.15265541076660155,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2145,
|
||
|
|
"token_acc": 0.9210909443851305,
|
||
|
|
"train_speed(iter/s)": 0.122924
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6437840743300316,
|
||
|
|
"grad_norm": 0.8447383642196655,
|
||
|
|
"learning_rate": 3.424451260040862e-07,
|
||
|
|
"loss": 0.1445927381515503,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2150,
|
||
|
|
"token_acc": 0.951250271798217,
|
||
|
|
"train_speed(iter/s)": 0.123006
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.649927052138524,
|
||
|
|
"grad_norm": 0.8714754581451416,
|
||
|
|
"learning_rate": 3.3082963479628747e-07,
|
||
|
|
"loss": 0.15293993949890136,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2155,
|
||
|
|
"token_acc": 0.9348866900734121,
|
||
|
|
"train_speed(iter/s)": 0.123079
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.656070029947017,
|
||
|
|
"grad_norm": 0.7784111499786377,
|
||
|
|
"learning_rate": 3.194078100299863e-07,
|
||
|
|
"loss": 0.13703620433807373,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2160,
|
||
|
|
"token_acc": 0.955746644295302,
|
||
|
|
"train_speed(iter/s)": 0.123143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.656070029947017,
|
||
|
|
"eval_loss": 0.2733152210712433,
|
||
|
|
"eval_runtime": 29.7291,
|
||
|
|
"eval_samples_per_second": 17.693,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.9185329298169765,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6622130077555095,
|
||
|
|
"grad_norm": 0.8391521573066711,
|
||
|
|
"learning_rate": 3.0818012545618836e-07,
|
||
|
|
"loss": 0.13625545501708985,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2165,
|
||
|
|
"token_acc": 0.9254722933600564,
|
||
|
|
"train_speed(iter/s)": 0.12293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.668355985564002,
|
||
|
|
"grad_norm": 0.8696740865707397,
|
||
|
|
"learning_rate": 2.9714704677341055e-07,
|
||
|
|
"loss": 0.15032825469970704,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2170,
|
||
|
|
"token_acc": 0.9501612578109252,
|
||
|
|
"train_speed(iter/s)": 0.122998
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.674498963372495,
|
||
|
|
"grad_norm": 0.7542688250541687,
|
||
|
|
"learning_rate": 2.8630903160836776e-07,
|
||
|
|
"loss": 0.14371325969696044,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2175,
|
||
|
|
"token_acc": 0.9407879649589506,
|
||
|
|
"train_speed(iter/s)": 0.123074
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6806419411809874,
|
||
|
|
"grad_norm": 0.703037679195404,
|
||
|
|
"learning_rate": 2.756665294969868e-07,
|
||
|
|
"loss": 0.13071630001068116,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2180,
|
||
|
|
"token_acc": 0.9524260355029586,
|
||
|
|
"train_speed(iter/s)": 0.123154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6806419411809874,
|
||
|
|
"eval_loss": 0.2734775245189667,
|
||
|
|
"eval_runtime": 29.4649,
|
||
|
|
"eval_samples_per_second": 17.852,
|
||
|
|
"eval_steps_per_second": 4.48,
|
||
|
|
"eval_token_acc": 0.9185833693615795,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6867849189894804,
|
||
|
|
"grad_norm": 0.7227668166160583,
|
||
|
|
"learning_rate": 2.6521998186576357e-07,
|
||
|
|
"loss": 0.13176329135894777,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2185,
|
||
|
|
"token_acc": 0.9272463413354781,
|
||
|
|
"train_speed(iter/s)": 0.122934
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.692927896797973,
|
||
|
|
"grad_norm": 0.812665581703186,
|
||
|
|
"learning_rate": 2.549698220134517e-07,
|
||
|
|
"loss": 0.14241292476654052,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2190,
|
||
|
|
"token_acc": 0.9535809018567639,
|
||
|
|
"train_speed(iter/s)": 0.122998
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6990708746064653,
|
||
|
|
"grad_norm": 0.8766520023345947,
|
||
|
|
"learning_rate": 2.449164750930938e-07,
|
||
|
|
"loss": 0.14588472843170167,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2195,
|
||
|
|
"token_acc": 0.9549009533595936,
|
||
|
|
"train_speed(iter/s)": 0.123059
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7052138524149583,
|
||
|
|
"grad_norm": 0.9236011505126953,
|
||
|
|
"learning_rate": 2.3506035809438553e-07,
|
||
|
|
"loss": 0.1432283639907837,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2200,
|
||
|
|
"token_acc": 0.9595282766014473,
|
||
|
|
"train_speed(iter/s)": 0.123129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7052138524149583,
|
||
|
|
"eval_loss": 0.27359798550605774,
|
||
|
|
"eval_runtime": 29.6892,
|
||
|
|
"eval_samples_per_second": 17.717,
|
||
|
|
"eval_steps_per_second": 4.446,
|
||
|
|
"eval_token_acc": 0.9185833693615795,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7113568302234508,
|
||
|
|
"grad_norm": 0.7723605036735535,
|
||
|
|
"learning_rate": 2.2540187982637628e-07,
|
||
|
|
"loss": 0.1351910948753357,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2205,
|
||
|
|
"token_acc": 0.9280636513015653,
|
||
|
|
"train_speed(iter/s)": 0.122914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7174998080319437,
|
||
|
|
"grad_norm": 0.7751135230064392,
|
||
|
|
"learning_rate": 2.1594144090051728e-07,
|
||
|
|
"loss": 0.14594308137893677,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2210,
|
||
|
|
"token_acc": 0.9436165379373013,
|
||
|
|
"train_speed(iter/s)": 0.122972
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.723642785840436,
|
||
|
|
"grad_norm": 0.8251017928123474,
|
||
|
|
"learning_rate": 2.066794337140443e-07,
|
||
|
|
"loss": 0.13928499221801757,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2215,
|
||
|
|
"token_acc": 0.9523616048755713,
|
||
|
|
"train_speed(iter/s)": 0.123029
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7297857636489287,
|
||
|
|
"grad_norm": 0.805425763130188,
|
||
|
|
"learning_rate": 1.9761624243370026e-07,
|
||
|
|
"loss": 0.13413631916046143,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2220,
|
||
|
|
"token_acc": 0.9549077181208053,
|
||
|
|
"train_speed(iter/s)": 0.123085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7297857636489287,
|
||
|
|
"eval_loss": 0.27328255772590637,
|
||
|
|
"eval_runtime": 29.7048,
|
||
|
|
"eval_samples_per_second": 17.708,
|
||
|
|
"eval_steps_per_second": 4.444,
|
||
|
|
"eval_token_acc": 0.9185257241677475,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7359287414574216,
|
||
|
|
"grad_norm": 0.720260739326477,
|
||
|
|
"learning_rate": 1.8875224297980332e-07,
|
||
|
|
"loss": 0.14869468212127684,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2225,
|
||
|
|
"token_acc": 0.9208377041810281,
|
||
|
|
"train_speed(iter/s)": 0.122897
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.742071719265914,
|
||
|
|
"grad_norm": 0.7555059194564819,
|
||
|
|
"learning_rate": 1.800878030106501e-07,
|
||
|
|
"loss": 0.13696482181549072,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2230,
|
||
|
|
"token_acc": 0.951278626898155,
|
||
|
|
"train_speed(iter/s)": 0.122958
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.748214697074407,
|
||
|
|
"grad_norm": 0.9698434472084045,
|
||
|
|
"learning_rate": 1.7162328190727217e-07,
|
||
|
|
"loss": 0.16066057682037355,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2235,
|
||
|
|
"token_acc": 0.9414913717092833,
|
||
|
|
"train_speed(iter/s)": 0.123034
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7543576748828995,
|
||
|
|
"grad_norm": 0.9093776345252991,
|
||
|
|
"learning_rate": 1.6335903075852478e-07,
|
||
|
|
"loss": 0.13771231174468995,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2240,
|
||
|
|
"token_acc": 0.953091935104632,
|
||
|
|
"train_speed(iter/s)": 0.123095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7543576748828995,
|
||
|
|
"eval_loss": 0.2732333838939667,
|
||
|
|
"eval_runtime": 29.6848,
|
||
|
|
"eval_samples_per_second": 17.72,
|
||
|
|
"eval_steps_per_second": 4.447,
|
||
|
|
"eval_token_acc": 0.9184104337800836,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.760500652691392,
|
||
|
|
"grad_norm": 0.9624541997909546,
|
||
|
|
"learning_rate": 1.552953923465267e-07,
|
||
|
|
"loss": 0.1540065288543701,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2245,
|
||
|
|
"token_acc": 0.9251303793194012,
|
||
|
|
"train_speed(iter/s)": 0.122899
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.766643630499885,
|
||
|
|
"grad_norm": 0.6570599675178528,
|
||
|
|
"learning_rate": 1.4743270113244278e-07,
|
||
|
|
"loss": 0.11645562648773193,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2250,
|
||
|
|
"token_acc": 0.960995889387145,
|
||
|
|
"train_speed(iter/s)": 0.122957
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7727866083083774,
|
||
|
|
"grad_norm": 0.8584187030792236,
|
||
|
|
"learning_rate": 1.3977128324261068e-07,
|
||
|
|
"loss": 0.1433710813522339,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2255,
|
||
|
|
"token_acc": 0.9531076066790353,
|
||
|
|
"train_speed(iter/s)": 0.123031
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7789295861168704,
|
||
|
|
"grad_norm": 0.8224872350692749,
|
||
|
|
"learning_rate": 1.3231145645501153e-07,
|
||
|
|
"loss": 0.14238922595977782,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2260,
|
||
|
|
"token_acc": 0.9506010814215969,
|
||
|
|
"train_speed(iter/s)": 0.123085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7789295861168704,
|
||
|
|
"eval_loss": 0.2732416093349457,
|
||
|
|
"eval_runtime": 29.7221,
|
||
|
|
"eval_samples_per_second": 17.697,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9183599942354806,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.785072563925363,
|
||
|
|
"grad_norm": 0.9489020705223083,
|
||
|
|
"learning_rate": 1.2505353018609445e-07,
|
||
|
|
"loss": 0.14603989124298095,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2265,
|
||
|
|
"token_acc": 0.9227581508884137,
|
||
|
|
"train_speed(iter/s)": 0.122896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7912155417338553,
|
||
|
|
"grad_norm": 0.736284077167511,
|
||
|
|
"learning_rate": 1.1799780547793682e-07,
|
||
|
|
"loss": 0.14218697547912598,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2270,
|
||
|
|
"token_acc": 0.9579674123170395,
|
||
|
|
"train_speed(iter/s)": 0.122963
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7973585195423483,
|
||
|
|
"grad_norm": 0.7859813570976257,
|
||
|
|
"learning_rate": 1.111445749857626e-07,
|
||
|
|
"loss": 0.1413131594657898,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2275,
|
||
|
|
"token_acc": 0.9514263252470799,
|
||
|
|
"train_speed(iter/s)": 0.123032
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8035014973508408,
|
||
|
|
"grad_norm": 1.0802668333053589,
|
||
|
|
"learning_rate": 1.0449412296580252e-07,
|
||
|
|
"loss": 0.1472024917602539,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2280,
|
||
|
|
"token_acc": 0.9520010294685368,
|
||
|
|
"train_speed(iter/s)": 0.123092
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8035014973508408,
|
||
|
|
"eval_loss": 0.273179292678833,
|
||
|
|
"eval_runtime": 29.7252,
|
||
|
|
"eval_samples_per_second": 17.695,
|
||
|
|
"eval_steps_per_second": 4.441,
|
||
|
|
"eval_token_acc": 0.9184392563769995,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8096444751593337,
|
||
|
|
"grad_norm": 0.7576152682304382,
|
||
|
|
"learning_rate": 9.804672526349979e-08,
|
||
|
|
"loss": 0.14902775287628173,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2285,
|
||
|
|
"token_acc": 0.9238161925601751,
|
||
|
|
"train_speed(iter/s)": 0.122903
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.815787452967826,
|
||
|
|
"grad_norm": 0.8574935793876648,
|
||
|
|
"learning_rate": 9.180264930207405e-08,
|
||
|
|
"loss": 0.1530381441116333,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2290,
|
||
|
|
"token_acc": 0.9533666759284987,
|
||
|
|
"train_speed(iter/s)": 0.12298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8219304307763187,
|
||
|
|
"grad_norm": 0.7855550050735474,
|
||
|
|
"learning_rate": 8.576215407142652e-08,
|
||
|
|
"loss": 0.12575039863586426,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2295,
|
||
|
|
"token_acc": 0.9574297591025192,
|
||
|
|
"train_speed(iter/s)": 0.123043
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8280734085848116,
|
||
|
|
"grad_norm": 0.8387411236763,
|
||
|
|
"learning_rate": 7.992549011739903e-08,
|
||
|
|
"loss": 0.14488180875778198,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2300,
|
||
|
|
"token_acc": 0.9483278379651436,
|
||
|
|
"train_speed(iter/s)": 0.123101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8280734085848116,
|
||
|
|
"eval_loss": 0.2733011543750763,
|
||
|
|
"eval_runtime": 29.6296,
|
||
|
|
"eval_samples_per_second": 17.753,
|
||
|
|
"eval_steps_per_second": 4.455,
|
||
|
|
"eval_token_acc": 0.9184680789739156,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.834216386393304,
|
||
|
|
"grad_norm": 0.9076153039932251,
|
||
|
|
"learning_rate": 7.42928995313802e-08,
|
||
|
|
"loss": 0.14017899036407472,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2305,
|
||
|
|
"token_acc": 0.9183514619299471,
|
||
|
|
"train_speed(iter/s)": 0.122904
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.840359364201797,
|
||
|
|
"grad_norm": 0.8693546056747437,
|
||
|
|
"learning_rate": 6.886461594026394e-08,
|
||
|
|
"loss": 0.134627628326416,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2310,
|
||
|
|
"token_acc": 0.9542199129335768,
|
||
|
|
"train_speed(iter/s)": 0.122955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8465023420102895,
|
||
|
|
"grad_norm": 0.6884361505508423,
|
||
|
|
"learning_rate": 6.364086449676233e-08,
|
||
|
|
"loss": 0.11727933883666992,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2315,
|
||
|
|
"token_acc": 0.9606529928840519,
|
||
|
|
"train_speed(iter/s)": 0.123004
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.852645319818782,
|
||
|
|
"grad_norm": 0.8520733118057251,
|
||
|
|
"learning_rate": 5.862186187006347e-08,
|
||
|
|
"loss": 0.13235876560211182,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2320,
|
||
|
|
"token_acc": 0.9552718507276136,
|
||
|
|
"train_speed(iter/s)": 0.12307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.852645319818782,
|
||
|
|
"eval_loss": 0.2732396423816681,
|
||
|
|
"eval_runtime": 29.7321,
|
||
|
|
"eval_samples_per_second": 17.691,
|
||
|
|
"eval_steps_per_second": 4.44,
|
||
|
|
"eval_token_acc": 0.9184248450785416,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.858788297627275,
|
||
|
|
"grad_norm": 0.6756072640419006,
|
||
|
|
"learning_rate": 5.3807816236846614e-08,
|
||
|
|
"loss": 0.14616656303405762,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2325,
|
||
|
|
"token_acc": 0.9212030774597392,
|
||
|
|
"train_speed(iter/s)": 0.122879
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8649312754357674,
|
||
|
|
"grad_norm": 0.7540440559387207,
|
||
|
|
"learning_rate": 4.919892727264508e-08,
|
||
|
|
"loss": 0.1399930238723755,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2330,
|
||
|
|
"token_acc": 0.9455614286419997,
|
||
|
|
"train_speed(iter/s)": 0.122935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8710742532442604,
|
||
|
|
"grad_norm": 0.7465505599975586,
|
||
|
|
"learning_rate": 4.4795386143567375e-08,
|
||
|
|
"loss": 0.14697123765945436,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2335,
|
||
|
|
"token_acc": 0.9416914178521182,
|
||
|
|
"train_speed(iter/s)": 0.123005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.877217231052753,
|
||
|
|
"grad_norm": 0.8006975650787354,
|
||
|
|
"learning_rate": 4.0597375498365175e-08,
|
||
|
|
"loss": 0.14045066833496095,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2340,
|
||
|
|
"token_acc": 0.9556364912896573,
|
||
|
|
"train_speed(iter/s)": 0.123073
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.877217231052753,
|
||
|
|
"eval_loss": 0.2732827663421631,
|
||
|
|
"eval_runtime": 29.7807,
|
||
|
|
"eval_samples_per_second": 17.662,
|
||
|
|
"eval_steps_per_second": 4.432,
|
||
|
|
"eval_token_acc": 0.9184320507277706,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8833602088612453,
|
||
|
|
"grad_norm": 0.7655653357505798,
|
||
|
|
"learning_rate": 3.6605069460858286e-08,
|
||
|
|
"loss": 0.14170855283737183,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2345,
|
||
|
|
"token_acc": 0.9223746043924427,
|
||
|
|
"train_speed(iter/s)": 0.122886
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8895031866697383,
|
||
|
|
"grad_norm": 0.7868750691413879,
|
||
|
|
"learning_rate": 3.281863362271487e-08,
|
||
|
|
"loss": 0.13158297538757324,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2350,
|
||
|
|
"token_acc": 0.9610517504554631,
|
||
|
|
"train_speed(iter/s)": 0.122944
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8956461644782308,
|
||
|
|
"grad_norm": 0.9070404171943665,
|
||
|
|
"learning_rate": 2.9238225036579693e-08,
|
||
|
|
"loss": 0.13984733819961548,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2355,
|
||
|
|
"token_acc": 0.9568466078293356,
|
||
|
|
"train_speed(iter/s)": 0.123006
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9017891422867237,
|
||
|
|
"grad_norm": 0.826079249382019,
|
||
|
|
"learning_rate": 2.5863992209560484e-08,
|
||
|
|
"loss": 0.1394752025604248,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2360,
|
||
|
|
"token_acc": 0.9418652788455852,
|
||
|
|
"train_speed(iter/s)": 0.123076
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9017891422867237,
|
||
|
|
"eval_loss": 0.27347302436828613,
|
||
|
|
"eval_runtime": 29.7339,
|
||
|
|
"eval_samples_per_second": 17.69,
|
||
|
|
"eval_steps_per_second": 4.439,
|
||
|
|
"eval_token_acc": 0.9184464620262286,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.907932120095216,
|
||
|
|
"grad_norm": 0.9018839001655579,
|
||
|
|
"learning_rate": 2.269607509707006e-08,
|
||
|
|
"loss": 0.1596289873123169,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2365,
|
||
|
|
"token_acc": 0.9238603473227207,
|
||
|
|
"train_speed(iter/s)": 0.122868
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9140750979037087,
|
||
|
|
"grad_norm": 0.8631731271743774,
|
||
|
|
"learning_rate": 1.97346050970193e-08,
|
||
|
|
"loss": 0.1415793776512146,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2370,
|
||
|
|
"token_acc": 0.9460295790671217,
|
||
|
|
"train_speed(iter/s)": 0.122945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9202180757122016,
|
||
|
|
"grad_norm": 0.8872095942497253,
|
||
|
|
"learning_rate": 1.69797050443693e-08,
|
||
|
|
"loss": 0.13625437021255493,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2375,
|
||
|
|
"token_acc": 0.9583434245580044,
|
||
|
|
"train_speed(iter/s)": 0.123012
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.926361053520694,
|
||
|
|
"grad_norm": 0.8124271035194397,
|
||
|
|
"learning_rate": 1.4431489206034321e-08,
|
||
|
|
"loss": 0.14173973798751832,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2380,
|
||
|
|
"token_acc": 0.9510019878579488,
|
||
|
|
"train_speed(iter/s)": 0.123073
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.926361053520694,
|
||
|
|
"eval_loss": 0.27337023615837097,
|
||
|
|
"eval_runtime": 29.6435,
|
||
|
|
"eval_samples_per_second": 17.744,
|
||
|
|
"eval_steps_per_second": 4.453,
|
||
|
|
"eval_token_acc": 0.9184969015708315,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.932504031329187,
|
||
|
|
"grad_norm": 0.8951108455657959,
|
||
|
|
"learning_rate": 1.2090063276142261e-08,
|
||
|
|
"loss": 0.13466954231262207,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2385,
|
||
|
|
"token_acc": 0.9219987812309567,
|
||
|
|
"train_speed(iter/s)": 0.122895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9386470091376795,
|
||
|
|
"grad_norm": 0.9628083109855652,
|
||
|
|
"learning_rate": 9.955524371653146e-09,
|
||
|
|
"loss": 0.15039776563644408,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2390,
|
||
|
|
"token_acc": 0.9414807461204869,
|
||
|
|
"train_speed(iter/s)": 0.12296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.944789986946172,
|
||
|
|
"grad_norm": 0.8701411485671997,
|
||
|
|
"learning_rate": 8.02796102832848e-09,
|
||
|
|
"loss": 0.13970096111297609,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2395,
|
||
|
|
"token_acc": 0.9505520319473808,
|
||
|
|
"train_speed(iter/s)": 0.123027
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.950932964754665,
|
||
|
|
"grad_norm": 0.9468409419059753,
|
||
|
|
"learning_rate": 6.307453197059166e-09,
|
||
|
|
"loss": 0.14919402599334716,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2400,
|
||
|
|
"token_acc": 0.9412962147887324,
|
||
|
|
"train_speed(iter/s)": 0.12308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.950932964754665,
|
||
|
|
"eval_loss": 0.27350106835365295,
|
||
|
|
"eval_runtime": 29.668,
|
||
|
|
"eval_samples_per_second": 17.73,
|
||
|
|
"eval_steps_per_second": 4.449,
|
||
|
|
"eval_token_acc": 0.9183167603401067,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9570759425631574,
|
||
|
|
"grad_norm": 0.8505570292472839,
|
||
|
|
"learning_rate": 4.794072240550951e-09,
|
||
|
|
"loss": 0.1571817636489868,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2405,
|
||
|
|
"token_acc": 0.920852764823451,
|
||
|
|
"train_speed(iter/s)": 0.122896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9632189203716504,
|
||
|
|
"grad_norm": 0.7646933197975159,
|
||
|
|
"learning_rate": 3.487880930363452e-09,
|
||
|
|
"loss": 0.13370524644851683,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2410,
|
||
|
|
"token_acc": 0.9574316090263478,
|
||
|
|
"train_speed(iter/s)": 0.122964
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.969361898180143,
|
||
|
|
"grad_norm": 0.8396750092506409,
|
||
|
|
"learning_rate": 2.3889334443055743e-09,
|
||
|
|
"loss": 0.14388556480407716,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2415,
|
||
|
|
"token_acc": 0.9585525888390827,
|
||
|
|
"train_speed(iter/s)": 0.123037
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9755048759886353,
|
||
|
|
"grad_norm": 0.6261889338493347,
|
||
|
|
"learning_rate": 1.4972753641906424e-09,
|
||
|
|
"loss": 0.13296045064926149,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2420,
|
||
|
|
"token_acc": 0.9545391609359856,
|
||
|
|
"train_speed(iter/s)": 0.123089
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9755048759886353,
|
||
|
|
"eval_loss": 0.27353137731552124,
|
||
|
|
"eval_runtime": 29.6147,
|
||
|
|
"eval_samples_per_second": 17.761,
|
||
|
|
"eval_steps_per_second": 4.457,
|
||
|
|
"eval_token_acc": 0.9184320507277706,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9816478537971283,
|
||
|
|
"grad_norm": 0.7684817910194397,
|
||
|
|
"learning_rate": 8.12943673943467e-10,
|
||
|
|
"loss": 0.1473867416381836,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2425,
|
||
|
|
"token_acc": 0.9241485786940332,
|
||
|
|
"train_speed(iter/s)": 0.122905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9877908316056208,
|
||
|
|
"grad_norm": 0.7693585157394409,
|
||
|
|
"learning_rate": 3.359667580682402e-10,
|
||
|
|
"loss": 0.14533259868621826,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2430,
|
||
|
|
"token_acc": 0.9479254868755292,
|
||
|
|
"train_speed(iter/s)": 0.122963
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9939338094141137,
|
||
|
|
"grad_norm": 0.8009675145149231,
|
||
|
|
"learning_rate": 6.636440046892123e-11,
|
||
|
|
"loss": 0.12457112073898316,
|
||
|
|
"memory(GiB)": 31.97,
|
||
|
|
"step": 2435,
|
||
|
|
"token_acc": 0.9580943014806316,
|
||
|
|
"train_speed(iter/s)": 0.123019
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9988481916609078,
|
||
|
|
"eval_loss": 0.2735154330730438,
|
||
|
|
"eval_runtime": 29.6742,
|
||
|
|
"eval_samples_per_second": 17.726,
|
||
|
|
"eval_steps_per_second": 4.448,
|
||
|
|
"eval_token_acc": 0.9182735264447327,
|
||
|
|
"step": 2439
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 2439,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 3,
|
||
|
|
"save_steps": 20,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 2.2467967701619835e+18,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|