6153 lines
176 KiB
JSON
6153 lines
176 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": 9000,
|
||
|
|
"best_metric": 0.44325256,
|
||
|
|
"best_model_checkpoint": "/openpai_config/sft/Long_Cot_data/Stage1-380k-25k-length-Qwen2.5-Coder-7B-Instruct-8p-5e-5/v0-20250829-164426/checkpoint-9000",
|
||
|
|
"epoch": 5.150246834084568,
|
||
|
|
"eval_steps": 1000,
|
||
|
|
"global_step": 12000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.00042927666881305,
|
||
|
|
"grad_norm": 1.6802181005477905,
|
||
|
|
"learning_rate": 8.928571428571429e-08,
|
||
|
|
"loss": 0.9940392374992371,
|
||
|
|
"memory(GiB)": 47.57,
|
||
|
|
"step": 1,
|
||
|
|
"token_acc": 0.7531317395493966,
|
||
|
|
"train_speed(iter/s)": 0.017784
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008585533376261001,
|
||
|
|
"grad_norm": 0.8075768947601318,
|
||
|
|
"learning_rate": 1.7857142857142857e-06,
|
||
|
|
"loss": 0.9834547544780531,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 20,
|
||
|
|
"token_acc": 0.7346348107371886,
|
||
|
|
"train_speed(iter/s)": 0.071123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017171066752522002,
|
||
|
|
"grad_norm": 0.4582468867301941,
|
||
|
|
"learning_rate": 3.5714285714285714e-06,
|
||
|
|
"loss": 0.8647201538085938,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 40,
|
||
|
|
"token_acc": 0.7403293957393929,
|
||
|
|
"train_speed(iter/s)": 0.078614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025756600128783,
|
||
|
|
"grad_norm": 0.22638651728630066,
|
||
|
|
"learning_rate": 5.357142857142857e-06,
|
||
|
|
"loss": 0.756020975112915,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 60,
|
||
|
|
"token_acc": 0.7670503626615286,
|
||
|
|
"train_speed(iter/s)": 0.081671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.034342133505044004,
|
||
|
|
"grad_norm": 0.22832736372947693,
|
||
|
|
"learning_rate": 7.142857142857143e-06,
|
||
|
|
"loss": 0.6885409832000733,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 80,
|
||
|
|
"token_acc": 0.7765815619910137,
|
||
|
|
"train_speed(iter/s)": 0.083578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.042927666881305004,
|
||
|
|
"grad_norm": 0.1798371970653534,
|
||
|
|
"learning_rate": 8.92857142857143e-06,
|
||
|
|
"loss": 0.6466886043548584,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 100,
|
||
|
|
"token_acc": 0.7900423674902669,
|
||
|
|
"train_speed(iter/s)": 0.084818
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.051513200257566,
|
||
|
|
"grad_norm": 0.17644034326076508,
|
||
|
|
"learning_rate": 1.0714285714285714e-05,
|
||
|
|
"loss": 0.6123067378997803,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 120,
|
||
|
|
"token_acc": 0.7967146967110779,
|
||
|
|
"train_speed(iter/s)": 0.086053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.060098733633827,
|
||
|
|
"grad_norm": 0.20387020707130432,
|
||
|
|
"learning_rate": 1.25e-05,
|
||
|
|
"loss": 0.6003653049468994,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 140,
|
||
|
|
"token_acc": 0.8265060359403877,
|
||
|
|
"train_speed(iter/s)": 0.086654
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06868426701008801,
|
||
|
|
"grad_norm": 0.24960927665233612,
|
||
|
|
"learning_rate": 1.4285714285714285e-05,
|
||
|
|
"loss": 0.5757434368133545,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 160,
|
||
|
|
"token_acc": 0.7987813737966625,
|
||
|
|
"train_speed(iter/s)": 0.087161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07726980038634901,
|
||
|
|
"grad_norm": 0.2726881504058838,
|
||
|
|
"learning_rate": 1.6071428571428572e-05,
|
||
|
|
"loss": 0.5653277397155761,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 180,
|
||
|
|
"token_acc": 0.8027387420432055,
|
||
|
|
"train_speed(iter/s)": 0.087596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08585533376261001,
|
||
|
|
"grad_norm": 0.2119862288236618,
|
||
|
|
"learning_rate": 1.785714285714286e-05,
|
||
|
|
"loss": 0.5523943424224853,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 200,
|
||
|
|
"token_acc": 0.8231761512065608,
|
||
|
|
"train_speed(iter/s)": 0.087961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.094440867138871,
|
||
|
|
"grad_norm": 0.24396856129169464,
|
||
|
|
"learning_rate": 1.9642857142857145e-05,
|
||
|
|
"loss": 0.5498331546783447,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 220,
|
||
|
|
"token_acc": 0.7940026244174245,
|
||
|
|
"train_speed(iter/s)": 0.088292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.103026400515132,
|
||
|
|
"grad_norm": 0.2601749002933502,
|
||
|
|
"learning_rate": 2.1428571428571428e-05,
|
||
|
|
"loss": 0.5398545265197754,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 240,
|
||
|
|
"token_acc": 0.80989644710031,
|
||
|
|
"train_speed(iter/s)": 0.088502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.111611933891393,
|
||
|
|
"grad_norm": 0.42718759179115295,
|
||
|
|
"learning_rate": 2.3214285714285715e-05,
|
||
|
|
"loss": 0.5296700477600098,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 260,
|
||
|
|
"token_acc": 0.8186448573942751,
|
||
|
|
"train_speed(iter/s)": 0.088739
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.120197467267654,
|
||
|
|
"grad_norm": 0.2564183175563812,
|
||
|
|
"learning_rate": 2.5e-05,
|
||
|
|
"loss": 0.5314459323883056,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 280,
|
||
|
|
"token_acc": 0.8186711788362628,
|
||
|
|
"train_speed(iter/s)": 0.088928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.128783000643915,
|
||
|
|
"grad_norm": 0.42152953147888184,
|
||
|
|
"learning_rate": 2.6785714285714288e-05,
|
||
|
|
"loss": 0.5304059028625489,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 300,
|
||
|
|
"token_acc": 0.8313805341388459,
|
||
|
|
"train_speed(iter/s)": 0.089116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13736853402017601,
|
||
|
|
"grad_norm": 0.44018375873565674,
|
||
|
|
"learning_rate": 2.857142857142857e-05,
|
||
|
|
"loss": 0.5269341945648194,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 320,
|
||
|
|
"token_acc": 0.8261322879913329,
|
||
|
|
"train_speed(iter/s)": 0.089293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14595406739643701,
|
||
|
|
"grad_norm": 0.332704097032547,
|
||
|
|
"learning_rate": 3.0357142857142857e-05,
|
||
|
|
"loss": 0.5224681854248047,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 340,
|
||
|
|
"token_acc": 0.8305089071105363,
|
||
|
|
"train_speed(iter/s)": 0.08944
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15453960077269802,
|
||
|
|
"grad_norm": 0.2763151526451111,
|
||
|
|
"learning_rate": 3.2142857142857144e-05,
|
||
|
|
"loss": 0.5171589374542236,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 360,
|
||
|
|
"token_acc": 0.8298510336859191,
|
||
|
|
"train_speed(iter/s)": 0.08953
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16312513414895902,
|
||
|
|
"grad_norm": 0.49729594588279724,
|
||
|
|
"learning_rate": 3.392857142857143e-05,
|
||
|
|
"loss": 0.5136796474456787,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 380,
|
||
|
|
"token_acc": 0.8310140069023154,
|
||
|
|
"train_speed(iter/s)": 0.089637
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17171066752522002,
|
||
|
|
"grad_norm": 0.3252655267715454,
|
||
|
|
"learning_rate": 3.571428571428572e-05,
|
||
|
|
"loss": 0.5128469944000245,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 400,
|
||
|
|
"token_acc": 0.8536899287574551,
|
||
|
|
"train_speed(iter/s)": 0.089729
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.180296200901481,
|
||
|
|
"grad_norm": 0.28958284854888916,
|
||
|
|
"learning_rate": 3.7500000000000003e-05,
|
||
|
|
"loss": 0.5108192920684814,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 420,
|
||
|
|
"token_acc": 0.8385648117441578,
|
||
|
|
"train_speed(iter/s)": 0.089819
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.188881734277742,
|
||
|
|
"grad_norm": 0.34760820865631104,
|
||
|
|
"learning_rate": 3.928571428571429e-05,
|
||
|
|
"loss": 0.5059587478637695,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 440,
|
||
|
|
"token_acc": 0.8356636206879049,
|
||
|
|
"train_speed(iter/s)": 0.089855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.197467267654003,
|
||
|
|
"grad_norm": 0.41139236092567444,
|
||
|
|
"learning_rate": 4.107142857142857e-05,
|
||
|
|
"loss": 0.5062141418457031,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 460,
|
||
|
|
"token_acc": 0.8206318874596391,
|
||
|
|
"train_speed(iter/s)": 0.089919
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.206052801030264,
|
||
|
|
"grad_norm": 0.3865952789783478,
|
||
|
|
"learning_rate": 4.2857142857142856e-05,
|
||
|
|
"loss": 0.4976132869720459,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 480,
|
||
|
|
"token_acc": 0.809242185807305,
|
||
|
|
"train_speed(iter/s)": 0.090029
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.214638334406525,
|
||
|
|
"grad_norm": 0.34395724534988403,
|
||
|
|
"learning_rate": 4.464285714285715e-05,
|
||
|
|
"loss": 0.504787015914917,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 500,
|
||
|
|
"token_acc": 0.8331918132662932,
|
||
|
|
"train_speed(iter/s)": 0.090119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.223223867782786,
|
||
|
|
"grad_norm": 0.23087145388126373,
|
||
|
|
"learning_rate": 4.642857142857143e-05,
|
||
|
|
"loss": 0.49814434051513673,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 520,
|
||
|
|
"token_acc": 0.8304561567217256,
|
||
|
|
"train_speed(iter/s)": 0.0902
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.231809401159047,
|
||
|
|
"grad_norm": 0.3384479582309723,
|
||
|
|
"learning_rate": 4.8214285714285716e-05,
|
||
|
|
"loss": 0.49905076026916506,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 540,
|
||
|
|
"token_acc": 0.8417635120347525,
|
||
|
|
"train_speed(iter/s)": 0.090248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.240394934535308,
|
||
|
|
"grad_norm": 0.40263310074806213,
|
||
|
|
"learning_rate": 5e-05,
|
||
|
|
"loss": 0.4956265926361084,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 560,
|
||
|
|
"token_acc": 0.816079870788593,
|
||
|
|
"train_speed(iter/s)": 0.090319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.248980467911569,
|
||
|
|
"grad_norm": 0.30763953924179077,
|
||
|
|
"learning_rate": 4.999984903632473e-05,
|
||
|
|
"loss": 0.4967645168304443,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 580,
|
||
|
|
"token_acc": 0.8258691170614004,
|
||
|
|
"train_speed(iter/s)": 0.0903
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25756600128783,
|
||
|
|
"grad_norm": 0.28709837794303894,
|
||
|
|
"learning_rate": 4.999939614712212e-05,
|
||
|
|
"loss": 0.49540038108825685,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 600,
|
||
|
|
"token_acc": 0.8345701058201058,
|
||
|
|
"train_speed(iter/s)": 0.090345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.266151534664091,
|
||
|
|
"grad_norm": 0.27484264969825745,
|
||
|
|
"learning_rate": 4.999864133786175e-05,
|
||
|
|
"loss": 0.4913135051727295,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 620,
|
||
|
|
"token_acc": 0.8408849265417659,
|
||
|
|
"train_speed(iter/s)": 0.090402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27473706804035203,
|
||
|
|
"grad_norm": 0.275291383266449,
|
||
|
|
"learning_rate": 4.999758461765953e-05,
|
||
|
|
"loss": 0.4913851261138916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 640,
|
||
|
|
"token_acc": 0.823726404893571,
|
||
|
|
"train_speed(iter/s)": 0.090443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28332260141661303,
|
||
|
|
"grad_norm": 0.31161361932754517,
|
||
|
|
"learning_rate": 4.999622599927756e-05,
|
||
|
|
"loss": 0.48822855949401855,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 660,
|
||
|
|
"token_acc": 0.8308604661462827,
|
||
|
|
"train_speed(iter/s)": 0.090487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29190813479287403,
|
||
|
|
"grad_norm": 0.3709673285484314,
|
||
|
|
"learning_rate": 4.999456549912401e-05,
|
||
|
|
"loss": 0.486361026763916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 680,
|
||
|
|
"token_acc": 0.8271976771900934,
|
||
|
|
"train_speed(iter/s)": 0.090543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30049366816913503,
|
||
|
|
"grad_norm": 0.2165047973394394,
|
||
|
|
"learning_rate": 4.99926031372529e-05,
|
||
|
|
"loss": 0.48601832389831545,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 700,
|
||
|
|
"token_acc": 0.8387291407835747,
|
||
|
|
"train_speed(iter/s)": 0.090587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30907920154539603,
|
||
|
|
"grad_norm": 0.24446570873260498,
|
||
|
|
"learning_rate": 4.999033893736386e-05,
|
||
|
|
"loss": 0.48243279457092286,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 720,
|
||
|
|
"token_acc": 0.8372941834434668,
|
||
|
|
"train_speed(iter/s)": 0.090636
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31766473492165703,
|
||
|
|
"grad_norm": 0.24655242264270782,
|
||
|
|
"learning_rate": 4.998777292680182e-05,
|
||
|
|
"loss": 0.48319129943847655,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 740,
|
||
|
|
"token_acc": 0.8441225801781377,
|
||
|
|
"train_speed(iter/s)": 0.090658
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32625026829791803,
|
||
|
|
"grad_norm": 0.2514285445213318,
|
||
|
|
"learning_rate": 4.998490513655676e-05,
|
||
|
|
"loss": 0.47730517387390137,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 760,
|
||
|
|
"token_acc": 0.8397503992980168,
|
||
|
|
"train_speed(iter/s)": 0.090692
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33483580167417903,
|
||
|
|
"grad_norm": 0.2303766906261444,
|
||
|
|
"learning_rate": 4.998173560126323e-05,
|
||
|
|
"loss": 0.4783301830291748,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 780,
|
||
|
|
"token_acc": 0.8443087371876962,
|
||
|
|
"train_speed(iter/s)": 0.090725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34342133505044004,
|
||
|
|
"grad_norm": 0.2418110966682434,
|
||
|
|
"learning_rate": 4.997826435920003e-05,
|
||
|
|
"loss": 0.47623915672302247,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 800,
|
||
|
|
"token_acc": 0.8400687170332869,
|
||
|
|
"train_speed(iter/s)": 0.090766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35200686842670104,
|
||
|
|
"grad_norm": 0.24591697752475739,
|
||
|
|
"learning_rate": 4.9974491452289664e-05,
|
||
|
|
"loss": 0.47069730758666994,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 820,
|
||
|
|
"token_acc": 0.833947379545595,
|
||
|
|
"train_speed(iter/s)": 0.090805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.360592401802962,
|
||
|
|
"grad_norm": 0.17342260479927063,
|
||
|
|
"learning_rate": 4.9970416926097885e-05,
|
||
|
|
"loss": 0.47403693199157715,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 840,
|
||
|
|
"token_acc": 0.82827573574307,
|
||
|
|
"train_speed(iter/s)": 0.090837
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.369177935179223,
|
||
|
|
"grad_norm": 0.25668865442276,
|
||
|
|
"learning_rate": 4.9966040829833115e-05,
|
||
|
|
"loss": 0.4738626003265381,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 860,
|
||
|
|
"token_acc": 0.8238550967767793,
|
||
|
|
"train_speed(iter/s)": 0.090859
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.377763468555484,
|
||
|
|
"grad_norm": 0.23179244995117188,
|
||
|
|
"learning_rate": 4.99613632163459e-05,
|
||
|
|
"loss": 0.47292590141296387,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 880,
|
||
|
|
"token_acc": 0.8182959019634485,
|
||
|
|
"train_speed(iter/s)": 0.090879
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.386349001931745,
|
||
|
|
"grad_norm": 0.220433309674263,
|
||
|
|
"learning_rate": 4.995638414212821e-05,
|
||
|
|
"loss": 0.47188587188720704,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 900,
|
||
|
|
"token_acc": 0.8470956528576601,
|
||
|
|
"train_speed(iter/s)": 0.090882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.394934535308006,
|
||
|
|
"grad_norm": 0.18783436715602875,
|
||
|
|
"learning_rate": 4.9951103667312795e-05,
|
||
|
|
"loss": 0.46758122444152833,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 920,
|
||
|
|
"token_acc": 0.8408150854174393,
|
||
|
|
"train_speed(iter/s)": 0.090902
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.403520068684267,
|
||
|
|
"grad_norm": 0.19517077505588531,
|
||
|
|
"learning_rate": 4.994552185567244e-05,
|
||
|
|
"loss": 0.4659998893737793,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 940,
|
||
|
|
"token_acc": 0.8483965614563244,
|
||
|
|
"train_speed(iter/s)": 0.090926
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.412105602060528,
|
||
|
|
"grad_norm": 0.21663770079612732,
|
||
|
|
"learning_rate": 4.9939638774619216e-05,
|
||
|
|
"loss": 0.46530804634094236,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 960,
|
||
|
|
"token_acc": 0.8299559114387157,
|
||
|
|
"train_speed(iter/s)": 0.090953
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.420691135436789,
|
||
|
|
"grad_norm": 0.2215634137392044,
|
||
|
|
"learning_rate": 4.993345449520364e-05,
|
||
|
|
"loss": 0.46740241050720216,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 980,
|
||
|
|
"token_acc": 0.8444846788598264,
|
||
|
|
"train_speed(iter/s)": 0.090976
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42927666881305,
|
||
|
|
"grad_norm": 0.26028579473495483,
|
||
|
|
"learning_rate": 4.992696909211384e-05,
|
||
|
|
"loss": 0.4601090431213379,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1000,
|
||
|
|
"token_acc": 0.8435964299778611,
|
||
|
|
"train_speed(iter/s)": 0.091007
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42927666881305,
|
||
|
|
"eval_loss": 0.49964410066604614,
|
||
|
|
"eval_runtime": 68.8659,
|
||
|
|
"eval_samples_per_second": 54.657,
|
||
|
|
"eval_steps_per_second": 0.697,
|
||
|
|
"eval_token_acc": 0.8252659243990504,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.437862202189311,
|
||
|
|
"grad_norm": 0.1770976185798645,
|
||
|
|
"learning_rate": 4.992018264367464e-05,
|
||
|
|
"loss": 0.4663649082183838,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1020,
|
||
|
|
"token_acc": 0.8298894735758832,
|
||
|
|
"train_speed(iter/s)": 0.090155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.446447735565572,
|
||
|
|
"grad_norm": 0.19963237643241882,
|
||
|
|
"learning_rate": 4.991309523184661e-05,
|
||
|
|
"loss": 0.45961837768554686,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1040,
|
||
|
|
"token_acc": 0.8395059398856838,
|
||
|
|
"train_speed(iter/s)": 0.090109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.455033268941833,
|
||
|
|
"grad_norm": 0.16753822565078735,
|
||
|
|
"learning_rate": 4.9905706942225094e-05,
|
||
|
|
"loss": 0.4637617111206055,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1060,
|
||
|
|
"token_acc": 0.8323250084598088,
|
||
|
|
"train_speed(iter/s)": 0.090086
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.463618802318094,
|
||
|
|
"grad_norm": 0.17514312267303467,
|
||
|
|
"learning_rate": 4.989801786403916e-05,
|
||
|
|
"loss": 0.45838212966918945,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1080,
|
||
|
|
"token_acc": 0.8644555660931506,
|
||
|
|
"train_speed(iter/s)": 0.090071
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.472204335694355,
|
||
|
|
"grad_norm": 0.18766745924949646,
|
||
|
|
"learning_rate": 4.989002809015052e-05,
|
||
|
|
"loss": 0.46158289909362793,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1100,
|
||
|
|
"token_acc": 0.8342647441453608,
|
||
|
|
"train_speed(iter/s)": 0.090073
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.480789869070616,
|
||
|
|
"grad_norm": 0.16132639348506927,
|
||
|
|
"learning_rate": 4.9881737717052436e-05,
|
||
|
|
"loss": 0.4612901210784912,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1120,
|
||
|
|
"token_acc": 0.8357742084275915,
|
||
|
|
"train_speed(iter/s)": 0.090059
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.489375402446877,
|
||
|
|
"grad_norm": 0.2307191789150238,
|
||
|
|
"learning_rate": 4.987314684486852e-05,
|
||
|
|
"loss": 0.4583921432495117,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1140,
|
||
|
|
"token_acc": 0.8285798810251781,
|
||
|
|
"train_speed(iter/s)": 0.090074
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.497960935823138,
|
||
|
|
"grad_norm": 0.18384596705436707,
|
||
|
|
"learning_rate": 4.9864255577351534e-05,
|
||
|
|
"loss": 0.4601446151733398,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1160,
|
||
|
|
"token_acc": 0.8331488125236877,
|
||
|
|
"train_speed(iter/s)": 0.090082
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.506546469199399,
|
||
|
|
"grad_norm": 0.16498738527297974,
|
||
|
|
"learning_rate": 4.985506402188217e-05,
|
||
|
|
"loss": 0.46405863761901855,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1180,
|
||
|
|
"token_acc": 0.8514261702005886,
|
||
|
|
"train_speed(iter/s)": 0.090094
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.51513200257566,
|
||
|
|
"grad_norm": 0.20875471830368042,
|
||
|
|
"learning_rate": 4.98455722894677e-05,
|
||
|
|
"loss": 0.4559325695037842,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1200,
|
||
|
|
"token_acc": 0.8449181040663494,
|
||
|
|
"train_speed(iter/s)": 0.090127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.523717535951921,
|
||
|
|
"grad_norm": 0.20588186383247375,
|
||
|
|
"learning_rate": 4.9835780494740655e-05,
|
||
|
|
"loss": 0.4588587760925293,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1220,
|
||
|
|
"token_acc": 0.8452262520285315,
|
||
|
|
"train_speed(iter/s)": 0.090144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.532303069328182,
|
||
|
|
"grad_norm": 0.1740783005952835,
|
||
|
|
"learning_rate": 4.982568875595748e-05,
|
||
|
|
"loss": 0.4509147644042969,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1240,
|
||
|
|
"token_acc": 0.8587345890329355,
|
||
|
|
"train_speed(iter/s)": 0.090167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5408886027044431,
|
||
|
|
"grad_norm": 0.16246297955513,
|
||
|
|
"learning_rate": 4.981529719499704e-05,
|
||
|
|
"loss": 0.45482635498046875,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1260,
|
||
|
|
"token_acc": 0.8503446562311433,
|
||
|
|
"train_speed(iter/s)": 0.09019
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5494741360807041,
|
||
|
|
"grad_norm": 0.16924946010112762,
|
||
|
|
"learning_rate": 4.98046059373592e-05,
|
||
|
|
"loss": 0.45041213035583494,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1280,
|
||
|
|
"token_acc": 0.8577976623734301,
|
||
|
|
"train_speed(iter/s)": 0.090182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5580596694569651,
|
||
|
|
"grad_norm": 0.1474551260471344,
|
||
|
|
"learning_rate": 4.979361511216328e-05,
|
||
|
|
"loss": 0.4552830696105957,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1300,
|
||
|
|
"token_acc": 0.8588476242043739,
|
||
|
|
"train_speed(iter/s)": 0.090192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5666452028332261,
|
||
|
|
"grad_norm": 0.18833380937576294,
|
||
|
|
"learning_rate": 4.978232485214652e-05,
|
||
|
|
"loss": 0.45859723091125487,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1320,
|
||
|
|
"token_acc": 0.8404036444064411,
|
||
|
|
"train_speed(iter/s)": 0.090203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5752307362094871,
|
||
|
|
"grad_norm": 0.15271180868148804,
|
||
|
|
"learning_rate": 4.977073529366244e-05,
|
||
|
|
"loss": 0.45444612503051757,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1340,
|
||
|
|
"token_acc": 0.8709423088586175,
|
||
|
|
"train_speed(iter/s)": 0.090204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5838162695857481,
|
||
|
|
"grad_norm": 0.15060073137283325,
|
||
|
|
"learning_rate": 4.975884657667922e-05,
|
||
|
|
"loss": 0.44826583862304686,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1360,
|
||
|
|
"token_acc": 0.8445347567633144,
|
||
|
|
"train_speed(iter/s)": 0.090219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5924018029620091,
|
||
|
|
"grad_norm": 0.20435456931591034,
|
||
|
|
"learning_rate": 4.974665884477803e-05,
|
||
|
|
"loss": 0.4500474452972412,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1380,
|
||
|
|
"token_acc": 0.8436003043631144,
|
||
|
|
"train_speed(iter/s)": 0.090231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6009873363382701,
|
||
|
|
"grad_norm": 0.14778906106948853,
|
||
|
|
"learning_rate": 4.9734172245151256e-05,
|
||
|
|
"loss": 0.45103793144226073,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1400,
|
||
|
|
"token_acc": 0.83692786963815,
|
||
|
|
"train_speed(iter/s)": 0.090242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6095728697145311,
|
||
|
|
"grad_norm": 0.14574205875396729,
|
||
|
|
"learning_rate": 4.972138692860072e-05,
|
||
|
|
"loss": 0.445733642578125,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1420,
|
||
|
|
"token_acc": 0.8457673279623152,
|
||
|
|
"train_speed(iter/s)": 0.090256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6181584030907921,
|
||
|
|
"grad_norm": 0.16354091465473175,
|
||
|
|
"learning_rate": 4.97083030495359e-05,
|
||
|
|
"loss": 0.44748697280883787,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1440,
|
||
|
|
"token_acc": 0.8472576057582539,
|
||
|
|
"train_speed(iter/s)": 0.09027
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6267439364670531,
|
||
|
|
"grad_norm": 0.14656536281108856,
|
||
|
|
"learning_rate": 4.969492076597203e-05,
|
||
|
|
"loss": 0.44432525634765624,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1460,
|
||
|
|
"token_acc": 0.8350947008237803,
|
||
|
|
"train_speed(iter/s)": 0.090292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6353294698433141,
|
||
|
|
"grad_norm": 0.16932909190654755,
|
||
|
|
"learning_rate": 4.9681240239528216e-05,
|
||
|
|
"loss": 0.44748797416687014,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1480,
|
||
|
|
"token_acc": 0.8489820684323982,
|
||
|
|
"train_speed(iter/s)": 0.090307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6439150032195751,
|
||
|
|
"grad_norm": 0.16873878240585327,
|
||
|
|
"learning_rate": 4.9667261635425446e-05,
|
||
|
|
"loss": 0.4508372783660889,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1500,
|
||
|
|
"token_acc": 0.8508125264242287,
|
||
|
|
"train_speed(iter/s)": 0.090325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6525005365958361,
|
||
|
|
"grad_norm": 0.1554819792509079,
|
||
|
|
"learning_rate": 4.965298512248466e-05,
|
||
|
|
"loss": 0.4475415706634521,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1520,
|
||
|
|
"token_acc": 0.8513087716943568,
|
||
|
|
"train_speed(iter/s)": 0.090345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6610860699720971,
|
||
|
|
"grad_norm": 0.15099839866161346,
|
||
|
|
"learning_rate": 4.963841087312462e-05,
|
||
|
|
"loss": 0.44126238822937014,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1540,
|
||
|
|
"token_acc": 0.8473235774968391,
|
||
|
|
"train_speed(iter/s)": 0.090357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6696716033483581,
|
||
|
|
"grad_norm": 0.16528978943824768,
|
||
|
|
"learning_rate": 4.9623539063359925e-05,
|
||
|
|
"loss": 0.44157891273498534,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1560,
|
||
|
|
"token_acc": 0.8506024455489073,
|
||
|
|
"train_speed(iter/s)": 0.090379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6782571367246191,
|
||
|
|
"grad_norm": 0.1654183566570282,
|
||
|
|
"learning_rate": 4.9608369872798815e-05,
|
||
|
|
"loss": 0.4443850517272949,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1580,
|
||
|
|
"token_acc": 0.8580666295200214,
|
||
|
|
"train_speed(iter/s)": 0.090387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6868426701008801,
|
||
|
|
"grad_norm": 0.17317461967468262,
|
||
|
|
"learning_rate": 4.9592903484641026e-05,
|
||
|
|
"loss": 0.44514150619506837,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1600,
|
||
|
|
"token_acc": 0.8373144994303555,
|
||
|
|
"train_speed(iter/s)": 0.090402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6954282034771411,
|
||
|
|
"grad_norm": 0.14516599476337433,
|
||
|
|
"learning_rate": 4.9577140085675586e-05,
|
||
|
|
"loss": 0.4465588092803955,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1620,
|
||
|
|
"token_acc": 0.8457774631145212,
|
||
|
|
"train_speed(iter/s)": 0.090411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7040137368534021,
|
||
|
|
"grad_norm": 0.19526512920856476,
|
||
|
|
"learning_rate": 4.956107986627855e-05,
|
||
|
|
"loss": 0.44002666473388674,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1640,
|
||
|
|
"token_acc": 0.8571882184288229,
|
||
|
|
"train_speed(iter/s)": 0.090425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7125992702296631,
|
||
|
|
"grad_norm": 0.15198639035224915,
|
||
|
|
"learning_rate": 4.954472302041069e-05,
|
||
|
|
"loss": 0.4411801815032959,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1660,
|
||
|
|
"token_acc": 0.8389074986086463,
|
||
|
|
"train_speed(iter/s)": 0.090436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.721184803605924,
|
||
|
|
"grad_norm": 0.14642658829689026,
|
||
|
|
"learning_rate": 4.952806974561518e-05,
|
||
|
|
"loss": 0.4408212184906006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1680,
|
||
|
|
"token_acc": 0.8505627783277739,
|
||
|
|
"train_speed(iter/s)": 0.090445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.729770336982185,
|
||
|
|
"grad_norm": 0.15425585210323334,
|
||
|
|
"learning_rate": 4.951112024301517e-05,
|
||
|
|
"loss": 0.4436194896697998,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1700,
|
||
|
|
"token_acc": 0.8396897524541256,
|
||
|
|
"train_speed(iter/s)": 0.090457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.738355870358446,
|
||
|
|
"grad_norm": 0.1366390883922577,
|
||
|
|
"learning_rate": 4.9493874717311416e-05,
|
||
|
|
"loss": 0.4426912307739258,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1720,
|
||
|
|
"token_acc": 0.8376902006111753,
|
||
|
|
"train_speed(iter/s)": 0.090475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.746941403734707,
|
||
|
|
"grad_norm": 0.18443486094474792,
|
||
|
|
"learning_rate": 4.9476333376779746e-05,
|
||
|
|
"loss": 0.4428090572357178,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1740,
|
||
|
|
"token_acc": 0.8405056707361122,
|
||
|
|
"train_speed(iter/s)": 0.090496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.755526937110968,
|
||
|
|
"grad_norm": 0.16430367529392242,
|
||
|
|
"learning_rate": 4.945849643326857e-05,
|
||
|
|
"loss": 0.4388707637786865,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1760,
|
||
|
|
"token_acc": 0.8453185251787173,
|
||
|
|
"train_speed(iter/s)": 0.090513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.764112470487229,
|
||
|
|
"grad_norm": 0.16152745485305786,
|
||
|
|
"learning_rate": 4.9440364102196345e-05,
|
||
|
|
"loss": 0.43615312576293946,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1780,
|
||
|
|
"token_acc": 0.855409006002105,
|
||
|
|
"train_speed(iter/s)": 0.090532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.77269800386349,
|
||
|
|
"grad_norm": 0.18139781057834625,
|
||
|
|
"learning_rate": 4.942193660254892e-05,
|
||
|
|
"loss": 0.440519380569458,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1800,
|
||
|
|
"token_acc": 0.8458556213090118,
|
||
|
|
"train_speed(iter/s)": 0.09055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.781283537239751,
|
||
|
|
"grad_norm": 0.1560135781764984,
|
||
|
|
"learning_rate": 4.9403214156876966e-05,
|
||
|
|
"loss": 0.4351651191711426,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1820,
|
||
|
|
"token_acc": 0.844846138018734,
|
||
|
|
"train_speed(iter/s)": 0.090564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.789869070616012,
|
||
|
|
"grad_norm": 0.18113134801387787,
|
||
|
|
"learning_rate": 4.9384196991293205e-05,
|
||
|
|
"loss": 0.4427495002746582,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1840,
|
||
|
|
"token_acc": 0.8444957533319758,
|
||
|
|
"train_speed(iter/s)": 0.090581
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.798454603992273,
|
||
|
|
"grad_norm": 0.16674058139324188,
|
||
|
|
"learning_rate": 4.9364885335469734e-05,
|
||
|
|
"loss": 0.4387219429016113,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1860,
|
||
|
|
"token_acc": 0.862598161076389,
|
||
|
|
"train_speed(iter/s)": 0.090598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.807040137368534,
|
||
|
|
"grad_norm": 0.1326039731502533,
|
||
|
|
"learning_rate": 4.934527942263523e-05,
|
||
|
|
"loss": 0.4364177703857422,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1880,
|
||
|
|
"token_acc": 0.8337704981881752,
|
||
|
|
"train_speed(iter/s)": 0.090612
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.815625670744795,
|
||
|
|
"grad_norm": 0.15598100423812866,
|
||
|
|
"learning_rate": 4.9325379489572165e-05,
|
||
|
|
"loss": 0.4394540309906006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1900,
|
||
|
|
"token_acc": 0.8388467949805115,
|
||
|
|
"train_speed(iter/s)": 0.090628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.824211204121056,
|
||
|
|
"grad_norm": 0.19666017591953278,
|
||
|
|
"learning_rate": 4.930518577661388e-05,
|
||
|
|
"loss": 0.4369682788848877,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1920,
|
||
|
|
"token_acc": 0.8537222609570074,
|
||
|
|
"train_speed(iter/s)": 0.090641
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.832796737497317,
|
||
|
|
"grad_norm": 0.14630870521068573,
|
||
|
|
"learning_rate": 4.928469852764176e-05,
|
||
|
|
"loss": 0.43962607383728025,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1940,
|
||
|
|
"token_acc": 0.8393774787079826,
|
||
|
|
"train_speed(iter/s)": 0.090657
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.841382270873578,
|
||
|
|
"grad_norm": 0.1797455996274948,
|
||
|
|
"learning_rate": 4.926391799008223e-05,
|
||
|
|
"loss": 0.4379319190979004,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1960,
|
||
|
|
"token_acc": 0.843222227690404,
|
||
|
|
"train_speed(iter/s)": 0.090674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.849967804249839,
|
||
|
|
"grad_norm": 0.12199361622333527,
|
||
|
|
"learning_rate": 4.92428444149038e-05,
|
||
|
|
"loss": 0.4334880352020264,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 1980,
|
||
|
|
"token_acc": 0.8431945161599516,
|
||
|
|
"train_speed(iter/s)": 0.090689
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8585533376261,
|
||
|
|
"grad_norm": 0.14421170949935913,
|
||
|
|
"learning_rate": 4.922147805661402e-05,
|
||
|
|
"loss": 0.43396615982055664,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2000,
|
||
|
|
"token_acc": 0.8505196095201227,
|
||
|
|
"train_speed(iter/s)": 0.0907
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8585533376261,
|
||
|
|
"eval_loss": 0.470032662153244,
|
||
|
|
"eval_runtime": 68.4365,
|
||
|
|
"eval_samples_per_second": 55.0,
|
||
|
|
"eval_steps_per_second": 0.701,
|
||
|
|
"eval_token_acc": 0.8330788522432155,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.867138871002361,
|
||
|
|
"grad_norm": 0.14598308503627777,
|
||
|
|
"learning_rate": 4.91998191732564e-05,
|
||
|
|
"loss": 0.4354074001312256,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2020,
|
||
|
|
"token_acc": 0.8444459301633199,
|
||
|
|
"train_speed(iter/s)": 0.090271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.875724404378622,
|
||
|
|
"grad_norm": 0.14193296432495117,
|
||
|
|
"learning_rate": 4.917786802640732e-05,
|
||
|
|
"loss": 0.4282365322113037,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2040,
|
||
|
|
"token_acc": 0.851482400022546,
|
||
|
|
"train_speed(iter/s)": 0.090242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.884309937754883,
|
||
|
|
"grad_norm": 0.1344188153743744,
|
||
|
|
"learning_rate": 4.9155624881172834e-05,
|
||
|
|
"loss": 0.4284001350402832,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2060,
|
||
|
|
"token_acc": 0.8423048427291708,
|
||
|
|
"train_speed(iter/s)": 0.090228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.892895471131144,
|
||
|
|
"grad_norm": 0.19214758276939392,
|
||
|
|
"learning_rate": 4.91330900061855e-05,
|
||
|
|
"loss": 0.4374197483062744,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2080,
|
||
|
|
"token_acc": 0.838007610676071,
|
||
|
|
"train_speed(iter/s)": 0.09022
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.901481004507405,
|
||
|
|
"grad_norm": 0.14042872190475464,
|
||
|
|
"learning_rate": 4.911026367360114e-05,
|
||
|
|
"loss": 0.4368441104888916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2100,
|
||
|
|
"token_acc": 0.8546391628505924,
|
||
|
|
"train_speed(iter/s)": 0.09023
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.910066537883666,
|
||
|
|
"grad_norm": 0.12134739011526108,
|
||
|
|
"learning_rate": 4.90871461590955e-05,
|
||
|
|
"loss": 0.4329835414886475,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2120,
|
||
|
|
"token_acc": 0.8415444091274719,
|
||
|
|
"train_speed(iter/s)": 0.090231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.918652071259927,
|
||
|
|
"grad_norm": 0.13989004492759705,
|
||
|
|
"learning_rate": 4.906373774186097e-05,
|
||
|
|
"loss": 0.4377878665924072,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2140,
|
||
|
|
"token_acc": 0.848478083434529,
|
||
|
|
"train_speed(iter/s)": 0.090235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.927237604636188,
|
||
|
|
"grad_norm": 0.13958944380283356,
|
||
|
|
"learning_rate": 4.904003870460323e-05,
|
||
|
|
"loss": 0.4368983268737793,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2160,
|
||
|
|
"token_acc": 0.8715589150065507,
|
||
|
|
"train_speed(iter/s)": 0.090238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.935823138012449,
|
||
|
|
"grad_norm": 0.12047629058361053,
|
||
|
|
"learning_rate": 4.901604933353776e-05,
|
||
|
|
"loss": 0.432587194442749,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2180,
|
||
|
|
"token_acc": 0.8463879291216281,
|
||
|
|
"train_speed(iter/s)": 0.090247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.94440867138871,
|
||
|
|
"grad_norm": 0.19937904179096222,
|
||
|
|
"learning_rate": 4.899176991838646e-05,
|
||
|
|
"loss": 0.42923874855041505,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2200,
|
||
|
|
"token_acc": 0.8560462814584306,
|
||
|
|
"train_speed(iter/s)": 0.090255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.952994204764971,
|
||
|
|
"grad_norm": 0.13658791780471802,
|
||
|
|
"learning_rate": 4.896720075237411e-05,
|
||
|
|
"loss": 0.43826861381530763,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2220,
|
||
|
|
"token_acc": 0.8582506049536265,
|
||
|
|
"train_speed(iter/s)": 0.090257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.961579738141232,
|
||
|
|
"grad_norm": 0.1443174183368683,
|
||
|
|
"learning_rate": 4.894234213222484e-05,
|
||
|
|
"loss": 0.4363288879394531,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2240,
|
||
|
|
"token_acc": 0.8583811494758153,
|
||
|
|
"train_speed(iter/s)": 0.090269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.970165271517493,
|
||
|
|
"grad_norm": 0.1416754275560379,
|
||
|
|
"learning_rate": 4.8917194358158534e-05,
|
||
|
|
"loss": 0.43085694313049316,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2260,
|
||
|
|
"token_acc": 0.8524799246312664,
|
||
|
|
"train_speed(iter/s)": 0.090285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.978750804893754,
|
||
|
|
"grad_norm": 0.15419602394104004,
|
||
|
|
"learning_rate": 4.889175773388722e-05,
|
||
|
|
"loss": 0.42989211082458495,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2280,
|
||
|
|
"token_acc": 0.8570728938425664,
|
||
|
|
"train_speed(iter/s)": 0.090292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.987336338270015,
|
||
|
|
"grad_norm": 0.15600045025348663,
|
||
|
|
"learning_rate": 4.886603256661142e-05,
|
||
|
|
"loss": 0.43334760665893557,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2300,
|
||
|
|
"token_acc": 0.844059695609059,
|
||
|
|
"train_speed(iter/s)": 0.090301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.995921871646276,
|
||
|
|
"grad_norm": 0.1368878036737442,
|
||
|
|
"learning_rate": 4.884001916701639e-05,
|
||
|
|
"loss": 0.4333777904510498,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2320,
|
||
|
|
"token_acc": 0.841434785356969,
|
||
|
|
"train_speed(iter/s)": 0.090297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0042927666881305,
|
||
|
|
"grad_norm": 0.17282716929912567,
|
||
|
|
"learning_rate": 4.881371784926839e-05,
|
||
|
|
"loss": 0.42626185417175294,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2340,
|
||
|
|
"token_acc": 0.8547437072110268,
|
||
|
|
"train_speed(iter/s)": 0.090271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0128783000643915,
|
||
|
|
"grad_norm": 0.21046976745128632,
|
||
|
|
"learning_rate": 4.878712893101092e-05,
|
||
|
|
"loss": 0.40583181381225586,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2360,
|
||
|
|
"token_acc": 0.8494737944090475,
|
||
|
|
"train_speed(iter/s)": 0.09027
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0214638334406525,
|
||
|
|
"grad_norm": 0.1504330039024353,
|
||
|
|
"learning_rate": 4.8760252733360845e-05,
|
||
|
|
"loss": 0.40615053176879884,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2380,
|
||
|
|
"token_acc": 0.861498977359772,
|
||
|
|
"train_speed(iter/s)": 0.090271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0300493668169135,
|
||
|
|
"grad_norm": 0.13325518369674683,
|
||
|
|
"learning_rate": 4.8733089580904525e-05,
|
||
|
|
"loss": 0.4108716011047363,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2400,
|
||
|
|
"token_acc": 0.8607458709259072,
|
||
|
|
"train_speed(iter/s)": 0.090273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0386349001931745,
|
||
|
|
"grad_norm": 0.14907221496105194,
|
||
|
|
"learning_rate": 4.870563980169391e-05,
|
||
|
|
"loss": 0.4110468864440918,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2420,
|
||
|
|
"token_acc": 0.8597078066556821,
|
||
|
|
"train_speed(iter/s)": 0.090268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0472204335694355,
|
||
|
|
"grad_norm": 0.13383924961090088,
|
||
|
|
"learning_rate": 4.867790372724257e-05,
|
||
|
|
"loss": 0.4098019599914551,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2440,
|
||
|
|
"token_acc": 0.8552879722635879,
|
||
|
|
"train_speed(iter/s)": 0.090259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0558059669456965,
|
||
|
|
"grad_norm": 0.1269863396883011,
|
||
|
|
"learning_rate": 4.864988169252168e-05,
|
||
|
|
"loss": 0.40692687034606934,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2460,
|
||
|
|
"token_acc": 0.8569142548291154,
|
||
|
|
"train_speed(iter/s)": 0.090254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0643915003219575,
|
||
|
|
"grad_norm": 0.1471211463212967,
|
||
|
|
"learning_rate": 4.862157403595598e-05,
|
||
|
|
"loss": 0.4115363597869873,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2480,
|
||
|
|
"token_acc": 0.8509032023648785,
|
||
|
|
"train_speed(iter/s)": 0.090253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0729770336982185,
|
||
|
|
"grad_norm": 0.1170874610543251,
|
||
|
|
"learning_rate": 4.859298109941971e-05,
|
||
|
|
"loss": 0.40721793174743653,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2500,
|
||
|
|
"token_acc": 0.8535656636728612,
|
||
|
|
"train_speed(iter/s)": 0.09024
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0815625670744795,
|
||
|
|
"grad_norm": 0.15042538940906525,
|
||
|
|
"learning_rate": 4.8564103228232445e-05,
|
||
|
|
"loss": 0.4073436737060547,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2520,
|
||
|
|
"token_acc": 0.8541391331235382,
|
||
|
|
"train_speed(iter/s)": 0.090233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0901481004507405,
|
||
|
|
"grad_norm": 0.13396978378295898,
|
||
|
|
"learning_rate": 4.8534940771154954e-05,
|
||
|
|
"loss": 0.40180039405822754,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2540,
|
||
|
|
"token_acc": 0.8529722329553782,
|
||
|
|
"train_speed(iter/s)": 0.09023
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0987336338270015,
|
||
|
|
"grad_norm": 0.1457211673259735,
|
||
|
|
"learning_rate": 4.850549408038498e-05,
|
||
|
|
"loss": 0.4088040828704834,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2560,
|
||
|
|
"token_acc": 0.8557055478261985,
|
||
|
|
"train_speed(iter/s)": 0.090233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1073191672032625,
|
||
|
|
"grad_norm": 0.1382468044757843,
|
||
|
|
"learning_rate": 4.8475763511552965e-05,
|
||
|
|
"loss": 0.4087985515594482,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2580,
|
||
|
|
"token_acc": 0.8476997133289814,
|
||
|
|
"train_speed(iter/s)": 0.090235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1159047005795235,
|
||
|
|
"grad_norm": 0.13849055767059326,
|
||
|
|
"learning_rate": 4.844574942371779e-05,
|
||
|
|
"loss": 0.4051491737365723,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2600,
|
||
|
|
"token_acc": 0.8530841075229781,
|
||
|
|
"train_speed(iter/s)": 0.09023
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1244902339557845,
|
||
|
|
"grad_norm": 0.10844399780035019,
|
||
|
|
"learning_rate": 4.841545217936241e-05,
|
||
|
|
"loss": 0.40656099319458006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2620,
|
||
|
|
"token_acc": 0.8659740741451311,
|
||
|
|
"train_speed(iter/s)": 0.090231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1330757673320455,
|
||
|
|
"grad_norm": 0.14399060606956482,
|
||
|
|
"learning_rate": 4.838487214438951e-05,
|
||
|
|
"loss": 0.40219764709472655,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2640,
|
||
|
|
"token_acc": 0.8656865378871902,
|
||
|
|
"train_speed(iter/s)": 0.090232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1416613007083065,
|
||
|
|
"grad_norm": 0.15220606327056885,
|
||
|
|
"learning_rate": 4.8354009688117026e-05,
|
||
|
|
"loss": 0.409071159362793,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2660,
|
||
|
|
"token_acc": 0.8480521276805948,
|
||
|
|
"train_speed(iter/s)": 0.090228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1502468340845675,
|
||
|
|
"grad_norm": 0.13173505663871765,
|
||
|
|
"learning_rate": 4.832286518327376e-05,
|
||
|
|
"loss": 0.40669097900390627,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2680,
|
||
|
|
"token_acc": 0.8510178845290564,
|
||
|
|
"train_speed(iter/s)": 0.090198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1588323674608285,
|
||
|
|
"grad_norm": 0.13876375555992126,
|
||
|
|
"learning_rate": 4.829143900599481e-05,
|
||
|
|
"loss": 0.40750818252563475,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2700,
|
||
|
|
"token_acc": 0.8563185312128616,
|
||
|
|
"train_speed(iter/s)": 0.090196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1674179008370895,
|
||
|
|
"grad_norm": 0.1286059468984604,
|
||
|
|
"learning_rate": 4.825973153581709e-05,
|
||
|
|
"loss": 0.4104398250579834,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2720,
|
||
|
|
"token_acc": 0.8429334625658422,
|
||
|
|
"train_speed(iter/s)": 0.090194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1760034342133505,
|
||
|
|
"grad_norm": 0.11903152614831924,
|
||
|
|
"learning_rate": 4.8227743155674684e-05,
|
||
|
|
"loss": 0.405780553817749,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2740,
|
||
|
|
"token_acc": 0.8503315207488469,
|
||
|
|
"train_speed(iter/s)": 0.090196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1845889675896115,
|
||
|
|
"grad_norm": 0.13296058773994446,
|
||
|
|
"learning_rate": 4.819547425189429e-05,
|
||
|
|
"loss": 0.406817626953125,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2760,
|
||
|
|
"token_acc": 0.8561766559029692,
|
||
|
|
"train_speed(iter/s)": 0.090196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1931745009658725,
|
||
|
|
"grad_norm": 0.1934213489294052,
|
||
|
|
"learning_rate": 4.816292521419046e-05,
|
||
|
|
"loss": 0.40883073806762693,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2780,
|
||
|
|
"token_acc": 0.844781303243432,
|
||
|
|
"train_speed(iter/s)": 0.090191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2017600343421335,
|
||
|
|
"grad_norm": 0.14654423296451569,
|
||
|
|
"learning_rate": 4.813009643566101e-05,
|
||
|
|
"loss": 0.40619373321533203,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2800,
|
||
|
|
"token_acc": 0.8772289089291062,
|
||
|
|
"train_speed(iter/s)": 0.090194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2103455677183945,
|
||
|
|
"grad_norm": 0.15193308889865875,
|
||
|
|
"learning_rate": 4.8096988312782174e-05,
|
||
|
|
"loss": 0.41390376091003417,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2820,
|
||
|
|
"token_acc": 0.8615587932421312,
|
||
|
|
"train_speed(iter/s)": 0.089981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2189311010946555,
|
||
|
|
"grad_norm": 0.31674590706825256,
|
||
|
|
"learning_rate": 4.8063601245403864e-05,
|
||
|
|
"loss": 0.40833268165588377,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2840,
|
||
|
|
"token_acc": 0.8733467856737243,
|
||
|
|
"train_speed(iter/s)": 0.089985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2275166344709165,
|
||
|
|
"grad_norm": 0.14927241206169128,
|
||
|
|
"learning_rate": 4.802993563674483e-05,
|
||
|
|
"loss": 0.4076714038848877,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2860,
|
||
|
|
"token_acc": 0.854028153160118,
|
||
|
|
"train_speed(iter/s)": 0.089983
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2361021678471775,
|
||
|
|
"grad_norm": 0.12387314438819885,
|
||
|
|
"learning_rate": 4.7995991893387796e-05,
|
||
|
|
"loss": 0.4103559970855713,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2880,
|
||
|
|
"token_acc": 0.8473229063574101,
|
||
|
|
"train_speed(iter/s)": 0.089987
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2446877012234385,
|
||
|
|
"grad_norm": 0.12055594474077225,
|
||
|
|
"learning_rate": 4.7961770425274545e-05,
|
||
|
|
"loss": 0.4068136215209961,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2900,
|
||
|
|
"token_acc": 0.8558365116304547,
|
||
|
|
"train_speed(iter/s)": 0.089986
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2532732345996995,
|
||
|
|
"grad_norm": 0.15471091866493225,
|
||
|
|
"learning_rate": 4.7927271645700966e-05,
|
||
|
|
"loss": 0.40784463882446287,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2920,
|
||
|
|
"token_acc": 0.8654847024471946,
|
||
|
|
"train_speed(iter/s)": 0.089986
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2618587679759605,
|
||
|
|
"grad_norm": 0.14402052760124207,
|
||
|
|
"learning_rate": 4.789249597131205e-05,
|
||
|
|
"loss": 0.416036319732666,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2940,
|
||
|
|
"token_acc": 0.8460941475007567,
|
||
|
|
"train_speed(iter/s)": 0.089991
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2704443013522215,
|
||
|
|
"grad_norm": 0.12818260490894318,
|
||
|
|
"learning_rate": 4.7857443822096905e-05,
|
||
|
|
"loss": 0.4087369441986084,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2960,
|
||
|
|
"token_acc": 0.8485137361496985,
|
||
|
|
"train_speed(iter/s)": 0.089995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2790298347284825,
|
||
|
|
"grad_norm": 0.1295406073331833,
|
||
|
|
"learning_rate": 4.7822115621383626e-05,
|
||
|
|
"loss": 0.406325101852417,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 2980,
|
||
|
|
"token_acc": 0.844768784514136,
|
||
|
|
"train_speed(iter/s)": 0.089994
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2876153681047435,
|
||
|
|
"grad_norm": 0.12578845024108887,
|
||
|
|
"learning_rate": 4.77865117958342e-05,
|
||
|
|
"loss": 0.4075514316558838,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3000,
|
||
|
|
"token_acc": 0.8509595377960147,
|
||
|
|
"train_speed(iter/s)": 0.089997
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2876153681047435,
|
||
|
|
"eval_loss": 0.458536833524704,
|
||
|
|
"eval_runtime": 73.3656,
|
||
|
|
"eval_samples_per_second": 51.305,
|
||
|
|
"eval_steps_per_second": 0.654,
|
||
|
|
"eval_token_acc": 0.836097728930092,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2962009014810045,
|
||
|
|
"grad_norm": 0.14806412160396576,
|
||
|
|
"learning_rate": 4.7750632775439396e-05,
|
||
|
|
"loss": 0.4144165515899658,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3020,
|
||
|
|
"token_acc": 0.8443864646089783,
|
||
|
|
"train_speed(iter/s)": 0.089698
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3047864348572655,
|
||
|
|
"grad_norm": 0.12434946000576019,
|
||
|
|
"learning_rate": 4.771447899351351e-05,
|
||
|
|
"loss": 0.4105505466461182,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3040,
|
||
|
|
"token_acc": 0.8428811902693311,
|
||
|
|
"train_speed(iter/s)": 0.089674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3133719682335265,
|
||
|
|
"grad_norm": 0.13531598448753357,
|
||
|
|
"learning_rate": 4.767805088668916e-05,
|
||
|
|
"loss": 0.40719943046569823,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3060,
|
||
|
|
"token_acc": 0.8596681679947646,
|
||
|
|
"train_speed(iter/s)": 0.089654
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3219575016097875,
|
||
|
|
"grad_norm": 0.1139962449669838,
|
||
|
|
"learning_rate": 4.764134889491203e-05,
|
||
|
|
"loss": 0.41121878623962405,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3080,
|
||
|
|
"token_acc": 0.8527798587518215,
|
||
|
|
"train_speed(iter/s)": 0.089646
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3305430349860485,
|
||
|
|
"grad_norm": 0.12814205884933472,
|
||
|
|
"learning_rate": 4.760437346143551e-05,
|
||
|
|
"loss": 0.409865140914917,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3100,
|
||
|
|
"token_acc": 0.8476134603221164,
|
||
|
|
"train_speed(iter/s)": 0.089635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3391285683623095,
|
||
|
|
"grad_norm": 0.12462539970874786,
|
||
|
|
"learning_rate": 4.7567125032815394e-05,
|
||
|
|
"loss": 0.4104144096374512,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3120,
|
||
|
|
"token_acc": 0.8563240702901512,
|
||
|
|
"train_speed(iter/s)": 0.089626
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3477141017385705,
|
||
|
|
"grad_norm": 0.14763693511486053,
|
||
|
|
"learning_rate": 4.752960405890446e-05,
|
||
|
|
"loss": 0.4084192752838135,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3140,
|
||
|
|
"token_acc": 0.8617203660494134,
|
||
|
|
"train_speed(iter/s)": 0.08962
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3562996351148315,
|
||
|
|
"grad_norm": 0.1228506788611412,
|
||
|
|
"learning_rate": 4.749181099284703e-05,
|
||
|
|
"loss": 0.4092958927154541,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3160,
|
||
|
|
"token_acc": 0.8593913560568706,
|
||
|
|
"train_speed(iter/s)": 0.089613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3648851684910925,
|
||
|
|
"grad_norm": 0.13346746563911438,
|
||
|
|
"learning_rate": 4.745374629107352e-05,
|
||
|
|
"loss": 0.4028874397277832,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3180,
|
||
|
|
"token_acc": 0.8684066693278472,
|
||
|
|
"train_speed(iter/s)": 0.08961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3734707018673535,
|
||
|
|
"grad_norm": 0.11734752357006073,
|
||
|
|
"learning_rate": 4.7415410413294914e-05,
|
||
|
|
"loss": 0.40769195556640625,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3200,
|
||
|
|
"token_acc": 0.8596272472768909,
|
||
|
|
"train_speed(iter/s)": 0.089609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3820562352436145,
|
||
|
|
"grad_norm": 0.12647828459739685,
|
||
|
|
"learning_rate": 4.737680382249721e-05,
|
||
|
|
"loss": 0.40363130569458006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3220,
|
||
|
|
"token_acc": 0.8496863902084465,
|
||
|
|
"train_speed(iter/s)": 0.089608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3906417686198755,
|
||
|
|
"grad_norm": 0.1108260527253151,
|
||
|
|
"learning_rate": 4.733792698493584e-05,
|
||
|
|
"loss": 0.40738682746887206,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3240,
|
||
|
|
"token_acc": 0.8423361384211572,
|
||
|
|
"train_speed(iter/s)": 0.089609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3992273019961365,
|
||
|
|
"grad_norm": 0.12982375919818878,
|
||
|
|
"learning_rate": 4.7298780370130014e-05,
|
||
|
|
"loss": 0.4081905364990234,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3260,
|
||
|
|
"token_acc": 0.856714728114282,
|
||
|
|
"train_speed(iter/s)": 0.089608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4078128353723975,
|
||
|
|
"grad_norm": 0.11955548077821732,
|
||
|
|
"learning_rate": 4.7259364450857096e-05,
|
||
|
|
"loss": 0.405292272567749,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3280,
|
||
|
|
"token_acc": 0.8639308100087719,
|
||
|
|
"train_speed(iter/s)": 0.08961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4163983687486585,
|
||
|
|
"grad_norm": 0.13745689392089844,
|
||
|
|
"learning_rate": 4.721967970314684e-05,
|
||
|
|
"loss": 0.40678954124450684,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3300,
|
||
|
|
"token_acc": 0.8452769593980903,
|
||
|
|
"train_speed(iter/s)": 0.089609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4249839021249195,
|
||
|
|
"grad_norm": 0.12158916145563126,
|
||
|
|
"learning_rate": 4.717972660627567e-05,
|
||
|
|
"loss": 0.40230860710144045,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3320,
|
||
|
|
"token_acc": 0.8659293308755474,
|
||
|
|
"train_speed(iter/s)": 0.089614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4335694355011805,
|
||
|
|
"grad_norm": 0.14111177623271942,
|
||
|
|
"learning_rate": 4.713950564276091e-05,
|
||
|
|
"loss": 0.4016873359680176,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3340,
|
||
|
|
"token_acc": 0.8510812474231116,
|
||
|
|
"train_speed(iter/s)": 0.089592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4421549688774415,
|
||
|
|
"grad_norm": 0.10712361335754395,
|
||
|
|
"learning_rate": 4.70990172983549e-05,
|
||
|
|
"loss": 0.4058821201324463,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3360,
|
||
|
|
"token_acc": 0.8550924401373665,
|
||
|
|
"train_speed(iter/s)": 0.089592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4507405022537025,
|
||
|
|
"grad_norm": 0.11166644841432571,
|
||
|
|
"learning_rate": 4.705826206203918e-05,
|
||
|
|
"loss": 0.4066760540008545,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3380,
|
||
|
|
"token_acc": 0.8444937034366048,
|
||
|
|
"train_speed(iter/s)": 0.089586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4593260356299635,
|
||
|
|
"grad_norm": 0.14026156067848206,
|
||
|
|
"learning_rate": 4.701724042601859e-05,
|
||
|
|
"loss": 0.40719261169433596,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3400,
|
||
|
|
"token_acc": 0.8498371056241426,
|
||
|
|
"train_speed(iter/s)": 0.08959
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4679115690062245,
|
||
|
|
"grad_norm": 0.13125832378864288,
|
||
|
|
"learning_rate": 4.697595288571528e-05,
|
||
|
|
"loss": 0.4064974308013916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3420,
|
||
|
|
"token_acc": 0.8575960472975773,
|
||
|
|
"train_speed(iter/s)": 0.089593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4764971023824855,
|
||
|
|
"grad_norm": 0.12359972298145294,
|
||
|
|
"learning_rate": 4.6934399939762746e-05,
|
||
|
|
"loss": 0.4019315242767334,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3440,
|
||
|
|
"token_acc": 0.8573588526594907,
|
||
|
|
"train_speed(iter/s)": 0.089592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4850826357587465,
|
||
|
|
"grad_norm": 0.15697510540485382,
|
||
|
|
"learning_rate": 4.689258208999983e-05,
|
||
|
|
"loss": 0.4078845500946045,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3460,
|
||
|
|
"token_acc": 0.8560958939786878,
|
||
|
|
"train_speed(iter/s)": 0.089591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4936681691350076,
|
||
|
|
"grad_norm": 0.11863242089748383,
|
||
|
|
"learning_rate": 4.685049984146463e-05,
|
||
|
|
"loss": 0.4097602844238281,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3480,
|
||
|
|
"token_acc": 0.8628702144893777,
|
||
|
|
"train_speed(iter/s)": 0.08959
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5022537025112686,
|
||
|
|
"grad_norm": 0.11114250868558884,
|
||
|
|
"learning_rate": 4.680815370238843e-05,
|
||
|
|
"loss": 0.40899147987365725,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3500,
|
||
|
|
"token_acc": 0.8451921045701701,
|
||
|
|
"train_speed(iter/s)": 0.089584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5108392358875296,
|
||
|
|
"grad_norm": 0.1112656220793724,
|
||
|
|
"learning_rate": 4.676554418418953e-05,
|
||
|
|
"loss": 0.40816683769226075,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3520,
|
||
|
|
"token_acc": 0.8431806288233773,
|
||
|
|
"train_speed(iter/s)": 0.089584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5194247692637906,
|
||
|
|
"grad_norm": 0.11323296278715134,
|
||
|
|
"learning_rate": 4.6722671801467074e-05,
|
||
|
|
"loss": 0.4055006980895996,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3540,
|
||
|
|
"token_acc": 0.8815225166268434,
|
||
|
|
"train_speed(iter/s)": 0.089589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5280103026400516,
|
||
|
|
"grad_norm": 0.12150542438030243,
|
||
|
|
"learning_rate": 4.6679537071994874e-05,
|
||
|
|
"loss": 0.4004813194274902,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3560,
|
||
|
|
"token_acc": 0.8570034017657414,
|
||
|
|
"train_speed(iter/s)": 0.089589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5365958360163126,
|
||
|
|
"grad_norm": 0.12244880199432373,
|
||
|
|
"learning_rate": 4.6636140516715104e-05,
|
||
|
|
"loss": 0.4029510021209717,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3580,
|
||
|
|
"token_acc": 0.8517572914459227,
|
||
|
|
"train_speed(iter/s)": 0.089593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5451813693925736,
|
||
|
|
"grad_norm": 0.1206183210015297,
|
||
|
|
"learning_rate": 4.659248265973205e-05,
|
||
|
|
"loss": 0.40460500717163084,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3600,
|
||
|
|
"token_acc": 0.8554049462946347,
|
||
|
|
"train_speed(iter/s)": 0.089596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5537669027688346,
|
||
|
|
"grad_norm": 0.1283605545759201,
|
||
|
|
"learning_rate": 4.6548564028305746e-05,
|
||
|
|
"loss": 0.40555410385131835,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3620,
|
||
|
|
"token_acc": 0.8552409152003629,
|
||
|
|
"train_speed(iter/s)": 0.0896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5623524361450956,
|
||
|
|
"grad_norm": 0.10448771715164185,
|
||
|
|
"learning_rate": 4.650438515284564e-05,
|
||
|
|
"loss": 0.4010280132293701,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3640,
|
||
|
|
"token_acc": 0.8516997869926084,
|
||
|
|
"train_speed(iter/s)": 0.089603
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5709379695213566,
|
||
|
|
"grad_norm": 0.14749032258987427,
|
||
|
|
"learning_rate": 4.645994656690417e-05,
|
||
|
|
"loss": 0.4050903797149658,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3660,
|
||
|
|
"token_acc": 0.8502366458426844,
|
||
|
|
"train_speed(iter/s)": 0.0896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5795235028976176,
|
||
|
|
"grad_norm": 0.1269512176513672,
|
||
|
|
"learning_rate": 4.6415248807170296e-05,
|
||
|
|
"loss": 0.4045454502105713,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3680,
|
||
|
|
"token_acc": 0.8799187339606501,
|
||
|
|
"train_speed(iter/s)": 0.089583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5881090362738786,
|
||
|
|
"grad_norm": 0.11708427965641022,
|
||
|
|
"learning_rate": 4.637029241346309e-05,
|
||
|
|
"loss": 0.4028982162475586,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3700,
|
||
|
|
"token_acc": 0.8584745030316171,
|
||
|
|
"train_speed(iter/s)": 0.089582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5966945696501396,
|
||
|
|
"grad_norm": 0.12971659004688263,
|
||
|
|
"learning_rate": 4.632507792872513e-05,
|
||
|
|
"loss": 0.4027679920196533,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3720,
|
||
|
|
"token_acc": 0.8444115651659281,
|
||
|
|
"train_speed(iter/s)": 0.089587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6052801030264006,
|
||
|
|
"grad_norm": 0.1406456083059311,
|
||
|
|
"learning_rate": 4.6279605899016007e-05,
|
||
|
|
"loss": 0.4045069694519043,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3740,
|
||
|
|
"token_acc": 0.8620591654047942,
|
||
|
|
"train_speed(iter/s)": 0.089588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6138656364026616,
|
||
|
|
"grad_norm": 0.12651792168617249,
|
||
|
|
"learning_rate": 4.6233876873505694e-05,
|
||
|
|
"loss": 0.3987946271896362,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3760,
|
||
|
|
"token_acc": 0.8604080254900858,
|
||
|
|
"train_speed(iter/s)": 0.089587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6224511697789226,
|
||
|
|
"grad_norm": 0.1294124722480774,
|
||
|
|
"learning_rate": 4.618789140446793e-05,
|
||
|
|
"loss": 0.4040426254272461,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3780,
|
||
|
|
"token_acc": 0.8575430560407852,
|
||
|
|
"train_speed(iter/s)": 0.089588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6310367031551836,
|
||
|
|
"grad_norm": 0.13899479806423187,
|
||
|
|
"learning_rate": 4.614165004727356e-05,
|
||
|
|
"loss": 0.40485129356384275,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3800,
|
||
|
|
"token_acc": 0.8618784194621236,
|
||
|
|
"train_speed(iter/s)": 0.089589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6396222365314446,
|
||
|
|
"grad_norm": 0.11304246634244919,
|
||
|
|
"learning_rate": 4.609515336038379e-05,
|
||
|
|
"loss": 0.39697728157043455,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3820,
|
||
|
|
"token_acc": 0.8657167944284284,
|
||
|
|
"train_speed(iter/s)": 0.089593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6482077699077056,
|
||
|
|
"grad_norm": 0.10555765777826309,
|
||
|
|
"learning_rate": 4.604840190534349e-05,
|
||
|
|
"loss": 0.4016863346099854,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3840,
|
||
|
|
"token_acc": 0.8618964493040964,
|
||
|
|
"train_speed(iter/s)": 0.089597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6567933032839666,
|
||
|
|
"grad_norm": 0.10668028146028519,
|
||
|
|
"learning_rate": 4.600139624677436e-05,
|
||
|
|
"loss": 0.40195555686950685,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3860,
|
||
|
|
"token_acc": 0.8585640908572081,
|
||
|
|
"train_speed(iter/s)": 0.089599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6653788366602276,
|
||
|
|
"grad_norm": 0.11972223222255707,
|
||
|
|
"learning_rate": 4.5954136952368175e-05,
|
||
|
|
"loss": 0.404964542388916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3880,
|
||
|
|
"token_acc": 0.8751289644195156,
|
||
|
|
"train_speed(iter/s)": 0.089603
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6739643700364886,
|
||
|
|
"grad_norm": 0.1090841144323349,
|
||
|
|
"learning_rate": 4.590662459287987e-05,
|
||
|
|
"loss": 0.4025224208831787,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3900,
|
||
|
|
"token_acc": 0.8712011406091705,
|
||
|
|
"train_speed(iter/s)": 0.089608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6825499034127496,
|
||
|
|
"grad_norm": 0.09250445663928986,
|
||
|
|
"learning_rate": 4.585885974212068e-05,
|
||
|
|
"loss": 0.39822845458984374,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3920,
|
||
|
|
"token_acc": 0.8478179395649417,
|
||
|
|
"train_speed(iter/s)": 0.089608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6911354367890106,
|
||
|
|
"grad_norm": 0.12228672951459885,
|
||
|
|
"learning_rate": 4.58108429769512e-05,
|
||
|
|
"loss": 0.4002052307128906,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3940,
|
||
|
|
"token_acc": 0.8550839992606666,
|
||
|
|
"train_speed(iter/s)": 0.089611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6997209701652716,
|
||
|
|
"grad_norm": 0.11608360707759857,
|
||
|
|
"learning_rate": 4.576257487727442e-05,
|
||
|
|
"loss": 0.40276689529418946,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3960,
|
||
|
|
"token_acc": 0.8589090178774137,
|
||
|
|
"train_speed(iter/s)": 0.089614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7083065035415326,
|
||
|
|
"grad_norm": 0.10027152299880981,
|
||
|
|
"learning_rate": 4.571405602602871e-05,
|
||
|
|
"loss": 0.39651687145233155,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 3980,
|
||
|
|
"token_acc": 0.8630956830570248,
|
||
|
|
"train_speed(iter/s)": 0.089614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7168920369177934,
|
||
|
|
"grad_norm": 0.13469679653644562,
|
||
|
|
"learning_rate": 4.5665287009180796e-05,
|
||
|
|
"loss": 0.404406213760376,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4000,
|
||
|
|
"token_acc": 0.8562729568578561,
|
||
|
|
"train_speed(iter/s)": 0.089618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7168920369177934,
|
||
|
|
"eval_loss": 0.45004475116729736,
|
||
|
|
"eval_runtime": 69.5068,
|
||
|
|
"eval_samples_per_second": 54.153,
|
||
|
|
"eval_steps_per_second": 0.691,
|
||
|
|
"eval_token_acc": 0.838482459020931,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7254775702940544,
|
||
|
|
"grad_norm": 0.11884400248527527,
|
||
|
|
"learning_rate": 4.5616268415718686e-05,
|
||
|
|
"loss": 0.4039021968841553,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4020,
|
||
|
|
"token_acc": 0.8519779575146431,
|
||
|
|
"train_speed(iter/s)": 0.089391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7340631036703154,
|
||
|
|
"grad_norm": 0.11766815185546875,
|
||
|
|
"learning_rate": 4.5567000837644555e-05,
|
||
|
|
"loss": 0.40551328659057617,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4040,
|
||
|
|
"token_acc": 0.8603655792648116,
|
||
|
|
"train_speed(iter/s)": 0.089374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7426486370465764,
|
||
|
|
"grad_norm": 0.1035754606127739,
|
||
|
|
"learning_rate": 4.551748486996755e-05,
|
||
|
|
"loss": 0.3972191333770752,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4060,
|
||
|
|
"token_acc": 0.8441598716065328,
|
||
|
|
"train_speed(iter/s)": 0.08936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7512341704228374,
|
||
|
|
"grad_norm": 0.11534030735492706,
|
||
|
|
"learning_rate": 4.5467721110696685e-05,
|
||
|
|
"loss": 0.39623782634735105,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4080,
|
||
|
|
"token_acc": 0.8508078067985404,
|
||
|
|
"train_speed(iter/s)": 0.089346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7598197037990984,
|
||
|
|
"grad_norm": 0.11770807206630707,
|
||
|
|
"learning_rate": 4.541771016083356e-05,
|
||
|
|
"loss": 0.4031228542327881,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4100,
|
||
|
|
"token_acc": 0.8575402257628572,
|
||
|
|
"train_speed(iter/s)": 0.089337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7684052371753594,
|
||
|
|
"grad_norm": 0.11031018942594528,
|
||
|
|
"learning_rate": 4.5367452624365107e-05,
|
||
|
|
"loss": 0.39590916633605955,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4120,
|
||
|
|
"token_acc": 0.8493938383274198,
|
||
|
|
"train_speed(iter/s)": 0.089333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7769907705516204,
|
||
|
|
"grad_norm": 0.12101167440414429,
|
||
|
|
"learning_rate": 4.531694910825632e-05,
|
||
|
|
"loss": 0.4022487163543701,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4140,
|
||
|
|
"token_acc": 0.8616033848286162,
|
||
|
|
"train_speed(iter/s)": 0.08933
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7855763039278814,
|
||
|
|
"grad_norm": 0.12361987680196762,
|
||
|
|
"learning_rate": 4.526620022244293e-05,
|
||
|
|
"loss": 0.3952162265777588,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4160,
|
||
|
|
"token_acc": 0.8546911728976807,
|
||
|
|
"train_speed(iter/s)": 0.089329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7941618373041424,
|
||
|
|
"grad_norm": 0.11886027455329895,
|
||
|
|
"learning_rate": 4.521520657982399e-05,
|
||
|
|
"loss": 0.3967653751373291,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4180,
|
||
|
|
"token_acc": 0.850109229842917,
|
||
|
|
"train_speed(iter/s)": 0.089327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8027473706804034,
|
||
|
|
"grad_norm": 0.10228098928928375,
|
||
|
|
"learning_rate": 4.516396879625451e-05,
|
||
|
|
"loss": 0.3982940435409546,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4200,
|
||
|
|
"token_acc": 0.8674571957241461,
|
||
|
|
"train_speed(iter/s)": 0.089324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8113329040566644,
|
||
|
|
"grad_norm": 0.13192002475261688,
|
||
|
|
"learning_rate": 4.5112487490538033e-05,
|
||
|
|
"loss": 0.4016000747680664,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4220,
|
||
|
|
"token_acc": 0.8583699143774935,
|
||
|
|
"train_speed(iter/s)": 0.089324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8199184374329254,
|
||
|
|
"grad_norm": 0.13863115012645721,
|
||
|
|
"learning_rate": 4.5060763284419114e-05,
|
||
|
|
"loss": 0.3993339538574219,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4240,
|
||
|
|
"token_acc": 0.8529649884386873,
|
||
|
|
"train_speed(iter/s)": 0.089327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8285039708091864,
|
||
|
|
"grad_norm": 0.1052585169672966,
|
||
|
|
"learning_rate": 4.500879680257587e-05,
|
||
|
|
"loss": 0.39501266479492186,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4260,
|
||
|
|
"token_acc": 0.8490466163025552,
|
||
|
|
"train_speed(iter/s)": 0.089326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8370895041854474,
|
||
|
|
"grad_norm": 0.11824264377355576,
|
||
|
|
"learning_rate": 4.495658867261237e-05,
|
||
|
|
"loss": 0.3999388933181763,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4280,
|
||
|
|
"token_acc": 0.8604835011176714,
|
||
|
|
"train_speed(iter/s)": 0.08933
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8456750375617084,
|
||
|
|
"grad_norm": 0.10404901951551437,
|
||
|
|
"learning_rate": 4.490413952505113e-05,
|
||
|
|
"loss": 0.399350905418396,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4300,
|
||
|
|
"token_acc": 0.8754184479751959,
|
||
|
|
"train_speed(iter/s)": 0.089333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8542605709379694,
|
||
|
|
"grad_norm": 0.11935856193304062,
|
||
|
|
"learning_rate": 4.485144999332541e-05,
|
||
|
|
"loss": 0.3988263845443726,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4320,
|
||
|
|
"token_acc": 0.8642416058331645,
|
||
|
|
"train_speed(iter/s)": 0.089334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8628461043142304,
|
||
|
|
"grad_norm": 0.12025253474712372,
|
||
|
|
"learning_rate": 4.4798520713771655e-05,
|
||
|
|
"loss": 0.3969618320465088,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4340,
|
||
|
|
"token_acc": 0.8592759073410623,
|
||
|
|
"train_speed(iter/s)": 0.089324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8714316376904914,
|
||
|
|
"grad_norm": 0.10460798442363739,
|
||
|
|
"learning_rate": 4.474535232562176e-05,
|
||
|
|
"loss": 0.4043170928955078,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4360,
|
||
|
|
"token_acc": 0.852819602922532,
|
||
|
|
"train_speed(iter/s)": 0.089327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8800171710667524,
|
||
|
|
"grad_norm": 0.10020267218351364,
|
||
|
|
"learning_rate": 4.469194547099532e-05,
|
||
|
|
"loss": 0.3999593734741211,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4380,
|
||
|
|
"token_acc": 0.8611075959033526,
|
||
|
|
"train_speed(iter/s)": 0.089328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8886027044430134,
|
||
|
|
"grad_norm": 0.12959228456020355,
|
||
|
|
"learning_rate": 4.463830079489196e-05,
|
||
|
|
"loss": 0.39733612537384033,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4400,
|
||
|
|
"token_acc": 0.8531978711946401,
|
||
|
|
"train_speed(iter/s)": 0.089335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8971882378192744,
|
||
|
|
"grad_norm": 0.11115922778844833,
|
||
|
|
"learning_rate": 4.458441894518348e-05,
|
||
|
|
"loss": 0.4049359321594238,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4420,
|
||
|
|
"token_acc": 0.8702030459301568,
|
||
|
|
"train_speed(iter/s)": 0.089338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9057737711955354,
|
||
|
|
"grad_norm": 0.10734923928976059,
|
||
|
|
"learning_rate": 4.453030057260604e-05,
|
||
|
|
"loss": 0.40124940872192383,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4440,
|
||
|
|
"token_acc": 0.8526137694097369,
|
||
|
|
"train_speed(iter/s)": 0.089343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9143593045717964,
|
||
|
|
"grad_norm": 0.10538238286972046,
|
||
|
|
"learning_rate": 4.44759463307523e-05,
|
||
|
|
"loss": 0.3986711263656616,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4460,
|
||
|
|
"token_acc": 0.8580899206582427,
|
||
|
|
"train_speed(iter/s)": 0.089347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9229448379480574,
|
||
|
|
"grad_norm": 0.11792416125535965,
|
||
|
|
"learning_rate": 4.4421356876063566e-05,
|
||
|
|
"loss": 0.4009650707244873,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4480,
|
||
|
|
"token_acc": 0.8415756258347672,
|
||
|
|
"train_speed(iter/s)": 0.089351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9315303713243184,
|
||
|
|
"grad_norm": 0.10540692508220673,
|
||
|
|
"learning_rate": 4.4366532867821816e-05,
|
||
|
|
"loss": 0.40032110214233396,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4500,
|
||
|
|
"token_acc": 0.8645283673549553,
|
||
|
|
"train_speed(iter/s)": 0.089356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9401159047005794,
|
||
|
|
"grad_norm": 0.10806146264076233,
|
||
|
|
"learning_rate": 4.4311474968141745e-05,
|
||
|
|
"loss": 0.4047665596008301,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4520,
|
||
|
|
"token_acc": 0.8665738751278136,
|
||
|
|
"train_speed(iter/s)": 0.089358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9487014380768404,
|
||
|
|
"grad_norm": 0.0982556939125061,
|
||
|
|
"learning_rate": 4.4256183841962776e-05,
|
||
|
|
"loss": 0.39951965808868406,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4540,
|
||
|
|
"token_acc": 0.8557438649716252,
|
||
|
|
"train_speed(iter/s)": 0.08936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9572869714531014,
|
||
|
|
"grad_norm": 0.11462666094303131,
|
||
|
|
"learning_rate": 4.420066015704105e-05,
|
||
|
|
"loss": 0.39820613861083987,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4560,
|
||
|
|
"token_acc": 0.851616577376715,
|
||
|
|
"train_speed(iter/s)": 0.089365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9658725048293624,
|
||
|
|
"grad_norm": 0.12274167686700821,
|
||
|
|
"learning_rate": 4.414490458394134e-05,
|
||
|
|
"loss": 0.39962952136993407,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4580,
|
||
|
|
"token_acc": 0.8450544293089454,
|
||
|
|
"train_speed(iter/s)": 0.089369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9744580382056234,
|
||
|
|
"grad_norm": 0.11052652448415756,
|
||
|
|
"learning_rate": 4.408891779602892e-05,
|
||
|
|
"loss": 0.40143113136291503,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4600,
|
||
|
|
"token_acc": 0.8466183479919549,
|
||
|
|
"train_speed(iter/s)": 0.089369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9830435715818844,
|
||
|
|
"grad_norm": 0.11736435443162918,
|
||
|
|
"learning_rate": 4.403270046946151e-05,
|
||
|
|
"loss": 0.39746062755584716,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4620,
|
||
|
|
"token_acc": 0.8545920867275066,
|
||
|
|
"train_speed(iter/s)": 0.08937
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9916291049581454,
|
||
|
|
"grad_norm": 0.09831462055444717,
|
||
|
|
"learning_rate": 4.397625328318104e-05,
|
||
|
|
"loss": 0.40285186767578124,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4640,
|
||
|
|
"token_acc": 0.8588040292883812,
|
||
|
|
"train_speed(iter/s)": 0.089375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 0.1868225783109665,
|
||
|
|
"learning_rate": 4.3919576918905495e-05,
|
||
|
|
"loss": 0.40441222190856935,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4660,
|
||
|
|
"token_acc": 0.8483147592149679,
|
||
|
|
"train_speed(iter/s)": 0.089388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.008585533376261,
|
||
|
|
"grad_norm": 0.1071023941040039,
|
||
|
|
"learning_rate": 4.3862672061120637e-05,
|
||
|
|
"loss": 0.3615531921386719,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4680,
|
||
|
|
"token_acc": 0.8768156740901892,
|
||
|
|
"train_speed(iter/s)": 0.089352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.017171066752522,
|
||
|
|
"grad_norm": 0.10470844805240631,
|
||
|
|
"learning_rate": 4.3805539397071806e-05,
|
||
|
|
"loss": 0.36854674816131594,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4700,
|
||
|
|
"token_acc": 0.8658165567867299,
|
||
|
|
"train_speed(iter/s)": 0.089356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.025756600128783,
|
||
|
|
"grad_norm": 0.11344057321548462,
|
||
|
|
"learning_rate": 4.374817961675553e-05,
|
||
|
|
"loss": 0.36517815589904784,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4720,
|
||
|
|
"token_acc": 0.8573345434699361,
|
||
|
|
"train_speed(iter/s)": 0.08936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.034342133505044,
|
||
|
|
"grad_norm": 0.12088248133659363,
|
||
|
|
"learning_rate": 4.369059341291131e-05,
|
||
|
|
"loss": 0.3732161045074463,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4740,
|
||
|
|
"token_acc": 0.8643586935864834,
|
||
|
|
"train_speed(iter/s)": 0.089232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.042927666881305,
|
||
|
|
"grad_norm": 0.1079382449388504,
|
||
|
|
"learning_rate": 4.3632781481013105e-05,
|
||
|
|
"loss": 0.3706186294555664,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4760,
|
||
|
|
"token_acc": 0.8583230735096428,
|
||
|
|
"train_speed(iter/s)": 0.089236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.051513200257566,
|
||
|
|
"grad_norm": 0.10861940681934357,
|
||
|
|
"learning_rate": 4.357474451926107e-05,
|
||
|
|
"loss": 0.36578049659729006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4780,
|
||
|
|
"token_acc": 0.8634094633238114,
|
||
|
|
"train_speed(iter/s)": 0.089242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.060098733633827,
|
||
|
|
"grad_norm": 0.10976995527744293,
|
||
|
|
"learning_rate": 4.351648322857304e-05,
|
||
|
|
"loss": 0.3717454671859741,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4800,
|
||
|
|
"token_acc": 0.8754387101732538,
|
||
|
|
"train_speed(iter/s)": 0.089245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.068684267010088,
|
||
|
|
"grad_norm": 0.11381576955318451,
|
||
|
|
"learning_rate": 4.345799831257612e-05,
|
||
|
|
"loss": 0.3690098524093628,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4820,
|
||
|
|
"token_acc": 0.8754805492942517,
|
||
|
|
"train_speed(iter/s)": 0.089251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.077269800386349,
|
||
|
|
"grad_norm": 0.11746755242347717,
|
||
|
|
"learning_rate": 4.339929047759812e-05,
|
||
|
|
"loss": 0.3719310760498047,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4840,
|
||
|
|
"token_acc": 0.8569852569604883,
|
||
|
|
"train_speed(iter/s)": 0.089254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.08585533376261,
|
||
|
|
"grad_norm": 0.1201663538813591,
|
||
|
|
"learning_rate": 4.334036043265909e-05,
|
||
|
|
"loss": 0.366811728477478,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4860,
|
||
|
|
"token_acc": 0.868435326772985,
|
||
|
|
"train_speed(iter/s)": 0.089256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.094440867138871,
|
||
|
|
"grad_norm": 0.10783121734857559,
|
||
|
|
"learning_rate": 4.3281208889462715e-05,
|
||
|
|
"loss": 0.3673741102218628,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4880,
|
||
|
|
"token_acc": 0.8597632948845746,
|
||
|
|
"train_speed(iter/s)": 0.089257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.103026400515132,
|
||
|
|
"grad_norm": 0.10495728254318237,
|
||
|
|
"learning_rate": 4.3221836562387754e-05,
|
||
|
|
"loss": 0.371392560005188,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4900,
|
||
|
|
"token_acc": 0.8613381730879158,
|
||
|
|
"train_speed(iter/s)": 0.08926
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.111611933891393,
|
||
|
|
"grad_norm": 0.12534630298614502,
|
||
|
|
"learning_rate": 4.3162244168479385e-05,
|
||
|
|
"loss": 0.37217743396759034,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4920,
|
||
|
|
"token_acc": 0.8550305751583707,
|
||
|
|
"train_speed(iter/s)": 0.089265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.120197467267654,
|
||
|
|
"grad_norm": 0.11463397741317749,
|
||
|
|
"learning_rate": 4.310243242744055e-05,
|
||
|
|
"loss": 0.37210404872894287,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4940,
|
||
|
|
"token_acc": 0.8669802804648793,
|
||
|
|
"train_speed(iter/s)": 0.089267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.128783000643915,
|
||
|
|
"grad_norm": 0.0987405851483345,
|
||
|
|
"learning_rate": 4.304240206162326e-05,
|
||
|
|
"loss": 0.36531455516815187,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4960,
|
||
|
|
"token_acc": 0.8615956192835081,
|
||
|
|
"train_speed(iter/s)": 0.089271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.137368534020176,
|
||
|
|
"grad_norm": 0.10236576199531555,
|
||
|
|
"learning_rate": 4.2982153796019895e-05,
|
||
|
|
"loss": 0.3683722734451294,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 4980,
|
||
|
|
"token_acc": 0.8691021414446882,
|
||
|
|
"train_speed(iter/s)": 0.089273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.145954067396437,
|
||
|
|
"grad_norm": 0.11913823336362839,
|
||
|
|
"learning_rate": 4.292168835825442e-05,
|
||
|
|
"loss": 0.36794998645782473,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5000,
|
||
|
|
"token_acc": 0.8603812367895441,
|
||
|
|
"train_speed(iter/s)": 0.089274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.145954067396437,
|
||
|
|
"eval_loss": 0.44772276282310486,
|
||
|
|
"eval_runtime": 74.9501,
|
||
|
|
"eval_samples_per_second": 50.22,
|
||
|
|
"eval_steps_per_second": 0.64,
|
||
|
|
"eval_token_acc": 0.8396186479726367,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.154539600772698,
|
||
|
|
"grad_norm": 0.10815497487783432,
|
||
|
|
"learning_rate": 4.286100647857362e-05,
|
||
|
|
"loss": 0.3666555881500244,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5020,
|
||
|
|
"token_acc": 0.8487304373111233,
|
||
|
|
"train_speed(iter/s)": 0.089094
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.163125134148959,
|
||
|
|
"grad_norm": 0.10708407312631607,
|
||
|
|
"learning_rate": 4.2800108889838244e-05,
|
||
|
|
"loss": 0.3680349111557007,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5040,
|
||
|
|
"token_acc": 0.8607205605794815,
|
||
|
|
"train_speed(iter/s)": 0.089078
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.17171066752522,
|
||
|
|
"grad_norm": 0.10280643403530121,
|
||
|
|
"learning_rate": 4.273899632751422e-05,
|
||
|
|
"loss": 0.3690458297729492,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5060,
|
||
|
|
"token_acc": 0.8681282741623693,
|
||
|
|
"train_speed(iter/s)": 0.089068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.180296200901481,
|
||
|
|
"grad_norm": 0.11067724972963333,
|
||
|
|
"learning_rate": 4.267766952966369e-05,
|
||
|
|
"loss": 0.37246291637420653,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5080,
|
||
|
|
"token_acc": 0.8648300486787626,
|
||
|
|
"train_speed(iter/s)": 0.089062
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.188881734277742,
|
||
|
|
"grad_norm": 0.10517250746488571,
|
||
|
|
"learning_rate": 4.261612923693617e-05,
|
||
|
|
"loss": 0.37222487926483155,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5100,
|
||
|
|
"token_acc": 0.8561770562371953,
|
||
|
|
"train_speed(iter/s)": 0.089058
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.197467267654003,
|
||
|
|
"grad_norm": 0.11643174290657043,
|
||
|
|
"learning_rate": 4.255437619255955e-05,
|
||
|
|
"loss": 0.37151226997375486,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5120,
|
||
|
|
"token_acc": 0.856546833515401,
|
||
|
|
"train_speed(iter/s)": 0.089056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.206052801030264,
|
||
|
|
"grad_norm": 0.10725266486406326,
|
||
|
|
"learning_rate": 4.2492411142331164e-05,
|
||
|
|
"loss": 0.3672873258590698,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5140,
|
||
|
|
"token_acc": 0.8657454419748819,
|
||
|
|
"train_speed(iter/s)": 0.089055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.214638334406525,
|
||
|
|
"grad_norm": 0.10386510193347931,
|
||
|
|
"learning_rate": 4.243023483460875e-05,
|
||
|
|
"loss": 0.3682314395904541,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5160,
|
||
|
|
"token_acc": 0.8692801593001643,
|
||
|
|
"train_speed(iter/s)": 0.089056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.223223867782786,
|
||
|
|
"grad_norm": 0.11796915531158447,
|
||
|
|
"learning_rate": 4.236784802030141e-05,
|
||
|
|
"loss": 0.3701756000518799,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5180,
|
||
|
|
"token_acc": 0.8771635645482831,
|
||
|
|
"train_speed(iter/s)": 0.089057
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.231809401159047,
|
||
|
|
"grad_norm": 0.10015714913606644,
|
||
|
|
"learning_rate": 4.230525145286057e-05,
|
||
|
|
"loss": 0.36999518871307374,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5200,
|
||
|
|
"token_acc": 0.8674851697347774,
|
||
|
|
"train_speed(iter/s)": 0.089057
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.240394934535308,
|
||
|
|
"grad_norm": 0.1074676439166069,
|
||
|
|
"learning_rate": 4.224244588827088e-05,
|
||
|
|
"loss": 0.3750225782394409,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5220,
|
||
|
|
"token_acc": 0.8527425346133436,
|
||
|
|
"train_speed(iter/s)": 0.089057
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.248980467911569,
|
||
|
|
"grad_norm": 0.10311347991228104,
|
||
|
|
"learning_rate": 4.2179432085041016e-05,
|
||
|
|
"loss": 0.3746063232421875,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5240,
|
||
|
|
"token_acc": 0.8669185952544043,
|
||
|
|
"train_speed(iter/s)": 0.089056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.25756600128783,
|
||
|
|
"grad_norm": 0.11873036623001099,
|
||
|
|
"learning_rate": 4.211621080419463e-05,
|
||
|
|
"loss": 0.37813477516174315,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5260,
|
||
|
|
"token_acc": 0.8692531193982356,
|
||
|
|
"train_speed(iter/s)": 0.089056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.266151534664091,
|
||
|
|
"grad_norm": 0.11505374312400818,
|
||
|
|
"learning_rate": 4.205278280926106e-05,
|
||
|
|
"loss": 0.37494683265686035,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5280,
|
||
|
|
"token_acc": 0.8686222108977568,
|
||
|
|
"train_speed(iter/s)": 0.089057
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.274737068040352,
|
||
|
|
"grad_norm": 0.10475321859121323,
|
||
|
|
"learning_rate": 4.198914886626617e-05,
|
||
|
|
"loss": 0.37322399616241453,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5300,
|
||
|
|
"token_acc": 0.8642545858709445,
|
||
|
|
"train_speed(iter/s)": 0.089058
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.283322601416613,
|
||
|
|
"grad_norm": 0.10895238816738129,
|
||
|
|
"learning_rate": 4.192530974372307e-05,
|
||
|
|
"loss": 0.37212719917297366,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5320,
|
||
|
|
"token_acc": 0.8592036985069942,
|
||
|
|
"train_speed(iter/s)": 0.089059
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.291908134792874,
|
||
|
|
"grad_norm": 0.13440454006195068,
|
||
|
|
"learning_rate": 4.186126621262286e-05,
|
||
|
|
"loss": 0.3748520612716675,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5340,
|
||
|
|
"token_acc": 0.8694009430316147,
|
||
|
|
"train_speed(iter/s)": 0.089059
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.300493668169135,
|
||
|
|
"grad_norm": 0.10428149253129959,
|
||
|
|
"learning_rate": 4.1797019046425264e-05,
|
||
|
|
"loss": 0.3729527711868286,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5360,
|
||
|
|
"token_acc": 0.8606326299971436,
|
||
|
|
"train_speed(iter/s)": 0.089059
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.309079201545396,
|
||
|
|
"grad_norm": 0.10109774023294449,
|
||
|
|
"learning_rate": 4.173256902104937e-05,
|
||
|
|
"loss": 0.3786268949508667,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5380,
|
||
|
|
"token_acc": 0.8546159979614149,
|
||
|
|
"train_speed(iter/s)": 0.089063
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.317664734921657,
|
||
|
|
"grad_norm": 0.1086476594209671,
|
||
|
|
"learning_rate": 4.166791691486417e-05,
|
||
|
|
"loss": 0.37719101905822755,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5400,
|
||
|
|
"token_acc": 0.8614693814596865,
|
||
|
|
"train_speed(iter/s)": 0.089065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.326250268297918,
|
||
|
|
"grad_norm": 0.0986161157488823,
|
||
|
|
"learning_rate": 4.1603063508679254e-05,
|
||
|
|
"loss": 0.3716520071029663,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5420,
|
||
|
|
"token_acc": 0.8700038391325128,
|
||
|
|
"train_speed(iter/s)": 0.089068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.334835801674179,
|
||
|
|
"grad_norm": 0.10710026323795319,
|
||
|
|
"learning_rate": 4.1538009585735296e-05,
|
||
|
|
"loss": 0.37460925579071047,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5440,
|
||
|
|
"token_acc": 0.864236101862486,
|
||
|
|
"train_speed(iter/s)": 0.089068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.34342133505044,
|
||
|
|
"grad_norm": 0.1084044948220253,
|
||
|
|
"learning_rate": 4.1472755931694626e-05,
|
||
|
|
"loss": 0.37008664608001707,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5460,
|
||
|
|
"token_acc": 0.884960342611309,
|
||
|
|
"train_speed(iter/s)": 0.08907
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.352006868426701,
|
||
|
|
"grad_norm": 0.1017412543296814,
|
||
|
|
"learning_rate": 4.1407303334631784e-05,
|
||
|
|
"loss": 0.37591137886047366,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5480,
|
||
|
|
"token_acc": 0.8690706806478822,
|
||
|
|
"train_speed(iter/s)": 0.089076
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.360592401802962,
|
||
|
|
"grad_norm": 0.09642521291971207,
|
||
|
|
"learning_rate": 4.134165258502392e-05,
|
||
|
|
"loss": 0.3724454641342163,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5500,
|
||
|
|
"token_acc": 0.8665798727743621,
|
||
|
|
"train_speed(iter/s)": 0.089078
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.369177935179223,
|
||
|
|
"grad_norm": 0.10195600241422653,
|
||
|
|
"learning_rate": 4.127580447574131e-05,
|
||
|
|
"loss": 0.37389321327209474,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5520,
|
||
|
|
"token_acc": 0.8659414758069467,
|
||
|
|
"train_speed(iter/s)": 0.089082
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.377763468555484,
|
||
|
|
"grad_norm": 0.10220296680927277,
|
||
|
|
"learning_rate": 4.120975980203778e-05,
|
||
|
|
"loss": 0.37123832702636717,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5540,
|
||
|
|
"token_acc": 0.8599722163416741,
|
||
|
|
"train_speed(iter/s)": 0.089087
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.386349001931745,
|
||
|
|
"grad_norm": 0.09796139597892761,
|
||
|
|
"learning_rate": 4.114351936154105e-05,
|
||
|
|
"loss": 0.37191407680511473,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5560,
|
||
|
|
"token_acc": 0.8607424388032349,
|
||
|
|
"train_speed(iter/s)": 0.089089
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.394934535308006,
|
||
|
|
"grad_norm": 0.1000937819480896,
|
||
|
|
"learning_rate": 4.1077083954243134e-05,
|
||
|
|
"loss": 0.3728537082672119,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5580,
|
||
|
|
"token_acc": 0.8599175456576579,
|
||
|
|
"train_speed(iter/s)": 0.089092
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.403520068684267,
|
||
|
|
"grad_norm": 0.10470744967460632,
|
||
|
|
"learning_rate": 4.101045438249072e-05,
|
||
|
|
"loss": 0.3749739170074463,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5600,
|
||
|
|
"token_acc": 0.8662434580620245,
|
||
|
|
"train_speed(iter/s)": 0.089094
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.412105602060528,
|
||
|
|
"grad_norm": 0.10006117075681686,
|
||
|
|
"learning_rate": 4.0943631450975395e-05,
|
||
|
|
"loss": 0.3695227146148682,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5620,
|
||
|
|
"token_acc": 0.8567862235957147,
|
||
|
|
"train_speed(iter/s)": 0.089097
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.420691135436789,
|
||
|
|
"grad_norm": 0.11233365535736084,
|
||
|
|
"learning_rate": 4.0876615966723983e-05,
|
||
|
|
"loss": 0.37129299640655516,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5640,
|
||
|
|
"token_acc": 0.8725153838730988,
|
||
|
|
"train_speed(iter/s)": 0.089101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.42927666881305,
|
||
|
|
"grad_norm": 0.09630627185106277,
|
||
|
|
"learning_rate": 4.080940873908881e-05,
|
||
|
|
"loss": 0.3767483472824097,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5660,
|
||
|
|
"token_acc": 0.8623814759151552,
|
||
|
|
"train_speed(iter/s)": 0.089105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.437862202189311,
|
||
|
|
"grad_norm": 0.11699684709310532,
|
||
|
|
"learning_rate": 4.0742010579737855e-05,
|
||
|
|
"loss": 0.37203705310821533,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5680,
|
||
|
|
"token_acc": 0.8617447464487988,
|
||
|
|
"train_speed(iter/s)": 0.089104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.446447735565572,
|
||
|
|
"grad_norm": 0.10771006345748901,
|
||
|
|
"learning_rate": 4.067442230264503e-05,
|
||
|
|
"loss": 0.3795736312866211,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5700,
|
||
|
|
"token_acc": 0.8621945679332835,
|
||
|
|
"train_speed(iter/s)": 0.089107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.455033268941833,
|
||
|
|
"grad_norm": 0.10978804528713226,
|
||
|
|
"learning_rate": 4.0606644724080334e-05,
|
||
|
|
"loss": 0.37045629024505616,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5720,
|
||
|
|
"token_acc": 0.8683952247812166,
|
||
|
|
"train_speed(iter/s)": 0.089111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.463618802318094,
|
||
|
|
"grad_norm": 0.11052682995796204,
|
||
|
|
"learning_rate": 4.053867866259994e-05,
|
||
|
|
"loss": 0.37306039333343505,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5740,
|
||
|
|
"token_acc": 0.8691631145068139,
|
||
|
|
"train_speed(iter/s)": 0.089115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.472204335694355,
|
||
|
|
"grad_norm": 0.09953057020902634,
|
||
|
|
"learning_rate": 4.0470524939036355e-05,
|
||
|
|
"loss": 0.37361931800842285,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5760,
|
||
|
|
"token_acc": 0.8691507758784558,
|
||
|
|
"train_speed(iter/s)": 0.089118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.480789869070616,
|
||
|
|
"grad_norm": 0.11204252392053604,
|
||
|
|
"learning_rate": 4.0402184376488514e-05,
|
||
|
|
"loss": 0.37611095905303954,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5780,
|
||
|
|
"token_acc": 0.8522185815081345,
|
||
|
|
"train_speed(iter/s)": 0.089118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.489375402446877,
|
||
|
|
"grad_norm": 0.09915061295032501,
|
||
|
|
"learning_rate": 4.033365780031183e-05,
|
||
|
|
"loss": 0.37398972511291506,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5800,
|
||
|
|
"token_acc": 0.8830448305013368,
|
||
|
|
"train_speed(iter/s)": 0.089119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.497960935823138,
|
||
|
|
"grad_norm": 0.10546642541885376,
|
||
|
|
"learning_rate": 4.026494603810819e-05,
|
||
|
|
"loss": 0.3730853796005249,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5820,
|
||
|
|
"token_acc": 0.8712887390375224,
|
||
|
|
"train_speed(iter/s)": 0.089122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.506546469199399,
|
||
|
|
"grad_norm": 0.10121016949415207,
|
||
|
|
"learning_rate": 4.0196049919716004e-05,
|
||
|
|
"loss": 0.3762380361557007,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5840,
|
||
|
|
"token_acc": 0.8579485282281408,
|
||
|
|
"train_speed(iter/s)": 0.089126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.51513200257566,
|
||
|
|
"grad_norm": 0.103721484541893,
|
||
|
|
"learning_rate": 4.012697027720018e-05,
|
||
|
|
"loss": 0.36703407764434814,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5860,
|
||
|
|
"token_acc": 0.8760437267344359,
|
||
|
|
"train_speed(iter/s)": 0.089129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.523717535951921,
|
||
|
|
"grad_norm": 0.10886813700199127,
|
||
|
|
"learning_rate": 4.005770794484206e-05,
|
||
|
|
"loss": 0.3760274648666382,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5880,
|
||
|
|
"token_acc": 0.86771377124094,
|
||
|
|
"train_speed(iter/s)": 0.089132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.532303069328182,
|
||
|
|
"grad_norm": 0.10048224776983261,
|
||
|
|
"learning_rate": 3.998826375912934e-05,
|
||
|
|
"loss": 0.3727203369140625,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5900,
|
||
|
|
"token_acc": 0.8678732978111068,
|
||
|
|
"train_speed(iter/s)": 0.089136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.540888602704443,
|
||
|
|
"grad_norm": 0.11523660272359848,
|
||
|
|
"learning_rate": 3.9918638558745966e-05,
|
||
|
|
"loss": 0.3741061449050903,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5920,
|
||
|
|
"token_acc": 0.8660318303612676,
|
||
|
|
"train_speed(iter/s)": 0.089136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.549474136080704,
|
||
|
|
"grad_norm": 0.11144141107797623,
|
||
|
|
"learning_rate": 3.9848833184562056e-05,
|
||
|
|
"loss": 0.3695514440536499,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5940,
|
||
|
|
"token_acc": 0.8587312382845311,
|
||
|
|
"train_speed(iter/s)": 0.089141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.558059669456965,
|
||
|
|
"grad_norm": 0.10469717532396317,
|
||
|
|
"learning_rate": 3.9778848479623656e-05,
|
||
|
|
"loss": 0.3754448413848877,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5960,
|
||
|
|
"token_acc": 0.856428029145263,
|
||
|
|
"train_speed(iter/s)": 0.089145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.566645202833226,
|
||
|
|
"grad_norm": 0.09304027259349823,
|
||
|
|
"learning_rate": 3.970868528914264e-05,
|
||
|
|
"loss": 0.3713753938674927,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 5980,
|
||
|
|
"token_acc": 0.8559786330639638,
|
||
|
|
"train_speed(iter/s)": 0.089145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.575230736209487,
|
||
|
|
"grad_norm": 0.10925323516130447,
|
||
|
|
"learning_rate": 3.963834446048644e-05,
|
||
|
|
"loss": 0.3693029165267944,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6000,
|
||
|
|
"token_acc": 0.8629965592743197,
|
||
|
|
"train_speed(iter/s)": 0.089148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.575230736209487,
|
||
|
|
"eval_loss": 0.4436081647872925,
|
||
|
|
"eval_runtime": 70.1538,
|
||
|
|
"eval_samples_per_second": 53.654,
|
||
|
|
"eval_steps_per_second": 0.684,
|
||
|
|
"eval_token_acc": 0.84051665623243,
|
||
|
|
"step": 6000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.583816269585748,
|
||
|
|
"grad_norm": 0.1098393052816391,
|
||
|
|
"learning_rate": 3.956782684316788e-05,
|
||
|
|
"loss": 0.37103126049041746,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6020,
|
||
|
|
"token_acc": 0.8471303763965553,
|
||
|
|
"train_speed(iter/s)": 0.089007
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.592401802962009,
|
||
|
|
"grad_norm": 0.10387935489416122,
|
||
|
|
"learning_rate": 3.949713328883483e-05,
|
||
|
|
"loss": 0.36882970333099363,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6040,
|
||
|
|
"token_acc": 0.8459705942755437,
|
||
|
|
"train_speed(iter/s)": 0.088998
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.60098733633827,
|
||
|
|
"grad_norm": 0.10209009051322937,
|
||
|
|
"learning_rate": 3.942626465126001e-05,
|
||
|
|
"loss": 0.36882977485656737,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6060,
|
||
|
|
"token_acc": 0.8655902503061222,
|
||
|
|
"train_speed(iter/s)": 0.088991
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.609572869714531,
|
||
|
|
"grad_norm": 0.10415869951248169,
|
||
|
|
"learning_rate": 3.935522178633062e-05,
|
||
|
|
"loss": 0.3759881258010864,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6080,
|
||
|
|
"token_acc": 0.8581398082906834,
|
||
|
|
"train_speed(iter/s)": 0.08899
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.618158403090792,
|
||
|
|
"grad_norm": 0.11114171892404556,
|
||
|
|
"learning_rate": 3.928400555203801e-05,
|
||
|
|
"loss": 0.37210090160369874,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6100,
|
||
|
|
"token_acc": 0.8736639992402806,
|
||
|
|
"train_speed(iter/s)": 0.088988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.626743936467053,
|
||
|
|
"grad_norm": 0.10994569212198257,
|
||
|
|
"learning_rate": 3.921261680846734e-05,
|
||
|
|
"loss": 0.3746177673339844,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6120,
|
||
|
|
"token_acc": 0.8693309992064365,
|
||
|
|
"train_speed(iter/s)": 0.088985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.635329469843314,
|
||
|
|
"grad_norm": 0.096384197473526,
|
||
|
|
"learning_rate": 3.914105641778718e-05,
|
||
|
|
"loss": 0.3694021701812744,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6140,
|
||
|
|
"token_acc": 0.8684860314899538,
|
||
|
|
"train_speed(iter/s)": 0.088985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.643915003219575,
|
||
|
|
"grad_norm": 0.10146961361169815,
|
||
|
|
"learning_rate": 3.9069325244239095e-05,
|
||
|
|
"loss": 0.36874828338623045,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6160,
|
||
|
|
"token_acc": 0.8668048776320361,
|
||
|
|
"train_speed(iter/s)": 0.08898
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.652500536595836,
|
||
|
|
"grad_norm": 0.0965966135263443,
|
||
|
|
"learning_rate": 3.899742415412722e-05,
|
||
|
|
"loss": 0.36864802837371824,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6180,
|
||
|
|
"token_acc": 0.8755646100841697,
|
||
|
|
"train_speed(iter/s)": 0.088979
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.661086069972097,
|
||
|
|
"grad_norm": 0.09827576577663422,
|
||
|
|
"learning_rate": 3.892535401580776e-05,
|
||
|
|
"loss": 0.36760308742523196,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6200,
|
||
|
|
"token_acc": 0.8648272017837235,
|
||
|
|
"train_speed(iter/s)": 0.088982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.669671603348358,
|
||
|
|
"grad_norm": 0.09901771694421768,
|
||
|
|
"learning_rate": 3.885311569967858e-05,
|
||
|
|
"loss": 0.37281830310821534,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6220,
|
||
|
|
"token_acc": 0.8820075603884335,
|
||
|
|
"train_speed(iter/s)": 0.088983
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.678257136724619,
|
||
|
|
"grad_norm": 0.1107199490070343,
|
||
|
|
"learning_rate": 3.878071007816859e-05,
|
||
|
|
"loss": 0.37139651775360105,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6240,
|
||
|
|
"token_acc": 0.8544989601044564,
|
||
|
|
"train_speed(iter/s)": 0.088985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.68684267010088,
|
||
|
|
"grad_norm": 0.10703787952661514,
|
||
|
|
"learning_rate": 3.87081380257273e-05,
|
||
|
|
"loss": 0.3727452039718628,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6260,
|
||
|
|
"token_acc": 0.8577885712183096,
|
||
|
|
"train_speed(iter/s)": 0.088987
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.695428203477141,
|
||
|
|
"grad_norm": 0.10672149062156677,
|
||
|
|
"learning_rate": 3.8635400418814214e-05,
|
||
|
|
"loss": 0.36861019134521483,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6280,
|
||
|
|
"token_acc": 0.8560803640097792,
|
||
|
|
"train_speed(iter/s)": 0.088987
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.704013736853402,
|
||
|
|
"grad_norm": 0.09396067261695862,
|
||
|
|
"learning_rate": 3.856249813588824e-05,
|
||
|
|
"loss": 0.36811778545379636,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6300,
|
||
|
|
"token_acc": 0.868624502647731,
|
||
|
|
"train_speed(iter/s)": 0.088988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.712599270229663,
|
||
|
|
"grad_norm": 0.1063656210899353,
|
||
|
|
"learning_rate": 3.848943205739711e-05,
|
||
|
|
"loss": 0.369048547744751,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6320,
|
||
|
|
"token_acc": 0.8519738843659109,
|
||
|
|
"train_speed(iter/s)": 0.088991
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.721184803605924,
|
||
|
|
"grad_norm": 0.10474120825529099,
|
||
|
|
"learning_rate": 3.841620306576673e-05,
|
||
|
|
"loss": 0.3731086730957031,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6340,
|
||
|
|
"token_acc": 0.8653390159502418,
|
||
|
|
"train_speed(iter/s)": 0.088991
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.729770336982185,
|
||
|
|
"grad_norm": 0.10354544967412949,
|
||
|
|
"learning_rate": 3.834281204539051e-05,
|
||
|
|
"loss": 0.37295677661895754,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6360,
|
||
|
|
"token_acc": 0.8547169188263978,
|
||
|
|
"train_speed(iter/s)": 0.088993
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.738355870358446,
|
||
|
|
"grad_norm": 0.10440333932638168,
|
||
|
|
"learning_rate": 3.82692598826187e-05,
|
||
|
|
"loss": 0.3712725877761841,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6380,
|
||
|
|
"token_acc": 0.8820346020559459,
|
||
|
|
"train_speed(iter/s)": 0.088996
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.746941403734707,
|
||
|
|
"grad_norm": 0.09520816057920456,
|
||
|
|
"learning_rate": 3.8195547465747685e-05,
|
||
|
|
"loss": 0.3697003602981567,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6400,
|
||
|
|
"token_acc": 0.8595229803723984,
|
||
|
|
"train_speed(iter/s)": 0.088997
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.755526937110968,
|
||
|
|
"grad_norm": 0.09628502279520035,
|
||
|
|
"learning_rate": 3.812167568500927e-05,
|
||
|
|
"loss": 0.3673550128936768,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6420,
|
||
|
|
"token_acc": 0.8689877572158957,
|
||
|
|
"train_speed(iter/s)": 0.089
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.764112470487229,
|
||
|
|
"grad_norm": 0.09505701065063477,
|
||
|
|
"learning_rate": 3.804764543255987e-05,
|
||
|
|
"loss": 0.36903977394104004,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6440,
|
||
|
|
"token_acc": 0.8750163024104382,
|
||
|
|
"train_speed(iter/s)": 0.089003
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.77269800386349,
|
||
|
|
"grad_norm": 0.092622309923172,
|
||
|
|
"learning_rate": 3.797345760246982e-05,
|
||
|
|
"loss": 0.3679107666015625,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6460,
|
||
|
|
"token_acc": 0.8469855548827956,
|
||
|
|
"train_speed(iter/s)": 0.089006
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.781283537239751,
|
||
|
|
"grad_norm": 0.10573814809322357,
|
||
|
|
"learning_rate": 3.7899113090712526e-05,
|
||
|
|
"loss": 0.3690340042114258,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6480,
|
||
|
|
"token_acc": 0.8704448664825046,
|
||
|
|
"train_speed(iter/s)": 0.089008
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.789869070616012,
|
||
|
|
"grad_norm": 0.1018662378191948,
|
||
|
|
"learning_rate": 3.782461279515363e-05,
|
||
|
|
"loss": 0.3682270050048828,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6500,
|
||
|
|
"token_acc": 0.8687270373931054,
|
||
|
|
"train_speed(iter/s)": 0.089011
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.798454603992273,
|
||
|
|
"grad_norm": 0.09893783926963806,
|
||
|
|
"learning_rate": 3.7749957615540224e-05,
|
||
|
|
"loss": 0.371025824546814,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6520,
|
||
|
|
"token_acc": 0.8599820821280371,
|
||
|
|
"train_speed(iter/s)": 0.089014
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.807040137368534,
|
||
|
|
"grad_norm": 0.1044677123427391,
|
||
|
|
"learning_rate": 3.767514845348992e-05,
|
||
|
|
"loss": 0.37092270851135256,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6540,
|
||
|
|
"token_acc": 0.860977069485444,
|
||
|
|
"train_speed(iter/s)": 0.089016
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.815625670744795,
|
||
|
|
"grad_norm": 0.10815873742103577,
|
||
|
|
"learning_rate": 3.760018621248e-05,
|
||
|
|
"loss": 0.36874244213104246,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6560,
|
||
|
|
"token_acc": 0.8586234130381737,
|
||
|
|
"train_speed(iter/s)": 0.089019
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.824211204121056,
|
||
|
|
"grad_norm": 0.08873378485441208,
|
||
|
|
"learning_rate": 3.75250717978365e-05,
|
||
|
|
"loss": 0.36833083629608154,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6580,
|
||
|
|
"token_acc": 0.8633419814445173,
|
||
|
|
"train_speed(iter/s)": 0.089022
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.832796737497317,
|
||
|
|
"grad_norm": 0.09121917188167572,
|
||
|
|
"learning_rate": 3.7449806116723266e-05,
|
||
|
|
"loss": 0.3694983720779419,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6600,
|
||
|
|
"token_acc": 0.8697197272952701,
|
||
|
|
"train_speed(iter/s)": 0.089026
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.841382270873578,
|
||
|
|
"grad_norm": 0.09253229945898056,
|
||
|
|
"learning_rate": 3.7374390078131015e-05,
|
||
|
|
"loss": 0.37108821868896485,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6620,
|
||
|
|
"token_acc": 0.8706145844516814,
|
||
|
|
"train_speed(iter/s)": 0.089031
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.849967804249839,
|
||
|
|
"grad_norm": 0.09768302738666534,
|
||
|
|
"learning_rate": 3.729882459286632e-05,
|
||
|
|
"loss": 0.3706928253173828,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6640,
|
||
|
|
"token_acc": 0.8605562350922205,
|
||
|
|
"train_speed(iter/s)": 0.089033
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8585533376261,
|
||
|
|
"grad_norm": 0.09809901565313339,
|
||
|
|
"learning_rate": 3.722311057354067e-05,
|
||
|
|
"loss": 0.3715434312820435,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6660,
|
||
|
|
"token_acc": 0.8687115200037456,
|
||
|
|
"train_speed(iter/s)": 0.089037
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.867138871002361,
|
||
|
|
"grad_norm": 0.10311082750558853,
|
||
|
|
"learning_rate": 3.714724893455938e-05,
|
||
|
|
"loss": 0.3686758756637573,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6680,
|
||
|
|
"token_acc": 0.8536554098061117,
|
||
|
|
"train_speed(iter/s)": 0.089035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.875724404378622,
|
||
|
|
"grad_norm": 0.0951702892780304,
|
||
|
|
"learning_rate": 3.7071240592110604e-05,
|
||
|
|
"loss": 0.37487409114837644,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6700,
|
||
|
|
"token_acc": 0.8619494831493575,
|
||
|
|
"train_speed(iter/s)": 0.089039
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.884309937754883,
|
||
|
|
"grad_norm": 0.10398156195878983,
|
||
|
|
"learning_rate": 3.699508646415424e-05,
|
||
|
|
"loss": 0.3755856275558472,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6720,
|
||
|
|
"token_acc": 0.8719096505699291,
|
||
|
|
"train_speed(iter/s)": 0.089043
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.892895471131144,
|
||
|
|
"grad_norm": 0.09801426529884338,
|
||
|
|
"learning_rate": 3.691878747041084e-05,
|
||
|
|
"loss": 0.36969609260559083,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6740,
|
||
|
|
"token_acc": 0.8539101926900138,
|
||
|
|
"train_speed(iter/s)": 0.089046
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.901481004507405,
|
||
|
|
"grad_norm": 0.10008656978607178,
|
||
|
|
"learning_rate": 3.684234453235054e-05,
|
||
|
|
"loss": 0.3719330310821533,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6760,
|
||
|
|
"token_acc": 0.8648975749697432,
|
||
|
|
"train_speed(iter/s)": 0.08905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.910066537883666,
|
||
|
|
"grad_norm": 0.12179595977067947,
|
||
|
|
"learning_rate": 3.676575857318189e-05,
|
||
|
|
"loss": 0.37140851020812987,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6780,
|
||
|
|
"token_acc": 0.8577137651213464,
|
||
|
|
"train_speed(iter/s)": 0.089052
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.918652071259927,
|
||
|
|
"grad_norm": 0.09753546863794327,
|
||
|
|
"learning_rate": 3.66890305178407e-05,
|
||
|
|
"loss": 0.3708536624908447,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6800,
|
||
|
|
"token_acc": 0.8671116019269858,
|
||
|
|
"train_speed(iter/s)": 0.089056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.927237604636188,
|
||
|
|
"grad_norm": 0.09348613768815994,
|
||
|
|
"learning_rate": 3.661216129297894e-05,
|
||
|
|
"loss": 0.3709095001220703,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6820,
|
||
|
|
"token_acc": 0.8573097173193669,
|
||
|
|
"train_speed(iter/s)": 0.089061
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.935823138012449,
|
||
|
|
"grad_norm": 0.0905463695526123,
|
||
|
|
"learning_rate": 3.653515182695344e-05,
|
||
|
|
"loss": 0.3767134189605713,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6840,
|
||
|
|
"token_acc": 0.8626287415238181,
|
||
|
|
"train_speed(iter/s)": 0.089065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.94440867138871,
|
||
|
|
"grad_norm": 0.10822242498397827,
|
||
|
|
"learning_rate": 3.645800304981477e-05,
|
||
|
|
"loss": 0.3709308385848999,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6860,
|
||
|
|
"token_acc": 0.8577446782413705,
|
||
|
|
"train_speed(iter/s)": 0.089069
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.952994204764971,
|
||
|
|
"grad_norm": 0.1089344173669815,
|
||
|
|
"learning_rate": 3.638071589329597e-05,
|
||
|
|
"loss": 0.3755086660385132,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6880,
|
||
|
|
"token_acc": 0.8570769973171867,
|
||
|
|
"train_speed(iter/s)": 0.089072
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.961579738141232,
|
||
|
|
"grad_norm": 0.10646732896566391,
|
||
|
|
"learning_rate": 3.630329129080129e-05,
|
||
|
|
"loss": 0.36852853298187255,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6900,
|
||
|
|
"token_acc": 0.8610651132070156,
|
||
|
|
"train_speed(iter/s)": 0.089077
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.970165271517493,
|
||
|
|
"grad_norm": 0.10016820579767227,
|
||
|
|
"learning_rate": 3.622573017739495e-05,
|
||
|
|
"loss": 0.37330124378204343,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6920,
|
||
|
|
"token_acc": 0.8775841748626209,
|
||
|
|
"train_speed(iter/s)": 0.08908
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.978750804893754,
|
||
|
|
"grad_norm": 0.1020449697971344,
|
||
|
|
"learning_rate": 3.6148033489789765e-05,
|
||
|
|
"loss": 0.3684419631958008,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6940,
|
||
|
|
"token_acc": 0.8642162515149019,
|
||
|
|
"train_speed(iter/s)": 0.089084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.987336338270015,
|
||
|
|
"grad_norm": 0.0974557027220726,
|
||
|
|
"learning_rate": 3.607020216633599e-05,
|
||
|
|
"loss": 0.37378945350646975,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6960,
|
||
|
|
"token_acc": 0.858156359329171,
|
||
|
|
"train_speed(iter/s)": 0.089087
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.995921871646276,
|
||
|
|
"grad_norm": 0.09330358356237411,
|
||
|
|
"learning_rate": 3.59922371470098e-05,
|
||
|
|
"loss": 0.36865170001983644,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 6980,
|
||
|
|
"token_acc": 0.8638886721914512,
|
||
|
|
"train_speed(iter/s)": 0.089091
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0042927666881303,
|
||
|
|
"grad_norm": 0.1193256601691246,
|
||
|
|
"learning_rate": 3.591413937340208e-05,
|
||
|
|
"loss": 0.3534395694732666,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7000,
|
||
|
|
"token_acc": 0.8802663670407237,
|
||
|
|
"train_speed(iter/s)": 0.089092
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0042927666881303,
|
||
|
|
"eval_loss": 0.4485101103782654,
|
||
|
|
"eval_runtime": 74.3969,
|
||
|
|
"eval_samples_per_second": 50.593,
|
||
|
|
"eval_steps_per_second": 0.645,
|
||
|
|
"eval_token_acc": 0.8402603254517357,
|
||
|
|
"step": 7000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0128783000643913,
|
||
|
|
"grad_norm": 0.1156892329454422,
|
||
|
|
"learning_rate": 3.583590978870699e-05,
|
||
|
|
"loss": 0.3319342851638794,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7020,
|
||
|
|
"token_acc": 0.8532470204427854,
|
||
|
|
"train_speed(iter/s)": 0.088961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0214638334406523,
|
||
|
|
"grad_norm": 0.10194379091262817,
|
||
|
|
"learning_rate": 3.5757549337710564e-05,
|
||
|
|
"loss": 0.33723247051239014,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7040,
|
||
|
|
"token_acc": 0.8831583445244781,
|
||
|
|
"train_speed(iter/s)": 0.088954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0300493668169133,
|
||
|
|
"grad_norm": 0.10132017731666565,
|
||
|
|
"learning_rate": 3.5679058966779344e-05,
|
||
|
|
"loss": 0.336438250541687,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7060,
|
||
|
|
"token_acc": 0.8769756994854098,
|
||
|
|
"train_speed(iter/s)": 0.08895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0386349001931743,
|
||
|
|
"grad_norm": 0.1068112775683403,
|
||
|
|
"learning_rate": 3.560043962384891e-05,
|
||
|
|
"loss": 0.3355576753616333,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7080,
|
||
|
|
"token_acc": 0.8759380793584041,
|
||
|
|
"train_speed(iter/s)": 0.088949
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0472204335694353,
|
||
|
|
"grad_norm": 0.10327329486608505,
|
||
|
|
"learning_rate": 3.552169225841248e-05,
|
||
|
|
"loss": 0.3344245195388794,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7100,
|
||
|
|
"token_acc": 0.8749370992739901,
|
||
|
|
"train_speed(iter/s)": 0.088945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0558059669456963,
|
||
|
|
"grad_norm": 0.10621868073940277,
|
||
|
|
"learning_rate": 3.544281782150936e-05,
|
||
|
|
"loss": 0.33667793273925783,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7120,
|
||
|
|
"token_acc": 0.8698413495330857,
|
||
|
|
"train_speed(iter/s)": 0.088946
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0643915003219577,
|
||
|
|
"grad_norm": 0.09647602587938309,
|
||
|
|
"learning_rate": 3.536381726571358e-05,
|
||
|
|
"loss": 0.33697144985198973,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7140,
|
||
|
|
"token_acc": 0.879177233267265,
|
||
|
|
"train_speed(iter/s)": 0.088946
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0729770336982183,
|
||
|
|
"grad_norm": 0.1008361279964447,
|
||
|
|
"learning_rate": 3.528469154512224e-05,
|
||
|
|
"loss": 0.3379324674606323,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7160,
|
||
|
|
"token_acc": 0.881303225060136,
|
||
|
|
"train_speed(iter/s)": 0.088946
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0815625670744797,
|
||
|
|
"grad_norm": 0.09905105084180832,
|
||
|
|
"learning_rate": 3.520544161534413e-05,
|
||
|
|
"loss": 0.33641412258148196,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7180,
|
||
|
|
"token_acc": 0.8765279938577173,
|
||
|
|
"train_speed(iter/s)": 0.088947
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0901481004507403,
|
||
|
|
"grad_norm": 0.09547468274831772,
|
||
|
|
"learning_rate": 3.51260684334881e-05,
|
||
|
|
"loss": 0.33444535732269287,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7200,
|
||
|
|
"token_acc": 0.8740168402536957,
|
||
|
|
"train_speed(iter/s)": 0.088949
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0987336338270013,
|
||
|
|
"grad_norm": 0.091608926653862,
|
||
|
|
"learning_rate": 3.504657295815153e-05,
|
||
|
|
"loss": 0.33458809852600097,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7220,
|
||
|
|
"token_acc": 0.8822041996574748,
|
||
|
|
"train_speed(iter/s)": 0.088951
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1073191672032623,
|
||
|
|
"grad_norm": 0.095795176923275,
|
||
|
|
"learning_rate": 3.496695614940875e-05,
|
||
|
|
"loss": 0.3341191053390503,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7240,
|
||
|
|
"token_acc": 0.8863122055178043,
|
||
|
|
"train_speed(iter/s)": 0.088952
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1159047005795233,
|
||
|
|
"grad_norm": 0.11027920246124268,
|
||
|
|
"learning_rate": 3.488721896879943e-05,
|
||
|
|
"loss": 0.3351098299026489,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7260,
|
||
|
|
"token_acc": 0.8802774242498409,
|
||
|
|
"train_speed(iter/s)": 0.088955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1244902339557843,
|
||
|
|
"grad_norm": 0.09548976272344589,
|
||
|
|
"learning_rate": 3.4807362379317025e-05,
|
||
|
|
"loss": 0.3381031513214111,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7280,
|
||
|
|
"token_acc": 0.8777537505068252,
|
||
|
|
"train_speed(iter/s)": 0.088954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1330757673320457,
|
||
|
|
"grad_norm": 0.1054491475224495,
|
||
|
|
"learning_rate": 3.472738734539706e-05,
|
||
|
|
"loss": 0.33547115325927734,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7300,
|
||
|
|
"token_acc": 0.8795795912347567,
|
||
|
|
"train_speed(iter/s)": 0.088956
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1416613007083063,
|
||
|
|
"grad_norm": 0.09988971799612045,
|
||
|
|
"learning_rate": 3.464729483290553e-05,
|
||
|
|
"loss": 0.3418281555175781,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7320,
|
||
|
|
"token_acc": 0.8629133179032032,
|
||
|
|
"train_speed(iter/s)": 0.088958
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1502468340845677,
|
||
|
|
"grad_norm": 0.09766259044408798,
|
||
|
|
"learning_rate": 3.456708580912725e-05,
|
||
|
|
"loss": 0.3392175674438477,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7340,
|
||
|
|
"token_acc": 0.8706737594562531,
|
||
|
|
"train_speed(iter/s)": 0.088951
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1588323674608283,
|
||
|
|
"grad_norm": 0.09341710805892944,
|
||
|
|
"learning_rate": 3.448676124275414e-05,
|
||
|
|
"loss": 0.3362084150314331,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7360,
|
||
|
|
"token_acc": 0.8706982003587074,
|
||
|
|
"train_speed(iter/s)": 0.088954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1674179008370897,
|
||
|
|
"grad_norm": 0.0969720259308815,
|
||
|
|
"learning_rate": 3.440632210387354e-05,
|
||
|
|
"loss": 0.3380004644393921,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7380,
|
||
|
|
"token_acc": 0.8738021476597112,
|
||
|
|
"train_speed(iter/s)": 0.088957
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1760034342133503,
|
||
|
|
"grad_norm": 0.09787522256374359,
|
||
|
|
"learning_rate": 3.432576936395648e-05,
|
||
|
|
"loss": 0.3357203245162964,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7400,
|
||
|
|
"token_acc": 0.8912336656741101,
|
||
|
|
"train_speed(iter/s)": 0.088961
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1845889675896113,
|
||
|
|
"grad_norm": 0.10224709659814835,
|
||
|
|
"learning_rate": 3.424510399584601e-05,
|
||
|
|
"loss": 0.33477025032043456,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7420,
|
||
|
|
"token_acc": 0.8561189105937783,
|
||
|
|
"train_speed(iter/s)": 0.088965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1931745009658723,
|
||
|
|
"grad_norm": 0.10669636726379395,
|
||
|
|
"learning_rate": 3.416432697374533e-05,
|
||
|
|
"loss": 0.33573341369628906,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7440,
|
||
|
|
"token_acc": 0.874112458982316,
|
||
|
|
"train_speed(iter/s)": 0.088968
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2017600343421333,
|
||
|
|
"grad_norm": 0.1014070212841034,
|
||
|
|
"learning_rate": 3.408343927320613e-05,
|
||
|
|
"loss": 0.3380695343017578,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7460,
|
||
|
|
"token_acc": 0.8848022091860703,
|
||
|
|
"train_speed(iter/s)": 0.088972
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2103455677183943,
|
||
|
|
"grad_norm": 0.09528549015522003,
|
||
|
|
"learning_rate": 3.40024418711168e-05,
|
||
|
|
"loss": 0.33952438831329346,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7480,
|
||
|
|
"token_acc": 0.8705726760778868,
|
||
|
|
"train_speed(iter/s)": 0.088975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2189311010946553,
|
||
|
|
"grad_norm": 0.10318120568990707,
|
||
|
|
"learning_rate": 3.392133574569057e-05,
|
||
|
|
"loss": 0.3406086444854736,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7500,
|
||
|
|
"token_acc": 0.8733639567077774,
|
||
|
|
"train_speed(iter/s)": 0.088978
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2275166344709163,
|
||
|
|
"grad_norm": 0.11275230348110199,
|
||
|
|
"learning_rate": 3.3840121876453734e-05,
|
||
|
|
"loss": 0.33986356258392336,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7520,
|
||
|
|
"token_acc": 0.8619126202517206,
|
||
|
|
"train_speed(iter/s)": 0.088978
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2361021678471773,
|
||
|
|
"grad_norm": 0.10118957608938217,
|
||
|
|
"learning_rate": 3.375880124423383e-05,
|
||
|
|
"loss": 0.3386232852935791,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7540,
|
||
|
|
"token_acc": 0.8710604646623604,
|
||
|
|
"train_speed(iter/s)": 0.088981
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2446877012234383,
|
||
|
|
"grad_norm": 0.10550114512443542,
|
||
|
|
"learning_rate": 3.367737483114779e-05,
|
||
|
|
"loss": 0.3421770572662354,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7560,
|
||
|
|
"token_acc": 0.8851797047121107,
|
||
|
|
"train_speed(iter/s)": 0.088985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2532732345996997,
|
||
|
|
"grad_norm": 0.1023048609495163,
|
||
|
|
"learning_rate": 3.359584362059004e-05,
|
||
|
|
"loss": 0.33796124458312987,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7580,
|
||
|
|
"token_acc": 0.8739776940178287,
|
||
|
|
"train_speed(iter/s)": 0.088985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2618587679759603,
|
||
|
|
"grad_norm": 0.09559116512537003,
|
||
|
|
"learning_rate": 3.3514208597220705e-05,
|
||
|
|
"loss": 0.3409781217575073,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7600,
|
||
|
|
"token_acc": 0.874609344576846,
|
||
|
|
"train_speed(iter/s)": 0.088989
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2704443013522217,
|
||
|
|
"grad_norm": 0.09580449014902115,
|
||
|
|
"learning_rate": 3.3432470746953606e-05,
|
||
|
|
"loss": 0.33773849010467527,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7620,
|
||
|
|
"token_acc": 0.8727284510693454,
|
||
|
|
"train_speed(iter/s)": 0.088993
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2790298347284823,
|
||
|
|
"grad_norm": 0.10818155109882355,
|
||
|
|
"learning_rate": 3.335063105694447e-05,
|
||
|
|
"loss": 0.3401022434234619,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7640,
|
||
|
|
"token_acc": 0.8764802837026362,
|
||
|
|
"train_speed(iter/s)": 0.088996
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2876153681047438,
|
||
|
|
"grad_norm": 0.10184460878372192,
|
||
|
|
"learning_rate": 3.326869051557891e-05,
|
||
|
|
"loss": 0.3434968709945679,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7660,
|
||
|
|
"token_acc": 0.8761705077978165,
|
||
|
|
"train_speed(iter/s)": 0.088919
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2962009014810043,
|
||
|
|
"grad_norm": 0.09505783021450043,
|
||
|
|
"learning_rate": 3.318665011246056e-05,
|
||
|
|
"loss": 0.3408296346664429,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7680,
|
||
|
|
"token_acc": 0.8661087384073535,
|
||
|
|
"train_speed(iter/s)": 0.088905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3047864348572658,
|
||
|
|
"grad_norm": 0.10040104389190674,
|
||
|
|
"learning_rate": 3.310451083839908e-05,
|
||
|
|
"loss": 0.3423358678817749,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7700,
|
||
|
|
"token_acc": 0.861539109557306,
|
||
|
|
"train_speed(iter/s)": 0.088907
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3133719682335263,
|
||
|
|
"grad_norm": 0.10616692155599594,
|
||
|
|
"learning_rate": 3.30222736853982e-05,
|
||
|
|
"loss": 0.34503300189971925,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7720,
|
||
|
|
"token_acc": 0.8724093642360908,
|
||
|
|
"train_speed(iter/s)": 0.08891
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3219575016097878,
|
||
|
|
"grad_norm": 0.10949140787124634,
|
||
|
|
"learning_rate": 3.293993964664376e-05,
|
||
|
|
"loss": 0.3432727098464966,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7740,
|
||
|
|
"token_acc": 0.8669582519497799,
|
||
|
|
"train_speed(iter/s)": 0.088914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3305430349860483,
|
||
|
|
"grad_norm": 0.09881085902452469,
|
||
|
|
"learning_rate": 3.285750971649167e-05,
|
||
|
|
"loss": 0.3427408695220947,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7760,
|
||
|
|
"token_acc": 0.8689034982030741,
|
||
|
|
"train_speed(iter/s)": 0.088917
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3391285683623098,
|
||
|
|
"grad_norm": 0.09140335768461227,
|
||
|
|
"learning_rate": 3.2774984890455976e-05,
|
||
|
|
"loss": 0.3475862979888916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7780,
|
||
|
|
"token_acc": 0.8685826593182928,
|
||
|
|
"train_speed(iter/s)": 0.088921
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3477141017385703,
|
||
|
|
"grad_norm": 0.1024077907204628,
|
||
|
|
"learning_rate": 3.2692366165196727e-05,
|
||
|
|
"loss": 0.3404365539550781,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7800,
|
||
|
|
"token_acc": 0.8840015739822477,
|
||
|
|
"train_speed(iter/s)": 0.088925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3562996351148318,
|
||
|
|
"grad_norm": 0.09467454254627228,
|
||
|
|
"learning_rate": 3.260965453850806e-05,
|
||
|
|
"loss": 0.34421525001525877,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7820,
|
||
|
|
"token_acc": 0.8758503166590742,
|
||
|
|
"train_speed(iter/s)": 0.088929
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3648851684910923,
|
||
|
|
"grad_norm": 0.10136840492486954,
|
||
|
|
"learning_rate": 3.252685100930605e-05,
|
||
|
|
"loss": 0.3386892795562744,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7840,
|
||
|
|
"token_acc": 0.85672288931185,
|
||
|
|
"train_speed(iter/s)": 0.088932
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3734707018673533,
|
||
|
|
"grad_norm": 0.09780098497867584,
|
||
|
|
"learning_rate": 3.244395657761671e-05,
|
||
|
|
"loss": 0.3428237199783325,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7860,
|
||
|
|
"token_acc": 0.868161995980711,
|
||
|
|
"train_speed(iter/s)": 0.088935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3820562352436143,
|
||
|
|
"grad_norm": 0.1032358855009079,
|
||
|
|
"learning_rate": 3.23609722445639e-05,
|
||
|
|
"loss": 0.3407264709472656,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7880,
|
||
|
|
"token_acc": 0.8630356105896284,
|
||
|
|
"train_speed(iter/s)": 0.088936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3906417686198753,
|
||
|
|
"grad_norm": 0.09920444339513779,
|
||
|
|
"learning_rate": 3.2277899012357196e-05,
|
||
|
|
"loss": 0.34147114753723146,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7900,
|
||
|
|
"token_acc": 0.8645183518911774,
|
||
|
|
"train_speed(iter/s)": 0.088941
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3992273019961363,
|
||
|
|
"grad_norm": 0.1050969585776329,
|
||
|
|
"learning_rate": 3.219473788427984e-05,
|
||
|
|
"loss": 0.3448856115341187,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7920,
|
||
|
|
"token_acc": 0.8714814655549509,
|
||
|
|
"train_speed(iter/s)": 0.088944
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4078128353723973,
|
||
|
|
"grad_norm": 0.10028455406427383,
|
||
|
|
"learning_rate": 3.211148986467659e-05,
|
||
|
|
"loss": 0.3422698974609375,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7940,
|
||
|
|
"token_acc": 0.8711220342714154,
|
||
|
|
"train_speed(iter/s)": 0.088948
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4163983687486583,
|
||
|
|
"grad_norm": 0.09475808590650558,
|
||
|
|
"learning_rate": 3.2028155958941615e-05,
|
||
|
|
"loss": 0.3451426029205322,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7960,
|
||
|
|
"token_acc": 0.8738779982122461,
|
||
|
|
"train_speed(iter/s)": 0.088952
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4249839021249193,
|
||
|
|
"grad_norm": 0.09882804751396179,
|
||
|
|
"learning_rate": 3.1944737173506324e-05,
|
||
|
|
"loss": 0.3444493532180786,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 7980,
|
||
|
|
"token_acc": 0.8827410911702268,
|
||
|
|
"train_speed(iter/s)": 0.088955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4335694355011803,
|
||
|
|
"grad_norm": 0.10227163881063461,
|
||
|
|
"learning_rate": 3.186123451582723e-05,
|
||
|
|
"loss": 0.339670729637146,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8000,
|
||
|
|
"token_acc": 0.8807350762593477,
|
||
|
|
"train_speed(iter/s)": 0.08896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4335694355011803,
|
||
|
|
"eval_loss": 0.44718989729881287,
|
||
|
|
"eval_runtime": 69.1311,
|
||
|
|
"eval_samples_per_second": 54.447,
|
||
|
|
"eval_steps_per_second": 0.694,
|
||
|
|
"eval_token_acc": 0.8404557155339724,
|
||
|
|
"step": 8000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4421549688774413,
|
||
|
|
"grad_norm": 0.0968368649482727,
|
||
|
|
"learning_rate": 3.177764899437378e-05,
|
||
|
|
"loss": 0.34265289306640623,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8020,
|
||
|
|
"token_acc": 0.854713276154318,
|
||
|
|
"train_speed(iter/s)": 0.088852
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4507405022537023,
|
||
|
|
"grad_norm": 0.09359851479530334,
|
||
|
|
"learning_rate": 3.169398161861618e-05,
|
||
|
|
"loss": 0.33971107006073,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8040,
|
||
|
|
"token_acc": 0.8740548416277094,
|
||
|
|
"train_speed(iter/s)": 0.08884
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4593260356299633,
|
||
|
|
"grad_norm": 0.09218861162662506,
|
||
|
|
"learning_rate": 3.1610233399013194e-05,
|
||
|
|
"loss": 0.34025261402130125,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8060,
|
||
|
|
"token_acc": 0.8837196272437907,
|
||
|
|
"train_speed(iter/s)": 0.088833
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4679115690062243,
|
||
|
|
"grad_norm": 0.09785692393779755,
|
||
|
|
"learning_rate": 3.1526405346999946e-05,
|
||
|
|
"loss": 0.34408791065216066,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8080,
|
||
|
|
"token_acc": 0.8632519203232839,
|
||
|
|
"train_speed(iter/s)": 0.088829
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4764971023824853,
|
||
|
|
"grad_norm": 0.0918072834610939,
|
||
|
|
"learning_rate": 3.1442498474975694e-05,
|
||
|
|
"loss": 0.3405976057052612,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8100,
|
||
|
|
"token_acc": 0.8723113057185948,
|
||
|
|
"train_speed(iter/s)": 0.088832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4850826357587463,
|
||
|
|
"grad_norm": 0.10397264361381531,
|
||
|
|
"learning_rate": 3.1358513796291625e-05,
|
||
|
|
"loss": 0.3404028654098511,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8120,
|
||
|
|
"token_acc": 0.8617087474123225,
|
||
|
|
"train_speed(iter/s)": 0.088834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4936681691350073,
|
||
|
|
"grad_norm": 0.10147637873888016,
|
||
|
|
"learning_rate": 3.1274452325238604e-05,
|
||
|
|
"loss": 0.3449804067611694,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8140,
|
||
|
|
"token_acc": 0.881801972466236,
|
||
|
|
"train_speed(iter/s)": 0.088832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5022537025112683,
|
||
|
|
"grad_norm": 0.10313740372657776,
|
||
|
|
"learning_rate": 3.119031507703491e-05,
|
||
|
|
"loss": 0.34123189449310304,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8160,
|
||
|
|
"token_acc": 0.8796580674904387,
|
||
|
|
"train_speed(iter/s)": 0.088831
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5108392358875298,
|
||
|
|
"grad_norm": 0.10292479395866394,
|
||
|
|
"learning_rate": 3.1106103067814005e-05,
|
||
|
|
"loss": 0.342661452293396,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8180,
|
||
|
|
"token_acc": 0.8706844817024822,
|
||
|
|
"train_speed(iter/s)": 0.088834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5194247692637903,
|
||
|
|
"grad_norm": 0.10231835395097733,
|
||
|
|
"learning_rate": 3.102181731461225e-05,
|
||
|
|
"loss": 0.3427009344100952,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8200,
|
||
|
|
"token_acc": 0.8746011467506197,
|
||
|
|
"train_speed(iter/s)": 0.088833
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5280103026400518,
|
||
|
|
"grad_norm": 0.09958157688379288,
|
||
|
|
"learning_rate": 3.09374588353566e-05,
|
||
|
|
"loss": 0.34229106903076173,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8220,
|
||
|
|
"token_acc": 0.8835202199767901,
|
||
|
|
"train_speed(iter/s)": 0.088833
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5365958360163123,
|
||
|
|
"grad_norm": 0.10157457739114761,
|
||
|
|
"learning_rate": 3.085302864885235e-05,
|
||
|
|
"loss": 0.3417761564254761,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8240,
|
||
|
|
"token_acc": 0.8649101475499108,
|
||
|
|
"train_speed(iter/s)": 0.088834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.545181369392574,
|
||
|
|
"grad_norm": 0.0995817556977272,
|
||
|
|
"learning_rate": 3.076852777477079e-05,
|
||
|
|
"loss": 0.34410881996154785,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8260,
|
||
|
|
"token_acc": 0.8783496646486048,
|
||
|
|
"train_speed(iter/s)": 0.088836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5537669027688343,
|
||
|
|
"grad_norm": 0.09822899103164673,
|
||
|
|
"learning_rate": 3.068395723363694e-05,
|
||
|
|
"loss": 0.34146294593811033,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8280,
|
||
|
|
"token_acc": 0.8781757426389024,
|
||
|
|
"train_speed(iter/s)": 0.088836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.562352436145096,
|
||
|
|
"grad_norm": 0.10480652749538422,
|
||
|
|
"learning_rate": 3.0599318046817144e-05,
|
||
|
|
"loss": 0.34048995971679685,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8300,
|
||
|
|
"token_acc": 0.8748031260669741,
|
||
|
|
"train_speed(iter/s)": 0.088838
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5709379695213563,
|
||
|
|
"grad_norm": 0.09434372186660767,
|
||
|
|
"learning_rate": 3.051461123650685e-05,
|
||
|
|
"loss": 0.33703758716583254,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8320,
|
||
|
|
"token_acc": 0.8765604747936422,
|
||
|
|
"train_speed(iter/s)": 0.088842
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.579523502897618,
|
||
|
|
"grad_norm": 0.09659520536661148,
|
||
|
|
"learning_rate": 3.0429837825718162e-05,
|
||
|
|
"loss": 0.3348528385162354,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8340,
|
||
|
|
"token_acc": 0.8765401382308406,
|
||
|
|
"train_speed(iter/s)": 0.088834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5881090362738783,
|
||
|
|
"grad_norm": 0.09309985488653183,
|
||
|
|
"learning_rate": 3.0344998838267525e-05,
|
||
|
|
"loss": 0.3402057647705078,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8360,
|
||
|
|
"token_acc": 0.8651010368553427,
|
||
|
|
"train_speed(iter/s)": 0.088836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.59669456965014,
|
||
|
|
"grad_norm": 0.0929030105471611,
|
||
|
|
"learning_rate": 3.0260095298763376e-05,
|
||
|
|
"loss": 0.34411866664886476,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8380,
|
||
|
|
"token_acc": 0.8811669848458061,
|
||
|
|
"train_speed(iter/s)": 0.088838
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6052801030264003,
|
||
|
|
"grad_norm": 0.0983252078294754,
|
||
|
|
"learning_rate": 3.017512823259373e-05,
|
||
|
|
"loss": 0.34260566234588624,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8400,
|
||
|
|
"token_acc": 0.8748058346767034,
|
||
|
|
"train_speed(iter/s)": 0.088837
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.613865636402662,
|
||
|
|
"grad_norm": 0.10412958264350891,
|
||
|
|
"learning_rate": 3.0090098665913857e-05,
|
||
|
|
"loss": 0.3410640716552734,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8420,
|
||
|
|
"token_acc": 0.8833422403080311,
|
||
|
|
"train_speed(iter/s)": 0.088839
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6224511697789223,
|
||
|
|
"grad_norm": 0.1032663881778717,
|
||
|
|
"learning_rate": 3.0005007625633806e-05,
|
||
|
|
"loss": 0.3369549512863159,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8440,
|
||
|
|
"token_acc": 0.8744007729088683,
|
||
|
|
"train_speed(iter/s)": 0.088841
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.631036703155184,
|
||
|
|
"grad_norm": 0.09928712248802185,
|
||
|
|
"learning_rate": 2.9919856139406093e-05,
|
||
|
|
"loss": 0.3410694122314453,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8460,
|
||
|
|
"token_acc": 0.868446777131458,
|
||
|
|
"train_speed(iter/s)": 0.088844
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6396222365314443,
|
||
|
|
"grad_norm": 0.10240930318832397,
|
||
|
|
"learning_rate": 2.9834645235613202e-05,
|
||
|
|
"loss": 0.34042160511016845,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8480,
|
||
|
|
"token_acc": 0.8746132434983096,
|
||
|
|
"train_speed(iter/s)": 0.088845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.648207769907706,
|
||
|
|
"grad_norm": 0.09757622331380844,
|
||
|
|
"learning_rate": 2.9749375943355245e-05,
|
||
|
|
"loss": 0.3391597032546997,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8500,
|
||
|
|
"token_acc": 0.8870871533336139,
|
||
|
|
"train_speed(iter/s)": 0.088847
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6567933032839663,
|
||
|
|
"grad_norm": 0.10708373039960861,
|
||
|
|
"learning_rate": 2.966404929243746e-05,
|
||
|
|
"loss": 0.3418737888336182,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8520,
|
||
|
|
"token_acc": 0.8803486188795007,
|
||
|
|
"train_speed(iter/s)": 0.08885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.665378836660228,
|
||
|
|
"grad_norm": 0.09238722175359726,
|
||
|
|
"learning_rate": 2.9578666313357866e-05,
|
||
|
|
"loss": 0.3395582675933838,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8540,
|
||
|
|
"token_acc": 0.8617492297025544,
|
||
|
|
"train_speed(iter/s)": 0.088851
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6739643700364883,
|
||
|
|
"grad_norm": 0.0982414111495018,
|
||
|
|
"learning_rate": 2.9493228037294702e-05,
|
||
|
|
"loss": 0.339850926399231,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8560,
|
||
|
|
"token_acc": 0.872913510605142,
|
||
|
|
"train_speed(iter/s)": 0.088854
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.68254990341275,
|
||
|
|
"grad_norm": 0.09378170222043991,
|
||
|
|
"learning_rate": 2.9407735496094074e-05,
|
||
|
|
"loss": 0.3445668935775757,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8580,
|
||
|
|
"token_acc": 0.8608412452277943,
|
||
|
|
"train_speed(iter/s)": 0.088857
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6911354367890103,
|
||
|
|
"grad_norm": 0.10139860957860947,
|
||
|
|
"learning_rate": 2.9322189722257437e-05,
|
||
|
|
"loss": 0.33951511383056643,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8600,
|
||
|
|
"token_acc": 0.8813381599903551,
|
||
|
|
"train_speed(iter/s)": 0.088858
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.699720970165272,
|
||
|
|
"grad_norm": 0.10095764696598053,
|
||
|
|
"learning_rate": 2.9236591748929143e-05,
|
||
|
|
"loss": 0.3414825201034546,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8620,
|
||
|
|
"token_acc": 0.8747491060455407,
|
||
|
|
"train_speed(iter/s)": 0.088861
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7083065035415324,
|
||
|
|
"grad_norm": 0.09368202835321426,
|
||
|
|
"learning_rate": 2.915094260988397e-05,
|
||
|
|
"loss": 0.3400054216384888,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8640,
|
||
|
|
"token_acc": 0.8603559177014007,
|
||
|
|
"train_speed(iter/s)": 0.088863
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7168920369177934,
|
||
|
|
"grad_norm": 0.09599091857671738,
|
||
|
|
"learning_rate": 2.906524333951461e-05,
|
||
|
|
"loss": 0.33973557949066163,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8660,
|
||
|
|
"token_acc": 0.8864862275305668,
|
||
|
|
"train_speed(iter/s)": 0.088865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7254775702940544,
|
||
|
|
"grad_norm": 0.0969940647482872,
|
||
|
|
"learning_rate": 2.8979494972819227e-05,
|
||
|
|
"loss": 0.3434182405471802,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8680,
|
||
|
|
"token_acc": 0.8716537070538549,
|
||
|
|
"train_speed(iter/s)": 0.088858
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7340631036703154,
|
||
|
|
"grad_norm": 0.10267031192779541,
|
||
|
|
"learning_rate": 2.8893698545388887e-05,
|
||
|
|
"loss": 0.3440374851226807,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8700,
|
||
|
|
"token_acc": 0.8709150326797386,
|
||
|
|
"train_speed(iter/s)": 0.088861
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7426486370465764,
|
||
|
|
"grad_norm": 0.09835559874773026,
|
||
|
|
"learning_rate": 2.8807855093395126e-05,
|
||
|
|
"loss": 0.34554252624511717,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8720,
|
||
|
|
"token_acc": 0.8670599046959998,
|
||
|
|
"train_speed(iter/s)": 0.088863
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7512341704228374,
|
||
|
|
"grad_norm": 0.0885239914059639,
|
||
|
|
"learning_rate": 2.8721965653577386e-05,
|
||
|
|
"loss": 0.3446002244949341,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8740,
|
||
|
|
"token_acc": 0.8721490695849959,
|
||
|
|
"train_speed(iter/s)": 0.088867
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7598197037990984,
|
||
|
|
"grad_norm": 0.09081339836120605,
|
||
|
|
"learning_rate": 2.86360312632305e-05,
|
||
|
|
"loss": 0.33843419551849363,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8760,
|
||
|
|
"token_acc": 0.8680939478458125,
|
||
|
|
"train_speed(iter/s)": 0.088869
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7684052371753594,
|
||
|
|
"grad_norm": 0.09640111774206161,
|
||
|
|
"learning_rate": 2.855005296019218e-05,
|
||
|
|
"loss": 0.340420126914978,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8780,
|
||
|
|
"token_acc": 0.8749122556452666,
|
||
|
|
"train_speed(iter/s)": 0.088872
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7769907705516204,
|
||
|
|
"grad_norm": 0.0949261263012886,
|
||
|
|
"learning_rate": 2.8464031782830474e-05,
|
||
|
|
"loss": 0.3449671983718872,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8800,
|
||
|
|
"token_acc": 0.8710775436891774,
|
||
|
|
"train_speed(iter/s)": 0.088876
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7855763039278814,
|
||
|
|
"grad_norm": 0.09448053687810898,
|
||
|
|
"learning_rate": 2.837796877003124e-05,
|
||
|
|
"loss": 0.3435060977935791,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8820,
|
||
|
|
"token_acc": 0.884149136577708,
|
||
|
|
"train_speed(iter/s)": 0.08888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7941618373041424,
|
||
|
|
"grad_norm": 0.09816328436136246,
|
||
|
|
"learning_rate": 2.8291864961185566e-05,
|
||
|
|
"loss": 0.34175992012023926,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8840,
|
||
|
|
"token_acc": 0.8704215639701488,
|
||
|
|
"train_speed(iter/s)": 0.088882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8027473706804034,
|
||
|
|
"grad_norm": 0.09840340167284012,
|
||
|
|
"learning_rate": 2.820572139617725e-05,
|
||
|
|
"loss": 0.3442914247512817,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8860,
|
||
|
|
"token_acc": 0.8852563932460973,
|
||
|
|
"train_speed(iter/s)": 0.088885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8113329040566644,
|
||
|
|
"grad_norm": 0.09052480757236481,
|
||
|
|
"learning_rate": 2.8119539115370218e-05,
|
||
|
|
"loss": 0.3354163408279419,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8880,
|
||
|
|
"token_acc": 0.8710054027589692,
|
||
|
|
"train_speed(iter/s)": 0.088887
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8199184374329254,
|
||
|
|
"grad_norm": 0.09055832773447037,
|
||
|
|
"learning_rate": 2.803331915959599e-05,
|
||
|
|
"loss": 0.341020393371582,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8900,
|
||
|
|
"token_acc": 0.8775211583840608,
|
||
|
|
"train_speed(iter/s)": 0.088889
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8285039708091864,
|
||
|
|
"grad_norm": 0.09606460481882095,
|
||
|
|
"learning_rate": 2.7947062570141073e-05,
|
||
|
|
"loss": 0.34467277526855467,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8920,
|
||
|
|
"token_acc": 0.8684845089446742,
|
||
|
|
"train_speed(iter/s)": 0.088892
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8370895041854474,
|
||
|
|
"grad_norm": 0.0941082313656807,
|
||
|
|
"learning_rate": 2.7860770388734408e-05,
|
||
|
|
"loss": 0.34154183864593507,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8940,
|
||
|
|
"token_acc": 0.8651064878551884,
|
||
|
|
"train_speed(iter/s)": 0.088895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8456750375617084,
|
||
|
|
"grad_norm": 0.08800920099020004,
|
||
|
|
"learning_rate": 2.7774443657534788e-05,
|
||
|
|
"loss": 0.34454681873321535,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8960,
|
||
|
|
"token_acc": 0.884229596704054,
|
||
|
|
"train_speed(iter/s)": 0.088899
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8542605709379694,
|
||
|
|
"grad_norm": 0.0993284210562706,
|
||
|
|
"learning_rate": 2.7688083419118255e-05,
|
||
|
|
"loss": 0.3417619466781616,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 8980,
|
||
|
|
"token_acc": 0.8696293253324922,
|
||
|
|
"train_speed(iter/s)": 0.088902
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8628461043142304,
|
||
|
|
"grad_norm": 0.10383660346269608,
|
||
|
|
"learning_rate": 2.760169071646553e-05,
|
||
|
|
"loss": 0.34536774158477784,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9000,
|
||
|
|
"token_acc": 0.8775136024730062,
|
||
|
|
"train_speed(iter/s)": 0.088905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8628461043142304,
|
||
|
|
"eval_loss": 0.4432525634765625,
|
||
|
|
"eval_runtime": 69.6489,
|
||
|
|
"eval_samples_per_second": 54.042,
|
||
|
|
"eval_steps_per_second": 0.689,
|
||
|
|
"eval_token_acc": 0.8414275958111643,
|
||
|
|
"step": 9000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8714316376904914,
|
||
|
|
"grad_norm": 0.0947885811328888,
|
||
|
|
"learning_rate": 2.7515266592949407e-05,
|
||
|
|
"loss": 0.3397974491119385,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9020,
|
||
|
|
"token_acc": 0.8571858554733831,
|
||
|
|
"train_speed(iter/s)": 0.08881
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8800171710667524,
|
||
|
|
"grad_norm": 0.09524156153202057,
|
||
|
|
"learning_rate": 2.742881209232215e-05,
|
||
|
|
"loss": 0.3427132129669189,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9040,
|
||
|
|
"token_acc": 0.868957431040566,
|
||
|
|
"train_speed(iter/s)": 0.088802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8886027044430134,
|
||
|
|
"grad_norm": 0.08956858515739441,
|
||
|
|
"learning_rate": 2.7342328258702894e-05,
|
||
|
|
"loss": 0.34703960418701174,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9060,
|
||
|
|
"token_acc": 0.8717364607638463,
|
||
|
|
"train_speed(iter/s)": 0.088797
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8971882378192744,
|
||
|
|
"grad_norm": 0.09309873729944229,
|
||
|
|
"learning_rate": 2.7255816136565026e-05,
|
||
|
|
"loss": 0.34093830585479734,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9080,
|
||
|
|
"token_acc": 0.8860340449246085,
|
||
|
|
"train_speed(iter/s)": 0.088797
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9057737711955354,
|
||
|
|
"grad_norm": 0.09236317873001099,
|
||
|
|
"learning_rate": 2.7169276770723585e-05,
|
||
|
|
"loss": 0.3432276248931885,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9100,
|
||
|
|
"token_acc": 0.8692972431017865,
|
||
|
|
"train_speed(iter/s)": 0.088797
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9143593045717964,
|
||
|
|
"grad_norm": 0.09957270324230194,
|
||
|
|
"learning_rate": 2.708271120632262e-05,
|
||
|
|
"loss": 0.34100799560546874,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9120,
|
||
|
|
"token_acc": 0.8780453295762229,
|
||
|
|
"train_speed(iter/s)": 0.088796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9229448379480574,
|
||
|
|
"grad_norm": 0.09253112971782684,
|
||
|
|
"learning_rate": 2.69961204888226e-05,
|
||
|
|
"loss": 0.344201922416687,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9140,
|
||
|
|
"token_acc": 0.893788044699683,
|
||
|
|
"train_speed(iter/s)": 0.088799
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9315303713243184,
|
||
|
|
"grad_norm": 0.09970075637102127,
|
||
|
|
"learning_rate": 2.6909505663987756e-05,
|
||
|
|
"loss": 0.34385430812835693,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9160,
|
||
|
|
"token_acc": 0.884597342165496,
|
||
|
|
"train_speed(iter/s)": 0.088801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9401159047005794,
|
||
|
|
"grad_norm": 0.0891101136803627,
|
||
|
|
"learning_rate": 2.682286777787348e-05,
|
||
|
|
"loss": 0.3451590299606323,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9180,
|
||
|
|
"token_acc": 0.8716154630632927,
|
||
|
|
"train_speed(iter/s)": 0.0888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9487014380768404,
|
||
|
|
"grad_norm": 0.09408137947320938,
|
||
|
|
"learning_rate": 2.6736207876813646e-05,
|
||
|
|
"loss": 0.34462172985076905,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9200,
|
||
|
|
"token_acc": 0.8778122218028758,
|
||
|
|
"train_speed(iter/s)": 0.088802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9572869714531014,
|
||
|
|
"grad_norm": 0.09145346283912659,
|
||
|
|
"learning_rate": 2.664952700740806e-05,
|
||
|
|
"loss": 0.34248254299163816,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9220,
|
||
|
|
"token_acc": 0.872891004579533,
|
||
|
|
"train_speed(iter/s)": 0.088803
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9658725048293624,
|
||
|
|
"grad_norm": 0.09725998342037201,
|
||
|
|
"learning_rate": 2.6562826216509696e-05,
|
||
|
|
"loss": 0.34380669593811036,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9240,
|
||
|
|
"token_acc": 0.8909276331759067,
|
||
|
|
"train_speed(iter/s)": 0.088804
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9744580382056234,
|
||
|
|
"grad_norm": 0.10166844725608826,
|
||
|
|
"learning_rate": 2.6476106551212188e-05,
|
||
|
|
"loss": 0.34403514862060547,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9260,
|
||
|
|
"token_acc": 0.8776962289782687,
|
||
|
|
"train_speed(iter/s)": 0.088806
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9830435715818844,
|
||
|
|
"grad_norm": 0.09070953726768494,
|
||
|
|
"learning_rate": 2.6389369058837077e-05,
|
||
|
|
"loss": 0.341811990737915,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9280,
|
||
|
|
"token_acc": 0.8719749437415167,
|
||
|
|
"train_speed(iter/s)": 0.088808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9916291049581454,
|
||
|
|
"grad_norm": 0.09677760303020477,
|
||
|
|
"learning_rate": 2.6302614786921204e-05,
|
||
|
|
"loss": 0.3442156553268433,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9300,
|
||
|
|
"token_acc": 0.882238909204825,
|
||
|
|
"train_speed(iter/s)": 0.088808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 0.15741688013076782,
|
||
|
|
"learning_rate": 2.621584478320408e-05,
|
||
|
|
"loss": 0.3397855758666992,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9320,
|
||
|
|
"token_acc": 0.8889204303051386,
|
||
|
|
"train_speed(iter/s)": 0.088814
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.008585533376261,
|
||
|
|
"grad_norm": 0.10205920785665512,
|
||
|
|
"learning_rate": 2.6129060095615187e-05,
|
||
|
|
"loss": 0.29747543334960935,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9340,
|
||
|
|
"token_acc": 0.8900451968067217,
|
||
|
|
"train_speed(iter/s)": 0.0888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.017171066752522,
|
||
|
|
"grad_norm": 0.10247659683227539,
|
||
|
|
"learning_rate": 2.604226177226137e-05,
|
||
|
|
"loss": 0.30353684425354005,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9360,
|
||
|
|
"token_acc": 0.886528226098631,
|
||
|
|
"train_speed(iter/s)": 0.088801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.025756600128783,
|
||
|
|
"grad_norm": 0.10435572266578674,
|
||
|
|
"learning_rate": 2.5955450861414126e-05,
|
||
|
|
"loss": 0.30368824005126954,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9380,
|
||
|
|
"token_acc": 0.8827944824311919,
|
||
|
|
"train_speed(iter/s)": 0.088803
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.034342133505044,
|
||
|
|
"grad_norm": 0.1014116182923317,
|
||
|
|
"learning_rate": 2.586862841149701e-05,
|
||
|
|
"loss": 0.3020852327346802,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9400,
|
||
|
|
"token_acc": 0.8891262896776423,
|
||
|
|
"train_speed(iter/s)": 0.088805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.042927666881305,
|
||
|
|
"grad_norm": 0.10401485115289688,
|
||
|
|
"learning_rate": 2.5781795471072885e-05,
|
||
|
|
"loss": 0.3056429386138916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9420,
|
||
|
|
"token_acc": 0.8829484753143999,
|
||
|
|
"train_speed(iter/s)": 0.088807
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.051513200257566,
|
||
|
|
"grad_norm": 0.10134406387805939,
|
||
|
|
"learning_rate": 2.5694953088831352e-05,
|
||
|
|
"loss": 0.30531723499298097,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9440,
|
||
|
|
"token_acc": 0.8840279216629264,
|
||
|
|
"train_speed(iter/s)": 0.088808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.060098733633827,
|
||
|
|
"grad_norm": 0.10662077367305756,
|
||
|
|
"learning_rate": 2.5608102313576027e-05,
|
||
|
|
"loss": 0.3047459363937378,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9460,
|
||
|
|
"token_acc": 0.9002080243657248,
|
||
|
|
"train_speed(iter/s)": 0.088811
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.068684267010088,
|
||
|
|
"grad_norm": 0.10326355695724487,
|
||
|
|
"learning_rate": 2.5521244194211884e-05,
|
||
|
|
"loss": 0.30735197067260744,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9480,
|
||
|
|
"token_acc": 0.8819828054997908,
|
||
|
|
"train_speed(iter/s)": 0.088814
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.077269800386349,
|
||
|
|
"grad_norm": 0.10981076210737228,
|
||
|
|
"learning_rate": 2.5434379779732603e-05,
|
||
|
|
"loss": 0.30461032390594484,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9500,
|
||
|
|
"token_acc": 0.882671980207554,
|
||
|
|
"train_speed(iter/s)": 0.088816
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.08585533376261,
|
||
|
|
"grad_norm": 0.09967193752527237,
|
||
|
|
"learning_rate": 2.5347510119207878e-05,
|
||
|
|
"loss": 0.3016824722290039,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9520,
|
||
|
|
"token_acc": 0.8960580499977037,
|
||
|
|
"train_speed(iter/s)": 0.088818
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.094440867138871,
|
||
|
|
"grad_norm": 0.10693041980266571,
|
||
|
|
"learning_rate": 2.5260636261770777e-05,
|
||
|
|
"loss": 0.3073539972305298,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9540,
|
||
|
|
"token_acc": 0.890892156523979,
|
||
|
|
"train_speed(iter/s)": 0.08882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.103026400515132,
|
||
|
|
"grad_norm": 0.10553585737943649,
|
||
|
|
"learning_rate": 2.5173759256605027e-05,
|
||
|
|
"loss": 0.30216293334960936,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9560,
|
||
|
|
"token_acc": 0.8891749049597542,
|
||
|
|
"train_speed(iter/s)": 0.088822
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.111611933891393,
|
||
|
|
"grad_norm": 0.10220309346914291,
|
||
|
|
"learning_rate": 2.5086880152932402e-05,
|
||
|
|
"loss": 0.3027711153030396,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9580,
|
||
|
|
"token_acc": 0.8892161871654268,
|
||
|
|
"train_speed(iter/s)": 0.088824
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.120197467267654,
|
||
|
|
"grad_norm": 0.10086795687675476,
|
||
|
|
"learning_rate": 2.5e-05,
|
||
|
|
"loss": 0.30600886344909667,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9600,
|
||
|
|
"token_acc": 0.8824066390041494,
|
||
|
|
"train_speed(iter/s)": 0.088825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1287830006439155,
|
||
|
|
"grad_norm": 0.10636570304632187,
|
||
|
|
"learning_rate": 2.4913119847067603e-05,
|
||
|
|
"loss": 0.30425918102264404,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9620,
|
||
|
|
"token_acc": 0.8838457920573797,
|
||
|
|
"train_speed(iter/s)": 0.088829
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.137368534020176,
|
||
|
|
"grad_norm": 0.10464228689670563,
|
||
|
|
"learning_rate": 2.4826240743394982e-05,
|
||
|
|
"loss": 0.3025052070617676,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9640,
|
||
|
|
"token_acc": 0.8769574601853707,
|
||
|
|
"train_speed(iter/s)": 0.088832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.145954067396437,
|
||
|
|
"grad_norm": 0.1083202064037323,
|
||
|
|
"learning_rate": 2.4739363738229232e-05,
|
||
|
|
"loss": 0.30380189418792725,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9660,
|
||
|
|
"token_acc": 0.8893545408707838,
|
||
|
|
"train_speed(iter/s)": 0.088834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.154539600772698,
|
||
|
|
"grad_norm": 0.10492519289255142,
|
||
|
|
"learning_rate": 2.4652489880792128e-05,
|
||
|
|
"loss": 0.30443031787872316,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9680,
|
||
|
|
"token_acc": 0.8797012712026356,
|
||
|
|
"train_speed(iter/s)": 0.088831
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1631251341489595,
|
||
|
|
"grad_norm": 0.09974920004606247,
|
||
|
|
"learning_rate": 2.4565620220267396e-05,
|
||
|
|
"loss": 0.3066636800765991,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9700,
|
||
|
|
"token_acc": 0.8844553871840214,
|
||
|
|
"train_speed(iter/s)": 0.088833
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.17171066752522,
|
||
|
|
"grad_norm": 0.0984271839261055,
|
||
|
|
"learning_rate": 2.447875580578812e-05,
|
||
|
|
"loss": 0.3007610082626343,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9720,
|
||
|
|
"token_acc": 0.8761438976087101,
|
||
|
|
"train_speed(iter/s)": 0.088836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.180296200901481,
|
||
|
|
"grad_norm": 0.10344758629798889,
|
||
|
|
"learning_rate": 2.439189768642398e-05,
|
||
|
|
"loss": 0.3055333375930786,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9740,
|
||
|
|
"token_acc": 0.8798825324153172,
|
||
|
|
"train_speed(iter/s)": 0.088839
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.188881734277742,
|
||
|
|
"grad_norm": 0.10062626749277115,
|
||
|
|
"learning_rate": 2.4305046911168653e-05,
|
||
|
|
"loss": 0.30226128101348876,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9760,
|
||
|
|
"token_acc": 0.877849069049261,
|
||
|
|
"train_speed(iter/s)": 0.088842
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.197467267654003,
|
||
|
|
"grad_norm": 0.1044364646077156,
|
||
|
|
"learning_rate": 2.4218204528927117e-05,
|
||
|
|
"loss": 0.3027973175048828,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9780,
|
||
|
|
"token_acc": 0.8901094903786694,
|
||
|
|
"train_speed(iter/s)": 0.088844
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.206052801030264,
|
||
|
|
"grad_norm": 0.09792552888393402,
|
||
|
|
"learning_rate": 2.4131371588503003e-05,
|
||
|
|
"loss": 0.30410778522491455,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9800,
|
||
|
|
"token_acc": 0.8904304675100755,
|
||
|
|
"train_speed(iter/s)": 0.088846
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.214638334406525,
|
||
|
|
"grad_norm": 0.11304216086864471,
|
||
|
|
"learning_rate": 2.4044549138585877e-05,
|
||
|
|
"loss": 0.3036644697189331,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9820,
|
||
|
|
"token_acc": 0.8800798395927938,
|
||
|
|
"train_speed(iter/s)": 0.088849
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.223223867782786,
|
||
|
|
"grad_norm": 0.10015735030174255,
|
||
|
|
"learning_rate": 2.395773822773863e-05,
|
||
|
|
"loss": 0.30791220664978025,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9840,
|
||
|
|
"token_acc": 0.8848758135171705,
|
||
|
|
"train_speed(iter/s)": 0.088852
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.231809401159047,
|
||
|
|
"grad_norm": 0.09757008403539658,
|
||
|
|
"learning_rate": 2.3870939904384815e-05,
|
||
|
|
"loss": 0.30361478328704833,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9860,
|
||
|
|
"token_acc": 0.8940831985400854,
|
||
|
|
"train_speed(iter/s)": 0.088855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.240394934535308,
|
||
|
|
"grad_norm": 0.09704037755727768,
|
||
|
|
"learning_rate": 2.378415521679593e-05,
|
||
|
|
"loss": 0.3088146924972534,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9880,
|
||
|
|
"token_acc": 0.887872541700794,
|
||
|
|
"train_speed(iter/s)": 0.088857
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.248980467911569,
|
||
|
|
"grad_norm": 0.10431049019098282,
|
||
|
|
"learning_rate": 2.3697385213078805e-05,
|
||
|
|
"loss": 0.30578904151916503,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9900,
|
||
|
|
"token_acc": 0.8816228300017872,
|
||
|
|
"train_speed(iter/s)": 0.088859
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.25756600128783,
|
||
|
|
"grad_norm": 0.09818245470523834,
|
||
|
|
"learning_rate": 2.361063094116293e-05,
|
||
|
|
"loss": 0.3096456527709961,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9920,
|
||
|
|
"token_acc": 0.8843864415701027,
|
||
|
|
"train_speed(iter/s)": 0.088863
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2661515346640915,
|
||
|
|
"grad_norm": 0.09961646795272827,
|
||
|
|
"learning_rate": 2.3523893448787818e-05,
|
||
|
|
"loss": 0.30978071689605713,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9940,
|
||
|
|
"token_acc": 0.8849045058887656,
|
||
|
|
"train_speed(iter/s)": 0.088865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.274737068040352,
|
||
|
|
"grad_norm": 0.10661664605140686,
|
||
|
|
"learning_rate": 2.3437173783490307e-05,
|
||
|
|
"loss": 0.30757110118865966,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9960,
|
||
|
|
"token_acc": 0.8822289688850337,
|
||
|
|
"train_speed(iter/s)": 0.088868
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.283322601416613,
|
||
|
|
"grad_norm": 0.1005556732416153,
|
||
|
|
"learning_rate": 2.3350472992591947e-05,
|
||
|
|
"loss": 0.30759055614471437,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 9980,
|
||
|
|
"token_acc": 0.8835728408590111,
|
||
|
|
"train_speed(iter/s)": 0.088871
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.291908134792874,
|
||
|
|
"grad_norm": 0.09660108387470245,
|
||
|
|
"learning_rate": 2.3263792123186353e-05,
|
||
|
|
"loss": 0.30487823486328125,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10000,
|
||
|
|
"token_acc": 0.8812157065140277,
|
||
|
|
"train_speed(iter/s)": 0.088874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.291908134792874,
|
||
|
|
"eval_loss": 0.45848873257637024,
|
||
|
|
"eval_runtime": 74.5961,
|
||
|
|
"eval_samples_per_second": 50.458,
|
||
|
|
"eval_steps_per_second": 0.643,
|
||
|
|
"eval_token_acc": 0.8390728406167212,
|
||
|
|
"step": 10000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3004936681691355,
|
||
|
|
"grad_norm": 0.10211784392595291,
|
||
|
|
"learning_rate": 2.3177132222126536e-05,
|
||
|
|
"loss": 0.3054050922393799,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10020,
|
||
|
|
"token_acc": 0.861507260950951,
|
||
|
|
"train_speed(iter/s)": 0.088783
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.309079201545396,
|
||
|
|
"grad_norm": 0.1039443388581276,
|
||
|
|
"learning_rate": 2.3090494336012253e-05,
|
||
|
|
"loss": 0.3065175533294678,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10040,
|
||
|
|
"token_acc": 0.8864880616836895,
|
||
|
|
"train_speed(iter/s)": 0.088777
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.317664734921657,
|
||
|
|
"grad_norm": 0.1060820147395134,
|
||
|
|
"learning_rate": 2.3003879511177405e-05,
|
||
|
|
"loss": 0.31085891723632814,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10060,
|
||
|
|
"token_acc": 0.8897265286253574,
|
||
|
|
"train_speed(iter/s)": 0.088773
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.326250268297918,
|
||
|
|
"grad_norm": 0.10298410803079605,
|
||
|
|
"learning_rate": 2.2917288793677382e-05,
|
||
|
|
"loss": 0.31043663024902346,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10080,
|
||
|
|
"token_acc": 0.8748683362897243,
|
||
|
|
"train_speed(iter/s)": 0.088769
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3348358016741795,
|
||
|
|
"grad_norm": 0.1114133968949318,
|
||
|
|
"learning_rate": 2.2830723229276424e-05,
|
||
|
|
"loss": 0.31448495388031006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10100,
|
||
|
|
"token_acc": 0.8866603970434808,
|
||
|
|
"train_speed(iter/s)": 0.088766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.34342133505044,
|
||
|
|
"grad_norm": 0.10426465421915054,
|
||
|
|
"learning_rate": 2.2744183863434976e-05,
|
||
|
|
"loss": 0.31032671928405764,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10120,
|
||
|
|
"token_acc": 0.8818581792950851,
|
||
|
|
"train_speed(iter/s)": 0.088765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.352006868426701,
|
||
|
|
"grad_norm": 0.10287055373191833,
|
||
|
|
"learning_rate": 2.265767174129711e-05,
|
||
|
|
"loss": 0.3112910747528076,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10140,
|
||
|
|
"token_acc": 0.8739920728492123,
|
||
|
|
"train_speed(iter/s)": 0.088763
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.360592401802962,
|
||
|
|
"grad_norm": 0.10366437584161758,
|
||
|
|
"learning_rate": 2.2571187907677853e-05,
|
||
|
|
"loss": 0.31062612533569334,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10160,
|
||
|
|
"token_acc": 0.8771409538302638,
|
||
|
|
"train_speed(iter/s)": 0.088761
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.369177935179223,
|
||
|
|
"grad_norm": 0.10374686121940613,
|
||
|
|
"learning_rate": 2.2484733407050602e-05,
|
||
|
|
"loss": 0.31010420322418214,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10180,
|
||
|
|
"token_acc": 0.8837488220680202,
|
||
|
|
"train_speed(iter/s)": 0.088762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.377763468555484,
|
||
|
|
"grad_norm": 0.10094033926725388,
|
||
|
|
"learning_rate": 2.2398309283534477e-05,
|
||
|
|
"loss": 0.3080222845077515,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10200,
|
||
|
|
"token_acc": 0.8891878281040166,
|
||
|
|
"train_speed(iter/s)": 0.088764
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.386349001931745,
|
||
|
|
"grad_norm": 0.10435180366039276,
|
||
|
|
"learning_rate": 2.2311916580881754e-05,
|
||
|
|
"loss": 0.30961949825286866,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10220,
|
||
|
|
"token_acc": 0.8952461985350648,
|
||
|
|
"train_speed(iter/s)": 0.088764
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.394934535308006,
|
||
|
|
"grad_norm": 0.0953126922249794,
|
||
|
|
"learning_rate": 2.222555634246521e-05,
|
||
|
|
"loss": 0.3070392608642578,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10240,
|
||
|
|
"token_acc": 0.8863533099042126,
|
||
|
|
"train_speed(iter/s)": 0.088766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.403520068684267,
|
||
|
|
"grad_norm": 0.10288111865520477,
|
||
|
|
"learning_rate": 2.2139229611265594e-05,
|
||
|
|
"loss": 0.30999772548675536,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10260,
|
||
|
|
"token_acc": 0.8752241865231873,
|
||
|
|
"train_speed(iter/s)": 0.088766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.412105602060528,
|
||
|
|
"grad_norm": 0.10298358649015427,
|
||
|
|
"learning_rate": 2.205293742985893e-05,
|
||
|
|
"loss": 0.310498046875,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10280,
|
||
|
|
"token_acc": 0.895062097103973,
|
||
|
|
"train_speed(iter/s)": 0.088768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.420691135436789,
|
||
|
|
"grad_norm": 0.10269106179475784,
|
||
|
|
"learning_rate": 2.1966680840404013e-05,
|
||
|
|
"loss": 0.31382122039794924,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10300,
|
||
|
|
"token_acc": 0.8826629491356146,
|
||
|
|
"train_speed(iter/s)": 0.088769
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.42927666881305,
|
||
|
|
"grad_norm": 0.09890419244766235,
|
||
|
|
"learning_rate": 2.188046088462979e-05,
|
||
|
|
"loss": 0.31236202716827394,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10320,
|
||
|
|
"token_acc": 0.877004450607206,
|
||
|
|
"train_speed(iter/s)": 0.08877
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.437862202189311,
|
||
|
|
"grad_norm": 0.1035868227481842,
|
||
|
|
"learning_rate": 2.179427860382276e-05,
|
||
|
|
"loss": 0.31030888557434083,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10340,
|
||
|
|
"token_acc": 0.88265658710238,
|
||
|
|
"train_speed(iter/s)": 0.08877
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.446447735565572,
|
||
|
|
"grad_norm": 0.10644908994436264,
|
||
|
|
"learning_rate": 2.170813503881444e-05,
|
||
|
|
"loss": 0.31080482006072996,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10360,
|
||
|
|
"token_acc": 0.8680257223302367,
|
||
|
|
"train_speed(iter/s)": 0.088772
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.455033268941833,
|
||
|
|
"grad_norm": 0.10393664985895157,
|
||
|
|
"learning_rate": 2.162203122996876e-05,
|
||
|
|
"loss": 0.3072603702545166,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10380,
|
||
|
|
"token_acc": 0.8879988357215192,
|
||
|
|
"train_speed(iter/s)": 0.088774
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.463618802318094,
|
||
|
|
"grad_norm": 0.09875033795833588,
|
||
|
|
"learning_rate": 2.1535968217169535e-05,
|
||
|
|
"loss": 0.308307147026062,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10400,
|
||
|
|
"token_acc": 0.8760545062481376,
|
||
|
|
"train_speed(iter/s)": 0.088777
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.472204335694355,
|
||
|
|
"grad_norm": 0.10074667632579803,
|
||
|
|
"learning_rate": 2.1449947039807826e-05,
|
||
|
|
"loss": 0.3109966039657593,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10420,
|
||
|
|
"token_acc": 0.8947892374351213,
|
||
|
|
"train_speed(iter/s)": 0.088778
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.480789869070616,
|
||
|
|
"grad_norm": 0.09881151467561722,
|
||
|
|
"learning_rate": 2.1363968736769508e-05,
|
||
|
|
"loss": 0.3046985626220703,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10440,
|
||
|
|
"token_acc": 0.8952631152568657,
|
||
|
|
"train_speed(iter/s)": 0.08878
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.489375402446877,
|
||
|
|
"grad_norm": 0.09804583340883255,
|
||
|
|
"learning_rate": 2.1278034346422616e-05,
|
||
|
|
"loss": 0.31377933025360105,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10460,
|
||
|
|
"token_acc": 0.8788372867424049,
|
||
|
|
"train_speed(iter/s)": 0.088782
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.497960935823138,
|
||
|
|
"grad_norm": 0.10384197533130646,
|
||
|
|
"learning_rate": 2.1192144906604876e-05,
|
||
|
|
"loss": 0.3103285312652588,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10480,
|
||
|
|
"token_acc": 0.891363222526985,
|
||
|
|
"train_speed(iter/s)": 0.088783
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5065464691993995,
|
||
|
|
"grad_norm": 0.10672769695520401,
|
||
|
|
"learning_rate": 2.110630145461112e-05,
|
||
|
|
"loss": 0.3111438512802124,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10500,
|
||
|
|
"token_acc": 0.88412093531313,
|
||
|
|
"train_speed(iter/s)": 0.088785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.51513200257566,
|
||
|
|
"grad_norm": 0.10372064262628555,
|
||
|
|
"learning_rate": 2.102050502718078e-05,
|
||
|
|
"loss": 0.3104998111724854,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10520,
|
||
|
|
"token_acc": 0.8813348577961984,
|
||
|
|
"train_speed(iter/s)": 0.088787
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.523717535951921,
|
||
|
|
"grad_norm": 0.1009448915719986,
|
||
|
|
"learning_rate": 2.093475666048539e-05,
|
||
|
|
"loss": 0.30964412689208987,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10540,
|
||
|
|
"token_acc": 0.8954398710496272,
|
||
|
|
"train_speed(iter/s)": 0.088788
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.532303069328182,
|
||
|
|
"grad_norm": 0.10434540361166,
|
||
|
|
"learning_rate": 2.0849057390116042e-05,
|
||
|
|
"loss": 0.30902681350708006,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10560,
|
||
|
|
"token_acc": 0.8803101400044141,
|
||
|
|
"train_speed(iter/s)": 0.088789
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5408886027044435,
|
||
|
|
"grad_norm": 0.10229279845952988,
|
||
|
|
"learning_rate": 2.0763408251070866e-05,
|
||
|
|
"loss": 0.3061969757080078,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10580,
|
||
|
|
"token_acc": 0.8930533404217614,
|
||
|
|
"train_speed(iter/s)": 0.088791
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.549474136080704,
|
||
|
|
"grad_norm": 0.09319902211427689,
|
||
|
|
"learning_rate": 2.0677810277742565e-05,
|
||
|
|
"loss": 0.3094120740890503,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10600,
|
||
|
|
"token_acc": 0.8876524522036789,
|
||
|
|
"train_speed(iter/s)": 0.088793
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.558059669456965,
|
||
|
|
"grad_norm": 0.09506496042013168,
|
||
|
|
"learning_rate": 2.0592264503905932e-05,
|
||
|
|
"loss": 0.3105063199996948,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10620,
|
||
|
|
"token_acc": 0.8743828338452405,
|
||
|
|
"train_speed(iter/s)": 0.088795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.566645202833226,
|
||
|
|
"grad_norm": 0.09979739040136337,
|
||
|
|
"learning_rate": 2.0506771962705304e-05,
|
||
|
|
"loss": 0.30733799934387207,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10640,
|
||
|
|
"token_acc": 0.889678967341867,
|
||
|
|
"train_speed(iter/s)": 0.088798
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5752307362094875,
|
||
|
|
"grad_norm": 0.0996963307261467,
|
||
|
|
"learning_rate": 2.0421333686642137e-05,
|
||
|
|
"loss": 0.30787818431854247,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10660,
|
||
|
|
"token_acc": 0.8724791602710936,
|
||
|
|
"train_speed(iter/s)": 0.0888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.583816269585748,
|
||
|
|
"grad_norm": 0.10467605292797089,
|
||
|
|
"learning_rate": 2.0335950707562535e-05,
|
||
|
|
"loss": 0.30961976051330564,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10680,
|
||
|
|
"token_acc": 0.8865601551069852,
|
||
|
|
"train_speed(iter/s)": 0.088797
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.592401802962009,
|
||
|
|
"grad_norm": 0.10287564992904663,
|
||
|
|
"learning_rate": 2.0250624056644767e-05,
|
||
|
|
"loss": 0.30673904418945314,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10700,
|
||
|
|
"token_acc": 0.8888166591838771,
|
||
|
|
"train_speed(iter/s)": 0.088799
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.60098733633827,
|
||
|
|
"grad_norm": 0.10342861711978912,
|
||
|
|
"learning_rate": 2.0165354764386807e-05,
|
||
|
|
"loss": 0.3080348253250122,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10720,
|
||
|
|
"token_acc": 0.8935362282980741,
|
||
|
|
"train_speed(iter/s)": 0.088801
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6095728697145315,
|
||
|
|
"grad_norm": 0.09834201633930206,
|
||
|
|
"learning_rate": 2.0080143860593913e-05,
|
||
|
|
"loss": 0.30832786560058595,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10740,
|
||
|
|
"token_acc": 0.8824297207331616,
|
||
|
|
"train_speed(iter/s)": 0.088803
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.618158403090792,
|
||
|
|
"grad_norm": 0.10289661586284637,
|
||
|
|
"learning_rate": 1.9994992374366193e-05,
|
||
|
|
"loss": 0.3109771251678467,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10760,
|
||
|
|
"token_acc": 0.8895210650649608,
|
||
|
|
"train_speed(iter/s)": 0.088805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.626743936467053,
|
||
|
|
"grad_norm": 0.09662512689828873,
|
||
|
|
"learning_rate": 1.9909901334086152e-05,
|
||
|
|
"loss": 0.31307733058929443,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10780,
|
||
|
|
"token_acc": 0.8865438146287556,
|
||
|
|
"train_speed(iter/s)": 0.088807
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.635329469843314,
|
||
|
|
"grad_norm": 0.10243885219097137,
|
||
|
|
"learning_rate": 1.982487176740627e-05,
|
||
|
|
"loss": 0.31298274993896485,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10800,
|
||
|
|
"token_acc": 0.8782184863693918,
|
||
|
|
"train_speed(iter/s)": 0.088808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6439150032195755,
|
||
|
|
"grad_norm": 0.10350590944290161,
|
||
|
|
"learning_rate": 1.973990470123663e-05,
|
||
|
|
"loss": 0.309729266166687,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10820,
|
||
|
|
"token_acc": 0.8839905751216937,
|
||
|
|
"train_speed(iter/s)": 0.088809
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.652500536595836,
|
||
|
|
"grad_norm": 0.10676155984401703,
|
||
|
|
"learning_rate": 1.9655001161732478e-05,
|
||
|
|
"loss": 0.3093304395675659,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10840,
|
||
|
|
"token_acc": 0.8909490610287415,
|
||
|
|
"train_speed(iter/s)": 0.088812
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.661086069972097,
|
||
|
|
"grad_norm": 0.09464031457901001,
|
||
|
|
"learning_rate": 1.9570162174281847e-05,
|
||
|
|
"loss": 0.3070455312728882,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10860,
|
||
|
|
"token_acc": 0.8747045411759784,
|
||
|
|
"train_speed(iter/s)": 0.088813
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.669671603348358,
|
||
|
|
"grad_norm": 0.09355127811431885,
|
||
|
|
"learning_rate": 1.9485388763493153e-05,
|
||
|
|
"loss": 0.30823278427124023,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10880,
|
||
|
|
"token_acc": 0.9008269805356058,
|
||
|
|
"train_speed(iter/s)": 0.088815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6782571367246195,
|
||
|
|
"grad_norm": 0.0956326350569725,
|
||
|
|
"learning_rate": 1.9400681953182855e-05,
|
||
|
|
"loss": 0.30865190029144285,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10900,
|
||
|
|
"token_acc": 0.8869463759204074,
|
||
|
|
"train_speed(iter/s)": 0.088817
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.68684267010088,
|
||
|
|
"grad_norm": 0.10339660942554474,
|
||
|
|
"learning_rate": 1.9316042766363075e-05,
|
||
|
|
"loss": 0.3091820955276489,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10920,
|
||
|
|
"token_acc": 0.8778233411535858,
|
||
|
|
"train_speed(iter/s)": 0.088819
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.695428203477141,
|
||
|
|
"grad_norm": 0.0986744612455368,
|
||
|
|
"learning_rate": 1.9231472225229216e-05,
|
||
|
|
"loss": 0.31184089183807373,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10940,
|
||
|
|
"token_acc": 0.8970116747089019,
|
||
|
|
"train_speed(iter/s)": 0.088821
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.704013736853402,
|
||
|
|
"grad_norm": 0.10175996273756027,
|
||
|
|
"learning_rate": 1.9146971351147655e-05,
|
||
|
|
"loss": 0.3101097583770752,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10960,
|
||
|
|
"token_acc": 0.8852196976340255,
|
||
|
|
"train_speed(iter/s)": 0.088823
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7125992702296635,
|
||
|
|
"grad_norm": 0.7812356948852539,
|
||
|
|
"learning_rate": 1.9062541164643403e-05,
|
||
|
|
"loss": 0.3123283863067627,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 10980,
|
||
|
|
"token_acc": 0.8854708801840979,
|
||
|
|
"train_speed(iter/s)": 0.088825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.721184803605924,
|
||
|
|
"grad_norm": 0.10162019729614258,
|
||
|
|
"learning_rate": 1.897818268538776e-05,
|
||
|
|
"loss": 0.31052777767181394,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11000,
|
||
|
|
"token_acc": 0.8801016226848057,
|
||
|
|
"train_speed(iter/s)": 0.088825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.721184803605924,
|
||
|
|
"eval_loss": 0.45501717925071716,
|
||
|
|
"eval_runtime": 70.0969,
|
||
|
|
"eval_samples_per_second": 53.697,
|
||
|
|
"eval_steps_per_second": 0.685,
|
||
|
|
"eval_token_acc": 0.8396253696160709,
|
||
|
|
"step": 11000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.729770336982185,
|
||
|
|
"grad_norm": 0.10113983601331711,
|
||
|
|
"learning_rate": 1.8893896932185994e-05,
|
||
|
|
"loss": 0.30813672542572024,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11020,
|
||
|
|
"token_acc": 0.8546351539786743,
|
||
|
|
"train_speed(iter/s)": 0.088747
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.738355870358446,
|
||
|
|
"grad_norm": 0.103078193962574,
|
||
|
|
"learning_rate": 1.8809684922965097e-05,
|
||
|
|
"loss": 0.30388219356536866,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11040,
|
||
|
|
"token_acc": 0.8948337756570212,
|
||
|
|
"train_speed(iter/s)": 0.088741
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.746941403734707,
|
||
|
|
"grad_norm": 0.09970963001251221,
|
||
|
|
"learning_rate": 1.87255476747614e-05,
|
||
|
|
"loss": 0.3133774042129517,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11060,
|
||
|
|
"token_acc": 0.8968223367439061,
|
||
|
|
"train_speed(iter/s)": 0.088738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.755526937110968,
|
||
|
|
"grad_norm": 0.10380697250366211,
|
||
|
|
"learning_rate": 1.8641486203708387e-05,
|
||
|
|
"loss": 0.30957233905792236,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11080,
|
||
|
|
"token_acc": 0.8824741415108899,
|
||
|
|
"train_speed(iter/s)": 0.088734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.764112470487229,
|
||
|
|
"grad_norm": 0.1037619411945343,
|
||
|
|
"learning_rate": 1.855750152502431e-05,
|
||
|
|
"loss": 0.3057359457015991,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11100,
|
||
|
|
"token_acc": 0.8958725033279122,
|
||
|
|
"train_speed(iter/s)": 0.088733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.77269800386349,
|
||
|
|
"grad_norm": 0.10157765448093414,
|
||
|
|
"learning_rate": 1.847359465300006e-05,
|
||
|
|
"loss": 0.30702900886535645,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11120,
|
||
|
|
"token_acc": 0.876122716238661,
|
||
|
|
"train_speed(iter/s)": 0.088732
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.781283537239751,
|
||
|
|
"grad_norm": 0.09982700645923615,
|
||
|
|
"learning_rate": 1.83897666009868e-05,
|
||
|
|
"loss": 0.3116676092147827,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11140,
|
||
|
|
"token_acc": 0.8806798775281173,
|
||
|
|
"train_speed(iter/s)": 0.088731
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.789869070616012,
|
||
|
|
"grad_norm": 0.10041019320487976,
|
||
|
|
"learning_rate": 1.830601838138382e-05,
|
||
|
|
"loss": 0.30963037014007566,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11160,
|
||
|
|
"token_acc": 0.8754533556507809,
|
||
|
|
"train_speed(iter/s)": 0.088732
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.798454603992273,
|
||
|
|
"grad_norm": 0.09908230602741241,
|
||
|
|
"learning_rate": 1.8222351005626226e-05,
|
||
|
|
"loss": 0.31059741973876953,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11180,
|
||
|
|
"token_acc": 0.8883735287189193,
|
||
|
|
"train_speed(iter/s)": 0.088733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.807040137368534,
|
||
|
|
"grad_norm": 0.10159313678741455,
|
||
|
|
"learning_rate": 1.8138765484172775e-05,
|
||
|
|
"loss": 0.3082897186279297,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11200,
|
||
|
|
"token_acc": 0.8837576612751032,
|
||
|
|
"train_speed(iter/s)": 0.088733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.815625670744795,
|
||
|
|
"grad_norm": 0.09573191404342651,
|
||
|
|
"learning_rate": 1.805526282649369e-05,
|
||
|
|
"loss": 0.31205048561096194,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11220,
|
||
|
|
"token_acc": 0.8822307222234796,
|
||
|
|
"train_speed(iter/s)": 0.088734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.824211204121056,
|
||
|
|
"grad_norm": 0.10527610033750534,
|
||
|
|
"learning_rate": 1.797184404105839e-05,
|
||
|
|
"loss": 0.3125370264053345,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11240,
|
||
|
|
"token_acc": 0.877393258829162,
|
||
|
|
"train_speed(iter/s)": 0.088733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.832796737497317,
|
||
|
|
"grad_norm": 0.09318065643310547,
|
||
|
|
"learning_rate": 1.7888510135323414e-05,
|
||
|
|
"loss": 0.30796611309051514,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11260,
|
||
|
|
"token_acc": 0.8781843195222971,
|
||
|
|
"train_speed(iter/s)": 0.088735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.841382270873578,
|
||
|
|
"grad_norm": 0.09891670942306519,
|
||
|
|
"learning_rate": 1.780526211572016e-05,
|
||
|
|
"loss": 0.31104702949523927,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11280,
|
||
|
|
"token_acc": 0.8761593749911606,
|
||
|
|
"train_speed(iter/s)": 0.088737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.849967804249839,
|
||
|
|
"grad_norm": 0.09686607122421265,
|
||
|
|
"learning_rate": 1.772210098764281e-05,
|
||
|
|
"loss": 0.3131218433380127,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11300,
|
||
|
|
"token_acc": 0.8879271267617395,
|
||
|
|
"train_speed(iter/s)": 0.088736
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8585533376261,
|
||
|
|
"grad_norm": 0.09879806637763977,
|
||
|
|
"learning_rate": 1.7639027755436104e-05,
|
||
|
|
"loss": 0.30540714263916013,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11320,
|
||
|
|
"token_acc": 0.8883117608455857,
|
||
|
|
"train_speed(iter/s)": 0.088738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.867138871002361,
|
||
|
|
"grad_norm": 0.09865374863147736,
|
||
|
|
"learning_rate": 1.7556043422383293e-05,
|
||
|
|
"loss": 0.3053091287612915,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11340,
|
||
|
|
"token_acc": 0.8942929802909607,
|
||
|
|
"train_speed(iter/s)": 0.088737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.875724404378622,
|
||
|
|
"grad_norm": 0.10021404922008514,
|
||
|
|
"learning_rate": 1.7473148990693955e-05,
|
||
|
|
"loss": 0.31073627471923826,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11360,
|
||
|
|
"token_acc": 0.8850116031551548,
|
||
|
|
"train_speed(iter/s)": 0.088738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.884309937754883,
|
||
|
|
"grad_norm": 0.10069513320922852,
|
||
|
|
"learning_rate": 1.7390345461491954e-05,
|
||
|
|
"loss": 0.3094152927398682,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11380,
|
||
|
|
"token_acc": 0.8841858526281734,
|
||
|
|
"train_speed(iter/s)": 0.088739
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.892895471131144,
|
||
|
|
"grad_norm": 0.1061035767197609,
|
||
|
|
"learning_rate": 1.730763383480328e-05,
|
||
|
|
"loss": 0.30918545722961427,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11400,
|
||
|
|
"token_acc": 0.8904189361026621,
|
||
|
|
"train_speed(iter/s)": 0.08874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.901481004507405,
|
||
|
|
"grad_norm": 0.0995524600148201,
|
||
|
|
"learning_rate": 1.722501510954403e-05,
|
||
|
|
"loss": 0.3127927541732788,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11420,
|
||
|
|
"token_acc": 0.8850685685523632,
|
||
|
|
"train_speed(iter/s)": 0.088743
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.910066537883666,
|
||
|
|
"grad_norm": 0.09735783189535141,
|
||
|
|
"learning_rate": 1.7142490283508324e-05,
|
||
|
|
"loss": 0.30820300579071047,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11440,
|
||
|
|
"token_acc": 0.8748503235514588,
|
||
|
|
"train_speed(iter/s)": 0.088745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.918652071259927,
|
||
|
|
"grad_norm": 0.10155107080936432,
|
||
|
|
"learning_rate": 1.706006035335625e-05,
|
||
|
|
"loss": 0.3070305109024048,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11460,
|
||
|
|
"token_acc": 0.886489278720699,
|
||
|
|
"train_speed(iter/s)": 0.088748
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.927237604636188,
|
||
|
|
"grad_norm": 0.11103896051645279,
|
||
|
|
"learning_rate": 1.6977726314601806e-05,
|
||
|
|
"loss": 0.31273181438446046,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11480,
|
||
|
|
"token_acc": 0.8837625376784819,
|
||
|
|
"train_speed(iter/s)": 0.08875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.935823138012449,
|
||
|
|
"grad_norm": 0.09665607661008835,
|
||
|
|
"learning_rate": 1.6895489161600924e-05,
|
||
|
|
"loss": 0.30753934383392334,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11500,
|
||
|
|
"token_acc": 0.8802015271291028,
|
||
|
|
"train_speed(iter/s)": 0.088752
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.94440867138871,
|
||
|
|
"grad_norm": 0.0969487726688385,
|
||
|
|
"learning_rate": 1.6813349887539443e-05,
|
||
|
|
"loss": 0.3144726514816284,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11520,
|
||
|
|
"token_acc": 0.8802431565821507,
|
||
|
|
"train_speed(iter/s)": 0.088753
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.952994204764971,
|
||
|
|
"grad_norm": 0.09839560836553574,
|
||
|
|
"learning_rate": 1.67313094844211e-05,
|
||
|
|
"loss": 0.30981805324554446,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11540,
|
||
|
|
"token_acc": 0.8963515858448547,
|
||
|
|
"train_speed(iter/s)": 0.088755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.961579738141232,
|
||
|
|
"grad_norm": 0.10357420891523361,
|
||
|
|
"learning_rate": 1.664936894305554e-05,
|
||
|
|
"loss": 0.3088369846343994,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11560,
|
||
|
|
"token_acc": 0.8838528141659493,
|
||
|
|
"train_speed(iter/s)": 0.088757
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.970165271517493,
|
||
|
|
"grad_norm": 0.09701311588287354,
|
||
|
|
"learning_rate": 1.65675292530464e-05,
|
||
|
|
"loss": 0.31214241981506347,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11580,
|
||
|
|
"token_acc": 0.8797079209755736,
|
||
|
|
"train_speed(iter/s)": 0.088759
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.978750804893754,
|
||
|
|
"grad_norm": 0.09698698669672012,
|
||
|
|
"learning_rate": 1.648579140277931e-05,
|
||
|
|
"loss": 0.3103867292404175,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11600,
|
||
|
|
"token_acc": 0.874222062607426,
|
||
|
|
"train_speed(iter/s)": 0.088759
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.987336338270015,
|
||
|
|
"grad_norm": 0.09471474587917328,
|
||
|
|
"learning_rate": 1.640415637940996e-05,
|
||
|
|
"loss": 0.31050570011138917,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11620,
|
||
|
|
"token_acc": 0.891306756689066,
|
||
|
|
"train_speed(iter/s)": 0.088762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.995921871646276,
|
||
|
|
"grad_norm": 0.09426256269216537,
|
||
|
|
"learning_rate": 1.6322625168852217e-05,
|
||
|
|
"loss": 0.31265413761138916,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11640,
|
||
|
|
"token_acc": 0.8955581978003312,
|
||
|
|
"train_speed(iter/s)": 0.088764
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.00429276668813,
|
||
|
|
"grad_norm": 0.11454425007104874,
|
||
|
|
"learning_rate": 1.6241198755766175e-05,
|
||
|
|
"loss": 0.2891073703765869,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11660,
|
||
|
|
"token_acc": 0.8897775721320687,
|
||
|
|
"train_speed(iter/s)": 0.088768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.012878300064392,
|
||
|
|
"grad_norm": 0.10882110148668289,
|
||
|
|
"learning_rate": 1.6159878123546275e-05,
|
||
|
|
"loss": 0.2693314790725708,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11680,
|
||
|
|
"token_acc": 0.8982194210665359,
|
||
|
|
"train_speed(iter/s)": 0.088762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.021463833440652,
|
||
|
|
"grad_norm": 0.10319822281599045,
|
||
|
|
"learning_rate": 1.6078664254309436e-05,
|
||
|
|
"loss": 0.27081449031829835,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11700,
|
||
|
|
"token_acc": 0.9001353267268086,
|
||
|
|
"train_speed(iter/s)": 0.088765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.030049366816914,
|
||
|
|
"grad_norm": 0.11284149438142776,
|
||
|
|
"learning_rate": 1.59975581288832e-05,
|
||
|
|
"loss": 0.27441935539245604,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11720,
|
||
|
|
"token_acc": 0.8984662917082801,
|
||
|
|
"train_speed(iter/s)": 0.088767
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.038634900193174,
|
||
|
|
"grad_norm": 0.10425851494073868,
|
||
|
|
"learning_rate": 1.591656072679387e-05,
|
||
|
|
"loss": 0.2715555906295776,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11740,
|
||
|
|
"token_acc": 0.890478422247908,
|
||
|
|
"train_speed(iter/s)": 0.088768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.047220433569436,
|
||
|
|
"grad_norm": 0.11284064501523972,
|
||
|
|
"learning_rate": 1.583567302625469e-05,
|
||
|
|
"loss": 0.2725609540939331,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11760,
|
||
|
|
"token_acc": 0.897063681945232,
|
||
|
|
"train_speed(iter/s)": 0.088769
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.055805966945696,
|
||
|
|
"grad_norm": 0.11659186333417892,
|
||
|
|
"learning_rate": 1.5754896004154e-05,
|
||
|
|
"loss": 0.2763663291931152,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11780,
|
||
|
|
"token_acc": 0.8892781727292928,
|
||
|
|
"train_speed(iter/s)": 0.088771
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.064391500321958,
|
||
|
|
"grad_norm": 0.10959072411060333,
|
||
|
|
"learning_rate": 1.567423063604352e-05,
|
||
|
|
"loss": 0.27177045345306394,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11800,
|
||
|
|
"token_acc": 0.8928949946338838,
|
||
|
|
"train_speed(iter/s)": 0.088772
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.072977033698218,
|
||
|
|
"grad_norm": 0.11139950156211853,
|
||
|
|
"learning_rate": 1.5593677896126462e-05,
|
||
|
|
"loss": 0.2721517086029053,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11820,
|
||
|
|
"token_acc": 0.895870023109786,
|
||
|
|
"train_speed(iter/s)": 0.088773
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.08156256707448,
|
||
|
|
"grad_norm": 0.10203303396701813,
|
||
|
|
"learning_rate": 1.551323875724587e-05,
|
||
|
|
"loss": 0.27356884479522703,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11840,
|
||
|
|
"token_acc": 0.9059859374397118,
|
||
|
|
"train_speed(iter/s)": 0.088775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.09014810045074,
|
||
|
|
"grad_norm": 0.10516630858182907,
|
||
|
|
"learning_rate": 1.5432914190872757e-05,
|
||
|
|
"loss": 0.2754658222198486,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11860,
|
||
|
|
"token_acc": 0.8908944849786643,
|
||
|
|
"train_speed(iter/s)": 0.088776
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.098733633827002,
|
||
|
|
"grad_norm": 0.10858064144849777,
|
||
|
|
"learning_rate": 1.5352705167094477e-05,
|
||
|
|
"loss": 0.2734870672225952,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11880,
|
||
|
|
"token_acc": 0.8974409839317595,
|
||
|
|
"train_speed(iter/s)": 0.088777
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.107319167203262,
|
||
|
|
"grad_norm": 0.11306975036859512,
|
||
|
|
"learning_rate": 1.527261265460296e-05,
|
||
|
|
"loss": 0.27300803661346434,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11900,
|
||
|
|
"token_acc": 0.9109106165341432,
|
||
|
|
"train_speed(iter/s)": 0.08878
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.115904700579524,
|
||
|
|
"grad_norm": 0.10477675497531891,
|
||
|
|
"learning_rate": 1.5192637620682981e-05,
|
||
|
|
"loss": 0.2717351198196411,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11920,
|
||
|
|
"token_acc": 0.8888284413313953,
|
||
|
|
"train_speed(iter/s)": 0.088782
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.124490233955784,
|
||
|
|
"grad_norm": 0.11272590607404709,
|
||
|
|
"learning_rate": 1.5112781031200569e-05,
|
||
|
|
"loss": 0.2693598508834839,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11940,
|
||
|
|
"token_acc": 0.8937451291948688,
|
||
|
|
"train_speed(iter/s)": 0.088784
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.133075767332046,
|
||
|
|
"grad_norm": 0.1111418828368187,
|
||
|
|
"learning_rate": 1.5033043850591256e-05,
|
||
|
|
"loss": 0.2743582487106323,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11960,
|
||
|
|
"token_acc": 0.8922390332455552,
|
||
|
|
"train_speed(iter/s)": 0.088786
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.141661300708306,
|
||
|
|
"grad_norm": 0.10565278679132462,
|
||
|
|
"learning_rate": 1.4953427041848473e-05,
|
||
|
|
"loss": 0.2750978946685791,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 11980,
|
||
|
|
"token_acc": 0.8900847655801997,
|
||
|
|
"train_speed(iter/s)": 0.088787
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.150246834084568,
|
||
|
|
"grad_norm": 0.10609643161296844,
|
||
|
|
"learning_rate": 1.4873931566511901e-05,
|
||
|
|
"loss": 0.27565574645996094,
|
||
|
|
"memory(GiB)": 72.72,
|
||
|
|
"step": 12000,
|
||
|
|
"token_acc": 0.9002406831246359,
|
||
|
|
"train_speed(iter/s)": 0.088788
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.150246834084568,
|
||
|
|
"eval_loss": 0.47488316893577576,
|
||
|
|
"eval_runtime": 70.9516,
|
||
|
|
"eval_samples_per_second": 53.05,
|
||
|
|
"eval_steps_per_second": 0.677,
|
||
|
|
"eval_token_acc": 0.8365712662327689,
|
||
|
|
"step": 12000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 20,
|
||
|
|
"max_steps": 18640,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 8,
|
||
|
|
"save_steps": 1000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 6.567507230772429e+16,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|