4512 lines
117 KiB
JSON
4512 lines
117 KiB
JSON
{
|
|
"best_global_step": 5800,
|
|
"best_metric": 0.63199037,
|
|
"best_model_checkpoint": "/root/data/output/8B-SFT-Zero3/v0-20251202-213653/checkpoint-5800",
|
|
"epoch": 0.9633911368015414,
|
|
"eval_steps": 200,
|
|
"global_step": 6000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.00016056518946692356,
|
|
"grad_norm": 5.045594701496255,
|
|
"learning_rate": 4e-08,
|
|
"loss": 0.9851051568984985,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.0016056518946692357,
|
|
"grad_norm": 5.125985791135598,
|
|
"learning_rate": 4.0000000000000003e-07,
|
|
"loss": 0.98167970445421,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0032113037893384713,
|
|
"grad_norm": 4.641912421373809,
|
|
"learning_rate": 8.000000000000001e-07,
|
|
"loss": 0.9206424713134765,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.004816955684007707,
|
|
"grad_norm": 2.7941687987616666,
|
|
"learning_rate": 1.2000000000000002e-06,
|
|
"loss": 0.8770656585693359,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.006422607578676943,
|
|
"grad_norm": 2.353368758760998,
|
|
"learning_rate": 1.6000000000000001e-06,
|
|
"loss": 0.9011880874633789,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.00802825947334618,
|
|
"grad_norm": 2.239598951932816,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.7988590240478516,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.009633911368015413,
|
|
"grad_norm": 2.674482927730846,
|
|
"learning_rate": 2.4000000000000003e-06,
|
|
"loss": 0.8681858062744141,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.01123956326268465,
|
|
"grad_norm": 1.998306304918899,
|
|
"learning_rate": 2.8000000000000003e-06,
|
|
"loss": 0.7784456729888916,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.012845215157353885,
|
|
"grad_norm": 1.839932484836926,
|
|
"learning_rate": 3.2000000000000003e-06,
|
|
"loss": 0.7746688842773437,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.014450867052023121,
|
|
"grad_norm": 2.321512760194257,
|
|
"learning_rate": 3.6000000000000003e-06,
|
|
"loss": 0.7785279273986816,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.01605651894669236,
|
|
"grad_norm": 2.179078311231977,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.7911964893341065,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.017662170841361593,
|
|
"grad_norm": 2.1384126935619725,
|
|
"learning_rate": 4.4e-06,
|
|
"loss": 0.6902450561523438,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.019267822736030827,
|
|
"grad_norm": 2.5664033239134767,
|
|
"learning_rate": 4.800000000000001e-06,
|
|
"loss": 0.7887882709503173,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.020873474630700065,
|
|
"grad_norm": 1.8552462640900655,
|
|
"learning_rate": 5.2e-06,
|
|
"loss": 0.729155158996582,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.0224791265253693,
|
|
"grad_norm": 2.125884253152752,
|
|
"learning_rate": 5.600000000000001e-06,
|
|
"loss": 0.7969522953033448,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.024084778420038536,
|
|
"grad_norm": 2.1733323665476583,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.7695069313049316,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.02569043031470777,
|
|
"grad_norm": 1.9452374095451752,
|
|
"learning_rate": 6.4000000000000006e-06,
|
|
"loss": 0.7883370399475098,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.027296082209377008,
|
|
"grad_norm": 2.049305323206828,
|
|
"learning_rate": 6.800000000000001e-06,
|
|
"loss": 0.6908586502075196,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.028901734104046242,
|
|
"grad_norm": 1.9905937382445111,
|
|
"learning_rate": 7.2000000000000005e-06,
|
|
"loss": 0.7423882484436035,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.03050738599871548,
|
|
"grad_norm": 2.067670072267235,
|
|
"learning_rate": 7.600000000000001e-06,
|
|
"loss": 0.7752325057983398,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.03211303789338472,
|
|
"grad_norm": 1.7901830862145491,
|
|
"learning_rate": 8.000000000000001e-06,
|
|
"loss": 0.753516435623169,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.03211303789338472,
|
|
"eval_loss": 0.7076867818832397,
|
|
"eval_runtime": 99.7317,
|
|
"eval_samples_per_second": 20.284,
|
|
"eval_steps_per_second": 5.074,
|
|
"eval_token_acc": 0.7709792922559945,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.03371868978805395,
|
|
"grad_norm": 2.1021202279756883,
|
|
"learning_rate": 8.400000000000001e-06,
|
|
"loss": 0.7589893817901612,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.035324341682723186,
|
|
"grad_norm": 1.8535429586983847,
|
|
"learning_rate": 8.8e-06,
|
|
"loss": 0.7508197784423828,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.03692999357739242,
|
|
"grad_norm": 2.0946882996421876,
|
|
"learning_rate": 9.200000000000002e-06,
|
|
"loss": 0.7733804225921631,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.038535645472061654,
|
|
"grad_norm": 1.8769331743438782,
|
|
"learning_rate": 9.600000000000001e-06,
|
|
"loss": 0.8231748580932617,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.040141297366730895,
|
|
"grad_norm": 1.9051022771627888,
|
|
"learning_rate": 1e-05,
|
|
"loss": 0.8040414810180664,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.04174694926140013,
|
|
"grad_norm": 1.8256531327814847,
|
|
"learning_rate": 9.99993095584273e-06,
|
|
"loss": 0.7720019817352295,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.04335260115606936,
|
|
"grad_norm": 1.7654139413134322,
|
|
"learning_rate": 9.999723825277754e-06,
|
|
"loss": 0.7307815074920654,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.0449582530507386,
|
|
"grad_norm": 2.1357010540003203,
|
|
"learning_rate": 9.999378614025538e-06,
|
|
"loss": 0.7429679870605469,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.04656390494540784,
|
|
"grad_norm": 2.3921459236651454,
|
|
"learning_rate": 9.998895331620009e-06,
|
|
"loss": 0.7576918601989746,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.04816955684007707,
|
|
"grad_norm": 1.8796286801473356,
|
|
"learning_rate": 9.998273991408293e-06,
|
|
"loss": 0.7387456893920898,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.04977520873474631,
|
|
"grad_norm": 1.9563558062770663,
|
|
"learning_rate": 9.997514610550363e-06,
|
|
"loss": 0.841197395324707,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.05138086062941554,
|
|
"grad_norm": 1.9499020656946842,
|
|
"learning_rate": 9.996617210018536e-06,
|
|
"loss": 0.7673286437988281,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.052986512524084775,
|
|
"grad_norm": 2.0448234673688934,
|
|
"learning_rate": 9.995581814596923e-06,
|
|
"loss": 0.8508127212524415,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.054592164418754016,
|
|
"grad_norm": 1.8756180216303968,
|
|
"learning_rate": 9.99440845288072e-06,
|
|
"loss": 0.7628500938415528,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.05619781631342325,
|
|
"grad_norm": 2.041528608451536,
|
|
"learning_rate": 9.99309715727544e-06,
|
|
"loss": 0.7440057754516601,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.057803468208092484,
|
|
"grad_norm": 1.8006559726861797,
|
|
"learning_rate": 9.991647963996001e-06,
|
|
"loss": 0.7254949569702148,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.05940912010276172,
|
|
"grad_norm": 1.7855335690115415,
|
|
"learning_rate": 9.990060913065735e-06,
|
|
"loss": 0.7758376121520996,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.06101477199743096,
|
|
"grad_norm": 2.0412805758933388,
|
|
"learning_rate": 9.988336048315278e-06,
|
|
"loss": 0.7547982215881348,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.0626204238921002,
|
|
"grad_norm": 1.8153150010996724,
|
|
"learning_rate": 9.986473417381366e-06,
|
|
"loss": 0.75426025390625,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.06422607578676943,
|
|
"grad_norm": 1.8236902944999438,
|
|
"learning_rate": 9.984473071705512e-06,
|
|
"loss": 0.7518160343170166,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.06422607578676943,
|
|
"eval_loss": 0.7055317759513855,
|
|
"eval_runtime": 101.0735,
|
|
"eval_samples_per_second": 20.015,
|
|
"eval_steps_per_second": 5.006,
|
|
"eval_token_acc": 0.7710310157322793,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.06583172768143866,
|
|
"grad_norm": 1.981984502463804,
|
|
"learning_rate": 9.982335066532586e-06,
|
|
"loss": 0.724842882156372,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.0674373795761079,
|
|
"grad_norm": 2.00462113974283,
|
|
"learning_rate": 9.980059460909298e-06,
|
|
"loss": 0.7751227378845215,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.06904303147077713,
|
|
"grad_norm": 1.9164446754319746,
|
|
"learning_rate": 9.977646317682553e-06,
|
|
"loss": 0.7310826301574707,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.07064868336544637,
|
|
"grad_norm": 1.425780494666241,
|
|
"learning_rate": 9.975095703497727e-06,
|
|
"loss": 0.7348933696746827,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.07225433526011561,
|
|
"grad_norm": 1.7264282435567964,
|
|
"learning_rate": 9.972407688796827e-06,
|
|
"loss": 0.7918214321136474,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.07385998715478484,
|
|
"grad_norm": 1.7873690443121253,
|
|
"learning_rate": 9.969582347816534e-06,
|
|
"loss": 0.7410657405853271,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.07546563904945408,
|
|
"grad_norm": 1.916709071222249,
|
|
"learning_rate": 9.966619758586164e-06,
|
|
"loss": 0.7694530487060547,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.07707129094412331,
|
|
"grad_norm": 1.7832878680515183,
|
|
"learning_rate": 9.963520002925506e-06,
|
|
"loss": 0.747270917892456,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.07867694283879255,
|
|
"grad_norm": 1.999021678268013,
|
|
"learning_rate": 9.960283166442569e-06,
|
|
"loss": 0.7553834438323974,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.08028259473346179,
|
|
"grad_norm": 1.9891718192525643,
|
|
"learning_rate": 9.956909338531211e-06,
|
|
"loss": 0.7506345748901367,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.08188824662813102,
|
|
"grad_norm": 2.0785214210367076,
|
|
"learning_rate": 9.953398612368673e-06,
|
|
"loss": 0.7921933174133301,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.08349389852280026,
|
|
"grad_norm": 1.7387864788088594,
|
|
"learning_rate": 9.949751084913008e-06,
|
|
"loss": 0.8228381156921387,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.0850995504174695,
|
|
"grad_norm": 1.6647874896219872,
|
|
"learning_rate": 9.9459668569004e-06,
|
|
"loss": 0.7765993118286133,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.08670520231213873,
|
|
"grad_norm": 1.783168935050446,
|
|
"learning_rate": 9.942046032842381e-06,
|
|
"loss": 0.7280902862548828,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.08831085420680797,
|
|
"grad_norm": 1.7261682650429326,
|
|
"learning_rate": 9.937988721022948e-06,
|
|
"loss": 0.7468975067138672,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.0899165061014772,
|
|
"grad_norm": 1.98380263385464,
|
|
"learning_rate": 9.933795033495575e-06,
|
|
"loss": 0.7684798240661621,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.09152215799614644,
|
|
"grad_norm": 2.0917455900276916,
|
|
"learning_rate": 9.929465086080106e-06,
|
|
"loss": 0.7396535396575927,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.09312780989081568,
|
|
"grad_norm": 2.040496130596098,
|
|
"learning_rate": 9.924998998359571e-06,
|
|
"loss": 0.7308344841003418,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.0947334617854849,
|
|
"grad_norm": 1.5962826247391504,
|
|
"learning_rate": 9.920396893676875e-06,
|
|
"loss": 0.7247919082641602,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.09633911368015415,
|
|
"grad_norm": 2.287295954105862,
|
|
"learning_rate": 9.915658899131393e-06,
|
|
"loss": 0.7654624938964844,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.09633911368015415,
|
|
"eval_loss": 0.6988159418106079,
|
|
"eval_runtime": 100.6058,
|
|
"eval_samples_per_second": 20.108,
|
|
"eval_steps_per_second": 5.03,
|
|
"eval_token_acc": 0.7711052639482365,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.09794476557482337,
|
|
"grad_norm": 1.863642076813837,
|
|
"learning_rate": 9.910785145575464e-06,
|
|
"loss": 0.7458448886871338,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.09955041746949261,
|
|
"grad_norm": 1.8059006554878292,
|
|
"learning_rate": 9.905775767610767e-06,
|
|
"loss": 0.7553767204284668,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.10115606936416185,
|
|
"grad_norm": 1.8021678660460414,
|
|
"learning_rate": 9.900630903584616e-06,
|
|
"loss": 0.8086840629577636,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.10276172125883108,
|
|
"grad_norm": 1.86266598368208,
|
|
"learning_rate": 9.895350695586133e-06,
|
|
"loss": 0.7645243644714356,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.10436737315350032,
|
|
"grad_norm": 1.9757822930978346,
|
|
"learning_rate": 9.889935289442318e-06,
|
|
"loss": 0.7408963203430176,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.10597302504816955,
|
|
"grad_norm": 2.056921894533938,
|
|
"learning_rate": 9.884384834714038e-06,
|
|
"loss": 0.7500060081481934,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.10757867694283879,
|
|
"grad_norm": 1.6383333188595506,
|
|
"learning_rate": 9.878699484691876e-06,
|
|
"loss": 0.6806066989898681,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.10918432883750803,
|
|
"grad_norm": 2.0688309441664456,
|
|
"learning_rate": 9.872879396391915e-06,
|
|
"loss": 0.7850825309753418,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.11078998073217726,
|
|
"grad_norm": 2.0032148038902897,
|
|
"learning_rate": 9.866924730551391e-06,
|
|
"loss": 0.737500810623169,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.1123956326268465,
|
|
"grad_norm": 2.0476297529312664,
|
|
"learning_rate": 9.860835651624259e-06,
|
|
"loss": 0.7644984245300293,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.11400128452151574,
|
|
"grad_norm": 1.9631657998239447,
|
|
"learning_rate": 9.854612327776644e-06,
|
|
"loss": 0.7839177131652832,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.11560693641618497,
|
|
"grad_norm": 1.6570213251759018,
|
|
"learning_rate": 9.848254930882214e-06,
|
|
"loss": 0.753380823135376,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.11721258831085421,
|
|
"grad_norm": 2.164985354398442,
|
|
"learning_rate": 9.841763636517406e-06,
|
|
"loss": 0.8096098899841309,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.11881824020552344,
|
|
"grad_norm": 1.7563983933390181,
|
|
"learning_rate": 9.835138623956603e-06,
|
|
"loss": 0.7301701068878174,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.12042389210019268,
|
|
"grad_norm": 1.7499502476904967,
|
|
"learning_rate": 9.828380076167167e-06,
|
|
"loss": 0.7721126079559326,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.12202954399486192,
|
|
"grad_norm": 1.825643887765283,
|
|
"learning_rate": 9.821488179804394e-06,
|
|
"loss": 0.7608715057373047,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.12363519588953115,
|
|
"grad_norm": 2.0344498074032096,
|
|
"learning_rate": 9.814463125206356e-06,
|
|
"loss": 0.8143133163452149,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.1252408477842004,
|
|
"grad_norm": 1.8599734868121924,
|
|
"learning_rate": 9.80730510638864e-06,
|
|
"loss": 0.7370705127716064,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.12684649967886963,
|
|
"grad_norm": 2.401539645903579,
|
|
"learning_rate": 9.800014321038998e-06,
|
|
"loss": 0.7864031791687012,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.12845215157353887,
|
|
"grad_norm": 1.9156614624885597,
|
|
"learning_rate": 9.792590970511882e-06,
|
|
"loss": 0.7531948089599609,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.12845215157353887,
|
|
"eval_loss": 0.6976503133773804,
|
|
"eval_runtime": 100.9577,
|
|
"eval_samples_per_second": 20.038,
|
|
"eval_steps_per_second": 5.012,
|
|
"eval_token_acc": 0.7718560886039834,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.13005780346820808,
|
|
"grad_norm": 2.0518814451795673,
|
|
"learning_rate": 9.785035259822884e-06,
|
|
"loss": 0.7535972595214844,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.13166345536287732,
|
|
"grad_norm": 1.8413763571861954,
|
|
"learning_rate": 9.777347397643075e-06,
|
|
"loss": 0.7913750171661377,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.13326910725754657,
|
|
"grad_norm": 1.8730208265946418,
|
|
"learning_rate": 9.769527596293242e-06,
|
|
"loss": 0.7386277198791504,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.1348747591522158,
|
|
"grad_norm": 1.659408810434294,
|
|
"learning_rate": 9.761576071738023e-06,
|
|
"loss": 0.7074491500854492,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.13648041104688505,
|
|
"grad_norm": 1.8873087614580026,
|
|
"learning_rate": 9.753493043579942e-06,
|
|
"loss": 0.7388912200927734,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.13808606294155426,
|
|
"grad_norm": 1.6722890579957244,
|
|
"learning_rate": 9.745278735053345e-06,
|
|
"loss": 0.7493368148803711,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.1396917148362235,
|
|
"grad_norm": 1.6772342194345173,
|
|
"learning_rate": 9.736933373018236e-06,
|
|
"loss": 0.6926981449127197,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.14129736673089274,
|
|
"grad_norm": 1.7437024472242095,
|
|
"learning_rate": 9.728457187954013e-06,
|
|
"loss": 0.769747257232666,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.14290301862556198,
|
|
"grad_norm": 1.785712412031916,
|
|
"learning_rate": 9.719850413953095e-06,
|
|
"loss": 0.7651779174804687,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.14450867052023122,
|
|
"grad_norm": 1.7302496278883281,
|
|
"learning_rate": 9.711113288714466e-06,
|
|
"loss": 0.7735454559326171,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.14611432241490044,
|
|
"grad_norm": 1.8272087036997788,
|
|
"learning_rate": 9.702246053537108e-06,
|
|
"loss": 0.7874812126159668,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.14771997430956968,
|
|
"grad_norm": 1.804466644550254,
|
|
"learning_rate": 9.69324895331333e-06,
|
|
"loss": 0.7952823638916016,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.14932562620423892,
|
|
"grad_norm": 1.6692779796502772,
|
|
"learning_rate": 9.684122236522014e-06,
|
|
"loss": 0.760723876953125,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.15093127809890816,
|
|
"grad_norm": 2.279602144252399,
|
|
"learning_rate": 9.674866155221745e-06,
|
|
"loss": 0.75501708984375,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.1525369299935774,
|
|
"grad_norm": 2.2519054003919377,
|
|
"learning_rate": 9.665480965043862e-06,
|
|
"loss": 0.7569370269775391,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.15414258188824662,
|
|
"grad_norm": 1.639554654895838,
|
|
"learning_rate": 9.655966925185381e-06,
|
|
"loss": 0.7236834526062011,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.15574823378291586,
|
|
"grad_norm": 1.743926286057347,
|
|
"learning_rate": 9.646324298401849e-06,
|
|
"loss": 0.8129490852355957,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.1573538856775851,
|
|
"grad_norm": 2.053932657043873,
|
|
"learning_rate": 9.636553351000077e-06,
|
|
"loss": 0.8245757102966309,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.15895953757225434,
|
|
"grad_norm": 1.7670536328368698,
|
|
"learning_rate": 9.626654352830801e-06,
|
|
"loss": 0.7364852905273438,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.16056518946692358,
|
|
"grad_norm": 1.997570888195663,
|
|
"learning_rate": 9.616627577281217e-06,
|
|
"loss": 0.8088130950927734,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.16056518946692358,
|
|
"eval_loss": 0.6925203204154968,
|
|
"eval_runtime": 100.0123,
|
|
"eval_samples_per_second": 20.228,
|
|
"eval_steps_per_second": 5.059,
|
|
"eval_token_acc": 0.7732542909629075,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.1621708413615928,
|
|
"grad_norm": 2.303872077571802,
|
|
"learning_rate": 9.606473301267427e-06,
|
|
"loss": 0.7984984397888184,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.16377649325626203,
|
|
"grad_norm": 1.7949507187260336,
|
|
"learning_rate": 9.59619180522681e-06,
|
|
"loss": 0.7323333740234375,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.16538214515093128,
|
|
"grad_norm": 1.9305044307817225,
|
|
"learning_rate": 9.585783373110248e-06,
|
|
"loss": 0.767444372177124,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.16698779704560052,
|
|
"grad_norm": 1.940504218877435,
|
|
"learning_rate": 9.575248292374322e-06,
|
|
"loss": 0.7465566635131836,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.16859344894026976,
|
|
"grad_norm": 1.974154937597088,
|
|
"learning_rate": 9.564586853973332e-06,
|
|
"loss": 0.8003168106079102,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.170199100834939,
|
|
"grad_norm": 1.8080878076418148,
|
|
"learning_rate": 9.553799352351293e-06,
|
|
"loss": 0.7629599094390869,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.1718047527296082,
|
|
"grad_norm": 1.870487361356404,
|
|
"learning_rate": 9.54288608543379e-06,
|
|
"loss": 0.7689233779907226,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.17341040462427745,
|
|
"grad_norm": 1.8275349692113991,
|
|
"learning_rate": 9.531847354619745e-06,
|
|
"loss": 0.704521894454956,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.1750160565189467,
|
|
"grad_norm": 1.7015288397617692,
|
|
"learning_rate": 9.52068346477311e-06,
|
|
"loss": 0.7523484230041504,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.17662170841361594,
|
|
"grad_norm": 1.7290639256028824,
|
|
"learning_rate": 9.509394724214428e-06,
|
|
"loss": 0.7474178791046142,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.17822736030828518,
|
|
"grad_norm": 1.8497366081907418,
|
|
"learning_rate": 9.497981444712332e-06,
|
|
"loss": 0.7826375007629395,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.1798330122029544,
|
|
"grad_norm": 1.8815163506102677,
|
|
"learning_rate": 9.486443941474928e-06,
|
|
"loss": 0.7807172775268555,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.18143866409762363,
|
|
"grad_norm": 2.001875209196253,
|
|
"learning_rate": 9.47478253314109e-06,
|
|
"loss": 0.7732196807861328,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.18304431599229287,
|
|
"grad_norm": 1.7043428423094649,
|
|
"learning_rate": 9.462997541771664e-06,
|
|
"loss": 0.7617592811584473,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.1846499678869621,
|
|
"grad_norm": 1.7130988725260645,
|
|
"learning_rate": 9.451089292840569e-06,
|
|
"loss": 0.7022255420684814,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.18625561978163135,
|
|
"grad_norm": 1.7793067950964665,
|
|
"learning_rate": 9.439058115225808e-06,
|
|
"loss": 0.767824363708496,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.18786127167630057,
|
|
"grad_norm": 1.8447543701250206,
|
|
"learning_rate": 9.42690434120039e-06,
|
|
"loss": 0.7075161933898926,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.1894669235709698,
|
|
"grad_norm": 2.1540810301213305,
|
|
"learning_rate": 9.414628306423148e-06,
|
|
"loss": 0.7819767475128174,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.19107257546563905,
|
|
"grad_norm": 1.5815294669707292,
|
|
"learning_rate": 9.402230349929475e-06,
|
|
"loss": 0.7533804893493652,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.1926782273603083,
|
|
"grad_norm": 1.7999543693842177,
|
|
"learning_rate": 9.389710814121951e-06,
|
|
"loss": 0.7735315322875976,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.1926782273603083,
|
|
"eval_loss": 0.6868460774421692,
|
|
"eval_runtime": 100.3485,
|
|
"eval_samples_per_second": 20.16,
|
|
"eval_steps_per_second": 5.042,
|
|
"eval_token_acc": 0.77447312965407,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.19428387925497753,
|
|
"grad_norm": 1.9775051957951533,
|
|
"learning_rate": 9.377070044760899e-06,
|
|
"loss": 0.7410672187805176,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.19588953114964675,
|
|
"grad_norm": 1.8830123288540028,
|
|
"learning_rate": 9.364308390954823e-06,
|
|
"loss": 0.7257200241088867,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.197495183044316,
|
|
"grad_norm": 1.923119464740565,
|
|
"learning_rate": 9.351426205150778e-06,
|
|
"loss": 0.7510544300079346,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.19910083493898523,
|
|
"grad_norm": 1.884018657813029,
|
|
"learning_rate": 9.338423843124627e-06,
|
|
"loss": 0.7750972747802735,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.20070648683365447,
|
|
"grad_norm": 1.6459258264733365,
|
|
"learning_rate": 9.325301663971222e-06,
|
|
"loss": 0.7792768478393555,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.2023121387283237,
|
|
"grad_norm": 2.3093451268783265,
|
|
"learning_rate": 9.312060030094487e-06,
|
|
"loss": 0.7784916400909424,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.20391779062299292,
|
|
"grad_norm": 2.017607836003892,
|
|
"learning_rate": 9.298699307197398e-06,
|
|
"loss": 0.707589054107666,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.20552344251766216,
|
|
"grad_norm": 2.16235384945215,
|
|
"learning_rate": 9.2852198642719e-06,
|
|
"loss": 0.6854191780090332,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.2071290944123314,
|
|
"grad_norm": 1.7496835798170465,
|
|
"learning_rate": 9.271622073588699e-06,
|
|
"loss": 0.7528693675994873,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.20873474630700065,
|
|
"grad_norm": 1.6805965965166436,
|
|
"learning_rate": 9.257906310686999e-06,
|
|
"loss": 0.7310030937194825,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.2103403982016699,
|
|
"grad_norm": 1.7240696394841069,
|
|
"learning_rate": 9.244072954364116e-06,
|
|
"loss": 0.7570672035217285,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.2119460500963391,
|
|
"grad_norm": 1.696536924694626,
|
|
"learning_rate": 9.23012238666502e-06,
|
|
"loss": 0.7422749996185303,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.21355170199100834,
|
|
"grad_norm": 1.8201920770430506,
|
|
"learning_rate": 9.216054992871787e-06,
|
|
"loss": 0.7625796318054199,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.21515735388567758,
|
|
"grad_norm": 1.7512686774637143,
|
|
"learning_rate": 9.201871161492957e-06,
|
|
"loss": 0.7356629848480225,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.21676300578034682,
|
|
"grad_norm": 1.6301937120337884,
|
|
"learning_rate": 9.187571284252806e-06,
|
|
"loss": 0.728066349029541,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.21836865767501606,
|
|
"grad_norm": 1.8986829697456375,
|
|
"learning_rate": 9.17315575608052e-06,
|
|
"loss": 0.7538561820983887,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.2199743095696853,
|
|
"grad_norm": 1.9496897623998384,
|
|
"learning_rate": 9.158624975099299e-06,
|
|
"loss": 0.7733088493347168,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.22157996146435452,
|
|
"grad_norm": 1.733812640841246,
|
|
"learning_rate": 9.143979342615354e-06,
|
|
"loss": 0.7938045501708985,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.22318561335902376,
|
|
"grad_norm": 1.8703147098290802,
|
|
"learning_rate": 9.129219263106825e-06,
|
|
"loss": 0.7479697227478027,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.224791265253693,
|
|
"grad_norm": 1.7131842139401694,
|
|
"learning_rate": 9.11434514421261e-06,
|
|
"loss": 0.7115351676940918,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.224791265253693,
|
|
"eval_loss": 0.6884077787399292,
|
|
"eval_runtime": 101.5323,
|
|
"eval_samples_per_second": 19.925,
|
|
"eval_steps_per_second": 4.984,
|
|
"eval_token_acc": 0.774602438344782,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.22639691714836224,
|
|
"grad_norm": 1.581417818250803,
|
|
"learning_rate": 9.099357396721117e-06,
|
|
"loss": 0.6985572814941406,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.22800256904303148,
|
|
"grad_norm": 2.0581793655158918,
|
|
"learning_rate": 9.084256434558898e-06,
|
|
"loss": 0.7135414123535156,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.2296082209377007,
|
|
"grad_norm": 1.895110040798408,
|
|
"learning_rate": 9.069042674779238e-06,
|
|
"loss": 0.7391388893127442,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.23121387283236994,
|
|
"grad_norm": 1.8028664147595719,
|
|
"learning_rate": 9.053716537550627e-06,
|
|
"loss": 0.7372079372406006,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.23281952472703918,
|
|
"grad_norm": 1.5474434923515745,
|
|
"learning_rate": 9.038278446145155e-06,
|
|
"loss": 0.8027336120605468,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.23442517662170842,
|
|
"grad_norm": 1.6608147397642952,
|
|
"learning_rate": 9.022728826926825e-06,
|
|
"loss": 0.7410070419311523,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.23603082851637766,
|
|
"grad_norm": 2.0544380178655457,
|
|
"learning_rate": 9.007068109339783e-06,
|
|
"loss": 0.7138307094573975,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.23763648041104687,
|
|
"grad_norm": 1.9265320027175405,
|
|
"learning_rate": 8.991296725896449e-06,
|
|
"loss": 0.777537488937378,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.23924213230571612,
|
|
"grad_norm": 1.8444266047774585,
|
|
"learning_rate": 8.975415112165566e-06,
|
|
"loss": 0.7667001724243164,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.24084778420038536,
|
|
"grad_norm": 1.539393309355395,
|
|
"learning_rate": 8.959423706760197e-06,
|
|
"loss": 0.7157824039459229,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.2424534360950546,
|
|
"grad_norm": 1.9224139568424825,
|
|
"learning_rate": 8.943322951325583e-06,
|
|
"loss": 0.7478031158447266,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.24405908798972384,
|
|
"grad_norm": 1.739352000191543,
|
|
"learning_rate": 8.927113290526961e-06,
|
|
"loss": 0.7133482456207275,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.24566473988439305,
|
|
"grad_norm": 2.0313357914665,
|
|
"learning_rate": 8.910795172037278e-06,
|
|
"loss": 0.724417781829834,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.2472703917790623,
|
|
"grad_norm": 1.824835287137536,
|
|
"learning_rate": 8.894369046524829e-06,
|
|
"loss": 0.702476167678833,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.24887604367373153,
|
|
"grad_norm": 1.7069567462050508,
|
|
"learning_rate": 8.877835367640813e-06,
|
|
"loss": 0.7205674648284912,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.2504816955684008,
|
|
"grad_norm": 1.6643061973982913,
|
|
"learning_rate": 8.861194592006798e-06,
|
|
"loss": 0.751797866821289,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.25208734746307,
|
|
"grad_norm": 1.8641371048435398,
|
|
"learning_rate": 8.844447179202119e-06,
|
|
"loss": 0.7409004688262939,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.25369299935773926,
|
|
"grad_norm": 1.6197221278271672,
|
|
"learning_rate": 8.827593591751172e-06,
|
|
"loss": 0.7813194274902344,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.25529865125240847,
|
|
"grad_norm": 1.8401298655390084,
|
|
"learning_rate": 8.810634295110661e-06,
|
|
"loss": 0.724795913696289,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.25690430314707774,
|
|
"grad_norm": 1.7223908388688454,
|
|
"learning_rate": 8.793569757656718e-06,
|
|
"loss": 0.7601978302001953,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.25690430314707774,
|
|
"eval_loss": 0.6818346977233887,
|
|
"eval_runtime": 100.0754,
|
|
"eval_samples_per_second": 20.215,
|
|
"eval_steps_per_second": 5.056,
|
|
"eval_token_acc": 0.7759555912243614,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.25850995504174695,
|
|
"grad_norm": 2.051065991460817,
|
|
"learning_rate": 8.77640045067199e-06,
|
|
"loss": 0.7404222011566162,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.26011560693641617,
|
|
"grad_norm": 2.0856367857981732,
|
|
"learning_rate": 8.759126848332608e-06,
|
|
"loss": 0.7499489784240723,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.26172125883108543,
|
|
"grad_norm": 1.776943450899247,
|
|
"learning_rate": 8.7417494276951e-06,
|
|
"loss": 0.7666402816772461,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.26332691072575465,
|
|
"grad_norm": 1.8417418661119802,
|
|
"learning_rate": 8.724268668683207e-06,
|
|
"loss": 0.7746978282928467,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.2649325626204239,
|
|
"grad_norm": 1.7767374838825565,
|
|
"learning_rate": 8.706685054074644e-06,
|
|
"loss": 0.6751203536987305,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.26653821451509313,
|
|
"grad_norm": 1.8890767025158177,
|
|
"learning_rate": 8.688999069487749e-06,
|
|
"loss": 0.7576641082763672,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.26814386640976234,
|
|
"grad_norm": 1.6503160358499191,
|
|
"learning_rate": 8.671211203368083e-06,
|
|
"loss": 0.7491856575012207,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.2697495183044316,
|
|
"grad_norm": 1.7619578916338219,
|
|
"learning_rate": 8.653321946974939e-06,
|
|
"loss": 0.7653478145599365,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.2713551701991008,
|
|
"grad_norm": 1.6161841088316613,
|
|
"learning_rate": 8.635331794367766e-06,
|
|
"loss": 0.7169931411743165,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.2729608220937701,
|
|
"grad_norm": 1.862893489220715,
|
|
"learning_rate": 8.617241242392535e-06,
|
|
"loss": 0.709630298614502,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.2745664739884393,
|
|
"grad_norm": 2.0942702006851284,
|
|
"learning_rate": 8.599050790668016e-06,
|
|
"loss": 0.732765007019043,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.2761721258831085,
|
|
"grad_norm": 1.909209797536119,
|
|
"learning_rate": 8.580760941571968e-06,
|
|
"loss": 0.7709503173828125,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.2777777777777778,
|
|
"grad_norm": 1.8776515240203724,
|
|
"learning_rate": 8.56237220022728e-06,
|
|
"loss": 0.7700500965118409,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.279383429672447,
|
|
"grad_norm": 2.055122504726026,
|
|
"learning_rate": 8.543885074488012e-06,
|
|
"loss": 0.7499999523162841,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.2809890815671163,
|
|
"grad_norm": 1.8264371508475254,
|
|
"learning_rate": 8.525300074925371e-06,
|
|
"loss": 0.7511765003204346,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.2825947334617855,
|
|
"grad_norm": 1.9987659171231758,
|
|
"learning_rate": 8.50661771481361e-06,
|
|
"loss": 0.7588027954101563,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.2842003853564547,
|
|
"grad_norm": 1.8223934722762116,
|
|
"learning_rate": 8.48783851011585e-06,
|
|
"loss": 0.7207003116607666,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.28580603725112397,
|
|
"grad_norm": 1.8043617826293,
|
|
"learning_rate": 8.468962979469841e-06,
|
|
"loss": 0.7769267082214355,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.2874116891457932,
|
|
"grad_norm": 1.5780943217315104,
|
|
"learning_rate": 8.449991644173624e-06,
|
|
"loss": 0.7341270446777344,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.28901734104046245,
|
|
"grad_norm": 1.7203708079691964,
|
|
"learning_rate": 8.43092502817114e-06,
|
|
"loss": 0.7272850036621094,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.28901734104046245,
|
|
"eval_loss": 0.6783972978591919,
|
|
"eval_runtime": 100.8991,
|
|
"eval_samples_per_second": 20.05,
|
|
"eval_steps_per_second": 5.015,
|
|
"eval_token_acc": 0.7769258235295099,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.29062299293513166,
|
|
"grad_norm": 1.889553755438213,
|
|
"learning_rate": 8.411763658037764e-06,
|
|
"loss": 0.7512543201446533,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.2922286448298009,
|
|
"grad_norm": 1.7574412886035682,
|
|
"learning_rate": 8.392508062965758e-06,
|
|
"loss": 0.7729681968688965,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.29383429672447015,
|
|
"grad_norm": 1.5623367854652537,
|
|
"learning_rate": 8.373158774749654e-06,
|
|
"loss": 0.7248388767242432,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.29543994861913936,
|
|
"grad_norm": 1.641798516178298,
|
|
"learning_rate": 8.353716327771572e-06,
|
|
"loss": 0.7033072948455811,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.2970456005138086,
|
|
"grad_norm": 1.834462270850742,
|
|
"learning_rate": 8.33418125898646e-06,
|
|
"loss": 0.7425559997558594,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.29865125240847784,
|
|
"grad_norm": 1.8200586992341523,
|
|
"learning_rate": 8.314554107907262e-06,
|
|
"loss": 0.7474506855010986,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.30025690430314705,
|
|
"grad_norm": 1.9230698549313547,
|
|
"learning_rate": 8.294835416590019e-06,
|
|
"loss": 0.7160479068756104,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.3018625561978163,
|
|
"grad_norm": 1.8857497922783588,
|
|
"learning_rate": 8.275025729618902e-06,
|
|
"loss": 0.7675430297851562,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.30346820809248554,
|
|
"grad_norm": 1.7590784658550567,
|
|
"learning_rate": 8.255125594091169e-06,
|
|
"loss": 0.6951826095581055,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.3050738599871548,
|
|
"grad_norm": 2.070729187194605,
|
|
"learning_rate": 8.235135559602055e-06,
|
|
"loss": 0.6935332298278809,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.306679511881824,
|
|
"grad_norm": 1.9490809465048364,
|
|
"learning_rate": 8.21505617822959e-06,
|
|
"loss": 0.7839535713195801,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.30828516377649323,
|
|
"grad_norm": 1.9244151039820374,
|
|
"learning_rate": 8.194888004519365e-06,
|
|
"loss": 0.77308669090271,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.3098908156711625,
|
|
"grad_norm": 1.7027702709888077,
|
|
"learning_rate": 8.1746315954692e-06,
|
|
"loss": 0.7465029716491699,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.3114964675658317,
|
|
"grad_norm": 1.8566176904812088,
|
|
"learning_rate": 8.154287510513773e-06,
|
|
"loss": 0.7425686836242675,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.313102119460501,
|
|
"grad_norm": 1.6606391878700422,
|
|
"learning_rate": 8.133856311509165e-06,
|
|
"loss": 0.738848876953125,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.3147077713551702,
|
|
"grad_norm": 1.739030610713329,
|
|
"learning_rate": 8.113338562717341e-06,
|
|
"loss": 0.6945414543151855,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.3163134232498394,
|
|
"grad_norm": 1.7105926585715472,
|
|
"learning_rate": 8.092734830790575e-06,
|
|
"loss": 0.6944126129150391,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.3179190751445087,
|
|
"grad_norm": 2.0738901086385337,
|
|
"learning_rate": 8.072045684755783e-06,
|
|
"loss": 0.7403692245483399,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.3195247270391779,
|
|
"grad_norm": 1.8644608268454954,
|
|
"learning_rate": 8.051271695998832e-06,
|
|
"loss": 0.747812557220459,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.32113037893384716,
|
|
"grad_norm": 2.0091042407307476,
|
|
"learning_rate": 8.03041343824874e-06,
|
|
"loss": 0.7711987972259522,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.32113037893384716,
|
|
"eval_loss": 0.6775335669517517,
|
|
"eval_runtime": 99.8279,
|
|
"eval_samples_per_second": 20.265,
|
|
"eval_steps_per_second": 5.069,
|
|
"eval_token_acc": 0.7775732012326872,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.3227360308285164,
|
|
"grad_norm": 1.7263752186897543,
|
|
"learning_rate": 8.009471487561837e-06,
|
|
"loss": 0.7122695922851563,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.3243416827231856,
|
|
"grad_norm": 2.3515395561571615,
|
|
"learning_rate": 7.988446422305857e-06,
|
|
"loss": 0.7937726020812989,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.32594733461785486,
|
|
"grad_norm": 1.6016251213715271,
|
|
"learning_rate": 7.967338823143967e-06,
|
|
"loss": 0.7619183540344239,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.32755298651252407,
|
|
"grad_norm": 1.9134390634488312,
|
|
"learning_rate": 7.946149273018723e-06,
|
|
"loss": 0.7309248924255372,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.32915863840719334,
|
|
"grad_norm": 1.8217084821661345,
|
|
"learning_rate": 7.92487835713598e-06,
|
|
"loss": 0.7465410232543945,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.33076429030186255,
|
|
"grad_norm": 1.6321933700829891,
|
|
"learning_rate": 7.903526662948721e-06,
|
|
"loss": 0.7252727508544922,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.33236994219653176,
|
|
"grad_norm": 2.062142440094139,
|
|
"learning_rate": 7.882094780140838e-06,
|
|
"loss": 0.7518548965454102,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.33397559409120103,
|
|
"grad_norm": 1.8038702247848208,
|
|
"learning_rate": 7.860583300610849e-06,
|
|
"loss": 0.6832700252532959,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.33558124598587025,
|
|
"grad_norm": 1.6414105854167742,
|
|
"learning_rate": 7.838992818455542e-06,
|
|
"loss": 0.7141282081604003,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.3371868978805395,
|
|
"grad_norm": 1.918967202852482,
|
|
"learning_rate": 7.817323929953575e-06,
|
|
"loss": 0.7613588333129883,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.33879254977520873,
|
|
"grad_norm": 1.7884788315718811,
|
|
"learning_rate": 7.795577233549006e-06,
|
|
"loss": 0.7314887046813965,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.340398201669878,
|
|
"grad_norm": 1.7578597871238686,
|
|
"learning_rate": 7.773753329834767e-06,
|
|
"loss": 0.6956823348999024,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.3420038535645472,
|
|
"grad_norm": 1.7192989942916452,
|
|
"learning_rate": 7.751852821536073e-06,
|
|
"loss": 0.7687590599060059,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.3436095054592164,
|
|
"grad_norm": 1.897575905000496,
|
|
"learning_rate": 7.729876313493781e-06,
|
|
"loss": 0.7444051265716553,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.3452151573538857,
|
|
"grad_norm": 1.9737469804350343,
|
|
"learning_rate": 7.70782441264768e-06,
|
|
"loss": 0.7778015613555909,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.3468208092485549,
|
|
"grad_norm": 1.7624130678825956,
|
|
"learning_rate": 7.68569772801974e-06,
|
|
"loss": 0.7128086090087891,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.3484264611432242,
|
|
"grad_norm": 2.1270906968507917,
|
|
"learning_rate": 7.663496870697267e-06,
|
|
"loss": 0.7526941299438477,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.3500321130378934,
|
|
"grad_norm": 1.7446416063759722,
|
|
"learning_rate": 7.641222453816064e-06,
|
|
"loss": 0.7122650146484375,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.3516377649325626,
|
|
"grad_norm": 1.8945875657421267,
|
|
"learning_rate": 7.618875092543467e-06,
|
|
"loss": 0.727197265625,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.35324341682723187,
|
|
"grad_norm": 1.9106945656360363,
|
|
"learning_rate": 7.596455404061365e-06,
|
|
"loss": 0.7072073459625244,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.35324341682723187,
|
|
"eval_loss": 0.6732829213142395,
|
|
"eval_runtime": 100.1797,
|
|
"eval_samples_per_second": 20.194,
|
|
"eval_steps_per_second": 5.051,
|
|
"eval_token_acc": 0.778239766677067,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.3548490687219011,
|
|
"grad_norm": 1.8773845986707667,
|
|
"learning_rate": 7.5739640075491546e-06,
|
|
"loss": 0.7187977313995362,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.35645472061657035,
|
|
"grad_norm": 1.9889746803856923,
|
|
"learning_rate": 7.551401524166646e-06,
|
|
"loss": 0.720254373550415,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.35806037251123957,
|
|
"grad_norm": 1.6923801552458178,
|
|
"learning_rate": 7.5287685770369e-06,
|
|
"loss": 0.6862672805786133,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.3596660244059088,
|
|
"grad_norm": 1.9583643264209265,
|
|
"learning_rate": 7.506065791229018e-06,
|
|
"loss": 0.7031614780426025,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.36127167630057805,
|
|
"grad_norm": 1.9020847905208893,
|
|
"learning_rate": 7.48329379374089e-06,
|
|
"loss": 0.7478256702423096,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.36287732819524726,
|
|
"grad_norm": 2.1005274782078276,
|
|
"learning_rate": 7.460453213481862e-06,
|
|
"loss": 0.7472408294677735,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.36448298008991653,
|
|
"grad_norm": 1.8211786447513192,
|
|
"learning_rate": 7.437544681255383e-06,
|
|
"loss": 0.7363036632537842,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.36608863198458574,
|
|
"grad_norm": 1.7715962280794173,
|
|
"learning_rate": 7.414568829741572e-06,
|
|
"loss": 0.6726340293884278,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.36769428387925496,
|
|
"grad_norm": 1.8917230150289452,
|
|
"learning_rate": 7.3915262934797525e-06,
|
|
"loss": 0.7265505790710449,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.3692999357739242,
|
|
"grad_norm": 2.70306165575983,
|
|
"learning_rate": 7.368417708850923e-06,
|
|
"loss": 0.7766015529632568,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.37090558766859344,
|
|
"grad_norm": 1.9625809458137666,
|
|
"learning_rate": 7.3452437140601855e-06,
|
|
"loss": 0.7961873531341552,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.3725112395632627,
|
|
"grad_norm": 2.3354598106113844,
|
|
"learning_rate": 7.322004949119114e-06,
|
|
"loss": 0.7435796737670899,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.3741168914579319,
|
|
"grad_norm": 1.7052628525681701,
|
|
"learning_rate": 7.298702055828086e-06,
|
|
"loss": 0.7392897605895996,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.37572254335260113,
|
|
"grad_norm": 1.7874835493922498,
|
|
"learning_rate": 7.275335677758553e-06,
|
|
"loss": 0.7336819648742676,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.3773281952472704,
|
|
"grad_norm": 1.4173055115054254,
|
|
"learning_rate": 7.251906460235268e-06,
|
|
"loss": 0.6930481433868408,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.3789338471419396,
|
|
"grad_norm": 1.6721852496901906,
|
|
"learning_rate": 7.228415050318463e-06,
|
|
"loss": 0.7872378826141357,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.3805394990366089,
|
|
"grad_norm": 1.9633631265419778,
|
|
"learning_rate": 7.204862096785978e-06,
|
|
"loss": 0.7082842826843262,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.3821451509312781,
|
|
"grad_norm": 1.8911957209638606,
|
|
"learning_rate": 7.181248250115346e-06,
|
|
"loss": 0.7364721775054932,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.3837508028259473,
|
|
"grad_norm": 1.832997056599628,
|
|
"learning_rate": 7.1575741624658215e-06,
|
|
"loss": 0.7178568840026855,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.3853564547206166,
|
|
"grad_norm": 1.797539058974665,
|
|
"learning_rate": 7.1338404876603784e-06,
|
|
"loss": 0.7448820114135742,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.3853564547206166,
|
|
"eval_loss": 0.6699023246765137,
|
|
"eval_runtime": 101.0776,
|
|
"eval_samples_per_second": 20.014,
|
|
"eval_steps_per_second": 5.006,
|
|
"eval_token_acc": 0.7787461561948874,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.3869621066152858,
|
|
"grad_norm": 1.6601258550350135,
|
|
"learning_rate": 7.110047881167647e-06,
|
|
"loss": 0.7200119972229004,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.38856775850995506,
|
|
"grad_norm": 1.8606082672724402,
|
|
"learning_rate": 7.086197000083812e-06,
|
|
"loss": 0.7550562858581543,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.3901734104046243,
|
|
"grad_norm": 1.926057437824125,
|
|
"learning_rate": 7.0622885031144685e-06,
|
|
"loss": 0.7396778583526611,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.3917790622992935,
|
|
"grad_norm": 2.053918702058126,
|
|
"learning_rate": 7.038323050556426e-06,
|
|
"loss": 0.7883121490478515,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.39338471419396276,
|
|
"grad_norm": 1.793367615238582,
|
|
"learning_rate": 7.014301304279476e-06,
|
|
"loss": 0.7374165534973145,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.394990366088632,
|
|
"grad_norm": 2.202257715478466,
|
|
"learning_rate": 6.990223927708107e-06,
|
|
"loss": 0.7440276145935059,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.39659601798330124,
|
|
"grad_norm": 1.9298723179304524,
|
|
"learning_rate": 6.966091585803191e-06,
|
|
"loss": 0.7077909946441651,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.39820166987797045,
|
|
"grad_norm": 2.0285592686096656,
|
|
"learning_rate": 6.94190494504361e-06,
|
|
"loss": 0.7101529598236084,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.39980732177263967,
|
|
"grad_norm": 1.9995862014508614,
|
|
"learning_rate": 6.917664673407858e-06,
|
|
"loss": 0.7186386108398437,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.40141297366730894,
|
|
"grad_norm": 1.7943762448203864,
|
|
"learning_rate": 6.893371440355585e-06,
|
|
"loss": 0.716456413269043,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.40301862556197815,
|
|
"grad_norm": 1.8687538472669623,
|
|
"learning_rate": 6.8690259168091115e-06,
|
|
"loss": 0.7763209342956543,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.4046242774566474,
|
|
"grad_norm": 1.6838226140000359,
|
|
"learning_rate": 6.8446287751349e-06,
|
|
"loss": 0.711727523803711,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.40622992935131663,
|
|
"grad_norm": 1.6964778939304066,
|
|
"learning_rate": 6.820180689124984e-06,
|
|
"loss": 0.6991249084472656,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.40783558124598585,
|
|
"grad_norm": 1.7927684693226813,
|
|
"learning_rate": 6.795682333978365e-06,
|
|
"loss": 0.7027994155883789,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.4094412331406551,
|
|
"grad_norm": 2.013968267642278,
|
|
"learning_rate": 6.771134386282355e-06,
|
|
"loss": 0.7337832450866699,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.4110468850353243,
|
|
"grad_norm": 1.6812600519591554,
|
|
"learning_rate": 6.7465375239939e-06,
|
|
"loss": 0.7464197635650635,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.4126525369299936,
|
|
"grad_norm": 1.9037670571068173,
|
|
"learning_rate": 6.721892426420851e-06,
|
|
"loss": 0.6945043563842773,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.4142581888246628,
|
|
"grad_norm": 1.8058567638830525,
|
|
"learning_rate": 6.697199774203203e-06,
|
|
"loss": 0.6616389751434326,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.415863840719332,
|
|
"grad_norm": 1.924596069745487,
|
|
"learning_rate": 6.6724602492943035e-06,
|
|
"loss": 0.725306510925293,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.4174694926140013,
|
|
"grad_norm": 1.824557569415237,
|
|
"learning_rate": 6.64767453494201e-06,
|
|
"loss": 0.726342248916626,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.4174694926140013,
|
|
"eval_loss": 0.665752649307251,
|
|
"eval_runtime": 100.2911,
|
|
"eval_samples_per_second": 20.171,
|
|
"eval_steps_per_second": 5.045,
|
|
"eval_token_acc": 0.7796020963024388,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.4190751445086705,
|
|
"grad_norm": 2.124267190283648,
|
|
"learning_rate": 6.6228433156698295e-06,
|
|
"loss": 0.7493174552917481,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.4206807964033398,
|
|
"grad_norm": 1.7913149024168675,
|
|
"learning_rate": 6.597967277258003e-06,
|
|
"loss": 0.7273980617523194,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.422286448298009,
|
|
"grad_norm": 1.8889920586323337,
|
|
"learning_rate": 6.573047106724574e-06,
|
|
"loss": 0.7230459690093994,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.4238921001926782,
|
|
"grad_norm": 2.366710188379232,
|
|
"learning_rate": 6.548083492306413e-06,
|
|
"loss": 0.7138511657714843,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.42549775208734747,
|
|
"grad_norm": 1.6217613788371101,
|
|
"learning_rate": 6.523077123440207e-06,
|
|
"loss": 0.7306194305419922,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.4271034039820167,
|
|
"grad_norm": 1.9317008242725322,
|
|
"learning_rate": 6.498028690743422e-06,
|
|
"loss": 0.7434864997863769,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.42870905587668595,
|
|
"grad_norm": 1.9815053943156722,
|
|
"learning_rate": 6.472938885995229e-06,
|
|
"loss": 0.6782712936401367,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.43031470777135516,
|
|
"grad_norm": 1.7541812536369152,
|
|
"learning_rate": 6.447808402117399e-06,
|
|
"loss": 0.680427074432373,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.4319203596660244,
|
|
"grad_norm": 1.7689207888279326,
|
|
"learning_rate": 6.4226379331551625e-06,
|
|
"loss": 0.7533660888671875,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.43352601156069365,
|
|
"grad_norm": 1.8560483228499987,
|
|
"learning_rate": 6.397428174258048e-06,
|
|
"loss": 0.6594399452209473,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.43513166345536286,
|
|
"grad_norm": 2.0233467391422835,
|
|
"learning_rate": 6.372179821660678e-06,
|
|
"loss": 0.7649170875549316,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.43673731535003213,
|
|
"grad_norm": 2.030975023954992,
|
|
"learning_rate": 6.346893572663544e-06,
|
|
"loss": 0.7382317543029785,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.43834296724470134,
|
|
"grad_norm": 1.9123338195053914,
|
|
"learning_rate": 6.321570125613744e-06,
|
|
"loss": 0.6933588981628418,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.4399486191393706,
|
|
"grad_norm": 2.0971123140916306,
|
|
"learning_rate": 6.296210179885708e-06,
|
|
"loss": 0.7823019981384277,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.4415542710340398,
|
|
"grad_norm": 1.5531867347941812,
|
|
"learning_rate": 6.270814435861864e-06,
|
|
"loss": 0.7157551765441894,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.44315992292870904,
|
|
"grad_norm": 1.5844075638735038,
|
|
"learning_rate": 6.245383594913312e-06,
|
|
"loss": 0.7315412521362304,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.4447655748233783,
|
|
"grad_norm": 1.8782313508796873,
|
|
"learning_rate": 6.219918359380444e-06,
|
|
"loss": 0.7463915348052979,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.4463712267180475,
|
|
"grad_norm": 1.6919689969132032,
|
|
"learning_rate": 6.19441943255355e-06,
|
|
"loss": 0.6843451499938965,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.4479768786127168,
|
|
"grad_norm": 1.7577177879387673,
|
|
"learning_rate": 6.1688875186533955e-06,
|
|
"loss": 0.7416125774383545,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.449582530507386,
|
|
"grad_norm": 1.8152525178335799,
|
|
"learning_rate": 6.143323322811776e-06,
|
|
"loss": 0.7234923362731933,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.449582530507386,
|
|
"eval_loss": 0.6638072729110718,
|
|
"eval_runtime": 99.5953,
|
|
"eval_samples_per_second": 20.312,
|
|
"eval_steps_per_second": 5.081,
|
|
"eval_token_acc": 0.7802753357437585,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.4511881824020552,
|
|
"grad_norm": 1.8407066956007139,
|
|
"learning_rate": 6.11772755105203e-06,
|
|
"loss": 0.7649462699890137,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.4527938342967245,
|
|
"grad_norm": 2.025366690651478,
|
|
"learning_rate": 6.092100910269556e-06,
|
|
"loss": 0.7118297576904297,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.4543994861913937,
|
|
"grad_norm": 1.882535993483909,
|
|
"learning_rate": 6.06644410821228e-06,
|
|
"loss": 0.7083555221557617,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.45600513808606297,
|
|
"grad_norm": 1.9132786155674317,
|
|
"learning_rate": 6.040757853461113e-06,
|
|
"loss": 0.7021913528442383,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.4576107899807322,
|
|
"grad_norm": 1.9130646960075248,
|
|
"learning_rate": 6.015042855410379e-06,
|
|
"loss": 0.7390027046203613,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.4592164418754014,
|
|
"grad_norm": 1.5813838739698574,
|
|
"learning_rate": 5.989299824248227e-06,
|
|
"loss": 0.6849452972412109,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.46082209377007066,
|
|
"grad_norm": 1.9649959319170844,
|
|
"learning_rate": 5.963529470937015e-06,
|
|
"loss": 0.7207717895507812,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.4624277456647399,
|
|
"grad_norm": 1.671436841803361,
|
|
"learning_rate": 5.937732507193671e-06,
|
|
"loss": 0.7367728233337403,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.46403339755940914,
|
|
"grad_norm": 1.5219254190640068,
|
|
"learning_rate": 5.911909645470045e-06,
|
|
"loss": 0.7643050193786621,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.46563904945407836,
|
|
"grad_norm": 1.6651969418471841,
|
|
"learning_rate": 5.886061598933228e-06,
|
|
"loss": 0.7690213203430176,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.46724470134874757,
|
|
"grad_norm": 2.035256573817491,
|
|
"learning_rate": 5.860189081445854e-06,
|
|
"loss": 0.751746940612793,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.46885035324341684,
|
|
"grad_norm": 1.6085243690709197,
|
|
"learning_rate": 5.834292807546392e-06,
|
|
"loss": 0.746894359588623,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.47045600513808605,
|
|
"grad_norm": 1.7904211777209753,
|
|
"learning_rate": 5.808373492429405e-06,
|
|
"loss": 0.7455621719360351,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.4720616570327553,
|
|
"grad_norm": 2.1015882721546486,
|
|
"learning_rate": 5.782431851925801e-06,
|
|
"loss": 0.710727596282959,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.47366730892742454,
|
|
"grad_norm": 1.8851211128040317,
|
|
"learning_rate": 5.75646860248306e-06,
|
|
"loss": 0.7258530616760254,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.47527296082209375,
|
|
"grad_norm": 1.7596039756078887,
|
|
"learning_rate": 5.730484461145455e-06,
|
|
"loss": 0.7183322429656982,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.476878612716763,
|
|
"grad_norm": 1.7585093428217082,
|
|
"learning_rate": 5.704480145534243e-06,
|
|
"loss": 0.6958836078643799,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.47848426461143223,
|
|
"grad_norm": 1.7759136501368777,
|
|
"learning_rate": 5.678456373827843e-06,
|
|
"loss": 0.6852969169616699,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.4800899165061015,
|
|
"grad_norm": 1.7327874227415498,
|
|
"learning_rate": 5.652413864742016e-06,
|
|
"loss": 0.7584549427032471,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.4816955684007707,
|
|
"grad_norm": 1.8579814597845732,
|
|
"learning_rate": 5.626353337509994e-06,
|
|
"loss": 0.7292413711547852,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.4816955684007707,
|
|
"eval_loss": 0.6603939533233643,
|
|
"eval_runtime": 99.8487,
|
|
"eval_samples_per_second": 20.261,
|
|
"eval_steps_per_second": 5.068,
|
|
"eval_token_acc": 0.7815250416707684,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.4833012202954399,
|
|
"grad_norm": 1.6620639526626975,
|
|
"learning_rate": 5.600275511862636e-06,
|
|
"loss": 0.6874475479125977,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.4849068721901092,
|
|
"grad_norm": 1.863548208870452,
|
|
"learning_rate": 5.574181108008539e-06,
|
|
"loss": 0.7283189296722412,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.4865125240847784,
|
|
"grad_norm": 1.7019535500961747,
|
|
"learning_rate": 5.548070846614153e-06,
|
|
"loss": 0.7144774436950684,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.4881181759794477,
|
|
"grad_norm": 1.8180271114688507,
|
|
"learning_rate": 5.521945448783874e-06,
|
|
"loss": 0.7144730091094971,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.4897238278741169,
|
|
"grad_norm": 1.6469354529009128,
|
|
"learning_rate": 5.495805636040135e-06,
|
|
"loss": 0.702541732788086,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.4913294797687861,
|
|
"grad_norm": 1.4253159494568708,
|
|
"learning_rate": 5.469652130303471e-06,
|
|
"loss": 0.7350476264953614,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.4929351316634554,
|
|
"grad_norm": 1.6761584639579492,
|
|
"learning_rate": 5.443485653872589e-06,
|
|
"loss": 0.7056559085845947,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.4945407835581246,
|
|
"grad_norm": 2.091157632155003,
|
|
"learning_rate": 5.417306929404413e-06,
|
|
"loss": 0.7697329044342041,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.49614643545279385,
|
|
"grad_norm": 1.8425062990475631,
|
|
"learning_rate": 5.391116679894131e-06,
|
|
"loss": 0.7427218914031982,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.49775208734746307,
|
|
"grad_norm": 1.7957934599197594,
|
|
"learning_rate": 5.364915628655227e-06,
|
|
"loss": 0.6918140888214112,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.4993577392421323,
|
|
"grad_norm": 1.9779972048893961,
|
|
"learning_rate": 5.3387044992995e-06,
|
|
"loss": 0.7020491123199463,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.5009633911368016,
|
|
"grad_norm": 1.9981792268408507,
|
|
"learning_rate": 5.312484015717087e-06,
|
|
"loss": 0.7167632102966308,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.5025690430314708,
|
|
"grad_norm": 1.8909911302926332,
|
|
"learning_rate": 5.286254902056462e-06,
|
|
"loss": 0.7625884532928466,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.50417469492614,
|
|
"grad_norm": 1.61762455157073,
|
|
"learning_rate": 5.2600178827044476e-06,
|
|
"loss": 0.6554212093353271,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.5057803468208093,
|
|
"grad_norm": 1.9936400724678622,
|
|
"learning_rate": 5.233773682266198e-06,
|
|
"loss": 0.7177192211151123,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.5073859987154785,
|
|
"grad_norm": 2.0016810272501364,
|
|
"learning_rate": 5.2075230255451924e-06,
|
|
"loss": 0.6999452114105225,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.5089916506101477,
|
|
"grad_norm": 1.4929333492848669,
|
|
"learning_rate": 5.181266637523225e-06,
|
|
"loss": 0.7226743698120117,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.5105973025048169,
|
|
"grad_norm": 1.7985727217161158,
|
|
"learning_rate": 5.155005243340364e-06,
|
|
"loss": 0.6844250679016113,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.5122029543994862,
|
|
"grad_norm": 1.732864365144089,
|
|
"learning_rate": 5.1287395682749444e-06,
|
|
"loss": 0.708368968963623,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.5138086062941555,
|
|
"grad_norm": 1.6036953471683275,
|
|
"learning_rate": 5.102470337723524e-06,
|
|
"loss": 0.7594008445739746,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.5138086062941555,
|
|
"eval_loss": 0.6590485572814941,
|
|
"eval_runtime": 99.4458,
|
|
"eval_samples_per_second": 20.343,
|
|
"eval_steps_per_second": 5.088,
|
|
"eval_token_acc": 0.7822341538456404,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.5154142581888247,
|
|
"grad_norm": 1.9434983597979074,
|
|
"learning_rate": 5.0761982771808595e-06,
|
|
"loss": 0.690369701385498,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.5170199100834939,
|
|
"grad_norm": 1.6748286006153636,
|
|
"learning_rate": 5.049924112219859e-06,
|
|
"loss": 0.744829797744751,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.5186255619781631,
|
|
"grad_norm": 1.5457805731661085,
|
|
"learning_rate": 5.023648568471559e-06,
|
|
"loss": 0.6961235046386719,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.5202312138728323,
|
|
"grad_norm": 1.6660310158387102,
|
|
"learning_rate": 4.997372371605066e-06,
|
|
"loss": 0.7534513473510742,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.5218368657675017,
|
|
"grad_norm": 1.7718731852467215,
|
|
"learning_rate": 4.971096247307528e-06,
|
|
"loss": 0.7069045066833496,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.5234425176621709,
|
|
"grad_norm": 1.6283766654055158,
|
|
"learning_rate": 4.944820921264089e-06,
|
|
"loss": 0.6802540302276612,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.5250481695568401,
|
|
"grad_norm": 2.2262011526125107,
|
|
"learning_rate": 4.918547119137846e-06,
|
|
"loss": 0.7522598266601562,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.5266538214515093,
|
|
"grad_norm": 1.74540928610349,
|
|
"learning_rate": 4.89227556654981e-06,
|
|
"loss": 0.7161440372467041,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.5282594733461785,
|
|
"grad_norm": 1.4513773030560617,
|
|
"learning_rate": 4.866006989058862e-06,
|
|
"loss": 0.6914768218994141,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.5298651252408478,
|
|
"grad_norm": 1.7847841689336374,
|
|
"learning_rate": 4.839742112141725e-06,
|
|
"loss": 0.6942119598388672,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.531470777135517,
|
|
"grad_norm": 1.7175053440669847,
|
|
"learning_rate": 4.813481661172912e-06,
|
|
"loss": 0.6722054481506348,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.5330764290301863,
|
|
"grad_norm": 1.7006060538832268,
|
|
"learning_rate": 4.787226361404706e-06,
|
|
"loss": 0.7729241371154785,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.5346820809248555,
|
|
"grad_norm": 1.9551619500224633,
|
|
"learning_rate": 4.760976937947128e-06,
|
|
"loss": 0.7557572364807129,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.5362877328195247,
|
|
"grad_norm": 1.9346894609017689,
|
|
"learning_rate": 4.7347341157479055e-06,
|
|
"loss": 0.7043597221374511,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.537893384714194,
|
|
"grad_norm": 2.0892624262979878,
|
|
"learning_rate": 4.708498619572455e-06,
|
|
"loss": 0.6981153011322021,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.5394990366088632,
|
|
"grad_norm": 1.8011714452824432,
|
|
"learning_rate": 4.682271173983865e-06,
|
|
"loss": 0.6955915451049804,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.5411046885035324,
|
|
"grad_norm": 1.8385763388931964,
|
|
"learning_rate": 4.6560525033228885e-06,
|
|
"loss": 0.7396990776062011,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.5427103403982017,
|
|
"grad_norm": 1.9241555680433726,
|
|
"learning_rate": 4.629843331687935e-06,
|
|
"loss": 0.7320691108703613,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.5443159922928709,
|
|
"grad_norm": 1.8609663841971344,
|
|
"learning_rate": 4.603644382915069e-06,
|
|
"loss": 0.7013369560241699,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.5459216441875402,
|
|
"grad_norm": 1.5651123605862947,
|
|
"learning_rate": 4.577456380558028e-06,
|
|
"loss": 0.7569509506225586,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.5459216441875402,
|
|
"eval_loss": 0.6562788486480713,
|
|
"eval_runtime": 101.0782,
|
|
"eval_samples_per_second": 20.014,
|
|
"eval_steps_per_second": 5.006,
|
|
"eval_token_acc": 0.7831568339225916,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.5475272960822094,
|
|
"grad_norm": 1.5797728712399979,
|
|
"learning_rate": 4.551280047868233e-06,
|
|
"loss": 0.703862476348877,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.5491329479768786,
|
|
"grad_norm": 2.104046311869008,
|
|
"learning_rate": 4.525116107774815e-06,
|
|
"loss": 0.6947150230407715,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.5507385998715478,
|
|
"grad_norm": 1.9353364330514562,
|
|
"learning_rate": 4.498965282864654e-06,
|
|
"loss": 0.7750646591186523,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.552344251766217,
|
|
"grad_norm": 1.6516572356278183,
|
|
"learning_rate": 4.472828295362417e-06,
|
|
"loss": 0.7100794315338135,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.5539499036608864,
|
|
"grad_norm": 1.8878923547505169,
|
|
"learning_rate": 4.446705867110613e-06,
|
|
"loss": 0.7117058753967285,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.5555555555555556,
|
|
"grad_norm": 1.7084225805175084,
|
|
"learning_rate": 4.420598719549661e-06,
|
|
"loss": 0.6922745704650879,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.5571612074502248,
|
|
"grad_norm": 1.8216538078021467,
|
|
"learning_rate": 4.39450757369796e-06,
|
|
"loss": 0.6797658920288085,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.558766859344894,
|
|
"grad_norm": 1.6982254987436856,
|
|
"learning_rate": 4.368433150131983e-06,
|
|
"loss": 0.6903456211090088,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.5603725112395632,
|
|
"grad_norm": 1.8338849318375383,
|
|
"learning_rate": 4.342376168966368e-06,
|
|
"loss": 0.6925700187683106,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.5619781631342325,
|
|
"grad_norm": 1.6205656689270516,
|
|
"learning_rate": 4.316337349834041e-06,
|
|
"loss": 0.6894615650177002,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.5635838150289018,
|
|
"grad_norm": 1.8627217085763261,
|
|
"learning_rate": 4.290317411866329e-06,
|
|
"loss": 0.7587858200073242,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.565189466923571,
|
|
"grad_norm": 1.7262427202144772,
|
|
"learning_rate": 4.264317073673108e-06,
|
|
"loss": 0.6634279727935791,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.5667951188182402,
|
|
"grad_norm": 1.7942547624076908,
|
|
"learning_rate": 4.238337053322954e-06,
|
|
"loss": 0.6912859916687012,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.5684007707129094,
|
|
"grad_norm": 1.9726052068008915,
|
|
"learning_rate": 4.212378068323312e-06,
|
|
"loss": 0.7470963001251221,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.5700064226075787,
|
|
"grad_norm": 1.8122951442259065,
|
|
"learning_rate": 4.186440835600677e-06,
|
|
"loss": 0.708991813659668,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.5716120745022479,
|
|
"grad_norm": 1.9588586663070653,
|
|
"learning_rate": 4.1605260714808e-06,
|
|
"loss": 0.7249481201171875,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.5732177263969171,
|
|
"grad_norm": 1.810489889692105,
|
|
"learning_rate": 4.134634491668903e-06,
|
|
"loss": 0.680173110961914,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.5748233782915864,
|
|
"grad_norm": 1.6521777772159827,
|
|
"learning_rate": 4.108766811229906e-06,
|
|
"loss": 0.7033731937408447,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.5764290301862556,
|
|
"grad_norm": 1.7201532835806859,
|
|
"learning_rate": 4.0829237445686895e-06,
|
|
"loss": 0.7144006729125977,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.5780346820809249,
|
|
"grad_norm": 1.8473455193169876,
|
|
"learning_rate": 4.057106005410356e-06,
|
|
"loss": 0.7390274047851563,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.5780346820809249,
|
|
"eval_loss": 0.6529011726379395,
|
|
"eval_runtime": 100.1569,
|
|
"eval_samples_per_second": 20.198,
|
|
"eval_steps_per_second": 5.052,
|
|
"eval_token_acc": 0.7836457041984446,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.5796403339755941,
|
|
"grad_norm": 1.6568429454392342,
|
|
"learning_rate": 4.0313143067805255e-06,
|
|
"loss": 0.7353650093078613,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.5812459858702633,
|
|
"grad_norm": 1.7711489931478204,
|
|
"learning_rate": 4.005549360985633e-06,
|
|
"loss": 0.652749490737915,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.5828516377649325,
|
|
"grad_norm": 1.757927815054455,
|
|
"learning_rate": 3.979811879593269e-06,
|
|
"loss": 0.7223974227905273,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.5844572896596018,
|
|
"grad_norm": 2.066360457008701,
|
|
"learning_rate": 3.954102573412517e-06,
|
|
"loss": 0.7130667209625244,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.5860629415542711,
|
|
"grad_norm": 1.9898510462446428,
|
|
"learning_rate": 3.9284221524743285e-06,
|
|
"loss": 0.6991080284118653,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.5876685934489403,
|
|
"grad_norm": 1.7615438213992136,
|
|
"learning_rate": 3.902771326011914e-06,
|
|
"loss": 0.6893427848815918,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.5892742453436095,
|
|
"grad_norm": 1.982320389173454,
|
|
"learning_rate": 3.877150802441151e-06,
|
|
"loss": 0.7484220027923584,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.5908798972382787,
|
|
"grad_norm": 1.741693118016323,
|
|
"learning_rate": 3.851561289341023e-06,
|
|
"loss": 0.6734917640686036,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.5924855491329479,
|
|
"grad_norm": 1.7966100597081194,
|
|
"learning_rate": 3.8260034934340774e-06,
|
|
"loss": 0.6977890014648438,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.5940912010276173,
|
|
"grad_norm": 1.8214854618786471,
|
|
"learning_rate": 3.800478120566906e-06,
|
|
"loss": 0.6718869209289551,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.5956968529222865,
|
|
"grad_norm": 1.936538410584516,
|
|
"learning_rate": 3.7749858756906516e-06,
|
|
"loss": 0.7291570663452148,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.5973025048169557,
|
|
"grad_norm": 1.9259131709896102,
|
|
"learning_rate": 3.749527462841539e-06,
|
|
"loss": 0.7349955558776855,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.5989081567116249,
|
|
"grad_norm": 1.7114063191334197,
|
|
"learning_rate": 3.724103585121436e-06,
|
|
"loss": 0.6840395927429199,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.6005138086062941,
|
|
"grad_norm": 1.8004891247262185,
|
|
"learning_rate": 3.698714944678424e-06,
|
|
"loss": 0.7046355724334716,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.6021194605009634,
|
|
"grad_norm": 1.9635884438112328,
|
|
"learning_rate": 3.6733622426874184e-06,
|
|
"loss": 0.7282588005065918,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.6037251123956326,
|
|
"grad_norm": 1.8288920625641636,
|
|
"learning_rate": 3.648046179330796e-06,
|
|
"loss": 0.6897663116455078,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.6053307642903019,
|
|
"grad_norm": 1.956581186568473,
|
|
"learning_rate": 3.62276745377906e-06,
|
|
"loss": 0.7430720806121827,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.6069364161849711,
|
|
"grad_norm": 1.905669012280911,
|
|
"learning_rate": 3.597526764171532e-06,
|
|
"loss": 0.763132905960083,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.6085420680796403,
|
|
"grad_norm": 1.692613025987985,
|
|
"learning_rate": 3.5723248075970684e-06,
|
|
"loss": 0.7154220104217529,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.6101477199743096,
|
|
"grad_norm": 1.6188001979454867,
|
|
"learning_rate": 3.547162280074813e-06,
|
|
"loss": 0.6731904983520508,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.6101477199743096,
|
|
"eval_loss": 0.6499109268188477,
|
|
"eval_runtime": 101.607,
|
|
"eval_samples_per_second": 19.91,
|
|
"eval_steps_per_second": 4.98,
|
|
"eval_token_acc": 0.7846401297425005,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.6117533718689788,
|
|
"grad_norm": 1.6466933977964522,
|
|
"learning_rate": 3.5220398765349662e-06,
|
|
"loss": 0.6557781219482421,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.613359023763648,
|
|
"grad_norm": 1.650956939754262,
|
|
"learning_rate": 3.4969582907996015e-06,
|
|
"loss": 0.7916486740112305,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.6149646756583173,
|
|
"grad_norm": 1.9499600064872187,
|
|
"learning_rate": 3.471918215563499e-06,
|
|
"loss": 0.7331917762756348,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.6165703275529865,
|
|
"grad_norm": 1.7488554020922835,
|
|
"learning_rate": 3.4469203423750152e-06,
|
|
"loss": 0.7139426708221436,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.6181759794476558,
|
|
"grad_norm": 1.9481794866010824,
|
|
"learning_rate": 3.421965361616985e-06,
|
|
"loss": 0.7002578258514405,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.619781631342325,
|
|
"grad_norm": 1.8164493133989255,
|
|
"learning_rate": 3.3970539624876565e-06,
|
|
"loss": 0.7944469451904297,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.6213872832369942,
|
|
"grad_norm": 1.7973574254384317,
|
|
"learning_rate": 3.372186832981652e-06,
|
|
"loss": 0.7096256256103516,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.6229929351316634,
|
|
"grad_norm": 1.8905305819711578,
|
|
"learning_rate": 3.3473646598709724e-06,
|
|
"loss": 0.7385225772857666,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.6245985870263326,
|
|
"grad_norm": 1.9720483336100236,
|
|
"learning_rate": 3.322588128686027e-06,
|
|
"loss": 0.631558609008789,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.626204238921002,
|
|
"grad_norm": 1.9199256983971729,
|
|
"learning_rate": 3.297857923696702e-06,
|
|
"loss": 0.7101581573486329,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.6278098908156712,
|
|
"grad_norm": 1.7281699563667379,
|
|
"learning_rate": 3.273174727893463e-06,
|
|
"loss": 0.6946653366088867,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.6294155427103404,
|
|
"grad_norm": 1.8488768775022235,
|
|
"learning_rate": 3.248539222968489e-06,
|
|
"loss": 0.7121337413787842,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.6310211946050096,
|
|
"grad_norm": 1.9382810286305994,
|
|
"learning_rate": 3.223952089296854e-06,
|
|
"loss": 0.6842686653137207,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.6326268464996788,
|
|
"grad_norm": 1.6643793135842617,
|
|
"learning_rate": 3.199414005917726e-06,
|
|
"loss": 0.7061540603637695,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.6342324983943481,
|
|
"grad_norm": 2.724263331256564,
|
|
"learning_rate": 3.1749256505156203e-06,
|
|
"loss": 0.7035871505737304,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.6358381502890174,
|
|
"grad_norm": 2.0469158236441265,
|
|
"learning_rate": 3.150487699401681e-06,
|
|
"loss": 0.7187200546264648,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.6374438021836866,
|
|
"grad_norm": 2.0083569748830437,
|
|
"learning_rate": 3.1261008274950045e-06,
|
|
"loss": 0.7021552085876465,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.6390494540783558,
|
|
"grad_norm": 1.9963871377571232,
|
|
"learning_rate": 3.1017657083039974e-06,
|
|
"loss": 0.7082595825195312,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.640655105973025,
|
|
"grad_norm": 1.680927867718523,
|
|
"learning_rate": 3.0774830139077816e-06,
|
|
"loss": 0.6650783061981201,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.6422607578676943,
|
|
"grad_norm": 1.59265243871285,
|
|
"learning_rate": 3.0532534149376225e-06,
|
|
"loss": 0.6559285163879395,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.6422607578676943,
|
|
"eval_loss": 0.6435865759849548,
|
|
"eval_runtime": 100.3623,
|
|
"eval_samples_per_second": 20.157,
|
|
"eval_steps_per_second": 5.042,
|
|
"eval_token_acc": 0.7852274414732181,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.6438664097623635,
|
|
"grad_norm": 2.059823477222312,
|
|
"learning_rate": 3.0290775805584182e-06,
|
|
"loss": 0.6783754348754882,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.6454720616570327,
|
|
"grad_norm": 1.8047856556885493,
|
|
"learning_rate": 3.0049561784502125e-06,
|
|
"loss": 0.6974565029144287,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.647077713551702,
|
|
"grad_norm": 1.8332131891460988,
|
|
"learning_rate": 2.980889874789758e-06,
|
|
"loss": 0.7201285839080811,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.6486833654463712,
|
|
"grad_norm": 1.4349168104380714,
|
|
"learning_rate": 2.956879334232117e-06,
|
|
"loss": 0.669098949432373,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.6502890173410405,
|
|
"grad_norm": 1.6713525304929475,
|
|
"learning_rate": 2.9329252198923026e-06,
|
|
"loss": 0.6994924545288086,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.6518946692357097,
|
|
"grad_norm": 1.9182171362094331,
|
|
"learning_rate": 2.909028193326974e-06,
|
|
"loss": 0.7437381744384766,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.6535003211303789,
|
|
"grad_norm": 1.9925290937878184,
|
|
"learning_rate": 2.8851889145161515e-06,
|
|
"loss": 0.7246116161346435,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.6551059730250481,
|
|
"grad_norm": 2.0862100251659523,
|
|
"learning_rate": 2.861408041845002e-06,
|
|
"loss": 0.6847538948059082,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.6567116249197174,
|
|
"grad_norm": 1.8109480499644828,
|
|
"learning_rate": 2.8376862320856524e-06,
|
|
"loss": 0.6831855773925781,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.6583172768143867,
|
|
"grad_norm": 1.976494032777444,
|
|
"learning_rate": 2.814024140379048e-06,
|
|
"loss": 0.6985796451568603,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.6599229287090559,
|
|
"grad_norm": 1.9685296222085586,
|
|
"learning_rate": 2.7904224202168608e-06,
|
|
"loss": 0.6664264678955079,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.6615285806037251,
|
|
"grad_norm": 1.9281066229028763,
|
|
"learning_rate": 2.766881723423441e-06,
|
|
"loss": 0.719045352935791,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.6631342324983943,
|
|
"grad_norm": 1.6502284540715824,
|
|
"learning_rate": 2.7434027001378194e-06,
|
|
"loss": 0.6993152141571045,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.6647398843930635,
|
|
"grad_norm": 1.6697487416381616,
|
|
"learning_rate": 2.719985998795747e-06,
|
|
"loss": 0.7472553253173828,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.6663455362877329,
|
|
"grad_norm": 1.6636753892060745,
|
|
"learning_rate": 2.696632266111784e-06,
|
|
"loss": 0.7038356781005859,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.6679511881824021,
|
|
"grad_norm": 1.9803593784420075,
|
|
"learning_rate": 2.67334214706145e-06,
|
|
"loss": 0.7307730197906495,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.6695568400770713,
|
|
"grad_norm": 1.767255994774783,
|
|
"learning_rate": 2.6501162848634023e-06,
|
|
"loss": 0.7219781875610352,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.6711624919717405,
|
|
"grad_norm": 1.901874438910319,
|
|
"learning_rate": 2.6269553209616705e-06,
|
|
"loss": 0.6743021965026855,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.6727681438664097,
|
|
"grad_norm": 2.0677163684150965,
|
|
"learning_rate": 2.603859895007953e-06,
|
|
"loss": 0.7430953502655029,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.674373795761079,
|
|
"grad_norm": 1.6171922112559496,
|
|
"learning_rate": 2.5808306448439363e-06,
|
|
"loss": 0.6823254585266113,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.674373795761079,
|
|
"eval_loss": 0.6416329741477966,
|
|
"eval_runtime": 102.1529,
|
|
"eval_samples_per_second": 19.804,
|
|
"eval_steps_per_second": 4.953,
|
|
"eval_token_acc": 0.7860491773464522,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.6759794476557482,
|
|
"grad_norm": 2.156271432064396,
|
|
"learning_rate": 2.557868206483689e-06,
|
|
"loss": 0.7033274650573731,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.6775850995504175,
|
|
"grad_norm": 1.8682650555990705,
|
|
"learning_rate": 2.5349732140960924e-06,
|
|
"loss": 0.706195592880249,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.6791907514450867,
|
|
"grad_norm": 1.8511909561934217,
|
|
"learning_rate": 2.5121462999873304e-06,
|
|
"loss": 0.6629680633544922,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.680796403339756,
|
|
"grad_norm": 1.7968251606366754,
|
|
"learning_rate": 2.48938809458342e-06,
|
|
"loss": 0.6985820770263672,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.6824020552344252,
|
|
"grad_norm": 1.5956361541126427,
|
|
"learning_rate": 2.466699226412807e-06,
|
|
"loss": 0.6692060470581055,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.6840077071290944,
|
|
"grad_norm": 1.8716762576484385,
|
|
"learning_rate": 2.4440803220890054e-06,
|
|
"loss": 0.6867996215820312,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.6856133590237636,
|
|
"grad_norm": 1.937863972461209,
|
|
"learning_rate": 2.4215320062932884e-06,
|
|
"loss": 0.679615592956543,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.6872190109184328,
|
|
"grad_norm": 1.779698511514955,
|
|
"learning_rate": 2.399054901757442e-06,
|
|
"loss": 0.7149673461914062,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.6888246628131022,
|
|
"grad_norm": 1.9594648302014621,
|
|
"learning_rate": 2.3766496292465626e-06,
|
|
"loss": 0.6776654243469238,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.6904303147077714,
|
|
"grad_norm": 2.164984986049087,
|
|
"learning_rate": 2.3543168075419128e-06,
|
|
"loss": 0.7544286727905274,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.6920359666024406,
|
|
"grad_norm": 1.7026126548400184,
|
|
"learning_rate": 2.3320570534238333e-06,
|
|
"loss": 0.6883758544921875,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.6936416184971098,
|
|
"grad_norm": 1.894526210798688,
|
|
"learning_rate": 2.3098709816547126e-06,
|
|
"loss": 0.669807243347168,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.695247270391779,
|
|
"grad_norm": 1.7104031052730917,
|
|
"learning_rate": 2.2877592049620013e-06,
|
|
"loss": 0.6752557277679443,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.6968529222864484,
|
|
"grad_norm": 2.0154910175662177,
|
|
"learning_rate": 2.2657223340212937e-06,
|
|
"loss": 0.7199252128601075,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.6984585741811176,
|
|
"grad_norm": 1.6326002815510572,
|
|
"learning_rate": 2.243760977439463e-06,
|
|
"loss": 0.6786823749542237,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.7000642260757868,
|
|
"grad_norm": 1.8278106499138467,
|
|
"learning_rate": 2.2218757417378524e-06,
|
|
"loss": 0.7039000034332276,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.701669877970456,
|
|
"grad_norm": 1.5150426633022813,
|
|
"learning_rate": 2.2000672313355243e-06,
|
|
"loss": 0.6673988342285156,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.7032755298651252,
|
|
"grad_norm": 1.814010824208995,
|
|
"learning_rate": 2.178336048532567e-06,
|
|
"loss": 0.6703363418579101,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.7048811817597945,
|
|
"grad_norm": 1.932034342856998,
|
|
"learning_rate": 2.1566827934934625e-06,
|
|
"loss": 0.7365949630737305,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.7064868336544637,
|
|
"grad_norm": 1.583750341584408,
|
|
"learning_rate": 2.1351080642305087e-06,
|
|
"loss": 0.7208436012268067,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.7064868336544637,
|
|
"eval_loss": 0.6407771110534668,
|
|
"eval_runtime": 100.9662,
|
|
"eval_samples_per_second": 20.036,
|
|
"eval_steps_per_second": 5.012,
|
|
"eval_token_acc": 0.7865872683497375,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.708092485549133,
|
|
"grad_norm": 1.7120130450491946,
|
|
"learning_rate": 2.1136124565873067e-06,
|
|
"loss": 0.6615838050842285,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.7096981374438022,
|
|
"grad_norm": 1.8942066282925965,
|
|
"learning_rate": 2.092196564222301e-06,
|
|
"loss": 0.7101615905761719,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.7113037893384714,
|
|
"grad_norm": 2.0967864377771313,
|
|
"learning_rate": 2.070860978592389e-06,
|
|
"loss": 0.760159969329834,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.7129094412331407,
|
|
"grad_norm": 1.87264295172251,
|
|
"learning_rate": 2.04960628893658e-06,
|
|
"loss": 0.6784673690795898,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.7145150931278099,
|
|
"grad_norm": 1.5829797358177315,
|
|
"learning_rate": 2.0284330822597328e-06,
|
|
"loss": 0.697022819519043,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.7161207450224791,
|
|
"grad_norm": 1.954361401728327,
|
|
"learning_rate": 2.0073419433163287e-06,
|
|
"loss": 0.6933704376220703,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.7177263969171483,
|
|
"grad_norm": 1.8847843880125525,
|
|
"learning_rate": 1.9863334545943346e-06,
|
|
"loss": 0.6825516223907471,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.7193320488118176,
|
|
"grad_norm": 1.80323508483505,
|
|
"learning_rate": 1.96540819629911e-06,
|
|
"loss": 0.71434326171875,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.7209377007064869,
|
|
"grad_norm": 1.5642782564942102,
|
|
"learning_rate": 1.944566746337384e-06,
|
|
"loss": 0.6642905712127686,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.7225433526011561,
|
|
"grad_norm": 1.678835575797861,
|
|
"learning_rate": 1.9238096803012977e-06,
|
|
"loss": 0.7029307842254638,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.7241490044958253,
|
|
"grad_norm": 1.7368484560789226,
|
|
"learning_rate": 1.9031375714525024e-06,
|
|
"loss": 0.6675965309143066,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.7257546563904945,
|
|
"grad_norm": 2.0308775134008976,
|
|
"learning_rate": 1.8825509907063328e-06,
|
|
"loss": 0.7120641231536865,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.7273603082851637,
|
|
"grad_norm": 1.632678314506496,
|
|
"learning_rate": 1.862050506616036e-06,
|
|
"loss": 0.699639892578125,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.7289659601798331,
|
|
"grad_norm": 1.7984916892790255,
|
|
"learning_rate": 1.841636685357071e-06,
|
|
"loss": 0.7304025650024414,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.7305716120745023,
|
|
"grad_norm": 1.8169707359801452,
|
|
"learning_rate": 1.8213100907114723e-06,
|
|
"loss": 0.6680910110473632,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.7321772639691715,
|
|
"grad_norm": 1.9689432226586039,
|
|
"learning_rate": 1.8010712840522787e-06,
|
|
"loss": 0.7318731307983398,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.7337829158638407,
|
|
"grad_norm": 1.9672898212636953,
|
|
"learning_rate": 1.7809208243280295e-06,
|
|
"loss": 0.6793680191040039,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.7353885677585099,
|
|
"grad_norm": 1.7885095729577234,
|
|
"learning_rate": 1.7608592680473286e-06,
|
|
"loss": 0.6592792510986328,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.7369942196531792,
|
|
"grad_norm": 1.7762408489737018,
|
|
"learning_rate": 1.740887169263477e-06,
|
|
"loss": 0.7005180358886719,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.7385998715478485,
|
|
"grad_norm": 1.7404523046374716,
|
|
"learning_rate": 1.7210050795591659e-06,
|
|
"loss": 0.6815290451049805,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.7385998715478485,
|
|
"eval_loss": 0.6378750205039978,
|
|
"eval_runtime": 101.5529,
|
|
"eval_samples_per_second": 19.921,
|
|
"eval_steps_per_second": 4.983,
|
|
"eval_token_acc": 0.7871428785949902,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.7402055234425177,
|
|
"grad_norm": 2.1390203235709064,
|
|
"learning_rate": 1.7012135480312453e-06,
|
|
"loss": 0.773406982421875,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.7418111753371869,
|
|
"grad_norm": 1.8394303048257645,
|
|
"learning_rate": 1.681513121275562e-06,
|
|
"loss": 0.6579567432403565,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.7434168272318561,
|
|
"grad_norm": 4.00766593758118,
|
|
"learning_rate": 1.6619043433718618e-06,
|
|
"loss": 0.6489538192749024,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.7450224791265254,
|
|
"grad_norm": 2.004303828563679,
|
|
"learning_rate": 1.6423877558687618e-06,
|
|
"loss": 0.7187282562255859,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.7466281310211946,
|
|
"grad_norm": 1.8011662444901102,
|
|
"learning_rate": 1.6229638977687978e-06,
|
|
"loss": 0.6926092624664306,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.7482337829158638,
|
|
"grad_norm": 1.857568101595114,
|
|
"learning_rate": 1.6036333055135345e-06,
|
|
"loss": 0.7091545104980469,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.7498394348105331,
|
|
"grad_norm": 1.5863365553976057,
|
|
"learning_rate": 1.5843965129687534e-06,
|
|
"loss": 0.6575593948364258,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.7514450867052023,
|
|
"grad_norm": 2.0852974372506323,
|
|
"learning_rate": 1.5652540514097053e-06,
|
|
"loss": 0.7059723854064941,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.7530507385998716,
|
|
"grad_norm": 1.6022043007467883,
|
|
"learning_rate": 1.5462064495064422e-06,
|
|
"loss": 0.7094513893127441,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.7546563904945408,
|
|
"grad_norm": 1.9880232886802098,
|
|
"learning_rate": 1.5272542333092111e-06,
|
|
"loss": 0.712027645111084,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.75626204238921,
|
|
"grad_norm": 1.8966672269171247,
|
|
"learning_rate": 1.5083979262339299e-06,
|
|
"loss": 0.7096713066101075,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.7578676942838792,
|
|
"grad_norm": 2.2560082777752926,
|
|
"learning_rate": 1.4896380490477336e-06,
|
|
"loss": 0.7394931793212891,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.7594733461785484,
|
|
"grad_norm": 1.6889658107554155,
|
|
"learning_rate": 1.4709751198545858e-06,
|
|
"loss": 0.6921211242675781,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.7610789980732178,
|
|
"grad_norm": 1.8873972073347112,
|
|
"learning_rate": 1.4524096540809746e-06,
|
|
"loss": 0.6788894653320312,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.762684649967887,
|
|
"grad_norm": 1.7372990869576996,
|
|
"learning_rate": 1.4339421644616723e-06,
|
|
"loss": 0.6684075355529785,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.7642903018625562,
|
|
"grad_norm": 1.8153713813071988,
|
|
"learning_rate": 1.415573161025584e-06,
|
|
"loss": 0.6402009010314942,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.7658959537572254,
|
|
"grad_norm": 1.659182519397587,
|
|
"learning_rate": 1.3973031510816542e-06,
|
|
"loss": 0.6899700164794922,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.7675016056518946,
|
|
"grad_norm": 1.871741870915225,
|
|
"learning_rate": 1.3791326392048593e-06,
|
|
"loss": 0.7124199867248535,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.769107257546564,
|
|
"grad_norm": 1.9475717396141488,
|
|
"learning_rate": 1.3610621272222713e-06,
|
|
"loss": 0.6798200607299805,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.7707129094412332,
|
|
"grad_norm": 1.609026845213952,
|
|
"learning_rate": 1.3430921141991977e-06,
|
|
"loss": 0.701236629486084,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.7707129094412332,
|
|
"eval_loss": 0.637983500957489,
|
|
"eval_runtime": 101.3114,
|
|
"eval_samples_per_second": 19.968,
|
|
"eval_steps_per_second": 4.995,
|
|
"eval_token_acc": 0.7876934833425379,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.7723185613359024,
|
|
"grad_norm": 1.9809938227068873,
|
|
"learning_rate": 1.3252230964253998e-06,
|
|
"loss": 0.6687247276306152,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.7739242132305716,
|
|
"grad_norm": 1.5879163475967482,
|
|
"learning_rate": 1.3074555674013901e-06,
|
|
"loss": 0.6856508255004883,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.7755298651252408,
|
|
"grad_norm": 1.9328904935598028,
|
|
"learning_rate": 1.2897900178247945e-06,
|
|
"loss": 0.7378098487854003,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.7771355170199101,
|
|
"grad_norm": 1.8830227485438022,
|
|
"learning_rate": 1.2722269355768058e-06,
|
|
"loss": 0.6985198020935058,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.7787411689145793,
|
|
"grad_norm": 1.848123076439394,
|
|
"learning_rate": 1.2547668057087097e-06,
|
|
"loss": 0.6710779666900635,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.7803468208092486,
|
|
"grad_norm": 2.004806413254504,
|
|
"learning_rate": 1.237410110428487e-06,
|
|
"loss": 0.711176586151123,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.7819524727039178,
|
|
"grad_norm": 2.2065298885710942,
|
|
"learning_rate": 1.2201573290874963e-06,
|
|
"loss": 0.6620469093322754,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.783558124598587,
|
|
"grad_norm": 1.8575907718766689,
|
|
"learning_rate": 1.2030089381672384e-06,
|
|
"loss": 0.7247062206268311,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.7851637764932563,
|
|
"grad_norm": 1.8829745520117143,
|
|
"learning_rate": 1.1859654112661923e-06,
|
|
"loss": 0.7227104187011719,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.7867694283879255,
|
|
"grad_norm": 1.9172890092540606,
|
|
"learning_rate": 1.169027219086739e-06,
|
|
"loss": 0.7319454669952392,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.7883750802825947,
|
|
"grad_norm": 1.6495086584014267,
|
|
"learning_rate": 1.1521948294221603e-06,
|
|
"loss": 0.7280515193939209,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.789980732177264,
|
|
"grad_norm": 1.8581360269695162,
|
|
"learning_rate": 1.1354687071437197e-06,
|
|
"loss": 0.6312118530273437,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.7915863840719332,
|
|
"grad_norm": 1.7869920126685614,
|
|
"learning_rate": 1.1188493141878248e-06,
|
|
"loss": 0.6938999176025391,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.7931920359666025,
|
|
"grad_norm": 1.6368096604874645,
|
|
"learning_rate": 1.1023371095432656e-06,
|
|
"loss": 0.7159996032714844,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.7947976878612717,
|
|
"grad_norm": 1.8355082872813364,
|
|
"learning_rate": 1.085932549238547e-06,
|
|
"loss": 0.7142012596130372,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.7964033397559409,
|
|
"grad_norm": 1.7194268581182632,
|
|
"learning_rate": 1.0696360863292842e-06,
|
|
"loss": 0.7176544189453125,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.7980089916506101,
|
|
"grad_norm": 2.2633388706554216,
|
|
"learning_rate": 1.053448170885697e-06,
|
|
"loss": 0.6689523220062256,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.7996146435452793,
|
|
"grad_norm": 1.877683615965777,
|
|
"learning_rate": 1.0373692499801763e-06,
|
|
"loss": 0.7301874160766602,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.8012202954399487,
|
|
"grad_norm": 1.5723248207516158,
|
|
"learning_rate": 1.021399767674941e-06,
|
|
"loss": 0.6963142395019531,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.8028259473346179,
|
|
"grad_norm": 2.210852669764262,
|
|
"learning_rate": 1.0055401650097685e-06,
|
|
"loss": 0.7632877349853515,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.8028259473346179,
|
|
"eval_loss": 0.6379601955413818,
|
|
"eval_runtime": 100.435,
|
|
"eval_samples_per_second": 20.142,
|
|
"eval_steps_per_second": 5.038,
|
|
"eval_token_acc": 0.7878169522859274,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.8044315992292871,
|
|
"grad_norm": 1.6585137655893314,
|
|
"learning_rate": 9.89790879989821e-07,
|
|
"loss": 0.7174354553222656,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.8060372511239563,
|
|
"grad_norm": 1.794298397140435,
|
|
"learning_rate": 9.741523475735414e-07,
|
|
"loss": 0.7475096702575683,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.8076429030186255,
|
|
"grad_norm": 1.7840894537770773,
|
|
"learning_rate": 9.586249996606473e-07,
|
|
"loss": 0.6916313648223877,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.8092485549132948,
|
|
"grad_norm": 1.6625161110070892,
|
|
"learning_rate": 9.432092650801994e-07,
|
|
"loss": 0.7148942470550537,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.810854206807964,
|
|
"grad_norm": 1.8916186866947355,
|
|
"learning_rate": 9.279055695787582e-07,
|
|
"loss": 0.6605861186981201,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.8124598587026333,
|
|
"grad_norm": 1.7413837106014958,
|
|
"learning_rate": 9.127143358086277e-07,
|
|
"loss": 0.6902444362640381,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.8140655105973025,
|
|
"grad_norm": 1.5421682897627211,
|
|
"learning_rate": 8.976359833161796e-07,
|
|
"loss": 0.6847622871398926,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.8156711624919717,
|
|
"grad_norm": 1.6388970285941844,
|
|
"learning_rate": 8.826709285302737e-07,
|
|
"loss": 0.7044576644897461,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.817276814386641,
|
|
"grad_norm": 2.1860191201925963,
|
|
"learning_rate": 8.678195847507464e-07,
|
|
"loss": 0.7263412475585938,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.8188824662813102,
|
|
"grad_norm": 1.840780268433954,
|
|
"learning_rate": 8.530823621370043e-07,
|
|
"loss": 0.6977847576141357,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.8204881181759794,
|
|
"grad_norm": 1.681597730618285,
|
|
"learning_rate": 8.384596676966938e-07,
|
|
"loss": 0.6181282997131348,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.8220937700706487,
|
|
"grad_norm": 1.8111683293074319,
|
|
"learning_rate": 8.239519052744605e-07,
|
|
"loss": 0.6957583427429199,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.8236994219653179,
|
|
"grad_norm": 2.018101916346725,
|
|
"learning_rate": 8.095594755407971e-07,
|
|
"loss": 0.7490185260772705,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.8253050738599872,
|
|
"grad_norm": 1.512552912962594,
|
|
"learning_rate": 7.952827759809756e-07,
|
|
"loss": 0.6976777076721191,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.8269107257546564,
|
|
"grad_norm": 1.8005082781446453,
|
|
"learning_rate": 7.811222008840719e-07,
|
|
"loss": 0.7236599922180176,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.8285163776493256,
|
|
"grad_norm": 1.9947903621763263,
|
|
"learning_rate": 7.670781413320766e-07,
|
|
"loss": 0.6453072547912597,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.8301220295439948,
|
|
"grad_norm": 1.749286146781592,
|
|
"learning_rate": 7.531509851890911e-07,
|
|
"loss": 0.6826435089111328,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.831727681438664,
|
|
"grad_norm": 1.8798572056617036,
|
|
"learning_rate": 7.393411170906201e-07,
|
|
"loss": 0.6848629474639892,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 1.6109675391338243,
|
|
"learning_rate": 7.256489184329452e-07,
|
|
"loss": 0.6714759349822998,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.8349389852280026,
|
|
"grad_norm": 1.6184688330019217,
|
|
"learning_rate": 7.120747673625916e-07,
|
|
"loss": 0.6852529525756836,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.8349389852280026,
|
|
"eval_loss": 0.6380971074104309,
|
|
"eval_runtime": 100.8172,
|
|
"eval_samples_per_second": 20.066,
|
|
"eval_steps_per_second": 5.019,
|
|
"eval_token_acc": 0.7881940331130358,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.8365446371226718,
|
|
"grad_norm": 1.7911508916416556,
|
|
"learning_rate": 6.986190387658909e-07,
|
|
"loss": 0.7391942024230957,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.838150289017341,
|
|
"grad_norm": 1.7945846037138906,
|
|
"learning_rate": 6.852821042586183e-07,
|
|
"loss": 0.7381104469299317,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.8397559409120102,
|
|
"grad_norm": 1.7286241666325148,
|
|
"learning_rate": 6.720643321757348e-07,
|
|
"loss": 0.7324903011322021,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.8413615928066795,
|
|
"grad_norm": 1.8638958372473948,
|
|
"learning_rate": 6.589660875612147e-07,
|
|
"loss": 0.720820140838623,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.8429672447013488,
|
|
"grad_norm": 2.2946259368571535,
|
|
"learning_rate": 6.459877321579628e-07,
|
|
"loss": 0.7330772876739502,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.844572896596018,
|
|
"grad_norm": 2.024802800575958,
|
|
"learning_rate": 6.33129624397823e-07,
|
|
"loss": 0.7107767581939697,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.8461785484906872,
|
|
"grad_norm": 1.700416273773183,
|
|
"learning_rate": 6.203921193916812e-07,
|
|
"loss": 0.637398099899292,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.8477842003853564,
|
|
"grad_norm": 2.0030457735835228,
|
|
"learning_rate": 6.077755689196574e-07,
|
|
"loss": 0.714143180847168,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.8493898522800257,
|
|
"grad_norm": 1.9186452668781833,
|
|
"learning_rate": 5.952803214213887e-07,
|
|
"loss": 0.6865103721618653,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.8509955041746949,
|
|
"grad_norm": 1.77904181814012,
|
|
"learning_rate": 5.829067219864099e-07,
|
|
"loss": 0.7384478092193604,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.8526011560693642,
|
|
"grad_norm": 1.5991572147517608,
|
|
"learning_rate": 5.706551123446175e-07,
|
|
"loss": 0.6412908554077148,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.8542068079640334,
|
|
"grad_norm": 1.8024184059984696,
|
|
"learning_rate": 5.585258308568381e-07,
|
|
"loss": 0.6989157676696778,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.8558124598587026,
|
|
"grad_norm": 2.0451623196060917,
|
|
"learning_rate": 5.465192125054769e-07,
|
|
"loss": 0.6701858043670654,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.8574181117533719,
|
|
"grad_norm": 1.8934593514894456,
|
|
"learning_rate": 5.346355888852767e-07,
|
|
"loss": 0.6798998832702636,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.8590237636480411,
|
|
"grad_norm": 1.9778835857420947,
|
|
"learning_rate": 5.22875288194144e-07,
|
|
"loss": 0.657193374633789,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.8606294155427103,
|
|
"grad_norm": 1.7955267532960553,
|
|
"learning_rate": 5.112386352241017e-07,
|
|
"loss": 0.6918414115905762,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.8622350674373795,
|
|
"grad_norm": 1.6988172442673055,
|
|
"learning_rate": 4.997259513523079e-07,
|
|
"loss": 0.7405441284179688,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.8638407193320488,
|
|
"grad_norm": 1.9188503717325887,
|
|
"learning_rate": 4.883375545321845e-07,
|
|
"loss": 0.6795600891113281,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.8654463712267181,
|
|
"grad_norm": 1.5981461088093833,
|
|
"learning_rate": 4.770737592846375e-07,
|
|
"loss": 0.6697923183441162,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.8670520231213873,
|
|
"grad_norm": 1.8377885982974524,
|
|
"learning_rate": 4.6593487668936565e-07,
|
|
"loss": 0.7350438117980957,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.8670520231213873,
|
|
"eval_loss": 0.6342610716819763,
|
|
"eval_runtime": 100.1583,
|
|
"eval_samples_per_second": 20.198,
|
|
"eval_steps_per_second": 5.052,
|
|
"eval_token_acc": 0.7883325185495402,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.8686576750160565,
|
|
"grad_norm": 1.6968512863196625,
|
|
"learning_rate": 4.5492121437627433e-07,
|
|
"loss": 0.6635457038879394,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.8702633269107257,
|
|
"grad_norm": 2.0071517773652663,
|
|
"learning_rate": 4.440330765169765e-07,
|
|
"loss": 0.6638803958892823,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.871868978805395,
|
|
"grad_norm": 1.731964996230591,
|
|
"learning_rate": 4.3327076381639357e-07,
|
|
"loss": 0.7365228176116944,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.8734746307000643,
|
|
"grad_norm": 2.0180621500310316,
|
|
"learning_rate": 4.226345735044485e-07,
|
|
"loss": 0.7146649837493897,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.8750802825947335,
|
|
"grad_norm": 1.852487136991446,
|
|
"learning_rate": 4.121247993278621e-07,
|
|
"loss": 0.7220353603363037,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.8766859344894027,
|
|
"grad_norm": 1.7475004626282986,
|
|
"learning_rate": 4.0174173154203356e-07,
|
|
"loss": 0.7063849449157715,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.8782915863840719,
|
|
"grad_norm": 1.722362531304578,
|
|
"learning_rate": 3.9148565690302896e-07,
|
|
"loss": 0.6868960857391357,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.8798972382787412,
|
|
"grad_norm": 1.7102032863217258,
|
|
"learning_rate": 3.813568586596611e-07,
|
|
"loss": 0.6587272644042969,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.8815028901734104,
|
|
"grad_norm": 1.4768286346198138,
|
|
"learning_rate": 3.7135561654566497e-07,
|
|
"loss": 0.7307673454284668,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.8831085420680796,
|
|
"grad_norm": 1.7825391189854223,
|
|
"learning_rate": 3.6148220677197364e-07,
|
|
"loss": 0.6861472129821777,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.8847141939627489,
|
|
"grad_norm": 1.5463273855512805,
|
|
"learning_rate": 3.5173690201909084e-07,
|
|
"loss": 0.6802690029144287,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.8863198458574181,
|
|
"grad_norm": 1.8670581209467214,
|
|
"learning_rate": 3.4211997142955756e-07,
|
|
"loss": 0.7045941352844238,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.8879254977520874,
|
|
"grad_norm": 2.0779947935631022,
|
|
"learning_rate": 3.326316806005209e-07,
|
|
"loss": 0.6813518524169921,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.8895311496467566,
|
|
"grad_norm": 1.95188637541602,
|
|
"learning_rate": 3.2327229157639915e-07,
|
|
"loss": 0.671013069152832,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.8911368015414258,
|
|
"grad_norm": 1.7012921225510371,
|
|
"learning_rate": 3.1404206284164295e-07,
|
|
"loss": 0.695673131942749,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.892742453436095,
|
|
"grad_norm": 2.0319087769759774,
|
|
"learning_rate": 3.0494124931359834e-07,
|
|
"loss": 0.673918628692627,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.8943481053307643,
|
|
"grad_norm": 1.7787149579377735,
|
|
"learning_rate": 2.959701023354644e-07,
|
|
"loss": 0.7056601524353028,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.8959537572254336,
|
|
"grad_norm": 1.6803005634091956,
|
|
"learning_rate": 2.871288696693564e-07,
|
|
"loss": 0.711387300491333,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.8975594091201028,
|
|
"grad_norm": 2.038872846650325,
|
|
"learning_rate": 2.7841779548945626e-07,
|
|
"loss": 0.643738079071045,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.899165061014772,
|
|
"grad_norm": 1.7323673485398343,
|
|
"learning_rate": 2.698371203752753e-07,
|
|
"loss": 0.6891765594482422,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.899165061014772,
|
|
"eval_loss": 0.6369297504425049,
|
|
"eval_runtime": 100.171,
|
|
"eval_samples_per_second": 20.195,
|
|
"eval_steps_per_second": 5.051,
|
|
"eval_token_acc": 0.7884993684730396,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.9007707129094412,
|
|
"grad_norm": 1.7595790189861171,
|
|
"learning_rate": 2.613870813050051e-07,
|
|
"loss": 0.6577945232391358,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.9023763648041104,
|
|
"grad_norm": 1.9701548509943287,
|
|
"learning_rate": 2.53067911648977e-07,
|
|
"loss": 0.7041536331176758,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.9039820166987798,
|
|
"grad_norm": 1.8393776355225353,
|
|
"learning_rate": 2.4487984116321474e-07,
|
|
"loss": 0.7007421493530274,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.905587668593449,
|
|
"grad_norm": 1.8071384165730564,
|
|
"learning_rate": 2.368230959830875e-07,
|
|
"loss": 0.6663236618041992,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.9071933204881182,
|
|
"grad_norm": 1.7242592136590722,
|
|
"learning_rate": 2.2889789861706868e-07,
|
|
"loss": 0.6813517570495605,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.9087989723827874,
|
|
"grad_norm": 3.112061318957083,
|
|
"learning_rate": 2.211044679405877e-07,
|
|
"loss": 0.663045072555542,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.9104046242774566,
|
|
"grad_norm": 1.615986400574688,
|
|
"learning_rate": 2.1344301918998555e-07,
|
|
"loss": 0.6766707420349121,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.9120102761721259,
|
|
"grad_norm": 1.7528874633834992,
|
|
"learning_rate": 2.059137639565717e-07,
|
|
"loss": 0.6877583503723145,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.9136159280667951,
|
|
"grad_norm": 1.7796225000122403,
|
|
"learning_rate": 1.9851691018077824e-07,
|
|
"loss": 0.6933162689208985,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.9152215799614644,
|
|
"grad_norm": 1.7019095020573654,
|
|
"learning_rate": 1.9125266214642e-07,
|
|
"loss": 0.7472042083740235,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.9168272318561336,
|
|
"grad_norm": 1.9587898193737927,
|
|
"learning_rate": 1.8412122047505032e-07,
|
|
"loss": 0.6880541801452636,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.9184328837508028,
|
|
"grad_norm": 1.6775853695124985,
|
|
"learning_rate": 1.7712278212042134e-07,
|
|
"loss": 0.7018757820129394,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.9200385356454721,
|
|
"grad_norm": 1.7798533051734553,
|
|
"learning_rate": 1.7025754036304466e-07,
|
|
"loss": 0.691845703125,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.9216441875401413,
|
|
"grad_norm": 1.7420651980828974,
|
|
"learning_rate": 1.6352568480485277e-07,
|
|
"loss": 0.6812302112579346,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.9232498394348105,
|
|
"grad_norm": 1.9239749814632983,
|
|
"learning_rate": 1.5692740136396324e-07,
|
|
"loss": 0.6581586837768555,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.9248554913294798,
|
|
"grad_norm": 1.9082994117549892,
|
|
"learning_rate": 1.5046287226954394e-07,
|
|
"loss": 0.731188154220581,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.926461143224149,
|
|
"grad_norm": 1.7926613214649838,
|
|
"learning_rate": 1.4413227605677983e-07,
|
|
"loss": 0.6892642021179199,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.9280667951188183,
|
|
"grad_norm": 1.771249909731025,
|
|
"learning_rate": 1.379357875619436e-07,
|
|
"loss": 0.6990791320800781,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.9296724470134875,
|
|
"grad_norm": 1.8356150776446425,
|
|
"learning_rate": 1.3187357791756504e-07,
|
|
"loss": 0.6910379886627197,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.9312780989081567,
|
|
"grad_norm": 1.705632884206152,
|
|
"learning_rate": 1.2594581454770772e-07,
|
|
"loss": 0.6509772300720215,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.9312780989081567,
|
|
"eval_loss": 0.631990373134613,
|
|
"eval_runtime": 101.722,
|
|
"eval_samples_per_second": 19.888,
|
|
"eval_steps_per_second": 4.974,
|
|
"eval_token_acc": 0.7884676669875746,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.9328837508028259,
|
|
"grad_norm": 1.7844388380004357,
|
|
"learning_rate": 1.2015266116334135e-07,
|
|
"loss": 0.668633222579956,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.9344894026974951,
|
|
"grad_norm": 1.733170437170936,
|
|
"learning_rate": 1.1449427775782396e-07,
|
|
"loss": 0.655826187133789,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.9360950545921645,
|
|
"grad_norm": 2.358368796318552,
|
|
"learning_rate": 1.0897082060247976e-07,
|
|
"loss": 0.7064967155456543,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.9377007064868337,
|
|
"grad_norm": 1.729089971309778,
|
|
"learning_rate": 1.0358244224228764e-07,
|
|
"loss": 0.6829689979553223,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.9393063583815029,
|
|
"grad_norm": 1.6455732105763972,
|
|
"learning_rate": 9.832929149166503e-08,
|
|
"loss": 0.6565438747406006,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.9409120102761721,
|
|
"grad_norm": 1.760683614917597,
|
|
"learning_rate": 9.32115134303574e-08,
|
|
"loss": 0.6851202487945557,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.9425176621708413,
|
|
"grad_norm": 1.7874937383563934,
|
|
"learning_rate": 8.822924939943523e-08,
|
|
"loss": 0.7376208782196045,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.9441233140655106,
|
|
"grad_norm": 1.6510863458242004,
|
|
"learning_rate": 8.338263699738668e-08,
|
|
"loss": 0.6026273250579834,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.9457289659601799,
|
|
"grad_norm": 1.9834215383922407,
|
|
"learning_rate": 7.867181007631897e-08,
|
|
"loss": 0.7061845779418945,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.9473346178548491,
|
|
"grad_norm": 1.9131535301068874,
|
|
"learning_rate": 7.409689873826232e-08,
|
|
"loss": 0.670206880569458,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.9489402697495183,
|
|
"grad_norm": 2.0738608371934237,
|
|
"learning_rate": 6.965802933157573e-08,
|
|
"loss": 0.708193302154541,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.9505459216441875,
|
|
"grad_norm": 1.8177322993411624,
|
|
"learning_rate": 6.535532444745862e-08,
|
|
"loss": 0.7078551292419434,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.9521515735388568,
|
|
"grad_norm": 2.082436198696705,
|
|
"learning_rate": 6.118890291656355e-08,
|
|
"loss": 0.7242929458618164,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.953757225433526,
|
|
"grad_norm": 1.849544928484393,
|
|
"learning_rate": 5.7158879805716e-08,
|
|
"loss": 0.6365905284881592,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.9553628773281952,
|
|
"grad_norm": 1.9831271095863907,
|
|
"learning_rate": 5.32653664147359e-08,
|
|
"loss": 0.6500433444976806,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.9569685292228645,
|
|
"grad_norm": 1.823333486059779,
|
|
"learning_rate": 4.950847027336336e-08,
|
|
"loss": 0.6756103038787842,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.9585741811175337,
|
|
"grad_norm": 1.8028664132327312,
|
|
"learning_rate": 4.588829513828996e-08,
|
|
"loss": 0.7307151794433594,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.960179833012203,
|
|
"grad_norm": 1.656799725101945,
|
|
"learning_rate": 4.2404940990292135e-08,
|
|
"loss": 0.6212584495544433,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.9617854849068722,
|
|
"grad_norm": 1.9150059741320609,
|
|
"learning_rate": 3.90585040314706e-08,
|
|
"loss": 0.6227765560150147,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.9633911368015414,
|
|
"grad_norm": 1.880228879847369,
|
|
"learning_rate": 3.584907668259308e-08,
|
|
"loss": 0.7474319934844971,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.9633911368015414,
|
|
"eval_loss": 0.6360629200935364,
|
|
"eval_runtime": 100.1142,
|
|
"eval_samples_per_second": 20.207,
|
|
"eval_steps_per_second": 5.054,
|
|
"eval_token_acc": 0.7885460864516194,
|
|
"step": 6000
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 6228,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 200,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 185011340083200.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|