2446 lines
69 KiB
JSON
2446 lines
69 KiB
JSON
{
|
|
"best_global_step": 980,
|
|
"best_metric": 0.19523989,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v28-20250504-001458/checkpoint-980",
|
|
"epoch": 1.9872038515139998,
|
|
"eval_steps": 20,
|
|
"global_step": 980,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.002027112631445585,
|
|
"grad_norm": 2.521756172180176,
|
|
"learning_rate": 9.999988720152121e-06,
|
|
"loss": 0.36801594495773315,
|
|
"memory(GiB)": 31.83,
|
|
"step": 1,
|
|
"token_acc": 0.8936768843476973,
|
|
"train_speed(iter/s)": 0.061871
|
|
},
|
|
{
|
|
"epoch": 0.010135563157227924,
|
|
"grad_norm": 1.4690369367599487,
|
|
"learning_rate": 9.999718006347703e-06,
|
|
"loss": 0.29584014415740967,
|
|
"memory(GiB)": 31.83,
|
|
"step": 5,
|
|
"token_acc": 0.9045181572689532,
|
|
"train_speed(iter/s)": 0.11262
|
|
},
|
|
{
|
|
"epoch": 0.020271126314455847,
|
|
"grad_norm": 1.016095519065857,
|
|
"learning_rate": 9.998872057198983e-06,
|
|
"loss": 0.2859709024429321,
|
|
"memory(GiB)": 31.83,
|
|
"step": 10,
|
|
"token_acc": 0.8981923761356186,
|
|
"train_speed(iter/s)": 0.125063
|
|
},
|
|
{
|
|
"epoch": 0.03040668947168377,
|
|
"grad_norm": 0.8137410879135132,
|
|
"learning_rate": 9.997462247974751e-06,
|
|
"loss": 0.26002681255340576,
|
|
"memory(GiB)": 31.83,
|
|
"step": 15,
|
|
"token_acc": 0.907031992397846,
|
|
"train_speed(iter/s)": 0.130865
|
|
},
|
|
{
|
|
"epoch": 0.040542252628911694,
|
|
"grad_norm": 0.8540328145027161,
|
|
"learning_rate": 9.995488737697912e-06,
|
|
"loss": 0.24060523509979248,
|
|
"memory(GiB)": 31.84,
|
|
"step": 20,
|
|
"token_acc": 0.9225532754538279,
|
|
"train_speed(iter/s)": 0.133177
|
|
},
|
|
{
|
|
"epoch": 0.040542252628911694,
|
|
"eval_loss": 0.2592654228210449,
|
|
"eval_runtime": 21.9072,
|
|
"eval_samples_per_second": 14.516,
|
|
"eval_steps_per_second": 3.652,
|
|
"eval_token_acc": 0.9178476377490845,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.05067781578613962,
|
|
"grad_norm": 0.7922812104225159,
|
|
"learning_rate": 9.992951748975412e-06,
|
|
"loss": 0.24358665943145752,
|
|
"memory(GiB)": 31.84,
|
|
"step": 25,
|
|
"token_acc": 0.9152290582047404,
|
|
"train_speed(iter/s)": 0.114782
|
|
},
|
|
{
|
|
"epoch": 0.06081337894336754,
|
|
"grad_norm": 0.8630368709564209,
|
|
"learning_rate": 9.98985156797314e-06,
|
|
"loss": 0.23777904510498046,
|
|
"memory(GiB)": 31.84,
|
|
"step": 30,
|
|
"token_acc": 0.9226734218674036,
|
|
"train_speed(iter/s)": 0.118815
|
|
},
|
|
{
|
|
"epoch": 0.07094894210059546,
|
|
"grad_norm": 0.783414900302887,
|
|
"learning_rate": 9.98618854438364e-06,
|
|
"loss": 0.2334528923034668,
|
|
"memory(GiB)": 31.84,
|
|
"step": 35,
|
|
"token_acc": 0.9277159363111174,
|
|
"train_speed(iter/s)": 0.122415
|
|
},
|
|
{
|
|
"epoch": 0.08108450525782339,
|
|
"grad_norm": 0.8003538846969604,
|
|
"learning_rate": 9.98196309138667e-06,
|
|
"loss": 0.24934375286102295,
|
|
"memory(GiB)": 31.84,
|
|
"step": 40,
|
|
"token_acc": 0.9100693397648477,
|
|
"train_speed(iter/s)": 0.124779
|
|
},
|
|
{
|
|
"epoch": 0.08108450525782339,
|
|
"eval_loss": 0.24275335669517517,
|
|
"eval_runtime": 21.8352,
|
|
"eval_samples_per_second": 14.564,
|
|
"eval_steps_per_second": 3.664,
|
|
"eval_token_acc": 0.9218243456030826,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.09122006841505131,
|
|
"grad_norm": 0.8252556324005127,
|
|
"learning_rate": 9.977175685602601e-06,
|
|
"loss": 0.22385673522949218,
|
|
"memory(GiB)": 31.84,
|
|
"step": 45,
|
|
"token_acc": 0.9232757395226219,
|
|
"train_speed(iter/s)": 0.115407
|
|
},
|
|
{
|
|
"epoch": 0.10135563157227924,
|
|
"grad_norm": 0.7236311435699463,
|
|
"learning_rate": 9.971826867038652e-06,
|
|
"loss": 0.23449177742004396,
|
|
"memory(GiB)": 31.84,
|
|
"step": 50,
|
|
"token_acc": 0.9244473590731481,
|
|
"train_speed(iter/s)": 0.117372
|
|
},
|
|
{
|
|
"epoch": 0.11149119472950716,
|
|
"grad_norm": 0.7773425579071045,
|
|
"learning_rate": 9.965917239027972e-06,
|
|
"loss": 0.22451424598693848,
|
|
"memory(GiB)": 31.84,
|
|
"step": 55,
|
|
"token_acc": 0.9317552435764683,
|
|
"train_speed(iter/s)": 0.118883
|
|
},
|
|
{
|
|
"epoch": 0.12162675788673508,
|
|
"grad_norm": 0.7654985785484314,
|
|
"learning_rate": 9.959447468161598e-06,
|
|
"loss": 0.22769575119018554,
|
|
"memory(GiB)": 31.84,
|
|
"step": 60,
|
|
"token_acc": 0.9208154305996638,
|
|
"train_speed(iter/s)": 0.120828
|
|
},
|
|
{
|
|
"epoch": 0.12162675788673508,
|
|
"eval_loss": 0.23627901077270508,
|
|
"eval_runtime": 21.6502,
|
|
"eval_samples_per_second": 14.688,
|
|
"eval_steps_per_second": 3.695,
|
|
"eval_token_acc": 0.9226332722007126,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.13176232104396302,
|
|
"grad_norm": 0.8141059279441833,
|
|
"learning_rate": 9.952418284213256e-06,
|
|
"loss": 0.21704885959625245,
|
|
"memory(GiB)": 31.84,
|
|
"step": 65,
|
|
"token_acc": 0.9179818989115771,
|
|
"train_speed(iter/s)": 0.114505
|
|
},
|
|
{
|
|
"epoch": 0.14189788420119093,
|
|
"grad_norm": 0.6942949891090393,
|
|
"learning_rate": 9.94483048005705e-06,
|
|
"loss": 0.23090925216674804,
|
|
"memory(GiB)": 31.84,
|
|
"step": 70,
|
|
"token_acc": 0.9222755382207283,
|
|
"train_speed(iter/s)": 0.115729
|
|
},
|
|
{
|
|
"epoch": 0.15203344735841884,
|
|
"grad_norm": 0.7940172553062439,
|
|
"learning_rate": 9.936684911578019e-06,
|
|
"loss": 0.22984471321105956,
|
|
"memory(GiB)": 31.84,
|
|
"step": 75,
|
|
"token_acc": 0.9262917933130699,
|
|
"train_speed(iter/s)": 0.117067
|
|
},
|
|
{
|
|
"epoch": 0.16216901051564678,
|
|
"grad_norm": 0.7496793270111084,
|
|
"learning_rate": 9.927982497575606e-06,
|
|
"loss": 0.21851859092712403,
|
|
"memory(GiB)": 31.84,
|
|
"step": 80,
|
|
"token_acc": 0.9258955118221719,
|
|
"train_speed(iter/s)": 0.117832
|
|
},
|
|
{
|
|
"epoch": 0.16216901051564678,
|
|
"eval_loss": 0.231533482670784,
|
|
"eval_runtime": 21.8888,
|
|
"eval_samples_per_second": 14.528,
|
|
"eval_steps_per_second": 3.655,
|
|
"eval_token_acc": 0.9239361997739945,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.1723045736728747,
|
|
"grad_norm": 0.7115150690078735,
|
|
"learning_rate": 9.918724219660013e-06,
|
|
"loss": 0.2375343084335327,
|
|
"memory(GiB)": 31.84,
|
|
"step": 85,
|
|
"token_acc": 0.9190241791529182,
|
|
"train_speed(iter/s)": 0.113889
|
|
},
|
|
{
|
|
"epoch": 0.18244013683010263,
|
|
"grad_norm": 0.7307594418525696,
|
|
"learning_rate": 9.908911122141486e-06,
|
|
"loss": 0.21745119094848633,
|
|
"memory(GiB)": 31.84,
|
|
"step": 90,
|
|
"token_acc": 0.9248731238527157,
|
|
"train_speed(iter/s)": 0.11504
|
|
},
|
|
{
|
|
"epoch": 0.19257569998733054,
|
|
"grad_norm": 0.7595335841178894,
|
|
"learning_rate": 9.898544311912507e-06,
|
|
"loss": 0.23666539192199706,
|
|
"memory(GiB)": 31.84,
|
|
"step": 95,
|
|
"token_acc": 0.9154884783300083,
|
|
"train_speed(iter/s)": 0.116125
|
|
},
|
|
{
|
|
"epoch": 0.20271126314455848,
|
|
"grad_norm": 0.7457159161567688,
|
|
"learning_rate": 9.887624958322945e-06,
|
|
"loss": 0.22108936309814453,
|
|
"memory(GiB)": 31.84,
|
|
"step": 100,
|
|
"token_acc": 0.9207408691631993,
|
|
"train_speed(iter/s)": 0.117134
|
|
},
|
|
{
|
|
"epoch": 0.20271126314455848,
|
|
"eval_loss": 0.22748854756355286,
|
|
"eval_runtime": 21.8753,
|
|
"eval_samples_per_second": 14.537,
|
|
"eval_steps_per_second": 3.657,
|
|
"eval_token_acc": 0.9248686266155376,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.2128468263017864,
|
|
"grad_norm": 0.715534508228302,
|
|
"learning_rate": 9.876154293048163e-06,
|
|
"loss": 0.2108386516571045,
|
|
"memory(GiB)": 31.84,
|
|
"step": 105,
|
|
"token_acc": 0.9273328473234743,
|
|
"train_speed(iter/s)": 0.113574
|
|
},
|
|
{
|
|
"epoch": 0.22298238945901433,
|
|
"grad_norm": 0.7702275514602661,
|
|
"learning_rate": 9.864133609950077e-06,
|
|
"loss": 0.22842142581939698,
|
|
"memory(GiB)": 31.84,
|
|
"step": 110,
|
|
"token_acc": 0.9225603164746199,
|
|
"train_speed(iter/s)": 0.114631
|
|
},
|
|
{
|
|
"epoch": 0.23311795261624224,
|
|
"grad_norm": 0.7723402976989746,
|
|
"learning_rate": 9.851564264931219e-06,
|
|
"loss": 0.22526850700378417,
|
|
"memory(GiB)": 31.84,
|
|
"step": 115,
|
|
"token_acc": 0.9270424907786587,
|
|
"train_speed(iter/s)": 0.115388
|
|
},
|
|
{
|
|
"epoch": 0.24325351577347015,
|
|
"grad_norm": 0.7186813950538635,
|
|
"learning_rate": 9.838447675781795e-06,
|
|
"loss": 0.21562654972076417,
|
|
"memory(GiB)": 31.84,
|
|
"step": 120,
|
|
"token_acc": 0.9274811342327921,
|
|
"train_speed(iter/s)": 0.116096
|
|
},
|
|
{
|
|
"epoch": 0.24325351577347015,
|
|
"eval_loss": 0.22359323501586914,
|
|
"eval_runtime": 21.7276,
|
|
"eval_samples_per_second": 14.636,
|
|
"eval_steps_per_second": 3.682,
|
|
"eval_token_acc": 0.925652853164385,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.25338907893069806,
|
|
"grad_norm": 0.7766680717468262,
|
|
"learning_rate": 9.824785322019753e-06,
|
|
"loss": 0.2237046480178833,
|
|
"memory(GiB)": 31.84,
|
|
"step": 125,
|
|
"token_acc": 0.9236945219863242,
|
|
"train_speed(iter/s)": 0.113406
|
|
},
|
|
{
|
|
"epoch": 0.26352464208792603,
|
|
"grad_norm": 0.7407037019729614,
|
|
"learning_rate": 9.81057874472391e-06,
|
|
"loss": 0.219429874420166,
|
|
"memory(GiB)": 31.84,
|
|
"step": 130,
|
|
"token_acc": 0.9289204018740761,
|
|
"train_speed(iter/s)": 0.114239
|
|
},
|
|
{
|
|
"epoch": 0.27366020524515394,
|
|
"grad_norm": 0.7638006210327148,
|
|
"learning_rate": 9.795829546360113e-06,
|
|
"loss": 0.2275376319885254,
|
|
"memory(GiB)": 31.84,
|
|
"step": 135,
|
|
"token_acc": 0.9208956514221492,
|
|
"train_speed(iter/s)": 0.114967
|
|
},
|
|
{
|
|
"epoch": 0.28379576840238185,
|
|
"grad_norm": 0.7686223387718201,
|
|
"learning_rate": 9.78053939060049e-06,
|
|
"loss": 0.22452235221862793,
|
|
"memory(GiB)": 31.84,
|
|
"step": 140,
|
|
"token_acc": 0.9189745452344199,
|
|
"train_speed(iter/s)": 0.11575
|
|
},
|
|
{
|
|
"epoch": 0.28379576840238185,
|
|
"eval_loss": 0.22123830020427704,
|
|
"eval_runtime": 21.7578,
|
|
"eval_samples_per_second": 14.615,
|
|
"eval_steps_per_second": 3.677,
|
|
"eval_token_acc": 0.9263012294449282,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.29393133155960977,
|
|
"grad_norm": 0.6931352019309998,
|
|
"learning_rate": 9.764710002135784e-06,
|
|
"loss": 0.2003002643585205,
|
|
"memory(GiB)": 31.84,
|
|
"step": 145,
|
|
"token_acc": 0.9311943836379415,
|
|
"train_speed(iter/s)": 0.113318
|
|
},
|
|
{
|
|
"epoch": 0.3040668947168377,
|
|
"grad_norm": 0.800798773765564,
|
|
"learning_rate": 9.748343166480823e-06,
|
|
"loss": 0.20239923000335694,
|
|
"memory(GiB)": 31.84,
|
|
"step": 150,
|
|
"token_acc": 0.9229381308508969,
|
|
"train_speed(iter/s)": 0.113936
|
|
},
|
|
{
|
|
"epoch": 0.31420245787406564,
|
|
"grad_norm": 0.7786034345626831,
|
|
"learning_rate": 9.731440729773114e-06,
|
|
"loss": 0.22815029621124266,
|
|
"memory(GiB)": 31.84,
|
|
"step": 155,
|
|
"token_acc": 0.9227938277122053,
|
|
"train_speed(iter/s)": 0.114778
|
|
},
|
|
{
|
|
"epoch": 0.32433802103129356,
|
|
"grad_norm": 0.7402083277702332,
|
|
"learning_rate": 9.714004598564599e-06,
|
|
"loss": 0.19810022115707399,
|
|
"memory(GiB)": 31.84,
|
|
"step": 160,
|
|
"token_acc": 0.9393871049589806,
|
|
"train_speed(iter/s)": 0.115347
|
|
},
|
|
{
|
|
"epoch": 0.32433802103129356,
|
|
"eval_loss": 0.21936483681201935,
|
|
"eval_runtime": 21.7335,
|
|
"eval_samples_per_second": 14.632,
|
|
"eval_steps_per_second": 3.681,
|
|
"eval_token_acc": 0.9270422309084061,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.33447358418852147,
|
|
"grad_norm": 0.7241997718811035,
|
|
"learning_rate": 9.696036739606606e-06,
|
|
"loss": 0.21367735862731935,
|
|
"memory(GiB)": 31.84,
|
|
"step": 165,
|
|
"token_acc": 0.9242776974538682,
|
|
"train_speed(iter/s)": 0.113487
|
|
},
|
|
{
|
|
"epoch": 0.3446091473457494,
|
|
"grad_norm": 0.7917912006378174,
|
|
"learning_rate": 9.677539179628005e-06,
|
|
"loss": 0.23486363887786865,
|
|
"memory(GiB)": 31.84,
|
|
"step": 170,
|
|
"token_acc": 0.9263079934401947,
|
|
"train_speed(iter/s)": 0.114247
|
|
},
|
|
{
|
|
"epoch": 0.35474471050297735,
|
|
"grad_norm": 0.7224828004837036,
|
|
"learning_rate": 9.658514005106596e-06,
|
|
"loss": 0.22237610816955566,
|
|
"memory(GiB)": 31.84,
|
|
"step": 175,
|
|
"token_acc": 0.9191534928283597,
|
|
"train_speed(iter/s)": 0.114924
|
|
},
|
|
{
|
|
"epoch": 0.36488027366020526,
|
|
"grad_norm": 0.8869006037712097,
|
|
"learning_rate": 9.638963362033756e-06,
|
|
"loss": 0.2130965232849121,
|
|
"memory(GiB)": 31.84,
|
|
"step": 180,
|
|
"token_acc": 0.9328146634139909,
|
|
"train_speed(iter/s)": 0.115499
|
|
},
|
|
{
|
|
"epoch": 0.36488027366020526,
|
|
"eval_loss": 0.21775159239768982,
|
|
"eval_runtime": 21.827,
|
|
"eval_samples_per_second": 14.569,
|
|
"eval_steps_per_second": 3.665,
|
|
"eval_token_acc": 0.9271039810303625,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.37501583681743317,
|
|
"grad_norm": 0.7315487861633301,
|
|
"learning_rate": 9.618889455672384e-06,
|
|
"loss": 0.20544004440307617,
|
|
"memory(GiB)": 31.84,
|
|
"step": 185,
|
|
"token_acc": 0.9269179022705522,
|
|
"train_speed(iter/s)": 0.113738
|
|
},
|
|
{
|
|
"epoch": 0.3851513999746611,
|
|
"grad_norm": 0.7636704444885254,
|
|
"learning_rate": 9.598294550308149e-06,
|
|
"loss": 0.21708755493164061,
|
|
"memory(GiB)": 31.84,
|
|
"step": 190,
|
|
"token_acc": 0.9192093078475603,
|
|
"train_speed(iter/s)": 0.114239
|
|
},
|
|
{
|
|
"epoch": 0.395286963131889,
|
|
"grad_norm": 0.753697395324707,
|
|
"learning_rate": 9.577180968994081e-06,
|
|
"loss": 0.20737123489379883,
|
|
"memory(GiB)": 31.84,
|
|
"step": 195,
|
|
"token_acc": 0.9441376276819314,
|
|
"train_speed(iter/s)": 0.114771
|
|
},
|
|
{
|
|
"epoch": 0.40542252628911696,
|
|
"grad_norm": 0.670136570930481,
|
|
"learning_rate": 9.55555109328855e-06,
|
|
"loss": 0.22084522247314453,
|
|
"memory(GiB)": 31.84,
|
|
"step": 200,
|
|
"token_acc": 0.9222005816280249,
|
|
"train_speed(iter/s)": 0.115258
|
|
},
|
|
{
|
|
"epoch": 0.40542252628911696,
|
|
"eval_loss": 0.2161727398633957,
|
|
"eval_runtime": 21.793,
|
|
"eval_samples_per_second": 14.592,
|
|
"eval_steps_per_second": 3.671,
|
|
"eval_token_acc": 0.9274559567255145,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.41555808944634487,
|
|
"grad_norm": 0.7045581936836243,
|
|
"learning_rate": 9.533407362986606e-06,
|
|
"loss": 0.20071265697479249,
|
|
"memory(GiB)": 31.84,
|
|
"step": 205,
|
|
"token_acc": 0.926633272095267,
|
|
"train_speed(iter/s)": 0.11359
|
|
},
|
|
{
|
|
"epoch": 0.4256936526035728,
|
|
"grad_norm": 0.7426798343658447,
|
|
"learning_rate": 9.51075227584481e-06,
|
|
"loss": 0.2052762269973755,
|
|
"memory(GiB)": 31.84,
|
|
"step": 210,
|
|
"token_acc": 0.9334708885538184,
|
|
"train_speed(iter/s)": 0.114162
|
|
},
|
|
{
|
|
"epoch": 0.4358292157608007,
|
|
"grad_norm": 0.7467470169067383,
|
|
"learning_rate": 9.487588387299465e-06,
|
|
"loss": 0.2220769166946411,
|
|
"memory(GiB)": 31.84,
|
|
"step": 215,
|
|
"token_acc": 0.9223140495867769,
|
|
"train_speed(iter/s)": 0.114807
|
|
},
|
|
{
|
|
"epoch": 0.44596477891802866,
|
|
"grad_norm": 0.7024182677268982,
|
|
"learning_rate": 9.463918310178385e-06,
|
|
"loss": 0.21398956775665284,
|
|
"memory(GiB)": 31.84,
|
|
"step": 220,
|
|
"token_acc": 0.9318319475799791,
|
|
"train_speed(iter/s)": 0.115301
|
|
},
|
|
{
|
|
"epoch": 0.44596477891802866,
|
|
"eval_loss": 0.21440377831459045,
|
|
"eval_runtime": 21.7402,
|
|
"eval_samples_per_second": 14.627,
|
|
"eval_steps_per_second": 3.68,
|
|
"eval_token_acc": 0.9279561327133621,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.45610034207525657,
|
|
"grad_norm": 0.6610811948776245,
|
|
"learning_rate": 9.439744714406167e-06,
|
|
"loss": 0.20560271739959718,
|
|
"memory(GiB)": 31.84,
|
|
"step": 225,
|
|
"token_acc": 0.9351413448427652,
|
|
"train_speed(iter/s)": 0.113954
|
|
},
|
|
{
|
|
"epoch": 0.4662359052324845,
|
|
"grad_norm": 0.6910359859466553,
|
|
"learning_rate": 9.415070326703039e-06,
|
|
"loss": 0.22013638019561768,
|
|
"memory(GiB)": 31.84,
|
|
"step": 230,
|
|
"token_acc": 0.9156021643331288,
|
|
"train_speed(iter/s)": 0.114456
|
|
},
|
|
{
|
|
"epoch": 0.4763714683897124,
|
|
"grad_norm": 0.7237235903739929,
|
|
"learning_rate": 9.38989793027728e-06,
|
|
"loss": 0.2018293857574463,
|
|
"memory(GiB)": 31.84,
|
|
"step": 235,
|
|
"token_acc": 0.9305291447985599,
|
|
"train_speed(iter/s)": 0.114857
|
|
},
|
|
{
|
|
"epoch": 0.4865070315469403,
|
|
"grad_norm": 0.5631291270256042,
|
|
"learning_rate": 9.364230364511296e-06,
|
|
"loss": 0.20158102512359619,
|
|
"memory(GiB)": 31.84,
|
|
"step": 240,
|
|
"token_acc": 0.9311977864098334,
|
|
"train_speed(iter/s)": 0.115295
|
|
},
|
|
{
|
|
"epoch": 0.4865070315469403,
|
|
"eval_loss": 0.21314272284507751,
|
|
"eval_runtime": 21.8121,
|
|
"eval_samples_per_second": 14.579,
|
|
"eval_steps_per_second": 3.668,
|
|
"eval_token_acc": 0.9278943825914057,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.49664259470416827,
|
|
"grad_norm": 0.7533831596374512,
|
|
"learning_rate": 9.338070524641329e-06,
|
|
"loss": 0.22155873775482177,
|
|
"memory(GiB)": 31.84,
|
|
"step": 245,
|
|
"token_acc": 0.9251282051282051,
|
|
"train_speed(iter/s)": 0.114158
|
|
},
|
|
{
|
|
"epoch": 0.5067781578613961,
|
|
"grad_norm": 0.6942651271820068,
|
|
"learning_rate": 9.3114213614309e-06,
|
|
"loss": 0.203749680519104,
|
|
"memory(GiB)": 31.84,
|
|
"step": 250,
|
|
"token_acc": 0.9276428102429416,
|
|
"train_speed(iter/s)": 0.114514
|
|
},
|
|
{
|
|
"epoch": 0.5169137210186241,
|
|
"grad_norm": 0.7373226284980774,
|
|
"learning_rate": 9.284285880837947e-06,
|
|
"loss": 0.204945707321167,
|
|
"memory(GiB)": 31.84,
|
|
"step": 255,
|
|
"token_acc": 0.9326470962002795,
|
|
"train_speed(iter/s)": 0.114981
|
|
},
|
|
{
|
|
"epoch": 0.5270492841758521,
|
|
"grad_norm": 0.7045229077339172,
|
|
"learning_rate": 9.256667143675789e-06,
|
|
"loss": 0.21190428733825684,
|
|
"memory(GiB)": 31.84,
|
|
"step": 260,
|
|
"token_acc": 0.9289341566160771,
|
|
"train_speed(iter/s)": 0.115383
|
|
},
|
|
{
|
|
"epoch": 0.5270492841758521,
|
|
"eval_loss": 0.21087971329689026,
|
|
"eval_runtime": 21.8346,
|
|
"eval_samples_per_second": 14.564,
|
|
"eval_steps_per_second": 3.664,
|
|
"eval_token_acc": 0.9285427588719488,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.5371848473330799,
|
|
"grad_norm": 0.6742910742759705,
|
|
"learning_rate": 9.228568265267845e-06,
|
|
"loss": 0.20208969116210937,
|
|
"memory(GiB)": 31.84,
|
|
"step": 265,
|
|
"token_acc": 0.9311459533033191,
|
|
"train_speed(iter/s)": 0.11409
|
|
},
|
|
{
|
|
"epoch": 0.5473204104903079,
|
|
"grad_norm": 0.7655450701713562,
|
|
"learning_rate": 9.199992415096261e-06,
|
|
"loss": 0.22049922943115235,
|
|
"memory(GiB)": 31.84,
|
|
"step": 270,
|
|
"token_acc": 0.9257197105762817,
|
|
"train_speed(iter/s)": 0.114477
|
|
},
|
|
{
|
|
"epoch": 0.5574559736475357,
|
|
"grad_norm": 0.7306473255157471,
|
|
"learning_rate": 9.170942816444376e-06,
|
|
"loss": 0.21227545738220216,
|
|
"memory(GiB)": 31.84,
|
|
"step": 275,
|
|
"token_acc": 0.9255821189648914,
|
|
"train_speed(iter/s)": 0.114761
|
|
},
|
|
{
|
|
"epoch": 0.5675915368047637,
|
|
"grad_norm": 0.6912848353385925,
|
|
"learning_rate": 9.141422746033158e-06,
|
|
"loss": 0.2025892972946167,
|
|
"memory(GiB)": 31.84,
|
|
"step": 280,
|
|
"token_acc": 0.9356907206914177,
|
|
"train_speed(iter/s)": 0.115104
|
|
},
|
|
{
|
|
"epoch": 0.5675915368047637,
|
|
"eval_loss": 0.21015885472297668,
|
|
"eval_runtime": 21.7959,
|
|
"eval_samples_per_second": 14.59,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.9283204584329054,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.5777270999619917,
|
|
"grad_norm": 0.6503239274024963,
|
|
"learning_rate": 9.111435533651595e-06,
|
|
"loss": 0.20708324909210205,
|
|
"memory(GiB)": 31.84,
|
|
"step": 285,
|
|
"token_acc": 0.9302453853948871,
|
|
"train_speed(iter/s)": 0.11406
|
|
},
|
|
{
|
|
"epoch": 0.5878626631192195,
|
|
"grad_norm": 0.6230775117874146,
|
|
"learning_rate": 9.08098456178111e-06,
|
|
"loss": 0.21010513305664064,
|
|
"memory(GiB)": 31.84,
|
|
"step": 290,
|
|
"token_acc": 0.9251024310297733,
|
|
"train_speed(iter/s)": 0.114468
|
|
},
|
|
{
|
|
"epoch": 0.5979982262764475,
|
|
"grad_norm": 0.5951421856880188,
|
|
"learning_rate": 9.050073265214006e-06,
|
|
"loss": 0.20655345916748047,
|
|
"memory(GiB)": 31.84,
|
|
"step": 295,
|
|
"token_acc": 0.9383279364144834,
|
|
"train_speed(iter/s)": 0.114773
|
|
},
|
|
{
|
|
"epoch": 0.6081337894336754,
|
|
"grad_norm": 0.6480836868286133,
|
|
"learning_rate": 9.01870513066605e-06,
|
|
"loss": 0.19019179344177245,
|
|
"memory(GiB)": 31.84,
|
|
"step": 300,
|
|
"token_acc": 0.9367692102809961,
|
|
"train_speed(iter/s)": 0.115106
|
|
},
|
|
{
|
|
"epoch": 0.6081337894336754,
|
|
"eval_loss": 0.20902292430400848,
|
|
"eval_runtime": 21.7988,
|
|
"eval_samples_per_second": 14.588,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.9290552848841876,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.6182693525909033,
|
|
"grad_norm": 0.5634115934371948,
|
|
"learning_rate": 8.986883696383174e-06,
|
|
"loss": 0.18940932750701905,
|
|
"memory(GiB)": 31.84,
|
|
"step": 305,
|
|
"token_acc": 0.9292788671890537,
|
|
"train_speed(iter/s)": 0.114033
|
|
},
|
|
{
|
|
"epoch": 0.6284049157481313,
|
|
"grad_norm": 0.7795764803886414,
|
|
"learning_rate": 8.95461255174237e-06,
|
|
"loss": 0.21010336875915528,
|
|
"memory(GiB)": 31.84,
|
|
"step": 310,
|
|
"token_acc": 0.922672088989893,
|
|
"train_speed(iter/s)": 0.114389
|
|
},
|
|
{
|
|
"epoch": 0.6385404789053591,
|
|
"grad_norm": 0.6814590692520142,
|
|
"learning_rate": 8.921895336846814e-06,
|
|
"loss": 0.21159787178039552,
|
|
"memory(GiB)": 31.84,
|
|
"step": 315,
|
|
"token_acc": 0.9389688078621279,
|
|
"train_speed(iter/s)": 0.114683
|
|
},
|
|
{
|
|
"epoch": 0.6486760420625871,
|
|
"grad_norm": 0.6690118908882141,
|
|
"learning_rate": 8.888735742115268e-06,
|
|
"loss": 0.1964055299758911,
|
|
"memory(GiB)": 31.84,
|
|
"step": 320,
|
|
"token_acc": 0.9258384996498146,
|
|
"train_speed(iter/s)": 0.114995
|
|
},
|
|
{
|
|
"epoch": 0.6486760420625871,
|
|
"eval_loss": 0.20818451046943665,
|
|
"eval_runtime": 21.7939,
|
|
"eval_samples_per_second": 14.591,
|
|
"eval_steps_per_second": 3.671,
|
|
"eval_token_acc": 0.929296110359818,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.6588116052198151,
|
|
"grad_norm": 0.6912645697593689,
|
|
"learning_rate": 8.855137507865831e-06,
|
|
"loss": 0.19853589534759522,
|
|
"memory(GiB)": 31.84,
|
|
"step": 325,
|
|
"token_acc": 0.9269768893593345,
|
|
"train_speed(iter/s)": 0.114013
|
|
},
|
|
{
|
|
"epoch": 0.6689471683770429,
|
|
"grad_norm": 0.6451709866523743,
|
|
"learning_rate": 8.821104423894015e-06,
|
|
"loss": 0.20868840217590331,
|
|
"memory(GiB)": 31.84,
|
|
"step": 330,
|
|
"token_acc": 0.9279026926085749,
|
|
"train_speed(iter/s)": 0.114452
|
|
},
|
|
{
|
|
"epoch": 0.6790827315342709,
|
|
"grad_norm": 0.6472577452659607,
|
|
"learning_rate": 8.786640329045279e-06,
|
|
"loss": 0.20419092178344728,
|
|
"memory(GiB)": 31.84,
|
|
"step": 335,
|
|
"token_acc": 0.9282166508987701,
|
|
"train_speed(iter/s)": 0.114692
|
|
},
|
|
{
|
|
"epoch": 0.6892182946914988,
|
|
"grad_norm": 0.6527739763259888,
|
|
"learning_rate": 8.751749110782013e-06,
|
|
"loss": 0.20285537242889404,
|
|
"memory(GiB)": 31.84,
|
|
"step": 340,
|
|
"token_acc": 0.9387946219536667,
|
|
"train_speed(iter/s)": 0.114992
|
|
},
|
|
{
|
|
"epoch": 0.6892182946914988,
|
|
"eval_loss": 0.20785662531852722,
|
|
"eval_runtime": 21.816,
|
|
"eval_samples_per_second": 14.576,
|
|
"eval_steps_per_second": 3.667,
|
|
"eval_token_acc": 0.9291232100183398,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.6993538578487267,
|
|
"grad_norm": 0.7979351878166199,
|
|
"learning_rate": 8.716434704745047e-06,
|
|
"loss": 0.21715948581695557,
|
|
"memory(GiB)": 31.84,
|
|
"step": 345,
|
|
"token_acc": 0.9273517017828201,
|
|
"train_speed(iter/s)": 0.114123
|
|
},
|
|
{
|
|
"epoch": 0.7094894210059547,
|
|
"grad_norm": 0.7549222707748413,
|
|
"learning_rate": 8.680701094309716e-06,
|
|
"loss": 0.19147870540618897,
|
|
"memory(GiB)": 31.84,
|
|
"step": 350,
|
|
"token_acc": 0.9354483972067961,
|
|
"train_speed(iter/s)": 0.114431
|
|
},
|
|
{
|
|
"epoch": 0.7196249841631825,
|
|
"grad_norm": 0.6661515831947327,
|
|
"learning_rate": 8.644552310136547e-06,
|
|
"loss": 0.19406793117523194,
|
|
"memory(GiB)": 31.84,
|
|
"step": 355,
|
|
"token_acc": 0.9296775620817432,
|
|
"train_speed(iter/s)": 0.114715
|
|
},
|
|
{
|
|
"epoch": 0.7297605473204105,
|
|
"grad_norm": 0.7383010983467102,
|
|
"learning_rate": 8.60799242971661e-06,
|
|
"loss": 0.19579734802246093,
|
|
"memory(GiB)": 31.84,
|
|
"step": 360,
|
|
"token_acc": 0.92644679160905,
|
|
"train_speed(iter/s)": 0.115022
|
|
},
|
|
{
|
|
"epoch": 0.7297605473204105,
|
|
"eval_loss": 0.20642724633216858,
|
|
"eval_runtime": 21.8294,
|
|
"eval_samples_per_second": 14.568,
|
|
"eval_steps_per_second": 3.665,
|
|
"eval_token_acc": 0.9297221862013177,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.7398961104776384,
|
|
"grad_norm": 0.763937771320343,
|
|
"learning_rate": 8.571025576911587e-06,
|
|
"loss": 0.19617555141448975,
|
|
"memory(GiB)": 31.84,
|
|
"step": 365,
|
|
"token_acc": 0.9303160729846056,
|
|
"train_speed(iter/s)": 0.114163
|
|
},
|
|
{
|
|
"epoch": 0.7500316736348663,
|
|
"grad_norm": 0.7166250348091125,
|
|
"learning_rate": 8.533655921488612e-06,
|
|
"loss": 0.21308059692382814,
|
|
"memory(GiB)": 31.84,
|
|
"step": 370,
|
|
"token_acc": 0.9230729085120818,
|
|
"train_speed(iter/s)": 0.114429
|
|
},
|
|
{
|
|
"epoch": 0.7601672367920943,
|
|
"grad_norm": 0.765160858631134,
|
|
"learning_rate": 8.495887678649933e-06,
|
|
"loss": 0.2089679002761841,
|
|
"memory(GiB)": 31.84,
|
|
"step": 375,
|
|
"token_acc": 0.9291193218752438,
|
|
"train_speed(iter/s)": 0.114729
|
|
},
|
|
{
|
|
"epoch": 0.7703027999493222,
|
|
"grad_norm": 0.6891535520553589,
|
|
"learning_rate": 8.457725108557447e-06,
|
|
"loss": 0.20386552810668945,
|
|
"memory(GiB)": 31.84,
|
|
"step": 380,
|
|
"token_acc": 0.930313113680995,
|
|
"train_speed(iter/s)": 0.114967
|
|
},
|
|
{
|
|
"epoch": 0.7703027999493222,
|
|
"eval_loss": 0.20559069514274597,
|
|
"eval_runtime": 21.7942,
|
|
"eval_samples_per_second": 14.591,
|
|
"eval_steps_per_second": 3.671,
|
|
"eval_token_acc": 0.9297468862501004,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.7804383631065501,
|
|
"grad_norm": 0.7166603803634644,
|
|
"learning_rate": 8.41917251585216e-06,
|
|
"loss": 0.19558722972869874,
|
|
"memory(GiB)": 31.84,
|
|
"step": 385,
|
|
"token_acc": 0.9298326573239745,
|
|
"train_speed(iter/s)": 0.114165
|
|
},
|
|
{
|
|
"epoch": 0.790573926263778,
|
|
"grad_norm": 0.7598738074302673,
|
|
"learning_rate": 8.380234249168642e-06,
|
|
"loss": 0.1924947738647461,
|
|
"memory(GiB)": 31.84,
|
|
"step": 390,
|
|
"token_acc": 0.9257823458899742,
|
|
"train_speed(iter/s)": 0.11441
|
|
},
|
|
{
|
|
"epoch": 0.800709489421006,
|
|
"grad_norm": 0.7150676846504211,
|
|
"learning_rate": 8.340914700644507e-06,
|
|
"loss": 0.20020556449890137,
|
|
"memory(GiB)": 31.84,
|
|
"step": 395,
|
|
"token_acc": 0.9267301944153489,
|
|
"train_speed(iter/s)": 0.114769
|
|
},
|
|
{
|
|
"epoch": 0.8108450525782339,
|
|
"grad_norm": 0.6519859433174133,
|
|
"learning_rate": 8.301218305424994e-06,
|
|
"loss": 0.19768332242965697,
|
|
"memory(GiB)": 31.84,
|
|
"step": 400,
|
|
"token_acc": 0.923796935692706,
|
|
"train_speed(iter/s)": 0.115018
|
|
},
|
|
{
|
|
"epoch": 0.8108450525782339,
|
|
"eval_loss": 0.20375800132751465,
|
|
"eval_runtime": 21.8998,
|
|
"eval_samples_per_second": 14.521,
|
|
"eval_steps_per_second": 3.653,
|
|
"eval_token_acc": 0.9307348882014042,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.8209806157354618,
|
|
"grad_norm": 0.7148239612579346,
|
|
"learning_rate": 8.261149541162693e-06,
|
|
"loss": 0.20124292373657227,
|
|
"memory(GiB)": 31.84,
|
|
"step": 405,
|
|
"token_acc": 0.9281172612274948,
|
|
"train_speed(iter/s)": 0.114267
|
|
},
|
|
{
|
|
"epoch": 0.8311161788926897,
|
|
"grad_norm": 0.6164042353630066,
|
|
"learning_rate": 8.22071292751247e-06,
|
|
"loss": 0.18608367443084717,
|
|
"memory(GiB)": 31.84,
|
|
"step": 410,
|
|
"token_acc": 0.9256482433822335,
|
|
"train_speed(iter/s)": 0.11456
|
|
},
|
|
{
|
|
"epoch": 0.8412517420499176,
|
|
"grad_norm": 0.722022294998169,
|
|
"learning_rate": 8.179913025621676e-06,
|
|
"loss": 0.19683008193969725,
|
|
"memory(GiB)": 31.84,
|
|
"step": 415,
|
|
"token_acc": 0.9332604223628362,
|
|
"train_speed(iter/s)": 0.114887
|
|
},
|
|
{
|
|
"epoch": 0.8513873052071456,
|
|
"grad_norm": 0.6409715414047241,
|
|
"learning_rate": 8.138754437615652e-06,
|
|
"loss": 0.20776214599609374,
|
|
"memory(GiB)": 31.84,
|
|
"step": 420,
|
|
"token_acc": 0.9207910620264543,
|
|
"train_speed(iter/s)": 0.115145
|
|
},
|
|
{
|
|
"epoch": 0.8513873052071456,
|
|
"eval_loss": 0.202426478266716,
|
|
"eval_runtime": 21.7226,
|
|
"eval_samples_per_second": 14.639,
|
|
"eval_steps_per_second": 3.683,
|
|
"eval_token_acc": 0.9311918391038823,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.8615228683643735,
|
|
"grad_norm": 0.6910145282745361,
|
|
"learning_rate": 8.097241806078616e-06,
|
|
"loss": 0.2034536838531494,
|
|
"memory(GiB)": 31.84,
|
|
"step": 425,
|
|
"token_acc": 0.9296462086074857,
|
|
"train_speed(iter/s)": 0.114436
|
|
},
|
|
{
|
|
"epoch": 0.8716584315216014,
|
|
"grad_norm": 0.7244237661361694,
|
|
"learning_rate": 8.055379813530002e-06,
|
|
"loss": 0.20410799980163574,
|
|
"memory(GiB)": 31.84,
|
|
"step": 430,
|
|
"token_acc": 0.9354806441932579,
|
|
"train_speed(iter/s)": 0.114688
|
|
},
|
|
{
|
|
"epoch": 0.8817939946788294,
|
|
"grad_norm": 0.6427024602890015,
|
|
"learning_rate": 8.013173181896283e-06,
|
|
"loss": 0.1945526719093323,
|
|
"memory(GiB)": 31.84,
|
|
"step": 435,
|
|
"token_acc": 0.9243187942908474,
|
|
"train_speed(iter/s)": 0.114942
|
|
},
|
|
{
|
|
"epoch": 0.8919295578360573,
|
|
"grad_norm": 0.6878598928451538,
|
|
"learning_rate": 7.970626671978336e-06,
|
|
"loss": 0.19251216650009156,
|
|
"memory(GiB)": 31.84,
|
|
"step": 440,
|
|
"token_acc": 0.9332632422148793,
|
|
"train_speed(iter/s)": 0.115122
|
|
},
|
|
{
|
|
"epoch": 0.8919295578360573,
|
|
"eval_loss": 0.20254245400428772,
|
|
"eval_runtime": 21.8009,
|
|
"eval_samples_per_second": 14.587,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.9308954385184911,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.9020651209932852,
|
|
"grad_norm": 0.679250955581665,
|
|
"learning_rate": 7.927745082914453e-06,
|
|
"loss": 0.19701287746429444,
|
|
"memory(GiB)": 31.84,
|
|
"step": 445,
|
|
"token_acc": 0.9282149553325245,
|
|
"train_speed(iter/s)": 0.1144
|
|
},
|
|
{
|
|
"epoch": 0.9122006841505131,
|
|
"grad_norm": 0.7318421602249146,
|
|
"learning_rate": 7.884533251639e-06,
|
|
"loss": 0.1975053071975708,
|
|
"memory(GiB)": 31.84,
|
|
"step": 450,
|
|
"token_acc": 0.9288036746490503,
|
|
"train_speed(iter/s)": 0.114655
|
|
},
|
|
{
|
|
"epoch": 0.922336247307741,
|
|
"grad_norm": 0.7219140529632568,
|
|
"learning_rate": 7.840996052336827e-06,
|
|
"loss": 0.22328760623931884,
|
|
"memory(GiB)": 31.84,
|
|
"step": 455,
|
|
"token_acc": 0.9224597234555373,
|
|
"train_speed(iter/s)": 0.114914
|
|
},
|
|
{
|
|
"epoch": 0.932471810464969,
|
|
"grad_norm": 0.6490547060966492,
|
|
"learning_rate": 7.797138395893471e-06,
|
|
"loss": 0.20440030097961426,
|
|
"memory(GiB)": 31.84,
|
|
"step": 460,
|
|
"token_acc": 0.9310429568465546,
|
|
"train_speed(iter/s)": 0.115177
|
|
},
|
|
{
|
|
"epoch": 0.932471810464969,
|
|
"eval_loss": 0.2018972933292389,
|
|
"eval_runtime": 21.7035,
|
|
"eval_samples_per_second": 14.652,
|
|
"eval_steps_per_second": 3.686,
|
|
"eval_token_acc": 0.9313091643355995,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.9426073736221969,
|
|
"grad_norm": 0.6232012510299683,
|
|
"learning_rate": 7.75296522934122e-06,
|
|
"loss": 0.19165756702423095,
|
|
"memory(GiB)": 31.84,
|
|
"step": 465,
|
|
"token_acc": 0.932452931203712,
|
|
"train_speed(iter/s)": 0.114511
|
|
},
|
|
{
|
|
"epoch": 0.9527429367794248,
|
|
"grad_norm": 0.638760507106781,
|
|
"learning_rate": 7.708481535301101e-06,
|
|
"loss": 0.180339515209198,
|
|
"memory(GiB)": 31.84,
|
|
"step": 470,
|
|
"token_acc": 0.9387325860490436,
|
|
"train_speed(iter/s)": 0.114716
|
|
},
|
|
{
|
|
"epoch": 0.9628784999366528,
|
|
"grad_norm": 0.7850983142852783,
|
|
"learning_rate": 7.663692331420857e-06,
|
|
"loss": 0.20282835960388185,
|
|
"memory(GiB)": 31.84,
|
|
"step": 475,
|
|
"token_acc": 0.9294787942210982,
|
|
"train_speed(iter/s)": 0.114974
|
|
},
|
|
{
|
|
"epoch": 0.9730140630938806,
|
|
"grad_norm": 0.7070600390434265,
|
|
"learning_rate": 7.6186026698089584e-06,
|
|
"loss": 0.20282812118530275,
|
|
"memory(GiB)": 31.84,
|
|
"step": 480,
|
|
"token_acc": 0.9263814944005875,
|
|
"train_speed(iter/s)": 0.115173
|
|
},
|
|
{
|
|
"epoch": 0.9730140630938806,
|
|
"eval_loss": 0.2006615400314331,
|
|
"eval_runtime": 21.7993,
|
|
"eval_samples_per_second": 14.588,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.931636439981969,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.9831496262511086,
|
|
"grad_norm": 0.6108932495117188,
|
|
"learning_rate": 7.5732176364647515e-06,
|
|
"loss": 0.18763720989227295,
|
|
"memory(GiB)": 31.84,
|
|
"step": 485,
|
|
"token_acc": 0.9314041294819665,
|
|
"train_speed(iter/s)": 0.114551
|
|
},
|
|
{
|
|
"epoch": 0.9932851894083365,
|
|
"grad_norm": 0.7140085101127625,
|
|
"learning_rate": 7.527542350704759e-06,
|
|
"loss": 0.20563454627990724,
|
|
"memory(GiB)": 31.84,
|
|
"step": 490,
|
|
"token_acc": 0.9261781443826331,
|
|
"train_speed(iter/s)": 0.114801
|
|
},
|
|
{
|
|
"epoch": 1.0040542252628912,
|
|
"grad_norm": 0.6436912417411804,
|
|
"learning_rate": 7.481581964585245e-06,
|
|
"loss": 0.2002246856689453,
|
|
"memory(GiB)": 31.84,
|
|
"step": 495,
|
|
"token_acc": 0.9414402391501628,
|
|
"train_speed(iter/s)": 0.114919
|
|
},
|
|
{
|
|
"epoch": 1.0141897884201192,
|
|
"grad_norm": 0.6124146580696106,
|
|
"learning_rate": 7.435341662321063e-06,
|
|
"loss": 0.14853100776672362,
|
|
"memory(GiB)": 31.84,
|
|
"step": 500,
|
|
"token_acc": 0.9471966205837173,
|
|
"train_speed(iter/s)": 0.115165
|
|
},
|
|
{
|
|
"epoch": 1.0141897884201192,
|
|
"eval_loss": 0.20782382786273956,
|
|
"eval_runtime": 21.7236,
|
|
"eval_samples_per_second": 14.638,
|
|
"eval_steps_per_second": 3.683,
|
|
"eval_token_acc": 0.9316426149941646,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.024325351577347,
|
|
"grad_norm": 0.753773033618927,
|
|
"learning_rate": 7.388826659700902e-06,
|
|
"loss": 0.13870035409927367,
|
|
"memory(GiB)": 31.84,
|
|
"step": 505,
|
|
"token_acc": 0.9419344271245667,
|
|
"train_speed(iter/s)": 0.114525
|
|
},
|
|
{
|
|
"epoch": 1.034460914734575,
|
|
"grad_norm": 0.6324412226676941,
|
|
"learning_rate": 7.342042203498952e-06,
|
|
"loss": 0.14452635049819945,
|
|
"memory(GiB)": 31.84,
|
|
"step": 510,
|
|
"token_acc": 0.9496112148490845,
|
|
"train_speed(iter/s)": 0.114737
|
|
},
|
|
{
|
|
"epoch": 1.0445964778918029,
|
|
"grad_norm": 0.568839430809021,
|
|
"learning_rate": 7.2949935708830825e-06,
|
|
"loss": 0.13303987979888915,
|
|
"memory(GiB)": 31.84,
|
|
"step": 515,
|
|
"token_acc": 0.9554845690143747,
|
|
"train_speed(iter/s)": 0.114956
|
|
},
|
|
{
|
|
"epoch": 1.0547320410490308,
|
|
"grad_norm": 0.7018365859985352,
|
|
"learning_rate": 7.247686068819592e-06,
|
|
"loss": 0.1495302438735962,
|
|
"memory(GiB)": 31.84,
|
|
"step": 520,
|
|
"token_acc": 0.9394168981676347,
|
|
"train_speed(iter/s)": 0.115106
|
|
},
|
|
{
|
|
"epoch": 1.0547320410490308,
|
|
"eval_loss": 0.20550589263439178,
|
|
"eval_runtime": 21.8398,
|
|
"eval_samples_per_second": 14.561,
|
|
"eval_steps_per_second": 3.663,
|
|
"eval_token_acc": 0.9316549650185559,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.0648676042062588,
|
|
"grad_norm": 0.6878096461296082,
|
|
"learning_rate": 7.200125033474599e-06,
|
|
"loss": 0.1441951036453247,
|
|
"memory(GiB)": 31.84,
|
|
"step": 525,
|
|
"token_acc": 0.9410242560514515,
|
|
"train_speed(iter/s)": 0.114529
|
|
},
|
|
{
|
|
"epoch": 1.0750031673634866,
|
|
"grad_norm": 0.5951861143112183,
|
|
"learning_rate": 7.152315829612124e-06,
|
|
"loss": 0.141739821434021,
|
|
"memory(GiB)": 31.84,
|
|
"step": 530,
|
|
"token_acc": 0.9511068989817988,
|
|
"train_speed(iter/s)": 0.11475
|
|
},
|
|
{
|
|
"epoch": 1.0851387305207145,
|
|
"grad_norm": 0.7104572057723999,
|
|
"learning_rate": 7.104263849988976e-06,
|
|
"loss": 0.13958663940429689,
|
|
"memory(GiB)": 31.84,
|
|
"step": 535,
|
|
"token_acc": 0.9464745976287302,
|
|
"train_speed(iter/s)": 0.114952
|
|
},
|
|
{
|
|
"epoch": 1.0952742936779425,
|
|
"grad_norm": 0.7287134528160095,
|
|
"learning_rate": 7.055974514746446e-06,
|
|
"loss": 0.1433733582496643,
|
|
"memory(GiB)": 31.84,
|
|
"step": 540,
|
|
"token_acc": 0.9533915015281474,
|
|
"train_speed(iter/s)": 0.11514
|
|
},
|
|
{
|
|
"epoch": 1.0952742936779425,
|
|
"eval_loss": 0.20595994591712952,
|
|
"eval_runtime": 21.8259,
|
|
"eval_samples_per_second": 14.57,
|
|
"eval_steps_per_second": 3.665,
|
|
"eval_token_acc": 0.9320995658966427,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.1054098568351705,
|
|
"grad_norm": 0.7750824093818665,
|
|
"learning_rate": 7.007453270798937e-06,
|
|
"loss": 0.14028260707855225,
|
|
"memory(GiB)": 31.84,
|
|
"step": 545,
|
|
"token_acc": 0.9427210537187573,
|
|
"train_speed(iter/s)": 0.114614
|
|
},
|
|
{
|
|
"epoch": 1.1155454199923984,
|
|
"grad_norm": 0.6719576120376587,
|
|
"learning_rate": 6.95870559121957e-06,
|
|
"loss": 0.1443098783493042,
|
|
"memory(GiB)": 31.84,
|
|
"step": 550,
|
|
"token_acc": 0.9419209173821336,
|
|
"train_speed(iter/s)": 0.114811
|
|
},
|
|
{
|
|
"epoch": 1.1256809831496262,
|
|
"grad_norm": 0.6712535619735718,
|
|
"learning_rate": 6.909736974622827e-06,
|
|
"loss": 0.1412811279296875,
|
|
"memory(GiB)": 31.84,
|
|
"step": 555,
|
|
"token_acc": 0.9537117362746313,
|
|
"train_speed(iter/s)": 0.114963
|
|
},
|
|
{
|
|
"epoch": 1.1358165463068541,
|
|
"grad_norm": 0.7601253390312195,
|
|
"learning_rate": 6.860552944544325e-06,
|
|
"loss": 0.1436458110809326,
|
|
"memory(GiB)": 31.84,
|
|
"step": 560,
|
|
"token_acc": 0.9485312439436308,
|
|
"train_speed(iter/s)": 0.11514
|
|
},
|
|
{
|
|
"epoch": 1.1358165463068541,
|
|
"eval_loss": 0.20563021302223206,
|
|
"eval_runtime": 21.7336,
|
|
"eval_samples_per_second": 14.632,
|
|
"eval_steps_per_second": 3.681,
|
|
"eval_token_acc": 0.9317969902990558,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.145952109464082,
|
|
"grad_norm": 0.7179679870605469,
|
|
"learning_rate": 6.811159048817773e-06,
|
|
"loss": 0.1442504644393921,
|
|
"memory(GiB)": 31.84,
|
|
"step": 565,
|
|
"token_acc": 0.9380280917370789,
|
|
"train_speed(iter/s)": 0.114635
|
|
},
|
|
{
|
|
"epoch": 1.15608767262131,
|
|
"grad_norm": 0.7179620265960693,
|
|
"learning_rate": 6.7615608589491935e-06,
|
|
"loss": 0.13410391807556152,
|
|
"memory(GiB)": 31.84,
|
|
"step": 570,
|
|
"token_acc": 0.9558909767726473,
|
|
"train_speed(iter/s)": 0.114806
|
|
},
|
|
{
|
|
"epoch": 1.166223235778538,
|
|
"grad_norm": 0.6108378171920776,
|
|
"learning_rate": 6.711763969488472e-06,
|
|
"loss": 0.13897714614868165,
|
|
"memory(GiB)": 31.84,
|
|
"step": 575,
|
|
"token_acc": 0.9451023932889218,
|
|
"train_speed(iter/s)": 0.114992
|
|
},
|
|
{
|
|
"epoch": 1.1763587989357658,
|
|
"grad_norm": 0.6707165837287903,
|
|
"learning_rate": 6.6617739973982985e-06,
|
|
"loss": 0.14145419597625733,
|
|
"memory(GiB)": 31.84,
|
|
"step": 580,
|
|
"token_acc": 0.9474374323014652,
|
|
"train_speed(iter/s)": 0.115192
|
|
},
|
|
{
|
|
"epoch": 1.1763587989357658,
|
|
"eval_loss": 0.20474562048912048,
|
|
"eval_runtime": 21.8012,
|
|
"eval_samples_per_second": 14.586,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.9320069407137079,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.1864943620929937,
|
|
"grad_norm": 0.7210031747817993,
|
|
"learning_rate": 6.6115965814206e-06,
|
|
"loss": 0.1450878858566284,
|
|
"memory(GiB)": 31.84,
|
|
"step": 585,
|
|
"token_acc": 0.9400503909192447,
|
|
"train_speed(iter/s)": 0.114706
|
|
},
|
|
{
|
|
"epoch": 1.1966299252502217,
|
|
"grad_norm": 0.6648117899894714,
|
|
"learning_rate": 6.561237381440491e-06,
|
|
"loss": 0.1425154447555542,
|
|
"memory(GiB)": 31.84,
|
|
"step": 590,
|
|
"token_acc": 0.9425449072407813,
|
|
"train_speed(iter/s)": 0.114844
|
|
},
|
|
{
|
|
"epoch": 1.2067654884074497,
|
|
"grad_norm": 0.7065860629081726,
|
|
"learning_rate": 6.510702077847864e-06,
|
|
"loss": 0.14567887783050537,
|
|
"memory(GiB)": 31.84,
|
|
"step": 595,
|
|
"token_acc": 0.9487148690847946,
|
|
"train_speed(iter/s)": 0.115008
|
|
},
|
|
{
|
|
"epoch": 1.2169010515646776,
|
|
"grad_norm": 0.6811427474021912,
|
|
"learning_rate": 6.459996370896653e-06,
|
|
"loss": 0.1396160125732422,
|
|
"memory(GiB)": 31.84,
|
|
"step": 600,
|
|
"token_acc": 0.9510217472815898,
|
|
"train_speed(iter/s)": 0.11519
|
|
},
|
|
{
|
|
"epoch": 1.2169010515646776,
|
|
"eval_loss": 0.204533189535141,
|
|
"eval_runtime": 21.8513,
|
|
"eval_samples_per_second": 14.553,
|
|
"eval_steps_per_second": 3.661,
|
|
"eval_token_acc": 0.9319143155307732,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.2270366147219054,
|
|
"grad_norm": 0.6900855302810669,
|
|
"learning_rate": 6.409125980061852e-06,
|
|
"loss": 0.15422054529190063,
|
|
"memory(GiB)": 31.84,
|
|
"step": 605,
|
|
"token_acc": 0.938386855862584,
|
|
"train_speed(iter/s)": 0.114682
|
|
},
|
|
{
|
|
"epoch": 1.2371721778791334,
|
|
"grad_norm": 0.7017642259597778,
|
|
"learning_rate": 6.358096643394387e-06,
|
|
"loss": 0.14031555652618408,
|
|
"memory(GiB)": 31.84,
|
|
"step": 610,
|
|
"token_acc": 0.9535683058461724,
|
|
"train_speed(iter/s)": 0.114824
|
|
},
|
|
{
|
|
"epoch": 1.2473077410363613,
|
|
"grad_norm": 0.6713739037513733,
|
|
"learning_rate": 6.306914116873863e-06,
|
|
"loss": 0.14935390949249266,
|
|
"memory(GiB)": 31.84,
|
|
"step": 615,
|
|
"token_acc": 0.9406998158379374,
|
|
"train_speed(iter/s)": 0.114993
|
|
},
|
|
{
|
|
"epoch": 1.2574433041935893,
|
|
"grad_norm": 0.6819109916687012,
|
|
"learning_rate": 6.255584173759319e-06,
|
|
"loss": 0.1377212643623352,
|
|
"memory(GiB)": 31.84,
|
|
"step": 620,
|
|
"token_acc": 0.9501051424201874,
|
|
"train_speed(iter/s)": 0.115142
|
|
},
|
|
{
|
|
"epoch": 1.2574433041935893,
|
|
"eval_loss": 0.2050146609544754,
|
|
"eval_runtime": 21.8467,
|
|
"eval_samples_per_second": 14.556,
|
|
"eval_steps_per_second": 3.662,
|
|
"eval_token_acc": 0.9317105401283168,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.2675788673508173,
|
|
"grad_norm": 0.7029967308044434,
|
|
"learning_rate": 6.2041126039380065e-06,
|
|
"loss": 0.14089449644088745,
|
|
"memory(GiB)": 31.84,
|
|
"step": 625,
|
|
"token_acc": 0.9445310259703018,
|
|
"train_speed(iter/s)": 0.114661
|
|
},
|
|
{
|
|
"epoch": 1.2777144305080452,
|
|
"grad_norm": 0.7382521629333496,
|
|
"learning_rate": 6.152505213272308e-06,
|
|
"loss": 0.15079411268234252,
|
|
"memory(GiB)": 31.84,
|
|
"step": 630,
|
|
"token_acc": 0.9498072379057331,
|
|
"train_speed(iter/s)": 0.114839
|
|
},
|
|
{
|
|
"epoch": 1.287849993665273,
|
|
"grad_norm": 0.7125928997993469,
|
|
"learning_rate": 6.100767822944856e-06,
|
|
"loss": 0.13632259368896485,
|
|
"memory(GiB)": 31.84,
|
|
"step": 635,
|
|
"token_acc": 0.9514033264033264,
|
|
"train_speed(iter/s)": 0.115011
|
|
},
|
|
{
|
|
"epoch": 1.297985556822501,
|
|
"grad_norm": 0.7053549885749817,
|
|
"learning_rate": 6.048906268801915e-06,
|
|
"loss": 0.1401703953742981,
|
|
"memory(GiB)": 31.84,
|
|
"step": 640,
|
|
"token_acc": 0.9504383760540571,
|
|
"train_speed(iter/s)": 0.115182
|
|
},
|
|
{
|
|
"epoch": 1.297985556822501,
|
|
"eval_loss": 0.20237460732460022,
|
|
"eval_runtime": 21.7848,
|
|
"eval_samples_per_second": 14.597,
|
|
"eval_steps_per_second": 3.672,
|
|
"eval_token_acc": 0.9321427909820121,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.308121119979729,
|
|
"grad_norm": 0.7114176750183105,
|
|
"learning_rate": 5.9969264006951135e-06,
|
|
"loss": 0.1354852318763733,
|
|
"memory(GiB)": 31.84,
|
|
"step": 645,
|
|
"token_acc": 0.9398549466401315,
|
|
"train_speed(iter/s)": 0.114762
|
|
},
|
|
{
|
|
"epoch": 1.3182566831369569,
|
|
"grad_norm": 0.6910070776939392,
|
|
"learning_rate": 5.944834081821589e-06,
|
|
"loss": 0.1504984140396118,
|
|
"memory(GiB)": 31.84,
|
|
"step": 650,
|
|
"token_acc": 0.946494742867472,
|
|
"train_speed(iter/s)": 0.114975
|
|
},
|
|
{
|
|
"epoch": 1.3283922462941846,
|
|
"grad_norm": 0.7024128437042236,
|
|
"learning_rate": 5.892635188062647e-06,
|
|
"loss": 0.1456763982772827,
|
|
"memory(GiB)": 31.84,
|
|
"step": 655,
|
|
"token_acc": 0.9487756192189828,
|
|
"train_speed(iter/s)": 0.115131
|
|
},
|
|
{
|
|
"epoch": 1.3385278094514126,
|
|
"grad_norm": 0.6766383647918701,
|
|
"learning_rate": 5.8403356073209636e-06,
|
|
"loss": 0.13979326486587523,
|
|
"memory(GiB)": 31.84,
|
|
"step": 660,
|
|
"token_acc": 0.9429099638530946,
|
|
"train_speed(iter/s)": 0.115294
|
|
},
|
|
{
|
|
"epoch": 1.3385278094514126,
|
|
"eval_loss": 0.2033311426639557,
|
|
"eval_runtime": 21.7863,
|
|
"eval_samples_per_second": 14.596,
|
|
"eval_steps_per_second": 3.672,
|
|
"eval_token_acc": 0.9320625158234688,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.3486633726086406,
|
|
"grad_norm": 0.704328179359436,
|
|
"learning_rate": 5.787941238856456e-06,
|
|
"loss": 0.14371912479400634,
|
|
"memory(GiB)": 31.84,
|
|
"step": 665,
|
|
"token_acc": 0.9368773059543715,
|
|
"train_speed(iter/s)": 0.114861
|
|
},
|
|
{
|
|
"epoch": 1.3587989357658685,
|
|
"grad_norm": 0.6802431344985962,
|
|
"learning_rate": 5.735457992620851e-06,
|
|
"loss": 0.1401059865951538,
|
|
"memory(GiB)": 31.84,
|
|
"step": 670,
|
|
"token_acc": 0.9508861724428956,
|
|
"train_speed(iter/s)": 0.115032
|
|
},
|
|
{
|
|
"epoch": 1.3689344989230965,
|
|
"grad_norm": 0.6354742646217346,
|
|
"learning_rate": 5.682891788591066e-06,
|
|
"loss": 0.13931301832199097,
|
|
"memory(GiB)": 31.84,
|
|
"step": 675,
|
|
"token_acc": 0.9524576521435666,
|
|
"train_speed(iter/s)": 0.115144
|
|
},
|
|
{
|
|
"epoch": 1.3790700620803245,
|
|
"grad_norm": 0.6422320604324341,
|
|
"learning_rate": 5.630248556101448e-06,
|
|
"loss": 0.14210424423217774,
|
|
"memory(GiB)": 31.84,
|
|
"step": 680,
|
|
"token_acc": 0.9509293240203622,
|
|
"train_speed(iter/s)": 0.115296
|
|
},
|
|
{
|
|
"epoch": 1.3790700620803245,
|
|
"eval_loss": 0.20212741196155548,
|
|
"eval_runtime": 21.7681,
|
|
"eval_samples_per_second": 14.609,
|
|
"eval_steps_per_second": 3.675,
|
|
"eval_token_acc": 0.9325071167015555,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.3892056252375522,
|
|
"grad_norm": 0.629282534122467,
|
|
"learning_rate": 5.5775342331749525e-06,
|
|
"loss": 0.13471572399139403,
|
|
"memory(GiB)": 31.84,
|
|
"step": 685,
|
|
"token_acc": 0.9387646219305616,
|
|
"train_speed(iter/s)": 0.11486
|
|
},
|
|
{
|
|
"epoch": 1.3993411883947802,
|
|
"grad_norm": 0.6799649000167847,
|
|
"learning_rate": 5.5247547658533604e-06,
|
|
"loss": 0.146738600730896,
|
|
"memory(GiB)": 31.84,
|
|
"step": 690,
|
|
"token_acc": 0.9462739584104196,
|
|
"train_speed(iter/s)": 0.11501
|
|
},
|
|
{
|
|
"epoch": 1.4094767515520081,
|
|
"grad_norm": 0.7268623113632202,
|
|
"learning_rate": 5.471916107526577e-06,
|
|
"loss": 0.14002842903137208,
|
|
"memory(GiB)": 31.84,
|
|
"step": 695,
|
|
"token_acc": 0.9498395231545163,
|
|
"train_speed(iter/s)": 0.115121
|
|
},
|
|
{
|
|
"epoch": 1.419612314709236,
|
|
"grad_norm": 0.6405200362205505,
|
|
"learning_rate": 5.419024218261098e-06,
|
|
"loss": 0.1415479898452759,
|
|
"memory(GiB)": 31.84,
|
|
"step": 700,
|
|
"token_acc": 0.9470242347639721,
|
|
"train_speed(iter/s)": 0.11524
|
|
},
|
|
{
|
|
"epoch": 1.419612314709236,
|
|
"eval_loss": 0.20304131507873535,
|
|
"eval_runtime": 21.7388,
|
|
"eval_samples_per_second": 14.628,
|
|
"eval_steps_per_second": 3.68,
|
|
"eval_token_acc": 0.9326059168966858,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.4297478778664638,
|
|
"grad_norm": 0.7136779427528381,
|
|
"learning_rate": 5.366085064127734e-06,
|
|
"loss": 0.14102948904037477,
|
|
"memory(GiB)": 31.84,
|
|
"step": 705,
|
|
"token_acc": 0.9402442511130172,
|
|
"train_speed(iter/s)": 0.114822
|
|
},
|
|
{
|
|
"epoch": 1.4398834410236918,
|
|
"grad_norm": 0.7035164833068848,
|
|
"learning_rate": 5.313104616528656e-06,
|
|
"loss": 0.15270428657531737,
|
|
"memory(GiB)": 31.84,
|
|
"step": 710,
|
|
"token_acc": 0.9444076404990615,
|
|
"train_speed(iter/s)": 0.114935
|
|
},
|
|
{
|
|
"epoch": 1.4500190041809198,
|
|
"grad_norm": 0.7185565233230591,
|
|
"learning_rate": 5.260088851523833e-06,
|
|
"loss": 0.14676375389099122,
|
|
"memory(GiB)": 31.84,
|
|
"step": 715,
|
|
"token_acc": 0.9527322682405617,
|
|
"train_speed(iter/s)": 0.11512
|
|
},
|
|
{
|
|
"epoch": 1.4601545673381477,
|
|
"grad_norm": 0.712411105632782,
|
|
"learning_rate": 5.207043749156945e-06,
|
|
"loss": 0.13786702156066893,
|
|
"memory(GiB)": 31.84,
|
|
"step": 720,
|
|
"token_acc": 0.9517344563892275,
|
|
"train_speed(iter/s)": 0.115272
|
|
},
|
|
{
|
|
"epoch": 1.4601545673381477,
|
|
"eval_loss": 0.20220014452934265,
|
|
"eval_runtime": 21.8005,
|
|
"eval_samples_per_second": 14.587,
|
|
"eval_steps_per_second": 3.67,
|
|
"eval_token_acc": 0.9331431429577074,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.4702901304953757,
|
|
"grad_norm": 0.7098857760429382,
|
|
"learning_rate": 5.153975292780852e-06,
|
|
"loss": 0.13870218992233277,
|
|
"memory(GiB)": 31.84,
|
|
"step": 725,
|
|
"token_acc": 0.9459776516453177,
|
|
"train_speed(iter/s)": 0.114836
|
|
},
|
|
{
|
|
"epoch": 1.4804256936526037,
|
|
"grad_norm": 0.6736304759979248,
|
|
"learning_rate": 5.10088946838269e-06,
|
|
"loss": 0.15405884981155396,
|
|
"memory(GiB)": 31.84,
|
|
"step": 730,
|
|
"token_acc": 0.943327239488117,
|
|
"train_speed(iter/s)": 0.115018
|
|
},
|
|
{
|
|
"epoch": 1.4905612568098314,
|
|
"grad_norm": 0.6198113560676575,
|
|
"learning_rate": 5.0477922639086594e-06,
|
|
"loss": 0.12916960716247558,
|
|
"memory(GiB)": 31.84,
|
|
"step": 735,
|
|
"token_acc": 0.9533303842264914,
|
|
"train_speed(iter/s)": 0.115172
|
|
},
|
|
{
|
|
"epoch": 1.5006968199670594,
|
|
"grad_norm": 0.6400096416473389,
|
|
"learning_rate": 4.99468966858861e-06,
|
|
"loss": 0.13272554874420167,
|
|
"memory(GiB)": 31.84,
|
|
"step": 740,
|
|
"token_acc": 0.9574656362456492,
|
|
"train_speed(iter/s)": 0.115287
|
|
},
|
|
{
|
|
"epoch": 1.5006968199670594,
|
|
"eval_loss": 0.20150884985923767,
|
|
"eval_runtime": 21.7413,
|
|
"eval_samples_per_second": 14.627,
|
|
"eval_steps_per_second": 3.68,
|
|
"eval_token_acc": 0.9324885916649686,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.5108323831242874,
|
|
"grad_norm": 0.6690772771835327,
|
|
"learning_rate": 4.941587672260461e-06,
|
|
"loss": 0.1432182550430298,
|
|
"memory(GiB)": 31.84,
|
|
"step": 745,
|
|
"token_acc": 0.9370182365106223,
|
|
"train_speed(iter/s)": 0.114917
|
|
},
|
|
{
|
|
"epoch": 1.520967946281515,
|
|
"grad_norm": 0.6275672912597656,
|
|
"learning_rate": 4.888492264694566e-06,
|
|
"loss": 0.1296250343322754,
|
|
"memory(GiB)": 31.84,
|
|
"step": 750,
|
|
"token_acc": 0.9563844971453667,
|
|
"train_speed(iter/s)": 0.115024
|
|
},
|
|
{
|
|
"epoch": 1.531103509438743,
|
|
"grad_norm": 0.7383422255516052,
|
|
"learning_rate": 4.8354094349180885e-06,
|
|
"loss": 0.13504064083099365,
|
|
"memory(GiB)": 31.84,
|
|
"step": 755,
|
|
"token_acc": 0.951349361424971,
|
|
"train_speed(iter/s)": 0.115159
|
|
},
|
|
{
|
|
"epoch": 1.541239072595971,
|
|
"grad_norm": 0.7565678358078003,
|
|
"learning_rate": 4.782345170539441e-06,
|
|
"loss": 0.13883064985275267,
|
|
"memory(GiB)": 31.84,
|
|
"step": 760,
|
|
"token_acc": 0.9438551468491588,
|
|
"train_speed(iter/s)": 0.115291
|
|
},
|
|
{
|
|
"epoch": 1.541239072595971,
|
|
"eval_loss": 0.20081333816051483,
|
|
"eval_runtime": 21.7245,
|
|
"eval_samples_per_second": 14.638,
|
|
"eval_steps_per_second": 3.682,
|
|
"eval_token_acc": 0.9331740180186856,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.551374635753199,
|
|
"grad_norm": 0.7568323016166687,
|
|
"learning_rate": 4.729305457072913e-06,
|
|
"loss": 0.14828368425369262,
|
|
"memory(GiB)": 31.84,
|
|
"step": 765,
|
|
"token_acc": 0.9404283971948368,
|
|
"train_speed(iter/s)": 0.114963
|
|
},
|
|
{
|
|
"epoch": 1.561510198910427,
|
|
"grad_norm": 0.6327131390571594,
|
|
"learning_rate": 4.676296277263513e-06,
|
|
"loss": 0.13124030828475952,
|
|
"memory(GiB)": 31.84,
|
|
"step": 770,
|
|
"token_acc": 0.948938511889246,
|
|
"train_speed(iter/s)": 0.115091
|
|
},
|
|
{
|
|
"epoch": 1.571645762067655,
|
|
"grad_norm": 0.6433929204940796,
|
|
"learning_rate": 4.6233236104121266e-06,
|
|
"loss": 0.134158194065094,
|
|
"memory(GiB)": 31.84,
|
|
"step": 775,
|
|
"token_acc": 0.952758534346048,
|
|
"train_speed(iter/s)": 0.115205
|
|
},
|
|
{
|
|
"epoch": 1.581781325224883,
|
|
"grad_norm": 0.6848548054695129,
|
|
"learning_rate": 4.570393431701074e-06,
|
|
"loss": 0.14014556407928466,
|
|
"memory(GiB)": 31.84,
|
|
"step": 780,
|
|
"token_acc": 0.9504519678850148,
|
|
"train_speed(iter/s)": 0.115355
|
|
},
|
|
{
|
|
"epoch": 1.581781325224883,
|
|
"eval_loss": 0.20110902190208435,
|
|
"eval_runtime": 21.6337,
|
|
"eval_samples_per_second": 14.699,
|
|
"eval_steps_per_second": 3.698,
|
|
"eval_token_acc": 0.9332048930796638,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.5919168883821109,
|
|
"grad_norm": 0.7065548896789551,
|
|
"learning_rate": 4.517511711520121e-06,
|
|
"loss": 0.1434975743293762,
|
|
"memory(GiB)": 31.84,
|
|
"step": 785,
|
|
"token_acc": 0.9380875156072849,
|
|
"train_speed(iter/s)": 0.114991
|
|
},
|
|
{
|
|
"epoch": 1.6020524515393386,
|
|
"grad_norm": 0.6895316243171692,
|
|
"learning_rate": 4.46468441479303e-06,
|
|
"loss": 0.14089118242263793,
|
|
"memory(GiB)": 31.84,
|
|
"step": 790,
|
|
"token_acc": 0.9526078624333143,
|
|
"train_speed(iter/s)": 0.115135
|
|
},
|
|
{
|
|
"epoch": 1.6121880146965666,
|
|
"grad_norm": 0.6403608918190002,
|
|
"learning_rate": 4.411917500304741e-06,
|
|
"loss": 0.13246408700942994,
|
|
"memory(GiB)": 31.84,
|
|
"step": 795,
|
|
"token_acc": 0.9556548295096035,
|
|
"train_speed(iter/s)": 0.11525
|
|
},
|
|
{
|
|
"epoch": 1.6223235778537946,
|
|
"grad_norm": 0.7208017706871033,
|
|
"learning_rate": 4.359216920029227e-06,
|
|
"loss": 0.1482730746269226,
|
|
"memory(GiB)": 31.84,
|
|
"step": 800,
|
|
"token_acc": 0.9493780648247817,
|
|
"train_speed(iter/s)": 0.115389
|
|
},
|
|
{
|
|
"epoch": 1.6223235778537946,
|
|
"eval_loss": 0.19986069202423096,
|
|
"eval_runtime": 21.7303,
|
|
"eval_samples_per_second": 14.634,
|
|
"eval_steps_per_second": 3.681,
|
|
"eval_token_acc": 0.9332419431528377,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.6324591410110223,
|
|
"grad_norm": 0.7288877964019775,
|
|
"learning_rate": 4.306588618458134e-06,
|
|
"loss": 0.14321244955062867,
|
|
"memory(GiB)": 31.84,
|
|
"step": 805,
|
|
"token_acc": 0.9386183036920914,
|
|
"train_speed(iter/s)": 0.11503
|
|
},
|
|
{
|
|
"epoch": 1.6425947041682503,
|
|
"grad_norm": 0.7239351868629456,
|
|
"learning_rate": 4.254038531930253e-06,
|
|
"loss": 0.15064480304718017,
|
|
"memory(GiB)": 31.84,
|
|
"step": 810,
|
|
"token_acc": 0.9507855081756973,
|
|
"train_speed(iter/s)": 0.115159
|
|
},
|
|
{
|
|
"epoch": 1.6527302673254782,
|
|
"grad_norm": 0.6510659456253052,
|
|
"learning_rate": 4.201572587961911e-06,
|
|
"loss": 0.13342173099517823,
|
|
"memory(GiB)": 31.84,
|
|
"step": 815,
|
|
"token_acc": 0.9515407052300228,
|
|
"train_speed(iter/s)": 0.115262
|
|
},
|
|
{
|
|
"epoch": 1.6628658304827062,
|
|
"grad_norm": 0.76472008228302,
|
|
"learning_rate": 4.149196704578375e-06,
|
|
"loss": 0.14209251403808593,
|
|
"memory(GiB)": 31.84,
|
|
"step": 820,
|
|
"token_acc": 0.9562061456569079,
|
|
"train_speed(iter/s)": 0.115391
|
|
},
|
|
{
|
|
"epoch": 1.6628658304827062,
|
|
"eval_loss": 0.19960026443004608,
|
|
"eval_runtime": 21.6668,
|
|
"eval_samples_per_second": 14.677,
|
|
"eval_steps_per_second": 3.692,
|
|
"eval_token_acc": 0.9334951186528594,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.6730013936399342,
|
|
"grad_norm": 0.6803078055381775,
|
|
"learning_rate": 4.096916789646305e-06,
|
|
"loss": 0.1310012936592102,
|
|
"memory(GiB)": 31.84,
|
|
"step": 825,
|
|
"token_acc": 0.9461966384760458,
|
|
"train_speed(iter/s)": 0.115013
|
|
},
|
|
{
|
|
"epoch": 1.6831369567971621,
|
|
"grad_norm": 0.6794367432594299,
|
|
"learning_rate": 4.04473874020736e-06,
|
|
"loss": 0.14054393768310547,
|
|
"memory(GiB)": 31.84,
|
|
"step": 830,
|
|
"token_acc": 0.9499732318125037,
|
|
"train_speed(iter/s)": 0.115168
|
|
},
|
|
{
|
|
"epoch": 1.69327251995439,
|
|
"grad_norm": 0.6680985689163208,
|
|
"learning_rate": 3.992668441813036e-06,
|
|
"loss": 0.15022470951080322,
|
|
"memory(GiB)": 31.84,
|
|
"step": 835,
|
|
"token_acc": 0.9472336065573771,
|
|
"train_speed(iter/s)": 0.115303
|
|
},
|
|
{
|
|
"epoch": 1.7034080831116178,
|
|
"grad_norm": 0.707535982131958,
|
|
"learning_rate": 3.940711767860776e-06,
|
|
"loss": 0.14012532234191893,
|
|
"memory(GiB)": 31.84,
|
|
"step": 840,
|
|
"token_acc": 0.9501387137452711,
|
|
"train_speed(iter/s)": 0.115467
|
|
},
|
|
{
|
|
"epoch": 1.7034080831116178,
|
|
"eval_loss": 0.19959832727909088,
|
|
"eval_runtime": 21.6673,
|
|
"eval_samples_per_second": 14.676,
|
|
"eval_steps_per_second": 3.692,
|
|
"eval_token_acc": 0.9337853442260549,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.7135436462688458,
|
|
"grad_norm": 0.6823382377624512,
|
|
"learning_rate": 3.888874578931482e-06,
|
|
"loss": 0.14827988147735596,
|
|
"memory(GiB)": 31.84,
|
|
"step": 845,
|
|
"token_acc": 0.9361747509472786,
|
|
"train_speed(iter/s)": 0.115101
|
|
},
|
|
{
|
|
"epoch": 1.7236792094260738,
|
|
"grad_norm": 0.695568859577179,
|
|
"learning_rate": 3.8371627221284495e-06,
|
|
"loss": 0.14526791572570802,
|
|
"memory(GiB)": 31.84,
|
|
"step": 850,
|
|
"token_acc": 0.9438457505679502,
|
|
"train_speed(iter/s)": 0.115237
|
|
},
|
|
{
|
|
"epoch": 1.7338147725833015,
|
|
"grad_norm": 0.6451678276062012,
|
|
"learning_rate": 3.7855820304178202e-06,
|
|
"loss": 0.13392380475997925,
|
|
"memory(GiB)": 31.84,
|
|
"step": 855,
|
|
"token_acc": 0.9532031010915026,
|
|
"train_speed(iter/s)": 0.115329
|
|
},
|
|
{
|
|
"epoch": 1.7439503357405295,
|
|
"grad_norm": 0.6347934007644653,
|
|
"learning_rate": 3.7341383219706535e-06,
|
|
"loss": 0.137128746509552,
|
|
"memory(GiB)": 31.84,
|
|
"step": 860,
|
|
"token_acc": 0.9556541717810719,
|
|
"train_speed(iter/s)": 0.115435
|
|
},
|
|
{
|
|
"epoch": 1.7439503357405295,
|
|
"eval_loss": 0.19803106784820557,
|
|
"eval_runtime": 21.589,
|
|
"eval_samples_per_second": 14.73,
|
|
"eval_steps_per_second": 3.706,
|
|
"eval_token_acc": 0.9337729942016636,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.7540858988977575,
|
|
"grad_norm": 0.7886172533035278,
|
|
"learning_rate": 3.6828373995066434e-06,
|
|
"loss": 0.13904647827148436,
|
|
"memory(GiB)": 31.84,
|
|
"step": 865,
|
|
"token_acc": 0.9446668643732701,
|
|
"train_speed(iter/s)": 0.115085
|
|
},
|
|
{
|
|
"epoch": 1.7642214620549854,
|
|
"grad_norm": 0.6498300433158875,
|
|
"learning_rate": 3.6316850496395863e-06,
|
|
"loss": 0.1318161129951477,
|
|
"memory(GiB)": 31.84,
|
|
"step": 870,
|
|
"token_acc": 0.9486192100341072,
|
|
"train_speed(iter/s)": 0.115204
|
|
},
|
|
{
|
|
"epoch": 1.7743570252122134,
|
|
"grad_norm": 0.6739187240600586,
|
|
"learning_rate": 3.5806870422246675e-06,
|
|
"loss": 0.13507769107818604,
|
|
"memory(GiB)": 31.84,
|
|
"step": 875,
|
|
"token_acc": 0.9506144334128633,
|
|
"train_speed(iter/s)": 0.115288
|
|
},
|
|
{
|
|
"epoch": 1.7844925883694414,
|
|
"grad_norm": 0.6173581480979919,
|
|
"learning_rate": 3.5298491297076332e-06,
|
|
"loss": 0.1330319046974182,
|
|
"memory(GiB)": 31.84,
|
|
"step": 880,
|
|
"token_acc": 0.9496503128779513,
|
|
"train_speed(iter/s)": 0.115413
|
|
},
|
|
{
|
|
"epoch": 1.7844925883694414,
|
|
"eval_loss": 0.1983470916748047,
|
|
"eval_runtime": 21.6381,
|
|
"eval_samples_per_second": 14.696,
|
|
"eval_steps_per_second": 3.697,
|
|
"eval_token_acc": 0.9336927190431201,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.7946281515266693,
|
|
"grad_norm": 0.7845348715782166,
|
|
"learning_rate": 3.479177046475935e-06,
|
|
"loss": 0.14459047317504883,
|
|
"memory(GiB)": 31.84,
|
|
"step": 885,
|
|
"token_acc": 0.9400699857249334,
|
|
"train_speed(iter/s)": 0.115073
|
|
},
|
|
{
|
|
"epoch": 1.804763714683897,
|
|
"grad_norm": 0.677846372127533,
|
|
"learning_rate": 3.428676508211902e-06,
|
|
"loss": 0.1446584701538086,
|
|
"memory(GiB)": 31.84,
|
|
"step": 890,
|
|
"token_acc": 0.9527403560028134,
|
|
"train_speed(iter/s)": 0.115198
|
|
},
|
|
{
|
|
"epoch": 1.814899277841125,
|
|
"grad_norm": 0.6514244079589844,
|
|
"learning_rate": 3.3783532112480243e-06,
|
|
"loss": 0.14350442886352538,
|
|
"memory(GiB)": 31.84,
|
|
"step": 895,
|
|
"token_acc": 0.9489065777752235,
|
|
"train_speed(iter/s)": 0.115319
|
|
},
|
|
{
|
|
"epoch": 1.825034840998353,
|
|
"grad_norm": 0.6789301633834839,
|
|
"learning_rate": 3.328212831924424e-06,
|
|
"loss": 0.13438014984130858,
|
|
"memory(GiB)": 31.84,
|
|
"step": 900,
|
|
"token_acc": 0.9504659558391403,
|
|
"train_speed(iter/s)": 0.1154
|
|
},
|
|
{
|
|
"epoch": 1.825034840998353,
|
|
"eval_loss": 0.19804814457893372,
|
|
"eval_runtime": 21.7224,
|
|
"eval_samples_per_second": 14.639,
|
|
"eval_steps_per_second": 3.683,
|
|
"eval_token_acc": 0.9340385197260764,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.8351704041555807,
|
|
"grad_norm": 0.7078403830528259,
|
|
"learning_rate": 3.2782610259485816e-06,
|
|
"loss": 0.14103929996490477,
|
|
"memory(GiB)": 31.84,
|
|
"step": 905,
|
|
"token_acc": 0.9467157966221059,
|
|
"train_speed(iter/s)": 0.115049
|
|
},
|
|
{
|
|
"epoch": 1.8453059673128087,
|
|
"grad_norm": 0.6451456546783447,
|
|
"learning_rate": 3.228503427757374e-06,
|
|
"loss": 0.1354345202445984,
|
|
"memory(GiB)": 31.84,
|
|
"step": 910,
|
|
"token_acc": 0.949082312268036,
|
|
"train_speed(iter/s)": 0.115184
|
|
},
|
|
{
|
|
"epoch": 1.8554415304700367,
|
|
"grad_norm": 0.7488725185394287,
|
|
"learning_rate": 3.178945649881543e-06,
|
|
"loss": 0.1395805835723877,
|
|
"memory(GiB)": 31.84,
|
|
"step": 915,
|
|
"token_acc": 0.9481235027947831,
|
|
"train_speed(iter/s)": 0.115318
|
|
},
|
|
{
|
|
"epoch": 1.8655770936272646,
|
|
"grad_norm": 0.6281618475914001,
|
|
"learning_rate": 3.1295932823125984e-06,
|
|
"loss": 0.13758153915405275,
|
|
"memory(GiB)": 31.84,
|
|
"step": 920,
|
|
"token_acc": 0.952330743618202,
|
|
"train_speed(iter/s)": 0.115431
|
|
},
|
|
{
|
|
"epoch": 1.8655770936272646,
|
|
"eval_loss": 0.19679398834705353,
|
|
"eval_runtime": 21.7247,
|
|
"eval_samples_per_second": 14.638,
|
|
"eval_steps_per_second": 3.682,
|
|
"eval_token_acc": 0.9344151954700111,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.8757126567844926,
|
|
"grad_norm": 0.678552508354187,
|
|
"learning_rate": 3.0804518918722953e-06,
|
|
"loss": 0.14409549236297609,
|
|
"memory(GiB)": 31.84,
|
|
"step": 925,
|
|
"token_acc": 0.9402150206553255,
|
|
"train_speed(iter/s)": 0.115117
|
|
},
|
|
{
|
|
"epoch": 1.8858482199417206,
|
|
"grad_norm": 0.654852569103241,
|
|
"learning_rate": 3.0315270215847015e-06,
|
|
"loss": 0.14367053508758545,
|
|
"memory(GiB)": 31.84,
|
|
"step": 930,
|
|
"token_acc": 0.9497569353477134,
|
|
"train_speed(iter/s)": 0.115224
|
|
},
|
|
{
|
|
"epoch": 1.8959837830989485,
|
|
"grad_norm": 0.7289896011352539,
|
|
"learning_rate": 2.982824190050958e-06,
|
|
"loss": 0.1398463487625122,
|
|
"memory(GiB)": 31.84,
|
|
"step": 935,
|
|
"token_acc": 0.9556035338495136,
|
|
"train_speed(iter/s)": 0.115346
|
|
},
|
|
{
|
|
"epoch": 1.9061193462561765,
|
|
"grad_norm": 0.7101432681083679,
|
|
"learning_rate": 2.934348890826804e-06,
|
|
"loss": 0.14695172309875487,
|
|
"memory(GiB)": 31.84,
|
|
"step": 940,
|
|
"token_acc": 0.9451462090193687,
|
|
"train_speed(iter/s)": 0.115476
|
|
},
|
|
{
|
|
"epoch": 1.9061193462561765,
|
|
"eval_loss": 0.196446493268013,
|
|
"eval_runtime": 21.7433,
|
|
"eval_samples_per_second": 14.625,
|
|
"eval_steps_per_second": 3.679,
|
|
"eval_token_acc": 0.9346374959090544,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.9162549094134043,
|
|
"grad_norm": 0.6972203850746155,
|
|
"learning_rate": 2.8861065918029085e-06,
|
|
"loss": 0.13769317865371705,
|
|
"memory(GiB)": 31.84,
|
|
"step": 945,
|
|
"token_acc": 0.9454437011741591,
|
|
"train_speed(iter/s)": 0.11518
|
|
},
|
|
{
|
|
"epoch": 1.9263904725706322,
|
|
"grad_norm": 0.633342981338501,
|
|
"learning_rate": 2.83810273458811e-06,
|
|
"loss": 0.13131552934646606,
|
|
"memory(GiB)": 31.84,
|
|
"step": 950,
|
|
"token_acc": 0.9490581763627272,
|
|
"train_speed(iter/s)": 0.115298
|
|
},
|
|
{
|
|
"epoch": 1.93652603572786,
|
|
"grad_norm": 0.7772082686424255,
|
|
"learning_rate": 2.790342733895618e-06,
|
|
"loss": 0.14086905717849732,
|
|
"memory(GiB)": 31.84,
|
|
"step": 955,
|
|
"token_acc": 0.949326951839665,
|
|
"train_speed(iter/s)": 0.115399
|
|
},
|
|
{
|
|
"epoch": 1.946661598885088,
|
|
"grad_norm": 0.6924586892127991,
|
|
"learning_rate": 2.742831976932242e-06,
|
|
"loss": 0.13804731369018555,
|
|
"memory(GiB)": 31.84,
|
|
"step": 960,
|
|
"token_acc": 0.954203088394589,
|
|
"train_speed(iter/s)": 0.115508
|
|
},
|
|
{
|
|
"epoch": 1.946661598885088,
|
|
"eval_loss": 0.1961846649646759,
|
|
"eval_runtime": 21.6189,
|
|
"eval_samples_per_second": 14.709,
|
|
"eval_steps_per_second": 3.7,
|
|
"eval_token_acc": 0.9348597963480978,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.956797162042316,
|
|
"grad_norm": 0.6690623760223389,
|
|
"learning_rate": 2.6955758227907335e-06,
|
|
"loss": 0.13871316909790038,
|
|
"memory(GiB)": 31.84,
|
|
"step": 965,
|
|
"token_acc": 0.9416814206770749,
|
|
"train_speed(iter/s)": 0.115192
|
|
},
|
|
{
|
|
"epoch": 1.9669327251995439,
|
|
"grad_norm": 0.7014771103858948,
|
|
"learning_rate": 2.648579601845295e-06,
|
|
"loss": 0.1412734031677246,
|
|
"memory(GiB)": 31.84,
|
|
"step": 970,
|
|
"token_acc": 0.9565555225449642,
|
|
"train_speed(iter/s)": 0.115273
|
|
},
|
|
{
|
|
"epoch": 1.9770682883567718,
|
|
"grad_norm": 0.6217651963233948,
|
|
"learning_rate": 2.6018486151503213e-06,
|
|
"loss": 0.14024865627288818,
|
|
"memory(GiB)": 31.84,
|
|
"step": 975,
|
|
"token_acc": 0.9587345601209982,
|
|
"train_speed(iter/s)": 0.115361
|
|
},
|
|
{
|
|
"epoch": 1.9872038515139998,
|
|
"grad_norm": 0.6598934531211853,
|
|
"learning_rate": 2.5553881338424553e-06,
|
|
"loss": 0.13222227096557618,
|
|
"memory(GiB)": 31.84,
|
|
"step": 980,
|
|
"token_acc": 0.9532787321172154,
|
|
"train_speed(iter/s)": 0.115455
|
|
},
|
|
{
|
|
"epoch": 1.9872038515139998,
|
|
"eval_loss": 0.19523988664150238,
|
|
"eval_runtime": 21.7763,
|
|
"eval_samples_per_second": 14.603,
|
|
"eval_steps_per_second": 3.674,
|
|
"eval_token_acc": 0.9347609961529674,
|
|
"step": 980
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 1479,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.3311229017634898e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|