Files
qwen2.5vl-3b-32b-wrong-resp…/trainer_state.json
ModelHub XC 102de829b9 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-32b-wrong-response
Source: Original Platform
2026-05-20 00:14:48 +08:00

2446 lines
69 KiB
JSON

{
"best_global_step": 980,
"best_metric": 0.19523989,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v28-20250504-001458/checkpoint-980",
"epoch": 1.9872038515139998,
"eval_steps": 20,
"global_step": 980,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002027112631445585,
"grad_norm": 2.521756172180176,
"learning_rate": 9.999988720152121e-06,
"loss": 0.36801594495773315,
"memory(GiB)": 31.83,
"step": 1,
"token_acc": 0.8936768843476973,
"train_speed(iter/s)": 0.061871
},
{
"epoch": 0.010135563157227924,
"grad_norm": 1.4690369367599487,
"learning_rate": 9.999718006347703e-06,
"loss": 0.29584014415740967,
"memory(GiB)": 31.83,
"step": 5,
"token_acc": 0.9045181572689532,
"train_speed(iter/s)": 0.11262
},
{
"epoch": 0.020271126314455847,
"grad_norm": 1.016095519065857,
"learning_rate": 9.998872057198983e-06,
"loss": 0.2859709024429321,
"memory(GiB)": 31.83,
"step": 10,
"token_acc": 0.8981923761356186,
"train_speed(iter/s)": 0.125063
},
{
"epoch": 0.03040668947168377,
"grad_norm": 0.8137410879135132,
"learning_rate": 9.997462247974751e-06,
"loss": 0.26002681255340576,
"memory(GiB)": 31.83,
"step": 15,
"token_acc": 0.907031992397846,
"train_speed(iter/s)": 0.130865
},
{
"epoch": 0.040542252628911694,
"grad_norm": 0.8540328145027161,
"learning_rate": 9.995488737697912e-06,
"loss": 0.24060523509979248,
"memory(GiB)": 31.84,
"step": 20,
"token_acc": 0.9225532754538279,
"train_speed(iter/s)": 0.133177
},
{
"epoch": 0.040542252628911694,
"eval_loss": 0.2592654228210449,
"eval_runtime": 21.9072,
"eval_samples_per_second": 14.516,
"eval_steps_per_second": 3.652,
"eval_token_acc": 0.9178476377490845,
"step": 20
},
{
"epoch": 0.05067781578613962,
"grad_norm": 0.7922812104225159,
"learning_rate": 9.992951748975412e-06,
"loss": 0.24358665943145752,
"memory(GiB)": 31.84,
"step": 25,
"token_acc": 0.9152290582047404,
"train_speed(iter/s)": 0.114782
},
{
"epoch": 0.06081337894336754,
"grad_norm": 0.8630368709564209,
"learning_rate": 9.98985156797314e-06,
"loss": 0.23777904510498046,
"memory(GiB)": 31.84,
"step": 30,
"token_acc": 0.9226734218674036,
"train_speed(iter/s)": 0.118815
},
{
"epoch": 0.07094894210059546,
"grad_norm": 0.783414900302887,
"learning_rate": 9.98618854438364e-06,
"loss": 0.2334528923034668,
"memory(GiB)": 31.84,
"step": 35,
"token_acc": 0.9277159363111174,
"train_speed(iter/s)": 0.122415
},
{
"epoch": 0.08108450525782339,
"grad_norm": 0.8003538846969604,
"learning_rate": 9.98196309138667e-06,
"loss": 0.24934375286102295,
"memory(GiB)": 31.84,
"step": 40,
"token_acc": 0.9100693397648477,
"train_speed(iter/s)": 0.124779
},
{
"epoch": 0.08108450525782339,
"eval_loss": 0.24275335669517517,
"eval_runtime": 21.8352,
"eval_samples_per_second": 14.564,
"eval_steps_per_second": 3.664,
"eval_token_acc": 0.9218243456030826,
"step": 40
},
{
"epoch": 0.09122006841505131,
"grad_norm": 0.8252556324005127,
"learning_rate": 9.977175685602601e-06,
"loss": 0.22385673522949218,
"memory(GiB)": 31.84,
"step": 45,
"token_acc": 0.9232757395226219,
"train_speed(iter/s)": 0.115407
},
{
"epoch": 0.10135563157227924,
"grad_norm": 0.7236311435699463,
"learning_rate": 9.971826867038652e-06,
"loss": 0.23449177742004396,
"memory(GiB)": 31.84,
"step": 50,
"token_acc": 0.9244473590731481,
"train_speed(iter/s)": 0.117372
},
{
"epoch": 0.11149119472950716,
"grad_norm": 0.7773425579071045,
"learning_rate": 9.965917239027972e-06,
"loss": 0.22451424598693848,
"memory(GiB)": 31.84,
"step": 55,
"token_acc": 0.9317552435764683,
"train_speed(iter/s)": 0.118883
},
{
"epoch": 0.12162675788673508,
"grad_norm": 0.7654985785484314,
"learning_rate": 9.959447468161598e-06,
"loss": 0.22769575119018554,
"memory(GiB)": 31.84,
"step": 60,
"token_acc": 0.9208154305996638,
"train_speed(iter/s)": 0.120828
},
{
"epoch": 0.12162675788673508,
"eval_loss": 0.23627901077270508,
"eval_runtime": 21.6502,
"eval_samples_per_second": 14.688,
"eval_steps_per_second": 3.695,
"eval_token_acc": 0.9226332722007126,
"step": 60
},
{
"epoch": 0.13176232104396302,
"grad_norm": 0.8141059279441833,
"learning_rate": 9.952418284213256e-06,
"loss": 0.21704885959625245,
"memory(GiB)": 31.84,
"step": 65,
"token_acc": 0.9179818989115771,
"train_speed(iter/s)": 0.114505
},
{
"epoch": 0.14189788420119093,
"grad_norm": 0.6942949891090393,
"learning_rate": 9.94483048005705e-06,
"loss": 0.23090925216674804,
"memory(GiB)": 31.84,
"step": 70,
"token_acc": 0.9222755382207283,
"train_speed(iter/s)": 0.115729
},
{
"epoch": 0.15203344735841884,
"grad_norm": 0.7940172553062439,
"learning_rate": 9.936684911578019e-06,
"loss": 0.22984471321105956,
"memory(GiB)": 31.84,
"step": 75,
"token_acc": 0.9262917933130699,
"train_speed(iter/s)": 0.117067
},
{
"epoch": 0.16216901051564678,
"grad_norm": 0.7496793270111084,
"learning_rate": 9.927982497575606e-06,
"loss": 0.21851859092712403,
"memory(GiB)": 31.84,
"step": 80,
"token_acc": 0.9258955118221719,
"train_speed(iter/s)": 0.117832
},
{
"epoch": 0.16216901051564678,
"eval_loss": 0.231533482670784,
"eval_runtime": 21.8888,
"eval_samples_per_second": 14.528,
"eval_steps_per_second": 3.655,
"eval_token_acc": 0.9239361997739945,
"step": 80
},
{
"epoch": 0.1723045736728747,
"grad_norm": 0.7115150690078735,
"learning_rate": 9.918724219660013e-06,
"loss": 0.2375343084335327,
"memory(GiB)": 31.84,
"step": 85,
"token_acc": 0.9190241791529182,
"train_speed(iter/s)": 0.113889
},
{
"epoch": 0.18244013683010263,
"grad_norm": 0.7307594418525696,
"learning_rate": 9.908911122141486e-06,
"loss": 0.21745119094848633,
"memory(GiB)": 31.84,
"step": 90,
"token_acc": 0.9248731238527157,
"train_speed(iter/s)": 0.11504
},
{
"epoch": 0.19257569998733054,
"grad_norm": 0.7595335841178894,
"learning_rate": 9.898544311912507e-06,
"loss": 0.23666539192199706,
"memory(GiB)": 31.84,
"step": 95,
"token_acc": 0.9154884783300083,
"train_speed(iter/s)": 0.116125
},
{
"epoch": 0.20271126314455848,
"grad_norm": 0.7457159161567688,
"learning_rate": 9.887624958322945e-06,
"loss": 0.22108936309814453,
"memory(GiB)": 31.84,
"step": 100,
"token_acc": 0.9207408691631993,
"train_speed(iter/s)": 0.117134
},
{
"epoch": 0.20271126314455848,
"eval_loss": 0.22748854756355286,
"eval_runtime": 21.8753,
"eval_samples_per_second": 14.537,
"eval_steps_per_second": 3.657,
"eval_token_acc": 0.9248686266155376,
"step": 100
},
{
"epoch": 0.2128468263017864,
"grad_norm": 0.715534508228302,
"learning_rate": 9.876154293048163e-06,
"loss": 0.2108386516571045,
"memory(GiB)": 31.84,
"step": 105,
"token_acc": 0.9273328473234743,
"train_speed(iter/s)": 0.113574
},
{
"epoch": 0.22298238945901433,
"grad_norm": 0.7702275514602661,
"learning_rate": 9.864133609950077e-06,
"loss": 0.22842142581939698,
"memory(GiB)": 31.84,
"step": 110,
"token_acc": 0.9225603164746199,
"train_speed(iter/s)": 0.114631
},
{
"epoch": 0.23311795261624224,
"grad_norm": 0.7723402976989746,
"learning_rate": 9.851564264931219e-06,
"loss": 0.22526850700378417,
"memory(GiB)": 31.84,
"step": 115,
"token_acc": 0.9270424907786587,
"train_speed(iter/s)": 0.115388
},
{
"epoch": 0.24325351577347015,
"grad_norm": 0.7186813950538635,
"learning_rate": 9.838447675781795e-06,
"loss": 0.21562654972076417,
"memory(GiB)": 31.84,
"step": 120,
"token_acc": 0.9274811342327921,
"train_speed(iter/s)": 0.116096
},
{
"epoch": 0.24325351577347015,
"eval_loss": 0.22359323501586914,
"eval_runtime": 21.7276,
"eval_samples_per_second": 14.636,
"eval_steps_per_second": 3.682,
"eval_token_acc": 0.925652853164385,
"step": 120
},
{
"epoch": 0.25338907893069806,
"grad_norm": 0.7766680717468262,
"learning_rate": 9.824785322019753e-06,
"loss": 0.2237046480178833,
"memory(GiB)": 31.84,
"step": 125,
"token_acc": 0.9236945219863242,
"train_speed(iter/s)": 0.113406
},
{
"epoch": 0.26352464208792603,
"grad_norm": 0.7407037019729614,
"learning_rate": 9.81057874472391e-06,
"loss": 0.219429874420166,
"memory(GiB)": 31.84,
"step": 130,
"token_acc": 0.9289204018740761,
"train_speed(iter/s)": 0.114239
},
{
"epoch": 0.27366020524515394,
"grad_norm": 0.7638006210327148,
"learning_rate": 9.795829546360113e-06,
"loss": 0.2275376319885254,
"memory(GiB)": 31.84,
"step": 135,
"token_acc": 0.9208956514221492,
"train_speed(iter/s)": 0.114967
},
{
"epoch": 0.28379576840238185,
"grad_norm": 0.7686223387718201,
"learning_rate": 9.78053939060049e-06,
"loss": 0.22452235221862793,
"memory(GiB)": 31.84,
"step": 140,
"token_acc": 0.9189745452344199,
"train_speed(iter/s)": 0.11575
},
{
"epoch": 0.28379576840238185,
"eval_loss": 0.22123830020427704,
"eval_runtime": 21.7578,
"eval_samples_per_second": 14.615,
"eval_steps_per_second": 3.677,
"eval_token_acc": 0.9263012294449282,
"step": 140
},
{
"epoch": 0.29393133155960977,
"grad_norm": 0.6931352019309998,
"learning_rate": 9.764710002135784e-06,
"loss": 0.2003002643585205,
"memory(GiB)": 31.84,
"step": 145,
"token_acc": 0.9311943836379415,
"train_speed(iter/s)": 0.113318
},
{
"epoch": 0.3040668947168377,
"grad_norm": 0.800798773765564,
"learning_rate": 9.748343166480823e-06,
"loss": 0.20239923000335694,
"memory(GiB)": 31.84,
"step": 150,
"token_acc": 0.9229381308508969,
"train_speed(iter/s)": 0.113936
},
{
"epoch": 0.31420245787406564,
"grad_norm": 0.7786034345626831,
"learning_rate": 9.731440729773114e-06,
"loss": 0.22815029621124266,
"memory(GiB)": 31.84,
"step": 155,
"token_acc": 0.9227938277122053,
"train_speed(iter/s)": 0.114778
},
{
"epoch": 0.32433802103129356,
"grad_norm": 0.7402083277702332,
"learning_rate": 9.714004598564599e-06,
"loss": 0.19810022115707399,
"memory(GiB)": 31.84,
"step": 160,
"token_acc": 0.9393871049589806,
"train_speed(iter/s)": 0.115347
},
{
"epoch": 0.32433802103129356,
"eval_loss": 0.21936483681201935,
"eval_runtime": 21.7335,
"eval_samples_per_second": 14.632,
"eval_steps_per_second": 3.681,
"eval_token_acc": 0.9270422309084061,
"step": 160
},
{
"epoch": 0.33447358418852147,
"grad_norm": 0.7241997718811035,
"learning_rate": 9.696036739606606e-06,
"loss": 0.21367735862731935,
"memory(GiB)": 31.84,
"step": 165,
"token_acc": 0.9242776974538682,
"train_speed(iter/s)": 0.113487
},
{
"epoch": 0.3446091473457494,
"grad_norm": 0.7917912006378174,
"learning_rate": 9.677539179628005e-06,
"loss": 0.23486363887786865,
"memory(GiB)": 31.84,
"step": 170,
"token_acc": 0.9263079934401947,
"train_speed(iter/s)": 0.114247
},
{
"epoch": 0.35474471050297735,
"grad_norm": 0.7224828004837036,
"learning_rate": 9.658514005106596e-06,
"loss": 0.22237610816955566,
"memory(GiB)": 31.84,
"step": 175,
"token_acc": 0.9191534928283597,
"train_speed(iter/s)": 0.114924
},
{
"epoch": 0.36488027366020526,
"grad_norm": 0.8869006037712097,
"learning_rate": 9.638963362033756e-06,
"loss": 0.2130965232849121,
"memory(GiB)": 31.84,
"step": 180,
"token_acc": 0.9328146634139909,
"train_speed(iter/s)": 0.115499
},
{
"epoch": 0.36488027366020526,
"eval_loss": 0.21775159239768982,
"eval_runtime": 21.827,
"eval_samples_per_second": 14.569,
"eval_steps_per_second": 3.665,
"eval_token_acc": 0.9271039810303625,
"step": 180
},
{
"epoch": 0.37501583681743317,
"grad_norm": 0.7315487861633301,
"learning_rate": 9.618889455672384e-06,
"loss": 0.20544004440307617,
"memory(GiB)": 31.84,
"step": 185,
"token_acc": 0.9269179022705522,
"train_speed(iter/s)": 0.113738
},
{
"epoch": 0.3851513999746611,
"grad_norm": 0.7636704444885254,
"learning_rate": 9.598294550308149e-06,
"loss": 0.21708755493164061,
"memory(GiB)": 31.84,
"step": 190,
"token_acc": 0.9192093078475603,
"train_speed(iter/s)": 0.114239
},
{
"epoch": 0.395286963131889,
"grad_norm": 0.753697395324707,
"learning_rate": 9.577180968994081e-06,
"loss": 0.20737123489379883,
"memory(GiB)": 31.84,
"step": 195,
"token_acc": 0.9441376276819314,
"train_speed(iter/s)": 0.114771
},
{
"epoch": 0.40542252628911696,
"grad_norm": 0.670136570930481,
"learning_rate": 9.55555109328855e-06,
"loss": 0.22084522247314453,
"memory(GiB)": 31.84,
"step": 200,
"token_acc": 0.9222005816280249,
"train_speed(iter/s)": 0.115258
},
{
"epoch": 0.40542252628911696,
"eval_loss": 0.2161727398633957,
"eval_runtime": 21.793,
"eval_samples_per_second": 14.592,
"eval_steps_per_second": 3.671,
"eval_token_acc": 0.9274559567255145,
"step": 200
},
{
"epoch": 0.41555808944634487,
"grad_norm": 0.7045581936836243,
"learning_rate": 9.533407362986606e-06,
"loss": 0.20071265697479249,
"memory(GiB)": 31.84,
"step": 205,
"token_acc": 0.926633272095267,
"train_speed(iter/s)": 0.11359
},
{
"epoch": 0.4256936526035728,
"grad_norm": 0.7426798343658447,
"learning_rate": 9.51075227584481e-06,
"loss": 0.2052762269973755,
"memory(GiB)": 31.84,
"step": 210,
"token_acc": 0.9334708885538184,
"train_speed(iter/s)": 0.114162
},
{
"epoch": 0.4358292157608007,
"grad_norm": 0.7467470169067383,
"learning_rate": 9.487588387299465e-06,
"loss": 0.2220769166946411,
"memory(GiB)": 31.84,
"step": 215,
"token_acc": 0.9223140495867769,
"train_speed(iter/s)": 0.114807
},
{
"epoch": 0.44596477891802866,
"grad_norm": 0.7024182677268982,
"learning_rate": 9.463918310178385e-06,
"loss": 0.21398956775665284,
"memory(GiB)": 31.84,
"step": 220,
"token_acc": 0.9318319475799791,
"train_speed(iter/s)": 0.115301
},
{
"epoch": 0.44596477891802866,
"eval_loss": 0.21440377831459045,
"eval_runtime": 21.7402,
"eval_samples_per_second": 14.627,
"eval_steps_per_second": 3.68,
"eval_token_acc": 0.9279561327133621,
"step": 220
},
{
"epoch": 0.45610034207525657,
"grad_norm": 0.6610811948776245,
"learning_rate": 9.439744714406167e-06,
"loss": 0.20560271739959718,
"memory(GiB)": 31.84,
"step": 225,
"token_acc": 0.9351413448427652,
"train_speed(iter/s)": 0.113954
},
{
"epoch": 0.4662359052324845,
"grad_norm": 0.6910359859466553,
"learning_rate": 9.415070326703039e-06,
"loss": 0.22013638019561768,
"memory(GiB)": 31.84,
"step": 230,
"token_acc": 0.9156021643331288,
"train_speed(iter/s)": 0.114456
},
{
"epoch": 0.4763714683897124,
"grad_norm": 0.7237235903739929,
"learning_rate": 9.38989793027728e-06,
"loss": 0.2018293857574463,
"memory(GiB)": 31.84,
"step": 235,
"token_acc": 0.9305291447985599,
"train_speed(iter/s)": 0.114857
},
{
"epoch": 0.4865070315469403,
"grad_norm": 0.5631291270256042,
"learning_rate": 9.364230364511296e-06,
"loss": 0.20158102512359619,
"memory(GiB)": 31.84,
"step": 240,
"token_acc": 0.9311977864098334,
"train_speed(iter/s)": 0.115295
},
{
"epoch": 0.4865070315469403,
"eval_loss": 0.21314272284507751,
"eval_runtime": 21.8121,
"eval_samples_per_second": 14.579,
"eval_steps_per_second": 3.668,
"eval_token_acc": 0.9278943825914057,
"step": 240
},
{
"epoch": 0.49664259470416827,
"grad_norm": 0.7533831596374512,
"learning_rate": 9.338070524641329e-06,
"loss": 0.22155873775482177,
"memory(GiB)": 31.84,
"step": 245,
"token_acc": 0.9251282051282051,
"train_speed(iter/s)": 0.114158
},
{
"epoch": 0.5067781578613961,
"grad_norm": 0.6942651271820068,
"learning_rate": 9.3114213614309e-06,
"loss": 0.203749680519104,
"memory(GiB)": 31.84,
"step": 250,
"token_acc": 0.9276428102429416,
"train_speed(iter/s)": 0.114514
},
{
"epoch": 0.5169137210186241,
"grad_norm": 0.7373226284980774,
"learning_rate": 9.284285880837947e-06,
"loss": 0.204945707321167,
"memory(GiB)": 31.84,
"step": 255,
"token_acc": 0.9326470962002795,
"train_speed(iter/s)": 0.114981
},
{
"epoch": 0.5270492841758521,
"grad_norm": 0.7045229077339172,
"learning_rate": 9.256667143675789e-06,
"loss": 0.21190428733825684,
"memory(GiB)": 31.84,
"step": 260,
"token_acc": 0.9289341566160771,
"train_speed(iter/s)": 0.115383
},
{
"epoch": 0.5270492841758521,
"eval_loss": 0.21087971329689026,
"eval_runtime": 21.8346,
"eval_samples_per_second": 14.564,
"eval_steps_per_second": 3.664,
"eval_token_acc": 0.9285427588719488,
"step": 260
},
{
"epoch": 0.5371848473330799,
"grad_norm": 0.6742910742759705,
"learning_rate": 9.228568265267845e-06,
"loss": 0.20208969116210937,
"memory(GiB)": 31.84,
"step": 265,
"token_acc": 0.9311459533033191,
"train_speed(iter/s)": 0.11409
},
{
"epoch": 0.5473204104903079,
"grad_norm": 0.7655450701713562,
"learning_rate": 9.199992415096261e-06,
"loss": 0.22049922943115235,
"memory(GiB)": 31.84,
"step": 270,
"token_acc": 0.9257197105762817,
"train_speed(iter/s)": 0.114477
},
{
"epoch": 0.5574559736475357,
"grad_norm": 0.7306473255157471,
"learning_rate": 9.170942816444376e-06,
"loss": 0.21227545738220216,
"memory(GiB)": 31.84,
"step": 275,
"token_acc": 0.9255821189648914,
"train_speed(iter/s)": 0.114761
},
{
"epoch": 0.5675915368047637,
"grad_norm": 0.6912848353385925,
"learning_rate": 9.141422746033158e-06,
"loss": 0.2025892972946167,
"memory(GiB)": 31.84,
"step": 280,
"token_acc": 0.9356907206914177,
"train_speed(iter/s)": 0.115104
},
{
"epoch": 0.5675915368047637,
"eval_loss": 0.21015885472297668,
"eval_runtime": 21.7959,
"eval_samples_per_second": 14.59,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.9283204584329054,
"step": 280
},
{
"epoch": 0.5777270999619917,
"grad_norm": 0.6503239274024963,
"learning_rate": 9.111435533651595e-06,
"loss": 0.20708324909210205,
"memory(GiB)": 31.84,
"step": 285,
"token_acc": 0.9302453853948871,
"train_speed(iter/s)": 0.11406
},
{
"epoch": 0.5878626631192195,
"grad_norm": 0.6230775117874146,
"learning_rate": 9.08098456178111e-06,
"loss": 0.21010513305664064,
"memory(GiB)": 31.84,
"step": 290,
"token_acc": 0.9251024310297733,
"train_speed(iter/s)": 0.114468
},
{
"epoch": 0.5979982262764475,
"grad_norm": 0.5951421856880188,
"learning_rate": 9.050073265214006e-06,
"loss": 0.20655345916748047,
"memory(GiB)": 31.84,
"step": 295,
"token_acc": 0.9383279364144834,
"train_speed(iter/s)": 0.114773
},
{
"epoch": 0.6081337894336754,
"grad_norm": 0.6480836868286133,
"learning_rate": 9.01870513066605e-06,
"loss": 0.19019179344177245,
"memory(GiB)": 31.84,
"step": 300,
"token_acc": 0.9367692102809961,
"train_speed(iter/s)": 0.115106
},
{
"epoch": 0.6081337894336754,
"eval_loss": 0.20902292430400848,
"eval_runtime": 21.7988,
"eval_samples_per_second": 14.588,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.9290552848841876,
"step": 300
},
{
"epoch": 0.6182693525909033,
"grad_norm": 0.5634115934371948,
"learning_rate": 8.986883696383174e-06,
"loss": 0.18940932750701905,
"memory(GiB)": 31.84,
"step": 305,
"token_acc": 0.9292788671890537,
"train_speed(iter/s)": 0.114033
},
{
"epoch": 0.6284049157481313,
"grad_norm": 0.7795764803886414,
"learning_rate": 8.95461255174237e-06,
"loss": 0.21010336875915528,
"memory(GiB)": 31.84,
"step": 310,
"token_acc": 0.922672088989893,
"train_speed(iter/s)": 0.114389
},
{
"epoch": 0.6385404789053591,
"grad_norm": 0.6814590692520142,
"learning_rate": 8.921895336846814e-06,
"loss": 0.21159787178039552,
"memory(GiB)": 31.84,
"step": 315,
"token_acc": 0.9389688078621279,
"train_speed(iter/s)": 0.114683
},
{
"epoch": 0.6486760420625871,
"grad_norm": 0.6690118908882141,
"learning_rate": 8.888735742115268e-06,
"loss": 0.1964055299758911,
"memory(GiB)": 31.84,
"step": 320,
"token_acc": 0.9258384996498146,
"train_speed(iter/s)": 0.114995
},
{
"epoch": 0.6486760420625871,
"eval_loss": 0.20818451046943665,
"eval_runtime": 21.7939,
"eval_samples_per_second": 14.591,
"eval_steps_per_second": 3.671,
"eval_token_acc": 0.929296110359818,
"step": 320
},
{
"epoch": 0.6588116052198151,
"grad_norm": 0.6912645697593689,
"learning_rate": 8.855137507865831e-06,
"loss": 0.19853589534759522,
"memory(GiB)": 31.84,
"step": 325,
"token_acc": 0.9269768893593345,
"train_speed(iter/s)": 0.114013
},
{
"epoch": 0.6689471683770429,
"grad_norm": 0.6451709866523743,
"learning_rate": 8.821104423894015e-06,
"loss": 0.20868840217590331,
"memory(GiB)": 31.84,
"step": 330,
"token_acc": 0.9279026926085749,
"train_speed(iter/s)": 0.114452
},
{
"epoch": 0.6790827315342709,
"grad_norm": 0.6472577452659607,
"learning_rate": 8.786640329045279e-06,
"loss": 0.20419092178344728,
"memory(GiB)": 31.84,
"step": 335,
"token_acc": 0.9282166508987701,
"train_speed(iter/s)": 0.114692
},
{
"epoch": 0.6892182946914988,
"grad_norm": 0.6527739763259888,
"learning_rate": 8.751749110782013e-06,
"loss": 0.20285537242889404,
"memory(GiB)": 31.84,
"step": 340,
"token_acc": 0.9387946219536667,
"train_speed(iter/s)": 0.114992
},
{
"epoch": 0.6892182946914988,
"eval_loss": 0.20785662531852722,
"eval_runtime": 21.816,
"eval_samples_per_second": 14.576,
"eval_steps_per_second": 3.667,
"eval_token_acc": 0.9291232100183398,
"step": 340
},
{
"epoch": 0.6993538578487267,
"grad_norm": 0.7979351878166199,
"learning_rate": 8.716434704745047e-06,
"loss": 0.21715948581695557,
"memory(GiB)": 31.84,
"step": 345,
"token_acc": 0.9273517017828201,
"train_speed(iter/s)": 0.114123
},
{
"epoch": 0.7094894210059547,
"grad_norm": 0.7549222707748413,
"learning_rate": 8.680701094309716e-06,
"loss": 0.19147870540618897,
"memory(GiB)": 31.84,
"step": 350,
"token_acc": 0.9354483972067961,
"train_speed(iter/s)": 0.114431
},
{
"epoch": 0.7196249841631825,
"grad_norm": 0.6661515831947327,
"learning_rate": 8.644552310136547e-06,
"loss": 0.19406793117523194,
"memory(GiB)": 31.84,
"step": 355,
"token_acc": 0.9296775620817432,
"train_speed(iter/s)": 0.114715
},
{
"epoch": 0.7297605473204105,
"grad_norm": 0.7383010983467102,
"learning_rate": 8.60799242971661e-06,
"loss": 0.19579734802246093,
"memory(GiB)": 31.84,
"step": 360,
"token_acc": 0.92644679160905,
"train_speed(iter/s)": 0.115022
},
{
"epoch": 0.7297605473204105,
"eval_loss": 0.20642724633216858,
"eval_runtime": 21.8294,
"eval_samples_per_second": 14.568,
"eval_steps_per_second": 3.665,
"eval_token_acc": 0.9297221862013177,
"step": 360
},
{
"epoch": 0.7398961104776384,
"grad_norm": 0.763937771320343,
"learning_rate": 8.571025576911587e-06,
"loss": 0.19617555141448975,
"memory(GiB)": 31.84,
"step": 365,
"token_acc": 0.9303160729846056,
"train_speed(iter/s)": 0.114163
},
{
"epoch": 0.7500316736348663,
"grad_norm": 0.7166250348091125,
"learning_rate": 8.533655921488612e-06,
"loss": 0.21308059692382814,
"memory(GiB)": 31.84,
"step": 370,
"token_acc": 0.9230729085120818,
"train_speed(iter/s)": 0.114429
},
{
"epoch": 0.7601672367920943,
"grad_norm": 0.765160858631134,
"learning_rate": 8.495887678649933e-06,
"loss": 0.2089679002761841,
"memory(GiB)": 31.84,
"step": 375,
"token_acc": 0.9291193218752438,
"train_speed(iter/s)": 0.114729
},
{
"epoch": 0.7703027999493222,
"grad_norm": 0.6891535520553589,
"learning_rate": 8.457725108557447e-06,
"loss": 0.20386552810668945,
"memory(GiB)": 31.84,
"step": 380,
"token_acc": 0.930313113680995,
"train_speed(iter/s)": 0.114967
},
{
"epoch": 0.7703027999493222,
"eval_loss": 0.20559069514274597,
"eval_runtime": 21.7942,
"eval_samples_per_second": 14.591,
"eval_steps_per_second": 3.671,
"eval_token_acc": 0.9297468862501004,
"step": 380
},
{
"epoch": 0.7804383631065501,
"grad_norm": 0.7166603803634644,
"learning_rate": 8.41917251585216e-06,
"loss": 0.19558722972869874,
"memory(GiB)": 31.84,
"step": 385,
"token_acc": 0.9298326573239745,
"train_speed(iter/s)": 0.114165
},
{
"epoch": 0.790573926263778,
"grad_norm": 0.7598738074302673,
"learning_rate": 8.380234249168642e-06,
"loss": 0.1924947738647461,
"memory(GiB)": 31.84,
"step": 390,
"token_acc": 0.9257823458899742,
"train_speed(iter/s)": 0.11441
},
{
"epoch": 0.800709489421006,
"grad_norm": 0.7150676846504211,
"learning_rate": 8.340914700644507e-06,
"loss": 0.20020556449890137,
"memory(GiB)": 31.84,
"step": 395,
"token_acc": 0.9267301944153489,
"train_speed(iter/s)": 0.114769
},
{
"epoch": 0.8108450525782339,
"grad_norm": 0.6519859433174133,
"learning_rate": 8.301218305424994e-06,
"loss": 0.19768332242965697,
"memory(GiB)": 31.84,
"step": 400,
"token_acc": 0.923796935692706,
"train_speed(iter/s)": 0.115018
},
{
"epoch": 0.8108450525782339,
"eval_loss": 0.20375800132751465,
"eval_runtime": 21.8998,
"eval_samples_per_second": 14.521,
"eval_steps_per_second": 3.653,
"eval_token_acc": 0.9307348882014042,
"step": 400
},
{
"epoch": 0.8209806157354618,
"grad_norm": 0.7148239612579346,
"learning_rate": 8.261149541162693e-06,
"loss": 0.20124292373657227,
"memory(GiB)": 31.84,
"step": 405,
"token_acc": 0.9281172612274948,
"train_speed(iter/s)": 0.114267
},
{
"epoch": 0.8311161788926897,
"grad_norm": 0.6164042353630066,
"learning_rate": 8.22071292751247e-06,
"loss": 0.18608367443084717,
"memory(GiB)": 31.84,
"step": 410,
"token_acc": 0.9256482433822335,
"train_speed(iter/s)": 0.11456
},
{
"epoch": 0.8412517420499176,
"grad_norm": 0.722022294998169,
"learning_rate": 8.179913025621676e-06,
"loss": 0.19683008193969725,
"memory(GiB)": 31.84,
"step": 415,
"token_acc": 0.9332604223628362,
"train_speed(iter/s)": 0.114887
},
{
"epoch": 0.8513873052071456,
"grad_norm": 0.6409715414047241,
"learning_rate": 8.138754437615652e-06,
"loss": 0.20776214599609374,
"memory(GiB)": 31.84,
"step": 420,
"token_acc": 0.9207910620264543,
"train_speed(iter/s)": 0.115145
},
{
"epoch": 0.8513873052071456,
"eval_loss": 0.202426478266716,
"eval_runtime": 21.7226,
"eval_samples_per_second": 14.639,
"eval_steps_per_second": 3.683,
"eval_token_acc": 0.9311918391038823,
"step": 420
},
{
"epoch": 0.8615228683643735,
"grad_norm": 0.6910145282745361,
"learning_rate": 8.097241806078616e-06,
"loss": 0.2034536838531494,
"memory(GiB)": 31.84,
"step": 425,
"token_acc": 0.9296462086074857,
"train_speed(iter/s)": 0.114436
},
{
"epoch": 0.8716584315216014,
"grad_norm": 0.7244237661361694,
"learning_rate": 8.055379813530002e-06,
"loss": 0.20410799980163574,
"memory(GiB)": 31.84,
"step": 430,
"token_acc": 0.9354806441932579,
"train_speed(iter/s)": 0.114688
},
{
"epoch": 0.8817939946788294,
"grad_norm": 0.6427024602890015,
"learning_rate": 8.013173181896283e-06,
"loss": 0.1945526719093323,
"memory(GiB)": 31.84,
"step": 435,
"token_acc": 0.9243187942908474,
"train_speed(iter/s)": 0.114942
},
{
"epoch": 0.8919295578360573,
"grad_norm": 0.6878598928451538,
"learning_rate": 7.970626671978336e-06,
"loss": 0.19251216650009156,
"memory(GiB)": 31.84,
"step": 440,
"token_acc": 0.9332632422148793,
"train_speed(iter/s)": 0.115122
},
{
"epoch": 0.8919295578360573,
"eval_loss": 0.20254245400428772,
"eval_runtime": 21.8009,
"eval_samples_per_second": 14.587,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.9308954385184911,
"step": 440
},
{
"epoch": 0.9020651209932852,
"grad_norm": 0.679250955581665,
"learning_rate": 7.927745082914453e-06,
"loss": 0.19701287746429444,
"memory(GiB)": 31.84,
"step": 445,
"token_acc": 0.9282149553325245,
"train_speed(iter/s)": 0.1144
},
{
"epoch": 0.9122006841505131,
"grad_norm": 0.7318421602249146,
"learning_rate": 7.884533251639e-06,
"loss": 0.1975053071975708,
"memory(GiB)": 31.84,
"step": 450,
"token_acc": 0.9288036746490503,
"train_speed(iter/s)": 0.114655
},
{
"epoch": 0.922336247307741,
"grad_norm": 0.7219140529632568,
"learning_rate": 7.840996052336827e-06,
"loss": 0.22328760623931884,
"memory(GiB)": 31.84,
"step": 455,
"token_acc": 0.9224597234555373,
"train_speed(iter/s)": 0.114914
},
{
"epoch": 0.932471810464969,
"grad_norm": 0.6490547060966492,
"learning_rate": 7.797138395893471e-06,
"loss": 0.20440030097961426,
"memory(GiB)": 31.84,
"step": 460,
"token_acc": 0.9310429568465546,
"train_speed(iter/s)": 0.115177
},
{
"epoch": 0.932471810464969,
"eval_loss": 0.2018972933292389,
"eval_runtime": 21.7035,
"eval_samples_per_second": 14.652,
"eval_steps_per_second": 3.686,
"eval_token_acc": 0.9313091643355995,
"step": 460
},
{
"epoch": 0.9426073736221969,
"grad_norm": 0.6232012510299683,
"learning_rate": 7.75296522934122e-06,
"loss": 0.19165756702423095,
"memory(GiB)": 31.84,
"step": 465,
"token_acc": 0.932452931203712,
"train_speed(iter/s)": 0.114511
},
{
"epoch": 0.9527429367794248,
"grad_norm": 0.638760507106781,
"learning_rate": 7.708481535301101e-06,
"loss": 0.180339515209198,
"memory(GiB)": 31.84,
"step": 470,
"token_acc": 0.9387325860490436,
"train_speed(iter/s)": 0.114716
},
{
"epoch": 0.9628784999366528,
"grad_norm": 0.7850983142852783,
"learning_rate": 7.663692331420857e-06,
"loss": 0.20282835960388185,
"memory(GiB)": 31.84,
"step": 475,
"token_acc": 0.9294787942210982,
"train_speed(iter/s)": 0.114974
},
{
"epoch": 0.9730140630938806,
"grad_norm": 0.7070600390434265,
"learning_rate": 7.6186026698089584e-06,
"loss": 0.20282812118530275,
"memory(GiB)": 31.84,
"step": 480,
"token_acc": 0.9263814944005875,
"train_speed(iter/s)": 0.115173
},
{
"epoch": 0.9730140630938806,
"eval_loss": 0.2006615400314331,
"eval_runtime": 21.7993,
"eval_samples_per_second": 14.588,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.931636439981969,
"step": 480
},
{
"epoch": 0.9831496262511086,
"grad_norm": 0.6108932495117188,
"learning_rate": 7.5732176364647515e-06,
"loss": 0.18763720989227295,
"memory(GiB)": 31.84,
"step": 485,
"token_acc": 0.9314041294819665,
"train_speed(iter/s)": 0.114551
},
{
"epoch": 0.9932851894083365,
"grad_norm": 0.7140085101127625,
"learning_rate": 7.527542350704759e-06,
"loss": 0.20563454627990724,
"memory(GiB)": 31.84,
"step": 490,
"token_acc": 0.9261781443826331,
"train_speed(iter/s)": 0.114801
},
{
"epoch": 1.0040542252628912,
"grad_norm": 0.6436912417411804,
"learning_rate": 7.481581964585245e-06,
"loss": 0.2002246856689453,
"memory(GiB)": 31.84,
"step": 495,
"token_acc": 0.9414402391501628,
"train_speed(iter/s)": 0.114919
},
{
"epoch": 1.0141897884201192,
"grad_norm": 0.6124146580696106,
"learning_rate": 7.435341662321063e-06,
"loss": 0.14853100776672362,
"memory(GiB)": 31.84,
"step": 500,
"token_acc": 0.9471966205837173,
"train_speed(iter/s)": 0.115165
},
{
"epoch": 1.0141897884201192,
"eval_loss": 0.20782382786273956,
"eval_runtime": 21.7236,
"eval_samples_per_second": 14.638,
"eval_steps_per_second": 3.683,
"eval_token_acc": 0.9316426149941646,
"step": 500
},
{
"epoch": 1.024325351577347,
"grad_norm": 0.753773033618927,
"learning_rate": 7.388826659700902e-06,
"loss": 0.13870035409927367,
"memory(GiB)": 31.84,
"step": 505,
"token_acc": 0.9419344271245667,
"train_speed(iter/s)": 0.114525
},
{
"epoch": 1.034460914734575,
"grad_norm": 0.6324412226676941,
"learning_rate": 7.342042203498952e-06,
"loss": 0.14452635049819945,
"memory(GiB)": 31.84,
"step": 510,
"token_acc": 0.9496112148490845,
"train_speed(iter/s)": 0.114737
},
{
"epoch": 1.0445964778918029,
"grad_norm": 0.568839430809021,
"learning_rate": 7.2949935708830825e-06,
"loss": 0.13303987979888915,
"memory(GiB)": 31.84,
"step": 515,
"token_acc": 0.9554845690143747,
"train_speed(iter/s)": 0.114956
},
{
"epoch": 1.0547320410490308,
"grad_norm": 0.7018365859985352,
"learning_rate": 7.247686068819592e-06,
"loss": 0.1495302438735962,
"memory(GiB)": 31.84,
"step": 520,
"token_acc": 0.9394168981676347,
"train_speed(iter/s)": 0.115106
},
{
"epoch": 1.0547320410490308,
"eval_loss": 0.20550589263439178,
"eval_runtime": 21.8398,
"eval_samples_per_second": 14.561,
"eval_steps_per_second": 3.663,
"eval_token_acc": 0.9316549650185559,
"step": 520
},
{
"epoch": 1.0648676042062588,
"grad_norm": 0.6878096461296082,
"learning_rate": 7.200125033474599e-06,
"loss": 0.1441951036453247,
"memory(GiB)": 31.84,
"step": 525,
"token_acc": 0.9410242560514515,
"train_speed(iter/s)": 0.114529
},
{
"epoch": 1.0750031673634866,
"grad_norm": 0.5951861143112183,
"learning_rate": 7.152315829612124e-06,
"loss": 0.141739821434021,
"memory(GiB)": 31.84,
"step": 530,
"token_acc": 0.9511068989817988,
"train_speed(iter/s)": 0.11475
},
{
"epoch": 1.0851387305207145,
"grad_norm": 0.7104572057723999,
"learning_rate": 7.104263849988976e-06,
"loss": 0.13958663940429689,
"memory(GiB)": 31.84,
"step": 535,
"token_acc": 0.9464745976287302,
"train_speed(iter/s)": 0.114952
},
{
"epoch": 1.0952742936779425,
"grad_norm": 0.7287134528160095,
"learning_rate": 7.055974514746446e-06,
"loss": 0.1433733582496643,
"memory(GiB)": 31.84,
"step": 540,
"token_acc": 0.9533915015281474,
"train_speed(iter/s)": 0.11514
},
{
"epoch": 1.0952742936779425,
"eval_loss": 0.20595994591712952,
"eval_runtime": 21.8259,
"eval_samples_per_second": 14.57,
"eval_steps_per_second": 3.665,
"eval_token_acc": 0.9320995658966427,
"step": 540
},
{
"epoch": 1.1054098568351705,
"grad_norm": 0.7750824093818665,
"learning_rate": 7.007453270798937e-06,
"loss": 0.14028260707855225,
"memory(GiB)": 31.84,
"step": 545,
"token_acc": 0.9427210537187573,
"train_speed(iter/s)": 0.114614
},
{
"epoch": 1.1155454199923984,
"grad_norm": 0.6719576120376587,
"learning_rate": 6.95870559121957e-06,
"loss": 0.1443098783493042,
"memory(GiB)": 31.84,
"step": 550,
"token_acc": 0.9419209173821336,
"train_speed(iter/s)": 0.114811
},
{
"epoch": 1.1256809831496262,
"grad_norm": 0.6712535619735718,
"learning_rate": 6.909736974622827e-06,
"loss": 0.1412811279296875,
"memory(GiB)": 31.84,
"step": 555,
"token_acc": 0.9537117362746313,
"train_speed(iter/s)": 0.114963
},
{
"epoch": 1.1358165463068541,
"grad_norm": 0.7601253390312195,
"learning_rate": 6.860552944544325e-06,
"loss": 0.1436458110809326,
"memory(GiB)": 31.84,
"step": 560,
"token_acc": 0.9485312439436308,
"train_speed(iter/s)": 0.11514
},
{
"epoch": 1.1358165463068541,
"eval_loss": 0.20563021302223206,
"eval_runtime": 21.7336,
"eval_samples_per_second": 14.632,
"eval_steps_per_second": 3.681,
"eval_token_acc": 0.9317969902990558,
"step": 560
},
{
"epoch": 1.145952109464082,
"grad_norm": 0.7179679870605469,
"learning_rate": 6.811159048817773e-06,
"loss": 0.1442504644393921,
"memory(GiB)": 31.84,
"step": 565,
"token_acc": 0.9380280917370789,
"train_speed(iter/s)": 0.114635
},
{
"epoch": 1.15608767262131,
"grad_norm": 0.7179620265960693,
"learning_rate": 6.7615608589491935e-06,
"loss": 0.13410391807556152,
"memory(GiB)": 31.84,
"step": 570,
"token_acc": 0.9558909767726473,
"train_speed(iter/s)": 0.114806
},
{
"epoch": 1.166223235778538,
"grad_norm": 0.6108378171920776,
"learning_rate": 6.711763969488472e-06,
"loss": 0.13897714614868165,
"memory(GiB)": 31.84,
"step": 575,
"token_acc": 0.9451023932889218,
"train_speed(iter/s)": 0.114992
},
{
"epoch": 1.1763587989357658,
"grad_norm": 0.6707165837287903,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.14145419597625733,
"memory(GiB)": 31.84,
"step": 580,
"token_acc": 0.9474374323014652,
"train_speed(iter/s)": 0.115192
},
{
"epoch": 1.1763587989357658,
"eval_loss": 0.20474562048912048,
"eval_runtime": 21.8012,
"eval_samples_per_second": 14.586,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.9320069407137079,
"step": 580
},
{
"epoch": 1.1864943620929937,
"grad_norm": 0.7210031747817993,
"learning_rate": 6.6115965814206e-06,
"loss": 0.1450878858566284,
"memory(GiB)": 31.84,
"step": 585,
"token_acc": 0.9400503909192447,
"train_speed(iter/s)": 0.114706
},
{
"epoch": 1.1966299252502217,
"grad_norm": 0.6648117899894714,
"learning_rate": 6.561237381440491e-06,
"loss": 0.1425154447555542,
"memory(GiB)": 31.84,
"step": 590,
"token_acc": 0.9425449072407813,
"train_speed(iter/s)": 0.114844
},
{
"epoch": 1.2067654884074497,
"grad_norm": 0.7065860629081726,
"learning_rate": 6.510702077847864e-06,
"loss": 0.14567887783050537,
"memory(GiB)": 31.84,
"step": 595,
"token_acc": 0.9487148690847946,
"train_speed(iter/s)": 0.115008
},
{
"epoch": 1.2169010515646776,
"grad_norm": 0.6811427474021912,
"learning_rate": 6.459996370896653e-06,
"loss": 0.1396160125732422,
"memory(GiB)": 31.84,
"step": 600,
"token_acc": 0.9510217472815898,
"train_speed(iter/s)": 0.11519
},
{
"epoch": 1.2169010515646776,
"eval_loss": 0.204533189535141,
"eval_runtime": 21.8513,
"eval_samples_per_second": 14.553,
"eval_steps_per_second": 3.661,
"eval_token_acc": 0.9319143155307732,
"step": 600
},
{
"epoch": 1.2270366147219054,
"grad_norm": 0.6900855302810669,
"learning_rate": 6.409125980061852e-06,
"loss": 0.15422054529190063,
"memory(GiB)": 31.84,
"step": 605,
"token_acc": 0.938386855862584,
"train_speed(iter/s)": 0.114682
},
{
"epoch": 1.2371721778791334,
"grad_norm": 0.7017642259597778,
"learning_rate": 6.358096643394387e-06,
"loss": 0.14031555652618408,
"memory(GiB)": 31.84,
"step": 610,
"token_acc": 0.9535683058461724,
"train_speed(iter/s)": 0.114824
},
{
"epoch": 1.2473077410363613,
"grad_norm": 0.6713739037513733,
"learning_rate": 6.306914116873863e-06,
"loss": 0.14935390949249266,
"memory(GiB)": 31.84,
"step": 615,
"token_acc": 0.9406998158379374,
"train_speed(iter/s)": 0.114993
},
{
"epoch": 1.2574433041935893,
"grad_norm": 0.6819109916687012,
"learning_rate": 6.255584173759319e-06,
"loss": 0.1377212643623352,
"memory(GiB)": 31.84,
"step": 620,
"token_acc": 0.9501051424201874,
"train_speed(iter/s)": 0.115142
},
{
"epoch": 1.2574433041935893,
"eval_loss": 0.2050146609544754,
"eval_runtime": 21.8467,
"eval_samples_per_second": 14.556,
"eval_steps_per_second": 3.662,
"eval_token_acc": 0.9317105401283168,
"step": 620
},
{
"epoch": 1.2675788673508173,
"grad_norm": 0.7029967308044434,
"learning_rate": 6.2041126039380065e-06,
"loss": 0.14089449644088745,
"memory(GiB)": 31.84,
"step": 625,
"token_acc": 0.9445310259703018,
"train_speed(iter/s)": 0.114661
},
{
"epoch": 1.2777144305080452,
"grad_norm": 0.7382521629333496,
"learning_rate": 6.152505213272308e-06,
"loss": 0.15079411268234252,
"memory(GiB)": 31.84,
"step": 630,
"token_acc": 0.9498072379057331,
"train_speed(iter/s)": 0.114839
},
{
"epoch": 1.287849993665273,
"grad_norm": 0.7125928997993469,
"learning_rate": 6.100767822944856e-06,
"loss": 0.13632259368896485,
"memory(GiB)": 31.84,
"step": 635,
"token_acc": 0.9514033264033264,
"train_speed(iter/s)": 0.115011
},
{
"epoch": 1.297985556822501,
"grad_norm": 0.7053549885749817,
"learning_rate": 6.048906268801915e-06,
"loss": 0.1401703953742981,
"memory(GiB)": 31.84,
"step": 640,
"token_acc": 0.9504383760540571,
"train_speed(iter/s)": 0.115182
},
{
"epoch": 1.297985556822501,
"eval_loss": 0.20237460732460022,
"eval_runtime": 21.7848,
"eval_samples_per_second": 14.597,
"eval_steps_per_second": 3.672,
"eval_token_acc": 0.9321427909820121,
"step": 640
},
{
"epoch": 1.308121119979729,
"grad_norm": 0.7114176750183105,
"learning_rate": 5.9969264006951135e-06,
"loss": 0.1354852318763733,
"memory(GiB)": 31.84,
"step": 645,
"token_acc": 0.9398549466401315,
"train_speed(iter/s)": 0.114762
},
{
"epoch": 1.3182566831369569,
"grad_norm": 0.6910070776939392,
"learning_rate": 5.944834081821589e-06,
"loss": 0.1504984140396118,
"memory(GiB)": 31.84,
"step": 650,
"token_acc": 0.946494742867472,
"train_speed(iter/s)": 0.114975
},
{
"epoch": 1.3283922462941846,
"grad_norm": 0.7024128437042236,
"learning_rate": 5.892635188062647e-06,
"loss": 0.1456763982772827,
"memory(GiB)": 31.84,
"step": 655,
"token_acc": 0.9487756192189828,
"train_speed(iter/s)": 0.115131
},
{
"epoch": 1.3385278094514126,
"grad_norm": 0.6766383647918701,
"learning_rate": 5.8403356073209636e-06,
"loss": 0.13979326486587523,
"memory(GiB)": 31.84,
"step": 660,
"token_acc": 0.9429099638530946,
"train_speed(iter/s)": 0.115294
},
{
"epoch": 1.3385278094514126,
"eval_loss": 0.2033311426639557,
"eval_runtime": 21.7863,
"eval_samples_per_second": 14.596,
"eval_steps_per_second": 3.672,
"eval_token_acc": 0.9320625158234688,
"step": 660
},
{
"epoch": 1.3486633726086406,
"grad_norm": 0.704328179359436,
"learning_rate": 5.787941238856456e-06,
"loss": 0.14371912479400634,
"memory(GiB)": 31.84,
"step": 665,
"token_acc": 0.9368773059543715,
"train_speed(iter/s)": 0.114861
},
{
"epoch": 1.3587989357658685,
"grad_norm": 0.6802431344985962,
"learning_rate": 5.735457992620851e-06,
"loss": 0.1401059865951538,
"memory(GiB)": 31.84,
"step": 670,
"token_acc": 0.9508861724428956,
"train_speed(iter/s)": 0.115032
},
{
"epoch": 1.3689344989230965,
"grad_norm": 0.6354742646217346,
"learning_rate": 5.682891788591066e-06,
"loss": 0.13931301832199097,
"memory(GiB)": 31.84,
"step": 675,
"token_acc": 0.9524576521435666,
"train_speed(iter/s)": 0.115144
},
{
"epoch": 1.3790700620803245,
"grad_norm": 0.6422320604324341,
"learning_rate": 5.630248556101448e-06,
"loss": 0.14210424423217774,
"memory(GiB)": 31.84,
"step": 680,
"token_acc": 0.9509293240203622,
"train_speed(iter/s)": 0.115296
},
{
"epoch": 1.3790700620803245,
"eval_loss": 0.20212741196155548,
"eval_runtime": 21.7681,
"eval_samples_per_second": 14.609,
"eval_steps_per_second": 3.675,
"eval_token_acc": 0.9325071167015555,
"step": 680
},
{
"epoch": 1.3892056252375522,
"grad_norm": 0.629282534122467,
"learning_rate": 5.5775342331749525e-06,
"loss": 0.13471572399139403,
"memory(GiB)": 31.84,
"step": 685,
"token_acc": 0.9387646219305616,
"train_speed(iter/s)": 0.11486
},
{
"epoch": 1.3993411883947802,
"grad_norm": 0.6799649000167847,
"learning_rate": 5.5247547658533604e-06,
"loss": 0.146738600730896,
"memory(GiB)": 31.84,
"step": 690,
"token_acc": 0.9462739584104196,
"train_speed(iter/s)": 0.11501
},
{
"epoch": 1.4094767515520081,
"grad_norm": 0.7268623113632202,
"learning_rate": 5.471916107526577e-06,
"loss": 0.14002842903137208,
"memory(GiB)": 31.84,
"step": 695,
"token_acc": 0.9498395231545163,
"train_speed(iter/s)": 0.115121
},
{
"epoch": 1.419612314709236,
"grad_norm": 0.6405200362205505,
"learning_rate": 5.419024218261098e-06,
"loss": 0.1415479898452759,
"memory(GiB)": 31.84,
"step": 700,
"token_acc": 0.9470242347639721,
"train_speed(iter/s)": 0.11524
},
{
"epoch": 1.419612314709236,
"eval_loss": 0.20304131507873535,
"eval_runtime": 21.7388,
"eval_samples_per_second": 14.628,
"eval_steps_per_second": 3.68,
"eval_token_acc": 0.9326059168966858,
"step": 700
},
{
"epoch": 1.4297478778664638,
"grad_norm": 0.7136779427528381,
"learning_rate": 5.366085064127734e-06,
"loss": 0.14102948904037477,
"memory(GiB)": 31.84,
"step": 705,
"token_acc": 0.9402442511130172,
"train_speed(iter/s)": 0.114822
},
{
"epoch": 1.4398834410236918,
"grad_norm": 0.7035164833068848,
"learning_rate": 5.313104616528656e-06,
"loss": 0.15270428657531737,
"memory(GiB)": 31.84,
"step": 710,
"token_acc": 0.9444076404990615,
"train_speed(iter/s)": 0.114935
},
{
"epoch": 1.4500190041809198,
"grad_norm": 0.7185565233230591,
"learning_rate": 5.260088851523833e-06,
"loss": 0.14676375389099122,
"memory(GiB)": 31.84,
"step": 715,
"token_acc": 0.9527322682405617,
"train_speed(iter/s)": 0.11512
},
{
"epoch": 1.4601545673381477,
"grad_norm": 0.712411105632782,
"learning_rate": 5.207043749156945e-06,
"loss": 0.13786702156066893,
"memory(GiB)": 31.84,
"step": 720,
"token_acc": 0.9517344563892275,
"train_speed(iter/s)": 0.115272
},
{
"epoch": 1.4601545673381477,
"eval_loss": 0.20220014452934265,
"eval_runtime": 21.8005,
"eval_samples_per_second": 14.587,
"eval_steps_per_second": 3.67,
"eval_token_acc": 0.9331431429577074,
"step": 720
},
{
"epoch": 1.4702901304953757,
"grad_norm": 0.7098857760429382,
"learning_rate": 5.153975292780852e-06,
"loss": 0.13870218992233277,
"memory(GiB)": 31.84,
"step": 725,
"token_acc": 0.9459776516453177,
"train_speed(iter/s)": 0.114836
},
{
"epoch": 1.4804256936526037,
"grad_norm": 0.6736304759979248,
"learning_rate": 5.10088946838269e-06,
"loss": 0.15405884981155396,
"memory(GiB)": 31.84,
"step": 730,
"token_acc": 0.943327239488117,
"train_speed(iter/s)": 0.115018
},
{
"epoch": 1.4905612568098314,
"grad_norm": 0.6198113560676575,
"learning_rate": 5.0477922639086594e-06,
"loss": 0.12916960716247558,
"memory(GiB)": 31.84,
"step": 735,
"token_acc": 0.9533303842264914,
"train_speed(iter/s)": 0.115172
},
{
"epoch": 1.5006968199670594,
"grad_norm": 0.6400096416473389,
"learning_rate": 4.99468966858861e-06,
"loss": 0.13272554874420167,
"memory(GiB)": 31.84,
"step": 740,
"token_acc": 0.9574656362456492,
"train_speed(iter/s)": 0.115287
},
{
"epoch": 1.5006968199670594,
"eval_loss": 0.20150884985923767,
"eval_runtime": 21.7413,
"eval_samples_per_second": 14.627,
"eval_steps_per_second": 3.68,
"eval_token_acc": 0.9324885916649686,
"step": 740
},
{
"epoch": 1.5108323831242874,
"grad_norm": 0.6690772771835327,
"learning_rate": 4.941587672260461e-06,
"loss": 0.1432182550430298,
"memory(GiB)": 31.84,
"step": 745,
"token_acc": 0.9370182365106223,
"train_speed(iter/s)": 0.114917
},
{
"epoch": 1.520967946281515,
"grad_norm": 0.6275672912597656,
"learning_rate": 4.888492264694566e-06,
"loss": 0.1296250343322754,
"memory(GiB)": 31.84,
"step": 750,
"token_acc": 0.9563844971453667,
"train_speed(iter/s)": 0.115024
},
{
"epoch": 1.531103509438743,
"grad_norm": 0.7383422255516052,
"learning_rate": 4.8354094349180885e-06,
"loss": 0.13504064083099365,
"memory(GiB)": 31.84,
"step": 755,
"token_acc": 0.951349361424971,
"train_speed(iter/s)": 0.115159
},
{
"epoch": 1.541239072595971,
"grad_norm": 0.7565678358078003,
"learning_rate": 4.782345170539441e-06,
"loss": 0.13883064985275267,
"memory(GiB)": 31.84,
"step": 760,
"token_acc": 0.9438551468491588,
"train_speed(iter/s)": 0.115291
},
{
"epoch": 1.541239072595971,
"eval_loss": 0.20081333816051483,
"eval_runtime": 21.7245,
"eval_samples_per_second": 14.638,
"eval_steps_per_second": 3.682,
"eval_token_acc": 0.9331740180186856,
"step": 760
},
{
"epoch": 1.551374635753199,
"grad_norm": 0.7568323016166687,
"learning_rate": 4.729305457072913e-06,
"loss": 0.14828368425369262,
"memory(GiB)": 31.84,
"step": 765,
"token_acc": 0.9404283971948368,
"train_speed(iter/s)": 0.114963
},
{
"epoch": 1.561510198910427,
"grad_norm": 0.6327131390571594,
"learning_rate": 4.676296277263513e-06,
"loss": 0.13124030828475952,
"memory(GiB)": 31.84,
"step": 770,
"token_acc": 0.948938511889246,
"train_speed(iter/s)": 0.115091
},
{
"epoch": 1.571645762067655,
"grad_norm": 0.6433929204940796,
"learning_rate": 4.6233236104121266e-06,
"loss": 0.134158194065094,
"memory(GiB)": 31.84,
"step": 775,
"token_acc": 0.952758534346048,
"train_speed(iter/s)": 0.115205
},
{
"epoch": 1.581781325224883,
"grad_norm": 0.6848548054695129,
"learning_rate": 4.570393431701074e-06,
"loss": 0.14014556407928466,
"memory(GiB)": 31.84,
"step": 780,
"token_acc": 0.9504519678850148,
"train_speed(iter/s)": 0.115355
},
{
"epoch": 1.581781325224883,
"eval_loss": 0.20110902190208435,
"eval_runtime": 21.6337,
"eval_samples_per_second": 14.699,
"eval_steps_per_second": 3.698,
"eval_token_acc": 0.9332048930796638,
"step": 780
},
{
"epoch": 1.5919168883821109,
"grad_norm": 0.7065548896789551,
"learning_rate": 4.517511711520121e-06,
"loss": 0.1434975743293762,
"memory(GiB)": 31.84,
"step": 785,
"token_acc": 0.9380875156072849,
"train_speed(iter/s)": 0.114991
},
{
"epoch": 1.6020524515393386,
"grad_norm": 0.6895316243171692,
"learning_rate": 4.46468441479303e-06,
"loss": 0.14089118242263793,
"memory(GiB)": 31.84,
"step": 790,
"token_acc": 0.9526078624333143,
"train_speed(iter/s)": 0.115135
},
{
"epoch": 1.6121880146965666,
"grad_norm": 0.6403608918190002,
"learning_rate": 4.411917500304741e-06,
"loss": 0.13246408700942994,
"memory(GiB)": 31.84,
"step": 795,
"token_acc": 0.9556548295096035,
"train_speed(iter/s)": 0.11525
},
{
"epoch": 1.6223235778537946,
"grad_norm": 0.7208017706871033,
"learning_rate": 4.359216920029227e-06,
"loss": 0.1482730746269226,
"memory(GiB)": 31.84,
"step": 800,
"token_acc": 0.9493780648247817,
"train_speed(iter/s)": 0.115389
},
{
"epoch": 1.6223235778537946,
"eval_loss": 0.19986069202423096,
"eval_runtime": 21.7303,
"eval_samples_per_second": 14.634,
"eval_steps_per_second": 3.681,
"eval_token_acc": 0.9332419431528377,
"step": 800
},
{
"epoch": 1.6324591410110223,
"grad_norm": 0.7288877964019775,
"learning_rate": 4.306588618458134e-06,
"loss": 0.14321244955062867,
"memory(GiB)": 31.84,
"step": 805,
"token_acc": 0.9386183036920914,
"train_speed(iter/s)": 0.11503
},
{
"epoch": 1.6425947041682503,
"grad_norm": 0.7239351868629456,
"learning_rate": 4.254038531930253e-06,
"loss": 0.15064480304718017,
"memory(GiB)": 31.84,
"step": 810,
"token_acc": 0.9507855081756973,
"train_speed(iter/s)": 0.115159
},
{
"epoch": 1.6527302673254782,
"grad_norm": 0.6510659456253052,
"learning_rate": 4.201572587961911e-06,
"loss": 0.13342173099517823,
"memory(GiB)": 31.84,
"step": 815,
"token_acc": 0.9515407052300228,
"train_speed(iter/s)": 0.115262
},
{
"epoch": 1.6628658304827062,
"grad_norm": 0.76472008228302,
"learning_rate": 4.149196704578375e-06,
"loss": 0.14209251403808593,
"memory(GiB)": 31.84,
"step": 820,
"token_acc": 0.9562061456569079,
"train_speed(iter/s)": 0.115391
},
{
"epoch": 1.6628658304827062,
"eval_loss": 0.19960026443004608,
"eval_runtime": 21.6668,
"eval_samples_per_second": 14.677,
"eval_steps_per_second": 3.692,
"eval_token_acc": 0.9334951186528594,
"step": 820
},
{
"epoch": 1.6730013936399342,
"grad_norm": 0.6803078055381775,
"learning_rate": 4.096916789646305e-06,
"loss": 0.1310012936592102,
"memory(GiB)": 31.84,
"step": 825,
"token_acc": 0.9461966384760458,
"train_speed(iter/s)": 0.115013
},
{
"epoch": 1.6831369567971621,
"grad_norm": 0.6794367432594299,
"learning_rate": 4.04473874020736e-06,
"loss": 0.14054393768310547,
"memory(GiB)": 31.84,
"step": 830,
"token_acc": 0.9499732318125037,
"train_speed(iter/s)": 0.115168
},
{
"epoch": 1.69327251995439,
"grad_norm": 0.6680985689163208,
"learning_rate": 3.992668441813036e-06,
"loss": 0.15022470951080322,
"memory(GiB)": 31.84,
"step": 835,
"token_acc": 0.9472336065573771,
"train_speed(iter/s)": 0.115303
},
{
"epoch": 1.7034080831116178,
"grad_norm": 0.707535982131958,
"learning_rate": 3.940711767860776e-06,
"loss": 0.14012532234191893,
"memory(GiB)": 31.84,
"step": 840,
"token_acc": 0.9501387137452711,
"train_speed(iter/s)": 0.115467
},
{
"epoch": 1.7034080831116178,
"eval_loss": 0.19959832727909088,
"eval_runtime": 21.6673,
"eval_samples_per_second": 14.676,
"eval_steps_per_second": 3.692,
"eval_token_acc": 0.9337853442260549,
"step": 840
},
{
"epoch": 1.7135436462688458,
"grad_norm": 0.6823382377624512,
"learning_rate": 3.888874578931482e-06,
"loss": 0.14827988147735596,
"memory(GiB)": 31.84,
"step": 845,
"token_acc": 0.9361747509472786,
"train_speed(iter/s)": 0.115101
},
{
"epoch": 1.7236792094260738,
"grad_norm": 0.695568859577179,
"learning_rate": 3.8371627221284495e-06,
"loss": 0.14526791572570802,
"memory(GiB)": 31.84,
"step": 850,
"token_acc": 0.9438457505679502,
"train_speed(iter/s)": 0.115237
},
{
"epoch": 1.7338147725833015,
"grad_norm": 0.6451678276062012,
"learning_rate": 3.7855820304178202e-06,
"loss": 0.13392380475997925,
"memory(GiB)": 31.84,
"step": 855,
"token_acc": 0.9532031010915026,
"train_speed(iter/s)": 0.115329
},
{
"epoch": 1.7439503357405295,
"grad_norm": 0.6347934007644653,
"learning_rate": 3.7341383219706535e-06,
"loss": 0.137128746509552,
"memory(GiB)": 31.84,
"step": 860,
"token_acc": 0.9556541717810719,
"train_speed(iter/s)": 0.115435
},
{
"epoch": 1.7439503357405295,
"eval_loss": 0.19803106784820557,
"eval_runtime": 21.589,
"eval_samples_per_second": 14.73,
"eval_steps_per_second": 3.706,
"eval_token_acc": 0.9337729942016636,
"step": 860
},
{
"epoch": 1.7540858988977575,
"grad_norm": 0.7886172533035278,
"learning_rate": 3.6828373995066434e-06,
"loss": 0.13904647827148436,
"memory(GiB)": 31.84,
"step": 865,
"token_acc": 0.9446668643732701,
"train_speed(iter/s)": 0.115085
},
{
"epoch": 1.7642214620549854,
"grad_norm": 0.6498300433158875,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.1318161129951477,
"memory(GiB)": 31.84,
"step": 870,
"token_acc": 0.9486192100341072,
"train_speed(iter/s)": 0.115204
},
{
"epoch": 1.7743570252122134,
"grad_norm": 0.6739187240600586,
"learning_rate": 3.5806870422246675e-06,
"loss": 0.13507769107818604,
"memory(GiB)": 31.84,
"step": 875,
"token_acc": 0.9506144334128633,
"train_speed(iter/s)": 0.115288
},
{
"epoch": 1.7844925883694414,
"grad_norm": 0.6173581480979919,
"learning_rate": 3.5298491297076332e-06,
"loss": 0.1330319046974182,
"memory(GiB)": 31.84,
"step": 880,
"token_acc": 0.9496503128779513,
"train_speed(iter/s)": 0.115413
},
{
"epoch": 1.7844925883694414,
"eval_loss": 0.1983470916748047,
"eval_runtime": 21.6381,
"eval_samples_per_second": 14.696,
"eval_steps_per_second": 3.697,
"eval_token_acc": 0.9336927190431201,
"step": 880
},
{
"epoch": 1.7946281515266693,
"grad_norm": 0.7845348715782166,
"learning_rate": 3.479177046475935e-06,
"loss": 0.14459047317504883,
"memory(GiB)": 31.84,
"step": 885,
"token_acc": 0.9400699857249334,
"train_speed(iter/s)": 0.115073
},
{
"epoch": 1.804763714683897,
"grad_norm": 0.677846372127533,
"learning_rate": 3.428676508211902e-06,
"loss": 0.1446584701538086,
"memory(GiB)": 31.84,
"step": 890,
"token_acc": 0.9527403560028134,
"train_speed(iter/s)": 0.115198
},
{
"epoch": 1.814899277841125,
"grad_norm": 0.6514244079589844,
"learning_rate": 3.3783532112480243e-06,
"loss": 0.14350442886352538,
"memory(GiB)": 31.84,
"step": 895,
"token_acc": 0.9489065777752235,
"train_speed(iter/s)": 0.115319
},
{
"epoch": 1.825034840998353,
"grad_norm": 0.6789301633834839,
"learning_rate": 3.328212831924424e-06,
"loss": 0.13438014984130858,
"memory(GiB)": 31.84,
"step": 900,
"token_acc": 0.9504659558391403,
"train_speed(iter/s)": 0.1154
},
{
"epoch": 1.825034840998353,
"eval_loss": 0.19804814457893372,
"eval_runtime": 21.7224,
"eval_samples_per_second": 14.639,
"eval_steps_per_second": 3.683,
"eval_token_acc": 0.9340385197260764,
"step": 900
},
{
"epoch": 1.8351704041555807,
"grad_norm": 0.7078403830528259,
"learning_rate": 3.2782610259485816e-06,
"loss": 0.14103929996490477,
"memory(GiB)": 31.84,
"step": 905,
"token_acc": 0.9467157966221059,
"train_speed(iter/s)": 0.115049
},
{
"epoch": 1.8453059673128087,
"grad_norm": 0.6451456546783447,
"learning_rate": 3.228503427757374e-06,
"loss": 0.1354345202445984,
"memory(GiB)": 31.84,
"step": 910,
"token_acc": 0.949082312268036,
"train_speed(iter/s)": 0.115184
},
{
"epoch": 1.8554415304700367,
"grad_norm": 0.7488725185394287,
"learning_rate": 3.178945649881543e-06,
"loss": 0.1395805835723877,
"memory(GiB)": 31.84,
"step": 915,
"token_acc": 0.9481235027947831,
"train_speed(iter/s)": 0.115318
},
{
"epoch": 1.8655770936272646,
"grad_norm": 0.6281618475914001,
"learning_rate": 3.1295932823125984e-06,
"loss": 0.13758153915405275,
"memory(GiB)": 31.84,
"step": 920,
"token_acc": 0.952330743618202,
"train_speed(iter/s)": 0.115431
},
{
"epoch": 1.8655770936272646,
"eval_loss": 0.19679398834705353,
"eval_runtime": 21.7247,
"eval_samples_per_second": 14.638,
"eval_steps_per_second": 3.682,
"eval_token_acc": 0.9344151954700111,
"step": 920
},
{
"epoch": 1.8757126567844926,
"grad_norm": 0.678552508354187,
"learning_rate": 3.0804518918722953e-06,
"loss": 0.14409549236297609,
"memory(GiB)": 31.84,
"step": 925,
"token_acc": 0.9402150206553255,
"train_speed(iter/s)": 0.115117
},
{
"epoch": 1.8858482199417206,
"grad_norm": 0.654852569103241,
"learning_rate": 3.0315270215847015e-06,
"loss": 0.14367053508758545,
"memory(GiB)": 31.84,
"step": 930,
"token_acc": 0.9497569353477134,
"train_speed(iter/s)": 0.115224
},
{
"epoch": 1.8959837830989485,
"grad_norm": 0.7289896011352539,
"learning_rate": 2.982824190050958e-06,
"loss": 0.1398463487625122,
"memory(GiB)": 31.84,
"step": 935,
"token_acc": 0.9556035338495136,
"train_speed(iter/s)": 0.115346
},
{
"epoch": 1.9061193462561765,
"grad_norm": 0.7101432681083679,
"learning_rate": 2.934348890826804e-06,
"loss": 0.14695172309875487,
"memory(GiB)": 31.84,
"step": 940,
"token_acc": 0.9451462090193687,
"train_speed(iter/s)": 0.115476
},
{
"epoch": 1.9061193462561765,
"eval_loss": 0.196446493268013,
"eval_runtime": 21.7433,
"eval_samples_per_second": 14.625,
"eval_steps_per_second": 3.679,
"eval_token_acc": 0.9346374959090544,
"step": 940
},
{
"epoch": 1.9162549094134043,
"grad_norm": 0.6972203850746155,
"learning_rate": 2.8861065918029085e-06,
"loss": 0.13769317865371705,
"memory(GiB)": 31.84,
"step": 945,
"token_acc": 0.9454437011741591,
"train_speed(iter/s)": 0.11518
},
{
"epoch": 1.9263904725706322,
"grad_norm": 0.633342981338501,
"learning_rate": 2.83810273458811e-06,
"loss": 0.13131552934646606,
"memory(GiB)": 31.84,
"step": 950,
"token_acc": 0.9490581763627272,
"train_speed(iter/s)": 0.115298
},
{
"epoch": 1.93652603572786,
"grad_norm": 0.7772082686424255,
"learning_rate": 2.790342733895618e-06,
"loss": 0.14086905717849732,
"memory(GiB)": 31.84,
"step": 955,
"token_acc": 0.949326951839665,
"train_speed(iter/s)": 0.115399
},
{
"epoch": 1.946661598885088,
"grad_norm": 0.6924586892127991,
"learning_rate": 2.742831976932242e-06,
"loss": 0.13804731369018555,
"memory(GiB)": 31.84,
"step": 960,
"token_acc": 0.954203088394589,
"train_speed(iter/s)": 0.115508
},
{
"epoch": 1.946661598885088,
"eval_loss": 0.1961846649646759,
"eval_runtime": 21.6189,
"eval_samples_per_second": 14.709,
"eval_steps_per_second": 3.7,
"eval_token_acc": 0.9348597963480978,
"step": 960
},
{
"epoch": 1.956797162042316,
"grad_norm": 0.6690623760223389,
"learning_rate": 2.6955758227907335e-06,
"loss": 0.13871316909790038,
"memory(GiB)": 31.84,
"step": 965,
"token_acc": 0.9416814206770749,
"train_speed(iter/s)": 0.115192
},
{
"epoch": 1.9669327251995439,
"grad_norm": 0.7014771103858948,
"learning_rate": 2.648579601845295e-06,
"loss": 0.1412734031677246,
"memory(GiB)": 31.84,
"step": 970,
"token_acc": 0.9565555225449642,
"train_speed(iter/s)": 0.115273
},
{
"epoch": 1.9770682883567718,
"grad_norm": 0.6217651963233948,
"learning_rate": 2.6018486151503213e-06,
"loss": 0.14024865627288818,
"memory(GiB)": 31.84,
"step": 975,
"token_acc": 0.9587345601209982,
"train_speed(iter/s)": 0.115361
},
{
"epoch": 1.9872038515139998,
"grad_norm": 0.6598934531211853,
"learning_rate": 2.5553881338424553e-06,
"loss": 0.13222227096557618,
"memory(GiB)": 31.84,
"step": 980,
"token_acc": 0.9532787321172154,
"train_speed(iter/s)": 0.115455
},
{
"epoch": 1.9872038515139998,
"eval_loss": 0.19523988664150238,
"eval_runtime": 21.7763,
"eval_samples_per_second": 14.603,
"eval_steps_per_second": 3.674,
"eval_token_acc": 0.9347609961529674,
"step": 980
}
],
"logging_steps": 5,
"max_steps": 1479,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3311229017634898e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}