Files
qwen2.5vl-3b-rl-sft/trainer_state.json
ModelHub XC b00a97abe9 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-rl-sft
Source: Original Platform
2026-05-22 00:32:15 +08:00

1907 lines
54 KiB
JSON

{
"best_global_step": 760,
"best_metric": 0.22517732,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v36-20250515-204543/checkpoint-760",
"epoch": 0.9533516268130146,
"eval_steps": 20,
"global_step": 760,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012544100352802822,
"grad_norm": 0.7007026672363281,
"learning_rate": 9.999995684008912e-06,
"loss": 0.09371452033519745,
"memory(GiB)": 30.15,
"step": 1,
"token_acc": 0.9615550755939525,
"train_speed(iter/s)": 0.062663
},
{
"epoch": 0.006272050176401411,
"grad_norm": 0.7769243121147156,
"learning_rate": 9.999892100595329e-06,
"loss": 0.10849708318710327,
"memory(GiB)": 30.19,
"step": 5,
"token_acc": 0.9562576748199528,
"train_speed(iter/s)": 0.122173
},
{
"epoch": 0.012544100352802822,
"grad_norm": 0.7894852757453918,
"learning_rate": 9.999568407038233e-06,
"loss": 0.12320096492767334,
"memory(GiB)": 30.19,
"step": 10,
"token_acc": 0.955855880061259,
"train_speed(iter/s)": 0.136645
},
{
"epoch": 0.018816150529204233,
"grad_norm": 0.8094897866249084,
"learning_rate": 9.999028933299243e-06,
"loss": 0.11489032506942749,
"memory(GiB)": 30.19,
"step": 15,
"token_acc": 0.9583607506645961,
"train_speed(iter/s)": 0.144552
},
{
"epoch": 0.025088200705605645,
"grad_norm": 0.8118980526924133,
"learning_rate": 9.99827370266192e-06,
"loss": 0.11607390642166138,
"memory(GiB)": 30.19,
"step": 20,
"token_acc": 0.9584569732937686,
"train_speed(iter/s)": 0.146159
},
{
"epoch": 0.025088200705605645,
"eval_loss": 0.23747889697551727,
"eval_runtime": 29.1116,
"eval_samples_per_second": 17.691,
"eval_steps_per_second": 4.431,
"eval_token_acc": 0.9248609195450487,
"step": 20
},
{
"epoch": 0.03136025088200706,
"grad_norm": 0.7518147826194763,
"learning_rate": 9.99730274772184e-06,
"loss": 0.12006251811981201,
"memory(GiB)": 30.19,
"step": 25,
"token_acc": 0.9398439645614428,
"train_speed(iter/s)": 0.119433
},
{
"epoch": 0.037632301058408466,
"grad_norm": 0.7859554886817932,
"learning_rate": 9.996116110385186e-06,
"loss": 0.12473204135894775,
"memory(GiB)": 30.19,
"step": 30,
"token_acc": 0.9547159567642268,
"train_speed(iter/s)": 0.124931
},
{
"epoch": 0.04390435123480988,
"grad_norm": 0.7734975814819336,
"learning_rate": 9.99471384186694e-06,
"loss": 0.11890232563018799,
"memory(GiB)": 30.2,
"step": 35,
"token_acc": 0.9632012432012432,
"train_speed(iter/s)": 0.128582
},
{
"epoch": 0.05017640141121129,
"grad_norm": 0.7775484323501587,
"learning_rate": 9.99309600268868e-06,
"loss": 0.11513264179229736,
"memory(GiB)": 30.2,
"step": 40,
"token_acc": 0.9620845390377802,
"train_speed(iter/s)": 0.130866
},
{
"epoch": 0.05017640141121129,
"eval_loss": 0.23900838196277618,
"eval_runtime": 29.1592,
"eval_samples_per_second": 17.662,
"eval_steps_per_second": 4.424,
"eval_token_acc": 0.9247099957657495,
"step": 40
},
{
"epoch": 0.0564484515876127,
"grad_norm": 0.7852229475975037,
"learning_rate": 9.991262662675962e-06,
"loss": 0.1213950753211975,
"memory(GiB)": 30.2,
"step": 45,
"token_acc": 0.9422833912915708,
"train_speed(iter/s)": 0.119133
},
{
"epoch": 0.06272050176401411,
"grad_norm": 0.7627941370010376,
"learning_rate": 9.9892139009553e-06,
"loss": 0.12013821601867676,
"memory(GiB)": 30.2,
"step": 50,
"token_acc": 0.9524904419431597,
"train_speed(iter/s)": 0.122203
},
{
"epoch": 0.06899255194041552,
"grad_norm": 0.7667288184165955,
"learning_rate": 9.986949805950763e-06,
"loss": 0.12547953128814698,
"memory(GiB)": 30.2,
"step": 55,
"token_acc": 0.9569144662104125,
"train_speed(iter/s)": 0.124401
},
{
"epoch": 0.07526460211681693,
"grad_norm": 0.8220178484916687,
"learning_rate": 9.984470475380154e-06,
"loss": 0.12330178022384644,
"memory(GiB)": 30.2,
"step": 60,
"token_acc": 0.9609616164135824,
"train_speed(iter/s)": 0.12684
},
{
"epoch": 0.07526460211681693,
"eval_loss": 0.2401651293039322,
"eval_runtime": 28.9982,
"eval_samples_per_second": 17.76,
"eval_steps_per_second": 4.449,
"eval_token_acc": 0.9250453819419698,
"step": 60
},
{
"epoch": 0.08153665229321834,
"grad_norm": 0.7496252059936523,
"learning_rate": 9.982332112912999e-06,
"loss": 0.12913516759872437,
"memory(GiB)": 31.66,
"step": 65,
"token_acc": 0.9413472329138108,
"train_speed(iter/s)": 0.119192
},
{
"epoch": 0.08780870246961976,
"grad_norm": 0.742638349533081,
"learning_rate": 9.979465634221514e-06,
"loss": 0.11686735153198242,
"memory(GiB)": 31.66,
"step": 70,
"token_acc": 0.9583432768541352,
"train_speed(iter/s)": 0.121248
},
{
"epoch": 0.09408075264602117,
"grad_norm": 0.6979886889457703,
"learning_rate": 9.976384242979025e-06,
"loss": 0.11433117389678955,
"memory(GiB)": 31.66,
"step": 75,
"token_acc": 0.9620859246922897,
"train_speed(iter/s)": 0.123121
},
{
"epoch": 0.10035280282242258,
"grad_norm": 0.772607684135437,
"learning_rate": 9.973088072177646e-06,
"loss": 0.11932685375213622,
"memory(GiB)": 31.66,
"step": 80,
"token_acc": 0.9525376807136265,
"train_speed(iter/s)": 0.124671
},
{
"epoch": 0.10035280282242258,
"eval_loss": 0.2368009090423584,
"eval_runtime": 28.9145,
"eval_samples_per_second": 17.811,
"eval_steps_per_second": 4.461,
"eval_token_acc": 0.9248148039458184,
"step": 80
},
{
"epoch": 0.10662485299882399,
"grad_norm": 0.6924039721488953,
"learning_rate": 9.96957726407932e-06,
"loss": 0.11466219425201415,
"memory(GiB)": 31.66,
"step": 85,
"token_acc": 0.9409575111971916,
"train_speed(iter/s)": 0.119139
},
{
"epoch": 0.1128969031752254,
"grad_norm": 0.7015154957771301,
"learning_rate": 9.965851970209695e-06,
"loss": 0.11789379119873047,
"memory(GiB)": 31.66,
"step": 90,
"token_acc": 0.9585812037424941,
"train_speed(iter/s)": 0.120316
},
{
"epoch": 0.11916895335162682,
"grad_norm": 0.7769283652305603,
"learning_rate": 9.96191235135156e-06,
"loss": 0.12155482769012452,
"memory(GiB)": 31.66,
"step": 95,
"token_acc": 0.9554308702096125,
"train_speed(iter/s)": 0.122053
},
{
"epoch": 0.12544100352802823,
"grad_norm": 0.7704362869262695,
"learning_rate": 9.957758577537933e-06,
"loss": 0.13259472846984863,
"memory(GiB)": 31.66,
"step": 100,
"token_acc": 0.9502720633165537,
"train_speed(iter/s)": 0.123484
},
{
"epoch": 0.12544100352802823,
"eval_loss": 0.23640382289886475,
"eval_runtime": 29.2359,
"eval_samples_per_second": 17.615,
"eval_steps_per_second": 4.412,
"eval_token_acc": 0.9245884182768697,
"step": 100
},
{
"epoch": 0.13171305370442962,
"grad_norm": 0.7656592726707458,
"learning_rate": 9.953390828044698e-06,
"loss": 0.1214489221572876,
"memory(GiB)": 31.66,
"step": 105,
"token_acc": 0.94163746105919,
"train_speed(iter/s)": 0.119212
},
{
"epoch": 0.13798510388083104,
"grad_norm": 0.6987672448158264,
"learning_rate": 9.948809291382886e-06,
"loss": 0.12167651653289795,
"memory(GiB)": 31.66,
"step": 110,
"token_acc": 0.9554091191158653,
"train_speed(iter/s)": 0.120455
},
{
"epoch": 0.14425715405723247,
"grad_norm": 0.7173567414283752,
"learning_rate": 9.944014165290526e-06,
"loss": 0.12870512008666993,
"memory(GiB)": 31.66,
"step": 115,
"token_acc": 0.9555927368478797,
"train_speed(iter/s)": 0.121851
},
{
"epoch": 0.15052920423363386,
"grad_norm": 0.733219563961029,
"learning_rate": 9.939005656724122e-06,
"loss": 0.12763895988464355,
"memory(GiB)": 31.66,
"step": 120,
"token_acc": 0.9537378141994336,
"train_speed(iter/s)": 0.123038
},
{
"epoch": 0.15052920423363386,
"eval_loss": 0.2358570694923401,
"eval_runtime": 29.0168,
"eval_samples_per_second": 17.748,
"eval_steps_per_second": 4.446,
"eval_token_acc": 0.9245213410416256,
"step": 120
},
{
"epoch": 0.15680125441003528,
"grad_norm": 0.765933096408844,
"learning_rate": 9.933783981849704e-06,
"loss": 0.12144865989685058,
"memory(GiB)": 31.66,
"step": 125,
"token_acc": 0.9409262529390531,
"train_speed(iter/s)": 0.119399
},
{
"epoch": 0.16307330458643668,
"grad_norm": 0.6981678009033203,
"learning_rate": 9.928349366033525e-06,
"loss": 0.12389117479324341,
"memory(GiB)": 31.66,
"step": 130,
"token_acc": 0.9555003388299074,
"train_speed(iter/s)": 0.120494
},
{
"epoch": 0.1693453547628381,
"grad_norm": 0.7008967399597168,
"learning_rate": 9.923848513216085e-06,
"loss": 0.12482867240905762,
"memory(GiB)": 31.66,
"step": 135,
"token_acc": 0.9500779220779221,
"train_speed(iter/s)": 0.121483
},
{
"epoch": 0.17561740493923952,
"grad_norm": 0.7450129985809326,
"learning_rate": 9.918031200957224e-06,
"loss": 0.1304723024368286,
"memory(GiB)": 31.66,
"step": 140,
"token_acc": 0.9544462545722805,
"train_speed(iter/s)": 0.122651
},
{
"epoch": 0.17561740493923952,
"eval_loss": 0.23741304874420166,
"eval_runtime": 29.1676,
"eval_samples_per_second": 17.657,
"eval_steps_per_second": 4.423,
"eval_token_acc": 0.9250328049603616,
"step": 140
},
{
"epoch": 0.18188945511564092,
"grad_norm": 0.7213057279586792,
"learning_rate": 9.912001627642868e-06,
"loss": 0.12079639434814453,
"memory(GiB)": 31.66,
"step": 145,
"token_acc": 0.9405716060888475,
"train_speed(iter/s)": 0.119397
},
{
"epoch": 0.18816150529204234,
"grad_norm": 0.6824830174446106,
"learning_rate": 9.905760053507967e-06,
"loss": 0.11286978721618653,
"memory(GiB)": 31.66,
"step": 150,
"token_acc": 0.9568509120833905,
"train_speed(iter/s)": 0.120378
},
{
"epoch": 0.19443355546844374,
"grad_norm": 0.7245369553565979,
"learning_rate": 9.899306747937377e-06,
"loss": 0.12503495216369628,
"memory(GiB)": 31.66,
"step": 155,
"token_acc": 0.9548737472705896,
"train_speed(iter/s)": 0.121472
},
{
"epoch": 0.20070560564484516,
"grad_norm": 0.7653631567955017,
"learning_rate": 9.892641989454225e-06,
"loss": 0.1246172308921814,
"memory(GiB)": 31.66,
"step": 160,
"token_acc": 0.9559546167897296,
"train_speed(iter/s)": 0.122509
},
{
"epoch": 0.20070560564484516,
"eval_loss": 0.23944813013076782,
"eval_runtime": 29.28,
"eval_samples_per_second": 17.589,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.9244165328615568,
"step": 160
},
{
"epoch": 0.20697765582124658,
"grad_norm": 0.6885049939155579,
"learning_rate": 9.885766065707903e-06,
"loss": 0.12521634101867676,
"memory(GiB)": 31.66,
"step": 165,
"token_acc": 0.9397906012239098,
"train_speed(iter/s)": 0.119705
},
{
"epoch": 0.21324970599764798,
"grad_norm": 0.7737838625907898,
"learning_rate": 9.878679273461643e-06,
"loss": 0.12545130252838135,
"memory(GiB)": 31.66,
"step": 170,
"token_acc": 0.9525804833426775,
"train_speed(iter/s)": 0.120457
},
{
"epoch": 0.2195217561740494,
"grad_norm": 0.7562989592552185,
"learning_rate": 9.871381918579706e-06,
"loss": 0.11616495847702027,
"memory(GiB)": 31.66,
"step": 175,
"token_acc": 0.9607603010588085,
"train_speed(iter/s)": 0.121311
},
{
"epoch": 0.2257938063504508,
"grad_norm": 0.8050908446311951,
"learning_rate": 9.863874316014197e-06,
"loss": 0.11883351802825928,
"memory(GiB)": 31.66,
"step": 180,
"token_acc": 0.9569015887148382,
"train_speed(iter/s)": 0.122115
},
{
"epoch": 0.2257938063504508,
"eval_loss": 0.2381161004304886,
"eval_runtime": 29.157,
"eval_samples_per_second": 17.663,
"eval_steps_per_second": 4.424,
"eval_token_acc": 0.9248860735082651,
"step": 180
},
{
"epoch": 0.23206585652685222,
"grad_norm": 0.7181304097175598,
"learning_rate": 9.856156789791454e-06,
"loss": 0.12097489833831787,
"memory(GiB)": 31.66,
"step": 185,
"token_acc": 0.9404164162611713,
"train_speed(iter/s)": 0.119715
},
{
"epoch": 0.23833790670325364,
"grad_norm": 0.7890848517417908,
"learning_rate": 9.848229672998066e-06,
"loss": 0.1189950704574585,
"memory(GiB)": 31.66,
"step": 190,
"token_acc": 0.9567714631197098,
"train_speed(iter/s)": 0.120621
},
{
"epoch": 0.24460995687965503,
"grad_norm": 0.7079647183418274,
"learning_rate": 9.840093307766511e-06,
"loss": 0.12529479265213012,
"memory(GiB)": 31.66,
"step": 195,
"token_acc": 0.954030785285677,
"train_speed(iter/s)": 0.12131
},
{
"epoch": 0.25088200705605646,
"grad_norm": 0.7202178835868835,
"learning_rate": 9.831748045260374e-06,
"loss": 0.12180191278457642,
"memory(GiB)": 31.66,
"step": 200,
"token_acc": 0.9546836066920402,
"train_speed(iter/s)": 0.121964
},
{
"epoch": 0.25088200705605646,
"eval_loss": 0.23432905972003937,
"eval_runtime": 29.0534,
"eval_samples_per_second": 17.726,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.9248776888538597,
"step": 200
},
{
"epoch": 0.2571540572324579,
"grad_norm": 0.7178977727890015,
"learning_rate": 9.823194245659197e-06,
"loss": 0.12807730436325074,
"memory(GiB)": 31.66,
"step": 205,
"token_acc": 0.9399802586519479,
"train_speed(iter/s)": 0.119811
},
{
"epoch": 0.26342610740885924,
"grad_norm": 0.7141036987304688,
"learning_rate": 9.814432278142934e-06,
"loss": 0.11557638645172119,
"memory(GiB)": 31.66,
"step": 210,
"token_acc": 0.9552243011722272,
"train_speed(iter/s)": 0.120373
},
{
"epoch": 0.26969815758526067,
"grad_norm": 0.8078102469444275,
"learning_rate": 9.805462520876015e-06,
"loss": 0.1150855302810669,
"memory(GiB)": 33.77,
"step": 215,
"token_acc": 0.9593783736285808,
"train_speed(iter/s)": 0.121017
},
{
"epoch": 0.2759702077616621,
"grad_norm": 0.7020242214202881,
"learning_rate": 9.79628536099103e-06,
"loss": 0.1237363338470459,
"memory(GiB)": 33.77,
"step": 220,
"token_acc": 0.9599538638985006,
"train_speed(iter/s)": 0.121813
},
{
"epoch": 0.2759702077616621,
"eval_loss": 0.23757396638393402,
"eval_runtime": 29.0264,
"eval_samples_per_second": 17.742,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9250914975412001,
"step": 220
},
{
"epoch": 0.2822422579380635,
"grad_norm": 0.8055130839347839,
"learning_rate": 9.786901194572012e-06,
"loss": 0.1192856788635254,
"memory(GiB)": 33.77,
"step": 225,
"token_acc": 0.9404110409842614,
"train_speed(iter/s)": 0.119956
},
{
"epoch": 0.28851430811446493,
"grad_norm": 0.8203707337379456,
"learning_rate": 9.777310426637349e-06,
"loss": 0.11806493997573853,
"memory(GiB)": 33.77,
"step": 230,
"token_acc": 0.9590930586937103,
"train_speed(iter/s)": 0.120564
},
{
"epoch": 0.2947863582908663,
"grad_norm": 0.7641969919204712,
"learning_rate": 9.767513471122305e-06,
"loss": 0.11997225284576415,
"memory(GiB)": 33.77,
"step": 235,
"token_acc": 0.9602041571122124,
"train_speed(iter/s)": 0.121067
},
{
"epoch": 0.3010584084672677,
"grad_norm": 0.733931839466095,
"learning_rate": 9.757510750861143e-06,
"loss": 0.12144792079925537,
"memory(GiB)": 33.77,
"step": 240,
"token_acc": 0.9531961770923274,
"train_speed(iter/s)": 0.121552
},
{
"epoch": 0.3010584084672677,
"eval_loss": 0.23732979595661163,
"eval_runtime": 29.0284,
"eval_samples_per_second": 17.741,
"eval_steps_per_second": 4.444,
"eval_token_acc": 0.9247980346370074,
"step": 240
},
{
"epoch": 0.30733045864366915,
"grad_norm": 0.7051452398300171,
"learning_rate": 9.749360713849587e-06,
"loss": 0.12806930541992187,
"memory(GiB)": 33.77,
"step": 245,
"token_acc": 0.9387967295240796,
"train_speed(iter/s)": 0.119787
},
{
"epoch": 0.31360250882007057,
"grad_norm": 0.7828952074050903,
"learning_rate": 9.741079488650608e-06,
"loss": 0.13568118810653687,
"memory(GiB)": 33.77,
"step": 250,
"token_acc": 0.948076923076923,
"train_speed(iter/s)": 0.12055
},
{
"epoch": 0.319874558996472,
"grad_norm": 0.7669305205345154,
"learning_rate": 9.730543822588614e-06,
"loss": 0.12099459171295165,
"memory(GiB)": 33.77,
"step": 255,
"token_acc": 0.9579619299557192,
"train_speed(iter/s)": 0.121138
},
{
"epoch": 0.32614660917287336,
"grad_norm": 0.6992693543434143,
"learning_rate": 9.71980398738173e-06,
"loss": 0.12393572330474853,
"memory(GiB)": 33.77,
"step": 260,
"token_acc": 0.9492717094266536,
"train_speed(iter/s)": 0.121576
},
{
"epoch": 0.32614660917287336,
"eval_loss": 0.23609744012355804,
"eval_runtime": 29.0643,
"eval_samples_per_second": 17.719,
"eval_steps_per_second": 4.438,
"eval_token_acc": 0.9247644960193854,
"step": 260
},
{
"epoch": 0.3324186593492748,
"grad_norm": 0.7092508673667908,
"learning_rate": 9.708860446558685e-06,
"loss": 0.12540948390960693,
"memory(GiB)": 33.77,
"step": 265,
"token_acc": 0.9381209283387623,
"train_speed(iter/s)": 0.119928
},
{
"epoch": 0.3386907095256762,
"grad_norm": 0.7747306823730469,
"learning_rate": 9.6977136724401e-06,
"loss": 0.1322183132171631,
"memory(GiB)": 33.77,
"step": 270,
"token_acc": 0.948199121522694,
"train_speed(iter/s)": 0.120479
},
{
"epoch": 0.3449627597020776,
"grad_norm": 0.7813765406608582,
"learning_rate": 9.686364146118085e-06,
"loss": 0.12453765869140625,
"memory(GiB)": 33.77,
"step": 275,
"token_acc": 0.9574612482015366,
"train_speed(iter/s)": 0.121031
},
{
"epoch": 0.35123480987847905,
"grad_norm": 0.8077899217605591,
"learning_rate": 9.674812357435497e-06,
"loss": 0.13067824840545655,
"memory(GiB)": 33.77,
"step": 280,
"token_acc": 0.9562002982107356,
"train_speed(iter/s)": 0.121591
},
{
"epoch": 0.35123480987847905,
"eval_loss": 0.23406127095222473,
"eval_runtime": 28.8857,
"eval_samples_per_second": 17.829,
"eval_steps_per_second": 4.466,
"eval_token_acc": 0.925296921574135,
"step": 280
},
{
"epoch": 0.3575068600548804,
"grad_norm": 0.7608367204666138,
"learning_rate": 9.663058804964784e-06,
"loss": 0.12904319763183594,
"memory(GiB)": 33.77,
"step": 285,
"token_acc": 0.9395176026312584,
"train_speed(iter/s)": 0.120021
},
{
"epoch": 0.36377891023128184,
"grad_norm": 0.7974592447280884,
"learning_rate": 9.65110399598647e-06,
"loss": 0.11571755409240722,
"memory(GiB)": 33.77,
"step": 290,
"token_acc": 0.9629809560823941,
"train_speed(iter/s)": 0.120511
},
{
"epoch": 0.37005096040768326,
"grad_norm": 0.807405948638916,
"learning_rate": 9.638948446467268e-06,
"loss": 0.12567424774169922,
"memory(GiB)": 33.77,
"step": 295,
"token_acc": 0.9559097936770272,
"train_speed(iter/s)": 0.121048
},
{
"epoch": 0.3763230105840847,
"grad_norm": 0.8031049966812134,
"learning_rate": 9.626592681037797e-06,
"loss": 0.12862168550491332,
"memory(GiB)": 33.77,
"step": 300,
"token_acc": 0.9541518224171006,
"train_speed(iter/s)": 0.121608
},
{
"epoch": 0.3763230105840847,
"eval_loss": 0.239148810505867,
"eval_runtime": 29.0314,
"eval_samples_per_second": 17.739,
"eval_steps_per_second": 4.443,
"eval_token_acc": 0.9250160356515505,
"step": 300
},
{
"epoch": 0.3825950607604861,
"grad_norm": 0.7160354256629944,
"learning_rate": 9.614037232969952e-06,
"loss": 0.11383086442947388,
"memory(GiB)": 33.77,
"step": 305,
"token_acc": 0.9417116516042943,
"train_speed(iter/s)": 0.120136
},
{
"epoch": 0.3888671109368875,
"grad_norm": 0.7129721641540527,
"learning_rate": 9.601282644153882e-06,
"loss": 0.12448391914367676,
"memory(GiB)": 33.77,
"step": 310,
"token_acc": 0.9534784033888903,
"train_speed(iter/s)": 0.120638
},
{
"epoch": 0.3951391611132889,
"grad_norm": 0.6943992972373962,
"learning_rate": 9.5883294650746e-06,
"loss": 0.12468962669372559,
"memory(GiB)": 33.77,
"step": 315,
"token_acc": 0.9542487486461961,
"train_speed(iter/s)": 0.12098
},
{
"epoch": 0.4014112112896903,
"grad_norm": 0.6942281126976013,
"learning_rate": 9.575178254788235e-06,
"loss": 0.12767086029052735,
"memory(GiB)": 33.77,
"step": 320,
"token_acc": 0.9511481009569767,
"train_speed(iter/s)": 0.121441
},
{
"epoch": 0.4014112112896903,
"eval_loss": 0.23587600886821747,
"eval_runtime": 29.0365,
"eval_samples_per_second": 17.736,
"eval_steps_per_second": 4.443,
"eval_token_acc": 0.9246219568944917,
"step": 320
},
{
"epoch": 0.40768326146609174,
"grad_norm": 0.7060673236846924,
"learning_rate": 9.56182958089789e-06,
"loss": 0.1355045199394226,
"memory(GiB)": 33.77,
"step": 325,
"token_acc": 0.9405469567818154,
"train_speed(iter/s)": 0.120115
},
{
"epoch": 0.41395531164249316,
"grad_norm": 0.8162744045257568,
"learning_rate": 9.548284019529149e-06,
"loss": 0.13120698928833008,
"memory(GiB)": 33.77,
"step": 330,
"token_acc": 0.9542016095898688,
"train_speed(iter/s)": 0.120527
},
{
"epoch": 0.42022736181889453,
"grad_norm": 0.7768563032150269,
"learning_rate": 9.534542155305217e-06,
"loss": 0.12495183944702148,
"memory(GiB)": 33.77,
"step": 335,
"token_acc": 0.955458468751665,
"train_speed(iter/s)": 0.121049
},
{
"epoch": 0.42649941199529595,
"grad_norm": 0.7437924146652222,
"learning_rate": 9.520604581321682e-06,
"loss": 0.12085769176483155,
"memory(GiB)": 33.77,
"step": 340,
"token_acc": 0.9580750533707143,
"train_speed(iter/s)": 0.121396
},
{
"epoch": 0.42649941199529595,
"eval_loss": 0.23598669469356537,
"eval_runtime": 29.1207,
"eval_samples_per_second": 17.685,
"eval_steps_per_second": 4.43,
"eval_token_acc": 0.9251292284860249,
"step": 340
},
{
"epoch": 0.4327714621716974,
"grad_norm": 0.7635094523429871,
"learning_rate": 9.506471899120917e-06,
"loss": 0.12304807901382446,
"memory(GiB)": 33.77,
"step": 345,
"token_acc": 0.9426115423821846,
"train_speed(iter/s)": 0.120096
},
{
"epoch": 0.4390435123480988,
"grad_norm": 0.7481803297996521,
"learning_rate": 9.49214471866612e-06,
"loss": 0.1286768436431885,
"memory(GiB)": 33.77,
"step": 350,
"token_acc": 0.9525481515405949,
"train_speed(iter/s)": 0.120576
},
{
"epoch": 0.4453155625245002,
"grad_norm": 0.7514587640762329,
"learning_rate": 9.477623658314988e-06,
"loss": 0.13611133098602296,
"memory(GiB)": 33.77,
"step": 355,
"token_acc": 0.9467895891385546,
"train_speed(iter/s)": 0.121066
},
{
"epoch": 0.4515876127009016,
"grad_norm": 0.6806954741477966,
"learning_rate": 9.462909344793028e-06,
"loss": 0.12503905296325685,
"memory(GiB)": 33.77,
"step": 360,
"token_acc": 0.9559137034194594,
"train_speed(iter/s)": 0.121424
},
{
"epoch": 0.4515876127009016,
"eval_loss": 0.23415741324424744,
"eval_runtime": 29.1302,
"eval_samples_per_second": 17.679,
"eval_steps_per_second": 4.428,
"eval_token_acc": 0.9256910003311939,
"step": 360
},
{
"epoch": 0.457859662877303,
"grad_norm": 0.7245866060256958,
"learning_rate": 9.448002413166509e-06,
"loss": 0.11684945821762086,
"memory(GiB)": 33.77,
"step": 365,
"token_acc": 0.9405347148691575,
"train_speed(iter/s)": 0.12022
},
{
"epoch": 0.46413171305370443,
"grad_norm": 0.7275366187095642,
"learning_rate": 9.43290350681505e-06,
"loss": 0.12942945957183838,
"memory(GiB)": 33.77,
"step": 370,
"token_acc": 0.948218290555694,
"train_speed(iter/s)": 0.120576
},
{
"epoch": 0.47040376323010585,
"grad_norm": 0.9837439060211182,
"learning_rate": 9.41761327740385e-06,
"loss": 0.13003346920013428,
"memory(GiB)": 33.77,
"step": 375,
"token_acc": 0.957800478604328,
"train_speed(iter/s)": 0.121019
},
{
"epoch": 0.4766758134065073,
"grad_norm": 0.7750929594039917,
"learning_rate": 9.402132384855573e-06,
"loss": 0.12979254722595215,
"memory(GiB)": 33.77,
"step": 380,
"token_acc": 0.9498875140607425,
"train_speed(iter/s)": 0.121423
},
{
"epoch": 0.4766758134065073,
"eval_loss": 0.2338828444480896,
"eval_runtime": 29.2086,
"eval_samples_per_second": 17.632,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.9252759599381213,
"step": 380
},
{
"epoch": 0.48294786358290864,
"grad_norm": 0.6944046020507812,
"learning_rate": 9.389610842080394e-06,
"loss": 0.12626748085021972,
"memory(GiB)": 33.77,
"step": 385,
"token_acc": 0.9413088592055652,
"train_speed(iter/s)": 0.120293
},
{
"epoch": 0.48921991375931007,
"grad_norm": 0.7166000008583069,
"learning_rate": 9.373788445138972e-06,
"loss": 0.12364100217819214,
"memory(GiB)": 33.77,
"step": 390,
"token_acc": 0.9516249135684273,
"train_speed(iter/s)": 0.120761
},
{
"epoch": 0.4954919639357115,
"grad_norm": 0.6891298294067383,
"learning_rate": 9.357777276529793e-06,
"loss": 0.11418641805648803,
"memory(GiB)": 36.04,
"step": 395,
"token_acc": 0.9607581283065386,
"train_speed(iter/s)": 0.121105
},
{
"epoch": 0.5017640141121129,
"grad_norm": 0.8014844059944153,
"learning_rate": 9.341578027291085e-06,
"loss": 0.12451854944229127,
"memory(GiB)": 36.04,
"step": 400,
"token_acc": 0.9554823405376911,
"train_speed(iter/s)": 0.121475
},
{
"epoch": 0.5017640141121129,
"eval_loss": 0.2350710779428482,
"eval_runtime": 28.9464,
"eval_samples_per_second": 17.792,
"eval_steps_per_second": 4.457,
"eval_token_acc": 0.9253011139013377,
"step": 400
},
{
"epoch": 0.5080360642885143,
"grad_norm": 0.7108286619186401,
"learning_rate": 9.325191396578589e-06,
"loss": 0.12221509218215942,
"memory(GiB)": 36.04,
"step": 405,
"token_acc": 0.9413378371462204,
"train_speed(iter/s)": 0.12034
},
{
"epoch": 0.5143081144649158,
"grad_norm": 0.7911909222602844,
"learning_rate": 9.308618091635382e-06,
"loss": 0.12177256345748902,
"memory(GiB)": 36.04,
"step": 410,
"token_acc": 0.959347706235673,
"train_speed(iter/s)": 0.120651
},
{
"epoch": 0.5205801646413172,
"grad_norm": 1.8789387941360474,
"learning_rate": 9.291858827761359e-06,
"loss": 0.1333709716796875,
"memory(GiB)": 36.04,
"step": 415,
"token_acc": 0.95194391673133,
"train_speed(iter/s)": 0.120981
},
{
"epoch": 0.5268522148177185,
"grad_norm": 0.7984176278114319,
"learning_rate": 9.274914328282359e-06,
"loss": 0.12819453477859497,
"memory(GiB)": 36.04,
"step": 420,
"token_acc": 0.957134979829933,
"train_speed(iter/s)": 0.121312
},
{
"epoch": 0.5268522148177185,
"eval_loss": 0.23512502014636993,
"eval_runtime": 29.1452,
"eval_samples_per_second": 17.67,
"eval_steps_per_second": 4.426,
"eval_token_acc": 0.9250202279787533,
"step": 420
},
{
"epoch": 0.5331242649941199,
"grad_norm": 0.7520173788070679,
"learning_rate": 9.257785324518943e-06,
"loss": 0.12105765342712402,
"memory(GiB)": 36.04,
"step": 425,
"token_acc": 0.9403581723767339,
"train_speed(iter/s)": 0.120363
},
{
"epoch": 0.5393963151705213,
"grad_norm": 0.8471489548683167,
"learning_rate": 9.240472555754835e-06,
"loss": 0.12356100082397461,
"memory(GiB)": 36.04,
"step": 430,
"token_acc": 0.9564814136828489,
"train_speed(iter/s)": 0.120684
},
{
"epoch": 0.5456683653469228,
"grad_norm": 0.7935863733291626,
"learning_rate": 9.222976769205013e-06,
"loss": 0.12740910053253174,
"memory(GiB)": 38.32,
"step": 435,
"token_acc": 0.9569268406943757,
"train_speed(iter/s)": 0.120949
},
{
"epoch": 0.5519404155233242,
"grad_norm": 0.7470819354057312,
"learning_rate": 9.205298719983458e-06,
"loss": 0.12629660367965698,
"memory(GiB)": 38.32,
"step": 440,
"token_acc": 0.9542381848107219,
"train_speed(iter/s)": 0.121266
},
{
"epoch": 0.5519404155233242,
"eval_loss": 0.2329576462507248,
"eval_runtime": 29.0573,
"eval_samples_per_second": 17.724,
"eval_steps_per_second": 4.44,
"eval_token_acc": 0.925007650997145,
"step": 440
},
{
"epoch": 0.5582124656997256,
"grad_norm": 0.8528454899787903,
"learning_rate": 9.187439171070563e-06,
"loss": 0.11683663129806518,
"memory(GiB)": 38.32,
"step": 445,
"token_acc": 0.9422364773256167,
"train_speed(iter/s)": 0.12028
},
{
"epoch": 0.564484515876127,
"grad_norm": 0.6902926564216614,
"learning_rate": 9.173021369887053e-06,
"loss": 0.1320955276489258,
"memory(GiB)": 38.32,
"step": 450,
"token_acc": 0.9549962232889062,
"train_speed(iter/s)": 0.12058
},
{
"epoch": 0.5707565660525284,
"grad_norm": 0.7285463213920593,
"learning_rate": 9.154837069223594e-06,
"loss": 0.12488093376159667,
"memory(GiB)": 38.32,
"step": 455,
"token_acc": 0.9579794738443663,
"train_speed(iter/s)": 0.120802
},
{
"epoch": 0.5770286162289299,
"grad_norm": 0.6829497218132019,
"learning_rate": 9.136473446781624e-06,
"loss": 0.12886552810668944,
"memory(GiB)": 38.32,
"step": 460,
"token_acc": 0.9550450619099832,
"train_speed(iter/s)": 0.121049
},
{
"epoch": 0.5770286162289299,
"eval_loss": 0.2350022941827774,
"eval_runtime": 29.2036,
"eval_samples_per_second": 17.635,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.9253220755373516,
"step": 460
},
{
"epoch": 0.5833006664053313,
"grad_norm": 0.793786346912384,
"learning_rate": 9.11793129513072e-06,
"loss": 0.1309070110321045,
"memory(GiB)": 38.32,
"step": 465,
"token_acc": 0.9386901904304689,
"train_speed(iter/s)": 0.12018
},
{
"epoch": 0.5895727165817326,
"grad_norm": 0.6860336661338806,
"learning_rate": 9.102969570306243e-06,
"loss": 0.13614410161972046,
"memory(GiB)": 38.32,
"step": 470,
"token_acc": 0.950530035335689,
"train_speed(iter/s)": 0.120521
},
{
"epoch": 0.595844766758134,
"grad_norm": 0.7473175525665283,
"learning_rate": 9.084108087927778e-06,
"loss": 0.13468925952911376,
"memory(GiB)": 38.32,
"step": 475,
"token_acc": 0.9512150026413101,
"train_speed(iter/s)": 0.120848
},
{
"epoch": 0.6021168169345354,
"grad_norm": 0.705289900302887,
"learning_rate": 9.065070336416794e-06,
"loss": 0.12688368558883667,
"memory(GiB)": 38.32,
"step": 480,
"token_acc": 0.9514988814317673,
"train_speed(iter/s)": 0.121127
},
{
"epoch": 0.6021168169345354,
"eval_loss": 0.23424042761325836,
"eval_runtime": 28.918,
"eval_samples_per_second": 17.809,
"eval_steps_per_second": 4.461,
"eval_token_acc": 0.9258838473825205,
"step": 480
},
{
"epoch": 0.6083888671109369,
"grad_norm": 0.7309315204620361,
"learning_rate": 9.045857137438114e-06,
"loss": 0.12572396993637086,
"memory(GiB)": 38.32,
"step": 485,
"token_acc": 0.9416618199382905,
"train_speed(iter/s)": 0.120198
},
{
"epoch": 0.6146609172873383,
"grad_norm": 0.7343481779098511,
"learning_rate": 9.02646932022883e-06,
"loss": 0.12929785251617432,
"memory(GiB)": 38.32,
"step": 490,
"token_acc": 0.9553018035624546,
"train_speed(iter/s)": 0.120514
},
{
"epoch": 0.6209329674637397,
"grad_norm": 0.7487764954566956,
"learning_rate": 9.006907721562515e-06,
"loss": 0.12204375267028808,
"memory(GiB)": 38.32,
"step": 495,
"token_acc": 0.9591731423020884,
"train_speed(iter/s)": 0.120819
},
{
"epoch": 0.6272050176401411,
"grad_norm": 0.7962038516998291,
"learning_rate": 8.987173185713113e-06,
"loss": 0.12226212024688721,
"memory(GiB)": 38.32,
"step": 500,
"token_acc": 0.9564000589188393,
"train_speed(iter/s)": 0.121101
},
{
"epoch": 0.6272050176401411,
"eval_loss": 0.23693928122520447,
"eval_runtime": 29.1127,
"eval_samples_per_second": 17.69,
"eval_steps_per_second": 4.431,
"eval_token_acc": 0.9255736151695168,
"step": 500
},
{
"epoch": 0.6334770678165426,
"grad_norm": 0.7448955178260803,
"learning_rate": 8.967266564418485e-06,
"loss": 0.12553646564483642,
"memory(GiB)": 38.32,
"step": 505,
"token_acc": 0.9397639899675178,
"train_speed(iter/s)": 0.120264
},
{
"epoch": 0.639749117992944,
"grad_norm": 0.6869771480560303,
"learning_rate": 8.947188716843668e-06,
"loss": 0.12530720233917236,
"memory(GiB)": 38.32,
"step": 510,
"token_acc": 0.9531347241388641,
"train_speed(iter/s)": 0.120511
},
{
"epoch": 0.6460211681693454,
"grad_norm": 0.7007948756217957,
"learning_rate": 8.926940509543786e-06,
"loss": 0.12557142972946167,
"memory(GiB)": 38.32,
"step": 515,
"token_acc": 0.9570901871809416,
"train_speed(iter/s)": 0.120789
},
{
"epoch": 0.6522932183457467,
"grad_norm": 0.6679887771606445,
"learning_rate": 8.906522816426642e-06,
"loss": 0.11763076782226563,
"memory(GiB)": 38.32,
"step": 520,
"token_acc": 0.9630898229846002,
"train_speed(iter/s)": 0.121093
},
{
"epoch": 0.6522932183457467,
"eval_loss": 0.23494519293308258,
"eval_runtime": 29.1208,
"eval_samples_per_second": 17.685,
"eval_steps_per_second": 4.43,
"eval_token_acc": 0.9250202279787533,
"step": 520
},
{
"epoch": 0.6585652685221481,
"grad_norm": 1.0034900903701782,
"learning_rate": 8.885936518715009e-06,
"loss": 0.12190806865692139,
"memory(GiB)": 38.32,
"step": 525,
"token_acc": 0.9413621144839724,
"train_speed(iter/s)": 0.12028
},
{
"epoch": 0.6648373186985496,
"grad_norm": 0.7202277779579163,
"learning_rate": 8.865182504908593e-06,
"loss": 0.12205361127853394,
"memory(GiB)": 38.32,
"step": 530,
"token_acc": 0.9566966466480848,
"train_speed(iter/s)": 0.120583
},
{
"epoch": 0.671109368874951,
"grad_norm": 0.7922447323799133,
"learning_rate": 8.84426167074569e-06,
"loss": 0.12360981702804566,
"memory(GiB)": 38.32,
"step": 535,
"token_acc": 0.9545589899350843,
"train_speed(iter/s)": 0.120816
},
{
"epoch": 0.6773814190513524,
"grad_norm": 0.7317930459976196,
"learning_rate": 8.823174919164517e-06,
"loss": 0.12647807598114014,
"memory(GiB)": 38.32,
"step": 540,
"token_acc": 0.9562937062937062,
"train_speed(iter/s)": 0.121078
},
{
"epoch": 0.6773814190513524,
"eval_loss": 0.234180748462677,
"eval_runtime": 29.282,
"eval_samples_per_second": 17.588,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.9257161542944103,
"step": 540
},
{
"epoch": 0.6836534692277538,
"grad_norm": 0.7069205045700073,
"learning_rate": 8.801923160264254e-06,
"loss": 0.12029304504394531,
"memory(GiB)": 38.32,
"step": 545,
"token_acc": 0.940614257111556,
"train_speed(iter/s)": 0.120293
},
{
"epoch": 0.6899255194041553,
"grad_norm": 0.7268481850624084,
"learning_rate": 8.78050731126575e-06,
"loss": 0.12312864065170288,
"memory(GiB)": 38.32,
"step": 550,
"token_acc": 0.9642680054543201,
"train_speed(iter/s)": 0.12052
},
{
"epoch": 0.6961975695805567,
"grad_norm": 0.7602020502090454,
"learning_rate": 8.758928296471955e-06,
"loss": 0.12826888561248778,
"memory(GiB)": 38.32,
"step": 555,
"token_acc": 0.9557154631332023,
"train_speed(iter/s)": 0.120805
},
{
"epoch": 0.7024696197569581,
"grad_norm": 0.7089629173278809,
"learning_rate": 8.737187047228004e-06,
"loss": 0.12195276021957398,
"memory(GiB)": 38.32,
"step": 560,
"token_acc": 0.9564936463493431,
"train_speed(iter/s)": 0.121045
},
{
"epoch": 0.7024696197569581,
"eval_loss": 0.23239342868328094,
"eval_runtime": 28.8986,
"eval_samples_per_second": 17.821,
"eval_steps_per_second": 4.464,
"eval_token_acc": 0.9253933450997983,
"step": 560
},
{
"epoch": 0.7087416699333595,
"grad_norm": 0.7802727818489075,
"learning_rate": 8.715284501881039e-06,
"loss": 0.12478115558624267,
"memory(GiB)": 38.32,
"step": 565,
"token_acc": 0.9394610632417493,
"train_speed(iter/s)": 0.120318
},
{
"epoch": 0.7150137201097608,
"grad_norm": 0.692456841468811,
"learning_rate": 8.693221605739697e-06,
"loss": 0.12183520793914795,
"memory(GiB)": 38.32,
"step": 570,
"token_acc": 0.9573177580590813,
"train_speed(iter/s)": 0.120604
},
{
"epoch": 0.7212857702861623,
"grad_norm": 0.783674955368042,
"learning_rate": 8.670999311033328e-06,
"loss": 0.1260378837585449,
"memory(GiB)": 40.76,
"step": 575,
"token_acc": 0.958084188606277,
"train_speed(iter/s)": 0.12084
},
{
"epoch": 0.7275578204625637,
"grad_norm": 0.7612254023551941,
"learning_rate": 8.648618576870877e-06,
"loss": 0.12205030918121337,
"memory(GiB)": 40.76,
"step": 580,
"token_acc": 0.9522026264517598,
"train_speed(iter/s)": 0.121105
},
{
"epoch": 0.7275578204625637,
"eval_loss": 0.23236523568630219,
"eval_runtime": 29.1132,
"eval_samples_per_second": 17.69,
"eval_steps_per_second": 4.431,
"eval_token_acc": 0.9257538852392352,
"step": 580
},
{
"epoch": 0.7338298706389651,
"grad_norm": 0.6874219179153442,
"learning_rate": 8.626080369199499e-06,
"loss": 0.12317302227020263,
"memory(GiB)": 40.76,
"step": 585,
"token_acc": 0.9398445420750253,
"train_speed(iter/s)": 0.120379
},
{
"epoch": 0.7401019208153665,
"grad_norm": 0.7761655449867249,
"learning_rate": 8.603385660762872e-06,
"loss": 0.1282115697860718,
"memory(GiB)": 40.76,
"step": 590,
"token_acc": 0.9520723436322532,
"train_speed(iter/s)": 0.120652
},
{
"epoch": 0.7463739709917679,
"grad_norm": 0.6982787251472473,
"learning_rate": 8.58053543105921e-06,
"loss": 0.1281890869140625,
"memory(GiB)": 40.76,
"step": 595,
"token_acc": 0.953644096279635,
"train_speed(iter/s)": 0.120874
},
{
"epoch": 0.7526460211681694,
"grad_norm": 0.7546159625053406,
"learning_rate": 8.55753066629898e-06,
"loss": 0.12751117944717408,
"memory(GiB)": 40.76,
"step": 600,
"token_acc": 0.9540350393157677,
"train_speed(iter/s)": 0.121112
},
{
"epoch": 0.7526460211681694,
"eval_loss": 0.23002442717552185,
"eval_runtime": 29.1415,
"eval_samples_per_second": 17.672,
"eval_steps_per_second": 4.427,
"eval_token_acc": 0.9261940795955242,
"step": 600
},
{
"epoch": 0.7589180713445708,
"grad_norm": 0.7351900935173035,
"learning_rate": 8.534372359362357e-06,
"loss": 0.1303678870201111,
"memory(GiB)": 40.76,
"step": 605,
"token_acc": 0.9409314468422133,
"train_speed(iter/s)": 0.120394
},
{
"epoch": 0.7651901215209722,
"grad_norm": 0.8682727217674255,
"learning_rate": 8.51106150975635e-06,
"loss": 0.1233241081237793,
"memory(GiB)": 40.76,
"step": 610,
"token_acc": 0.95544310046902,
"train_speed(iter/s)": 0.120622
},
{
"epoch": 0.7714621716973736,
"grad_norm": 0.7701956629753113,
"learning_rate": 8.487599123571675e-06,
"loss": 0.11557955741882324,
"memory(GiB)": 40.76,
"step": 615,
"token_acc": 0.9595501699938976,
"train_speed(iter/s)": 0.120911
},
{
"epoch": 0.777734221873775,
"grad_norm": 0.763529360294342,
"learning_rate": 8.463986213439337e-06,
"loss": 0.12450950145721436,
"memory(GiB)": 40.76,
"step": 620,
"token_acc": 0.9594680177327423,
"train_speed(iter/s)": 0.121153
},
{
"epoch": 0.777734221873775,
"eval_loss": 0.2307971715927124,
"eval_runtime": 28.9628,
"eval_samples_per_second": 17.781,
"eval_steps_per_second": 4.454,
"eval_token_acc": 0.9257371159304242,
"step": 620
},
{
"epoch": 0.7840062720501764,
"grad_norm": 0.743877649307251,
"learning_rate": 8.440223798486913e-06,
"loss": 0.13349132537841796,
"memory(GiB)": 40.76,
"step": 625,
"token_acc": 0.9376087341521601,
"train_speed(iter/s)": 0.120442
},
{
"epoch": 0.7902783222265778,
"grad_norm": 0.7353401184082031,
"learning_rate": 8.416312904294572e-06,
"loss": 0.13025209903717042,
"memory(GiB)": 40.76,
"step": 630,
"token_acc": 0.960784808848038,
"train_speed(iter/s)": 0.120683
},
{
"epoch": 0.7965503724029792,
"grad_norm": 0.7505218982696533,
"learning_rate": 8.397077977170049e-06,
"loss": 0.13371331691741944,
"memory(GiB)": 40.76,
"step": 635,
"token_acc": 0.9515476784822766,
"train_speed(iter/s)": 0.120921
},
{
"epoch": 0.8028224225793806,
"grad_norm": 0.8705490231513977,
"learning_rate": 8.372902425234847e-06,
"loss": 0.12443286180496216,
"memory(GiB)": 40.76,
"step": 640,
"token_acc": 0.9559957659156207,
"train_speed(iter/s)": 0.12116
},
{
"epoch": 0.8028224225793806,
"eval_loss": 0.23105858266353607,
"eval_runtime": 29.1773,
"eval_samples_per_second": 17.651,
"eval_steps_per_second": 4.421,
"eval_token_acc": 0.9266720048966381,
"step": 640
},
{
"epoch": 0.8090944727557821,
"grad_norm": 0.720313549041748,
"learning_rate": 8.348581299634171e-06,
"loss": 0.12005361318588256,
"memory(GiB)": 40.76,
"step": 645,
"token_acc": 0.942721820579713,
"train_speed(iter/s)": 0.120467
},
{
"epoch": 0.8153665229321835,
"grad_norm": 0.7976186275482178,
"learning_rate": 8.324115650062005e-06,
"loss": 0.1226189136505127,
"memory(GiB)": 40.76,
"step": 650,
"token_acc": 0.95751953125,
"train_speed(iter/s)": 0.120671
},
{
"epoch": 0.8216385731085849,
"grad_norm": 0.7320578694343567,
"learning_rate": 8.29950653244996e-06,
"loss": 0.12214083671569824,
"memory(GiB)": 40.76,
"step": 655,
"token_acc": 0.9560769335697722,
"train_speed(iter/s)": 0.120871
},
{
"epoch": 0.8279106232849863,
"grad_norm": 0.714158833026886,
"learning_rate": 8.27475500892169e-06,
"loss": 0.13046940565109252,
"memory(GiB)": 40.76,
"step": 660,
"token_acc": 0.9564310899892687,
"train_speed(iter/s)": 0.121118
},
{
"epoch": 0.8279106232849863,
"eval_loss": 0.2298216074705124,
"eval_runtime": 29.0928,
"eval_samples_per_second": 17.702,
"eval_steps_per_second": 4.434,
"eval_token_acc": 0.9264120806100674,
"step": 660
},
{
"epoch": 0.8341826734613876,
"grad_norm": 0.70967036485672,
"learning_rate": 8.249862147747062e-06,
"loss": 0.12797050476074218,
"memory(GiB)": 40.76,
"step": 665,
"token_acc": 0.940926979466161,
"train_speed(iter/s)": 0.120422
},
{
"epoch": 0.8404547236377891,
"grad_norm": 0.6546662449836731,
"learning_rate": 8.224829023296032e-06,
"loss": 0.12179737091064453,
"memory(GiB)": 40.76,
"step": 670,
"token_acc": 0.9526879044300647,
"train_speed(iter/s)": 0.12064
},
{
"epoch": 0.8467267738141905,
"grad_norm": 0.7337839007377625,
"learning_rate": 8.199656715992292e-06,
"loss": 0.13117530345916747,
"memory(GiB)": 40.76,
"step": 675,
"token_acc": 0.9478685921294229,
"train_speed(iter/s)": 0.120857
},
{
"epoch": 0.8529988239905919,
"grad_norm": 0.7467445731163025,
"learning_rate": 8.179419388376196e-06,
"loss": 0.13929787874221802,
"memory(GiB)": 40.76,
"step": 680,
"token_acc": 0.946326665465249,
"train_speed(iter/s)": 0.121055
},
{
"epoch": 0.8529988239905919,
"eval_loss": 0.22663576900959015,
"eval_runtime": 29.1409,
"eval_samples_per_second": 17.673,
"eval_steps_per_second": 4.427,
"eval_token_acc": 0.9265671967165693,
"step": 680
},
{
"epoch": 0.8592708741669933,
"grad_norm": 0.6901698708534241,
"learning_rate": 8.153999293750005e-06,
"loss": 0.1212563157081604,
"memory(GiB)": 40.76,
"step": 685,
"token_acc": 0.9411858718235576,
"train_speed(iter/s)": 0.120382
},
{
"epoch": 0.8655429243433947,
"grad_norm": 0.7482547163963318,
"learning_rate": 8.128443073265364e-06,
"loss": 0.13035836219787597,
"memory(GiB)": 40.76,
"step": 690,
"token_acc": 0.9523483030510799,
"train_speed(iter/s)": 0.120598
},
{
"epoch": 0.8718149745197962,
"grad_norm": 0.7634561657905579,
"learning_rate": 8.102751829922664e-06,
"loss": 0.13618214130401612,
"memory(GiB)": 40.76,
"step": 695,
"token_acc": 0.9506073092564165,
"train_speed(iter/s)": 0.1208
},
{
"epoch": 0.8780870246961976,
"grad_norm": 0.743306040763855,
"learning_rate": 8.082102363728494e-06,
"loss": 0.12926363945007324,
"memory(GiB)": 40.76,
"step": 700,
"token_acc": 0.9271781534460338,
"train_speed(iter/s)": 0.121029
},
{
"epoch": 0.8780870246961976,
"eval_loss": 0.22914662957191467,
"eval_runtime": 28.9342,
"eval_samples_per_second": 17.799,
"eval_steps_per_second": 4.458,
"eval_token_acc": 0.9267055435142602,
"step": 700
},
{
"epoch": 0.884359074872599,
"grad_norm": 0.774067759513855,
"learning_rate": 8.056170877373277e-06,
"loss": 0.12883291244506836,
"memory(GiB)": 40.76,
"step": 705,
"token_acc": 0.941404062515909,
"train_speed(iter/s)": 0.120445
},
{
"epoch": 0.8906311250490004,
"grad_norm": 0.731858491897583,
"learning_rate": 8.030107487410766e-06,
"loss": 0.1272268772125244,
"memory(GiB)": 40.76,
"step": 710,
"token_acc": 0.9559863699726366,
"train_speed(iter/s)": 0.120655
},
{
"epoch": 0.8969031752254017,
"grad_norm": 0.7816020846366882,
"learning_rate": 8.003913318730662e-06,
"loss": 0.12550874948501586,
"memory(GiB)": 40.76,
"step": 715,
"token_acc": 0.9573273273273273,
"train_speed(iter/s)": 0.120859
},
{
"epoch": 0.9031752254018032,
"grad_norm": 0.7145897150039673,
"learning_rate": 7.97758950186705e-06,
"loss": 0.11747034788131713,
"memory(GiB)": 40.76,
"step": 720,
"token_acc": 0.9536420703541395,
"train_speed(iter/s)": 0.121052
},
{
"epoch": 0.9031752254018032,
"eval_loss": 0.2287711501121521,
"eval_runtime": 28.9287,
"eval_samples_per_second": 17.802,
"eval_steps_per_second": 4.459,
"eval_token_acc": 0.9270073910728585,
"step": 720
},
{
"epoch": 0.9094472755782046,
"grad_norm": 0.7315598130226135,
"learning_rate": 7.951137172949595e-06,
"loss": 0.1277442455291748,
"memory(GiB)": 40.76,
"step": 725,
"token_acc": 0.9404945141684427,
"train_speed(iter/s)": 0.12046
},
{
"epoch": 0.915719325754606,
"grad_norm": 0.7869780659675598,
"learning_rate": 7.924557473654516e-06,
"loss": 0.13705768585205078,
"memory(GiB)": 40.77,
"step": 730,
"token_acc": 0.9484856989768581,
"train_speed(iter/s)": 0.120675
},
{
"epoch": 0.9219913759310074,
"grad_norm": 0.738673985004425,
"learning_rate": 7.897851551155306e-06,
"loss": 0.12492038011550903,
"memory(GiB)": 40.77,
"step": 735,
"token_acc": 0.9554785841007012,
"train_speed(iter/s)": 0.120899
},
{
"epoch": 0.9282634261074089,
"grad_norm": 0.6920596361160278,
"learning_rate": 7.871020558073217e-06,
"loss": 0.12350271940231324,
"memory(GiB)": 40.77,
"step": 740,
"token_acc": 0.9585155697561742,
"train_speed(iter/s)": 0.121065
},
{
"epoch": 0.9282634261074089,
"eval_loss": 0.22647298872470856,
"eval_runtime": 29.274,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.9271583148521576,
"step": 740
},
{
"epoch": 0.9345354762838103,
"grad_norm": 0.7176641821861267,
"learning_rate": 7.849466490796728e-06,
"loss": 0.12098994255065917,
"memory(GiB)": 40.77,
"step": 745,
"token_acc": 0.9415065810170385,
"train_speed(iter/s)": 0.120467
},
{
"epoch": 0.9408075264602117,
"grad_norm": 0.7007727026939392,
"learning_rate": 7.822413292469593e-06,
"loss": 0.12603325843811036,
"memory(GiB)": 40.77,
"step": 750,
"token_acc": 0.955721036803666,
"train_speed(iter/s)": 0.120684
},
{
"epoch": 0.9470795766366131,
"grad_norm": 0.7308924198150635,
"learning_rate": 7.79523827945686e-06,
"loss": 0.1311476469039917,
"memory(GiB)": 40.77,
"step": 755,
"token_acc": 0.9497907949790795,
"train_speed(iter/s)": 0.120874
},
{
"epoch": 0.9533516268130146,
"grad_norm": 0.701979398727417,
"learning_rate": 7.767942624625625e-06,
"loss": 0.12925295829772948,
"memory(GiB)": 40.77,
"step": 760,
"token_acc": 0.9502816180235535,
"train_speed(iter/s)": 0.121076
},
{
"epoch": 0.9533516268130146,
"eval_loss": 0.2251773178577423,
"eval_runtime": 29.2786,
"eval_samples_per_second": 17.59,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.9271918534697796,
"step": 760
}
],
"logging_steps": 5,
"max_steps": 2391,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.199964448180142e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}