6924 lines
197 KiB
JSON
6924 lines
197 KiB
JSON
{
|
|
"best_global_step": 1860,
|
|
"best_metric": 0.1756638,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v30-20250504-002959/checkpoint-1860",
|
|
"epoch": 2.997061180870959,
|
|
"eval_steps": 20,
|
|
"global_step": 2805,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0010686615014694097,
|
|
"grad_norm": 2.5240726470947266,
|
|
"learning_rate": 9.999996864014995e-06,
|
|
"loss": 0.36646389961242676,
|
|
"memory(GiB)": 28.83,
|
|
"step": 1,
|
|
"token_acc": 0.898885535175296,
|
|
"train_speed(iter/s)": 0.068736
|
|
},
|
|
{
|
|
"epoch": 0.0053433075073470475,
|
|
"grad_norm": 2.2844929695129395,
|
|
"learning_rate": 9.99992160057155e-06,
|
|
"loss": 0.32597002387046814,
|
|
"memory(GiB)": 28.87,
|
|
"step": 5,
|
|
"token_acc": 0.8913895678642254,
|
|
"train_speed(iter/s)": 0.125487
|
|
},
|
|
{
|
|
"epoch": 0.010686615014694095,
|
|
"grad_norm": 1.130465030670166,
|
|
"learning_rate": 9.999686404744782e-06,
|
|
"loss": 0.27506489753723146,
|
|
"memory(GiB)": 28.87,
|
|
"step": 10,
|
|
"token_acc": 0.9082294264339152,
|
|
"train_speed(iter/s)": 0.138754
|
|
},
|
|
{
|
|
"epoch": 0.016029922522041145,
|
|
"grad_norm": 1.0269818305969238,
|
|
"learning_rate": 9.999294419895389e-06,
|
|
"loss": 0.27271237373352053,
|
|
"memory(GiB)": 28.88,
|
|
"step": 15,
|
|
"token_acc": 0.9203989829845491,
|
|
"train_speed(iter/s)": 0.144155
|
|
},
|
|
{
|
|
"epoch": 0.02137323002938819,
|
|
"grad_norm": 0.9383953213691711,
|
|
"learning_rate": 9.998745658315924e-06,
|
|
"loss": 0.2601430892944336,
|
|
"memory(GiB)": 28.88,
|
|
"step": 20,
|
|
"token_acc": 0.9118547099521022,
|
|
"train_speed(iter/s)": 0.144659
|
|
},
|
|
{
|
|
"epoch": 0.02137323002938819,
|
|
"eval_loss": 0.26998990774154663,
|
|
"eval_runtime": 39.5781,
|
|
"eval_samples_per_second": 15.261,
|
|
"eval_steps_per_second": 3.815,
|
|
"eval_token_acc": 0.9141692937871282,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.026716537536735238,
|
|
"grad_norm": 0.8839966058731079,
|
|
"learning_rate": 9.998040137215423e-06,
|
|
"loss": 0.2755439758300781,
|
|
"memory(GiB)": 28.88,
|
|
"step": 25,
|
|
"token_acc": 0.9063694193980396,
|
|
"train_speed(iter/s)": 0.11355
|
|
},
|
|
{
|
|
"epoch": 0.03205984504408229,
|
|
"grad_norm": 1.0895779132843018,
|
|
"learning_rate": 9.99717787871887e-06,
|
|
"loss": 0.2630652666091919,
|
|
"memory(GiB)": 28.88,
|
|
"step": 30,
|
|
"token_acc": 0.9156098058549522,
|
|
"train_speed(iter/s)": 0.118275
|
|
},
|
|
{
|
|
"epoch": 0.037403152551429335,
|
|
"grad_norm": 0.9162330031394958,
|
|
"learning_rate": 9.99615890986649e-06,
|
|
"loss": 0.25594387054443357,
|
|
"memory(GiB)": 28.88,
|
|
"step": 35,
|
|
"token_acc": 0.9182685889734348,
|
|
"train_speed(iter/s)": 0.12251
|
|
},
|
|
{
|
|
"epoch": 0.04274646005877638,
|
|
"grad_norm": 0.7786328792572021,
|
|
"learning_rate": 9.994983262612916e-06,
|
|
"loss": 0.26515631675720214,
|
|
"memory(GiB)": 28.88,
|
|
"step": 40,
|
|
"token_acc": 0.9209860093271153,
|
|
"train_speed(iter/s)": 0.125479
|
|
},
|
|
{
|
|
"epoch": 0.04274646005877638,
|
|
"eval_loss": 0.2504235804080963,
|
|
"eval_runtime": 39.1128,
|
|
"eval_samples_per_second": 15.443,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9184799159321453,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.04808976756612343,
|
|
"grad_norm": 0.9364318251609802,
|
|
"learning_rate": 9.993650973826177e-06,
|
|
"loss": 0.2555875062942505,
|
|
"memory(GiB)": 28.88,
|
|
"step": 45,
|
|
"token_acc": 0.9150358901081727,
|
|
"train_speed(iter/s)": 0.111936
|
|
},
|
|
{
|
|
"epoch": 0.053433075073470476,
|
|
"grad_norm": 0.8948811888694763,
|
|
"learning_rate": 9.992162085286543e-06,
|
|
"loss": 0.24805829524993897,
|
|
"memory(GiB)": 28.88,
|
|
"step": 50,
|
|
"token_acc": 0.9186546283208815,
|
|
"train_speed(iter/s)": 0.115376
|
|
},
|
|
{
|
|
"epoch": 0.05877638258081753,
|
|
"grad_norm": 0.9033949375152588,
|
|
"learning_rate": 9.990516643685222e-06,
|
|
"loss": 0.24738097190856934,
|
|
"memory(GiB)": 28.88,
|
|
"step": 55,
|
|
"token_acc": 0.9198283024093049,
|
|
"train_speed(iter/s)": 0.117724
|
|
},
|
|
{
|
|
"epoch": 0.06411969008816458,
|
|
"grad_norm": 0.8253108859062195,
|
|
"learning_rate": 9.988714700622882e-06,
|
|
"loss": 0.24202115535736085,
|
|
"memory(GiB)": 28.88,
|
|
"step": 60,
|
|
"token_acc": 0.9179127157792366,
|
|
"train_speed(iter/s)": 0.12054
|
|
},
|
|
{
|
|
"epoch": 0.06411969008816458,
|
|
"eval_loss": 0.24108153581619263,
|
|
"eval_runtime": 39.139,
|
|
"eval_samples_per_second": 15.432,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9215252310793712,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.06946299759551162,
|
|
"grad_norm": 0.9196599125862122,
|
|
"learning_rate": 9.986756312608048e-06,
|
|
"loss": 0.2574702262878418,
|
|
"memory(GiB)": 28.88,
|
|
"step": 65,
|
|
"token_acc": 0.9133585236043591,
|
|
"train_speed(iter/s)": 0.112242
|
|
},
|
|
{
|
|
"epoch": 0.07480630510285867,
|
|
"grad_norm": 0.8361815810203552,
|
|
"learning_rate": 9.98464154105532e-06,
|
|
"loss": 0.24776794910430908,
|
|
"memory(GiB)": 28.88,
|
|
"step": 70,
|
|
"token_acc": 0.9202816026503778,
|
|
"train_speed(iter/s)": 0.114497
|
|
},
|
|
{
|
|
"epoch": 0.08014961261020571,
|
|
"grad_norm": 0.794195294380188,
|
|
"learning_rate": 9.982370452283451e-06,
|
|
"loss": 0.23735420703887938,
|
|
"memory(GiB)": 28.88,
|
|
"step": 75,
|
|
"token_acc": 0.9199817957916153,
|
|
"train_speed(iter/s)": 0.116585
|
|
},
|
|
{
|
|
"epoch": 0.08549292011755276,
|
|
"grad_norm": 0.9916167855262756,
|
|
"learning_rate": 9.979943117513265e-06,
|
|
"loss": 0.23866429328918456,
|
|
"memory(GiB)": 28.88,
|
|
"step": 80,
|
|
"token_acc": 0.9108762520703525,
|
|
"train_speed(iter/s)": 0.118446
|
|
},
|
|
{
|
|
"epoch": 0.08549292011755276,
|
|
"eval_loss": 0.2345978170633316,
|
|
"eval_runtime": 39.1184,
|
|
"eval_samples_per_second": 15.44,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9226018143215595,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.09083622762489982,
|
|
"grad_norm": 0.8545597195625305,
|
|
"learning_rate": 9.977359612865424e-06,
|
|
"loss": 0.24537258148193358,
|
|
"memory(GiB)": 28.88,
|
|
"step": 85,
|
|
"token_acc": 0.9151461409910985,
|
|
"train_speed(iter/s)": 0.112121
|
|
},
|
|
{
|
|
"epoch": 0.09617953513224686,
|
|
"grad_norm": 0.8722995519638062,
|
|
"learning_rate": 9.974620019358046e-06,
|
|
"loss": 0.2280275344848633,
|
|
"memory(GiB)": 28.88,
|
|
"step": 90,
|
|
"token_acc": 0.9271924248000804,
|
|
"train_speed(iter/s)": 0.113659
|
|
},
|
|
{
|
|
"epoch": 0.10152284263959391,
|
|
"grad_norm": 0.8093506097793579,
|
|
"learning_rate": 9.971724422904154e-06,
|
|
"loss": 0.22236292362213134,
|
|
"memory(GiB)": 28.88,
|
|
"step": 95,
|
|
"token_acc": 0.9146961880194898,
|
|
"train_speed(iter/s)": 0.115242
|
|
},
|
|
{
|
|
"epoch": 0.10686615014694095,
|
|
"grad_norm": 0.8567050695419312,
|
|
"learning_rate": 9.968672914308995e-06,
|
|
"loss": 0.2444392681121826,
|
|
"memory(GiB)": 28.88,
|
|
"step": 100,
|
|
"token_acc": 0.9146307217073856,
|
|
"train_speed(iter/s)": 0.116624
|
|
},
|
|
{
|
|
"epoch": 0.10686615014694095,
|
|
"eval_loss": 0.2300056368112564,
|
|
"eval_runtime": 39.1847,
|
|
"eval_samples_per_second": 15.414,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9230350211241931,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.112209457654288,
|
|
"grad_norm": 0.812283992767334,
|
|
"learning_rate": 9.965465589267176e-06,
|
|
"loss": 0.23651769161224365,
|
|
"memory(GiB)": 28.88,
|
|
"step": 105,
|
|
"token_acc": 0.9196928564126446,
|
|
"train_speed(iter/s)": 0.111612
|
|
},
|
|
{
|
|
"epoch": 0.11755276516163506,
|
|
"grad_norm": 0.7877810597419739,
|
|
"learning_rate": 9.96210254835968e-06,
|
|
"loss": 0.24004974365234374,
|
|
"memory(GiB)": 28.88,
|
|
"step": 110,
|
|
"token_acc": 0.9218568537014301,
|
|
"train_speed(iter/s)": 0.112971
|
|
},
|
|
{
|
|
"epoch": 0.1228960726689821,
|
|
"grad_norm": 0.7748284935951233,
|
|
"learning_rate": 9.9585838970507e-06,
|
|
"loss": 0.22294034957885742,
|
|
"memory(GiB)": 28.88,
|
|
"step": 115,
|
|
"token_acc": 0.9274852331280633,
|
|
"train_speed(iter/s)": 0.114143
|
|
},
|
|
{
|
|
"epoch": 0.12823938017632916,
|
|
"grad_norm": 0.8219108581542969,
|
|
"learning_rate": 9.954909745684339e-06,
|
|
"loss": 0.22394099235534667,
|
|
"memory(GiB)": 28.88,
|
|
"step": 120,
|
|
"token_acc": 0.9172609252148474,
|
|
"train_speed(iter/s)": 0.115495
|
|
},
|
|
{
|
|
"epoch": 0.12823938017632916,
|
|
"eval_loss": 0.227636456489563,
|
|
"eval_runtime": 39.1617,
|
|
"eval_samples_per_second": 15.423,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9239829290784705,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.1335826876836762,
|
|
"grad_norm": 0.8432355523109436,
|
|
"learning_rate": 9.951080209481138e-06,
|
|
"loss": 0.2368373155593872,
|
|
"memory(GiB)": 28.88,
|
|
"step": 125,
|
|
"token_acc": 0.9205502826063773,
|
|
"train_speed(iter/s)": 0.111608
|
|
},
|
|
{
|
|
"epoch": 0.13892599519102325,
|
|
"grad_norm": 0.916549801826477,
|
|
"learning_rate": 9.947095408534483e-06,
|
|
"loss": 0.25100035667419435,
|
|
"memory(GiB)": 28.88,
|
|
"step": 130,
|
|
"token_acc": 0.9245168060115764,
|
|
"train_speed(iter/s)": 0.113003
|
|
},
|
|
{
|
|
"epoch": 0.1442693026983703,
|
|
"grad_norm": 0.81369948387146,
|
|
"learning_rate": 9.94295546780682e-06,
|
|
"loss": 0.2301029920578003,
|
|
"memory(GiB)": 28.88,
|
|
"step": 135,
|
|
"token_acc": 0.9171662305832478,
|
|
"train_speed(iter/s)": 0.113963
|
|
},
|
|
{
|
|
"epoch": 0.14961261020571734,
|
|
"grad_norm": 0.9230105876922607,
|
|
"learning_rate": 9.93866051712574e-06,
|
|
"loss": 0.23561997413635255,
|
|
"memory(GiB)": 28.88,
|
|
"step": 140,
|
|
"token_acc": 0.9211832729821282,
|
|
"train_speed(iter/s)": 0.115079
|
|
},
|
|
{
|
|
"epoch": 0.14961261020571734,
|
|
"eval_loss": 0.22445102035999298,
|
|
"eval_runtime": 39.1558,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9246691972806622,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.15495591771306438,
|
|
"grad_norm": 0.7951855063438416,
|
|
"learning_rate": 9.934210691179918e-06,
|
|
"loss": 0.241453218460083,
|
|
"memory(GiB)": 28.88,
|
|
"step": 145,
|
|
"token_acc": 0.9198347485752214,
|
|
"train_speed(iter/s)": 0.111598
|
|
},
|
|
{
|
|
"epoch": 0.16029922522041143,
|
|
"grad_norm": 0.8992099165916443,
|
|
"learning_rate": 9.929606129514875e-06,
|
|
"loss": 0.24097092151641847,
|
|
"memory(GiB)": 28.88,
|
|
"step": 150,
|
|
"token_acc": 0.9258129774449712,
|
|
"train_speed(iter/s)": 0.112709
|
|
},
|
|
{
|
|
"epoch": 0.16564253272775847,
|
|
"grad_norm": 0.8297993540763855,
|
|
"learning_rate": 9.924846976528618e-06,
|
|
"loss": 0.2204681158065796,
|
|
"memory(GiB)": 28.88,
|
|
"step": 155,
|
|
"token_acc": 0.9230016313213704,
|
|
"train_speed(iter/s)": 0.113601
|
|
},
|
|
{
|
|
"epoch": 0.17098584023510552,
|
|
"grad_norm": 0.8261107802391052,
|
|
"learning_rate": 9.919933381467088e-06,
|
|
"loss": 0.22699880599975586,
|
|
"memory(GiB)": 28.88,
|
|
"step": 160,
|
|
"token_acc": 0.928885791828651,
|
|
"train_speed(iter/s)": 0.114559
|
|
},
|
|
{
|
|
"epoch": 0.17098584023510552,
|
|
"eval_loss": 0.22084873914718628,
|
|
"eval_runtime": 39.2244,
|
|
"eval_samples_per_second": 15.399,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9256943104076862,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.1763291477424526,
|
|
"grad_norm": 0.8418117761611938,
|
|
"learning_rate": 9.91486549841951e-06,
|
|
"loss": 0.22405543327331542,
|
|
"memory(GiB)": 28.88,
|
|
"step": 165,
|
|
"token_acc": 0.9225325105463167,
|
|
"train_speed(iter/s)": 0.111753
|
|
},
|
|
{
|
|
"epoch": 0.18167245524979964,
|
|
"grad_norm": 0.8489673733711243,
|
|
"learning_rate": 9.909643486313533e-06,
|
|
"loss": 0.23412735462188722,
|
|
"memory(GiB)": 28.88,
|
|
"step": 170,
|
|
"token_acc": 0.9189576348674184,
|
|
"train_speed(iter/s)": 0.112758
|
|
},
|
|
{
|
|
"epoch": 0.18701576275714668,
|
|
"grad_norm": 0.7228992581367493,
|
|
"learning_rate": 9.904267508910269e-06,
|
|
"loss": 0.2174379587173462,
|
|
"memory(GiB)": 28.88,
|
|
"step": 175,
|
|
"token_acc": 0.9282256367080615,
|
|
"train_speed(iter/s)": 0.113726
|
|
},
|
|
{
|
|
"epoch": 0.19235907026449373,
|
|
"grad_norm": 0.8382998704910278,
|
|
"learning_rate": 9.898737734799134e-06,
|
|
"loss": 0.22646231651306153,
|
|
"memory(GiB)": 28.88,
|
|
"step": 180,
|
|
"token_acc": 0.9216335902903067,
|
|
"train_speed(iter/s)": 0.114665
|
|
},
|
|
{
|
|
"epoch": 0.19235907026449373,
|
|
"eval_loss": 0.21828743815422058,
|
|
"eval_runtime": 39.1773,
|
|
"eval_samples_per_second": 15.417,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9263333976709773,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.19770237777184077,
|
|
"grad_norm": 0.7243833541870117,
|
|
"learning_rate": 9.89305433739258e-06,
|
|
"loss": 0.21673502922058105,
|
|
"memory(GiB)": 28.88,
|
|
"step": 185,
|
|
"token_acc": 0.9237459546925566,
|
|
"train_speed(iter/s)": 0.112007
|
|
},
|
|
{
|
|
"epoch": 0.20304568527918782,
|
|
"grad_norm": 0.891444981098175,
|
|
"learning_rate": 9.887217494920655e-06,
|
|
"loss": 0.2255467414855957,
|
|
"memory(GiB)": 28.88,
|
|
"step": 190,
|
|
"token_acc": 0.9196933155753932,
|
|
"train_speed(iter/s)": 0.112921
|
|
},
|
|
{
|
|
"epoch": 0.20838899278653486,
|
|
"grad_norm": 0.8923943042755127,
|
|
"learning_rate": 9.881227390425404e-06,
|
|
"loss": 0.23066916465759277,
|
|
"memory(GiB)": 28.88,
|
|
"step": 195,
|
|
"token_acc": 0.9164964514453868,
|
|
"train_speed(iter/s)": 0.113717
|
|
},
|
|
{
|
|
"epoch": 0.2137323002938819,
|
|
"grad_norm": 0.8573595881462097,
|
|
"learning_rate": 9.875084211755127e-06,
|
|
"loss": 0.22467041015625,
|
|
"memory(GiB)": 28.88,
|
|
"step": 200,
|
|
"token_acc": 0.9248140544542792,
|
|
"train_speed(iter/s)": 0.114571
|
|
},
|
|
{
|
|
"epoch": 0.2137323002938819,
|
|
"eval_loss": 0.2157127410173416,
|
|
"eval_runtime": 39.2065,
|
|
"eval_samples_per_second": 15.406,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9273456432692101,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.21907560780122895,
|
|
"grad_norm": 0.776364266872406,
|
|
"learning_rate": 9.868788151558513e-06,
|
|
"loss": 0.2205509662628174,
|
|
"memory(GiB)": 28.88,
|
|
"step": 205,
|
|
"token_acc": 0.9281972298447889,
|
|
"train_speed(iter/s)": 0.112238
|
|
},
|
|
{
|
|
"epoch": 0.224418915308576,
|
|
"grad_norm": 0.7892180681228638,
|
|
"learning_rate": 9.862339407278564e-06,
|
|
"loss": 0.2204576015472412,
|
|
"memory(GiB)": 28.88,
|
|
"step": 210,
|
|
"token_acc": 0.9257216072890344,
|
|
"train_speed(iter/s)": 0.113008
|
|
},
|
|
{
|
|
"epoch": 0.22976222281592307,
|
|
"grad_norm": 0.6594001054763794,
|
|
"learning_rate": 9.855738181146427e-06,
|
|
"loss": 0.2142805814743042,
|
|
"memory(GiB)": 28.88,
|
|
"step": 215,
|
|
"token_acc": 0.9388168011006501,
|
|
"train_speed(iter/s)": 0.113727
|
|
},
|
|
{
|
|
"epoch": 0.2351055303232701,
|
|
"grad_norm": 0.8227058053016663,
|
|
"learning_rate": 9.848984680175049e-06,
|
|
"loss": 0.2226466417312622,
|
|
"memory(GiB)": 28.88,
|
|
"step": 220,
|
|
"token_acc": 0.9279371197521067,
|
|
"train_speed(iter/s)": 0.114316
|
|
},
|
|
{
|
|
"epoch": 0.2351055303232701,
|
|
"eval_loss": 0.21347786486148834,
|
|
"eval_runtime": 39.2014,
|
|
"eval_samples_per_second": 15.408,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9271354736322889,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.24044883783061716,
|
|
"grad_norm": 0.7295470237731934,
|
|
"learning_rate": 9.84207911615267e-06,
|
|
"loss": 0.21721224784851073,
|
|
"memory(GiB)": 28.88,
|
|
"step": 225,
|
|
"token_acc": 0.9288023754536456,
|
|
"train_speed(iter/s)": 0.112132
|
|
},
|
|
{
|
|
"epoch": 0.2457921453379642,
|
|
"grad_norm": 0.8412714600563049,
|
|
"learning_rate": 9.835021705636201e-06,
|
|
"loss": 0.22250523567199706,
|
|
"memory(GiB)": 28.88,
|
|
"step": 230,
|
|
"token_acc": 0.9202628419788969,
|
|
"train_speed(iter/s)": 0.112833
|
|
},
|
|
{
|
|
"epoch": 0.25113545284531125,
|
|
"grad_norm": 0.8208211064338684,
|
|
"learning_rate": 9.827812669944423e-06,
|
|
"loss": 0.2162861108779907,
|
|
"memory(GiB)": 28.88,
|
|
"step": 235,
|
|
"token_acc": 0.9284959688629414,
|
|
"train_speed(iter/s)": 0.113507
|
|
},
|
|
{
|
|
"epoch": 0.2564787603526583,
|
|
"grad_norm": 0.849270224571228,
|
|
"learning_rate": 9.82045223515105e-06,
|
|
"loss": 0.23803000450134276,
|
|
"memory(GiB)": 28.88,
|
|
"step": 240,
|
|
"token_acc": 0.9182260788924586,
|
|
"train_speed(iter/s)": 0.114162
|
|
},
|
|
{
|
|
"epoch": 0.2564787603526583,
|
|
"eval_loss": 0.21100114285945892,
|
|
"eval_runtime": 39.173,
|
|
"eval_samples_per_second": 15.419,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.9279032361834909,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.26182206786000534,
|
|
"grad_norm": 0.787993311882019,
|
|
"learning_rate": 9.812940632077629e-06,
|
|
"loss": 0.22567553520202638,
|
|
"memory(GiB)": 28.88,
|
|
"step": 245,
|
|
"token_acc": 0.9222485073786189,
|
|
"train_speed(iter/s)": 0.112214
|
|
},
|
|
{
|
|
"epoch": 0.2671653753673524,
|
|
"grad_norm": 0.8460186123847961,
|
|
"learning_rate": 9.805278096286318e-06,
|
|
"loss": 0.23234963417053223,
|
|
"memory(GiB)": 28.88,
|
|
"step": 250,
|
|
"token_acc": 0.9275632677484074,
|
|
"train_speed(iter/s)": 0.112833
|
|
},
|
|
{
|
|
"epoch": 0.2725086828746994,
|
|
"grad_norm": 0.8697513937950134,
|
|
"learning_rate": 9.797464868072489e-06,
|
|
"loss": 0.21605942249298096,
|
|
"memory(GiB)": 28.88,
|
|
"step": 255,
|
|
"token_acc": 0.9228022233953358,
|
|
"train_speed(iter/s)": 0.11345
|
|
},
|
|
{
|
|
"epoch": 0.2778519903820465,
|
|
"grad_norm": 0.7353094816207886,
|
|
"learning_rate": 9.789501192457188e-06,
|
|
"loss": 0.20451416969299316,
|
|
"memory(GiB)": 28.88,
|
|
"step": 260,
|
|
"token_acc": 0.9241443920719032,
|
|
"train_speed(iter/s)": 0.11402
|
|
},
|
|
{
|
|
"epoch": 0.2778519903820465,
|
|
"eval_loss": 0.20931538939476013,
|
|
"eval_runtime": 39.2202,
|
|
"eval_samples_per_second": 15.4,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9285251667417272,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.2831952978893935,
|
|
"grad_norm": 0.7424066066741943,
|
|
"learning_rate": 9.781387319179465e-06,
|
|
"loss": 0.21903395652770996,
|
|
"memory(GiB)": 28.88,
|
|
"step": 265,
|
|
"token_acc": 0.9232737136934036,
|
|
"train_speed(iter/s)": 0.112272
|
|
},
|
|
{
|
|
"epoch": 0.2885386053967406,
|
|
"grad_norm": 0.7803475856781006,
|
|
"learning_rate": 9.773123502688532e-06,
|
|
"loss": 0.20421533584594725,
|
|
"memory(GiB)": 28.88,
|
|
"step": 270,
|
|
"token_acc": 0.9321742863275381,
|
|
"train_speed(iter/s)": 0.112863
|
|
},
|
|
{
|
|
"epoch": 0.2938819129040876,
|
|
"grad_norm": 0.9278019666671753,
|
|
"learning_rate": 9.764710002135784e-06,
|
|
"loss": 0.2212052345275879,
|
|
"memory(GiB)": 28.88,
|
|
"step": 275,
|
|
"token_acc": 0.9205871656693225,
|
|
"train_speed(iter/s)": 0.113471
|
|
},
|
|
{
|
|
"epoch": 0.2992252204114347,
|
|
"grad_norm": 0.8120574355125427,
|
|
"learning_rate": 9.756147081366673e-06,
|
|
"loss": 0.22279133796691894,
|
|
"memory(GiB)": 28.88,
|
|
"step": 280,
|
|
"token_acc": 0.9252955870108243,
|
|
"train_speed(iter/s)": 0.114018
|
|
},
|
|
{
|
|
"epoch": 0.2992252204114347,
|
|
"eval_loss": 0.20953305065631866,
|
|
"eval_runtime": 39.1588,
|
|
"eval_samples_per_second": 15.424,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9285980827382101,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.30456852791878175,
|
|
"grad_norm": 0.7851824164390564,
|
|
"learning_rate": 9.747435008912438e-06,
|
|
"loss": 0.2276832103729248,
|
|
"memory(GiB)": 28.88,
|
|
"step": 285,
|
|
"token_acc": 0.9250812487968384,
|
|
"train_speed(iter/s)": 0.11243
|
|
},
|
|
{
|
|
"epoch": 0.30991183542612877,
|
|
"grad_norm": 0.7576037049293518,
|
|
"learning_rate": 9.73857405798168e-06,
|
|
"loss": 0.21578505039215087,
|
|
"memory(GiB)": 28.88,
|
|
"step": 290,
|
|
"token_acc": 0.9281283843654622,
|
|
"train_speed(iter/s)": 0.113065
|
|
},
|
|
{
|
|
"epoch": 0.31525514293347584,
|
|
"grad_norm": 0.7560254335403442,
|
|
"learning_rate": 9.729564506451791e-06,
|
|
"loss": 0.2222808837890625,
|
|
"memory(GiB)": 28.88,
|
|
"step": 295,
|
|
"token_acc": 0.9222295127257658,
|
|
"train_speed(iter/s)": 0.11355
|
|
},
|
|
{
|
|
"epoch": 0.32059845044082286,
|
|
"grad_norm": 0.7463079690933228,
|
|
"learning_rate": 9.720406636860252e-06,
|
|
"loss": 0.2202146530151367,
|
|
"memory(GiB)": 28.88,
|
|
"step": 300,
|
|
"token_acc": 0.9305221834918271,
|
|
"train_speed(iter/s)": 0.11405
|
|
},
|
|
{
|
|
"epoch": 0.32059845044082286,
|
|
"eval_loss": 0.20838645100593567,
|
|
"eval_runtime": 39.2025,
|
|
"eval_samples_per_second": 15.407,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9288683008428231,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.32594175794816993,
|
|
"grad_norm": 0.7550280690193176,
|
|
"learning_rate": 9.711100736395758e-06,
|
|
"loss": 0.22840361595153807,
|
|
"memory(GiB)": 28.88,
|
|
"step": 305,
|
|
"token_acc": 0.9224960254372019,
|
|
"train_speed(iter/s)": 0.112523
|
|
},
|
|
{
|
|
"epoch": 0.33128506545551695,
|
|
"grad_norm": 0.8339446187019348,
|
|
"learning_rate": 9.70164709688922e-06,
|
|
"loss": 0.23481903076171876,
|
|
"memory(GiB)": 28.88,
|
|
"step": 310,
|
|
"token_acc": 0.9225257780131461,
|
|
"train_speed(iter/s)": 0.113092
|
|
},
|
|
{
|
|
"epoch": 0.336628372962864,
|
|
"grad_norm": 0.7605867385864258,
|
|
"learning_rate": 9.69204601480461e-06,
|
|
"loss": 0.20481536388397217,
|
|
"memory(GiB)": 28.88,
|
|
"step": 315,
|
|
"token_acc": 0.925222651655184,
|
|
"train_speed(iter/s)": 0.113604
|
|
},
|
|
{
|
|
"epoch": 0.34197168047021104,
|
|
"grad_norm": 0.7798689007759094,
|
|
"learning_rate": 9.682297791229668e-06,
|
|
"loss": 0.21814496517181398,
|
|
"memory(GiB)": 28.88,
|
|
"step": 320,
|
|
"token_acc": 0.9192225342713344,
|
|
"train_speed(iter/s)": 0.114156
|
|
},
|
|
{
|
|
"epoch": 0.34197168047021104,
|
|
"eval_loss": 0.20693515241146088,
|
|
"eval_runtime": 39.111,
|
|
"eval_samples_per_second": 15.443,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9290999163610628,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.3473149879775581,
|
|
"grad_norm": 0.7268094420433044,
|
|
"learning_rate": 9.67240273186646e-06,
|
|
"loss": 0.21229307651519774,
|
|
"memory(GiB)": 28.88,
|
|
"step": 325,
|
|
"token_acc": 0.9255569407555965,
|
|
"train_speed(iter/s)": 0.112688
|
|
},
|
|
{
|
|
"epoch": 0.3526582954849052,
|
|
"grad_norm": 0.8207147121429443,
|
|
"learning_rate": 9.66236114702178e-06,
|
|
"loss": 0.23093049526214598,
|
|
"memory(GiB)": 28.88,
|
|
"step": 330,
|
|
"token_acc": 0.9176510638297872,
|
|
"train_speed(iter/s)": 0.113194
|
|
},
|
|
{
|
|
"epoch": 0.3580016029922522,
|
|
"grad_norm": 0.7545695304870605,
|
|
"learning_rate": 9.652173351597435e-06,
|
|
"loss": 0.22740473747253417,
|
|
"memory(GiB)": 28.88,
|
|
"step": 335,
|
|
"token_acc": 0.9171260565968394,
|
|
"train_speed(iter/s)": 0.113721
|
|
},
|
|
{
|
|
"epoch": 0.36334491049959927,
|
|
"grad_norm": 0.7324105501174927,
|
|
"learning_rate": 9.641839665080363e-06,
|
|
"loss": 0.2098468780517578,
|
|
"memory(GiB)": 28.88,
|
|
"step": 340,
|
|
"token_acc": 0.9380478206427763,
|
|
"train_speed(iter/s)": 0.114179
|
|
},
|
|
{
|
|
"epoch": 0.36334491049959927,
|
|
"eval_loss": 0.20602919161319733,
|
|
"eval_runtime": 39.2173,
|
|
"eval_samples_per_second": 15.401,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9290613137746896,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.3686882180069463,
|
|
"grad_norm": 0.8628275394439697,
|
|
"learning_rate": 9.631360411532609e-06,
|
|
"loss": 0.20752103328704835,
|
|
"memory(GiB)": 31.36,
|
|
"step": 345,
|
|
"token_acc": 0.927929963287207,
|
|
"train_speed(iter/s)": 0.112824
|
|
},
|
|
{
|
|
"epoch": 0.37403152551429336,
|
|
"grad_norm": 0.8375836610794067,
|
|
"learning_rate": 9.620735919581168e-06,
|
|
"loss": 0.21789727210998536,
|
|
"memory(GiB)": 31.36,
|
|
"step": 350,
|
|
"token_acc": 0.9248612142427778,
|
|
"train_speed(iter/s)": 0.113287
|
|
},
|
|
{
|
|
"epoch": 0.3793748330216404,
|
|
"grad_norm": 0.7989551424980164,
|
|
"learning_rate": 9.609966522407678e-06,
|
|
"loss": 0.21387200355529784,
|
|
"memory(GiB)": 31.36,
|
|
"step": 355,
|
|
"token_acc": 0.9233053363875456,
|
|
"train_speed(iter/s)": 0.11376
|
|
},
|
|
{
|
|
"epoch": 0.38471814052898745,
|
|
"grad_norm": 0.8270795941352844,
|
|
"learning_rate": 9.599052557737973e-06,
|
|
"loss": 0.2108246088027954,
|
|
"memory(GiB)": 31.36,
|
|
"step": 360,
|
|
"token_acc": 0.9243235094595613,
|
|
"train_speed(iter/s)": 0.114225
|
|
},
|
|
{
|
|
"epoch": 0.38471814052898745,
|
|
"eval_loss": 0.2033926397562027,
|
|
"eval_runtime": 39.2398,
|
|
"eval_samples_per_second": 15.393,
|
|
"eval_steps_per_second": 3.848,
|
|
"eval_token_acc": 0.9299148598511656,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.39006144803633447,
|
|
"grad_norm": 0.7160355448722839,
|
|
"learning_rate": 9.58799436783149e-06,
|
|
"loss": 0.19975266456604004,
|
|
"memory(GiB)": 31.36,
|
|
"step": 365,
|
|
"token_acc": 0.9291021882882269,
|
|
"train_speed(iter/s)": 0.112953
|
|
},
|
|
{
|
|
"epoch": 0.39540475554368154,
|
|
"grad_norm": 0.7318681478500366,
|
|
"learning_rate": 9.576792299470537e-06,
|
|
"loss": 0.2046557903289795,
|
|
"memory(GiB)": 31.36,
|
|
"step": 370,
|
|
"token_acc": 0.9262238367064521,
|
|
"train_speed(iter/s)": 0.113418
|
|
},
|
|
{
|
|
"epoch": 0.4007480630510286,
|
|
"grad_norm": 0.741698682308197,
|
|
"learning_rate": 9.565446703949417e-06,
|
|
"loss": 0.20282254219055176,
|
|
"memory(GiB)": 31.36,
|
|
"step": 375,
|
|
"token_acc": 0.9226779000139257,
|
|
"train_speed(iter/s)": 0.113823
|
|
},
|
|
{
|
|
"epoch": 0.40609137055837563,
|
|
"grad_norm": 0.8014733791351318,
|
|
"learning_rate": 9.55395793706341e-06,
|
|
"loss": 0.20639050006866455,
|
|
"memory(GiB)": 31.36,
|
|
"step": 380,
|
|
"token_acc": 0.9284898178641038,
|
|
"train_speed(iter/s)": 0.11425
|
|
},
|
|
{
|
|
"epoch": 0.40609137055837563,
|
|
"eval_loss": 0.20332778990268707,
|
|
"eval_runtime": 39.1548,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9302279697184156,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.4114346780657227,
|
|
"grad_norm": 0.7752350568771362,
|
|
"learning_rate": 9.542326359097619e-06,
|
|
"loss": 0.21643948554992676,
|
|
"memory(GiB)": 31.36,
|
|
"step": 385,
|
|
"token_acc": 0.9229331980070123,
|
|
"train_speed(iter/s)": 0.113068
|
|
},
|
|
{
|
|
"epoch": 0.4167779855730697,
|
|
"grad_norm": 0.7383193373680115,
|
|
"learning_rate": 9.530552334815672e-06,
|
|
"loss": 0.23035106658935547,
|
|
"memory(GiB)": 31.36,
|
|
"step": 390,
|
|
"token_acc": 0.9227077075527204,
|
|
"train_speed(iter/s)": 0.113539
|
|
},
|
|
{
|
|
"epoch": 0.4221212930804168,
|
|
"grad_norm": 0.826757550239563,
|
|
"learning_rate": 9.518636233448276e-06,
|
|
"loss": 0.2027421474456787,
|
|
"memory(GiB)": 31.36,
|
|
"step": 395,
|
|
"token_acc": 0.9212801395939086,
|
|
"train_speed(iter/s)": 0.113879
|
|
},
|
|
{
|
|
"epoch": 0.4274646005877638,
|
|
"grad_norm": 0.8301543593406677,
|
|
"learning_rate": 9.506578428681648e-06,
|
|
"loss": 0.2126997232437134,
|
|
"memory(GiB)": 31.36,
|
|
"step": 400,
|
|
"token_acc": 0.9254076176645323,
|
|
"train_speed(iter/s)": 0.114274
|
|
},
|
|
{
|
|
"epoch": 0.4274646005877638,
|
|
"eval_loss": 0.20262883603572845,
|
|
"eval_runtime": 39.1295,
|
|
"eval_samples_per_second": 15.436,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.930107872783032,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.4328079080951109,
|
|
"grad_norm": 0.7817525863647461,
|
|
"learning_rate": 9.494379298645788e-06,
|
|
"loss": 0.20061790943145752,
|
|
"memory(GiB)": 31.36,
|
|
"step": 405,
|
|
"token_acc": 0.9269484626175747,
|
|
"train_speed(iter/s)": 0.113056
|
|
},
|
|
{
|
|
"epoch": 0.4381512156024579,
|
|
"grad_norm": 0.7769330143928528,
|
|
"learning_rate": 9.482039225902623e-06,
|
|
"loss": 0.20687649250030518,
|
|
"memory(GiB)": 31.36,
|
|
"step": 410,
|
|
"token_acc": 0.9282657499649222,
|
|
"train_speed(iter/s)": 0.113426
|
|
},
|
|
{
|
|
"epoch": 0.443494523109805,
|
|
"grad_norm": 0.7487837076187134,
|
|
"learning_rate": 9.469558597434018e-06,
|
|
"loss": 0.20723772048950195,
|
|
"memory(GiB)": 31.36,
|
|
"step": 415,
|
|
"token_acc": 0.9278717406624384,
|
|
"train_speed(iter/s)": 0.113801
|
|
},
|
|
{
|
|
"epoch": 0.448837830617152,
|
|
"grad_norm": 0.7757744789123535,
|
|
"learning_rate": 9.456937804629623e-06,
|
|
"loss": 0.2050936698913574,
|
|
"memory(GiB)": 31.36,
|
|
"step": 420,
|
|
"token_acc": 0.9330446275414077,
|
|
"train_speed(iter/s)": 0.114158
|
|
},
|
|
{
|
|
"epoch": 0.448837830617152,
|
|
"eval_loss": 0.2025863230228424,
|
|
"eval_runtime": 39.128,
|
|
"eval_samples_per_second": 15.437,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9305839713483025,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.45418113812449906,
|
|
"grad_norm": 0.7080808877944946,
|
|
"learning_rate": 9.444177243274619e-06,
|
|
"loss": 0.1936201810836792,
|
|
"memory(GiB)": 31.36,
|
|
"step": 425,
|
|
"token_acc": 0.9299704947798456,
|
|
"train_speed(iter/s)": 0.113084
|
|
},
|
|
{
|
|
"epoch": 0.45952444563184613,
|
|
"grad_norm": 0.647037148475647,
|
|
"learning_rate": 9.43127731353729e-06,
|
|
"loss": 0.1949155330657959,
|
|
"memory(GiB)": 31.36,
|
|
"step": 430,
|
|
"token_acc": 0.9309410335482665,
|
|
"train_speed(iter/s)": 0.113375
|
|
},
|
|
{
|
|
"epoch": 0.46486775313919315,
|
|
"grad_norm": 0.8139774203300476,
|
|
"learning_rate": 9.418238419956484e-06,
|
|
"loss": 0.21164326667785643,
|
|
"memory(GiB)": 31.36,
|
|
"step": 435,
|
|
"token_acc": 0.926740549360892,
|
|
"train_speed(iter/s)": 0.113761
|
|
},
|
|
{
|
|
"epoch": 0.4702110606465402,
|
|
"grad_norm": 0.7926304340362549,
|
|
"learning_rate": 9.405060971428924e-06,
|
|
"loss": 0.21458048820495607,
|
|
"memory(GiB)": 31.36,
|
|
"step": 440,
|
|
"token_acc": 0.9295489624421197,
|
|
"train_speed(iter/s)": 0.11413
|
|
},
|
|
{
|
|
"epoch": 0.4702110606465402,
|
|
"eval_loss": 0.20057949423789978,
|
|
"eval_runtime": 39.1388,
|
|
"eval_samples_per_second": 15.432,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9312059019065388,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.47555436815388724,
|
|
"grad_norm": 0.7835922837257385,
|
|
"learning_rate": 9.391745381196382e-06,
|
|
"loss": 0.20983607769012452,
|
|
"memory(GiB)": 31.36,
|
|
"step": 445,
|
|
"token_acc": 0.9275723362435434,
|
|
"train_speed(iter/s)": 0.113097
|
|
},
|
|
{
|
|
"epoch": 0.4808976756612343,
|
|
"grad_norm": 0.7004543542861938,
|
|
"learning_rate": 9.378292066832723e-06,
|
|
"loss": 0.21295685768127443,
|
|
"memory(GiB)": 31.36,
|
|
"step": 450,
|
|
"token_acc": 0.9304546811262561,
|
|
"train_speed(iter/s)": 0.113495
|
|
},
|
|
{
|
|
"epoch": 0.48624098316858133,
|
|
"grad_norm": 0.7315278053283691,
|
|
"learning_rate": 9.364701450230813e-06,
|
|
"loss": 0.19759070873260498,
|
|
"memory(GiB)": 31.36,
|
|
"step": 455,
|
|
"token_acc": 0.9323673009514272,
|
|
"train_speed(iter/s)": 0.113833
|
|
},
|
|
{
|
|
"epoch": 0.4915842906759284,
|
|
"grad_norm": 0.8716449737548828,
|
|
"learning_rate": 9.350973957589278e-06,
|
|
"loss": 0.21045897006988526,
|
|
"memory(GiB)": 31.36,
|
|
"step": 460,
|
|
"token_acc": 0.928,
|
|
"train_speed(iter/s)": 0.114182
|
|
},
|
|
{
|
|
"epoch": 0.4915842906759284,
|
|
"eval_loss": 0.20007076859474182,
|
|
"eval_runtime": 39.0359,
|
|
"eval_samples_per_second": 15.473,
|
|
"eval_steps_per_second": 3.868,
|
|
"eval_token_acc": 0.9313774689570868,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.4969275981832754,
|
|
"grad_norm": 0.8024523258209229,
|
|
"learning_rate": 9.33711001939915e-06,
|
|
"loss": 0.22184643745422364,
|
|
"memory(GiB)": 31.36,
|
|
"step": 465,
|
|
"token_acc": 0.9268073439852581,
|
|
"train_speed(iter/s)": 0.113198
|
|
},
|
|
{
|
|
"epoch": 0.5022709056906225,
|
|
"grad_norm": 0.7982778549194336,
|
|
"learning_rate": 9.32311007043036e-06,
|
|
"loss": 0.2020054817199707,
|
|
"memory(GiB)": 31.36,
|
|
"step": 470,
|
|
"token_acc": 0.9326877815883263,
|
|
"train_speed(iter/s)": 0.113499
|
|
},
|
|
{
|
|
"epoch": 0.5076142131979695,
|
|
"grad_norm": 0.7509546279907227,
|
|
"learning_rate": 9.30897454971811e-06,
|
|
"loss": 0.20111818313598634,
|
|
"memory(GiB)": 31.36,
|
|
"step": 475,
|
|
"token_acc": 0.9371612406895828,
|
|
"train_speed(iter/s)": 0.113839
|
|
},
|
|
{
|
|
"epoch": 0.5129575207053166,
|
|
"grad_norm": 0.7862565517425537,
|
|
"learning_rate": 9.294703900549096e-06,
|
|
"loss": 0.20246634483337403,
|
|
"memory(GiB)": 31.36,
|
|
"step": 480,
|
|
"token_acc": 0.9213386062277757,
|
|
"train_speed(iter/s)": 0.114112
|
|
},
|
|
{
|
|
"epoch": 0.5129575207053166,
|
|
"eval_loss": 0.19960449635982513,
|
|
"eval_runtime": 39.1156,
|
|
"eval_samples_per_second": 15.441,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9314846983636793,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.5183008282126637,
|
|
"grad_norm": 0.7772889733314514,
|
|
"learning_rate": 9.280298570447612e-06,
|
|
"loss": 0.20028786659240722,
|
|
"memory(GiB)": 31.36,
|
|
"step": 485,
|
|
"token_acc": 0.9264812360063717,
|
|
"train_speed(iter/s)": 0.113159
|
|
},
|
|
{
|
|
"epoch": 0.5236441357200107,
|
|
"grad_norm": 0.6927475929260254,
|
|
"learning_rate": 9.265759011161519e-06,
|
|
"loss": 0.19862596988677977,
|
|
"memory(GiB)": 31.36,
|
|
"step": 490,
|
|
"token_acc": 0.9293699036323202,
|
|
"train_speed(iter/s)": 0.113461
|
|
},
|
|
{
|
|
"epoch": 0.5289874432273577,
|
|
"grad_norm": 0.7092183828353882,
|
|
"learning_rate": 9.251085678648072e-06,
|
|
"loss": 0.20681967735290527,
|
|
"memory(GiB)": 31.36,
|
|
"step": 495,
|
|
"token_acc": 0.9285693844260653,
|
|
"train_speed(iter/s)": 0.113757
|
|
},
|
|
{
|
|
"epoch": 0.5343307507347048,
|
|
"grad_norm": 0.7293869853019714,
|
|
"learning_rate": 9.236279033059622e-06,
|
|
"loss": 0.21075177192687988,
|
|
"memory(GiB)": 31.36,
|
|
"step": 500,
|
|
"token_acc": 0.9218673562093788,
|
|
"train_speed(iter/s)": 0.114071
|
|
},
|
|
{
|
|
"epoch": 0.5343307507347048,
|
|
"eval_loss": 0.1984926164150238,
|
|
"eval_runtime": 39.2663,
|
|
"eval_samples_per_second": 15.382,
|
|
"eval_steps_per_second": 3.846,
|
|
"eval_token_acc": 0.9314375174247785,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.5396740582420518,
|
|
"grad_norm": 0.8831562995910645,
|
|
"learning_rate": 9.221339538729191e-06,
|
|
"loss": 0.21746454238891602,
|
|
"memory(GiB)": 31.36,
|
|
"step": 505,
|
|
"token_acc": 0.9285247936980231,
|
|
"train_speed(iter/s)": 0.113113
|
|
},
|
|
{
|
|
"epoch": 0.5450173657493989,
|
|
"grad_norm": 0.7981932163238525,
|
|
"learning_rate": 9.206267664155906e-06,
|
|
"loss": 0.2134779691696167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 510,
|
|
"token_acc": 0.9249437382154371,
|
|
"train_speed(iter/s)": 0.113434
|
|
},
|
|
{
|
|
"epoch": 0.550360673256746,
|
|
"grad_norm": 0.7087724208831787,
|
|
"learning_rate": 9.191063881990308e-06,
|
|
"loss": 0.2046900749206543,
|
|
"memory(GiB)": 31.36,
|
|
"step": 515,
|
|
"token_acc": 0.9302552606840697,
|
|
"train_speed(iter/s)": 0.113724
|
|
},
|
|
{
|
|
"epoch": 0.555703980764093,
|
|
"grad_norm": 0.7404990196228027,
|
|
"learning_rate": 9.17572866901953e-06,
|
|
"loss": 0.2034606456756592,
|
|
"memory(GiB)": 31.36,
|
|
"step": 520,
|
|
"token_acc": 0.9364186566673245,
|
|
"train_speed(iter/s)": 0.113991
|
|
},
|
|
{
|
|
"epoch": 0.555703980764093,
|
|
"eval_loss": 0.19815480709075928,
|
|
"eval_runtime": 39.1389,
|
|
"eval_samples_per_second": 15.432,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9321194964507067,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.56104728827144,
|
|
"grad_norm": 0.7702794671058655,
|
|
"learning_rate": 9.160262506152343e-06,
|
|
"loss": 0.20876009464263917,
|
|
"memory(GiB)": 31.36,
|
|
"step": 525,
|
|
"token_acc": 0.9260557562023464,
|
|
"train_speed(iter/s)": 0.113137
|
|
},
|
|
{
|
|
"epoch": 0.566390595778787,
|
|
"grad_norm": 0.7707433104515076,
|
|
"learning_rate": 9.14466587840408e-06,
|
|
"loss": 0.2181908369064331,
|
|
"memory(GiB)": 31.36,
|
|
"step": 530,
|
|
"token_acc": 0.9201792801423293,
|
|
"train_speed(iter/s)": 0.113398
|
|
},
|
|
{
|
|
"epoch": 0.5717339032861342,
|
|
"grad_norm": 0.7392153143882751,
|
|
"learning_rate": 9.12893927488142e-06,
|
|
"loss": 0.2074495792388916,
|
|
"memory(GiB)": 31.36,
|
|
"step": 535,
|
|
"token_acc": 0.9264089572777882,
|
|
"train_speed(iter/s)": 0.113707
|
|
},
|
|
{
|
|
"epoch": 0.5770772107934812,
|
|
"grad_norm": 0.6916921138763428,
|
|
"learning_rate": 9.113083188767057e-06,
|
|
"loss": 0.192901611328125,
|
|
"memory(GiB)": 31.36,
|
|
"step": 540,
|
|
"token_acc": 0.9276745841693095,
|
|
"train_speed(iter/s)": 0.114011
|
|
},
|
|
{
|
|
"epoch": 0.5770772107934812,
|
|
"eval_loss": 0.1964673101902008,
|
|
"eval_runtime": 39.2785,
|
|
"eval_samples_per_second": 15.377,
|
|
"eval_steps_per_second": 3.844,
|
|
"eval_token_acc": 0.9323039310300457,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.5824205183008282,
|
|
"grad_norm": 0.7404196262359619,
|
|
"learning_rate": 9.097098117304223e-06,
|
|
"loss": 0.19912216663360596,
|
|
"memory(GiB)": 31.36,
|
|
"step": 545,
|
|
"token_acc": 0.9267058907942886,
|
|
"train_speed(iter/s)": 0.113126
|
|
},
|
|
{
|
|
"epoch": 0.5877638258081752,
|
|
"grad_norm": 0.8616334795951843,
|
|
"learning_rate": 9.08098456178111e-06,
|
|
"loss": 0.22115416526794435,
|
|
"memory(GiB)": 31.36,
|
|
"step": 550,
|
|
"token_acc": 0.9247433468141067,
|
|
"train_speed(iter/s)": 0.113428
|
|
},
|
|
{
|
|
"epoch": 0.5931071333155223,
|
|
"grad_norm": 0.6713617444038391,
|
|
"learning_rate": 9.064743027515127e-06,
|
|
"loss": 0.20437276363372803,
|
|
"memory(GiB)": 31.36,
|
|
"step": 555,
|
|
"token_acc": 0.9308613364233295,
|
|
"train_speed(iter/s)": 0.113724
|
|
},
|
|
{
|
|
"epoch": 0.5984504408228694,
|
|
"grad_norm": 0.7443351745605469,
|
|
"learning_rate": 9.048374023837086e-06,
|
|
"loss": 0.21196300983428956,
|
|
"memory(GiB)": 31.36,
|
|
"step": 560,
|
|
"token_acc": 0.9288690476190476,
|
|
"train_speed(iter/s)": 0.113984
|
|
},
|
|
{
|
|
"epoch": 0.5984504408228694,
|
|
"eval_loss": 0.1953776627779007,
|
|
"eval_runtime": 39.1331,
|
|
"eval_samples_per_second": 15.435,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9320208453966415,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.6037937483302164,
|
|
"grad_norm": 0.7687556147575378,
|
|
"learning_rate": 9.03187806407519e-06,
|
|
"loss": 0.20800457000732422,
|
|
"memory(GiB)": 31.36,
|
|
"step": 565,
|
|
"token_acc": 0.9276673510367078,
|
|
"train_speed(iter/s)": 0.113135
|
|
},
|
|
{
|
|
"epoch": 0.6091370558375635,
|
|
"grad_norm": 0.7980899214744568,
|
|
"learning_rate": 9.015255665538972e-06,
|
|
"loss": 0.2199338912963867,
|
|
"memory(GiB)": 31.36,
|
|
"step": 570,
|
|
"token_acc": 0.9232478287214704,
|
|
"train_speed(iter/s)": 0.113444
|
|
},
|
|
{
|
|
"epoch": 0.6144803633449105,
|
|
"grad_norm": 0.7510707974433899,
|
|
"learning_rate": 8.998507349503048e-06,
|
|
"loss": 0.19736554622650146,
|
|
"memory(GiB)": 31.36,
|
|
"step": 575,
|
|
"token_acc": 0.9285859751544362,
|
|
"train_speed(iter/s)": 0.113724
|
|
},
|
|
{
|
|
"epoch": 0.6198236708522575,
|
|
"grad_norm": 0.8514485955238342,
|
|
"learning_rate": 8.981633641190779e-06,
|
|
"loss": 0.21196255683898926,
|
|
"memory(GiB)": 31.36,
|
|
"step": 580,
|
|
"token_acc": 0.9259536976286438,
|
|
"train_speed(iter/s)": 0.113995
|
|
},
|
|
{
|
|
"epoch": 0.6198236708522575,
|
|
"eval_loss": 0.19566793739795685,
|
|
"eval_runtime": 39.1561,
|
|
"eval_samples_per_second": 15.425,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9323511119689464,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.6251669783596046,
|
|
"grad_norm": 0.7959375381469727,
|
|
"learning_rate": 8.964635069757803e-06,
|
|
"loss": 0.20478439331054688,
|
|
"memory(GiB)": 31.36,
|
|
"step": 585,
|
|
"token_acc": 0.9299914280010803,
|
|
"train_speed(iter/s)": 0.113198
|
|
},
|
|
{
|
|
"epoch": 0.6305102858669517,
|
|
"grad_norm": 0.6840202808380127,
|
|
"learning_rate": 8.94751216827543e-06,
|
|
"loss": 0.20479016304016112,
|
|
"memory(GiB)": 31.36,
|
|
"step": 590,
|
|
"token_acc": 0.930996839988509,
|
|
"train_speed(iter/s)": 0.113473
|
|
},
|
|
{
|
|
"epoch": 0.6358535933742987,
|
|
"grad_norm": 0.7271625399589539,
|
|
"learning_rate": 8.930265473713939e-06,
|
|
"loss": 0.18621822595596313,
|
|
"memory(GiB)": 31.36,
|
|
"step": 595,
|
|
"token_acc": 0.9302919832373684,
|
|
"train_speed(iter/s)": 0.113714
|
|
},
|
|
{
|
|
"epoch": 0.6411969008816457,
|
|
"grad_norm": 0.7664585113525391,
|
|
"learning_rate": 8.912895526925726e-06,
|
|
"loss": 0.20015628337860109,
|
|
"memory(GiB)": 31.36,
|
|
"step": 600,
|
|
"token_acc": 0.9368948705623841,
|
|
"train_speed(iter/s)": 0.113966
|
|
},
|
|
{
|
|
"epoch": 0.6411969008816457,
|
|
"eval_loss": 0.19422343373298645,
|
|
"eval_runtime": 39.1797,
|
|
"eval_samples_per_second": 15.416,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9324712089043299,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.6465402083889927,
|
|
"grad_norm": 0.7383084297180176,
|
|
"learning_rate": 8.895402872628352e-06,
|
|
"loss": 0.20294442176818847,
|
|
"memory(GiB)": 31.36,
|
|
"step": 605,
|
|
"token_acc": 0.9330581241743725,
|
|
"train_speed(iter/s)": 0.113194
|
|
},
|
|
{
|
|
"epoch": 0.6518835158963399,
|
|
"grad_norm": 0.7372247576713562,
|
|
"learning_rate": 8.87778805938746e-06,
|
|
"loss": 0.20596041679382324,
|
|
"memory(GiB)": 31.36,
|
|
"step": 610,
|
|
"token_acc": 0.9245759105003952,
|
|
"train_speed(iter/s)": 0.11346
|
|
},
|
|
{
|
|
"epoch": 0.6572268234036869,
|
|
"grad_norm": 0.7220420241355896,
|
|
"learning_rate": 8.86005163959956e-06,
|
|
"loss": 0.20566608905792236,
|
|
"memory(GiB)": 31.36,
|
|
"step": 615,
|
|
"token_acc": 0.9246050420168067,
|
|
"train_speed(iter/s)": 0.113732
|
|
},
|
|
{
|
|
"epoch": 0.6625701309110339,
|
|
"grad_norm": 0.687303900718689,
|
|
"learning_rate": 8.842194169474727e-06,
|
|
"loss": 0.20316667556762696,
|
|
"memory(GiB)": 31.36,
|
|
"step": 620,
|
|
"token_acc": 0.929285897482871,
|
|
"train_speed(iter/s)": 0.114
|
|
},
|
|
{
|
|
"epoch": 0.6625701309110339,
|
|
"eval_loss": 0.19414789974689484,
|
|
"eval_runtime": 39.2021,
|
|
"eval_samples_per_second": 15.407,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9324969439619121,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.667913438418381,
|
|
"grad_norm": 0.7889045476913452,
|
|
"learning_rate": 8.824216209019139e-06,
|
|
"loss": 0.2020263195037842,
|
|
"memory(GiB)": 31.36,
|
|
"step": 625,
|
|
"token_acc": 0.9281722400366468,
|
|
"train_speed(iter/s)": 0.113212
|
|
},
|
|
{
|
|
"epoch": 0.673256745925728,
|
|
"grad_norm": 0.7296798229217529,
|
|
"learning_rate": 8.806118322017525e-06,
|
|
"loss": 0.19929354190826415,
|
|
"memory(GiB)": 31.36,
|
|
"step": 630,
|
|
"token_acc": 0.9315762957365074,
|
|
"train_speed(iter/s)": 0.113443
|
|
},
|
|
{
|
|
"epoch": 0.6786000534330751,
|
|
"grad_norm": 0.7287123799324036,
|
|
"learning_rate": 8.787901076015487e-06,
|
|
"loss": 0.20486984252929688,
|
|
"memory(GiB)": 31.36,
|
|
"step": 635,
|
|
"token_acc": 0.9216136550589525,
|
|
"train_speed(iter/s)": 0.113669
|
|
},
|
|
{
|
|
"epoch": 0.6839433609404221,
|
|
"grad_norm": 0.7702438831329346,
|
|
"learning_rate": 8.769565042301692e-06,
|
|
"loss": 0.21484546661376952,
|
|
"memory(GiB)": 31.36,
|
|
"step": 640,
|
|
"token_acc": 0.9197191255248299,
|
|
"train_speed(iter/s)": 0.113928
|
|
},
|
|
{
|
|
"epoch": 0.6839433609404221,
|
|
"eval_loss": 0.19368213415145874,
|
|
"eval_runtime": 39.232,
|
|
"eval_samples_per_second": 15.396,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9326599326599326,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.6892866684477692,
|
|
"grad_norm": 0.7215872406959534,
|
|
"learning_rate": 8.751110795889966e-06,
|
|
"loss": 0.20535902976989745,
|
|
"memory(GiB)": 31.36,
|
|
"step": 645,
|
|
"token_acc": 0.9300116508547923,
|
|
"train_speed(iter/s)": 0.113175
|
|
},
|
|
{
|
|
"epoch": 0.6946299759551162,
|
|
"grad_norm": 0.7084954380989075,
|
|
"learning_rate": 8.732538915501257e-06,
|
|
"loss": 0.19380364418029786,
|
|
"memory(GiB)": 31.36,
|
|
"step": 650,
|
|
"token_acc": 0.933890160921023,
|
|
"train_speed(iter/s)": 0.113399
|
|
},
|
|
{
|
|
"epoch": 0.6999732834624632,
|
|
"grad_norm": 0.7394425272941589,
|
|
"learning_rate": 8.71384998354549e-06,
|
|
"loss": 0.19607152938842773,
|
|
"memory(GiB)": 31.36,
|
|
"step": 655,
|
|
"token_acc": 0.9408127208480566,
|
|
"train_speed(iter/s)": 0.113636
|
|
},
|
|
{
|
|
"epoch": 0.7053165909698104,
|
|
"grad_norm": 0.6766705513000488,
|
|
"learning_rate": 8.695044586103297e-06,
|
|
"loss": 0.19869532585144042,
|
|
"memory(GiB)": 31.36,
|
|
"step": 660,
|
|
"token_acc": 0.9315438606795688,
|
|
"train_speed(iter/s)": 0.113856
|
|
},
|
|
{
|
|
"epoch": 0.7053165909698104,
|
|
"eval_loss": 0.19304147362709045,
|
|
"eval_runtime": 39.1551,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9326256192498231,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.7106598984771574,
|
|
"grad_norm": 0.8370521664619446,
|
|
"learning_rate": 8.676123312907641e-06,
|
|
"loss": 0.20741744041442872,
|
|
"memory(GiB)": 31.36,
|
|
"step": 665,
|
|
"token_acc": 0.9293741677762982,
|
|
"train_speed(iter/s)": 0.113158
|
|
},
|
|
{
|
|
"epoch": 0.7160032059845044,
|
|
"grad_norm": 0.6481726765632629,
|
|
"learning_rate": 8.657086757325328e-06,
|
|
"loss": 0.19325138330459596,
|
|
"memory(GiB)": 31.36,
|
|
"step": 670,
|
|
"token_acc": 0.9333951984673412,
|
|
"train_speed(iter/s)": 0.113374
|
|
},
|
|
{
|
|
"epoch": 0.7213465134918514,
|
|
"grad_norm": 0.7585648894309998,
|
|
"learning_rate": 8.637935516338384e-06,
|
|
"loss": 0.20734424591064454,
|
|
"memory(GiB)": 31.36,
|
|
"step": 675,
|
|
"token_acc": 0.929390127754606,
|
|
"train_speed(iter/s)": 0.113613
|
|
},
|
|
{
|
|
"epoch": 0.7266898209991985,
|
|
"grad_norm": 0.6115338206291199,
|
|
"learning_rate": 8.61867019052535e-06,
|
|
"loss": 0.1957784414291382,
|
|
"memory(GiB)": 31.36,
|
|
"step": 680,
|
|
"token_acc": 0.9327852845758922,
|
|
"train_speed(iter/s)": 0.113851
|
|
},
|
|
{
|
|
"epoch": 0.7266898209991985,
|
|
"eval_loss": 0.1919844150543213,
|
|
"eval_runtime": 39.1768,
|
|
"eval_samples_per_second": 15.417,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9334705869737717,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.7320331285065456,
|
|
"grad_norm": 0.763317883014679,
|
|
"learning_rate": 8.599291384042442e-06,
|
|
"loss": 0.1990307092666626,
|
|
"memory(GiB)": 31.36,
|
|
"step": 685,
|
|
"token_acc": 0.9309296501645834,
|
|
"train_speed(iter/s)": 0.113214
|
|
},
|
|
{
|
|
"epoch": 0.7373764360138926,
|
|
"grad_norm": 0.6819867491722107,
|
|
"learning_rate": 8.579799704604597e-06,
|
|
"loss": 0.19830925464630128,
|
|
"memory(GiB)": 31.36,
|
|
"step": 690,
|
|
"token_acc": 0.9335458059266292,
|
|
"train_speed(iter/s)": 0.113398
|
|
},
|
|
{
|
|
"epoch": 0.7427197435212396,
|
|
"grad_norm": 0.7253126502037048,
|
|
"learning_rate": 8.560195763466428e-06,
|
|
"loss": 0.20094099044799804,
|
|
"memory(GiB)": 31.36,
|
|
"step": 695,
|
|
"token_acc": 0.9241353978300181,
|
|
"train_speed(iter/s)": 0.113667
|
|
},
|
|
{
|
|
"epoch": 0.7480630510285867,
|
|
"grad_norm": 0.75684654712677,
|
|
"learning_rate": 8.540480175403045e-06,
|
|
"loss": 0.20439500808715821,
|
|
"memory(GiB)": 31.36,
|
|
"step": 700,
|
|
"token_acc": 0.9336929261002508,
|
|
"train_speed(iter/s)": 0.113873
|
|
},
|
|
{
|
|
"epoch": 0.7480630510285867,
|
|
"eval_loss": 0.19040465354919434,
|
|
"eval_runtime": 39.1788,
|
|
"eval_samples_per_second": 15.416,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9335263462651998,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.7534063585359337,
|
|
"grad_norm": 0.695967972278595,
|
|
"learning_rate": 8.520653558690785e-06,
|
|
"loss": 0.2008056879043579,
|
|
"memory(GiB)": 31.36,
|
|
"step": 705,
|
|
"token_acc": 0.9287990912399502,
|
|
"train_speed(iter/s)": 0.113225
|
|
},
|
|
{
|
|
"epoch": 0.7587496660432808,
|
|
"grad_norm": 0.7082876563072205,
|
|
"learning_rate": 8.500716535087815e-06,
|
|
"loss": 0.19893609285354613,
|
|
"memory(GiB)": 31.36,
|
|
"step": 710,
|
|
"token_acc": 0.9335433493079527,
|
|
"train_speed(iter/s)": 0.113434
|
|
},
|
|
{
|
|
"epoch": 0.7640929735506279,
|
|
"grad_norm": 0.6951336860656738,
|
|
"learning_rate": 8.480669729814635e-06,
|
|
"loss": 0.20382363796234132,
|
|
"memory(GiB)": 31.36,
|
|
"step": 715,
|
|
"token_acc": 0.9272279597838725,
|
|
"train_speed(iter/s)": 0.113684
|
|
},
|
|
{
|
|
"epoch": 0.7694362810579749,
|
|
"grad_norm": 0.690929651260376,
|
|
"learning_rate": 8.460513771534475e-06,
|
|
"loss": 0.20613670349121094,
|
|
"memory(GiB)": 31.36,
|
|
"step": 720,
|
|
"token_acc": 0.9379507848960543,
|
|
"train_speed(iter/s)": 0.113902
|
|
},
|
|
{
|
|
"epoch": 0.7694362810579749,
|
|
"eval_loss": 0.19106899201869965,
|
|
"eval_runtime": 39.2423,
|
|
"eval_samples_per_second": 15.392,
|
|
"eval_steps_per_second": 3.848,
|
|
"eval_token_acc": 0.9331446095777306,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.7747795885653219,
|
|
"grad_norm": 0.6783177256584167,
|
|
"learning_rate": 8.440249292333583e-06,
|
|
"loss": 0.1977448582649231,
|
|
"memory(GiB)": 31.36,
|
|
"step": 725,
|
|
"token_acc": 0.9308449330614417,
|
|
"train_speed(iter/s)": 0.11326
|
|
},
|
|
{
|
|
"epoch": 0.7801228960726689,
|
|
"grad_norm": 0.6677674055099487,
|
|
"learning_rate": 8.41987692770139e-06,
|
|
"loss": 0.21048130989074706,
|
|
"memory(GiB)": 31.36,
|
|
"step": 730,
|
|
"token_acc": 0.9202530400865319,
|
|
"train_speed(iter/s)": 0.113463
|
|
},
|
|
{
|
|
"epoch": 0.7854662035800161,
|
|
"grad_norm": 0.8552298545837402,
|
|
"learning_rate": 8.399397316510596e-06,
|
|
"loss": 0.20974290370941162,
|
|
"memory(GiB)": 31.36,
|
|
"step": 735,
|
|
"token_acc": 0.9332773814519513,
|
|
"train_speed(iter/s)": 0.113682
|
|
},
|
|
{
|
|
"epoch": 0.7908095110873631,
|
|
"grad_norm": 0.7160708904266357,
|
|
"learning_rate": 8.378811100997122e-06,
|
|
"loss": 0.20636558532714844,
|
|
"memory(GiB)": 31.36,
|
|
"step": 740,
|
|
"token_acc": 0.9329161099782013,
|
|
"train_speed(iter/s)": 0.113884
|
|
},
|
|
{
|
|
"epoch": 0.7908095110873631,
|
|
"eval_loss": 0.19004826247692108,
|
|
"eval_runtime": 39.1971,
|
|
"eval_samples_per_second": 15.409,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9335435029702546,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.7961528185947101,
|
|
"grad_norm": 0.7204393148422241,
|
|
"learning_rate": 8.358118926739984e-06,
|
|
"loss": 0.20534658432006836,
|
|
"memory(GiB)": 31.36,
|
|
"step": 745,
|
|
"token_acc": 0.9301107044622764,
|
|
"train_speed(iter/s)": 0.113248
|
|
},
|
|
{
|
|
"epoch": 0.8014961261020572,
|
|
"grad_norm": 0.7450360059738159,
|
|
"learning_rate": 8.337321442641036e-06,
|
|
"loss": 0.20312881469726562,
|
|
"memory(GiB)": 31.36,
|
|
"step": 750,
|
|
"token_acc": 0.9307398246008545,
|
|
"train_speed(iter/s)": 0.113436
|
|
},
|
|
{
|
|
"epoch": 0.8068394336094042,
|
|
"grad_norm": 0.7013337016105652,
|
|
"learning_rate": 8.316419300904622e-06,
|
|
"loss": 0.20806705951690674,
|
|
"memory(GiB)": 31.36,
|
|
"step": 755,
|
|
"token_acc": 0.9265781630994311,
|
|
"train_speed(iter/s)": 0.113667
|
|
},
|
|
{
|
|
"epoch": 0.8121827411167513,
|
|
"grad_norm": 0.7928115129470825,
|
|
"learning_rate": 8.295413157017127e-06,
|
|
"loss": 0.20586895942687988,
|
|
"memory(GiB)": 31.36,
|
|
"step": 760,
|
|
"token_acc": 0.9280571524250711,
|
|
"train_speed(iter/s)": 0.113891
|
|
},
|
|
{
|
|
"epoch": 0.8121827411167513,
|
|
"eval_loss": 0.1891859769821167,
|
|
"eval_runtime": 39.1442,
|
|
"eval_samples_per_second": 15.43,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9337751184884943,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.8175260486240983,
|
|
"grad_norm": 0.7362833023071289,
|
|
"learning_rate": 8.274303669726427e-06,
|
|
"loss": 0.21117730140686036,
|
|
"memory(GiB)": 31.36,
|
|
"step": 765,
|
|
"token_acc": 0.9295638706201881,
|
|
"train_speed(iter/s)": 0.113261
|
|
},
|
|
{
|
|
"epoch": 0.8228693561314454,
|
|
"grad_norm": 0.8272237777709961,
|
|
"learning_rate": 8.25309150102121e-06,
|
|
"loss": 0.20895204544067383,
|
|
"memory(GiB)": 31.36,
|
|
"step": 770,
|
|
"token_acc": 0.925374677002584,
|
|
"train_speed(iter/s)": 0.113483
|
|
},
|
|
{
|
|
"epoch": 0.8282126636387924,
|
|
"grad_norm": 0.7175537943840027,
|
|
"learning_rate": 8.231777316110245e-06,
|
|
"loss": 0.1944166898727417,
|
|
"memory(GiB)": 31.36,
|
|
"step": 775,
|
|
"token_acc": 0.9321589002543703,
|
|
"train_speed(iter/s)": 0.11366
|
|
},
|
|
{
|
|
"epoch": 0.8335559711461394,
|
|
"grad_norm": 0.7809334993362427,
|
|
"learning_rate": 8.210361783401491e-06,
|
|
"loss": 0.19996525049209596,
|
|
"memory(GiB)": 31.36,
|
|
"step": 780,
|
|
"token_acc": 0.9270649417354178,
|
|
"train_speed(iter/s)": 0.113842
|
|
},
|
|
{
|
|
"epoch": 0.8335559711461394,
|
|
"eval_loss": 0.18875150382518768,
|
|
"eval_runtime": 39.1112,
|
|
"eval_samples_per_second": 15.443,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9345214351583778,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.8388992786534865,
|
|
"grad_norm": 0.7965995669364929,
|
|
"learning_rate": 8.188845574481162e-06,
|
|
"loss": 0.20428986549377443,
|
|
"memory(GiB)": 31.36,
|
|
"step": 785,
|
|
"token_acc": 0.9295484112938198,
|
|
"train_speed(iter/s)": 0.113243
|
|
},
|
|
{
|
|
"epoch": 0.8442425861608336,
|
|
"grad_norm": 0.8420124053955078,
|
|
"learning_rate": 8.167229364092648e-06,
|
|
"loss": 0.2018270969390869,
|
|
"memory(GiB)": 31.36,
|
|
"step": 790,
|
|
"token_acc": 0.9257297598661941,
|
|
"train_speed(iter/s)": 0.113454
|
|
},
|
|
{
|
|
"epoch": 0.8495858936681806,
|
|
"grad_norm": 0.7496061325073242,
|
|
"learning_rate": 8.145513830115367e-06,
|
|
"loss": 0.18817675113677979,
|
|
"memory(GiB)": 31.36,
|
|
"step": 795,
|
|
"token_acc": 0.9294179964245514,
|
|
"train_speed(iter/s)": 0.113632
|
|
},
|
|
{
|
|
"epoch": 0.8549292011755276,
|
|
"grad_norm": 0.7280667424201965,
|
|
"learning_rate": 8.1236996535435e-06,
|
|
"loss": 0.20691485404968263,
|
|
"memory(GiB)": 31.36,
|
|
"step": 800,
|
|
"token_acc": 0.9246175682069074,
|
|
"train_speed(iter/s)": 0.113856
|
|
},
|
|
{
|
|
"epoch": 0.8549292011755276,
|
|
"eval_loss": 0.18890005350112915,
|
|
"eval_runtime": 39.0966,
|
|
"eval_samples_per_second": 15.449,
|
|
"eval_steps_per_second": 3.862,
|
|
"eval_token_acc": 0.9341997469386004,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.8602725086828747,
|
|
"grad_norm": 0.7484925389289856,
|
|
"learning_rate": 8.101787518464634e-06,
|
|
"loss": 0.20772714614868165,
|
|
"memory(GiB)": 31.36,
|
|
"step": 805,
|
|
"token_acc": 0.9296667323028454,
|
|
"train_speed(iter/s)": 0.113284
|
|
},
|
|
{
|
|
"epoch": 0.8656158161902218,
|
|
"grad_norm": 0.791478157043457,
|
|
"learning_rate": 8.079778112038318e-06,
|
|
"loss": 0.20092449188232422,
|
|
"memory(GiB)": 31.36,
|
|
"step": 810,
|
|
"token_acc": 0.9342826902722987,
|
|
"train_speed(iter/s)": 0.113469
|
|
},
|
|
{
|
|
"epoch": 0.8709591236975688,
|
|
"grad_norm": 0.7884016633033752,
|
|
"learning_rate": 8.057672124474508e-06,
|
|
"loss": 0.19559590816497802,
|
|
"memory(GiB)": 31.36,
|
|
"step": 815,
|
|
"token_acc": 0.9313420307089644,
|
|
"train_speed(iter/s)": 0.113673
|
|
},
|
|
{
|
|
"epoch": 0.8763024312049158,
|
|
"grad_norm": 0.7414833307266235,
|
|
"learning_rate": 8.035470249011916e-06,
|
|
"loss": 0.21486268043518067,
|
|
"memory(GiB)": 31.36,
|
|
"step": 820,
|
|
"token_acc": 0.9156693981017509,
|
|
"train_speed(iter/s)": 0.113866
|
|
},
|
|
{
|
|
"epoch": 0.8763024312049158,
|
|
"eval_loss": 0.18826377391815186,
|
|
"eval_runtime": 39.1249,
|
|
"eval_samples_per_second": 15.438,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9340796500032169,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.8816457387122629,
|
|
"grad_norm": 0.6968944072723389,
|
|
"learning_rate": 8.013173181896283e-06,
|
|
"loss": 0.195112144947052,
|
|
"memory(GiB)": 31.36,
|
|
"step": 825,
|
|
"token_acc": 0.9297919928427645,
|
|
"train_speed(iter/s)": 0.113289
|
|
},
|
|
{
|
|
"epoch": 0.88698904621961,
|
|
"grad_norm": 0.6487668752670288,
|
|
"learning_rate": 7.990781622358535e-06,
|
|
"loss": 0.20295815467834472,
|
|
"memory(GiB)": 31.36,
|
|
"step": 830,
|
|
"token_acc": 0.9273416807127378,
|
|
"train_speed(iter/s)": 0.113496
|
|
},
|
|
{
|
|
"epoch": 0.892332353726957,
|
|
"grad_norm": 0.7101380825042725,
|
|
"learning_rate": 7.968296272592862e-06,
|
|
"loss": 0.2020167589187622,
|
|
"memory(GiB)": 31.36,
|
|
"step": 835,
|
|
"token_acc": 0.9326549210206562,
|
|
"train_speed(iter/s)": 0.113672
|
|
},
|
|
{
|
|
"epoch": 0.897675661234304,
|
|
"grad_norm": 0.7018981575965881,
|
|
"learning_rate": 7.945717837734688e-06,
|
|
"loss": 0.21045067310333251,
|
|
"memory(GiB)": 31.36,
|
|
"step": 840,
|
|
"token_acc": 0.9254527494237734,
|
|
"train_speed(iter/s)": 0.113897
|
|
},
|
|
{
|
|
"epoch": 0.897675661234304,
|
|
"eval_loss": 0.18678458034992218,
|
|
"eval_runtime": 39.117,
|
|
"eval_samples_per_second": 15.441,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9345257243346415,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.9030189687416511,
|
|
"grad_norm": 0.7442994713783264,
|
|
"learning_rate": 7.923047025838573e-06,
|
|
"loss": 0.18977639675140381,
|
|
"memory(GiB)": 31.36,
|
|
"step": 845,
|
|
"token_acc": 0.9316706328119585,
|
|
"train_speed(iter/s)": 0.11334
|
|
},
|
|
{
|
|
"epoch": 0.9083622762489981,
|
|
"grad_norm": 0.5974848866462708,
|
|
"learning_rate": 7.900284547855992e-06,
|
|
"loss": 0.19339005947113036,
|
|
"memory(GiB)": 31.36,
|
|
"step": 850,
|
|
"token_acc": 0.9319930430681249,
|
|
"train_speed(iter/s)": 0.113541
|
|
},
|
|
{
|
|
"epoch": 0.9137055837563451,
|
|
"grad_norm": 0.7044218182563782,
|
|
"learning_rate": 7.87743111761305e-06,
|
|
"loss": 0.19252583980560303,
|
|
"memory(GiB)": 31.36,
|
|
"step": 855,
|
|
"token_acc": 0.9317780249983639,
|
|
"train_speed(iter/s)": 0.113714
|
|
},
|
|
{
|
|
"epoch": 0.9190488912636923,
|
|
"grad_norm": 0.6662729978561401,
|
|
"learning_rate": 7.8544874517881e-06,
|
|
"loss": 0.18919335603713988,
|
|
"memory(GiB)": 31.36,
|
|
"step": 860,
|
|
"token_acc": 0.9358071036673404,
|
|
"train_speed(iter/s)": 0.113874
|
|
},
|
|
{
|
|
"epoch": 0.9190488912636923,
|
|
"eval_loss": 0.18598179519176483,
|
|
"eval_runtime": 39.1151,
|
|
"eval_samples_per_second": 15.442,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9348645692594737,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.9243921987710393,
|
|
"grad_norm": 0.7112472057342529,
|
|
"learning_rate": 7.831454269889251e-06,
|
|
"loss": 0.2195812225341797,
|
|
"memory(GiB)": 31.36,
|
|
"step": 865,
|
|
"token_acc": 0.929244681467741,
|
|
"train_speed(iter/s)": 0.113345
|
|
},
|
|
{
|
|
"epoch": 0.9297355062783863,
|
|
"grad_norm": 0.6910869479179382,
|
|
"learning_rate": 7.808332294231824e-06,
|
|
"loss": 0.19954900741577147,
|
|
"memory(GiB)": 31.36,
|
|
"step": 870,
|
|
"token_acc": 0.931801259053679,
|
|
"train_speed(iter/s)": 0.113528
|
|
},
|
|
{
|
|
"epoch": 0.9350788137857333,
|
|
"grad_norm": 0.681770384311676,
|
|
"learning_rate": 7.785122249915688e-06,
|
|
"loss": 0.18991070985794067,
|
|
"memory(GiB)": 31.36,
|
|
"step": 875,
|
|
"token_acc": 0.9295779137975007,
|
|
"train_speed(iter/s)": 0.113699
|
|
},
|
|
{
|
|
"epoch": 0.9404221212930804,
|
|
"grad_norm": 0.8212052583694458,
|
|
"learning_rate": 7.76182486480253e-06,
|
|
"loss": 0.1996673822402954,
|
|
"memory(GiB)": 31.36,
|
|
"step": 880,
|
|
"token_acc": 0.9402507040697995,
|
|
"train_speed(iter/s)": 0.113869
|
|
},
|
|
{
|
|
"epoch": 0.9404221212930804,
|
|
"eval_loss": 0.1853218525648117,
|
|
"eval_runtime": 39.2228,
|
|
"eval_samples_per_second": 15.399,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9349203285509018,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.9457654288004275,
|
|
"grad_norm": 0.7549769878387451,
|
|
"learning_rate": 7.738440869493018e-06,
|
|
"loss": 0.19690234661102296,
|
|
"memory(GiB)": 31.36,
|
|
"step": 885,
|
|
"token_acc": 0.9322219434878789,
|
|
"train_speed(iter/s)": 0.113325
|
|
},
|
|
{
|
|
"epoch": 0.9511087363077745,
|
|
"grad_norm": 0.7041129469871521,
|
|
"learning_rate": 7.714970997303898e-06,
|
|
"loss": 0.19316442012786866,
|
|
"memory(GiB)": 31.36,
|
|
"step": 890,
|
|
"token_acc": 0.9402204546877936,
|
|
"train_speed(iter/s)": 0.113488
|
|
},
|
|
{
|
|
"epoch": 0.9564520438151216,
|
|
"grad_norm": 0.6991894841194153,
|
|
"learning_rate": 7.691415984244998e-06,
|
|
"loss": 0.19888077974319457,
|
|
"memory(GiB)": 31.36,
|
|
"step": 895,
|
|
"token_acc": 0.9314758549356779,
|
|
"train_speed(iter/s)": 0.113654
|
|
},
|
|
{
|
|
"epoch": 0.9617953513224686,
|
|
"grad_norm": 0.6569207906723022,
|
|
"learning_rate": 7.667776568996143e-06,
|
|
"loss": 0.19370880126953124,
|
|
"memory(GiB)": 31.36,
|
|
"step": 900,
|
|
"token_acc": 0.9230841325877097,
|
|
"train_speed(iter/s)": 0.113808
|
|
},
|
|
{
|
|
"epoch": 0.9617953513224686,
|
|
"eval_loss": 0.1854468584060669,
|
|
"eval_runtime": 39.1165,
|
|
"eval_samples_per_second": 15.441,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9353878487636449,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.9671386588298156,
|
|
"grad_norm": 0.6145524382591248,
|
|
"learning_rate": 7.64405349288399e-06,
|
|
"loss": 0.18694071769714354,
|
|
"memory(GiB)": 31.36,
|
|
"step": 905,
|
|
"token_acc": 0.9349944519517045,
|
|
"train_speed(iter/s)": 0.113288
|
|
},
|
|
{
|
|
"epoch": 0.9724819663371627,
|
|
"grad_norm": 0.6918651461601257,
|
|
"learning_rate": 7.62024749985878e-06,
|
|
"loss": 0.1850353240966797,
|
|
"memory(GiB)": 31.36,
|
|
"step": 910,
|
|
"token_acc": 0.9334631974398219,
|
|
"train_speed(iter/s)": 0.113436
|
|
},
|
|
{
|
|
"epoch": 0.9778252738445098,
|
|
"grad_norm": 0.6467410922050476,
|
|
"learning_rate": 7.596359336471015e-06,
|
|
"loss": 0.1928159236907959,
|
|
"memory(GiB)": 31.36,
|
|
"step": 915,
|
|
"token_acc": 0.9306192268217585,
|
|
"train_speed(iter/s)": 0.113579
|
|
},
|
|
{
|
|
"epoch": 0.9831685813518568,
|
|
"grad_norm": 0.6539848446846008,
|
|
"learning_rate": 7.572389751848037e-06,
|
|
"loss": 0.19003190994262695,
|
|
"memory(GiB)": 31.36,
|
|
"step": 920,
|
|
"token_acc": 0.9286221470836855,
|
|
"train_speed(iter/s)": 0.113727
|
|
},
|
|
{
|
|
"epoch": 0.9831685813518568,
|
|
"eval_loss": 0.1848677396774292,
|
|
"eval_runtime": 39.1075,
|
|
"eval_samples_per_second": 15.445,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9352377275944155,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.9885118888592038,
|
|
"grad_norm": 0.630643904209137,
|
|
"learning_rate": 7.548339497670538e-06,
|
|
"loss": 0.19637407064437867,
|
|
"memory(GiB)": 31.36,
|
|
"step": 925,
|
|
"token_acc": 0.934642791292936,
|
|
"train_speed(iter/s)": 0.113199
|
|
},
|
|
{
|
|
"epoch": 0.9938551963665508,
|
|
"grad_norm": 0.7896732091903687,
|
|
"learning_rate": 7.524209328148995e-06,
|
|
"loss": 0.1935054898262024,
|
|
"memory(GiB)": 31.36,
|
|
"step": 930,
|
|
"token_acc": 0.9335429563394583,
|
|
"train_speed(iter/s)": 0.113379
|
|
},
|
|
{
|
|
"epoch": 0.999198503873898,
|
|
"grad_norm": 0.8091479539871216,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.1939959168434143,
|
|
"memory(GiB)": 31.36,
|
|
"step": 935,
|
|
"token_acc": 0.9332101204512141,
|
|
"train_speed(iter/s)": 0.113561
|
|
},
|
|
{
|
|
"epoch": 1.0042746460058776,
|
|
"grad_norm": 0.7063812017440796,
|
|
"learning_rate": 7.4757122724225575e-06,
|
|
"loss": 0.15370899438858032,
|
|
"memory(GiB)": 31.36,
|
|
"step": 940,
|
|
"token_acc": 0.9470076897358742,
|
|
"train_speed(iter/s)": 0.113758
|
|
},
|
|
{
|
|
"epoch": 1.0042746460058776,
|
|
"eval_loss": 0.1846647411584854,
|
|
"eval_runtime": 39.1772,
|
|
"eval_samples_per_second": 15.417,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9355808616955114,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.0096179535132246,
|
|
"grad_norm": 0.7912789583206177,
|
|
"learning_rate": 7.451346907074245e-06,
|
|
"loss": 0.14589121341705322,
|
|
"memory(GiB)": 31.36,
|
|
"step": 945,
|
|
"token_acc": 0.9395386832162834,
|
|
"train_speed(iter/s)": 0.113271
|
|
},
|
|
{
|
|
"epoch": 1.0149612610205718,
|
|
"grad_norm": 0.773916482925415,
|
|
"learning_rate": 7.426904668047352e-06,
|
|
"loss": 0.14080936908721925,
|
|
"memory(GiB)": 31.36,
|
|
"step": 950,
|
|
"token_acc": 0.9501985945615643,
|
|
"train_speed(iter/s)": 0.113448
|
|
},
|
|
{
|
|
"epoch": 1.0203045685279188,
|
|
"grad_norm": 0.7244420647621155,
|
|
"learning_rate": 7.40238632184491e-06,
|
|
"loss": 0.1430067539215088,
|
|
"memory(GiB)": 31.36,
|
|
"step": 955,
|
|
"token_acc": 0.9496945399007255,
|
|
"train_speed(iter/s)": 0.113636
|
|
},
|
|
{
|
|
"epoch": 1.0256478760352659,
|
|
"grad_norm": 0.7013021111488342,
|
|
"learning_rate": 7.377792637356644e-06,
|
|
"loss": 0.13634157180786133,
|
|
"memory(GiB)": 31.36,
|
|
"step": 960,
|
|
"token_acc": 0.9495622671230802,
|
|
"train_speed(iter/s)": 0.113799
|
|
},
|
|
{
|
|
"epoch": 1.0256478760352659,
|
|
"eval_loss": 0.18845246732234955,
|
|
"eval_runtime": 39.1642,
|
|
"eval_samples_per_second": 15.422,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9350661605438676,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.0309911835426129,
|
|
"grad_norm": 0.6317921876907349,
|
|
"learning_rate": 7.35312438583488e-06,
|
|
"loss": 0.14405910968780516,
|
|
"memory(GiB)": 31.36,
|
|
"step": 965,
|
|
"token_acc": 0.94003444829566,
|
|
"train_speed(iter/s)": 0.113324
|
|
},
|
|
{
|
|
"epoch": 1.03633449104996,
|
|
"grad_norm": 0.7653928399085999,
|
|
"learning_rate": 7.3283823408703466e-06,
|
|
"loss": 0.14201946258544923,
|
|
"memory(GiB)": 31.36,
|
|
"step": 970,
|
|
"token_acc": 0.9429385599110369,
|
|
"train_speed(iter/s)": 0.113486
|
|
},
|
|
{
|
|
"epoch": 1.041677798557307,
|
|
"grad_norm": 0.7575430870056152,
|
|
"learning_rate": 7.303567278367918e-06,
|
|
"loss": 0.15218265056610109,
|
|
"memory(GiB)": 31.36,
|
|
"step": 975,
|
|
"token_acc": 0.9438988818667963,
|
|
"train_speed(iter/s)": 0.113648
|
|
},
|
|
{
|
|
"epoch": 1.047021106064654,
|
|
"grad_norm": 0.6947309970855713,
|
|
"learning_rate": 7.278679976522279e-06,
|
|
"loss": 0.14258232116699218,
|
|
"memory(GiB)": 31.36,
|
|
"step": 980,
|
|
"token_acc": 0.9484339445857315,
|
|
"train_speed(iter/s)": 0.113835
|
|
},
|
|
{
|
|
"epoch": 1.047021106064654,
|
|
"eval_loss": 0.18912462890148163,
|
|
"eval_runtime": 39.1355,
|
|
"eval_samples_per_second": 15.434,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9351605224216689,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.0523644135720012,
|
|
"grad_norm": 0.6447970271110535,
|
|
"learning_rate": 7.253721215793528e-06,
|
|
"loss": 0.13303806781768798,
|
|
"memory(GiB)": 31.36,
|
|
"step": 985,
|
|
"token_acc": 0.9368344110205229,
|
|
"train_speed(iter/s)": 0.113351
|
|
},
|
|
{
|
|
"epoch": 1.0577077210793482,
|
|
"grad_norm": 0.7357504367828369,
|
|
"learning_rate": 7.2286917788826926e-06,
|
|
"loss": 0.14264590740203859,
|
|
"memory(GiB)": 31.36,
|
|
"step": 990,
|
|
"token_acc": 0.9517513105612875,
|
|
"train_speed(iter/s)": 0.113523
|
|
},
|
|
{
|
|
"epoch": 1.0630510285866952,
|
|
"grad_norm": 0.7251073718070984,
|
|
"learning_rate": 7.203592450707193e-06,
|
|
"loss": 0.14065431356430053,
|
|
"memory(GiB)": 31.36,
|
|
"step": 995,
|
|
"token_acc": 0.9541761579347001,
|
|
"train_speed(iter/s)": 0.113657
|
|
},
|
|
{
|
|
"epoch": 1.0683943360940422,
|
|
"grad_norm": 0.6575713753700256,
|
|
"learning_rate": 7.178424018376224e-06,
|
|
"loss": 0.13455284833908082,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1000,
|
|
"token_acc": 0.952753960692989,
|
|
"train_speed(iter/s)": 0.113814
|
|
},
|
|
{
|
|
"epoch": 1.0683943360940422,
|
|
"eval_loss": 0.18993094563484192,
|
|
"eval_runtime": 39.1132,
|
|
"eval_samples_per_second": 15.442,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9351004739539771,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.0737376436013892,
|
|
"grad_norm": 0.6939108371734619,
|
|
"learning_rate": 7.153187271166071e-06,
|
|
"loss": 0.1378490924835205,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1005,
|
|
"token_acc": 0.9382960940547167,
|
|
"train_speed(iter/s)": 0.113358
|
|
},
|
|
{
|
|
"epoch": 1.0790809511087363,
|
|
"grad_norm": 0.6724715828895569,
|
|
"learning_rate": 7.127883000495353e-06,
|
|
"loss": 0.14932271242141723,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1010,
|
|
"token_acc": 0.9499566799514816,
|
|
"train_speed(iter/s)": 0.113511
|
|
},
|
|
{
|
|
"epoch": 1.0844242586160833,
|
|
"grad_norm": 0.731438159942627,
|
|
"learning_rate": 7.102511999900213e-06,
|
|
"loss": 0.13644077777862548,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1015,
|
|
"token_acc": 0.9460200277757473,
|
|
"train_speed(iter/s)": 0.113649
|
|
},
|
|
{
|
|
"epoch": 1.0897675661234305,
|
|
"grad_norm": 0.6680863499641418,
|
|
"learning_rate": 7.0770750650094335e-06,
|
|
"loss": 0.13693207502365112,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1020,
|
|
"token_acc": 0.9469487672670047,
|
|
"train_speed(iter/s)": 0.113805
|
|
},
|
|
{
|
|
"epoch": 1.0897675661234305,
|
|
"eval_loss": 0.18932494521141052,
|
|
"eval_runtime": 39.1534,
|
|
"eval_samples_per_second": 15.427,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9353063544146347,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.0951108736307775,
|
|
"grad_norm": 0.6814751029014587,
|
|
"learning_rate": 7.051572993519474e-06,
|
|
"loss": 0.13657076358795167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1025,
|
|
"token_acc": 0.9386909315096644,
|
|
"train_speed(iter/s)": 0.113321
|
|
},
|
|
{
|
|
"epoch": 1.1004541811381245,
|
|
"grad_norm": 0.6972615718841553,
|
|
"learning_rate": 7.026006585169467e-06,
|
|
"loss": 0.14217867851257324,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1030,
|
|
"token_acc": 0.9473362948896199,
|
|
"train_speed(iter/s)": 0.113472
|
|
},
|
|
{
|
|
"epoch": 1.1057974886454716,
|
|
"grad_norm": 0.7882171869277954,
|
|
"learning_rate": 7.0003766417161335e-06,
|
|
"loss": 0.13929877281188965,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1035,
|
|
"token_acc": 0.9491739520659509,
|
|
"train_speed(iter/s)": 0.113632
|
|
},
|
|
{
|
|
"epoch": 1.1111407961528186,
|
|
"grad_norm": 0.7138720154762268,
|
|
"learning_rate": 6.974683966908642e-06,
|
|
"loss": 0.1398939847946167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1040,
|
|
"token_acc": 0.9491174031512853,
|
|
"train_speed(iter/s)": 0.113768
|
|
},
|
|
{
|
|
"epoch": 1.1111407961528186,
|
|
"eval_loss": 0.18828535079956055,
|
|
"eval_runtime": 39.1885,
|
|
"eval_samples_per_second": 15.413,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.9354607647601278,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.1164841036601656,
|
|
"grad_norm": 0.7279144525527954,
|
|
"learning_rate": 6.948929366463397e-06,
|
|
"loss": 0.15247514247894287,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1045,
|
|
"token_acc": 0.9380235654449663,
|
|
"train_speed(iter/s)": 0.113327
|
|
},
|
|
{
|
|
"epoch": 1.1218274111675126,
|
|
"grad_norm": 0.7380113005638123,
|
|
"learning_rate": 6.923113648038784e-06,
|
|
"loss": 0.14748337268829345,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1050,
|
|
"token_acc": 0.9475613194248661,
|
|
"train_speed(iter/s)": 0.11348
|
|
},
|
|
{
|
|
"epoch": 1.1271707186748596,
|
|
"grad_norm": 0.7649953961372375,
|
|
"learning_rate": 6.897237621209831e-06,
|
|
"loss": 0.14567428827285767,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1055,
|
|
"token_acc": 0.9500533120085299,
|
|
"train_speed(iter/s)": 0.113609
|
|
},
|
|
{
|
|
"epoch": 1.1325140261822069,
|
|
"grad_norm": 0.7318655252456665,
|
|
"learning_rate": 6.87130209744282e-06,
|
|
"loss": 0.13384032249450684,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1060,
|
|
"token_acc": 0.9565458338766463,
|
|
"train_speed(iter/s)": 0.113739
|
|
},
|
|
{
|
|
"epoch": 1.1325140261822069,
|
|
"eval_loss": 0.18837569653987885,
|
|
"eval_runtime": 39.2359,
|
|
"eval_samples_per_second": 15.394,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.935216281713097,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.1378573336895539,
|
|
"grad_norm": 0.7364778518676758,
|
|
"learning_rate": 6.845307890069851e-06,
|
|
"loss": 0.1373004674911499,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1065,
|
|
"token_acc": 0.9390180878552972,
|
|
"train_speed(iter/s)": 0.113291
|
|
},
|
|
{
|
|
"epoch": 1.143200641196901,
|
|
"grad_norm": 0.685958743095398,
|
|
"learning_rate": 6.8192558142633215e-06,
|
|
"loss": 0.13763891458511351,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1070,
|
|
"token_acc": 0.9479466974181562,
|
|
"train_speed(iter/s)": 0.113441
|
|
},
|
|
{
|
|
"epoch": 1.148543948704248,
|
|
"grad_norm": 0.6921528577804565,
|
|
"learning_rate": 6.7931466870103735e-06,
|
|
"loss": 0.1474214553833008,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1075,
|
|
"token_acc": 0.9462754416778593,
|
|
"train_speed(iter/s)": 0.11358
|
|
},
|
|
{
|
|
"epoch": 1.153887256211595,
|
|
"grad_norm": 0.6673567891120911,
|
|
"learning_rate": 6.766981327087271e-06,
|
|
"loss": 0.134868586063385,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1080,
|
|
"token_acc": 0.9541073453445886,
|
|
"train_speed(iter/s)": 0.113692
|
|
},
|
|
{
|
|
"epoch": 1.153887256211595,
|
|
"eval_loss": 0.1878366470336914,
|
|
"eval_runtime": 39.1901,
|
|
"eval_samples_per_second": 15.412,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.9354607647601278,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.159230563718942,
|
|
"grad_norm": 0.8000864386558533,
|
|
"learning_rate": 6.740760555033715e-06,
|
|
"loss": 0.14174835681915282,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1085,
|
|
"token_acc": 0.9395288542253296,
|
|
"train_speed(iter/s)": 0.113249
|
|
},
|
|
{
|
|
"epoch": 1.1645738712262892,
|
|
"grad_norm": 0.6853452920913696,
|
|
"learning_rate": 6.714485193127126e-06,
|
|
"loss": 0.14102463722229003,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1090,
|
|
"token_acc": 0.9465929419417791,
|
|
"train_speed(iter/s)": 0.11338
|
|
},
|
|
{
|
|
"epoch": 1.1699171787336362,
|
|
"grad_norm": 0.747848629951477,
|
|
"learning_rate": 6.688156065356845e-06,
|
|
"loss": 0.14443647861480713,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1095,
|
|
"token_acc": 0.9509245187436677,
|
|
"train_speed(iter/s)": 0.113501
|
|
},
|
|
{
|
|
"epoch": 1.1752604862409832,
|
|
"grad_norm": 0.7009637355804443,
|
|
"learning_rate": 6.6617739973982985e-06,
|
|
"loss": 0.1462648630142212,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1100,
|
|
"token_acc": 0.9474907617117654,
|
|
"train_speed(iter/s)": 0.113649
|
|
},
|
|
{
|
|
"epoch": 1.1752604862409832,
|
|
"eval_loss": 0.1868010312318802,
|
|
"eval_runtime": 39.2141,
|
|
"eval_samples_per_second": 15.403,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9356494885157306,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.1806037937483302,
|
|
"grad_norm": 0.7919987440109253,
|
|
"learning_rate": 6.635339816587109e-06,
|
|
"loss": 0.14761772155761718,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1105,
|
|
"token_acc": 0.9380447931623158,
|
|
"train_speed(iter/s)": 0.113238
|
|
},
|
|
{
|
|
"epoch": 1.1859471012556773,
|
|
"grad_norm": 0.7107034921646118,
|
|
"learning_rate": 6.60885435189314e-06,
|
|
"loss": 0.15119514465332032,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1110,
|
|
"token_acc": 0.9427783975326465,
|
|
"train_speed(iter/s)": 0.11339
|
|
},
|
|
{
|
|
"epoch": 1.1912904087630243,
|
|
"grad_norm": 0.7467309236526489,
|
|
"learning_rate": 6.582318433894513e-06,
|
|
"loss": 0.13204342126846313,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1115,
|
|
"token_acc": 0.9524959742351047,
|
|
"train_speed(iter/s)": 0.113534
|
|
},
|
|
{
|
|
"epoch": 1.1966337162703713,
|
|
"grad_norm": 0.7935456037521362,
|
|
"learning_rate": 6.555732894751548e-06,
|
|
"loss": 0.1459757924079895,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1120,
|
|
"token_acc": 0.9519983083104251,
|
|
"train_speed(iter/s)": 0.113678
|
|
},
|
|
{
|
|
"epoch": 1.1966337162703713,
|
|
"eval_loss": 0.18683308362960815,
|
|
"eval_runtime": 39.1575,
|
|
"eval_samples_per_second": 15.425,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9356194642818847,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.2019770237777183,
|
|
"grad_norm": 0.7466067671775818,
|
|
"learning_rate": 6.529098568180672e-06,
|
|
"loss": 0.14143054485321044,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1125,
|
|
"token_acc": 0.939937276954688,
|
|
"train_speed(iter/s)": 0.11325
|
|
},
|
|
{
|
|
"epoch": 1.2073203312850656,
|
|
"grad_norm": 0.7526156306266785,
|
|
"learning_rate": 6.502416289428282e-06,
|
|
"loss": 0.14170231819152831,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1130,
|
|
"token_acc": 0.9475191453761503,
|
|
"train_speed(iter/s)": 0.113376
|
|
},
|
|
{
|
|
"epoch": 1.2126636387924126,
|
|
"grad_norm": 0.7008678913116455,
|
|
"learning_rate": 6.475686895244534e-06,
|
|
"loss": 0.14561245441436768,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1135,
|
|
"token_acc": 0.9469662033072007,
|
|
"train_speed(iter/s)": 0.113525
|
|
},
|
|
{
|
|
"epoch": 1.2180069462997596,
|
|
"grad_norm": 0.7882223129272461,
|
|
"learning_rate": 6.448911223857124e-06,
|
|
"loss": 0.14698657989501954,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1140,
|
|
"token_acc": 0.9527493782812931,
|
|
"train_speed(iter/s)": 0.113658
|
|
},
|
|
{
|
|
"epoch": 1.2180069462997596,
|
|
"eval_loss": 0.1868022382259369,
|
|
"eval_runtime": 39.1631,
|
|
"eval_samples_per_second": 15.423,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9359154174440799,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.2233502538071066,
|
|
"grad_norm": 0.7010296583175659,
|
|
"learning_rate": 6.422090114944982e-06,
|
|
"loss": 0.14752376079559326,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1145,
|
|
"token_acc": 0.937754062131431,
|
|
"train_speed(iter/s)": 0.113221
|
|
},
|
|
{
|
|
"epoch": 1.2286935613144536,
|
|
"grad_norm": 0.710648238658905,
|
|
"learning_rate": 6.3952244096119535e-06,
|
|
"loss": 0.13726551532745362,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1150,
|
|
"token_acc": 0.9516319057474925,
|
|
"train_speed(iter/s)": 0.113373
|
|
},
|
|
{
|
|
"epoch": 1.2340368688218006,
|
|
"grad_norm": 0.7825097441673279,
|
|
"learning_rate": 6.368314950360416e-06,
|
|
"loss": 0.151510751247406,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1155,
|
|
"token_acc": 0.9457324403228576,
|
|
"train_speed(iter/s)": 0.113517
|
|
},
|
|
{
|
|
"epoch": 1.2393801763291477,
|
|
"grad_norm": 0.7237306833267212,
|
|
"learning_rate": 6.341362581064856e-06,
|
|
"loss": 0.14253956079483032,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1160,
|
|
"token_acc": 0.9461406518010291,
|
|
"train_speed(iter/s)": 0.113677
|
|
},
|
|
{
|
|
"epoch": 1.2393801763291477,
|
|
"eval_loss": 0.18586544692516327,
|
|
"eval_runtime": 39.1092,
|
|
"eval_samples_per_second": 15.444,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9360355143794634,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.244723483836495,
|
|
"grad_norm": 0.7146082520484924,
|
|
"learning_rate": 6.314368146945418e-06,
|
|
"loss": 0.14136313199996947,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1165,
|
|
"token_acc": 0.9388746238483078,
|
|
"train_speed(iter/s)": 0.113271
|
|
},
|
|
{
|
|
"epoch": 1.250066791343842,
|
|
"grad_norm": 0.7276601195335388,
|
|
"learning_rate": 6.28733249454138e-06,
|
|
"loss": 0.1453978180885315,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1170,
|
|
"token_acc": 0.9472019757845913,
|
|
"train_speed(iter/s)": 0.113427
|
|
},
|
|
{
|
|
"epoch": 1.255410098851189,
|
|
"grad_norm": 0.7507435083389282,
|
|
"learning_rate": 6.260256471684622e-06,
|
|
"loss": 0.14081387519836425,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1175,
|
|
"token_acc": 0.9456987966162278,
|
|
"train_speed(iter/s)": 0.113564
|
|
},
|
|
{
|
|
"epoch": 1.260753406358536,
|
|
"grad_norm": 0.6047825217247009,
|
|
"learning_rate": 6.233140927473033e-06,
|
|
"loss": 0.1298896551132202,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1180,
|
|
"token_acc": 0.950142074581832,
|
|
"train_speed(iter/s)": 0.113686
|
|
},
|
|
{
|
|
"epoch": 1.260753406358536,
|
|
"eval_loss": 0.186279758810997,
|
|
"eval_runtime": 39.1529,
|
|
"eval_samples_per_second": 15.427,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9361170087284737,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.266096713865883,
|
|
"grad_norm": 0.7231855988502502,
|
|
"learning_rate": 6.205986712243876e-06,
|
|
"loss": 0.13684126138687133,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1185,
|
|
"token_acc": 0.939052757793765,
|
|
"train_speed(iter/s)": 0.113247
|
|
},
|
|
{
|
|
"epoch": 1.27144002137323,
|
|
"grad_norm": 0.7016168832778931,
|
|
"learning_rate": 6.178794677547138e-06,
|
|
"loss": 0.15314276218414308,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1190,
|
|
"token_acc": 0.9389803557822904,
|
|
"train_speed(iter/s)": 0.113385
|
|
},
|
|
{
|
|
"epoch": 1.276783328880577,
|
|
"grad_norm": 0.7309826612472534,
|
|
"learning_rate": 6.151565676118805e-06,
|
|
"loss": 0.13780862092971802,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1195,
|
|
"token_acc": 0.9577650445215613,
|
|
"train_speed(iter/s)": 0.113507
|
|
},
|
|
{
|
|
"epoch": 1.282126636387924,
|
|
"grad_norm": 0.7305301427841187,
|
|
"learning_rate": 6.124300561854139e-06,
|
|
"loss": 0.13783036470413207,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1200,
|
|
"token_acc": 0.9519318638739628,
|
|
"train_speed(iter/s)": 0.113619
|
|
},
|
|
{
|
|
"epoch": 1.282126636387924,
|
|
"eval_loss": 0.1861124336719513,
|
|
"eval_runtime": 39.1734,
|
|
"eval_samples_per_second": 15.419,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.9362371056638572,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.2874699438952713,
|
|
"grad_norm": 0.7063933610916138,
|
|
"learning_rate": 6.097000189780893e-06,
|
|
"loss": 0.1543891429901123,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1205,
|
|
"token_acc": 0.9369793792821915,
|
|
"train_speed(iter/s)": 0.113257
|
|
},
|
|
{
|
|
"epoch": 1.2928132514026183,
|
|
"grad_norm": 0.7241778373718262,
|
|
"learning_rate": 6.0696654160324875e-06,
|
|
"loss": 0.13728095293045045,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1210,
|
|
"token_acc": 0.9575441100155683,
|
|
"train_speed(iter/s)": 0.113381
|
|
},
|
|
{
|
|
"epoch": 1.2981565589099653,
|
|
"grad_norm": 0.7719012498855591,
|
|
"learning_rate": 6.042297097821184e-06,
|
|
"loss": 0.15218913555145264,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1215,
|
|
"token_acc": 0.949528983015884,
|
|
"train_speed(iter/s)": 0.113494
|
|
},
|
|
{
|
|
"epoch": 1.3034998664173123,
|
|
"grad_norm": 0.6969226002693176,
|
|
"learning_rate": 6.014896093411181e-06,
|
|
"loss": 0.13368651866912842,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1220,
|
|
"token_acc": 0.9476698598847714,
|
|
"train_speed(iter/s)": 0.113599
|
|
},
|
|
{
|
|
"epoch": 1.3034998664173123,
|
|
"eval_loss": 0.1850433051586151,
|
|
"eval_runtime": 39.2188,
|
|
"eval_samples_per_second": 15.401,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9363186000128675,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.3088431739246593,
|
|
"grad_norm": 0.6775336861610413,
|
|
"learning_rate": 5.987463262091715e-06,
|
|
"loss": 0.139385187625885,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1225,
|
|
"token_acc": 0.9377400468384075,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 1.3141864814320063,
|
|
"grad_norm": 0.7808154821395874,
|
|
"learning_rate": 5.959999464150101e-06,
|
|
"loss": 0.1481320381164551,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1230,
|
|
"token_acc": 0.9476690131491566,
|
|
"train_speed(iter/s)": 0.113336
|
|
},
|
|
{
|
|
"epoch": 1.3195297889393536,
|
|
"grad_norm": 0.7226517796516418,
|
|
"learning_rate": 5.932505560844766e-06,
|
|
"loss": 0.14821076393127441,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1235,
|
|
"token_acc": 0.9464622560620092,
|
|
"train_speed(iter/s)": 0.113466
|
|
},
|
|
{
|
|
"epoch": 1.3248730964467006,
|
|
"grad_norm": 0.7376400828361511,
|
|
"learning_rate": 5.904982414378233e-06,
|
|
"loss": 0.13770921230316163,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1240,
|
|
"token_acc": 0.9496229260935143,
|
|
"train_speed(iter/s)": 0.113615
|
|
},
|
|
{
|
|
"epoch": 1.3248730964467006,
|
|
"eval_loss": 0.18510028719902039,
|
|
"eval_runtime": 39.1646,
|
|
"eval_samples_per_second": 15.422,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9362156597825387,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.3302164039540476,
|
|
"grad_norm": 0.7447443008422852,
|
|
"learning_rate": 5.877430887870081e-06,
|
|
"loss": 0.14754925966262816,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1245,
|
|
"token_acc": 0.9386804566572503,
|
|
"train_speed(iter/s)": 0.113234
|
|
},
|
|
{
|
|
"epoch": 1.3355597114613946,
|
|
"grad_norm": 0.7104332447052002,
|
|
"learning_rate": 5.849851845329884e-06,
|
|
"loss": 0.1406762719154358,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1250,
|
|
"token_acc": 0.9495687504455057,
|
|
"train_speed(iter/s)": 0.113347
|
|
},
|
|
{
|
|
"epoch": 1.3409030189687416,
|
|
"grad_norm": 0.7238494753837585,
|
|
"learning_rate": 5.822246151630109e-06,
|
|
"loss": 0.13662366867065429,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1255,
|
|
"token_acc": 0.9507098934354979,
|
|
"train_speed(iter/s)": 0.113485
|
|
},
|
|
{
|
|
"epoch": 1.3462463264760887,
|
|
"grad_norm": 0.7518407106399536,
|
|
"learning_rate": 5.794614672479e-06,
|
|
"loss": 0.14233107566833497,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1260,
|
|
"token_acc": 0.9495046143399488,
|
|
"train_speed(iter/s)": 0.1136
|
|
},
|
|
{
|
|
"epoch": 1.3462463264760887,
|
|
"eval_loss": 0.18543414771556854,
|
|
"eval_runtime": 39.1743,
|
|
"eval_samples_per_second": 15.418,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.9364129618906689,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.3515896339834357,
|
|
"grad_norm": 0.7268742322921753,
|
|
"learning_rate": 5.766958274393428e-06,
|
|
"loss": 0.14559613466262816,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1265,
|
|
"token_acc": 0.9380886914433095,
|
|
"train_speed(iter/s)": 0.113217
|
|
},
|
|
{
|
|
"epoch": 1.3569329414907827,
|
|
"grad_norm": 0.6919171810150146,
|
|
"learning_rate": 5.739277824671711e-06,
|
|
"loss": 0.1417681932449341,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1270,
|
|
"token_acc": 0.9501291664041334,
|
|
"train_speed(iter/s)": 0.113322
|
|
},
|
|
{
|
|
"epoch": 1.3622762489981297,
|
|
"grad_norm": 0.7078922390937805,
|
|
"learning_rate": 5.711574191366427e-06,
|
|
"loss": 0.14929780960083008,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1275,
|
|
"token_acc": 0.9459644322845417,
|
|
"train_speed(iter/s)": 0.113449
|
|
},
|
|
{
|
|
"epoch": 1.367619556505477,
|
|
"grad_norm": 0.768913745880127,
|
|
"learning_rate": 5.683848243257181e-06,
|
|
"loss": 0.14540610313415528,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1280,
|
|
"token_acc": 0.9463650228774784,
|
|
"train_speed(iter/s)": 0.113575
|
|
},
|
|
{
|
|
"epoch": 1.367619556505477,
|
|
"eval_loss": 0.18363338708877563,
|
|
"eval_runtime": 38.9429,
|
|
"eval_samples_per_second": 15.51,
|
|
"eval_steps_per_second": 3.877,
|
|
"eval_token_acc": 0.9366574449376998,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.372962864012824,
|
|
"grad_norm": 0.7159769535064697,
|
|
"learning_rate": 5.656100849823366e-06,
|
|
"loss": 0.13922522068023682,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1285,
|
|
"token_acc": 0.9396986067671311,
|
|
"train_speed(iter/s)": 0.113203
|
|
},
|
|
{
|
|
"epoch": 1.378306171520171,
|
|
"grad_norm": 0.6261381506919861,
|
|
"learning_rate": 5.628332881216899e-06,
|
|
"loss": 0.13264775276184082,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1290,
|
|
"token_acc": 0.956436461236709,
|
|
"train_speed(iter/s)": 0.113332
|
|
},
|
|
{
|
|
"epoch": 1.383649479027518,
|
|
"grad_norm": 0.790125846862793,
|
|
"learning_rate": 5.600545208234927e-06,
|
|
"loss": 0.1441697359085083,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1295,
|
|
"token_acc": 0.9488245412844036,
|
|
"train_speed(iter/s)": 0.113461
|
|
},
|
|
{
|
|
"epoch": 1.388992786534865,
|
|
"grad_norm": 0.7451324462890625,
|
|
"learning_rate": 5.57273870229252e-06,
|
|
"loss": 0.13834784030914307,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1300,
|
|
"token_acc": 0.9505032488215059,
|
|
"train_speed(iter/s)": 0.11358
|
|
},
|
|
{
|
|
"epoch": 1.388992786534865,
|
|
"eval_loss": 0.18352170288562775,
|
|
"eval_runtime": 39.1528,
|
|
"eval_samples_per_second": 15.427,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9367603851680285,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.3943360940422123,
|
|
"grad_norm": 0.7569957375526428,
|
|
"learning_rate": 5.544914235395347e-06,
|
|
"loss": 0.15216903686523436,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1305,
|
|
"token_acc": 0.9388893760687508,
|
|
"train_speed(iter/s)": 0.113241
|
|
},
|
|
{
|
|
"epoch": 1.3996794015495593,
|
|
"grad_norm": 0.7055838108062744,
|
|
"learning_rate": 5.517072680112332e-06,
|
|
"loss": 0.13284831047058104,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1310,
|
|
"token_acc": 0.9498824853520474,
|
|
"train_speed(iter/s)": 0.113338
|
|
},
|
|
{
|
|
"epoch": 1.4050227090569063,
|
|
"grad_norm": 0.6958155035972595,
|
|
"learning_rate": 5.4892149095482815e-06,
|
|
"loss": 0.136586332321167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1315,
|
|
"token_acc": 0.9489724944672779,
|
|
"train_speed(iter/s)": 0.113435
|
|
},
|
|
{
|
|
"epoch": 1.4103660165642533,
|
|
"grad_norm": 0.6948981285095215,
|
|
"learning_rate": 5.46134179731651e-06,
|
|
"loss": 0.14353724718093872,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1320,
|
|
"token_acc": 0.9483981258705838,
|
|
"train_speed(iter/s)": 0.11357
|
|
},
|
|
{
|
|
"epoch": 1.4103660165642533,
|
|
"eval_loss": 0.1832687258720398,
|
|
"eval_runtime": 39.1038,
|
|
"eval_samples_per_second": 15.446,
|
|
"eval_steps_per_second": 3.862,
|
|
"eval_token_acc": 0.9369748439812134,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.4157093240716003,
|
|
"grad_norm": 0.6944836378097534,
|
|
"learning_rate": 5.4334542175114495e-06,
|
|
"loss": 0.1423251748085022,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1325,
|
|
"token_acc": 0.9394761855681909,
|
|
"train_speed(iter/s)": 0.11322
|
|
},
|
|
{
|
|
"epoch": 1.4210526315789473,
|
|
"grad_norm": 0.6609280705451965,
|
|
"learning_rate": 5.40555304468122e-06,
|
|
"loss": 0.13840043544769287,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1330,
|
|
"token_acc": 0.9507696104136463,
|
|
"train_speed(iter/s)": 0.113316
|
|
},
|
|
{
|
|
"epoch": 1.4263959390862944,
|
|
"grad_norm": 0.7256921529769897,
|
|
"learning_rate": 5.377639153800229e-06,
|
|
"loss": 0.1384860634803772,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1335,
|
|
"token_acc": 0.9545629784656056,
|
|
"train_speed(iter/s)": 0.113429
|
|
},
|
|
{
|
|
"epoch": 1.4317392465936414,
|
|
"grad_norm": 0.7888199687004089,
|
|
"learning_rate": 5.34971342024171e-06,
|
|
"loss": 0.14113259315490723,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1340,
|
|
"token_acc": 0.954348504280911,
|
|
"train_speed(iter/s)": 0.113523
|
|
},
|
|
{
|
|
"epoch": 1.4317392465936414,
|
|
"eval_loss": 0.184078186750412,
|
|
"eval_runtime": 39.1268,
|
|
"eval_samples_per_second": 15.437,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9369877115100045,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.4370825541009884,
|
|
"grad_norm": 0.7354726791381836,
|
|
"learning_rate": 5.321776719750283e-06,
|
|
"loss": 0.1384582042694092,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1345,
|
|
"token_acc": 0.9407986188960137,
|
|
"train_speed(iter/s)": 0.113178
|
|
},
|
|
{
|
|
"epoch": 1.4424258616083356,
|
|
"grad_norm": 0.6982942819595337,
|
|
"learning_rate": 5.29382992841449e-06,
|
|
"loss": 0.14179346561431885,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1350,
|
|
"token_acc": 0.9446530872056015,
|
|
"train_speed(iter/s)": 0.113289
|
|
},
|
|
{
|
|
"epoch": 1.4477691691156827,
|
|
"grad_norm": 0.6804877519607544,
|
|
"learning_rate": 5.265873922639315e-06,
|
|
"loss": 0.13716717958450317,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1355,
|
|
"token_acc": 0.9538438661710037,
|
|
"train_speed(iter/s)": 0.113405
|
|
},
|
|
{
|
|
"epoch": 1.4531124766230297,
|
|
"grad_norm": 0.7978929877281189,
|
|
"learning_rate": 5.237909579118713e-06,
|
|
"loss": 0.1416216015815735,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1360,
|
|
"token_acc": 0.9511662976866557,
|
|
"train_speed(iter/s)": 0.113536
|
|
},
|
|
{
|
|
"epoch": 1.4531124766230297,
|
|
"eval_loss": 0.1834552139043808,
|
|
"eval_runtime": 39.1688,
|
|
"eval_samples_per_second": 15.42,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.9368547470458298,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.4584557841303767,
|
|
"grad_norm": 0.721110999584198,
|
|
"learning_rate": 5.209937774808098e-06,
|
|
"loss": 0.13820960521697997,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1365,
|
|
"token_acc": 0.9396422402036548,
|
|
"train_speed(iter/s)": 0.113179
|
|
},
|
|
{
|
|
"epoch": 1.4637990916377237,
|
|
"grad_norm": 0.7276850938796997,
|
|
"learning_rate": 5.181959386896862e-06,
|
|
"loss": 0.14571261405944824,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1370,
|
|
"token_acc": 0.9458817568637385,
|
|
"train_speed(iter/s)": 0.113285
|
|
},
|
|
{
|
|
"epoch": 1.4691423991450707,
|
|
"grad_norm": 0.6762943267822266,
|
|
"learning_rate": 5.153975292780852e-06,
|
|
"loss": 0.14370789527893066,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1375,
|
|
"token_acc": 0.9516790861044546,
|
|
"train_speed(iter/s)": 0.113408
|
|
},
|
|
{
|
|
"epoch": 1.474485706652418,
|
|
"grad_norm": 0.7835574746131897,
|
|
"learning_rate": 5.125986370034862e-06,
|
|
"loss": 0.14546499252319336,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1380,
|
|
"token_acc": 0.9492231661229368,
|
|
"train_speed(iter/s)": 0.113557
|
|
},
|
|
{
|
|
"epoch": 1.474485706652418,
|
|
"eval_loss": 0.18329627811908722,
|
|
"eval_runtime": 39.2746,
|
|
"eval_samples_per_second": 15.379,
|
|
"eval_steps_per_second": 3.845,
|
|
"eval_token_acc": 0.9371549893842888,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.479829014159765,
|
|
"grad_norm": 0.7045428156852722,
|
|
"learning_rate": 5.097993496385112e-06,
|
|
"loss": 0.14536089897155763,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1385,
|
|
"token_acc": 0.9408298818336975,
|
|
"train_speed(iter/s)": 0.113213
|
|
},
|
|
{
|
|
"epoch": 1.485172321667112,
|
|
"grad_norm": 0.7122427225112915,
|
|
"learning_rate": 5.069997549681718e-06,
|
|
"loss": 0.1389164924621582,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1390,
|
|
"token_acc": 0.949877300613497,
|
|
"train_speed(iter/s)": 0.113318
|
|
},
|
|
{
|
|
"epoch": 1.490515629174459,
|
|
"grad_norm": 0.7316491007804871,
|
|
"learning_rate": 5.041999407871168e-06,
|
|
"loss": 0.14822676181793212,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1395,
|
|
"token_acc": 0.9491525423728814,
|
|
"train_speed(iter/s)": 0.113431
|
|
},
|
|
{
|
|
"epoch": 1.495858936681806,
|
|
"grad_norm": 0.7621902227401733,
|
|
"learning_rate": 5.01399994896879e-06,
|
|
"loss": 0.14519236087799073,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1400,
|
|
"token_acc": 0.9512387720856776,
|
|
"train_speed(iter/s)": 0.113547
|
|
},
|
|
{
|
|
"epoch": 1.495858936681806,
|
|
"eval_loss": 0.1818138211965561,
|
|
"eval_runtime": 39.3086,
|
|
"eval_samples_per_second": 15.366,
|
|
"eval_steps_per_second": 3.841,
|
|
"eval_token_acc": 0.9370348924489051,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.501202244189153,
|
|
"grad_norm": 0.7017737627029419,
|
|
"learning_rate": 4.986000051031212e-06,
|
|
"loss": 0.13984346389770508,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1405,
|
|
"token_acc": 0.9408215177889058,
|
|
"train_speed(iter/s)": 0.113226
|
|
},
|
|
{
|
|
"epoch": 1.5065455516965,
|
|
"grad_norm": 0.7313562035560608,
|
|
"learning_rate": 4.958000592128834e-06,
|
|
"loss": 0.13321598768234252,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1410,
|
|
"token_acc": 0.9527582267752515,
|
|
"train_speed(iter/s)": 0.113333
|
|
},
|
|
{
|
|
"epoch": 1.511888859203847,
|
|
"grad_norm": 0.7831226587295532,
|
|
"learning_rate": 4.930002450318282e-06,
|
|
"loss": 0.1345110058784485,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1415,
|
|
"token_acc": 0.9496566716124357,
|
|
"train_speed(iter/s)": 0.113428
|
|
},
|
|
{
|
|
"epoch": 1.517232166711194,
|
|
"grad_norm": 0.701524019241333,
|
|
"learning_rate": 4.9020065036148885e-06,
|
|
"loss": 0.14322736263275146,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1420,
|
|
"token_acc": 0.9406170170956966,
|
|
"train_speed(iter/s)": 0.113546
|
|
},
|
|
{
|
|
"epoch": 1.517232166711194,
|
|
"eval_loss": 0.18205159902572632,
|
|
"eval_runtime": 39.1325,
|
|
"eval_samples_per_second": 15.435,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9375109910141757,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.5225754742185411,
|
|
"grad_norm": 0.6394919157028198,
|
|
"learning_rate": 4.874013629965138e-06,
|
|
"loss": 0.13802753686904906,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1425,
|
|
"token_acc": 0.9402941276057832,
|
|
"train_speed(iter/s)": 0.113221
|
|
},
|
|
{
|
|
"epoch": 1.5279187817258884,
|
|
"grad_norm": 0.7372197508811951,
|
|
"learning_rate": 4.846024707219149e-06,
|
|
"loss": 0.1441117525100708,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1430,
|
|
"token_acc": 0.9448832927481142,
|
|
"train_speed(iter/s)": 0.113341
|
|
},
|
|
{
|
|
"epoch": 1.5332620892332354,
|
|
"grad_norm": 0.6687447428703308,
|
|
"learning_rate": 4.818040613103139e-06,
|
|
"loss": 0.13662933111190795,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1435,
|
|
"token_acc": 0.949387670379852,
|
|
"train_speed(iter/s)": 0.113439
|
|
},
|
|
{
|
|
"epoch": 1.5386053967405824,
|
|
"grad_norm": 0.7553586363792419,
|
|
"learning_rate": 4.790062225191902e-06,
|
|
"loss": 0.15836725234985352,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1440,
|
|
"token_acc": 0.9384995877988458,
|
|
"train_speed(iter/s)": 0.113564
|
|
},
|
|
{
|
|
"epoch": 1.5386053967405824,
|
|
"eval_loss": 0.18115834891796112,
|
|
"eval_runtime": 39.2778,
|
|
"eval_samples_per_second": 15.378,
|
|
"eval_steps_per_second": 3.844,
|
|
"eval_token_acc": 0.9376825580647237,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.5439487042479296,
|
|
"grad_norm": 0.6660886406898499,
|
|
"learning_rate": 4.762090420881289e-06,
|
|
"loss": 0.1435617685317993,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1445,
|
|
"token_acc": 0.9411263893262424,
|
|
"train_speed(iter/s)": 0.113249
|
|
},
|
|
{
|
|
"epoch": 1.5492920117552766,
|
|
"grad_norm": 0.6773468255996704,
|
|
"learning_rate": 4.734126077360685e-06,
|
|
"loss": 0.13354458808898925,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1450,
|
|
"token_acc": 0.9558268311099924,
|
|
"train_speed(iter/s)": 0.113351
|
|
},
|
|
{
|
|
"epoch": 1.5546353192626237,
|
|
"grad_norm": 0.7248560786247253,
|
|
"learning_rate": 4.706170071585513e-06,
|
|
"loss": 0.1327458381652832,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1455,
|
|
"token_acc": 0.955091649694501,
|
|
"train_speed(iter/s)": 0.113455
|
|
},
|
|
{
|
|
"epoch": 1.5599786267699707,
|
|
"grad_norm": 0.7063668966293335,
|
|
"learning_rate": 4.678223280249718e-06,
|
|
"loss": 0.12768800258636476,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1460,
|
|
"token_acc": 0.949375866851595,
|
|
"train_speed(iter/s)": 0.113557
|
|
},
|
|
{
|
|
"epoch": 1.5599786267699707,
|
|
"eval_loss": 0.18129871785640717,
|
|
"eval_runtime": 39.1068,
|
|
"eval_samples_per_second": 15.445,
|
|
"eval_steps_per_second": 3.861,
|
|
"eval_token_acc": 0.9380900298097751,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.5653219342773177,
|
|
"grad_norm": 0.8191858530044556,
|
|
"learning_rate": 4.650286579758291e-06,
|
|
"loss": 0.13946748971939088,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1465,
|
|
"token_acc": 0.9430553548200608,
|
|
"train_speed(iter/s)": 0.113238
|
|
},
|
|
{
|
|
"epoch": 1.5706652417846647,
|
|
"grad_norm": 0.7221189141273499,
|
|
"learning_rate": 4.622360846199772e-06,
|
|
"loss": 0.13773694038391113,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1470,
|
|
"token_acc": 0.9456267929815197,
|
|
"train_speed(iter/s)": 0.113346
|
|
},
|
|
{
|
|
"epoch": 1.5760085492920117,
|
|
"grad_norm": 0.715904176235199,
|
|
"learning_rate": 4.594446955318781e-06,
|
|
"loss": 0.13796852827072142,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1475,
|
|
"token_acc": 0.9508182349503215,
|
|
"train_speed(iter/s)": 0.113453
|
|
},
|
|
{
|
|
"epoch": 1.5813518567993587,
|
|
"grad_norm": 0.7929291129112244,
|
|
"learning_rate": 4.566545782488554e-06,
|
|
"loss": 0.14087553024291993,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1480,
|
|
"token_acc": 0.9462326623398016,
|
|
"train_speed(iter/s)": 0.113554
|
|
},
|
|
{
|
|
"epoch": 1.5813518567993587,
|
|
"eval_loss": 0.18039724230766296,
|
|
"eval_runtime": 39.124,
|
|
"eval_samples_per_second": 15.438,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9378841493491175,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.5866951643067058,
|
|
"grad_norm": 0.7181767225265503,
|
|
"learning_rate": 4.53865820268349e-06,
|
|
"loss": 0.14211044311523438,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1485,
|
|
"token_acc": 0.9420699925539836,
|
|
"train_speed(iter/s)": 0.113263
|
|
},
|
|
{
|
|
"epoch": 1.5920384718140528,
|
|
"grad_norm": 0.6460716724395752,
|
|
"learning_rate": 4.510785090451719e-06,
|
|
"loss": 0.13882654905319214,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1490,
|
|
"token_acc": 0.9472210254200776,
|
|
"train_speed(iter/s)": 0.11337
|
|
},
|
|
{
|
|
"epoch": 1.5973817793213998,
|
|
"grad_norm": 0.7372190952301025,
|
|
"learning_rate": 4.482927319887669e-06,
|
|
"loss": 0.14314990043640136,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1495,
|
|
"token_acc": 0.9522527490349996,
|
|
"train_speed(iter/s)": 0.113469
|
|
},
|
|
{
|
|
"epoch": 1.602725086828747,
|
|
"grad_norm": 0.7231627702713013,
|
|
"learning_rate": 4.455085764604653e-06,
|
|
"loss": 0.14776058197021485,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1500,
|
|
"token_acc": 0.9497078741203028,
|
|
"train_speed(iter/s)": 0.113578
|
|
},
|
|
{
|
|
"epoch": 1.602725086828747,
|
|
"eval_loss": 0.1816372275352478,
|
|
"eval_runtime": 39.1733,
|
|
"eval_samples_per_second": 15.419,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.9376225095970319,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.608068394336094,
|
|
"grad_norm": 0.7648904323577881,
|
|
"learning_rate": 4.427261297707482e-06,
|
|
"loss": 0.14478824138641358,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1505,
|
|
"token_acc": 0.9387511308018835,
|
|
"train_speed(iter/s)": 0.113249
|
|
},
|
|
{
|
|
"epoch": 1.613411701843441,
|
|
"grad_norm": 0.6977179646492004,
|
|
"learning_rate": 4.399454791765076e-06,
|
|
"loss": 0.14400074481964112,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1510,
|
|
"token_acc": 0.9406537812945908,
|
|
"train_speed(iter/s)": 0.113346
|
|
},
|
|
{
|
|
"epoch": 1.618755009350788,
|
|
"grad_norm": 0.6725999116897583,
|
|
"learning_rate": 4.371667118783101e-06,
|
|
"loss": 0.14132678508758545,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1515,
|
|
"token_acc": 0.9514303482587064,
|
|
"train_speed(iter/s)": 0.113465
|
|
},
|
|
{
|
|
"epoch": 1.6240983168581353,
|
|
"grad_norm": 0.7282098531723022,
|
|
"learning_rate": 4.343899150176635e-06,
|
|
"loss": 0.1421644926071167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1520,
|
|
"token_acc": 0.9474935470724086,
|
|
"train_speed(iter/s)": 0.113561
|
|
},
|
|
{
|
|
"epoch": 1.6240983168581353,
|
|
"eval_loss": 0.18081353604793549,
|
|
"eval_runtime": 39.1617,
|
|
"eval_samples_per_second": 15.423,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9381457891012032,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.6294416243654823,
|
|
"grad_norm": 0.7503587603569031,
|
|
"learning_rate": 4.316151756742821e-06,
|
|
"loss": 0.13501241207122802,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1525,
|
|
"token_acc": 0.9416415493274551,
|
|
"train_speed(iter/s)": 0.113245
|
|
},
|
|
{
|
|
"epoch": 1.6347849318728294,
|
|
"grad_norm": 0.7951456308364868,
|
|
"learning_rate": 4.2884258086335755e-06,
|
|
"loss": 0.14272716045379638,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1530,
|
|
"token_acc": 0.9456025411951559,
|
|
"train_speed(iter/s)": 0.113354
|
|
},
|
|
{
|
|
"epoch": 1.6401282393801764,
|
|
"grad_norm": 0.728985607624054,
|
|
"learning_rate": 4.26072217532829e-06,
|
|
"loss": 0.1494640588760376,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1535,
|
|
"token_acc": 0.9476955108993974,
|
|
"train_speed(iter/s)": 0.113456
|
|
},
|
|
{
|
|
"epoch": 1.6454715468875234,
|
|
"grad_norm": 0.7554564476013184,
|
|
"learning_rate": 4.233041725606573e-06,
|
|
"loss": 0.14563045501708985,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1540,
|
|
"token_acc": 0.9460028304967885,
|
|
"train_speed(iter/s)": 0.113557
|
|
},
|
|
{
|
|
"epoch": 1.6454715468875234,
|
|
"eval_loss": 0.18041342496871948,
|
|
"eval_runtime": 39.1831,
|
|
"eval_samples_per_second": 15.415,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9377854982950524,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.6508148543948704,
|
|
"grad_norm": 0.7042314410209656,
|
|
"learning_rate": 4.205385327521002e-06,
|
|
"loss": 0.15157747268676758,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1545,
|
|
"token_acc": 0.9420924680785825,
|
|
"train_speed(iter/s)": 0.113252
|
|
},
|
|
{
|
|
"epoch": 1.6561581619022174,
|
|
"grad_norm": 0.7284954786300659,
|
|
"learning_rate": 4.177753848369892e-06,
|
|
"loss": 0.13592784404754638,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1550,
|
|
"token_acc": 0.9485677708433349,
|
|
"train_speed(iter/s)": 0.113347
|
|
},
|
|
{
|
|
"epoch": 1.6615014694095644,
|
|
"grad_norm": 0.7814568877220154,
|
|
"learning_rate": 4.1501481546701185e-06,
|
|
"loss": 0.13980913162231445,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1555,
|
|
"token_acc": 0.9546170365068003,
|
|
"train_speed(iter/s)": 0.113447
|
|
},
|
|
{
|
|
"epoch": 1.6668447769169115,
|
|
"grad_norm": 0.7283998727798462,
|
|
"learning_rate": 4.12256911212992e-06,
|
|
"loss": 0.13263875246047974,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1560,
|
|
"token_acc": 0.9517357901112563,
|
|
"train_speed(iter/s)": 0.113556
|
|
},
|
|
{
|
|
"epoch": 1.6668447769169115,
|
|
"eval_loss": 0.17983108758926392,
|
|
"eval_runtime": 39.1475,
|
|
"eval_samples_per_second": 15.429,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.938132921572412,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.6721880844242585,
|
|
"grad_norm": 0.7712817192077637,
|
|
"learning_rate": 4.095017585621767e-06,
|
|
"loss": 0.1369832158088684,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1565,
|
|
"token_acc": 0.9403439916887915,
|
|
"train_speed(iter/s)": 0.113246
|
|
},
|
|
{
|
|
"epoch": 1.6775313919316055,
|
|
"grad_norm": 0.7404097318649292,
|
|
"learning_rate": 4.067494439155236e-06,
|
|
"loss": 0.14037466049194336,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1570,
|
|
"token_acc": 0.9485796116828188,
|
|
"train_speed(iter/s)": 0.113344
|
|
},
|
|
{
|
|
"epoch": 1.6828746994389527,
|
|
"grad_norm": 0.7791383862495422,
|
|
"learning_rate": 4.0400005358499e-06,
|
|
"loss": 0.1371939778327942,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1575,
|
|
"token_acc": 0.9578335949764522,
|
|
"train_speed(iter/s)": 0.113426
|
|
},
|
|
{
|
|
"epoch": 1.6882180069462998,
|
|
"grad_norm": 0.7537409663200378,
|
|
"learning_rate": 4.012536737908288e-06,
|
|
"loss": 0.1379605770111084,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1580,
|
|
"token_acc": 0.9421140684410646,
|
|
"train_speed(iter/s)": 0.113529
|
|
},
|
|
{
|
|
"epoch": 1.6882180069462998,
|
|
"eval_loss": 0.1801919788122177,
|
|
"eval_runtime": 39.1291,
|
|
"eval_samples_per_second": 15.436,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9378026550001072,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.6935613144536468,
|
|
"grad_norm": 0.759289562702179,
|
|
"learning_rate": 3.985103906588821e-06,
|
|
"loss": 0.1377565622329712,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1585,
|
|
"token_acc": 0.9409732617857719,
|
|
"train_speed(iter/s)": 0.113222
|
|
},
|
|
{
|
|
"epoch": 1.698904621960994,
|
|
"grad_norm": 0.6593434810638428,
|
|
"learning_rate": 3.957702902178816e-06,
|
|
"loss": 0.13015660047531127,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1590,
|
|
"token_acc": 0.9526614173228346,
|
|
"train_speed(iter/s)": 0.11332
|
|
},
|
|
{
|
|
"epoch": 1.704247929468341,
|
|
"grad_norm": 0.7041330337524414,
|
|
"learning_rate": 3.930334583967514e-06,
|
|
"loss": 0.14423298835754395,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1595,
|
|
"token_acc": 0.9480859417602792,
|
|
"train_speed(iter/s)": 0.113416
|
|
},
|
|
{
|
|
"epoch": 1.709591236975688,
|
|
"grad_norm": 0.6841444969177246,
|
|
"learning_rate": 3.902999810219109e-06,
|
|
"loss": 0.13458824157714844,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1600,
|
|
"token_acc": 0.9575135610614279,
|
|
"train_speed(iter/s)": 0.113518
|
|
},
|
|
{
|
|
"epoch": 1.709591236975688,
|
|
"eval_loss": 0.17973561584949493,
|
|
"eval_runtime": 39.1386,
|
|
"eval_samples_per_second": 15.432,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9382272834502133,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.714934544483035,
|
|
"grad_norm": 0.8064398765563965,
|
|
"learning_rate": 3.875699438145862e-06,
|
|
"loss": 0.13843204975128173,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1605,
|
|
"token_acc": 0.941116183732414,
|
|
"train_speed(iter/s)": 0.113215
|
|
},
|
|
{
|
|
"epoch": 1.720277851990382,
|
|
"grad_norm": 0.7203328609466553,
|
|
"learning_rate": 3.8484343238811976e-06,
|
|
"loss": 0.14074230194091797,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1610,
|
|
"token_acc": 0.9515015593790518,
|
|
"train_speed(iter/s)": 0.113297
|
|
},
|
|
{
|
|
"epoch": 1.725621159497729,
|
|
"grad_norm": 0.7621043920516968,
|
|
"learning_rate": 3.821205322452863e-06,
|
|
"loss": 0.13528130054473878,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1615,
|
|
"token_acc": 0.9499534380726367,
|
|
"train_speed(iter/s)": 0.11339
|
|
},
|
|
{
|
|
"epoch": 1.7309644670050761,
|
|
"grad_norm": 0.6963400840759277,
|
|
"learning_rate": 3.794013287756125e-06,
|
|
"loss": 0.13751909732818604,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1620,
|
|
"token_acc": 0.9522881588161861,
|
|
"train_speed(iter/s)": 0.113481
|
|
},
|
|
{
|
|
"epoch": 1.7309644670050761,
|
|
"eval_loss": 0.17938879132270813,
|
|
"eval_runtime": 39.133,
|
|
"eval_samples_per_second": 15.435,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9384074288532888,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.7363077745124231,
|
|
"grad_norm": 0.7448264956474304,
|
|
"learning_rate": 3.766859072526969e-06,
|
|
"loss": 0.129533851146698,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1625,
|
|
"token_acc": 0.9424371772034192,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 1.7416510820197701,
|
|
"grad_norm": 0.7266005873680115,
|
|
"learning_rate": 3.7397435283153795e-06,
|
|
"loss": 0.1342164993286133,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1630,
|
|
"token_acc": 0.9512910597946584,
|
|
"train_speed(iter/s)": 0.113278
|
|
},
|
|
{
|
|
"epoch": 1.7469943895271172,
|
|
"grad_norm": 0.7142292857170105,
|
|
"learning_rate": 3.712667505458622e-06,
|
|
"loss": 0.14621845483779908,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1635,
|
|
"token_acc": 0.9502799032760277,
|
|
"train_speed(iter/s)": 0.113376
|
|
},
|
|
{
|
|
"epoch": 1.7523376970344642,
|
|
"grad_norm": 0.6820980906486511,
|
|
"learning_rate": 3.685631853054583e-06,
|
|
"loss": 0.14358122348785402,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1640,
|
|
"token_acc": 0.9519375470278405,
|
|
"train_speed(iter/s)": 0.113482
|
|
},
|
|
{
|
|
"epoch": 1.7523376970344642,
|
|
"eval_loss": 0.17889852821826935,
|
|
"eval_runtime": 39.0969,
|
|
"eval_samples_per_second": 15.449,
|
|
"eval_steps_per_second": 3.862,
|
|
"eval_token_acc": 0.9381586566299942,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 1.7576810045418114,
|
|
"grad_norm": 0.7781234979629517,
|
|
"learning_rate": 3.658637418935146e-06,
|
|
"loss": 0.13783912658691405,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1645,
|
|
"token_acc": 0.9402854612580077,
|
|
"train_speed(iter/s)": 0.113198
|
|
},
|
|
{
|
|
"epoch": 1.7630243120491584,
|
|
"grad_norm": 0.7067362666130066,
|
|
"learning_rate": 3.6316850496395863e-06,
|
|
"loss": 0.138936448097229,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1650,
|
|
"token_acc": 0.9473799468168309,
|
|
"train_speed(iter/s)": 0.113308
|
|
},
|
|
{
|
|
"epoch": 1.7683676195565055,
|
|
"grad_norm": 0.7398570775985718,
|
|
"learning_rate": 3.6047755903880478e-06,
|
|
"loss": 0.14049469232559203,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1655,
|
|
"token_acc": 0.9463533798334994,
|
|
"train_speed(iter/s)": 0.113405
|
|
},
|
|
{
|
|
"epoch": 1.7737109270638525,
|
|
"grad_norm": 0.8484781980514526,
|
|
"learning_rate": 3.577909885055019e-06,
|
|
"loss": 0.1409994840621948,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1660,
|
|
"token_acc": 0.9477940181350656,
|
|
"train_speed(iter/s)": 0.113503
|
|
},
|
|
{
|
|
"epoch": 1.7737109270638525,
|
|
"eval_loss": 0.1786307543516159,
|
|
"eval_runtime": 39.1475,
|
|
"eval_samples_per_second": 15.429,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9385661283750456,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 1.7790542345711997,
|
|
"grad_norm": 0.7654802799224854,
|
|
"learning_rate": 3.5510887761428764e-06,
|
|
"loss": 0.13482067584991456,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1665,
|
|
"token_acc": 0.9418829437383344,
|
|
"train_speed(iter/s)": 0.113194
|
|
},
|
|
{
|
|
"epoch": 1.7843975420785467,
|
|
"grad_norm": 0.7164176106452942,
|
|
"learning_rate": 3.524313104755468e-06,
|
|
"loss": 0.13556787967681885,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1670,
|
|
"token_acc": 0.9485995797016571,
|
|
"train_speed(iter/s)": 0.113282
|
|
},
|
|
{
|
|
"epoch": 1.7897408495858937,
|
|
"grad_norm": 0.7502966523170471,
|
|
"learning_rate": 3.4975837105717203e-06,
|
|
"loss": 0.13461077213287354,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1675,
|
|
"token_acc": 0.9522437216961712,
|
|
"train_speed(iter/s)": 0.113382
|
|
},
|
|
{
|
|
"epoch": 1.7950841570932408,
|
|
"grad_norm": 0.7419613003730774,
|
|
"learning_rate": 3.4709014318193298e-06,
|
|
"loss": 0.1423276662826538,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1680,
|
|
"token_acc": 0.9488505932943192,
|
|
"train_speed(iter/s)": 0.113481
|
|
},
|
|
{
|
|
"epoch": 1.7950841570932408,
|
|
"eval_loss": 0.1787741631269455,
|
|
"eval_runtime": 39.2394,
|
|
"eval_samples_per_second": 15.393,
|
|
"eval_steps_per_second": 3.848,
|
|
"eval_token_acc": 0.9386218876664737,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 1.8004274646005878,
|
|
"grad_norm": 0.7690842151641846,
|
|
"learning_rate": 3.4442671052484545e-06,
|
|
"loss": 0.1434476137161255,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1685,
|
|
"token_acc": 0.9390579069378391,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 1.8057707721079348,
|
|
"grad_norm": 0.7701799273490906,
|
|
"learning_rate": 3.4176815661054884e-06,
|
|
"loss": 0.13522175550460816,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1690,
|
|
"token_acc": 0.9531173213642171,
|
|
"train_speed(iter/s)": 0.113311
|
|
},
|
|
{
|
|
"epoch": 1.8111140796152818,
|
|
"grad_norm": 0.7921913862228394,
|
|
"learning_rate": 3.3911456481068613e-06,
|
|
"loss": 0.13670728206634522,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1695,
|
|
"token_acc": 0.9481580510992276,
|
|
"train_speed(iter/s)": 0.113396
|
|
},
|
|
{
|
|
"epoch": 1.8164573871226288,
|
|
"grad_norm": 0.8159610629081726,
|
|
"learning_rate": 3.3646601834128924e-06,
|
|
"loss": 0.141351580619812,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1700,
|
|
"token_acc": 0.9491429380932144,
|
|
"train_speed(iter/s)": 0.113491
|
|
},
|
|
{
|
|
"epoch": 1.8164573871226288,
|
|
"eval_loss": 0.17854392528533936,
|
|
"eval_runtime": 39.1874,
|
|
"eval_samples_per_second": 15.413,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.9385875742563641,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.8218006946299758,
|
|
"grad_norm": 0.7273773550987244,
|
|
"learning_rate": 3.3382260026017027e-06,
|
|
"loss": 0.1371569514274597,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1705,
|
|
"token_acc": 0.9418744245136776,
|
|
"train_speed(iter/s)": 0.113203
|
|
},
|
|
{
|
|
"epoch": 1.8271440021373229,
|
|
"grad_norm": 0.7564848065376282,
|
|
"learning_rate": 3.311843934643157e-06,
|
|
"loss": 0.12437918186187744,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1710,
|
|
"token_acc": 0.9571033210332104,
|
|
"train_speed(iter/s)": 0.113276
|
|
},
|
|
{
|
|
"epoch": 1.83248730964467,
|
|
"grad_norm": 0.7761486172676086,
|
|
"learning_rate": 3.2855148068728753e-06,
|
|
"loss": 0.14540971517562867,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1715,
|
|
"token_acc": 0.9477377595488337,
|
|
"train_speed(iter/s)": 0.113367
|
|
},
|
|
{
|
|
"epoch": 1.8378306171520171,
|
|
"grad_norm": 0.7440487742424011,
|
|
"learning_rate": 3.2592394449662867e-06,
|
|
"loss": 0.13129628896713258,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1720,
|
|
"token_acc": 0.950944535784988,
|
|
"train_speed(iter/s)": 0.113447
|
|
},
|
|
{
|
|
"epoch": 1.8378306171520171,
|
|
"eval_loss": 0.17832355201244354,
|
|
"eval_runtime": 39.1412,
|
|
"eval_samples_per_second": 15.431,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9387977438932853,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 1.8431739246593641,
|
|
"grad_norm": 0.7634470462799072,
|
|
"learning_rate": 3.233018672912731e-06,
|
|
"loss": 0.14330395460128784,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1725,
|
|
"token_acc": 0.9432163730078285,
|
|
"train_speed(iter/s)": 0.113192
|
|
},
|
|
{
|
|
"epoch": 1.8485172321667112,
|
|
"grad_norm": 0.7993137836456299,
|
|
"learning_rate": 3.2068533129896273e-06,
|
|
"loss": 0.1466256022453308,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1730,
|
|
"token_acc": 0.950088022429419,
|
|
"train_speed(iter/s)": 0.113299
|
|
},
|
|
{
|
|
"epoch": 1.8538605396740584,
|
|
"grad_norm": 0.7558074593544006,
|
|
"learning_rate": 3.1807441857366798e-06,
|
|
"loss": 0.13543074131011962,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1735,
|
|
"token_acc": 0.9463475916166196,
|
|
"train_speed(iter/s)": 0.1134
|
|
},
|
|
{
|
|
"epoch": 1.8592038471814054,
|
|
"grad_norm": 0.7222068309783936,
|
|
"learning_rate": 3.1546921099301507e-06,
|
|
"loss": 0.13885715007781982,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1740,
|
|
"token_acc": 0.9468914264999093,
|
|
"train_speed(iter/s)": 0.113485
|
|
},
|
|
{
|
|
"epoch": 1.8592038471814054,
|
|
"eval_loss": 0.1776435226202011,
|
|
"eval_runtime": 39.1268,
|
|
"eval_samples_per_second": 15.437,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9389221300049325,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 1.8645471546887524,
|
|
"grad_norm": 0.7544873356819153,
|
|
"learning_rate": 3.1286979025571817e-06,
|
|
"loss": 0.1199462890625,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1745,
|
|
"token_acc": 0.942448560630803,
|
|
"train_speed(iter/s)": 0.113208
|
|
},
|
|
{
|
|
"epoch": 1.8698904621960994,
|
|
"grad_norm": 0.6838610768318176,
|
|
"learning_rate": 3.1027623787901706e-06,
|
|
"loss": 0.13257505893707275,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1750,
|
|
"token_acc": 0.953539454854062,
|
|
"train_speed(iter/s)": 0.113284
|
|
},
|
|
{
|
|
"epoch": 1.8752337697034465,
|
|
"grad_norm": 0.7307048439979553,
|
|
"learning_rate": 3.076886351961217e-06,
|
|
"loss": 0.14348651170730592,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1755,
|
|
"token_acc": 0.9522128203567298,
|
|
"train_speed(iter/s)": 0.113384
|
|
},
|
|
{
|
|
"epoch": 1.8805770772107935,
|
|
"grad_norm": 0.7680135369300842,
|
|
"learning_rate": 3.0510706335366034e-06,
|
|
"loss": 0.14112248420715331,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1760,
|
|
"token_acc": 0.950036469730124,
|
|
"train_speed(iter/s)": 0.11347
|
|
},
|
|
{
|
|
"epoch": 1.8805770772107935,
|
|
"eval_loss": 0.17695675790309906,
|
|
"eval_runtime": 39.1442,
|
|
"eval_samples_per_second": 15.43,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9390207810589977,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 1.8859203847181405,
|
|
"grad_norm": 0.6436514258384705,
|
|
"learning_rate": 3.02531603309136e-06,
|
|
"loss": 0.13258137702941894,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1765,
|
|
"token_acc": 0.9418506940943162,
|
|
"train_speed(iter/s)": 0.113198
|
|
},
|
|
{
|
|
"epoch": 1.8912636922254875,
|
|
"grad_norm": 0.783997654914856,
|
|
"learning_rate": 2.9996233582838686e-06,
|
|
"loss": 0.137099552154541,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1770,
|
|
"token_acc": 0.9559140509228075,
|
|
"train_speed(iter/s)": 0.11329
|
|
},
|
|
{
|
|
"epoch": 1.8966069997328345,
|
|
"grad_norm": 0.6852074861526489,
|
|
"learning_rate": 2.973993414830534e-06,
|
|
"loss": 0.1261454463005066,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1775,
|
|
"token_acc": 0.9524192587295912,
|
|
"train_speed(iter/s)": 0.113369
|
|
},
|
|
{
|
|
"epoch": 1.9019503072401815,
|
|
"grad_norm": 0.6981728076934814,
|
|
"learning_rate": 2.948427006480528e-06,
|
|
"loss": 0.1357527494430542,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1780,
|
|
"token_acc": 0.9534122629704496,
|
|
"train_speed(iter/s)": 0.11347
|
|
},
|
|
{
|
|
"epoch": 1.9019503072401815,
|
|
"eval_loss": 0.17668980360031128,
|
|
"eval_runtime": 39.1514,
|
|
"eval_samples_per_second": 15.427,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9394754337429496,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 1.9072936147475286,
|
|
"grad_norm": 0.6986772418022156,
|
|
"learning_rate": 2.9229249349905686e-06,
|
|
"loss": 0.1431878089904785,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1785,
|
|
"token_acc": 0.9415058402368222,
|
|
"train_speed(iter/s)": 0.113218
|
|
},
|
|
{
|
|
"epoch": 1.9126369222548758,
|
|
"grad_norm": 0.6814377903938293,
|
|
"learning_rate": 2.897488000099788e-06,
|
|
"loss": 0.13923795223236085,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1790,
|
|
"token_acc": 0.9478648950197789,
|
|
"train_speed(iter/s)": 0.113294
|
|
},
|
|
{
|
|
"epoch": 1.9179802297622228,
|
|
"grad_norm": 0.7366232872009277,
|
|
"learning_rate": 2.8721169995046503e-06,
|
|
"loss": 0.13349125385284424,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1795,
|
|
"token_acc": 0.9429411168541604,
|
|
"train_speed(iter/s)": 0.113372
|
|
},
|
|
{
|
|
"epoch": 1.9233235372695698,
|
|
"grad_norm": 0.7991761565208435,
|
|
"learning_rate": 2.846812728833931e-06,
|
|
"loss": 0.13615771532058715,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1800,
|
|
"token_acc": 0.9488062932585297,
|
|
"train_speed(iter/s)": 0.113453
|
|
},
|
|
{
|
|
"epoch": 1.9233235372695698,
|
|
"eval_loss": 0.17635242640972137,
|
|
"eval_runtime": 39.1984,
|
|
"eval_samples_per_second": 15.409,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9390679619978983,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 1.928666844776917,
|
|
"grad_norm": 0.7395577430725098,
|
|
"learning_rate": 2.8215759816237748e-06,
|
|
"loss": 0.1406429648399353,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1805,
|
|
"token_acc": 0.9421045579401058,
|
|
"train_speed(iter/s)": 0.113193
|
|
},
|
|
{
|
|
"epoch": 1.934010152284264,
|
|
"grad_norm": 0.7564118504524231,
|
|
"learning_rate": 2.796407549292809e-06,
|
|
"loss": 0.13832550048828124,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1810,
|
|
"token_acc": 0.956355867541584,
|
|
"train_speed(iter/s)": 0.113289
|
|
},
|
|
{
|
|
"epoch": 1.9393534597916111,
|
|
"grad_norm": 0.6458448767662048,
|
|
"learning_rate": 2.771308221117309e-06,
|
|
"loss": 0.13285930156707765,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1815,
|
|
"token_acc": 0.950381946877847,
|
|
"train_speed(iter/s)": 0.113379
|
|
},
|
|
{
|
|
"epoch": 1.9446967672989581,
|
|
"grad_norm": 0.7599870562553406,
|
|
"learning_rate": 2.7462787842064753e-06,
|
|
"loss": 0.131211256980896,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1820,
|
|
"token_acc": 0.9533532132424537,
|
|
"train_speed(iter/s)": 0.113463
|
|
},
|
|
{
|
|
"epoch": 1.9446967672989581,
|
|
"eval_loss": 0.17678451538085938,
|
|
"eval_runtime": 39.1954,
|
|
"eval_samples_per_second": 15.41,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9391751914044908,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 1.9500400748063051,
|
|
"grad_norm": 0.7510969042778015,
|
|
"learning_rate": 2.7213200234777215e-06,
|
|
"loss": 0.151234769821167,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1825,
|
|
"token_acc": 0.9407468474954135,
|
|
"train_speed(iter/s)": 0.113216
|
|
},
|
|
{
|
|
"epoch": 1.9553833823136522,
|
|
"grad_norm": 0.7278915047645569,
|
|
"learning_rate": 2.696432721632082e-06,
|
|
"loss": 0.13203661441802977,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1830,
|
|
"token_acc": 0.9518988171303122,
|
|
"train_speed(iter/s)": 0.113284
|
|
},
|
|
{
|
|
"epoch": 1.9607266898209992,
|
|
"grad_norm": 0.7572088241577148,
|
|
"learning_rate": 2.671617659129655e-06,
|
|
"loss": 0.14195291996002196,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1835,
|
|
"token_acc": 0.9429287939813056,
|
|
"train_speed(iter/s)": 0.113373
|
|
},
|
|
{
|
|
"epoch": 1.9660699973283462,
|
|
"grad_norm": 0.7470288276672363,
|
|
"learning_rate": 2.646875614165121e-06,
|
|
"loss": 0.1265857696533203,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1840,
|
|
"token_acc": 0.9504980895196506,
|
|
"train_speed(iter/s)": 0.113448
|
|
},
|
|
{
|
|
"epoch": 1.9660699973283462,
|
|
"eval_loss": 0.17609840631484985,
|
|
"eval_runtime": 39.2428,
|
|
"eval_samples_per_second": 15.391,
|
|
"eval_steps_per_second": 3.848,
|
|
"eval_token_acc": 0.9394668553904223,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 1.9714133048356932,
|
|
"grad_norm": 0.6941691637039185,
|
|
"learning_rate": 2.6222073626433587e-06,
|
|
"loss": 0.13350989818572997,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1845,
|
|
"token_acc": 0.941098463918665,
|
|
"train_speed(iter/s)": 0.113192
|
|
},
|
|
{
|
|
"epoch": 1.9767566123430402,
|
|
"grad_norm": 0.7226253747940063,
|
|
"learning_rate": 2.597613678155092e-06,
|
|
"loss": 0.13629913330078125,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1850,
|
|
"token_acc": 0.9471854356964505,
|
|
"train_speed(iter/s)": 0.113272
|
|
},
|
|
{
|
|
"epoch": 1.9820999198503872,
|
|
"grad_norm": 0.7369644641876221,
|
|
"learning_rate": 2.573095331952646e-06,
|
|
"loss": 0.1342089891433716,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1855,
|
|
"token_acc": 0.9550353716804326,
|
|
"train_speed(iter/s)": 0.113379
|
|
},
|
|
{
|
|
"epoch": 1.9874432273577345,
|
|
"grad_norm": 0.71446293592453,
|
|
"learning_rate": 2.5486530929257574e-06,
|
|
"loss": 0.13259618282318114,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1860,
|
|
"token_acc": 0.9522027151471942,
|
|
"train_speed(iter/s)": 0.113466
|
|
},
|
|
{
|
|
"epoch": 1.9874432273577345,
|
|
"eval_loss": 0.1756637990474701,
|
|
"eval_runtime": 39.2304,
|
|
"eval_samples_per_second": 15.396,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9394625662141586,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 1.9927865348650815,
|
|
"grad_norm": 0.7780827879905701,
|
|
"learning_rate": 2.5242877275774446e-06,
|
|
"loss": 0.1336849570274353,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1865,
|
|
"token_acc": 0.9416884046261456,
|
|
"train_speed(iter/s)": 0.113224
|
|
},
|
|
{
|
|
"epoch": 1.9981298423724285,
|
|
"grad_norm": 0.6946832537651062,
|
|
"learning_rate": 2.5000000000000015e-06,
|
|
"loss": 0.14616479873657226,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1870,
|
|
"token_acc": 0.9527469722324645,
|
|
"train_speed(iter/s)": 0.113319
|
|
},
|
|
{
|
|
"epoch": 2.003205984504408,
|
|
"grad_norm": 0.6446176171302795,
|
|
"learning_rate": 2.475790671851007e-06,
|
|
"loss": 0.11167683601379394,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1875,
|
|
"token_acc": 0.9654894371875504,
|
|
"train_speed(iter/s)": 0.113423
|
|
},
|
|
{
|
|
"epoch": 2.008549292011755,
|
|
"grad_norm": 0.6247937083244324,
|
|
"learning_rate": 2.4516605023294626e-06,
|
|
"loss": 0.10328346490859985,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1880,
|
|
"token_acc": 0.966084815624222,
|
|
"train_speed(iter/s)": 0.11351
|
|
},
|
|
{
|
|
"epoch": 2.008549292011755,
|
|
"eval_loss": 0.17980773746967316,
|
|
"eval_runtime": 39.1557,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9395440605631689,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.013892599519102,
|
|
"grad_norm": 0.6584553718566895,
|
|
"learning_rate": 2.4276102481519655e-06,
|
|
"loss": 0.10475772619247437,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1885,
|
|
"token_acc": 0.94671126227415,
|
|
"train_speed(iter/s)": 0.113271
|
|
},
|
|
{
|
|
"epoch": 2.019235907026449,
|
|
"grad_norm": 0.6800820827484131,
|
|
"learning_rate": 2.403640663528986e-06,
|
|
"loss": 0.09606801271438599,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1890,
|
|
"token_acc": 0.9638240304639744,
|
|
"train_speed(iter/s)": 0.113359
|
|
},
|
|
{
|
|
"epoch": 2.0245792145337966,
|
|
"grad_norm": 0.6699435710906982,
|
|
"learning_rate": 2.379752500141222e-06,
|
|
"loss": 0.09734945297241211,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1895,
|
|
"token_acc": 0.9634120335110434,
|
|
"train_speed(iter/s)": 0.113444
|
|
},
|
|
{
|
|
"epoch": 2.0299225220411437,
|
|
"grad_norm": 0.7269750833511353,
|
|
"learning_rate": 2.355946507116012e-06,
|
|
"loss": 0.0947374939918518,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1900,
|
|
"token_acc": 0.9667849182180642,
|
|
"train_speed(iter/s)": 0.113522
|
|
},
|
|
{
|
|
"epoch": 2.0299225220411437,
|
|
"eval_loss": 0.19244976341724396,
|
|
"eval_runtime": 39.1428,
|
|
"eval_samples_per_second": 15.431,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9386519119003195,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.0352658295484907,
|
|
"grad_norm": 0.6670167446136475,
|
|
"learning_rate": 2.332223431003859e-06,
|
|
"loss": 0.09205610752105713,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1905,
|
|
"token_acc": 0.9472729334391654,
|
|
"train_speed(iter/s)": 0.11327
|
|
},
|
|
{
|
|
"epoch": 2.0406091370558377,
|
|
"grad_norm": 0.8095514178276062,
|
|
"learning_rate": 2.3085840157550036e-06,
|
|
"loss": 0.09566161036491394,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1910,
|
|
"token_acc": 0.9658257869771453,
|
|
"train_speed(iter/s)": 0.113366
|
|
},
|
|
{
|
|
"epoch": 2.0459524445631847,
|
|
"grad_norm": 0.7360612154006958,
|
|
"learning_rate": 2.2850290026961032e-06,
|
|
"loss": 0.10009205341339111,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1915,
|
|
"token_acc": 0.9632088520055325,
|
|
"train_speed(iter/s)": 0.113459
|
|
},
|
|
{
|
|
"epoch": 2.0512957520705317,
|
|
"grad_norm": 0.8155742883682251,
|
|
"learning_rate": 2.2615591305069846e-06,
|
|
"loss": 0.09215841293334961,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1920,
|
|
"token_acc": 0.968572231196011,
|
|
"train_speed(iter/s)": 0.113542
|
|
},
|
|
{
|
|
"epoch": 2.0512957520705317,
|
|
"eval_loss": 0.19029787182807922,
|
|
"eval_runtime": 39.1166,
|
|
"eval_samples_per_second": 15.441,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.9387848763644941,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.0566390595778787,
|
|
"grad_norm": 0.7107601761817932,
|
|
"learning_rate": 2.238175135197471e-06,
|
|
"loss": 0.09783934354782105,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1925,
|
|
"token_acc": 0.9484311762913965,
|
|
"train_speed(iter/s)": 0.113297
|
|
},
|
|
{
|
|
"epoch": 2.0619823670852258,
|
|
"grad_norm": 0.7164185643196106,
|
|
"learning_rate": 2.2148777500843125e-06,
|
|
"loss": 0.10058900117874145,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1930,
|
|
"token_acc": 0.9623356362825941,
|
|
"train_speed(iter/s)": 0.113372
|
|
},
|
|
{
|
|
"epoch": 2.0673256745925728,
|
|
"grad_norm": 0.7500390410423279,
|
|
"learning_rate": 2.1916677057681786e-06,
|
|
"loss": 0.10025620460510254,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1935,
|
|
"token_acc": 0.9640934730056406,
|
|
"train_speed(iter/s)": 0.113457
|
|
},
|
|
{
|
|
"epoch": 2.07266898209992,
|
|
"grad_norm": 0.7709511518478394,
|
|
"learning_rate": 2.1685457301107506e-06,
|
|
"loss": 0.10047693252563476,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1940,
|
|
"token_acc": 0.9648505046059044,
|
|
"train_speed(iter/s)": 0.113539
|
|
},
|
|
{
|
|
"epoch": 2.07266898209992,
|
|
"eval_loss": 0.19032922387123108,
|
|
"eval_runtime": 39.188,
|
|
"eval_samples_per_second": 15.413,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.9384202963820798,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.078012289607267,
|
|
"grad_norm": 0.6654588580131531,
|
|
"learning_rate": 2.145512548211902e-06,
|
|
"loss": 0.09436342120170593,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1945,
|
|
"token_acc": 0.9468971724468446,
|
|
"train_speed(iter/s)": 0.113306
|
|
},
|
|
{
|
|
"epoch": 2.083355597114614,
|
|
"grad_norm": 0.7399053573608398,
|
|
"learning_rate": 2.1225688823869494e-06,
|
|
"loss": 0.09469984173774719,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1950,
|
|
"token_acc": 0.9699618029029794,
|
|
"train_speed(iter/s)": 0.113384
|
|
},
|
|
{
|
|
"epoch": 2.088698904621961,
|
|
"grad_norm": 0.7095410227775574,
|
|
"learning_rate": 2.09971545214401e-06,
|
|
"loss": 0.09509609937667847,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1955,
|
|
"token_acc": 0.9642617302694969,
|
|
"train_speed(iter/s)": 0.113448
|
|
},
|
|
{
|
|
"epoch": 2.094042212129308,
|
|
"grad_norm": 0.684771716594696,
|
|
"learning_rate": 2.0769529741614297e-06,
|
|
"loss": 0.09686210751533508,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1960,
|
|
"token_acc": 0.9665719778485553,
|
|
"train_speed(iter/s)": 0.113535
|
|
},
|
|
{
|
|
"epoch": 2.094042212129308,
|
|
"eval_loss": 0.19170063734054565,
|
|
"eval_runtime": 39.1596,
|
|
"eval_samples_per_second": 15.424,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9384932123785626,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.0993855196366553,
|
|
"grad_norm": 0.6467424631118774,
|
|
"learning_rate": 2.054282162265313e-06,
|
|
"loss": 0.09031983613967895,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1965,
|
|
"token_acc": 0.9467923722623636,
|
|
"train_speed(iter/s)": 0.113303
|
|
},
|
|
{
|
|
"epoch": 2.1047288271440023,
|
|
"grad_norm": 0.6366815567016602,
|
|
"learning_rate": 2.0317037274071412e-06,
|
|
"loss": 0.08445571660995484,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1970,
|
|
"token_acc": 0.9674427290836654,
|
|
"train_speed(iter/s)": 0.113383
|
|
},
|
|
{
|
|
"epoch": 2.1100721346513494,
|
|
"grad_norm": 0.6350346207618713,
|
|
"learning_rate": 2.009218377641466e-06,
|
|
"loss": 0.0988619565963745,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1975,
|
|
"token_acc": 0.9623879433545537,
|
|
"train_speed(iter/s)": 0.113457
|
|
},
|
|
{
|
|
"epoch": 2.1154154421586964,
|
|
"grad_norm": 0.6696274280548096,
|
|
"learning_rate": 1.9868268181037186e-06,
|
|
"loss": 0.09375531673431396,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1980,
|
|
"token_acc": 0.9656578045525053,
|
|
"train_speed(iter/s)": 0.113523
|
|
},
|
|
{
|
|
"epoch": 2.1154154421586964,
|
|
"eval_loss": 0.19252096116542816,
|
|
"eval_runtime": 39.2604,
|
|
"eval_samples_per_second": 15.384,
|
|
"eval_steps_per_second": 3.846,
|
|
"eval_token_acc": 0.9383945613244976,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.1207587496660434,
|
|
"grad_norm": 0.7472742199897766,
|
|
"learning_rate": 1.964529750988086e-06,
|
|
"loss": 0.09353007674217224,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1985,
|
|
"token_acc": 0.9438841714662963,
|
|
"train_speed(iter/s)": 0.113283
|
|
},
|
|
{
|
|
"epoch": 2.1261020571733904,
|
|
"grad_norm": 0.7385168671607971,
|
|
"learning_rate": 1.9423278755254933e-06,
|
|
"loss": 0.09612951874732971,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1990,
|
|
"token_acc": 0.962575335363878,
|
|
"train_speed(iter/s)": 0.113363
|
|
},
|
|
{
|
|
"epoch": 2.1314453646807374,
|
|
"grad_norm": 0.769904613494873,
|
|
"learning_rate": 1.9202218879616824e-06,
|
|
"loss": 0.09348126649856567,
|
|
"memory(GiB)": 31.36,
|
|
"step": 1995,
|
|
"token_acc": 0.9666374287325356,
|
|
"train_speed(iter/s)": 0.113436
|
|
},
|
|
{
|
|
"epoch": 2.1367886721880844,
|
|
"grad_norm": 0.7597969770431519,
|
|
"learning_rate": 1.8982124815353665e-06,
|
|
"loss": 0.09418823719024658,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2000,
|
|
"token_acc": 0.9634146341463414,
|
|
"train_speed(iter/s)": 0.113511
|
|
},
|
|
{
|
|
"epoch": 2.1367886721880844,
|
|
"eval_loss": 0.1915539652109146,
|
|
"eval_runtime": 39.2063,
|
|
"eval_samples_per_second": 15.406,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9383216453280148,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.1421319796954315,
|
|
"grad_norm": 0.7184767127037048,
|
|
"learning_rate": 1.8763003464565022e-06,
|
|
"loss": 0.0999064326286316,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2005,
|
|
"token_acc": 0.9464714514407684,
|
|
"train_speed(iter/s)": 0.113276
|
|
},
|
|
{
|
|
"epoch": 2.1474752872027785,
|
|
"grad_norm": 0.6783972382545471,
|
|
"learning_rate": 1.854486169884635e-06,
|
|
"loss": 0.09837267994880676,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2010,
|
|
"token_acc": 0.9645275422436406,
|
|
"train_speed(iter/s)": 0.113352
|
|
},
|
|
{
|
|
"epoch": 2.1528185947101255,
|
|
"grad_norm": 0.746529221534729,
|
|
"learning_rate": 1.8327706359073526e-06,
|
|
"loss": 0.09520338773727417,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2015,
|
|
"token_acc": 0.9642478360532348,
|
|
"train_speed(iter/s)": 0.113417
|
|
},
|
|
{
|
|
"epoch": 2.1581619022174725,
|
|
"grad_norm": 0.6819494366645813,
|
|
"learning_rate": 1.8111544255188402e-06,
|
|
"loss": 0.09425632357597351,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2020,
|
|
"token_acc": 0.9684003992977867,
|
|
"train_speed(iter/s)": 0.113509
|
|
},
|
|
{
|
|
"epoch": 2.1581619022174725,
|
|
"eval_loss": 0.1915895640850067,
|
|
"eval_runtime": 39.2397,
|
|
"eval_samples_per_second": 15.393,
|
|
"eval_steps_per_second": 3.848,
|
|
"eval_token_acc": 0.9385103690836175,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.1635052097248195,
|
|
"grad_norm": 0.7457813024520874,
|
|
"learning_rate": 1.7896382165985094e-06,
|
|
"loss": 0.09581427574157715,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2025,
|
|
"token_acc": 0.9465357277296783,
|
|
"train_speed(iter/s)": 0.113279
|
|
},
|
|
{
|
|
"epoch": 2.1688485172321665,
|
|
"grad_norm": 0.6739791631698608,
|
|
"learning_rate": 1.768222683889757e-06,
|
|
"loss": 0.09893481731414795,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2030,
|
|
"token_acc": 0.9669017905588714,
|
|
"train_speed(iter/s)": 0.113367
|
|
},
|
|
{
|
|
"epoch": 2.1741918247395136,
|
|
"grad_norm": 0.6932037472724915,
|
|
"learning_rate": 1.746908498978791e-06,
|
|
"loss": 0.0985231876373291,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2035,
|
|
"token_acc": 0.9686975154919789,
|
|
"train_speed(iter/s)": 0.113426
|
|
},
|
|
{
|
|
"epoch": 2.179535132246861,
|
|
"grad_norm": 0.7460088133811951,
|
|
"learning_rate": 1.7256963302735752e-06,
|
|
"loss": 0.09795750975608826,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2040,
|
|
"token_acc": 0.964236412083855,
|
|
"train_speed(iter/s)": 0.113512
|
|
},
|
|
{
|
|
"epoch": 2.179535132246861,
|
|
"eval_loss": 0.19136284291744232,
|
|
"eval_runtime": 39.234,
|
|
"eval_samples_per_second": 15.395,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9384631881447169,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.184878439754208,
|
|
"grad_norm": 0.7117043137550354,
|
|
"learning_rate": 1.7045868429828745e-06,
|
|
"loss": 0.09070048332214356,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2045,
|
|
"token_acc": 0.9479196430533882,
|
|
"train_speed(iter/s)": 0.113281
|
|
},
|
|
{
|
|
"epoch": 2.190221747261555,
|
|
"grad_norm": 0.652874767780304,
|
|
"learning_rate": 1.6835806990953802e-06,
|
|
"loss": 0.09067975282669068,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2050,
|
|
"token_acc": 0.9657613864524992,
|
|
"train_speed(iter/s)": 0.113343
|
|
},
|
|
{
|
|
"epoch": 2.195565054768902,
|
|
"grad_norm": 0.6861611604690552,
|
|
"learning_rate": 1.6626785573589667e-06,
|
|
"loss": 0.09590352177619935,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2055,
|
|
"token_acc": 0.9615629645359949,
|
|
"train_speed(iter/s)": 0.113419
|
|
},
|
|
{
|
|
"epoch": 2.200908362276249,
|
|
"grad_norm": 0.6657945513725281,
|
|
"learning_rate": 1.6418810732600177e-06,
|
|
"loss": 0.0890547752380371,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2060,
|
|
"token_acc": 0.968454143363673,
|
|
"train_speed(iter/s)": 0.113499
|
|
},
|
|
{
|
|
"epoch": 2.200908362276249,
|
|
"eval_loss": 0.19242902100086212,
|
|
"eval_runtime": 39.2335,
|
|
"eval_samples_per_second": 15.395,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9385103690836175,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.206251669783596,
|
|
"grad_norm": 0.7557697892189026,
|
|
"learning_rate": 1.6211888990028785e-06,
|
|
"loss": 0.09434689283370971,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2065,
|
|
"token_acc": 0.9467688912388791,
|
|
"train_speed(iter/s)": 0.113268
|
|
},
|
|
{
|
|
"epoch": 2.211594977290943,
|
|
"grad_norm": 0.7953412532806396,
|
|
"learning_rate": 1.6006026834894068e-06,
|
|
"loss": 0.10123894214630128,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2070,
|
|
"token_acc": 0.9602344454463481,
|
|
"train_speed(iter/s)": 0.113338
|
|
},
|
|
{
|
|
"epoch": 2.21693828479829,
|
|
"grad_norm": 0.720357358455658,
|
|
"learning_rate": 1.5801230722986104e-06,
|
|
"loss": 0.09082142114639283,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2075,
|
|
"token_acc": 0.9670389539634977,
|
|
"train_speed(iter/s)": 0.113403
|
|
},
|
|
{
|
|
"epoch": 2.222281592305637,
|
|
"grad_norm": 0.6819139122962952,
|
|
"learning_rate": 1.5597507076664187e-06,
|
|
"loss": 0.088398015499115,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2080,
|
|
"token_acc": 0.9697791553661371,
|
|
"train_speed(iter/s)": 0.11347
|
|
},
|
|
{
|
|
"epoch": 2.222281592305637,
|
|
"eval_loss": 0.1921994835138321,
|
|
"eval_runtime": 39.1975,
|
|
"eval_samples_per_second": 15.409,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9386047309614188,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.227624899812984,
|
|
"grad_norm": 0.7003424167633057,
|
|
"learning_rate": 1.5394862284655266e-06,
|
|
"loss": 0.09183210134506226,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2085,
|
|
"token_acc": 0.9471144387997177,
|
|
"train_speed(iter/s)": 0.113237
|
|
},
|
|
{
|
|
"epoch": 2.232968207320331,
|
|
"grad_norm": 0.735797107219696,
|
|
"learning_rate": 1.5193302701853674e-06,
|
|
"loss": 0.09793744683265686,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2090,
|
|
"token_acc": 0.9629603963826617,
|
|
"train_speed(iter/s)": 0.113318
|
|
},
|
|
{
|
|
"epoch": 2.238311514827678,
|
|
"grad_norm": 0.7035617828369141,
|
|
"learning_rate": 1.499283464912188e-06,
|
|
"loss": 0.09379619359970093,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2095,
|
|
"token_acc": 0.9692680087017802,
|
|
"train_speed(iter/s)": 0.113392
|
|
},
|
|
{
|
|
"epoch": 2.2436548223350252,
|
|
"grad_norm": 0.653683602809906,
|
|
"learning_rate": 1.4793464413092161e-06,
|
|
"loss": 0.08412163257598877,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2100,
|
|
"token_acc": 0.9694610598455018,
|
|
"train_speed(iter/s)": 0.113463
|
|
},
|
|
{
|
|
"epoch": 2.2436548223350252,
|
|
"eval_loss": 0.19128654897212982,
|
|
"eval_runtime": 39.2107,
|
|
"eval_samples_per_second": 15.404,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9386562010765832,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.2489981298423722,
|
|
"grad_norm": 0.720621645450592,
|
|
"learning_rate": 1.459519824596956e-06,
|
|
"loss": 0.09899259209632874,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2105,
|
|
"token_acc": 0.9451508803713978,
|
|
"train_speed(iter/s)": 0.113243
|
|
},
|
|
{
|
|
"epoch": 2.2543414373497193,
|
|
"grad_norm": 0.6509672403335571,
|
|
"learning_rate": 1.4398042365335745e-06,
|
|
"loss": 0.09483298659324646,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2110,
|
|
"token_acc": 0.9613455205736621,
|
|
"train_speed(iter/s)": 0.113303
|
|
},
|
|
{
|
|
"epoch": 2.2596847448570667,
|
|
"grad_norm": 0.7314789891242981,
|
|
"learning_rate": 1.4202002953954042e-06,
|
|
"loss": 0.0946409523487091,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2115,
|
|
"token_acc": 0.9664519265832017,
|
|
"train_speed(iter/s)": 0.11336
|
|
},
|
|
{
|
|
"epoch": 2.2650280523644137,
|
|
"grad_norm": 0.7073846459388733,
|
|
"learning_rate": 1.4007086159575595e-06,
|
|
"loss": 0.0916548490524292,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2120,
|
|
"token_acc": 0.967791956065256,
|
|
"train_speed(iter/s)": 0.113428
|
|
},
|
|
{
|
|
"epoch": 2.2650280523644137,
|
|
"eval_loss": 0.1914122849702835,
|
|
"eval_runtime": 39.2346,
|
|
"eval_samples_per_second": 15.395,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9387119603680113,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.2703713598717608,
|
|
"grad_norm": 0.744266152381897,
|
|
"learning_rate": 1.3813298094746491e-06,
|
|
"loss": 0.10223530530929566,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2125,
|
|
"token_acc": 0.9450423116125821,
|
|
"train_speed(iter/s)": 0.11321
|
|
},
|
|
{
|
|
"epoch": 2.2757146673791078,
|
|
"grad_norm": 0.7458162307739258,
|
|
"learning_rate": 1.362064483661617e-06,
|
|
"loss": 0.0993034303188324,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2130,
|
|
"token_acc": 0.9647883245497191,
|
|
"train_speed(iter/s)": 0.113282
|
|
},
|
|
{
|
|
"epoch": 2.281057974886455,
|
|
"grad_norm": 0.7311099767684937,
|
|
"learning_rate": 1.3429132426746743e-06,
|
|
"loss": 0.08791648149490357,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2135,
|
|
"token_acc": 0.9718640093786636,
|
|
"train_speed(iter/s)": 0.11335
|
|
},
|
|
{
|
|
"epoch": 2.286401282393802,
|
|
"grad_norm": 0.6718652248382568,
|
|
"learning_rate": 1.3238766870923592e-06,
|
|
"loss": 0.09885276556015014,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2140,
|
|
"token_acc": 0.9679231605654223,
|
|
"train_speed(iter/s)": 0.113429
|
|
},
|
|
{
|
|
"epoch": 2.286401282393802,
|
|
"eval_loss": 0.1916726678609848,
|
|
"eval_runtime": 39.2123,
|
|
"eval_samples_per_second": 15.403,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9388191897746038,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.291744589901149,
|
|
"grad_norm": 0.7811794877052307,
|
|
"learning_rate": 1.3049554138967052e-06,
|
|
"loss": 0.09513717889785767,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2145,
|
|
"token_acc": 0.9481464839361942,
|
|
"train_speed(iter/s)": 0.113213
|
|
},
|
|
{
|
|
"epoch": 2.297087897408496,
|
|
"grad_norm": 0.7876958847045898,
|
|
"learning_rate": 1.286150016454511e-06,
|
|
"loss": 0.09975624084472656,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2150,
|
|
"token_acc": 0.9664041395578892,
|
|
"train_speed(iter/s)": 0.113297
|
|
},
|
|
{
|
|
"epoch": 2.302431204915843,
|
|
"grad_norm": 0.6765829920768738,
|
|
"learning_rate": 1.267461084498744e-06,
|
|
"loss": 0.09555368423461914,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2155,
|
|
"token_acc": 0.9689418005736669,
|
|
"train_speed(iter/s)": 0.113362
|
|
},
|
|
{
|
|
"epoch": 2.30777451242319,
|
|
"grad_norm": 0.7572174668312073,
|
|
"learning_rate": 1.2488892041100364e-06,
|
|
"loss": 0.09787599444389343,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2160,
|
|
"token_acc": 0.9644865820343885,
|
|
"train_speed(iter/s)": 0.113441
|
|
},
|
|
{
|
|
"epoch": 2.30777451242319,
|
|
"eval_loss": 0.19186273217201233,
|
|
"eval_runtime": 39.232,
|
|
"eval_samples_per_second": 15.396,
|
|
"eval_steps_per_second": 3.849,
|
|
"eval_token_acc": 0.9388191897746038,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.313117819930537,
|
|
"grad_norm": 0.7487595677375793,
|
|
"learning_rate": 1.2304349576983094e-06,
|
|
"loss": 0.09345256090164185,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2165,
|
|
"token_acc": 0.9464117197681211,
|
|
"train_speed(iter/s)": 0.113226
|
|
},
|
|
{
|
|
"epoch": 2.318461127437884,
|
|
"grad_norm": 0.7937645316123962,
|
|
"learning_rate": 1.2120989239845149e-06,
|
|
"loss": 0.09632455110549927,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2170,
|
|
"token_acc": 0.9689201004033055,
|
|
"train_speed(iter/s)": 0.113299
|
|
},
|
|
{
|
|
"epoch": 2.323804434945231,
|
|
"grad_norm": 0.7212129831314087,
|
|
"learning_rate": 1.1938816779824753e-06,
|
|
"loss": 0.09377689361572265,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2175,
|
|
"token_acc": 0.9685556654016647,
|
|
"train_speed(iter/s)": 0.113374
|
|
},
|
|
{
|
|
"epoch": 2.3291477424525784,
|
|
"grad_norm": 0.6610186696052551,
|
|
"learning_rate": 1.1757837909808628e-06,
|
|
"loss": 0.09794212579727173,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2180,
|
|
"token_acc": 0.9649737302977233,
|
|
"train_speed(iter/s)": 0.113446
|
|
},
|
|
{
|
|
"epoch": 2.3291477424525784,
|
|
"eval_loss": 0.1913762092590332,
|
|
"eval_runtime": 39.2194,
|
|
"eval_samples_per_second": 15.401,
|
|
"eval_steps_per_second": 3.85,
|
|
"eval_token_acc": 0.9388835274185593,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.334491049959925,
|
|
"grad_norm": 0.6436466574668884,
|
|
"learning_rate": 1.157805830525275e-06,
|
|
"loss": 0.08866640329360961,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2185,
|
|
"token_acc": 0.9485147195950446,
|
|
"train_speed(iter/s)": 0.113237
|
|
},
|
|
{
|
|
"epoch": 2.3398343574672724,
|
|
"grad_norm": 0.7666484713554382,
|
|
"learning_rate": 1.1399483604004403e-06,
|
|
"loss": 0.08878711462020875,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2190,
|
|
"token_acc": 0.9614538598512622,
|
|
"train_speed(iter/s)": 0.113307
|
|
},
|
|
{
|
|
"epoch": 2.3451776649746194,
|
|
"grad_norm": 0.7943670153617859,
|
|
"learning_rate": 1.1222119406125426e-06,
|
|
"loss": 0.09242654442787171,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2195,
|
|
"token_acc": 0.96523288032722,
|
|
"train_speed(iter/s)": 0.113373
|
|
},
|
|
{
|
|
"epoch": 2.3505209724819665,
|
|
"grad_norm": 0.755363941192627,
|
|
"learning_rate": 1.1045971273716476e-06,
|
|
"loss": 0.10125420093536378,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2200,
|
|
"token_acc": 0.964818502602802,
|
|
"train_speed(iter/s)": 0.113435
|
|
},
|
|
{
|
|
"epoch": 2.3505209724819665,
|
|
"eval_loss": 0.19072869420051575,
|
|
"eval_runtime": 39.1742,
|
|
"eval_samples_per_second": 15.418,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.938943575886251,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.3558642799893135,
|
|
"grad_norm": 0.7719199657440186,
|
|
"learning_rate": 1.0871044730742752e-06,
|
|
"loss": 0.0994708001613617,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2205,
|
|
"token_acc": 0.9452918677716055,
|
|
"train_speed(iter/s)": 0.113231
|
|
},
|
|
{
|
|
"epoch": 2.3612075874966605,
|
|
"grad_norm": 0.7336219549179077,
|
|
"learning_rate": 1.0697345262860638e-06,
|
|
"loss": 0.09733407497406006,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2210,
|
|
"token_acc": 0.962336711024295,
|
|
"train_speed(iter/s)": 0.1133
|
|
},
|
|
{
|
|
"epoch": 2.3665508950040075,
|
|
"grad_norm": 0.7070329189300537,
|
|
"learning_rate": 1.0524878317245713e-06,
|
|
"loss": 0.08725832104682922,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2215,
|
|
"token_acc": 0.9673966575828217,
|
|
"train_speed(iter/s)": 0.113371
|
|
},
|
|
{
|
|
"epoch": 2.3718942025113545,
|
|
"grad_norm": 0.7620383501052856,
|
|
"learning_rate": 1.0353649302421982e-06,
|
|
"loss": 0.08947555422782898,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2220,
|
|
"token_acc": 0.9708626514987136,
|
|
"train_speed(iter/s)": 0.113435
|
|
},
|
|
{
|
|
"epoch": 2.3718942025113545,
|
|
"eval_loss": 0.19121414422988892,
|
|
"eval_runtime": 39.1416,
|
|
"eval_samples_per_second": 15.431,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9391365888181175,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.3772375100187015,
|
|
"grad_norm": 0.6480873227119446,
|
|
"learning_rate": 1.0183663588092214e-06,
|
|
"loss": 0.0861007571220398,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2225,
|
|
"token_acc": 0.9472103487064117,
|
|
"train_speed(iter/s)": 0.11322
|
|
},
|
|
{
|
|
"epoch": 2.3825808175260486,
|
|
"grad_norm": 0.7264376878738403,
|
|
"learning_rate": 1.0014926504969535e-06,
|
|
"loss": 0.09383871555328369,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2230,
|
|
"token_acc": 0.968945743273048,
|
|
"train_speed(iter/s)": 0.113288
|
|
},
|
|
{
|
|
"epoch": 2.3879241250333956,
|
|
"grad_norm": 0.6731327772140503,
|
|
"learning_rate": 9.847443344610296e-07,
|
|
"loss": 0.09176123142242432,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2235,
|
|
"token_acc": 0.9693463125322331,
|
|
"train_speed(iter/s)": 0.113364
|
|
},
|
|
{
|
|
"epoch": 2.3932674325407426,
|
|
"grad_norm": 0.8005653023719788,
|
|
"learning_rate": 9.681219359248106e-07,
|
|
"loss": 0.09590315818786621,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2240,
|
|
"token_acc": 0.9636749520427738,
|
|
"train_speed(iter/s)": 0.113431
|
|
},
|
|
{
|
|
"epoch": 2.3932674325407426,
|
|
"eval_loss": 0.19097845256328583,
|
|
"eval_runtime": 39.2123,
|
|
"eval_samples_per_second": 15.403,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9388963949473503,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.3986107400480896,
|
|
"grad_norm": 0.7299765348434448,
|
|
"learning_rate": 9.516259761629148e-07,
|
|
"loss": 0.08505445718765259,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2245,
|
|
"token_acc": 0.944954128440367,
|
|
"train_speed(iter/s)": 0.113227
|
|
},
|
|
{
|
|
"epoch": 2.4039540475554366,
|
|
"grad_norm": 0.7002612948417664,
|
|
"learning_rate": 9.352569724848715e-07,
|
|
"loss": 0.08956900835037232,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2250,
|
|
"token_acc": 0.9705892762342948,
|
|
"train_speed(iter/s)": 0.113285
|
|
},
|
|
{
|
|
"epoch": 2.409297355062784,
|
|
"grad_norm": 0.7428932785987854,
|
|
"learning_rate": 9.190154382188921e-07,
|
|
"loss": 0.09664742350578308,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2255,
|
|
"token_acc": 0.9705281875658588,
|
|
"train_speed(iter/s)": 0.113348
|
|
},
|
|
{
|
|
"epoch": 2.414640662570131,
|
|
"grad_norm": 0.6886446475982666,
|
|
"learning_rate": 9.029018826957775e-07,
|
|
"loss": 0.09689427018165589,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2260,
|
|
"token_acc": 0.960169941582581,
|
|
"train_speed(iter/s)": 0.113411
|
|
},
|
|
{
|
|
"epoch": 2.414640662570131,
|
|
"eval_loss": 0.1908179074525833,
|
|
"eval_runtime": 39.2091,
|
|
"eval_samples_per_second": 15.405,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9391408779943812,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.419983970077478,
|
|
"grad_norm": 0.7271323800086975,
|
|
"learning_rate": 8.86916811232944e-07,
|
|
"loss": 0.09017704725265503,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2265,
|
|
"token_acc": 0.9487460692263789,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 2.425327277584825,
|
|
"grad_norm": 0.7703067064285278,
|
|
"learning_rate": 8.710607251185799e-07,
|
|
"loss": 0.09243172407150269,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2270,
|
|
"token_acc": 0.9636782618237683,
|
|
"train_speed(iter/s)": 0.113273
|
|
},
|
|
{
|
|
"epoch": 2.430670585092172,
|
|
"grad_norm": 0.7155824899673462,
|
|
"learning_rate": 8.553341215959215e-07,
|
|
"loss": 0.09737263917922974,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2275,
|
|
"token_acc": 0.9632417350679687,
|
|
"train_speed(iter/s)": 0.113346
|
|
},
|
|
{
|
|
"epoch": 2.436013892599519,
|
|
"grad_norm": 0.7440068125724792,
|
|
"learning_rate": 8.397374938476594e-07,
|
|
"loss": 0.09268940091133118,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2280,
|
|
"token_acc": 0.9701119443538746,
|
|
"train_speed(iter/s)": 0.11341
|
|
},
|
|
{
|
|
"epoch": 2.436013892599519,
|
|
"eval_loss": 0.190630704164505,
|
|
"eval_runtime": 39.2105,
|
|
"eval_samples_per_second": 15.404,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9391837697570181,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.441357200106866,
|
|
"grad_norm": 0.7325111031532288,
|
|
"learning_rate": 8.242713309804729e-07,
|
|
"loss": 0.09075909256935119,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2285,
|
|
"token_acc": 0.947367209794716,
|
|
"train_speed(iter/s)": 0.113205
|
|
},
|
|
{
|
|
"epoch": 2.446700507614213,
|
|
"grad_norm": 0.757805585861206,
|
|
"learning_rate": 8.089361180096927e-07,
|
|
"loss": 0.09486221075057984,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2290,
|
|
"token_acc": 0.9661316211878009,
|
|
"train_speed(iter/s)": 0.113267
|
|
},
|
|
{
|
|
"epoch": 2.45204381512156,
|
|
"grad_norm": 0.7468327283859253,
|
|
"learning_rate": 7.937323358440935e-07,
|
|
"loss": 0.09470909833908081,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2295,
|
|
"token_acc": 0.9667186525265128,
|
|
"train_speed(iter/s)": 0.113336
|
|
},
|
|
{
|
|
"epoch": 2.4573871226289072,
|
|
"grad_norm": 0.7450569868087769,
|
|
"learning_rate": 7.786604612708093e-07,
|
|
"loss": 0.09495973587036133,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2300,
|
|
"token_acc": 0.9614005227645736,
|
|
"train_speed(iter/s)": 0.113403
|
|
},
|
|
{
|
|
"epoch": 2.4573871226289072,
|
|
"eval_loss": 0.19077461957931519,
|
|
"eval_runtime": 39.2014,
|
|
"eval_samples_per_second": 15.408,
|
|
"eval_steps_per_second": 3.852,
|
|
"eval_token_acc": 0.9391537455231723,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.4627304301362543,
|
|
"grad_norm": 0.6540588736534119,
|
|
"learning_rate": 7.637209669403789e-07,
|
|
"loss": 0.0869560956954956,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2305,
|
|
"token_acc": 0.9489500731715618,
|
|
"train_speed(iter/s)": 0.113205
|
|
},
|
|
{
|
|
"epoch": 2.4680737376436013,
|
|
"grad_norm": 0.738757848739624,
|
|
"learning_rate": 7.489143213519301e-07,
|
|
"loss": 0.09310966730117798,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2310,
|
|
"token_acc": 0.9699513919575784,
|
|
"train_speed(iter/s)": 0.113277
|
|
},
|
|
{
|
|
"epoch": 2.4734170451509483,
|
|
"grad_norm": 0.6581406593322754,
|
|
"learning_rate": 7.342409888384816e-07,
|
|
"loss": 0.08957692980766296,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2315,
|
|
"token_acc": 0.9666096166887331,
|
|
"train_speed(iter/s)": 0.113338
|
|
},
|
|
{
|
|
"epoch": 2.4787603526582953,
|
|
"grad_norm": 0.6974185705184937,
|
|
"learning_rate": 7.197014295523879e-07,
|
|
"loss": 0.08715896606445313,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2320,
|
|
"token_acc": 0.9711248792133235,
|
|
"train_speed(iter/s)": 0.113406
|
|
},
|
|
{
|
|
"epoch": 2.4787603526582953,
|
|
"eval_loss": 0.19092287123203278,
|
|
"eval_runtime": 39.2106,
|
|
"eval_samples_per_second": 15.404,
|
|
"eval_steps_per_second": 3.851,
|
|
"eval_token_acc": 0.9391966372858093,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.4841036601656423,
|
|
"grad_norm": 0.7533718347549438,
|
|
"learning_rate": 7.052960994509056e-07,
|
|
"loss": 0.09223737120628357,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2325,
|
|
"token_acc": 0.9484453382745063,
|
|
"train_speed(iter/s)": 0.113199
|
|
},
|
|
{
|
|
"epoch": 2.48944696767299,
|
|
"grad_norm": 0.7056859731674194,
|
|
"learning_rate": 6.910254502818914e-07,
|
|
"loss": 0.08803938627243042,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2330,
|
|
"token_acc": 0.9710507958653053,
|
|
"train_speed(iter/s)": 0.113258
|
|
},
|
|
{
|
|
"epoch": 2.494790275180337,
|
|
"grad_norm": 0.7328131198883057,
|
|
"learning_rate": 6.768899295696413e-07,
|
|
"loss": 0.09180974960327148,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2335,
|
|
"token_acc": 0.9653558350581117,
|
|
"train_speed(iter/s)": 0.113316
|
|
},
|
|
{
|
|
"epoch": 2.500133582687684,
|
|
"grad_norm": 0.7516103982925415,
|
|
"learning_rate": 6.628899806008515e-07,
|
|
"loss": 0.09033479690551757,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2340,
|
|
"token_acc": 0.9666970260959094,
|
|
"train_speed(iter/s)": 0.113382
|
|
},
|
|
{
|
|
"epoch": 2.500133582687684,
|
|
"eval_loss": 0.19086231291294098,
|
|
"eval_runtime": 39.1889,
|
|
"eval_samples_per_second": 15.413,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.939299577516138,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.505476890195031,
|
|
"grad_norm": 0.8012656569480896,
|
|
"learning_rate": 6.490260424107231e-07,
|
|
"loss": 0.09262714982032776,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2345,
|
|
"token_acc": 0.9465704224528235,
|
|
"train_speed(iter/s)": 0.113185
|
|
},
|
|
{
|
|
"epoch": 2.510820197702378,
|
|
"grad_norm": 0.6409561634063721,
|
|
"learning_rate": 6.352985497691883e-07,
|
|
"loss": 0.09137773513793945,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2350,
|
|
"token_acc": 0.9675587467362924,
|
|
"train_speed(iter/s)": 0.113254
|
|
},
|
|
{
|
|
"epoch": 2.516163505209725,
|
|
"grad_norm": 0.790011465549469,
|
|
"learning_rate": 6.217079331672777e-07,
|
|
"loss": 0.10272359848022461,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2355,
|
|
"token_acc": 0.9673232662587969,
|
|
"train_speed(iter/s)": 0.113325
|
|
},
|
|
{
|
|
"epoch": 2.521506812717072,
|
|
"grad_norm": 0.7116812467575073,
|
|
"learning_rate": 6.082546188036204e-07,
|
|
"loss": 0.09815052747726441,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2360,
|
|
"token_acc": 0.9669859985261606,
|
|
"train_speed(iter/s)": 0.113389
|
|
},
|
|
{
|
|
"epoch": 2.521506812717072,
|
|
"eval_loss": 0.19036073982715607,
|
|
"eval_runtime": 39.1911,
|
|
"eval_samples_per_second": 15.412,
|
|
"eval_steps_per_second": 3.853,
|
|
"eval_token_acc": 0.9394539878616311,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.526850120224419,
|
|
"grad_norm": 0.755511462688446,
|
|
"learning_rate": 5.949390285710777e-07,
|
|
"loss": 0.09070051908493042,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2365,
|
|
"token_acc": 0.9475465313028765,
|
|
"train_speed(iter/s)": 0.113185
|
|
},
|
|
{
|
|
"epoch": 2.532193427731766,
|
|
"grad_norm": 0.694186806678772,
|
|
"learning_rate": 5.817615800435167e-07,
|
|
"loss": 0.09250964522361756,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2370,
|
|
"token_acc": 0.9697894963718658,
|
|
"train_speed(iter/s)": 0.113247
|
|
},
|
|
{
|
|
"epoch": 2.537536735239113,
|
|
"grad_norm": 0.7912867665290833,
|
|
"learning_rate": 5.687226864627115e-07,
|
|
"loss": 0.10275306701660156,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2375,
|
|
"token_acc": 0.9677234207772039,
|
|
"train_speed(iter/s)": 0.113304
|
|
},
|
|
{
|
|
"epoch": 2.54288004274646,
|
|
"grad_norm": 0.7408064007759094,
|
|
"learning_rate": 5.558227567253832e-07,
|
|
"loss": 0.09581139087677001,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2380,
|
|
"token_acc": 0.9702766420961533,
|
|
"train_speed(iter/s)": 0.113383
|
|
},
|
|
{
|
|
"epoch": 2.54288004274646,
|
|
"eval_loss": 0.19033583998680115,
|
|
"eval_runtime": 39.1446,
|
|
"eval_samples_per_second": 15.43,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9392781316348195,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.548223350253807,
|
|
"grad_norm": 0.6932405829429626,
|
|
"learning_rate": 5.430621953703785e-07,
|
|
"loss": 0.09184646606445312,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2385,
|
|
"token_acc": 0.9487018329752709,
|
|
"train_speed(iter/s)": 0.113172
|
|
},
|
|
{
|
|
"epoch": 2.553566657761154,
|
|
"grad_norm": 0.6893176436424255,
|
|
"learning_rate": 5.304414025659832e-07,
|
|
"loss": 0.08671947121620179,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2390,
|
|
"token_acc": 0.9663333126281135,
|
|
"train_speed(iter/s)": 0.113228
|
|
},
|
|
{
|
|
"epoch": 2.5589099652685015,
|
|
"grad_norm": 0.6595478057861328,
|
|
"learning_rate": 5.179607740973764e-07,
|
|
"loss": 0.09736074805259705,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2395,
|
|
"token_acc": 0.9659873313136876,
|
|
"train_speed(iter/s)": 0.113281
|
|
},
|
|
{
|
|
"epoch": 2.564253272775848,
|
|
"grad_norm": 0.738300085067749,
|
|
"learning_rate": 5.056207013542131e-07,
|
|
"loss": 0.09036798477172851,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2400,
|
|
"token_acc": 0.9698333820520316,
|
|
"train_speed(iter/s)": 0.113341
|
|
},
|
|
{
|
|
"epoch": 2.564253272775848,
|
|
"eval_loss": 0.1903306394815445,
|
|
"eval_runtime": 39.1763,
|
|
"eval_samples_per_second": 15.417,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9393596259838298,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.5695965802831955,
|
|
"grad_norm": 0.6590713858604431,
|
|
"learning_rate": 4.934215713183527e-07,
|
|
"loss": 0.08946118354797364,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2405,
|
|
"token_acc": 0.9487092468155388,
|
|
"train_speed(iter/s)": 0.113141
|
|
},
|
|
{
|
|
"epoch": 2.5749398877905425,
|
|
"grad_norm": 0.7545654773712158,
|
|
"learning_rate": 4.813637665517251e-07,
|
|
"loss": 0.08997320532798767,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2410,
|
|
"token_acc": 0.9712623356584911,
|
|
"train_speed(iter/s)": 0.113199
|
|
},
|
|
{
|
|
"epoch": 2.5802831952978895,
|
|
"grad_norm": 0.722476601600647,
|
|
"learning_rate": 4.6944766518432936e-07,
|
|
"loss": 0.09507122039794921,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2415,
|
|
"token_acc": 0.9640161909989023,
|
|
"train_speed(iter/s)": 0.113266
|
|
},
|
|
{
|
|
"epoch": 2.5856265028052365,
|
|
"grad_norm": 0.7068451046943665,
|
|
"learning_rate": 4.576736409023813e-07,
|
|
"loss": 0.08914280533790589,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2420,
|
|
"token_acc": 0.9624791076849609,
|
|
"train_speed(iter/s)": 0.113323
|
|
},
|
|
{
|
|
"epoch": 2.5856265028052365,
|
|
"eval_loss": 0.18999898433685303,
|
|
"eval_runtime": 39.1317,
|
|
"eval_samples_per_second": 15.435,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9394068069227305,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 2.5909698103125836,
|
|
"grad_norm": 0.7488506436347961,
|
|
"learning_rate": 4.460420629365919e-07,
|
|
"loss": 0.10495038032531738,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2425,
|
|
"token_acc": 0.9451158336373586,
|
|
"train_speed(iter/s)": 0.113141
|
|
},
|
|
{
|
|
"epoch": 2.5963131178199306,
|
|
"grad_norm": 0.7361159324645996,
|
|
"learning_rate": 4.3455329605058436e-07,
|
|
"loss": 0.09409030675888061,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2430,
|
|
"token_acc": 0.9666012515273817,
|
|
"train_speed(iter/s)": 0.113204
|
|
},
|
|
{
|
|
"epoch": 2.6016564253272776,
|
|
"grad_norm": 0.7987903356552124,
|
|
"learning_rate": 4.232077005294638e-07,
|
|
"loss": 0.09288793802261353,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2435,
|
|
"token_acc": 0.9625309550299681,
|
|
"train_speed(iter/s)": 0.11326
|
|
},
|
|
{
|
|
"epoch": 2.6069997328346246,
|
|
"grad_norm": 0.7318440675735474,
|
|
"learning_rate": 4.120056321685101e-07,
|
|
"loss": 0.09065854549407959,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2440,
|
|
"token_acc": 0.9711286089238845,
|
|
"train_speed(iter/s)": 0.113321
|
|
},
|
|
{
|
|
"epoch": 2.6069997328346246,
|
|
"eval_loss": 0.19047002494335175,
|
|
"eval_runtime": 39.1424,
|
|
"eval_samples_per_second": 15.431,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.939428252804049,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 2.6123430403419716,
|
|
"grad_norm": 0.7801464200019836,
|
|
"learning_rate": 4.009474422620269e-07,
|
|
"loss": 0.09015793800354004,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2445,
|
|
"token_acc": 0.9462027912208955,
|
|
"train_speed(iter/s)": 0.113137
|
|
},
|
|
{
|
|
"epoch": 2.6176863478493186,
|
|
"grad_norm": 0.7523171305656433,
|
|
"learning_rate": 3.900334775923237e-07,
|
|
"loss": 0.08972238302230835,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2450,
|
|
"token_acc": 0.964319157867545,
|
|
"train_speed(iter/s)": 0.113199
|
|
},
|
|
{
|
|
"epoch": 2.6230296553566657,
|
|
"grad_norm": 0.7075212001800537,
|
|
"learning_rate": 3.7926408041883355e-07,
|
|
"loss": 0.09695062637329102,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2455,
|
|
"token_acc": 0.9670872765509989,
|
|
"train_speed(iter/s)": 0.113263
|
|
},
|
|
{
|
|
"epoch": 2.6283729628640127,
|
|
"grad_norm": 0.7174405455589294,
|
|
"learning_rate": 3.6863958846739213e-07,
|
|
"loss": 0.09820109009742736,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2460,
|
|
"token_acc": 0.9666613545816733,
|
|
"train_speed(iter/s)": 0.113326
|
|
},
|
|
{
|
|
"epoch": 2.6283729628640127,
|
|
"eval_loss": 0.1904931664466858,
|
|
"eval_runtime": 38.991,
|
|
"eval_samples_per_second": 15.491,
|
|
"eval_steps_per_second": 3.873,
|
|
"eval_token_acc": 0.9393724935126209,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 2.6337162703713597,
|
|
"grad_norm": 0.7205042243003845,
|
|
"learning_rate": 3.581603349196372e-07,
|
|
"loss": 0.09016447067260742,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2465,
|
|
"token_acc": 0.9470618527207186,
|
|
"train_speed(iter/s)": 0.113146
|
|
},
|
|
{
|
|
"epoch": 2.639059577878707,
|
|
"grad_norm": 0.7430869340896606,
|
|
"learning_rate": 3.4782664840256387e-07,
|
|
"loss": 0.09322860240936279,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2470,
|
|
"token_acc": 0.9666258078894584,
|
|
"train_speed(iter/s)": 0.113215
|
|
},
|
|
{
|
|
"epoch": 2.6444028853860537,
|
|
"grad_norm": 0.7449864149093628,
|
|
"learning_rate": 3.3763885297822153e-07,
|
|
"loss": 0.09292680621147156,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2475,
|
|
"token_acc": 0.965542892849704,
|
|
"train_speed(iter/s)": 0.113277
|
|
},
|
|
{
|
|
"epoch": 2.649746192893401,
|
|
"grad_norm": 0.6502243280410767,
|
|
"learning_rate": 3.275972681335421e-07,
|
|
"loss": 0.09386556148529053,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2480,
|
|
"token_acc": 0.9635287435353204,
|
|
"train_speed(iter/s)": 0.113346
|
|
},
|
|
{
|
|
"epoch": 2.649746192893401,
|
|
"eval_loss": 0.19051562249660492,
|
|
"eval_runtime": 38.9188,
|
|
"eval_samples_per_second": 15.519,
|
|
"eval_steps_per_second": 3.88,
|
|
"eval_token_acc": 0.9393296017499839,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 2.655089500400748,
|
|
"grad_norm": 0.6717422604560852,
|
|
"learning_rate": 3.1770220877033243e-07,
|
|
"loss": 0.08921995162963867,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2485,
|
|
"token_acc": 0.9476881346660132,
|
|
"train_speed(iter/s)": 0.113146
|
|
},
|
|
{
|
|
"epoch": 2.660432807908095,
|
|
"grad_norm": 0.7039415240287781,
|
|
"learning_rate": 3.0795398519539113e-07,
|
|
"loss": 0.08925142884254456,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2490,
|
|
"token_acc": 0.971012390099527,
|
|
"train_speed(iter/s)": 0.113211
|
|
},
|
|
{
|
|
"epoch": 2.6657761154154422,
|
|
"grad_norm": 0.7549923062324524,
|
|
"learning_rate": 2.9835290311078123e-07,
|
|
"loss": 0.09008611440658569,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2495,
|
|
"token_acc": 0.9727889176682867,
|
|
"train_speed(iter/s)": 0.113272
|
|
},
|
|
{
|
|
"epoch": 2.6711194229227893,
|
|
"grad_norm": 0.6972574591636658,
|
|
"learning_rate": 2.888992636042437e-07,
|
|
"loss": 0.08815158009529114,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2500,
|
|
"token_acc": 0.9709897080314667,
|
|
"train_speed(iter/s)": 0.113333
|
|
},
|
|
{
|
|
"epoch": 2.6711194229227893,
|
|
"eval_loss": 0.1904427856206894,
|
|
"eval_runtime": 38.99,
|
|
"eval_samples_per_second": 15.491,
|
|
"eval_steps_per_second": 3.873,
|
|
"eval_token_acc": 0.9394454095091038,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 2.6764627304301363,
|
|
"grad_norm": 0.7152836322784424,
|
|
"learning_rate": 2.7959336313974847e-07,
|
|
"loss": 0.09650709629058837,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2505,
|
|
"token_acc": 0.9477473770829048,
|
|
"train_speed(iter/s)": 0.113148
|
|
},
|
|
{
|
|
"epoch": 2.6818060379374833,
|
|
"grad_norm": 0.6270228028297424,
|
|
"learning_rate": 2.704354935482095e-07,
|
|
"loss": 0.08850882053375245,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2510,
|
|
"token_acc": 0.9648082862758556,
|
|
"train_speed(iter/s)": 0.113203
|
|
},
|
|
{
|
|
"epoch": 2.6871493454448303,
|
|
"grad_norm": 0.7035424709320068,
|
|
"learning_rate": 2.6142594201832183e-07,
|
|
"loss": 0.10063533782958985,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2515,
|
|
"token_acc": 0.9673570595099183,
|
|
"train_speed(iter/s)": 0.113268
|
|
},
|
|
{
|
|
"epoch": 2.6924926529521773,
|
|
"grad_norm": 0.6758410334587097,
|
|
"learning_rate": 2.525649910875627e-07,
|
|
"loss": 0.08648205399513245,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2520,
|
|
"token_acc": 0.970028145397928,
|
|
"train_speed(iter/s)": 0.113337
|
|
},
|
|
{
|
|
"epoch": 2.6924926529521773,
|
|
"eval_loss": 0.19026722013950348,
|
|
"eval_runtime": 39.0306,
|
|
"eval_samples_per_second": 15.475,
|
|
"eval_steps_per_second": 3.869,
|
|
"eval_token_acc": 0.9392909991636106,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 2.6978359604595243,
|
|
"grad_norm": 0.7509530186653137,
|
|
"learning_rate": 2.438529186333288e-07,
|
|
"loss": 0.09757347106933593,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2525,
|
|
"token_acc": 0.9475406925782507,
|
|
"train_speed(iter/s)": 0.11316
|
|
},
|
|
{
|
|
"epoch": 2.7031792679668714,
|
|
"grad_norm": 0.7634799480438232,
|
|
"learning_rate": 2.3528999786421758e-07,
|
|
"loss": 0.09107044339179993,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2530,
|
|
"token_acc": 0.969339403512032,
|
|
"train_speed(iter/s)": 0.113213
|
|
},
|
|
{
|
|
"epoch": 2.708522575474219,
|
|
"grad_norm": 0.6541688442230225,
|
|
"learning_rate": 2.2687649731146844e-07,
|
|
"loss": 0.0989515721797943,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2535,
|
|
"token_acc": 0.9692385434102896,
|
|
"train_speed(iter/s)": 0.113272
|
|
},
|
|
{
|
|
"epoch": 2.7138658829815654,
|
|
"grad_norm": 0.7726590037345886,
|
|
"learning_rate": 2.1861268082053466e-07,
|
|
"loss": 0.09174089431762696,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2540,
|
|
"token_acc": 0.963183540877098,
|
|
"train_speed(iter/s)": 0.113328
|
|
},
|
|
{
|
|
"epoch": 2.7138658829815654,
|
|
"eval_loss": 0.1902547925710678,
|
|
"eval_runtime": 39.0546,
|
|
"eval_samples_per_second": 15.466,
|
|
"eval_steps_per_second": 3.866,
|
|
"eval_token_acc": 0.9391923481095455,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 2.719209190488913,
|
|
"grad_norm": 0.7492479681968689,
|
|
"learning_rate": 2.104988075428127e-07,
|
|
"loss": 0.10171656608581543,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2545,
|
|
"token_acc": 0.947412690936007,
|
|
"train_speed(iter/s)": 0.113141
|
|
},
|
|
{
|
|
"epoch": 2.7245524979962594,
|
|
"grad_norm": 0.7487272620201111,
|
|
"learning_rate": 2.0253513192751374e-07,
|
|
"loss": 0.10030395984649658,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2550,
|
|
"token_acc": 0.9660484246469037,
|
|
"train_speed(iter/s)": 0.113213
|
|
},
|
|
{
|
|
"epoch": 2.729895805503607,
|
|
"grad_norm": 0.7023541927337646,
|
|
"learning_rate": 1.947219037136827e-07,
|
|
"loss": 0.09708930253982544,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2555,
|
|
"token_acc": 0.9649252477742315,
|
|
"train_speed(iter/s)": 0.113282
|
|
},
|
|
{
|
|
"epoch": 2.735239113010954,
|
|
"grad_norm": 0.7202330231666565,
|
|
"learning_rate": 1.8705936792237255e-07,
|
|
"loss": 0.08675633072853088,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2560,
|
|
"token_acc": 0.9641549104720565,
|
|
"train_speed(iter/s)": 0.113325
|
|
},
|
|
{
|
|
"epoch": 2.735239113010954,
|
|
"eval_loss": 0.1902298927307129,
|
|
"eval_runtime": 39.1479,
|
|
"eval_samples_per_second": 15.429,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9393167342211928,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 2.740582420518301,
|
|
"grad_norm": 0.6873902082443237,
|
|
"learning_rate": 1.7954776484895188e-07,
|
|
"loss": 0.0941142499446869,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2565,
|
|
"token_acc": 0.947600952293746,
|
|
"train_speed(iter/s)": 0.113141
|
|
},
|
|
{
|
|
"epoch": 2.745925728025648,
|
|
"grad_norm": 0.7027167081832886,
|
|
"learning_rate": 1.7218733005557707e-07,
|
|
"loss": 0.08929444551467895,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2570,
|
|
"token_acc": 0.9652069391202651,
|
|
"train_speed(iter/s)": 0.1132
|
|
},
|
|
{
|
|
"epoch": 2.751269035532995,
|
|
"grad_norm": 0.7328935265541077,
|
|
"learning_rate": 1.6497829436380009e-07,
|
|
"loss": 0.09518647193908691,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2575,
|
|
"token_acc": 0.9699323199822478,
|
|
"train_speed(iter/s)": 0.11326
|
|
},
|
|
{
|
|
"epoch": 2.756612343040342,
|
|
"grad_norm": 0.7890388369560242,
|
|
"learning_rate": 1.5792088384733174e-07,
|
|
"loss": 0.09950923323631286,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2580,
|
|
"token_acc": 0.962556860615113,
|
|
"train_speed(iter/s)": 0.113323
|
|
},
|
|
{
|
|
"epoch": 2.756612343040342,
|
|
"eval_loss": 0.19023241102695465,
|
|
"eval_runtime": 39.1648,
|
|
"eval_samples_per_second": 15.422,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9394196744515216,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 2.761955650547689,
|
|
"grad_norm": 0.7428569793701172,
|
|
"learning_rate": 1.510153198249531e-07,
|
|
"loss": 0.08879505395889283,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2585,
|
|
"token_acc": 0.9496286055977968,
|
|
"train_speed(iter/s)": 0.113151
|
|
},
|
|
{
|
|
"epoch": 2.767298958055036,
|
|
"grad_norm": 0.7776227593421936,
|
|
"learning_rate": 1.4426181885357215e-07,
|
|
"loss": 0.09481008052825927,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2590,
|
|
"token_acc": 0.9610127994545511,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 2.772642265562383,
|
|
"grad_norm": 0.6838895082473755,
|
|
"learning_rate": 1.376605927214364e-07,
|
|
"loss": 0.0857117772102356,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2595,
|
|
"token_acc": 0.9700140999530001,
|
|
"train_speed(iter/s)": 0.113255
|
|
},
|
|
{
|
|
"epoch": 2.77798557306973,
|
|
"grad_norm": 0.911618173122406,
|
|
"learning_rate": 1.312118484414876e-07,
|
|
"loss": 0.09858800768852234,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2600,
|
|
"token_acc": 0.9638269804901757,
|
|
"train_speed(iter/s)": 0.113319
|
|
},
|
|
{
|
|
"epoch": 2.77798557306973,
|
|
"eval_loss": 0.1901472806930542,
|
|
"eval_runtime": 39.1574,
|
|
"eval_samples_per_second": 15.425,
|
|
"eval_steps_per_second": 3.856,
|
|
"eval_token_acc": 0.9393510476313024,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 2.783328880577077,
|
|
"grad_norm": 0.7218348979949951,
|
|
"learning_rate": 1.2491578824487204e-07,
|
|
"loss": 0.09123666286468506,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2605,
|
|
"token_acc": 0.9471299437601038,
|
|
"train_speed(iter/s)": 0.113146
|
|
},
|
|
{
|
|
"epoch": 2.7886721880844245,
|
|
"grad_norm": 0.7690821290016174,
|
|
"learning_rate": 1.1877260957459835e-07,
|
|
"loss": 0.09849429726600648,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2610,
|
|
"token_acc": 0.9662001494666758,
|
|
"train_speed(iter/s)": 0.113204
|
|
},
|
|
{
|
|
"epoch": 2.794015495591771,
|
|
"grad_norm": 0.700734555721283,
|
|
"learning_rate": 1.1278250507934518e-07,
|
|
"loss": 0.09033372402191162,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2615,
|
|
"token_acc": 0.967641649881776,
|
|
"train_speed(iter/s)": 0.113271
|
|
},
|
|
{
|
|
"epoch": 2.7993588030991186,
|
|
"grad_norm": 0.7667319178581238,
|
|
"learning_rate": 1.0694566260742001e-07,
|
|
"loss": 0.08968676328659057,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2620,
|
|
"token_acc": 0.9692407567701249,
|
|
"train_speed(iter/s)": 0.11334
|
|
},
|
|
{
|
|
"epoch": 2.7993588030991186,
|
|
"eval_loss": 0.19011949002742767,
|
|
"eval_runtime": 39.1765,
|
|
"eval_samples_per_second": 15.417,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9393338909262476,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 2.804702110606465,
|
|
"grad_norm": 0.6720722913742065,
|
|
"learning_rate": 1.0126226520086823e-07,
|
|
"loss": 0.09349075555801392,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2625,
|
|
"token_acc": 0.9490298548952021,
|
|
"train_speed(iter/s)": 0.113158
|
|
},
|
|
{
|
|
"epoch": 2.8100454181138126,
|
|
"grad_norm": 0.7081384658813477,
|
|
"learning_rate": 9.573249108973281e-08,
|
|
"loss": 0.09482257962226867,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2630,
|
|
"token_acc": 0.9653012204622176,
|
|
"train_speed(iter/s)": 0.113226
|
|
},
|
|
{
|
|
"epoch": 2.8153887256211596,
|
|
"grad_norm": 0.8206889629364014,
|
|
"learning_rate": 9.035651368646647e-08,
|
|
"loss": 0.08651464581489562,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2635,
|
|
"token_acc": 0.965938566552901,
|
|
"train_speed(iter/s)": 0.113281
|
|
},
|
|
{
|
|
"epoch": 2.8207320331285066,
|
|
"grad_norm": 0.7286149263381958,
|
|
"learning_rate": 8.513450158049109e-08,
|
|
"loss": 0.09981081485748292,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2640,
|
|
"token_acc": 0.9647688936806594,
|
|
"train_speed(iter/s)": 0.113351
|
|
},
|
|
{
|
|
"epoch": 2.8207320331285066,
|
|
"eval_loss": 0.19009515643119812,
|
|
"eval_runtime": 39.1536,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9394239636277852,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 2.8260753406358536,
|
|
"grad_norm": 0.6493250131607056,
|
|
"learning_rate": 8.006661853291298e-08,
|
|
"loss": 0.09253804683685303,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2645,
|
|
"token_acc": 0.9492279023135687,
|
|
"train_speed(iter/s)": 0.113182
|
|
},
|
|
{
|
|
"epoch": 2.8314186481432007,
|
|
"grad_norm": 0.7164784669876099,
|
|
"learning_rate": 7.515302347138486e-08,
|
|
"loss": 0.10055129528045655,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2650,
|
|
"token_acc": 0.9693806782717248,
|
|
"train_speed(iter/s)": 0.113245
|
|
},
|
|
{
|
|
"epoch": 2.8367619556505477,
|
|
"grad_norm": 0.7498785853385925,
|
|
"learning_rate": 7.03938704851248e-08,
|
|
"loss": 0.09636443257331848,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2655,
|
|
"token_acc": 0.9618243584137769,
|
|
"train_speed(iter/s)": 0.113298
|
|
},
|
|
{
|
|
"epoch": 2.8421052631578947,
|
|
"grad_norm": 0.7123965620994568,
|
|
"learning_rate": 6.578930882008283e-08,
|
|
"loss": 0.09678665399551392,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2660,
|
|
"token_acc": 0.9638537936625904,
|
|
"train_speed(iter/s)": 0.113365
|
|
},
|
|
{
|
|
"epoch": 2.8421052631578947,
|
|
"eval_loss": 0.18998001515865326,
|
|
"eval_runtime": 39.1365,
|
|
"eval_samples_per_second": 15.433,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9393296017499839,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 2.8474485706652417,
|
|
"grad_norm": 0.7025579214096069,
|
|
"learning_rate": 6.133948287426028e-08,
|
|
"loss": 0.09522255659103393,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2665,
|
|
"token_acc": 0.9474585290238993,
|
|
"train_speed(iter/s)": 0.113185
|
|
},
|
|
{
|
|
"epoch": 2.8527918781725887,
|
|
"grad_norm": 0.7414459586143494,
|
|
"learning_rate": 5.704453219318118e-08,
|
|
"loss": 0.09207627773284913,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2670,
|
|
"token_acc": 0.9669233407114132,
|
|
"train_speed(iter/s)": 0.113245
|
|
},
|
|
{
|
|
"epoch": 2.8581351856799357,
|
|
"grad_norm": 0.6940200328826904,
|
|
"learning_rate": 5.2904591465516855e-08,
|
|
"loss": 0.0982110857963562,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2675,
|
|
"token_acc": 0.9648942786069652,
|
|
"train_speed(iter/s)": 0.113311
|
|
},
|
|
{
|
|
"epoch": 2.8634784931872828,
|
|
"grad_norm": 0.5920035243034363,
|
|
"learning_rate": 4.891979051886153e-08,
|
|
"loss": 0.08439633250236511,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2680,
|
|
"token_acc": 0.9698492462311558,
|
|
"train_speed(iter/s)": 0.113369
|
|
},
|
|
{
|
|
"epoch": 2.8634784931872828,
|
|
"eval_loss": 0.19001102447509766,
|
|
"eval_runtime": 39.056,
|
|
"eval_samples_per_second": 15.465,
|
|
"eval_steps_per_second": 3.866,
|
|
"eval_token_acc": 0.9394068069227305,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 2.86882180069463,
|
|
"grad_norm": 0.7572726607322693,
|
|
"learning_rate": 4.509025431566283e-08,
|
|
"loss": 0.09407066106796265,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2685,
|
|
"token_acc": 0.9469456798144891,
|
|
"train_speed(iter/s)": 0.113207
|
|
},
|
|
{
|
|
"epoch": 2.874165108201977,
|
|
"grad_norm": 0.7466961741447449,
|
|
"learning_rate": 4.141610294930043e-08,
|
|
"loss": 0.08772618174552918,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2690,
|
|
"token_acc": 0.9621488738657316,
|
|
"train_speed(iter/s)": 0.113262
|
|
},
|
|
{
|
|
"epoch": 2.8795084157093243,
|
|
"grad_norm": 0.7221233248710632,
|
|
"learning_rate": 3.7897451640321326e-08,
|
|
"loss": 0.09155750274658203,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2695,
|
|
"token_acc": 0.9711238770396626,
|
|
"train_speed(iter/s)": 0.113319
|
|
},
|
|
{
|
|
"epoch": 2.8848517232166713,
|
|
"grad_norm": 0.690424919128418,
|
|
"learning_rate": 3.4534410732825485e-08,
|
|
"loss": 0.09065448045730591,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2700,
|
|
"token_acc": 0.9694952336302547,
|
|
"train_speed(iter/s)": 0.113377
|
|
},
|
|
{
|
|
"epoch": 2.8848517232166713,
|
|
"eval_loss": 0.18985731899738312,
|
|
"eval_runtime": 39.1204,
|
|
"eval_samples_per_second": 15.44,
|
|
"eval_steps_per_second": 3.86,
|
|
"eval_token_acc": 0.939342469278775,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 2.8901950307240183,
|
|
"grad_norm": 0.7742385864257812,
|
|
"learning_rate": 3.1327085691006954e-08,
|
|
"loss": 0.09011354446411132,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2705,
|
|
"token_acc": 0.9484952623546592,
|
|
"train_speed(iter/s)": 0.113201
|
|
},
|
|
{
|
|
"epoch": 2.8955383382313653,
|
|
"grad_norm": 0.67616868019104,
|
|
"learning_rate": 2.8275577095846495e-08,
|
|
"loss": 0.08765259981155396,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2710,
|
|
"token_acc": 0.9678389470704784,
|
|
"train_speed(iter/s)": 0.113251
|
|
},
|
|
{
|
|
"epoch": 2.9008816457387123,
|
|
"grad_norm": 0.6910288333892822,
|
|
"learning_rate": 2.5379980641955792e-08,
|
|
"loss": 0.09436768293380737,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2715,
|
|
"token_acc": 0.9657377798081316,
|
|
"train_speed(iter/s)": 0.113309
|
|
},
|
|
{
|
|
"epoch": 2.9062249532460593,
|
|
"grad_norm": 0.7219937443733215,
|
|
"learning_rate": 2.264038713457706e-08,
|
|
"loss": 0.09317046403884888,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2720,
|
|
"token_acc": 0.9689827817804372,
|
|
"train_speed(iter/s)": 0.113372
|
|
},
|
|
{
|
|
"epoch": 2.9062249532460593,
|
|
"eval_loss": 0.18988655507564545,
|
|
"eval_runtime": 39.1328,
|
|
"eval_samples_per_second": 15.435,
|
|
"eval_steps_per_second": 3.859,
|
|
"eval_token_acc": 0.9394668553904223,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 2.9115682607534064,
|
|
"grad_norm": 0.7260240316390991,
|
|
"learning_rate": 2.0056882486736982e-08,
|
|
"loss": 0.09241507053375245,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2725,
|
|
"token_acc": 0.9490058931065415,
|
|
"train_speed(iter/s)": 0.113201
|
|
},
|
|
{
|
|
"epoch": 2.9169115682607534,
|
|
"grad_norm": 0.6595222353935242,
|
|
"learning_rate": 1.762954771655001e-08,
|
|
"loss": 0.08784698247909546,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2730,
|
|
"token_acc": 0.967156078213068,
|
|
"train_speed(iter/s)": 0.11326
|
|
},
|
|
{
|
|
"epoch": 2.9222548757681004,
|
|
"grad_norm": 0.7033255100250244,
|
|
"learning_rate": 1.5358458944680356e-08,
|
|
"loss": 0.08944010734558105,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2735,
|
|
"token_acc": 0.9681864301377934,
|
|
"train_speed(iter/s)": 0.113312
|
|
},
|
|
{
|
|
"epoch": 2.9275981832754474,
|
|
"grad_norm": 0.684511661529541,
|
|
"learning_rate": 1.3243687391952809e-08,
|
|
"loss": 0.08554937839508056,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2740,
|
|
"token_acc": 0.9701811147059805,
|
|
"train_speed(iter/s)": 0.11336
|
|
},
|
|
{
|
|
"epoch": 2.9275981832754474,
|
|
"eval_loss": 0.18995320796966553,
|
|
"eval_runtime": 39.1422,
|
|
"eval_samples_per_second": 15.431,
|
|
"eval_steps_per_second": 3.858,
|
|
"eval_token_acc": 0.9393767826888846,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 2.9329414907827944,
|
|
"grad_norm": 0.6943471431732178,
|
|
"learning_rate": 1.1285299377118974e-08,
|
|
"loss": 0.08585541248321533,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2745,
|
|
"token_acc": 0.9478389939459105,
|
|
"train_speed(iter/s)": 0.113182
|
|
},
|
|
{
|
|
"epoch": 2.9382847982901414,
|
|
"grad_norm": 0.778913676738739,
|
|
"learning_rate": 9.48335631477948e-09,
|
|
"loss": 0.09440468549728394,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2750,
|
|
"token_acc": 0.9696560591449694,
|
|
"train_speed(iter/s)": 0.113235
|
|
},
|
|
{
|
|
"epoch": 2.9436281057974885,
|
|
"grad_norm": 0.6813073754310608,
|
|
"learning_rate": 7.837914713457184e-09,
|
|
"loss": 0.09091969132423401,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2755,
|
|
"token_acc": 0.9660938225731538,
|
|
"train_speed(iter/s)": 0.113288
|
|
},
|
|
{
|
|
"epoch": 2.948971413304836,
|
|
"grad_norm": 0.8302053213119507,
|
|
"learning_rate": 6.349026173824713e-09,
|
|
"loss": 0.09277503490447998,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2760,
|
|
"token_acc": 0.9680151152198118,
|
|
"train_speed(iter/s)": 0.113333
|
|
},
|
|
{
|
|
"epoch": 2.948971413304836,
|
|
"eval_loss": 0.1899597942829132,
|
|
"eval_runtime": 39.1837,
|
|
"eval_samples_per_second": 15.415,
|
|
"eval_steps_per_second": 3.854,
|
|
"eval_token_acc": 0.9393553368075661,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 2.9543147208121825,
|
|
"grad_norm": 0.7870346307754517,
|
|
"learning_rate": 5.016737387085191e-09,
|
|
"loss": 0.09893054962158203,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2765,
|
|
"token_acc": 0.946137875713679,
|
|
"train_speed(iter/s)": 0.113161
|
|
},
|
|
{
|
|
"epoch": 2.95965802831953,
|
|
"grad_norm": 0.7045755386352539,
|
|
"learning_rate": 3.841090133511749e-09,
|
|
"loss": 0.08883514404296874,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2770,
|
|
"token_acc": 0.9683565527543212,
|
|
"train_speed(iter/s)": 0.113206
|
|
},
|
|
{
|
|
"epoch": 2.965001335826877,
|
|
"grad_norm": 0.6870289444923401,
|
|
"learning_rate": 2.8221212811324616e-09,
|
|
"loss": 0.09689734578132629,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2775,
|
|
"token_acc": 0.9711121335611936,
|
|
"train_speed(iter/s)": 0.113254
|
|
},
|
|
{
|
|
"epoch": 2.970344643334224,
|
|
"grad_norm": 0.7232000827789307,
|
|
"learning_rate": 1.959862784577937e-09,
|
|
"loss": 0.09915404319763184,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2780,
|
|
"token_acc": 0.9624814043439452,
|
|
"train_speed(iter/s)": 0.113315
|
|
},
|
|
{
|
|
"epoch": 2.970344643334224,
|
|
"eval_loss": 0.1898777186870575,
|
|
"eval_runtime": 39.1537,
|
|
"eval_samples_per_second": 15.426,
|
|
"eval_steps_per_second": 3.857,
|
|
"eval_token_acc": 0.9394239636277852,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 2.975687950841571,
|
|
"grad_norm": 0.6913635730743408,
|
|
"learning_rate": 1.2543416840771206e-09,
|
|
"loss": 0.09539123177528382,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2785,
|
|
"token_acc": 0.9471013034632325,
|
|
"train_speed(iter/s)": 0.113152
|
|
},
|
|
{
|
|
"epoch": 2.981031258348918,
|
|
"grad_norm": 0.704073429107666,
|
|
"learning_rate": 7.055801046113031e-10,
|
|
"loss": 0.08701257705688477,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2790,
|
|
"token_acc": 0.969090176051606,
|
|
"train_speed(iter/s)": 0.113208
|
|
},
|
|
{
|
|
"epoch": 2.986374565856265,
|
|
"grad_norm": 0.7093097567558289,
|
|
"learning_rate": 3.1359525521801326e-10,
|
|
"loss": 0.08475543856620789,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2795,
|
|
"token_acc": 0.9701114312493491,
|
|
"train_speed(iter/s)": 0.113264
|
|
},
|
|
{
|
|
"epoch": 2.991717873363612,
|
|
"grad_norm": 0.6521144509315491,
|
|
"learning_rate": 7.839942845144777e-11,
|
|
"loss": 0.09691762924194336,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2800,
|
|
"token_acc": 0.9690315134805886,
|
|
"train_speed(iter/s)": 0.113325
|
|
},
|
|
{
|
|
"epoch": 2.991717873363612,
|
|
"eval_loss": 0.19001290202140808,
|
|
"eval_runtime": 39.1732,
|
|
"eval_samples_per_second": 15.419,
|
|
"eval_steps_per_second": 3.855,
|
|
"eval_token_acc": 0.939398228570203,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 2.997061180870959,
|
|
"grad_norm": 0.6936495900154114,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.09210923910140992,
|
|
"memory(GiB)": 31.37,
|
|
"step": 2805,
|
|
"token_acc": 0.947245220359309,
|
|
"train_speed(iter/s)": 0.11316
|
|
},
|
|
{
|
|
"epoch": 2.997061180870959,
|
|
"eval_loss": 0.18997597694396973,
|
|
"eval_runtime": 38.9833,
|
|
"eval_samples_per_second": 15.494,
|
|
"eval_steps_per_second": 3.873,
|
|
"eval_token_acc": 0.9393338909262476,
|
|
"step": 2805
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2805,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.1732509215136154e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|