Files
qwen2.5vl-3b-32b-random-wro…/trainer_state.json
ModelHub XC b6855e36db 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-32b-random-wrong-correct
Source: Original Platform
2026-05-22 21:31:12 +08:00

6924 lines
197 KiB
JSON

{
"best_global_step": 1860,
"best_metric": 0.1756638,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v30-20250504-002959/checkpoint-1860",
"epoch": 2.997061180870959,
"eval_steps": 20,
"global_step": 2805,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010686615014694097,
"grad_norm": 2.5240726470947266,
"learning_rate": 9.999996864014995e-06,
"loss": 0.36646389961242676,
"memory(GiB)": 28.83,
"step": 1,
"token_acc": 0.898885535175296,
"train_speed(iter/s)": 0.068736
},
{
"epoch": 0.0053433075073470475,
"grad_norm": 2.2844929695129395,
"learning_rate": 9.99992160057155e-06,
"loss": 0.32597002387046814,
"memory(GiB)": 28.87,
"step": 5,
"token_acc": 0.8913895678642254,
"train_speed(iter/s)": 0.125487
},
{
"epoch": 0.010686615014694095,
"grad_norm": 1.130465030670166,
"learning_rate": 9.999686404744782e-06,
"loss": 0.27506489753723146,
"memory(GiB)": 28.87,
"step": 10,
"token_acc": 0.9082294264339152,
"train_speed(iter/s)": 0.138754
},
{
"epoch": 0.016029922522041145,
"grad_norm": 1.0269818305969238,
"learning_rate": 9.999294419895389e-06,
"loss": 0.27271237373352053,
"memory(GiB)": 28.88,
"step": 15,
"token_acc": 0.9203989829845491,
"train_speed(iter/s)": 0.144155
},
{
"epoch": 0.02137323002938819,
"grad_norm": 0.9383953213691711,
"learning_rate": 9.998745658315924e-06,
"loss": 0.2601430892944336,
"memory(GiB)": 28.88,
"step": 20,
"token_acc": 0.9118547099521022,
"train_speed(iter/s)": 0.144659
},
{
"epoch": 0.02137323002938819,
"eval_loss": 0.26998990774154663,
"eval_runtime": 39.5781,
"eval_samples_per_second": 15.261,
"eval_steps_per_second": 3.815,
"eval_token_acc": 0.9141692937871282,
"step": 20
},
{
"epoch": 0.026716537536735238,
"grad_norm": 0.8839966058731079,
"learning_rate": 9.998040137215423e-06,
"loss": 0.2755439758300781,
"memory(GiB)": 28.88,
"step": 25,
"token_acc": 0.9063694193980396,
"train_speed(iter/s)": 0.11355
},
{
"epoch": 0.03205984504408229,
"grad_norm": 1.0895779132843018,
"learning_rate": 9.99717787871887e-06,
"loss": 0.2630652666091919,
"memory(GiB)": 28.88,
"step": 30,
"token_acc": 0.9156098058549522,
"train_speed(iter/s)": 0.118275
},
{
"epoch": 0.037403152551429335,
"grad_norm": 0.9162330031394958,
"learning_rate": 9.99615890986649e-06,
"loss": 0.25594387054443357,
"memory(GiB)": 28.88,
"step": 35,
"token_acc": 0.9182685889734348,
"train_speed(iter/s)": 0.12251
},
{
"epoch": 0.04274646005877638,
"grad_norm": 0.7786328792572021,
"learning_rate": 9.994983262612916e-06,
"loss": 0.26515631675720214,
"memory(GiB)": 28.88,
"step": 40,
"token_acc": 0.9209860093271153,
"train_speed(iter/s)": 0.125479
},
{
"epoch": 0.04274646005877638,
"eval_loss": 0.2504235804080963,
"eval_runtime": 39.1128,
"eval_samples_per_second": 15.443,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9184799159321453,
"step": 40
},
{
"epoch": 0.04808976756612343,
"grad_norm": 0.9364318251609802,
"learning_rate": 9.993650973826177e-06,
"loss": 0.2555875062942505,
"memory(GiB)": 28.88,
"step": 45,
"token_acc": 0.9150358901081727,
"train_speed(iter/s)": 0.111936
},
{
"epoch": 0.053433075073470476,
"grad_norm": 0.8948811888694763,
"learning_rate": 9.992162085286543e-06,
"loss": 0.24805829524993897,
"memory(GiB)": 28.88,
"step": 50,
"token_acc": 0.9186546283208815,
"train_speed(iter/s)": 0.115376
},
{
"epoch": 0.05877638258081753,
"grad_norm": 0.9033949375152588,
"learning_rate": 9.990516643685222e-06,
"loss": 0.24738097190856934,
"memory(GiB)": 28.88,
"step": 55,
"token_acc": 0.9198283024093049,
"train_speed(iter/s)": 0.117724
},
{
"epoch": 0.06411969008816458,
"grad_norm": 0.8253108859062195,
"learning_rate": 9.988714700622882e-06,
"loss": 0.24202115535736085,
"memory(GiB)": 28.88,
"step": 60,
"token_acc": 0.9179127157792366,
"train_speed(iter/s)": 0.12054
},
{
"epoch": 0.06411969008816458,
"eval_loss": 0.24108153581619263,
"eval_runtime": 39.139,
"eval_samples_per_second": 15.432,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9215252310793712,
"step": 60
},
{
"epoch": 0.06946299759551162,
"grad_norm": 0.9196599125862122,
"learning_rate": 9.986756312608048e-06,
"loss": 0.2574702262878418,
"memory(GiB)": 28.88,
"step": 65,
"token_acc": 0.9133585236043591,
"train_speed(iter/s)": 0.112242
},
{
"epoch": 0.07480630510285867,
"grad_norm": 0.8361815810203552,
"learning_rate": 9.98464154105532e-06,
"loss": 0.24776794910430908,
"memory(GiB)": 28.88,
"step": 70,
"token_acc": 0.9202816026503778,
"train_speed(iter/s)": 0.114497
},
{
"epoch": 0.08014961261020571,
"grad_norm": 0.794195294380188,
"learning_rate": 9.982370452283451e-06,
"loss": 0.23735420703887938,
"memory(GiB)": 28.88,
"step": 75,
"token_acc": 0.9199817957916153,
"train_speed(iter/s)": 0.116585
},
{
"epoch": 0.08549292011755276,
"grad_norm": 0.9916167855262756,
"learning_rate": 9.979943117513265e-06,
"loss": 0.23866429328918456,
"memory(GiB)": 28.88,
"step": 80,
"token_acc": 0.9108762520703525,
"train_speed(iter/s)": 0.118446
},
{
"epoch": 0.08549292011755276,
"eval_loss": 0.2345978170633316,
"eval_runtime": 39.1184,
"eval_samples_per_second": 15.44,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9226018143215595,
"step": 80
},
{
"epoch": 0.09083622762489982,
"grad_norm": 0.8545597195625305,
"learning_rate": 9.977359612865424e-06,
"loss": 0.24537258148193358,
"memory(GiB)": 28.88,
"step": 85,
"token_acc": 0.9151461409910985,
"train_speed(iter/s)": 0.112121
},
{
"epoch": 0.09617953513224686,
"grad_norm": 0.8722995519638062,
"learning_rate": 9.974620019358046e-06,
"loss": 0.2280275344848633,
"memory(GiB)": 28.88,
"step": 90,
"token_acc": 0.9271924248000804,
"train_speed(iter/s)": 0.113659
},
{
"epoch": 0.10152284263959391,
"grad_norm": 0.8093506097793579,
"learning_rate": 9.971724422904154e-06,
"loss": 0.22236292362213134,
"memory(GiB)": 28.88,
"step": 95,
"token_acc": 0.9146961880194898,
"train_speed(iter/s)": 0.115242
},
{
"epoch": 0.10686615014694095,
"grad_norm": 0.8567050695419312,
"learning_rate": 9.968672914308995e-06,
"loss": 0.2444392681121826,
"memory(GiB)": 28.88,
"step": 100,
"token_acc": 0.9146307217073856,
"train_speed(iter/s)": 0.116624
},
{
"epoch": 0.10686615014694095,
"eval_loss": 0.2300056368112564,
"eval_runtime": 39.1847,
"eval_samples_per_second": 15.414,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9230350211241931,
"step": 100
},
{
"epoch": 0.112209457654288,
"grad_norm": 0.812283992767334,
"learning_rate": 9.965465589267176e-06,
"loss": 0.23651769161224365,
"memory(GiB)": 28.88,
"step": 105,
"token_acc": 0.9196928564126446,
"train_speed(iter/s)": 0.111612
},
{
"epoch": 0.11755276516163506,
"grad_norm": 0.7877810597419739,
"learning_rate": 9.96210254835968e-06,
"loss": 0.24004974365234374,
"memory(GiB)": 28.88,
"step": 110,
"token_acc": 0.9218568537014301,
"train_speed(iter/s)": 0.112971
},
{
"epoch": 0.1228960726689821,
"grad_norm": 0.7748284935951233,
"learning_rate": 9.9585838970507e-06,
"loss": 0.22294034957885742,
"memory(GiB)": 28.88,
"step": 115,
"token_acc": 0.9274852331280633,
"train_speed(iter/s)": 0.114143
},
{
"epoch": 0.12823938017632916,
"grad_norm": 0.8219108581542969,
"learning_rate": 9.954909745684339e-06,
"loss": 0.22394099235534667,
"memory(GiB)": 28.88,
"step": 120,
"token_acc": 0.9172609252148474,
"train_speed(iter/s)": 0.115495
},
{
"epoch": 0.12823938017632916,
"eval_loss": 0.227636456489563,
"eval_runtime": 39.1617,
"eval_samples_per_second": 15.423,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9239829290784705,
"step": 120
},
{
"epoch": 0.1335826876836762,
"grad_norm": 0.8432355523109436,
"learning_rate": 9.951080209481138e-06,
"loss": 0.2368373155593872,
"memory(GiB)": 28.88,
"step": 125,
"token_acc": 0.9205502826063773,
"train_speed(iter/s)": 0.111608
},
{
"epoch": 0.13892599519102325,
"grad_norm": 0.916549801826477,
"learning_rate": 9.947095408534483e-06,
"loss": 0.25100035667419435,
"memory(GiB)": 28.88,
"step": 130,
"token_acc": 0.9245168060115764,
"train_speed(iter/s)": 0.113003
},
{
"epoch": 0.1442693026983703,
"grad_norm": 0.81369948387146,
"learning_rate": 9.94295546780682e-06,
"loss": 0.2301029920578003,
"memory(GiB)": 28.88,
"step": 135,
"token_acc": 0.9171662305832478,
"train_speed(iter/s)": 0.113963
},
{
"epoch": 0.14961261020571734,
"grad_norm": 0.9230105876922607,
"learning_rate": 9.93866051712574e-06,
"loss": 0.23561997413635255,
"memory(GiB)": 28.88,
"step": 140,
"token_acc": 0.9211832729821282,
"train_speed(iter/s)": 0.115079
},
{
"epoch": 0.14961261020571734,
"eval_loss": 0.22445102035999298,
"eval_runtime": 39.1558,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9246691972806622,
"step": 140
},
{
"epoch": 0.15495591771306438,
"grad_norm": 0.7951855063438416,
"learning_rate": 9.934210691179918e-06,
"loss": 0.241453218460083,
"memory(GiB)": 28.88,
"step": 145,
"token_acc": 0.9198347485752214,
"train_speed(iter/s)": 0.111598
},
{
"epoch": 0.16029922522041143,
"grad_norm": 0.8992099165916443,
"learning_rate": 9.929606129514875e-06,
"loss": 0.24097092151641847,
"memory(GiB)": 28.88,
"step": 150,
"token_acc": 0.9258129774449712,
"train_speed(iter/s)": 0.112709
},
{
"epoch": 0.16564253272775847,
"grad_norm": 0.8297993540763855,
"learning_rate": 9.924846976528618e-06,
"loss": 0.2204681158065796,
"memory(GiB)": 28.88,
"step": 155,
"token_acc": 0.9230016313213704,
"train_speed(iter/s)": 0.113601
},
{
"epoch": 0.17098584023510552,
"grad_norm": 0.8261107802391052,
"learning_rate": 9.919933381467088e-06,
"loss": 0.22699880599975586,
"memory(GiB)": 28.88,
"step": 160,
"token_acc": 0.928885791828651,
"train_speed(iter/s)": 0.114559
},
{
"epoch": 0.17098584023510552,
"eval_loss": 0.22084873914718628,
"eval_runtime": 39.2244,
"eval_samples_per_second": 15.399,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9256943104076862,
"step": 160
},
{
"epoch": 0.1763291477424526,
"grad_norm": 0.8418117761611938,
"learning_rate": 9.91486549841951e-06,
"loss": 0.22405543327331542,
"memory(GiB)": 28.88,
"step": 165,
"token_acc": 0.9225325105463167,
"train_speed(iter/s)": 0.111753
},
{
"epoch": 0.18167245524979964,
"grad_norm": 0.8489673733711243,
"learning_rate": 9.909643486313533e-06,
"loss": 0.23412735462188722,
"memory(GiB)": 28.88,
"step": 170,
"token_acc": 0.9189576348674184,
"train_speed(iter/s)": 0.112758
},
{
"epoch": 0.18701576275714668,
"grad_norm": 0.7228992581367493,
"learning_rate": 9.904267508910269e-06,
"loss": 0.2174379587173462,
"memory(GiB)": 28.88,
"step": 175,
"token_acc": 0.9282256367080615,
"train_speed(iter/s)": 0.113726
},
{
"epoch": 0.19235907026449373,
"grad_norm": 0.8382998704910278,
"learning_rate": 9.898737734799134e-06,
"loss": 0.22646231651306153,
"memory(GiB)": 28.88,
"step": 180,
"token_acc": 0.9216335902903067,
"train_speed(iter/s)": 0.114665
},
{
"epoch": 0.19235907026449373,
"eval_loss": 0.21828743815422058,
"eval_runtime": 39.1773,
"eval_samples_per_second": 15.417,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9263333976709773,
"step": 180
},
{
"epoch": 0.19770237777184077,
"grad_norm": 0.7243833541870117,
"learning_rate": 9.89305433739258e-06,
"loss": 0.21673502922058105,
"memory(GiB)": 28.88,
"step": 185,
"token_acc": 0.9237459546925566,
"train_speed(iter/s)": 0.112007
},
{
"epoch": 0.20304568527918782,
"grad_norm": 0.891444981098175,
"learning_rate": 9.887217494920655e-06,
"loss": 0.2255467414855957,
"memory(GiB)": 28.88,
"step": 190,
"token_acc": 0.9196933155753932,
"train_speed(iter/s)": 0.112921
},
{
"epoch": 0.20838899278653486,
"grad_norm": 0.8923943042755127,
"learning_rate": 9.881227390425404e-06,
"loss": 0.23066916465759277,
"memory(GiB)": 28.88,
"step": 195,
"token_acc": 0.9164964514453868,
"train_speed(iter/s)": 0.113717
},
{
"epoch": 0.2137323002938819,
"grad_norm": 0.8573595881462097,
"learning_rate": 9.875084211755127e-06,
"loss": 0.22467041015625,
"memory(GiB)": 28.88,
"step": 200,
"token_acc": 0.9248140544542792,
"train_speed(iter/s)": 0.114571
},
{
"epoch": 0.2137323002938819,
"eval_loss": 0.2157127410173416,
"eval_runtime": 39.2065,
"eval_samples_per_second": 15.406,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9273456432692101,
"step": 200
},
{
"epoch": 0.21907560780122895,
"grad_norm": 0.776364266872406,
"learning_rate": 9.868788151558513e-06,
"loss": 0.2205509662628174,
"memory(GiB)": 28.88,
"step": 205,
"token_acc": 0.9281972298447889,
"train_speed(iter/s)": 0.112238
},
{
"epoch": 0.224418915308576,
"grad_norm": 0.7892180681228638,
"learning_rate": 9.862339407278564e-06,
"loss": 0.2204576015472412,
"memory(GiB)": 28.88,
"step": 210,
"token_acc": 0.9257216072890344,
"train_speed(iter/s)": 0.113008
},
{
"epoch": 0.22976222281592307,
"grad_norm": 0.6594001054763794,
"learning_rate": 9.855738181146427e-06,
"loss": 0.2142805814743042,
"memory(GiB)": 28.88,
"step": 215,
"token_acc": 0.9388168011006501,
"train_speed(iter/s)": 0.113727
},
{
"epoch": 0.2351055303232701,
"grad_norm": 0.8227058053016663,
"learning_rate": 9.848984680175049e-06,
"loss": 0.2226466417312622,
"memory(GiB)": 28.88,
"step": 220,
"token_acc": 0.9279371197521067,
"train_speed(iter/s)": 0.114316
},
{
"epoch": 0.2351055303232701,
"eval_loss": 0.21347786486148834,
"eval_runtime": 39.2014,
"eval_samples_per_second": 15.408,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9271354736322889,
"step": 220
},
{
"epoch": 0.24044883783061716,
"grad_norm": 0.7295470237731934,
"learning_rate": 9.84207911615267e-06,
"loss": 0.21721224784851073,
"memory(GiB)": 28.88,
"step": 225,
"token_acc": 0.9288023754536456,
"train_speed(iter/s)": 0.112132
},
{
"epoch": 0.2457921453379642,
"grad_norm": 0.8412714600563049,
"learning_rate": 9.835021705636201e-06,
"loss": 0.22250523567199706,
"memory(GiB)": 28.88,
"step": 230,
"token_acc": 0.9202628419788969,
"train_speed(iter/s)": 0.112833
},
{
"epoch": 0.25113545284531125,
"grad_norm": 0.8208211064338684,
"learning_rate": 9.827812669944423e-06,
"loss": 0.2162861108779907,
"memory(GiB)": 28.88,
"step": 235,
"token_acc": 0.9284959688629414,
"train_speed(iter/s)": 0.113507
},
{
"epoch": 0.2564787603526583,
"grad_norm": 0.849270224571228,
"learning_rate": 9.82045223515105e-06,
"loss": 0.23803000450134276,
"memory(GiB)": 28.88,
"step": 240,
"token_acc": 0.9182260788924586,
"train_speed(iter/s)": 0.114162
},
{
"epoch": 0.2564787603526583,
"eval_loss": 0.21100114285945892,
"eval_runtime": 39.173,
"eval_samples_per_second": 15.419,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.9279032361834909,
"step": 240
},
{
"epoch": 0.26182206786000534,
"grad_norm": 0.787993311882019,
"learning_rate": 9.812940632077629e-06,
"loss": 0.22567553520202638,
"memory(GiB)": 28.88,
"step": 245,
"token_acc": 0.9222485073786189,
"train_speed(iter/s)": 0.112214
},
{
"epoch": 0.2671653753673524,
"grad_norm": 0.8460186123847961,
"learning_rate": 9.805278096286318e-06,
"loss": 0.23234963417053223,
"memory(GiB)": 28.88,
"step": 250,
"token_acc": 0.9275632677484074,
"train_speed(iter/s)": 0.112833
},
{
"epoch": 0.2725086828746994,
"grad_norm": 0.8697513937950134,
"learning_rate": 9.797464868072489e-06,
"loss": 0.21605942249298096,
"memory(GiB)": 28.88,
"step": 255,
"token_acc": 0.9228022233953358,
"train_speed(iter/s)": 0.11345
},
{
"epoch": 0.2778519903820465,
"grad_norm": 0.7353094816207886,
"learning_rate": 9.789501192457188e-06,
"loss": 0.20451416969299316,
"memory(GiB)": 28.88,
"step": 260,
"token_acc": 0.9241443920719032,
"train_speed(iter/s)": 0.11402
},
{
"epoch": 0.2778519903820465,
"eval_loss": 0.20931538939476013,
"eval_runtime": 39.2202,
"eval_samples_per_second": 15.4,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9285251667417272,
"step": 260
},
{
"epoch": 0.2831952978893935,
"grad_norm": 0.7424066066741943,
"learning_rate": 9.781387319179465e-06,
"loss": 0.21903395652770996,
"memory(GiB)": 28.88,
"step": 265,
"token_acc": 0.9232737136934036,
"train_speed(iter/s)": 0.112272
},
{
"epoch": 0.2885386053967406,
"grad_norm": 0.7803475856781006,
"learning_rate": 9.773123502688532e-06,
"loss": 0.20421533584594725,
"memory(GiB)": 28.88,
"step": 270,
"token_acc": 0.9321742863275381,
"train_speed(iter/s)": 0.112863
},
{
"epoch": 0.2938819129040876,
"grad_norm": 0.9278019666671753,
"learning_rate": 9.764710002135784e-06,
"loss": 0.2212052345275879,
"memory(GiB)": 28.88,
"step": 275,
"token_acc": 0.9205871656693225,
"train_speed(iter/s)": 0.113471
},
{
"epoch": 0.2992252204114347,
"grad_norm": 0.8120574355125427,
"learning_rate": 9.756147081366673e-06,
"loss": 0.22279133796691894,
"memory(GiB)": 28.88,
"step": 280,
"token_acc": 0.9252955870108243,
"train_speed(iter/s)": 0.114018
},
{
"epoch": 0.2992252204114347,
"eval_loss": 0.20953305065631866,
"eval_runtime": 39.1588,
"eval_samples_per_second": 15.424,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9285980827382101,
"step": 280
},
{
"epoch": 0.30456852791878175,
"grad_norm": 0.7851824164390564,
"learning_rate": 9.747435008912438e-06,
"loss": 0.2276832103729248,
"memory(GiB)": 28.88,
"step": 285,
"token_acc": 0.9250812487968384,
"train_speed(iter/s)": 0.11243
},
{
"epoch": 0.30991183542612877,
"grad_norm": 0.7576037049293518,
"learning_rate": 9.73857405798168e-06,
"loss": 0.21578505039215087,
"memory(GiB)": 28.88,
"step": 290,
"token_acc": 0.9281283843654622,
"train_speed(iter/s)": 0.113065
},
{
"epoch": 0.31525514293347584,
"grad_norm": 0.7560254335403442,
"learning_rate": 9.729564506451791e-06,
"loss": 0.2222808837890625,
"memory(GiB)": 28.88,
"step": 295,
"token_acc": 0.9222295127257658,
"train_speed(iter/s)": 0.11355
},
{
"epoch": 0.32059845044082286,
"grad_norm": 0.7463079690933228,
"learning_rate": 9.720406636860252e-06,
"loss": 0.2202146530151367,
"memory(GiB)": 28.88,
"step": 300,
"token_acc": 0.9305221834918271,
"train_speed(iter/s)": 0.11405
},
{
"epoch": 0.32059845044082286,
"eval_loss": 0.20838645100593567,
"eval_runtime": 39.2025,
"eval_samples_per_second": 15.407,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9288683008428231,
"step": 300
},
{
"epoch": 0.32594175794816993,
"grad_norm": 0.7550280690193176,
"learning_rate": 9.711100736395758e-06,
"loss": 0.22840361595153807,
"memory(GiB)": 28.88,
"step": 305,
"token_acc": 0.9224960254372019,
"train_speed(iter/s)": 0.112523
},
{
"epoch": 0.33128506545551695,
"grad_norm": 0.8339446187019348,
"learning_rate": 9.70164709688922e-06,
"loss": 0.23481903076171876,
"memory(GiB)": 28.88,
"step": 310,
"token_acc": 0.9225257780131461,
"train_speed(iter/s)": 0.113092
},
{
"epoch": 0.336628372962864,
"grad_norm": 0.7605867385864258,
"learning_rate": 9.69204601480461e-06,
"loss": 0.20481536388397217,
"memory(GiB)": 28.88,
"step": 315,
"token_acc": 0.925222651655184,
"train_speed(iter/s)": 0.113604
},
{
"epoch": 0.34197168047021104,
"grad_norm": 0.7798689007759094,
"learning_rate": 9.682297791229668e-06,
"loss": 0.21814496517181398,
"memory(GiB)": 28.88,
"step": 320,
"token_acc": 0.9192225342713344,
"train_speed(iter/s)": 0.114156
},
{
"epoch": 0.34197168047021104,
"eval_loss": 0.20693515241146088,
"eval_runtime": 39.111,
"eval_samples_per_second": 15.443,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9290999163610628,
"step": 320
},
{
"epoch": 0.3473149879775581,
"grad_norm": 0.7268094420433044,
"learning_rate": 9.67240273186646e-06,
"loss": 0.21229307651519774,
"memory(GiB)": 28.88,
"step": 325,
"token_acc": 0.9255569407555965,
"train_speed(iter/s)": 0.112688
},
{
"epoch": 0.3526582954849052,
"grad_norm": 0.8207147121429443,
"learning_rate": 9.66236114702178e-06,
"loss": 0.23093049526214598,
"memory(GiB)": 28.88,
"step": 330,
"token_acc": 0.9176510638297872,
"train_speed(iter/s)": 0.113194
},
{
"epoch": 0.3580016029922522,
"grad_norm": 0.7545695304870605,
"learning_rate": 9.652173351597435e-06,
"loss": 0.22740473747253417,
"memory(GiB)": 28.88,
"step": 335,
"token_acc": 0.9171260565968394,
"train_speed(iter/s)": 0.113721
},
{
"epoch": 0.36334491049959927,
"grad_norm": 0.7324105501174927,
"learning_rate": 9.641839665080363e-06,
"loss": 0.2098468780517578,
"memory(GiB)": 28.88,
"step": 340,
"token_acc": 0.9380478206427763,
"train_speed(iter/s)": 0.114179
},
{
"epoch": 0.36334491049959927,
"eval_loss": 0.20602919161319733,
"eval_runtime": 39.2173,
"eval_samples_per_second": 15.401,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9290613137746896,
"step": 340
},
{
"epoch": 0.3686882180069463,
"grad_norm": 0.8628275394439697,
"learning_rate": 9.631360411532609e-06,
"loss": 0.20752103328704835,
"memory(GiB)": 31.36,
"step": 345,
"token_acc": 0.927929963287207,
"train_speed(iter/s)": 0.112824
},
{
"epoch": 0.37403152551429336,
"grad_norm": 0.8375836610794067,
"learning_rate": 9.620735919581168e-06,
"loss": 0.21789727210998536,
"memory(GiB)": 31.36,
"step": 350,
"token_acc": 0.9248612142427778,
"train_speed(iter/s)": 0.113287
},
{
"epoch": 0.3793748330216404,
"grad_norm": 0.7989551424980164,
"learning_rate": 9.609966522407678e-06,
"loss": 0.21387200355529784,
"memory(GiB)": 31.36,
"step": 355,
"token_acc": 0.9233053363875456,
"train_speed(iter/s)": 0.11376
},
{
"epoch": 0.38471814052898745,
"grad_norm": 0.8270795941352844,
"learning_rate": 9.599052557737973e-06,
"loss": 0.2108246088027954,
"memory(GiB)": 31.36,
"step": 360,
"token_acc": 0.9243235094595613,
"train_speed(iter/s)": 0.114225
},
{
"epoch": 0.38471814052898745,
"eval_loss": 0.2033926397562027,
"eval_runtime": 39.2398,
"eval_samples_per_second": 15.393,
"eval_steps_per_second": 3.848,
"eval_token_acc": 0.9299148598511656,
"step": 360
},
{
"epoch": 0.39006144803633447,
"grad_norm": 0.7160355448722839,
"learning_rate": 9.58799436783149e-06,
"loss": 0.19975266456604004,
"memory(GiB)": 31.36,
"step": 365,
"token_acc": 0.9291021882882269,
"train_speed(iter/s)": 0.112953
},
{
"epoch": 0.39540475554368154,
"grad_norm": 0.7318681478500366,
"learning_rate": 9.576792299470537e-06,
"loss": 0.2046557903289795,
"memory(GiB)": 31.36,
"step": 370,
"token_acc": 0.9262238367064521,
"train_speed(iter/s)": 0.113418
},
{
"epoch": 0.4007480630510286,
"grad_norm": 0.741698682308197,
"learning_rate": 9.565446703949417e-06,
"loss": 0.20282254219055176,
"memory(GiB)": 31.36,
"step": 375,
"token_acc": 0.9226779000139257,
"train_speed(iter/s)": 0.113823
},
{
"epoch": 0.40609137055837563,
"grad_norm": 0.8014733791351318,
"learning_rate": 9.55395793706341e-06,
"loss": 0.20639050006866455,
"memory(GiB)": 31.36,
"step": 380,
"token_acc": 0.9284898178641038,
"train_speed(iter/s)": 0.11425
},
{
"epoch": 0.40609137055837563,
"eval_loss": 0.20332778990268707,
"eval_runtime": 39.1548,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9302279697184156,
"step": 380
},
{
"epoch": 0.4114346780657227,
"grad_norm": 0.7752350568771362,
"learning_rate": 9.542326359097619e-06,
"loss": 0.21643948554992676,
"memory(GiB)": 31.36,
"step": 385,
"token_acc": 0.9229331980070123,
"train_speed(iter/s)": 0.113068
},
{
"epoch": 0.4167779855730697,
"grad_norm": 0.7383193373680115,
"learning_rate": 9.530552334815672e-06,
"loss": 0.23035106658935547,
"memory(GiB)": 31.36,
"step": 390,
"token_acc": 0.9227077075527204,
"train_speed(iter/s)": 0.113539
},
{
"epoch": 0.4221212930804168,
"grad_norm": 0.826757550239563,
"learning_rate": 9.518636233448276e-06,
"loss": 0.2027421474456787,
"memory(GiB)": 31.36,
"step": 395,
"token_acc": 0.9212801395939086,
"train_speed(iter/s)": 0.113879
},
{
"epoch": 0.4274646005877638,
"grad_norm": 0.8301543593406677,
"learning_rate": 9.506578428681648e-06,
"loss": 0.2126997232437134,
"memory(GiB)": 31.36,
"step": 400,
"token_acc": 0.9254076176645323,
"train_speed(iter/s)": 0.114274
},
{
"epoch": 0.4274646005877638,
"eval_loss": 0.20262883603572845,
"eval_runtime": 39.1295,
"eval_samples_per_second": 15.436,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.930107872783032,
"step": 400
},
{
"epoch": 0.4328079080951109,
"grad_norm": 0.7817525863647461,
"learning_rate": 9.494379298645788e-06,
"loss": 0.20061790943145752,
"memory(GiB)": 31.36,
"step": 405,
"token_acc": 0.9269484626175747,
"train_speed(iter/s)": 0.113056
},
{
"epoch": 0.4381512156024579,
"grad_norm": 0.7769330143928528,
"learning_rate": 9.482039225902623e-06,
"loss": 0.20687649250030518,
"memory(GiB)": 31.36,
"step": 410,
"token_acc": 0.9282657499649222,
"train_speed(iter/s)": 0.113426
},
{
"epoch": 0.443494523109805,
"grad_norm": 0.7487837076187134,
"learning_rate": 9.469558597434018e-06,
"loss": 0.20723772048950195,
"memory(GiB)": 31.36,
"step": 415,
"token_acc": 0.9278717406624384,
"train_speed(iter/s)": 0.113801
},
{
"epoch": 0.448837830617152,
"grad_norm": 0.7757744789123535,
"learning_rate": 9.456937804629623e-06,
"loss": 0.2050936698913574,
"memory(GiB)": 31.36,
"step": 420,
"token_acc": 0.9330446275414077,
"train_speed(iter/s)": 0.114158
},
{
"epoch": 0.448837830617152,
"eval_loss": 0.2025863230228424,
"eval_runtime": 39.128,
"eval_samples_per_second": 15.437,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9305839713483025,
"step": 420
},
{
"epoch": 0.45418113812449906,
"grad_norm": 0.7080808877944946,
"learning_rate": 9.444177243274619e-06,
"loss": 0.1936201810836792,
"memory(GiB)": 31.36,
"step": 425,
"token_acc": 0.9299704947798456,
"train_speed(iter/s)": 0.113084
},
{
"epoch": 0.45952444563184613,
"grad_norm": 0.647037148475647,
"learning_rate": 9.43127731353729e-06,
"loss": 0.1949155330657959,
"memory(GiB)": 31.36,
"step": 430,
"token_acc": 0.9309410335482665,
"train_speed(iter/s)": 0.113375
},
{
"epoch": 0.46486775313919315,
"grad_norm": 0.8139774203300476,
"learning_rate": 9.418238419956484e-06,
"loss": 0.21164326667785643,
"memory(GiB)": 31.36,
"step": 435,
"token_acc": 0.926740549360892,
"train_speed(iter/s)": 0.113761
},
{
"epoch": 0.4702110606465402,
"grad_norm": 0.7926304340362549,
"learning_rate": 9.405060971428924e-06,
"loss": 0.21458048820495607,
"memory(GiB)": 31.36,
"step": 440,
"token_acc": 0.9295489624421197,
"train_speed(iter/s)": 0.11413
},
{
"epoch": 0.4702110606465402,
"eval_loss": 0.20057949423789978,
"eval_runtime": 39.1388,
"eval_samples_per_second": 15.432,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9312059019065388,
"step": 440
},
{
"epoch": 0.47555436815388724,
"grad_norm": 0.7835922837257385,
"learning_rate": 9.391745381196382e-06,
"loss": 0.20983607769012452,
"memory(GiB)": 31.36,
"step": 445,
"token_acc": 0.9275723362435434,
"train_speed(iter/s)": 0.113097
},
{
"epoch": 0.4808976756612343,
"grad_norm": 0.7004543542861938,
"learning_rate": 9.378292066832723e-06,
"loss": 0.21295685768127443,
"memory(GiB)": 31.36,
"step": 450,
"token_acc": 0.9304546811262561,
"train_speed(iter/s)": 0.113495
},
{
"epoch": 0.48624098316858133,
"grad_norm": 0.7315278053283691,
"learning_rate": 9.364701450230813e-06,
"loss": 0.19759070873260498,
"memory(GiB)": 31.36,
"step": 455,
"token_acc": 0.9323673009514272,
"train_speed(iter/s)": 0.113833
},
{
"epoch": 0.4915842906759284,
"grad_norm": 0.8716449737548828,
"learning_rate": 9.350973957589278e-06,
"loss": 0.21045897006988526,
"memory(GiB)": 31.36,
"step": 460,
"token_acc": 0.928,
"train_speed(iter/s)": 0.114182
},
{
"epoch": 0.4915842906759284,
"eval_loss": 0.20007076859474182,
"eval_runtime": 39.0359,
"eval_samples_per_second": 15.473,
"eval_steps_per_second": 3.868,
"eval_token_acc": 0.9313774689570868,
"step": 460
},
{
"epoch": 0.4969275981832754,
"grad_norm": 0.8024523258209229,
"learning_rate": 9.33711001939915e-06,
"loss": 0.22184643745422364,
"memory(GiB)": 31.36,
"step": 465,
"token_acc": 0.9268073439852581,
"train_speed(iter/s)": 0.113198
},
{
"epoch": 0.5022709056906225,
"grad_norm": 0.7982778549194336,
"learning_rate": 9.32311007043036e-06,
"loss": 0.2020054817199707,
"memory(GiB)": 31.36,
"step": 470,
"token_acc": 0.9326877815883263,
"train_speed(iter/s)": 0.113499
},
{
"epoch": 0.5076142131979695,
"grad_norm": 0.7509546279907227,
"learning_rate": 9.30897454971811e-06,
"loss": 0.20111818313598634,
"memory(GiB)": 31.36,
"step": 475,
"token_acc": 0.9371612406895828,
"train_speed(iter/s)": 0.113839
},
{
"epoch": 0.5129575207053166,
"grad_norm": 0.7862565517425537,
"learning_rate": 9.294703900549096e-06,
"loss": 0.20246634483337403,
"memory(GiB)": 31.36,
"step": 480,
"token_acc": 0.9213386062277757,
"train_speed(iter/s)": 0.114112
},
{
"epoch": 0.5129575207053166,
"eval_loss": 0.19960449635982513,
"eval_runtime": 39.1156,
"eval_samples_per_second": 15.441,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9314846983636793,
"step": 480
},
{
"epoch": 0.5183008282126637,
"grad_norm": 0.7772889733314514,
"learning_rate": 9.280298570447612e-06,
"loss": 0.20028786659240722,
"memory(GiB)": 31.36,
"step": 485,
"token_acc": 0.9264812360063717,
"train_speed(iter/s)": 0.113159
},
{
"epoch": 0.5236441357200107,
"grad_norm": 0.6927475929260254,
"learning_rate": 9.265759011161519e-06,
"loss": 0.19862596988677977,
"memory(GiB)": 31.36,
"step": 490,
"token_acc": 0.9293699036323202,
"train_speed(iter/s)": 0.113461
},
{
"epoch": 0.5289874432273577,
"grad_norm": 0.7092183828353882,
"learning_rate": 9.251085678648072e-06,
"loss": 0.20681967735290527,
"memory(GiB)": 31.36,
"step": 495,
"token_acc": 0.9285693844260653,
"train_speed(iter/s)": 0.113757
},
{
"epoch": 0.5343307507347048,
"grad_norm": 0.7293869853019714,
"learning_rate": 9.236279033059622e-06,
"loss": 0.21075177192687988,
"memory(GiB)": 31.36,
"step": 500,
"token_acc": 0.9218673562093788,
"train_speed(iter/s)": 0.114071
},
{
"epoch": 0.5343307507347048,
"eval_loss": 0.1984926164150238,
"eval_runtime": 39.2663,
"eval_samples_per_second": 15.382,
"eval_steps_per_second": 3.846,
"eval_token_acc": 0.9314375174247785,
"step": 500
},
{
"epoch": 0.5396740582420518,
"grad_norm": 0.8831562995910645,
"learning_rate": 9.221339538729191e-06,
"loss": 0.21746454238891602,
"memory(GiB)": 31.36,
"step": 505,
"token_acc": 0.9285247936980231,
"train_speed(iter/s)": 0.113113
},
{
"epoch": 0.5450173657493989,
"grad_norm": 0.7981932163238525,
"learning_rate": 9.206267664155906e-06,
"loss": 0.2134779691696167,
"memory(GiB)": 31.36,
"step": 510,
"token_acc": 0.9249437382154371,
"train_speed(iter/s)": 0.113434
},
{
"epoch": 0.550360673256746,
"grad_norm": 0.7087724208831787,
"learning_rate": 9.191063881990308e-06,
"loss": 0.2046900749206543,
"memory(GiB)": 31.36,
"step": 515,
"token_acc": 0.9302552606840697,
"train_speed(iter/s)": 0.113724
},
{
"epoch": 0.555703980764093,
"grad_norm": 0.7404990196228027,
"learning_rate": 9.17572866901953e-06,
"loss": 0.2034606456756592,
"memory(GiB)": 31.36,
"step": 520,
"token_acc": 0.9364186566673245,
"train_speed(iter/s)": 0.113991
},
{
"epoch": 0.555703980764093,
"eval_loss": 0.19815480709075928,
"eval_runtime": 39.1389,
"eval_samples_per_second": 15.432,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9321194964507067,
"step": 520
},
{
"epoch": 0.56104728827144,
"grad_norm": 0.7702794671058655,
"learning_rate": 9.160262506152343e-06,
"loss": 0.20876009464263917,
"memory(GiB)": 31.36,
"step": 525,
"token_acc": 0.9260557562023464,
"train_speed(iter/s)": 0.113137
},
{
"epoch": 0.566390595778787,
"grad_norm": 0.7707433104515076,
"learning_rate": 9.14466587840408e-06,
"loss": 0.2181908369064331,
"memory(GiB)": 31.36,
"step": 530,
"token_acc": 0.9201792801423293,
"train_speed(iter/s)": 0.113398
},
{
"epoch": 0.5717339032861342,
"grad_norm": 0.7392153143882751,
"learning_rate": 9.12893927488142e-06,
"loss": 0.2074495792388916,
"memory(GiB)": 31.36,
"step": 535,
"token_acc": 0.9264089572777882,
"train_speed(iter/s)": 0.113707
},
{
"epoch": 0.5770772107934812,
"grad_norm": 0.6916921138763428,
"learning_rate": 9.113083188767057e-06,
"loss": 0.192901611328125,
"memory(GiB)": 31.36,
"step": 540,
"token_acc": 0.9276745841693095,
"train_speed(iter/s)": 0.114011
},
{
"epoch": 0.5770772107934812,
"eval_loss": 0.1964673101902008,
"eval_runtime": 39.2785,
"eval_samples_per_second": 15.377,
"eval_steps_per_second": 3.844,
"eval_token_acc": 0.9323039310300457,
"step": 540
},
{
"epoch": 0.5824205183008282,
"grad_norm": 0.7404196262359619,
"learning_rate": 9.097098117304223e-06,
"loss": 0.19912216663360596,
"memory(GiB)": 31.36,
"step": 545,
"token_acc": 0.9267058907942886,
"train_speed(iter/s)": 0.113126
},
{
"epoch": 0.5877638258081752,
"grad_norm": 0.8616334795951843,
"learning_rate": 9.08098456178111e-06,
"loss": 0.22115416526794435,
"memory(GiB)": 31.36,
"step": 550,
"token_acc": 0.9247433468141067,
"train_speed(iter/s)": 0.113428
},
{
"epoch": 0.5931071333155223,
"grad_norm": 0.6713617444038391,
"learning_rate": 9.064743027515127e-06,
"loss": 0.20437276363372803,
"memory(GiB)": 31.36,
"step": 555,
"token_acc": 0.9308613364233295,
"train_speed(iter/s)": 0.113724
},
{
"epoch": 0.5984504408228694,
"grad_norm": 0.7443351745605469,
"learning_rate": 9.048374023837086e-06,
"loss": 0.21196300983428956,
"memory(GiB)": 31.36,
"step": 560,
"token_acc": 0.9288690476190476,
"train_speed(iter/s)": 0.113984
},
{
"epoch": 0.5984504408228694,
"eval_loss": 0.1953776627779007,
"eval_runtime": 39.1331,
"eval_samples_per_second": 15.435,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9320208453966415,
"step": 560
},
{
"epoch": 0.6037937483302164,
"grad_norm": 0.7687556147575378,
"learning_rate": 9.03187806407519e-06,
"loss": 0.20800457000732422,
"memory(GiB)": 31.36,
"step": 565,
"token_acc": 0.9276673510367078,
"train_speed(iter/s)": 0.113135
},
{
"epoch": 0.6091370558375635,
"grad_norm": 0.7980899214744568,
"learning_rate": 9.015255665538972e-06,
"loss": 0.2199338912963867,
"memory(GiB)": 31.36,
"step": 570,
"token_acc": 0.9232478287214704,
"train_speed(iter/s)": 0.113444
},
{
"epoch": 0.6144803633449105,
"grad_norm": 0.7510707974433899,
"learning_rate": 8.998507349503048e-06,
"loss": 0.19736554622650146,
"memory(GiB)": 31.36,
"step": 575,
"token_acc": 0.9285859751544362,
"train_speed(iter/s)": 0.113724
},
{
"epoch": 0.6198236708522575,
"grad_norm": 0.8514485955238342,
"learning_rate": 8.981633641190779e-06,
"loss": 0.21196255683898926,
"memory(GiB)": 31.36,
"step": 580,
"token_acc": 0.9259536976286438,
"train_speed(iter/s)": 0.113995
},
{
"epoch": 0.6198236708522575,
"eval_loss": 0.19566793739795685,
"eval_runtime": 39.1561,
"eval_samples_per_second": 15.425,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9323511119689464,
"step": 580
},
{
"epoch": 0.6251669783596046,
"grad_norm": 0.7959375381469727,
"learning_rate": 8.964635069757803e-06,
"loss": 0.20478439331054688,
"memory(GiB)": 31.36,
"step": 585,
"token_acc": 0.9299914280010803,
"train_speed(iter/s)": 0.113198
},
{
"epoch": 0.6305102858669517,
"grad_norm": 0.6840202808380127,
"learning_rate": 8.94751216827543e-06,
"loss": 0.20479016304016112,
"memory(GiB)": 31.36,
"step": 590,
"token_acc": 0.930996839988509,
"train_speed(iter/s)": 0.113473
},
{
"epoch": 0.6358535933742987,
"grad_norm": 0.7271625399589539,
"learning_rate": 8.930265473713939e-06,
"loss": 0.18621822595596313,
"memory(GiB)": 31.36,
"step": 595,
"token_acc": 0.9302919832373684,
"train_speed(iter/s)": 0.113714
},
{
"epoch": 0.6411969008816457,
"grad_norm": 0.7664585113525391,
"learning_rate": 8.912895526925726e-06,
"loss": 0.20015628337860109,
"memory(GiB)": 31.36,
"step": 600,
"token_acc": 0.9368948705623841,
"train_speed(iter/s)": 0.113966
},
{
"epoch": 0.6411969008816457,
"eval_loss": 0.19422343373298645,
"eval_runtime": 39.1797,
"eval_samples_per_second": 15.416,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9324712089043299,
"step": 600
},
{
"epoch": 0.6465402083889927,
"grad_norm": 0.7383084297180176,
"learning_rate": 8.895402872628352e-06,
"loss": 0.20294442176818847,
"memory(GiB)": 31.36,
"step": 605,
"token_acc": 0.9330581241743725,
"train_speed(iter/s)": 0.113194
},
{
"epoch": 0.6518835158963399,
"grad_norm": 0.7372247576713562,
"learning_rate": 8.87778805938746e-06,
"loss": 0.20596041679382324,
"memory(GiB)": 31.36,
"step": 610,
"token_acc": 0.9245759105003952,
"train_speed(iter/s)": 0.11346
},
{
"epoch": 0.6572268234036869,
"grad_norm": 0.7220420241355896,
"learning_rate": 8.86005163959956e-06,
"loss": 0.20566608905792236,
"memory(GiB)": 31.36,
"step": 615,
"token_acc": 0.9246050420168067,
"train_speed(iter/s)": 0.113732
},
{
"epoch": 0.6625701309110339,
"grad_norm": 0.687303900718689,
"learning_rate": 8.842194169474727e-06,
"loss": 0.20316667556762696,
"memory(GiB)": 31.36,
"step": 620,
"token_acc": 0.929285897482871,
"train_speed(iter/s)": 0.114
},
{
"epoch": 0.6625701309110339,
"eval_loss": 0.19414789974689484,
"eval_runtime": 39.2021,
"eval_samples_per_second": 15.407,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9324969439619121,
"step": 620
},
{
"epoch": 0.667913438418381,
"grad_norm": 0.7889045476913452,
"learning_rate": 8.824216209019139e-06,
"loss": 0.2020263195037842,
"memory(GiB)": 31.36,
"step": 625,
"token_acc": 0.9281722400366468,
"train_speed(iter/s)": 0.113212
},
{
"epoch": 0.673256745925728,
"grad_norm": 0.7296798229217529,
"learning_rate": 8.806118322017525e-06,
"loss": 0.19929354190826415,
"memory(GiB)": 31.36,
"step": 630,
"token_acc": 0.9315762957365074,
"train_speed(iter/s)": 0.113443
},
{
"epoch": 0.6786000534330751,
"grad_norm": 0.7287123799324036,
"learning_rate": 8.787901076015487e-06,
"loss": 0.20486984252929688,
"memory(GiB)": 31.36,
"step": 635,
"token_acc": 0.9216136550589525,
"train_speed(iter/s)": 0.113669
},
{
"epoch": 0.6839433609404221,
"grad_norm": 0.7702438831329346,
"learning_rate": 8.769565042301692e-06,
"loss": 0.21484546661376952,
"memory(GiB)": 31.36,
"step": 640,
"token_acc": 0.9197191255248299,
"train_speed(iter/s)": 0.113928
},
{
"epoch": 0.6839433609404221,
"eval_loss": 0.19368213415145874,
"eval_runtime": 39.232,
"eval_samples_per_second": 15.396,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9326599326599326,
"step": 640
},
{
"epoch": 0.6892866684477692,
"grad_norm": 0.7215872406959534,
"learning_rate": 8.751110795889966e-06,
"loss": 0.20535902976989745,
"memory(GiB)": 31.36,
"step": 645,
"token_acc": 0.9300116508547923,
"train_speed(iter/s)": 0.113175
},
{
"epoch": 0.6946299759551162,
"grad_norm": 0.7084954380989075,
"learning_rate": 8.732538915501257e-06,
"loss": 0.19380364418029786,
"memory(GiB)": 31.36,
"step": 650,
"token_acc": 0.933890160921023,
"train_speed(iter/s)": 0.113399
},
{
"epoch": 0.6999732834624632,
"grad_norm": 0.7394425272941589,
"learning_rate": 8.71384998354549e-06,
"loss": 0.19607152938842773,
"memory(GiB)": 31.36,
"step": 655,
"token_acc": 0.9408127208480566,
"train_speed(iter/s)": 0.113636
},
{
"epoch": 0.7053165909698104,
"grad_norm": 0.6766705513000488,
"learning_rate": 8.695044586103297e-06,
"loss": 0.19869532585144042,
"memory(GiB)": 31.36,
"step": 660,
"token_acc": 0.9315438606795688,
"train_speed(iter/s)": 0.113856
},
{
"epoch": 0.7053165909698104,
"eval_loss": 0.19304147362709045,
"eval_runtime": 39.1551,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9326256192498231,
"step": 660
},
{
"epoch": 0.7106598984771574,
"grad_norm": 0.8370521664619446,
"learning_rate": 8.676123312907641e-06,
"loss": 0.20741744041442872,
"memory(GiB)": 31.36,
"step": 665,
"token_acc": 0.9293741677762982,
"train_speed(iter/s)": 0.113158
},
{
"epoch": 0.7160032059845044,
"grad_norm": 0.6481726765632629,
"learning_rate": 8.657086757325328e-06,
"loss": 0.19325138330459596,
"memory(GiB)": 31.36,
"step": 670,
"token_acc": 0.9333951984673412,
"train_speed(iter/s)": 0.113374
},
{
"epoch": 0.7213465134918514,
"grad_norm": 0.7585648894309998,
"learning_rate": 8.637935516338384e-06,
"loss": 0.20734424591064454,
"memory(GiB)": 31.36,
"step": 675,
"token_acc": 0.929390127754606,
"train_speed(iter/s)": 0.113613
},
{
"epoch": 0.7266898209991985,
"grad_norm": 0.6115338206291199,
"learning_rate": 8.61867019052535e-06,
"loss": 0.1957784414291382,
"memory(GiB)": 31.36,
"step": 680,
"token_acc": 0.9327852845758922,
"train_speed(iter/s)": 0.113851
},
{
"epoch": 0.7266898209991985,
"eval_loss": 0.1919844150543213,
"eval_runtime": 39.1768,
"eval_samples_per_second": 15.417,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9334705869737717,
"step": 680
},
{
"epoch": 0.7320331285065456,
"grad_norm": 0.763317883014679,
"learning_rate": 8.599291384042442e-06,
"loss": 0.1990307092666626,
"memory(GiB)": 31.36,
"step": 685,
"token_acc": 0.9309296501645834,
"train_speed(iter/s)": 0.113214
},
{
"epoch": 0.7373764360138926,
"grad_norm": 0.6819867491722107,
"learning_rate": 8.579799704604597e-06,
"loss": 0.19830925464630128,
"memory(GiB)": 31.36,
"step": 690,
"token_acc": 0.9335458059266292,
"train_speed(iter/s)": 0.113398
},
{
"epoch": 0.7427197435212396,
"grad_norm": 0.7253126502037048,
"learning_rate": 8.560195763466428e-06,
"loss": 0.20094099044799804,
"memory(GiB)": 31.36,
"step": 695,
"token_acc": 0.9241353978300181,
"train_speed(iter/s)": 0.113667
},
{
"epoch": 0.7480630510285867,
"grad_norm": 0.75684654712677,
"learning_rate": 8.540480175403045e-06,
"loss": 0.20439500808715821,
"memory(GiB)": 31.36,
"step": 700,
"token_acc": 0.9336929261002508,
"train_speed(iter/s)": 0.113873
},
{
"epoch": 0.7480630510285867,
"eval_loss": 0.19040465354919434,
"eval_runtime": 39.1788,
"eval_samples_per_second": 15.416,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9335263462651998,
"step": 700
},
{
"epoch": 0.7534063585359337,
"grad_norm": 0.695967972278595,
"learning_rate": 8.520653558690785e-06,
"loss": 0.2008056879043579,
"memory(GiB)": 31.36,
"step": 705,
"token_acc": 0.9287990912399502,
"train_speed(iter/s)": 0.113225
},
{
"epoch": 0.7587496660432808,
"grad_norm": 0.7082876563072205,
"learning_rate": 8.500716535087815e-06,
"loss": 0.19893609285354613,
"memory(GiB)": 31.36,
"step": 710,
"token_acc": 0.9335433493079527,
"train_speed(iter/s)": 0.113434
},
{
"epoch": 0.7640929735506279,
"grad_norm": 0.6951336860656738,
"learning_rate": 8.480669729814635e-06,
"loss": 0.20382363796234132,
"memory(GiB)": 31.36,
"step": 715,
"token_acc": 0.9272279597838725,
"train_speed(iter/s)": 0.113684
},
{
"epoch": 0.7694362810579749,
"grad_norm": 0.690929651260376,
"learning_rate": 8.460513771534475e-06,
"loss": 0.20613670349121094,
"memory(GiB)": 31.36,
"step": 720,
"token_acc": 0.9379507848960543,
"train_speed(iter/s)": 0.113902
},
{
"epoch": 0.7694362810579749,
"eval_loss": 0.19106899201869965,
"eval_runtime": 39.2423,
"eval_samples_per_second": 15.392,
"eval_steps_per_second": 3.848,
"eval_token_acc": 0.9331446095777306,
"step": 720
},
{
"epoch": 0.7747795885653219,
"grad_norm": 0.6783177256584167,
"learning_rate": 8.440249292333583e-06,
"loss": 0.1977448582649231,
"memory(GiB)": 31.36,
"step": 725,
"token_acc": 0.9308449330614417,
"train_speed(iter/s)": 0.11326
},
{
"epoch": 0.7801228960726689,
"grad_norm": 0.6677674055099487,
"learning_rate": 8.41987692770139e-06,
"loss": 0.21048130989074706,
"memory(GiB)": 31.36,
"step": 730,
"token_acc": 0.9202530400865319,
"train_speed(iter/s)": 0.113463
},
{
"epoch": 0.7854662035800161,
"grad_norm": 0.8552298545837402,
"learning_rate": 8.399397316510596e-06,
"loss": 0.20974290370941162,
"memory(GiB)": 31.36,
"step": 735,
"token_acc": 0.9332773814519513,
"train_speed(iter/s)": 0.113682
},
{
"epoch": 0.7908095110873631,
"grad_norm": 0.7160708904266357,
"learning_rate": 8.378811100997122e-06,
"loss": 0.20636558532714844,
"memory(GiB)": 31.36,
"step": 740,
"token_acc": 0.9329161099782013,
"train_speed(iter/s)": 0.113884
},
{
"epoch": 0.7908095110873631,
"eval_loss": 0.19004826247692108,
"eval_runtime": 39.1971,
"eval_samples_per_second": 15.409,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9335435029702546,
"step": 740
},
{
"epoch": 0.7961528185947101,
"grad_norm": 0.7204393148422241,
"learning_rate": 8.358118926739984e-06,
"loss": 0.20534658432006836,
"memory(GiB)": 31.36,
"step": 745,
"token_acc": 0.9301107044622764,
"train_speed(iter/s)": 0.113248
},
{
"epoch": 0.8014961261020572,
"grad_norm": 0.7450360059738159,
"learning_rate": 8.337321442641036e-06,
"loss": 0.20312881469726562,
"memory(GiB)": 31.36,
"step": 750,
"token_acc": 0.9307398246008545,
"train_speed(iter/s)": 0.113436
},
{
"epoch": 0.8068394336094042,
"grad_norm": 0.7013337016105652,
"learning_rate": 8.316419300904622e-06,
"loss": 0.20806705951690674,
"memory(GiB)": 31.36,
"step": 755,
"token_acc": 0.9265781630994311,
"train_speed(iter/s)": 0.113667
},
{
"epoch": 0.8121827411167513,
"grad_norm": 0.7928115129470825,
"learning_rate": 8.295413157017127e-06,
"loss": 0.20586895942687988,
"memory(GiB)": 31.36,
"step": 760,
"token_acc": 0.9280571524250711,
"train_speed(iter/s)": 0.113891
},
{
"epoch": 0.8121827411167513,
"eval_loss": 0.1891859769821167,
"eval_runtime": 39.1442,
"eval_samples_per_second": 15.43,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9337751184884943,
"step": 760
},
{
"epoch": 0.8175260486240983,
"grad_norm": 0.7362833023071289,
"learning_rate": 8.274303669726427e-06,
"loss": 0.21117730140686036,
"memory(GiB)": 31.36,
"step": 765,
"token_acc": 0.9295638706201881,
"train_speed(iter/s)": 0.113261
},
{
"epoch": 0.8228693561314454,
"grad_norm": 0.8272237777709961,
"learning_rate": 8.25309150102121e-06,
"loss": 0.20895204544067383,
"memory(GiB)": 31.36,
"step": 770,
"token_acc": 0.925374677002584,
"train_speed(iter/s)": 0.113483
},
{
"epoch": 0.8282126636387924,
"grad_norm": 0.7175537943840027,
"learning_rate": 8.231777316110245e-06,
"loss": 0.1944166898727417,
"memory(GiB)": 31.36,
"step": 775,
"token_acc": 0.9321589002543703,
"train_speed(iter/s)": 0.11366
},
{
"epoch": 0.8335559711461394,
"grad_norm": 0.7809334993362427,
"learning_rate": 8.210361783401491e-06,
"loss": 0.19996525049209596,
"memory(GiB)": 31.36,
"step": 780,
"token_acc": 0.9270649417354178,
"train_speed(iter/s)": 0.113842
},
{
"epoch": 0.8335559711461394,
"eval_loss": 0.18875150382518768,
"eval_runtime": 39.1112,
"eval_samples_per_second": 15.443,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9345214351583778,
"step": 780
},
{
"epoch": 0.8388992786534865,
"grad_norm": 0.7965995669364929,
"learning_rate": 8.188845574481162e-06,
"loss": 0.20428986549377443,
"memory(GiB)": 31.36,
"step": 785,
"token_acc": 0.9295484112938198,
"train_speed(iter/s)": 0.113243
},
{
"epoch": 0.8442425861608336,
"grad_norm": 0.8420124053955078,
"learning_rate": 8.167229364092648e-06,
"loss": 0.2018270969390869,
"memory(GiB)": 31.36,
"step": 790,
"token_acc": 0.9257297598661941,
"train_speed(iter/s)": 0.113454
},
{
"epoch": 0.8495858936681806,
"grad_norm": 0.7496061325073242,
"learning_rate": 8.145513830115367e-06,
"loss": 0.18817675113677979,
"memory(GiB)": 31.36,
"step": 795,
"token_acc": 0.9294179964245514,
"train_speed(iter/s)": 0.113632
},
{
"epoch": 0.8549292011755276,
"grad_norm": 0.7280667424201965,
"learning_rate": 8.1236996535435e-06,
"loss": 0.20691485404968263,
"memory(GiB)": 31.36,
"step": 800,
"token_acc": 0.9246175682069074,
"train_speed(iter/s)": 0.113856
},
{
"epoch": 0.8549292011755276,
"eval_loss": 0.18890005350112915,
"eval_runtime": 39.0966,
"eval_samples_per_second": 15.449,
"eval_steps_per_second": 3.862,
"eval_token_acc": 0.9341997469386004,
"step": 800
},
{
"epoch": 0.8602725086828747,
"grad_norm": 0.7484925389289856,
"learning_rate": 8.101787518464634e-06,
"loss": 0.20772714614868165,
"memory(GiB)": 31.36,
"step": 805,
"token_acc": 0.9296667323028454,
"train_speed(iter/s)": 0.113284
},
{
"epoch": 0.8656158161902218,
"grad_norm": 0.791478157043457,
"learning_rate": 8.079778112038318e-06,
"loss": 0.20092449188232422,
"memory(GiB)": 31.36,
"step": 810,
"token_acc": 0.9342826902722987,
"train_speed(iter/s)": 0.113469
},
{
"epoch": 0.8709591236975688,
"grad_norm": 0.7884016633033752,
"learning_rate": 8.057672124474508e-06,
"loss": 0.19559590816497802,
"memory(GiB)": 31.36,
"step": 815,
"token_acc": 0.9313420307089644,
"train_speed(iter/s)": 0.113673
},
{
"epoch": 0.8763024312049158,
"grad_norm": 0.7414833307266235,
"learning_rate": 8.035470249011916e-06,
"loss": 0.21486268043518067,
"memory(GiB)": 31.36,
"step": 820,
"token_acc": 0.9156693981017509,
"train_speed(iter/s)": 0.113866
},
{
"epoch": 0.8763024312049158,
"eval_loss": 0.18826377391815186,
"eval_runtime": 39.1249,
"eval_samples_per_second": 15.438,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9340796500032169,
"step": 820
},
{
"epoch": 0.8816457387122629,
"grad_norm": 0.6968944072723389,
"learning_rate": 8.013173181896283e-06,
"loss": 0.195112144947052,
"memory(GiB)": 31.36,
"step": 825,
"token_acc": 0.9297919928427645,
"train_speed(iter/s)": 0.113289
},
{
"epoch": 0.88698904621961,
"grad_norm": 0.6487668752670288,
"learning_rate": 7.990781622358535e-06,
"loss": 0.20295815467834472,
"memory(GiB)": 31.36,
"step": 830,
"token_acc": 0.9273416807127378,
"train_speed(iter/s)": 0.113496
},
{
"epoch": 0.892332353726957,
"grad_norm": 0.7101380825042725,
"learning_rate": 7.968296272592862e-06,
"loss": 0.2020167589187622,
"memory(GiB)": 31.36,
"step": 835,
"token_acc": 0.9326549210206562,
"train_speed(iter/s)": 0.113672
},
{
"epoch": 0.897675661234304,
"grad_norm": 0.7018981575965881,
"learning_rate": 7.945717837734688e-06,
"loss": 0.21045067310333251,
"memory(GiB)": 31.36,
"step": 840,
"token_acc": 0.9254527494237734,
"train_speed(iter/s)": 0.113897
},
{
"epoch": 0.897675661234304,
"eval_loss": 0.18678458034992218,
"eval_runtime": 39.117,
"eval_samples_per_second": 15.441,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9345257243346415,
"step": 840
},
{
"epoch": 0.9030189687416511,
"grad_norm": 0.7442994713783264,
"learning_rate": 7.923047025838573e-06,
"loss": 0.18977639675140381,
"memory(GiB)": 31.36,
"step": 845,
"token_acc": 0.9316706328119585,
"train_speed(iter/s)": 0.11334
},
{
"epoch": 0.9083622762489981,
"grad_norm": 0.5974848866462708,
"learning_rate": 7.900284547855992e-06,
"loss": 0.19339005947113036,
"memory(GiB)": 31.36,
"step": 850,
"token_acc": 0.9319930430681249,
"train_speed(iter/s)": 0.113541
},
{
"epoch": 0.9137055837563451,
"grad_norm": 0.7044218182563782,
"learning_rate": 7.87743111761305e-06,
"loss": 0.19252583980560303,
"memory(GiB)": 31.36,
"step": 855,
"token_acc": 0.9317780249983639,
"train_speed(iter/s)": 0.113714
},
{
"epoch": 0.9190488912636923,
"grad_norm": 0.6662729978561401,
"learning_rate": 7.8544874517881e-06,
"loss": 0.18919335603713988,
"memory(GiB)": 31.36,
"step": 860,
"token_acc": 0.9358071036673404,
"train_speed(iter/s)": 0.113874
},
{
"epoch": 0.9190488912636923,
"eval_loss": 0.18598179519176483,
"eval_runtime": 39.1151,
"eval_samples_per_second": 15.442,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9348645692594737,
"step": 860
},
{
"epoch": 0.9243921987710393,
"grad_norm": 0.7112472057342529,
"learning_rate": 7.831454269889251e-06,
"loss": 0.2195812225341797,
"memory(GiB)": 31.36,
"step": 865,
"token_acc": 0.929244681467741,
"train_speed(iter/s)": 0.113345
},
{
"epoch": 0.9297355062783863,
"grad_norm": 0.6910869479179382,
"learning_rate": 7.808332294231824e-06,
"loss": 0.19954900741577147,
"memory(GiB)": 31.36,
"step": 870,
"token_acc": 0.931801259053679,
"train_speed(iter/s)": 0.113528
},
{
"epoch": 0.9350788137857333,
"grad_norm": 0.681770384311676,
"learning_rate": 7.785122249915688e-06,
"loss": 0.18991070985794067,
"memory(GiB)": 31.36,
"step": 875,
"token_acc": 0.9295779137975007,
"train_speed(iter/s)": 0.113699
},
{
"epoch": 0.9404221212930804,
"grad_norm": 0.8212052583694458,
"learning_rate": 7.76182486480253e-06,
"loss": 0.1996673822402954,
"memory(GiB)": 31.36,
"step": 880,
"token_acc": 0.9402507040697995,
"train_speed(iter/s)": 0.113869
},
{
"epoch": 0.9404221212930804,
"eval_loss": 0.1853218525648117,
"eval_runtime": 39.2228,
"eval_samples_per_second": 15.399,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9349203285509018,
"step": 880
},
{
"epoch": 0.9457654288004275,
"grad_norm": 0.7549769878387451,
"learning_rate": 7.738440869493018e-06,
"loss": 0.19690234661102296,
"memory(GiB)": 31.36,
"step": 885,
"token_acc": 0.9322219434878789,
"train_speed(iter/s)": 0.113325
},
{
"epoch": 0.9511087363077745,
"grad_norm": 0.7041129469871521,
"learning_rate": 7.714970997303898e-06,
"loss": 0.19316442012786866,
"memory(GiB)": 31.36,
"step": 890,
"token_acc": 0.9402204546877936,
"train_speed(iter/s)": 0.113488
},
{
"epoch": 0.9564520438151216,
"grad_norm": 0.6991894841194153,
"learning_rate": 7.691415984244998e-06,
"loss": 0.19888077974319457,
"memory(GiB)": 31.36,
"step": 895,
"token_acc": 0.9314758549356779,
"train_speed(iter/s)": 0.113654
},
{
"epoch": 0.9617953513224686,
"grad_norm": 0.6569207906723022,
"learning_rate": 7.667776568996143e-06,
"loss": 0.19370880126953124,
"memory(GiB)": 31.36,
"step": 900,
"token_acc": 0.9230841325877097,
"train_speed(iter/s)": 0.113808
},
{
"epoch": 0.9617953513224686,
"eval_loss": 0.1854468584060669,
"eval_runtime": 39.1165,
"eval_samples_per_second": 15.441,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9353878487636449,
"step": 900
},
{
"epoch": 0.9671386588298156,
"grad_norm": 0.6145524382591248,
"learning_rate": 7.64405349288399e-06,
"loss": 0.18694071769714354,
"memory(GiB)": 31.36,
"step": 905,
"token_acc": 0.9349944519517045,
"train_speed(iter/s)": 0.113288
},
{
"epoch": 0.9724819663371627,
"grad_norm": 0.6918651461601257,
"learning_rate": 7.62024749985878e-06,
"loss": 0.1850353240966797,
"memory(GiB)": 31.36,
"step": 910,
"token_acc": 0.9334631974398219,
"train_speed(iter/s)": 0.113436
},
{
"epoch": 0.9778252738445098,
"grad_norm": 0.6467410922050476,
"learning_rate": 7.596359336471015e-06,
"loss": 0.1928159236907959,
"memory(GiB)": 31.36,
"step": 915,
"token_acc": 0.9306192268217585,
"train_speed(iter/s)": 0.113579
},
{
"epoch": 0.9831685813518568,
"grad_norm": 0.6539848446846008,
"learning_rate": 7.572389751848037e-06,
"loss": 0.19003190994262695,
"memory(GiB)": 31.36,
"step": 920,
"token_acc": 0.9286221470836855,
"train_speed(iter/s)": 0.113727
},
{
"epoch": 0.9831685813518568,
"eval_loss": 0.1848677396774292,
"eval_runtime": 39.1075,
"eval_samples_per_second": 15.445,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9352377275944155,
"step": 920
},
{
"epoch": 0.9885118888592038,
"grad_norm": 0.630643904209137,
"learning_rate": 7.548339497670538e-06,
"loss": 0.19637407064437867,
"memory(GiB)": 31.36,
"step": 925,
"token_acc": 0.934642791292936,
"train_speed(iter/s)": 0.113199
},
{
"epoch": 0.9938551963665508,
"grad_norm": 0.7896732091903687,
"learning_rate": 7.524209328148995e-06,
"loss": 0.1935054898262024,
"memory(GiB)": 31.36,
"step": 930,
"token_acc": 0.9335429563394583,
"train_speed(iter/s)": 0.113379
},
{
"epoch": 0.999198503873898,
"grad_norm": 0.8091479539871216,
"learning_rate": 7.500000000000001e-06,
"loss": 0.1939959168434143,
"memory(GiB)": 31.36,
"step": 935,
"token_acc": 0.9332101204512141,
"train_speed(iter/s)": 0.113561
},
{
"epoch": 1.0042746460058776,
"grad_norm": 0.7063812017440796,
"learning_rate": 7.4757122724225575e-06,
"loss": 0.15370899438858032,
"memory(GiB)": 31.36,
"step": 940,
"token_acc": 0.9470076897358742,
"train_speed(iter/s)": 0.113758
},
{
"epoch": 1.0042746460058776,
"eval_loss": 0.1846647411584854,
"eval_runtime": 39.1772,
"eval_samples_per_second": 15.417,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9355808616955114,
"step": 940
},
{
"epoch": 1.0096179535132246,
"grad_norm": 0.7912789583206177,
"learning_rate": 7.451346907074245e-06,
"loss": 0.14589121341705322,
"memory(GiB)": 31.36,
"step": 945,
"token_acc": 0.9395386832162834,
"train_speed(iter/s)": 0.113271
},
{
"epoch": 1.0149612610205718,
"grad_norm": 0.773916482925415,
"learning_rate": 7.426904668047352e-06,
"loss": 0.14080936908721925,
"memory(GiB)": 31.36,
"step": 950,
"token_acc": 0.9501985945615643,
"train_speed(iter/s)": 0.113448
},
{
"epoch": 1.0203045685279188,
"grad_norm": 0.7244420647621155,
"learning_rate": 7.40238632184491e-06,
"loss": 0.1430067539215088,
"memory(GiB)": 31.36,
"step": 955,
"token_acc": 0.9496945399007255,
"train_speed(iter/s)": 0.113636
},
{
"epoch": 1.0256478760352659,
"grad_norm": 0.7013021111488342,
"learning_rate": 7.377792637356644e-06,
"loss": 0.13634157180786133,
"memory(GiB)": 31.36,
"step": 960,
"token_acc": 0.9495622671230802,
"train_speed(iter/s)": 0.113799
},
{
"epoch": 1.0256478760352659,
"eval_loss": 0.18845246732234955,
"eval_runtime": 39.1642,
"eval_samples_per_second": 15.422,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9350661605438676,
"step": 960
},
{
"epoch": 1.0309911835426129,
"grad_norm": 0.6317921876907349,
"learning_rate": 7.35312438583488e-06,
"loss": 0.14405910968780516,
"memory(GiB)": 31.36,
"step": 965,
"token_acc": 0.94003444829566,
"train_speed(iter/s)": 0.113324
},
{
"epoch": 1.03633449104996,
"grad_norm": 0.7653928399085999,
"learning_rate": 7.3283823408703466e-06,
"loss": 0.14201946258544923,
"memory(GiB)": 31.36,
"step": 970,
"token_acc": 0.9429385599110369,
"train_speed(iter/s)": 0.113486
},
{
"epoch": 1.041677798557307,
"grad_norm": 0.7575430870056152,
"learning_rate": 7.303567278367918e-06,
"loss": 0.15218265056610109,
"memory(GiB)": 31.36,
"step": 975,
"token_acc": 0.9438988818667963,
"train_speed(iter/s)": 0.113648
},
{
"epoch": 1.047021106064654,
"grad_norm": 0.6947309970855713,
"learning_rate": 7.278679976522279e-06,
"loss": 0.14258232116699218,
"memory(GiB)": 31.36,
"step": 980,
"token_acc": 0.9484339445857315,
"train_speed(iter/s)": 0.113835
},
{
"epoch": 1.047021106064654,
"eval_loss": 0.18912462890148163,
"eval_runtime": 39.1355,
"eval_samples_per_second": 15.434,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9351605224216689,
"step": 980
},
{
"epoch": 1.0523644135720012,
"grad_norm": 0.6447970271110535,
"learning_rate": 7.253721215793528e-06,
"loss": 0.13303806781768798,
"memory(GiB)": 31.36,
"step": 985,
"token_acc": 0.9368344110205229,
"train_speed(iter/s)": 0.113351
},
{
"epoch": 1.0577077210793482,
"grad_norm": 0.7357504367828369,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.14264590740203859,
"memory(GiB)": 31.36,
"step": 990,
"token_acc": 0.9517513105612875,
"train_speed(iter/s)": 0.113523
},
{
"epoch": 1.0630510285866952,
"grad_norm": 0.7251073718070984,
"learning_rate": 7.203592450707193e-06,
"loss": 0.14065431356430053,
"memory(GiB)": 31.36,
"step": 995,
"token_acc": 0.9541761579347001,
"train_speed(iter/s)": 0.113657
},
{
"epoch": 1.0683943360940422,
"grad_norm": 0.6575713753700256,
"learning_rate": 7.178424018376224e-06,
"loss": 0.13455284833908082,
"memory(GiB)": 31.36,
"step": 1000,
"token_acc": 0.952753960692989,
"train_speed(iter/s)": 0.113814
},
{
"epoch": 1.0683943360940422,
"eval_loss": 0.18993094563484192,
"eval_runtime": 39.1132,
"eval_samples_per_second": 15.442,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9351004739539771,
"step": 1000
},
{
"epoch": 1.0737376436013892,
"grad_norm": 0.6939108371734619,
"learning_rate": 7.153187271166071e-06,
"loss": 0.1378490924835205,
"memory(GiB)": 31.36,
"step": 1005,
"token_acc": 0.9382960940547167,
"train_speed(iter/s)": 0.113358
},
{
"epoch": 1.0790809511087363,
"grad_norm": 0.6724715828895569,
"learning_rate": 7.127883000495353e-06,
"loss": 0.14932271242141723,
"memory(GiB)": 31.36,
"step": 1010,
"token_acc": 0.9499566799514816,
"train_speed(iter/s)": 0.113511
},
{
"epoch": 1.0844242586160833,
"grad_norm": 0.731438159942627,
"learning_rate": 7.102511999900213e-06,
"loss": 0.13644077777862548,
"memory(GiB)": 31.36,
"step": 1015,
"token_acc": 0.9460200277757473,
"train_speed(iter/s)": 0.113649
},
{
"epoch": 1.0897675661234305,
"grad_norm": 0.6680863499641418,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.13693207502365112,
"memory(GiB)": 31.36,
"step": 1020,
"token_acc": 0.9469487672670047,
"train_speed(iter/s)": 0.113805
},
{
"epoch": 1.0897675661234305,
"eval_loss": 0.18932494521141052,
"eval_runtime": 39.1534,
"eval_samples_per_second": 15.427,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9353063544146347,
"step": 1020
},
{
"epoch": 1.0951108736307775,
"grad_norm": 0.6814751029014587,
"learning_rate": 7.051572993519474e-06,
"loss": 0.13657076358795167,
"memory(GiB)": 31.36,
"step": 1025,
"token_acc": 0.9386909315096644,
"train_speed(iter/s)": 0.113321
},
{
"epoch": 1.1004541811381245,
"grad_norm": 0.6972615718841553,
"learning_rate": 7.026006585169467e-06,
"loss": 0.14217867851257324,
"memory(GiB)": 31.36,
"step": 1030,
"token_acc": 0.9473362948896199,
"train_speed(iter/s)": 0.113472
},
{
"epoch": 1.1057974886454716,
"grad_norm": 0.7882171869277954,
"learning_rate": 7.0003766417161335e-06,
"loss": 0.13929877281188965,
"memory(GiB)": 31.36,
"step": 1035,
"token_acc": 0.9491739520659509,
"train_speed(iter/s)": 0.113632
},
{
"epoch": 1.1111407961528186,
"grad_norm": 0.7138720154762268,
"learning_rate": 6.974683966908642e-06,
"loss": 0.1398939847946167,
"memory(GiB)": 31.36,
"step": 1040,
"token_acc": 0.9491174031512853,
"train_speed(iter/s)": 0.113768
},
{
"epoch": 1.1111407961528186,
"eval_loss": 0.18828535079956055,
"eval_runtime": 39.1885,
"eval_samples_per_second": 15.413,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.9354607647601278,
"step": 1040
},
{
"epoch": 1.1164841036601656,
"grad_norm": 0.7279144525527954,
"learning_rate": 6.948929366463397e-06,
"loss": 0.15247514247894287,
"memory(GiB)": 31.36,
"step": 1045,
"token_acc": 0.9380235654449663,
"train_speed(iter/s)": 0.113327
},
{
"epoch": 1.1218274111675126,
"grad_norm": 0.7380113005638123,
"learning_rate": 6.923113648038784e-06,
"loss": 0.14748337268829345,
"memory(GiB)": 31.36,
"step": 1050,
"token_acc": 0.9475613194248661,
"train_speed(iter/s)": 0.11348
},
{
"epoch": 1.1271707186748596,
"grad_norm": 0.7649953961372375,
"learning_rate": 6.897237621209831e-06,
"loss": 0.14567428827285767,
"memory(GiB)": 31.36,
"step": 1055,
"token_acc": 0.9500533120085299,
"train_speed(iter/s)": 0.113609
},
{
"epoch": 1.1325140261822069,
"grad_norm": 0.7318655252456665,
"learning_rate": 6.87130209744282e-06,
"loss": 0.13384032249450684,
"memory(GiB)": 31.36,
"step": 1060,
"token_acc": 0.9565458338766463,
"train_speed(iter/s)": 0.113739
},
{
"epoch": 1.1325140261822069,
"eval_loss": 0.18837569653987885,
"eval_runtime": 39.2359,
"eval_samples_per_second": 15.394,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.935216281713097,
"step": 1060
},
{
"epoch": 1.1378573336895539,
"grad_norm": 0.7364778518676758,
"learning_rate": 6.845307890069851e-06,
"loss": 0.1373004674911499,
"memory(GiB)": 31.36,
"step": 1065,
"token_acc": 0.9390180878552972,
"train_speed(iter/s)": 0.113291
},
{
"epoch": 1.143200641196901,
"grad_norm": 0.685958743095398,
"learning_rate": 6.8192558142633215e-06,
"loss": 0.13763891458511351,
"memory(GiB)": 31.36,
"step": 1070,
"token_acc": 0.9479466974181562,
"train_speed(iter/s)": 0.113441
},
{
"epoch": 1.148543948704248,
"grad_norm": 0.6921528577804565,
"learning_rate": 6.7931466870103735e-06,
"loss": 0.1474214553833008,
"memory(GiB)": 31.36,
"step": 1075,
"token_acc": 0.9462754416778593,
"train_speed(iter/s)": 0.11358
},
{
"epoch": 1.153887256211595,
"grad_norm": 0.6673567891120911,
"learning_rate": 6.766981327087271e-06,
"loss": 0.134868586063385,
"memory(GiB)": 31.36,
"step": 1080,
"token_acc": 0.9541073453445886,
"train_speed(iter/s)": 0.113692
},
{
"epoch": 1.153887256211595,
"eval_loss": 0.1878366470336914,
"eval_runtime": 39.1901,
"eval_samples_per_second": 15.412,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.9354607647601278,
"step": 1080
},
{
"epoch": 1.159230563718942,
"grad_norm": 0.8000864386558533,
"learning_rate": 6.740760555033715e-06,
"loss": 0.14174835681915282,
"memory(GiB)": 31.36,
"step": 1085,
"token_acc": 0.9395288542253296,
"train_speed(iter/s)": 0.113249
},
{
"epoch": 1.1645738712262892,
"grad_norm": 0.6853452920913696,
"learning_rate": 6.714485193127126e-06,
"loss": 0.14102463722229003,
"memory(GiB)": 31.36,
"step": 1090,
"token_acc": 0.9465929419417791,
"train_speed(iter/s)": 0.11338
},
{
"epoch": 1.1699171787336362,
"grad_norm": 0.747848629951477,
"learning_rate": 6.688156065356845e-06,
"loss": 0.14443647861480713,
"memory(GiB)": 31.36,
"step": 1095,
"token_acc": 0.9509245187436677,
"train_speed(iter/s)": 0.113501
},
{
"epoch": 1.1752604862409832,
"grad_norm": 0.7009637355804443,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.1462648630142212,
"memory(GiB)": 31.36,
"step": 1100,
"token_acc": 0.9474907617117654,
"train_speed(iter/s)": 0.113649
},
{
"epoch": 1.1752604862409832,
"eval_loss": 0.1868010312318802,
"eval_runtime": 39.2141,
"eval_samples_per_second": 15.403,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9356494885157306,
"step": 1100
},
{
"epoch": 1.1806037937483302,
"grad_norm": 0.7919987440109253,
"learning_rate": 6.635339816587109e-06,
"loss": 0.14761772155761718,
"memory(GiB)": 31.36,
"step": 1105,
"token_acc": 0.9380447931623158,
"train_speed(iter/s)": 0.113238
},
{
"epoch": 1.1859471012556773,
"grad_norm": 0.7107034921646118,
"learning_rate": 6.60885435189314e-06,
"loss": 0.15119514465332032,
"memory(GiB)": 31.36,
"step": 1110,
"token_acc": 0.9427783975326465,
"train_speed(iter/s)": 0.11339
},
{
"epoch": 1.1912904087630243,
"grad_norm": 0.7467309236526489,
"learning_rate": 6.582318433894513e-06,
"loss": 0.13204342126846313,
"memory(GiB)": 31.36,
"step": 1115,
"token_acc": 0.9524959742351047,
"train_speed(iter/s)": 0.113534
},
{
"epoch": 1.1966337162703713,
"grad_norm": 0.7935456037521362,
"learning_rate": 6.555732894751548e-06,
"loss": 0.1459757924079895,
"memory(GiB)": 31.36,
"step": 1120,
"token_acc": 0.9519983083104251,
"train_speed(iter/s)": 0.113678
},
{
"epoch": 1.1966337162703713,
"eval_loss": 0.18683308362960815,
"eval_runtime": 39.1575,
"eval_samples_per_second": 15.425,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9356194642818847,
"step": 1120
},
{
"epoch": 1.2019770237777183,
"grad_norm": 0.7466067671775818,
"learning_rate": 6.529098568180672e-06,
"loss": 0.14143054485321044,
"memory(GiB)": 31.36,
"step": 1125,
"token_acc": 0.939937276954688,
"train_speed(iter/s)": 0.11325
},
{
"epoch": 1.2073203312850656,
"grad_norm": 0.7526156306266785,
"learning_rate": 6.502416289428282e-06,
"loss": 0.14170231819152831,
"memory(GiB)": 31.36,
"step": 1130,
"token_acc": 0.9475191453761503,
"train_speed(iter/s)": 0.113376
},
{
"epoch": 1.2126636387924126,
"grad_norm": 0.7008678913116455,
"learning_rate": 6.475686895244534e-06,
"loss": 0.14561245441436768,
"memory(GiB)": 31.36,
"step": 1135,
"token_acc": 0.9469662033072007,
"train_speed(iter/s)": 0.113525
},
{
"epoch": 1.2180069462997596,
"grad_norm": 0.7882223129272461,
"learning_rate": 6.448911223857124e-06,
"loss": 0.14698657989501954,
"memory(GiB)": 31.36,
"step": 1140,
"token_acc": 0.9527493782812931,
"train_speed(iter/s)": 0.113658
},
{
"epoch": 1.2180069462997596,
"eval_loss": 0.1868022382259369,
"eval_runtime": 39.1631,
"eval_samples_per_second": 15.423,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9359154174440799,
"step": 1140
},
{
"epoch": 1.2233502538071066,
"grad_norm": 0.7010296583175659,
"learning_rate": 6.422090114944982e-06,
"loss": 0.14752376079559326,
"memory(GiB)": 31.36,
"step": 1145,
"token_acc": 0.937754062131431,
"train_speed(iter/s)": 0.113221
},
{
"epoch": 1.2286935613144536,
"grad_norm": 0.710648238658905,
"learning_rate": 6.3952244096119535e-06,
"loss": 0.13726551532745362,
"memory(GiB)": 31.36,
"step": 1150,
"token_acc": 0.9516319057474925,
"train_speed(iter/s)": 0.113373
},
{
"epoch": 1.2340368688218006,
"grad_norm": 0.7825097441673279,
"learning_rate": 6.368314950360416e-06,
"loss": 0.151510751247406,
"memory(GiB)": 31.36,
"step": 1155,
"token_acc": 0.9457324403228576,
"train_speed(iter/s)": 0.113517
},
{
"epoch": 1.2393801763291477,
"grad_norm": 0.7237306833267212,
"learning_rate": 6.341362581064856e-06,
"loss": 0.14253956079483032,
"memory(GiB)": 31.36,
"step": 1160,
"token_acc": 0.9461406518010291,
"train_speed(iter/s)": 0.113677
},
{
"epoch": 1.2393801763291477,
"eval_loss": 0.18586544692516327,
"eval_runtime": 39.1092,
"eval_samples_per_second": 15.444,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9360355143794634,
"step": 1160
},
{
"epoch": 1.244723483836495,
"grad_norm": 0.7146082520484924,
"learning_rate": 6.314368146945418e-06,
"loss": 0.14136313199996947,
"memory(GiB)": 31.36,
"step": 1165,
"token_acc": 0.9388746238483078,
"train_speed(iter/s)": 0.113271
},
{
"epoch": 1.250066791343842,
"grad_norm": 0.7276601195335388,
"learning_rate": 6.28733249454138e-06,
"loss": 0.1453978180885315,
"memory(GiB)": 31.36,
"step": 1170,
"token_acc": 0.9472019757845913,
"train_speed(iter/s)": 0.113427
},
{
"epoch": 1.255410098851189,
"grad_norm": 0.7507435083389282,
"learning_rate": 6.260256471684622e-06,
"loss": 0.14081387519836425,
"memory(GiB)": 31.36,
"step": 1175,
"token_acc": 0.9456987966162278,
"train_speed(iter/s)": 0.113564
},
{
"epoch": 1.260753406358536,
"grad_norm": 0.6047825217247009,
"learning_rate": 6.233140927473033e-06,
"loss": 0.1298896551132202,
"memory(GiB)": 31.36,
"step": 1180,
"token_acc": 0.950142074581832,
"train_speed(iter/s)": 0.113686
},
{
"epoch": 1.260753406358536,
"eval_loss": 0.186279758810997,
"eval_runtime": 39.1529,
"eval_samples_per_second": 15.427,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9361170087284737,
"step": 1180
},
{
"epoch": 1.266096713865883,
"grad_norm": 0.7231855988502502,
"learning_rate": 6.205986712243876e-06,
"loss": 0.13684126138687133,
"memory(GiB)": 31.36,
"step": 1185,
"token_acc": 0.939052757793765,
"train_speed(iter/s)": 0.113247
},
{
"epoch": 1.27144002137323,
"grad_norm": 0.7016168832778931,
"learning_rate": 6.178794677547138e-06,
"loss": 0.15314276218414308,
"memory(GiB)": 31.36,
"step": 1190,
"token_acc": 0.9389803557822904,
"train_speed(iter/s)": 0.113385
},
{
"epoch": 1.276783328880577,
"grad_norm": 0.7309826612472534,
"learning_rate": 6.151565676118805e-06,
"loss": 0.13780862092971802,
"memory(GiB)": 31.36,
"step": 1195,
"token_acc": 0.9577650445215613,
"train_speed(iter/s)": 0.113507
},
{
"epoch": 1.282126636387924,
"grad_norm": 0.7305301427841187,
"learning_rate": 6.124300561854139e-06,
"loss": 0.13783036470413207,
"memory(GiB)": 31.36,
"step": 1200,
"token_acc": 0.9519318638739628,
"train_speed(iter/s)": 0.113619
},
{
"epoch": 1.282126636387924,
"eval_loss": 0.1861124336719513,
"eval_runtime": 39.1734,
"eval_samples_per_second": 15.419,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.9362371056638572,
"step": 1200
},
{
"epoch": 1.2874699438952713,
"grad_norm": 0.7063933610916138,
"learning_rate": 6.097000189780893e-06,
"loss": 0.1543891429901123,
"memory(GiB)": 31.36,
"step": 1205,
"token_acc": 0.9369793792821915,
"train_speed(iter/s)": 0.113257
},
{
"epoch": 1.2928132514026183,
"grad_norm": 0.7241778373718262,
"learning_rate": 6.0696654160324875e-06,
"loss": 0.13728095293045045,
"memory(GiB)": 31.36,
"step": 1210,
"token_acc": 0.9575441100155683,
"train_speed(iter/s)": 0.113381
},
{
"epoch": 1.2981565589099653,
"grad_norm": 0.7719012498855591,
"learning_rate": 6.042297097821184e-06,
"loss": 0.15218913555145264,
"memory(GiB)": 31.36,
"step": 1215,
"token_acc": 0.949528983015884,
"train_speed(iter/s)": 0.113494
},
{
"epoch": 1.3034998664173123,
"grad_norm": 0.6969226002693176,
"learning_rate": 6.014896093411181e-06,
"loss": 0.13368651866912842,
"memory(GiB)": 31.36,
"step": 1220,
"token_acc": 0.9476698598847714,
"train_speed(iter/s)": 0.113599
},
{
"epoch": 1.3034998664173123,
"eval_loss": 0.1850433051586151,
"eval_runtime": 39.2188,
"eval_samples_per_second": 15.401,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9363186000128675,
"step": 1220
},
{
"epoch": 1.3088431739246593,
"grad_norm": 0.6775336861610413,
"learning_rate": 5.987463262091715e-06,
"loss": 0.139385187625885,
"memory(GiB)": 31.36,
"step": 1225,
"token_acc": 0.9377400468384075,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 1.3141864814320063,
"grad_norm": 0.7808154821395874,
"learning_rate": 5.959999464150101e-06,
"loss": 0.1481320381164551,
"memory(GiB)": 31.36,
"step": 1230,
"token_acc": 0.9476690131491566,
"train_speed(iter/s)": 0.113336
},
{
"epoch": 1.3195297889393536,
"grad_norm": 0.7226517796516418,
"learning_rate": 5.932505560844766e-06,
"loss": 0.14821076393127441,
"memory(GiB)": 31.36,
"step": 1235,
"token_acc": 0.9464622560620092,
"train_speed(iter/s)": 0.113466
},
{
"epoch": 1.3248730964467006,
"grad_norm": 0.7376400828361511,
"learning_rate": 5.904982414378233e-06,
"loss": 0.13770921230316163,
"memory(GiB)": 31.36,
"step": 1240,
"token_acc": 0.9496229260935143,
"train_speed(iter/s)": 0.113615
},
{
"epoch": 1.3248730964467006,
"eval_loss": 0.18510028719902039,
"eval_runtime": 39.1646,
"eval_samples_per_second": 15.422,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9362156597825387,
"step": 1240
},
{
"epoch": 1.3302164039540476,
"grad_norm": 0.7447443008422852,
"learning_rate": 5.877430887870081e-06,
"loss": 0.14754925966262816,
"memory(GiB)": 31.36,
"step": 1245,
"token_acc": 0.9386804566572503,
"train_speed(iter/s)": 0.113234
},
{
"epoch": 1.3355597114613946,
"grad_norm": 0.7104332447052002,
"learning_rate": 5.849851845329884e-06,
"loss": 0.1406762719154358,
"memory(GiB)": 31.36,
"step": 1250,
"token_acc": 0.9495687504455057,
"train_speed(iter/s)": 0.113347
},
{
"epoch": 1.3409030189687416,
"grad_norm": 0.7238494753837585,
"learning_rate": 5.822246151630109e-06,
"loss": 0.13662366867065429,
"memory(GiB)": 31.36,
"step": 1255,
"token_acc": 0.9507098934354979,
"train_speed(iter/s)": 0.113485
},
{
"epoch": 1.3462463264760887,
"grad_norm": 0.7518407106399536,
"learning_rate": 5.794614672479e-06,
"loss": 0.14233107566833497,
"memory(GiB)": 31.36,
"step": 1260,
"token_acc": 0.9495046143399488,
"train_speed(iter/s)": 0.1136
},
{
"epoch": 1.3462463264760887,
"eval_loss": 0.18543414771556854,
"eval_runtime": 39.1743,
"eval_samples_per_second": 15.418,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.9364129618906689,
"step": 1260
},
{
"epoch": 1.3515896339834357,
"grad_norm": 0.7268742322921753,
"learning_rate": 5.766958274393428e-06,
"loss": 0.14559613466262816,
"memory(GiB)": 31.36,
"step": 1265,
"token_acc": 0.9380886914433095,
"train_speed(iter/s)": 0.113217
},
{
"epoch": 1.3569329414907827,
"grad_norm": 0.6919171810150146,
"learning_rate": 5.739277824671711e-06,
"loss": 0.1417681932449341,
"memory(GiB)": 31.36,
"step": 1270,
"token_acc": 0.9501291664041334,
"train_speed(iter/s)": 0.113322
},
{
"epoch": 1.3622762489981297,
"grad_norm": 0.7078922390937805,
"learning_rate": 5.711574191366427e-06,
"loss": 0.14929780960083008,
"memory(GiB)": 31.36,
"step": 1275,
"token_acc": 0.9459644322845417,
"train_speed(iter/s)": 0.113449
},
{
"epoch": 1.367619556505477,
"grad_norm": 0.768913745880127,
"learning_rate": 5.683848243257181e-06,
"loss": 0.14540610313415528,
"memory(GiB)": 31.36,
"step": 1280,
"token_acc": 0.9463650228774784,
"train_speed(iter/s)": 0.113575
},
{
"epoch": 1.367619556505477,
"eval_loss": 0.18363338708877563,
"eval_runtime": 38.9429,
"eval_samples_per_second": 15.51,
"eval_steps_per_second": 3.877,
"eval_token_acc": 0.9366574449376998,
"step": 1280
},
{
"epoch": 1.372962864012824,
"grad_norm": 0.7159769535064697,
"learning_rate": 5.656100849823366e-06,
"loss": 0.13922522068023682,
"memory(GiB)": 31.36,
"step": 1285,
"token_acc": 0.9396986067671311,
"train_speed(iter/s)": 0.113203
},
{
"epoch": 1.378306171520171,
"grad_norm": 0.6261381506919861,
"learning_rate": 5.628332881216899e-06,
"loss": 0.13264775276184082,
"memory(GiB)": 31.36,
"step": 1290,
"token_acc": 0.956436461236709,
"train_speed(iter/s)": 0.113332
},
{
"epoch": 1.383649479027518,
"grad_norm": 0.790125846862793,
"learning_rate": 5.600545208234927e-06,
"loss": 0.1441697359085083,
"memory(GiB)": 31.36,
"step": 1295,
"token_acc": 0.9488245412844036,
"train_speed(iter/s)": 0.113461
},
{
"epoch": 1.388992786534865,
"grad_norm": 0.7451324462890625,
"learning_rate": 5.57273870229252e-06,
"loss": 0.13834784030914307,
"memory(GiB)": 31.36,
"step": 1300,
"token_acc": 0.9505032488215059,
"train_speed(iter/s)": 0.11358
},
{
"epoch": 1.388992786534865,
"eval_loss": 0.18352170288562775,
"eval_runtime": 39.1528,
"eval_samples_per_second": 15.427,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9367603851680285,
"step": 1300
},
{
"epoch": 1.3943360940422123,
"grad_norm": 0.7569957375526428,
"learning_rate": 5.544914235395347e-06,
"loss": 0.15216903686523436,
"memory(GiB)": 31.36,
"step": 1305,
"token_acc": 0.9388893760687508,
"train_speed(iter/s)": 0.113241
},
{
"epoch": 1.3996794015495593,
"grad_norm": 0.7055838108062744,
"learning_rate": 5.517072680112332e-06,
"loss": 0.13284831047058104,
"memory(GiB)": 31.36,
"step": 1310,
"token_acc": 0.9498824853520474,
"train_speed(iter/s)": 0.113338
},
{
"epoch": 1.4050227090569063,
"grad_norm": 0.6958155035972595,
"learning_rate": 5.4892149095482815e-06,
"loss": 0.136586332321167,
"memory(GiB)": 31.36,
"step": 1315,
"token_acc": 0.9489724944672779,
"train_speed(iter/s)": 0.113435
},
{
"epoch": 1.4103660165642533,
"grad_norm": 0.6948981285095215,
"learning_rate": 5.46134179731651e-06,
"loss": 0.14353724718093872,
"memory(GiB)": 31.36,
"step": 1320,
"token_acc": 0.9483981258705838,
"train_speed(iter/s)": 0.11357
},
{
"epoch": 1.4103660165642533,
"eval_loss": 0.1832687258720398,
"eval_runtime": 39.1038,
"eval_samples_per_second": 15.446,
"eval_steps_per_second": 3.862,
"eval_token_acc": 0.9369748439812134,
"step": 1320
},
{
"epoch": 1.4157093240716003,
"grad_norm": 0.6944836378097534,
"learning_rate": 5.4334542175114495e-06,
"loss": 0.1423251748085022,
"memory(GiB)": 31.36,
"step": 1325,
"token_acc": 0.9394761855681909,
"train_speed(iter/s)": 0.11322
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.6609280705451965,
"learning_rate": 5.40555304468122e-06,
"loss": 0.13840043544769287,
"memory(GiB)": 31.36,
"step": 1330,
"token_acc": 0.9507696104136463,
"train_speed(iter/s)": 0.113316
},
{
"epoch": 1.4263959390862944,
"grad_norm": 0.7256921529769897,
"learning_rate": 5.377639153800229e-06,
"loss": 0.1384860634803772,
"memory(GiB)": 31.36,
"step": 1335,
"token_acc": 0.9545629784656056,
"train_speed(iter/s)": 0.113429
},
{
"epoch": 1.4317392465936414,
"grad_norm": 0.7888199687004089,
"learning_rate": 5.34971342024171e-06,
"loss": 0.14113259315490723,
"memory(GiB)": 31.36,
"step": 1340,
"token_acc": 0.954348504280911,
"train_speed(iter/s)": 0.113523
},
{
"epoch": 1.4317392465936414,
"eval_loss": 0.184078186750412,
"eval_runtime": 39.1268,
"eval_samples_per_second": 15.437,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9369877115100045,
"step": 1340
},
{
"epoch": 1.4370825541009884,
"grad_norm": 0.7354726791381836,
"learning_rate": 5.321776719750283e-06,
"loss": 0.1384582042694092,
"memory(GiB)": 31.36,
"step": 1345,
"token_acc": 0.9407986188960137,
"train_speed(iter/s)": 0.113178
},
{
"epoch": 1.4424258616083356,
"grad_norm": 0.6982942819595337,
"learning_rate": 5.29382992841449e-06,
"loss": 0.14179346561431885,
"memory(GiB)": 31.36,
"step": 1350,
"token_acc": 0.9446530872056015,
"train_speed(iter/s)": 0.113289
},
{
"epoch": 1.4477691691156827,
"grad_norm": 0.6804877519607544,
"learning_rate": 5.265873922639315e-06,
"loss": 0.13716717958450317,
"memory(GiB)": 31.36,
"step": 1355,
"token_acc": 0.9538438661710037,
"train_speed(iter/s)": 0.113405
},
{
"epoch": 1.4531124766230297,
"grad_norm": 0.7978929877281189,
"learning_rate": 5.237909579118713e-06,
"loss": 0.1416216015815735,
"memory(GiB)": 31.36,
"step": 1360,
"token_acc": 0.9511662976866557,
"train_speed(iter/s)": 0.113536
},
{
"epoch": 1.4531124766230297,
"eval_loss": 0.1834552139043808,
"eval_runtime": 39.1688,
"eval_samples_per_second": 15.42,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.9368547470458298,
"step": 1360
},
{
"epoch": 1.4584557841303767,
"grad_norm": 0.721110999584198,
"learning_rate": 5.209937774808098e-06,
"loss": 0.13820960521697997,
"memory(GiB)": 31.36,
"step": 1365,
"token_acc": 0.9396422402036548,
"train_speed(iter/s)": 0.113179
},
{
"epoch": 1.4637990916377237,
"grad_norm": 0.7276850938796997,
"learning_rate": 5.181959386896862e-06,
"loss": 0.14571261405944824,
"memory(GiB)": 31.36,
"step": 1370,
"token_acc": 0.9458817568637385,
"train_speed(iter/s)": 0.113285
},
{
"epoch": 1.4691423991450707,
"grad_norm": 0.6762943267822266,
"learning_rate": 5.153975292780852e-06,
"loss": 0.14370789527893066,
"memory(GiB)": 31.36,
"step": 1375,
"token_acc": 0.9516790861044546,
"train_speed(iter/s)": 0.113408
},
{
"epoch": 1.474485706652418,
"grad_norm": 0.7835574746131897,
"learning_rate": 5.125986370034862e-06,
"loss": 0.14546499252319336,
"memory(GiB)": 31.36,
"step": 1380,
"token_acc": 0.9492231661229368,
"train_speed(iter/s)": 0.113557
},
{
"epoch": 1.474485706652418,
"eval_loss": 0.18329627811908722,
"eval_runtime": 39.2746,
"eval_samples_per_second": 15.379,
"eval_steps_per_second": 3.845,
"eval_token_acc": 0.9371549893842888,
"step": 1380
},
{
"epoch": 1.479829014159765,
"grad_norm": 0.7045428156852722,
"learning_rate": 5.097993496385112e-06,
"loss": 0.14536089897155763,
"memory(GiB)": 31.36,
"step": 1385,
"token_acc": 0.9408298818336975,
"train_speed(iter/s)": 0.113213
},
{
"epoch": 1.485172321667112,
"grad_norm": 0.7122427225112915,
"learning_rate": 5.069997549681718e-06,
"loss": 0.1389164924621582,
"memory(GiB)": 31.36,
"step": 1390,
"token_acc": 0.949877300613497,
"train_speed(iter/s)": 0.113318
},
{
"epoch": 1.490515629174459,
"grad_norm": 0.7316491007804871,
"learning_rate": 5.041999407871168e-06,
"loss": 0.14822676181793212,
"memory(GiB)": 31.36,
"step": 1395,
"token_acc": 0.9491525423728814,
"train_speed(iter/s)": 0.113431
},
{
"epoch": 1.495858936681806,
"grad_norm": 0.7621902227401733,
"learning_rate": 5.01399994896879e-06,
"loss": 0.14519236087799073,
"memory(GiB)": 31.36,
"step": 1400,
"token_acc": 0.9512387720856776,
"train_speed(iter/s)": 0.113547
},
{
"epoch": 1.495858936681806,
"eval_loss": 0.1818138211965561,
"eval_runtime": 39.3086,
"eval_samples_per_second": 15.366,
"eval_steps_per_second": 3.841,
"eval_token_acc": 0.9370348924489051,
"step": 1400
},
{
"epoch": 1.501202244189153,
"grad_norm": 0.7017737627029419,
"learning_rate": 4.986000051031212e-06,
"loss": 0.13984346389770508,
"memory(GiB)": 31.36,
"step": 1405,
"token_acc": 0.9408215177889058,
"train_speed(iter/s)": 0.113226
},
{
"epoch": 1.5065455516965,
"grad_norm": 0.7313562035560608,
"learning_rate": 4.958000592128834e-06,
"loss": 0.13321598768234252,
"memory(GiB)": 31.36,
"step": 1410,
"token_acc": 0.9527582267752515,
"train_speed(iter/s)": 0.113333
},
{
"epoch": 1.511888859203847,
"grad_norm": 0.7831226587295532,
"learning_rate": 4.930002450318282e-06,
"loss": 0.1345110058784485,
"memory(GiB)": 31.36,
"step": 1415,
"token_acc": 0.9496566716124357,
"train_speed(iter/s)": 0.113428
},
{
"epoch": 1.517232166711194,
"grad_norm": 0.701524019241333,
"learning_rate": 4.9020065036148885e-06,
"loss": 0.14322736263275146,
"memory(GiB)": 31.36,
"step": 1420,
"token_acc": 0.9406170170956966,
"train_speed(iter/s)": 0.113546
},
{
"epoch": 1.517232166711194,
"eval_loss": 0.18205159902572632,
"eval_runtime": 39.1325,
"eval_samples_per_second": 15.435,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9375109910141757,
"step": 1420
},
{
"epoch": 1.5225754742185411,
"grad_norm": 0.6394919157028198,
"learning_rate": 4.874013629965138e-06,
"loss": 0.13802753686904906,
"memory(GiB)": 31.36,
"step": 1425,
"token_acc": 0.9402941276057832,
"train_speed(iter/s)": 0.113221
},
{
"epoch": 1.5279187817258884,
"grad_norm": 0.7372197508811951,
"learning_rate": 4.846024707219149e-06,
"loss": 0.1441117525100708,
"memory(GiB)": 31.36,
"step": 1430,
"token_acc": 0.9448832927481142,
"train_speed(iter/s)": 0.113341
},
{
"epoch": 1.5332620892332354,
"grad_norm": 0.6687447428703308,
"learning_rate": 4.818040613103139e-06,
"loss": 0.13662933111190795,
"memory(GiB)": 31.36,
"step": 1435,
"token_acc": 0.949387670379852,
"train_speed(iter/s)": 0.113439
},
{
"epoch": 1.5386053967405824,
"grad_norm": 0.7553586363792419,
"learning_rate": 4.790062225191902e-06,
"loss": 0.15836725234985352,
"memory(GiB)": 31.36,
"step": 1440,
"token_acc": 0.9384995877988458,
"train_speed(iter/s)": 0.113564
},
{
"epoch": 1.5386053967405824,
"eval_loss": 0.18115834891796112,
"eval_runtime": 39.2778,
"eval_samples_per_second": 15.378,
"eval_steps_per_second": 3.844,
"eval_token_acc": 0.9376825580647237,
"step": 1440
},
{
"epoch": 1.5439487042479296,
"grad_norm": 0.6660886406898499,
"learning_rate": 4.762090420881289e-06,
"loss": 0.1435617685317993,
"memory(GiB)": 31.36,
"step": 1445,
"token_acc": 0.9411263893262424,
"train_speed(iter/s)": 0.113249
},
{
"epoch": 1.5492920117552766,
"grad_norm": 0.6773468255996704,
"learning_rate": 4.734126077360685e-06,
"loss": 0.13354458808898925,
"memory(GiB)": 31.36,
"step": 1450,
"token_acc": 0.9558268311099924,
"train_speed(iter/s)": 0.113351
},
{
"epoch": 1.5546353192626237,
"grad_norm": 0.7248560786247253,
"learning_rate": 4.706170071585513e-06,
"loss": 0.1327458381652832,
"memory(GiB)": 31.36,
"step": 1455,
"token_acc": 0.955091649694501,
"train_speed(iter/s)": 0.113455
},
{
"epoch": 1.5599786267699707,
"grad_norm": 0.7063668966293335,
"learning_rate": 4.678223280249718e-06,
"loss": 0.12768800258636476,
"memory(GiB)": 31.36,
"step": 1460,
"token_acc": 0.949375866851595,
"train_speed(iter/s)": 0.113557
},
{
"epoch": 1.5599786267699707,
"eval_loss": 0.18129871785640717,
"eval_runtime": 39.1068,
"eval_samples_per_second": 15.445,
"eval_steps_per_second": 3.861,
"eval_token_acc": 0.9380900298097751,
"step": 1460
},
{
"epoch": 1.5653219342773177,
"grad_norm": 0.8191858530044556,
"learning_rate": 4.650286579758291e-06,
"loss": 0.13946748971939088,
"memory(GiB)": 31.36,
"step": 1465,
"token_acc": 0.9430553548200608,
"train_speed(iter/s)": 0.113238
},
{
"epoch": 1.5706652417846647,
"grad_norm": 0.7221189141273499,
"learning_rate": 4.622360846199772e-06,
"loss": 0.13773694038391113,
"memory(GiB)": 31.36,
"step": 1470,
"token_acc": 0.9456267929815197,
"train_speed(iter/s)": 0.113346
},
{
"epoch": 1.5760085492920117,
"grad_norm": 0.715904176235199,
"learning_rate": 4.594446955318781e-06,
"loss": 0.13796852827072142,
"memory(GiB)": 31.36,
"step": 1475,
"token_acc": 0.9508182349503215,
"train_speed(iter/s)": 0.113453
},
{
"epoch": 1.5813518567993587,
"grad_norm": 0.7929291129112244,
"learning_rate": 4.566545782488554e-06,
"loss": 0.14087553024291993,
"memory(GiB)": 31.36,
"step": 1480,
"token_acc": 0.9462326623398016,
"train_speed(iter/s)": 0.113554
},
{
"epoch": 1.5813518567993587,
"eval_loss": 0.18039724230766296,
"eval_runtime": 39.124,
"eval_samples_per_second": 15.438,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9378841493491175,
"step": 1480
},
{
"epoch": 1.5866951643067058,
"grad_norm": 0.7181767225265503,
"learning_rate": 4.53865820268349e-06,
"loss": 0.14211044311523438,
"memory(GiB)": 31.36,
"step": 1485,
"token_acc": 0.9420699925539836,
"train_speed(iter/s)": 0.113263
},
{
"epoch": 1.5920384718140528,
"grad_norm": 0.6460716724395752,
"learning_rate": 4.510785090451719e-06,
"loss": 0.13882654905319214,
"memory(GiB)": 31.36,
"step": 1490,
"token_acc": 0.9472210254200776,
"train_speed(iter/s)": 0.11337
},
{
"epoch": 1.5973817793213998,
"grad_norm": 0.7372190952301025,
"learning_rate": 4.482927319887669e-06,
"loss": 0.14314990043640136,
"memory(GiB)": 31.36,
"step": 1495,
"token_acc": 0.9522527490349996,
"train_speed(iter/s)": 0.113469
},
{
"epoch": 1.602725086828747,
"grad_norm": 0.7231627702713013,
"learning_rate": 4.455085764604653e-06,
"loss": 0.14776058197021485,
"memory(GiB)": 31.36,
"step": 1500,
"token_acc": 0.9497078741203028,
"train_speed(iter/s)": 0.113578
},
{
"epoch": 1.602725086828747,
"eval_loss": 0.1816372275352478,
"eval_runtime": 39.1733,
"eval_samples_per_second": 15.419,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.9376225095970319,
"step": 1500
},
{
"epoch": 1.608068394336094,
"grad_norm": 0.7648904323577881,
"learning_rate": 4.427261297707482e-06,
"loss": 0.14478824138641358,
"memory(GiB)": 31.36,
"step": 1505,
"token_acc": 0.9387511308018835,
"train_speed(iter/s)": 0.113249
},
{
"epoch": 1.613411701843441,
"grad_norm": 0.6977179646492004,
"learning_rate": 4.399454791765076e-06,
"loss": 0.14400074481964112,
"memory(GiB)": 31.36,
"step": 1510,
"token_acc": 0.9406537812945908,
"train_speed(iter/s)": 0.113346
},
{
"epoch": 1.618755009350788,
"grad_norm": 0.6725999116897583,
"learning_rate": 4.371667118783101e-06,
"loss": 0.14132678508758545,
"memory(GiB)": 31.36,
"step": 1515,
"token_acc": 0.9514303482587064,
"train_speed(iter/s)": 0.113465
},
{
"epoch": 1.6240983168581353,
"grad_norm": 0.7282098531723022,
"learning_rate": 4.343899150176635e-06,
"loss": 0.1421644926071167,
"memory(GiB)": 31.36,
"step": 1520,
"token_acc": 0.9474935470724086,
"train_speed(iter/s)": 0.113561
},
{
"epoch": 1.6240983168581353,
"eval_loss": 0.18081353604793549,
"eval_runtime": 39.1617,
"eval_samples_per_second": 15.423,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9381457891012032,
"step": 1520
},
{
"epoch": 1.6294416243654823,
"grad_norm": 0.7503587603569031,
"learning_rate": 4.316151756742821e-06,
"loss": 0.13501241207122802,
"memory(GiB)": 31.36,
"step": 1525,
"token_acc": 0.9416415493274551,
"train_speed(iter/s)": 0.113245
},
{
"epoch": 1.6347849318728294,
"grad_norm": 0.7951456308364868,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.14272716045379638,
"memory(GiB)": 31.36,
"step": 1530,
"token_acc": 0.9456025411951559,
"train_speed(iter/s)": 0.113354
},
{
"epoch": 1.6401282393801764,
"grad_norm": 0.728985607624054,
"learning_rate": 4.26072217532829e-06,
"loss": 0.1494640588760376,
"memory(GiB)": 31.36,
"step": 1535,
"token_acc": 0.9476955108993974,
"train_speed(iter/s)": 0.113456
},
{
"epoch": 1.6454715468875234,
"grad_norm": 0.7554564476013184,
"learning_rate": 4.233041725606573e-06,
"loss": 0.14563045501708985,
"memory(GiB)": 31.36,
"step": 1540,
"token_acc": 0.9460028304967885,
"train_speed(iter/s)": 0.113557
},
{
"epoch": 1.6454715468875234,
"eval_loss": 0.18041342496871948,
"eval_runtime": 39.1831,
"eval_samples_per_second": 15.415,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9377854982950524,
"step": 1540
},
{
"epoch": 1.6508148543948704,
"grad_norm": 0.7042314410209656,
"learning_rate": 4.205385327521002e-06,
"loss": 0.15157747268676758,
"memory(GiB)": 31.36,
"step": 1545,
"token_acc": 0.9420924680785825,
"train_speed(iter/s)": 0.113252
},
{
"epoch": 1.6561581619022174,
"grad_norm": 0.7284954786300659,
"learning_rate": 4.177753848369892e-06,
"loss": 0.13592784404754638,
"memory(GiB)": 31.36,
"step": 1550,
"token_acc": 0.9485677708433349,
"train_speed(iter/s)": 0.113347
},
{
"epoch": 1.6615014694095644,
"grad_norm": 0.7814568877220154,
"learning_rate": 4.1501481546701185e-06,
"loss": 0.13980913162231445,
"memory(GiB)": 31.36,
"step": 1555,
"token_acc": 0.9546170365068003,
"train_speed(iter/s)": 0.113447
},
{
"epoch": 1.6668447769169115,
"grad_norm": 0.7283998727798462,
"learning_rate": 4.12256911212992e-06,
"loss": 0.13263875246047974,
"memory(GiB)": 31.36,
"step": 1560,
"token_acc": 0.9517357901112563,
"train_speed(iter/s)": 0.113556
},
{
"epoch": 1.6668447769169115,
"eval_loss": 0.17983108758926392,
"eval_runtime": 39.1475,
"eval_samples_per_second": 15.429,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.938132921572412,
"step": 1560
},
{
"epoch": 1.6721880844242585,
"grad_norm": 0.7712817192077637,
"learning_rate": 4.095017585621767e-06,
"loss": 0.1369832158088684,
"memory(GiB)": 31.36,
"step": 1565,
"token_acc": 0.9403439916887915,
"train_speed(iter/s)": 0.113246
},
{
"epoch": 1.6775313919316055,
"grad_norm": 0.7404097318649292,
"learning_rate": 4.067494439155236e-06,
"loss": 0.14037466049194336,
"memory(GiB)": 31.36,
"step": 1570,
"token_acc": 0.9485796116828188,
"train_speed(iter/s)": 0.113344
},
{
"epoch": 1.6828746994389527,
"grad_norm": 0.7791383862495422,
"learning_rate": 4.0400005358499e-06,
"loss": 0.1371939778327942,
"memory(GiB)": 31.36,
"step": 1575,
"token_acc": 0.9578335949764522,
"train_speed(iter/s)": 0.113426
},
{
"epoch": 1.6882180069462998,
"grad_norm": 0.7537409663200378,
"learning_rate": 4.012536737908288e-06,
"loss": 0.1379605770111084,
"memory(GiB)": 31.36,
"step": 1580,
"token_acc": 0.9421140684410646,
"train_speed(iter/s)": 0.113529
},
{
"epoch": 1.6882180069462998,
"eval_loss": 0.1801919788122177,
"eval_runtime": 39.1291,
"eval_samples_per_second": 15.436,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9378026550001072,
"step": 1580
},
{
"epoch": 1.6935613144536468,
"grad_norm": 0.759289562702179,
"learning_rate": 3.985103906588821e-06,
"loss": 0.1377565622329712,
"memory(GiB)": 31.36,
"step": 1585,
"token_acc": 0.9409732617857719,
"train_speed(iter/s)": 0.113222
},
{
"epoch": 1.698904621960994,
"grad_norm": 0.6593434810638428,
"learning_rate": 3.957702902178816e-06,
"loss": 0.13015660047531127,
"memory(GiB)": 31.36,
"step": 1590,
"token_acc": 0.9526614173228346,
"train_speed(iter/s)": 0.11332
},
{
"epoch": 1.704247929468341,
"grad_norm": 0.7041330337524414,
"learning_rate": 3.930334583967514e-06,
"loss": 0.14423298835754395,
"memory(GiB)": 31.36,
"step": 1595,
"token_acc": 0.9480859417602792,
"train_speed(iter/s)": 0.113416
},
{
"epoch": 1.709591236975688,
"grad_norm": 0.6841444969177246,
"learning_rate": 3.902999810219109e-06,
"loss": 0.13458824157714844,
"memory(GiB)": 31.36,
"step": 1600,
"token_acc": 0.9575135610614279,
"train_speed(iter/s)": 0.113518
},
{
"epoch": 1.709591236975688,
"eval_loss": 0.17973561584949493,
"eval_runtime": 39.1386,
"eval_samples_per_second": 15.432,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9382272834502133,
"step": 1600
},
{
"epoch": 1.714934544483035,
"grad_norm": 0.8064398765563965,
"learning_rate": 3.875699438145862e-06,
"loss": 0.13843204975128173,
"memory(GiB)": 31.36,
"step": 1605,
"token_acc": 0.941116183732414,
"train_speed(iter/s)": 0.113215
},
{
"epoch": 1.720277851990382,
"grad_norm": 0.7203328609466553,
"learning_rate": 3.8484343238811976e-06,
"loss": 0.14074230194091797,
"memory(GiB)": 31.36,
"step": 1610,
"token_acc": 0.9515015593790518,
"train_speed(iter/s)": 0.113297
},
{
"epoch": 1.725621159497729,
"grad_norm": 0.7621043920516968,
"learning_rate": 3.821205322452863e-06,
"loss": 0.13528130054473878,
"memory(GiB)": 31.36,
"step": 1615,
"token_acc": 0.9499534380726367,
"train_speed(iter/s)": 0.11339
},
{
"epoch": 1.7309644670050761,
"grad_norm": 0.6963400840759277,
"learning_rate": 3.794013287756125e-06,
"loss": 0.13751909732818604,
"memory(GiB)": 31.36,
"step": 1620,
"token_acc": 0.9522881588161861,
"train_speed(iter/s)": 0.113481
},
{
"epoch": 1.7309644670050761,
"eval_loss": 0.17938879132270813,
"eval_runtime": 39.133,
"eval_samples_per_second": 15.435,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9384074288532888,
"step": 1620
},
{
"epoch": 1.7363077745124231,
"grad_norm": 0.7448264956474304,
"learning_rate": 3.766859072526969e-06,
"loss": 0.129533851146698,
"memory(GiB)": 31.36,
"step": 1625,
"token_acc": 0.9424371772034192,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 1.7416510820197701,
"grad_norm": 0.7266005873680115,
"learning_rate": 3.7397435283153795e-06,
"loss": 0.1342164993286133,
"memory(GiB)": 31.36,
"step": 1630,
"token_acc": 0.9512910597946584,
"train_speed(iter/s)": 0.113278
},
{
"epoch": 1.7469943895271172,
"grad_norm": 0.7142292857170105,
"learning_rate": 3.712667505458622e-06,
"loss": 0.14621845483779908,
"memory(GiB)": 31.36,
"step": 1635,
"token_acc": 0.9502799032760277,
"train_speed(iter/s)": 0.113376
},
{
"epoch": 1.7523376970344642,
"grad_norm": 0.6820980906486511,
"learning_rate": 3.685631853054583e-06,
"loss": 0.14358122348785402,
"memory(GiB)": 31.36,
"step": 1640,
"token_acc": 0.9519375470278405,
"train_speed(iter/s)": 0.113482
},
{
"epoch": 1.7523376970344642,
"eval_loss": 0.17889852821826935,
"eval_runtime": 39.0969,
"eval_samples_per_second": 15.449,
"eval_steps_per_second": 3.862,
"eval_token_acc": 0.9381586566299942,
"step": 1640
},
{
"epoch": 1.7576810045418114,
"grad_norm": 0.7781234979629517,
"learning_rate": 3.658637418935146e-06,
"loss": 0.13783912658691405,
"memory(GiB)": 31.36,
"step": 1645,
"token_acc": 0.9402854612580077,
"train_speed(iter/s)": 0.113198
},
{
"epoch": 1.7630243120491584,
"grad_norm": 0.7067362666130066,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.138936448097229,
"memory(GiB)": 31.36,
"step": 1650,
"token_acc": 0.9473799468168309,
"train_speed(iter/s)": 0.113308
},
{
"epoch": 1.7683676195565055,
"grad_norm": 0.7398570775985718,
"learning_rate": 3.6047755903880478e-06,
"loss": 0.14049469232559203,
"memory(GiB)": 31.36,
"step": 1655,
"token_acc": 0.9463533798334994,
"train_speed(iter/s)": 0.113405
},
{
"epoch": 1.7737109270638525,
"grad_norm": 0.8484781980514526,
"learning_rate": 3.577909885055019e-06,
"loss": 0.1409994840621948,
"memory(GiB)": 31.36,
"step": 1660,
"token_acc": 0.9477940181350656,
"train_speed(iter/s)": 0.113503
},
{
"epoch": 1.7737109270638525,
"eval_loss": 0.1786307543516159,
"eval_runtime": 39.1475,
"eval_samples_per_second": 15.429,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9385661283750456,
"step": 1660
},
{
"epoch": 1.7790542345711997,
"grad_norm": 0.7654802799224854,
"learning_rate": 3.5510887761428764e-06,
"loss": 0.13482067584991456,
"memory(GiB)": 31.36,
"step": 1665,
"token_acc": 0.9418829437383344,
"train_speed(iter/s)": 0.113194
},
{
"epoch": 1.7843975420785467,
"grad_norm": 0.7164176106452942,
"learning_rate": 3.524313104755468e-06,
"loss": 0.13556787967681885,
"memory(GiB)": 31.36,
"step": 1670,
"token_acc": 0.9485995797016571,
"train_speed(iter/s)": 0.113282
},
{
"epoch": 1.7897408495858937,
"grad_norm": 0.7502966523170471,
"learning_rate": 3.4975837105717203e-06,
"loss": 0.13461077213287354,
"memory(GiB)": 31.36,
"step": 1675,
"token_acc": 0.9522437216961712,
"train_speed(iter/s)": 0.113382
},
{
"epoch": 1.7950841570932408,
"grad_norm": 0.7419613003730774,
"learning_rate": 3.4709014318193298e-06,
"loss": 0.1423276662826538,
"memory(GiB)": 31.36,
"step": 1680,
"token_acc": 0.9488505932943192,
"train_speed(iter/s)": 0.113481
},
{
"epoch": 1.7950841570932408,
"eval_loss": 0.1787741631269455,
"eval_runtime": 39.2394,
"eval_samples_per_second": 15.393,
"eval_steps_per_second": 3.848,
"eval_token_acc": 0.9386218876664737,
"step": 1680
},
{
"epoch": 1.8004274646005878,
"grad_norm": 0.7690842151641846,
"learning_rate": 3.4442671052484545e-06,
"loss": 0.1434476137161255,
"memory(GiB)": 31.36,
"step": 1685,
"token_acc": 0.9390579069378391,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 1.8057707721079348,
"grad_norm": 0.7701799273490906,
"learning_rate": 3.4176815661054884e-06,
"loss": 0.13522175550460816,
"memory(GiB)": 31.36,
"step": 1690,
"token_acc": 0.9531173213642171,
"train_speed(iter/s)": 0.113311
},
{
"epoch": 1.8111140796152818,
"grad_norm": 0.7921913862228394,
"learning_rate": 3.3911456481068613e-06,
"loss": 0.13670728206634522,
"memory(GiB)": 31.36,
"step": 1695,
"token_acc": 0.9481580510992276,
"train_speed(iter/s)": 0.113396
},
{
"epoch": 1.8164573871226288,
"grad_norm": 0.8159610629081726,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.141351580619812,
"memory(GiB)": 31.36,
"step": 1700,
"token_acc": 0.9491429380932144,
"train_speed(iter/s)": 0.113491
},
{
"epoch": 1.8164573871226288,
"eval_loss": 0.17854392528533936,
"eval_runtime": 39.1874,
"eval_samples_per_second": 15.413,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.9385875742563641,
"step": 1700
},
{
"epoch": 1.8218006946299758,
"grad_norm": 0.7273773550987244,
"learning_rate": 3.3382260026017027e-06,
"loss": 0.1371569514274597,
"memory(GiB)": 31.36,
"step": 1705,
"token_acc": 0.9418744245136776,
"train_speed(iter/s)": 0.113203
},
{
"epoch": 1.8271440021373229,
"grad_norm": 0.7564848065376282,
"learning_rate": 3.311843934643157e-06,
"loss": 0.12437918186187744,
"memory(GiB)": 31.36,
"step": 1710,
"token_acc": 0.9571033210332104,
"train_speed(iter/s)": 0.113276
},
{
"epoch": 1.83248730964467,
"grad_norm": 0.7761486172676086,
"learning_rate": 3.2855148068728753e-06,
"loss": 0.14540971517562867,
"memory(GiB)": 31.36,
"step": 1715,
"token_acc": 0.9477377595488337,
"train_speed(iter/s)": 0.113367
},
{
"epoch": 1.8378306171520171,
"grad_norm": 0.7440487742424011,
"learning_rate": 3.2592394449662867e-06,
"loss": 0.13129628896713258,
"memory(GiB)": 31.36,
"step": 1720,
"token_acc": 0.950944535784988,
"train_speed(iter/s)": 0.113447
},
{
"epoch": 1.8378306171520171,
"eval_loss": 0.17832355201244354,
"eval_runtime": 39.1412,
"eval_samples_per_second": 15.431,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9387977438932853,
"step": 1720
},
{
"epoch": 1.8431739246593641,
"grad_norm": 0.7634470462799072,
"learning_rate": 3.233018672912731e-06,
"loss": 0.14330395460128784,
"memory(GiB)": 31.36,
"step": 1725,
"token_acc": 0.9432163730078285,
"train_speed(iter/s)": 0.113192
},
{
"epoch": 1.8485172321667112,
"grad_norm": 0.7993137836456299,
"learning_rate": 3.2068533129896273e-06,
"loss": 0.1466256022453308,
"memory(GiB)": 31.36,
"step": 1730,
"token_acc": 0.950088022429419,
"train_speed(iter/s)": 0.113299
},
{
"epoch": 1.8538605396740584,
"grad_norm": 0.7558074593544006,
"learning_rate": 3.1807441857366798e-06,
"loss": 0.13543074131011962,
"memory(GiB)": 31.36,
"step": 1735,
"token_acc": 0.9463475916166196,
"train_speed(iter/s)": 0.1134
},
{
"epoch": 1.8592038471814054,
"grad_norm": 0.7222068309783936,
"learning_rate": 3.1546921099301507e-06,
"loss": 0.13885715007781982,
"memory(GiB)": 31.36,
"step": 1740,
"token_acc": 0.9468914264999093,
"train_speed(iter/s)": 0.113485
},
{
"epoch": 1.8592038471814054,
"eval_loss": 0.1776435226202011,
"eval_runtime": 39.1268,
"eval_samples_per_second": 15.437,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9389221300049325,
"step": 1740
},
{
"epoch": 1.8645471546887524,
"grad_norm": 0.7544873356819153,
"learning_rate": 3.1286979025571817e-06,
"loss": 0.1199462890625,
"memory(GiB)": 31.36,
"step": 1745,
"token_acc": 0.942448560630803,
"train_speed(iter/s)": 0.113208
},
{
"epoch": 1.8698904621960994,
"grad_norm": 0.6838610768318176,
"learning_rate": 3.1027623787901706e-06,
"loss": 0.13257505893707275,
"memory(GiB)": 31.36,
"step": 1750,
"token_acc": 0.953539454854062,
"train_speed(iter/s)": 0.113284
},
{
"epoch": 1.8752337697034465,
"grad_norm": 0.7307048439979553,
"learning_rate": 3.076886351961217e-06,
"loss": 0.14348651170730592,
"memory(GiB)": 31.36,
"step": 1755,
"token_acc": 0.9522128203567298,
"train_speed(iter/s)": 0.113384
},
{
"epoch": 1.8805770772107935,
"grad_norm": 0.7680135369300842,
"learning_rate": 3.0510706335366034e-06,
"loss": 0.14112248420715331,
"memory(GiB)": 31.36,
"step": 1760,
"token_acc": 0.950036469730124,
"train_speed(iter/s)": 0.11347
},
{
"epoch": 1.8805770772107935,
"eval_loss": 0.17695675790309906,
"eval_runtime": 39.1442,
"eval_samples_per_second": 15.43,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9390207810589977,
"step": 1760
},
{
"epoch": 1.8859203847181405,
"grad_norm": 0.6436514258384705,
"learning_rate": 3.02531603309136e-06,
"loss": 0.13258137702941894,
"memory(GiB)": 31.36,
"step": 1765,
"token_acc": 0.9418506940943162,
"train_speed(iter/s)": 0.113198
},
{
"epoch": 1.8912636922254875,
"grad_norm": 0.783997654914856,
"learning_rate": 2.9996233582838686e-06,
"loss": 0.137099552154541,
"memory(GiB)": 31.36,
"step": 1770,
"token_acc": 0.9559140509228075,
"train_speed(iter/s)": 0.11329
},
{
"epoch": 1.8966069997328345,
"grad_norm": 0.6852074861526489,
"learning_rate": 2.973993414830534e-06,
"loss": 0.1261454463005066,
"memory(GiB)": 31.36,
"step": 1775,
"token_acc": 0.9524192587295912,
"train_speed(iter/s)": 0.113369
},
{
"epoch": 1.9019503072401815,
"grad_norm": 0.6981728076934814,
"learning_rate": 2.948427006480528e-06,
"loss": 0.1357527494430542,
"memory(GiB)": 31.36,
"step": 1780,
"token_acc": 0.9534122629704496,
"train_speed(iter/s)": 0.11347
},
{
"epoch": 1.9019503072401815,
"eval_loss": 0.17668980360031128,
"eval_runtime": 39.1514,
"eval_samples_per_second": 15.427,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9394754337429496,
"step": 1780
},
{
"epoch": 1.9072936147475286,
"grad_norm": 0.6986772418022156,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.1431878089904785,
"memory(GiB)": 31.36,
"step": 1785,
"token_acc": 0.9415058402368222,
"train_speed(iter/s)": 0.113218
},
{
"epoch": 1.9126369222548758,
"grad_norm": 0.6814377903938293,
"learning_rate": 2.897488000099788e-06,
"loss": 0.13923795223236085,
"memory(GiB)": 31.36,
"step": 1790,
"token_acc": 0.9478648950197789,
"train_speed(iter/s)": 0.113294
},
{
"epoch": 1.9179802297622228,
"grad_norm": 0.7366232872009277,
"learning_rate": 2.8721169995046503e-06,
"loss": 0.13349125385284424,
"memory(GiB)": 31.36,
"step": 1795,
"token_acc": 0.9429411168541604,
"train_speed(iter/s)": 0.113372
},
{
"epoch": 1.9233235372695698,
"grad_norm": 0.7991761565208435,
"learning_rate": 2.846812728833931e-06,
"loss": 0.13615771532058715,
"memory(GiB)": 31.36,
"step": 1800,
"token_acc": 0.9488062932585297,
"train_speed(iter/s)": 0.113453
},
{
"epoch": 1.9233235372695698,
"eval_loss": 0.17635242640972137,
"eval_runtime": 39.1984,
"eval_samples_per_second": 15.409,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9390679619978983,
"step": 1800
},
{
"epoch": 1.928666844776917,
"grad_norm": 0.7395577430725098,
"learning_rate": 2.8215759816237748e-06,
"loss": 0.1406429648399353,
"memory(GiB)": 31.36,
"step": 1805,
"token_acc": 0.9421045579401058,
"train_speed(iter/s)": 0.113193
},
{
"epoch": 1.934010152284264,
"grad_norm": 0.7564118504524231,
"learning_rate": 2.796407549292809e-06,
"loss": 0.13832550048828124,
"memory(GiB)": 31.36,
"step": 1810,
"token_acc": 0.956355867541584,
"train_speed(iter/s)": 0.113289
},
{
"epoch": 1.9393534597916111,
"grad_norm": 0.6458448767662048,
"learning_rate": 2.771308221117309e-06,
"loss": 0.13285930156707765,
"memory(GiB)": 31.36,
"step": 1815,
"token_acc": 0.950381946877847,
"train_speed(iter/s)": 0.113379
},
{
"epoch": 1.9446967672989581,
"grad_norm": 0.7599870562553406,
"learning_rate": 2.7462787842064753e-06,
"loss": 0.131211256980896,
"memory(GiB)": 31.36,
"step": 1820,
"token_acc": 0.9533532132424537,
"train_speed(iter/s)": 0.113463
},
{
"epoch": 1.9446967672989581,
"eval_loss": 0.17678451538085938,
"eval_runtime": 39.1954,
"eval_samples_per_second": 15.41,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9391751914044908,
"step": 1820
},
{
"epoch": 1.9500400748063051,
"grad_norm": 0.7510969042778015,
"learning_rate": 2.7213200234777215e-06,
"loss": 0.151234769821167,
"memory(GiB)": 31.36,
"step": 1825,
"token_acc": 0.9407468474954135,
"train_speed(iter/s)": 0.113216
},
{
"epoch": 1.9553833823136522,
"grad_norm": 0.7278915047645569,
"learning_rate": 2.696432721632082e-06,
"loss": 0.13203661441802977,
"memory(GiB)": 31.36,
"step": 1830,
"token_acc": 0.9518988171303122,
"train_speed(iter/s)": 0.113284
},
{
"epoch": 1.9607266898209992,
"grad_norm": 0.7572088241577148,
"learning_rate": 2.671617659129655e-06,
"loss": 0.14195291996002196,
"memory(GiB)": 31.36,
"step": 1835,
"token_acc": 0.9429287939813056,
"train_speed(iter/s)": 0.113373
},
{
"epoch": 1.9660699973283462,
"grad_norm": 0.7470288276672363,
"learning_rate": 2.646875614165121e-06,
"loss": 0.1265857696533203,
"memory(GiB)": 31.36,
"step": 1840,
"token_acc": 0.9504980895196506,
"train_speed(iter/s)": 0.113448
},
{
"epoch": 1.9660699973283462,
"eval_loss": 0.17609840631484985,
"eval_runtime": 39.2428,
"eval_samples_per_second": 15.391,
"eval_steps_per_second": 3.848,
"eval_token_acc": 0.9394668553904223,
"step": 1840
},
{
"epoch": 1.9714133048356932,
"grad_norm": 0.6941691637039185,
"learning_rate": 2.6222073626433587e-06,
"loss": 0.13350989818572997,
"memory(GiB)": 31.36,
"step": 1845,
"token_acc": 0.941098463918665,
"train_speed(iter/s)": 0.113192
},
{
"epoch": 1.9767566123430402,
"grad_norm": 0.7226253747940063,
"learning_rate": 2.597613678155092e-06,
"loss": 0.13629913330078125,
"memory(GiB)": 31.36,
"step": 1850,
"token_acc": 0.9471854356964505,
"train_speed(iter/s)": 0.113272
},
{
"epoch": 1.9820999198503872,
"grad_norm": 0.7369644641876221,
"learning_rate": 2.573095331952646e-06,
"loss": 0.1342089891433716,
"memory(GiB)": 31.36,
"step": 1855,
"token_acc": 0.9550353716804326,
"train_speed(iter/s)": 0.113379
},
{
"epoch": 1.9874432273577345,
"grad_norm": 0.71446293592453,
"learning_rate": 2.5486530929257574e-06,
"loss": 0.13259618282318114,
"memory(GiB)": 31.36,
"step": 1860,
"token_acc": 0.9522027151471942,
"train_speed(iter/s)": 0.113466
},
{
"epoch": 1.9874432273577345,
"eval_loss": 0.1756637990474701,
"eval_runtime": 39.2304,
"eval_samples_per_second": 15.396,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9394625662141586,
"step": 1860
},
{
"epoch": 1.9927865348650815,
"grad_norm": 0.7780827879905701,
"learning_rate": 2.5242877275774446e-06,
"loss": 0.1336849570274353,
"memory(GiB)": 31.36,
"step": 1865,
"token_acc": 0.9416884046261456,
"train_speed(iter/s)": 0.113224
},
{
"epoch": 1.9981298423724285,
"grad_norm": 0.6946832537651062,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.14616479873657226,
"memory(GiB)": 31.36,
"step": 1870,
"token_acc": 0.9527469722324645,
"train_speed(iter/s)": 0.113319
},
{
"epoch": 2.003205984504408,
"grad_norm": 0.6446176171302795,
"learning_rate": 2.475790671851007e-06,
"loss": 0.11167683601379394,
"memory(GiB)": 31.36,
"step": 1875,
"token_acc": 0.9654894371875504,
"train_speed(iter/s)": 0.113423
},
{
"epoch": 2.008549292011755,
"grad_norm": 0.6247937083244324,
"learning_rate": 2.4516605023294626e-06,
"loss": 0.10328346490859985,
"memory(GiB)": 31.36,
"step": 1880,
"token_acc": 0.966084815624222,
"train_speed(iter/s)": 0.11351
},
{
"epoch": 2.008549292011755,
"eval_loss": 0.17980773746967316,
"eval_runtime": 39.1557,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9395440605631689,
"step": 1880
},
{
"epoch": 2.013892599519102,
"grad_norm": 0.6584553718566895,
"learning_rate": 2.4276102481519655e-06,
"loss": 0.10475772619247437,
"memory(GiB)": 31.36,
"step": 1885,
"token_acc": 0.94671126227415,
"train_speed(iter/s)": 0.113271
},
{
"epoch": 2.019235907026449,
"grad_norm": 0.6800820827484131,
"learning_rate": 2.403640663528986e-06,
"loss": 0.09606801271438599,
"memory(GiB)": 31.36,
"step": 1890,
"token_acc": 0.9638240304639744,
"train_speed(iter/s)": 0.113359
},
{
"epoch": 2.0245792145337966,
"grad_norm": 0.6699435710906982,
"learning_rate": 2.379752500141222e-06,
"loss": 0.09734945297241211,
"memory(GiB)": 31.36,
"step": 1895,
"token_acc": 0.9634120335110434,
"train_speed(iter/s)": 0.113444
},
{
"epoch": 2.0299225220411437,
"grad_norm": 0.7269750833511353,
"learning_rate": 2.355946507116012e-06,
"loss": 0.0947374939918518,
"memory(GiB)": 31.36,
"step": 1900,
"token_acc": 0.9667849182180642,
"train_speed(iter/s)": 0.113522
},
{
"epoch": 2.0299225220411437,
"eval_loss": 0.19244976341724396,
"eval_runtime": 39.1428,
"eval_samples_per_second": 15.431,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9386519119003195,
"step": 1900
},
{
"epoch": 2.0352658295484907,
"grad_norm": 0.6670167446136475,
"learning_rate": 2.332223431003859e-06,
"loss": 0.09205610752105713,
"memory(GiB)": 31.36,
"step": 1905,
"token_acc": 0.9472729334391654,
"train_speed(iter/s)": 0.11327
},
{
"epoch": 2.0406091370558377,
"grad_norm": 0.8095514178276062,
"learning_rate": 2.3085840157550036e-06,
"loss": 0.09566161036491394,
"memory(GiB)": 31.36,
"step": 1910,
"token_acc": 0.9658257869771453,
"train_speed(iter/s)": 0.113366
},
{
"epoch": 2.0459524445631847,
"grad_norm": 0.7360612154006958,
"learning_rate": 2.2850290026961032e-06,
"loss": 0.10009205341339111,
"memory(GiB)": 31.36,
"step": 1915,
"token_acc": 0.9632088520055325,
"train_speed(iter/s)": 0.113459
},
{
"epoch": 2.0512957520705317,
"grad_norm": 0.8155742883682251,
"learning_rate": 2.2615591305069846e-06,
"loss": 0.09215841293334961,
"memory(GiB)": 31.36,
"step": 1920,
"token_acc": 0.968572231196011,
"train_speed(iter/s)": 0.113542
},
{
"epoch": 2.0512957520705317,
"eval_loss": 0.19029787182807922,
"eval_runtime": 39.1166,
"eval_samples_per_second": 15.441,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.9387848763644941,
"step": 1920
},
{
"epoch": 2.0566390595778787,
"grad_norm": 0.7107601761817932,
"learning_rate": 2.238175135197471e-06,
"loss": 0.09783934354782105,
"memory(GiB)": 31.36,
"step": 1925,
"token_acc": 0.9484311762913965,
"train_speed(iter/s)": 0.113297
},
{
"epoch": 2.0619823670852258,
"grad_norm": 0.7164185643196106,
"learning_rate": 2.2148777500843125e-06,
"loss": 0.10058900117874145,
"memory(GiB)": 31.36,
"step": 1930,
"token_acc": 0.9623356362825941,
"train_speed(iter/s)": 0.113372
},
{
"epoch": 2.0673256745925728,
"grad_norm": 0.7500390410423279,
"learning_rate": 2.1916677057681786e-06,
"loss": 0.10025620460510254,
"memory(GiB)": 31.36,
"step": 1935,
"token_acc": 0.9640934730056406,
"train_speed(iter/s)": 0.113457
},
{
"epoch": 2.07266898209992,
"grad_norm": 0.7709511518478394,
"learning_rate": 2.1685457301107506e-06,
"loss": 0.10047693252563476,
"memory(GiB)": 31.36,
"step": 1940,
"token_acc": 0.9648505046059044,
"train_speed(iter/s)": 0.113539
},
{
"epoch": 2.07266898209992,
"eval_loss": 0.19032922387123108,
"eval_runtime": 39.188,
"eval_samples_per_second": 15.413,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.9384202963820798,
"step": 1940
},
{
"epoch": 2.078012289607267,
"grad_norm": 0.6654588580131531,
"learning_rate": 2.145512548211902e-06,
"loss": 0.09436342120170593,
"memory(GiB)": 31.36,
"step": 1945,
"token_acc": 0.9468971724468446,
"train_speed(iter/s)": 0.113306
},
{
"epoch": 2.083355597114614,
"grad_norm": 0.7399053573608398,
"learning_rate": 2.1225688823869494e-06,
"loss": 0.09469984173774719,
"memory(GiB)": 31.36,
"step": 1950,
"token_acc": 0.9699618029029794,
"train_speed(iter/s)": 0.113384
},
{
"epoch": 2.088698904621961,
"grad_norm": 0.7095410227775574,
"learning_rate": 2.09971545214401e-06,
"loss": 0.09509609937667847,
"memory(GiB)": 31.36,
"step": 1955,
"token_acc": 0.9642617302694969,
"train_speed(iter/s)": 0.113448
},
{
"epoch": 2.094042212129308,
"grad_norm": 0.684771716594696,
"learning_rate": 2.0769529741614297e-06,
"loss": 0.09686210751533508,
"memory(GiB)": 31.36,
"step": 1960,
"token_acc": 0.9665719778485553,
"train_speed(iter/s)": 0.113535
},
{
"epoch": 2.094042212129308,
"eval_loss": 0.19170063734054565,
"eval_runtime": 39.1596,
"eval_samples_per_second": 15.424,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9384932123785626,
"step": 1960
},
{
"epoch": 2.0993855196366553,
"grad_norm": 0.6467424631118774,
"learning_rate": 2.054282162265313e-06,
"loss": 0.09031983613967895,
"memory(GiB)": 31.36,
"step": 1965,
"token_acc": 0.9467923722623636,
"train_speed(iter/s)": 0.113303
},
{
"epoch": 2.1047288271440023,
"grad_norm": 0.6366815567016602,
"learning_rate": 2.0317037274071412e-06,
"loss": 0.08445571660995484,
"memory(GiB)": 31.36,
"step": 1970,
"token_acc": 0.9674427290836654,
"train_speed(iter/s)": 0.113383
},
{
"epoch": 2.1100721346513494,
"grad_norm": 0.6350346207618713,
"learning_rate": 2.009218377641466e-06,
"loss": 0.0988619565963745,
"memory(GiB)": 31.36,
"step": 1975,
"token_acc": 0.9623879433545537,
"train_speed(iter/s)": 0.113457
},
{
"epoch": 2.1154154421586964,
"grad_norm": 0.6696274280548096,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.09375531673431396,
"memory(GiB)": 31.36,
"step": 1980,
"token_acc": 0.9656578045525053,
"train_speed(iter/s)": 0.113523
},
{
"epoch": 2.1154154421586964,
"eval_loss": 0.19252096116542816,
"eval_runtime": 39.2604,
"eval_samples_per_second": 15.384,
"eval_steps_per_second": 3.846,
"eval_token_acc": 0.9383945613244976,
"step": 1980
},
{
"epoch": 2.1207587496660434,
"grad_norm": 0.7472742199897766,
"learning_rate": 1.964529750988086e-06,
"loss": 0.09353007674217224,
"memory(GiB)": 31.36,
"step": 1985,
"token_acc": 0.9438841714662963,
"train_speed(iter/s)": 0.113283
},
{
"epoch": 2.1261020571733904,
"grad_norm": 0.7385168671607971,
"learning_rate": 1.9423278755254933e-06,
"loss": 0.09612951874732971,
"memory(GiB)": 31.36,
"step": 1990,
"token_acc": 0.962575335363878,
"train_speed(iter/s)": 0.113363
},
{
"epoch": 2.1314453646807374,
"grad_norm": 0.769904613494873,
"learning_rate": 1.9202218879616824e-06,
"loss": 0.09348126649856567,
"memory(GiB)": 31.36,
"step": 1995,
"token_acc": 0.9666374287325356,
"train_speed(iter/s)": 0.113436
},
{
"epoch": 2.1367886721880844,
"grad_norm": 0.7597969770431519,
"learning_rate": 1.8982124815353665e-06,
"loss": 0.09418823719024658,
"memory(GiB)": 31.37,
"step": 2000,
"token_acc": 0.9634146341463414,
"train_speed(iter/s)": 0.113511
},
{
"epoch": 2.1367886721880844,
"eval_loss": 0.1915539652109146,
"eval_runtime": 39.2063,
"eval_samples_per_second": 15.406,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9383216453280148,
"step": 2000
},
{
"epoch": 2.1421319796954315,
"grad_norm": 0.7184767127037048,
"learning_rate": 1.8763003464565022e-06,
"loss": 0.0999064326286316,
"memory(GiB)": 31.37,
"step": 2005,
"token_acc": 0.9464714514407684,
"train_speed(iter/s)": 0.113276
},
{
"epoch": 2.1474752872027785,
"grad_norm": 0.6783972382545471,
"learning_rate": 1.854486169884635e-06,
"loss": 0.09837267994880676,
"memory(GiB)": 31.37,
"step": 2010,
"token_acc": 0.9645275422436406,
"train_speed(iter/s)": 0.113352
},
{
"epoch": 2.1528185947101255,
"grad_norm": 0.746529221534729,
"learning_rate": 1.8327706359073526e-06,
"loss": 0.09520338773727417,
"memory(GiB)": 31.37,
"step": 2015,
"token_acc": 0.9642478360532348,
"train_speed(iter/s)": 0.113417
},
{
"epoch": 2.1581619022174725,
"grad_norm": 0.6819494366645813,
"learning_rate": 1.8111544255188402e-06,
"loss": 0.09425632357597351,
"memory(GiB)": 31.37,
"step": 2020,
"token_acc": 0.9684003992977867,
"train_speed(iter/s)": 0.113509
},
{
"epoch": 2.1581619022174725,
"eval_loss": 0.1915895640850067,
"eval_runtime": 39.2397,
"eval_samples_per_second": 15.393,
"eval_steps_per_second": 3.848,
"eval_token_acc": 0.9385103690836175,
"step": 2020
},
{
"epoch": 2.1635052097248195,
"grad_norm": 0.7457813024520874,
"learning_rate": 1.7896382165985094e-06,
"loss": 0.09581427574157715,
"memory(GiB)": 31.37,
"step": 2025,
"token_acc": 0.9465357277296783,
"train_speed(iter/s)": 0.113279
},
{
"epoch": 2.1688485172321665,
"grad_norm": 0.6739791631698608,
"learning_rate": 1.768222683889757e-06,
"loss": 0.09893481731414795,
"memory(GiB)": 31.37,
"step": 2030,
"token_acc": 0.9669017905588714,
"train_speed(iter/s)": 0.113367
},
{
"epoch": 2.1741918247395136,
"grad_norm": 0.6932037472724915,
"learning_rate": 1.746908498978791e-06,
"loss": 0.0985231876373291,
"memory(GiB)": 31.37,
"step": 2035,
"token_acc": 0.9686975154919789,
"train_speed(iter/s)": 0.113426
},
{
"epoch": 2.179535132246861,
"grad_norm": 0.7460088133811951,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.09795750975608826,
"memory(GiB)": 31.37,
"step": 2040,
"token_acc": 0.964236412083855,
"train_speed(iter/s)": 0.113512
},
{
"epoch": 2.179535132246861,
"eval_loss": 0.19136284291744232,
"eval_runtime": 39.234,
"eval_samples_per_second": 15.395,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9384631881447169,
"step": 2040
},
{
"epoch": 2.184878439754208,
"grad_norm": 0.7117043137550354,
"learning_rate": 1.7045868429828745e-06,
"loss": 0.09070048332214356,
"memory(GiB)": 31.37,
"step": 2045,
"token_acc": 0.9479196430533882,
"train_speed(iter/s)": 0.113281
},
{
"epoch": 2.190221747261555,
"grad_norm": 0.652874767780304,
"learning_rate": 1.6835806990953802e-06,
"loss": 0.09067975282669068,
"memory(GiB)": 31.37,
"step": 2050,
"token_acc": 0.9657613864524992,
"train_speed(iter/s)": 0.113343
},
{
"epoch": 2.195565054768902,
"grad_norm": 0.6861611604690552,
"learning_rate": 1.6626785573589667e-06,
"loss": 0.09590352177619935,
"memory(GiB)": 31.37,
"step": 2055,
"token_acc": 0.9615629645359949,
"train_speed(iter/s)": 0.113419
},
{
"epoch": 2.200908362276249,
"grad_norm": 0.6657945513725281,
"learning_rate": 1.6418810732600177e-06,
"loss": 0.0890547752380371,
"memory(GiB)": 31.37,
"step": 2060,
"token_acc": 0.968454143363673,
"train_speed(iter/s)": 0.113499
},
{
"epoch": 2.200908362276249,
"eval_loss": 0.19242902100086212,
"eval_runtime": 39.2335,
"eval_samples_per_second": 15.395,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9385103690836175,
"step": 2060
},
{
"epoch": 2.206251669783596,
"grad_norm": 0.7557697892189026,
"learning_rate": 1.6211888990028785e-06,
"loss": 0.09434689283370971,
"memory(GiB)": 31.37,
"step": 2065,
"token_acc": 0.9467688912388791,
"train_speed(iter/s)": 0.113268
},
{
"epoch": 2.211594977290943,
"grad_norm": 0.7953412532806396,
"learning_rate": 1.6006026834894068e-06,
"loss": 0.10123894214630128,
"memory(GiB)": 31.37,
"step": 2070,
"token_acc": 0.9602344454463481,
"train_speed(iter/s)": 0.113338
},
{
"epoch": 2.21693828479829,
"grad_norm": 0.720357358455658,
"learning_rate": 1.5801230722986104e-06,
"loss": 0.09082142114639283,
"memory(GiB)": 31.37,
"step": 2075,
"token_acc": 0.9670389539634977,
"train_speed(iter/s)": 0.113403
},
{
"epoch": 2.222281592305637,
"grad_norm": 0.6819139122962952,
"learning_rate": 1.5597507076664187e-06,
"loss": 0.088398015499115,
"memory(GiB)": 31.37,
"step": 2080,
"token_acc": 0.9697791553661371,
"train_speed(iter/s)": 0.11347
},
{
"epoch": 2.222281592305637,
"eval_loss": 0.1921994835138321,
"eval_runtime": 39.1975,
"eval_samples_per_second": 15.409,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9386047309614188,
"step": 2080
},
{
"epoch": 2.227624899812984,
"grad_norm": 0.7003424167633057,
"learning_rate": 1.5394862284655266e-06,
"loss": 0.09183210134506226,
"memory(GiB)": 31.37,
"step": 2085,
"token_acc": 0.9471144387997177,
"train_speed(iter/s)": 0.113237
},
{
"epoch": 2.232968207320331,
"grad_norm": 0.735797107219696,
"learning_rate": 1.5193302701853674e-06,
"loss": 0.09793744683265686,
"memory(GiB)": 31.37,
"step": 2090,
"token_acc": 0.9629603963826617,
"train_speed(iter/s)": 0.113318
},
{
"epoch": 2.238311514827678,
"grad_norm": 0.7035617828369141,
"learning_rate": 1.499283464912188e-06,
"loss": 0.09379619359970093,
"memory(GiB)": 31.37,
"step": 2095,
"token_acc": 0.9692680087017802,
"train_speed(iter/s)": 0.113392
},
{
"epoch": 2.2436548223350252,
"grad_norm": 0.653683602809906,
"learning_rate": 1.4793464413092161e-06,
"loss": 0.08412163257598877,
"memory(GiB)": 31.37,
"step": 2100,
"token_acc": 0.9694610598455018,
"train_speed(iter/s)": 0.113463
},
{
"epoch": 2.2436548223350252,
"eval_loss": 0.19128654897212982,
"eval_runtime": 39.2107,
"eval_samples_per_second": 15.404,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9386562010765832,
"step": 2100
},
{
"epoch": 2.2489981298423722,
"grad_norm": 0.720621645450592,
"learning_rate": 1.459519824596956e-06,
"loss": 0.09899259209632874,
"memory(GiB)": 31.37,
"step": 2105,
"token_acc": 0.9451508803713978,
"train_speed(iter/s)": 0.113243
},
{
"epoch": 2.2543414373497193,
"grad_norm": 0.6509672403335571,
"learning_rate": 1.4398042365335745e-06,
"loss": 0.09483298659324646,
"memory(GiB)": 31.37,
"step": 2110,
"token_acc": 0.9613455205736621,
"train_speed(iter/s)": 0.113303
},
{
"epoch": 2.2596847448570667,
"grad_norm": 0.7314789891242981,
"learning_rate": 1.4202002953954042e-06,
"loss": 0.0946409523487091,
"memory(GiB)": 31.37,
"step": 2115,
"token_acc": 0.9664519265832017,
"train_speed(iter/s)": 0.11336
},
{
"epoch": 2.2650280523644137,
"grad_norm": 0.7073846459388733,
"learning_rate": 1.4007086159575595e-06,
"loss": 0.0916548490524292,
"memory(GiB)": 31.37,
"step": 2120,
"token_acc": 0.967791956065256,
"train_speed(iter/s)": 0.113428
},
{
"epoch": 2.2650280523644137,
"eval_loss": 0.1914122849702835,
"eval_runtime": 39.2346,
"eval_samples_per_second": 15.395,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9387119603680113,
"step": 2120
},
{
"epoch": 2.2703713598717608,
"grad_norm": 0.744266152381897,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.10223530530929566,
"memory(GiB)": 31.37,
"step": 2125,
"token_acc": 0.9450423116125821,
"train_speed(iter/s)": 0.11321
},
{
"epoch": 2.2757146673791078,
"grad_norm": 0.7458162307739258,
"learning_rate": 1.362064483661617e-06,
"loss": 0.0993034303188324,
"memory(GiB)": 31.37,
"step": 2130,
"token_acc": 0.9647883245497191,
"train_speed(iter/s)": 0.113282
},
{
"epoch": 2.281057974886455,
"grad_norm": 0.7311099767684937,
"learning_rate": 1.3429132426746743e-06,
"loss": 0.08791648149490357,
"memory(GiB)": 31.37,
"step": 2135,
"token_acc": 0.9718640093786636,
"train_speed(iter/s)": 0.11335
},
{
"epoch": 2.286401282393802,
"grad_norm": 0.6718652248382568,
"learning_rate": 1.3238766870923592e-06,
"loss": 0.09885276556015014,
"memory(GiB)": 31.37,
"step": 2140,
"token_acc": 0.9679231605654223,
"train_speed(iter/s)": 0.113429
},
{
"epoch": 2.286401282393802,
"eval_loss": 0.1916726678609848,
"eval_runtime": 39.2123,
"eval_samples_per_second": 15.403,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9388191897746038,
"step": 2140
},
{
"epoch": 2.291744589901149,
"grad_norm": 0.7811794877052307,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.09513717889785767,
"memory(GiB)": 31.37,
"step": 2145,
"token_acc": 0.9481464839361942,
"train_speed(iter/s)": 0.113213
},
{
"epoch": 2.297087897408496,
"grad_norm": 0.7876958847045898,
"learning_rate": 1.286150016454511e-06,
"loss": 0.09975624084472656,
"memory(GiB)": 31.37,
"step": 2150,
"token_acc": 0.9664041395578892,
"train_speed(iter/s)": 0.113297
},
{
"epoch": 2.302431204915843,
"grad_norm": 0.6765829920768738,
"learning_rate": 1.267461084498744e-06,
"loss": 0.09555368423461914,
"memory(GiB)": 31.37,
"step": 2155,
"token_acc": 0.9689418005736669,
"train_speed(iter/s)": 0.113362
},
{
"epoch": 2.30777451242319,
"grad_norm": 0.7572174668312073,
"learning_rate": 1.2488892041100364e-06,
"loss": 0.09787599444389343,
"memory(GiB)": 31.37,
"step": 2160,
"token_acc": 0.9644865820343885,
"train_speed(iter/s)": 0.113441
},
{
"epoch": 2.30777451242319,
"eval_loss": 0.19186273217201233,
"eval_runtime": 39.232,
"eval_samples_per_second": 15.396,
"eval_steps_per_second": 3.849,
"eval_token_acc": 0.9388191897746038,
"step": 2160
},
{
"epoch": 2.313117819930537,
"grad_norm": 0.7487595677375793,
"learning_rate": 1.2304349576983094e-06,
"loss": 0.09345256090164185,
"memory(GiB)": 31.37,
"step": 2165,
"token_acc": 0.9464117197681211,
"train_speed(iter/s)": 0.113226
},
{
"epoch": 2.318461127437884,
"grad_norm": 0.7937645316123962,
"learning_rate": 1.2120989239845149e-06,
"loss": 0.09632455110549927,
"memory(GiB)": 31.37,
"step": 2170,
"token_acc": 0.9689201004033055,
"train_speed(iter/s)": 0.113299
},
{
"epoch": 2.323804434945231,
"grad_norm": 0.7212129831314087,
"learning_rate": 1.1938816779824753e-06,
"loss": 0.09377689361572265,
"memory(GiB)": 31.37,
"step": 2175,
"token_acc": 0.9685556654016647,
"train_speed(iter/s)": 0.113374
},
{
"epoch": 2.3291477424525784,
"grad_norm": 0.6610186696052551,
"learning_rate": 1.1757837909808628e-06,
"loss": 0.09794212579727173,
"memory(GiB)": 31.37,
"step": 2180,
"token_acc": 0.9649737302977233,
"train_speed(iter/s)": 0.113446
},
{
"epoch": 2.3291477424525784,
"eval_loss": 0.1913762092590332,
"eval_runtime": 39.2194,
"eval_samples_per_second": 15.401,
"eval_steps_per_second": 3.85,
"eval_token_acc": 0.9388835274185593,
"step": 2180
},
{
"epoch": 2.334491049959925,
"grad_norm": 0.6436466574668884,
"learning_rate": 1.157805830525275e-06,
"loss": 0.08866640329360961,
"memory(GiB)": 31.37,
"step": 2185,
"token_acc": 0.9485147195950446,
"train_speed(iter/s)": 0.113237
},
{
"epoch": 2.3398343574672724,
"grad_norm": 0.7666484713554382,
"learning_rate": 1.1399483604004403e-06,
"loss": 0.08878711462020875,
"memory(GiB)": 31.37,
"step": 2190,
"token_acc": 0.9614538598512622,
"train_speed(iter/s)": 0.113307
},
{
"epoch": 2.3451776649746194,
"grad_norm": 0.7943670153617859,
"learning_rate": 1.1222119406125426e-06,
"loss": 0.09242654442787171,
"memory(GiB)": 31.37,
"step": 2195,
"token_acc": 0.96523288032722,
"train_speed(iter/s)": 0.113373
},
{
"epoch": 2.3505209724819665,
"grad_norm": 0.755363941192627,
"learning_rate": 1.1045971273716476e-06,
"loss": 0.10125420093536378,
"memory(GiB)": 31.37,
"step": 2200,
"token_acc": 0.964818502602802,
"train_speed(iter/s)": 0.113435
},
{
"epoch": 2.3505209724819665,
"eval_loss": 0.19072869420051575,
"eval_runtime": 39.1742,
"eval_samples_per_second": 15.418,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.938943575886251,
"step": 2200
},
{
"epoch": 2.3558642799893135,
"grad_norm": 0.7719199657440186,
"learning_rate": 1.0871044730742752e-06,
"loss": 0.0994708001613617,
"memory(GiB)": 31.37,
"step": 2205,
"token_acc": 0.9452918677716055,
"train_speed(iter/s)": 0.113231
},
{
"epoch": 2.3612075874966605,
"grad_norm": 0.7336219549179077,
"learning_rate": 1.0697345262860638e-06,
"loss": 0.09733407497406006,
"memory(GiB)": 31.37,
"step": 2210,
"token_acc": 0.962336711024295,
"train_speed(iter/s)": 0.1133
},
{
"epoch": 2.3665508950040075,
"grad_norm": 0.7070329189300537,
"learning_rate": 1.0524878317245713e-06,
"loss": 0.08725832104682922,
"memory(GiB)": 31.37,
"step": 2215,
"token_acc": 0.9673966575828217,
"train_speed(iter/s)": 0.113371
},
{
"epoch": 2.3718942025113545,
"grad_norm": 0.7620383501052856,
"learning_rate": 1.0353649302421982e-06,
"loss": 0.08947555422782898,
"memory(GiB)": 31.37,
"step": 2220,
"token_acc": 0.9708626514987136,
"train_speed(iter/s)": 0.113435
},
{
"epoch": 2.3718942025113545,
"eval_loss": 0.19121414422988892,
"eval_runtime": 39.1416,
"eval_samples_per_second": 15.431,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9391365888181175,
"step": 2220
},
{
"epoch": 2.3772375100187015,
"grad_norm": 0.6480873227119446,
"learning_rate": 1.0183663588092214e-06,
"loss": 0.0861007571220398,
"memory(GiB)": 31.37,
"step": 2225,
"token_acc": 0.9472103487064117,
"train_speed(iter/s)": 0.11322
},
{
"epoch": 2.3825808175260486,
"grad_norm": 0.7264376878738403,
"learning_rate": 1.0014926504969535e-06,
"loss": 0.09383871555328369,
"memory(GiB)": 31.37,
"step": 2230,
"token_acc": 0.968945743273048,
"train_speed(iter/s)": 0.113288
},
{
"epoch": 2.3879241250333956,
"grad_norm": 0.6731327772140503,
"learning_rate": 9.847443344610296e-07,
"loss": 0.09176123142242432,
"memory(GiB)": 31.37,
"step": 2235,
"token_acc": 0.9693463125322331,
"train_speed(iter/s)": 0.113364
},
{
"epoch": 2.3932674325407426,
"grad_norm": 0.8005653023719788,
"learning_rate": 9.681219359248106e-07,
"loss": 0.09590315818786621,
"memory(GiB)": 31.37,
"step": 2240,
"token_acc": 0.9636749520427738,
"train_speed(iter/s)": 0.113431
},
{
"epoch": 2.3932674325407426,
"eval_loss": 0.19097845256328583,
"eval_runtime": 39.2123,
"eval_samples_per_second": 15.403,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9388963949473503,
"step": 2240
},
{
"epoch": 2.3986107400480896,
"grad_norm": 0.7299765348434448,
"learning_rate": 9.516259761629148e-07,
"loss": 0.08505445718765259,
"memory(GiB)": 31.37,
"step": 2245,
"token_acc": 0.944954128440367,
"train_speed(iter/s)": 0.113227
},
{
"epoch": 2.4039540475554366,
"grad_norm": 0.7002612948417664,
"learning_rate": 9.352569724848715e-07,
"loss": 0.08956900835037232,
"memory(GiB)": 31.37,
"step": 2250,
"token_acc": 0.9705892762342948,
"train_speed(iter/s)": 0.113285
},
{
"epoch": 2.409297355062784,
"grad_norm": 0.7428932785987854,
"learning_rate": 9.190154382188921e-07,
"loss": 0.09664742350578308,
"memory(GiB)": 31.37,
"step": 2255,
"token_acc": 0.9705281875658588,
"train_speed(iter/s)": 0.113348
},
{
"epoch": 2.414640662570131,
"grad_norm": 0.6886446475982666,
"learning_rate": 9.029018826957775e-07,
"loss": 0.09689427018165589,
"memory(GiB)": 31.37,
"step": 2260,
"token_acc": 0.960169941582581,
"train_speed(iter/s)": 0.113411
},
{
"epoch": 2.414640662570131,
"eval_loss": 0.1908179074525833,
"eval_runtime": 39.2091,
"eval_samples_per_second": 15.405,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9391408779943812,
"step": 2260
},
{
"epoch": 2.419983970077478,
"grad_norm": 0.7271323800086975,
"learning_rate": 8.86916811232944e-07,
"loss": 0.09017704725265503,
"memory(GiB)": 31.37,
"step": 2265,
"token_acc": 0.9487460692263789,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 2.425327277584825,
"grad_norm": 0.7703067064285278,
"learning_rate": 8.710607251185799e-07,
"loss": 0.09243172407150269,
"memory(GiB)": 31.37,
"step": 2270,
"token_acc": 0.9636782618237683,
"train_speed(iter/s)": 0.113273
},
{
"epoch": 2.430670585092172,
"grad_norm": 0.7155824899673462,
"learning_rate": 8.553341215959215e-07,
"loss": 0.09737263917922974,
"memory(GiB)": 31.37,
"step": 2275,
"token_acc": 0.9632417350679687,
"train_speed(iter/s)": 0.113346
},
{
"epoch": 2.436013892599519,
"grad_norm": 0.7440068125724792,
"learning_rate": 8.397374938476594e-07,
"loss": 0.09268940091133118,
"memory(GiB)": 31.37,
"step": 2280,
"token_acc": 0.9701119443538746,
"train_speed(iter/s)": 0.11341
},
{
"epoch": 2.436013892599519,
"eval_loss": 0.190630704164505,
"eval_runtime": 39.2105,
"eval_samples_per_second": 15.404,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9391837697570181,
"step": 2280
},
{
"epoch": 2.441357200106866,
"grad_norm": 0.7325111031532288,
"learning_rate": 8.242713309804729e-07,
"loss": 0.09075909256935119,
"memory(GiB)": 31.37,
"step": 2285,
"token_acc": 0.947367209794716,
"train_speed(iter/s)": 0.113205
},
{
"epoch": 2.446700507614213,
"grad_norm": 0.757805585861206,
"learning_rate": 8.089361180096927e-07,
"loss": 0.09486221075057984,
"memory(GiB)": 31.37,
"step": 2290,
"token_acc": 0.9661316211878009,
"train_speed(iter/s)": 0.113267
},
{
"epoch": 2.45204381512156,
"grad_norm": 0.7468327283859253,
"learning_rate": 7.937323358440935e-07,
"loss": 0.09470909833908081,
"memory(GiB)": 31.37,
"step": 2295,
"token_acc": 0.9667186525265128,
"train_speed(iter/s)": 0.113336
},
{
"epoch": 2.4573871226289072,
"grad_norm": 0.7450569868087769,
"learning_rate": 7.786604612708093e-07,
"loss": 0.09495973587036133,
"memory(GiB)": 31.37,
"step": 2300,
"token_acc": 0.9614005227645736,
"train_speed(iter/s)": 0.113403
},
{
"epoch": 2.4573871226289072,
"eval_loss": 0.19077461957931519,
"eval_runtime": 39.2014,
"eval_samples_per_second": 15.408,
"eval_steps_per_second": 3.852,
"eval_token_acc": 0.9391537455231723,
"step": 2300
},
{
"epoch": 2.4627304301362543,
"grad_norm": 0.6540588736534119,
"learning_rate": 7.637209669403789e-07,
"loss": 0.0869560956954956,
"memory(GiB)": 31.37,
"step": 2305,
"token_acc": 0.9489500731715618,
"train_speed(iter/s)": 0.113205
},
{
"epoch": 2.4680737376436013,
"grad_norm": 0.738757848739624,
"learning_rate": 7.489143213519301e-07,
"loss": 0.09310966730117798,
"memory(GiB)": 31.37,
"step": 2310,
"token_acc": 0.9699513919575784,
"train_speed(iter/s)": 0.113277
},
{
"epoch": 2.4734170451509483,
"grad_norm": 0.6581406593322754,
"learning_rate": 7.342409888384816e-07,
"loss": 0.08957692980766296,
"memory(GiB)": 31.37,
"step": 2315,
"token_acc": 0.9666096166887331,
"train_speed(iter/s)": 0.113338
},
{
"epoch": 2.4787603526582953,
"grad_norm": 0.6974185705184937,
"learning_rate": 7.197014295523879e-07,
"loss": 0.08715896606445313,
"memory(GiB)": 31.37,
"step": 2320,
"token_acc": 0.9711248792133235,
"train_speed(iter/s)": 0.113406
},
{
"epoch": 2.4787603526582953,
"eval_loss": 0.19092287123203278,
"eval_runtime": 39.2106,
"eval_samples_per_second": 15.404,
"eval_steps_per_second": 3.851,
"eval_token_acc": 0.9391966372858093,
"step": 2320
},
{
"epoch": 2.4841036601656423,
"grad_norm": 0.7533718347549438,
"learning_rate": 7.052960994509056e-07,
"loss": 0.09223737120628357,
"memory(GiB)": 31.37,
"step": 2325,
"token_acc": 0.9484453382745063,
"train_speed(iter/s)": 0.113199
},
{
"epoch": 2.48944696767299,
"grad_norm": 0.7056859731674194,
"learning_rate": 6.910254502818914e-07,
"loss": 0.08803938627243042,
"memory(GiB)": 31.37,
"step": 2330,
"token_acc": 0.9710507958653053,
"train_speed(iter/s)": 0.113258
},
{
"epoch": 2.494790275180337,
"grad_norm": 0.7328131198883057,
"learning_rate": 6.768899295696413e-07,
"loss": 0.09180974960327148,
"memory(GiB)": 31.37,
"step": 2335,
"token_acc": 0.9653558350581117,
"train_speed(iter/s)": 0.113316
},
{
"epoch": 2.500133582687684,
"grad_norm": 0.7516103982925415,
"learning_rate": 6.628899806008515e-07,
"loss": 0.09033479690551757,
"memory(GiB)": 31.37,
"step": 2340,
"token_acc": 0.9666970260959094,
"train_speed(iter/s)": 0.113382
},
{
"epoch": 2.500133582687684,
"eval_loss": 0.19086231291294098,
"eval_runtime": 39.1889,
"eval_samples_per_second": 15.413,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.939299577516138,
"step": 2340
},
{
"epoch": 2.505476890195031,
"grad_norm": 0.8012656569480896,
"learning_rate": 6.490260424107231e-07,
"loss": 0.09262714982032776,
"memory(GiB)": 31.37,
"step": 2345,
"token_acc": 0.9465704224528235,
"train_speed(iter/s)": 0.113185
},
{
"epoch": 2.510820197702378,
"grad_norm": 0.6409561634063721,
"learning_rate": 6.352985497691883e-07,
"loss": 0.09137773513793945,
"memory(GiB)": 31.37,
"step": 2350,
"token_acc": 0.9675587467362924,
"train_speed(iter/s)": 0.113254
},
{
"epoch": 2.516163505209725,
"grad_norm": 0.790011465549469,
"learning_rate": 6.217079331672777e-07,
"loss": 0.10272359848022461,
"memory(GiB)": 31.37,
"step": 2355,
"token_acc": 0.9673232662587969,
"train_speed(iter/s)": 0.113325
},
{
"epoch": 2.521506812717072,
"grad_norm": 0.7116812467575073,
"learning_rate": 6.082546188036204e-07,
"loss": 0.09815052747726441,
"memory(GiB)": 31.37,
"step": 2360,
"token_acc": 0.9669859985261606,
"train_speed(iter/s)": 0.113389
},
{
"epoch": 2.521506812717072,
"eval_loss": 0.19036073982715607,
"eval_runtime": 39.1911,
"eval_samples_per_second": 15.412,
"eval_steps_per_second": 3.853,
"eval_token_acc": 0.9394539878616311,
"step": 2360
},
{
"epoch": 2.526850120224419,
"grad_norm": 0.755511462688446,
"learning_rate": 5.949390285710777e-07,
"loss": 0.09070051908493042,
"memory(GiB)": 31.37,
"step": 2365,
"token_acc": 0.9475465313028765,
"train_speed(iter/s)": 0.113185
},
{
"epoch": 2.532193427731766,
"grad_norm": 0.694186806678772,
"learning_rate": 5.817615800435167e-07,
"loss": 0.09250964522361756,
"memory(GiB)": 31.37,
"step": 2370,
"token_acc": 0.9697894963718658,
"train_speed(iter/s)": 0.113247
},
{
"epoch": 2.537536735239113,
"grad_norm": 0.7912867665290833,
"learning_rate": 5.687226864627115e-07,
"loss": 0.10275306701660156,
"memory(GiB)": 31.37,
"step": 2375,
"token_acc": 0.9677234207772039,
"train_speed(iter/s)": 0.113304
},
{
"epoch": 2.54288004274646,
"grad_norm": 0.7408064007759094,
"learning_rate": 5.558227567253832e-07,
"loss": 0.09581139087677001,
"memory(GiB)": 31.37,
"step": 2380,
"token_acc": 0.9702766420961533,
"train_speed(iter/s)": 0.113383
},
{
"epoch": 2.54288004274646,
"eval_loss": 0.19033583998680115,
"eval_runtime": 39.1446,
"eval_samples_per_second": 15.43,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9392781316348195,
"step": 2380
},
{
"epoch": 2.548223350253807,
"grad_norm": 0.6932405829429626,
"learning_rate": 5.430621953703785e-07,
"loss": 0.09184646606445312,
"memory(GiB)": 31.37,
"step": 2385,
"token_acc": 0.9487018329752709,
"train_speed(iter/s)": 0.113172
},
{
"epoch": 2.553566657761154,
"grad_norm": 0.6893176436424255,
"learning_rate": 5.304414025659832e-07,
"loss": 0.08671947121620179,
"memory(GiB)": 31.37,
"step": 2390,
"token_acc": 0.9663333126281135,
"train_speed(iter/s)": 0.113228
},
{
"epoch": 2.5589099652685015,
"grad_norm": 0.6595478057861328,
"learning_rate": 5.179607740973764e-07,
"loss": 0.09736074805259705,
"memory(GiB)": 31.37,
"step": 2395,
"token_acc": 0.9659873313136876,
"train_speed(iter/s)": 0.113281
},
{
"epoch": 2.564253272775848,
"grad_norm": 0.738300085067749,
"learning_rate": 5.056207013542131e-07,
"loss": 0.09036798477172851,
"memory(GiB)": 31.37,
"step": 2400,
"token_acc": 0.9698333820520316,
"train_speed(iter/s)": 0.113341
},
{
"epoch": 2.564253272775848,
"eval_loss": 0.1903306394815445,
"eval_runtime": 39.1763,
"eval_samples_per_second": 15.417,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9393596259838298,
"step": 2400
},
{
"epoch": 2.5695965802831955,
"grad_norm": 0.6590713858604431,
"learning_rate": 4.934215713183527e-07,
"loss": 0.08946118354797364,
"memory(GiB)": 31.37,
"step": 2405,
"token_acc": 0.9487092468155388,
"train_speed(iter/s)": 0.113141
},
{
"epoch": 2.5749398877905425,
"grad_norm": 0.7545654773712158,
"learning_rate": 4.813637665517251e-07,
"loss": 0.08997320532798767,
"memory(GiB)": 31.37,
"step": 2410,
"token_acc": 0.9712623356584911,
"train_speed(iter/s)": 0.113199
},
{
"epoch": 2.5802831952978895,
"grad_norm": 0.722476601600647,
"learning_rate": 4.6944766518432936e-07,
"loss": 0.09507122039794921,
"memory(GiB)": 31.37,
"step": 2415,
"token_acc": 0.9640161909989023,
"train_speed(iter/s)": 0.113266
},
{
"epoch": 2.5856265028052365,
"grad_norm": 0.7068451046943665,
"learning_rate": 4.576736409023813e-07,
"loss": 0.08914280533790589,
"memory(GiB)": 31.37,
"step": 2420,
"token_acc": 0.9624791076849609,
"train_speed(iter/s)": 0.113323
},
{
"epoch": 2.5856265028052365,
"eval_loss": 0.18999898433685303,
"eval_runtime": 39.1317,
"eval_samples_per_second": 15.435,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9394068069227305,
"step": 2420
},
{
"epoch": 2.5909698103125836,
"grad_norm": 0.7488506436347961,
"learning_rate": 4.460420629365919e-07,
"loss": 0.10495038032531738,
"memory(GiB)": 31.37,
"step": 2425,
"token_acc": 0.9451158336373586,
"train_speed(iter/s)": 0.113141
},
{
"epoch": 2.5963131178199306,
"grad_norm": 0.7361159324645996,
"learning_rate": 4.3455329605058436e-07,
"loss": 0.09409030675888061,
"memory(GiB)": 31.37,
"step": 2430,
"token_acc": 0.9666012515273817,
"train_speed(iter/s)": 0.113204
},
{
"epoch": 2.6016564253272776,
"grad_norm": 0.7987903356552124,
"learning_rate": 4.232077005294638e-07,
"loss": 0.09288793802261353,
"memory(GiB)": 31.37,
"step": 2435,
"token_acc": 0.9625309550299681,
"train_speed(iter/s)": 0.11326
},
{
"epoch": 2.6069997328346246,
"grad_norm": 0.7318440675735474,
"learning_rate": 4.120056321685101e-07,
"loss": 0.09065854549407959,
"memory(GiB)": 31.37,
"step": 2440,
"token_acc": 0.9711286089238845,
"train_speed(iter/s)": 0.113321
},
{
"epoch": 2.6069997328346246,
"eval_loss": 0.19047002494335175,
"eval_runtime": 39.1424,
"eval_samples_per_second": 15.431,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.939428252804049,
"step": 2440
},
{
"epoch": 2.6123430403419716,
"grad_norm": 0.7801464200019836,
"learning_rate": 4.009474422620269e-07,
"loss": 0.09015793800354004,
"memory(GiB)": 31.37,
"step": 2445,
"token_acc": 0.9462027912208955,
"train_speed(iter/s)": 0.113137
},
{
"epoch": 2.6176863478493186,
"grad_norm": 0.7523171305656433,
"learning_rate": 3.900334775923237e-07,
"loss": 0.08972238302230835,
"memory(GiB)": 31.37,
"step": 2450,
"token_acc": 0.964319157867545,
"train_speed(iter/s)": 0.113199
},
{
"epoch": 2.6230296553566657,
"grad_norm": 0.7075212001800537,
"learning_rate": 3.7926408041883355e-07,
"loss": 0.09695062637329102,
"memory(GiB)": 31.37,
"step": 2455,
"token_acc": 0.9670872765509989,
"train_speed(iter/s)": 0.113263
},
{
"epoch": 2.6283729628640127,
"grad_norm": 0.7174405455589294,
"learning_rate": 3.6863958846739213e-07,
"loss": 0.09820109009742736,
"memory(GiB)": 31.37,
"step": 2460,
"token_acc": 0.9666613545816733,
"train_speed(iter/s)": 0.113326
},
{
"epoch": 2.6283729628640127,
"eval_loss": 0.1904931664466858,
"eval_runtime": 38.991,
"eval_samples_per_second": 15.491,
"eval_steps_per_second": 3.873,
"eval_token_acc": 0.9393724935126209,
"step": 2460
},
{
"epoch": 2.6337162703713597,
"grad_norm": 0.7205042243003845,
"learning_rate": 3.581603349196372e-07,
"loss": 0.09016447067260742,
"memory(GiB)": 31.37,
"step": 2465,
"token_acc": 0.9470618527207186,
"train_speed(iter/s)": 0.113146
},
{
"epoch": 2.639059577878707,
"grad_norm": 0.7430869340896606,
"learning_rate": 3.4782664840256387e-07,
"loss": 0.09322860240936279,
"memory(GiB)": 31.37,
"step": 2470,
"token_acc": 0.9666258078894584,
"train_speed(iter/s)": 0.113215
},
{
"epoch": 2.6444028853860537,
"grad_norm": 0.7449864149093628,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.09292680621147156,
"memory(GiB)": 31.37,
"step": 2475,
"token_acc": 0.965542892849704,
"train_speed(iter/s)": 0.113277
},
{
"epoch": 2.649746192893401,
"grad_norm": 0.6502243280410767,
"learning_rate": 3.275972681335421e-07,
"loss": 0.09386556148529053,
"memory(GiB)": 31.37,
"step": 2480,
"token_acc": 0.9635287435353204,
"train_speed(iter/s)": 0.113346
},
{
"epoch": 2.649746192893401,
"eval_loss": 0.19051562249660492,
"eval_runtime": 38.9188,
"eval_samples_per_second": 15.519,
"eval_steps_per_second": 3.88,
"eval_token_acc": 0.9393296017499839,
"step": 2480
},
{
"epoch": 2.655089500400748,
"grad_norm": 0.6717422604560852,
"learning_rate": 3.1770220877033243e-07,
"loss": 0.08921995162963867,
"memory(GiB)": 31.37,
"step": 2485,
"token_acc": 0.9476881346660132,
"train_speed(iter/s)": 0.113146
},
{
"epoch": 2.660432807908095,
"grad_norm": 0.7039415240287781,
"learning_rate": 3.0795398519539113e-07,
"loss": 0.08925142884254456,
"memory(GiB)": 31.37,
"step": 2490,
"token_acc": 0.971012390099527,
"train_speed(iter/s)": 0.113211
},
{
"epoch": 2.6657761154154422,
"grad_norm": 0.7549923062324524,
"learning_rate": 2.9835290311078123e-07,
"loss": 0.09008611440658569,
"memory(GiB)": 31.37,
"step": 2495,
"token_acc": 0.9727889176682867,
"train_speed(iter/s)": 0.113272
},
{
"epoch": 2.6711194229227893,
"grad_norm": 0.6972574591636658,
"learning_rate": 2.888992636042437e-07,
"loss": 0.08815158009529114,
"memory(GiB)": 31.37,
"step": 2500,
"token_acc": 0.9709897080314667,
"train_speed(iter/s)": 0.113333
},
{
"epoch": 2.6711194229227893,
"eval_loss": 0.1904427856206894,
"eval_runtime": 38.99,
"eval_samples_per_second": 15.491,
"eval_steps_per_second": 3.873,
"eval_token_acc": 0.9394454095091038,
"step": 2500
},
{
"epoch": 2.6764627304301363,
"grad_norm": 0.7152836322784424,
"learning_rate": 2.7959336313974847e-07,
"loss": 0.09650709629058837,
"memory(GiB)": 31.37,
"step": 2505,
"token_acc": 0.9477473770829048,
"train_speed(iter/s)": 0.113148
},
{
"epoch": 2.6818060379374833,
"grad_norm": 0.6270228028297424,
"learning_rate": 2.704354935482095e-07,
"loss": 0.08850882053375245,
"memory(GiB)": 31.37,
"step": 2510,
"token_acc": 0.9648082862758556,
"train_speed(iter/s)": 0.113203
},
{
"epoch": 2.6871493454448303,
"grad_norm": 0.7035424709320068,
"learning_rate": 2.6142594201832183e-07,
"loss": 0.10063533782958985,
"memory(GiB)": 31.37,
"step": 2515,
"token_acc": 0.9673570595099183,
"train_speed(iter/s)": 0.113268
},
{
"epoch": 2.6924926529521773,
"grad_norm": 0.6758410334587097,
"learning_rate": 2.525649910875627e-07,
"loss": 0.08648205399513245,
"memory(GiB)": 31.37,
"step": 2520,
"token_acc": 0.970028145397928,
"train_speed(iter/s)": 0.113337
},
{
"epoch": 2.6924926529521773,
"eval_loss": 0.19026722013950348,
"eval_runtime": 39.0306,
"eval_samples_per_second": 15.475,
"eval_steps_per_second": 3.869,
"eval_token_acc": 0.9392909991636106,
"step": 2520
},
{
"epoch": 2.6978359604595243,
"grad_norm": 0.7509530186653137,
"learning_rate": 2.438529186333288e-07,
"loss": 0.09757347106933593,
"memory(GiB)": 31.37,
"step": 2525,
"token_acc": 0.9475406925782507,
"train_speed(iter/s)": 0.11316
},
{
"epoch": 2.7031792679668714,
"grad_norm": 0.7634799480438232,
"learning_rate": 2.3528999786421758e-07,
"loss": 0.09107044339179993,
"memory(GiB)": 31.37,
"step": 2530,
"token_acc": 0.969339403512032,
"train_speed(iter/s)": 0.113213
},
{
"epoch": 2.708522575474219,
"grad_norm": 0.6541688442230225,
"learning_rate": 2.2687649731146844e-07,
"loss": 0.0989515721797943,
"memory(GiB)": 31.37,
"step": 2535,
"token_acc": 0.9692385434102896,
"train_speed(iter/s)": 0.113272
},
{
"epoch": 2.7138658829815654,
"grad_norm": 0.7726590037345886,
"learning_rate": 2.1861268082053466e-07,
"loss": 0.09174089431762696,
"memory(GiB)": 31.37,
"step": 2540,
"token_acc": 0.963183540877098,
"train_speed(iter/s)": 0.113328
},
{
"epoch": 2.7138658829815654,
"eval_loss": 0.1902547925710678,
"eval_runtime": 39.0546,
"eval_samples_per_second": 15.466,
"eval_steps_per_second": 3.866,
"eval_token_acc": 0.9391923481095455,
"step": 2540
},
{
"epoch": 2.719209190488913,
"grad_norm": 0.7492479681968689,
"learning_rate": 2.104988075428127e-07,
"loss": 0.10171656608581543,
"memory(GiB)": 31.37,
"step": 2545,
"token_acc": 0.947412690936007,
"train_speed(iter/s)": 0.113141
},
{
"epoch": 2.7245524979962594,
"grad_norm": 0.7487272620201111,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.10030395984649658,
"memory(GiB)": 31.37,
"step": 2550,
"token_acc": 0.9660484246469037,
"train_speed(iter/s)": 0.113213
},
{
"epoch": 2.729895805503607,
"grad_norm": 0.7023541927337646,
"learning_rate": 1.947219037136827e-07,
"loss": 0.09708930253982544,
"memory(GiB)": 31.37,
"step": 2555,
"token_acc": 0.9649252477742315,
"train_speed(iter/s)": 0.113282
},
{
"epoch": 2.735239113010954,
"grad_norm": 0.7202330231666565,
"learning_rate": 1.8705936792237255e-07,
"loss": 0.08675633072853088,
"memory(GiB)": 31.37,
"step": 2560,
"token_acc": 0.9641549104720565,
"train_speed(iter/s)": 0.113325
},
{
"epoch": 2.735239113010954,
"eval_loss": 0.1902298927307129,
"eval_runtime": 39.1479,
"eval_samples_per_second": 15.429,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9393167342211928,
"step": 2560
},
{
"epoch": 2.740582420518301,
"grad_norm": 0.6873902082443237,
"learning_rate": 1.7954776484895188e-07,
"loss": 0.0941142499446869,
"memory(GiB)": 31.37,
"step": 2565,
"token_acc": 0.947600952293746,
"train_speed(iter/s)": 0.113141
},
{
"epoch": 2.745925728025648,
"grad_norm": 0.7027167081832886,
"learning_rate": 1.7218733005557707e-07,
"loss": 0.08929444551467895,
"memory(GiB)": 31.37,
"step": 2570,
"token_acc": 0.9652069391202651,
"train_speed(iter/s)": 0.1132
},
{
"epoch": 2.751269035532995,
"grad_norm": 0.7328935265541077,
"learning_rate": 1.6497829436380009e-07,
"loss": 0.09518647193908691,
"memory(GiB)": 31.37,
"step": 2575,
"token_acc": 0.9699323199822478,
"train_speed(iter/s)": 0.11326
},
{
"epoch": 2.756612343040342,
"grad_norm": 0.7890388369560242,
"learning_rate": 1.5792088384733174e-07,
"loss": 0.09950923323631286,
"memory(GiB)": 31.37,
"step": 2580,
"token_acc": 0.962556860615113,
"train_speed(iter/s)": 0.113323
},
{
"epoch": 2.756612343040342,
"eval_loss": 0.19023241102695465,
"eval_runtime": 39.1648,
"eval_samples_per_second": 15.422,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9394196744515216,
"step": 2580
},
{
"epoch": 2.761955650547689,
"grad_norm": 0.7428569793701172,
"learning_rate": 1.510153198249531e-07,
"loss": 0.08879505395889283,
"memory(GiB)": 31.37,
"step": 2585,
"token_acc": 0.9496286055977968,
"train_speed(iter/s)": 0.113151
},
{
"epoch": 2.767298958055036,
"grad_norm": 0.7776227593421936,
"learning_rate": 1.4426181885357215e-07,
"loss": 0.09481008052825927,
"memory(GiB)": 31.37,
"step": 2590,
"token_acc": 0.9610127994545511,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 2.772642265562383,
"grad_norm": 0.6838895082473755,
"learning_rate": 1.376605927214364e-07,
"loss": 0.0857117772102356,
"memory(GiB)": 31.37,
"step": 2595,
"token_acc": 0.9700140999530001,
"train_speed(iter/s)": 0.113255
},
{
"epoch": 2.77798557306973,
"grad_norm": 0.911618173122406,
"learning_rate": 1.312118484414876e-07,
"loss": 0.09858800768852234,
"memory(GiB)": 31.37,
"step": 2600,
"token_acc": 0.9638269804901757,
"train_speed(iter/s)": 0.113319
},
{
"epoch": 2.77798557306973,
"eval_loss": 0.1901472806930542,
"eval_runtime": 39.1574,
"eval_samples_per_second": 15.425,
"eval_steps_per_second": 3.856,
"eval_token_acc": 0.9393510476313024,
"step": 2600
},
{
"epoch": 2.783328880577077,
"grad_norm": 0.7218348979949951,
"learning_rate": 1.2491578824487204e-07,
"loss": 0.09123666286468506,
"memory(GiB)": 31.37,
"step": 2605,
"token_acc": 0.9471299437601038,
"train_speed(iter/s)": 0.113146
},
{
"epoch": 2.7886721880844245,
"grad_norm": 0.7690821290016174,
"learning_rate": 1.1877260957459835e-07,
"loss": 0.09849429726600648,
"memory(GiB)": 31.37,
"step": 2610,
"token_acc": 0.9662001494666758,
"train_speed(iter/s)": 0.113204
},
{
"epoch": 2.794015495591771,
"grad_norm": 0.700734555721283,
"learning_rate": 1.1278250507934518e-07,
"loss": 0.09033372402191162,
"memory(GiB)": 31.37,
"step": 2615,
"token_acc": 0.967641649881776,
"train_speed(iter/s)": 0.113271
},
{
"epoch": 2.7993588030991186,
"grad_norm": 0.7667319178581238,
"learning_rate": 1.0694566260742001e-07,
"loss": 0.08968676328659057,
"memory(GiB)": 31.37,
"step": 2620,
"token_acc": 0.9692407567701249,
"train_speed(iter/s)": 0.11334
},
{
"epoch": 2.7993588030991186,
"eval_loss": 0.19011949002742767,
"eval_runtime": 39.1765,
"eval_samples_per_second": 15.417,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9393338909262476,
"step": 2620
},
{
"epoch": 2.804702110606465,
"grad_norm": 0.6720722913742065,
"learning_rate": 1.0126226520086823e-07,
"loss": 0.09349075555801392,
"memory(GiB)": 31.37,
"step": 2625,
"token_acc": 0.9490298548952021,
"train_speed(iter/s)": 0.113158
},
{
"epoch": 2.8100454181138126,
"grad_norm": 0.7081384658813477,
"learning_rate": 9.573249108973281e-08,
"loss": 0.09482257962226867,
"memory(GiB)": 31.37,
"step": 2630,
"token_acc": 0.9653012204622176,
"train_speed(iter/s)": 0.113226
},
{
"epoch": 2.8153887256211596,
"grad_norm": 0.8206889629364014,
"learning_rate": 9.035651368646647e-08,
"loss": 0.08651464581489562,
"memory(GiB)": 31.37,
"step": 2635,
"token_acc": 0.965938566552901,
"train_speed(iter/s)": 0.113281
},
{
"epoch": 2.8207320331285066,
"grad_norm": 0.7286149263381958,
"learning_rate": 8.513450158049109e-08,
"loss": 0.09981081485748292,
"memory(GiB)": 31.37,
"step": 2640,
"token_acc": 0.9647688936806594,
"train_speed(iter/s)": 0.113351
},
{
"epoch": 2.8207320331285066,
"eval_loss": 0.19009515643119812,
"eval_runtime": 39.1536,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9394239636277852,
"step": 2640
},
{
"epoch": 2.8260753406358536,
"grad_norm": 0.6493250131607056,
"learning_rate": 8.006661853291298e-08,
"loss": 0.09253804683685303,
"memory(GiB)": 31.37,
"step": 2645,
"token_acc": 0.9492279023135687,
"train_speed(iter/s)": 0.113182
},
{
"epoch": 2.8314186481432007,
"grad_norm": 0.7164784669876099,
"learning_rate": 7.515302347138486e-08,
"loss": 0.10055129528045655,
"memory(GiB)": 31.37,
"step": 2650,
"token_acc": 0.9693806782717248,
"train_speed(iter/s)": 0.113245
},
{
"epoch": 2.8367619556505477,
"grad_norm": 0.7498785853385925,
"learning_rate": 7.03938704851248e-08,
"loss": 0.09636443257331848,
"memory(GiB)": 31.37,
"step": 2655,
"token_acc": 0.9618243584137769,
"train_speed(iter/s)": 0.113298
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.7123965620994568,
"learning_rate": 6.578930882008283e-08,
"loss": 0.09678665399551392,
"memory(GiB)": 31.37,
"step": 2660,
"token_acc": 0.9638537936625904,
"train_speed(iter/s)": 0.113365
},
{
"epoch": 2.8421052631578947,
"eval_loss": 0.18998001515865326,
"eval_runtime": 39.1365,
"eval_samples_per_second": 15.433,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9393296017499839,
"step": 2660
},
{
"epoch": 2.8474485706652417,
"grad_norm": 0.7025579214096069,
"learning_rate": 6.133948287426028e-08,
"loss": 0.09522255659103393,
"memory(GiB)": 31.37,
"step": 2665,
"token_acc": 0.9474585290238993,
"train_speed(iter/s)": 0.113185
},
{
"epoch": 2.8527918781725887,
"grad_norm": 0.7414459586143494,
"learning_rate": 5.704453219318118e-08,
"loss": 0.09207627773284913,
"memory(GiB)": 31.37,
"step": 2670,
"token_acc": 0.9669233407114132,
"train_speed(iter/s)": 0.113245
},
{
"epoch": 2.8581351856799357,
"grad_norm": 0.6940200328826904,
"learning_rate": 5.2904591465516855e-08,
"loss": 0.0982110857963562,
"memory(GiB)": 31.37,
"step": 2675,
"token_acc": 0.9648942786069652,
"train_speed(iter/s)": 0.113311
},
{
"epoch": 2.8634784931872828,
"grad_norm": 0.5920035243034363,
"learning_rate": 4.891979051886153e-08,
"loss": 0.08439633250236511,
"memory(GiB)": 31.37,
"step": 2680,
"token_acc": 0.9698492462311558,
"train_speed(iter/s)": 0.113369
},
{
"epoch": 2.8634784931872828,
"eval_loss": 0.19001102447509766,
"eval_runtime": 39.056,
"eval_samples_per_second": 15.465,
"eval_steps_per_second": 3.866,
"eval_token_acc": 0.9394068069227305,
"step": 2680
},
{
"epoch": 2.86882180069463,
"grad_norm": 0.7572726607322693,
"learning_rate": 4.509025431566283e-08,
"loss": 0.09407066106796265,
"memory(GiB)": 31.37,
"step": 2685,
"token_acc": 0.9469456798144891,
"train_speed(iter/s)": 0.113207
},
{
"epoch": 2.874165108201977,
"grad_norm": 0.7466961741447449,
"learning_rate": 4.141610294930043e-08,
"loss": 0.08772618174552918,
"memory(GiB)": 31.37,
"step": 2690,
"token_acc": 0.9621488738657316,
"train_speed(iter/s)": 0.113262
},
{
"epoch": 2.8795084157093243,
"grad_norm": 0.7221233248710632,
"learning_rate": 3.7897451640321326e-08,
"loss": 0.09155750274658203,
"memory(GiB)": 31.37,
"step": 2695,
"token_acc": 0.9711238770396626,
"train_speed(iter/s)": 0.113319
},
{
"epoch": 2.8848517232166713,
"grad_norm": 0.690424919128418,
"learning_rate": 3.4534410732825485e-08,
"loss": 0.09065448045730591,
"memory(GiB)": 31.37,
"step": 2700,
"token_acc": 0.9694952336302547,
"train_speed(iter/s)": 0.113377
},
{
"epoch": 2.8848517232166713,
"eval_loss": 0.18985731899738312,
"eval_runtime": 39.1204,
"eval_samples_per_second": 15.44,
"eval_steps_per_second": 3.86,
"eval_token_acc": 0.939342469278775,
"step": 2700
},
{
"epoch": 2.8901950307240183,
"grad_norm": 0.7742385864257812,
"learning_rate": 3.1327085691006954e-08,
"loss": 0.09011354446411132,
"memory(GiB)": 31.37,
"step": 2705,
"token_acc": 0.9484952623546592,
"train_speed(iter/s)": 0.113201
},
{
"epoch": 2.8955383382313653,
"grad_norm": 0.67616868019104,
"learning_rate": 2.8275577095846495e-08,
"loss": 0.08765259981155396,
"memory(GiB)": 31.37,
"step": 2710,
"token_acc": 0.9678389470704784,
"train_speed(iter/s)": 0.113251
},
{
"epoch": 2.9008816457387123,
"grad_norm": 0.6910288333892822,
"learning_rate": 2.5379980641955792e-08,
"loss": 0.09436768293380737,
"memory(GiB)": 31.37,
"step": 2715,
"token_acc": 0.9657377798081316,
"train_speed(iter/s)": 0.113309
},
{
"epoch": 2.9062249532460593,
"grad_norm": 0.7219937443733215,
"learning_rate": 2.264038713457706e-08,
"loss": 0.09317046403884888,
"memory(GiB)": 31.37,
"step": 2720,
"token_acc": 0.9689827817804372,
"train_speed(iter/s)": 0.113372
},
{
"epoch": 2.9062249532460593,
"eval_loss": 0.18988655507564545,
"eval_runtime": 39.1328,
"eval_samples_per_second": 15.435,
"eval_steps_per_second": 3.859,
"eval_token_acc": 0.9394668553904223,
"step": 2720
},
{
"epoch": 2.9115682607534064,
"grad_norm": 0.7260240316390991,
"learning_rate": 2.0056882486736982e-08,
"loss": 0.09241507053375245,
"memory(GiB)": 31.37,
"step": 2725,
"token_acc": 0.9490058931065415,
"train_speed(iter/s)": 0.113201
},
{
"epoch": 2.9169115682607534,
"grad_norm": 0.6595222353935242,
"learning_rate": 1.762954771655001e-08,
"loss": 0.08784698247909546,
"memory(GiB)": 31.37,
"step": 2730,
"token_acc": 0.967156078213068,
"train_speed(iter/s)": 0.11326
},
{
"epoch": 2.9222548757681004,
"grad_norm": 0.7033255100250244,
"learning_rate": 1.5358458944680356e-08,
"loss": 0.08944010734558105,
"memory(GiB)": 31.37,
"step": 2735,
"token_acc": 0.9681864301377934,
"train_speed(iter/s)": 0.113312
},
{
"epoch": 2.9275981832754474,
"grad_norm": 0.684511661529541,
"learning_rate": 1.3243687391952809e-08,
"loss": 0.08554937839508056,
"memory(GiB)": 31.37,
"step": 2740,
"token_acc": 0.9701811147059805,
"train_speed(iter/s)": 0.11336
},
{
"epoch": 2.9275981832754474,
"eval_loss": 0.18995320796966553,
"eval_runtime": 39.1422,
"eval_samples_per_second": 15.431,
"eval_steps_per_second": 3.858,
"eval_token_acc": 0.9393767826888846,
"step": 2740
},
{
"epoch": 2.9329414907827944,
"grad_norm": 0.6943471431732178,
"learning_rate": 1.1285299377118974e-08,
"loss": 0.08585541248321533,
"memory(GiB)": 31.37,
"step": 2745,
"token_acc": 0.9478389939459105,
"train_speed(iter/s)": 0.113182
},
{
"epoch": 2.9382847982901414,
"grad_norm": 0.778913676738739,
"learning_rate": 9.48335631477948e-09,
"loss": 0.09440468549728394,
"memory(GiB)": 31.37,
"step": 2750,
"token_acc": 0.9696560591449694,
"train_speed(iter/s)": 0.113235
},
{
"epoch": 2.9436281057974885,
"grad_norm": 0.6813073754310608,
"learning_rate": 7.837914713457184e-09,
"loss": 0.09091969132423401,
"memory(GiB)": 31.37,
"step": 2755,
"token_acc": 0.9660938225731538,
"train_speed(iter/s)": 0.113288
},
{
"epoch": 2.948971413304836,
"grad_norm": 0.8302053213119507,
"learning_rate": 6.349026173824713e-09,
"loss": 0.09277503490447998,
"memory(GiB)": 31.37,
"step": 2760,
"token_acc": 0.9680151152198118,
"train_speed(iter/s)": 0.113333
},
{
"epoch": 2.948971413304836,
"eval_loss": 0.1899597942829132,
"eval_runtime": 39.1837,
"eval_samples_per_second": 15.415,
"eval_steps_per_second": 3.854,
"eval_token_acc": 0.9393553368075661,
"step": 2760
},
{
"epoch": 2.9543147208121825,
"grad_norm": 0.7870346307754517,
"learning_rate": 5.016737387085191e-09,
"loss": 0.09893054962158203,
"memory(GiB)": 31.37,
"step": 2765,
"token_acc": 0.946137875713679,
"train_speed(iter/s)": 0.113161
},
{
"epoch": 2.95965802831953,
"grad_norm": 0.7045755386352539,
"learning_rate": 3.841090133511749e-09,
"loss": 0.08883514404296874,
"memory(GiB)": 31.37,
"step": 2770,
"token_acc": 0.9683565527543212,
"train_speed(iter/s)": 0.113206
},
{
"epoch": 2.965001335826877,
"grad_norm": 0.6870289444923401,
"learning_rate": 2.8221212811324616e-09,
"loss": 0.09689734578132629,
"memory(GiB)": 31.37,
"step": 2775,
"token_acc": 0.9711121335611936,
"train_speed(iter/s)": 0.113254
},
{
"epoch": 2.970344643334224,
"grad_norm": 0.7232000827789307,
"learning_rate": 1.959862784577937e-09,
"loss": 0.09915404319763184,
"memory(GiB)": 31.37,
"step": 2780,
"token_acc": 0.9624814043439452,
"train_speed(iter/s)": 0.113315
},
{
"epoch": 2.970344643334224,
"eval_loss": 0.1898777186870575,
"eval_runtime": 39.1537,
"eval_samples_per_second": 15.426,
"eval_steps_per_second": 3.857,
"eval_token_acc": 0.9394239636277852,
"step": 2780
},
{
"epoch": 2.975687950841571,
"grad_norm": 0.6913635730743408,
"learning_rate": 1.2543416840771206e-09,
"loss": 0.09539123177528382,
"memory(GiB)": 31.37,
"step": 2785,
"token_acc": 0.9471013034632325,
"train_speed(iter/s)": 0.113152
},
{
"epoch": 2.981031258348918,
"grad_norm": 0.704073429107666,
"learning_rate": 7.055801046113031e-10,
"loss": 0.08701257705688477,
"memory(GiB)": 31.37,
"step": 2790,
"token_acc": 0.969090176051606,
"train_speed(iter/s)": 0.113208
},
{
"epoch": 2.986374565856265,
"grad_norm": 0.7093097567558289,
"learning_rate": 3.1359525521801326e-10,
"loss": 0.08475543856620789,
"memory(GiB)": 31.37,
"step": 2795,
"token_acc": 0.9701114312493491,
"train_speed(iter/s)": 0.113264
},
{
"epoch": 2.991717873363612,
"grad_norm": 0.6521144509315491,
"learning_rate": 7.839942845144777e-11,
"loss": 0.09691762924194336,
"memory(GiB)": 31.37,
"step": 2800,
"token_acc": 0.9690315134805886,
"train_speed(iter/s)": 0.113325
},
{
"epoch": 2.991717873363612,
"eval_loss": 0.19001290202140808,
"eval_runtime": 39.1732,
"eval_samples_per_second": 15.419,
"eval_steps_per_second": 3.855,
"eval_token_acc": 0.939398228570203,
"step": 2800
},
{
"epoch": 2.997061180870959,
"grad_norm": 0.6936495900154114,
"learning_rate": 0.0,
"loss": 0.09210923910140992,
"memory(GiB)": 31.37,
"step": 2805,
"token_acc": 0.947245220359309,
"train_speed(iter/s)": 0.11316
},
{
"epoch": 2.997061180870959,
"eval_loss": 0.18997597694396973,
"eval_runtime": 38.9833,
"eval_samples_per_second": 15.494,
"eval_steps_per_second": 3.873,
"eval_token_acc": 0.9393338909262476,
"step": 2805
}
],
"logging_steps": 5,
"max_steps": 2805,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.1732509215136154e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}