Files
qwen2.5vl-3b-self-critic-cot/trainer_state.json
ModelHub XC dbcd48225b 初始化项目,由ModelHub XC社区提供模型
Model: waltonfuture/qwen2.5vl-3b-self-critic-cot
Source: Original Platform
2026-05-20 13:43:34 +08:00

6042 lines
171 KiB
JSON

{
"best_global_step": 1580,
"best_metric": 0.6043635,
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v27-20250503-235734/checkpoint-1580",
"epoch": 2.9970144683457094,
"eval_steps": 20,
"global_step": 2448,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001224833499196203,
"grad_norm": 5.5611701011657715,
"learning_rate": 9.999995882658711e-06,
"loss": 1.0937654972076416,
"memory(GiB)": 27.73,
"step": 1,
"token_acc": 0.7134680736898587,
"train_speed(iter/s)": 0.067022
},
{
"epoch": 0.006124167495981015,
"grad_norm": 3.0366287231445312,
"learning_rate": 9.999897066806807e-06,
"loss": 0.8203982710838318,
"memory(GiB)": 27.77,
"step": 5,
"token_acc": 0.7706657236318628,
"train_speed(iter/s)": 0.125794
},
{
"epoch": 0.01224833499196203,
"grad_norm": 1.5655843019485474,
"learning_rate": 9.999588271465324e-06,
"loss": 0.7133324623107911,
"memory(GiB)": 27.77,
"step": 10,
"token_acc": 0.7872333068225043,
"train_speed(iter/s)": 0.148685
},
{
"epoch": 0.018372502487943046,
"grad_norm": 1.4364837408065796,
"learning_rate": 9.999073626689664e-06,
"loss": 0.7077776908874511,
"memory(GiB)": 27.77,
"step": 15,
"token_acc": 0.7906767333933643,
"train_speed(iter/s)": 0.153421
},
{
"epoch": 0.02449666998392406,
"grad_norm": 1.2211848497390747,
"learning_rate": 9.998353153669443e-06,
"loss": 0.6643787860870362,
"memory(GiB)": 29.89,
"step": 20,
"token_acc": 0.8500581170089113,
"train_speed(iter/s)": 0.156098
},
{
"epoch": 0.02449666998392406,
"eval_loss": 0.6882059574127197,
"eval_runtime": 30.3868,
"eval_samples_per_second": 17.343,
"eval_steps_per_second": 4.344,
"eval_token_acc": 0.8109354822046594,
"step": 20
},
{
"epoch": 0.030620837479905075,
"grad_norm": 1.1730519533157349,
"learning_rate": 9.997426882068896e-06,
"loss": 0.7091597557067871,
"memory(GiB)": 29.89,
"step": 25,
"token_acc": 0.7997104179259739,
"train_speed(iter/s)": 0.123685
},
{
"epoch": 0.03674500497588609,
"grad_norm": 1.1173818111419678,
"learning_rate": 9.996294850025658e-06,
"loss": 0.6673955917358398,
"memory(GiB)": 29.89,
"step": 30,
"token_acc": 0.8030015197568389,
"train_speed(iter/s)": 0.129134
},
{
"epoch": 0.042869172471867105,
"grad_norm": 1.1728911399841309,
"learning_rate": 9.994957104149202e-06,
"loss": 0.6595910072326661,
"memory(GiB)": 29.89,
"step": 35,
"token_acc": 0.7975662938735317,
"train_speed(iter/s)": 0.133798
},
{
"epoch": 0.04899333996784812,
"grad_norm": 1.234574317932129,
"learning_rate": 9.993413699518906e-06,
"loss": 0.6554254055023193,
"memory(GiB)": 29.89,
"step": 40,
"token_acc": 0.8106137920979735,
"train_speed(iter/s)": 0.136376
},
{
"epoch": 0.04899333996784812,
"eval_loss": 0.6702007055282593,
"eval_runtime": 30.2289,
"eval_samples_per_second": 17.434,
"eval_steps_per_second": 4.367,
"eval_token_acc": 0.8143189214318921,
"step": 40
},
{
"epoch": 0.05511750746382914,
"grad_norm": 1.0441969633102417,
"learning_rate": 9.9916646996818e-06,
"loss": 0.671393871307373,
"memory(GiB)": 29.89,
"step": 45,
"token_acc": 0.8007245869993985,
"train_speed(iter/s)": 0.123931
},
{
"epoch": 0.06124167495981015,
"grad_norm": 1.1040817499160767,
"learning_rate": 9.989710176649937e-06,
"loss": 0.65097017288208,
"memory(GiB)": 29.89,
"step": 50,
"token_acc": 0.8223552894211577,
"train_speed(iter/s)": 0.127235
},
{
"epoch": 0.06736584245579116,
"grad_norm": 0.971693217754364,
"learning_rate": 9.987550210897433e-06,
"loss": 0.652859115600586,
"memory(GiB)": 29.89,
"step": 55,
"token_acc": 0.8024456033087575,
"train_speed(iter/s)": 0.130006
},
{
"epoch": 0.07349000995177218,
"grad_norm": 1.3039090633392334,
"learning_rate": 9.985184891357165e-06,
"loss": 0.6641504764556885,
"memory(GiB)": 29.89,
"step": 60,
"token_acc": 0.7929855290045457,
"train_speed(iter/s)": 0.132371
},
{
"epoch": 0.07349000995177218,
"eval_loss": 0.6628613471984863,
"eval_runtime": 30.3515,
"eval_samples_per_second": 17.363,
"eval_steps_per_second": 4.349,
"eval_token_acc": 0.8155018337724056,
"step": 60
},
{
"epoch": 0.07961417744775319,
"grad_norm": 1.1451324224472046,
"learning_rate": 9.982614315417084e-06,
"loss": 0.6779595851898194,
"memory(GiB)": 29.89,
"step": 65,
"token_acc": 0.8056370699533272,
"train_speed(iter/s)": 0.12411
},
{
"epoch": 0.08573834494373421,
"grad_norm": 1.1827325820922852,
"learning_rate": 9.979838588916229e-06,
"loss": 0.647182846069336,
"memory(GiB)": 29.89,
"step": 70,
"token_acc": 0.806831566548881,
"train_speed(iter/s)": 0.126104
},
{
"epoch": 0.09186251243971523,
"grad_norm": 1.2671717405319214,
"learning_rate": 9.976857826140354e-06,
"loss": 0.6356947898864747,
"memory(GiB)": 29.89,
"step": 75,
"token_acc": 0.8092331033486255,
"train_speed(iter/s)": 0.128389
},
{
"epoch": 0.09798667993569624,
"grad_norm": 1.1943690776824951,
"learning_rate": 9.973672149817232e-06,
"loss": 0.6435425758361817,
"memory(GiB)": 29.89,
"step": 80,
"token_acc": 0.7932153503641372,
"train_speed(iter/s)": 0.130134
},
{
"epoch": 0.09798667993569624,
"eval_loss": 0.6591480374336243,
"eval_runtime": 30.2192,
"eval_samples_per_second": 17.439,
"eval_steps_per_second": 4.368,
"eval_token_acc": 0.8162405082907175,
"step": 80
},
{
"epoch": 0.10411084743167726,
"grad_norm": 1.0277475118637085,
"learning_rate": 9.970281691111598e-06,
"loss": 0.6061644554138184,
"memory(GiB)": 29.89,
"step": 85,
"token_acc": 0.8139895703141835,
"train_speed(iter/s)": 0.123754
},
{
"epoch": 0.11023501492765828,
"grad_norm": 1.1457469463348389,
"learning_rate": 9.96668658961975e-06,
"loss": 0.6548227787017822,
"memory(GiB)": 29.89,
"step": 90,
"token_acc": 0.8061897136047075,
"train_speed(iter/s)": 0.125333
},
{
"epoch": 0.11635918242363928,
"grad_norm": 1.124259352684021,
"learning_rate": 9.962886993363797e-06,
"loss": 0.6785114288330079,
"memory(GiB)": 29.89,
"step": 95,
"token_acc": 0.7958439546965248,
"train_speed(iter/s)": 0.126782
},
{
"epoch": 0.1224833499196203,
"grad_norm": 1.1532223224639893,
"learning_rate": 9.95888305878557e-06,
"loss": 0.6254438400268555,
"memory(GiB)": 29.89,
"step": 100,
"token_acc": 0.825769669327252,
"train_speed(iter/s)": 0.128271
},
{
"epoch": 0.1224833499196203,
"eval_loss": 0.6548585891723633,
"eval_runtime": 30.2918,
"eval_samples_per_second": 17.397,
"eval_steps_per_second": 4.358,
"eval_token_acc": 0.8170463350379669,
"step": 100
},
{
"epoch": 0.1286075174156013,
"grad_norm": 1.1253662109375,
"learning_rate": 9.954674950740175e-06,
"loss": 0.6390158653259277,
"memory(GiB)": 29.89,
"step": 105,
"token_acc": 0.8143782730827323,
"train_speed(iter/s)": 0.123016
},
{
"epoch": 0.13473168491158233,
"grad_norm": 1.0638540983200073,
"learning_rate": 9.950262842489215e-06,
"loss": 0.5906115531921386,
"memory(GiB)": 29.89,
"step": 110,
"token_acc": 0.8280561419101581,
"train_speed(iter/s)": 0.124561
},
{
"epoch": 0.14085585240756335,
"grad_norm": 1.3548355102539062,
"learning_rate": 9.945646915693646e-06,
"loss": 0.5967195510864258,
"memory(GiB)": 29.89,
"step": 115,
"token_acc": 0.8018965390008117,
"train_speed(iter/s)": 0.126157
},
{
"epoch": 0.14698001990354437,
"grad_norm": 1.199034333229065,
"learning_rate": 9.940827360406297e-06,
"loss": 0.631542444229126,
"memory(GiB)": 29.89,
"step": 120,
"token_acc": 0.8094361557837628,
"train_speed(iter/s)": 0.127311
},
{
"epoch": 0.14698001990354437,
"eval_loss": 0.6537693738937378,
"eval_runtime": 30.0219,
"eval_samples_per_second": 17.554,
"eval_steps_per_second": 4.397,
"eval_token_acc": 0.8179503073505863,
"step": 120
},
{
"epoch": 0.1531041873995254,
"grad_norm": 1.1312533617019653,
"learning_rate": 9.93580437506406e-06,
"loss": 0.6321775913238525,
"memory(GiB)": 29.89,
"step": 125,
"token_acc": 0.8088573959255979,
"train_speed(iter/s)": 0.123165
},
{
"epoch": 0.15922835489550638,
"grad_norm": 1.1674045324325562,
"learning_rate": 9.9305781664797e-06,
"loss": 0.6246171951293945,
"memory(GiB)": 29.89,
"step": 130,
"token_acc": 0.8052116325942988,
"train_speed(iter/s)": 0.12427
},
{
"epoch": 0.1653525223914874,
"grad_norm": 1.3215515613555908,
"learning_rate": 9.925148949833356e-06,
"loss": 0.6429347515106201,
"memory(GiB)": 29.89,
"step": 135,
"token_acc": 0.8183133283809673,
"train_speed(iter/s)": 0.125515
},
{
"epoch": 0.17147668988746842,
"grad_norm": 1.138923168182373,
"learning_rate": 9.919516948663666e-06,
"loss": 0.6564007759094238,
"memory(GiB)": 32.12,
"step": 140,
"token_acc": 0.8169854580859952,
"train_speed(iter/s)": 0.126675
},
{
"epoch": 0.17147668988746842,
"eval_loss": 0.6485698819160461,
"eval_runtime": 30.1644,
"eval_samples_per_second": 17.471,
"eval_steps_per_second": 4.376,
"eval_token_acc": 0.8182240818224081,
"step": 140
},
{
"epoch": 0.17760085738344944,
"grad_norm": 1.258039951324463,
"learning_rate": 9.913682394858576e-06,
"loss": 0.6344574451446533,
"memory(GiB)": 32.12,
"step": 145,
"token_acc": 0.8038108277711221,
"train_speed(iter/s)": 0.123274
},
{
"epoch": 0.18372502487943046,
"grad_norm": 1.1063320636749268,
"learning_rate": 9.907645528645791e-06,
"loss": 0.6173704147338868,
"memory(GiB)": 32.12,
"step": 150,
"token_acc": 0.8146344955967638,
"train_speed(iter/s)": 0.124336
},
{
"epoch": 0.18984919237541148,
"grad_norm": 1.118895411491394,
"learning_rate": 9.901406598582874e-06,
"loss": 0.6216392517089844,
"memory(GiB)": 32.12,
"step": 155,
"token_acc": 0.8292195700016929,
"train_speed(iter/s)": 0.125258
},
{
"epoch": 0.19597335987139247,
"grad_norm": 0.9873996376991272,
"learning_rate": 9.894965861547023e-06,
"loss": 0.6492547512054443,
"memory(GiB)": 32.12,
"step": 160,
"token_acc": 0.812430195125156,
"train_speed(iter/s)": 0.126446
},
{
"epoch": 0.19597335987139247,
"eval_loss": 0.6458240747451782,
"eval_runtime": 30.1721,
"eval_samples_per_second": 17.466,
"eval_steps_per_second": 4.375,
"eval_token_acc": 0.8187303063174751,
"step": 160
},
{
"epoch": 0.2020975273673735,
"grad_norm": 1.0021984577178955,
"learning_rate": 9.888323582724493e-06,
"loss": 0.5956392288208008,
"memory(GiB)": 32.12,
"step": 165,
"token_acc": 0.8176234443998395,
"train_speed(iter/s)": 0.123317
},
{
"epoch": 0.2082216948633545,
"grad_norm": 1.0651910305023193,
"learning_rate": 9.881480035599667e-06,
"loss": 0.6227351665496826,
"memory(GiB)": 32.12,
"step": 170,
"token_acc": 0.7998247919404292,
"train_speed(iter/s)": 0.124306
},
{
"epoch": 0.21434586235933553,
"grad_norm": 1.161063551902771,
"learning_rate": 9.874435501943814e-06,
"loss": 0.6138211727142334,
"memory(GiB)": 32.12,
"step": 175,
"token_acc": 0.8047394093021469,
"train_speed(iter/s)": 0.12515
},
{
"epoch": 0.22047002985531655,
"grad_norm": 1.0052075386047363,
"learning_rate": 9.867190271803466e-06,
"loss": 0.6363819122314454,
"memory(GiB)": 32.12,
"step": 180,
"token_acc": 0.8109002326934264,
"train_speed(iter/s)": 0.125933
},
{
"epoch": 0.22047002985531655,
"eval_loss": 0.6438542008399963,
"eval_runtime": 30.2227,
"eval_samples_per_second": 17.437,
"eval_steps_per_second": 4.368,
"eval_token_acc": 0.8191693785836045,
"step": 180
},
{
"epoch": 0.22659419735129757,
"grad_norm": 1.093913197517395,
"learning_rate": 9.859744643488494e-06,
"loss": 0.6040900707244873,
"memory(GiB)": 32.12,
"step": 185,
"token_acc": 0.8167378167100959,
"train_speed(iter/s)": 0.123432
},
{
"epoch": 0.23271836484727856,
"grad_norm": 1.229707956314087,
"learning_rate": 9.852098923559819e-06,
"loss": 0.6707104206085205,
"memory(GiB)": 32.12,
"step": 190,
"token_acc": 0.79118295902499,
"train_speed(iter/s)": 0.124323
},
{
"epoch": 0.23884253234325958,
"grad_norm": 1.2590445280075073,
"learning_rate": 9.844253426816785e-06,
"loss": 0.594182014465332,
"memory(GiB)": 32.12,
"step": 195,
"token_acc": 0.8231213499822541,
"train_speed(iter/s)": 0.125194
},
{
"epoch": 0.2449666998392406,
"grad_norm": 1.1405773162841797,
"learning_rate": 9.836208476284208e-06,
"loss": 0.6203227996826172,
"memory(GiB)": 32.12,
"step": 200,
"token_acc": 0.8118512276400965,
"train_speed(iter/s)": 0.126092
},
{
"epoch": 0.2449666998392406,
"eval_loss": 0.6424054503440857,
"eval_runtime": 30.1137,
"eval_samples_per_second": 17.5,
"eval_steps_per_second": 4.383,
"eval_token_acc": 0.8195051397282918,
"step": 200
},
{
"epoch": 0.2510908673352216,
"grad_norm": 1.1559275388717651,
"learning_rate": 9.827964403199067e-06,
"loss": 0.6028561592102051,
"memory(GiB)": 32.12,
"step": 205,
"token_acc": 0.8117863720073665,
"train_speed(iter/s)": 0.123812
},
{
"epoch": 0.2572150348312026,
"grad_norm": 1.1424733400344849,
"learning_rate": 9.819521546996864e-06,
"loss": 0.6058461189270019,
"memory(GiB)": 32.12,
"step": 210,
"token_acc": 0.8184524805138327,
"train_speed(iter/s)": 0.12456
},
{
"epoch": 0.26333920232718366,
"grad_norm": 1.0866496562957764,
"learning_rate": 9.810880255297663e-06,
"loss": 0.6336095809936524,
"memory(GiB)": 32.12,
"step": 215,
"token_acc": 0.8254080406980853,
"train_speed(iter/s)": 0.125292
},
{
"epoch": 0.26946336982316466,
"grad_norm": 1.090539574623108,
"learning_rate": 9.802040883891762e-06,
"loss": 0.6297359466552734,
"memory(GiB)": 32.12,
"step": 220,
"token_acc": 0.7853244390539721,
"train_speed(iter/s)": 0.125916
},
{
"epoch": 0.26946336982316466,
"eval_loss": 0.6410297155380249,
"eval_runtime": 30.1465,
"eval_samples_per_second": 17.481,
"eval_steps_per_second": 4.379,
"eval_token_acc": 0.8207552042977426,
"step": 220
},
{
"epoch": 0.2755875373191457,
"grad_norm": 1.0024933815002441,
"learning_rate": 9.793003796725049e-06,
"loss": 0.5586746215820313,
"memory(GiB)": 32.12,
"step": 225,
"token_acc": 0.8293589430563468,
"train_speed(iter/s)": 0.123629
},
{
"epoch": 0.2817117048151267,
"grad_norm": 1.0108203887939453,
"learning_rate": 9.783769365884023e-06,
"loss": 0.6524643898010254,
"memory(GiB)": 32.12,
"step": 230,
"token_acc": 0.8032468163701759,
"train_speed(iter/s)": 0.124589
},
{
"epoch": 0.2878358723111077,
"grad_norm": 1.079897403717041,
"learning_rate": 9.774337971580464e-06,
"loss": 0.6641106605529785,
"memory(GiB)": 32.12,
"step": 235,
"token_acc": 0.8061291260724153,
"train_speed(iter/s)": 0.125376
},
{
"epoch": 0.29396003980708874,
"grad_norm": 1.1108851432800293,
"learning_rate": 9.764710002135784e-06,
"loss": 0.6327021598815918,
"memory(GiB)": 32.12,
"step": 240,
"token_acc": 0.8024784931974078,
"train_speed(iter/s)": 0.126001
},
{
"epoch": 0.29396003980708874,
"eval_loss": 0.6399893164634705,
"eval_runtime": 30.1753,
"eval_samples_per_second": 17.465,
"eval_steps_per_second": 4.374,
"eval_token_acc": 0.8202076553540989,
"step": 240
},
{
"epoch": 0.3000842073030697,
"grad_norm": 0.8851113319396973,
"learning_rate": 9.754885853965039e-06,
"loss": 0.6223061561584473,
"memory(GiB)": 32.12,
"step": 245,
"token_acc": 0.8136768110686491,
"train_speed(iter/s)": 0.124088
},
{
"epoch": 0.3062083747990508,
"grad_norm": 0.943510890007019,
"learning_rate": 9.744865931560606e-06,
"loss": 0.625941801071167,
"memory(GiB)": 32.12,
"step": 250,
"token_acc": 0.825910233887913,
"train_speed(iter/s)": 0.124764
},
{
"epoch": 0.31233254229503177,
"grad_norm": 1.120894193649292,
"learning_rate": 9.73465064747553e-06,
"loss": 0.6334005355834961,
"memory(GiB)": 32.12,
"step": 255,
"token_acc": 0.7913934426229509,
"train_speed(iter/s)": 0.125458
},
{
"epoch": 0.31845670979101276,
"grad_norm": 1.069145917892456,
"learning_rate": 9.724240422306531e-06,
"loss": 0.6185196876525879,
"memory(GiB)": 32.12,
"step": 260,
"token_acc": 0.7924378740438349,
"train_speed(iter/s)": 0.126007
},
{
"epoch": 0.31845670979101276,
"eval_loss": 0.6390168070793152,
"eval_runtime": 29.9843,
"eval_samples_per_second": 17.576,
"eval_steps_per_second": 4.402,
"eval_token_acc": 0.8199958675551423,
"step": 260
},
{
"epoch": 0.3245808772869938,
"grad_norm": 1.1289619207382202,
"learning_rate": 9.713635684676701e-06,
"loss": 0.6217617988586426,
"memory(GiB)": 32.12,
"step": 265,
"token_acc": 0.8116217614406958,
"train_speed(iter/s)": 0.12414
},
{
"epoch": 0.3307050447829748,
"grad_norm": 1.0119985342025757,
"learning_rate": 9.702836871217838e-06,
"loss": 0.6184762001037598,
"memory(GiB)": 32.12,
"step": 270,
"token_acc": 0.8169801035704551,
"train_speed(iter/s)": 0.124781
},
{
"epoch": 0.33682921227895585,
"grad_norm": 1.121479868888855,
"learning_rate": 9.691844426552488e-06,
"loss": 0.6679095268249512,
"memory(GiB)": 32.12,
"step": 275,
"token_acc": 0.8053990302712619,
"train_speed(iter/s)": 0.125508
},
{
"epoch": 0.34295337977493684,
"grad_norm": 1.1491762399673462,
"learning_rate": 9.68065880327562e-06,
"loss": 0.6015125274658203,
"memory(GiB)": 32.12,
"step": 280,
"token_acc": 0.8035652005425306,
"train_speed(iter/s)": 0.126152
},
{
"epoch": 0.34295337977493684,
"eval_loss": 0.6370740532875061,
"eval_runtime": 30.1382,
"eval_samples_per_second": 17.486,
"eval_steps_per_second": 4.38,
"eval_token_acc": 0.8203677875923343,
"step": 280
},
{
"epoch": 0.3490775472709179,
"grad_norm": 0.9209638833999634,
"learning_rate": 9.669280461936004e-06,
"loss": 0.6419768333435059,
"memory(GiB)": 32.12,
"step": 285,
"token_acc": 0.8053348308297066,
"train_speed(iter/s)": 0.124607
},
{
"epoch": 0.3552017147668989,
"grad_norm": 1.0387402772903442,
"learning_rate": 9.657709871017243e-06,
"loss": 0.6462045669555664,
"memory(GiB)": 32.12,
"step": 290,
"token_acc": 0.8079543874287304,
"train_speed(iter/s)": 0.125198
},
{
"epoch": 0.36132588226287987,
"grad_norm": 1.0344866514205933,
"learning_rate": 9.645947506918482e-06,
"loss": 0.6486867904663086,
"memory(GiB)": 32.12,
"step": 295,
"token_acc": 0.8222080724468499,
"train_speed(iter/s)": 0.12567
},
{
"epoch": 0.3674500497588609,
"grad_norm": 1.1158808469772339,
"learning_rate": 9.633993853934803e-06,
"loss": 0.6453632354736328,
"memory(GiB)": 32.12,
"step": 300,
"token_acc": 0.8016542381712202,
"train_speed(iter/s)": 0.126206
},
{
"epoch": 0.3674500497588609,
"eval_loss": 0.6345298886299133,
"eval_runtime": 30.2223,
"eval_samples_per_second": 17.437,
"eval_steps_per_second": 4.368,
"eval_token_acc": 0.821328581021747,
"step": 300
},
{
"epoch": 0.3735742172548419,
"grad_norm": 0.9398248195648193,
"learning_rate": 9.621849404237274e-06,
"loss": 0.6055630683898926,
"memory(GiB)": 32.12,
"step": 305,
"token_acc": 0.8250450524809817,
"train_speed(iter/s)": 0.124632
},
{
"epoch": 0.37969838475082296,
"grad_norm": 1.048771858215332,
"learning_rate": 9.60951465785269e-06,
"loss": 0.6380780220031739,
"memory(GiB)": 32.12,
"step": 310,
"token_acc": 0.815754208203955,
"train_speed(iter/s)": 0.125253
},
{
"epoch": 0.38582255224680395,
"grad_norm": 1.026041865348816,
"learning_rate": 9.596990122642984e-06,
"loss": 0.6475009441375732,
"memory(GiB)": 32.12,
"step": 315,
"token_acc": 0.8045922028222913,
"train_speed(iter/s)": 0.12584
},
{
"epoch": 0.39194671974278494,
"grad_norm": 1.112762212753296,
"learning_rate": 9.584276314284316e-06,
"loss": 0.6385052680969239,
"memory(GiB)": 32.12,
"step": 320,
"token_acc": 0.792560957804059,
"train_speed(iter/s)": 0.126392
},
{
"epoch": 0.39194671974278494,
"eval_loss": 0.633259654045105,
"eval_runtime": 30.0477,
"eval_samples_per_second": 17.539,
"eval_steps_per_second": 4.393,
"eval_token_acc": 0.8214318921431892,
"step": 320
},
{
"epoch": 0.398070887238766,
"grad_norm": 1.0934439897537231,
"learning_rate": 9.571373756245842e-06,
"loss": 0.6589271545410156,
"memory(GiB)": 32.12,
"step": 325,
"token_acc": 0.8136614281775572,
"train_speed(iter/s)": 0.124935
},
{
"epoch": 0.404195054734747,
"grad_norm": 1.0657905340194702,
"learning_rate": 9.558282979768164e-06,
"loss": 0.6037847995758057,
"memory(GiB)": 32.12,
"step": 330,
"token_acc": 0.7959645802352305,
"train_speed(iter/s)": 0.125393
},
{
"epoch": 0.41031922223072803,
"grad_norm": 1.0330870151519775,
"learning_rate": 9.545004523841452e-06,
"loss": 0.6114434242248535,
"memory(GiB)": 32.12,
"step": 335,
"token_acc": 0.8286545017044316,
"train_speed(iter/s)": 0.125804
},
{
"epoch": 0.416443389726709,
"grad_norm": 1.0614529848098755,
"learning_rate": 9.531538935183252e-06,
"loss": 0.6515423774719238,
"memory(GiB)": 32.12,
"step": 340,
"token_acc": 0.796179652197727,
"train_speed(iter/s)": 0.126217
},
{
"epoch": 0.416443389726709,
"eval_loss": 0.6336191296577454,
"eval_runtime": 30.124,
"eval_samples_per_second": 17.494,
"eval_steps_per_second": 4.382,
"eval_token_acc": 0.8210754687742136,
"step": 340
},
{
"epoch": 0.42256755722269,
"grad_norm": 0.917084276676178,
"learning_rate": 9.517886768215978e-06,
"loss": 0.5718442916870117,
"memory(GiB)": 32.12,
"step": 345,
"token_acc": 0.829221986539922,
"train_speed(iter/s)": 0.124734
},
{
"epoch": 0.42869172471867106,
"grad_norm": 1.054498314857483,
"learning_rate": 9.50404858504409e-06,
"loss": 0.5831773757934571,
"memory(GiB)": 32.12,
"step": 350,
"token_acc": 0.830837973923772,
"train_speed(iter/s)": 0.125116
},
{
"epoch": 0.43481589221465206,
"grad_norm": 1.1211163997650146,
"learning_rate": 9.490024955430936e-06,
"loss": 0.6414088249206543,
"memory(GiB)": 32.12,
"step": 355,
"token_acc": 0.7959719461425946,
"train_speed(iter/s)": 0.125645
},
{
"epoch": 0.4409400597106331,
"grad_norm": 1.011711835861206,
"learning_rate": 9.475816456775313e-06,
"loss": 0.6285918235778809,
"memory(GiB)": 32.12,
"step": 360,
"token_acc": 0.8033112582781456,
"train_speed(iter/s)": 0.126108
},
{
"epoch": 0.4409400597106331,
"eval_loss": 0.6306507587432861,
"eval_runtime": 30.1935,
"eval_samples_per_second": 17.454,
"eval_steps_per_second": 4.372,
"eval_token_acc": 0.8219639444186166,
"step": 360
},
{
"epoch": 0.4470642272066141,
"grad_norm": 0.9317595958709717,
"learning_rate": 9.46142367408767e-06,
"loss": 0.6225271224975586,
"memory(GiB)": 32.12,
"step": 365,
"token_acc": 0.8147164353461464,
"train_speed(iter/s)": 0.124805
},
{
"epoch": 0.45318839470259514,
"grad_norm": 1.0820285081863403,
"learning_rate": 9.446847199966042e-06,
"loss": 0.6166964530944824,
"memory(GiB)": 32.12,
"step": 370,
"token_acc": 0.8163280356945315,
"train_speed(iter/s)": 0.12523
},
{
"epoch": 0.45931256219857614,
"grad_norm": 1.0026404857635498,
"learning_rate": 9.432087634571638e-06,
"loss": 0.614093542098999,
"memory(GiB)": 32.12,
"step": 375,
"token_acc": 0.8399052293112201,
"train_speed(iter/s)": 0.125657
},
{
"epoch": 0.4654367296945571,
"grad_norm": 1.0475082397460938,
"learning_rate": 9.417145585604139e-06,
"loss": 0.5946948051452636,
"memory(GiB)": 32.12,
"step": 380,
"token_acc": 0.8166074313408723,
"train_speed(iter/s)": 0.126073
},
{
"epoch": 0.4654367296945571,
"eval_loss": 0.6289507150650024,
"eval_runtime": 30.1648,
"eval_samples_per_second": 17.471,
"eval_steps_per_second": 4.376,
"eval_token_acc": 0.8223720233483135,
"step": 380
},
{
"epoch": 0.4715608971905382,
"grad_norm": 1.0893336534500122,
"learning_rate": 9.402021668276669e-06,
"loss": 0.6302263259887695,
"memory(GiB)": 32.12,
"step": 385,
"token_acc": 0.8211241269317494,
"train_speed(iter/s)": 0.124795
},
{
"epoch": 0.47768506468651917,
"grad_norm": 0.9649361968040466,
"learning_rate": 9.386716505290467e-06,
"loss": 0.6190325736999511,
"memory(GiB)": 32.12,
"step": 390,
"token_acc": 0.8042954275641256,
"train_speed(iter/s)": 0.125176
},
{
"epoch": 0.4838092321825002,
"grad_norm": 1.1449054479599,
"learning_rate": 9.371230726809258e-06,
"loss": 0.6380712032318115,
"memory(GiB)": 32.12,
"step": 395,
"token_acc": 0.7973576211390309,
"train_speed(iter/s)": 0.125508
},
{
"epoch": 0.4899333996784812,
"grad_norm": 1.0124316215515137,
"learning_rate": 9.355564970433288e-06,
"loss": 0.6248594284057617,
"memory(GiB)": 32.12,
"step": 400,
"token_acc": 0.8122048129544943,
"train_speed(iter/s)": 0.12583
},
{
"epoch": 0.4899333996784812,
"eval_loss": 0.6282716989517212,
"eval_runtime": 30.1457,
"eval_samples_per_second": 17.482,
"eval_steps_per_second": 4.379,
"eval_token_acc": 0.822165401105429,
"step": 400
},
{
"epoch": 0.4960575671744622,
"grad_norm": 1.1087108850479126,
"learning_rate": 9.339719881173093e-06,
"loss": 0.5993568420410156,
"memory(GiB)": 32.12,
"step": 405,
"token_acc": 0.8224255219329111,
"train_speed(iter/s)": 0.124698
},
{
"epoch": 0.5021817346704432,
"grad_norm": 1.1231392621994019,
"learning_rate": 9.323696111422921e-06,
"loss": 0.6480292797088623,
"memory(GiB)": 32.12,
"step": 410,
"token_acc": 0.8032305992609242,
"train_speed(iter/s)": 0.125122
},
{
"epoch": 0.5083059021664242,
"grad_norm": 1.007530689239502,
"learning_rate": 9.307494320933893e-06,
"loss": 0.5892566204071045,
"memory(GiB)": 32.12,
"step": 415,
"token_acc": 0.8197306397306398,
"train_speed(iter/s)": 0.12542
},
{
"epoch": 0.5144300696624052,
"grad_norm": 1.0489633083343506,
"learning_rate": 9.291115176786814e-06,
"loss": 0.604928731918335,
"memory(GiB)": 32.12,
"step": 420,
"token_acc": 0.8093353125515421,
"train_speed(iter/s)": 0.125868
},
{
"epoch": 0.5144300696624052,
"eval_loss": 0.6283579468727112,
"eval_runtime": 30.1281,
"eval_samples_per_second": 17.492,
"eval_steps_per_second": 4.381,
"eval_token_acc": 0.8228782478433804,
"step": 420
},
{
"epoch": 0.5205542371583863,
"grad_norm": 1.126206636428833,
"learning_rate": 9.274559353364734e-06,
"loss": 0.6561414718627929,
"memory(GiB)": 32.12,
"step": 425,
"token_acc": 0.8186422368439512,
"train_speed(iter/s)": 0.124654
},
{
"epoch": 0.5266784046543673,
"grad_norm": 1.1023788452148438,
"learning_rate": 9.257827532325159e-06,
"loss": 0.6316391944885253,
"memory(GiB)": 32.12,
"step": 430,
"token_acc": 0.8269044804985495,
"train_speed(iter/s)": 0.125027
},
{
"epoch": 0.5328025721503483,
"grad_norm": 1.0733466148376465,
"learning_rate": 9.240920402571995e-06,
"loss": 0.6313365936279297,
"memory(GiB)": 32.12,
"step": 435,
"token_acc": 0.7986597170513775,
"train_speed(iter/s)": 0.125308
},
{
"epoch": 0.5389267396463293,
"grad_norm": 1.0384747982025146,
"learning_rate": 9.223838660227183e-06,
"loss": 0.5966384410858154,
"memory(GiB)": 32.12,
"step": 440,
"token_acc": 0.8461405390443899,
"train_speed(iter/s)": 0.125729
},
{
"epoch": 0.5389267396463293,
"eval_loss": 0.627018392086029,
"eval_runtime": 30.1938,
"eval_samples_per_second": 17.454,
"eval_steps_per_second": 4.372,
"eval_token_acc": 0.8226509633762075,
"step": 440
},
{
"epoch": 0.5450509071423103,
"grad_norm": 0.9662637710571289,
"learning_rate": 9.206583008602039e-06,
"loss": 0.6196205615997314,
"memory(GiB)": 32.12,
"step": 445,
"token_acc": 0.814478517479054,
"train_speed(iter/s)": 0.124648
},
{
"epoch": 0.5511750746382914,
"grad_norm": 1.0670920610427856,
"learning_rate": 9.189154158168293e-06,
"loss": 0.6371576309204101,
"memory(GiB)": 32.12,
"step": 450,
"token_acc": 0.8166518012952263,
"train_speed(iter/s)": 0.125033
},
{
"epoch": 0.5572992421342724,
"grad_norm": 0.9831814765930176,
"learning_rate": 9.171552826528832e-06,
"loss": 0.549981689453125,
"memory(GiB)": 32.12,
"step": 455,
"token_acc": 0.8483598990707121,
"train_speed(iter/s)": 0.125327
},
{
"epoch": 0.5634234096302534,
"grad_norm": 1.0345466136932373,
"learning_rate": 9.15377973838817e-06,
"loss": 0.6600610733032226,
"memory(GiB)": 32.12,
"step": 460,
"token_acc": 0.8050162396246843,
"train_speed(iter/s)": 0.125742
},
{
"epoch": 0.5634234096302534,
"eval_loss": 0.6251854300498962,
"eval_runtime": 29.9543,
"eval_samples_per_second": 17.593,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.823002221189111,
"step": 460
},
{
"epoch": 0.5695475771262344,
"grad_norm": 0.9955180287361145,
"learning_rate": 9.135835625522585e-06,
"loss": 0.5820852279663086,
"memory(GiB)": 32.12,
"step": 465,
"token_acc": 0.8245337424184316,
"train_speed(iter/s)": 0.124745
},
{
"epoch": 0.5756717446222154,
"grad_norm": 1.0886976718902588,
"learning_rate": 9.117721226750019e-06,
"loss": 0.6092466354370117,
"memory(GiB)": 32.12,
"step": 470,
"token_acc": 0.8014575318141463,
"train_speed(iter/s)": 0.125111
},
{
"epoch": 0.5817959121181965,
"grad_norm": 0.9607824087142944,
"learning_rate": 9.099437287899634e-06,
"loss": 0.6091058731079102,
"memory(GiB)": 32.12,
"step": 475,
"token_acc": 0.8267922900839333,
"train_speed(iter/s)": 0.125396
},
{
"epoch": 0.5879200796141775,
"grad_norm": 1.0514999628067017,
"learning_rate": 9.08098456178111e-06,
"loss": 0.620454978942871,
"memory(GiB)": 32.12,
"step": 480,
"token_acc": 0.8148612218551847,
"train_speed(iter/s)": 0.125738
},
{
"epoch": 0.5879200796141775,
"eval_loss": 0.6268740296363831,
"eval_runtime": 29.9706,
"eval_samples_per_second": 17.584,
"eval_steps_per_second": 4.404,
"eval_token_acc": 0.8231468567591301,
"step": 480
},
{
"epoch": 0.5940442471101585,
"grad_norm": 1.007743239402771,
"learning_rate": 9.06236380815366e-06,
"loss": 0.6164707183837891,
"memory(GiB)": 32.12,
"step": 485,
"token_acc": 0.8169734151329243,
"train_speed(iter/s)": 0.124816
},
{
"epoch": 0.6001684146061395,
"grad_norm": 1.0523988008499146,
"learning_rate": 9.043575793694733e-06,
"loss": 0.6281998157501221,
"memory(GiB)": 32.12,
"step": 490,
"token_acc": 0.8149852592219745,
"train_speed(iter/s)": 0.125171
},
{
"epoch": 0.6062925821021204,
"grad_norm": 0.9333862066268921,
"learning_rate": 9.024621291968461e-06,
"loss": 0.6068775653839111,
"memory(GiB)": 32.12,
"step": 495,
"token_acc": 0.8320725141416206,
"train_speed(iter/s)": 0.125492
},
{
"epoch": 0.6124167495981016,
"grad_norm": 1.0344208478927612,
"learning_rate": 9.005501083393799e-06,
"loss": 0.6447205543518066,
"memory(GiB)": 32.12,
"step": 500,
"token_acc": 0.8103164010363508,
"train_speed(iter/s)": 0.125823
},
{
"epoch": 0.6124167495981016,
"eval_loss": 0.6257410645484924,
"eval_runtime": 30.1552,
"eval_samples_per_second": 17.476,
"eval_steps_per_second": 4.377,
"eval_token_acc": 0.82304871119376,
"step": 500
},
{
"epoch": 0.6185409170940825,
"grad_norm": 1.125075340270996,
"learning_rate": 8.986215955212394e-06,
"loss": 0.6383331775665283,
"memory(GiB)": 32.12,
"step": 505,
"token_acc": 0.8156163386432991,
"train_speed(iter/s)": 0.124876
},
{
"epoch": 0.6246650845900635,
"grad_norm": 1.0361703634262085,
"learning_rate": 8.966766701456177e-06,
"loss": 0.6330353736877441,
"memory(GiB)": 32.12,
"step": 510,
"token_acc": 0.7972571857974826,
"train_speed(iter/s)": 0.125207
},
{
"epoch": 0.6307892520860445,
"grad_norm": 0.9697835445404053,
"learning_rate": 8.947154122914666e-06,
"loss": 0.6002368927001953,
"memory(GiB)": 32.12,
"step": 515,
"token_acc": 0.8171989033478781,
"train_speed(iter/s)": 0.12553
},
{
"epoch": 0.6369134195820255,
"grad_norm": 1.0072293281555176,
"learning_rate": 8.927379027101994e-06,
"loss": 0.6289110660552979,
"memory(GiB)": 32.12,
"step": 520,
"token_acc": 0.8173692196055149,
"train_speed(iter/s)": 0.125831
},
{
"epoch": 0.6369134195820255,
"eval_loss": 0.6232544183731079,
"eval_runtime": 30.2166,
"eval_samples_per_second": 17.441,
"eval_steps_per_second": 4.368,
"eval_token_acc": 0.8239630146185237,
"step": 520
},
{
"epoch": 0.6430375870780066,
"grad_norm": 1.0802470445632935,
"learning_rate": 8.907442228223668e-06,
"loss": 0.6614001274108887,
"memory(GiB)": 32.12,
"step": 525,
"token_acc": 0.8152090832024681,
"train_speed(iter/s)": 0.12495
},
{
"epoch": 0.6491617545739876,
"grad_norm": 0.9386376142501831,
"learning_rate": 8.887344547143032e-06,
"loss": 0.6672756195068359,
"memory(GiB)": 32.12,
"step": 530,
"token_acc": 0.8442938796480637,
"train_speed(iter/s)": 0.125255
},
{
"epoch": 0.6552859220699686,
"grad_norm": 0.8939440846443176,
"learning_rate": 8.867086811347483e-06,
"loss": 0.6040712356567383,
"memory(GiB)": 32.12,
"step": 535,
"token_acc": 0.8285196511496206,
"train_speed(iter/s)": 0.125541
},
{
"epoch": 0.6614100895659496,
"grad_norm": 0.9224595427513123,
"learning_rate": 8.846669854914395e-06,
"loss": 0.6459405899047852,
"memory(GiB)": 32.12,
"step": 540,
"token_acc": 0.8066893233285104,
"train_speed(iter/s)": 0.125873
},
{
"epoch": 0.6614100895659496,
"eval_loss": 0.6218589544296265,
"eval_runtime": 30.088,
"eval_samples_per_second": 17.515,
"eval_steps_per_second": 4.387,
"eval_token_acc": 0.8244330802210857,
"step": 540
},
{
"epoch": 0.6675342570619306,
"grad_norm": 1.198673129081726,
"learning_rate": 8.826094518476775e-06,
"loss": 0.6059948921203613,
"memory(GiB)": 32.12,
"step": 545,
"token_acc": 0.8176160233107487,
"train_speed(iter/s)": 0.125029
},
{
"epoch": 0.6736584245579117,
"grad_norm": 0.9881957769393921,
"learning_rate": 8.805361649188657e-06,
"loss": 0.5907226085662842,
"memory(GiB)": 32.12,
"step": 550,
"token_acc": 0.8072239136451702,
"train_speed(iter/s)": 0.125389
},
{
"epoch": 0.6797825920538927,
"grad_norm": 1.027031660079956,
"learning_rate": 8.784472100690215e-06,
"loss": 0.6389594554901123,
"memory(GiB)": 32.12,
"step": 555,
"token_acc": 0.8127651442767055,
"train_speed(iter/s)": 0.125703
},
{
"epoch": 0.6859067595498737,
"grad_norm": 1.0879848003387451,
"learning_rate": 8.763426733072624e-06,
"loss": 0.6162051200866699,
"memory(GiB)": 32.12,
"step": 560,
"token_acc": 0.8103369683368982,
"train_speed(iter/s)": 0.125956
},
{
"epoch": 0.6859067595498737,
"eval_loss": 0.6215130686759949,
"eval_runtime": 30.1587,
"eval_samples_per_second": 17.474,
"eval_steps_per_second": 4.377,
"eval_token_acc": 0.8242212924221293,
"step": 560
},
{
"epoch": 0.6920309270458547,
"grad_norm": 0.9450560808181763,
"learning_rate": 8.742226412842636e-06,
"loss": 0.6049357414245605,
"memory(GiB)": 32.12,
"step": 565,
"token_acc": 0.828604110069801,
"train_speed(iter/s)": 0.12514
},
{
"epoch": 0.6981550945418358,
"grad_norm": 1.0816230773925781,
"learning_rate": 8.720872012886918e-06,
"loss": 0.6060591697692871,
"memory(GiB)": 32.12,
"step": 570,
"token_acc": 0.8121618953603159,
"train_speed(iter/s)": 0.125424
},
{
"epoch": 0.7042792620378168,
"grad_norm": 1.0539354085922241,
"learning_rate": 8.6993644124361e-06,
"loss": 0.6078558921813965,
"memory(GiB)": 32.12,
"step": 575,
"token_acc": 0.8203018867924529,
"train_speed(iter/s)": 0.125669
},
{
"epoch": 0.7104034295337978,
"grad_norm": 0.9692308306694031,
"learning_rate": 8.677704497028579e-06,
"loss": 0.6092854976654053,
"memory(GiB)": 32.12,
"step": 580,
"token_acc": 0.813650025657943,
"train_speed(iter/s)": 0.125955
},
{
"epoch": 0.7104034295337978,
"eval_loss": 0.6210135817527771,
"eval_runtime": 30.0497,
"eval_samples_per_second": 17.538,
"eval_steps_per_second": 4.393,
"eval_token_acc": 0.8236375845859807,
"step": 580
},
{
"epoch": 0.7165275970297788,
"grad_norm": 1.0405429601669312,
"learning_rate": 8.655893158474056e-06,
"loss": 0.626552963256836,
"memory(GiB)": 32.12,
"step": 585,
"token_acc": 0.8233875988502245,
"train_speed(iter/s)": 0.12517
},
{
"epoch": 0.7226517645257597,
"grad_norm": 0.9403768181800842,
"learning_rate": 8.633931294816823e-06,
"loss": 0.6014822483062744,
"memory(GiB)": 32.12,
"step": 590,
"token_acc": 0.8108660890260682,
"train_speed(iter/s)": 0.125469
},
{
"epoch": 0.7287759320217408,
"grad_norm": 0.8821709752082825,
"learning_rate": 8.611819810298778e-06,
"loss": 0.6129745483398438,
"memory(GiB)": 32.12,
"step": 595,
"token_acc": 0.8298729368614095,
"train_speed(iter/s)": 0.125735
},
{
"epoch": 0.7349000995177218,
"grad_norm": 0.9912955164909363,
"learning_rate": 8.58955961532221e-06,
"loss": 0.5840856075286865,
"memory(GiB)": 32.12,
"step": 600,
"token_acc": 0.8248710481069511,
"train_speed(iter/s)": 0.125926
},
{
"epoch": 0.7349000995177218,
"eval_loss": 0.6222088932991028,
"eval_runtime": 30.0444,
"eval_samples_per_second": 17.541,
"eval_steps_per_second": 4.393,
"eval_token_acc": 0.8234412934552404,
"step": 600
},
{
"epoch": 0.7410242670137028,
"grad_norm": 1.037676453590393,
"learning_rate": 8.567151626412295e-06,
"loss": 0.6406550884246827,
"memory(GiB)": 32.12,
"step": 605,
"token_acc": 0.8165861175316943,
"train_speed(iter/s)": 0.125193
},
{
"epoch": 0.7471484345096838,
"grad_norm": 1.0239970684051514,
"learning_rate": 8.544596766179377e-06,
"loss": 0.5927177429199219,
"memory(GiB)": 32.12,
"step": 610,
"token_acc": 0.8224873999407056,
"train_speed(iter/s)": 0.125437
},
{
"epoch": 0.7532726020056648,
"grad_norm": 1.0227631330490112,
"learning_rate": 8.521895963280967e-06,
"loss": 0.5963564395904541,
"memory(GiB)": 32.12,
"step": 615,
"token_acc": 0.827058931465794,
"train_speed(iter/s)": 0.125665
},
{
"epoch": 0.7593967695016459,
"grad_norm": 1.0962486267089844,
"learning_rate": 8.499050152383519e-06,
"loss": 0.6459769248962403,
"memory(GiB)": 32.12,
"step": 620,
"token_acc": 0.8193405375450263,
"train_speed(iter/s)": 0.125922
},
{
"epoch": 0.7593967695016459,
"eval_loss": 0.6196050643920898,
"eval_runtime": 29.9245,
"eval_samples_per_second": 17.611,
"eval_steps_per_second": 4.411,
"eval_token_acc": 0.8242987757632109,
"step": 620
},
{
"epoch": 0.7655209369976269,
"grad_norm": 1.0802922248840332,
"learning_rate": 8.476060274123938e-06,
"loss": 0.5952530860900879,
"memory(GiB)": 32.12,
"step": 625,
"token_acc": 0.8222949637294201,
"train_speed(iter/s)": 0.125113
},
{
"epoch": 0.7716451044936079,
"grad_norm": 1.0066949129104614,
"learning_rate": 8.452927275070858e-06,
"loss": 0.6043106079101562,
"memory(GiB)": 32.12,
"step": 630,
"token_acc": 0.8247297031649302,
"train_speed(iter/s)": 0.125399
},
{
"epoch": 0.7777692719895889,
"grad_norm": 0.9393348693847656,
"learning_rate": 8.429652107685662e-06,
"loss": 0.633615779876709,
"memory(GiB)": 32.12,
"step": 635,
"token_acc": 0.8054982337099087,
"train_speed(iter/s)": 0.125629
},
{
"epoch": 0.7838934394855699,
"grad_norm": 1.0759190320968628,
"learning_rate": 8.40623573028327e-06,
"loss": 0.6218441009521485,
"memory(GiB)": 32.12,
"step": 640,
"token_acc": 0.8195294533875818,
"train_speed(iter/s)": 0.125911
},
{
"epoch": 0.7838934394855699,
"eval_loss": 0.6197024583816528,
"eval_runtime": 29.9919,
"eval_samples_per_second": 17.571,
"eval_steps_per_second": 4.401,
"eval_token_acc": 0.8242006301978408,
"step": 640
},
{
"epoch": 0.790017606981551,
"grad_norm": 0.8828935623168945,
"learning_rate": 8.382679106992687e-06,
"loss": 0.6121187210083008,
"memory(GiB)": 32.12,
"step": 645,
"token_acc": 0.8180191693290735,
"train_speed(iter/s)": 0.125215
},
{
"epoch": 0.796141774477532,
"grad_norm": 1.0172326564788818,
"learning_rate": 8.358983207717286e-06,
"loss": 0.6195911407470703,
"memory(GiB)": 32.12,
"step": 650,
"token_acc": 0.802275960170697,
"train_speed(iter/s)": 0.125435
},
{
"epoch": 0.802265941973513,
"grad_norm": 1.0442404747009277,
"learning_rate": 8.335149008094906e-06,
"loss": 0.5969693660736084,
"memory(GiB)": 32.12,
"step": 655,
"token_acc": 0.8279867846104657,
"train_speed(iter/s)": 0.125647
},
{
"epoch": 0.808390109469494,
"grad_norm": 0.9861543774604797,
"learning_rate": 8.311177489457653e-06,
"loss": 0.6027172088623047,
"memory(GiB)": 32.12,
"step": 660,
"token_acc": 0.8264142409459503,
"train_speed(iter/s)": 0.125901
},
{
"epoch": 0.808390109469494,
"eval_loss": 0.6189049482345581,
"eval_runtime": 30.0122,
"eval_samples_per_second": 17.56,
"eval_steps_per_second": 4.398,
"eval_token_acc": 0.8247378480293404,
"step": 660
},
{
"epoch": 0.814514276965475,
"grad_norm": 0.9293588399887085,
"learning_rate": 8.28706963879151e-06,
"loss": 0.5741694927215576,
"memory(GiB)": 32.12,
"step": 665,
"token_acc": 0.8175586677777479,
"train_speed(iter/s)": 0.125199
},
{
"epoch": 0.8206384444614561,
"grad_norm": 0.8590324521064758,
"learning_rate": 8.2628264486957e-06,
"loss": 0.6139655113220215,
"memory(GiB)": 32.12,
"step": 670,
"token_acc": 0.8059371841425523,
"train_speed(iter/s)": 0.125416
},
{
"epoch": 0.826762611957437,
"grad_norm": 1.0524649620056152,
"learning_rate": 8.23844891734181e-06,
"loss": 0.5746917724609375,
"memory(GiB)": 32.12,
"step": 675,
"token_acc": 0.8064527770760402,
"train_speed(iter/s)": 0.125689
},
{
"epoch": 0.832886779453418,
"grad_norm": 1.0562033653259277,
"learning_rate": 8.213938048432697e-06,
"loss": 0.5949201583862305,
"memory(GiB)": 32.12,
"step": 680,
"token_acc": 0.8262676641729011,
"train_speed(iter/s)": 0.125943
},
{
"epoch": 0.832886779453418,
"eval_loss": 0.6181398630142212,
"eval_runtime": 29.812,
"eval_samples_per_second": 17.677,
"eval_steps_per_second": 4.428,
"eval_token_acc": 0.8249702980525854,
"step": 680
},
{
"epoch": 0.839010946949399,
"grad_norm": 0.8997740149497986,
"learning_rate": 8.189294851161164e-06,
"loss": 0.6027894496917725,
"memory(GiB)": 32.12,
"step": 685,
"token_acc": 0.8181957698532284,
"train_speed(iter/s)": 0.125279
},
{
"epoch": 0.84513511444538,
"grad_norm": 0.9756171703338623,
"learning_rate": 8.164520340168404e-06,
"loss": 0.6199028015136718,
"memory(GiB)": 32.12,
"step": 690,
"token_acc": 0.816996805111821,
"train_speed(iter/s)": 0.125482
},
{
"epoch": 0.8512592819413611,
"grad_norm": 1.0920320749282837,
"learning_rate": 8.139615535502227e-06,
"loss": 0.6447176933288574,
"memory(GiB)": 32.12,
"step": 695,
"token_acc": 0.8085919407132932,
"train_speed(iter/s)": 0.125747
},
{
"epoch": 0.8573834494373421,
"grad_norm": 0.9692983627319336,
"learning_rate": 8.114581462575063e-06,
"loss": 0.6160262107849122,
"memory(GiB)": 32.12,
"step": 700,
"token_acc": 0.8179973169137629,
"train_speed(iter/s)": 0.12598
},
{
"epoch": 0.8573834494373421,
"eval_loss": 0.6175059080123901,
"eval_runtime": 30.0657,
"eval_samples_per_second": 17.528,
"eval_steps_per_second": 4.39,
"eval_token_acc": 0.8250374502815228,
"step": 700
},
{
"epoch": 0.8635076169333231,
"grad_norm": 0.9100112318992615,
"learning_rate": 8.089419152121736e-06,
"loss": 0.572049617767334,
"memory(GiB)": 32.12,
"step": 705,
"token_acc": 0.8311790668348046,
"train_speed(iter/s)": 0.125371
},
{
"epoch": 0.8696317844293041,
"grad_norm": 0.967882513999939,
"learning_rate": 8.064129640157033e-06,
"loss": 0.6320825576782226,
"memory(GiB)": 32.12,
"step": 710,
"token_acc": 0.8260635252113577,
"train_speed(iter/s)": 0.125628
},
{
"epoch": 0.8757559519252851,
"grad_norm": 0.9655548334121704,
"learning_rate": 8.038713967933043e-06,
"loss": 0.6211101055145264,
"memory(GiB)": 32.12,
"step": 715,
"token_acc": 0.818314430545932,
"train_speed(iter/s)": 0.125843
},
{
"epoch": 0.8818801194212662,
"grad_norm": 0.888866126537323,
"learning_rate": 8.013173181896283e-06,
"loss": 0.6331143379211426,
"memory(GiB)": 32.12,
"step": 720,
"token_acc": 0.8223672789139266,
"train_speed(iter/s)": 0.126057
},
{
"epoch": 0.8818801194212662,
"eval_loss": 0.6172851324081421,
"eval_runtime": 29.7983,
"eval_samples_per_second": 17.686,
"eval_steps_per_second": 4.43,
"eval_token_acc": 0.8249289736040085,
"step": 720
},
{
"epoch": 0.8880042869172472,
"grad_norm": 0.9664549827575684,
"learning_rate": 7.98750833364462e-06,
"loss": 0.6372400760650635,
"memory(GiB)": 32.12,
"step": 725,
"token_acc": 0.8170195878334325,
"train_speed(iter/s)": 0.125415
},
{
"epoch": 0.8941284544132282,
"grad_norm": 1.000975251197815,
"learning_rate": 7.961720479883967e-06,
"loss": 0.5750507354736328,
"memory(GiB)": 32.12,
"step": 730,
"token_acc": 0.8278389461108779,
"train_speed(iter/s)": 0.125636
},
{
"epoch": 0.9002526219092092,
"grad_norm": 1.094359278678894,
"learning_rate": 7.935810682384777e-06,
"loss": 0.5872611045837403,
"memory(GiB)": 32.12,
"step": 735,
"token_acc": 0.8236509437265819,
"train_speed(iter/s)": 0.125855
},
{
"epoch": 0.9063767894051903,
"grad_norm": 0.9531553387641907,
"learning_rate": 7.909780007938327e-06,
"loss": 0.597745418548584,
"memory(GiB)": 32.12,
"step": 740,
"token_acc": 0.8098763707480617,
"train_speed(iter/s)": 0.12604
},
{
"epoch": 0.9063767894051903,
"eval_loss": 0.616245687007904,
"eval_runtime": 30.0544,
"eval_samples_per_second": 17.535,
"eval_steps_per_second": 4.392,
"eval_token_acc": 0.8252440725244072,
"step": 740
},
{
"epoch": 0.9125009569011713,
"grad_norm": 1.0165457725524902,
"learning_rate": 7.883629528312794e-06,
"loss": 0.6201919555664063,
"memory(GiB)": 32.12,
"step": 745,
"token_acc": 0.8185958200091366,
"train_speed(iter/s)": 0.125398
},
{
"epoch": 0.9186251243971523,
"grad_norm": 1.1827678680419922,
"learning_rate": 7.857360320209126e-06,
"loss": 0.6491155624389648,
"memory(GiB)": 32.12,
"step": 750,
"token_acc": 0.8059945706742032,
"train_speed(iter/s)": 0.125601
},
{
"epoch": 0.9247492918931333,
"grad_norm": 1.0143615007400513,
"learning_rate": 7.830973465216712e-06,
"loss": 0.6207675933837891,
"memory(GiB)": 32.12,
"step": 755,
"token_acc": 0.8050835253456221,
"train_speed(iter/s)": 0.1258
},
{
"epoch": 0.9308734593891143,
"grad_norm": 0.929755449295044,
"learning_rate": 7.80447004976885e-06,
"loss": 0.5987899780273438,
"memory(GiB)": 32.12,
"step": 760,
"token_acc": 0.8344898639435169,
"train_speed(iter/s)": 0.126004
},
{
"epoch": 0.9308734593891143,
"eval_loss": 0.6154034733772278,
"eval_runtime": 29.9062,
"eval_samples_per_second": 17.622,
"eval_steps_per_second": 4.414,
"eval_token_acc": 0.8256728136783925,
"step": 760
},
{
"epoch": 0.9369976268850954,
"grad_norm": 0.9682816863059998,
"learning_rate": 7.777851165098012e-06,
"loss": 0.6217115879058838,
"memory(GiB)": 32.12,
"step": 765,
"token_acc": 0.8253138075313807,
"train_speed(iter/s)": 0.125416
},
{
"epoch": 0.9431217943810764,
"grad_norm": 0.9861950278282166,
"learning_rate": 7.751117907190919e-06,
"loss": 0.6429153442382812,
"memory(GiB)": 32.12,
"step": 770,
"token_acc": 0.7975187624444785,
"train_speed(iter/s)": 0.125587
},
{
"epoch": 0.9492459618770573,
"grad_norm": 1.0501208305358887,
"learning_rate": 7.724271376743408e-06,
"loss": 0.6119437694549561,
"memory(GiB)": 32.12,
"step": 775,
"token_acc": 0.8173751624280676,
"train_speed(iter/s)": 0.125823
},
{
"epoch": 0.9553701293730383,
"grad_norm": 0.865284264087677,
"learning_rate": 7.697312679115126e-06,
"loss": 0.6217618465423584,
"memory(GiB)": 32.12,
"step": 780,
"token_acc": 0.8141066272272696,
"train_speed(iter/s)": 0.126064
},
{
"epoch": 0.9553701293730383,
"eval_loss": 0.616006076335907,
"eval_runtime": 29.936,
"eval_samples_per_second": 17.604,
"eval_steps_per_second": 4.409,
"eval_token_acc": 0.8253008936412005,
"step": 780
},
{
"epoch": 0.9614942968690193,
"grad_norm": 0.9814106822013855,
"learning_rate": 7.670242924284e-06,
"loss": 0.6097393989562988,
"memory(GiB)": 32.12,
"step": 785,
"token_acc": 0.8181161935170403,
"train_speed(iter/s)": 0.125512
},
{
"epoch": 0.9676184643650004,
"grad_norm": 0.9246596693992615,
"learning_rate": 7.643063226800556e-06,
"loss": 0.5933025360107422,
"memory(GiB)": 32.12,
"step": 790,
"token_acc": 0.8117928174854171,
"train_speed(iter/s)": 0.125727
},
{
"epoch": 0.9737426318609814,
"grad_norm": 0.9575701951980591,
"learning_rate": 7.615774705742012e-06,
"loss": 0.6192699432373047,
"memory(GiB)": 32.12,
"step": 795,
"token_acc": 0.8193493150684932,
"train_speed(iter/s)": 0.125968
},
{
"epoch": 0.9798667993569624,
"grad_norm": 0.9669679403305054,
"learning_rate": 7.588378484666214e-06,
"loss": 0.5967386722564697,
"memory(GiB)": 32.12,
"step": 800,
"token_acc": 0.8374201589032559,
"train_speed(iter/s)": 0.126204
},
{
"epoch": 0.9798667993569624,
"eval_loss": 0.6141526699066162,
"eval_runtime": 29.9762,
"eval_samples_per_second": 17.581,
"eval_steps_per_second": 4.403,
"eval_token_acc": 0.825512681440157,
"step": 800
},
{
"epoch": 0.9859909668529434,
"grad_norm": 1.0676608085632324,
"learning_rate": 7.560875691565366e-06,
"loss": 0.6506372451782226,
"memory(GiB)": 32.12,
"step": 805,
"token_acc": 0.8162111860741997,
"train_speed(iter/s)": 0.125683
},
{
"epoch": 0.9921151343489244,
"grad_norm": 1.1090500354766846,
"learning_rate": 7.533267458819597e-06,
"loss": 0.6549376487731934,
"memory(GiB)": 32.12,
"step": 810,
"token_acc": 0.7903715821453611,
"train_speed(iter/s)": 0.125882
},
{
"epoch": 0.9982393018449055,
"grad_norm": 1.0215500593185425,
"learning_rate": 7.505554923150329e-06,
"loss": 0.6107999324798584,
"memory(GiB)": 32.12,
"step": 815,
"token_acc": 0.8255743651753326,
"train_speed(iter/s)": 0.126031
},
{
"epoch": 1.0036745004975887,
"grad_norm": 0.9610656499862671,
"learning_rate": 7.477739225573475e-06,
"loss": 0.5486949920654297,
"memory(GiB)": 32.12,
"step": 820,
"token_acc": 0.8164336957325642,
"train_speed(iter/s)": 0.126279
},
{
"epoch": 1.0036745004975887,
"eval_loss": 0.6169166564941406,
"eval_runtime": 29.8644,
"eval_samples_per_second": 17.646,
"eval_steps_per_second": 4.42,
"eval_token_acc": 0.8263649981920553,
"step": 820
},
{
"epoch": 1.0097986679935695,
"grad_norm": 0.8930652141571045,
"learning_rate": 7.449821511352465e-06,
"loss": 0.5580629348754883,
"memory(GiB)": 32.12,
"step": 825,
"token_acc": 0.8275424871864041,
"train_speed(iter/s)": 0.125684
},
{
"epoch": 1.0159228354895506,
"grad_norm": 1.043878197669983,
"learning_rate": 7.421802929951088e-06,
"loss": 0.537553071975708,
"memory(GiB)": 32.12,
"step": 830,
"token_acc": 0.8182638888888889,
"train_speed(iter/s)": 0.12587
},
{
"epoch": 1.0220470029855317,
"grad_norm": 0.9812543392181396,
"learning_rate": 7.393684634986165e-06,
"loss": 0.544792366027832,
"memory(GiB)": 32.12,
"step": 835,
"token_acc": 0.8522354565855342,
"train_speed(iter/s)": 0.12604
},
{
"epoch": 1.0281711704815126,
"grad_norm": 0.9970709085464478,
"learning_rate": 7.365467784180051e-06,
"loss": 0.5357254028320313,
"memory(GiB)": 32.12,
"step": 840,
"token_acc": 0.8398415604798027,
"train_speed(iter/s)": 0.126238
},
{
"epoch": 1.0281711704815126,
"eval_loss": 0.6195093393325806,
"eval_runtime": 30.0227,
"eval_samples_per_second": 17.553,
"eval_steps_per_second": 4.397,
"eval_token_acc": 0.8253680458701379,
"step": 840
},
{
"epoch": 1.0342953379774937,
"grad_norm": 0.9345646500587463,
"learning_rate": 7.337153539312968e-06,
"loss": 0.5476717948913574,
"memory(GiB)": 32.12,
"step": 845,
"token_acc": 0.8294938351719663,
"train_speed(iter/s)": 0.125697
},
{
"epoch": 1.0404195054734746,
"grad_norm": 0.9843780398368835,
"learning_rate": 7.308743066175172e-06,
"loss": 0.5259488105773926,
"memory(GiB)": 32.12,
"step": 850,
"token_acc": 0.8343461220380425,
"train_speed(iter/s)": 0.125882
},
{
"epoch": 1.0465436729694557,
"grad_norm": 0.9147416949272156,
"learning_rate": 7.280237534518948e-06,
"loss": 0.5354435443878174,
"memory(GiB)": 32.12,
"step": 855,
"token_acc": 0.8242645320363428,
"train_speed(iter/s)": 0.126046
},
{
"epoch": 1.0526678404654368,
"grad_norm": 0.9699310660362244,
"learning_rate": 7.251638118010456e-06,
"loss": 0.5579245567321778,
"memory(GiB)": 32.12,
"step": 860,
"token_acc": 0.8199693263596681,
"train_speed(iter/s)": 0.126211
},
{
"epoch": 1.0526678404654368,
"eval_loss": 0.6175369620323181,
"eval_runtime": 30.0015,
"eval_samples_per_second": 17.566,
"eval_steps_per_second": 4.4,
"eval_token_acc": 0.8255643370008782,
"step": 860
},
{
"epoch": 1.0587920079614177,
"grad_norm": 0.9256744384765625,
"learning_rate": 7.222945994181403e-06,
"loss": 0.5566354751586914,
"memory(GiB)": 32.12,
"step": 865,
"token_acc": 0.8209406091511238,
"train_speed(iter/s)": 0.125682
},
{
"epoch": 1.0649161754573988,
"grad_norm": 0.9246125221252441,
"learning_rate": 7.194162344380561e-06,
"loss": 0.5399526596069336,
"memory(GiB)": 32.12,
"step": 870,
"token_acc": 0.8396965685046696,
"train_speed(iter/s)": 0.125842
},
{
"epoch": 1.0710403429533797,
"grad_norm": 0.9580938816070557,
"learning_rate": 7.16528835372512e-06,
"loss": 0.540044641494751,
"memory(GiB)": 32.12,
"step": 875,
"token_acc": 0.8386737552985128,
"train_speed(iter/s)": 0.126028
},
{
"epoch": 1.0771645104493608,
"grad_norm": 1.0862672328948975,
"learning_rate": 7.136325211051905e-06,
"loss": 0.5482538223266602,
"memory(GiB)": 32.12,
"step": 880,
"token_acc": 0.8172277019200394,
"train_speed(iter/s)": 0.126217
},
{
"epoch": 1.0771645104493608,
"eval_loss": 0.619035542011261,
"eval_runtime": 29.9689,
"eval_samples_per_second": 17.585,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.8250477813936671,
"step": 880
},
{
"epoch": 1.083288677945342,
"grad_norm": 0.8554603457450867,
"learning_rate": 7.107274108868422e-06,
"loss": 0.5296638965606689,
"memory(GiB)": 32.12,
"step": 885,
"token_acc": 0.8246657960985986,
"train_speed(iter/s)": 0.12573
},
{
"epoch": 1.0894128454413228,
"grad_norm": 0.9336580634117126,
"learning_rate": 7.078136243303754e-06,
"loss": 0.5232193946838379,
"memory(GiB)": 32.12,
"step": 890,
"token_acc": 0.8418666840594834,
"train_speed(iter/s)": 0.12589
},
{
"epoch": 1.0955370129373039,
"grad_norm": 0.9728440642356873,
"learning_rate": 7.048912814059321e-06,
"loss": 0.5442141056060791,
"memory(GiB)": 32.12,
"step": 895,
"token_acc": 0.8226529199606543,
"train_speed(iter/s)": 0.126084
},
{
"epoch": 1.1016611804332848,
"grad_norm": 1.0364502668380737,
"learning_rate": 7.019605024359475e-06,
"loss": 0.5461842536926269,
"memory(GiB)": 32.12,
"step": 900,
"token_acc": 0.8352232590995279,
"train_speed(iter/s)": 0.12626
},
{
"epoch": 1.1016611804332848,
"eval_loss": 0.6182094812393188,
"eval_runtime": 29.9539,
"eval_samples_per_second": 17.594,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8259620848184307,
"step": 900
},
{
"epoch": 1.1077853479292659,
"grad_norm": 0.9474948644638062,
"learning_rate": 6.990214080901971e-06,
"loss": 0.5203993797302247,
"memory(GiB)": 32.12,
"step": 905,
"token_acc": 0.8326191860072181,
"train_speed(iter/s)": 0.125742
},
{
"epoch": 1.113909515425247,
"grad_norm": 0.9584360718727112,
"learning_rate": 6.9607411938082735e-06,
"loss": 0.5354339122772217,
"memory(GiB)": 32.12,
"step": 910,
"token_acc": 0.8325466311381804,
"train_speed(iter/s)": 0.125916
},
{
"epoch": 1.1200336829212278,
"grad_norm": 0.9902798533439636,
"learning_rate": 6.931187576573733e-06,
"loss": 0.531368637084961,
"memory(GiB)": 32.12,
"step": 915,
"token_acc": 0.8471662228984405,
"train_speed(iter/s)": 0.126087
},
{
"epoch": 1.126157850417209,
"grad_norm": 0.8779637217521667,
"learning_rate": 6.9015544460176296e-06,
"loss": 0.5314560890197754,
"memory(GiB)": 32.12,
"step": 920,
"token_acc": 0.8275991535258379,
"train_speed(iter/s)": 0.126272
},
{
"epoch": 1.126157850417209,
"eval_loss": 0.6172027587890625,
"eval_runtime": 29.9497,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8254610258794359,
"step": 920
},
{
"epoch": 1.1322820179131898,
"grad_norm": 1.0086554288864136,
"learning_rate": 6.87184302223306e-06,
"loss": 0.5486597061157227,
"memory(GiB)": 32.12,
"step": 925,
"token_acc": 0.8296143047140535,
"train_speed(iter/s)": 0.125778
},
{
"epoch": 1.138406185409171,
"grad_norm": 1.055482029914856,
"learning_rate": 6.842054528536717e-06,
"loss": 0.5004231452941894,
"memory(GiB)": 32.12,
"step": 930,
"token_acc": 0.8338052711827488,
"train_speed(iter/s)": 0.125965
},
{
"epoch": 1.144530352905152,
"grad_norm": 0.9732358455657959,
"learning_rate": 6.812190191418508e-06,
"loss": 0.528237771987915,
"memory(GiB)": 32.12,
"step": 935,
"token_acc": 0.8312384161752316,
"train_speed(iter/s)": 0.126125
},
{
"epoch": 1.150654520401133,
"grad_norm": 0.8922236561775208,
"learning_rate": 6.782251240491071e-06,
"loss": 0.5213536262512207,
"memory(GiB)": 32.12,
"step": 940,
"token_acc": 0.846323478740266,
"train_speed(iter/s)": 0.126299
},
{
"epoch": 1.150654520401133,
"eval_loss": 0.6174668073654175,
"eval_runtime": 29.9268,
"eval_samples_per_second": 17.61,
"eval_steps_per_second": 4.411,
"eval_token_acc": 0.8251975825197583,
"step": 940
},
{
"epoch": 1.156778687897114,
"grad_norm": 1.022758960723877,
"learning_rate": 6.75223890843913e-06,
"loss": 0.5352741241455078,
"memory(GiB)": 32.12,
"step": 945,
"token_acc": 0.8363561308192181,
"train_speed(iter/s)": 0.12581
},
{
"epoch": 1.162902855393095,
"grad_norm": 0.9870294332504272,
"learning_rate": 6.722154430968755e-06,
"loss": 0.5349910259246826,
"memory(GiB)": 32.12,
"step": 950,
"token_acc": 0.8381542699724518,
"train_speed(iter/s)": 0.12598
},
{
"epoch": 1.169027022889076,
"grad_norm": 0.9218893051147461,
"learning_rate": 6.69199904675648e-06,
"loss": 0.5564836025238037,
"memory(GiB)": 32.12,
"step": 955,
"token_acc": 0.8276523535487679,
"train_speed(iter/s)": 0.12616
},
{
"epoch": 1.175151190385057,
"grad_norm": 0.9656640887260437,
"learning_rate": 6.6617739973982985e-06,
"loss": 0.505579948425293,
"memory(GiB)": 32.12,
"step": 960,
"token_acc": 0.8291429745838186,
"train_speed(iter/s)": 0.126306
},
{
"epoch": 1.175151190385057,
"eval_loss": 0.618171751499176,
"eval_runtime": 29.9871,
"eval_samples_per_second": 17.574,
"eval_steps_per_second": 4.402,
"eval_token_acc": 0.825326721421561,
"step": 960
},
{
"epoch": 1.181275357881038,
"grad_norm": 0.9255744218826294,
"learning_rate": 6.631480527358552e-06,
"loss": 0.5494061946868897,
"memory(GiB)": 32.12,
"step": 965,
"token_acc": 0.828285929606163,
"train_speed(iter/s)": 0.125817
},
{
"epoch": 1.187399525377019,
"grad_norm": 0.8625167608261108,
"learning_rate": 6.601119883918677e-06,
"loss": 0.5405423164367675,
"memory(GiB)": 32.12,
"step": 970,
"token_acc": 0.8340152804432053,
"train_speed(iter/s)": 0.125981
},
{
"epoch": 1.1935236928730002,
"grad_norm": 0.9901431202888489,
"learning_rate": 6.570693317125868e-06,
"loss": 0.5540534019470215,
"memory(GiB)": 32.12,
"step": 975,
"token_acc": 0.8329805323246695,
"train_speed(iter/s)": 0.12611
},
{
"epoch": 1.199647860368981,
"grad_norm": 0.8996224403381348,
"learning_rate": 6.540202079741594e-06,
"loss": 0.5333957672119141,
"memory(GiB)": 32.12,
"step": 980,
"token_acc": 0.8473111291632819,
"train_speed(iter/s)": 0.126282
},
{
"epoch": 1.199647860368981,
"eval_loss": 0.6176728010177612,
"eval_runtime": 30.0203,
"eval_samples_per_second": 17.555,
"eval_steps_per_second": 4.397,
"eval_token_acc": 0.8252853969729841,
"step": 980
},
{
"epoch": 1.2057720278649622,
"grad_norm": 1.0513380765914917,
"learning_rate": 6.509647427190029e-06,
"loss": 0.5554468631744385,
"memory(GiB)": 32.12,
"step": 985,
"token_acc": 0.8163728888561104,
"train_speed(iter/s)": 0.125784
},
{
"epoch": 1.211896195360943,
"grad_norm": 0.9925962686538696,
"learning_rate": 6.4790306175063535e-06,
"loss": 0.5358247756958008,
"memory(GiB)": 32.12,
"step": 990,
"token_acc": 0.8373621787068276,
"train_speed(iter/s)": 0.125984
},
{
"epoch": 1.2180203628569242,
"grad_norm": 0.9856204390525818,
"learning_rate": 6.44835291128496e-06,
"loss": 0.544157600402832,
"memory(GiB)": 32.12,
"step": 995,
"token_acc": 0.8250445425672012,
"train_speed(iter/s)": 0.126117
},
{
"epoch": 1.224144530352905,
"grad_norm": 0.9970067739486694,
"learning_rate": 6.417615571627555e-06,
"loss": 0.5199033260345459,
"memory(GiB)": 34.49,
"step": 1000,
"token_acc": 0.8412796162447737,
"train_speed(iter/s)": 0.126261
},
{
"epoch": 1.224144530352905,
"eval_loss": 0.6171393990516663,
"eval_runtime": 29.9792,
"eval_samples_per_second": 17.579,
"eval_steps_per_second": 4.403,
"eval_token_acc": 0.8254248669869312,
"step": 1000
},
{
"epoch": 1.2302686978488862,
"grad_norm": 0.978387176990509,
"learning_rate": 6.386819864091146e-06,
"loss": 0.5251027107238769,
"memory(GiB)": 34.49,
"step": 1005,
"token_acc": 0.8299365231042249,
"train_speed(iter/s)": 0.125803
},
{
"epoch": 1.2363928653448673,
"grad_norm": 0.9339916706085205,
"learning_rate": 6.35596705663594e-06,
"loss": 0.566818380355835,
"memory(GiB)": 34.49,
"step": 1010,
"token_acc": 0.8130651567649793,
"train_speed(iter/s)": 0.12596
},
{
"epoch": 1.2425170328408481,
"grad_norm": 0.9691733717918396,
"learning_rate": 6.325058419573131e-06,
"loss": 0.5325815200805664,
"memory(GiB)": 34.49,
"step": 1015,
"token_acc": 0.838964083981669,
"train_speed(iter/s)": 0.126159
},
{
"epoch": 1.2486412003368292,
"grad_norm": 0.9045368432998657,
"learning_rate": 6.294095225512604e-06,
"loss": 0.5249390602111816,
"memory(GiB)": 34.49,
"step": 1020,
"token_acc": 0.8380004706356944,
"train_speed(iter/s)": 0.126284
},
{
"epoch": 1.2486412003368292,
"eval_loss": 0.6158590316772461,
"eval_runtime": 30.0066,
"eval_samples_per_second": 17.563,
"eval_steps_per_second": 4.399,
"eval_token_acc": 0.8253318869776332,
"step": 1020
},
{
"epoch": 1.2547653678328103,
"grad_norm": 0.98622065782547,
"learning_rate": 6.263078749310534e-06,
"loss": 0.561451530456543,
"memory(GiB)": 34.49,
"step": 1025,
"token_acc": 0.8262592270950934,
"train_speed(iter/s)": 0.125824
},
{
"epoch": 1.2608895353287912,
"grad_norm": 0.9515383243560791,
"learning_rate": 6.232010268016895e-06,
"loss": 0.5291833877563477,
"memory(GiB)": 34.49,
"step": 1030,
"token_acc": 0.8373620599054125,
"train_speed(iter/s)": 0.125985
},
{
"epoch": 1.2670137028247723,
"grad_norm": 0.9982597827911377,
"learning_rate": 6.200891060822884e-06,
"loss": 0.577932071685791,
"memory(GiB)": 34.49,
"step": 1035,
"token_acc": 0.8188697951090549,
"train_speed(iter/s)": 0.126139
},
{
"epoch": 1.2731378703207532,
"grad_norm": 1.0038230419158936,
"learning_rate": 6.169722409008244e-06,
"loss": 0.5776113986968994,
"memory(GiB)": 34.49,
"step": 1040,
"token_acc": 0.8182007844446298,
"train_speed(iter/s)": 0.126292
},
{
"epoch": 1.2731378703207532,
"eval_loss": 0.6144587397575378,
"eval_runtime": 30.0744,
"eval_samples_per_second": 17.523,
"eval_steps_per_second": 4.389,
"eval_token_acc": 0.8263133426313343,
"step": 1040
},
{
"epoch": 1.2792620378167343,
"grad_norm": 0.9570845365524292,
"learning_rate": 6.13850559588852e-06,
"loss": 0.5415801048278809,
"memory(GiB)": 34.49,
"step": 1045,
"token_acc": 0.8337840538200226,
"train_speed(iter/s)": 0.125838
},
{
"epoch": 1.2853862053127152,
"grad_norm": 0.9676324725151062,
"learning_rate": 6.107241906762214e-06,
"loss": 0.5263193130493165,
"memory(GiB)": 34.49,
"step": 1050,
"token_acc": 0.8450373289877591,
"train_speed(iter/s)": 0.125977
},
{
"epoch": 1.2915103728086963,
"grad_norm": 0.8747360110282898,
"learning_rate": 6.075932628857869e-06,
"loss": 0.5368072032928467,
"memory(GiB)": 34.49,
"step": 1055,
"token_acc": 0.8371580206308865,
"train_speed(iter/s)": 0.126138
},
{
"epoch": 1.2976345403046774,
"grad_norm": 0.9015209674835205,
"learning_rate": 6.044579051281063e-06,
"loss": 0.4784068584442139,
"memory(GiB)": 34.49,
"step": 1060,
"token_acc": 0.8673299195318215,
"train_speed(iter/s)": 0.126244
},
{
"epoch": 1.2976345403046774,
"eval_loss": 0.6154947280883789,
"eval_runtime": 30.068,
"eval_samples_per_second": 17.527,
"eval_steps_per_second": 4.39,
"eval_token_acc": 0.8261997003977478,
"step": 1060
},
{
"epoch": 1.3037587078006583,
"grad_norm": 0.9384099841117859,
"learning_rate": 6.013182464961341e-06,
"loss": 0.5346551418304444,
"memory(GiB)": 34.49,
"step": 1065,
"token_acc": 0.8303387250508586,
"train_speed(iter/s)": 0.125804
},
{
"epoch": 1.3098828752966394,
"grad_norm": 0.8981488347053528,
"learning_rate": 5.981744162599057e-06,
"loss": 0.5211257934570312,
"memory(GiB)": 34.49,
"step": 1070,
"token_acc": 0.8500360490266763,
"train_speed(iter/s)": 0.12593
},
{
"epoch": 1.3160070427926205,
"grad_norm": 0.8698239922523499,
"learning_rate": 5.9502654386121505e-06,
"loss": 0.5495285034179688,
"memory(GiB)": 34.49,
"step": 1075,
"token_acc": 0.8446355346104413,
"train_speed(iter/s)": 0.12608
},
{
"epoch": 1.3221312102886014,
"grad_norm": 0.990492582321167,
"learning_rate": 5.918747589082853e-06,
"loss": 0.5515711307525635,
"memory(GiB)": 34.49,
"step": 1080,
"token_acc": 0.8194262671996039,
"train_speed(iter/s)": 0.126234
},
{
"epoch": 1.3221312102886014,
"eval_loss": 0.6156888008117676,
"eval_runtime": 30.042,
"eval_samples_per_second": 17.542,
"eval_steps_per_second": 4.394,
"eval_token_acc": 0.8256831447905367,
"step": 1080
},
{
"epoch": 1.3282553777845825,
"grad_norm": 1.0377113819122314,
"learning_rate": 5.887191911704322e-06,
"loss": 0.5179418087005615,
"memory(GiB)": 34.49,
"step": 1085,
"token_acc": 0.822986674391657,
"train_speed(iter/s)": 0.125752
},
{
"epoch": 1.3343795452805634,
"grad_norm": 1.1629189252853394,
"learning_rate": 5.855599705727212e-06,
"loss": 0.501689100265503,
"memory(GiB)": 34.49,
"step": 1090,
"token_acc": 0.8511506930497481,
"train_speed(iter/s)": 0.125871
},
{
"epoch": 1.3405037127765445,
"grad_norm": 1.021088719367981,
"learning_rate": 5.823972271906177e-06,
"loss": 0.5111154556274414,
"memory(GiB)": 34.49,
"step": 1095,
"token_acc": 0.827035490605428,
"train_speed(iter/s)": 0.126024
},
{
"epoch": 1.3466278802725253,
"grad_norm": 1.0440119504928589,
"learning_rate": 5.7923109124463264e-06,
"loss": 0.5382958889007569,
"memory(GiB)": 34.49,
"step": 1100,
"token_acc": 0.8228407178911946,
"train_speed(iter/s)": 0.12617
},
{
"epoch": 1.3466278802725253,
"eval_loss": 0.6151137948036194,
"eval_runtime": 29.8862,
"eval_samples_per_second": 17.634,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.8264734748695697,
"step": 1100
},
{
"epoch": 1.3527520477685064,
"grad_norm": 1.0229501724243164,
"learning_rate": 5.760616930949584e-06,
"loss": 0.542177963256836,
"memory(GiB)": 34.49,
"step": 1105,
"token_acc": 0.8330580493912673,
"train_speed(iter/s)": 0.125749
},
{
"epoch": 1.3588762152644875,
"grad_norm": 0.8944743871688843,
"learning_rate": 5.728891632361043e-06,
"loss": 0.5133552551269531,
"memory(GiB)": 34.49,
"step": 1110,
"token_acc": 0.8286529928320973,
"train_speed(iter/s)": 0.125908
},
{
"epoch": 1.3650003827604684,
"grad_norm": 0.8976428508758545,
"learning_rate": 5.697136322915218e-06,
"loss": 0.5297269821166992,
"memory(GiB)": 34.49,
"step": 1115,
"token_acc": 0.8331969608416131,
"train_speed(iter/s)": 0.126045
},
{
"epoch": 1.3711245502564495,
"grad_norm": 1.013662338256836,
"learning_rate": 5.66535231008227e-06,
"loss": 0.5449240684509278,
"memory(GiB)": 34.49,
"step": 1120,
"token_acc": 0.8382301504022386,
"train_speed(iter/s)": 0.126181
},
{
"epoch": 1.3711245502564495,
"eval_loss": 0.6147744059562683,
"eval_runtime": 30.0501,
"eval_samples_per_second": 17.537,
"eval_steps_per_second": 4.393,
"eval_token_acc": 0.8263081770752622,
"step": 1120
},
{
"epoch": 1.3772487177524306,
"grad_norm": 1.047998309135437,
"learning_rate": 5.63354090251417e-06,
"loss": 0.5496514320373536,
"memory(GiB)": 34.49,
"step": 1125,
"token_acc": 0.8280883107068635,
"train_speed(iter/s)": 0.125763
},
{
"epoch": 1.3833728852484115,
"grad_norm": 0.995204508304596,
"learning_rate": 5.6017034099908245e-06,
"loss": 0.5459441184997559,
"memory(GiB)": 34.49,
"step": 1130,
"token_acc": 0.8158013374408295,
"train_speed(iter/s)": 0.125925
},
{
"epoch": 1.3894970527443926,
"grad_norm": 0.9421271681785583,
"learning_rate": 5.569841143366141e-06,
"loss": 0.51002197265625,
"memory(GiB)": 34.49,
"step": 1135,
"token_acc": 0.8427027419120847,
"train_speed(iter/s)": 0.126047
},
{
"epoch": 1.3956212202403735,
"grad_norm": 0.9212712049484253,
"learning_rate": 5.537955414514058e-06,
"loss": 0.5343506813049317,
"memory(GiB)": 36.87,
"step": 1140,
"token_acc": 0.8508435329143236,
"train_speed(iter/s)": 0.126151
},
{
"epoch": 1.3956212202403735,
"eval_loss": 0.6138430237770081,
"eval_runtime": 30.0787,
"eval_samples_per_second": 17.521,
"eval_steps_per_second": 4.388,
"eval_token_acc": 0.8258587736969885,
"step": 1140
},
{
"epoch": 1.4017453877363546,
"grad_norm": 0.960340142250061,
"learning_rate": 5.506047536274529e-06,
"loss": 0.537141227722168,
"memory(GiB)": 36.87,
"step": 1145,
"token_acc": 0.825777386163379,
"train_speed(iter/s)": 0.125768
},
{
"epoch": 1.4078695552323355,
"grad_norm": 1.063237190246582,
"learning_rate": 5.474118822399476e-06,
"loss": 0.5870203018188477,
"memory(GiB)": 36.87,
"step": 1150,
"token_acc": 0.8254607459004498,
"train_speed(iter/s)": 0.125933
},
{
"epoch": 1.4139937227283166,
"grad_norm": 0.8735440373420715,
"learning_rate": 5.442170587498684e-06,
"loss": 0.5143415451049804,
"memory(GiB)": 36.87,
"step": 1155,
"token_acc": 0.8224962760245262,
"train_speed(iter/s)": 0.126052
},
{
"epoch": 1.4201178902242977,
"grad_norm": 0.8771001100540161,
"learning_rate": 5.41020414698569e-06,
"loss": 0.557903242111206,
"memory(GiB)": 36.87,
"step": 1160,
"token_acc": 0.8381047381546135,
"train_speed(iter/s)": 0.126212
},
{
"epoch": 1.4201178902242977,
"eval_loss": 0.6127957701683044,
"eval_runtime": 30.0731,
"eval_samples_per_second": 17.524,
"eval_steps_per_second": 4.389,
"eval_token_acc": 0.8258226148044837,
"step": 1160
},
{
"epoch": 1.4262420577202786,
"grad_norm": 0.9807034134864807,
"learning_rate": 5.378220817023609e-06,
"loss": 0.5510265350341796,
"memory(GiB)": 36.87,
"step": 1165,
"token_acc": 0.8266740684199569,
"train_speed(iter/s)": 0.125818
},
{
"epoch": 1.4323662252162597,
"grad_norm": 0.9031324982643127,
"learning_rate": 5.346221914470959e-06,
"loss": 0.5285142421722412,
"memory(GiB)": 36.87,
"step": 1170,
"token_acc": 0.8287966113464342,
"train_speed(iter/s)": 0.125945
},
{
"epoch": 1.4384903927122408,
"grad_norm": 0.9082944393157959,
"learning_rate": 5.314208756827425e-06,
"loss": 0.5313165664672852,
"memory(GiB)": 36.87,
"step": 1175,
"token_acc": 0.8377556371263765,
"train_speed(iter/s)": 0.126106
},
{
"epoch": 1.4446145602082217,
"grad_norm": 0.9939496517181396,
"learning_rate": 5.282182662179623e-06,
"loss": 0.559614896774292,
"memory(GiB)": 36.87,
"step": 1180,
"token_acc": 0.846643215328194,
"train_speed(iter/s)": 0.126246
},
{
"epoch": 1.4446145602082217,
"eval_loss": 0.6125648021697998,
"eval_runtime": 30.1611,
"eval_samples_per_second": 17.473,
"eval_steps_per_second": 4.376,
"eval_token_acc": 0.826158375949171,
"step": 1180
},
{
"epoch": 1.4507387277042028,
"grad_norm": 1.0398341417312622,
"learning_rate": 5.250144949146827e-06,
"loss": 0.5018705368041992,
"memory(GiB)": 36.87,
"step": 1185,
"token_acc": 0.8302141481179431,
"train_speed(iter/s)": 0.125825
},
{
"epoch": 1.4568628952001839,
"grad_norm": 1.0717829465866089,
"learning_rate": 5.218096936826681e-06,
"loss": 0.543729591369629,
"memory(GiB)": 36.87,
"step": 1190,
"token_acc": 0.8435318409753859,
"train_speed(iter/s)": 0.125983
},
{
"epoch": 1.4629870626961647,
"grad_norm": 0.9488953948020935,
"learning_rate": 5.186039944740882e-06,
"loss": 0.5498368740081787,
"memory(GiB)": 36.87,
"step": 1195,
"token_acc": 0.8358378225120499,
"train_speed(iter/s)": 0.126113
},
{
"epoch": 1.4691112301921456,
"grad_norm": 1.010858416557312,
"learning_rate": 5.153975292780852e-06,
"loss": 0.5265066623687744,
"memory(GiB)": 36.87,
"step": 1200,
"token_acc": 0.8429352241672207,
"train_speed(iter/s)": 0.126228
},
{
"epoch": 1.4691112301921456,
"eval_loss": 0.6125081181526184,
"eval_runtime": 29.9605,
"eval_samples_per_second": 17.59,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8266542693320936,
"step": 1200
},
{
"epoch": 1.4752353976881267,
"grad_norm": 0.8864910006523132,
"learning_rate": 5.1219043011534e-06,
"loss": 0.5261281967163086,
"memory(GiB)": 36.87,
"step": 1205,
"token_acc": 0.8282765708814958,
"train_speed(iter/s)": 0.125837
},
{
"epoch": 1.4813595651841078,
"grad_norm": 0.9174733757972717,
"learning_rate": 5.089828290326354e-06,
"loss": 0.5531785964965821,
"memory(GiB)": 36.87,
"step": 1210,
"token_acc": 0.8411006266657063,
"train_speed(iter/s)": 0.126012
},
{
"epoch": 1.4874837326800887,
"grad_norm": 0.9243429899215698,
"learning_rate": 5.057748580974204e-06,
"loss": 0.5176255702972412,
"memory(GiB)": 36.87,
"step": 1215,
"token_acc": 0.8498275862068966,
"train_speed(iter/s)": 0.12615
},
{
"epoch": 1.4936079001760698,
"grad_norm": 0.9391066431999207,
"learning_rate": 5.0256664939237186e-06,
"loss": 0.5616118431091308,
"memory(GiB)": 36.87,
"step": 1220,
"token_acc": 0.8409511815690051,
"train_speed(iter/s)": 0.126282
},
{
"epoch": 1.4936079001760698,
"eval_loss": 0.6114863157272339,
"eval_runtime": 29.9732,
"eval_samples_per_second": 17.582,
"eval_steps_per_second": 4.404,
"eval_token_acc": 0.8267007593367426,
"step": 1220
},
{
"epoch": 1.499732067672051,
"grad_norm": 0.8913131356239319,
"learning_rate": 4.99358335009956e-06,
"loss": 0.5003180027008056,
"memory(GiB)": 36.87,
"step": 1225,
"token_acc": 0.8392863897119082,
"train_speed(iter/s)": 0.125908
},
{
"epoch": 1.5058562351680318,
"grad_norm": 0.9838159084320068,
"learning_rate": 4.961500470469908e-06,
"loss": 0.5151349067687988,
"memory(GiB)": 36.87,
"step": 1230,
"token_acc": 0.8358106300867373,
"train_speed(iter/s)": 0.126051
},
{
"epoch": 1.511980402664013,
"grad_norm": 0.9471805095672607,
"learning_rate": 4.92941917599206e-06,
"loss": 0.5267168998718261,
"memory(GiB)": 36.87,
"step": 1235,
"token_acc": 0.8325013676148797,
"train_speed(iter/s)": 0.126177
},
{
"epoch": 1.518104570159994,
"grad_norm": 0.9928951263427734,
"learning_rate": 4.8973407875580485e-06,
"loss": 0.5807061195373535,
"memory(GiB)": 36.87,
"step": 1240,
"token_acc": 0.82605387834146,
"train_speed(iter/s)": 0.12635
},
{
"epoch": 1.518104570159994,
"eval_loss": 0.6120603084564209,
"eval_runtime": 30.0211,
"eval_samples_per_second": 17.554,
"eval_steps_per_second": 4.397,
"eval_token_acc": 0.8262875148509737,
"step": 1240
},
{
"epoch": 1.5242287376559749,
"grad_norm": 0.9785681366920471,
"learning_rate": 4.8652666259402584e-06,
"loss": 0.5564475059509277,
"memory(GiB)": 36.87,
"step": 1245,
"token_acc": 0.8232146560663671,
"train_speed(iter/s)": 0.125966
},
{
"epoch": 1.5303529051519558,
"grad_norm": 0.9484609365463257,
"learning_rate": 4.833198011737035e-06,
"loss": 0.5257096767425538,
"memory(GiB)": 36.87,
"step": 1250,
"token_acc": 0.8338689740420272,
"train_speed(iter/s)": 0.126082
},
{
"epoch": 1.5364770726479369,
"grad_norm": 1.0170414447784424,
"learning_rate": 4.8011362653183245e-06,
"loss": 0.5458654403686524,
"memory(GiB)": 36.87,
"step": 1255,
"token_acc": 0.8260180208051355,
"train_speed(iter/s)": 0.126209
},
{
"epoch": 1.542601240143918,
"grad_norm": 1.0465954542160034,
"learning_rate": 4.7690827067713035e-06,
"loss": 0.5092308998107911,
"memory(GiB)": 36.87,
"step": 1260,
"token_acc": 0.8562170404727111,
"train_speed(iter/s)": 0.126341
},
{
"epoch": 1.542601240143918,
"eval_loss": 0.6123631596565247,
"eval_runtime": 29.8837,
"eval_samples_per_second": 17.635,
"eval_steps_per_second": 4.417,
"eval_token_acc": 0.8266542693320936,
"step": 1260
},
{
"epoch": 1.5487254076398989,
"grad_norm": 1.0032224655151367,
"learning_rate": 4.737038655846023e-06,
"loss": 0.5465664863586426,
"memory(GiB)": 36.87,
"step": 1265,
"token_acc": 0.8245171081677705,
"train_speed(iter/s)": 0.125985
},
{
"epoch": 1.55484957513588,
"grad_norm": 1.0049303770065308,
"learning_rate": 4.70500543190108e-06,
"loss": 0.5189294338226318,
"memory(GiB)": 36.87,
"step": 1270,
"token_acc": 0.8236330189048495,
"train_speed(iter/s)": 0.12608
},
{
"epoch": 1.560973742631861,
"grad_norm": 1.006712794303894,
"learning_rate": 4.672984353849285e-06,
"loss": 0.5561445236206055,
"memory(GiB)": 36.87,
"step": 1275,
"token_acc": 0.8239827598801958,
"train_speed(iter/s)": 0.126214
},
{
"epoch": 1.567097910127842,
"grad_norm": 0.8475578427314758,
"learning_rate": 4.640976740103363e-06,
"loss": 0.5361814498901367,
"memory(GiB)": 36.87,
"step": 1280,
"token_acc": 0.8350327247674819,
"train_speed(iter/s)": 0.126343
},
{
"epoch": 1.567097910127842,
"eval_loss": 0.6125593185424805,
"eval_runtime": 29.9427,
"eval_samples_per_second": 17.6,
"eval_steps_per_second": 4.408,
"eval_token_acc": 0.8268918849114107,
"step": 1280
},
{
"epoch": 1.573222077623823,
"grad_norm": 1.068233847618103,
"learning_rate": 4.60898390852167e-06,
"loss": 0.5269934654235839,
"memory(GiB)": 36.87,
"step": 1285,
"token_acc": 0.8273768192895751,
"train_speed(iter/s)": 0.125978
},
{
"epoch": 1.5793462451198041,
"grad_norm": 1.0497961044311523,
"learning_rate": 4.577007176353931e-06,
"loss": 0.5188837051391602,
"memory(GiB)": 36.87,
"step": 1290,
"token_acc": 0.8475213675213675,
"train_speed(iter/s)": 0.126095
},
{
"epoch": 1.585470412615785,
"grad_norm": 0.9117013812065125,
"learning_rate": 4.5450478601870055e-06,
"loss": 0.49097652435302735,
"memory(GiB)": 36.87,
"step": 1295,
"token_acc": 0.83500768653248,
"train_speed(iter/s)": 0.126208
},
{
"epoch": 1.591594580111766,
"grad_norm": 0.865384042263031,
"learning_rate": 4.513107275890682e-06,
"loss": 0.5219059944152832,
"memory(GiB)": 36.87,
"step": 1300,
"token_acc": 0.8484564711960734,
"train_speed(iter/s)": 0.126317
},
{
"epoch": 1.591594580111766,
"eval_loss": 0.6109749674797058,
"eval_runtime": 29.957,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8270158582571414,
"step": 1300
},
{
"epoch": 1.5977187476077472,
"grad_norm": 0.9944786429405212,
"learning_rate": 4.4811867385634916e-06,
"loss": 0.5182311058044433,
"memory(GiB)": 36.87,
"step": 1305,
"token_acc": 0.8336139447360602,
"train_speed(iter/s)": 0.125948
},
{
"epoch": 1.6038429151037281,
"grad_norm": 0.9752517342567444,
"learning_rate": 4.44928756247857e-06,
"loss": 0.49358739852905276,
"memory(GiB)": 36.87,
"step": 1310,
"token_acc": 0.8449874236435502,
"train_speed(iter/s)": 0.126049
},
{
"epoch": 1.609967082599709,
"grad_norm": 0.9614261984825134,
"learning_rate": 4.417411061029539e-06,
"loss": 0.536794376373291,
"memory(GiB)": 36.87,
"step": 1315,
"token_acc": 0.832827077457149,
"train_speed(iter/s)": 0.126177
},
{
"epoch": 1.61609125009569,
"grad_norm": 0.9478575587272644,
"learning_rate": 4.3855585466764305e-06,
"loss": 0.4996980667114258,
"memory(GiB)": 36.87,
"step": 1320,
"token_acc": 0.8452054794520548,
"train_speed(iter/s)": 0.126305
},
{
"epoch": 1.61609125009569,
"eval_loss": 0.6103559732437134,
"eval_runtime": 30.0278,
"eval_samples_per_second": 17.55,
"eval_steps_per_second": 4.396,
"eval_token_acc": 0.8273722816261171,
"step": 1320
},
{
"epoch": 1.6222154175916712,
"grad_norm": 1.074583649635315,
"learning_rate": 4.353731330891651e-06,
"loss": 0.529239273071289,
"memory(GiB)": 36.87,
"step": 1325,
"token_acc": 0.8302422791282821,
"train_speed(iter/s)": 0.125951
},
{
"epoch": 1.628339585087652,
"grad_norm": 0.9707440137863159,
"learning_rate": 4.321930724105979e-06,
"loss": 0.4900198936462402,
"memory(GiB)": 36.87,
"step": 1330,
"token_acc": 0.8524826181613877,
"train_speed(iter/s)": 0.126054
},
{
"epoch": 1.6344637525836332,
"grad_norm": 0.943321943283081,
"learning_rate": 4.290158035654618e-06,
"loss": 0.5417927265167236,
"memory(GiB)": 36.87,
"step": 1335,
"token_acc": 0.8254652088914634,
"train_speed(iter/s)": 0.1262
},
{
"epoch": 1.6405879200796143,
"grad_norm": 1.0129594802856445,
"learning_rate": 4.258414573723277e-06,
"loss": 0.545560359954834,
"memory(GiB)": 36.87,
"step": 1340,
"token_acc": 0.8416190929273708,
"train_speed(iter/s)": 0.126328
},
{
"epoch": 1.6405879200796143,
"eval_loss": 0.609876275062561,
"eval_runtime": 30.0571,
"eval_samples_per_second": 17.533,
"eval_steps_per_second": 4.392,
"eval_token_acc": 0.827418771630766,
"step": 1340
},
{
"epoch": 1.6467120875755952,
"grad_norm": 1.0571733713150024,
"learning_rate": 4.226701645294317e-06,
"loss": 0.5603596687316894,
"memory(GiB)": 36.87,
"step": 1345,
"token_acc": 0.8282726557865548,
"train_speed(iter/s)": 0.125982
},
{
"epoch": 1.652836255071576,
"grad_norm": 1.0039043426513672,
"learning_rate": 4.195020556092935e-06,
"loss": 0.5717378616333008,
"memory(GiB)": 36.87,
"step": 1350,
"token_acc": 0.8221712722738426,
"train_speed(iter/s)": 0.126115
},
{
"epoch": 1.6589604225675574,
"grad_norm": 1.0660555362701416,
"learning_rate": 4.1633726105334006e-06,
"loss": 0.5500486373901368,
"memory(GiB)": 36.87,
"step": 1355,
"token_acc": 0.8320635850853417,
"train_speed(iter/s)": 0.126243
},
{
"epoch": 1.6650845900635383,
"grad_norm": 0.9174071550369263,
"learning_rate": 4.131759111665349e-06,
"loss": 0.49724588394165037,
"memory(GiB)": 36.87,
"step": 1360,
"token_acc": 0.8494714160662582,
"train_speed(iter/s)": 0.126379
},
{
"epoch": 1.6650845900635383,
"eval_loss": 0.609088659286499,
"eval_runtime": 30.0177,
"eval_samples_per_second": 17.556,
"eval_steps_per_second": 4.397,
"eval_token_acc": 0.8274859238597035,
"step": 1360
},
{
"epoch": 1.6712087575595191,
"grad_norm": 1.0310657024383545,
"learning_rate": 4.100181361120136e-06,
"loss": 0.5943800926208496,
"memory(GiB)": 36.87,
"step": 1365,
"token_acc": 0.8274678717695241,
"train_speed(iter/s)": 0.126048
},
{
"epoch": 1.6773329250555002,
"grad_norm": 0.947372317314148,
"learning_rate": 4.068640659057242e-06,
"loss": 0.5227277755737305,
"memory(GiB)": 36.87,
"step": 1370,
"token_acc": 0.8481193255512322,
"train_speed(iter/s)": 0.126167
},
{
"epoch": 1.6834570925514813,
"grad_norm": 1.0015521049499512,
"learning_rate": 4.037138304110737e-06,
"loss": 0.5239052772521973,
"memory(GiB)": 36.87,
"step": 1375,
"token_acc": 0.8306528880372297,
"train_speed(iter/s)": 0.126277
},
{
"epoch": 1.6895812600474622,
"grad_norm": 1.014237880706787,
"learning_rate": 4.005675593335818e-06,
"loss": 0.5036933898925782,
"memory(GiB)": 36.87,
"step": 1380,
"token_acc": 0.8474381345177665,
"train_speed(iter/s)": 0.126378
},
{
"epoch": 1.6895812600474622,
"eval_loss": 0.6097399592399597,
"eval_runtime": 30.0485,
"eval_samples_per_second": 17.538,
"eval_steps_per_second": 4.393,
"eval_token_acc": 0.8270313549253577,
"step": 1380
},
{
"epoch": 1.6957054275434433,
"grad_norm": 0.970410943031311,
"learning_rate": 3.974253822155397e-06,
"loss": 0.5157362937927246,
"memory(GiB)": 36.87,
"step": 1385,
"token_acc": 0.8356764264051473,
"train_speed(iter/s)": 0.126006
},
{
"epoch": 1.7018295950394244,
"grad_norm": 0.9698341488838196,
"learning_rate": 3.942874284306774e-06,
"loss": 0.5165740966796875,
"memory(GiB)": 36.87,
"step": 1390,
"token_acc": 0.852775912640916,
"train_speed(iter/s)": 0.126136
},
{
"epoch": 1.7079537625354053,
"grad_norm": 0.889597475528717,
"learning_rate": 3.911538271788359e-06,
"loss": 0.5268959999084473,
"memory(GiB)": 36.87,
"step": 1395,
"token_acc": 0.8417914492851819,
"train_speed(iter/s)": 0.126261
},
{
"epoch": 1.7140779300313862,
"grad_norm": 0.9927029609680176,
"learning_rate": 3.8802470748064855e-06,
"loss": 0.5189975738525391,
"memory(GiB)": 36.87,
"step": 1400,
"token_acc": 0.8465264055174552,
"train_speed(iter/s)": 0.126349
},
{
"epoch": 1.7140779300313862,
"eval_loss": 0.6091334223747253,
"eval_runtime": 29.8469,
"eval_samples_per_second": 17.657,
"eval_steps_per_second": 4.423,
"eval_token_acc": 0.8271295004907279,
"step": 1400
},
{
"epoch": 1.7202020975273675,
"grad_norm": 0.9913120865821838,
"learning_rate": 3.849001981722285e-06,
"loss": 0.5513727188110351,
"memory(GiB)": 36.87,
"step": 1405,
"token_acc": 0.833267143235372,
"train_speed(iter/s)": 0.126016
},
{
"epoch": 1.7263262650233484,
"grad_norm": 0.9658275246620178,
"learning_rate": 3.8178042789986355e-06,
"loss": 0.5375414371490479,
"memory(GiB)": 36.87,
"step": 1410,
"token_acc": 0.8221340970845267,
"train_speed(iter/s)": 0.126151
},
{
"epoch": 1.7324504325193293,
"grad_norm": 0.9217929244041443,
"learning_rate": 3.786655251147204e-06,
"loss": 0.5318355560302734,
"memory(GiB)": 36.87,
"step": 1415,
"token_acc": 0.8423896524940057,
"train_speed(iter/s)": 0.126267
},
{
"epoch": 1.7385746000153104,
"grad_norm": 1.0436443090438843,
"learning_rate": 3.755556180675547e-06,
"loss": 0.5554102897644043,
"memory(GiB)": 36.87,
"step": 1420,
"token_acc": 0.8421204263900893,
"train_speed(iter/s)": 0.126374
},
{
"epoch": 1.7385746000153104,
"eval_loss": 0.6083164215087891,
"eval_runtime": 29.8608,
"eval_samples_per_second": 17.649,
"eval_steps_per_second": 4.421,
"eval_token_acc": 0.8271295004907279,
"step": 1420
},
{
"epoch": 1.7446987675112915,
"grad_norm": 1.0527832508087158,
"learning_rate": 3.7245083480343225e-06,
"loss": 0.5336908817291259,
"memory(GiB)": 36.87,
"step": 1425,
"token_acc": 0.8262728719172633,
"train_speed(iter/s)": 0.126038
},
{
"epoch": 1.7508229350072724,
"grad_norm": 0.9292203187942505,
"learning_rate": 3.693513031564549e-06,
"loss": 0.5425585746765137,
"memory(GiB)": 36.87,
"step": 1430,
"token_acc": 0.8410889737991266,
"train_speed(iter/s)": 0.126155
},
{
"epoch": 1.7569471025032535,
"grad_norm": 0.9655841588973999,
"learning_rate": 3.662571507444986e-06,
"loss": 0.5386072158813476,
"memory(GiB)": 36.87,
"step": 1435,
"token_acc": 0.857958101689923,
"train_speed(iter/s)": 0.126275
},
{
"epoch": 1.7630712699992346,
"grad_norm": 0.9359703660011292,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.5226363658905029,
"memory(GiB)": 36.87,
"step": 1440,
"token_acc": 0.8469182175175004,
"train_speed(iter/s)": 0.126376
},
{
"epoch": 1.7630712699992346,
"eval_loss": 0.6083342432975769,
"eval_runtime": 29.9595,
"eval_samples_per_second": 17.59,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8270623482617904,
"step": 1440
},
{
"epoch": 1.7691954374952155,
"grad_norm": 1.018633484840393,
"learning_rate": 3.6008549298450403e-06,
"loss": 0.5337300300598145,
"memory(GiB)": 36.87,
"step": 1445,
"token_acc": 0.8221121123846342,
"train_speed(iter/s)": 0.126056
},
{
"epoch": 1.7753196049911963,
"grad_norm": 0.8938316106796265,
"learning_rate": 3.5700824174384196e-06,
"loss": 0.47947111129760744,
"memory(GiB)": 36.87,
"step": 1450,
"token_acc": 0.83465726353315,
"train_speed(iter/s)": 0.126132
},
{
"epoch": 1.7814437724871777,
"grad_norm": 0.950809895992279,
"learning_rate": 3.5393687794249093e-06,
"loss": 0.5499818325042725,
"memory(GiB)": 36.87,
"step": 1455,
"token_acc": 0.8119980392896196,
"train_speed(iter/s)": 0.126246
},
{
"epoch": 1.7875679399831585,
"grad_norm": 0.9783928394317627,
"learning_rate": 3.508715280385644e-06,
"loss": 0.5239407062530518,
"memory(GiB)": 36.87,
"step": 1460,
"token_acc": 0.8301248357424441,
"train_speed(iter/s)": 0.126348
},
{
"epoch": 1.7875679399831585,
"eval_loss": 0.6072365641593933,
"eval_runtime": 29.9698,
"eval_samples_per_second": 17.584,
"eval_steps_per_second": 4.404,
"eval_token_acc": 0.8274497649671987,
"step": 1460
},
{
"epoch": 1.7936921074791394,
"grad_norm": 1.0948944091796875,
"learning_rate": 3.478123182425639e-06,
"loss": 0.5428466320037841,
"memory(GiB)": 36.87,
"step": 1465,
"token_acc": 0.825891086303621,
"train_speed(iter/s)": 0.12604
},
{
"epoch": 1.7998162749751205,
"grad_norm": 0.8970156311988831,
"learning_rate": 3.4475937451218257e-06,
"loss": 0.5330904960632324,
"memory(GiB)": 36.87,
"step": 1470,
"token_acc": 0.8345248968536424,
"train_speed(iter/s)": 0.126137
},
{
"epoch": 1.8059404424711016,
"grad_norm": 1.018349528312683,
"learning_rate": 3.4171282254711935e-06,
"loss": 0.5589166641235351,
"memory(GiB)": 36.87,
"step": 1475,
"token_acc": 0.8269609914096606,
"train_speed(iter/s)": 0.126239
},
{
"epoch": 1.8120646099670825,
"grad_norm": 0.9459341764450073,
"learning_rate": 3.386727877839027e-06,
"loss": 0.555328369140625,
"memory(GiB)": 36.87,
"step": 1480,
"token_acc": 0.8426273550787036,
"train_speed(iter/s)": 0.126358
},
{
"epoch": 1.8120646099670825,
"eval_loss": 0.6067067980766296,
"eval_runtime": 30.0242,
"eval_samples_per_second": 17.553,
"eval_steps_per_second": 4.396,
"eval_token_acc": 0.8277803605558138,
"step": 1480
},
{
"epoch": 1.8181887774630636,
"grad_norm": 0.9838424324989319,
"learning_rate": 3.356393953907271e-06,
"loss": 0.5277560710906982,
"memory(GiB)": 36.87,
"step": 1485,
"token_acc": 0.831073039771941,
"train_speed(iter/s)": 0.126055
},
{
"epoch": 1.8243129449590447,
"grad_norm": 1.041955590248108,
"learning_rate": 3.3261277026229857e-06,
"loss": 0.5799334049224854,
"memory(GiB)": 36.87,
"step": 1490,
"token_acc": 0.8321645313553607,
"train_speed(iter/s)": 0.126161
},
{
"epoch": 1.8304371124550256,
"grad_norm": 0.9292726516723633,
"learning_rate": 3.2959303701469254e-06,
"loss": 0.5210411071777343,
"memory(GiB)": 36.87,
"step": 1495,
"token_acc": 0.8229648473635522,
"train_speed(iter/s)": 0.12628
},
{
"epoch": 1.8365612799510067,
"grad_norm": 0.864344596862793,
"learning_rate": 3.2658031998022368e-06,
"loss": 0.5165549278259277,
"memory(GiB)": 36.87,
"step": 1500,
"token_acc": 0.8386885030686928,
"train_speed(iter/s)": 0.126396
},
{
"epoch": 1.8365612799510067,
"eval_loss": 0.6069810390472412,
"eval_runtime": 30.0285,
"eval_samples_per_second": 17.55,
"eval_steps_per_second": 4.396,
"eval_token_acc": 0.8276202283175784,
"step": 1500
},
{
"epoch": 1.8426854474469878,
"grad_norm": 0.8527396321296692,
"learning_rate": 3.2357474320232565e-06,
"loss": 0.5021331787109375,
"memory(GiB)": 36.87,
"step": 1505,
"token_acc": 0.8321268481969626,
"train_speed(iter/s)": 0.126074
},
{
"epoch": 1.8488096149429687,
"grad_norm": 0.9290481209754944,
"learning_rate": 3.2057643043044452e-06,
"loss": 0.5180329322814942,
"memory(GiB)": 36.87,
"step": 1510,
"token_acc": 0.8345307220417938,
"train_speed(iter/s)": 0.12617
},
{
"epoch": 1.8549337824389496,
"grad_norm": 0.9051028490066528,
"learning_rate": 3.1758550511494336e-06,
"loss": 0.5452617645263672,
"memory(GiB)": 36.87,
"step": 1515,
"token_acc": 0.8341819137404329,
"train_speed(iter/s)": 0.126302
},
{
"epoch": 1.8610579499349307,
"grad_norm": 1.0172206163406372,
"learning_rate": 3.1460209040201967e-06,
"loss": 0.5237324237823486,
"memory(GiB)": 36.87,
"step": 1520,
"token_acc": 0.8367892176409603,
"train_speed(iter/s)": 0.126404
},
{
"epoch": 1.8610579499349307,
"eval_loss": 0.6056584715843201,
"eval_runtime": 29.9559,
"eval_samples_per_second": 17.593,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8284363861769719,
"step": 1520
},
{
"epoch": 1.8671821174309118,
"grad_norm": 0.9731259942054749,
"learning_rate": 3.116263091286344e-06,
"loss": 0.5423327445983886,
"memory(GiB)": 36.87,
"step": 1525,
"token_acc": 0.8284452097329645,
"train_speed(iter/s)": 0.126099
},
{
"epoch": 1.8733062849268927,
"grad_norm": 0.9437146782875061,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.5558583736419678,
"memory(GiB)": 36.87,
"step": 1530,
"token_acc": 0.828622035858878,
"train_speed(iter/s)": 0.126204
},
{
"epoch": 1.8794304524228738,
"grad_norm": 1.0038166046142578,
"learning_rate": 3.056981366718111e-06,
"loss": 0.5397710800170898,
"memory(GiB)": 36.87,
"step": 1535,
"token_acc": 0.8308211163879138,
"train_speed(iter/s)": 0.12631
},
{
"epoch": 1.8855546199188549,
"grad_norm": 0.8471850156784058,
"learning_rate": 3.0274598957066132e-06,
"loss": 0.4804985523223877,
"memory(GiB)": 36.87,
"step": 1540,
"token_acc": 0.8619780378558012,
"train_speed(iter/s)": 0.126389
},
{
"epoch": 1.8855546199188549,
"eval_loss": 0.6056827306747437,
"eval_runtime": 29.9968,
"eval_samples_per_second": 17.569,
"eval_steps_per_second": 4.4,
"eval_token_acc": 0.8282245983780154,
"step": 1540
},
{
"epoch": 1.8916787874148357,
"grad_norm": 0.938709557056427,
"learning_rate": 2.998019640635772e-06,
"loss": 0.5519435405731201,
"memory(GiB)": 36.87,
"step": 1545,
"token_acc": 0.8257935412641457,
"train_speed(iter/s)": 0.126081
},
{
"epoch": 1.8978029549108169,
"grad_norm": 0.9090867638587952,
"learning_rate": 2.96866181365737e-06,
"loss": 0.5426124572753906,
"memory(GiB)": 36.87,
"step": 1550,
"token_acc": 0.843109962219129,
"train_speed(iter/s)": 0.126179
},
{
"epoch": 1.903927122406798,
"grad_norm": 0.8900991678237915,
"learning_rate": 2.9393876235293578e-06,
"loss": 0.510080623626709,
"memory(GiB)": 36.87,
"step": 1555,
"token_acc": 0.8357965621123515,
"train_speed(iter/s)": 0.126271
},
{
"epoch": 1.9100512899027788,
"grad_norm": 0.8838712573051453,
"learning_rate": 2.910198275566085e-06,
"loss": 0.5103748321533204,
"memory(GiB)": 36.87,
"step": 1560,
"token_acc": 0.8447406983809286,
"train_speed(iter/s)": 0.126379
},
{
"epoch": 1.9100512899027788,
"eval_loss": 0.6069024205207825,
"eval_runtime": 29.8963,
"eval_samples_per_second": 17.628,
"eval_steps_per_second": 4.415,
"eval_token_acc": 0.8283692339480345,
"step": 1560
},
{
"epoch": 1.9161754573987597,
"grad_norm": 1.0007625818252563,
"learning_rate": 2.881094971588666e-06,
"loss": 0.5161759853363037,
"memory(GiB)": 36.87,
"step": 1565,
"token_acc": 0.8315062300454892,
"train_speed(iter/s)": 0.126082
},
{
"epoch": 1.9222996248947408,
"grad_norm": 0.9922500848770142,
"learning_rate": 2.8520789098755053e-06,
"loss": 0.5415813446044921,
"memory(GiB)": 36.87,
"step": 1570,
"token_acc": 0.8527313346785818,
"train_speed(iter/s)": 0.126203
},
{
"epoch": 1.928423792390722,
"grad_norm": 0.9492520093917847,
"learning_rate": 2.8231512851129596e-06,
"loss": 0.5504971981048584,
"memory(GiB)": 36.87,
"step": 1575,
"token_acc": 0.8186260917787328,
"train_speed(iter/s)": 0.126306
},
{
"epoch": 1.9345479598867028,
"grad_norm": 0.9212282299995422,
"learning_rate": 2.7943132883461434e-06,
"loss": 0.547866678237915,
"memory(GiB)": 36.87,
"step": 1580,
"token_acc": 0.83892855733954,
"train_speed(iter/s)": 0.126413
},
{
"epoch": 1.9345479598867028,
"eval_loss": 0.6043635010719299,
"eval_runtime": 29.9792,
"eval_samples_per_second": 17.579,
"eval_steps_per_second": 4.403,
"eval_token_acc": 0.8282814194948086,
"step": 1580
},
{
"epoch": 1.940672127382684,
"grad_norm": 1.0419548749923706,
"learning_rate": 2.7655661069298934e-06,
"loss": 0.5519622325897217,
"memory(GiB)": 36.87,
"step": 1585,
"token_acc": 0.8333945887874247,
"train_speed(iter/s)": 0.126139
},
{
"epoch": 1.946796294878665,
"grad_norm": 1.043246865272522,
"learning_rate": 2.736910924479881e-06,
"loss": 0.5610580921173096,
"memory(GiB)": 36.87,
"step": 1590,
"token_acc": 0.8335689174006017,
"train_speed(iter/s)": 0.126246
},
{
"epoch": 1.952920462374646,
"grad_norm": 0.9222803711891174,
"learning_rate": 2.7083489208238784e-06,
"loss": 0.5393799304962158,
"memory(GiB)": 36.87,
"step": 1595,
"token_acc": 0.8198356395308001,
"train_speed(iter/s)": 0.126365
},
{
"epoch": 1.959044629870627,
"grad_norm": 0.927827775478363,
"learning_rate": 2.6798812719531843e-06,
"loss": 0.5392462730407714,
"memory(GiB)": 36.87,
"step": 1600,
"token_acc": 0.8314373587282766,
"train_speed(iter/s)": 0.126473
},
{
"epoch": 1.959044629870627,
"eval_loss": 0.6057147979736328,
"eval_runtime": 29.992,
"eval_samples_per_second": 17.571,
"eval_steps_per_second": 4.401,
"eval_token_acc": 0.8276408905418668,
"step": 1600
},
{
"epoch": 1.965168797366608,
"grad_norm": 1.0039829015731812,
"learning_rate": 2.6515091499741946e-06,
"loss": 0.5505844116210937,
"memory(GiB)": 36.87,
"step": 1605,
"token_acc": 0.8264420910319964,
"train_speed(iter/s)": 0.126174
},
{
"epoch": 1.971292964862589,
"grad_norm": 0.9382634162902832,
"learning_rate": 2.623233723060157e-06,
"loss": 0.5243973731994629,
"memory(GiB)": 36.87,
"step": 1610,
"token_acc": 0.8320176612255821,
"train_speed(iter/s)": 0.126271
},
{
"epoch": 1.9774171323585699,
"grad_norm": 0.9044788479804993,
"learning_rate": 2.595056155403063e-06,
"loss": 0.48435544967651367,
"memory(GiB)": 36.87,
"step": 1615,
"token_acc": 0.843847529543781,
"train_speed(iter/s)": 0.126372
},
{
"epoch": 1.9835412998545512,
"grad_norm": 0.9093387722969055,
"learning_rate": 2.5669776071657194e-06,
"loss": 0.515876293182373,
"memory(GiB)": 36.87,
"step": 1620,
"token_acc": 0.8395097436639068,
"train_speed(iter/s)": 0.126455
},
{
"epoch": 1.9835412998545512,
"eval_loss": 0.6055964231491089,
"eval_runtime": 29.9496,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8280644661397799,
"step": 1620
},
{
"epoch": 1.989665467350532,
"grad_norm": 1.0032296180725098,
"learning_rate": 2.5389992344339787e-06,
"loss": 0.5630090713500977,
"memory(GiB)": 36.87,
"step": 1625,
"token_acc": 0.8223650962996237,
"train_speed(iter/s)": 0.126171
},
{
"epoch": 1.995789634846513,
"grad_norm": 0.9951412677764893,
"learning_rate": 2.5111221891691384e-06,
"loss": 0.5040010452270508,
"memory(GiB)": 36.87,
"step": 1630,
"token_acc": 0.8500885437951233,
"train_speed(iter/s)": 0.126256
},
{
"epoch": 2.001224833499196,
"grad_norm": 1.4779112339019775,
"learning_rate": 2.4833476191605136e-06,
"loss": 0.514947509765625,
"memory(GiB)": 36.87,
"step": 1635,
"token_acc": 0.8120487926313169,
"train_speed(iter/s)": 0.126393
},
{
"epoch": 2.0073490009951773,
"grad_norm": 0.9851377010345459,
"learning_rate": 2.4556766679781763e-06,
"loss": 0.4878593921661377,
"memory(GiB)": 36.87,
"step": 1640,
"token_acc": 0.8328894582476486,
"train_speed(iter/s)": 0.126502
},
{
"epoch": 2.0073490009951773,
"eval_loss": 0.6103575825691223,
"eval_runtime": 29.9214,
"eval_samples_per_second": 17.613,
"eval_steps_per_second": 4.412,
"eval_token_acc": 0.8283795650601787,
"step": 1640
},
{
"epoch": 2.013473168491158,
"grad_norm": 0.9571102857589722,
"learning_rate": 2.4281104749258716e-06,
"loss": 0.49354209899902346,
"memory(GiB)": 36.87,
"step": 1645,
"token_acc": 0.832244552629024,
"train_speed(iter/s)": 0.126228
},
{
"epoch": 2.019597335987139,
"grad_norm": 0.9675928354263306,
"learning_rate": 2.4006501749941097e-06,
"loss": 0.47706212997436526,
"memory(GiB)": 36.87,
"step": 1650,
"token_acc": 0.8675657501494322,
"train_speed(iter/s)": 0.126296
},
{
"epoch": 2.0257215034831204,
"grad_norm": 0.9996489882469177,
"learning_rate": 2.3732968988134343e-06,
"loss": 0.4821828842163086,
"memory(GiB)": 36.87,
"step": 1655,
"token_acc": 0.8707286339040842,
"train_speed(iter/s)": 0.126397
},
{
"epoch": 2.0318456709791013,
"grad_norm": 1.0168769359588623,
"learning_rate": 2.3460517726078696e-06,
"loss": 0.47524452209472656,
"memory(GiB)": 36.87,
"step": 1660,
"token_acc": 0.855996970531534,
"train_speed(iter/s)": 0.126479
},
{
"epoch": 2.0318456709791013,
"eval_loss": 0.6196611523628235,
"eval_runtime": 29.918,
"eval_samples_per_second": 17.615,
"eval_steps_per_second": 4.412,
"eval_token_acc": 0.8268350637946175,
"step": 1660
},
{
"epoch": 2.037969838475082,
"grad_norm": 0.9725021123886108,
"learning_rate": 2.3189159181485517e-06,
"loss": 0.4909340858459473,
"memory(GiB)": 36.87,
"step": 1665,
"token_acc": 0.8357392077717726,
"train_speed(iter/s)": 0.126201
},
{
"epoch": 2.0440940059710635,
"grad_norm": 0.9119012355804443,
"learning_rate": 2.291890452707539e-06,
"loss": 0.4890812873840332,
"memory(GiB)": 36.87,
"step": 1670,
"token_acc": 0.8586991348926626,
"train_speed(iter/s)": 0.126291
},
{
"epoch": 2.0502181734670444,
"grad_norm": 1.0054688453674316,
"learning_rate": 2.2649764890118158e-06,
"loss": 0.49579925537109376,
"memory(GiB)": 36.87,
"step": 1675,
"token_acc": 0.8483596157331883,
"train_speed(iter/s)": 0.126396
},
{
"epoch": 2.0563423409630253,
"grad_norm": 0.9014572501182556,
"learning_rate": 2.238175135197471e-06,
"loss": 0.47943267822265623,
"memory(GiB)": 36.87,
"step": 1680,
"token_acc": 0.8587532153124527,
"train_speed(iter/s)": 0.126492
},
{
"epoch": 2.0563423409630253,
"eval_loss": 0.6154988408088684,
"eval_runtime": 29.9636,
"eval_samples_per_second": 17.588,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.8270726793739346,
"step": 1680
},
{
"epoch": 2.062466508459006,
"grad_norm": 0.938705563545227,
"learning_rate": 2.2114874947640763e-06,
"loss": 0.45625782012939453,
"memory(GiB)": 36.87,
"step": 1685,
"token_acc": 0.8355256733948025,
"train_speed(iter/s)": 0.126208
},
{
"epoch": 2.0685906759549875,
"grad_norm": 0.8989147543907166,
"learning_rate": 2.1849146665292513e-06,
"loss": 0.46575441360473635,
"memory(GiB)": 36.87,
"step": 1690,
"token_acc": 0.8795854481354284,
"train_speed(iter/s)": 0.126306
},
{
"epoch": 2.0747148434509683,
"grad_norm": 0.9596337080001831,
"learning_rate": 2.1584577445834234e-06,
"loss": 0.48124160766601565,
"memory(GiB)": 36.87,
"step": 1695,
"token_acc": 0.8413667107206717,
"train_speed(iter/s)": 0.126388
},
{
"epoch": 2.0808390109469492,
"grad_norm": 0.788873016834259,
"learning_rate": 2.132117818244771e-06,
"loss": 0.46569390296936036,
"memory(GiB)": 36.87,
"step": 1700,
"token_acc": 0.8618104667609618,
"train_speed(iter/s)": 0.12649
},
{
"epoch": 2.0808390109469492,
"eval_loss": 0.6172361373901367,
"eval_runtime": 29.9107,
"eval_samples_per_second": 17.619,
"eval_steps_per_second": 4.413,
"eval_token_acc": 0.8270313549253577,
"step": 1700
},
{
"epoch": 2.0869631784429306,
"grad_norm": 1.0761973857879639,
"learning_rate": 2.1058959720143875e-06,
"loss": 0.4640150547027588,
"memory(GiB)": 36.87,
"step": 1705,
"token_acc": 0.8384597955079729,
"train_speed(iter/s)": 0.126222
},
{
"epoch": 2.0930873459389114,
"grad_norm": 0.9404869675636292,
"learning_rate": 2.0797932855316183e-06,
"loss": 0.48186473846435546,
"memory(GiB)": 36.87,
"step": 1710,
"token_acc": 0.8572665858305907,
"train_speed(iter/s)": 0.126305
},
{
"epoch": 2.0992115134348923,
"grad_norm": 0.979210376739502,
"learning_rate": 2.0538108335296107e-06,
"loss": 0.4823300361633301,
"memory(GiB)": 36.87,
"step": 1715,
"token_acc": 0.8577344523032946,
"train_speed(iter/s)": 0.1264
},
{
"epoch": 2.1053356809308736,
"grad_norm": 0.9393882751464844,
"learning_rate": 2.0279496857910667e-06,
"loss": 0.48357486724853516,
"memory(GiB)": 36.87,
"step": 1720,
"token_acc": 0.8569940863614386,
"train_speed(iter/s)": 0.126508
},
{
"epoch": 2.1053356809308736,
"eval_loss": 0.6190218329429626,
"eval_runtime": 30.0103,
"eval_samples_per_second": 17.561,
"eval_steps_per_second": 4.398,
"eval_token_acc": 0.826953871584276,
"step": 1720
},
{
"epoch": 2.1114598484268545,
"grad_norm": 0.9903694987297058,
"learning_rate": 2.0022109071041905e-06,
"loss": 0.485797643661499,
"memory(GiB)": 36.87,
"step": 1725,
"token_acc": 0.8363958585952803,
"train_speed(iter/s)": 0.126264
},
{
"epoch": 2.1175840159228354,
"grad_norm": 0.8971183896064758,
"learning_rate": 1.9765955572188578e-06,
"loss": 0.468338680267334,
"memory(GiB)": 36.87,
"step": 1730,
"token_acc": 0.850854499843211,
"train_speed(iter/s)": 0.126332
},
{
"epoch": 2.1237081834188167,
"grad_norm": 0.8892176151275635,
"learning_rate": 1.951104690802969e-06,
"loss": 0.45011487007141116,
"memory(GiB)": 36.87,
"step": 1735,
"token_acc": 0.8456293706293706,
"train_speed(iter/s)": 0.126424
},
{
"epoch": 2.1298323509147976,
"grad_norm": 0.9592292904853821,
"learning_rate": 1.925739357399038e-06,
"loss": 0.45401706695556643,
"memory(GiB)": 36.87,
"step": 1740,
"token_acc": 0.8367899677215894,
"train_speed(iter/s)": 0.126512
},
{
"epoch": 2.1298323509147976,
"eval_loss": 0.6162592768669128,
"eval_runtime": 29.8564,
"eval_samples_per_second": 17.651,
"eval_steps_per_second": 4.421,
"eval_token_acc": 0.8268247326824733,
"step": 1740
},
{
"epoch": 2.1359565184107785,
"grad_norm": 0.9750301241874695,
"learning_rate": 1.9005006013809662e-06,
"loss": 0.5132875442504883,
"memory(GiB)": 36.87,
"step": 1745,
"token_acc": 0.8310148067894547,
"train_speed(iter/s)": 0.126241
},
{
"epoch": 2.1420806859067594,
"grad_norm": 1.0531848669052124,
"learning_rate": 1.8753894619110547e-06,
"loss": 0.4934427261352539,
"memory(GiB)": 36.87,
"step": 1750,
"token_acc": 0.8594569186824312,
"train_speed(iter/s)": 0.126339
},
{
"epoch": 2.1482048534027407,
"grad_norm": 0.9899281859397888,
"learning_rate": 1.8504069728972124e-06,
"loss": 0.5067736625671386,
"memory(GiB)": 36.87,
"step": 1755,
"token_acc": 0.8443969645619981,
"train_speed(iter/s)": 0.126433
},
{
"epoch": 2.1543290208987216,
"grad_norm": 0.9110437035560608,
"learning_rate": 1.8255541629503865e-06,
"loss": 0.43926572799682617,
"memory(GiB)": 36.87,
"step": 1760,
"token_acc": 0.8647777628575265,
"train_speed(iter/s)": 0.126537
},
{
"epoch": 2.1543290208987216,
"eval_loss": 0.6184687614440918,
"eval_runtime": 29.9388,
"eval_samples_per_second": 17.603,
"eval_steps_per_second": 4.409,
"eval_token_acc": 0.8269796993646366,
"step": 1760
},
{
"epoch": 2.1604531883947025,
"grad_norm": 0.956470251083374,
"learning_rate": 1.8008320553422116e-06,
"loss": 0.48296318054199217,
"memory(GiB)": 36.87,
"step": 1765,
"token_acc": 0.8321491877005113,
"train_speed(iter/s)": 0.126272
},
{
"epoch": 2.166577355890684,
"grad_norm": 0.990215003490448,
"learning_rate": 1.7762416679628792e-06,
"loss": 0.4733391761779785,
"memory(GiB)": 36.87,
"step": 1770,
"token_acc": 0.8625864925445863,
"train_speed(iter/s)": 0.126373
},
{
"epoch": 2.1727015233866647,
"grad_norm": 0.9296258687973022,
"learning_rate": 1.751784013279228e-06,
"loss": 0.4612305164337158,
"memory(GiB)": 36.87,
"step": 1775,
"token_acc": 0.8483617060223321,
"train_speed(iter/s)": 0.126475
},
{
"epoch": 2.1788256908826455,
"grad_norm": 0.9242532253265381,
"learning_rate": 1.7274600982930544e-06,
"loss": 0.4506662368774414,
"memory(GiB)": 36.87,
"step": 1780,
"token_acc": 0.8471429097741591,
"train_speed(iter/s)": 0.126543
},
{
"epoch": 2.1788256908826455,
"eval_loss": 0.6181926131248474,
"eval_runtime": 29.9341,
"eval_samples_per_second": 17.605,
"eval_steps_per_second": 4.41,
"eval_token_acc": 0.8270055271449972,
"step": 1780
},
{
"epoch": 2.1849498583786264,
"grad_norm": 0.9514071345329285,
"learning_rate": 1.7032709244996559e-06,
"loss": 0.45079612731933594,
"memory(GiB)": 36.87,
"step": 1785,
"token_acc": 0.8306619810862547,
"train_speed(iter/s)": 0.126266
},
{
"epoch": 2.1910740258746078,
"grad_norm": 0.8734022974967957,
"learning_rate": 1.6792174878465933e-06,
"loss": 0.4914576530456543,
"memory(GiB)": 36.87,
"step": 1790,
"token_acc": 0.8544291529366156,
"train_speed(iter/s)": 0.126359
},
{
"epoch": 2.1971981933705886,
"grad_norm": 0.9682619571685791,
"learning_rate": 1.65530077869268e-06,
"loss": 0.46589956283569334,
"memory(GiB)": 36.87,
"step": 1795,
"token_acc": 0.856334134219794,
"train_speed(iter/s)": 0.126455
},
{
"epoch": 2.2033223608665695,
"grad_norm": 0.9326309561729431,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.4956002712249756,
"memory(GiB)": 36.87,
"step": 1800,
"token_acc": 0.851890756302521,
"train_speed(iter/s)": 0.126552
},
{
"epoch": 2.2033223608665695,
"eval_loss": 0.6174434423446655,
"eval_runtime": 29.957,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8268092360142569,
"step": 1800
},
{
"epoch": 2.209446528362551,
"grad_norm": 1.0249996185302734,
"learning_rate": 1.607881476129432e-06,
"loss": 0.480439281463623,
"memory(GiB)": 36.87,
"step": 1805,
"token_acc": 0.8305227415396907,
"train_speed(iter/s)": 0.126269
},
{
"epoch": 2.2155706958585317,
"grad_norm": 0.9621463418006897,
"learning_rate": 1.5843808351281913e-06,
"loss": 0.4549149513244629,
"memory(GiB)": 36.87,
"step": 1810,
"token_acc": 0.8661414578031861,
"train_speed(iter/s)": 0.126344
},
{
"epoch": 2.2216948633545126,
"grad_norm": 0.977536141872406,
"learning_rate": 1.5610208263619002e-06,
"loss": 0.48578948974609376,
"memory(GiB)": 36.87,
"step": 1815,
"token_acc": 0.8355557119234422,
"train_speed(iter/s)": 0.12644
},
{
"epoch": 2.227819030850494,
"grad_norm": 0.9830949902534485,
"learning_rate": 1.537802411638677e-06,
"loss": 0.4825616359710693,
"memory(GiB)": 36.87,
"step": 1820,
"token_acc": 0.8571990136868813,
"train_speed(iter/s)": 0.126532
},
{
"epoch": 2.227819030850494,
"eval_loss": 0.6185752749443054,
"eval_runtime": 29.9623,
"eval_samples_per_second": 17.589,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8268195671264011,
"step": 1820
},
{
"epoch": 2.233943198346475,
"grad_norm": 0.8971763253211975,
"learning_rate": 1.514726546936749e-06,
"loss": 0.4621254920959473,
"memory(GiB)": 36.87,
"step": 1825,
"token_acc": 0.8387506208266652,
"train_speed(iter/s)": 0.126271
},
{
"epoch": 2.2400673658424557,
"grad_norm": 0.8897117972373962,
"learning_rate": 1.4917941823650917e-06,
"loss": 0.4865126609802246,
"memory(GiB)": 36.87,
"step": 1830,
"token_acc": 0.8466782763348031,
"train_speed(iter/s)": 0.126353
},
{
"epoch": 2.246191533338437,
"grad_norm": 0.9670534133911133,
"learning_rate": 1.4690062621243117e-06,
"loss": 0.4749399185180664,
"memory(GiB)": 36.87,
"step": 1835,
"token_acc": 0.8677187834569174,
"train_speed(iter/s)": 0.126441
},
{
"epoch": 2.252315700834418,
"grad_norm": 0.8603255152702332,
"learning_rate": 1.4463637244677648e-06,
"loss": 0.46147994995117186,
"memory(GiB)": 36.87,
"step": 1840,
"token_acc": 0.8514672004229767,
"train_speed(iter/s)": 0.126525
},
{
"epoch": 2.252315700834418,
"eval_loss": 0.618859052658081,
"eval_runtime": 29.975,
"eval_samples_per_second": 17.581,
"eval_steps_per_second": 4.404,
"eval_token_acc": 0.8268815537992665,
"step": 1840
},
{
"epoch": 2.258439868330399,
"grad_norm": 0.9529172778129578,
"learning_rate": 1.423867501662934e-06,
"loss": 0.4659478187561035,
"memory(GiB)": 36.87,
"step": 1845,
"token_acc": 0.8376929823340248,
"train_speed(iter/s)": 0.126266
},
{
"epoch": 2.2645640358263797,
"grad_norm": 0.9903680086135864,
"learning_rate": 1.4015185199530378e-06,
"loss": 0.4695383071899414,
"memory(GiB)": 36.87,
"step": 1850,
"token_acc": 0.8556800687408256,
"train_speed(iter/s)": 0.126353
},
{
"epoch": 2.270688203322361,
"grad_norm": 0.9149890542030334,
"learning_rate": 1.379317699518898e-06,
"loss": 0.47596092224121095,
"memory(GiB)": 36.87,
"step": 1855,
"token_acc": 0.8519627185522824,
"train_speed(iter/s)": 0.126419
},
{
"epoch": 2.276812370818342,
"grad_norm": 0.8692817091941833,
"learning_rate": 1.3572659544410493e-06,
"loss": 0.43576741218566895,
"memory(GiB)": 36.87,
"step": 1860,
"token_acc": 0.8630936883995983,
"train_speed(iter/s)": 0.126518
},
{
"epoch": 2.276812370818342,
"eval_loss": 0.6181974411010742,
"eval_runtime": 30.0408,
"eval_samples_per_second": 17.543,
"eval_steps_per_second": 4.394,
"eval_token_acc": 0.8271088382664393,
"step": 1860
},
{
"epoch": 2.2829365383143227,
"grad_norm": 0.9488633871078491,
"learning_rate": 1.3353641926621065e-06,
"loss": 0.45254907608032224,
"memory(GiB)": 36.87,
"step": 1865,
"token_acc": 0.8336496980155307,
"train_speed(iter/s)": 0.126266
},
{
"epoch": 2.289060705810304,
"grad_norm": 1.0025756359100342,
"learning_rate": 1.3136133159493803e-06,
"loss": 0.4933184623718262,
"memory(GiB)": 36.87,
"step": 1870,
"token_acc": 0.8573630940411556,
"train_speed(iter/s)": 0.126361
},
{
"epoch": 2.295184873306285,
"grad_norm": 0.8357995748519897,
"learning_rate": 1.2920142198577484e-06,
"loss": 0.45499467849731445,
"memory(GiB)": 36.87,
"step": 1875,
"token_acc": 0.8629192723138147,
"train_speed(iter/s)": 0.126432
},
{
"epoch": 2.301309040802266,
"grad_norm": 0.9138444066047668,
"learning_rate": 1.2705677936927841e-06,
"loss": 0.4767561435699463,
"memory(GiB)": 36.87,
"step": 1880,
"token_acc": 0.8521506375701698,
"train_speed(iter/s)": 0.126523
},
{
"epoch": 2.301309040802266,
"eval_loss": 0.6184601187705994,
"eval_runtime": 30.0434,
"eval_samples_per_second": 17.541,
"eval_steps_per_second": 4.394,
"eval_token_acc": 0.8273567849579008,
"step": 1880
},
{
"epoch": 2.3074332082982467,
"grad_norm": 0.9720640182495117,
"learning_rate": 1.2492749204741368e-06,
"loss": 0.4715888500213623,
"memory(GiB)": 36.87,
"step": 1885,
"token_acc": 0.8328760826785792,
"train_speed(iter/s)": 0.126264
},
{
"epoch": 2.313557375794228,
"grad_norm": 1.062354564666748,
"learning_rate": 1.2281364768991804e-06,
"loss": 0.4756108283996582,
"memory(GiB)": 36.87,
"step": 1890,
"token_acc": 0.8549824466648663,
"train_speed(iter/s)": 0.126366
},
{
"epoch": 2.319681543290209,
"grad_norm": 1.040152907371521,
"learning_rate": 1.207153333306914e-06,
"loss": 0.457261848449707,
"memory(GiB)": 36.87,
"step": 1895,
"token_acc": 0.858182628393182,
"train_speed(iter/s)": 0.126458
},
{
"epoch": 2.32580571078619,
"grad_norm": 0.9648529887199402,
"learning_rate": 1.1863263536421261e-06,
"loss": 0.49726166725158694,
"memory(GiB)": 36.87,
"step": 1900,
"token_acc": 0.8323407202216067,
"train_speed(iter/s)": 0.126559
},
{
"epoch": 2.32580571078619,
"eval_loss": 0.6168169975280762,
"eval_runtime": 30.03,
"eval_samples_per_second": 17.549,
"eval_steps_per_second": 4.396,
"eval_token_acc": 0.8270106927010693,
"step": 1900
},
{
"epoch": 2.331929878282171,
"grad_norm": 0.9079554677009583,
"learning_rate": 1.1656563954198258e-06,
"loss": 0.5002402305603028,
"memory(GiB)": 36.87,
"step": 1905,
"token_acc": 0.8321498929943166,
"train_speed(iter/s)": 0.126312
},
{
"epoch": 2.338054045778152,
"grad_norm": 1.007360816001892,
"learning_rate": 1.145144309689934e-06,
"loss": 0.4659921646118164,
"memory(GiB)": 36.87,
"step": 1910,
"token_acc": 0.8422997172478793,
"train_speed(iter/s)": 0.126382
},
{
"epoch": 2.344178213274133,
"grad_norm": 1.0213356018066406,
"learning_rate": 1.1247909410022434e-06,
"loss": 0.46290979385375974,
"memory(GiB)": 36.87,
"step": 1915,
"token_acc": 0.8491659285503396,
"train_speed(iter/s)": 0.12648
},
{
"epoch": 2.350302380770114,
"grad_norm": 1.1062732934951782,
"learning_rate": 1.1045971273716476e-06,
"loss": 0.4558609962463379,
"memory(GiB)": 36.87,
"step": 1920,
"token_acc": 0.8681887684181262,
"train_speed(iter/s)": 0.126559
},
{
"epoch": 2.350302380770114,
"eval_loss": 0.6178110837936401,
"eval_runtime": 29.9953,
"eval_samples_per_second": 17.569,
"eval_steps_per_second": 4.401,
"eval_token_acc": 0.8270985071542951,
"step": 1920
},
{
"epoch": 2.356426548266095,
"grad_norm": 0.9201487302780151,
"learning_rate": 1.0845637002436344e-06,
"loss": 0.46529560089111327,
"memory(GiB)": 36.87,
"step": 1925,
"token_acc": 0.8382993992876508,
"train_speed(iter/s)": 0.126335
},
{
"epoch": 2.362550715762076,
"grad_norm": 2.0007822513580322,
"learning_rate": 1.0646914844600543e-06,
"loss": 0.46782960891723635,
"memory(GiB)": 36.87,
"step": 1930,
"token_acc": 0.8615680194148577,
"train_speed(iter/s)": 0.126411
},
{
"epoch": 2.3686748832580573,
"grad_norm": 1.0234315395355225,
"learning_rate": 1.0449812982251556e-06,
"loss": 0.4937599658966064,
"memory(GiB)": 36.87,
"step": 1935,
"token_acc": 0.8580788129877638,
"train_speed(iter/s)": 0.126511
},
{
"epoch": 2.374799050754038,
"grad_norm": 0.964102566242218,
"learning_rate": 1.0254339530719031e-06,
"loss": 0.49028477668762205,
"memory(GiB)": 36.87,
"step": 1940,
"token_acc": 0.8427895540736877,
"train_speed(iter/s)": 0.126597
},
{
"epoch": 2.374799050754038,
"eval_loss": 0.618255078792572,
"eval_runtime": 29.9499,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8271708249393047,
"step": 1940
},
{
"epoch": 2.380923218250019,
"grad_norm": 0.9083016514778137,
"learning_rate": 1.0060502538285582e-06,
"loss": 0.47533645629882815,
"memory(GiB)": 36.87,
"step": 1945,
"token_acc": 0.8339908186042594,
"train_speed(iter/s)": 0.126344
},
{
"epoch": 2.3870473857460004,
"grad_norm": 0.9915279746055603,
"learning_rate": 9.868309985855446e-07,
"loss": 0.4681232452392578,
"memory(GiB)": 36.87,
"step": 1950,
"token_acc": 0.8487209179913675,
"train_speed(iter/s)": 0.126421
},
{
"epoch": 2.3931715532419813,
"grad_norm": 0.9561747312545776,
"learning_rate": 9.677769786625869e-07,
"loss": 0.48569955825805666,
"memory(GiB)": 36.87,
"step": 1955,
"token_acc": 0.8486834496318285,
"train_speed(iter/s)": 0.126494
},
{
"epoch": 2.399295720737962,
"grad_norm": 0.9286803603172302,
"learning_rate": 9.488889785761324e-07,
"loss": 0.44054179191589354,
"memory(GiB)": 36.87,
"step": 1960,
"token_acc": 0.8660492977141283,
"train_speed(iter/s)": 0.126575
},
{
"epoch": 2.399295720737962,
"eval_loss": 0.6190705895423889,
"eval_runtime": 29.9923,
"eval_samples_per_second": 17.571,
"eval_steps_per_second": 4.401,
"eval_token_acc": 0.8266181104395888,
"step": 1960
},
{
"epoch": 2.405419888233943,
"grad_norm": 1.0242820978164673,
"learning_rate": 9.301677760070449e-07,
"loss": 0.4897134304046631,
"memory(GiB)": 36.87,
"step": 1965,
"token_acc": 0.8352815571190013,
"train_speed(iter/s)": 0.126327
},
{
"epoch": 2.4115440557299244,
"grad_norm": 0.939855694770813,
"learning_rate": 9.116141417685898e-07,
"loss": 0.45674614906311034,
"memory(GiB)": 36.87,
"step": 1970,
"token_acc": 0.8488702986251586,
"train_speed(iter/s)": 0.126411
},
{
"epoch": 2.4176682232259052,
"grad_norm": 0.9036867022514343,
"learning_rate": 8.932288397746919e-07,
"loss": 0.4510343074798584,
"memory(GiB)": 36.87,
"step": 1975,
"token_acc": 0.8560777957860616,
"train_speed(iter/s)": 0.126495
},
{
"epoch": 2.423792390721886,
"grad_norm": 0.9866623878479004,
"learning_rate": 8.750126270084891e-07,
"loss": 0.4746750831604004,
"memory(GiB)": 36.87,
"step": 1980,
"token_acc": 0.8668764857535072,
"train_speed(iter/s)": 0.126576
},
{
"epoch": 2.423792390721886,
"eval_loss": 0.6186906099319458,
"eval_runtime": 29.9649,
"eval_samples_per_second": 17.587,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.8270881760421509,
"step": 1980
},
{
"epoch": 2.4299165582178675,
"grad_norm": 0.9331910610198975,
"learning_rate": 8.569662534911605e-07,
"loss": 0.4652440071105957,
"memory(GiB)": 36.87,
"step": 1985,
"token_acc": 0.8312124178436582,
"train_speed(iter/s)": 0.126348
},
{
"epoch": 2.4360407257138483,
"grad_norm": 0.8783546686172485,
"learning_rate": 8.390904622510471e-07,
"loss": 0.43751039505004885,
"memory(GiB)": 36.87,
"step": 1990,
"token_acc": 0.8822175660357916,
"train_speed(iter/s)": 0.126428
},
{
"epoch": 2.442164893209829,
"grad_norm": 0.9757615923881531,
"learning_rate": 8.213859892930581e-07,
"loss": 0.4832446098327637,
"memory(GiB)": 36.87,
"step": 1995,
"token_acc": 0.8358402898094188,
"train_speed(iter/s)": 0.126498
},
{
"epoch": 2.44828906070581,
"grad_norm": 0.9304143190383911,
"learning_rate": 8.03853563568367e-07,
"loss": 0.46181411743164064,
"memory(GiB)": 36.87,
"step": 2000,
"token_acc": 0.8611440231700895,
"train_speed(iter/s)": 0.126561
},
{
"epoch": 2.44828906070581,
"eval_loss": 0.6176819205284119,
"eval_runtime": 29.9501,
"eval_samples_per_second": 17.596,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8269900304767809,
"step": 2000
},
{
"epoch": 2.4544132282017914,
"grad_norm": 0.9659498333930969,
"learning_rate": 7.864939069444006e-07,
"loss": 0.47965211868286134,
"memory(GiB)": 36.87,
"step": 2005,
"token_acc": 0.831640513163097,
"train_speed(iter/s)": 0.126337
},
{
"epoch": 2.4605373956977723,
"grad_norm": 1.0172474384307861,
"learning_rate": 7.693077341751138e-07,
"loss": 0.4738880157470703,
"memory(GiB)": 36.87,
"step": 2010,
"token_acc": 0.8708014805078004,
"train_speed(iter/s)": 0.126397
},
{
"epoch": 2.466661563193753,
"grad_norm": 0.9642378091812134,
"learning_rate": 7.522957528715636e-07,
"loss": 0.4847827911376953,
"memory(GiB)": 36.87,
"step": 2015,
"token_acc": 0.8358588871654138,
"train_speed(iter/s)": 0.126477
},
{
"epoch": 2.4727857306897345,
"grad_norm": 1.015015721321106,
"learning_rate": 7.354586634727729e-07,
"loss": 0.48462276458740233,
"memory(GiB)": 36.87,
"step": 2020,
"token_acc": 0.8585737976782752,
"train_speed(iter/s)": 0.126571
},
{
"epoch": 2.4727857306897345,
"eval_loss": 0.6191264986991882,
"eval_runtime": 29.948,
"eval_samples_per_second": 17.597,
"eval_steps_per_second": 4.408,
"eval_token_acc": 0.8270416860375019,
"step": 2020
},
{
"epoch": 2.4789098981857154,
"grad_norm": 0.9471215009689331,
"learning_rate": 7.187971592168936e-07,
"loss": 0.4690380096435547,
"memory(GiB)": 36.87,
"step": 2025,
"token_acc": 0.83595499208511,
"train_speed(iter/s)": 0.126356
},
{
"epoch": 2.4850340656816963,
"grad_norm": 0.9936275482177734,
"learning_rate": 7.023119261126571e-07,
"loss": 0.4644585609436035,
"memory(GiB)": 36.87,
"step": 2030,
"token_acc": 0.8678402074165046,
"train_speed(iter/s)": 0.126441
},
{
"epoch": 2.4911582331776776,
"grad_norm": 0.9575950503349304,
"learning_rate": 6.860036429111394e-07,
"loss": 0.4721442699432373,
"memory(GiB)": 36.87,
"step": 2035,
"token_acc": 0.8546081813701331,
"train_speed(iter/s)": 0.126503
},
{
"epoch": 2.4972824006736585,
"grad_norm": 1.002123475074768,
"learning_rate": 6.698729810778065e-07,
"loss": 0.4855657577514648,
"memory(GiB)": 36.87,
"step": 2040,
"token_acc": 0.8467344696835466,
"train_speed(iter/s)": 0.126594
},
{
"epoch": 2.4972824006736585,
"eval_loss": 0.6174936294555664,
"eval_runtime": 29.9826,
"eval_samples_per_second": 17.577,
"eval_steps_per_second": 4.403,
"eval_token_acc": 0.8267524148974638,
"step": 2040
},
{
"epoch": 2.5034065681696394,
"grad_norm": 0.9535955786705017,
"learning_rate": 6.539206047648705e-07,
"loss": 0.46763386726379397,
"memory(GiB)": 36.87,
"step": 2045,
"token_acc": 0.8324564664169503,
"train_speed(iter/s)": 0.12638
},
{
"epoch": 2.5095307356656207,
"grad_norm": 0.9121899008750916,
"learning_rate": 6.381471707839449e-07,
"loss": 0.44632792472839355,
"memory(GiB)": 36.87,
"step": 2050,
"token_acc": 0.8615195671656654,
"train_speed(iter/s)": 0.126463
},
{
"epoch": 2.5156549031616016,
"grad_norm": 0.9412456750869751,
"learning_rate": 6.225533285789997e-07,
"loss": 0.46562681198120115,
"memory(GiB)": 36.87,
"step": 2055,
"token_acc": 0.8705161854768154,
"train_speed(iter/s)": 0.126535
},
{
"epoch": 2.5217790706575824,
"grad_norm": 0.9882362484931946,
"learning_rate": 6.071397201996243e-07,
"loss": 0.47277240753173827,
"memory(GiB)": 36.87,
"step": 2060,
"token_acc": 0.8616963064295485,
"train_speed(iter/s)": 0.126619
},
{
"epoch": 2.5217790706575824,
"eval_loss": 0.6171227097511292,
"eval_runtime": 29.9995,
"eval_samples_per_second": 17.567,
"eval_steps_per_second": 4.4,
"eval_token_acc": 0.8271191693785837,
"step": 2060
},
{
"epoch": 2.5279032381535638,
"grad_norm": 1.003340482711792,
"learning_rate": 5.919069802745914e-07,
"loss": 0.4641777515411377,
"memory(GiB)": 36.87,
"step": 2065,
"token_acc": 0.8311811067402455,
"train_speed(iter/s)": 0.126395
},
{
"epoch": 2.5340274056495447,
"grad_norm": 0.9802326560020447,
"learning_rate": 5.768557359857241e-07,
"loss": 0.4592477321624756,
"memory(GiB)": 36.87,
"step": 2070,
"token_acc": 0.8501814594270815,
"train_speed(iter/s)": 0.12647
},
{
"epoch": 2.5401515731455255,
"grad_norm": 0.9962190985679626,
"learning_rate": 5.619866070420766e-07,
"loss": 0.4591672897338867,
"memory(GiB)": 36.87,
"step": 2075,
"token_acc": 0.8545560747663551,
"train_speed(iter/s)": 0.126542
},
{
"epoch": 2.5462757406415064,
"grad_norm": 0.8959765434265137,
"learning_rate": 5.473002056544191e-07,
"loss": 0.43817138671875,
"memory(GiB)": 36.87,
"step": 2080,
"token_acc": 0.8604302151075538,
"train_speed(iter/s)": 0.12662
},
{
"epoch": 2.5462757406415064,
"eval_loss": 0.6181796789169312,
"eval_runtime": 29.9775,
"eval_samples_per_second": 17.58,
"eval_steps_per_second": 4.403,
"eval_token_acc": 0.8274600960793429,
"step": 2080
},
{
"epoch": 2.5523999081374873,
"grad_norm": 0.9490206837654114,
"learning_rate": 5.327971365100276e-07,
"loss": 0.4962893486022949,
"memory(GiB)": 36.87,
"step": 2085,
"token_acc": 0.8353908876332166,
"train_speed(iter/s)": 0.126398
},
{
"epoch": 2.5585240756334686,
"grad_norm": 0.9097649455070496,
"learning_rate": 5.184779967477893e-07,
"loss": 0.4803347110748291,
"memory(GiB)": 36.87,
"step": 2090,
"token_acc": 0.856787781665958,
"train_speed(iter/s)": 0.126471
},
{
"epoch": 2.5646482431294495,
"grad_norm": 1.0033832788467407,
"learning_rate": 5.043433759336158e-07,
"loss": 0.4686880111694336,
"memory(GiB)": 36.87,
"step": 2095,
"token_acc": 0.8430141843971631,
"train_speed(iter/s)": 0.126552
},
{
"epoch": 2.5707724106254304,
"grad_norm": 0.9607586860656738,
"learning_rate": 4.903938560361698e-07,
"loss": 0.48217024803161623,
"memory(GiB)": 36.87,
"step": 2100,
"token_acc": 0.8403233581785144,
"train_speed(iter/s)": 0.126627
},
{
"epoch": 2.5707724106254304,
"eval_loss": 0.6173272728919983,
"eval_runtime": 30.0012,
"eval_samples_per_second": 17.566,
"eval_steps_per_second": 4.4,
"eval_token_acc": 0.8270313549253577,
"step": 2100
},
{
"epoch": 2.5768965781214117,
"grad_norm": 0.9122663140296936,
"learning_rate": 4.76630011402901e-07,
"loss": 0.45368499755859376,
"memory(GiB)": 36.87,
"step": 2105,
"token_acc": 0.8375441091626303,
"train_speed(iter/s)": 0.1264
},
{
"epoch": 2.5830207456173926,
"grad_norm": 0.9508864879608154,
"learning_rate": 4.630524087364019e-07,
"loss": 0.4732816696166992,
"memory(GiB)": 36.87,
"step": 2110,
"token_acc": 0.8504168897728142,
"train_speed(iter/s)": 0.126473
},
{
"epoch": 2.5891449131133735,
"grad_norm": 0.9362801909446716,
"learning_rate": 4.4966160707107075e-07,
"loss": 0.48420238494873047,
"memory(GiB)": 36.87,
"step": 2115,
"token_acc": 0.8500457797822917,
"train_speed(iter/s)": 0.126544
},
{
"epoch": 2.595269080609355,
"grad_norm": 0.9054739475250244,
"learning_rate": 4.364581577500987e-07,
"loss": 0.43644113540649415,
"memory(GiB)": 36.87,
"step": 2120,
"token_acc": 0.8647457297507536,
"train_speed(iter/s)": 0.126611
},
{
"epoch": 2.595269080609355,
"eval_loss": 0.6172040104866028,
"eval_runtime": 29.9489,
"eval_samples_per_second": 17.597,
"eval_steps_per_second": 4.408,
"eval_token_acc": 0.827186321607521,
"step": 2120
},
{
"epoch": 2.6013932481053357,
"grad_norm": 0.9941717386245728,
"learning_rate": 4.2344260440276455e-07,
"loss": 0.5040837287902832,
"memory(GiB)": 36.87,
"step": 2125,
"token_acc": 0.8330278541475742,
"train_speed(iter/s)": 0.126387
},
{
"epoch": 2.6075174156013166,
"grad_norm": 0.9401910901069641,
"learning_rate": 4.10615482922056e-07,
"loss": 0.4815082550048828,
"memory(GiB)": 36.87,
"step": 2130,
"token_acc": 0.8560084700899947,
"train_speed(iter/s)": 0.126454
},
{
"epoch": 2.613641583097298,
"grad_norm": 0.9616697430610657,
"learning_rate": 3.979773214426019e-07,
"loss": 0.484088134765625,
"memory(GiB)": 36.87,
"step": 2135,
"token_acc": 0.8334314302530901,
"train_speed(iter/s)": 0.126532
},
{
"epoch": 2.6197657505932788,
"grad_norm": 0.9681402444839478,
"learning_rate": 3.85528640318929e-07,
"loss": 0.46643218994140623,
"memory(GiB)": 36.87,
"step": 2140,
"token_acc": 0.8545515745381106,
"train_speed(iter/s)": 0.126601
},
{
"epoch": 2.6197657505932788,
"eval_loss": 0.6171387434005737,
"eval_runtime": 30.0426,
"eval_samples_per_second": 17.542,
"eval_steps_per_second": 4.394,
"eval_token_acc": 0.8271914871635931,
"step": 2140
},
{
"epoch": 2.6258899180892596,
"grad_norm": 0.9676551222801208,
"learning_rate": 3.732699521040378e-07,
"loss": 0.46207480430603026,
"memory(GiB)": 36.87,
"step": 2145,
"token_acc": 0.8425608384317814,
"train_speed(iter/s)": 0.126373
},
{
"epoch": 2.632014085585241,
"grad_norm": 1.003116250038147,
"learning_rate": 3.612017615282964e-07,
"loss": 0.4896972179412842,
"memory(GiB)": 36.87,
"step": 2150,
"token_acc": 0.8342796309439319,
"train_speed(iter/s)": 0.126459
},
{
"epoch": 2.638138253081222,
"grad_norm": 1.0150328874588013,
"learning_rate": 3.49324565478662e-07,
"loss": 0.513043737411499,
"memory(GiB)": 36.87,
"step": 2155,
"token_acc": 0.855285740368815,
"train_speed(iter/s)": 0.126542
},
{
"epoch": 2.6442624205772027,
"grad_norm": 0.9732238054275513,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.4932279586791992,
"memory(GiB)": 36.87,
"step": 2160,
"token_acc": 0.8633442370598422,
"train_speed(iter/s)": 0.126609
},
{
"epoch": 2.6442624205772027,
"eval_loss": 0.617060124874115,
"eval_runtime": 29.801,
"eval_samples_per_second": 17.684,
"eval_steps_per_second": 4.429,
"eval_token_acc": 0.827367116070045,
"step": 2160
},
{
"epoch": 2.650386588073184,
"grad_norm": 0.8650747537612915,
"learning_rate": 3.261451051660547e-07,
"loss": 0.47266697883605957,
"memory(GiB)": 36.87,
"step": 2165,
"token_acc": 0.8351215537145186,
"train_speed(iter/s)": 0.126394
},
{
"epoch": 2.656510755569165,
"grad_norm": 1.0742039680480957,
"learning_rate": 3.1484379527742746e-07,
"loss": 0.48064508438110354,
"memory(GiB)": 36.87,
"step": 2170,
"token_acc": 0.8557394880859308,
"train_speed(iter/s)": 0.126467
},
{
"epoch": 2.662634923065146,
"grad_norm": 1.0016287565231323,
"learning_rate": 3.037353886243055e-07,
"loss": 0.46164817810058595,
"memory(GiB)": 36.87,
"step": 2175,
"token_acc": 0.8649181267691426,
"train_speed(iter/s)": 0.126536
},
{
"epoch": 2.6687590905611267,
"grad_norm": 0.9850811958312988,
"learning_rate": 2.928203425761961e-07,
"loss": 0.4678659915924072,
"memory(GiB)": 36.87,
"step": 2180,
"token_acc": 0.8360384946854352,
"train_speed(iter/s)": 0.12661
},
{
"epoch": 2.6687590905611267,
"eval_loss": 0.6176232099533081,
"eval_runtime": 29.9574,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8272896327289633,
"step": 2180
},
{
"epoch": 2.674883258057108,
"grad_norm": 0.9312725067138672,
"learning_rate": 2.820991065413159e-07,
"loss": 0.49228496551513673,
"memory(GiB)": 36.87,
"step": 2185,
"token_acc": 0.8372572060551601,
"train_speed(iter/s)": 0.12639
},
{
"epoch": 2.681007425553089,
"grad_norm": 0.9902373552322388,
"learning_rate": 2.71572121948091e-07,
"loss": 0.48822717666625975,
"memory(GiB)": 36.87,
"step": 2190,
"token_acc": 0.8330284513291558,
"train_speed(iter/s)": 0.126463
},
{
"epoch": 2.68713159304907,
"grad_norm": 0.9834261536598206,
"learning_rate": 2.612398222269752e-07,
"loss": 0.47580180168151853,
"memory(GiB)": 36.87,
"step": 2195,
"token_acc": 0.8485915492957746,
"train_speed(iter/s)": 0.126526
},
{
"epoch": 2.6932557605450507,
"grad_norm": 1.019545316696167,
"learning_rate": 2.511026327926114e-07,
"loss": 0.5008028507232666,
"memory(GiB)": 36.87,
"step": 2200,
"token_acc": 0.8462324594159034,
"train_speed(iter/s)": 0.126603
},
{
"epoch": 2.6932557605450507,
"eval_loss": 0.6174827218055725,
"eval_runtime": 30.0074,
"eval_samples_per_second": 17.562,
"eval_steps_per_second": 4.399,
"eval_token_acc": 0.8274962549718478,
"step": 2200
},
{
"epoch": 2.699379928041032,
"grad_norm": 1.0158801078796387,
"learning_rate": 2.411609710263091e-07,
"loss": 0.46188907623291015,
"memory(GiB)": 36.87,
"step": 2205,
"token_acc": 0.8344093700899872,
"train_speed(iter/s)": 0.126407
},
{
"epoch": 2.705504095537013,
"grad_norm": 0.8824003338813782,
"learning_rate": 2.314152462588659e-07,
"loss": 0.4691601753234863,
"memory(GiB)": 36.87,
"step": 2210,
"token_acc": 0.8543765099423899,
"train_speed(iter/s)": 0.126481
},
{
"epoch": 2.7116282630329938,
"grad_norm": 0.9510777592658997,
"learning_rate": 2.2186585975370935e-07,
"loss": 0.4572303771972656,
"memory(GiB)": 36.87,
"step": 2215,
"token_acc": 0.8658688406088109,
"train_speed(iter/s)": 0.126579
},
{
"epoch": 2.717752430528975,
"grad_norm": 0.9548938274383545,
"learning_rate": 2.1251320469037827e-07,
"loss": 0.4614152431488037,
"memory(GiB)": 36.87,
"step": 2220,
"token_acc": 0.8602180404138602,
"train_speed(iter/s)": 0.126644
},
{
"epoch": 2.717752430528975,
"eval_loss": 0.6176718473434448,
"eval_runtime": 29.9684,
"eval_samples_per_second": 17.585,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.8273722816261171,
"step": 2220
},
{
"epoch": 2.723876598024956,
"grad_norm": 0.9301165342330933,
"learning_rate": 2.0335766614833275e-07,
"loss": 0.4707462310791016,
"memory(GiB)": 36.87,
"step": 2225,
"token_acc": 0.8354241550286274,
"train_speed(iter/s)": 0.126431
},
{
"epoch": 2.730000765520937,
"grad_norm": 0.9284428954124451,
"learning_rate": 1.9439962109110032e-07,
"loss": 0.45088810920715333,
"memory(GiB)": 36.87,
"step": 2230,
"token_acc": 0.8519607113624525,
"train_speed(iter/s)": 0.126503
},
{
"epoch": 2.736124933016918,
"grad_norm": 0.9004064798355103,
"learning_rate": 1.8563943835075315e-07,
"loss": 0.4744719982147217,
"memory(GiB)": 36.87,
"step": 2235,
"token_acc": 0.8538511282801811,
"train_speed(iter/s)": 0.126568
},
{
"epoch": 2.742249100512899,
"grad_norm": 0.9502014517784119,
"learning_rate": 1.770774786127244e-07,
"loss": 0.4691894054412842,
"memory(GiB)": 36.87,
"step": 2240,
"token_acc": 0.8554979031914137,
"train_speed(iter/s)": 0.12664
},
{
"epoch": 2.742249100512899,
"eval_loss": 0.6172018051147461,
"eval_runtime": 29.9615,
"eval_samples_per_second": 17.589,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8271501627150163,
"step": 2240
},
{
"epoch": 2.74837326800888,
"grad_norm": 1.0235203504562378,
"learning_rate": 1.6871409440095687e-07,
"loss": 0.5208240509033203,
"memory(GiB)": 36.87,
"step": 2245,
"token_acc": 0.8266149979641408,
"train_speed(iter/s)": 0.12644
},
{
"epoch": 2.7544974355048613,
"grad_norm": 0.9667299389839172,
"learning_rate": 1.6054963006338742e-07,
"loss": 0.46748833656311034,
"memory(GiB)": 36.87,
"step": 2250,
"token_acc": 0.8670063058890591,
"train_speed(iter/s)": 0.126499
},
{
"epoch": 2.760621603000842,
"grad_norm": 0.990592360496521,
"learning_rate": 1.5258442175777045e-07,
"loss": 0.4987760066986084,
"memory(GiB)": 36.87,
"step": 2255,
"token_acc": 0.8603155845961351,
"train_speed(iter/s)": 0.126571
},
{
"epoch": 2.766745770496823,
"grad_norm": 1.023600459098816,
"learning_rate": 1.44818797437834e-07,
"loss": 0.5020921707153321,
"memory(GiB)": 36.87,
"step": 2260,
"token_acc": 0.8367556063532101,
"train_speed(iter/s)": 0.126649
},
{
"epoch": 2.766745770496823,
"eval_loss": 0.6171240210533142,
"eval_runtime": 29.9592,
"eval_samples_per_second": 17.591,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8274600960793429,
"step": 2260
},
{
"epoch": 2.7728699379928043,
"grad_norm": 0.9850562810897827,
"learning_rate": 1.372530768397845e-07,
"loss": 0.49890799522399903,
"memory(GiB)": 36.87,
"step": 2265,
"token_acc": 0.8388872065619528,
"train_speed(iter/s)": 0.126453
},
{
"epoch": 2.7789941054887852,
"grad_norm": 1.0308210849761963,
"learning_rate": 1.2988757146913223e-07,
"loss": 0.49098944664001465,
"memory(GiB)": 36.87,
"step": 2270,
"token_acc": 0.8366363778787405,
"train_speed(iter/s)": 0.12653
},
{
"epoch": 2.785118272984766,
"grad_norm": 0.9139099717140198,
"learning_rate": 1.227225845878721e-07,
"loss": 0.482135009765625,
"memory(GiB)": 36.87,
"step": 2275,
"token_acc": 0.8365253330381195,
"train_speed(iter/s)": 0.12659
},
{
"epoch": 2.791242440480747,
"grad_norm": 1.0616044998168945,
"learning_rate": 1.157584112019966e-07,
"loss": 0.5007448196411133,
"memory(GiB)": 36.87,
"step": 2280,
"token_acc": 0.83696904524157,
"train_speed(iter/s)": 0.126675
},
{
"epoch": 2.791242440480747,
"eval_loss": 0.6172557473182678,
"eval_runtime": 29.9856,
"eval_samples_per_second": 17.575,
"eval_steps_per_second": 4.402,
"eval_token_acc": 0.8271966527196652,
"step": 2280
},
{
"epoch": 2.7973666079767283,
"grad_norm": 0.8870491981506348,
"learning_rate": 1.0899533804934637e-07,
"loss": 0.4659425258636475,
"memory(GiB)": 36.87,
"step": 2285,
"token_acc": 0.8472861329549524,
"train_speed(iter/s)": 0.126465
},
{
"epoch": 2.803490775472709,
"grad_norm": 0.8363329768180847,
"learning_rate": 1.0243364358780817e-07,
"loss": 0.46242237091064453,
"memory(GiB)": 36.87,
"step": 2290,
"token_acc": 0.8513745704467354,
"train_speed(iter/s)": 0.126527
},
{
"epoch": 2.80961494296869,
"grad_norm": 1.0086287260055542,
"learning_rate": 9.607359798384785e-08,
"loss": 0.46642189025878905,
"memory(GiB)": 36.87,
"step": 2295,
"token_acc": 0.8569345046297867,
"train_speed(iter/s)": 0.126585
},
{
"epoch": 2.815739110464671,
"grad_norm": 0.9434347748756409,
"learning_rate": 8.991546310138599e-08,
"loss": 0.48851499557495115,
"memory(GiB)": 36.87,
"step": 2300,
"token_acc": 0.8429763909289578,
"train_speed(iter/s)": 0.126657
},
{
"epoch": 2.815739110464671,
"eval_loss": 0.6172496676445007,
"eval_runtime": 29.9586,
"eval_samples_per_second": 17.591,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8273929438504055,
"step": 2300
},
{
"epoch": 2.8218632779606523,
"grad_norm": 0.8611441254615784,
"learning_rate": 8.395949249101754e-08,
"loss": 0.4366280555725098,
"memory(GiB)": 36.87,
"step": 2305,
"token_acc": 0.8454527389547593,
"train_speed(iter/s)": 0.126446
},
{
"epoch": 2.827987445456633,
"grad_norm": 0.9308704137802124,
"learning_rate": 7.820593137957244e-08,
"loss": 0.49471750259399416,
"memory(GiB)": 36.87,
"step": 2310,
"token_acc": 0.8447947341070501,
"train_speed(iter/s)": 0.126514
},
{
"epoch": 2.834111612952614,
"grad_norm": 0.9204055070877075,
"learning_rate": 7.265501666001706e-08,
"loss": 0.5066485404968262,
"memory(GiB)": 36.87,
"step": 2315,
"token_acc": 0.8312655086848635,
"train_speed(iter/s)": 0.12658
},
{
"epoch": 2.8402357804485954,
"grad_norm": 0.9611912369728088,
"learning_rate": 6.730697688170251e-08,
"loss": 0.4841705322265625,
"memory(GiB)": 36.87,
"step": 2320,
"token_acc": 0.8579581483830057,
"train_speed(iter/s)": 0.126649
},
{
"epoch": 2.8402357804485954,
"eval_loss": 0.6172900199890137,
"eval_runtime": 29.9623,
"eval_samples_per_second": 17.589,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8273154605093238,
"step": 2320
},
{
"epoch": 2.8463599479445763,
"grad_norm": 0.9565967321395874,
"learning_rate": 6.216203224095386e-08,
"loss": 0.45609331130981445,
"memory(GiB)": 36.87,
"step": 2325,
"token_acc": 0.8340782438969194,
"train_speed(iter/s)": 0.126439
},
{
"epoch": 2.852484115440557,
"grad_norm": 0.9000151753425598,
"learning_rate": 5.722039457200235e-08,
"loss": 0.46810379028320315,
"memory(GiB)": 36.87,
"step": 2330,
"token_acc": 0.854035216434336,
"train_speed(iter/s)": 0.126509
},
{
"epoch": 2.8586082829365385,
"grad_norm": 0.9673875570297241,
"learning_rate": 5.248226733826689e-08,
"loss": 0.496975040435791,
"memory(GiB)": 36.87,
"step": 2335,
"token_acc": 0.8512393729597877,
"train_speed(iter/s)": 0.126574
},
{
"epoch": 2.8647324504325193,
"grad_norm": 0.9579722881317139,
"learning_rate": 4.794784562397459e-08,
"loss": 0.5033215045928955,
"memory(GiB)": 36.87,
"step": 2340,
"token_acc": 0.854410310614068,
"train_speed(iter/s)": 0.126653
},
{
"epoch": 2.8647324504325193,
"eval_loss": 0.6173553466796875,
"eval_runtime": 30.0724,
"eval_samples_per_second": 17.524,
"eval_steps_per_second": 4.389,
"eval_token_acc": 0.8274394338550545,
"step": 2340
},
{
"epoch": 2.8708566179285,
"grad_norm": 0.967046320438385,
"learning_rate": 4.361731612612607e-08,
"loss": 0.4593523025512695,
"memory(GiB)": 36.87,
"step": 2345,
"token_acc": 0.8404793034195522,
"train_speed(iter/s)": 0.126443
},
{
"epoch": 2.8769807854244815,
"grad_norm": 1.005210280418396,
"learning_rate": 3.949085714681389e-08,
"loss": 0.44494943618774413,
"memory(GiB)": 36.87,
"step": 2350,
"token_acc": 0.8719988481336165,
"train_speed(iter/s)": 0.126502
},
{
"epoch": 2.8831049529204624,
"grad_norm": 0.9526214599609375,
"learning_rate": 3.556863858587833e-08,
"loss": 0.4602672576904297,
"memory(GiB)": 36.87,
"step": 2355,
"token_acc": 0.8692367364835238,
"train_speed(iter/s)": 0.126552
},
{
"epoch": 2.8892291204164433,
"grad_norm": 0.9232168197631836,
"learning_rate": 3.185082193391143e-08,
"loss": 0.48589048385620115,
"memory(GiB)": 36.87,
"step": 2360,
"token_acc": 0.8542855225182898,
"train_speed(iter/s)": 0.126625
},
{
"epoch": 2.8892291204164433,
"eval_loss": 0.6173009276390076,
"eval_runtime": 29.9578,
"eval_samples_per_second": 17.591,
"eval_steps_per_second": 4.406,
"eval_token_acc": 0.8273516194018286,
"step": 2360
},
{
"epoch": 2.8953532879124246,
"grad_norm": 0.9514647126197815,
"learning_rate": 2.8337560265608853e-08,
"loss": 0.46231327056884763,
"memory(GiB)": 36.87,
"step": 2365,
"token_acc": 0.8406889558929477,
"train_speed(iter/s)": 0.126404
},
{
"epoch": 2.9014774554084055,
"grad_norm": 0.9117730259895325,
"learning_rate": 2.5028998233467272e-08,
"loss": 0.47223424911499023,
"memory(GiB)": 36.87,
"step": 2370,
"token_acc": 0.8618170593682478,
"train_speed(iter/s)": 0.12647
},
{
"epoch": 2.9076016229043864,
"grad_norm": 0.9326623678207397,
"learning_rate": 2.1925272061829038e-08,
"loss": 0.47979536056518557,
"memory(GiB)": 36.87,
"step": 2375,
"token_acc": 0.854614639049431,
"train_speed(iter/s)": 0.126535
},
{
"epoch": 2.9137257904003677,
"grad_norm": 0.9446553587913513,
"learning_rate": 1.9026509541272276e-08,
"loss": 0.4813851356506348,
"memory(GiB)": 36.87,
"step": 2380,
"token_acc": 0.8567724059536074,
"train_speed(iter/s)": 0.126602
},
{
"epoch": 2.9137257904003677,
"eval_loss": 0.6172758936882019,
"eval_runtime": 29.898,
"eval_samples_per_second": 17.627,
"eval_steps_per_second": 4.415,
"eval_token_acc": 0.8273154605093238,
"step": 2380
},
{
"epoch": 2.9198499578963486,
"grad_norm": 0.9821135401725769,
"learning_rate": 1.6332830023350065e-08,
"loss": 0.4704907417297363,
"memory(GiB)": 36.87,
"step": 2385,
"token_acc": 0.835956510119977,
"train_speed(iter/s)": 0.126422
},
{
"epoch": 2.9259741253923295,
"grad_norm": 0.9702714681625366,
"learning_rate": 1.3844344415676059e-08,
"loss": 0.5124542713165283,
"memory(GiB)": 36.87,
"step": 2390,
"token_acc": 0.8496258847320526,
"train_speed(iter/s)": 0.126502
},
{
"epoch": 2.9320982928883104,
"grad_norm": 0.8899897336959839,
"learning_rate": 1.156115517735812e-08,
"loss": 0.45607595443725585,
"memory(GiB)": 36.87,
"step": 2395,
"token_acc": 0.8606019542115398,
"train_speed(iter/s)": 0.12656
},
{
"epoch": 2.9382224603842912,
"grad_norm": 0.8827829360961914,
"learning_rate": 9.48335631477948e-09,
"loss": 0.4859360694885254,
"memory(GiB)": 36.87,
"step": 2400,
"token_acc": 0.8470638693305693,
"train_speed(iter/s)": 0.126623
},
{
"epoch": 2.9382224603842912,
"eval_loss": 0.617277204990387,
"eval_runtime": 29.9293,
"eval_samples_per_second": 17.608,
"eval_steps_per_second": 4.41,
"eval_token_acc": 0.8274910894157756,
"step": 2400
},
{
"epoch": 2.9443466278802726,
"grad_norm": 0.9587947726249695,
"learning_rate": 7.611033377729615e-09,
"loss": 0.5192126274108887,
"memory(GiB)": 36.87,
"step": 2405,
"token_acc": 0.8289369284093598,
"train_speed(iter/s)": 0.126434
},
{
"epoch": 2.9504707953762535,
"grad_norm": 0.9472404718399048,
"learning_rate": 5.944263455879284e-09,
"loss": 0.4903435707092285,
"memory(GiB)": 36.87,
"step": 2410,
"token_acc": 0.8476331360946746,
"train_speed(iter/s)": 0.1265
},
{
"epoch": 2.9565949628722343,
"grad_norm": 0.9245522618293762,
"learning_rate": 4.4831151756091766e-09,
"loss": 0.4952064037322998,
"memory(GiB)": 36.87,
"step": 2415,
"token_acc": 0.8508173686555399,
"train_speed(iter/s)": 0.126559
},
{
"epoch": 2.9627191303682157,
"grad_norm": 0.8794329762458801,
"learning_rate": 3.227648697182173e-09,
"loss": 0.4453686237335205,
"memory(GiB)": 36.87,
"step": 2420,
"token_acc": 0.850396277175889,
"train_speed(iter/s)": 0.126607
},
{
"epoch": 2.9627191303682157,
"eval_loss": 0.6172167062759399,
"eval_runtime": 29.9678,
"eval_samples_per_second": 17.586,
"eval_steps_per_second": 4.405,
"eval_token_acc": 0.8275840694250736,
"step": 2420
},
{
"epoch": 2.9688432978641965,
"grad_norm": 0.9200014472007751,
"learning_rate": 2.177915712268108e-09,
"loss": 0.4687533378601074,
"memory(GiB)": 36.87,
"step": 2425,
"token_acc": 0.8394573675668927,
"train_speed(iter/s)": 0.126424
},
{
"epoch": 2.9749674653601774,
"grad_norm": 0.9681059718132019,
"learning_rate": 1.3339594418138036e-09,
"loss": 0.46721735000610354,
"memory(GiB)": 36.87,
"step": 2430,
"token_acc": 0.868303713612332,
"train_speed(iter/s)": 0.126481
},
{
"epoch": 2.9810916328561587,
"grad_norm": 0.9787800908088684,
"learning_rate": 6.958146342650463e-10,
"loss": 0.46620631217956543,
"memory(GiB)": 36.87,
"step": 2435,
"token_acc": 0.8369136498098324,
"train_speed(iter/s)": 0.126545
},
{
"epoch": 2.9872158003521396,
"grad_norm": 0.9923424124717712,
"learning_rate": 2.6350756413440203e-10,
"loss": 0.4888926029205322,
"memory(GiB)": 36.87,
"step": 2440,
"token_acc": 0.849891526288923,
"train_speed(iter/s)": 0.12661
},
{
"epoch": 2.9872158003521396,
"eval_loss": 0.6173287630081177,
"eval_runtime": 29.9491,
"eval_samples_per_second": 17.597,
"eval_steps_per_second": 4.407,
"eval_token_acc": 0.8274859238597035,
"step": 2440
},
{
"epoch": 2.9933399678481205,
"grad_norm": 0.9687269926071167,
"learning_rate": 3.7056030921522877e-11,
"loss": 0.4582235336303711,
"memory(GiB)": 36.87,
"step": 2445,
"token_acc": 0.8376420474448975,
"train_speed(iter/s)": 0.126414
},
{
"epoch": 2.9970144683457094,
"eval_loss": 0.617242693901062,
"eval_runtime": 29.8933,
"eval_samples_per_second": 17.629,
"eval_steps_per_second": 4.416,
"eval_token_acc": 0.8273826127382613,
"step": 2448
}
],
"logging_steps": 5,
"max_steps": 2448,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.502734662946783e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}