{ "best_global_step": 1580, "best_metric": 0.6043635, "best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v27-20250503-235734/checkpoint-1580", "epoch": 2.9970144683457094, "eval_steps": 20, "global_step": 2448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001224833499196203, "grad_norm": 5.5611701011657715, "learning_rate": 9.999995882658711e-06, "loss": 1.0937654972076416, "memory(GiB)": 27.73, "step": 1, "token_acc": 0.7134680736898587, "train_speed(iter/s)": 0.067022 }, { "epoch": 0.006124167495981015, "grad_norm": 3.0366287231445312, "learning_rate": 9.999897066806807e-06, "loss": 0.8203982710838318, "memory(GiB)": 27.77, "step": 5, "token_acc": 0.7706657236318628, "train_speed(iter/s)": 0.125794 }, { "epoch": 0.01224833499196203, "grad_norm": 1.5655843019485474, "learning_rate": 9.999588271465324e-06, "loss": 0.7133324623107911, "memory(GiB)": 27.77, "step": 10, "token_acc": 0.7872333068225043, "train_speed(iter/s)": 0.148685 }, { "epoch": 0.018372502487943046, "grad_norm": 1.4364837408065796, "learning_rate": 9.999073626689664e-06, "loss": 0.7077776908874511, "memory(GiB)": 27.77, "step": 15, "token_acc": 0.7906767333933643, "train_speed(iter/s)": 0.153421 }, { "epoch": 0.02449666998392406, "grad_norm": 1.2211848497390747, "learning_rate": 9.998353153669443e-06, "loss": 0.6643787860870362, "memory(GiB)": 29.89, "step": 20, "token_acc": 0.8500581170089113, "train_speed(iter/s)": 0.156098 }, { "epoch": 0.02449666998392406, "eval_loss": 0.6882059574127197, "eval_runtime": 30.3868, "eval_samples_per_second": 17.343, "eval_steps_per_second": 4.344, "eval_token_acc": 0.8109354822046594, "step": 20 }, { "epoch": 0.030620837479905075, "grad_norm": 1.1730519533157349, "learning_rate": 9.997426882068896e-06, "loss": 0.7091597557067871, "memory(GiB)": 29.89, "step": 25, "token_acc": 0.7997104179259739, "train_speed(iter/s)": 0.123685 }, { "epoch": 0.03674500497588609, "grad_norm": 1.1173818111419678, "learning_rate": 9.996294850025658e-06, "loss": 0.6673955917358398, "memory(GiB)": 29.89, "step": 30, "token_acc": 0.8030015197568389, "train_speed(iter/s)": 0.129134 }, { "epoch": 0.042869172471867105, "grad_norm": 1.1728911399841309, "learning_rate": 9.994957104149202e-06, "loss": 0.6595910072326661, "memory(GiB)": 29.89, "step": 35, "token_acc": 0.7975662938735317, "train_speed(iter/s)": 0.133798 }, { "epoch": 0.04899333996784812, "grad_norm": 1.234574317932129, "learning_rate": 9.993413699518906e-06, "loss": 0.6554254055023193, "memory(GiB)": 29.89, "step": 40, "token_acc": 0.8106137920979735, "train_speed(iter/s)": 0.136376 }, { "epoch": 0.04899333996784812, "eval_loss": 0.6702007055282593, "eval_runtime": 30.2289, "eval_samples_per_second": 17.434, "eval_steps_per_second": 4.367, "eval_token_acc": 0.8143189214318921, "step": 40 }, { "epoch": 0.05511750746382914, "grad_norm": 1.0441969633102417, "learning_rate": 9.9916646996818e-06, "loss": 0.671393871307373, "memory(GiB)": 29.89, "step": 45, "token_acc": 0.8007245869993985, "train_speed(iter/s)": 0.123931 }, { "epoch": 0.06124167495981015, "grad_norm": 1.1040817499160767, "learning_rate": 9.989710176649937e-06, "loss": 0.65097017288208, "memory(GiB)": 29.89, "step": 50, "token_acc": 0.8223552894211577, "train_speed(iter/s)": 0.127235 }, { "epoch": 0.06736584245579116, "grad_norm": 0.971693217754364, "learning_rate": 9.987550210897433e-06, "loss": 0.652859115600586, "memory(GiB)": 29.89, "step": 55, "token_acc": 0.8024456033087575, "train_speed(iter/s)": 0.130006 }, { "epoch": 0.07349000995177218, "grad_norm": 1.3039090633392334, "learning_rate": 9.985184891357165e-06, "loss": 0.6641504764556885, "memory(GiB)": 29.89, "step": 60, "token_acc": 0.7929855290045457, "train_speed(iter/s)": 0.132371 }, { "epoch": 0.07349000995177218, "eval_loss": 0.6628613471984863, "eval_runtime": 30.3515, "eval_samples_per_second": 17.363, "eval_steps_per_second": 4.349, "eval_token_acc": 0.8155018337724056, "step": 60 }, { "epoch": 0.07961417744775319, "grad_norm": 1.1451324224472046, "learning_rate": 9.982614315417084e-06, "loss": 0.6779595851898194, "memory(GiB)": 29.89, "step": 65, "token_acc": 0.8056370699533272, "train_speed(iter/s)": 0.12411 }, { "epoch": 0.08573834494373421, "grad_norm": 1.1827325820922852, "learning_rate": 9.979838588916229e-06, "loss": 0.647182846069336, "memory(GiB)": 29.89, "step": 70, "token_acc": 0.806831566548881, "train_speed(iter/s)": 0.126104 }, { "epoch": 0.09186251243971523, "grad_norm": 1.2671717405319214, "learning_rate": 9.976857826140354e-06, "loss": 0.6356947898864747, "memory(GiB)": 29.89, "step": 75, "token_acc": 0.8092331033486255, "train_speed(iter/s)": 0.128389 }, { "epoch": 0.09798667993569624, "grad_norm": 1.1943690776824951, "learning_rate": 9.973672149817232e-06, "loss": 0.6435425758361817, "memory(GiB)": 29.89, "step": 80, "token_acc": 0.7932153503641372, "train_speed(iter/s)": 0.130134 }, { "epoch": 0.09798667993569624, "eval_loss": 0.6591480374336243, "eval_runtime": 30.2192, "eval_samples_per_second": 17.439, "eval_steps_per_second": 4.368, "eval_token_acc": 0.8162405082907175, "step": 80 }, { "epoch": 0.10411084743167726, "grad_norm": 1.0277475118637085, "learning_rate": 9.970281691111598e-06, "loss": 0.6061644554138184, "memory(GiB)": 29.89, "step": 85, "token_acc": 0.8139895703141835, "train_speed(iter/s)": 0.123754 }, { "epoch": 0.11023501492765828, "grad_norm": 1.1457469463348389, "learning_rate": 9.96668658961975e-06, "loss": 0.6548227787017822, "memory(GiB)": 29.89, "step": 90, "token_acc": 0.8061897136047075, "train_speed(iter/s)": 0.125333 }, { "epoch": 0.11635918242363928, "grad_norm": 1.124259352684021, "learning_rate": 9.962886993363797e-06, "loss": 0.6785114288330079, "memory(GiB)": 29.89, "step": 95, "token_acc": 0.7958439546965248, "train_speed(iter/s)": 0.126782 }, { "epoch": 0.1224833499196203, "grad_norm": 1.1532223224639893, "learning_rate": 9.95888305878557e-06, "loss": 0.6254438400268555, "memory(GiB)": 29.89, "step": 100, "token_acc": 0.825769669327252, "train_speed(iter/s)": 0.128271 }, { "epoch": 0.1224833499196203, "eval_loss": 0.6548585891723633, "eval_runtime": 30.2918, "eval_samples_per_second": 17.397, "eval_steps_per_second": 4.358, "eval_token_acc": 0.8170463350379669, "step": 100 }, { "epoch": 0.1286075174156013, "grad_norm": 1.1253662109375, "learning_rate": 9.954674950740175e-06, "loss": 0.6390158653259277, "memory(GiB)": 29.89, "step": 105, "token_acc": 0.8143782730827323, "train_speed(iter/s)": 0.123016 }, { "epoch": 0.13473168491158233, "grad_norm": 1.0638540983200073, "learning_rate": 9.950262842489215e-06, "loss": 0.5906115531921386, "memory(GiB)": 29.89, "step": 110, "token_acc": 0.8280561419101581, "train_speed(iter/s)": 0.124561 }, { "epoch": 0.14085585240756335, "grad_norm": 1.3548355102539062, "learning_rate": 9.945646915693646e-06, "loss": 0.5967195510864258, "memory(GiB)": 29.89, "step": 115, "token_acc": 0.8018965390008117, "train_speed(iter/s)": 0.126157 }, { "epoch": 0.14698001990354437, "grad_norm": 1.199034333229065, "learning_rate": 9.940827360406297e-06, "loss": 0.631542444229126, "memory(GiB)": 29.89, "step": 120, "token_acc": 0.8094361557837628, "train_speed(iter/s)": 0.127311 }, { "epoch": 0.14698001990354437, "eval_loss": 0.6537693738937378, "eval_runtime": 30.0219, "eval_samples_per_second": 17.554, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8179503073505863, "step": 120 }, { "epoch": 0.1531041873995254, "grad_norm": 1.1312533617019653, "learning_rate": 9.93580437506406e-06, "loss": 0.6321775913238525, "memory(GiB)": 29.89, "step": 125, "token_acc": 0.8088573959255979, "train_speed(iter/s)": 0.123165 }, { "epoch": 0.15922835489550638, "grad_norm": 1.1674045324325562, "learning_rate": 9.9305781664797e-06, "loss": 0.6246171951293945, "memory(GiB)": 29.89, "step": 130, "token_acc": 0.8052116325942988, "train_speed(iter/s)": 0.12427 }, { "epoch": 0.1653525223914874, "grad_norm": 1.3215515613555908, "learning_rate": 9.925148949833356e-06, "loss": 0.6429347515106201, "memory(GiB)": 29.89, "step": 135, "token_acc": 0.8183133283809673, "train_speed(iter/s)": 0.125515 }, { "epoch": 0.17147668988746842, "grad_norm": 1.138923168182373, "learning_rate": 9.919516948663666e-06, "loss": 0.6564007759094238, "memory(GiB)": 32.12, "step": 140, "token_acc": 0.8169854580859952, "train_speed(iter/s)": 0.126675 }, { "epoch": 0.17147668988746842, "eval_loss": 0.6485698819160461, "eval_runtime": 30.1644, "eval_samples_per_second": 17.471, "eval_steps_per_second": 4.376, "eval_token_acc": 0.8182240818224081, "step": 140 }, { "epoch": 0.17760085738344944, "grad_norm": 1.258039951324463, "learning_rate": 9.913682394858576e-06, "loss": 0.6344574451446533, "memory(GiB)": 32.12, "step": 145, "token_acc": 0.8038108277711221, "train_speed(iter/s)": 0.123274 }, { "epoch": 0.18372502487943046, "grad_norm": 1.1063320636749268, "learning_rate": 9.907645528645791e-06, "loss": 0.6173704147338868, "memory(GiB)": 32.12, "step": 150, "token_acc": 0.8146344955967638, "train_speed(iter/s)": 0.124336 }, { "epoch": 0.18984919237541148, "grad_norm": 1.118895411491394, "learning_rate": 9.901406598582874e-06, "loss": 0.6216392517089844, "memory(GiB)": 32.12, "step": 155, "token_acc": 0.8292195700016929, "train_speed(iter/s)": 0.125258 }, { "epoch": 0.19597335987139247, "grad_norm": 0.9873996376991272, "learning_rate": 9.894965861547023e-06, "loss": 0.6492547512054443, "memory(GiB)": 32.12, "step": 160, "token_acc": 0.812430195125156, "train_speed(iter/s)": 0.126446 }, { "epoch": 0.19597335987139247, "eval_loss": 0.6458240747451782, "eval_runtime": 30.1721, "eval_samples_per_second": 17.466, "eval_steps_per_second": 4.375, "eval_token_acc": 0.8187303063174751, "step": 160 }, { "epoch": 0.2020975273673735, "grad_norm": 1.0021984577178955, "learning_rate": 9.888323582724493e-06, "loss": 0.5956392288208008, "memory(GiB)": 32.12, "step": 165, "token_acc": 0.8176234443998395, "train_speed(iter/s)": 0.123317 }, { "epoch": 0.2082216948633545, "grad_norm": 1.0651910305023193, "learning_rate": 9.881480035599667e-06, "loss": 0.6227351665496826, "memory(GiB)": 32.12, "step": 170, "token_acc": 0.7998247919404292, "train_speed(iter/s)": 0.124306 }, { "epoch": 0.21434586235933553, "grad_norm": 1.161063551902771, "learning_rate": 9.874435501943814e-06, "loss": 0.6138211727142334, "memory(GiB)": 32.12, "step": 175, "token_acc": 0.8047394093021469, "train_speed(iter/s)": 0.12515 }, { "epoch": 0.22047002985531655, "grad_norm": 1.0052075386047363, "learning_rate": 9.867190271803466e-06, "loss": 0.6363819122314454, "memory(GiB)": 32.12, "step": 180, "token_acc": 0.8109002326934264, "train_speed(iter/s)": 0.125933 }, { "epoch": 0.22047002985531655, "eval_loss": 0.6438542008399963, "eval_runtime": 30.2227, "eval_samples_per_second": 17.437, "eval_steps_per_second": 4.368, "eval_token_acc": 0.8191693785836045, "step": 180 }, { "epoch": 0.22659419735129757, "grad_norm": 1.093913197517395, "learning_rate": 9.859744643488494e-06, "loss": 0.6040900707244873, "memory(GiB)": 32.12, "step": 185, "token_acc": 0.8167378167100959, "train_speed(iter/s)": 0.123432 }, { "epoch": 0.23271836484727856, "grad_norm": 1.229707956314087, "learning_rate": 9.852098923559819e-06, "loss": 0.6707104206085205, "memory(GiB)": 32.12, "step": 190, "token_acc": 0.79118295902499, "train_speed(iter/s)": 0.124323 }, { "epoch": 0.23884253234325958, "grad_norm": 1.2590445280075073, "learning_rate": 9.844253426816785e-06, "loss": 0.594182014465332, "memory(GiB)": 32.12, "step": 195, "token_acc": 0.8231213499822541, "train_speed(iter/s)": 0.125194 }, { "epoch": 0.2449666998392406, "grad_norm": 1.1405773162841797, "learning_rate": 9.836208476284208e-06, "loss": 0.6203227996826172, "memory(GiB)": 32.12, "step": 200, "token_acc": 0.8118512276400965, "train_speed(iter/s)": 0.126092 }, { "epoch": 0.2449666998392406, "eval_loss": 0.6424054503440857, "eval_runtime": 30.1137, "eval_samples_per_second": 17.5, "eval_steps_per_second": 4.383, "eval_token_acc": 0.8195051397282918, "step": 200 }, { "epoch": 0.2510908673352216, "grad_norm": 1.1559275388717651, "learning_rate": 9.827964403199067e-06, "loss": 0.6028561592102051, "memory(GiB)": 32.12, "step": 205, "token_acc": 0.8117863720073665, "train_speed(iter/s)": 0.123812 }, { "epoch": 0.2572150348312026, "grad_norm": 1.1424733400344849, "learning_rate": 9.819521546996864e-06, "loss": 0.6058461189270019, "memory(GiB)": 32.12, "step": 210, "token_acc": 0.8184524805138327, "train_speed(iter/s)": 0.12456 }, { "epoch": 0.26333920232718366, "grad_norm": 1.0866496562957764, "learning_rate": 9.810880255297663e-06, "loss": 0.6336095809936524, "memory(GiB)": 32.12, "step": 215, "token_acc": 0.8254080406980853, "train_speed(iter/s)": 0.125292 }, { "epoch": 0.26946336982316466, "grad_norm": 1.090539574623108, "learning_rate": 9.802040883891762e-06, "loss": 0.6297359466552734, "memory(GiB)": 32.12, "step": 220, "token_acc": 0.7853244390539721, "train_speed(iter/s)": 0.125916 }, { "epoch": 0.26946336982316466, "eval_loss": 0.6410297155380249, "eval_runtime": 30.1465, "eval_samples_per_second": 17.481, "eval_steps_per_second": 4.379, "eval_token_acc": 0.8207552042977426, "step": 220 }, { "epoch": 0.2755875373191457, "grad_norm": 1.0024933815002441, "learning_rate": 9.793003796725049e-06, "loss": 0.5586746215820313, "memory(GiB)": 32.12, "step": 225, "token_acc": 0.8293589430563468, "train_speed(iter/s)": 0.123629 }, { "epoch": 0.2817117048151267, "grad_norm": 1.0108203887939453, "learning_rate": 9.783769365884023e-06, "loss": 0.6524643898010254, "memory(GiB)": 32.12, "step": 230, "token_acc": 0.8032468163701759, "train_speed(iter/s)": 0.124589 }, { "epoch": 0.2878358723111077, "grad_norm": 1.079897403717041, "learning_rate": 9.774337971580464e-06, "loss": 0.6641106605529785, "memory(GiB)": 32.12, "step": 235, "token_acc": 0.8061291260724153, "train_speed(iter/s)": 0.125376 }, { "epoch": 0.29396003980708874, "grad_norm": 1.1108851432800293, "learning_rate": 9.764710002135784e-06, "loss": 0.6327021598815918, "memory(GiB)": 32.12, "step": 240, "token_acc": 0.8024784931974078, "train_speed(iter/s)": 0.126001 }, { "epoch": 0.29396003980708874, "eval_loss": 0.6399893164634705, "eval_runtime": 30.1753, "eval_samples_per_second": 17.465, "eval_steps_per_second": 4.374, "eval_token_acc": 0.8202076553540989, "step": 240 }, { "epoch": 0.3000842073030697, "grad_norm": 0.8851113319396973, "learning_rate": 9.754885853965039e-06, "loss": 0.6223061561584473, "memory(GiB)": 32.12, "step": 245, "token_acc": 0.8136768110686491, "train_speed(iter/s)": 0.124088 }, { "epoch": 0.3062083747990508, "grad_norm": 0.943510890007019, "learning_rate": 9.744865931560606e-06, "loss": 0.625941801071167, "memory(GiB)": 32.12, "step": 250, "token_acc": 0.825910233887913, "train_speed(iter/s)": 0.124764 }, { "epoch": 0.31233254229503177, "grad_norm": 1.120894193649292, "learning_rate": 9.73465064747553e-06, "loss": 0.6334005355834961, "memory(GiB)": 32.12, "step": 255, "token_acc": 0.7913934426229509, "train_speed(iter/s)": 0.125458 }, { "epoch": 0.31845670979101276, "grad_norm": 1.069145917892456, "learning_rate": 9.724240422306531e-06, "loss": 0.6185196876525879, "memory(GiB)": 32.12, "step": 260, "token_acc": 0.7924378740438349, "train_speed(iter/s)": 0.126007 }, { "epoch": 0.31845670979101276, "eval_loss": 0.6390168070793152, "eval_runtime": 29.9843, "eval_samples_per_second": 17.576, "eval_steps_per_second": 4.402, "eval_token_acc": 0.8199958675551423, "step": 260 }, { "epoch": 0.3245808772869938, "grad_norm": 1.1289619207382202, "learning_rate": 9.713635684676701e-06, "loss": 0.6217617988586426, "memory(GiB)": 32.12, "step": 265, "token_acc": 0.8116217614406958, "train_speed(iter/s)": 0.12414 }, { "epoch": 0.3307050447829748, "grad_norm": 1.0119985342025757, "learning_rate": 9.702836871217838e-06, "loss": 0.6184762001037598, "memory(GiB)": 32.12, "step": 270, "token_acc": 0.8169801035704551, "train_speed(iter/s)": 0.124781 }, { "epoch": 0.33682921227895585, "grad_norm": 1.121479868888855, "learning_rate": 9.691844426552488e-06, "loss": 0.6679095268249512, "memory(GiB)": 32.12, "step": 275, "token_acc": 0.8053990302712619, "train_speed(iter/s)": 0.125508 }, { "epoch": 0.34295337977493684, "grad_norm": 1.1491762399673462, "learning_rate": 9.68065880327562e-06, "loss": 0.6015125274658203, "memory(GiB)": 32.12, "step": 280, "token_acc": 0.8035652005425306, "train_speed(iter/s)": 0.126152 }, { "epoch": 0.34295337977493684, "eval_loss": 0.6370740532875061, "eval_runtime": 30.1382, "eval_samples_per_second": 17.486, "eval_steps_per_second": 4.38, "eval_token_acc": 0.8203677875923343, "step": 280 }, { "epoch": 0.3490775472709179, "grad_norm": 0.9209638833999634, "learning_rate": 9.669280461936004e-06, "loss": 0.6419768333435059, "memory(GiB)": 32.12, "step": 285, "token_acc": 0.8053348308297066, "train_speed(iter/s)": 0.124607 }, { "epoch": 0.3552017147668989, "grad_norm": 1.0387402772903442, "learning_rate": 9.657709871017243e-06, "loss": 0.6462045669555664, "memory(GiB)": 32.12, "step": 290, "token_acc": 0.8079543874287304, "train_speed(iter/s)": 0.125198 }, { "epoch": 0.36132588226287987, "grad_norm": 1.0344866514205933, "learning_rate": 9.645947506918482e-06, "loss": 0.6486867904663086, "memory(GiB)": 32.12, "step": 295, "token_acc": 0.8222080724468499, "train_speed(iter/s)": 0.12567 }, { "epoch": 0.3674500497588609, "grad_norm": 1.1158808469772339, "learning_rate": 9.633993853934803e-06, "loss": 0.6453632354736328, "memory(GiB)": 32.12, "step": 300, "token_acc": 0.8016542381712202, "train_speed(iter/s)": 0.126206 }, { "epoch": 0.3674500497588609, "eval_loss": 0.6345298886299133, "eval_runtime": 30.2223, "eval_samples_per_second": 17.437, "eval_steps_per_second": 4.368, "eval_token_acc": 0.821328581021747, "step": 300 }, { "epoch": 0.3735742172548419, "grad_norm": 0.9398248195648193, "learning_rate": 9.621849404237274e-06, "loss": 0.6055630683898926, "memory(GiB)": 32.12, "step": 305, "token_acc": 0.8250450524809817, "train_speed(iter/s)": 0.124632 }, { "epoch": 0.37969838475082296, "grad_norm": 1.048771858215332, "learning_rate": 9.60951465785269e-06, "loss": 0.6380780220031739, "memory(GiB)": 32.12, "step": 310, "token_acc": 0.815754208203955, "train_speed(iter/s)": 0.125253 }, { "epoch": 0.38582255224680395, "grad_norm": 1.026041865348816, "learning_rate": 9.596990122642984e-06, "loss": 0.6475009441375732, "memory(GiB)": 32.12, "step": 315, "token_acc": 0.8045922028222913, "train_speed(iter/s)": 0.12584 }, { "epoch": 0.39194671974278494, "grad_norm": 1.112762212753296, "learning_rate": 9.584276314284316e-06, "loss": 0.6385052680969239, "memory(GiB)": 32.12, "step": 320, "token_acc": 0.792560957804059, "train_speed(iter/s)": 0.126392 }, { "epoch": 0.39194671974278494, "eval_loss": 0.633259654045105, "eval_runtime": 30.0477, "eval_samples_per_second": 17.539, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8214318921431892, "step": 320 }, { "epoch": 0.398070887238766, "grad_norm": 1.0934439897537231, "learning_rate": 9.571373756245842e-06, "loss": 0.6589271545410156, "memory(GiB)": 32.12, "step": 325, "token_acc": 0.8136614281775572, "train_speed(iter/s)": 0.124935 }, { "epoch": 0.404195054734747, "grad_norm": 1.0657905340194702, "learning_rate": 9.558282979768164e-06, "loss": 0.6037847995758057, "memory(GiB)": 32.12, "step": 330, "token_acc": 0.7959645802352305, "train_speed(iter/s)": 0.125393 }, { "epoch": 0.41031922223072803, "grad_norm": 1.0330870151519775, "learning_rate": 9.545004523841452e-06, "loss": 0.6114434242248535, "memory(GiB)": 32.12, "step": 335, "token_acc": 0.8286545017044316, "train_speed(iter/s)": 0.125804 }, { "epoch": 0.416443389726709, "grad_norm": 1.0614529848098755, "learning_rate": 9.531538935183252e-06, "loss": 0.6515423774719238, "memory(GiB)": 32.12, "step": 340, "token_acc": 0.796179652197727, "train_speed(iter/s)": 0.126217 }, { "epoch": 0.416443389726709, "eval_loss": 0.6336191296577454, "eval_runtime": 30.124, "eval_samples_per_second": 17.494, "eval_steps_per_second": 4.382, "eval_token_acc": 0.8210754687742136, "step": 340 }, { "epoch": 0.42256755722269, "grad_norm": 0.917084276676178, "learning_rate": 9.517886768215978e-06, "loss": 0.5718442916870117, "memory(GiB)": 32.12, "step": 345, "token_acc": 0.829221986539922, "train_speed(iter/s)": 0.124734 }, { "epoch": 0.42869172471867106, "grad_norm": 1.054498314857483, "learning_rate": 9.50404858504409e-06, "loss": 0.5831773757934571, "memory(GiB)": 32.12, "step": 350, "token_acc": 0.830837973923772, "train_speed(iter/s)": 0.125116 }, { "epoch": 0.43481589221465206, "grad_norm": 1.1211163997650146, "learning_rate": 9.490024955430936e-06, "loss": 0.6414088249206543, "memory(GiB)": 32.12, "step": 355, "token_acc": 0.7959719461425946, "train_speed(iter/s)": 0.125645 }, { "epoch": 0.4409400597106331, "grad_norm": 1.011711835861206, "learning_rate": 9.475816456775313e-06, "loss": 0.6285918235778809, "memory(GiB)": 32.12, "step": 360, "token_acc": 0.8033112582781456, "train_speed(iter/s)": 0.126108 }, { "epoch": 0.4409400597106331, "eval_loss": 0.6306507587432861, "eval_runtime": 30.1935, "eval_samples_per_second": 17.454, "eval_steps_per_second": 4.372, "eval_token_acc": 0.8219639444186166, "step": 360 }, { "epoch": 0.4470642272066141, "grad_norm": 0.9317595958709717, "learning_rate": 9.46142367408767e-06, "loss": 0.6225271224975586, "memory(GiB)": 32.12, "step": 365, "token_acc": 0.8147164353461464, "train_speed(iter/s)": 0.124805 }, { "epoch": 0.45318839470259514, "grad_norm": 1.0820285081863403, "learning_rate": 9.446847199966042e-06, "loss": 0.6166964530944824, "memory(GiB)": 32.12, "step": 370, "token_acc": 0.8163280356945315, "train_speed(iter/s)": 0.12523 }, { "epoch": 0.45931256219857614, "grad_norm": 1.0026404857635498, "learning_rate": 9.432087634571638e-06, "loss": 0.614093542098999, "memory(GiB)": 32.12, "step": 375, "token_acc": 0.8399052293112201, "train_speed(iter/s)": 0.125657 }, { "epoch": 0.4654367296945571, "grad_norm": 1.0475082397460938, "learning_rate": 9.417145585604139e-06, "loss": 0.5946948051452636, "memory(GiB)": 32.12, "step": 380, "token_acc": 0.8166074313408723, "train_speed(iter/s)": 0.126073 }, { "epoch": 0.4654367296945571, "eval_loss": 0.6289507150650024, "eval_runtime": 30.1648, "eval_samples_per_second": 17.471, "eval_steps_per_second": 4.376, "eval_token_acc": 0.8223720233483135, "step": 380 }, { "epoch": 0.4715608971905382, "grad_norm": 1.0893336534500122, "learning_rate": 9.402021668276669e-06, "loss": 0.6302263259887695, "memory(GiB)": 32.12, "step": 385, "token_acc": 0.8211241269317494, "train_speed(iter/s)": 0.124795 }, { "epoch": 0.47768506468651917, "grad_norm": 0.9649361968040466, "learning_rate": 9.386716505290467e-06, "loss": 0.6190325736999511, "memory(GiB)": 32.12, "step": 390, "token_acc": 0.8042954275641256, "train_speed(iter/s)": 0.125176 }, { "epoch": 0.4838092321825002, "grad_norm": 1.1449054479599, "learning_rate": 9.371230726809258e-06, "loss": 0.6380712032318115, "memory(GiB)": 32.12, "step": 395, "token_acc": 0.7973576211390309, "train_speed(iter/s)": 0.125508 }, { "epoch": 0.4899333996784812, "grad_norm": 1.0124316215515137, "learning_rate": 9.355564970433288e-06, "loss": 0.6248594284057617, "memory(GiB)": 32.12, "step": 400, "token_acc": 0.8122048129544943, "train_speed(iter/s)": 0.12583 }, { "epoch": 0.4899333996784812, "eval_loss": 0.6282716989517212, "eval_runtime": 30.1457, "eval_samples_per_second": 17.482, "eval_steps_per_second": 4.379, "eval_token_acc": 0.822165401105429, "step": 400 }, { "epoch": 0.4960575671744622, "grad_norm": 1.1087108850479126, "learning_rate": 9.339719881173093e-06, "loss": 0.5993568420410156, "memory(GiB)": 32.12, "step": 405, "token_acc": 0.8224255219329111, "train_speed(iter/s)": 0.124698 }, { "epoch": 0.5021817346704432, "grad_norm": 1.1231392621994019, "learning_rate": 9.323696111422921e-06, "loss": 0.6480292797088623, "memory(GiB)": 32.12, "step": 410, "token_acc": 0.8032305992609242, "train_speed(iter/s)": 0.125122 }, { "epoch": 0.5083059021664242, "grad_norm": 1.007530689239502, "learning_rate": 9.307494320933893e-06, "loss": 0.5892566204071045, "memory(GiB)": 32.12, "step": 415, "token_acc": 0.8197306397306398, "train_speed(iter/s)": 0.12542 }, { "epoch": 0.5144300696624052, "grad_norm": 1.0489633083343506, "learning_rate": 9.291115176786814e-06, "loss": 0.604928731918335, "memory(GiB)": 32.12, "step": 420, "token_acc": 0.8093353125515421, "train_speed(iter/s)": 0.125868 }, { "epoch": 0.5144300696624052, "eval_loss": 0.6283579468727112, "eval_runtime": 30.1281, "eval_samples_per_second": 17.492, "eval_steps_per_second": 4.381, "eval_token_acc": 0.8228782478433804, "step": 420 }, { "epoch": 0.5205542371583863, "grad_norm": 1.126206636428833, "learning_rate": 9.274559353364734e-06, "loss": 0.6561414718627929, "memory(GiB)": 32.12, "step": 425, "token_acc": 0.8186422368439512, "train_speed(iter/s)": 0.124654 }, { "epoch": 0.5266784046543673, "grad_norm": 1.1023788452148438, "learning_rate": 9.257827532325159e-06, "loss": 0.6316391944885253, "memory(GiB)": 32.12, "step": 430, "token_acc": 0.8269044804985495, "train_speed(iter/s)": 0.125027 }, { "epoch": 0.5328025721503483, "grad_norm": 1.0733466148376465, "learning_rate": 9.240920402571995e-06, "loss": 0.6313365936279297, "memory(GiB)": 32.12, "step": 435, "token_acc": 0.7986597170513775, "train_speed(iter/s)": 0.125308 }, { "epoch": 0.5389267396463293, "grad_norm": 1.0384747982025146, "learning_rate": 9.223838660227183e-06, "loss": 0.5966384410858154, "memory(GiB)": 32.12, "step": 440, "token_acc": 0.8461405390443899, "train_speed(iter/s)": 0.125729 }, { "epoch": 0.5389267396463293, "eval_loss": 0.627018392086029, "eval_runtime": 30.1938, "eval_samples_per_second": 17.454, "eval_steps_per_second": 4.372, "eval_token_acc": 0.8226509633762075, "step": 440 }, { "epoch": 0.5450509071423103, "grad_norm": 0.9662637710571289, "learning_rate": 9.206583008602039e-06, "loss": 0.6196205615997314, "memory(GiB)": 32.12, "step": 445, "token_acc": 0.814478517479054, "train_speed(iter/s)": 0.124648 }, { "epoch": 0.5511750746382914, "grad_norm": 1.0670920610427856, "learning_rate": 9.189154158168293e-06, "loss": 0.6371576309204101, "memory(GiB)": 32.12, "step": 450, "token_acc": 0.8166518012952263, "train_speed(iter/s)": 0.125033 }, { "epoch": 0.5572992421342724, "grad_norm": 0.9831814765930176, "learning_rate": 9.171552826528832e-06, "loss": 0.549981689453125, "memory(GiB)": 32.12, "step": 455, "token_acc": 0.8483598990707121, "train_speed(iter/s)": 0.125327 }, { "epoch": 0.5634234096302534, "grad_norm": 1.0345466136932373, "learning_rate": 9.15377973838817e-06, "loss": 0.6600610733032226, "memory(GiB)": 32.12, "step": 460, "token_acc": 0.8050162396246843, "train_speed(iter/s)": 0.125742 }, { "epoch": 0.5634234096302534, "eval_loss": 0.6251854300498962, "eval_runtime": 29.9543, "eval_samples_per_second": 17.593, "eval_steps_per_second": 4.407, "eval_token_acc": 0.823002221189111, "step": 460 }, { "epoch": 0.5695475771262344, "grad_norm": 0.9955180287361145, "learning_rate": 9.135835625522585e-06, "loss": 0.5820852279663086, "memory(GiB)": 32.12, "step": 465, "token_acc": 0.8245337424184316, "train_speed(iter/s)": 0.124745 }, { "epoch": 0.5756717446222154, "grad_norm": 1.0886976718902588, "learning_rate": 9.117721226750019e-06, "loss": 0.6092466354370117, "memory(GiB)": 32.12, "step": 470, "token_acc": 0.8014575318141463, "train_speed(iter/s)": 0.125111 }, { "epoch": 0.5817959121181965, "grad_norm": 0.9607824087142944, "learning_rate": 9.099437287899634e-06, "loss": 0.6091058731079102, "memory(GiB)": 32.12, "step": 475, "token_acc": 0.8267922900839333, "train_speed(iter/s)": 0.125396 }, { "epoch": 0.5879200796141775, "grad_norm": 1.0514999628067017, "learning_rate": 9.08098456178111e-06, "loss": 0.620454978942871, "memory(GiB)": 32.12, "step": 480, "token_acc": 0.8148612218551847, "train_speed(iter/s)": 0.125738 }, { "epoch": 0.5879200796141775, "eval_loss": 0.6268740296363831, "eval_runtime": 29.9706, "eval_samples_per_second": 17.584, "eval_steps_per_second": 4.404, "eval_token_acc": 0.8231468567591301, "step": 480 }, { "epoch": 0.5940442471101585, "grad_norm": 1.007743239402771, "learning_rate": 9.06236380815366e-06, "loss": 0.6164707183837891, "memory(GiB)": 32.12, "step": 485, "token_acc": 0.8169734151329243, "train_speed(iter/s)": 0.124816 }, { "epoch": 0.6001684146061395, "grad_norm": 1.0523988008499146, "learning_rate": 9.043575793694733e-06, "loss": 0.6281998157501221, "memory(GiB)": 32.12, "step": 490, "token_acc": 0.8149852592219745, "train_speed(iter/s)": 0.125171 }, { "epoch": 0.6062925821021204, "grad_norm": 0.9333862066268921, "learning_rate": 9.024621291968461e-06, "loss": 0.6068775653839111, "memory(GiB)": 32.12, "step": 495, "token_acc": 0.8320725141416206, "train_speed(iter/s)": 0.125492 }, { "epoch": 0.6124167495981016, "grad_norm": 1.0344208478927612, "learning_rate": 9.005501083393799e-06, "loss": 0.6447205543518066, "memory(GiB)": 32.12, "step": 500, "token_acc": 0.8103164010363508, "train_speed(iter/s)": 0.125823 }, { "epoch": 0.6124167495981016, "eval_loss": 0.6257410645484924, "eval_runtime": 30.1552, "eval_samples_per_second": 17.476, "eval_steps_per_second": 4.377, "eval_token_acc": 0.82304871119376, "step": 500 }, { "epoch": 0.6185409170940825, "grad_norm": 1.125075340270996, "learning_rate": 8.986215955212394e-06, "loss": 0.6383331775665283, "memory(GiB)": 32.12, "step": 505, "token_acc": 0.8156163386432991, "train_speed(iter/s)": 0.124876 }, { "epoch": 0.6246650845900635, "grad_norm": 1.0361703634262085, "learning_rate": 8.966766701456177e-06, "loss": 0.6330353736877441, "memory(GiB)": 32.12, "step": 510, "token_acc": 0.7972571857974826, "train_speed(iter/s)": 0.125207 }, { "epoch": 0.6307892520860445, "grad_norm": 0.9697835445404053, "learning_rate": 8.947154122914666e-06, "loss": 0.6002368927001953, "memory(GiB)": 32.12, "step": 515, "token_acc": 0.8171989033478781, "train_speed(iter/s)": 0.12553 }, { "epoch": 0.6369134195820255, "grad_norm": 1.0072293281555176, "learning_rate": 8.927379027101994e-06, "loss": 0.6289110660552979, "memory(GiB)": 32.12, "step": 520, "token_acc": 0.8173692196055149, "train_speed(iter/s)": 0.125831 }, { "epoch": 0.6369134195820255, "eval_loss": 0.6232544183731079, "eval_runtime": 30.2166, "eval_samples_per_second": 17.441, "eval_steps_per_second": 4.368, "eval_token_acc": 0.8239630146185237, "step": 520 }, { "epoch": 0.6430375870780066, "grad_norm": 1.0802470445632935, "learning_rate": 8.907442228223668e-06, "loss": 0.6614001274108887, "memory(GiB)": 32.12, "step": 525, "token_acc": 0.8152090832024681, "train_speed(iter/s)": 0.12495 }, { "epoch": 0.6491617545739876, "grad_norm": 0.9386376142501831, "learning_rate": 8.887344547143032e-06, "loss": 0.6672756195068359, "memory(GiB)": 32.12, "step": 530, "token_acc": 0.8442938796480637, "train_speed(iter/s)": 0.125255 }, { "epoch": 0.6552859220699686, "grad_norm": 0.8939440846443176, "learning_rate": 8.867086811347483e-06, "loss": 0.6040712356567383, "memory(GiB)": 32.12, "step": 535, "token_acc": 0.8285196511496206, "train_speed(iter/s)": 0.125541 }, { "epoch": 0.6614100895659496, "grad_norm": 0.9224595427513123, "learning_rate": 8.846669854914395e-06, "loss": 0.6459405899047852, "memory(GiB)": 32.12, "step": 540, "token_acc": 0.8066893233285104, "train_speed(iter/s)": 0.125873 }, { "epoch": 0.6614100895659496, "eval_loss": 0.6218589544296265, "eval_runtime": 30.088, "eval_samples_per_second": 17.515, "eval_steps_per_second": 4.387, "eval_token_acc": 0.8244330802210857, "step": 540 }, { "epoch": 0.6675342570619306, "grad_norm": 1.198673129081726, "learning_rate": 8.826094518476775e-06, "loss": 0.6059948921203613, "memory(GiB)": 32.12, "step": 545, "token_acc": 0.8176160233107487, "train_speed(iter/s)": 0.125029 }, { "epoch": 0.6736584245579117, "grad_norm": 0.9881957769393921, "learning_rate": 8.805361649188657e-06, "loss": 0.5907226085662842, "memory(GiB)": 32.12, "step": 550, "token_acc": 0.8072239136451702, "train_speed(iter/s)": 0.125389 }, { "epoch": 0.6797825920538927, "grad_norm": 1.027031660079956, "learning_rate": 8.784472100690215e-06, "loss": 0.6389594554901123, "memory(GiB)": 32.12, "step": 555, "token_acc": 0.8127651442767055, "train_speed(iter/s)": 0.125703 }, { "epoch": 0.6859067595498737, "grad_norm": 1.0879848003387451, "learning_rate": 8.763426733072624e-06, "loss": 0.6162051200866699, "memory(GiB)": 32.12, "step": 560, "token_acc": 0.8103369683368982, "train_speed(iter/s)": 0.125956 }, { "epoch": 0.6859067595498737, "eval_loss": 0.6215130686759949, "eval_runtime": 30.1587, "eval_samples_per_second": 17.474, "eval_steps_per_second": 4.377, "eval_token_acc": 0.8242212924221293, "step": 560 }, { "epoch": 0.6920309270458547, "grad_norm": 0.9450560808181763, "learning_rate": 8.742226412842636e-06, "loss": 0.6049357414245605, "memory(GiB)": 32.12, "step": 565, "token_acc": 0.828604110069801, "train_speed(iter/s)": 0.12514 }, { "epoch": 0.6981550945418358, "grad_norm": 1.0816230773925781, "learning_rate": 8.720872012886918e-06, "loss": 0.6060591697692871, "memory(GiB)": 32.12, "step": 570, "token_acc": 0.8121618953603159, "train_speed(iter/s)": 0.125424 }, { "epoch": 0.7042792620378168, "grad_norm": 1.0539354085922241, "learning_rate": 8.6993644124361e-06, "loss": 0.6078558921813965, "memory(GiB)": 32.12, "step": 575, "token_acc": 0.8203018867924529, "train_speed(iter/s)": 0.125669 }, { "epoch": 0.7104034295337978, "grad_norm": 0.9692308306694031, "learning_rate": 8.677704497028579e-06, "loss": 0.6092854976654053, "memory(GiB)": 32.12, "step": 580, "token_acc": 0.813650025657943, "train_speed(iter/s)": 0.125955 }, { "epoch": 0.7104034295337978, "eval_loss": 0.6210135817527771, "eval_runtime": 30.0497, "eval_samples_per_second": 17.538, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8236375845859807, "step": 580 }, { "epoch": 0.7165275970297788, "grad_norm": 1.0405429601669312, "learning_rate": 8.655893158474056e-06, "loss": 0.626552963256836, "memory(GiB)": 32.12, "step": 585, "token_acc": 0.8233875988502245, "train_speed(iter/s)": 0.12517 }, { "epoch": 0.7226517645257597, "grad_norm": 0.9403768181800842, "learning_rate": 8.633931294816823e-06, "loss": 0.6014822483062744, "memory(GiB)": 32.12, "step": 590, "token_acc": 0.8108660890260682, "train_speed(iter/s)": 0.125469 }, { "epoch": 0.7287759320217408, "grad_norm": 0.8821709752082825, "learning_rate": 8.611819810298778e-06, "loss": 0.6129745483398438, "memory(GiB)": 32.12, "step": 595, "token_acc": 0.8298729368614095, "train_speed(iter/s)": 0.125735 }, { "epoch": 0.7349000995177218, "grad_norm": 0.9912955164909363, "learning_rate": 8.58955961532221e-06, "loss": 0.5840856075286865, "memory(GiB)": 32.12, "step": 600, "token_acc": 0.8248710481069511, "train_speed(iter/s)": 0.125926 }, { "epoch": 0.7349000995177218, "eval_loss": 0.6222088932991028, "eval_runtime": 30.0444, "eval_samples_per_second": 17.541, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8234412934552404, "step": 600 }, { "epoch": 0.7410242670137028, "grad_norm": 1.037676453590393, "learning_rate": 8.567151626412295e-06, "loss": 0.6406550884246827, "memory(GiB)": 32.12, "step": 605, "token_acc": 0.8165861175316943, "train_speed(iter/s)": 0.125193 }, { "epoch": 0.7471484345096838, "grad_norm": 1.0239970684051514, "learning_rate": 8.544596766179377e-06, "loss": 0.5927177429199219, "memory(GiB)": 32.12, "step": 610, "token_acc": 0.8224873999407056, "train_speed(iter/s)": 0.125437 }, { "epoch": 0.7532726020056648, "grad_norm": 1.0227631330490112, "learning_rate": 8.521895963280967e-06, "loss": 0.5963564395904541, "memory(GiB)": 32.12, "step": 615, "token_acc": 0.827058931465794, "train_speed(iter/s)": 0.125665 }, { "epoch": 0.7593967695016459, "grad_norm": 1.0962486267089844, "learning_rate": 8.499050152383519e-06, "loss": 0.6459769248962403, "memory(GiB)": 32.12, "step": 620, "token_acc": 0.8193405375450263, "train_speed(iter/s)": 0.125922 }, { "epoch": 0.7593967695016459, "eval_loss": 0.6196050643920898, "eval_runtime": 29.9245, "eval_samples_per_second": 17.611, "eval_steps_per_second": 4.411, "eval_token_acc": 0.8242987757632109, "step": 620 }, { "epoch": 0.7655209369976269, "grad_norm": 1.0802922248840332, "learning_rate": 8.476060274123938e-06, "loss": 0.5952530860900879, "memory(GiB)": 32.12, "step": 625, "token_acc": 0.8222949637294201, "train_speed(iter/s)": 0.125113 }, { "epoch": 0.7716451044936079, "grad_norm": 1.0066949129104614, "learning_rate": 8.452927275070858e-06, "loss": 0.6043106079101562, "memory(GiB)": 32.12, "step": 630, "token_acc": 0.8247297031649302, "train_speed(iter/s)": 0.125399 }, { "epoch": 0.7777692719895889, "grad_norm": 0.9393348693847656, "learning_rate": 8.429652107685662e-06, "loss": 0.633615779876709, "memory(GiB)": 32.12, "step": 635, "token_acc": 0.8054982337099087, "train_speed(iter/s)": 0.125629 }, { "epoch": 0.7838934394855699, "grad_norm": 1.0759190320968628, "learning_rate": 8.40623573028327e-06, "loss": 0.6218441009521485, "memory(GiB)": 32.12, "step": 640, "token_acc": 0.8195294533875818, "train_speed(iter/s)": 0.125911 }, { "epoch": 0.7838934394855699, "eval_loss": 0.6197024583816528, "eval_runtime": 29.9919, "eval_samples_per_second": 17.571, "eval_steps_per_second": 4.401, "eval_token_acc": 0.8242006301978408, "step": 640 }, { "epoch": 0.790017606981551, "grad_norm": 0.8828935623168945, "learning_rate": 8.382679106992687e-06, "loss": 0.6121187210083008, "memory(GiB)": 32.12, "step": 645, "token_acc": 0.8180191693290735, "train_speed(iter/s)": 0.125215 }, { "epoch": 0.796141774477532, "grad_norm": 1.0172326564788818, "learning_rate": 8.358983207717286e-06, "loss": 0.6195911407470703, "memory(GiB)": 32.12, "step": 650, "token_acc": 0.802275960170697, "train_speed(iter/s)": 0.125435 }, { "epoch": 0.802265941973513, "grad_norm": 1.0442404747009277, "learning_rate": 8.335149008094906e-06, "loss": 0.5969693660736084, "memory(GiB)": 32.12, "step": 655, "token_acc": 0.8279867846104657, "train_speed(iter/s)": 0.125647 }, { "epoch": 0.808390109469494, "grad_norm": 0.9861543774604797, "learning_rate": 8.311177489457653e-06, "loss": 0.6027172088623047, "memory(GiB)": 32.12, "step": 660, "token_acc": 0.8264142409459503, "train_speed(iter/s)": 0.125901 }, { "epoch": 0.808390109469494, "eval_loss": 0.6189049482345581, "eval_runtime": 30.0122, "eval_samples_per_second": 17.56, "eval_steps_per_second": 4.398, "eval_token_acc": 0.8247378480293404, "step": 660 }, { "epoch": 0.814514276965475, "grad_norm": 0.9293588399887085, "learning_rate": 8.28706963879151e-06, "loss": 0.5741694927215576, "memory(GiB)": 32.12, "step": 665, "token_acc": 0.8175586677777479, "train_speed(iter/s)": 0.125199 }, { "epoch": 0.8206384444614561, "grad_norm": 0.8590324521064758, "learning_rate": 8.2628264486957e-06, "loss": 0.6139655113220215, "memory(GiB)": 32.12, "step": 670, "token_acc": 0.8059371841425523, "train_speed(iter/s)": 0.125416 }, { "epoch": 0.826762611957437, "grad_norm": 1.0524649620056152, "learning_rate": 8.23844891734181e-06, "loss": 0.5746917724609375, "memory(GiB)": 32.12, "step": 675, "token_acc": 0.8064527770760402, "train_speed(iter/s)": 0.125689 }, { "epoch": 0.832886779453418, "grad_norm": 1.0562033653259277, "learning_rate": 8.213938048432697e-06, "loss": 0.5949201583862305, "memory(GiB)": 32.12, "step": 680, "token_acc": 0.8262676641729011, "train_speed(iter/s)": 0.125943 }, { "epoch": 0.832886779453418, "eval_loss": 0.6181398630142212, "eval_runtime": 29.812, "eval_samples_per_second": 17.677, "eval_steps_per_second": 4.428, "eval_token_acc": 0.8249702980525854, "step": 680 }, { "epoch": 0.839010946949399, "grad_norm": 0.8997740149497986, "learning_rate": 8.189294851161164e-06, "loss": 0.6027894496917725, "memory(GiB)": 32.12, "step": 685, "token_acc": 0.8181957698532284, "train_speed(iter/s)": 0.125279 }, { "epoch": 0.84513511444538, "grad_norm": 0.9756171703338623, "learning_rate": 8.164520340168404e-06, "loss": 0.6199028015136718, "memory(GiB)": 32.12, "step": 690, "token_acc": 0.816996805111821, "train_speed(iter/s)": 0.125482 }, { "epoch": 0.8512592819413611, "grad_norm": 1.0920320749282837, "learning_rate": 8.139615535502227e-06, "loss": 0.6447176933288574, "memory(GiB)": 32.12, "step": 695, "token_acc": 0.8085919407132932, "train_speed(iter/s)": 0.125747 }, { "epoch": 0.8573834494373421, "grad_norm": 0.9692983627319336, "learning_rate": 8.114581462575063e-06, "loss": 0.6160262107849122, "memory(GiB)": 32.12, "step": 700, "token_acc": 0.8179973169137629, "train_speed(iter/s)": 0.12598 }, { "epoch": 0.8573834494373421, "eval_loss": 0.6175059080123901, "eval_runtime": 30.0657, "eval_samples_per_second": 17.528, "eval_steps_per_second": 4.39, "eval_token_acc": 0.8250374502815228, "step": 700 }, { "epoch": 0.8635076169333231, "grad_norm": 0.9100112318992615, "learning_rate": 8.089419152121736e-06, "loss": 0.572049617767334, "memory(GiB)": 32.12, "step": 705, "token_acc": 0.8311790668348046, "train_speed(iter/s)": 0.125371 }, { "epoch": 0.8696317844293041, "grad_norm": 0.967882513999939, "learning_rate": 8.064129640157033e-06, "loss": 0.6320825576782226, "memory(GiB)": 32.12, "step": 710, "token_acc": 0.8260635252113577, "train_speed(iter/s)": 0.125628 }, { "epoch": 0.8757559519252851, "grad_norm": 0.9655548334121704, "learning_rate": 8.038713967933043e-06, "loss": 0.6211101055145264, "memory(GiB)": 32.12, "step": 715, "token_acc": 0.818314430545932, "train_speed(iter/s)": 0.125843 }, { "epoch": 0.8818801194212662, "grad_norm": 0.888866126537323, "learning_rate": 8.013173181896283e-06, "loss": 0.6331143379211426, "memory(GiB)": 32.12, "step": 720, "token_acc": 0.8223672789139266, "train_speed(iter/s)": 0.126057 }, { "epoch": 0.8818801194212662, "eval_loss": 0.6172851324081421, "eval_runtime": 29.7983, "eval_samples_per_second": 17.686, "eval_steps_per_second": 4.43, "eval_token_acc": 0.8249289736040085, "step": 720 }, { "epoch": 0.8880042869172472, "grad_norm": 0.9664549827575684, "learning_rate": 7.98750833364462e-06, "loss": 0.6372400760650635, "memory(GiB)": 32.12, "step": 725, "token_acc": 0.8170195878334325, "train_speed(iter/s)": 0.125415 }, { "epoch": 0.8941284544132282, "grad_norm": 1.000975251197815, "learning_rate": 7.961720479883967e-06, "loss": 0.5750507354736328, "memory(GiB)": 32.12, "step": 730, "token_acc": 0.8278389461108779, "train_speed(iter/s)": 0.125636 }, { "epoch": 0.9002526219092092, "grad_norm": 1.094359278678894, "learning_rate": 7.935810682384777e-06, "loss": 0.5872611045837403, "memory(GiB)": 32.12, "step": 735, "token_acc": 0.8236509437265819, "train_speed(iter/s)": 0.125855 }, { "epoch": 0.9063767894051903, "grad_norm": 0.9531553387641907, "learning_rate": 7.909780007938327e-06, "loss": 0.597745418548584, "memory(GiB)": 32.12, "step": 740, "token_acc": 0.8098763707480617, "train_speed(iter/s)": 0.12604 }, { "epoch": 0.9063767894051903, "eval_loss": 0.616245687007904, "eval_runtime": 30.0544, "eval_samples_per_second": 17.535, "eval_steps_per_second": 4.392, "eval_token_acc": 0.8252440725244072, "step": 740 }, { "epoch": 0.9125009569011713, "grad_norm": 1.0165457725524902, "learning_rate": 7.883629528312794e-06, "loss": 0.6201919555664063, "memory(GiB)": 32.12, "step": 745, "token_acc": 0.8185958200091366, "train_speed(iter/s)": 0.125398 }, { "epoch": 0.9186251243971523, "grad_norm": 1.1827678680419922, "learning_rate": 7.857360320209126e-06, "loss": 0.6491155624389648, "memory(GiB)": 32.12, "step": 750, "token_acc": 0.8059945706742032, "train_speed(iter/s)": 0.125601 }, { "epoch": 0.9247492918931333, "grad_norm": 1.0143615007400513, "learning_rate": 7.830973465216712e-06, "loss": 0.6207675933837891, "memory(GiB)": 32.12, "step": 755, "token_acc": 0.8050835253456221, "train_speed(iter/s)": 0.1258 }, { "epoch": 0.9308734593891143, "grad_norm": 0.929755449295044, "learning_rate": 7.80447004976885e-06, "loss": 0.5987899780273438, "memory(GiB)": 32.12, "step": 760, "token_acc": 0.8344898639435169, "train_speed(iter/s)": 0.126004 }, { "epoch": 0.9308734593891143, "eval_loss": 0.6154034733772278, "eval_runtime": 29.9062, "eval_samples_per_second": 17.622, "eval_steps_per_second": 4.414, "eval_token_acc": 0.8256728136783925, "step": 760 }, { "epoch": 0.9369976268850954, "grad_norm": 0.9682816863059998, "learning_rate": 7.777851165098012e-06, "loss": 0.6217115879058838, "memory(GiB)": 32.12, "step": 765, "token_acc": 0.8253138075313807, "train_speed(iter/s)": 0.125416 }, { "epoch": 0.9431217943810764, "grad_norm": 0.9861950278282166, "learning_rate": 7.751117907190919e-06, "loss": 0.6429153442382812, "memory(GiB)": 32.12, "step": 770, "token_acc": 0.7975187624444785, "train_speed(iter/s)": 0.125587 }, { "epoch": 0.9492459618770573, "grad_norm": 1.0501208305358887, "learning_rate": 7.724271376743408e-06, "loss": 0.6119437694549561, "memory(GiB)": 32.12, "step": 775, "token_acc": 0.8173751624280676, "train_speed(iter/s)": 0.125823 }, { "epoch": 0.9553701293730383, "grad_norm": 0.865284264087677, "learning_rate": 7.697312679115126e-06, "loss": 0.6217618465423584, "memory(GiB)": 32.12, "step": 780, "token_acc": 0.8141066272272696, "train_speed(iter/s)": 0.126064 }, { "epoch": 0.9553701293730383, "eval_loss": 0.616006076335907, "eval_runtime": 29.936, "eval_samples_per_second": 17.604, "eval_steps_per_second": 4.409, "eval_token_acc": 0.8253008936412005, "step": 780 }, { "epoch": 0.9614942968690193, "grad_norm": 0.9814106822013855, "learning_rate": 7.670242924284e-06, "loss": 0.6097393989562988, "memory(GiB)": 32.12, "step": 785, "token_acc": 0.8181161935170403, "train_speed(iter/s)": 0.125512 }, { "epoch": 0.9676184643650004, "grad_norm": 0.9246596693992615, "learning_rate": 7.643063226800556e-06, "loss": 0.5933025360107422, "memory(GiB)": 32.12, "step": 790, "token_acc": 0.8117928174854171, "train_speed(iter/s)": 0.125727 }, { "epoch": 0.9737426318609814, "grad_norm": 0.9575701951980591, "learning_rate": 7.615774705742012e-06, "loss": 0.6192699432373047, "memory(GiB)": 32.12, "step": 795, "token_acc": 0.8193493150684932, "train_speed(iter/s)": 0.125968 }, { "epoch": 0.9798667993569624, "grad_norm": 0.9669679403305054, "learning_rate": 7.588378484666214e-06, "loss": 0.5967386722564697, "memory(GiB)": 32.12, "step": 800, "token_acc": 0.8374201589032559, "train_speed(iter/s)": 0.126204 }, { "epoch": 0.9798667993569624, "eval_loss": 0.6141526699066162, "eval_runtime": 29.9762, "eval_samples_per_second": 17.581, "eval_steps_per_second": 4.403, "eval_token_acc": 0.825512681440157, "step": 800 }, { "epoch": 0.9859909668529434, "grad_norm": 1.0676608085632324, "learning_rate": 7.560875691565366e-06, "loss": 0.6506372451782226, "memory(GiB)": 32.12, "step": 805, "token_acc": 0.8162111860741997, "train_speed(iter/s)": 0.125683 }, { "epoch": 0.9921151343489244, "grad_norm": 1.1090500354766846, "learning_rate": 7.533267458819597e-06, "loss": 0.6549376487731934, "memory(GiB)": 32.12, "step": 810, "token_acc": 0.7903715821453611, "train_speed(iter/s)": 0.125882 }, { "epoch": 0.9982393018449055, "grad_norm": 1.0215500593185425, "learning_rate": 7.505554923150329e-06, "loss": 0.6107999324798584, "memory(GiB)": 32.12, "step": 815, "token_acc": 0.8255743651753326, "train_speed(iter/s)": 0.126031 }, { "epoch": 1.0036745004975887, "grad_norm": 0.9610656499862671, "learning_rate": 7.477739225573475e-06, "loss": 0.5486949920654297, "memory(GiB)": 32.12, "step": 820, "token_acc": 0.8164336957325642, "train_speed(iter/s)": 0.126279 }, { "epoch": 1.0036745004975887, "eval_loss": 0.6169166564941406, "eval_runtime": 29.8644, "eval_samples_per_second": 17.646, "eval_steps_per_second": 4.42, "eval_token_acc": 0.8263649981920553, "step": 820 }, { "epoch": 1.0097986679935695, "grad_norm": 0.8930652141571045, "learning_rate": 7.449821511352465e-06, "loss": 0.5580629348754883, "memory(GiB)": 32.12, "step": 825, "token_acc": 0.8275424871864041, "train_speed(iter/s)": 0.125684 }, { "epoch": 1.0159228354895506, "grad_norm": 1.043878197669983, "learning_rate": 7.421802929951088e-06, "loss": 0.537553071975708, "memory(GiB)": 32.12, "step": 830, "token_acc": 0.8182638888888889, "train_speed(iter/s)": 0.12587 }, { "epoch": 1.0220470029855317, "grad_norm": 0.9812543392181396, "learning_rate": 7.393684634986165e-06, "loss": 0.544792366027832, "memory(GiB)": 32.12, "step": 835, "token_acc": 0.8522354565855342, "train_speed(iter/s)": 0.12604 }, { "epoch": 1.0281711704815126, "grad_norm": 0.9970709085464478, "learning_rate": 7.365467784180051e-06, "loss": 0.5357254028320313, "memory(GiB)": 32.12, "step": 840, "token_acc": 0.8398415604798027, "train_speed(iter/s)": 0.126238 }, { "epoch": 1.0281711704815126, "eval_loss": 0.6195093393325806, "eval_runtime": 30.0227, "eval_samples_per_second": 17.553, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8253680458701379, "step": 840 }, { "epoch": 1.0342953379774937, "grad_norm": 0.9345646500587463, "learning_rate": 7.337153539312968e-06, "loss": 0.5476717948913574, "memory(GiB)": 32.12, "step": 845, "token_acc": 0.8294938351719663, "train_speed(iter/s)": 0.125697 }, { "epoch": 1.0404195054734746, "grad_norm": 0.9843780398368835, "learning_rate": 7.308743066175172e-06, "loss": 0.5259488105773926, "memory(GiB)": 32.12, "step": 850, "token_acc": 0.8343461220380425, "train_speed(iter/s)": 0.125882 }, { "epoch": 1.0465436729694557, "grad_norm": 0.9147416949272156, "learning_rate": 7.280237534518948e-06, "loss": 0.5354435443878174, "memory(GiB)": 32.12, "step": 855, "token_acc": 0.8242645320363428, "train_speed(iter/s)": 0.126046 }, { "epoch": 1.0526678404654368, "grad_norm": 0.9699310660362244, "learning_rate": 7.251638118010456e-06, "loss": 0.5579245567321778, "memory(GiB)": 32.12, "step": 860, "token_acc": 0.8199693263596681, "train_speed(iter/s)": 0.126211 }, { "epoch": 1.0526678404654368, "eval_loss": 0.6175369620323181, "eval_runtime": 30.0015, "eval_samples_per_second": 17.566, "eval_steps_per_second": 4.4, "eval_token_acc": 0.8255643370008782, "step": 860 }, { "epoch": 1.0587920079614177, "grad_norm": 0.9256744384765625, "learning_rate": 7.222945994181403e-06, "loss": 0.5566354751586914, "memory(GiB)": 32.12, "step": 865, "token_acc": 0.8209406091511238, "train_speed(iter/s)": 0.125682 }, { "epoch": 1.0649161754573988, "grad_norm": 0.9246125221252441, "learning_rate": 7.194162344380561e-06, "loss": 0.5399526596069336, "memory(GiB)": 32.12, "step": 870, "token_acc": 0.8396965685046696, "train_speed(iter/s)": 0.125842 }, { "epoch": 1.0710403429533797, "grad_norm": 0.9580938816070557, "learning_rate": 7.16528835372512e-06, "loss": 0.540044641494751, "memory(GiB)": 32.12, "step": 875, "token_acc": 0.8386737552985128, "train_speed(iter/s)": 0.126028 }, { "epoch": 1.0771645104493608, "grad_norm": 1.0862672328948975, "learning_rate": 7.136325211051905e-06, "loss": 0.5482538223266602, "memory(GiB)": 32.12, "step": 880, "token_acc": 0.8172277019200394, "train_speed(iter/s)": 0.126217 }, { "epoch": 1.0771645104493608, "eval_loss": 0.619035542011261, "eval_runtime": 29.9689, "eval_samples_per_second": 17.585, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8250477813936671, "step": 880 }, { "epoch": 1.083288677945342, "grad_norm": 0.8554603457450867, "learning_rate": 7.107274108868422e-06, "loss": 0.5296638965606689, "memory(GiB)": 32.12, "step": 885, "token_acc": 0.8246657960985986, "train_speed(iter/s)": 0.12573 }, { "epoch": 1.0894128454413228, "grad_norm": 0.9336580634117126, "learning_rate": 7.078136243303754e-06, "loss": 0.5232193946838379, "memory(GiB)": 32.12, "step": 890, "token_acc": 0.8418666840594834, "train_speed(iter/s)": 0.12589 }, { "epoch": 1.0955370129373039, "grad_norm": 0.9728440642356873, "learning_rate": 7.048912814059321e-06, "loss": 0.5442141056060791, "memory(GiB)": 32.12, "step": 895, "token_acc": 0.8226529199606543, "train_speed(iter/s)": 0.126084 }, { "epoch": 1.1016611804332848, "grad_norm": 1.0364502668380737, "learning_rate": 7.019605024359475e-06, "loss": 0.5461842536926269, "memory(GiB)": 32.12, "step": 900, "token_acc": 0.8352232590995279, "train_speed(iter/s)": 0.12626 }, { "epoch": 1.1016611804332848, "eval_loss": 0.6182094812393188, "eval_runtime": 29.9539, "eval_samples_per_second": 17.594, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8259620848184307, "step": 900 }, { "epoch": 1.1077853479292659, "grad_norm": 0.9474948644638062, "learning_rate": 6.990214080901971e-06, "loss": 0.5203993797302247, "memory(GiB)": 32.12, "step": 905, "token_acc": 0.8326191860072181, "train_speed(iter/s)": 0.125742 }, { "epoch": 1.113909515425247, "grad_norm": 0.9584360718727112, "learning_rate": 6.9607411938082735e-06, "loss": 0.5354339122772217, "memory(GiB)": 32.12, "step": 910, "token_acc": 0.8325466311381804, "train_speed(iter/s)": 0.125916 }, { "epoch": 1.1200336829212278, "grad_norm": 0.9902798533439636, "learning_rate": 6.931187576573733e-06, "loss": 0.531368637084961, "memory(GiB)": 32.12, "step": 915, "token_acc": 0.8471662228984405, "train_speed(iter/s)": 0.126087 }, { "epoch": 1.126157850417209, "grad_norm": 0.8779637217521667, "learning_rate": 6.9015544460176296e-06, "loss": 0.5314560890197754, "memory(GiB)": 32.12, "step": 920, "token_acc": 0.8275991535258379, "train_speed(iter/s)": 0.126272 }, { "epoch": 1.126157850417209, "eval_loss": 0.6172027587890625, "eval_runtime": 29.9497, "eval_samples_per_second": 17.596, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8254610258794359, "step": 920 }, { "epoch": 1.1322820179131898, "grad_norm": 1.0086554288864136, "learning_rate": 6.87184302223306e-06, "loss": 0.5486597061157227, "memory(GiB)": 32.12, "step": 925, "token_acc": 0.8296143047140535, "train_speed(iter/s)": 0.125778 }, { "epoch": 1.138406185409171, "grad_norm": 1.055482029914856, "learning_rate": 6.842054528536717e-06, "loss": 0.5004231452941894, "memory(GiB)": 32.12, "step": 930, "token_acc": 0.8338052711827488, "train_speed(iter/s)": 0.125965 }, { "epoch": 1.144530352905152, "grad_norm": 0.9732358455657959, "learning_rate": 6.812190191418508e-06, "loss": 0.528237771987915, "memory(GiB)": 32.12, "step": 935, "token_acc": 0.8312384161752316, "train_speed(iter/s)": 0.126125 }, { "epoch": 1.150654520401133, "grad_norm": 0.8922236561775208, "learning_rate": 6.782251240491071e-06, "loss": 0.5213536262512207, "memory(GiB)": 32.12, "step": 940, "token_acc": 0.846323478740266, "train_speed(iter/s)": 0.126299 }, { "epoch": 1.150654520401133, "eval_loss": 0.6174668073654175, "eval_runtime": 29.9268, "eval_samples_per_second": 17.61, "eval_steps_per_second": 4.411, "eval_token_acc": 0.8251975825197583, "step": 940 }, { "epoch": 1.156778687897114, "grad_norm": 1.022758960723877, "learning_rate": 6.75223890843913e-06, "loss": 0.5352741241455078, "memory(GiB)": 32.12, "step": 945, "token_acc": 0.8363561308192181, "train_speed(iter/s)": 0.12581 }, { "epoch": 1.162902855393095, "grad_norm": 0.9870294332504272, "learning_rate": 6.722154430968755e-06, "loss": 0.5349910259246826, "memory(GiB)": 32.12, "step": 950, "token_acc": 0.8381542699724518, "train_speed(iter/s)": 0.12598 }, { "epoch": 1.169027022889076, "grad_norm": 0.9218893051147461, "learning_rate": 6.69199904675648e-06, "loss": 0.5564836025238037, "memory(GiB)": 32.12, "step": 955, "token_acc": 0.8276523535487679, "train_speed(iter/s)": 0.12616 }, { "epoch": 1.175151190385057, "grad_norm": 0.9656640887260437, "learning_rate": 6.6617739973982985e-06, "loss": 0.505579948425293, "memory(GiB)": 32.12, "step": 960, "token_acc": 0.8291429745838186, "train_speed(iter/s)": 0.126306 }, { "epoch": 1.175151190385057, "eval_loss": 0.618171751499176, "eval_runtime": 29.9871, "eval_samples_per_second": 17.574, "eval_steps_per_second": 4.402, "eval_token_acc": 0.825326721421561, "step": 960 }, { "epoch": 1.181275357881038, "grad_norm": 0.9255744218826294, "learning_rate": 6.631480527358552e-06, "loss": 0.5494061946868897, "memory(GiB)": 32.12, "step": 965, "token_acc": 0.828285929606163, "train_speed(iter/s)": 0.125817 }, { "epoch": 1.187399525377019, "grad_norm": 0.8625167608261108, "learning_rate": 6.601119883918677e-06, "loss": 0.5405423164367675, "memory(GiB)": 32.12, "step": 970, "token_acc": 0.8340152804432053, "train_speed(iter/s)": 0.125981 }, { "epoch": 1.1935236928730002, "grad_norm": 0.9901431202888489, "learning_rate": 6.570693317125868e-06, "loss": 0.5540534019470215, "memory(GiB)": 32.12, "step": 975, "token_acc": 0.8329805323246695, "train_speed(iter/s)": 0.12611 }, { "epoch": 1.199647860368981, "grad_norm": 0.8996224403381348, "learning_rate": 6.540202079741594e-06, "loss": 0.5333957672119141, "memory(GiB)": 32.12, "step": 980, "token_acc": 0.8473111291632819, "train_speed(iter/s)": 0.126282 }, { "epoch": 1.199647860368981, "eval_loss": 0.6176728010177612, "eval_runtime": 30.0203, "eval_samples_per_second": 17.555, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8252853969729841, "step": 980 }, { "epoch": 1.2057720278649622, "grad_norm": 1.0513380765914917, "learning_rate": 6.509647427190029e-06, "loss": 0.5554468631744385, "memory(GiB)": 32.12, "step": 985, "token_acc": 0.8163728888561104, "train_speed(iter/s)": 0.125784 }, { "epoch": 1.211896195360943, "grad_norm": 0.9925962686538696, "learning_rate": 6.4790306175063535e-06, "loss": 0.5358247756958008, "memory(GiB)": 32.12, "step": 990, "token_acc": 0.8373621787068276, "train_speed(iter/s)": 0.125984 }, { "epoch": 1.2180203628569242, "grad_norm": 0.9856204390525818, "learning_rate": 6.44835291128496e-06, "loss": 0.544157600402832, "memory(GiB)": 32.12, "step": 995, "token_acc": 0.8250445425672012, "train_speed(iter/s)": 0.126117 }, { "epoch": 1.224144530352905, "grad_norm": 0.9970067739486694, "learning_rate": 6.417615571627555e-06, "loss": 0.5199033260345459, "memory(GiB)": 34.49, "step": 1000, "token_acc": 0.8412796162447737, "train_speed(iter/s)": 0.126261 }, { "epoch": 1.224144530352905, "eval_loss": 0.6171393990516663, "eval_runtime": 29.9792, "eval_samples_per_second": 17.579, "eval_steps_per_second": 4.403, "eval_token_acc": 0.8254248669869312, "step": 1000 }, { "epoch": 1.2302686978488862, "grad_norm": 0.978387176990509, "learning_rate": 6.386819864091146e-06, "loss": 0.5251027107238769, "memory(GiB)": 34.49, "step": 1005, "token_acc": 0.8299365231042249, "train_speed(iter/s)": 0.125803 }, { "epoch": 1.2363928653448673, "grad_norm": 0.9339916706085205, "learning_rate": 6.35596705663594e-06, "loss": 0.566818380355835, "memory(GiB)": 34.49, "step": 1010, "token_acc": 0.8130651567649793, "train_speed(iter/s)": 0.12596 }, { "epoch": 1.2425170328408481, "grad_norm": 0.9691733717918396, "learning_rate": 6.325058419573131e-06, "loss": 0.5325815200805664, "memory(GiB)": 34.49, "step": 1015, "token_acc": 0.838964083981669, "train_speed(iter/s)": 0.126159 }, { "epoch": 1.2486412003368292, "grad_norm": 0.9045368432998657, "learning_rate": 6.294095225512604e-06, "loss": 0.5249390602111816, "memory(GiB)": 34.49, "step": 1020, "token_acc": 0.8380004706356944, "train_speed(iter/s)": 0.126284 }, { "epoch": 1.2486412003368292, "eval_loss": 0.6158590316772461, "eval_runtime": 30.0066, "eval_samples_per_second": 17.563, "eval_steps_per_second": 4.399, "eval_token_acc": 0.8253318869776332, "step": 1020 }, { "epoch": 1.2547653678328103, "grad_norm": 0.98622065782547, "learning_rate": 6.263078749310534e-06, "loss": 0.561451530456543, "memory(GiB)": 34.49, "step": 1025, "token_acc": 0.8262592270950934, "train_speed(iter/s)": 0.125824 }, { "epoch": 1.2608895353287912, "grad_norm": 0.9515383243560791, "learning_rate": 6.232010268016895e-06, "loss": 0.5291833877563477, "memory(GiB)": 34.49, "step": 1030, "token_acc": 0.8373620599054125, "train_speed(iter/s)": 0.125985 }, { "epoch": 1.2670137028247723, "grad_norm": 0.9982597827911377, "learning_rate": 6.200891060822884e-06, "loss": 0.577932071685791, "memory(GiB)": 34.49, "step": 1035, "token_acc": 0.8188697951090549, "train_speed(iter/s)": 0.126139 }, { "epoch": 1.2731378703207532, "grad_norm": 1.0038230419158936, "learning_rate": 6.169722409008244e-06, "loss": 0.5776113986968994, "memory(GiB)": 34.49, "step": 1040, "token_acc": 0.8182007844446298, "train_speed(iter/s)": 0.126292 }, { "epoch": 1.2731378703207532, "eval_loss": 0.6144587397575378, "eval_runtime": 30.0744, "eval_samples_per_second": 17.523, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8263133426313343, "step": 1040 }, { "epoch": 1.2792620378167343, "grad_norm": 0.9570845365524292, "learning_rate": 6.13850559588852e-06, "loss": 0.5415801048278809, "memory(GiB)": 34.49, "step": 1045, "token_acc": 0.8337840538200226, "train_speed(iter/s)": 0.125838 }, { "epoch": 1.2853862053127152, "grad_norm": 0.9676324725151062, "learning_rate": 6.107241906762214e-06, "loss": 0.5263193130493165, "memory(GiB)": 34.49, "step": 1050, "token_acc": 0.8450373289877591, "train_speed(iter/s)": 0.125977 }, { "epoch": 1.2915103728086963, "grad_norm": 0.8747360110282898, "learning_rate": 6.075932628857869e-06, "loss": 0.5368072032928467, "memory(GiB)": 34.49, "step": 1055, "token_acc": 0.8371580206308865, "train_speed(iter/s)": 0.126138 }, { "epoch": 1.2976345403046774, "grad_norm": 0.9015209674835205, "learning_rate": 6.044579051281063e-06, "loss": 0.4784068584442139, "memory(GiB)": 34.49, "step": 1060, "token_acc": 0.8673299195318215, "train_speed(iter/s)": 0.126244 }, { "epoch": 1.2976345403046774, "eval_loss": 0.6154947280883789, "eval_runtime": 30.068, "eval_samples_per_second": 17.527, "eval_steps_per_second": 4.39, "eval_token_acc": 0.8261997003977478, "step": 1060 }, { "epoch": 1.3037587078006583, "grad_norm": 0.9384099841117859, "learning_rate": 6.013182464961341e-06, "loss": 0.5346551418304444, "memory(GiB)": 34.49, "step": 1065, "token_acc": 0.8303387250508586, "train_speed(iter/s)": 0.125804 }, { "epoch": 1.3098828752966394, "grad_norm": 0.8981488347053528, "learning_rate": 5.981744162599057e-06, "loss": 0.5211257934570312, "memory(GiB)": 34.49, "step": 1070, "token_acc": 0.8500360490266763, "train_speed(iter/s)": 0.12593 }, { "epoch": 1.3160070427926205, "grad_norm": 0.8698239922523499, "learning_rate": 5.9502654386121505e-06, "loss": 0.5495285034179688, "memory(GiB)": 34.49, "step": 1075, "token_acc": 0.8446355346104413, "train_speed(iter/s)": 0.12608 }, { "epoch": 1.3221312102886014, "grad_norm": 0.990492582321167, "learning_rate": 5.918747589082853e-06, "loss": 0.5515711307525635, "memory(GiB)": 34.49, "step": 1080, "token_acc": 0.8194262671996039, "train_speed(iter/s)": 0.126234 }, { "epoch": 1.3221312102886014, "eval_loss": 0.6156888008117676, "eval_runtime": 30.042, "eval_samples_per_second": 17.542, "eval_steps_per_second": 4.394, "eval_token_acc": 0.8256831447905367, "step": 1080 }, { "epoch": 1.3282553777845825, "grad_norm": 1.0377113819122314, "learning_rate": 5.887191911704322e-06, "loss": 0.5179418087005615, "memory(GiB)": 34.49, "step": 1085, "token_acc": 0.822986674391657, "train_speed(iter/s)": 0.125752 }, { "epoch": 1.3343795452805634, "grad_norm": 1.1629189252853394, "learning_rate": 5.855599705727212e-06, "loss": 0.501689100265503, "memory(GiB)": 34.49, "step": 1090, "token_acc": 0.8511506930497481, "train_speed(iter/s)": 0.125871 }, { "epoch": 1.3405037127765445, "grad_norm": 1.021088719367981, "learning_rate": 5.823972271906177e-06, "loss": 0.5111154556274414, "memory(GiB)": 34.49, "step": 1095, "token_acc": 0.827035490605428, "train_speed(iter/s)": 0.126024 }, { "epoch": 1.3466278802725253, "grad_norm": 1.0440119504928589, "learning_rate": 5.7923109124463264e-06, "loss": 0.5382958889007569, "memory(GiB)": 34.49, "step": 1100, "token_acc": 0.8228407178911946, "train_speed(iter/s)": 0.12617 }, { "epoch": 1.3466278802725253, "eval_loss": 0.6151137948036194, "eval_runtime": 29.8862, "eval_samples_per_second": 17.634, "eval_steps_per_second": 4.417, "eval_token_acc": 0.8264734748695697, "step": 1100 }, { "epoch": 1.3527520477685064, "grad_norm": 1.0229501724243164, "learning_rate": 5.760616930949584e-06, "loss": 0.542177963256836, "memory(GiB)": 34.49, "step": 1105, "token_acc": 0.8330580493912673, "train_speed(iter/s)": 0.125749 }, { "epoch": 1.3588762152644875, "grad_norm": 0.8944743871688843, "learning_rate": 5.728891632361043e-06, "loss": 0.5133552551269531, "memory(GiB)": 34.49, "step": 1110, "token_acc": 0.8286529928320973, "train_speed(iter/s)": 0.125908 }, { "epoch": 1.3650003827604684, "grad_norm": 0.8976428508758545, "learning_rate": 5.697136322915218e-06, "loss": 0.5297269821166992, "memory(GiB)": 34.49, "step": 1115, "token_acc": 0.8331969608416131, "train_speed(iter/s)": 0.126045 }, { "epoch": 1.3711245502564495, "grad_norm": 1.013662338256836, "learning_rate": 5.66535231008227e-06, "loss": 0.5449240684509278, "memory(GiB)": 34.49, "step": 1120, "token_acc": 0.8382301504022386, "train_speed(iter/s)": 0.126181 }, { "epoch": 1.3711245502564495, "eval_loss": 0.6147744059562683, "eval_runtime": 30.0501, "eval_samples_per_second": 17.537, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8263081770752622, "step": 1120 }, { "epoch": 1.3772487177524306, "grad_norm": 1.047998309135437, "learning_rate": 5.63354090251417e-06, "loss": 0.5496514320373536, "memory(GiB)": 34.49, "step": 1125, "token_acc": 0.8280883107068635, "train_speed(iter/s)": 0.125763 }, { "epoch": 1.3833728852484115, "grad_norm": 0.995204508304596, "learning_rate": 5.6017034099908245e-06, "loss": 0.5459441184997559, "memory(GiB)": 34.49, "step": 1130, "token_acc": 0.8158013374408295, "train_speed(iter/s)": 0.125925 }, { "epoch": 1.3894970527443926, "grad_norm": 0.9421271681785583, "learning_rate": 5.569841143366141e-06, "loss": 0.51002197265625, "memory(GiB)": 34.49, "step": 1135, "token_acc": 0.8427027419120847, "train_speed(iter/s)": 0.126047 }, { "epoch": 1.3956212202403735, "grad_norm": 0.9212712049484253, "learning_rate": 5.537955414514058e-06, "loss": 0.5343506813049317, "memory(GiB)": 36.87, "step": 1140, "token_acc": 0.8508435329143236, "train_speed(iter/s)": 0.126151 }, { "epoch": 1.3956212202403735, "eval_loss": 0.6138430237770081, "eval_runtime": 30.0787, "eval_samples_per_second": 17.521, "eval_steps_per_second": 4.388, "eval_token_acc": 0.8258587736969885, "step": 1140 }, { "epoch": 1.4017453877363546, "grad_norm": 0.960340142250061, "learning_rate": 5.506047536274529e-06, "loss": 0.537141227722168, "memory(GiB)": 36.87, "step": 1145, "token_acc": 0.825777386163379, "train_speed(iter/s)": 0.125768 }, { "epoch": 1.4078695552323355, "grad_norm": 1.063237190246582, "learning_rate": 5.474118822399476e-06, "loss": 0.5870203018188477, "memory(GiB)": 36.87, "step": 1150, "token_acc": 0.8254607459004498, "train_speed(iter/s)": 0.125933 }, { "epoch": 1.4139937227283166, "grad_norm": 0.8735440373420715, "learning_rate": 5.442170587498684e-06, "loss": 0.5143415451049804, "memory(GiB)": 36.87, "step": 1155, "token_acc": 0.8224962760245262, "train_speed(iter/s)": 0.126052 }, { "epoch": 1.4201178902242977, "grad_norm": 0.8771001100540161, "learning_rate": 5.41020414698569e-06, "loss": 0.557903242111206, "memory(GiB)": 36.87, "step": 1160, "token_acc": 0.8381047381546135, "train_speed(iter/s)": 0.126212 }, { "epoch": 1.4201178902242977, "eval_loss": 0.6127957701683044, "eval_runtime": 30.0731, "eval_samples_per_second": 17.524, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8258226148044837, "step": 1160 }, { "epoch": 1.4262420577202786, "grad_norm": 0.9807034134864807, "learning_rate": 5.378220817023609e-06, "loss": 0.5510265350341796, "memory(GiB)": 36.87, "step": 1165, "token_acc": 0.8266740684199569, "train_speed(iter/s)": 0.125818 }, { "epoch": 1.4323662252162597, "grad_norm": 0.9031324982643127, "learning_rate": 5.346221914470959e-06, "loss": 0.5285142421722412, "memory(GiB)": 36.87, "step": 1170, "token_acc": 0.8287966113464342, "train_speed(iter/s)": 0.125945 }, { "epoch": 1.4384903927122408, "grad_norm": 0.9082944393157959, "learning_rate": 5.314208756827425e-06, "loss": 0.5313165664672852, "memory(GiB)": 36.87, "step": 1175, "token_acc": 0.8377556371263765, "train_speed(iter/s)": 0.126106 }, { "epoch": 1.4446145602082217, "grad_norm": 0.9939496517181396, "learning_rate": 5.282182662179623e-06, "loss": 0.559614896774292, "memory(GiB)": 36.87, "step": 1180, "token_acc": 0.846643215328194, "train_speed(iter/s)": 0.126246 }, { "epoch": 1.4446145602082217, "eval_loss": 0.6125648021697998, "eval_runtime": 30.1611, "eval_samples_per_second": 17.473, "eval_steps_per_second": 4.376, "eval_token_acc": 0.826158375949171, "step": 1180 }, { "epoch": 1.4507387277042028, "grad_norm": 1.0398341417312622, "learning_rate": 5.250144949146827e-06, "loss": 0.5018705368041992, "memory(GiB)": 36.87, "step": 1185, "token_acc": 0.8302141481179431, "train_speed(iter/s)": 0.125825 }, { "epoch": 1.4568628952001839, "grad_norm": 1.0717829465866089, "learning_rate": 5.218096936826681e-06, "loss": 0.543729591369629, "memory(GiB)": 36.87, "step": 1190, "token_acc": 0.8435318409753859, "train_speed(iter/s)": 0.125983 }, { "epoch": 1.4629870626961647, "grad_norm": 0.9488953948020935, "learning_rate": 5.186039944740882e-06, "loss": 0.5498368740081787, "memory(GiB)": 36.87, "step": 1195, "token_acc": 0.8358378225120499, "train_speed(iter/s)": 0.126113 }, { "epoch": 1.4691112301921456, "grad_norm": 1.010858416557312, "learning_rate": 5.153975292780852e-06, "loss": 0.5265066623687744, "memory(GiB)": 36.87, "step": 1200, "token_acc": 0.8429352241672207, "train_speed(iter/s)": 0.126228 }, { "epoch": 1.4691112301921456, "eval_loss": 0.6125081181526184, "eval_runtime": 29.9605, "eval_samples_per_second": 17.59, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8266542693320936, "step": 1200 }, { "epoch": 1.4752353976881267, "grad_norm": 0.8864910006523132, "learning_rate": 5.1219043011534e-06, "loss": 0.5261281967163086, "memory(GiB)": 36.87, "step": 1205, "token_acc": 0.8282765708814958, "train_speed(iter/s)": 0.125837 }, { "epoch": 1.4813595651841078, "grad_norm": 0.9174733757972717, "learning_rate": 5.089828290326354e-06, "loss": 0.5531785964965821, "memory(GiB)": 36.87, "step": 1210, "token_acc": 0.8411006266657063, "train_speed(iter/s)": 0.126012 }, { "epoch": 1.4874837326800887, "grad_norm": 0.9243429899215698, "learning_rate": 5.057748580974204e-06, "loss": 0.5176255702972412, "memory(GiB)": 36.87, "step": 1215, "token_acc": 0.8498275862068966, "train_speed(iter/s)": 0.12615 }, { "epoch": 1.4936079001760698, "grad_norm": 0.9391066431999207, "learning_rate": 5.0256664939237186e-06, "loss": 0.5616118431091308, "memory(GiB)": 36.87, "step": 1220, "token_acc": 0.8409511815690051, "train_speed(iter/s)": 0.126282 }, { "epoch": 1.4936079001760698, "eval_loss": 0.6114863157272339, "eval_runtime": 29.9732, "eval_samples_per_second": 17.582, "eval_steps_per_second": 4.404, "eval_token_acc": 0.8267007593367426, "step": 1220 }, { "epoch": 1.499732067672051, "grad_norm": 0.8913131356239319, "learning_rate": 4.99358335009956e-06, "loss": 0.5003180027008056, "memory(GiB)": 36.87, "step": 1225, "token_acc": 0.8392863897119082, "train_speed(iter/s)": 0.125908 }, { "epoch": 1.5058562351680318, "grad_norm": 0.9838159084320068, "learning_rate": 4.961500470469908e-06, "loss": 0.5151349067687988, "memory(GiB)": 36.87, "step": 1230, "token_acc": 0.8358106300867373, "train_speed(iter/s)": 0.126051 }, { "epoch": 1.511980402664013, "grad_norm": 0.9471805095672607, "learning_rate": 4.92941917599206e-06, "loss": 0.5267168998718261, "memory(GiB)": 36.87, "step": 1235, "token_acc": 0.8325013676148797, "train_speed(iter/s)": 0.126177 }, { "epoch": 1.518104570159994, "grad_norm": 0.9928951263427734, "learning_rate": 4.8973407875580485e-06, "loss": 0.5807061195373535, "memory(GiB)": 36.87, "step": 1240, "token_acc": 0.82605387834146, "train_speed(iter/s)": 0.12635 }, { "epoch": 1.518104570159994, "eval_loss": 0.6120603084564209, "eval_runtime": 30.0211, "eval_samples_per_second": 17.554, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8262875148509737, "step": 1240 }, { "epoch": 1.5242287376559749, "grad_norm": 0.9785681366920471, "learning_rate": 4.8652666259402584e-06, "loss": 0.5564475059509277, "memory(GiB)": 36.87, "step": 1245, "token_acc": 0.8232146560663671, "train_speed(iter/s)": 0.125966 }, { "epoch": 1.5303529051519558, "grad_norm": 0.9484609365463257, "learning_rate": 4.833198011737035e-06, "loss": 0.5257096767425538, "memory(GiB)": 36.87, "step": 1250, "token_acc": 0.8338689740420272, "train_speed(iter/s)": 0.126082 }, { "epoch": 1.5364770726479369, "grad_norm": 1.0170414447784424, "learning_rate": 4.8011362653183245e-06, "loss": 0.5458654403686524, "memory(GiB)": 36.87, "step": 1255, "token_acc": 0.8260180208051355, "train_speed(iter/s)": 0.126209 }, { "epoch": 1.542601240143918, "grad_norm": 1.0465954542160034, "learning_rate": 4.7690827067713035e-06, "loss": 0.5092308998107911, "memory(GiB)": 36.87, "step": 1260, "token_acc": 0.8562170404727111, "train_speed(iter/s)": 0.126341 }, { "epoch": 1.542601240143918, "eval_loss": 0.6123631596565247, "eval_runtime": 29.8837, "eval_samples_per_second": 17.635, "eval_steps_per_second": 4.417, "eval_token_acc": 0.8266542693320936, "step": 1260 }, { "epoch": 1.5487254076398989, "grad_norm": 1.0032224655151367, "learning_rate": 4.737038655846023e-06, "loss": 0.5465664863586426, "memory(GiB)": 36.87, "step": 1265, "token_acc": 0.8245171081677705, "train_speed(iter/s)": 0.125985 }, { "epoch": 1.55484957513588, "grad_norm": 1.0049303770065308, "learning_rate": 4.70500543190108e-06, "loss": 0.5189294338226318, "memory(GiB)": 36.87, "step": 1270, "token_acc": 0.8236330189048495, "train_speed(iter/s)": 0.12608 }, { "epoch": 1.560973742631861, "grad_norm": 1.006712794303894, "learning_rate": 4.672984353849285e-06, "loss": 0.5561445236206055, "memory(GiB)": 36.87, "step": 1275, "token_acc": 0.8239827598801958, "train_speed(iter/s)": 0.126214 }, { "epoch": 1.567097910127842, "grad_norm": 0.8475578427314758, "learning_rate": 4.640976740103363e-06, "loss": 0.5361814498901367, "memory(GiB)": 36.87, "step": 1280, "token_acc": 0.8350327247674819, "train_speed(iter/s)": 0.126343 }, { "epoch": 1.567097910127842, "eval_loss": 0.6125593185424805, "eval_runtime": 29.9427, "eval_samples_per_second": 17.6, "eval_steps_per_second": 4.408, "eval_token_acc": 0.8268918849114107, "step": 1280 }, { "epoch": 1.573222077623823, "grad_norm": 1.068233847618103, "learning_rate": 4.60898390852167e-06, "loss": 0.5269934654235839, "memory(GiB)": 36.87, "step": 1285, "token_acc": 0.8273768192895751, "train_speed(iter/s)": 0.125978 }, { "epoch": 1.5793462451198041, "grad_norm": 1.0497961044311523, "learning_rate": 4.577007176353931e-06, "loss": 0.5188837051391602, "memory(GiB)": 36.87, "step": 1290, "token_acc": 0.8475213675213675, "train_speed(iter/s)": 0.126095 }, { "epoch": 1.585470412615785, "grad_norm": 0.9117013812065125, "learning_rate": 4.5450478601870055e-06, "loss": 0.49097652435302735, "memory(GiB)": 36.87, "step": 1295, "token_acc": 0.83500768653248, "train_speed(iter/s)": 0.126208 }, { "epoch": 1.591594580111766, "grad_norm": 0.865384042263031, "learning_rate": 4.513107275890682e-06, "loss": 0.5219059944152832, "memory(GiB)": 36.87, "step": 1300, "token_acc": 0.8484564711960734, "train_speed(iter/s)": 0.126317 }, { "epoch": 1.591594580111766, "eval_loss": 0.6109749674797058, "eval_runtime": 29.957, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8270158582571414, "step": 1300 }, { "epoch": 1.5977187476077472, "grad_norm": 0.9944786429405212, "learning_rate": 4.4811867385634916e-06, "loss": 0.5182311058044433, "memory(GiB)": 36.87, "step": 1305, "token_acc": 0.8336139447360602, "train_speed(iter/s)": 0.125948 }, { "epoch": 1.6038429151037281, "grad_norm": 0.9752517342567444, "learning_rate": 4.44928756247857e-06, "loss": 0.49358739852905276, "memory(GiB)": 36.87, "step": 1310, "token_acc": 0.8449874236435502, "train_speed(iter/s)": 0.126049 }, { "epoch": 1.609967082599709, "grad_norm": 0.9614261984825134, "learning_rate": 4.417411061029539e-06, "loss": 0.536794376373291, "memory(GiB)": 36.87, "step": 1315, "token_acc": 0.832827077457149, "train_speed(iter/s)": 0.126177 }, { "epoch": 1.61609125009569, "grad_norm": 0.9478575587272644, "learning_rate": 4.3855585466764305e-06, "loss": 0.4996980667114258, "memory(GiB)": 36.87, "step": 1320, "token_acc": 0.8452054794520548, "train_speed(iter/s)": 0.126305 }, { "epoch": 1.61609125009569, "eval_loss": 0.6103559732437134, "eval_runtime": 30.0278, "eval_samples_per_second": 17.55, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8273722816261171, "step": 1320 }, { "epoch": 1.6222154175916712, "grad_norm": 1.074583649635315, "learning_rate": 4.353731330891651e-06, "loss": 0.529239273071289, "memory(GiB)": 36.87, "step": 1325, "token_acc": 0.8302422791282821, "train_speed(iter/s)": 0.125951 }, { "epoch": 1.628339585087652, "grad_norm": 0.9707440137863159, "learning_rate": 4.321930724105979e-06, "loss": 0.4900198936462402, "memory(GiB)": 36.87, "step": 1330, "token_acc": 0.8524826181613877, "train_speed(iter/s)": 0.126054 }, { "epoch": 1.6344637525836332, "grad_norm": 0.943321943283081, "learning_rate": 4.290158035654618e-06, "loss": 0.5417927265167236, "memory(GiB)": 36.87, "step": 1335, "token_acc": 0.8254652088914634, "train_speed(iter/s)": 0.1262 }, { "epoch": 1.6405879200796143, "grad_norm": 1.0129594802856445, "learning_rate": 4.258414573723277e-06, "loss": 0.545560359954834, "memory(GiB)": 36.87, "step": 1340, "token_acc": 0.8416190929273708, "train_speed(iter/s)": 0.126328 }, { "epoch": 1.6405879200796143, "eval_loss": 0.609876275062561, "eval_runtime": 30.0571, "eval_samples_per_second": 17.533, "eval_steps_per_second": 4.392, "eval_token_acc": 0.827418771630766, "step": 1340 }, { "epoch": 1.6467120875755952, "grad_norm": 1.0571733713150024, "learning_rate": 4.226701645294317e-06, "loss": 0.5603596687316894, "memory(GiB)": 36.87, "step": 1345, "token_acc": 0.8282726557865548, "train_speed(iter/s)": 0.125982 }, { "epoch": 1.652836255071576, "grad_norm": 1.0039043426513672, "learning_rate": 4.195020556092935e-06, "loss": 0.5717378616333008, "memory(GiB)": 36.87, "step": 1350, "token_acc": 0.8221712722738426, "train_speed(iter/s)": 0.126115 }, { "epoch": 1.6589604225675574, "grad_norm": 1.0660555362701416, "learning_rate": 4.1633726105334006e-06, "loss": 0.5500486373901368, "memory(GiB)": 36.87, "step": 1355, "token_acc": 0.8320635850853417, "train_speed(iter/s)": 0.126243 }, { "epoch": 1.6650845900635383, "grad_norm": 0.9174071550369263, "learning_rate": 4.131759111665349e-06, "loss": 0.49724588394165037, "memory(GiB)": 36.87, "step": 1360, "token_acc": 0.8494714160662582, "train_speed(iter/s)": 0.126379 }, { "epoch": 1.6650845900635383, "eval_loss": 0.609088659286499, "eval_runtime": 30.0177, "eval_samples_per_second": 17.556, "eval_steps_per_second": 4.397, "eval_token_acc": 0.8274859238597035, "step": 1360 }, { "epoch": 1.6712087575595191, "grad_norm": 1.0310657024383545, "learning_rate": 4.100181361120136e-06, "loss": 0.5943800926208496, "memory(GiB)": 36.87, "step": 1365, "token_acc": 0.8274678717695241, "train_speed(iter/s)": 0.126048 }, { "epoch": 1.6773329250555002, "grad_norm": 0.947372317314148, "learning_rate": 4.068640659057242e-06, "loss": 0.5227277755737305, "memory(GiB)": 36.87, "step": 1370, "token_acc": 0.8481193255512322, "train_speed(iter/s)": 0.126167 }, { "epoch": 1.6834570925514813, "grad_norm": 1.0015521049499512, "learning_rate": 4.037138304110737e-06, "loss": 0.5239052772521973, "memory(GiB)": 36.87, "step": 1375, "token_acc": 0.8306528880372297, "train_speed(iter/s)": 0.126277 }, { "epoch": 1.6895812600474622, "grad_norm": 1.014237880706787, "learning_rate": 4.005675593335818e-06, "loss": 0.5036933898925782, "memory(GiB)": 36.87, "step": 1380, "token_acc": 0.8474381345177665, "train_speed(iter/s)": 0.126378 }, { "epoch": 1.6895812600474622, "eval_loss": 0.6097399592399597, "eval_runtime": 30.0485, "eval_samples_per_second": 17.538, "eval_steps_per_second": 4.393, "eval_token_acc": 0.8270313549253577, "step": 1380 }, { "epoch": 1.6957054275434433, "grad_norm": 0.970410943031311, "learning_rate": 3.974253822155397e-06, "loss": 0.5157362937927246, "memory(GiB)": 36.87, "step": 1385, "token_acc": 0.8356764264051473, "train_speed(iter/s)": 0.126006 }, { "epoch": 1.7018295950394244, "grad_norm": 0.9698341488838196, "learning_rate": 3.942874284306774e-06, "loss": 0.5165740966796875, "memory(GiB)": 36.87, "step": 1390, "token_acc": 0.852775912640916, "train_speed(iter/s)": 0.126136 }, { "epoch": 1.7079537625354053, "grad_norm": 0.889597475528717, "learning_rate": 3.911538271788359e-06, "loss": 0.5268959999084473, "memory(GiB)": 36.87, "step": 1395, "token_acc": 0.8417914492851819, "train_speed(iter/s)": 0.126261 }, { "epoch": 1.7140779300313862, "grad_norm": 0.9927029609680176, "learning_rate": 3.8802470748064855e-06, "loss": 0.5189975738525391, "memory(GiB)": 36.87, "step": 1400, "token_acc": 0.8465264055174552, "train_speed(iter/s)": 0.126349 }, { "epoch": 1.7140779300313862, "eval_loss": 0.6091334223747253, "eval_runtime": 29.8469, "eval_samples_per_second": 17.657, "eval_steps_per_second": 4.423, "eval_token_acc": 0.8271295004907279, "step": 1400 }, { "epoch": 1.7202020975273675, "grad_norm": 0.9913120865821838, "learning_rate": 3.849001981722285e-06, "loss": 0.5513727188110351, "memory(GiB)": 36.87, "step": 1405, "token_acc": 0.833267143235372, "train_speed(iter/s)": 0.126016 }, { "epoch": 1.7263262650233484, "grad_norm": 0.9658275246620178, "learning_rate": 3.8178042789986355e-06, "loss": 0.5375414371490479, "memory(GiB)": 36.87, "step": 1410, "token_acc": 0.8221340970845267, "train_speed(iter/s)": 0.126151 }, { "epoch": 1.7324504325193293, "grad_norm": 0.9217929244041443, "learning_rate": 3.786655251147204e-06, "loss": 0.5318355560302734, "memory(GiB)": 36.87, "step": 1415, "token_acc": 0.8423896524940057, "train_speed(iter/s)": 0.126267 }, { "epoch": 1.7385746000153104, "grad_norm": 1.0436443090438843, "learning_rate": 3.755556180675547e-06, "loss": 0.5554102897644043, "memory(GiB)": 36.87, "step": 1420, "token_acc": 0.8421204263900893, "train_speed(iter/s)": 0.126374 }, { "epoch": 1.7385746000153104, "eval_loss": 0.6083164215087891, "eval_runtime": 29.8608, "eval_samples_per_second": 17.649, "eval_steps_per_second": 4.421, "eval_token_acc": 0.8271295004907279, "step": 1420 }, { "epoch": 1.7446987675112915, "grad_norm": 1.0527832508087158, "learning_rate": 3.7245083480343225e-06, "loss": 0.5336908817291259, "memory(GiB)": 36.87, "step": 1425, "token_acc": 0.8262728719172633, "train_speed(iter/s)": 0.126038 }, { "epoch": 1.7508229350072724, "grad_norm": 0.9292203187942505, "learning_rate": 3.693513031564549e-06, "loss": 0.5425585746765137, "memory(GiB)": 36.87, "step": 1430, "token_acc": 0.8410889737991266, "train_speed(iter/s)": 0.126155 }, { "epoch": 1.7569471025032535, "grad_norm": 0.9655841588973999, "learning_rate": 3.662571507444986e-06, "loss": 0.5386072158813476, "memory(GiB)": 36.87, "step": 1435, "token_acc": 0.857958101689923, "train_speed(iter/s)": 0.126275 }, { "epoch": 1.7630712699992346, "grad_norm": 0.9359703660011292, "learning_rate": 3.6316850496395863e-06, "loss": 0.5226363658905029, "memory(GiB)": 36.87, "step": 1440, "token_acc": 0.8469182175175004, "train_speed(iter/s)": 0.126376 }, { "epoch": 1.7630712699992346, "eval_loss": 0.6083342432975769, "eval_runtime": 29.9595, "eval_samples_per_second": 17.59, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8270623482617904, "step": 1440 }, { "epoch": 1.7691954374952155, "grad_norm": 1.018633484840393, "learning_rate": 3.6008549298450403e-06, "loss": 0.5337300300598145, "memory(GiB)": 36.87, "step": 1445, "token_acc": 0.8221121123846342, "train_speed(iter/s)": 0.126056 }, { "epoch": 1.7753196049911963, "grad_norm": 0.8938316106796265, "learning_rate": 3.5700824174384196e-06, "loss": 0.47947111129760744, "memory(GiB)": 36.87, "step": 1450, "token_acc": 0.83465726353315, "train_speed(iter/s)": 0.126132 }, { "epoch": 1.7814437724871777, "grad_norm": 0.950809895992279, "learning_rate": 3.5393687794249093e-06, "loss": 0.5499818325042725, "memory(GiB)": 36.87, "step": 1455, "token_acc": 0.8119980392896196, "train_speed(iter/s)": 0.126246 }, { "epoch": 1.7875679399831585, "grad_norm": 0.9783928394317627, "learning_rate": 3.508715280385644e-06, "loss": 0.5239407062530518, "memory(GiB)": 36.87, "step": 1460, "token_acc": 0.8301248357424441, "train_speed(iter/s)": 0.126348 }, { "epoch": 1.7875679399831585, "eval_loss": 0.6072365641593933, "eval_runtime": 29.9698, "eval_samples_per_second": 17.584, "eval_steps_per_second": 4.404, "eval_token_acc": 0.8274497649671987, "step": 1460 }, { "epoch": 1.7936921074791394, "grad_norm": 1.0948944091796875, "learning_rate": 3.478123182425639e-06, "loss": 0.5428466320037841, "memory(GiB)": 36.87, "step": 1465, "token_acc": 0.825891086303621, "train_speed(iter/s)": 0.12604 }, { "epoch": 1.7998162749751205, "grad_norm": 0.8970156311988831, "learning_rate": 3.4475937451218257e-06, "loss": 0.5330904960632324, "memory(GiB)": 36.87, "step": 1470, "token_acc": 0.8345248968536424, "train_speed(iter/s)": 0.126137 }, { "epoch": 1.8059404424711016, "grad_norm": 1.018349528312683, "learning_rate": 3.4171282254711935e-06, "loss": 0.5589166641235351, "memory(GiB)": 36.87, "step": 1475, "token_acc": 0.8269609914096606, "train_speed(iter/s)": 0.126239 }, { "epoch": 1.8120646099670825, "grad_norm": 0.9459341764450073, "learning_rate": 3.386727877839027e-06, "loss": 0.555328369140625, "memory(GiB)": 36.87, "step": 1480, "token_acc": 0.8426273550787036, "train_speed(iter/s)": 0.126358 }, { "epoch": 1.8120646099670825, "eval_loss": 0.6067067980766296, "eval_runtime": 30.0242, "eval_samples_per_second": 17.553, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8277803605558138, "step": 1480 }, { "epoch": 1.8181887774630636, "grad_norm": 0.9838424324989319, "learning_rate": 3.356393953907271e-06, "loss": 0.5277560710906982, "memory(GiB)": 36.87, "step": 1485, "token_acc": 0.831073039771941, "train_speed(iter/s)": 0.126055 }, { "epoch": 1.8243129449590447, "grad_norm": 1.041955590248108, "learning_rate": 3.3261277026229857e-06, "loss": 0.5799334049224854, "memory(GiB)": 36.87, "step": 1490, "token_acc": 0.8321645313553607, "train_speed(iter/s)": 0.126161 }, { "epoch": 1.8304371124550256, "grad_norm": 0.9292726516723633, "learning_rate": 3.2959303701469254e-06, "loss": 0.5210411071777343, "memory(GiB)": 36.87, "step": 1495, "token_acc": 0.8229648473635522, "train_speed(iter/s)": 0.12628 }, { "epoch": 1.8365612799510067, "grad_norm": 0.864344596862793, "learning_rate": 3.2658031998022368e-06, "loss": 0.5165549278259277, "memory(GiB)": 36.87, "step": 1500, "token_acc": 0.8386885030686928, "train_speed(iter/s)": 0.126396 }, { "epoch": 1.8365612799510067, "eval_loss": 0.6069810390472412, "eval_runtime": 30.0285, "eval_samples_per_second": 17.55, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8276202283175784, "step": 1500 }, { "epoch": 1.8426854474469878, "grad_norm": 0.8527396321296692, "learning_rate": 3.2357474320232565e-06, "loss": 0.5021331787109375, "memory(GiB)": 36.87, "step": 1505, "token_acc": 0.8321268481969626, "train_speed(iter/s)": 0.126074 }, { "epoch": 1.8488096149429687, "grad_norm": 0.9290481209754944, "learning_rate": 3.2057643043044452e-06, "loss": 0.5180329322814942, "memory(GiB)": 36.87, "step": 1510, "token_acc": 0.8345307220417938, "train_speed(iter/s)": 0.12617 }, { "epoch": 1.8549337824389496, "grad_norm": 0.9051028490066528, "learning_rate": 3.1758550511494336e-06, "loss": 0.5452617645263672, "memory(GiB)": 36.87, "step": 1515, "token_acc": 0.8341819137404329, "train_speed(iter/s)": 0.126302 }, { "epoch": 1.8610579499349307, "grad_norm": 1.0172206163406372, "learning_rate": 3.1460209040201967e-06, "loss": 0.5237324237823486, "memory(GiB)": 36.87, "step": 1520, "token_acc": 0.8367892176409603, "train_speed(iter/s)": 0.126404 }, { "epoch": 1.8610579499349307, "eval_loss": 0.6056584715843201, "eval_runtime": 29.9559, "eval_samples_per_second": 17.593, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8284363861769719, "step": 1520 }, { "epoch": 1.8671821174309118, "grad_norm": 0.9731259942054749, "learning_rate": 3.116263091286344e-06, "loss": 0.5423327445983886, "memory(GiB)": 36.87, "step": 1525, "token_acc": 0.8284452097329645, "train_speed(iter/s)": 0.126099 }, { "epoch": 1.8733062849268927, "grad_norm": 0.9437146782875061, "learning_rate": 3.0865828381745515e-06, "loss": 0.5558583736419678, "memory(GiB)": 36.87, "step": 1530, "token_acc": 0.828622035858878, "train_speed(iter/s)": 0.126204 }, { "epoch": 1.8794304524228738, "grad_norm": 1.0038166046142578, "learning_rate": 3.056981366718111e-06, "loss": 0.5397710800170898, "memory(GiB)": 36.87, "step": 1535, "token_acc": 0.8308211163879138, "train_speed(iter/s)": 0.12631 }, { "epoch": 1.8855546199188549, "grad_norm": 0.8471850156784058, "learning_rate": 3.0274598957066132e-06, "loss": 0.4804985523223877, "memory(GiB)": 36.87, "step": 1540, "token_acc": 0.8619780378558012, "train_speed(iter/s)": 0.126389 }, { "epoch": 1.8855546199188549, "eval_loss": 0.6056827306747437, "eval_runtime": 29.9968, "eval_samples_per_second": 17.569, "eval_steps_per_second": 4.4, "eval_token_acc": 0.8282245983780154, "step": 1540 }, { "epoch": 1.8916787874148357, "grad_norm": 0.938709557056427, "learning_rate": 2.998019640635772e-06, "loss": 0.5519435405731201, "memory(GiB)": 36.87, "step": 1545, "token_acc": 0.8257935412641457, "train_speed(iter/s)": 0.126081 }, { "epoch": 1.8978029549108169, "grad_norm": 0.9090867638587952, "learning_rate": 2.96866181365737e-06, "loss": 0.5426124572753906, "memory(GiB)": 36.87, "step": 1550, "token_acc": 0.843109962219129, "train_speed(iter/s)": 0.126179 }, { "epoch": 1.903927122406798, "grad_norm": 0.8900991678237915, "learning_rate": 2.9393876235293578e-06, "loss": 0.510080623626709, "memory(GiB)": 36.87, "step": 1555, "token_acc": 0.8357965621123515, "train_speed(iter/s)": 0.126271 }, { "epoch": 1.9100512899027788, "grad_norm": 0.8838712573051453, "learning_rate": 2.910198275566085e-06, "loss": 0.5103748321533204, "memory(GiB)": 36.87, "step": 1560, "token_acc": 0.8447406983809286, "train_speed(iter/s)": 0.126379 }, { "epoch": 1.9100512899027788, "eval_loss": 0.6069024205207825, "eval_runtime": 29.8963, "eval_samples_per_second": 17.628, "eval_steps_per_second": 4.415, "eval_token_acc": 0.8283692339480345, "step": 1560 }, { "epoch": 1.9161754573987597, "grad_norm": 1.0007625818252563, "learning_rate": 2.881094971588666e-06, "loss": 0.5161759853363037, "memory(GiB)": 36.87, "step": 1565, "token_acc": 0.8315062300454892, "train_speed(iter/s)": 0.126082 }, { "epoch": 1.9222996248947408, "grad_norm": 0.9922500848770142, "learning_rate": 2.8520789098755053e-06, "loss": 0.5415813446044921, "memory(GiB)": 36.87, "step": 1570, "token_acc": 0.8527313346785818, "train_speed(iter/s)": 0.126203 }, { "epoch": 1.928423792390722, "grad_norm": 0.9492520093917847, "learning_rate": 2.8231512851129596e-06, "loss": 0.5504971981048584, "memory(GiB)": 36.87, "step": 1575, "token_acc": 0.8186260917787328, "train_speed(iter/s)": 0.126306 }, { "epoch": 1.9345479598867028, "grad_norm": 0.9212282299995422, "learning_rate": 2.7943132883461434e-06, "loss": 0.547866678237915, "memory(GiB)": 36.87, "step": 1580, "token_acc": 0.83892855733954, "train_speed(iter/s)": 0.126413 }, { "epoch": 1.9345479598867028, "eval_loss": 0.6043635010719299, "eval_runtime": 29.9792, "eval_samples_per_second": 17.579, "eval_steps_per_second": 4.403, "eval_token_acc": 0.8282814194948086, "step": 1580 }, { "epoch": 1.940672127382684, "grad_norm": 1.0419548749923706, "learning_rate": 2.7655661069298934e-06, "loss": 0.5519622325897217, "memory(GiB)": 36.87, "step": 1585, "token_acc": 0.8333945887874247, "train_speed(iter/s)": 0.126139 }, { "epoch": 1.946796294878665, "grad_norm": 1.043246865272522, "learning_rate": 2.736910924479881e-06, "loss": 0.5610580921173096, "memory(GiB)": 36.87, "step": 1590, "token_acc": 0.8335689174006017, "train_speed(iter/s)": 0.126246 }, { "epoch": 1.952920462374646, "grad_norm": 0.9222803711891174, "learning_rate": 2.7083489208238784e-06, "loss": 0.5393799304962158, "memory(GiB)": 36.87, "step": 1595, "token_acc": 0.8198356395308001, "train_speed(iter/s)": 0.126365 }, { "epoch": 1.959044629870627, "grad_norm": 0.927827775478363, "learning_rate": 2.6798812719531843e-06, "loss": 0.5392462730407714, "memory(GiB)": 36.87, "step": 1600, "token_acc": 0.8314373587282766, "train_speed(iter/s)": 0.126473 }, { "epoch": 1.959044629870627, "eval_loss": 0.6057147979736328, "eval_runtime": 29.992, "eval_samples_per_second": 17.571, "eval_steps_per_second": 4.401, "eval_token_acc": 0.8276408905418668, "step": 1600 }, { "epoch": 1.965168797366608, "grad_norm": 1.0039829015731812, "learning_rate": 2.6515091499741946e-06, "loss": 0.5505844116210937, "memory(GiB)": 36.87, "step": 1605, "token_acc": 0.8264420910319964, "train_speed(iter/s)": 0.126174 }, { "epoch": 1.971292964862589, "grad_norm": 0.9382634162902832, "learning_rate": 2.623233723060157e-06, "loss": 0.5243973731994629, "memory(GiB)": 36.87, "step": 1610, "token_acc": 0.8320176612255821, "train_speed(iter/s)": 0.126271 }, { "epoch": 1.9774171323585699, "grad_norm": 0.9044788479804993, "learning_rate": 2.595056155403063e-06, "loss": 0.48435544967651367, "memory(GiB)": 36.87, "step": 1615, "token_acc": 0.843847529543781, "train_speed(iter/s)": 0.126372 }, { "epoch": 1.9835412998545512, "grad_norm": 0.9093387722969055, "learning_rate": 2.5669776071657194e-06, "loss": 0.515876293182373, "memory(GiB)": 36.87, "step": 1620, "token_acc": 0.8395097436639068, "train_speed(iter/s)": 0.126455 }, { "epoch": 1.9835412998545512, "eval_loss": 0.6055964231491089, "eval_runtime": 29.9496, "eval_samples_per_second": 17.596, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8280644661397799, "step": 1620 }, { "epoch": 1.989665467350532, "grad_norm": 1.0032296180725098, "learning_rate": 2.5389992344339787e-06, "loss": 0.5630090713500977, "memory(GiB)": 36.87, "step": 1625, "token_acc": 0.8223650962996237, "train_speed(iter/s)": 0.126171 }, { "epoch": 1.995789634846513, "grad_norm": 0.9951412677764893, "learning_rate": 2.5111221891691384e-06, "loss": 0.5040010452270508, "memory(GiB)": 36.87, "step": 1630, "token_acc": 0.8500885437951233, "train_speed(iter/s)": 0.126256 }, { "epoch": 2.001224833499196, "grad_norm": 1.4779112339019775, "learning_rate": 2.4833476191605136e-06, "loss": 0.514947509765625, "memory(GiB)": 36.87, "step": 1635, "token_acc": 0.8120487926313169, "train_speed(iter/s)": 0.126393 }, { "epoch": 2.0073490009951773, "grad_norm": 0.9851377010345459, "learning_rate": 2.4556766679781763e-06, "loss": 0.4878593921661377, "memory(GiB)": 36.87, "step": 1640, "token_acc": 0.8328894582476486, "train_speed(iter/s)": 0.126502 }, { "epoch": 2.0073490009951773, "eval_loss": 0.6103575825691223, "eval_runtime": 29.9214, "eval_samples_per_second": 17.613, "eval_steps_per_second": 4.412, "eval_token_acc": 0.8283795650601787, "step": 1640 }, { "epoch": 2.013473168491158, "grad_norm": 0.9571102857589722, "learning_rate": 2.4281104749258716e-06, "loss": 0.49354209899902346, "memory(GiB)": 36.87, "step": 1645, "token_acc": 0.832244552629024, "train_speed(iter/s)": 0.126228 }, { "epoch": 2.019597335987139, "grad_norm": 0.9675928354263306, "learning_rate": 2.4006501749941097e-06, "loss": 0.47706212997436526, "memory(GiB)": 36.87, "step": 1650, "token_acc": 0.8675657501494322, "train_speed(iter/s)": 0.126296 }, { "epoch": 2.0257215034831204, "grad_norm": 0.9996489882469177, "learning_rate": 2.3732968988134343e-06, "loss": 0.4821828842163086, "memory(GiB)": 36.87, "step": 1655, "token_acc": 0.8707286339040842, "train_speed(iter/s)": 0.126397 }, { "epoch": 2.0318456709791013, "grad_norm": 1.0168769359588623, "learning_rate": 2.3460517726078696e-06, "loss": 0.47524452209472656, "memory(GiB)": 36.87, "step": 1660, "token_acc": 0.855996970531534, "train_speed(iter/s)": 0.126479 }, { "epoch": 2.0318456709791013, "eval_loss": 0.6196611523628235, "eval_runtime": 29.918, "eval_samples_per_second": 17.615, "eval_steps_per_second": 4.412, "eval_token_acc": 0.8268350637946175, "step": 1660 }, { "epoch": 2.037969838475082, "grad_norm": 0.9725021123886108, "learning_rate": 2.3189159181485517e-06, "loss": 0.4909340858459473, "memory(GiB)": 36.87, "step": 1665, "token_acc": 0.8357392077717726, "train_speed(iter/s)": 0.126201 }, { "epoch": 2.0440940059710635, "grad_norm": 0.9119012355804443, "learning_rate": 2.291890452707539e-06, "loss": 0.4890812873840332, "memory(GiB)": 36.87, "step": 1670, "token_acc": 0.8586991348926626, "train_speed(iter/s)": 0.126291 }, { "epoch": 2.0502181734670444, "grad_norm": 1.0054688453674316, "learning_rate": 2.2649764890118158e-06, "loss": 0.49579925537109376, "memory(GiB)": 36.87, "step": 1675, "token_acc": 0.8483596157331883, "train_speed(iter/s)": 0.126396 }, { "epoch": 2.0563423409630253, "grad_norm": 0.9014572501182556, "learning_rate": 2.238175135197471e-06, "loss": 0.47943267822265623, "memory(GiB)": 36.87, "step": 1680, "token_acc": 0.8587532153124527, "train_speed(iter/s)": 0.126492 }, { "epoch": 2.0563423409630253, "eval_loss": 0.6154988408088684, "eval_runtime": 29.9636, "eval_samples_per_second": 17.588, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8270726793739346, "step": 1680 }, { "epoch": 2.062466508459006, "grad_norm": 0.938705563545227, "learning_rate": 2.2114874947640763e-06, "loss": 0.45625782012939453, "memory(GiB)": 36.87, "step": 1685, "token_acc": 0.8355256733948025, "train_speed(iter/s)": 0.126208 }, { "epoch": 2.0685906759549875, "grad_norm": 0.8989147543907166, "learning_rate": 2.1849146665292513e-06, "loss": 0.46575441360473635, "memory(GiB)": 36.87, "step": 1690, "token_acc": 0.8795854481354284, "train_speed(iter/s)": 0.126306 }, { "epoch": 2.0747148434509683, "grad_norm": 0.9596337080001831, "learning_rate": 2.1584577445834234e-06, "loss": 0.48124160766601565, "memory(GiB)": 36.87, "step": 1695, "token_acc": 0.8413667107206717, "train_speed(iter/s)": 0.126388 }, { "epoch": 2.0808390109469492, "grad_norm": 0.788873016834259, "learning_rate": 2.132117818244771e-06, "loss": 0.46569390296936036, "memory(GiB)": 36.87, "step": 1700, "token_acc": 0.8618104667609618, "train_speed(iter/s)": 0.12649 }, { "epoch": 2.0808390109469492, "eval_loss": 0.6172361373901367, "eval_runtime": 29.9107, "eval_samples_per_second": 17.619, "eval_steps_per_second": 4.413, "eval_token_acc": 0.8270313549253577, "step": 1700 }, { "epoch": 2.0869631784429306, "grad_norm": 1.0761973857879639, "learning_rate": 2.1058959720143875e-06, "loss": 0.4640150547027588, "memory(GiB)": 36.87, "step": 1705, "token_acc": 0.8384597955079729, "train_speed(iter/s)": 0.126222 }, { "epoch": 2.0930873459389114, "grad_norm": 0.9404869675636292, "learning_rate": 2.0797932855316183e-06, "loss": 0.48186473846435546, "memory(GiB)": 36.87, "step": 1710, "token_acc": 0.8572665858305907, "train_speed(iter/s)": 0.126305 }, { "epoch": 2.0992115134348923, "grad_norm": 0.979210376739502, "learning_rate": 2.0538108335296107e-06, "loss": 0.4823300361633301, "memory(GiB)": 36.87, "step": 1715, "token_acc": 0.8577344523032946, "train_speed(iter/s)": 0.1264 }, { "epoch": 2.1053356809308736, "grad_norm": 0.9393882751464844, "learning_rate": 2.0279496857910667e-06, "loss": 0.48357486724853516, "memory(GiB)": 36.87, "step": 1720, "token_acc": 0.8569940863614386, "train_speed(iter/s)": 0.126508 }, { "epoch": 2.1053356809308736, "eval_loss": 0.6190218329429626, "eval_runtime": 30.0103, "eval_samples_per_second": 17.561, "eval_steps_per_second": 4.398, "eval_token_acc": 0.826953871584276, "step": 1720 }, { "epoch": 2.1114598484268545, "grad_norm": 0.9903694987297058, "learning_rate": 2.0022109071041905e-06, "loss": 0.485797643661499, "memory(GiB)": 36.87, "step": 1725, "token_acc": 0.8363958585952803, "train_speed(iter/s)": 0.126264 }, { "epoch": 2.1175840159228354, "grad_norm": 0.8971183896064758, "learning_rate": 1.9765955572188578e-06, "loss": 0.468338680267334, "memory(GiB)": 36.87, "step": 1730, "token_acc": 0.850854499843211, "train_speed(iter/s)": 0.126332 }, { "epoch": 2.1237081834188167, "grad_norm": 0.8892176151275635, "learning_rate": 1.951104690802969e-06, "loss": 0.45011487007141116, "memory(GiB)": 36.87, "step": 1735, "token_acc": 0.8456293706293706, "train_speed(iter/s)": 0.126424 }, { "epoch": 2.1298323509147976, "grad_norm": 0.9592292904853821, "learning_rate": 1.925739357399038e-06, "loss": 0.45401706695556643, "memory(GiB)": 36.87, "step": 1740, "token_acc": 0.8367899677215894, "train_speed(iter/s)": 0.126512 }, { "epoch": 2.1298323509147976, "eval_loss": 0.6162592768669128, "eval_runtime": 29.8564, "eval_samples_per_second": 17.651, "eval_steps_per_second": 4.421, "eval_token_acc": 0.8268247326824733, "step": 1740 }, { "epoch": 2.1359565184107785, "grad_norm": 0.9750301241874695, "learning_rate": 1.9005006013809662e-06, "loss": 0.5132875442504883, "memory(GiB)": 36.87, "step": 1745, "token_acc": 0.8310148067894547, "train_speed(iter/s)": 0.126241 }, { "epoch": 2.1420806859067594, "grad_norm": 1.0531848669052124, "learning_rate": 1.8753894619110547e-06, "loss": 0.4934427261352539, "memory(GiB)": 36.87, "step": 1750, "token_acc": 0.8594569186824312, "train_speed(iter/s)": 0.126339 }, { "epoch": 2.1482048534027407, "grad_norm": 0.9899281859397888, "learning_rate": 1.8504069728972124e-06, "loss": 0.5067736625671386, "memory(GiB)": 36.87, "step": 1755, "token_acc": 0.8443969645619981, "train_speed(iter/s)": 0.126433 }, { "epoch": 2.1543290208987216, "grad_norm": 0.9110437035560608, "learning_rate": 1.8255541629503865e-06, "loss": 0.43926572799682617, "memory(GiB)": 36.87, "step": 1760, "token_acc": 0.8647777628575265, "train_speed(iter/s)": 0.126537 }, { "epoch": 2.1543290208987216, "eval_loss": 0.6184687614440918, "eval_runtime": 29.9388, "eval_samples_per_second": 17.603, "eval_steps_per_second": 4.409, "eval_token_acc": 0.8269796993646366, "step": 1760 }, { "epoch": 2.1604531883947025, "grad_norm": 0.956470251083374, "learning_rate": 1.8008320553422116e-06, "loss": 0.48296318054199217, "memory(GiB)": 36.87, "step": 1765, "token_acc": 0.8321491877005113, "train_speed(iter/s)": 0.126272 }, { "epoch": 2.166577355890684, "grad_norm": 0.990215003490448, "learning_rate": 1.7762416679628792e-06, "loss": 0.4733391761779785, "memory(GiB)": 36.87, "step": 1770, "token_acc": 0.8625864925445863, "train_speed(iter/s)": 0.126373 }, { "epoch": 2.1727015233866647, "grad_norm": 0.9296258687973022, "learning_rate": 1.751784013279228e-06, "loss": 0.4612305164337158, "memory(GiB)": 36.87, "step": 1775, "token_acc": 0.8483617060223321, "train_speed(iter/s)": 0.126475 }, { "epoch": 2.1788256908826455, "grad_norm": 0.9242532253265381, "learning_rate": 1.7274600982930544e-06, "loss": 0.4506662368774414, "memory(GiB)": 36.87, "step": 1780, "token_acc": 0.8471429097741591, "train_speed(iter/s)": 0.126543 }, { "epoch": 2.1788256908826455, "eval_loss": 0.6181926131248474, "eval_runtime": 29.9341, "eval_samples_per_second": 17.605, "eval_steps_per_second": 4.41, "eval_token_acc": 0.8270055271449972, "step": 1780 }, { "epoch": 2.1849498583786264, "grad_norm": 0.9514071345329285, "learning_rate": 1.7032709244996559e-06, "loss": 0.45079612731933594, "memory(GiB)": 36.87, "step": 1785, "token_acc": 0.8306619810862547, "train_speed(iter/s)": 0.126266 }, { "epoch": 2.1910740258746078, "grad_norm": 0.8734022974967957, "learning_rate": 1.6792174878465933e-06, "loss": 0.4914576530456543, "memory(GiB)": 36.87, "step": 1790, "token_acc": 0.8544291529366156, "train_speed(iter/s)": 0.126359 }, { "epoch": 2.1971981933705886, "grad_norm": 0.9682619571685791, "learning_rate": 1.65530077869268e-06, "loss": 0.46589956283569334, "memory(GiB)": 36.87, "step": 1795, "token_acc": 0.856334134219794, "train_speed(iter/s)": 0.126455 }, { "epoch": 2.2033223608665695, "grad_norm": 0.9326309561729431, "learning_rate": 1.6315217817672142e-06, "loss": 0.4956002712249756, "memory(GiB)": 36.87, "step": 1800, "token_acc": 0.851890756302521, "train_speed(iter/s)": 0.126552 }, { "epoch": 2.2033223608665695, "eval_loss": 0.6174434423446655, "eval_runtime": 29.957, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8268092360142569, "step": 1800 }, { "epoch": 2.209446528362551, "grad_norm": 1.0249996185302734, "learning_rate": 1.607881476129432e-06, "loss": 0.480439281463623, "memory(GiB)": 36.87, "step": 1805, "token_acc": 0.8305227415396907, "train_speed(iter/s)": 0.126269 }, { "epoch": 2.2155706958585317, "grad_norm": 0.9621463418006897, "learning_rate": 1.5843808351281913e-06, "loss": 0.4549149513244629, "memory(GiB)": 36.87, "step": 1810, "token_acc": 0.8661414578031861, "train_speed(iter/s)": 0.126344 }, { "epoch": 2.2216948633545126, "grad_norm": 0.977536141872406, "learning_rate": 1.5610208263619002e-06, "loss": 0.48578948974609376, "memory(GiB)": 36.87, "step": 1815, "token_acc": 0.8355557119234422, "train_speed(iter/s)": 0.12644 }, { "epoch": 2.227819030850494, "grad_norm": 0.9830949902534485, "learning_rate": 1.537802411638677e-06, "loss": 0.4825616359710693, "memory(GiB)": 36.87, "step": 1820, "token_acc": 0.8571990136868813, "train_speed(iter/s)": 0.126532 }, { "epoch": 2.227819030850494, "eval_loss": 0.6185752749443054, "eval_runtime": 29.9623, "eval_samples_per_second": 17.589, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8268195671264011, "step": 1820 }, { "epoch": 2.233943198346475, "grad_norm": 0.8971763253211975, "learning_rate": 1.514726546936749e-06, "loss": 0.4621254920959473, "memory(GiB)": 36.87, "step": 1825, "token_acc": 0.8387506208266652, "train_speed(iter/s)": 0.126271 }, { "epoch": 2.2400673658424557, "grad_norm": 0.8897117972373962, "learning_rate": 1.4917941823650917e-06, "loss": 0.4865126609802246, "memory(GiB)": 36.87, "step": 1830, "token_acc": 0.8466782763348031, "train_speed(iter/s)": 0.126353 }, { "epoch": 2.246191533338437, "grad_norm": 0.9670534133911133, "learning_rate": 1.4690062621243117e-06, "loss": 0.4749399185180664, "memory(GiB)": 36.87, "step": 1835, "token_acc": 0.8677187834569174, "train_speed(iter/s)": 0.126441 }, { "epoch": 2.252315700834418, "grad_norm": 0.8603255152702332, "learning_rate": 1.4463637244677648e-06, "loss": 0.46147994995117186, "memory(GiB)": 36.87, "step": 1840, "token_acc": 0.8514672004229767, "train_speed(iter/s)": 0.126525 }, { "epoch": 2.252315700834418, "eval_loss": 0.618859052658081, "eval_runtime": 29.975, "eval_samples_per_second": 17.581, "eval_steps_per_second": 4.404, "eval_token_acc": 0.8268815537992665, "step": 1840 }, { "epoch": 2.258439868330399, "grad_norm": 0.9529172778129578, "learning_rate": 1.423867501662934e-06, "loss": 0.4659478187561035, "memory(GiB)": 36.87, "step": 1845, "token_acc": 0.8376929823340248, "train_speed(iter/s)": 0.126266 }, { "epoch": 2.2645640358263797, "grad_norm": 0.9903680086135864, "learning_rate": 1.4015185199530378e-06, "loss": 0.4695383071899414, "memory(GiB)": 36.87, "step": 1850, "token_acc": 0.8556800687408256, "train_speed(iter/s)": 0.126353 }, { "epoch": 2.270688203322361, "grad_norm": 0.9149890542030334, "learning_rate": 1.379317699518898e-06, "loss": 0.47596092224121095, "memory(GiB)": 36.87, "step": 1855, "token_acc": 0.8519627185522824, "train_speed(iter/s)": 0.126419 }, { "epoch": 2.276812370818342, "grad_norm": 0.8692817091941833, "learning_rate": 1.3572659544410493e-06, "loss": 0.43576741218566895, "memory(GiB)": 36.87, "step": 1860, "token_acc": 0.8630936883995983, "train_speed(iter/s)": 0.126518 }, { "epoch": 2.276812370818342, "eval_loss": 0.6181974411010742, "eval_runtime": 30.0408, "eval_samples_per_second": 17.543, "eval_steps_per_second": 4.394, "eval_token_acc": 0.8271088382664393, "step": 1860 }, { "epoch": 2.2829365383143227, "grad_norm": 0.9488633871078491, "learning_rate": 1.3353641926621065e-06, "loss": 0.45254907608032224, "memory(GiB)": 36.87, "step": 1865, "token_acc": 0.8336496980155307, "train_speed(iter/s)": 0.126266 }, { "epoch": 2.289060705810304, "grad_norm": 1.0025756359100342, "learning_rate": 1.3136133159493803e-06, "loss": 0.4933184623718262, "memory(GiB)": 36.87, "step": 1870, "token_acc": 0.8573630940411556, "train_speed(iter/s)": 0.126361 }, { "epoch": 2.295184873306285, "grad_norm": 0.8357995748519897, "learning_rate": 1.2920142198577484e-06, "loss": 0.45499467849731445, "memory(GiB)": 36.87, "step": 1875, "token_acc": 0.8629192723138147, "train_speed(iter/s)": 0.126432 }, { "epoch": 2.301309040802266, "grad_norm": 0.9138444066047668, "learning_rate": 1.2705677936927841e-06, "loss": 0.4767561435699463, "memory(GiB)": 36.87, "step": 1880, "token_acc": 0.8521506375701698, "train_speed(iter/s)": 0.126523 }, { "epoch": 2.301309040802266, "eval_loss": 0.6184601187705994, "eval_runtime": 30.0434, "eval_samples_per_second": 17.541, "eval_steps_per_second": 4.394, "eval_token_acc": 0.8273567849579008, "step": 1880 }, { "epoch": 2.3074332082982467, "grad_norm": 0.9720640182495117, "learning_rate": 1.2492749204741368e-06, "loss": 0.4715888500213623, "memory(GiB)": 36.87, "step": 1885, "token_acc": 0.8328760826785792, "train_speed(iter/s)": 0.126264 }, { "epoch": 2.313557375794228, "grad_norm": 1.062354564666748, "learning_rate": 1.2281364768991804e-06, "loss": 0.4756108283996582, "memory(GiB)": 36.87, "step": 1890, "token_acc": 0.8549824466648663, "train_speed(iter/s)": 0.126366 }, { "epoch": 2.319681543290209, "grad_norm": 1.040152907371521, "learning_rate": 1.207153333306914e-06, "loss": 0.457261848449707, "memory(GiB)": 36.87, "step": 1895, "token_acc": 0.858182628393182, "train_speed(iter/s)": 0.126458 }, { "epoch": 2.32580571078619, "grad_norm": 0.9648529887199402, "learning_rate": 1.1863263536421261e-06, "loss": 0.49726166725158694, "memory(GiB)": 36.87, "step": 1900, "token_acc": 0.8323407202216067, "train_speed(iter/s)": 0.126559 }, { "epoch": 2.32580571078619, "eval_loss": 0.6168169975280762, "eval_runtime": 30.03, "eval_samples_per_second": 17.549, "eval_steps_per_second": 4.396, "eval_token_acc": 0.8270106927010693, "step": 1900 }, { "epoch": 2.331929878282171, "grad_norm": 0.9079554677009583, "learning_rate": 1.1656563954198258e-06, "loss": 0.5002402305603028, "memory(GiB)": 36.87, "step": 1905, "token_acc": 0.8321498929943166, "train_speed(iter/s)": 0.126312 }, { "epoch": 2.338054045778152, "grad_norm": 1.007360816001892, "learning_rate": 1.145144309689934e-06, "loss": 0.4659921646118164, "memory(GiB)": 36.87, "step": 1910, "token_acc": 0.8422997172478793, "train_speed(iter/s)": 0.126382 }, { "epoch": 2.344178213274133, "grad_norm": 1.0213356018066406, "learning_rate": 1.1247909410022434e-06, "loss": 0.46290979385375974, "memory(GiB)": 36.87, "step": 1915, "token_acc": 0.8491659285503396, "train_speed(iter/s)": 0.12648 }, { "epoch": 2.350302380770114, "grad_norm": 1.1062732934951782, "learning_rate": 1.1045971273716476e-06, "loss": 0.4558609962463379, "memory(GiB)": 36.87, "step": 1920, "token_acc": 0.8681887684181262, "train_speed(iter/s)": 0.126559 }, { "epoch": 2.350302380770114, "eval_loss": 0.6178110837936401, "eval_runtime": 29.9953, "eval_samples_per_second": 17.569, "eval_steps_per_second": 4.401, "eval_token_acc": 0.8270985071542951, "step": 1920 }, { "epoch": 2.356426548266095, "grad_norm": 0.9201487302780151, "learning_rate": 1.0845637002436344e-06, "loss": 0.46529560089111327, "memory(GiB)": 36.87, "step": 1925, "token_acc": 0.8382993992876508, "train_speed(iter/s)": 0.126335 }, { "epoch": 2.362550715762076, "grad_norm": 2.0007822513580322, "learning_rate": 1.0646914844600543e-06, "loss": 0.46782960891723635, "memory(GiB)": 36.87, "step": 1930, "token_acc": 0.8615680194148577, "train_speed(iter/s)": 0.126411 }, { "epoch": 2.3686748832580573, "grad_norm": 1.0234315395355225, "learning_rate": 1.0449812982251556e-06, "loss": 0.4937599658966064, "memory(GiB)": 36.87, "step": 1935, "token_acc": 0.8580788129877638, "train_speed(iter/s)": 0.126511 }, { "epoch": 2.374799050754038, "grad_norm": 0.964102566242218, "learning_rate": 1.0254339530719031e-06, "loss": 0.49028477668762205, "memory(GiB)": 36.87, "step": 1940, "token_acc": 0.8427895540736877, "train_speed(iter/s)": 0.126597 }, { "epoch": 2.374799050754038, "eval_loss": 0.618255078792572, "eval_runtime": 29.9499, "eval_samples_per_second": 17.596, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8271708249393047, "step": 1940 }, { "epoch": 2.380923218250019, "grad_norm": 0.9083016514778137, "learning_rate": 1.0060502538285582e-06, "loss": 0.47533645629882815, "memory(GiB)": 36.87, "step": 1945, "token_acc": 0.8339908186042594, "train_speed(iter/s)": 0.126344 }, { "epoch": 2.3870473857460004, "grad_norm": 0.9915279746055603, "learning_rate": 9.868309985855446e-07, "loss": 0.4681232452392578, "memory(GiB)": 36.87, "step": 1950, "token_acc": 0.8487209179913675, "train_speed(iter/s)": 0.126421 }, { "epoch": 2.3931715532419813, "grad_norm": 0.9561747312545776, "learning_rate": 9.677769786625869e-07, "loss": 0.48569955825805666, "memory(GiB)": 36.87, "step": 1955, "token_acc": 0.8486834496318285, "train_speed(iter/s)": 0.126494 }, { "epoch": 2.399295720737962, "grad_norm": 0.9286803603172302, "learning_rate": 9.488889785761324e-07, "loss": 0.44054179191589354, "memory(GiB)": 36.87, "step": 1960, "token_acc": 0.8660492977141283, "train_speed(iter/s)": 0.126575 }, { "epoch": 2.399295720737962, "eval_loss": 0.6190705895423889, "eval_runtime": 29.9923, "eval_samples_per_second": 17.571, "eval_steps_per_second": 4.401, "eval_token_acc": 0.8266181104395888, "step": 1960 }, { "epoch": 2.405419888233943, "grad_norm": 1.0242820978164673, "learning_rate": 9.301677760070449e-07, "loss": 0.4897134304046631, "memory(GiB)": 36.87, "step": 1965, "token_acc": 0.8352815571190013, "train_speed(iter/s)": 0.126327 }, { "epoch": 2.4115440557299244, "grad_norm": 0.939855694770813, "learning_rate": 9.116141417685898e-07, "loss": 0.45674614906311034, "memory(GiB)": 36.87, "step": 1970, "token_acc": 0.8488702986251586, "train_speed(iter/s)": 0.126411 }, { "epoch": 2.4176682232259052, "grad_norm": 0.9036867022514343, "learning_rate": 8.932288397746919e-07, "loss": 0.4510343074798584, "memory(GiB)": 36.87, "step": 1975, "token_acc": 0.8560777957860616, "train_speed(iter/s)": 0.126495 }, { "epoch": 2.423792390721886, "grad_norm": 0.9866623878479004, "learning_rate": 8.750126270084891e-07, "loss": 0.4746750831604004, "memory(GiB)": 36.87, "step": 1980, "token_acc": 0.8668764857535072, "train_speed(iter/s)": 0.126576 }, { "epoch": 2.423792390721886, "eval_loss": 0.6186906099319458, "eval_runtime": 29.9649, "eval_samples_per_second": 17.587, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8270881760421509, "step": 1980 }, { "epoch": 2.4299165582178675, "grad_norm": 0.9331910610198975, "learning_rate": 8.569662534911605e-07, "loss": 0.4652440071105957, "memory(GiB)": 36.87, "step": 1985, "token_acc": 0.8312124178436582, "train_speed(iter/s)": 0.126348 }, { "epoch": 2.4360407257138483, "grad_norm": 0.8783546686172485, "learning_rate": 8.390904622510471e-07, "loss": 0.43751039505004885, "memory(GiB)": 36.87, "step": 1990, "token_acc": 0.8822175660357916, "train_speed(iter/s)": 0.126428 }, { "epoch": 2.442164893209829, "grad_norm": 0.9757615923881531, "learning_rate": 8.213859892930581e-07, "loss": 0.4832446098327637, "memory(GiB)": 36.87, "step": 1995, "token_acc": 0.8358402898094188, "train_speed(iter/s)": 0.126498 }, { "epoch": 2.44828906070581, "grad_norm": 0.9304143190383911, "learning_rate": 8.03853563568367e-07, "loss": 0.46181411743164064, "memory(GiB)": 36.87, "step": 2000, "token_acc": 0.8611440231700895, "train_speed(iter/s)": 0.126561 }, { "epoch": 2.44828906070581, "eval_loss": 0.6176819205284119, "eval_runtime": 29.9501, "eval_samples_per_second": 17.596, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8269900304767809, "step": 2000 }, { "epoch": 2.4544132282017914, "grad_norm": 0.9659498333930969, "learning_rate": 7.864939069444006e-07, "loss": 0.47965211868286134, "memory(GiB)": 36.87, "step": 2005, "token_acc": 0.831640513163097, "train_speed(iter/s)": 0.126337 }, { "epoch": 2.4605373956977723, "grad_norm": 1.0172474384307861, "learning_rate": 7.693077341751138e-07, "loss": 0.4738880157470703, "memory(GiB)": 36.87, "step": 2010, "token_acc": 0.8708014805078004, "train_speed(iter/s)": 0.126397 }, { "epoch": 2.466661563193753, "grad_norm": 0.9642378091812134, "learning_rate": 7.522957528715636e-07, "loss": 0.4847827911376953, "memory(GiB)": 36.87, "step": 2015, "token_acc": 0.8358588871654138, "train_speed(iter/s)": 0.126477 }, { "epoch": 2.4727857306897345, "grad_norm": 1.015015721321106, "learning_rate": 7.354586634727729e-07, "loss": 0.48462276458740233, "memory(GiB)": 36.87, "step": 2020, "token_acc": 0.8585737976782752, "train_speed(iter/s)": 0.126571 }, { "epoch": 2.4727857306897345, "eval_loss": 0.6191264986991882, "eval_runtime": 29.948, "eval_samples_per_second": 17.597, "eval_steps_per_second": 4.408, "eval_token_acc": 0.8270416860375019, "step": 2020 }, { "epoch": 2.4789098981857154, "grad_norm": 0.9471215009689331, "learning_rate": 7.187971592168936e-07, "loss": 0.4690380096435547, "memory(GiB)": 36.87, "step": 2025, "token_acc": 0.83595499208511, "train_speed(iter/s)": 0.126356 }, { "epoch": 2.4850340656816963, "grad_norm": 0.9936275482177734, "learning_rate": 7.023119261126571e-07, "loss": 0.4644585609436035, "memory(GiB)": 36.87, "step": 2030, "token_acc": 0.8678402074165046, "train_speed(iter/s)": 0.126441 }, { "epoch": 2.4911582331776776, "grad_norm": 0.9575950503349304, "learning_rate": 6.860036429111394e-07, "loss": 0.4721442699432373, "memory(GiB)": 36.87, "step": 2035, "token_acc": 0.8546081813701331, "train_speed(iter/s)": 0.126503 }, { "epoch": 2.4972824006736585, "grad_norm": 1.002123475074768, "learning_rate": 6.698729810778065e-07, "loss": 0.4855657577514648, "memory(GiB)": 36.87, "step": 2040, "token_acc": 0.8467344696835466, "train_speed(iter/s)": 0.126594 }, { "epoch": 2.4972824006736585, "eval_loss": 0.6174936294555664, "eval_runtime": 29.9826, "eval_samples_per_second": 17.577, "eval_steps_per_second": 4.403, "eval_token_acc": 0.8267524148974638, "step": 2040 }, { "epoch": 2.5034065681696394, "grad_norm": 0.9535955786705017, "learning_rate": 6.539206047648705e-07, "loss": 0.46763386726379397, "memory(GiB)": 36.87, "step": 2045, "token_acc": 0.8324564664169503, "train_speed(iter/s)": 0.12638 }, { "epoch": 2.5095307356656207, "grad_norm": 0.9121899008750916, "learning_rate": 6.381471707839449e-07, "loss": 0.44632792472839355, "memory(GiB)": 36.87, "step": 2050, "token_acc": 0.8615195671656654, "train_speed(iter/s)": 0.126463 }, { "epoch": 2.5156549031616016, "grad_norm": 0.9412456750869751, "learning_rate": 6.225533285789997e-07, "loss": 0.46562681198120115, "memory(GiB)": 36.87, "step": 2055, "token_acc": 0.8705161854768154, "train_speed(iter/s)": 0.126535 }, { "epoch": 2.5217790706575824, "grad_norm": 0.9882362484931946, "learning_rate": 6.071397201996243e-07, "loss": 0.47277240753173827, "memory(GiB)": 36.87, "step": 2060, "token_acc": 0.8616963064295485, "train_speed(iter/s)": 0.126619 }, { "epoch": 2.5217790706575824, "eval_loss": 0.6171227097511292, "eval_runtime": 29.9995, "eval_samples_per_second": 17.567, "eval_steps_per_second": 4.4, "eval_token_acc": 0.8271191693785837, "step": 2060 }, { "epoch": 2.5279032381535638, "grad_norm": 1.003340482711792, "learning_rate": 5.919069802745914e-07, "loss": 0.4641777515411377, "memory(GiB)": 36.87, "step": 2065, "token_acc": 0.8311811067402455, "train_speed(iter/s)": 0.126395 }, { "epoch": 2.5340274056495447, "grad_norm": 0.9802326560020447, "learning_rate": 5.768557359857241e-07, "loss": 0.4592477321624756, "memory(GiB)": 36.87, "step": 2070, "token_acc": 0.8501814594270815, "train_speed(iter/s)": 0.12647 }, { "epoch": 2.5401515731455255, "grad_norm": 0.9962190985679626, "learning_rate": 5.619866070420766e-07, "loss": 0.4591672897338867, "memory(GiB)": 36.87, "step": 2075, "token_acc": 0.8545560747663551, "train_speed(iter/s)": 0.126542 }, { "epoch": 2.5462757406415064, "grad_norm": 0.8959765434265137, "learning_rate": 5.473002056544191e-07, "loss": 0.43817138671875, "memory(GiB)": 36.87, "step": 2080, "token_acc": 0.8604302151075538, "train_speed(iter/s)": 0.12662 }, { "epoch": 2.5462757406415064, "eval_loss": 0.6181796789169312, "eval_runtime": 29.9775, "eval_samples_per_second": 17.58, "eval_steps_per_second": 4.403, "eval_token_acc": 0.8274600960793429, "step": 2080 }, { "epoch": 2.5523999081374873, "grad_norm": 0.9490206837654114, "learning_rate": 5.327971365100276e-07, "loss": 0.4962893486022949, "memory(GiB)": 36.87, "step": 2085, "token_acc": 0.8353908876332166, "train_speed(iter/s)": 0.126398 }, { "epoch": 2.5585240756334686, "grad_norm": 0.9097649455070496, "learning_rate": 5.184779967477893e-07, "loss": 0.4803347110748291, "memory(GiB)": 36.87, "step": 2090, "token_acc": 0.856787781665958, "train_speed(iter/s)": 0.126471 }, { "epoch": 2.5646482431294495, "grad_norm": 1.0033832788467407, "learning_rate": 5.043433759336158e-07, "loss": 0.4686880111694336, "memory(GiB)": 36.87, "step": 2095, "token_acc": 0.8430141843971631, "train_speed(iter/s)": 0.126552 }, { "epoch": 2.5707724106254304, "grad_norm": 0.9607586860656738, "learning_rate": 4.903938560361698e-07, "loss": 0.48217024803161623, "memory(GiB)": 36.87, "step": 2100, "token_acc": 0.8403233581785144, "train_speed(iter/s)": 0.126627 }, { "epoch": 2.5707724106254304, "eval_loss": 0.6173272728919983, "eval_runtime": 30.0012, "eval_samples_per_second": 17.566, "eval_steps_per_second": 4.4, "eval_token_acc": 0.8270313549253577, "step": 2100 }, { "epoch": 2.5768965781214117, "grad_norm": 0.9122663140296936, "learning_rate": 4.76630011402901e-07, "loss": 0.45368499755859376, "memory(GiB)": 36.87, "step": 2105, "token_acc": 0.8375441091626303, "train_speed(iter/s)": 0.1264 }, { "epoch": 2.5830207456173926, "grad_norm": 0.9508864879608154, "learning_rate": 4.630524087364019e-07, "loss": 0.4732816696166992, "memory(GiB)": 36.87, "step": 2110, "token_acc": 0.8504168897728142, "train_speed(iter/s)": 0.126473 }, { "epoch": 2.5891449131133735, "grad_norm": 0.9362801909446716, "learning_rate": 4.4966160707107075e-07, "loss": 0.48420238494873047, "memory(GiB)": 36.87, "step": 2115, "token_acc": 0.8500457797822917, "train_speed(iter/s)": 0.126544 }, { "epoch": 2.595269080609355, "grad_norm": 0.9054739475250244, "learning_rate": 4.364581577500987e-07, "loss": 0.43644113540649415, "memory(GiB)": 36.87, "step": 2120, "token_acc": 0.8647457297507536, "train_speed(iter/s)": 0.126611 }, { "epoch": 2.595269080609355, "eval_loss": 0.6172040104866028, "eval_runtime": 29.9489, "eval_samples_per_second": 17.597, "eval_steps_per_second": 4.408, "eval_token_acc": 0.827186321607521, "step": 2120 }, { "epoch": 2.6013932481053357, "grad_norm": 0.9941717386245728, "learning_rate": 4.2344260440276455e-07, "loss": 0.5040837287902832, "memory(GiB)": 36.87, "step": 2125, "token_acc": 0.8330278541475742, "train_speed(iter/s)": 0.126387 }, { "epoch": 2.6075174156013166, "grad_norm": 0.9401910901069641, "learning_rate": 4.10615482922056e-07, "loss": 0.4815082550048828, "memory(GiB)": 36.87, "step": 2130, "token_acc": 0.8560084700899947, "train_speed(iter/s)": 0.126454 }, { "epoch": 2.613641583097298, "grad_norm": 0.9616697430610657, "learning_rate": 3.979773214426019e-07, "loss": 0.484088134765625, "memory(GiB)": 36.87, "step": 2135, "token_acc": 0.8334314302530901, "train_speed(iter/s)": 0.126532 }, { "epoch": 2.6197657505932788, "grad_norm": 0.9681402444839478, "learning_rate": 3.85528640318929e-07, "loss": 0.46643218994140623, "memory(GiB)": 36.87, "step": 2140, "token_acc": 0.8545515745381106, "train_speed(iter/s)": 0.126601 }, { "epoch": 2.6197657505932788, "eval_loss": 0.6171387434005737, "eval_runtime": 30.0426, "eval_samples_per_second": 17.542, "eval_steps_per_second": 4.394, "eval_token_acc": 0.8271914871635931, "step": 2140 }, { "epoch": 2.6258899180892596, "grad_norm": 0.9676551222801208, "learning_rate": 3.732699521040378e-07, "loss": 0.46207480430603026, "memory(GiB)": 36.87, "step": 2145, "token_acc": 0.8425608384317814, "train_speed(iter/s)": 0.126373 }, { "epoch": 2.632014085585241, "grad_norm": 1.003116250038147, "learning_rate": 3.612017615282964e-07, "loss": 0.4896972179412842, "memory(GiB)": 36.87, "step": 2150, "token_acc": 0.8342796309439319, "train_speed(iter/s)": 0.126459 }, { "epoch": 2.638138253081222, "grad_norm": 1.0150328874588013, "learning_rate": 3.49324565478662e-07, "loss": 0.513043737411499, "memory(GiB)": 36.87, "step": 2155, "token_acc": 0.855285740368815, "train_speed(iter/s)": 0.126542 }, { "epoch": 2.6442624205772027, "grad_norm": 0.9732238054275513, "learning_rate": 3.3763885297822153e-07, "loss": 0.4932279586791992, "memory(GiB)": 36.87, "step": 2160, "token_acc": 0.8633442370598422, "train_speed(iter/s)": 0.126609 }, { "epoch": 2.6442624205772027, "eval_loss": 0.617060124874115, "eval_runtime": 29.801, "eval_samples_per_second": 17.684, "eval_steps_per_second": 4.429, "eval_token_acc": 0.827367116070045, "step": 2160 }, { "epoch": 2.650386588073184, "grad_norm": 0.8650747537612915, "learning_rate": 3.261451051660547e-07, "loss": 0.47266697883605957, "memory(GiB)": 36.87, "step": 2165, "token_acc": 0.8351215537145186, "train_speed(iter/s)": 0.126394 }, { "epoch": 2.656510755569165, "grad_norm": 1.0742039680480957, "learning_rate": 3.1484379527742746e-07, "loss": 0.48064508438110354, "memory(GiB)": 36.87, "step": 2170, "token_acc": 0.8557394880859308, "train_speed(iter/s)": 0.126467 }, { "epoch": 2.662634923065146, "grad_norm": 1.0016287565231323, "learning_rate": 3.037353886243055e-07, "loss": 0.46164817810058595, "memory(GiB)": 36.87, "step": 2175, "token_acc": 0.8649181267691426, "train_speed(iter/s)": 0.126536 }, { "epoch": 2.6687590905611267, "grad_norm": 0.9850811958312988, "learning_rate": 2.928203425761961e-07, "loss": 0.4678659915924072, "memory(GiB)": 36.87, "step": 2180, "token_acc": 0.8360384946854352, "train_speed(iter/s)": 0.12661 }, { "epoch": 2.6687590905611267, "eval_loss": 0.6176232099533081, "eval_runtime": 29.9574, "eval_samples_per_second": 17.592, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8272896327289633, "step": 2180 }, { "epoch": 2.674883258057108, "grad_norm": 0.9312725067138672, "learning_rate": 2.820991065413159e-07, "loss": 0.49228496551513673, "memory(GiB)": 36.87, "step": 2185, "token_acc": 0.8372572060551601, "train_speed(iter/s)": 0.12639 }, { "epoch": 2.681007425553089, "grad_norm": 0.9902373552322388, "learning_rate": 2.71572121948091e-07, "loss": 0.48822717666625975, "memory(GiB)": 36.87, "step": 2190, "token_acc": 0.8330284513291558, "train_speed(iter/s)": 0.126463 }, { "epoch": 2.68713159304907, "grad_norm": 0.9834261536598206, "learning_rate": 2.612398222269752e-07, "loss": 0.47580180168151853, "memory(GiB)": 36.87, "step": 2195, "token_acc": 0.8485915492957746, "train_speed(iter/s)": 0.126526 }, { "epoch": 2.6932557605450507, "grad_norm": 1.019545316696167, "learning_rate": 2.511026327926114e-07, "loss": 0.5008028507232666, "memory(GiB)": 36.87, "step": 2200, "token_acc": 0.8462324594159034, "train_speed(iter/s)": 0.126603 }, { "epoch": 2.6932557605450507, "eval_loss": 0.6174827218055725, "eval_runtime": 30.0074, "eval_samples_per_second": 17.562, "eval_steps_per_second": 4.399, "eval_token_acc": 0.8274962549718478, "step": 2200 }, { "epoch": 2.699379928041032, "grad_norm": 1.0158801078796387, "learning_rate": 2.411609710263091e-07, "loss": 0.46188907623291015, "memory(GiB)": 36.87, "step": 2205, "token_acc": 0.8344093700899872, "train_speed(iter/s)": 0.126407 }, { "epoch": 2.705504095537013, "grad_norm": 0.8824003338813782, "learning_rate": 2.314152462588659e-07, "loss": 0.4691601753234863, "memory(GiB)": 36.87, "step": 2210, "token_acc": 0.8543765099423899, "train_speed(iter/s)": 0.126481 }, { "epoch": 2.7116282630329938, "grad_norm": 0.9510777592658997, "learning_rate": 2.2186585975370935e-07, "loss": 0.4572303771972656, "memory(GiB)": 36.87, "step": 2215, "token_acc": 0.8658688406088109, "train_speed(iter/s)": 0.126579 }, { "epoch": 2.717752430528975, "grad_norm": 0.9548938274383545, "learning_rate": 2.1251320469037827e-07, "loss": 0.4614152431488037, "memory(GiB)": 36.87, "step": 2220, "token_acc": 0.8602180404138602, "train_speed(iter/s)": 0.126644 }, { "epoch": 2.717752430528975, "eval_loss": 0.6176718473434448, "eval_runtime": 29.9684, "eval_samples_per_second": 17.585, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8273722816261171, "step": 2220 }, { "epoch": 2.723876598024956, "grad_norm": 0.9301165342330933, "learning_rate": 2.0335766614833275e-07, "loss": 0.4707462310791016, "memory(GiB)": 36.87, "step": 2225, "token_acc": 0.8354241550286274, "train_speed(iter/s)": 0.126431 }, { "epoch": 2.730000765520937, "grad_norm": 0.9284428954124451, "learning_rate": 1.9439962109110032e-07, "loss": 0.45088810920715333, "memory(GiB)": 36.87, "step": 2230, "token_acc": 0.8519607113624525, "train_speed(iter/s)": 0.126503 }, { "epoch": 2.736124933016918, "grad_norm": 0.9004064798355103, "learning_rate": 1.8563943835075315e-07, "loss": 0.4744719982147217, "memory(GiB)": 36.87, "step": 2235, "token_acc": 0.8538511282801811, "train_speed(iter/s)": 0.126568 }, { "epoch": 2.742249100512899, "grad_norm": 0.9502014517784119, "learning_rate": 1.770774786127244e-07, "loss": 0.4691894054412842, "memory(GiB)": 36.87, "step": 2240, "token_acc": 0.8554979031914137, "train_speed(iter/s)": 0.12664 }, { "epoch": 2.742249100512899, "eval_loss": 0.6172018051147461, "eval_runtime": 29.9615, "eval_samples_per_second": 17.589, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8271501627150163, "step": 2240 }, { "epoch": 2.74837326800888, "grad_norm": 1.0235203504562378, "learning_rate": 1.6871409440095687e-07, "loss": 0.5208240509033203, "memory(GiB)": 36.87, "step": 2245, "token_acc": 0.8266149979641408, "train_speed(iter/s)": 0.12644 }, { "epoch": 2.7544974355048613, "grad_norm": 0.9667299389839172, "learning_rate": 1.6054963006338742e-07, "loss": 0.46748833656311034, "memory(GiB)": 36.87, "step": 2250, "token_acc": 0.8670063058890591, "train_speed(iter/s)": 0.126499 }, { "epoch": 2.760621603000842, "grad_norm": 0.990592360496521, "learning_rate": 1.5258442175777045e-07, "loss": 0.4987760066986084, "memory(GiB)": 36.87, "step": 2255, "token_acc": 0.8603155845961351, "train_speed(iter/s)": 0.126571 }, { "epoch": 2.766745770496823, "grad_norm": 1.023600459098816, "learning_rate": 1.44818797437834e-07, "loss": 0.5020921707153321, "memory(GiB)": 36.87, "step": 2260, "token_acc": 0.8367556063532101, "train_speed(iter/s)": 0.126649 }, { "epoch": 2.766745770496823, "eval_loss": 0.6171240210533142, "eval_runtime": 29.9592, "eval_samples_per_second": 17.591, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8274600960793429, "step": 2260 }, { "epoch": 2.7728699379928043, "grad_norm": 0.9850562810897827, "learning_rate": 1.372530768397845e-07, "loss": 0.49890799522399903, "memory(GiB)": 36.87, "step": 2265, "token_acc": 0.8388872065619528, "train_speed(iter/s)": 0.126453 }, { "epoch": 2.7789941054887852, "grad_norm": 1.0308210849761963, "learning_rate": 1.2988757146913223e-07, "loss": 0.49098944664001465, "memory(GiB)": 36.87, "step": 2270, "token_acc": 0.8366363778787405, "train_speed(iter/s)": 0.12653 }, { "epoch": 2.785118272984766, "grad_norm": 0.9139099717140198, "learning_rate": 1.227225845878721e-07, "loss": 0.482135009765625, "memory(GiB)": 36.87, "step": 2275, "token_acc": 0.8365253330381195, "train_speed(iter/s)": 0.12659 }, { "epoch": 2.791242440480747, "grad_norm": 1.0616044998168945, "learning_rate": 1.157584112019966e-07, "loss": 0.5007448196411133, "memory(GiB)": 36.87, "step": 2280, "token_acc": 0.83696904524157, "train_speed(iter/s)": 0.126675 }, { "epoch": 2.791242440480747, "eval_loss": 0.6172557473182678, "eval_runtime": 29.9856, "eval_samples_per_second": 17.575, "eval_steps_per_second": 4.402, "eval_token_acc": 0.8271966527196652, "step": 2280 }, { "epoch": 2.7973666079767283, "grad_norm": 0.8870491981506348, "learning_rate": 1.0899533804934637e-07, "loss": 0.4659425258636475, "memory(GiB)": 36.87, "step": 2285, "token_acc": 0.8472861329549524, "train_speed(iter/s)": 0.126465 }, { "epoch": 2.803490775472709, "grad_norm": 0.8363329768180847, "learning_rate": 1.0243364358780817e-07, "loss": 0.46242237091064453, "memory(GiB)": 36.87, "step": 2290, "token_acc": 0.8513745704467354, "train_speed(iter/s)": 0.126527 }, { "epoch": 2.80961494296869, "grad_norm": 1.0086287260055542, "learning_rate": 9.607359798384785e-08, "loss": 0.46642189025878905, "memory(GiB)": 36.87, "step": 2295, "token_acc": 0.8569345046297867, "train_speed(iter/s)": 0.126585 }, { "epoch": 2.815739110464671, "grad_norm": 0.9434347748756409, "learning_rate": 8.991546310138599e-08, "loss": 0.48851499557495115, "memory(GiB)": 36.87, "step": 2300, "token_acc": 0.8429763909289578, "train_speed(iter/s)": 0.126657 }, { "epoch": 2.815739110464671, "eval_loss": 0.6172496676445007, "eval_runtime": 29.9586, "eval_samples_per_second": 17.591, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8273929438504055, "step": 2300 }, { "epoch": 2.8218632779606523, "grad_norm": 0.8611441254615784, "learning_rate": 8.395949249101754e-08, "loss": 0.4366280555725098, "memory(GiB)": 36.87, "step": 2305, "token_acc": 0.8454527389547593, "train_speed(iter/s)": 0.126446 }, { "epoch": 2.827987445456633, "grad_norm": 0.9308704137802124, "learning_rate": 7.820593137957244e-08, "loss": 0.49471750259399416, "memory(GiB)": 36.87, "step": 2310, "token_acc": 0.8447947341070501, "train_speed(iter/s)": 0.126514 }, { "epoch": 2.834111612952614, "grad_norm": 0.9204055070877075, "learning_rate": 7.265501666001706e-08, "loss": 0.5066485404968262, "memory(GiB)": 36.87, "step": 2315, "token_acc": 0.8312655086848635, "train_speed(iter/s)": 0.12658 }, { "epoch": 2.8402357804485954, "grad_norm": 0.9611912369728088, "learning_rate": 6.730697688170251e-08, "loss": 0.4841705322265625, "memory(GiB)": 36.87, "step": 2320, "token_acc": 0.8579581483830057, "train_speed(iter/s)": 0.126649 }, { "epoch": 2.8402357804485954, "eval_loss": 0.6172900199890137, "eval_runtime": 29.9623, "eval_samples_per_second": 17.589, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8273154605093238, "step": 2320 }, { "epoch": 2.8463599479445763, "grad_norm": 0.9565967321395874, "learning_rate": 6.216203224095386e-08, "loss": 0.45609331130981445, "memory(GiB)": 36.87, "step": 2325, "token_acc": 0.8340782438969194, "train_speed(iter/s)": 0.126439 }, { "epoch": 2.852484115440557, "grad_norm": 0.9000151753425598, "learning_rate": 5.722039457200235e-08, "loss": 0.46810379028320315, "memory(GiB)": 36.87, "step": 2330, "token_acc": 0.854035216434336, "train_speed(iter/s)": 0.126509 }, { "epoch": 2.8586082829365385, "grad_norm": 0.9673875570297241, "learning_rate": 5.248226733826689e-08, "loss": 0.496975040435791, "memory(GiB)": 36.87, "step": 2335, "token_acc": 0.8512393729597877, "train_speed(iter/s)": 0.126574 }, { "epoch": 2.8647324504325193, "grad_norm": 0.9579722881317139, "learning_rate": 4.794784562397459e-08, "loss": 0.5033215045928955, "memory(GiB)": 36.87, "step": 2340, "token_acc": 0.854410310614068, "train_speed(iter/s)": 0.126653 }, { "epoch": 2.8647324504325193, "eval_loss": 0.6173553466796875, "eval_runtime": 30.0724, "eval_samples_per_second": 17.524, "eval_steps_per_second": 4.389, "eval_token_acc": 0.8274394338550545, "step": 2340 }, { "epoch": 2.8708566179285, "grad_norm": 0.967046320438385, "learning_rate": 4.361731612612607e-08, "loss": 0.4593523025512695, "memory(GiB)": 36.87, "step": 2345, "token_acc": 0.8404793034195522, "train_speed(iter/s)": 0.126443 }, { "epoch": 2.8769807854244815, "grad_norm": 1.005210280418396, "learning_rate": 3.949085714681389e-08, "loss": 0.44494943618774413, "memory(GiB)": 36.87, "step": 2350, "token_acc": 0.8719988481336165, "train_speed(iter/s)": 0.126502 }, { "epoch": 2.8831049529204624, "grad_norm": 0.9526214599609375, "learning_rate": 3.556863858587833e-08, "loss": 0.4602672576904297, "memory(GiB)": 36.87, "step": 2355, "token_acc": 0.8692367364835238, "train_speed(iter/s)": 0.126552 }, { "epoch": 2.8892291204164433, "grad_norm": 0.9232168197631836, "learning_rate": 3.185082193391143e-08, "loss": 0.48589048385620115, "memory(GiB)": 36.87, "step": 2360, "token_acc": 0.8542855225182898, "train_speed(iter/s)": 0.126625 }, { "epoch": 2.8892291204164433, "eval_loss": 0.6173009276390076, "eval_runtime": 29.9578, "eval_samples_per_second": 17.591, "eval_steps_per_second": 4.406, "eval_token_acc": 0.8273516194018286, "step": 2360 }, { "epoch": 2.8953532879124246, "grad_norm": 0.9514647126197815, "learning_rate": 2.8337560265608853e-08, "loss": 0.46231327056884763, "memory(GiB)": 36.87, "step": 2365, "token_acc": 0.8406889558929477, "train_speed(iter/s)": 0.126404 }, { "epoch": 2.9014774554084055, "grad_norm": 0.9117730259895325, "learning_rate": 2.5028998233467272e-08, "loss": 0.47223424911499023, "memory(GiB)": 36.87, "step": 2370, "token_acc": 0.8618170593682478, "train_speed(iter/s)": 0.12647 }, { "epoch": 2.9076016229043864, "grad_norm": 0.9326623678207397, "learning_rate": 2.1925272061829038e-08, "loss": 0.47979536056518557, "memory(GiB)": 36.87, "step": 2375, "token_acc": 0.854614639049431, "train_speed(iter/s)": 0.126535 }, { "epoch": 2.9137257904003677, "grad_norm": 0.9446553587913513, "learning_rate": 1.9026509541272276e-08, "loss": 0.4813851356506348, "memory(GiB)": 36.87, "step": 2380, "token_acc": 0.8567724059536074, "train_speed(iter/s)": 0.126602 }, { "epoch": 2.9137257904003677, "eval_loss": 0.6172758936882019, "eval_runtime": 29.898, "eval_samples_per_second": 17.627, "eval_steps_per_second": 4.415, "eval_token_acc": 0.8273154605093238, "step": 2380 }, { "epoch": 2.9198499578963486, "grad_norm": 0.9821135401725769, "learning_rate": 1.6332830023350065e-08, "loss": 0.4704907417297363, "memory(GiB)": 36.87, "step": 2385, "token_acc": 0.835956510119977, "train_speed(iter/s)": 0.126422 }, { "epoch": 2.9259741253923295, "grad_norm": 0.9702714681625366, "learning_rate": 1.3844344415676059e-08, "loss": 0.5124542713165283, "memory(GiB)": 36.87, "step": 2390, "token_acc": 0.8496258847320526, "train_speed(iter/s)": 0.126502 }, { "epoch": 2.9320982928883104, "grad_norm": 0.8899897336959839, "learning_rate": 1.156115517735812e-08, "loss": 0.45607595443725585, "memory(GiB)": 36.87, "step": 2395, "token_acc": 0.8606019542115398, "train_speed(iter/s)": 0.12656 }, { "epoch": 2.9382224603842912, "grad_norm": 0.8827829360961914, "learning_rate": 9.48335631477948e-09, "loss": 0.4859360694885254, "memory(GiB)": 36.87, "step": 2400, "token_acc": 0.8470638693305693, "train_speed(iter/s)": 0.126623 }, { "epoch": 2.9382224603842912, "eval_loss": 0.617277204990387, "eval_runtime": 29.9293, "eval_samples_per_second": 17.608, "eval_steps_per_second": 4.41, "eval_token_acc": 0.8274910894157756, "step": 2400 }, { "epoch": 2.9443466278802726, "grad_norm": 0.9587947726249695, "learning_rate": 7.611033377729615e-09, "loss": 0.5192126274108887, "memory(GiB)": 36.87, "step": 2405, "token_acc": 0.8289369284093598, "train_speed(iter/s)": 0.126434 }, { "epoch": 2.9504707953762535, "grad_norm": 0.9472404718399048, "learning_rate": 5.944263455879284e-09, "loss": 0.4903435707092285, "memory(GiB)": 36.87, "step": 2410, "token_acc": 0.8476331360946746, "train_speed(iter/s)": 0.1265 }, { "epoch": 2.9565949628722343, "grad_norm": 0.9245522618293762, "learning_rate": 4.4831151756091766e-09, "loss": 0.4952064037322998, "memory(GiB)": 36.87, "step": 2415, "token_acc": 0.8508173686555399, "train_speed(iter/s)": 0.126559 }, { "epoch": 2.9627191303682157, "grad_norm": 0.8794329762458801, "learning_rate": 3.227648697182173e-09, "loss": 0.4453686237335205, "memory(GiB)": 36.87, "step": 2420, "token_acc": 0.850396277175889, "train_speed(iter/s)": 0.126607 }, { "epoch": 2.9627191303682157, "eval_loss": 0.6172167062759399, "eval_runtime": 29.9678, "eval_samples_per_second": 17.586, "eval_steps_per_second": 4.405, "eval_token_acc": 0.8275840694250736, "step": 2420 }, { "epoch": 2.9688432978641965, "grad_norm": 0.9200014472007751, "learning_rate": 2.177915712268108e-09, "loss": 0.4687533378601074, "memory(GiB)": 36.87, "step": 2425, "token_acc": 0.8394573675668927, "train_speed(iter/s)": 0.126424 }, { "epoch": 2.9749674653601774, "grad_norm": 0.9681059718132019, "learning_rate": 1.3339594418138036e-09, "loss": 0.46721735000610354, "memory(GiB)": 36.87, "step": 2430, "token_acc": 0.868303713612332, "train_speed(iter/s)": 0.126481 }, { "epoch": 2.9810916328561587, "grad_norm": 0.9787800908088684, "learning_rate": 6.958146342650463e-10, "loss": 0.46620631217956543, "memory(GiB)": 36.87, "step": 2435, "token_acc": 0.8369136498098324, "train_speed(iter/s)": 0.126545 }, { "epoch": 2.9872158003521396, "grad_norm": 0.9923424124717712, "learning_rate": 2.6350756413440203e-10, "loss": 0.4888926029205322, "memory(GiB)": 36.87, "step": 2440, "token_acc": 0.849891526288923, "train_speed(iter/s)": 0.12661 }, { "epoch": 2.9872158003521396, "eval_loss": 0.6173287630081177, "eval_runtime": 29.9491, "eval_samples_per_second": 17.597, "eval_steps_per_second": 4.407, "eval_token_acc": 0.8274859238597035, "step": 2440 }, { "epoch": 2.9933399678481205, "grad_norm": 0.9687269926071167, "learning_rate": 3.7056030921522877e-11, "loss": 0.4582235336303711, "memory(GiB)": 36.87, "step": 2445, "token_acc": 0.8376420474448975, "train_speed(iter/s)": 0.126414 }, { "epoch": 2.9970144683457094, "eval_loss": 0.617242693901062, "eval_runtime": 29.8933, "eval_samples_per_second": 17.629, "eval_steps_per_second": 4.416, "eval_token_acc": 0.8273826127382613, "step": 2448 } ], "logging_steps": 5, "max_steps": 2448, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.502734662946783e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }