6042 lines
171 KiB
JSON
6042 lines
171 KiB
JSON
{
|
|
"best_global_step": 1580,
|
|
"best_metric": 0.6043635,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b/v27-20250503-235734/checkpoint-1580",
|
|
"epoch": 2.9970144683457094,
|
|
"eval_steps": 20,
|
|
"global_step": 2448,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.001224833499196203,
|
|
"grad_norm": 5.5611701011657715,
|
|
"learning_rate": 9.999995882658711e-06,
|
|
"loss": 1.0937654972076416,
|
|
"memory(GiB)": 27.73,
|
|
"step": 1,
|
|
"token_acc": 0.7134680736898587,
|
|
"train_speed(iter/s)": 0.067022
|
|
},
|
|
{
|
|
"epoch": 0.006124167495981015,
|
|
"grad_norm": 3.0366287231445312,
|
|
"learning_rate": 9.999897066806807e-06,
|
|
"loss": 0.8203982710838318,
|
|
"memory(GiB)": 27.77,
|
|
"step": 5,
|
|
"token_acc": 0.7706657236318628,
|
|
"train_speed(iter/s)": 0.125794
|
|
},
|
|
{
|
|
"epoch": 0.01224833499196203,
|
|
"grad_norm": 1.5655843019485474,
|
|
"learning_rate": 9.999588271465324e-06,
|
|
"loss": 0.7133324623107911,
|
|
"memory(GiB)": 27.77,
|
|
"step": 10,
|
|
"token_acc": 0.7872333068225043,
|
|
"train_speed(iter/s)": 0.148685
|
|
},
|
|
{
|
|
"epoch": 0.018372502487943046,
|
|
"grad_norm": 1.4364837408065796,
|
|
"learning_rate": 9.999073626689664e-06,
|
|
"loss": 0.7077776908874511,
|
|
"memory(GiB)": 27.77,
|
|
"step": 15,
|
|
"token_acc": 0.7906767333933643,
|
|
"train_speed(iter/s)": 0.153421
|
|
},
|
|
{
|
|
"epoch": 0.02449666998392406,
|
|
"grad_norm": 1.2211848497390747,
|
|
"learning_rate": 9.998353153669443e-06,
|
|
"loss": 0.6643787860870362,
|
|
"memory(GiB)": 29.89,
|
|
"step": 20,
|
|
"token_acc": 0.8500581170089113,
|
|
"train_speed(iter/s)": 0.156098
|
|
},
|
|
{
|
|
"epoch": 0.02449666998392406,
|
|
"eval_loss": 0.6882059574127197,
|
|
"eval_runtime": 30.3868,
|
|
"eval_samples_per_second": 17.343,
|
|
"eval_steps_per_second": 4.344,
|
|
"eval_token_acc": 0.8109354822046594,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.030620837479905075,
|
|
"grad_norm": 1.1730519533157349,
|
|
"learning_rate": 9.997426882068896e-06,
|
|
"loss": 0.7091597557067871,
|
|
"memory(GiB)": 29.89,
|
|
"step": 25,
|
|
"token_acc": 0.7997104179259739,
|
|
"train_speed(iter/s)": 0.123685
|
|
},
|
|
{
|
|
"epoch": 0.03674500497588609,
|
|
"grad_norm": 1.1173818111419678,
|
|
"learning_rate": 9.996294850025658e-06,
|
|
"loss": 0.6673955917358398,
|
|
"memory(GiB)": 29.89,
|
|
"step": 30,
|
|
"token_acc": 0.8030015197568389,
|
|
"train_speed(iter/s)": 0.129134
|
|
},
|
|
{
|
|
"epoch": 0.042869172471867105,
|
|
"grad_norm": 1.1728911399841309,
|
|
"learning_rate": 9.994957104149202e-06,
|
|
"loss": 0.6595910072326661,
|
|
"memory(GiB)": 29.89,
|
|
"step": 35,
|
|
"token_acc": 0.7975662938735317,
|
|
"train_speed(iter/s)": 0.133798
|
|
},
|
|
{
|
|
"epoch": 0.04899333996784812,
|
|
"grad_norm": 1.234574317932129,
|
|
"learning_rate": 9.993413699518906e-06,
|
|
"loss": 0.6554254055023193,
|
|
"memory(GiB)": 29.89,
|
|
"step": 40,
|
|
"token_acc": 0.8106137920979735,
|
|
"train_speed(iter/s)": 0.136376
|
|
},
|
|
{
|
|
"epoch": 0.04899333996784812,
|
|
"eval_loss": 0.6702007055282593,
|
|
"eval_runtime": 30.2289,
|
|
"eval_samples_per_second": 17.434,
|
|
"eval_steps_per_second": 4.367,
|
|
"eval_token_acc": 0.8143189214318921,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.05511750746382914,
|
|
"grad_norm": 1.0441969633102417,
|
|
"learning_rate": 9.9916646996818e-06,
|
|
"loss": 0.671393871307373,
|
|
"memory(GiB)": 29.89,
|
|
"step": 45,
|
|
"token_acc": 0.8007245869993985,
|
|
"train_speed(iter/s)": 0.123931
|
|
},
|
|
{
|
|
"epoch": 0.06124167495981015,
|
|
"grad_norm": 1.1040817499160767,
|
|
"learning_rate": 9.989710176649937e-06,
|
|
"loss": 0.65097017288208,
|
|
"memory(GiB)": 29.89,
|
|
"step": 50,
|
|
"token_acc": 0.8223552894211577,
|
|
"train_speed(iter/s)": 0.127235
|
|
},
|
|
{
|
|
"epoch": 0.06736584245579116,
|
|
"grad_norm": 0.971693217754364,
|
|
"learning_rate": 9.987550210897433e-06,
|
|
"loss": 0.652859115600586,
|
|
"memory(GiB)": 29.89,
|
|
"step": 55,
|
|
"token_acc": 0.8024456033087575,
|
|
"train_speed(iter/s)": 0.130006
|
|
},
|
|
{
|
|
"epoch": 0.07349000995177218,
|
|
"grad_norm": 1.3039090633392334,
|
|
"learning_rate": 9.985184891357165e-06,
|
|
"loss": 0.6641504764556885,
|
|
"memory(GiB)": 29.89,
|
|
"step": 60,
|
|
"token_acc": 0.7929855290045457,
|
|
"train_speed(iter/s)": 0.132371
|
|
},
|
|
{
|
|
"epoch": 0.07349000995177218,
|
|
"eval_loss": 0.6628613471984863,
|
|
"eval_runtime": 30.3515,
|
|
"eval_samples_per_second": 17.363,
|
|
"eval_steps_per_second": 4.349,
|
|
"eval_token_acc": 0.8155018337724056,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.07961417744775319,
|
|
"grad_norm": 1.1451324224472046,
|
|
"learning_rate": 9.982614315417084e-06,
|
|
"loss": 0.6779595851898194,
|
|
"memory(GiB)": 29.89,
|
|
"step": 65,
|
|
"token_acc": 0.8056370699533272,
|
|
"train_speed(iter/s)": 0.12411
|
|
},
|
|
{
|
|
"epoch": 0.08573834494373421,
|
|
"grad_norm": 1.1827325820922852,
|
|
"learning_rate": 9.979838588916229e-06,
|
|
"loss": 0.647182846069336,
|
|
"memory(GiB)": 29.89,
|
|
"step": 70,
|
|
"token_acc": 0.806831566548881,
|
|
"train_speed(iter/s)": 0.126104
|
|
},
|
|
{
|
|
"epoch": 0.09186251243971523,
|
|
"grad_norm": 1.2671717405319214,
|
|
"learning_rate": 9.976857826140354e-06,
|
|
"loss": 0.6356947898864747,
|
|
"memory(GiB)": 29.89,
|
|
"step": 75,
|
|
"token_acc": 0.8092331033486255,
|
|
"train_speed(iter/s)": 0.128389
|
|
},
|
|
{
|
|
"epoch": 0.09798667993569624,
|
|
"grad_norm": 1.1943690776824951,
|
|
"learning_rate": 9.973672149817232e-06,
|
|
"loss": 0.6435425758361817,
|
|
"memory(GiB)": 29.89,
|
|
"step": 80,
|
|
"token_acc": 0.7932153503641372,
|
|
"train_speed(iter/s)": 0.130134
|
|
},
|
|
{
|
|
"epoch": 0.09798667993569624,
|
|
"eval_loss": 0.6591480374336243,
|
|
"eval_runtime": 30.2192,
|
|
"eval_samples_per_second": 17.439,
|
|
"eval_steps_per_second": 4.368,
|
|
"eval_token_acc": 0.8162405082907175,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.10411084743167726,
|
|
"grad_norm": 1.0277475118637085,
|
|
"learning_rate": 9.970281691111598e-06,
|
|
"loss": 0.6061644554138184,
|
|
"memory(GiB)": 29.89,
|
|
"step": 85,
|
|
"token_acc": 0.8139895703141835,
|
|
"train_speed(iter/s)": 0.123754
|
|
},
|
|
{
|
|
"epoch": 0.11023501492765828,
|
|
"grad_norm": 1.1457469463348389,
|
|
"learning_rate": 9.96668658961975e-06,
|
|
"loss": 0.6548227787017822,
|
|
"memory(GiB)": 29.89,
|
|
"step": 90,
|
|
"token_acc": 0.8061897136047075,
|
|
"train_speed(iter/s)": 0.125333
|
|
},
|
|
{
|
|
"epoch": 0.11635918242363928,
|
|
"grad_norm": 1.124259352684021,
|
|
"learning_rate": 9.962886993363797e-06,
|
|
"loss": 0.6785114288330079,
|
|
"memory(GiB)": 29.89,
|
|
"step": 95,
|
|
"token_acc": 0.7958439546965248,
|
|
"train_speed(iter/s)": 0.126782
|
|
},
|
|
{
|
|
"epoch": 0.1224833499196203,
|
|
"grad_norm": 1.1532223224639893,
|
|
"learning_rate": 9.95888305878557e-06,
|
|
"loss": 0.6254438400268555,
|
|
"memory(GiB)": 29.89,
|
|
"step": 100,
|
|
"token_acc": 0.825769669327252,
|
|
"train_speed(iter/s)": 0.128271
|
|
},
|
|
{
|
|
"epoch": 0.1224833499196203,
|
|
"eval_loss": 0.6548585891723633,
|
|
"eval_runtime": 30.2918,
|
|
"eval_samples_per_second": 17.397,
|
|
"eval_steps_per_second": 4.358,
|
|
"eval_token_acc": 0.8170463350379669,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.1286075174156013,
|
|
"grad_norm": 1.1253662109375,
|
|
"learning_rate": 9.954674950740175e-06,
|
|
"loss": 0.6390158653259277,
|
|
"memory(GiB)": 29.89,
|
|
"step": 105,
|
|
"token_acc": 0.8143782730827323,
|
|
"train_speed(iter/s)": 0.123016
|
|
},
|
|
{
|
|
"epoch": 0.13473168491158233,
|
|
"grad_norm": 1.0638540983200073,
|
|
"learning_rate": 9.950262842489215e-06,
|
|
"loss": 0.5906115531921386,
|
|
"memory(GiB)": 29.89,
|
|
"step": 110,
|
|
"token_acc": 0.8280561419101581,
|
|
"train_speed(iter/s)": 0.124561
|
|
},
|
|
{
|
|
"epoch": 0.14085585240756335,
|
|
"grad_norm": 1.3548355102539062,
|
|
"learning_rate": 9.945646915693646e-06,
|
|
"loss": 0.5967195510864258,
|
|
"memory(GiB)": 29.89,
|
|
"step": 115,
|
|
"token_acc": 0.8018965390008117,
|
|
"train_speed(iter/s)": 0.126157
|
|
},
|
|
{
|
|
"epoch": 0.14698001990354437,
|
|
"grad_norm": 1.199034333229065,
|
|
"learning_rate": 9.940827360406297e-06,
|
|
"loss": 0.631542444229126,
|
|
"memory(GiB)": 29.89,
|
|
"step": 120,
|
|
"token_acc": 0.8094361557837628,
|
|
"train_speed(iter/s)": 0.127311
|
|
},
|
|
{
|
|
"epoch": 0.14698001990354437,
|
|
"eval_loss": 0.6537693738937378,
|
|
"eval_runtime": 30.0219,
|
|
"eval_samples_per_second": 17.554,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8179503073505863,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.1531041873995254,
|
|
"grad_norm": 1.1312533617019653,
|
|
"learning_rate": 9.93580437506406e-06,
|
|
"loss": 0.6321775913238525,
|
|
"memory(GiB)": 29.89,
|
|
"step": 125,
|
|
"token_acc": 0.8088573959255979,
|
|
"train_speed(iter/s)": 0.123165
|
|
},
|
|
{
|
|
"epoch": 0.15922835489550638,
|
|
"grad_norm": 1.1674045324325562,
|
|
"learning_rate": 9.9305781664797e-06,
|
|
"loss": 0.6246171951293945,
|
|
"memory(GiB)": 29.89,
|
|
"step": 130,
|
|
"token_acc": 0.8052116325942988,
|
|
"train_speed(iter/s)": 0.12427
|
|
},
|
|
{
|
|
"epoch": 0.1653525223914874,
|
|
"grad_norm": 1.3215515613555908,
|
|
"learning_rate": 9.925148949833356e-06,
|
|
"loss": 0.6429347515106201,
|
|
"memory(GiB)": 29.89,
|
|
"step": 135,
|
|
"token_acc": 0.8183133283809673,
|
|
"train_speed(iter/s)": 0.125515
|
|
},
|
|
{
|
|
"epoch": 0.17147668988746842,
|
|
"grad_norm": 1.138923168182373,
|
|
"learning_rate": 9.919516948663666e-06,
|
|
"loss": 0.6564007759094238,
|
|
"memory(GiB)": 32.12,
|
|
"step": 140,
|
|
"token_acc": 0.8169854580859952,
|
|
"train_speed(iter/s)": 0.126675
|
|
},
|
|
{
|
|
"epoch": 0.17147668988746842,
|
|
"eval_loss": 0.6485698819160461,
|
|
"eval_runtime": 30.1644,
|
|
"eval_samples_per_second": 17.471,
|
|
"eval_steps_per_second": 4.376,
|
|
"eval_token_acc": 0.8182240818224081,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.17760085738344944,
|
|
"grad_norm": 1.258039951324463,
|
|
"learning_rate": 9.913682394858576e-06,
|
|
"loss": 0.6344574451446533,
|
|
"memory(GiB)": 32.12,
|
|
"step": 145,
|
|
"token_acc": 0.8038108277711221,
|
|
"train_speed(iter/s)": 0.123274
|
|
},
|
|
{
|
|
"epoch": 0.18372502487943046,
|
|
"grad_norm": 1.1063320636749268,
|
|
"learning_rate": 9.907645528645791e-06,
|
|
"loss": 0.6173704147338868,
|
|
"memory(GiB)": 32.12,
|
|
"step": 150,
|
|
"token_acc": 0.8146344955967638,
|
|
"train_speed(iter/s)": 0.124336
|
|
},
|
|
{
|
|
"epoch": 0.18984919237541148,
|
|
"grad_norm": 1.118895411491394,
|
|
"learning_rate": 9.901406598582874e-06,
|
|
"loss": 0.6216392517089844,
|
|
"memory(GiB)": 32.12,
|
|
"step": 155,
|
|
"token_acc": 0.8292195700016929,
|
|
"train_speed(iter/s)": 0.125258
|
|
},
|
|
{
|
|
"epoch": 0.19597335987139247,
|
|
"grad_norm": 0.9873996376991272,
|
|
"learning_rate": 9.894965861547023e-06,
|
|
"loss": 0.6492547512054443,
|
|
"memory(GiB)": 32.12,
|
|
"step": 160,
|
|
"token_acc": 0.812430195125156,
|
|
"train_speed(iter/s)": 0.126446
|
|
},
|
|
{
|
|
"epoch": 0.19597335987139247,
|
|
"eval_loss": 0.6458240747451782,
|
|
"eval_runtime": 30.1721,
|
|
"eval_samples_per_second": 17.466,
|
|
"eval_steps_per_second": 4.375,
|
|
"eval_token_acc": 0.8187303063174751,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.2020975273673735,
|
|
"grad_norm": 1.0021984577178955,
|
|
"learning_rate": 9.888323582724493e-06,
|
|
"loss": 0.5956392288208008,
|
|
"memory(GiB)": 32.12,
|
|
"step": 165,
|
|
"token_acc": 0.8176234443998395,
|
|
"train_speed(iter/s)": 0.123317
|
|
},
|
|
{
|
|
"epoch": 0.2082216948633545,
|
|
"grad_norm": 1.0651910305023193,
|
|
"learning_rate": 9.881480035599667e-06,
|
|
"loss": 0.6227351665496826,
|
|
"memory(GiB)": 32.12,
|
|
"step": 170,
|
|
"token_acc": 0.7998247919404292,
|
|
"train_speed(iter/s)": 0.124306
|
|
},
|
|
{
|
|
"epoch": 0.21434586235933553,
|
|
"grad_norm": 1.161063551902771,
|
|
"learning_rate": 9.874435501943814e-06,
|
|
"loss": 0.6138211727142334,
|
|
"memory(GiB)": 32.12,
|
|
"step": 175,
|
|
"token_acc": 0.8047394093021469,
|
|
"train_speed(iter/s)": 0.12515
|
|
},
|
|
{
|
|
"epoch": 0.22047002985531655,
|
|
"grad_norm": 1.0052075386047363,
|
|
"learning_rate": 9.867190271803466e-06,
|
|
"loss": 0.6363819122314454,
|
|
"memory(GiB)": 32.12,
|
|
"step": 180,
|
|
"token_acc": 0.8109002326934264,
|
|
"train_speed(iter/s)": 0.125933
|
|
},
|
|
{
|
|
"epoch": 0.22047002985531655,
|
|
"eval_loss": 0.6438542008399963,
|
|
"eval_runtime": 30.2227,
|
|
"eval_samples_per_second": 17.437,
|
|
"eval_steps_per_second": 4.368,
|
|
"eval_token_acc": 0.8191693785836045,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.22659419735129757,
|
|
"grad_norm": 1.093913197517395,
|
|
"learning_rate": 9.859744643488494e-06,
|
|
"loss": 0.6040900707244873,
|
|
"memory(GiB)": 32.12,
|
|
"step": 185,
|
|
"token_acc": 0.8167378167100959,
|
|
"train_speed(iter/s)": 0.123432
|
|
},
|
|
{
|
|
"epoch": 0.23271836484727856,
|
|
"grad_norm": 1.229707956314087,
|
|
"learning_rate": 9.852098923559819e-06,
|
|
"loss": 0.6707104206085205,
|
|
"memory(GiB)": 32.12,
|
|
"step": 190,
|
|
"token_acc": 0.79118295902499,
|
|
"train_speed(iter/s)": 0.124323
|
|
},
|
|
{
|
|
"epoch": 0.23884253234325958,
|
|
"grad_norm": 1.2590445280075073,
|
|
"learning_rate": 9.844253426816785e-06,
|
|
"loss": 0.594182014465332,
|
|
"memory(GiB)": 32.12,
|
|
"step": 195,
|
|
"token_acc": 0.8231213499822541,
|
|
"train_speed(iter/s)": 0.125194
|
|
},
|
|
{
|
|
"epoch": 0.2449666998392406,
|
|
"grad_norm": 1.1405773162841797,
|
|
"learning_rate": 9.836208476284208e-06,
|
|
"loss": 0.6203227996826172,
|
|
"memory(GiB)": 32.12,
|
|
"step": 200,
|
|
"token_acc": 0.8118512276400965,
|
|
"train_speed(iter/s)": 0.126092
|
|
},
|
|
{
|
|
"epoch": 0.2449666998392406,
|
|
"eval_loss": 0.6424054503440857,
|
|
"eval_runtime": 30.1137,
|
|
"eval_samples_per_second": 17.5,
|
|
"eval_steps_per_second": 4.383,
|
|
"eval_token_acc": 0.8195051397282918,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.2510908673352216,
|
|
"grad_norm": 1.1559275388717651,
|
|
"learning_rate": 9.827964403199067e-06,
|
|
"loss": 0.6028561592102051,
|
|
"memory(GiB)": 32.12,
|
|
"step": 205,
|
|
"token_acc": 0.8117863720073665,
|
|
"train_speed(iter/s)": 0.123812
|
|
},
|
|
{
|
|
"epoch": 0.2572150348312026,
|
|
"grad_norm": 1.1424733400344849,
|
|
"learning_rate": 9.819521546996864e-06,
|
|
"loss": 0.6058461189270019,
|
|
"memory(GiB)": 32.12,
|
|
"step": 210,
|
|
"token_acc": 0.8184524805138327,
|
|
"train_speed(iter/s)": 0.12456
|
|
},
|
|
{
|
|
"epoch": 0.26333920232718366,
|
|
"grad_norm": 1.0866496562957764,
|
|
"learning_rate": 9.810880255297663e-06,
|
|
"loss": 0.6336095809936524,
|
|
"memory(GiB)": 32.12,
|
|
"step": 215,
|
|
"token_acc": 0.8254080406980853,
|
|
"train_speed(iter/s)": 0.125292
|
|
},
|
|
{
|
|
"epoch": 0.26946336982316466,
|
|
"grad_norm": 1.090539574623108,
|
|
"learning_rate": 9.802040883891762e-06,
|
|
"loss": 0.6297359466552734,
|
|
"memory(GiB)": 32.12,
|
|
"step": 220,
|
|
"token_acc": 0.7853244390539721,
|
|
"train_speed(iter/s)": 0.125916
|
|
},
|
|
{
|
|
"epoch": 0.26946336982316466,
|
|
"eval_loss": 0.6410297155380249,
|
|
"eval_runtime": 30.1465,
|
|
"eval_samples_per_second": 17.481,
|
|
"eval_steps_per_second": 4.379,
|
|
"eval_token_acc": 0.8207552042977426,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.2755875373191457,
|
|
"grad_norm": 1.0024933815002441,
|
|
"learning_rate": 9.793003796725049e-06,
|
|
"loss": 0.5586746215820313,
|
|
"memory(GiB)": 32.12,
|
|
"step": 225,
|
|
"token_acc": 0.8293589430563468,
|
|
"train_speed(iter/s)": 0.123629
|
|
},
|
|
{
|
|
"epoch": 0.2817117048151267,
|
|
"grad_norm": 1.0108203887939453,
|
|
"learning_rate": 9.783769365884023e-06,
|
|
"loss": 0.6524643898010254,
|
|
"memory(GiB)": 32.12,
|
|
"step": 230,
|
|
"token_acc": 0.8032468163701759,
|
|
"train_speed(iter/s)": 0.124589
|
|
},
|
|
{
|
|
"epoch": 0.2878358723111077,
|
|
"grad_norm": 1.079897403717041,
|
|
"learning_rate": 9.774337971580464e-06,
|
|
"loss": 0.6641106605529785,
|
|
"memory(GiB)": 32.12,
|
|
"step": 235,
|
|
"token_acc": 0.8061291260724153,
|
|
"train_speed(iter/s)": 0.125376
|
|
},
|
|
{
|
|
"epoch": 0.29396003980708874,
|
|
"grad_norm": 1.1108851432800293,
|
|
"learning_rate": 9.764710002135784e-06,
|
|
"loss": 0.6327021598815918,
|
|
"memory(GiB)": 32.12,
|
|
"step": 240,
|
|
"token_acc": 0.8024784931974078,
|
|
"train_speed(iter/s)": 0.126001
|
|
},
|
|
{
|
|
"epoch": 0.29396003980708874,
|
|
"eval_loss": 0.6399893164634705,
|
|
"eval_runtime": 30.1753,
|
|
"eval_samples_per_second": 17.465,
|
|
"eval_steps_per_second": 4.374,
|
|
"eval_token_acc": 0.8202076553540989,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.3000842073030697,
|
|
"grad_norm": 0.8851113319396973,
|
|
"learning_rate": 9.754885853965039e-06,
|
|
"loss": 0.6223061561584473,
|
|
"memory(GiB)": 32.12,
|
|
"step": 245,
|
|
"token_acc": 0.8136768110686491,
|
|
"train_speed(iter/s)": 0.124088
|
|
},
|
|
{
|
|
"epoch": 0.3062083747990508,
|
|
"grad_norm": 0.943510890007019,
|
|
"learning_rate": 9.744865931560606e-06,
|
|
"loss": 0.625941801071167,
|
|
"memory(GiB)": 32.12,
|
|
"step": 250,
|
|
"token_acc": 0.825910233887913,
|
|
"train_speed(iter/s)": 0.124764
|
|
},
|
|
{
|
|
"epoch": 0.31233254229503177,
|
|
"grad_norm": 1.120894193649292,
|
|
"learning_rate": 9.73465064747553e-06,
|
|
"loss": 0.6334005355834961,
|
|
"memory(GiB)": 32.12,
|
|
"step": 255,
|
|
"token_acc": 0.7913934426229509,
|
|
"train_speed(iter/s)": 0.125458
|
|
},
|
|
{
|
|
"epoch": 0.31845670979101276,
|
|
"grad_norm": 1.069145917892456,
|
|
"learning_rate": 9.724240422306531e-06,
|
|
"loss": 0.6185196876525879,
|
|
"memory(GiB)": 32.12,
|
|
"step": 260,
|
|
"token_acc": 0.7924378740438349,
|
|
"train_speed(iter/s)": 0.126007
|
|
},
|
|
{
|
|
"epoch": 0.31845670979101276,
|
|
"eval_loss": 0.6390168070793152,
|
|
"eval_runtime": 29.9843,
|
|
"eval_samples_per_second": 17.576,
|
|
"eval_steps_per_second": 4.402,
|
|
"eval_token_acc": 0.8199958675551423,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.3245808772869938,
|
|
"grad_norm": 1.1289619207382202,
|
|
"learning_rate": 9.713635684676701e-06,
|
|
"loss": 0.6217617988586426,
|
|
"memory(GiB)": 32.12,
|
|
"step": 265,
|
|
"token_acc": 0.8116217614406958,
|
|
"train_speed(iter/s)": 0.12414
|
|
},
|
|
{
|
|
"epoch": 0.3307050447829748,
|
|
"grad_norm": 1.0119985342025757,
|
|
"learning_rate": 9.702836871217838e-06,
|
|
"loss": 0.6184762001037598,
|
|
"memory(GiB)": 32.12,
|
|
"step": 270,
|
|
"token_acc": 0.8169801035704551,
|
|
"train_speed(iter/s)": 0.124781
|
|
},
|
|
{
|
|
"epoch": 0.33682921227895585,
|
|
"grad_norm": 1.121479868888855,
|
|
"learning_rate": 9.691844426552488e-06,
|
|
"loss": 0.6679095268249512,
|
|
"memory(GiB)": 32.12,
|
|
"step": 275,
|
|
"token_acc": 0.8053990302712619,
|
|
"train_speed(iter/s)": 0.125508
|
|
},
|
|
{
|
|
"epoch": 0.34295337977493684,
|
|
"grad_norm": 1.1491762399673462,
|
|
"learning_rate": 9.68065880327562e-06,
|
|
"loss": 0.6015125274658203,
|
|
"memory(GiB)": 32.12,
|
|
"step": 280,
|
|
"token_acc": 0.8035652005425306,
|
|
"train_speed(iter/s)": 0.126152
|
|
},
|
|
{
|
|
"epoch": 0.34295337977493684,
|
|
"eval_loss": 0.6370740532875061,
|
|
"eval_runtime": 30.1382,
|
|
"eval_samples_per_second": 17.486,
|
|
"eval_steps_per_second": 4.38,
|
|
"eval_token_acc": 0.8203677875923343,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.3490775472709179,
|
|
"grad_norm": 0.9209638833999634,
|
|
"learning_rate": 9.669280461936004e-06,
|
|
"loss": 0.6419768333435059,
|
|
"memory(GiB)": 32.12,
|
|
"step": 285,
|
|
"token_acc": 0.8053348308297066,
|
|
"train_speed(iter/s)": 0.124607
|
|
},
|
|
{
|
|
"epoch": 0.3552017147668989,
|
|
"grad_norm": 1.0387402772903442,
|
|
"learning_rate": 9.657709871017243e-06,
|
|
"loss": 0.6462045669555664,
|
|
"memory(GiB)": 32.12,
|
|
"step": 290,
|
|
"token_acc": 0.8079543874287304,
|
|
"train_speed(iter/s)": 0.125198
|
|
},
|
|
{
|
|
"epoch": 0.36132588226287987,
|
|
"grad_norm": 1.0344866514205933,
|
|
"learning_rate": 9.645947506918482e-06,
|
|
"loss": 0.6486867904663086,
|
|
"memory(GiB)": 32.12,
|
|
"step": 295,
|
|
"token_acc": 0.8222080724468499,
|
|
"train_speed(iter/s)": 0.12567
|
|
},
|
|
{
|
|
"epoch": 0.3674500497588609,
|
|
"grad_norm": 1.1158808469772339,
|
|
"learning_rate": 9.633993853934803e-06,
|
|
"loss": 0.6453632354736328,
|
|
"memory(GiB)": 32.12,
|
|
"step": 300,
|
|
"token_acc": 0.8016542381712202,
|
|
"train_speed(iter/s)": 0.126206
|
|
},
|
|
{
|
|
"epoch": 0.3674500497588609,
|
|
"eval_loss": 0.6345298886299133,
|
|
"eval_runtime": 30.2223,
|
|
"eval_samples_per_second": 17.437,
|
|
"eval_steps_per_second": 4.368,
|
|
"eval_token_acc": 0.821328581021747,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.3735742172548419,
|
|
"grad_norm": 0.9398248195648193,
|
|
"learning_rate": 9.621849404237274e-06,
|
|
"loss": 0.6055630683898926,
|
|
"memory(GiB)": 32.12,
|
|
"step": 305,
|
|
"token_acc": 0.8250450524809817,
|
|
"train_speed(iter/s)": 0.124632
|
|
},
|
|
{
|
|
"epoch": 0.37969838475082296,
|
|
"grad_norm": 1.048771858215332,
|
|
"learning_rate": 9.60951465785269e-06,
|
|
"loss": 0.6380780220031739,
|
|
"memory(GiB)": 32.12,
|
|
"step": 310,
|
|
"token_acc": 0.815754208203955,
|
|
"train_speed(iter/s)": 0.125253
|
|
},
|
|
{
|
|
"epoch": 0.38582255224680395,
|
|
"grad_norm": 1.026041865348816,
|
|
"learning_rate": 9.596990122642984e-06,
|
|
"loss": 0.6475009441375732,
|
|
"memory(GiB)": 32.12,
|
|
"step": 315,
|
|
"token_acc": 0.8045922028222913,
|
|
"train_speed(iter/s)": 0.12584
|
|
},
|
|
{
|
|
"epoch": 0.39194671974278494,
|
|
"grad_norm": 1.112762212753296,
|
|
"learning_rate": 9.584276314284316e-06,
|
|
"loss": 0.6385052680969239,
|
|
"memory(GiB)": 32.12,
|
|
"step": 320,
|
|
"token_acc": 0.792560957804059,
|
|
"train_speed(iter/s)": 0.126392
|
|
},
|
|
{
|
|
"epoch": 0.39194671974278494,
|
|
"eval_loss": 0.633259654045105,
|
|
"eval_runtime": 30.0477,
|
|
"eval_samples_per_second": 17.539,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8214318921431892,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.398070887238766,
|
|
"grad_norm": 1.0934439897537231,
|
|
"learning_rate": 9.571373756245842e-06,
|
|
"loss": 0.6589271545410156,
|
|
"memory(GiB)": 32.12,
|
|
"step": 325,
|
|
"token_acc": 0.8136614281775572,
|
|
"train_speed(iter/s)": 0.124935
|
|
},
|
|
{
|
|
"epoch": 0.404195054734747,
|
|
"grad_norm": 1.0657905340194702,
|
|
"learning_rate": 9.558282979768164e-06,
|
|
"loss": 0.6037847995758057,
|
|
"memory(GiB)": 32.12,
|
|
"step": 330,
|
|
"token_acc": 0.7959645802352305,
|
|
"train_speed(iter/s)": 0.125393
|
|
},
|
|
{
|
|
"epoch": 0.41031922223072803,
|
|
"grad_norm": 1.0330870151519775,
|
|
"learning_rate": 9.545004523841452e-06,
|
|
"loss": 0.6114434242248535,
|
|
"memory(GiB)": 32.12,
|
|
"step": 335,
|
|
"token_acc": 0.8286545017044316,
|
|
"train_speed(iter/s)": 0.125804
|
|
},
|
|
{
|
|
"epoch": 0.416443389726709,
|
|
"grad_norm": 1.0614529848098755,
|
|
"learning_rate": 9.531538935183252e-06,
|
|
"loss": 0.6515423774719238,
|
|
"memory(GiB)": 32.12,
|
|
"step": 340,
|
|
"token_acc": 0.796179652197727,
|
|
"train_speed(iter/s)": 0.126217
|
|
},
|
|
{
|
|
"epoch": 0.416443389726709,
|
|
"eval_loss": 0.6336191296577454,
|
|
"eval_runtime": 30.124,
|
|
"eval_samples_per_second": 17.494,
|
|
"eval_steps_per_second": 4.382,
|
|
"eval_token_acc": 0.8210754687742136,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.42256755722269,
|
|
"grad_norm": 0.917084276676178,
|
|
"learning_rate": 9.517886768215978e-06,
|
|
"loss": 0.5718442916870117,
|
|
"memory(GiB)": 32.12,
|
|
"step": 345,
|
|
"token_acc": 0.829221986539922,
|
|
"train_speed(iter/s)": 0.124734
|
|
},
|
|
{
|
|
"epoch": 0.42869172471867106,
|
|
"grad_norm": 1.054498314857483,
|
|
"learning_rate": 9.50404858504409e-06,
|
|
"loss": 0.5831773757934571,
|
|
"memory(GiB)": 32.12,
|
|
"step": 350,
|
|
"token_acc": 0.830837973923772,
|
|
"train_speed(iter/s)": 0.125116
|
|
},
|
|
{
|
|
"epoch": 0.43481589221465206,
|
|
"grad_norm": 1.1211163997650146,
|
|
"learning_rate": 9.490024955430936e-06,
|
|
"loss": 0.6414088249206543,
|
|
"memory(GiB)": 32.12,
|
|
"step": 355,
|
|
"token_acc": 0.7959719461425946,
|
|
"train_speed(iter/s)": 0.125645
|
|
},
|
|
{
|
|
"epoch": 0.4409400597106331,
|
|
"grad_norm": 1.011711835861206,
|
|
"learning_rate": 9.475816456775313e-06,
|
|
"loss": 0.6285918235778809,
|
|
"memory(GiB)": 32.12,
|
|
"step": 360,
|
|
"token_acc": 0.8033112582781456,
|
|
"train_speed(iter/s)": 0.126108
|
|
},
|
|
{
|
|
"epoch": 0.4409400597106331,
|
|
"eval_loss": 0.6306507587432861,
|
|
"eval_runtime": 30.1935,
|
|
"eval_samples_per_second": 17.454,
|
|
"eval_steps_per_second": 4.372,
|
|
"eval_token_acc": 0.8219639444186166,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.4470642272066141,
|
|
"grad_norm": 0.9317595958709717,
|
|
"learning_rate": 9.46142367408767e-06,
|
|
"loss": 0.6225271224975586,
|
|
"memory(GiB)": 32.12,
|
|
"step": 365,
|
|
"token_acc": 0.8147164353461464,
|
|
"train_speed(iter/s)": 0.124805
|
|
},
|
|
{
|
|
"epoch": 0.45318839470259514,
|
|
"grad_norm": 1.0820285081863403,
|
|
"learning_rate": 9.446847199966042e-06,
|
|
"loss": 0.6166964530944824,
|
|
"memory(GiB)": 32.12,
|
|
"step": 370,
|
|
"token_acc": 0.8163280356945315,
|
|
"train_speed(iter/s)": 0.12523
|
|
},
|
|
{
|
|
"epoch": 0.45931256219857614,
|
|
"grad_norm": 1.0026404857635498,
|
|
"learning_rate": 9.432087634571638e-06,
|
|
"loss": 0.614093542098999,
|
|
"memory(GiB)": 32.12,
|
|
"step": 375,
|
|
"token_acc": 0.8399052293112201,
|
|
"train_speed(iter/s)": 0.125657
|
|
},
|
|
{
|
|
"epoch": 0.4654367296945571,
|
|
"grad_norm": 1.0475082397460938,
|
|
"learning_rate": 9.417145585604139e-06,
|
|
"loss": 0.5946948051452636,
|
|
"memory(GiB)": 32.12,
|
|
"step": 380,
|
|
"token_acc": 0.8166074313408723,
|
|
"train_speed(iter/s)": 0.126073
|
|
},
|
|
{
|
|
"epoch": 0.4654367296945571,
|
|
"eval_loss": 0.6289507150650024,
|
|
"eval_runtime": 30.1648,
|
|
"eval_samples_per_second": 17.471,
|
|
"eval_steps_per_second": 4.376,
|
|
"eval_token_acc": 0.8223720233483135,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.4715608971905382,
|
|
"grad_norm": 1.0893336534500122,
|
|
"learning_rate": 9.402021668276669e-06,
|
|
"loss": 0.6302263259887695,
|
|
"memory(GiB)": 32.12,
|
|
"step": 385,
|
|
"token_acc": 0.8211241269317494,
|
|
"train_speed(iter/s)": 0.124795
|
|
},
|
|
{
|
|
"epoch": 0.47768506468651917,
|
|
"grad_norm": 0.9649361968040466,
|
|
"learning_rate": 9.386716505290467e-06,
|
|
"loss": 0.6190325736999511,
|
|
"memory(GiB)": 32.12,
|
|
"step": 390,
|
|
"token_acc": 0.8042954275641256,
|
|
"train_speed(iter/s)": 0.125176
|
|
},
|
|
{
|
|
"epoch": 0.4838092321825002,
|
|
"grad_norm": 1.1449054479599,
|
|
"learning_rate": 9.371230726809258e-06,
|
|
"loss": 0.6380712032318115,
|
|
"memory(GiB)": 32.12,
|
|
"step": 395,
|
|
"token_acc": 0.7973576211390309,
|
|
"train_speed(iter/s)": 0.125508
|
|
},
|
|
{
|
|
"epoch": 0.4899333996784812,
|
|
"grad_norm": 1.0124316215515137,
|
|
"learning_rate": 9.355564970433288e-06,
|
|
"loss": 0.6248594284057617,
|
|
"memory(GiB)": 32.12,
|
|
"step": 400,
|
|
"token_acc": 0.8122048129544943,
|
|
"train_speed(iter/s)": 0.12583
|
|
},
|
|
{
|
|
"epoch": 0.4899333996784812,
|
|
"eval_loss": 0.6282716989517212,
|
|
"eval_runtime": 30.1457,
|
|
"eval_samples_per_second": 17.482,
|
|
"eval_steps_per_second": 4.379,
|
|
"eval_token_acc": 0.822165401105429,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.4960575671744622,
|
|
"grad_norm": 1.1087108850479126,
|
|
"learning_rate": 9.339719881173093e-06,
|
|
"loss": 0.5993568420410156,
|
|
"memory(GiB)": 32.12,
|
|
"step": 405,
|
|
"token_acc": 0.8224255219329111,
|
|
"train_speed(iter/s)": 0.124698
|
|
},
|
|
{
|
|
"epoch": 0.5021817346704432,
|
|
"grad_norm": 1.1231392621994019,
|
|
"learning_rate": 9.323696111422921e-06,
|
|
"loss": 0.6480292797088623,
|
|
"memory(GiB)": 32.12,
|
|
"step": 410,
|
|
"token_acc": 0.8032305992609242,
|
|
"train_speed(iter/s)": 0.125122
|
|
},
|
|
{
|
|
"epoch": 0.5083059021664242,
|
|
"grad_norm": 1.007530689239502,
|
|
"learning_rate": 9.307494320933893e-06,
|
|
"loss": 0.5892566204071045,
|
|
"memory(GiB)": 32.12,
|
|
"step": 415,
|
|
"token_acc": 0.8197306397306398,
|
|
"train_speed(iter/s)": 0.12542
|
|
},
|
|
{
|
|
"epoch": 0.5144300696624052,
|
|
"grad_norm": 1.0489633083343506,
|
|
"learning_rate": 9.291115176786814e-06,
|
|
"loss": 0.604928731918335,
|
|
"memory(GiB)": 32.12,
|
|
"step": 420,
|
|
"token_acc": 0.8093353125515421,
|
|
"train_speed(iter/s)": 0.125868
|
|
},
|
|
{
|
|
"epoch": 0.5144300696624052,
|
|
"eval_loss": 0.6283579468727112,
|
|
"eval_runtime": 30.1281,
|
|
"eval_samples_per_second": 17.492,
|
|
"eval_steps_per_second": 4.381,
|
|
"eval_token_acc": 0.8228782478433804,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.5205542371583863,
|
|
"grad_norm": 1.126206636428833,
|
|
"learning_rate": 9.274559353364734e-06,
|
|
"loss": 0.6561414718627929,
|
|
"memory(GiB)": 32.12,
|
|
"step": 425,
|
|
"token_acc": 0.8186422368439512,
|
|
"train_speed(iter/s)": 0.124654
|
|
},
|
|
{
|
|
"epoch": 0.5266784046543673,
|
|
"grad_norm": 1.1023788452148438,
|
|
"learning_rate": 9.257827532325159e-06,
|
|
"loss": 0.6316391944885253,
|
|
"memory(GiB)": 32.12,
|
|
"step": 430,
|
|
"token_acc": 0.8269044804985495,
|
|
"train_speed(iter/s)": 0.125027
|
|
},
|
|
{
|
|
"epoch": 0.5328025721503483,
|
|
"grad_norm": 1.0733466148376465,
|
|
"learning_rate": 9.240920402571995e-06,
|
|
"loss": 0.6313365936279297,
|
|
"memory(GiB)": 32.12,
|
|
"step": 435,
|
|
"token_acc": 0.7986597170513775,
|
|
"train_speed(iter/s)": 0.125308
|
|
},
|
|
{
|
|
"epoch": 0.5389267396463293,
|
|
"grad_norm": 1.0384747982025146,
|
|
"learning_rate": 9.223838660227183e-06,
|
|
"loss": 0.5966384410858154,
|
|
"memory(GiB)": 32.12,
|
|
"step": 440,
|
|
"token_acc": 0.8461405390443899,
|
|
"train_speed(iter/s)": 0.125729
|
|
},
|
|
{
|
|
"epoch": 0.5389267396463293,
|
|
"eval_loss": 0.627018392086029,
|
|
"eval_runtime": 30.1938,
|
|
"eval_samples_per_second": 17.454,
|
|
"eval_steps_per_second": 4.372,
|
|
"eval_token_acc": 0.8226509633762075,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.5450509071423103,
|
|
"grad_norm": 0.9662637710571289,
|
|
"learning_rate": 9.206583008602039e-06,
|
|
"loss": 0.6196205615997314,
|
|
"memory(GiB)": 32.12,
|
|
"step": 445,
|
|
"token_acc": 0.814478517479054,
|
|
"train_speed(iter/s)": 0.124648
|
|
},
|
|
{
|
|
"epoch": 0.5511750746382914,
|
|
"grad_norm": 1.0670920610427856,
|
|
"learning_rate": 9.189154158168293e-06,
|
|
"loss": 0.6371576309204101,
|
|
"memory(GiB)": 32.12,
|
|
"step": 450,
|
|
"token_acc": 0.8166518012952263,
|
|
"train_speed(iter/s)": 0.125033
|
|
},
|
|
{
|
|
"epoch": 0.5572992421342724,
|
|
"grad_norm": 0.9831814765930176,
|
|
"learning_rate": 9.171552826528832e-06,
|
|
"loss": 0.549981689453125,
|
|
"memory(GiB)": 32.12,
|
|
"step": 455,
|
|
"token_acc": 0.8483598990707121,
|
|
"train_speed(iter/s)": 0.125327
|
|
},
|
|
{
|
|
"epoch": 0.5634234096302534,
|
|
"grad_norm": 1.0345466136932373,
|
|
"learning_rate": 9.15377973838817e-06,
|
|
"loss": 0.6600610733032226,
|
|
"memory(GiB)": 32.12,
|
|
"step": 460,
|
|
"token_acc": 0.8050162396246843,
|
|
"train_speed(iter/s)": 0.125742
|
|
},
|
|
{
|
|
"epoch": 0.5634234096302534,
|
|
"eval_loss": 0.6251854300498962,
|
|
"eval_runtime": 29.9543,
|
|
"eval_samples_per_second": 17.593,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.823002221189111,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.5695475771262344,
|
|
"grad_norm": 0.9955180287361145,
|
|
"learning_rate": 9.135835625522585e-06,
|
|
"loss": 0.5820852279663086,
|
|
"memory(GiB)": 32.12,
|
|
"step": 465,
|
|
"token_acc": 0.8245337424184316,
|
|
"train_speed(iter/s)": 0.124745
|
|
},
|
|
{
|
|
"epoch": 0.5756717446222154,
|
|
"grad_norm": 1.0886976718902588,
|
|
"learning_rate": 9.117721226750019e-06,
|
|
"loss": 0.6092466354370117,
|
|
"memory(GiB)": 32.12,
|
|
"step": 470,
|
|
"token_acc": 0.8014575318141463,
|
|
"train_speed(iter/s)": 0.125111
|
|
},
|
|
{
|
|
"epoch": 0.5817959121181965,
|
|
"grad_norm": 0.9607824087142944,
|
|
"learning_rate": 9.099437287899634e-06,
|
|
"loss": 0.6091058731079102,
|
|
"memory(GiB)": 32.12,
|
|
"step": 475,
|
|
"token_acc": 0.8267922900839333,
|
|
"train_speed(iter/s)": 0.125396
|
|
},
|
|
{
|
|
"epoch": 0.5879200796141775,
|
|
"grad_norm": 1.0514999628067017,
|
|
"learning_rate": 9.08098456178111e-06,
|
|
"loss": 0.620454978942871,
|
|
"memory(GiB)": 32.12,
|
|
"step": 480,
|
|
"token_acc": 0.8148612218551847,
|
|
"train_speed(iter/s)": 0.125738
|
|
},
|
|
{
|
|
"epoch": 0.5879200796141775,
|
|
"eval_loss": 0.6268740296363831,
|
|
"eval_runtime": 29.9706,
|
|
"eval_samples_per_second": 17.584,
|
|
"eval_steps_per_second": 4.404,
|
|
"eval_token_acc": 0.8231468567591301,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.5940442471101585,
|
|
"grad_norm": 1.007743239402771,
|
|
"learning_rate": 9.06236380815366e-06,
|
|
"loss": 0.6164707183837891,
|
|
"memory(GiB)": 32.12,
|
|
"step": 485,
|
|
"token_acc": 0.8169734151329243,
|
|
"train_speed(iter/s)": 0.124816
|
|
},
|
|
{
|
|
"epoch": 0.6001684146061395,
|
|
"grad_norm": 1.0523988008499146,
|
|
"learning_rate": 9.043575793694733e-06,
|
|
"loss": 0.6281998157501221,
|
|
"memory(GiB)": 32.12,
|
|
"step": 490,
|
|
"token_acc": 0.8149852592219745,
|
|
"train_speed(iter/s)": 0.125171
|
|
},
|
|
{
|
|
"epoch": 0.6062925821021204,
|
|
"grad_norm": 0.9333862066268921,
|
|
"learning_rate": 9.024621291968461e-06,
|
|
"loss": 0.6068775653839111,
|
|
"memory(GiB)": 32.12,
|
|
"step": 495,
|
|
"token_acc": 0.8320725141416206,
|
|
"train_speed(iter/s)": 0.125492
|
|
},
|
|
{
|
|
"epoch": 0.6124167495981016,
|
|
"grad_norm": 1.0344208478927612,
|
|
"learning_rate": 9.005501083393799e-06,
|
|
"loss": 0.6447205543518066,
|
|
"memory(GiB)": 32.12,
|
|
"step": 500,
|
|
"token_acc": 0.8103164010363508,
|
|
"train_speed(iter/s)": 0.125823
|
|
},
|
|
{
|
|
"epoch": 0.6124167495981016,
|
|
"eval_loss": 0.6257410645484924,
|
|
"eval_runtime": 30.1552,
|
|
"eval_samples_per_second": 17.476,
|
|
"eval_steps_per_second": 4.377,
|
|
"eval_token_acc": 0.82304871119376,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.6185409170940825,
|
|
"grad_norm": 1.125075340270996,
|
|
"learning_rate": 8.986215955212394e-06,
|
|
"loss": 0.6383331775665283,
|
|
"memory(GiB)": 32.12,
|
|
"step": 505,
|
|
"token_acc": 0.8156163386432991,
|
|
"train_speed(iter/s)": 0.124876
|
|
},
|
|
{
|
|
"epoch": 0.6246650845900635,
|
|
"grad_norm": 1.0361703634262085,
|
|
"learning_rate": 8.966766701456177e-06,
|
|
"loss": 0.6330353736877441,
|
|
"memory(GiB)": 32.12,
|
|
"step": 510,
|
|
"token_acc": 0.7972571857974826,
|
|
"train_speed(iter/s)": 0.125207
|
|
},
|
|
{
|
|
"epoch": 0.6307892520860445,
|
|
"grad_norm": 0.9697835445404053,
|
|
"learning_rate": 8.947154122914666e-06,
|
|
"loss": 0.6002368927001953,
|
|
"memory(GiB)": 32.12,
|
|
"step": 515,
|
|
"token_acc": 0.8171989033478781,
|
|
"train_speed(iter/s)": 0.12553
|
|
},
|
|
{
|
|
"epoch": 0.6369134195820255,
|
|
"grad_norm": 1.0072293281555176,
|
|
"learning_rate": 8.927379027101994e-06,
|
|
"loss": 0.6289110660552979,
|
|
"memory(GiB)": 32.12,
|
|
"step": 520,
|
|
"token_acc": 0.8173692196055149,
|
|
"train_speed(iter/s)": 0.125831
|
|
},
|
|
{
|
|
"epoch": 0.6369134195820255,
|
|
"eval_loss": 0.6232544183731079,
|
|
"eval_runtime": 30.2166,
|
|
"eval_samples_per_second": 17.441,
|
|
"eval_steps_per_second": 4.368,
|
|
"eval_token_acc": 0.8239630146185237,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.6430375870780066,
|
|
"grad_norm": 1.0802470445632935,
|
|
"learning_rate": 8.907442228223668e-06,
|
|
"loss": 0.6614001274108887,
|
|
"memory(GiB)": 32.12,
|
|
"step": 525,
|
|
"token_acc": 0.8152090832024681,
|
|
"train_speed(iter/s)": 0.12495
|
|
},
|
|
{
|
|
"epoch": 0.6491617545739876,
|
|
"grad_norm": 0.9386376142501831,
|
|
"learning_rate": 8.887344547143032e-06,
|
|
"loss": 0.6672756195068359,
|
|
"memory(GiB)": 32.12,
|
|
"step": 530,
|
|
"token_acc": 0.8442938796480637,
|
|
"train_speed(iter/s)": 0.125255
|
|
},
|
|
{
|
|
"epoch": 0.6552859220699686,
|
|
"grad_norm": 0.8939440846443176,
|
|
"learning_rate": 8.867086811347483e-06,
|
|
"loss": 0.6040712356567383,
|
|
"memory(GiB)": 32.12,
|
|
"step": 535,
|
|
"token_acc": 0.8285196511496206,
|
|
"train_speed(iter/s)": 0.125541
|
|
},
|
|
{
|
|
"epoch": 0.6614100895659496,
|
|
"grad_norm": 0.9224595427513123,
|
|
"learning_rate": 8.846669854914395e-06,
|
|
"loss": 0.6459405899047852,
|
|
"memory(GiB)": 32.12,
|
|
"step": 540,
|
|
"token_acc": 0.8066893233285104,
|
|
"train_speed(iter/s)": 0.125873
|
|
},
|
|
{
|
|
"epoch": 0.6614100895659496,
|
|
"eval_loss": 0.6218589544296265,
|
|
"eval_runtime": 30.088,
|
|
"eval_samples_per_second": 17.515,
|
|
"eval_steps_per_second": 4.387,
|
|
"eval_token_acc": 0.8244330802210857,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6675342570619306,
|
|
"grad_norm": 1.198673129081726,
|
|
"learning_rate": 8.826094518476775e-06,
|
|
"loss": 0.6059948921203613,
|
|
"memory(GiB)": 32.12,
|
|
"step": 545,
|
|
"token_acc": 0.8176160233107487,
|
|
"train_speed(iter/s)": 0.125029
|
|
},
|
|
{
|
|
"epoch": 0.6736584245579117,
|
|
"grad_norm": 0.9881957769393921,
|
|
"learning_rate": 8.805361649188657e-06,
|
|
"loss": 0.5907226085662842,
|
|
"memory(GiB)": 32.12,
|
|
"step": 550,
|
|
"token_acc": 0.8072239136451702,
|
|
"train_speed(iter/s)": 0.125389
|
|
},
|
|
{
|
|
"epoch": 0.6797825920538927,
|
|
"grad_norm": 1.027031660079956,
|
|
"learning_rate": 8.784472100690215e-06,
|
|
"loss": 0.6389594554901123,
|
|
"memory(GiB)": 32.12,
|
|
"step": 555,
|
|
"token_acc": 0.8127651442767055,
|
|
"train_speed(iter/s)": 0.125703
|
|
},
|
|
{
|
|
"epoch": 0.6859067595498737,
|
|
"grad_norm": 1.0879848003387451,
|
|
"learning_rate": 8.763426733072624e-06,
|
|
"loss": 0.6162051200866699,
|
|
"memory(GiB)": 32.12,
|
|
"step": 560,
|
|
"token_acc": 0.8103369683368982,
|
|
"train_speed(iter/s)": 0.125956
|
|
},
|
|
{
|
|
"epoch": 0.6859067595498737,
|
|
"eval_loss": 0.6215130686759949,
|
|
"eval_runtime": 30.1587,
|
|
"eval_samples_per_second": 17.474,
|
|
"eval_steps_per_second": 4.377,
|
|
"eval_token_acc": 0.8242212924221293,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.6920309270458547,
|
|
"grad_norm": 0.9450560808181763,
|
|
"learning_rate": 8.742226412842636e-06,
|
|
"loss": 0.6049357414245605,
|
|
"memory(GiB)": 32.12,
|
|
"step": 565,
|
|
"token_acc": 0.828604110069801,
|
|
"train_speed(iter/s)": 0.12514
|
|
},
|
|
{
|
|
"epoch": 0.6981550945418358,
|
|
"grad_norm": 1.0816230773925781,
|
|
"learning_rate": 8.720872012886918e-06,
|
|
"loss": 0.6060591697692871,
|
|
"memory(GiB)": 32.12,
|
|
"step": 570,
|
|
"token_acc": 0.8121618953603159,
|
|
"train_speed(iter/s)": 0.125424
|
|
},
|
|
{
|
|
"epoch": 0.7042792620378168,
|
|
"grad_norm": 1.0539354085922241,
|
|
"learning_rate": 8.6993644124361e-06,
|
|
"loss": 0.6078558921813965,
|
|
"memory(GiB)": 32.12,
|
|
"step": 575,
|
|
"token_acc": 0.8203018867924529,
|
|
"train_speed(iter/s)": 0.125669
|
|
},
|
|
{
|
|
"epoch": 0.7104034295337978,
|
|
"grad_norm": 0.9692308306694031,
|
|
"learning_rate": 8.677704497028579e-06,
|
|
"loss": 0.6092854976654053,
|
|
"memory(GiB)": 32.12,
|
|
"step": 580,
|
|
"token_acc": 0.813650025657943,
|
|
"train_speed(iter/s)": 0.125955
|
|
},
|
|
{
|
|
"epoch": 0.7104034295337978,
|
|
"eval_loss": 0.6210135817527771,
|
|
"eval_runtime": 30.0497,
|
|
"eval_samples_per_second": 17.538,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8236375845859807,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.7165275970297788,
|
|
"grad_norm": 1.0405429601669312,
|
|
"learning_rate": 8.655893158474056e-06,
|
|
"loss": 0.626552963256836,
|
|
"memory(GiB)": 32.12,
|
|
"step": 585,
|
|
"token_acc": 0.8233875988502245,
|
|
"train_speed(iter/s)": 0.12517
|
|
},
|
|
{
|
|
"epoch": 0.7226517645257597,
|
|
"grad_norm": 0.9403768181800842,
|
|
"learning_rate": 8.633931294816823e-06,
|
|
"loss": 0.6014822483062744,
|
|
"memory(GiB)": 32.12,
|
|
"step": 590,
|
|
"token_acc": 0.8108660890260682,
|
|
"train_speed(iter/s)": 0.125469
|
|
},
|
|
{
|
|
"epoch": 0.7287759320217408,
|
|
"grad_norm": 0.8821709752082825,
|
|
"learning_rate": 8.611819810298778e-06,
|
|
"loss": 0.6129745483398438,
|
|
"memory(GiB)": 32.12,
|
|
"step": 595,
|
|
"token_acc": 0.8298729368614095,
|
|
"train_speed(iter/s)": 0.125735
|
|
},
|
|
{
|
|
"epoch": 0.7349000995177218,
|
|
"grad_norm": 0.9912955164909363,
|
|
"learning_rate": 8.58955961532221e-06,
|
|
"loss": 0.5840856075286865,
|
|
"memory(GiB)": 32.12,
|
|
"step": 600,
|
|
"token_acc": 0.8248710481069511,
|
|
"train_speed(iter/s)": 0.125926
|
|
},
|
|
{
|
|
"epoch": 0.7349000995177218,
|
|
"eval_loss": 0.6222088932991028,
|
|
"eval_runtime": 30.0444,
|
|
"eval_samples_per_second": 17.541,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8234412934552404,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.7410242670137028,
|
|
"grad_norm": 1.037676453590393,
|
|
"learning_rate": 8.567151626412295e-06,
|
|
"loss": 0.6406550884246827,
|
|
"memory(GiB)": 32.12,
|
|
"step": 605,
|
|
"token_acc": 0.8165861175316943,
|
|
"train_speed(iter/s)": 0.125193
|
|
},
|
|
{
|
|
"epoch": 0.7471484345096838,
|
|
"grad_norm": 1.0239970684051514,
|
|
"learning_rate": 8.544596766179377e-06,
|
|
"loss": 0.5927177429199219,
|
|
"memory(GiB)": 32.12,
|
|
"step": 610,
|
|
"token_acc": 0.8224873999407056,
|
|
"train_speed(iter/s)": 0.125437
|
|
},
|
|
{
|
|
"epoch": 0.7532726020056648,
|
|
"grad_norm": 1.0227631330490112,
|
|
"learning_rate": 8.521895963280967e-06,
|
|
"loss": 0.5963564395904541,
|
|
"memory(GiB)": 32.12,
|
|
"step": 615,
|
|
"token_acc": 0.827058931465794,
|
|
"train_speed(iter/s)": 0.125665
|
|
},
|
|
{
|
|
"epoch": 0.7593967695016459,
|
|
"grad_norm": 1.0962486267089844,
|
|
"learning_rate": 8.499050152383519e-06,
|
|
"loss": 0.6459769248962403,
|
|
"memory(GiB)": 32.12,
|
|
"step": 620,
|
|
"token_acc": 0.8193405375450263,
|
|
"train_speed(iter/s)": 0.125922
|
|
},
|
|
{
|
|
"epoch": 0.7593967695016459,
|
|
"eval_loss": 0.6196050643920898,
|
|
"eval_runtime": 29.9245,
|
|
"eval_samples_per_second": 17.611,
|
|
"eval_steps_per_second": 4.411,
|
|
"eval_token_acc": 0.8242987757632109,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.7655209369976269,
|
|
"grad_norm": 1.0802922248840332,
|
|
"learning_rate": 8.476060274123938e-06,
|
|
"loss": 0.5952530860900879,
|
|
"memory(GiB)": 32.12,
|
|
"step": 625,
|
|
"token_acc": 0.8222949637294201,
|
|
"train_speed(iter/s)": 0.125113
|
|
},
|
|
{
|
|
"epoch": 0.7716451044936079,
|
|
"grad_norm": 1.0066949129104614,
|
|
"learning_rate": 8.452927275070858e-06,
|
|
"loss": 0.6043106079101562,
|
|
"memory(GiB)": 32.12,
|
|
"step": 630,
|
|
"token_acc": 0.8247297031649302,
|
|
"train_speed(iter/s)": 0.125399
|
|
},
|
|
{
|
|
"epoch": 0.7777692719895889,
|
|
"grad_norm": 0.9393348693847656,
|
|
"learning_rate": 8.429652107685662e-06,
|
|
"loss": 0.633615779876709,
|
|
"memory(GiB)": 32.12,
|
|
"step": 635,
|
|
"token_acc": 0.8054982337099087,
|
|
"train_speed(iter/s)": 0.125629
|
|
},
|
|
{
|
|
"epoch": 0.7838934394855699,
|
|
"grad_norm": 1.0759190320968628,
|
|
"learning_rate": 8.40623573028327e-06,
|
|
"loss": 0.6218441009521485,
|
|
"memory(GiB)": 32.12,
|
|
"step": 640,
|
|
"token_acc": 0.8195294533875818,
|
|
"train_speed(iter/s)": 0.125911
|
|
},
|
|
{
|
|
"epoch": 0.7838934394855699,
|
|
"eval_loss": 0.6197024583816528,
|
|
"eval_runtime": 29.9919,
|
|
"eval_samples_per_second": 17.571,
|
|
"eval_steps_per_second": 4.401,
|
|
"eval_token_acc": 0.8242006301978408,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.790017606981551,
|
|
"grad_norm": 0.8828935623168945,
|
|
"learning_rate": 8.382679106992687e-06,
|
|
"loss": 0.6121187210083008,
|
|
"memory(GiB)": 32.12,
|
|
"step": 645,
|
|
"token_acc": 0.8180191693290735,
|
|
"train_speed(iter/s)": 0.125215
|
|
},
|
|
{
|
|
"epoch": 0.796141774477532,
|
|
"grad_norm": 1.0172326564788818,
|
|
"learning_rate": 8.358983207717286e-06,
|
|
"loss": 0.6195911407470703,
|
|
"memory(GiB)": 32.12,
|
|
"step": 650,
|
|
"token_acc": 0.802275960170697,
|
|
"train_speed(iter/s)": 0.125435
|
|
},
|
|
{
|
|
"epoch": 0.802265941973513,
|
|
"grad_norm": 1.0442404747009277,
|
|
"learning_rate": 8.335149008094906e-06,
|
|
"loss": 0.5969693660736084,
|
|
"memory(GiB)": 32.12,
|
|
"step": 655,
|
|
"token_acc": 0.8279867846104657,
|
|
"train_speed(iter/s)": 0.125647
|
|
},
|
|
{
|
|
"epoch": 0.808390109469494,
|
|
"grad_norm": 0.9861543774604797,
|
|
"learning_rate": 8.311177489457653e-06,
|
|
"loss": 0.6027172088623047,
|
|
"memory(GiB)": 32.12,
|
|
"step": 660,
|
|
"token_acc": 0.8264142409459503,
|
|
"train_speed(iter/s)": 0.125901
|
|
},
|
|
{
|
|
"epoch": 0.808390109469494,
|
|
"eval_loss": 0.6189049482345581,
|
|
"eval_runtime": 30.0122,
|
|
"eval_samples_per_second": 17.56,
|
|
"eval_steps_per_second": 4.398,
|
|
"eval_token_acc": 0.8247378480293404,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.814514276965475,
|
|
"grad_norm": 0.9293588399887085,
|
|
"learning_rate": 8.28706963879151e-06,
|
|
"loss": 0.5741694927215576,
|
|
"memory(GiB)": 32.12,
|
|
"step": 665,
|
|
"token_acc": 0.8175586677777479,
|
|
"train_speed(iter/s)": 0.125199
|
|
},
|
|
{
|
|
"epoch": 0.8206384444614561,
|
|
"grad_norm": 0.8590324521064758,
|
|
"learning_rate": 8.2628264486957e-06,
|
|
"loss": 0.6139655113220215,
|
|
"memory(GiB)": 32.12,
|
|
"step": 670,
|
|
"token_acc": 0.8059371841425523,
|
|
"train_speed(iter/s)": 0.125416
|
|
},
|
|
{
|
|
"epoch": 0.826762611957437,
|
|
"grad_norm": 1.0524649620056152,
|
|
"learning_rate": 8.23844891734181e-06,
|
|
"loss": 0.5746917724609375,
|
|
"memory(GiB)": 32.12,
|
|
"step": 675,
|
|
"token_acc": 0.8064527770760402,
|
|
"train_speed(iter/s)": 0.125689
|
|
},
|
|
{
|
|
"epoch": 0.832886779453418,
|
|
"grad_norm": 1.0562033653259277,
|
|
"learning_rate": 8.213938048432697e-06,
|
|
"loss": 0.5949201583862305,
|
|
"memory(GiB)": 32.12,
|
|
"step": 680,
|
|
"token_acc": 0.8262676641729011,
|
|
"train_speed(iter/s)": 0.125943
|
|
},
|
|
{
|
|
"epoch": 0.832886779453418,
|
|
"eval_loss": 0.6181398630142212,
|
|
"eval_runtime": 29.812,
|
|
"eval_samples_per_second": 17.677,
|
|
"eval_steps_per_second": 4.428,
|
|
"eval_token_acc": 0.8249702980525854,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.839010946949399,
|
|
"grad_norm": 0.8997740149497986,
|
|
"learning_rate": 8.189294851161164e-06,
|
|
"loss": 0.6027894496917725,
|
|
"memory(GiB)": 32.12,
|
|
"step": 685,
|
|
"token_acc": 0.8181957698532284,
|
|
"train_speed(iter/s)": 0.125279
|
|
},
|
|
{
|
|
"epoch": 0.84513511444538,
|
|
"grad_norm": 0.9756171703338623,
|
|
"learning_rate": 8.164520340168404e-06,
|
|
"loss": 0.6199028015136718,
|
|
"memory(GiB)": 32.12,
|
|
"step": 690,
|
|
"token_acc": 0.816996805111821,
|
|
"train_speed(iter/s)": 0.125482
|
|
},
|
|
{
|
|
"epoch": 0.8512592819413611,
|
|
"grad_norm": 1.0920320749282837,
|
|
"learning_rate": 8.139615535502227e-06,
|
|
"loss": 0.6447176933288574,
|
|
"memory(GiB)": 32.12,
|
|
"step": 695,
|
|
"token_acc": 0.8085919407132932,
|
|
"train_speed(iter/s)": 0.125747
|
|
},
|
|
{
|
|
"epoch": 0.8573834494373421,
|
|
"grad_norm": 0.9692983627319336,
|
|
"learning_rate": 8.114581462575063e-06,
|
|
"loss": 0.6160262107849122,
|
|
"memory(GiB)": 32.12,
|
|
"step": 700,
|
|
"token_acc": 0.8179973169137629,
|
|
"train_speed(iter/s)": 0.12598
|
|
},
|
|
{
|
|
"epoch": 0.8573834494373421,
|
|
"eval_loss": 0.6175059080123901,
|
|
"eval_runtime": 30.0657,
|
|
"eval_samples_per_second": 17.528,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.8250374502815228,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.8635076169333231,
|
|
"grad_norm": 0.9100112318992615,
|
|
"learning_rate": 8.089419152121736e-06,
|
|
"loss": 0.572049617767334,
|
|
"memory(GiB)": 32.12,
|
|
"step": 705,
|
|
"token_acc": 0.8311790668348046,
|
|
"train_speed(iter/s)": 0.125371
|
|
},
|
|
{
|
|
"epoch": 0.8696317844293041,
|
|
"grad_norm": 0.967882513999939,
|
|
"learning_rate": 8.064129640157033e-06,
|
|
"loss": 0.6320825576782226,
|
|
"memory(GiB)": 32.12,
|
|
"step": 710,
|
|
"token_acc": 0.8260635252113577,
|
|
"train_speed(iter/s)": 0.125628
|
|
},
|
|
{
|
|
"epoch": 0.8757559519252851,
|
|
"grad_norm": 0.9655548334121704,
|
|
"learning_rate": 8.038713967933043e-06,
|
|
"loss": 0.6211101055145264,
|
|
"memory(GiB)": 32.12,
|
|
"step": 715,
|
|
"token_acc": 0.818314430545932,
|
|
"train_speed(iter/s)": 0.125843
|
|
},
|
|
{
|
|
"epoch": 0.8818801194212662,
|
|
"grad_norm": 0.888866126537323,
|
|
"learning_rate": 8.013173181896283e-06,
|
|
"loss": 0.6331143379211426,
|
|
"memory(GiB)": 32.12,
|
|
"step": 720,
|
|
"token_acc": 0.8223672789139266,
|
|
"train_speed(iter/s)": 0.126057
|
|
},
|
|
{
|
|
"epoch": 0.8818801194212662,
|
|
"eval_loss": 0.6172851324081421,
|
|
"eval_runtime": 29.7983,
|
|
"eval_samples_per_second": 17.686,
|
|
"eval_steps_per_second": 4.43,
|
|
"eval_token_acc": 0.8249289736040085,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.8880042869172472,
|
|
"grad_norm": 0.9664549827575684,
|
|
"learning_rate": 7.98750833364462e-06,
|
|
"loss": 0.6372400760650635,
|
|
"memory(GiB)": 32.12,
|
|
"step": 725,
|
|
"token_acc": 0.8170195878334325,
|
|
"train_speed(iter/s)": 0.125415
|
|
},
|
|
{
|
|
"epoch": 0.8941284544132282,
|
|
"grad_norm": 1.000975251197815,
|
|
"learning_rate": 7.961720479883967e-06,
|
|
"loss": 0.5750507354736328,
|
|
"memory(GiB)": 32.12,
|
|
"step": 730,
|
|
"token_acc": 0.8278389461108779,
|
|
"train_speed(iter/s)": 0.125636
|
|
},
|
|
{
|
|
"epoch": 0.9002526219092092,
|
|
"grad_norm": 1.094359278678894,
|
|
"learning_rate": 7.935810682384777e-06,
|
|
"loss": 0.5872611045837403,
|
|
"memory(GiB)": 32.12,
|
|
"step": 735,
|
|
"token_acc": 0.8236509437265819,
|
|
"train_speed(iter/s)": 0.125855
|
|
},
|
|
{
|
|
"epoch": 0.9063767894051903,
|
|
"grad_norm": 0.9531553387641907,
|
|
"learning_rate": 7.909780007938327e-06,
|
|
"loss": 0.597745418548584,
|
|
"memory(GiB)": 32.12,
|
|
"step": 740,
|
|
"token_acc": 0.8098763707480617,
|
|
"train_speed(iter/s)": 0.12604
|
|
},
|
|
{
|
|
"epoch": 0.9063767894051903,
|
|
"eval_loss": 0.616245687007904,
|
|
"eval_runtime": 30.0544,
|
|
"eval_samples_per_second": 17.535,
|
|
"eval_steps_per_second": 4.392,
|
|
"eval_token_acc": 0.8252440725244072,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.9125009569011713,
|
|
"grad_norm": 1.0165457725524902,
|
|
"learning_rate": 7.883629528312794e-06,
|
|
"loss": 0.6201919555664063,
|
|
"memory(GiB)": 32.12,
|
|
"step": 745,
|
|
"token_acc": 0.8185958200091366,
|
|
"train_speed(iter/s)": 0.125398
|
|
},
|
|
{
|
|
"epoch": 0.9186251243971523,
|
|
"grad_norm": 1.1827678680419922,
|
|
"learning_rate": 7.857360320209126e-06,
|
|
"loss": 0.6491155624389648,
|
|
"memory(GiB)": 32.12,
|
|
"step": 750,
|
|
"token_acc": 0.8059945706742032,
|
|
"train_speed(iter/s)": 0.125601
|
|
},
|
|
{
|
|
"epoch": 0.9247492918931333,
|
|
"grad_norm": 1.0143615007400513,
|
|
"learning_rate": 7.830973465216712e-06,
|
|
"loss": 0.6207675933837891,
|
|
"memory(GiB)": 32.12,
|
|
"step": 755,
|
|
"token_acc": 0.8050835253456221,
|
|
"train_speed(iter/s)": 0.1258
|
|
},
|
|
{
|
|
"epoch": 0.9308734593891143,
|
|
"grad_norm": 0.929755449295044,
|
|
"learning_rate": 7.80447004976885e-06,
|
|
"loss": 0.5987899780273438,
|
|
"memory(GiB)": 32.12,
|
|
"step": 760,
|
|
"token_acc": 0.8344898639435169,
|
|
"train_speed(iter/s)": 0.126004
|
|
},
|
|
{
|
|
"epoch": 0.9308734593891143,
|
|
"eval_loss": 0.6154034733772278,
|
|
"eval_runtime": 29.9062,
|
|
"eval_samples_per_second": 17.622,
|
|
"eval_steps_per_second": 4.414,
|
|
"eval_token_acc": 0.8256728136783925,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.9369976268850954,
|
|
"grad_norm": 0.9682816863059998,
|
|
"learning_rate": 7.777851165098012e-06,
|
|
"loss": 0.6217115879058838,
|
|
"memory(GiB)": 32.12,
|
|
"step": 765,
|
|
"token_acc": 0.8253138075313807,
|
|
"train_speed(iter/s)": 0.125416
|
|
},
|
|
{
|
|
"epoch": 0.9431217943810764,
|
|
"grad_norm": 0.9861950278282166,
|
|
"learning_rate": 7.751117907190919e-06,
|
|
"loss": 0.6429153442382812,
|
|
"memory(GiB)": 32.12,
|
|
"step": 770,
|
|
"token_acc": 0.7975187624444785,
|
|
"train_speed(iter/s)": 0.125587
|
|
},
|
|
{
|
|
"epoch": 0.9492459618770573,
|
|
"grad_norm": 1.0501208305358887,
|
|
"learning_rate": 7.724271376743408e-06,
|
|
"loss": 0.6119437694549561,
|
|
"memory(GiB)": 32.12,
|
|
"step": 775,
|
|
"token_acc": 0.8173751624280676,
|
|
"train_speed(iter/s)": 0.125823
|
|
},
|
|
{
|
|
"epoch": 0.9553701293730383,
|
|
"grad_norm": 0.865284264087677,
|
|
"learning_rate": 7.697312679115126e-06,
|
|
"loss": 0.6217618465423584,
|
|
"memory(GiB)": 32.12,
|
|
"step": 780,
|
|
"token_acc": 0.8141066272272696,
|
|
"train_speed(iter/s)": 0.126064
|
|
},
|
|
{
|
|
"epoch": 0.9553701293730383,
|
|
"eval_loss": 0.616006076335907,
|
|
"eval_runtime": 29.936,
|
|
"eval_samples_per_second": 17.604,
|
|
"eval_steps_per_second": 4.409,
|
|
"eval_token_acc": 0.8253008936412005,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.9614942968690193,
|
|
"grad_norm": 0.9814106822013855,
|
|
"learning_rate": 7.670242924284e-06,
|
|
"loss": 0.6097393989562988,
|
|
"memory(GiB)": 32.12,
|
|
"step": 785,
|
|
"token_acc": 0.8181161935170403,
|
|
"train_speed(iter/s)": 0.125512
|
|
},
|
|
{
|
|
"epoch": 0.9676184643650004,
|
|
"grad_norm": 0.9246596693992615,
|
|
"learning_rate": 7.643063226800556e-06,
|
|
"loss": 0.5933025360107422,
|
|
"memory(GiB)": 32.12,
|
|
"step": 790,
|
|
"token_acc": 0.8117928174854171,
|
|
"train_speed(iter/s)": 0.125727
|
|
},
|
|
{
|
|
"epoch": 0.9737426318609814,
|
|
"grad_norm": 0.9575701951980591,
|
|
"learning_rate": 7.615774705742012e-06,
|
|
"loss": 0.6192699432373047,
|
|
"memory(GiB)": 32.12,
|
|
"step": 795,
|
|
"token_acc": 0.8193493150684932,
|
|
"train_speed(iter/s)": 0.125968
|
|
},
|
|
{
|
|
"epoch": 0.9798667993569624,
|
|
"grad_norm": 0.9669679403305054,
|
|
"learning_rate": 7.588378484666214e-06,
|
|
"loss": 0.5967386722564697,
|
|
"memory(GiB)": 32.12,
|
|
"step": 800,
|
|
"token_acc": 0.8374201589032559,
|
|
"train_speed(iter/s)": 0.126204
|
|
},
|
|
{
|
|
"epoch": 0.9798667993569624,
|
|
"eval_loss": 0.6141526699066162,
|
|
"eval_runtime": 29.9762,
|
|
"eval_samples_per_second": 17.581,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.825512681440157,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.9859909668529434,
|
|
"grad_norm": 1.0676608085632324,
|
|
"learning_rate": 7.560875691565366e-06,
|
|
"loss": 0.6506372451782226,
|
|
"memory(GiB)": 32.12,
|
|
"step": 805,
|
|
"token_acc": 0.8162111860741997,
|
|
"train_speed(iter/s)": 0.125683
|
|
},
|
|
{
|
|
"epoch": 0.9921151343489244,
|
|
"grad_norm": 1.1090500354766846,
|
|
"learning_rate": 7.533267458819597e-06,
|
|
"loss": 0.6549376487731934,
|
|
"memory(GiB)": 32.12,
|
|
"step": 810,
|
|
"token_acc": 0.7903715821453611,
|
|
"train_speed(iter/s)": 0.125882
|
|
},
|
|
{
|
|
"epoch": 0.9982393018449055,
|
|
"grad_norm": 1.0215500593185425,
|
|
"learning_rate": 7.505554923150329e-06,
|
|
"loss": 0.6107999324798584,
|
|
"memory(GiB)": 32.12,
|
|
"step": 815,
|
|
"token_acc": 0.8255743651753326,
|
|
"train_speed(iter/s)": 0.126031
|
|
},
|
|
{
|
|
"epoch": 1.0036745004975887,
|
|
"grad_norm": 0.9610656499862671,
|
|
"learning_rate": 7.477739225573475e-06,
|
|
"loss": 0.5486949920654297,
|
|
"memory(GiB)": 32.12,
|
|
"step": 820,
|
|
"token_acc": 0.8164336957325642,
|
|
"train_speed(iter/s)": 0.126279
|
|
},
|
|
{
|
|
"epoch": 1.0036745004975887,
|
|
"eval_loss": 0.6169166564941406,
|
|
"eval_runtime": 29.8644,
|
|
"eval_samples_per_second": 17.646,
|
|
"eval_steps_per_second": 4.42,
|
|
"eval_token_acc": 0.8263649981920553,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.0097986679935695,
|
|
"grad_norm": 0.8930652141571045,
|
|
"learning_rate": 7.449821511352465e-06,
|
|
"loss": 0.5580629348754883,
|
|
"memory(GiB)": 32.12,
|
|
"step": 825,
|
|
"token_acc": 0.8275424871864041,
|
|
"train_speed(iter/s)": 0.125684
|
|
},
|
|
{
|
|
"epoch": 1.0159228354895506,
|
|
"grad_norm": 1.043878197669983,
|
|
"learning_rate": 7.421802929951088e-06,
|
|
"loss": 0.537553071975708,
|
|
"memory(GiB)": 32.12,
|
|
"step": 830,
|
|
"token_acc": 0.8182638888888889,
|
|
"train_speed(iter/s)": 0.12587
|
|
},
|
|
{
|
|
"epoch": 1.0220470029855317,
|
|
"grad_norm": 0.9812543392181396,
|
|
"learning_rate": 7.393684634986165e-06,
|
|
"loss": 0.544792366027832,
|
|
"memory(GiB)": 32.12,
|
|
"step": 835,
|
|
"token_acc": 0.8522354565855342,
|
|
"train_speed(iter/s)": 0.12604
|
|
},
|
|
{
|
|
"epoch": 1.0281711704815126,
|
|
"grad_norm": 0.9970709085464478,
|
|
"learning_rate": 7.365467784180051e-06,
|
|
"loss": 0.5357254028320313,
|
|
"memory(GiB)": 32.12,
|
|
"step": 840,
|
|
"token_acc": 0.8398415604798027,
|
|
"train_speed(iter/s)": 0.126238
|
|
},
|
|
{
|
|
"epoch": 1.0281711704815126,
|
|
"eval_loss": 0.6195093393325806,
|
|
"eval_runtime": 30.0227,
|
|
"eval_samples_per_second": 17.553,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8253680458701379,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.0342953379774937,
|
|
"grad_norm": 0.9345646500587463,
|
|
"learning_rate": 7.337153539312968e-06,
|
|
"loss": 0.5476717948913574,
|
|
"memory(GiB)": 32.12,
|
|
"step": 845,
|
|
"token_acc": 0.8294938351719663,
|
|
"train_speed(iter/s)": 0.125697
|
|
},
|
|
{
|
|
"epoch": 1.0404195054734746,
|
|
"grad_norm": 0.9843780398368835,
|
|
"learning_rate": 7.308743066175172e-06,
|
|
"loss": 0.5259488105773926,
|
|
"memory(GiB)": 32.12,
|
|
"step": 850,
|
|
"token_acc": 0.8343461220380425,
|
|
"train_speed(iter/s)": 0.125882
|
|
},
|
|
{
|
|
"epoch": 1.0465436729694557,
|
|
"grad_norm": 0.9147416949272156,
|
|
"learning_rate": 7.280237534518948e-06,
|
|
"loss": 0.5354435443878174,
|
|
"memory(GiB)": 32.12,
|
|
"step": 855,
|
|
"token_acc": 0.8242645320363428,
|
|
"train_speed(iter/s)": 0.126046
|
|
},
|
|
{
|
|
"epoch": 1.0526678404654368,
|
|
"grad_norm": 0.9699310660362244,
|
|
"learning_rate": 7.251638118010456e-06,
|
|
"loss": 0.5579245567321778,
|
|
"memory(GiB)": 32.12,
|
|
"step": 860,
|
|
"token_acc": 0.8199693263596681,
|
|
"train_speed(iter/s)": 0.126211
|
|
},
|
|
{
|
|
"epoch": 1.0526678404654368,
|
|
"eval_loss": 0.6175369620323181,
|
|
"eval_runtime": 30.0015,
|
|
"eval_samples_per_second": 17.566,
|
|
"eval_steps_per_second": 4.4,
|
|
"eval_token_acc": 0.8255643370008782,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.0587920079614177,
|
|
"grad_norm": 0.9256744384765625,
|
|
"learning_rate": 7.222945994181403e-06,
|
|
"loss": 0.5566354751586914,
|
|
"memory(GiB)": 32.12,
|
|
"step": 865,
|
|
"token_acc": 0.8209406091511238,
|
|
"train_speed(iter/s)": 0.125682
|
|
},
|
|
{
|
|
"epoch": 1.0649161754573988,
|
|
"grad_norm": 0.9246125221252441,
|
|
"learning_rate": 7.194162344380561e-06,
|
|
"loss": 0.5399526596069336,
|
|
"memory(GiB)": 32.12,
|
|
"step": 870,
|
|
"token_acc": 0.8396965685046696,
|
|
"train_speed(iter/s)": 0.125842
|
|
},
|
|
{
|
|
"epoch": 1.0710403429533797,
|
|
"grad_norm": 0.9580938816070557,
|
|
"learning_rate": 7.16528835372512e-06,
|
|
"loss": 0.540044641494751,
|
|
"memory(GiB)": 32.12,
|
|
"step": 875,
|
|
"token_acc": 0.8386737552985128,
|
|
"train_speed(iter/s)": 0.126028
|
|
},
|
|
{
|
|
"epoch": 1.0771645104493608,
|
|
"grad_norm": 1.0862672328948975,
|
|
"learning_rate": 7.136325211051905e-06,
|
|
"loss": 0.5482538223266602,
|
|
"memory(GiB)": 32.12,
|
|
"step": 880,
|
|
"token_acc": 0.8172277019200394,
|
|
"train_speed(iter/s)": 0.126217
|
|
},
|
|
{
|
|
"epoch": 1.0771645104493608,
|
|
"eval_loss": 0.619035542011261,
|
|
"eval_runtime": 29.9689,
|
|
"eval_samples_per_second": 17.585,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8250477813936671,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.083288677945342,
|
|
"grad_norm": 0.8554603457450867,
|
|
"learning_rate": 7.107274108868422e-06,
|
|
"loss": 0.5296638965606689,
|
|
"memory(GiB)": 32.12,
|
|
"step": 885,
|
|
"token_acc": 0.8246657960985986,
|
|
"train_speed(iter/s)": 0.12573
|
|
},
|
|
{
|
|
"epoch": 1.0894128454413228,
|
|
"grad_norm": 0.9336580634117126,
|
|
"learning_rate": 7.078136243303754e-06,
|
|
"loss": 0.5232193946838379,
|
|
"memory(GiB)": 32.12,
|
|
"step": 890,
|
|
"token_acc": 0.8418666840594834,
|
|
"train_speed(iter/s)": 0.12589
|
|
},
|
|
{
|
|
"epoch": 1.0955370129373039,
|
|
"grad_norm": 0.9728440642356873,
|
|
"learning_rate": 7.048912814059321e-06,
|
|
"loss": 0.5442141056060791,
|
|
"memory(GiB)": 32.12,
|
|
"step": 895,
|
|
"token_acc": 0.8226529199606543,
|
|
"train_speed(iter/s)": 0.126084
|
|
},
|
|
{
|
|
"epoch": 1.1016611804332848,
|
|
"grad_norm": 1.0364502668380737,
|
|
"learning_rate": 7.019605024359475e-06,
|
|
"loss": 0.5461842536926269,
|
|
"memory(GiB)": 32.12,
|
|
"step": 900,
|
|
"token_acc": 0.8352232590995279,
|
|
"train_speed(iter/s)": 0.12626
|
|
},
|
|
{
|
|
"epoch": 1.1016611804332848,
|
|
"eval_loss": 0.6182094812393188,
|
|
"eval_runtime": 29.9539,
|
|
"eval_samples_per_second": 17.594,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8259620848184307,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.1077853479292659,
|
|
"grad_norm": 0.9474948644638062,
|
|
"learning_rate": 6.990214080901971e-06,
|
|
"loss": 0.5203993797302247,
|
|
"memory(GiB)": 32.12,
|
|
"step": 905,
|
|
"token_acc": 0.8326191860072181,
|
|
"train_speed(iter/s)": 0.125742
|
|
},
|
|
{
|
|
"epoch": 1.113909515425247,
|
|
"grad_norm": 0.9584360718727112,
|
|
"learning_rate": 6.9607411938082735e-06,
|
|
"loss": 0.5354339122772217,
|
|
"memory(GiB)": 32.12,
|
|
"step": 910,
|
|
"token_acc": 0.8325466311381804,
|
|
"train_speed(iter/s)": 0.125916
|
|
},
|
|
{
|
|
"epoch": 1.1200336829212278,
|
|
"grad_norm": 0.9902798533439636,
|
|
"learning_rate": 6.931187576573733e-06,
|
|
"loss": 0.531368637084961,
|
|
"memory(GiB)": 32.12,
|
|
"step": 915,
|
|
"token_acc": 0.8471662228984405,
|
|
"train_speed(iter/s)": 0.126087
|
|
},
|
|
{
|
|
"epoch": 1.126157850417209,
|
|
"grad_norm": 0.8779637217521667,
|
|
"learning_rate": 6.9015544460176296e-06,
|
|
"loss": 0.5314560890197754,
|
|
"memory(GiB)": 32.12,
|
|
"step": 920,
|
|
"token_acc": 0.8275991535258379,
|
|
"train_speed(iter/s)": 0.126272
|
|
},
|
|
{
|
|
"epoch": 1.126157850417209,
|
|
"eval_loss": 0.6172027587890625,
|
|
"eval_runtime": 29.9497,
|
|
"eval_samples_per_second": 17.596,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8254610258794359,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.1322820179131898,
|
|
"grad_norm": 1.0086554288864136,
|
|
"learning_rate": 6.87184302223306e-06,
|
|
"loss": 0.5486597061157227,
|
|
"memory(GiB)": 32.12,
|
|
"step": 925,
|
|
"token_acc": 0.8296143047140535,
|
|
"train_speed(iter/s)": 0.125778
|
|
},
|
|
{
|
|
"epoch": 1.138406185409171,
|
|
"grad_norm": 1.055482029914856,
|
|
"learning_rate": 6.842054528536717e-06,
|
|
"loss": 0.5004231452941894,
|
|
"memory(GiB)": 32.12,
|
|
"step": 930,
|
|
"token_acc": 0.8338052711827488,
|
|
"train_speed(iter/s)": 0.125965
|
|
},
|
|
{
|
|
"epoch": 1.144530352905152,
|
|
"grad_norm": 0.9732358455657959,
|
|
"learning_rate": 6.812190191418508e-06,
|
|
"loss": 0.528237771987915,
|
|
"memory(GiB)": 32.12,
|
|
"step": 935,
|
|
"token_acc": 0.8312384161752316,
|
|
"train_speed(iter/s)": 0.126125
|
|
},
|
|
{
|
|
"epoch": 1.150654520401133,
|
|
"grad_norm": 0.8922236561775208,
|
|
"learning_rate": 6.782251240491071e-06,
|
|
"loss": 0.5213536262512207,
|
|
"memory(GiB)": 32.12,
|
|
"step": 940,
|
|
"token_acc": 0.846323478740266,
|
|
"train_speed(iter/s)": 0.126299
|
|
},
|
|
{
|
|
"epoch": 1.150654520401133,
|
|
"eval_loss": 0.6174668073654175,
|
|
"eval_runtime": 29.9268,
|
|
"eval_samples_per_second": 17.61,
|
|
"eval_steps_per_second": 4.411,
|
|
"eval_token_acc": 0.8251975825197583,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.156778687897114,
|
|
"grad_norm": 1.022758960723877,
|
|
"learning_rate": 6.75223890843913e-06,
|
|
"loss": 0.5352741241455078,
|
|
"memory(GiB)": 32.12,
|
|
"step": 945,
|
|
"token_acc": 0.8363561308192181,
|
|
"train_speed(iter/s)": 0.12581
|
|
},
|
|
{
|
|
"epoch": 1.162902855393095,
|
|
"grad_norm": 0.9870294332504272,
|
|
"learning_rate": 6.722154430968755e-06,
|
|
"loss": 0.5349910259246826,
|
|
"memory(GiB)": 32.12,
|
|
"step": 950,
|
|
"token_acc": 0.8381542699724518,
|
|
"train_speed(iter/s)": 0.12598
|
|
},
|
|
{
|
|
"epoch": 1.169027022889076,
|
|
"grad_norm": 0.9218893051147461,
|
|
"learning_rate": 6.69199904675648e-06,
|
|
"loss": 0.5564836025238037,
|
|
"memory(GiB)": 32.12,
|
|
"step": 955,
|
|
"token_acc": 0.8276523535487679,
|
|
"train_speed(iter/s)": 0.12616
|
|
},
|
|
{
|
|
"epoch": 1.175151190385057,
|
|
"grad_norm": 0.9656640887260437,
|
|
"learning_rate": 6.6617739973982985e-06,
|
|
"loss": 0.505579948425293,
|
|
"memory(GiB)": 32.12,
|
|
"step": 960,
|
|
"token_acc": 0.8291429745838186,
|
|
"train_speed(iter/s)": 0.126306
|
|
},
|
|
{
|
|
"epoch": 1.175151190385057,
|
|
"eval_loss": 0.618171751499176,
|
|
"eval_runtime": 29.9871,
|
|
"eval_samples_per_second": 17.574,
|
|
"eval_steps_per_second": 4.402,
|
|
"eval_token_acc": 0.825326721421561,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.181275357881038,
|
|
"grad_norm": 0.9255744218826294,
|
|
"learning_rate": 6.631480527358552e-06,
|
|
"loss": 0.5494061946868897,
|
|
"memory(GiB)": 32.12,
|
|
"step": 965,
|
|
"token_acc": 0.828285929606163,
|
|
"train_speed(iter/s)": 0.125817
|
|
},
|
|
{
|
|
"epoch": 1.187399525377019,
|
|
"grad_norm": 0.8625167608261108,
|
|
"learning_rate": 6.601119883918677e-06,
|
|
"loss": 0.5405423164367675,
|
|
"memory(GiB)": 32.12,
|
|
"step": 970,
|
|
"token_acc": 0.8340152804432053,
|
|
"train_speed(iter/s)": 0.125981
|
|
},
|
|
{
|
|
"epoch": 1.1935236928730002,
|
|
"grad_norm": 0.9901431202888489,
|
|
"learning_rate": 6.570693317125868e-06,
|
|
"loss": 0.5540534019470215,
|
|
"memory(GiB)": 32.12,
|
|
"step": 975,
|
|
"token_acc": 0.8329805323246695,
|
|
"train_speed(iter/s)": 0.12611
|
|
},
|
|
{
|
|
"epoch": 1.199647860368981,
|
|
"grad_norm": 0.8996224403381348,
|
|
"learning_rate": 6.540202079741594e-06,
|
|
"loss": 0.5333957672119141,
|
|
"memory(GiB)": 32.12,
|
|
"step": 980,
|
|
"token_acc": 0.8473111291632819,
|
|
"train_speed(iter/s)": 0.126282
|
|
},
|
|
{
|
|
"epoch": 1.199647860368981,
|
|
"eval_loss": 0.6176728010177612,
|
|
"eval_runtime": 30.0203,
|
|
"eval_samples_per_second": 17.555,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8252853969729841,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.2057720278649622,
|
|
"grad_norm": 1.0513380765914917,
|
|
"learning_rate": 6.509647427190029e-06,
|
|
"loss": 0.5554468631744385,
|
|
"memory(GiB)": 32.12,
|
|
"step": 985,
|
|
"token_acc": 0.8163728888561104,
|
|
"train_speed(iter/s)": 0.125784
|
|
},
|
|
{
|
|
"epoch": 1.211896195360943,
|
|
"grad_norm": 0.9925962686538696,
|
|
"learning_rate": 6.4790306175063535e-06,
|
|
"loss": 0.5358247756958008,
|
|
"memory(GiB)": 32.12,
|
|
"step": 990,
|
|
"token_acc": 0.8373621787068276,
|
|
"train_speed(iter/s)": 0.125984
|
|
},
|
|
{
|
|
"epoch": 1.2180203628569242,
|
|
"grad_norm": 0.9856204390525818,
|
|
"learning_rate": 6.44835291128496e-06,
|
|
"loss": 0.544157600402832,
|
|
"memory(GiB)": 32.12,
|
|
"step": 995,
|
|
"token_acc": 0.8250445425672012,
|
|
"train_speed(iter/s)": 0.126117
|
|
},
|
|
{
|
|
"epoch": 1.224144530352905,
|
|
"grad_norm": 0.9970067739486694,
|
|
"learning_rate": 6.417615571627555e-06,
|
|
"loss": 0.5199033260345459,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1000,
|
|
"token_acc": 0.8412796162447737,
|
|
"train_speed(iter/s)": 0.126261
|
|
},
|
|
{
|
|
"epoch": 1.224144530352905,
|
|
"eval_loss": 0.6171393990516663,
|
|
"eval_runtime": 29.9792,
|
|
"eval_samples_per_second": 17.579,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.8254248669869312,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.2302686978488862,
|
|
"grad_norm": 0.978387176990509,
|
|
"learning_rate": 6.386819864091146e-06,
|
|
"loss": 0.5251027107238769,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1005,
|
|
"token_acc": 0.8299365231042249,
|
|
"train_speed(iter/s)": 0.125803
|
|
},
|
|
{
|
|
"epoch": 1.2363928653448673,
|
|
"grad_norm": 0.9339916706085205,
|
|
"learning_rate": 6.35596705663594e-06,
|
|
"loss": 0.566818380355835,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1010,
|
|
"token_acc": 0.8130651567649793,
|
|
"train_speed(iter/s)": 0.12596
|
|
},
|
|
{
|
|
"epoch": 1.2425170328408481,
|
|
"grad_norm": 0.9691733717918396,
|
|
"learning_rate": 6.325058419573131e-06,
|
|
"loss": 0.5325815200805664,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1015,
|
|
"token_acc": 0.838964083981669,
|
|
"train_speed(iter/s)": 0.126159
|
|
},
|
|
{
|
|
"epoch": 1.2486412003368292,
|
|
"grad_norm": 0.9045368432998657,
|
|
"learning_rate": 6.294095225512604e-06,
|
|
"loss": 0.5249390602111816,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1020,
|
|
"token_acc": 0.8380004706356944,
|
|
"train_speed(iter/s)": 0.126284
|
|
},
|
|
{
|
|
"epoch": 1.2486412003368292,
|
|
"eval_loss": 0.6158590316772461,
|
|
"eval_runtime": 30.0066,
|
|
"eval_samples_per_second": 17.563,
|
|
"eval_steps_per_second": 4.399,
|
|
"eval_token_acc": 0.8253318869776332,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.2547653678328103,
|
|
"grad_norm": 0.98622065782547,
|
|
"learning_rate": 6.263078749310534e-06,
|
|
"loss": 0.561451530456543,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1025,
|
|
"token_acc": 0.8262592270950934,
|
|
"train_speed(iter/s)": 0.125824
|
|
},
|
|
{
|
|
"epoch": 1.2608895353287912,
|
|
"grad_norm": 0.9515383243560791,
|
|
"learning_rate": 6.232010268016895e-06,
|
|
"loss": 0.5291833877563477,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1030,
|
|
"token_acc": 0.8373620599054125,
|
|
"train_speed(iter/s)": 0.125985
|
|
},
|
|
{
|
|
"epoch": 1.2670137028247723,
|
|
"grad_norm": 0.9982597827911377,
|
|
"learning_rate": 6.200891060822884e-06,
|
|
"loss": 0.577932071685791,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1035,
|
|
"token_acc": 0.8188697951090549,
|
|
"train_speed(iter/s)": 0.126139
|
|
},
|
|
{
|
|
"epoch": 1.2731378703207532,
|
|
"grad_norm": 1.0038230419158936,
|
|
"learning_rate": 6.169722409008244e-06,
|
|
"loss": 0.5776113986968994,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1040,
|
|
"token_acc": 0.8182007844446298,
|
|
"train_speed(iter/s)": 0.126292
|
|
},
|
|
{
|
|
"epoch": 1.2731378703207532,
|
|
"eval_loss": 0.6144587397575378,
|
|
"eval_runtime": 30.0744,
|
|
"eval_samples_per_second": 17.523,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8263133426313343,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.2792620378167343,
|
|
"grad_norm": 0.9570845365524292,
|
|
"learning_rate": 6.13850559588852e-06,
|
|
"loss": 0.5415801048278809,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1045,
|
|
"token_acc": 0.8337840538200226,
|
|
"train_speed(iter/s)": 0.125838
|
|
},
|
|
{
|
|
"epoch": 1.2853862053127152,
|
|
"grad_norm": 0.9676324725151062,
|
|
"learning_rate": 6.107241906762214e-06,
|
|
"loss": 0.5263193130493165,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1050,
|
|
"token_acc": 0.8450373289877591,
|
|
"train_speed(iter/s)": 0.125977
|
|
},
|
|
{
|
|
"epoch": 1.2915103728086963,
|
|
"grad_norm": 0.8747360110282898,
|
|
"learning_rate": 6.075932628857869e-06,
|
|
"loss": 0.5368072032928467,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1055,
|
|
"token_acc": 0.8371580206308865,
|
|
"train_speed(iter/s)": 0.126138
|
|
},
|
|
{
|
|
"epoch": 1.2976345403046774,
|
|
"grad_norm": 0.9015209674835205,
|
|
"learning_rate": 6.044579051281063e-06,
|
|
"loss": 0.4784068584442139,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1060,
|
|
"token_acc": 0.8673299195318215,
|
|
"train_speed(iter/s)": 0.126244
|
|
},
|
|
{
|
|
"epoch": 1.2976345403046774,
|
|
"eval_loss": 0.6154947280883789,
|
|
"eval_runtime": 30.068,
|
|
"eval_samples_per_second": 17.527,
|
|
"eval_steps_per_second": 4.39,
|
|
"eval_token_acc": 0.8261997003977478,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.3037587078006583,
|
|
"grad_norm": 0.9384099841117859,
|
|
"learning_rate": 6.013182464961341e-06,
|
|
"loss": 0.5346551418304444,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1065,
|
|
"token_acc": 0.8303387250508586,
|
|
"train_speed(iter/s)": 0.125804
|
|
},
|
|
{
|
|
"epoch": 1.3098828752966394,
|
|
"grad_norm": 0.8981488347053528,
|
|
"learning_rate": 5.981744162599057e-06,
|
|
"loss": 0.5211257934570312,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1070,
|
|
"token_acc": 0.8500360490266763,
|
|
"train_speed(iter/s)": 0.12593
|
|
},
|
|
{
|
|
"epoch": 1.3160070427926205,
|
|
"grad_norm": 0.8698239922523499,
|
|
"learning_rate": 5.9502654386121505e-06,
|
|
"loss": 0.5495285034179688,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1075,
|
|
"token_acc": 0.8446355346104413,
|
|
"train_speed(iter/s)": 0.12608
|
|
},
|
|
{
|
|
"epoch": 1.3221312102886014,
|
|
"grad_norm": 0.990492582321167,
|
|
"learning_rate": 5.918747589082853e-06,
|
|
"loss": 0.5515711307525635,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1080,
|
|
"token_acc": 0.8194262671996039,
|
|
"train_speed(iter/s)": 0.126234
|
|
},
|
|
{
|
|
"epoch": 1.3221312102886014,
|
|
"eval_loss": 0.6156888008117676,
|
|
"eval_runtime": 30.042,
|
|
"eval_samples_per_second": 17.542,
|
|
"eval_steps_per_second": 4.394,
|
|
"eval_token_acc": 0.8256831447905367,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.3282553777845825,
|
|
"grad_norm": 1.0377113819122314,
|
|
"learning_rate": 5.887191911704322e-06,
|
|
"loss": 0.5179418087005615,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1085,
|
|
"token_acc": 0.822986674391657,
|
|
"train_speed(iter/s)": 0.125752
|
|
},
|
|
{
|
|
"epoch": 1.3343795452805634,
|
|
"grad_norm": 1.1629189252853394,
|
|
"learning_rate": 5.855599705727212e-06,
|
|
"loss": 0.501689100265503,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1090,
|
|
"token_acc": 0.8511506930497481,
|
|
"train_speed(iter/s)": 0.125871
|
|
},
|
|
{
|
|
"epoch": 1.3405037127765445,
|
|
"grad_norm": 1.021088719367981,
|
|
"learning_rate": 5.823972271906177e-06,
|
|
"loss": 0.5111154556274414,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1095,
|
|
"token_acc": 0.827035490605428,
|
|
"train_speed(iter/s)": 0.126024
|
|
},
|
|
{
|
|
"epoch": 1.3466278802725253,
|
|
"grad_norm": 1.0440119504928589,
|
|
"learning_rate": 5.7923109124463264e-06,
|
|
"loss": 0.5382958889007569,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1100,
|
|
"token_acc": 0.8228407178911946,
|
|
"train_speed(iter/s)": 0.12617
|
|
},
|
|
{
|
|
"epoch": 1.3466278802725253,
|
|
"eval_loss": 0.6151137948036194,
|
|
"eval_runtime": 29.8862,
|
|
"eval_samples_per_second": 17.634,
|
|
"eval_steps_per_second": 4.417,
|
|
"eval_token_acc": 0.8264734748695697,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.3527520477685064,
|
|
"grad_norm": 1.0229501724243164,
|
|
"learning_rate": 5.760616930949584e-06,
|
|
"loss": 0.542177963256836,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1105,
|
|
"token_acc": 0.8330580493912673,
|
|
"train_speed(iter/s)": 0.125749
|
|
},
|
|
{
|
|
"epoch": 1.3588762152644875,
|
|
"grad_norm": 0.8944743871688843,
|
|
"learning_rate": 5.728891632361043e-06,
|
|
"loss": 0.5133552551269531,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1110,
|
|
"token_acc": 0.8286529928320973,
|
|
"train_speed(iter/s)": 0.125908
|
|
},
|
|
{
|
|
"epoch": 1.3650003827604684,
|
|
"grad_norm": 0.8976428508758545,
|
|
"learning_rate": 5.697136322915218e-06,
|
|
"loss": 0.5297269821166992,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1115,
|
|
"token_acc": 0.8331969608416131,
|
|
"train_speed(iter/s)": 0.126045
|
|
},
|
|
{
|
|
"epoch": 1.3711245502564495,
|
|
"grad_norm": 1.013662338256836,
|
|
"learning_rate": 5.66535231008227e-06,
|
|
"loss": 0.5449240684509278,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1120,
|
|
"token_acc": 0.8382301504022386,
|
|
"train_speed(iter/s)": 0.126181
|
|
},
|
|
{
|
|
"epoch": 1.3711245502564495,
|
|
"eval_loss": 0.6147744059562683,
|
|
"eval_runtime": 30.0501,
|
|
"eval_samples_per_second": 17.537,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8263081770752622,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.3772487177524306,
|
|
"grad_norm": 1.047998309135437,
|
|
"learning_rate": 5.63354090251417e-06,
|
|
"loss": 0.5496514320373536,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1125,
|
|
"token_acc": 0.8280883107068635,
|
|
"train_speed(iter/s)": 0.125763
|
|
},
|
|
{
|
|
"epoch": 1.3833728852484115,
|
|
"grad_norm": 0.995204508304596,
|
|
"learning_rate": 5.6017034099908245e-06,
|
|
"loss": 0.5459441184997559,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1130,
|
|
"token_acc": 0.8158013374408295,
|
|
"train_speed(iter/s)": 0.125925
|
|
},
|
|
{
|
|
"epoch": 1.3894970527443926,
|
|
"grad_norm": 0.9421271681785583,
|
|
"learning_rate": 5.569841143366141e-06,
|
|
"loss": 0.51002197265625,
|
|
"memory(GiB)": 34.49,
|
|
"step": 1135,
|
|
"token_acc": 0.8427027419120847,
|
|
"train_speed(iter/s)": 0.126047
|
|
},
|
|
{
|
|
"epoch": 1.3956212202403735,
|
|
"grad_norm": 0.9212712049484253,
|
|
"learning_rate": 5.537955414514058e-06,
|
|
"loss": 0.5343506813049317,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1140,
|
|
"token_acc": 0.8508435329143236,
|
|
"train_speed(iter/s)": 0.126151
|
|
},
|
|
{
|
|
"epoch": 1.3956212202403735,
|
|
"eval_loss": 0.6138430237770081,
|
|
"eval_runtime": 30.0787,
|
|
"eval_samples_per_second": 17.521,
|
|
"eval_steps_per_second": 4.388,
|
|
"eval_token_acc": 0.8258587736969885,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.4017453877363546,
|
|
"grad_norm": 0.960340142250061,
|
|
"learning_rate": 5.506047536274529e-06,
|
|
"loss": 0.537141227722168,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1145,
|
|
"token_acc": 0.825777386163379,
|
|
"train_speed(iter/s)": 0.125768
|
|
},
|
|
{
|
|
"epoch": 1.4078695552323355,
|
|
"grad_norm": 1.063237190246582,
|
|
"learning_rate": 5.474118822399476e-06,
|
|
"loss": 0.5870203018188477,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1150,
|
|
"token_acc": 0.8254607459004498,
|
|
"train_speed(iter/s)": 0.125933
|
|
},
|
|
{
|
|
"epoch": 1.4139937227283166,
|
|
"grad_norm": 0.8735440373420715,
|
|
"learning_rate": 5.442170587498684e-06,
|
|
"loss": 0.5143415451049804,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1155,
|
|
"token_acc": 0.8224962760245262,
|
|
"train_speed(iter/s)": 0.126052
|
|
},
|
|
{
|
|
"epoch": 1.4201178902242977,
|
|
"grad_norm": 0.8771001100540161,
|
|
"learning_rate": 5.41020414698569e-06,
|
|
"loss": 0.557903242111206,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1160,
|
|
"token_acc": 0.8381047381546135,
|
|
"train_speed(iter/s)": 0.126212
|
|
},
|
|
{
|
|
"epoch": 1.4201178902242977,
|
|
"eval_loss": 0.6127957701683044,
|
|
"eval_runtime": 30.0731,
|
|
"eval_samples_per_second": 17.524,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8258226148044837,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.4262420577202786,
|
|
"grad_norm": 0.9807034134864807,
|
|
"learning_rate": 5.378220817023609e-06,
|
|
"loss": 0.5510265350341796,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1165,
|
|
"token_acc": 0.8266740684199569,
|
|
"train_speed(iter/s)": 0.125818
|
|
},
|
|
{
|
|
"epoch": 1.4323662252162597,
|
|
"grad_norm": 0.9031324982643127,
|
|
"learning_rate": 5.346221914470959e-06,
|
|
"loss": 0.5285142421722412,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1170,
|
|
"token_acc": 0.8287966113464342,
|
|
"train_speed(iter/s)": 0.125945
|
|
},
|
|
{
|
|
"epoch": 1.4384903927122408,
|
|
"grad_norm": 0.9082944393157959,
|
|
"learning_rate": 5.314208756827425e-06,
|
|
"loss": 0.5313165664672852,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1175,
|
|
"token_acc": 0.8377556371263765,
|
|
"train_speed(iter/s)": 0.126106
|
|
},
|
|
{
|
|
"epoch": 1.4446145602082217,
|
|
"grad_norm": 0.9939496517181396,
|
|
"learning_rate": 5.282182662179623e-06,
|
|
"loss": 0.559614896774292,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1180,
|
|
"token_acc": 0.846643215328194,
|
|
"train_speed(iter/s)": 0.126246
|
|
},
|
|
{
|
|
"epoch": 1.4446145602082217,
|
|
"eval_loss": 0.6125648021697998,
|
|
"eval_runtime": 30.1611,
|
|
"eval_samples_per_second": 17.473,
|
|
"eval_steps_per_second": 4.376,
|
|
"eval_token_acc": 0.826158375949171,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.4507387277042028,
|
|
"grad_norm": 1.0398341417312622,
|
|
"learning_rate": 5.250144949146827e-06,
|
|
"loss": 0.5018705368041992,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1185,
|
|
"token_acc": 0.8302141481179431,
|
|
"train_speed(iter/s)": 0.125825
|
|
},
|
|
{
|
|
"epoch": 1.4568628952001839,
|
|
"grad_norm": 1.0717829465866089,
|
|
"learning_rate": 5.218096936826681e-06,
|
|
"loss": 0.543729591369629,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1190,
|
|
"token_acc": 0.8435318409753859,
|
|
"train_speed(iter/s)": 0.125983
|
|
},
|
|
{
|
|
"epoch": 1.4629870626961647,
|
|
"grad_norm": 0.9488953948020935,
|
|
"learning_rate": 5.186039944740882e-06,
|
|
"loss": 0.5498368740081787,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1195,
|
|
"token_acc": 0.8358378225120499,
|
|
"train_speed(iter/s)": 0.126113
|
|
},
|
|
{
|
|
"epoch": 1.4691112301921456,
|
|
"grad_norm": 1.010858416557312,
|
|
"learning_rate": 5.153975292780852e-06,
|
|
"loss": 0.5265066623687744,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1200,
|
|
"token_acc": 0.8429352241672207,
|
|
"train_speed(iter/s)": 0.126228
|
|
},
|
|
{
|
|
"epoch": 1.4691112301921456,
|
|
"eval_loss": 0.6125081181526184,
|
|
"eval_runtime": 29.9605,
|
|
"eval_samples_per_second": 17.59,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8266542693320936,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.4752353976881267,
|
|
"grad_norm": 0.8864910006523132,
|
|
"learning_rate": 5.1219043011534e-06,
|
|
"loss": 0.5261281967163086,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1205,
|
|
"token_acc": 0.8282765708814958,
|
|
"train_speed(iter/s)": 0.125837
|
|
},
|
|
{
|
|
"epoch": 1.4813595651841078,
|
|
"grad_norm": 0.9174733757972717,
|
|
"learning_rate": 5.089828290326354e-06,
|
|
"loss": 0.5531785964965821,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1210,
|
|
"token_acc": 0.8411006266657063,
|
|
"train_speed(iter/s)": 0.126012
|
|
},
|
|
{
|
|
"epoch": 1.4874837326800887,
|
|
"grad_norm": 0.9243429899215698,
|
|
"learning_rate": 5.057748580974204e-06,
|
|
"loss": 0.5176255702972412,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1215,
|
|
"token_acc": 0.8498275862068966,
|
|
"train_speed(iter/s)": 0.12615
|
|
},
|
|
{
|
|
"epoch": 1.4936079001760698,
|
|
"grad_norm": 0.9391066431999207,
|
|
"learning_rate": 5.0256664939237186e-06,
|
|
"loss": 0.5616118431091308,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1220,
|
|
"token_acc": 0.8409511815690051,
|
|
"train_speed(iter/s)": 0.126282
|
|
},
|
|
{
|
|
"epoch": 1.4936079001760698,
|
|
"eval_loss": 0.6114863157272339,
|
|
"eval_runtime": 29.9732,
|
|
"eval_samples_per_second": 17.582,
|
|
"eval_steps_per_second": 4.404,
|
|
"eval_token_acc": 0.8267007593367426,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.499732067672051,
|
|
"grad_norm": 0.8913131356239319,
|
|
"learning_rate": 4.99358335009956e-06,
|
|
"loss": 0.5003180027008056,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1225,
|
|
"token_acc": 0.8392863897119082,
|
|
"train_speed(iter/s)": 0.125908
|
|
},
|
|
{
|
|
"epoch": 1.5058562351680318,
|
|
"grad_norm": 0.9838159084320068,
|
|
"learning_rate": 4.961500470469908e-06,
|
|
"loss": 0.5151349067687988,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1230,
|
|
"token_acc": 0.8358106300867373,
|
|
"train_speed(iter/s)": 0.126051
|
|
},
|
|
{
|
|
"epoch": 1.511980402664013,
|
|
"grad_norm": 0.9471805095672607,
|
|
"learning_rate": 4.92941917599206e-06,
|
|
"loss": 0.5267168998718261,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1235,
|
|
"token_acc": 0.8325013676148797,
|
|
"train_speed(iter/s)": 0.126177
|
|
},
|
|
{
|
|
"epoch": 1.518104570159994,
|
|
"grad_norm": 0.9928951263427734,
|
|
"learning_rate": 4.8973407875580485e-06,
|
|
"loss": 0.5807061195373535,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1240,
|
|
"token_acc": 0.82605387834146,
|
|
"train_speed(iter/s)": 0.12635
|
|
},
|
|
{
|
|
"epoch": 1.518104570159994,
|
|
"eval_loss": 0.6120603084564209,
|
|
"eval_runtime": 30.0211,
|
|
"eval_samples_per_second": 17.554,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8262875148509737,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.5242287376559749,
|
|
"grad_norm": 0.9785681366920471,
|
|
"learning_rate": 4.8652666259402584e-06,
|
|
"loss": 0.5564475059509277,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1245,
|
|
"token_acc": 0.8232146560663671,
|
|
"train_speed(iter/s)": 0.125966
|
|
},
|
|
{
|
|
"epoch": 1.5303529051519558,
|
|
"grad_norm": 0.9484609365463257,
|
|
"learning_rate": 4.833198011737035e-06,
|
|
"loss": 0.5257096767425538,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1250,
|
|
"token_acc": 0.8338689740420272,
|
|
"train_speed(iter/s)": 0.126082
|
|
},
|
|
{
|
|
"epoch": 1.5364770726479369,
|
|
"grad_norm": 1.0170414447784424,
|
|
"learning_rate": 4.8011362653183245e-06,
|
|
"loss": 0.5458654403686524,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1255,
|
|
"token_acc": 0.8260180208051355,
|
|
"train_speed(iter/s)": 0.126209
|
|
},
|
|
{
|
|
"epoch": 1.542601240143918,
|
|
"grad_norm": 1.0465954542160034,
|
|
"learning_rate": 4.7690827067713035e-06,
|
|
"loss": 0.5092308998107911,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1260,
|
|
"token_acc": 0.8562170404727111,
|
|
"train_speed(iter/s)": 0.126341
|
|
},
|
|
{
|
|
"epoch": 1.542601240143918,
|
|
"eval_loss": 0.6123631596565247,
|
|
"eval_runtime": 29.8837,
|
|
"eval_samples_per_second": 17.635,
|
|
"eval_steps_per_second": 4.417,
|
|
"eval_token_acc": 0.8266542693320936,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.5487254076398989,
|
|
"grad_norm": 1.0032224655151367,
|
|
"learning_rate": 4.737038655846023e-06,
|
|
"loss": 0.5465664863586426,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1265,
|
|
"token_acc": 0.8245171081677705,
|
|
"train_speed(iter/s)": 0.125985
|
|
},
|
|
{
|
|
"epoch": 1.55484957513588,
|
|
"grad_norm": 1.0049303770065308,
|
|
"learning_rate": 4.70500543190108e-06,
|
|
"loss": 0.5189294338226318,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1270,
|
|
"token_acc": 0.8236330189048495,
|
|
"train_speed(iter/s)": 0.12608
|
|
},
|
|
{
|
|
"epoch": 1.560973742631861,
|
|
"grad_norm": 1.006712794303894,
|
|
"learning_rate": 4.672984353849285e-06,
|
|
"loss": 0.5561445236206055,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1275,
|
|
"token_acc": 0.8239827598801958,
|
|
"train_speed(iter/s)": 0.126214
|
|
},
|
|
{
|
|
"epoch": 1.567097910127842,
|
|
"grad_norm": 0.8475578427314758,
|
|
"learning_rate": 4.640976740103363e-06,
|
|
"loss": 0.5361814498901367,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1280,
|
|
"token_acc": 0.8350327247674819,
|
|
"train_speed(iter/s)": 0.126343
|
|
},
|
|
{
|
|
"epoch": 1.567097910127842,
|
|
"eval_loss": 0.6125593185424805,
|
|
"eval_runtime": 29.9427,
|
|
"eval_samples_per_second": 17.6,
|
|
"eval_steps_per_second": 4.408,
|
|
"eval_token_acc": 0.8268918849114107,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.573222077623823,
|
|
"grad_norm": 1.068233847618103,
|
|
"learning_rate": 4.60898390852167e-06,
|
|
"loss": 0.5269934654235839,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1285,
|
|
"token_acc": 0.8273768192895751,
|
|
"train_speed(iter/s)": 0.125978
|
|
},
|
|
{
|
|
"epoch": 1.5793462451198041,
|
|
"grad_norm": 1.0497961044311523,
|
|
"learning_rate": 4.577007176353931e-06,
|
|
"loss": 0.5188837051391602,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1290,
|
|
"token_acc": 0.8475213675213675,
|
|
"train_speed(iter/s)": 0.126095
|
|
},
|
|
{
|
|
"epoch": 1.585470412615785,
|
|
"grad_norm": 0.9117013812065125,
|
|
"learning_rate": 4.5450478601870055e-06,
|
|
"loss": 0.49097652435302735,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1295,
|
|
"token_acc": 0.83500768653248,
|
|
"train_speed(iter/s)": 0.126208
|
|
},
|
|
{
|
|
"epoch": 1.591594580111766,
|
|
"grad_norm": 0.865384042263031,
|
|
"learning_rate": 4.513107275890682e-06,
|
|
"loss": 0.5219059944152832,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1300,
|
|
"token_acc": 0.8484564711960734,
|
|
"train_speed(iter/s)": 0.126317
|
|
},
|
|
{
|
|
"epoch": 1.591594580111766,
|
|
"eval_loss": 0.6109749674797058,
|
|
"eval_runtime": 29.957,
|
|
"eval_samples_per_second": 17.592,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8270158582571414,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.5977187476077472,
|
|
"grad_norm": 0.9944786429405212,
|
|
"learning_rate": 4.4811867385634916e-06,
|
|
"loss": 0.5182311058044433,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1305,
|
|
"token_acc": 0.8336139447360602,
|
|
"train_speed(iter/s)": 0.125948
|
|
},
|
|
{
|
|
"epoch": 1.6038429151037281,
|
|
"grad_norm": 0.9752517342567444,
|
|
"learning_rate": 4.44928756247857e-06,
|
|
"loss": 0.49358739852905276,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1310,
|
|
"token_acc": 0.8449874236435502,
|
|
"train_speed(iter/s)": 0.126049
|
|
},
|
|
{
|
|
"epoch": 1.609967082599709,
|
|
"grad_norm": 0.9614261984825134,
|
|
"learning_rate": 4.417411061029539e-06,
|
|
"loss": 0.536794376373291,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1315,
|
|
"token_acc": 0.832827077457149,
|
|
"train_speed(iter/s)": 0.126177
|
|
},
|
|
{
|
|
"epoch": 1.61609125009569,
|
|
"grad_norm": 0.9478575587272644,
|
|
"learning_rate": 4.3855585466764305e-06,
|
|
"loss": 0.4996980667114258,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1320,
|
|
"token_acc": 0.8452054794520548,
|
|
"train_speed(iter/s)": 0.126305
|
|
},
|
|
{
|
|
"epoch": 1.61609125009569,
|
|
"eval_loss": 0.6103559732437134,
|
|
"eval_runtime": 30.0278,
|
|
"eval_samples_per_second": 17.55,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8273722816261171,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.6222154175916712,
|
|
"grad_norm": 1.074583649635315,
|
|
"learning_rate": 4.353731330891651e-06,
|
|
"loss": 0.529239273071289,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1325,
|
|
"token_acc": 0.8302422791282821,
|
|
"train_speed(iter/s)": 0.125951
|
|
},
|
|
{
|
|
"epoch": 1.628339585087652,
|
|
"grad_norm": 0.9707440137863159,
|
|
"learning_rate": 4.321930724105979e-06,
|
|
"loss": 0.4900198936462402,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1330,
|
|
"token_acc": 0.8524826181613877,
|
|
"train_speed(iter/s)": 0.126054
|
|
},
|
|
{
|
|
"epoch": 1.6344637525836332,
|
|
"grad_norm": 0.943321943283081,
|
|
"learning_rate": 4.290158035654618e-06,
|
|
"loss": 0.5417927265167236,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1335,
|
|
"token_acc": 0.8254652088914634,
|
|
"train_speed(iter/s)": 0.1262
|
|
},
|
|
{
|
|
"epoch": 1.6405879200796143,
|
|
"grad_norm": 1.0129594802856445,
|
|
"learning_rate": 4.258414573723277e-06,
|
|
"loss": 0.545560359954834,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1340,
|
|
"token_acc": 0.8416190929273708,
|
|
"train_speed(iter/s)": 0.126328
|
|
},
|
|
{
|
|
"epoch": 1.6405879200796143,
|
|
"eval_loss": 0.609876275062561,
|
|
"eval_runtime": 30.0571,
|
|
"eval_samples_per_second": 17.533,
|
|
"eval_steps_per_second": 4.392,
|
|
"eval_token_acc": 0.827418771630766,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.6467120875755952,
|
|
"grad_norm": 1.0571733713150024,
|
|
"learning_rate": 4.226701645294317e-06,
|
|
"loss": 0.5603596687316894,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1345,
|
|
"token_acc": 0.8282726557865548,
|
|
"train_speed(iter/s)": 0.125982
|
|
},
|
|
{
|
|
"epoch": 1.652836255071576,
|
|
"grad_norm": 1.0039043426513672,
|
|
"learning_rate": 4.195020556092935e-06,
|
|
"loss": 0.5717378616333008,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1350,
|
|
"token_acc": 0.8221712722738426,
|
|
"train_speed(iter/s)": 0.126115
|
|
},
|
|
{
|
|
"epoch": 1.6589604225675574,
|
|
"grad_norm": 1.0660555362701416,
|
|
"learning_rate": 4.1633726105334006e-06,
|
|
"loss": 0.5500486373901368,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1355,
|
|
"token_acc": 0.8320635850853417,
|
|
"train_speed(iter/s)": 0.126243
|
|
},
|
|
{
|
|
"epoch": 1.6650845900635383,
|
|
"grad_norm": 0.9174071550369263,
|
|
"learning_rate": 4.131759111665349e-06,
|
|
"loss": 0.49724588394165037,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1360,
|
|
"token_acc": 0.8494714160662582,
|
|
"train_speed(iter/s)": 0.126379
|
|
},
|
|
{
|
|
"epoch": 1.6650845900635383,
|
|
"eval_loss": 0.609088659286499,
|
|
"eval_runtime": 30.0177,
|
|
"eval_samples_per_second": 17.556,
|
|
"eval_steps_per_second": 4.397,
|
|
"eval_token_acc": 0.8274859238597035,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.6712087575595191,
|
|
"grad_norm": 1.0310657024383545,
|
|
"learning_rate": 4.100181361120136e-06,
|
|
"loss": 0.5943800926208496,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1365,
|
|
"token_acc": 0.8274678717695241,
|
|
"train_speed(iter/s)": 0.126048
|
|
},
|
|
{
|
|
"epoch": 1.6773329250555002,
|
|
"grad_norm": 0.947372317314148,
|
|
"learning_rate": 4.068640659057242e-06,
|
|
"loss": 0.5227277755737305,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1370,
|
|
"token_acc": 0.8481193255512322,
|
|
"train_speed(iter/s)": 0.126167
|
|
},
|
|
{
|
|
"epoch": 1.6834570925514813,
|
|
"grad_norm": 1.0015521049499512,
|
|
"learning_rate": 4.037138304110737e-06,
|
|
"loss": 0.5239052772521973,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1375,
|
|
"token_acc": 0.8306528880372297,
|
|
"train_speed(iter/s)": 0.126277
|
|
},
|
|
{
|
|
"epoch": 1.6895812600474622,
|
|
"grad_norm": 1.014237880706787,
|
|
"learning_rate": 4.005675593335818e-06,
|
|
"loss": 0.5036933898925782,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1380,
|
|
"token_acc": 0.8474381345177665,
|
|
"train_speed(iter/s)": 0.126378
|
|
},
|
|
{
|
|
"epoch": 1.6895812600474622,
|
|
"eval_loss": 0.6097399592399597,
|
|
"eval_runtime": 30.0485,
|
|
"eval_samples_per_second": 17.538,
|
|
"eval_steps_per_second": 4.393,
|
|
"eval_token_acc": 0.8270313549253577,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.6957054275434433,
|
|
"grad_norm": 0.970410943031311,
|
|
"learning_rate": 3.974253822155397e-06,
|
|
"loss": 0.5157362937927246,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1385,
|
|
"token_acc": 0.8356764264051473,
|
|
"train_speed(iter/s)": 0.126006
|
|
},
|
|
{
|
|
"epoch": 1.7018295950394244,
|
|
"grad_norm": 0.9698341488838196,
|
|
"learning_rate": 3.942874284306774e-06,
|
|
"loss": 0.5165740966796875,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1390,
|
|
"token_acc": 0.852775912640916,
|
|
"train_speed(iter/s)": 0.126136
|
|
},
|
|
{
|
|
"epoch": 1.7079537625354053,
|
|
"grad_norm": 0.889597475528717,
|
|
"learning_rate": 3.911538271788359e-06,
|
|
"loss": 0.5268959999084473,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1395,
|
|
"token_acc": 0.8417914492851819,
|
|
"train_speed(iter/s)": 0.126261
|
|
},
|
|
{
|
|
"epoch": 1.7140779300313862,
|
|
"grad_norm": 0.9927029609680176,
|
|
"learning_rate": 3.8802470748064855e-06,
|
|
"loss": 0.5189975738525391,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1400,
|
|
"token_acc": 0.8465264055174552,
|
|
"train_speed(iter/s)": 0.126349
|
|
},
|
|
{
|
|
"epoch": 1.7140779300313862,
|
|
"eval_loss": 0.6091334223747253,
|
|
"eval_runtime": 29.8469,
|
|
"eval_samples_per_second": 17.657,
|
|
"eval_steps_per_second": 4.423,
|
|
"eval_token_acc": 0.8271295004907279,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.7202020975273675,
|
|
"grad_norm": 0.9913120865821838,
|
|
"learning_rate": 3.849001981722285e-06,
|
|
"loss": 0.5513727188110351,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1405,
|
|
"token_acc": 0.833267143235372,
|
|
"train_speed(iter/s)": 0.126016
|
|
},
|
|
{
|
|
"epoch": 1.7263262650233484,
|
|
"grad_norm": 0.9658275246620178,
|
|
"learning_rate": 3.8178042789986355e-06,
|
|
"loss": 0.5375414371490479,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1410,
|
|
"token_acc": 0.8221340970845267,
|
|
"train_speed(iter/s)": 0.126151
|
|
},
|
|
{
|
|
"epoch": 1.7324504325193293,
|
|
"grad_norm": 0.9217929244041443,
|
|
"learning_rate": 3.786655251147204e-06,
|
|
"loss": 0.5318355560302734,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1415,
|
|
"token_acc": 0.8423896524940057,
|
|
"train_speed(iter/s)": 0.126267
|
|
},
|
|
{
|
|
"epoch": 1.7385746000153104,
|
|
"grad_norm": 1.0436443090438843,
|
|
"learning_rate": 3.755556180675547e-06,
|
|
"loss": 0.5554102897644043,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1420,
|
|
"token_acc": 0.8421204263900893,
|
|
"train_speed(iter/s)": 0.126374
|
|
},
|
|
{
|
|
"epoch": 1.7385746000153104,
|
|
"eval_loss": 0.6083164215087891,
|
|
"eval_runtime": 29.8608,
|
|
"eval_samples_per_second": 17.649,
|
|
"eval_steps_per_second": 4.421,
|
|
"eval_token_acc": 0.8271295004907279,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.7446987675112915,
|
|
"grad_norm": 1.0527832508087158,
|
|
"learning_rate": 3.7245083480343225e-06,
|
|
"loss": 0.5336908817291259,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1425,
|
|
"token_acc": 0.8262728719172633,
|
|
"train_speed(iter/s)": 0.126038
|
|
},
|
|
{
|
|
"epoch": 1.7508229350072724,
|
|
"grad_norm": 0.9292203187942505,
|
|
"learning_rate": 3.693513031564549e-06,
|
|
"loss": 0.5425585746765137,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1430,
|
|
"token_acc": 0.8410889737991266,
|
|
"train_speed(iter/s)": 0.126155
|
|
},
|
|
{
|
|
"epoch": 1.7569471025032535,
|
|
"grad_norm": 0.9655841588973999,
|
|
"learning_rate": 3.662571507444986e-06,
|
|
"loss": 0.5386072158813476,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1435,
|
|
"token_acc": 0.857958101689923,
|
|
"train_speed(iter/s)": 0.126275
|
|
},
|
|
{
|
|
"epoch": 1.7630712699992346,
|
|
"grad_norm": 0.9359703660011292,
|
|
"learning_rate": 3.6316850496395863e-06,
|
|
"loss": 0.5226363658905029,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1440,
|
|
"token_acc": 0.8469182175175004,
|
|
"train_speed(iter/s)": 0.126376
|
|
},
|
|
{
|
|
"epoch": 1.7630712699992346,
|
|
"eval_loss": 0.6083342432975769,
|
|
"eval_runtime": 29.9595,
|
|
"eval_samples_per_second": 17.59,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8270623482617904,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.7691954374952155,
|
|
"grad_norm": 1.018633484840393,
|
|
"learning_rate": 3.6008549298450403e-06,
|
|
"loss": 0.5337300300598145,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1445,
|
|
"token_acc": 0.8221121123846342,
|
|
"train_speed(iter/s)": 0.126056
|
|
},
|
|
{
|
|
"epoch": 1.7753196049911963,
|
|
"grad_norm": 0.8938316106796265,
|
|
"learning_rate": 3.5700824174384196e-06,
|
|
"loss": 0.47947111129760744,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1450,
|
|
"token_acc": 0.83465726353315,
|
|
"train_speed(iter/s)": 0.126132
|
|
},
|
|
{
|
|
"epoch": 1.7814437724871777,
|
|
"grad_norm": 0.950809895992279,
|
|
"learning_rate": 3.5393687794249093e-06,
|
|
"loss": 0.5499818325042725,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1455,
|
|
"token_acc": 0.8119980392896196,
|
|
"train_speed(iter/s)": 0.126246
|
|
},
|
|
{
|
|
"epoch": 1.7875679399831585,
|
|
"grad_norm": 0.9783928394317627,
|
|
"learning_rate": 3.508715280385644e-06,
|
|
"loss": 0.5239407062530518,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1460,
|
|
"token_acc": 0.8301248357424441,
|
|
"train_speed(iter/s)": 0.126348
|
|
},
|
|
{
|
|
"epoch": 1.7875679399831585,
|
|
"eval_loss": 0.6072365641593933,
|
|
"eval_runtime": 29.9698,
|
|
"eval_samples_per_second": 17.584,
|
|
"eval_steps_per_second": 4.404,
|
|
"eval_token_acc": 0.8274497649671987,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.7936921074791394,
|
|
"grad_norm": 1.0948944091796875,
|
|
"learning_rate": 3.478123182425639e-06,
|
|
"loss": 0.5428466320037841,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1465,
|
|
"token_acc": 0.825891086303621,
|
|
"train_speed(iter/s)": 0.12604
|
|
},
|
|
{
|
|
"epoch": 1.7998162749751205,
|
|
"grad_norm": 0.8970156311988831,
|
|
"learning_rate": 3.4475937451218257e-06,
|
|
"loss": 0.5330904960632324,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1470,
|
|
"token_acc": 0.8345248968536424,
|
|
"train_speed(iter/s)": 0.126137
|
|
},
|
|
{
|
|
"epoch": 1.8059404424711016,
|
|
"grad_norm": 1.018349528312683,
|
|
"learning_rate": 3.4171282254711935e-06,
|
|
"loss": 0.5589166641235351,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1475,
|
|
"token_acc": 0.8269609914096606,
|
|
"train_speed(iter/s)": 0.126239
|
|
},
|
|
{
|
|
"epoch": 1.8120646099670825,
|
|
"grad_norm": 0.9459341764450073,
|
|
"learning_rate": 3.386727877839027e-06,
|
|
"loss": 0.555328369140625,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1480,
|
|
"token_acc": 0.8426273550787036,
|
|
"train_speed(iter/s)": 0.126358
|
|
},
|
|
{
|
|
"epoch": 1.8120646099670825,
|
|
"eval_loss": 0.6067067980766296,
|
|
"eval_runtime": 30.0242,
|
|
"eval_samples_per_second": 17.553,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8277803605558138,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.8181887774630636,
|
|
"grad_norm": 0.9838424324989319,
|
|
"learning_rate": 3.356393953907271e-06,
|
|
"loss": 0.5277560710906982,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1485,
|
|
"token_acc": 0.831073039771941,
|
|
"train_speed(iter/s)": 0.126055
|
|
},
|
|
{
|
|
"epoch": 1.8243129449590447,
|
|
"grad_norm": 1.041955590248108,
|
|
"learning_rate": 3.3261277026229857e-06,
|
|
"loss": 0.5799334049224854,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1490,
|
|
"token_acc": 0.8321645313553607,
|
|
"train_speed(iter/s)": 0.126161
|
|
},
|
|
{
|
|
"epoch": 1.8304371124550256,
|
|
"grad_norm": 0.9292726516723633,
|
|
"learning_rate": 3.2959303701469254e-06,
|
|
"loss": 0.5210411071777343,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1495,
|
|
"token_acc": 0.8229648473635522,
|
|
"train_speed(iter/s)": 0.12628
|
|
},
|
|
{
|
|
"epoch": 1.8365612799510067,
|
|
"grad_norm": 0.864344596862793,
|
|
"learning_rate": 3.2658031998022368e-06,
|
|
"loss": 0.5165549278259277,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1500,
|
|
"token_acc": 0.8386885030686928,
|
|
"train_speed(iter/s)": 0.126396
|
|
},
|
|
{
|
|
"epoch": 1.8365612799510067,
|
|
"eval_loss": 0.6069810390472412,
|
|
"eval_runtime": 30.0285,
|
|
"eval_samples_per_second": 17.55,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8276202283175784,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.8426854474469878,
|
|
"grad_norm": 0.8527396321296692,
|
|
"learning_rate": 3.2357474320232565e-06,
|
|
"loss": 0.5021331787109375,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1505,
|
|
"token_acc": 0.8321268481969626,
|
|
"train_speed(iter/s)": 0.126074
|
|
},
|
|
{
|
|
"epoch": 1.8488096149429687,
|
|
"grad_norm": 0.9290481209754944,
|
|
"learning_rate": 3.2057643043044452e-06,
|
|
"loss": 0.5180329322814942,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1510,
|
|
"token_acc": 0.8345307220417938,
|
|
"train_speed(iter/s)": 0.12617
|
|
},
|
|
{
|
|
"epoch": 1.8549337824389496,
|
|
"grad_norm": 0.9051028490066528,
|
|
"learning_rate": 3.1758550511494336e-06,
|
|
"loss": 0.5452617645263672,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1515,
|
|
"token_acc": 0.8341819137404329,
|
|
"train_speed(iter/s)": 0.126302
|
|
},
|
|
{
|
|
"epoch": 1.8610579499349307,
|
|
"grad_norm": 1.0172206163406372,
|
|
"learning_rate": 3.1460209040201967e-06,
|
|
"loss": 0.5237324237823486,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1520,
|
|
"token_acc": 0.8367892176409603,
|
|
"train_speed(iter/s)": 0.126404
|
|
},
|
|
{
|
|
"epoch": 1.8610579499349307,
|
|
"eval_loss": 0.6056584715843201,
|
|
"eval_runtime": 29.9559,
|
|
"eval_samples_per_second": 17.593,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8284363861769719,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.8671821174309118,
|
|
"grad_norm": 0.9731259942054749,
|
|
"learning_rate": 3.116263091286344e-06,
|
|
"loss": 0.5423327445983886,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1525,
|
|
"token_acc": 0.8284452097329645,
|
|
"train_speed(iter/s)": 0.126099
|
|
},
|
|
{
|
|
"epoch": 1.8733062849268927,
|
|
"grad_norm": 0.9437146782875061,
|
|
"learning_rate": 3.0865828381745515e-06,
|
|
"loss": 0.5558583736419678,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1530,
|
|
"token_acc": 0.828622035858878,
|
|
"train_speed(iter/s)": 0.126204
|
|
},
|
|
{
|
|
"epoch": 1.8794304524228738,
|
|
"grad_norm": 1.0038166046142578,
|
|
"learning_rate": 3.056981366718111e-06,
|
|
"loss": 0.5397710800170898,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1535,
|
|
"token_acc": 0.8308211163879138,
|
|
"train_speed(iter/s)": 0.12631
|
|
},
|
|
{
|
|
"epoch": 1.8855546199188549,
|
|
"grad_norm": 0.8471850156784058,
|
|
"learning_rate": 3.0274598957066132e-06,
|
|
"loss": 0.4804985523223877,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1540,
|
|
"token_acc": 0.8619780378558012,
|
|
"train_speed(iter/s)": 0.126389
|
|
},
|
|
{
|
|
"epoch": 1.8855546199188549,
|
|
"eval_loss": 0.6056827306747437,
|
|
"eval_runtime": 29.9968,
|
|
"eval_samples_per_second": 17.569,
|
|
"eval_steps_per_second": 4.4,
|
|
"eval_token_acc": 0.8282245983780154,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.8916787874148357,
|
|
"grad_norm": 0.938709557056427,
|
|
"learning_rate": 2.998019640635772e-06,
|
|
"loss": 0.5519435405731201,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1545,
|
|
"token_acc": 0.8257935412641457,
|
|
"train_speed(iter/s)": 0.126081
|
|
},
|
|
{
|
|
"epoch": 1.8978029549108169,
|
|
"grad_norm": 0.9090867638587952,
|
|
"learning_rate": 2.96866181365737e-06,
|
|
"loss": 0.5426124572753906,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1550,
|
|
"token_acc": 0.843109962219129,
|
|
"train_speed(iter/s)": 0.126179
|
|
},
|
|
{
|
|
"epoch": 1.903927122406798,
|
|
"grad_norm": 0.8900991678237915,
|
|
"learning_rate": 2.9393876235293578e-06,
|
|
"loss": 0.510080623626709,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1555,
|
|
"token_acc": 0.8357965621123515,
|
|
"train_speed(iter/s)": 0.126271
|
|
},
|
|
{
|
|
"epoch": 1.9100512899027788,
|
|
"grad_norm": 0.8838712573051453,
|
|
"learning_rate": 2.910198275566085e-06,
|
|
"loss": 0.5103748321533204,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1560,
|
|
"token_acc": 0.8447406983809286,
|
|
"train_speed(iter/s)": 0.126379
|
|
},
|
|
{
|
|
"epoch": 1.9100512899027788,
|
|
"eval_loss": 0.6069024205207825,
|
|
"eval_runtime": 29.8963,
|
|
"eval_samples_per_second": 17.628,
|
|
"eval_steps_per_second": 4.415,
|
|
"eval_token_acc": 0.8283692339480345,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.9161754573987597,
|
|
"grad_norm": 1.0007625818252563,
|
|
"learning_rate": 2.881094971588666e-06,
|
|
"loss": 0.5161759853363037,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1565,
|
|
"token_acc": 0.8315062300454892,
|
|
"train_speed(iter/s)": 0.126082
|
|
},
|
|
{
|
|
"epoch": 1.9222996248947408,
|
|
"grad_norm": 0.9922500848770142,
|
|
"learning_rate": 2.8520789098755053e-06,
|
|
"loss": 0.5415813446044921,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1570,
|
|
"token_acc": 0.8527313346785818,
|
|
"train_speed(iter/s)": 0.126203
|
|
},
|
|
{
|
|
"epoch": 1.928423792390722,
|
|
"grad_norm": 0.9492520093917847,
|
|
"learning_rate": 2.8231512851129596e-06,
|
|
"loss": 0.5504971981048584,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1575,
|
|
"token_acc": 0.8186260917787328,
|
|
"train_speed(iter/s)": 0.126306
|
|
},
|
|
{
|
|
"epoch": 1.9345479598867028,
|
|
"grad_norm": 0.9212282299995422,
|
|
"learning_rate": 2.7943132883461434e-06,
|
|
"loss": 0.547866678237915,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1580,
|
|
"token_acc": 0.83892855733954,
|
|
"train_speed(iter/s)": 0.126413
|
|
},
|
|
{
|
|
"epoch": 1.9345479598867028,
|
|
"eval_loss": 0.6043635010719299,
|
|
"eval_runtime": 29.9792,
|
|
"eval_samples_per_second": 17.579,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.8282814194948086,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.940672127382684,
|
|
"grad_norm": 1.0419548749923706,
|
|
"learning_rate": 2.7655661069298934e-06,
|
|
"loss": 0.5519622325897217,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1585,
|
|
"token_acc": 0.8333945887874247,
|
|
"train_speed(iter/s)": 0.126139
|
|
},
|
|
{
|
|
"epoch": 1.946796294878665,
|
|
"grad_norm": 1.043246865272522,
|
|
"learning_rate": 2.736910924479881e-06,
|
|
"loss": 0.5610580921173096,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1590,
|
|
"token_acc": 0.8335689174006017,
|
|
"train_speed(iter/s)": 0.126246
|
|
},
|
|
{
|
|
"epoch": 1.952920462374646,
|
|
"grad_norm": 0.9222803711891174,
|
|
"learning_rate": 2.7083489208238784e-06,
|
|
"loss": 0.5393799304962158,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1595,
|
|
"token_acc": 0.8198356395308001,
|
|
"train_speed(iter/s)": 0.126365
|
|
},
|
|
{
|
|
"epoch": 1.959044629870627,
|
|
"grad_norm": 0.927827775478363,
|
|
"learning_rate": 2.6798812719531843e-06,
|
|
"loss": 0.5392462730407714,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1600,
|
|
"token_acc": 0.8314373587282766,
|
|
"train_speed(iter/s)": 0.126473
|
|
},
|
|
{
|
|
"epoch": 1.959044629870627,
|
|
"eval_loss": 0.6057147979736328,
|
|
"eval_runtime": 29.992,
|
|
"eval_samples_per_second": 17.571,
|
|
"eval_steps_per_second": 4.401,
|
|
"eval_token_acc": 0.8276408905418668,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.965168797366608,
|
|
"grad_norm": 1.0039829015731812,
|
|
"learning_rate": 2.6515091499741946e-06,
|
|
"loss": 0.5505844116210937,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1605,
|
|
"token_acc": 0.8264420910319964,
|
|
"train_speed(iter/s)": 0.126174
|
|
},
|
|
{
|
|
"epoch": 1.971292964862589,
|
|
"grad_norm": 0.9382634162902832,
|
|
"learning_rate": 2.623233723060157e-06,
|
|
"loss": 0.5243973731994629,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1610,
|
|
"token_acc": 0.8320176612255821,
|
|
"train_speed(iter/s)": 0.126271
|
|
},
|
|
{
|
|
"epoch": 1.9774171323585699,
|
|
"grad_norm": 0.9044788479804993,
|
|
"learning_rate": 2.595056155403063e-06,
|
|
"loss": 0.48435544967651367,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1615,
|
|
"token_acc": 0.843847529543781,
|
|
"train_speed(iter/s)": 0.126372
|
|
},
|
|
{
|
|
"epoch": 1.9835412998545512,
|
|
"grad_norm": 0.9093387722969055,
|
|
"learning_rate": 2.5669776071657194e-06,
|
|
"loss": 0.515876293182373,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1620,
|
|
"token_acc": 0.8395097436639068,
|
|
"train_speed(iter/s)": 0.126455
|
|
},
|
|
{
|
|
"epoch": 1.9835412998545512,
|
|
"eval_loss": 0.6055964231491089,
|
|
"eval_runtime": 29.9496,
|
|
"eval_samples_per_second": 17.596,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8280644661397799,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.989665467350532,
|
|
"grad_norm": 1.0032296180725098,
|
|
"learning_rate": 2.5389992344339787e-06,
|
|
"loss": 0.5630090713500977,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1625,
|
|
"token_acc": 0.8223650962996237,
|
|
"train_speed(iter/s)": 0.126171
|
|
},
|
|
{
|
|
"epoch": 1.995789634846513,
|
|
"grad_norm": 0.9951412677764893,
|
|
"learning_rate": 2.5111221891691384e-06,
|
|
"loss": 0.5040010452270508,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1630,
|
|
"token_acc": 0.8500885437951233,
|
|
"train_speed(iter/s)": 0.126256
|
|
},
|
|
{
|
|
"epoch": 2.001224833499196,
|
|
"grad_norm": 1.4779112339019775,
|
|
"learning_rate": 2.4833476191605136e-06,
|
|
"loss": 0.514947509765625,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1635,
|
|
"token_acc": 0.8120487926313169,
|
|
"train_speed(iter/s)": 0.126393
|
|
},
|
|
{
|
|
"epoch": 2.0073490009951773,
|
|
"grad_norm": 0.9851377010345459,
|
|
"learning_rate": 2.4556766679781763e-06,
|
|
"loss": 0.4878593921661377,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1640,
|
|
"token_acc": 0.8328894582476486,
|
|
"train_speed(iter/s)": 0.126502
|
|
},
|
|
{
|
|
"epoch": 2.0073490009951773,
|
|
"eval_loss": 0.6103575825691223,
|
|
"eval_runtime": 29.9214,
|
|
"eval_samples_per_second": 17.613,
|
|
"eval_steps_per_second": 4.412,
|
|
"eval_token_acc": 0.8283795650601787,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 2.013473168491158,
|
|
"grad_norm": 0.9571102857589722,
|
|
"learning_rate": 2.4281104749258716e-06,
|
|
"loss": 0.49354209899902346,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1645,
|
|
"token_acc": 0.832244552629024,
|
|
"train_speed(iter/s)": 0.126228
|
|
},
|
|
{
|
|
"epoch": 2.019597335987139,
|
|
"grad_norm": 0.9675928354263306,
|
|
"learning_rate": 2.4006501749941097e-06,
|
|
"loss": 0.47706212997436526,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1650,
|
|
"token_acc": 0.8675657501494322,
|
|
"train_speed(iter/s)": 0.126296
|
|
},
|
|
{
|
|
"epoch": 2.0257215034831204,
|
|
"grad_norm": 0.9996489882469177,
|
|
"learning_rate": 2.3732968988134343e-06,
|
|
"loss": 0.4821828842163086,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1655,
|
|
"token_acc": 0.8707286339040842,
|
|
"train_speed(iter/s)": 0.126397
|
|
},
|
|
{
|
|
"epoch": 2.0318456709791013,
|
|
"grad_norm": 1.0168769359588623,
|
|
"learning_rate": 2.3460517726078696e-06,
|
|
"loss": 0.47524452209472656,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1660,
|
|
"token_acc": 0.855996970531534,
|
|
"train_speed(iter/s)": 0.126479
|
|
},
|
|
{
|
|
"epoch": 2.0318456709791013,
|
|
"eval_loss": 0.6196611523628235,
|
|
"eval_runtime": 29.918,
|
|
"eval_samples_per_second": 17.615,
|
|
"eval_steps_per_second": 4.412,
|
|
"eval_token_acc": 0.8268350637946175,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 2.037969838475082,
|
|
"grad_norm": 0.9725021123886108,
|
|
"learning_rate": 2.3189159181485517e-06,
|
|
"loss": 0.4909340858459473,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1665,
|
|
"token_acc": 0.8357392077717726,
|
|
"train_speed(iter/s)": 0.126201
|
|
},
|
|
{
|
|
"epoch": 2.0440940059710635,
|
|
"grad_norm": 0.9119012355804443,
|
|
"learning_rate": 2.291890452707539e-06,
|
|
"loss": 0.4890812873840332,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1670,
|
|
"token_acc": 0.8586991348926626,
|
|
"train_speed(iter/s)": 0.126291
|
|
},
|
|
{
|
|
"epoch": 2.0502181734670444,
|
|
"grad_norm": 1.0054688453674316,
|
|
"learning_rate": 2.2649764890118158e-06,
|
|
"loss": 0.49579925537109376,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1675,
|
|
"token_acc": 0.8483596157331883,
|
|
"train_speed(iter/s)": 0.126396
|
|
},
|
|
{
|
|
"epoch": 2.0563423409630253,
|
|
"grad_norm": 0.9014572501182556,
|
|
"learning_rate": 2.238175135197471e-06,
|
|
"loss": 0.47943267822265623,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1680,
|
|
"token_acc": 0.8587532153124527,
|
|
"train_speed(iter/s)": 0.126492
|
|
},
|
|
{
|
|
"epoch": 2.0563423409630253,
|
|
"eval_loss": 0.6154988408088684,
|
|
"eval_runtime": 29.9636,
|
|
"eval_samples_per_second": 17.588,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8270726793739346,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 2.062466508459006,
|
|
"grad_norm": 0.938705563545227,
|
|
"learning_rate": 2.2114874947640763e-06,
|
|
"loss": 0.45625782012939453,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1685,
|
|
"token_acc": 0.8355256733948025,
|
|
"train_speed(iter/s)": 0.126208
|
|
},
|
|
{
|
|
"epoch": 2.0685906759549875,
|
|
"grad_norm": 0.8989147543907166,
|
|
"learning_rate": 2.1849146665292513e-06,
|
|
"loss": 0.46575441360473635,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1690,
|
|
"token_acc": 0.8795854481354284,
|
|
"train_speed(iter/s)": 0.126306
|
|
},
|
|
{
|
|
"epoch": 2.0747148434509683,
|
|
"grad_norm": 0.9596337080001831,
|
|
"learning_rate": 2.1584577445834234e-06,
|
|
"loss": 0.48124160766601565,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1695,
|
|
"token_acc": 0.8413667107206717,
|
|
"train_speed(iter/s)": 0.126388
|
|
},
|
|
{
|
|
"epoch": 2.0808390109469492,
|
|
"grad_norm": 0.788873016834259,
|
|
"learning_rate": 2.132117818244771e-06,
|
|
"loss": 0.46569390296936036,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1700,
|
|
"token_acc": 0.8618104667609618,
|
|
"train_speed(iter/s)": 0.12649
|
|
},
|
|
{
|
|
"epoch": 2.0808390109469492,
|
|
"eval_loss": 0.6172361373901367,
|
|
"eval_runtime": 29.9107,
|
|
"eval_samples_per_second": 17.619,
|
|
"eval_steps_per_second": 4.413,
|
|
"eval_token_acc": 0.8270313549253577,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 2.0869631784429306,
|
|
"grad_norm": 1.0761973857879639,
|
|
"learning_rate": 2.1058959720143875e-06,
|
|
"loss": 0.4640150547027588,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1705,
|
|
"token_acc": 0.8384597955079729,
|
|
"train_speed(iter/s)": 0.126222
|
|
},
|
|
{
|
|
"epoch": 2.0930873459389114,
|
|
"grad_norm": 0.9404869675636292,
|
|
"learning_rate": 2.0797932855316183e-06,
|
|
"loss": 0.48186473846435546,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1710,
|
|
"token_acc": 0.8572665858305907,
|
|
"train_speed(iter/s)": 0.126305
|
|
},
|
|
{
|
|
"epoch": 2.0992115134348923,
|
|
"grad_norm": 0.979210376739502,
|
|
"learning_rate": 2.0538108335296107e-06,
|
|
"loss": 0.4823300361633301,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1715,
|
|
"token_acc": 0.8577344523032946,
|
|
"train_speed(iter/s)": 0.1264
|
|
},
|
|
{
|
|
"epoch": 2.1053356809308736,
|
|
"grad_norm": 0.9393882751464844,
|
|
"learning_rate": 2.0279496857910667e-06,
|
|
"loss": 0.48357486724853516,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1720,
|
|
"token_acc": 0.8569940863614386,
|
|
"train_speed(iter/s)": 0.126508
|
|
},
|
|
{
|
|
"epoch": 2.1053356809308736,
|
|
"eval_loss": 0.6190218329429626,
|
|
"eval_runtime": 30.0103,
|
|
"eval_samples_per_second": 17.561,
|
|
"eval_steps_per_second": 4.398,
|
|
"eval_token_acc": 0.826953871584276,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 2.1114598484268545,
|
|
"grad_norm": 0.9903694987297058,
|
|
"learning_rate": 2.0022109071041905e-06,
|
|
"loss": 0.485797643661499,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1725,
|
|
"token_acc": 0.8363958585952803,
|
|
"train_speed(iter/s)": 0.126264
|
|
},
|
|
{
|
|
"epoch": 2.1175840159228354,
|
|
"grad_norm": 0.8971183896064758,
|
|
"learning_rate": 1.9765955572188578e-06,
|
|
"loss": 0.468338680267334,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1730,
|
|
"token_acc": 0.850854499843211,
|
|
"train_speed(iter/s)": 0.126332
|
|
},
|
|
{
|
|
"epoch": 2.1237081834188167,
|
|
"grad_norm": 0.8892176151275635,
|
|
"learning_rate": 1.951104690802969e-06,
|
|
"loss": 0.45011487007141116,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1735,
|
|
"token_acc": 0.8456293706293706,
|
|
"train_speed(iter/s)": 0.126424
|
|
},
|
|
{
|
|
"epoch": 2.1298323509147976,
|
|
"grad_norm": 0.9592292904853821,
|
|
"learning_rate": 1.925739357399038e-06,
|
|
"loss": 0.45401706695556643,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1740,
|
|
"token_acc": 0.8367899677215894,
|
|
"train_speed(iter/s)": 0.126512
|
|
},
|
|
{
|
|
"epoch": 2.1298323509147976,
|
|
"eval_loss": 0.6162592768669128,
|
|
"eval_runtime": 29.8564,
|
|
"eval_samples_per_second": 17.651,
|
|
"eval_steps_per_second": 4.421,
|
|
"eval_token_acc": 0.8268247326824733,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 2.1359565184107785,
|
|
"grad_norm": 0.9750301241874695,
|
|
"learning_rate": 1.9005006013809662e-06,
|
|
"loss": 0.5132875442504883,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1745,
|
|
"token_acc": 0.8310148067894547,
|
|
"train_speed(iter/s)": 0.126241
|
|
},
|
|
{
|
|
"epoch": 2.1420806859067594,
|
|
"grad_norm": 1.0531848669052124,
|
|
"learning_rate": 1.8753894619110547e-06,
|
|
"loss": 0.4934427261352539,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1750,
|
|
"token_acc": 0.8594569186824312,
|
|
"train_speed(iter/s)": 0.126339
|
|
},
|
|
{
|
|
"epoch": 2.1482048534027407,
|
|
"grad_norm": 0.9899281859397888,
|
|
"learning_rate": 1.8504069728972124e-06,
|
|
"loss": 0.5067736625671386,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1755,
|
|
"token_acc": 0.8443969645619981,
|
|
"train_speed(iter/s)": 0.126433
|
|
},
|
|
{
|
|
"epoch": 2.1543290208987216,
|
|
"grad_norm": 0.9110437035560608,
|
|
"learning_rate": 1.8255541629503865e-06,
|
|
"loss": 0.43926572799682617,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1760,
|
|
"token_acc": 0.8647777628575265,
|
|
"train_speed(iter/s)": 0.126537
|
|
},
|
|
{
|
|
"epoch": 2.1543290208987216,
|
|
"eval_loss": 0.6184687614440918,
|
|
"eval_runtime": 29.9388,
|
|
"eval_samples_per_second": 17.603,
|
|
"eval_steps_per_second": 4.409,
|
|
"eval_token_acc": 0.8269796993646366,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 2.1604531883947025,
|
|
"grad_norm": 0.956470251083374,
|
|
"learning_rate": 1.8008320553422116e-06,
|
|
"loss": 0.48296318054199217,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1765,
|
|
"token_acc": 0.8321491877005113,
|
|
"train_speed(iter/s)": 0.126272
|
|
},
|
|
{
|
|
"epoch": 2.166577355890684,
|
|
"grad_norm": 0.990215003490448,
|
|
"learning_rate": 1.7762416679628792e-06,
|
|
"loss": 0.4733391761779785,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1770,
|
|
"token_acc": 0.8625864925445863,
|
|
"train_speed(iter/s)": 0.126373
|
|
},
|
|
{
|
|
"epoch": 2.1727015233866647,
|
|
"grad_norm": 0.9296258687973022,
|
|
"learning_rate": 1.751784013279228e-06,
|
|
"loss": 0.4612305164337158,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1775,
|
|
"token_acc": 0.8483617060223321,
|
|
"train_speed(iter/s)": 0.126475
|
|
},
|
|
{
|
|
"epoch": 2.1788256908826455,
|
|
"grad_norm": 0.9242532253265381,
|
|
"learning_rate": 1.7274600982930544e-06,
|
|
"loss": 0.4506662368774414,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1780,
|
|
"token_acc": 0.8471429097741591,
|
|
"train_speed(iter/s)": 0.126543
|
|
},
|
|
{
|
|
"epoch": 2.1788256908826455,
|
|
"eval_loss": 0.6181926131248474,
|
|
"eval_runtime": 29.9341,
|
|
"eval_samples_per_second": 17.605,
|
|
"eval_steps_per_second": 4.41,
|
|
"eval_token_acc": 0.8270055271449972,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 2.1849498583786264,
|
|
"grad_norm": 0.9514071345329285,
|
|
"learning_rate": 1.7032709244996559e-06,
|
|
"loss": 0.45079612731933594,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1785,
|
|
"token_acc": 0.8306619810862547,
|
|
"train_speed(iter/s)": 0.126266
|
|
},
|
|
{
|
|
"epoch": 2.1910740258746078,
|
|
"grad_norm": 0.8734022974967957,
|
|
"learning_rate": 1.6792174878465933e-06,
|
|
"loss": 0.4914576530456543,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1790,
|
|
"token_acc": 0.8544291529366156,
|
|
"train_speed(iter/s)": 0.126359
|
|
},
|
|
{
|
|
"epoch": 2.1971981933705886,
|
|
"grad_norm": 0.9682619571685791,
|
|
"learning_rate": 1.65530077869268e-06,
|
|
"loss": 0.46589956283569334,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1795,
|
|
"token_acc": 0.856334134219794,
|
|
"train_speed(iter/s)": 0.126455
|
|
},
|
|
{
|
|
"epoch": 2.2033223608665695,
|
|
"grad_norm": 0.9326309561729431,
|
|
"learning_rate": 1.6315217817672142e-06,
|
|
"loss": 0.4956002712249756,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1800,
|
|
"token_acc": 0.851890756302521,
|
|
"train_speed(iter/s)": 0.126552
|
|
},
|
|
{
|
|
"epoch": 2.2033223608665695,
|
|
"eval_loss": 0.6174434423446655,
|
|
"eval_runtime": 29.957,
|
|
"eval_samples_per_second": 17.592,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8268092360142569,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 2.209446528362551,
|
|
"grad_norm": 1.0249996185302734,
|
|
"learning_rate": 1.607881476129432e-06,
|
|
"loss": 0.480439281463623,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1805,
|
|
"token_acc": 0.8305227415396907,
|
|
"train_speed(iter/s)": 0.126269
|
|
},
|
|
{
|
|
"epoch": 2.2155706958585317,
|
|
"grad_norm": 0.9621463418006897,
|
|
"learning_rate": 1.5843808351281913e-06,
|
|
"loss": 0.4549149513244629,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1810,
|
|
"token_acc": 0.8661414578031861,
|
|
"train_speed(iter/s)": 0.126344
|
|
},
|
|
{
|
|
"epoch": 2.2216948633545126,
|
|
"grad_norm": 0.977536141872406,
|
|
"learning_rate": 1.5610208263619002e-06,
|
|
"loss": 0.48578948974609376,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1815,
|
|
"token_acc": 0.8355557119234422,
|
|
"train_speed(iter/s)": 0.12644
|
|
},
|
|
{
|
|
"epoch": 2.227819030850494,
|
|
"grad_norm": 0.9830949902534485,
|
|
"learning_rate": 1.537802411638677e-06,
|
|
"loss": 0.4825616359710693,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1820,
|
|
"token_acc": 0.8571990136868813,
|
|
"train_speed(iter/s)": 0.126532
|
|
},
|
|
{
|
|
"epoch": 2.227819030850494,
|
|
"eval_loss": 0.6185752749443054,
|
|
"eval_runtime": 29.9623,
|
|
"eval_samples_per_second": 17.589,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8268195671264011,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 2.233943198346475,
|
|
"grad_norm": 0.8971763253211975,
|
|
"learning_rate": 1.514726546936749e-06,
|
|
"loss": 0.4621254920959473,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1825,
|
|
"token_acc": 0.8387506208266652,
|
|
"train_speed(iter/s)": 0.126271
|
|
},
|
|
{
|
|
"epoch": 2.2400673658424557,
|
|
"grad_norm": 0.8897117972373962,
|
|
"learning_rate": 1.4917941823650917e-06,
|
|
"loss": 0.4865126609802246,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1830,
|
|
"token_acc": 0.8466782763348031,
|
|
"train_speed(iter/s)": 0.126353
|
|
},
|
|
{
|
|
"epoch": 2.246191533338437,
|
|
"grad_norm": 0.9670534133911133,
|
|
"learning_rate": 1.4690062621243117e-06,
|
|
"loss": 0.4749399185180664,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1835,
|
|
"token_acc": 0.8677187834569174,
|
|
"train_speed(iter/s)": 0.126441
|
|
},
|
|
{
|
|
"epoch": 2.252315700834418,
|
|
"grad_norm": 0.8603255152702332,
|
|
"learning_rate": 1.4463637244677648e-06,
|
|
"loss": 0.46147994995117186,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1840,
|
|
"token_acc": 0.8514672004229767,
|
|
"train_speed(iter/s)": 0.126525
|
|
},
|
|
{
|
|
"epoch": 2.252315700834418,
|
|
"eval_loss": 0.618859052658081,
|
|
"eval_runtime": 29.975,
|
|
"eval_samples_per_second": 17.581,
|
|
"eval_steps_per_second": 4.404,
|
|
"eval_token_acc": 0.8268815537992665,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 2.258439868330399,
|
|
"grad_norm": 0.9529172778129578,
|
|
"learning_rate": 1.423867501662934e-06,
|
|
"loss": 0.4659478187561035,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1845,
|
|
"token_acc": 0.8376929823340248,
|
|
"train_speed(iter/s)": 0.126266
|
|
},
|
|
{
|
|
"epoch": 2.2645640358263797,
|
|
"grad_norm": 0.9903680086135864,
|
|
"learning_rate": 1.4015185199530378e-06,
|
|
"loss": 0.4695383071899414,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1850,
|
|
"token_acc": 0.8556800687408256,
|
|
"train_speed(iter/s)": 0.126353
|
|
},
|
|
{
|
|
"epoch": 2.270688203322361,
|
|
"grad_norm": 0.9149890542030334,
|
|
"learning_rate": 1.379317699518898e-06,
|
|
"loss": 0.47596092224121095,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1855,
|
|
"token_acc": 0.8519627185522824,
|
|
"train_speed(iter/s)": 0.126419
|
|
},
|
|
{
|
|
"epoch": 2.276812370818342,
|
|
"grad_norm": 0.8692817091941833,
|
|
"learning_rate": 1.3572659544410493e-06,
|
|
"loss": 0.43576741218566895,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1860,
|
|
"token_acc": 0.8630936883995983,
|
|
"train_speed(iter/s)": 0.126518
|
|
},
|
|
{
|
|
"epoch": 2.276812370818342,
|
|
"eval_loss": 0.6181974411010742,
|
|
"eval_runtime": 30.0408,
|
|
"eval_samples_per_second": 17.543,
|
|
"eval_steps_per_second": 4.394,
|
|
"eval_token_acc": 0.8271088382664393,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 2.2829365383143227,
|
|
"grad_norm": 0.9488633871078491,
|
|
"learning_rate": 1.3353641926621065e-06,
|
|
"loss": 0.45254907608032224,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1865,
|
|
"token_acc": 0.8336496980155307,
|
|
"train_speed(iter/s)": 0.126266
|
|
},
|
|
{
|
|
"epoch": 2.289060705810304,
|
|
"grad_norm": 1.0025756359100342,
|
|
"learning_rate": 1.3136133159493803e-06,
|
|
"loss": 0.4933184623718262,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1870,
|
|
"token_acc": 0.8573630940411556,
|
|
"train_speed(iter/s)": 0.126361
|
|
},
|
|
{
|
|
"epoch": 2.295184873306285,
|
|
"grad_norm": 0.8357995748519897,
|
|
"learning_rate": 1.2920142198577484e-06,
|
|
"loss": 0.45499467849731445,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1875,
|
|
"token_acc": 0.8629192723138147,
|
|
"train_speed(iter/s)": 0.126432
|
|
},
|
|
{
|
|
"epoch": 2.301309040802266,
|
|
"grad_norm": 0.9138444066047668,
|
|
"learning_rate": 1.2705677936927841e-06,
|
|
"loss": 0.4767561435699463,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1880,
|
|
"token_acc": 0.8521506375701698,
|
|
"train_speed(iter/s)": 0.126523
|
|
},
|
|
{
|
|
"epoch": 2.301309040802266,
|
|
"eval_loss": 0.6184601187705994,
|
|
"eval_runtime": 30.0434,
|
|
"eval_samples_per_second": 17.541,
|
|
"eval_steps_per_second": 4.394,
|
|
"eval_token_acc": 0.8273567849579008,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 2.3074332082982467,
|
|
"grad_norm": 0.9720640182495117,
|
|
"learning_rate": 1.2492749204741368e-06,
|
|
"loss": 0.4715888500213623,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1885,
|
|
"token_acc": 0.8328760826785792,
|
|
"train_speed(iter/s)": 0.126264
|
|
},
|
|
{
|
|
"epoch": 2.313557375794228,
|
|
"grad_norm": 1.062354564666748,
|
|
"learning_rate": 1.2281364768991804e-06,
|
|
"loss": 0.4756108283996582,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1890,
|
|
"token_acc": 0.8549824466648663,
|
|
"train_speed(iter/s)": 0.126366
|
|
},
|
|
{
|
|
"epoch": 2.319681543290209,
|
|
"grad_norm": 1.040152907371521,
|
|
"learning_rate": 1.207153333306914e-06,
|
|
"loss": 0.457261848449707,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1895,
|
|
"token_acc": 0.858182628393182,
|
|
"train_speed(iter/s)": 0.126458
|
|
},
|
|
{
|
|
"epoch": 2.32580571078619,
|
|
"grad_norm": 0.9648529887199402,
|
|
"learning_rate": 1.1863263536421261e-06,
|
|
"loss": 0.49726166725158694,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1900,
|
|
"token_acc": 0.8323407202216067,
|
|
"train_speed(iter/s)": 0.126559
|
|
},
|
|
{
|
|
"epoch": 2.32580571078619,
|
|
"eval_loss": 0.6168169975280762,
|
|
"eval_runtime": 30.03,
|
|
"eval_samples_per_second": 17.549,
|
|
"eval_steps_per_second": 4.396,
|
|
"eval_token_acc": 0.8270106927010693,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 2.331929878282171,
|
|
"grad_norm": 0.9079554677009583,
|
|
"learning_rate": 1.1656563954198258e-06,
|
|
"loss": 0.5002402305603028,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1905,
|
|
"token_acc": 0.8321498929943166,
|
|
"train_speed(iter/s)": 0.126312
|
|
},
|
|
{
|
|
"epoch": 2.338054045778152,
|
|
"grad_norm": 1.007360816001892,
|
|
"learning_rate": 1.145144309689934e-06,
|
|
"loss": 0.4659921646118164,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1910,
|
|
"token_acc": 0.8422997172478793,
|
|
"train_speed(iter/s)": 0.126382
|
|
},
|
|
{
|
|
"epoch": 2.344178213274133,
|
|
"grad_norm": 1.0213356018066406,
|
|
"learning_rate": 1.1247909410022434e-06,
|
|
"loss": 0.46290979385375974,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1915,
|
|
"token_acc": 0.8491659285503396,
|
|
"train_speed(iter/s)": 0.12648
|
|
},
|
|
{
|
|
"epoch": 2.350302380770114,
|
|
"grad_norm": 1.1062732934951782,
|
|
"learning_rate": 1.1045971273716476e-06,
|
|
"loss": 0.4558609962463379,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1920,
|
|
"token_acc": 0.8681887684181262,
|
|
"train_speed(iter/s)": 0.126559
|
|
},
|
|
{
|
|
"epoch": 2.350302380770114,
|
|
"eval_loss": 0.6178110837936401,
|
|
"eval_runtime": 29.9953,
|
|
"eval_samples_per_second": 17.569,
|
|
"eval_steps_per_second": 4.401,
|
|
"eval_token_acc": 0.8270985071542951,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 2.356426548266095,
|
|
"grad_norm": 0.9201487302780151,
|
|
"learning_rate": 1.0845637002436344e-06,
|
|
"loss": 0.46529560089111327,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1925,
|
|
"token_acc": 0.8382993992876508,
|
|
"train_speed(iter/s)": 0.126335
|
|
},
|
|
{
|
|
"epoch": 2.362550715762076,
|
|
"grad_norm": 2.0007822513580322,
|
|
"learning_rate": 1.0646914844600543e-06,
|
|
"loss": 0.46782960891723635,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1930,
|
|
"token_acc": 0.8615680194148577,
|
|
"train_speed(iter/s)": 0.126411
|
|
},
|
|
{
|
|
"epoch": 2.3686748832580573,
|
|
"grad_norm": 1.0234315395355225,
|
|
"learning_rate": 1.0449812982251556e-06,
|
|
"loss": 0.4937599658966064,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1935,
|
|
"token_acc": 0.8580788129877638,
|
|
"train_speed(iter/s)": 0.126511
|
|
},
|
|
{
|
|
"epoch": 2.374799050754038,
|
|
"grad_norm": 0.964102566242218,
|
|
"learning_rate": 1.0254339530719031e-06,
|
|
"loss": 0.49028477668762205,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1940,
|
|
"token_acc": 0.8427895540736877,
|
|
"train_speed(iter/s)": 0.126597
|
|
},
|
|
{
|
|
"epoch": 2.374799050754038,
|
|
"eval_loss": 0.618255078792572,
|
|
"eval_runtime": 29.9499,
|
|
"eval_samples_per_second": 17.596,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8271708249393047,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 2.380923218250019,
|
|
"grad_norm": 0.9083016514778137,
|
|
"learning_rate": 1.0060502538285582e-06,
|
|
"loss": 0.47533645629882815,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1945,
|
|
"token_acc": 0.8339908186042594,
|
|
"train_speed(iter/s)": 0.126344
|
|
},
|
|
{
|
|
"epoch": 2.3870473857460004,
|
|
"grad_norm": 0.9915279746055603,
|
|
"learning_rate": 9.868309985855446e-07,
|
|
"loss": 0.4681232452392578,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1950,
|
|
"token_acc": 0.8487209179913675,
|
|
"train_speed(iter/s)": 0.126421
|
|
},
|
|
{
|
|
"epoch": 2.3931715532419813,
|
|
"grad_norm": 0.9561747312545776,
|
|
"learning_rate": 9.677769786625869e-07,
|
|
"loss": 0.48569955825805666,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1955,
|
|
"token_acc": 0.8486834496318285,
|
|
"train_speed(iter/s)": 0.126494
|
|
},
|
|
{
|
|
"epoch": 2.399295720737962,
|
|
"grad_norm": 0.9286803603172302,
|
|
"learning_rate": 9.488889785761324e-07,
|
|
"loss": 0.44054179191589354,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1960,
|
|
"token_acc": 0.8660492977141283,
|
|
"train_speed(iter/s)": 0.126575
|
|
},
|
|
{
|
|
"epoch": 2.399295720737962,
|
|
"eval_loss": 0.6190705895423889,
|
|
"eval_runtime": 29.9923,
|
|
"eval_samples_per_second": 17.571,
|
|
"eval_steps_per_second": 4.401,
|
|
"eval_token_acc": 0.8266181104395888,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 2.405419888233943,
|
|
"grad_norm": 1.0242820978164673,
|
|
"learning_rate": 9.301677760070449e-07,
|
|
"loss": 0.4897134304046631,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1965,
|
|
"token_acc": 0.8352815571190013,
|
|
"train_speed(iter/s)": 0.126327
|
|
},
|
|
{
|
|
"epoch": 2.4115440557299244,
|
|
"grad_norm": 0.939855694770813,
|
|
"learning_rate": 9.116141417685898e-07,
|
|
"loss": 0.45674614906311034,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1970,
|
|
"token_acc": 0.8488702986251586,
|
|
"train_speed(iter/s)": 0.126411
|
|
},
|
|
{
|
|
"epoch": 2.4176682232259052,
|
|
"grad_norm": 0.9036867022514343,
|
|
"learning_rate": 8.932288397746919e-07,
|
|
"loss": 0.4510343074798584,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1975,
|
|
"token_acc": 0.8560777957860616,
|
|
"train_speed(iter/s)": 0.126495
|
|
},
|
|
{
|
|
"epoch": 2.423792390721886,
|
|
"grad_norm": 0.9866623878479004,
|
|
"learning_rate": 8.750126270084891e-07,
|
|
"loss": 0.4746750831604004,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1980,
|
|
"token_acc": 0.8668764857535072,
|
|
"train_speed(iter/s)": 0.126576
|
|
},
|
|
{
|
|
"epoch": 2.423792390721886,
|
|
"eval_loss": 0.6186906099319458,
|
|
"eval_runtime": 29.9649,
|
|
"eval_samples_per_second": 17.587,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8270881760421509,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 2.4299165582178675,
|
|
"grad_norm": 0.9331910610198975,
|
|
"learning_rate": 8.569662534911605e-07,
|
|
"loss": 0.4652440071105957,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1985,
|
|
"token_acc": 0.8312124178436582,
|
|
"train_speed(iter/s)": 0.126348
|
|
},
|
|
{
|
|
"epoch": 2.4360407257138483,
|
|
"grad_norm": 0.8783546686172485,
|
|
"learning_rate": 8.390904622510471e-07,
|
|
"loss": 0.43751039505004885,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1990,
|
|
"token_acc": 0.8822175660357916,
|
|
"train_speed(iter/s)": 0.126428
|
|
},
|
|
{
|
|
"epoch": 2.442164893209829,
|
|
"grad_norm": 0.9757615923881531,
|
|
"learning_rate": 8.213859892930581e-07,
|
|
"loss": 0.4832446098327637,
|
|
"memory(GiB)": 36.87,
|
|
"step": 1995,
|
|
"token_acc": 0.8358402898094188,
|
|
"train_speed(iter/s)": 0.126498
|
|
},
|
|
{
|
|
"epoch": 2.44828906070581,
|
|
"grad_norm": 0.9304143190383911,
|
|
"learning_rate": 8.03853563568367e-07,
|
|
"loss": 0.46181411743164064,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2000,
|
|
"token_acc": 0.8611440231700895,
|
|
"train_speed(iter/s)": 0.126561
|
|
},
|
|
{
|
|
"epoch": 2.44828906070581,
|
|
"eval_loss": 0.6176819205284119,
|
|
"eval_runtime": 29.9501,
|
|
"eval_samples_per_second": 17.596,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8269900304767809,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 2.4544132282017914,
|
|
"grad_norm": 0.9659498333930969,
|
|
"learning_rate": 7.864939069444006e-07,
|
|
"loss": 0.47965211868286134,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2005,
|
|
"token_acc": 0.831640513163097,
|
|
"train_speed(iter/s)": 0.126337
|
|
},
|
|
{
|
|
"epoch": 2.4605373956977723,
|
|
"grad_norm": 1.0172474384307861,
|
|
"learning_rate": 7.693077341751138e-07,
|
|
"loss": 0.4738880157470703,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2010,
|
|
"token_acc": 0.8708014805078004,
|
|
"train_speed(iter/s)": 0.126397
|
|
},
|
|
{
|
|
"epoch": 2.466661563193753,
|
|
"grad_norm": 0.9642378091812134,
|
|
"learning_rate": 7.522957528715636e-07,
|
|
"loss": 0.4847827911376953,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2015,
|
|
"token_acc": 0.8358588871654138,
|
|
"train_speed(iter/s)": 0.126477
|
|
},
|
|
{
|
|
"epoch": 2.4727857306897345,
|
|
"grad_norm": 1.015015721321106,
|
|
"learning_rate": 7.354586634727729e-07,
|
|
"loss": 0.48462276458740233,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2020,
|
|
"token_acc": 0.8585737976782752,
|
|
"train_speed(iter/s)": 0.126571
|
|
},
|
|
{
|
|
"epoch": 2.4727857306897345,
|
|
"eval_loss": 0.6191264986991882,
|
|
"eval_runtime": 29.948,
|
|
"eval_samples_per_second": 17.597,
|
|
"eval_steps_per_second": 4.408,
|
|
"eval_token_acc": 0.8270416860375019,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 2.4789098981857154,
|
|
"grad_norm": 0.9471215009689331,
|
|
"learning_rate": 7.187971592168936e-07,
|
|
"loss": 0.4690380096435547,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2025,
|
|
"token_acc": 0.83595499208511,
|
|
"train_speed(iter/s)": 0.126356
|
|
},
|
|
{
|
|
"epoch": 2.4850340656816963,
|
|
"grad_norm": 0.9936275482177734,
|
|
"learning_rate": 7.023119261126571e-07,
|
|
"loss": 0.4644585609436035,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2030,
|
|
"token_acc": 0.8678402074165046,
|
|
"train_speed(iter/s)": 0.126441
|
|
},
|
|
{
|
|
"epoch": 2.4911582331776776,
|
|
"grad_norm": 0.9575950503349304,
|
|
"learning_rate": 6.860036429111394e-07,
|
|
"loss": 0.4721442699432373,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2035,
|
|
"token_acc": 0.8546081813701331,
|
|
"train_speed(iter/s)": 0.126503
|
|
},
|
|
{
|
|
"epoch": 2.4972824006736585,
|
|
"grad_norm": 1.002123475074768,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.4855657577514648,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2040,
|
|
"token_acc": 0.8467344696835466,
|
|
"train_speed(iter/s)": 0.126594
|
|
},
|
|
{
|
|
"epoch": 2.4972824006736585,
|
|
"eval_loss": 0.6174936294555664,
|
|
"eval_runtime": 29.9826,
|
|
"eval_samples_per_second": 17.577,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.8267524148974638,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 2.5034065681696394,
|
|
"grad_norm": 0.9535955786705017,
|
|
"learning_rate": 6.539206047648705e-07,
|
|
"loss": 0.46763386726379397,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2045,
|
|
"token_acc": 0.8324564664169503,
|
|
"train_speed(iter/s)": 0.12638
|
|
},
|
|
{
|
|
"epoch": 2.5095307356656207,
|
|
"grad_norm": 0.9121899008750916,
|
|
"learning_rate": 6.381471707839449e-07,
|
|
"loss": 0.44632792472839355,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2050,
|
|
"token_acc": 0.8615195671656654,
|
|
"train_speed(iter/s)": 0.126463
|
|
},
|
|
{
|
|
"epoch": 2.5156549031616016,
|
|
"grad_norm": 0.9412456750869751,
|
|
"learning_rate": 6.225533285789997e-07,
|
|
"loss": 0.46562681198120115,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2055,
|
|
"token_acc": 0.8705161854768154,
|
|
"train_speed(iter/s)": 0.126535
|
|
},
|
|
{
|
|
"epoch": 2.5217790706575824,
|
|
"grad_norm": 0.9882362484931946,
|
|
"learning_rate": 6.071397201996243e-07,
|
|
"loss": 0.47277240753173827,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2060,
|
|
"token_acc": 0.8616963064295485,
|
|
"train_speed(iter/s)": 0.126619
|
|
},
|
|
{
|
|
"epoch": 2.5217790706575824,
|
|
"eval_loss": 0.6171227097511292,
|
|
"eval_runtime": 29.9995,
|
|
"eval_samples_per_second": 17.567,
|
|
"eval_steps_per_second": 4.4,
|
|
"eval_token_acc": 0.8271191693785837,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 2.5279032381535638,
|
|
"grad_norm": 1.003340482711792,
|
|
"learning_rate": 5.919069802745914e-07,
|
|
"loss": 0.4641777515411377,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2065,
|
|
"token_acc": 0.8311811067402455,
|
|
"train_speed(iter/s)": 0.126395
|
|
},
|
|
{
|
|
"epoch": 2.5340274056495447,
|
|
"grad_norm": 0.9802326560020447,
|
|
"learning_rate": 5.768557359857241e-07,
|
|
"loss": 0.4592477321624756,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2070,
|
|
"token_acc": 0.8501814594270815,
|
|
"train_speed(iter/s)": 0.12647
|
|
},
|
|
{
|
|
"epoch": 2.5401515731455255,
|
|
"grad_norm": 0.9962190985679626,
|
|
"learning_rate": 5.619866070420766e-07,
|
|
"loss": 0.4591672897338867,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2075,
|
|
"token_acc": 0.8545560747663551,
|
|
"train_speed(iter/s)": 0.126542
|
|
},
|
|
{
|
|
"epoch": 2.5462757406415064,
|
|
"grad_norm": 0.8959765434265137,
|
|
"learning_rate": 5.473002056544191e-07,
|
|
"loss": 0.43817138671875,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2080,
|
|
"token_acc": 0.8604302151075538,
|
|
"train_speed(iter/s)": 0.12662
|
|
},
|
|
{
|
|
"epoch": 2.5462757406415064,
|
|
"eval_loss": 0.6181796789169312,
|
|
"eval_runtime": 29.9775,
|
|
"eval_samples_per_second": 17.58,
|
|
"eval_steps_per_second": 4.403,
|
|
"eval_token_acc": 0.8274600960793429,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 2.5523999081374873,
|
|
"grad_norm": 0.9490206837654114,
|
|
"learning_rate": 5.327971365100276e-07,
|
|
"loss": 0.4962893486022949,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2085,
|
|
"token_acc": 0.8353908876332166,
|
|
"train_speed(iter/s)": 0.126398
|
|
},
|
|
{
|
|
"epoch": 2.5585240756334686,
|
|
"grad_norm": 0.9097649455070496,
|
|
"learning_rate": 5.184779967477893e-07,
|
|
"loss": 0.4803347110748291,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2090,
|
|
"token_acc": 0.856787781665958,
|
|
"train_speed(iter/s)": 0.126471
|
|
},
|
|
{
|
|
"epoch": 2.5646482431294495,
|
|
"grad_norm": 1.0033832788467407,
|
|
"learning_rate": 5.043433759336158e-07,
|
|
"loss": 0.4686880111694336,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2095,
|
|
"token_acc": 0.8430141843971631,
|
|
"train_speed(iter/s)": 0.126552
|
|
},
|
|
{
|
|
"epoch": 2.5707724106254304,
|
|
"grad_norm": 0.9607586860656738,
|
|
"learning_rate": 4.903938560361698e-07,
|
|
"loss": 0.48217024803161623,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2100,
|
|
"token_acc": 0.8403233581785144,
|
|
"train_speed(iter/s)": 0.126627
|
|
},
|
|
{
|
|
"epoch": 2.5707724106254304,
|
|
"eval_loss": 0.6173272728919983,
|
|
"eval_runtime": 30.0012,
|
|
"eval_samples_per_second": 17.566,
|
|
"eval_steps_per_second": 4.4,
|
|
"eval_token_acc": 0.8270313549253577,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 2.5768965781214117,
|
|
"grad_norm": 0.9122663140296936,
|
|
"learning_rate": 4.76630011402901e-07,
|
|
"loss": 0.45368499755859376,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2105,
|
|
"token_acc": 0.8375441091626303,
|
|
"train_speed(iter/s)": 0.1264
|
|
},
|
|
{
|
|
"epoch": 2.5830207456173926,
|
|
"grad_norm": 0.9508864879608154,
|
|
"learning_rate": 4.630524087364019e-07,
|
|
"loss": 0.4732816696166992,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2110,
|
|
"token_acc": 0.8504168897728142,
|
|
"train_speed(iter/s)": 0.126473
|
|
},
|
|
{
|
|
"epoch": 2.5891449131133735,
|
|
"grad_norm": 0.9362801909446716,
|
|
"learning_rate": 4.4966160707107075e-07,
|
|
"loss": 0.48420238494873047,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2115,
|
|
"token_acc": 0.8500457797822917,
|
|
"train_speed(iter/s)": 0.126544
|
|
},
|
|
{
|
|
"epoch": 2.595269080609355,
|
|
"grad_norm": 0.9054739475250244,
|
|
"learning_rate": 4.364581577500987e-07,
|
|
"loss": 0.43644113540649415,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2120,
|
|
"token_acc": 0.8647457297507536,
|
|
"train_speed(iter/s)": 0.126611
|
|
},
|
|
{
|
|
"epoch": 2.595269080609355,
|
|
"eval_loss": 0.6172040104866028,
|
|
"eval_runtime": 29.9489,
|
|
"eval_samples_per_second": 17.597,
|
|
"eval_steps_per_second": 4.408,
|
|
"eval_token_acc": 0.827186321607521,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 2.6013932481053357,
|
|
"grad_norm": 0.9941717386245728,
|
|
"learning_rate": 4.2344260440276455e-07,
|
|
"loss": 0.5040837287902832,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2125,
|
|
"token_acc": 0.8330278541475742,
|
|
"train_speed(iter/s)": 0.126387
|
|
},
|
|
{
|
|
"epoch": 2.6075174156013166,
|
|
"grad_norm": 0.9401910901069641,
|
|
"learning_rate": 4.10615482922056e-07,
|
|
"loss": 0.4815082550048828,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2130,
|
|
"token_acc": 0.8560084700899947,
|
|
"train_speed(iter/s)": 0.126454
|
|
},
|
|
{
|
|
"epoch": 2.613641583097298,
|
|
"grad_norm": 0.9616697430610657,
|
|
"learning_rate": 3.979773214426019e-07,
|
|
"loss": 0.484088134765625,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2135,
|
|
"token_acc": 0.8334314302530901,
|
|
"train_speed(iter/s)": 0.126532
|
|
},
|
|
{
|
|
"epoch": 2.6197657505932788,
|
|
"grad_norm": 0.9681402444839478,
|
|
"learning_rate": 3.85528640318929e-07,
|
|
"loss": 0.46643218994140623,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2140,
|
|
"token_acc": 0.8545515745381106,
|
|
"train_speed(iter/s)": 0.126601
|
|
},
|
|
{
|
|
"epoch": 2.6197657505932788,
|
|
"eval_loss": 0.6171387434005737,
|
|
"eval_runtime": 30.0426,
|
|
"eval_samples_per_second": 17.542,
|
|
"eval_steps_per_second": 4.394,
|
|
"eval_token_acc": 0.8271914871635931,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 2.6258899180892596,
|
|
"grad_norm": 0.9676551222801208,
|
|
"learning_rate": 3.732699521040378e-07,
|
|
"loss": 0.46207480430603026,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2145,
|
|
"token_acc": 0.8425608384317814,
|
|
"train_speed(iter/s)": 0.126373
|
|
},
|
|
{
|
|
"epoch": 2.632014085585241,
|
|
"grad_norm": 1.003116250038147,
|
|
"learning_rate": 3.612017615282964e-07,
|
|
"loss": 0.4896972179412842,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2150,
|
|
"token_acc": 0.8342796309439319,
|
|
"train_speed(iter/s)": 0.126459
|
|
},
|
|
{
|
|
"epoch": 2.638138253081222,
|
|
"grad_norm": 1.0150328874588013,
|
|
"learning_rate": 3.49324565478662e-07,
|
|
"loss": 0.513043737411499,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2155,
|
|
"token_acc": 0.855285740368815,
|
|
"train_speed(iter/s)": 0.126542
|
|
},
|
|
{
|
|
"epoch": 2.6442624205772027,
|
|
"grad_norm": 0.9732238054275513,
|
|
"learning_rate": 3.3763885297822153e-07,
|
|
"loss": 0.4932279586791992,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2160,
|
|
"token_acc": 0.8633442370598422,
|
|
"train_speed(iter/s)": 0.126609
|
|
},
|
|
{
|
|
"epoch": 2.6442624205772027,
|
|
"eval_loss": 0.617060124874115,
|
|
"eval_runtime": 29.801,
|
|
"eval_samples_per_second": 17.684,
|
|
"eval_steps_per_second": 4.429,
|
|
"eval_token_acc": 0.827367116070045,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 2.650386588073184,
|
|
"grad_norm": 0.8650747537612915,
|
|
"learning_rate": 3.261451051660547e-07,
|
|
"loss": 0.47266697883605957,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2165,
|
|
"token_acc": 0.8351215537145186,
|
|
"train_speed(iter/s)": 0.126394
|
|
},
|
|
{
|
|
"epoch": 2.656510755569165,
|
|
"grad_norm": 1.0742039680480957,
|
|
"learning_rate": 3.1484379527742746e-07,
|
|
"loss": 0.48064508438110354,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2170,
|
|
"token_acc": 0.8557394880859308,
|
|
"train_speed(iter/s)": 0.126467
|
|
},
|
|
{
|
|
"epoch": 2.662634923065146,
|
|
"grad_norm": 1.0016287565231323,
|
|
"learning_rate": 3.037353886243055e-07,
|
|
"loss": 0.46164817810058595,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2175,
|
|
"token_acc": 0.8649181267691426,
|
|
"train_speed(iter/s)": 0.126536
|
|
},
|
|
{
|
|
"epoch": 2.6687590905611267,
|
|
"grad_norm": 0.9850811958312988,
|
|
"learning_rate": 2.928203425761961e-07,
|
|
"loss": 0.4678659915924072,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2180,
|
|
"token_acc": 0.8360384946854352,
|
|
"train_speed(iter/s)": 0.12661
|
|
},
|
|
{
|
|
"epoch": 2.6687590905611267,
|
|
"eval_loss": 0.6176232099533081,
|
|
"eval_runtime": 29.9574,
|
|
"eval_samples_per_second": 17.592,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8272896327289633,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 2.674883258057108,
|
|
"grad_norm": 0.9312725067138672,
|
|
"learning_rate": 2.820991065413159e-07,
|
|
"loss": 0.49228496551513673,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2185,
|
|
"token_acc": 0.8372572060551601,
|
|
"train_speed(iter/s)": 0.12639
|
|
},
|
|
{
|
|
"epoch": 2.681007425553089,
|
|
"grad_norm": 0.9902373552322388,
|
|
"learning_rate": 2.71572121948091e-07,
|
|
"loss": 0.48822717666625975,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2190,
|
|
"token_acc": 0.8330284513291558,
|
|
"train_speed(iter/s)": 0.126463
|
|
},
|
|
{
|
|
"epoch": 2.68713159304907,
|
|
"grad_norm": 0.9834261536598206,
|
|
"learning_rate": 2.612398222269752e-07,
|
|
"loss": 0.47580180168151853,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2195,
|
|
"token_acc": 0.8485915492957746,
|
|
"train_speed(iter/s)": 0.126526
|
|
},
|
|
{
|
|
"epoch": 2.6932557605450507,
|
|
"grad_norm": 1.019545316696167,
|
|
"learning_rate": 2.511026327926114e-07,
|
|
"loss": 0.5008028507232666,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2200,
|
|
"token_acc": 0.8462324594159034,
|
|
"train_speed(iter/s)": 0.126603
|
|
},
|
|
{
|
|
"epoch": 2.6932557605450507,
|
|
"eval_loss": 0.6174827218055725,
|
|
"eval_runtime": 30.0074,
|
|
"eval_samples_per_second": 17.562,
|
|
"eval_steps_per_second": 4.399,
|
|
"eval_token_acc": 0.8274962549718478,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 2.699379928041032,
|
|
"grad_norm": 1.0158801078796387,
|
|
"learning_rate": 2.411609710263091e-07,
|
|
"loss": 0.46188907623291015,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2205,
|
|
"token_acc": 0.8344093700899872,
|
|
"train_speed(iter/s)": 0.126407
|
|
},
|
|
{
|
|
"epoch": 2.705504095537013,
|
|
"grad_norm": 0.8824003338813782,
|
|
"learning_rate": 2.314152462588659e-07,
|
|
"loss": 0.4691601753234863,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2210,
|
|
"token_acc": 0.8543765099423899,
|
|
"train_speed(iter/s)": 0.126481
|
|
},
|
|
{
|
|
"epoch": 2.7116282630329938,
|
|
"grad_norm": 0.9510777592658997,
|
|
"learning_rate": 2.2186585975370935e-07,
|
|
"loss": 0.4572303771972656,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2215,
|
|
"token_acc": 0.8658688406088109,
|
|
"train_speed(iter/s)": 0.126579
|
|
},
|
|
{
|
|
"epoch": 2.717752430528975,
|
|
"grad_norm": 0.9548938274383545,
|
|
"learning_rate": 2.1251320469037827e-07,
|
|
"loss": 0.4614152431488037,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2220,
|
|
"token_acc": 0.8602180404138602,
|
|
"train_speed(iter/s)": 0.126644
|
|
},
|
|
{
|
|
"epoch": 2.717752430528975,
|
|
"eval_loss": 0.6176718473434448,
|
|
"eval_runtime": 29.9684,
|
|
"eval_samples_per_second": 17.585,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8273722816261171,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 2.723876598024956,
|
|
"grad_norm": 0.9301165342330933,
|
|
"learning_rate": 2.0335766614833275e-07,
|
|
"loss": 0.4707462310791016,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2225,
|
|
"token_acc": 0.8354241550286274,
|
|
"train_speed(iter/s)": 0.126431
|
|
},
|
|
{
|
|
"epoch": 2.730000765520937,
|
|
"grad_norm": 0.9284428954124451,
|
|
"learning_rate": 1.9439962109110032e-07,
|
|
"loss": 0.45088810920715333,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2230,
|
|
"token_acc": 0.8519607113624525,
|
|
"train_speed(iter/s)": 0.126503
|
|
},
|
|
{
|
|
"epoch": 2.736124933016918,
|
|
"grad_norm": 0.9004064798355103,
|
|
"learning_rate": 1.8563943835075315e-07,
|
|
"loss": 0.4744719982147217,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2235,
|
|
"token_acc": 0.8538511282801811,
|
|
"train_speed(iter/s)": 0.126568
|
|
},
|
|
{
|
|
"epoch": 2.742249100512899,
|
|
"grad_norm": 0.9502014517784119,
|
|
"learning_rate": 1.770774786127244e-07,
|
|
"loss": 0.4691894054412842,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2240,
|
|
"token_acc": 0.8554979031914137,
|
|
"train_speed(iter/s)": 0.12664
|
|
},
|
|
{
|
|
"epoch": 2.742249100512899,
|
|
"eval_loss": 0.6172018051147461,
|
|
"eval_runtime": 29.9615,
|
|
"eval_samples_per_second": 17.589,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8271501627150163,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 2.74837326800888,
|
|
"grad_norm": 1.0235203504562378,
|
|
"learning_rate": 1.6871409440095687e-07,
|
|
"loss": 0.5208240509033203,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2245,
|
|
"token_acc": 0.8266149979641408,
|
|
"train_speed(iter/s)": 0.12644
|
|
},
|
|
{
|
|
"epoch": 2.7544974355048613,
|
|
"grad_norm": 0.9667299389839172,
|
|
"learning_rate": 1.6054963006338742e-07,
|
|
"loss": 0.46748833656311034,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2250,
|
|
"token_acc": 0.8670063058890591,
|
|
"train_speed(iter/s)": 0.126499
|
|
},
|
|
{
|
|
"epoch": 2.760621603000842,
|
|
"grad_norm": 0.990592360496521,
|
|
"learning_rate": 1.5258442175777045e-07,
|
|
"loss": 0.4987760066986084,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2255,
|
|
"token_acc": 0.8603155845961351,
|
|
"train_speed(iter/s)": 0.126571
|
|
},
|
|
{
|
|
"epoch": 2.766745770496823,
|
|
"grad_norm": 1.023600459098816,
|
|
"learning_rate": 1.44818797437834e-07,
|
|
"loss": 0.5020921707153321,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2260,
|
|
"token_acc": 0.8367556063532101,
|
|
"train_speed(iter/s)": 0.126649
|
|
},
|
|
{
|
|
"epoch": 2.766745770496823,
|
|
"eval_loss": 0.6171240210533142,
|
|
"eval_runtime": 29.9592,
|
|
"eval_samples_per_second": 17.591,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8274600960793429,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 2.7728699379928043,
|
|
"grad_norm": 0.9850562810897827,
|
|
"learning_rate": 1.372530768397845e-07,
|
|
"loss": 0.49890799522399903,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2265,
|
|
"token_acc": 0.8388872065619528,
|
|
"train_speed(iter/s)": 0.126453
|
|
},
|
|
{
|
|
"epoch": 2.7789941054887852,
|
|
"grad_norm": 1.0308210849761963,
|
|
"learning_rate": 1.2988757146913223e-07,
|
|
"loss": 0.49098944664001465,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2270,
|
|
"token_acc": 0.8366363778787405,
|
|
"train_speed(iter/s)": 0.12653
|
|
},
|
|
{
|
|
"epoch": 2.785118272984766,
|
|
"grad_norm": 0.9139099717140198,
|
|
"learning_rate": 1.227225845878721e-07,
|
|
"loss": 0.482135009765625,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2275,
|
|
"token_acc": 0.8365253330381195,
|
|
"train_speed(iter/s)": 0.12659
|
|
},
|
|
{
|
|
"epoch": 2.791242440480747,
|
|
"grad_norm": 1.0616044998168945,
|
|
"learning_rate": 1.157584112019966e-07,
|
|
"loss": 0.5007448196411133,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2280,
|
|
"token_acc": 0.83696904524157,
|
|
"train_speed(iter/s)": 0.126675
|
|
},
|
|
{
|
|
"epoch": 2.791242440480747,
|
|
"eval_loss": 0.6172557473182678,
|
|
"eval_runtime": 29.9856,
|
|
"eval_samples_per_second": 17.575,
|
|
"eval_steps_per_second": 4.402,
|
|
"eval_token_acc": 0.8271966527196652,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 2.7973666079767283,
|
|
"grad_norm": 0.8870491981506348,
|
|
"learning_rate": 1.0899533804934637e-07,
|
|
"loss": 0.4659425258636475,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2285,
|
|
"token_acc": 0.8472861329549524,
|
|
"train_speed(iter/s)": 0.126465
|
|
},
|
|
{
|
|
"epoch": 2.803490775472709,
|
|
"grad_norm": 0.8363329768180847,
|
|
"learning_rate": 1.0243364358780817e-07,
|
|
"loss": 0.46242237091064453,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2290,
|
|
"token_acc": 0.8513745704467354,
|
|
"train_speed(iter/s)": 0.126527
|
|
},
|
|
{
|
|
"epoch": 2.80961494296869,
|
|
"grad_norm": 1.0086287260055542,
|
|
"learning_rate": 9.607359798384785e-08,
|
|
"loss": 0.46642189025878905,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2295,
|
|
"token_acc": 0.8569345046297867,
|
|
"train_speed(iter/s)": 0.126585
|
|
},
|
|
{
|
|
"epoch": 2.815739110464671,
|
|
"grad_norm": 0.9434347748756409,
|
|
"learning_rate": 8.991546310138599e-08,
|
|
"loss": 0.48851499557495115,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2300,
|
|
"token_acc": 0.8429763909289578,
|
|
"train_speed(iter/s)": 0.126657
|
|
},
|
|
{
|
|
"epoch": 2.815739110464671,
|
|
"eval_loss": 0.6172496676445007,
|
|
"eval_runtime": 29.9586,
|
|
"eval_samples_per_second": 17.591,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8273929438504055,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 2.8218632779606523,
|
|
"grad_norm": 0.8611441254615784,
|
|
"learning_rate": 8.395949249101754e-08,
|
|
"loss": 0.4366280555725098,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2305,
|
|
"token_acc": 0.8454527389547593,
|
|
"train_speed(iter/s)": 0.126446
|
|
},
|
|
{
|
|
"epoch": 2.827987445456633,
|
|
"grad_norm": 0.9308704137802124,
|
|
"learning_rate": 7.820593137957244e-08,
|
|
"loss": 0.49471750259399416,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2310,
|
|
"token_acc": 0.8447947341070501,
|
|
"train_speed(iter/s)": 0.126514
|
|
},
|
|
{
|
|
"epoch": 2.834111612952614,
|
|
"grad_norm": 0.9204055070877075,
|
|
"learning_rate": 7.265501666001706e-08,
|
|
"loss": 0.5066485404968262,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2315,
|
|
"token_acc": 0.8312655086848635,
|
|
"train_speed(iter/s)": 0.12658
|
|
},
|
|
{
|
|
"epoch": 2.8402357804485954,
|
|
"grad_norm": 0.9611912369728088,
|
|
"learning_rate": 6.730697688170251e-08,
|
|
"loss": 0.4841705322265625,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2320,
|
|
"token_acc": 0.8579581483830057,
|
|
"train_speed(iter/s)": 0.126649
|
|
},
|
|
{
|
|
"epoch": 2.8402357804485954,
|
|
"eval_loss": 0.6172900199890137,
|
|
"eval_runtime": 29.9623,
|
|
"eval_samples_per_second": 17.589,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8273154605093238,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 2.8463599479445763,
|
|
"grad_norm": 0.9565967321395874,
|
|
"learning_rate": 6.216203224095386e-08,
|
|
"loss": 0.45609331130981445,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2325,
|
|
"token_acc": 0.8340782438969194,
|
|
"train_speed(iter/s)": 0.126439
|
|
},
|
|
{
|
|
"epoch": 2.852484115440557,
|
|
"grad_norm": 0.9000151753425598,
|
|
"learning_rate": 5.722039457200235e-08,
|
|
"loss": 0.46810379028320315,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2330,
|
|
"token_acc": 0.854035216434336,
|
|
"train_speed(iter/s)": 0.126509
|
|
},
|
|
{
|
|
"epoch": 2.8586082829365385,
|
|
"grad_norm": 0.9673875570297241,
|
|
"learning_rate": 5.248226733826689e-08,
|
|
"loss": 0.496975040435791,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2335,
|
|
"token_acc": 0.8512393729597877,
|
|
"train_speed(iter/s)": 0.126574
|
|
},
|
|
{
|
|
"epoch": 2.8647324504325193,
|
|
"grad_norm": 0.9579722881317139,
|
|
"learning_rate": 4.794784562397459e-08,
|
|
"loss": 0.5033215045928955,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2340,
|
|
"token_acc": 0.854410310614068,
|
|
"train_speed(iter/s)": 0.126653
|
|
},
|
|
{
|
|
"epoch": 2.8647324504325193,
|
|
"eval_loss": 0.6173553466796875,
|
|
"eval_runtime": 30.0724,
|
|
"eval_samples_per_second": 17.524,
|
|
"eval_steps_per_second": 4.389,
|
|
"eval_token_acc": 0.8274394338550545,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 2.8708566179285,
|
|
"grad_norm": 0.967046320438385,
|
|
"learning_rate": 4.361731612612607e-08,
|
|
"loss": 0.4593523025512695,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2345,
|
|
"token_acc": 0.8404793034195522,
|
|
"train_speed(iter/s)": 0.126443
|
|
},
|
|
{
|
|
"epoch": 2.8769807854244815,
|
|
"grad_norm": 1.005210280418396,
|
|
"learning_rate": 3.949085714681389e-08,
|
|
"loss": 0.44494943618774413,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2350,
|
|
"token_acc": 0.8719988481336165,
|
|
"train_speed(iter/s)": 0.126502
|
|
},
|
|
{
|
|
"epoch": 2.8831049529204624,
|
|
"grad_norm": 0.9526214599609375,
|
|
"learning_rate": 3.556863858587833e-08,
|
|
"loss": 0.4602672576904297,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2355,
|
|
"token_acc": 0.8692367364835238,
|
|
"train_speed(iter/s)": 0.126552
|
|
},
|
|
{
|
|
"epoch": 2.8892291204164433,
|
|
"grad_norm": 0.9232168197631836,
|
|
"learning_rate": 3.185082193391143e-08,
|
|
"loss": 0.48589048385620115,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2360,
|
|
"token_acc": 0.8542855225182898,
|
|
"train_speed(iter/s)": 0.126625
|
|
},
|
|
{
|
|
"epoch": 2.8892291204164433,
|
|
"eval_loss": 0.6173009276390076,
|
|
"eval_runtime": 29.9578,
|
|
"eval_samples_per_second": 17.591,
|
|
"eval_steps_per_second": 4.406,
|
|
"eval_token_acc": 0.8273516194018286,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 2.8953532879124246,
|
|
"grad_norm": 0.9514647126197815,
|
|
"learning_rate": 2.8337560265608853e-08,
|
|
"loss": 0.46231327056884763,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2365,
|
|
"token_acc": 0.8406889558929477,
|
|
"train_speed(iter/s)": 0.126404
|
|
},
|
|
{
|
|
"epoch": 2.9014774554084055,
|
|
"grad_norm": 0.9117730259895325,
|
|
"learning_rate": 2.5028998233467272e-08,
|
|
"loss": 0.47223424911499023,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2370,
|
|
"token_acc": 0.8618170593682478,
|
|
"train_speed(iter/s)": 0.12647
|
|
},
|
|
{
|
|
"epoch": 2.9076016229043864,
|
|
"grad_norm": 0.9326623678207397,
|
|
"learning_rate": 2.1925272061829038e-08,
|
|
"loss": 0.47979536056518557,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2375,
|
|
"token_acc": 0.854614639049431,
|
|
"train_speed(iter/s)": 0.126535
|
|
},
|
|
{
|
|
"epoch": 2.9137257904003677,
|
|
"grad_norm": 0.9446553587913513,
|
|
"learning_rate": 1.9026509541272276e-08,
|
|
"loss": 0.4813851356506348,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2380,
|
|
"token_acc": 0.8567724059536074,
|
|
"train_speed(iter/s)": 0.126602
|
|
},
|
|
{
|
|
"epoch": 2.9137257904003677,
|
|
"eval_loss": 0.6172758936882019,
|
|
"eval_runtime": 29.898,
|
|
"eval_samples_per_second": 17.627,
|
|
"eval_steps_per_second": 4.415,
|
|
"eval_token_acc": 0.8273154605093238,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 2.9198499578963486,
|
|
"grad_norm": 0.9821135401725769,
|
|
"learning_rate": 1.6332830023350065e-08,
|
|
"loss": 0.4704907417297363,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2385,
|
|
"token_acc": 0.835956510119977,
|
|
"train_speed(iter/s)": 0.126422
|
|
},
|
|
{
|
|
"epoch": 2.9259741253923295,
|
|
"grad_norm": 0.9702714681625366,
|
|
"learning_rate": 1.3844344415676059e-08,
|
|
"loss": 0.5124542713165283,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2390,
|
|
"token_acc": 0.8496258847320526,
|
|
"train_speed(iter/s)": 0.126502
|
|
},
|
|
{
|
|
"epoch": 2.9320982928883104,
|
|
"grad_norm": 0.8899897336959839,
|
|
"learning_rate": 1.156115517735812e-08,
|
|
"loss": 0.45607595443725585,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2395,
|
|
"token_acc": 0.8606019542115398,
|
|
"train_speed(iter/s)": 0.12656
|
|
},
|
|
{
|
|
"epoch": 2.9382224603842912,
|
|
"grad_norm": 0.8827829360961914,
|
|
"learning_rate": 9.48335631477948e-09,
|
|
"loss": 0.4859360694885254,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2400,
|
|
"token_acc": 0.8470638693305693,
|
|
"train_speed(iter/s)": 0.126623
|
|
},
|
|
{
|
|
"epoch": 2.9382224603842912,
|
|
"eval_loss": 0.617277204990387,
|
|
"eval_runtime": 29.9293,
|
|
"eval_samples_per_second": 17.608,
|
|
"eval_steps_per_second": 4.41,
|
|
"eval_token_acc": 0.8274910894157756,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 2.9443466278802726,
|
|
"grad_norm": 0.9587947726249695,
|
|
"learning_rate": 7.611033377729615e-09,
|
|
"loss": 0.5192126274108887,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2405,
|
|
"token_acc": 0.8289369284093598,
|
|
"train_speed(iter/s)": 0.126434
|
|
},
|
|
{
|
|
"epoch": 2.9504707953762535,
|
|
"grad_norm": 0.9472404718399048,
|
|
"learning_rate": 5.944263455879284e-09,
|
|
"loss": 0.4903435707092285,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2410,
|
|
"token_acc": 0.8476331360946746,
|
|
"train_speed(iter/s)": 0.1265
|
|
},
|
|
{
|
|
"epoch": 2.9565949628722343,
|
|
"grad_norm": 0.9245522618293762,
|
|
"learning_rate": 4.4831151756091766e-09,
|
|
"loss": 0.4952064037322998,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2415,
|
|
"token_acc": 0.8508173686555399,
|
|
"train_speed(iter/s)": 0.126559
|
|
},
|
|
{
|
|
"epoch": 2.9627191303682157,
|
|
"grad_norm": 0.8794329762458801,
|
|
"learning_rate": 3.227648697182173e-09,
|
|
"loss": 0.4453686237335205,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2420,
|
|
"token_acc": 0.850396277175889,
|
|
"train_speed(iter/s)": 0.126607
|
|
},
|
|
{
|
|
"epoch": 2.9627191303682157,
|
|
"eval_loss": 0.6172167062759399,
|
|
"eval_runtime": 29.9678,
|
|
"eval_samples_per_second": 17.586,
|
|
"eval_steps_per_second": 4.405,
|
|
"eval_token_acc": 0.8275840694250736,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 2.9688432978641965,
|
|
"grad_norm": 0.9200014472007751,
|
|
"learning_rate": 2.177915712268108e-09,
|
|
"loss": 0.4687533378601074,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2425,
|
|
"token_acc": 0.8394573675668927,
|
|
"train_speed(iter/s)": 0.126424
|
|
},
|
|
{
|
|
"epoch": 2.9749674653601774,
|
|
"grad_norm": 0.9681059718132019,
|
|
"learning_rate": 1.3339594418138036e-09,
|
|
"loss": 0.46721735000610354,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2430,
|
|
"token_acc": 0.868303713612332,
|
|
"train_speed(iter/s)": 0.126481
|
|
},
|
|
{
|
|
"epoch": 2.9810916328561587,
|
|
"grad_norm": 0.9787800908088684,
|
|
"learning_rate": 6.958146342650463e-10,
|
|
"loss": 0.46620631217956543,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2435,
|
|
"token_acc": 0.8369136498098324,
|
|
"train_speed(iter/s)": 0.126545
|
|
},
|
|
{
|
|
"epoch": 2.9872158003521396,
|
|
"grad_norm": 0.9923424124717712,
|
|
"learning_rate": 2.6350756413440203e-10,
|
|
"loss": 0.4888926029205322,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2440,
|
|
"token_acc": 0.849891526288923,
|
|
"train_speed(iter/s)": 0.12661
|
|
},
|
|
{
|
|
"epoch": 2.9872158003521396,
|
|
"eval_loss": 0.6173287630081177,
|
|
"eval_runtime": 29.9491,
|
|
"eval_samples_per_second": 17.597,
|
|
"eval_steps_per_second": 4.407,
|
|
"eval_token_acc": 0.8274859238597035,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 2.9933399678481205,
|
|
"grad_norm": 0.9687269926071167,
|
|
"learning_rate": 3.7056030921522877e-11,
|
|
"loss": 0.4582235336303711,
|
|
"memory(GiB)": 36.87,
|
|
"step": 2445,
|
|
"token_acc": 0.8376420474448975,
|
|
"train_speed(iter/s)": 0.126414
|
|
},
|
|
{
|
|
"epoch": 2.9970144683457094,
|
|
"eval_loss": 0.617242693901062,
|
|
"eval_runtime": 29.8933,
|
|
"eval_samples_per_second": 17.629,
|
|
"eval_steps_per_second": 4.416,
|
|
"eval_token_acc": 0.8273826127382613,
|
|
"step": 2448
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2448,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.502734662946783e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|