1750 lines
50 KiB
JSON
1750 lines
50 KiB
JSON
{
|
|
"best_global_step": 460,
|
|
"best_metric": 0.24467714,
|
|
"best_model_checkpoint": "/data/home/scyb089/CODE/scripts/ms-swift/3b-new/v29-20250507-134003/checkpoint-460",
|
|
"epoch": 2.9911123081066524,
|
|
"eval_steps": 20,
|
|
"global_step": 696,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0043091839482897925,
|
|
"grad_norm": 2.422614812850952,
|
|
"learning_rate": 9.99994906450425e-06,
|
|
"loss": 0.3783050775527954,
|
|
"memory(GiB)": 29.06,
|
|
"step": 1,
|
|
"token_acc": 0.8827366746221161,
|
|
"train_speed(iter/s)": 0.066016
|
|
},
|
|
{
|
|
"epoch": 0.02154591974144896,
|
|
"grad_norm": 1.8293063640594482,
|
|
"learning_rate": 9.99872666449397e-06,
|
|
"loss": 0.304108202457428,
|
|
"memory(GiB)": 29.06,
|
|
"step": 5,
|
|
"token_acc": 0.9021085311428756,
|
|
"train_speed(iter/s)": 0.123613
|
|
},
|
|
{
|
|
"epoch": 0.04309183948289792,
|
|
"grad_norm": 1.0181312561035156,
|
|
"learning_rate": 9.994907306529203e-06,
|
|
"loss": 0.30131869316101073,
|
|
"memory(GiB)": 29.06,
|
|
"step": 10,
|
|
"token_acc": 0.9080832657474993,
|
|
"train_speed(iter/s)": 0.141649
|
|
},
|
|
{
|
|
"epoch": 0.06463775922434689,
|
|
"grad_norm": 0.8247085213661194,
|
|
"learning_rate": 9.988543871435342e-06,
|
|
"loss": 0.29039506912231444,
|
|
"memory(GiB)": 29.07,
|
|
"step": 15,
|
|
"token_acc": 0.9026237111961776,
|
|
"train_speed(iter/s)": 0.145623
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"grad_norm": 0.8313817977905273,
|
|
"learning_rate": 9.979639600327522e-06,
|
|
"loss": 0.2841599941253662,
|
|
"memory(GiB)": 29.07,
|
|
"step": 20,
|
|
"token_acc": 0.9051915284043123,
|
|
"train_speed(iter/s)": 0.148055
|
|
},
|
|
{
|
|
"epoch": 0.08618367896579585,
|
|
"eval_loss": 0.30637428164482117,
|
|
"eval_runtime": 9.2534,
|
|
"eval_samples_per_second": 16.21,
|
|
"eval_steps_per_second": 4.107,
|
|
"eval_token_acc": 0.8990793096302436,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.10772959870724481,
|
|
"grad_norm": 0.8578795790672302,
|
|
"learning_rate": 9.96819902845557e-06,
|
|
"loss": 0.26584060192108155,
|
|
"memory(GiB)": 29.07,
|
|
"step": 25,
|
|
"token_acc": 0.9061398699976505,
|
|
"train_speed(iter/s)": 0.133467
|
|
},
|
|
{
|
|
"epoch": 0.12927551844869378,
|
|
"grad_norm": 0.8536065220832825,
|
|
"learning_rate": 9.954227982894034e-06,
|
|
"loss": 0.2721074342727661,
|
|
"memory(GiB)": 29.07,
|
|
"step": 30,
|
|
"token_acc": 0.9100939031401429,
|
|
"train_speed(iter/s)": 0.137239
|
|
},
|
|
{
|
|
"epoch": 0.15082143819014274,
|
|
"grad_norm": 0.8567067384719849,
|
|
"learning_rate": 9.937733579574263e-06,
|
|
"loss": 0.26833133697509765,
|
|
"memory(GiB)": 29.07,
|
|
"step": 35,
|
|
"token_acc": 0.9014427903508017,
|
|
"train_speed(iter/s)": 0.139069
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"grad_norm": 0.8759805560112,
|
|
"learning_rate": 9.918724219660013e-06,
|
|
"loss": 0.2696810483932495,
|
|
"memory(GiB)": 29.07,
|
|
"step": 40,
|
|
"token_acc": 0.9073056300268096,
|
|
"train_speed(iter/s)": 0.141475
|
|
},
|
|
{
|
|
"epoch": 0.1723673579315917,
|
|
"eval_loss": 0.28540194034576416,
|
|
"eval_runtime": 9.1496,
|
|
"eval_samples_per_second": 16.394,
|
|
"eval_steps_per_second": 4.153,
|
|
"eval_token_acc": 0.9062731282749638,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.19391327767304067,
|
|
"grad_norm": 0.8179587125778198,
|
|
"learning_rate": 9.897209585268459e-06,
|
|
"loss": 0.26938886642456056,
|
|
"memory(GiB)": 29.07,
|
|
"step": 45,
|
|
"token_acc": 0.9068100358422939,
|
|
"train_speed(iter/s)": 0.134153
|
|
},
|
|
{
|
|
"epoch": 0.21545919741448963,
|
|
"grad_norm": 0.7597087621688843,
|
|
"learning_rate": 9.873200634538746e-06,
|
|
"loss": 0.2661460876464844,
|
|
"memory(GiB)": 29.07,
|
|
"step": 50,
|
|
"token_acc": 0.9265829903627394,
|
|
"train_speed(iter/s)": 0.135778
|
|
},
|
|
{
|
|
"epoch": 0.23700511715593858,
|
|
"grad_norm": 0.9001930952072144,
|
|
"learning_rate": 9.846709596050646e-06,
|
|
"loss": 0.2637378692626953,
|
|
"memory(GiB)": 29.07,
|
|
"step": 55,
|
|
"token_acc": 0.9097354466352211,
|
|
"train_speed(iter/s)": 0.137425
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"grad_norm": 0.894802451133728,
|
|
"learning_rate": 9.817749962596115e-06,
|
|
"loss": 0.26340594291687014,
|
|
"memory(GiB)": 29.07,
|
|
"step": 60,
|
|
"token_acc": 0.9115004961612785,
|
|
"train_speed(iter/s)": 0.139172
|
|
},
|
|
{
|
|
"epoch": 0.25855103689738757,
|
|
"eval_loss": 0.2748047709465027,
|
|
"eval_runtime": 9.1341,
|
|
"eval_samples_per_second": 16.422,
|
|
"eval_steps_per_second": 4.16,
|
|
"eval_token_acc": 0.9068652121140354,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.28009695663883655,
|
|
"grad_norm": 0.7363941669464111,
|
|
"learning_rate": 9.786336484306966e-06,
|
|
"loss": 0.27098889350891114,
|
|
"memory(GiB)": 29.07,
|
|
"step": 65,
|
|
"token_acc": 0.9083590733590734,
|
|
"train_speed(iter/s)": 0.135246
|
|
},
|
|
{
|
|
"epoch": 0.3016428763802855,
|
|
"grad_norm": 0.8381310105323792,
|
|
"learning_rate": 9.752485161142103e-06,
|
|
"loss": 0.2478638172149658,
|
|
"memory(GiB)": 29.07,
|
|
"step": 70,
|
|
"token_acc": 0.9232435033686237,
|
|
"train_speed(iter/s)": 0.136407
|
|
},
|
|
{
|
|
"epoch": 0.32318879612173446,
|
|
"grad_norm": 0.7714306116104126,
|
|
"learning_rate": 9.716213234738216e-06,
|
|
"loss": 0.2461942672729492,
|
|
"memory(GiB)": 29.07,
|
|
"step": 75,
|
|
"token_acc": 0.9149177216982468,
|
|
"train_speed(iter/s)": 0.137517
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"grad_norm": 0.9231055974960327,
|
|
"learning_rate": 9.677539179628005e-06,
|
|
"loss": 0.24781365394592286,
|
|
"memory(GiB)": 29.07,
|
|
"step": 80,
|
|
"token_acc": 0.9261125903385318,
|
|
"train_speed(iter/s)": 0.138742
|
|
},
|
|
{
|
|
"epoch": 0.3447347158631834,
|
|
"eval_loss": 0.26845118403434753,
|
|
"eval_runtime": 9.1426,
|
|
"eval_samples_per_second": 16.407,
|
|
"eval_steps_per_second": 4.156,
|
|
"eval_token_acc": 0.9084194321915984,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.36628063560463237,
|
|
"grad_norm": 0.7188398241996765,
|
|
"learning_rate": 9.636482693830488e-06,
|
|
"loss": 0.26428771018981934,
|
|
"memory(GiB)": 29.07,
|
|
"step": 85,
|
|
"token_acc": 0.9042584492261823,
|
|
"train_speed(iter/s)": 0.135179
|
|
},
|
|
{
|
|
"epoch": 0.38782655534608135,
|
|
"grad_norm": 0.8020254969596863,
|
|
"learning_rate": 9.59306468881811e-06,
|
|
"loss": 0.2622120141983032,
|
|
"memory(GiB)": 29.07,
|
|
"step": 90,
|
|
"token_acc": 0.9098044980155814,
|
|
"train_speed(iter/s)": 0.13707
|
|
},
|
|
{
|
|
"epoch": 0.4093724750875303,
|
|
"grad_norm": 0.8326025605201721,
|
|
"learning_rate": 9.547307278865823e-06,
|
|
"loss": 0.2394162893295288,
|
|
"memory(GiB)": 29.07,
|
|
"step": 95,
|
|
"token_acc": 0.9164833305127771,
|
|
"train_speed(iter/s)": 0.138039
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"grad_norm": 0.7730870842933655,
|
|
"learning_rate": 9.499233769787534e-06,
|
|
"loss": 0.24491536617279053,
|
|
"memory(GiB)": 29.07,
|
|
"step": 100,
|
|
"token_acc": 0.9174321989744152,
|
|
"train_speed(iter/s)": 0.139304
|
|
},
|
|
{
|
|
"epoch": 0.43091839482897926,
|
|
"eval_loss": 0.26416531205177307,
|
|
"eval_runtime": 9.1324,
|
|
"eval_samples_per_second": 16.425,
|
|
"eval_steps_per_second": 4.161,
|
|
"eval_token_acc": 0.9095295893898576,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.45246431457042824,
|
|
"grad_norm": 0.860744833946228,
|
|
"learning_rate": 9.448868647065644e-06,
|
|
"loss": 0.25905332565307615,
|
|
"memory(GiB)": 29.07,
|
|
"step": 105,
|
|
"token_acc": 0.9042013222435488,
|
|
"train_speed(iter/s)": 0.136744
|
|
},
|
|
{
|
|
"epoch": 0.47401023431187717,
|
|
"grad_norm": 0.7342467904090881,
|
|
"learning_rate": 9.396237563379761e-06,
|
|
"loss": 0.23780291080474852,
|
|
"memory(GiB)": 29.07,
|
|
"step": 110,
|
|
"token_acc": 0.9195750302763405,
|
|
"train_speed(iter/s)": 0.137488
|
|
},
|
|
{
|
|
"epoch": 0.49555615405332615,
|
|
"grad_norm": 0.8625522255897522,
|
|
"learning_rate": 9.341367325540921e-06,
|
|
"loss": 0.23777966499328612,
|
|
"memory(GiB)": 29.07,
|
|
"step": 115,
|
|
"token_acc": 0.9236284378674467,
|
|
"train_speed(iter/s)": 0.138134
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"grad_norm": 0.7858663201332092,
|
|
"learning_rate": 9.284285880837947e-06,
|
|
"loss": 0.23805389404296876,
|
|
"memory(GiB)": 29.07,
|
|
"step": 120,
|
|
"token_acc": 0.9197558545092177,
|
|
"train_speed(iter/s)": 0.138839
|
|
},
|
|
{
|
|
"epoch": 0.5171020737947751,
|
|
"eval_loss": 0.26037347316741943,
|
|
"eval_runtime": 9.1916,
|
|
"eval_samples_per_second": 16.319,
|
|
"eval_steps_per_second": 4.134,
|
|
"eval_token_acc": 0.9109209864116758,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.5386479935362241,
|
|
"grad_norm": 0.69041508436203,
|
|
"learning_rate": 9.225022302802951e-06,
|
|
"loss": 0.23021812438964845,
|
|
"memory(GiB)": 29.07,
|
|
"step": 125,
|
|
"token_acc": 0.9175774697625143,
|
|
"train_speed(iter/s)": 0.136057
|
|
},
|
|
{
|
|
"epoch": 0.5601939132776731,
|
|
"grad_norm": 0.8726988434791565,
|
|
"learning_rate": 9.163606776403182e-06,
|
|
"loss": 0.24287738800048828,
|
|
"memory(GiB)": 29.07,
|
|
"step": 130,
|
|
"token_acc": 0.9174245368571058,
|
|
"train_speed(iter/s)": 0.136884
|
|
},
|
|
{
|
|
"epoch": 0.581739833019122,
|
|
"grad_norm": 0.782781720161438,
|
|
"learning_rate": 9.100070582666796e-06,
|
|
"loss": 0.2355494499206543,
|
|
"memory(GiB)": 29.07,
|
|
"step": 135,
|
|
"token_acc": 0.9218754381537254,
|
|
"train_speed(iter/s)": 0.137519
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"grad_norm": 0.8618416786193848,
|
|
"learning_rate": 9.034446082750352e-06,
|
|
"loss": 0.26097152233123777,
|
|
"memory(GiB)": 29.07,
|
|
"step": 140,
|
|
"token_acc": 0.9200457337339467,
|
|
"train_speed(iter/s)": 0.137992
|
|
},
|
|
{
|
|
"epoch": 0.603285752760571,
|
|
"eval_loss": 0.2570641040802002,
|
|
"eval_runtime": 9.1422,
|
|
"eval_samples_per_second": 16.407,
|
|
"eval_steps_per_second": 4.157,
|
|
"eval_token_acc": 0.9119867373220047,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.6248316725020199,
|
|
"grad_norm": 0.7354313135147095,
|
|
"learning_rate": 8.966766701456177e-06,
|
|
"loss": 0.2450572967529297,
|
|
"memory(GiB)": 29.07,
|
|
"step": 145,
|
|
"token_acc": 0.9104867095521301,
|
|
"train_speed(iter/s)": 0.135924
|
|
},
|
|
{
|
|
"epoch": 0.6463775922434689,
|
|
"grad_norm": 0.7455778121948242,
|
|
"learning_rate": 8.897066910207958e-06,
|
|
"loss": 0.24456512928009033,
|
|
"memory(GiB)": 29.07,
|
|
"step": 150,
|
|
"token_acc": 0.9091944119638131,
|
|
"train_speed(iter/s)": 0.136598
|
|
},
|
|
{
|
|
"epoch": 0.6679235119849178,
|
|
"grad_norm": 0.6894455552101135,
|
|
"learning_rate": 8.825382209493284e-06,
|
|
"loss": 0.22335777282714844,
|
|
"memory(GiB)": 29.07,
|
|
"step": 155,
|
|
"token_acc": 0.9254186825455899,
|
|
"train_speed(iter/s)": 0.137109
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"grad_norm": 0.8090242743492126,
|
|
"learning_rate": 8.751749110782013e-06,
|
|
"loss": 0.22939071655273438,
|
|
"memory(GiB)": 29.07,
|
|
"step": 160,
|
|
"token_acc": 0.917912822144448,
|
|
"train_speed(iter/s)": 0.137664
|
|
},
|
|
{
|
|
"epoch": 0.6894694317263668,
|
|
"eval_loss": 0.2559308111667633,
|
|
"eval_runtime": 9.1265,
|
|
"eval_samples_per_second": 16.436,
|
|
"eval_steps_per_second": 4.164,
|
|
"eval_token_acc": 0.9122235708576334,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.7110153514678158,
|
|
"grad_norm": 0.6877977252006531,
|
|
"learning_rate": 8.676205117929752e-06,
|
|
"loss": 0.2350329875946045,
|
|
"memory(GiB)": 29.07,
|
|
"step": 165,
|
|
"token_acc": 0.9184605472599106,
|
|
"train_speed(iter/s)": 0.13599
|
|
},
|
|
{
|
|
"epoch": 0.7325612712092647,
|
|
"grad_norm": 0.7360235452651978,
|
|
"learning_rate": 8.598788708075844e-06,
|
|
"loss": 0.23023662567138672,
|
|
"memory(GiB)": 29.07,
|
|
"step": 170,
|
|
"token_acc": 0.9058568071626164,
|
|
"train_speed(iter/s)": 0.136418
|
|
},
|
|
{
|
|
"epoch": 0.7541071909507137,
|
|
"grad_norm": 0.8410085439682007,
|
|
"learning_rate": 8.51953931204566e-06,
|
|
"loss": 0.23642911911010742,
|
|
"memory(GiB)": 29.07,
|
|
"step": 175,
|
|
"token_acc": 0.9052932094269817,
|
|
"train_speed(iter/s)": 0.137007
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"grad_norm": 0.7055257558822632,
|
|
"learning_rate": 8.438497294267117e-06,
|
|
"loss": 0.21671390533447266,
|
|
"memory(GiB)": 29.07,
|
|
"step": 180,
|
|
"token_acc": 0.9202059202059202,
|
|
"train_speed(iter/s)": 0.137438
|
|
},
|
|
{
|
|
"epoch": 0.7756531106921627,
|
|
"eval_loss": 0.251537024974823,
|
|
"eval_runtime": 9.1508,
|
|
"eval_samples_per_second": 16.392,
|
|
"eval_steps_per_second": 4.153,
|
|
"eval_token_acc": 0.9133337280558927,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.7971990304336116,
|
|
"grad_norm": 0.7327454686164856,
|
|
"learning_rate": 8.3557039322117e-06,
|
|
"loss": 0.23624320030212403,
|
|
"memory(GiB)": 29.07,
|
|
"step": 185,
|
|
"token_acc": 0.9212678936605317,
|
|
"train_speed(iter/s)": 0.135745
|
|
},
|
|
{
|
|
"epoch": 0.8187449501750605,
|
|
"grad_norm": 0.7439467906951904,
|
|
"learning_rate": 8.27120139537044e-06,
|
|
"loss": 0.226143741607666,
|
|
"memory(GiB)": 29.07,
|
|
"step": 190,
|
|
"token_acc": 0.9260711777101381,
|
|
"train_speed(iter/s)": 0.136185
|
|
},
|
|
{
|
|
"epoch": 0.8402908699165096,
|
|
"grad_norm": 0.7658076286315918,
|
|
"learning_rate": 8.18503272377554e-06,
|
|
"loss": 0.23765263557434083,
|
|
"memory(GiB)": 29.07,
|
|
"step": 195,
|
|
"token_acc": 0.9223176899233237,
|
|
"train_speed(iter/s)": 0.136807
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"grad_norm": 0.739122211933136,
|
|
"learning_rate": 8.097241806078616e-06,
|
|
"loss": 0.2310422420501709,
|
|
"memory(GiB)": 29.07,
|
|
"step": 200,
|
|
"token_acc": 0.9296302733841532,
|
|
"train_speed(iter/s)": 0.13714
|
|
},
|
|
{
|
|
"epoch": 0.8618367896579585,
|
|
"eval_loss": 0.24832946062088013,
|
|
"eval_runtime": 9.1389,
|
|
"eval_samples_per_second": 16.413,
|
|
"eval_steps_per_second": 4.158,
|
|
"eval_token_acc": 0.9134077385357766,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.8833827093994074,
|
|
"grad_norm": 0.7651655077934265,
|
|
"learning_rate": 8.007873357196716e-06,
|
|
"loss": 0.24373788833618165,
|
|
"memory(GiB)": 29.07,
|
|
"step": 205,
|
|
"token_acc": 0.9188066080938974,
|
|
"train_speed(iter/s)": 0.135947
|
|
},
|
|
{
|
|
"epoch": 0.9049286291408565,
|
|
"grad_norm": 0.7528461217880249,
|
|
"learning_rate": 7.916972895537471e-06,
|
|
"loss": 0.23199746608734131,
|
|
"memory(GiB)": 29.07,
|
|
"step": 210,
|
|
"token_acc": 0.922004793261512,
|
|
"train_speed(iter/s)": 0.136394
|
|
},
|
|
{
|
|
"epoch": 0.9264745488823054,
|
|
"grad_norm": 0.8405919075012207,
|
|
"learning_rate": 7.824586719815019e-06,
|
|
"loss": 0.2173825740814209,
|
|
"memory(GiB)": 29.07,
|
|
"step": 215,
|
|
"token_acc": 0.9274689356403538,
|
|
"train_speed(iter/s)": 0.136811
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"grad_norm": 0.7239152193069458,
|
|
"learning_rate": 7.730761885468486e-06,
|
|
"loss": 0.22583813667297364,
|
|
"memory(GiB)": 29.07,
|
|
"step": 220,
|
|
"token_acc": 0.9232335730673059,
|
|
"train_speed(iter/s)": 0.137394
|
|
},
|
|
{
|
|
"epoch": 0.9480204686237543,
|
|
"eval_loss": 0.2480185180902481,
|
|
"eval_runtime": 9.1514,
|
|
"eval_samples_per_second": 16.391,
|
|
"eval_steps_per_second": 4.152,
|
|
"eval_token_acc": 0.9135261553035909,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.9695663883652034,
|
|
"grad_norm": 0.7927827835083008,
|
|
"learning_rate": 7.635546180695039e-06,
|
|
"loss": 0.24525394439697265,
|
|
"memory(GiB)": 31.51,
|
|
"step": 225,
|
|
"token_acc": 0.9190034762456547,
|
|
"train_speed(iter/s)": 0.136277
|
|
},
|
|
{
|
|
"epoch": 0.9911123081066523,
|
|
"grad_norm": 0.7584970593452454,
|
|
"learning_rate": 7.538988102109728e-06,
|
|
"loss": 0.24703338146209716,
|
|
"memory(GiB)": 31.51,
|
|
"step": 230,
|
|
"token_acc": 0.9166930084197822,
|
|
"train_speed(iter/s)": 0.136884
|
|
},
|
|
{
|
|
"epoch": 1.0086183678965797,
|
|
"grad_norm": 0.6898382306098938,
|
|
"learning_rate": 7.441136830044495e-06,
|
|
"loss": 0.19301869869232177,
|
|
"memory(GiB)": 31.51,
|
|
"step": 235,
|
|
"token_acc": 0.9391968052558693,
|
|
"train_speed(iter/s)": 0.137591
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"grad_norm": 0.8064629435539246,
|
|
"learning_rate": 7.342042203498952e-06,
|
|
"loss": 0.16187149286270142,
|
|
"memory(GiB)": 31.51,
|
|
"step": 240,
|
|
"token_acc": 0.9393010954707055,
|
|
"train_speed(iter/s)": 0.137991
|
|
},
|
|
{
|
|
"epoch": 1.0301642876380286,
|
|
"eval_loss": 0.2533319890499115,
|
|
"eval_runtime": 9.1306,
|
|
"eval_samples_per_second": 16.428,
|
|
"eval_steps_per_second": 4.162,
|
|
"eval_token_acc": 0.914014624470825,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.0517102073794775,
|
|
"grad_norm": 0.8042640089988708,
|
|
"learning_rate": 7.241754694755674e-06,
|
|
"loss": 0.16961312294006348,
|
|
"memory(GiB)": 31.51,
|
|
"step": 245,
|
|
"token_acc": 0.9325969259837942,
|
|
"train_speed(iter/s)": 0.13689
|
|
},
|
|
{
|
|
"epoch": 1.0732561271209264,
|
|
"grad_norm": 0.8370431661605835,
|
|
"learning_rate": 7.140325383672938e-06,
|
|
"loss": 0.1677647829055786,
|
|
"memory(GiB)": 31.51,
|
|
"step": 250,
|
|
"token_acc": 0.9420376456528234,
|
|
"train_speed(iter/s)": 0.13733
|
|
},
|
|
{
|
|
"epoch": 1.0948020468623754,
|
|
"grad_norm": 0.7853599190711975,
|
|
"learning_rate": 7.037805931668006e-06,
|
|
"loss": 0.16614892482757568,
|
|
"memory(GiB)": 31.51,
|
|
"step": 255,
|
|
"token_acc": 0.9391651270517043,
|
|
"train_speed(iter/s)": 0.137609
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"grad_norm": 0.6807184815406799,
|
|
"learning_rate": 6.934248555404197e-06,
|
|
"loss": 0.1581436276435852,
|
|
"memory(GiB)": 31.51,
|
|
"step": 260,
|
|
"token_acc": 0.9458589779605179,
|
|
"train_speed(iter/s)": 0.137947
|
|
},
|
|
{
|
|
"epoch": 1.1163479666038243,
|
|
"eval_loss": 0.2524171471595764,
|
|
"eval_runtime": 9.156,
|
|
"eval_samples_per_second": 16.383,
|
|
"eval_steps_per_second": 4.15,
|
|
"eval_token_acc": 0.9150063649012701,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.1378938863452734,
|
|
"grad_norm": 0.7507938146591187,
|
|
"learning_rate": 6.8297060001951545e-06,
|
|
"loss": 0.16150083541870117,
|
|
"memory(GiB)": 31.51,
|
|
"step": 265,
|
|
"token_acc": 0.9309385421629347,
|
|
"train_speed(iter/s)": 0.137099
|
|
},
|
|
{
|
|
"epoch": 1.1594398060867224,
|
|
"grad_norm": 0.7291717529296875,
|
|
"learning_rate": 6.724231513139853e-06,
|
|
"loss": 0.16564717292785644,
|
|
"memory(GiB)": 31.51,
|
|
"step": 270,
|
|
"token_acc": 0.9417148494231771,
|
|
"train_speed(iter/s)": 0.137339
|
|
},
|
|
{
|
|
"epoch": 1.1809857258281713,
|
|
"grad_norm": 0.7378965616226196,
|
|
"learning_rate": 6.617878816002032e-06,
|
|
"loss": 0.1618717670440674,
|
|
"memory(GiB)": 31.51,
|
|
"step": 275,
|
|
"token_acc": 0.9485524256651017,
|
|
"train_speed(iter/s)": 0.137622
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"grad_norm": 0.8035087585449219,
|
|
"learning_rate": 6.510702077847864e-06,
|
|
"loss": 0.1574448823928833,
|
|
"memory(GiB)": 31.51,
|
|
"step": 280,
|
|
"token_acc": 0.9411747078213965,
|
|
"train_speed(iter/s)": 0.138001
|
|
},
|
|
{
|
|
"epoch": 1.2025316455696202,
|
|
"eval_loss": 0.24995924532413483,
|
|
"eval_runtime": 9.1411,
|
|
"eval_samples_per_second": 16.409,
|
|
"eval_steps_per_second": 4.157,
|
|
"eval_token_acc": 0.9145474999259895,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.2240775653110691,
|
|
"grad_norm": 0.7014583349227905,
|
|
"learning_rate": 6.402755887455792e-06,
|
|
"loss": 0.1643718123435974,
|
|
"memory(GiB)": 31.51,
|
|
"step": 285,
|
|
"token_acc": 0.9339765241569784,
|
|
"train_speed(iter/s)": 0.137005
|
|
},
|
|
{
|
|
"epoch": 1.2456234850525183,
|
|
"grad_norm": 0.7766486406326294,
|
|
"learning_rate": 6.294095225512604e-06,
|
|
"loss": 0.16814930438995362,
|
|
"memory(GiB)": 31.51,
|
|
"step": 290,
|
|
"token_acc": 0.9360814298463542,
|
|
"train_speed(iter/s)": 0.137309
|
|
},
|
|
{
|
|
"epoch": 1.2671694047939672,
|
|
"grad_norm": 0.7775722146034241,
|
|
"learning_rate": 6.184775436609885e-06,
|
|
"loss": 0.1682277202606201,
|
|
"memory(GiB)": 31.51,
|
|
"step": 295,
|
|
"token_acc": 0.9411593528110813,
|
|
"train_speed(iter/s)": 0.137689
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"grad_norm": 0.7489521503448486,
|
|
"learning_rate": 6.074852201055121e-06,
|
|
"loss": 0.16042615175247193,
|
|
"memory(GiB)": 31.51,
|
|
"step": 300,
|
|
"token_acc": 0.9415187229598687,
|
|
"train_speed(iter/s)": 0.138077
|
|
},
|
|
{
|
|
"epoch": 1.2887153245354162,
|
|
"eval_loss": 0.25172922015190125,
|
|
"eval_runtime": 9.1377,
|
|
"eval_samples_per_second": 16.415,
|
|
"eval_steps_per_second": 4.159,
|
|
"eval_token_acc": 0.9150211669972468,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.310261244276865,
|
|
"grad_norm": 0.792445182800293,
|
|
"learning_rate": 5.964381506511823e-06,
|
|
"loss": 0.16529514789581298,
|
|
"memory(GiB)": 31.51,
|
|
"step": 305,
|
|
"token_acc": 0.9390098282355103,
|
|
"train_speed(iter/s)": 0.137303
|
|
},
|
|
{
|
|
"epoch": 1.331807164018314,
|
|
"grad_norm": 0.7657850980758667,
|
|
"learning_rate": 5.853419619483083e-06,
|
|
"loss": 0.16101518869400025,
|
|
"memory(GiB)": 31.51,
|
|
"step": 310,
|
|
"token_acc": 0.9423482091553342,
|
|
"train_speed(iter/s)": 0.137569
|
|
},
|
|
{
|
|
"epoch": 1.353353083759763,
|
|
"grad_norm": 0.7221185564994812,
|
|
"learning_rate": 5.742023056653131e-06,
|
|
"loss": 0.16527080535888672,
|
|
"memory(GiB)": 31.51,
|
|
"step": 315,
|
|
"token_acc": 0.9436092441929018,
|
|
"train_speed(iter/s)": 0.13783
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"grad_norm": 0.7651124596595764,
|
|
"learning_rate": 5.630248556101448e-06,
|
|
"loss": 0.16076445579528809,
|
|
"memory(GiB)": 31.51,
|
|
"step": 320,
|
|
"token_acc": 0.941539852342926,
|
|
"train_speed(iter/s)": 0.138138
|
|
},
|
|
{
|
|
"epoch": 1.3748990035012119,
|
|
"eval_loss": 0.2519991993904114,
|
|
"eval_runtime": 9.1625,
|
|
"eval_samples_per_second": 16.371,
|
|
"eval_steps_per_second": 4.147,
|
|
"eval_token_acc": 0.9146955208857575,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.3964449232426608,
|
|
"grad_norm": 0.6687130331993103,
|
|
"learning_rate": 5.51815304840412e-06,
|
|
"loss": 0.16071187257766723,
|
|
"memory(GiB)": 31.51,
|
|
"step": 325,
|
|
"token_acc": 0.93119978263401,
|
|
"train_speed(iter/s)": 0.137342
|
|
},
|
|
{
|
|
"epoch": 1.41799084298411,
|
|
"grad_norm": 0.7091411352157593,
|
|
"learning_rate": 5.405793627637157e-06,
|
|
"loss": 0.15800976753234863,
|
|
"memory(GiB)": 31.51,
|
|
"step": 330,
|
|
"token_acc": 0.9493767600253226,
|
|
"train_speed(iter/s)": 0.137567
|
|
},
|
|
{
|
|
"epoch": 1.4395367627255589,
|
|
"grad_norm": 0.7872418761253357,
|
|
"learning_rate": 5.293227522296517e-06,
|
|
"loss": 0.16303786039352416,
|
|
"memory(GiB)": 31.51,
|
|
"step": 335,
|
|
"token_acc": 0.9474813818783616,
|
|
"train_speed(iter/s)": 0.137773
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"grad_norm": 0.696894109249115,
|
|
"learning_rate": 5.180512066149682e-06,
|
|
"loss": 0.1651884913444519,
|
|
"memory(GiB)": 31.51,
|
|
"step": 340,
|
|
"token_acc": 0.9437182487584908,
|
|
"train_speed(iter/s)": 0.138053
|
|
},
|
|
{
|
|
"epoch": 1.4610826824670078,
|
|
"eval_loss": 0.2488545924425125,
|
|
"eval_runtime": 9.1671,
|
|
"eval_samples_per_second": 16.363,
|
|
"eval_steps_per_second": 4.145,
|
|
"eval_token_acc": 0.9151395837650611,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.4826286022084567,
|
|
"grad_norm": 0.7297951579093933,
|
|
"learning_rate": 5.06770466903361e-06,
|
|
"loss": 0.15690993070602416,
|
|
"memory(GiB)": 31.51,
|
|
"step": 345,
|
|
"token_acc": 0.9340376019152534,
|
|
"train_speed(iter/s)": 0.137212
|
|
},
|
|
{
|
|
"epoch": 1.5041745219499059,
|
|
"grad_norm": 0.7707265019416809,
|
|
"learning_rate": 4.954862787613937e-06,
|
|
"loss": 0.15354007482528687,
|
|
"memory(GiB)": 31.51,
|
|
"step": 350,
|
|
"token_acc": 0.9396699063799393,
|
|
"train_speed(iter/s)": 0.13744
|
|
},
|
|
{
|
|
"epoch": 1.5257204416913548,
|
|
"grad_norm": 0.7526496052742004,
|
|
"learning_rate": 4.842043896120332e-06,
|
|
"loss": 0.16020708084106444,
|
|
"memory(GiB)": 31.51,
|
|
"step": 355,
|
|
"token_acc": 0.9479154768703598,
|
|
"train_speed(iter/s)": 0.137715
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"grad_norm": 0.7758511900901794,
|
|
"learning_rate": 4.729305457072913e-06,
|
|
"loss": 0.16963763236999513,
|
|
"memory(GiB)": 31.51,
|
|
"step": 360,
|
|
"token_acc": 0.9411747908278363,
|
|
"train_speed(iter/s)": 0.138029
|
|
},
|
|
{
|
|
"epoch": 1.5472663614328037,
|
|
"eval_loss": 0.24962776899337769,
|
|
"eval_runtime": 9.1377,
|
|
"eval_samples_per_second": 16.415,
|
|
"eval_steps_per_second": 4.159,
|
|
"eval_token_acc": 0.9158204801799935,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.5688122811742526,
|
|
"grad_norm": 0.7562235593795776,
|
|
"learning_rate": 4.616704892014613e-06,
|
|
"loss": 0.1591555118560791,
|
|
"memory(GiB)": 31.51,
|
|
"step": 365,
|
|
"token_acc": 0.9352809509107749,
|
|
"train_speed(iter/s)": 0.137302
|
|
},
|
|
{
|
|
"epoch": 1.5903582009157016,
|
|
"grad_norm": 0.7587376236915588,
|
|
"learning_rate": 4.504299552264428e-06,
|
|
"loss": 0.15684648752212524,
|
|
"memory(GiB)": 31.51,
|
|
"step": 370,
|
|
"token_acc": 0.9416271651313239,
|
|
"train_speed(iter/s)": 0.137471
|
|
},
|
|
{
|
|
"epoch": 1.6119041206571505,
|
|
"grad_norm": 0.8137562870979309,
|
|
"learning_rate": 4.392146689706426e-06,
|
|
"loss": 0.1647357702255249,
|
|
"memory(GiB)": 31.51,
|
|
"step": 375,
|
|
"token_acc": 0.9458877409154104,
|
|
"train_speed(iter/s)": 0.137806
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"grad_norm": 0.7551019191741943,
|
|
"learning_rate": 4.280303427629404e-06,
|
|
"loss": 0.15853278636932372,
|
|
"memory(GiB)": 31.51,
|
|
"step": 380,
|
|
"token_acc": 0.9448746907604604,
|
|
"train_speed(iter/s)": 0.138095
|
|
},
|
|
{
|
|
"epoch": 1.6334500403985994,
|
|
"eval_loss": 0.25000789761543274,
|
|
"eval_runtime": 9.1322,
|
|
"eval_samples_per_second": 16.425,
|
|
"eval_steps_per_second": 4.161,
|
|
"eval_token_acc": 0.9155096361644809,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.6549959601400483,
|
|
"grad_norm": 0.7977014780044556,
|
|
"learning_rate": 4.168826731632052e-06,
|
|
"loss": 0.15717003345489503,
|
|
"memory(GiB)": 31.51,
|
|
"step": 385,
|
|
"token_acc": 0.9312666413084824,
|
|
"train_speed(iter/s)": 0.137411
|
|
},
|
|
{
|
|
"epoch": 1.6765418798814973,
|
|
"grad_norm": 0.7832633852958679,
|
|
"learning_rate": 4.057773380608411e-06,
|
|
"loss": 0.1634294271469116,
|
|
"memory(GiB)": 31.51,
|
|
"step": 390,
|
|
"token_acc": 0.9465973781793492,
|
|
"train_speed(iter/s)": 0.137768
|
|
},
|
|
{
|
|
"epoch": 1.6980877996229464,
|
|
"grad_norm": 0.8293562531471252,
|
|
"learning_rate": 3.947199937828447e-06,
|
|
"loss": 0.16505708694458007,
|
|
"memory(GiB)": 31.51,
|
|
"step": 395,
|
|
"token_acc": 0.9415725074644342,
|
|
"train_speed(iter/s)": 0.138088
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"grad_norm": 0.7886548042297363,
|
|
"learning_rate": 3.8371627221284495e-06,
|
|
"loss": 0.1561971426010132,
|
|
"memory(GiB)": 31.51,
|
|
"step": 400,
|
|
"token_acc": 0.9470925236321971,
|
|
"train_speed(iter/s)": 0.138285
|
|
},
|
|
{
|
|
"epoch": 1.7196337193643954,
|
|
"eval_loss": 0.24771690368652344,
|
|
"eval_runtime": 9.1277,
|
|
"eval_samples_per_second": 16.434,
|
|
"eval_steps_per_second": 4.163,
|
|
"eval_token_acc": 0.915642855028272,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.7411796391058443,
|
|
"grad_norm": 0.7254658937454224,
|
|
"learning_rate": 3.727717779225912e-06,
|
|
"loss": 0.1556318521499634,
|
|
"memory(GiB)": 31.51,
|
|
"step": 405,
|
|
"token_acc": 0.9359159282917783,
|
|
"train_speed(iter/s)": 0.137626
|
|
},
|
|
{
|
|
"epoch": 1.7627255588472934,
|
|
"grad_norm": 0.7896953225135803,
|
|
"learning_rate": 3.6189208531735354e-06,
|
|
"loss": 0.16613179445266724,
|
|
"memory(GiB)": 31.51,
|
|
"step": 410,
|
|
"token_acc": 0.9352341759749168,
|
|
"train_speed(iter/s)": 0.137957
|
|
},
|
|
{
|
|
"epoch": 1.7842714785887424,
|
|
"grad_norm": 0.6848239898681641,
|
|
"learning_rate": 3.510827357966876e-06,
|
|
"loss": 0.1551806092262268,
|
|
"memory(GiB)": 31.51,
|
|
"step": 415,
|
|
"token_acc": 0.9506668360218469,
|
|
"train_speed(iter/s)": 0.138211
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"grad_norm": 0.7046887874603271,
|
|
"learning_rate": 3.403492349320101e-06,
|
|
"loss": 0.15121963024139404,
|
|
"memory(GiB)": 31.51,
|
|
"step": 420,
|
|
"token_acc": 0.9443215339233039,
|
|
"train_speed(iter/s)": 0.138347
|
|
},
|
|
{
|
|
"epoch": 1.8058173983301913,
|
|
"eval_loss": 0.24637845158576965,
|
|
"eval_runtime": 9.144,
|
|
"eval_samples_per_second": 16.404,
|
|
"eval_steps_per_second": 4.156,
|
|
"eval_token_acc": 0.9157020634121792,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.8273633180716402,
|
|
"grad_norm": 0.7264747619628906,
|
|
"learning_rate": 3.29697049662423e-06,
|
|
"loss": 0.1486160159111023,
|
|
"memory(GiB)": 31.51,
|
|
"step": 425,
|
|
"token_acc": 0.9378997513712539,
|
|
"train_speed(iter/s)": 0.137609
|
|
},
|
|
{
|
|
"epoch": 1.8489092378130891,
|
|
"grad_norm": 0.6881827116012573,
|
|
"learning_rate": 3.191316055102146e-06,
|
|
"loss": 0.14999903440475465,
|
|
"memory(GiB)": 31.51,
|
|
"step": 430,
|
|
"token_acc": 0.9458710676835081,
|
|
"train_speed(iter/s)": 0.13782
|
|
},
|
|
{
|
|
"epoch": 1.870455157554538,
|
|
"grad_norm": 0.7096033096313477,
|
|
"learning_rate": 3.0865828381745515e-06,
|
|
"loss": 0.15066919326782227,
|
|
"memory(GiB)": 31.51,
|
|
"step": 435,
|
|
"token_acc": 0.9486315094650982,
|
|
"train_speed(iter/s)": 0.137979
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"grad_norm": 0.7479064464569092,
|
|
"learning_rate": 2.982824190050958e-06,
|
|
"loss": 0.165749990940094,
|
|
"memory(GiB)": 31.51,
|
|
"step": 440,
|
|
"token_acc": 0.9481958622195534,
|
|
"train_speed(iter/s)": 0.138187
|
|
},
|
|
{
|
|
"epoch": 1.892001077295987,
|
|
"eval_loss": 0.24523746967315674,
|
|
"eval_runtime": 9.1328,
|
|
"eval_samples_per_second": 16.424,
|
|
"eval_steps_per_second": 4.161,
|
|
"eval_token_acc": 0.9161757304834365,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.913546997037436,
|
|
"grad_norm": 0.7331624031066895,
|
|
"learning_rate": 2.8800929585596506e-06,
|
|
"loss": 0.15496289730072021,
|
|
"memory(GiB)": 31.51,
|
|
"step": 445,
|
|
"token_acc": 0.9352829677768751,
|
|
"train_speed(iter/s)": 0.137542
|
|
},
|
|
{
|
|
"epoch": 1.9350929167788848,
|
|
"grad_norm": 0.6734929084777832,
|
|
"learning_rate": 2.778441468230483e-06,
|
|
"loss": 0.1523799180984497,
|
|
"memory(GiB)": 31.51,
|
|
"step": 450,
|
|
"token_acc": 0.9479633806554332,
|
|
"train_speed(iter/s)": 0.1377
|
|
},
|
|
{
|
|
"epoch": 1.956638836520334,
|
|
"grad_norm": 0.7542054057121277,
|
|
"learning_rate": 2.6779214936442056e-06,
|
|
"loss": 0.16172744035720826,
|
|
"memory(GiB)": 31.51,
|
|
"step": 455,
|
|
"token_acc": 0.935499950154521,
|
|
"train_speed(iter/s)": 0.137884
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"grad_norm": 0.7129687070846558,
|
|
"learning_rate": 2.5785842330619038e-06,
|
|
"loss": 0.15356701612472534,
|
|
"memory(GiB)": 31.51,
|
|
"step": 460,
|
|
"token_acc": 0.941601546088564,
|
|
"train_speed(iter/s)": 0.13804
|
|
},
|
|
{
|
|
"epoch": 1.978184756261783,
|
|
"eval_loss": 0.24467714130878448,
|
|
"eval_runtime": 9.1316,
|
|
"eval_samples_per_second": 16.426,
|
|
"eval_steps_per_second": 4.161,
|
|
"eval_token_acc": 0.9170934604339974,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.9997306760032318,
|
|
"grad_norm": 0.7861402630805969,
|
|
"learning_rate": 2.480480282347961e-06,
|
|
"loss": 0.15792056322097778,
|
|
"memory(GiB)": 31.51,
|
|
"step": 465,
|
|
"token_acc": 0.9337420552337027,
|
|
"train_speed(iter/s)": 0.137533
|
|
},
|
|
{
|
|
"epoch": 2.0172367357931593,
|
|
"grad_norm": 0.6826748847961426,
|
|
"learning_rate": 2.383659609199873e-06,
|
|
"loss": 0.14240689277648927,
|
|
"memory(GiB)": 31.51,
|
|
"step": 470,
|
|
"token_acc": 0.9578195371952166,
|
|
"train_speed(iter/s)": 0.137988
|
|
},
|
|
{
|
|
"epoch": 2.0387826555346082,
|
|
"grad_norm": 0.6501537561416626,
|
|
"learning_rate": 2.2881715276979705e-06,
|
|
"loss": 0.10814023017883301,
|
|
"memory(GiB)": 31.51,
|
|
"step": 475,
|
|
"token_acc": 0.9586633663366336,
|
|
"train_speed(iter/s)": 0.138103
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"grad_norm": 0.6575304269790649,
|
|
"learning_rate": 2.1940646731880887e-06,
|
|
"loss": 0.1118842363357544,
|
|
"memory(GiB)": 31.51,
|
|
"step": 480,
|
|
"token_acc": 0.9698543524895563,
|
|
"train_speed(iter/s)": 0.138265
|
|
},
|
|
{
|
|
"epoch": 2.060328575276057,
|
|
"eval_loss": 0.26591211557388306,
|
|
"eval_runtime": 9.1643,
|
|
"eval_samples_per_second": 16.368,
|
|
"eval_steps_per_second": 4.147,
|
|
"eval_token_acc": 0.9162201367713668,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.081874495017506,
|
|
"grad_norm": 0.765779972076416,
|
|
"learning_rate": 2.101386977509907e-06,
|
|
"loss": 0.12155743837356567,
|
|
"memory(GiB)": 31.51,
|
|
"step": 485,
|
|
"token_acc": 0.946289860026969,
|
|
"train_speed(iter/s)": 0.137773
|
|
},
|
|
{
|
|
"epoch": 2.103420414758955,
|
|
"grad_norm": 0.7146125435829163,
|
|
"learning_rate": 2.010185644583641e-06,
|
|
"loss": 0.11463183164596558,
|
|
"memory(GiB)": 31.51,
|
|
"step": 490,
|
|
"token_acc": 0.9624759934997784,
|
|
"train_speed(iter/s)": 0.137939
|
|
},
|
|
{
|
|
"epoch": 2.124966334500404,
|
|
"grad_norm": 0.6777431964874268,
|
|
"learning_rate": 1.920507126367448e-06,
|
|
"loss": 0.10685477256774903,
|
|
"memory(GiB)": 31.51,
|
|
"step": 495,
|
|
"token_acc": 0.9612281857095818,
|
|
"train_speed(iter/s)": 0.138102
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"grad_norm": 0.7272450923919678,
|
|
"learning_rate": 1.8323970991978823e-06,
|
|
"loss": 0.10419889688491821,
|
|
"memory(GiB)": 31.51,
|
|
"step": 500,
|
|
"token_acc": 0.9610325296357052,
|
|
"train_speed(iter/s)": 0.138212
|
|
},
|
|
{
|
|
"epoch": 2.146512254241853,
|
|
"eval_loss": 0.26627564430236816,
|
|
"eval_runtime": 9.1679,
|
|
"eval_samples_per_second": 16.361,
|
|
"eval_steps_per_second": 4.145,
|
|
"eval_token_acc": 0.9159833032357382,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.168058173983302,
|
|
"grad_norm": 0.7035279273986816,
|
|
"learning_rate": 1.7459004405253544e-06,
|
|
"loss": 0.1082868218421936,
|
|
"memory(GiB)": 31.51,
|
|
"step": 505,
|
|
"token_acc": 0.9475257941268758,
|
|
"train_speed(iter/s)": 0.137652
|
|
},
|
|
{
|
|
"epoch": 2.1896040937247507,
|
|
"grad_norm": 0.601047158241272,
|
|
"learning_rate": 1.6610612060565235e-06,
|
|
"loss": 0.09674398303031921,
|
|
"memory(GiB)": 31.51,
|
|
"step": 510,
|
|
"token_acc": 0.9646692233940556,
|
|
"train_speed(iter/s)": 0.137835
|
|
},
|
|
{
|
|
"epoch": 2.2111500134661997,
|
|
"grad_norm": 0.7340168356895447,
|
|
"learning_rate": 1.5779226073152071e-06,
|
|
"loss": 0.1145021677017212,
|
|
"memory(GiB)": 31.51,
|
|
"step": 515,
|
|
"token_acc": 0.9572068592615479,
|
|
"train_speed(iter/s)": 0.138109
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"grad_norm": 0.7059099078178406,
|
|
"learning_rate": 1.4965269896332884e-06,
|
|
"loss": 0.1138340711593628,
|
|
"memory(GiB)": 31.51,
|
|
"step": 520,
|
|
"token_acc": 0.9643348939686037,
|
|
"train_speed(iter/s)": 0.138301
|
|
},
|
|
{
|
|
"epoch": 2.2326959332076486,
|
|
"eval_loss": 0.26528117060661316,
|
|
"eval_runtime": 9.1705,
|
|
"eval_samples_per_second": 16.357,
|
|
"eval_steps_per_second": 4.144,
|
|
"eval_token_acc": 0.9159536990437847,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.254241852949098,
|
|
"grad_norm": 0.7410480976104736,
|
|
"learning_rate": 1.4169158105827768e-06,
|
|
"loss": 0.11105086803436279,
|
|
"memory(GiB)": 31.51,
|
|
"step": 525,
|
|
"token_acc": 0.9514605435256503,
|
|
"train_speed(iter/s)": 0.137827
|
|
},
|
|
{
|
|
"epoch": 2.275787772690547,
|
|
"grad_norm": 0.6698100566864014,
|
|
"learning_rate": 1.3391296188600594e-06,
|
|
"loss": 0.10843292474746705,
|
|
"memory(GiB)": 31.51,
|
|
"step": 530,
|
|
"token_acc": 0.9629225092250923,
|
|
"train_speed(iter/s)": 0.137941
|
|
},
|
|
{
|
|
"epoch": 2.297333692431996,
|
|
"grad_norm": 0.6693587303161621,
|
|
"learning_rate": 1.2632080336330532e-06,
|
|
"loss": 0.11362366676330567,
|
|
"memory(GiB)": 31.51,
|
|
"step": 535,
|
|
"token_acc": 0.9621976353183642,
|
|
"train_speed(iter/s)": 0.138116
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"grad_norm": 0.6858277320861816,
|
|
"learning_rate": 1.1891897243618184e-06,
|
|
"loss": 0.10754673480987549,
|
|
"memory(GiB)": 31.51,
|
|
"step": 540,
|
|
"token_acc": 0.9638930030070464,
|
|
"train_speed(iter/s)": 0.138279
|
|
},
|
|
{
|
|
"epoch": 2.3188796121734447,
|
|
"eval_loss": 0.2651301622390747,
|
|
"eval_runtime": 9.1447,
|
|
"eval_samples_per_second": 16.403,
|
|
"eval_steps_per_second": 4.155,
|
|
"eval_token_acc": 0.91620533467539,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.3404255319148937,
|
|
"grad_norm": 0.7329075932502747,
|
|
"learning_rate": 1.1171123911028692e-06,
|
|
"loss": 0.10752699375152588,
|
|
"memory(GiB)": 31.51,
|
|
"step": 545,
|
|
"token_acc": 0.946654961925566,
|
|
"train_speed(iter/s)": 0.137811
|
|
},
|
|
{
|
|
"epoch": 2.3619714516563426,
|
|
"grad_norm": 0.7588092684745789,
|
|
"learning_rate": 1.047012745307255e-06,
|
|
"loss": 0.10413261651992797,
|
|
"memory(GiB)": 31.51,
|
|
"step": 550,
|
|
"token_acc": 0.9638513608403786,
|
|
"train_speed(iter/s)": 0.137959
|
|
},
|
|
{
|
|
"epoch": 2.3835173713977915,
|
|
"grad_norm": 0.6776463389396667,
|
|
"learning_rate": 9.789264911221546e-07,
|
|
"loss": 0.11203373670578003,
|
|
"memory(GiB)": 31.51,
|
|
"step": 555,
|
|
"token_acc": 0.9539418840061927,
|
|
"train_speed(iter/s)": 0.138182
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"grad_norm": 0.6233177185058594,
|
|
"learning_rate": 9.128883072055411e-07,
|
|
"loss": 0.10640518665313721,
|
|
"memory(GiB)": 31.51,
|
|
"step": 560,
|
|
"token_acc": 0.9605239362389232,
|
|
"train_speed(iter/s)": 0.138441
|
|
},
|
|
{
|
|
"epoch": 2.4050632911392404,
|
|
"eval_loss": 0.26431551575660706,
|
|
"eval_runtime": 9.1559,
|
|
"eval_samples_per_second": 16.383,
|
|
"eval_steps_per_second": 4.15,
|
|
"eval_token_acc": 0.9158056780840167,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.4266092108806894,
|
|
"grad_norm": 0.6891351938247681,
|
|
"learning_rate": 8.489318290631454e-07,
|
|
"loss": 0.11017493009567261,
|
|
"memory(GiB)": 31.51,
|
|
"step": 565,
|
|
"token_acc": 0.9501483222252186,
|
|
"train_speed(iter/s)": 0.137996
|
|
},
|
|
{
|
|
"epoch": 2.4481551306221383,
|
|
"grad_norm": 0.685417890548706,
|
|
"learning_rate": 7.870896319167548e-07,
|
|
"loss": 0.10502817630767822,
|
|
"memory(GiB)": 31.51,
|
|
"step": 570,
|
|
"token_acc": 0.9675666865866247,
|
|
"train_speed(iter/s)": 0.138123
|
|
},
|
|
{
|
|
"epoch": 2.4697010503635872,
|
|
"grad_norm": 0.8273110389709473,
|
|
"learning_rate": 7.273932141125256e-07,
|
|
"loss": 0.11376097202301025,
|
|
"memory(GiB)": 31.51,
|
|
"step": 575,
|
|
"token_acc": 0.9588286984389538,
|
|
"train_speed(iter/s)": 0.138286
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"grad_norm": 0.7995973825454712,
|
|
"learning_rate": 6.698729810778065e-07,
|
|
"loss": 0.1191399335861206,
|
|
"memory(GiB)": 31.51,
|
|
"step": 580,
|
|
"token_acc": 0.9582757592998997,
|
|
"train_speed(iter/s)": 0.138394
|
|
},
|
|
{
|
|
"epoch": 2.4912469701050366,
|
|
"eval_loss": 0.26525548100471497,
|
|
"eval_runtime": 9.1367,
|
|
"eval_samples_per_second": 16.417,
|
|
"eval_steps_per_second": 4.159,
|
|
"eval_token_acc": 0.9161461262914828,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.5127928898464855,
|
|
"grad_norm": 0.6976614594459534,
|
|
"learning_rate": 6.145582298346153e-07,
|
|
"loss": 0.10850718021392822,
|
|
"memory(GiB)": 31.51,
|
|
"step": 585,
|
|
"token_acc": 0.9468696569536905,
|
|
"train_speed(iter/s)": 0.137863
|
|
},
|
|
{
|
|
"epoch": 2.5343388095879344,
|
|
"grad_norm": 0.7127689123153687,
|
|
"learning_rate": 5.614771340776559e-07,
|
|
"loss": 0.1049992561340332,
|
|
"memory(GiB)": 31.51,
|
|
"step": 590,
|
|
"token_acc": 0.9636775106082037,
|
|
"train_speed(iter/s)": 0.137998
|
|
},
|
|
{
|
|
"epoch": 2.5558847293293834,
|
|
"grad_norm": 0.7365370392799377,
|
|
"learning_rate": 5.106567298245008e-07,
|
|
"loss": 0.11682652235031128,
|
|
"memory(GiB)": 31.51,
|
|
"step": 595,
|
|
"token_acc": 0.9585528403681371,
|
|
"train_speed(iter/s)": 0.138108
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"grad_norm": 0.6995398998260498,
|
|
"learning_rate": 4.6212290164521554e-07,
|
|
"loss": 0.10941903591156006,
|
|
"memory(GiB)": 31.51,
|
|
"step": 600,
|
|
"token_acc": 0.9588730068630993,
|
|
"train_speed(iter/s)": 0.138228
|
|
},
|
|
{
|
|
"epoch": 2.5774306490708323,
|
|
"eval_loss": 0.2648448944091797,
|
|
"eval_runtime": 9.1555,
|
|
"eval_samples_per_second": 16.384,
|
|
"eval_steps_per_second": 4.15,
|
|
"eval_token_acc": 0.91620533467539,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.5989765688122812,
|
|
"grad_norm": 0.6558308601379395,
|
|
"learning_rate": 4.159003694784647e-07,
|
|
"loss": 0.09994454979896546,
|
|
"memory(GiB)": 31.51,
|
|
"step": 605,
|
|
"token_acc": 0.9488222044057573,
|
|
"train_speed(iter/s)": 0.137823
|
|
},
|
|
{
|
|
"epoch": 2.62052248855373,
|
|
"grad_norm": 0.5623044371604919,
|
|
"learning_rate": 3.7201267604080436e-07,
|
|
"loss": 0.10503623485565186,
|
|
"memory(GiB)": 31.51,
|
|
"step": 610,
|
|
"token_acc": 0.9605492530908896,
|
|
"train_speed(iter/s)": 0.137935
|
|
},
|
|
{
|
|
"epoch": 2.642068408295179,
|
|
"grad_norm": 0.7411386966705322,
|
|
"learning_rate": 3.3048217483556743e-07,
|
|
"loss": 0.10335917472839355,
|
|
"memory(GiB)": 31.51,
|
|
"step": 615,
|
|
"token_acc": 0.9596229517824632,
|
|
"train_speed(iter/s)": 0.138049
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"grad_norm": 0.7703331112861633,
|
|
"learning_rate": 2.9133001876746004e-07,
|
|
"loss": 0.11330341100692749,
|
|
"memory(GiB)": 31.51,
|
|
"step": 620,
|
|
"token_acc": 0.9627358888545153,
|
|
"train_speed(iter/s)": 0.138195
|
|
},
|
|
{
|
|
"epoch": 2.663614328036628,
|
|
"eval_loss": 0.2645653188228607,
|
|
"eval_runtime": 9.1574,
|
|
"eval_samples_per_second": 16.38,
|
|
"eval_steps_per_second": 4.15,
|
|
"eval_token_acc": 0.9161461262914828,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.685160247778077,
|
|
"grad_norm": 0.7208820581436157,
|
|
"learning_rate": 2.545761493686666e-07,
|
|
"loss": 0.10512195825576783,
|
|
"memory(GiB)": 31.51,
|
|
"step": 625,
|
|
"token_acc": 0.9453959214438257,
|
|
"train_speed(iter/s)": 0.137691
|
|
},
|
|
{
|
|
"epoch": 2.706706167519526,
|
|
"grad_norm": 0.792917013168335,
|
|
"learning_rate": 2.2023928664194229e-07,
|
|
"loss": 0.10448248386383056,
|
|
"memory(GiB)": 31.51,
|
|
"step": 630,
|
|
"token_acc": 0.9634454263743831,
|
|
"train_speed(iter/s)": 0.137865
|
|
},
|
|
{
|
|
"epoch": 2.728252087260975,
|
|
"grad_norm": 0.7074964046478271,
|
|
"learning_rate": 1.8833691952587829e-07,
|
|
"loss": 0.10274065732955932,
|
|
"memory(GiB)": 31.51,
|
|
"step": 635,
|
|
"token_acc": 0.9614498168320434,
|
|
"train_speed(iter/s)": 0.137973
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"grad_norm": 0.695501446723938,
|
|
"learning_rate": 1.5888529698718347e-07,
|
|
"loss": 0.111275053024292,
|
|
"memory(GiB)": 31.51,
|
|
"step": 640,
|
|
"token_acc": 0.9627026215729437,
|
|
"train_speed(iter/s)": 0.13808
|
|
},
|
|
{
|
|
"epoch": 2.7497980070024237,
|
|
"eval_loss": 0.26474642753601074,
|
|
"eval_runtime": 9.1444,
|
|
"eval_samples_per_second": 16.403,
|
|
"eval_steps_per_second": 4.156,
|
|
"eval_token_acc": 0.9162349388673436,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.7713439267438726,
|
|
"grad_norm": 0.7010114192962646,
|
|
"learning_rate": 1.3189941974453502e-07,
|
|
"loss": 0.11862779855728149,
|
|
"memory(GiB)": 31.51,
|
|
"step": 645,
|
|
"token_acc": 0.9463942439720986,
|
|
"train_speed(iter/s)": 0.13766
|
|
},
|
|
{
|
|
"epoch": 2.7928898464853216,
|
|
"grad_norm": 0.7053817510604858,
|
|
"learning_rate": 1.0739303262819301e-07,
|
|
"loss": 0.10773177146911621,
|
|
"memory(GiB)": 31.51,
|
|
"step": 650,
|
|
"token_acc": 0.9672267425750056,
|
|
"train_speed(iter/s)": 0.137765
|
|
},
|
|
{
|
|
"epoch": 2.814435766226771,
|
|
"grad_norm": 0.7381494641304016,
|
|
"learning_rate": 8.537861757929422e-08,
|
|
"loss": 0.10787509679794312,
|
|
"memory(GiB)": 31.51,
|
|
"step": 655,
|
|
"token_acc": 0.9632690990902866,
|
|
"train_speed(iter/s)": 0.137917
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"grad_norm": 0.7262890934944153,
|
|
"learning_rate": 6.58673872923693e-08,
|
|
"loss": 0.11206209659576416,
|
|
"memory(GiB)": 31.51,
|
|
"step": 660,
|
|
"token_acc": 0.965990990990991,
|
|
"train_speed(iter/s)": 0.138079
|
|
},
|
|
{
|
|
"epoch": 2.83598168596822,
|
|
"eval_loss": 0.2648203372955322,
|
|
"eval_runtime": 9.1399,
|
|
"eval_samples_per_second": 16.412,
|
|
"eval_steps_per_second": 4.158,
|
|
"eval_token_acc": 0.916279345155274,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.857527605709669,
|
|
"grad_norm": 0.7241911888122559,
|
|
"learning_rate": 4.88692795043344e-08,
|
|
"loss": 0.10918653011322021,
|
|
"memory(GiB)": 31.51,
|
|
"step": 665,
|
|
"token_acc": 0.9504457917261055,
|
|
"train_speed(iter/s)": 0.137668
|
|
},
|
|
{
|
|
"epoch": 2.8790735254511177,
|
|
"grad_norm": 0.7945267558097839,
|
|
"learning_rate": 3.439295193286174e-08,
|
|
"loss": 0.11153676509857177,
|
|
"memory(GiB)": 31.51,
|
|
"step": 670,
|
|
"token_acc": 0.9583095218657305,
|
|
"train_speed(iter/s)": 0.13784
|
|
},
|
|
{
|
|
"epoch": 2.9006194451925666,
|
|
"grad_norm": 0.7374799847602844,
|
|
"learning_rate": 2.2445777866709208e-08,
|
|
"loss": 0.10855717658996582,
|
|
"memory(GiB)": 31.51,
|
|
"step": 675,
|
|
"token_acc": 0.9629715143294179,
|
|
"train_speed(iter/s)": 0.13794
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"grad_norm": 0.6748504042625427,
|
|
"learning_rate": 1.3033842410251074e-08,
|
|
"loss": 0.11381592750549316,
|
|
"memory(GiB)": 31.51,
|
|
"step": 680,
|
|
"token_acc": 0.9596367864459332,
|
|
"train_speed(iter/s)": 0.138055
|
|
},
|
|
{
|
|
"epoch": 2.9221653649340156,
|
|
"eval_loss": 0.26483333110809326,
|
|
"eval_runtime": 9.1492,
|
|
"eval_samples_per_second": 16.395,
|
|
"eval_steps_per_second": 4.153,
|
|
"eval_token_acc": 0.9160425116196453,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.9437112846754645,
|
|
"grad_norm": 0.7286980748176575,
|
|
"learning_rate": 6.16193938412557e-09,
|
|
"loss": 0.10510704517364503,
|
|
"memory(GiB)": 31.51,
|
|
"step": 685,
|
|
"token_acc": 0.9509096674461929,
|
|
"train_speed(iter/s)": 0.137639
|
|
},
|
|
{
|
|
"epoch": 2.9652572044169134,
|
|
"grad_norm": 0.6892649531364441,
|
|
"learning_rate": 1.8335688835802169e-09,
|
|
"loss": 0.105083167552948,
|
|
"memory(GiB)": 31.51,
|
|
"step": 690,
|
|
"token_acc": 0.9589703497799398,
|
|
"train_speed(iter/s)": 0.137777
|
|
},
|
|
{
|
|
"epoch": 2.9868031241583624,
|
|
"grad_norm": 0.7908564209938049,
|
|
"learning_rate": 5.093549575119205e-11,
|
|
"loss": 0.10409483909606934,
|
|
"memory(GiB)": 31.51,
|
|
"step": 695,
|
|
"token_acc": 0.9638513775207209,
|
|
"train_speed(iter/s)": 0.137894
|
|
},
|
|
{
|
|
"epoch": 2.9911123081066524,
|
|
"eval_loss": 0.26468953490257263,
|
|
"eval_runtime": 9.1426,
|
|
"eval_samples_per_second": 16.407,
|
|
"eval_steps_per_second": 4.156,
|
|
"eval_token_acc": 0.9161757304834365,
|
|
"step": 696
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 696,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 8.392166013990339e+17,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|