2600 lines
69 KiB
JSON
2600 lines
69 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 6.597039473684211,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 1421,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.024671052631578948,
|
||
|
|
"grad_norm": 5.377997875213623,
|
||
|
|
"learning_rate": 1.118881118881119e-06,
|
||
|
|
"loss": 0.5476,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.47340989112854004,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049342105263157895,
|
||
|
|
"grad_norm": 5.392073154449463,
|
||
|
|
"learning_rate": 2.517482517482518e-06,
|
||
|
|
"loss": 0.5429,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.48170506954193115,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07401315789473684,
|
||
|
|
"grad_norm": 4.068439960479736,
|
||
|
|
"learning_rate": 3.916083916083917e-06,
|
||
|
|
"loss": 0.5108,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.5030127763748169,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09868421052631579,
|
||
|
|
"grad_norm": 1.50242280960083,
|
||
|
|
"learning_rate": 5.314685314685315e-06,
|
||
|
|
"loss": 0.4949,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.4839881658554077,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12335526315789473,
|
||
|
|
"grad_norm": 0.9494503736495972,
|
||
|
|
"learning_rate": 6.713286713286714e-06,
|
||
|
|
"loss": 0.4425,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.4382355809211731,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14802631578947367,
|
||
|
|
"grad_norm": 0.8043321967124939,
|
||
|
|
"learning_rate": 8.111888111888112e-06,
|
||
|
|
"loss": 0.4344,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.3997962474822998,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17269736842105263,
|
||
|
|
"grad_norm": 0.7665994763374329,
|
||
|
|
"learning_rate": 9.510489510489511e-06,
|
||
|
|
"loss": 0.4256,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.4312514364719391,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19736842105263158,
|
||
|
|
"grad_norm": 0.5361006855964661,
|
||
|
|
"learning_rate": 1.0909090909090909e-05,
|
||
|
|
"loss": 0.3893,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.33488091826438904,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22203947368421054,
|
||
|
|
"grad_norm": 0.4154154062271118,
|
||
|
|
"learning_rate": 1.230769230769231e-05,
|
||
|
|
"loss": 0.3565,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2907222509384155,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24671052631578946,
|
||
|
|
"grad_norm": 0.38533732295036316,
|
||
|
|
"learning_rate": 1.3706293706293707e-05,
|
||
|
|
"loss": 0.3607,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.38802745938301086,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2713815789473684,
|
||
|
|
"grad_norm": 0.6358391642570496,
|
||
|
|
"learning_rate": 1.5104895104895105e-05,
|
||
|
|
"loss": 0.336,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.35459989309310913,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29605263157894735,
|
||
|
|
"grad_norm": 0.7225229740142822,
|
||
|
|
"learning_rate": 1.6503496503496507e-05,
|
||
|
|
"loss": 0.3261,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.32752037048339844,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3207236842105263,
|
||
|
|
"grad_norm": 0.4612922668457031,
|
||
|
|
"learning_rate": 1.7902097902097903e-05,
|
||
|
|
"loss": 0.3038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.29034003615379333,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34539473684210525,
|
||
|
|
"grad_norm": 0.41292667388916016,
|
||
|
|
"learning_rate": 1.9300699300699302e-05,
|
||
|
|
"loss": 0.3026,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.29919901490211487,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37006578947368424,
|
||
|
|
"grad_norm": 0.3574488162994385,
|
||
|
|
"learning_rate": 2.06993006993007e-05,
|
||
|
|
"loss": 0.2916,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2877832353115082,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39473684210526316,
|
||
|
|
"grad_norm": 0.4183422923088074,
|
||
|
|
"learning_rate": 2.2097902097902097e-05,
|
||
|
|
"loss": 0.2951,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2495700865983963,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4194078947368421,
|
||
|
|
"grad_norm": 0.27113303542137146,
|
||
|
|
"learning_rate": 2.3496503496503496e-05,
|
||
|
|
"loss": 0.2694,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2706011235713959,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4440789473684211,
|
||
|
|
"grad_norm": 0.32469940185546875,
|
||
|
|
"learning_rate": 2.48951048951049e-05,
|
||
|
|
"loss": 0.2666,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2613554000854492,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46875,
|
||
|
|
"grad_norm": 0.3434113562107086,
|
||
|
|
"learning_rate": 2.6293706293706294e-05,
|
||
|
|
"loss": 0.2516,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2635467052459717,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4934210526315789,
|
||
|
|
"grad_norm": 0.44404661655426025,
|
||
|
|
"learning_rate": 2.7692307692307694e-05,
|
||
|
|
"loss": 0.2534,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2347426861524582,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024671052631578948,
|
||
|
|
"grad_norm": 0.3727414608001709,
|
||
|
|
"learning_rate": 2.9090909090909093e-05,
|
||
|
|
"loss": 0.3065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.25368186831474304,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049342105263157895,
|
||
|
|
"grad_norm": 0.3068452477455139,
|
||
|
|
"learning_rate": 3.048951048951049e-05,
|
||
|
|
"loss": 0.2958,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2523610293865204,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07401315789473684,
|
||
|
|
"grad_norm": 0.29999276995658875,
|
||
|
|
"learning_rate": 3.188811188811189e-05,
|
||
|
|
"loss": 0.2856,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.27986133098602295,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09868421052631579,
|
||
|
|
"grad_norm": 0.28988873958587646,
|
||
|
|
"learning_rate": 3.328671328671329e-05,
|
||
|
|
"loss": 0.2982,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.30054572224617004,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12335526315789473,
|
||
|
|
"grad_norm": 0.2404654622077942,
|
||
|
|
"learning_rate": 3.468531468531469e-05,
|
||
|
|
"loss": 0.2746,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.27615606784820557,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14802631578947367,
|
||
|
|
"grad_norm": 0.24244281649589539,
|
||
|
|
"learning_rate": 3.608391608391609e-05,
|
||
|
|
"loss": 0.2767,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.25839051604270935,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17269736842105263,
|
||
|
|
"grad_norm": 0.25900793075561523,
|
||
|
|
"learning_rate": 3.748251748251749e-05,
|
||
|
|
"loss": 0.2763,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2887214124202728,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19736842105263158,
|
||
|
|
"grad_norm": 0.23659999668598175,
|
||
|
|
"learning_rate": 3.888111888111888e-05,
|
||
|
|
"loss": 0.2626,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23153053224086761,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22203947368421054,
|
||
|
|
"grad_norm": 0.22117386758327484,
|
||
|
|
"learning_rate": 3.999993957205587e-05,
|
||
|
|
"loss": 0.2494,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20298953354358673,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24671052631578946,
|
||
|
|
"grad_norm": 0.26892349123954773,
|
||
|
|
"learning_rate": 3.999782463235198e-05,
|
||
|
|
"loss": 0.2604,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.28457778692245483,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2713815789473684,
|
||
|
|
"grad_norm": 0.32646119594573975,
|
||
|
|
"learning_rate": 3.999268866058499e-05,
|
||
|
|
"loss": 0.246,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2623623311519623,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29605263157894735,
|
||
|
|
"grad_norm": 0.32743313908576965,
|
||
|
|
"learning_rate": 3.9984532432636075e-05,
|
||
|
|
"loss": 0.2436,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.25287550687789917,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3207236842105263,
|
||
|
|
"grad_norm": 0.2839692234992981,
|
||
|
|
"learning_rate": 3.997335718065055e-05,
|
||
|
|
"loss": 0.2348,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22406595945358276,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34539473684210525,
|
||
|
|
"grad_norm": 0.27507659792900085,
|
||
|
|
"learning_rate": 3.995916459285176e-05,
|
||
|
|
"loss": 0.2395,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.24255843460559845,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37006578947368424,
|
||
|
|
"grad_norm": 0.27469953894615173,
|
||
|
|
"learning_rate": 3.994195681328607e-05,
|
||
|
|
"loss": 0.235,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23514795303344727,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39473684210526316,
|
||
|
|
"grad_norm": 0.30293864011764526,
|
||
|
|
"learning_rate": 3.99217364414989e-05,
|
||
|
|
"loss": 0.2436,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20414546132087708,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4194078947368421,
|
||
|
|
"grad_norm": 0.22480283677577972,
|
||
|
|
"learning_rate": 3.989850653214208e-05,
|
||
|
|
"loss": 0.2274,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23089763522148132,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4440789473684211,
|
||
|
|
"grad_norm": 0.2698158621788025,
|
||
|
|
"learning_rate": 3.987227059451237e-05,
|
||
|
|
"loss": 0.2289,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.224493145942688,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46875,
|
||
|
|
"grad_norm": 0.30755510926246643,
|
||
|
|
"learning_rate": 3.984303259202129e-05,
|
||
|
|
"loss": 0.2179,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2319842278957367,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4934210526315789,
|
||
|
|
"grad_norm": 0.31990331411361694,
|
||
|
|
"learning_rate": 3.9810796941596414e-05,
|
||
|
|
"loss": 0.2229,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20815354585647583,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024671052631578948,
|
||
|
|
"grad_norm": 0.2590547502040863,
|
||
|
|
"learning_rate": 3.97755685130141e-05,
|
||
|
|
"loss": 0.2447,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2057332992553711,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049342105263157895,
|
||
|
|
"grad_norm": 0.2779129147529602,
|
||
|
|
"learning_rate": 3.973735262816381e-05,
|
||
|
|
"loss": 0.2418,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2053648829460144,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07401315789473684,
|
||
|
|
"grad_norm": 0.2121449112892151,
|
||
|
|
"learning_rate": 3.9696155060244166e-05,
|
||
|
|
"loss": 0.238,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23251962661743164,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09868421052631579,
|
||
|
|
"grad_norm": 0.2546617090702057,
|
||
|
|
"learning_rate": 3.9651982032890774e-05,
|
||
|
|
"loss": 0.2522,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.25815051794052124,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12335526315789473,
|
||
|
|
"grad_norm": 0.20591332018375397,
|
||
|
|
"learning_rate": 3.960484021923606e-05,
|
||
|
|
"loss": 0.2365,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23984409868717194,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14802631578947367,
|
||
|
|
"grad_norm": 0.22549138963222504,
|
||
|
|
"learning_rate": 3.9554736740901163e-05,
|
||
|
|
"loss": 0.2417,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22766338288784027,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17269736842105263,
|
||
|
|
"grad_norm": 0.22996200621128082,
|
||
|
|
"learning_rate": 3.950167916692008e-05,
|
||
|
|
"loss": 0.2442,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.25690600275993347,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19736842105263158,
|
||
|
|
"grad_norm": 0.23695601522922516,
|
||
|
|
"learning_rate": 3.9445675512596224e-05,
|
||
|
|
"loss": 0.2347,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20780086517333984,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22203947368421054,
|
||
|
|
"grad_norm": 0.1984836608171463,
|
||
|
|
"learning_rate": 3.938673423829159e-05,
|
||
|
|
"loss": 0.2254,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18308115005493164,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24671052631578946,
|
||
|
|
"grad_norm": 0.23254786431789398,
|
||
|
|
"learning_rate": 3.932486424814865e-05,
|
||
|
|
"loss": 0.2369,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2599466145038605,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2713815789473684,
|
||
|
|
"grad_norm": 0.2990603446960449,
|
||
|
|
"learning_rate": 3.92600748887452e-05,
|
||
|
|
"loss": 0.2234,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23988397419452667,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29605263157894735,
|
||
|
|
"grad_norm": 0.3279288411140442,
|
||
|
|
"learning_rate": 3.9192375947682436e-05,
|
||
|
|
"loss": 0.2225,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23210851848125458,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3207236842105263,
|
||
|
|
"grad_norm": 0.28320780396461487,
|
||
|
|
"learning_rate": 3.9121777652106325e-05,
|
||
|
|
"loss": 0.2157,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20393934845924377,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34539473684210525,
|
||
|
|
"grad_norm": 0.2532626986503601,
|
||
|
|
"learning_rate": 3.904829066716263e-05,
|
||
|
|
"loss": 0.2206,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22491160035133362,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37006578947368424,
|
||
|
|
"grad_norm": 0.24180759489536285,
|
||
|
|
"learning_rate": 3.8971926094385725e-05,
|
||
|
|
"loss": 0.217,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21774451434612274,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39473684210526316,
|
||
|
|
"grad_norm": 0.28742486238479614,
|
||
|
|
"learning_rate": 3.889269547002153e-05,
|
||
|
|
"loss": 0.2258,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18878665566444397,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4194078947368421,
|
||
|
|
"grad_norm": 0.2189558744430542,
|
||
|
|
"learning_rate": 3.881061076328475e-05,
|
||
|
|
"loss": 0.2121,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2159784585237503,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4440789473684211,
|
||
|
|
"grad_norm": 0.24447618424892426,
|
||
|
|
"learning_rate": 3.872568437455071e-05,
|
||
|
|
"loss": 0.2141,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20933416485786438,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46875,
|
||
|
|
"grad_norm": 0.2725847661495209,
|
||
|
|
"learning_rate": 3.863792913348202e-05,
|
||
|
|
"loss": 0.2045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21849879622459412,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4934210526315789,
|
||
|
|
"grad_norm": 0.2945365011692047,
|
||
|
|
"learning_rate": 3.854735829709049e-05,
|
||
|
|
"loss": 0.2099,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1962527334690094,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.024671052631579,
|
||
|
|
"grad_norm": 0.2862054705619812,
|
||
|
|
"learning_rate": 3.8453985547734364e-05,
|
||
|
|
"loss": 0.2265,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19139519333839417,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.049342105263158,
|
||
|
|
"grad_norm": 0.2504815459251404,
|
||
|
|
"learning_rate": 3.835782499105136e-05,
|
||
|
|
"loss": 0.2244,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19045579433441162,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0740131578947367,
|
||
|
|
"grad_norm": 0.22405609488487244,
|
||
|
|
"learning_rate": 3.825889115382777e-05,
|
||
|
|
"loss": 0.2215,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2161717265844345,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0986842105263157,
|
||
|
|
"grad_norm": 0.23503802716732025,
|
||
|
|
"learning_rate": 3.815719898180397e-05,
|
||
|
|
"loss": 0.2353,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.24189823865890503,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1233552631578947,
|
||
|
|
"grad_norm": 0.19855502247810364,
|
||
|
|
"learning_rate": 3.8052763837416496e-05,
|
||
|
|
"loss": 0.221,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22518810629844666,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1480263157894737,
|
||
|
|
"grad_norm": 0.22038273513317108,
|
||
|
|
"learning_rate": 3.794560149747736e-05,
|
||
|
|
"loss": 0.2268,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21357837319374084,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1726973684210527,
|
||
|
|
"grad_norm": 0.2181052416563034,
|
||
|
|
"learning_rate": 3.7835728150790626e-05,
|
||
|
|
"loss": 0.2297,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.24232840538024902,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1973684210526316,
|
||
|
|
"grad_norm": 0.21558205783367157,
|
||
|
|
"learning_rate": 3.7723160395706846e-05,
|
||
|
|
"loss": 0.2213,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19611139595508575,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2220394736842106,
|
||
|
|
"grad_norm": 0.19245308637619019,
|
||
|
|
"learning_rate": 3.760791523761553e-05,
|
||
|
|
"loss": 0.213,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17266516387462616,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2467105263157894,
|
||
|
|
"grad_norm": 0.23584093153476715,
|
||
|
|
"learning_rate": 3.749001008637621e-05,
|
||
|
|
"loss": 0.2247,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.24593010544776917,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2713815789473684,
|
||
|
|
"grad_norm": 0.2926468253135681,
|
||
|
|
"learning_rate": 3.736946275368834e-05,
|
||
|
|
"loss": 0.2116,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22750172019004822,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2960526315789473,
|
||
|
|
"grad_norm": 0.26042640209198,
|
||
|
|
"learning_rate": 3.724629145040056e-05,
|
||
|
|
"loss": 0.2112,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22006699442863464,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3207236842105263,
|
||
|
|
"grad_norm": 0.26961809396743774,
|
||
|
|
"learning_rate": 3.7120514783759555e-05,
|
||
|
|
"loss": 0.206,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19352692365646362,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3453947368421053,
|
||
|
|
"grad_norm": 0.25605323910713196,
|
||
|
|
"learning_rate": 3.699215175459917e-05,
|
||
|
|
"loss": 0.2105,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21544437110424042,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3700657894736843,
|
||
|
|
"grad_norm": 0.28638923168182373,
|
||
|
|
"learning_rate": 3.686122175446992e-05,
|
||
|
|
"loss": 0.2072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20685678720474243,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3947368421052633,
|
||
|
|
"grad_norm": 0.26736754179000854,
|
||
|
|
"learning_rate": 3.672774456270959e-05,
|
||
|
|
"loss": 0.215,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1788450926542282,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.419407894736842,
|
||
|
|
"grad_norm": 0.19807708263397217,
|
||
|
|
"learning_rate": 3.659174034345522e-05,
|
||
|
|
"loss": 0.2027,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20726953446865082,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.444078947368421,
|
||
|
|
"grad_norm": 0.24334271252155304,
|
||
|
|
"learning_rate": 3.645322964259689e-05,
|
||
|
|
"loss": 0.2047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1996752917766571,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.46875,
|
||
|
|
"grad_norm": 0.2998853623867035,
|
||
|
|
"learning_rate": 3.631223338467394e-05,
|
||
|
|
"loss": 0.1961,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20962709188461304,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.493421052631579,
|
||
|
|
"grad_norm": 0.23867939412593842,
|
||
|
|
"learning_rate": 3.616877286971396e-05,
|
||
|
|
"loss": 0.2018,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18793398141860962,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.024671052631579,
|
||
|
|
"grad_norm": 0.2809722423553467,
|
||
|
|
"learning_rate": 3.6022869770014964e-05,
|
||
|
|
"loss": 0.2166,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18363182246685028,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.049342105263158,
|
||
|
|
"grad_norm": 0.22357912361621857,
|
||
|
|
"learning_rate": 3.587454612687148e-05,
|
||
|
|
"loss": 0.2144,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18171587586402893,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0740131578947367,
|
||
|
|
"grad_norm": 0.20701156556606293,
|
||
|
|
"learning_rate": 3.5723824347244745e-05,
|
||
|
|
"loss": 0.2119,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2063273936510086,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0986842105263157,
|
||
|
|
"grad_norm": 0.23529016971588135,
|
||
|
|
"learning_rate": 3.557072720037779e-05,
|
||
|
|
"loss": 0.225,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23247161507606506,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1233552631578947,
|
||
|
|
"grad_norm": 0.2022576779127121,
|
||
|
|
"learning_rate": 3.541527781435568e-05,
|
||
|
|
"loss": 0.2115,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2152596414089203,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1480263157894737,
|
||
|
|
"grad_norm": 0.20009814202785492,
|
||
|
|
"learning_rate": 3.525749967261164e-05,
|
||
|
|
"loss": 0.2173,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20451240241527557,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1726973684210527,
|
||
|
|
"grad_norm": 0.2168785035610199,
|
||
|
|
"learning_rate": 3.509741661037945e-05,
|
||
|
|
"loss": 0.2202,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2329305112361908,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1973684210526316,
|
||
|
|
"grad_norm": 0.22165916860103607,
|
||
|
|
"learning_rate": 3.493505281109269e-05,
|
||
|
|
"loss": 0.2125,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18825937807559967,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2220394736842106,
|
||
|
|
"grad_norm": 0.1891016662120819,
|
||
|
|
"learning_rate": 3.477043280273139e-05,
|
||
|
|
"loss": 0.2048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16563668847084045,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2467105263157894,
|
||
|
|
"grad_norm": 0.21851275861263275,
|
||
|
|
"learning_rate": 3.460358145411669e-05,
|
||
|
|
"loss": 0.2163,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2363748550415039,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2713815789473684,
|
||
|
|
"grad_norm": 0.30250489711761475,
|
||
|
|
"learning_rate": 3.4434523971153876e-05,
|
||
|
|
"loss": 0.2038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21931317448616028,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2960526315789473,
|
||
|
|
"grad_norm": 0.25916534662246704,
|
||
|
|
"learning_rate": 3.426328589302463e-05,
|
||
|
|
"loss": 0.2034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21159030497074127,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3207236842105263,
|
||
|
|
"grad_norm": 0.28234443068504333,
|
||
|
|
"learning_rate": 3.408989308832887e-05,
|
||
|
|
"loss": 0.1997,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18619588017463684,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3453947368421053,
|
||
|
|
"grad_norm": 0.25823721289634705,
|
||
|
|
"learning_rate": 3.3914371751176806e-05,
|
||
|
|
"loss": 0.2034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20875269174575806,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3700657894736843,
|
||
|
|
"grad_norm": 0.25747162103652954,
|
||
|
|
"learning_rate": 3.3736748397231865e-05,
|
||
|
|
"loss": 0.2001,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20162513852119446,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3947368421052633,
|
||
|
|
"grad_norm": 0.2906784415245056,
|
||
|
|
"learning_rate": 3.3557049859705026e-05,
|
||
|
|
"loss": 0.208,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1721152365207672,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.419407894736842,
|
||
|
|
"grad_norm": 0.19228465855121613,
|
||
|
|
"learning_rate": 3.3375303285301175e-05,
|
||
|
|
"loss": 0.1964,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2005109190940857,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.444078947368421,
|
||
|
|
"grad_norm": 0.22863629460334778,
|
||
|
|
"learning_rate": 3.31915361301181e-05,
|
||
|
|
"loss": 0.1981,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19252851605415344,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.46875,
|
||
|
|
"grad_norm": 0.26857441663742065,
|
||
|
|
"learning_rate": 3.300577615549874e-05,
|
||
|
|
"loss": 0.1899,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2054862082004547,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.493421052631579,
|
||
|
|
"grad_norm": 0.23823519051074982,
|
||
|
|
"learning_rate": 3.281805142383738e-05,
|
||
|
|
"loss": 0.1957,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1814895123243332,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0246710526315788,
|
||
|
|
"grad_norm": 0.29881370067596436,
|
||
|
|
"learning_rate": 3.262839029434026e-05,
|
||
|
|
"loss": 0.2102,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1785247027873993,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.049342105263158,
|
||
|
|
"grad_norm": 0.24525128304958344,
|
||
|
|
"learning_rate": 3.243682141874147e-05,
|
||
|
|
"loss": 0.2074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17585539817810059,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0740131578947367,
|
||
|
|
"grad_norm": 0.20698322355747223,
|
||
|
|
"learning_rate": 3.2243373736974524e-05,
|
||
|
|
"loss": 0.2051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1997726708650589,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.098684210526316,
|
||
|
|
"grad_norm": 0.22929632663726807,
|
||
|
|
"learning_rate": 3.204807647280049e-05,
|
||
|
|
"loss": 0.2176,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22522705793380737,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1233552631578947,
|
||
|
|
"grad_norm": 0.18781138956546783,
|
||
|
|
"learning_rate": 3.185095912939324e-05,
|
||
|
|
"loss": 0.2046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20897895097732544,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1480263157894735,
|
||
|
|
"grad_norm": 0.2008383721113205,
|
||
|
|
"learning_rate": 3.165205148488242e-05,
|
||
|
|
"loss": 0.2105,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19787193834781647,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1726973684210527,
|
||
|
|
"grad_norm": 0.21053168177604675,
|
||
|
|
"learning_rate": 3.145138358785494e-05,
|
||
|
|
"loss": 0.2131,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2260877788066864,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1973684210526314,
|
||
|
|
"grad_norm": 0.2918646037578583,
|
||
|
|
"learning_rate": 3.124898575281562e-05,
|
||
|
|
"loss": 0.2057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1822304129600525,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2220394736842106,
|
||
|
|
"grad_norm": 0.18496881425380707,
|
||
|
|
"learning_rate": 3.1044888555607594e-05,
|
||
|
|
"loss": 0.1985,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16007639467716217,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2467105263157894,
|
||
|
|
"grad_norm": 0.2168571650981903,
|
||
|
|
"learning_rate": 3.0839122828793314e-05,
|
||
|
|
"loss": 0.2098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22958366572856903,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2713815789473686,
|
||
|
|
"grad_norm": 0.29614129662513733,
|
||
|
|
"learning_rate": 3.0631719656996707e-05,
|
||
|
|
"loss": 0.1979,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21339645981788635,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2960526315789473,
|
||
|
|
"grad_norm": 0.2810326814651489,
|
||
|
|
"learning_rate": 3.042271037220731e-05,
|
||
|
|
"loss": 0.1972,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20447814464569092,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3207236842105265,
|
||
|
|
"grad_norm": 0.2408124804496765,
|
||
|
|
"learning_rate": 3.0212126549046986e-05,
|
||
|
|
"loss": 0.1923,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17865577340126038,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3453947368421053,
|
||
|
|
"grad_norm": 0.2610262334346771,
|
||
|
|
"learning_rate": 3.0000000000000004e-05,
|
||
|
|
"loss": 0.1969,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20306739211082458,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.370065789473684,
|
||
|
|
"grad_norm": 0.4123166501522064,
|
||
|
|
"learning_rate": 2.978636277060722e-05,
|
||
|
|
"loss": 0.1942,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2014990746974945,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3947368421052633,
|
||
|
|
"grad_norm": 0.277736634016037,
|
||
|
|
"learning_rate": 2.9571247134624985e-05,
|
||
|
|
"loss": 0.2033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16646766662597656,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.419407894736842,
|
||
|
|
"grad_norm": 0.2018052637577057,
|
||
|
|
"learning_rate": 2.9354685589149637e-05,
|
||
|
|
"loss": 0.1911,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19513532519340515,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4440789473684212,
|
||
|
|
"grad_norm": 0.24421605467796326,
|
||
|
|
"learning_rate": 2.9136710849708225e-05,
|
||
|
|
"loss": 0.1925,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18745818734169006,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.46875,
|
||
|
|
"grad_norm": 0.26889941096305847,
|
||
|
|
"learning_rate": 2.8917355845316214e-05,
|
||
|
|
"loss": 0.1844,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20007860660552979,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4934210526315788,
|
||
|
|
"grad_norm": 0.22898580133914948,
|
||
|
|
"learning_rate": 2.869665371350299e-05,
|
||
|
|
"loss": 0.1907,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17558446526527405,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0246710526315788,
|
||
|
|
"grad_norm": 0.29629117250442505,
|
||
|
|
"learning_rate": 2.8474637795305842e-05,
|
||
|
|
"loss": 0.2053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1745256781578064,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.049342105263158,
|
||
|
|
"grad_norm": 0.22614240646362305,
|
||
|
|
"learning_rate": 2.825134163023318e-05,
|
||
|
|
"loss": 0.2023,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17126205563545227,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0740131578947367,
|
||
|
|
"grad_norm": 0.20102806389331818,
|
||
|
|
"learning_rate": 2.802679895119778e-05,
|
||
|
|
"loss": 0.2,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19442611932754517,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.098684210526316,
|
||
|
|
"grad_norm": 0.2268202155828476,
|
||
|
|
"learning_rate": 2.7801043679420856e-05,
|
||
|
|
"loss": 0.2119,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21963992714881897,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1233552631578947,
|
||
|
|
"grad_norm": 0.1838330328464508,
|
||
|
|
"learning_rate": 2.75741099193076e-05,
|
||
|
|
"loss": 0.1992,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20316563546657562,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1480263157894735,
|
||
|
|
"grad_norm": 0.20691487193107605,
|
||
|
|
"learning_rate": 2.734603195329514e-05,
|
||
|
|
"loss": 0.205,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1925206482410431,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1726973684210527,
|
||
|
|
"grad_norm": 0.21233633160591125,
|
||
|
|
"learning_rate": 2.711684423667353e-05,
|
||
|
|
"loss": 0.2073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.22030431032180786,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1973684210526314,
|
||
|
|
"grad_norm": 0.2087518721818924,
|
||
|
|
"learning_rate": 2.688658139238067e-05,
|
||
|
|
"loss": 0.2004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17837481200695038,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2220394736842106,
|
||
|
|
"grad_norm": 0.18232280015945435,
|
||
|
|
"learning_rate": 2.6655278205771877e-05,
|
||
|
|
"loss": 0.1934,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15586256980895996,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2467105263157894,
|
||
|
|
"grad_norm": 0.2109103947877884,
|
||
|
|
"learning_rate": 2.6422969619364965e-05,
|
||
|
|
"loss": 0.2045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.223219633102417,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2713815789473686,
|
||
|
|
"grad_norm": 0.27833494544029236,
|
||
|
|
"learning_rate": 2.6189690727561478e-05,
|
||
|
|
"loss": 0.1929,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20833665132522583,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2960526315789473,
|
||
|
|
"grad_norm": 0.2569701075553894,
|
||
|
|
"learning_rate": 2.5955476771345116e-05,
|
||
|
|
"loss": 0.1928,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2008177787065506,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3207236842105265,
|
||
|
|
"grad_norm": 0.24047020077705383,
|
||
|
|
"learning_rate": 2.5720363132957915e-05,
|
||
|
|
"loss": 0.1874,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1724005788564682,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3453947368421053,
|
||
|
|
"grad_norm": 0.2805600166320801,
|
||
|
|
"learning_rate": 2.5484385330555138e-05,
|
||
|
|
"loss": 0.1915,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19802047312259674,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.370065789473684,
|
||
|
|
"grad_norm": 0.3280099332332611,
|
||
|
|
"learning_rate": 2.5247579012839584e-05,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19793689250946045,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3947368421052633,
|
||
|
|
"grad_norm": 0.27804508805274963,
|
||
|
|
"learning_rate": 2.500997995367626e-05,
|
||
|
|
"loss": 0.2007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16160696744918823,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.419407894736842,
|
||
|
|
"grad_norm": 0.2003459334373474,
|
||
|
|
"learning_rate": 2.4771624046688043e-05,
|
||
|
|
"loss": 0.1865,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19053032994270325,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4440789473684212,
|
||
|
|
"grad_norm": 0.2516365945339203,
|
||
|
|
"learning_rate": 2.4532547299833337e-05,
|
||
|
|
"loss": 0.1876,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1822153627872467,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.46875,
|
||
|
|
"grad_norm": 0.2790171205997467,
|
||
|
|
"learning_rate": 2.4292785829966407e-05,
|
||
|
|
"loss": 0.1798,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19594642519950867,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4934210526315788,
|
||
|
|
"grad_norm": 0.2231920063495636,
|
||
|
|
"learning_rate": 2.405237585738126e-05,
|
||
|
|
"loss": 0.1859,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1701628416776657,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0246710526315788,
|
||
|
|
"grad_norm": 0.28956133127212524,
|
||
|
|
"learning_rate": 2.381135370033996e-05,
|
||
|
|
"loss": 0.2012,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1710219532251358,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.049342105263158,
|
||
|
|
"grad_norm": 0.23850497603416443,
|
||
|
|
"learning_rate": 2.356975576958606e-05,
|
||
|
|
"loss": 0.1977,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16769038140773773,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0740131578947367,
|
||
|
|
"grad_norm": 0.19639335572719574,
|
||
|
|
"learning_rate": 2.3327618562844116e-05,
|
||
|
|
"loss": 0.1954,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18999743461608887,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.098684210526316,
|
||
|
|
"grad_norm": 0.22467830777168274,
|
||
|
|
"learning_rate": 2.3084978659306048e-05,
|
||
|
|
"loss": 0.2069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21505983173847198,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1233552631578947,
|
||
|
|
"grad_norm": 0.19431814551353455,
|
||
|
|
"learning_rate": 2.2841872714105196e-05,
|
||
|
|
"loss": 0.1944,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19865034520626068,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1480263157894735,
|
||
|
|
"grad_norm": 0.2027537077665329,
|
||
|
|
"learning_rate": 2.25983374527789e-05,
|
||
|
|
"loss": 0.2002,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18814553320407867,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1726973684210527,
|
||
|
|
"grad_norm": 0.21179619431495667,
|
||
|
|
"learning_rate": 2.2354409665720427e-05,
|
||
|
|
"loss": 0.2024,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21532279253005981,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1973684210526314,
|
||
|
|
"grad_norm": 0.20413529872894287,
|
||
|
|
"learning_rate": 2.2110126202621162e-05,
|
||
|
|
"loss": 0.1957,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17406898736953735,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2220394736842106,
|
||
|
|
"grad_norm": 0.19533619284629822,
|
||
|
|
"learning_rate": 2.1865523966903758e-05,
|
||
|
|
"loss": 0.1889,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15199615061283112,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2467105263157894,
|
||
|
|
"grad_norm": 0.212092787027359,
|
||
|
|
"learning_rate": 2.16206399101472e-05,
|
||
|
|
"loss": 0.1998,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21798661351203918,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2713815789473686,
|
||
|
|
"grad_norm": 0.2671966552734375,
|
||
|
|
"learning_rate": 2.1375511026504653e-05,
|
||
|
|
"loss": 0.1885,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20311211049556732,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2960526315789473,
|
||
|
|
"grad_norm": 0.24363179504871368,
|
||
|
|
"learning_rate": 2.113017434711479e-05,
|
||
|
|
"loss": 0.1875,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1942652463912964,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3207236842105265,
|
||
|
|
"grad_norm": 0.22911496460437775,
|
||
|
|
"learning_rate": 2.088466693450758e-05,
|
||
|
|
"loss": 0.1823,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1669602394104004,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3453947368421053,
|
||
|
|
"grad_norm": 0.24698033928871155,
|
||
|
|
"learning_rate": 2.0639025877005308e-05,
|
||
|
|
"loss": 0.1863,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19326898455619812,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.370065789473684,
|
||
|
|
"grad_norm": 0.5782654881477356,
|
||
|
|
"learning_rate": 2.039328828311976e-05,
|
||
|
|
"loss": 0.1854,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20133379101753235,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3947368421052633,
|
||
|
|
"grad_norm": 0.29828956723213196,
|
||
|
|
"learning_rate": 2.014749127594625e-05,
|
||
|
|
"loss": 0.1983,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1581995040178299,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.419407894736842,
|
||
|
|
"grad_norm": 0.20889180898666382,
|
||
|
|
"learning_rate": 1.9901671987555568e-05,
|
||
|
|
"loss": 0.1828,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18696996569633484,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4440789473684212,
|
||
|
|
"grad_norm": 0.24528741836547852,
|
||
|
|
"learning_rate": 1.9655867553384472e-05,
|
||
|
|
"loss": 0.1834,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17848661541938782,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.46875,
|
||
|
|
"grad_norm": 0.24264982342720032,
|
||
|
|
"learning_rate": 1.9410115106625714e-05,
|
||
|
|
"loss": 0.1754,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18969368934631348,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4934210526315788,
|
||
|
|
"grad_norm": 0.2114223688840866,
|
||
|
|
"learning_rate": 1.9164451772618435e-05,
|
||
|
|
"loss": 0.1812,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16628679633140564,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0246710526315788,
|
||
|
|
"grad_norm": 0.308517187833786,
|
||
|
|
"learning_rate": 1.891891466323966e-05,
|
||
|
|
"loss": 0.1979,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16870887577533722,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.049342105263158,
|
||
|
|
"grad_norm": 0.24148114025592804,
|
||
|
|
"learning_rate": 1.8673540871297927e-05,
|
||
|
|
"loss": 0.1942,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16438522934913635,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0740131578947367,
|
||
|
|
"grad_norm": 0.1998043656349182,
|
||
|
|
"learning_rate": 1.842836746492971e-05,
|
||
|
|
"loss": 0.1916,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18565362691879272,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.098684210526316,
|
||
|
|
"grad_norm": 0.22963786125183105,
|
||
|
|
"learning_rate": 1.8183431481999658e-05,
|
||
|
|
"loss": 0.2026,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21126440167427063,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1233552631578947,
|
||
|
|
"grad_norm": 0.1848352700471878,
|
||
|
|
"learning_rate": 1.793876992450529e-05,
|
||
|
|
"loss": 0.1905,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1943422555923462,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1480263157894735,
|
||
|
|
"grad_norm": 0.20099779963493347,
|
||
|
|
"learning_rate": 1.769441975298726e-05,
|
||
|
|
"loss": 0.1958,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1838373839855194,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1726973684210527,
|
||
|
|
"grad_norm": 0.23807059228420258,
|
||
|
|
"learning_rate": 1.7450417880945705e-05,
|
||
|
|
"loss": 0.198,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21098263561725616,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1973684210526314,
|
||
|
|
"grad_norm": 0.2139764130115509,
|
||
|
|
"learning_rate": 1.720680116926388e-05,
|
||
|
|
"loss": 0.1914,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17009641230106354,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2220394736842106,
|
||
|
|
"grad_norm": 0.18776723742485046,
|
||
|
|
"learning_rate": 1.6963606420639602e-05,
|
||
|
|
"loss": 0.185,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14875054359436035,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2467105263157894,
|
||
|
|
"grad_norm": 0.21373049914836884,
|
||
|
|
"learning_rate": 1.6720870374025578e-05,
|
||
|
|
"loss": 0.1957,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.212870791554451,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2713815789473686,
|
||
|
|
"grad_norm": 0.2705714702606201,
|
||
|
|
"learning_rate": 1.6478629699079278e-05,
|
||
|
|
"loss": 0.1846,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19945034384727478,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2960526315789473,
|
||
|
|
"grad_norm": 0.25683969259262085,
|
||
|
|
"learning_rate": 1.6236920990623374e-05,
|
||
|
|
"loss": 0.1833,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19059477746486664,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3207236842105265,
|
||
|
|
"grad_norm": 0.2478659451007843,
|
||
|
|
"learning_rate": 1.5995780763117382e-05,
|
||
|
|
"loss": 0.1781,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16162365674972534,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3453947368421053,
|
||
|
|
"grad_norm": 0.24454790353775024,
|
||
|
|
"learning_rate": 1.5755245445141544e-05,
|
||
|
|
"loss": 0.182,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1889064908027649,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.370065789473684,
|
||
|
|
"grad_norm": 0.32232385873794556,
|
||
|
|
"learning_rate": 1.5515351373893573e-05,
|
||
|
|
"loss": 0.1804,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19105729460716248,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3947368421052633,
|
||
|
|
"grad_norm": 0.2774730324745178,
|
||
|
|
"learning_rate": 1.5276134789699344e-05,
|
||
|
|
"loss": 0.1936,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15349537134170532,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.419407894736842,
|
||
|
|
"grad_norm": 0.19822268187999725,
|
||
|
|
"learning_rate": 1.503763183053805e-05,
|
||
|
|
"loss": 0.1787,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18199126422405243,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4440789473684212,
|
||
|
|
"grad_norm": 0.2223716378211975,
|
||
|
|
"learning_rate": 1.4799878526582987e-05,
|
||
|
|
"loss": 0.1788,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17311102151870728,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.46875,
|
||
|
|
"grad_norm": 0.2528238594532013,
|
||
|
|
"learning_rate": 1.4562910794758488e-05,
|
||
|
|
"loss": 0.171,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1867274045944214,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4934210526315788,
|
||
|
|
"grad_norm": 0.21180278062820435,
|
||
|
|
"learning_rate": 1.4326764433314066e-05,
|
||
|
|
"loss": 0.1771,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16158296167850494,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.024671052631579,
|
||
|
|
"grad_norm": 0.2938304543495178,
|
||
|
|
"learning_rate": 1.4091475116416415e-05,
|
||
|
|
"loss": 0.195,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1667296290397644,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0493421052631575,
|
||
|
|
"grad_norm": 0.24434833228588104,
|
||
|
|
"learning_rate": 1.3857078388760203e-05,
|
||
|
|
"loss": 0.1909,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16151553392410278,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.074013157894737,
|
||
|
|
"grad_norm": 0.19539859890937805,
|
||
|
|
"learning_rate": 1.3623609660198373e-05,
|
||
|
|
"loss": 0.188,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18197977542877197,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.098684210526316,
|
||
|
|
"grad_norm": 0.23001670837402344,
|
||
|
|
"learning_rate": 1.3391104200392905e-05,
|
||
|
|
"loss": 0.1987,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2076358199119568,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.123355263157895,
|
||
|
|
"grad_norm": 0.18925786018371582,
|
||
|
|
"learning_rate": 1.3159597133486628e-05,
|
||
|
|
"loss": 0.1865,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19061236083507538,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1480263157894735,
|
||
|
|
"grad_norm": 0.20448650419712067,
|
||
|
|
"learning_rate": 1.292912343279713e-05,
|
||
|
|
"loss": 0.1918,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18021315336227417,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.172697368421052,
|
||
|
|
"grad_norm": 0.2233712524175644,
|
||
|
|
"learning_rate": 1.2699717915533402e-05,
|
||
|
|
"loss": 0.1942,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20732900500297546,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.197368421052632,
|
||
|
|
"grad_norm": 0.20618166029453278,
|
||
|
|
"learning_rate": 1.2471415237536065e-05,
|
||
|
|
"loss": 0.1874,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16642165184020996,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.222039473684211,
|
||
|
|
"grad_norm": 0.19250109791755676,
|
||
|
|
"learning_rate": 1.2244249888041955e-05,
|
||
|
|
"loss": 0.1813,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1461605280637741,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.246710526315789,
|
||
|
|
"grad_norm": 0.20326243340969086,
|
||
|
|
"learning_rate": 1.2018256184473967e-05,
|
||
|
|
"loss": 0.1919,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20845885574817657,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.271381578947368,
|
||
|
|
"grad_norm": 0.28202906250953674,
|
||
|
|
"learning_rate": 1.1793468267256709e-05,
|
||
|
|
"loss": 0.1804,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19517214596271515,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.296052631578947,
|
||
|
|
"grad_norm": 0.23696176707744598,
|
||
|
|
"learning_rate": 1.156992009465904e-05,
|
||
|
|
"loss": 0.1788,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18528538942337036,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3207236842105265,
|
||
|
|
"grad_norm": 0.2820415794849396,
|
||
|
|
"learning_rate": 1.1347645437664032e-05,
|
||
|
|
"loss": 0.1738,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15721410512924194,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.345394736842105,
|
||
|
|
"grad_norm": 0.2320161908864975,
|
||
|
|
"learning_rate": 1.1126677874867245e-05,
|
||
|
|
"loss": 0.1776,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1845950335264206,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.370065789473684,
|
||
|
|
"grad_norm": 0.33977606892585754,
|
||
|
|
"learning_rate": 1.0907050787404105e-05,
|
||
|
|
"loss": 0.1757,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18589606881141663,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.394736842105263,
|
||
|
|
"grad_norm": 0.2689703404903412,
|
||
|
|
"learning_rate": 1.0688797353907052e-05,
|
||
|
|
"loss": 0.1882,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14957153797149658,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4194078947368425,
|
||
|
|
"grad_norm": 0.2037592977285385,
|
||
|
|
"learning_rate": 1.0471950545493328e-05,
|
||
|
|
"loss": 0.1753,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17800632119178772,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.444078947368421,
|
||
|
|
"grad_norm": 0.2365749329328537,
|
||
|
|
"learning_rate": 1.0256543120784074e-05,
|
||
|
|
"loss": 0.1746,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16894236207008362,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.46875,
|
||
|
|
"grad_norm": 0.2567404806613922,
|
||
|
|
"learning_rate": 1.0042607620955592e-05,
|
||
|
|
"loss": 0.1669,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18090274930000305,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.493421052631579,
|
||
|
|
"grad_norm": 0.2108602672815323,
|
||
|
|
"learning_rate": 9.830176364823349e-06,
|
||
|
|
"loss": 0.1729,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15751853585243225,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.024671052631579,
|
||
|
|
"grad_norm": 0.30531227588653564,
|
||
|
|
"learning_rate": 9.619281443959711e-06,
|
||
|
|
"loss": 0.1925,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16471046209335327,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0493421052631575,
|
||
|
|
"grad_norm": 0.2457159012556076,
|
||
|
|
"learning_rate": 9.409954717845861e-06,
|
||
|
|
"loss": 0.188,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15918654203414917,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.074013157894737,
|
||
|
|
"grad_norm": 0.2185594141483307,
|
||
|
|
"learning_rate": 9.202227809058912e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17858435213565826,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.098684210526316,
|
||
|
|
"grad_norm": 0.31074318289756775,
|
||
|
|
"learning_rate": 8.996132098494688e-06,
|
||
|
|
"loss": 0.1951,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20442083477973938,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.123355263157895,
|
||
|
|
"grad_norm": 0.1880805939435959,
|
||
|
|
"learning_rate": 8.791698720627138e-06,
|
||
|
|
"loss": 0.183,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18682318925857544,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1480263157894735,
|
||
|
|
"grad_norm": 0.1928797960281372,
|
||
|
|
"learning_rate": 8.58895855880484e-06,
|
||
|
|
"loss": 0.1881,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17656448483467102,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.172697368421052,
|
||
|
|
"grad_norm": 0.22544586658477783,
|
||
|
|
"learning_rate": 8.387942240585587e-06,
|
||
|
|
"loss": 0.1905,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20401433110237122,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.197368421052632,
|
||
|
|
"grad_norm": 0.20387957990169525,
|
||
|
|
"learning_rate": 8.188680133109485e-06,
|
||
|
|
"loss": 0.1838,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16293783485889435,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.222039473684211,
|
||
|
|
"grad_norm": 0.22418615221977234,
|
||
|
|
"learning_rate": 7.991202338511477e-06,
|
||
|
|
"loss": 0.1779,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14349329471588135,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.246710526315789,
|
||
|
|
"grad_norm": 0.21125830709934235,
|
||
|
|
"learning_rate": 7.795538689373859e-06,
|
||
|
|
"loss": 0.1881,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20435020327568054,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.271381578947368,
|
||
|
|
"grad_norm": 0.256409227848053,
|
||
|
|
"learning_rate": 7.601718744219555e-06,
|
||
|
|
"loss": 0.1768,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19160351157188416,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.296052631578947,
|
||
|
|
"grad_norm": 0.23899729549884796,
|
||
|
|
"learning_rate": 7.409771783046733e-06,
|
||
|
|
"loss": 0.1747,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18137173354625702,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3207236842105265,
|
||
|
|
"grad_norm": 0.2511383295059204,
|
||
|
|
"learning_rate": 7.219726802905573e-06,
|
||
|
|
"loss": 0.17,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15333089232444763,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.345394736842105,
|
||
|
|
"grad_norm": 0.24497000873088837,
|
||
|
|
"learning_rate": 7.0316125135176935e-06,
|
||
|
|
"loss": 0.1735,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17997264862060547,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.370065789473684,
|
||
|
|
"grad_norm": 0.2674829661846161,
|
||
|
|
"learning_rate": 6.845457332939083e-06,
|
||
|
|
"loss": 0.1717,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18120409548282623,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.394736842105263,
|
||
|
|
"grad_norm": 0.253813773393631,
|
||
|
|
"learning_rate": 6.661289383266984e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14498010277748108,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4194078947368425,
|
||
|
|
"grad_norm": 0.20171727240085602,
|
||
|
|
"learning_rate": 6.479136486391599e-06,
|
||
|
|
"loss": 0.1709,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1741921305656433,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.444078947368421,
|
||
|
|
"grad_norm": 0.265755832195282,
|
||
|
|
"learning_rate": 6.299026159793042e-06,
|
||
|
|
"loss": 0.1704,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16434627771377563,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.46875,
|
||
|
|
"grad_norm": 0.24092736840248108,
|
||
|
|
"learning_rate": 6.120985612384369e-06,
|
||
|
|
"loss": 0.1636,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17743858695030212,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.493421052631579,
|
||
|
|
"grad_norm": 0.22309978306293488,
|
||
|
|
"learning_rate": 5.945041740401147e-06,
|
||
|
|
"loss": 0.1691,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1535460352897644,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.024671052631579,
|
||
|
|
"grad_norm": 0.296979695558548,
|
||
|
|
"learning_rate": 5.7712211233383104e-06,
|
||
|
|
"loss": 0.1901,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16269783675670624,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0493421052631575,
|
||
|
|
"grad_norm": 0.2464631199836731,
|
||
|
|
"learning_rate": 5.5995500199348565e-06,
|
||
|
|
"loss": 0.1852,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15707314014434814,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.074013157894737,
|
||
|
|
"grad_norm": 0.19684258103370667,
|
||
|
|
"learning_rate": 5.430054364206965e-06,
|
||
|
|
"loss": 0.182,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17553085088729858,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.098684210526316,
|
||
|
|
"grad_norm": 0.2454098016023636,
|
||
|
|
"learning_rate": 5.262759761530214e-06,
|
||
|
|
"loss": 0.1921,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20213395357131958,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.123355263157895,
|
||
|
|
"grad_norm": 0.19603319466114044,
|
||
|
|
"learning_rate": 5.097691484771434e-06,
|
||
|
|
"loss": 0.1797,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1835222691297531,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.1480263157894735,
|
||
|
|
"grad_norm": 0.19945049285888672,
|
||
|
|
"learning_rate": 4.934874470470756e-06,
|
||
|
|
"loss": 0.1847,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17295250296592712,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.172697368421052,
|
||
|
|
"grad_norm": 0.23703482747077942,
|
||
|
|
"learning_rate": 4.77433331507454e-06,
|
||
|
|
"loss": 0.187,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2007107436656952,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.197368421052632,
|
||
|
|
"grad_norm": 0.19850043952465057,
|
||
|
|
"learning_rate": 4.6160922712195875e-06,
|
||
|
|
"loss": 0.1803,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15952754020690918,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.222039473684211,
|
||
|
|
"grad_norm": 0.19503454864025116,
|
||
|
|
"learning_rate": 4.460175244069395e-06,
|
||
|
|
"loss": 0.1748,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14143893122673035,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.246710526315789,
|
||
|
|
"grad_norm": 0.21830520033836365,
|
||
|
|
"learning_rate": 4.306605787702802e-06,
|
||
|
|
"loss": 0.1846,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20023545622825623,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.271381578947368,
|
||
|
|
"grad_norm": 0.2543354332447052,
|
||
|
|
"learning_rate": 4.155407101555764e-06,
|
||
|
|
"loss": 0.1731,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18734650313854218,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.296052631578947,
|
||
|
|
"grad_norm": 0.2577175796031952,
|
||
|
|
"learning_rate": 4.006602026916617e-06,
|
||
|
|
"loss": 0.1708,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17666634917259216,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.3207236842105265,
|
||
|
|
"grad_norm": 0.2335437387228012,
|
||
|
|
"learning_rate": 3.860213043475531e-06,
|
||
|
|
"loss": 0.1663,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1490708291530609,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.345394736842105,
|
||
|
|
"grad_norm": 0.23638561367988586,
|
||
|
|
"learning_rate": 3.7162622659285185e-06,
|
||
|
|
"loss": 0.1694,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1765957921743393,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.370065789473684,
|
||
|
|
"grad_norm": 0.3142367899417877,
|
||
|
|
"learning_rate": 3.5747714406366154e-06,
|
||
|
|
"loss": 0.1677,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17734456062316895,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.394736842105263,
|
||
|
|
"grad_norm": 0.2611706852912903,
|
||
|
|
"learning_rate": 3.435761942340705e-06,
|
||
|
|
"loss": 0.1807,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14173588156700134,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.4194078947368425,
|
||
|
|
"grad_norm": 0.20892195403575897,
|
||
|
|
"learning_rate": 3.2992547709324964e-06,
|
||
|
|
"loss": 0.168,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17047154903411865,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.444078947368421,
|
||
|
|
"grad_norm": 0.24029605090618134,
|
||
|
|
"learning_rate": 3.1652705482820665e-06,
|
||
|
|
"loss": 0.167,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16115230321884155,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.46875,
|
||
|
|
"grad_norm": 0.2471027821302414,
|
||
|
|
"learning_rate": 3.033829515122608e-06,
|
||
|
|
"loss": 0.1598,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1730382740497589,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.493421052631579,
|
||
|
|
"grad_norm": 0.22252005338668823,
|
||
|
|
"learning_rate": 2.904951527992652e-06,
|
||
|
|
"loss": 0.1656,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1501745879650116,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.024671052631579,
|
||
|
|
"grad_norm": 0.2984672784805298,
|
||
|
|
"learning_rate": 2.7786560562364285e-06,
|
||
|
|
"loss": 0.1878,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16104725003242493,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0493421052631575,
|
||
|
|
"grad_norm": 0.24322442710399628,
|
||
|
|
"learning_rate": 2.6549621790626166e-06,
|
||
|
|
"loss": 0.1825,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1545785665512085,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.074013157894737,
|
||
|
|
"grad_norm": 0.21504734456539154,
|
||
|
|
"learning_rate": 2.533888582662145e-06,
|
||
|
|
"loss": 0.1791,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17252680659294128,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.098684210526316,
|
||
|
|
"grad_norm": 0.2362941950559616,
|
||
|
|
"learning_rate": 2.41545355738525e-06,
|
||
|
|
"loss": 0.189,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19943520426750183,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.123355263157895,
|
||
|
|
"grad_norm": 0.1955908238887787,
|
||
|
|
"learning_rate": 2.299674994978436e-06,
|
||
|
|
"loss": 0.1765,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1803397238254547,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.1480263157894735,
|
||
|
|
"grad_norm": 0.19484035670757294,
|
||
|
|
"learning_rate": 2.1865703858815656e-06,
|
||
|
|
"loss": 0.1813,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1700981855392456,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.172697368421052,
|
||
|
|
"grad_norm": 0.2399033159017563,
|
||
|
|
"learning_rate": 2.076156816585639e-06,
|
||
|
|
"loss": 0.1836,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19748282432556152,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.197368421052632,
|
||
|
|
"grad_norm": 0.20060895383358002,
|
||
|
|
"learning_rate": 1.9684509670515585e-06,
|
||
|
|
"loss": 0.177,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1564170867204666,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.222039473684211,
|
||
|
|
"grad_norm": 0.2051486074924469,
|
||
|
|
"learning_rate": 1.86346910819033e-06,
|
||
|
|
"loss": 0.1718,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13873499631881714,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.246710526315789,
|
||
|
|
"grad_norm": 0.21883392333984375,
|
||
|
|
"learning_rate": 1.7612270994050362e-06,
|
||
|
|
"loss": 0.1812,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19669045507907867,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.271381578947368,
|
||
|
|
"grad_norm": 0.27726781368255615,
|
||
|
|
"learning_rate": 1.6617403861949898e-06,
|
||
|
|
"loss": 0.1702,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18430817127227783,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.296052631578947,
|
||
|
|
"grad_norm": 0.25286152958869934,
|
||
|
|
"learning_rate": 1.5650239978224346e-06,
|
||
|
|
"loss": 0.1672,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17335672676563263,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.3207236842105265,
|
||
|
|
"grad_norm": 0.24115750193595886,
|
||
|
|
"learning_rate": 1.4710925450420632e-06,
|
||
|
|
"loss": 0.1629,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14512015879154205,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.345394736842105,
|
||
|
|
"grad_norm": 0.36607709527015686,
|
||
|
|
"learning_rate": 1.379960217893841e-06,
|
||
|
|
"loss": 0.1659,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1726306825876236,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.370065789473684,
|
||
|
|
"grad_norm": 0.2740156054496765,
|
||
|
|
"learning_rate": 1.2916407835593093e-06,
|
||
|
|
"loss": 0.1641,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1730458289384842,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.394736842105263,
|
||
|
|
"grad_norm": 0.2575497627258301,
|
||
|
|
"learning_rate": 1.2061475842818337e-06,
|
||
|
|
"loss": 0.1772,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13783614337444305,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.4194078947368425,
|
||
|
|
"grad_norm": 0.19674454629421234,
|
||
|
|
"learning_rate": 1.1234935353509946e-06,
|
||
|
|
"loss": 0.1638,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16719526052474976,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.444078947368421,
|
||
|
|
"grad_norm": 0.25193727016448975,
|
||
|
|
"learning_rate": 1.0436911231515202e-06,
|
||
|
|
"loss": 0.1631,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15700435638427734,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.46875,
|
||
|
|
"grad_norm": 0.2387438416481018,
|
||
|
|
"learning_rate": 9.667524032769715e-07,
|
||
|
|
"loss": 0.1565,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16894632577896118,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.493421052631579,
|
||
|
|
"grad_norm": 0.21962200105190277,
|
||
|
|
"learning_rate": 8.926889987085441e-07,
|
||
|
|
"loss": 0.1619,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1462545394897461,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.024671052631579,
|
||
|
|
"grad_norm": 0.294238418340683,
|
||
|
|
"learning_rate": 8.215120980591984e-07,
|
||
|
|
"loss": 0.1856,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15981429815292358,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.0493421052631575,
|
||
|
|
"grad_norm": 0.2525791823863983,
|
||
|
|
"learning_rate": 7.532324538834279e-07,
|
||
|
|
"loss": 0.1802,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15260854363441467,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.074013157894737,
|
||
|
|
"grad_norm": 0.1967260092496872,
|
||
|
|
"learning_rate": 6.878603810528739e-07,
|
||
|
|
"loss": 0.1765,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1697266399860382,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.098684210526316,
|
||
|
|
"grad_norm": 0.24107760190963745,
|
||
|
|
"learning_rate": 6.25405755198103e-07,
|
||
|
|
"loss": 0.1861,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19656933844089508,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.123355263157895,
|
||
|
|
"grad_norm": 0.21563208103179932,
|
||
|
|
"learning_rate": 5.658780112166872e-07,
|
||
|
|
"loss": 0.1735,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17719779908657074,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.1480263157894735,
|
||
|
|
"grad_norm": 0.20149841904640198,
|
||
|
|
"learning_rate": 5.092861418479156e-07,
|
||
|
|
"loss": 0.1781,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16660000383853912,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.172697368421052,
|
||
|
|
"grad_norm": 0.24279265105724335,
|
||
|
|
"learning_rate": 4.556386963142645e-07,
|
||
|
|
"loss": 0.1805,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19446003437042236,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.197368421052632,
|
||
|
|
"grad_norm": 0.2047039270401001,
|
||
|
|
"learning_rate": 4.04943779029896e-07,
|
||
|
|
"loss": 0.1738,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1529483050107956,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.222039473684211,
|
||
|
|
"grad_norm": 0.20605064928531647,
|
||
|
|
"learning_rate": 3.5720904837632355e-07,
|
||
|
|
"loss": 0.1688,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13626405596733093,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.246710526315789,
|
||
|
|
"grad_norm": 0.20880380272865295,
|
||
|
|
"learning_rate": 3.124417155454884e-07,
|
||
|
|
"loss": 0.178,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19269785284996033,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.271381578947368,
|
||
|
|
"grad_norm": 0.2428048700094223,
|
||
|
|
"learning_rate": 2.7064854345037585e-07,
|
||
|
|
"loss": 0.167,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18093225359916687,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.296052631578947,
|
||
|
|
"grad_norm": 0.2846923768520355,
|
||
|
|
"learning_rate": 2.3183584570335205e-07,
|
||
|
|
"loss": 0.1636,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16989687085151672,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.3207236842105265,
|
||
|
|
"grad_norm": 0.2494208961725235,
|
||
|
|
"learning_rate": 1.9600948566238287e-07,
|
||
|
|
"loss": 0.1596,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1410118043422699,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.345394736842105,
|
||
|
|
"grad_norm": 0.25421878695487976,
|
||
|
|
"learning_rate": 1.631748755452667e-07,
|
||
|
|
"loss": 0.1629,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17008930444717407,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.370065789473684,
|
||
|
|
"grad_norm": 0.30556827783584595,
|
||
|
|
"learning_rate": 1.3333697561201732e-07,
|
||
|
|
"loss": 0.1603,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1695536971092224,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.394736842105263,
|
||
|
|
"grad_norm": 0.2617764472961426,
|
||
|
|
"learning_rate": 1.0650029341553902e-07,
|
||
|
|
"loss": 0.1734,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13440750539302826,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.4194078947368425,
|
||
|
|
"grad_norm": 0.22152091562747955,
|
||
|
|
"learning_rate": 8.266888312066013e-08,
|
||
|
|
"loss": 0.1609,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16376778483390808,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.444078947368421,
|
||
|
|
"grad_norm": 0.24168169498443604,
|
||
|
|
"learning_rate": 6.184634489169838e-08,
|
||
|
|
"loss": 0.1598,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15376853942871094,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.46875,
|
||
|
|
"grad_norm": 0.27041196823120117,
|
||
|
|
"learning_rate": 4.403582434857834e-08,
|
||
|
|
"loss": 0.1529,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1645251214504242,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.493421052631579,
|
||
|
|
"grad_norm": 0.23931093513965607,
|
||
|
|
"learning_rate": 2.924001209163363e-08,
|
||
|
|
"loss": 0.1583,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14250212907791138,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.5180921052631575,
|
||
|
|
"grad_norm": 0.2812100648880005,
|
||
|
|
"learning_rate": 1.7461143295141036e-08,
|
||
|
|
"loss": 0.1821,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16714885830879211,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.542763157894737,
|
||
|
|
"grad_norm": 0.21569399535655975,
|
||
|
|
"learning_rate": 8.700997369659459e-09,
|
||
|
|
"loss": 0.1983,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21874967217445374,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.567434210526316,
|
||
|
|
"grad_norm": 0.23459841310977936,
|
||
|
|
"learning_rate": 2.9608976932182788e-09,
|
||
|
|
"loss": 0.2006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18229544162750244,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.592105263157895,
|
||
|
|
"grad_norm": 0.22248035669326782,
|
||
|
|
"learning_rate": 2.4171141139284204e-10,
|
||
|
|
"loss": 0.1941,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.18813243508338928,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.597039473684211,
|
||
|
|
"step": 1421,
|
||
|
|
"total_flos": 7.249602429950362e+16,
|
||
|
|
"train_loss": 0.0,
|
||
|
|
"train_runtime": 1.0653,
|
||
|
|
"train_samples_per_second": 7990.221,
|
||
|
|
"train_steps_per_second": 1333.894
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 1421,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 7,
|
||
|
|
"save_steps": 100,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 7.249602429950362e+16,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|