5590 lines
148 KiB
JSON
5590 lines
148 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 7.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 3080,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.011363636363636364,
|
||
|
|
"grad_norm": 2.4202071565877383,
|
||
|
|
"learning_rate": 5.194805194805196e-07,
|
||
|
|
"loss": 0.0702,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0671667754650116,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.022727272727272728,
|
||
|
|
"grad_norm": 2.7090778464248144,
|
||
|
|
"learning_rate": 1.168831168831169e-06,
|
||
|
|
"loss": 0.0688,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.06839145720005035,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03409090909090909,
|
||
|
|
"grad_norm": 2.631476074811997,
|
||
|
|
"learning_rate": 1.8181818181818183e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.05686648190021515,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.045454545454545456,
|
||
|
|
"grad_norm": 1.9137286691996278,
|
||
|
|
"learning_rate": 2.4675324675324676e-06,
|
||
|
|
"loss": 0.0499,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.04283757507801056,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056818181818181816,
|
||
|
|
"grad_norm": 0.8025109184626946,
|
||
|
|
"learning_rate": 3.116883116883117e-06,
|
||
|
|
"loss": 0.0348,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.030694402754306793,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06818181818181818,
|
||
|
|
"grad_norm": 0.6637403122092724,
|
||
|
|
"learning_rate": 3.7662337662337666e-06,
|
||
|
|
"loss": 0.0255,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02170516550540924,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07954545454545454,
|
||
|
|
"grad_norm": 0.7228989928571103,
|
||
|
|
"learning_rate": 4.415584415584416e-06,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0209285169839859,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09090909090909091,
|
||
|
|
"grad_norm": 0.6619519646149139,
|
||
|
|
"learning_rate": 5.064935064935065e-06,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01901041716337204,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10227272727272728,
|
||
|
|
"grad_norm": 0.6356152750924333,
|
||
|
|
"learning_rate": 5.7142857142857145e-06,
|
||
|
|
"loss": 0.0183,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0180702842772007,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11363636363636363,
|
||
|
|
"grad_norm": 0.7021690117422743,
|
||
|
|
"learning_rate": 6.363636363636364e-06,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01755809225142002,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.125,
|
||
|
|
"grad_norm": 0.5565940262580861,
|
||
|
|
"learning_rate": 7.012987012987014e-06,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.017491038888692856,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13636363636363635,
|
||
|
|
"grad_norm": 0.5339674872738339,
|
||
|
|
"learning_rate": 7.662337662337663e-06,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01534716971218586,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14772727272727273,
|
||
|
|
"grad_norm": 0.7032524550427636,
|
||
|
|
"learning_rate": 8.311688311688313e-06,
|
||
|
|
"loss": 0.016,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01548848021775484,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1590909090909091,
|
||
|
|
"grad_norm": 0.6541053007196621,
|
||
|
|
"learning_rate": 8.96103896103896e-06,
|
||
|
|
"loss": 0.0159,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.015953991562128067,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17045454545454544,
|
||
|
|
"grad_norm": 0.5783265556701235,
|
||
|
|
"learning_rate": 9.610389610389611e-06,
|
||
|
|
"loss": 0.015,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.014620505273342133,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18181818181818182,
|
||
|
|
"grad_norm": 0.6345130960482329,
|
||
|
|
"learning_rate": 1.025974025974026e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.014695674180984497,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19318181818181818,
|
||
|
|
"grad_norm": 0.5562731660188355,
|
||
|
|
"learning_rate": 1.0909090909090909e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01377495750784874,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20454545454545456,
|
||
|
|
"grad_norm": 0.6541952057242394,
|
||
|
|
"learning_rate": 1.155844155844156e-05,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.014258481562137604,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2159090909090909,
|
||
|
|
"grad_norm": 0.5691908399519683,
|
||
|
|
"learning_rate": 1.2207792207792208e-05,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.013385016471147537,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22727272727272727,
|
||
|
|
"grad_norm": 0.580044434886064,
|
||
|
|
"learning_rate": 1.2857142857142859e-05,
|
||
|
|
"loss": 0.013,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.012309407815337181,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23863636363636365,
|
||
|
|
"grad_norm": 0.5266824152062917,
|
||
|
|
"learning_rate": 1.3506493506493508e-05,
|
||
|
|
"loss": 0.013,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.012853426858782768,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25,
|
||
|
|
"grad_norm": 0.6288032062777236,
|
||
|
|
"learning_rate": 1.4155844155844157e-05,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.012903265655040741,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26136363636363635,
|
||
|
|
"grad_norm": 0.6032186484353697,
|
||
|
|
"learning_rate": 1.4805194805194807e-05,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.013433829881250858,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2727272727272727,
|
||
|
|
"grad_norm": 0.6239301765503266,
|
||
|
|
"learning_rate": 1.5454545454545454e-05,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011627838015556335,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2840909090909091,
|
||
|
|
"grad_norm": 0.7575483971341709,
|
||
|
|
"learning_rate": 1.6103896103896105e-05,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011799194850027561,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29545454545454547,
|
||
|
|
"grad_norm": 0.6069381753318757,
|
||
|
|
"learning_rate": 1.6753246753246756e-05,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.012594765052199364,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3068181818181818,
|
||
|
|
"grad_norm": 0.6028066475898874,
|
||
|
|
"learning_rate": 1.7402597402597403e-05,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.012447560206055641,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3181818181818182,
|
||
|
|
"grad_norm": 0.658991789032498,
|
||
|
|
"learning_rate": 1.8051948051948053e-05,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010936323553323746,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32954545454545453,
|
||
|
|
"grad_norm": 0.6556747811129617,
|
||
|
|
"learning_rate": 1.8701298701298704e-05,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011681806296110153,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3409090909090909,
|
||
|
|
"grad_norm": 0.5391995348614158,
|
||
|
|
"learning_rate": 1.9350649350649354e-05,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01041991263628006,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3522727272727273,
|
||
|
|
"grad_norm": 0.5939577172133247,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010792195796966553,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36363636363636365,
|
||
|
|
"grad_norm": 0.6336332903948232,
|
||
|
|
"learning_rate": 2.0649350649350652e-05,
|
||
|
|
"loss": 0.011,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011543366126716137,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.375,
|
||
|
|
"grad_norm": 1.0758163284934776,
|
||
|
|
"learning_rate": 2.12987012987013e-05,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011055282317101955,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38636363636363635,
|
||
|
|
"grad_norm": 0.6586631874191279,
|
||
|
|
"learning_rate": 2.194805194805195e-05,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01110876351594925,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3977272727272727,
|
||
|
|
"grad_norm": 0.7163570265009843,
|
||
|
|
"learning_rate": 2.25974025974026e-05,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010220736265182495,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4090909090909091,
|
||
|
|
"grad_norm": 0.656426832075113,
|
||
|
|
"learning_rate": 2.324675324675325e-05,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01056276448071003,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42045454545454547,
|
||
|
|
"grad_norm": 0.6806105412030344,
|
||
|
|
"learning_rate": 2.3896103896103898e-05,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010543874464929104,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4318181818181818,
|
||
|
|
"grad_norm": 0.6065541431220144,
|
||
|
|
"learning_rate": 2.454545454545455e-05,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01050608977675438,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4431818181818182,
|
||
|
|
"grad_norm": 0.7549491756612607,
|
||
|
|
"learning_rate": 2.51948051948052e-05,
|
||
|
|
"loss": 0.011,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009613769128918648,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45454545454545453,
|
||
|
|
"grad_norm": 0.7397236512363505,
|
||
|
|
"learning_rate": 2.5844155844155843e-05,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010200144723057747,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4659090909090909,
|
||
|
|
"grad_norm": 0.721421064597475,
|
||
|
|
"learning_rate": 2.6493506493506497e-05,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010110532864928246,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4772727272727273,
|
||
|
|
"grad_norm": 0.7321243599882241,
|
||
|
|
"learning_rate": 2.7142857142857148e-05,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01012840960174799,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48863636363636365,
|
||
|
|
"grad_norm": 0.618755135602394,
|
||
|
|
"learning_rate": 2.779220779220779e-05,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009899996221065521,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 0.5565341202041937,
|
||
|
|
"learning_rate": 2.8441558441558442e-05,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01080006267875433,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5113636363636364,
|
||
|
|
"grad_norm": 0.6342414142163366,
|
||
|
|
"learning_rate": 2.9090909090909093e-05,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.011034488677978516,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5227272727272727,
|
||
|
|
"grad_norm": 0.6057408249319223,
|
||
|
|
"learning_rate": 2.9740259740259743e-05,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010540914721786976,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5340909090909091,
|
||
|
|
"grad_norm": 0.5841787888775418,
|
||
|
|
"learning_rate": 3.038961038961039e-05,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008908655494451523,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5454545454545454,
|
||
|
|
"grad_norm": 0.6506016488997325,
|
||
|
|
"learning_rate": 3.103896103896104e-05,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010224273428320885,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5568181818181818,
|
||
|
|
"grad_norm": 0.5928426539202855,
|
||
|
|
"learning_rate": 3.1688311688311695e-05,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008863305673003197,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5681818181818182,
|
||
|
|
"grad_norm": 0.46323257255427763,
|
||
|
|
"learning_rate": 3.233766233766234e-05,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006387968547642231,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5795454545454546,
|
||
|
|
"grad_norm": 0.6368566022455184,
|
||
|
|
"learning_rate": 3.298701298701299e-05,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009636376053094864,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5909090909090909,
|
||
|
|
"grad_norm": 0.6357801199796334,
|
||
|
|
"learning_rate": 3.363636363636364e-05,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009234972298145294,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6022727272727273,
|
||
|
|
"grad_norm": 0.6460363398484877,
|
||
|
|
"learning_rate": 3.4285714285714284e-05,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.010629944503307343,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6136363636363636,
|
||
|
|
"grad_norm": 0.6092803828103882,
|
||
|
|
"learning_rate": 3.493506493506494e-05,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009354619309306145,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.625,
|
||
|
|
"grad_norm": 0.5872535624821622,
|
||
|
|
"learning_rate": 3.5584415584415585e-05,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009486516937613487,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6363636363636364,
|
||
|
|
"grad_norm": 0.6232160784946711,
|
||
|
|
"learning_rate": 3.623376623376624e-05,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008946949616074562,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6477272727272727,
|
||
|
|
"grad_norm": 0.5698281812155526,
|
||
|
|
"learning_rate": 3.6883116883116886e-05,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008350740186870098,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6590909090909091,
|
||
|
|
"grad_norm": 0.5865393453193715,
|
||
|
|
"learning_rate": 3.753246753246753e-05,
|
||
|
|
"loss": 0.009,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009732929989695549,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6704545454545454,
|
||
|
|
"grad_norm": 0.6934292082458405,
|
||
|
|
"learning_rate": 3.818181818181819e-05,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009854786098003387,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6818181818181818,
|
||
|
|
"grad_norm": 0.6300123813854518,
|
||
|
|
"learning_rate": 3.8831168831168834e-05,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009741833433508873,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6931818181818182,
|
||
|
|
"grad_norm": 0.5939534509528255,
|
||
|
|
"learning_rate": 3.948051948051948e-05,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009663011878728867,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7045454545454546,
|
||
|
|
"grad_norm": 0.607950227032694,
|
||
|
|
"learning_rate": 3.999998715561701e-05,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008878744207322598,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7159090909090909,
|
||
|
|
"grad_norm": 0.6573074772128468,
|
||
|
|
"learning_rate": 3.999953760394435e-05,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008361485786736012,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7272727272727273,
|
||
|
|
"grad_norm": 0.5722274388778257,
|
||
|
|
"learning_rate": 3.99984458496195e-05,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00826809648424387,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7386363636363636,
|
||
|
|
"grad_norm": 0.6301644296792285,
|
||
|
|
"learning_rate": 3.999671192769966e-05,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008595539256930351,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.75,
|
||
|
|
"grad_norm": 0.5978999055713571,
|
||
|
|
"learning_rate": 3.999433589386259e-05,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008774494752287865,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7613636363636364,
|
||
|
|
"grad_norm": 0.6034089236347809,
|
||
|
|
"learning_rate": 3.9991317824404785e-05,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008122660219669342,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7727272727272727,
|
||
|
|
"grad_norm": 0.6709126750619671,
|
||
|
|
"learning_rate": 3.9987657816239124e-05,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008076019585132599,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7840909090909091,
|
||
|
|
"grad_norm": 0.5394062287015313,
|
||
|
|
"learning_rate": 3.9983355986891664e-05,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007619280368089676,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7954545454545454,
|
||
|
|
"grad_norm": 0.6488540438340623,
|
||
|
|
"learning_rate": 3.99784124744979e-05,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008518952876329422,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8068181818181818,
|
||
|
|
"grad_norm": 0.6117782898228254,
|
||
|
|
"learning_rate": 3.997282743779835e-05,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009256478399038315,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8181818181818182,
|
||
|
|
"grad_norm": 0.5701394749379847,
|
||
|
|
"learning_rate": 3.996660105613343e-05,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008787738159298897,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8295454545454546,
|
||
|
|
"grad_norm": 0.5468744454626169,
|
||
|
|
"learning_rate": 3.995973352943769e-05,
|
||
|
|
"loss": 0.008,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007650377228856087,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8409090909090909,
|
||
|
|
"grad_norm": 0.5788472026671531,
|
||
|
|
"learning_rate": 3.9952225078233435e-05,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00858687050640583,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8522727272727273,
|
||
|
|
"grad_norm": 0.6724456328479279,
|
||
|
|
"learning_rate": 3.9944075943623605e-05,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00910107046365738,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8636363636363636,
|
||
|
|
"grad_norm": 0.49220361670025053,
|
||
|
|
"learning_rate": 3.9935286387284035e-05,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008576749823987484,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.875,
|
||
|
|
"grad_norm": 0.5372109147479404,
|
||
|
|
"learning_rate": 3.9925856691455075e-05,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008667494170367718,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8863636363636364,
|
||
|
|
"grad_norm": 0.5721185715094734,
|
||
|
|
"learning_rate": 3.9915787158932505e-05,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00844760425388813,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8977272727272727,
|
||
|
|
"grad_norm": 0.5517431281787601,
|
||
|
|
"learning_rate": 3.990507811305782e-05,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007522025145590305,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9090909090909091,
|
||
|
|
"grad_norm": 0.6017721887816652,
|
||
|
|
"learning_rate": 3.989372989770787e-05,
|
||
|
|
"loss": 0.0079,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008214390836656094,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9204545454545454,
|
||
|
|
"grad_norm": 0.6600730500075375,
|
||
|
|
"learning_rate": 3.988174287728376e-05,
|
||
|
|
"loss": 0.008,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008041520603001118,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9318181818181818,
|
||
|
|
"grad_norm": 0.5632715573396425,
|
||
|
|
"learning_rate": 3.986911743669923e-05,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.009165343828499317,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9431818181818182,
|
||
|
|
"grad_norm": 1.4872493920090515,
|
||
|
|
"learning_rate": 3.9855853981368196e-05,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007640507072210312,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9545454545454546,
|
||
|
|
"grad_norm": 0.5769476984554328,
|
||
|
|
"learning_rate": 3.984195293719182e-05,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008218517526984215,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9659090909090909,
|
||
|
|
"grad_norm": 0.5978028109547544,
|
||
|
|
"learning_rate": 3.982741475054481e-05,
|
||
|
|
"loss": 0.008,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008345318958163261,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9772727272727273,
|
||
|
|
"grad_norm": 0.5574300718972974,
|
||
|
|
"learning_rate": 3.9812239888261054e-05,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007650978863239288,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9886363636363636,
|
||
|
|
"grad_norm": 0.5120110312427406,
|
||
|
|
"learning_rate": 3.979642883761866e-05,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008290043100714684,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 0.535007056168261,
|
||
|
|
"learning_rate": 3.9779982106324284e-05,
|
||
|
|
"loss": 0.0077,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007958535104990005,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0113636363636365,
|
||
|
|
"grad_norm": 0.5263606844624271,
|
||
|
|
"learning_rate": 3.976290022249687e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007756863720715046,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0227272727272727,
|
||
|
|
"grad_norm": 0.5456386490332087,
|
||
|
|
"learning_rate": 3.974518373465066e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0069215744733810425,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0340909090909092,
|
||
|
|
"grad_norm": 0.5140576840256387,
|
||
|
|
"learning_rate": 3.9726833211677576e-05,
|
||
|
|
"loss": 0.0075,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007215406280010939,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0454545454545454,
|
||
|
|
"grad_norm": 0.5951073719013448,
|
||
|
|
"learning_rate": 3.970784924282896e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007084294687956572,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0568181818181819,
|
||
|
|
"grad_norm": 0.5715918038022714,
|
||
|
|
"learning_rate": 3.968823243769667e-05,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007343347650021315,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0681818181818181,
|
||
|
|
"grad_norm": 0.5186709350157678,
|
||
|
|
"learning_rate": 3.9667983426193485e-05,
|
||
|
|
"loss": 0.0075,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007364596240222454,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0795454545454546,
|
||
|
|
"grad_norm": 0.5448125552678345,
|
||
|
|
"learning_rate": 3.964710285853287e-05,
|
||
|
|
"loss": 0.0077,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007313626818358898,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0909090909090908,
|
||
|
|
"grad_norm": 0.5911853899860636,
|
||
|
|
"learning_rate": 3.9625591405208145e-05,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006968418136239052,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1022727272727273,
|
||
|
|
"grad_norm": 0.5955339375748643,
|
||
|
|
"learning_rate": 3.9603449756970877e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006995165254920721,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1136363636363635,
|
||
|
|
"grad_norm": 0.5238355538175061,
|
||
|
|
"learning_rate": 3.958067862480878e-05,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007396093104034662,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.125,
|
||
|
|
"grad_norm": 0.524652353868063,
|
||
|
|
"learning_rate": 3.955727873992283e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007254592142999172,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1363636363636362,
|
||
|
|
"grad_norm": 0.5991907014019456,
|
||
|
|
"learning_rate": 3.95332508537038e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00780677143484354,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1477272727272727,
|
||
|
|
"grad_norm": 0.49483973366541717,
|
||
|
|
"learning_rate": 3.950859573770815e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006735119502991438,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1590909090909092,
|
||
|
|
"grad_norm": 0.5080518306783881,
|
||
|
|
"learning_rate": 3.9483314183633206e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0065904781222343445,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1704545454545454,
|
||
|
|
"grad_norm": 0.6414095876262249,
|
||
|
|
"learning_rate": 3.9457407003291826e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007423494942486286,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1818181818181819,
|
||
|
|
"grad_norm": 0.5056850434770084,
|
||
|
|
"learning_rate": 3.943087502858621e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007466105278581381,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1931818181818181,
|
||
|
|
"grad_norm": 0.5296131674142839,
|
||
|
|
"learning_rate": 3.9403719111481295e-05,
|
||
|
|
"loss": 0.0077,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008099646307528019,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2045454545454546,
|
||
|
|
"grad_norm": 0.5315251299863072,
|
||
|
|
"learning_rate": 3.937594012397734e-05,
|
||
|
|
"loss": 0.0079,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007564317435026169,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2159090909090908,
|
||
|
|
"grad_norm": 0.4950784114913268,
|
||
|
|
"learning_rate": 3.934753895808193e-05,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007739352062344551,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2272727272727273,
|
||
|
|
"grad_norm": 0.5290244974118238,
|
||
|
|
"learning_rate": 3.931851652578137e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0073152827098965645,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2386363636363638,
|
||
|
|
"grad_norm": 0.5571429830629839,
|
||
|
|
"learning_rate": 3.928887375901134e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007779883220791817,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.25,
|
||
|
|
"grad_norm": 0.5497246570465858,
|
||
|
|
"learning_rate": 3.9258611609627035e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008137003518640995,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2613636363636362,
|
||
|
|
"grad_norm": 0.5980322062094636,
|
||
|
|
"learning_rate": 3.922773104937254e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006626577116549015,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2727272727272727,
|
||
|
|
"grad_norm": 0.5713517990750141,
|
||
|
|
"learning_rate": 3.919623306984967e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006742670200765133,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2840909090909092,
|
||
|
|
"grad_norm": 0.49571075772413836,
|
||
|
|
"learning_rate": 3.91641186824861e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007229169365018606,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2954545454545454,
|
||
|
|
"grad_norm": 0.686804757238871,
|
||
|
|
"learning_rate": 3.9131388918502914e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0075844405218958855,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3068181818181819,
|
||
|
|
"grad_norm": 0.5731331497884388,
|
||
|
|
"learning_rate": 3.9098044828881476e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0073439269326627254,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3181818181818181,
|
||
|
|
"grad_norm": 0.6007066583243181,
|
||
|
|
"learning_rate": 3.906408748432968e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006798421032726765,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3295454545454546,
|
||
|
|
"grad_norm": 0.5567875514837843,
|
||
|
|
"learning_rate": 3.902951797524757e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006960849743336439,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3409090909090908,
|
||
|
|
"grad_norm": 1.181124528974347,
|
||
|
|
"learning_rate": 3.899433741169233e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007384179159998894,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3522727272727273,
|
||
|
|
"grad_norm": 0.5129764810666245,
|
||
|
|
"learning_rate": 3.895854692334264e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00716067710891366,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3636363636363638,
|
||
|
|
"grad_norm": 0.5713691461694685,
|
||
|
|
"learning_rate": 3.892214765946239e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007511813659220934,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.375,
|
||
|
|
"grad_norm": 0.6403221306923572,
|
||
|
|
"learning_rate": 3.8885140788863814e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006619361229240894,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3863636363636362,
|
||
|
|
"grad_norm": 0.533944352255265,
|
||
|
|
"learning_rate": 3.8847527499869884e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007267755921930075,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3977272727272727,
|
||
|
|
"grad_norm": 0.5421207337934212,
|
||
|
|
"learning_rate": 3.8809309000276234e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0069773998111486435,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4090909090909092,
|
||
|
|
"grad_norm": 1.64206327533154,
|
||
|
|
"learning_rate": 3.877048651731232e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006747320294380188,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4204545454545454,
|
||
|
|
"grad_norm": 0.6229385815317037,
|
||
|
|
"learning_rate": 3.873106129760206e-05,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007608484942466021,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4318181818181819,
|
||
|
|
"grad_norm": 0.5874493526375214,
|
||
|
|
"learning_rate": 3.8691034607123725e-05,
|
||
|
|
"loss": 0.0075,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.008636144921183586,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4431818181818181,
|
||
|
|
"grad_norm": 0.5649097168737776,
|
||
|
|
"learning_rate": 3.8650407731169395e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006587575189769268,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4545454545454546,
|
||
|
|
"grad_norm": 0.5437037550846343,
|
||
|
|
"learning_rate": 3.8609181974303596e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007958244532346725,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4659090909090908,
|
||
|
|
"grad_norm": 0.5350155617357305,
|
||
|
|
"learning_rate": 3.856735866032145e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007098427973687649,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4772727272727273,
|
||
|
|
"grad_norm": 0.5364955171336037,
|
||
|
|
"learning_rate": 3.852493913220618e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006704941391944885,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4886363636363638,
|
||
|
|
"grad_norm": 0.49421742514776257,
|
||
|
|
"learning_rate": 3.8481924752085935e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007255823351442814,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5,
|
||
|
|
"grad_norm": 0.5433054902315646,
|
||
|
|
"learning_rate": 3.84383169011901e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006950980983674526,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5113636363636362,
|
||
|
|
"grad_norm": 0.5425382334859696,
|
||
|
|
"learning_rate": 3.839411697980493e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007066159974783659,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5227272727272727,
|
||
|
|
"grad_norm": 0.514763959493457,
|
||
|
|
"learning_rate": 3.834932640722857e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0074985395185649395,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5340909090909092,
|
||
|
|
"grad_norm": 0.4091438249580315,
|
||
|
|
"learning_rate": 3.830394662172551e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006624377332627773,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5454545454545454,
|
||
|
|
"grad_norm": 0.5420003778380477,
|
||
|
|
"learning_rate": 3.8257979080480356e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007473141886293888,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5568181818181817,
|
||
|
|
"grad_norm": 0.5510015264602999,
|
||
|
|
"learning_rate": 3.821142525955109e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007171965204179287,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5681818181818183,
|
||
|
|
"grad_norm": 0.5089211739539228,
|
||
|
|
"learning_rate": 3.8164286653821633e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006746960803866386,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5795454545454546,
|
||
|
|
"grad_norm": 0.3589859565561615,
|
||
|
|
"learning_rate": 3.811656477695385e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005060967057943344,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5909090909090908,
|
||
|
|
"grad_norm": 0.5048379034549834,
|
||
|
|
"learning_rate": 3.806826116133898e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006541744340211153,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6022727272727273,
|
||
|
|
"grad_norm": 0.7394143420945537,
|
||
|
|
"learning_rate": 3.801937735804838e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006246958859264851,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6136363636363638,
|
||
|
|
"grad_norm": 0.51858921274512,
|
||
|
|
"learning_rate": 3.7969914936783754e-05,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007437014486640692,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.625,
|
||
|
|
"grad_norm": 0.46460258240184427,
|
||
|
|
"learning_rate": 3.791987548582672e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00666455551981926,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6363636363636362,
|
||
|
|
"grad_norm": 0.4912606857919302,
|
||
|
|
"learning_rate": 3.7869260611987834e-05,
|
||
|
|
"loss": 0.0075,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00768289715051651,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6477272727272727,
|
||
|
|
"grad_norm": 0.508677178248691,
|
||
|
|
"learning_rate": 3.781807194055499e-05,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007199929095804691,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6590909090909092,
|
||
|
|
"grad_norm": 0.5300921159411073,
|
||
|
|
"learning_rate": 3.776631111524121e-05,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007103857584297657,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6704545454545454,
|
||
|
|
"grad_norm": 0.4791376882885577,
|
||
|
|
"learning_rate": 3.7713979798131886e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007111593149602413,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6818181818181817,
|
||
|
|
"grad_norm": 0.4559394796312738,
|
||
|
|
"learning_rate": 3.766107966963141e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006946143694221973,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6931818181818183,
|
||
|
|
"grad_norm": 0.49637388107603064,
|
||
|
|
"learning_rate": 3.760761242840918e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0074441926553845406,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7045454545454546,
|
||
|
|
"grad_norm": 0.559242858376479,
|
||
|
|
"learning_rate": 3.755357979134511e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0071332817897200584,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7159090909090908,
|
||
|
|
"grad_norm": 0.5042549583166708,
|
||
|
|
"learning_rate": 3.749898349347446e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007963062264025211,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7272727272727273,
|
||
|
|
"grad_norm": 0.4616682198115151,
|
||
|
|
"learning_rate": 3.744382528793211e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006798394024372101,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7386363636363638,
|
||
|
|
"grad_norm": 0.495266160422968,
|
||
|
|
"learning_rate": 3.738810694589631e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00613341573625803,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.75,
|
||
|
|
"grad_norm": 0.4928006241792697,
|
||
|
|
"learning_rate": 3.733183025653178e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006665781605988741,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7613636363636362,
|
||
|
|
"grad_norm": 0.5078072761733905,
|
||
|
|
"learning_rate": 3.7274997026932256e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006252589635550976,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7727272727272727,
|
||
|
|
"grad_norm": 0.5213591867225603,
|
||
|
|
"learning_rate": 3.721760908206247e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00697554275393486,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7840909090909092,
|
||
|
|
"grad_norm": 0.5183526092951634,
|
||
|
|
"learning_rate": 3.7159668264699546e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007623917423188686,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7954545454545454,
|
||
|
|
"grad_norm": 0.48817992610683464,
|
||
|
|
"learning_rate": 3.710117643537383e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0066612763330340385,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8068181818181817,
|
||
|
|
"grad_norm": 0.45787093080914265,
|
||
|
|
"learning_rate": 3.7042135472309134e-05,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006526663899421692,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8181818181818183,
|
||
|
|
"grad_norm": 0.471041929502915,
|
||
|
|
"learning_rate": 3.698254727136245e-05,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007218446582555771,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8295454545454546,
|
||
|
|
"grad_norm": 0.4963460706229407,
|
||
|
|
"learning_rate": 3.692241374596306e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007524269167333841,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8409090909090908,
|
||
|
|
"grad_norm": 0.47229112029444775,
|
||
|
|
"learning_rate": 3.6861736827051066e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00766750518232584,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8522727272727273,
|
||
|
|
"grad_norm": 0.4720231061897637,
|
||
|
|
"learning_rate": 3.680051846301543e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007345844060182571,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8636363636363638,
|
||
|
|
"grad_norm": 0.5780164821061616,
|
||
|
|
"learning_rate": 3.67387606196314e-05,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007284290157258511,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.875,
|
||
|
|
"grad_norm": 0.4741104553305416,
|
||
|
|
"learning_rate": 3.6676465279997343e-05,
|
||
|
|
"loss": 0.007,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006840241141617298,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8863636363636362,
|
||
|
|
"grad_norm": 0.4262283214786933,
|
||
|
|
"learning_rate": 3.6613634444471145e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007350475527346134,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8977272727272727,
|
||
|
|
"grad_norm": 0.5157890127598841,
|
||
|
|
"learning_rate": 3.655027013060591e-05,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007742348592728376,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9090909090909092,
|
||
|
|
"grad_norm": 0.5169007346303762,
|
||
|
|
"learning_rate": 3.648637437308519e-05,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0066361138597130775,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9204545454545454,
|
||
|
|
"grad_norm": 0.5505323294534453,
|
||
|
|
"learning_rate": 3.642194922365766e-05,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006925581954419613,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9318181818181817,
|
||
|
|
"grad_norm": 0.5583894099740615,
|
||
|
|
"learning_rate": 3.635699675107126e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006889252923429012,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9431818181818183,
|
||
|
|
"grad_norm": 0.5237508106974393,
|
||
|
|
"learning_rate": 3.629151904100672e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006494746543467045,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9545454545454546,
|
||
|
|
"grad_norm": 0.5478140154768331,
|
||
|
|
"learning_rate": 3.622551819601058e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006974754389375448,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9659090909090908,
|
||
|
|
"grad_norm": 0.4901565927254637,
|
||
|
|
"learning_rate": 3.615899633542775e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007577099371701479,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9772727272727273,
|
||
|
|
"grad_norm": 0.4861864224749252,
|
||
|
|
"learning_rate": 3.609195559533337e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006560392677783966,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9886363636363638,
|
||
|
|
"grad_norm": 0.5086683182785141,
|
||
|
|
"learning_rate": 3.6024398128464264e-05,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0073668742552399635,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 0.4866523538091737,
|
||
|
|
"learning_rate": 3.595632610414981e-05,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006037401966750622,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0113636363636362,
|
||
|
|
"grad_norm": 0.4630397674349411,
|
||
|
|
"learning_rate": 3.588774170824225e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0056028589606285095,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.022727272727273,
|
||
|
|
"grad_norm": 0.45531441122486527,
|
||
|
|
"learning_rate": 3.581864714304659e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0062451446428895,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.034090909090909,
|
||
|
|
"grad_norm": 0.4922607055924082,
|
||
|
|
"learning_rate": 3.5749044627249744e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006536226253956556,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0454545454545454,
|
||
|
|
"grad_norm": 0.44948584157435817,
|
||
|
|
"learning_rate": 3.56789363958494e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006173080764710903,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0568181818181817,
|
||
|
|
"grad_norm": 0.5763350452548516,
|
||
|
|
"learning_rate": 3.560832470008223e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00653237197548151,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0681818181818183,
|
||
|
|
"grad_norm": 0.4801203990518026,
|
||
|
|
"learning_rate": 3.553721180735157e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005954197607934475,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0795454545454546,
|
||
|
|
"grad_norm": 0.4778441190134779,
|
||
|
|
"learning_rate": 3.546560000115461e-05,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006330440286546946,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.090909090909091,
|
||
|
|
"grad_norm": 0.5347858724369077,
|
||
|
|
"learning_rate": 3.539349158100912e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005839129444211721,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.102272727272727,
|
||
|
|
"grad_norm": 0.48921521082512576,
|
||
|
|
"learning_rate": 3.532088886237956e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005895141512155533,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1136363636363638,
|
||
|
|
"grad_norm": 0.5146757560983731,
|
||
|
|
"learning_rate": 3.524779417660277e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0059109157882630825,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.125,
|
||
|
|
"grad_norm": 0.6429144662492329,
|
||
|
|
"learning_rate": 3.517420987081304e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006881117820739746,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1363636363636362,
|
||
|
|
"grad_norm": 0.5048764322733581,
|
||
|
|
"learning_rate": 3.510013830786685e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006111334078013897,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.147727272727273,
|
||
|
|
"grad_norm": 0.5278971681123248,
|
||
|
|
"learning_rate": 3.502558186626687e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006541167851537466,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.159090909090909,
|
||
|
|
"grad_norm": 0.4455406565197607,
|
||
|
|
"learning_rate": 3.4950542940085695e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00632286723703146,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1704545454545454,
|
||
|
|
"grad_norm": 0.5510905187000109,
|
||
|
|
"learning_rate": 3.48750239388889e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006969518028199673,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1818181818181817,
|
||
|
|
"grad_norm": 0.5624895585986736,
|
||
|
|
"learning_rate": 3.479902728765768e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006708991248160601,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1931818181818183,
|
||
|
|
"grad_norm": 0.458254560373065,
|
||
|
|
"learning_rate": 3.4722555426711017e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006090499460697174,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2045454545454546,
|
||
|
|
"grad_norm": 0.693897892710982,
|
||
|
|
"learning_rate": 3.464561081162728e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005784523673355579,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.215909090909091,
|
||
|
|
"grad_norm": 0.4862555425590794,
|
||
|
|
"learning_rate": 3.456819591316539e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006276763044297695,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.227272727272727,
|
||
|
|
"grad_norm": 0.4821015824320282,
|
||
|
|
"learning_rate": 3.4490313217185454e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006492925807833672,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2386363636363638,
|
||
|
|
"grad_norm": 0.4296027449680172,
|
||
|
|
"learning_rate": 3.4411965224569006e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006225232966244221,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.25,
|
||
|
|
"grad_norm": 0.605266972216722,
|
||
|
|
"learning_rate": 3.4333154451138644e-05,
|
||
|
|
"loss": 0.0067,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006708680652081966,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2613636363636362,
|
||
|
|
"grad_norm": 0.4541560489432943,
|
||
|
|
"learning_rate": 3.4253883427577266e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005775884725153446,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2727272727272725,
|
||
|
|
"grad_norm": 0.46856650577715975,
|
||
|
|
"learning_rate": 3.417415469934678e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005967448465526104,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.284090909090909,
|
||
|
|
"grad_norm": 0.4985004787357942,
|
||
|
|
"learning_rate": 3.409397082660643e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0061860037967562675,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2954545454545454,
|
||
|
|
"grad_norm": 0.4629095284226778,
|
||
|
|
"learning_rate": 3.401333438413053e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0064862193539738655,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3068181818181817,
|
||
|
|
"grad_norm": 0.4839935039440178,
|
||
|
|
"learning_rate": 3.3932247961225805e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005635668523609638,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3181818181818183,
|
||
|
|
"grad_norm": 0.4626909694044112,
|
||
|
|
"learning_rate": 3.385071416164824e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0058249845169484615,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3295454545454546,
|
||
|
|
"grad_norm": 0.46354066332426824,
|
||
|
|
"learning_rate": 3.376873560351948e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006067879963666201,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.340909090909091,
|
||
|
|
"grad_norm": 0.5122297881346973,
|
||
|
|
"learning_rate": 3.368631491924277e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006336583755910397,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3522727272727275,
|
||
|
|
"grad_norm": 0.4237417475830667,
|
||
|
|
"learning_rate": 3.360345475541839e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007529721595346928,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3636363636363638,
|
||
|
|
"grad_norm": 0.4645416778269651,
|
||
|
|
"learning_rate": 3.3520157772758716e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006176627241075039,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.375,
|
||
|
|
"grad_norm": 0.5044028687299602,
|
||
|
|
"learning_rate": 3.343642664600273e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00660756416618824,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3863636363636362,
|
||
|
|
"grad_norm": 0.47017301373211334,
|
||
|
|
"learning_rate": 3.3352264063830184e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006091096438467503,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3977272727272725,
|
||
|
|
"grad_norm": 0.43705892726841483,
|
||
|
|
"learning_rate": 3.3267672728775245e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0061500780284404755,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.409090909090909,
|
||
|
|
"grad_norm": 0.49153186096934803,
|
||
|
|
"learning_rate": 3.3182655357139686e-05,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006093064323067665,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4204545454545454,
|
||
|
|
"grad_norm": 0.48501040856687555,
|
||
|
|
"learning_rate": 3.309721467890571e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006339596584439278,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4318181818181817,
|
||
|
|
"grad_norm": 0.460207072808941,
|
||
|
|
"learning_rate": 3.301135343764824e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006662009749561548,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4431818181818183,
|
||
|
|
"grad_norm": 0.5086631629233642,
|
||
|
|
"learning_rate": 3.292507439044689e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006738311145454645,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4545454545454546,
|
||
|
|
"grad_norm": 0.47413969867526984,
|
||
|
|
"learning_rate": 3.283838030779733e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00711568258702755,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.465909090909091,
|
||
|
|
"grad_norm": 0.484709187352696,
|
||
|
|
"learning_rate": 3.275127397352243e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006409568712115288,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4772727272727275,
|
||
|
|
"grad_norm": 0.4302542818712421,
|
||
|
|
"learning_rate": 3.2663758184682804e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007154181599617004,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4886363636363638,
|
||
|
|
"grad_norm": 0.4652093254753745,
|
||
|
|
"learning_rate": 3.257583575148699e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005443320609629154,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5,
|
||
|
|
"grad_norm": 0.45389989611595233,
|
||
|
|
"learning_rate": 3.2487509497201274e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006023968569934368,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5113636363636362,
|
||
|
|
"grad_norm": 0.5185036844454263,
|
||
|
|
"learning_rate": 3.239878225805895e-05,
|
||
|
|
"loss": 0.0066,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0073779672384262085,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5227272727272725,
|
||
|
|
"grad_norm": 0.5331373246789413,
|
||
|
|
"learning_rate": 3.230965688316931e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006063676439225674,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.534090909090909,
|
||
|
|
"grad_norm": 0.47502160394446974,
|
||
|
|
"learning_rate": 3.222013623442613e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006197344046086073,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5454545454545454,
|
||
|
|
"grad_norm": 0.5274100837413495,
|
||
|
|
"learning_rate": 3.2130223186415766e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005591326858848333,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5568181818181817,
|
||
|
|
"grad_norm": 0.477868112034783,
|
||
|
|
"learning_rate": 3.203992062632487e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0056713358499109745,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5681818181818183,
|
||
|
|
"grad_norm": 0.4551006058255123,
|
||
|
|
"learning_rate": 3.194923145384766e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006257796660065651,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5795454545454546,
|
||
|
|
"grad_norm": 0.46468844092336814,
|
||
|
|
"learning_rate": 3.1858158581092816e-05,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005319403484463692,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.590909090909091,
|
||
|
|
"grad_norm": 0.4489441649909645,
|
||
|
|
"learning_rate": 3.176670493248997e-05,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0064444029703736305,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6022727272727275,
|
||
|
|
"grad_norm": 0.4702167240533605,
|
||
|
|
"learning_rate": 3.1674873444695804e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006454262882471085,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6136363636363638,
|
||
|
|
"grad_norm": 0.45164335617167706,
|
||
|
|
"learning_rate": 3.158266706649974e-05,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006458786316215992,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.625,
|
||
|
|
"grad_norm": 0.39102326262348314,
|
||
|
|
"learning_rate": 3.1490088758729274e-05,
|
||
|
|
"loss": 0.0065,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005608841776847839,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6363636363636362,
|
||
|
|
"grad_norm": 0.4409257535585994,
|
||
|
|
"learning_rate": 3.1397141494154864e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00575103797018528,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6477272727272725,
|
||
|
|
"grad_norm": 0.48759850968139057,
|
||
|
|
"learning_rate": 3.13038282573945e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0064391097985208035,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.659090909090909,
|
||
|
|
"grad_norm": 0.45473692448344905,
|
||
|
|
"learning_rate": 3.121015204481788e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006274147890508175,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6704545454545454,
|
||
|
|
"grad_norm": 0.47444274421515903,
|
||
|
|
"learning_rate": 3.111611586445015e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005559071898460388,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6818181818181817,
|
||
|
|
"grad_norm": 0.4483625344303971,
|
||
|
|
"learning_rate": 3.1021722735875345e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005905711092054844,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6931818181818183,
|
||
|
|
"grad_norm": 0.45134284924379864,
|
||
|
|
"learning_rate": 3.0926975690139415e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.007338996976613998,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7045454545454546,
|
||
|
|
"grad_norm": 0.45699971456056127,
|
||
|
|
"learning_rate": 3.0831877769652905e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005969179328531027,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.715909090909091,
|
||
|
|
"grad_norm": 0.48032859712409776,
|
||
|
|
"learning_rate": 3.073643202809325e-05,
|
||
|
|
"loss": 0.0064,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006529618054628372,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7272727272727275,
|
||
|
|
"grad_norm": 0.5317749913490281,
|
||
|
|
"learning_rate": 3.064064153030673e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00578355323523283,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7386363636363638,
|
||
|
|
"grad_norm": 0.45449325947395547,
|
||
|
|
"learning_rate": 3.054450935221005e-05,
|
||
|
|
"loss": 0.0058,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005896543152630329,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.75,
|
||
|
|
"grad_norm": 0.47907363109287115,
|
||
|
|
"learning_rate": 3.0448038580691563e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006563876289874315,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7613636363636362,
|
||
|
|
"grad_norm": 0.4854239824964068,
|
||
|
|
"learning_rate": 3.0351232313512145e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00620038527995348,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7727272727272725,
|
||
|
|
"grad_norm": 0.4692367936798186,
|
||
|
|
"learning_rate": 3.0254093659205752e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006536625791341066,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.784090909090909,
|
||
|
|
"grad_norm": 0.443729375843743,
|
||
|
|
"learning_rate": 3.015662573697957e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0054986486211419106,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7954545454545454,
|
||
|
|
"grad_norm": 0.46206428677941797,
|
||
|
|
"learning_rate": 3.0058831676613854e-05,
|
||
|
|
"loss": 0.0058,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006025765556842089,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8068181818181817,
|
||
|
|
"grad_norm": 0.48541475266155165,
|
||
|
|
"learning_rate": 2.996071461836147e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006246485747396946,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8181818181818183,
|
||
|
|
"grad_norm": 0.5226366747707972,
|
||
|
|
"learning_rate": 2.986227771284701e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005961126647889614,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8295454545454546,
|
||
|
|
"grad_norm": 0.4411214489001844,
|
||
|
|
"learning_rate": 2.976352412096563e-05,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006008786149322987,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.840909090909091,
|
||
|
|
"grad_norm": 0.4932922933360534,
|
||
|
|
"learning_rate": 2.9664457013781588e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006435011513531208,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8522727272727275,
|
||
|
|
"grad_norm": 0.4515975008989531,
|
||
|
|
"learning_rate": 2.956507957242637e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005883371457457542,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8636363636363638,
|
||
|
|
"grad_norm": 0.46132522349986016,
|
||
|
|
"learning_rate": 2.9465394987996575e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005824087653309107,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.875,
|
||
|
|
"grad_norm": 0.48663120732476906,
|
||
|
|
"learning_rate": 2.9365406461451442e-05,
|
||
|
|
"loss": 0.0061,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005776618607342243,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8863636363636362,
|
||
|
|
"grad_norm": 0.5041440432415826,
|
||
|
|
"learning_rate": 2.9265117203510045e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006313983350992203,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8977272727272725,
|
||
|
|
"grad_norm": 0.48629649213889986,
|
||
|
|
"learning_rate": 2.916453043454821e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005881605204194784,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.909090909090909,
|
||
|
|
"grad_norm": 0.5032118801387048,
|
||
|
|
"learning_rate": 2.9063649384495104e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0056932102888822556,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9204545454545454,
|
||
|
|
"grad_norm": 0.4706285489476537,
|
||
|
|
"learning_rate": 2.896247729272951e-05,
|
||
|
|
"loss": 0.006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005304855294525623,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9318181818181817,
|
||
|
|
"grad_norm": 0.5082140138556308,
|
||
|
|
"learning_rate": 2.8861017407975828e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006749492138624191,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9431818181818183,
|
||
|
|
"grad_norm": 0.49654640744802336,
|
||
|
|
"learning_rate": 2.8759272988199724e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005863680969923735,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9545454545454546,
|
||
|
|
"grad_norm": 0.46438252714106815,
|
||
|
|
"learning_rate": 2.865724730050356e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0065032909624278545,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.965909090909091,
|
||
|
|
"grad_norm": 0.46598363602790127,
|
||
|
|
"learning_rate": 2.855494362102142e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0062259710393846035,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9772727272727275,
|
||
|
|
"grad_norm": 0.45993410705158827,
|
||
|
|
"learning_rate": 2.8452365234813992e-05,
|
||
|
|
"loss": 0.0062,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006372966803610325,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9886363636363638,
|
||
|
|
"grad_norm": 0.5037560097338225,
|
||
|
|
"learning_rate": 2.8349515435763e-05,
|
||
|
|
"loss": 0.0063,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0068403808400034904,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 0.687441874495177,
|
||
|
|
"learning_rate": 2.824639752646549e-05,
|
||
|
|
"loss": 0.0059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005738408304750919,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0113636363636362,
|
||
|
|
"grad_norm": 0.5040643481396614,
|
||
|
|
"learning_rate": 2.814301481812776e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005009516142308712,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.022727272727273,
|
||
|
|
"grad_norm": 0.5100500768266673,
|
||
|
|
"learning_rate": 2.8039370630459026e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0057281493209302425,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.034090909090909,
|
||
|
|
"grad_norm": 0.46520065511260555,
|
||
|
|
"learning_rate": 2.793546829156485e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0056807128712534904,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0454545454545454,
|
||
|
|
"grad_norm": 0.44147813997772384,
|
||
|
|
"learning_rate": 2.7831311137840252e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005066130310297012,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0568181818181817,
|
||
|
|
"grad_norm": 0.4868436752847804,
|
||
|
|
"learning_rate": 2.7726902513862572e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005180031061172485,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0681818181818183,
|
||
|
|
"grad_norm": 0.4703455643178739,
|
||
|
|
"learning_rate": 2.7622245772284086e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005173949059098959,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0795454545454546,
|
||
|
|
"grad_norm": 0.4589542967650583,
|
||
|
|
"learning_rate": 2.7517344273724344e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005243333056569099,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.090909090909091,
|
||
|
|
"grad_norm": 0.5350742778980194,
|
||
|
|
"learning_rate": 2.7412201386662247e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004697371739894152,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.102272727272727,
|
||
|
|
"grad_norm": 0.5625452878348347,
|
||
|
|
"learning_rate": 2.7306820487327906e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005057926289737225,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1136363636363638,
|
||
|
|
"grad_norm": 0.568364687523711,
|
||
|
|
"learning_rate": 2.72012049595942e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005759899970144033,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.125,
|
||
|
|
"grad_norm": 0.44891504727695053,
|
||
|
|
"learning_rate": 2.7095358194868146e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005599320400506258,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1363636363636362,
|
||
|
|
"grad_norm": 0.5156317883549054,
|
||
|
|
"learning_rate": 2.698928359198197e-05,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006571016274392605,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.147727272727273,
|
||
|
|
"grad_norm": 0.4723862638867974,
|
||
|
|
"learning_rate": 2.6882984557083987e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005237076431512833,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.159090909090909,
|
||
|
|
"grad_norm": 0.49229432271457835,
|
||
|
|
"learning_rate": 2.677646450352923e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005284374579787254,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1704545454545454,
|
||
|
|
"grad_norm": 0.5191319845295057,
|
||
|
|
"learning_rate": 2.6669726851769814e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00592380203306675,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1818181818181817,
|
||
|
|
"grad_norm": 0.49651390447555893,
|
||
|
|
"learning_rate": 2.656277502924514e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005676039028912783,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1931818181818183,
|
||
|
|
"grad_norm": 0.5759910974149394,
|
||
|
|
"learning_rate": 2.6455612470271805e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0051645804196596146,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2045454545454546,
|
||
|
|
"grad_norm": 0.44210035521717134,
|
||
|
|
"learning_rate": 2.6348242615933348e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005482735577970743,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.215909090909091,
|
||
|
|
"grad_norm": 0.5014548645558844,
|
||
|
|
"learning_rate": 2.6240668913969743e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005730169825255871,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.227272727272727,
|
||
|
|
"grad_norm": 0.5516450136062995,
|
||
|
|
"learning_rate": 2.613289481866669e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005764037370681763,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2386363636363638,
|
||
|
|
"grad_norm": 0.510217197458097,
|
||
|
|
"learning_rate": 2.6024923790744686e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005522026680409908,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.25,
|
||
|
|
"grad_norm": 0.4756969768654939,
|
||
|
|
"learning_rate": 2.5916759297247917e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0053534796461462975,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2613636363636362,
|
||
|
|
"grad_norm": 0.4956430258999987,
|
||
|
|
"learning_rate": 2.5808404811432918e-05,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00593971461057663,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2727272727272725,
|
||
|
|
"grad_norm": 0.5115033693347459,
|
||
|
|
"learning_rate": 2.5699863812657033e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005368461832404137,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.284090909090909,
|
||
|
|
"grad_norm": 0.48357473176188076,
|
||
|
|
"learning_rate": 2.5591139786266705e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006003449205309153,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2954545454545454,
|
||
|
|
"grad_norm": 0.48157888085736633,
|
||
|
|
"learning_rate": 2.5482236223485557e-05,
|
||
|
|
"loss": 0.0058,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005555244162678719,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3068181818181817,
|
||
|
|
"grad_norm": 0.5030137636028127,
|
||
|
|
"learning_rate": 2.537315662130228e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0054417382925748825,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3181818181818183,
|
||
|
|
"grad_norm": 0.4134162169911966,
|
||
|
|
"learning_rate": 2.5263904482358353e-05,
|
||
|
|
"loss": 0.0057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005619807168841362,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3295454545454546,
|
||
|
|
"grad_norm": 0.5027447195755913,
|
||
|
|
"learning_rate": 2.515448331483555e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005433700978755951,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.340909090909091,
|
||
|
|
"grad_norm": 0.467936804280616,
|
||
|
|
"learning_rate": 2.5044896632343303e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005376646760851145,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3522727272727275,
|
||
|
|
"grad_norm": 0.41322566194503085,
|
||
|
|
"learning_rate": 2.493514795380587e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005562332924455404,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3636363636363638,
|
||
|
|
"grad_norm": 0.45750092579039,
|
||
|
|
"learning_rate": 2.4825240803349368e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004756518639624119,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.375,
|
||
|
|
"grad_norm": 0.6873963855415671,
|
||
|
|
"learning_rate": 2.471517871018855e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005281743593513966,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3863636363636362,
|
||
|
|
"grad_norm": 0.6674523186854076,
|
||
|
|
"learning_rate": 2.4604965208513535e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004752852022647858,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3977272727272725,
|
||
|
|
"grad_norm": 0.5773344008782416,
|
||
|
|
"learning_rate": 2.44946038373763e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005206132307648659,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.409090909090909,
|
||
|
|
"grad_norm": 0.4950055384001163,
|
||
|
|
"learning_rate": 2.4384098140577048e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005069325678050518,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4204545454545454,
|
||
|
|
"grad_norm": 0.538413334858411,
|
||
|
|
"learning_rate": 2.4273451666550382e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005308193154633045,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4318181818181817,
|
||
|
|
"grad_norm": 0.5286702440577258,
|
||
|
|
"learning_rate": 2.4162667968251414e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005153653211891651,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4431818181818183,
|
||
|
|
"grad_norm": 0.760543291483049,
|
||
|
|
"learning_rate": 2.4051750603041623e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005438433028757572,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4545454545454546,
|
||
|
|
"grad_norm": 0.5655174560371635,
|
||
|
|
"learning_rate": 2.3940703132574664e-05,
|
||
|
|
"loss": 0.005,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005072030238807201,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.465909090909091,
|
||
|
|
"grad_norm": 0.5099001384397973,
|
||
|
|
"learning_rate": 2.3829529122681977e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005368282087147236,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4772727272727275,
|
||
|
|
"grad_norm": 0.6638656107593446,
|
||
|
|
"learning_rate": 2.3718232143258296e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005637164227664471,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4886363636363638,
|
||
|
|
"grad_norm": 0.5062124762616506,
|
||
|
|
"learning_rate": 2.360681576814702e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0052521200850605965,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5,
|
||
|
|
"grad_norm": 0.49539638807298375,
|
||
|
|
"learning_rate": 2.3495283575025445e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0055895233526825905,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5113636363636362,
|
||
|
|
"grad_norm": 0.6454536011583725,
|
||
|
|
"learning_rate": 2.3383639145289882e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005695558153092861,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5227272727272725,
|
||
|
|
"grad_norm": 0.526026069637933,
|
||
|
|
"learning_rate": 2.3271886063940655e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005132297985255718,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.534090909090909,
|
||
|
|
"grad_norm": 0.5227530933770529,
|
||
|
|
"learning_rate": 2.3160027919467e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00547978188842535,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5454545454545454,
|
||
|
|
"grad_norm": 0.48848328218670495,
|
||
|
|
"learning_rate": 2.3048068303731808e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0051205093041062355,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5568181818181817,
|
||
|
|
"grad_norm": 0.503096615264489,
|
||
|
|
"learning_rate": 2.2936010811856302e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005164875648915768,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5681818181818183,
|
||
|
|
"grad_norm": 0.45980820751699264,
|
||
|
|
"learning_rate": 2.2823859042104596e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005503524094820023,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5795454545454546,
|
||
|
|
"grad_norm": 0.5453015097037337,
|
||
|
|
"learning_rate": 2.2711616595768157e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006264137104153633,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.590909090909091,
|
||
|
|
"grad_norm": 0.5045015958268411,
|
||
|
|
"learning_rate": 2.259928707705015e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004797403700649738,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6022727272727275,
|
||
|
|
"grad_norm": 0.5215245608963981,
|
||
|
|
"learning_rate": 2.2486874092949708e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005274273455142975,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6136363636363638,
|
||
|
|
"grad_norm": 0.5073325241457628,
|
||
|
|
"learning_rate": 2.2374381253146105e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005827459041029215,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.625,
|
||
|
|
"grad_norm": 0.4453123475733621,
|
||
|
|
"learning_rate": 2.226181216988287e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005131516605615616,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6363636363636362,
|
||
|
|
"grad_norm": 0.49077347211976835,
|
||
|
|
"learning_rate": 2.2149170457851767e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005897030234336853,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6477272727272725,
|
||
|
|
"grad_norm": 0.5000844953326685,
|
||
|
|
"learning_rate": 2.2036459734076715e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005255206488072872,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.659090909090909,
|
||
|
|
"grad_norm": 0.5080410565430957,
|
||
|
|
"learning_rate": 2.1923683617797685e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005152590572834015,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6704545454545454,
|
||
|
|
"grad_norm": 0.5725176074174163,
|
||
|
|
"learning_rate": 2.1810845730354458e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005088301375508308,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6818181818181817,
|
||
|
|
"grad_norm": 0.5247563512602053,
|
||
|
|
"learning_rate": 2.1697949695070326e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005536783020943403,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6931818181818183,
|
||
|
|
"grad_norm": 0.5441938244723411,
|
||
|
|
"learning_rate": 2.158499913713577e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005170764867216349,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7045454545454546,
|
||
|
|
"grad_norm": 0.4881480951657752,
|
||
|
|
"learning_rate": 2.1471997683492036e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005929634906351566,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.715909090909091,
|
||
|
|
"grad_norm": 0.4585975933093515,
|
||
|
|
"learning_rate": 2.1358948962714684e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005046064965426922,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7272727272727275,
|
||
|
|
"grad_norm": 0.4957264745476166,
|
||
|
|
"learning_rate": 2.1245856604897045e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005352713167667389,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7386363636363638,
|
||
|
|
"grad_norm": 0.49369365983375335,
|
||
|
|
"learning_rate": 2.1132724241533692e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005089196376502514,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.75,
|
||
|
|
"grad_norm": 0.4433946031787883,
|
||
|
|
"learning_rate": 2.10195555054038e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004971848800778389,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7613636363636362,
|
||
|
|
"grad_norm": 0.43875784131352313,
|
||
|
|
"learning_rate": 2.0906354030454515e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005333467852324247,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7727272727272725,
|
||
|
|
"grad_norm": 0.48730070089270655,
|
||
|
|
"learning_rate": 2.0793123451684248e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.006050648633390665,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.784090909090909,
|
||
|
|
"grad_norm": 0.4868598147009363,
|
||
|
|
"learning_rate": 2.0679867405025956e-05,
|
||
|
|
"loss": 0.005,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005375531502068043,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7954545454545454,
|
||
|
|
"grad_norm": 0.5131921474798169,
|
||
|
|
"learning_rate": 2.0566589527230404e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005499294959008694,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8068181818181817,
|
||
|
|
"grad_norm": 0.513695849129778,
|
||
|
|
"learning_rate": 2.045329345574936e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0052980948239564896,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8181818181818183,
|
||
|
|
"grad_norm": 0.43983207507502864,
|
||
|
|
"learning_rate": 2.0339982828618826e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005142249166965485,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8295454545454546,
|
||
|
|
"grad_norm": 0.4816084292762279,
|
||
|
|
"learning_rate": 2.0226661284342168e-05,
|
||
|
|
"loss": 0.0056,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0055004809983074665,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.840909090909091,
|
||
|
|
"grad_norm": 0.5086414719655078,
|
||
|
|
"learning_rate": 2.0113332461773344e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005008528009057045,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8522727272727275,
|
||
|
|
"grad_norm": 0.4971288284433117,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005218389444053173,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8636363636363638,
|
||
|
|
"grad_norm": 0.567227239435726,
|
||
|
|
"learning_rate": 1.9886667538226663e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004845334216952324,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.875,
|
||
|
|
"grad_norm": 0.5698861051705016,
|
||
|
|
"learning_rate": 1.977333871565784e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005431466735899448,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8863636363636362,
|
||
|
|
"grad_norm": 0.4338750390299288,
|
||
|
|
"learning_rate": 1.966001717138118e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004926356021314859,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8977272727272725,
|
||
|
|
"grad_norm": 0.462109221054352,
|
||
|
|
"learning_rate": 1.9546706544250646e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005377571564167738,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.909090909090909,
|
||
|
|
"grad_norm": 0.5478730694730507,
|
||
|
|
"learning_rate": 1.94334104727696e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005751037038862705,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9204545454545454,
|
||
|
|
"grad_norm": 0.49172708537055365,
|
||
|
|
"learning_rate": 1.9320132594974047e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005645253695547581,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9318181818181817,
|
||
|
|
"grad_norm": 0.4861367517505634,
|
||
|
|
"learning_rate": 1.9206876548315755e-05,
|
||
|
|
"loss": 0.0054,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0062632085755467415,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9431818181818183,
|
||
|
|
"grad_norm": 0.4573713828290483,
|
||
|
|
"learning_rate": 1.9093645969545488e-05,
|
||
|
|
"loss": 0.0053,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0047844573855400085,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9545454545454546,
|
||
|
|
"grad_norm": 0.45821343473702675,
|
||
|
|
"learning_rate": 1.89804444945962e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0054860832169651985,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.965909090909091,
|
||
|
|
"grad_norm": 0.4528162239732377,
|
||
|
|
"learning_rate": 1.886727575846631e-05,
|
||
|
|
"loss": 0.0055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005397276021540165,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9772727272727275,
|
||
|
|
"grad_norm": 0.5060106483369168,
|
||
|
|
"learning_rate": 1.8754143395102958e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005089904181659222,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9886363636363638,
|
||
|
|
"grad_norm": 0.5284103556238565,
|
||
|
|
"learning_rate": 1.8641051037285322e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005315642338246107,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 0.47190256941707953,
|
||
|
|
"learning_rate": 1.8528002316507964e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004887085407972336,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.011363636363637,
|
||
|
|
"grad_norm": 0.48472187640003456,
|
||
|
|
"learning_rate": 1.841500086286423e-05,
|
||
|
|
"loss": 0.0051,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004958770237863064,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0227272727272725,
|
||
|
|
"grad_norm": 0.45640574799751016,
|
||
|
|
"learning_rate": 1.8302050304929674e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004390102811157703,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.034090909090909,
|
||
|
|
"grad_norm": 0.4518139808678402,
|
||
|
|
"learning_rate": 1.8189154269645552e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004417848773300648,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.045454545454546,
|
||
|
|
"grad_norm": 0.40561588123882186,
|
||
|
|
"learning_rate": 1.8076316382202325e-05,
|
||
|
|
"loss": 0.0052,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005202982574701309,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.056818181818182,
|
||
|
|
"grad_norm": 0.43494240647897997,
|
||
|
|
"learning_rate": 1.7963540265923298e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004180293995887041,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.068181818181818,
|
||
|
|
"grad_norm": 0.4664562873838021,
|
||
|
|
"learning_rate": 1.7850829542148247e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004639240447431803,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.079545454545454,
|
||
|
|
"grad_norm": 0.46596572391893,
|
||
|
|
"learning_rate": 1.7738187830117134e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00412036431953311,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.090909090909091,
|
||
|
|
"grad_norm": 0.5069307809682906,
|
||
|
|
"learning_rate": 1.7625618746853902e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004826963413506746,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1022727272727275,
|
||
|
|
"grad_norm": 0.4895720926847578,
|
||
|
|
"learning_rate": 1.7513125907050302e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004888975061476231,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.113636363636363,
|
||
|
|
"grad_norm": 0.5051048378547827,
|
||
|
|
"learning_rate": 1.7400712922949854e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004236047621816397,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.125,
|
||
|
|
"grad_norm": 0.5571096419805928,
|
||
|
|
"learning_rate": 1.7288383404231846e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0047796377912163734,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.136363636363637,
|
||
|
|
"grad_norm": 0.45493846119105674,
|
||
|
|
"learning_rate": 1.7176140957895407e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004705213941633701,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1477272727272725,
|
||
|
|
"grad_norm": 0.4916686978737417,
|
||
|
|
"learning_rate": 1.7063989188143705e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004946434870362282,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.159090909090909,
|
||
|
|
"grad_norm": 0.4603566357905374,
|
||
|
|
"learning_rate": 1.69519316962682e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004155574832111597,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.170454545454546,
|
||
|
|
"grad_norm": 0.510700506891165,
|
||
|
|
"learning_rate": 1.6839972080533004e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004701807163655758,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.181818181818182,
|
||
|
|
"grad_norm": 0.44565214813742443,
|
||
|
|
"learning_rate": 1.6728113936059348e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00458088144659996,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.193181818181818,
|
||
|
|
"grad_norm": 0.45815885091272146,
|
||
|
|
"learning_rate": 1.6616360854710125e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0051234508864581585,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.204545454545454,
|
||
|
|
"grad_norm": 0.4512118075618576,
|
||
|
|
"learning_rate": 1.650471642497456e-05,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0044530099257826805,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.215909090909091,
|
||
|
|
"grad_norm": 0.508469582365544,
|
||
|
|
"learning_rate": 1.6393184231852983e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004900037776678801,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2272727272727275,
|
||
|
|
"grad_norm": 0.4983797085004779,
|
||
|
|
"learning_rate": 1.628176785674171e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004549206234514713,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.238636363636363,
|
||
|
|
"grad_norm": 0.42098183590674565,
|
||
|
|
"learning_rate": 1.617047087731803e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004931753501296043,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.25,
|
||
|
|
"grad_norm": 0.47011012903179167,
|
||
|
|
"learning_rate": 1.6059296867425343e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004656555131077766,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.261363636363637,
|
||
|
|
"grad_norm": 0.5039062140230725,
|
||
|
|
"learning_rate": 1.5948249396958384e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004590118303894997,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2727272727272725,
|
||
|
|
"grad_norm": 0.46478425486476865,
|
||
|
|
"learning_rate": 1.583733203174859e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004690377973020077,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.284090909090909,
|
||
|
|
"grad_norm": 0.5166163137898677,
|
||
|
|
"learning_rate": 1.572654833344962e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0049637118354439735,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.295454545454546,
|
||
|
|
"grad_norm": 0.4327053524714821,
|
||
|
|
"learning_rate": 1.5615901859422956e-05,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003967207856476307,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.306818181818182,
|
||
|
|
"grad_norm": 0.44232960295306795,
|
||
|
|
"learning_rate": 1.55053961626237e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004847138654440641,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.318181818181818,
|
||
|
|
"grad_norm": 0.43411523827396115,
|
||
|
|
"learning_rate": 1.539503479148647e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0044979555532336235,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.329545454545454,
|
||
|
|
"grad_norm": 0.5142155437981181,
|
||
|
|
"learning_rate": 1.5284821289811453e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004444721154868603,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.340909090909091,
|
||
|
|
"grad_norm": 0.43790381398256156,
|
||
|
|
"learning_rate": 1.5174759196650637e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004545005038380623,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3522727272727275,
|
||
|
|
"grad_norm": 0.5304461755784035,
|
||
|
|
"learning_rate": 1.5064852046194127e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00425495533272624,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.363636363636363,
|
||
|
|
"grad_norm": 0.4765324199160083,
|
||
|
|
"learning_rate": 1.49551033676567e-05,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00418436573818326,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.375,
|
||
|
|
"grad_norm": 0.4668901200715659,
|
||
|
|
"learning_rate": 1.484551668516446e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00447133369743824,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.386363636363637,
|
||
|
|
"grad_norm": 0.4839851996431911,
|
||
|
|
"learning_rate": 1.4736095517641654e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0048002987168729305,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.3977272727272725,
|
||
|
|
"grad_norm": 0.4708374971637072,
|
||
|
|
"learning_rate": 1.4626843378697725e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004776729270815849,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.409090909090909,
|
||
|
|
"grad_norm": 0.5398596900335262,
|
||
|
|
"learning_rate": 1.4517763776514453e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005047261249274015,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.420454545454546,
|
||
|
|
"grad_norm": 0.47231641489671194,
|
||
|
|
"learning_rate": 1.4408860213733307e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004825092852115631,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.431818181818182,
|
||
|
|
"grad_norm": 0.5108128823483871,
|
||
|
|
"learning_rate": 1.4300136187342979e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004371874965727329,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.443181818181818,
|
||
|
|
"grad_norm": 0.5286634132329414,
|
||
|
|
"learning_rate": 1.4191595188567089e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004658691585063934,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.454545454545454,
|
||
|
|
"grad_norm": 0.4843256885719304,
|
||
|
|
"learning_rate": 1.4083240702752088e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005307194776833057,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.465909090909091,
|
||
|
|
"grad_norm": 0.4989681511625817,
|
||
|
|
"learning_rate": 1.3975076209255321e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004890813957899809,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4772727272727275,
|
||
|
|
"grad_norm": 0.45336140574772227,
|
||
|
|
"learning_rate": 1.3867105181333318e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004235866479575634,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.488636363636363,
|
||
|
|
"grad_norm": 0.49618410367541027,
|
||
|
|
"learning_rate": 1.375933108603026e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004726336803287268,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5,
|
||
|
|
"grad_norm": 0.4994256624165742,
|
||
|
|
"learning_rate": 1.3651757384066657e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005034084431827068,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.511363636363637,
|
||
|
|
"grad_norm": 0.754764822996127,
|
||
|
|
"learning_rate": 1.3544387529728203e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005025540478527546,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5227272727272725,
|
||
|
|
"grad_norm": 0.4817031705205338,
|
||
|
|
"learning_rate": 1.3437224970754865e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00449532363563776,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.534090909090909,
|
||
|
|
"grad_norm": 0.46803533246798695,
|
||
|
|
"learning_rate": 1.3330273148230191e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005232417024672031,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.545454545454545,
|
||
|
|
"grad_norm": 0.5153722925635681,
|
||
|
|
"learning_rate": 1.3223535496470775e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004821739625185728,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.556818181818182,
|
||
|
|
"grad_norm": 0.5940621871477129,
|
||
|
|
"learning_rate": 1.3117015442916014e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004900893196463585,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.568181818181818,
|
||
|
|
"grad_norm": 0.4550172963418441,
|
||
|
|
"learning_rate": 1.3010716408018037e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004688935354351997,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.579545454545455,
|
||
|
|
"grad_norm": 0.47422795098695947,
|
||
|
|
"learning_rate": 1.290464180513186e-05,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004211904481053352,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.590909090909091,
|
||
|
|
"grad_norm": 0.5140792798658659,
|
||
|
|
"learning_rate": 1.2798795040405804e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004538238048553467,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6022727272727275,
|
||
|
|
"grad_norm": 0.4878774165154403,
|
||
|
|
"learning_rate": 1.26931795126721e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004598071798682213,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.613636363636363,
|
||
|
|
"grad_norm": 0.48436717807791557,
|
||
|
|
"learning_rate": 1.2587798613337758e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00513102812692523,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.625,
|
||
|
|
"grad_norm": 0.47254354884528127,
|
||
|
|
"learning_rate": 1.2482655726275661e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004465391859412193,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.636363636363637,
|
||
|
|
"grad_norm": 0.5246478326600279,
|
||
|
|
"learning_rate": 1.2377754227715915e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0045776148326694965,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6477272727272725,
|
||
|
|
"grad_norm": 0.5036628427479677,
|
||
|
|
"learning_rate": 1.2273097486137426e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005189564544707537,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.659090909090909,
|
||
|
|
"grad_norm": 0.5314651871443347,
|
||
|
|
"learning_rate": 1.2168688862159748e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004871468059718609,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.670454545454545,
|
||
|
|
"grad_norm": 0.5244145840347939,
|
||
|
|
"learning_rate": 1.2064531708435148e-05,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005132294725626707,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.681818181818182,
|
||
|
|
"grad_norm": 0.4988764429098378,
|
||
|
|
"learning_rate": 1.1960629369540974e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004751698113977909,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.693181818181818,
|
||
|
|
"grad_norm": 0.5100604166698235,
|
||
|
|
"learning_rate": 1.1856985181872242e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004565688315778971,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.704545454545455,
|
||
|
|
"grad_norm": 0.5012009580100381,
|
||
|
|
"learning_rate": 1.1753602473534514e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00409556832164526,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.715909090909091,
|
||
|
|
"grad_norm": 0.5540406420029943,
|
||
|
|
"learning_rate": 1.165048456423701e-05,
|
||
|
|
"loss": 0.0049,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00481591047719121,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7272727272727275,
|
||
|
|
"grad_norm": 0.4351638073930919,
|
||
|
|
"learning_rate": 1.1547634765186016e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004355187993496656,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.738636363636363,
|
||
|
|
"grad_norm": 0.4488120065119166,
|
||
|
|
"learning_rate": 1.1445056378978588e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004363670013844967,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.75,
|
||
|
|
"grad_norm": 0.4589274493774212,
|
||
|
|
"learning_rate": 1.1342752699496452e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00424697482958436,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.761363636363637,
|
||
|
|
"grad_norm": 0.4890203769911914,
|
||
|
|
"learning_rate": 1.1240727011800288e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004736714530736208,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7727272727272725,
|
||
|
|
"grad_norm": 0.4644162243387919,
|
||
|
|
"learning_rate": 1.113898259202419e-05,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004376566037535667,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.784090909090909,
|
||
|
|
"grad_norm": 0.4805824813519966,
|
||
|
|
"learning_rate": 1.1037522707270498e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00481122313067317,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.795454545454545,
|
||
|
|
"grad_norm": 0.5027825095032938,
|
||
|
|
"learning_rate": 1.093635061550491e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005034390836954117,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.806818181818182,
|
||
|
|
"grad_norm": 0.47263420063341305,
|
||
|
|
"learning_rate": 1.0835469565451792e-05,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004725735634565353,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.818181818181818,
|
||
|
|
"grad_norm": 0.5278598392908539,
|
||
|
|
"learning_rate": 1.073488279648996e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004499510396271944,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.829545454545455,
|
||
|
|
"grad_norm": 0.432737142814124,
|
||
|
|
"learning_rate": 1.063459353854856e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00414951192215085,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.840909090909091,
|
||
|
|
"grad_norm": 0.5225898856178574,
|
||
|
|
"learning_rate": 1.0534605012003429e-05,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004078148864209652,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8522727272727275,
|
||
|
|
"grad_norm": 0.4888653089966061,
|
||
|
|
"learning_rate": 1.0434920427573643e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004762607626616955,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.863636363636363,
|
||
|
|
"grad_norm": 0.5078078118944286,
|
||
|
|
"learning_rate": 1.033554298621842e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004519781097769737,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.875,
|
||
|
|
"grad_norm": 0.44860763334302267,
|
||
|
|
"learning_rate": 1.023647587903438e-05,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00418046023696661,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.886363636363637,
|
||
|
|
"grad_norm": 0.4998834471053691,
|
||
|
|
"learning_rate": 1.0137722287152995e-05,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004569866694509983,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8977272727272725,
|
||
|
|
"grad_norm": 0.4513901427857684,
|
||
|
|
"learning_rate": 1.0039285381638532e-05,
|
||
|
|
"loss": 0.0043,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003977802582085133,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.909090909090909,
|
||
|
|
"grad_norm": 0.4700040130496916,
|
||
|
|
"learning_rate": 9.941168323386146e-06,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004274777136743069,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.920454545454545,
|
||
|
|
"grad_norm": 0.47113082188024086,
|
||
|
|
"learning_rate": 9.84337426302044e-06,
|
||
|
|
"loss": 0.0048,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004777199123054743,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.931818181818182,
|
||
|
|
"grad_norm": 0.5158588495111642,
|
||
|
|
"learning_rate": 9.74590634079425e-06,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.005088199861347675,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.943181818181818,
|
||
|
|
"grad_norm": 0.45688857870688926,
|
||
|
|
"learning_rate": 9.648767686487859e-06,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004706758540123701,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.954545454545455,
|
||
|
|
"grad_norm": 0.4884644914477628,
|
||
|
|
"learning_rate": 9.551961419308447e-06,
|
||
|
|
"loss": 0.0047,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004676604177802801,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.965909090909091,
|
||
|
|
"grad_norm": 0.4877508088408542,
|
||
|
|
"learning_rate": 9.455490647789951e-06,
|
||
|
|
"loss": 0.0045,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004318420775234699,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9772727272727275,
|
||
|
|
"grad_norm": 0.47255263332017355,
|
||
|
|
"learning_rate": 9.359358469693272e-06,
|
||
|
|
"loss": 0.0044,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004815556574612856,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.988636363636363,
|
||
|
|
"grad_norm": 0.4921165200445906,
|
||
|
|
"learning_rate": 9.263567971906748e-06,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004798836074769497,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.463181639474181,
|
||
|
|
"learning_rate": 9.168122230347098e-06,
|
||
|
|
"loss": 0.0046,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0045855361968278885,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.011363636363637,
|
||
|
|
"grad_norm": 0.4775346746140353,
|
||
|
|
"learning_rate": 9.073024309860583e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003912698477506638,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0227272727272725,
|
||
|
|
"grad_norm": 0.48677451287350915,
|
||
|
|
"learning_rate": 8.978277264124655e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00385481771081686,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.034090909090909,
|
||
|
|
"grad_norm": 0.5388158919478782,
|
||
|
|
"learning_rate": 8.883884135549852e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004276004619896412,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.045454545454546,
|
||
|
|
"grad_norm": 0.4938703333172897,
|
||
|
|
"learning_rate": 8.789847955182118e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004115547519177198,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.056818181818182,
|
||
|
|
"grad_norm": 0.5308417170790901,
|
||
|
|
"learning_rate": 8.696171742605508e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003737162798643112,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.068181818181818,
|
||
|
|
"grad_norm": 0.47687032934795565,
|
||
|
|
"learning_rate": 8.602858505845149e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003796251490712166,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.079545454545454,
|
||
|
|
"grad_norm": 0.4603724428344066,
|
||
|
|
"learning_rate": 8.509911241270741e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0042653316631913185,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.090909090909091,
|
||
|
|
"grad_norm": 0.46374146272269046,
|
||
|
|
"learning_rate": 8.417332933500267e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003799492260441184,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.1022727272727275,
|
||
|
|
"grad_norm": 0.4379222280104592,
|
||
|
|
"learning_rate": 8.325126555304208e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004223274067044258,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.113636363636363,
|
||
|
|
"grad_norm": 0.42989156225993835,
|
||
|
|
"learning_rate": 8.233295067510036e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0036610844545066357,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.125,
|
||
|
|
"grad_norm": 0.4672672088909349,
|
||
|
|
"learning_rate": 8.141841418907194e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0040005873888731,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.136363636363637,
|
||
|
|
"grad_norm": 0.5196079952334239,
|
||
|
|
"learning_rate": 8.050768546152352e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004299596417695284,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.1477272727272725,
|
||
|
|
"grad_norm": 0.4882718572601622,
|
||
|
|
"learning_rate": 7.960079373675135e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004049581941217184,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.159090909090909,
|
||
|
|
"grad_norm": 0.4239583484917987,
|
||
|
|
"learning_rate": 7.86977681358424e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003955259453505278,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.170454545454546,
|
||
|
|
"grad_norm": 0.42940582569735847,
|
||
|
|
"learning_rate": 7.779863765573874e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003957828506827354,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.181818181818182,
|
||
|
|
"grad_norm": 0.41235189838752484,
|
||
|
|
"learning_rate": 7.690343116830696e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0036221854388713837,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.193181818181818,
|
||
|
|
"grad_norm": 0.5160204137531623,
|
||
|
|
"learning_rate": 7.601217741941054e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003955465741455555,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.204545454545454,
|
||
|
|
"grad_norm": 0.4090449076523376,
|
||
|
|
"learning_rate": 7.512490502798735e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035659437999129295,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.215909090909091,
|
||
|
|
"grad_norm": 0.4594919078099957,
|
||
|
|
"learning_rate": 7.424164248513017e-06,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0042147571220994,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.2272727272727275,
|
||
|
|
"grad_norm": 0.5866972843890855,
|
||
|
|
"learning_rate": 7.336241815317207e-06,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004257405176758766,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.238636363636363,
|
||
|
|
"grad_norm": 0.47038298659526956,
|
||
|
|
"learning_rate": 7.248726026477577e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004454101901501417,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.25,
|
||
|
|
"grad_norm": 0.4737403845524661,
|
||
|
|
"learning_rate": 7.161619692202675e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004023617133498192,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.261363636363637,
|
||
|
|
"grad_norm": 0.572706506761817,
|
||
|
|
"learning_rate": 7.0749256095531225e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003950290381908417,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.2727272727272725,
|
||
|
|
"grad_norm": 0.5088444958237405,
|
||
|
|
"learning_rate": 6.98864656235176e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004037501290440559,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.284090909090909,
|
||
|
|
"grad_norm": 0.4596423996293368,
|
||
|
|
"learning_rate": 6.902785321094301e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003735282924026251,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.295454545454546,
|
||
|
|
"grad_norm": 0.5471422831251165,
|
||
|
|
"learning_rate": 6.817344642860322e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004045432899147272,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.306818181818182,
|
||
|
|
"grad_norm": 0.47656353430758264,
|
||
|
|
"learning_rate": 6.732327271224759e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003932539373636246,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.318181818181818,
|
||
|
|
"grad_norm": 0.5069088879062458,
|
||
|
|
"learning_rate": 6.647735936169819e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0042952438816428185,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.329545454545454,
|
||
|
|
"grad_norm": 0.452653428641188,
|
||
|
|
"learning_rate": 6.563573353997275e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003908766433596611,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.340909090909091,
|
||
|
|
"grad_norm": 0.561631782902141,
|
||
|
|
"learning_rate": 6.479842227241293e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004032565746456385,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.3522727272727275,
|
||
|
|
"grad_norm": 0.5637937378243737,
|
||
|
|
"learning_rate": 6.396545244581609e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003837024327367544,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.363636363636363,
|
||
|
|
"grad_norm": 0.5419651557365305,
|
||
|
|
"learning_rate": 6.313685080757235e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004137341398745775,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.375,
|
||
|
|
"grad_norm": 0.46979208485127416,
|
||
|
|
"learning_rate": 6.231264396480525e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037767025642096996,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.386363636363637,
|
||
|
|
"grad_norm": 0.42580803822443575,
|
||
|
|
"learning_rate": 6.149285838351766e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0038591676857322454,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.3977272727272725,
|
||
|
|
"grad_norm": 0.47386845448197756,
|
||
|
|
"learning_rate": 6.067752038774204e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037694619968533516,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.409090909090909,
|
||
|
|
"grad_norm": 0.5066531405838955,
|
||
|
|
"learning_rate": 5.986665615869478e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004015166778117418,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.420454545454546,
|
||
|
|
"grad_norm": 0.48671736821053846,
|
||
|
|
"learning_rate": 5.9060291733935795e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004223925527185202,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.431818181818182,
|
||
|
|
"grad_norm": 0.4638681849494172,
|
||
|
|
"learning_rate": 5.825845300653226e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004192112945020199,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.443181818181818,
|
||
|
|
"grad_norm": 0.5279251504983492,
|
||
|
|
"learning_rate": 5.7461165724227485e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004224070347845554,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.454545454545454,
|
||
|
|
"grad_norm": 0.5904197194538235,
|
||
|
|
"learning_rate": 5.666845548861357e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004307609051465988,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.465909090909091,
|
||
|
|
"grad_norm": 0.42076809077017335,
|
||
|
|
"learning_rate": 5.588034775430997e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037920165341347456,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.4772727272727275,
|
||
|
|
"grad_norm": 0.4334488783489292,
|
||
|
|
"learning_rate": 5.509686782814547e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003798619145527482,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.488636363636363,
|
||
|
|
"grad_norm": 0.42421498211242736,
|
||
|
|
"learning_rate": 5.43180408683462e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035140167456120253,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.5,
|
||
|
|
"grad_norm": 0.45563930175821943,
|
||
|
|
"learning_rate": 5.354389188372726e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003876676084473729,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.511363636363637,
|
||
|
|
"grad_norm": 0.47371651404852366,
|
||
|
|
"learning_rate": 5.277444573288986e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003940898925065994,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.5227272727272725,
|
||
|
|
"grad_norm": 0.5103411333783517,
|
||
|
|
"learning_rate": 5.200972712342327e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003949492704123259,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.534090909090909,
|
||
|
|
"grad_norm": 0.44196664518497514,
|
||
|
|
"learning_rate": 5.124976061111109e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004002627916634083,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.545454545454545,
|
||
|
|
"grad_norm": 0.4650969881730076,
|
||
|
|
"learning_rate": 5.049457059914311e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0036511081270873547,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.556818181818182,
|
||
|
|
"grad_norm": 0.4774200733409531,
|
||
|
|
"learning_rate": 4.97441813373313e-06,
|
||
|
|
"loss": 0.0042,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004019967745989561,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.568181818181818,
|
||
|
|
"grad_norm": 0.45537744448489115,
|
||
|
|
"learning_rate": 4.899861692133157e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003809658344835043,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.579545454545455,
|
||
|
|
"grad_norm": 0.47362155545724277,
|
||
|
|
"learning_rate": 4.82579012918696e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004073746502399445,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.590909090909091,
|
||
|
|
"grad_norm": 0.5251594108649331,
|
||
|
|
"learning_rate": 4.752205823397236e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004047977738082409,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.6022727272727275,
|
||
|
|
"grad_norm": 0.5288297766505775,
|
||
|
|
"learning_rate": 4.679111137620442e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0039558978751301765,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.613636363636363,
|
||
|
|
"grad_norm": 0.4280018292666887,
|
||
|
|
"learning_rate": 4.606508418990885e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003443428548052907,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.625,
|
||
|
|
"grad_norm": 0.44259978860574006,
|
||
|
|
"learning_rate": 4.534399998845396e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003342390526086092,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.636363636363637,
|
||
|
|
"grad_norm": 0.44742152940306495,
|
||
|
|
"learning_rate": 4.462788192648435e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004139516036957502,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.6477272727272725,
|
||
|
|
"grad_norm": 0.5252500079842295,
|
||
|
|
"learning_rate": 4.391675299917768e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0040503814816474915,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.659090909090909,
|
||
|
|
"grad_norm": 0.5018815074232453,
|
||
|
|
"learning_rate": 4.3210636041506e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0041411444544792175,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.670454545454545,
|
||
|
|
"grad_norm": 0.47739693812599476,
|
||
|
|
"learning_rate": 4.25095537275026e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003971803467720747,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.681818181818182,
|
||
|
|
"grad_norm": 0.47068282022821506,
|
||
|
|
"learning_rate": 4.181352856953418e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004058327525854111,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.693181818181818,
|
||
|
|
"grad_norm": 0.47320248097749434,
|
||
|
|
"learning_rate": 4.112258291757747e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0038510779850184917,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.704545454545455,
|
||
|
|
"grad_norm": 0.42370337463853497,
|
||
|
|
"learning_rate": 4.043673895850202e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003682773793116212,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.715909090909091,
|
||
|
|
"grad_norm": 0.5647985138137753,
|
||
|
|
"learning_rate": 3.975601871535739e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003943915478885174,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.7272727272727275,
|
||
|
|
"grad_norm": 0.483586153068834,
|
||
|
|
"learning_rate": 3.908044404666633e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004055343102663755,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.738636363636363,
|
||
|
|
"grad_norm": 0.41788036471295964,
|
||
|
|
"learning_rate": 3.841003664572251e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003884321078658104,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.75,
|
||
|
|
"grad_norm": 0.4646931177221419,
|
||
|
|
"learning_rate": 3.7744818039894202e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035459804348647594,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.761363636363637,
|
||
|
|
"grad_norm": 0.44142346623289896,
|
||
|
|
"learning_rate": 3.708480958993286e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003818873316049576,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.7727272727272725,
|
||
|
|
"grad_norm": 0.4997631855313708,
|
||
|
|
"learning_rate": 3.6430032489287424e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003779443446546793,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.784090909090909,
|
||
|
|
"grad_norm": 0.47159777299464256,
|
||
|
|
"learning_rate": 3.5780507763423456e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037581464275717735,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.795454545454545,
|
||
|
|
"grad_norm": 0.46863473786391785,
|
||
|
|
"learning_rate": 3.513625626914823e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003644632175564766,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.806818181818182,
|
||
|
|
"grad_norm": 0.47223068697551934,
|
||
|
|
"learning_rate": 3.4497298693941007e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0038760402239859104,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.818181818181818,
|
||
|
|
"grad_norm": 0.445923694612174,
|
||
|
|
"learning_rate": 3.3863655555288564e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003601495875045657,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.829545454545455,
|
||
|
|
"grad_norm": 0.502227263679269,
|
||
|
|
"learning_rate": 3.323534720002659e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003987235948443413,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.840909090909091,
|
||
|
|
"grad_norm": 0.48454308485662256,
|
||
|
|
"learning_rate": 3.261239380368606e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037424368783831596,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.8522727272727275,
|
||
|
|
"grad_norm": 0.479420237247127,
|
||
|
|
"learning_rate": 3.199481536984572e-06,
|
||
|
|
"loss": 0.0041,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0039000785909593105,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.863636363636363,
|
||
|
|
"grad_norm": 0.4412823184524044,
|
||
|
|
"learning_rate": 3.138263172948943e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004030333831906319,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.875,
|
||
|
|
"grad_norm": 0.4820910373910857,
|
||
|
|
"learning_rate": 3.077586254036946e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004166883882135153,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.886363636363637,
|
||
|
|
"grad_norm": 0.4858005469751616,
|
||
|
|
"learning_rate": 3.017452728637553e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004376193042844534,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.8977272727272725,
|
||
|
|
"grad_norm": 0.5576855746273122,
|
||
|
|
"learning_rate": 2.95786452769087e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0044846381060779095,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.909090909090909,
|
||
|
|
"grad_norm": 0.43873863281289766,
|
||
|
|
"learning_rate": 2.898823564626181e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0038866912946105003,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.920454545454545,
|
||
|
|
"grad_norm": 0.42630411046161937,
|
||
|
|
"learning_rate": 2.840331735300459e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030431475024670362,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.931818181818182,
|
||
|
|
"grad_norm": 0.48065659691911583,
|
||
|
|
"learning_rate": 2.7823909179375365e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.004149057436734438,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.943181818181818,
|
||
|
|
"grad_norm": 0.43754591621366506,
|
||
|
|
"learning_rate": 2.7250029730677498e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003820871002972126,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.954545454545455,
|
||
|
|
"grad_norm": 0.501142323658502,
|
||
|
|
"learning_rate": 2.668169743468223e-06,
|
||
|
|
"loss": 0.0038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033863571006804705,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.965909090909091,
|
||
|
|
"grad_norm": 0.5089502622910189,
|
||
|
|
"learning_rate": 2.6118930541036937e-06,
|
||
|
|
"loss": 0.0039,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003630759660154581,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.9772727272727275,
|
||
|
|
"grad_norm": 0.43286653171311407,
|
||
|
|
"learning_rate": 2.556174712067894e-06,
|
||
|
|
"loss": 0.0036,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037029660306870937,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.988636363636363,
|
||
|
|
"grad_norm": 0.48919209995098833,
|
||
|
|
"learning_rate": 2.5010165065255465e-06,
|
||
|
|
"loss": 0.004,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003930555656552315,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.0,
|
||
|
|
"grad_norm": 0.4724403688314844,
|
||
|
|
"learning_rate": 2.4464202086548874e-06,
|
||
|
|
"loss": 0.0037,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003087595570832491,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.011363636363637,
|
||
|
|
"grad_norm": 0.40544027703154945,
|
||
|
|
"learning_rate": 2.3923875715908195e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031802121084183455,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.0227272727272725,
|
||
|
|
"grad_norm": 0.4300091581791728,
|
||
|
|
"learning_rate": 2.338920330368599e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003233799245208502,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.034090909090909,
|
||
|
|
"grad_norm": 0.44336142694552977,
|
||
|
|
"learning_rate": 2.2860202018681153e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031272168271243572,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.045454545454546,
|
||
|
|
"grad_norm": 0.44019659806886935,
|
||
|
|
"learning_rate": 2.2336888847587978e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0029117444064468145,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.056818181818182,
|
||
|
|
"grad_norm": 0.46970548706157467,
|
||
|
|
"learning_rate": 2.1819280594450134e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003628804348409176,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.068181818181818,
|
||
|
|
"grad_norm": 0.47278769393448594,
|
||
|
|
"learning_rate": 2.1307393880121684e-06,
|
||
|
|
"loss": 0.0035,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003231466980651021,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.079545454545454,
|
||
|
|
"grad_norm": 0.4423714301195872,
|
||
|
|
"learning_rate": 2.080124514173285e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033233356662094593,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.090909090909091,
|
||
|
|
"grad_norm": 0.43104394632158005,
|
||
|
|
"learning_rate": 2.0300850632162517e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030674331355839968,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.1022727272727275,
|
||
|
|
"grad_norm": 0.40160017259155645,
|
||
|
|
"learning_rate": 1.9806226419516195e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030664096120744944,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.113636363636363,
|
||
|
|
"grad_norm": 0.42216776130011896,
|
||
|
|
"learning_rate": 1.9317388386610213e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034044866915792227,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.125,
|
||
|
|
"grad_norm": 0.42123285424302964,
|
||
|
|
"learning_rate": 1.8834352230461506e-06,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032480142544955015,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.136363636363637,
|
||
|
|
"grad_norm": 0.4493039199437798,
|
||
|
|
"learning_rate": 1.835713346178376e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032667412888258696,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.1477272727272725,
|
||
|
|
"grad_norm": 0.6019607672429398,
|
||
|
|
"learning_rate": 1.7885747404489162e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030333197209984064,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.159090909090909,
|
||
|
|
"grad_norm": 0.4704060217141435,
|
||
|
|
"learning_rate": 1.7420209195196447e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031178079079836607,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.170454545454546,
|
||
|
|
"grad_norm": 0.43486832123508107,
|
||
|
|
"learning_rate": 1.6960533782744938e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035637228284031153,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.181818181818182,
|
||
|
|
"grad_norm": 0.38586169591086233,
|
||
|
|
"learning_rate": 1.650673592771428e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030642072670161724,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.193181818181818,
|
||
|
|
"grad_norm": 0.4087315375435648,
|
||
|
|
"learning_rate": 1.6058830201950738e-06,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031899097375571728,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.204545454545454,
|
||
|
|
"grad_norm": 0.40513660143064867,
|
||
|
|
"learning_rate": 1.561683098809903e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003212493844330311,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.215909090909091,
|
||
|
|
"grad_norm": 0.38682964161065564,
|
||
|
|
"learning_rate": 1.5180752479140727e-06,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032450587023049593,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.2272727272727275,
|
||
|
|
"grad_norm": 0.39517374775494607,
|
||
|
|
"learning_rate": 1.475060867793827e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032750635873526335,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.238636363636363,
|
||
|
|
"grad_norm": 0.4129318034364053,
|
||
|
|
"learning_rate": 1.4326413396785488e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035757021978497505,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.25,
|
||
|
|
"grad_norm": 0.4388962815801728,
|
||
|
|
"learning_rate": 1.3908180256964078e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003209850285202265,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.261363636363637,
|
||
|
|
"grad_norm": 0.45696896572624973,
|
||
|
|
"learning_rate": 1.3495922688306063e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0037891422398388386,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.2727272727272725,
|
||
|
|
"grad_norm": 0.4336875458153824,
|
||
|
|
"learning_rate": 1.3089653928762758e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034277569502592087,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.284090909090909,
|
||
|
|
"grad_norm": 0.40360943027697554,
|
||
|
|
"learning_rate": 1.268938702397946e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003308095969259739,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.295454545454546,
|
||
|
|
"grad_norm": 0.42925634970034354,
|
||
|
|
"learning_rate": 1.2295134826876788e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032997161615639925,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.306818181818182,
|
||
|
|
"grad_norm": 0.4362834534452196,
|
||
|
|
"learning_rate": 1.1906909997237714e-06,
|
||
|
|
"loss": 0.0035,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035847595427185297,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.318181818181818,
|
||
|
|
"grad_norm": 0.40530536820562274,
|
||
|
|
"learning_rate": 1.1524725001301197e-06,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030893716029822826,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.329545454545454,
|
||
|
|
"grad_norm": 0.4620596145669866,
|
||
|
|
"learning_rate": 1.1148592111361945e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035886347759515047,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.340909090909091,
|
||
|
|
"grad_norm": 0.4565513148132163,
|
||
|
|
"learning_rate": 1.0778523405376084e-06,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031891907565295696,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.3522727272727275,
|
||
|
|
"grad_norm": 0.4586341037449462,
|
||
|
|
"learning_rate": 1.0414530766573661e-06,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033755083568394184,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.363636363636363,
|
||
|
|
"grad_norm": 0.39774723663445105,
|
||
|
|
"learning_rate": 1.005662588307672e-06,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030828884337097406,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.375,
|
||
|
|
"grad_norm": 0.4897415519522406,
|
||
|
|
"learning_rate": 9.704820247524328e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035574431531131268,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.386363636363637,
|
||
|
|
"grad_norm": 0.38466513420966203,
|
||
|
|
"learning_rate": 9.359125156703208e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003028341569006443,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.3977272727272725,
|
||
|
|
"grad_norm": 0.4423982092484484,
|
||
|
|
"learning_rate": 9.019551711185226e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003425930393859744,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.409090909090909,
|
||
|
|
"grad_norm": 0.39513113336388056,
|
||
|
|
"learning_rate": 8.68611081497086e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003104181494563818,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.420454545454546,
|
||
|
|
"grad_norm": 0.49651778911420774,
|
||
|
|
"learning_rate": 8.358813175139046e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003403285052627325,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.431818181818182,
|
||
|
|
"grad_norm": 0.4527417824764246,
|
||
|
|
"learning_rate": 8.037669301503403e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003313109278678894,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.443181818181818,
|
||
|
|
"grad_norm": 0.4885023752314349,
|
||
|
|
"learning_rate": 7.722689506274639e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031580673530697823,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.454545454545454,
|
||
|
|
"grad_norm": 0.41325768330025364,
|
||
|
|
"learning_rate": 7.413883903729701e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035900771617889404,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.465909090909091,
|
||
|
|
"grad_norm": 0.5485931227337177,
|
||
|
|
"learning_rate": 7.111262409886599e-07,
|
||
|
|
"loss": 0.0035,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003448920324444771,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.4772727272727275,
|
||
|
|
"grad_norm": 0.4078822509433075,
|
||
|
|
"learning_rate": 6.814834742186361e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.002837548963725567,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.488636363636363,
|
||
|
|
"grad_norm": 0.4357810142890551,
|
||
|
|
"learning_rate": 6.52461041918071e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032473006285727024,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.5,
|
||
|
|
"grad_norm": 0.4556938750554629,
|
||
|
|
"learning_rate": 6.240598760226691e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003293708199635148,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.511363636363637,
|
||
|
|
"grad_norm": 0.482234402447487,
|
||
|
|
"learning_rate": 5.962808885187121e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033216907177120447,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.5227272727272725,
|
||
|
|
"grad_norm": 0.4635518576631173,
|
||
|
|
"learning_rate": 5.691249714137948e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031969414558261633,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.534090909090909,
|
||
|
|
"grad_norm": 0.40780538331310706,
|
||
|
|
"learning_rate": 5.425929967081822e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003067434299737215,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.545454545454545,
|
||
|
|
"grad_norm": 0.4103981284069006,
|
||
|
|
"learning_rate": 5.166858163667932e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033139484003186226,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.556818181818182,
|
||
|
|
"grad_norm": 0.4131394046437643,
|
||
|
|
"learning_rate": 4.914042622918591e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031530228443443775,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.568181818181818,
|
||
|
|
"grad_norm": 0.4921425198782738,
|
||
|
|
"learning_rate": 4.667491462962037e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003028211183845997,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.579545454545455,
|
||
|
|
"grad_norm": 0.41229946456669675,
|
||
|
|
"learning_rate": 4.4272126007717466e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034896559081971645,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.590909090909091,
|
||
|
|
"grad_norm": 0.41157023476678206,
|
||
|
|
"learning_rate": 4.1932137519122175e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003316685790196061,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.6022727272727275,
|
||
|
|
"grad_norm": 0.47325186040897627,
|
||
|
|
"learning_rate": 3.965502430291235e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034516688901931047,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.613636363636363,
|
||
|
|
"grad_norm": 0.4895866104284294,
|
||
|
|
"learning_rate": 3.7440859479185967e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034444353077560663,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.625,
|
||
|
|
"grad_norm": 0.4562369632128547,
|
||
|
|
"learning_rate": 3.5289714146713004e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003392378333956003,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.636363636363637,
|
||
|
|
"grad_norm": 0.46221279821886013,
|
||
|
|
"learning_rate": 3.3201657380652173e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003587019629776478,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.6477272727272725,
|
||
|
|
"grad_norm": 0.41961046404747937,
|
||
|
|
"learning_rate": 3.117675623033334e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030517023988068104,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.659090909090909,
|
||
|
|
"grad_norm": 0.5173187409454828,
|
||
|
|
"learning_rate": 2.9215075717104356e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032806636299937963,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.670454545454545,
|
||
|
|
"grad_norm": 0.4502023980941177,
|
||
|
|
"learning_rate": 2.731667883224298e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003279878292232752,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.681818181818182,
|
||
|
|
"grad_norm": 0.42749631297222007,
|
||
|
|
"learning_rate": 2.548162653493402e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033847535960376263,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.693181818181818,
|
||
|
|
"grad_norm": 0.47526826534949246,
|
||
|
|
"learning_rate": 2.370997775031292e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003315295558422804,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.704545454545455,
|
||
|
|
"grad_norm": 0.48121583308568683,
|
||
|
|
"learning_rate": 2.2001789367571692e-07,
|
||
|
|
"loss": 0.0034,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034815194085240364,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.715909090909091,
|
||
|
|
"grad_norm": 0.4410697466915672,
|
||
|
|
"learning_rate": 2.0357116238134633e-07,
|
||
|
|
"loss": 0.0035,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031138830818235874,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.7272727272727275,
|
||
|
|
"grad_norm": 0.48640179648780485,
|
||
|
|
"learning_rate": 1.8776011173894383e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0029965483117848635,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.738636363636363,
|
||
|
|
"grad_norm": 0.4244938777058745,
|
||
|
|
"learning_rate": 1.725852494551883e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003152275923639536,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.75,
|
||
|
|
"grad_norm": 0.666089786440141,
|
||
|
|
"learning_rate": 1.5804706280817984e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035941232927143574,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.761363636363637,
|
||
|
|
"grad_norm": 0.4465790589285092,
|
||
|
|
"learning_rate": 1.441460186318122e-07,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031929747201502323,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.7727272727272725,
|
||
|
|
"grad_norm": 0.3914607596463045,
|
||
|
|
"learning_rate": 1.3088256330078264e-07,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0030249631963670254,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.784090909090909,
|
||
|
|
"grad_norm": 0.47000520296347703,
|
||
|
|
"learning_rate": 1.1825712271624102e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032052146270871162,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.795454545454545,
|
||
|
|
"grad_norm": 0.513750978096953,
|
||
|
|
"learning_rate": 1.0627010229213641e-07,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031967200338840485,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.806818181818182,
|
||
|
|
"grad_norm": 0.45783389497188476,
|
||
|
|
"learning_rate": 9.492188694218085e-08,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034831403754651546,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.818181818181818,
|
||
|
|
"grad_norm": 0.37229691750669325,
|
||
|
|
"learning_rate": 8.421284106750139e-08,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.002955458126962185,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.829545454545455,
|
||
|
|
"grad_norm": 0.4782141499741039,
|
||
|
|
"learning_rate": 7.414330854492946e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.00327532971277833,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.840909090909091,
|
||
|
|
"grad_norm": 0.41410485134781283,
|
||
|
|
"learning_rate": 6.471361271596754e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003174928482621908,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.8522727272727275,
|
||
|
|
"grad_norm": 0.5942767083911783,
|
||
|
|
"learning_rate": 5.592405637639742e-08,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.002868573646992445,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.863636363636363,
|
||
|
|
"grad_norm": 0.39946737762958795,
|
||
|
|
"learning_rate": 4.777492176656351e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031276133377104998,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.875,
|
||
|
|
"grad_norm": 0.4340971206798626,
|
||
|
|
"learning_rate": 4.026647056231348e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0029534846544265747,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.886363636363637,
|
||
|
|
"grad_norm": 0.420432790848945,
|
||
|
|
"learning_rate": 3.339894386657827e-08,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0033322260715067387,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.8977272727272725,
|
||
|
|
"grad_norm": 0.4234728835456178,
|
||
|
|
"learning_rate": 2.7172562201656096e-08,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0034303716383874416,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.909090909090909,
|
||
|
|
"grad_norm": 0.5212617848188786,
|
||
|
|
"learning_rate": 2.1587525502104745e-08,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032378430478274822,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.920454545454545,
|
||
|
|
"grad_norm": 0.4658177651774122,
|
||
|
|
"learning_rate": 1.6644013108342294e-08,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0032385908998548985,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.931818181818182,
|
||
|
|
"grad_norm": 0.4703370076039564,
|
||
|
|
"learning_rate": 1.2342183760878368e-08,
|
||
|
|
"loss": 0.003,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0031367335468530655,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.943181818181818,
|
||
|
|
"grad_norm": 0.48428094367160235,
|
||
|
|
"learning_rate": 8.682175595216002e-09,
|
||
|
|
"loss": 0.0032,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035202507860958576,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.954545454545455,
|
||
|
|
"grad_norm": 0.46035982865651265,
|
||
|
|
"learning_rate": 5.664106137419634e-09,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003060789778828621,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.965909090909091,
|
||
|
|
"grad_norm": 0.4818154247516169,
|
||
|
|
"learning_rate": 3.288072300340339e-09,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.003484451211988926,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.9772727272727275,
|
||
|
|
"grad_norm": 0.43726610143618155,
|
||
|
|
"learning_rate": 1.5541503805027725e-09,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035455801989883184,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.988636363636363,
|
||
|
|
"grad_norm": 0.4324328631276121,
|
||
|
|
"learning_rate": 4.623960556604523e-10,
|
||
|
|
"loss": 0.0033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0035286054480820894,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.0,
|
||
|
|
"grad_norm": 0.37144429097125947,
|
||
|
|
"learning_rate": 1.2844382997201365e-11,
|
||
|
|
"loss": 0.0031,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.002985658124089241,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.0,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.002985658124089241,
|
||
|
|
"step": 3080,
|
||
|
|
"total_flos": 331629159579648.0,
|
||
|
|
"train_loss": 0.006270654577846554,
|
||
|
|
"train_runtime": 8393.6988,
|
||
|
|
"train_samples_per_second": 5.86,
|
||
|
|
"train_steps_per_second": 0.367
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 3080,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 7,
|
||
|
|
"save_steps": 200,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 331629159579648.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|