1703 lines
43 KiB
JSON
1703 lines
43 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 3.0,
|
|
"eval_steps": 500,
|
|
"global_step": 237,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.012738853503184714,
|
|
"grad_norm": 27.578015588062595,
|
|
"learning_rate": 0.0,
|
|
"loss": 2.2484302520751953,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.025477707006369428,
|
|
"grad_norm": 27.351013009442298,
|
|
"learning_rate": 4.1666666666666667e-07,
|
|
"loss": 2.215416193008423,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.03821656050955414,
|
|
"grad_norm": 29.00648618644296,
|
|
"learning_rate": 8.333333333333333e-07,
|
|
"loss": 2.1969661712646484,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.050955414012738856,
|
|
"grad_norm": 24.279613305984917,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 2.1304638385772705,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.06369426751592357,
|
|
"grad_norm": 24.548939602077972,
|
|
"learning_rate": 1.6666666666666667e-06,
|
|
"loss": 2.321625232696533,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.07643312101910828,
|
|
"grad_norm": 25.979670653733457,
|
|
"learning_rate": 2.0833333333333334e-06,
|
|
"loss": 2.1303162574768066,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.08917197452229299,
|
|
"grad_norm": 18.68650885616665,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 1.8076802492141724,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.10191082802547771,
|
|
"grad_norm": 16.7812576898942,
|
|
"learning_rate": 2.916666666666667e-06,
|
|
"loss": 1.7900886535644531,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.11464968152866242,
|
|
"grad_norm": 14.18738912625846,
|
|
"learning_rate": 3.3333333333333333e-06,
|
|
"loss": 1.7915903329849243,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.12738853503184713,
|
|
"grad_norm": 14.116799195872652,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 1.8171511888504028,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.14012738853503184,
|
|
"grad_norm": 10.442018372124066,
|
|
"learning_rate": 4.166666666666667e-06,
|
|
"loss": 1.7455570697784424,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.15286624203821655,
|
|
"grad_norm": 9.192045468171578,
|
|
"learning_rate": 4.583333333333333e-06,
|
|
"loss": 1.6337864398956299,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.16560509554140126,
|
|
"grad_norm": 8.260571303853565,
|
|
"learning_rate": 5e-06,
|
|
"loss": 1.778015375137329,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.17834394904458598,
|
|
"grad_norm": 6.460613846297531,
|
|
"learning_rate": 5.416666666666667e-06,
|
|
"loss": 1.712306022644043,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.1910828025477707,
|
|
"grad_norm": 6.843351540555302,
|
|
"learning_rate": 5.833333333333334e-06,
|
|
"loss": 1.5768513679504395,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.20382165605095542,
|
|
"grad_norm": 6.133058262409406,
|
|
"learning_rate": 6.25e-06,
|
|
"loss": 1.5611257553100586,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.21656050955414013,
|
|
"grad_norm": 5.180005658869054,
|
|
"learning_rate": 6.666666666666667e-06,
|
|
"loss": 1.5967652797698975,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.22929936305732485,
|
|
"grad_norm": 5.305167134267678,
|
|
"learning_rate": 7.083333333333335e-06,
|
|
"loss": 1.364829659461975,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.24203821656050956,
|
|
"grad_norm": 5.355870721587038,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 1.6430319547653198,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.25477707006369427,
|
|
"grad_norm": 5.0292720888884075,
|
|
"learning_rate": 7.916666666666667e-06,
|
|
"loss": 1.5467270612716675,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.267515923566879,
|
|
"grad_norm": 4.948954166107489,
|
|
"learning_rate": 8.333333333333334e-06,
|
|
"loss": 1.5446631908416748,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.2802547770700637,
|
|
"grad_norm": 4.859436510097199,
|
|
"learning_rate": 8.750000000000001e-06,
|
|
"loss": 1.4481780529022217,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.2929936305732484,
|
|
"grad_norm": 5.3831221005725896,
|
|
"learning_rate": 9.166666666666666e-06,
|
|
"loss": 1.4933228492736816,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.3057324840764331,
|
|
"grad_norm": 4.473608276014855,
|
|
"learning_rate": 9.583333333333335e-06,
|
|
"loss": 1.5807710886001587,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.3184713375796178,
|
|
"grad_norm": 4.109425789809634,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.349104404449463,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.33121019108280253,
|
|
"grad_norm": 4.634192438556494,
|
|
"learning_rate": 9.999456158087994e-06,
|
|
"loss": 1.4354019165039062,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.34394904458598724,
|
|
"grad_norm": 5.0726515873395,
|
|
"learning_rate": 9.997824750657586e-06,
|
|
"loss": 1.566201090812683,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.35668789808917195,
|
|
"grad_norm": 4.8805336123469205,
|
|
"learning_rate": 9.995106132599869e-06,
|
|
"loss": 1.411285161972046,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.36942675159235666,
|
|
"grad_norm": 4.859867988307453,
|
|
"learning_rate": 9.99130089531422e-06,
|
|
"loss": 1.2867789268493652,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.3821656050955414,
|
|
"grad_norm": 4.432994127396081,
|
|
"learning_rate": 9.98640986657965e-06,
|
|
"loss": 1.5999436378479004,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.39490445859872614,
|
|
"grad_norm": 4.40223600447386,
|
|
"learning_rate": 9.980434110374725e-06,
|
|
"loss": 1.4318150281906128,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.40764331210191085,
|
|
"grad_norm": 4.902591623548149,
|
|
"learning_rate": 9.973374926646117e-06,
|
|
"loss": 1.607371211051941,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.42038216560509556,
|
|
"grad_norm": 4.909609907293681,
|
|
"learning_rate": 9.965233851025816e-06,
|
|
"loss": 1.443784236907959,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.43312101910828027,
|
|
"grad_norm": 4.456375484305202,
|
|
"learning_rate": 9.956012654497073e-06,
|
|
"loss": 1.570559024810791,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.445859872611465,
|
|
"grad_norm": 4.599861555148005,
|
|
"learning_rate": 9.945713343009154e-06,
|
|
"loss": 1.548865556716919,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.4585987261146497,
|
|
"grad_norm": 4.315411715741126,
|
|
"learning_rate": 9.934338157040953e-06,
|
|
"loss": 1.4340442419052124,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.4713375796178344,
|
|
"grad_norm": 4.598194925817704,
|
|
"learning_rate": 9.921889571113629e-06,
|
|
"loss": 1.5494410991668701,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.4840764331210191,
|
|
"grad_norm": 4.243095090396253,
|
|
"learning_rate": 9.90837029325229e-06,
|
|
"loss": 1.4130847454071045,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.4968152866242038,
|
|
"grad_norm": 4.980649623484297,
|
|
"learning_rate": 9.893783264396903e-06,
|
|
"loss": 1.4265036582946777,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.5095541401273885,
|
|
"grad_norm": 5.789533896785179,
|
|
"learning_rate": 9.878131657762535e-06,
|
|
"loss": 1.4373618364334106,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.5222929936305732,
|
|
"grad_norm": 4.658455364896436,
|
|
"learning_rate": 9.861418878149056e-06,
|
|
"loss": 1.4085681438446045,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.535031847133758,
|
|
"grad_norm": 5.324025858102516,
|
|
"learning_rate": 9.843648561200476e-06,
|
|
"loss": 1.452268123626709,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.5477707006369427,
|
|
"grad_norm": 4.692010278942193,
|
|
"learning_rate": 9.82482457261405e-06,
|
|
"loss": 1.6110832691192627,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.5605095541401274,
|
|
"grad_norm": 4.64177391127698,
|
|
"learning_rate": 9.80495100729936e-06,
|
|
"loss": 1.4959537982940674,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.5732484076433121,
|
|
"grad_norm": 4.196745696549577,
|
|
"learning_rate": 9.784032188487507e-06,
|
|
"loss": 1.4369564056396484,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.5859872611464968,
|
|
"grad_norm": 4.846501722779206,
|
|
"learning_rate": 9.762072666790658e-06,
|
|
"loss": 1.5659615993499756,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.5987261146496815,
|
|
"grad_norm": 4.475752207148854,
|
|
"learning_rate": 9.73907721921212e-06,
|
|
"loss": 1.6203088760375977,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.6114649681528662,
|
|
"grad_norm": 5.622443868901302,
|
|
"learning_rate": 9.715050848107167e-06,
|
|
"loss": 1.4394254684448242,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.6242038216560509,
|
|
"grad_norm": 5.282103052112391,
|
|
"learning_rate": 9.689998780094839e-06,
|
|
"loss": 1.3903216123580933,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.6369426751592356,
|
|
"grad_norm": 4.399414503844804,
|
|
"learning_rate": 9.663926464920959e-06,
|
|
"loss": 1.4829354286193848,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.6496815286624203,
|
|
"grad_norm": 4.523444723393011,
|
|
"learning_rate": 9.636839574272623e-06,
|
|
"loss": 1.5274395942687988,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.6624203821656051,
|
|
"grad_norm": 4.232538410435911,
|
|
"learning_rate": 9.608744000544392e-06,
|
|
"loss": 1.4694490432739258,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.6751592356687898,
|
|
"grad_norm": 4.354507161235457,
|
|
"learning_rate": 9.579645855556481e-06,
|
|
"loss": 1.2353503704071045,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.6878980891719745,
|
|
"grad_norm": 4.9180556110442595,
|
|
"learning_rate": 9.54955146922521e-06,
|
|
"loss": 1.4008901119232178,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.7006369426751592,
|
|
"grad_norm": 4.970650879718249,
|
|
"learning_rate": 9.51846738818602e-06,
|
|
"loss": 1.3539741039276123,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.7133757961783439,
|
|
"grad_norm": 4.837000418043291,
|
|
"learning_rate": 9.48640037436934e-06,
|
|
"loss": 1.3163714408874512,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.7261146496815286,
|
|
"grad_norm": 3.9679039776262064,
|
|
"learning_rate": 9.453357403529609e-06,
|
|
"loss": 1.3809059858322144,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.7388535031847133,
|
|
"grad_norm": 4.462452198138723,
|
|
"learning_rate": 9.419345663727805e-06,
|
|
"loss": 1.458146572113037,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.7515923566878981,
|
|
"grad_norm": 4.402982081383035,
|
|
"learning_rate": 9.38437255376777e-06,
|
|
"loss": 1.329193115234375,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.7643312101910829,
|
|
"grad_norm": 4.377947135685406,
|
|
"learning_rate": 9.348445681586703e-06,
|
|
"loss": 1.4500741958618164,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.7770700636942676,
|
|
"grad_norm": 4.198026205959271,
|
|
"learning_rate": 9.31157286260014e-06,
|
|
"loss": 1.4562097787857056,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.7898089171974523,
|
|
"grad_norm": 4.945285199299728,
|
|
"learning_rate": 9.273762118001837e-06,
|
|
"loss": 1.3661162853240967,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.802547770700637,
|
|
"grad_norm": 3.9573822911468266,
|
|
"learning_rate": 9.235021673018849e-06,
|
|
"loss": 1.3168445825576782,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.8152866242038217,
|
|
"grad_norm": 4.566194937738093,
|
|
"learning_rate": 9.195359955122244e-06,
|
|
"loss": 1.3281530141830444,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.8280254777070064,
|
|
"grad_norm": 4.657547711627972,
|
|
"learning_rate": 9.15478559219382e-06,
|
|
"loss": 1.3520253896713257,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.8407643312101911,
|
|
"grad_norm": 4.931346013168586,
|
|
"learning_rate": 9.113307410649222e-06,
|
|
"loss": 1.4982115030288696,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.8535031847133758,
|
|
"grad_norm": 4.205958958323587,
|
|
"learning_rate": 9.070934433517872e-06,
|
|
"loss": 1.402880311012268,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.8662420382165605,
|
|
"grad_norm": 4.265262844911349,
|
|
"learning_rate": 9.027675878480131e-06,
|
|
"loss": 1.4359843730926514,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.8789808917197452,
|
|
"grad_norm": 4.174081063602911,
|
|
"learning_rate": 8.983541155862114e-06,
|
|
"loss": 1.4095585346221924,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.89171974522293,
|
|
"grad_norm": 4.300497916224527,
|
|
"learning_rate": 8.938539866588593e-06,
|
|
"loss": 1.3254384994506836,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.9044585987261147,
|
|
"grad_norm": 4.321856820928802,
|
|
"learning_rate": 8.892681800094447e-06,
|
|
"loss": 1.3895121812820435,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.9171974522292994,
|
|
"grad_norm": 4.353418264893862,
|
|
"learning_rate": 8.845976932195104e-06,
|
|
"loss": 1.5136423110961914,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.9299363057324841,
|
|
"grad_norm": 4.509525726327509,
|
|
"learning_rate": 8.798435422916425e-06,
|
|
"loss": 1.560758352279663,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.9426751592356688,
|
|
"grad_norm": 4.14002546470194,
|
|
"learning_rate": 8.750067614284534e-06,
|
|
"loss": 1.2931057214736938,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.9554140127388535,
|
|
"grad_norm": 4.1809249012997345,
|
|
"learning_rate": 8.700884028076042e-06,
|
|
"loss": 1.5124843120574951,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.9681528662420382,
|
|
"grad_norm": 4.293885955875831,
|
|
"learning_rate": 8.650895363529172e-06,
|
|
"loss": 1.317713737487793,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.9808917197452229,
|
|
"grad_norm": 4.631688789038691,
|
|
"learning_rate": 8.600112495016289e-06,
|
|
"loss": 1.3039919137954712,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.9936305732484076,
|
|
"grad_norm": 4.27759477922895,
|
|
"learning_rate": 8.548546469678311e-06,
|
|
"loss": 1.495795488357544,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 5.956623944392664,
|
|
"learning_rate": 8.496208505021572e-06,
|
|
"loss": 1.429541826248169,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 1.0127388535031847,
|
|
"grad_norm": 4.45796601634621,
|
|
"learning_rate": 8.443109986477574e-06,
|
|
"loss": 0.8995598554611206,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 1.0254777070063694,
|
|
"grad_norm": 4.500661347515663,
|
|
"learning_rate": 8.389262464926256e-06,
|
|
"loss": 0.63990318775177,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 1.0382165605095541,
|
|
"grad_norm": 3.8748063820496257,
|
|
"learning_rate": 8.334677654183254e-06,
|
|
"loss": 0.6055729985237122,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 1.0509554140127388,
|
|
"grad_norm": 3.4628880356772096,
|
|
"learning_rate": 8.279367428451703e-06,
|
|
"loss": 0.7356538772583008,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 1.0636942675159236,
|
|
"grad_norm": 3.6516848444686265,
|
|
"learning_rate": 8.223343819739164e-06,
|
|
"loss": 0.692323625087738,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 1.0764331210191083,
|
|
"grad_norm": 3.968197707946131,
|
|
"learning_rate": 8.166619015240236e-06,
|
|
"loss": 0.6772887706756592,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 1.089171974522293,
|
|
"grad_norm": 3.845941294941666,
|
|
"learning_rate": 8.109205354685367e-06,
|
|
"loss": 0.5514630675315857,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 1.1019108280254777,
|
|
"grad_norm": 3.774618366335066,
|
|
"learning_rate": 8.051115327656538e-06,
|
|
"loss": 0.6684471964836121,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 1.1146496815286624,
|
|
"grad_norm": 3.7047992437252,
|
|
"learning_rate": 7.992361570870289e-06,
|
|
"loss": 0.5766518712043762,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 1.127388535031847,
|
|
"grad_norm": 4.214676734133472,
|
|
"learning_rate": 7.932956865428792e-06,
|
|
"loss": 0.5921903848648071,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 1.1401273885350318,
|
|
"grad_norm": 4.387324313211908,
|
|
"learning_rate": 7.872914134039485e-06,
|
|
"loss": 0.592995285987854,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 1.1528662420382165,
|
|
"grad_norm": 4.005865244271663,
|
|
"learning_rate": 7.812246438203905e-06,
|
|
"loss": 0.5482683181762695,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 1.1656050955414012,
|
|
"grad_norm": 4.636384134136274,
|
|
"learning_rate": 7.750966975376328e-06,
|
|
"loss": 0.6826972365379333,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 1.178343949044586,
|
|
"grad_norm": 4.6953248944517245,
|
|
"learning_rate": 7.689089076092851e-06,
|
|
"loss": 0.5954027771949768,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 1.1910828025477707,
|
|
"grad_norm": 4.80616798771938,
|
|
"learning_rate": 7.626626201071494e-06,
|
|
"loss": 0.6095083355903625,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 1.2038216560509554,
|
|
"grad_norm": 4.48252749973364,
|
|
"learning_rate": 7.563591938284012e-06,
|
|
"loss": 0.709877610206604,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 1.21656050955414,
|
|
"grad_norm": 4.596613533967055,
|
|
"learning_rate": 7.500000000000001e-06,
|
|
"loss": 0.5784502029418945,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 1.2292993630573248,
|
|
"grad_norm": 4.76402532258561,
|
|
"learning_rate": 7.4358642198039835e-06,
|
|
"loss": 0.5837893486022949,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 1.2420382165605095,
|
|
"grad_norm": 4.440144626730792,
|
|
"learning_rate": 7.371198549586091e-06,
|
|
"loss": 0.7246421575546265,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 1.2547770700636942,
|
|
"grad_norm": 4.1554626239444605,
|
|
"learning_rate": 7.306017056507018e-06,
|
|
"loss": 0.5735586285591125,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 1.267515923566879,
|
|
"grad_norm": 5.004892398076429,
|
|
"learning_rate": 7.240333919937893e-06,
|
|
"loss": 0.5463488101959229,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.2802547770700636,
|
|
"grad_norm": 5.088476554254515,
|
|
"learning_rate": 7.174163428375748e-06,
|
|
"loss": 0.5633252859115601,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 1.2929936305732483,
|
|
"grad_norm": 5.118792774795437,
|
|
"learning_rate": 7.107519976335241e-06,
|
|
"loss": 0.5037230253219604,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 1.305732484076433,
|
|
"grad_norm": 4.75623015993911,
|
|
"learning_rate": 7.040418061217325e-06,
|
|
"loss": 0.5365867614746094,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 1.3184713375796178,
|
|
"grad_norm": 5.157812619262671,
|
|
"learning_rate": 6.972872280155528e-06,
|
|
"loss": 0.6433064937591553,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 1.3312101910828025,
|
|
"grad_norm": 5.148365945239476,
|
|
"learning_rate": 6.9048973268405375e-06,
|
|
"loss": 0.6543390154838562,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 1.3439490445859872,
|
|
"grad_norm": 4.5311604864334125,
|
|
"learning_rate": 6.836507988323785e-06,
|
|
"loss": 0.6132720708847046,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 1.356687898089172,
|
|
"grad_norm": 4.88971282799509,
|
|
"learning_rate": 6.767719141800718e-06,
|
|
"loss": 0.6079248189926147,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 1.3694267515923566,
|
|
"grad_norm": 4.695137801905107,
|
|
"learning_rate": 6.698545751374465e-06,
|
|
"loss": 0.6232650279998779,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 1.3821656050955413,
|
|
"grad_norm": 4.267620300562347,
|
|
"learning_rate": 6.629002864800589e-06,
|
|
"loss": 0.5911256074905396,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 1.394904458598726,
|
|
"grad_norm": 5.30166521900121,
|
|
"learning_rate": 6.55910561021365e-06,
|
|
"loss": 0.6404790282249451,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.4076433121019107,
|
|
"grad_norm": 4.964813016784396,
|
|
"learning_rate": 6.488869192836279e-06,
|
|
"loss": 0.6661736965179443,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 1.4203821656050954,
|
|
"grad_norm": 5.042827042141295,
|
|
"learning_rate": 6.418308891671484e-06,
|
|
"loss": 0.5621084570884705,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 1.4331210191082802,
|
|
"grad_norm": 4.554306311318436,
|
|
"learning_rate": 6.347440056178904e-06,
|
|
"loss": 0.5913956165313721,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 1.4458598726114649,
|
|
"grad_norm": 3.999260338697589,
|
|
"learning_rate": 6.27627810293574e-06,
|
|
"loss": 0.5895659327507019,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 1.4585987261146496,
|
|
"grad_norm": 4.488817511346429,
|
|
"learning_rate": 6.204838512283073e-06,
|
|
"loss": 0.6066327691078186,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.4713375796178343,
|
|
"grad_norm": 4.2048895000167725,
|
|
"learning_rate": 6.133136824958334e-06,
|
|
"loss": 0.579125165939331,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 1.484076433121019,
|
|
"grad_norm": 4.865801929274413,
|
|
"learning_rate": 6.061188638714616e-06,
|
|
"loss": 0.5661747455596924,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 1.4968152866242037,
|
|
"grad_norm": 4.0216175803478365,
|
|
"learning_rate": 5.989009604927587e-06,
|
|
"loss": 0.5881543159484863,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 1.5095541401273884,
|
|
"grad_norm": 4.672593821116511,
|
|
"learning_rate": 5.916615425190744e-06,
|
|
"loss": 0.6381370425224304,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 1.5222929936305731,
|
|
"grad_norm": 4.579578448838088,
|
|
"learning_rate": 5.844021847899735e-06,
|
|
"loss": 0.5820121765136719,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.5350318471337578,
|
|
"grad_norm": 4.9782337341335845,
|
|
"learning_rate": 5.771244664826512e-06,
|
|
"loss": 0.5244691371917725,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 1.5477707006369426,
|
|
"grad_norm": 4.615280693095074,
|
|
"learning_rate": 5.698299707684031e-06,
|
|
"loss": 0.6596621870994568,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 1.5605095541401273,
|
|
"grad_norm": 4.1315152129695205,
|
|
"learning_rate": 5.6252028446822805e-06,
|
|
"loss": 0.6240249872207642,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 1.573248407643312,
|
|
"grad_norm": 4.34694030117767,
|
|
"learning_rate": 5.55196997707635e-06,
|
|
"loss": 0.6121684312820435,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 1.5859872611464967,
|
|
"grad_norm": 4.736014683349439,
|
|
"learning_rate": 5.478617035707337e-06,
|
|
"loss": 0.581444263458252,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.5987261146496814,
|
|
"grad_norm": 4.42473315063519,
|
|
"learning_rate": 5.4051599775368e-06,
|
|
"loss": 0.5702801942825317,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 1.611464968152866,
|
|
"grad_norm": 4.723043711831375,
|
|
"learning_rate": 5.33161478217552e-06,
|
|
"loss": 0.643683671951294,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 1.6242038216560508,
|
|
"grad_norm": 4.615535634313775,
|
|
"learning_rate": 5.257997448407366e-06,
|
|
"loss": 0.6429088115692139,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 1.6369426751592355,
|
|
"grad_norm": 3.943237517267742,
|
|
"learning_rate": 5.184323990708959e-06,
|
|
"loss": 0.5036097764968872,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 1.6496815286624202,
|
|
"grad_norm": 5.369249891502365,
|
|
"learning_rate": 5.110610435765935e-06,
|
|
"loss": 0.6377817392349243,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 1.662420382165605,
|
|
"grad_norm": 4.645492978424057,
|
|
"learning_rate": 5.0368728189865624e-06,
|
|
"loss": 0.5092718601226807,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 1.6751592356687897,
|
|
"grad_norm": 4.9878218164552255,
|
|
"learning_rate": 4.9631271810134375e-06,
|
|
"loss": 0.6005362868309021,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 1.6878980891719744,
|
|
"grad_norm": 4.710856517549427,
|
|
"learning_rate": 4.8893895642340665e-06,
|
|
"loss": 0.4808087944984436,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 1.700636942675159,
|
|
"grad_norm": 4.962556354741984,
|
|
"learning_rate": 4.815676009291044e-06,
|
|
"loss": 0.6739586591720581,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 1.7133757961783438,
|
|
"grad_norm": 4.864043235726367,
|
|
"learning_rate": 4.742002551592635e-06,
|
|
"loss": 0.5722870826721191,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 1.7261146496815285,
|
|
"grad_norm": 5.805499130195261,
|
|
"learning_rate": 4.668385217824482e-06,
|
|
"loss": 0.5560994148254395,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 1.7388535031847132,
|
|
"grad_norm": 4.3754614924647734,
|
|
"learning_rate": 4.594840022463201e-06,
|
|
"loss": 0.6376844644546509,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 1.7515923566878981,
|
|
"grad_norm": 4.6276825029066515,
|
|
"learning_rate": 4.5213829642926635e-06,
|
|
"loss": 0.5070189237594604,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 1.7643312101910829,
|
|
"grad_norm": 5.058486321341029,
|
|
"learning_rate": 4.4480300229236525e-06,
|
|
"loss": 0.6301469206809998,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 1.7770700636942676,
|
|
"grad_norm": 4.631581699502946,
|
|
"learning_rate": 4.374797155317721e-06,
|
|
"loss": 0.5686060190200806,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.7898089171974523,
|
|
"grad_norm": 4.839930377645928,
|
|
"learning_rate": 4.30170029231597e-06,
|
|
"loss": 0.5702610015869141,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 1.802547770700637,
|
|
"grad_norm": 4.634251405852573,
|
|
"learning_rate": 4.228755335173488e-06,
|
|
"loss": 0.5375156402587891,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 1.8152866242038217,
|
|
"grad_norm": 5.224378872859397,
|
|
"learning_rate": 4.155978152100266e-06,
|
|
"loss": 0.588652491569519,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 1.8280254777070064,
|
|
"grad_norm": 5.243980650196693,
|
|
"learning_rate": 4.0833845748092586e-06,
|
|
"loss": 0.6560136079788208,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 1.8407643312101911,
|
|
"grad_norm": 4.870640612365541,
|
|
"learning_rate": 4.010990395072414e-06,
|
|
"loss": 0.5707780718803406,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 1.8535031847133758,
|
|
"grad_norm": 4.896770007248889,
|
|
"learning_rate": 3.938811361285386e-06,
|
|
"loss": 0.578855574131012,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.8662420382165605,
|
|
"grad_norm": 5.621832570155973,
|
|
"learning_rate": 3.866863175041666e-06,
|
|
"loss": 0.7337894439697266,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.8789808917197452,
|
|
"grad_norm": 4.788974930837312,
|
|
"learning_rate": 3.7951614877169285e-06,
|
|
"loss": 0.6584663391113281,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.89171974522293,
|
|
"grad_norm": 5.197175599878351,
|
|
"learning_rate": 3.7237218970642624e-06,
|
|
"loss": 0.5132451057434082,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.9044585987261147,
|
|
"grad_norm": 4.494637523697752,
|
|
"learning_rate": 3.6525599438210956e-06,
|
|
"loss": 0.5699691772460938,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.9171974522292994,
|
|
"grad_norm": 4.436597339850294,
|
|
"learning_rate": 3.5816911083285165e-06,
|
|
"loss": 0.6117175817489624,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.929936305732484,
|
|
"grad_norm": 4.71698618164443,
|
|
"learning_rate": 3.511130807163724e-06,
|
|
"loss": 0.48447686433792114,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.9426751592356688,
|
|
"grad_norm": 4.586270355395819,
|
|
"learning_rate": 3.440894389786352e-06,
|
|
"loss": 0.5775331854820251,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.9554140127388535,
|
|
"grad_norm": 5.467603736362664,
|
|
"learning_rate": 3.370997135199413e-06,
|
|
"loss": 0.6822047829627991,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.9681528662420382,
|
|
"grad_norm": 5.092809942708443,
|
|
"learning_rate": 3.3014542486255365e-06,
|
|
"loss": 0.620025098323822,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.980891719745223,
|
|
"grad_norm": 4.782057480529959,
|
|
"learning_rate": 3.2322808581992825e-06,
|
|
"loss": 0.6051990985870361,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.9936305732484076,
|
|
"grad_norm": 5.07119310501042,
|
|
"learning_rate": 3.1634920116762175e-06,
|
|
"loss": 0.5013089776039124,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 5.834245362327659,
|
|
"learning_rate": 3.0951026731594634e-06,
|
|
"loss": 0.41039198637008667,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 2.0127388535031847,
|
|
"grad_norm": 2.970713570403218,
|
|
"learning_rate": 3.0271277198444737e-06,
|
|
"loss": 0.14488917589187622,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 2.0254777070063694,
|
|
"grad_norm": 3.3900669209478917,
|
|
"learning_rate": 2.9595819387826753e-06,
|
|
"loss": 0.17139403522014618,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 2.038216560509554,
|
|
"grad_norm": 3.148172373199878,
|
|
"learning_rate": 2.89248002366476e-06,
|
|
"loss": 0.13938947021961212,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 2.050955414012739,
|
|
"grad_norm": 3.292222772844883,
|
|
"learning_rate": 2.8258365716242543e-06,
|
|
"loss": 0.19142913818359375,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 2.0636942675159236,
|
|
"grad_norm": 5.062552654446493,
|
|
"learning_rate": 2.7596660800621076e-06,
|
|
"loss": 0.32667019963264465,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 2.0764331210191083,
|
|
"grad_norm": 2.9195663792104853,
|
|
"learning_rate": 2.6939829434929834e-06,
|
|
"loss": 0.16923490166664124,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 2.089171974522293,
|
|
"grad_norm": 2.660735105353199,
|
|
"learning_rate": 2.6288014504139104e-06,
|
|
"loss": 0.16544359922409058,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 2.1019108280254777,
|
|
"grad_norm": 2.9195377278173438,
|
|
"learning_rate": 2.5641357801960186e-06,
|
|
"loss": 0.13166563212871552,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 2.1146496815286624,
|
|
"grad_norm": 2.7115850726819133,
|
|
"learning_rate": 2.5000000000000015e-06,
|
|
"loss": 0.1502484679222107,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 2.127388535031847,
|
|
"grad_norm": 2.5246541477672957,
|
|
"learning_rate": 2.4364080617159885e-06,
|
|
"loss": 0.12001603841781616,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 2.140127388535032,
|
|
"grad_norm": 2.906306753932353,
|
|
"learning_rate": 2.373373798928507e-06,
|
|
"loss": 0.16388744115829468,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 2.1528662420382165,
|
|
"grad_norm": 3.3313464695860855,
|
|
"learning_rate": 2.310910923907149e-06,
|
|
"loss": 0.17085227370262146,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 2.1656050955414012,
|
|
"grad_norm": 3.537696001337278,
|
|
"learning_rate": 2.249033024623672e-06,
|
|
"loss": 0.1649709939956665,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 2.178343949044586,
|
|
"grad_norm": 3.0477614078497157,
|
|
"learning_rate": 2.187753561796097e-06,
|
|
"loss": 0.13725437223911285,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 2.1910828025477707,
|
|
"grad_norm": 3.108829906302373,
|
|
"learning_rate": 2.127085865960516e-06,
|
|
"loss": 0.14223095774650574,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 2.2038216560509554,
|
|
"grad_norm": 3.188987721207745,
|
|
"learning_rate": 2.0670431345712092e-06,
|
|
"loss": 0.1432873010635376,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 2.21656050955414,
|
|
"grad_norm": 3.5488199597897045,
|
|
"learning_rate": 2.0076384291297134e-06,
|
|
"loss": 0.1355983018875122,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 2.229299363057325,
|
|
"grad_norm": 2.9979876656948483,
|
|
"learning_rate": 1.9488846723434646e-06,
|
|
"loss": 0.13247933983802795,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 2.2420382165605095,
|
|
"grad_norm": 3.443337367597467,
|
|
"learning_rate": 1.890794645314633e-06,
|
|
"loss": 0.1308836191892624,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 2.254777070063694,
|
|
"grad_norm": 4.121646470867133,
|
|
"learning_rate": 1.8333809847597644e-06,
|
|
"loss": 0.15963426232337952,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 2.267515923566879,
|
|
"grad_norm": 4.118828059264668,
|
|
"learning_rate": 1.7766561802608374e-06,
|
|
"loss": 0.14805136620998383,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 2.2802547770700636,
|
|
"grad_norm": 3.9708198011551166,
|
|
"learning_rate": 1.7206325715483003e-06,
|
|
"loss": 0.12024472653865814,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 2.2929936305732483,
|
|
"grad_norm": 3.439106672469071,
|
|
"learning_rate": 1.665322345816746e-06,
|
|
"loss": 0.11454702913761139,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 2.305732484076433,
|
|
"grad_norm": 3.4010452876916615,
|
|
"learning_rate": 1.6107375350737437e-06,
|
|
"loss": 0.10992666333913803,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 2.3184713375796178,
|
|
"grad_norm": 3.5752577926580975,
|
|
"learning_rate": 1.556890013522428e-06,
|
|
"loss": 0.09631110727787018,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 2.3312101910828025,
|
|
"grad_norm": 3.8387220728977343,
|
|
"learning_rate": 1.50379149497843e-06,
|
|
"loss": 0.14856451749801636,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 2.343949044585987,
|
|
"grad_norm": 3.444989482317406,
|
|
"learning_rate": 1.4514535303216893e-06,
|
|
"loss": 0.09073778241872787,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 2.356687898089172,
|
|
"grad_norm": 3.2622590339488124,
|
|
"learning_rate": 1.3998875049837141e-06,
|
|
"loss": 0.10596369206905365,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 2.3694267515923566,
|
|
"grad_norm": 4.072722677232836,
|
|
"learning_rate": 1.3491046364708294e-06,
|
|
"loss": 0.1488298773765564,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 2.3821656050955413,
|
|
"grad_norm": 4.114774744144093,
|
|
"learning_rate": 1.2991159719239581e-06,
|
|
"loss": 0.13143031299114227,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 2.394904458598726,
|
|
"grad_norm": 3.792643277657603,
|
|
"learning_rate": 1.249932385715467e-06,
|
|
"loss": 0.12935219705104828,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 2.4076433121019107,
|
|
"grad_norm": 3.6041653995445,
|
|
"learning_rate": 1.2015645770835765e-06,
|
|
"loss": 0.10895463824272156,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 2.4203821656050954,
|
|
"grad_norm": 3.419036474508468,
|
|
"learning_rate": 1.1540230678048969e-06,
|
|
"loss": 0.11770664900541306,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 2.43312101910828,
|
|
"grad_norm": 3.8473062967203626,
|
|
"learning_rate": 1.1073181999055538e-06,
|
|
"loss": 0.12943175435066223,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 2.445859872611465,
|
|
"grad_norm": 4.213646564060963,
|
|
"learning_rate": 1.0614601334114099e-06,
|
|
"loss": 0.15990746021270752,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 2.4585987261146496,
|
|
"grad_norm": 3.292740607382361,
|
|
"learning_rate": 1.016458844137887e-06,
|
|
"loss": 0.0967484638094902,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 2.4713375796178343,
|
|
"grad_norm": 3.3587679937993675,
|
|
"learning_rate": 9.723241215198692e-07,
|
|
"loss": 0.09274256229400635,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 2.484076433121019,
|
|
"grad_norm": 3.415144877613833,
|
|
"learning_rate": 9.290655664821296e-07,
|
|
"loss": 0.12071307003498077,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 2.4968152866242037,
|
|
"grad_norm": 3.614520056467126,
|
|
"learning_rate": 8.866925893507805e-07,
|
|
"loss": 0.14337831735610962,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 2.5095541401273884,
|
|
"grad_norm": 3.1413281076463333,
|
|
"learning_rate": 8.45214407806182e-07,
|
|
"loss": 0.1311374008655548,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 2.522292993630573,
|
|
"grad_norm": 3.5634546960778963,
|
|
"learning_rate": 8.046400448777575e-07,
|
|
"loss": 0.12355434894561768,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 2.535031847133758,
|
|
"grad_norm": 3.55245812518791,
|
|
"learning_rate": 7.649783269811523e-07,
|
|
"loss": 0.11268627643585205,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 2.5477707006369426,
|
|
"grad_norm": 3.6047204962278205,
|
|
"learning_rate": 7.26237881998163e-07,
|
|
"loss": 0.1278030276298523,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 2.5605095541401273,
|
|
"grad_norm": 3.826082333377558,
|
|
"learning_rate": 6.884271373998608e-07,
|
|
"loss": 0.11588963866233826,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 2.573248407643312,
|
|
"grad_norm": 3.3477539285078044,
|
|
"learning_rate": 6.515543184133e-07,
|
|
"loss": 0.11168617010116577,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 2.5859872611464967,
|
|
"grad_norm": 3.40070063216114,
|
|
"learning_rate": 6.156274462322292e-07,
|
|
"loss": 0.14677459001541138,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 2.5987261146496814,
|
|
"grad_norm": 3.6867880675958333,
|
|
"learning_rate": 5.806543362721945e-07,
|
|
"loss": 0.1080314964056015,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 2.611464968152866,
|
|
"grad_norm": 3.50805046104141,
|
|
"learning_rate": 5.466425964703914e-07,
|
|
"loss": 0.10917598009109497,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 2.624203821656051,
|
|
"grad_norm": 3.744240792349818,
|
|
"learning_rate": 5.135996256306619e-07,
|
|
"loss": 0.10850804299116135,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 2.6369426751592355,
|
|
"grad_norm": 3.2636204792288184,
|
|
"learning_rate": 4.815326118139813e-07,
|
|
"loss": 0.23395496606826782,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 2.6496815286624202,
|
|
"grad_norm": 3.3320803212307895,
|
|
"learning_rate": 4.5044853077479134e-07,
|
|
"loss": 0.09678040444850922,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 2.662420382165605,
|
|
"grad_norm": 3.3126443611241005,
|
|
"learning_rate": 4.203541444435211e-07,
|
|
"loss": 0.09082137048244476,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 2.6751592356687897,
|
|
"grad_norm": 3.463640048859196,
|
|
"learning_rate": 3.9125599945560866e-07,
|
|
"loss": 0.12093393504619598,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 2.6878980891719744,
|
|
"grad_norm": 4.1484131801868225,
|
|
"learning_rate": 3.631604257273774e-07,
|
|
"loss": 0.12841008603572845,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 2.700636942675159,
|
|
"grad_norm": 3.4819962567564544,
|
|
"learning_rate": 3.360735350790428e-07,
|
|
"loss": 0.1454203575849533,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 2.713375796178344,
|
|
"grad_norm": 3.3217850732913834,
|
|
"learning_rate": 3.100012199051627e-07,
|
|
"loss": 0.12103286385536194,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 2.7261146496815285,
|
|
"grad_norm": 3.4551976218750706,
|
|
"learning_rate": 2.8494915189283325e-07,
|
|
"loss": 0.13519585132598877,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 2.738853503184713,
|
|
"grad_norm": 3.5046747113231738,
|
|
"learning_rate": 2.6092278078788004e-07,
|
|
"loss": 0.14792990684509277,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 2.7515923566878984,
|
|
"grad_norm": 4.057009589896516,
|
|
"learning_rate": 2.3792733320934348e-07,
|
|
"loss": 0.1573294997215271,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 2.7643312101910826,
|
|
"grad_norm": 3.485812762552763,
|
|
"learning_rate": 2.1596781151249524e-07,
|
|
"loss": 0.15241427719593048,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 2.777070063694268,
|
|
"grad_norm": 2.8563228482207395,
|
|
"learning_rate": 1.9504899270064105e-07,
|
|
"loss": 0.11122366786003113,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 2.789808917197452,
|
|
"grad_norm": 3.219771759621168,
|
|
"learning_rate": 1.7517542738595071e-07,
|
|
"loss": 0.11351308226585388,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 2.802547770700637,
|
|
"grad_norm": 3.4195554560904107,
|
|
"learning_rate": 1.5635143879952575e-07,
|
|
"loss": 0.1188071146607399,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 2.8152866242038215,
|
|
"grad_norm": 2.9103932269106374,
|
|
"learning_rate": 1.3858112185094418e-07,
|
|
"loss": 0.1164408028125763,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 2.8280254777070066,
|
|
"grad_norm": 3.6450799822214144,
|
|
"learning_rate": 1.2186834223746612e-07,
|
|
"loss": 0.12760576605796814,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 2.840764331210191,
|
|
"grad_norm": 3.3225130395239253,
|
|
"learning_rate": 1.0621673560309798e-07,
|
|
"loss": 0.11487654596567154,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 2.853503184713376,
|
|
"grad_norm": 3.2905886122232397,
|
|
"learning_rate": 9.162970674771177e-08,
|
|
"loss": 0.11246581375598907,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 2.8662420382165603,
|
|
"grad_norm": 3.504394500719592,
|
|
"learning_rate": 7.81104288863721e-08,
|
|
"loss": 0.09955516457557678,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 2.8789808917197455,
|
|
"grad_norm": 3.216564908375023,
|
|
"learning_rate": 6.566184295904777e-08,
|
|
"loss": 0.12330685555934906,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 2.8917197452229297,
|
|
"grad_norm": 3.607447433445088,
|
|
"learning_rate": 5.4286656990847897e-08,
|
|
"loss": 0.12849846482276917,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 2.904458598726115,
|
|
"grad_norm": 3.3244783180187425,
|
|
"learning_rate": 4.398734550292716e-08,
|
|
"loss": 0.11019767820835114,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 2.917197452229299,
|
|
"grad_norm": 3.1359379558395957,
|
|
"learning_rate": 3.476614897418573e-08,
|
|
"loss": 0.10802481323480606,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 2.9299363057324843,
|
|
"grad_norm": 3.236602655895111,
|
|
"learning_rate": 2.6625073353884756e-08,
|
|
"loss": 0.11602732539176941,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 2.9426751592356686,
|
|
"grad_norm": 3.2263437658209133,
|
|
"learning_rate": 1.9565889625275945e-08,
|
|
"loss": 0.12483286112546921,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 2.9554140127388537,
|
|
"grad_norm": 3.4340551157608235,
|
|
"learning_rate": 1.3590133420350315e-08,
|
|
"loss": 0.10575878620147705,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 2.968152866242038,
|
|
"grad_norm": 3.903227901454765,
|
|
"learning_rate": 8.699104685779835e-09,
|
|
"loss": 0.14583438634872437,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 2.980891719745223,
|
|
"grad_norm": 3.884552247317161,
|
|
"learning_rate": 4.89386740013198e-09,
|
|
"loss": 0.12648674845695496,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 2.9936305732484074,
|
|
"grad_norm": 3.2859302150161747,
|
|
"learning_rate": 2.1752493424148647e-09,
|
|
"loss": 0.1414915770292282,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 2.7515804191453306,
|
|
"learning_rate": 5.438419120062933e-10,
|
|
"loss": 0.0598013773560524,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"step": 237,
|
|
"total_flos": 4888319754240.0,
|
|
"train_loss": 0.7582035779575759,
|
|
"train_runtime": 573.8365,
|
|
"train_samples_per_second": 26.14,
|
|
"train_steps_per_second": 0.413
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 237,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4888319754240.0,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|