8511 lines
207 KiB
JSON
8511 lines
207 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.206761278412369,
|
|
"eval_steps": 92,
|
|
"global_step": 1196,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.001846082842967578,
|
|
"grad_norm": 1.943859338760376,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0781,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.003692165685935156,
|
|
"grad_norm": 2.365267753601074,
|
|
"learning_rate": 1.226993865030675e-07,
|
|
"loss": 0.0565,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.005538248528902735,
|
|
"grad_norm": 2.2332890033721924,
|
|
"learning_rate": 2.45398773006135e-07,
|
|
"loss": 0.0418,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.007384331371870312,
|
|
"grad_norm": 1.7750693559646606,
|
|
"learning_rate": 3.680981595092025e-07,
|
|
"loss": 0.0424,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.009230414214837892,
|
|
"grad_norm": 2.268247604370117,
|
|
"learning_rate": 4.9079754601227e-07,
|
|
"loss": 0.0743,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.01107649705780547,
|
|
"grad_norm": 2.219888210296631,
|
|
"learning_rate": 6.134969325153375e-07,
|
|
"loss": 0.0636,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.012922579900773046,
|
|
"grad_norm": 2.5264432430267334,
|
|
"learning_rate": 7.36196319018405e-07,
|
|
"loss": 0.0736,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.014768662743740625,
|
|
"grad_norm": 1.7726325988769531,
|
|
"learning_rate": 8.588957055214725e-07,
|
|
"loss": 0.0595,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.016614745586708203,
|
|
"grad_norm": 2.682034969329834,
|
|
"learning_rate": 9.8159509202454e-07,
|
|
"loss": 0.0871,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.018460828429675783,
|
|
"grad_norm": 0.9481790661811829,
|
|
"learning_rate": 1.1042944785276075e-06,
|
|
"loss": 0.0404,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.02030691127264336,
|
|
"grad_norm": 1.4664937257766724,
|
|
"learning_rate": 1.226993865030675e-06,
|
|
"loss": 0.0381,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.02215299411561094,
|
|
"grad_norm": 1.3055261373519897,
|
|
"learning_rate": 1.3496932515337425e-06,
|
|
"loss": 0.0468,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.023999076958578516,
|
|
"grad_norm": 0.9701064229011536,
|
|
"learning_rate": 1.47239263803681e-06,
|
|
"loss": 0.0306,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.025845159801546093,
|
|
"grad_norm": 0.937262773513794,
|
|
"learning_rate": 1.5950920245398775e-06,
|
|
"loss": 0.0396,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.027691242644513673,
|
|
"grad_norm": 1.1696547269821167,
|
|
"learning_rate": 1.717791411042945e-06,
|
|
"loss": 0.0501,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.02953732548748125,
|
|
"grad_norm": 0.7118489146232605,
|
|
"learning_rate": 1.8404907975460124e-06,
|
|
"loss": 0.0434,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.03138340833044883,
|
|
"grad_norm": 1.077755093574524,
|
|
"learning_rate": 1.96319018404908e-06,
|
|
"loss": 0.029,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.033229491173416406,
|
|
"grad_norm": 1.2112401723861694,
|
|
"learning_rate": 2.085889570552147e-06,
|
|
"loss": 0.019,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.03507557401638398,
|
|
"grad_norm": 0.7316231727600098,
|
|
"learning_rate": 2.208588957055215e-06,
|
|
"loss": 0.0331,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.036921656859351566,
|
|
"grad_norm": 0.5681014657020569,
|
|
"learning_rate": 2.331288343558282e-06,
|
|
"loss": 0.0272,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.03876773970231914,
|
|
"grad_norm": 0.49111905694007874,
|
|
"learning_rate": 2.45398773006135e-06,
|
|
"loss": 0.0509,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.04061382254528672,
|
|
"grad_norm": 0.3748130202293396,
|
|
"learning_rate": 2.5766871165644175e-06,
|
|
"loss": 0.0143,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.042459905388254296,
|
|
"grad_norm": 0.5386937260627747,
|
|
"learning_rate": 2.699386503067485e-06,
|
|
"loss": 0.0212,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.04430598823122188,
|
|
"grad_norm": 0.3053921163082123,
|
|
"learning_rate": 2.822085889570552e-06,
|
|
"loss": 0.0205,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.046152071074189456,
|
|
"grad_norm": 1.0585299730300903,
|
|
"learning_rate": 2.94478527607362e-06,
|
|
"loss": 0.0166,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.04799815391715703,
|
|
"grad_norm": 0.41786885261535645,
|
|
"learning_rate": 3.0674846625766875e-06,
|
|
"loss": 0.0199,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.04984423676012461,
|
|
"grad_norm": 0.3514385223388672,
|
|
"learning_rate": 3.190184049079755e-06,
|
|
"loss": 0.038,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.051690319603092186,
|
|
"grad_norm": 0.3386166989803314,
|
|
"learning_rate": 3.312883435582822e-06,
|
|
"loss": 0.0206,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.05353640244605977,
|
|
"grad_norm": 0.22080476582050323,
|
|
"learning_rate": 3.43558282208589e-06,
|
|
"loss": 0.0108,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.055382485289027346,
|
|
"grad_norm": 0.2033078372478485,
|
|
"learning_rate": 3.5582822085889574e-06,
|
|
"loss": 0.0133,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.05722856813199492,
|
|
"grad_norm": 0.27421537041664124,
|
|
"learning_rate": 3.680981595092025e-06,
|
|
"loss": 0.0195,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.0590746509749625,
|
|
"grad_norm": 0.20013940334320068,
|
|
"learning_rate": 3.8036809815950928e-06,
|
|
"loss": 0.0117,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.06092073381793008,
|
|
"grad_norm": 0.22301937639713287,
|
|
"learning_rate": 3.92638036809816e-06,
|
|
"loss": 0.0225,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.06276681666089766,
|
|
"grad_norm": 0.19992490112781525,
|
|
"learning_rate": 4.049079754601227e-06,
|
|
"loss": 0.0198,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.06461289950386524,
|
|
"grad_norm": 0.1544710397720337,
|
|
"learning_rate": 4.171779141104294e-06,
|
|
"loss": 0.0113,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.06645898234683281,
|
|
"grad_norm": 0.1416219025850296,
|
|
"learning_rate": 4.294478527607362e-06,
|
|
"loss": 0.0111,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.0683050651898004,
|
|
"grad_norm": 0.18002353608608246,
|
|
"learning_rate": 4.41717791411043e-06,
|
|
"loss": 0.0138,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.07015114803276797,
|
|
"grad_norm": 0.24586960673332214,
|
|
"learning_rate": 4.539877300613497e-06,
|
|
"loss": 0.0261,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.07199723087573555,
|
|
"grad_norm": 0.23644472658634186,
|
|
"learning_rate": 4.662576687116564e-06,
|
|
"loss": 0.0083,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.07384331371870313,
|
|
"grad_norm": 0.2607399821281433,
|
|
"learning_rate": 4.785276073619632e-06,
|
|
"loss": 0.0112,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0756893965616707,
|
|
"grad_norm": 0.2863721251487732,
|
|
"learning_rate": 4.9079754601227e-06,
|
|
"loss": 0.061,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.07753547940463829,
|
|
"grad_norm": 0.2519051730632782,
|
|
"learning_rate": 5.030674846625767e-06,
|
|
"loss": 0.0168,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.07938156224760586,
|
|
"grad_norm": 0.1219317764043808,
|
|
"learning_rate": 5.153374233128835e-06,
|
|
"loss": 0.0201,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.08122764509057344,
|
|
"grad_norm": 0.20760662853717804,
|
|
"learning_rate": 5.276073619631902e-06,
|
|
"loss": 0.0079,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.08307372793354102,
|
|
"grad_norm": 0.1823451817035675,
|
|
"learning_rate": 5.39877300613497e-06,
|
|
"loss": 0.0139,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.08491981077650859,
|
|
"grad_norm": 0.2157115638256073,
|
|
"learning_rate": 5.521472392638038e-06,
|
|
"loss": 0.0122,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.08676589361947618,
|
|
"grad_norm": 0.2477337270975113,
|
|
"learning_rate": 5.644171779141104e-06,
|
|
"loss": 0.0506,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.08861197646244376,
|
|
"grad_norm": 0.1235961839556694,
|
|
"learning_rate": 5.766871165644172e-06,
|
|
"loss": 0.0066,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.09045805930541133,
|
|
"grad_norm": 0.19829532504081726,
|
|
"learning_rate": 5.88957055214724e-06,
|
|
"loss": 0.0348,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.09230414214837891,
|
|
"grad_norm": 0.22850783169269562,
|
|
"learning_rate": 6.012269938650307e-06,
|
|
"loss": 0.0162,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.09415022499134648,
|
|
"grad_norm": 0.1670844852924347,
|
|
"learning_rate": 6.134969325153375e-06,
|
|
"loss": 0.0152,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.09599630783431407,
|
|
"grad_norm": 0.27324262261390686,
|
|
"learning_rate": 6.257668711656443e-06,
|
|
"loss": 0.0143,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.09784239067728165,
|
|
"grad_norm": 0.20399723947048187,
|
|
"learning_rate": 6.38036809815951e-06,
|
|
"loss": 0.0121,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.09968847352024922,
|
|
"grad_norm": 0.1744169145822525,
|
|
"learning_rate": 6.503067484662578e-06,
|
|
"loss": 0.0079,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.1015345563632168,
|
|
"grad_norm": 0.15908847749233246,
|
|
"learning_rate": 6.625766871165644e-06,
|
|
"loss": 0.0093,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.10338063920618437,
|
|
"grad_norm": 0.11401887983083725,
|
|
"learning_rate": 6.748466257668712e-06,
|
|
"loss": 0.0064,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.10522672204915196,
|
|
"grad_norm": 0.16666960716247559,
|
|
"learning_rate": 6.87116564417178e-06,
|
|
"loss": 0.0095,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.10707280489211954,
|
|
"grad_norm": 0.1956368088722229,
|
|
"learning_rate": 6.993865030674847e-06,
|
|
"loss": 0.0272,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.10891888773508711,
|
|
"grad_norm": 0.2012910097837448,
|
|
"learning_rate": 7.116564417177915e-06,
|
|
"loss": 0.008,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.11076497057805469,
|
|
"grad_norm": 0.1742282509803772,
|
|
"learning_rate": 7.239263803680983e-06,
|
|
"loss": 0.0131,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.11261105342102228,
|
|
"grad_norm": 0.22643107175827026,
|
|
"learning_rate": 7.36196319018405e-06,
|
|
"loss": 0.0297,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.11445713626398984,
|
|
"grad_norm": 0.25800758600234985,
|
|
"learning_rate": 7.484662576687118e-06,
|
|
"loss": 0.0232,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.11630321910695743,
|
|
"grad_norm": 0.20819664001464844,
|
|
"learning_rate": 7.6073619631901856e-06,
|
|
"loss": 0.0148,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.118149301949925,
|
|
"grad_norm": 0.3256385028362274,
|
|
"learning_rate": 7.730061349693252e-06,
|
|
"loss": 0.0171,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.11999538479289258,
|
|
"grad_norm": 0.1211743876338005,
|
|
"learning_rate": 7.85276073619632e-06,
|
|
"loss": 0.0069,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.12184146763586017,
|
|
"grad_norm": 0.11064834147691727,
|
|
"learning_rate": 7.975460122699386e-06,
|
|
"loss": 0.0096,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.12368755047882773,
|
|
"grad_norm": 0.3300691843032837,
|
|
"learning_rate": 8.098159509202455e-06,
|
|
"loss": 0.0217,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.12553363332179532,
|
|
"grad_norm": 0.11716436594724655,
|
|
"learning_rate": 8.220858895705522e-06,
|
|
"loss": 0.007,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.1273797161647629,
|
|
"grad_norm": 0.19859378039836884,
|
|
"learning_rate": 8.343558282208589e-06,
|
|
"loss": 0.0105,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.12922579900773049,
|
|
"grad_norm": 0.24976450204849243,
|
|
"learning_rate": 8.466257668711658e-06,
|
|
"loss": 0.0371,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.13107188185069804,
|
|
"grad_norm": 0.26116761565208435,
|
|
"learning_rate": 8.588957055214725e-06,
|
|
"loss": 0.017,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.13291796469366562,
|
|
"grad_norm": 0.15941867232322693,
|
|
"learning_rate": 8.711656441717792e-06,
|
|
"loss": 0.0166,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.1347640475366332,
|
|
"grad_norm": 0.28257545828819275,
|
|
"learning_rate": 8.83435582822086e-06,
|
|
"loss": 0.0088,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.1366101303796008,
|
|
"grad_norm": 0.1715109497308731,
|
|
"learning_rate": 8.957055214723927e-06,
|
|
"loss": 0.0106,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.13845621322256838,
|
|
"grad_norm": 0.16345541179180145,
|
|
"learning_rate": 9.079754601226994e-06,
|
|
"loss": 0.0228,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.14030229606553593,
|
|
"grad_norm": 0.1364922970533371,
|
|
"learning_rate": 9.202453987730062e-06,
|
|
"loss": 0.0136,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.14214837890850351,
|
|
"grad_norm": 0.13153736293315887,
|
|
"learning_rate": 9.325153374233129e-06,
|
|
"loss": 0.0064,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.1439944617514711,
|
|
"grad_norm": 0.16387970745563507,
|
|
"learning_rate": 9.447852760736197e-06,
|
|
"loss": 0.0076,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.14584054459443868,
|
|
"grad_norm": 0.3637576997280121,
|
|
"learning_rate": 9.570552147239264e-06,
|
|
"loss": 0.0116,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.14768662743740626,
|
|
"grad_norm": 0.12230011820793152,
|
|
"learning_rate": 9.693251533742331e-06,
|
|
"loss": 0.0058,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.14953271028037382,
|
|
"grad_norm": 0.12101858854293823,
|
|
"learning_rate": 9.8159509202454e-06,
|
|
"loss": 0.0043,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.1513787931233414,
|
|
"grad_norm": 0.22512343525886536,
|
|
"learning_rate": 9.938650306748467e-06,
|
|
"loss": 0.0114,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.153224875966309,
|
|
"grad_norm": 0.10917024314403534,
|
|
"learning_rate": 1.0061349693251534e-05,
|
|
"loss": 0.0062,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.15507095880927657,
|
|
"grad_norm": 0.1253615766763687,
|
|
"learning_rate": 1.0184049079754601e-05,
|
|
"loss": 0.0053,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.15691704165224415,
|
|
"grad_norm": 0.12273586541414261,
|
|
"learning_rate": 1.030674846625767e-05,
|
|
"loss": 0.0052,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.1587631244952117,
|
|
"grad_norm": 0.161468505859375,
|
|
"learning_rate": 1.0429447852760737e-05,
|
|
"loss": 0.0138,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.1606092073381793,
|
|
"grad_norm": 0.13661441206932068,
|
|
"learning_rate": 1.0552147239263804e-05,
|
|
"loss": 0.0059,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.16245529018114688,
|
|
"grad_norm": 0.24264393746852875,
|
|
"learning_rate": 1.0674846625766873e-05,
|
|
"loss": 0.0117,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.16430137302411446,
|
|
"grad_norm": 0.19432157278060913,
|
|
"learning_rate": 1.079754601226994e-05,
|
|
"loss": 0.0152,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.16614745586708204,
|
|
"grad_norm": 0.13859547674655914,
|
|
"learning_rate": 1.0920245398773005e-05,
|
|
"loss": 0.0063,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.1679935387100496,
|
|
"grad_norm": 0.22563207149505615,
|
|
"learning_rate": 1.1042944785276076e-05,
|
|
"loss": 0.0109,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.16983962155301718,
|
|
"grad_norm": 0.14970119297504425,
|
|
"learning_rate": 1.1165644171779141e-05,
|
|
"loss": 0.0056,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.16983962155301718,
|
|
"eval_loss": 0.009778047911822796,
|
|
"eval_runtime": 91.7649,
|
|
"eval_samples_per_second": 9.949,
|
|
"eval_steps_per_second": 4.98,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.17168570439598477,
|
|
"grad_norm": 0.19423732161521912,
|
|
"learning_rate": 1.1288343558282208e-05,
|
|
"loss": 0.0312,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.17353178723895235,
|
|
"grad_norm": 0.1405217945575714,
|
|
"learning_rate": 1.1411042944785277e-05,
|
|
"loss": 0.0059,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.17537787008191993,
|
|
"grad_norm": 0.16466659307479858,
|
|
"learning_rate": 1.1533742331288344e-05,
|
|
"loss": 0.0117,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.17722395292488752,
|
|
"grad_norm": 0.15254826843738556,
|
|
"learning_rate": 1.1656441717791411e-05,
|
|
"loss": 0.0116,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.17907003576785507,
|
|
"grad_norm": 0.2417498081922531,
|
|
"learning_rate": 1.177914110429448e-05,
|
|
"loss": 0.0055,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.18091611861082266,
|
|
"grad_norm": 0.15393143892288208,
|
|
"learning_rate": 1.1901840490797547e-05,
|
|
"loss": 0.0072,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.18276220145379024,
|
|
"grad_norm": 0.09935597330331802,
|
|
"learning_rate": 1.2024539877300614e-05,
|
|
"loss": 0.0041,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.18460828429675782,
|
|
"grad_norm": 0.2075463831424713,
|
|
"learning_rate": 1.2147239263803683e-05,
|
|
"loss": 0.0225,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.1864543671397254,
|
|
"grad_norm": 0.1333729475736618,
|
|
"learning_rate": 1.226993865030675e-05,
|
|
"loss": 0.0065,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.18830044998269296,
|
|
"grad_norm": 0.15017342567443848,
|
|
"learning_rate": 1.2392638036809817e-05,
|
|
"loss": 0.0068,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.19014653282566055,
|
|
"grad_norm": 0.10981283336877823,
|
|
"learning_rate": 1.2515337423312886e-05,
|
|
"loss": 0.0059,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.19199261566862813,
|
|
"grad_norm": 0.12825554609298706,
|
|
"learning_rate": 1.2638036809815953e-05,
|
|
"loss": 0.0131,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.19383869851159571,
|
|
"grad_norm": 0.14562994241714478,
|
|
"learning_rate": 1.276073619631902e-05,
|
|
"loss": 0.0207,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.1956847813545633,
|
|
"grad_norm": 0.1667003035545349,
|
|
"learning_rate": 1.2883435582822085e-05,
|
|
"loss": 0.0081,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.19753086419753085,
|
|
"grad_norm": 0.13632525503635406,
|
|
"learning_rate": 1.3006134969325156e-05,
|
|
"loss": 0.0086,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.19937694704049844,
|
|
"grad_norm": 0.11093810200691223,
|
|
"learning_rate": 1.3128834355828221e-05,
|
|
"loss": 0.0052,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.20122302988346602,
|
|
"grad_norm": 0.1157720610499382,
|
|
"learning_rate": 1.3251533742331288e-05,
|
|
"loss": 0.0056,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.2030691127264336,
|
|
"grad_norm": 0.11683616042137146,
|
|
"learning_rate": 1.3374233128834357e-05,
|
|
"loss": 0.011,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.2049151955694012,
|
|
"grad_norm": 0.16558395326137543,
|
|
"learning_rate": 1.3496932515337424e-05,
|
|
"loss": 0.0126,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.20676127841236874,
|
|
"grad_norm": 0.11443828046321869,
|
|
"learning_rate": 1.3619631901840491e-05,
|
|
"loss": 0.003,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.20860736125533633,
|
|
"grad_norm": 0.17744530737400055,
|
|
"learning_rate": 1.374233128834356e-05,
|
|
"loss": 0.0128,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.2104534440983039,
|
|
"grad_norm": 0.1902821660041809,
|
|
"learning_rate": 1.3865030674846627e-05,
|
|
"loss": 0.0117,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.2122995269412715,
|
|
"grad_norm": 0.1485978364944458,
|
|
"learning_rate": 1.3987730061349694e-05,
|
|
"loss": 0.0115,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.21414560978423908,
|
|
"grad_norm": 0.1651626080274582,
|
|
"learning_rate": 1.4110429447852763e-05,
|
|
"loss": 0.0073,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.21599169262720663,
|
|
"grad_norm": 0.1463606357574463,
|
|
"learning_rate": 1.423312883435583e-05,
|
|
"loss": 0.0078,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.21783777547017422,
|
|
"grad_norm": 0.16828493773937225,
|
|
"learning_rate": 1.4355828220858897e-05,
|
|
"loss": 0.0062,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.2196838583131418,
|
|
"grad_norm": 0.1243981420993805,
|
|
"learning_rate": 1.4478527607361965e-05,
|
|
"loss": 0.0078,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.22152994115610938,
|
|
"grad_norm": 0.25788137316703796,
|
|
"learning_rate": 1.4601226993865032e-05,
|
|
"loss": 0.0103,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.22337602399907697,
|
|
"grad_norm": 0.2088393270969391,
|
|
"learning_rate": 1.47239263803681e-05,
|
|
"loss": 0.0123,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.22522210684204455,
|
|
"grad_norm": 0.11959819495677948,
|
|
"learning_rate": 1.4846625766871168e-05,
|
|
"loss": 0.0067,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.2270681896850121,
|
|
"grad_norm": 11.672961235046387,
|
|
"learning_rate": 1.4969325153374235e-05,
|
|
"loss": 0.0367,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.2289142725279797,
|
|
"grad_norm": 0.11650065332651138,
|
|
"learning_rate": 1.50920245398773e-05,
|
|
"loss": 0.0039,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.23076035537094727,
|
|
"grad_norm": 0.14429336786270142,
|
|
"learning_rate": 1.5214723926380371e-05,
|
|
"loss": 0.0071,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.23260643821391486,
|
|
"grad_norm": 1.8753544092178345,
|
|
"learning_rate": 1.5337423312883436e-05,
|
|
"loss": 0.0162,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.23445252105688244,
|
|
"grad_norm": 0.12293099611997604,
|
|
"learning_rate": 1.5460122699386504e-05,
|
|
"loss": 0.0093,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.23629860389985,
|
|
"grad_norm": 0.1450912058353424,
|
|
"learning_rate": 1.5582822085889574e-05,
|
|
"loss": 0.0061,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.23814468674281758,
|
|
"grad_norm": 0.26840445399284363,
|
|
"learning_rate": 1.570552147239264e-05,
|
|
"loss": 0.0127,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.23999076958578516,
|
|
"grad_norm": 0.33744606375694275,
|
|
"learning_rate": 1.5828220858895708e-05,
|
|
"loss": 0.0196,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.24183685242875275,
|
|
"grad_norm": 0.18890263140201569,
|
|
"learning_rate": 1.5950920245398772e-05,
|
|
"loss": 0.0107,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.24368293527172033,
|
|
"grad_norm": 0.15780992805957794,
|
|
"learning_rate": 1.6073619631901842e-05,
|
|
"loss": 0.0081,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.24552901811468789,
|
|
"grad_norm": 0.13231074810028076,
|
|
"learning_rate": 1.619631901840491e-05,
|
|
"loss": 0.0068,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.24737510095765547,
|
|
"grad_norm": 0.13381816446781158,
|
|
"learning_rate": 1.6319018404907976e-05,
|
|
"loss": 0.0095,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.24922118380062305,
|
|
"grad_norm": 0.18281026184558868,
|
|
"learning_rate": 1.6441717791411043e-05,
|
|
"loss": 0.0104,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.25106726664359064,
|
|
"grad_norm": 0.5789101719856262,
|
|
"learning_rate": 1.656441717791411e-05,
|
|
"loss": 0.0074,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.2529133494865582,
|
|
"grad_norm": 0.1189756840467453,
|
|
"learning_rate": 1.6687116564417178e-05,
|
|
"loss": 0.0074,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.2547594323295258,
|
|
"grad_norm": 0.11586418002843857,
|
|
"learning_rate": 1.6809815950920248e-05,
|
|
"loss": 0.0042,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.25660551517249336,
|
|
"grad_norm": 0.23946896195411682,
|
|
"learning_rate": 1.6932515337423315e-05,
|
|
"loss": 0.0346,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.25845159801546097,
|
|
"grad_norm": 0.16509361565113068,
|
|
"learning_rate": 1.7055214723926382e-05,
|
|
"loss": 0.0068,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.2602976808584285,
|
|
"grad_norm": 0.29569926857948303,
|
|
"learning_rate": 1.717791411042945e-05,
|
|
"loss": 0.0438,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.2621437637013961,
|
|
"grad_norm": 0.24651439487934113,
|
|
"learning_rate": 1.7300613496932516e-05,
|
|
"loss": 0.0079,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.2639898465443637,
|
|
"grad_norm": 0.14485400915145874,
|
|
"learning_rate": 1.7423312883435583e-05,
|
|
"loss": 0.0075,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.26583592938733125,
|
|
"grad_norm": 0.12196393311023712,
|
|
"learning_rate": 1.7546012269938654e-05,
|
|
"loss": 0.0069,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.26768201223029886,
|
|
"grad_norm": 0.10613025724887848,
|
|
"learning_rate": 1.766871165644172e-05,
|
|
"loss": 0.0047,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.2695280950732664,
|
|
"grad_norm": 0.2299281656742096,
|
|
"learning_rate": 1.7791411042944788e-05,
|
|
"loss": 0.0152,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.27137417791623397,
|
|
"grad_norm": 0.16602513194084167,
|
|
"learning_rate": 1.7914110429447855e-05,
|
|
"loss": 0.0111,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.2732202607592016,
|
|
"grad_norm": 0.1011560782790184,
|
|
"learning_rate": 1.8036809815950922e-05,
|
|
"loss": 0.0036,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.27506634360216914,
|
|
"grad_norm": 0.12293390929698944,
|
|
"learning_rate": 1.815950920245399e-05,
|
|
"loss": 0.006,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.27691242644513675,
|
|
"grad_norm": 0.09763162583112717,
|
|
"learning_rate": 1.828220858895706e-05,
|
|
"loss": 0.0062,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.2787585092881043,
|
|
"grad_norm": 0.12489021569490433,
|
|
"learning_rate": 1.8404907975460123e-05,
|
|
"loss": 0.0057,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.28060459213107186,
|
|
"grad_norm": 0.18567755818367004,
|
|
"learning_rate": 1.852760736196319e-05,
|
|
"loss": 0.0197,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.2824506749740395,
|
|
"grad_norm": 0.3939504325389862,
|
|
"learning_rate": 1.8650306748466257e-05,
|
|
"loss": 0.0085,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.28429675781700703,
|
|
"grad_norm": 0.3876646161079407,
|
|
"learning_rate": 1.8773006134969328e-05,
|
|
"loss": 0.0312,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.28614284065997464,
|
|
"grad_norm": 0.32117509841918945,
|
|
"learning_rate": 1.8895705521472395e-05,
|
|
"loss": 0.0257,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.2879889235029422,
|
|
"grad_norm": 0.1188175231218338,
|
|
"learning_rate": 1.9018404907975462e-05,
|
|
"loss": 0.0058,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.28983500634590975,
|
|
"grad_norm": 0.20765401422977448,
|
|
"learning_rate": 1.914110429447853e-05,
|
|
"loss": 0.0379,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.29168108918887736,
|
|
"grad_norm": 0.16120277345180511,
|
|
"learning_rate": 1.9263803680981596e-05,
|
|
"loss": 0.0085,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.2935271720318449,
|
|
"grad_norm": 0.2422802895307541,
|
|
"learning_rate": 1.9386503067484663e-05,
|
|
"loss": 0.0183,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.29537325487481253,
|
|
"grad_norm": 0.1309472918510437,
|
|
"learning_rate": 1.9509202453987733e-05,
|
|
"loss": 0.0092,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.2972193377177801,
|
|
"grad_norm": 0.18968728184700012,
|
|
"learning_rate": 1.96319018404908e-05,
|
|
"loss": 0.0076,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.29906542056074764,
|
|
"grad_norm": 0.5624260902404785,
|
|
"learning_rate": 1.9754601226993868e-05,
|
|
"loss": 0.0162,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.30091150340371525,
|
|
"grad_norm": 0.18030855059623718,
|
|
"learning_rate": 1.9877300613496935e-05,
|
|
"loss": 0.0109,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.3027575862466828,
|
|
"grad_norm": 0.20087158679962158,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0082,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.3046036690896504,
|
|
"grad_norm": 0.13237029314041138,
|
|
"learning_rate": 1.9999976944161012e-05,
|
|
"loss": 0.0056,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.306449751932618,
|
|
"grad_norm": 0.2552473843097687,
|
|
"learning_rate": 1.9999907776750355e-05,
|
|
"loss": 0.0239,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.30829583477558553,
|
|
"grad_norm": 0.09501585364341736,
|
|
"learning_rate": 1.9999792498086977e-05,
|
|
"loss": 0.0074,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.31014191761855314,
|
|
"grad_norm": 0.2037140280008316,
|
|
"learning_rate": 1.9999631108702447e-05,
|
|
"loss": 0.0108,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.3119880004615207,
|
|
"grad_norm": 0.2303272783756256,
|
|
"learning_rate": 1.9999423609340957e-05,
|
|
"loss": 0.0156,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.3138340833044883,
|
|
"grad_norm": 0.3203318417072296,
|
|
"learning_rate": 1.9999170000959317e-05,
|
|
"loss": 0.009,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.31568016614745587,
|
|
"grad_norm": 0.15373510122299194,
|
|
"learning_rate": 1.9998870284726968e-05,
|
|
"loss": 0.0114,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.3175262489904234,
|
|
"grad_norm": 0.2735464870929718,
|
|
"learning_rate": 1.9998524462025943e-05,
|
|
"loss": 0.0071,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.31937233183339103,
|
|
"grad_norm": 0.26551553606987,
|
|
"learning_rate": 1.9998132534450893e-05,
|
|
"loss": 0.0193,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.3212184146763586,
|
|
"grad_norm": 0.13136360049247742,
|
|
"learning_rate": 1.9997694503809058e-05,
|
|
"loss": 0.0069,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.3230644975193262,
|
|
"grad_norm": 0.14918763935565948,
|
|
"learning_rate": 1.9997210372120276e-05,
|
|
"loss": 0.009,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.32491058036229375,
|
|
"grad_norm": 0.1694311797618866,
|
|
"learning_rate": 1.9996680141616956e-05,
|
|
"loss": 0.0091,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.3267566632052613,
|
|
"grad_norm": 0.18462465703487396,
|
|
"learning_rate": 1.999610381474408e-05,
|
|
"loss": 0.0114,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.3286027460482289,
|
|
"grad_norm": 0.12807750701904297,
|
|
"learning_rate": 1.999548139415919e-05,
|
|
"loss": 0.0074,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.3304488288911965,
|
|
"grad_norm": 0.21351908147335052,
|
|
"learning_rate": 1.9994812882732364e-05,
|
|
"loss": 0.0184,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.3322949117341641,
|
|
"grad_norm": 0.1473976969718933,
|
|
"learning_rate": 1.9994098283546234e-05,
|
|
"loss": 0.0123,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.33414099457713164,
|
|
"grad_norm": 0.12017160654067993,
|
|
"learning_rate": 1.9993337599895925e-05,
|
|
"loss": 0.0053,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.3359870774200992,
|
|
"grad_norm": 0.24068444967269897,
|
|
"learning_rate": 1.999253083528908e-05,
|
|
"loss": 0.0116,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.3378331602630668,
|
|
"grad_norm": 0.21027851104736328,
|
|
"learning_rate": 1.9991677993445832e-05,
|
|
"loss": 0.0032,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.33967924310603437,
|
|
"grad_norm": 0.12754715979099274,
|
|
"learning_rate": 1.999077907829877e-05,
|
|
"loss": 0.0115,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.33967924310603437,
|
|
"eval_loss": 0.01073263119906187,
|
|
"eval_runtime": 260.2018,
|
|
"eval_samples_per_second": 3.509,
|
|
"eval_steps_per_second": 1.756,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.341525325949002,
|
|
"grad_norm": 0.1997697800397873,
|
|
"learning_rate": 1.9989834093992945e-05,
|
|
"loss": 0.0137,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.34337140879196953,
|
|
"grad_norm": 0.11237557232379913,
|
|
"learning_rate": 1.998884304488584e-05,
|
|
"loss": 0.004,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.3452174916349371,
|
|
"grad_norm": 0.3121223747730255,
|
|
"learning_rate": 1.9987805935547347e-05,
|
|
"loss": 0.0228,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.3470635744779047,
|
|
"grad_norm": 0.25514844059944153,
|
|
"learning_rate": 1.998672277075975e-05,
|
|
"loss": 0.0074,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.34890965732087226,
|
|
"grad_norm": 0.10668331384658813,
|
|
"learning_rate": 1.998559355551771e-05,
|
|
"loss": 0.0075,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.35075574016383987,
|
|
"grad_norm": 0.14544381201267242,
|
|
"learning_rate": 1.9984418295028217e-05,
|
|
"loss": 0.0082,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.3526018230068074,
|
|
"grad_norm": 0.22068175673484802,
|
|
"learning_rate": 1.998319699471061e-05,
|
|
"loss": 0.0221,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.35444790584977504,
|
|
"grad_norm": 0.1968519538640976,
|
|
"learning_rate": 1.9981929660196492e-05,
|
|
"loss": 0.0134,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.3562939886927426,
|
|
"grad_norm": 0.09916682541370392,
|
|
"learning_rate": 1.9980616297329764e-05,
|
|
"loss": 0.0062,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.35814007153571015,
|
|
"grad_norm": 0.1616455614566803,
|
|
"learning_rate": 1.9979256912166565e-05,
|
|
"loss": 0.0313,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.35998615437867776,
|
|
"grad_norm": 0.10900194197893143,
|
|
"learning_rate": 1.9977851510975244e-05,
|
|
"loss": 0.0063,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.3618322372216453,
|
|
"grad_norm": 0.16465668380260468,
|
|
"learning_rate": 1.997640010023634e-05,
|
|
"loss": 0.0187,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.3636783200646129,
|
|
"grad_norm": 0.09527245908975601,
|
|
"learning_rate": 1.997490268664256e-05,
|
|
"loss": 0.003,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.3655244029075805,
|
|
"grad_norm": 0.12054583430290222,
|
|
"learning_rate": 1.997335927709872e-05,
|
|
"loss": 0.0051,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.36737048575054804,
|
|
"grad_norm": 0.17793945968151093,
|
|
"learning_rate": 1.9971769878721747e-05,
|
|
"loss": 0.0214,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.36921656859351565,
|
|
"grad_norm": 0.43558791279792786,
|
|
"learning_rate": 1.9970134498840617e-05,
|
|
"loss": 0.0165,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.3710626514364832,
|
|
"grad_norm": 0.12647399306297302,
|
|
"learning_rate": 1.9968453144996345e-05,
|
|
"loss": 0.0081,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.3729087342794508,
|
|
"grad_norm": 0.12886571884155273,
|
|
"learning_rate": 1.9966725824941933e-05,
|
|
"loss": 0.0086,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.37475481712241837,
|
|
"grad_norm": 0.12100410461425781,
|
|
"learning_rate": 1.996495254664235e-05,
|
|
"loss": 0.005,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.3766008999653859,
|
|
"grad_norm": 0.15734834969043732,
|
|
"learning_rate": 1.9963133318274475e-05,
|
|
"loss": 0.0154,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.37844698280835354,
|
|
"grad_norm": 0.20007383823394775,
|
|
"learning_rate": 1.9961268148227077e-05,
|
|
"loss": 0.0099,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.3802930656513211,
|
|
"grad_norm": 0.13359728455543518,
|
|
"learning_rate": 1.9959357045100764e-05,
|
|
"loss": 0.0065,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.3821391484942887,
|
|
"grad_norm": 0.20117157697677612,
|
|
"learning_rate": 1.995740001770796e-05,
|
|
"loss": 0.0165,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.38398523133725626,
|
|
"grad_norm": 0.3189203441143036,
|
|
"learning_rate": 1.995539707507284e-05,
|
|
"loss": 0.0104,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.3858313141802238,
|
|
"grad_norm": 0.13761986792087555,
|
|
"learning_rate": 1.995334822643131e-05,
|
|
"loss": 0.0342,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.38767739702319143,
|
|
"grad_norm": 0.15731483697891235,
|
|
"learning_rate": 1.9951253481230955e-05,
|
|
"loss": 0.0125,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.389523479866159,
|
|
"grad_norm": 0.11964155733585358,
|
|
"learning_rate": 1.9949112849131005e-05,
|
|
"loss": 0.026,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.3913695627091266,
|
|
"grad_norm": 0.3471106290817261,
|
|
"learning_rate": 1.9946926340002262e-05,
|
|
"loss": 0.0171,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.39321564555209415,
|
|
"grad_norm": 0.08175747841596603,
|
|
"learning_rate": 1.9944693963927092e-05,
|
|
"loss": 0.0044,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.3950617283950617,
|
|
"grad_norm": 0.11287415027618408,
|
|
"learning_rate": 1.9942415731199357e-05,
|
|
"loss": 0.0062,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.3969078112380293,
|
|
"grad_norm": 0.2272588163614273,
|
|
"learning_rate": 1.9940091652324363e-05,
|
|
"loss": 0.019,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.3987538940809969,
|
|
"grad_norm": 0.19430284202098846,
|
|
"learning_rate": 1.993772173801884e-05,
|
|
"loss": 0.0076,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.4005999769239645,
|
|
"grad_norm": 0.34528452157974243,
|
|
"learning_rate": 1.993530599921085e-05,
|
|
"loss": 0.0296,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.40244605976693204,
|
|
"grad_norm": 0.25836917757987976,
|
|
"learning_rate": 1.9932844447039775e-05,
|
|
"loss": 0.0142,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.4042921426098996,
|
|
"grad_norm": 0.10283704102039337,
|
|
"learning_rate": 1.9930337092856243e-05,
|
|
"loss": 0.0044,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.4061382254528672,
|
|
"grad_norm": 0.2654596269130707,
|
|
"learning_rate": 1.9927783948222084e-05,
|
|
"loss": 0.008,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.40798430829583476,
|
|
"grad_norm": 0.17902354896068573,
|
|
"learning_rate": 1.992518502491028e-05,
|
|
"loss": 0.017,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.4098303911388024,
|
|
"grad_norm": 0.23217414319515228,
|
|
"learning_rate": 1.9922540334904898e-05,
|
|
"loss": 0.0115,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.41167647398176993,
|
|
"grad_norm": 0.1511717438697815,
|
|
"learning_rate": 1.991984989040105e-05,
|
|
"loss": 0.0108,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.4135225568247375,
|
|
"grad_norm": 0.24292099475860596,
|
|
"learning_rate": 1.9917113703804828e-05,
|
|
"loss": 0.0085,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.4153686396677051,
|
|
"grad_norm": 0.16115987300872803,
|
|
"learning_rate": 1.9914331787733246e-05,
|
|
"loss": 0.0053,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.41721472251067265,
|
|
"grad_norm": 0.11056244373321533,
|
|
"learning_rate": 1.9911504155014187e-05,
|
|
"loss": 0.0074,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.41906080535364026,
|
|
"grad_norm": 0.1903485357761383,
|
|
"learning_rate": 1.990863081868634e-05,
|
|
"loss": 0.0187,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.4209068881966078,
|
|
"grad_norm": 0.15223729610443115,
|
|
"learning_rate": 1.9905711791999135e-05,
|
|
"loss": 0.0054,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.4227529710395754,
|
|
"grad_norm": 0.2494320571422577,
|
|
"learning_rate": 1.9902747088412703e-05,
|
|
"loss": 0.0211,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.424599053882543,
|
|
"grad_norm": 0.1578684151172638,
|
|
"learning_rate": 1.9899736721597787e-05,
|
|
"loss": 0.018,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.42644513672551054,
|
|
"grad_norm": 0.10511214286088943,
|
|
"learning_rate": 1.989668070543569e-05,
|
|
"loss": 0.0074,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.42829121956847815,
|
|
"grad_norm": 0.15655289590358734,
|
|
"learning_rate": 1.9893579054018216e-05,
|
|
"loss": 0.0234,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.4301373024114457,
|
|
"grad_norm": 0.27272099256515503,
|
|
"learning_rate": 1.98904317816476e-05,
|
|
"loss": 0.0177,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.43198338525441327,
|
|
"grad_norm": 0.09095671772956848,
|
|
"learning_rate": 1.988723890283645e-05,
|
|
"loss": 0.0034,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.4338294680973809,
|
|
"grad_norm": 0.14508356153964996,
|
|
"learning_rate": 1.9884000432307657e-05,
|
|
"loss": 0.0086,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.43567555094034843,
|
|
"grad_norm": 0.21108239889144897,
|
|
"learning_rate": 1.9880716384994355e-05,
|
|
"loss": 0.0131,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.43752163378331604,
|
|
"grad_norm": 0.14441920816898346,
|
|
"learning_rate": 1.987738677603984e-05,
|
|
"loss": 0.0082,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.4393677166262836,
|
|
"grad_norm": 0.10114026069641113,
|
|
"learning_rate": 1.9874011620797494e-05,
|
|
"loss": 0.0091,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.44121379946925116,
|
|
"grad_norm": 0.11774880439043045,
|
|
"learning_rate": 1.9870590934830726e-05,
|
|
"loss": 0.0037,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.44305988231221877,
|
|
"grad_norm": 0.2125924825668335,
|
|
"learning_rate": 1.986712473391289e-05,
|
|
"loss": 0.01,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.4449059651551863,
|
|
"grad_norm": 0.13634435832500458,
|
|
"learning_rate": 1.9863613034027224e-05,
|
|
"loss": 0.0077,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.44675204799815393,
|
|
"grad_norm": 0.09352454543113708,
|
|
"learning_rate": 1.9860055851366768e-05,
|
|
"loss": 0.0039,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.4485981308411215,
|
|
"grad_norm": 0.15363694727420807,
|
|
"learning_rate": 1.9856453202334277e-05,
|
|
"loss": 0.0092,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.4504442136840891,
|
|
"grad_norm": 0.16866520047187805,
|
|
"learning_rate": 1.985280510354218e-05,
|
|
"loss": 0.0164,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.45229029652705666,
|
|
"grad_norm": 0.16478340327739716,
|
|
"learning_rate": 1.984911157181247e-05,
|
|
"loss": 0.0094,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.4541363793700242,
|
|
"grad_norm": 0.12157045304775238,
|
|
"learning_rate": 1.9845372624176646e-05,
|
|
"loss": 0.0101,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.4559824622129918,
|
|
"grad_norm": 0.1132885068655014,
|
|
"learning_rate": 1.9841588277875613e-05,
|
|
"loss": 0.0062,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.4578285450559594,
|
|
"grad_norm": 0.10305500775575638,
|
|
"learning_rate": 1.9837758550359637e-05,
|
|
"loss": 0.0098,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.459674627898927,
|
|
"grad_norm": 0.11010725051164627,
|
|
"learning_rate": 1.9833883459288223e-05,
|
|
"loss": 0.0079,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.46152071074189455,
|
|
"grad_norm": 0.138060063123703,
|
|
"learning_rate": 1.9829963022530077e-05,
|
|
"loss": 0.0206,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.4633667935848621,
|
|
"grad_norm": 0.08483293652534485,
|
|
"learning_rate": 1.982599725816299e-05,
|
|
"loss": 0.0049,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.4652128764278297,
|
|
"grad_norm": 0.1332448273897171,
|
|
"learning_rate": 1.9821986184473757e-05,
|
|
"loss": 0.0085,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.46705895927079727,
|
|
"grad_norm": 0.10986470431089401,
|
|
"learning_rate": 1.981792981995812e-05,
|
|
"loss": 0.0085,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.4689050421137649,
|
|
"grad_norm": 0.09792988747358322,
|
|
"learning_rate": 1.9813828183320654e-05,
|
|
"loss": 0.0111,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.47075112495673244,
|
|
"grad_norm": 0.10201071947813034,
|
|
"learning_rate": 1.9809681293474693e-05,
|
|
"loss": 0.0062,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.4725972077997,
|
|
"grad_norm": 0.17517031729221344,
|
|
"learning_rate": 1.9805489169542245e-05,
|
|
"loss": 0.0295,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.4744432906426676,
|
|
"grad_norm": 0.07551269978284836,
|
|
"learning_rate": 1.9801251830853895e-05,
|
|
"loss": 0.0032,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.47628937348563516,
|
|
"grad_norm": 0.12221430987119675,
|
|
"learning_rate": 1.9796969296948723e-05,
|
|
"loss": 0.0056,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.47813545632860277,
|
|
"grad_norm": 0.2373015284538269,
|
|
"learning_rate": 1.9792641587574212e-05,
|
|
"loss": 0.0184,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.4799815391715703,
|
|
"grad_norm": 0.1124010682106018,
|
|
"learning_rate": 1.9788268722686153e-05,
|
|
"loss": 0.012,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.4818276220145379,
|
|
"grad_norm": 0.10452152788639069,
|
|
"learning_rate": 1.978385072244857e-05,
|
|
"loss": 0.0183,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.4836737048575055,
|
|
"grad_norm": 0.13262808322906494,
|
|
"learning_rate": 1.9779387607233587e-05,
|
|
"loss": 0.008,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.48551978770047305,
|
|
"grad_norm": 0.10975097864866257,
|
|
"learning_rate": 1.9774879397621387e-05,
|
|
"loss": 0.0043,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.48736587054344066,
|
|
"grad_norm": 0.1568657010793686,
|
|
"learning_rate": 1.977032611440008e-05,
|
|
"loss": 0.0137,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.4892119533864082,
|
|
"grad_norm": 0.12431374937295914,
|
|
"learning_rate": 1.976572777856562e-05,
|
|
"loss": 0.0053,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.49105803622937577,
|
|
"grad_norm": 0.10776858776807785,
|
|
"learning_rate": 1.9761084411321706e-05,
|
|
"loss": 0.0058,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.4929041190723434,
|
|
"grad_norm": 0.1078772023320198,
|
|
"learning_rate": 1.9756396034079678e-05,
|
|
"loss": 0.0089,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.49475020191531094,
|
|
"grad_norm": 0.20356160402297974,
|
|
"learning_rate": 1.9751662668458434e-05,
|
|
"loss": 0.0394,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.49659628475827855,
|
|
"grad_norm": 0.11029177159070969,
|
|
"learning_rate": 1.9746884336284316e-05,
|
|
"loss": 0.0073,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.4984423676012461,
|
|
"grad_norm": 0.15297511219978333,
|
|
"learning_rate": 1.974206105959102e-05,
|
|
"loss": 0.0094,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.5002884504442137,
|
|
"grad_norm": 0.1340862512588501,
|
|
"learning_rate": 1.9737192860619477e-05,
|
|
"loss": 0.0115,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.5021345332871813,
|
|
"grad_norm": 0.12668649852275848,
|
|
"learning_rate": 1.9732279761817774e-05,
|
|
"loss": 0.0079,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.5039806161301489,
|
|
"grad_norm": 0.07369110733270645,
|
|
"learning_rate": 1.9727321785841028e-05,
|
|
"loss": 0.0023,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.5058266989731164,
|
|
"grad_norm": 0.11879657953977585,
|
|
"learning_rate": 1.9722318955551307e-05,
|
|
"loss": 0.0065,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.507672781816084,
|
|
"grad_norm": 0.11271411180496216,
|
|
"learning_rate": 1.9717271294017495e-05,
|
|
"loss": 0.0201,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.5095188646590516,
|
|
"grad_norm": 0.12968410551548004,
|
|
"learning_rate": 1.971217882451521e-05,
|
|
"loss": 0.015,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.5095188646590516,
|
|
"eval_loss": 0.009394334629178047,
|
|
"eval_runtime": 91.3081,
|
|
"eval_samples_per_second": 9.999,
|
|
"eval_steps_per_second": 5.005,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.5113649475020191,
|
|
"grad_norm": 0.14326775074005127,
|
|
"learning_rate": 1.970704157052668e-05,
|
|
"loss": 0.0132,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.5132110303449867,
|
|
"grad_norm": 0.0933668315410614,
|
|
"learning_rate": 1.9701859555740647e-05,
|
|
"loss": 0.0054,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.5150571131879543,
|
|
"grad_norm": 0.12458498030900955,
|
|
"learning_rate": 1.969663280405225e-05,
|
|
"loss": 0.0115,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.5169031960309219,
|
|
"grad_norm": 0.16223905980587006,
|
|
"learning_rate": 1.9691361339562917e-05,
|
|
"loss": 0.029,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.5187492788738894,
|
|
"grad_norm": 0.15162405371665955,
|
|
"learning_rate": 1.9686045186580258e-05,
|
|
"loss": 0.0144,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.520595361716857,
|
|
"grad_norm": 0.21614527702331543,
|
|
"learning_rate": 1.9680684369617947e-05,
|
|
"loss": 0.0047,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.5224414445598247,
|
|
"grad_norm": 0.5095790028572083,
|
|
"learning_rate": 1.9675278913395605e-05,
|
|
"loss": 0.0223,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.5242875274027922,
|
|
"grad_norm": 0.07105882465839386,
|
|
"learning_rate": 1.96698288428387e-05,
|
|
"loss": 0.0029,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.5261336102457598,
|
|
"grad_norm": 0.16313835978507996,
|
|
"learning_rate": 1.966433418307843e-05,
|
|
"loss": 0.0098,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.5279796930887274,
|
|
"grad_norm": 0.15185286104679108,
|
|
"learning_rate": 1.9658794959451583e-05,
|
|
"loss": 0.026,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.5298257759316949,
|
|
"grad_norm": 0.14729946851730347,
|
|
"learning_rate": 1.9653211197500447e-05,
|
|
"loss": 0.0058,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.5316718587746625,
|
|
"grad_norm": 0.15539248287677765,
|
|
"learning_rate": 1.9647582922972696e-05,
|
|
"loss": 0.0161,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.5335179416176301,
|
|
"grad_norm": 0.12063451111316681,
|
|
"learning_rate": 1.964191016182124e-05,
|
|
"loss": 0.0066,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.5353640244605977,
|
|
"grad_norm": 0.10961133986711502,
|
|
"learning_rate": 1.9636192940204134e-05,
|
|
"loss": 0.0049,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.5372101073035652,
|
|
"grad_norm": 0.1027185246348381,
|
|
"learning_rate": 1.9630431284484447e-05,
|
|
"loss": 0.0075,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.5390561901465328,
|
|
"grad_norm": 0.0939934030175209,
|
|
"learning_rate": 1.9624625221230146e-05,
|
|
"loss": 0.0038,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.5409022729895004,
|
|
"grad_norm": 0.15825892984867096,
|
|
"learning_rate": 1.9618774777213954e-05,
|
|
"loss": 0.008,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.5427483558324679,
|
|
"grad_norm": 0.15646050870418549,
|
|
"learning_rate": 1.9612879979413252e-05,
|
|
"loss": 0.0165,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.5445944386754356,
|
|
"grad_norm": 0.11154604703187943,
|
|
"learning_rate": 1.9606940855009944e-05,
|
|
"loss": 0.0128,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.5464405215184032,
|
|
"grad_norm": 0.07165487110614777,
|
|
"learning_rate": 1.960095743139033e-05,
|
|
"loss": 0.0031,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.5482866043613707,
|
|
"grad_norm": 0.18987427651882172,
|
|
"learning_rate": 1.9594929736144978e-05,
|
|
"loss": 0.0186,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.5501326872043383,
|
|
"grad_norm": 0.1137097179889679,
|
|
"learning_rate": 1.9588857797068602e-05,
|
|
"loss": 0.0049,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.5519787700473059,
|
|
"grad_norm": 0.10796220600605011,
|
|
"learning_rate": 1.9582741642159933e-05,
|
|
"loss": 0.0046,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.5538248528902735,
|
|
"grad_norm": 0.07906807959079742,
|
|
"learning_rate": 1.9576581299621587e-05,
|
|
"loss": 0.005,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.555670935733241,
|
|
"grad_norm": 0.09325362741947174,
|
|
"learning_rate": 1.957037679785994e-05,
|
|
"loss": 0.0063,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.5575170185762086,
|
|
"grad_norm": 0.11685798317193985,
|
|
"learning_rate": 1.9564128165484987e-05,
|
|
"loss": 0.0071,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.5593631014191762,
|
|
"grad_norm": 0.14980854094028473,
|
|
"learning_rate": 1.955783543131022e-05,
|
|
"loss": 0.017,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.5612091842621437,
|
|
"grad_norm": 0.14807415008544922,
|
|
"learning_rate": 1.9551498624352497e-05,
|
|
"loss": 0.0155,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.5630552671051113,
|
|
"grad_norm": 0.13163696229457855,
|
|
"learning_rate": 1.9545117773831893e-05,
|
|
"loss": 0.0082,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.564901349948079,
|
|
"grad_norm": 0.21672053635120392,
|
|
"learning_rate": 1.953869290917158e-05,
|
|
"loss": 0.0076,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.5667474327910464,
|
|
"grad_norm": 0.1977837234735489,
|
|
"learning_rate": 1.9532224059997693e-05,
|
|
"loss": 0.0242,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.5685935156340141,
|
|
"grad_norm": 0.1022099182009697,
|
|
"learning_rate": 1.952571125613918e-05,
|
|
"loss": 0.005,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.5704395984769817,
|
|
"grad_norm": 0.20348013937473297,
|
|
"learning_rate": 1.9519154527627667e-05,
|
|
"loss": 0.0046,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.5722856813199493,
|
|
"grad_norm": 0.14997074007987976,
|
|
"learning_rate": 1.9512553904697332e-05,
|
|
"loss": 0.0182,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.5741317641629168,
|
|
"grad_norm": 0.0894525796175003,
|
|
"learning_rate": 1.9505909417784758e-05,
|
|
"loss": 0.0048,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.5759778470058844,
|
|
"grad_norm": 0.07030344009399414,
|
|
"learning_rate": 1.9499221097528785e-05,
|
|
"loss": 0.0037,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.577823929848852,
|
|
"grad_norm": 0.08951806277036667,
|
|
"learning_rate": 1.949248897477038e-05,
|
|
"loss": 0.0044,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.5796700126918195,
|
|
"grad_norm": 0.10578414797782898,
|
|
"learning_rate": 1.9485713080552492e-05,
|
|
"loss": 0.0047,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.5815160955347871,
|
|
"grad_norm": 0.05251196399331093,
|
|
"learning_rate": 1.9478893446119905e-05,
|
|
"loss": 0.0023,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.5833621783777547,
|
|
"grad_norm": 0.11290792375802994,
|
|
"learning_rate": 1.9472030102919102e-05,
|
|
"loss": 0.0176,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.5852082612207222,
|
|
"grad_norm": 0.12746615707874298,
|
|
"learning_rate": 1.9465123082598107e-05,
|
|
"loss": 0.0105,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.5870543440636898,
|
|
"grad_norm": 0.0989518091082573,
|
|
"learning_rate": 1.9458172417006347e-05,
|
|
"loss": 0.0047,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.5889004269066574,
|
|
"grad_norm": 0.1288129687309265,
|
|
"learning_rate": 1.9451178138194514e-05,
|
|
"loss": 0.0086,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.5907465097496251,
|
|
"grad_norm": 0.1171145960688591,
|
|
"learning_rate": 1.9444140278414395e-05,
|
|
"loss": 0.0066,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.5925925925925926,
|
|
"grad_norm": 0.1379767507314682,
|
|
"learning_rate": 1.9437058870118745e-05,
|
|
"loss": 0.0069,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.5944386754355602,
|
|
"grad_norm": 0.059574488550424576,
|
|
"learning_rate": 1.9429933945961126e-05,
|
|
"loss": 0.0058,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.5962847582785278,
|
|
"grad_norm": 0.08852468430995941,
|
|
"learning_rate": 1.9422765538795758e-05,
|
|
"loss": 0.0073,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.5981308411214953,
|
|
"grad_norm": 0.08040236681699753,
|
|
"learning_rate": 1.941555368167737e-05,
|
|
"loss": 0.0085,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.5999769239644629,
|
|
"grad_norm": 0.43157750368118286,
|
|
"learning_rate": 1.9408298407861045e-05,
|
|
"loss": 0.0172,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.6018230068074305,
|
|
"grad_norm": 0.06634240597486496,
|
|
"learning_rate": 1.940099975080207e-05,
|
|
"loss": 0.0057,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.6036690896503981,
|
|
"grad_norm": 0.13536123931407928,
|
|
"learning_rate": 1.939365774415577e-05,
|
|
"loss": 0.014,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.6055151724933656,
|
|
"grad_norm": 0.14865273237228394,
|
|
"learning_rate": 1.938627242177738e-05,
|
|
"loss": 0.0172,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.6073612553363332,
|
|
"grad_norm": 0.21095162630081177,
|
|
"learning_rate": 1.9378843817721856e-05,
|
|
"loss": 0.0107,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.6092073381793008,
|
|
"grad_norm": 0.1606818437576294,
|
|
"learning_rate": 1.9371371966243734e-05,
|
|
"loss": 0.0073,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.6110534210222683,
|
|
"grad_norm": 0.1389794498682022,
|
|
"learning_rate": 1.9363856901796984e-05,
|
|
"loss": 0.0085,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.612899503865236,
|
|
"grad_norm": 0.13808107376098633,
|
|
"learning_rate": 1.935629865903482e-05,
|
|
"loss": 0.0334,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.6147455867082036,
|
|
"grad_norm": 0.11393231898546219,
|
|
"learning_rate": 1.9348697272809568e-05,
|
|
"loss": 0.0054,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.6165916695511711,
|
|
"grad_norm": 0.14643581211566925,
|
|
"learning_rate": 1.9341052778172505e-05,
|
|
"loss": 0.0069,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.6184377523941387,
|
|
"grad_norm": 0.08248923718929291,
|
|
"learning_rate": 1.9333365210373668e-05,
|
|
"loss": 0.0082,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.6202838352371063,
|
|
"grad_norm": 0.16498494148254395,
|
|
"learning_rate": 1.9325634604861728e-05,
|
|
"loss": 0.0073,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.6221299180800739,
|
|
"grad_norm": 0.3315902054309845,
|
|
"learning_rate": 1.9317860997283803e-05,
|
|
"loss": 0.0156,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.6239760009230414,
|
|
"grad_norm": 0.13608397543430328,
|
|
"learning_rate": 1.9310044423485303e-05,
|
|
"loss": 0.006,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.625822083766009,
|
|
"grad_norm": 0.1520407497882843,
|
|
"learning_rate": 1.9302184919509758e-05,
|
|
"loss": 0.0086,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.6276681666089766,
|
|
"grad_norm": 0.09407825767993927,
|
|
"learning_rate": 1.929428252159866e-05,
|
|
"loss": 0.0059,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.6295142494519441,
|
|
"grad_norm": 0.08588697016239166,
|
|
"learning_rate": 1.9286337266191295e-05,
|
|
"loss": 0.0044,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.6313603322949117,
|
|
"grad_norm": 0.11330454796552658,
|
|
"learning_rate": 1.9278349189924565e-05,
|
|
"loss": 0.0067,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.6332064151378793,
|
|
"grad_norm": 0.1503840535879135,
|
|
"learning_rate": 1.9270318329632833e-05,
|
|
"loss": 0.0073,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.6350524979808468,
|
|
"grad_norm": 0.11648617684841156,
|
|
"learning_rate": 1.9262244722347746e-05,
|
|
"loss": 0.0064,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.6368985808238145,
|
|
"grad_norm": 0.10481298714876175,
|
|
"learning_rate": 1.9254128405298054e-05,
|
|
"loss": 0.0054,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.6387446636667821,
|
|
"grad_norm": 0.08493243157863617,
|
|
"learning_rate": 1.9245969415909464e-05,
|
|
"loss": 0.0186,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.6405907465097497,
|
|
"grad_norm": 0.12519818544387817,
|
|
"learning_rate": 1.923776779180444e-05,
|
|
"loss": 0.0157,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.6424368293527172,
|
|
"grad_norm": 0.25253045558929443,
|
|
"learning_rate": 1.922952357080205e-05,
|
|
"loss": 0.022,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.6442829121956848,
|
|
"grad_norm": 0.0882834643125534,
|
|
"learning_rate": 1.9221236790917784e-05,
|
|
"loss": 0.0091,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.6461289950386524,
|
|
"grad_norm": 0.10768232494592667,
|
|
"learning_rate": 1.9212907490363365e-05,
|
|
"loss": 0.0119,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.6479750778816199,
|
|
"grad_norm": 0.6699041724205017,
|
|
"learning_rate": 1.9204535707546602e-05,
|
|
"loss": 0.0176,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.6498211607245875,
|
|
"grad_norm": 0.16267555952072144,
|
|
"learning_rate": 1.919612148107119e-05,
|
|
"loss": 0.015,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.6516672435675551,
|
|
"grad_norm": 0.10609513521194458,
|
|
"learning_rate": 1.9187664849736542e-05,
|
|
"loss": 0.0062,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.6535133264105226,
|
|
"grad_norm": 0.16212430596351624,
|
|
"learning_rate": 1.9179165852537596e-05,
|
|
"loss": 0.0379,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.6553594092534902,
|
|
"grad_norm": 0.20922721922397614,
|
|
"learning_rate": 1.9170624528664658e-05,
|
|
"loss": 0.051,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.6572054920964578,
|
|
"grad_norm": 0.10107281059026718,
|
|
"learning_rate": 1.916204091750321e-05,
|
|
"loss": 0.0081,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.6590515749394255,
|
|
"grad_norm": 0.11072515696287155,
|
|
"learning_rate": 1.9153415058633714e-05,
|
|
"loss": 0.0066,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.660897657782393,
|
|
"grad_norm": 0.2692252993583679,
|
|
"learning_rate": 1.9144746991831463e-05,
|
|
"loss": 0.0248,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.6627437406253606,
|
|
"grad_norm": 0.15129469335079193,
|
|
"learning_rate": 1.9136036757066362e-05,
|
|
"loss": 0.0057,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.6645898234683282,
|
|
"grad_norm": 0.1859070360660553,
|
|
"learning_rate": 1.9127284394502765e-05,
|
|
"loss": 0.0055,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.6664359063112957,
|
|
"grad_norm": 0.14683200418949127,
|
|
"learning_rate": 1.9118489944499287e-05,
|
|
"loss": 0.0077,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.6682819891542633,
|
|
"grad_norm": 0.4112631678581238,
|
|
"learning_rate": 1.9109653447608607e-05,
|
|
"loss": 0.0186,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.6701280719972309,
|
|
"grad_norm": 0.11482436209917068,
|
|
"learning_rate": 1.9100774944577303e-05,
|
|
"loss": 0.0035,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.6719741548401984,
|
|
"grad_norm": 0.18928466737270355,
|
|
"learning_rate": 1.9091854476345634e-05,
|
|
"loss": 0.0113,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.673820237683166,
|
|
"grad_norm": 0.09932154417037964,
|
|
"learning_rate": 1.9082892084047384e-05,
|
|
"loss": 0.0066,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.6756663205261336,
|
|
"grad_norm": 0.08523999899625778,
|
|
"learning_rate": 1.907388780900964e-05,
|
|
"loss": 0.0096,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.6775124033691012,
|
|
"grad_norm": 0.3460504710674286,
|
|
"learning_rate": 1.906484169275263e-05,
|
|
"loss": 0.0067,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.6793584862120687,
|
|
"grad_norm": 0.16920697689056396,
|
|
"learning_rate": 1.9055753776989516e-05,
|
|
"loss": 0.0082,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.6793584862120687,
|
|
"eval_loss": 0.010424941778182983,
|
|
"eval_runtime": 91.3619,
|
|
"eval_samples_per_second": 9.993,
|
|
"eval_steps_per_second": 5.002,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.6812045690550363,
|
|
"grad_norm": 0.4086015820503235,
|
|
"learning_rate": 1.9046624103626194e-05,
|
|
"loss": 0.0137,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.683050651898004,
|
|
"grad_norm": 0.126896932721138,
|
|
"learning_rate": 1.903745271476113e-05,
|
|
"loss": 0.003,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.6848967347409715,
|
|
"grad_norm": 0.7828288674354553,
|
|
"learning_rate": 1.902823965268513e-05,
|
|
"loss": 0.0134,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.6867428175839391,
|
|
"grad_norm": 0.1559770405292511,
|
|
"learning_rate": 1.901898495988117e-05,
|
|
"loss": 0.0064,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.6885889004269067,
|
|
"grad_norm": 0.07461792975664139,
|
|
"learning_rate": 1.900968867902419e-05,
|
|
"loss": 0.0028,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.6904349832698742,
|
|
"grad_norm": 0.13255064189434052,
|
|
"learning_rate": 1.900035085298091e-05,
|
|
"loss": 0.0098,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.6922810661128418,
|
|
"grad_norm": 0.11017435044050217,
|
|
"learning_rate": 1.8990971524809602e-05,
|
|
"loss": 0.0108,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.6941271489558094,
|
|
"grad_norm": 0.49764785170555115,
|
|
"learning_rate": 1.8981550737759932e-05,
|
|
"loss": 0.0131,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.695973231798777,
|
|
"grad_norm": 0.10128708928823471,
|
|
"learning_rate": 1.8972088535272718e-05,
|
|
"loss": 0.0044,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.6978193146417445,
|
|
"grad_norm": 0.1349850744009018,
|
|
"learning_rate": 1.896258496097977e-05,
|
|
"loss": 0.0099,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.6996653974847121,
|
|
"grad_norm": 0.14308737218379974,
|
|
"learning_rate": 1.8953040058703668e-05,
|
|
"loss": 0.0124,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.7015114803276797,
|
|
"grad_norm": 0.14602790772914886,
|
|
"learning_rate": 1.894345387245755e-05,
|
|
"loss": 0.0072,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.7033575631706472,
|
|
"grad_norm": 0.0991426333785057,
|
|
"learning_rate": 1.8933826446444933e-05,
|
|
"loss": 0.0044,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.7052036460136148,
|
|
"grad_norm": 0.10930776596069336,
|
|
"learning_rate": 1.8924157825059496e-05,
|
|
"loss": 0.0099,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.7070497288565825,
|
|
"grad_norm": 0.8534165620803833,
|
|
"learning_rate": 1.891444805288487e-05,
|
|
"loss": 0.0103,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.7088958116995501,
|
|
"grad_norm": 0.31674516201019287,
|
|
"learning_rate": 1.8904697174694447e-05,
|
|
"loss": 0.0108,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.7107418945425176,
|
|
"grad_norm": 1.4162273406982422,
|
|
"learning_rate": 1.8894905235451163e-05,
|
|
"loss": 0.0121,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.7125879773854852,
|
|
"grad_norm": 0.17984510958194733,
|
|
"learning_rate": 1.888507228030729e-05,
|
|
"loss": 0.0222,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.7144340602284528,
|
|
"grad_norm": 1.0672144889831543,
|
|
"learning_rate": 1.887519835460423e-05,
|
|
"loss": 0.0074,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.7162801430714203,
|
|
"grad_norm": 0.2030014991760254,
|
|
"learning_rate": 1.8865283503872325e-05,
|
|
"loss": 0.0161,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.7181262259143879,
|
|
"grad_norm": 0.07635032385587692,
|
|
"learning_rate": 1.8855327773830604e-05,
|
|
"loss": 0.0038,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.7199723087573555,
|
|
"grad_norm": 0.09135939180850983,
|
|
"learning_rate": 1.8845331210386608e-05,
|
|
"loss": 0.0058,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.721818391600323,
|
|
"grad_norm": 0.13557861745357513,
|
|
"learning_rate": 1.8835293859636177e-05,
|
|
"loss": 0.0034,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.7236644744432906,
|
|
"grad_norm": 0.20347817242145538,
|
|
"learning_rate": 1.8825215767863215e-05,
|
|
"loss": 0.0161,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.7255105572862582,
|
|
"grad_norm": 0.986000120639801,
|
|
"learning_rate": 1.8815096981539494e-05,
|
|
"loss": 0.0242,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.7273566401292259,
|
|
"grad_norm": 0.3647541105747223,
|
|
"learning_rate": 1.8804937547324435e-05,
|
|
"loss": 0.0057,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.7292027229721934,
|
|
"grad_norm": 0.09787865728139877,
|
|
"learning_rate": 1.879473751206489e-05,
|
|
"loss": 0.0039,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.731048805815161,
|
|
"grad_norm": 0.16595730185508728,
|
|
"learning_rate": 1.8784496922794947e-05,
|
|
"loss": 0.0044,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.7328948886581286,
|
|
"grad_norm": 0.17809519171714783,
|
|
"learning_rate": 1.8774215826735664e-05,
|
|
"loss": 0.0091,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.7347409715010961,
|
|
"grad_norm": 0.13774670660495758,
|
|
"learning_rate": 1.8763894271294914e-05,
|
|
"loss": 0.0073,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.7365870543440637,
|
|
"grad_norm": 0.2830973267555237,
|
|
"learning_rate": 1.875353230406711e-05,
|
|
"loss": 0.0103,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.7384331371870313,
|
|
"grad_norm": 0.15855129063129425,
|
|
"learning_rate": 1.8743129972833033e-05,
|
|
"loss": 0.0073,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.7402792200299988,
|
|
"grad_norm": 0.7160941362380981,
|
|
"learning_rate": 1.873268732555957e-05,
|
|
"loss": 0.0264,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.7421253028729664,
|
|
"grad_norm": 0.239408940076828,
|
|
"learning_rate": 1.8722204410399524e-05,
|
|
"loss": 0.0132,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.743971385715934,
|
|
"grad_norm": 0.1071736142039299,
|
|
"learning_rate": 1.8711681275691366e-05,
|
|
"loss": 0.0073,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.7458174685589016,
|
|
"grad_norm": 1.5305790901184082,
|
|
"learning_rate": 1.870111796995905e-05,
|
|
"loss": 0.0061,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.7476635514018691,
|
|
"grad_norm": 0.22047969698905945,
|
|
"learning_rate": 1.8690514541911746e-05,
|
|
"loss": 0.0071,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.7495096342448367,
|
|
"grad_norm": 0.15638285875320435,
|
|
"learning_rate": 1.8679871040443632e-05,
|
|
"loss": 0.0078,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.7513557170878044,
|
|
"grad_norm": 0.23750250041484833,
|
|
"learning_rate": 1.866918751463369e-05,
|
|
"loss": 0.0074,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.7532017999307719,
|
|
"grad_norm": 0.18000538647174835,
|
|
"learning_rate": 1.8658464013745443e-05,
|
|
"loss": 0.0194,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.7550478827737395,
|
|
"grad_norm": 0.14077042043209076,
|
|
"learning_rate": 1.864770058722676e-05,
|
|
"loss": 0.0331,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.7568939656167071,
|
|
"grad_norm": 0.1602960228919983,
|
|
"learning_rate": 1.86368972847096e-05,
|
|
"loss": 0.0149,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.7587400484596746,
|
|
"grad_norm": 0.1850017011165619,
|
|
"learning_rate": 1.8626054156009807e-05,
|
|
"loss": 0.0073,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.7605861313026422,
|
|
"grad_norm": 0.37113574147224426,
|
|
"learning_rate": 1.8615171251126866e-05,
|
|
"loss": 0.0063,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.7624322141456098,
|
|
"grad_norm": 0.20095576345920563,
|
|
"learning_rate": 1.8604248620243682e-05,
|
|
"loss": 0.0075,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.7642782969885774,
|
|
"grad_norm": 0.16010351479053497,
|
|
"learning_rate": 1.8593286313726332e-05,
|
|
"loss": 0.0065,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.7661243798315449,
|
|
"grad_norm": 0.23952090740203857,
|
|
"learning_rate": 1.8582284382123853e-05,
|
|
"loss": 0.0678,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.7679704626745125,
|
|
"grad_norm": 0.6012998819351196,
|
|
"learning_rate": 1.8571242876167995e-05,
|
|
"loss": 0.0151,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.7698165455174801,
|
|
"grad_norm": 0.22734975814819336,
|
|
"learning_rate": 1.8560161846773002e-05,
|
|
"loss": 0.0132,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.7716626283604476,
|
|
"grad_norm": 0.3496147096157074,
|
|
"learning_rate": 1.8549041345035354e-05,
|
|
"loss": 0.0198,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.7735087112034152,
|
|
"grad_norm": 0.7178294658660889,
|
|
"learning_rate": 1.8537881422233553e-05,
|
|
"loss": 0.0232,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.7753547940463829,
|
|
"grad_norm": 0.8831137418746948,
|
|
"learning_rate": 1.8526682129827875e-05,
|
|
"loss": 0.0084,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.7772008768893504,
|
|
"grad_norm": 0.13973954319953918,
|
|
"learning_rate": 1.851544351946014e-05,
|
|
"loss": 0.0126,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.779046959732318,
|
|
"grad_norm": 0.21449421346187592,
|
|
"learning_rate": 1.8504165642953456e-05,
|
|
"loss": 0.0055,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.7808930425752856,
|
|
"grad_norm": 0.17940129339694977,
|
|
"learning_rate": 1.8492848552312016e-05,
|
|
"loss": 0.0091,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.7827391254182532,
|
|
"grad_norm": 0.142044335603714,
|
|
"learning_rate": 1.8481492299720817e-05,
|
|
"loss": 0.0143,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.7845852082612207,
|
|
"grad_norm": 0.18737457692623138,
|
|
"learning_rate": 1.8470096937545445e-05,
|
|
"loss": 0.0178,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.7864312911041883,
|
|
"grad_norm": 0.17866647243499756,
|
|
"learning_rate": 1.845866251833183e-05,
|
|
"loss": 0.016,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.7882773739471559,
|
|
"grad_norm": 0.13905107975006104,
|
|
"learning_rate": 1.8447189094805997e-05,
|
|
"loss": 0.0192,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.7901234567901234,
|
|
"grad_norm": 0.23687602579593658,
|
|
"learning_rate": 1.8435676719873828e-05,
|
|
"loss": 0.0059,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.791969539633091,
|
|
"grad_norm": 0.35541704297065735,
|
|
"learning_rate": 1.8424125446620812e-05,
|
|
"loss": 0.0176,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.7938156224760586,
|
|
"grad_norm": 0.22285489737987518,
|
|
"learning_rate": 1.8412535328311813e-05,
|
|
"loss": 0.0059,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.7956617053190262,
|
|
"grad_norm": 0.6638950109481812,
|
|
"learning_rate": 1.8400906418390808e-05,
|
|
"loss": 0.0112,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.7975077881619937,
|
|
"grad_norm": 0.1354030817747116,
|
|
"learning_rate": 1.8389238770480655e-05,
|
|
"loss": 0.0108,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.7993538710049614,
|
|
"grad_norm": 0.2083008885383606,
|
|
"learning_rate": 1.837753243838283e-05,
|
|
"loss": 0.0104,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.801199953847929,
|
|
"grad_norm": 0.17043934762477875,
|
|
"learning_rate": 1.83657874760772e-05,
|
|
"loss": 0.0231,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.8030460366908965,
|
|
"grad_norm": 0.27301961183547974,
|
|
"learning_rate": 1.8354003937721755e-05,
|
|
"loss": 0.0065,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.8048921195338641,
|
|
"grad_norm": 0.15334510803222656,
|
|
"learning_rate": 1.834218187765237e-05,
|
|
"loss": 0.0194,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.8067382023768317,
|
|
"grad_norm": 0.13855521380901337,
|
|
"learning_rate": 1.8330321350382545e-05,
|
|
"loss": 0.0041,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.8085842852197992,
|
|
"grad_norm": 0.1874759942293167,
|
|
"learning_rate": 1.8318422410603162e-05,
|
|
"loss": 0.0073,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.8104303680627668,
|
|
"grad_norm": 0.13221128284931183,
|
|
"learning_rate": 1.830648511318223e-05,
|
|
"loss": 0.0116,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.8122764509057344,
|
|
"grad_norm": 0.14108803868293762,
|
|
"learning_rate": 1.8294509513164632e-05,
|
|
"loss": 0.0057,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.814122533748702,
|
|
"grad_norm": 0.18224814534187317,
|
|
"learning_rate": 1.8282495665771864e-05,
|
|
"loss": 0.0057,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.8159686165916695,
|
|
"grad_norm": 0.11267901211977005,
|
|
"learning_rate": 1.8270443626401798e-05,
|
|
"loss": 0.0051,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.8178146994346371,
|
|
"grad_norm": 0.11145651340484619,
|
|
"learning_rate": 1.8258353450628402e-05,
|
|
"loss": 0.0047,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.8196607822776047,
|
|
"grad_norm": 0.10886496305465698,
|
|
"learning_rate": 1.8246225194201517e-05,
|
|
"loss": 0.01,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.8215068651205722,
|
|
"grad_norm": 0.13111929595470428,
|
|
"learning_rate": 1.823405891304656e-05,
|
|
"loss": 0.0117,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.8233529479635399,
|
|
"grad_norm": 0.1536678671836853,
|
|
"learning_rate": 1.8221854663264294e-05,
|
|
"loss": 0.0093,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.8251990308065075,
|
|
"grad_norm": 0.13959679007530212,
|
|
"learning_rate": 1.8209612501130566e-05,
|
|
"loss": 0.007,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.827045113649475,
|
|
"grad_norm": 0.1471884697675705,
|
|
"learning_rate": 1.819733248309604e-05,
|
|
"loss": 0.0209,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.8288911964924426,
|
|
"grad_norm": 0.4662853479385376,
|
|
"learning_rate": 1.8185014665785936e-05,
|
|
"loss": 0.0088,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.8307372793354102,
|
|
"grad_norm": 0.16569823026657104,
|
|
"learning_rate": 1.817265910599978e-05,
|
|
"loss": 0.0097,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.8325833621783778,
|
|
"grad_norm": 0.1576852947473526,
|
|
"learning_rate": 1.8160265860711134e-05,
|
|
"loss": 0.0092,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.8344294450213453,
|
|
"grad_norm": 0.14117415249347687,
|
|
"learning_rate": 1.8147834987067327e-05,
|
|
"loss": 0.0043,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.8362755278643129,
|
|
"grad_norm": 0.18965789675712585,
|
|
"learning_rate": 1.8135366542389202e-05,
|
|
"loss": 0.0137,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.8381216107072805,
|
|
"grad_norm": 0.17018474638462067,
|
|
"learning_rate": 1.8122860584170854e-05,
|
|
"loss": 0.0111,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.839967693550248,
|
|
"grad_norm": 0.12502162158489227,
|
|
"learning_rate": 1.8110317170079355e-05,
|
|
"loss": 0.0073,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.8418137763932156,
|
|
"grad_norm": 0.19878755509853363,
|
|
"learning_rate": 1.8097736357954487e-05,
|
|
"loss": 0.0139,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.8436598592361833,
|
|
"grad_norm": 0.13912063837051392,
|
|
"learning_rate": 1.808511820580849e-05,
|
|
"loss": 0.0066,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.8455059420791508,
|
|
"grad_norm": 0.08366572856903076,
|
|
"learning_rate": 1.807246277182578e-05,
|
|
"loss": 0.0045,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.8473520249221184,
|
|
"grad_norm": 5.883597373962402,
|
|
"learning_rate": 1.8059770114362686e-05,
|
|
"loss": 0.0109,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.849198107765086,
|
|
"grad_norm": 0.12818843126296997,
|
|
"learning_rate": 1.804704029194718e-05,
|
|
"loss": 0.0094,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.849198107765086,
|
|
"eval_loss": 0.00954136997461319,
|
|
"eval_runtime": 91.3222,
|
|
"eval_samples_per_second": 9.998,
|
|
"eval_steps_per_second": 5.004,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.8510441906080536,
|
|
"grad_norm": 0.09318236261606216,
|
|
"learning_rate": 1.8034273363278615e-05,
|
|
"loss": 0.0028,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.8528902734510211,
|
|
"grad_norm": 0.25973886251449585,
|
|
"learning_rate": 1.8021469387227433e-05,
|
|
"loss": 0.0191,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.8547363562939887,
|
|
"grad_norm": 0.1217271238565445,
|
|
"learning_rate": 1.8008628422834923e-05,
|
|
"loss": 0.0156,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.8565824391369563,
|
|
"grad_norm": 0.1459590196609497,
|
|
"learning_rate": 1.7995750529312923e-05,
|
|
"loss": 0.0171,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.8584285219799238,
|
|
"grad_norm": 0.09007342159748077,
|
|
"learning_rate": 1.798283576604356e-05,
|
|
"loss": 0.0047,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.8602746048228914,
|
|
"grad_norm": 0.13479134440422058,
|
|
"learning_rate": 1.7969884192578977e-05,
|
|
"loss": 0.0154,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.862120687665859,
|
|
"grad_norm": 0.1289559155702591,
|
|
"learning_rate": 1.7956895868641053e-05,
|
|
"loss": 0.0122,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.8639667705088265,
|
|
"grad_norm": 0.10503463447093964,
|
|
"learning_rate": 1.7943870854121126e-05,
|
|
"loss": 0.0067,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.8658128533517941,
|
|
"grad_norm": 0.1160660907626152,
|
|
"learning_rate": 1.7930809209079728e-05,
|
|
"loss": 0.0065,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.8676589361947618,
|
|
"grad_norm": 0.14017629623413086,
|
|
"learning_rate": 1.791771099374629e-05,
|
|
"loss": 0.0068,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.8695050190377294,
|
|
"grad_norm": 0.12286079674959183,
|
|
"learning_rate": 1.7904576268518886e-05,
|
|
"loss": 0.0084,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.8713511018806969,
|
|
"grad_norm": 0.1592610627412796,
|
|
"learning_rate": 1.789140509396394e-05,
|
|
"loss": 0.0099,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.8731971847236645,
|
|
"grad_norm": 0.09278839081525803,
|
|
"learning_rate": 1.787819753081594e-05,
|
|
"loss": 0.0031,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.8750432675666321,
|
|
"grad_norm": 0.11374642699956894,
|
|
"learning_rate": 1.7864953639977177e-05,
|
|
"loss": 0.0134,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.8768893504095996,
|
|
"grad_norm": 0.11769058555364609,
|
|
"learning_rate": 1.7851673482517458e-05,
|
|
"loss": 0.0087,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.8787354332525672,
|
|
"grad_norm": 0.1978936493396759,
|
|
"learning_rate": 1.783835711967382e-05,
|
|
"loss": 0.0171,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.8805815160955348,
|
|
"grad_norm": 0.17544765770435333,
|
|
"learning_rate": 1.7825004612850242e-05,
|
|
"loss": 0.0075,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.8824275989385023,
|
|
"grad_norm": 0.07806729525327682,
|
|
"learning_rate": 1.781161602361737e-05,
|
|
"loss": 0.0038,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.8842736817814699,
|
|
"grad_norm": 0.230962336063385,
|
|
"learning_rate": 1.7798191413712244e-05,
|
|
"loss": 0.037,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.8861197646244375,
|
|
"grad_norm": 0.18117716908454895,
|
|
"learning_rate": 1.778473084503799e-05,
|
|
"loss": 0.0114,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.8879658474674051,
|
|
"grad_norm": 0.08140026032924652,
|
|
"learning_rate": 1.7771234379663545e-05,
|
|
"loss": 0.0043,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.8898119303103726,
|
|
"grad_norm": 0.0821491926908493,
|
|
"learning_rate": 1.775770207982338e-05,
|
|
"loss": 0.0043,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.8916580131533403,
|
|
"grad_norm": 0.1920117437839508,
|
|
"learning_rate": 1.7744134007917195e-05,
|
|
"loss": 0.0074,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.8935040959963079,
|
|
"grad_norm": 0.09095922112464905,
|
|
"learning_rate": 1.7730530226509652e-05,
|
|
"loss": 0.0052,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.8953501788392754,
|
|
"grad_norm": 0.10795634239912033,
|
|
"learning_rate": 1.7716890798330066e-05,
|
|
"loss": 0.0061,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.897196261682243,
|
|
"grad_norm": 0.18433086574077606,
|
|
"learning_rate": 1.770321578627213e-05,
|
|
"loss": 0.0055,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.8990423445252106,
|
|
"grad_norm": 0.09844094514846802,
|
|
"learning_rate": 1.768950525339362e-05,
|
|
"loss": 0.0057,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.9008884273681782,
|
|
"grad_norm": 0.07778719812631607,
|
|
"learning_rate": 1.7675759262916105e-05,
|
|
"loss": 0.0062,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.9027345102111457,
|
|
"grad_norm": 0.10741523653268814,
|
|
"learning_rate": 1.7661977878224653e-05,
|
|
"loss": 0.0137,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.9045805930541133,
|
|
"grad_norm": 0.08387381583452225,
|
|
"learning_rate": 1.7648161162867537e-05,
|
|
"loss": 0.0035,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.9064266758970809,
|
|
"grad_norm": 0.1607184261083603,
|
|
"learning_rate": 1.763430918055595e-05,
|
|
"loss": 0.0107,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.9082727587400484,
|
|
"grad_norm": 0.21685011684894562,
|
|
"learning_rate": 1.7620421995163718e-05,
|
|
"loss": 0.0215,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.910118841583016,
|
|
"grad_norm": 0.07172352075576782,
|
|
"learning_rate": 1.7606499670726972e-05,
|
|
"loss": 0.004,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.9119649244259836,
|
|
"grad_norm": 0.06467904895544052,
|
|
"learning_rate": 1.7592542271443888e-05,
|
|
"loss": 0.0035,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.9138110072689511,
|
|
"grad_norm": 0.09848224371671677,
|
|
"learning_rate": 1.7578549861674378e-05,
|
|
"loss": 0.0046,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.9156570901119188,
|
|
"grad_norm": 0.19907814264297485,
|
|
"learning_rate": 1.756452250593979e-05,
|
|
"loss": 0.0265,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.9175031729548864,
|
|
"grad_norm": 0.19014231860637665,
|
|
"learning_rate": 1.7550460268922615e-05,
|
|
"loss": 0.0132,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.919349255797854,
|
|
"grad_norm": 0.13054688274860382,
|
|
"learning_rate": 1.753636321546619e-05,
|
|
"loss": 0.0067,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.9211953386408215,
|
|
"grad_norm": 0.13842567801475525,
|
|
"learning_rate": 1.752223141057439e-05,
|
|
"loss": 0.0279,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.9230414214837891,
|
|
"grad_norm": 0.08342625945806503,
|
|
"learning_rate": 1.7508064919411344e-05,
|
|
"loss": 0.0039,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.9248875043267567,
|
|
"grad_norm": 0.17037242650985718,
|
|
"learning_rate": 1.7493863807301116e-05,
|
|
"loss": 0.0142,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.9267335871697242,
|
|
"grad_norm": 0.09089969098567963,
|
|
"learning_rate": 1.7479628139727417e-05,
|
|
"loss": 0.0066,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.9285796700126918,
|
|
"grad_norm": 0.1631207913160324,
|
|
"learning_rate": 1.7465357982333294e-05,
|
|
"loss": 0.01,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.9304257528556594,
|
|
"grad_norm": 0.19571319222450256,
|
|
"learning_rate": 1.745105340092085e-05,
|
|
"loss": 0.0472,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.9322718356986269,
|
|
"grad_norm": 0.11733975261449814,
|
|
"learning_rate": 1.74367144614509e-05,
|
|
"loss": 0.0084,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.9341179185415945,
|
|
"grad_norm": 0.09824282675981522,
|
|
"learning_rate": 1.74223412300427e-05,
|
|
"loss": 0.0046,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.9359640013845621,
|
|
"grad_norm": 0.20196324586868286,
|
|
"learning_rate": 1.7407933772973638e-05,
|
|
"loss": 0.0066,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.9378100842275298,
|
|
"grad_norm": 0.15926776826381683,
|
|
"learning_rate": 1.739349215667891e-05,
|
|
"loss": 0.0146,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.9396561670704973,
|
|
"grad_norm": 0.09395015984773636,
|
|
"learning_rate": 1.737901644775124e-05,
|
|
"loss": 0.0052,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.9415022499134649,
|
|
"grad_norm": 0.10329575836658478,
|
|
"learning_rate": 1.736450671294054e-05,
|
|
"loss": 0.0214,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.9433483327564325,
|
|
"grad_norm": 0.06971684098243713,
|
|
"learning_rate": 1.7349963019153638e-05,
|
|
"loss": 0.0037,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.9451944155994,
|
|
"grad_norm": 0.08213736116886139,
|
|
"learning_rate": 1.7335385433453948e-05,
|
|
"loss": 0.0048,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.9470404984423676,
|
|
"grad_norm": 0.4655965268611908,
|
|
"learning_rate": 1.732077402306116e-05,
|
|
"loss": 0.0059,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.9488865812853352,
|
|
"grad_norm": 0.08121432363986969,
|
|
"learning_rate": 1.730612885535094e-05,
|
|
"loss": 0.0048,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.9507326641283027,
|
|
"grad_norm": 0.05632347986102104,
|
|
"learning_rate": 1.729144999785462e-05,
|
|
"loss": 0.0039,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.9525787469712703,
|
|
"grad_norm": 0.08921120315790176,
|
|
"learning_rate": 1.7276737518258865e-05,
|
|
"loss": 0.0048,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.9544248298142379,
|
|
"grad_norm": 0.41118451952934265,
|
|
"learning_rate": 1.726199148440539e-05,
|
|
"loss": 0.0116,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.9562709126572055,
|
|
"grad_norm": 0.14769668877124786,
|
|
"learning_rate": 1.7247211964290635e-05,
|
|
"loss": 0.005,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.958116995500173,
|
|
"grad_norm": 0.14160354435443878,
|
|
"learning_rate": 1.7232399026065445e-05,
|
|
"loss": 0.0309,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.9599630783431407,
|
|
"grad_norm": 0.20757490396499634,
|
|
"learning_rate": 1.7217552738034763e-05,
|
|
"loss": 0.0281,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.9618091611861083,
|
|
"grad_norm": 1.3972127437591553,
|
|
"learning_rate": 1.7202673168657318e-05,
|
|
"loss": 0.0721,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.9636552440290758,
|
|
"grad_norm": 0.14340125024318695,
|
|
"learning_rate": 1.7187760386545297e-05,
|
|
"loss": 0.0115,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.9655013268720434,
|
|
"grad_norm": 0.13766130805015564,
|
|
"learning_rate": 1.717281446046404e-05,
|
|
"loss": 0.0222,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.967347409715011,
|
|
"grad_norm": 0.14681874215602875,
|
|
"learning_rate": 1.7157835459331726e-05,
|
|
"loss": 0.0125,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.9691934925579785,
|
|
"grad_norm": 0.33441978693008423,
|
|
"learning_rate": 1.7142823452219036e-05,
|
|
"loss": 0.0348,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.9710395754009461,
|
|
"grad_norm": 0.15065835416316986,
|
|
"learning_rate": 1.7127778508348858e-05,
|
|
"loss": 0.0078,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.9728856582439137,
|
|
"grad_norm": 0.12596829235553741,
|
|
"learning_rate": 1.7112700697095955e-05,
|
|
"loss": 0.0095,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.9747317410868813,
|
|
"grad_norm": 0.20254209637641907,
|
|
"learning_rate": 1.709759008798663e-05,
|
|
"loss": 0.0082,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.9765778239298488,
|
|
"grad_norm": 0.12069059908390045,
|
|
"learning_rate": 1.708244675069846e-05,
|
|
"loss": 0.0057,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.9784239067728164,
|
|
"grad_norm": 0.09891531616449356,
|
|
"learning_rate": 1.7067270755059897e-05,
|
|
"loss": 0.0065,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.980269989615784,
|
|
"grad_norm": 0.11425697803497314,
|
|
"learning_rate": 1.7052062171050008e-05,
|
|
"loss": 0.0087,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.9821160724587515,
|
|
"grad_norm": 0.07714996486902237,
|
|
"learning_rate": 1.7036821068798127e-05,
|
|
"loss": 0.0052,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.9839621553017192,
|
|
"grad_norm": 0.08949927240610123,
|
|
"learning_rate": 1.7021547518583536e-05,
|
|
"loss": 0.0049,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.9858082381446868,
|
|
"grad_norm": 0.24043439328670502,
|
|
"learning_rate": 1.7006241590835136e-05,
|
|
"loss": 0.015,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.9876543209876543,
|
|
"grad_norm": 0.1122710183262825,
|
|
"learning_rate": 1.6990903356131125e-05,
|
|
"loss": 0.0052,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.9895004038306219,
|
|
"grad_norm": 0.13079215586185455,
|
|
"learning_rate": 1.6975532885198678e-05,
|
|
"loss": 0.0085,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.9913464866735895,
|
|
"grad_norm": 0.12605947256088257,
|
|
"learning_rate": 1.696013024891362e-05,
|
|
"loss": 0.0076,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.9931925695165571,
|
|
"grad_norm": 0.13878796994686127,
|
|
"learning_rate": 1.6944695518300087e-05,
|
|
"loss": 0.0127,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.9950386523595246,
|
|
"grad_norm": 0.1519862711429596,
|
|
"learning_rate": 1.6929228764530214e-05,
|
|
"loss": 0.0076,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.9968847352024922,
|
|
"grad_norm": 0.11886841803789139,
|
|
"learning_rate": 1.69137300589238e-05,
|
|
"loss": 0.0087,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.9987308180454598,
|
|
"grad_norm": 0.09037019312381744,
|
|
"learning_rate": 1.6898199472947972e-05,
|
|
"loss": 0.0108,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.16914087533950806,
|
|
"learning_rate": 1.6882637078216867e-05,
|
|
"loss": 0.0111,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 1.0018460828429676,
|
|
"grad_norm": 0.0852038711309433,
|
|
"learning_rate": 1.6867042946491306e-05,
|
|
"loss": 0.0123,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 1.0036921656859352,
|
|
"grad_norm": 0.13573428988456726,
|
|
"learning_rate": 1.6851417149678442e-05,
|
|
"loss": 0.0238,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 1.0055382485289028,
|
|
"grad_norm": 0.143440842628479,
|
|
"learning_rate": 1.6835759759831448e-05,
|
|
"loss": 0.0081,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.0073843313718702,
|
|
"grad_norm": 0.10767962038516998,
|
|
"learning_rate": 1.6820070849149174e-05,
|
|
"loss": 0.0057,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 1.0092304142148378,
|
|
"grad_norm": 0.10234127938747406,
|
|
"learning_rate": 1.680435048997582e-05,
|
|
"loss": 0.0026,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 1.0110764970578054,
|
|
"grad_norm": 0.054232899099588394,
|
|
"learning_rate": 1.6788598754800602e-05,
|
|
"loss": 0.002,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 1.012922579900773,
|
|
"grad_norm": 0.10567190498113632,
|
|
"learning_rate": 1.6772815716257414e-05,
|
|
"loss": 0.0118,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 1.0147686627437407,
|
|
"grad_norm": 0.09728987514972687,
|
|
"learning_rate": 1.6757001447124486e-05,
|
|
"loss": 0.0133,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.0166147455867083,
|
|
"grad_norm": 0.07242578268051147,
|
|
"learning_rate": 1.6741156020324086e-05,
|
|
"loss": 0.0026,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 1.018460828429676,
|
|
"grad_norm": 0.06415614485740662,
|
|
"learning_rate": 1.6725279508922114e-05,
|
|
"loss": 0.0038,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 1.018460828429676,
|
|
"eval_loss": 0.00856359489262104,
|
|
"eval_runtime": 91.3138,
|
|
"eval_samples_per_second": 9.998,
|
|
"eval_steps_per_second": 5.005,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 1.0203069112726433,
|
|
"grad_norm": 0.10766559839248657,
|
|
"learning_rate": 1.6709371986127846e-05,
|
|
"loss": 0.0084,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 1.022152994115611,
|
|
"grad_norm": 0.08855008333921432,
|
|
"learning_rate": 1.6693433525293525e-05,
|
|
"loss": 0.0024,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 1.0239990769585785,
|
|
"grad_norm": 0.09996992349624634,
|
|
"learning_rate": 1.6677464199914076e-05,
|
|
"loss": 0.004,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.0258451598015461,
|
|
"grad_norm": 0.1220189779996872,
|
|
"learning_rate": 1.6661464083626734e-05,
|
|
"loss": 0.0101,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 1.0276912426445137,
|
|
"grad_norm": 0.0515366792678833,
|
|
"learning_rate": 1.6645433250210726e-05,
|
|
"loss": 0.0018,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 1.0295373254874813,
|
|
"grad_norm": 0.09471988677978516,
|
|
"learning_rate": 1.662937177358691e-05,
|
|
"loss": 0.0031,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 1.0313834083304487,
|
|
"grad_norm": 0.08005911856889725,
|
|
"learning_rate": 1.661327972781745e-05,
|
|
"loss": 0.0106,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 1.0332294911734163,
|
|
"grad_norm": 0.7784668207168579,
|
|
"learning_rate": 1.6597157187105475e-05,
|
|
"loss": 0.012,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.035075574016384,
|
|
"grad_norm": 0.07025888562202454,
|
|
"learning_rate": 1.6581004225794715e-05,
|
|
"loss": 0.0038,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 1.0369216568593516,
|
|
"grad_norm": 0.06929878145456314,
|
|
"learning_rate": 1.6564820918369194e-05,
|
|
"loss": 0.0026,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 1.0387677397023192,
|
|
"grad_norm": 0.2543495297431946,
|
|
"learning_rate": 1.6548607339452853e-05,
|
|
"loss": 0.0099,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 1.0406138225452868,
|
|
"grad_norm": 0.15400093793869019,
|
|
"learning_rate": 1.6532363563809226e-05,
|
|
"loss": 0.0043,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 1.0424599053882544,
|
|
"grad_norm": 0.17001113295555115,
|
|
"learning_rate": 1.651608966634109e-05,
|
|
"loss": 0.0078,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.0443059882312218,
|
|
"grad_norm": 0.03794068843126297,
|
|
"learning_rate": 1.649978572209012e-05,
|
|
"loss": 0.0012,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 1.0461520710741894,
|
|
"grad_norm": 0.1631583869457245,
|
|
"learning_rate": 1.648345180623653e-05,
|
|
"loss": 0.0276,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 1.047998153917157,
|
|
"grad_norm": 0.3792674243450165,
|
|
"learning_rate": 1.6467087994098753e-05,
|
|
"loss": 0.0051,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 1.0498442367601246,
|
|
"grad_norm": 0.14960213005542755,
|
|
"learning_rate": 1.6450694361133068e-05,
|
|
"loss": 0.0114,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 1.0516903196030922,
|
|
"grad_norm": 0.05881274864077568,
|
|
"learning_rate": 1.6434270982933272e-05,
|
|
"loss": 0.0027,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.0535364024460598,
|
|
"grad_norm": 0.13894124329090118,
|
|
"learning_rate": 1.6417817935230318e-05,
|
|
"loss": 0.0032,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 1.0553824852890274,
|
|
"grad_norm": 4.140699863433838,
|
|
"learning_rate": 1.6401335293891966e-05,
|
|
"loss": 0.0341,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 1.0572285681319948,
|
|
"grad_norm": 0.32235807180404663,
|
|
"learning_rate": 1.6384823134922444e-05,
|
|
"loss": 0.0069,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 1.0590746509749625,
|
|
"grad_norm": 1.4171650409698486,
|
|
"learning_rate": 1.6368281534462088e-05,
|
|
"loss": 0.0134,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 1.06092073381793,
|
|
"grad_norm": 0.09332925081253052,
|
|
"learning_rate": 1.635171056878699e-05,
|
|
"loss": 0.0049,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.0627668166608977,
|
|
"grad_norm": 0.08895128965377808,
|
|
"learning_rate": 1.6335110314308654e-05,
|
|
"loss": 0.0049,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 1.0646128995038653,
|
|
"grad_norm": 0.15676097571849823,
|
|
"learning_rate": 1.631848084757364e-05,
|
|
"loss": 0.0064,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 1.066458982346833,
|
|
"grad_norm": 0.15743549168109894,
|
|
"learning_rate": 1.6301822245263212e-05,
|
|
"loss": 0.004,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 1.0683050651898003,
|
|
"grad_norm": 0.47084152698516846,
|
|
"learning_rate": 1.6285134584192976e-05,
|
|
"loss": 0.0033,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 1.070151148032768,
|
|
"grad_norm": 0.11432501673698425,
|
|
"learning_rate": 1.626841794131254e-05,
|
|
"loss": 0.0148,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.0719972308757355,
|
|
"grad_norm": 0.25219109654426575,
|
|
"learning_rate": 1.6251672393705155e-05,
|
|
"loss": 0.0033,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 1.0738433137187031,
|
|
"grad_norm": 0.11923202127218246,
|
|
"learning_rate": 1.6234898018587336e-05,
|
|
"loss": 0.0038,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 1.0756893965616707,
|
|
"grad_norm": 0.21197491884231567,
|
|
"learning_rate": 1.6218094893308553e-05,
|
|
"loss": 0.0071,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 1.0775354794046383,
|
|
"grad_norm": 0.15348570048809052,
|
|
"learning_rate": 1.6201263095350833e-05,
|
|
"loss": 0.0089,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 1.079381562247606,
|
|
"grad_norm": 0.15847565233707428,
|
|
"learning_rate": 1.6184402702328426e-05,
|
|
"loss": 0.004,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.0812276450905733,
|
|
"grad_norm": 0.2338961660861969,
|
|
"learning_rate": 1.6167513791987423e-05,
|
|
"loss": 0.0155,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 1.083073727933541,
|
|
"grad_norm": 0.10159553587436676,
|
|
"learning_rate": 1.615059644220543e-05,
|
|
"loss": 0.0039,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 1.0849198107765086,
|
|
"grad_norm": 0.23770081996917725,
|
|
"learning_rate": 1.6133650730991183e-05,
|
|
"loss": 0.0054,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 1.0867658936194762,
|
|
"grad_norm": 0.08259685337543488,
|
|
"learning_rate": 1.6116676736484206e-05,
|
|
"loss": 0.0025,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 1.0886119764624438,
|
|
"grad_norm": 0.11127261817455292,
|
|
"learning_rate": 1.6099674536954426e-05,
|
|
"loss": 0.0049,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.0904580593054114,
|
|
"grad_norm": 0.0952722355723381,
|
|
"learning_rate": 1.6082644210801846e-05,
|
|
"loss": 0.0022,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 1.092304142148379,
|
|
"grad_norm": 0.19041772186756134,
|
|
"learning_rate": 1.6065585836556152e-05,
|
|
"loss": 0.0102,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 1.0941502249913464,
|
|
"grad_norm": 0.14780960977077484,
|
|
"learning_rate": 1.6048499492876378e-05,
|
|
"loss": 0.0075,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 1.095996307834314,
|
|
"grad_norm": 0.11060582101345062,
|
|
"learning_rate": 1.603138525855051e-05,
|
|
"loss": 0.0056,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 1.0978423906772816,
|
|
"grad_norm": 0.10337001830339432,
|
|
"learning_rate": 1.6014243212495167e-05,
|
|
"loss": 0.0064,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.0996884735202492,
|
|
"grad_norm": 0.18508298695087433,
|
|
"learning_rate": 1.5997073433755187e-05,
|
|
"loss": 0.0036,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 1.1015345563632168,
|
|
"grad_norm": 0.17404358088970184,
|
|
"learning_rate": 1.597987600150331e-05,
|
|
"loss": 0.0091,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 1.1033806392061845,
|
|
"grad_norm": 0.13263043761253357,
|
|
"learning_rate": 1.5962650995039783e-05,
|
|
"loss": 0.0035,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 1.105226722049152,
|
|
"grad_norm": 0.20240873098373413,
|
|
"learning_rate": 1.594539849379199e-05,
|
|
"loss": 0.0257,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 1.1070728048921195,
|
|
"grad_norm": 0.1413458287715912,
|
|
"learning_rate": 1.5928118577314123e-05,
|
|
"loss": 0.0123,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.108918887735087,
|
|
"grad_norm": 0.11439207196235657,
|
|
"learning_rate": 1.5910811325286768e-05,
|
|
"loss": 0.0027,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 1.1107649705780547,
|
|
"grad_norm": 0.05116959661245346,
|
|
"learning_rate": 1.5893476817516567e-05,
|
|
"loss": 0.0015,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 1.1126110534210223,
|
|
"grad_norm": 0.10003647208213806,
|
|
"learning_rate": 1.587611513393585e-05,
|
|
"loss": 0.0045,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 1.11445713626399,
|
|
"grad_norm": 0.11144063621759415,
|
|
"learning_rate": 1.5858726354602248e-05,
|
|
"loss": 0.0063,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 1.1163032191069575,
|
|
"grad_norm": 0.15549401938915253,
|
|
"learning_rate": 1.5841310559698346e-05,
|
|
"loss": 0.0143,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.118149301949925,
|
|
"grad_norm": 0.13532917201519012,
|
|
"learning_rate": 1.582386782953129e-05,
|
|
"loss": 0.007,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 1.1199953847928925,
|
|
"grad_norm": 0.10869234800338745,
|
|
"learning_rate": 1.580639824453244e-05,
|
|
"loss": 0.0077,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 1.1218414676358601,
|
|
"grad_norm": 0.05746113136410713,
|
|
"learning_rate": 1.5788901885256983e-05,
|
|
"loss": 0.0021,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 1.1236875504788277,
|
|
"grad_norm": 0.07222943007946014,
|
|
"learning_rate": 1.577137883238357e-05,
|
|
"loss": 0.0028,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 1.1255336333217953,
|
|
"grad_norm": 0.07145451754331589,
|
|
"learning_rate": 1.575382916671393e-05,
|
|
"loss": 0.0056,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 1.127379716164763,
|
|
"grad_norm": 0.19978775084018707,
|
|
"learning_rate": 1.5736252969172522e-05,
|
|
"loss": 0.0064,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 1.1292257990077306,
|
|
"grad_norm": 0.158962681889534,
|
|
"learning_rate": 1.5718650320806145e-05,
|
|
"loss": 0.0194,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 1.131071881850698,
|
|
"grad_norm": 0.14478765428066254,
|
|
"learning_rate": 1.5701021302783557e-05,
|
|
"loss": 0.0214,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 1.1329179646936656,
|
|
"grad_norm": 0.1333470642566681,
|
|
"learning_rate": 1.5683365996395123e-05,
|
|
"loss": 0.0156,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 1.1347640475366332,
|
|
"grad_norm": 0.08896146714687347,
|
|
"learning_rate": 1.5665684483052425e-05,
|
|
"loss": 0.0138,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 1.1366101303796008,
|
|
"grad_norm": 0.48797956109046936,
|
|
"learning_rate": 1.5647976844287884e-05,
|
|
"loss": 0.0076,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 1.1384562132225684,
|
|
"grad_norm": 0.1988457441329956,
|
|
"learning_rate": 1.5630243161754395e-05,
|
|
"loss": 0.0208,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 1.140302296065536,
|
|
"grad_norm": 0.08605816215276718,
|
|
"learning_rate": 1.5612483517224942e-05,
|
|
"loss": 0.0086,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 1.1421483789085034,
|
|
"grad_norm": 0.09312114864587784,
|
|
"learning_rate": 1.5594697992592232e-05,
|
|
"loss": 0.0042,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 1.143994461751471,
|
|
"grad_norm": 0.08398724347352982,
|
|
"learning_rate": 1.5576886669868297e-05,
|
|
"loss": 0.0051,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.1458405445944386,
|
|
"grad_norm": 0.2768036127090454,
|
|
"learning_rate": 1.5559049631184136e-05,
|
|
"loss": 0.0041,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 1.1476866274374062,
|
|
"grad_norm": 0.16800187528133392,
|
|
"learning_rate": 1.5541186958789327e-05,
|
|
"loss": 0.0029,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 1.1495327102803738,
|
|
"grad_norm": 0.09099100530147552,
|
|
"learning_rate": 1.5523298735051657e-05,
|
|
"loss": 0.004,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 1.1513787931233415,
|
|
"grad_norm": 0.08173321932554245,
|
|
"learning_rate": 1.5505385042456715e-05,
|
|
"loss": 0.0043,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 1.153224875966309,
|
|
"grad_norm": 0.10678403079509735,
|
|
"learning_rate": 1.5487445963607554e-05,
|
|
"loss": 0.0026,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.1550709588092767,
|
|
"grad_norm": 0.08033821731805801,
|
|
"learning_rate": 1.5469481581224274e-05,
|
|
"loss": 0.0044,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 1.156917041652244,
|
|
"grad_norm": 0.4525659382343292,
|
|
"learning_rate": 1.545149197814365e-05,
|
|
"loss": 0.0082,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 1.1587631244952117,
|
|
"grad_norm": 0.07487817108631134,
|
|
"learning_rate": 1.5433477237318765e-05,
|
|
"loss": 0.0027,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 1.1606092073381793,
|
|
"grad_norm": 0.09451524913311005,
|
|
"learning_rate": 1.5415437441818615e-05,
|
|
"loss": 0.0064,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 1.162455290181147,
|
|
"grad_norm": 0.2998914122581482,
|
|
"learning_rate": 1.5397372674827723e-05,
|
|
"loss": 0.0242,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 1.1643013730241145,
|
|
"grad_norm": 0.3896186947822571,
|
|
"learning_rate": 1.5379283019645757e-05,
|
|
"loss": 0.0076,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 1.1661474558670821,
|
|
"grad_norm": 0.1330152451992035,
|
|
"learning_rate": 1.5361168559687158e-05,
|
|
"loss": 0.0032,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 1.1679935387100495,
|
|
"grad_norm": 0.11637061834335327,
|
|
"learning_rate": 1.5343029378480733e-05,
|
|
"loss": 0.0025,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 1.1698396215530171,
|
|
"grad_norm": 0.11787394434213638,
|
|
"learning_rate": 1.5324865559669295e-05,
|
|
"loss": 0.0065,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 1.1716857043959847,
|
|
"grad_norm": 0.23343242704868317,
|
|
"learning_rate": 1.5306677187009263e-05,
|
|
"loss": 0.0064,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 1.1735317872389524,
|
|
"grad_norm": 0.17973926663398743,
|
|
"learning_rate": 1.5288464344370267e-05,
|
|
"loss": 0.0056,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 1.17537787008192,
|
|
"grad_norm": 0.09165448695421219,
|
|
"learning_rate": 1.527022711573479e-05,
|
|
"loss": 0.016,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 1.1772239529248876,
|
|
"grad_norm": 0.10070207715034485,
|
|
"learning_rate": 1.5251965585197748e-05,
|
|
"loss": 0.0036,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 1.1790700357678552,
|
|
"grad_norm": 0.11805471032857895,
|
|
"learning_rate": 1.5233679836966122e-05,
|
|
"loss": 0.0079,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 1.1809161186108226,
|
|
"grad_norm": 0.11095066368579865,
|
|
"learning_rate": 1.5215369955358568e-05,
|
|
"loss": 0.007,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.1827622014537902,
|
|
"grad_norm": 0.1736016422510147,
|
|
"learning_rate": 1.5197036024805018e-05,
|
|
"loss": 0.0071,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 1.1846082842967578,
|
|
"grad_norm": 0.07015091180801392,
|
|
"learning_rate": 1.5178678129846311e-05,
|
|
"loss": 0.0024,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 1.1864543671397254,
|
|
"grad_norm": 0.17164985835552216,
|
|
"learning_rate": 1.5160296355133773e-05,
|
|
"loss": 0.0084,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 1.188300449982693,
|
|
"grad_norm": 0.0814700499176979,
|
|
"learning_rate": 1.5141890785428855e-05,
|
|
"loss": 0.0029,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 1.188300449982693,
|
|
"eval_loss": 0.009450200945138931,
|
|
"eval_runtime": 91.4476,
|
|
"eval_samples_per_second": 9.984,
|
|
"eval_steps_per_second": 4.997,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 1.1901465328256606,
|
|
"grad_norm": 0.1890951693058014,
|
|
"learning_rate": 1.5123461505602728e-05,
|
|
"loss": 0.0093,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 1.191992615668628,
|
|
"grad_norm": 0.07871969789266586,
|
|
"learning_rate": 1.5105008600635888e-05,
|
|
"loss": 0.0038,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 1.1938386985115956,
|
|
"grad_norm": 0.3163944184780121,
|
|
"learning_rate": 1.5086532155617785e-05,
|
|
"loss": 0.0046,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 1.1956847813545632,
|
|
"grad_norm": 0.10958370566368103,
|
|
"learning_rate": 1.50680322557464e-05,
|
|
"loss": 0.0086,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 1.1975308641975309,
|
|
"grad_norm": 0.08893739432096481,
|
|
"learning_rate": 1.5049508986327879e-05,
|
|
"loss": 0.0059,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 1.1993769470404985,
|
|
"grad_norm": 0.09862252324819565,
|
|
"learning_rate": 1.5030962432776126e-05,
|
|
"loss": 0.0114,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 1.201223029883466,
|
|
"grad_norm": 0.1269105225801468,
|
|
"learning_rate": 1.5012392680612408e-05,
|
|
"loss": 0.0085,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 1.2030691127264337,
|
|
"grad_norm": 0.17242087423801422,
|
|
"learning_rate": 1.499379981546497e-05,
|
|
"loss": 0.0049,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 1.2049151955694013,
|
|
"grad_norm": 0.05630020052194595,
|
|
"learning_rate": 1.4975183923068637e-05,
|
|
"loss": 0.0027,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 1.2067612784123687,
|
|
"grad_norm": 0.08747876435518265,
|
|
"learning_rate": 1.4956545089264408e-05,
|
|
"loss": 0.0087,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 1.2086073612553363,
|
|
"grad_norm": 0.34542739391326904,
|
|
"learning_rate": 1.493788339999907e-05,
|
|
"loss": 0.0352,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 1.210453444098304,
|
|
"grad_norm": 0.10722552984952927,
|
|
"learning_rate": 1.4919198941324813e-05,
|
|
"loss": 0.005,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 1.2122995269412715,
|
|
"grad_norm": 0.09170122444629669,
|
|
"learning_rate": 1.4900491799398802e-05,
|
|
"loss": 0.0026,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 1.2141456097842391,
|
|
"grad_norm": 0.11170299351215363,
|
|
"learning_rate": 1.4881762060482814e-05,
|
|
"loss": 0.0065,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 1.2159916926272065,
|
|
"grad_norm": 0.12743814289569855,
|
|
"learning_rate": 1.4863009810942814e-05,
|
|
"loss": 0.0055,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 1.2178377754701741,
|
|
"grad_norm": 0.16897545754909515,
|
|
"learning_rate": 1.4844235137248575e-05,
|
|
"loss": 0.0117,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.2196838583131417,
|
|
"grad_norm": 0.055120084434747696,
|
|
"learning_rate": 1.4825438125973263e-05,
|
|
"loss": 0.0017,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 1.2215299411561094,
|
|
"grad_norm": 0.0961981788277626,
|
|
"learning_rate": 1.4806618863793057e-05,
|
|
"loss": 0.0081,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 1.223376023999077,
|
|
"grad_norm": 0.11256881803274155,
|
|
"learning_rate": 1.4787777437486723e-05,
|
|
"loss": 0.0067,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 1.2252221068420446,
|
|
"grad_norm": 0.1181740015745163,
|
|
"learning_rate": 1.4768913933935249e-05,
|
|
"loss": 0.0052,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 1.2270681896850122,
|
|
"grad_norm": 0.11756162345409393,
|
|
"learning_rate": 1.475002844012141e-05,
|
|
"loss": 0.0119,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 1.2289142725279798,
|
|
"grad_norm": 0.11867949366569519,
|
|
"learning_rate": 1.4731121043129392e-05,
|
|
"loss": 0.0042,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 1.2307603553709472,
|
|
"grad_norm": 0.1038561463356018,
|
|
"learning_rate": 1.4712191830144369e-05,
|
|
"loss": 0.0083,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 1.2326064382139148,
|
|
"grad_norm": 0.13451151549816132,
|
|
"learning_rate": 1.4693240888452121e-05,
|
|
"loss": 0.0064,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 1.2344525210568824,
|
|
"grad_norm": 0.18300770223140717,
|
|
"learning_rate": 1.4674268305438624e-05,
|
|
"loss": 0.007,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 1.23629860389985,
|
|
"grad_norm": 0.09086968749761581,
|
|
"learning_rate": 1.4655274168589635e-05,
|
|
"loss": 0.0031,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 1.2381446867428176,
|
|
"grad_norm": 0.031037267297506332,
|
|
"learning_rate": 1.4636258565490304e-05,
|
|
"loss": 0.0013,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 1.2399907695857852,
|
|
"grad_norm": 0.10484003275632858,
|
|
"learning_rate": 1.461722158382478e-05,
|
|
"loss": 0.0082,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 1.2418368524287526,
|
|
"grad_norm": 0.11549515277147293,
|
|
"learning_rate": 1.459816331137577e-05,
|
|
"loss": 0.0089,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 1.2436829352717202,
|
|
"grad_norm": 0.09996828436851501,
|
|
"learning_rate": 1.4579083836024171e-05,
|
|
"loss": 0.0023,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 1.2455290181146879,
|
|
"grad_norm": 0.11347710341215134,
|
|
"learning_rate": 1.4559983245748639e-05,
|
|
"loss": 0.0115,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 1.2473751009576555,
|
|
"grad_norm": 0.08734507858753204,
|
|
"learning_rate": 1.4540861628625207e-05,
|
|
"loss": 0.0057,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 1.249221183800623,
|
|
"grad_norm": 0.08014446496963501,
|
|
"learning_rate": 1.4521719072826858e-05,
|
|
"loss": 0.0032,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 1.2510672666435907,
|
|
"grad_norm": 0.1405206322669983,
|
|
"learning_rate": 1.450255566662313e-05,
|
|
"loss": 0.0086,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 1.2529133494865583,
|
|
"grad_norm": 0.1792028844356537,
|
|
"learning_rate": 1.4483371498379702e-05,
|
|
"loss": 0.0139,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 1.254759432329526,
|
|
"grad_norm": 0.08750908821821213,
|
|
"learning_rate": 1.4464166656557997e-05,
|
|
"loss": 0.0024,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.2566055151724933,
|
|
"grad_norm": 0.1796724945306778,
|
|
"learning_rate": 1.444494122971476e-05,
|
|
"loss": 0.0102,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 1.258451598015461,
|
|
"grad_norm": 0.07849518209695816,
|
|
"learning_rate": 1.4425695306501656e-05,
|
|
"loss": 0.0035,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 1.2602976808584285,
|
|
"grad_norm": 0.09903024882078171,
|
|
"learning_rate": 1.4406428975664875e-05,
|
|
"loss": 0.0115,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 1.2621437637013961,
|
|
"grad_norm": 0.12543067336082458,
|
|
"learning_rate": 1.4387142326044696e-05,
|
|
"loss": 0.0099,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 1.2639898465443637,
|
|
"grad_norm": 0.12240198999643326,
|
|
"learning_rate": 1.43678354465751e-05,
|
|
"loss": 0.0124,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 1.2658359293873311,
|
|
"grad_norm": 0.1332004815340042,
|
|
"learning_rate": 1.4348508426283342e-05,
|
|
"loss": 0.0054,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 1.2676820122302987,
|
|
"grad_norm": 0.14584915339946747,
|
|
"learning_rate": 1.4329161354289562e-05,
|
|
"loss": 0.0044,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 1.2695280950732664,
|
|
"grad_norm": 0.08272266387939453,
|
|
"learning_rate": 1.4309794319806356e-05,
|
|
"loss": 0.0035,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 1.271374177916234,
|
|
"grad_norm": 0.1179232969880104,
|
|
"learning_rate": 1.4290407412138365e-05,
|
|
"loss": 0.0066,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 1.2732202607592016,
|
|
"grad_norm": 0.12980088591575623,
|
|
"learning_rate": 1.4271000720681874e-05,
|
|
"loss": 0.0075,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 1.2750663436021692,
|
|
"grad_norm": 0.09224485605955124,
|
|
"learning_rate": 1.4251574334924395e-05,
|
|
"loss": 0.0066,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 1.2769124264451368,
|
|
"grad_norm": 0.07015877217054367,
|
|
"learning_rate": 1.4232128344444251e-05,
|
|
"loss": 0.0019,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 1.2787585092881044,
|
|
"grad_norm": 0.09918259084224701,
|
|
"learning_rate": 1.421266283891017e-05,
|
|
"loss": 0.0046,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 1.2806045921310718,
|
|
"grad_norm": 0.12773172557353973,
|
|
"learning_rate": 1.419317790808086e-05,
|
|
"loss": 0.0171,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 1.2824506749740394,
|
|
"grad_norm": 0.15679265558719635,
|
|
"learning_rate": 1.417367364180461e-05,
|
|
"loss": 0.0351,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 1.284296757817007,
|
|
"grad_norm": 0.04852623865008354,
|
|
"learning_rate": 1.4154150130018867e-05,
|
|
"loss": 0.0017,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 1.2861428406599746,
|
|
"grad_norm": 0.3174004554748535,
|
|
"learning_rate": 1.4134607462749814e-05,
|
|
"loss": 0.0347,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 1.2879889235029423,
|
|
"grad_norm": 0.11431937664747238,
|
|
"learning_rate": 1.411504573011197e-05,
|
|
"loss": 0.012,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 1.2898350063459096,
|
|
"grad_norm": 0.12127841264009476,
|
|
"learning_rate": 1.409546502230777e-05,
|
|
"loss": 0.0292,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 1.2916810891888773,
|
|
"grad_norm": 0.11356043070554733,
|
|
"learning_rate": 1.4075865429627143e-05,
|
|
"loss": 0.0082,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.2935271720318449,
|
|
"grad_norm": 0.0658322125673294,
|
|
"learning_rate": 1.4056247042447096e-05,
|
|
"loss": 0.0034,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 1.2953732548748125,
|
|
"grad_norm": 0.10870260745286942,
|
|
"learning_rate": 1.4036609951231307e-05,
|
|
"loss": 0.0033,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 1.29721933771778,
|
|
"grad_norm": 0.08417758345603943,
|
|
"learning_rate": 1.4016954246529697e-05,
|
|
"loss": 0.0038,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 1.2990654205607477,
|
|
"grad_norm": 0.05838305503129959,
|
|
"learning_rate": 1.3997280018978018e-05,
|
|
"loss": 0.0019,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 1.3009115034037153,
|
|
"grad_norm": 0.09693298488855362,
|
|
"learning_rate": 1.397758735929744e-05,
|
|
"loss": 0.005,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 1.302757586246683,
|
|
"grad_norm": 0.08534782379865646,
|
|
"learning_rate": 1.3957876358294115e-05,
|
|
"loss": 0.0044,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 1.3046036690896505,
|
|
"grad_norm": 0.08009322732686996,
|
|
"learning_rate": 1.3938147106858776e-05,
|
|
"loss": 0.0034,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 1.306449751932618,
|
|
"grad_norm": 0.08357948064804077,
|
|
"learning_rate": 1.391839969596632e-05,
|
|
"loss": 0.004,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 1.3082958347755855,
|
|
"grad_norm": 0.14750374853610992,
|
|
"learning_rate": 1.3898634216675362e-05,
|
|
"loss": 0.0058,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 1.3101419176185531,
|
|
"grad_norm": 0.06477048248052597,
|
|
"learning_rate": 1.3878850760127848e-05,
|
|
"loss": 0.0057,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 1.3119880004615208,
|
|
"grad_norm": 0.07694050669670105,
|
|
"learning_rate": 1.385904941754862e-05,
|
|
"loss": 0.0039,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 1.3138340833044884,
|
|
"grad_norm": 0.1097872257232666,
|
|
"learning_rate": 1.3839230280244984e-05,
|
|
"loss": 0.0032,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 1.3156801661474558,
|
|
"grad_norm": 0.24045175313949585,
|
|
"learning_rate": 1.3819393439606313e-05,
|
|
"loss": 0.0028,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 1.3175262489904234,
|
|
"grad_norm": 0.05547857657074928,
|
|
"learning_rate": 1.37995389871036e-05,
|
|
"loss": 0.0026,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 1.319372331833391,
|
|
"grad_norm": 0.057202987372875214,
|
|
"learning_rate": 1.3779667014289067e-05,
|
|
"loss": 0.0025,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 1.3212184146763586,
|
|
"grad_norm": 0.06509260088205338,
|
|
"learning_rate": 1.375977761279571e-05,
|
|
"loss": 0.0023,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 1.3230644975193262,
|
|
"grad_norm": 0.09592520445585251,
|
|
"learning_rate": 1.3739870874336898e-05,
|
|
"loss": 0.0085,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 1.3249105803622938,
|
|
"grad_norm": 0.06174939498305321,
|
|
"learning_rate": 1.371994689070594e-05,
|
|
"loss": 0.0025,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 1.3267566632052614,
|
|
"grad_norm": 0.06818564236164093,
|
|
"learning_rate": 1.3700005753775671e-05,
|
|
"loss": 0.0022,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 1.328602746048229,
|
|
"grad_norm": 0.10227657854557037,
|
|
"learning_rate": 1.3680047555498017e-05,
|
|
"loss": 0.0039,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.3304488288911964,
|
|
"grad_norm": 0.12304170429706573,
|
|
"learning_rate": 1.366007238790358e-05,
|
|
"loss": 0.0055,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 1.332294911734164,
|
|
"grad_norm": 0.03807997703552246,
|
|
"learning_rate": 1.3640080343101209e-05,
|
|
"loss": 0.0015,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 1.3341409945771316,
|
|
"grad_norm": 0.058405570685863495,
|
|
"learning_rate": 1.362007151327758e-05,
|
|
"loss": 0.0059,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 1.3359870774200993,
|
|
"grad_norm": 0.1530926525592804,
|
|
"learning_rate": 1.3600045990696762e-05,
|
|
"loss": 0.0178,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 1.3378331602630669,
|
|
"grad_norm": 0.07261884212493896,
|
|
"learning_rate": 1.3580003867699801e-05,
|
|
"loss": 0.0027,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 1.3396792431060343,
|
|
"grad_norm": 0.11661682277917862,
|
|
"learning_rate": 1.3559945236704286e-05,
|
|
"loss": 0.0039,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 1.3415253259490019,
|
|
"grad_norm": 0.09348852932453156,
|
|
"learning_rate": 1.3539870190203937e-05,
|
|
"loss": 0.0098,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 1.3433714087919695,
|
|
"grad_norm": 0.06435918807983398,
|
|
"learning_rate": 1.3519778820768157e-05,
|
|
"loss": 0.0013,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 1.345217491634937,
|
|
"grad_norm": 0.13823622465133667,
|
|
"learning_rate": 1.349967122104162e-05,
|
|
"loss": 0.0086,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 1.3470635744779047,
|
|
"grad_norm": 0.0859057605266571,
|
|
"learning_rate": 1.3479547483743847e-05,
|
|
"loss": 0.0078,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 1.3489096573208723,
|
|
"grad_norm": 0.05262494832277298,
|
|
"learning_rate": 1.3459407701668762e-05,
|
|
"loss": 0.0021,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 1.35075574016384,
|
|
"grad_norm": 0.16662868857383728,
|
|
"learning_rate": 1.3439251967684288e-05,
|
|
"loss": 0.0301,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 1.3526018230068075,
|
|
"grad_norm": 0.1268492341041565,
|
|
"learning_rate": 1.3419080374731889e-05,
|
|
"loss": 0.0079,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 1.3544479058497751,
|
|
"grad_norm": 0.09013361483812332,
|
|
"learning_rate": 1.3398893015826166e-05,
|
|
"loss": 0.0044,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 1.3562939886927425,
|
|
"grad_norm": 0.1784420609474182,
|
|
"learning_rate": 1.3378689984054426e-05,
|
|
"loss": 0.0095,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 1.3581400715357101,
|
|
"grad_norm": 0.08392225950956345,
|
|
"learning_rate": 1.3358471372576229e-05,
|
|
"loss": 0.01,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 1.3581400715357101,
|
|
"eval_loss": 0.008186421357095242,
|
|
"eval_runtime": 91.7934,
|
|
"eval_samples_per_second": 9.946,
|
|
"eval_steps_per_second": 4.979,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 1.3599861543786778,
|
|
"grad_norm": 0.0762593075633049,
|
|
"learning_rate": 1.3338237274622983e-05,
|
|
"loss": 0.0025,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 1.3618322372216454,
|
|
"grad_norm": 0.07320329546928406,
|
|
"learning_rate": 1.331798778349752e-05,
|
|
"loss": 0.0082,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 1.363678320064613,
|
|
"grad_norm": 0.07280085980892181,
|
|
"learning_rate": 1.3297722992573636e-05,
|
|
"loss": 0.0027,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 1.3655244029075804,
|
|
"grad_norm": 0.05512760952115059,
|
|
"learning_rate": 1.327744299529568e-05,
|
|
"loss": 0.0028,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.367370485750548,
|
|
"grad_norm": 0.0936584621667862,
|
|
"learning_rate": 1.3257147885178125e-05,
|
|
"loss": 0.0033,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 1.3692165685935156,
|
|
"grad_norm": 0.08296520262956619,
|
|
"learning_rate": 1.3236837755805127e-05,
|
|
"loss": 0.0037,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 1.3710626514364832,
|
|
"grad_norm": 0.16028916835784912,
|
|
"learning_rate": 1.3216512700830104e-05,
|
|
"loss": 0.0044,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 1.3729087342794508,
|
|
"grad_norm": 0.054944444447755814,
|
|
"learning_rate": 1.3196172813975294e-05,
|
|
"loss": 0.0018,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 1.3747548171224184,
|
|
"grad_norm": 0.08457396924495697,
|
|
"learning_rate": 1.3175818189031326e-05,
|
|
"loss": 0.0037,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 1.376600899965386,
|
|
"grad_norm": 0.12011455744504929,
|
|
"learning_rate": 1.3155448919856792e-05,
|
|
"loss": 0.0079,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 1.3784469828083536,
|
|
"grad_norm": 0.10577619820833206,
|
|
"learning_rate": 1.3135065100377816e-05,
|
|
"loss": 0.0033,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 1.380293065651321,
|
|
"grad_norm": 0.10273445397615433,
|
|
"learning_rate": 1.31146668245876e-05,
|
|
"loss": 0.0185,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 1.3821391484942886,
|
|
"grad_norm": 0.16664770245552063,
|
|
"learning_rate": 1.3094254186546018e-05,
|
|
"loss": 0.0137,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 1.3839852313372563,
|
|
"grad_norm": 0.08871475607156754,
|
|
"learning_rate": 1.3073827280379177e-05,
|
|
"loss": 0.0033,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 1.3858313141802239,
|
|
"grad_norm": 0.06521476805210114,
|
|
"learning_rate": 1.3053386200278963e-05,
|
|
"loss": 0.0051,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 1.3876773970231915,
|
|
"grad_norm": 0.11643893271684647,
|
|
"learning_rate": 1.3032931040502627e-05,
|
|
"loss": 0.0087,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 1.3895234798661589,
|
|
"grad_norm": 0.10361618548631668,
|
|
"learning_rate": 1.3012461895372343e-05,
|
|
"loss": 0.0061,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 1.3913695627091265,
|
|
"grad_norm": 0.2820046544075012,
|
|
"learning_rate": 1.2991978859274776e-05,
|
|
"loss": 0.002,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 1.393215645552094,
|
|
"grad_norm": 0.10074072331190109,
|
|
"learning_rate": 1.2971482026660642e-05,
|
|
"loss": 0.0035,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 1.3950617283950617,
|
|
"grad_norm": 0.047872625291347504,
|
|
"learning_rate": 1.2950971492044272e-05,
|
|
"loss": 0.0018,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 1.3969078112380293,
|
|
"grad_norm": 0.10103340446949005,
|
|
"learning_rate": 1.2930447350003186e-05,
|
|
"loss": 0.0087,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 1.398753894080997,
|
|
"grad_norm": 0.11095941811800003,
|
|
"learning_rate": 1.2909909695177647e-05,
|
|
"loss": 0.0059,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 1.4005999769239645,
|
|
"grad_norm": 0.07271669059991837,
|
|
"learning_rate": 1.2889358622270225e-05,
|
|
"loss": 0.003,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 1.4024460597669322,
|
|
"grad_norm": 0.11334390938282013,
|
|
"learning_rate": 1.2868794226045367e-05,
|
|
"loss": 0.0085,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.4042921426098995,
|
|
"grad_norm": 0.06165868788957596,
|
|
"learning_rate": 1.2848216601328958e-05,
|
|
"loss": 0.003,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 1.4061382254528672,
|
|
"grad_norm": 0.11465856432914734,
|
|
"learning_rate": 1.2827625843007871e-05,
|
|
"loss": 0.002,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 1.4079843082958348,
|
|
"grad_norm": 0.1449616253376007,
|
|
"learning_rate": 1.2807022046029556e-05,
|
|
"loss": 0.014,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 1.4098303911388024,
|
|
"grad_norm": 0.04920601472258568,
|
|
"learning_rate": 1.278640530540157e-05,
|
|
"loss": 0.0024,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 1.41167647398177,
|
|
"grad_norm": 0.07521408796310425,
|
|
"learning_rate": 1.276577571619117e-05,
|
|
"loss": 0.0023,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 1.4135225568247374,
|
|
"grad_norm": 0.0906362533569336,
|
|
"learning_rate": 1.2745133373524855e-05,
|
|
"loss": 0.0054,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 1.415368639667705,
|
|
"grad_norm": 0.14911456406116486,
|
|
"learning_rate": 1.2724478372587921e-05,
|
|
"loss": 0.0016,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 1.4172147225106726,
|
|
"grad_norm": 0.059913989156484604,
|
|
"learning_rate": 1.2703810808624051e-05,
|
|
"loss": 0.0029,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 1.4190608053536402,
|
|
"grad_norm": 0.10398763418197632,
|
|
"learning_rate": 1.268313077693485e-05,
|
|
"loss": 0.0094,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 1.4209068881966078,
|
|
"grad_norm": 0.16045577824115753,
|
|
"learning_rate": 1.2662438372879409e-05,
|
|
"loss": 0.0032,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 1.4227529710395754,
|
|
"grad_norm": 0.0918615460395813,
|
|
"learning_rate": 1.2641733691873884e-05,
|
|
"loss": 0.0045,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 1.424599053882543,
|
|
"grad_norm": 0.18795117735862732,
|
|
"learning_rate": 1.2621016829391022e-05,
|
|
"loss": 0.0192,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 1.4264451367255107,
|
|
"grad_norm": 0.0472252257168293,
|
|
"learning_rate": 1.2600287880959762e-05,
|
|
"loss": 0.0014,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 1.4282912195684783,
|
|
"grad_norm": 0.10216383635997772,
|
|
"learning_rate": 1.2579546942164762e-05,
|
|
"loss": 0.0038,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 1.4301373024114457,
|
|
"grad_norm": 0.1046018898487091,
|
|
"learning_rate": 1.2558794108645966e-05,
|
|
"loss": 0.0045,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 1.4319833852544133,
|
|
"grad_norm": 0.08859994262456894,
|
|
"learning_rate": 1.2538029476098175e-05,
|
|
"loss": 0.0065,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 1.4338294680973809,
|
|
"grad_norm": 0.06777019798755646,
|
|
"learning_rate": 1.2517253140270595e-05,
|
|
"loss": 0.0024,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 1.4356755509403485,
|
|
"grad_norm": 0.08122848719358444,
|
|
"learning_rate": 1.2496465196966393e-05,
|
|
"loss": 0.0025,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 1.437521633783316,
|
|
"grad_norm": 0.07865536212921143,
|
|
"learning_rate": 1.2475665742042269e-05,
|
|
"loss": 0.0033,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 1.4393677166262835,
|
|
"grad_norm": 0.07937873899936676,
|
|
"learning_rate": 1.2454854871407993e-05,
|
|
"loss": 0.0109,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.441213799469251,
|
|
"grad_norm": 0.04683367908000946,
|
|
"learning_rate": 1.2434032681025986e-05,
|
|
"loss": 0.0022,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 1.4430598823122187,
|
|
"grad_norm": 0.08413892239332199,
|
|
"learning_rate": 1.2413199266910865e-05,
|
|
"loss": 0.0018,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 1.4449059651551863,
|
|
"grad_norm": 0.04942731931805611,
|
|
"learning_rate": 1.239235472512899e-05,
|
|
"loss": 0.0022,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 1.446752047998154,
|
|
"grad_norm": 0.06107890605926514,
|
|
"learning_rate": 1.2371499151798046e-05,
|
|
"loss": 0.0022,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 1.4485981308411215,
|
|
"grad_norm": 0.05639895424246788,
|
|
"learning_rate": 1.2350632643086583e-05,
|
|
"loss": 0.0019,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 1.4504442136840892,
|
|
"grad_norm": 0.03176642209291458,
|
|
"learning_rate": 1.2329755295213568e-05,
|
|
"loss": 0.0013,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 1.4522902965270568,
|
|
"grad_norm": 0.08829797804355621,
|
|
"learning_rate": 1.2308867204447958e-05,
|
|
"loss": 0.0015,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 1.4541363793700242,
|
|
"grad_norm": 0.09004661440849304,
|
|
"learning_rate": 1.228796846710825e-05,
|
|
"loss": 0.0105,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 1.4559824622129918,
|
|
"grad_norm": 0.09356413036584854,
|
|
"learning_rate": 1.226705917956202e-05,
|
|
"loss": 0.0118,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 1.4578285450559594,
|
|
"grad_norm": 0.08679736405611038,
|
|
"learning_rate": 1.2246139438225509e-05,
|
|
"loss": 0.0036,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 1.459674627898927,
|
|
"grad_norm": 0.05207599699497223,
|
|
"learning_rate": 1.2225209339563144e-05,
|
|
"loss": 0.0038,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 1.4615207107418946,
|
|
"grad_norm": 0.1354241818189621,
|
|
"learning_rate": 1.2204268980087132e-05,
|
|
"loss": 0.0215,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 1.463366793584862,
|
|
"grad_norm": 0.17882613837718964,
|
|
"learning_rate": 1.2183318456356984e-05,
|
|
"loss": 0.0118,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 1.4652128764278296,
|
|
"grad_norm": 0.06782796233892441,
|
|
"learning_rate": 1.2162357864979073e-05,
|
|
"loss": 0.0019,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 1.4670589592707972,
|
|
"grad_norm": 0.1393979787826538,
|
|
"learning_rate": 1.214138730260621e-05,
|
|
"loss": 0.0063,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 1.4689050421137648,
|
|
"grad_norm": 0.20122197270393372,
|
|
"learning_rate": 1.2120406865937174e-05,
|
|
"loss": 0.0104,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 1.4707511249567324,
|
|
"grad_norm": 0.11681712418794632,
|
|
"learning_rate": 1.2099416651716277e-05,
|
|
"loss": 0.003,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 1.4725972077997,
|
|
"grad_norm": 0.07837650179862976,
|
|
"learning_rate": 1.2078416756732925e-05,
|
|
"loss": 0.0018,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 1.4744432906426677,
|
|
"grad_norm": 0.055501487106084824,
|
|
"learning_rate": 1.2057407277821148e-05,
|
|
"loss": 0.0023,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 1.4762893734856353,
|
|
"grad_norm": 0.0729304850101471,
|
|
"learning_rate": 1.2036388311859189e-05,
|
|
"loss": 0.0085,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.4781354563286029,
|
|
"grad_norm": 0.05123327299952507,
|
|
"learning_rate": 1.2015359955769021e-05,
|
|
"loss": 0.002,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 1.4799815391715703,
|
|
"grad_norm": 0.06856680661439896,
|
|
"learning_rate": 1.1994322306515926e-05,
|
|
"loss": 0.0027,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 1.4818276220145379,
|
|
"grad_norm": 0.08618495613336563,
|
|
"learning_rate": 1.1973275461108027e-05,
|
|
"loss": 0.003,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 1.4836737048575055,
|
|
"grad_norm": 0.11988022923469543,
|
|
"learning_rate": 1.1952219516595868e-05,
|
|
"loss": 0.0027,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 1.485519787700473,
|
|
"grad_norm": 0.08761744946241379,
|
|
"learning_rate": 1.193115457007194e-05,
|
|
"loss": 0.0022,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 1.4873658705434407,
|
|
"grad_norm": 0.06701692938804626,
|
|
"learning_rate": 1.1910080718670246e-05,
|
|
"loss": 0.0045,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 1.489211953386408,
|
|
"grad_norm": 0.05863157659769058,
|
|
"learning_rate": 1.1888998059565848e-05,
|
|
"loss": 0.0021,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 1.4910580362293757,
|
|
"grad_norm": 0.08547432720661163,
|
|
"learning_rate": 1.186790668997443e-05,
|
|
"loss": 0.0038,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 1.4929041190723433,
|
|
"grad_norm": 0.13616196811199188,
|
|
"learning_rate": 1.1846806707151832e-05,
|
|
"loss": 0.0042,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 1.494750201915311,
|
|
"grad_norm": 0.05642635375261307,
|
|
"learning_rate": 1.182569820839362e-05,
|
|
"loss": 0.0036,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 1.4965962847582785,
|
|
"grad_norm": 0.11068624258041382,
|
|
"learning_rate": 1.1804581291034615e-05,
|
|
"loss": 0.0046,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 1.4984423676012462,
|
|
"grad_norm": 0.07298606634140015,
|
|
"learning_rate": 1.1783456052448476e-05,
|
|
"loss": 0.0031,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 1.5002884504442138,
|
|
"grad_norm": 0.1301308572292328,
|
|
"learning_rate": 1.176232259004722e-05,
|
|
"loss": 0.0139,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 1.5021345332871814,
|
|
"grad_norm": 0.14820291101932526,
|
|
"learning_rate": 1.1741181001280783e-05,
|
|
"loss": 0.0079,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 1.503980616130149,
|
|
"grad_norm": 0.0516553595662117,
|
|
"learning_rate": 1.1720031383636585e-05,
|
|
"loss": 0.0015,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 1.5058266989731164,
|
|
"grad_norm": 0.09537000954151154,
|
|
"learning_rate": 1.169887383463906e-05,
|
|
"loss": 0.0099,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 1.507672781816084,
|
|
"grad_norm": 0.21092131733894348,
|
|
"learning_rate": 1.1677708451849214e-05,
|
|
"loss": 0.0663,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 1.5095188646590516,
|
|
"grad_norm": 0.10161333531141281,
|
|
"learning_rate": 1.165653533286418e-05,
|
|
"loss": 0.0043,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 1.511364947502019,
|
|
"grad_norm": 0.17274737358093262,
|
|
"learning_rate": 1.1635354575316765e-05,
|
|
"loss": 0.0243,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 1.5132110303449866,
|
|
"grad_norm": 0.09211444109678268,
|
|
"learning_rate": 1.1614166276874994e-05,
|
|
"loss": 0.0044,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.5150571131879542,
|
|
"grad_norm": 0.10178298503160477,
|
|
"learning_rate": 1.1592970535241668e-05,
|
|
"loss": 0.0046,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 1.5169031960309218,
|
|
"grad_norm": 0.14009442925453186,
|
|
"learning_rate": 1.15717674481539e-05,
|
|
"loss": 0.0164,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 1.5187492788738894,
|
|
"grad_norm": 0.07609976083040237,
|
|
"learning_rate": 1.1550557113382697e-05,
|
|
"loss": 0.0027,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 1.520595361716857,
|
|
"grad_norm": 0.08188093453645706,
|
|
"learning_rate": 1.1529339628732462e-05,
|
|
"loss": 0.0068,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 1.5224414445598247,
|
|
"grad_norm": 0.10872019082307816,
|
|
"learning_rate": 1.1508115092040577e-05,
|
|
"loss": 0.0066,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 1.5242875274027923,
|
|
"grad_norm": 0.11876392364501953,
|
|
"learning_rate": 1.1486883601176944e-05,
|
|
"loss": 0.005,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 1.5261336102457599,
|
|
"grad_norm": 0.08055685460567474,
|
|
"learning_rate": 1.146564525404353e-05,
|
|
"loss": 0.0033,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 1.5279796930887275,
|
|
"grad_norm": 0.055356502532958984,
|
|
"learning_rate": 1.1444400148573918e-05,
|
|
"loss": 0.0019,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 1.5279796930887275,
|
|
"eval_loss": 0.008090285584330559,
|
|
"eval_runtime": 91.6507,
|
|
"eval_samples_per_second": 9.962,
|
|
"eval_steps_per_second": 4.986,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 1.5298257759316949,
|
|
"grad_norm": 0.11040205508470535,
|
|
"learning_rate": 1.1423148382732854e-05,
|
|
"loss": 0.0074,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 1.5316718587746625,
|
|
"grad_norm": 0.05763715133070946,
|
|
"learning_rate": 1.1401890054515792e-05,
|
|
"loss": 0.0019,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 1.53351794161763,
|
|
"grad_norm": 0.13855917751789093,
|
|
"learning_rate": 1.1380625261948458e-05,
|
|
"loss": 0.0046,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 1.5353640244605977,
|
|
"grad_norm": 0.07250423729419708,
|
|
"learning_rate": 1.1359354103086377e-05,
|
|
"loss": 0.0029,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 1.537210107303565,
|
|
"grad_norm": 0.07645686715841293,
|
|
"learning_rate": 1.1338076676014427e-05,
|
|
"loss": 0.0048,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 1.5390561901465327,
|
|
"grad_norm": 0.08407016843557358,
|
|
"learning_rate": 1.1316793078846395e-05,
|
|
"loss": 0.0047,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 1.5409022729895003,
|
|
"grad_norm": 0.08176873624324799,
|
|
"learning_rate": 1.1295503409724526e-05,
|
|
"loss": 0.0053,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 1.542748355832468,
|
|
"grad_norm": 0.07881695032119751,
|
|
"learning_rate": 1.127420776681905e-05,
|
|
"loss": 0.0075,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 1.5445944386754356,
|
|
"grad_norm": 0.052021872252225876,
|
|
"learning_rate": 1.1252906248327753e-05,
|
|
"loss": 0.0022,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 1.5464405215184032,
|
|
"grad_norm": 0.12975731492042542,
|
|
"learning_rate": 1.1231598952475504e-05,
|
|
"loss": 0.0109,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 1.5482866043613708,
|
|
"grad_norm": 0.06862396746873856,
|
|
"learning_rate": 1.1210285977513833e-05,
|
|
"loss": 0.003,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 1.5501326872043384,
|
|
"grad_norm": 0.10307830572128296,
|
|
"learning_rate": 1.1188967421720434e-05,
|
|
"loss": 0.0068,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.551978770047306,
|
|
"grad_norm": 0.11857514083385468,
|
|
"learning_rate": 1.1167643383398746e-05,
|
|
"loss": 0.0075,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 1.5538248528902736,
|
|
"grad_norm": 0.057690635323524475,
|
|
"learning_rate": 1.1146313960877486e-05,
|
|
"loss": 0.0018,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 1.555670935733241,
|
|
"grad_norm": 0.0859212726354599,
|
|
"learning_rate": 1.1124979252510209e-05,
|
|
"loss": 0.0054,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 1.5575170185762086,
|
|
"grad_norm": 0.14271004498004913,
|
|
"learning_rate": 1.1103639356674825e-05,
|
|
"loss": 0.0175,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 1.5593631014191762,
|
|
"grad_norm": 0.0606565847992897,
|
|
"learning_rate": 1.1082294371773182e-05,
|
|
"loss": 0.0033,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 1.5612091842621436,
|
|
"grad_norm": 0.05779292806982994,
|
|
"learning_rate": 1.1060944396230583e-05,
|
|
"loss": 0.0028,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 1.5630552671051112,
|
|
"grad_norm": 0.0749453604221344,
|
|
"learning_rate": 1.1039589528495347e-05,
|
|
"loss": 0.005,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 1.5649013499480788,
|
|
"grad_norm": 0.14807872474193573,
|
|
"learning_rate": 1.1018229867038358e-05,
|
|
"loss": 0.0031,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 1.5667474327910464,
|
|
"grad_norm": 0.08591480553150177,
|
|
"learning_rate": 1.099686551035259e-05,
|
|
"loss": 0.0039,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 1.568593515634014,
|
|
"grad_norm": 0.06698640435934067,
|
|
"learning_rate": 1.0975496556952683e-05,
|
|
"loss": 0.0026,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 1.5704395984769817,
|
|
"grad_norm": 0.06228471174836159,
|
|
"learning_rate": 1.0954123105374468e-05,
|
|
"loss": 0.002,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 1.5722856813199493,
|
|
"grad_norm": 0.29273462295532227,
|
|
"learning_rate": 1.0932745254174512e-05,
|
|
"loss": 0.0175,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 1.574131764162917,
|
|
"grad_norm": 0.07816620916128159,
|
|
"learning_rate": 1.0911363101929677e-05,
|
|
"loss": 0.0057,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 1.5759778470058845,
|
|
"grad_norm": 0.1502242088317871,
|
|
"learning_rate": 1.0889976747236657e-05,
|
|
"loss": 0.0067,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 1.5778239298488521,
|
|
"grad_norm": 0.0736665204167366,
|
|
"learning_rate": 1.0868586288711515e-05,
|
|
"loss": 0.003,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 1.5796700126918195,
|
|
"grad_norm": 0.04509057104587555,
|
|
"learning_rate": 1.0847191824989252e-05,
|
|
"loss": 0.0053,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 1.5815160955347871,
|
|
"grad_norm": 0.05116039142012596,
|
|
"learning_rate": 1.0825793454723325e-05,
|
|
"loss": 0.0021,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 1.5833621783777547,
|
|
"grad_norm": 0.05853905901312828,
|
|
"learning_rate": 1.080439127658521e-05,
|
|
"loss": 0.007,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 1.5852082612207221,
|
|
"grad_norm": 0.08044451475143433,
|
|
"learning_rate": 1.078298538926395e-05,
|
|
"loss": 0.0045,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 1.5870543440636897,
|
|
"grad_norm": 0.09421449154615402,
|
|
"learning_rate": 1.076157589146567e-05,
|
|
"loss": 0.0093,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.5889004269066573,
|
|
"grad_norm": 0.15502449870109558,
|
|
"learning_rate": 1.0740162881913165e-05,
|
|
"loss": 0.0184,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 1.590746509749625,
|
|
"grad_norm": 0.08580180257558823,
|
|
"learning_rate": 1.0718746459345415e-05,
|
|
"loss": 0.0048,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 1.5925925925925926,
|
|
"grad_norm": 0.07770812511444092,
|
|
"learning_rate": 1.0697326722517137e-05,
|
|
"loss": 0.006,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 1.5944386754355602,
|
|
"grad_norm": 0.15370580554008484,
|
|
"learning_rate": 1.0675903770198333e-05,
|
|
"loss": 0.0326,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 1.5962847582785278,
|
|
"grad_norm": 0.08603129535913467,
|
|
"learning_rate": 1.0654477701173824e-05,
|
|
"loss": 0.0037,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 1.5981308411214954,
|
|
"grad_norm": 0.1838487982749939,
|
|
"learning_rate": 1.0633048614242817e-05,
|
|
"loss": 0.0274,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 1.599976923964463,
|
|
"grad_norm": 0.08164787292480469,
|
|
"learning_rate": 1.0611616608218429e-05,
|
|
"loss": 0.0028,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 1.6018230068074306,
|
|
"grad_norm": 0.05253343656659126,
|
|
"learning_rate": 1.0590181781927229e-05,
|
|
"loss": 0.0049,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 1.6036690896503982,
|
|
"grad_norm": 0.0916278213262558,
|
|
"learning_rate": 1.05687442342088e-05,
|
|
"loss": 0.0053,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 1.6055151724933656,
|
|
"grad_norm": 0.06521543860435486,
|
|
"learning_rate": 1.0547304063915277e-05,
|
|
"loss": 0.0058,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 1.6073612553363332,
|
|
"grad_norm": 0.08294078707695007,
|
|
"learning_rate": 1.0525861369910877e-05,
|
|
"loss": 0.0047,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 1.6092073381793008,
|
|
"grad_norm": 0.09736410528421402,
|
|
"learning_rate": 1.0504416251071463e-05,
|
|
"loss": 0.0041,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 1.6110534210222682,
|
|
"grad_norm": 0.1033322662115097,
|
|
"learning_rate": 1.0482968806284073e-05,
|
|
"loss": 0.0088,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 1.6128995038652358,
|
|
"grad_norm": 0.054253265261650085,
|
|
"learning_rate": 1.0461519134446477e-05,
|
|
"loss": 0.0035,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 1.6147455867082035,
|
|
"grad_norm": 0.08290667086839676,
|
|
"learning_rate": 1.0440067334466712e-05,
|
|
"loss": 0.0101,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 1.616591669551171,
|
|
"grad_norm": 0.13517652451992035,
|
|
"learning_rate": 1.0418613505262623e-05,
|
|
"loss": 0.0126,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 1.6184377523941387,
|
|
"grad_norm": 0.06763678044080734,
|
|
"learning_rate": 1.0397157745761419e-05,
|
|
"loss": 0.0025,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 1.6202838352371063,
|
|
"grad_norm": 0.07910499721765518,
|
|
"learning_rate": 1.0375700154899208e-05,
|
|
"loss": 0.006,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 1.622129918080074,
|
|
"grad_norm": 0.111945740878582,
|
|
"learning_rate": 1.0354240831620542e-05,
|
|
"loss": 0.0046,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 1.6239760009230415,
|
|
"grad_norm": 0.0739910826086998,
|
|
"learning_rate": 1.0332779874877959e-05,
|
|
"loss": 0.0049,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.6258220837660091,
|
|
"grad_norm": 0.11454188078641891,
|
|
"learning_rate": 1.0311317383631532e-05,
|
|
"loss": 0.01,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 1.6276681666089767,
|
|
"grad_norm": 0.0890415832400322,
|
|
"learning_rate": 1.028985345684841e-05,
|
|
"loss": 0.0072,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 1.6295142494519441,
|
|
"grad_norm": 0.06510549783706665,
|
|
"learning_rate": 1.0268388193502365e-05,
|
|
"loss": 0.0047,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 1.6313603322949117,
|
|
"grad_norm": 0.060889944434165955,
|
|
"learning_rate": 1.0246921692573322e-05,
|
|
"loss": 0.0024,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 1.6332064151378793,
|
|
"grad_norm": 0.14020121097564697,
|
|
"learning_rate": 1.0225454053046922e-05,
|
|
"loss": 0.0044,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 1.6350524979808467,
|
|
"grad_norm": 0.08631158620119095,
|
|
"learning_rate": 1.0203985373914056e-05,
|
|
"loss": 0.003,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 1.6368985808238143,
|
|
"grad_norm": 0.17379646003246307,
|
|
"learning_rate": 1.0182515754170402e-05,
|
|
"loss": 0.0051,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 1.638744663666782,
|
|
"grad_norm": 0.08604667335748672,
|
|
"learning_rate": 1.0161045292815974e-05,
|
|
"loss": 0.0053,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 1.6405907465097496,
|
|
"grad_norm": 0.09943891316652298,
|
|
"learning_rate": 1.0139574088854682e-05,
|
|
"loss": 0.0073,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 1.6424368293527172,
|
|
"grad_norm": 0.1661536991596222,
|
|
"learning_rate": 1.0118102241293848e-05,
|
|
"loss": 0.0056,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 1.6442829121956848,
|
|
"grad_norm": 0.062387865036726,
|
|
"learning_rate": 1.0096629849143757e-05,
|
|
"loss": 0.0053,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 1.6461289950386524,
|
|
"grad_norm": 0.07064353674650192,
|
|
"learning_rate": 1.007515701141722e-05,
|
|
"loss": 0.0036,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 1.64797507788162,
|
|
"grad_norm": 0.11467552930116653,
|
|
"learning_rate": 1.0053683827129091e-05,
|
|
"loss": 0.0134,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 1.6498211607245876,
|
|
"grad_norm": 0.14931882917881012,
|
|
"learning_rate": 1.0032210395295829e-05,
|
|
"loss": 0.0165,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 1.6516672435675552,
|
|
"grad_norm": 0.06062651425600052,
|
|
"learning_rate": 1.001073681493503e-05,
|
|
"loss": 0.0021,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 1.6535133264105226,
|
|
"grad_norm": 0.09045737981796265,
|
|
"learning_rate": 9.989263185064974e-06,
|
|
"loss": 0.0023,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 1.6553594092534902,
|
|
"grad_norm": 0.15048907697200775,
|
|
"learning_rate": 9.967789604704173e-06,
|
|
"loss": 0.0048,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 1.6572054920964578,
|
|
"grad_norm": 0.05222946032881737,
|
|
"learning_rate": 9.946316172870909e-06,
|
|
"loss": 0.0019,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 1.6590515749394255,
|
|
"grad_norm": 0.09345798194408417,
|
|
"learning_rate": 9.924842988582783e-06,
|
|
"loss": 0.0055,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 1.6608976577823928,
|
|
"grad_norm": 0.0894194170832634,
|
|
"learning_rate": 9.903370150856245e-06,
|
|
"loss": 0.005,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.6627437406253605,
|
|
"grad_norm": 0.10677597671747208,
|
|
"learning_rate": 9.881897758706155e-06,
|
|
"loss": 0.0069,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 1.664589823468328,
|
|
"grad_norm": 0.06564165651798248,
|
|
"learning_rate": 9.860425911145323e-06,
|
|
"loss": 0.0072,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 1.6664359063112957,
|
|
"grad_norm": 0.056849405169487,
|
|
"learning_rate": 9.83895470718403e-06,
|
|
"loss": 0.0018,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 1.6682819891542633,
|
|
"grad_norm": 0.05910499766469002,
|
|
"learning_rate": 9.817484245829603e-06,
|
|
"loss": 0.0031,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 1.670128071997231,
|
|
"grad_norm": 0.1368527114391327,
|
|
"learning_rate": 9.79601462608595e-06,
|
|
"loss": 0.0171,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 1.6719741548401985,
|
|
"grad_norm": 0.48427149653434753,
|
|
"learning_rate": 9.77454594695308e-06,
|
|
"loss": 0.0134,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 1.6738202376831661,
|
|
"grad_norm": 0.045465849339962006,
|
|
"learning_rate": 9.75307830742668e-06,
|
|
"loss": 0.0016,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 1.6756663205261337,
|
|
"grad_norm": 0.29681748151779175,
|
|
"learning_rate": 9.731611806497637e-06,
|
|
"loss": 0.006,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 1.6775124033691013,
|
|
"grad_norm": 0.12164149433374405,
|
|
"learning_rate": 9.710146543151593e-06,
|
|
"loss": 0.0058,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 1.6793584862120687,
|
|
"grad_norm": 0.0662166029214859,
|
|
"learning_rate": 9.688682616368471e-06,
|
|
"loss": 0.0028,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 1.6812045690550363,
|
|
"grad_norm": 0.10360967367887497,
|
|
"learning_rate": 9.667220125122044e-06,
|
|
"loss": 0.0069,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 1.683050651898004,
|
|
"grad_norm": 0.06868352741003036,
|
|
"learning_rate": 9.645759168379463e-06,
|
|
"loss": 0.0028,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 1.6848967347409713,
|
|
"grad_norm": 0.07601200044155121,
|
|
"learning_rate": 9.624299845100795e-06,
|
|
"loss": 0.0079,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 1.686742817583939,
|
|
"grad_norm": 0.08922822028398514,
|
|
"learning_rate": 9.602842254238583e-06,
|
|
"loss": 0.0035,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 1.6885889004269066,
|
|
"grad_norm": 0.267914742231369,
|
|
"learning_rate": 9.58138649473738e-06,
|
|
"loss": 0.0137,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 1.6904349832698742,
|
|
"grad_norm": 0.10150199383497238,
|
|
"learning_rate": 9.559932665533291e-06,
|
|
"loss": 0.0044,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 1.6922810661128418,
|
|
"grad_norm": 0.08167190849781036,
|
|
"learning_rate": 9.538480865553523e-06,
|
|
"loss": 0.0028,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 1.6941271489558094,
|
|
"grad_norm": 0.08491844683885574,
|
|
"learning_rate": 9.51703119371593e-06,
|
|
"loss": 0.0046,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 1.695973231798777,
|
|
"grad_norm": 0.1229473352432251,
|
|
"learning_rate": 9.495583748928539e-06,
|
|
"loss": 0.0069,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 1.6978193146417446,
|
|
"grad_norm": 0.10284758359193802,
|
|
"learning_rate": 9.474138630089124e-06,
|
|
"loss": 0.0045,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.6978193146417446,
|
|
"eval_loss": 0.008002725429832935,
|
|
"eval_runtime": 92.1065,
|
|
"eval_samples_per_second": 9.912,
|
|
"eval_steps_per_second": 4.962,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.6996653974847122,
|
|
"grad_norm": 0.1596386730670929,
|
|
"learning_rate": 9.452695936084728e-06,
|
|
"loss": 0.0124,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 1.7015114803276798,
|
|
"grad_norm": 0.11229660362005234,
|
|
"learning_rate": 9.431255765791201e-06,
|
|
"loss": 0.0156,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 1.7033575631706472,
|
|
"grad_norm": 0.05473716929554939,
|
|
"learning_rate": 9.409818218072774e-06,
|
|
"loss": 0.0021,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 1.7052036460136148,
|
|
"grad_norm": 0.06023913249373436,
|
|
"learning_rate": 9.388383391781576e-06,
|
|
"loss": 0.0029,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 1.7070497288565825,
|
|
"grad_norm": 0.06623807549476624,
|
|
"learning_rate": 9.366951385757184e-06,
|
|
"loss": 0.0024,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 1.70889581169955,
|
|
"grad_norm": 0.08034256845712662,
|
|
"learning_rate": 9.345522298826177e-06,
|
|
"loss": 0.0042,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 1.7107418945425175,
|
|
"grad_norm": 0.06799780577421188,
|
|
"learning_rate": 9.324096229801673e-06,
|
|
"loss": 0.0074,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 1.712587977385485,
|
|
"grad_norm": 0.24874067306518555,
|
|
"learning_rate": 9.302673277482867e-06,
|
|
"loss": 0.0075,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 1.7144340602284527,
|
|
"grad_norm": 0.06625612080097198,
|
|
"learning_rate": 9.281253540654586e-06,
|
|
"loss": 0.005,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 1.7162801430714203,
|
|
"grad_norm": 0.1352481245994568,
|
|
"learning_rate": 9.259837118086837e-06,
|
|
"loss": 0.0113,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 1.718126225914388,
|
|
"grad_norm": 0.10778124630451202,
|
|
"learning_rate": 9.238424108534333e-06,
|
|
"loss": 0.0048,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 1.7199723087573555,
|
|
"grad_norm": 0.060022782534360886,
|
|
"learning_rate": 9.217014610736054e-06,
|
|
"loss": 0.0025,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 1.7218183916003231,
|
|
"grad_norm": 0.07783634960651398,
|
|
"learning_rate": 9.19560872341479e-06,
|
|
"loss": 0.0079,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 1.7236644744432907,
|
|
"grad_norm": 0.10427862405776978,
|
|
"learning_rate": 9.174206545276678e-06,
|
|
"loss": 0.0072,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 1.7255105572862584,
|
|
"grad_norm": 0.10971418768167496,
|
|
"learning_rate": 9.15280817501075e-06,
|
|
"loss": 0.0032,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 1.727356640129226,
|
|
"grad_norm": 0.057751502841711044,
|
|
"learning_rate": 9.131413711288485e-06,
|
|
"loss": 0.0052,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 1.7292027229721934,
|
|
"grad_norm": 0.08130896091461182,
|
|
"learning_rate": 9.110023252763348e-06,
|
|
"loss": 0.0039,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 1.731048805815161,
|
|
"grad_norm": 0.07428565621376038,
|
|
"learning_rate": 9.088636898070326e-06,
|
|
"loss": 0.0028,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 1.7328948886581286,
|
|
"grad_norm": 0.08366493880748749,
|
|
"learning_rate": 9.067254745825488e-06,
|
|
"loss": 0.0027,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 1.734740971501096,
|
|
"grad_norm": 0.057682104408741,
|
|
"learning_rate": 9.045876894625537e-06,
|
|
"loss": 0.0023,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.7365870543440636,
|
|
"grad_norm": 0.13296572864055634,
|
|
"learning_rate": 9.024503443047318e-06,
|
|
"loss": 0.0064,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 1.7384331371870312,
|
|
"grad_norm": 0.10311231017112732,
|
|
"learning_rate": 9.003134489647412e-06,
|
|
"loss": 0.0057,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 1.7402792200299988,
|
|
"grad_norm": 0.0860329195857048,
|
|
"learning_rate": 8.981770132961649e-06,
|
|
"loss": 0.0053,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 1.7421253028729664,
|
|
"grad_norm": 0.07848010212182999,
|
|
"learning_rate": 8.960410471504656e-06,
|
|
"loss": 0.0029,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 1.743971385715934,
|
|
"grad_norm": 0.053934577852487564,
|
|
"learning_rate": 8.93905560376942e-06,
|
|
"loss": 0.0041,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 1.7458174685589016,
|
|
"grad_norm": 0.0708983913064003,
|
|
"learning_rate": 8.917705628226823e-06,
|
|
"loss": 0.0033,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 1.7476635514018692,
|
|
"grad_norm": 0.1002105101943016,
|
|
"learning_rate": 8.896360643325177e-06,
|
|
"loss": 0.0037,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 1.7495096342448369,
|
|
"grad_norm": 0.09933286160230637,
|
|
"learning_rate": 8.875020747489795e-06,
|
|
"loss": 0.0147,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 1.7513557170878045,
|
|
"grad_norm": 0.10095307976007462,
|
|
"learning_rate": 8.853686039122519e-06,
|
|
"loss": 0.0165,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 1.7532017999307719,
|
|
"grad_norm": 0.08462752401828766,
|
|
"learning_rate": 8.83235661660126e-06,
|
|
"loss": 0.0058,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 1.7550478827737395,
|
|
"grad_norm": 0.06106647104024887,
|
|
"learning_rate": 8.81103257827957e-06,
|
|
"loss": 0.0048,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 1.756893965616707,
|
|
"grad_norm": 0.07836233079433441,
|
|
"learning_rate": 8.789714022486168e-06,
|
|
"loss": 0.0026,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 1.7587400484596745,
|
|
"grad_norm": 0.09076899290084839,
|
|
"learning_rate": 8.768401047524498e-06,
|
|
"loss": 0.0131,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 1.760586131302642,
|
|
"grad_norm": 0.07738377153873444,
|
|
"learning_rate": 8.74709375167225e-06,
|
|
"loss": 0.0061,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 1.7624322141456097,
|
|
"grad_norm": 0.0879320502281189,
|
|
"learning_rate": 8.72579223318095e-06,
|
|
"loss": 0.0043,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 1.7642782969885773,
|
|
"grad_norm": 0.24148541688919067,
|
|
"learning_rate": 8.704496590275479e-06,
|
|
"loss": 0.0047,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 1.766124379831545,
|
|
"grad_norm": 0.16469478607177734,
|
|
"learning_rate": 8.683206921153607e-06,
|
|
"loss": 0.0042,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 1.7679704626745125,
|
|
"grad_norm": 0.24314740300178528,
|
|
"learning_rate": 8.661923323985576e-06,
|
|
"loss": 0.0118,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 1.7698165455174801,
|
|
"grad_norm": 0.11575371026992798,
|
|
"learning_rate": 8.640645896913628e-06,
|
|
"loss": 0.0142,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 1.7716626283604477,
|
|
"grad_norm": 0.08918927609920502,
|
|
"learning_rate": 8.619374738051543e-06,
|
|
"loss": 0.0181,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.7735087112034154,
|
|
"grad_norm": 0.05675291642546654,
|
|
"learning_rate": 8.598109945484208e-06,
|
|
"loss": 0.0029,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 1.775354794046383,
|
|
"grad_norm": 0.07499504834413528,
|
|
"learning_rate": 8.576851617267151e-06,
|
|
"loss": 0.0025,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 1.7772008768893504,
|
|
"grad_norm": 0.056793201714754105,
|
|
"learning_rate": 8.555599851426086e-06,
|
|
"loss": 0.0026,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 1.779046959732318,
|
|
"grad_norm": 0.05858364701271057,
|
|
"learning_rate": 8.534354745956472e-06,
|
|
"loss": 0.0038,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 1.7808930425752856,
|
|
"grad_norm": 0.07139363139867783,
|
|
"learning_rate": 8.51311639882306e-06,
|
|
"loss": 0.003,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 1.7827391254182532,
|
|
"grad_norm": 0.09954049438238144,
|
|
"learning_rate": 8.491884907959426e-06,
|
|
"loss": 0.0118,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 1.7845852082612206,
|
|
"grad_norm": 0.09860999137163162,
|
|
"learning_rate": 8.47066037126754e-06,
|
|
"loss": 0.0109,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 1.7864312911041882,
|
|
"grad_norm": 0.1161302700638771,
|
|
"learning_rate": 8.449442886617308e-06,
|
|
"loss": 0.0068,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 1.7882773739471558,
|
|
"grad_norm": 0.13166776299476624,
|
|
"learning_rate": 8.428232551846101e-06,
|
|
"loss": 0.0317,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 1.7901234567901234,
|
|
"grad_norm": 0.11878406256437302,
|
|
"learning_rate": 8.407029464758335e-06,
|
|
"loss": 0.0171,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 1.791969539633091,
|
|
"grad_norm": 0.06158612668514252,
|
|
"learning_rate": 8.385833723125006e-06,
|
|
"loss": 0.0022,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 1.7938156224760586,
|
|
"grad_norm": 0.10814138501882553,
|
|
"learning_rate": 8.364645424683237e-06,
|
|
"loss": 0.0276,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 1.7956617053190262,
|
|
"grad_norm": 0.10428439825773239,
|
|
"learning_rate": 8.343464667135821e-06,
|
|
"loss": 0.0101,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 1.7975077881619939,
|
|
"grad_norm": 0.061348170042037964,
|
|
"learning_rate": 8.322291548150786e-06,
|
|
"loss": 0.0024,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 1.7993538710049615,
|
|
"grad_norm": 0.04177645966410637,
|
|
"learning_rate": 8.301126165360944e-06,
|
|
"loss": 0.0021,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 1.801199953847929,
|
|
"grad_norm": 0.08466877043247223,
|
|
"learning_rate": 8.279968616363417e-06,
|
|
"loss": 0.0077,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 1.8030460366908965,
|
|
"grad_norm": 0.18225808441638947,
|
|
"learning_rate": 8.258818998719218e-06,
|
|
"loss": 0.0136,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 1.804892119533864,
|
|
"grad_norm": 0.06195460259914398,
|
|
"learning_rate": 8.237677409952784e-06,
|
|
"loss": 0.0022,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 1.8067382023768317,
|
|
"grad_norm": 0.1816849410533905,
|
|
"learning_rate": 8.216543947551525e-06,
|
|
"loss": 0.0341,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 1.808584285219799,
|
|
"grad_norm": 0.09929855912923813,
|
|
"learning_rate": 8.195418708965386e-06,
|
|
"loss": 0.006,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.8104303680627667,
|
|
"grad_norm": 0.043719321489334106,
|
|
"learning_rate": 8.174301791606384e-06,
|
|
"loss": 0.0023,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 1.8122764509057343,
|
|
"grad_norm": 0.07386981695890427,
|
|
"learning_rate": 8.15319329284817e-06,
|
|
"loss": 0.0059,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 1.814122533748702,
|
|
"grad_norm": 0.05119425803422928,
|
|
"learning_rate": 8.132093310025572e-06,
|
|
"loss": 0.0019,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 1.8159686165916695,
|
|
"grad_norm": 0.06272052228450775,
|
|
"learning_rate": 8.111001940434156e-06,
|
|
"loss": 0.0036,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 1.8178146994346371,
|
|
"grad_norm": 0.06586525589227676,
|
|
"learning_rate": 8.089919281329756e-06,
|
|
"loss": 0.0021,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.8196607822776047,
|
|
"grad_norm": 0.05978992208838463,
|
|
"learning_rate": 8.06884542992806e-06,
|
|
"loss": 0.0021,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 1.8215068651205724,
|
|
"grad_norm": 0.09013515710830688,
|
|
"learning_rate": 8.047780483404135e-06,
|
|
"loss": 0.0052,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 1.82335294796354,
|
|
"grad_norm": 0.1136859878897667,
|
|
"learning_rate": 8.026724538891976e-06,
|
|
"loss": 0.0063,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 1.8251990308065076,
|
|
"grad_norm": 0.09807315468788147,
|
|
"learning_rate": 8.005677693484077e-06,
|
|
"loss": 0.0063,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 1.827045113649475,
|
|
"grad_norm": 0.07622791081666946,
|
|
"learning_rate": 7.984640044230984e-06,
|
|
"loss": 0.0064,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.8288911964924426,
|
|
"grad_norm": 0.057751450687646866,
|
|
"learning_rate": 7.963611688140814e-06,
|
|
"loss": 0.0062,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 1.8307372793354102,
|
|
"grad_norm": 0.14268364012241364,
|
|
"learning_rate": 7.942592722178853e-06,
|
|
"loss": 0.0094,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 1.8325833621783778,
|
|
"grad_norm": 0.04325347766280174,
|
|
"learning_rate": 7.921583243267079e-06,
|
|
"loss": 0.0017,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 1.8344294450213452,
|
|
"grad_norm": 0.06997363269329071,
|
|
"learning_rate": 7.900583348283726e-06,
|
|
"loss": 0.0031,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 1.8362755278643128,
|
|
"grad_norm": 0.18991568684577942,
|
|
"learning_rate": 7.879593134062828e-06,
|
|
"loss": 0.0057,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.8381216107072804,
|
|
"grad_norm": 0.07828588038682938,
|
|
"learning_rate": 7.858612697393792e-06,
|
|
"loss": 0.0031,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 1.839967693550248,
|
|
"grad_norm": 0.0916314348578453,
|
|
"learning_rate": 7.837642135020929e-06,
|
|
"loss": 0.0079,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 1.8418137763932156,
|
|
"grad_norm": 0.09158594161272049,
|
|
"learning_rate": 7.816681543643019e-06,
|
|
"loss": 0.0054,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 1.8436598592361833,
|
|
"grad_norm": 0.10289250314235687,
|
|
"learning_rate": 7.795731019912867e-06,
|
|
"loss": 0.0082,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 1.8455059420791509,
|
|
"grad_norm": 0.07108542323112488,
|
|
"learning_rate": 7.774790660436857e-06,
|
|
"loss": 0.0069,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.8473520249221185,
|
|
"grad_norm": 0.08768683671951294,
|
|
"learning_rate": 7.753860561774495e-06,
|
|
"loss": 0.0158,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 1.849198107765086,
|
|
"grad_norm": 0.06569988280534744,
|
|
"learning_rate": 7.73294082043798e-06,
|
|
"loss": 0.0021,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 1.8510441906080537,
|
|
"grad_norm": 0.0680694505572319,
|
|
"learning_rate": 7.712031532891754e-06,
|
|
"loss": 0.0031,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 1.852890273451021,
|
|
"grad_norm": 0.07440797984600067,
|
|
"learning_rate": 7.691132795552044e-06,
|
|
"loss": 0.0085,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 1.8547363562939887,
|
|
"grad_norm": 0.06806052476167679,
|
|
"learning_rate": 7.670244704786432e-06,
|
|
"loss": 0.0031,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 1.8565824391369563,
|
|
"grad_norm": 0.0569663941860199,
|
|
"learning_rate": 7.649367356913422e-06,
|
|
"loss": 0.0021,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 1.8584285219799237,
|
|
"grad_norm": 0.23129606246948242,
|
|
"learning_rate": 7.628500848201956e-06,
|
|
"loss": 0.0428,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 1.8602746048228913,
|
|
"grad_norm": 0.06253650039434433,
|
|
"learning_rate": 7.607645274871013e-06,
|
|
"loss": 0.0027,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 1.862120687665859,
|
|
"grad_norm": 0.10426635295152664,
|
|
"learning_rate": 7.58680073308914e-06,
|
|
"loss": 0.0046,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 1.8639667705088265,
|
|
"grad_norm": 0.07335297763347626,
|
|
"learning_rate": 7.565967318974015e-06,
|
|
"loss": 0.0028,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 1.8658128533517941,
|
|
"grad_norm": 0.18146410584449768,
|
|
"learning_rate": 7.545145128592009e-06,
|
|
"loss": 0.0128,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 1.8676589361947618,
|
|
"grad_norm": 0.17065131664276123,
|
|
"learning_rate": 7.524334257957737e-06,
|
|
"loss": 0.0091,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 1.8676589361947618,
|
|
"eval_loss": 0.007683510426431894,
|
|
"eval_runtime": 91.4577,
|
|
"eval_samples_per_second": 9.983,
|
|
"eval_steps_per_second": 4.997,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 1.8695050190377294,
|
|
"grad_norm": 0.07308690249919891,
|
|
"learning_rate": 7.50353480303361e-06,
|
|
"loss": 0.0035,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 1.871351101880697,
|
|
"grad_norm": 0.07680094987154007,
|
|
"learning_rate": 7.482746859729408e-06,
|
|
"loss": 0.0048,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 1.8731971847236646,
|
|
"grad_norm": 0.05260724946856499,
|
|
"learning_rate": 7.461970523901827e-06,
|
|
"loss": 0.0026,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 1.8750432675666322,
|
|
"grad_norm": 0.15840424597263336,
|
|
"learning_rate": 7.441205891354037e-06,
|
|
"loss": 0.0063,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 1.8768893504095996,
|
|
"grad_norm": 0.060153163969516754,
|
|
"learning_rate": 7.42045305783524e-06,
|
|
"loss": 0.0019,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 1.8787354332525672,
|
|
"grad_norm": 0.13796140253543854,
|
|
"learning_rate": 7.3997121190402375e-06,
|
|
"loss": 0.0153,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 1.8805815160955348,
|
|
"grad_norm": 0.15525425970554352,
|
|
"learning_rate": 7.378983170608982e-06,
|
|
"loss": 0.0092,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 1.8824275989385022,
|
|
"grad_norm": 0.09056207537651062,
|
|
"learning_rate": 7.3582663081261195e-06,
|
|
"loss": 0.0037,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.8842736817814698,
|
|
"grad_norm": 0.18960314989089966,
|
|
"learning_rate": 7.337561627120591e-06,
|
|
"loss": 0.0053,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 1.8861197646244374,
|
|
"grad_norm": 0.08146478235721588,
|
|
"learning_rate": 7.316869223065156e-06,
|
|
"loss": 0.0094,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 1.887965847467405,
|
|
"grad_norm": 0.12083683162927628,
|
|
"learning_rate": 7.296189191375953e-06,
|
|
"loss": 0.0023,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 1.8898119303103726,
|
|
"grad_norm": 0.06863249838352203,
|
|
"learning_rate": 7.275521627412082e-06,
|
|
"loss": 0.0043,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 1.8916580131533403,
|
|
"grad_norm": 0.05190117284655571,
|
|
"learning_rate": 7.254866626475152e-06,
|
|
"loss": 0.0015,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 1.8935040959963079,
|
|
"grad_norm": 0.10676594078540802,
|
|
"learning_rate": 7.234224283808832e-06,
|
|
"loss": 0.0146,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 1.8953501788392755,
|
|
"grad_norm": 0.06794177740812302,
|
|
"learning_rate": 7.213594694598432e-06,
|
|
"loss": 0.0076,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 1.897196261682243,
|
|
"grad_norm": 0.04282936453819275,
|
|
"learning_rate": 7.192977953970448e-06,
|
|
"loss": 0.0018,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 1.8990423445252107,
|
|
"grad_norm": 0.0630037784576416,
|
|
"learning_rate": 7.172374156992131e-06,
|
|
"loss": 0.0023,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 1.9008884273681783,
|
|
"grad_norm": 0.069447822868824,
|
|
"learning_rate": 7.151783398671046e-06,
|
|
"loss": 0.0057,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 1.9027345102111457,
|
|
"grad_norm": 0.11327061057090759,
|
|
"learning_rate": 7.131205773954636e-06,
|
|
"loss": 0.0118,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 1.9045805930541133,
|
|
"grad_norm": 0.12302399426698685,
|
|
"learning_rate": 7.110641377729778e-06,
|
|
"loss": 0.0057,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 1.906426675897081,
|
|
"grad_norm": 0.09334293752908707,
|
|
"learning_rate": 7.090090304822356e-06,
|
|
"loss": 0.0038,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 1.9082727587400483,
|
|
"grad_norm": 0.05566679313778877,
|
|
"learning_rate": 7.069552649996819e-06,
|
|
"loss": 0.0026,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 1.910118841583016,
|
|
"grad_norm": 0.0628797709941864,
|
|
"learning_rate": 7.049028507955731e-06,
|
|
"loss": 0.0021,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 1.9119649244259835,
|
|
"grad_norm": 0.09773414582014084,
|
|
"learning_rate": 7.028517973339361e-06,
|
|
"loss": 0.0033,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 1.9138110072689511,
|
|
"grad_norm": 0.06920082867145538,
|
|
"learning_rate": 7.008021140725224e-06,
|
|
"loss": 0.0119,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 1.9156570901119188,
|
|
"grad_norm": 0.07204987853765488,
|
|
"learning_rate": 6.9875381046276605e-06,
|
|
"loss": 0.0026,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 1.9175031729548864,
|
|
"grad_norm": 0.298231303691864,
|
|
"learning_rate": 6.967068959497376e-06,
|
|
"loss": 0.0151,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 1.919349255797854,
|
|
"grad_norm": 0.07963436096906662,
|
|
"learning_rate": 6.946613799721038e-06,
|
|
"loss": 0.0056,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.9211953386408216,
|
|
"grad_norm": 0.10884180665016174,
|
|
"learning_rate": 6.926172719620827e-06,
|
|
"loss": 0.0104,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 1.9230414214837892,
|
|
"grad_norm": 0.06557092815637589,
|
|
"learning_rate": 6.905745813453983e-06,
|
|
"loss": 0.003,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 1.9248875043267568,
|
|
"grad_norm": 0.08651768416166306,
|
|
"learning_rate": 6.885333175412406e-06,
|
|
"loss": 0.0096,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 1.9267335871697242,
|
|
"grad_norm": 0.05259367451071739,
|
|
"learning_rate": 6.864934899622191e-06,
|
|
"loss": 0.0034,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 1.9285796700126918,
|
|
"grad_norm": 0.13659609854221344,
|
|
"learning_rate": 6.844551080143209e-06,
|
|
"loss": 0.0301,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 1.9304257528556594,
|
|
"grad_norm": 0.10816258192062378,
|
|
"learning_rate": 6.824181810968675e-06,
|
|
"loss": 0.0061,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 1.9322718356986268,
|
|
"grad_norm": 0.1165551021695137,
|
|
"learning_rate": 6.80382718602471e-06,
|
|
"loss": 0.0042,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 1.9341179185415944,
|
|
"grad_norm": 0.047005485743284225,
|
|
"learning_rate": 6.783487299169897e-06,
|
|
"loss": 0.0018,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 1.935964001384562,
|
|
"grad_norm": 0.048299964517354965,
|
|
"learning_rate": 6.763162244194874e-06,
|
|
"loss": 0.002,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 1.9378100842275297,
|
|
"grad_norm": 0.1873732954263687,
|
|
"learning_rate": 6.74285211482188e-06,
|
|
"loss": 0.0249,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 1.9396561670704973,
|
|
"grad_norm": 0.08936057239770889,
|
|
"learning_rate": 6.722557004704322e-06,
|
|
"loss": 0.0042,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 1.9415022499134649,
|
|
"grad_norm": 0.09909237921237946,
|
|
"learning_rate": 6.702277007426365e-06,
|
|
"loss": 0.0102,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 1.9433483327564325,
|
|
"grad_norm": 0.06664317846298218,
|
|
"learning_rate": 6.6820122165024845e-06,
|
|
"loss": 0.0066,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 1.9451944155994,
|
|
"grad_norm": 0.0672774612903595,
|
|
"learning_rate": 6.661762725377019e-06,
|
|
"loss": 0.0082,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 1.9470404984423677,
|
|
"grad_norm": 0.05301275476813316,
|
|
"learning_rate": 6.6415286274237744e-06,
|
|
"loss": 0.002,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 1.9488865812853353,
|
|
"grad_norm": 0.06802228093147278,
|
|
"learning_rate": 6.62131001594558e-06,
|
|
"loss": 0.0094,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 1.9507326641283027,
|
|
"grad_norm": 0.15557558834552765,
|
|
"learning_rate": 6.601106984173835e-06,
|
|
"loss": 0.0035,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 1.9525787469712703,
|
|
"grad_norm": 0.06863709539175034,
|
|
"learning_rate": 6.580919625268114e-06,
|
|
"loss": 0.0032,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 1.954424829814238,
|
|
"grad_norm": 0.05429449677467346,
|
|
"learning_rate": 6.560748032315713e-06,
|
|
"loss": 0.0037,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 1.9562709126572055,
|
|
"grad_norm": 0.09654875099658966,
|
|
"learning_rate": 6.540592298331239e-06,
|
|
"loss": 0.0087,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.958116995500173,
|
|
"grad_norm": 0.08214404433965683,
|
|
"learning_rate": 6.520452516256157e-06,
|
|
"loss": 0.0134,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 1.9599630783431405,
|
|
"grad_norm": 0.09481552988290787,
|
|
"learning_rate": 6.5003287789583825e-06,
|
|
"loss": 0.0033,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 1.9618091611861082,
|
|
"grad_norm": 0.05937489867210388,
|
|
"learning_rate": 6.480221179231849e-06,
|
|
"loss": 0.005,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 1.9636552440290758,
|
|
"grad_norm": 0.06381800770759583,
|
|
"learning_rate": 6.460129809796067e-06,
|
|
"loss": 0.0025,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 1.9655013268720434,
|
|
"grad_norm": 0.10020963102579117,
|
|
"learning_rate": 6.440054763295714e-06,
|
|
"loss": 0.0042,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 1.967347409715011,
|
|
"grad_norm": 0.10043366998434067,
|
|
"learning_rate": 6.419996132300203e-06,
|
|
"loss": 0.0062,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 1.9691934925579786,
|
|
"grad_norm": 0.07305952906608582,
|
|
"learning_rate": 6.3999540093032396e-06,
|
|
"loss": 0.0058,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 1.9710395754009462,
|
|
"grad_norm": 0.052628278732299805,
|
|
"learning_rate": 6.379928486722421e-06,
|
|
"loss": 0.0027,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 1.9728856582439138,
|
|
"grad_norm": 0.051156193017959595,
|
|
"learning_rate": 6.359919656898794e-06,
|
|
"loss": 0.002,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 1.9747317410868814,
|
|
"grad_norm": 0.13866794109344482,
|
|
"learning_rate": 6.3399276120964235e-06,
|
|
"loss": 0.0098,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 1.9765778239298488,
|
|
"grad_norm": 0.12971021234989166,
|
|
"learning_rate": 6.319952444501984e-06,
|
|
"loss": 0.0115,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 1.9784239067728164,
|
|
"grad_norm": 0.05518823862075806,
|
|
"learning_rate": 6.2999942462243345e-06,
|
|
"loss": 0.0034,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 1.980269989615784,
|
|
"grad_norm": 0.07717905938625336,
|
|
"learning_rate": 6.280053109294064e-06,
|
|
"loss": 0.0083,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 1.9821160724587514,
|
|
"grad_norm": 0.053649693727493286,
|
|
"learning_rate": 6.260129125663106e-06,
|
|
"loss": 0.0016,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 1.983962155301719,
|
|
"grad_norm": 0.12298066914081573,
|
|
"learning_rate": 6.240222387204293e-06,
|
|
"loss": 0.0122,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 1.9858082381446867,
|
|
"grad_norm": 0.10608681291341782,
|
|
"learning_rate": 6.220332985710936e-06,
|
|
"loss": 0.0037,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 1.9876543209876543,
|
|
"grad_norm": 0.07757773250341415,
|
|
"learning_rate": 6.200461012896401e-06,
|
|
"loss": 0.0027,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 1.9895004038306219,
|
|
"grad_norm": 0.14791713654994965,
|
|
"learning_rate": 6.180606560393694e-06,
|
|
"loss": 0.008,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 1.9913464866735895,
|
|
"grad_norm": 0.046395193785429,
|
|
"learning_rate": 6.16076971975502e-06,
|
|
"loss": 0.0019,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 1.993192569516557,
|
|
"grad_norm": 0.11292968690395355,
|
|
"learning_rate": 6.140950582451384e-06,
|
|
"loss": 0.0063,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.9950386523595247,
|
|
"grad_norm": 0.04149056598544121,
|
|
"learning_rate": 6.121149239872151e-06,
|
|
"loss": 0.0008,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 1.9968847352024923,
|
|
"grad_norm": 0.10369537770748138,
|
|
"learning_rate": 6.1013657833246396e-06,
|
|
"loss": 0.0128,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 1.99873081804546,
|
|
"grad_norm": 0.09148327261209488,
|
|
"learning_rate": 6.081600304033682e-06,
|
|
"loss": 0.0045,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.1143283098936081,
|
|
"learning_rate": 6.061852893141222e-06,
|
|
"loss": 0.0087,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 2.0018460828429676,
|
|
"grad_norm": 0.059561945497989655,
|
|
"learning_rate": 6.04212364170589e-06,
|
|
"loss": 0.0019,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 2.0036921656859352,
|
|
"grad_norm": 0.07360690087080002,
|
|
"learning_rate": 6.0224126407025616e-06,
|
|
"loss": 0.0008,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 2.005538248528903,
|
|
"grad_norm": 0.0354192815721035,
|
|
"learning_rate": 6.002719981021982e-06,
|
|
"loss": 0.0014,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 2.0073843313718704,
|
|
"grad_norm": 0.0898759588599205,
|
|
"learning_rate": 5.983045753470308e-06,
|
|
"loss": 0.0093,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 2.009230414214838,
|
|
"grad_norm": 0.049619074910879135,
|
|
"learning_rate": 5.963390048768698e-06,
|
|
"loss": 0.0017,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 2.0110764970578057,
|
|
"grad_norm": 0.031979288905858994,
|
|
"learning_rate": 5.9437529575529085e-06,
|
|
"loss": 0.0011,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 2.012922579900773,
|
|
"grad_norm": 0.031964387744665146,
|
|
"learning_rate": 5.924134570372863e-06,
|
|
"loss": 0.0021,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 2.0147686627437404,
|
|
"grad_norm": 0.028327271342277527,
|
|
"learning_rate": 5.9045349776922335e-06,
|
|
"loss": 0.0008,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 2.016614745586708,
|
|
"grad_norm": 0.05546770617365837,
|
|
"learning_rate": 5.884954269888032e-06,
|
|
"loss": 0.0011,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 2.0184608284296757,
|
|
"grad_norm": 0.11671288311481476,
|
|
"learning_rate": 5.865392537250191e-06,
|
|
"loss": 0.0037,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 2.0203069112726433,
|
|
"grad_norm": 0.09133674204349518,
|
|
"learning_rate": 5.845849869981137e-06,
|
|
"loss": 0.0126,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 2.022152994115611,
|
|
"grad_norm": 0.06473570317029953,
|
|
"learning_rate": 5.826326358195391e-06,
|
|
"loss": 0.0032,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 2.0239990769585785,
|
|
"grad_norm": 0.05737852305173874,
|
|
"learning_rate": 5.806822091919143e-06,
|
|
"loss": 0.0035,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 2.025845159801546,
|
|
"grad_norm": 0.05501372739672661,
|
|
"learning_rate": 5.787337161089836e-06,
|
|
"loss": 0.0037,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 2.0276912426445137,
|
|
"grad_norm": 0.03991509601473808,
|
|
"learning_rate": 5.7678716555557515e-06,
|
|
"loss": 0.0011,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 2.0295373254874813,
|
|
"grad_norm": 0.042931295931339264,
|
|
"learning_rate": 5.74842566507561e-06,
|
|
"loss": 0.0011,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.031383408330449,
|
|
"grad_norm": 0.09508123248815536,
|
|
"learning_rate": 5.728999279318131e-06,
|
|
"loss": 0.0144,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 2.0332294911734166,
|
|
"grad_norm": 0.04815695434808731,
|
|
"learning_rate": 5.709592587861637e-06,
|
|
"loss": 0.0026,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 2.035075574016384,
|
|
"grad_norm": 0.05574629828333855,
|
|
"learning_rate": 5.690205680193647e-06,
|
|
"loss": 0.0011,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 2.036921656859352,
|
|
"grad_norm": 0.10294865071773529,
|
|
"learning_rate": 5.670838645710439e-06,
|
|
"loss": 0.0057,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 2.036921656859352,
|
|
"eval_loss": 0.008138884790241718,
|
|
"eval_runtime": 91.5583,
|
|
"eval_samples_per_second": 9.972,
|
|
"eval_steps_per_second": 4.991,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 2.038767739702319,
|
|
"grad_norm": 0.03519001603126526,
|
|
"learning_rate": 5.651491573716657e-06,
|
|
"loss": 0.0028,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 2.0406138225452866,
|
|
"grad_norm": 0.046490173786878586,
|
|
"learning_rate": 5.632164553424904e-06,
|
|
"loss": 0.0032,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 2.042459905388254,
|
|
"grad_norm": 0.044620878994464874,
|
|
"learning_rate": 5.612857673955308e-06,
|
|
"loss": 0.0012,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 2.044305988231222,
|
|
"grad_norm": 0.0896461084485054,
|
|
"learning_rate": 5.593571024335126e-06,
|
|
"loss": 0.0033,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 2.0461520710741894,
|
|
"grad_norm": 0.06016547977924347,
|
|
"learning_rate": 5.574304693498346e-06,
|
|
"loss": 0.0055,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 2.047998153917157,
|
|
"grad_norm": 0.13048522174358368,
|
|
"learning_rate": 5.5550587702852465e-06,
|
|
"loss": 0.0086,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 2.0498442367601246,
|
|
"grad_norm": 0.05786297842860222,
|
|
"learning_rate": 5.5358333434420054e-06,
|
|
"loss": 0.0048,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 2.0516903196030922,
|
|
"grad_norm": 0.03565092384815216,
|
|
"learning_rate": 5.516628501620299e-06,
|
|
"loss": 0.0011,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 2.05353640244606,
|
|
"grad_norm": 0.05448061600327492,
|
|
"learning_rate": 5.497444333376874e-06,
|
|
"loss": 0.0038,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 2.0553824852890274,
|
|
"grad_norm": 0.04243125393986702,
|
|
"learning_rate": 5.478280927173145e-06,
|
|
"loss": 0.0013,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 2.057228568131995,
|
|
"grad_norm": 0.054316967725753784,
|
|
"learning_rate": 5.459138371374795e-06,
|
|
"loss": 0.002,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 2.0590746509749627,
|
|
"grad_norm": 0.03207079693675041,
|
|
"learning_rate": 5.440016754251364e-06,
|
|
"loss": 0.0012,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 2.0609207338179303,
|
|
"grad_norm": 0.20276927947998047,
|
|
"learning_rate": 5.420916163975836e-06,
|
|
"loss": 0.0013,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 2.0627668166608975,
|
|
"grad_norm": 0.04171600192785263,
|
|
"learning_rate": 5.401836688624231e-06,
|
|
"loss": 0.0014,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 2.064612899503865,
|
|
"grad_norm": 0.060076795518398285,
|
|
"learning_rate": 5.382778416175223e-06,
|
|
"loss": 0.003,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 2.0664589823468327,
|
|
"grad_norm": 0.0577683262526989,
|
|
"learning_rate": 5.363741434509697e-06,
|
|
"loss": 0.0011,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.0683050651898003,
|
|
"grad_norm": 0.1408967226743698,
|
|
"learning_rate": 5.344725831410369e-06,
|
|
"loss": 0.0154,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 2.070151148032768,
|
|
"grad_norm": 0.05401252210140228,
|
|
"learning_rate": 5.32573169456138e-06,
|
|
"loss": 0.0009,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 2.0719972308757355,
|
|
"grad_norm": 0.05760827288031578,
|
|
"learning_rate": 5.306759111547881e-06,
|
|
"loss": 0.0013,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 2.073843313718703,
|
|
"grad_norm": 0.04372551664710045,
|
|
"learning_rate": 5.28780816985563e-06,
|
|
"loss": 0.0035,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 2.0756893965616707,
|
|
"grad_norm": 0.1505114585161209,
|
|
"learning_rate": 5.26887895687061e-06,
|
|
"loss": 0.0045,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 2.0775354794046383,
|
|
"grad_norm": 0.033471547067165375,
|
|
"learning_rate": 5.24997155987859e-06,
|
|
"loss": 0.0007,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 2.079381562247606,
|
|
"grad_norm": 0.05673528090119362,
|
|
"learning_rate": 5.231086066064751e-06,
|
|
"loss": 0.0015,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 2.0812276450905736,
|
|
"grad_norm": 0.18660074472427368,
|
|
"learning_rate": 5.212222562513278e-06,
|
|
"loss": 0.0058,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 2.083073727933541,
|
|
"grad_norm": 0.10123515874147415,
|
|
"learning_rate": 5.193381136206948e-06,
|
|
"loss": 0.0122,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 2.084919810776509,
|
|
"grad_norm": 0.0788915827870369,
|
|
"learning_rate": 5.174561874026741e-06,
|
|
"loss": 0.0036,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 2.086765893619476,
|
|
"grad_norm": 0.10579413175582886,
|
|
"learning_rate": 5.155764862751427e-06,
|
|
"loss": 0.0023,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 2.0886119764624436,
|
|
"grad_norm": 0.039463870227336884,
|
|
"learning_rate": 5.136990189057187e-06,
|
|
"loss": 0.0025,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 2.090458059305411,
|
|
"grad_norm": 0.09689252078533173,
|
|
"learning_rate": 5.11823793951719e-06,
|
|
"loss": 0.0055,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 2.092304142148379,
|
|
"grad_norm": 0.09919244050979614,
|
|
"learning_rate": 5.099508200601198e-06,
|
|
"loss": 0.0045,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 2.0941502249913464,
|
|
"grad_norm": 0.06431836634874344,
|
|
"learning_rate": 5.080801058675191e-06,
|
|
"loss": 0.0087,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 2.095996307834314,
|
|
"grad_norm": 0.0821138471364975,
|
|
"learning_rate": 5.062116600000933e-06,
|
|
"loss": 0.0017,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 2.0978423906772816,
|
|
"grad_norm": 0.050396766513586044,
|
|
"learning_rate": 5.043454910735595e-06,
|
|
"loss": 0.0009,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 2.0996884735202492,
|
|
"grad_norm": 0.0441858284175396,
|
|
"learning_rate": 5.024816076931366e-06,
|
|
"loss": 0.0015,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 2.101534556363217,
|
|
"grad_norm": 0.057779595255851746,
|
|
"learning_rate": 5.006200184535033e-06,
|
|
"loss": 0.0025,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 2.1033806392061845,
|
|
"grad_norm": 0.06962257623672485,
|
|
"learning_rate": 4.987607319387593e-06,
|
|
"loss": 0.0019,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.105226722049152,
|
|
"grad_norm": 0.0638086274266243,
|
|
"learning_rate": 4.969037567223881e-06,
|
|
"loss": 0.0033,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 2.1070728048921197,
|
|
"grad_norm": 0.054150376468896866,
|
|
"learning_rate": 4.950491013672124e-06,
|
|
"loss": 0.0014,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 2.1089188877350873,
|
|
"grad_norm": 0.06254181265830994,
|
|
"learning_rate": 4.931967744253601e-06,
|
|
"loss": 0.0024,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 2.110764970578055,
|
|
"grad_norm": 0.04108688607811928,
|
|
"learning_rate": 4.913467844382217e-06,
|
|
"loss": 0.0017,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 2.112611053421022,
|
|
"grad_norm": 0.05044626444578171,
|
|
"learning_rate": 4.894991399364113e-06,
|
|
"loss": 0.0013,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 2.1144571362639897,
|
|
"grad_norm": 0.0502905435860157,
|
|
"learning_rate": 4.876538494397274e-06,
|
|
"loss": 0.0009,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 2.1163032191069573,
|
|
"grad_norm": 0.043060798197984695,
|
|
"learning_rate": 4.8581092145711466e-06,
|
|
"loss": 0.0009,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 2.118149301949925,
|
|
"grad_norm": 0.06760820746421814,
|
|
"learning_rate": 4.839703644866228e-06,
|
|
"loss": 0.0023,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 2.1199953847928925,
|
|
"grad_norm": 0.06913480162620544,
|
|
"learning_rate": 4.821321870153692e-06,
|
|
"loss": 0.0062,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 2.12184146763586,
|
|
"grad_norm": 0.10108120739459991,
|
|
"learning_rate": 4.802963975194981e-06,
|
|
"loss": 0.012,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 2.1236875504788277,
|
|
"grad_norm": 0.028255267068743706,
|
|
"learning_rate": 4.784630044641435e-06,
|
|
"loss": 0.0006,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 2.1255336333217953,
|
|
"grad_norm": 0.1202707514166832,
|
|
"learning_rate": 4.766320163033882e-06,
|
|
"loss": 0.0026,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 2.127379716164763,
|
|
"grad_norm": 0.05136001482605934,
|
|
"learning_rate": 4.7480344148022535e-06,
|
|
"loss": 0.0021,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 2.1292257990077306,
|
|
"grad_norm": 0.04602188244462013,
|
|
"learning_rate": 4.729772884265212e-06,
|
|
"loss": 0.0011,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 2.131071881850698,
|
|
"grad_norm": 0.06541673839092255,
|
|
"learning_rate": 4.711535655629735e-06,
|
|
"loss": 0.0023,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 2.132917964693666,
|
|
"grad_norm": 0.03694295138120651,
|
|
"learning_rate": 4.6933228129907395e-06,
|
|
"loss": 0.0009,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 2.1347640475366334,
|
|
"grad_norm": 0.06101083382964134,
|
|
"learning_rate": 4.675134440330706e-06,
|
|
"loss": 0.0006,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 2.1366101303796006,
|
|
"grad_norm": 0.028277406468987465,
|
|
"learning_rate": 4.65697062151927e-06,
|
|
"loss": 0.0006,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 2.138456213222568,
|
|
"grad_norm": 0.08642231673002243,
|
|
"learning_rate": 4.638831440312844e-06,
|
|
"loss": 0.0026,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 2.140302296065536,
|
|
"grad_norm": 0.07297013700008392,
|
|
"learning_rate": 4.620716980354248e-06,
|
|
"loss": 0.0017,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.1421483789085034,
|
|
"grad_norm": 0.06735076755285263,
|
|
"learning_rate": 4.602627325172279e-06,
|
|
"loss": 0.0021,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 2.143994461751471,
|
|
"grad_norm": 0.14374354481697083,
|
|
"learning_rate": 4.584562558181384e-06,
|
|
"loss": 0.0115,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 2.1458405445944386,
|
|
"grad_norm": 0.08532073348760605,
|
|
"learning_rate": 4.566522762681239e-06,
|
|
"loss": 0.0102,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 2.1476866274374062,
|
|
"grad_norm": 0.10443772375583649,
|
|
"learning_rate": 4.548508021856354e-06,
|
|
"loss": 0.0072,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 2.149532710280374,
|
|
"grad_norm": 0.043913282454013824,
|
|
"learning_rate": 4.530518418775734e-06,
|
|
"loss": 0.0027,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 2.1513787931233415,
|
|
"grad_norm": 0.08159793168306351,
|
|
"learning_rate": 4.512554036392448e-06,
|
|
"loss": 0.0014,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 2.153224875966309,
|
|
"grad_norm": 0.14023910462856293,
|
|
"learning_rate": 4.494614957543286e-06,
|
|
"loss": 0.0083,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 2.1550709588092767,
|
|
"grad_norm": 0.07015207409858704,
|
|
"learning_rate": 4.4767012649483484e-06,
|
|
"loss": 0.0056,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 2.1569170416522443,
|
|
"grad_norm": 0.027905292809009552,
|
|
"learning_rate": 4.458813041210672e-06,
|
|
"loss": 0.0008,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 2.158763124495212,
|
|
"grad_norm": 0.08288539946079254,
|
|
"learning_rate": 4.440950368815866e-06,
|
|
"loss": 0.0015,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 2.1606092073381795,
|
|
"grad_norm": 0.042242277413606644,
|
|
"learning_rate": 4.423113330131708e-06,
|
|
"loss": 0.001,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 2.1624552901811467,
|
|
"grad_norm": 0.04000399261713028,
|
|
"learning_rate": 4.40530200740777e-06,
|
|
"loss": 0.0007,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 2.1643013730241143,
|
|
"grad_norm": 0.07605951279401779,
|
|
"learning_rate": 4.387516482775058e-06,
|
|
"loss": 0.0015,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 2.166147455867082,
|
|
"grad_norm": 0.15174829959869385,
|
|
"learning_rate": 4.369756838245608e-06,
|
|
"loss": 0.0057,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 2.1679935387100495,
|
|
"grad_norm": 0.0797356367111206,
|
|
"learning_rate": 4.352023155712116e-06,
|
|
"loss": 0.0085,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 2.169839621553017,
|
|
"grad_norm": 0.04051666334271431,
|
|
"learning_rate": 4.33431551694758e-06,
|
|
"loss": 0.0007,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 2.1716857043959847,
|
|
"grad_norm": 0.06101857125759125,
|
|
"learning_rate": 4.316634003604878e-06,
|
|
"loss": 0.0046,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 2.1735317872389524,
|
|
"grad_norm": 0.16660486161708832,
|
|
"learning_rate": 4.298978697216442e-06,
|
|
"loss": 0.0049,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 2.17537787008192,
|
|
"grad_norm": 0.12328508496284485,
|
|
"learning_rate": 4.281349679193862e-06,
|
|
"loss": 0.0075,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 2.1772239529248876,
|
|
"grad_norm": 0.0972929298877716,
|
|
"learning_rate": 4.263747030827481e-06,
|
|
"loss": 0.0027,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.179070035767855,
|
|
"grad_norm": 0.0562286414206028,
|
|
"learning_rate": 4.246170833286075e-06,
|
|
"loss": 0.0045,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 2.180916118610823,
|
|
"grad_norm": 0.0480145663022995,
|
|
"learning_rate": 4.228621167616438e-06,
|
|
"loss": 0.0014,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 2.1827622014537904,
|
|
"grad_norm": 0.0439535528421402,
|
|
"learning_rate": 4.21109811474302e-06,
|
|
"loss": 0.0021,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 2.184608284296758,
|
|
"grad_norm": 0.03982429951429367,
|
|
"learning_rate": 4.1936017554675635e-06,
|
|
"loss": 0.001,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 2.186454367139725,
|
|
"grad_norm": 0.04404020681977272,
|
|
"learning_rate": 4.176132170468714e-06,
|
|
"loss": 0.0011,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 2.188300449982693,
|
|
"grad_norm": 0.05249952897429466,
|
|
"learning_rate": 4.1586894403016576e-06,
|
|
"loss": 0.0024,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 2.1901465328256604,
|
|
"grad_norm": 0.06995794177055359,
|
|
"learning_rate": 4.1412736453977545e-06,
|
|
"loss": 0.0021,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 2.191992615668628,
|
|
"grad_norm": 0.09189380705356598,
|
|
"learning_rate": 4.1238848660641504e-06,
|
|
"loss": 0.005,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 2.1938386985115956,
|
|
"grad_norm": 0.09258892387151718,
|
|
"learning_rate": 4.106523182483434e-06,
|
|
"loss": 0.0037,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 2.1956847813545632,
|
|
"grad_norm": 0.07874837517738342,
|
|
"learning_rate": 4.0891886747132356e-06,
|
|
"loss": 0.0041,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 2.197530864197531,
|
|
"grad_norm": 0.05509480461478233,
|
|
"learning_rate": 4.071881422685877e-06,
|
|
"loss": 0.0013,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 2.1993769470404985,
|
|
"grad_norm": 0.09273144602775574,
|
|
"learning_rate": 4.054601506208009e-06,
|
|
"loss": 0.0012,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 2.201223029883466,
|
|
"grad_norm": 0.0319955050945282,
|
|
"learning_rate": 4.03734900496022e-06,
|
|
"loss": 0.0005,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 2.2030691127264337,
|
|
"grad_norm": 0.06351076811552048,
|
|
"learning_rate": 4.020123998496688e-06,
|
|
"loss": 0.0041,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 2.2049151955694013,
|
|
"grad_norm": 0.11725535243749619,
|
|
"learning_rate": 4.002926566244816e-06,
|
|
"loss": 0.004,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 2.206761278412369,
|
|
"grad_norm": 0.034534893929958344,
|
|
"learning_rate": 3.985756787504837e-06,
|
|
"loss": 0.0006,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 2.206761278412369,
|
|
"eval_loss": 0.008584747090935707,
|
|
"eval_runtime": 91.2554,
|
|
"eval_samples_per_second": 10.005,
|
|
"eval_steps_per_second": 5.008,
|
|
"step": 1196
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 1626,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 92,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.0829430617205637e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|