4599 lines
112 KiB
JSON
4599 lines
112 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 2.376600899965386,
|
|
"eval_steps": 92,
|
|
"global_step": 644,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.003692165685935156,
|
|
"grad_norm": 1.7612229585647583,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0643,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.007384331371870312,
|
|
"grad_norm": 1.5242195129394531,
|
|
"learning_rate": 4.878048780487805e-07,
|
|
"loss": 0.0413,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.01107649705780547,
|
|
"grad_norm": 1.9117767810821533,
|
|
"learning_rate": 9.75609756097561e-07,
|
|
"loss": 0.0658,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.014768662743740625,
|
|
"grad_norm": 1.904267430305481,
|
|
"learning_rate": 1.4634146341463414e-06,
|
|
"loss": 0.063,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.018460828429675783,
|
|
"grad_norm": 1.6038223505020142,
|
|
"learning_rate": 1.951219512195122e-06,
|
|
"loss": 0.0546,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.02215299411561094,
|
|
"grad_norm": 0.9653654098510742,
|
|
"learning_rate": 2.4390243902439027e-06,
|
|
"loss": 0.0417,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.025845159801546093,
|
|
"grad_norm": 0.8979856371879578,
|
|
"learning_rate": 2.926829268292683e-06,
|
|
"loss": 0.0398,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.02953732548748125,
|
|
"grad_norm": 0.7607001066207886,
|
|
"learning_rate": 3.414634146341464e-06,
|
|
"loss": 0.0419,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.033229491173416406,
|
|
"grad_norm": 0.7884329557418823,
|
|
"learning_rate": 3.902439024390244e-06,
|
|
"loss": 0.0267,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.036921656859351566,
|
|
"grad_norm": 0.3715834617614746,
|
|
"learning_rate": 4.390243902439025e-06,
|
|
"loss": 0.0277,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.04061382254528672,
|
|
"grad_norm": 0.31171751022338867,
|
|
"learning_rate": 4.8780487804878055e-06,
|
|
"loss": 0.0296,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.04430598823122188,
|
|
"grad_norm": 0.41572287678718567,
|
|
"learning_rate": 5.365853658536586e-06,
|
|
"loss": 0.0184,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.04799815391715703,
|
|
"grad_norm": 0.46749481558799744,
|
|
"learning_rate": 5.853658536585366e-06,
|
|
"loss": 0.0182,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.051690319603092186,
|
|
"grad_norm": 0.2749096751213074,
|
|
"learning_rate": 6.341463414634147e-06,
|
|
"loss": 0.0248,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.055382485289027346,
|
|
"grad_norm": 0.17462071776390076,
|
|
"learning_rate": 6.829268292682928e-06,
|
|
"loss": 0.0123,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.0590746509749625,
|
|
"grad_norm": 0.18316905200481415,
|
|
"learning_rate": 7.317073170731707e-06,
|
|
"loss": 0.0145,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.06276681666089766,
|
|
"grad_norm": 0.16834756731987,
|
|
"learning_rate": 7.804878048780489e-06,
|
|
"loss": 0.0194,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.06645898234683281,
|
|
"grad_norm": 0.150247722864151,
|
|
"learning_rate": 8.292682926829268e-06,
|
|
"loss": 0.0117,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.07015114803276797,
|
|
"grad_norm": 0.193730428814888,
|
|
"learning_rate": 8.78048780487805e-06,
|
|
"loss": 0.0208,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.07384331371870313,
|
|
"grad_norm": 0.16806142032146454,
|
|
"learning_rate": 9.268292682926831e-06,
|
|
"loss": 0.0099,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.07753547940463829,
|
|
"grad_norm": 0.2117680311203003,
|
|
"learning_rate": 9.756097560975611e-06,
|
|
"loss": 0.0368,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.08122764509057344,
|
|
"grad_norm": 0.12042353302240372,
|
|
"learning_rate": 1.024390243902439e-05,
|
|
"loss": 0.013,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.08491981077650859,
|
|
"grad_norm": 0.16888391971588135,
|
|
"learning_rate": 1.0731707317073172e-05,
|
|
"loss": 0.0137,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.08861197646244376,
|
|
"grad_norm": 0.32111823558807373,
|
|
"learning_rate": 1.1219512195121953e-05,
|
|
"loss": 0.0248,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.09230414214837891,
|
|
"grad_norm": 0.1346050500869751,
|
|
"learning_rate": 1.1707317073170731e-05,
|
|
"loss": 0.0256,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.09599630783431407,
|
|
"grad_norm": 0.1447174847126007,
|
|
"learning_rate": 1.2195121951219513e-05,
|
|
"loss": 0.0124,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.09968847352024922,
|
|
"grad_norm": 0.13269546627998352,
|
|
"learning_rate": 1.2682926829268294e-05,
|
|
"loss": 0.0096,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.10338063920618437,
|
|
"grad_norm": 0.08903706818819046,
|
|
"learning_rate": 1.3170731707317076e-05,
|
|
"loss": 0.0076,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.10707280489211954,
|
|
"grad_norm": 0.1776096373796463,
|
|
"learning_rate": 1.3658536585365855e-05,
|
|
"loss": 0.0181,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.11076497057805469,
|
|
"grad_norm": 0.10876930505037308,
|
|
"learning_rate": 1.4146341463414635e-05,
|
|
"loss": 0.0111,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.11445713626398984,
|
|
"grad_norm": 0.1793457418680191,
|
|
"learning_rate": 1.4634146341463415e-05,
|
|
"loss": 0.023,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.118149301949925,
|
|
"grad_norm": 0.17572703957557678,
|
|
"learning_rate": 1.5121951219512196e-05,
|
|
"loss": 0.0162,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.12184146763586017,
|
|
"grad_norm": 0.14067068696022034,
|
|
"learning_rate": 1.5609756097560978e-05,
|
|
"loss": 0.0071,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.12553363332179532,
|
|
"grad_norm": 0.1744857132434845,
|
|
"learning_rate": 1.6097560975609757e-05,
|
|
"loss": 0.0153,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.12922579900773049,
|
|
"grad_norm": 0.17557364702224731,
|
|
"learning_rate": 1.6585365853658537e-05,
|
|
"loss": 0.0227,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.13291796469366562,
|
|
"grad_norm": 0.10515311360359192,
|
|
"learning_rate": 1.7073170731707317e-05,
|
|
"loss": 0.0152,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.1366101303796008,
|
|
"grad_norm": 0.12522749602794647,
|
|
"learning_rate": 1.75609756097561e-05,
|
|
"loss": 0.0103,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.14030229606553593,
|
|
"grad_norm": 0.10669893026351929,
|
|
"learning_rate": 1.804878048780488e-05,
|
|
"loss": 0.0161,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.1439944617514711,
|
|
"grad_norm": 0.09148227423429489,
|
|
"learning_rate": 1.8536585365853663e-05,
|
|
"loss": 0.0072,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.14768662743740626,
|
|
"grad_norm": 0.2032286524772644,
|
|
"learning_rate": 1.902439024390244e-05,
|
|
"loss": 0.0092,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.1513787931233414,
|
|
"grad_norm": 0.10257123410701752,
|
|
"learning_rate": 1.9512195121951222e-05,
|
|
"loss": 0.007,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.15507095880927657,
|
|
"grad_norm": 0.07334409654140472,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.0064,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.1587631244952117,
|
|
"grad_norm": 0.09883085638284683,
|
|
"learning_rate": 2.048780487804878e-05,
|
|
"loss": 0.0089,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.16245529018114688,
|
|
"grad_norm": 0.10087648034095764,
|
|
"learning_rate": 2.0975609756097564e-05,
|
|
"loss": 0.0065,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.16614745586708204,
|
|
"grad_norm": 0.10451877117156982,
|
|
"learning_rate": 2.1463414634146344e-05,
|
|
"loss": 0.0102,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.16983962155301718,
|
|
"grad_norm": 0.13105975091457367,
|
|
"learning_rate": 2.1951219512195124e-05,
|
|
"loss": 0.009,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.17353178723895235,
|
|
"grad_norm": 0.2514360547065735,
|
|
"learning_rate": 2.2439024390243907e-05,
|
|
"loss": 0.0168,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.17722395292488752,
|
|
"grad_norm": 0.11838189512491226,
|
|
"learning_rate": 2.2926829268292683e-05,
|
|
"loss": 0.0119,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.18091611861082266,
|
|
"grad_norm": 0.16984423995018005,
|
|
"learning_rate": 2.3414634146341463e-05,
|
|
"loss": 0.0065,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.18460828429675782,
|
|
"grad_norm": 0.11164893954992294,
|
|
"learning_rate": 2.3902439024390246e-05,
|
|
"loss": 0.0135,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.18830044998269296,
|
|
"grad_norm": 0.09878280013799667,
|
|
"learning_rate": 2.4390243902439026e-05,
|
|
"loss": 0.0066,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.19199261566862813,
|
|
"grad_norm": 0.09212549030780792,
|
|
"learning_rate": 2.4878048780487805e-05,
|
|
"loss": 0.0082,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.1956847813545633,
|
|
"grad_norm": 0.09363257884979248,
|
|
"learning_rate": 2.536585365853659e-05,
|
|
"loss": 0.0099,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.19937694704049844,
|
|
"grad_norm": 0.07449876517057419,
|
|
"learning_rate": 2.5853658536585368e-05,
|
|
"loss": 0.006,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.2030691127264336,
|
|
"grad_norm": 0.07617678493261337,
|
|
"learning_rate": 2.634146341463415e-05,
|
|
"loss": 0.0076,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.20676127841236874,
|
|
"grad_norm": 0.09494733065366745,
|
|
"learning_rate": 2.682926829268293e-05,
|
|
"loss": 0.0082,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.2104534440983039,
|
|
"grad_norm": 0.10162504017353058,
|
|
"learning_rate": 2.731707317073171e-05,
|
|
"loss": 0.011,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.21414560978423908,
|
|
"grad_norm": 0.16772620379924774,
|
|
"learning_rate": 2.7804878048780487e-05,
|
|
"loss": 0.0086,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.21783777547017422,
|
|
"grad_norm": 0.10658068209886551,
|
|
"learning_rate": 2.829268292682927e-05,
|
|
"loss": 0.0067,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.22152994115610938,
|
|
"grad_norm": 0.11568617820739746,
|
|
"learning_rate": 2.878048780487805e-05,
|
|
"loss": 0.0081,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.22522210684204455,
|
|
"grad_norm": 0.11837513744831085,
|
|
"learning_rate": 2.926829268292683e-05,
|
|
"loss": 0.0086,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.2289142725279797,
|
|
"grad_norm": 0.08128459751605988,
|
|
"learning_rate": 2.9756097560975613e-05,
|
|
"loss": 0.0047,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.23260643821391486,
|
|
"grad_norm": 0.36463257670402527,
|
|
"learning_rate": 3.0243902439024392e-05,
|
|
"loss": 0.0099,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.23629860389985,
|
|
"grad_norm": 0.08298784494400024,
|
|
"learning_rate": 3.073170731707317e-05,
|
|
"loss": 0.006,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.23999076958578516,
|
|
"grad_norm": 0.2765791416168213,
|
|
"learning_rate": 3.1219512195121955e-05,
|
|
"loss": 0.015,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.24368293527172033,
|
|
"grad_norm": 0.09410832822322845,
|
|
"learning_rate": 3.170731707317074e-05,
|
|
"loss": 0.007,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.24737510095765547,
|
|
"grad_norm": 0.08687976747751236,
|
|
"learning_rate": 3.2195121951219514e-05,
|
|
"loss": 0.0054,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.25106726664359064,
|
|
"grad_norm": 0.1658174693584442,
|
|
"learning_rate": 3.268292682926829e-05,
|
|
"loss": 0.0059,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.2547594323295258,
|
|
"grad_norm": 0.16597941517829895,
|
|
"learning_rate": 3.3170731707317074e-05,
|
|
"loss": 0.0043,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.25845159801546097,
|
|
"grad_norm": 0.14238758385181427,
|
|
"learning_rate": 3.365853658536586e-05,
|
|
"loss": 0.0202,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.2621437637013961,
|
|
"grad_norm": 0.28217750787734985,
|
|
"learning_rate": 3.414634146341463e-05,
|
|
"loss": 0.0245,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.26583592938733125,
|
|
"grad_norm": 0.08783560991287231,
|
|
"learning_rate": 3.4634146341463416e-05,
|
|
"loss": 0.0062,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.2695280950732664,
|
|
"grad_norm": 0.13205395638942719,
|
|
"learning_rate": 3.51219512195122e-05,
|
|
"loss": 0.0091,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.2732202607592016,
|
|
"grad_norm": 0.11077655106782913,
|
|
"learning_rate": 3.5609756097560976e-05,
|
|
"loss": 0.0068,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.27691242644513675,
|
|
"grad_norm": 0.07510002702474594,
|
|
"learning_rate": 3.609756097560976e-05,
|
|
"loss": 0.0056,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.28060459213107186,
|
|
"grad_norm": 0.1183568611741066,
|
|
"learning_rate": 3.658536585365854e-05,
|
|
"loss": 0.0107,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.28429675781700703,
|
|
"grad_norm": 0.19435347616672516,
|
|
"learning_rate": 3.7073170731707325e-05,
|
|
"loss": 0.0194,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.2879889235029422,
|
|
"grad_norm": 0.1364523470401764,
|
|
"learning_rate": 3.75609756097561e-05,
|
|
"loss": 0.0149,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.29168108918887736,
|
|
"grad_norm": 0.20089037716388702,
|
|
"learning_rate": 3.804878048780488e-05,
|
|
"loss": 0.0211,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.29537325487481253,
|
|
"grad_norm": 0.13373729586601257,
|
|
"learning_rate": 3.853658536585366e-05,
|
|
"loss": 0.0111,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.29906542056074764,
|
|
"grad_norm": 0.08260685950517654,
|
|
"learning_rate": 3.9024390243902444e-05,
|
|
"loss": 0.0094,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.3027575862466828,
|
|
"grad_norm": 0.15578778088092804,
|
|
"learning_rate": 3.951219512195122e-05,
|
|
"loss": 0.01,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.306449751932618,
|
|
"grad_norm": 0.11668805778026581,
|
|
"learning_rate": 4e-05,
|
|
"loss": 0.0135,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.31014191761855314,
|
|
"grad_norm": 0.08324134349822998,
|
|
"learning_rate": 3.999981530109401e-05,
|
|
"loss": 0.0086,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.3138340833044883,
|
|
"grad_norm": 0.2406614124774933,
|
|
"learning_rate": 3.999926120778742e-05,
|
|
"loss": 0.0125,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.3175262489904234,
|
|
"grad_norm": 0.1322041004896164,
|
|
"learning_rate": 3.9998337730314274e-05,
|
|
"loss": 0.0094,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.3212184146763586,
|
|
"grad_norm": 0.18763262033462524,
|
|
"learning_rate": 3.999704488573108e-05,
|
|
"loss": 0.0123,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.32491058036229375,
|
|
"grad_norm": 0.16090139746665955,
|
|
"learning_rate": 3.9995382697916555e-05,
|
|
"loss": 0.0084,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.3286027460482289,
|
|
"grad_norm": 0.11704199016094208,
|
|
"learning_rate": 3.999335119757112e-05,
|
|
"loss": 0.0088,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.3322949117341641,
|
|
"grad_norm": 0.13167352974414825,
|
|
"learning_rate": 3.9990950422216367e-05,
|
|
"loss": 0.0148,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.3359870774200992,
|
|
"grad_norm": 0.09690048545598984,
|
|
"learning_rate": 3.998818041619435e-05,
|
|
"loss": 0.0077,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.33967924310603437,
|
|
"grad_norm": 0.11628638207912445,
|
|
"learning_rate": 3.998504123066679e-05,
|
|
"loss": 0.007,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.33967924310603437,
|
|
"eval_loss": 0.010080486536026001,
|
|
"eval_runtime": 89.886,
|
|
"eval_samples_per_second": 10.157,
|
|
"eval_steps_per_second": 5.084,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.34337140879196953,
|
|
"grad_norm": 0.11528339982032776,
|
|
"learning_rate": 3.9981532923614074e-05,
|
|
"loss": 0.0082,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.3470635744779047,
|
|
"grad_norm": 0.2657609283924103,
|
|
"learning_rate": 3.9977655559834275e-05,
|
|
"loss": 0.013,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.35075574016383987,
|
|
"grad_norm": 0.11277999728918076,
|
|
"learning_rate": 3.9973409210941864e-05,
|
|
"loss": 0.0076,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.35444790584977504,
|
|
"grad_norm": 5.088494777679443,
|
|
"learning_rate": 3.9968793955366445e-05,
|
|
"loss": 0.0289,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.35814007153571015,
|
|
"grad_norm": 0.1601097732782364,
|
|
"learning_rate": 3.996380987835128e-05,
|
|
"loss": 0.0219,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.3618322372216453,
|
|
"grad_norm": 0.20405948162078857,
|
|
"learning_rate": 3.995845707195173e-05,
|
|
"loss": 0.0162,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.3655244029075805,
|
|
"grad_norm": 0.6011687517166138,
|
|
"learning_rate": 3.995273563503355e-05,
|
|
"loss": 0.0096,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.36921656859351565,
|
|
"grad_norm": 0.2641352117061615,
|
|
"learning_rate": 3.9946645673271034e-05,
|
|
"loss": 0.0231,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.3729087342794508,
|
|
"grad_norm": 0.18041729927062988,
|
|
"learning_rate": 3.9940187299145134e-05,
|
|
"loss": 0.0124,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.3766008999653859,
|
|
"grad_norm": 0.13946884870529175,
|
|
"learning_rate": 3.9933360631941294e-05,
|
|
"loss": 0.0136,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.3802930656513211,
|
|
"grad_norm": 0.14014215767383575,
|
|
"learning_rate": 3.992616579774732e-05,
|
|
"loss": 0.0123,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.38398523133725626,
|
|
"grad_norm": 0.215935617685318,
|
|
"learning_rate": 3.9918602929451015e-05,
|
|
"loss": 0.0166,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.38767739702319143,
|
|
"grad_norm": 0.18600742518901825,
|
|
"learning_rate": 3.991067216673772e-05,
|
|
"loss": 0.0249,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.3913695627091266,
|
|
"grad_norm": 0.19758182764053345,
|
|
"learning_rate": 3.990237365608776e-05,
|
|
"loss": 0.0215,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.3950617283950617,
|
|
"grad_norm": 0.09306969493627548,
|
|
"learning_rate": 3.9893707550773714e-05,
|
|
"loss": 0.0062,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.3987538940809969,
|
|
"grad_norm": 0.15354327857494354,
|
|
"learning_rate": 3.988467401085761e-05,
|
|
"loss": 0.0153,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.40244605976693204,
|
|
"grad_norm": 0.18650248646736145,
|
|
"learning_rate": 3.987527320318793e-05,
|
|
"loss": 0.0207,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.4061382254528672,
|
|
"grad_norm": 0.11927786469459534,
|
|
"learning_rate": 3.986550530139657e-05,
|
|
"loss": 0.0071,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.4098303911388024,
|
|
"grad_norm": 0.18260295689105988,
|
|
"learning_rate": 3.985537048589561e-05,
|
|
"loss": 0.0165,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.4135225568247375,
|
|
"grad_norm": 0.13270637392997742,
|
|
"learning_rate": 3.9844868943873975e-05,
|
|
"loss": 0.0095,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.41721472251067265,
|
|
"grad_norm": 0.10370643436908722,
|
|
"learning_rate": 3.9834000869294e-05,
|
|
"loss": 0.0075,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.4209068881966078,
|
|
"grad_norm": 0.11767494678497314,
|
|
"learning_rate": 3.982276646288784e-05,
|
|
"loss": 0.0129,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.424599053882543,
|
|
"grad_norm": 0.12992636859416962,
|
|
"learning_rate": 3.981116593215374e-05,
|
|
"loss": 0.0145,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.42829121956847815,
|
|
"grad_norm": 0.27428507804870605,
|
|
"learning_rate": 3.9799199491352246e-05,
|
|
"loss": 0.0166,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.43198338525441327,
|
|
"grad_norm": 0.1593196988105774,
|
|
"learning_rate": 3.978686736150221e-05,
|
|
"loss": 0.0102,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.43567555094034843,
|
|
"grad_norm": 0.20442363619804382,
|
|
"learning_rate": 3.977416977037671e-05,
|
|
"loss": 0.0143,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.4393677166262836,
|
|
"grad_norm": 0.11397796869277954,
|
|
"learning_rate": 3.9761106952498874e-05,
|
|
"loss": 0.0108,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.44305988231221877,
|
|
"grad_norm": 0.12436648458242416,
|
|
"learning_rate": 3.974767914913751e-05,
|
|
"loss": 0.008,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.44675204799815393,
|
|
"grad_norm": 0.10099371522665024,
|
|
"learning_rate": 3.973388660830269e-05,
|
|
"loss": 0.0071,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.4504442136840891,
|
|
"grad_norm": 0.18873853981494904,
|
|
"learning_rate": 3.971972958474113e-05,
|
|
"loss": 0.0126,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.4541363793700242,
|
|
"grad_norm": 0.14228899776935577,
|
|
"learning_rate": 3.97052083399315e-05,
|
|
"loss": 0.0108,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.4578285450559594,
|
|
"grad_norm": 0.16871212422847748,
|
|
"learning_rate": 3.969032314207961e-05,
|
|
"loss": 0.0127,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.46152071074189455,
|
|
"grad_norm": 0.10817577689886093,
|
|
"learning_rate": 3.967507426611344e-05,
|
|
"loss": 0.0166,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.4652128764278297,
|
|
"grad_norm": 0.09942852705717087,
|
|
"learning_rate": 3.965946199367804e-05,
|
|
"loss": 0.0087,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.4689050421137649,
|
|
"grad_norm": 0.08521483838558197,
|
|
"learning_rate": 3.96434866131304e-05,
|
|
"loss": 0.01,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.4725972077997,
|
|
"grad_norm": 0.09523651003837585,
|
|
"learning_rate": 3.9627148419534026e-05,
|
|
"loss": 0.0172,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.47628937348563516,
|
|
"grad_norm": 0.08050525188446045,
|
|
"learning_rate": 3.961044771465359e-05,
|
|
"loss": 0.0052,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.4799815391715703,
|
|
"grad_norm": 0.13714231550693512,
|
|
"learning_rate": 3.9593384806949263e-05,
|
|
"loss": 0.0156,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.4836737048575055,
|
|
"grad_norm": 0.08329559117555618,
|
|
"learning_rate": 3.9575960011571106e-05,
|
|
"loss": 0.014,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.48736587054344066,
|
|
"grad_norm": 0.109232597053051,
|
|
"learning_rate": 3.955817365035316e-05,
|
|
"loss": 0.0081,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.49105803622937577,
|
|
"grad_norm": 0.09475994855165482,
|
|
"learning_rate": 3.954002605180759e-05,
|
|
"loss": 0.0066,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.49475020191531094,
|
|
"grad_norm": 0.1157636046409607,
|
|
"learning_rate": 3.952151755111855e-05,
|
|
"loss": 0.0226,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.4984423676012461,
|
|
"grad_norm": 0.1317131668329239,
|
|
"learning_rate": 3.9502648490136016e-05,
|
|
"loss": 0.0097,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.5021345332871813,
|
|
"grad_norm": 0.12749852240085602,
|
|
"learning_rate": 3.948341921736948e-05,
|
|
"loss": 0.0104,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.5058266989731164,
|
|
"grad_norm": 0.08242950588464737,
|
|
"learning_rate": 3.946383008798152e-05,
|
|
"loss": 0.0054,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.5095188646590516,
|
|
"grad_norm": 0.125824436545372,
|
|
"learning_rate": 3.94438814637812e-05,
|
|
"loss": 0.0173,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.5132110303449867,
|
|
"grad_norm": 0.09344890713691711,
|
|
"learning_rate": 3.942357371321743e-05,
|
|
"loss": 0.0105,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.5169031960309219,
|
|
"grad_norm": 0.11648397147655487,
|
|
"learning_rate": 3.940290721137214e-05,
|
|
"loss": 0.0211,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.520595361716857,
|
|
"grad_norm": 0.10685181617736816,
|
|
"learning_rate": 3.938188233995336e-05,
|
|
"loss": 0.0105,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.5242875274027922,
|
|
"grad_norm": 0.1415766328573227,
|
|
"learning_rate": 3.936049948728816e-05,
|
|
"loss": 0.0132,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.5279796930887274,
|
|
"grad_norm": 0.20898067951202393,
|
|
"learning_rate": 3.933875904831551e-05,
|
|
"loss": 0.0163,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.5316718587746625,
|
|
"grad_norm": 0.09275670349597931,
|
|
"learning_rate": 3.931666142457891e-05,
|
|
"loss": 0.0099,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.5353640244605977,
|
|
"grad_norm": 0.08970851451158524,
|
|
"learning_rate": 3.929420702421907e-05,
|
|
"loss": 0.0064,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.5390561901465328,
|
|
"grad_norm": 0.10725483298301697,
|
|
"learning_rate": 3.9271396261966305e-05,
|
|
"loss": 0.0061,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.5427483558324679,
|
|
"grad_norm": 0.10847834497690201,
|
|
"learning_rate": 3.92482295591329e-05,
|
|
"loss": 0.0135,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.5464405215184032,
|
|
"grad_norm": 0.08085348457098007,
|
|
"learning_rate": 3.9224707343605315e-05,
|
|
"loss": 0.0092,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.5501326872043383,
|
|
"grad_norm": 0.10083277523517609,
|
|
"learning_rate": 3.92008300498363e-05,
|
|
"loss": 0.0107,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.5538248528902735,
|
|
"grad_norm": 0.25901204347610474,
|
|
"learning_rate": 3.917659811883687e-05,
|
|
"loss": 0.0061,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.5575170185762086,
|
|
"grad_norm": 0.06085089221596718,
|
|
"learning_rate": 3.915201199816812e-05,
|
|
"loss": 0.0065,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.5612091842621437,
|
|
"grad_norm": 0.19471333920955658,
|
|
"learning_rate": 3.9127072141933025e-05,
|
|
"loss": 0.0159,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.564901349948079,
|
|
"grad_norm": 1.377177119255066,
|
|
"learning_rate": 3.910177901076799e-05,
|
|
"loss": 0.0129,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.5685935156340141,
|
|
"grad_norm": 0.12816794216632843,
|
|
"learning_rate": 3.907613307183439e-05,
|
|
"loss": 0.0145,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.5722856813199493,
|
|
"grad_norm": 1.0884933471679688,
|
|
"learning_rate": 3.905013479880992e-05,
|
|
"loss": 0.0296,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.5759778470058844,
|
|
"grad_norm": 0.19217349588871002,
|
|
"learning_rate": 3.902378467187981e-05,
|
|
"loss": 0.0066,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.5796700126918195,
|
|
"grad_norm": 0.17874930799007416,
|
|
"learning_rate": 3.8997083177728044e-05,
|
|
"loss": 0.007,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.5833621783777547,
|
|
"grad_norm": 0.17409397661685944,
|
|
"learning_rate": 3.897003080952828e-05,
|
|
"loss": 0.0131,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.5870543440636898,
|
|
"grad_norm": 0.1562495231628418,
|
|
"learning_rate": 3.8942628066934826e-05,
|
|
"loss": 0.0086,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.5907465097496251,
|
|
"grad_norm": 0.15565301477909088,
|
|
"learning_rate": 3.891487545607332e-05,
|
|
"loss": 0.0104,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.5944386754355602,
|
|
"grad_norm": 0.14708881080150604,
|
|
"learning_rate": 3.888677348953145e-05,
|
|
"loss": 0.0097,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.5981308411214953,
|
|
"grad_norm": 0.14112205803394318,
|
|
"learning_rate": 3.885832268634946e-05,
|
|
"loss": 0.0111,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.6018230068074305,
|
|
"grad_norm": 0.3906686305999756,
|
|
"learning_rate": 3.8829523572010586e-05,
|
|
"loss": 0.0163,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.6055151724933656,
|
|
"grad_norm": 0.33456534147262573,
|
|
"learning_rate": 3.880037667843131e-05,
|
|
"loss": 0.0183,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.6092073381793008,
|
|
"grad_norm": 0.18434567749500275,
|
|
"learning_rate": 3.877088254395157e-05,
|
|
"loss": 0.0128,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.612899503865236,
|
|
"grad_norm": 0.1495783030986786,
|
|
"learning_rate": 3.874104171332481e-05,
|
|
"loss": 0.0255,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.6165916695511711,
|
|
"grad_norm": 0.17371267080307007,
|
|
"learning_rate": 3.871085473770789e-05,
|
|
"loss": 0.01,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.6202838352371063,
|
|
"grad_norm": 0.11804357171058655,
|
|
"learning_rate": 3.868032217465097e-05,
|
|
"loss": 0.0089,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.6239760009230414,
|
|
"grad_norm": 0.19819952547550201,
|
|
"learning_rate": 3.864944458808712e-05,
|
|
"loss": 0.0146,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.6276681666089766,
|
|
"grad_norm": 0.14190199971199036,
|
|
"learning_rate": 3.861822254832201e-05,
|
|
"loss": 0.0106,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.6313603322949117,
|
|
"grad_norm": 0.08863028138875961,
|
|
"learning_rate": 3.858665663202329e-05,
|
|
"loss": 0.0077,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.6350524979808468,
|
|
"grad_norm": 0.2075955718755722,
|
|
"learning_rate": 3.855474742220998e-05,
|
|
"loss": 0.0097,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.6387446636667821,
|
|
"grad_norm": 0.1362001597881317,
|
|
"learning_rate": 3.852249550824169e-05,
|
|
"loss": 0.0168,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.6424368293527172,
|
|
"grad_norm": 0.8073941469192505,
|
|
"learning_rate": 3.848990148580776e-05,
|
|
"loss": 0.0298,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.6461289950386524,
|
|
"grad_norm": 0.08775315433740616,
|
|
"learning_rate": 3.84569659569162e-05,
|
|
"loss": 0.0106,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.6498211607245875,
|
|
"grad_norm": 0.3348487615585327,
|
|
"learning_rate": 3.8423689529882635e-05,
|
|
"loss": 0.0205,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.6535133264105226,
|
|
"grad_norm": 0.1472369283437729,
|
|
"learning_rate": 3.839007281931902e-05,
|
|
"loss": 0.0249,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.6572054920964578,
|
|
"grad_norm": 0.12712818384170532,
|
|
"learning_rate": 3.835611644612234e-05,
|
|
"loss": 0.0273,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.660897657782393,
|
|
"grad_norm": 0.16857929527759552,
|
|
"learning_rate": 3.832182103746308e-05,
|
|
"loss": 0.0154,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.6645898234683282,
|
|
"grad_norm": 0.11381746828556061,
|
|
"learning_rate": 3.828718722677369e-05,
|
|
"loss": 0.0072,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.6682819891542633,
|
|
"grad_norm": 0.27607351541519165,
|
|
"learning_rate": 3.825221565373687e-05,
|
|
"loss": 0.0147,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.6719741548401984,
|
|
"grad_norm": 0.09366216510534286,
|
|
"learning_rate": 3.821690696427373e-05,
|
|
"loss": 0.0075,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.6756663205261336,
|
|
"grad_norm": 0.11064954102039337,
|
|
"learning_rate": 3.8181261810531926e-05,
|
|
"loss": 0.0068,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.6793584862120687,
|
|
"grad_norm": 0.10137422382831573,
|
|
"learning_rate": 3.8145280850873524e-05,
|
|
"loss": 0.0078,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.6793584862120687,
|
|
"eval_loss": 0.01165215577930212,
|
|
"eval_runtime": 90.6071,
|
|
"eval_samples_per_second": 10.076,
|
|
"eval_steps_per_second": 5.044,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.683050651898004,
|
|
"grad_norm": 0.1427939236164093,
|
|
"learning_rate": 3.810896474986294e-05,
|
|
"loss": 0.0099,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.6867428175839391,
|
|
"grad_norm": 0.11883988231420517,
|
|
"learning_rate": 3.8072314178254556e-05,
|
|
"loss": 0.0111,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.6904349832698742,
|
|
"grad_norm": 0.10005701333284378,
|
|
"learning_rate": 3.803532981298044e-05,
|
|
"loss": 0.0066,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.6941271489558094,
|
|
"grad_norm": 0.11374926567077637,
|
|
"learning_rate": 3.7998012337137765e-05,
|
|
"loss": 0.0115,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.6978193146417445,
|
|
"grad_norm": 0.1569487750530243,
|
|
"learning_rate": 3.7960362439976234e-05,
|
|
"loss": 0.0086,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.7015114803276797,
|
|
"grad_norm": 0.11175687611103058,
|
|
"learning_rate": 3.7922380816885323e-05,
|
|
"loss": 0.0112,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.7052036460136148,
|
|
"grad_norm": 0.1092715710401535,
|
|
"learning_rate": 3.7884068169381454e-05,
|
|
"loss": 0.0077,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.7088958116995501,
|
|
"grad_norm": 0.19558553397655487,
|
|
"learning_rate": 3.784542520509503e-05,
|
|
"loss": 0.0098,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.7125879773854852,
|
|
"grad_norm": 0.13434702157974243,
|
|
"learning_rate": 3.78064526377574e-05,
|
|
"loss": 0.0151,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.7162801430714203,
|
|
"grad_norm": 0.15444593131542206,
|
|
"learning_rate": 3.7767151187187586e-05,
|
|
"loss": 0.0106,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.7199723087573555,
|
|
"grad_norm": 0.06894934922456741,
|
|
"learning_rate": 3.7727521579279095e-05,
|
|
"loss": 0.0049,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.7236644744432906,
|
|
"grad_norm": 0.1396859586238861,
|
|
"learning_rate": 3.768756454598645e-05,
|
|
"loss": 0.0118,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.7273566401292259,
|
|
"grad_norm": 0.1533813774585724,
|
|
"learning_rate": 3.764728082531169e-05,
|
|
"loss": 0.0104,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.731048805815161,
|
|
"grad_norm": 0.1005004420876503,
|
|
"learning_rate": 3.760667116129072e-05,
|
|
"loss": 0.0041,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.7347409715010961,
|
|
"grad_norm": 0.1538483053445816,
|
|
"learning_rate": 3.756573630397958e-05,
|
|
"loss": 0.0083,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.7384331371870313,
|
|
"grad_norm": 0.09973620623350143,
|
|
"learning_rate": 3.752447700944064e-05,
|
|
"loss": 0.0067,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.7421253028729664,
|
|
"grad_norm": 0.17357227206230164,
|
|
"learning_rate": 3.7482894039728525e-05,
|
|
"loss": 0.0165,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.7458174685589016,
|
|
"grad_norm": 0.1083260253071785,
|
|
"learning_rate": 3.744098816287616e-05,
|
|
"loss": 0.0063,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.7495096342448367,
|
|
"grad_norm": 0.0863720178604126,
|
|
"learning_rate": 3.7398760152880484e-05,
|
|
"loss": 0.0072,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.7532017999307719,
|
|
"grad_norm": 0.1260920763015747,
|
|
"learning_rate": 3.735621078968823e-05,
|
|
"loss": 0.0123,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.7568939656167071,
|
|
"grad_norm": 0.0873272716999054,
|
|
"learning_rate": 3.731334085918149e-05,
|
|
"loss": 0.0223,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.7605861313026422,
|
|
"grad_norm": 0.0700017586350441,
|
|
"learning_rate": 3.7270151153163174e-05,
|
|
"loss": 0.0073,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.7642782969885774,
|
|
"grad_norm": 0.07996451109647751,
|
|
"learning_rate": 3.722664246934244e-05,
|
|
"loss": 0.0061,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.7679704626745125,
|
|
"grad_norm": 0.10587499290704727,
|
|
"learning_rate": 3.718281561131992e-05,
|
|
"loss": 0.0365,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.7716626283604476,
|
|
"grad_norm": 0.14550632238388062,
|
|
"learning_rate": 3.713867138857288e-05,
|
|
"loss": 0.0151,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.7753547940463829,
|
|
"grad_norm": 0.17798705399036407,
|
|
"learning_rate": 3.7094210616440284e-05,
|
|
"loss": 0.0116,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.779046959732318,
|
|
"grad_norm": 0.08906543254852295,
|
|
"learning_rate": 3.704943411610774e-05,
|
|
"loss": 0.0076,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.7827391254182532,
|
|
"grad_norm": 0.07332167774438858,
|
|
"learning_rate": 3.700434271459229e-05,
|
|
"loss": 0.0086,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.7864312911041883,
|
|
"grad_norm": 0.11497493088245392,
|
|
"learning_rate": 3.69589372447272e-05,
|
|
"loss": 0.015,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.7901234567901234,
|
|
"grad_norm": 0.08599186688661575,
|
|
"learning_rate": 3.6913218545146536e-05,
|
|
"loss": 0.0111,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.7938156224760586,
|
|
"grad_norm": 0.0786437839269638,
|
|
"learning_rate": 3.686718746026967e-05,
|
|
"loss": 0.0084,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.7975077881619937,
|
|
"grad_norm": 0.07181154191493988,
|
|
"learning_rate": 3.68208448402857e-05,
|
|
"loss": 0.008,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.801199953847929,
|
|
"grad_norm": 0.11212016642093658,
|
|
"learning_rate": 3.677419154113776e-05,
|
|
"loss": 0.014,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.8048921195338641,
|
|
"grad_norm": 0.0851934626698494,
|
|
"learning_rate": 3.672722842450717e-05,
|
|
"loss": 0.012,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.8085842852197992,
|
|
"grad_norm": 0.053547583520412445,
|
|
"learning_rate": 3.667995635779756e-05,
|
|
"loss": 0.0048,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.8122764509057344,
|
|
"grad_norm": 0.09233611822128296,
|
|
"learning_rate": 3.6632376214118836e-05,
|
|
"loss": 0.0082,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.8159686165916695,
|
|
"grad_norm": 0.08250945061445236,
|
|
"learning_rate": 3.6584488872271035e-05,
|
|
"loss": 0.005,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.8196607822776047,
|
|
"grad_norm": 0.06966373324394226,
|
|
"learning_rate": 3.6536295216728136e-05,
|
|
"loss": 0.0058,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.8233529479635399,
|
|
"grad_norm": 0.08930462598800659,
|
|
"learning_rate": 3.648779613762167e-05,
|
|
"loss": 0.0098,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.827045113649475,
|
|
"grad_norm": 0.11651594936847687,
|
|
"learning_rate": 3.643899253072433e-05,
|
|
"loss": 0.0149,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.8307372793354102,
|
|
"grad_norm": 0.08121360838413239,
|
|
"learning_rate": 3.63898852974334e-05,
|
|
"loss": 0.0077,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.8344294450213453,
|
|
"grad_norm": 0.0807429626584053,
|
|
"learning_rate": 3.634047534475409e-05,
|
|
"loss": 0.0071,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.8381216107072805,
|
|
"grad_norm": 0.10255023092031479,
|
|
"learning_rate": 3.629076358528284e-05,
|
|
"loss": 0.0122,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.8418137763932156,
|
|
"grad_norm": 0.12542816996574402,
|
|
"learning_rate": 3.62407509371904e-05,
|
|
"loss": 0.0093,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.8455059420791508,
|
|
"grad_norm": 0.07941003143787384,
|
|
"learning_rate": 3.6190438324204905e-05,
|
|
"loss": 0.0059,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.849198107765086,
|
|
"grad_norm": 0.0921979695558548,
|
|
"learning_rate": 3.613982667559483e-05,
|
|
"loss": 0.0083,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.8528902734510211,
|
|
"grad_norm": 0.10604582726955414,
|
|
"learning_rate": 3.608891692615176e-05,
|
|
"loss": 0.0128,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.8565824391369563,
|
|
"grad_norm": 0.09270962327718735,
|
|
"learning_rate": 3.603771001617322e-05,
|
|
"loss": 0.0141,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.8602746048228914,
|
|
"grad_norm": 0.062396857887506485,
|
|
"learning_rate": 3.598620689144523e-05,
|
|
"loss": 0.0094,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.8639667705088265,
|
|
"grad_norm": 0.07858982682228088,
|
|
"learning_rate": 3.5934408503224864e-05,
|
|
"loss": 0.0071,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.8676589361947618,
|
|
"grad_norm": 0.07246547937393188,
|
|
"learning_rate": 3.588231580822269e-05,
|
|
"loss": 0.0057,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.8713511018806969,
|
|
"grad_norm": 0.3656146824359894,
|
|
"learning_rate": 3.5829929768585086e-05,
|
|
"loss": 0.0103,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.8750432675666321,
|
|
"grad_norm": 0.05800218507647514,
|
|
"learning_rate": 3.577725135187647e-05,
|
|
"loss": 0.0061,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.8787354332525672,
|
|
"grad_norm": 0.08162401616573334,
|
|
"learning_rate": 3.5724281531061436e-05,
|
|
"loss": 0.0143,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.8824275989385023,
|
|
"grad_norm": 0.07134553790092468,
|
|
"learning_rate": 3.567102128448678e-05,
|
|
"loss": 0.0054,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.8861197646244375,
|
|
"grad_norm": 0.13681039214134216,
|
|
"learning_rate": 3.561747159586343e-05,
|
|
"loss": 0.0198,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.8898119303103726,
|
|
"grad_norm": 0.05192103236913681,
|
|
"learning_rate": 3.5563633454248275e-05,
|
|
"loss": 0.0043,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.8935040959963079,
|
|
"grad_norm": 0.08083774149417877,
|
|
"learning_rate": 3.550950785402591e-05,
|
|
"loss": 0.0062,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.897196261682243,
|
|
"grad_norm": 0.17133909463882446,
|
|
"learning_rate": 3.5455095794890234e-05,
|
|
"loss": 0.0086,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.9008884273681782,
|
|
"grad_norm": 0.05909837409853935,
|
|
"learning_rate": 3.540039828182604e-05,
|
|
"loss": 0.0052,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.9045805930541133,
|
|
"grad_norm": 0.051370203495025635,
|
|
"learning_rate": 3.53454163250904e-05,
|
|
"loss": 0.0058,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.9082727587400484,
|
|
"grad_norm": 0.14762379229068756,
|
|
"learning_rate": 3.529015094019405e-05,
|
|
"loss": 0.0182,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.9119649244259836,
|
|
"grad_norm": 0.052717871963977814,
|
|
"learning_rate": 3.523460314788259e-05,
|
|
"loss": 0.004,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.9156570901119188,
|
|
"grad_norm": 0.0905941054224968,
|
|
"learning_rate": 3.517877397411768e-05,
|
|
"loss": 0.0152,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.919349255797854,
|
|
"grad_norm": 0.08593659847974777,
|
|
"learning_rate": 3.5122664450058044e-05,
|
|
"loss": 0.0089,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.9230414214837891,
|
|
"grad_norm": 0.08689778298139572,
|
|
"learning_rate": 3.506627561204045e-05,
|
|
"loss": 0.017,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.9267335871697242,
|
|
"grad_norm": 0.07897453010082245,
|
|
"learning_rate": 3.5009608501560585e-05,
|
|
"loss": 0.01,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.9304257528556594,
|
|
"grad_norm": 0.12059691548347473,
|
|
"learning_rate": 3.495266416525376e-05,
|
|
"loss": 0.0279,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.9341179185415945,
|
|
"grad_norm": 0.06975448876619339,
|
|
"learning_rate": 3.489544365487564e-05,
|
|
"loss": 0.0054,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.9378100842275298,
|
|
"grad_norm": 0.11304624378681183,
|
|
"learning_rate": 3.48379480272828e-05,
|
|
"loss": 0.0117,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.9415022499134649,
|
|
"grad_norm": 0.07075347006320953,
|
|
"learning_rate": 3.478017834441319e-05,
|
|
"loss": 0.0136,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.9451944155994,
|
|
"grad_norm": 0.04639885574579239,
|
|
"learning_rate": 3.472213567326652e-05,
|
|
"loss": 0.0039,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.9488865812853352,
|
|
"grad_norm": 0.048733506351709366,
|
|
"learning_rate": 3.4663821085884597e-05,
|
|
"loss": 0.0044,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.9525787469712703,
|
|
"grad_norm": 0.06614992767572403,
|
|
"learning_rate": 3.460523565933145e-05,
|
|
"loss": 0.0044,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.9562709126572055,
|
|
"grad_norm": 0.16428013145923615,
|
|
"learning_rate": 3.4546380475673514e-05,
|
|
"loss": 0.0079,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.9599630783431407,
|
|
"grad_norm": 0.11396708339452744,
|
|
"learning_rate": 3.448725662195959e-05,
|
|
"loss": 0.024,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.9636552440290758,
|
|
"grad_norm": 0.10393685102462769,
|
|
"learning_rate": 3.442786519020077e-05,
|
|
"loss": 0.0238,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.967347409715011,
|
|
"grad_norm": 0.09232791513204575,
|
|
"learning_rate": 3.436820727735031e-05,
|
|
"loss": 0.0145,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.9710395754009461,
|
|
"grad_norm": 0.1006862074136734,
|
|
"learning_rate": 3.430828398528336e-05,
|
|
"loss": 0.0166,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.9747317410868813,
|
|
"grad_norm": 0.10068142414093018,
|
|
"learning_rate": 3.4248096420776536e-05,
|
|
"loss": 0.0088,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.9784239067728164,
|
|
"grad_norm": 0.05003920570015907,
|
|
"learning_rate": 3.418764569548758e-05,
|
|
"loss": 0.005,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.9821160724587515,
|
|
"grad_norm": 0.05870620906352997,
|
|
"learning_rate": 3.412693292593478e-05,
|
|
"loss": 0.006,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.9858082381446868,
|
|
"grad_norm": 0.09036832302808762,
|
|
"learning_rate": 3.4065959233476334e-05,
|
|
"loss": 0.0097,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.9895004038306219,
|
|
"grad_norm": 0.07893083244562149,
|
|
"learning_rate": 3.4004725744289685e-05,
|
|
"loss": 0.0068,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.9931925695165571,
|
|
"grad_norm": 0.08609715849161148,
|
|
"learning_rate": 3.394323358935068e-05,
|
|
"loss": 0.0101,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.9968847352024922,
|
|
"grad_norm": 0.07740162312984467,
|
|
"learning_rate": 3.3881483904412685e-05,
|
|
"loss": 0.0087,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.0650748685002327,
|
|
"learning_rate": 3.3819477829985624e-05,
|
|
"loss": 0.0091,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 1.0036921656859352,
|
|
"grad_norm": 0.060567598789930344,
|
|
"learning_rate": 3.3757216511314915e-05,
|
|
"loss": 0.0158,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 1.0073843313718702,
|
|
"grad_norm": 0.08802874386310577,
|
|
"learning_rate": 3.3694701098360295e-05,
|
|
"loss": 0.0063,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 1.0110764970578054,
|
|
"grad_norm": 0.05286823958158493,
|
|
"learning_rate": 3.363193274577461e-05,
|
|
"loss": 0.0022,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 1.0147686627437407,
|
|
"grad_norm": 0.059792377054691315,
|
|
"learning_rate": 3.356891261288247e-05,
|
|
"loss": 0.0094,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 1.018460828429676,
|
|
"grad_norm": 0.03914966061711311,
|
|
"learning_rate": 3.350564186365882e-05,
|
|
"loss": 0.0027,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 1.018460828429676,
|
|
"eval_loss": 0.008386676199734211,
|
|
"eval_runtime": 89.8024,
|
|
"eval_samples_per_second": 10.167,
|
|
"eval_steps_per_second": 5.089,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 1.022152994115611,
|
|
"grad_norm": 0.05939140170812607,
|
|
"learning_rate": 3.344212166670748e-05,
|
|
"loss": 0.0046,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 1.0258451598015461,
|
|
"grad_norm": 0.23599718511104584,
|
|
"learning_rate": 3.3378353195239546e-05,
|
|
"loss": 0.0088,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 1.0295373254874813,
|
|
"grad_norm": 0.04266897588968277,
|
|
"learning_rate": 3.331433762705171e-05,
|
|
"loss": 0.0025,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 1.0332294911734163,
|
|
"grad_norm": 0.07201959937810898,
|
|
"learning_rate": 3.32500761445045e-05,
|
|
"loss": 0.0092,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.0369216568593516,
|
|
"grad_norm": 0.38548168540000916,
|
|
"learning_rate": 3.318556993450048e-05,
|
|
"loss": 0.0054,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 1.0406138225452868,
|
|
"grad_norm": 0.09021361917257309,
|
|
"learning_rate": 3.312082018846229e-05,
|
|
"loss": 0.0038,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 1.0443059882312218,
|
|
"grad_norm": 0.1946287751197815,
|
|
"learning_rate": 3.3055828102310656e-05,
|
|
"loss": 0.0054,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 1.047998153917157,
|
|
"grad_norm": 0.08428189903497696,
|
|
"learning_rate": 3.299059487644229e-05,
|
|
"loss": 0.0153,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 1.0516903196030922,
|
|
"grad_norm": 0.06667447090148926,
|
|
"learning_rate": 3.292512171570775e-05,
|
|
"loss": 0.0066,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 1.0553824852890274,
|
|
"grad_norm": 0.04148027300834656,
|
|
"learning_rate": 3.2859409829389146e-05,
|
|
"loss": 0.0029,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 1.0590746509749625,
|
|
"grad_norm": 0.06811388581991196,
|
|
"learning_rate": 3.2793460431177827e-05,
|
|
"loss": 0.0058,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 1.0627668166608977,
|
|
"grad_norm": 0.04447786882519722,
|
|
"learning_rate": 3.272727473915197e-05,
|
|
"loss": 0.0034,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 1.066458982346833,
|
|
"grad_norm": 0.22201436758041382,
|
|
"learning_rate": 3.266085397575406e-05,
|
|
"loss": 0.0123,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 1.070151148032768,
|
|
"grad_norm": 0.0670245811343193,
|
|
"learning_rate": 3.259419936776833e-05,
|
|
"loss": 0.0088,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.0738433137187031,
|
|
"grad_norm": 0.06902515143156052,
|
|
"learning_rate": 3.25273121462981e-05,
|
|
"loss": 0.0025,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 1.0775354794046383,
|
|
"grad_norm": 0.1305130422115326,
|
|
"learning_rate": 3.246019354674303e-05,
|
|
"loss": 0.0093,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 1.0812276450905733,
|
|
"grad_norm": 0.11023421585559845,
|
|
"learning_rate": 3.239284480877632e-05,
|
|
"loss": 0.0085,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 1.0849198107765086,
|
|
"grad_norm": 0.07404778897762299,
|
|
"learning_rate": 3.232526717632178e-05,
|
|
"loss": 0.0045,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 1.0886119764624438,
|
|
"grad_norm": 0.054306432604789734,
|
|
"learning_rate": 3.22574618975309e-05,
|
|
"loss": 0.0032,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 1.092304142148379,
|
|
"grad_norm": 0.05058205872774124,
|
|
"learning_rate": 3.218943022475975e-05,
|
|
"loss": 0.0046,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 1.095996307834314,
|
|
"grad_norm": 0.04942139610648155,
|
|
"learning_rate": 3.2121173414545886e-05,
|
|
"loss": 0.004,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 1.0996884735202492,
|
|
"grad_norm": 0.04936928302049637,
|
|
"learning_rate": 3.205269272758513e-05,
|
|
"loss": 0.0038,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 1.1033806392061845,
|
|
"grad_norm": 0.06841211020946503,
|
|
"learning_rate": 3.198398942870828e-05,
|
|
"loss": 0.005,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 1.1070728048921195,
|
|
"grad_norm": 0.11358197033405304,
|
|
"learning_rate": 3.1915064786857745e-05,
|
|
"loss": 0.0189,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.1107649705780547,
|
|
"grad_norm": 0.047055114060640335,
|
|
"learning_rate": 3.1845920075064115e-05,
|
|
"loss": 0.0022,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 1.11445713626399,
|
|
"grad_norm": 0.06010272353887558,
|
|
"learning_rate": 3.177655657042266e-05,
|
|
"loss": 0.0042,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 1.118149301949925,
|
|
"grad_norm": 0.24772150814533234,
|
|
"learning_rate": 3.170697555406972e-05,
|
|
"loss": 0.0104,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 1.1218414676358601,
|
|
"grad_norm": 0.04649265855550766,
|
|
"learning_rate": 3.163717831115906e-05,
|
|
"loss": 0.0032,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 1.1255336333217953,
|
|
"grad_norm": 0.0447462759912014,
|
|
"learning_rate": 3.156716613083811e-05,
|
|
"loss": 0.0038,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 1.1292257990077306,
|
|
"grad_norm": 0.09475228935480118,
|
|
"learning_rate": 3.1496940306224185e-05,
|
|
"loss": 0.0107,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 1.1329179646936656,
|
|
"grad_norm": 0.08427289873361588,
|
|
"learning_rate": 3.14265021343806e-05,
|
|
"loss": 0.0174,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 1.1366101303796008,
|
|
"grad_norm": 0.05597711727023125,
|
|
"learning_rate": 3.1355852916292654e-05,
|
|
"loss": 0.0089,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 1.140302296065536,
|
|
"grad_norm": 0.0892227441072464,
|
|
"learning_rate": 3.1284993956843685e-05,
|
|
"loss": 0.0135,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 1.143994461751471,
|
|
"grad_norm": 0.05312884971499443,
|
|
"learning_rate": 3.121392656479094e-05,
|
|
"loss": 0.0043,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.1476866274374062,
|
|
"grad_norm": 0.047065041959285736,
|
|
"learning_rate": 3.114265205274135e-05,
|
|
"loss": 0.0031,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 1.1513787931233415,
|
|
"grad_norm": 0.05725245550274849,
|
|
"learning_rate": 3.1071171737127375e-05,
|
|
"loss": 0.0035,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 1.1550709588092767,
|
|
"grad_norm": 0.04488014802336693,
|
|
"learning_rate": 3.0999486938182605e-05,
|
|
"loss": 0.003,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 1.1587631244952117,
|
|
"grad_norm": 0.047577228397130966,
|
|
"learning_rate": 3.0927598979917454e-05,
|
|
"loss": 0.0031,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 1.162455290181147,
|
|
"grad_norm": 0.0874548926949501,
|
|
"learning_rate": 3.085550919009464e-05,
|
|
"loss": 0.0102,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 1.1661474558670821,
|
|
"grad_norm": 0.19847019016742706,
|
|
"learning_rate": 3.078321890020469e-05,
|
|
"loss": 0.008,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 1.1698396215530171,
|
|
"grad_norm": 0.052011147141456604,
|
|
"learning_rate": 3.071072944544135e-05,
|
|
"loss": 0.0033,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 1.1735317872389524,
|
|
"grad_norm": 0.12744103372097015,
|
|
"learning_rate": 3.0638042164676915e-05,
|
|
"loss": 0.0048,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 1.1772239529248876,
|
|
"grad_norm": 0.0666290894150734,
|
|
"learning_rate": 3.0565158400437525e-05,
|
|
"loss": 0.0083,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 1.1809161186108226,
|
|
"grad_norm": 0.06896385550498962,
|
|
"learning_rate": 3.0492079498878318e-05,
|
|
"loss": 0.0067,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.1846082842967578,
|
|
"grad_norm": 0.07686945050954819,
|
|
"learning_rate": 3.041880680975861e-05,
|
|
"loss": 0.0046,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 1.188300449982693,
|
|
"grad_norm": 0.08590448647737503,
|
|
"learning_rate": 3.0345341686416955e-05,
|
|
"loss": 0.0045,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 1.191992615668628,
|
|
"grad_norm": 0.08735356479883194,
|
|
"learning_rate": 3.0271685485746154e-05,
|
|
"loss": 0.0051,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 1.1956847813545632,
|
|
"grad_norm": 0.05842231586575508,
|
|
"learning_rate": 3.0197839568168167e-05,
|
|
"loss": 0.0035,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 1.1993769470404985,
|
|
"grad_norm": 0.05802956968545914,
|
|
"learning_rate": 3.0123805297609005e-05,
|
|
"loss": 0.0065,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 1.2030691127264337,
|
|
"grad_norm": 0.08226858079433441,
|
|
"learning_rate": 3.004958404147356e-05,
|
|
"loss": 0.0044,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 1.2067612784123687,
|
|
"grad_norm": 0.04885147139430046,
|
|
"learning_rate": 2.9975177170620307e-05,
|
|
"loss": 0.0049,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 1.210453444098304,
|
|
"grad_norm": 0.24907778203487396,
|
|
"learning_rate": 2.9900586059336008e-05,
|
|
"loss": 0.0248,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 1.2141456097842391,
|
|
"grad_norm": 0.06446171551942825,
|
|
"learning_rate": 2.9825812085310327e-05,
|
|
"loss": 0.0036,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 1.2178377754701741,
|
|
"grad_norm": 4.368242263793945,
|
|
"learning_rate": 2.975085662961039e-05,
|
|
"loss": 0.0103,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.2215299411561094,
|
|
"grad_norm": 0.06198897585272789,
|
|
"learning_rate": 2.967572107665526e-05,
|
|
"loss": 0.0035,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 1.2252221068420446,
|
|
"grad_norm": 0.13626688718795776,
|
|
"learning_rate": 2.960040681419039e-05,
|
|
"loss": 0.0053,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 1.2289142725279798,
|
|
"grad_norm": 0.19492781162261963,
|
|
"learning_rate": 2.9524915233261944e-05,
|
|
"loss": 0.0084,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 1.2326064382139148,
|
|
"grad_norm": 0.2215728908777237,
|
|
"learning_rate": 2.944924772819119e-05,
|
|
"loss": 0.0086,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 1.23629860389985,
|
|
"grad_norm": 0.2793082892894745,
|
|
"learning_rate": 2.9373405696548656e-05,
|
|
"loss": 0.005,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 1.2399907695857852,
|
|
"grad_norm": 0.04554106295108795,
|
|
"learning_rate": 2.9297390539128364e-05,
|
|
"loss": 0.004,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 1.2436829352717202,
|
|
"grad_norm": 0.05815259367227554,
|
|
"learning_rate": 2.922120365992196e-05,
|
|
"loss": 0.0052,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 1.2473751009576555,
|
|
"grad_norm": 0.07114019244909286,
|
|
"learning_rate": 2.9144846466092773e-05,
|
|
"loss": 0.0079,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 1.2510672666435907,
|
|
"grad_norm": 0.0655088722705841,
|
|
"learning_rate": 2.9068320367949817e-05,
|
|
"loss": 0.0057,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 1.254759432329526,
|
|
"grad_norm": 1.1501847505569458,
|
|
"learning_rate": 2.899162677892175e-05,
|
|
"loss": 0.0089,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.258451598015461,
|
|
"grad_norm": 0.06778527051210403,
|
|
"learning_rate": 2.891476711553077e-05,
|
|
"loss": 0.0065,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 1.2621437637013961,
|
|
"grad_norm": 0.12092837691307068,
|
|
"learning_rate": 2.8837742797366454e-05,
|
|
"loss": 0.0112,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 1.2658359293873311,
|
|
"grad_norm": 0.08216589689254761,
|
|
"learning_rate": 2.876055524705953e-05,
|
|
"loss": 0.0085,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 1.2695280950732664,
|
|
"grad_norm": 0.0777222067117691,
|
|
"learning_rate": 2.8683205890255613e-05,
|
|
"loss": 0.0037,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 1.2732202607592016,
|
|
"grad_norm": 0.08206533640623093,
|
|
"learning_rate": 2.8605696155588855e-05,
|
|
"loss": 0.0061,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 1.2769124264451368,
|
|
"grad_norm": 0.09279124438762665,
|
|
"learning_rate": 2.852802747465558e-05,
|
|
"loss": 0.0054,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 1.2806045921310718,
|
|
"grad_norm": 0.16200777888298035,
|
|
"learning_rate": 2.845020128198782e-05,
|
|
"loss": 0.0099,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 1.284296757817007,
|
|
"grad_norm": 0.2419300675392151,
|
|
"learning_rate": 2.837221901502685e-05,
|
|
"loss": 0.0171,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 1.2879889235029423,
|
|
"grad_norm": 0.11904854327440262,
|
|
"learning_rate": 2.8294082114096607e-05,
|
|
"loss": 0.0187,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 1.2916810891888773,
|
|
"grad_norm": 0.07375206053256989,
|
|
"learning_rate": 2.8215792022377092e-05,
|
|
"loss": 0.0154,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.2953732548748125,
|
|
"grad_norm": 0.060616642236709595,
|
|
"learning_rate": 2.8137350185877744e-05,
|
|
"loss": 0.0031,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 1.2990654205607477,
|
|
"grad_norm": 0.04948243498802185,
|
|
"learning_rate": 2.8058758053410704e-05,
|
|
"loss": 0.0025,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 1.302757586246683,
|
|
"grad_norm": 0.07605982571840286,
|
|
"learning_rate": 2.7980017076564053e-05,
|
|
"loss": 0.0045,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 1.306449751932618,
|
|
"grad_norm": 0.06741812080144882,
|
|
"learning_rate": 2.7901128709675025e-05,
|
|
"loss": 0.005,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 1.3101419176185531,
|
|
"grad_norm": 0.09975893050432205,
|
|
"learning_rate": 2.782209440980312e-05,
|
|
"loss": 0.0067,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 1.3138340833044884,
|
|
"grad_norm": 0.06588315218687057,
|
|
"learning_rate": 2.774291563670322e-05,
|
|
"loss": 0.0027,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 1.3175262489904234,
|
|
"grad_norm": 0.11582572758197784,
|
|
"learning_rate": 2.766359385279859e-05,
|
|
"loss": 0.0047,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 1.3212184146763586,
|
|
"grad_norm": 0.05676430091261864,
|
|
"learning_rate": 2.7584130523153906e-05,
|
|
"loss": 0.0022,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 1.3249105803622938,
|
|
"grad_norm": 0.07599082589149475,
|
|
"learning_rate": 2.7504527115448176e-05,
|
|
"loss": 0.0047,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 1.328602746048229,
|
|
"grad_norm": 0.053951650857925415,
|
|
"learning_rate": 2.742478509994763e-05,
|
|
"loss": 0.0031,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.332294911734164,
|
|
"grad_norm": 0.05379689112305641,
|
|
"learning_rate": 2.7344905949478557e-05,
|
|
"loss": 0.0034,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 1.3359870774200993,
|
|
"grad_norm": 0.08939212560653687,
|
|
"learning_rate": 2.7264891139400155e-05,
|
|
"loss": 0.0103,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 1.3396792431060343,
|
|
"grad_norm": 0.05766845494508743,
|
|
"learning_rate": 2.718474214757719e-05,
|
|
"loss": 0.0036,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 1.3433714087919695,
|
|
"grad_norm": 0.11903363466262817,
|
|
"learning_rate": 2.710446045435278e-05,
|
|
"loss": 0.0057,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 1.3470635744779047,
|
|
"grad_norm": 0.07542143017053604,
|
|
"learning_rate": 2.7024047542521014e-05,
|
|
"loss": 0.0085,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 1.35075574016384,
|
|
"grad_norm": 0.08536005765199661,
|
|
"learning_rate": 2.694350489729958e-05,
|
|
"loss": 0.0144,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 1.3544479058497751,
|
|
"grad_norm": 0.09188759326934814,
|
|
"learning_rate": 2.6862834006302324e-05,
|
|
"loss": 0.0083,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 1.3581400715357101,
|
|
"grad_norm": 0.1899387389421463,
|
|
"learning_rate": 2.678203635951177e-05,
|
|
"loss": 0.0084,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 1.3581400715357101,
|
|
"eval_loss": 0.008687354624271393,
|
|
"eval_runtime": 90.5037,
|
|
"eval_samples_per_second": 10.088,
|
|
"eval_steps_per_second": 5.05,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 1.3618322372216454,
|
|
"grad_norm": 0.046323299407958984,
|
|
"learning_rate": 2.6701113449251618e-05,
|
|
"loss": 0.0044,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 1.3655244029075804,
|
|
"grad_norm": 0.06219512224197388,
|
|
"learning_rate": 2.6620066770159178e-05,
|
|
"loss": 0.0032,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.3692165685935156,
|
|
"grad_norm": 0.1851065307855606,
|
|
"learning_rate": 2.6538897819157733e-05,
|
|
"loss": 0.005,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 1.3729087342794508,
|
|
"grad_norm": 0.12302684038877487,
|
|
"learning_rate": 2.6457608095428925e-05,
|
|
"loss": 0.0056,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 1.376600899965386,
|
|
"grad_norm": 0.06654980778694153,
|
|
"learning_rate": 2.6376199100385074e-05,
|
|
"loss": 0.0049,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 1.380293065651321,
|
|
"grad_norm": 0.08494460582733154,
|
|
"learning_rate": 2.62946723376414e-05,
|
|
"loss": 0.011,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 1.3839852313372563,
|
|
"grad_norm": 0.08226186037063599,
|
|
"learning_rate": 2.6213029312988294e-05,
|
|
"loss": 0.008,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 1.3876773970231915,
|
|
"grad_norm": 0.06261271983385086,
|
|
"learning_rate": 2.6131271534363497e-05,
|
|
"loss": 0.0063,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 1.3913695627091265,
|
|
"grad_norm": 0.040595002472400665,
|
|
"learning_rate": 2.604940051182422e-05,
|
|
"loss": 0.0029,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 1.3950617283950617,
|
|
"grad_norm": 0.054169923067092896,
|
|
"learning_rate": 2.596741775751931e-05,
|
|
"loss": 0.0023,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 1.398753894080997,
|
|
"grad_norm": 0.06638327986001968,
|
|
"learning_rate": 2.5885324785661263e-05,
|
|
"loss": 0.0059,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 1.4024460597669322,
|
|
"grad_norm": 0.08735162764787674,
|
|
"learning_rate": 2.580312311249828e-05,
|
|
"loss": 0.0053,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.4061382254528672,
|
|
"grad_norm": 0.03515574708580971,
|
|
"learning_rate": 2.572081425628628e-05,
|
|
"loss": 0.0026,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 1.4098303911388024,
|
|
"grad_norm": 0.07844855636358261,
|
|
"learning_rate": 2.5638399737260837e-05,
|
|
"loss": 0.0071,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 1.4135225568247374,
|
|
"grad_norm": 0.05010690912604332,
|
|
"learning_rate": 2.555588107760909e-05,
|
|
"loss": 0.0032,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 1.4172147225106726,
|
|
"grad_norm": 0.048177916556596756,
|
|
"learning_rate": 2.5473259801441663e-05,
|
|
"loss": 0.0027,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 1.4209068881966078,
|
|
"grad_norm": 0.073530413210392,
|
|
"learning_rate": 2.5390537434764483e-05,
|
|
"loss": 0.0066,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 1.424599053882543,
|
|
"grad_norm": 0.13586187362670898,
|
|
"learning_rate": 2.530771550545061e-05,
|
|
"loss": 0.0111,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 1.4282912195684783,
|
|
"grad_norm": 0.08986911922693253,
|
|
"learning_rate": 2.522479554321203e-05,
|
|
"loss": 0.003,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 1.4319833852544133,
|
|
"grad_norm": 0.09543124586343765,
|
|
"learning_rate": 2.5141779079571366e-05,
|
|
"loss": 0.0058,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 1.4356755509403485,
|
|
"grad_norm": 0.06117438152432442,
|
|
"learning_rate": 2.5058667647833615e-05,
|
|
"loss": 0.0031,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 1.4393677166262835,
|
|
"grad_norm": 0.05431349202990532,
|
|
"learning_rate": 2.4975462783057837e-05,
|
|
"loss": 0.006,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.4430598823122187,
|
|
"grad_norm": 0.04814140498638153,
|
|
"learning_rate": 2.4892166022028778e-05,
|
|
"loss": 0.0026,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 1.446752047998154,
|
|
"grad_norm": 0.04245537519454956,
|
|
"learning_rate": 2.4808778903228506e-05,
|
|
"loss": 0.0024,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 1.4504442136840892,
|
|
"grad_norm": 0.027589252218604088,
|
|
"learning_rate": 2.472530296680797e-05,
|
|
"loss": 0.0014,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 1.4541363793700242,
|
|
"grad_norm": 0.053102582693099976,
|
|
"learning_rate": 2.4641739754558594e-05,
|
|
"loss": 0.0051,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 1.4578285450559594,
|
|
"grad_norm": 0.04420861601829529,
|
|
"learning_rate": 2.4558090809883767e-05,
|
|
"loss": 0.0053,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 1.4615207107418946,
|
|
"grad_norm": 0.07793322950601578,
|
|
"learning_rate": 2.4474357677770336e-05,
|
|
"loss": 0.013,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 1.4652128764278296,
|
|
"grad_norm": 0.08467935770750046,
|
|
"learning_rate": 2.4390541904760105e-05,
|
|
"loss": 0.0059,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 1.4689050421137648,
|
|
"grad_norm": 0.10742378234863281,
|
|
"learning_rate": 2.430664503892122e-05,
|
|
"loss": 0.0077,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 1.4725972077997,
|
|
"grad_norm": 0.04297586902976036,
|
|
"learning_rate": 2.4222668629819622e-05,
|
|
"loss": 0.0021,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 1.4762893734856353,
|
|
"grad_norm": 0.047660425305366516,
|
|
"learning_rate": 2.4138614228490395e-05,
|
|
"loss": 0.0048,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.4799815391715703,
|
|
"grad_norm": 0.046939097344875336,
|
|
"learning_rate": 2.4054483387409135e-05,
|
|
"loss": 0.0024,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 1.4836737048575055,
|
|
"grad_norm": 0.06429211050271988,
|
|
"learning_rate": 2.3970277660463275e-05,
|
|
"loss": 0.0032,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 1.4873658705434407,
|
|
"grad_norm": 0.053521350026130676,
|
|
"learning_rate": 2.3885998602923387e-05,
|
|
"loss": 0.0033,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 1.4910580362293757,
|
|
"grad_norm": 0.05599725991487503,
|
|
"learning_rate": 2.380164777141443e-05,
|
|
"loss": 0.0035,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 1.494750201915311,
|
|
"grad_norm": 0.055872391909360886,
|
|
"learning_rate": 2.3717226723887037e-05,
|
|
"loss": 0.0041,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 1.4984423676012462,
|
|
"grad_norm": 0.0680844634771347,
|
|
"learning_rate": 2.363273701958873e-05,
|
|
"loss": 0.0043,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 1.5021345332871814,
|
|
"grad_norm": 0.14843975007534027,
|
|
"learning_rate": 2.35481802190351e-05,
|
|
"loss": 0.0105,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 1.5058266989731164,
|
|
"grad_norm": 0.05503234639763832,
|
|
"learning_rate": 2.3463557883980995e-05,
|
|
"loss": 0.0046,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 1.5095188646590516,
|
|
"grad_norm": 0.11701611429452896,
|
|
"learning_rate": 2.337887157739169e-05,
|
|
"loss": 0.0338,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 1.5132110303449866,
|
|
"grad_norm": 0.08817101269960403,
|
|
"learning_rate": 2.3294122863414e-05,
|
|
"loss": 0.0129,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.5169031960309218,
|
|
"grad_norm": 0.08257415890693665,
|
|
"learning_rate": 2.3209313307347413e-05,
|
|
"loss": 0.0104,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 1.520595361716857,
|
|
"grad_norm": 0.05527732893824577,
|
|
"learning_rate": 2.312444447561514e-05,
|
|
"loss": 0.0038,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 1.5242875274027923,
|
|
"grad_norm": 0.06380025297403336,
|
|
"learning_rate": 2.3039517935735215e-05,
|
|
"loss": 0.0051,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 1.5279796930887275,
|
|
"grad_norm": 0.040247559547424316,
|
|
"learning_rate": 2.2954535256291554e-05,
|
|
"loss": 0.0027,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 1.5316718587746625,
|
|
"grad_norm": 0.04606495052576065,
|
|
"learning_rate": 2.2869498006904934e-05,
|
|
"loss": 0.0043,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.5353640244605977,
|
|
"grad_norm": 0.051601577550172806,
|
|
"learning_rate": 2.2784407758204054e-05,
|
|
"loss": 0.0039,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 1.5390561901465327,
|
|
"grad_norm": 0.06773068755865097,
|
|
"learning_rate": 2.2699266081796493e-05,
|
|
"loss": 0.0042,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 1.542748355832468,
|
|
"grad_norm": 0.06343487650156021,
|
|
"learning_rate": 2.2614074550239707e-05,
|
|
"loss": 0.0053,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 1.5464405215184032,
|
|
"grad_norm": 0.04785839468240738,
|
|
"learning_rate": 2.2528834737011963e-05,
|
|
"loss": 0.0057,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 1.5501326872043384,
|
|
"grad_norm": 0.06275342404842377,
|
|
"learning_rate": 2.2443548216483292e-05,
|
|
"loss": 0.0049,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.5538248528902736,
|
|
"grad_norm": 0.06469978392124176,
|
|
"learning_rate": 2.235821656388638e-05,
|
|
"loss": 0.0049,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 1.5575170185762086,
|
|
"grad_norm": 0.07403778284788132,
|
|
"learning_rate": 2.2272841355287526e-05,
|
|
"loss": 0.01,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 1.5612091842621436,
|
|
"grad_norm": 0.041104815900325775,
|
|
"learning_rate": 2.2187424167557496e-05,
|
|
"loss": 0.0027,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 1.5649013499480788,
|
|
"grad_norm": 0.09375158697366714,
|
|
"learning_rate": 2.210196657834239e-05,
|
|
"loss": 0.0037,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 1.568593515634014,
|
|
"grad_norm": 0.043952830135822296,
|
|
"learning_rate": 2.2016470166034544e-05,
|
|
"loss": 0.0034,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 1.5722856813199493,
|
|
"grad_norm": 0.07946529239416122,
|
|
"learning_rate": 2.193093650974334e-05,
|
|
"loss": 0.0103,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 1.5759778470058845,
|
|
"grad_norm": 0.09605729579925537,
|
|
"learning_rate": 2.184536718926604e-05,
|
|
"loss": 0.0063,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 1.5796700126918195,
|
|
"grad_norm": 0.04163216054439545,
|
|
"learning_rate": 2.175976378505865e-05,
|
|
"loss": 0.0036,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 1.5833621783777547,
|
|
"grad_norm": 0.036647167056798935,
|
|
"learning_rate": 2.1674127878206664e-05,
|
|
"loss": 0.0037,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 1.5870543440636897,
|
|
"grad_norm": 0.06975825875997543,
|
|
"learning_rate": 2.1588461050395918e-05,
|
|
"loss": 0.0077,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.590746509749625,
|
|
"grad_norm": 0.06405274569988251,
|
|
"learning_rate": 2.1502764883883355e-05,
|
|
"loss": 0.0085,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 1.5944386754355602,
|
|
"grad_norm": 0.09269712120294571,
|
|
"learning_rate": 2.141704096146779e-05,
|
|
"loss": 0.0192,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 1.5981308411214954,
|
|
"grad_norm": 0.07872427254915237,
|
|
"learning_rate": 2.133129086646069e-05,
|
|
"loss": 0.0115,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 1.6018230068074306,
|
|
"grad_norm": 0.053590744733810425,
|
|
"learning_rate": 2.1245516182656938e-05,
|
|
"loss": 0.0039,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 1.6055151724933656,
|
|
"grad_norm": 0.05257750675082207,
|
|
"learning_rate": 2.1159718494305547e-05,
|
|
"loss": 0.005,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 1.6092073381793008,
|
|
"grad_norm": 0.05436495319008827,
|
|
"learning_rate": 2.107389938608045e-05,
|
|
"loss": 0.0044,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 1.6128995038652358,
|
|
"grad_norm": 0.063501738011837,
|
|
"learning_rate": 2.0988060443051165e-05,
|
|
"loss": 0.0059,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 1.616591669551171,
|
|
"grad_norm": 0.06863530725240707,
|
|
"learning_rate": 2.0902203250653596e-05,
|
|
"loss": 0.0092,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 1.6202838352371063,
|
|
"grad_norm": 0.06638474762439728,
|
|
"learning_rate": 2.0816329394660696e-05,
|
|
"loss": 0.0031,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 1.6239760009230415,
|
|
"grad_norm": 0.03661806881427765,
|
|
"learning_rate": 2.0730440461153183e-05,
|
|
"loss": 0.0036,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.6276681666089767,
|
|
"grad_norm": 0.05380409210920334,
|
|
"learning_rate": 2.0644538036490257e-05,
|
|
"loss": 0.0062,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 1.6313603322949117,
|
|
"grad_norm": 0.0474727526307106,
|
|
"learning_rate": 2.0558623707280313e-05,
|
|
"loss": 0.0033,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 1.6350524979808467,
|
|
"grad_norm": 0.07232918590307236,
|
|
"learning_rate": 2.0472699060351614e-05,
|
|
"loss": 0.0035,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 1.638744663666782,
|
|
"grad_norm": 0.16932211816310883,
|
|
"learning_rate": 2.038676568272298e-05,
|
|
"loss": 0.0054,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 1.6424368293527172,
|
|
"grad_norm": 0.08939804881811142,
|
|
"learning_rate": 2.03008251615745e-05,
|
|
"loss": 0.0061,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 1.6461289950386524,
|
|
"grad_norm": 0.048010073602199554,
|
|
"learning_rate": 2.0214879084218193e-05,
|
|
"loss": 0.0033,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 1.6498211607245876,
|
|
"grad_norm": 0.08143167942762375,
|
|
"learning_rate": 2.0128929038068716e-05,
|
|
"loss": 0.0123,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 1.6535133264105226,
|
|
"grad_norm": 0.041366055607795715,
|
|
"learning_rate": 2.0042976610614006e-05,
|
|
"loss": 0.0022,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 1.6572054920964578,
|
|
"grad_norm": 0.06036004796624184,
|
|
"learning_rate": 1.9957023389385998e-05,
|
|
"loss": 0.0031,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 1.6608976577823928,
|
|
"grad_norm": 0.06090189516544342,
|
|
"learning_rate": 1.9871070961931294e-05,
|
|
"loss": 0.0046,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 1.664589823468328,
|
|
"grad_norm": 0.06394810229539871,
|
|
"learning_rate": 1.9785120915781813e-05,
|
|
"loss": 0.0055,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 1.6682819891542633,
|
|
"grad_norm": 0.05126480385661125,
|
|
"learning_rate": 1.9699174838425502e-05,
|
|
"loss": 0.0028,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 1.6719741548401985,
|
|
"grad_norm": 0.16779179871082306,
|
|
"learning_rate": 1.961323431727703e-05,
|
|
"loss": 0.0138,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 1.6756663205261337,
|
|
"grad_norm": 0.04399624094367027,
|
|
"learning_rate": 1.9527300939648396e-05,
|
|
"loss": 0.0028,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 1.6793584862120687,
|
|
"grad_norm": 0.07073678821325302,
|
|
"learning_rate": 1.9441376292719687e-05,
|
|
"loss": 0.0042,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 1.683050651898004,
|
|
"grad_norm": 0.06449951231479645,
|
|
"learning_rate": 1.935546196350975e-05,
|
|
"loss": 0.0047,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 1.686742817583939,
|
|
"grad_norm": 0.05503733456134796,
|
|
"learning_rate": 1.9269559538846823e-05,
|
|
"loss": 0.0054,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 1.6904349832698742,
|
|
"grad_norm": 0.0840897262096405,
|
|
"learning_rate": 1.9183670605339314e-05,
|
|
"loss": 0.0096,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 1.6941271489558094,
|
|
"grad_norm": 0.06032564863562584,
|
|
"learning_rate": 1.909779674934641e-05,
|
|
"loss": 0.0039,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 1.6978193146417446,
|
|
"grad_norm": 0.08506152033805847,
|
|
"learning_rate": 1.9011939556948835e-05,
|
|
"loss": 0.0061,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.6978193146417446,
|
|
"eval_loss": 0.007821443490684032,
|
|
"eval_runtime": 90.0445,
|
|
"eval_samples_per_second": 10.139,
|
|
"eval_steps_per_second": 5.075,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.7015114803276798,
|
|
"grad_norm": 0.09359995275735855,
|
|
"learning_rate": 1.8926100613919565e-05,
|
|
"loss": 0.0137,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 1.7052036460136148,
|
|
"grad_norm": 0.050462689250707626,
|
|
"learning_rate": 1.884028150569446e-05,
|
|
"loss": 0.0026,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 1.70889581169955,
|
|
"grad_norm": 0.05139093101024628,
|
|
"learning_rate": 1.8754483817343065e-05,
|
|
"loss": 0.0038,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 1.712587977385485,
|
|
"grad_norm": 0.05640941858291626,
|
|
"learning_rate": 1.8668709133539316e-05,
|
|
"loss": 0.0048,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 1.7162801430714203,
|
|
"grad_norm": 0.0617087222635746,
|
|
"learning_rate": 1.8582959038532216e-05,
|
|
"loss": 0.0066,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 1.7199723087573555,
|
|
"grad_norm": 0.05858307704329491,
|
|
"learning_rate": 1.8497235116116656e-05,
|
|
"loss": 0.0042,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 1.7236644744432907,
|
|
"grad_norm": 0.06570729613304138,
|
|
"learning_rate": 1.841153894960409e-05,
|
|
"loss": 0.0069,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 1.727356640129226,
|
|
"grad_norm": 0.03923754021525383,
|
|
"learning_rate": 1.8325872121793343e-05,
|
|
"loss": 0.0038,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 1.731048805815161,
|
|
"grad_norm": 0.04980519786477089,
|
|
"learning_rate": 1.824023621494136e-05,
|
|
"loss": 0.0033,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 1.734740971501096,
|
|
"grad_norm": 0.0408487394452095,
|
|
"learning_rate": 1.815463281073396e-05,
|
|
"loss": 0.0025,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 1.7384331371870312,
|
|
"grad_norm": 0.07105151563882828,
|
|
"learning_rate": 1.8069063490256668e-05,
|
|
"loss": 0.0055,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 1.7421253028729664,
|
|
"grad_norm": 0.048975639045238495,
|
|
"learning_rate": 1.7983529833965463e-05,
|
|
"loss": 0.0036,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 1.7458174685589016,
|
|
"grad_norm": 0.034441813826560974,
|
|
"learning_rate": 1.7898033421657616e-05,
|
|
"loss": 0.0029,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 1.7495096342448369,
|
|
"grad_norm": 0.07011700421571732,
|
|
"learning_rate": 1.7812575832442518e-05,
|
|
"loss": 0.0097,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 1.7532017999307719,
|
|
"grad_norm": 0.058152489364147186,
|
|
"learning_rate": 1.7727158644712484e-05,
|
|
"loss": 0.0092,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 1.756893965616707,
|
|
"grad_norm": 0.049807388335466385,
|
|
"learning_rate": 1.764178343611363e-05,
|
|
"loss": 0.0036,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 1.760586131302642,
|
|
"grad_norm": 0.05702248960733414,
|
|
"learning_rate": 1.755645178351672e-05,
|
|
"loss": 0.0091,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 1.7642782969885773,
|
|
"grad_norm": 0.051516178995370865,
|
|
"learning_rate": 1.747116526298804e-05,
|
|
"loss": 0.0044,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 1.7679704626745125,
|
|
"grad_norm": 0.09747370332479477,
|
|
"learning_rate": 1.7385925449760296e-05,
|
|
"loss": 0.0065,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 1.7716626283604477,
|
|
"grad_norm": 0.05502758547663689,
|
|
"learning_rate": 1.7300733918203514e-05,
|
|
"loss": 0.0151,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 1.775354794046383,
|
|
"grad_norm": 0.05905942991375923,
|
|
"learning_rate": 1.7215592241795956e-05,
|
|
"loss": 0.0029,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 1.779046959732318,
|
|
"grad_norm": 0.07898251712322235,
|
|
"learning_rate": 1.7130501993095076e-05,
|
|
"loss": 0.0034,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 1.7827391254182532,
|
|
"grad_norm": 0.0482512004673481,
|
|
"learning_rate": 1.7045464743708456e-05,
|
|
"loss": 0.0065,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 1.7864312911041882,
|
|
"grad_norm": 0.07192537188529968,
|
|
"learning_rate": 1.6960482064264788e-05,
|
|
"loss": 0.0076,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 1.7901234567901234,
|
|
"grad_norm": 0.08650217205286026,
|
|
"learning_rate": 1.687555552438487e-05,
|
|
"loss": 0.0209,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 1.7938156224760586,
|
|
"grad_norm": 0.06443698704242706,
|
|
"learning_rate": 1.679068669265259e-05,
|
|
"loss": 0.0148,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 1.7975077881619939,
|
|
"grad_norm": 0.04942217096686363,
|
|
"learning_rate": 1.6705877136586e-05,
|
|
"loss": 0.0048,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 1.801199953847929,
|
|
"grad_norm": 0.04607919976115227,
|
|
"learning_rate": 1.6621128422608318e-05,
|
|
"loss": 0.0034,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 1.804892119533864,
|
|
"grad_norm": 0.06549520045518875,
|
|
"learning_rate": 1.6536442116019012e-05,
|
|
"loss": 0.0067,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 1.808584285219799,
|
|
"grad_norm": 0.1073460504412651,
|
|
"learning_rate": 1.6451819780964912e-05,
|
|
"loss": 0.0181,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 1.8122764509057343,
|
|
"grad_norm": 0.03819148242473602,
|
|
"learning_rate": 1.6367262980411273e-05,
|
|
"loss": 0.0031,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 1.8159686165916695,
|
|
"grad_norm": 0.05567432940006256,
|
|
"learning_rate": 1.6282773276112963e-05,
|
|
"loss": 0.0032,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 1.8196607822776047,
|
|
"grad_norm": 0.0490952767431736,
|
|
"learning_rate": 1.619835222858558e-05,
|
|
"loss": 0.0022,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 1.82335294796354,
|
|
"grad_norm": 0.06697966158390045,
|
|
"learning_rate": 1.6114001397076623e-05,
|
|
"loss": 0.0051,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 1.827045113649475,
|
|
"grad_norm": 0.06809218227863312,
|
|
"learning_rate": 1.6029722339536725e-05,
|
|
"loss": 0.0065,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 1.8307372793354102,
|
|
"grad_norm": 0.07054764032363892,
|
|
"learning_rate": 1.5945516612590872e-05,
|
|
"loss": 0.0051,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 1.8344294450213452,
|
|
"grad_norm": 0.03432456776499748,
|
|
"learning_rate": 1.5861385771509612e-05,
|
|
"loss": 0.0021,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 1.8381216107072804,
|
|
"grad_norm": 0.08520621806383133,
|
|
"learning_rate": 1.5777331370180388e-05,
|
|
"loss": 0.005,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 1.8418137763932156,
|
|
"grad_norm": 0.05386526510119438,
|
|
"learning_rate": 1.5693354961078783e-05,
|
|
"loss": 0.0048,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 1.8455059420791509,
|
|
"grad_norm": 0.07909571379423141,
|
|
"learning_rate": 1.56094580952399e-05,
|
|
"loss": 0.0068,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.849198107765086,
|
|
"grad_norm": 0.048736322671175,
|
|
"learning_rate": 1.5525642322229667e-05,
|
|
"loss": 0.0082,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 1.852890273451021,
|
|
"grad_norm": 0.05122867971658707,
|
|
"learning_rate": 1.5441909190116237e-05,
|
|
"loss": 0.0054,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 1.8565824391369563,
|
|
"grad_norm": 0.043926313519477844,
|
|
"learning_rate": 1.535826024544141e-05,
|
|
"loss": 0.0022,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 1.8602746048228913,
|
|
"grad_norm": 0.11847585439682007,
|
|
"learning_rate": 1.5274697033192033e-05,
|
|
"loss": 0.0228,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 1.8639667705088265,
|
|
"grad_norm": 0.06619201600551605,
|
|
"learning_rate": 1.51912210967715e-05,
|
|
"loss": 0.0038,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 1.8676589361947618,
|
|
"grad_norm": 0.11160582304000854,
|
|
"learning_rate": 1.5107833977971227e-05,
|
|
"loss": 0.0097,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 1.871351101880697,
|
|
"grad_norm": 0.053367406129837036,
|
|
"learning_rate": 1.5024537216942166e-05,
|
|
"loss": 0.0046,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 1.8750432675666322,
|
|
"grad_norm": 0.04304146394133568,
|
|
"learning_rate": 1.4941332352166385e-05,
|
|
"loss": 0.0038,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 1.8787354332525672,
|
|
"grad_norm": 0.06712479144334793,
|
|
"learning_rate": 1.485822092042864e-05,
|
|
"loss": 0.0094,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 1.8824275989385022,
|
|
"grad_norm": 0.07085831463336945,
|
|
"learning_rate": 1.4775204456787973e-05,
|
|
"loss": 0.0065,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 1.8861197646244374,
|
|
"grad_norm": 0.06869763880968094,
|
|
"learning_rate": 1.469228449454939e-05,
|
|
"loss": 0.0062,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 1.8898119303103726,
|
|
"grad_norm": 0.25166359543800354,
|
|
"learning_rate": 1.4609462565235524e-05,
|
|
"loss": 0.0039,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 1.8935040959963079,
|
|
"grad_norm": 0.05382119119167328,
|
|
"learning_rate": 1.4526740198558345e-05,
|
|
"loss": 0.0076,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 1.897196261682243,
|
|
"grad_norm": 0.03443971276283264,
|
|
"learning_rate": 1.4444118922390921e-05,
|
|
"loss": 0.0029,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 1.9008884273681783,
|
|
"grad_norm": 0.04824664443731308,
|
|
"learning_rate": 1.4361600262739171e-05,
|
|
"loss": 0.0034,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 1.9045805930541133,
|
|
"grad_norm": 0.05474744364619255,
|
|
"learning_rate": 1.4279185743713721e-05,
|
|
"loss": 0.0053,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 1.9082727587400483,
|
|
"grad_norm": 0.05808331444859505,
|
|
"learning_rate": 1.419687688750173e-05,
|
|
"loss": 0.0036,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 1.9119649244259835,
|
|
"grad_norm": 0.0522182323038578,
|
|
"learning_rate": 1.4114675214338745e-05,
|
|
"loss": 0.0029,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 1.9156570901119188,
|
|
"grad_norm": 0.04176926612854004,
|
|
"learning_rate": 1.4032582242480692e-05,
|
|
"loss": 0.0069,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 1.919349255797854,
|
|
"grad_norm": 0.3265964984893799,
|
|
"learning_rate": 1.3950599488175783e-05,
|
|
"loss": 0.0124,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.9230414214837892,
|
|
"grad_norm": 0.05882977694272995,
|
|
"learning_rate": 1.3868728465636508e-05,
|
|
"loss": 0.0048,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 1.9267335871697242,
|
|
"grad_norm": 0.04733727127313614,
|
|
"learning_rate": 1.3786970687011713e-05,
|
|
"loss": 0.0057,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 1.9304257528556594,
|
|
"grad_norm": 0.07429318130016327,
|
|
"learning_rate": 1.3705327662358605e-05,
|
|
"loss": 0.0154,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 1.9341179185415944,
|
|
"grad_norm": 0.04765889793634415,
|
|
"learning_rate": 1.362380089961493e-05,
|
|
"loss": 0.0025,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 1.9378100842275297,
|
|
"grad_norm": 0.11108744144439697,
|
|
"learning_rate": 1.3542391904571082e-05,
|
|
"loss": 0.013,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 1.9415022499134649,
|
|
"grad_norm": 0.05669174715876579,
|
|
"learning_rate": 1.3461102180842274e-05,
|
|
"loss": 0.0063,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 1.9451944155994,
|
|
"grad_norm": 0.04899504780769348,
|
|
"learning_rate": 1.3379933229840827e-05,
|
|
"loss": 0.0061,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 1.9488865812853353,
|
|
"grad_norm": 0.04838700219988823,
|
|
"learning_rate": 1.3298886550748387e-05,
|
|
"loss": 0.0059,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 1.9525787469712703,
|
|
"grad_norm": 0.06490358710289001,
|
|
"learning_rate": 1.3217963640488232e-05,
|
|
"loss": 0.0032,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 1.9562709126572055,
|
|
"grad_norm": 0.06235655024647713,
|
|
"learning_rate": 1.3137165993697687e-05,
|
|
"loss": 0.0052,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.9599630783431405,
|
|
"grad_norm": 0.06066396087408066,
|
|
"learning_rate": 1.3056495102700426e-05,
|
|
"loss": 0.0082,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 1.9636552440290758,
|
|
"grad_norm": 0.03929363191127777,
|
|
"learning_rate": 1.2975952457478986e-05,
|
|
"loss": 0.0035,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 1.967347409715011,
|
|
"grad_norm": 0.1256752759218216,
|
|
"learning_rate": 1.2895539545647229e-05,
|
|
"loss": 0.0051,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 1.9710395754009462,
|
|
"grad_norm": 0.04527255520224571,
|
|
"learning_rate": 1.2815257852422818e-05,
|
|
"loss": 0.0029,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 1.9747317410868814,
|
|
"grad_norm": 0.08779493719339371,
|
|
"learning_rate": 1.2735108860599848e-05,
|
|
"loss": 0.0051,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.9784239067728164,
|
|
"grad_norm": 0.061192456632852554,
|
|
"learning_rate": 1.2655094050521447e-05,
|
|
"loss": 0.0061,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 1.9821160724587514,
|
|
"grad_norm": 0.03906107693910599,
|
|
"learning_rate": 1.2575214900052378e-05,
|
|
"loss": 0.0035,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 1.9858082381446867,
|
|
"grad_norm": 0.05745285004377365,
|
|
"learning_rate": 1.2495472884551836e-05,
|
|
"loss": 0.0061,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 1.9895004038306219,
|
|
"grad_norm": 0.0978095754981041,
|
|
"learning_rate": 1.2415869476846101e-05,
|
|
"loss": 0.0058,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 1.993192569516557,
|
|
"grad_norm": 0.05764520913362503,
|
|
"learning_rate": 1.2336406147201411e-05,
|
|
"loss": 0.0038,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.9968847352024923,
|
|
"grad_norm": 0.04506031796336174,
|
|
"learning_rate": 1.225708436329679e-05,
|
|
"loss": 0.0066,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.08254817128181458,
|
|
"learning_rate": 1.2177905590196884e-05,
|
|
"loss": 0.0048,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 2.0036921656859352,
|
|
"grad_norm": 0.050131332129240036,
|
|
"learning_rate": 1.2098871290324974e-05,
|
|
"loss": 0.0011,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 2.0073843313718704,
|
|
"grad_norm": 0.04068991169333458,
|
|
"learning_rate": 1.2019982923435954e-05,
|
|
"loss": 0.0048,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 2.0110764970578057,
|
|
"grad_norm": 0.04712502658367157,
|
|
"learning_rate": 1.1941241946589299e-05,
|
|
"loss": 0.0016,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 2.0147686627437404,
|
|
"grad_norm": 0.019259510561823845,
|
|
"learning_rate": 1.1862649814122263e-05,
|
|
"loss": 0.0013,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 2.0184608284296757,
|
|
"grad_norm": 0.06433498114347458,
|
|
"learning_rate": 1.1784207977622914e-05,
|
|
"loss": 0.0021,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 2.022152994115611,
|
|
"grad_norm": 0.054391320794820786,
|
|
"learning_rate": 1.1705917885903402e-05,
|
|
"loss": 0.0079,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 2.025845159801546,
|
|
"grad_norm": 0.04541629180312157,
|
|
"learning_rate": 1.1627780984973153e-05,
|
|
"loss": 0.0029,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 2.0295373254874813,
|
|
"grad_norm": 0.02488025464117527,
|
|
"learning_rate": 1.1549798718012184e-05,
|
|
"loss": 0.0013,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 2.0332294911734166,
|
|
"grad_norm": 0.06702622771263123,
|
|
"learning_rate": 1.1471972525344421e-05,
|
|
"loss": 0.0067,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 2.036921656859352,
|
|
"grad_norm": 0.04141972213983536,
|
|
"learning_rate": 1.139430384441115e-05,
|
|
"loss": 0.0037,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 2.036921656859352,
|
|
"eval_loss": 0.0076590548269450665,
|
|
"eval_runtime": 89.9642,
|
|
"eval_samples_per_second": 10.148,
|
|
"eval_steps_per_second": 5.08,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 2.0406138225452866,
|
|
"grad_norm": 0.09934539347887039,
|
|
"learning_rate": 1.1316794109744394e-05,
|
|
"loss": 0.0017,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 2.044305988231222,
|
|
"grad_norm": 0.04188989847898483,
|
|
"learning_rate": 1.1239444752940477e-05,
|
|
"loss": 0.0017,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 2.047998153917157,
|
|
"grad_norm": 0.03835677355527878,
|
|
"learning_rate": 1.1162257202633548e-05,
|
|
"loss": 0.0054,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 2.0516903196030922,
|
|
"grad_norm": 0.04190275818109512,
|
|
"learning_rate": 1.1085232884469236e-05,
|
|
"loss": 0.0022,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 2.0553824852890274,
|
|
"grad_norm": 0.03239237144589424,
|
|
"learning_rate": 1.1008373221078261e-05,
|
|
"loss": 0.0018,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 2.0590746509749627,
|
|
"grad_norm": 0.022900836542248726,
|
|
"learning_rate": 1.0931679632050186e-05,
|
|
"loss": 0.0014,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 2.0627668166608975,
|
|
"grad_norm": 0.04315221309661865,
|
|
"learning_rate": 1.085515353390723e-05,
|
|
"loss": 0.001,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 2.0664589823468327,
|
|
"grad_norm": 0.03839712589979172,
|
|
"learning_rate": 1.0778796340078043e-05,
|
|
"loss": 0.0019,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.070151148032768,
|
|
"grad_norm": 0.04536756873130798,
|
|
"learning_rate": 1.070260946087164e-05,
|
|
"loss": 0.005,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 2.073843313718703,
|
|
"grad_norm": 0.0286678746342659,
|
|
"learning_rate": 1.0626594303451359e-05,
|
|
"loss": 0.0013,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 2.0775354794046383,
|
|
"grad_norm": 0.04558982327580452,
|
|
"learning_rate": 1.0550752271808817e-05,
|
|
"loss": 0.0028,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 2.0812276450905736,
|
|
"grad_norm": 0.08176779747009277,
|
|
"learning_rate": 1.0475084766738051e-05,
|
|
"loss": 0.0031,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 2.084919810776509,
|
|
"grad_norm": 0.07253772020339966,
|
|
"learning_rate": 1.0399593185809625e-05,
|
|
"loss": 0.0081,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 2.0886119764624436,
|
|
"grad_norm": 0.045510150492191315,
|
|
"learning_rate": 1.0324278923344741e-05,
|
|
"loss": 0.0022,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 2.092304142148379,
|
|
"grad_norm": 0.04695519059896469,
|
|
"learning_rate": 1.0249143370389607e-05,
|
|
"loss": 0.0029,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 2.095996307834314,
|
|
"grad_norm": 0.04093737155199051,
|
|
"learning_rate": 1.0174187914689681e-05,
|
|
"loss": 0.0038,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 2.0996884735202492,
|
|
"grad_norm": 0.026440149173140526,
|
|
"learning_rate": 1.0099413940664e-05,
|
|
"loss": 0.0011,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 2.1033806392061845,
|
|
"grad_norm": 0.040475890040397644,
|
|
"learning_rate": 1.0024822829379701e-05,
|
|
"loss": 0.0016,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 2.1070728048921197,
|
|
"grad_norm": 0.07037003338336945,
|
|
"learning_rate": 9.950415958526449e-06,
|
|
"loss": 0.0018,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 2.110764970578055,
|
|
"grad_norm": 0.04606116563081741,
|
|
"learning_rate": 9.876194702390998e-06,
|
|
"loss": 0.0018,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 2.1144571362639897,
|
|
"grad_norm": 0.03653557226061821,
|
|
"learning_rate": 9.802160431831845e-06,
|
|
"loss": 0.0011,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 2.118149301949925,
|
|
"grad_norm": 0.03161030635237694,
|
|
"learning_rate": 9.728314514253856e-06,
|
|
"loss": 0.0015,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 2.12184146763586,
|
|
"grad_norm": 0.054874520748853683,
|
|
"learning_rate": 9.654658313583045e-06,
|
|
"loss": 0.0084,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 2.1255336333217953,
|
|
"grad_norm": 0.07581143826246262,
|
|
"learning_rate": 9.581193190241398e-06,
|
|
"loss": 0.0031,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 2.1292257990077306,
|
|
"grad_norm": 0.049715541303157806,
|
|
"learning_rate": 9.507920501121685e-06,
|
|
"loss": 0.0018,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 2.132917964693666,
|
|
"grad_norm": 0.02939217910170555,
|
|
"learning_rate": 9.434841599562487e-06,
|
|
"loss": 0.0012,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 2.1366101303796006,
|
|
"grad_norm": 0.02201433666050434,
|
|
"learning_rate": 9.361957835323088e-06,
|
|
"loss": 0.0005,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 2.140302296065536,
|
|
"grad_norm": 0.03674834221601486,
|
|
"learning_rate": 9.289270554558651e-06,
|
|
"loss": 0.0018,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.143994461751471,
|
|
"grad_norm": 0.061158619821071625,
|
|
"learning_rate": 9.216781099795322e-06,
|
|
"loss": 0.0056,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 2.1476866274374062,
|
|
"grad_norm": 0.06538428366184235,
|
|
"learning_rate": 9.144490809905365e-06,
|
|
"loss": 0.0077,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 2.1513787931233415,
|
|
"grad_norm": 0.03936760872602463,
|
|
"learning_rate": 9.072401020082542e-06,
|
|
"loss": 0.0021,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 2.1550709588092767,
|
|
"grad_norm": 0.06418730318546295,
|
|
"learning_rate": 9.0005130618174e-06,
|
|
"loss": 0.0042,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 2.158763124495212,
|
|
"grad_norm": 0.027600156143307686,
|
|
"learning_rate": 8.928828262872633e-06,
|
|
"loss": 0.0008,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 2.1624552901811467,
|
|
"grad_norm": 0.02378762699663639,
|
|
"learning_rate": 8.857347947258657e-06,
|
|
"loss": 0.0008,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 2.166147455867082,
|
|
"grad_norm": 0.05525508150458336,
|
|
"learning_rate": 8.786073435209072e-06,
|
|
"loss": 0.003,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 2.169839621553017,
|
|
"grad_norm": 0.05668642744421959,
|
|
"learning_rate": 8.715006043156315e-06,
|
|
"loss": 0.0047,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 2.1735317872389524,
|
|
"grad_norm": 0.6010801196098328,
|
|
"learning_rate": 8.644147083707354e-06,
|
|
"loss": 0.0111,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 2.1772239529248876,
|
|
"grad_norm": 0.364096075296402,
|
|
"learning_rate": 8.573497865619414e-06,
|
|
"loss": 0.018,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 2.180916118610823,
|
|
"grad_norm": 0.03992763161659241,
|
|
"learning_rate": 8.503059693775813e-06,
|
|
"loss": 0.0023,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 2.184608284296758,
|
|
"grad_norm": 0.025853926315903664,
|
|
"learning_rate": 8.432833869161893e-06,
|
|
"loss": 0.0012,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 2.188300449982693,
|
|
"grad_norm": 0.02981944940984249,
|
|
"learning_rate": 8.362821688840947e-06,
|
|
"loss": 0.0015,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 2.191992615668628,
|
|
"grad_norm": 0.05791231989860535,
|
|
"learning_rate": 8.293024445930287e-06,
|
|
"loss": 0.0027,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 2.1956847813545632,
|
|
"grad_norm": 0.04732658341526985,
|
|
"learning_rate": 8.223443429577343e-06,
|
|
"loss": 0.0033,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 2.1993769470404985,
|
|
"grad_norm": 0.03762805834412575,
|
|
"learning_rate": 8.154079924935892e-06,
|
|
"loss": 0.0015,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 2.2030691127264337,
|
|
"grad_norm": 0.07153934240341187,
|
|
"learning_rate": 8.084935213142269e-06,
|
|
"loss": 0.0022,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 2.206761278412369,
|
|
"grad_norm": 0.04638079181313515,
|
|
"learning_rate": 8.016010571291725e-06,
|
|
"loss": 0.0018,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 2.210453444098304,
|
|
"grad_norm": 0.030643608421087265,
|
|
"learning_rate": 7.947307272414874e-06,
|
|
"loss": 0.0029,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 2.214145609784239,
|
|
"grad_norm": 0.02873793989419937,
|
|
"learning_rate": 7.878826585454122e-06,
|
|
"loss": 0.0021,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.217837775470174,
|
|
"grad_norm": 0.04478053003549576,
|
|
"learning_rate": 7.810569775240257e-06,
|
|
"loss": 0.003,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 2.2215299411561094,
|
|
"grad_norm": 0.046556588262319565,
|
|
"learning_rate": 7.742538102469111e-06,
|
|
"loss": 0.002,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 2.2252221068420446,
|
|
"grad_norm": 0.039592090994119644,
|
|
"learning_rate": 7.674732823678228e-06,
|
|
"loss": 0.0036,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 2.22891427252798,
|
|
"grad_norm": 0.031397104263305664,
|
|
"learning_rate": 7.607155191223683e-06,
|
|
"loss": 0.0013,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 2.232606438213915,
|
|
"grad_norm": 0.035691820085048676,
|
|
"learning_rate": 7.539806453256973e-06,
|
|
"loss": 0.0014,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 2.23629860389985,
|
|
"grad_norm": 0.0710548609495163,
|
|
"learning_rate": 7.472687853701908e-06,
|
|
"loss": 0.0078,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 2.239990769585785,
|
|
"grad_norm": 0.04906422272324562,
|
|
"learning_rate": 7.405800632231672e-06,
|
|
"loss": 0.0025,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 2.2436829352717202,
|
|
"grad_norm": 0.08130212128162384,
|
|
"learning_rate": 7.339146024245947e-06,
|
|
"loss": 0.006,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 2.2473751009576555,
|
|
"grad_norm": 0.0357891283929348,
|
|
"learning_rate": 7.272725260848037e-06,
|
|
"loss": 0.0018,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 2.2510672666435907,
|
|
"grad_norm": 0.05284830555319786,
|
|
"learning_rate": 7.206539568822179e-06,
|
|
"loss": 0.0017,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.254759432329526,
|
|
"grad_norm": 0.03607559949159622,
|
|
"learning_rate": 7.140590170610857e-06,
|
|
"loss": 0.0023,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 2.258451598015461,
|
|
"grad_norm": 0.043198052793741226,
|
|
"learning_rate": 7.0748782842922545e-06,
|
|
"loss": 0.0016,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 2.262143763701396,
|
|
"grad_norm": 0.07694031298160553,
|
|
"learning_rate": 7.0094051235577155e-06,
|
|
"loss": 0.0025,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 2.265835929387331,
|
|
"grad_norm": 0.040816958993673325,
|
|
"learning_rate": 6.944171897689349e-06,
|
|
"loss": 0.0043,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 2.2695280950732664,
|
|
"grad_norm": 0.02998993545770645,
|
|
"learning_rate": 6.879179811537715e-06,
|
|
"loss": 0.001,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 2.2732202607592016,
|
|
"grad_norm": 0.0377618744969368,
|
|
"learning_rate": 6.814430065499526e-06,
|
|
"loss": 0.0017,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 2.276912426445137,
|
|
"grad_norm": 0.05830111727118492,
|
|
"learning_rate": 6.749923855495502e-06,
|
|
"loss": 0.0038,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 2.280604592131072,
|
|
"grad_norm": 0.042703039944171906,
|
|
"learning_rate": 6.685662372948298e-06,
|
|
"loss": 0.004,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 2.284296757817007,
|
|
"grad_norm": 0.0343906469643116,
|
|
"learning_rate": 6.62164680476046e-06,
|
|
"loss": 0.0024,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 2.287988923502942,
|
|
"grad_norm": 0.030185390263795853,
|
|
"learning_rate": 6.55787833329252e-06,
|
|
"loss": 0.001,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.2916810891888773,
|
|
"grad_norm": 0.03131229057908058,
|
|
"learning_rate": 6.4943581363411855e-06,
|
|
"loss": 0.0015,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 2.2953732548748125,
|
|
"grad_norm": 0.03044010140001774,
|
|
"learning_rate": 6.431087387117538e-06,
|
|
"loss": 0.001,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 2.2990654205607477,
|
|
"grad_norm": 0.028354087844491005,
|
|
"learning_rate": 6.368067254225387e-06,
|
|
"loss": 0.0009,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 2.302757586246683,
|
|
"grad_norm": 0.06797152012586594,
|
|
"learning_rate": 6.305298901639704e-06,
|
|
"loss": 0.0081,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 2.306449751932618,
|
|
"grad_norm": 0.035257063806056976,
|
|
"learning_rate": 6.242783488685091e-06,
|
|
"loss": 0.0013,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 2.3101419176185534,
|
|
"grad_norm": 0.09349807351827621,
|
|
"learning_rate": 6.1805221700143844e-06,
|
|
"loss": 0.017,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 2.313834083304488,
|
|
"grad_norm": 0.07014710456132889,
|
|
"learning_rate": 6.118516095587321e-06,
|
|
"loss": 0.0109,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 2.3175262489904234,
|
|
"grad_norm": 0.044707559049129486,
|
|
"learning_rate": 6.056766410649329e-06,
|
|
"loss": 0.0013,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 2.3212184146763586,
|
|
"grad_norm": 0.1256485879421234,
|
|
"learning_rate": 5.99527425571032e-06,
|
|
"loss": 0.0066,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 2.324910580362294,
|
|
"grad_norm": 0.03149070963263512,
|
|
"learning_rate": 5.934040766523668e-06,
|
|
"loss": 0.0026,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 2.328602746048229,
|
|
"grad_norm": 0.029719380661845207,
|
|
"learning_rate": 5.873067074065229e-06,
|
|
"loss": 0.001,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 2.3322949117341643,
|
|
"grad_norm": 0.02905251644551754,
|
|
"learning_rate": 5.8123543045124285e-06,
|
|
"loss": 0.0015,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 2.335987077420099,
|
|
"grad_norm": 0.030503999441862106,
|
|
"learning_rate": 5.751903579223468e-06,
|
|
"loss": 0.001,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 2.3396792431060343,
|
|
"grad_norm": 0.039289508014917374,
|
|
"learning_rate": 5.6917160147166525e-06,
|
|
"loss": 0.0019,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 2.3433714087919695,
|
|
"grad_norm": 0.04832150787115097,
|
|
"learning_rate": 5.6317927226496875e-06,
|
|
"loss": 0.0016,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 2.3470635744779047,
|
|
"grad_norm": 0.03619923070073128,
|
|
"learning_rate": 5.572134809799235e-06,
|
|
"loss": 0.0023,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 2.35075574016384,
|
|
"grad_norm": 0.036724645644426346,
|
|
"learning_rate": 5.512743378040428e-06,
|
|
"loss": 0.001,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 2.354447905849775,
|
|
"grad_norm": 0.026730459183454514,
|
|
"learning_rate": 5.453619524326495e-06,
|
|
"loss": 0.0013,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 2.3581400715357104,
|
|
"grad_norm": 0.036537256091833115,
|
|
"learning_rate": 5.39476434066855e-06,
|
|
"loss": 0.0011,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 2.361832237221645,
|
|
"grad_norm": 0.0313183031976223,
|
|
"learning_rate": 5.3361789141154085e-06,
|
|
"loss": 0.0011,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.3655244029075804,
|
|
"grad_norm": 0.036056943237781525,
|
|
"learning_rate": 5.277864326733484e-06,
|
|
"loss": 0.0015,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 2.3692165685935156,
|
|
"grad_norm": 0.06026415154337883,
|
|
"learning_rate": 5.219821655586821e-06,
|
|
"loss": 0.0081,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 2.372908734279451,
|
|
"grad_norm": 0.04079107567667961,
|
|
"learning_rate": 5.162051972717204e-06,
|
|
"loss": 0.0028,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 2.376600899965386,
|
|
"grad_norm": 0.06928452104330063,
|
|
"learning_rate": 5.104556345124363e-06,
|
|
"loss": 0.0027,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 2.376600899965386,
|
|
"eval_loss": 0.008511913008987904,
|
|
"eval_runtime": 89.9081,
|
|
"eval_samples_per_second": 10.155,
|
|
"eval_steps_per_second": 5.083,
|
|
"step": 644
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 813,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 3,
|
|
"save_steps": 92,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.275551525360632e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|