4055 lines
99 KiB
JSON
4055 lines
99 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 5735,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0017438311971401169,
|
||
|
|
"grad_norm": 75.95468738643471,
|
||
|
|
"learning_rate": 5.202312138728324e-06,
|
||
|
|
"loss": 3.0208,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0034876623942802338,
|
||
|
|
"grad_norm": 30.03360815962505,
|
||
|
|
"learning_rate": 1.0982658959537573e-05,
|
||
|
|
"loss": 1.9958,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0052314935914203504,
|
||
|
|
"grad_norm": 17.34695165918347,
|
||
|
|
"learning_rate": 1.676300578034682e-05,
|
||
|
|
"loss": 1.0962,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0069753247885604676,
|
||
|
|
"grad_norm": 12.491792934654203,
|
||
|
|
"learning_rate": 2.254335260115607e-05,
|
||
|
|
"loss": 0.6126,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008719155985700585,
|
||
|
|
"grad_norm": 1.9114286148516435,
|
||
|
|
"learning_rate": 2.832369942196532e-05,
|
||
|
|
"loss": 0.4425,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010462987182840701,
|
||
|
|
"grad_norm": 1.428534192558495,
|
||
|
|
"learning_rate": 3.410404624277457e-05,
|
||
|
|
"loss": 0.3402,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012206818379980817,
|
||
|
|
"grad_norm": 2.416511852214603,
|
||
|
|
"learning_rate": 3.988439306358382e-05,
|
||
|
|
"loss": 0.2859,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.013950649577120935,
|
||
|
|
"grad_norm": 1.1798287792036775,
|
||
|
|
"learning_rate": 4.566473988439307e-05,
|
||
|
|
"loss": 0.2436,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01569448077426105,
|
||
|
|
"grad_norm": 0.9422829664821649,
|
||
|
|
"learning_rate": 5.1445086705202317e-05,
|
||
|
|
"loss": 0.2153,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01743831197140117,
|
||
|
|
"grad_norm": 0.6784111235814376,
|
||
|
|
"learning_rate": 5.722543352601156e-05,
|
||
|
|
"loss": 0.1949,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.019182143168541284,
|
||
|
|
"grad_norm": 0.6258877189910531,
|
||
|
|
"learning_rate": 6.300578034682081e-05,
|
||
|
|
"loss": 0.1748,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.020925974365681402,
|
||
|
|
"grad_norm": 1.0353787867694664,
|
||
|
|
"learning_rate": 6.878612716763007e-05,
|
||
|
|
"loss": 0.1579,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02266980556282152,
|
||
|
|
"grad_norm": 0.4027613120700805,
|
||
|
|
"learning_rate": 7.456647398843931e-05,
|
||
|
|
"loss": 0.15,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024413636759961634,
|
||
|
|
"grad_norm": 0.31222620383041055,
|
||
|
|
"learning_rate": 8.034682080924855e-05,
|
||
|
|
"loss": 0.1412,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.026157467957101752,
|
||
|
|
"grad_norm": 0.4650470906801264,
|
||
|
|
"learning_rate": 8.612716763005781e-05,
|
||
|
|
"loss": 0.1351,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02790129915424187,
|
||
|
|
"grad_norm": 0.3734885043246775,
|
||
|
|
"learning_rate": 9.190751445086706e-05,
|
||
|
|
"loss": 0.1351,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029645130351381985,
|
||
|
|
"grad_norm": 0.38052529337243646,
|
||
|
|
"learning_rate": 9.768786127167631e-05,
|
||
|
|
"loss": 0.1289,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0313889615485221,
|
||
|
|
"grad_norm": 0.24922983883737135,
|
||
|
|
"learning_rate": 9.999971286914107e-05,
|
||
|
|
"loss": 0.1278,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03313279274566222,
|
||
|
|
"grad_norm": 0.2337743034285069,
|
||
|
|
"learning_rate": 9.999795819250125e-05,
|
||
|
|
"loss": 0.12,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03487662394280234,
|
||
|
|
"grad_norm": 0.1925200524883156,
|
||
|
|
"learning_rate": 9.99946084122777e-05,
|
||
|
|
"loss": 0.1187,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03662045513994246,
|
||
|
|
"grad_norm": 0.25639013442385866,
|
||
|
|
"learning_rate": 9.998966363533971e-05,
|
||
|
|
"loss": 0.1155,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03836428633708257,
|
||
|
|
"grad_norm": 0.5969296827345937,
|
||
|
|
"learning_rate": 9.998312401944236e-05,
|
||
|
|
"loss": 0.1123,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.040108117534222686,
|
||
|
|
"grad_norm": 0.19274626117019628,
|
||
|
|
"learning_rate": 9.997498977322146e-05,
|
||
|
|
"loss": 0.113,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.041851948731362804,
|
||
|
|
"grad_norm": 0.29247520821204726,
|
||
|
|
"learning_rate": 9.996526115618692e-05,
|
||
|
|
"loss": 0.1117,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04359577992850292,
|
||
|
|
"grad_norm": 0.29663865702918896,
|
||
|
|
"learning_rate": 9.995393847871446e-05,
|
||
|
|
"loss": 0.1084,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04533961112564304,
|
||
|
|
"grad_norm": 0.7694784504132202,
|
||
|
|
"learning_rate": 9.994102210203567e-05,
|
||
|
|
"loss": 0.1075,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04708344232278316,
|
||
|
|
"grad_norm": 0.6628546556003793,
|
||
|
|
"learning_rate": 9.992651243822657e-05,
|
||
|
|
"loss": 0.1107,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04882727351992327,
|
||
|
|
"grad_norm": 0.19910200146659307,
|
||
|
|
"learning_rate": 9.991040995019441e-05,
|
||
|
|
"loss": 0.1042,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.050571104717063387,
|
||
|
|
"grad_norm": 0.482892752541729,
|
||
|
|
"learning_rate": 9.989271515166288e-05,
|
||
|
|
"loss": 0.1067,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.052314935914203504,
|
||
|
|
"grad_norm": 0.3316707002189607,
|
||
|
|
"learning_rate": 9.987342860715575e-05,
|
||
|
|
"loss": 0.1051,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05405876711134362,
|
||
|
|
"grad_norm": 0.1686835282656122,
|
||
|
|
"learning_rate": 9.985255093197889e-05,
|
||
|
|
"loss": 0.104,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05580259830848374,
|
||
|
|
"grad_norm": 0.20484377424861627,
|
||
|
|
"learning_rate": 9.983008279220061e-05,
|
||
|
|
"loss": 0.1003,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05754642950562386,
|
||
|
|
"grad_norm": 0.17506212895289214,
|
||
|
|
"learning_rate": 9.980602490463036e-05,
|
||
|
|
"loss": 0.1037,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05929026070276397,
|
||
|
|
"grad_norm": 0.21847685544030518,
|
||
|
|
"learning_rate": 9.978037803679594e-05,
|
||
|
|
"loss": 0.1051,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06103409189990409,
|
||
|
|
"grad_norm": 0.1545992773211455,
|
||
|
|
"learning_rate": 9.975314300691897e-05,
|
||
|
|
"loss": 0.1008,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0627779230970442,
|
||
|
|
"grad_norm": 0.18093862679980835,
|
||
|
|
"learning_rate": 9.972432068388884e-05,
|
||
|
|
"loss": 0.1013,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06452175429418432,
|
||
|
|
"grad_norm": 0.16690423874276675,
|
||
|
|
"learning_rate": 9.969391198723489e-05,
|
||
|
|
"loss": 0.1015,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06626558549132444,
|
||
|
|
"grad_norm": 0.1592587474414291,
|
||
|
|
"learning_rate": 9.966191788709716e-05,
|
||
|
|
"loss": 0.0997,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06800941668846455,
|
||
|
|
"grad_norm": 0.4135662463271595,
|
||
|
|
"learning_rate": 9.96283394041954e-05,
|
||
|
|
"loss": 0.0976,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06975324788560468,
|
||
|
|
"grad_norm": 0.2936141046578689,
|
||
|
|
"learning_rate": 9.959317760979653e-05,
|
||
|
|
"loss": 0.098,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07149707908274479,
|
||
|
|
"grad_norm": 0.16272012987508996,
|
||
|
|
"learning_rate": 9.955643362568047e-05,
|
||
|
|
"loss": 0.0975,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07324091027988491,
|
||
|
|
"grad_norm": 0.2718060027024407,
|
||
|
|
"learning_rate": 9.951810862410426e-05,
|
||
|
|
"loss": 0.1025,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07498474147702502,
|
||
|
|
"grad_norm": 0.48320652277711706,
|
||
|
|
"learning_rate": 9.947820382776483e-05,
|
||
|
|
"loss": 0.0998,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07672857267416514,
|
||
|
|
"grad_norm": 0.34047211822697626,
|
||
|
|
"learning_rate": 9.943672050975978e-05,
|
||
|
|
"loss": 0.0973,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07847240387130526,
|
||
|
|
"grad_norm": 0.12654950824029546,
|
||
|
|
"learning_rate": 9.9393659993547e-05,
|
||
|
|
"loss": 0.0952,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08021623506844537,
|
||
|
|
"grad_norm": 0.19206792789150806,
|
||
|
|
"learning_rate": 9.934902365290221e-05,
|
||
|
|
"loss": 0.0916,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0819600662655855,
|
||
|
|
"grad_norm": 0.3174049728598392,
|
||
|
|
"learning_rate": 9.930281291187533e-05,
|
||
|
|
"loss": 0.0926,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08370389746272561,
|
||
|
|
"grad_norm": 0.21583614516207328,
|
||
|
|
"learning_rate": 9.925502924474494e-05,
|
||
|
|
"loss": 0.0968,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08544772865986572,
|
||
|
|
"grad_norm": 0.20077795953974328,
|
||
|
|
"learning_rate": 9.920567417597127e-05,
|
||
|
|
"loss": 0.0947,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08719155985700584,
|
||
|
|
"grad_norm": 0.5320546316014596,
|
||
|
|
"learning_rate": 9.915474928014754e-05,
|
||
|
|
"loss": 0.0922,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08893539105414595,
|
||
|
|
"grad_norm": 0.25808125396162024,
|
||
|
|
"learning_rate": 9.910225618194979e-05,
|
||
|
|
"loss": 0.0938,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09067922225128608,
|
||
|
|
"grad_norm": 0.16298921826803214,
|
||
|
|
"learning_rate": 9.9048196556085e-05,
|
||
|
|
"loss": 0.0934,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09242305344842619,
|
||
|
|
"grad_norm": 0.23375968225532243,
|
||
|
|
"learning_rate": 9.89925721272376e-05,
|
||
|
|
"loss": 0.0932,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09416688464556631,
|
||
|
|
"grad_norm": 0.16382604539487483,
|
||
|
|
"learning_rate": 9.893538467001465e-05,
|
||
|
|
"loss": 0.0898,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09591071584270643,
|
||
|
|
"grad_norm": 0.10016462312465411,
|
||
|
|
"learning_rate": 9.887663600888897e-05,
|
||
|
|
"loss": 0.0942,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09765454703984654,
|
||
|
|
"grad_norm": 0.14233007452976884,
|
||
|
|
"learning_rate": 9.881632801814112e-05,
|
||
|
|
"loss": 0.0907,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09939837823698666,
|
||
|
|
"grad_norm": 0.17513502703887412,
|
||
|
|
"learning_rate": 9.875446262179948e-05,
|
||
|
|
"loss": 0.0911,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10114220943412677,
|
||
|
|
"grad_norm": 0.14030487648990883,
|
||
|
|
"learning_rate": 9.869104179357898e-05,
|
||
|
|
"loss": 0.0934,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1028860406312669,
|
||
|
|
"grad_norm": 0.12085926283675864,
|
||
|
|
"learning_rate": 9.862606755681805e-05,
|
||
|
|
"loss": 0.0884,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10462987182840701,
|
||
|
|
"grad_norm": 0.23368729653021336,
|
||
|
|
"learning_rate": 9.855954198441411e-05,
|
||
|
|
"loss": 0.0911,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10637370302554713,
|
||
|
|
"grad_norm": 0.16684004596578658,
|
||
|
|
"learning_rate": 9.849146719875737e-05,
|
||
|
|
"loss": 0.0922,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10811753422268724,
|
||
|
|
"grad_norm": 0.1412977238838307,
|
||
|
|
"learning_rate": 9.842184537166325e-05,
|
||
|
|
"loss": 0.0901,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10986136541982736,
|
||
|
|
"grad_norm": 0.15930483520725827,
|
||
|
|
"learning_rate": 9.835067872430298e-05,
|
||
|
|
"loss": 0.091,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11160519661696748,
|
||
|
|
"grad_norm": 0.15145556668891252,
|
||
|
|
"learning_rate": 9.827796952713271e-05,
|
||
|
|
"loss": 0.0896,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11334902781410759,
|
||
|
|
"grad_norm": 0.11359857757495083,
|
||
|
|
"learning_rate": 9.820372009982122e-05,
|
||
|
|
"loss": 0.0909,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11509285901124772,
|
||
|
|
"grad_norm": 0.10805407788602606,
|
||
|
|
"learning_rate": 9.81279328111758e-05,
|
||
|
|
"loss": 0.0897,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11683669020838783,
|
||
|
|
"grad_norm": 0.1379641185520003,
|
||
|
|
"learning_rate": 9.805061007906669e-05,
|
||
|
|
"loss": 0.0901,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11858052140552794,
|
||
|
|
"grad_norm": 0.17169121682691532,
|
||
|
|
"learning_rate": 9.797175437034997e-05,
|
||
|
|
"loss": 0.0898,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12032435260266806,
|
||
|
|
"grad_norm": 0.14025167310704112,
|
||
|
|
"learning_rate": 9.789136820078883e-05,
|
||
|
|
"loss": 0.0888,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12206818379980817,
|
||
|
|
"grad_norm": 0.1521428482995378,
|
||
|
|
"learning_rate": 9.780945413497337e-05,
|
||
|
|
"loss": 0.0883,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1238120149969483,
|
||
|
|
"grad_norm": 0.1564174281355041,
|
||
|
|
"learning_rate": 9.77260147862387e-05,
|
||
|
|
"loss": 0.0892,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1255558461940884,
|
||
|
|
"grad_norm": 0.13358902120457766,
|
||
|
|
"learning_rate": 9.76410528165816e-05,
|
||
|
|
"loss": 0.086,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12729967739122852,
|
||
|
|
"grad_norm": 0.21507213503538028,
|
||
|
|
"learning_rate": 9.755457093657561e-05,
|
||
|
|
"loss": 0.0852,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12904350858836863,
|
||
|
|
"grad_norm": 0.10784925841529905,
|
||
|
|
"learning_rate": 9.746657190528454e-05,
|
||
|
|
"loss": 0.0853,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13078733978550877,
|
||
|
|
"grad_norm": 0.1252377711727728,
|
||
|
|
"learning_rate": 9.737705853017441e-05,
|
||
|
|
"loss": 0.0885,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13253117098264888,
|
||
|
|
"grad_norm": 0.13791730839422697,
|
||
|
|
"learning_rate": 9.728603366702399e-05,
|
||
|
|
"loss": 0.0863,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.134275002179789,
|
||
|
|
"grad_norm": 0.20745890428434363,
|
||
|
|
"learning_rate": 9.719350021983356e-05,
|
||
|
|
"loss": 0.0853,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1360188333769291,
|
||
|
|
"grad_norm": 0.11532866798616691,
|
||
|
|
"learning_rate": 9.709946114073232e-05,
|
||
|
|
"loss": 0.0891,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13776266457406924,
|
||
|
|
"grad_norm": 0.09902310686484139,
|
||
|
|
"learning_rate": 9.700391942988423e-05,
|
||
|
|
"loss": 0.0874,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13950649577120935,
|
||
|
|
"grad_norm": 0.12236583263546631,
|
||
|
|
"learning_rate": 9.690687813539229e-05,
|
||
|
|
"loss": 0.0873,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14125032696834947,
|
||
|
|
"grad_norm": 0.28381242657810296,
|
||
|
|
"learning_rate": 9.680834035320127e-05,
|
||
|
|
"loss": 0.0847,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14299415816548958,
|
||
|
|
"grad_norm": 0.17407339426371563,
|
||
|
|
"learning_rate": 9.670830922699889e-05,
|
||
|
|
"loss": 0.0865,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1447379893626297,
|
||
|
|
"grad_norm": 0.17139082411532464,
|
||
|
|
"learning_rate": 9.660678794811568e-05,
|
||
|
|
"loss": 0.0873,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14648182055976983,
|
||
|
|
"grad_norm": 0.18180876897987616,
|
||
|
|
"learning_rate": 9.650377975542297e-05,
|
||
|
|
"loss": 0.0856,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14822565175690994,
|
||
|
|
"grad_norm": 0.14881131848289347,
|
||
|
|
"learning_rate": 9.639928793522976e-05,
|
||
|
|
"loss": 0.0857,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14996948295405005,
|
||
|
|
"grad_norm": 0.19484994491316202,
|
||
|
|
"learning_rate": 9.629331582117766e-05,
|
||
|
|
"loss": 0.0874,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15171331415119016,
|
||
|
|
"grad_norm": 0.13056276669158606,
|
||
|
|
"learning_rate": 9.618586679413477e-05,
|
||
|
|
"loss": 0.0863,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15345714534833027,
|
||
|
|
"grad_norm": 0.11769246064182205,
|
||
|
|
"learning_rate": 9.607694428208759e-05,
|
||
|
|
"loss": 0.0833,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1552009765454704,
|
||
|
|
"grad_norm": 0.14628105454358595,
|
||
|
|
"learning_rate": 9.596655176003184e-05,
|
||
|
|
"loss": 0.084,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15694480774261052,
|
||
|
|
"grad_norm": 0.10138944434580117,
|
||
|
|
"learning_rate": 9.585469274986147e-05,
|
||
|
|
"loss": 0.0837,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15868863893975063,
|
||
|
|
"grad_norm": 0.2727810370648225,
|
||
|
|
"learning_rate": 9.57413708202564e-05,
|
||
|
|
"loss": 0.083,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16043247013689074,
|
||
|
|
"grad_norm": 0.12791732468432543,
|
||
|
|
"learning_rate": 9.562658958656855e-05,
|
||
|
|
"loss": 0.0838,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16217630133403085,
|
||
|
|
"grad_norm": 0.08919744277549542,
|
||
|
|
"learning_rate": 9.551035271070664e-05,
|
||
|
|
"loss": 0.0858,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.163920132531171,
|
||
|
|
"grad_norm": 0.10948146823445754,
|
||
|
|
"learning_rate": 9.539266390101921e-05,
|
||
|
|
"loss": 0.0844,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1656639637283111,
|
||
|
|
"grad_norm": 0.14190891350092028,
|
||
|
|
"learning_rate": 9.527352691217648e-05,
|
||
|
|
"loss": 0.0821,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16740779492545121,
|
||
|
|
"grad_norm": 0.13665202223597164,
|
||
|
|
"learning_rate": 9.515294554505039e-05,
|
||
|
|
"loss": 0.0831,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16915162612259133,
|
||
|
|
"grad_norm": 0.2559230604815077,
|
||
|
|
"learning_rate": 9.503092364659343e-05,
|
||
|
|
"loss": 0.0851,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17089545731973144,
|
||
|
|
"grad_norm": 0.12867152006889135,
|
||
|
|
"learning_rate": 9.490746510971595e-05,
|
||
|
|
"loss": 0.084,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17263928851687158,
|
||
|
|
"grad_norm": 0.16931334823903008,
|
||
|
|
"learning_rate": 9.47825738731619e-05,
|
||
|
|
"loss": 0.0842,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1743831197140117,
|
||
|
|
"grad_norm": 0.14129496751949874,
|
||
|
|
"learning_rate": 9.465625392138313e-05,
|
||
|
|
"loss": 0.0813,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1761269509111518,
|
||
|
|
"grad_norm": 0.11143855280545686,
|
||
|
|
"learning_rate": 9.452850928441239e-05,
|
||
|
|
"loss": 0.0811,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1778707821082919,
|
||
|
|
"grad_norm": 0.1051133376541276,
|
||
|
|
"learning_rate": 9.439934403773468e-05,
|
||
|
|
"loss": 0.0829,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17961461330543205,
|
||
|
|
"grad_norm": 0.0936072144785802,
|
||
|
|
"learning_rate": 9.42687623021572e-05,
|
||
|
|
"loss": 0.0835,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18135844450257216,
|
||
|
|
"grad_norm": 0.08594669519246889,
|
||
|
|
"learning_rate": 9.413676824367799e-05,
|
||
|
|
"loss": 0.084,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18310227569971227,
|
||
|
|
"grad_norm": 0.155895314210675,
|
||
|
|
"learning_rate": 9.400336607335293e-05,
|
||
|
|
"loss": 0.0814,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18484610689685238,
|
||
|
|
"grad_norm": 0.09064911002854469,
|
||
|
|
"learning_rate": 9.38685600471614e-05,
|
||
|
|
"loss": 0.0816,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1865899380939925,
|
||
|
|
"grad_norm": 0.11656040445018741,
|
||
|
|
"learning_rate": 9.373235446587056e-05,
|
||
|
|
"loss": 0.0832,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18833376929113263,
|
||
|
|
"grad_norm": 0.08144570375176247,
|
||
|
|
"learning_rate": 9.359475367489806e-05,
|
||
|
|
"loss": 0.0812,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19007760048827274,
|
||
|
|
"grad_norm": 0.14753379134013667,
|
||
|
|
"learning_rate": 9.345576206417345e-05,
|
||
|
|
"loss": 0.0825,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19182143168541285,
|
||
|
|
"grad_norm": 0.08941481925466258,
|
||
|
|
"learning_rate": 9.331538406799816e-05,
|
||
|
|
"loss": 0.0849,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19356526288255296,
|
||
|
|
"grad_norm": 0.10342356855139785,
|
||
|
|
"learning_rate": 9.317362416490396e-05,
|
||
|
|
"loss": 0.0812,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19530909407969307,
|
||
|
|
"grad_norm": 0.09908246193961452,
|
||
|
|
"learning_rate": 9.303048687751015e-05,
|
||
|
|
"loss": 0.0827,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1970529252768332,
|
||
|
|
"grad_norm": 0.08548417434334415,
|
||
|
|
"learning_rate": 9.288597677237918e-05,
|
||
|
|
"loss": 0.0795,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19879675647397332,
|
||
|
|
"grad_norm": 0.19698254462320883,
|
||
|
|
"learning_rate": 9.274009845987106e-05,
|
||
|
|
"loss": 0.0784,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20054058767111343,
|
||
|
|
"grad_norm": 0.19931397597516964,
|
||
|
|
"learning_rate": 9.259285659399624e-05,
|
||
|
|
"loss": 0.0791,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20228441886825355,
|
||
|
|
"grad_norm": 0.09541689313639541,
|
||
|
|
"learning_rate": 9.244425587226708e-05,
|
||
|
|
"loss": 0.0811,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20402825006539366,
|
||
|
|
"grad_norm": 0.13754375259689228,
|
||
|
|
"learning_rate": 9.229430103554809e-05,
|
||
|
|
"loss": 0.0834,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2057720812625338,
|
||
|
|
"grad_norm": 0.0901357227026893,
|
||
|
|
"learning_rate": 9.214299686790453e-05,
|
||
|
|
"loss": 0.0811,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2075159124596739,
|
||
|
|
"grad_norm": 0.10558399227810011,
|
||
|
|
"learning_rate": 9.199034819644996e-05,
|
||
|
|
"loss": 0.0805,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20925974365681402,
|
||
|
|
"grad_norm": 0.15527795571225775,
|
||
|
|
"learning_rate": 9.18363598911921e-05,
|
||
|
|
"loss": 0.0788,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21100357485395413,
|
||
|
|
"grad_norm": 0.07370244400658628,
|
||
|
|
"learning_rate": 9.168103686487754e-05,
|
||
|
|
"loss": 0.0809,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21274740605109427,
|
||
|
|
"grad_norm": 0.1080328076461725,
|
||
|
|
"learning_rate": 9.152438407283492e-05,
|
||
|
|
"loss": 0.0794,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21449123724823438,
|
||
|
|
"grad_norm": 0.12193088880477042,
|
||
|
|
"learning_rate": 9.136640651281694e-05,
|
||
|
|
"loss": 0.078,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2162350684453745,
|
||
|
|
"grad_norm": 0.1435896505685483,
|
||
|
|
"learning_rate": 9.120710922484088e-05,
|
||
|
|
"loss": 0.0801,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2179788996425146,
|
||
|
|
"grad_norm": 0.15854421340115443,
|
||
|
|
"learning_rate": 9.104649729102774e-05,
|
||
|
|
"loss": 0.0767,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2197227308396547,
|
||
|
|
"grad_norm": 0.10152239382236611,
|
||
|
|
"learning_rate": 9.088457583544021e-05,
|
||
|
|
"loss": 0.0793,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22146656203679485,
|
||
|
|
"grad_norm": 0.09311072879845227,
|
||
|
|
"learning_rate": 9.072135002391911e-05,
|
||
|
|
"loss": 0.0808,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22321039323393496,
|
||
|
|
"grad_norm": 0.08102045640572195,
|
||
|
|
"learning_rate": 9.055682506391867e-05,
|
||
|
|
"loss": 0.0789,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22495422443107507,
|
||
|
|
"grad_norm": 0.11429382945365393,
|
||
|
|
"learning_rate": 9.039100620434025e-05,
|
||
|
|
"loss": 0.0809,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22669805562821518,
|
||
|
|
"grad_norm": 0.13272380745973458,
|
||
|
|
"learning_rate": 9.022389873536504e-05,
|
||
|
|
"loss": 0.0808,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2284418868253553,
|
||
|
|
"grad_norm": 0.1541801400684174,
|
||
|
|
"learning_rate": 9.005550798828522e-05,
|
||
|
|
"loss": 0.0781,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23018571802249543,
|
||
|
|
"grad_norm": 0.21341903886796226,
|
||
|
|
"learning_rate": 8.988583933533383e-05,
|
||
|
|
"loss": 0.0806,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23192954921963554,
|
||
|
|
"grad_norm": 0.1072377577262835,
|
||
|
|
"learning_rate": 8.971489818951348e-05,
|
||
|
|
"loss": 0.0815,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23367338041677566,
|
||
|
|
"grad_norm": 0.2539226009683929,
|
||
|
|
"learning_rate": 8.954269000442353e-05,
|
||
|
|
"loss": 0.0822,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23541721161391577,
|
||
|
|
"grad_norm": 0.0964733141512959,
|
||
|
|
"learning_rate": 8.936922027408618e-05,
|
||
|
|
"loss": 0.0802,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23716104281105588,
|
||
|
|
"grad_norm": 0.15344703114198005,
|
||
|
|
"learning_rate": 8.919449453277125e-05,
|
||
|
|
"loss": 0.0781,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23890487400819602,
|
||
|
|
"grad_norm": 0.13637749965223914,
|
||
|
|
"learning_rate": 8.901851835481946e-05,
|
||
|
|
"loss": 0.0768,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24064870520533613,
|
||
|
|
"grad_norm": 0.1439271667236502,
|
||
|
|
"learning_rate": 8.884129735446471e-05,
|
||
|
|
"loss": 0.0782,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24239253640247624,
|
||
|
|
"grad_norm": 0.10228448993969186,
|
||
|
|
"learning_rate": 8.866283718565497e-05,
|
||
|
|
"loss": 0.0802,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24413636759961635,
|
||
|
|
"grad_norm": 0.08357219560541651,
|
||
|
|
"learning_rate": 8.848314354187184e-05,
|
||
|
|
"loss": 0.0805,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24588019879675646,
|
||
|
|
"grad_norm": 0.12297612486088587,
|
||
|
|
"learning_rate": 8.83022221559489e-05,
|
||
|
|
"loss": 0.0807,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2476240299938966,
|
||
|
|
"grad_norm": 0.09213229043723821,
|
||
|
|
"learning_rate": 8.81200787998889e-05,
|
||
|
|
"loss": 0.0796,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2493678611910367,
|
||
|
|
"grad_norm": 0.23221554917761744,
|
||
|
|
"learning_rate": 8.793671928467953e-05,
|
||
|
|
"loss": 0.0795,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2511116923881768,
|
||
|
|
"grad_norm": 0.15255422092338392,
|
||
|
|
"learning_rate": 8.775214946010806e-05,
|
||
|
|
"loss": 0.0791,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25285552358531693,
|
||
|
|
"grad_norm": 0.13430089153547967,
|
||
|
|
"learning_rate": 8.756637521457472e-05,
|
||
|
|
"loss": 0.0796,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25459935478245704,
|
||
|
|
"grad_norm": 0.10858403401784206,
|
||
|
|
"learning_rate": 8.737940247490488e-05,
|
||
|
|
"loss": 0.0776,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25634318597959715,
|
||
|
|
"grad_norm": 0.1087205013394363,
|
||
|
|
"learning_rate": 8.71912372061598e-05,
|
||
|
|
"loss": 0.0764,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25808701717673727,
|
||
|
|
"grad_norm": 0.1801860214762826,
|
||
|
|
"learning_rate": 8.700188541144658e-05,
|
||
|
|
"loss": 0.0764,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25983084837387743,
|
||
|
|
"grad_norm": 0.1089107768906771,
|
||
|
|
"learning_rate": 8.68113531317264e-05,
|
||
|
|
"loss": 0.0801,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26157467957101754,
|
||
|
|
"grad_norm": 0.10862169809528116,
|
||
|
|
"learning_rate": 8.661964644562193e-05,
|
||
|
|
"loss": 0.0796,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26331851076815765,
|
||
|
|
"grad_norm": 0.08182932886020478,
|
||
|
|
"learning_rate": 8.64267714692234e-05,
|
||
|
|
"loss": 0.0784,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26506234196529777,
|
||
|
|
"grad_norm": 0.10745086100269909,
|
||
|
|
"learning_rate": 8.623273435589338e-05,
|
||
|
|
"loss": 0.0771,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2668061731624379,
|
||
|
|
"grad_norm": 0.0835190988442007,
|
||
|
|
"learning_rate": 8.603754129607055e-05,
|
||
|
|
"loss": 0.0785,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.268550004359578,
|
||
|
|
"grad_norm": 0.15373146227023327,
|
||
|
|
"learning_rate": 8.584119851707224e-05,
|
||
|
|
"loss": 0.0787,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2702938355567181,
|
||
|
|
"grad_norm": 0.16851363372769732,
|
||
|
|
"learning_rate": 8.564371228289562e-05,
|
||
|
|
"loss": 0.0782,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2720376667538582,
|
||
|
|
"grad_norm": 0.17341701223323236,
|
||
|
|
"learning_rate": 8.5445088894018e-05,
|
||
|
|
"loss": 0.0763,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2737814979509983,
|
||
|
|
"grad_norm": 0.15736312308266082,
|
||
|
|
"learning_rate": 8.524533468719568e-05,
|
||
|
|
"loss": 0.0778,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2755253291481385,
|
||
|
|
"grad_norm": 0.10603350184378946,
|
||
|
|
"learning_rate": 8.504445603526201e-05,
|
||
|
|
"loss": 0.0769,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2772691603452786,
|
||
|
|
"grad_norm": 0.10468271020039895,
|
||
|
|
"learning_rate": 8.484245934692379e-05,
|
||
|
|
"loss": 0.0764,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2790129915424187,
|
||
|
|
"grad_norm": 0.1148841481305709,
|
||
|
|
"learning_rate": 8.463935106655704e-05,
|
||
|
|
"loss": 0.0773,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2807568227395588,
|
||
|
|
"grad_norm": 0.0821350828260079,
|
||
|
|
"learning_rate": 8.443513767400127e-05,
|
||
|
|
"loss": 0.0755,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28250065393669893,
|
||
|
|
"grad_norm": 0.08498785215350724,
|
||
|
|
"learning_rate": 8.422982568435281e-05,
|
||
|
|
"loss": 0.0771,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28424448513383904,
|
||
|
|
"grad_norm": 0.08457375312974288,
|
||
|
|
"learning_rate": 8.4023421647757e-05,
|
||
|
|
"loss": 0.0766,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28598831633097915,
|
||
|
|
"grad_norm": 0.31067232704883735,
|
||
|
|
"learning_rate": 8.381593214919905e-05,
|
||
|
|
"loss": 0.0795,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28773214752811926,
|
||
|
|
"grad_norm": 0.08612116427881779,
|
||
|
|
"learning_rate": 8.360736380829419e-05,
|
||
|
|
"loss": 0.0785,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2894759787252594,
|
||
|
|
"grad_norm": 0.07915038596639898,
|
||
|
|
"learning_rate": 8.339772327907628e-05,
|
||
|
|
"loss": 0.076,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2912198099223995,
|
||
|
|
"grad_norm": 0.21458699675313742,
|
||
|
|
"learning_rate": 8.318701724978564e-05,
|
||
|
|
"loss": 0.0776,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29296364111953965,
|
||
|
|
"grad_norm": 0.14040279428045313,
|
||
|
|
"learning_rate": 8.29752524426556e-05,
|
||
|
|
"loss": 0.0767,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29470747231667976,
|
||
|
|
"grad_norm": 0.08754112074945274,
|
||
|
|
"learning_rate": 8.276243561369814e-05,
|
||
|
|
"loss": 0.0758,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2964513035138199,
|
||
|
|
"grad_norm": 0.10893709018139172,
|
||
|
|
"learning_rate": 8.254857355248824e-05,
|
||
|
|
"loss": 0.0747,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29819513471096,
|
||
|
|
"grad_norm": 0.06433336098088924,
|
||
|
|
"learning_rate": 8.233367308194734e-05,
|
||
|
|
"loss": 0.0774,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2999389659081001,
|
||
|
|
"grad_norm": 0.09370553245193815,
|
||
|
|
"learning_rate": 8.21177410581256e-05,
|
||
|
|
"loss": 0.0745,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3016827971052402,
|
||
|
|
"grad_norm": 0.1113308601153905,
|
||
|
|
"learning_rate": 8.190078436998326e-05,
|
||
|
|
"loss": 0.0742,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3034266283023803,
|
||
|
|
"grad_norm": 0.0845496702841715,
|
||
|
|
"learning_rate": 8.168280993917077e-05,
|
||
|
|
"loss": 0.0765,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30517045949952043,
|
||
|
|
"grad_norm": 0.07782628428869619,
|
||
|
|
"learning_rate": 8.146382471980803e-05,
|
||
|
|
"loss": 0.0764,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30691429069666054,
|
||
|
|
"grad_norm": 0.07272801486673855,
|
||
|
|
"learning_rate": 8.124383569826253e-05,
|
||
|
|
"loss": 0.0743,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3086581218938007,
|
||
|
|
"grad_norm": 0.08576203017796658,
|
||
|
|
"learning_rate": 8.102284989292638e-05,
|
||
|
|
"loss": 0.077,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3104019530909408,
|
||
|
|
"grad_norm": 0.06773514930909028,
|
||
|
|
"learning_rate": 8.080087435399249e-05,
|
||
|
|
"loss": 0.0739,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31214578428808093,
|
||
|
|
"grad_norm": 0.083064606175779,
|
||
|
|
"learning_rate": 8.057791616322959e-05,
|
||
|
|
"loss": 0.0755,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31388961548522104,
|
||
|
|
"grad_norm": 0.12916703368799123,
|
||
|
|
"learning_rate": 8.035398243375636e-05,
|
||
|
|
"loss": 0.0752,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31563344668236115,
|
||
|
|
"grad_norm": 0.1381342843732955,
|
||
|
|
"learning_rate": 8.012908030981441e-05,
|
||
|
|
"loss": 0.0784,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31737727787950126,
|
||
|
|
"grad_norm": 0.08149140898903617,
|
||
|
|
"learning_rate": 7.990321696654043e-05,
|
||
|
|
"loss": 0.0746,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3191211090766414,
|
||
|
|
"grad_norm": 0.09800398683738969,
|
||
|
|
"learning_rate": 7.967639960973726e-05,
|
||
|
|
"loss": 0.0768,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3208649402737815,
|
||
|
|
"grad_norm": 0.0720534859927381,
|
||
|
|
"learning_rate": 7.944863547564396e-05,
|
||
|
|
"loss": 0.0722,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3226087714709216,
|
||
|
|
"grad_norm": 0.07931259123296684,
|
||
|
|
"learning_rate": 7.921993183070498e-05,
|
||
|
|
"loss": 0.0733,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3243526026680617,
|
||
|
|
"grad_norm": 0.13837309451231886,
|
||
|
|
"learning_rate": 7.899029597133835e-05,
|
||
|
|
"loss": 0.0759,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3260964338652019,
|
||
|
|
"grad_norm": 0.08485683188152221,
|
||
|
|
"learning_rate": 7.875973522370293e-05,
|
||
|
|
"loss": 0.0754,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.327840265062342,
|
||
|
|
"grad_norm": 0.0772062101697051,
|
||
|
|
"learning_rate": 7.852825694346456e-05,
|
||
|
|
"loss": 0.0765,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3295840962594821,
|
||
|
|
"grad_norm": 0.089799828330404,
|
||
|
|
"learning_rate": 7.82958685155615e-05,
|
||
|
|
"loss": 0.0743,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3313279274566222,
|
||
|
|
"grad_norm": 0.10045562894572037,
|
||
|
|
"learning_rate": 7.806257735396878e-05,
|
||
|
|
"loss": 0.0754,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3330717586537623,
|
||
|
|
"grad_norm": 0.06500413003152958,
|
||
|
|
"learning_rate": 7.782839090146173e-05,
|
||
|
|
"loss": 0.073,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33481558985090243,
|
||
|
|
"grad_norm": 0.1259139302957018,
|
||
|
|
"learning_rate": 7.759331662937841e-05,
|
||
|
|
"loss": 0.0769,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33655942104804254,
|
||
|
|
"grad_norm": 0.11229045594501955,
|
||
|
|
"learning_rate": 7.735736203738138e-05,
|
||
|
|
"loss": 0.0737,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33830325224518265,
|
||
|
|
"grad_norm": 0.11020678621947962,
|
||
|
|
"learning_rate": 7.71205346532183e-05,
|
||
|
|
"loss": 0.0792,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34004708344232276,
|
||
|
|
"grad_norm": 0.07389223584844883,
|
||
|
|
"learning_rate": 7.688284203248196e-05,
|
||
|
|
"loss": 0.0754,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3417909146394629,
|
||
|
|
"grad_norm": 0.07106281995095357,
|
||
|
|
"learning_rate": 7.664429175836903e-05,
|
||
|
|
"loss": 0.0759,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34353474583660304,
|
||
|
|
"grad_norm": 0.09883942936733096,
|
||
|
|
"learning_rate": 7.64048914414382e-05,
|
||
|
|
"loss": 0.0763,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34527857703374315,
|
||
|
|
"grad_norm": 0.12548493474348643,
|
||
|
|
"learning_rate": 7.616464871936749e-05,
|
||
|
|
"loss": 0.0759,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34702240823088326,
|
||
|
|
"grad_norm": 0.10524371345048106,
|
||
|
|
"learning_rate": 7.592357125671039e-05,
|
||
|
|
"loss": 0.0734,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3487662394280234,
|
||
|
|
"grad_norm": 0.0989300552899626,
|
||
|
|
"learning_rate": 7.56816667446515e-05,
|
||
|
|
"loss": 0.0765,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3505100706251635,
|
||
|
|
"grad_norm": 0.06808863715144056,
|
||
|
|
"learning_rate": 7.543894290076103e-05,
|
||
|
|
"loss": 0.0747,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3522539018223036,
|
||
|
|
"grad_norm": 0.1241267483898317,
|
||
|
|
"learning_rate": 7.519540746874868e-05,
|
||
|
|
"loss": 0.0752,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3539977330194437,
|
||
|
|
"grad_norm": 0.12046241979127437,
|
||
|
|
"learning_rate": 7.495106821821655e-05,
|
||
|
|
"loss": 0.0753,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3557415642165838,
|
||
|
|
"grad_norm": 0.09115653788000529,
|
||
|
|
"learning_rate": 7.470593294441124e-05,
|
||
|
|
"loss": 0.0744,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3574853954137239,
|
||
|
|
"grad_norm": 0.13456888345338583,
|
||
|
|
"learning_rate": 7.44600094679752e-05,
|
||
|
|
"loss": 0.0738,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3592292266108641,
|
||
|
|
"grad_norm": 0.06806889038779979,
|
||
|
|
"learning_rate": 7.421330563469716e-05,
|
||
|
|
"loss": 0.0739,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3609730578080042,
|
||
|
|
"grad_norm": 0.11182927221959478,
|
||
|
|
"learning_rate": 7.396582931526193e-05,
|
||
|
|
"loss": 0.0735,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3627168890051443,
|
||
|
|
"grad_norm": 0.06913605337271556,
|
||
|
|
"learning_rate": 7.37175884049992e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3644607202022844,
|
||
|
|
"grad_norm": 0.06420773618946907,
|
||
|
|
"learning_rate": 7.346859082363171e-05,
|
||
|
|
"loss": 0.0745,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36620455139942454,
|
||
|
|
"grad_norm": 0.09270103993797899,
|
||
|
|
"learning_rate": 7.321884451502252e-05,
|
||
|
|
"loss": 0.072,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36794838259656465,
|
||
|
|
"grad_norm": 0.23156282710055895,
|
||
|
|
"learning_rate": 7.296835744692163e-05,
|
||
|
|
"loss": 0.0719,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36969221379370476,
|
||
|
|
"grad_norm": 0.07236089370303052,
|
||
|
|
"learning_rate": 7.271713761071181e-05,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37143604499084487,
|
||
|
|
"grad_norm": 0.0896827167833697,
|
||
|
|
"learning_rate": 7.246519302115355e-05,
|
||
|
|
"loss": 0.0721,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.373179876187985,
|
||
|
|
"grad_norm": 0.07998918062338511,
|
||
|
|
"learning_rate": 7.221253171612944e-05,
|
||
|
|
"loss": 0.0748,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3749237073851251,
|
||
|
|
"grad_norm": 0.08554261487613933,
|
||
|
|
"learning_rate": 7.195916175638772e-05,
|
||
|
|
"loss": 0.0733,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37666753858226526,
|
||
|
|
"grad_norm": 0.11129041716978767,
|
||
|
|
"learning_rate": 7.170509122528512e-05,
|
||
|
|
"loss": 0.075,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37841136977940537,
|
||
|
|
"grad_norm": 0.10613761388491819,
|
||
|
|
"learning_rate": 7.14503282285289e-05,
|
||
|
|
"loss": 0.0757,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3801552009765455,
|
||
|
|
"grad_norm": 0.11019197584016041,
|
||
|
|
"learning_rate": 7.119488089391835e-05,
|
||
|
|
"loss": 0.0744,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3818990321736856,
|
||
|
|
"grad_norm": 0.06347022683878861,
|
||
|
|
"learning_rate": 7.093875737108549e-05,
|
||
|
|
"loss": 0.0719,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3836428633708257,
|
||
|
|
"grad_norm": 0.21254740042731002,
|
||
|
|
"learning_rate": 7.068196583123496e-05,
|
||
|
|
"loss": 0.0748,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3853866945679658,
|
||
|
|
"grad_norm": 0.07496306455978158,
|
||
|
|
"learning_rate": 7.042451446688342e-05,
|
||
|
|
"loss": 0.0746,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3871305257651059,
|
||
|
|
"grad_norm": 0.3232649711858286,
|
||
|
|
"learning_rate": 7.016641149159815e-05,
|
||
|
|
"loss": 0.0717,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38887435696224604,
|
||
|
|
"grad_norm": 0.11723985352277601,
|
||
|
|
"learning_rate": 6.990766513973503e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39061818815938615,
|
||
|
|
"grad_norm": 0.06703767679829058,
|
||
|
|
"learning_rate": 6.964828366617583e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3923620193565263,
|
||
|
|
"grad_norm": 0.06452655503833808,
|
||
|
|
"learning_rate": 6.938827534606483e-05,
|
||
|
|
"loss": 0.0726,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3941058505536664,
|
||
|
|
"grad_norm": 0.11182644422545931,
|
||
|
|
"learning_rate": 6.912764847454485e-05,
|
||
|
|
"loss": 0.0738,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39584968175080654,
|
||
|
|
"grad_norm": 0.0860141601894516,
|
||
|
|
"learning_rate": 6.886641136649255e-05,
|
||
|
|
"loss": 0.073,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39759351294794665,
|
||
|
|
"grad_norm": 0.0797656680775507,
|
||
|
|
"learning_rate": 6.860457235625322e-05,
|
||
|
|
"loss": 0.0737,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39933734414508676,
|
||
|
|
"grad_norm": 0.10052131057107375,
|
||
|
|
"learning_rate": 6.834213979737487e-05,
|
||
|
|
"loss": 0.0733,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40108117534222687,
|
||
|
|
"grad_norm": 0.10115391466030552,
|
||
|
|
"learning_rate": 6.807912206234168e-05,
|
||
|
|
"loss": 0.0763,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.402825006539367,
|
||
|
|
"grad_norm": 0.06133679525617129,
|
||
|
|
"learning_rate": 6.7815527542307e-05,
|
||
|
|
"loss": 0.0742,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4045688377365071,
|
||
|
|
"grad_norm": 0.09619739127280517,
|
||
|
|
"learning_rate": 6.755136464682545e-05,
|
||
|
|
"loss": 0.0725,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4063126689336472,
|
||
|
|
"grad_norm": 0.09091646569455752,
|
||
|
|
"learning_rate": 6.728664180358487e-05,
|
||
|
|
"loss": 0.0721,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4080565001307873,
|
||
|
|
"grad_norm": 0.07656697054608402,
|
||
|
|
"learning_rate": 6.702136745813721e-05,
|
||
|
|
"loss": 0.074,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4098003313279275,
|
||
|
|
"grad_norm": 0.07204279463021213,
|
||
|
|
"learning_rate": 6.67555500736293e-05,
|
||
|
|
"loss": 0.073,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4115441625250676,
|
||
|
|
"grad_norm": 0.09813297810926205,
|
||
|
|
"learning_rate": 6.648919813053266e-05,
|
||
|
|
"loss": 0.0718,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4132879937222077,
|
||
|
|
"grad_norm": 0.09532683884750826,
|
||
|
|
"learning_rate": 6.62223201263731e-05,
|
||
|
|
"loss": 0.0722,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4150318249193478,
|
||
|
|
"grad_norm": 0.14719992956290695,
|
||
|
|
"learning_rate": 6.595492457545953e-05,
|
||
|
|
"loss": 0.0721,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4167756561164879,
|
||
|
|
"grad_norm": 0.10011053762401607,
|
||
|
|
"learning_rate": 6.568702000861234e-05,
|
||
|
|
"loss": 0.0735,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41851948731362804,
|
||
|
|
"grad_norm": 0.08517512639021824,
|
||
|
|
"learning_rate": 6.541861497289126e-05,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42026331851076815,
|
||
|
|
"grad_norm": 0.1059116573619961,
|
||
|
|
"learning_rate": 6.514971803132264e-05,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42200714970790826,
|
||
|
|
"grad_norm": 0.07521574766408227,
|
||
|
|
"learning_rate": 6.488033776262631e-05,
|
||
|
|
"loss": 0.0746,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42375098090504837,
|
||
|
|
"grad_norm": 0.07789931436910726,
|
||
|
|
"learning_rate": 6.461048276094189e-05,
|
||
|
|
"loss": 0.0705,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42549481210218854,
|
||
|
|
"grad_norm": 0.07966418822617252,
|
||
|
|
"learning_rate": 6.434016163555452e-05,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42723864329932865,
|
||
|
|
"grad_norm": 0.06805896335096397,
|
||
|
|
"learning_rate": 6.406938301062032e-05,
|
||
|
|
"loss": 0.0726,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42898247449646876,
|
||
|
|
"grad_norm": 0.09447753229565763,
|
||
|
|
"learning_rate": 6.379815552489112e-05,
|
||
|
|
"loss": 0.0706,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43072630569360887,
|
||
|
|
"grad_norm": 0.08285112104375023,
|
||
|
|
"learning_rate": 6.352648783143904e-05,
|
||
|
|
"loss": 0.0717,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.432470136890749,
|
||
|
|
"grad_norm": 0.13098455739480758,
|
||
|
|
"learning_rate": 6.325438859738016e-05,
|
||
|
|
"loss": 0.0729,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4342139680878891,
|
||
|
|
"grad_norm": 0.08270950207818945,
|
||
|
|
"learning_rate": 6.298186650359832e-05,
|
||
|
|
"loss": 0.0707,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4359577992850292,
|
||
|
|
"grad_norm": 0.07129608509123005,
|
||
|
|
"learning_rate": 6.270893024446788e-05,
|
||
|
|
"loss": 0.0722,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4377016304821693,
|
||
|
|
"grad_norm": 0.10000820899680192,
|
||
|
|
"learning_rate": 6.243558852757653e-05,
|
||
|
|
"loss": 0.0705,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4394454616793094,
|
||
|
|
"grad_norm": 0.10373969721399491,
|
||
|
|
"learning_rate": 6.216185007344744e-05,
|
||
|
|
"loss": 0.0733,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44118929287644953,
|
||
|
|
"grad_norm": 0.10034693367567056,
|
||
|
|
"learning_rate": 6.188772361526104e-05,
|
||
|
|
"loss": 0.0735,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4429331240735897,
|
||
|
|
"grad_norm": 0.07190792039286192,
|
||
|
|
"learning_rate": 6.161321789857635e-05,
|
||
|
|
"loss": 0.0732,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4446769552707298,
|
||
|
|
"grad_norm": 0.06736312666924092,
|
||
|
|
"learning_rate": 6.133834168105206e-05,
|
||
|
|
"loss": 0.0718,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4464207864678699,
|
||
|
|
"grad_norm": 0.08694623472328794,
|
||
|
|
"learning_rate": 6.106310373216706e-05,
|
||
|
|
"loss": 0.0715,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44816461766501003,
|
||
|
|
"grad_norm": 0.06751618527562933,
|
||
|
|
"learning_rate": 6.078751283294075e-05,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44990844886215015,
|
||
|
|
"grad_norm": 0.07166164012759571,
|
||
|
|
"learning_rate": 6.051157777565274e-05,
|
||
|
|
"loss": 0.0719,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45165228005929026,
|
||
|
|
"grad_norm": 0.07205363465786559,
|
||
|
|
"learning_rate": 6.023530736356252e-05,
|
||
|
|
"loss": 0.0711,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45339611125643037,
|
||
|
|
"grad_norm": 0.06726674291216705,
|
||
|
|
"learning_rate": 5.9958710410628515e-05,
|
||
|
|
"loss": 0.0731,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4551399424535705,
|
||
|
|
"grad_norm": 0.1362051473345596,
|
||
|
|
"learning_rate": 5.96817957412269e-05,
|
||
|
|
"loss": 0.0692,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4568837736507106,
|
||
|
|
"grad_norm": 0.07049177639671635,
|
||
|
|
"learning_rate": 5.940457218987003e-05,
|
||
|
|
"loss": 0.0703,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4586276048478507,
|
||
|
|
"grad_norm": 0.09112289372540164,
|
||
|
|
"learning_rate": 5.912704860092473e-05,
|
||
|
|
"loss": 0.0714,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46037143604499087,
|
||
|
|
"grad_norm": 0.11655945219337893,
|
||
|
|
"learning_rate": 5.884923382832996e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.462115267242131,
|
||
|
|
"grad_norm": 0.0768073584235417,
|
||
|
|
"learning_rate": 5.8571136735314456e-05,
|
||
|
|
"loss": 0.0712,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4638590984392711,
|
||
|
|
"grad_norm": 0.10315448855852374,
|
||
|
|
"learning_rate": 5.829276619411392e-05,
|
||
|
|
"loss": 0.0693,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4656029296364112,
|
||
|
|
"grad_norm": 0.06863388845145796,
|
||
|
|
"learning_rate": 5.801413108568797e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4673467608335513,
|
||
|
|
"grad_norm": 0.07431300580022854,
|
||
|
|
"learning_rate": 5.773524029943682e-05,
|
||
|
|
"loss": 0.0723,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4690905920306914,
|
||
|
|
"grad_norm": 0.0678468394020785,
|
||
|
|
"learning_rate": 5.745610273291766e-05,
|
||
|
|
"loss": 0.0697,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47083442322783153,
|
||
|
|
"grad_norm": 0.051461083413721374,
|
||
|
|
"learning_rate": 5.7176727291560814e-05,
|
||
|
|
"loss": 0.0692,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47257825442497164,
|
||
|
|
"grad_norm": 0.3637961443238326,
|
||
|
|
"learning_rate": 5.689712288838561e-05,
|
||
|
|
"loss": 0.0708,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47432208562211176,
|
||
|
|
"grad_norm": 0.12899584659018307,
|
||
|
|
"learning_rate": 5.661729844371601e-05,
|
||
|
|
"loss": 0.0717,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4760659168192519,
|
||
|
|
"grad_norm": 0.0970116230063514,
|
||
|
|
"learning_rate": 5.633726288489609e-05,
|
||
|
|
"loss": 0.0702,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47780974801639203,
|
||
|
|
"grad_norm": 0.06925056867854305,
|
||
|
|
"learning_rate": 5.6057025146005126e-05,
|
||
|
|
"loss": 0.0694,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47955357921353214,
|
||
|
|
"grad_norm": 0.09518941391005381,
|
||
|
|
"learning_rate": 5.577659416757267e-05,
|
||
|
|
"loss": 0.0693,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48129741041067226,
|
||
|
|
"grad_norm": 0.0688787651664979,
|
||
|
|
"learning_rate": 5.5495978896293244e-05,
|
||
|
|
"loss": 0.0685,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48304124160781237,
|
||
|
|
"grad_norm": 0.09890275204254635,
|
||
|
|
"learning_rate": 5.521518828474091e-05,
|
||
|
|
"loss": 0.0712,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4847850728049525,
|
||
|
|
"grad_norm": 0.07029397820447766,
|
||
|
|
"learning_rate": 5.4934231291083724e-05,
|
||
|
|
"loss": 0.0705,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4865289040020926,
|
||
|
|
"grad_norm": 0.114703628881282,
|
||
|
|
"learning_rate": 5.465311687879785e-05,
|
||
|
|
"loss": 0.0728,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4882727351992327,
|
||
|
|
"grad_norm": 0.06900079674714125,
|
||
|
|
"learning_rate": 5.4371854016381686e-05,
|
||
|
|
"loss": 0.0693,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4900165663963728,
|
||
|
|
"grad_norm": 0.07064873333381244,
|
||
|
|
"learning_rate": 5.409045167706962e-05,
|
||
|
|
"loss": 0.069,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4917603975935129,
|
||
|
|
"grad_norm": 0.09125097079525542,
|
||
|
|
"learning_rate": 5.380891883854591e-05,
|
||
|
|
"loss": 0.0719,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4935042287906531,
|
||
|
|
"grad_norm": 0.053404142344694504,
|
||
|
|
"learning_rate": 5.352726448265808e-05,
|
||
|
|
"loss": 0.0716,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4952480599877932,
|
||
|
|
"grad_norm": 0.07962910381863947,
|
||
|
|
"learning_rate": 5.3245497595130575e-05,
|
||
|
|
"loss": 0.0702,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4969918911849333,
|
||
|
|
"grad_norm": 0.08362471604956066,
|
||
|
|
"learning_rate": 5.296362716527788e-05,
|
||
|
|
"loss": 0.0699,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4987357223820734,
|
||
|
|
"grad_norm": 0.06307457307002638,
|
||
|
|
"learning_rate": 5.268166218571792e-05,
|
||
|
|
"loss": 0.0687,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5004795535792136,
|
||
|
|
"grad_norm": 0.11296321834314839,
|
||
|
|
"learning_rate": 5.239961165208499e-05,
|
||
|
|
"loss": 0.069,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5022233847763536,
|
||
|
|
"grad_norm": 0.1053843511995927,
|
||
|
|
"learning_rate": 5.211748456274291e-05,
|
||
|
|
"loss": 0.0691,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5039672159734938,
|
||
|
|
"grad_norm": 0.1109187673591365,
|
||
|
|
"learning_rate": 5.183528991849784e-05,
|
||
|
|
"loss": 0.0704,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5057110471706339,
|
||
|
|
"grad_norm": 0.055240959366235146,
|
||
|
|
"learning_rate": 5.155303672231123e-05,
|
||
|
|
"loss": 0.0689,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.507454878367774,
|
||
|
|
"grad_norm": 0.062304367575237746,
|
||
|
|
"learning_rate": 5.127073397901248e-05,
|
||
|
|
"loss": 0.0715,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5091987095649141,
|
||
|
|
"grad_norm": 0.09104354683280115,
|
||
|
|
"learning_rate": 5.09883906950117e-05,
|
||
|
|
"loss": 0.0715,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5109425407620543,
|
||
|
|
"grad_norm": 0.09335898355116179,
|
||
|
|
"learning_rate": 5.070601587801246e-05,
|
||
|
|
"loss": 0.0721,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5126863719591943,
|
||
|
|
"grad_norm": 0.1212833028716344,
|
||
|
|
"learning_rate": 5.042361853672428e-05,
|
||
|
|
"loss": 0.0688,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5144302031563345,
|
||
|
|
"grad_norm": 0.1267863802852585,
|
||
|
|
"learning_rate": 5.0141207680575265e-05,
|
||
|
|
"loss": 0.0692,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5161740343534745,
|
||
|
|
"grad_norm": 0.13749347686727084,
|
||
|
|
"learning_rate": 4.985879231942474e-05,
|
||
|
|
"loss": 0.069,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5179178655506147,
|
||
|
|
"grad_norm": 0.08673837874814297,
|
||
|
|
"learning_rate": 4.957638146327574e-05,
|
||
|
|
"loss": 0.0694,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5196616967477549,
|
||
|
|
"grad_norm": 0.09636753443359779,
|
||
|
|
"learning_rate": 4.929398412198755e-05,
|
||
|
|
"loss": 0.0703,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5214055279448949,
|
||
|
|
"grad_norm": 0.06507262712865346,
|
||
|
|
"learning_rate": 4.9011609304988295e-05,
|
||
|
|
"loss": 0.0683,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5231493591420351,
|
||
|
|
"grad_norm": 0.06619223766291939,
|
||
|
|
"learning_rate": 4.8729266020987553e-05,
|
||
|
|
"loss": 0.0677,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5248931903391751,
|
||
|
|
"grad_norm": 0.05287355674779802,
|
||
|
|
"learning_rate": 4.844696327768878e-05,
|
||
|
|
"loss": 0.0698,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5266370215363153,
|
||
|
|
"grad_norm": 0.10505577067699386,
|
||
|
|
"learning_rate": 4.8164710081502165e-05,
|
||
|
|
"loss": 0.0687,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5283808527334554,
|
||
|
|
"grad_norm": 0.056820349432236315,
|
||
|
|
"learning_rate": 4.788251543725711e-05,
|
||
|
|
"loss": 0.0699,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5301246839305955,
|
||
|
|
"grad_norm": 0.1260634689291685,
|
||
|
|
"learning_rate": 4.760038834791503e-05,
|
||
|
|
"loss": 0.0673,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5318685151277356,
|
||
|
|
"grad_norm": 0.0968264946104606,
|
||
|
|
"learning_rate": 4.7318337814282085e-05,
|
||
|
|
"loss": 0.0674,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5336123463248758,
|
||
|
|
"grad_norm": 0.085559283784732,
|
||
|
|
"learning_rate": 4.703637283472213e-05,
|
||
|
|
"loss": 0.0714,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5353561775220159,
|
||
|
|
"grad_norm": 0.09957416211753474,
|
||
|
|
"learning_rate": 4.675450240486943e-05,
|
||
|
|
"loss": 0.0706,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.537100008719156,
|
||
|
|
"grad_norm": 0.06932794550036621,
|
||
|
|
"learning_rate": 4.647273551734192e-05,
|
||
|
|
"loss": 0.0706,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5388438399162961,
|
||
|
|
"grad_norm": 0.0959577561976307,
|
||
|
|
"learning_rate": 4.619108116145411e-05,
|
||
|
|
"loss": 0.0686,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5405876711134362,
|
||
|
|
"grad_norm": 0.11053090195002527,
|
||
|
|
"learning_rate": 4.5909548322930386e-05,
|
||
|
|
"loss": 0.0685,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5423315023105764,
|
||
|
|
"grad_norm": 0.1385181499715123,
|
||
|
|
"learning_rate": 4.562814598361834e-05,
|
||
|
|
"loss": 0.069,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5440753335077164,
|
||
|
|
"grad_norm": 0.0860474878686007,
|
||
|
|
"learning_rate": 4.534688312120215e-05,
|
||
|
|
"loss": 0.0719,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5458191647048566,
|
||
|
|
"grad_norm": 0.06367906769350236,
|
||
|
|
"learning_rate": 4.506576870891628e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5475629959019966,
|
||
|
|
"grad_norm": 0.056417791244654396,
|
||
|
|
"learning_rate": 4.478481171525909e-05,
|
||
|
|
"loss": 0.0696,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5493068270991368,
|
||
|
|
"grad_norm": 0.07510052439666184,
|
||
|
|
"learning_rate": 4.450402110370677e-05,
|
||
|
|
"loss": 0.07,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.551050658296277,
|
||
|
|
"grad_norm": 0.07820484635095121,
|
||
|
|
"learning_rate": 4.422340583242733e-05,
|
||
|
|
"loss": 0.071,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.552794489493417,
|
||
|
|
"grad_norm": 0.1104677720885283,
|
||
|
|
"learning_rate": 4.3942974853994885e-05,
|
||
|
|
"loss": 0.0688,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5545383206905572,
|
||
|
|
"grad_norm": 0.09272002167916459,
|
||
|
|
"learning_rate": 4.366273711510392e-05,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5562821518876973,
|
||
|
|
"grad_norm": 0.1036837406322013,
|
||
|
|
"learning_rate": 4.3382701556284006e-05,
|
||
|
|
"loss": 0.0677,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5580259830848374,
|
||
|
|
"grad_norm": 0.07390053294467375,
|
||
|
|
"learning_rate": 4.3102877111614406e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5597698142819775,
|
||
|
|
"grad_norm": 0.10154107392434804,
|
||
|
|
"learning_rate": 4.282327270843919e-05,
|
||
|
|
"loss": 0.0678,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5615136454791176,
|
||
|
|
"grad_norm": 0.05726446240521516,
|
||
|
|
"learning_rate": 4.2543897267082346e-05,
|
||
|
|
"loss": 0.0704,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5632574766762577,
|
||
|
|
"grad_norm": 0.07584421387499146,
|
||
|
|
"learning_rate": 4.226475970056319e-05,
|
||
|
|
"loss": 0.0682,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5650013078733979,
|
||
|
|
"grad_norm": 0.12385055066323168,
|
||
|
|
"learning_rate": 4.1985868914312035e-05,
|
||
|
|
"loss": 0.0671,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5667451390705379,
|
||
|
|
"grad_norm": 0.057812117646785455,
|
||
|
|
"learning_rate": 4.1707233805886096e-05,
|
||
|
|
"loss": 0.0673,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5684889702676781,
|
||
|
|
"grad_norm": 0.05830094610213767,
|
||
|
|
"learning_rate": 4.1428863264685556e-05,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5702328014648183,
|
||
|
|
"grad_norm": 0.08814953133237691,
|
||
|
|
"learning_rate": 4.1150766171670044e-05,
|
||
|
|
"loss": 0.0677,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5719766326619583,
|
||
|
|
"grad_norm": 0.08651430164476416,
|
||
|
|
"learning_rate": 4.087295139907528e-05,
|
||
|
|
"loss": 0.0677,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5737204638590985,
|
||
|
|
"grad_norm": 0.23832741853388886,
|
||
|
|
"learning_rate": 4.059542781012998e-05,
|
||
|
|
"loss": 0.0678,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5754642950562385,
|
||
|
|
"grad_norm": 0.05885238799413647,
|
||
|
|
"learning_rate": 4.0318204258773126e-05,
|
||
|
|
"loss": 0.0684,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5772081262533787,
|
||
|
|
"grad_norm": 0.07791148857337749,
|
||
|
|
"learning_rate": 4.00412895893715e-05,
|
||
|
|
"loss": 0.0687,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5789519574505188,
|
||
|
|
"grad_norm": 0.06786486116480168,
|
||
|
|
"learning_rate": 3.9764692636437484e-05,
|
||
|
|
"loss": 0.0679,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5806957886476589,
|
||
|
|
"grad_norm": 0.05788405804070718,
|
||
|
|
"learning_rate": 3.948842222434728e-05,
|
||
|
|
"loss": 0.0697,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.582439619844799,
|
||
|
|
"grad_norm": 0.06948825516042008,
|
||
|
|
"learning_rate": 3.921248716705927e-05,
|
||
|
|
"loss": 0.067,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5841834510419391,
|
||
|
|
"grad_norm": 0.09718375497294167,
|
||
|
|
"learning_rate": 3.8936896267832935e-05,
|
||
|
|
"loss": 0.0671,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5859272822390793,
|
||
|
|
"grad_norm": 0.32522050061514907,
|
||
|
|
"learning_rate": 3.866165831894796e-05,
|
||
|
|
"loss": 0.0688,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5876711134362194,
|
||
|
|
"grad_norm": 0.09966939851122777,
|
||
|
|
"learning_rate": 3.8386782101423665e-05,
|
||
|
|
"loss": 0.0688,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5894149446333595,
|
||
|
|
"grad_norm": 0.1063520167218931,
|
||
|
|
"learning_rate": 3.811227638473897e-05,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5911587758304996,
|
||
|
|
"grad_norm": 0.07790458741475327,
|
||
|
|
"learning_rate": 3.783814992655256e-05,
|
||
|
|
"loss": 0.0694,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5929026070276397,
|
||
|
|
"grad_norm": 0.07766471726971445,
|
||
|
|
"learning_rate": 3.7564411472423464e-05,
|
||
|
|
"loss": 0.0696,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5946464382247798,
|
||
|
|
"grad_norm": 0.060585652583308065,
|
||
|
|
"learning_rate": 3.729106975553214e-05,
|
||
|
|
"loss": 0.0659,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.59639026942192,
|
||
|
|
"grad_norm": 0.1309180860317793,
|
||
|
|
"learning_rate": 3.701813349640169e-05,
|
||
|
|
"loss": 0.0662,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.59813410061906,
|
||
|
|
"grad_norm": 0.13278490095026282,
|
||
|
|
"learning_rate": 3.674561140261983e-05,
|
||
|
|
"loss": 0.0675,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5998779318162002,
|
||
|
|
"grad_norm": 0.08819239845829921,
|
||
|
|
"learning_rate": 3.647351216856099e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6016217630133404,
|
||
|
|
"grad_norm": 0.0694256875185968,
|
||
|
|
"learning_rate": 3.620184447510888e-05,
|
||
|
|
"loss": 0.0691,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6033655942104804,
|
||
|
|
"grad_norm": 0.05045493054224779,
|
||
|
|
"learning_rate": 3.5930616989379695e-05,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6051094254076206,
|
||
|
|
"grad_norm": 0.05387040717024746,
|
||
|
|
"learning_rate": 3.5659838364445505e-05,
|
||
|
|
"loss": 0.0672,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6068532566047606,
|
||
|
|
"grad_norm": 0.05687136811369198,
|
||
|
|
"learning_rate": 3.5389517239058126e-05,
|
||
|
|
"loss": 0.0665,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6085970878019008,
|
||
|
|
"grad_norm": 0.17637553404307893,
|
||
|
|
"learning_rate": 3.511966223737368e-05,
|
||
|
|
"loss": 0.0676,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6103409189990409,
|
||
|
|
"grad_norm": 0.09417795829049438,
|
||
|
|
"learning_rate": 3.485028196867738e-05,
|
||
|
|
"loss": 0.0656,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.612084750196181,
|
||
|
|
"grad_norm": 0.09675688928564037,
|
||
|
|
"learning_rate": 3.458138502710876e-05,
|
||
|
|
"loss": 0.0662,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6138285813933211,
|
||
|
|
"grad_norm": 0.06982987968601855,
|
||
|
|
"learning_rate": 3.431297999138768e-05,
|
||
|
|
"loss": 0.0669,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6155724125904612,
|
||
|
|
"grad_norm": 0.060806762419563676,
|
||
|
|
"learning_rate": 3.4045075424540484e-05,
|
||
|
|
"loss": 0.0665,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6173162437876014,
|
||
|
|
"grad_norm": 0.057810486903244925,
|
||
|
|
"learning_rate": 3.37776798736269e-05,
|
||
|
|
"loss": 0.0687,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6190600749847415,
|
||
|
|
"grad_norm": 0.09052003071252049,
|
||
|
|
"learning_rate": 3.3510801869467354e-05,
|
||
|
|
"loss": 0.0687,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6208039061818816,
|
||
|
|
"grad_norm": 0.05915583777230159,
|
||
|
|
"learning_rate": 3.324444992637071e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6225477373790217,
|
||
|
|
"grad_norm": 0.056903572931067674,
|
||
|
|
"learning_rate": 3.297863254186279e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6242915685761619,
|
||
|
|
"grad_norm": 0.07457729627076666,
|
||
|
|
"learning_rate": 3.2713358196415146e-05,
|
||
|
|
"loss": 0.067,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6260353997733019,
|
||
|
|
"grad_norm": 0.0688133988676,
|
||
|
|
"learning_rate": 3.244863535317455e-05,
|
||
|
|
"loss": 0.0664,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6277792309704421,
|
||
|
|
"grad_norm": 0.08316153717072015,
|
||
|
|
"learning_rate": 3.2184472457693006e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6295230621675821,
|
||
|
|
"grad_norm": 0.07001307487762984,
|
||
|
|
"learning_rate": 3.192087793765832e-05,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6312668933647223,
|
||
|
|
"grad_norm": 0.15620567579347566,
|
||
|
|
"learning_rate": 3.1657860202625146e-05,
|
||
|
|
"loss": 0.0701,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6330107245618624,
|
||
|
|
"grad_norm": 0.10373195476410263,
|
||
|
|
"learning_rate": 3.1395427643746796e-05,
|
||
|
|
"loss": 0.0668,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6347545557590025,
|
||
|
|
"grad_norm": 0.0734675356947609,
|
||
|
|
"learning_rate": 3.113358863350747e-05,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6364983869561427,
|
||
|
|
"grad_norm": 0.07535325822125089,
|
||
|
|
"learning_rate": 3.0872351525455166e-05,
|
||
|
|
"loss": 0.0652,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6382422181532827,
|
||
|
|
"grad_norm": 0.15252639717160144,
|
||
|
|
"learning_rate": 3.061172465393518e-05,
|
||
|
|
"loss": 0.0656,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6399860493504229,
|
||
|
|
"grad_norm": 0.052833579208426094,
|
||
|
|
"learning_rate": 3.035171633382419e-05,
|
||
|
|
"loss": 0.0658,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.641729880547563,
|
||
|
|
"grad_norm": 0.14292022401062196,
|
||
|
|
"learning_rate": 3.009233486026497e-05,
|
||
|
|
"loss": 0.0664,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6434737117447031,
|
||
|
|
"grad_norm": 0.10589992815491318,
|
||
|
|
"learning_rate": 2.9833588508401866e-05,
|
||
|
|
"loss": 0.0657,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6452175429418432,
|
||
|
|
"grad_norm": 0.04779313436976461,
|
||
|
|
"learning_rate": 2.9575485533116598e-05,
|
||
|
|
"loss": 0.0653,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6469613741389834,
|
||
|
|
"grad_norm": 0.05400461509544075,
|
||
|
|
"learning_rate": 2.9318034168765046e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6487052053361234,
|
||
|
|
"grad_norm": 0.059519586489227526,
|
||
|
|
"learning_rate": 2.906124262891451e-05,
|
||
|
|
"loss": 0.0672,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6504490365332636,
|
||
|
|
"grad_norm": 0.0648463847094917,
|
||
|
|
"learning_rate": 2.880511910608164e-05,
|
||
|
|
"loss": 0.0679,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6521928677304037,
|
||
|
|
"grad_norm": 0.1183811255273542,
|
||
|
|
"learning_rate": 2.8549671771471133e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6539366989275438,
|
||
|
|
"grad_norm": 0.05476695117570433,
|
||
|
|
"learning_rate": 2.829490877471491e-05,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.655680530124684,
|
||
|
|
"grad_norm": 0.06820911786107267,
|
||
|
|
"learning_rate": 2.8040838243612288e-05,
|
||
|
|
"loss": 0.0665,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.657424361321824,
|
||
|
|
"grad_norm": 0.10776643187307983,
|
||
|
|
"learning_rate": 2.7787468283870577e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6591681925189642,
|
||
|
|
"grad_norm": 0.11186248154441321,
|
||
|
|
"learning_rate": 2.7534806978846465e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6609120237161042,
|
||
|
|
"grad_norm": 0.11155104114763678,
|
||
|
|
"learning_rate": 2.7282862389288206e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6626558549132444,
|
||
|
|
"grad_norm": 0.08353799399740781,
|
||
|
|
"learning_rate": 2.7031642553078374e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6643996861103845,
|
||
|
|
"grad_norm": 0.1252138214848371,
|
||
|
|
"learning_rate": 2.6781155484977493e-05,
|
||
|
|
"loss": 0.0659,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6661435173075246,
|
||
|
|
"grad_norm": 0.11677307876369836,
|
||
|
|
"learning_rate": 2.6531409176368295e-05,
|
||
|
|
"loss": 0.0652,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6678873485046648,
|
||
|
|
"grad_norm": 0.13081371005512618,
|
||
|
|
"learning_rate": 2.6282411595000812e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6696311797018049,
|
||
|
|
"grad_norm": 0.09064110239185938,
|
||
|
|
"learning_rate": 2.6034170684738064e-05,
|
||
|
|
"loss": 0.0657,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.671375010898945,
|
||
|
|
"grad_norm": 0.10497326491021886,
|
||
|
|
"learning_rate": 2.5786694365302856e-05,
|
||
|
|
"loss": 0.0662,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6731188420960851,
|
||
|
|
"grad_norm": 0.1894881130574952,
|
||
|
|
"learning_rate": 2.5539990532024825e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6748626732932252,
|
||
|
|
"grad_norm": 0.11340703925871584,
|
||
|
|
"learning_rate": 2.5294067055588765e-05,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6766065044903653,
|
||
|
|
"grad_norm": 0.0674795553477859,
|
||
|
|
"learning_rate": 2.5048931781783456e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6783503356875055,
|
||
|
|
"grad_norm": 0.06649329429370544,
|
||
|
|
"learning_rate": 2.480459253125132e-05,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6800941668846455,
|
||
|
|
"grad_norm": 0.098929499241832,
|
||
|
|
"learning_rate": 2.456105709923897e-05,
|
||
|
|
"loss": 0.0651,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6818379980817857,
|
||
|
|
"grad_norm": 0.06386861210237169,
|
||
|
|
"learning_rate": 2.4318333255348525e-05,
|
||
|
|
"loss": 0.0676,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6835818292789257,
|
||
|
|
"grad_norm": 0.08662939273115117,
|
||
|
|
"learning_rate": 2.4076428743289608e-05,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6853256604760659,
|
||
|
|
"grad_norm": 0.1405260835479582,
|
||
|
|
"learning_rate": 2.3835351280632513e-05,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6870694916732061,
|
||
|
|
"grad_norm": 0.07786293851164922,
|
||
|
|
"learning_rate": 2.3595108558561812e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6888133228703461,
|
||
|
|
"grad_norm": 0.08244123534587308,
|
||
|
|
"learning_rate": 2.3355708241630998e-05,
|
||
|
|
"loss": 0.0644,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6905571540674863,
|
||
|
|
"grad_norm": 0.08600292121296654,
|
||
|
|
"learning_rate": 2.311715796751805e-05,
|
||
|
|
"loss": 0.0696,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6923009852646264,
|
||
|
|
"grad_norm": 0.06809070228006706,
|
||
|
|
"learning_rate": 2.2879465346781703e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6940448164617665,
|
||
|
|
"grad_norm": 0.10596061042299985,
|
||
|
|
"learning_rate": 2.264263796261864e-05,
|
||
|
|
"loss": 0.0662,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6957886476589066,
|
||
|
|
"grad_norm": 0.05600677052268759,
|
||
|
|
"learning_rate": 2.2406683370621618e-05,
|
||
|
|
"loss": 0.065,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6975324788560467,
|
||
|
|
"grad_norm": 0.12350042680040363,
|
||
|
|
"learning_rate": 2.2171609098538278e-05,
|
||
|
|
"loss": 0.0674,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6992763100531868,
|
||
|
|
"grad_norm": 0.08609523107160226,
|
||
|
|
"learning_rate": 2.1937422646031214e-05,
|
||
|
|
"loss": 0.0672,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.701020141250327,
|
||
|
|
"grad_norm": 0.06523963542097193,
|
||
|
|
"learning_rate": 2.170413148443852e-05,
|
||
|
|
"loss": 0.0645,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7027639724474671,
|
||
|
|
"grad_norm": 0.09613588330972808,
|
||
|
|
"learning_rate": 2.1471743056535455e-05,
|
||
|
|
"loss": 0.0655,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7045078036446072,
|
||
|
|
"grad_norm": 0.08438802353157454,
|
||
|
|
"learning_rate": 2.124026477629706e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7062516348417474,
|
||
|
|
"grad_norm": 0.0838100106180724,
|
||
|
|
"learning_rate": 2.100970402866164e-05,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7079954660388874,
|
||
|
|
"grad_norm": 0.048878932271227936,
|
||
|
|
"learning_rate": 2.0780068169295032e-05,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7097392972360276,
|
||
|
|
"grad_norm": 0.05490806540035838,
|
||
|
|
"learning_rate": 2.0551364524356054e-05,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7114831284331676,
|
||
|
|
"grad_norm": 0.0928466829346082,
|
||
|
|
"learning_rate": 2.0323600390262742e-05,
|
||
|
|
"loss": 0.0631,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7132269596303078,
|
||
|
|
"grad_norm": 0.07478264466938572,
|
||
|
|
"learning_rate": 2.0096783033459564e-05,
|
||
|
|
"loss": 0.0653,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7149707908274479,
|
||
|
|
"grad_norm": 0.062316603777551216,
|
||
|
|
"learning_rate": 1.987091969018561e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.716714622024588,
|
||
|
|
"grad_norm": 0.09437152013364276,
|
||
|
|
"learning_rate": 1.9646017566243658e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7184584532217282,
|
||
|
|
"grad_norm": 0.05322246323194081,
|
||
|
|
"learning_rate": 1.9422083836770406e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7202022844188682,
|
||
|
|
"grad_norm": 0.07479897442184201,
|
||
|
|
"learning_rate": 1.919912564600753e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7219461156160084,
|
||
|
|
"grad_norm": 0.05492038679380218,
|
||
|
|
"learning_rate": 1.8977150107073633e-05,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7236899468131485,
|
||
|
|
"grad_norm": 0.08536624329932077,
|
||
|
|
"learning_rate": 1.8756164301737476e-05,
|
||
|
|
"loss": 0.0626,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7254337780102886,
|
||
|
|
"grad_norm": 0.06859524210266144,
|
||
|
|
"learning_rate": 1.853617528019197e-05,
|
||
|
|
"loss": 0.0653,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7271776092074287,
|
||
|
|
"grad_norm": 0.12160361687651873,
|
||
|
|
"learning_rate": 1.831719006082924e-05,
|
||
|
|
"loss": 0.0653,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7289214404045689,
|
||
|
|
"grad_norm": 0.18282902871378182,
|
||
|
|
"learning_rate": 1.809921563001676e-05,
|
||
|
|
"loss": 0.0667,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7306652716017089,
|
||
|
|
"grad_norm": 0.0856597489019827,
|
||
|
|
"learning_rate": 1.7882258941874432e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7324091027988491,
|
||
|
|
"grad_norm": 0.10649135403842602,
|
||
|
|
"learning_rate": 1.7666326918052667e-05,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7341529339959892,
|
||
|
|
"grad_norm": 0.12225388455758668,
|
||
|
|
"learning_rate": 1.745142644751177e-05,
|
||
|
|
"loss": 0.0657,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7358967651931293,
|
||
|
|
"grad_norm": 0.11567104795291488,
|
||
|
|
"learning_rate": 1.7237564386301868e-05,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7376405963902695,
|
||
|
|
"grad_norm": 0.07264396441233593,
|
||
|
|
"learning_rate": 1.702474755734441e-05,
|
||
|
|
"loss": 0.0657,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7393844275874095,
|
||
|
|
"grad_norm": 0.07898587680558024,
|
||
|
|
"learning_rate": 1.6812982750214385e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7411282587845497,
|
||
|
|
"grad_norm": 0.05769295801958345,
|
||
|
|
"learning_rate": 1.660227672092373e-05,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7428720899816897,
|
||
|
|
"grad_norm": 0.09139721064795,
|
||
|
|
"learning_rate": 1.6392636191705817e-05,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7446159211788299,
|
||
|
|
"grad_norm": 0.04328001632629656,
|
||
|
|
"learning_rate": 1.618406785080095e-05,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.74635975237597,
|
||
|
|
"grad_norm": 0.10300148190920724,
|
||
|
|
"learning_rate": 1.5976578352243017e-05,
|
||
|
|
"loss": 0.065,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7481035835731101,
|
||
|
|
"grad_norm": 0.05439102528085213,
|
||
|
|
"learning_rate": 1.5770174315647186e-05,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7498474147702502,
|
||
|
|
"grad_norm": 0.07283525225952207,
|
||
|
|
"learning_rate": 1.5564862325998753e-05,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7515912459673904,
|
||
|
|
"grad_norm": 0.08943729068392149,
|
||
|
|
"learning_rate": 1.5360648933442977e-05,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7533350771645305,
|
||
|
|
"grad_norm": 0.06719452778739451,
|
||
|
|
"learning_rate": 1.5157540653076219e-05,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7550789083616706,
|
||
|
|
"grad_norm": 0.05251648332079966,
|
||
|
|
"learning_rate": 1.4955543964738e-05,
|
||
|
|
"loss": 0.066,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7568227395588107,
|
||
|
|
"grad_norm": 0.05274636216164364,
|
||
|
|
"learning_rate": 1.4754665312804311e-05,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7585665707559508,
|
||
|
|
"grad_norm": 0.11011665466316892,
|
||
|
|
"learning_rate": 1.4554911105982021e-05,
|
||
|
|
"loss": 0.0639,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.760310401953091,
|
||
|
|
"grad_norm": 0.10498230203392513,
|
||
|
|
"learning_rate": 1.4356287717104383e-05,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.762054233150231,
|
||
|
|
"grad_norm": 0.09524120006808019,
|
||
|
|
"learning_rate": 1.4158801482927764e-05,
|
||
|
|
"loss": 0.065,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7637980643473712,
|
||
|
|
"grad_norm": 0.09453574859798163,
|
||
|
|
"learning_rate": 1.3962458703929459e-05,
|
||
|
|
"loss": 0.0643,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7655418955445112,
|
||
|
|
"grad_norm": 0.056312588367211,
|
||
|
|
"learning_rate": 1.376726564410663e-05,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7672857267416514,
|
||
|
|
"grad_norm": 0.05345047180852192,
|
||
|
|
"learning_rate": 1.3573228530776605e-05,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7690295579387916,
|
||
|
|
"grad_norm": 0.058868492999594034,
|
||
|
|
"learning_rate": 1.3380353554378073e-05,
|
||
|
|
"loss": 0.0652,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7707733891359316,
|
||
|
|
"grad_norm": 0.05698080816187847,
|
||
|
|
"learning_rate": 1.3188646868273613e-05,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7725172203330718,
|
||
|
|
"grad_norm": 0.10673862985608311,
|
||
|
|
"learning_rate": 1.2998114588553429e-05,
|
||
|
|
"loss": 0.0625,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7742610515302119,
|
||
|
|
"grad_norm": 0.08436740252106527,
|
||
|
|
"learning_rate": 1.2808762793840201e-05,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.776004882727352,
|
||
|
|
"grad_norm": 0.06436259428184785,
|
||
|
|
"learning_rate": 1.2620597525095136e-05,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7777487139244921,
|
||
|
|
"grad_norm": 0.06691706834113023,
|
||
|
|
"learning_rate": 1.2433624785425291e-05,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7794925451216322,
|
||
|
|
"grad_norm": 0.10712410910395756,
|
||
|
|
"learning_rate": 1.2247850539891948e-05,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7812363763187723,
|
||
|
|
"grad_norm": 0.08342722973198667,
|
||
|
|
"learning_rate": 1.206328071532048e-05,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7829802075159125,
|
||
|
|
"grad_norm": 0.056140033330775385,
|
||
|
|
"learning_rate": 1.187992120011111e-05,
|
||
|
|
"loss": 0.0636,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7847240387130526,
|
||
|
|
"grad_norm": 0.1470353453729844,
|
||
|
|
"learning_rate": 1.1697777844051105e-05,
|
||
|
|
"loss": 0.0623,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7864678699101927,
|
||
|
|
"grad_norm": 0.06174798787453687,
|
||
|
|
"learning_rate": 1.1516856458128167e-05,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7882117011073329,
|
||
|
|
"grad_norm": 0.10563261980312372,
|
||
|
|
"learning_rate": 1.133716281434502e-05,
|
||
|
|
"loss": 0.0618,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7899555323044729,
|
||
|
|
"grad_norm": 0.059155903025558475,
|
||
|
|
"learning_rate": 1.1158702645535286e-05,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7916993635016131,
|
||
|
|
"grad_norm": 0.2236752898985674,
|
||
|
|
"learning_rate": 1.0981481645180564e-05,
|
||
|
|
"loss": 0.0655,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7934431946987531,
|
||
|
|
"grad_norm": 0.05542585716628748,
|
||
|
|
"learning_rate": 1.080550546722876e-05,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7951870258958933,
|
||
|
|
"grad_norm": 0.12157637986614808,
|
||
|
|
"learning_rate": 1.063077972591382e-05,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7969308570930334,
|
||
|
|
"grad_norm": 0.06387726990724843,
|
||
|
|
"learning_rate": 1.0457309995576497e-05,
|
||
|
|
"loss": 0.063,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7986746882901735,
|
||
|
|
"grad_norm": 0.08939875101367574,
|
||
|
|
"learning_rate": 1.0285101810486535e-05,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8004185194873136,
|
||
|
|
"grad_norm": 0.06361929804120545,
|
||
|
|
"learning_rate": 1.0114160664666155e-05,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8021623506844537,
|
||
|
|
"grad_norm": 0.08559047394309637,
|
||
|
|
"learning_rate": 9.94449201171479e-06,
|
||
|
|
"loss": 0.0623,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8039061818815939,
|
||
|
|
"grad_norm": 0.1318141758332588,
|
||
|
|
"learning_rate": 9.776101264634969e-06,
|
||
|
|
"loss": 0.0635,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.805650013078734,
|
||
|
|
"grad_norm": 0.12389032549293967,
|
||
|
|
"learning_rate": 9.608993795659765e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8073938442758741,
|
||
|
|
"grad_norm": 0.0823315848080731,
|
||
|
|
"learning_rate": 9.443174936081345e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8091376754730142,
|
||
|
|
"grad_norm": 0.051973482541530225,
|
||
|
|
"learning_rate": 9.278649976080889e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8108815066701544,
|
||
|
|
"grad_norm": 0.059530345330728256,
|
||
|
|
"learning_rate": 9.11542416455981e-06,
|
||
|
|
"loss": 0.0656,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8126253378672944,
|
||
|
|
"grad_norm": 0.10552903129002746,
|
||
|
|
"learning_rate": 8.953502708972278e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8143691690644346,
|
||
|
|
"grad_norm": 0.07221277969124819,
|
||
|
|
"learning_rate": 8.792890775159125e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8161130002615746,
|
||
|
|
"grad_norm": 0.0772500242885849,
|
||
|
|
"learning_rate": 8.633593487183067e-06,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8178568314587148,
|
||
|
|
"grad_norm": 0.06591420894663506,
|
||
|
|
"learning_rate": 8.475615927165093e-06,
|
||
|
|
"loss": 0.0623,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.819600662655855,
|
||
|
|
"grad_norm": 0.06507913039563108,
|
||
|
|
"learning_rate": 8.31896313512247e-06,
|
||
|
|
"loss": 0.0634,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.821344493852995,
|
||
|
|
"grad_norm": 0.07878320811810173,
|
||
|
|
"learning_rate": 8.163640108807896e-06,
|
||
|
|
"loss": 0.0644,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8230883250501352,
|
||
|
|
"grad_norm": 0.04897281046480619,
|
||
|
|
"learning_rate": 8.009651803550045e-06,
|
||
|
|
"loss": 0.0624,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8248321562472752,
|
||
|
|
"grad_norm": 0.0952496494325907,
|
||
|
|
"learning_rate": 7.85700313209548e-06,
|
||
|
|
"loss": 0.065,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8265759874444154,
|
||
|
|
"grad_norm": 0.05733587546669899,
|
||
|
|
"learning_rate": 7.70569896445194e-06,
|
||
|
|
"loss": 0.0656,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8283198186415555,
|
||
|
|
"grad_norm": 0.09396840285731554,
|
||
|
|
"learning_rate": 7.555744127732922e-06,
|
||
|
|
"loss": 0.065,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8300636498386956,
|
||
|
|
"grad_norm": 0.10474980239912153,
|
||
|
|
"learning_rate": 7.40714340600378e-06,
|
||
|
|
"loss": 0.0631,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8318074810358357,
|
||
|
|
"grad_norm": 0.05498607201861292,
|
||
|
|
"learning_rate": 7.2599015401289496e-06,
|
||
|
|
"loss": 0.0635,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8335513122329758,
|
||
|
|
"grad_norm": 0.055760501809272785,
|
||
|
|
"learning_rate": 7.114023227620831e-06,
|
||
|
|
"loss": 0.0642,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.835295143430116,
|
||
|
|
"grad_norm": 0.05306162693666742,
|
||
|
|
"learning_rate": 6.969513122489863e-06,
|
||
|
|
"loss": 0.0597,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8370389746272561,
|
||
|
|
"grad_norm": 0.1088676487625842,
|
||
|
|
"learning_rate": 6.826375835096038e-06,
|
||
|
|
"loss": 0.0644,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8387828058243962,
|
||
|
|
"grad_norm": 0.07413608503611739,
|
||
|
|
"learning_rate": 6.6846159320018475e-06,
|
||
|
|
"loss": 0.0623,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8405266370215363,
|
||
|
|
"grad_norm": 0.128062087657044,
|
||
|
|
"learning_rate": 6.5442379358265585e-06,
|
||
|
|
"loss": 0.0629,
|
||
|
|
"step": 4820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8422704682186765,
|
||
|
|
"grad_norm": 0.0711990370216347,
|
||
|
|
"learning_rate": 6.405246325101954e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 4830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8440142994158165,
|
||
|
|
"grad_norm": 0.05618385822056352,
|
||
|
|
"learning_rate": 6.267645534129446e-06,
|
||
|
|
"loss": 0.0625,
|
||
|
|
"step": 4840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8457581306129567,
|
||
|
|
"grad_norm": 0.059419896247745525,
|
||
|
|
"learning_rate": 6.131439952838608e-06,
|
||
|
|
"loss": 0.0639,
|
||
|
|
"step": 4850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8475019618100967,
|
||
|
|
"grad_norm": 0.051860827184772876,
|
||
|
|
"learning_rate": 5.996633926647083e-06,
|
||
|
|
"loss": 0.0658,
|
||
|
|
"step": 4860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8492457930072369,
|
||
|
|
"grad_norm": 0.08959041805389602,
|
||
|
|
"learning_rate": 5.863231756322019e-06,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8509896242043771,
|
||
|
|
"grad_norm": 0.08513940911176207,
|
||
|
|
"learning_rate": 5.7312376978428115e-06,
|
||
|
|
"loss": 0.0645,
|
||
|
|
"step": 4880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8527334554015171,
|
||
|
|
"grad_norm": 0.0548611114427665,
|
||
|
|
"learning_rate": 5.600655962265345e-06,
|
||
|
|
"loss": 0.0634,
|
||
|
|
"step": 4890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8544772865986573,
|
||
|
|
"grad_norm": 0.05781000046532928,
|
||
|
|
"learning_rate": 5.4714907155876184e-06,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 4900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8562211177957973,
|
||
|
|
"grad_norm": 0.08211213473524111,
|
||
|
|
"learning_rate": 5.3437460786168795e-06,
|
||
|
|
"loss": 0.0645,
|
||
|
|
"step": 4910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8579649489929375,
|
||
|
|
"grad_norm": 0.05678818154491414,
|
||
|
|
"learning_rate": 5.21742612683811e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 4920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8597087801900776,
|
||
|
|
"grad_norm": 0.0407657392322624,
|
||
|
|
"learning_rate": 5.092534890284056e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 4930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8614526113872177,
|
||
|
|
"grad_norm": 0.058259700076195416,
|
||
|
|
"learning_rate": 4.969076353406571e-06,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 4940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8631964425843578,
|
||
|
|
"grad_norm": 0.0451262739997244,
|
||
|
|
"learning_rate": 4.847054454949618e-06,
|
||
|
|
"loss": 0.0672,
|
||
|
|
"step": 4950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.864940273781498,
|
||
|
|
"grad_norm": 0.057460078382529416,
|
||
|
|
"learning_rate": 4.726473087823524e-06,
|
||
|
|
"loss": 0.063,
|
||
|
|
"step": 4960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.866684104978638,
|
||
|
|
"grad_norm": 0.06326571394215523,
|
||
|
|
"learning_rate": 4.6073360989807805e-06,
|
||
|
|
"loss": 0.0626,
|
||
|
|
"step": 4970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8684279361757782,
|
||
|
|
"grad_norm": 0.06116487077481144,
|
||
|
|
"learning_rate": 4.489647289293369e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 4980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8701717673729183,
|
||
|
|
"grad_norm": 0.065179252399081,
|
||
|
|
"learning_rate": 4.3734104134314505e-06,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 4990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8719155985700584,
|
||
|
|
"grad_norm": 0.05066888191707678,
|
||
|
|
"learning_rate": 4.258629179743611e-06,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8736594297671986,
|
||
|
|
"grad_norm": 0.0490167056123082,
|
||
|
|
"learning_rate": 4.145307250138541e-06,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8754032609643386,
|
||
|
|
"grad_norm": 0.06459688078405998,
|
||
|
|
"learning_rate": 4.033448239968168e-06,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8771470921614788,
|
||
|
|
"grad_norm": 0.04132712459317927,
|
||
|
|
"learning_rate": 3.92305571791241e-06,
|
||
|
|
"loss": 0.0639,
|
||
|
|
"step": 5030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8788909233586188,
|
||
|
|
"grad_norm": 0.05973904076165523,
|
||
|
|
"learning_rate": 3.8141332058652447e-06,
|
||
|
|
"loss": 0.0625,
|
||
|
|
"step": 5040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.880634754555759,
|
||
|
|
"grad_norm": 0.1688894978424013,
|
||
|
|
"learning_rate": 3.7066841788223394e-06,
|
||
|
|
"loss": 0.063,
|
||
|
|
"step": 5050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8823785857528991,
|
||
|
|
"grad_norm": 0.06014827411694666,
|
||
|
|
"learning_rate": 3.6007120647702564e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 5060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8841224169500392,
|
||
|
|
"grad_norm": 0.0690347027695397,
|
||
|
|
"learning_rate": 3.4962202445770254e-06,
|
||
|
|
"loss": 0.0631,
|
||
|
|
"step": 5070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8858662481471794,
|
||
|
|
"grad_norm": 0.06536837705797424,
|
||
|
|
"learning_rate": 3.3932120518843314e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 5080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8876100793443195,
|
||
|
|
"grad_norm": 0.06603810637670604,
|
||
|
|
"learning_rate": 3.291690773001116e-06,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8893539105414596,
|
||
|
|
"grad_norm": 0.07530260933577586,
|
||
|
|
"learning_rate": 3.191659646798739e-06,
|
||
|
|
"loss": 0.0616,
|
||
|
|
"step": 5100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8910977417385997,
|
||
|
|
"grad_norm": 0.0842389348120118,
|
||
|
|
"learning_rate": 3.0931218646077065e-06,
|
||
|
|
"loss": 0.0613,
|
||
|
|
"step": 5110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8928415729357398,
|
||
|
|
"grad_norm": 0.052177775217524563,
|
||
|
|
"learning_rate": 2.996080570115778e-06,
|
||
|
|
"loss": 0.0605,
|
||
|
|
"step": 5120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8945854041328799,
|
||
|
|
"grad_norm": 0.07153236769063971,
|
||
|
|
"learning_rate": 2.9005388592676985e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 5130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8963292353300201,
|
||
|
|
"grad_norm": 0.057729253108185115,
|
||
|
|
"learning_rate": 2.806499780166455e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 5140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8980730665271601,
|
||
|
|
"grad_norm": 0.05652295229188241,
|
||
|
|
"learning_rate": 2.71396633297602e-06,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 5150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8998168977243003,
|
||
|
|
"grad_norm": 0.05132200378789997,
|
||
|
|
"learning_rate": 2.6229414698255906e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 5160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9015607289214405,
|
||
|
|
"grad_norm": 0.06529704901316756,
|
||
|
|
"learning_rate": 2.5334280947154733e-06,
|
||
|
|
"loss": 0.0625,
|
||
|
|
"step": 5170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9033045601185805,
|
||
|
|
"grad_norm": 0.08839424803043143,
|
||
|
|
"learning_rate": 2.4454290634243927e-06,
|
||
|
|
"loss": 0.0639,
|
||
|
|
"step": 5180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9050483913157207,
|
||
|
|
"grad_norm": 0.07271889072883368,
|
||
|
|
"learning_rate": 2.3589471834183976e-06,
|
||
|
|
"loss": 0.0636,
|
||
|
|
"step": 5190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9067922225128607,
|
||
|
|
"grad_norm": 0.06780310617809011,
|
||
|
|
"learning_rate": 2.273985213761298e-06,
|
||
|
|
"loss": 0.0624,
|
||
|
|
"step": 5200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9085360537100009,
|
||
|
|
"grad_norm": 0.05091168643328729,
|
||
|
|
"learning_rate": 2.1905458650266276e-06,
|
||
|
|
"loss": 0.0629,
|
||
|
|
"step": 5210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.910279884907141,
|
||
|
|
"grad_norm": 0.09149792256181204,
|
||
|
|
"learning_rate": 2.108631799211158e-06,
|
||
|
|
"loss": 0.063,
|
||
|
|
"step": 5220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9120237161042811,
|
||
|
|
"grad_norm": 0.07438156642001797,
|
||
|
|
"learning_rate": 2.0282456296500386e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 5230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9137675473014212,
|
||
|
|
"grad_norm": 0.07104948596015835,
|
||
|
|
"learning_rate": 1.9493899209333145e-06,
|
||
|
|
"loss": 0.0635,
|
||
|
|
"step": 5240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9155113784985613,
|
||
|
|
"grad_norm": 0.07176357436199812,
|
||
|
|
"learning_rate": 1.8720671888242059e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 5250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9172552096957014,
|
||
|
|
"grad_norm": 0.05532043790257546,
|
||
|
|
"learning_rate": 1.7962799001787822e-06,
|
||
|
|
"loss": 0.0625,
|
||
|
|
"step": 5260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9189990408928416,
|
||
|
|
"grad_norm": 0.06036596788049541,
|
||
|
|
"learning_rate": 1.7220304728672976e-06,
|
||
|
|
"loss": 0.0621,
|
||
|
|
"step": 5270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9207428720899817,
|
||
|
|
"grad_norm": 0.05175396289068676,
|
||
|
|
"learning_rate": 1.6493212756970355e-06,
|
||
|
|
"loss": 0.0638,
|
||
|
|
"step": 5280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9224867032871218,
|
||
|
|
"grad_norm": 0.04494875714954003,
|
||
|
|
"learning_rate": 1.5781546283367531e-06,
|
||
|
|
"loss": 0.0618,
|
||
|
|
"step": 5290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.924230534484262,
|
||
|
|
"grad_norm": 0.07299420434011313,
|
||
|
|
"learning_rate": 1.5085328012426291e-06,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 5300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.925974365681402,
|
||
|
|
"grad_norm": 0.08713836743706539,
|
||
|
|
"learning_rate": 1.4404580155859103e-06,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 5310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9277181968785422,
|
||
|
|
"grad_norm": 0.0648825275044856,
|
||
|
|
"learning_rate": 1.3739324431819579e-06,
|
||
|
|
"loss": 0.0617,
|
||
|
|
"step": 5320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9294620280756822,
|
||
|
|
"grad_norm": 0.0603878346389834,
|
||
|
|
"learning_rate": 1.3089582064210293e-06,
|
||
|
|
"loss": 0.062,
|
||
|
|
"step": 5330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9312058592728224,
|
||
|
|
"grad_norm": 0.06718431446540517,
|
||
|
|
"learning_rate": 1.2455373782005342e-06,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 5340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9329496904699625,
|
||
|
|
"grad_norm": 0.08168020687674889,
|
||
|
|
"learning_rate": 1.183671981858897e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 5350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9346935216671026,
|
||
|
|
"grad_norm": 0.05000874414008422,
|
||
|
|
"learning_rate": 1.1233639911110317e-06,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 5360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9364373528642428,
|
||
|
|
"grad_norm": 0.16836322984135893,
|
||
|
|
"learning_rate": 1.0646153299853523e-06,
|
||
|
|
"loss": 0.0635,
|
||
|
|
"step": 5370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9381811840613828,
|
||
|
|
"grad_norm": 0.09538127932642303,
|
||
|
|
"learning_rate": 1.0074278727623953e-06,
|
||
|
|
"loss": 0.0626,
|
||
|
|
"step": 5380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.939925015258523,
|
||
|
|
"grad_norm": 0.052191949959066446,
|
||
|
|
"learning_rate": 9.51803443915017e-07,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"step": 5390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9416688464556631,
|
||
|
|
"grad_norm": 0.061928758941747805,
|
||
|
|
"learning_rate": 8.977438180502118e-07,
|
||
|
|
"loss": 0.0628,
|
||
|
|
"step": 5400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9434126776528032,
|
||
|
|
"grad_norm": 0.09241870053906379,
|
||
|
|
"learning_rate": 8.452507198524584e-07,
|
||
|
|
"loss": 0.0645,
|
||
|
|
"step": 5410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9451565088499433,
|
||
|
|
"grad_norm": 0.07538730319047023,
|
||
|
|
"learning_rate": 7.943258240287354e-07,
|
||
|
|
"loss": 0.0634,
|
||
|
|
"step": 5420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9469003400470835,
|
||
|
|
"grad_norm": 0.06605596093399979,
|
||
|
|
"learning_rate": 7.449707552550533e-07,
|
||
|
|
"loss": 0.0648,
|
||
|
|
"step": 5430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9486441712442235,
|
||
|
|
"grad_norm": 0.05499334424638046,
|
||
|
|
"learning_rate": 6.971870881246678e-07,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 5440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9503880024413637,
|
||
|
|
"grad_norm": 0.09947976745780569,
|
||
|
|
"learning_rate": 6.509763470977926e-07,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9521318336385038,
|
||
|
|
"grad_norm": 0.049014883398517665,
|
||
|
|
"learning_rate": 6.063400064530155e-07,
|
||
|
|
"loss": 0.0628,
|
||
|
|
"step": 5460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9538756648356439,
|
||
|
|
"grad_norm": 0.06291101959215979,
|
||
|
|
"learning_rate": 5.632794902402206e-07,
|
||
|
|
"loss": 0.0612,
|
||
|
|
"step": 5470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9556194960327841,
|
||
|
|
"grad_norm": 0.09331479294747827,
|
||
|
|
"learning_rate": 5.217961722351894e-07,
|
||
|
|
"loss": 0.0601,
|
||
|
|
"step": 5480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9573633272299241,
|
||
|
|
"grad_norm": 0.057764257528601204,
|
||
|
|
"learning_rate": 4.818913758957377e-07,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"step": 5490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9591071584270643,
|
||
|
|
"grad_norm": 0.08288004474062129,
|
||
|
|
"learning_rate": 4.4356637431953727e-07,
|
||
|
|
"loss": 0.0621,
|
||
|
|
"step": 5500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9608509896242043,
|
||
|
|
"grad_norm": 0.07391577549421642,
|
||
|
|
"learning_rate": 4.068223902034651e-07,
|
||
|
|
"loss": 0.064,
|
||
|
|
"step": 5510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9625948208213445,
|
||
|
|
"grad_norm": 0.06724508233844487,
|
||
|
|
"learning_rate": 3.716605958046071e-07,
|
||
|
|
"loss": 0.0628,
|
||
|
|
"step": 5520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9643386520184846,
|
||
|
|
"grad_norm": 0.05990916915396748,
|
||
|
|
"learning_rate": 3.380821129028489e-07,
|
||
|
|
"loss": 0.0615,
|
||
|
|
"step": 5530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9660824832156247,
|
||
|
|
"grad_norm": 0.053834596475272396,
|
||
|
|
"learning_rate": 3.0608801276511554e-07,
|
||
|
|
"loss": 0.0641,
|
||
|
|
"step": 5540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9678263144127649,
|
||
|
|
"grad_norm": 0.06287658543763808,
|
||
|
|
"learning_rate": 2.7567931611116037e-07,
|
||
|
|
"loss": 0.0613,
|
||
|
|
"step": 5550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.969570145609905,
|
||
|
|
"grad_norm": 0.07416621242751696,
|
||
|
|
"learning_rate": 2.468569930810238e-07,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 5560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9713139768070451,
|
||
|
|
"grad_norm": 0.04757779630445172,
|
||
|
|
"learning_rate": 2.1962196320406414e-07,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"step": 5570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9730578080041852,
|
||
|
|
"grad_norm": 0.0748135009816523,
|
||
|
|
"learning_rate": 1.9397509536964175e-07,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"step": 5580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9748016392013253,
|
||
|
|
"grad_norm": 0.048298795572652944,
|
||
|
|
"learning_rate": 1.699172077993916e-07,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9765454703984654,
|
||
|
|
"grad_norm": 0.053997077150683304,
|
||
|
|
"learning_rate": 1.4744906802110493e-07,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"step": 5600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9782893015956056,
|
||
|
|
"grad_norm": 0.04600674397243153,
|
||
|
|
"learning_rate": 1.2657139284425467e-07,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 5610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9800331327927456,
|
||
|
|
"grad_norm": 0.0551671569065242,
|
||
|
|
"learning_rate": 1.0728484833713582e-07,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 5620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9817769639898858,
|
||
|
|
"grad_norm": 0.06031021677931391,
|
||
|
|
"learning_rate": 8.959004980559904e-08,
|
||
|
|
"loss": 0.0647,
|
||
|
|
"step": 5630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9835207951870258,
|
||
|
|
"grad_norm": 0.07156946663140798,
|
||
|
|
"learning_rate": 7.34875617734332e-08,
|
||
|
|
"loss": 0.0651,
|
||
|
|
"step": 5640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.985264626384166,
|
||
|
|
"grad_norm": 0.0442263642401745,
|
||
|
|
"learning_rate": 5.897789796433517e-08,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 5650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9870084575813062,
|
||
|
|
"grad_norm": 0.11967681340422083,
|
||
|
|
"learning_rate": 4.6061521285550856e-08,
|
||
|
|
"loss": 0.0659,
|
||
|
|
"step": 5660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9887522887784462,
|
||
|
|
"grad_norm": 0.06948791634563219,
|
||
|
|
"learning_rate": 3.47388438130758e-08,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 5670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9904961199755864,
|
||
|
|
"grad_norm": 0.06338361823849195,
|
||
|
|
"learning_rate": 2.5010226778537925e-08,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 5680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9922399511727265,
|
||
|
|
"grad_norm": 0.05046440916449036,
|
||
|
|
"learning_rate": 1.687598055764017e-08,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 5690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9939837823698666,
|
||
|
|
"grad_norm": 0.0621800160395451,
|
||
|
|
"learning_rate": 1.0336364660290532e-08,
|
||
|
|
"loss": 0.0632,
|
||
|
|
"step": 5700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9957276135670067,
|
||
|
|
"grad_norm": 0.06262117432768159,
|
||
|
|
"learning_rate": 5.391587722303193e-09,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 5710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9974714447641468,
|
||
|
|
"grad_norm": 0.10303152613717605,
|
||
|
|
"learning_rate": 2.0418074987538227e-09,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 5720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9992152759612869,
|
||
|
|
"grad_norm": 0.09188013889582053,
|
||
|
|
"learning_rate": 2.871308589280641e-10,
|
||
|
|
"loss": 0.0629,
|
||
|
|
"step": 5730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"step": 5735,
|
||
|
|
"total_flos": 4951250244206592.0,
|
||
|
|
"train_loss": 0.08733252992605062,
|
||
|
|
"train_runtime": 92184.4704,
|
||
|
|
"train_samples_per_second": 1.991,
|
||
|
|
"train_steps_per_second": 0.062
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 10,
|
||
|
|
"max_steps": 5735,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 1000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 4951250244206592.0,
|
||
|
|
"train_batch_size": 2,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|