7750 lines
190 KiB
JSON
7750 lines
190 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.4999858203495,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 11019,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.00045374881599918323,
|
||
|
|
"grad_norm": 3.5486392974853516,
|
||
|
|
"learning_rate": 6.042296072507553e-07,
|
||
|
|
"loss": 0.3781,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0009074976319983665,
|
||
|
|
"grad_norm": 2.2130320072174072,
|
||
|
|
"learning_rate": 1.2084592145015106e-06,
|
||
|
|
"loss": 0.3327,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0013612464479975498,
|
||
|
|
"grad_norm": 1.3342896699905396,
|
||
|
|
"learning_rate": 1.8126888217522659e-06,
|
||
|
|
"loss": 0.2722,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.001814995263996733,
|
||
|
|
"grad_norm": 1.0985041856765747,
|
||
|
|
"learning_rate": 2.4169184290030213e-06,
|
||
|
|
"loss": 0.1678,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.002268744079995916,
|
||
|
|
"grad_norm": 0.8793349266052246,
|
||
|
|
"learning_rate": 3.0211480362537765e-06,
|
||
|
|
"loss": 0.1129,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0027224928959950997,
|
||
|
|
"grad_norm": 0.6948434710502625,
|
||
|
|
"learning_rate": 3.6253776435045317e-06,
|
||
|
|
"loss": 0.0963,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0031762417119942828,
|
||
|
|
"grad_norm": 0.6138444542884827,
|
||
|
|
"learning_rate": 4.229607250755287e-06,
|
||
|
|
"loss": 0.0921,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.003629990527993466,
|
||
|
|
"grad_norm": 0.860797643661499,
|
||
|
|
"learning_rate": 4.833836858006043e-06,
|
||
|
|
"loss": 0.0774,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.004083739343992649,
|
||
|
|
"grad_norm": 0.610936164855957,
|
||
|
|
"learning_rate": 5.438066465256799e-06,
|
||
|
|
"loss": 0.0693,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.004537488159991832,
|
||
|
|
"grad_norm": 0.5868649482727051,
|
||
|
|
"learning_rate": 6.042296072507553e-06,
|
||
|
|
"loss": 0.0603,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.004991236975991016,
|
||
|
|
"grad_norm": 0.49356886744499207,
|
||
|
|
"learning_rate": 6.646525679758309e-06,
|
||
|
|
"loss": 0.0752,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005444985791990199,
|
||
|
|
"grad_norm": 0.5456549525260925,
|
||
|
|
"learning_rate": 7.2507552870090635e-06,
|
||
|
|
"loss": 0.0571,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005898734607989382,
|
||
|
|
"grad_norm": 0.6095007061958313,
|
||
|
|
"learning_rate": 7.85498489425982e-06,
|
||
|
|
"loss": 0.0552,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0063524834239885655,
|
||
|
|
"grad_norm": 0.42237332463264465,
|
||
|
|
"learning_rate": 8.459214501510575e-06,
|
||
|
|
"loss": 0.0639,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.006806232239987749,
|
||
|
|
"grad_norm": 0.571368932723999,
|
||
|
|
"learning_rate": 9.06344410876133e-06,
|
||
|
|
"loss": 0.0534,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.007259981055986932,
|
||
|
|
"grad_norm": 0.5768399834632874,
|
||
|
|
"learning_rate": 9.667673716012085e-06,
|
||
|
|
"loss": 0.0546,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.007713729871986115,
|
||
|
|
"grad_norm": 0.5476046204566956,
|
||
|
|
"learning_rate": 1.0271903323262842e-05,
|
||
|
|
"loss": 0.042,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008167478687985298,
|
||
|
|
"grad_norm": 0.4237188696861267,
|
||
|
|
"learning_rate": 1.0876132930513597e-05,
|
||
|
|
"loss": 0.0466,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008621227503984481,
|
||
|
|
"grad_norm": 0.4303552210330963,
|
||
|
|
"learning_rate": 1.1480362537764351e-05,
|
||
|
|
"loss": 0.0555,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009074976319983665,
|
||
|
|
"grad_norm": 0.4543503522872925,
|
||
|
|
"learning_rate": 1.2084592145015106e-05,
|
||
|
|
"loss": 0.0418,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009528725135982848,
|
||
|
|
"grad_norm": 0.47898203134536743,
|
||
|
|
"learning_rate": 1.2688821752265863e-05,
|
||
|
|
"loss": 0.0428,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009982473951982032,
|
||
|
|
"grad_norm": 0.417222797870636,
|
||
|
|
"learning_rate": 1.3293051359516618e-05,
|
||
|
|
"loss": 0.0396,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010436222767981215,
|
||
|
|
"grad_norm": 0.4032108187675476,
|
||
|
|
"learning_rate": 1.3897280966767372e-05,
|
||
|
|
"loss": 0.0391,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010889971583980399,
|
||
|
|
"grad_norm": 0.3091837465763092,
|
||
|
|
"learning_rate": 1.4501510574018127e-05,
|
||
|
|
"loss": 0.0426,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01134372039997958,
|
||
|
|
"grad_norm": 0.32662463188171387,
|
||
|
|
"learning_rate": 1.5105740181268884e-05,
|
||
|
|
"loss": 0.038,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.011797469215978764,
|
||
|
|
"grad_norm": 0.4448952078819275,
|
||
|
|
"learning_rate": 1.570996978851964e-05,
|
||
|
|
"loss": 0.0477,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012251218031977948,
|
||
|
|
"grad_norm": 0.4139327108860016,
|
||
|
|
"learning_rate": 1.6314199395770393e-05,
|
||
|
|
"loss": 0.0469,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012704966847977131,
|
||
|
|
"grad_norm": 0.3968771994113922,
|
||
|
|
"learning_rate": 1.691842900302115e-05,
|
||
|
|
"loss": 0.0454,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.013158715663976315,
|
||
|
|
"grad_norm": 0.4650236666202545,
|
||
|
|
"learning_rate": 1.7522658610271906e-05,
|
||
|
|
"loss": 0.0451,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.013612464479975498,
|
||
|
|
"grad_norm": 0.4143680930137634,
|
||
|
|
"learning_rate": 1.812688821752266e-05,
|
||
|
|
"loss": 0.0377,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014066213295974682,
|
||
|
|
"grad_norm": 0.37959203124046326,
|
||
|
|
"learning_rate": 1.8731117824773413e-05,
|
||
|
|
"loss": 0.0432,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014519962111973863,
|
||
|
|
"grad_norm": 0.43344926834106445,
|
||
|
|
"learning_rate": 1.933534743202417e-05,
|
||
|
|
"loss": 0.0402,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014973710927973047,
|
||
|
|
"grad_norm": 0.32157906889915466,
|
||
|
|
"learning_rate": 1.9939577039274927e-05,
|
||
|
|
"loss": 0.0372,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01542745974397223,
|
||
|
|
"grad_norm": 0.3956120014190674,
|
||
|
|
"learning_rate": 1.9999965008575334e-05,
|
||
|
|
"loss": 0.0409,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.015881208559971414,
|
||
|
|
"grad_norm": 0.4085632264614105,
|
||
|
|
"learning_rate": 1.999984405087852e-05,
|
||
|
|
"loss": 0.0374,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016334957375970596,
|
||
|
|
"grad_norm": 0.3496035933494568,
|
||
|
|
"learning_rate": 1.999963669596147e-05,
|
||
|
|
"loss": 0.0326,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01678870619196978,
|
||
|
|
"grad_norm": 0.3280976116657257,
|
||
|
|
"learning_rate": 1.9999342945615705e-05,
|
||
|
|
"loss": 0.04,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017242455007968963,
|
||
|
|
"grad_norm": 0.35505032539367676,
|
||
|
|
"learning_rate": 1.9998962802379185e-05,
|
||
|
|
"loss": 0.0336,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017696203823968148,
|
||
|
|
"grad_norm": 0.36404356360435486,
|
||
|
|
"learning_rate": 1.9998496269536293e-05,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01814995263996733,
|
||
|
|
"grad_norm": 0.3447716534137726,
|
||
|
|
"learning_rate": 1.9997943351117804e-05,
|
||
|
|
"loss": 0.0368,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.018603701455966515,
|
||
|
|
"grad_norm": 0.26203060150146484,
|
||
|
|
"learning_rate": 1.9997304051900853e-05,
|
||
|
|
"loss": 0.0337,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.019057450271965697,
|
||
|
|
"grad_norm": 0.32277223467826843,
|
||
|
|
"learning_rate": 1.9996578377408897e-05,
|
||
|
|
"loss": 0.0332,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01951119908796488,
|
||
|
|
"grad_norm": 0.43078309297561646,
|
||
|
|
"learning_rate": 1.9995766333911663e-05,
|
||
|
|
"loss": 0.0297,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.019964947903964064,
|
||
|
|
"grad_norm": 0.30445027351379395,
|
||
|
|
"learning_rate": 1.999486792842508e-05,
|
||
|
|
"loss": 0.0389,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.020418696719963245,
|
||
|
|
"grad_norm": 0.44035622477531433,
|
||
|
|
"learning_rate": 1.999388316871125e-05,
|
||
|
|
"loss": 0.0375,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02087244553596243,
|
||
|
|
"grad_norm": 0.372060626745224,
|
||
|
|
"learning_rate": 1.9992812063278354e-05,
|
||
|
|
"loss": 0.0331,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.021326194351961612,
|
||
|
|
"grad_norm": 0.27705034613609314,
|
||
|
|
"learning_rate": 1.9991654621380593e-05,
|
||
|
|
"loss": 0.0248,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.021779943167960798,
|
||
|
|
"grad_norm": 0.2888132631778717,
|
||
|
|
"learning_rate": 1.9990410853018094e-05,
|
||
|
|
"loss": 0.0361,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02223369198395998,
|
||
|
|
"grad_norm": 0.3042416274547577,
|
||
|
|
"learning_rate": 1.998908076893684e-05,
|
||
|
|
"loss": 0.029,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02268744079995916,
|
||
|
|
"grad_norm": 0.24554918706417084,
|
||
|
|
"learning_rate": 1.9987664380628566e-05,
|
||
|
|
"loss": 0.0362,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.023141189615958346,
|
||
|
|
"grad_norm": 0.25402751564979553,
|
||
|
|
"learning_rate": 1.9986161700330668e-05,
|
||
|
|
"loss": 0.0296,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.023594938431957528,
|
||
|
|
"grad_norm": 0.3481331467628479,
|
||
|
|
"learning_rate": 1.998457274102608e-05,
|
||
|
|
"loss": 0.0329,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024048687247956713,
|
||
|
|
"grad_norm": 0.31753596663475037,
|
||
|
|
"learning_rate": 1.9982897516443194e-05,
|
||
|
|
"loss": 0.0339,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.024502436063955895,
|
||
|
|
"grad_norm": 0.35809817910194397,
|
||
|
|
"learning_rate": 1.9981136041055703e-05,
|
||
|
|
"loss": 0.0353,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02495618487995508,
|
||
|
|
"grad_norm": 0.3065260946750641,
|
||
|
|
"learning_rate": 1.99792883300825e-05,
|
||
|
|
"loss": 0.0353,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025409933695954262,
|
||
|
|
"grad_norm": 0.34474602341651917,
|
||
|
|
"learning_rate": 1.997735439948755e-05,
|
||
|
|
"loss": 0.0336,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025863682511953444,
|
||
|
|
"grad_norm": 0.269667387008667,
|
||
|
|
"learning_rate": 1.997533426597973e-05,
|
||
|
|
"loss": 0.032,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02631743132795263,
|
||
|
|
"grad_norm": 0.39037978649139404,
|
||
|
|
"learning_rate": 1.9973227947012713e-05,
|
||
|
|
"loss": 0.0278,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02677118014395181,
|
||
|
|
"grad_norm": 0.3307776153087616,
|
||
|
|
"learning_rate": 1.9971035460784783e-05,
|
||
|
|
"loss": 0.0344,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.027224928959950996,
|
||
|
|
"grad_norm": 0.2869550585746765,
|
||
|
|
"learning_rate": 1.9968756826238713e-05,
|
||
|
|
"loss": 0.0327,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.027678677775950178,
|
||
|
|
"grad_norm": 0.25692203640937805,
|
||
|
|
"learning_rate": 1.9966392063061573e-05,
|
||
|
|
"loss": 0.0346,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.028132426591949363,
|
||
|
|
"grad_norm": 0.21003565192222595,
|
||
|
|
"learning_rate": 1.9963941191684585e-05,
|
||
|
|
"loss": 0.0287,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.028586175407948545,
|
||
|
|
"grad_norm": 0.23621736466884613,
|
||
|
|
"learning_rate": 1.9961404233282926e-05,
|
||
|
|
"loss": 0.0283,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029039924223947727,
|
||
|
|
"grad_norm": 0.33250758051872253,
|
||
|
|
"learning_rate": 1.995878120977555e-05,
|
||
|
|
"loss": 0.031,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029493673039946912,
|
||
|
|
"grad_norm": 0.24574914574623108,
|
||
|
|
"learning_rate": 1.9956072143825006e-05,
|
||
|
|
"loss": 0.03,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029947421855946094,
|
||
|
|
"grad_norm": 0.28682562708854675,
|
||
|
|
"learning_rate": 1.9953277058837237e-05,
|
||
|
|
"loss": 0.0304,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03040117067194528,
|
||
|
|
"grad_norm": 0.4012982249259949,
|
||
|
|
"learning_rate": 1.9950395978961376e-05,
|
||
|
|
"loss": 0.0347,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03085491948794446,
|
||
|
|
"grad_norm": 0.3115476369857788,
|
||
|
|
"learning_rate": 1.9947428929089536e-05,
|
||
|
|
"loss": 0.0289,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.031308668303943646,
|
||
|
|
"grad_norm": 0.24223241209983826,
|
||
|
|
"learning_rate": 1.9944375934856606e-05,
|
||
|
|
"loss": 0.0269,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03176241711994283,
|
||
|
|
"grad_norm": 0.20952603220939636,
|
||
|
|
"learning_rate": 1.9941237022640024e-05,
|
||
|
|
"loss": 0.0266,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03221616593594201,
|
||
|
|
"grad_norm": 0.269159197807312,
|
||
|
|
"learning_rate": 1.9938012219559536e-05,
|
||
|
|
"loss": 0.0256,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03266991475194119,
|
||
|
|
"grad_norm": 0.21880276501178741,
|
||
|
|
"learning_rate": 1.9934701553476983e-05,
|
||
|
|
"loss": 0.0321,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03312366356794038,
|
||
|
|
"grad_norm": 0.29239243268966675,
|
||
|
|
"learning_rate": 1.993130505299604e-05,
|
||
|
|
"loss": 0.0295,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03357741238393956,
|
||
|
|
"grad_norm": 0.2072737067937851,
|
||
|
|
"learning_rate": 1.9927822747461987e-05,
|
||
|
|
"loss": 0.0262,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03403116119993874,
|
||
|
|
"grad_norm": 0.23652520775794983,
|
||
|
|
"learning_rate": 1.9924254666961446e-05,
|
||
|
|
"loss": 0.0247,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.034484910015937925,
|
||
|
|
"grad_norm": 0.2358781099319458,
|
||
|
|
"learning_rate": 1.9920600842322123e-05,
|
||
|
|
"loss": 0.0211,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.034938658831937114,
|
||
|
|
"grad_norm": 0.2626686990261078,
|
||
|
|
"learning_rate": 1.9916861305112536e-05,
|
||
|
|
"loss": 0.0278,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.035392407647936296,
|
||
|
|
"grad_norm": 0.30486413836479187,
|
||
|
|
"learning_rate": 1.9913036087641756e-05,
|
||
|
|
"loss": 0.029,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03584615646393548,
|
||
|
|
"grad_norm": 0.22027461230754852,
|
||
|
|
"learning_rate": 1.9909125222959106e-05,
|
||
|
|
"loss": 0.0249,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03629990527993466,
|
||
|
|
"grad_norm": 0.36433184146881104,
|
||
|
|
"learning_rate": 1.9905128744853903e-05,
|
||
|
|
"loss": 0.0262,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03675365409593384,
|
||
|
|
"grad_norm": 0.2639385759830475,
|
||
|
|
"learning_rate": 1.9901046687855142e-05,
|
||
|
|
"loss": 0.0332,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03720740291193303,
|
||
|
|
"grad_norm": 0.2315681129693985,
|
||
|
|
"learning_rate": 1.9896879087231212e-05,
|
||
|
|
"loss": 0.0314,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03766115172793221,
|
||
|
|
"grad_norm": 0.18850234150886536,
|
||
|
|
"learning_rate": 1.989262597898959e-05,
|
||
|
|
"loss": 0.0245,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03811490054393139,
|
||
|
|
"grad_norm": 0.19689935445785522,
|
||
|
|
"learning_rate": 1.9888287399876514e-05,
|
||
|
|
"loss": 0.0262,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.038568649359930575,
|
||
|
|
"grad_norm": 0.25434932112693787,
|
||
|
|
"learning_rate": 1.9883863387376688e-05,
|
||
|
|
"loss": 0.0252,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03902239817592976,
|
||
|
|
"grad_norm": 0.2297980934381485,
|
||
|
|
"learning_rate": 1.9879353979712953e-05,
|
||
|
|
"loss": 0.0289,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.039476146991928945,
|
||
|
|
"grad_norm": 0.18525287508964539,
|
||
|
|
"learning_rate": 1.987475921584594e-05,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03992989580792813,
|
||
|
|
"grad_norm": 0.2375829815864563,
|
||
|
|
"learning_rate": 1.987007913547375e-05,
|
||
|
|
"loss": 0.0281,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04038364462392731,
|
||
|
|
"grad_norm": 0.2593577802181244,
|
||
|
|
"learning_rate": 1.9865313779031607e-05,
|
||
|
|
"loss": 0.0292,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04083739343992649,
|
||
|
|
"grad_norm": 0.26970919966697693,
|
||
|
|
"learning_rate": 1.986046318769151e-05,
|
||
|
|
"loss": 0.0301,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04129114225592568,
|
||
|
|
"grad_norm": 0.20702382922172546,
|
||
|
|
"learning_rate": 1.9855527403361874e-05,
|
||
|
|
"loss": 0.0282,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04174489107192486,
|
||
|
|
"grad_norm": 0.23546156287193298,
|
||
|
|
"learning_rate": 1.9850506468687164e-05,
|
||
|
|
"loss": 0.0269,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04219863988792404,
|
||
|
|
"grad_norm": 0.18576021492481232,
|
||
|
|
"learning_rate": 1.9845400427047542e-05,
|
||
|
|
"loss": 0.022,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.042652388703923225,
|
||
|
|
"grad_norm": 0.2637688219547272,
|
||
|
|
"learning_rate": 1.9840209322558476e-05,
|
||
|
|
"loss": 0.029,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.043106137519922406,
|
||
|
|
"grad_norm": 0.6882661581039429,
|
||
|
|
"learning_rate": 1.983493320007036e-05,
|
||
|
|
"loss": 0.0246,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.043559886335921595,
|
||
|
|
"grad_norm": 0.28765228390693665,
|
||
|
|
"learning_rate": 1.9829572105168137e-05,
|
||
|
|
"loss": 0.0308,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04401363515192078,
|
||
|
|
"grad_norm": 0.2787792980670929,
|
||
|
|
"learning_rate": 1.9824126084170907e-05,
|
||
|
|
"loss": 0.0271,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04446738396791996,
|
||
|
|
"grad_norm": 0.2273041307926178,
|
||
|
|
"learning_rate": 1.9818595184131505e-05,
|
||
|
|
"loss": 0.0296,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04492113278391914,
|
||
|
|
"grad_norm": 0.26197728514671326,
|
||
|
|
"learning_rate": 1.9812979452836117e-05,
|
||
|
|
"loss": 0.0286,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04537488159991832,
|
||
|
|
"grad_norm": 0.2646290957927704,
|
||
|
|
"learning_rate": 1.9807278938803853e-05,
|
||
|
|
"loss": 0.0281,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04582863041591751,
|
||
|
|
"grad_norm": 0.16532529890537262,
|
||
|
|
"learning_rate": 1.980149369128634e-05,
|
||
|
|
"loss": 0.0234,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04628237923191669,
|
||
|
|
"grad_norm": 0.2719618082046509,
|
||
|
|
"learning_rate": 1.9795623760267294e-05,
|
||
|
|
"loss": 0.0243,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.046736128047915874,
|
||
|
|
"grad_norm": 0.16915248334407806,
|
||
|
|
"learning_rate": 1.9789669196462072e-05,
|
||
|
|
"loss": 0.0261,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.047189876863915056,
|
||
|
|
"grad_norm": 0.26485320925712585,
|
||
|
|
"learning_rate": 1.978363005131725e-05,
|
||
|
|
"loss": 0.0265,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.047643625679914245,
|
||
|
|
"grad_norm": 0.17664441466331482,
|
||
|
|
"learning_rate": 1.9777506377010182e-05,
|
||
|
|
"loss": 0.0241,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04809737449591343,
|
||
|
|
"grad_norm": 0.19183993339538574,
|
||
|
|
"learning_rate": 1.9771298226448535e-05,
|
||
|
|
"loss": 0.0222,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04855112331191261,
|
||
|
|
"grad_norm": 0.22114388644695282,
|
||
|
|
"learning_rate": 1.9765005653269842e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04900487212791179,
|
||
|
|
"grad_norm": 0.3064383566379547,
|
||
|
|
"learning_rate": 1.9758628711841035e-05,
|
||
|
|
"loss": 0.0283,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04945862094391097,
|
||
|
|
"grad_norm": 0.18376608192920685,
|
||
|
|
"learning_rate": 1.975216745725797e-05,
|
||
|
|
"loss": 0.0267,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04991236975991016,
|
||
|
|
"grad_norm": 0.279770165681839,
|
||
|
|
"learning_rate": 1.974562194534496e-05,
|
||
|
|
"loss": 0.0242,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05036611857590934,
|
||
|
|
"grad_norm": 0.37529680132865906,
|
||
|
|
"learning_rate": 1.9738992232654296e-05,
|
||
|
|
"loss": 0.0243,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.050819867391908524,
|
||
|
|
"grad_norm": 0.20544101297855377,
|
||
|
|
"learning_rate": 1.9732278376465746e-05,
|
||
|
|
"loss": 0.0267,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.051273616207907706,
|
||
|
|
"grad_norm": 0.31520286202430725,
|
||
|
|
"learning_rate": 1.9725480434786065e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05172736502390689,
|
||
|
|
"grad_norm": 0.2995913028717041,
|
||
|
|
"learning_rate": 1.971859846634849e-05,
|
||
|
|
"loss": 0.0264,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.052181113839906076,
|
||
|
|
"grad_norm": 0.18620726466178894,
|
||
|
|
"learning_rate": 1.9711632530612247e-05,
|
||
|
|
"loss": 0.0268,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05263486265590526,
|
||
|
|
"grad_norm": 0.26744094491004944,
|
||
|
|
"learning_rate": 1.970458268776202e-05,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05308861147190444,
|
||
|
|
"grad_norm": 0.16825206577777863,
|
||
|
|
"learning_rate": 1.9697448998707448e-05,
|
||
|
|
"loss": 0.023,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05354236028790362,
|
||
|
|
"grad_norm": 0.16264687478542328,
|
||
|
|
"learning_rate": 1.9690231525082576e-05,
|
||
|
|
"loss": 0.0244,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05399610910390281,
|
||
|
|
"grad_norm": 0.22812557220458984,
|
||
|
|
"learning_rate": 1.968293032924535e-05,
|
||
|
|
"loss": 0.0209,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05444985791990199,
|
||
|
|
"grad_norm": 0.22756661474704742,
|
||
|
|
"learning_rate": 1.9675545474277045e-05,
|
||
|
|
"loss": 0.0253,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.054903606735901174,
|
||
|
|
"grad_norm": 0.24875099956989288,
|
||
|
|
"learning_rate": 1.966807702398176e-05,
|
||
|
|
"loss": 0.0213,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.055357355551900356,
|
||
|
|
"grad_norm": 0.265860915184021,
|
||
|
|
"learning_rate": 1.9660525042885828e-05,
|
||
|
|
"loss": 0.0225,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05581110436789954,
|
||
|
|
"grad_norm": 0.17177896201610565,
|
||
|
|
"learning_rate": 1.965288959623729e-05,
|
||
|
|
"loss": 0.0246,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056264853183898726,
|
||
|
|
"grad_norm": 0.20631885528564453,
|
||
|
|
"learning_rate": 1.964517075000531e-05,
|
||
|
|
"loss": 0.0262,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05671860199989791,
|
||
|
|
"grad_norm": 0.1978541612625122,
|
||
|
|
"learning_rate": 1.9637368570879612e-05,
|
||
|
|
"loss": 0.0228,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05717235081589709,
|
||
|
|
"grad_norm": 0.30454206466674805,
|
||
|
|
"learning_rate": 1.9629483126269904e-05,
|
||
|
|
"loss": 0.0294,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05762609963189627,
|
||
|
|
"grad_norm": 0.21276606619358063,
|
||
|
|
"learning_rate": 1.9621514484305308e-05,
|
||
|
|
"loss": 0.0256,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05807984844789545,
|
||
|
|
"grad_norm": 0.2528305947780609,
|
||
|
|
"learning_rate": 1.9613462713833734e-05,
|
||
|
|
"loss": 0.0251,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05853359726389464,
|
||
|
|
"grad_norm": 0.16171447932720184,
|
||
|
|
"learning_rate": 1.9605327884421338e-05,
|
||
|
|
"loss": 0.0249,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.058987346079893824,
|
||
|
|
"grad_norm": 0.2728146016597748,
|
||
|
|
"learning_rate": 1.9597110066351875e-05,
|
||
|
|
"loss": 0.024,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.059441094895893005,
|
||
|
|
"grad_norm": 0.20263759791851044,
|
||
|
|
"learning_rate": 1.958880933062612e-05,
|
||
|
|
"loss": 0.0217,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05989484371189219,
|
||
|
|
"grad_norm": 0.18950790166854858,
|
||
|
|
"learning_rate": 1.958042574896124e-05,
|
||
|
|
"loss": 0.0211,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.060348592527891376,
|
||
|
|
"grad_norm": 0.18953697383403778,
|
||
|
|
"learning_rate": 1.9571959393790174e-05,
|
||
|
|
"loss": 0.0222,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06080234134389056,
|
||
|
|
"grad_norm": 0.18523451685905457,
|
||
|
|
"learning_rate": 1.9563410338261022e-05,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06125609015988974,
|
||
|
|
"grad_norm": 0.18328619003295898,
|
||
|
|
"learning_rate": 1.9554778656236402e-05,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06170983897588892,
|
||
|
|
"grad_norm": 0.18588705360889435,
|
||
|
|
"learning_rate": 1.9546064422292806e-05,
|
||
|
|
"loss": 0.0217,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0621635877918881,
|
||
|
|
"grad_norm": 0.3565521836280823,
|
||
|
|
"learning_rate": 1.9537267711719966e-05,
|
||
|
|
"loss": 0.0282,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06261733660788729,
|
||
|
|
"grad_norm": 0.2516770362854004,
|
||
|
|
"learning_rate": 1.9528388600520208e-05,
|
||
|
|
"loss": 0.0224,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06307108542388647,
|
||
|
|
"grad_norm": 0.20207948982715607,
|
||
|
|
"learning_rate": 1.9519427165407773e-05,
|
||
|
|
"loss": 0.0212,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06352483423988566,
|
||
|
|
"grad_norm": 0.19543805718421936,
|
||
|
|
"learning_rate": 1.9510383483808183e-05,
|
||
|
|
"loss": 0.0233,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06397858305588484,
|
||
|
|
"grad_norm": 0.16665951907634735,
|
||
|
|
"learning_rate": 1.950125763385755e-05,
|
||
|
|
"loss": 0.0182,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06443233187188402,
|
||
|
|
"grad_norm": 0.13467726111412048,
|
||
|
|
"learning_rate": 1.949204969440191e-05,
|
||
|
|
"loss": 0.024,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06488608068788321,
|
||
|
|
"grad_norm": 0.18848440051078796,
|
||
|
|
"learning_rate": 1.9482759744996537e-05,
|
||
|
|
"loss": 0.0257,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06533982950388238,
|
||
|
|
"grad_norm": 0.13075998425483704,
|
||
|
|
"learning_rate": 1.9473387865905268e-05,
|
||
|
|
"loss": 0.0261,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06579357831988157,
|
||
|
|
"grad_norm": 0.24777375161647797,
|
||
|
|
"learning_rate": 1.9463934138099796e-05,
|
||
|
|
"loss": 0.0242,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06624732713588076,
|
||
|
|
"grad_norm": 0.13372217118740082,
|
||
|
|
"learning_rate": 1.945439864325897e-05,
|
||
|
|
"loss": 0.0216,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06670107595187993,
|
||
|
|
"grad_norm": 0.2603487968444824,
|
||
|
|
"learning_rate": 1.944478146376811e-05,
|
||
|
|
"loss": 0.0243,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06715482476787912,
|
||
|
|
"grad_norm": 0.20325487852096558,
|
||
|
|
"learning_rate": 1.943508268271826e-05,
|
||
|
|
"loss": 0.0262,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0676085735838783,
|
||
|
|
"grad_norm": 0.23849761486053467,
|
||
|
|
"learning_rate": 1.9425302383905497e-05,
|
||
|
|
"loss": 0.0247,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06806232239987749,
|
||
|
|
"grad_norm": 0.17411549389362335,
|
||
|
|
"learning_rate": 1.941544065183021e-05,
|
||
|
|
"loss": 0.0213,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06851607121587668,
|
||
|
|
"grad_norm": 0.2012050598859787,
|
||
|
|
"learning_rate": 1.9405497571696347e-05,
|
||
|
|
"loss": 0.0215,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06896982003187585,
|
||
|
|
"grad_norm": 0.19042851030826569,
|
||
|
|
"learning_rate": 1.93954732294107e-05,
|
||
|
|
"loss": 0.0228,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06942356884787504,
|
||
|
|
"grad_norm": 0.20282061398029327,
|
||
|
|
"learning_rate": 1.9385367711582142e-05,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06987731766387423,
|
||
|
|
"grad_norm": 0.22637954354286194,
|
||
|
|
"learning_rate": 1.9375181105520907e-05,
|
||
|
|
"loss": 0.0188,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0703310664798734,
|
||
|
|
"grad_norm": 0.17371779680252075,
|
||
|
|
"learning_rate": 1.9364913499237814e-05,
|
||
|
|
"loss": 0.0169,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07078481529587259,
|
||
|
|
"grad_norm": 0.20600782334804535,
|
||
|
|
"learning_rate": 1.93545649814435e-05,
|
||
|
|
"loss": 0.0194,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07123856411187177,
|
||
|
|
"grad_norm": 0.19030754268169403,
|
||
|
|
"learning_rate": 1.934413564154769e-05,
|
||
|
|
"loss": 0.0187,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07169231292787095,
|
||
|
|
"grad_norm": 0.3098970651626587,
|
||
|
|
"learning_rate": 1.9333625569658377e-05,
|
||
|
|
"loss": 0.0299,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07214606174387014,
|
||
|
|
"grad_norm": 0.16969068348407745,
|
||
|
|
"learning_rate": 1.9323034856581083e-05,
|
||
|
|
"loss": 0.0212,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07259981055986932,
|
||
|
|
"grad_norm": 0.17913642525672913,
|
||
|
|
"learning_rate": 1.9312363593818045e-05,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07305355937586851,
|
||
|
|
"grad_norm": 0.27264851331710815,
|
||
|
|
"learning_rate": 1.930161187356745e-05,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07350730819186768,
|
||
|
|
"grad_norm": 0.23685918748378754,
|
||
|
|
"learning_rate": 1.929077978872262e-05,
|
||
|
|
"loss": 0.0225,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07396105700786687,
|
||
|
|
"grad_norm": 0.23093511164188385,
|
||
|
|
"learning_rate": 1.9279867432871215e-05,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07441480582386606,
|
||
|
|
"grad_norm": 0.19286802411079407,
|
||
|
|
"learning_rate": 1.9268874900294426e-05,
|
||
|
|
"loss": 0.0242,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07486855463986523,
|
||
|
|
"grad_norm": 0.15499348938465118,
|
||
|
|
"learning_rate": 1.9257802285966166e-05,
|
||
|
|
"loss": 0.0211,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07532230345586442,
|
||
|
|
"grad_norm": 0.25941014289855957,
|
||
|
|
"learning_rate": 1.924664968555223e-05,
|
||
|
|
"loss": 0.0234,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0757760522718636,
|
||
|
|
"grad_norm": 0.2614118456840515,
|
||
|
|
"learning_rate": 1.9235417195409487e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07622980108786279,
|
||
|
|
"grad_norm": 0.24374106526374817,
|
||
|
|
"learning_rate": 1.922410491258505e-05,
|
||
|
|
"loss": 0.0225,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07668354990386198,
|
||
|
|
"grad_norm": 0.2553516626358032,
|
||
|
|
"learning_rate": 1.9212712934815413e-05,
|
||
|
|
"loss": 0.0253,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07713729871986115,
|
||
|
|
"grad_norm": 0.18607620894908905,
|
||
|
|
"learning_rate": 1.9201241360525643e-05,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07759104753586034,
|
||
|
|
"grad_norm": 0.2262287437915802,
|
||
|
|
"learning_rate": 1.9189690288828487e-05,
|
||
|
|
"loss": 0.0245,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07804479635185951,
|
||
|
|
"grad_norm": 0.24479566514492035,
|
||
|
|
"learning_rate": 1.9178059819523563e-05,
|
||
|
|
"loss": 0.0208,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0784985451678587,
|
||
|
|
"grad_norm": 0.2556163966655731,
|
||
|
|
"learning_rate": 1.9166350053096453e-05,
|
||
|
|
"loss": 0.0215,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07895229398385789,
|
||
|
|
"grad_norm": 0.8043221831321716,
|
||
|
|
"learning_rate": 1.9154561090717857e-05,
|
||
|
|
"loss": 0.021,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07940604279985707,
|
||
|
|
"grad_norm": 0.2874302864074707,
|
||
|
|
"learning_rate": 1.9142693034242726e-05,
|
||
|
|
"loss": 0.0194,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07985979161585625,
|
||
|
|
"grad_norm": 0.2852942943572998,
|
||
|
|
"learning_rate": 1.913074598620937e-05,
|
||
|
|
"loss": 0.0228,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08031354043185543,
|
||
|
|
"grad_norm": 0.18303954601287842,
|
||
|
|
"learning_rate": 1.9118720049838567e-05,
|
||
|
|
"loss": 0.0212,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08076728924785462,
|
||
|
|
"grad_norm": 0.14635030925273895,
|
||
|
|
"learning_rate": 1.9106615329032695e-05,
|
||
|
|
"loss": 0.0226,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0812210380638538,
|
||
|
|
"grad_norm": 0.17153240740299225,
|
||
|
|
"learning_rate": 1.9094431928374798e-05,
|
||
|
|
"loss": 0.0195,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08167478687985298,
|
||
|
|
"grad_norm": 0.23255468904972076,
|
||
|
|
"learning_rate": 1.9082169953127714e-05,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08212853569585217,
|
||
|
|
"grad_norm": 0.17257371544837952,
|
||
|
|
"learning_rate": 1.9069829509233156e-05,
|
||
|
|
"loss": 0.0191,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08258228451185136,
|
||
|
|
"grad_norm": 0.15475516021251678,
|
||
|
|
"learning_rate": 1.9057410703310788e-05,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08303603332785053,
|
||
|
|
"grad_norm": 0.17117761075496674,
|
||
|
|
"learning_rate": 1.9044913642657318e-05,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08348978214384972,
|
||
|
|
"grad_norm": 0.20223674178123474,
|
||
|
|
"learning_rate": 1.9032338435245557e-05,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0839435309598489,
|
||
|
|
"grad_norm": 0.24889586865901947,
|
||
|
|
"learning_rate": 1.9019685189723497e-05,
|
||
|
|
"loss": 0.0244,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08439727977584809,
|
||
|
|
"grad_norm": 0.2555040419101715,
|
||
|
|
"learning_rate": 1.900695401541337e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08485102859184727,
|
||
|
|
"grad_norm": 0.33612123131752014,
|
||
|
|
"learning_rate": 1.8994145022310693e-05,
|
||
|
|
"loss": 0.0187,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08530477740784645,
|
||
|
|
"grad_norm": 0.17665745317935944,
|
||
|
|
"learning_rate": 1.8981258321083335e-05,
|
||
|
|
"loss": 0.0221,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08575852622384564,
|
||
|
|
"grad_norm": 0.22196491062641144,
|
||
|
|
"learning_rate": 1.8968294023070548e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08621227503984481,
|
||
|
|
"grad_norm": 0.16377434134483337,
|
||
|
|
"learning_rate": 1.895525224028201e-05,
|
||
|
|
"loss": 0.0213,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.086666023855844,
|
||
|
|
"grad_norm": 0.1716269701719284,
|
||
|
|
"learning_rate": 1.8942133085396855e-05,
|
||
|
|
"loss": 0.0175,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08711977267184319,
|
||
|
|
"grad_norm": 0.15549753606319427,
|
||
|
|
"learning_rate": 1.8928936671762704e-05,
|
||
|
|
"loss": 0.0211,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08757352148784237,
|
||
|
|
"grad_norm": 0.20391225814819336,
|
||
|
|
"learning_rate": 1.8915663113394677e-05,
|
||
|
|
"loss": 0.0243,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08802727030384155,
|
||
|
|
"grad_norm": 0.20363971590995789,
|
||
|
|
"learning_rate": 1.890231252497442e-05,
|
||
|
|
"loss": 0.0191,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08848101911984073,
|
||
|
|
"grad_norm": 0.16531780362129211,
|
||
|
|
"learning_rate": 1.8888885021849103e-05,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08893476793583992,
|
||
|
|
"grad_norm": 0.19257767498493195,
|
||
|
|
"learning_rate": 1.8875380720030434e-05,
|
||
|
|
"loss": 0.0224,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0893885167518391,
|
||
|
|
"grad_norm": 0.21032428741455078,
|
||
|
|
"learning_rate": 1.886179973619364e-05,
|
||
|
|
"loss": 0.0221,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08984226556783828,
|
||
|
|
"grad_norm": 0.258282870054245,
|
||
|
|
"learning_rate": 1.8848142187676485e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09029601438383747,
|
||
|
|
"grad_norm": 0.15617872774600983,
|
||
|
|
"learning_rate": 1.883440819247822e-05,
|
||
|
|
"loss": 0.0225,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09074976319983664,
|
||
|
|
"grad_norm": 0.10580911487340927,
|
||
|
|
"learning_rate": 1.8820597869258606e-05,
|
||
|
|
"loss": 0.0208,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09120351201583583,
|
||
|
|
"grad_norm": 0.15046417713165283,
|
||
|
|
"learning_rate": 1.8806711337336852e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09165726083183502,
|
||
|
|
"grad_norm": 0.19403544068336487,
|
||
|
|
"learning_rate": 1.8792748716690608e-05,
|
||
|
|
"loss": 0.0233,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0921110096478342,
|
||
|
|
"grad_norm": 0.18249863386154175,
|
||
|
|
"learning_rate": 1.8778710127954912e-05,
|
||
|
|
"loss": 0.0233,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09256475846383339,
|
||
|
|
"grad_norm": 0.19264620542526245,
|
||
|
|
"learning_rate": 1.8764595692421163e-05,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09301850727983256,
|
||
|
|
"grad_norm": 0.21285632252693176,
|
||
|
|
"learning_rate": 1.8750405532036064e-05,
|
||
|
|
"loss": 0.0207,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09347225609583175,
|
||
|
|
"grad_norm": 0.18511101603507996,
|
||
|
|
"learning_rate": 1.8736139769400567e-05,
|
||
|
|
"loss": 0.0211,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09392600491183094,
|
||
|
|
"grad_norm": 0.15465500950813293,
|
||
|
|
"learning_rate": 1.8721798527768813e-05,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09437975372783011,
|
||
|
|
"grad_norm": 0.155814990401268,
|
||
|
|
"learning_rate": 1.870738193104708e-05,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0948335025438293,
|
||
|
|
"grad_norm": 0.11972299963235855,
|
||
|
|
"learning_rate": 1.86928901037927e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09528725135982849,
|
||
|
|
"grad_norm": 0.19595398008823395,
|
||
|
|
"learning_rate": 1.8678323171212982e-05,
|
||
|
|
"loss": 0.0257,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09574100017582766,
|
||
|
|
"grad_norm": 0.24550503492355347,
|
||
|
|
"learning_rate": 1.866368125916414e-05,
|
||
|
|
"loss": 0.0189,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09619474899182685,
|
||
|
|
"grad_norm": 0.1504066437482834,
|
||
|
|
"learning_rate": 1.864896449415019e-05,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09664849780782603,
|
||
|
|
"grad_norm": 0.2157190591096878,
|
||
|
|
"learning_rate": 1.863417300332188e-05,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09710224662382522,
|
||
|
|
"grad_norm": 0.2792034149169922,
|
||
|
|
"learning_rate": 1.8619306914475573e-05,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0975559954398244,
|
||
|
|
"grad_norm": 0.14018727838993073,
|
||
|
|
"learning_rate": 1.860436635605214e-05,
|
||
|
|
"loss": 0.0216,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09800974425582358,
|
||
|
|
"grad_norm": 0.17746350169181824,
|
||
|
|
"learning_rate": 1.8589351457135873e-05,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09846349307182277,
|
||
|
|
"grad_norm": 0.1806003302335739,
|
||
|
|
"learning_rate": 1.8574262347453344e-05,
|
||
|
|
"loss": 0.0238,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09891724188782194,
|
||
|
|
"grad_norm": 0.15680097043514252,
|
||
|
|
"learning_rate": 1.85590991573723e-05,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09937099070382113,
|
||
|
|
"grad_norm": 0.14457744359970093,
|
||
|
|
"learning_rate": 1.854386201790053e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09982473951982032,
|
||
|
|
"grad_norm": 0.1748875379562378,
|
||
|
|
"learning_rate": 1.8528551060684744e-05,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1002784883358195,
|
||
|
|
"grad_norm": 0.2859458923339844,
|
||
|
|
"learning_rate": 1.851316641800941e-05,
|
||
|
|
"loss": 0.0204,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10073223715181868,
|
||
|
|
"grad_norm": 0.16153216361999512,
|
||
|
|
"learning_rate": 1.8497708222795638e-05,
|
||
|
|
"loss": 0.019,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10118598596781786,
|
||
|
|
"grad_norm": 0.1441313922405243,
|
||
|
|
"learning_rate": 1.8482176608600025e-05,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10163973478381705,
|
||
|
|
"grad_norm": 0.19473570585250854,
|
||
|
|
"learning_rate": 1.846657170961349e-05,
|
||
|
|
"loss": 0.0237,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10209348359981624,
|
||
|
|
"grad_norm": 1.6714526414871216,
|
||
|
|
"learning_rate": 1.8450893660660126e-05,
|
||
|
|
"loss": 0.0226,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10254723241581541,
|
||
|
|
"grad_norm": 0.1397905796766281,
|
||
|
|
"learning_rate": 1.8435142597196033e-05,
|
||
|
|
"loss": 0.02,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1030009812318146,
|
||
|
|
"grad_norm": 0.31626781821250916,
|
||
|
|
"learning_rate": 1.8419318655308135e-05,
|
||
|
|
"loss": 0.021,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10345473004781378,
|
||
|
|
"grad_norm": 0.2210594266653061,
|
||
|
|
"learning_rate": 1.8403421971713034e-05,
|
||
|
|
"loss": 0.0249,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10390847886381296,
|
||
|
|
"grad_norm": 0.26183414459228516,
|
||
|
|
"learning_rate": 1.838745268375579e-05,
|
||
|
|
"loss": 0.0182,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10436222767981215,
|
||
|
|
"grad_norm": 0.21002666652202606,
|
||
|
|
"learning_rate": 1.8371410929408767e-05,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10481597649581133,
|
||
|
|
"grad_norm": 0.206701397895813,
|
||
|
|
"learning_rate": 1.835529684727043e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10526972531181052,
|
||
|
|
"grad_norm": 0.16457580029964447,
|
||
|
|
"learning_rate": 1.8339110576564132e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10572347412780969,
|
||
|
|
"grad_norm": 0.1727597862482071,
|
||
|
|
"learning_rate": 1.8322852257136935e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10617722294380888,
|
||
|
|
"grad_norm": 0.18292547762393951,
|
||
|
|
"learning_rate": 1.8306522029458395e-05,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10663097175980807,
|
||
|
|
"grad_norm": 0.22897396981716156,
|
||
|
|
"learning_rate": 1.8290120034619335e-05,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10708472057580724,
|
||
|
|
"grad_norm": 0.18409202992916107,
|
||
|
|
"learning_rate": 1.8273646414330645e-05,
|
||
|
|
"loss": 0.0207,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10753846939180643,
|
||
|
|
"grad_norm": 0.22689014673233032,
|
||
|
|
"learning_rate": 1.8257101310922042e-05,
|
||
|
|
"loss": 0.0237,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10799221820780562,
|
||
|
|
"grad_norm": 0.19369293749332428,
|
||
|
|
"learning_rate": 1.8240484867340852e-05,
|
||
|
|
"loss": 0.0159,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1084459670238048,
|
||
|
|
"grad_norm": 0.13972799479961395,
|
||
|
|
"learning_rate": 1.8223797227150762e-05,
|
||
|
|
"loss": 0.02,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10889971583980398,
|
||
|
|
"grad_norm": 0.16729070246219635,
|
||
|
|
"learning_rate": 1.8207038534530598e-05,
|
||
|
|
"loss": 0.0183,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10935346465580316,
|
||
|
|
"grad_norm": 0.21379055082798004,
|
||
|
|
"learning_rate": 1.819020893427306e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10980721347180235,
|
||
|
|
"grad_norm": 0.24638915061950684,
|
||
|
|
"learning_rate": 1.817330857178349e-05,
|
||
|
|
"loss": 0.0235,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11026096228780154,
|
||
|
|
"grad_norm": 0.1810813695192337,
|
||
|
|
"learning_rate": 1.8156337593078594e-05,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11071471110380071,
|
||
|
|
"grad_norm": 0.21317435801029205,
|
||
|
|
"learning_rate": 1.81392961447852e-05,
|
||
|
|
"loss": 0.0189,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1111684599197999,
|
||
|
|
"grad_norm": 0.17048487067222595,
|
||
|
|
"learning_rate": 1.8122184374138973e-05,
|
||
|
|
"loss": 0.0183,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11162220873579907,
|
||
|
|
"grad_norm": 0.1683121770620346,
|
||
|
|
"learning_rate": 1.810500242898317e-05,
|
||
|
|
"loss": 0.0195,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11207595755179826,
|
||
|
|
"grad_norm": 0.15048103034496307,
|
||
|
|
"learning_rate": 1.808775045776733e-05,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11252970636779745,
|
||
|
|
"grad_norm": 0.21140700578689575,
|
||
|
|
"learning_rate": 1.8070428609546012e-05,
|
||
|
|
"loss": 0.0214,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11298345518379663,
|
||
|
|
"grad_norm": 0.18310706317424774,
|
||
|
|
"learning_rate": 1.8053037033977513e-05,
|
||
|
|
"loss": 0.0158,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11343720399979582,
|
||
|
|
"grad_norm": 0.20921604335308075,
|
||
|
|
"learning_rate": 1.803557588132254e-05,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11389095281579499,
|
||
|
|
"grad_norm": 0.20439156889915466,
|
||
|
|
"learning_rate": 1.8018045302442966e-05,
|
||
|
|
"loss": 0.0221,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11434470163179418,
|
||
|
|
"grad_norm": 0.19221094250679016,
|
||
|
|
"learning_rate": 1.8000445448800473e-05,
|
||
|
|
"loss": 0.017,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11479845044779337,
|
||
|
|
"grad_norm": 0.18272683024406433,
|
||
|
|
"learning_rate": 1.7982776472455274e-05,
|
||
|
|
"loss": 0.0187,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11525219926379254,
|
||
|
|
"grad_norm": 0.20446264743804932,
|
||
|
|
"learning_rate": 1.7965038526064796e-05,
|
||
|
|
"loss": 0.0253,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11570594807979173,
|
||
|
|
"grad_norm": 0.1428219974040985,
|
||
|
|
"learning_rate": 1.794723176288236e-05,
|
||
|
|
"loss": 0.0221,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1161596968957909,
|
||
|
|
"grad_norm": 0.3095003068447113,
|
||
|
|
"learning_rate": 1.7929356336755842e-05,
|
||
|
|
"loss": 0.019,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1166134457117901,
|
||
|
|
"grad_norm": 0.2172461450099945,
|
||
|
|
"learning_rate": 1.7911412402126366e-05,
|
||
|
|
"loss": 0.0209,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11706719452778928,
|
||
|
|
"grad_norm": 0.15558798611164093,
|
||
|
|
"learning_rate": 1.789340011402696e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11752094334378846,
|
||
|
|
"grad_norm": 0.20151139795780182,
|
||
|
|
"learning_rate": 1.7875319628081205e-05,
|
||
|
|
"loss": 0.0199,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11797469215978765,
|
||
|
|
"grad_norm": 0.2480895221233368,
|
||
|
|
"learning_rate": 1.785717110050192e-05,
|
||
|
|
"loss": 0.0196,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11842844097578682,
|
||
|
|
"grad_norm": 0.24924373626708984,
|
||
|
|
"learning_rate": 1.7838954688089777e-05,
|
||
|
|
"loss": 0.0195,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11888218979178601,
|
||
|
|
"grad_norm": 0.20633499324321747,
|
||
|
|
"learning_rate": 1.782067054823197e-05,
|
||
|
|
"loss": 0.0198,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1193359386077852,
|
||
|
|
"grad_norm": 0.2311122715473175,
|
||
|
|
"learning_rate": 1.7802318838900855e-05,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11978968742378437,
|
||
|
|
"grad_norm": 0.2317935824394226,
|
||
|
|
"learning_rate": 1.7783899718652563e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12024343623978356,
|
||
|
|
"grad_norm": 0.1745319962501526,
|
||
|
|
"learning_rate": 1.776541334662566e-05,
|
||
|
|
"loss": 0.0209,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12069718505578275,
|
||
|
|
"grad_norm": 0.19152095913887024,
|
||
|
|
"learning_rate": 1.7746859882539747e-05,
|
||
|
|
"loss": 0.0208,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12115093387178193,
|
||
|
|
"grad_norm": 0.21103863418102264,
|
||
|
|
"learning_rate": 1.7728239486694104e-05,
|
||
|
|
"loss": 0.0216,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12160468268778112,
|
||
|
|
"grad_norm": 0.11558584868907928,
|
||
|
|
"learning_rate": 1.7709552319966275e-05,
|
||
|
|
"loss": 0.0195,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12205843150378029,
|
||
|
|
"grad_norm": 0.19106024503707886,
|
||
|
|
"learning_rate": 1.7690798543810715e-05,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12251218031977948,
|
||
|
|
"grad_norm": 0.22221724689006805,
|
||
|
|
"learning_rate": 1.7671978320257356e-05,
|
||
|
|
"loss": 0.0193,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12296592913577867,
|
||
|
|
"grad_norm": 0.2026868611574173,
|
||
|
|
"learning_rate": 1.7653091811910236e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12341967795177784,
|
||
|
|
"grad_norm": 0.16690614819526672,
|
||
|
|
"learning_rate": 1.763413918194608e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12387342676777703,
|
||
|
|
"grad_norm": 0.13100586831569672,
|
||
|
|
"learning_rate": 1.7615120594112895e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1243271755837762,
|
||
|
|
"grad_norm": 0.20827825367450714,
|
||
|
|
"learning_rate": 1.7596036212728558e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1247809243997754,
|
||
|
|
"grad_norm": 0.11902619153261185,
|
||
|
|
"learning_rate": 1.757688620267939e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12523467321577458,
|
||
|
|
"grad_norm": 0.18187430500984192,
|
||
|
|
"learning_rate": 1.755767072941874e-05,
|
||
|
|
"loss": 0.0187,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12568842203177377,
|
||
|
|
"grad_norm": 0.1461418867111206,
|
||
|
|
"learning_rate": 1.7538389958965537e-05,
|
||
|
|
"loss": 0.0207,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12614217084777293,
|
||
|
|
"grad_norm": 0.1332821249961853,
|
||
|
|
"learning_rate": 1.7519044057902877e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12659591966377212,
|
||
|
|
"grad_norm": 0.16073133051395416,
|
||
|
|
"learning_rate": 1.749963319337658e-05,
|
||
|
|
"loss": 0.0219,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1270496684797713,
|
||
|
|
"grad_norm": 0.1910991072654724,
|
||
|
|
"learning_rate": 1.748015753309373e-05,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1275034172957705,
|
||
|
|
"grad_norm": 0.1660872995853424,
|
||
|
|
"learning_rate": 1.746061724532124e-05,
|
||
|
|
"loss": 0.0192,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1279571661117697,
|
||
|
|
"grad_norm": 0.1372659057378769,
|
||
|
|
"learning_rate": 1.7441012498884402e-05,
|
||
|
|
"loss": 0.0158,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12841091492776885,
|
||
|
|
"grad_norm": 0.14543624222278595,
|
||
|
|
"learning_rate": 1.7421343463165415e-05,
|
||
|
|
"loss": 0.0192,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12886466374376804,
|
||
|
|
"grad_norm": 0.2286180555820465,
|
||
|
|
"learning_rate": 1.7401610308101933e-05,
|
||
|
|
"loss": 0.0205,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12931841255976723,
|
||
|
|
"grad_norm": 0.11448660492897034,
|
||
|
|
"learning_rate": 1.7381813204185585e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12977216137576641,
|
||
|
|
"grad_norm": 0.10879474133253098,
|
||
|
|
"learning_rate": 1.7361952322460513e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1302259101917656,
|
||
|
|
"grad_norm": 0.17434784770011902,
|
||
|
|
"learning_rate": 1.7342027834521896e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13067965900776476,
|
||
|
|
"grad_norm": 0.21135075390338898,
|
||
|
|
"learning_rate": 1.7322039912514453e-05,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13113340782376395,
|
||
|
|
"grad_norm": 0.1227855533361435,
|
||
|
|
"learning_rate": 1.7301988729130964e-05,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13158715663976314,
|
||
|
|
"grad_norm": 0.1610281616449356,
|
||
|
|
"learning_rate": 1.7281874457610787e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13204090545576233,
|
||
|
|
"grad_norm": 0.14950548112392426,
|
||
|
|
"learning_rate": 1.7261697271738337e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13249465427176152,
|
||
|
|
"grad_norm": 0.19927935302257538,
|
||
|
|
"learning_rate": 1.724145734584162e-05,
|
||
|
|
"loss": 0.016,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13294840308776068,
|
||
|
|
"grad_norm": 0.12208227068185806,
|
||
|
|
"learning_rate": 1.7221154854790696e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13340215190375987,
|
||
|
|
"grad_norm": 0.19058668613433838,
|
||
|
|
"learning_rate": 1.7200789973996172e-05,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13385590071975906,
|
||
|
|
"grad_norm": 0.1380406767129898,
|
||
|
|
"learning_rate": 1.7180362879407707e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13430964953575825,
|
||
|
|
"grad_norm": 0.1319362074136734,
|
||
|
|
"learning_rate": 1.7159873747512472e-05,
|
||
|
|
"loss": 0.0181,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13476339835175744,
|
||
|
|
"grad_norm": 0.16067825257778168,
|
||
|
|
"learning_rate": 1.713932275533363e-05,
|
||
|
|
"loss": 0.0186,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1352171471677566,
|
||
|
|
"grad_norm": 0.1704990565776825,
|
||
|
|
"learning_rate": 1.7118710080428807e-05,
|
||
|
|
"loss": 0.0187,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13567089598375578,
|
||
|
|
"grad_norm": 0.22068627178668976,
|
||
|
|
"learning_rate": 1.7098035900888566e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13612464479975497,
|
||
|
|
"grad_norm": 0.15751402080059052,
|
||
|
|
"learning_rate": 1.7077300395334857e-05,
|
||
|
|
"loss": 0.0216,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13657839361575416,
|
||
|
|
"grad_norm": 0.26060059666633606,
|
||
|
|
"learning_rate": 1.7056503742919476e-05,
|
||
|
|
"loss": 0.0183,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13703214243175335,
|
||
|
|
"grad_norm": 0.25333723425865173,
|
||
|
|
"learning_rate": 1.703564612332252e-05,
|
||
|
|
"loss": 0.0169,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1374858912477525,
|
||
|
|
"grad_norm": 0.11502993106842041,
|
||
|
|
"learning_rate": 1.7014727716750842e-05,
|
||
|
|
"loss": 0.0177,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1379396400637517,
|
||
|
|
"grad_norm": 0.15380710363388062,
|
||
|
|
"learning_rate": 1.699374870393647e-05,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1383933888797509,
|
||
|
|
"grad_norm": 0.12410028278827667,
|
||
|
|
"learning_rate": 1.697270926613507e-05,
|
||
|
|
"loss": 0.0169,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13884713769575008,
|
||
|
|
"grad_norm": 0.22475175559520721,
|
||
|
|
"learning_rate": 1.6951609585124377e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13930088651174927,
|
||
|
|
"grad_norm": 0.1766625940799713,
|
||
|
|
"learning_rate": 1.6930449843202607e-05,
|
||
|
|
"loss": 0.0179,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13975463532774846,
|
||
|
|
"grad_norm": 0.1324724704027176,
|
||
|
|
"learning_rate": 1.69092302231869e-05,
|
||
|
|
"loss": 0.0198,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14020838414374762,
|
||
|
|
"grad_norm": 0.13342659175395966,
|
||
|
|
"learning_rate": 1.688795090841173e-05,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1406621329597468,
|
||
|
|
"grad_norm": 0.17313902080059052,
|
||
|
|
"learning_rate": 1.686661208272734e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.141115881775746,
|
||
|
|
"grad_norm": 0.1365835964679718,
|
||
|
|
"learning_rate": 1.6845213930498122e-05,
|
||
|
|
"loss": 0.0189,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14156963059174518,
|
||
|
|
"grad_norm": 0.11911448836326599,
|
||
|
|
"learning_rate": 1.682375663660104e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14202337940774437,
|
||
|
|
"grad_norm": 0.15301299095153809,
|
||
|
|
"learning_rate": 1.680224038642405e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14247712822374353,
|
||
|
|
"grad_norm": 0.18948419392108917,
|
||
|
|
"learning_rate": 1.6780665365864465e-05,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14293087703974272,
|
||
|
|
"grad_norm": 0.17763479053974152,
|
||
|
|
"learning_rate": 1.675903176132737e-05,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1433846258557419,
|
||
|
|
"grad_norm": 0.14466649293899536,
|
||
|
|
"learning_rate": 1.6737339759724016e-05,
|
||
|
|
"loss": 0.0177,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1438383746717411,
|
||
|
|
"grad_norm": 0.1917739361524582,
|
||
|
|
"learning_rate": 1.6715589548470187e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1442921234877403,
|
||
|
|
"grad_norm": 0.15840497612953186,
|
||
|
|
"learning_rate": 1.669378131548459e-05,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14474587230373945,
|
||
|
|
"grad_norm": 0.13481085002422333,
|
||
|
|
"learning_rate": 1.6671915249187237e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14519962111973864,
|
||
|
|
"grad_norm": 0.1597055047750473,
|
||
|
|
"learning_rate": 1.6649991538497808e-05,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14565336993573783,
|
||
|
|
"grad_norm": 0.19727839529514313,
|
||
|
|
"learning_rate": 1.6628010372834028e-05,
|
||
|
|
"loss": 0.0196,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14610711875173701,
|
||
|
|
"grad_norm": 0.15541918575763702,
|
||
|
|
"learning_rate": 1.660597194211001e-05,
|
||
|
|
"loss": 0.0177,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1465608675677362,
|
||
|
|
"grad_norm": 0.15050934255123138,
|
||
|
|
"learning_rate": 1.6583876436734646e-05,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14701461638373536,
|
||
|
|
"grad_norm": 0.18570442497730255,
|
||
|
|
"learning_rate": 1.6561724047609936e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14746836519973455,
|
||
|
|
"grad_norm": 0.22303296625614166,
|
||
|
|
"learning_rate": 1.653951496612935e-05,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14792211401573374,
|
||
|
|
"grad_norm": 0.14683152735233307,
|
||
|
|
"learning_rate": 1.6517249384176163e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14837586283173293,
|
||
|
|
"grad_norm": 0.32259002327919006,
|
||
|
|
"learning_rate": 1.6494927494121827e-05,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14882961164773212,
|
||
|
|
"grad_norm": 0.1250266432762146,
|
||
|
|
"learning_rate": 1.647254948882426e-05,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14928336046373128,
|
||
|
|
"grad_norm": 0.14093117415905,
|
||
|
|
"learning_rate": 1.6450115561626237e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14973710927973047,
|
||
|
|
"grad_norm": 0.13918519020080566,
|
||
|
|
"learning_rate": 1.6427625906353667e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15019085809572966,
|
||
|
|
"grad_norm": 0.1556759774684906,
|
||
|
|
"learning_rate": 1.640508071731395e-05,
|
||
|
|
"loss": 0.0194,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15064460691172885,
|
||
|
|
"grad_norm": 0.15330232679843903,
|
||
|
|
"learning_rate": 1.6382480189294293e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15109835572772803,
|
||
|
|
"grad_norm": 0.22334079444408417,
|
||
|
|
"learning_rate": 1.635982451756002e-05,
|
||
|
|
"loss": 0.0185,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1515521045437272,
|
||
|
|
"grad_norm": 0.14111435413360596,
|
||
|
|
"learning_rate": 1.6337113897852887e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15200585335972638,
|
||
|
|
"grad_norm": 0.19717109203338623,
|
||
|
|
"learning_rate": 1.6314348526389396e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15245960217572557,
|
||
|
|
"grad_norm": 0.18821868300437927,
|
||
|
|
"learning_rate": 1.6291528599859102e-05,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15291335099172476,
|
||
|
|
"grad_norm": 0.21783433854579926,
|
||
|
|
"learning_rate": 1.6268654315422892e-05,
|
||
|
|
"loss": 0.0192,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15336709980772395,
|
||
|
|
"grad_norm": 0.18887248635292053,
|
||
|
|
"learning_rate": 1.6245725870711314e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1538208486237231,
|
||
|
|
"grad_norm": 0.13798531889915466,
|
||
|
|
"learning_rate": 1.6222743463822842e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1542745974397223,
|
||
|
|
"grad_norm": 0.11533325910568237,
|
||
|
|
"learning_rate": 1.6199707293322183e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1547283462557215,
|
||
|
|
"grad_norm": 0.15065830945968628,
|
||
|
|
"learning_rate": 1.6176617558238548e-05,
|
||
|
|
"loss": 0.0203,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15518209507172068,
|
||
|
|
"grad_norm": 0.15858496725559235,
|
||
|
|
"learning_rate": 1.615347445806394e-05,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15563584388771987,
|
||
|
|
"grad_norm": 0.14009806513786316,
|
||
|
|
"learning_rate": 1.613027819275143e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15608959270371903,
|
||
|
|
"grad_norm": 0.10621248185634613,
|
||
|
|
"learning_rate": 1.6107028962713433e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15654334151971822,
|
||
|
|
"grad_norm": 0.1479751616716385,
|
||
|
|
"learning_rate": 1.608372696881996e-05,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1569970903357174,
|
||
|
|
"grad_norm": 0.1862780600786209,
|
||
|
|
"learning_rate": 1.60603724123969e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1574508391517166,
|
||
|
|
"grad_norm": 0.12772873044013977,
|
||
|
|
"learning_rate": 1.603696549522428e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15790458796771578,
|
||
|
|
"grad_norm": 0.1471777707338333,
|
||
|
|
"learning_rate": 1.6013506419534505e-05,
|
||
|
|
"loss": 0.02,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15835833678371494,
|
||
|
|
"grad_norm": 0.1465907096862793,
|
||
|
|
"learning_rate": 1.598999538801064e-05,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15881208559971413,
|
||
|
|
"grad_norm": 0.1613445281982422,
|
||
|
|
"learning_rate": 1.5966432603784615e-05,
|
||
|
|
"loss": 0.0144,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15926583441571332,
|
||
|
|
"grad_norm": 0.12628807127475739,
|
||
|
|
"learning_rate": 1.594281827043552e-05,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1597195832317125,
|
||
|
|
"grad_norm": 0.22475102543830872,
|
||
|
|
"learning_rate": 1.5919152591987814e-05,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1601733320477117,
|
||
|
|
"grad_norm": 0.1369459331035614,
|
||
|
|
"learning_rate": 1.5895435772909564e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16062708086371086,
|
||
|
|
"grad_norm": 0.14013464748859406,
|
||
|
|
"learning_rate": 1.5871668018110694e-05,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16108082967971005,
|
||
|
|
"grad_norm": 0.1429699957370758,
|
||
|
|
"learning_rate": 1.5847849532941196e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16153457849570924,
|
||
|
|
"grad_norm": 0.1818908452987671,
|
||
|
|
"learning_rate": 1.5823980523189373e-05,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16198832731170842,
|
||
|
|
"grad_norm": 0.20686101913452148,
|
||
|
|
"learning_rate": 1.580006119508005e-05,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1624420761277076,
|
||
|
|
"grad_norm": 0.210786372423172,
|
||
|
|
"learning_rate": 1.5776091755272792e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16289582494370677,
|
||
|
|
"grad_norm": 0.1615171581506729,
|
||
|
|
"learning_rate": 1.5752072410860132e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16334957375970596,
|
||
|
|
"grad_norm": 0.20601245760917664,
|
||
|
|
"learning_rate": 1.5728003369365763e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16380332257570515,
|
||
|
|
"grad_norm": 0.1881730854511261,
|
||
|
|
"learning_rate": 1.5703884838742755e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16425707139170434,
|
||
|
|
"grad_norm": 0.19188909232616425,
|
||
|
|
"learning_rate": 1.5679717027371756e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16471082020770353,
|
||
|
|
"grad_norm": 0.15665607154369354,
|
||
|
|
"learning_rate": 1.5655500144059202e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16516456902370272,
|
||
|
|
"grad_norm": 0.12721318006515503,
|
||
|
|
"learning_rate": 1.5631234398035483e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16561831783970188,
|
||
|
|
"grad_norm": 0.1763559728860855,
|
||
|
|
"learning_rate": 1.5606919998953182e-05,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16607206665570107,
|
||
|
|
"grad_norm": 0.21826671063899994,
|
||
|
|
"learning_rate": 1.5582557156885218e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16652581547170026,
|
||
|
|
"grad_norm": 0.1324024349451065,
|
||
|
|
"learning_rate": 1.5558146082323056e-05,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16697956428769944,
|
||
|
|
"grad_norm": 0.18612372875213623,
|
||
|
|
"learning_rate": 1.5533686986174885e-05,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16743331310369863,
|
||
|
|
"grad_norm": 0.15666766464710236,
|
||
|
|
"learning_rate": 1.5509180079763794e-05,
|
||
|
|
"loss": 0.0198,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1678870619196978,
|
||
|
|
"grad_norm": 0.14181271195411682,
|
||
|
|
"learning_rate": 1.548462557482594e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16834081073569698,
|
||
|
|
"grad_norm": 0.12797847390174866,
|
||
|
|
"learning_rate": 1.546002368350873e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16879455955169617,
|
||
|
|
"grad_norm": 0.18827445805072784,
|
||
|
|
"learning_rate": 1.5435374618368987e-05,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16924830836769536,
|
||
|
|
"grad_norm": 0.24247965216636658,
|
||
|
|
"learning_rate": 1.5410678592371097e-05,
|
||
|
|
"loss": 0.0186,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16970205718369455,
|
||
|
|
"grad_norm": 0.15781782567501068,
|
||
|
|
"learning_rate": 1.5385935818885185e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1701558059996937,
|
||
|
|
"grad_norm": 0.1117536649107933,
|
||
|
|
"learning_rate": 1.5361146511685275e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1706095548156929,
|
||
|
|
"grad_norm": 0.12188609689474106,
|
||
|
|
"learning_rate": 1.5336310884947424e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1710633036316921,
|
||
|
|
"grad_norm": 0.25685781240463257,
|
||
|
|
"learning_rate": 1.5311429153247898e-05,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17151705244769128,
|
||
|
|
"grad_norm": 0.19103960692882538,
|
||
|
|
"learning_rate": 1.5286501531561292e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17197080126369046,
|
||
|
|
"grad_norm": 0.16486330330371857,
|
||
|
|
"learning_rate": 1.526152823525868e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17242455007968963,
|
||
|
|
"grad_norm": 0.18085534870624542,
|
||
|
|
"learning_rate": 1.5236509480105781e-05,
|
||
|
|
"loss": 0.0188,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17287829889568881,
|
||
|
|
"grad_norm": 0.1550951898097992,
|
||
|
|
"learning_rate": 1.5211445482261039e-05,
|
||
|
|
"loss": 0.0194,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.173332047711688,
|
||
|
|
"grad_norm": 0.137322798371315,
|
||
|
|
"learning_rate": 1.5186336458273809e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1737857965276872,
|
||
|
|
"grad_norm": 0.17041611671447754,
|
||
|
|
"learning_rate": 1.5161182625082469e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17423954534368638,
|
||
|
|
"grad_norm": 0.1689881682395935,
|
||
|
|
"learning_rate": 1.5135984200012526e-05,
|
||
|
|
"loss": 0.0188,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17469329415968554,
|
||
|
|
"grad_norm": 0.1273307353258133,
|
||
|
|
"learning_rate": 1.511074140077477e-05,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17514704297568473,
|
||
|
|
"grad_norm": 0.17848558723926544,
|
||
|
|
"learning_rate": 1.5085454445463367e-05,
|
||
|
|
"loss": 0.0177,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17560079179168392,
|
||
|
|
"grad_norm": 0.12270861119031906,
|
||
|
|
"learning_rate": 1.506012355255399e-05,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1760545406076831,
|
||
|
|
"grad_norm": 0.19836512207984924,
|
||
|
|
"learning_rate": 1.503474894090193e-05,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1765082894236823,
|
||
|
|
"grad_norm": 0.1289934664964676,
|
||
|
|
"learning_rate": 1.5009330829740183e-05,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17696203823968146,
|
||
|
|
"grad_norm": 0.09760569036006927,
|
||
|
|
"learning_rate": 1.4983869438677605e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17741578705568065,
|
||
|
|
"grad_norm": 0.19151850044727325,
|
||
|
|
"learning_rate": 1.4958364987696956e-05,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17786953587167983,
|
||
|
|
"grad_norm": 0.1450025737285614,
|
||
|
|
"learning_rate": 1.4932817697153046e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17832328468767902,
|
||
|
|
"grad_norm": 0.17141938209533691,
|
||
|
|
"learning_rate": 1.4907227787770805e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1787770335036782,
|
||
|
|
"grad_norm": 0.15546472370624542,
|
||
|
|
"learning_rate": 1.4881595480643379e-05,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17923078231967737,
|
||
|
|
"grad_norm": 0.11263284087181091,
|
||
|
|
"learning_rate": 1.4855920997230238e-05,
|
||
|
|
"loss": 0.016,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17968453113567656,
|
||
|
|
"grad_norm": 0.1784234344959259,
|
||
|
|
"learning_rate": 1.4830204559355234e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18013827995167575,
|
||
|
|
"grad_norm": 0.126837357878685,
|
||
|
|
"learning_rate": 1.4804446389204715e-05,
|
||
|
|
"loss": 0.0144,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18059202876767494,
|
||
|
|
"grad_norm": 0.18918831646442413,
|
||
|
|
"learning_rate": 1.4778646709325573e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18104577758367413,
|
||
|
|
"grad_norm": 0.13399192690849304,
|
||
|
|
"learning_rate": 1.4752805742623349e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1814995263996733,
|
||
|
|
"grad_norm": 0.22408077120780945,
|
||
|
|
"learning_rate": 1.47269237123603e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18195327521567248,
|
||
|
|
"grad_norm": 0.07992484420537949,
|
||
|
|
"learning_rate": 1.470100084215345e-05,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18240702403167167,
|
||
|
|
"grad_norm": 0.25986817479133606,
|
||
|
|
"learning_rate": 1.4675037355972693e-05,
|
||
|
|
"loss": 0.0201,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18286077284767085,
|
||
|
|
"grad_norm": 0.16225387156009674,
|
||
|
|
"learning_rate": 1.4649033478138825e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18331452166367004,
|
||
|
|
"grad_norm": 0.13349366188049316,
|
||
|
|
"learning_rate": 1.4622989433321627e-05,
|
||
|
|
"loss": 0.0218,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1837682704796692,
|
||
|
|
"grad_norm": 0.1226244643330574,
|
||
|
|
"learning_rate": 1.459690544653792e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1842220192956684,
|
||
|
|
"grad_norm": 0.1379251480102539,
|
||
|
|
"learning_rate": 1.457078174314961e-05,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18467576811166758,
|
||
|
|
"grad_norm": 0.131155326962471,
|
||
|
|
"learning_rate": 1.4544618548861753e-05,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18512951692766677,
|
||
|
|
"grad_norm": 0.12272419035434723,
|
||
|
|
"learning_rate": 1.45184160897206e-05,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18558326574366596,
|
||
|
|
"grad_norm": 0.13305765390396118,
|
||
|
|
"learning_rate": 1.4492174592111642e-05,
|
||
|
|
"loss": 0.0182,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18603701455966512,
|
||
|
|
"grad_norm": 0.17420318722724915,
|
||
|
|
"learning_rate": 1.4465894282757662e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1864907633756643,
|
||
|
|
"grad_norm": 0.15594936907291412,
|
||
|
|
"learning_rate": 1.4439575388716768e-05,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1869445121916635,
|
||
|
|
"grad_norm": 0.14801789820194244,
|
||
|
|
"learning_rate": 1.441321813738044e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1873982610076627,
|
||
|
|
"grad_norm": 0.19852624833583832,
|
||
|
|
"learning_rate": 1.4386822756471545e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18785200982366188,
|
||
|
|
"grad_norm": 0.08428862690925598,
|
||
|
|
"learning_rate": 1.43603894740424e-05,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18830575863966104,
|
||
|
|
"grad_norm": 0.18242783844470978,
|
||
|
|
"learning_rate": 1.4333918518472773e-05,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18875950745566022,
|
||
|
|
"grad_norm": 0.14235183596611023,
|
||
|
|
"learning_rate": 1.4307410118467932e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1892132562716594,
|
||
|
|
"grad_norm": 0.10563075542449951,
|
||
|
|
"learning_rate": 1.428086450305666e-05,
|
||
|
|
"loss": 0.0158,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1896670050876586,
|
||
|
|
"grad_norm": 0.16389837861061096,
|
||
|
|
"learning_rate": 1.4254281901589263e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1901207539036578,
|
||
|
|
"grad_norm": 0.11111581325531006,
|
||
|
|
"learning_rate": 1.4227662543735618e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19057450271965698,
|
||
|
|
"grad_norm": 0.15358184278011322,
|
||
|
|
"learning_rate": 1.4201006659483156e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19102825153565614,
|
||
|
|
"grad_norm": 0.18707069754600525,
|
||
|
|
"learning_rate": 1.4174314479134909e-05,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19148200035165533,
|
||
|
|
"grad_norm": 0.1715817004442215,
|
||
|
|
"learning_rate": 1.4147586233307485e-05,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19193574916765452,
|
||
|
|
"grad_norm": 0.17705568671226501,
|
||
|
|
"learning_rate": 1.4120822152929099e-05,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1923894979836537,
|
||
|
|
"grad_norm": 0.19288791716098785,
|
||
|
|
"learning_rate": 1.4094022469237577e-05,
|
||
|
|
"loss": 0.0175,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1928432467996529,
|
||
|
|
"grad_norm": 0.0979590117931366,
|
||
|
|
"learning_rate": 1.4067187413778338e-05,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19329699561565206,
|
||
|
|
"grad_norm": 0.15775372087955475,
|
||
|
|
"learning_rate": 1.4040317218402426e-05,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19375074443165125,
|
||
|
|
"grad_norm": 0.17242823541164398,
|
||
|
|
"learning_rate": 1.4013412115264477e-05,
|
||
|
|
"loss": 0.0184,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19420449324765043,
|
||
|
|
"grad_norm": 0.15347068011760712,
|
||
|
|
"learning_rate": 1.398647233682073e-05,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19465824206364962,
|
||
|
|
"grad_norm": 0.14956066012382507,
|
||
|
|
"learning_rate": 1.3959498115827007e-05,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1951119908796488,
|
||
|
|
"grad_norm": 0.1466713845729828,
|
||
|
|
"learning_rate": 1.3932489685336722e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19556573969564797,
|
||
|
|
"grad_norm": 0.15853281319141388,
|
||
|
|
"learning_rate": 1.3905447278698838e-05,
|
||
|
|
"loss": 0.0162,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19601948851164716,
|
||
|
|
"grad_norm": 0.16414372622966766,
|
||
|
|
"learning_rate": 1.3878371129555874e-05,
|
||
|
|
"loss": 0.0137,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19647323732764635,
|
||
|
|
"grad_norm": 0.13074234127998352,
|
||
|
|
"learning_rate": 1.3851261471841891e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19692698614364554,
|
||
|
|
"grad_norm": 0.1296066790819168,
|
||
|
|
"learning_rate": 1.382411853978044e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19738073495964473,
|
||
|
|
"grad_norm": 0.11560890078544617,
|
||
|
|
"learning_rate": 1.3796942567882565e-05,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1978344837756439,
|
||
|
|
"grad_norm": 0.14548613131046295,
|
||
|
|
"learning_rate": 1.3769733790944777e-05,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19828823259164308,
|
||
|
|
"grad_norm": 0.1252041906118393,
|
||
|
|
"learning_rate": 1.3742492444047e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19874198140764227,
|
||
|
|
"grad_norm": 0.18141184747219086,
|
||
|
|
"learning_rate": 1.3715218762550584e-05,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19919573022364145,
|
||
|
|
"grad_norm": 0.14405906200408936,
|
||
|
|
"learning_rate": 1.368791298209622e-05,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19964947903964064,
|
||
|
|
"grad_norm": 0.14750778675079346,
|
||
|
|
"learning_rate": 1.3660575338601945e-05,
|
||
|
|
"loss": 0.0135,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2001032278556398,
|
||
|
|
"grad_norm": 0.14470447599887848,
|
||
|
|
"learning_rate": 1.363320606826108e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.200556976671639,
|
||
|
|
"grad_norm": 0.19202418625354767,
|
||
|
|
"learning_rate": 1.36058054075402e-05,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20101072548763818,
|
||
|
|
"grad_norm": 0.14392217993736267,
|
||
|
|
"learning_rate": 1.3578373593177091e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20146447430363737,
|
||
|
|
"grad_norm": 0.19024603068828583,
|
||
|
|
"learning_rate": 1.35509108621787e-05,
|
||
|
|
"loss": 0.0158,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20191822311963656,
|
||
|
|
"grad_norm": 0.19015447795391083,
|
||
|
|
"learning_rate": 1.3523417451819087e-05,
|
||
|
|
"loss": 0.0144,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20237197193563572,
|
||
|
|
"grad_norm": 0.1269548237323761,
|
||
|
|
"learning_rate": 1.3495893599637385e-05,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2028257207516349,
|
||
|
|
"grad_norm": 0.5116326808929443,
|
||
|
|
"learning_rate": 1.3468339543435725e-05,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2032794695676341,
|
||
|
|
"grad_norm": 0.1450648307800293,
|
||
|
|
"learning_rate": 1.3440755521277209e-05,
|
||
|
|
"loss": 0.0174,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20373321838363329,
|
||
|
|
"grad_norm": 0.10610708594322205,
|
||
|
|
"learning_rate": 1.3413141771483842e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20418696719963247,
|
||
|
|
"grad_norm": 0.15620329976081848,
|
||
|
|
"learning_rate": 1.3385498532634465e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20464071601563164,
|
||
|
|
"grad_norm": 0.19468973577022552,
|
||
|
|
"learning_rate": 1.3357826043562698e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20509446483163082,
|
||
|
|
"grad_norm": 0.1022220253944397,
|
||
|
|
"learning_rate": 1.3330124543354888e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20554821364763,
|
||
|
|
"grad_norm": 0.21672423183918,
|
||
|
|
"learning_rate": 1.3302394271348026e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2060019624636292,
|
||
|
|
"grad_norm": 0.2700170576572418,
|
||
|
|
"learning_rate": 1.3274635467127688e-05,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2064557112796284,
|
||
|
|
"grad_norm": 0.1277543008327484,
|
||
|
|
"learning_rate": 1.3246848370525973e-05,
|
||
|
|
"loss": 0.0158,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20690946009562755,
|
||
|
|
"grad_norm": 0.1226678118109703,
|
||
|
|
"learning_rate": 1.3219033221619408e-05,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20736320891162674,
|
||
|
|
"grad_norm": 0.13949912786483765,
|
||
|
|
"learning_rate": 1.3191190260726903e-05,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20781695772762593,
|
||
|
|
"grad_norm": 0.19324158132076263,
|
||
|
|
"learning_rate": 1.3163319728407645e-05,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20827070654362512,
|
||
|
|
"grad_norm": 0.2552565336227417,
|
||
|
|
"learning_rate": 1.3135421865459042e-05,
|
||
|
|
"loss": 0.0175,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2087244553596243,
|
||
|
|
"grad_norm": 0.12216931581497192,
|
||
|
|
"learning_rate": 1.3107496912914636e-05,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20917820417562347,
|
||
|
|
"grad_norm": 0.1292363703250885,
|
||
|
|
"learning_rate": 1.307954511204202e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20963195299162266,
|
||
|
|
"grad_norm": 0.13501308858394623,
|
||
|
|
"learning_rate": 1.3051566704340746e-05,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21008570180762184,
|
||
|
|
"grad_norm": 0.14220742881298065,
|
||
|
|
"learning_rate": 1.3023561931540247e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21053945062362103,
|
||
|
|
"grad_norm": 0.14298029243946075,
|
||
|
|
"learning_rate": 1.2995531035597753e-05,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21099319943962022,
|
||
|
|
"grad_norm": 0.17490971088409424,
|
||
|
|
"learning_rate": 1.2967474258696186e-05,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21144694825561938,
|
||
|
|
"grad_norm": 0.1275247186422348,
|
||
|
|
"learning_rate": 1.2939391843242082e-05,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21190069707161857,
|
||
|
|
"grad_norm": 0.08051750063896179,
|
||
|
|
"learning_rate": 1.291128403186349e-05,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21235444588761776,
|
||
|
|
"grad_norm": 0.21460536122322083,
|
||
|
|
"learning_rate": 1.2883151067407866e-05,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21280819470361695,
|
||
|
|
"grad_norm": 0.09482090175151825,
|
||
|
|
"learning_rate": 1.2854993192940005e-05,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21326194351961614,
|
||
|
|
"grad_norm": 0.12067610770463943,
|
||
|
|
"learning_rate": 1.2826810651739899e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2137156923356153,
|
||
|
|
"grad_norm": 0.12952324748039246,
|
||
|
|
"learning_rate": 1.279860368730067e-05,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2141694411516145,
|
||
|
|
"grad_norm": 0.11326833069324493,
|
||
|
|
"learning_rate": 1.2770372543326454e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21462318996761368,
|
||
|
|
"grad_norm": 0.13386495411396027,
|
||
|
|
"learning_rate": 1.2742117463730289e-05,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21507693878361286,
|
||
|
|
"grad_norm": 0.12700419127941132,
|
||
|
|
"learning_rate": 1.2713838692632015e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21553068759961205,
|
||
|
|
"grad_norm": 0.13237282633781433,
|
||
|
|
"learning_rate": 1.2685536474356161e-05,
|
||
|
|
"loss": 0.0157,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21598443641561124,
|
||
|
|
"grad_norm": 0.12272775173187256,
|
||
|
|
"learning_rate": 1.2657211053429844e-05,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2164381852316104,
|
||
|
|
"grad_norm": 0.2121793031692505,
|
||
|
|
"learning_rate": 1.2628862674580642e-05,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2168919340476096,
|
||
|
|
"grad_norm": 0.14809030294418335,
|
||
|
|
"learning_rate": 1.2600491582734484e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21734568286360878,
|
||
|
|
"grad_norm": 0.16274145245552063,
|
||
|
|
"learning_rate": 1.2572098023013544e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21779943167960797,
|
||
|
|
"grad_norm": 0.12132669985294342,
|
||
|
|
"learning_rate": 1.254368224073411e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21825318049560716,
|
||
|
|
"grad_norm": 0.13608211278915405,
|
||
|
|
"learning_rate": 1.251524448140447e-05,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21870692931160632,
|
||
|
|
"grad_norm": 0.1499515175819397,
|
||
|
|
"learning_rate": 1.2486784990722791e-05,
|
||
|
|
"loss": 0.0169,
|
||
|
|
"step": 4820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2191606781276055,
|
||
|
|
"grad_norm": 0.08290209621191025,
|
||
|
|
"learning_rate": 1.2458304014574996e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 4830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2196144269436047,
|
||
|
|
"grad_norm": 0.19991664588451385,
|
||
|
|
"learning_rate": 1.242980179903264e-05,
|
||
|
|
"loss": 0.0172,
|
||
|
|
"step": 4840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22006817575960388,
|
||
|
|
"grad_norm": 0.15450483560562134,
|
||
|
|
"learning_rate": 1.2401278590350782e-05,
|
||
|
|
"loss": 0.0173,
|
||
|
|
"step": 4850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22052192457560307,
|
||
|
|
"grad_norm": 0.14158503711223602,
|
||
|
|
"learning_rate": 1.2372734634965861e-05,
|
||
|
|
"loss": 0.0159,
|
||
|
|
"step": 4860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22097567339160223,
|
||
|
|
"grad_norm": 0.16519369184970856,
|
||
|
|
"learning_rate": 1.234417017949356e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 4870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22142942220760142,
|
||
|
|
"grad_norm": 0.1639871746301651,
|
||
|
|
"learning_rate": 1.2315585470726685e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 4880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2218831710236006,
|
||
|
|
"grad_norm": 0.1480032354593277,
|
||
|
|
"learning_rate": 1.2286980755633027e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 4890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2223369198395998,
|
||
|
|
"grad_norm": 0.1687663197517395,
|
||
|
|
"learning_rate": 1.225835628135322e-05,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 4900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.222790668655599,
|
||
|
|
"grad_norm": 0.19108609855175018,
|
||
|
|
"learning_rate": 1.2229712295198633e-05,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 4910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22324441747159815,
|
||
|
|
"grad_norm": 0.14848002791404724,
|
||
|
|
"learning_rate": 1.2201049044649192e-05,
|
||
|
|
"loss": 0.0165,
|
||
|
|
"step": 4920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22369816628759734,
|
||
|
|
"grad_norm": 0.16095581650733948,
|
||
|
|
"learning_rate": 1.217236677735128e-05,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 4930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22415191510359653,
|
||
|
|
"grad_norm": 0.11495261639356613,
|
||
|
|
"learning_rate": 1.2143665741115581e-05,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 4940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22460566391959572,
|
||
|
|
"grad_norm": 0.1297575980424881,
|
||
|
|
"learning_rate": 1.2114946183914935e-05,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 4950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2250594127355949,
|
||
|
|
"grad_norm": 0.2002328485250473,
|
||
|
|
"learning_rate": 1.2086208353882203e-05,
|
||
|
|
"loss": 0.0181,
|
||
|
|
"step": 4960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22551316155159407,
|
||
|
|
"grad_norm": 0.1517852246761322,
|
||
|
|
"learning_rate": 1.2057452499308117e-05,
|
||
|
|
"loss": 0.016,
|
||
|
|
"step": 4970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22596691036759325,
|
||
|
|
"grad_norm": 0.1382087916135788,
|
||
|
|
"learning_rate": 1.2028678868639147e-05,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 4980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22642065918359244,
|
||
|
|
"grad_norm": 0.16167710721492767,
|
||
|
|
"learning_rate": 1.1999887710475337e-05,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 4990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22687440799959163,
|
||
|
|
"grad_norm": 0.16487346589565277,
|
||
|
|
"learning_rate": 1.197107927356817e-05,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22732815681559082,
|
||
|
|
"grad_norm": 0.16545337438583374,
|
||
|
|
"learning_rate": 1.1942253806818414e-05,
|
||
|
|
"loss": 0.016,
|
||
|
|
"step": 5010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22778190563158998,
|
||
|
|
"grad_norm": 0.13121789693832397,
|
||
|
|
"learning_rate": 1.1913411559273973e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 5020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22823565444758917,
|
||
|
|
"grad_norm": 0.13568609952926636,
|
||
|
|
"learning_rate": 1.1884552780127736e-05,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"step": 5030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22868940326358836,
|
||
|
|
"grad_norm": 0.12294431775808334,
|
||
|
|
"learning_rate": 1.1855677718715417e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 5040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22914315207958755,
|
||
|
|
"grad_norm": 0.15326954424381256,
|
||
|
|
"learning_rate": 1.1826786624513416e-05,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"step": 5050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22959690089558674,
|
||
|
|
"grad_norm": 0.22886556386947632,
|
||
|
|
"learning_rate": 1.1797879747136645e-05,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 5060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2300506497115859,
|
||
|
|
"grad_norm": 0.14157430827617645,
|
||
|
|
"learning_rate": 1.1768957336336384e-05,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 5070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23050439852758509,
|
||
|
|
"grad_norm": 0.09879475831985474,
|
||
|
|
"learning_rate": 1.1740019641998124e-05,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"step": 5080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23095814734358427,
|
||
|
|
"grad_norm": 0.1359705924987793,
|
||
|
|
"learning_rate": 1.171106691413939e-05,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 5090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23141189615958346,
|
||
|
|
"grad_norm": 0.11448939144611359,
|
||
|
|
"learning_rate": 1.1682099402907612e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 5100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23186564497558265,
|
||
|
|
"grad_norm": 0.15960171818733215,
|
||
|
|
"learning_rate": 1.1653117358577937e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 5110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2323193937915818,
|
||
|
|
"grad_norm": 0.18288302421569824,
|
||
|
|
"learning_rate": 1.1624121031551073e-05,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 5120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.232773142607581,
|
||
|
|
"grad_norm": 0.13190463185310364,
|
||
|
|
"learning_rate": 1.1595110672351132e-05,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 5130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2332268914235802,
|
||
|
|
"grad_norm": 0.1572716385126114,
|
||
|
|
"learning_rate": 1.1566086531623464e-05,
|
||
|
|
"loss": 0.0149,
|
||
|
|
"step": 5140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23368064023957938,
|
||
|
|
"grad_norm": 0.15305542945861816,
|
||
|
|
"learning_rate": 1.1537048860132487e-05,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 5150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23413438905557857,
|
||
|
|
"grad_norm": 0.1395915448665619,
|
||
|
|
"learning_rate": 1.1507997908759525e-05,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 5160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23458813787157773,
|
||
|
|
"grad_norm": 0.15027165412902832,
|
||
|
|
"learning_rate": 1.1478933928500635e-05,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 5170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23504188668757692,
|
||
|
|
"grad_norm": 0.12792900204658508,
|
||
|
|
"learning_rate": 1.1449857170464445e-05,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 5180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2354956355035761,
|
||
|
|
"grad_norm": 0.18118540942668915,
|
||
|
|
"learning_rate": 1.1420767885869974e-05,
|
||
|
|
"loss": 0.0168,
|
||
|
|
"step": 5190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2359493843195753,
|
||
|
|
"grad_norm": 0.10059325397014618,
|
||
|
|
"learning_rate": 1.1391666326044484e-05,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 5200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23640313313557448,
|
||
|
|
"grad_norm": 0.08281472325325012,
|
||
|
|
"learning_rate": 1.1362552742421269e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 5210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23685688195157364,
|
||
|
|
"grad_norm": 0.16372814774513245,
|
||
|
|
"learning_rate": 1.1333427386537537e-05,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 5220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23731063076757283,
|
||
|
|
"grad_norm": 0.15756796300411224,
|
||
|
|
"learning_rate": 1.1304290510032184e-05,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 5230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23776437958357202,
|
||
|
|
"grad_norm": 0.10431189090013504,
|
||
|
|
"learning_rate": 1.1275142364643645e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 5240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2382181283995712,
|
||
|
|
"grad_norm": 0.17831729352474213,
|
||
|
|
"learning_rate": 1.1245983202207729e-05,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 5250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2386718772155704,
|
||
|
|
"grad_norm": 0.11174460500478745,
|
||
|
|
"learning_rate": 1.1216813274655417e-05,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 5260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23912562603156956,
|
||
|
|
"grad_norm": 0.18887224793434143,
|
||
|
|
"learning_rate": 1.1187632834010707e-05,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 5270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23957937484756875,
|
||
|
|
"grad_norm": 0.2523764371871948,
|
||
|
|
"learning_rate": 1.1158442132388427e-05,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 5280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24003312366356794,
|
||
|
|
"grad_norm": 0.22768224775791168,
|
||
|
|
"learning_rate": 1.1129241421992059e-05,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 5290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24048687247956713,
|
||
|
|
"grad_norm": 0.13157208263874054,
|
||
|
|
"learning_rate": 1.1100030955111554e-05,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 5300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24094062129556632,
|
||
|
|
"grad_norm": 0.13875404000282288,
|
||
|
|
"learning_rate": 1.1070810984121164e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 5310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2413943701115655,
|
||
|
|
"grad_norm": 0.18443532288074493,
|
||
|
|
"learning_rate": 1.1041581761477252e-05,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 5320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24184811892756466,
|
||
|
|
"grad_norm": 0.16607621312141418,
|
||
|
|
"learning_rate": 1.1012343539716115e-05,
|
||
|
|
"loss": 0.0185,
|
||
|
|
"step": 5330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24230186774356385,
|
||
|
|
"grad_norm": 0.17037932574748993,
|
||
|
|
"learning_rate": 1.0983096571451805e-05,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 5340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24275561655956304,
|
||
|
|
"grad_norm": 0.15466848015785217,
|
||
|
|
"learning_rate": 1.0953841109373935e-05,
|
||
|
|
"loss": 0.0167,
|
||
|
|
"step": 5350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24320936537556223,
|
||
|
|
"grad_norm": 0.11539296060800552,
|
||
|
|
"learning_rate": 1.0924577406245507e-05,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 5360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24366311419156142,
|
||
|
|
"grad_norm": 0.11359529942274094,
|
||
|
|
"learning_rate": 1.0895305714900721e-05,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 5370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24411686300756058,
|
||
|
|
"grad_norm": 0.12212841212749481,
|
||
|
|
"learning_rate": 1.0866026288242803e-05,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 5380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24457061182355977,
|
||
|
|
"grad_norm": 0.1847812980413437,
|
||
|
|
"learning_rate": 1.0836739379241805e-05,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 5390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24502436063955896,
|
||
|
|
"grad_norm": 0.19463911652565002,
|
||
|
|
"learning_rate": 1.0807445240932422e-05,
|
||
|
|
"loss": 0.0191,
|
||
|
|
"step": 5400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24547810945555815,
|
||
|
|
"grad_norm": 0.08880347013473511,
|
||
|
|
"learning_rate": 1.0778144126411815e-05,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 5410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24593185827155734,
|
||
|
|
"grad_norm": 0.1543634831905365,
|
||
|
|
"learning_rate": 1.0748836288837418e-05,
|
||
|
|
"loss": 0.0153,
|
||
|
|
"step": 5420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2463856070875565,
|
||
|
|
"grad_norm": 0.16077403724193573,
|
||
|
|
"learning_rate": 1.0719521981424745e-05,
|
||
|
|
"loss": 0.0137,
|
||
|
|
"step": 5430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24683935590355569,
|
||
|
|
"grad_norm": 0.16406534612178802,
|
||
|
|
"learning_rate": 1.0690201457445218e-05,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 5440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24729310471955487,
|
||
|
|
"grad_norm": 0.15214091539382935,
|
||
|
|
"learning_rate": 1.0660874970223963e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 5450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24774685353555406,
|
||
|
|
"grad_norm": 0.16863113641738892,
|
||
|
|
"learning_rate": 1.0631542773137627e-05,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 5460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24820060235155325,
|
||
|
|
"grad_norm": 0.1798219382762909,
|
||
|
|
"learning_rate": 1.060220511961219e-05,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 5470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2486543511675524,
|
||
|
|
"grad_norm": 0.13751454651355743,
|
||
|
|
"learning_rate": 1.0572862263120784e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 5480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2491080999835516,
|
||
|
|
"grad_norm": 0.07154980301856995,
|
||
|
|
"learning_rate": 1.0543514457181476e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 5490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2495618487995508,
|
||
|
|
"grad_norm": 0.07695973664522171,
|
||
|
|
"learning_rate": 1.051416195535511e-05,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 5500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25001559761555,
|
||
|
|
"grad_norm": 0.111163429915905,
|
||
|
|
"learning_rate": 1.0484805011243102e-05,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 5510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25046934643154917,
|
||
|
|
"grad_norm": 0.1088341623544693,
|
||
|
|
"learning_rate": 1.0455443878485238e-05,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 5520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25092309524754836,
|
||
|
|
"grad_norm": 0.1207728236913681,
|
||
|
|
"learning_rate": 1.0426078810757502e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 5530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25137684406354754,
|
||
|
|
"grad_norm": 0.1364191770553589,
|
||
|
|
"learning_rate": 1.039671006176987e-05,
|
||
|
|
"loss": 0.0135,
|
||
|
|
"step": 5540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2518305928795467,
|
||
|
|
"grad_norm": 0.14863966405391693,
|
||
|
|
"learning_rate": 1.0367337885264128e-05,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 5550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25228434169554587,
|
||
|
|
"grad_norm": 0.10519534349441528,
|
||
|
|
"learning_rate": 1.0337962535011679e-05,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 5560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25273809051154505,
|
||
|
|
"grad_norm": 0.07501281052827835,
|
||
|
|
"learning_rate": 1.0308584264811332e-05,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 5570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25319183932754424,
|
||
|
|
"grad_norm": 0.1170443519949913,
|
||
|
|
"learning_rate": 1.0279203328487142e-05,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 5580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25364558814354343,
|
||
|
|
"grad_norm": 0.1034766435623169,
|
||
|
|
"learning_rate": 1.0249819979886184e-05,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 5590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2540993369595426,
|
||
|
|
"grad_norm": 0.12563982605934143,
|
||
|
|
"learning_rate": 1.0220434472876384e-05,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 5600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2545530857755418,
|
||
|
|
"grad_norm": 0.11183736473321915,
|
||
|
|
"learning_rate": 1.0191047061344315e-05,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 5610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.255006834591541,
|
||
|
|
"grad_norm": 0.1348152458667755,
|
||
|
|
"learning_rate": 1.0161657999192998e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 5620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2554605834075402,
|
||
|
|
"grad_norm": 0.09866070747375488,
|
||
|
|
"learning_rate": 1.0132267540339726e-05,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 5630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2559143322235394,
|
||
|
|
"grad_norm": 0.12439010292291641,
|
||
|
|
"learning_rate": 1.010287593871385e-05,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 5640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25636808103953856,
|
||
|
|
"grad_norm": 0.09671192616224289,
|
||
|
|
"learning_rate": 1.0073483448254599e-05,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 5650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2568218298555377,
|
||
|
|
"grad_norm": 0.14748752117156982,
|
||
|
|
"learning_rate": 1.0044090322908884e-05,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 5660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2572755786715369,
|
||
|
|
"grad_norm": 0.14265309274196625,
|
||
|
|
"learning_rate": 1.0014696816629093e-05,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 5670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2577293274875361,
|
||
|
|
"grad_norm": 0.14851878583431244,
|
||
|
|
"learning_rate": 9.985303183370909e-06,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"step": 5680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25818307630353526,
|
||
|
|
"grad_norm": 0.10572591423988342,
|
||
|
|
"learning_rate": 9.95590967709112e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 5690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25863682511953445,
|
||
|
|
"grad_norm": 0.13734707236289978,
|
||
|
|
"learning_rate": 9.926516551745401e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 5700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25909057393553364,
|
||
|
|
"grad_norm": 0.13167889416217804,
|
||
|
|
"learning_rate": 9.897124061286152e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 5710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25954432275153283,
|
||
|
|
"grad_norm": 0.1438244879245758,
|
||
|
|
"learning_rate": 9.867732459660277e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 5720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.259998071567532,
|
||
|
|
"grad_norm": 0.16616496443748474,
|
||
|
|
"learning_rate": 9.838342000807006e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 5730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2604518203835312,
|
||
|
|
"grad_norm": 0.12727349996566772,
|
||
|
|
"learning_rate": 9.808952938655689e-06,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 5740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2609055691995304,
|
||
|
|
"grad_norm": 0.159709170460701,
|
||
|
|
"learning_rate": 9.77956552712362e-06,
|
||
|
|
"loss": 0.0156,
|
||
|
|
"step": 5750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26135931801552953,
|
||
|
|
"grad_norm": 0.09865106642246246,
|
||
|
|
"learning_rate": 9.75018002011382e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 5760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2618130668315287,
|
||
|
|
"grad_norm": 0.17693890631198883,
|
||
|
|
"learning_rate": 9.720796671512863e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 5770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2622668156475279,
|
||
|
|
"grad_norm": 0.12724372744560242,
|
||
|
|
"learning_rate": 9.69141573518867e-06,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 5780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2627205644635271,
|
||
|
|
"grad_norm": 0.11850858479738235,
|
||
|
|
"learning_rate": 9.662037464988323e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 5790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2631743132795263,
|
||
|
|
"grad_norm": 0.13083550333976746,
|
||
|
|
"learning_rate": 9.63266211473587e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 5800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2636280620955255,
|
||
|
|
"grad_norm": 0.16072624921798706,
|
||
|
|
"learning_rate": 9.603289938230132e-06,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 5810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26408181091152466,
|
||
|
|
"grad_norm": 0.1012595146894455,
|
||
|
|
"learning_rate": 9.573921189242501e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 5820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26453555972752385,
|
||
|
|
"grad_norm": 0.17362189292907715,
|
||
|
|
"learning_rate": 9.544556121514765e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 5830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26498930854352304,
|
||
|
|
"grad_norm": 0.15318715572357178,
|
||
|
|
"learning_rate": 9.5151949887569e-06,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 5840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2654430573595222,
|
||
|
|
"grad_norm": 0.1297263503074646,
|
||
|
|
"learning_rate": 9.485838044644891e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 5850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26589680617552136,
|
||
|
|
"grad_norm": 0.13620354235172272,
|
||
|
|
"learning_rate": 9.456485542818527e-06,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 5860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26635055499152055,
|
||
|
|
"grad_norm": 0.14983710646629333,
|
||
|
|
"learning_rate": 9.427137736879222e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 5870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26680430380751974,
|
||
|
|
"grad_norm": 0.13920733332633972,
|
||
|
|
"learning_rate": 9.397794880387812e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 5880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2672580526235189,
|
||
|
|
"grad_norm": 0.13266463577747345,
|
||
|
|
"learning_rate": 9.368457226862378e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 5890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2677118014395181,
|
||
|
|
"grad_norm": 0.15641669929027557,
|
||
|
|
"learning_rate": 9.339125029776039e-06,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 5900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2681655502555173,
|
||
|
|
"grad_norm": 0.14706820249557495,
|
||
|
|
"learning_rate": 9.309798542554782e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 5910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2686192990715165,
|
||
|
|
"grad_norm": 0.09607023000717163,
|
||
|
|
"learning_rate": 9.280478018575257e-06,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 5920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2690730478875157,
|
||
|
|
"grad_norm": 0.13049288094043732,
|
||
|
|
"learning_rate": 9.251163711162584e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 5930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26952679670351487,
|
||
|
|
"grad_norm": 0.10754423588514328,
|
||
|
|
"learning_rate": 9.221855873588187e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 5940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26998054551951406,
|
||
|
|
"grad_norm": 0.14173737168312073,
|
||
|
|
"learning_rate": 9.192554759067581e-06,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 5950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2704342943355132,
|
||
|
|
"grad_norm": 0.08642645925283432,
|
||
|
|
"learning_rate": 9.163260620758197e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 5960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2708880431515124,
|
||
|
|
"grad_norm": 0.09053780883550644,
|
||
|
|
"learning_rate": 9.133973711757198e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 5970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27134179196751157,
|
||
|
|
"grad_norm": 0.17731283605098724,
|
||
|
|
"learning_rate": 9.10469428509928e-06,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 5980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27179554078351076,
|
||
|
|
"grad_norm": 0.14436522126197815,
|
||
|
|
"learning_rate": 9.075422593754498e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 5990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27224928959950995,
|
||
|
|
"grad_norm": 0.21791435778141022,
|
||
|
|
"learning_rate": 9.046158890626069e-06,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 6000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27270303841550914,
|
||
|
|
"grad_norm": 0.1288122683763504,
|
||
|
|
"learning_rate": 9.016903428548195e-06,
|
||
|
|
"loss": 0.0137,
|
||
|
|
"step": 6010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2731567872315083,
|
||
|
|
"grad_norm": 0.179439514875412,
|
||
|
|
"learning_rate": 8.987656460283885e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 6020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2736105360475075,
|
||
|
|
"grad_norm": 0.1351165622472763,
|
||
|
|
"learning_rate": 8.958418238522748e-06,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 6030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2740642848635067,
|
||
|
|
"grad_norm": 0.12908992171287537,
|
||
|
|
"learning_rate": 8.929189015878838e-06,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 6040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2745180336795059,
|
||
|
|
"grad_norm": 0.19940853118896484,
|
||
|
|
"learning_rate": 8.899969044888448e-06,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 6050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.274971782495505,
|
||
|
|
"grad_norm": 0.16033363342285156,
|
||
|
|
"learning_rate": 8.870758578007944e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 6060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2754255313115042,
|
||
|
|
"grad_norm": 0.13968591392040253,
|
||
|
|
"learning_rate": 8.841557867611576e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 6070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2758792801275034,
|
||
|
|
"grad_norm": 0.1749831885099411,
|
||
|
|
"learning_rate": 8.812367165989295e-06,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 6080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2763330289435026,
|
||
|
|
"grad_norm": 0.11019530147314072,
|
||
|
|
"learning_rate": 8.783186725344588e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 6090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2767867777595018,
|
||
|
|
"grad_norm": 0.1606026291847229,
|
||
|
|
"learning_rate": 8.754016797792276e-06,
|
||
|
|
"loss": 0.0144,
|
||
|
|
"step": 6100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27724052657550097,
|
||
|
|
"grad_norm": 0.13256487250328064,
|
||
|
|
"learning_rate": 8.72485763535636e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 6110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27769427539150016,
|
||
|
|
"grad_norm": 0.1421864628791809,
|
||
|
|
"learning_rate": 8.695709489967821e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 6120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27814802420749934,
|
||
|
|
"grad_norm": 0.14893385767936707,
|
||
|
|
"learning_rate": 8.666572613462465e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 6130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27860177302349853,
|
||
|
|
"grad_norm": 0.14165258407592773,
|
||
|
|
"learning_rate": 8.63744725757873e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 6140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2790555218394977,
|
||
|
|
"grad_norm": 0.0763816237449646,
|
||
|
|
"learning_rate": 8.60833367395552e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 6150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2795092706554969,
|
||
|
|
"grad_norm": 0.12333854287862778,
|
||
|
|
"learning_rate": 8.579232114130027e-06,
|
||
|
|
"loss": 0.0143,
|
||
|
|
"step": 6160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27996301947149604,
|
||
|
|
"grad_norm": 0.11713657528162003,
|
||
|
|
"learning_rate": 8.550142829535559e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 6170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28041676828749523,
|
||
|
|
"grad_norm": 0.21120575070381165,
|
||
|
|
"learning_rate": 8.521066071499368e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 6180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2808705171034944,
|
||
|
|
"grad_norm": 0.146609827876091,
|
||
|
|
"learning_rate": 8.492002091240478e-06,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 6190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2813242659194936,
|
||
|
|
"grad_norm": 0.10008296370506287,
|
||
|
|
"learning_rate": 8.462951139867514e-06,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 6200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2817780147354928,
|
||
|
|
"grad_norm": 0.12781381607055664,
|
||
|
|
"learning_rate": 8.43391346837654e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 6210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.282231763551492,
|
||
|
|
"grad_norm": 0.14611364901065826,
|
||
|
|
"learning_rate": 8.404889327648873e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 6220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2826855123674912,
|
||
|
|
"grad_norm": 0.17796698212623596,
|
||
|
|
"learning_rate": 8.375878968448934e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 6230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28313926118349036,
|
||
|
|
"grad_norm": 0.0997767299413681,
|
||
|
|
"learning_rate": 8.346882641422066e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 6240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28359300999948955,
|
||
|
|
"grad_norm": 0.17882458865642548,
|
||
|
|
"learning_rate": 8.317900597092388e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 6250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28404675881548874,
|
||
|
|
"grad_norm": 0.12098672240972519,
|
||
|
|
"learning_rate": 8.288933085860611e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 6260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2845005076314879,
|
||
|
|
"grad_norm": 0.09727644175291061,
|
||
|
|
"learning_rate": 8.25998035800188e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 6270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28495425644748706,
|
||
|
|
"grad_norm": 0.07772582769393921,
|
||
|
|
"learning_rate": 8.231042663663619e-06,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"step": 6280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28540800526348625,
|
||
|
|
"grad_norm": 0.14961278438568115,
|
||
|
|
"learning_rate": 8.202120252863359e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28586175407948544,
|
||
|
|
"grad_norm": 0.0879221111536026,
|
||
|
|
"learning_rate": 8.173213375486589e-06,
|
||
|
|
"loss": 0.0135,
|
||
|
|
"step": 6300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28631550289548463,
|
||
|
|
"grad_norm": 0.1098606288433075,
|
||
|
|
"learning_rate": 8.144322281284586e-06,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 6310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2867692517114838,
|
||
|
|
"grad_norm": 0.18294364213943481,
|
||
|
|
"learning_rate": 8.11544721987227e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 6320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.287223000527483,
|
||
|
|
"grad_norm": 0.15368086099624634,
|
||
|
|
"learning_rate": 8.086588440726034e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 6330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2876767493434822,
|
||
|
|
"grad_norm": 0.16978329420089722,
|
||
|
|
"learning_rate": 8.057746193181591e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 6340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2881304981594814,
|
||
|
|
"grad_norm": 0.20677420496940613,
|
||
|
|
"learning_rate": 8.028920726431832e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 6350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2885842469754806,
|
||
|
|
"grad_norm": 0.11161544919013977,
|
||
|
|
"learning_rate": 8.000112289524666e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 6360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2890379957914797,
|
||
|
|
"grad_norm": 0.14191238582134247,
|
||
|
|
"learning_rate": 7.971321131360855e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 6370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2894917446074789,
|
||
|
|
"grad_norm": 0.10466880351305008,
|
||
|
|
"learning_rate": 7.942547500691884e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 6380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2899454934234781,
|
||
|
|
"grad_norm": 0.17010283470153809,
|
||
|
|
"learning_rate": 7.913791646117798e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 6390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2903992422394773,
|
||
|
|
"grad_norm": 0.11536948382854462,
|
||
|
|
"learning_rate": 7.885053816085067e-06,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 6400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29085299105547646,
|
||
|
|
"grad_norm": 0.1053222119808197,
|
||
|
|
"learning_rate": 7.85633425888442e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 6410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29130673987147565,
|
||
|
|
"grad_norm": 0.11969737708568573,
|
||
|
|
"learning_rate": 7.827633222648722e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 6420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29176048868747484,
|
||
|
|
"grad_norm": 0.13598807156085968,
|
||
|
|
"learning_rate": 7.798950955350812e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 6430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29221423750347403,
|
||
|
|
"grad_norm": 0.12094290554523468,
|
||
|
|
"learning_rate": 7.770287704801374e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 6440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2926679863194732,
|
||
|
|
"grad_norm": 0.12500420212745667,
|
||
|
|
"learning_rate": 7.741643718646783e-06,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 6450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2931217351354724,
|
||
|
|
"grad_norm": 0.088765949010849,
|
||
|
|
"learning_rate": 7.713019244366977e-06,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 6460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29357548395147154,
|
||
|
|
"grad_norm": 0.13747364282608032,
|
||
|
|
"learning_rate": 7.684414529273315e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 6470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2940292327674707,
|
||
|
|
"grad_norm": 0.10825503617525101,
|
||
|
|
"learning_rate": 7.655829820506442e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 6480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2944829815834699,
|
||
|
|
"grad_norm": 0.1315292865037918,
|
||
|
|
"learning_rate": 7.627265365034141e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 6490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2949367303994691,
|
||
|
|
"grad_norm": 0.09187211841344833,
|
||
|
|
"learning_rate": 7.59872140964922e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 6500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2953904792154683,
|
||
|
|
"grad_norm": 0.13788200914859772,
|
||
|
|
"learning_rate": 7.570198200967363e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 6510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2958442280314675,
|
||
|
|
"grad_norm": 0.08541585505008698,
|
||
|
|
"learning_rate": 7.5416959854250076e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 6520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29629797684746667,
|
||
|
|
"grad_norm": 0.5758150219917297,
|
||
|
|
"learning_rate": 7.513215009277212e-06,
|
||
|
|
"loss": 0.0135,
|
||
|
|
"step": 6530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29675172566346586,
|
||
|
|
"grad_norm": 0.2074153572320938,
|
||
|
|
"learning_rate": 7.484755518595534e-06,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 6540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29720547447946505,
|
||
|
|
"grad_norm": 0.13672949373722076,
|
||
|
|
"learning_rate": 7.456317759265893e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 6550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29765922329546424,
|
||
|
|
"grad_norm": 0.15952400863170624,
|
||
|
|
"learning_rate": 7.4279019769864605e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 6560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29811297211146337,
|
||
|
|
"grad_norm": 0.1452173888683319,
|
||
|
|
"learning_rate": 7.399508417265517e-06,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 6570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29856672092746256,
|
||
|
|
"grad_norm": 0.12948384881019592,
|
||
|
|
"learning_rate": 7.3711373254193595e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 6580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29902046974346175,
|
||
|
|
"grad_norm": 0.11394894123077393,
|
||
|
|
"learning_rate": 7.342788946570159e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 6590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29947421855946094,
|
||
|
|
"grad_norm": 0.14150279760360718,
|
||
|
|
"learning_rate": 7.314463525643842e-06,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 6600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2999279673754601,
|
||
|
|
"grad_norm": 0.12348120659589767,
|
||
|
|
"learning_rate": 7.286161307367989e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 6610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3003817161914593,
|
||
|
|
"grad_norm": 0.12587220966815948,
|
||
|
|
"learning_rate": 7.257882536269716e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 6620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3008354650074585,
|
||
|
|
"grad_norm": 0.17185339331626892,
|
||
|
|
"learning_rate": 7.2296274566735494e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 6630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3012892138234577,
|
||
|
|
"grad_norm": 0.1300371140241623,
|
||
|
|
"learning_rate": 7.201396312699334e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 6640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3017429626394569,
|
||
|
|
"grad_norm": 0.1046392098069191,
|
||
|
|
"learning_rate": 7.173189348260105e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 6650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30219671145545607,
|
||
|
|
"grad_norm": 0.1329888254404068,
|
||
|
|
"learning_rate": 7.145006807060002e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 6660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3026504602714552,
|
||
|
|
"grad_norm": 0.12515147030353546,
|
||
|
|
"learning_rate": 7.116848932592136e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 6670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3031042090874544,
|
||
|
|
"grad_norm": 0.10097947716712952,
|
||
|
|
"learning_rate": 7.088715968136513e-06,
|
||
|
|
"loss": 0.0139,
|
||
|
|
"step": 6680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3035579579034536,
|
||
|
|
"grad_norm": 0.1693345010280609,
|
||
|
|
"learning_rate": 7.06060815675792e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 6690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30401170671945277,
|
||
|
|
"grad_norm": 0.12788210809230804,
|
||
|
|
"learning_rate": 7.032525741303815e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 6700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30446545553545196,
|
||
|
|
"grad_norm": 0.10201315581798553,
|
||
|
|
"learning_rate": 7.00446896440225e-06,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 6710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30491920435145115,
|
||
|
|
"grad_norm": 0.13046912848949432,
|
||
|
|
"learning_rate": 6.976438068459756e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 6720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30537295316745033,
|
||
|
|
"grad_norm": 0.13688482344150543,
|
||
|
|
"learning_rate": 6.948433295659258e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 6730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3058267019834495,
|
||
|
|
"grad_norm": 0.14753024280071259,
|
||
|
|
"learning_rate": 6.920454887957984e-06,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 6740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3062804507994487,
|
||
|
|
"grad_norm": 0.2111503630876541,
|
||
|
|
"learning_rate": 6.892503087085365e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 6750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3067341996154479,
|
||
|
|
"grad_norm": 0.09545427560806274,
|
||
|
|
"learning_rate": 6.864578134540961e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 6760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3071879484314471,
|
||
|
|
"grad_norm": 0.11243736743927002,
|
||
|
|
"learning_rate": 6.83668027159236e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 6770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3076416972474462,
|
||
|
|
"grad_norm": 0.1419551521539688,
|
||
|
|
"learning_rate": 6.8088097392731035e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 6780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3080954460634454,
|
||
|
|
"grad_norm": 0.21375605463981628,
|
||
|
|
"learning_rate": 6.7809667783805934e-06,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 6790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3085491948794446,
|
||
|
|
"grad_norm": 0.11559443175792694,
|
||
|
|
"learning_rate": 6.753151629474028e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3090029436954438,
|
||
|
|
"grad_norm": 0.26248887181282043,
|
||
|
|
"learning_rate": 6.725364532872312e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 6810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.309456692511443,
|
||
|
|
"grad_norm": 0.1434881091117859,
|
||
|
|
"learning_rate": 6.697605728651977e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 6820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30991044132744217,
|
||
|
|
"grad_norm": 0.14054977893829346,
|
||
|
|
"learning_rate": 6.669875456645115e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 6830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31036419014344135,
|
||
|
|
"grad_norm": 0.14137424528598785,
|
||
|
|
"learning_rate": 6.642173956437306e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 6840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31081793895944054,
|
||
|
|
"grad_norm": 0.11850173771381378,
|
||
|
|
"learning_rate": 6.614501467365539e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31127168777543973,
|
||
|
|
"grad_norm": 0.09213166683912277,
|
||
|
|
"learning_rate": 6.586858228516162e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3117254365914389,
|
||
|
|
"grad_norm": 0.18658258020877838,
|
||
|
|
"learning_rate": 6.559244478722792e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31217918540743805,
|
||
|
|
"grad_norm": 0.14191250503063202,
|
||
|
|
"learning_rate": 6.531660456564282e-06,
|
||
|
|
"loss": 0.0171,
|
||
|
|
"step": 6880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31263293422343724,
|
||
|
|
"grad_norm": 0.14040282368659973,
|
||
|
|
"learning_rate": 6.504106400362621e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 6890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31308668303943643,
|
||
|
|
"grad_norm": 0.10161775350570679,
|
||
|
|
"learning_rate": 6.476582548180912e-06,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 6900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3135404318554356,
|
||
|
|
"grad_norm": 0.09201598167419434,
|
||
|
|
"learning_rate": 6.449089137821301e-06,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 6910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3139941806714348,
|
||
|
|
"grad_norm": 0.15806888043880463,
|
||
|
|
"learning_rate": 6.421626406822909e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 6920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.314447929487434,
|
||
|
|
"grad_norm": 0.15518872439861298,
|
||
|
|
"learning_rate": 6.394194592459801e-06,
|
||
|
|
"loss": 0.0145,
|
||
|
|
"step": 6930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3149016783034332,
|
||
|
|
"grad_norm": 0.16833215951919556,
|
||
|
|
"learning_rate": 6.366793931738922e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 6940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3153554271194324,
|
||
|
|
"grad_norm": 0.08252312988042831,
|
||
|
|
"learning_rate": 6.339424661398058e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 6950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31580917593543156,
|
||
|
|
"grad_norm": 0.13390517234802246,
|
||
|
|
"learning_rate": 6.312087017903783e-06,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 6960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31626292475143075,
|
||
|
|
"grad_norm": 0.11417072266340256,
|
||
|
|
"learning_rate": 6.284781237449419e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 6970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3167166735674299,
|
||
|
|
"grad_norm": 0.11521535366773605,
|
||
|
|
"learning_rate": 6.257507555953002e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 6980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3171704223834291,
|
||
|
|
"grad_norm": 0.133391872048378,
|
||
|
|
"learning_rate": 6.230266209055229e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 6990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31762417119942826,
|
||
|
|
"grad_norm": 0.14856669306755066,
|
||
|
|
"learning_rate": 6.20305743211744e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 7000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31807792001542745,
|
||
|
|
"grad_norm": 0.14302918314933777,
|
||
|
|
"learning_rate": 6.175881460219565e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 7010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31853166883142664,
|
||
|
|
"grad_norm": 0.11710617691278458,
|
||
|
|
"learning_rate": 6.148738528158109e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 7020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31898541764742583,
|
||
|
|
"grad_norm": 0.10232780873775482,
|
||
|
|
"learning_rate": 6.1216288704441255e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 7030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.319439166463425,
|
||
|
|
"grad_norm": 0.08459633588790894,
|
||
|
|
"learning_rate": 6.094552721301164e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 7040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3198929152794242,
|
||
|
|
"grad_norm": 0.12328217923641205,
|
||
|
|
"learning_rate": 6.067510314663283e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 7050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3203466640954234,
|
||
|
|
"grad_norm": 0.11582467705011368,
|
||
|
|
"learning_rate": 6.0405018841729934e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 7060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3208004129114226,
|
||
|
|
"grad_norm": 0.12689736485481262,
|
||
|
|
"learning_rate": 6.013527663179275e-06,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 7070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3212541617274217,
|
||
|
|
"grad_norm": 0.17244750261306763,
|
||
|
|
"learning_rate": 5.986587884735526e-06,
|
||
|
|
"loss": 0.0148,
|
||
|
|
"step": 7080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3217079105434209,
|
||
|
|
"grad_norm": 0.1367264688014984,
|
||
|
|
"learning_rate": 5.9596827815975775e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 7090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3221616593594201,
|
||
|
|
"grad_norm": 0.15937544405460358,
|
||
|
|
"learning_rate": 5.9328125862216676e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 7100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3226154081754193,
|
||
|
|
"grad_norm": 0.1453155279159546,
|
||
|
|
"learning_rate": 5.90597753076243e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 7110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32306915699141847,
|
||
|
|
"grad_norm": 0.1392073631286621,
|
||
|
|
"learning_rate": 5.879177847070906e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 7120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32352290580741766,
|
||
|
|
"grad_norm": 0.12530963122844696,
|
||
|
|
"learning_rate": 5.8524137666925174e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 7130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32397665462341685,
|
||
|
|
"grad_norm": 0.11018750071525574,
|
||
|
|
"learning_rate": 5.825685520865092e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 7140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32443040343941604,
|
||
|
|
"grad_norm": 0.1296696662902832,
|
||
|
|
"learning_rate": 5.798993340516843e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 7150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3248841522554152,
|
||
|
|
"grad_norm": 0.16327860951423645,
|
||
|
|
"learning_rate": 5.772337456264386e-06,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 7160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3253379010714144,
|
||
|
|
"grad_norm": 0.14514560997486115,
|
||
|
|
"learning_rate": 5.745718098410737e-06,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 7170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32579164988741355,
|
||
|
|
"grad_norm": 0.11294388025999069,
|
||
|
|
"learning_rate": 5.719135496943343e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 7180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32624539870341274,
|
||
|
|
"grad_norm": 0.1433974653482437,
|
||
|
|
"learning_rate": 5.69258988153207e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 7190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3266991475194119,
|
||
|
|
"grad_norm": 0.1322980523109436,
|
||
|
|
"learning_rate": 5.666081481527232e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 7200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3271528963354111,
|
||
|
|
"grad_norm": 0.14572355151176453,
|
||
|
|
"learning_rate": 5.639610525957604e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 7210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3276066451514103,
|
||
|
|
"grad_norm": 0.07759486883878708,
|
||
|
|
"learning_rate": 5.613177243528458e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 7220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3280603939674095,
|
||
|
|
"grad_norm": 0.10589128732681274,
|
||
|
|
"learning_rate": 5.586781862619566e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 7230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3285141427834087,
|
||
|
|
"grad_norm": 0.1211918294429779,
|
||
|
|
"learning_rate": 5.560424611283231e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 7240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32896789159940787,
|
||
|
|
"grad_norm": 0.12552636861801147,
|
||
|
|
"learning_rate": 5.53410571724234e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 7250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32942164041540706,
|
||
|
|
"grad_norm": 0.15319830179214478,
|
||
|
|
"learning_rate": 5.507825407888362e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 7260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32987538923140625,
|
||
|
|
"grad_norm": 0.1404561698436737,
|
||
|
|
"learning_rate": 5.481583910279402e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 7270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33032913804740544,
|
||
|
|
"grad_norm": 0.13395948708057404,
|
||
|
|
"learning_rate": 5.4553814511382485e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 7280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33078288686340457,
|
||
|
|
"grad_norm": 0.13249263167381287,
|
||
|
|
"learning_rate": 5.429218256850393e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 7290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33123663567940376,
|
||
|
|
"grad_norm": 0.1319146752357483,
|
||
|
|
"learning_rate": 5.403094553462083e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 7300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33169038449540295,
|
||
|
|
"grad_norm": 0.07734081894159317,
|
||
|
|
"learning_rate": 5.377010566678371e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 7310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33214413331140213,
|
||
|
|
"grad_norm": 0.10048568248748779,
|
||
|
|
"learning_rate": 5.350966521861178e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 7320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3325978821274013,
|
||
|
|
"grad_norm": 0.1091848835349083,
|
||
|
|
"learning_rate": 5.324962644027312e-06,
|
||
|
|
"loss": 0.0146,
|
||
|
|
"step": 7330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3330516309434005,
|
||
|
|
"grad_norm": 0.08323021978139877,
|
||
|
|
"learning_rate": 5.298999157846555e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 7340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3335053797593997,
|
||
|
|
"grad_norm": 0.11683864891529083,
|
||
|
|
"learning_rate": 5.273076287639704e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 7350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3339591285753989,
|
||
|
|
"grad_norm": 0.10861805081367493,
|
||
|
|
"learning_rate": 5.247194257376653e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 7360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3344128773913981,
|
||
|
|
"grad_norm": 0.11935005336999893,
|
||
|
|
"learning_rate": 5.221353290674429e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 7370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33486662620739727,
|
||
|
|
"grad_norm": 0.09110080450773239,
|
||
|
|
"learning_rate": 5.1955536107952885e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 7380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3353203750233964,
|
||
|
|
"grad_norm": 0.10009435564279556,
|
||
|
|
"learning_rate": 5.169795440644767e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 7390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3357741238393956,
|
||
|
|
"grad_norm": 0.13338936865329742,
|
||
|
|
"learning_rate": 5.144079002769766e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 7400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3362278726553948,
|
||
|
|
"grad_norm": 0.10619942098855972,
|
||
|
|
"learning_rate": 5.118404519356621e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 7410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33668162147139397,
|
||
|
|
"grad_norm": 0.19180886447429657,
|
||
|
|
"learning_rate": 5.0927722122292e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 7420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33713537028739315,
|
||
|
|
"grad_norm": 0.13941583037376404,
|
||
|
|
"learning_rate": 5.067182302846958e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 7430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33758911910339234,
|
||
|
|
"grad_norm": 0.14227934181690216,
|
||
|
|
"learning_rate": 5.041635012303048e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 7440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33804286791939153,
|
||
|
|
"grad_norm": 0.11831536144018173,
|
||
|
|
"learning_rate": 5.016130561322399e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 7450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3384966167353907,
|
||
|
|
"grad_norm": 0.10274643450975418,
|
||
|
|
"learning_rate": 4.990669170259816e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 7460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3389503655513899,
|
||
|
|
"grad_norm": 0.22540533542633057,
|
||
|
|
"learning_rate": 4.965251059098074e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 7470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3394041143673891,
|
||
|
|
"grad_norm": 0.1153121292591095,
|
||
|
|
"learning_rate": 4.93987644744601e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 7480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33985786318338823,
|
||
|
|
"grad_norm": 0.13815748691558838,
|
||
|
|
"learning_rate": 4.9145455545366335e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 7490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3403116119993874,
|
||
|
|
"grad_norm": 0.1963374763727188,
|
||
|
|
"learning_rate": 4.889258599225233e-06,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 7500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3407653608153866,
|
||
|
|
"grad_norm": 0.10098478198051453,
|
||
|
|
"learning_rate": 4.864015799987474e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 7510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3412191096313858,
|
||
|
|
"grad_norm": 0.13396380841732025,
|
||
|
|
"learning_rate": 4.838817374917534e-06,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 7520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.341672858447385,
|
||
|
|
"grad_norm": 0.14336052536964417,
|
||
|
|
"learning_rate": 4.8136635417261935e-06,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 7530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3421266072633842,
|
||
|
|
"grad_norm": 0.10059235990047455,
|
||
|
|
"learning_rate": 4.788554517738967e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 7540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34258035607938336,
|
||
|
|
"grad_norm": 0.1468759924173355,
|
||
|
|
"learning_rate": 4.763490519894223e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 7550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34303410489538255,
|
||
|
|
"grad_norm": 0.12398961931467056,
|
||
|
|
"learning_rate": 4.738471764741319e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 7560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34348785371138174,
|
||
|
|
"grad_norm": 0.09385305643081665,
|
||
|
|
"learning_rate": 4.713498468438709e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 7570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34394160252738093,
|
||
|
|
"grad_norm": 0.21803854405879974,
|
||
|
|
"learning_rate": 4.6885708467521015e-06,
|
||
|
|
"loss": 0.0147,
|
||
|
|
"step": 7580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34439535134338006,
|
||
|
|
"grad_norm": 0.1330396980047226,
|
||
|
|
"learning_rate": 4.6636891150525765e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 7590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34484910015937925,
|
||
|
|
"grad_norm": 0.12276133894920349,
|
||
|
|
"learning_rate": 4.638853488314727e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 7600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34530284897537844,
|
||
|
|
"grad_norm": 0.14330561459064484,
|
||
|
|
"learning_rate": 4.614064181114817e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 7610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34575659779137763,
|
||
|
|
"grad_norm": 0.1479530930519104,
|
||
|
|
"learning_rate": 4.589321407628907e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 7620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3462103466073768,
|
||
|
|
"grad_norm": 0.10039757937192917,
|
||
|
|
"learning_rate": 4.5646253816310175e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 7630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.346664095423376,
|
||
|
|
"grad_norm": 0.14793506264686584,
|
||
|
|
"learning_rate": 4.539976316491272e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 7640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3471178442393752,
|
||
|
|
"grad_norm": 0.11778666079044342,
|
||
|
|
"learning_rate": 4.515374425174062e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 7650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3475715930553744,
|
||
|
|
"grad_norm": 0.143499955534935,
|
||
|
|
"learning_rate": 4.49081992023621e-06,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"step": 7660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3480253418713736,
|
||
|
|
"grad_norm": 0.1290217489004135,
|
||
|
|
"learning_rate": 4.466313013825119e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 7670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34847909068737276,
|
||
|
|
"grad_norm": 0.1522817611694336,
|
||
|
|
"learning_rate": 4.4418539176769456e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 7680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3489328395033719,
|
||
|
|
"grad_norm": 0.11977915465831757,
|
||
|
|
"learning_rate": 4.417442843114786e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 7690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3493865883193711,
|
||
|
|
"grad_norm": 0.10978726297616959,
|
||
|
|
"learning_rate": 4.393080001046818e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 7700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34984033713537027,
|
||
|
|
"grad_norm": 0.14366991817951202,
|
||
|
|
"learning_rate": 4.368765601964516e-06,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 7710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35029408595136946,
|
||
|
|
"grad_norm": 0.1092318445444107,
|
||
|
|
"learning_rate": 4.3444998559408025e-06,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 7720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35074783476736865,
|
||
|
|
"grad_norm": 0.11551111936569214,
|
||
|
|
"learning_rate": 4.320282972628246e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 7730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35120158358336784,
|
||
|
|
"grad_norm": 0.15743663907051086,
|
||
|
|
"learning_rate": 4.2961151612572495e-06,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 7740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.351655332399367,
|
||
|
|
"grad_norm": 0.10656075179576874,
|
||
|
|
"learning_rate": 4.2719966306342386e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 7750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3521090812153662,
|
||
|
|
"grad_norm": 0.1292315423488617,
|
||
|
|
"learning_rate": 4.247927589139869e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 7760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3525628300313654,
|
||
|
|
"grad_norm": 0.061548516154289246,
|
||
|
|
"learning_rate": 4.223908244727211e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 7770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3530165788473646,
|
||
|
|
"grad_norm": 0.13080838322639465,
|
||
|
|
"learning_rate": 4.199938804919957e-06,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 7780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3534703276633637,
|
||
|
|
"grad_norm": 0.18990670144557953,
|
||
|
|
"learning_rate": 4.176019476810631e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 7790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3539240764793629,
|
||
|
|
"grad_norm": 0.1360110193490982,
|
||
|
|
"learning_rate": 4.152150467058805e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 7800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3543778252953621,
|
||
|
|
"grad_norm": 0.10822978615760803,
|
||
|
|
"learning_rate": 4.128331981889309e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 7810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3548315741113613,
|
||
|
|
"grad_norm": 0.10335934162139893,
|
||
|
|
"learning_rate": 4.104564227090437e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 7820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3552853229273605,
|
||
|
|
"grad_norm": 0.12353721261024475,
|
||
|
|
"learning_rate": 4.080847408012189e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 7830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35573907174335967,
|
||
|
|
"grad_norm": 0.12535031139850616,
|
||
|
|
"learning_rate": 4.057181729564478e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 7840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35619282055935886,
|
||
|
|
"grad_norm": 0.13195911049842834,
|
||
|
|
"learning_rate": 4.033567396215387e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 7850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35664656937535805,
|
||
|
|
"grad_norm": 0.15567561984062195,
|
||
|
|
"learning_rate": 4.0100046119893654e-06,
|
||
|
|
"loss": 0.0142,
|
||
|
|
"step": 7860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35710031819135724,
|
||
|
|
"grad_norm": 0.12166192382574081,
|
||
|
|
"learning_rate": 3.986493580465498e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 7870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3575540670073564,
|
||
|
|
"grad_norm": 0.15031491219997406,
|
||
|
|
"learning_rate": 3.963034504775727e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 7880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3580078158233556,
|
||
|
|
"grad_norm": 0.1494155079126358,
|
||
|
|
"learning_rate": 3.939627587603103e-06,
|
||
|
|
"loss": 0.0141,
|
||
|
|
"step": 7890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35846156463935475,
|
||
|
|
"grad_norm": 0.16597557067871094,
|
||
|
|
"learning_rate": 3.9162730311800455e-06,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 7900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35891531345535393,
|
||
|
|
"grad_norm": 0.08207192271947861,
|
||
|
|
"learning_rate": 3.8929710372865696e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 7910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3593690622713531,
|
||
|
|
"grad_norm": 0.19074343144893646,
|
||
|
|
"learning_rate": 3.869721807248571e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 7920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3598228110873523,
|
||
|
|
"grad_norm": 0.12037303298711777,
|
||
|
|
"learning_rate": 3.8465255419360635e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 7930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3602765599033515,
|
||
|
|
"grad_norm": 0.10831968486309052,
|
||
|
|
"learning_rate": 3.823382441761454e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 7940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3607303087193507,
|
||
|
|
"grad_norm": 0.08468834310770035,
|
||
|
|
"learning_rate": 3.8002927066778193e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 7950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3611840575353499,
|
||
|
|
"grad_norm": 0.10347823798656464,
|
||
|
|
"learning_rate": 3.7772565361771596e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 7960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36163780635134907,
|
||
|
|
"grad_norm": 0.1401902288198471,
|
||
|
|
"learning_rate": 3.75427412928869e-06,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 7970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36209155516734826,
|
||
|
|
"grad_norm": 0.1368122547864914,
|
||
|
|
"learning_rate": 3.731345684577109e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 7980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36254530398334744,
|
||
|
|
"grad_norm": 0.1546686738729477,
|
||
|
|
"learning_rate": 3.7084714001409016e-06,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 7990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3629990527993466,
|
||
|
|
"grad_norm": 0.08418429642915726,
|
||
|
|
"learning_rate": 3.6856514736106063e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 8000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36345280161534577,
|
||
|
|
"grad_norm": 0.09853670001029968,
|
||
|
|
"learning_rate": 3.6628861021471185e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 8010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36390655043134496,
|
||
|
|
"grad_norm": 0.09226000308990479,
|
||
|
|
"learning_rate": 3.6401754824399837e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 8020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36436029924734414,
|
||
|
|
"grad_norm": 0.09666698426008224,
|
||
|
|
"learning_rate": 3.6175198107057107e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 8030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36481404806334333,
|
||
|
|
"grad_norm": 0.15817151963710785,
|
||
|
|
"learning_rate": 3.5949192826860513e-06,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 8040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3652677968793425,
|
||
|
|
"grad_norm": 0.10350827127695084,
|
||
|
|
"learning_rate": 3.572374093646336e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3657215456953417,
|
||
|
|
"grad_norm": 0.15225833654403687,
|
||
|
|
"learning_rate": 3.5498844383737653e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 8060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3661752945113409,
|
||
|
|
"grad_norm": 0.1303558200597763,
|
||
|
|
"learning_rate": 3.5274505111757405e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 8070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3666290433273401,
|
||
|
|
"grad_norm": 0.145473450422287,
|
||
|
|
"learning_rate": 3.5050725058781765e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 8080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3670827921433393,
|
||
|
|
"grad_norm": 0.1219491958618164,
|
||
|
|
"learning_rate": 3.482750615823838e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 8090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3675365409593384,
|
||
|
|
"grad_norm": 0.09301994740962982,
|
||
|
|
"learning_rate": 3.4604850338706554e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3679902897753376,
|
||
|
|
"grad_norm": 0.09377678483724594,
|
||
|
|
"learning_rate": 3.4382759523900678e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 8110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3684440385913368,
|
||
|
|
"grad_norm": 0.13796672224998474,
|
||
|
|
"learning_rate": 3.4161235632653587e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 8120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.368897787407336,
|
||
|
|
"grad_norm": 0.11785976588726044,
|
||
|
|
"learning_rate": 3.394028057889992e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 8130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36935153622333516,
|
||
|
|
"grad_norm": 0.14343872666358948,
|
||
|
|
"learning_rate": 3.3719896271659734e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 8140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36980528503933435,
|
||
|
|
"grad_norm": 0.16964061558246613,
|
||
|
|
"learning_rate": 3.3500084615021912e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 8150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37025903385533354,
|
||
|
|
"grad_norm": 0.10996939241886139,
|
||
|
|
"learning_rate": 3.3280847508127644e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 8160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37071278267133273,
|
||
|
|
"grad_norm": 0.13238005340099335,
|
||
|
|
"learning_rate": 3.306218684515413e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 8170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3711665314873319,
|
||
|
|
"grad_norm": 0.12384450435638428,
|
||
|
|
"learning_rate": 3.284410451529816e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 8180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3716202803033311,
|
||
|
|
"grad_norm": 0.1461249142885208,
|
||
|
|
"learning_rate": 3.2626602402759865e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 8190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37207402911933024,
|
||
|
|
"grad_norm": 0.14945949614048004,
|
||
|
|
"learning_rate": 3.240968238672633e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 8200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37252777793532943,
|
||
|
|
"grad_norm": 0.11225463449954987,
|
||
|
|
"learning_rate": 3.2193346341355413e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 8210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3729815267513286,
|
||
|
|
"grad_norm": 0.13144910335540771,
|
||
|
|
"learning_rate": 3.1977596135759524e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 8220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3734352755673278,
|
||
|
|
"grad_norm": 0.14924506843090057,
|
||
|
|
"learning_rate": 3.176243363398961e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 8230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.373889024383327,
|
||
|
|
"grad_norm": 0.257318913936615,
|
||
|
|
"learning_rate": 3.1547860695018793e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 8240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3743427731993262,
|
||
|
|
"grad_norm": 0.0802023783326149,
|
||
|
|
"learning_rate": 3.13338791727266e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 8250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3747965220153254,
|
||
|
|
"grad_norm": 0.13596881926059723,
|
||
|
|
"learning_rate": 3.1120490915882694e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 8260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37525027083132456,
|
||
|
|
"grad_norm": 0.1355019360780716,
|
||
|
|
"learning_rate": 3.090769776813106e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 8270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37570401964732375,
|
||
|
|
"grad_norm": 0.09479621052742004,
|
||
|
|
"learning_rate": 3.0695501567973983e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 8280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37615776846332294,
|
||
|
|
"grad_norm": 0.17646194994449615,
|
||
|
|
"learning_rate": 3.0483904148756284e-06,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 8290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37661151727932207,
|
||
|
|
"grad_norm": 0.13487325608730316,
|
||
|
|
"learning_rate": 3.0272907338649337e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 8300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37706526609532126,
|
||
|
|
"grad_norm": 0.13014310598373413,
|
||
|
|
"learning_rate": 3.006251296063536e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 8310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37751901491132045,
|
||
|
|
"grad_norm": 0.10160695761442184,
|
||
|
|
"learning_rate": 2.985272283249161e-06,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 8320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37797276372731964,
|
||
|
|
"grad_norm": 0.11651884764432907,
|
||
|
|
"learning_rate": 2.9643538766774793e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 8330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3784265125433188,
|
||
|
|
"grad_norm": 0.08997322618961334,
|
||
|
|
"learning_rate": 2.943496257080527e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 8340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.378880261359318,
|
||
|
|
"grad_norm": 0.10125599801540375,
|
||
|
|
"learning_rate": 2.9226996046651435e-06,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 8350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3793340101753172,
|
||
|
|
"grad_norm": 0.10461420565843582,
|
||
|
|
"learning_rate": 2.901964099111435e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 8360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3797877589913164,
|
||
|
|
"grad_norm": 0.1022501215338707,
|
||
|
|
"learning_rate": 2.881289919571193e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 8370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3802415078073156,
|
||
|
|
"grad_norm": 0.11119214445352554,
|
||
|
|
"learning_rate": 2.860677244666373e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 8380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38069525662331477,
|
||
|
|
"grad_norm": 0.10773328691720963,
|
||
|
|
"learning_rate": 2.840126252487532e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 8390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38114900543931396,
|
||
|
|
"grad_norm": 0.13215845823287964,
|
||
|
|
"learning_rate": 2.8196371205922955e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 8400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3816027542553131,
|
||
|
|
"grad_norm": 0.0992102399468422,
|
||
|
|
"learning_rate": 2.799210026003831e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 8410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3820565030713123,
|
||
|
|
"grad_norm": 0.16861070692539215,
|
||
|
|
"learning_rate": 2.7788451452093067e-06,
|
||
|
|
"loss": 0.0132,
|
||
|
|
"step": 8420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38251025188731147,
|
||
|
|
"grad_norm": 0.11336692422628403,
|
||
|
|
"learning_rate": 2.75854265415838e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 8430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38296400070331066,
|
||
|
|
"grad_norm": 0.21186785399913788,
|
||
|
|
"learning_rate": 2.738302728261665e-06,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 8440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38341774951930985,
|
||
|
|
"grad_norm": 0.09690193086862564,
|
||
|
|
"learning_rate": 2.7181255423892192e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 8450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38387149833530904,
|
||
|
|
"grad_norm": 0.1311059594154358,
|
||
|
|
"learning_rate": 2.6980112708690374e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3843252471513082,
|
||
|
|
"grad_norm": 0.13914784789085388,
|
||
|
|
"learning_rate": 2.677960087485547e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 8470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3847789959673074,
|
||
|
|
"grad_norm": 0.11388586461544037,
|
||
|
|
"learning_rate": 2.657972165478103e-06,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 8480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3852327447833066,
|
||
|
|
"grad_norm": 0.18526536226272583,
|
||
|
|
"learning_rate": 2.638047677539487e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3856864935993058,
|
||
|
|
"grad_norm": 0.08570456504821777,
|
||
|
|
"learning_rate": 2.618186795814418e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 8500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3861402424153049,
|
||
|
|
"grad_norm": 0.09870341420173645,
|
||
|
|
"learning_rate": 2.598389691898072e-06,
|
||
|
|
"loss": 0.0131,
|
||
|
|
"step": 8510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3865939912313041,
|
||
|
|
"grad_norm": 0.13254578411579132,
|
||
|
|
"learning_rate": 2.578656536834586e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 8520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3870477400473033,
|
||
|
|
"grad_norm": 0.22662994265556335,
|
||
|
|
"learning_rate": 2.5589875011156008e-06,
|
||
|
|
"loss": 0.0134,
|
||
|
|
"step": 8530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3875014888633025,
|
||
|
|
"grad_norm": 0.10000614821910858,
|
||
|
|
"learning_rate": 2.539382754678764e-06,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 8540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3879552376793017,
|
||
|
|
"grad_norm": 0.14206960797309875,
|
||
|
|
"learning_rate": 2.519842466906276e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 8550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38840898649530087,
|
||
|
|
"grad_norm": 0.14792758226394653,
|
||
|
|
"learning_rate": 2.5003668066234233e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 8560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38886273531130006,
|
||
|
|
"grad_norm": 0.08291668444871902,
|
||
|
|
"learning_rate": 2.480955942097121e-06,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 8570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38931648412729924,
|
||
|
|
"grad_norm": 0.11204250901937485,
|
||
|
|
"learning_rate": 2.4616100410344634e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 8580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38977023294329843,
|
||
|
|
"grad_norm": 0.16575933992862701,
|
||
|
|
"learning_rate": 2.442329270581262e-06,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 8590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3902239817592976,
|
||
|
|
"grad_norm": 0.10479316860437393,
|
||
|
|
"learning_rate": 2.4231137973206097e-06,
|
||
|
|
"loss": 0.0138,
|
||
|
|
"step": 8600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39067773057529676,
|
||
|
|
"grad_norm": 0.1315084546804428,
|
||
|
|
"learning_rate": 2.4039637872714417e-06,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 8610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39113147939129594,
|
||
|
|
"grad_norm": 0.14845795929431915,
|
||
|
|
"learning_rate": 2.3848794058871073e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 8620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39158522820729513,
|
||
|
|
"grad_norm": 0.09940434992313385,
|
||
|
|
"learning_rate": 2.3658608180539243e-06,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 8630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3920389770232943,
|
||
|
|
"grad_norm": 0.13721604645252228,
|
||
|
|
"learning_rate": 2.3469081880897694e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3924927258392935,
|
||
|
|
"grad_norm": 0.11317367106676102,
|
||
|
|
"learning_rate": 2.328021679742648e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 8650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3929464746552927,
|
||
|
|
"grad_norm": 0.10845492035150528,
|
||
|
|
"learning_rate": 2.309201456189286e-06,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 8660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3934002234712919,
|
||
|
|
"grad_norm": 0.11880891025066376,
|
||
|
|
"learning_rate": 2.290447680033725e-06,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 8670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3938539722872911,
|
||
|
|
"grad_norm": 0.11626913398504257,
|
||
|
|
"learning_rate": 2.2717605133059007e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 8680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39430772110329027,
|
||
|
|
"grad_norm": 0.10748831182718277,
|
||
|
|
"learning_rate": 2.253140117460255e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 8690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39476146991928945,
|
||
|
|
"grad_norm": 0.12304140627384186,
|
||
|
|
"learning_rate": 2.2345866533743453e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 8700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3952152187352886,
|
||
|
|
"grad_norm": 0.10171318054199219,
|
||
|
|
"learning_rate": 2.2161002813474397e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 8710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3956689675512878,
|
||
|
|
"grad_norm": 0.13390186429023743,
|
||
|
|
"learning_rate": 2.197681161099149e-06,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"step": 8720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39612271636728696,
|
||
|
|
"grad_norm": 0.1202036514878273,
|
||
|
|
"learning_rate": 2.179329451768031e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 8730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39657646518328615,
|
||
|
|
"grad_norm": 0.11887665092945099,
|
||
|
|
"learning_rate": 2.161045311910227e-06,
|
||
|
|
"loss": 0.0116,
|
||
|
|
"step": 8740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39703021399928534,
|
||
|
|
"grad_norm": 0.12759540975093842,
|
||
|
|
"learning_rate": 2.1428288994980816e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 8750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39748396281528453,
|
||
|
|
"grad_norm": 0.1575375497341156,
|
||
|
|
"learning_rate": 2.124680371918796e-06,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 8760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3979377116312837,
|
||
|
|
"grad_norm": 0.10857710242271423,
|
||
|
|
"learning_rate": 2.106599885973044e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 8770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3983914604472829,
|
||
|
|
"grad_norm": 0.13608422875404358,
|
||
|
|
"learning_rate": 2.088587597873637e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 8780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3988452092632821,
|
||
|
|
"grad_norm": 0.18532246351242065,
|
||
|
|
"learning_rate": 2.070643663244163e-06,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 8790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3992989580792813,
|
||
|
|
"grad_norm": 0.14446675777435303,
|
||
|
|
"learning_rate": 2.052768237117644e-06,
|
||
|
|
"loss": 0.0123,
|
||
|
|
"step": 8800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3997527068952804,
|
||
|
|
"grad_norm": 0.16273485124111176,
|
||
|
|
"learning_rate": 2.034961473935203e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 8810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4002064557112796,
|
||
|
|
"grad_norm": 0.0890413299202919,
|
||
|
|
"learning_rate": 2.0172235275447284e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 8820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4006602045272788,
|
||
|
|
"grad_norm": 0.1420058310031891,
|
||
|
|
"learning_rate": 1.9995545511995316e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 8830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.401113953343278,
|
||
|
|
"grad_norm": 0.10661931335926056,
|
||
|
|
"learning_rate": 1.9819546975570382e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 8840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4015677021592772,
|
||
|
|
"grad_norm": 0.15149742364883423,
|
||
|
|
"learning_rate": 1.9644241186774593e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40202145097527636,
|
||
|
|
"grad_norm": 0.09179948270320892,
|
||
|
|
"learning_rate": 1.9469629660224907e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 8860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40247519979127555,
|
||
|
|
"grad_norm": 0.12367530912160873,
|
||
|
|
"learning_rate": 1.9295713904539892e-06,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"step": 8870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40292894860727474,
|
||
|
|
"grad_norm": 0.08769435435533524,
|
||
|
|
"learning_rate": 1.912249542232675e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 8880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40338269742327393,
|
||
|
|
"grad_norm": 0.12802784144878387,
|
||
|
|
"learning_rate": 1.8949975710168357e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 8890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4038364462392731,
|
||
|
|
"grad_norm": 0.102344810962677,
|
||
|
|
"learning_rate": 1.8778156258610292e-06,
|
||
|
|
"loss": 0.0127,
|
||
|
|
"step": 8900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40429019505527225,
|
||
|
|
"grad_norm": 0.1247081309556961,
|
||
|
|
"learning_rate": 1.8607038552148039e-06,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 8910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40474394387127144,
|
||
|
|
"grad_norm": 0.08801490813493729,
|
||
|
|
"learning_rate": 1.8436624069214071e-06,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"step": 8920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4051976926872706,
|
||
|
|
"grad_norm": 0.10792089253664017,
|
||
|
|
"learning_rate": 1.8266914282165116e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 8930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4056514415032698,
|
||
|
|
"grad_norm": 0.073944590985775,
|
||
|
|
"learning_rate": 1.80979106572694e-06,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 8940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.406105190319269,
|
||
|
|
"grad_norm": 0.12981021404266357,
|
||
|
|
"learning_rate": 1.792961465469404e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 8950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4065589391352682,
|
||
|
|
"grad_norm": 0.19840888679027557,
|
||
|
|
"learning_rate": 1.7762027728492405e-06,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 8960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4070126879512674,
|
||
|
|
"grad_norm": 0.14734525978565216,
|
||
|
|
"learning_rate": 1.759515132659153e-06,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 8970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40746643676726657,
|
||
|
|
"grad_norm": 0.1168968454003334,
|
||
|
|
"learning_rate": 1.742898689077961e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 8980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40792018558326576,
|
||
|
|
"grad_norm": 0.07594756782054901,
|
||
|
|
"learning_rate": 1.726353585669356e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 8990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40837393439926495,
|
||
|
|
"grad_norm": 0.1320488303899765,
|
||
|
|
"learning_rate": 1.7098799653806663e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 9000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40882768321526414,
|
||
|
|
"grad_norm": 0.12867127358913422,
|
||
|
|
"learning_rate": 1.6934779705416082e-06,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 9010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40928143203126327,
|
||
|
|
"grad_norm": 0.13493511080741882,
|
||
|
|
"learning_rate": 1.6771477428630656e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 9020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40973518084726246,
|
||
|
|
"grad_norm": 0.13557837903499603,
|
||
|
|
"learning_rate": 1.6608894234358708e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 9030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41018892966326165,
|
||
|
|
"grad_norm": 0.1060461550951004,
|
||
|
|
"learning_rate": 1.6447031527295744e-06,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 9040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41064267847926084,
|
||
|
|
"grad_norm": 0.14427106082439423,
|
||
|
|
"learning_rate": 1.628589070591232e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 9050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41109642729526,
|
||
|
|
"grad_norm": 0.11962931603193283,
|
||
|
|
"learning_rate": 1.6125473162442107e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 9060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4115501761112592,
|
||
|
|
"grad_norm": 0.17841389775276184,
|
||
|
|
"learning_rate": 1.5965780282869693e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 9070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4120039249272584,
|
||
|
|
"grad_norm": 0.06603584438562393,
|
||
|
|
"learning_rate": 1.5806813446918657e-06,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"step": 9080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4124576737432576,
|
||
|
|
"grad_norm": 0.10497824102640152,
|
||
|
|
"learning_rate": 1.56485740280397e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 9090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4129114225592568,
|
||
|
|
"grad_norm": 0.10903108865022659,
|
||
|
|
"learning_rate": 1.5491063393398742e-06,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 9100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41336517137525597,
|
||
|
|
"grad_norm": 0.15178728103637695,
|
||
|
|
"learning_rate": 1.5334282903865116e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 9110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4138189201912551,
|
||
|
|
"grad_norm": 0.07373558729887009,
|
||
|
|
"learning_rate": 1.5178233913999784e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 9120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4142726690072543,
|
||
|
|
"grad_norm": 0.13341090083122253,
|
||
|
|
"learning_rate": 1.5022917772043633e-06,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 9130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4147264178232535,
|
||
|
|
"grad_norm": 0.08545683324337006,
|
||
|
|
"learning_rate": 1.4868335819905922e-06,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 9140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41518016663925267,
|
||
|
|
"grad_norm": 0.11698795109987259,
|
||
|
|
"learning_rate": 1.4714489393152586e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 9150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41563391545525186,
|
||
|
|
"grad_norm": 0.10725052654743195,
|
||
|
|
"learning_rate": 1.4561379820994692e-06,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 9160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41608766427125105,
|
||
|
|
"grad_norm": 0.14751501381397247,
|
||
|
|
"learning_rate": 1.4409008426277028e-06,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 9170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41654141308725023,
|
||
|
|
"grad_norm": 0.11136484146118164,
|
||
|
|
"learning_rate": 1.4257376525466594e-06,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 9180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4169951619032494,
|
||
|
|
"grad_norm": 0.0880153626203537,
|
||
|
|
"learning_rate": 1.4106485428641292e-06,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 9190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4174489107192486,
|
||
|
|
"grad_norm": 0.1130552664399147,
|
||
|
|
"learning_rate": 1.3956336439478612e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 9200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4179026595352478,
|
||
|
|
"grad_norm": 0.0887952521443367,
|
||
|
|
"learning_rate": 1.3806930855244315e-06,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 9210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41835640835124693,
|
||
|
|
"grad_norm": 0.13212358951568604,
|
||
|
|
"learning_rate": 1.3658269966781223e-06,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 9220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4188101571672461,
|
||
|
|
"grad_norm": 0.10839957743883133,
|
||
|
|
"learning_rate": 1.3510355058498114e-06,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"step": 9230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4192639059832453,
|
||
|
|
"grad_norm": 0.08262749761343002,
|
||
|
|
"learning_rate": 1.3363187408358612e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 9240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4197176547992445,
|
||
|
|
"grad_norm": 0.11758922785520554,
|
||
|
|
"learning_rate": 1.3216768287870185e-06,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 9250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4201714036152437,
|
||
|
|
"grad_norm": 0.13314858078956604,
|
||
|
|
"learning_rate": 1.3071098962073004e-06,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"step": 9260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4206251524312429,
|
||
|
|
"grad_norm": 0.08712945133447647,
|
||
|
|
"learning_rate": 1.292618068952921e-06,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 9270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42107890124724207,
|
||
|
|
"grad_norm": 0.12162909656763077,
|
||
|
|
"learning_rate": 1.2782014722311897e-06,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 9280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42153265006324125,
|
||
|
|
"grad_norm": 0.0974271148443222,
|
||
|
|
"learning_rate": 1.2638602305994364e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 9290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42198639887924044,
|
||
|
|
"grad_norm": 0.08132126927375793,
|
||
|
|
"learning_rate": 1.2495944679639383e-06,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 9300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42244014769523963,
|
||
|
|
"grad_norm": 0.09410033375024796,
|
||
|
|
"learning_rate": 1.2354043075788391e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 9310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42289389651123876,
|
||
|
|
"grad_norm": 0.11205838620662689,
|
||
|
|
"learning_rate": 1.2212898720450915e-06,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 9320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42334764532723795,
|
||
|
|
"grad_norm": 0.08991268277168274,
|
||
|
|
"learning_rate": 1.2072512833093964e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 9330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42380139414323714,
|
||
|
|
"grad_norm": 0.15082821249961853,
|
||
|
|
"learning_rate": 1.1932886626631512e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 9340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42425514295923633,
|
||
|
|
"grad_norm": 0.16661033034324646,
|
||
|
|
"learning_rate": 1.179402130741396e-06,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"step": 9350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4247088917752355,
|
||
|
|
"grad_norm": 0.12957607209682465,
|
||
|
|
"learning_rate": 1.165591807521781e-06,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 9360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4251626405912347,
|
||
|
|
"grad_norm": 0.12426209449768066,
|
||
|
|
"learning_rate": 1.1518578123235191e-06,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 9370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4256163894072339,
|
||
|
|
"grad_norm": 0.1846769154071808,
|
||
|
|
"learning_rate": 1.1382002638063584e-06,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 9380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4260701382232331,
|
||
|
|
"grad_norm": 0.13810695707798004,
|
||
|
|
"learning_rate": 1.1246192799695666e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4265238870392323,
|
||
|
|
"grad_norm": 0.16302891075611115,
|
||
|
|
"learning_rate": 1.1111149781508968e-06,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 9400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42697763585523146,
|
||
|
|
"grad_norm": 0.12007147818803787,
|
||
|
|
"learning_rate": 1.0976874750255828e-06,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 9410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4274313846712306,
|
||
|
|
"grad_norm": 0.10976436734199524,
|
||
|
|
"learning_rate": 1.0843368866053271e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4278851334872298,
|
||
|
|
"grad_norm": 0.12193258106708527,
|
||
|
|
"learning_rate": 1.0710633282372996e-06,
|
||
|
|
"loss": 0.0107,
|
||
|
|
"step": 9430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.428338882303229,
|
||
|
|
"grad_norm": 0.12088105827569962,
|
||
|
|
"learning_rate": 1.0578669146031484e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 9440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42879263111922816,
|
||
|
|
"grad_norm": 0.1749945878982544,
|
||
|
|
"learning_rate": 1.0447477597179945e-06,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 9450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42924637993522735,
|
||
|
|
"grad_norm": 0.09087590128183365,
|
||
|
|
"learning_rate": 1.0317059769294557e-06,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42970012875122654,
|
||
|
|
"grad_norm": 0.1413136124610901,
|
||
|
|
"learning_rate": 1.0187416789166672e-06,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 9470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43015387756722573,
|
||
|
|
"grad_norm": 0.1012006476521492,
|
||
|
|
"learning_rate": 1.0058549776893068e-06,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 9480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4306076263832249,
|
||
|
|
"grad_norm": 0.15488676726818085,
|
||
|
|
"learning_rate": 9.930459845866313e-07,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 9490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4310613751992241,
|
||
|
|
"grad_norm": 0.12633702158927917,
|
||
|
|
"learning_rate": 9.803148102765026e-07,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 9500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4315151240152233,
|
||
|
|
"grad_norm": 0.13584880530834198,
|
||
|
|
"learning_rate": 9.676615647544452e-07,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 9510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4319688728312225,
|
||
|
|
"grad_norm": 0.17178885638713837,
|
||
|
|
"learning_rate": 9.550863573426838e-07,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 9520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4324226216472216,
|
||
|
|
"grad_norm": 0.13634297251701355,
|
||
|
|
"learning_rate": 9.425892966892136e-07,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 9530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4328763704632208,
|
||
|
|
"grad_norm": 0.1509203016757965,
|
||
|
|
"learning_rate": 9.301704907668474e-07,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43333011927922,
|
||
|
|
"grad_norm": 0.10559886693954468,
|
||
|
|
"learning_rate": 9.178300468722901e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 9550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4337838680952192,
|
||
|
|
"grad_norm": 0.1276906430721283,
|
||
|
|
"learning_rate": 9.055680716252068e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 9560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43423761691121837,
|
||
|
|
"grad_norm": 0.12274769693613052,
|
||
|
|
"learning_rate": 8.933846709673078e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 9570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43469136572721756,
|
||
|
|
"grad_norm": 0.12317559868097305,
|
||
|
|
"learning_rate": 8.812799501614311e-07,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 9580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43514511454321675,
|
||
|
|
"grad_norm": 0.08936764299869537,
|
||
|
|
"learning_rate": 8.692540137906314e-07,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 9590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43559886335921594,
|
||
|
|
"grad_norm": 0.13898974657058716,
|
||
|
|
"learning_rate": 8.573069657572752e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 9600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4360526121752151,
|
||
|
|
"grad_norm": 0.10988219082355499,
|
||
|
|
"learning_rate": 8.454389092821458e-07,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 9610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4365063609912143,
|
||
|
|
"grad_norm": 0.13083656132221222,
|
||
|
|
"learning_rate": 8.336499469035509e-07,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 9620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43696010980721345,
|
||
|
|
"grad_norm": 0.08637557923793793,
|
||
|
|
"learning_rate": 8.219401804764382e-07,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 9630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43741385862321264,
|
||
|
|
"grad_norm": 0.12404067814350128,
|
||
|
|
"learning_rate": 8.10309711171512e-07,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 9640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4378676074392118,
|
||
|
|
"grad_norm": 0.1374865174293518,
|
||
|
|
"learning_rate": 7.987586394743608e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 9650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.438321356255211,
|
||
|
|
"grad_norm": 0.13313066959381104,
|
||
|
|
"learning_rate": 7.872870651845888e-07,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 9660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4387751050712102,
|
||
|
|
"grad_norm": 0.1515800505876541,
|
||
|
|
"learning_rate": 7.758950874149541e-07,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 9670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4392288538872094,
|
||
|
|
"grad_norm": 0.19143113493919373,
|
||
|
|
"learning_rate": 7.645828045905157e-07,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 9680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4396826027032086,
|
||
|
|
"grad_norm": 0.125772625207901,
|
||
|
|
"learning_rate": 7.533503144477738e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 9690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44013635151920777,
|
||
|
|
"grad_norm": 0.13882988691329956,
|
||
|
|
"learning_rate": 7.421977140338376e-07,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 9700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44059010033520696,
|
||
|
|
"grad_norm": 0.12801378965377808,
|
||
|
|
"learning_rate": 7.311250997055752e-07,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 9710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44104384915120615,
|
||
|
|
"grad_norm": 0.12278799712657928,
|
||
|
|
"learning_rate": 7.201325671287862e-07,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 9720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4414975979672053,
|
||
|
|
"grad_norm": 0.10260124504566193,
|
||
|
|
"learning_rate": 7.092202112773817e-07,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44195134678320447,
|
||
|
|
"grad_norm": 0.07633895426988602,
|
||
|
|
"learning_rate": 6.983881264325521e-07,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"step": 9740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44240509559920366,
|
||
|
|
"grad_norm": 0.1047191321849823,
|
||
|
|
"learning_rate": 6.876364061819574e-07,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 9750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44285884441520285,
|
||
|
|
"grad_norm": 0.1236443743109703,
|
||
|
|
"learning_rate": 6.769651434189195e-07,
|
||
|
|
"loss": 0.0084,
|
||
|
|
"step": 9760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44331259323120203,
|
||
|
|
"grad_norm": 0.1673034131526947,
|
||
|
|
"learning_rate": 6.663744303416231e-07,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 9770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4437663420472012,
|
||
|
|
"grad_norm": 0.11086335778236389,
|
||
|
|
"learning_rate": 6.558643584523117e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 9780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4442200908632004,
|
||
|
|
"grad_norm": 0.12456338852643967,
|
||
|
|
"learning_rate": 6.454350185564994e-07,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 9790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4446738396791996,
|
||
|
|
"grad_norm": 0.0809222161769867,
|
||
|
|
"learning_rate": 6.350865007621887e-07,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 9800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4451275884951988,
|
||
|
|
"grad_norm": 0.12166428565979004,
|
||
|
|
"learning_rate": 6.248188944790933e-07,
|
||
|
|
"loss": 0.0072,
|
||
|
|
"step": 9810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.445581337311198,
|
||
|
|
"grad_norm": 0.051547128707170486,
|
||
|
|
"learning_rate": 6.146322884178591e-07,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 9820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4460350861271971,
|
||
|
|
"grad_norm": 0.15253229439258575,
|
||
|
|
"learning_rate": 6.045267705893043e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 9830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4464888349431963,
|
||
|
|
"grad_norm": 0.094617560505867,
|
||
|
|
"learning_rate": 5.945024283036549e-07,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"step": 9840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4469425837591955,
|
||
|
|
"grad_norm": 0.10362132638692856,
|
||
|
|
"learning_rate": 5.845593481697931e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 9850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4473963325751947,
|
||
|
|
"grad_norm": 0.11719254404306412,
|
||
|
|
"learning_rate": 5.746976160945051e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 9860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44785008139119387,
|
||
|
|
"grad_norm": 0.18373169004917145,
|
||
|
|
"learning_rate": 5.649173172817457e-07,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 9870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44830383020719305,
|
||
|
|
"grad_norm": 0.15360820293426514,
|
||
|
|
"learning_rate": 5.55218536231894e-07,
|
||
|
|
"loss": 0.0125,
|
||
|
|
"step": 9880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44875757902319224,
|
||
|
|
"grad_norm": 0.09183119237422943,
|
||
|
|
"learning_rate": 5.456013567410312e-07,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 9890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44921132783919143,
|
||
|
|
"grad_norm": 0.11628349125385284,
|
||
|
|
"learning_rate": 5.360658619002068e-07,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 9900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4496650766551906,
|
||
|
|
"grad_norm": 0.10773396492004395,
|
||
|
|
"learning_rate": 5.266121340947327e-07,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 9910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4501188254711898,
|
||
|
|
"grad_norm": 0.10770761966705322,
|
||
|
|
"learning_rate": 5.172402550034639e-07,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 9920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45057257428718894,
|
||
|
|
"grad_norm": 0.06062379479408264,
|
||
|
|
"learning_rate": 5.079503055980939e-07,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"step": 9930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45102632310318813,
|
||
|
|
"grad_norm": 0.09935268759727478,
|
||
|
|
"learning_rate": 4.987423661424517e-07,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 9940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4514800719191873,
|
||
|
|
"grad_norm": 0.10356450080871582,
|
||
|
|
"learning_rate": 4.896165161918176e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 9950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4519338207351865,
|
||
|
|
"grad_norm": 0.16462033987045288,
|
||
|
|
"learning_rate": 4.805728345922267e-07,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 9960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4523875695511857,
|
||
|
|
"grad_norm": 0.13190323114395142,
|
||
|
|
"learning_rate": 4.716113994797944e-07,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 9970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4528413183671849,
|
||
|
|
"grad_norm": 0.10049797594547272,
|
||
|
|
"learning_rate": 4.627322882800345e-07,
|
||
|
|
"loss": 0.0117,
|
||
|
|
"step": 9980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4532950671831841,
|
||
|
|
"grad_norm": 0.15959760546684265,
|
||
|
|
"learning_rate": 4.5393557770719744e-07,
|
||
|
|
"loss": 0.0109,
|
||
|
|
"step": 9990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45374881599918326,
|
||
|
|
"grad_norm": 0.18049407005310059,
|
||
|
|
"learning_rate": 4.4522134376359995e-07,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 10000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45420256481518245,
|
||
|
|
"grad_norm": 0.1254950761795044,
|
||
|
|
"learning_rate": 4.3658966173897866e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 10010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45465631363118164,
|
||
|
|
"grad_norm": 0.1041630282998085,
|
||
|
|
"learning_rate": 4.2804060620982747e-07,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"step": 10020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4551100624471808,
|
||
|
|
"grad_norm": 0.13439397513866425,
|
||
|
|
"learning_rate": 4.1957425103876235e-07,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 10030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45556381126317996,
|
||
|
|
"grad_norm": 0.09225888550281525,
|
||
|
|
"learning_rate": 4.111906693738799e-07,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"step": 10040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45601756007917915,
|
||
|
|
"grad_norm": 0.11776017397642136,
|
||
|
|
"learning_rate": 4.02889933648124e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 10050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45647130889517834,
|
||
|
|
"grad_norm": 0.08760102838277817,
|
||
|
|
"learning_rate": 3.946721155786615e-07,
|
||
|
|
"loss": 0.0069,
|
||
|
|
"step": 10060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45692505771117753,
|
||
|
|
"grad_norm": 0.11535688489675522,
|
||
|
|
"learning_rate": 3.865372861662664e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4573788065271767,
|
||
|
|
"grad_norm": 0.13672105967998505,
|
||
|
|
"learning_rate": 3.784855156946965e-07,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 10080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4578325553431759,
|
||
|
|
"grad_norm": 0.12640318274497986,
|
||
|
|
"learning_rate": 3.705168737300968e-07,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 10090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4582863041591751,
|
||
|
|
"grad_norm": 0.12096633017063141,
|
||
|
|
"learning_rate": 3.626314291203914e-07,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 10100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4587400529751743,
|
||
|
|
"grad_norm": 0.15392784774303436,
|
||
|
|
"learning_rate": 3.548292499946937e-07,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 10110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4591938017911735,
|
||
|
|
"grad_norm": 0.123865507543087,
|
||
|
|
"learning_rate": 3.4711040376271264e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45964755060717266,
|
||
|
|
"grad_norm": 0.10113903135061264,
|
||
|
|
"learning_rate": 3.394749571141731e-07,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 10130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4601012994231718,
|
||
|
|
"grad_norm": 0.1781150996685028,
|
||
|
|
"learning_rate": 3.319229760182441e-07,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 10140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.460555048239171,
|
||
|
|
"grad_norm": 0.14278154075145721,
|
||
|
|
"learning_rate": 3.244545257229559e-07,
|
||
|
|
"loss": 0.0126,
|
||
|
|
"step": 10150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46100879705517017,
|
||
|
|
"grad_norm": 0.14660927653312683,
|
||
|
|
"learning_rate": 3.170696707546539e-07,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 10160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46146254587116936,
|
||
|
|
"grad_norm": 0.15213704109191895,
|
||
|
|
"learning_rate": 3.0976847491742347e-07,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 10170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46191629468716855,
|
||
|
|
"grad_norm": 0.10338576883077621,
|
||
|
|
"learning_rate": 3.0255100129255364e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46237004350316774,
|
||
|
|
"grad_norm": 0.13408440351486206,
|
||
|
|
"learning_rate": 2.9541731223797997e-07,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4628237923191669,
|
||
|
|
"grad_norm": 0.10805374383926392,
|
||
|
|
"learning_rate": 2.883674693877558e-07,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 10200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4632775411351661,
|
||
|
|
"grad_norm": 0.06863702088594437,
|
||
|
|
"learning_rate": 2.8140153365151304e-07,
|
||
|
|
"loss": 0.0121,
|
||
|
|
"step": 10210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4637312899511653,
|
||
|
|
"grad_norm": 0.05823841691017151,
|
||
|
|
"learning_rate": 2.7451956521393983e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 10220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4641850387671645,
|
||
|
|
"grad_norm": 0.12672090530395508,
|
||
|
|
"learning_rate": 2.677216235342561e-07,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 10230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4646387875831636,
|
||
|
|
"grad_norm": 0.13820886611938477,
|
||
|
|
"learning_rate": 2.6100776734570345e-07,
|
||
|
|
"loss": 0.0114,
|
||
|
|
"step": 10240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4650925363991628,
|
||
|
|
"grad_norm": 0.11121530085802078,
|
||
|
|
"learning_rate": 2.543780546550401e-07,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.465546285215162,
|
||
|
|
"grad_norm": 0.17037838697433472,
|
||
|
|
"learning_rate": 2.478325427420336e-07,
|
||
|
|
"loss": 0.0105,
|
||
|
|
"step": 10260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4660000340311612,
|
||
|
|
"grad_norm": 0.0819673016667366,
|
||
|
|
"learning_rate": 2.4137128815896803e-07,
|
||
|
|
"loss": 0.0108,
|
||
|
|
"step": 10270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4664537828471604,
|
||
|
|
"grad_norm": 0.10760702937841415,
|
||
|
|
"learning_rate": 2.3499434673015852e-07,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 10280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46690753166315957,
|
||
|
|
"grad_norm": 0.0882890373468399,
|
||
|
|
"learning_rate": 2.2870177355146406e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 10290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46736128047915876,
|
||
|
|
"grad_norm": 0.135042205452919,
|
||
|
|
"learning_rate": 2.2249362298981892e-07,
|
||
|
|
"loss": 0.0124,
|
||
|
|
"step": 10300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46781502929515795,
|
||
|
|
"grad_norm": 0.11944553256034851,
|
||
|
|
"learning_rate": 2.1636994868275085e-07,
|
||
|
|
"loss": 0.0079,
|
||
|
|
"step": 10310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46826877811115714,
|
||
|
|
"grad_norm": 0.09973999112844467,
|
||
|
|
"learning_rate": 2.1033080353793144e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 10320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4687225269271563,
|
||
|
|
"grad_norm": 0.10816528648138046,
|
||
|
|
"learning_rate": 2.043762397327087e-07,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 10330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46917627574315546,
|
||
|
|
"grad_norm": 0.1579977422952652,
|
||
|
|
"learning_rate": 1.985063087136596e-07,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 10340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46963002455915465,
|
||
|
|
"grad_norm": 0.11629801988601685,
|
||
|
|
"learning_rate": 1.927210611961494e-07,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 10350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47008377337515383,
|
||
|
|
"grad_norm": 0.12378907203674316,
|
||
|
|
"learning_rate": 1.870205471638864e-07,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"step": 10360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.470537522191153,
|
||
|
|
"grad_norm": 0.11535657197237015,
|
||
|
|
"learning_rate": 1.814048158684978e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 10370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4709912710071522,
|
||
|
|
"grad_norm": 0.16682352125644684,
|
||
|
|
"learning_rate": 1.7587391582909452e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 10380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4714450198231514,
|
||
|
|
"grad_norm": 0.11436964571475983,
|
||
|
|
"learning_rate": 1.7042789483186273e-07,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 10390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4718987686391506,
|
||
|
|
"grad_norm": 0.11008341610431671,
|
||
|
|
"learning_rate": 1.6506679992964292e-07,
|
||
|
|
"loss": 0.0089,
|
||
|
|
"step": 10400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4723525174551498,
|
||
|
|
"grad_norm": 0.1441035121679306,
|
||
|
|
"learning_rate": 1.597906774415281e-07,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 10410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47280626627114897,
|
||
|
|
"grad_norm": 0.11642053723335266,
|
||
|
|
"learning_rate": 1.5459957295245965e-07,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 10420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47326001508714816,
|
||
|
|
"grad_norm": 0.12789706885814667,
|
||
|
|
"learning_rate": 1.494935313128376e-07,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 10430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4737137639031473,
|
||
|
|
"grad_norm": 0.127280130982399,
|
||
|
|
"learning_rate": 1.4447259663812886e-07,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 10440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4741675127191465,
|
||
|
|
"grad_norm": 0.13570982217788696,
|
||
|
|
"learning_rate": 1.395368123084917e-07,
|
||
|
|
"loss": 0.0076,
|
||
|
|
"step": 10450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47462126153514567,
|
||
|
|
"grad_norm": 0.15512555837631226,
|
||
|
|
"learning_rate": 1.3468622096839524e-07,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 10460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47507501035114486,
|
||
|
|
"grad_norm": 0.09604503214359283,
|
||
|
|
"learning_rate": 1.2992086452625175e-07,
|
||
|
|
"loss": 0.0083,
|
||
|
|
"step": 10470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47552875916714404,
|
||
|
|
"grad_norm": 0.1286637783050537,
|
||
|
|
"learning_rate": 1.252407841540626e-07,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47598250798314323,
|
||
|
|
"grad_norm": 0.12484487891197205,
|
||
|
|
"learning_rate": 1.2064602028704742e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4764362567991424,
|
||
|
|
"grad_norm": 0.10206886380910873,
|
||
|
|
"learning_rate": 1.1613661262331099e-07,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4768900056151416,
|
||
|
|
"grad_norm": 0.12927402555942535,
|
||
|
|
"learning_rate": 1.1171260012348805e-07,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 10510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4773437544311408,
|
||
|
|
"grad_norm": 0.09732365608215332,
|
||
|
|
"learning_rate": 1.0737402101041349e-07,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 10520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47779750324714,
|
||
|
|
"grad_norm": 0.10109642148017883,
|
||
|
|
"learning_rate": 1.0312091276878821e-07,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 10530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4782512520631391,
|
||
|
|
"grad_norm": 0.09376849234104156,
|
||
|
|
"learning_rate": 9.895331214485937e-08,
|
||
|
|
"loss": 0.0094,
|
||
|
|
"step": 10540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4787050008791383,
|
||
|
|
"grad_norm": 0.11916600167751312,
|
||
|
|
"learning_rate": 9.487125514610063e-08,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 10550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4791587496951375,
|
||
|
|
"grad_norm": 0.1303083449602127,
|
||
|
|
"learning_rate": 9.087477704089686e-08,
|
||
|
|
"loss": 0.0093,
|
||
|
|
"step": 10560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4796124985111367,
|
||
|
|
"grad_norm": 0.11214817315340042,
|
||
|
|
"learning_rate": 8.696391235824886e-08,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4800662473271359,
|
||
|
|
"grad_norm": 0.14808695018291473,
|
||
|
|
"learning_rate": 8.313869488746574e-08,
|
||
|
|
"loss": 0.0085,
|
||
|
|
"step": 10580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48051999614313506,
|
||
|
|
"grad_norm": 0.15055687725543976,
|
||
|
|
"learning_rate": 7.939915767787853e-08,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"step": 10590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48097374495913425,
|
||
|
|
"grad_norm": 0.12333395332098007,
|
||
|
|
"learning_rate": 7.574533303855491e-08,
|
||
|
|
"loss": 0.0106,
|
||
|
|
"step": 10600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48142749377513344,
|
||
|
|
"grad_norm": 0.10209726542234421,
|
||
|
|
"learning_rate": 7.217725253801488e-08,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48188124259113263,
|
||
|
|
"grad_norm": 0.13615788519382477,
|
||
|
|
"learning_rate": 6.869494700396328e-08,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 10620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4823349914071318,
|
||
|
|
"grad_norm": 0.12133138626813889,
|
||
|
|
"learning_rate": 6.529844652301997e-08,
|
||
|
|
"loss": 0.0113,
|
||
|
|
"step": 10630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.482788740223131,
|
||
|
|
"grad_norm": 0.11862960457801819,
|
||
|
|
"learning_rate": 6.19877804404645e-08,
|
||
|
|
"loss": 0.0073,
|
||
|
|
"step": 10640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48324248903913014,
|
||
|
|
"grad_norm": 0.11843565106391907,
|
||
|
|
"learning_rate": 5.876297735997738e-08,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 10650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48369623785512933,
|
||
|
|
"grad_norm": 0.10824906080961227,
|
||
|
|
"learning_rate": 5.562406514339369e-08,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 10660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4841499866711285,
|
||
|
|
"grad_norm": 0.10267908871173859,
|
||
|
|
"learning_rate": 5.257107091046654e-08,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 10670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4846037354871277,
|
||
|
|
"grad_norm": 0.1007552221417427,
|
||
|
|
"learning_rate": 4.9604021038628384e-08,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"step": 10680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4850574843031269,
|
||
|
|
"grad_norm": 0.1403747946023941,
|
||
|
|
"learning_rate": 4.6722941162764546e-08,
|
||
|
|
"loss": 0.0091,
|
||
|
|
"step": 10690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4855112331191261,
|
||
|
|
"grad_norm": 0.1846962571144104,
|
||
|
|
"learning_rate": 4.392785617499451e-08,
|
||
|
|
"loss": 0.0096,
|
||
|
|
"step": 10700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4859649819351253,
|
||
|
|
"grad_norm": 0.11575271934270859,
|
||
|
|
"learning_rate": 4.1218790224450965e-08,
|
||
|
|
"loss": 0.0112,
|
||
|
|
"step": 10710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48641873075112446,
|
||
|
|
"grad_norm": 0.1052660271525383,
|
||
|
|
"learning_rate": 3.859576671707554e-08,
|
||
|
|
"loss": 0.0092,
|
||
|
|
"step": 10720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48687247956712365,
|
||
|
|
"grad_norm": 0.12023202329874039,
|
||
|
|
"learning_rate": 3.605880831541564e-08,
|
||
|
|
"loss": 0.0122,
|
||
|
|
"step": 10730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48732622838312284,
|
||
|
|
"grad_norm": 0.13231338560581207,
|
||
|
|
"learning_rate": 3.36079369384279e-08,
|
||
|
|
"loss": 0.0118,
|
||
|
|
"step": 10740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.487779977199122,
|
||
|
|
"grad_norm": 0.09625431150197983,
|
||
|
|
"learning_rate": 3.124317376129171e-08,
|
||
|
|
"loss": 0.0068,
|
||
|
|
"step": 10750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48823372601512116,
|
||
|
|
"grad_norm": 0.1553715318441391,
|
||
|
|
"learning_rate": 2.8964539215220468e-08,
|
||
|
|
"loss": 0.0097,
|
||
|
|
"step": 10760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48868747483112035,
|
||
|
|
"grad_norm": 0.10737349838018417,
|
||
|
|
"learning_rate": 2.6772052987290575e-08,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 10770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48914122364711954,
|
||
|
|
"grad_norm": 0.0638694316148758,
|
||
|
|
"learning_rate": 2.4665734020270503e-08,
|
||
|
|
"loss": 0.0086,
|
||
|
|
"step": 10780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4895949724631187,
|
||
|
|
"grad_norm": 0.06933131068944931,
|
||
|
|
"learning_rate": 2.2645600512452016e-08,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 10790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4900487212791179,
|
||
|
|
"grad_norm": 0.1229269951581955,
|
||
|
|
"learning_rate": 2.0711669917501398e-08,
|
||
|
|
"loss": 0.0078,
|
||
|
|
"step": 10800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4905024700951171,
|
||
|
|
"grad_norm": 0.14476285874843597,
|
||
|
|
"learning_rate": 1.8863958944300708e-08,
|
||
|
|
"loss": 0.0101,
|
||
|
|
"step": 10810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4909562189111163,
|
||
|
|
"grad_norm": 0.1204502135515213,
|
||
|
|
"learning_rate": 1.710248355680788e-08,
|
||
|
|
"loss": 0.011,
|
||
|
|
"step": 10820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4914099677271155,
|
||
|
|
"grad_norm": 0.09992097318172455,
|
||
|
|
"learning_rate": 1.5427258973919058e-08,
|
||
|
|
"loss": 0.0088,
|
||
|
|
"step": 10830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49186371654311467,
|
||
|
|
"grad_norm": 0.1434265375137329,
|
||
|
|
"learning_rate": 1.3838299669334255e-08,
|
||
|
|
"loss": 0.009,
|
||
|
|
"step": 10840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4923174653591138,
|
||
|
|
"grad_norm": 0.08076232671737671,
|
||
|
|
"learning_rate": 1.2335619371434126e-08,
|
||
|
|
"loss": 0.0082,
|
||
|
|
"step": 10850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.492771214175113,
|
||
|
|
"grad_norm": 0.10127487778663635,
|
||
|
|
"learning_rate": 1.0919231063161173e-08,
|
||
|
|
"loss": 0.0095,
|
||
|
|
"step": 10860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4932249629911122,
|
||
|
|
"grad_norm": 0.11844775080680847,
|
||
|
|
"learning_rate": 9.589146981907604e-09,
|
||
|
|
"loss": 0.0087,
|
||
|
|
"step": 10870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49367871180711137,
|
||
|
|
"grad_norm": 0.10561437904834747,
|
||
|
|
"learning_rate": 8.345378619408762e-09,
|
||
|
|
"loss": 0.0071,
|
||
|
|
"step": 10880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49413246062311056,
|
||
|
|
"grad_norm": 0.14991527795791626,
|
||
|
|
"learning_rate": 7.187936721646527e-09,
|
||
|
|
"loss": 0.0102,
|
||
|
|
"step": 10890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49458620943910975,
|
||
|
|
"grad_norm": 0.12201883643865585,
|
||
|
|
"learning_rate": 6.116831288751624e-09,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 10900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49503995825510894,
|
||
|
|
"grad_norm": 0.09622867405414581,
|
||
|
|
"learning_rate": 5.13207157492257e-09,
|
||
|
|
"loss": 0.0099,
|
||
|
|
"step": 10910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4954937070711081,
|
||
|
|
"grad_norm": 0.1273748129606247,
|
||
|
|
"learning_rate": 4.233666088341304e-09,
|
||
|
|
"loss": 0.01,
|
||
|
|
"step": 10920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4959474558871073,
|
||
|
|
"grad_norm": 0.13358333706855774,
|
||
|
|
"learning_rate": 3.4216225911032354e-09,
|
||
|
|
"loss": 0.0098,
|
||
|
|
"step": 10930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4964012047031065,
|
||
|
|
"grad_norm": 0.11978624016046524,
|
||
|
|
"learning_rate": 2.6959480991484157e-09,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"step": 10940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49685495351910564,
|
||
|
|
"grad_norm": 0.15657968819141388,
|
||
|
|
"learning_rate": 2.0566488821993635e-09,
|
||
|
|
"loss": 0.012,
|
||
|
|
"step": 10950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4973087023351048,
|
||
|
|
"grad_norm": 0.13145264983177185,
|
||
|
|
"learning_rate": 1.503730463709996e-09,
|
||
|
|
"loss": 0.0079,
|
||
|
|
"step": 10960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.497762451151104,
|
||
|
|
"grad_norm": 0.19312238693237305,
|
||
|
|
"learning_rate": 1.0371976208167766e-09,
|
||
|
|
"loss": 0.0119,
|
||
|
|
"step": 10970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4982161999671032,
|
||
|
|
"grad_norm": 0.09484487026929855,
|
||
|
|
"learning_rate": 6.570543842965293e-10,
|
||
|
|
"loss": 0.0104,
|
||
|
|
"step": 10980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4986699487831024,
|
||
|
|
"grad_norm": 0.06194043532013893,
|
||
|
|
"learning_rate": 3.6330403853201966e-10,
|
||
|
|
"loss": 0.0074,
|
||
|
|
"step": 10990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4991236975991016,
|
||
|
|
"grad_norm": 0.09161436557769775,
|
||
|
|
"learning_rate": 1.5594912148420017e-10,
|
||
|
|
"loss": 0.0103,
|
||
|
|
"step": 11000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49957744641510077,
|
||
|
|
"grad_norm": 0.12720157206058502,
|
||
|
|
"learning_rate": 3.49914246700056e-11,
|
||
|
|
"loss": 0.0129,
|
||
|
|
"step": 11010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4999858203495,
|
||
|
|
"step": 11019,
|
||
|
|
"total_flos": 5.677505360785572e+18,
|
||
|
|
"train_loss": 0.017134081627401078,
|
||
|
|
"train_runtime": 75300.0846,
|
||
|
|
"train_samples_per_second": 1.171,
|
||
|
|
"train_steps_per_second": 0.146
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 10,
|
||
|
|
"max_steps": 11019,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 5.677505360785572e+18,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|