{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 100,
  "global_step": 2669,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
|
|
    {
      "epoch": 0.00937207122774133,
      "grad_norm": 32.909061431884766,
      "learning_rate": 5.970149253731343e-07,
      "loss": 1.2647,
      "step": 5
    },
|
|
{
|
|
"epoch": 0.01874414245548266,
|
|
"grad_norm": 25.104095458984375,
|
|
"learning_rate": 1.3432835820895524e-06,
|
|
"loss": 1.1231,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.028116213683223992,
|
|
"grad_norm": 9.685723304748535,
|
|
"learning_rate": 2.08955223880597e-06,
|
|
"loss": 0.7966,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.03748828491096532,
|
|
"grad_norm": 3.455462694168091,
|
|
"learning_rate": 2.835820895522388e-06,
|
|
"loss": 0.59,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.046860356138706656,
|
|
"grad_norm": 3.022719621658325,
|
|
"learning_rate": 3.582089552238806e-06,
|
|
"loss": 0.5539,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.056232427366447985,
|
|
"grad_norm": 2.2470591068267822,
|
|
"learning_rate": 4.3283582089552236e-06,
|
|
"loss": 0.4943,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.06560449859418932,
|
|
"grad_norm": 2.7969272136688232,
|
|
"learning_rate": 5.074626865671642e-06,
|
|
"loss": 0.5235,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.07497656982193064,
|
|
"grad_norm": 4.2170538902282715,
|
|
"learning_rate": 5.820895522388061e-06,
|
|
"loss": 0.5126,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.08434864104967198,
|
|
"grad_norm": 34.95182418823242,
|
|
"learning_rate": 6.567164179104478e-06,
|
|
"loss": 0.4857,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.09372071227741331,
|
|
"grad_norm": 2.0386860370635986,
|
|
"learning_rate": 7.313432835820896e-06,
|
|
"loss": 0.4885,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.10309278350515463,
|
|
"grad_norm": 2.5301098823547363,
|
|
"learning_rate": 8.059701492537314e-06,
|
|
"loss": 0.48,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.11246485473289597,
|
|
"grad_norm": 2.5187911987304688,
|
|
"learning_rate": 8.805970149253732e-06,
|
|
"loss": 0.4729,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.1218369259606373,
|
|
"grad_norm": 2.380256175994873,
|
|
"learning_rate": 9.552238805970149e-06,
|
|
"loss": 0.5112,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.13120899718837864,
|
|
"grad_norm": 2.1329147815704346,
|
|
"learning_rate": 1.029850746268657e-05,
|
|
"loss": 0.4578,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.14058106841611998,
|
|
"grad_norm": 2.7913565635681152,
|
|
"learning_rate": 1.1044776119402986e-05,
|
|
"loss": 0.4598,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.14995313964386128,
|
|
"grad_norm": 2.056675910949707,
|
|
"learning_rate": 1.1791044776119405e-05,
|
|
"loss": 0.4831,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.15932521087160262,
|
|
"grad_norm": 2.386592388153076,
|
|
"learning_rate": 1.2537313432835823e-05,
|
|
"loss": 0.473,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.16869728209934395,
|
|
"grad_norm": 2.63767409324646,
|
|
"learning_rate": 1.328358208955224e-05,
|
|
"loss": 0.4841,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.1780693533270853,
|
|
"grad_norm": 2.0835254192352295,
|
|
"learning_rate": 1.4029850746268658e-05,
|
|
"loss": 0.4657,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.18744142455482662,
|
|
"grad_norm": 2.168680429458618,
|
|
"learning_rate": 1.4776119402985077e-05,
|
|
"loss": 0.4937,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.18744142455482662,
|
|
"eval_loss": 0.6319828033447266,
|
|
"eval_runtime": 111.5664,
|
|
"eval_samples_per_second": 4.482,
|
|
"eval_steps_per_second": 2.241,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.19681349578256796,
|
|
"grad_norm": 1.953171730041504,
|
|
"learning_rate": 1.5522388059701494e-05,
|
|
"loss": 0.4405,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.20618556701030927,
|
|
"grad_norm": 1.805274248123169,
|
|
"learning_rate": 1.626865671641791e-05,
|
|
"loss": 0.4922,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.2155576382380506,
|
|
"grad_norm": 2.0582938194274902,
|
|
"learning_rate": 1.701492537313433e-05,
|
|
"loss": 0.4722,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.22492970946579194,
|
|
"grad_norm": 2.067007064819336,
|
|
"learning_rate": 1.7761194029850748e-05,
|
|
"loss": 0.4876,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.23430178069353327,
|
|
"grad_norm": 2.0720055103302,
|
|
"learning_rate": 1.8507462686567165e-05,
|
|
"loss": 0.479,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.2436738519212746,
|
|
"grad_norm": 2.026207208633423,
|
|
"learning_rate": 1.9253731343283585e-05,
|
|
"loss": 0.4642,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.2530459231490159,
|
|
"grad_norm": 1.9669533967971802,
|
|
"learning_rate": 2e-05,
|
|
"loss": 0.479,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.2624179943767573,
|
|
"grad_norm": 1.7911089658737183,
|
|
"learning_rate": 1.9999808172939662e-05,
|
|
"loss": 0.484,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.2717900656044986,
|
|
"grad_norm": 1.6933950185775757,
|
|
"learning_rate": 1.9999232699118173e-05,
|
|
"loss": 0.4945,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.28116213683223995,
|
|
"grad_norm": 1.988083839416504,
|
|
"learning_rate": 1.9998273600613825e-05,
|
|
"loss": 0.5123,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.29053420805998126,
|
|
"grad_norm": 2.103421688079834,
|
|
"learning_rate": 1.999693091422282e-05,
|
|
"loss": 0.4682,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.29990627928772257,
|
|
"grad_norm": 2.0768039226531982,
|
|
"learning_rate": 1.9995204691457883e-05,
|
|
"loss": 0.4885,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.30927835051546393,
|
|
"grad_norm": 1.8248564004898071,
|
|
"learning_rate": 1.9993094998546257e-05,
|
|
"loss": 0.4735,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.31865042174320524,
|
|
"grad_norm": 1.6503818035125732,
|
|
"learning_rate": 1.9990601916427183e-05,
|
|
"loss": 0.4733,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.3280224929709466,
|
|
"grad_norm": 1.6220550537109375,
|
|
"learning_rate": 1.998772554074878e-05,
|
|
"loss": 0.4898,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.3373945641986879,
|
|
"grad_norm": 1.5691450834274292,
|
|
"learning_rate": 1.9984465981864393e-05,
|
|
"loss": 0.4697,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.3467666354264292,
|
|
"grad_norm": 1.856009602546692,
|
|
"learning_rate": 1.998082336482833e-05,
|
|
"loss": 0.46,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.3561387066541706,
|
|
"grad_norm": 1.7977851629257202,
|
|
"learning_rate": 1.9976797829391104e-05,
|
|
"loss": 0.5193,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.3655107778819119,
|
|
"grad_norm": 1.5994986295700073,
|
|
"learning_rate": 1.9972389529994043e-05,
|
|
"loss": 0.4666,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.37488284910965325,
|
|
"grad_norm": 1.8488245010375977,
|
|
"learning_rate": 1.996759863576336e-05,
|
|
"loss": 0.511,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.37488284910965325,
|
|
"eval_loss": 0.6320933699607849,
|
|
"eval_runtime": 111.4483,
|
|
"eval_samples_per_second": 4.486,
|
|
"eval_steps_per_second": 2.243,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.38425492033739456,
|
|
"grad_norm": 2.441446542739868,
|
|
"learning_rate": 1.9962425330503693e-05,
|
|
"loss": 0.4696,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.3936269915651359,
|
|
"grad_norm": 1.8430964946746826,
|
|
"learning_rate": 1.995686981269103e-05,
|
|
"loss": 0.4649,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.4029990627928772,
|
|
"grad_norm": 1.7581799030303955,
|
|
"learning_rate": 1.9950932295465102e-05,
|
|
"loss": 0.4885,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.41237113402061853,
|
|
"grad_norm": 1.6407780647277832,
|
|
"learning_rate": 1.9944613006621197e-05,
|
|
"loss": 0.4754,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.4217432052483599,
|
|
"grad_norm": 1.6698272228240967,
|
|
"learning_rate": 1.9937912188601444e-05,
|
|
"loss": 0.4823,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.4311152764761012,
|
|
"grad_norm": 1.5131304264068604,
|
|
"learning_rate": 1.9930830098485484e-05,
|
|
"loss": 0.4692,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.44048734770384257,
|
|
"grad_norm": 1.6762291193008423,
|
|
"learning_rate": 1.992336700798062e-05,
|
|
"loss": 0.4901,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.4498594189315839,
|
|
"grad_norm": 1.7265088558197021,
|
|
"learning_rate": 1.9915523203411397e-05,
|
|
"loss": 0.4627,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.4592314901593252,
|
|
"grad_norm": 1.5664938688278198,
|
|
"learning_rate": 1.990729898570861e-05,
|
|
"loss": 0.4715,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.46860356138706655,
|
|
"grad_norm": 1.6908628940582275,
|
|
"learning_rate": 1.989869467039776e-05,
|
|
"loss": 0.4984,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.47797563261480785,
|
|
"grad_norm": 1.3762125968933105,
|
|
"learning_rate": 1.9889710587586953e-05,
|
|
"loss": 0.4663,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.4873477038425492,
|
|
"grad_norm": 1.605967402458191,
|
|
"learning_rate": 1.9880347081954217e-05,
|
|
"loss": 0.4711,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.4967197750702905,
|
|
"grad_norm": 1.5448018312454224,
|
|
"learning_rate": 1.987060451273432e-05,
|
|
"loss": 0.4637,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.5060918462980318,
|
|
"grad_norm": 1.4862360954284668,
|
|
"learning_rate": 1.986048325370493e-05,
|
|
"loss": 0.4614,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.5154639175257731,
|
|
"grad_norm": 1.587640643119812,
|
|
"learning_rate": 1.9849983693172324e-05,
|
|
"loss": 0.4819,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.5248359887535146,
|
|
"grad_norm": 1.5963493585586548,
|
|
"learning_rate": 1.9839106233956474e-05,
|
|
"loss": 0.4912,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.5342080599812559,
|
|
"grad_norm": 1.4431391954421997,
|
|
"learning_rate": 1.982785129337558e-05,
|
|
"loss": 0.4727,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.5435801312089972,
|
|
"grad_norm": 1.4035024642944336,
|
|
"learning_rate": 1.9816219303230077e-05,
|
|
"loss": 0.4642,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.5529522024367385,
|
|
"grad_norm": 1.5492186546325684,
|
|
"learning_rate": 1.980421070978606e-05,
|
|
"loss": 0.4881,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.5623242736644799,
|
|
"grad_norm": 1.4136260747909546,
|
|
"learning_rate": 1.9791825973758167e-05,
|
|
"loss": 0.4657,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.5623242736644799,
|
|
"eval_loss": 0.6458946466445923,
|
|
"eval_runtime": 111.4442,
|
|
"eval_samples_per_second": 4.487,
|
|
"eval_steps_per_second": 2.243,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.5716963448922212,
|
|
"grad_norm": 1.6971220970153809,
|
|
"learning_rate": 1.9779065570291894e-05,
|
|
"loss": 0.4685,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.5810684161199625,
|
|
"grad_norm": 1.4709703922271729,
|
|
"learning_rate": 1.9765929988945382e-05,
|
|
"loss": 0.4948,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.5904404873477038,
|
|
"grad_norm": 1.7003625631332397,
|
|
"learning_rate": 1.975241973367062e-05,
|
|
"loss": 0.4963,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.5998125585754451,
|
|
"grad_norm": 1.4582606554031372,
|
|
"learning_rate": 1.9738535322794122e-05,
|
|
"loss": 0.4827,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.6091846298031866,
|
|
"grad_norm": 1.4838789701461792,
|
|
"learning_rate": 1.972427728899703e-05,
|
|
"loss": 0.4545,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.6185567010309279,
|
|
"grad_norm": 1.3947144746780396,
|
|
"learning_rate": 1.9709646179294687e-05,
|
|
"loss": 0.4712,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.6279287722586692,
|
|
"grad_norm": 1.554185390472412,
|
|
"learning_rate": 1.9694642555015643e-05,
|
|
"loss": 0.4702,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.6373008434864105,
|
|
"grad_norm": 1.6660090684890747,
|
|
"learning_rate": 1.9679266991780128e-05,
|
|
"loss": 0.5128,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.6466729147141518,
|
|
"grad_norm": 1.5578211545944214,
|
|
"learning_rate": 1.966352007947796e-05,
|
|
"loss": 0.4844,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.6560449859418932,
|
|
"grad_norm": 1.4711651802062988,
|
|
"learning_rate": 1.964740242224592e-05,
|
|
"loss": 0.4798,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.6654170571696345,
|
|
"grad_norm": 1.7752223014831543,
|
|
"learning_rate": 1.9630914638444572e-05,
|
|
"loss": 0.4922,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.6747891283973758,
|
|
"grad_norm": 1.8880064487457275,
|
|
"learning_rate": 1.961405736063453e-05,
|
|
"loss": 0.4928,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.6841611996251171,
|
|
"grad_norm": 1.5307488441467285,
|
|
"learning_rate": 1.9596831235552205e-05,
|
|
"loss": 0.4492,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.6935332708528584,
|
|
"grad_norm": 1.4398698806762695,
|
|
"learning_rate": 1.957923692408499e-05,
|
|
"loss": 0.45,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.7029053420805998,
|
|
"grad_norm": 1.5632487535476685,
|
|
"learning_rate": 1.9561275101245886e-05,
|
|
"loss": 0.4878,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.7122774133083412,
|
|
"grad_norm": 1.4883025884628296,
|
|
"learning_rate": 1.954294645614763e-05,
|
|
"loss": 0.4799,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.7216494845360825,
|
|
"grad_norm": 1.4776026010513306,
|
|
"learning_rate": 1.9524251691976243e-05,
|
|
"loss": 0.5043,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.7310215557638238,
|
|
"grad_norm": 1.5219204425811768,
|
|
"learning_rate": 1.950519152596406e-05,
|
|
"loss": 0.4737,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.7403936269915652,
|
|
"grad_norm": 1.6262823343276978,
|
|
"learning_rate": 1.9485766689362205e-05,
|
|
"loss": 0.4575,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.7497656982193065,
|
|
"grad_norm": 1.5888830423355103,
|
|
"learning_rate": 1.9465977927412535e-05,
|
|
"loss": 0.4577,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.7497656982193065,
|
|
"eval_loss": 0.6419793963432312,
|
|
"eval_runtime": 111.6005,
|
|
"eval_samples_per_second": 4.48,
|
|
"eval_steps_per_second": 2.24,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.7591377694470478,
|
|
"grad_norm": 1.4423269033432007,
|
|
"learning_rate": 1.9445825999319057e-05,
|
|
"loss": 0.4451,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.7685098406747891,
|
|
"grad_norm": 1.5620957612991333,
|
|
"learning_rate": 1.94253116782188e-05,
|
|
"loss": 0.4578,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.7778819119025304,
|
|
"grad_norm": 1.326604962348938,
|
|
"learning_rate": 1.9404435751152134e-05,
|
|
"loss": 0.4772,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.7872539831302718,
|
|
"grad_norm": 1.3322705030441284,
|
|
"learning_rate": 1.938319901903262e-05,
|
|
"loss": 0.4829,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.7966260543580131,
|
|
"grad_norm": 1.4845640659332275,
|
|
"learning_rate": 1.9361602296616223e-05,
|
|
"loss": 0.4598,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.8059981255857545,
|
|
"grad_norm": 1.4712871313095093,
|
|
"learning_rate": 1.9339646412470106e-05,
|
|
"loss": 0.4695,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.8153701968134958,
|
|
"grad_norm": 1.3889317512512207,
|
|
"learning_rate": 1.931733220894081e-05,
|
|
"loss": 0.447,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.8247422680412371,
|
|
"grad_norm": 1.4443881511688232,
|
|
"learning_rate": 1.9294660542121944e-05,
|
|
"loss": 0.4662,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.8341143392689785,
|
|
"grad_norm": 1.5155857801437378,
|
|
"learning_rate": 1.9271632281821354e-05,
|
|
"loss": 0.4873,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.8434864104967198,
|
|
"grad_norm": 1.5591299533843994,
|
|
"learning_rate": 1.9248248311527735e-05,
|
|
"loss": 0.4942,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.8528584817244611,
|
|
"grad_norm": 1.5808844566345215,
|
|
"learning_rate": 1.9224509528376737e-05,
|
|
"loss": 0.472,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.8622305529522024,
|
|
"grad_norm": 1.8616470098495483,
|
|
"learning_rate": 1.9200416843116562e-05,
|
|
"loss": 0.4577,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.8716026241799437,
|
|
"grad_norm": 1.918115496635437,
|
|
"learning_rate": 1.9175971180073012e-05,
|
|
"loss": 0.4774,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.8809746954076851,
|
|
"grad_norm": 1.411353349685669,
|
|
"learning_rate": 1.9151173477114015e-05,
|
|
"loss": 0.4682,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.8903467666354264,
|
|
"grad_norm": 1.625918984413147,
|
|
"learning_rate": 1.9126024685613664e-05,
|
|
"loss": 0.4923,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.8997188378631678,
|
|
"grad_norm": 1.388818621635437,
|
|
"learning_rate": 1.9100525770415713e-05,
|
|
"loss": 0.4766,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.9090909090909091,
|
|
"grad_norm": 1.4252229928970337,
|
|
"learning_rate": 1.907467770979655e-05,
|
|
"loss": 0.4622,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.9184629803186504,
|
|
"grad_norm": 1.6216133832931519,
|
|
"learning_rate": 1.9048481495427667e-05,
|
|
"loss": 0.4824,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.9278350515463918,
|
|
"grad_norm": 1.6171802282333374,
|
|
"learning_rate": 1.9021938132337628e-05,
|
|
"loss": 0.4979,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.9372071227741331,
|
|
"grad_norm": 1.5567573308944702,
|
|
"learning_rate": 1.8995048638873494e-05,
|
|
"loss": 0.4634,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.9372071227741331,
|
|
"eval_loss": 0.6470092535018921,
|
|
"eval_runtime": 111.4606,
|
|
"eval_samples_per_second": 4.486,
|
|
"eval_steps_per_second": 2.243,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.9465791940018744,
|
|
"grad_norm": 1.3252328634262085,
|
|
"learning_rate": 1.896781404666176e-05,
|
|
"loss": 0.4682,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.9559512652296157,
|
|
"grad_norm": 1.6408551931381226,
|
|
"learning_rate": 1.8940235400568784e-05,
|
|
"loss": 0.4762,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.9653233364573571,
|
|
"grad_norm": 1.6290283203125,
|
|
"learning_rate": 1.891231375866068e-05,
|
|
"loss": 0.4661,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.9746954076850984,
|
|
"grad_norm": 1.402817964553833,
|
|
"learning_rate": 1.888405019216275e-05,
|
|
"loss": 0.5037,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.9840674789128397,
|
|
"grad_norm": 1.366844892501831,
|
|
"learning_rate": 1.885544578541837e-05,
|
|
"loss": 0.4596,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.993439550140581,
|
|
"grad_norm": 1.4287879467010498,
|
|
"learning_rate": 1.8826501635847392e-05,
|
|
"loss": 0.4652,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.0037488284910965,
|
|
"grad_norm": 1.2563650608062744,
|
|
"learning_rate": 1.8797218853904037e-05,
|
|
"loss": 0.4833,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.013120899718838,
|
|
"grad_norm": 1.2508701086044312,
|
|
"learning_rate": 1.8767598563034304e-05,
|
|
"loss": 0.287,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.022492970946579,
|
|
"grad_norm": 1.3916987180709839,
|
|
"learning_rate": 1.8737641899632857e-05,
|
|
"loss": 0.2859,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.0318650421743205,
|
|
"grad_norm": 1.4374034404754639,
|
|
"learning_rate": 1.870735001299943e-05,
|
|
"loss": 0.2746,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.041237113402062,
|
|
"grad_norm": 1.3931026458740234,
|
|
"learning_rate": 1.8676724065294744e-05,
|
|
"loss": 0.255,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.0506091846298031,
|
|
"grad_norm": 1.4676737785339355,
|
|
"learning_rate": 1.864576523149589e-05,
|
|
"loss": 0.2609,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.0599812558575445,
|
|
"grad_norm": 1.3945457935333252,
|
|
"learning_rate": 1.8614474699351294e-05,
|
|
"loss": 0.2595,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.069353327085286,
|
|
"grad_norm": 1.413190245628357,
|
|
"learning_rate": 1.8582853669335107e-05,
|
|
"loss": 0.2704,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.0787253983130272,
|
|
"grad_norm": 1.2427492141723633,
|
|
"learning_rate": 1.8550903354601182e-05,
|
|
"loss": 0.2444,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.0880974695407686,
|
|
"grad_norm": 1.3134554624557495,
|
|
"learning_rate": 1.851862498093651e-05,
|
|
"loss": 0.2606,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.0974695407685098,
|
|
"grad_norm": 1.3855392932891846,
|
|
"learning_rate": 1.8486019786714194e-05,
|
|
"loss": 0.263,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.1068416119962512,
|
|
"grad_norm": 1.4354616403579712,
|
|
"learning_rate": 1.8453089022845943e-05,
|
|
"loss": 0.2488,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.1162136832239926,
|
|
"grad_norm": 1.1863958835601807,
|
|
"learning_rate": 1.8419833952734094e-05,
|
|
"loss": 0.2506,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.1255857544517338,
|
|
"grad_norm": 1.5044498443603516,
|
|
"learning_rate": 1.83862558522231e-05,
|
|
"loss": 0.2661,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.1255857544517338,
|
|
"eval_loss": 0.6920709013938904,
|
|
"eval_runtime": 111.4907,
|
|
"eval_samples_per_second": 4.485,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.1349578256794752,
|
|
"grad_norm": 1.4557913541793823,
|
|
"learning_rate": 1.835235600955064e-05,
|
|
"loss": 0.265,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.1443298969072164,
|
|
"grad_norm": 1.3041000366210938,
|
|
"learning_rate": 1.8318135725298133e-05,
|
|
"loss": 0.261,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 1.1537019681349578,
|
|
"grad_norm": 1.3250569105148315,
|
|
"learning_rate": 1.8283596312340893e-05,
|
|
"loss": 0.2638,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 1.1630740393626993,
|
|
"grad_norm": 1.3970952033996582,
|
|
"learning_rate": 1.8248739095797726e-05,
|
|
"loss": 0.2642,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 1.1724461105904405,
|
|
"grad_norm": 1.4045438766479492,
|
|
"learning_rate": 1.8213565412980114e-05,
|
|
"loss": 0.2909,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.1818181818181819,
|
|
"grad_norm": 1.3580117225646973,
|
|
"learning_rate": 1.8178076613340886e-05,
|
|
"loss": 0.2541,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 1.191190253045923,
|
|
"grad_norm": 1.3984880447387695,
|
|
"learning_rate": 1.8142274058422467e-05,
|
|
"loss": 0.253,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 1.2005623242736645,
|
|
"grad_norm": 1.275099754333496,
|
|
"learning_rate": 1.8106159121804633e-05,
|
|
"loss": 0.2679,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 1.209934395501406,
|
|
"grad_norm": 1.4693080186843872,
|
|
"learning_rate": 1.8069733189051802e-05,
|
|
"loss": 0.2586,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 1.219306466729147,
|
|
"grad_norm": 1.3677211999893188,
|
|
"learning_rate": 1.80329976576599e-05,
|
|
"loss": 0.2877,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 1.2286785379568885,
|
|
"grad_norm": 1.376230001449585,
|
|
"learning_rate": 1.7995953937002723e-05,
|
|
"loss": 0.2499,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 1.2380506091846297,
|
|
"grad_norm": 1.380204677581787,
|
|
"learning_rate": 1.7958603448277882e-05,
|
|
"loss": 0.2426,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 1.2474226804123711,
|
|
"grad_norm": 1.4259058237075806,
|
|
"learning_rate": 1.7920947624452264e-05,
|
|
"loss": 0.2806,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 1.2567947516401126,
|
|
"grad_norm": 1.4305455684661865,
|
|
"learning_rate": 1.7882987910207066e-05,
|
|
"loss": 0.2657,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 1.2661668228678538,
|
|
"grad_norm": 1.4844595193862915,
|
|
"learning_rate": 1.784472576188237e-05,
|
|
"loss": 0.2704,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 1.2755388940955952,
|
|
"grad_norm": 1.28706693649292,
|
|
"learning_rate": 1.780616264742126e-05,
|
|
"loss": 0.2534,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 1.2849109653233364,
|
|
"grad_norm": 1.3618587255477905,
|
|
"learning_rate": 1.776730004631352e-05,
|
|
"loss": 0.2715,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 1.2942830365510778,
|
|
"grad_norm": 1.399498701095581,
|
|
"learning_rate": 1.7728139449538848e-05,
|
|
"loss": 0.2748,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 1.3036551077788192,
|
|
"grad_norm": 1.3688334226608276,
|
|
"learning_rate": 1.768868235950968e-05,
|
|
"loss": 0.2625,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 1.3130271790065604,
|
|
"grad_norm": 1.327973484992981,
|
|
"learning_rate": 1.7648930290013532e-05,
|
|
"loss": 0.2427,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.3130271790065604,
|
|
"eval_loss": 0.6904003620147705,
|
|
"eval_runtime": 111.5048,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 1.3223992502343018,
|
|
"grad_norm": 1.5537965297698975,
|
|
"learning_rate": 1.760888476615493e-05,
|
|
"loss": 0.2487,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 1.331771321462043,
|
|
"grad_norm": 1.382699728012085,
|
|
"learning_rate": 1.75685473242969e-05,
|
|
"loss": 0.2417,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 1.3411433926897844,
|
|
"grad_norm": 1.4410724639892578,
|
|
"learning_rate": 1.7527919512002025e-05,
|
|
"loss": 0.2467,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 1.3505154639175259,
|
|
"grad_norm": 1.448276400566101,
|
|
"learning_rate": 1.7487002887973057e-05,
|
|
"loss": 0.2525,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 1.359887535145267,
|
|
"grad_norm": 1.4892441034317017,
|
|
"learning_rate": 1.7445799021993138e-05,
|
|
"loss": 0.2336,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 1.3692596063730085,
|
|
"grad_norm": 1.2686562538146973,
|
|
"learning_rate": 1.7404309494865572e-05,
|
|
"loss": 0.2624,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 1.3786316776007497,
|
|
"grad_norm": 1.36681067943573,
|
|
"learning_rate": 1.736253589835316e-05,
|
|
"loss": 0.279,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 1.388003748828491,
|
|
"grad_norm": 1.4178364276885986,
|
|
"learning_rate": 1.7320479835117142e-05,
|
|
"loss": 0.2634,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 1.3973758200562325,
|
|
"grad_norm": 1.7909929752349854,
|
|
"learning_rate": 1.7278142918655717e-05,
|
|
"loss": 0.2568,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 1.4067478912839737,
|
|
"grad_norm": 1.4352169036865234,
|
|
"learning_rate": 1.7235526773242136e-05,
|
|
"loss": 0.2487,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 1.4161199625117151,
|
|
"grad_norm": 1.3589709997177124,
|
|
"learning_rate": 1.719263303386237e-05,
|
|
"loss": 0.2612,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 1.4254920337394563,
|
|
"grad_norm": 1.3523000478744507,
|
|
"learning_rate": 1.7149463346152412e-05,
|
|
"loss": 0.2644,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 1.4348641049671977,
|
|
"grad_norm": 1.396602988243103,
|
|
"learning_rate": 1.7106019366335113e-05,
|
|
"loss": 0.2704,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 1.4442361761949392,
|
|
"grad_norm": 1.379135012626648,
|
|
"learning_rate": 1.7062302761156667e-05,
|
|
"loss": 0.2593,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 1.4536082474226804,
|
|
"grad_norm": 1.301147699356079,
|
|
"learning_rate": 1.701831520782264e-05,
|
|
"loss": 0.2592,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 1.4629803186504218,
|
|
"grad_norm": 1.4539448022842407,
|
|
"learning_rate": 1.6974058393933647e-05,
|
|
"loss": 0.2909,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 1.472352389878163,
|
|
"grad_norm": 1.5490386486053467,
|
|
"learning_rate": 1.692953401742059e-05,
|
|
"loss": 0.2771,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 1.4817244611059044,
|
|
"grad_norm": 1.4883418083190918,
|
|
"learning_rate": 1.6884743786479513e-05,
|
|
"loss": 0.2529,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 1.4910965323336458,
|
|
"grad_norm": 1.5105490684509277,
|
|
"learning_rate": 1.6839689419506092e-05,
|
|
"loss": 0.265,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 1.5004686035613872,
|
|
"grad_norm": 1.461634635925293,
|
|
"learning_rate": 1.6794372645029674e-05,
|
|
"loss": 0.2608,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.5004686035613872,
|
|
"eval_loss": 0.6895884871482849,
|
|
"eval_runtime": 111.5059,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 1.5098406747891284,
|
|
"grad_norm": 1.523145079612732,
|
|
"learning_rate": 1.6748795201646992e-05,
|
|
"loss": 0.2762,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 1.5192127460168696,
|
|
"grad_norm": 1.366004228591919,
|
|
"learning_rate": 1.670295883795544e-05,
|
|
"loss": 0.28,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 1.528584817244611,
|
|
"grad_norm": 1.6428511142730713,
|
|
"learning_rate": 1.6656865312485996e-05,
|
|
"loss": 0.2489,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 1.5379568884723525,
|
|
"grad_norm": 1.31986665725708,
|
|
"learning_rate": 1.6610516393635757e-05,
|
|
"loss": 0.2498,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 1.5473289597000939,
|
|
"grad_norm": 1.5260220766067505,
|
|
"learning_rate": 1.6563913859600102e-05,
|
|
"loss": 0.338,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 1.556701030927835,
|
|
"grad_norm": 1.3370164632797241,
|
|
"learning_rate": 1.6517059498304444e-05,
|
|
"loss": 0.2468,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 1.5660731021555763,
|
|
"grad_norm": 1.4251459836959839,
|
|
"learning_rate": 1.6469955107335666e-05,
|
|
"loss": 0.2764,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 1.5754451733833177,
|
|
"grad_norm": 1.2612155675888062,
|
|
"learning_rate": 1.6422602493873137e-05,
|
|
"loss": 0.2613,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 1.584817244611059,
|
|
"grad_norm": 1.3020036220550537,
|
|
"learning_rate": 1.637500347461938e-05,
|
|
"loss": 0.2618,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 1.5941893158388005,
|
|
"grad_norm": 1.3664627075195312,
|
|
"learning_rate": 1.6327159875730393e-05,
|
|
"loss": 0.2476,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 1.6035613870665417,
|
|
"grad_norm": 1.4827312231063843,
|
|
"learning_rate": 1.627907353274555e-05,
|
|
"loss": 0.2674,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 1.612933458294283,
|
|
"grad_norm": 1.2991149425506592,
|
|
"learning_rate": 1.6230746290517227e-05,
|
|
"loss": 0.2716,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 1.6223055295220243,
|
|
"grad_norm": 1.5782040357589722,
|
|
"learning_rate": 1.618218000313998e-05,
|
|
"loss": 0.2875,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 1.6316776007497658,
|
|
"grad_norm": 1.4465105533599854,
|
|
"learning_rate": 1.613337653387943e-05,
|
|
"loss": 0.2723,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 1.6410496719775072,
|
|
"grad_norm": 1.3791197538375854,
|
|
"learning_rate": 1.6084337755100795e-05,
|
|
"loss": 0.2572,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 1.6504217432052484,
|
|
"grad_norm": 1.3755207061767578,
|
|
"learning_rate": 1.603506554819703e-05,
|
|
"loss": 0.2562,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 1.6597938144329896,
|
|
"grad_norm": 1.4186309576034546,
|
|
"learning_rate": 1.598556180351665e-05,
|
|
"loss": 0.2679,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 1.669165885660731,
|
|
"grad_norm": 1.3663445711135864,
|
|
"learning_rate": 1.5935828420291227e-05,
|
|
"loss": 0.2505,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 1.6785379568884724,
|
|
"grad_norm": 1.4272841215133667,
|
|
"learning_rate": 1.588586730656249e-05,
|
|
"loss": 0.2861,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 1.6879100281162138,
|
|
"grad_norm": 1.3556526899337769,
|
|
"learning_rate": 1.5835680379109166e-05,
|
|
"loss": 0.2811,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.6879100281162138,
|
|
"eval_loss": 0.6763415336608887,
|
|
"eval_runtime": 111.4991,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.697282099343955,
|
|
"grad_norm": 1.527638554573059,
|
|
"learning_rate": 1.5785269563373402e-05,
|
|
"loss": 0.2655,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 1.7066541705716962,
|
|
"grad_norm": 1.347113847732544,
|
|
"learning_rate": 1.573463679338692e-05,
|
|
"loss": 0.2783,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 1.7160262417994376,
|
|
"grad_norm": 1.346537470817566,
|
|
"learning_rate": 1.56837840116968e-05,
|
|
"loss": 0.2712,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 1.725398313027179,
|
|
"grad_norm": 1.3698228597640991,
|
|
"learning_rate": 1.5632713169290962e-05,
|
|
"loss": 0.2582,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.7347703842549205,
|
|
"grad_norm": 1.4085627794265747,
|
|
"learning_rate": 1.5581426225523333e-05,
|
|
"loss": 0.262,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 1.7441424554826617,
|
|
"grad_norm": 1.4400358200073242,
|
|
"learning_rate": 1.5529925148038635e-05,
|
|
"loss": 0.2636,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 1.7535145267104029,
|
|
"grad_norm": 1.2298705577850342,
|
|
"learning_rate": 1.547821191269693e-05,
|
|
"loss": 0.2542,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 1.7628865979381443,
|
|
"grad_norm": 1.4320347309112549,
|
|
"learning_rate": 1.5426288503497802e-05,
|
|
"loss": 0.2607,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.7722586691658857,
|
|
"grad_norm": 1.4086341857910156,
|
|
"learning_rate": 1.5374156912504236e-05,
|
|
"loss": 0.2464,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 1.7816307403936271,
|
|
"grad_norm": 1.3747973442077637,
|
|
"learning_rate": 1.532181913976621e-05,
|
|
"loss": 0.2781,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 1.7910028116213683,
|
|
"grad_norm": 1.4264485836029053,
|
|
"learning_rate": 1.5269277193243936e-05,
|
|
"loss": 0.2872,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 1.8003748828491095,
|
|
"grad_norm": 1.3113363981246948,
|
|
"learning_rate": 1.5216533088730844e-05,
|
|
"loss": 0.2693,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.809746954076851,
|
|
"grad_norm": 1.3197410106658936,
|
|
"learning_rate": 1.516358884977624e-05,
|
|
"loss": 0.2495,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 1.8191190253045924,
|
|
"grad_norm": 1.4005447626113892,
|
|
"learning_rate": 1.5110446507607666e-05,
|
|
"loss": 0.2792,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 1.8284910965323338,
|
|
"grad_norm": 1.3619177341461182,
|
|
"learning_rate": 1.5057108101052978e-05,
|
|
"loss": 0.2496,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 1.837863167760075,
|
|
"grad_norm": 1.3972722291946411,
|
|
"learning_rate": 1.5003575676462126e-05,
|
|
"loss": 0.2586,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.8472352389878162,
|
|
"grad_norm": 1.3040308952331543,
|
|
"learning_rate": 1.4949851287628631e-05,
|
|
"loss": 0.2593,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.8566073102155576,
|
|
"grad_norm": 1.4333730936050415,
|
|
"learning_rate": 1.4895936995710815e-05,
|
|
"loss": 0.2643,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.865979381443299,
|
|
"grad_norm": 1.304624319076538,
|
|
"learning_rate": 1.4841834869152703e-05,
|
|
"loss": 0.2478,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.8753514526710404,
|
|
"grad_norm": 1.3824489116668701,
|
|
"learning_rate": 1.478754698360467e-05,
|
|
"loss": 0.2506,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.8753514526710404,
|
|
"eval_loss": 0.6781994104385376,
|
|
"eval_runtime": 111.5183,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.8847235238987816,
|
|
"grad_norm": 1.5689202547073364,
|
|
"learning_rate": 1.473307542184382e-05,
|
|
"loss": 0.2811,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 1.8940955951265228,
|
|
"grad_norm": 1.357867956161499,
|
|
"learning_rate": 1.4678422273694062e-05,
|
|
"loss": 0.2637,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 1.9034676663542642,
|
|
"grad_norm": 1.241373896598816,
|
|
"learning_rate": 1.462358963594595e-05,
|
|
"loss": 0.2636,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 1.9128397375820057,
|
|
"grad_norm": 1.3964288234710693,
|
|
"learning_rate": 1.4568579612276222e-05,
|
|
"loss": 0.2741,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.922211808809747,
|
|
"grad_norm": 1.3163318634033203,
|
|
"learning_rate": 1.4513394313167104e-05,
|
|
"loss": 0.2621,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 1.9315838800374883,
|
|
"grad_norm": 1.3993713855743408,
|
|
"learning_rate": 1.4458035855825341e-05,
|
|
"loss": 0.2657,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 1.9409559512652295,
|
|
"grad_norm": 1.3384408950805664,
|
|
"learning_rate": 1.4402506364100957e-05,
|
|
"loss": 0.2598,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 1.9503280224929709,
|
|
"grad_norm": 1.4588673114776611,
|
|
"learning_rate": 1.4346807968405783e-05,
|
|
"loss": 0.2536,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.9597000937207123,
|
|
"grad_norm": 1.326058268547058,
|
|
"learning_rate": 1.4290942805631722e-05,
|
|
"loss": 0.2563,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 1.9690721649484537,
|
|
"grad_norm": 1.353257179260254,
|
|
"learning_rate": 1.4234913019068769e-05,
|
|
"loss": 0.2564,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 1.978444236176195,
|
|
"grad_norm": 1.4586265087127686,
|
|
"learning_rate": 1.4178720758322761e-05,
|
|
"loss": 0.2769,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 1.9878163074039361,
|
|
"grad_norm": 1.2936612367630005,
|
|
"learning_rate": 1.412236817923295e-05,
|
|
"loss": 0.2737,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.9971883786316775,
|
|
"grad_norm": 1.4073734283447266,
|
|
"learning_rate": 1.4065857443789246e-05,
|
|
"loss": 0.2717,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 2.005623242736645,
|
|
"grad_norm": 1.2421205043792725,
|
|
"learning_rate": 1.4009190720049309e-05,
|
|
"loss": 0.1902,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 2.014995313964386,
|
|
"grad_norm": 1.3869972229003906,
|
|
"learning_rate": 1.3952370182055332e-05,
|
|
"loss": 0.1134,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 2.0243673851921273,
|
|
"grad_norm": 1.3595290184020996,
|
|
"learning_rate": 1.389539800975068e-05,
|
|
"loss": 0.097,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 2.0337394564198688,
|
|
"grad_norm": 1.2397971153259277,
|
|
"learning_rate": 1.3838276388896216e-05,
|
|
"loss": 0.1022,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 2.04311152764761,
|
|
"grad_norm": 1.1282893419265747,
|
|
"learning_rate": 1.3781007510986464e-05,
|
|
"loss": 0.1003,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 2.0524835988753516,
|
|
"grad_norm": 1.2011518478393555,
|
|
"learning_rate": 1.3723593573165523e-05,
|
|
"loss": 0.0993,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 2.0618556701030926,
|
|
"grad_norm": 1.1846802234649658,
|
|
"learning_rate": 1.3666036778142773e-05,
|
|
"loss": 0.1031,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.0618556701030926,
|
|
"eval_loss": 0.7819597125053406,
|
|
"eval_runtime": 111.4871,
|
|
"eval_samples_per_second": 4.485,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 2.071227741330834,
|
|
"grad_norm": 1.1528737545013428,
|
|
"learning_rate": 1.3608339334108378e-05,
|
|
"loss": 0.0938,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 2.0805998125585754,
|
|
"grad_norm": 1.2607845067977905,
|
|
"learning_rate": 1.355050345464855e-05,
|
|
"loss": 0.1048,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 2.089971883786317,
|
|
"grad_norm": 1.0643517971038818,
|
|
"learning_rate": 1.3492531358660634e-05,
|
|
"loss": 0.1056,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 2.0993439550140582,
|
|
"grad_norm": 1.2049908638000488,
|
|
"learning_rate": 1.3434425270267983e-05,
|
|
"loss": 0.1078,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 2.108716026241799,
|
|
"grad_norm": 1.1504206657409668,
|
|
"learning_rate": 1.3376187418734626e-05,
|
|
"loss": 0.0987,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 2.1180880974695406,
|
|
"grad_norm": 1.103416085243225,
|
|
"learning_rate": 1.3317820038379731e-05,
|
|
"loss": 0.1011,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 2.127460168697282,
|
|
"grad_norm": 1.2639893293380737,
|
|
"learning_rate": 1.3259325368491897e-05,
|
|
"loss": 0.1065,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 2.1368322399250235,
|
|
"grad_norm": 1.2981096506118774,
|
|
"learning_rate": 1.320070565324324e-05,
|
|
"loss": 0.1089,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 2.146204311152765,
|
|
"grad_norm": 1.3471019268035889,
|
|
"learning_rate": 1.314196314160329e-05,
|
|
"loss": 0.1034,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 2.155576382380506,
|
|
"grad_norm": 1.2037670612335205,
|
|
"learning_rate": 1.308310008725271e-05,
|
|
"loss": 0.0954,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 2.1649484536082473,
|
|
"grad_norm": 1.124943733215332,
|
|
"learning_rate": 1.3024118748496834e-05,
|
|
"loss": 0.1086,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 2.1743205248359887,
|
|
"grad_norm": 1.2061023712158203,
|
|
"learning_rate": 1.2965021388179036e-05,
|
|
"loss": 0.1032,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 2.18369259606373,
|
|
"grad_norm": 1.2710933685302734,
|
|
"learning_rate": 1.2905810273593887e-05,
|
|
"loss": 0.1024,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 2.1930646672914715,
|
|
"grad_norm": 1.1786785125732422,
|
|
"learning_rate": 1.28464876764002e-05,
|
|
"loss": 0.103,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 2.2024367385192125,
|
|
"grad_norm": 1.5116946697235107,
|
|
"learning_rate": 1.2787055872533867e-05,
|
|
"loss": 0.1107,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 2.211808809746954,
|
|
"grad_norm": 1.2890318632125854,
|
|
"learning_rate": 1.2727517142120527e-05,
|
|
"loss": 0.1019,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 2.2211808809746953,
|
|
"grad_norm": 1.184844970703125,
|
|
"learning_rate": 1.266787376938811e-05,
|
|
"loss": 0.1067,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 2.2305529522024368,
|
|
"grad_norm": 1.3428583145141602,
|
|
"learning_rate": 1.2608128042579185e-05,
|
|
"loss": 0.1066,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 2.239925023430178,
|
|
"grad_norm": 1.2953709363937378,
|
|
"learning_rate": 1.2548282253863181e-05,
|
|
"loss": 0.1138,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 2.2492970946579196,
|
|
"grad_norm": 1.1381481885910034,
|
|
"learning_rate": 1.2488338699248443e-05,
|
|
"loss": 0.1053,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 2.2492970946579196,
|
|
"eval_loss": 0.7939261198043823,
|
|
"eval_runtime": 111.5111,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 2.2586691658856606,
|
|
"grad_norm": 1.5689799785614014,
|
|
"learning_rate": 1.2428299678494146e-05,
|
|
"loss": 0.098,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 2.268041237113402,
|
|
"grad_norm": 1.3094913959503174,
|
|
"learning_rate": 1.236816749502206e-05,
|
|
"loss": 0.1111,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 2.2774133083411434,
|
|
"grad_norm": 1.2114543914794922,
|
|
"learning_rate": 1.2307944455828178e-05,
|
|
"loss": 0.1051,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 2.286785379568885,
|
|
"grad_norm": 1.1505310535430908,
|
|
"learning_rate": 1.2247632871394223e-05,
|
|
"loss": 0.0927,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 2.296157450796626,
|
|
"grad_norm": 1.2007763385772705,
|
|
"learning_rate": 1.218723505559898e-05,
|
|
"loss": 0.1081,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 2.3055295220243672,
|
|
"grad_norm": 1.1881816387176514,
|
|
"learning_rate": 1.2126753325629543e-05,
|
|
"loss": 0.0984,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 2.3149015932521086,
|
|
"grad_norm": 1.2576075792312622,
|
|
"learning_rate": 1.2066190001892398e-05,
|
|
"loss": 0.112,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 2.32427366447985,
|
|
"grad_norm": 1.2001255750656128,
|
|
"learning_rate": 1.200554740792442e-05,
|
|
"loss": 0.107,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 2.3336457357075915,
|
|
"grad_norm": 1.2408965826034546,
|
|
"learning_rate": 1.1944827870303719e-05,
|
|
"loss": 0.1166,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 2.3430178069353325,
|
|
"grad_norm": 1.1618740558624268,
|
|
"learning_rate": 1.1884033718560372e-05,
|
|
"loss": 0.0978,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 2.352389878163074,
|
|
"grad_norm": 1.177768349647522,
|
|
"learning_rate": 1.1823167285087064e-05,
|
|
"loss": 0.1027,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 2.3617619493908153,
|
|
"grad_norm": 1.1294364929199219,
|
|
"learning_rate": 1.1762230905049593e-05,
|
|
"loss": 0.1087,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 2.3711340206185567,
|
|
"grad_norm": 1.4736202955245972,
|
|
"learning_rate": 1.1701226916297295e-05,
|
|
"loss": 0.1142,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 2.380506091846298,
|
|
"grad_norm": 1.2007415294647217,
|
|
"learning_rate": 1.164015765927333e-05,
|
|
"loss": 0.1076,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 2.3898781630740396,
|
|
"grad_norm": 1.274434208869934,
|
|
"learning_rate": 1.1579025476924912e-05,
|
|
"loss": 0.1116,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 2.3992502343017805,
|
|
"grad_norm": 1.3655272722244263,
|
|
"learning_rate": 1.1517832714613406e-05,
|
|
"loss": 0.1079,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 2.408622305529522,
|
|
"grad_norm": 1.2331844568252563,
|
|
"learning_rate": 1.1456581720024356e-05,
|
|
"loss": 0.1056,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 2.4179943767572634,
|
|
"grad_norm": 1.1586816310882568,
|
|
"learning_rate": 1.1395274843077405e-05,
|
|
"loss": 0.1067,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 2.427366447985005,
|
|
"grad_norm": 1.271945834159851,
|
|
"learning_rate": 1.1333914435836153e-05,
|
|
"loss": 0.1051,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 2.436738519212746,
|
|
"grad_norm": 1.1621251106262207,
|
|
"learning_rate": 1.1272502852417908e-05,
|
|
"loss": 0.1009,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 2.436738519212746,
|
|
"eval_loss": 0.777266263961792,
|
|
"eval_runtime": 111.4978,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 2.446110590440487,
|
|
"grad_norm": 1.1645290851593018,
|
|
"learning_rate": 1.1211042448903374e-05,
|
|
"loss": 0.1169,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 2.4554826616682286,
|
|
"grad_norm": 1.163246989250183,
|
|
"learning_rate": 1.1149535583246253e-05,
|
|
"loss": 0.0952,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 2.46485473289597,
|
|
"grad_norm": 1.3993792533874512,
|
|
"learning_rate": 1.1087984615182797e-05,
|
|
"loss": 0.1178,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 2.4742268041237114,
|
|
"grad_norm": 1.1687663793563843,
|
|
"learning_rate": 1.1026391906141255e-05,
|
|
"loss": 0.0978,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 2.483598875351453,
|
|
"grad_norm": 1.1476637125015259,
|
|
"learning_rate": 1.0964759819151289e-05,
|
|
"loss": 0.0946,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 2.492970946579194,
|
|
"grad_norm": 1.0236659049987793,
|
|
"learning_rate": 1.0903090718753317e-05,
|
|
"loss": 0.1057,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 2.5023430178069352,
|
|
"grad_norm": 1.4007511138916016,
|
|
"learning_rate": 1.0841386970907786e-05,
|
|
"loss": 0.1124,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 2.5117150890346767,
|
|
"grad_norm": 1.2030051946640015,
|
|
"learning_rate": 1.077965094290441e-05,
|
|
"loss": 0.102,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 2.521087160262418,
|
|
"grad_norm": 1.0863361358642578,
|
|
"learning_rate": 1.0717885003271338e-05,
|
|
"loss": 0.1501,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 2.530459231490159,
|
|
"grad_norm": 1.441186547279358,
|
|
"learning_rate": 1.0656091521684297e-05,
|
|
"loss": 0.1111,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 2.539831302717901,
|
|
"grad_norm": 1.1081117391586304,
|
|
"learning_rate": 1.0594272868875677e-05,
|
|
"loss": 0.0995,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 2.549203373945642,
|
|
"grad_norm": 1.3063805103302002,
|
|
"learning_rate": 1.0532431416543559e-05,
|
|
"loss": 0.1026,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 2.5585754451733833,
|
|
"grad_norm": 1.265457034111023,
|
|
"learning_rate": 1.0470569537260746e-05,
|
|
"loss": 0.1137,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 2.5679475164011247,
|
|
"grad_norm": 1.1931920051574707,
|
|
"learning_rate": 1.040868960438373e-05,
|
|
"loss": 0.1056,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 2.5773195876288657,
|
|
"grad_norm": 1.2705389261245728,
|
|
"learning_rate": 1.0346793991961636e-05,
|
|
"loss": 0.0992,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 2.5866916588566076,
|
|
"grad_norm": 1.2234851121902466,
|
|
"learning_rate": 1.0284885074645139e-05,
|
|
"loss": 0.1067,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 2.5960637300843485,
|
|
"grad_norm": 1.30626380443573,
|
|
"learning_rate": 1.022296522759536e-05,
|
|
"loss": 0.1071,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 2.60543580131209,
|
|
"grad_norm": 1.1325551271438599,
|
|
"learning_rate": 1.016103682639275e-05,
|
|
"loss": 0.0946,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 2.6148078725398314,
|
|
"grad_norm": 1.2140247821807861,
|
|
"learning_rate": 1.009910224694593e-05,
|
|
"loss": 0.1012,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 2.624179943767573,
|
|
"grad_norm": 1.2330358028411865,
|
|
"learning_rate": 1.0037163865400577e-05,
|
|
"loss": 0.1022,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 2.624179943767573,
|
|
"eval_loss": 0.7983193397521973,
|
|
"eval_runtime": 111.5048,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 2.633552014995314,
|
|
"grad_norm": 1.2977453470230103,
|
|
"learning_rate": 9.97522405804821e-06,
|
|
"loss": 0.1086,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 2.642924086223055,
|
|
"grad_norm": 1.2647531032562256,
|
|
"learning_rate": 9.913285201235065e-06,
|
|
"loss": 0.1051,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 2.6522961574507966,
|
|
"grad_norm": 1.3180173635482788,
|
|
"learning_rate": 9.85134967127091e-06,
|
|
"loss": 0.1142,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 2.661668228678538,
|
|
"grad_norm": 1.2392545938491821,
|
|
"learning_rate": 9.789419844337868e-06,
|
|
"loss": 0.1047,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 2.6710402999062794,
|
|
"grad_norm": 1.1911959648132324,
|
|
"learning_rate": 9.727498096399272e-06,
|
|
"loss": 0.0908,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 2.680412371134021,
|
|
"grad_norm": 1.3625760078430176,
|
|
"learning_rate": 9.665586803108495e-06,
|
|
"loss": 0.0967,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 2.689784442361762,
|
|
"grad_norm": 1.077038288116455,
|
|
"learning_rate": 9.603688339717818e-06,
|
|
"loss": 0.1055,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 2.6991565135895033,
|
|
"grad_norm": 1.2724173069000244,
|
|
"learning_rate": 9.541805080987298e-06,
|
|
"loss": 0.1024,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 2.7085285848172447,
|
|
"grad_norm": 1.246999979019165,
|
|
"learning_rate": 9.47993940109365e-06,
|
|
"loss": 0.1096,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 2.717900656044986,
|
|
"grad_norm": 1.1447161436080933,
|
|
"learning_rate": 9.418093673539181e-06,
|
|
"loss": 0.0964,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 2.7272727272727275,
|
|
"grad_norm": 1.3298566341400146,
|
|
"learning_rate": 9.356270271060711e-06,
|
|
"loss": 0.1036,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 2.7366447985004685,
|
|
"grad_norm": 1.3487498760223389,
|
|
"learning_rate": 9.294471565538552e-06,
|
|
"loss": 0.1054,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 2.74601686972821,
|
|
"grad_norm": 1.2166017293930054,
|
|
"learning_rate": 9.232699927905508e-06,
|
|
"loss": 0.1031,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 2.7553889409559513,
|
|
"grad_norm": 1.1950914859771729,
|
|
"learning_rate": 9.170957728055907e-06,
|
|
"loss": 0.0988,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 2.7647610121836927,
|
|
"grad_norm": 1.0390832424163818,
|
|
"learning_rate": 9.10924733475469e-06,
|
|
"loss": 0.1038,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 2.774133083411434,
|
|
"grad_norm": 1.190873146057129,
|
|
"learning_rate": 9.047571115546526e-06,
|
|
"loss": 0.1036,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 2.783505154639175,
|
|
"grad_norm": 1.1870976686477661,
|
|
"learning_rate": 8.985931436664981e-06,
|
|
"loss": 0.1032,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 2.7928772258669166,
|
|
"grad_norm": 1.2104380130767822,
|
|
"learning_rate": 8.924330662941731e-06,
|
|
"loss": 0.1006,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 2.802249297094658,
|
|
"grad_norm": 1.1908341646194458,
|
|
"learning_rate": 8.862771157715847e-06,
|
|
"loss": 0.0984,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 2.8116213683223994,
|
|
"grad_norm": 1.3652592897415161,
|
|
"learning_rate": 8.801255282743113e-06,
|
|
"loss": 0.1087,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.8116213683223994,
|
|
"eval_loss": 0.8067182898521423,
|
|
"eval_runtime": 111.5238,
|
|
"eval_samples_per_second": 4.483,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 2.820993439550141,
|
|
"grad_norm": 1.3108559846878052,
|
|
"learning_rate": 8.739785398105419e-06,
|
|
"loss": 0.1096,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 2.830365510777882,
|
|
"grad_norm": 1.1820882558822632,
|
|
"learning_rate": 8.678363862120224e-06,
|
|
"loss": 0.0961,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 2.839737582005623,
|
|
"grad_norm": 1.0882302522659302,
|
|
"learning_rate": 8.616993031250059e-06,
|
|
"loss": 0.097,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 2.8491096532333646,
|
|
"grad_norm": 1.3416924476623535,
|
|
"learning_rate": 8.555675260012137e-06,
|
|
"loss": 0.1011,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 2.858481724461106,
|
|
"grad_norm": 1.3005818128585815,
|
|
"learning_rate": 8.49441290088803e-06,
|
|
"loss": 0.1064,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 2.8678537956888475,
|
|
"grad_norm": 1.203696846961975,
|
|
"learning_rate": 8.433208304233383e-06,
|
|
"loss": 0.0907,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 2.8772258669165884,
|
|
"grad_norm": 1.1533688306808472,
|
|
"learning_rate": 8.372063818187768e-06,
|
|
"loss": 0.0951,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 2.88659793814433,
|
|
"grad_norm": 1.21674382686615,
|
|
"learning_rate": 8.31098178858459e-06,
|
|
"loss": 0.0924,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 2.8959700093720713,
|
|
"grad_norm": 1.3103758096694946,
|
|
"learning_rate": 8.249964558861084e-06,
|
|
"loss": 0.1038,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 2.9053420805998127,
|
|
"grad_norm": 1.1318589448928833,
|
|
"learning_rate": 8.189014469968407e-06,
|
|
"loss": 0.0991,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 2.914714151827554,
|
|
"grad_norm": 1.3271617889404297,
|
|
"learning_rate": 8.128133860281838e-06,
|
|
"loss": 0.1061,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 2.924086223055295,
|
|
"grad_norm": 1.2122989892959595,
|
|
"learning_rate": 8.067325065511056e-06,
|
|
"loss": 0.0995,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 2.9334582942830365,
|
|
"grad_norm": 1.286104440689087,
|
|
"learning_rate": 8.006590418610523e-06,
|
|
"loss": 0.1069,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 2.942830365510778,
|
|
"grad_norm": 1.3062405586242676,
|
|
"learning_rate": 7.945932249690002e-06,
|
|
"loss": 0.1025,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 2.9522024367385193,
|
|
"grad_norm": 1.2752856016159058,
|
|
"learning_rate": 7.885352885925139e-06,
|
|
"loss": 0.1097,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 2.9615745079662608,
|
|
"grad_norm": 1.1971313953399658,
|
|
"learning_rate": 7.824854651468187e-06,
|
|
"loss": 0.1002,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 2.9709465791940017,
|
|
"grad_norm": 1.3056398630142212,
|
|
"learning_rate": 7.764439867358836e-06,
|
|
"loss": 0.1088,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 2.980318650421743,
|
|
"grad_norm": 1.2253344058990479,
|
|
"learning_rate": 7.704110851435174e-06,
|
|
"loss": 0.1047,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 2.9896907216494846,
|
|
"grad_norm": 1.1375926733016968,
|
|
"learning_rate": 7.643869918244759e-06,
|
|
"loss": 0.0937,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 2.999062792877226,
|
|
"grad_norm": 1.2414946556091309,
|
|
"learning_rate": 7.583719378955816e-06,
|
|
"loss": 0.1046,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 2.999062792877226,
|
|
"eval_loss": 0.8037455081939697,
|
|
"eval_runtime": 111.5354,
|
|
"eval_samples_per_second": 4.483,
|
|
"eval_steps_per_second": 2.241,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 3.007497656982193,
|
|
"grad_norm": 0.8191234469413757,
|
|
"learning_rate": 7.523661541268571e-06,
|
|
"loss": 0.054,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 3.0168697282099344,
|
|
"grad_norm": 0.6123488545417786,
|
|
"learning_rate": 7.463698709326708e-06,
|
|
"loss": 0.0328,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 3.026241799437676,
|
|
"grad_norm": 1.0028489828109741,
|
|
"learning_rate": 7.403833183628995e-06,
|
|
"loss": 0.0345,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 3.035613870665417,
|
|
"grad_norm": 1.0307646989822388,
|
|
"learning_rate": 7.344067260940989e-06,
|
|
"loss": 0.0323,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 3.044985941893158,
|
|
"grad_norm": 0.9559040069580078,
|
|
"learning_rate": 7.284403234206939e-06,
|
|
"loss": 0.035,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 3.0543580131208996,
|
|
"grad_norm": 0.9424014687538147,
|
|
"learning_rate": 7.224843392461818e-06,
|
|
"loss": 0.033,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 3.063730084348641,
|
|
"grad_norm": 0.845702588558197,
|
|
"learning_rate": 7.165390020743498e-06,
|
|
"loss": 0.0324,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 3.0731021555763824,
|
|
"grad_norm": 0.8844259977340698,
|
|
"learning_rate": 7.106045400005083e-06,
|
|
"loss": 0.0284,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 3.082474226804124,
|
|
"grad_norm": 0.7264754772186279,
|
|
"learning_rate": 7.046811807027401e-06,
|
|
"loss": 0.0344,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 3.091846298031865,
|
|
"grad_norm": 0.8641548156738281,
|
|
"learning_rate": 6.987691514331656e-06,
|
|
"loss": 0.0366,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 3.1012183692596063,
|
|
"grad_norm": 0.8383805155754089,
|
|
"learning_rate": 6.928686790092235e-06,
|
|
"loss": 0.0323,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 3.1105904404873477,
|
|
"grad_norm": 1.0214649438858032,
|
|
"learning_rate": 6.869799898049704e-06,
|
|
"loss": 0.0333,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 3.119962511715089,
|
|
"grad_norm": 1.09578537940979,
|
|
"learning_rate": 6.811033097423938e-06,
|
|
"loss": 0.0357,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 3.1293345829428305,
|
|
"grad_norm": 0.9607039093971252,
|
|
"learning_rate": 6.752388642827459e-06,
|
|
"loss": 0.0356,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 3.138706654170572,
|
|
"grad_norm": 0.9811620712280273,
|
|
"learning_rate": 6.693868784178934e-06,
|
|
"loss": 0.0325,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 3.148078725398313,
|
|
"grad_norm": 1.125432014465332,
|
|
"learning_rate": 6.635475766616852e-06,
|
|
"loss": 0.0341,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 3.1574507966260543,
|
|
"grad_norm": 0.8190117478370667,
|
|
"learning_rate": 6.577211830413397e-06,
|
|
"loss": 0.0318,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 3.1668228678537957,
|
|
"grad_norm": 0.8427776098251343,
|
|
"learning_rate": 6.519079210888486e-06,
|
|
"loss": 0.0326,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 3.176194939081537,
|
|
"grad_norm": 0.9349907636642456,
|
|
"learning_rate": 6.461080138324025e-06,
|
|
"loss": 0.0303,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 3.1855670103092786,
|
|
"grad_norm": 0.7530879378318787,
|
|
"learning_rate": 6.40321683787833e-06,
|
|
"loss": 0.0311,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 3.1855670103092786,
|
|
"eval_loss": 0.9447797536849976,
|
|
"eval_runtime": 111.5066,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 3.1949390815370196,
|
|
"grad_norm": 1.094067096710205,
|
|
"learning_rate": 6.345491529500769e-06,
|
|
"loss": 0.0362,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 3.204311152764761,
|
|
"grad_norm": 0.9973980784416199,
|
|
"learning_rate": 6.287906427846583e-06,
|
|
"loss": 0.0311,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 3.2136832239925024,
|
|
"grad_norm": 0.954328179359436,
|
|
"learning_rate": 6.230463742191926e-06,
|
|
"loss": 0.0316,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 3.223055295220244,
|
|
"grad_norm": 0.8958219289779663,
|
|
"learning_rate": 6.173165676349103e-06,
|
|
"loss": 0.0319,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 3.2324273664479852,
|
|
"grad_norm": 0.8772101402282715,
|
|
"learning_rate": 6.116014428582022e-06,
|
|
"loss": 0.033,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 3.241799437675726,
|
|
"grad_norm": 0.8836532235145569,
|
|
"learning_rate": 6.059012191521853e-06,
|
|
"loss": 0.0345,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 3.2511715089034676,
|
|
"grad_norm": 1.0338672399520874,
|
|
"learning_rate": 6.002161152082909e-06,
|
|
"loss": 0.0322,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 3.260543580131209,
|
|
"grad_norm": 0.7626182436943054,
|
|
"learning_rate": 5.945463491378746e-06,
|
|
"loss": 0.034,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 3.2699156513589505,
|
|
"grad_norm": 1.0167630910873413,
|
|
"learning_rate": 5.888921384638477e-06,
|
|
"loss": 0.0323,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 3.279287722586692,
|
|
"grad_norm": 0.8768958449363708,
|
|
"learning_rate": 5.832537001123328e-06,
|
|
"loss": 0.0335,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 3.288659793814433,
|
|
"grad_norm": 0.8373109698295593,
|
|
"learning_rate": 5.7763125040434084e-06,
|
|
"loss": 0.0306,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 3.2980318650421743,
|
|
"grad_norm": 0.7997825741767883,
|
|
"learning_rate": 5.720250050474723e-06,
|
|
"loss": 0.0314,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 3.3074039362699157,
|
|
"grad_norm": 0.9116000533103943,
|
|
"learning_rate": 5.66435179127639e-06,
|
|
"loss": 0.0342,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 3.316776007497657,
|
|
"grad_norm": 0.7944602370262146,
|
|
"learning_rate": 5.608619871008166e-06,
|
|
"loss": 0.0314,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 3.3261480787253985,
|
|
"grad_norm": 0.9112783074378967,
|
|
"learning_rate": 5.553056427848136e-06,
|
|
"loss": 0.0305,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 3.3355201499531395,
|
|
"grad_norm": 0.9411343336105347,
|
|
"learning_rate": 5.497663593510693e-06,
|
|
"loss": 0.0362,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 3.344892221180881,
|
|
"grad_norm": 0.9458235502243042,
|
|
"learning_rate": 5.442443493164753e-06,
|
|
"loss": 0.0311,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 3.3542642924086223,
|
|
"grad_norm": 0.9986944794654846,
|
|
"learning_rate": 5.387398245352213e-06,
|
|
"loss": 0.0346,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 3.3636363636363638,
|
|
"grad_norm": 0.8632819652557373,
|
|
"learning_rate": 5.332529961906699e-06,
|
|
"loss": 0.0322,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 3.373008434864105,
|
|
"grad_norm": 0.8336763978004456,
|
|
"learning_rate": 5.277840747872509e-06,
|
|
"loss": 0.0343,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 3.373008434864105,
|
|
"eval_loss": 0.9443374872207642,
|
|
"eval_runtime": 111.5074,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 3.382380506091846,
|
|
"grad_norm": 0.7421078085899353,
|
|
"learning_rate": 5.223332701423875e-06,
|
|
"loss": 0.0299,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 3.3917525773195876,
|
|
"grad_norm": 0.7075040340423584,
|
|
"learning_rate": 5.169007913784462e-06,
|
|
"loss": 0.0333,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 3.401124648547329,
|
|
"grad_norm": 0.8889288306236267,
|
|
"learning_rate": 5.11486846914713e-06,
|
|
"loss": 0.033,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 3.4104967197750704,
|
|
"grad_norm": 1.1044409275054932,
|
|
"learning_rate": 5.060916444593985e-06,
|
|
"loss": 0.0353,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 3.419868791002812,
|
|
"grad_norm": 0.9357883334159851,
|
|
"learning_rate": 5.00715391001668e-06,
|
|
"loss": 0.0304,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 3.429240862230553,
|
|
"grad_norm": 0.9663400650024414,
|
|
"learning_rate": 4.953582928037005e-06,
|
|
"loss": 0.0332,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 3.438612933458294,
|
|
"grad_norm": 1.0516884326934814,
|
|
"learning_rate": 4.900205553927761e-06,
|
|
"loss": 0.035,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 3.4479850046860356,
|
|
"grad_norm": 1.041757345199585,
|
|
"learning_rate": 4.847023835533903e-06,
|
|
"loss": 0.0315,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 3.457357075913777,
|
|
"grad_norm": 0.8891613483428955,
|
|
"learning_rate": 4.794039813193967e-06,
|
|
"loss": 0.0326,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 3.4667291471415185,
|
|
"grad_norm": 0.9261044859886169,
|
|
"learning_rate": 4.741255519661806e-06,
|
|
"loss": 0.0304,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 3.4761012183692594,
|
|
"grad_norm": 1.3144643306732178,
|
|
"learning_rate": 4.68867298002859e-06,
|
|
"loss": 0.0354,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 3.485473289597001,
|
|
"grad_norm": 0.8868503570556641,
|
|
"learning_rate": 4.6362942116451226e-06,
|
|
"loss": 0.0304,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 3.4948453608247423,
|
|
"grad_norm": 0.9837562441825867,
|
|
"learning_rate": 4.5841212240444334e-06,
|
|
"loss": 0.032,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 3.5042174320524837,
|
|
"grad_norm": 0.8227118253707886,
|
|
"learning_rate": 4.532156018864692e-06,
|
|
"loss": 0.0307,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 3.513589503280225,
|
|
"grad_norm": 0.7651123404502869,
|
|
"learning_rate": 4.480400589772409e-06,
|
|
"loss": 0.0264,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 3.522961574507966,
|
|
"grad_norm": 0.9286547899246216,
|
|
"learning_rate": 4.428856922385942e-06,
|
|
"loss": 0.0285,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 3.5323336457357075,
|
|
"grad_norm": 0.9905438423156738,
|
|
"learning_rate": 4.37752699419934e-06,
|
|
"loss": 0.0337,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 3.541705716963449,
|
|
"grad_norm": 0.914618194103241,
|
|
"learning_rate": 4.326412774506444e-06,
|
|
"loss": 0.0287,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 3.5510777881911904,
|
|
"grad_norm": 0.8570281863212585,
|
|
"learning_rate": 4.275516224325356e-06,
|
|
"loss": 0.0319,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 3.5604498594189318,
|
|
"grad_norm": 0.8986263871192932,
|
|
"learning_rate": 4.224839296323196e-06,
|
|
"loss": 0.0322,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 3.5604498594189318,
|
|
"eval_loss": 0.9526164531707764,
|
|
"eval_runtime": 111.507,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 3.5698219306466727,
|
|
"grad_norm": 1.0641896724700928,
|
|
"learning_rate": 4.1743839347411875e-06,
|
|
"loss": 0.0317,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 3.579194001874414,
|
|
"grad_norm": 1.0256502628326416,
|
|
"learning_rate": 4.124152075320071e-06,
|
|
"loss": 0.0346,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 3.5885660731021556,
|
|
"grad_norm": 0.8067216277122498,
|
|
"learning_rate": 4.074145645225831e-06,
|
|
"loss": 0.0302,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 3.597938144329897,
|
|
"grad_norm": 0.9786953926086426,
|
|
"learning_rate": 4.0243665629757654e-06,
|
|
"loss": 0.0362,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 3.6073102155576384,
|
|
"grad_norm": 0.8346753716468811,
|
|
"learning_rate": 3.974816738364875e-06,
|
|
"loss": 0.0309,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 3.6166822867853794,
|
|
"grad_norm": 0.7229898571968079,
|
|
"learning_rate": 3.9254980723926e-06,
|
|
"loss": 0.03,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 3.626054358013121,
|
|
"grad_norm": 0.9483016729354858,
|
|
"learning_rate": 3.876412457189883e-06,
|
|
"loss": 0.032,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 3.6354264292408622,
|
|
"grad_norm": 0.9327901601791382,
|
|
"learning_rate": 3.8275617759465775e-06,
|
|
"loss": 0.0323,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 3.6447985004686037,
|
|
"grad_norm": 0.8537086844444275,
|
|
"learning_rate": 3.7789479028392007e-06,
|
|
"loss": 0.029,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 3.654170571696345,
|
|
"grad_norm": 0.891110360622406,
|
|
"learning_rate": 3.7305727029590245e-06,
|
|
"loss": 0.0342,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 3.663542642924086,
|
|
"grad_norm": 0.8868283629417419,
|
|
"learning_rate": 3.6824380322405273e-06,
|
|
"loss": 0.0315,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 3.6729147141518275,
|
|
"grad_norm": 0.9474219679832458,
|
|
"learning_rate": 3.6345457373901848e-06,
|
|
"loss": 0.0302,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 3.682286785379569,
|
|
"grad_norm": 0.9067096710205078,
|
|
"learning_rate": 3.5868976558156254e-06,
|
|
"loss": 0.0291,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 3.6916588566073103,
|
|
"grad_norm": 0.8193556070327759,
|
|
"learning_rate": 3.5394956155551285e-06,
|
|
"loss": 0.0309,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 3.7010309278350517,
|
|
"grad_norm": 0.8624306321144104,
|
|
"learning_rate": 3.492341435207509e-06,
|
|
"loss": 0.0312,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 3.7104029990627927,
|
|
"grad_norm": 0.7553118467330933,
|
|
"learning_rate": 3.445436923862322e-06,
|
|
"loss": 0.0298,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 3.719775070290534,
|
|
"grad_norm": 0.8075463175773621,
|
|
"learning_rate": 3.3987838810304752e-06,
|
|
"loss": 0.0297,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 3.7291471415182755,
|
|
"grad_norm": 1.0225906372070312,
|
|
"learning_rate": 3.3523840965751788e-06,
|
|
"loss": 0.032,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 3.738519212746017,
|
|
"grad_norm": 0.8977119326591492,
|
|
"learning_rate": 3.3062393506432843e-06,
|
|
"loss": 0.0705,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 3.7478912839737584,
|
|
"grad_norm": 0.8516520857810974,
|
|
"learning_rate": 3.2603514135969837e-06,
|
|
"loss": 0.0299,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 3.7478912839737584,
|
|
"eval_loss": 0.967979371547699,
|
|
"eval_runtime": 111.5017,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 3.7572633552014993,
|
|
"grad_norm": 0.9143263697624207,
|
|
"learning_rate": 3.214722045945895e-06,
|
|
"loss": 0.0295,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 3.7666354264292408,
|
|
"grad_norm": 0.8708062767982483,
|
|
"learning_rate": 3.1693529982795036e-06,
|
|
"loss": 0.0281,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 3.776007497656982,
|
|
"grad_norm": 0.9132674932479858,
|
|
"learning_rate": 3.124246011200018e-06,
|
|
"loss": 0.0301,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 3.7853795688847236,
|
|
"grad_norm": 0.9853923916816711,
|
|
"learning_rate": 3.079402815255591e-06,
|
|
"loss": 0.0313,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 3.794751640112465,
|
|
"grad_norm": 1.0308923721313477,
|
|
"learning_rate": 3.0348251308739106e-06,
|
|
"loss": 0.032,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 3.804123711340206,
|
|
"grad_norm": 0.7933114767074585,
|
|
"learning_rate": 2.9905146682962073e-06,
|
|
"loss": 0.0311,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 3.8134957825679474,
|
|
"grad_norm": 0.8838526606559753,
|
|
"learning_rate": 2.9464731275116355e-06,
|
|
"loss": 0.0325,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 3.822867853795689,
|
|
"grad_norm": 0.8747525811195374,
|
|
"learning_rate": 2.9027021981920566e-06,
|
|
"loss": 0.0314,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 3.8322399250234302,
|
|
"grad_norm": 0.7285995483398438,
|
|
"learning_rate": 2.8592035596272118e-06,
|
|
"loss": 0.0294,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 3.8416119962511717,
|
|
"grad_norm": 0.8272311091423035,
|
|
"learning_rate": 2.8159788806602904e-06,
|
|
"loss": 0.0318,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 3.8509840674789126,
|
|
"grad_norm": 0.7552247047424316,
|
|
"learning_rate": 2.773029819623916e-06,
|
|
"loss": 0.03,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 3.860356138706654,
|
|
"grad_norm": 0.9183073043823242,
|
|
"learning_rate": 2.730358024276509e-06,
|
|
"loss": 0.0314,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 3.8697282099343955,
|
|
"grad_norm": 0.8467240333557129,
|
|
"learning_rate": 2.6879651317390864e-06,
|
|
"loss": 0.0256,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 3.879100281162137,
|
|
"grad_norm": 0.850248396396637,
|
|
"learning_rate": 2.6458527684324376e-06,
|
|
"loss": 0.0299,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 3.8884723523898783,
|
|
"grad_norm": 0.7223458290100098,
|
|
"learning_rate": 2.6040225500147365e-06,
|
|
"loss": 0.0305,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 3.8978444236176193,
|
|
"grad_norm": 0.8155651092529297,
|
|
"learning_rate": 2.5624760813195436e-06,
|
|
"loss": 0.0298,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 3.9072164948453607,
|
|
"grad_norm": 0.7251290082931519,
|
|
"learning_rate": 2.5212149562942535e-06,
|
|
"loss": 0.0276,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 3.916588566073102,
|
|
"grad_norm": 1.1165629625320435,
|
|
"learning_rate": 2.48024075793893e-06,
|
|
"loss": 0.0309,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 3.9259606373008435,
|
|
"grad_norm": 1.0103236436843872,
|
|
"learning_rate": 2.4395550582455774e-06,
|
|
"loss": 0.0277,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 3.935332708528585,
|
|
"grad_norm": 0.912944495677948,
|
|
"learning_rate": 2.3991594181378286e-06,
|
|
"loss": 0.0335,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 3.935332708528585,
|
|
"eval_loss": 0.9605706930160522,
|
|
"eval_runtime": 111.5358,
|
|
"eval_samples_per_second": 4.483,
|
|
"eval_steps_per_second": 2.241,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 3.944704779756326,
|
|
"grad_norm": 0.925261378288269,
|
|
"learning_rate": 2.359055387411061e-06,
|
|
"loss": 0.0311,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 3.9540768509840674,
|
|
"grad_norm": 0.9867929220199585,
|
|
"learning_rate": 2.319244504672943e-06,
|
|
"loss": 0.0306,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 3.963448922211809,
|
|
"grad_norm": 0.9533296227455139,
|
|
"learning_rate": 2.279728297284394e-06,
|
|
"loss": 0.0309,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 3.97282099343955,
|
|
"grad_norm": 0.8042296171188354,
|
|
"learning_rate": 2.2405082813009926e-06,
|
|
"loss": 0.0257,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 3.9821930646672916,
|
|
"grad_norm": 0.8513698577880859,
|
|
"learning_rate": 2.201585961414815e-06,
|
|
"loss": 0.0277,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 3.9915651358950326,
|
|
"grad_norm": 0.8996440768241882,
|
|
"learning_rate": 2.1629628308967e-06,
|
|
"loss": 0.0309,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 1.3045603036880493,
|
|
"learning_rate": 2.1246403715389675e-06,
|
|
"loss": 0.0307,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 4.009372071227741,
|
|
"grad_norm": 0.5760667324066162,
|
|
"learning_rate": 2.0866200535985616e-06,
|
|
"loss": 0.0104,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 4.018744142455483,
|
|
"grad_norm": 0.32251426577568054,
|
|
"learning_rate": 2.0489033357406464e-06,
|
|
"loss": 0.0091,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 4.028116213683224,
|
|
"grad_norm": 0.3890618681907654,
|
|
"learning_rate": 2.011491664982644e-06,
|
|
"loss": 0.0093,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 4.037488284910966,
|
|
"grad_norm": 0.4246854782104492,
|
|
"learning_rate": 1.9743864766387198e-06,
|
|
"loss": 0.0094,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 4.046860356138707,
|
|
"grad_norm": 0.37308433651924133,
|
|
"learning_rate": 1.937589194264715e-06,
|
|
"loss": 0.0083,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 4.056232427366448,
|
|
"grad_norm": 0.29468032717704773,
|
|
"learning_rate": 1.9011012296035303e-06,
|
|
"loss": 0.0072,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 4.0656044985941895,
|
|
"grad_norm": 0.49253249168395996,
|
|
"learning_rate": 1.864923982530965e-06,
|
|
"loss": 0.0078,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 4.0749765698219305,
|
|
"grad_norm": 0.5254181623458862,
|
|
"learning_rate": 1.8290588410020116e-06,
|
|
"loss": 0.0078,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 4.084348641049672,
|
|
"grad_norm": 0.3478500247001648,
|
|
"learning_rate": 1.7935071809976035e-06,
|
|
"loss": 0.0075,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 4.093720712277413,
|
|
"grad_norm": 0.3770616352558136,
|
|
"learning_rate": 1.7582703664718247e-06,
|
|
"loss": 0.0082,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 4.103092783505154,
|
|
"grad_norm": 0.349509596824646,
|
|
"learning_rate": 1.7233497492995865e-06,
|
|
"loss": 0.0069,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 4.112464854732896,
|
|
"grad_norm": 0.43029990792274475,
|
|
"learning_rate": 1.6887466692247556e-06,
|
|
"loss": 0.0077,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 4.121836925960637,
|
|
"grad_norm": 0.6748161911964417,
|
|
"learning_rate": 1.654462453808755e-06,
|
|
"loss": 0.0073,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 4.121836925960637,
|
|
"eval_loss": 1.0975761413574219,
|
|
"eval_runtime": 111.5036,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 4.131208997188379,
|
|
"grad_norm": 0.6008536219596863,
|
|
"learning_rate": 1.6204984183796425e-06,
|
|
"loss": 0.0079,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 4.14058106841612,
|
|
"grad_norm": 0.4357309937477112,
|
|
"learning_rate": 1.5868558659816302e-06,
|
|
"loss": 0.0082,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 4.149953139643861,
|
|
"grad_norm": 0.4295269250869751,
|
|
"learning_rate": 1.5535360873251026e-06,
|
|
"loss": 0.008,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 4.159325210871603,
|
|
"grad_norm": 0.3729182183742523,
|
|
"learning_rate": 1.5205403607370984e-06,
|
|
"loss": 0.0071,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 4.168697282099344,
|
|
"grad_norm": 0.5101849436759949,
|
|
"learning_rate": 1.4878699521122654e-06,
|
|
"loss": 0.0081,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 4.178069353327086,
|
|
"grad_norm": 0.5576186776161194,
|
|
"learning_rate": 1.4555261148642929e-06,
|
|
"loss": 0.0088,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 4.187441424554827,
|
|
"grad_norm": 0.39585602283477783,
|
|
"learning_rate": 1.423510089877823e-06,
|
|
"loss": 0.0078,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 4.196813495782568,
|
|
"grad_norm": 0.45328739285469055,
|
|
"learning_rate": 1.3918231054608499e-06,
|
|
"loss": 0.0077,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 4.206185567010309,
|
|
"grad_norm": 0.45810526609420776,
|
|
"learning_rate": 1.3604663772975856e-06,
|
|
"loss": 0.0093,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 4.21555763823805,
|
|
"grad_norm": 0.4543026089668274,
|
|
"learning_rate": 1.3294411084018277e-06,
|
|
"loss": 0.007,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 4.224929709465792,
|
|
"grad_norm": 1.054495930671692,
|
|
"learning_rate": 1.2987484890708024e-06,
|
|
"loss": 0.0087,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 4.234301780693533,
|
|
"grad_norm": 0.5703629851341248,
|
|
"learning_rate": 1.268389696839497e-06,
|
|
"loss": 0.008,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 4.243673851921274,
|
|
"grad_norm": 0.41296708583831787,
|
|
"learning_rate": 1.2383658964354861e-06,
|
|
"loss": 0.006,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 4.253045923149016,
|
|
"grad_norm": 0.6897146701812744,
|
|
"learning_rate": 1.2086782397342445e-06,
|
|
"loss": 0.0076,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 4.262417994376757,
|
|
"grad_norm": 0.39745044708251953,
|
|
"learning_rate": 1.1793278657149532e-06,
|
|
"loss": 0.0084,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 4.271790065604499,
|
|
"grad_norm": 0.6803708672523499,
|
|
"learning_rate": 1.1503159004168074e-06,
|
|
"loss": 0.0063,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 4.28116213683224,
|
|
"grad_norm": 0.49779579043388367,
|
|
"learning_rate": 1.12164345689581e-06,
|
|
"loss": 0.0077,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 4.290534208059981,
|
|
"grad_norm": 0.42171531915664673,
|
|
"learning_rate": 1.0933116351820695e-06,
|
|
"loss": 0.0074,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 4.299906279287723,
|
|
"grad_norm": 0.41160067915916443,
|
|
"learning_rate": 1.0653215222376045e-06,
|
|
"loss": 0.0068,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 4.309278350515464,
|
|
"grad_norm": 0.4333638548851013,
|
|
"learning_rate": 1.0376741919146305e-06,
|
|
"loss": 0.0069,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 4.309278350515464,
|
|
"eval_loss": 1.1144713163375854,
|
|
"eval_runtime": 111.5268,
|
|
"eval_samples_per_second": 4.483,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 4.318650421743206,
|
|
"grad_norm": 0.621540367603302,
|
|
"learning_rate": 1.0103707049143673e-06,
|
|
"loss": 0.008,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 4.3280224929709465,
|
|
"grad_norm": 0.3928787112236023,
|
|
"learning_rate": 9.834121087463445e-07,
|
|
"loss": 0.0068,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 4.3373945641986875,
|
|
"grad_norm": 0.4444401264190674,
|
|
"learning_rate": 9.56799437688214e-07,
|
|
"loss": 0.0076,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 4.346766635426429,
|
|
"grad_norm": 0.4709712266921997,
|
|
"learning_rate": 9.305337127460678e-07,
|
|
"loss": 0.0064,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 4.35613870665417,
|
|
"grad_norm": 0.6003327369689941,
|
|
"learning_rate": 9.046159416152633e-07,
|
|
"loss": 0.007,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 4.365510777881912,
|
|
"grad_norm": 0.3838503360748291,
|
|
"learning_rate": 8.790471186417715e-07,
|
|
"loss": 0.0076,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 4.374882849109653,
|
|
"grad_norm": 0.5418089032173157,
|
|
"learning_rate": 8.538282247840201e-07,
|
|
"loss": 0.0072,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 4.384254920337394,
|
|
"grad_norm": 0.7511455416679382,
|
|
"learning_rate": 8.289602275752673e-07,
|
|
"loss": 0.009,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 4.393626991565136,
|
|
"grad_norm": 0.5192817449569702,
|
|
"learning_rate": 8.044440810864718e-07,
|
|
"loss": 0.0081,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 4.402999062792877,
|
|
"grad_norm": 0.6360767483711243,
|
|
"learning_rate": 7.80280725889696e-07,
|
|
"loss": 0.0079,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 4.412371134020619,
|
|
"grad_norm": 0.5308467149734497,
|
|
"learning_rate": 7.564710890220183e-07,
|
|
"loss": 0.0083,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 4.42174320524836,
|
|
"grad_norm": 0.4319888949394226,
|
|
"learning_rate": 7.3301608394997e-07,
|
|
"loss": 0.0079,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 4.431115276476101,
|
|
"grad_norm": 0.46917620301246643,
|
|
"learning_rate": 7.099166105344835e-07,
|
|
"loss": 0.0064,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 4.440487347703843,
|
|
"grad_norm": 0.455216646194458,
|
|
"learning_rate": 6.871735549963765e-07,
|
|
"loss": 0.007,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 4.449859418931584,
|
|
"grad_norm": 0.40280669927597046,
|
|
"learning_rate": 6.647877898823463e-07,
|
|
"loss": 0.0068,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 4.4592314901593255,
|
|
"grad_norm": 0.32350170612335205,
|
|
"learning_rate": 6.427601740314926e-07,
|
|
"loss": 0.0077,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 4.4686035613870665,
|
|
"grad_norm": 0.30398938059806824,
|
|
"learning_rate": 6.2109155254238e-07,
|
|
"loss": 0.0068,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 4.4779756326148075,
|
|
"grad_norm": 0.5104652047157288,
|
|
"learning_rate": 5.997827567405978e-07,
|
|
"loss": 0.0069,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 4.487347703842549,
|
|
"grad_norm": 0.4495840072631836,
|
|
"learning_rate": 5.788346041468796e-07,
|
|
"loss": 0.0065,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 4.49671977507029,
|
|
"grad_norm": 0.3475983440876007,
|
|
"learning_rate": 5.582478984457284e-07,
|
|
"loss": 0.0064,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 4.49671977507029,
|
|
"eval_loss": 1.1217763423919678,
|
|
"eval_runtime": 111.5486,
|
|
"eval_samples_per_second": 4.482,
|
|
"eval_steps_per_second": 2.241,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 4.506091846298032,
|
|
"grad_norm": 0.46389687061309814,
|
|
"learning_rate": 5.380234294545938e-07,
|
|
"loss": 0.0071,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 4.515463917525773,
|
|
"grad_norm": 0.3474023640155792,
|
|
"learning_rate": 5.181619730935617e-07,
|
|
"loss": 0.0067,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 4.524835988753514,
|
|
"grad_norm": 0.3991861045360565,
|
|
"learning_rate": 4.986642913555895e-07,
|
|
"loss": 0.0068,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 4.534208059981256,
|
|
"grad_norm": 0.4194345772266388,
|
|
"learning_rate": 4.795311322772722e-07,
|
|
"loss": 0.0077,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 4.543580131208997,
|
|
"grad_norm": 0.34731411933898926,
|
|
"learning_rate": 4.6076322991013946e-07,
|
|
"loss": 0.0063,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 4.552952202436739,
|
|
"grad_norm": 0.7513842582702637,
|
|
"learning_rate": 4.4236130429250347e-07,
|
|
"loss": 0.007,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 4.56232427366448,
|
|
"grad_norm": 0.35471469163894653,
|
|
"learning_rate": 4.2432606142182145e-07,
|
|
"loss": 0.0071,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 4.571696344892221,
|
|
"grad_norm": 0.3158963918685913,
|
|
"learning_rate": 4.06658193227617e-07,
|
|
"loss": 0.008,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 4.581068416119963,
|
|
"grad_norm": 0.510502815246582,
|
|
"learning_rate": 3.8935837754493497e-07,
|
|
"loss": 0.0083,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 4.590440487347704,
|
|
"grad_norm": 0.5745358467102051,
|
|
"learning_rate": 3.72427278088332e-07,
|
|
"loss": 0.0075,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 4.5998125585754455,
|
|
"grad_norm": 0.48121458292007446,
|
|
"learning_rate": 3.5586554442641587e-07,
|
|
"loss": 0.0081,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 4.609184629803186,
|
|
"grad_norm": 0.4651750922203064,
|
|
"learning_rate": 3.3967381195692317e-07,
|
|
"loss": 0.0069,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 4.618556701030927,
|
|
"grad_norm": 0.4792514443397522,
|
|
"learning_rate": 3.238527018823423e-07,
|
|
"loss": 0.0081,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 4.627928772258669,
|
|
"grad_norm": 0.4478175640106201,
|
|
"learning_rate": 3.08402821186079e-07,
|
|
"loss": 0.0063,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 4.63730084348641,
|
|
"grad_norm": 0.3196679949760437,
|
|
"learning_rate": 2.933247626091751e-07,
|
|
"loss": 0.0068,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 4.646672914714152,
|
|
"grad_norm": 0.5067555904388428,
|
|
"learning_rate": 2.786191046275588e-07,
|
|
"loss": 0.0076,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 4.656044985941893,
|
|
"grad_norm": 0.5797865986824036,
|
|
"learning_rate": 2.6428641142986043e-07,
|
|
"loss": 0.009,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 4.665417057169634,
|
|
"grad_norm": 0.5033183693885803,
|
|
"learning_rate": 2.503272328957584e-07,
|
|
"loss": 0.0078,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 4.674789128397376,
|
|
"grad_norm": 0.30220600962638855,
|
|
"learning_rate": 2.367421045748908e-07,
|
|
"loss": 0.007,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 4.684161199625117,
|
|
"grad_norm": 0.5532141923904419,
|
|
"learning_rate": 2.2353154766630358e-07,
|
|
"loss": 0.0086,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 4.684161199625117,
|
|
"eval_loss": 1.1228344440460205,
|
|
"eval_runtime": 111.5053,
|
|
"eval_samples_per_second": 4.484,
|
|
"eval_steps_per_second": 2.242,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 4.693533270852859,
|
|
"grad_norm": 0.4479539692401886,
|
|
"learning_rate": 2.1069606899845497e-07,
|
|
"loss": 0.0077,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 4.7029053420806,
|
|
"grad_norm": 0.4743359386920929,
|
|
"learning_rate": 1.9823616100977495e-07,
|
|
"loss": 0.0081,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 4.712277413308341,
|
|
"grad_norm": 0.38026347756385803,
|
|
"learning_rate": 1.8615230172976507e-07,
|
|
"loss": 0.0065,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 4.721649484536083,
|
|
"grad_norm": 0.5804769396781921,
|
|
"learning_rate": 1.744449547606697e-07,
|
|
"loss": 0.0092,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 4.7310215557638235,
|
|
"grad_norm": 0.5354004502296448,
|
|
"learning_rate": 1.6311456925967583e-07,
|
|
"loss": 0.0074,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 4.740393626991565,
|
|
"grad_norm": 0.6035090088844299,
|
|
"learning_rate": 1.5216157992169577e-07,
|
|
"loss": 0.0067,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 4.749765698219306,
|
|
"grad_norm": 0.5137022733688354,
|
|
"learning_rate": 1.41586406962676e-07,
|
|
"loss": 0.0075,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 4.759137769447047,
|
|
"grad_norm": 0.2721659541130066,
|
|
"learning_rate": 1.3138945610348564e-07,
|
|
"loss": 0.0072,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 4.768509840674789,
|
|
"grad_norm": 0.4901478886604309,
|
|
"learning_rate": 1.2157111855434667e-07,
|
|
"loss": 0.0065,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 4.77788191190253,
|
|
"grad_norm": 0.2981049716472626,
|
|
"learning_rate": 1.1213177099982376e-07,
|
|
"loss": 0.0069,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 4.787253983130272,
|
|
"grad_norm": 0.49158474802970886,
|
|
"learning_rate": 1.0307177558437686e-07,
|
|
"loss": 0.0082,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 4.796626054358013,
|
|
"grad_norm": 0.6860193610191345,
|
|
"learning_rate": 9.439147989846354e-08,
|
|
"loss": 0.0081,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 4.805998125585754,
|
|
"grad_norm": 0.7087129354476929,
|
|
"learning_rate": 8.609121696520283e-08,
|
|
"loss": 0.0084,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 4.815370196813496,
|
|
"grad_norm": 0.727730929851532,
|
|
"learning_rate": 7.817130522760452e-08,
|
|
"loss": 0.0334,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 4.824742268041237,
|
|
"grad_norm": 0.4352070391178131,
|
|
"learning_rate": 7.063204853634543e-08,
|
|
"loss": 0.0076,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 4.834114339268979,
|
|
"grad_norm": 0.3776610791683197,
|
|
"learning_rate": 6.347373613811325e-08,
|
|
"loss": 0.0059,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 4.84348641049672,
|
|
"grad_norm": 0.5180082321166992,
|
|
"learning_rate": 5.6696642664515465e-08,
|
|
"loss": 0.0081,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 4.852858481724461,
|
|
"grad_norm": 0.49723920226097107,
|
|
"learning_rate": 5.030102812153548e-08,
|
|
"loss": 0.0081,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 4.8622305529522025,
|
|
"grad_norm": 0.2777559161186218,
|
|
"learning_rate": 4.428713787955841e-08,
|
|
"loss": 0.007,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 4.8716026241799435,
|
|
"grad_norm": 0.44526979327201843,
|
|
"learning_rate": 3.865520266396416e-08,
|
|
"loss": 0.0072,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 4.8716026241799435,
|
|
"eval_loss": 1.1233325004577637,
|
|
"eval_runtime": 111.5373,
|
|
"eval_samples_per_second": 4.483,
|
|
"eval_steps_per_second": 2.241,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 4.880974695407685,
|
|
"grad_norm": 0.49195200204849243,
|
|
"learning_rate": 3.340543854626566e-08,
|
|
"loss": 0.0081,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 4.890346766635426,
|
|
"grad_norm": 0.33215323090553284,
|
|
"learning_rate": 2.8538046935828733e-08,
|
|
"loss": 0.0069,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 4.899718837863167,
|
|
"grad_norm": 0.43431356549263,
|
|
"learning_rate": 2.4053214572137274e-08,
|
|
"loss": 0.0066,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 4.909090909090909,
|
|
"grad_norm": 0.49866101145744324,
|
|
"learning_rate": 1.9951113517633346e-08,
|
|
"loss": 0.007,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 4.91846298031865,
|
|
"grad_norm": 0.5170955657958984,
|
|
"learning_rate": 1.6231901151113617e-08,
|
|
"loss": 0.0083,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 4.927835051546392,
|
|
"grad_norm": 0.3568389117717743,
|
|
"learning_rate": 1.2895720161693048e-08,
|
|
"loss": 0.0073,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 4.937207122774133,
|
|
"grad_norm": 0.4371294379234314,
|
|
"learning_rate": 9.942698543330409e-09,
|
|
"loss": 0.008,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 4.946579194001874,
|
|
"grad_norm": 0.6057606935501099,
|
|
"learning_rate": 7.372949589916633e-09,
|
|
"loss": 0.0078,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 4.955951265229616,
|
|
"grad_norm": 0.44616734981536865,
|
|
"learning_rate": 5.186571890929415e-09,
|
|
"loss": 0.0079,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 4.965323336457357,
|
|
"grad_norm": 0.5187500715255737,
|
|
"learning_rate": 3.383649327650673e-09,
|
|
"loss": 0.0082,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 4.974695407685099,
|
|
"grad_norm": 0.3702596426010132,
|
|
"learning_rate": 1.9642510699469096e-09,
|
|
"loss": 0.0074,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 4.98406747891284,
|
|
"grad_norm": 0.38890424370765686,
|
|
"learning_rate": 9.284315736168837e-10,
|
|
"loss": 0.0068,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 4.993439550140581,
|
|
"grad_norm": 0.40768128633499146,
|
|
"learning_rate": 2.762305783021724e-10,
|
|
"loss": 0.0079,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"step": 2669,
|
|
"total_flos": 3.0080813400754176e+18,
|
|
"train_loss": 0.12334943315905438,
|
|
"train_runtime": 40705.595,
|
|
"train_samples_per_second": 2.097,
|
|
"train_steps_per_second": 0.066
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 2670,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 5,
|
|
"save_steps": 100,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3.0080813400754176e+18,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|