Files
qwen3-4B-instruct-refiner-sft/trainer_state.json
ModelHub XC 68b2b217aa 初始化项目,由ModelHub XC社区提供模型
Model: lihaoxin2020/qwen3-4B-instruct-refiner-sft
Source: Original Platform
2026-05-10 14:51:59 +08:00

3983 lines
97 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 100,
"global_step": 2669,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00937207122774133,
"grad_norm": 32.909061431884766,
"learning_rate": 5.970149253731343e-07,
"loss": 1.2647,
"step": 5
},
{
"epoch": 0.01874414245548266,
"grad_norm": 25.104095458984375,
"learning_rate": 1.3432835820895524e-06,
"loss": 1.1231,
"step": 10
},
{
"epoch": 0.028116213683223992,
"grad_norm": 9.685723304748535,
"learning_rate": 2.08955223880597e-06,
"loss": 0.7966,
"step": 15
},
{
"epoch": 0.03748828491096532,
"grad_norm": 3.455462694168091,
"learning_rate": 2.835820895522388e-06,
"loss": 0.59,
"step": 20
},
{
"epoch": 0.046860356138706656,
"grad_norm": 3.022719621658325,
"learning_rate": 3.582089552238806e-06,
"loss": 0.5539,
"step": 25
},
{
"epoch": 0.056232427366447985,
"grad_norm": 2.2470591068267822,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.4943,
"step": 30
},
{
"epoch": 0.06560449859418932,
"grad_norm": 2.7969272136688232,
"learning_rate": 5.074626865671642e-06,
"loss": 0.5235,
"step": 35
},
{
"epoch": 0.07497656982193064,
"grad_norm": 4.2170538902282715,
"learning_rate": 5.820895522388061e-06,
"loss": 0.5126,
"step": 40
},
{
"epoch": 0.08434864104967198,
"grad_norm": 34.95182418823242,
"learning_rate": 6.567164179104478e-06,
"loss": 0.4857,
"step": 45
},
{
"epoch": 0.09372071227741331,
"grad_norm": 2.0386860370635986,
"learning_rate": 7.313432835820896e-06,
"loss": 0.4885,
"step": 50
},
{
"epoch": 0.10309278350515463,
"grad_norm": 2.5301098823547363,
"learning_rate": 8.059701492537314e-06,
"loss": 0.48,
"step": 55
},
{
"epoch": 0.11246485473289597,
"grad_norm": 2.5187911987304688,
"learning_rate": 8.805970149253732e-06,
"loss": 0.4729,
"step": 60
},
{
"epoch": 0.1218369259606373,
"grad_norm": 2.380256175994873,
"learning_rate": 9.552238805970149e-06,
"loss": 0.5112,
"step": 65
},
{
"epoch": 0.13120899718837864,
"grad_norm": 2.1329147815704346,
"learning_rate": 1.029850746268657e-05,
"loss": 0.4578,
"step": 70
},
{
"epoch": 0.14058106841611998,
"grad_norm": 2.7913565635681152,
"learning_rate": 1.1044776119402986e-05,
"loss": 0.4598,
"step": 75
},
{
"epoch": 0.14995313964386128,
"grad_norm": 2.056675910949707,
"learning_rate": 1.1791044776119405e-05,
"loss": 0.4831,
"step": 80
},
{
"epoch": 0.15932521087160262,
"grad_norm": 2.386592388153076,
"learning_rate": 1.2537313432835823e-05,
"loss": 0.473,
"step": 85
},
{
"epoch": 0.16869728209934395,
"grad_norm": 2.63767409324646,
"learning_rate": 1.328358208955224e-05,
"loss": 0.4841,
"step": 90
},
{
"epoch": 0.1780693533270853,
"grad_norm": 2.0835254192352295,
"learning_rate": 1.4029850746268658e-05,
"loss": 0.4657,
"step": 95
},
{
"epoch": 0.18744142455482662,
"grad_norm": 2.168680429458618,
"learning_rate": 1.4776119402985077e-05,
"loss": 0.4937,
"step": 100
},
{
"epoch": 0.18744142455482662,
"eval_loss": 0.6319828033447266,
"eval_runtime": 111.5664,
"eval_samples_per_second": 4.482,
"eval_steps_per_second": 2.241,
"step": 100
},
{
"epoch": 0.19681349578256796,
"grad_norm": 1.953171730041504,
"learning_rate": 1.5522388059701494e-05,
"loss": 0.4405,
"step": 105
},
{
"epoch": 0.20618556701030927,
"grad_norm": 1.805274248123169,
"learning_rate": 1.626865671641791e-05,
"loss": 0.4922,
"step": 110
},
{
"epoch": 0.2155576382380506,
"grad_norm": 2.0582938194274902,
"learning_rate": 1.701492537313433e-05,
"loss": 0.4722,
"step": 115
},
{
"epoch": 0.22492970946579194,
"grad_norm": 2.067007064819336,
"learning_rate": 1.7761194029850748e-05,
"loss": 0.4876,
"step": 120
},
{
"epoch": 0.23430178069353327,
"grad_norm": 2.0720055103302,
"learning_rate": 1.8507462686567165e-05,
"loss": 0.479,
"step": 125
},
{
"epoch": 0.2436738519212746,
"grad_norm": 2.026207208633423,
"learning_rate": 1.9253731343283585e-05,
"loss": 0.4642,
"step": 130
},
{
"epoch": 0.2530459231490159,
"grad_norm": 1.9669533967971802,
"learning_rate": 2e-05,
"loss": 0.479,
"step": 135
},
{
"epoch": 0.2624179943767573,
"grad_norm": 1.7911089658737183,
"learning_rate": 1.9999808172939662e-05,
"loss": 0.484,
"step": 140
},
{
"epoch": 0.2717900656044986,
"grad_norm": 1.6933950185775757,
"learning_rate": 1.9999232699118173e-05,
"loss": 0.4945,
"step": 145
},
{
"epoch": 0.28116213683223995,
"grad_norm": 1.988083839416504,
"learning_rate": 1.9998273600613825e-05,
"loss": 0.5123,
"step": 150
},
{
"epoch": 0.29053420805998126,
"grad_norm": 2.103421688079834,
"learning_rate": 1.999693091422282e-05,
"loss": 0.4682,
"step": 155
},
{
"epoch": 0.29990627928772257,
"grad_norm": 2.0768039226531982,
"learning_rate": 1.9995204691457883e-05,
"loss": 0.4885,
"step": 160
},
{
"epoch": 0.30927835051546393,
"grad_norm": 1.8248564004898071,
"learning_rate": 1.9993094998546257e-05,
"loss": 0.4735,
"step": 165
},
{
"epoch": 0.31865042174320524,
"grad_norm": 1.6503818035125732,
"learning_rate": 1.9990601916427183e-05,
"loss": 0.4733,
"step": 170
},
{
"epoch": 0.3280224929709466,
"grad_norm": 1.6220550537109375,
"learning_rate": 1.998772554074878e-05,
"loss": 0.4898,
"step": 175
},
{
"epoch": 0.3373945641986879,
"grad_norm": 1.5691450834274292,
"learning_rate": 1.9984465981864393e-05,
"loss": 0.4697,
"step": 180
},
{
"epoch": 0.3467666354264292,
"grad_norm": 1.856009602546692,
"learning_rate": 1.998082336482833e-05,
"loss": 0.46,
"step": 185
},
{
"epoch": 0.3561387066541706,
"grad_norm": 1.7977851629257202,
"learning_rate": 1.9976797829391104e-05,
"loss": 0.5193,
"step": 190
},
{
"epoch": 0.3655107778819119,
"grad_norm": 1.5994986295700073,
"learning_rate": 1.9972389529994043e-05,
"loss": 0.4666,
"step": 195
},
{
"epoch": 0.37488284910965325,
"grad_norm": 1.8488245010375977,
"learning_rate": 1.996759863576336e-05,
"loss": 0.511,
"step": 200
},
{
"epoch": 0.37488284910965325,
"eval_loss": 0.6320933699607849,
"eval_runtime": 111.4483,
"eval_samples_per_second": 4.486,
"eval_steps_per_second": 2.243,
"step": 200
},
{
"epoch": 0.38425492033739456,
"grad_norm": 2.441446542739868,
"learning_rate": 1.9962425330503693e-05,
"loss": 0.4696,
"step": 205
},
{
"epoch": 0.3936269915651359,
"grad_norm": 1.8430964946746826,
"learning_rate": 1.995686981269103e-05,
"loss": 0.4649,
"step": 210
},
{
"epoch": 0.4029990627928772,
"grad_norm": 1.7581799030303955,
"learning_rate": 1.9950932295465102e-05,
"loss": 0.4885,
"step": 215
},
{
"epoch": 0.41237113402061853,
"grad_norm": 1.6407780647277832,
"learning_rate": 1.9944613006621197e-05,
"loss": 0.4754,
"step": 220
},
{
"epoch": 0.4217432052483599,
"grad_norm": 1.6698272228240967,
"learning_rate": 1.9937912188601444e-05,
"loss": 0.4823,
"step": 225
},
{
"epoch": 0.4311152764761012,
"grad_norm": 1.5131304264068604,
"learning_rate": 1.9930830098485484e-05,
"loss": 0.4692,
"step": 230
},
{
"epoch": 0.44048734770384257,
"grad_norm": 1.6762291193008423,
"learning_rate": 1.992336700798062e-05,
"loss": 0.4901,
"step": 235
},
{
"epoch": 0.4498594189315839,
"grad_norm": 1.7265088558197021,
"learning_rate": 1.9915523203411397e-05,
"loss": 0.4627,
"step": 240
},
{
"epoch": 0.4592314901593252,
"grad_norm": 1.5664938688278198,
"learning_rate": 1.990729898570861e-05,
"loss": 0.4715,
"step": 245
},
{
"epoch": 0.46860356138706655,
"grad_norm": 1.6908628940582275,
"learning_rate": 1.989869467039776e-05,
"loss": 0.4984,
"step": 250
},
{
"epoch": 0.47797563261480785,
"grad_norm": 1.3762125968933105,
"learning_rate": 1.9889710587586953e-05,
"loss": 0.4663,
"step": 255
},
{
"epoch": 0.4873477038425492,
"grad_norm": 1.605967402458191,
"learning_rate": 1.9880347081954217e-05,
"loss": 0.4711,
"step": 260
},
{
"epoch": 0.4967197750702905,
"grad_norm": 1.5448018312454224,
"learning_rate": 1.987060451273432e-05,
"loss": 0.4637,
"step": 265
},
{
"epoch": 0.5060918462980318,
"grad_norm": 1.4862360954284668,
"learning_rate": 1.986048325370493e-05,
"loss": 0.4614,
"step": 270
},
{
"epoch": 0.5154639175257731,
"grad_norm": 1.587640643119812,
"learning_rate": 1.9849983693172324e-05,
"loss": 0.4819,
"step": 275
},
{
"epoch": 0.5248359887535146,
"grad_norm": 1.5963493585586548,
"learning_rate": 1.9839106233956474e-05,
"loss": 0.4912,
"step": 280
},
{
"epoch": 0.5342080599812559,
"grad_norm": 1.4431391954421997,
"learning_rate": 1.982785129337558e-05,
"loss": 0.4727,
"step": 285
},
{
"epoch": 0.5435801312089972,
"grad_norm": 1.4035024642944336,
"learning_rate": 1.9816219303230077e-05,
"loss": 0.4642,
"step": 290
},
{
"epoch": 0.5529522024367385,
"grad_norm": 1.5492186546325684,
"learning_rate": 1.980421070978606e-05,
"loss": 0.4881,
"step": 295
},
{
"epoch": 0.5623242736644799,
"grad_norm": 1.4136260747909546,
"learning_rate": 1.9791825973758167e-05,
"loss": 0.4657,
"step": 300
},
{
"epoch": 0.5623242736644799,
"eval_loss": 0.6458946466445923,
"eval_runtime": 111.4442,
"eval_samples_per_second": 4.487,
"eval_steps_per_second": 2.243,
"step": 300
},
{
"epoch": 0.5716963448922212,
"grad_norm": 1.6971220970153809,
"learning_rate": 1.9779065570291894e-05,
"loss": 0.4685,
"step": 305
},
{
"epoch": 0.5810684161199625,
"grad_norm": 1.4709703922271729,
"learning_rate": 1.9765929988945382e-05,
"loss": 0.4948,
"step": 310
},
{
"epoch": 0.5904404873477038,
"grad_norm": 1.7003625631332397,
"learning_rate": 1.975241973367062e-05,
"loss": 0.4963,
"step": 315
},
{
"epoch": 0.5998125585754451,
"grad_norm": 1.4582606554031372,
"learning_rate": 1.9738535322794122e-05,
"loss": 0.4827,
"step": 320
},
{
"epoch": 0.6091846298031866,
"grad_norm": 1.4838789701461792,
"learning_rate": 1.972427728899703e-05,
"loss": 0.4545,
"step": 325
},
{
"epoch": 0.6185567010309279,
"grad_norm": 1.3947144746780396,
"learning_rate": 1.9709646179294687e-05,
"loss": 0.4712,
"step": 330
},
{
"epoch": 0.6279287722586692,
"grad_norm": 1.554185390472412,
"learning_rate": 1.9694642555015643e-05,
"loss": 0.4702,
"step": 335
},
{
"epoch": 0.6373008434864105,
"grad_norm": 1.6660090684890747,
"learning_rate": 1.9679266991780128e-05,
"loss": 0.5128,
"step": 340
},
{
"epoch": 0.6466729147141518,
"grad_norm": 1.5578211545944214,
"learning_rate": 1.966352007947796e-05,
"loss": 0.4844,
"step": 345
},
{
"epoch": 0.6560449859418932,
"grad_norm": 1.4711651802062988,
"learning_rate": 1.964740242224592e-05,
"loss": 0.4798,
"step": 350
},
{
"epoch": 0.6654170571696345,
"grad_norm": 1.7752223014831543,
"learning_rate": 1.9630914638444572e-05,
"loss": 0.4922,
"step": 355
},
{
"epoch": 0.6747891283973758,
"grad_norm": 1.8880064487457275,
"learning_rate": 1.961405736063453e-05,
"loss": 0.4928,
"step": 360
},
{
"epoch": 0.6841611996251171,
"grad_norm": 1.5307488441467285,
"learning_rate": 1.9596831235552205e-05,
"loss": 0.4492,
"step": 365
},
{
"epoch": 0.6935332708528584,
"grad_norm": 1.4398698806762695,
"learning_rate": 1.957923692408499e-05,
"loss": 0.45,
"step": 370
},
{
"epoch": 0.7029053420805998,
"grad_norm": 1.5632487535476685,
"learning_rate": 1.9561275101245886e-05,
"loss": 0.4878,
"step": 375
},
{
"epoch": 0.7122774133083412,
"grad_norm": 1.4883025884628296,
"learning_rate": 1.954294645614763e-05,
"loss": 0.4799,
"step": 380
},
{
"epoch": 0.7216494845360825,
"grad_norm": 1.4776026010513306,
"learning_rate": 1.9524251691976243e-05,
"loss": 0.5043,
"step": 385
},
{
"epoch": 0.7310215557638238,
"grad_norm": 1.5219204425811768,
"learning_rate": 1.950519152596406e-05,
"loss": 0.4737,
"step": 390
},
{
"epoch": 0.7403936269915652,
"grad_norm": 1.6262823343276978,
"learning_rate": 1.9485766689362205e-05,
"loss": 0.4575,
"step": 395
},
{
"epoch": 0.7497656982193065,
"grad_norm": 1.5888830423355103,
"learning_rate": 1.9465977927412535e-05,
"loss": 0.4577,
"step": 400
},
{
"epoch": 0.7497656982193065,
"eval_loss": 0.6419793963432312,
"eval_runtime": 111.6005,
"eval_samples_per_second": 4.48,
"eval_steps_per_second": 2.24,
"step": 400
},
{
"epoch": 0.7591377694470478,
"grad_norm": 1.4423269033432007,
"learning_rate": 1.9445825999319057e-05,
"loss": 0.4451,
"step": 405
},
{
"epoch": 0.7685098406747891,
"grad_norm": 1.5620957612991333,
"learning_rate": 1.94253116782188e-05,
"loss": 0.4578,
"step": 410
},
{
"epoch": 0.7778819119025304,
"grad_norm": 1.326604962348938,
"learning_rate": 1.9404435751152134e-05,
"loss": 0.4772,
"step": 415
},
{
"epoch": 0.7872539831302718,
"grad_norm": 1.3322705030441284,
"learning_rate": 1.938319901903262e-05,
"loss": 0.4829,
"step": 420
},
{
"epoch": 0.7966260543580131,
"grad_norm": 1.4845640659332275,
"learning_rate": 1.9361602296616223e-05,
"loss": 0.4598,
"step": 425
},
{
"epoch": 0.8059981255857545,
"grad_norm": 1.4712871313095093,
"learning_rate": 1.9339646412470106e-05,
"loss": 0.4695,
"step": 430
},
{
"epoch": 0.8153701968134958,
"grad_norm": 1.3889317512512207,
"learning_rate": 1.931733220894081e-05,
"loss": 0.447,
"step": 435
},
{
"epoch": 0.8247422680412371,
"grad_norm": 1.4443881511688232,
"learning_rate": 1.9294660542121944e-05,
"loss": 0.4662,
"step": 440
},
{
"epoch": 0.8341143392689785,
"grad_norm": 1.5155857801437378,
"learning_rate": 1.9271632281821354e-05,
"loss": 0.4873,
"step": 445
},
{
"epoch": 0.8434864104967198,
"grad_norm": 1.5591299533843994,
"learning_rate": 1.9248248311527735e-05,
"loss": 0.4942,
"step": 450
},
{
"epoch": 0.8528584817244611,
"grad_norm": 1.5808844566345215,
"learning_rate": 1.9224509528376737e-05,
"loss": 0.472,
"step": 455
},
{
"epoch": 0.8622305529522024,
"grad_norm": 1.8616470098495483,
"learning_rate": 1.9200416843116562e-05,
"loss": 0.4577,
"step": 460
},
{
"epoch": 0.8716026241799437,
"grad_norm": 1.918115496635437,
"learning_rate": 1.9175971180073012e-05,
"loss": 0.4774,
"step": 465
},
{
"epoch": 0.8809746954076851,
"grad_norm": 1.411353349685669,
"learning_rate": 1.9151173477114015e-05,
"loss": 0.4682,
"step": 470
},
{
"epoch": 0.8903467666354264,
"grad_norm": 1.625918984413147,
"learning_rate": 1.9126024685613664e-05,
"loss": 0.4923,
"step": 475
},
{
"epoch": 0.8997188378631678,
"grad_norm": 1.388818621635437,
"learning_rate": 1.9100525770415713e-05,
"loss": 0.4766,
"step": 480
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.4252229928970337,
"learning_rate": 1.907467770979655e-05,
"loss": 0.4622,
"step": 485
},
{
"epoch": 0.9184629803186504,
"grad_norm": 1.6216133832931519,
"learning_rate": 1.9048481495427667e-05,
"loss": 0.4824,
"step": 490
},
{
"epoch": 0.9278350515463918,
"grad_norm": 1.6171802282333374,
"learning_rate": 1.9021938132337628e-05,
"loss": 0.4979,
"step": 495
},
{
"epoch": 0.9372071227741331,
"grad_norm": 1.5567573308944702,
"learning_rate": 1.8995048638873494e-05,
"loss": 0.4634,
"step": 500
},
{
"epoch": 0.9372071227741331,
"eval_loss": 0.6470092535018921,
"eval_runtime": 111.4606,
"eval_samples_per_second": 4.486,
"eval_steps_per_second": 2.243,
"step": 500
},
{
"epoch": 0.9465791940018744,
"grad_norm": 1.3252328634262085,
"learning_rate": 1.896781404666176e-05,
"loss": 0.4682,
"step": 505
},
{
"epoch": 0.9559512652296157,
"grad_norm": 1.6408551931381226,
"learning_rate": 1.8940235400568784e-05,
"loss": 0.4762,
"step": 510
},
{
"epoch": 0.9653233364573571,
"grad_norm": 1.6290283203125,
"learning_rate": 1.891231375866068e-05,
"loss": 0.4661,
"step": 515
},
{
"epoch": 0.9746954076850984,
"grad_norm": 1.402817964553833,
"learning_rate": 1.888405019216275e-05,
"loss": 0.5037,
"step": 520
},
{
"epoch": 0.9840674789128397,
"grad_norm": 1.366844892501831,
"learning_rate": 1.885544578541837e-05,
"loss": 0.4596,
"step": 525
},
{
"epoch": 0.993439550140581,
"grad_norm": 1.4287879467010498,
"learning_rate": 1.8826501635847392e-05,
"loss": 0.4652,
"step": 530
},
{
"epoch": 1.0037488284910965,
"grad_norm": 1.2563650608062744,
"learning_rate": 1.8797218853904037e-05,
"loss": 0.4833,
"step": 535
},
{
"epoch": 1.013120899718838,
"grad_norm": 1.2508701086044312,
"learning_rate": 1.8767598563034304e-05,
"loss": 0.287,
"step": 540
},
{
"epoch": 1.022492970946579,
"grad_norm": 1.3916987180709839,
"learning_rate": 1.8737641899632857e-05,
"loss": 0.2859,
"step": 545
},
{
"epoch": 1.0318650421743205,
"grad_norm": 1.4374034404754639,
"learning_rate": 1.870735001299943e-05,
"loss": 0.2746,
"step": 550
},
{
"epoch": 1.041237113402062,
"grad_norm": 1.3931026458740234,
"learning_rate": 1.8676724065294744e-05,
"loss": 0.255,
"step": 555
},
{
"epoch": 1.0506091846298031,
"grad_norm": 1.4676737785339355,
"learning_rate": 1.864576523149589e-05,
"loss": 0.2609,
"step": 560
},
{
"epoch": 1.0599812558575445,
"grad_norm": 1.3945457935333252,
"learning_rate": 1.8614474699351294e-05,
"loss": 0.2595,
"step": 565
},
{
"epoch": 1.069353327085286,
"grad_norm": 1.413190245628357,
"learning_rate": 1.8582853669335107e-05,
"loss": 0.2704,
"step": 570
},
{
"epoch": 1.0787253983130272,
"grad_norm": 1.2427492141723633,
"learning_rate": 1.8550903354601182e-05,
"loss": 0.2444,
"step": 575
},
{
"epoch": 1.0880974695407686,
"grad_norm": 1.3134554624557495,
"learning_rate": 1.851862498093651e-05,
"loss": 0.2606,
"step": 580
},
{
"epoch": 1.0974695407685098,
"grad_norm": 1.3855392932891846,
"learning_rate": 1.8486019786714194e-05,
"loss": 0.263,
"step": 585
},
{
"epoch": 1.1068416119962512,
"grad_norm": 1.4354616403579712,
"learning_rate": 1.8453089022845943e-05,
"loss": 0.2488,
"step": 590
},
{
"epoch": 1.1162136832239926,
"grad_norm": 1.1863958835601807,
"learning_rate": 1.8419833952734094e-05,
"loss": 0.2506,
"step": 595
},
{
"epoch": 1.1255857544517338,
"grad_norm": 1.5044498443603516,
"learning_rate": 1.83862558522231e-05,
"loss": 0.2661,
"step": 600
},
{
"epoch": 1.1255857544517338,
"eval_loss": 0.6920709013938904,
"eval_runtime": 111.4907,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 2.242,
"step": 600
},
{
"epoch": 1.1349578256794752,
"grad_norm": 1.4557913541793823,
"learning_rate": 1.835235600955064e-05,
"loss": 0.265,
"step": 605
},
{
"epoch": 1.1443298969072164,
"grad_norm": 1.3041000366210938,
"learning_rate": 1.8318135725298133e-05,
"loss": 0.261,
"step": 610
},
{
"epoch": 1.1537019681349578,
"grad_norm": 1.3250569105148315,
"learning_rate": 1.8283596312340893e-05,
"loss": 0.2638,
"step": 615
},
{
"epoch": 1.1630740393626993,
"grad_norm": 1.3970952033996582,
"learning_rate": 1.8248739095797726e-05,
"loss": 0.2642,
"step": 620
},
{
"epoch": 1.1724461105904405,
"grad_norm": 1.4045438766479492,
"learning_rate": 1.8213565412980114e-05,
"loss": 0.2909,
"step": 625
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.3580117225646973,
"learning_rate": 1.8178076613340886e-05,
"loss": 0.2541,
"step": 630
},
{
"epoch": 1.191190253045923,
"grad_norm": 1.3984880447387695,
"learning_rate": 1.8142274058422467e-05,
"loss": 0.253,
"step": 635
},
{
"epoch": 1.2005623242736645,
"grad_norm": 1.275099754333496,
"learning_rate": 1.8106159121804633e-05,
"loss": 0.2679,
"step": 640
},
{
"epoch": 1.209934395501406,
"grad_norm": 1.4693080186843872,
"learning_rate": 1.8069733189051802e-05,
"loss": 0.2586,
"step": 645
},
{
"epoch": 1.219306466729147,
"grad_norm": 1.3677211999893188,
"learning_rate": 1.80329976576599e-05,
"loss": 0.2877,
"step": 650
},
{
"epoch": 1.2286785379568885,
"grad_norm": 1.376230001449585,
"learning_rate": 1.7995953937002723e-05,
"loss": 0.2499,
"step": 655
},
{
"epoch": 1.2380506091846297,
"grad_norm": 1.380204677581787,
"learning_rate": 1.7958603448277882e-05,
"loss": 0.2426,
"step": 660
},
{
"epoch": 1.2474226804123711,
"grad_norm": 1.4259058237075806,
"learning_rate": 1.7920947624452264e-05,
"loss": 0.2806,
"step": 665
},
{
"epoch": 1.2567947516401126,
"grad_norm": 1.4305455684661865,
"learning_rate": 1.7882987910207066e-05,
"loss": 0.2657,
"step": 670
},
{
"epoch": 1.2661668228678538,
"grad_norm": 1.4844595193862915,
"learning_rate": 1.784472576188237e-05,
"loss": 0.2704,
"step": 675
},
{
"epoch": 1.2755388940955952,
"grad_norm": 1.28706693649292,
"learning_rate": 1.780616264742126e-05,
"loss": 0.2534,
"step": 680
},
{
"epoch": 1.2849109653233364,
"grad_norm": 1.3618587255477905,
"learning_rate": 1.776730004631352e-05,
"loss": 0.2715,
"step": 685
},
{
"epoch": 1.2942830365510778,
"grad_norm": 1.399498701095581,
"learning_rate": 1.7728139449538848e-05,
"loss": 0.2748,
"step": 690
},
{
"epoch": 1.3036551077788192,
"grad_norm": 1.3688334226608276,
"learning_rate": 1.768868235950968e-05,
"loss": 0.2625,
"step": 695
},
{
"epoch": 1.3130271790065604,
"grad_norm": 1.327973484992981,
"learning_rate": 1.7648930290013532e-05,
"loss": 0.2427,
"step": 700
},
{
"epoch": 1.3130271790065604,
"eval_loss": 0.6904003620147705,
"eval_runtime": 111.5048,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 700
},
{
"epoch": 1.3223992502343018,
"grad_norm": 1.5537965297698975,
"learning_rate": 1.760888476615493e-05,
"loss": 0.2487,
"step": 705
},
{
"epoch": 1.331771321462043,
"grad_norm": 1.382699728012085,
"learning_rate": 1.75685473242969e-05,
"loss": 0.2417,
"step": 710
},
{
"epoch": 1.3411433926897844,
"grad_norm": 1.4410724639892578,
"learning_rate": 1.7527919512002025e-05,
"loss": 0.2467,
"step": 715
},
{
"epoch": 1.3505154639175259,
"grad_norm": 1.448276400566101,
"learning_rate": 1.7487002887973057e-05,
"loss": 0.2525,
"step": 720
},
{
"epoch": 1.359887535145267,
"grad_norm": 1.4892441034317017,
"learning_rate": 1.7445799021993138e-05,
"loss": 0.2336,
"step": 725
},
{
"epoch": 1.3692596063730085,
"grad_norm": 1.2686562538146973,
"learning_rate": 1.7404309494865572e-05,
"loss": 0.2624,
"step": 730
},
{
"epoch": 1.3786316776007497,
"grad_norm": 1.36681067943573,
"learning_rate": 1.736253589835316e-05,
"loss": 0.279,
"step": 735
},
{
"epoch": 1.388003748828491,
"grad_norm": 1.4178364276885986,
"learning_rate": 1.7320479835117142e-05,
"loss": 0.2634,
"step": 740
},
{
"epoch": 1.3973758200562325,
"grad_norm": 1.7909929752349854,
"learning_rate": 1.7278142918655717e-05,
"loss": 0.2568,
"step": 745
},
{
"epoch": 1.4067478912839737,
"grad_norm": 1.4352169036865234,
"learning_rate": 1.7235526773242136e-05,
"loss": 0.2487,
"step": 750
},
{
"epoch": 1.4161199625117151,
"grad_norm": 1.3589709997177124,
"learning_rate": 1.719263303386237e-05,
"loss": 0.2612,
"step": 755
},
{
"epoch": 1.4254920337394563,
"grad_norm": 1.3523000478744507,
"learning_rate": 1.7149463346152412e-05,
"loss": 0.2644,
"step": 760
},
{
"epoch": 1.4348641049671977,
"grad_norm": 1.396602988243103,
"learning_rate": 1.7106019366335113e-05,
"loss": 0.2704,
"step": 765
},
{
"epoch": 1.4442361761949392,
"grad_norm": 1.379135012626648,
"learning_rate": 1.7062302761156667e-05,
"loss": 0.2593,
"step": 770
},
{
"epoch": 1.4536082474226804,
"grad_norm": 1.301147699356079,
"learning_rate": 1.701831520782264e-05,
"loss": 0.2592,
"step": 775
},
{
"epoch": 1.4629803186504218,
"grad_norm": 1.4539448022842407,
"learning_rate": 1.6974058393933647e-05,
"loss": 0.2909,
"step": 780
},
{
"epoch": 1.472352389878163,
"grad_norm": 1.5490386486053467,
"learning_rate": 1.692953401742059e-05,
"loss": 0.2771,
"step": 785
},
{
"epoch": 1.4817244611059044,
"grad_norm": 1.4883418083190918,
"learning_rate": 1.6884743786479513e-05,
"loss": 0.2529,
"step": 790
},
{
"epoch": 1.4910965323336458,
"grad_norm": 1.5105490684509277,
"learning_rate": 1.6839689419506092e-05,
"loss": 0.265,
"step": 795
},
{
"epoch": 1.5004686035613872,
"grad_norm": 1.461634635925293,
"learning_rate": 1.6794372645029674e-05,
"loss": 0.2608,
"step": 800
},
{
"epoch": 1.5004686035613872,
"eval_loss": 0.6895884871482849,
"eval_runtime": 111.5059,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 800
},
{
"epoch": 1.5098406747891284,
"grad_norm": 1.523145079612732,
"learning_rate": 1.6748795201646992e-05,
"loss": 0.2762,
"step": 805
},
{
"epoch": 1.5192127460168696,
"grad_norm": 1.366004228591919,
"learning_rate": 1.670295883795544e-05,
"loss": 0.28,
"step": 810
},
{
"epoch": 1.528584817244611,
"grad_norm": 1.6428511142730713,
"learning_rate": 1.6656865312485996e-05,
"loss": 0.2489,
"step": 815
},
{
"epoch": 1.5379568884723525,
"grad_norm": 1.31986665725708,
"learning_rate": 1.6610516393635757e-05,
"loss": 0.2498,
"step": 820
},
{
"epoch": 1.5473289597000939,
"grad_norm": 1.5260220766067505,
"learning_rate": 1.6563913859600102e-05,
"loss": 0.338,
"step": 825
},
{
"epoch": 1.556701030927835,
"grad_norm": 1.3370164632797241,
"learning_rate": 1.6517059498304444e-05,
"loss": 0.2468,
"step": 830
},
{
"epoch": 1.5660731021555763,
"grad_norm": 1.4251459836959839,
"learning_rate": 1.6469955107335666e-05,
"loss": 0.2764,
"step": 835
},
{
"epoch": 1.5754451733833177,
"grad_norm": 1.2612155675888062,
"learning_rate": 1.6422602493873137e-05,
"loss": 0.2613,
"step": 840
},
{
"epoch": 1.584817244611059,
"grad_norm": 1.3020036220550537,
"learning_rate": 1.637500347461938e-05,
"loss": 0.2618,
"step": 845
},
{
"epoch": 1.5941893158388005,
"grad_norm": 1.3664627075195312,
"learning_rate": 1.6327159875730393e-05,
"loss": 0.2476,
"step": 850
},
{
"epoch": 1.6035613870665417,
"grad_norm": 1.4827312231063843,
"learning_rate": 1.627907353274555e-05,
"loss": 0.2674,
"step": 855
},
{
"epoch": 1.612933458294283,
"grad_norm": 1.2991149425506592,
"learning_rate": 1.6230746290517227e-05,
"loss": 0.2716,
"step": 860
},
{
"epoch": 1.6223055295220243,
"grad_norm": 1.5782040357589722,
"learning_rate": 1.618218000313998e-05,
"loss": 0.2875,
"step": 865
},
{
"epoch": 1.6316776007497658,
"grad_norm": 1.4465105533599854,
"learning_rate": 1.613337653387943e-05,
"loss": 0.2723,
"step": 870
},
{
"epoch": 1.6410496719775072,
"grad_norm": 1.3791197538375854,
"learning_rate": 1.6084337755100795e-05,
"loss": 0.2572,
"step": 875
},
{
"epoch": 1.6504217432052484,
"grad_norm": 1.3755207061767578,
"learning_rate": 1.603506554819703e-05,
"loss": 0.2562,
"step": 880
},
{
"epoch": 1.6597938144329896,
"grad_norm": 1.4186309576034546,
"learning_rate": 1.598556180351665e-05,
"loss": 0.2679,
"step": 885
},
{
"epoch": 1.669165885660731,
"grad_norm": 1.3663445711135864,
"learning_rate": 1.5935828420291227e-05,
"loss": 0.2505,
"step": 890
},
{
"epoch": 1.6785379568884724,
"grad_norm": 1.4272841215133667,
"learning_rate": 1.588586730656249e-05,
"loss": 0.2861,
"step": 895
},
{
"epoch": 1.6879100281162138,
"grad_norm": 1.3556526899337769,
"learning_rate": 1.5835680379109166e-05,
"loss": 0.2811,
"step": 900
},
{
"epoch": 1.6879100281162138,
"eval_loss": 0.6763415336608887,
"eval_runtime": 111.4991,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 900
},
{
"epoch": 1.697282099343955,
"grad_norm": 1.527638554573059,
"learning_rate": 1.5785269563373402e-05,
"loss": 0.2655,
"step": 905
},
{
"epoch": 1.7066541705716962,
"grad_norm": 1.347113847732544,
"learning_rate": 1.573463679338692e-05,
"loss": 0.2783,
"step": 910
},
{
"epoch": 1.7160262417994376,
"grad_norm": 1.346537470817566,
"learning_rate": 1.56837840116968e-05,
"loss": 0.2712,
"step": 915
},
{
"epoch": 1.725398313027179,
"grad_norm": 1.3698228597640991,
"learning_rate": 1.5632713169290962e-05,
"loss": 0.2582,
"step": 920
},
{
"epoch": 1.7347703842549205,
"grad_norm": 1.4085627794265747,
"learning_rate": 1.5581426225523333e-05,
"loss": 0.262,
"step": 925
},
{
"epoch": 1.7441424554826617,
"grad_norm": 1.4400358200073242,
"learning_rate": 1.5529925148038635e-05,
"loss": 0.2636,
"step": 930
},
{
"epoch": 1.7535145267104029,
"grad_norm": 1.2298705577850342,
"learning_rate": 1.547821191269693e-05,
"loss": 0.2542,
"step": 935
},
{
"epoch": 1.7628865979381443,
"grad_norm": 1.4320347309112549,
"learning_rate": 1.5426288503497802e-05,
"loss": 0.2607,
"step": 940
},
{
"epoch": 1.7722586691658857,
"grad_norm": 1.4086341857910156,
"learning_rate": 1.5374156912504236e-05,
"loss": 0.2464,
"step": 945
},
{
"epoch": 1.7816307403936271,
"grad_norm": 1.3747973442077637,
"learning_rate": 1.532181913976621e-05,
"loss": 0.2781,
"step": 950
},
{
"epoch": 1.7910028116213683,
"grad_norm": 1.4264485836029053,
"learning_rate": 1.5269277193243936e-05,
"loss": 0.2872,
"step": 955
},
{
"epoch": 1.8003748828491095,
"grad_norm": 1.3113363981246948,
"learning_rate": 1.5216533088730844e-05,
"loss": 0.2693,
"step": 960
},
{
"epoch": 1.809746954076851,
"grad_norm": 1.3197410106658936,
"learning_rate": 1.516358884977624e-05,
"loss": 0.2495,
"step": 965
},
{
"epoch": 1.8191190253045924,
"grad_norm": 1.4005447626113892,
"learning_rate": 1.5110446507607666e-05,
"loss": 0.2792,
"step": 970
},
{
"epoch": 1.8284910965323338,
"grad_norm": 1.3619177341461182,
"learning_rate": 1.5057108101052978e-05,
"loss": 0.2496,
"step": 975
},
{
"epoch": 1.837863167760075,
"grad_norm": 1.3972722291946411,
"learning_rate": 1.5003575676462126e-05,
"loss": 0.2586,
"step": 980
},
{
"epoch": 1.8472352389878162,
"grad_norm": 1.3040308952331543,
"learning_rate": 1.4949851287628631e-05,
"loss": 0.2593,
"step": 985
},
{
"epoch": 1.8566073102155576,
"grad_norm": 1.4333730936050415,
"learning_rate": 1.4895936995710815e-05,
"loss": 0.2643,
"step": 990
},
{
"epoch": 1.865979381443299,
"grad_norm": 1.304624319076538,
"learning_rate": 1.4841834869152703e-05,
"loss": 0.2478,
"step": 995
},
{
"epoch": 1.8753514526710404,
"grad_norm": 1.3824489116668701,
"learning_rate": 1.478754698360467e-05,
"loss": 0.2506,
"step": 1000
},
{
"epoch": 1.8753514526710404,
"eval_loss": 0.6781994104385376,
"eval_runtime": 111.5183,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1000
},
{
"epoch": 1.8847235238987816,
"grad_norm": 1.5689202547073364,
"learning_rate": 1.473307542184382e-05,
"loss": 0.2811,
"step": 1005
},
{
"epoch": 1.8940955951265228,
"grad_norm": 1.357867956161499,
"learning_rate": 1.4678422273694062e-05,
"loss": 0.2637,
"step": 1010
},
{
"epoch": 1.9034676663542642,
"grad_norm": 1.241373896598816,
"learning_rate": 1.462358963594595e-05,
"loss": 0.2636,
"step": 1015
},
{
"epoch": 1.9128397375820057,
"grad_norm": 1.3964288234710693,
"learning_rate": 1.4568579612276222e-05,
"loss": 0.2741,
"step": 1020
},
{
"epoch": 1.922211808809747,
"grad_norm": 1.3163318634033203,
"learning_rate": 1.4513394313167104e-05,
"loss": 0.2621,
"step": 1025
},
{
"epoch": 1.9315838800374883,
"grad_norm": 1.3993713855743408,
"learning_rate": 1.4458035855825341e-05,
"loss": 0.2657,
"step": 1030
},
{
"epoch": 1.9409559512652295,
"grad_norm": 1.3384408950805664,
"learning_rate": 1.4402506364100957e-05,
"loss": 0.2598,
"step": 1035
},
{
"epoch": 1.9503280224929709,
"grad_norm": 1.4588673114776611,
"learning_rate": 1.4346807968405783e-05,
"loss": 0.2536,
"step": 1040
},
{
"epoch": 1.9597000937207123,
"grad_norm": 1.326058268547058,
"learning_rate": 1.4290942805631722e-05,
"loss": 0.2563,
"step": 1045
},
{
"epoch": 1.9690721649484537,
"grad_norm": 1.353257179260254,
"learning_rate": 1.4234913019068769e-05,
"loss": 0.2564,
"step": 1050
},
{
"epoch": 1.978444236176195,
"grad_norm": 1.4586265087127686,
"learning_rate": 1.4178720758322761e-05,
"loss": 0.2769,
"step": 1055
},
{
"epoch": 1.9878163074039361,
"grad_norm": 1.2936612367630005,
"learning_rate": 1.412236817923295e-05,
"loss": 0.2737,
"step": 1060
},
{
"epoch": 1.9971883786316775,
"grad_norm": 1.4073734283447266,
"learning_rate": 1.4065857443789246e-05,
"loss": 0.2717,
"step": 1065
},
{
"epoch": 2.005623242736645,
"grad_norm": 1.2421205043792725,
"learning_rate": 1.4009190720049309e-05,
"loss": 0.1902,
"step": 1070
},
{
"epoch": 2.014995313964386,
"grad_norm": 1.3869972229003906,
"learning_rate": 1.3952370182055332e-05,
"loss": 0.1134,
"step": 1075
},
{
"epoch": 2.0243673851921273,
"grad_norm": 1.3595290184020996,
"learning_rate": 1.389539800975068e-05,
"loss": 0.097,
"step": 1080
},
{
"epoch": 2.0337394564198688,
"grad_norm": 1.2397971153259277,
"learning_rate": 1.3838276388896216e-05,
"loss": 0.1022,
"step": 1085
},
{
"epoch": 2.04311152764761,
"grad_norm": 1.1282893419265747,
"learning_rate": 1.3781007510986464e-05,
"loss": 0.1003,
"step": 1090
},
{
"epoch": 2.0524835988753516,
"grad_norm": 1.2011518478393555,
"learning_rate": 1.3723593573165523e-05,
"loss": 0.0993,
"step": 1095
},
{
"epoch": 2.0618556701030926,
"grad_norm": 1.1846802234649658,
"learning_rate": 1.3666036778142773e-05,
"loss": 0.1031,
"step": 1100
},
{
"epoch": 2.0618556701030926,
"eval_loss": 0.7819597125053406,
"eval_runtime": 111.4871,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 2.242,
"step": 1100
},
{
"epoch": 2.071227741330834,
"grad_norm": 1.1528737545013428,
"learning_rate": 1.3608339334108378e-05,
"loss": 0.0938,
"step": 1105
},
{
"epoch": 2.0805998125585754,
"grad_norm": 1.2607845067977905,
"learning_rate": 1.355050345464855e-05,
"loss": 0.1048,
"step": 1110
},
{
"epoch": 2.089971883786317,
"grad_norm": 1.0643517971038818,
"learning_rate": 1.3492531358660634e-05,
"loss": 0.1056,
"step": 1115
},
{
"epoch": 2.0993439550140582,
"grad_norm": 1.2049908638000488,
"learning_rate": 1.3434425270267983e-05,
"loss": 0.1078,
"step": 1120
},
{
"epoch": 2.108716026241799,
"grad_norm": 1.1504206657409668,
"learning_rate": 1.3376187418734626e-05,
"loss": 0.0987,
"step": 1125
},
{
"epoch": 2.1180880974695406,
"grad_norm": 1.103416085243225,
"learning_rate": 1.3317820038379731e-05,
"loss": 0.1011,
"step": 1130
},
{
"epoch": 2.127460168697282,
"grad_norm": 1.2639893293380737,
"learning_rate": 1.3259325368491897e-05,
"loss": 0.1065,
"step": 1135
},
{
"epoch": 2.1368322399250235,
"grad_norm": 1.2981096506118774,
"learning_rate": 1.320070565324324e-05,
"loss": 0.1089,
"step": 1140
},
{
"epoch": 2.146204311152765,
"grad_norm": 1.3471019268035889,
"learning_rate": 1.314196314160329e-05,
"loss": 0.1034,
"step": 1145
},
{
"epoch": 2.155576382380506,
"grad_norm": 1.2037670612335205,
"learning_rate": 1.308310008725271e-05,
"loss": 0.0954,
"step": 1150
},
{
"epoch": 2.1649484536082473,
"grad_norm": 1.124943733215332,
"learning_rate": 1.3024118748496834e-05,
"loss": 0.1086,
"step": 1155
},
{
"epoch": 2.1743205248359887,
"grad_norm": 1.2061023712158203,
"learning_rate": 1.2965021388179036e-05,
"loss": 0.1032,
"step": 1160
},
{
"epoch": 2.18369259606373,
"grad_norm": 1.2710933685302734,
"learning_rate": 1.2905810273593887e-05,
"loss": 0.1024,
"step": 1165
},
{
"epoch": 2.1930646672914715,
"grad_norm": 1.1786785125732422,
"learning_rate": 1.28464876764002e-05,
"loss": 0.103,
"step": 1170
},
{
"epoch": 2.2024367385192125,
"grad_norm": 1.5116946697235107,
"learning_rate": 1.2787055872533867e-05,
"loss": 0.1107,
"step": 1175
},
{
"epoch": 2.211808809746954,
"grad_norm": 1.2890318632125854,
"learning_rate": 1.2727517142120527e-05,
"loss": 0.1019,
"step": 1180
},
{
"epoch": 2.2211808809746953,
"grad_norm": 1.184844970703125,
"learning_rate": 1.266787376938811e-05,
"loss": 0.1067,
"step": 1185
},
{
"epoch": 2.2305529522024368,
"grad_norm": 1.3428583145141602,
"learning_rate": 1.2608128042579185e-05,
"loss": 0.1066,
"step": 1190
},
{
"epoch": 2.239925023430178,
"grad_norm": 1.2953709363937378,
"learning_rate": 1.2548282253863181e-05,
"loss": 0.1138,
"step": 1195
},
{
"epoch": 2.2492970946579196,
"grad_norm": 1.1381481885910034,
"learning_rate": 1.2488338699248443e-05,
"loss": 0.1053,
"step": 1200
},
{
"epoch": 2.2492970946579196,
"eval_loss": 0.7939261198043823,
"eval_runtime": 111.5111,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1200
},
{
"epoch": 2.2586691658856606,
"grad_norm": 1.5689799785614014,
"learning_rate": 1.2428299678494146e-05,
"loss": 0.098,
"step": 1205
},
{
"epoch": 2.268041237113402,
"grad_norm": 1.3094913959503174,
"learning_rate": 1.236816749502206e-05,
"loss": 0.1111,
"step": 1210
},
{
"epoch": 2.2774133083411434,
"grad_norm": 1.2114543914794922,
"learning_rate": 1.2307944455828178e-05,
"loss": 0.1051,
"step": 1215
},
{
"epoch": 2.286785379568885,
"grad_norm": 1.1505310535430908,
"learning_rate": 1.2247632871394223e-05,
"loss": 0.0927,
"step": 1220
},
{
"epoch": 2.296157450796626,
"grad_norm": 1.2007763385772705,
"learning_rate": 1.218723505559898e-05,
"loss": 0.1081,
"step": 1225
},
{
"epoch": 2.3055295220243672,
"grad_norm": 1.1881816387176514,
"learning_rate": 1.2126753325629543e-05,
"loss": 0.0984,
"step": 1230
},
{
"epoch": 2.3149015932521086,
"grad_norm": 1.2576075792312622,
"learning_rate": 1.2066190001892398e-05,
"loss": 0.112,
"step": 1235
},
{
"epoch": 2.32427366447985,
"grad_norm": 1.2001255750656128,
"learning_rate": 1.200554740792442e-05,
"loss": 0.107,
"step": 1240
},
{
"epoch": 2.3336457357075915,
"grad_norm": 1.2408965826034546,
"learning_rate": 1.1944827870303719e-05,
"loss": 0.1166,
"step": 1245
},
{
"epoch": 2.3430178069353325,
"grad_norm": 1.1618740558624268,
"learning_rate": 1.1884033718560372e-05,
"loss": 0.0978,
"step": 1250
},
{
"epoch": 2.352389878163074,
"grad_norm": 1.177768349647522,
"learning_rate": 1.1823167285087064e-05,
"loss": 0.1027,
"step": 1255
},
{
"epoch": 2.3617619493908153,
"grad_norm": 1.1294364929199219,
"learning_rate": 1.1762230905049593e-05,
"loss": 0.1087,
"step": 1260
},
{
"epoch": 2.3711340206185567,
"grad_norm": 1.4736202955245972,
"learning_rate": 1.1701226916297295e-05,
"loss": 0.1142,
"step": 1265
},
{
"epoch": 2.380506091846298,
"grad_norm": 1.2007415294647217,
"learning_rate": 1.164015765927333e-05,
"loss": 0.1076,
"step": 1270
},
{
"epoch": 2.3898781630740396,
"grad_norm": 1.274434208869934,
"learning_rate": 1.1579025476924912e-05,
"loss": 0.1116,
"step": 1275
},
{
"epoch": 2.3992502343017805,
"grad_norm": 1.3655272722244263,
"learning_rate": 1.1517832714613406e-05,
"loss": 0.1079,
"step": 1280
},
{
"epoch": 2.408622305529522,
"grad_norm": 1.2331844568252563,
"learning_rate": 1.1456581720024356e-05,
"loss": 0.1056,
"step": 1285
},
{
"epoch": 2.4179943767572634,
"grad_norm": 1.1586816310882568,
"learning_rate": 1.1395274843077405e-05,
"loss": 0.1067,
"step": 1290
},
{
"epoch": 2.427366447985005,
"grad_norm": 1.271945834159851,
"learning_rate": 1.1333914435836153e-05,
"loss": 0.1051,
"step": 1295
},
{
"epoch": 2.436738519212746,
"grad_norm": 1.1621251106262207,
"learning_rate": 1.1272502852417908e-05,
"loss": 0.1009,
"step": 1300
},
{
"epoch": 2.436738519212746,
"eval_loss": 0.777266263961792,
"eval_runtime": 111.4978,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1300
},
{
"epoch": 2.446110590440487,
"grad_norm": 1.1645290851593018,
"learning_rate": 1.1211042448903374e-05,
"loss": 0.1169,
"step": 1305
},
{
"epoch": 2.4554826616682286,
"grad_norm": 1.163246989250183,
"learning_rate": 1.1149535583246253e-05,
"loss": 0.0952,
"step": 1310
},
{
"epoch": 2.46485473289597,
"grad_norm": 1.3993792533874512,
"learning_rate": 1.1087984615182797e-05,
"loss": 0.1178,
"step": 1315
},
{
"epoch": 2.4742268041237114,
"grad_norm": 1.1687663793563843,
"learning_rate": 1.1026391906141255e-05,
"loss": 0.0978,
"step": 1320
},
{
"epoch": 2.483598875351453,
"grad_norm": 1.1476637125015259,
"learning_rate": 1.0964759819151289e-05,
"loss": 0.0946,
"step": 1325
},
{
"epoch": 2.492970946579194,
"grad_norm": 1.0236659049987793,
"learning_rate": 1.0903090718753317e-05,
"loss": 0.1057,
"step": 1330
},
{
"epoch": 2.5023430178069352,
"grad_norm": 1.4007511138916016,
"learning_rate": 1.0841386970907786e-05,
"loss": 0.1124,
"step": 1335
},
{
"epoch": 2.5117150890346767,
"grad_norm": 1.2030051946640015,
"learning_rate": 1.077965094290441e-05,
"loss": 0.102,
"step": 1340
},
{
"epoch": 2.521087160262418,
"grad_norm": 1.0863361358642578,
"learning_rate": 1.0717885003271338e-05,
"loss": 0.1501,
"step": 1345
},
{
"epoch": 2.530459231490159,
"grad_norm": 1.441186547279358,
"learning_rate": 1.0656091521684297e-05,
"loss": 0.1111,
"step": 1350
},
{
"epoch": 2.539831302717901,
"grad_norm": 1.1081117391586304,
"learning_rate": 1.0594272868875677e-05,
"loss": 0.0995,
"step": 1355
},
{
"epoch": 2.549203373945642,
"grad_norm": 1.3063805103302002,
"learning_rate": 1.0532431416543559e-05,
"loss": 0.1026,
"step": 1360
},
{
"epoch": 2.5585754451733833,
"grad_norm": 1.265457034111023,
"learning_rate": 1.0470569537260746e-05,
"loss": 0.1137,
"step": 1365
},
{
"epoch": 2.5679475164011247,
"grad_norm": 1.1931920051574707,
"learning_rate": 1.040868960438373e-05,
"loss": 0.1056,
"step": 1370
},
{
"epoch": 2.5773195876288657,
"grad_norm": 1.2705389261245728,
"learning_rate": 1.0346793991961636e-05,
"loss": 0.0992,
"step": 1375
},
{
"epoch": 2.5866916588566076,
"grad_norm": 1.2234851121902466,
"learning_rate": 1.0284885074645139e-05,
"loss": 0.1067,
"step": 1380
},
{
"epoch": 2.5960637300843485,
"grad_norm": 1.30626380443573,
"learning_rate": 1.022296522759536e-05,
"loss": 0.1071,
"step": 1385
},
{
"epoch": 2.60543580131209,
"grad_norm": 1.1325551271438599,
"learning_rate": 1.016103682639275e-05,
"loss": 0.0946,
"step": 1390
},
{
"epoch": 2.6148078725398314,
"grad_norm": 1.2140247821807861,
"learning_rate": 1.009910224694593e-05,
"loss": 0.1012,
"step": 1395
},
{
"epoch": 2.624179943767573,
"grad_norm": 1.2330358028411865,
"learning_rate": 1.0037163865400577e-05,
"loss": 0.1022,
"step": 1400
},
{
"epoch": 2.624179943767573,
"eval_loss": 0.7983193397521973,
"eval_runtime": 111.5048,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1400
},
{
"epoch": 2.633552014995314,
"grad_norm": 1.2977453470230103,
"learning_rate": 9.97522405804821e-06,
"loss": 0.1086,
"step": 1405
},
{
"epoch": 2.642924086223055,
"grad_norm": 1.2647531032562256,
"learning_rate": 9.913285201235065e-06,
"loss": 0.1051,
"step": 1410
},
{
"epoch": 2.6522961574507966,
"grad_norm": 1.3180173635482788,
"learning_rate": 9.85134967127091e-06,
"loss": 0.1142,
"step": 1415
},
{
"epoch": 2.661668228678538,
"grad_norm": 1.2392545938491821,
"learning_rate": 9.789419844337868e-06,
"loss": 0.1047,
"step": 1420
},
{
"epoch": 2.6710402999062794,
"grad_norm": 1.1911959648132324,
"learning_rate": 9.727498096399272e-06,
"loss": 0.0908,
"step": 1425
},
{
"epoch": 2.680412371134021,
"grad_norm": 1.3625760078430176,
"learning_rate": 9.665586803108495e-06,
"loss": 0.0967,
"step": 1430
},
{
"epoch": 2.689784442361762,
"grad_norm": 1.077038288116455,
"learning_rate": 9.603688339717818e-06,
"loss": 0.1055,
"step": 1435
},
{
"epoch": 2.6991565135895033,
"grad_norm": 1.2724173069000244,
"learning_rate": 9.541805080987298e-06,
"loss": 0.1024,
"step": 1440
},
{
"epoch": 2.7085285848172447,
"grad_norm": 1.246999979019165,
"learning_rate": 9.47993940109365e-06,
"loss": 0.1096,
"step": 1445
},
{
"epoch": 2.717900656044986,
"grad_norm": 1.1447161436080933,
"learning_rate": 9.418093673539181e-06,
"loss": 0.0964,
"step": 1450
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.3298566341400146,
"learning_rate": 9.356270271060711e-06,
"loss": 0.1036,
"step": 1455
},
{
"epoch": 2.7366447985004685,
"grad_norm": 1.3487498760223389,
"learning_rate": 9.294471565538552e-06,
"loss": 0.1054,
"step": 1460
},
{
"epoch": 2.74601686972821,
"grad_norm": 1.2166017293930054,
"learning_rate": 9.232699927905508e-06,
"loss": 0.1031,
"step": 1465
},
{
"epoch": 2.7553889409559513,
"grad_norm": 1.1950914859771729,
"learning_rate": 9.170957728055907e-06,
"loss": 0.0988,
"step": 1470
},
{
"epoch": 2.7647610121836927,
"grad_norm": 1.0390832424163818,
"learning_rate": 9.10924733475469e-06,
"loss": 0.1038,
"step": 1475
},
{
"epoch": 2.774133083411434,
"grad_norm": 1.190873146057129,
"learning_rate": 9.047571115546526e-06,
"loss": 0.1036,
"step": 1480
},
{
"epoch": 2.783505154639175,
"grad_norm": 1.1870976686477661,
"learning_rate": 8.985931436664981e-06,
"loss": 0.1032,
"step": 1485
},
{
"epoch": 2.7928772258669166,
"grad_norm": 1.2104380130767822,
"learning_rate": 8.924330662941731e-06,
"loss": 0.1006,
"step": 1490
},
{
"epoch": 2.802249297094658,
"grad_norm": 1.1908341646194458,
"learning_rate": 8.862771157715847e-06,
"loss": 0.0984,
"step": 1495
},
{
"epoch": 2.8116213683223994,
"grad_norm": 1.3652592897415161,
"learning_rate": 8.801255282743113e-06,
"loss": 0.1087,
"step": 1500
},
{
"epoch": 2.8116213683223994,
"eval_loss": 0.8067182898521423,
"eval_runtime": 111.5238,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 2.242,
"step": 1500
},
{
"epoch": 2.820993439550141,
"grad_norm": 1.3108559846878052,
"learning_rate": 8.739785398105419e-06,
"loss": 0.1096,
"step": 1505
},
{
"epoch": 2.830365510777882,
"grad_norm": 1.1820882558822632,
"learning_rate": 8.678363862120224e-06,
"loss": 0.0961,
"step": 1510
},
{
"epoch": 2.839737582005623,
"grad_norm": 1.0882302522659302,
"learning_rate": 8.616993031250059e-06,
"loss": 0.097,
"step": 1515
},
{
"epoch": 2.8491096532333646,
"grad_norm": 1.3416924476623535,
"learning_rate": 8.555675260012137e-06,
"loss": 0.1011,
"step": 1520
},
{
"epoch": 2.858481724461106,
"grad_norm": 1.3005818128585815,
"learning_rate": 8.49441290088803e-06,
"loss": 0.1064,
"step": 1525
},
{
"epoch": 2.8678537956888475,
"grad_norm": 1.203696846961975,
"learning_rate": 8.433208304233383e-06,
"loss": 0.0907,
"step": 1530
},
{
"epoch": 2.8772258669165884,
"grad_norm": 1.1533688306808472,
"learning_rate": 8.372063818187768e-06,
"loss": 0.0951,
"step": 1535
},
{
"epoch": 2.88659793814433,
"grad_norm": 1.21674382686615,
"learning_rate": 8.31098178858459e-06,
"loss": 0.0924,
"step": 1540
},
{
"epoch": 2.8959700093720713,
"grad_norm": 1.3103758096694946,
"learning_rate": 8.249964558861084e-06,
"loss": 0.1038,
"step": 1545
},
{
"epoch": 2.9053420805998127,
"grad_norm": 1.1318589448928833,
"learning_rate": 8.189014469968407e-06,
"loss": 0.0991,
"step": 1550
},
{
"epoch": 2.914714151827554,
"grad_norm": 1.3271617889404297,
"learning_rate": 8.128133860281838e-06,
"loss": 0.1061,
"step": 1555
},
{
"epoch": 2.924086223055295,
"grad_norm": 1.2122989892959595,
"learning_rate": 8.067325065511056e-06,
"loss": 0.0995,
"step": 1560
},
{
"epoch": 2.9334582942830365,
"grad_norm": 1.286104440689087,
"learning_rate": 8.006590418610523e-06,
"loss": 0.1069,
"step": 1565
},
{
"epoch": 2.942830365510778,
"grad_norm": 1.3062405586242676,
"learning_rate": 7.945932249690002e-06,
"loss": 0.1025,
"step": 1570
},
{
"epoch": 2.9522024367385193,
"grad_norm": 1.2752856016159058,
"learning_rate": 7.885352885925139e-06,
"loss": 0.1097,
"step": 1575
},
{
"epoch": 2.9615745079662608,
"grad_norm": 1.1971313953399658,
"learning_rate": 7.824854651468187e-06,
"loss": 0.1002,
"step": 1580
},
{
"epoch": 2.9709465791940017,
"grad_norm": 1.3056398630142212,
"learning_rate": 7.764439867358836e-06,
"loss": 0.1088,
"step": 1585
},
{
"epoch": 2.980318650421743,
"grad_norm": 1.2253344058990479,
"learning_rate": 7.704110851435174e-06,
"loss": 0.1047,
"step": 1590
},
{
"epoch": 2.9896907216494846,
"grad_norm": 1.1375926733016968,
"learning_rate": 7.643869918244759e-06,
"loss": 0.0937,
"step": 1595
},
{
"epoch": 2.999062792877226,
"grad_norm": 1.2414946556091309,
"learning_rate": 7.583719378955816e-06,
"loss": 0.1046,
"step": 1600
},
{
"epoch": 2.999062792877226,
"eval_loss": 0.8037455081939697,
"eval_runtime": 111.5354,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 2.241,
"step": 1600
},
{
"epoch": 3.007497656982193,
"grad_norm": 0.8191234469413757,
"learning_rate": 7.523661541268571e-06,
"loss": 0.054,
"step": 1605
},
{
"epoch": 3.0168697282099344,
"grad_norm": 0.6123488545417786,
"learning_rate": 7.463698709326708e-06,
"loss": 0.0328,
"step": 1610
},
{
"epoch": 3.026241799437676,
"grad_norm": 1.0028489828109741,
"learning_rate": 7.403833183628995e-06,
"loss": 0.0345,
"step": 1615
},
{
"epoch": 3.035613870665417,
"grad_norm": 1.0307646989822388,
"learning_rate": 7.344067260940989e-06,
"loss": 0.0323,
"step": 1620
},
{
"epoch": 3.044985941893158,
"grad_norm": 0.9559040069580078,
"learning_rate": 7.284403234206939e-06,
"loss": 0.035,
"step": 1625
},
{
"epoch": 3.0543580131208996,
"grad_norm": 0.9424014687538147,
"learning_rate": 7.224843392461818e-06,
"loss": 0.033,
"step": 1630
},
{
"epoch": 3.063730084348641,
"grad_norm": 0.845702588558197,
"learning_rate": 7.165390020743498e-06,
"loss": 0.0324,
"step": 1635
},
{
"epoch": 3.0731021555763824,
"grad_norm": 0.8844259977340698,
"learning_rate": 7.106045400005083e-06,
"loss": 0.0284,
"step": 1640
},
{
"epoch": 3.082474226804124,
"grad_norm": 0.7264754772186279,
"learning_rate": 7.046811807027401e-06,
"loss": 0.0344,
"step": 1645
},
{
"epoch": 3.091846298031865,
"grad_norm": 0.8641548156738281,
"learning_rate": 6.987691514331656e-06,
"loss": 0.0366,
"step": 1650
},
{
"epoch": 3.1012183692596063,
"grad_norm": 0.8383805155754089,
"learning_rate": 6.928686790092235e-06,
"loss": 0.0323,
"step": 1655
},
{
"epoch": 3.1105904404873477,
"grad_norm": 1.0214649438858032,
"learning_rate": 6.869799898049704e-06,
"loss": 0.0333,
"step": 1660
},
{
"epoch": 3.119962511715089,
"grad_norm": 1.09578537940979,
"learning_rate": 6.811033097423938e-06,
"loss": 0.0357,
"step": 1665
},
{
"epoch": 3.1293345829428305,
"grad_norm": 0.9607039093971252,
"learning_rate": 6.752388642827459e-06,
"loss": 0.0356,
"step": 1670
},
{
"epoch": 3.138706654170572,
"grad_norm": 0.9811620712280273,
"learning_rate": 6.693868784178934e-06,
"loss": 0.0325,
"step": 1675
},
{
"epoch": 3.148078725398313,
"grad_norm": 1.125432014465332,
"learning_rate": 6.635475766616852e-06,
"loss": 0.0341,
"step": 1680
},
{
"epoch": 3.1574507966260543,
"grad_norm": 0.8190117478370667,
"learning_rate": 6.577211830413397e-06,
"loss": 0.0318,
"step": 1685
},
{
"epoch": 3.1668228678537957,
"grad_norm": 0.8427776098251343,
"learning_rate": 6.519079210888486e-06,
"loss": 0.0326,
"step": 1690
},
{
"epoch": 3.176194939081537,
"grad_norm": 0.9349907636642456,
"learning_rate": 6.461080138324025e-06,
"loss": 0.0303,
"step": 1695
},
{
"epoch": 3.1855670103092786,
"grad_norm": 0.7530879378318787,
"learning_rate": 6.40321683787833e-06,
"loss": 0.0311,
"step": 1700
},
{
"epoch": 3.1855670103092786,
"eval_loss": 0.9447797536849976,
"eval_runtime": 111.5066,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1700
},
{
"epoch": 3.1949390815370196,
"grad_norm": 1.094067096710205,
"learning_rate": 6.345491529500769e-06,
"loss": 0.0362,
"step": 1705
},
{
"epoch": 3.204311152764761,
"grad_norm": 0.9973980784416199,
"learning_rate": 6.287906427846583e-06,
"loss": 0.0311,
"step": 1710
},
{
"epoch": 3.2136832239925024,
"grad_norm": 0.954328179359436,
"learning_rate": 6.230463742191926e-06,
"loss": 0.0316,
"step": 1715
},
{
"epoch": 3.223055295220244,
"grad_norm": 0.8958219289779663,
"learning_rate": 6.173165676349103e-06,
"loss": 0.0319,
"step": 1720
},
{
"epoch": 3.2324273664479852,
"grad_norm": 0.8772101402282715,
"learning_rate": 6.116014428582022e-06,
"loss": 0.033,
"step": 1725
},
{
"epoch": 3.241799437675726,
"grad_norm": 0.8836532235145569,
"learning_rate": 6.059012191521853e-06,
"loss": 0.0345,
"step": 1730
},
{
"epoch": 3.2511715089034676,
"grad_norm": 1.0338672399520874,
"learning_rate": 6.002161152082909e-06,
"loss": 0.0322,
"step": 1735
},
{
"epoch": 3.260543580131209,
"grad_norm": 0.7626182436943054,
"learning_rate": 5.945463491378746e-06,
"loss": 0.034,
"step": 1740
},
{
"epoch": 3.2699156513589505,
"grad_norm": 1.0167630910873413,
"learning_rate": 5.888921384638477e-06,
"loss": 0.0323,
"step": 1745
},
{
"epoch": 3.279287722586692,
"grad_norm": 0.8768958449363708,
"learning_rate": 5.832537001123328e-06,
"loss": 0.0335,
"step": 1750
},
{
"epoch": 3.288659793814433,
"grad_norm": 0.8373109698295593,
"learning_rate": 5.7763125040434084e-06,
"loss": 0.0306,
"step": 1755
},
{
"epoch": 3.2980318650421743,
"grad_norm": 0.7997825741767883,
"learning_rate": 5.720250050474723e-06,
"loss": 0.0314,
"step": 1760
},
{
"epoch": 3.3074039362699157,
"grad_norm": 0.9116000533103943,
"learning_rate": 5.66435179127639e-06,
"loss": 0.0342,
"step": 1765
},
{
"epoch": 3.316776007497657,
"grad_norm": 0.7944602370262146,
"learning_rate": 5.608619871008166e-06,
"loss": 0.0314,
"step": 1770
},
{
"epoch": 3.3261480787253985,
"grad_norm": 0.9112783074378967,
"learning_rate": 5.553056427848136e-06,
"loss": 0.0305,
"step": 1775
},
{
"epoch": 3.3355201499531395,
"grad_norm": 0.9411343336105347,
"learning_rate": 5.497663593510693e-06,
"loss": 0.0362,
"step": 1780
},
{
"epoch": 3.344892221180881,
"grad_norm": 0.9458235502243042,
"learning_rate": 5.442443493164753e-06,
"loss": 0.0311,
"step": 1785
},
{
"epoch": 3.3542642924086223,
"grad_norm": 0.9986944794654846,
"learning_rate": 5.387398245352213e-06,
"loss": 0.0346,
"step": 1790
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.8632819652557373,
"learning_rate": 5.332529961906699e-06,
"loss": 0.0322,
"step": 1795
},
{
"epoch": 3.373008434864105,
"grad_norm": 0.8336763978004456,
"learning_rate": 5.277840747872509e-06,
"loss": 0.0343,
"step": 1800
},
{
"epoch": 3.373008434864105,
"eval_loss": 0.9443374872207642,
"eval_runtime": 111.5074,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1800
},
{
"epoch": 3.382380506091846,
"grad_norm": 0.7421078085899353,
"learning_rate": 5.223332701423875e-06,
"loss": 0.0299,
"step": 1805
},
{
"epoch": 3.3917525773195876,
"grad_norm": 0.7075040340423584,
"learning_rate": 5.169007913784462e-06,
"loss": 0.0333,
"step": 1810
},
{
"epoch": 3.401124648547329,
"grad_norm": 0.8889288306236267,
"learning_rate": 5.11486846914713e-06,
"loss": 0.033,
"step": 1815
},
{
"epoch": 3.4104967197750704,
"grad_norm": 1.1044409275054932,
"learning_rate": 5.060916444593985e-06,
"loss": 0.0353,
"step": 1820
},
{
"epoch": 3.419868791002812,
"grad_norm": 0.9357883334159851,
"learning_rate": 5.00715391001668e-06,
"loss": 0.0304,
"step": 1825
},
{
"epoch": 3.429240862230553,
"grad_norm": 0.9663400650024414,
"learning_rate": 4.953582928037005e-06,
"loss": 0.0332,
"step": 1830
},
{
"epoch": 3.438612933458294,
"grad_norm": 1.0516884326934814,
"learning_rate": 4.900205553927761e-06,
"loss": 0.035,
"step": 1835
},
{
"epoch": 3.4479850046860356,
"grad_norm": 1.041757345199585,
"learning_rate": 4.847023835533903e-06,
"loss": 0.0315,
"step": 1840
},
{
"epoch": 3.457357075913777,
"grad_norm": 0.8891613483428955,
"learning_rate": 4.794039813193967e-06,
"loss": 0.0326,
"step": 1845
},
{
"epoch": 3.4667291471415185,
"grad_norm": 0.9261044859886169,
"learning_rate": 4.741255519661806e-06,
"loss": 0.0304,
"step": 1850
},
{
"epoch": 3.4761012183692594,
"grad_norm": 1.3144643306732178,
"learning_rate": 4.68867298002859e-06,
"loss": 0.0354,
"step": 1855
},
{
"epoch": 3.485473289597001,
"grad_norm": 0.8868503570556641,
"learning_rate": 4.6362942116451226e-06,
"loss": 0.0304,
"step": 1860
},
{
"epoch": 3.4948453608247423,
"grad_norm": 0.9837562441825867,
"learning_rate": 4.5841212240444334e-06,
"loss": 0.032,
"step": 1865
},
{
"epoch": 3.5042174320524837,
"grad_norm": 0.8227118253707886,
"learning_rate": 4.532156018864692e-06,
"loss": 0.0307,
"step": 1870
},
{
"epoch": 3.513589503280225,
"grad_norm": 0.7651123404502869,
"learning_rate": 4.480400589772409e-06,
"loss": 0.0264,
"step": 1875
},
{
"epoch": 3.522961574507966,
"grad_norm": 0.9286547899246216,
"learning_rate": 4.428856922385942e-06,
"loss": 0.0285,
"step": 1880
},
{
"epoch": 3.5323336457357075,
"grad_norm": 0.9905438423156738,
"learning_rate": 4.37752699419934e-06,
"loss": 0.0337,
"step": 1885
},
{
"epoch": 3.541705716963449,
"grad_norm": 0.914618194103241,
"learning_rate": 4.326412774506444e-06,
"loss": 0.0287,
"step": 1890
},
{
"epoch": 3.5510777881911904,
"grad_norm": 0.8570281863212585,
"learning_rate": 4.275516224325356e-06,
"loss": 0.0319,
"step": 1895
},
{
"epoch": 3.5604498594189318,
"grad_norm": 0.8986263871192932,
"learning_rate": 4.224839296323196e-06,
"loss": 0.0322,
"step": 1900
},
{
"epoch": 3.5604498594189318,
"eval_loss": 0.9526164531707764,
"eval_runtime": 111.507,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 1900
},
{
"epoch": 3.5698219306466727,
"grad_norm": 1.0641896724700928,
"learning_rate": 4.1743839347411875e-06,
"loss": 0.0317,
"step": 1905
},
{
"epoch": 3.579194001874414,
"grad_norm": 1.0256502628326416,
"learning_rate": 4.124152075320071e-06,
"loss": 0.0346,
"step": 1910
},
{
"epoch": 3.5885660731021556,
"grad_norm": 0.8067216277122498,
"learning_rate": 4.074145645225831e-06,
"loss": 0.0302,
"step": 1915
},
{
"epoch": 3.597938144329897,
"grad_norm": 0.9786953926086426,
"learning_rate": 4.0243665629757654e-06,
"loss": 0.0362,
"step": 1920
},
{
"epoch": 3.6073102155576384,
"grad_norm": 0.8346753716468811,
"learning_rate": 3.974816738364875e-06,
"loss": 0.0309,
"step": 1925
},
{
"epoch": 3.6166822867853794,
"grad_norm": 0.7229898571968079,
"learning_rate": 3.9254980723926e-06,
"loss": 0.03,
"step": 1930
},
{
"epoch": 3.626054358013121,
"grad_norm": 0.9483016729354858,
"learning_rate": 3.876412457189883e-06,
"loss": 0.032,
"step": 1935
},
{
"epoch": 3.6354264292408622,
"grad_norm": 0.9327901601791382,
"learning_rate": 3.8275617759465775e-06,
"loss": 0.0323,
"step": 1940
},
{
"epoch": 3.6447985004686037,
"grad_norm": 0.8537086844444275,
"learning_rate": 3.7789479028392007e-06,
"loss": 0.029,
"step": 1945
},
{
"epoch": 3.654170571696345,
"grad_norm": 0.891110360622406,
"learning_rate": 3.7305727029590245e-06,
"loss": 0.0342,
"step": 1950
},
{
"epoch": 3.663542642924086,
"grad_norm": 0.8868283629417419,
"learning_rate": 3.6824380322405273e-06,
"loss": 0.0315,
"step": 1955
},
{
"epoch": 3.6729147141518275,
"grad_norm": 0.9474219679832458,
"learning_rate": 3.6345457373901848e-06,
"loss": 0.0302,
"step": 1960
},
{
"epoch": 3.682286785379569,
"grad_norm": 0.9067096710205078,
"learning_rate": 3.5868976558156254e-06,
"loss": 0.0291,
"step": 1965
},
{
"epoch": 3.6916588566073103,
"grad_norm": 0.8193556070327759,
"learning_rate": 3.5394956155551285e-06,
"loss": 0.0309,
"step": 1970
},
{
"epoch": 3.7010309278350517,
"grad_norm": 0.8624306321144104,
"learning_rate": 3.492341435207509e-06,
"loss": 0.0312,
"step": 1975
},
{
"epoch": 3.7104029990627927,
"grad_norm": 0.7553118467330933,
"learning_rate": 3.445436923862322e-06,
"loss": 0.0298,
"step": 1980
},
{
"epoch": 3.719775070290534,
"grad_norm": 0.8075463175773621,
"learning_rate": 3.3987838810304752e-06,
"loss": 0.0297,
"step": 1985
},
{
"epoch": 3.7291471415182755,
"grad_norm": 1.0225906372070312,
"learning_rate": 3.3523840965751788e-06,
"loss": 0.032,
"step": 1990
},
{
"epoch": 3.738519212746017,
"grad_norm": 0.8977119326591492,
"learning_rate": 3.3062393506432843e-06,
"loss": 0.0705,
"step": 1995
},
{
"epoch": 3.7478912839737584,
"grad_norm": 0.8516520857810974,
"learning_rate": 3.2603514135969837e-06,
"loss": 0.0299,
"step": 2000
},
{
"epoch": 3.7478912839737584,
"eval_loss": 0.967979371547699,
"eval_runtime": 111.5017,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 2000
},
{
"epoch": 3.7572633552014993,
"grad_norm": 0.9143263697624207,
"learning_rate": 3.214722045945895e-06,
"loss": 0.0295,
"step": 2005
},
{
"epoch": 3.7666354264292408,
"grad_norm": 0.8708062767982483,
"learning_rate": 3.1693529982795036e-06,
"loss": 0.0281,
"step": 2010
},
{
"epoch": 3.776007497656982,
"grad_norm": 0.9132674932479858,
"learning_rate": 3.124246011200018e-06,
"loss": 0.0301,
"step": 2015
},
{
"epoch": 3.7853795688847236,
"grad_norm": 0.9853923916816711,
"learning_rate": 3.079402815255591e-06,
"loss": 0.0313,
"step": 2020
},
{
"epoch": 3.794751640112465,
"grad_norm": 1.0308923721313477,
"learning_rate": 3.0348251308739106e-06,
"loss": 0.032,
"step": 2025
},
{
"epoch": 3.804123711340206,
"grad_norm": 0.7933114767074585,
"learning_rate": 2.9905146682962073e-06,
"loss": 0.0311,
"step": 2030
},
{
"epoch": 3.8134957825679474,
"grad_norm": 0.8838526606559753,
"learning_rate": 2.9464731275116355e-06,
"loss": 0.0325,
"step": 2035
},
{
"epoch": 3.822867853795689,
"grad_norm": 0.8747525811195374,
"learning_rate": 2.9027021981920566e-06,
"loss": 0.0314,
"step": 2040
},
{
"epoch": 3.8322399250234302,
"grad_norm": 0.7285995483398438,
"learning_rate": 2.8592035596272118e-06,
"loss": 0.0294,
"step": 2045
},
{
"epoch": 3.8416119962511717,
"grad_norm": 0.8272311091423035,
"learning_rate": 2.8159788806602904e-06,
"loss": 0.0318,
"step": 2050
},
{
"epoch": 3.8509840674789126,
"grad_norm": 0.7552247047424316,
"learning_rate": 2.773029819623916e-06,
"loss": 0.03,
"step": 2055
},
{
"epoch": 3.860356138706654,
"grad_norm": 0.9183073043823242,
"learning_rate": 2.730358024276509e-06,
"loss": 0.0314,
"step": 2060
},
{
"epoch": 3.8697282099343955,
"grad_norm": 0.8467240333557129,
"learning_rate": 2.6879651317390864e-06,
"loss": 0.0256,
"step": 2065
},
{
"epoch": 3.879100281162137,
"grad_norm": 0.850248396396637,
"learning_rate": 2.6458527684324376e-06,
"loss": 0.0299,
"step": 2070
},
{
"epoch": 3.8884723523898783,
"grad_norm": 0.7223458290100098,
"learning_rate": 2.6040225500147365e-06,
"loss": 0.0305,
"step": 2075
},
{
"epoch": 3.8978444236176193,
"grad_norm": 0.8155651092529297,
"learning_rate": 2.5624760813195436e-06,
"loss": 0.0298,
"step": 2080
},
{
"epoch": 3.9072164948453607,
"grad_norm": 0.7251290082931519,
"learning_rate": 2.5212149562942535e-06,
"loss": 0.0276,
"step": 2085
},
{
"epoch": 3.916588566073102,
"grad_norm": 1.1165629625320435,
"learning_rate": 2.48024075793893e-06,
"loss": 0.0309,
"step": 2090
},
{
"epoch": 3.9259606373008435,
"grad_norm": 1.0103236436843872,
"learning_rate": 2.4395550582455774e-06,
"loss": 0.0277,
"step": 2095
},
{
"epoch": 3.935332708528585,
"grad_norm": 0.912944495677948,
"learning_rate": 2.3991594181378286e-06,
"loss": 0.0335,
"step": 2100
},
{
"epoch": 3.935332708528585,
"eval_loss": 0.9605706930160522,
"eval_runtime": 111.5358,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 2.241,
"step": 2100
},
{
"epoch": 3.944704779756326,
"grad_norm": 0.925261378288269,
"learning_rate": 2.359055387411061e-06,
"loss": 0.0311,
"step": 2105
},
{
"epoch": 3.9540768509840674,
"grad_norm": 0.9867929220199585,
"learning_rate": 2.319244504672943e-06,
"loss": 0.0306,
"step": 2110
},
{
"epoch": 3.963448922211809,
"grad_norm": 0.9533296227455139,
"learning_rate": 2.279728297284394e-06,
"loss": 0.0309,
"step": 2115
},
{
"epoch": 3.97282099343955,
"grad_norm": 0.8042296171188354,
"learning_rate": 2.2405082813009926e-06,
"loss": 0.0257,
"step": 2120
},
{
"epoch": 3.9821930646672916,
"grad_norm": 0.8513698577880859,
"learning_rate": 2.201585961414815e-06,
"loss": 0.0277,
"step": 2125
},
{
"epoch": 3.9915651358950326,
"grad_norm": 0.8996440768241882,
"learning_rate": 2.1629628308967e-06,
"loss": 0.0309,
"step": 2130
},
{
"epoch": 4.0,
"grad_norm": 1.3045603036880493,
"learning_rate": 2.1246403715389675e-06,
"loss": 0.0307,
"step": 2135
},
{
"epoch": 4.009372071227741,
"grad_norm": 0.5760667324066162,
"learning_rate": 2.0866200535985616e-06,
"loss": 0.0104,
"step": 2140
},
{
"epoch": 4.018744142455483,
"grad_norm": 0.32251426577568054,
"learning_rate": 2.0489033357406464e-06,
"loss": 0.0091,
"step": 2145
},
{
"epoch": 4.028116213683224,
"grad_norm": 0.3890618681907654,
"learning_rate": 2.011491664982644e-06,
"loss": 0.0093,
"step": 2150
},
{
"epoch": 4.037488284910966,
"grad_norm": 0.4246854782104492,
"learning_rate": 1.9743864766387198e-06,
"loss": 0.0094,
"step": 2155
},
{
"epoch": 4.046860356138707,
"grad_norm": 0.37308433651924133,
"learning_rate": 1.937589194264715e-06,
"loss": 0.0083,
"step": 2160
},
{
"epoch": 4.056232427366448,
"grad_norm": 0.29468032717704773,
"learning_rate": 1.9011012296035303e-06,
"loss": 0.0072,
"step": 2165
},
{
"epoch": 4.0656044985941895,
"grad_norm": 0.49253249168395996,
"learning_rate": 1.864923982530965e-06,
"loss": 0.0078,
"step": 2170
},
{
"epoch": 4.0749765698219305,
"grad_norm": 0.5254181623458862,
"learning_rate": 1.8290588410020116e-06,
"loss": 0.0078,
"step": 2175
},
{
"epoch": 4.084348641049672,
"grad_norm": 0.3478500247001648,
"learning_rate": 1.7935071809976035e-06,
"loss": 0.0075,
"step": 2180
},
{
"epoch": 4.093720712277413,
"grad_norm": 0.3770616352558136,
"learning_rate": 1.7582703664718247e-06,
"loss": 0.0082,
"step": 2185
},
{
"epoch": 4.103092783505154,
"grad_norm": 0.349509596824646,
"learning_rate": 1.7233497492995865e-06,
"loss": 0.0069,
"step": 2190
},
{
"epoch": 4.112464854732896,
"grad_norm": 0.43029990792274475,
"learning_rate": 1.6887466692247556e-06,
"loss": 0.0077,
"step": 2195
},
{
"epoch": 4.121836925960637,
"grad_norm": 0.6748161911964417,
"learning_rate": 1.654462453808755e-06,
"loss": 0.0073,
"step": 2200
},
{
"epoch": 4.121836925960637,
"eval_loss": 1.0975761413574219,
"eval_runtime": 111.5036,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 2200
},
{
"epoch": 4.131208997188379,
"grad_norm": 0.6008536219596863,
"learning_rate": 1.6204984183796425e-06,
"loss": 0.0079,
"step": 2205
},
{
"epoch": 4.14058106841612,
"grad_norm": 0.4357309937477112,
"learning_rate": 1.5868558659816302e-06,
"loss": 0.0082,
"step": 2210
},
{
"epoch": 4.149953139643861,
"grad_norm": 0.4295269250869751,
"learning_rate": 1.5535360873251026e-06,
"loss": 0.008,
"step": 2215
},
{
"epoch": 4.159325210871603,
"grad_norm": 0.3729182183742523,
"learning_rate": 1.5205403607370984e-06,
"loss": 0.0071,
"step": 2220
},
{
"epoch": 4.168697282099344,
"grad_norm": 0.5101849436759949,
"learning_rate": 1.4878699521122654e-06,
"loss": 0.0081,
"step": 2225
},
{
"epoch": 4.178069353327086,
"grad_norm": 0.5576186776161194,
"learning_rate": 1.4555261148642929e-06,
"loss": 0.0088,
"step": 2230
},
{
"epoch": 4.187441424554827,
"grad_norm": 0.39585602283477783,
"learning_rate": 1.423510089877823e-06,
"loss": 0.0078,
"step": 2235
},
{
"epoch": 4.196813495782568,
"grad_norm": 0.45328739285469055,
"learning_rate": 1.3918231054608499e-06,
"loss": 0.0077,
"step": 2240
},
{
"epoch": 4.206185567010309,
"grad_norm": 0.45810526609420776,
"learning_rate": 1.3604663772975856e-06,
"loss": 0.0093,
"step": 2245
},
{
"epoch": 4.21555763823805,
"grad_norm": 0.4543026089668274,
"learning_rate": 1.3294411084018277e-06,
"loss": 0.007,
"step": 2250
},
{
"epoch": 4.224929709465792,
"grad_norm": 1.054495930671692,
"learning_rate": 1.2987484890708024e-06,
"loss": 0.0087,
"step": 2255
},
{
"epoch": 4.234301780693533,
"grad_norm": 0.5703629851341248,
"learning_rate": 1.268389696839497e-06,
"loss": 0.008,
"step": 2260
},
{
"epoch": 4.243673851921274,
"grad_norm": 0.41296708583831787,
"learning_rate": 1.2383658964354861e-06,
"loss": 0.006,
"step": 2265
},
{
"epoch": 4.253045923149016,
"grad_norm": 0.6897146701812744,
"learning_rate": 1.2086782397342445e-06,
"loss": 0.0076,
"step": 2270
},
{
"epoch": 4.262417994376757,
"grad_norm": 0.39745044708251953,
"learning_rate": 1.1793278657149532e-06,
"loss": 0.0084,
"step": 2275
},
{
"epoch": 4.271790065604499,
"grad_norm": 0.6803708672523499,
"learning_rate": 1.1503159004168074e-06,
"loss": 0.0063,
"step": 2280
},
{
"epoch": 4.28116213683224,
"grad_norm": 0.49779579043388367,
"learning_rate": 1.12164345689581e-06,
"loss": 0.0077,
"step": 2285
},
{
"epoch": 4.290534208059981,
"grad_norm": 0.42171531915664673,
"learning_rate": 1.0933116351820695e-06,
"loss": 0.0074,
"step": 2290
},
{
"epoch": 4.299906279287723,
"grad_norm": 0.41160067915916443,
"learning_rate": 1.0653215222376045e-06,
"loss": 0.0068,
"step": 2295
},
{
"epoch": 4.309278350515464,
"grad_norm": 0.4333638548851013,
"learning_rate": 1.0376741919146305e-06,
"loss": 0.0069,
"step": 2300
},
{
"epoch": 4.309278350515464,
"eval_loss": 1.1144713163375854,
"eval_runtime": 111.5268,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 2.242,
"step": 2300
},
{
"epoch": 4.318650421743206,
"grad_norm": 0.621540367603302,
"learning_rate": 1.0103707049143673e-06,
"loss": 0.008,
"step": 2305
},
{
"epoch": 4.3280224929709465,
"grad_norm": 0.3928787112236023,
"learning_rate": 9.834121087463445e-07,
"loss": 0.0068,
"step": 2310
},
{
"epoch": 4.3373945641986875,
"grad_norm": 0.4444401264190674,
"learning_rate": 9.56799437688214e-07,
"loss": 0.0076,
"step": 2315
},
{
"epoch": 4.346766635426429,
"grad_norm": 0.4709712266921997,
"learning_rate": 9.305337127460678e-07,
"loss": 0.0064,
"step": 2320
},
{
"epoch": 4.35613870665417,
"grad_norm": 0.6003327369689941,
"learning_rate": 9.046159416152633e-07,
"loss": 0.007,
"step": 2325
},
{
"epoch": 4.365510777881912,
"grad_norm": 0.3838503360748291,
"learning_rate": 8.790471186417715e-07,
"loss": 0.0076,
"step": 2330
},
{
"epoch": 4.374882849109653,
"grad_norm": 0.5418089032173157,
"learning_rate": 8.538282247840201e-07,
"loss": 0.0072,
"step": 2335
},
{
"epoch": 4.384254920337394,
"grad_norm": 0.7511455416679382,
"learning_rate": 8.289602275752673e-07,
"loss": 0.009,
"step": 2340
},
{
"epoch": 4.393626991565136,
"grad_norm": 0.5192817449569702,
"learning_rate": 8.044440810864718e-07,
"loss": 0.0081,
"step": 2345
},
{
"epoch": 4.402999062792877,
"grad_norm": 0.6360767483711243,
"learning_rate": 7.80280725889696e-07,
"loss": 0.0079,
"step": 2350
},
{
"epoch": 4.412371134020619,
"grad_norm": 0.5308467149734497,
"learning_rate": 7.564710890220183e-07,
"loss": 0.0083,
"step": 2355
},
{
"epoch": 4.42174320524836,
"grad_norm": 0.4319888949394226,
"learning_rate": 7.3301608394997e-07,
"loss": 0.0079,
"step": 2360
},
{
"epoch": 4.431115276476101,
"grad_norm": 0.46917620301246643,
"learning_rate": 7.099166105344835e-07,
"loss": 0.0064,
"step": 2365
},
{
"epoch": 4.440487347703843,
"grad_norm": 0.455216646194458,
"learning_rate": 6.871735549963765e-07,
"loss": 0.007,
"step": 2370
},
{
"epoch": 4.449859418931584,
"grad_norm": 0.40280669927597046,
"learning_rate": 6.647877898823463e-07,
"loss": 0.0068,
"step": 2375
},
{
"epoch": 4.4592314901593255,
"grad_norm": 0.32350170612335205,
"learning_rate": 6.427601740314926e-07,
"loss": 0.0077,
"step": 2380
},
{
"epoch": 4.4686035613870665,
"grad_norm": 0.30398938059806824,
"learning_rate": 6.2109155254238e-07,
"loss": 0.0068,
"step": 2385
},
{
"epoch": 4.4779756326148075,
"grad_norm": 0.5104652047157288,
"learning_rate": 5.997827567405978e-07,
"loss": 0.0069,
"step": 2390
},
{
"epoch": 4.487347703842549,
"grad_norm": 0.4495840072631836,
"learning_rate": 5.788346041468796e-07,
"loss": 0.0065,
"step": 2395
},
{
"epoch": 4.49671977507029,
"grad_norm": 0.3475983440876007,
"learning_rate": 5.582478984457284e-07,
"loss": 0.0064,
"step": 2400
},
{
"epoch": 4.49671977507029,
"eval_loss": 1.1217763423919678,
"eval_runtime": 111.5486,
"eval_samples_per_second": 4.482,
"eval_steps_per_second": 2.241,
"step": 2400
},
{
"epoch": 4.506091846298032,
"grad_norm": 0.46389687061309814,
"learning_rate": 5.380234294545938e-07,
"loss": 0.0071,
"step": 2405
},
{
"epoch": 4.515463917525773,
"grad_norm": 0.3474023640155792,
"learning_rate": 5.181619730935617e-07,
"loss": 0.0067,
"step": 2410
},
{
"epoch": 4.524835988753514,
"grad_norm": 0.3991861045360565,
"learning_rate": 4.986642913555895e-07,
"loss": 0.0068,
"step": 2415
},
{
"epoch": 4.534208059981256,
"grad_norm": 0.4194345772266388,
"learning_rate": 4.795311322772722e-07,
"loss": 0.0077,
"step": 2420
},
{
"epoch": 4.543580131208997,
"grad_norm": 0.34731411933898926,
"learning_rate": 4.6076322991013946e-07,
"loss": 0.0063,
"step": 2425
},
{
"epoch": 4.552952202436739,
"grad_norm": 0.7513842582702637,
"learning_rate": 4.4236130429250347e-07,
"loss": 0.007,
"step": 2430
},
{
"epoch": 4.56232427366448,
"grad_norm": 0.35471469163894653,
"learning_rate": 4.2432606142182145e-07,
"loss": 0.0071,
"step": 2435
},
{
"epoch": 4.571696344892221,
"grad_norm": 0.3158963918685913,
"learning_rate": 4.06658193227617e-07,
"loss": 0.008,
"step": 2440
},
{
"epoch": 4.581068416119963,
"grad_norm": 0.510502815246582,
"learning_rate": 3.8935837754493497e-07,
"loss": 0.0083,
"step": 2445
},
{
"epoch": 4.590440487347704,
"grad_norm": 0.5745358467102051,
"learning_rate": 3.72427278088332e-07,
"loss": 0.0075,
"step": 2450
},
{
"epoch": 4.5998125585754455,
"grad_norm": 0.48121458292007446,
"learning_rate": 3.5586554442641587e-07,
"loss": 0.0081,
"step": 2455
},
{
"epoch": 4.609184629803186,
"grad_norm": 0.4651750922203064,
"learning_rate": 3.3967381195692317e-07,
"loss": 0.0069,
"step": 2460
},
{
"epoch": 4.618556701030927,
"grad_norm": 0.4792514443397522,
"learning_rate": 3.238527018823423e-07,
"loss": 0.0081,
"step": 2465
},
{
"epoch": 4.627928772258669,
"grad_norm": 0.4478175640106201,
"learning_rate": 3.08402821186079e-07,
"loss": 0.0063,
"step": 2470
},
{
"epoch": 4.63730084348641,
"grad_norm": 0.3196679949760437,
"learning_rate": 2.933247626091751e-07,
"loss": 0.0068,
"step": 2475
},
{
"epoch": 4.646672914714152,
"grad_norm": 0.5067555904388428,
"learning_rate": 2.786191046275588e-07,
"loss": 0.0076,
"step": 2480
},
{
"epoch": 4.656044985941893,
"grad_norm": 0.5797865986824036,
"learning_rate": 2.6428641142986043e-07,
"loss": 0.009,
"step": 2485
},
{
"epoch": 4.665417057169634,
"grad_norm": 0.5033183693885803,
"learning_rate": 2.503272328957584e-07,
"loss": 0.0078,
"step": 2490
},
{
"epoch": 4.674789128397376,
"grad_norm": 0.30220600962638855,
"learning_rate": 2.367421045748908e-07,
"loss": 0.007,
"step": 2495
},
{
"epoch": 4.684161199625117,
"grad_norm": 0.5532141923904419,
"learning_rate": 2.2353154766630358e-07,
"loss": 0.0086,
"step": 2500
},
{
"epoch": 4.684161199625117,
"eval_loss": 1.1228344440460205,
"eval_runtime": 111.5053,
"eval_samples_per_second": 4.484,
"eval_steps_per_second": 2.242,
"step": 2500
},
{
"epoch": 4.693533270852859,
"grad_norm": 0.4479539692401886,
"learning_rate": 2.1069606899845497e-07,
"loss": 0.0077,
"step": 2505
},
{
"epoch": 4.7029053420806,
"grad_norm": 0.4743359386920929,
"learning_rate": 1.9823616100977495e-07,
"loss": 0.0081,
"step": 2510
},
{
"epoch": 4.712277413308341,
"grad_norm": 0.38026347756385803,
"learning_rate": 1.8615230172976507e-07,
"loss": 0.0065,
"step": 2515
},
{
"epoch": 4.721649484536083,
"grad_norm": 0.5804769396781921,
"learning_rate": 1.744449547606697e-07,
"loss": 0.0092,
"step": 2520
},
{
"epoch": 4.7310215557638235,
"grad_norm": 0.5354004502296448,
"learning_rate": 1.6311456925967583e-07,
"loss": 0.0074,
"step": 2525
},
{
"epoch": 4.740393626991565,
"grad_norm": 0.6035090088844299,
"learning_rate": 1.5216157992169577e-07,
"loss": 0.0067,
"step": 2530
},
{
"epoch": 4.749765698219306,
"grad_norm": 0.5137022733688354,
"learning_rate": 1.41586406962676e-07,
"loss": 0.0075,
"step": 2535
},
{
"epoch": 4.759137769447047,
"grad_norm": 0.2721659541130066,
"learning_rate": 1.3138945610348564e-07,
"loss": 0.0072,
"step": 2540
},
{
"epoch": 4.768509840674789,
"grad_norm": 0.4901478886604309,
"learning_rate": 1.2157111855434667e-07,
"loss": 0.0065,
"step": 2545
},
{
"epoch": 4.77788191190253,
"grad_norm": 0.2981049716472626,
"learning_rate": 1.1213177099982376e-07,
"loss": 0.0069,
"step": 2550
},
{
"epoch": 4.787253983130272,
"grad_norm": 0.49158474802970886,
"learning_rate": 1.0307177558437686e-07,
"loss": 0.0082,
"step": 2555
},
{
"epoch": 4.796626054358013,
"grad_norm": 0.6860193610191345,
"learning_rate": 9.439147989846354e-08,
"loss": 0.0081,
"step": 2560
},
{
"epoch": 4.805998125585754,
"grad_norm": 0.7087129354476929,
"learning_rate": 8.609121696520283e-08,
"loss": 0.0084,
"step": 2565
},
{
"epoch": 4.815370196813496,
"grad_norm": 0.727730929851532,
"learning_rate": 7.817130522760452e-08,
"loss": 0.0334,
"step": 2570
},
{
"epoch": 4.824742268041237,
"grad_norm": 0.4352070391178131,
"learning_rate": 7.063204853634543e-08,
"loss": 0.0076,
"step": 2575
},
{
"epoch": 4.834114339268979,
"grad_norm": 0.3776610791683197,
"learning_rate": 6.347373613811325e-08,
"loss": 0.0059,
"step": 2580
},
{
"epoch": 4.84348641049672,
"grad_norm": 0.5180082321166992,
"learning_rate": 5.6696642664515465e-08,
"loss": 0.0081,
"step": 2585
},
{
"epoch": 4.852858481724461,
"grad_norm": 0.49723920226097107,
"learning_rate": 5.030102812153548e-08,
"loss": 0.0081,
"step": 2590
},
{
"epoch": 4.8622305529522025,
"grad_norm": 0.2777559161186218,
"learning_rate": 4.428713787955841e-08,
"loss": 0.007,
"step": 2595
},
{
"epoch": 4.8716026241799435,
"grad_norm": 0.44526979327201843,
"learning_rate": 3.865520266396416e-08,
"loss": 0.0072,
"step": 2600
},
{
"epoch": 4.8716026241799435,
"eval_loss": 1.1233325004577637,
"eval_runtime": 111.5373,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 2.241,
"step": 2600
},
{
"epoch": 4.880974695407685,
"grad_norm": 0.49195200204849243,
"learning_rate": 3.340543854626566e-08,
"loss": 0.0081,
"step": 2605
},
{
"epoch": 4.890346766635426,
"grad_norm": 0.33215323090553284,
"learning_rate": 2.8538046935828733e-08,
"loss": 0.0069,
"step": 2610
},
{
"epoch": 4.899718837863167,
"grad_norm": 0.43431356549263,
"learning_rate": 2.4053214572137274e-08,
"loss": 0.0066,
"step": 2615
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.49866101145744324,
"learning_rate": 1.9951113517633346e-08,
"loss": 0.007,
"step": 2620
},
{
"epoch": 4.91846298031865,
"grad_norm": 0.5170955657958984,
"learning_rate": 1.6231901151113617e-08,
"loss": 0.0083,
"step": 2625
},
{
"epoch": 4.927835051546392,
"grad_norm": 0.3568389117717743,
"learning_rate": 1.2895720161693048e-08,
"loss": 0.0073,
"step": 2630
},
{
"epoch": 4.937207122774133,
"grad_norm": 0.4371294379234314,
"learning_rate": 9.942698543330409e-09,
"loss": 0.008,
"step": 2635
},
{
"epoch": 4.946579194001874,
"grad_norm": 0.6057606935501099,
"learning_rate": 7.372949589916633e-09,
"loss": 0.0078,
"step": 2640
},
{
"epoch": 4.955951265229616,
"grad_norm": 0.44616734981536865,
"learning_rate": 5.186571890929415e-09,
"loss": 0.0079,
"step": 2645
},
{
"epoch": 4.965323336457357,
"grad_norm": 0.5187500715255737,
"learning_rate": 3.383649327650673e-09,
"loss": 0.0082,
"step": 2650
},
{
"epoch": 4.974695407685099,
"grad_norm": 0.3702596426010132,
"learning_rate": 1.9642510699469096e-09,
"loss": 0.0074,
"step": 2655
},
{
"epoch": 4.98406747891284,
"grad_norm": 0.38890424370765686,
"learning_rate": 9.284315736168837e-10,
"loss": 0.0068,
"step": 2660
},
{
"epoch": 4.993439550140581,
"grad_norm": 0.40768128633499146,
"learning_rate": 2.762305783021724e-10,
"loss": 0.0079,
"step": 2665
},
{
"epoch": 5.0,
"step": 2669,
"total_flos": 3.0080813400754176e+18,
"train_loss": 0.12334943315905438,
"train_runtime": 40705.595,
"train_samples_per_second": 2.097,
"train_steps_per_second": 0.066
}
],
"logging_steps": 5,
"max_steps": 2670,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0080813400754176e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}