Files
Llama3.2-3B_Paper_Impact_SFT/trainer_state.json
ModelHub XC 94a68644cd 初始化项目,由ModelHub XC社区提供模型
Model: FlyPig23/Llama3.2-3B_Paper_Impact_SFT
Source: Original Platform
2026-04-13 17:33:03 +08:00

2981 lines
73 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2076,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007228044813877846,
"grad_norm": 2.43762469291687,
"learning_rate": 3.846153846153847e-07,
"loss": 0.2014,
"step": 5
},
{
"epoch": 0.014456089627755691,
"grad_norm": 0.7071607112884521,
"learning_rate": 8.653846153846154e-07,
"loss": 0.1441,
"step": 10
},
{
"epoch": 0.02168413444163354,
"grad_norm": 0.25739526748657227,
"learning_rate": 1.3461538461538462e-06,
"loss": 0.0935,
"step": 15
},
{
"epoch": 0.028912179255511383,
"grad_norm": 0.1548013836145401,
"learning_rate": 1.826923076923077e-06,
"loss": 0.0841,
"step": 20
},
{
"epoch": 0.03614022406938923,
"grad_norm": 0.1367483139038086,
"learning_rate": 2.307692307692308e-06,
"loss": 0.0844,
"step": 25
},
{
"epoch": 0.04336826888326708,
"grad_norm": 0.012530342675745487,
"learning_rate": 2.7884615384615386e-06,
"loss": 0.0809,
"step": 30
},
{
"epoch": 0.05059631369714492,
"grad_norm": 0.049705736339092255,
"learning_rate": 3.2692307692307696e-06,
"loss": 0.0819,
"step": 35
},
{
"epoch": 0.057824358511022765,
"grad_norm": 0.051718585193157196,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0803,
"step": 40
},
{
"epoch": 0.06505240332490062,
"grad_norm": 0.01605582982301712,
"learning_rate": 4.230769230769231e-06,
"loss": 0.0813,
"step": 45
},
{
"epoch": 0.07228044813877846,
"grad_norm": 0.06171448528766632,
"learning_rate": 4.711538461538462e-06,
"loss": 0.0801,
"step": 50
},
{
"epoch": 0.0795084929526563,
"grad_norm": 0.05586954951286316,
"learning_rate": 5.192307692307693e-06,
"loss": 0.0811,
"step": 55
},
{
"epoch": 0.08673653776653416,
"grad_norm": 0.028248045593500137,
"learning_rate": 5.6730769230769235e-06,
"loss": 0.0806,
"step": 60
},
{
"epoch": 0.093964582580412,
"grad_norm": 0.04312776029109955,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0808,
"step": 65
},
{
"epoch": 0.10119262739428984,
"grad_norm": 0.10684467852115631,
"learning_rate": 6.6346153846153846e-06,
"loss": 0.0808,
"step": 70
},
{
"epoch": 0.10842067220816769,
"grad_norm": 0.07184753566980362,
"learning_rate": 7.115384615384616e-06,
"loss": 0.0805,
"step": 75
},
{
"epoch": 0.11564871702204553,
"grad_norm": 0.09277470409870148,
"learning_rate": 7.5961538461538465e-06,
"loss": 0.0807,
"step": 80
},
{
"epoch": 0.12287676183592339,
"grad_norm": 0.02109931781888008,
"learning_rate": 8.076923076923077e-06,
"loss": 0.0809,
"step": 85
},
{
"epoch": 0.13010480664980123,
"grad_norm": 0.03073902055621147,
"learning_rate": 8.557692307692308e-06,
"loss": 0.0812,
"step": 90
},
{
"epoch": 0.13733285146367907,
"grad_norm": 0.06802671402692795,
"learning_rate": 9.03846153846154e-06,
"loss": 0.08,
"step": 95
},
{
"epoch": 0.14456089627755692,
"grad_norm": 0.03209488093852997,
"learning_rate": 9.51923076923077e-06,
"loss": 0.08,
"step": 100
},
{
"epoch": 0.15178894109143476,
"grad_norm": 0.07769843190908432,
"learning_rate": 1e-05,
"loss": 0.0817,
"step": 105
},
{
"epoch": 0.1590169859053126,
"grad_norm": 0.06089721992611885,
"learning_rate": 1.0480769230769232e-05,
"loss": 0.0805,
"step": 110
},
{
"epoch": 0.16624503071919045,
"grad_norm": 0.021150365471839905,
"learning_rate": 1.0961538461538464e-05,
"loss": 0.0797,
"step": 115
},
{
"epoch": 0.17347307553306832,
"grad_norm": 0.035523343831300735,
"learning_rate": 1.1442307692307693e-05,
"loss": 0.0808,
"step": 120
},
{
"epoch": 0.18070112034694616,
"grad_norm": 0.019112691283226013,
"learning_rate": 1.1923076923076925e-05,
"loss": 0.0805,
"step": 125
},
{
"epoch": 0.187929165160824,
"grad_norm": 0.23690395057201385,
"learning_rate": 1.2403846153846156e-05,
"loss": 0.0836,
"step": 130
},
{
"epoch": 0.19515720997470185,
"grad_norm": 0.08766212314367294,
"learning_rate": 1.2884615384615386e-05,
"loss": 0.0802,
"step": 135
},
{
"epoch": 0.2023852547885797,
"grad_norm": 0.04037011042237282,
"learning_rate": 1.3365384615384615e-05,
"loss": 0.0811,
"step": 140
},
{
"epoch": 0.20961329960245753,
"grad_norm": 0.15161241590976715,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.0829,
"step": 145
},
{
"epoch": 0.21684134441633537,
"grad_norm": 0.09708157926797867,
"learning_rate": 1.4326923076923078e-05,
"loss": 0.0816,
"step": 150
},
{
"epoch": 0.22406938923021322,
"grad_norm": 0.09547246992588043,
"learning_rate": 1.480769230769231e-05,
"loss": 0.0801,
"step": 155
},
{
"epoch": 0.23129743404409106,
"grad_norm": 0.015538723208010197,
"learning_rate": 1.528846153846154e-05,
"loss": 0.0813,
"step": 160
},
{
"epoch": 0.23852547885796893,
"grad_norm": 0.053264446556568146,
"learning_rate": 1.576923076923077e-05,
"loss": 0.0811,
"step": 165
},
{
"epoch": 0.24575352367184677,
"grad_norm": 0.0034629153087735176,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.081,
"step": 170
},
{
"epoch": 0.2529815684857246,
"grad_norm": 0.06591220200061798,
"learning_rate": 1.673076923076923e-05,
"loss": 0.0793,
"step": 175
},
{
"epoch": 0.26020961329960246,
"grad_norm": 0.02226085402071476,
"learning_rate": 1.7211538461538465e-05,
"loss": 0.0811,
"step": 180
},
{
"epoch": 0.2674376581134803,
"grad_norm": 0.031285785138607025,
"learning_rate": 1.7692307692307694e-05,
"loss": 0.0819,
"step": 185
},
{
"epoch": 0.27466570292735815,
"grad_norm": 0.014217695221304893,
"learning_rate": 1.8173076923076924e-05,
"loss": 0.0797,
"step": 190
},
{
"epoch": 0.281893747741236,
"grad_norm": 0.061094146221876144,
"learning_rate": 1.8653846153846157e-05,
"loss": 0.0792,
"step": 195
},
{
"epoch": 0.28912179255511383,
"grad_norm": 0.1535295695066452,
"learning_rate": 1.9134615384615387e-05,
"loss": 0.0948,
"step": 200
},
{
"epoch": 0.2963498373689917,
"grad_norm": 0.15612953901290894,
"learning_rate": 1.9615384615384617e-05,
"loss": 0.0895,
"step": 205
},
{
"epoch": 0.3035778821828695,
"grad_norm": 0.015299557708203793,
"learning_rate": 1.999998585783488e-05,
"loss": 0.0818,
"step": 210
},
{
"epoch": 0.31080592699674736,
"grad_norm": 0.07040851563215256,
"learning_rate": 1.9999490886255767e-05,
"loss": 0.0812,
"step": 215
},
{
"epoch": 0.3180339718106252,
"grad_norm": 0.008683345280587673,
"learning_rate": 1.999828884642042e-05,
"loss": 0.0814,
"step": 220
},
{
"epoch": 0.32526201662450305,
"grad_norm": 0.10729002952575684,
"learning_rate": 1.9996379823325586e-05,
"loss": 0.0806,
"step": 225
},
{
"epoch": 0.3324900614383809,
"grad_norm": 0.07564987987279892,
"learning_rate": 1.9993763951959107e-05,
"loss": 0.0815,
"step": 230
},
{
"epoch": 0.3397181062522588,
"grad_norm": 0.10431886464357376,
"learning_rate": 1.9990441417290358e-05,
"loss": 0.0846,
"step": 235
},
{
"epoch": 0.34694615106613663,
"grad_norm": 0.11196550726890564,
"learning_rate": 1.9986412454257178e-05,
"loss": 0.0849,
"step": 240
},
{
"epoch": 0.3541741958800145,
"grad_norm": 0.04622909799218178,
"learning_rate": 1.998167734774926e-05,
"loss": 0.0821,
"step": 245
},
{
"epoch": 0.3614022406938923,
"grad_norm": 0.06048440560698509,
"learning_rate": 1.9976236432588002e-05,
"loss": 0.0808,
"step": 250
},
{
"epoch": 0.36863028550777016,
"grad_norm": 0.01802447997033596,
"learning_rate": 1.997009009350283e-05,
"loss": 0.0805,
"step": 255
},
{
"epoch": 0.375858330321648,
"grad_norm": 0.04636721312999725,
"learning_rate": 1.996323876510399e-05,
"loss": 0.081,
"step": 260
},
{
"epoch": 0.38308637513552585,
"grad_norm": 0.062323447316884995,
"learning_rate": 1.9955682931851835e-05,
"loss": 0.083,
"step": 265
},
{
"epoch": 0.3903144199494037,
"grad_norm": 0.0894196629524231,
"learning_rate": 1.994742312802255e-05,
"loss": 0.0815,
"step": 270
},
{
"epoch": 0.39754246476328153,
"grad_norm": 0.0037153863813728094,
"learning_rate": 1.993845993767038e-05,
"loss": 0.0801,
"step": 275
},
{
"epoch": 0.4047705095771594,
"grad_norm": 0.14254964888095856,
"learning_rate": 1.9928793994586323e-05,
"loss": 0.0857,
"step": 280
},
{
"epoch": 0.4119985543910372,
"grad_norm": 0.08120843023061752,
"learning_rate": 1.9918425982253335e-05,
"loss": 0.0822,
"step": 285
},
{
"epoch": 0.41922659920491506,
"grad_norm": 0.029054520651698112,
"learning_rate": 1.9907356633797978e-05,
"loss": 0.0813,
"step": 290
},
{
"epoch": 0.4264546440187929,
"grad_norm": 0.03537634015083313,
"learning_rate": 1.9895586731938593e-05,
"loss": 0.0812,
"step": 295
},
{
"epoch": 0.43368268883267075,
"grad_norm": 0.015365133993327618,
"learning_rate": 1.9883117108929947e-05,
"loss": 0.0825,
"step": 300
},
{
"epoch": 0.4409107336465486,
"grad_norm": 0.029979810118675232,
"learning_rate": 1.986994864650439e-05,
"loss": 0.0821,
"step": 305
},
{
"epoch": 0.44813877846042643,
"grad_norm": 0.021777283400297165,
"learning_rate": 1.9856082275809508e-05,
"loss": 0.0817,
"step": 310
},
{
"epoch": 0.4553668232743043,
"grad_norm": 0.005012670066207647,
"learning_rate": 1.9841518977342274e-05,
"loss": 0.081,
"step": 315
},
{
"epoch": 0.4625948680881821,
"grad_norm": 0.07129844278097153,
"learning_rate": 1.9826259780879716e-05,
"loss": 0.081,
"step": 320
},
{
"epoch": 0.46982291290206,
"grad_norm": 0.07923093438148499,
"learning_rate": 1.981030576540612e-05,
"loss": 0.0806,
"step": 325
},
{
"epoch": 0.47705095771593786,
"grad_norm": 0.04692668095231056,
"learning_rate": 1.9793658059036697e-05,
"loss": 0.0799,
"step": 330
},
{
"epoch": 0.4842790025298157,
"grad_norm": 0.04923313483595848,
"learning_rate": 1.977631783893786e-05,
"loss": 0.0813,
"step": 335
},
{
"epoch": 0.49150704734369355,
"grad_norm": 0.04896867647767067,
"learning_rate": 1.975828633124394e-05,
"loss": 0.0805,
"step": 340
},
{
"epoch": 0.4987350921575714,
"grad_norm": 0.004160281270742416,
"learning_rate": 1.9739564810970534e-05,
"loss": 0.0804,
"step": 345
},
{
"epoch": 0.5059631369714492,
"grad_norm": 0.07328899949789047,
"learning_rate": 1.9720154601924295e-05,
"loss": 0.0802,
"step": 350
},
{
"epoch": 0.5131911817853271,
"grad_norm": 0.007687863428145647,
"learning_rate": 1.9700057076609377e-05,
"loss": 0.08,
"step": 355
},
{
"epoch": 0.5204192265992049,
"grad_norm": 0.011853563599288464,
"learning_rate": 1.967927365613034e-05,
"loss": 0.0802,
"step": 360
},
{
"epoch": 0.5276472714130828,
"grad_norm": 0.007507723290473223,
"learning_rate": 1.96578058100917e-05,
"loss": 0.0799,
"step": 365
},
{
"epoch": 0.5348753162269606,
"grad_norm": 0.0316060446202755,
"learning_rate": 1.963565505649398e-05,
"loss": 0.081,
"step": 370
},
{
"epoch": 0.5421033610408384,
"grad_norm": 0.03697923943400383,
"learning_rate": 1.961282296162639e-05,
"loss": 0.0799,
"step": 375
},
{
"epoch": 0.5493314058547163,
"grad_norm": 0.007007018197327852,
"learning_rate": 1.9589311139956086e-05,
"loss": 0.0855,
"step": 380
},
{
"epoch": 0.5565594506685941,
"grad_norm": 0.024093549698591232,
"learning_rate": 1.956512125401398e-05,
"loss": 0.0809,
"step": 385
},
{
"epoch": 0.563787495482472,
"grad_norm": 0.007132918573915958,
"learning_rate": 1.9540255014277198e-05,
"loss": 0.0796,
"step": 390
},
{
"epoch": 0.5710155402963498,
"grad_norm": 0.055107131600379944,
"learning_rate": 1.9514714179048138e-05,
"loss": 0.0792,
"step": 395
},
{
"epoch": 0.5782435851102277,
"grad_norm": 0.036154747009277344,
"learning_rate": 1.9488500554330126e-05,
"loss": 0.0791,
"step": 400
},
{
"epoch": 0.5854716299241055,
"grad_norm": 0.02283984236419201,
"learning_rate": 1.946161599369973e-05,
"loss": 0.0794,
"step": 405
},
{
"epoch": 0.5926996747379834,
"grad_norm": 0.08648855239152908,
"learning_rate": 1.9434062398175667e-05,
"loss": 0.0776,
"step": 410
},
{
"epoch": 0.5999277195518612,
"grad_norm": 0.042574405670166016,
"learning_rate": 1.9405841716084403e-05,
"loss": 0.0772,
"step": 415
},
{
"epoch": 0.607155764365739,
"grad_norm": 0.035125792026519775,
"learning_rate": 1.937695594292238e-05,
"loss": 0.0765,
"step": 420
},
{
"epoch": 0.6143838091796169,
"grad_norm": 0.05990980565547943,
"learning_rate": 1.9347407121214917e-05,
"loss": 0.075,
"step": 425
},
{
"epoch": 0.6216118539934947,
"grad_norm": 0.13196605443954468,
"learning_rate": 1.9317197340371764e-05,
"loss": 0.0706,
"step": 430
},
{
"epoch": 0.6288398988073726,
"grad_norm": 0.04624694585800171,
"learning_rate": 1.9286328736539385e-05,
"loss": 0.0827,
"step": 435
},
{
"epoch": 0.6360679436212504,
"grad_norm": 0.03560846298933029,
"learning_rate": 1.9254803492449894e-05,
"loss": 0.0815,
"step": 440
},
{
"epoch": 0.6432959884351283,
"grad_norm": 0.0264581311494112,
"learning_rate": 1.922262383726672e-05,
"loss": 0.0816,
"step": 445
},
{
"epoch": 0.6505240332490061,
"grad_norm": 0.013737207278609276,
"learning_rate": 1.9189792046426972e-05,
"loss": 0.0795,
"step": 450
},
{
"epoch": 0.6577520780628839,
"grad_norm": 0.008679666556417942,
"learning_rate": 1.9156310441480557e-05,
"loss": 0.0795,
"step": 455
},
{
"epoch": 0.6649801228767618,
"grad_norm": 0.011995796114206314,
"learning_rate": 1.912218138992601e-05,
"loss": 0.0772,
"step": 460
},
{
"epoch": 0.6722081676906397,
"grad_norm": 0.025770675390958786,
"learning_rate": 1.9087407305043085e-05,
"loss": 0.0732,
"step": 465
},
{
"epoch": 0.6794362125045176,
"grad_norm": 0.11803118139505386,
"learning_rate": 1.9051990645722133e-05,
"loss": 0.0694,
"step": 470
},
{
"epoch": 0.6866642573183954,
"grad_norm": 0.0760372206568718,
"learning_rate": 1.9015933916290202e-05,
"loss": 0.0676,
"step": 475
},
{
"epoch": 0.6938923021322733,
"grad_norm": 0.03851527348160744,
"learning_rate": 1.8979239666333975e-05,
"loss": 0.0635,
"step": 480
},
{
"epoch": 0.7011203469461511,
"grad_norm": 0.08836951106786728,
"learning_rate": 1.8941910490519483e-05,
"loss": 0.0636,
"step": 485
},
{
"epoch": 0.708348391760029,
"grad_norm": 0.06829584389925003,
"learning_rate": 1.8903949028408636e-05,
"loss": 0.0648,
"step": 490
},
{
"epoch": 0.7155764365739068,
"grad_norm": 0.02876531518995762,
"learning_rate": 1.8865357964272576e-05,
"loss": 0.062,
"step": 495
},
{
"epoch": 0.7228044813877846,
"grad_norm": 0.039769161492586136,
"learning_rate": 1.8826140026901873e-05,
"loss": 0.0607,
"step": 500
},
{
"epoch": 0.7228044813877846,
"eval_loss": 0.07328393310308456,
"eval_runtime": 1144.2594,
"eval_samples_per_second": 56.111,
"eval_steps_per_second": 1.754,
"step": 500
},
{
"epoch": 0.7300325262016625,
"grad_norm": 0.04059358313679695,
"learning_rate": 1.878629798941357e-05,
"loss": 0.0631,
"step": 505
},
{
"epoch": 0.7372605710155403,
"grad_norm": 0.11944068223237991,
"learning_rate": 1.8745834669055085e-05,
"loss": 0.064,
"step": 510
},
{
"epoch": 0.7444886158294182,
"grad_norm": 0.04337216168642044,
"learning_rate": 1.8704752927005034e-05,
"loss": 0.0618,
"step": 515
},
{
"epoch": 0.751716660643296,
"grad_norm": 0.04100070148706436,
"learning_rate": 1.8663055668170873e-05,
"loss": 0.0613,
"step": 520
},
{
"epoch": 0.7589447054571739,
"grad_norm": 0.0634031817317009,
"learning_rate": 1.8620745840983522e-05,
"loss": 0.0589,
"step": 525
},
{
"epoch": 0.7661727502710517,
"grad_norm": 0.08053874224424362,
"learning_rate": 1.857782643718887e-05,
"loss": 0.0578,
"step": 530
},
{
"epoch": 0.7734007950849295,
"grad_norm": 0.037648145109415054,
"learning_rate": 1.8534300491636225e-05,
"loss": 0.0561,
"step": 535
},
{
"epoch": 0.7806288398988074,
"grad_norm": 0.07604615390300751,
"learning_rate": 1.849017108206372e-05,
"loss": 0.0615,
"step": 540
},
{
"epoch": 0.7878568847126852,
"grad_norm": 0.07877160608768463,
"learning_rate": 1.844544132888068e-05,
"loss": 0.0613,
"step": 545
},
{
"epoch": 0.7950849295265631,
"grad_norm": 0.09688904136419296,
"learning_rate": 1.8400114394947003e-05,
"loss": 0.0546,
"step": 550
},
{
"epoch": 0.8023129743404409,
"grad_norm": 0.0808292031288147,
"learning_rate": 1.8354193485349468e-05,
"loss": 0.0521,
"step": 555
},
{
"epoch": 0.8095410191543188,
"grad_norm": 0.05046294629573822,
"learning_rate": 1.830768184717514e-05,
"loss": 0.0581,
"step": 560
},
{
"epoch": 0.8167690639681966,
"grad_norm": 0.045729391276836395,
"learning_rate": 1.8260582769281747e-05,
"loss": 0.0596,
"step": 565
},
{
"epoch": 0.8239971087820744,
"grad_norm": 0.06313765794038773,
"learning_rate": 1.821289958206513e-05,
"loss": 0.0588,
"step": 570
},
{
"epoch": 0.8312251535959523,
"grad_norm": 0.04902196675539017,
"learning_rate": 1.8164635657223755e-05,
"loss": 0.0584,
"step": 575
},
{
"epoch": 0.8384531984098301,
"grad_norm": 0.08288609981536865,
"learning_rate": 1.8115794407520287e-05,
"loss": 0.0558,
"step": 580
},
{
"epoch": 0.845681243223708,
"grad_norm": 0.04202403128147125,
"learning_rate": 1.8066379286540278e-05,
"loss": 0.0524,
"step": 585
},
{
"epoch": 0.8529092880375858,
"grad_norm": 0.046127066016197205,
"learning_rate": 1.8016393788447964e-05,
"loss": 0.0549,
"step": 590
},
{
"epoch": 0.8601373328514637,
"grad_norm": 0.04576544463634491,
"learning_rate": 1.7965841447739185e-05,
"loss": 0.0491,
"step": 595
},
{
"epoch": 0.8673653776653415,
"grad_norm": 0.03939468041062355,
"learning_rate": 1.7914725838991472e-05,
"loss": 0.0543,
"step": 600
},
{
"epoch": 0.8745934224792193,
"grad_norm": 0.03499499708414078,
"learning_rate": 1.7863050576611267e-05,
"loss": 0.051,
"step": 605
},
{
"epoch": 0.8818214672930972,
"grad_norm": 0.04491008073091507,
"learning_rate": 1.781081931457837e-05,
"loss": 0.0553,
"step": 610
},
{
"epoch": 0.889049512106975,
"grad_norm": 0.06234387680888176,
"learning_rate": 1.7758035746187553e-05,
"loss": 0.0517,
"step": 615
},
{
"epoch": 0.8962775569208529,
"grad_norm": 0.04003310948610306,
"learning_rate": 1.770470360378739e-05,
"loss": 0.0545,
"step": 620
},
{
"epoch": 0.9035056017347307,
"grad_norm": 0.08948640525341034,
"learning_rate": 1.7650826658516375e-05,
"loss": 0.0548,
"step": 625
},
{
"epoch": 0.9107336465486086,
"grad_norm": 0.04328719154000282,
"learning_rate": 1.7596408720036232e-05,
"loss": 0.0551,
"step": 630
},
{
"epoch": 0.9179616913624864,
"grad_norm": 0.03560628369450569,
"learning_rate": 1.754145363626256e-05,
"loss": 0.05,
"step": 635
},
{
"epoch": 0.9251897361763642,
"grad_norm": 0.04258381202816963,
"learning_rate": 1.748596529309271e-05,
"loss": 0.0503,
"step": 640
},
{
"epoch": 0.9324177809902421,
"grad_norm": 0.07129397243261337,
"learning_rate": 1.742994761413105e-05,
"loss": 0.0488,
"step": 645
},
{
"epoch": 0.93964582580412,
"grad_norm": 0.03454764559864998,
"learning_rate": 1.73734045604115e-05,
"loss": 0.0553,
"step": 650
},
{
"epoch": 0.9468738706179979,
"grad_norm": 0.053935691714286804,
"learning_rate": 1.731634013011745e-05,
"loss": 0.0499,
"step": 655
},
{
"epoch": 0.9541019154318757,
"grad_norm": 0.08872876316308975,
"learning_rate": 1.7258758358299053e-05,
"loss": 0.0521,
"step": 660
},
{
"epoch": 0.9613299602457536,
"grad_norm": 0.057639699429273605,
"learning_rate": 1.7200663316587897e-05,
"loss": 0.0505,
"step": 665
},
{
"epoch": 0.9685580050596314,
"grad_norm": 0.08045148104429245,
"learning_rate": 1.7142059112909107e-05,
"loss": 0.0537,
"step": 670
},
{
"epoch": 0.9757860498735093,
"grad_norm": 0.05597732216119766,
"learning_rate": 1.708294989119087e-05,
"loss": 0.0466,
"step": 675
},
{
"epoch": 0.9830140946873871,
"grad_norm": 0.07576154917478561,
"learning_rate": 1.7023339831071408e-05,
"loss": 0.0527,
"step": 680
},
{
"epoch": 0.9902421395012649,
"grad_norm": 0.03821377828717232,
"learning_rate": 1.696323314760344e-05,
"loss": 0.0483,
"step": 685
},
{
"epoch": 0.9974701843151428,
"grad_norm": 0.07789347320795059,
"learning_rate": 1.690263409095614e-05,
"loss": 0.0483,
"step": 690
},
{
"epoch": 1.0043368268883268,
"grad_norm": 0.03913086652755737,
"learning_rate": 1.6841546946114586e-05,
"loss": 0.0417,
"step": 695
},
{
"epoch": 1.0115648717022045,
"grad_norm": 0.054379936307668686,
"learning_rate": 1.6779976032576792e-05,
"loss": 0.0336,
"step": 700
},
{
"epoch": 1.0187929165160825,
"grad_norm": 0.08715476840734482,
"learning_rate": 1.6717925704048256e-05,
"loss": 0.0416,
"step": 705
},
{
"epoch": 1.0260209613299602,
"grad_norm": 0.0766800120472908,
"learning_rate": 1.6655400348134122e-05,
"loss": 0.0404,
"step": 710
},
{
"epoch": 1.0332490061438382,
"grad_norm": 0.06571623682975769,
"learning_rate": 1.659240438602891e-05,
"loss": 0.0431,
"step": 715
},
{
"epoch": 1.040477050957716,
"grad_norm": 0.09106060862541199,
"learning_rate": 1.6528942272203912e-05,
"loss": 0.0419,
"step": 720
},
{
"epoch": 1.0477050957715939,
"grad_norm": 0.0675068348646164,
"learning_rate": 1.6465018494092213e-05,
"loss": 0.0401,
"step": 725
},
{
"epoch": 1.0549331405854716,
"grad_norm": 0.06592784821987152,
"learning_rate": 1.6400637571771354e-05,
"loss": 0.0383,
"step": 730
},
{
"epoch": 1.0621611853993496,
"grad_norm": 0.08610466867685318,
"learning_rate": 1.633580405764376e-05,
"loss": 0.0355,
"step": 735
},
{
"epoch": 1.0693892302132273,
"grad_norm": 0.09420937299728394,
"learning_rate": 1.6270522536114813e-05,
"loss": 0.0393,
"step": 740
},
{
"epoch": 1.0766172750271052,
"grad_norm": 0.066034696996212,
"learning_rate": 1.6204797623268675e-05,
"loss": 0.0353,
"step": 745
},
{
"epoch": 1.083845319840983,
"grad_norm": 0.05469588562846184,
"learning_rate": 1.6138633966541905e-05,
"loss": 0.0395,
"step": 750
},
{
"epoch": 1.091073364654861,
"grad_norm": 0.05333936959505081,
"learning_rate": 1.6072036244394836e-05,
"loss": 0.0409,
"step": 755
},
{
"epoch": 1.0983014094687387,
"grad_norm": 0.06300196051597595,
"learning_rate": 1.600500916598074e-05,
"loss": 0.0382,
"step": 760
},
{
"epoch": 1.1055294542826166,
"grad_norm": 0.08063532412052155,
"learning_rate": 1.5937557470812852e-05,
"loss": 0.0369,
"step": 765
},
{
"epoch": 1.1127574990964944,
"grad_norm": 0.07369716465473175,
"learning_rate": 1.5869685928429253e-05,
"loss": 0.0393,
"step": 770
},
{
"epoch": 1.1199855439103723,
"grad_norm": 0.058140210807323456,
"learning_rate": 1.5801399338055584e-05,
"loss": 0.0352,
"step": 775
},
{
"epoch": 1.12721358872425,
"grad_norm": 0.060627613216638565,
"learning_rate": 1.5732702528265716e-05,
"loss": 0.0381,
"step": 780
},
{
"epoch": 1.134441633538128,
"grad_norm": 0.07465813308954239,
"learning_rate": 1.5663600356640306e-05,
"loss": 0.0367,
"step": 785
},
{
"epoch": 1.1416696783520057,
"grad_norm": 0.047345198690891266,
"learning_rate": 1.5594097709423316e-05,
"loss": 0.0389,
"step": 790
},
{
"epoch": 1.1488977231658837,
"grad_norm": 0.06834863871335983,
"learning_rate": 1.552419950117651e-05,
"loss": 0.0381,
"step": 795
},
{
"epoch": 1.1561257679797614,
"grad_norm": 0.047312233597040176,
"learning_rate": 1.545391067443194e-05,
"loss": 0.0343,
"step": 800
},
{
"epoch": 1.1633538127936394,
"grad_norm": 0.06944846361875534,
"learning_rate": 1.538323619934247e-05,
"loss": 0.0389,
"step": 805
},
{
"epoch": 1.170581857607517,
"grad_norm": 0.04954347014427185,
"learning_rate": 1.5312181073330295e-05,
"loss": 0.0366,
"step": 810
},
{
"epoch": 1.177809902421395,
"grad_norm": 0.06755795329809189,
"learning_rate": 1.524075032073363e-05,
"loss": 0.0373,
"step": 815
},
{
"epoch": 1.1850379472352728,
"grad_norm": 0.08281169086694717,
"learning_rate": 1.5168948992451382e-05,
"loss": 0.0356,
"step": 820
},
{
"epoch": 1.1922659920491507,
"grad_norm": 0.08935344219207764,
"learning_rate": 1.5096782165586037e-05,
"loss": 0.0365,
"step": 825
},
{
"epoch": 1.1994940368630285,
"grad_norm": 0.04098968952894211,
"learning_rate": 1.5024254943084629e-05,
"loss": 0.0381,
"step": 830
},
{
"epoch": 1.2067220816769064,
"grad_norm": 0.05055451765656471,
"learning_rate": 1.495137245337794e-05,
"loss": 0.0367,
"step": 835
},
{
"epoch": 1.2139501264907842,
"grad_norm": 0.06718173623085022,
"learning_rate": 1.487813985001782e-05,
"loss": 0.0356,
"step": 840
},
{
"epoch": 1.221178171304662,
"grad_norm": 0.05843829736113548,
"learning_rate": 1.480456231131283e-05,
"loss": 0.0352,
"step": 845
},
{
"epoch": 1.2284062161185398,
"grad_norm": 0.052432768046855927,
"learning_rate": 1.4730645039962044e-05,
"loss": 0.0364,
"step": 850
},
{
"epoch": 1.2356342609324178,
"grad_norm": 0.05346972495317459,
"learning_rate": 1.4656393262687172e-05,
"loss": 0.0368,
"step": 855
},
{
"epoch": 1.2428623057462955,
"grad_norm": 0.06525395065546036,
"learning_rate": 1.4581812229862993e-05,
"loss": 0.0338,
"step": 860
},
{
"epoch": 1.2500903505601735,
"grad_norm": 0.07090573757886887,
"learning_rate": 1.4506907215146075e-05,
"loss": 0.0328,
"step": 865
},
{
"epoch": 1.2573183953740514,
"grad_norm": 0.05279651656746864,
"learning_rate": 1.443168351510189e-05,
"loss": 0.0348,
"step": 870
},
{
"epoch": 1.2645464401879292,
"grad_norm": 0.05886390060186386,
"learning_rate": 1.4356146448830277e-05,
"loss": 0.0341,
"step": 875
},
{
"epoch": 1.271774485001807,
"grad_norm": 0.0537516325712204,
"learning_rate": 1.4280301357589349e-05,
"loss": 0.0379,
"step": 880
},
{
"epoch": 1.2790025298156849,
"grad_norm": 0.049141135066747665,
"learning_rate": 1.4204153604417775e-05,
"loss": 0.0339,
"step": 885
},
{
"epoch": 1.2862305746295628,
"grad_norm": 0.05724327638745308,
"learning_rate": 1.4127708573755599e-05,
"loss": 0.0317,
"step": 890
},
{
"epoch": 1.2934586194434405,
"grad_norm": 0.05847681313753128,
"learning_rate": 1.4050971671063464e-05,
"loss": 0.0341,
"step": 895
},
{
"epoch": 1.3006866642573183,
"grad_norm": 0.04777985066175461,
"learning_rate": 1.3973948322440427e-05,
"loss": 0.0388,
"step": 900
},
{
"epoch": 1.3079147090711962,
"grad_norm": 0.062013089656829834,
"learning_rate": 1.3896643974240245e-05,
"loss": 0.0309,
"step": 905
},
{
"epoch": 1.3151427538850742,
"grad_norm": 0.08561990410089493,
"learning_rate": 1.3819064092686278e-05,
"loss": 0.0327,
"step": 910
},
{
"epoch": 1.322370798698952,
"grad_norm": 0.08605846017599106,
"learning_rate": 1.3741214163484968e-05,
"loss": 0.0345,
"step": 915
},
{
"epoch": 1.3295988435128296,
"grad_norm": 0.06043161824345589,
"learning_rate": 1.3663099691437945e-05,
"loss": 0.0336,
"step": 920
},
{
"epoch": 1.3368268883267076,
"grad_norm": 0.04175262525677681,
"learning_rate": 1.3584726200052767e-05,
"loss": 0.038,
"step": 925
},
{
"epoch": 1.3440549331405856,
"grad_norm": 0.04762093350291252,
"learning_rate": 1.3506099231152366e-05,
"loss": 0.0346,
"step": 930
},
{
"epoch": 1.3512829779544633,
"grad_norm": 0.06360676139593124,
"learning_rate": 1.3427224344483178e-05,
"loss": 0.0296,
"step": 935
},
{
"epoch": 1.3585110227683412,
"grad_norm": 0.06761486828327179,
"learning_rate": 1.3348107117322004e-05,
"loss": 0.0309,
"step": 940
},
{
"epoch": 1.365739067582219,
"grad_norm": 0.06754028797149658,
"learning_rate": 1.3268753144081652e-05,
"loss": 0.028,
"step": 945
},
{
"epoch": 1.372967112396097,
"grad_norm": 0.06639332324266434,
"learning_rate": 1.3189168035915337e-05,
"loss": 0.0331,
"step": 950
},
{
"epoch": 1.3801951572099747,
"grad_norm": 0.05263343080878258,
"learning_rate": 1.3109357420319933e-05,
"loss": 0.031,
"step": 955
},
{
"epoch": 1.3874232020238526,
"grad_norm": 0.07213468849658966,
"learning_rate": 1.3029326940738032e-05,
"loss": 0.0338,
"step": 960
},
{
"epoch": 1.3946512468377303,
"grad_norm": 0.05976350978016853,
"learning_rate": 1.2949082256158904e-05,
"loss": 0.0313,
"step": 965
},
{
"epoch": 1.4018792916516083,
"grad_norm": 0.054479606449604034,
"learning_rate": 1.286862904071835e-05,
"loss": 0.0324,
"step": 970
},
{
"epoch": 1.409107336465486,
"grad_norm": 0.07411843538284302,
"learning_rate": 1.2787972983297472e-05,
"loss": 0.0312,
"step": 975
},
{
"epoch": 1.416335381279364,
"grad_norm": 0.05356777831912041,
"learning_rate": 1.2707119787120417e-05,
"loss": 0.0347,
"step": 980
},
{
"epoch": 1.4235634260932417,
"grad_norm": 0.05905517190694809,
"learning_rate": 1.26260751693511e-05,
"loss": 0.0317,
"step": 985
},
{
"epoch": 1.4307914709071197,
"grad_norm": 0.07836019992828369,
"learning_rate": 1.254484486068893e-05,
"loss": 0.0316,
"step": 990
},
{
"epoch": 1.4380195157209974,
"grad_norm": 0.06900329887866974,
"learning_rate": 1.24634346049636e-05,
"loss": 0.0324,
"step": 995
},
{
"epoch": 1.4452475605348754,
"grad_norm": 0.05929545313119888,
"learning_rate": 1.2381850158728952e-05,
"loss": 0.029,
"step": 1000
},
{
"epoch": 1.4452475605348754,
"eval_loss": 0.08190815895795822,
"eval_runtime": 1141.9961,
"eval_samples_per_second": 56.223,
"eval_steps_per_second": 1.757,
"step": 1000
},
{
"epoch": 1.452475605348753,
"grad_norm": 0.0604124590754509,
"learning_rate": 1.2300097290855887e-05,
"loss": 0.0285,
"step": 1005
},
{
"epoch": 1.459703650162631,
"grad_norm": 0.06895657628774643,
"learning_rate": 1.2218181782124496e-05,
"loss": 0.0316,
"step": 1010
},
{
"epoch": 1.466931694976509,
"grad_norm": 0.048645876348018646,
"learning_rate": 1.2136109424815258e-05,
"loss": 0.0306,
"step": 1015
},
{
"epoch": 1.4741597397903867,
"grad_norm": 0.06193140521645546,
"learning_rate": 1.205388602229949e-05,
"loss": 0.03,
"step": 1020
},
{
"epoch": 1.4813877846042645,
"grad_norm": 0.07050759345293045,
"learning_rate": 1.1971517388628972e-05,
"loss": 0.0341,
"step": 1025
},
{
"epoch": 1.4886158294181424,
"grad_norm": 0.05533516779541969,
"learning_rate": 1.1889009348124857e-05,
"loss": 0.0303,
"step": 1030
},
{
"epoch": 1.4958438742320204,
"grad_norm": 0.04415017366409302,
"learning_rate": 1.180636773496579e-05,
"loss": 0.0276,
"step": 1035
},
{
"epoch": 1.503071919045898,
"grad_norm": 0.059612423181533813,
"learning_rate": 1.1723598392775415e-05,
"loss": 0.0273,
"step": 1040
},
{
"epoch": 1.5102999638597758,
"grad_norm": 0.06513796001672745,
"learning_rate": 1.1640707174209147e-05,
"loss": 0.0299,
"step": 1045
},
{
"epoch": 1.5175280086736538,
"grad_norm": 0.056087836623191833,
"learning_rate": 1.1557699940540321e-05,
"loss": 0.0308,
"step": 1050
},
{
"epoch": 1.5247560534875317,
"grad_norm": 0.06388755887746811,
"learning_rate": 1.1474582561245767e-05,
"loss": 0.0278,
"step": 1055
},
{
"epoch": 1.5319840983014095,
"grad_norm": 0.06793609261512756,
"learning_rate": 1.1391360913590736e-05,
"loss": 0.0342,
"step": 1060
},
{
"epoch": 1.5392121431152872,
"grad_norm": 0.04267344996333122,
"learning_rate": 1.1308040882213363e-05,
"loss": 0.03,
"step": 1065
},
{
"epoch": 1.5464401879291652,
"grad_norm": 0.06034848093986511,
"learning_rate": 1.122462835870852e-05,
"loss": 0.0279,
"step": 1070
},
{
"epoch": 1.553668232743043,
"grad_norm": 0.06860997527837753,
"learning_rate": 1.1141129241211246e-05,
"loss": 0.0263,
"step": 1075
},
{
"epoch": 1.5608962775569208,
"grad_norm": 0.04508688300848007,
"learning_rate": 1.1057549433979675e-05,
"loss": 0.0365,
"step": 1080
},
{
"epoch": 1.5681243223707986,
"grad_norm": 0.04600263386964798,
"learning_rate": 1.0973894846977548e-05,
"loss": 0.0286,
"step": 1085
},
{
"epoch": 1.5753523671846765,
"grad_norm": 0.05820371210575104,
"learning_rate": 1.089017139545631e-05,
"loss": 0.0264,
"step": 1090
},
{
"epoch": 1.5825804119985545,
"grad_norm": 0.0669277012348175,
"learning_rate": 1.0806384999536857e-05,
"loss": 0.028,
"step": 1095
},
{
"epoch": 1.5898084568124322,
"grad_norm": 0.05904907360672951,
"learning_rate": 1.0722541583790898e-05,
"loss": 0.0247,
"step": 1100
},
{
"epoch": 1.59703650162631,
"grad_norm": 0.05929577723145485,
"learning_rate": 1.0638647076822041e-05,
"loss": 0.0305,
"step": 1105
},
{
"epoch": 1.604264546440188,
"grad_norm": 0.058572858572006226,
"learning_rate": 1.0554707410846585e-05,
"loss": 0.0294,
"step": 1110
},
{
"epoch": 1.6114925912540659,
"grad_norm": 0.05682854354381561,
"learning_rate": 1.0470728521274028e-05,
"loss": 0.028,
"step": 1115
},
{
"epoch": 1.6187206360679436,
"grad_norm": 0.0703597441315651,
"learning_rate": 1.0386716346287398e-05,
"loss": 0.0278,
"step": 1120
},
{
"epoch": 1.6259486808818213,
"grad_norm": 0.07455068826675415,
"learning_rate": 1.030267682642334e-05,
"loss": 0.0312,
"step": 1125
},
{
"epoch": 1.6331767256956993,
"grad_norm": 0.06019241735339165,
"learning_rate": 1.0218615904152067e-05,
"loss": 0.027,
"step": 1130
},
{
"epoch": 1.6404047705095772,
"grad_norm": 0.05638565123081207,
"learning_rate": 1.0134539523457172e-05,
"loss": 0.0301,
"step": 1135
},
{
"epoch": 1.647632815323455,
"grad_norm": 0.06251167505979538,
"learning_rate": 1.0050453629415317e-05,
"loss": 0.027,
"step": 1140
},
{
"epoch": 1.6548608601373327,
"grad_norm": 0.08058342337608337,
"learning_rate": 9.966364167775851e-06,
"loss": 0.0307,
"step": 1145
},
{
"epoch": 1.6620889049512106,
"grad_norm": 0.05652245879173279,
"learning_rate": 9.882277084540399e-06,
"loss": 0.0267,
"step": 1150
},
{
"epoch": 1.6693169497650886,
"grad_norm": 0.06871891021728516,
"learning_rate": 9.798198325542399e-06,
"loss": 0.0255,
"step": 1155
},
{
"epoch": 1.6765449945789666,
"grad_norm": 0.07430125027894974,
"learning_rate": 9.714133836026687e-06,
"loss": 0.0277,
"step": 1160
},
{
"epoch": 1.6837730393928443,
"grad_norm": 0.058816712349653244,
"learning_rate": 9.630089560229088e-06,
"loss": 0.0248,
"step": 1165
},
{
"epoch": 1.691001084206722,
"grad_norm": 0.06506705284118652,
"learning_rate": 9.546071440956115e-06,
"loss": 0.0298,
"step": 1170
},
{
"epoch": 1.6982291290206,
"grad_norm": 0.06538432091474533,
"learning_rate": 9.46208541916474e-06,
"loss": 0.0308,
"step": 1175
},
{
"epoch": 1.705457173834478,
"grad_norm": 0.057376306504011154,
"learning_rate": 9.378137433542305e-06,
"loss": 0.0293,
"step": 1180
},
{
"epoch": 1.7126852186483557,
"grad_norm": 0.04726172983646393,
"learning_rate": 9.294233420086604e-06,
"loss": 0.0271,
"step": 1185
},
{
"epoch": 1.7199132634622334,
"grad_norm": 0.05788370966911316,
"learning_rate": 9.210379311686129e-06,
"loss": 0.0293,
"step": 1190
},
{
"epoch": 1.7271413082761113,
"grad_norm": 0.04595355689525604,
"learning_rate": 9.12658103770058e-06,
"loss": 0.0289,
"step": 1195
},
{
"epoch": 1.7343693530899893,
"grad_norm": 0.06266051530838013,
"learning_rate": 9.042844523541572e-06,
"loss": 0.0286,
"step": 1200
},
{
"epoch": 1.741597397903867,
"grad_norm": 0.049365997314453125,
"learning_rate": 8.95917569025366e-06,
"loss": 0.0275,
"step": 1205
},
{
"epoch": 1.7488254427177448,
"grad_norm": 0.056487612426280975,
"learning_rate": 8.875580454095651e-06,
"loss": 0.0239,
"step": 1210
},
{
"epoch": 1.7560534875316227,
"grad_norm": 0.04812345653772354,
"learning_rate": 8.792064726122275e-06,
"loss": 0.0262,
"step": 1215
},
{
"epoch": 1.7632815323455007,
"grad_norm": 0.06868524849414825,
"learning_rate": 8.708634411766195e-06,
"loss": 0.0277,
"step": 1220
},
{
"epoch": 1.7705095771593784,
"grad_norm": 0.07294084876775742,
"learning_rate": 8.625295410420451e-06,
"loss": 0.0235,
"step": 1225
},
{
"epoch": 1.7777376219732561,
"grad_norm": 0.05644133314490318,
"learning_rate": 8.542053615021291e-06,
"loss": 0.0271,
"step": 1230
},
{
"epoch": 1.784965666787134,
"grad_norm": 0.059861283749341965,
"learning_rate": 8.4589149116315e-06,
"loss": 0.025,
"step": 1235
},
{
"epoch": 1.792193711601012,
"grad_norm": 0.06358060985803604,
"learning_rate": 8.375885179024175e-06,
"loss": 0.0294,
"step": 1240
},
{
"epoch": 1.7994217564148898,
"grad_norm": 0.03532201051712036,
"learning_rate": 8.292970288267043e-06,
"loss": 0.0239,
"step": 1245
},
{
"epoch": 1.8066498012287675,
"grad_norm": 0.047285765409469604,
"learning_rate": 8.21017610230732e-06,
"loss": 0.0312,
"step": 1250
},
{
"epoch": 1.8138778460426455,
"grad_norm": 0.044171951711177826,
"learning_rate": 8.12750847555713e-06,
"loss": 0.0288,
"step": 1255
},
{
"epoch": 1.8211058908565234,
"grad_norm": 0.05230150744318962,
"learning_rate": 8.044973253479544e-06,
"loss": 0.0242,
"step": 1260
},
{
"epoch": 1.8283339356704011,
"grad_norm": 0.04772350192070007,
"learning_rate": 7.96257627217524e-06,
"loss": 0.0292,
"step": 1265
},
{
"epoch": 1.8355619804842789,
"grad_norm": 0.04245223104953766,
"learning_rate": 7.880323357969838e-06,
"loss": 0.0239,
"step": 1270
},
{
"epoch": 1.8427900252981568,
"grad_norm": 0.05859874188899994,
"learning_rate": 7.798220327001898e-06,
"loss": 0.0245,
"step": 1275
},
{
"epoch": 1.8500180701120348,
"grad_norm": 0.06144941225647926,
"learning_rate": 7.716272984811688e-06,
"loss": 0.0261,
"step": 1280
},
{
"epoch": 1.8572461149259125,
"grad_norm": 0.03744060546159744,
"learning_rate": 7.634487125930649e-06,
"loss": 0.0259,
"step": 1285
},
{
"epoch": 1.8644741597397902,
"grad_norm": 0.06158106401562691,
"learning_rate": 7.55286853347167e-06,
"loss": 0.0237,
"step": 1290
},
{
"epoch": 1.8717022045536682,
"grad_norm": 0.05013835057616234,
"learning_rate": 7.471422978720162e-06,
"loss": 0.0244,
"step": 1295
},
{
"epoch": 1.8789302493675462,
"grad_norm": 0.06363669037818909,
"learning_rate": 7.3901562207259555e-06,
"loss": 0.0245,
"step": 1300
},
{
"epoch": 1.8861582941814239,
"grad_norm": 0.05522134155035019,
"learning_rate": 7.309074005896103e-06,
"loss": 0.0216,
"step": 1305
},
{
"epoch": 1.8933863389953016,
"grad_norm": 0.06466201692819595,
"learning_rate": 7.228182067588518e-06,
"loss": 0.0278,
"step": 1310
},
{
"epoch": 1.9006143838091796,
"grad_norm": 0.047263894230127335,
"learning_rate": 7.1474861257065866e-06,
"loss": 0.0258,
"step": 1315
},
{
"epoch": 1.9078424286230575,
"grad_norm": 0.051960770040750504,
"learning_rate": 7.066991886294702e-06,
"loss": 0.0227,
"step": 1320
},
{
"epoch": 1.9150704734369353,
"grad_norm": 0.06168799102306366,
"learning_rate": 6.9867050411347955e-06,
"loss": 0.0333,
"step": 1325
},
{
"epoch": 1.922298518250813,
"grad_norm": 0.03766432777047157,
"learning_rate": 6.906631267343849e-06,
"loss": 0.0235,
"step": 1330
},
{
"epoch": 1.929526563064691,
"grad_norm": 0.04953250661492348,
"learning_rate": 6.826776226972489e-06,
"loss": 0.0247,
"step": 1335
},
{
"epoch": 1.936754607878569,
"grad_norm": 0.047898851335048676,
"learning_rate": 6.747145566604605e-06,
"loss": 0.0281,
"step": 1340
},
{
"epoch": 1.9439826526924469,
"grad_norm": 0.062446679919958115,
"learning_rate": 6.667744916958085e-06,
"loss": 0.0242,
"step": 1345
},
{
"epoch": 1.9512106975063246,
"grad_norm": 0.050179507583379745,
"learning_rate": 6.588579892486657e-06,
"loss": 0.0254,
"step": 1350
},
{
"epoch": 1.9584387423202023,
"grad_norm": 0.052683789283037186,
"learning_rate": 6.5096560909828855e-06,
"loss": 0.0206,
"step": 1355
},
{
"epoch": 1.9656667871340803,
"grad_norm": 0.06998462975025177,
"learning_rate": 6.430979093182372e-06,
"loss": 0.0223,
"step": 1360
},
{
"epoch": 1.9728948319479582,
"grad_norm": 0.07918884605169296,
"learning_rate": 6.352554462369112e-06,
"loss": 0.0281,
"step": 1365
},
{
"epoch": 1.980122876761836,
"grad_norm": 0.06278680264949799,
"learning_rate": 6.274387743982127e-06,
"loss": 0.0234,
"step": 1370
},
{
"epoch": 1.9873509215757137,
"grad_norm": 0.04667511582374573,
"learning_rate": 6.196484465223343e-06,
"loss": 0.0218,
"step": 1375
},
{
"epoch": 1.9945789663895916,
"grad_norm": 0.05683530122041702,
"learning_rate": 6.1188501346667536e-06,
"loss": 0.0267,
"step": 1380
},
{
"epoch": 2.0014456089627757,
"grad_norm": 0.021900292485952377,
"learning_rate": 6.04149024186891e-06,
"loss": 0.0196,
"step": 1385
},
{
"epoch": 2.0086736537766536,
"grad_norm": 0.014646291732788086,
"learning_rate": 5.964410256980762e-06,
"loss": 0.0085,
"step": 1390
},
{
"epoch": 2.015901698590531,
"grad_norm": 0.018468832597136497,
"learning_rate": 5.887615630360836e-06,
"loss": 0.0064,
"step": 1395
},
{
"epoch": 2.023129743404409,
"grad_norm": 0.02347305603325367,
"learning_rate": 5.811111792189873e-06,
"loss": 0.0058,
"step": 1400
},
{
"epoch": 2.030357788218287,
"grad_norm": 0.022464651614427567,
"learning_rate": 5.734904152086829e-06,
"loss": 0.0052,
"step": 1405
},
{
"epoch": 2.037585833032165,
"grad_norm": 0.027769049629569054,
"learning_rate": 5.658998098726361e-06,
"loss": 0.0054,
"step": 1410
},
{
"epoch": 2.0448138778460425,
"grad_norm": 0.03556771203875542,
"learning_rate": 5.583398999457812e-06,
"loss": 0.0043,
"step": 1415
},
{
"epoch": 2.0520419226599205,
"grad_norm": 0.030191823840141296,
"learning_rate": 5.508112199925659e-06,
"loss": 0.0041,
"step": 1420
},
{
"epoch": 2.0592699674737984,
"grad_norm": 0.03760818764567375,
"learning_rate": 5.433143023691547e-06,
"loss": 0.0038,
"step": 1425
},
{
"epoch": 2.0664980122876764,
"grad_norm": 0.014797261916100979,
"learning_rate": 5.358496771857831e-06,
"loss": 0.0044,
"step": 1430
},
{
"epoch": 2.073726057101554,
"grad_norm": 0.06623335927724838,
"learning_rate": 5.284178722692743e-06,
"loss": 0.0052,
"step": 1435
},
{
"epoch": 2.080954101915432,
"grad_norm": 0.016557743772864342,
"learning_rate": 5.2101941312571724e-06,
"loss": 0.0037,
"step": 1440
},
{
"epoch": 2.08818214672931,
"grad_norm": 0.03200926259160042,
"learning_rate": 5.136548229033065e-06,
"loss": 0.0047,
"step": 1445
},
{
"epoch": 2.0954101915431878,
"grad_norm": 0.03867388516664505,
"learning_rate": 5.063246223553509e-06,
"loss": 0.0052,
"step": 1450
},
{
"epoch": 2.1026382363570653,
"grad_norm": 0.009015249088406563,
"learning_rate": 4.990293298034505e-06,
"loss": 0.0045,
"step": 1455
},
{
"epoch": 2.109866281170943,
"grad_norm": 0.01907913200557232,
"learning_rate": 4.917694611008477e-06,
"loss": 0.0041,
"step": 1460
},
{
"epoch": 2.117094325984821,
"grad_norm": 0.02901625819504261,
"learning_rate": 4.845455295959468e-06,
"loss": 0.0051,
"step": 1465
},
{
"epoch": 2.124322370798699,
"grad_norm": 0.03763509541749954,
"learning_rate": 4.773580460960195e-06,
"loss": 0.0065,
"step": 1470
},
{
"epoch": 2.1315504156125766,
"grad_norm": 0.02262153849005699,
"learning_rate": 4.702075188310826e-06,
"loss": 0.0052,
"step": 1475
},
{
"epoch": 2.1387784604264546,
"grad_norm": 0.02351069077849388,
"learning_rate": 4.6309445341796286e-06,
"loss": 0.0048,
"step": 1480
},
{
"epoch": 2.1460065052403325,
"grad_norm": 0.009482895024120808,
"learning_rate": 4.5601935282454255e-06,
"loss": 0.0035,
"step": 1485
},
{
"epoch": 2.1532345500542105,
"grad_norm": 0.039236586540937424,
"learning_rate": 4.489827173341957e-06,
"loss": 0.0046,
"step": 1490
},
{
"epoch": 2.160462594868088,
"grad_norm": 0.029299462214112282,
"learning_rate": 4.419850445104126e-06,
"loss": 0.0066,
"step": 1495
},
{
"epoch": 2.167690639681966,
"grad_norm": 0.038081999868154526,
"learning_rate": 4.350268291616166e-06,
"loss": 0.0058,
"step": 1500
},
{
"epoch": 2.167690639681966,
"eval_loss": 0.15236619114875793,
"eval_runtime": 1142.5896,
"eval_samples_per_second": 56.193,
"eval_steps_per_second": 1.757,
"step": 1500
},
{
"epoch": 2.174918684495844,
"grad_norm": 0.05242437124252319,
"learning_rate": 4.281085633061764e-06,
"loss": 0.0047,
"step": 1505
},
{
"epoch": 2.182146729309722,
"grad_norm": 0.04368291050195694,
"learning_rate": 4.212307361376146e-06,
"loss": 0.0078,
"step": 1510
},
{
"epoch": 2.1893747741235994,
"grad_norm": 0.033737700432538986,
"learning_rate": 4.1439383399001865e-06,
"loss": 0.0043,
"step": 1515
},
{
"epoch": 2.1966028189374773,
"grad_norm": 0.05659673735499382,
"learning_rate": 4.075983403036479e-06,
"loss": 0.0045,
"step": 1520
},
{
"epoch": 2.2038308637513553,
"grad_norm": 0.0330926850438118,
"learning_rate": 4.0084473559075335e-06,
"loss": 0.0073,
"step": 1525
},
{
"epoch": 2.2110589085652332,
"grad_norm": 0.01673804223537445,
"learning_rate": 3.941334974015981e-06,
"loss": 0.006,
"step": 1530
},
{
"epoch": 2.2182869533791107,
"grad_norm": 0.013828652910888195,
"learning_rate": 3.874651002906915e-06,
"loss": 0.0047,
"step": 1535
},
{
"epoch": 2.2255149981929887,
"grad_norm": 0.02410770393908024,
"learning_rate": 3.8084001578323093e-06,
"loss": 0.0049,
"step": 1540
},
{
"epoch": 2.2327430430068667,
"grad_norm": 0.04236437752842903,
"learning_rate": 3.7425871234176134e-06,
"loss": 0.0069,
"step": 1545
},
{
"epoch": 2.2399710878207446,
"grad_norm": 0.017214614897966385,
"learning_rate": 3.6772165533305024e-06,
"loss": 0.0034,
"step": 1550
},
{
"epoch": 2.2471991326346226,
"grad_norm": 0.007290941663086414,
"learning_rate": 3.6122930699518057e-06,
"loss": 0.0045,
"step": 1555
},
{
"epoch": 2.2544271774485,
"grad_norm": 0.02728499099612236,
"learning_rate": 3.5478212640486652e-06,
"loss": 0.0049,
"step": 1560
},
{
"epoch": 2.261655222262378,
"grad_norm": 0.023531029000878334,
"learning_rate": 3.483805694449913e-06,
"loss": 0.0034,
"step": 1565
},
{
"epoch": 2.268883267076256,
"grad_norm": 0.01806485652923584,
"learning_rate": 3.420250887723722e-06,
"loss": 0.0044,
"step": 1570
},
{
"epoch": 2.2761113118901335,
"grad_norm": 0.022033169865608215,
"learning_rate": 3.357161337857523e-06,
"loss": 0.0037,
"step": 1575
},
{
"epoch": 2.2833393567040114,
"grad_norm": 0.03150279447436333,
"learning_rate": 3.2945415059402363e-06,
"loss": 0.0051,
"step": 1580
},
{
"epoch": 2.2905674015178894,
"grad_norm": 0.01539881806820631,
"learning_rate": 3.232395819846824e-06,
"loss": 0.0036,
"step": 1585
},
{
"epoch": 2.2977954463317674,
"grad_norm": 0.022633062675595284,
"learning_rate": 3.170728673925206e-06,
"loss": 0.003,
"step": 1590
},
{
"epoch": 2.3050234911456453,
"grad_norm": 0.07029638439416885,
"learning_rate": 3.1095444286855112e-06,
"loss": 0.0056,
"step": 1595
},
{
"epoch": 2.312251535959523,
"grad_norm": 0.02109723724424839,
"learning_rate": 3.04884741049176e-06,
"loss": 0.0047,
"step": 1600
},
{
"epoch": 2.3194795807734008,
"grad_norm": 0.03571590408682823,
"learning_rate": 2.9886419112559396e-06,
"loss": 0.005,
"step": 1605
},
{
"epoch": 2.3267076255872787,
"grad_norm": 0.047896191477775574,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.0065,
"step": 1610
},
{
"epoch": 2.3339356704011567,
"grad_norm": 0.018771937116980553,
"learning_rate": 2.86972246322745e-06,
"loss": 0.0029,
"step": 1615
},
{
"epoch": 2.341163715215034,
"grad_norm": 0.013248492032289505,
"learning_rate": 2.8110169232795615e-06,
"loss": 0.0029,
"step": 1620
},
{
"epoch": 2.348391760028912,
"grad_norm": 0.0571102574467659,
"learning_rate": 2.752819719384573e-06,
"loss": 0.0059,
"step": 1625
},
{
"epoch": 2.35561980484279,
"grad_norm": 0.037497229874134064,
"learning_rate": 2.6951349666915404e-06,
"loss": 0.0046,
"step": 1630
},
{
"epoch": 2.362847849656668,
"grad_norm": 0.050996676087379456,
"learning_rate": 2.637966744113877e-06,
"loss": 0.005,
"step": 1635
},
{
"epoch": 2.3700758944705456,
"grad_norm": 0.017970601096749306,
"learning_rate": 2.581319094040927e-06,
"loss": 0.0046,
"step": 1640
},
{
"epoch": 2.3773039392844235,
"grad_norm": 0.012019157409667969,
"learning_rate": 2.5251960220521422e-06,
"loss": 0.0036,
"step": 1645
},
{
"epoch": 2.3845319840983015,
"grad_norm": 0.04049897938966751,
"learning_rate": 2.4696014966338267e-06,
"loss": 0.0043,
"step": 1650
},
{
"epoch": 2.3917600289121794,
"grad_norm": 0.011841571889817715,
"learning_rate": 2.4145394488985307e-06,
"loss": 0.0031,
"step": 1655
},
{
"epoch": 2.398988073726057,
"grad_norm": 0.05339455232024193,
"learning_rate": 2.360013772307086e-06,
"loss": 0.0039,
"step": 1660
},
{
"epoch": 2.406216118539935,
"grad_norm": 0.017674589529633522,
"learning_rate": 2.3060283223932876e-06,
"loss": 0.0041,
"step": 1665
},
{
"epoch": 2.413444163353813,
"grad_norm": 0.036407146602869034,
"learning_rate": 2.252586916491275e-06,
"loss": 0.0062,
"step": 1670
},
{
"epoch": 2.420672208167691,
"grad_norm": 0.05213891342282295,
"learning_rate": 2.1996933334656044e-06,
"loss": 0.0048,
"step": 1675
},
{
"epoch": 2.4279002529815683,
"grad_norm": 0.027244996279478073,
"learning_rate": 2.1473513134440425e-06,
"loss": 0.0064,
"step": 1680
},
{
"epoch": 2.4351282977954463,
"grad_norm": 0.020630542188882828,
"learning_rate": 2.0955645575531e-06,
"loss": 0.005,
"step": 1685
},
{
"epoch": 2.442356342609324,
"grad_norm": 0.01828751713037491,
"learning_rate": 2.0443367276563277e-06,
"loss": 0.0056,
"step": 1690
},
{
"epoch": 2.449584387423202,
"grad_norm": 0.07830678671598434,
"learning_rate": 1.9936714460953743e-06,
"loss": 0.0058,
"step": 1695
},
{
"epoch": 2.4568124322370797,
"grad_norm": 0.04249007627367973,
"learning_rate": 1.9435722954338675e-06,
"loss": 0.0038,
"step": 1700
},
{
"epoch": 2.4640404770509576,
"grad_norm": 0.04109486937522888,
"learning_rate": 1.8940428182040715e-06,
"loss": 0.0042,
"step": 1705
},
{
"epoch": 2.4712685218648356,
"grad_norm": 0.011558642610907555,
"learning_rate": 1.8450865166564003e-06,
"loss": 0.0029,
"step": 1710
},
{
"epoch": 2.4784965666787135,
"grad_norm": 0.0303326603025198,
"learning_rate": 1.7967068525117658e-06,
"loss": 0.0041,
"step": 1715
},
{
"epoch": 2.485724611492591,
"grad_norm": 0.016660748049616814,
"learning_rate": 1.7489072467168166e-06,
"loss": 0.0039,
"step": 1720
},
{
"epoch": 2.492952656306469,
"grad_norm": 0.044775962829589844,
"learning_rate": 1.7016910792020191e-06,
"loss": 0.0062,
"step": 1725
},
{
"epoch": 2.500180701120347,
"grad_norm": 0.018128257244825363,
"learning_rate": 1.6550616886426718e-06,
"loss": 0.0033,
"step": 1730
},
{
"epoch": 2.507408745934225,
"grad_norm": 0.04440128430724144,
"learning_rate": 1.609022372222827e-06,
"loss": 0.005,
"step": 1735
},
{
"epoch": 2.514636790748103,
"grad_norm": 0.013020209036767483,
"learning_rate": 1.5635763854021424e-06,
"loss": 0.004,
"step": 1740
},
{
"epoch": 2.5218648355619804,
"grad_norm": 0.04560156539082527,
"learning_rate": 1.5187269416856875e-06,
"loss": 0.0044,
"step": 1745
},
{
"epoch": 2.5290928803758583,
"grad_norm": 0.020239338278770447,
"learning_rate": 1.474477212396712e-06,
"loss": 0.003,
"step": 1750
},
{
"epoch": 2.5363209251897363,
"grad_norm": 0.020898908376693726,
"learning_rate": 1.4308303264524115e-06,
"loss": 0.0046,
"step": 1755
},
{
"epoch": 2.543548970003614,
"grad_norm": 0.01411470677703619,
"learning_rate": 1.3877893701426637e-06,
"loss": 0.0037,
"step": 1760
},
{
"epoch": 2.5507770148174918,
"grad_norm": 0.027346884831786156,
"learning_rate": 1.3453573869118097e-06,
"loss": 0.0056,
"step": 1765
},
{
"epoch": 2.5580050596313697,
"grad_norm": 0.017516661435365677,
"learning_rate": 1.3035373771434356e-06,
"loss": 0.0063,
"step": 1770
},
{
"epoch": 2.5652331044452477,
"grad_norm": 0.01695055328309536,
"learning_rate": 1.2623322979482355e-06,
"loss": 0.0049,
"step": 1775
},
{
"epoch": 2.5724611492591256,
"grad_norm": 0.03533555567264557,
"learning_rate": 1.2217450629548955e-06,
"loss": 0.0042,
"step": 1780
},
{
"epoch": 2.579689194073003,
"grad_norm": 0.012438401579856873,
"learning_rate": 1.181778542104075e-06,
"loss": 0.0025,
"step": 1785
},
{
"epoch": 2.586917238886881,
"grad_norm": 0.02840145118534565,
"learning_rate": 1.1424355614454718e-06,
"loss": 0.0047,
"step": 1790
},
{
"epoch": 2.594145283700759,
"grad_norm": 0.03050726279616356,
"learning_rate": 1.1037189029379925e-06,
"loss": 0.0078,
"step": 1795
},
{
"epoch": 2.6013733285146365,
"grad_norm": 0.025619490072131157,
"learning_rate": 1.0656313042530376e-06,
"loss": 0.0051,
"step": 1800
},
{
"epoch": 2.6086013733285145,
"grad_norm": 0.03565088286995888,
"learning_rate": 1.028175458580918e-06,
"loss": 0.0057,
"step": 1805
},
{
"epoch": 2.6158294181423924,
"grad_norm": 0.048903122544288635,
"learning_rate": 9.913540144404254e-07,
"loss": 0.0029,
"step": 1810
},
{
"epoch": 2.6230574629562704,
"grad_norm": 0.06714732199907303,
"learning_rate": 9.551695754915447e-07,
"loss": 0.0058,
"step": 1815
},
{
"epoch": 2.6302855077701484,
"grad_norm": 0.03730113059282303,
"learning_rate": 9.196247003513537e-07,
"loss": 0.0056,
"step": 1820
},
{
"epoch": 2.637513552584026,
"grad_norm": 0.008724790997803211,
"learning_rate": 8.84721902413097e-07,
"loss": 0.0042,
"step": 1825
},
{
"epoch": 2.644741597397904,
"grad_norm": 0.03197433799505234,
"learning_rate": 8.50463649668477e-07,
"loss": 0.0043,
"step": 1830
},
{
"epoch": 2.6519696422117818,
"grad_norm": 0.05495726689696312,
"learning_rate": 8.168523645331216e-07,
"loss": 0.0047,
"step": 1835
},
{
"epoch": 2.6591976870256593,
"grad_norm": 0.01701589673757553,
"learning_rate": 7.838904236753087e-07,
"loss": 0.0041,
"step": 1840
},
{
"epoch": 2.6664257318395372,
"grad_norm": 0.02677042968571186,
"learning_rate": 7.515801578479032e-07,
"loss": 0.0065,
"step": 1845
},
{
"epoch": 2.673653776653415,
"grad_norm": 0.014987285248935223,
"learning_rate": 7.199238517235541e-07,
"loss": 0.003,
"step": 1850
},
{
"epoch": 2.680881821467293,
"grad_norm": 0.011919076554477215,
"learning_rate": 6.889237437331398e-07,
"loss": 0.0036,
"step": 1855
},
{
"epoch": 2.688109866281171,
"grad_norm": 0.05691038444638252,
"learning_rate": 6.585820259074882e-07,
"loss": 0.005,
"step": 1860
},
{
"epoch": 2.6953379110950486,
"grad_norm": 0.021997489035129547,
"learning_rate": 6.289008437223798e-07,
"loss": 0.0061,
"step": 1865
},
{
"epoch": 2.7025659559089266,
"grad_norm": 0.04470158740878105,
"learning_rate": 5.998822959468409e-07,
"loss": 0.0044,
"step": 1870
},
{
"epoch": 2.7097940007228045,
"grad_norm": 0.023458922281861305,
"learning_rate": 5.715284344947358e-07,
"loss": 0.0052,
"step": 1875
},
{
"epoch": 2.7170220455366825,
"grad_norm": 0.007212420925498009,
"learning_rate": 5.438412642796686e-07,
"loss": 0.004,
"step": 1880
},
{
"epoch": 2.7242500903505604,
"grad_norm": 0.021170541644096375,
"learning_rate": 5.168227430732353e-07,
"loss": 0.0046,
"step": 1885
},
{
"epoch": 2.731478135164438,
"grad_norm": 0.04506688937544823,
"learning_rate": 4.904747813665656e-07,
"loss": 0.005,
"step": 1890
},
{
"epoch": 2.738706179978316,
"grad_norm": 0.03043074533343315,
"learning_rate": 4.6479924223524655e-07,
"loss": 0.0056,
"step": 1895
},
{
"epoch": 2.745934224792194,
"grad_norm": 0.06903711706399918,
"learning_rate": 4.39797941207577e-07,
"loss": 0.004,
"step": 1900
},
{
"epoch": 2.7531622696060714,
"grad_norm": 0.008001566864550114,
"learning_rate": 4.1547264613619243e-07,
"loss": 0.0052,
"step": 1905
},
{
"epoch": 2.7603903144199493,
"grad_norm": 0.016265859827399254,
"learning_rate": 3.9182507707305915e-07,
"loss": 0.0055,
"step": 1910
},
{
"epoch": 2.7676183592338273,
"grad_norm": 0.019273990765213966,
"learning_rate": 3.6885690614785197e-07,
"loss": 0.0043,
"step": 1915
},
{
"epoch": 2.774846404047705,
"grad_norm": 0.051555391401052475,
"learning_rate": 3.4656975744970846e-07,
"loss": 0.0046,
"step": 1920
},
{
"epoch": 2.782074448861583,
"grad_norm": 0.010696332901716232,
"learning_rate": 3.249652069124032e-07,
"loss": 0.0028,
"step": 1925
},
{
"epoch": 2.7893024936754607,
"grad_norm": 0.04233001545071602,
"learning_rate": 3.040447822028958e-07,
"loss": 0.0048,
"step": 1930
},
{
"epoch": 2.7965305384893386,
"grad_norm": 0.07739260792732239,
"learning_rate": 2.838099626133206e-07,
"loss": 0.004,
"step": 1935
},
{
"epoch": 2.8037585833032166,
"grad_norm": 0.05327356979250908,
"learning_rate": 2.642621789563848e-07,
"loss": 0.0069,
"step": 1940
},
{
"epoch": 2.810986628117094,
"grad_norm": 0.027605120092630386,
"learning_rate": 2.4540281346418946e-07,
"loss": 0.0037,
"step": 1945
},
{
"epoch": 2.818214672930972,
"grad_norm": 0.019115762785077095,
"learning_rate": 2.2723319969049307e-07,
"loss": 0.0036,
"step": 1950
},
{
"epoch": 2.82544271774485,
"grad_norm": 0.02181391790509224,
"learning_rate": 2.0975462241642042e-07,
"loss": 0.0024,
"step": 1955
},
{
"epoch": 2.832670762558728,
"grad_norm": 0.009364648722112179,
"learning_rate": 1.9296831755960753e-07,
"loss": 0.0052,
"step": 1960
},
{
"epoch": 2.839898807372606,
"grad_norm": 0.01776730641722679,
"learning_rate": 1.76875472086816e-07,
"loss": 0.0037,
"step": 1965
},
{
"epoch": 2.8471268521864834,
"grad_norm": 0.022552713751792908,
"learning_rate": 1.6147722392999887e-07,
"loss": 0.0038,
"step": 1970
},
{
"epoch": 2.8543548970003614,
"grad_norm": 0.05256934091448784,
"learning_rate": 1.467746619058341e-07,
"loss": 0.0049,
"step": 1975
},
{
"epoch": 2.8615829418142393,
"grad_norm": 0.01704435609281063,
"learning_rate": 1.327688256387416e-07,
"loss": 0.0042,
"step": 1980
},
{
"epoch": 2.868810986628117,
"grad_norm": 0.008595878258347511,
"learning_rate": 1.1946070548736532e-07,
"loss": 0.0035,
"step": 1985
},
{
"epoch": 2.876039031441995,
"grad_norm": 0.025025706738233566,
"learning_rate": 1.0685124247454159e-07,
"loss": 0.0038,
"step": 1990
},
{
"epoch": 2.8832670762558728,
"grad_norm": 0.018959928303956985,
"learning_rate": 9.494132822077007e-08,
"loss": 0.0043,
"step": 1995
},
{
"epoch": 2.8904951210697507,
"grad_norm": 0.01306887436658144,
"learning_rate": 8.373180488115529e-08,
"loss": 0.005,
"step": 2000
},
{
"epoch": 2.8904951210697507,
"eval_loss": 0.1443248987197876,
"eval_runtime": 1199.7992,
"eval_samples_per_second": 53.514,
"eval_steps_per_second": 1.673,
"step": 2000
},
{
"epoch": 2.8977231658836287,
"grad_norm": 0.07084480673074722,
"learning_rate": 7.322346508586209e-08,
"loss": 0.0043,
"step": 2005
},
{
"epoch": 2.904951210697506,
"grad_norm": 0.04223432019352913,
"learning_rate": 6.341705188407043e-08,
"loss": 0.0048,
"step": 2010
},
{
"epoch": 2.912179255511384,
"grad_norm": 0.019037162885069847,
"learning_rate": 5.431325869143189e-08,
"loss": 0.0054,
"step": 2015
},
{
"epoch": 2.919407300325262,
"grad_norm": 0.014710099436342716,
"learning_rate": 4.5912729241036624e-08,
"loss": 0.0038,
"step": 2020
},
{
"epoch": 2.9266353451391396,
"grad_norm": 0.012474890798330307,
"learning_rate": 3.821605753789648e-08,
"loss": 0.0038,
"step": 2025
},
{
"epoch": 2.933863389953018,
"grad_norm": 0.01217850111424923,
"learning_rate": 3.122378781694524e-08,
"loss": 0.0028,
"step": 2030
},
{
"epoch": 2.9410914347668955,
"grad_norm": 0.0497884601354599,
"learning_rate": 2.493641450454942e-08,
"loss": 0.0041,
"step": 2035
},
{
"epoch": 2.9483194795807735,
"grad_norm": 0.013583734631538391,
"learning_rate": 1.93543821835529e-08,
"loss": 0.0037,
"step": 2040
},
{
"epoch": 2.9555475243946514,
"grad_norm": 0.046894483268260956,
"learning_rate": 1.4478085561835387e-08,
"loss": 0.0041,
"step": 2045
},
{
"epoch": 2.962775569208529,
"grad_norm": 0.011021456681191921,
"learning_rate": 1.0307869444406981e-08,
"loss": 0.0043,
"step": 2050
},
{
"epoch": 2.970003614022407,
"grad_norm": 0.008113077841699123,
"learning_rate": 6.844028709024342e-09,
"loss": 0.0053,
"step": 2055
},
{
"epoch": 2.977231658836285,
"grad_norm": 0.02427099458873272,
"learning_rate": 4.086808285338472e-09,
"loss": 0.0043,
"step": 2060
},
{
"epoch": 2.9844597036501628,
"grad_norm": 0.0245045255869627,
"learning_rate": 2.0364031375819104e-09,
"loss": 0.0034,
"step": 2065
},
{
"epoch": 2.9916877484640407,
"grad_norm": 0.016123216599225998,
"learning_rate": 6.929582507719801e-10,
"loss": 0.0055,
"step": 2070
},
{
"epoch": 2.9989157932779182,
"grad_norm": 0.04469776526093483,
"learning_rate": 5.6568620471209035e-11,
"loss": 0.0047,
"step": 2075
},
{
"epoch": 3.0,
"step": 2076,
"total_flos": 6.274047864041636e+18,
"train_loss": 0.03664169063040653,
"train_runtime": 42810.6565,
"train_samples_per_second": 6.203,
"train_steps_per_second": 0.048
}
],
"logging_steps": 5,
"max_steps": 2076,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.274047864041636e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}