Files
Qwen2.5-Coder-14B-Instruct-…/trainer_state.json

4181 lines
109 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5916,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016904205977749838,
"grad_norm": 1.5393127157503101,
"learning_rate": 1.5202702702702706e-07,
"loss": 0.7280044555664062,
"step": 10
},
{
"epoch": 0.0033808411955499676,
"grad_norm": 1.4909846560751012,
"learning_rate": 3.2094594594594594e-07,
"loss": 0.7421677589416504,
"step": 20
},
{
"epoch": 0.005071261793324951,
"grad_norm": 1.1386498100093034,
"learning_rate": 4.898648648648649e-07,
"loss": 0.7222579956054688,
"step": 30
},
{
"epoch": 0.006761682391099935,
"grad_norm": 0.8683143176223492,
"learning_rate": 6.587837837837838e-07,
"loss": 0.6997623920440674,
"step": 40
},
{
"epoch": 0.008452102988874919,
"grad_norm": 0.6915935865750568,
"learning_rate": 8.277027027027028e-07,
"loss": 0.6706014156341553,
"step": 50
},
{
"epoch": 0.010142523586649903,
"grad_norm": 0.5119352644321808,
"learning_rate": 9.966216216216217e-07,
"loss": 0.6358686923980713,
"step": 60
},
{
"epoch": 0.011832944184424887,
"grad_norm": 0.4337471829765602,
"learning_rate": 1.1655405405405406e-06,
"loss": 0.618624210357666,
"step": 70
},
{
"epoch": 0.01352336478219987,
"grad_norm": 0.4205476110134685,
"learning_rate": 1.3344594594594596e-06,
"loss": 0.6060873508453369,
"step": 80
},
{
"epoch": 0.015213785379974854,
"grad_norm": 0.39627091778455564,
"learning_rate": 1.5033783783783785e-06,
"loss": 0.5913969993591308,
"step": 90
},
{
"epoch": 0.016904205977749838,
"grad_norm": 0.40212736895916457,
"learning_rate": 1.6722972972972977e-06,
"loss": 0.5814601421356201,
"step": 100
},
{
"epoch": 0.018594626575524822,
"grad_norm": 0.4773638249628802,
"learning_rate": 1.8412162162162164e-06,
"loss": 0.5719439029693604,
"step": 110
},
{
"epoch": 0.020285047173299806,
"grad_norm": 0.38472922772015794,
"learning_rate": 2.0101351351351353e-06,
"loss": 0.5604746818542481,
"step": 120
},
{
"epoch": 0.02197546777107479,
"grad_norm": 0.40656017027176017,
"learning_rate": 2.1790540540540543e-06,
"loss": 0.5584488391876221,
"step": 130
},
{
"epoch": 0.023665888368849773,
"grad_norm": 0.371308302071795,
"learning_rate": 2.347972972972973e-06,
"loss": 0.5508419036865234,
"step": 140
},
{
"epoch": 0.025356308966624757,
"grad_norm": 0.6240307244052877,
"learning_rate": 2.516891891891892e-06,
"loss": 0.545355224609375,
"step": 150
},
{
"epoch": 0.02704672956439974,
"grad_norm": 0.4324101126507423,
"learning_rate": 2.685810810810811e-06,
"loss": 0.5469128608703613,
"step": 160
},
{
"epoch": 0.028737150162174725,
"grad_norm": 0.4050600506019846,
"learning_rate": 2.85472972972973e-06,
"loss": 0.5449463367462158,
"step": 170
},
{
"epoch": 0.03042757075994971,
"grad_norm": 0.40162846746542147,
"learning_rate": 3.023648648648649e-06,
"loss": 0.5376415252685547,
"step": 180
},
{
"epoch": 0.032117991357724696,
"grad_norm": 0.38755458720694097,
"learning_rate": 3.192567567567568e-06,
"loss": 0.5347342491149902,
"step": 190
},
{
"epoch": 0.033808411955499676,
"grad_norm": 0.4920608811887175,
"learning_rate": 3.3614864864864864e-06,
"loss": 0.537785816192627,
"step": 200
},
{
"epoch": 0.03549883255327466,
"grad_norm": 0.4350756261944741,
"learning_rate": 3.5304054054054053e-06,
"loss": 0.5192846775054931,
"step": 210
},
{
"epoch": 0.037189253151049644,
"grad_norm": 0.4693428284277957,
"learning_rate": 3.6993243243243247e-06,
"loss": 0.5229539394378662,
"step": 220
},
{
"epoch": 0.03887967374882463,
"grad_norm": 0.47341352669542003,
"learning_rate": 3.868243243243244e-06,
"loss": 0.5329394340515137,
"step": 230
},
{
"epoch": 0.04057009434659961,
"grad_norm": 0.42129758409095647,
"learning_rate": 4.037162162162163e-06,
"loss": 0.5307604312896729,
"step": 240
},
{
"epoch": 0.0422605149443746,
"grad_norm": 0.40815950715332777,
"learning_rate": 4.206081081081081e-06,
"loss": 0.5192375183105469,
"step": 250
},
{
"epoch": 0.04395093554214958,
"grad_norm": 0.4200400303493133,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.5104325294494629,
"step": 260
},
{
"epoch": 0.045641356139924566,
"grad_norm": 0.4759394203212884,
"learning_rate": 4.543918918918919e-06,
"loss": 0.5136491775512695,
"step": 270
},
{
"epoch": 0.047331776737699546,
"grad_norm": 0.4354678343040916,
"learning_rate": 4.712837837837838e-06,
"loss": 0.5056795597076416,
"step": 280
},
{
"epoch": 0.049022197335474534,
"grad_norm": 0.4796606080973553,
"learning_rate": 4.881756756756757e-06,
"loss": 0.5168024063110351,
"step": 290
},
{
"epoch": 0.050712617933249514,
"grad_norm": 0.5075769744330321,
"learning_rate": 5.050675675675676e-06,
"loss": 0.5202849388122559,
"step": 300
},
{
"epoch": 0.0524030385310245,
"grad_norm": 0.4153292083599306,
"learning_rate": 5.219594594594595e-06,
"loss": 0.5088020324707031,
"step": 310
},
{
"epoch": 0.05409345912879948,
"grad_norm": 0.6184601220323779,
"learning_rate": 5.388513513513513e-06,
"loss": 0.5058822154998779,
"step": 320
},
{
"epoch": 0.05578387972657447,
"grad_norm": 0.5404243375856757,
"learning_rate": 5.557432432432433e-06,
"loss": 0.5120021820068359,
"step": 330
},
{
"epoch": 0.05747430032434945,
"grad_norm": 0.4787472095986187,
"learning_rate": 5.726351351351351e-06,
"loss": 0.5063863754272461,
"step": 340
},
{
"epoch": 0.059164720922124436,
"grad_norm": 0.4911006620908002,
"learning_rate": 5.8952702702702705e-06,
"loss": 0.502506160736084,
"step": 350
},
{
"epoch": 0.06085514151989942,
"grad_norm": 0.7405518769222905,
"learning_rate": 6.06418918918919e-06,
"loss": 0.4920937538146973,
"step": 360
},
{
"epoch": 0.0625455621176744,
"grad_norm": 0.47943676154065007,
"learning_rate": 6.233108108108109e-06,
"loss": 0.5055969715118408,
"step": 370
},
{
"epoch": 0.06423598271544939,
"grad_norm": 0.5295416406304393,
"learning_rate": 6.402027027027028e-06,
"loss": 0.49520511627197267,
"step": 380
},
{
"epoch": 0.06592640331322437,
"grad_norm": 0.44250966671562564,
"learning_rate": 6.570945945945947e-06,
"loss": 0.5030886650085449,
"step": 390
},
{
"epoch": 0.06761682391099935,
"grad_norm": 0.5383736396097062,
"learning_rate": 6.739864864864866e-06,
"loss": 0.4966264247894287,
"step": 400
},
{
"epoch": 0.06930724450877435,
"grad_norm": 0.4378624064122213,
"learning_rate": 6.908783783783785e-06,
"loss": 0.5109675407409668,
"step": 410
},
{
"epoch": 0.07099766510654933,
"grad_norm": 0.7886546643219341,
"learning_rate": 7.0777027027027035e-06,
"loss": 0.4995694637298584,
"step": 420
},
{
"epoch": 0.0726880857043243,
"grad_norm": 0.44892133998631173,
"learning_rate": 7.246621621621622e-06,
"loss": 0.4928102970123291,
"step": 430
},
{
"epoch": 0.07437850630209929,
"grad_norm": 0.4522504419948549,
"learning_rate": 7.415540540540541e-06,
"loss": 0.4935169219970703,
"step": 440
},
{
"epoch": 0.07606892689987428,
"grad_norm": 0.47388387012658867,
"learning_rate": 7.58445945945946e-06,
"loss": 0.4919950008392334,
"step": 450
},
{
"epoch": 0.07775934749764926,
"grad_norm": 0.4751247826361683,
"learning_rate": 7.753378378378378e-06,
"loss": 0.4960982322692871,
"step": 460
},
{
"epoch": 0.07944976809542424,
"grad_norm": 0.48392017417638244,
"learning_rate": 7.922297297297298e-06,
"loss": 0.4992071151733398,
"step": 470
},
{
"epoch": 0.08114018869319922,
"grad_norm": 0.4974127003472514,
"learning_rate": 8.091216216216217e-06,
"loss": 0.4960615634918213,
"step": 480
},
{
"epoch": 0.08283060929097422,
"grad_norm": 0.45126823416690404,
"learning_rate": 8.260135135135135e-06,
"loss": 0.4928754806518555,
"step": 490
},
{
"epoch": 0.0845210298887492,
"grad_norm": 0.5896794617176134,
"learning_rate": 8.429054054054054e-06,
"loss": 0.49259119033813475,
"step": 500
},
{
"epoch": 0.08621145048652418,
"grad_norm": 0.46198142303389655,
"learning_rate": 8.597972972972974e-06,
"loss": 0.4931736946105957,
"step": 510
},
{
"epoch": 0.08790187108429916,
"grad_norm": 0.5089850771284543,
"learning_rate": 8.766891891891893e-06,
"loss": 0.48741836547851564,
"step": 520
},
{
"epoch": 0.08959229168207415,
"grad_norm": 0.498017321228989,
"learning_rate": 8.93581081081081e-06,
"loss": 0.4871868133544922,
"step": 530
},
{
"epoch": 0.09128271227984913,
"grad_norm": 0.4364519713463667,
"learning_rate": 9.104729729729732e-06,
"loss": 0.4887679100036621,
"step": 540
},
{
"epoch": 0.09297313287762411,
"grad_norm": 0.4482779936423717,
"learning_rate": 9.27364864864865e-06,
"loss": 0.48195371627807615,
"step": 550
},
{
"epoch": 0.09466355347539909,
"grad_norm": 0.4562904990399742,
"learning_rate": 9.442567567567569e-06,
"loss": 0.48915772438049315,
"step": 560
},
{
"epoch": 0.09635397407317409,
"grad_norm": 0.49452467224625607,
"learning_rate": 9.611486486486488e-06,
"loss": 0.4924494743347168,
"step": 570
},
{
"epoch": 0.09804439467094907,
"grad_norm": 0.4652453454681303,
"learning_rate": 9.780405405405407e-06,
"loss": 0.4876460075378418,
"step": 580
},
{
"epoch": 0.09973481526872405,
"grad_norm": 0.46684881016372576,
"learning_rate": 9.949324324324325e-06,
"loss": 0.4890481948852539,
"step": 590
},
{
"epoch": 0.10142523586649903,
"grad_norm": 0.5490321111052099,
"learning_rate": 9.999957346063017e-06,
"loss": 0.484678840637207,
"step": 600
},
{
"epoch": 0.10311565646427402,
"grad_norm": 0.7722942859795399,
"learning_rate": 9.999748430572558e-06,
"loss": 0.48883733749389646,
"step": 610
},
{
"epoch": 0.104806077062049,
"grad_norm": 0.5530949398407944,
"learning_rate": 9.999365426397275e-06,
"loss": 0.48575553894042967,
"step": 620
},
{
"epoch": 0.10649649765982398,
"grad_norm": 0.47335899243694096,
"learning_rate": 9.998808346873179e-06,
"loss": 0.4800987720489502,
"step": 630
},
{
"epoch": 0.10818691825759896,
"grad_norm": 0.47768175202180385,
"learning_rate": 9.99807721139749e-06,
"loss": 0.47647809982299805,
"step": 640
},
{
"epoch": 0.10987733885537396,
"grad_norm": 0.46600234628629655,
"learning_rate": 9.997172045427974e-06,
"loss": 0.47447705268859863,
"step": 650
},
{
"epoch": 0.11156775945314894,
"grad_norm": 0.5180380166358451,
"learning_rate": 9.996092880482047e-06,
"loss": 0.4927847385406494,
"step": 660
},
{
"epoch": 0.11325818005092392,
"grad_norm": 0.7812305828383127,
"learning_rate": 9.99483975413568e-06,
"loss": 0.48745927810668943,
"step": 670
},
{
"epoch": 0.1149486006486989,
"grad_norm": 0.43644647448782514,
"learning_rate": 9.993412710022096e-06,
"loss": 0.48123817443847655,
"step": 680
},
{
"epoch": 0.11663902124647389,
"grad_norm": 0.49127236612828806,
"learning_rate": 9.991811797830238e-06,
"loss": 0.4744853973388672,
"step": 690
},
{
"epoch": 0.11832944184424887,
"grad_norm": 0.5833255708338916,
"learning_rate": 9.99003707330305e-06,
"loss": 0.48299551010131836,
"step": 700
},
{
"epoch": 0.12001986244202385,
"grad_norm": 0.5184397410093639,
"learning_rate": 9.988088598235532e-06,
"loss": 0.48394503593444826,
"step": 710
},
{
"epoch": 0.12171028303979883,
"grad_norm": 0.5258890543376203,
"learning_rate": 9.985966440472594e-06,
"loss": 0.4847887992858887,
"step": 720
},
{
"epoch": 0.12340070363757383,
"grad_norm": 0.4640524383999511,
"learning_rate": 9.98367067390668e-06,
"loss": 0.48206357955932616,
"step": 730
},
{
"epoch": 0.1250911242353488,
"grad_norm": 0.4273195720362554,
"learning_rate": 9.981201378475213e-06,
"loss": 0.48335995674133303,
"step": 740
},
{
"epoch": 0.1267815448331238,
"grad_norm": 0.4888196321723977,
"learning_rate": 9.978558640157794e-06,
"loss": 0.4779070854187012,
"step": 750
},
{
"epoch": 0.12847196543089878,
"grad_norm": 0.4492781187011461,
"learning_rate": 9.975742550973223e-06,
"loss": 0.47565546035766604,
"step": 760
},
{
"epoch": 0.13016238602867375,
"grad_norm": 0.6183045865145201,
"learning_rate": 9.972753208976283e-06,
"loss": 0.4773505687713623,
"step": 770
},
{
"epoch": 0.13185280662644874,
"grad_norm": 0.4825877361384316,
"learning_rate": 9.969590718254337e-06,
"loss": 0.47838778495788575,
"step": 780
},
{
"epoch": 0.13354322722422374,
"grad_norm": 0.4228373437680581,
"learning_rate": 9.966255188923694e-06,
"loss": 0.47666234970092775,
"step": 790
},
{
"epoch": 0.1352336478219987,
"grad_norm": 0.49439709272758775,
"learning_rate": 9.962746737125783e-06,
"loss": 0.47107582092285155,
"step": 800
},
{
"epoch": 0.1369240684197737,
"grad_norm": 0.4644619381643305,
"learning_rate": 9.959065485023099e-06,
"loss": 0.47295198440551756,
"step": 810
},
{
"epoch": 0.1386144890175487,
"grad_norm": 0.40092407936813573,
"learning_rate": 9.95521156079496e-06,
"loss": 0.46324882507324217,
"step": 820
},
{
"epoch": 0.14030490961532366,
"grad_norm": 0.40293191886864566,
"learning_rate": 9.951185098633039e-06,
"loss": 0.4737586975097656,
"step": 830
},
{
"epoch": 0.14199533021309865,
"grad_norm": 0.4315377378558263,
"learning_rate": 9.946986238736688e-06,
"loss": 0.47527265548706055,
"step": 840
},
{
"epoch": 0.14368575081087362,
"grad_norm": 0.43129828851312546,
"learning_rate": 9.942615127308064e-06,
"loss": 0.4667802810668945,
"step": 850
},
{
"epoch": 0.1453761714086486,
"grad_norm": 0.7253025541777565,
"learning_rate": 9.938071916547033e-06,
"loss": 0.4730061531066895,
"step": 860
},
{
"epoch": 0.1470665920064236,
"grad_norm": 0.4358429443768519,
"learning_rate": 9.933356764645871e-06,
"loss": 0.4679386138916016,
"step": 870
},
{
"epoch": 0.14875701260419857,
"grad_norm": 0.5060899359980359,
"learning_rate": 9.928469835783757e-06,
"loss": 0.47495126724243164,
"step": 880
},
{
"epoch": 0.15044743320197357,
"grad_norm": 0.3895332485316844,
"learning_rate": 9.923411300121055e-06,
"loss": 0.4657557010650635,
"step": 890
},
{
"epoch": 0.15213785379974856,
"grad_norm": 0.43200962105069246,
"learning_rate": 9.918181333793393e-06,
"loss": 0.4725798606872559,
"step": 900
},
{
"epoch": 0.15382827439752353,
"grad_norm": 0.5031797328287678,
"learning_rate": 9.912780118905524e-06,
"loss": 0.4736931800842285,
"step": 910
},
{
"epoch": 0.15551869499529852,
"grad_norm": 0.45701564911823583,
"learning_rate": 9.90720784352499e-06,
"loss": 0.47264862060546875,
"step": 920
},
{
"epoch": 0.1572091155930735,
"grad_norm": 0.4450162875737329,
"learning_rate": 9.901464701675575e-06,
"loss": 0.47407169342041017,
"step": 930
},
{
"epoch": 0.15889953619084848,
"grad_norm": 0.45122564139548105,
"learning_rate": 9.895550893330537e-06,
"loss": 0.4679454803466797,
"step": 940
},
{
"epoch": 0.16058995678862348,
"grad_norm": 0.4573555163342898,
"learning_rate": 9.889466624405664e-06,
"loss": 0.4702300071716309,
"step": 950
},
{
"epoch": 0.16228037738639844,
"grad_norm": 0.4499082495323313,
"learning_rate": 9.883212106752088e-06,
"loss": 0.4699045181274414,
"step": 960
},
{
"epoch": 0.16397079798417344,
"grad_norm": 0.4931931137952571,
"learning_rate": 9.876787558148918e-06,
"loss": 0.4716080665588379,
"step": 970
},
{
"epoch": 0.16566121858194843,
"grad_norm": 0.42457299398663156,
"learning_rate": 9.87019320229565e-06,
"loss": 0.4631767272949219,
"step": 980
},
{
"epoch": 0.1673516391797234,
"grad_norm": 0.4329347769304198,
"learning_rate": 9.863429268804388e-06,
"loss": 0.46808829307556155,
"step": 990
},
{
"epoch": 0.1690420597774984,
"grad_norm": 0.48564688866162065,
"learning_rate": 9.856495993191836e-06,
"loss": 0.4762742042541504,
"step": 1000
},
{
"epoch": 0.17073248037527336,
"grad_norm": 0.4322957425108914,
"learning_rate": 9.849393616871107e-06,
"loss": 0.46973333358764646,
"step": 1010
},
{
"epoch": 0.17242290097304835,
"grad_norm": 0.44726230667077793,
"learning_rate": 9.842122387143317e-06,
"loss": 0.46886191368103025,
"step": 1020
},
{
"epoch": 0.17411332157082335,
"grad_norm": 0.4877471481922099,
"learning_rate": 9.834682557188967e-06,
"loss": 0.46724977493286135,
"step": 1030
},
{
"epoch": 0.17580374216859831,
"grad_norm": 0.45139860892547107,
"learning_rate": 9.827074386059135e-06,
"loss": 0.468550968170166,
"step": 1040
},
{
"epoch": 0.1774941627663733,
"grad_norm": 0.4083605201702828,
"learning_rate": 9.819298138666446e-06,
"loss": 0.4680886745452881,
"step": 1050
},
{
"epoch": 0.1791845833641483,
"grad_norm": 0.5192065551375825,
"learning_rate": 9.811354085775865e-06,
"loss": 0.4614398002624512,
"step": 1060
},
{
"epoch": 0.18087500396192327,
"grad_norm": 0.41211296295044714,
"learning_rate": 9.80324250399525e-06,
"loss": 0.46942787170410155,
"step": 1070
},
{
"epoch": 0.18256542455969826,
"grad_norm": 0.5801659241580247,
"learning_rate": 9.794963675765734e-06,
"loss": 0.4630770206451416,
"step": 1080
},
{
"epoch": 0.18425584515747323,
"grad_norm": 0.5983535960495054,
"learning_rate": 9.786517889351882e-06,
"loss": 0.46579856872558595,
"step": 1090
},
{
"epoch": 0.18594626575524822,
"grad_norm": 0.43798357338036104,
"learning_rate": 9.777905438831663e-06,
"loss": 0.46267199516296387,
"step": 1100
},
{
"epoch": 0.18763668635302322,
"grad_norm": 0.4325962700066863,
"learning_rate": 9.769126624086202e-06,
"loss": 0.4716958999633789,
"step": 1110
},
{
"epoch": 0.18932710695079819,
"grad_norm": 0.41620408070601617,
"learning_rate": 9.76018175078934e-06,
"loss": 0.46593875885009767,
"step": 1120
},
{
"epoch": 0.19101752754857318,
"grad_norm": 0.6965781109451706,
"learning_rate": 9.751071130396991e-06,
"loss": 0.4633523464202881,
"step": 1130
},
{
"epoch": 0.19270794814634817,
"grad_norm": 0.4459465191100755,
"learning_rate": 9.741795080136305e-06,
"loss": 0.47896766662597656,
"step": 1140
},
{
"epoch": 0.19439836874412314,
"grad_norm": 0.6319696655865172,
"learning_rate": 9.732353922994608e-06,
"loss": 0.46693859100341795,
"step": 1150
},
{
"epoch": 0.19608878934189813,
"grad_norm": 0.47012774591561624,
"learning_rate": 9.722747987708165e-06,
"loss": 0.4546792984008789,
"step": 1160
},
{
"epoch": 0.19777920993967313,
"grad_norm": 0.832723029381089,
"learning_rate": 9.712977608750735e-06,
"loss": 0.47440481185913086,
"step": 1170
},
{
"epoch": 0.1994696305374481,
"grad_norm": 0.4829770005211469,
"learning_rate": 9.703043126321921e-06,
"loss": 0.46451354026794434,
"step": 1180
},
{
"epoch": 0.2011600511352231,
"grad_norm": 0.47584739817374816,
"learning_rate": 9.692944886335319e-06,
"loss": 0.46211748123168944,
"step": 1190
},
{
"epoch": 0.20285047173299806,
"grad_norm": 0.39648704616543984,
"learning_rate": 9.682683240406485e-06,
"loss": 0.47053070068359376,
"step": 1200
},
{
"epoch": 0.20454089233077305,
"grad_norm": 0.4534813956704384,
"learning_rate": 9.672258545840687e-06,
"loss": 0.462983226776123,
"step": 1210
},
{
"epoch": 0.20623131292854804,
"grad_norm": 0.4391686833686041,
"learning_rate": 9.66167116562046e-06,
"loss": 0.4643733024597168,
"step": 1220
},
{
"epoch": 0.207921733526323,
"grad_norm": 0.3992535539144578,
"learning_rate": 9.650921468392974e-06,
"loss": 0.45770740509033203,
"step": 1230
},
{
"epoch": 0.209612154124098,
"grad_norm": 0.6549411613955338,
"learning_rate": 9.640009828457187e-06,
"loss": 0.46063737869262694,
"step": 1240
},
{
"epoch": 0.211302574721873,
"grad_norm": 0.4204768929959213,
"learning_rate": 9.628936625750828e-06,
"loss": 0.46723523139953616,
"step": 1250
},
{
"epoch": 0.21299299531964797,
"grad_norm": 0.38551469436375513,
"learning_rate": 9.617702245837157e-06,
"loss": 0.45136494636535646,
"step": 1260
},
{
"epoch": 0.21468341591742296,
"grad_norm": 0.4206545742529992,
"learning_rate": 9.606307079891537e-06,
"loss": 0.4603860855102539,
"step": 1270
},
{
"epoch": 0.21637383651519793,
"grad_norm": 0.5019242021118502,
"learning_rate": 9.594751524687821e-06,
"loss": 0.46168107986450196,
"step": 1280
},
{
"epoch": 0.21806425711297292,
"grad_norm": 0.4065610280835682,
"learning_rate": 9.583035982584538e-06,
"loss": 0.46237959861755373,
"step": 1290
},
{
"epoch": 0.21975467771074791,
"grad_norm": 0.4201022803564497,
"learning_rate": 9.571160861510875e-06,
"loss": 0.45450439453125,
"step": 1300
},
{
"epoch": 0.22144509830852288,
"grad_norm": 0.48129418825307496,
"learning_rate": 9.559126574952477e-06,
"loss": 0.45772609710693357,
"step": 1310
},
{
"epoch": 0.22313551890629787,
"grad_norm": 0.4380826104631551,
"learning_rate": 9.546933541937052e-06,
"loss": 0.4615782737731934,
"step": 1320
},
{
"epoch": 0.22482593950407287,
"grad_norm": 0.5041722965228266,
"learning_rate": 9.534582187019777e-06,
"loss": 0.45874805450439454,
"step": 1330
},
{
"epoch": 0.22651636010184784,
"grad_norm": 0.5332437794142343,
"learning_rate": 9.522072940268515e-06,
"loss": 0.4533642292022705,
"step": 1340
},
{
"epoch": 0.22820678069962283,
"grad_norm": 0.49071947040909053,
"learning_rate": 9.509406237248847e-06,
"loss": 0.4662328720092773,
"step": 1350
},
{
"epoch": 0.2298972012973978,
"grad_norm": 0.40385017293891995,
"learning_rate": 9.496582519008897e-06,
"loss": 0.4652996063232422,
"step": 1360
},
{
"epoch": 0.2315876218951728,
"grad_norm": 0.4146237415189512,
"learning_rate": 9.483602232063979e-06,
"loss": 0.4565859794616699,
"step": 1370
},
{
"epoch": 0.23327804249294778,
"grad_norm": 0.46073779616681215,
"learning_rate": 9.47046582838105e-06,
"loss": 0.46174845695495603,
"step": 1380
},
{
"epoch": 0.23496846309072275,
"grad_norm": 0.38101750619091945,
"learning_rate": 9.45717376536297e-06,
"loss": 0.4629813194274902,
"step": 1390
},
{
"epoch": 0.23665888368849775,
"grad_norm": 0.4379618065240927,
"learning_rate": 9.443726505832584e-06,
"loss": 0.46460161209106443,
"step": 1400
},
{
"epoch": 0.23834930428627274,
"grad_norm": 0.4676173735785799,
"learning_rate": 9.43012451801659e-06,
"loss": 0.4546971321105957,
"step": 1410
},
{
"epoch": 0.2400397248840477,
"grad_norm": 0.4702750217508668,
"learning_rate": 9.416368275529255e-06,
"loss": 0.45699052810668944,
"step": 1420
},
{
"epoch": 0.2417301454818227,
"grad_norm": 0.570311072883872,
"learning_rate": 9.402458257355911e-06,
"loss": 0.4610409736633301,
"step": 1430
},
{
"epoch": 0.24342056607959767,
"grad_norm": 0.4087099940494567,
"learning_rate": 9.388394947836278e-06,
"loss": 0.46268315315246583,
"step": 1440
},
{
"epoch": 0.24511098667737266,
"grad_norm": 0.596327200385323,
"learning_rate": 9.374178836647609e-06,
"loss": 0.4597465515136719,
"step": 1450
},
{
"epoch": 0.24680140727514766,
"grad_norm": 0.4268602658887389,
"learning_rate": 9.359810418787626e-06,
"loss": 0.4541053295135498,
"step": 1460
},
{
"epoch": 0.24849182787292262,
"grad_norm": 0.5653285477886548,
"learning_rate": 9.3452901945573e-06,
"loss": 0.45857672691345214,
"step": 1470
},
{
"epoch": 0.2501822484706976,
"grad_norm": 0.7824638784414897,
"learning_rate": 9.33061866954341e-06,
"loss": 0.45687179565429686,
"step": 1480
},
{
"epoch": 0.2518726690684726,
"grad_norm": 0.4685023923882284,
"learning_rate": 9.31579635460096e-06,
"loss": 0.45577211380004884,
"step": 1490
},
{
"epoch": 0.2535630896662476,
"grad_norm": 0.42596174239181667,
"learning_rate": 9.300823765835385e-06,
"loss": 0.4546334266662598,
"step": 1500
},
{
"epoch": 0.25525351026402254,
"grad_norm": 0.459993088579298,
"learning_rate": 9.285701424584568e-06,
"loss": 0.4472480773925781,
"step": 1510
},
{
"epoch": 0.25694393086179756,
"grad_norm": 0.596362594134855,
"learning_rate": 9.270429857400703e-06,
"loss": 0.453325891494751,
"step": 1520
},
{
"epoch": 0.25863435145957253,
"grad_norm": 0.4178802706307328,
"learning_rate": 9.255009596031952e-06,
"loss": 0.4538599967956543,
"step": 1530
},
{
"epoch": 0.2603247720573475,
"grad_norm": 0.41372980820233307,
"learning_rate": 9.239441177403938e-06,
"loss": 0.45840139389038087,
"step": 1540
},
{
"epoch": 0.2620151926551225,
"grad_norm": 0.44229655857380346,
"learning_rate": 9.223725143601037e-06,
"loss": 0.44556608200073244,
"step": 1550
},
{
"epoch": 0.2637056132528975,
"grad_norm": 0.4568368063923109,
"learning_rate": 9.207862041847513e-06,
"loss": 0.4543326377868652,
"step": 1560
},
{
"epoch": 0.26539603385067245,
"grad_norm": 0.3992030096330564,
"learning_rate": 9.191852424488464e-06,
"loss": 0.45322580337524415,
"step": 1570
},
{
"epoch": 0.2670864544484475,
"grad_norm": 0.4800872482091329,
"learning_rate": 9.175696848970579e-06,
"loss": 0.448167610168457,
"step": 1580
},
{
"epoch": 0.26877687504622244,
"grad_norm": 1.5893046541101095,
"learning_rate": 9.159395877822743e-06,
"loss": 0.4591785430908203,
"step": 1590
},
{
"epoch": 0.2704672956439974,
"grad_norm": 0.7772627717928515,
"learning_rate": 9.142950078636438e-06,
"loss": 0.44793338775634767,
"step": 1600
},
{
"epoch": 0.27215771624177243,
"grad_norm": 0.45054785945064246,
"learning_rate": 9.126360024045987e-06,
"loss": 0.45564422607421873,
"step": 1610
},
{
"epoch": 0.2738481368395474,
"grad_norm": 0.4062144277621208,
"learning_rate": 9.10962629170861e-06,
"loss": 0.4568020820617676,
"step": 1620
},
{
"epoch": 0.27553855743732236,
"grad_norm": 0.4034609731421723,
"learning_rate": 9.092749464284316e-06,
"loss": 0.45940208435058594,
"step": 1630
},
{
"epoch": 0.2772289780350974,
"grad_norm": 0.4018136021331866,
"learning_rate": 9.075730129415605e-06,
"loss": 0.45337772369384766,
"step": 1640
},
{
"epoch": 0.27891939863287235,
"grad_norm": 0.43962151340240485,
"learning_rate": 9.058568879707024e-06,
"loss": 0.45505146980285643,
"step": 1650
},
{
"epoch": 0.2806098192306473,
"grad_norm": 0.4199289492540323,
"learning_rate": 9.041266312704511e-06,
"loss": 0.45896520614624026,
"step": 1660
},
{
"epoch": 0.2823002398284223,
"grad_norm": 0.4181263154601458,
"learning_rate": 9.023823030874608e-06,
"loss": 0.460459041595459,
"step": 1670
},
{
"epoch": 0.2839906604261973,
"grad_norm": 0.40795828627859493,
"learning_rate": 9.006239641583471e-06,
"loss": 0.45454959869384765,
"step": 1680
},
{
"epoch": 0.28568108102397227,
"grad_norm": 0.4046378255672331,
"learning_rate": 8.98851675707573e-06,
"loss": 0.45649113655090334,
"step": 1690
},
{
"epoch": 0.28737150162174724,
"grad_norm": 0.40847471276070824,
"learning_rate": 8.970654994453163e-06,
"loss": 0.4552486419677734,
"step": 1700
},
{
"epoch": 0.28906192221952226,
"grad_norm": 0.38887379249470955,
"learning_rate": 8.952654975653215e-06,
"loss": 0.446870231628418,
"step": 1710
},
{
"epoch": 0.2907523428172972,
"grad_norm": 0.44918248701550884,
"learning_rate": 8.93451732742734e-06,
"loss": 0.4545734405517578,
"step": 1720
},
{
"epoch": 0.2924427634150722,
"grad_norm": 0.42111108783651335,
"learning_rate": 8.91624268131918e-06,
"loss": 0.4571378231048584,
"step": 1730
},
{
"epoch": 0.2941331840128472,
"grad_norm": 0.4376361106159771,
"learning_rate": 8.89783167364257e-06,
"loss": 0.4523021697998047,
"step": 1740
},
{
"epoch": 0.2958236046106222,
"grad_norm": 0.4125543712485479,
"learning_rate": 8.879284945459388e-06,
"loss": 0.45134286880493163,
"step": 1750
},
{
"epoch": 0.29751402520839715,
"grad_norm": 0.39223691908106756,
"learning_rate": 8.860603142557227e-06,
"loss": 0.4513846397399902,
"step": 1760
},
{
"epoch": 0.29920444580617217,
"grad_norm": 0.37509510978050736,
"learning_rate": 8.841786915426918e-06,
"loss": 0.4400909900665283,
"step": 1770
},
{
"epoch": 0.30089486640394714,
"grad_norm": 0.44139926255235873,
"learning_rate": 8.822836919239873e-06,
"loss": 0.45766630172729494,
"step": 1780
},
{
"epoch": 0.3025852870017221,
"grad_norm": 0.48830742143223577,
"learning_rate": 8.803753813825271e-06,
"loss": 0.45400180816650393,
"step": 1790
},
{
"epoch": 0.3042757075994971,
"grad_norm": 0.5574507035046785,
"learning_rate": 8.784538263647088e-06,
"loss": 0.4435715675354004,
"step": 1800
},
{
"epoch": 0.3059661281972721,
"grad_norm": 0.5225167208182738,
"learning_rate": 8.765190937780964e-06,
"loss": 0.4518153190612793,
"step": 1810
},
{
"epoch": 0.30765654879504706,
"grad_norm": 0.456709142206432,
"learning_rate": 8.74571250989089e-06,
"loss": 0.4490679740905762,
"step": 1820
},
{
"epoch": 0.309346969392822,
"grad_norm": 0.3950564064324204,
"learning_rate": 8.726103658205772e-06,
"loss": 0.4453883171081543,
"step": 1830
},
{
"epoch": 0.31103738999059705,
"grad_norm": 0.43108949686604414,
"learning_rate": 8.706365065495806e-06,
"loss": 0.4514158248901367,
"step": 1840
},
{
"epoch": 0.312727810588372,
"grad_norm": 0.4029582903079163,
"learning_rate": 8.686497419048696e-06,
"loss": 0.4506711006164551,
"step": 1850
},
{
"epoch": 0.314418231186147,
"grad_norm": 0.47097814167708196,
"learning_rate": 8.66650141064574e-06,
"loss": 0.4476929664611816,
"step": 1860
},
{
"epoch": 0.316108651783922,
"grad_norm": 0.48440126583440685,
"learning_rate": 8.64637773653773e-06,
"loss": 0.44446544647216796,
"step": 1870
},
{
"epoch": 0.31779907238169697,
"grad_norm": 0.39849105565330367,
"learning_rate": 8.626127097420711e-06,
"loss": 0.44705805778503416,
"step": 1880
},
{
"epoch": 0.31948949297947193,
"grad_norm": 0.43218756408209813,
"learning_rate": 8.605750198411586e-06,
"loss": 0.4613940238952637,
"step": 1890
},
{
"epoch": 0.32117991357724696,
"grad_norm": 0.38906721440922765,
"learning_rate": 8.585247749023567e-06,
"loss": 0.4503718376159668,
"step": 1900
},
{
"epoch": 0.3228703341750219,
"grad_norm": 0.4502618710490373,
"learning_rate": 8.564620463141455e-06,
"loss": 0.4474172592163086,
"step": 1910
},
{
"epoch": 0.3245607547727969,
"grad_norm": 0.48392261273487897,
"learning_rate": 8.543869058996807e-06,
"loss": 0.45044708251953125,
"step": 1920
},
{
"epoch": 0.3262511753705719,
"grad_norm": 0.46357543002429463,
"learning_rate": 8.52299425914291e-06,
"loss": 0.45404787063598634,
"step": 1930
},
{
"epoch": 0.3279415959683469,
"grad_norm": 0.7728990225682476,
"learning_rate": 8.501996790429618e-06,
"loss": 0.45133085250854493,
"step": 1940
},
{
"epoch": 0.32963201656612184,
"grad_norm": 0.46229314858380605,
"learning_rate": 8.480877383978066e-06,
"loss": 0.45330057144165037,
"step": 1950
},
{
"epoch": 0.33132243716389687,
"grad_norm": 0.45494105017751585,
"learning_rate": 8.45963677515519e-06,
"loss": 0.44877138137817385,
"step": 1960
},
{
"epoch": 0.33301285776167183,
"grad_norm": 0.4317089109386061,
"learning_rate": 8.43827570354813e-06,
"loss": 0.4513542175292969,
"step": 1970
},
{
"epoch": 0.3347032783594468,
"grad_norm": 0.6488590237946437,
"learning_rate": 8.416794912938483e-06,
"loss": 0.4554163932800293,
"step": 1980
},
{
"epoch": 0.3363936989572218,
"grad_norm": 0.528391137748562,
"learning_rate": 8.395195151276397e-06,
"loss": 0.44977054595947263,
"step": 1990
},
{
"epoch": 0.3380841195549968,
"grad_norm": 0.5024989817984604,
"learning_rate": 8.373477170654536e-06,
"loss": 0.4485034942626953,
"step": 2000
},
{
"epoch": 0.33977454015277175,
"grad_norm": 0.41678908010057425,
"learning_rate": 8.351641727281882e-06,
"loss": 0.45275564193725587,
"step": 2010
},
{
"epoch": 0.3414649607505467,
"grad_norm": 0.4165453002530441,
"learning_rate": 8.329689581457412e-06,
"loss": 0.4469959259033203,
"step": 2020
},
{
"epoch": 0.34315538134832174,
"grad_norm": 0.4395909846620004,
"learning_rate": 8.307621497543625e-06,
"loss": 0.4442157745361328,
"step": 2030
},
{
"epoch": 0.3448458019460967,
"grad_norm": 0.5120332702550214,
"learning_rate": 8.285438243939923e-06,
"loss": 0.44135217666625975,
"step": 2040
},
{
"epoch": 0.3465362225438717,
"grad_norm": 0.40470346443657984,
"learning_rate": 8.263140593055856e-06,
"loss": 0.4434605598449707,
"step": 2050
},
{
"epoch": 0.3482266431416467,
"grad_norm": 0.5574860966144031,
"learning_rate": 8.240729321284233e-06,
"loss": 0.4367219924926758,
"step": 2060
},
{
"epoch": 0.34991706373942166,
"grad_norm": 0.6284853204866423,
"learning_rate": 8.218205208974081e-06,
"loss": 0.4486133575439453,
"step": 2070
},
{
"epoch": 0.35160748433719663,
"grad_norm": 0.5523042621914968,
"learning_rate": 8.195569040403478e-06,
"loss": 0.44528541564941404,
"step": 2080
},
{
"epoch": 0.35329790493497165,
"grad_norm": 0.4861292804265429,
"learning_rate": 8.172821603752244e-06,
"loss": 0.4419032096862793,
"step": 2090
},
{
"epoch": 0.3549883255327466,
"grad_norm": 0.5151777753558611,
"learning_rate": 8.149963691074494e-06,
"loss": 0.45229430198669435,
"step": 2100
},
{
"epoch": 0.3566787461305216,
"grad_norm": 0.5350570897896791,
"learning_rate": 8.126996098271068e-06,
"loss": 0.44216156005859375,
"step": 2110
},
{
"epoch": 0.3583691667282966,
"grad_norm": 0.4480111640687634,
"learning_rate": 8.103919625061803e-06,
"loss": 0.4482156753540039,
"step": 2120
},
{
"epoch": 0.3600595873260716,
"grad_norm": 0.41191616176536405,
"learning_rate": 8.080735074957706e-06,
"loss": 0.450608491897583,
"step": 2130
},
{
"epoch": 0.36175000792384654,
"grad_norm": 0.40297834092541657,
"learning_rate": 8.05744325523296e-06,
"loss": 0.44606647491455076,
"step": 2140
},
{
"epoch": 0.36344042852162156,
"grad_norm": 0.4085693775301131,
"learning_rate": 8.034044976896818e-06,
"loss": 0.44092235565185545,
"step": 2150
},
{
"epoch": 0.36513084911939653,
"grad_norm": 0.4579341322125244,
"learning_rate": 8.01054105466538e-06,
"loss": 0.4550692081451416,
"step": 2160
},
{
"epoch": 0.3668212697171715,
"grad_norm": 0.41154951045968907,
"learning_rate": 7.986932306933197e-06,
"loss": 0.4456637382507324,
"step": 2170
},
{
"epoch": 0.36851169031494646,
"grad_norm": 0.45743856816562845,
"learning_rate": 7.963219555744802e-06,
"loss": 0.44521183967590333,
"step": 2180
},
{
"epoch": 0.3702021109127215,
"grad_norm": 0.45144429372810385,
"learning_rate": 7.939403626766072e-06,
"loss": 0.4406290531158447,
"step": 2190
},
{
"epoch": 0.37189253151049645,
"grad_norm": 0.44161291067919384,
"learning_rate": 7.915485349255477e-06,
"loss": 0.4410409927368164,
"step": 2200
},
{
"epoch": 0.3735829521082714,
"grad_norm": 0.5482442710608512,
"learning_rate": 7.891465556035219e-06,
"loss": 0.4488658905029297,
"step": 2210
},
{
"epoch": 0.37527337270604644,
"grad_norm": 0.4458309105591097,
"learning_rate": 7.867345083462215e-06,
"loss": 0.44423704147338866,
"step": 2220
},
{
"epoch": 0.3769637933038214,
"grad_norm": 0.41067167379424,
"learning_rate": 7.843124771398997e-06,
"loss": 0.4448094844818115,
"step": 2230
},
{
"epoch": 0.37865421390159637,
"grad_norm": 0.39552535741630207,
"learning_rate": 7.818805463184449e-06,
"loss": 0.44344267845153806,
"step": 2240
},
{
"epoch": 0.3803446344993714,
"grad_norm": 0.41067808788053767,
"learning_rate": 7.794388005604451e-06,
"loss": 0.4351670265197754,
"step": 2250
},
{
"epoch": 0.38203505509714636,
"grad_norm": 0.47593510766741803,
"learning_rate": 7.7698732488624e-06,
"loss": 0.44367637634277346,
"step": 2260
},
{
"epoch": 0.3837254756949213,
"grad_norm": 0.4086182566076896,
"learning_rate": 7.745262046549588e-06,
"loss": 0.44177837371826173,
"step": 2270
},
{
"epoch": 0.38541589629269635,
"grad_norm": 0.43047766375857177,
"learning_rate": 7.720555255615508e-06,
"loss": 0.4466276168823242,
"step": 2280
},
{
"epoch": 0.3871063168904713,
"grad_norm": 0.3965968852531915,
"learning_rate": 7.695753736337987e-06,
"loss": 0.44551968574523926,
"step": 2290
},
{
"epoch": 0.3887967374882463,
"grad_norm": 0.4251738713536779,
"learning_rate": 7.67085835229325e-06,
"loss": 0.44549951553344724,
"step": 2300
},
{
"epoch": 0.3904871580860213,
"grad_norm": 0.46815316080000746,
"learning_rate": 7.645869970325848e-06,
"loss": 0.43446955680847166,
"step": 2310
},
{
"epoch": 0.39217757868379627,
"grad_norm": 0.4184962573632885,
"learning_rate": 7.620789460518465e-06,
"loss": 0.44701266288757324,
"step": 2320
},
{
"epoch": 0.39386799928157123,
"grad_norm": 0.38087617164724735,
"learning_rate": 7.595617696161635e-06,
"loss": 0.445133113861084,
"step": 2330
},
{
"epoch": 0.39555841987934626,
"grad_norm": 0.5453845457808255,
"learning_rate": 7.570355553723325e-06,
"loss": 0.44228591918945315,
"step": 2340
},
{
"epoch": 0.3972488404771212,
"grad_norm": 0.5390485070913686,
"learning_rate": 7.545003912818424e-06,
"loss": 0.4409176826477051,
"step": 2350
},
{
"epoch": 0.3989392610748962,
"grad_norm": 0.47828293578657854,
"learning_rate": 7.5195636561781084e-06,
"loss": 0.4372897148132324,
"step": 2360
},
{
"epoch": 0.40062968167267116,
"grad_norm": 0.4156886189220343,
"learning_rate": 7.4940356696191144e-06,
"loss": 0.447946834564209,
"step": 2370
},
{
"epoch": 0.4023201022704462,
"grad_norm": 0.4259105437856762,
"learning_rate": 7.468420842012882e-06,
"loss": 0.443576717376709,
"step": 2380
},
{
"epoch": 0.40401052286822114,
"grad_norm": 0.42755879405461517,
"learning_rate": 7.442720065254621e-06,
"loss": 0.45192480087280273,
"step": 2390
},
{
"epoch": 0.4057009434659961,
"grad_norm": 0.4446235368031096,
"learning_rate": 7.416934234232236e-06,
"loss": 0.444570255279541,
"step": 2400
},
{
"epoch": 0.40739136406377113,
"grad_norm": 0.40743320730035465,
"learning_rate": 7.3910642467951864e-06,
"loss": 0.4399536609649658,
"step": 2410
},
{
"epoch": 0.4090817846615461,
"grad_norm": 0.5746706375826369,
"learning_rate": 7.36511100372321e-06,
"loss": 0.43631649017333984,
"step": 2420
},
{
"epoch": 0.41077220525932107,
"grad_norm": 0.4340142399235041,
"learning_rate": 7.339075408694968e-06,
"loss": 0.45034146308898926,
"step": 2430
},
{
"epoch": 0.4124626258570961,
"grad_norm": 0.44523108344512796,
"learning_rate": 7.312958368256569e-06,
"loss": 0.43903651237487795,
"step": 2440
},
{
"epoch": 0.41415304645487105,
"grad_norm": 0.5091716719016176,
"learning_rate": 7.286760791790013e-06,
"loss": 0.4393869400024414,
"step": 2450
},
{
"epoch": 0.415843467052646,
"grad_norm": 0.43382179526470277,
"learning_rate": 7.260483591481522e-06,
"loss": 0.4424809455871582,
"step": 2460
},
{
"epoch": 0.41753388765042104,
"grad_norm": 0.407012689135247,
"learning_rate": 7.234127682289778e-06,
"loss": 0.4412867546081543,
"step": 2470
},
{
"epoch": 0.419224308248196,
"grad_norm": 0.41215616692154944,
"learning_rate": 7.207693981914071e-06,
"loss": 0.44841842651367186,
"step": 2480
},
{
"epoch": 0.420914728845971,
"grad_norm": 0.4518038177652311,
"learning_rate": 7.1811834107623344e-06,
"loss": 0.44159650802612305,
"step": 2490
},
{
"epoch": 0.422605149443746,
"grad_norm": 0.5163947809066302,
"learning_rate": 7.154596891919105e-06,
"loss": 0.4374223709106445,
"step": 2500
},
{
"epoch": 0.42429557004152096,
"grad_norm": 0.39820310977742257,
"learning_rate": 7.127935351113384e-06,
"loss": 0.43600940704345703,
"step": 2510
},
{
"epoch": 0.42598599063929593,
"grad_norm": 0.41241743553445703,
"learning_rate": 7.10119971668639e-06,
"loss": 0.44450817108154295,
"step": 2520
},
{
"epoch": 0.4276764112370709,
"grad_norm": 0.425125793245871,
"learning_rate": 7.074390919559249e-06,
"loss": 0.4380540370941162,
"step": 2530
},
{
"epoch": 0.4293668318348459,
"grad_norm": 0.6461727062118474,
"learning_rate": 7.047509893200577e-06,
"loss": 0.435422420501709,
"step": 2540
},
{
"epoch": 0.4310572524326209,
"grad_norm": 0.5524697209445791,
"learning_rate": 7.020557573593968e-06,
"loss": 0.4344505310058594,
"step": 2550
},
{
"epoch": 0.43274767303039585,
"grad_norm": 0.4106799649591827,
"learning_rate": 6.993534899205418e-06,
"loss": 0.43467392921447756,
"step": 2560
},
{
"epoch": 0.4344380936281709,
"grad_norm": 0.47419462628081854,
"learning_rate": 6.966442810950635e-06,
"loss": 0.4437819480895996,
"step": 2570
},
{
"epoch": 0.43612851422594584,
"grad_norm": 0.5091288622298781,
"learning_rate": 6.939282252162286e-06,
"loss": 0.4341723918914795,
"step": 2580
},
{
"epoch": 0.4378189348237208,
"grad_norm": 0.40678856254540896,
"learning_rate": 6.9120541685571444e-06,
"loss": 0.4385373592376709,
"step": 2590
},
{
"epoch": 0.43950935542149583,
"grad_norm": 0.40466633467374624,
"learning_rate": 6.884759508203164e-06,
"loss": 0.44423751831054686,
"step": 2600
},
{
"epoch": 0.4411997760192708,
"grad_norm": 0.4150721188072644,
"learning_rate": 6.857399221486467e-06,
"loss": 0.44628586769104006,
"step": 2610
},
{
"epoch": 0.44289019661704576,
"grad_norm": 0.4409008284827818,
"learning_rate": 6.8299742610782535e-06,
"loss": 0.44063844680786135,
"step": 2620
},
{
"epoch": 0.4445806172148208,
"grad_norm": 0.4430457862634213,
"learning_rate": 6.802485581901626e-06,
"loss": 0.4308623313903809,
"step": 2630
},
{
"epoch": 0.44627103781259575,
"grad_norm": 0.4592560146145528,
"learning_rate": 6.774934141098344e-06,
"loss": 0.4483074188232422,
"step": 2640
},
{
"epoch": 0.4479614584103707,
"grad_norm": 0.4182419384464618,
"learning_rate": 6.747320897995493e-06,
"loss": 0.4336414813995361,
"step": 2650
},
{
"epoch": 0.44965187900814574,
"grad_norm": 0.438063822803867,
"learning_rate": 6.719646814072084e-06,
"loss": 0.43684959411621094,
"step": 2660
},
{
"epoch": 0.4513422996059207,
"grad_norm": 0.42289652128353217,
"learning_rate": 6.691912852925574e-06,
"loss": 0.44373302459716796,
"step": 2670
},
{
"epoch": 0.45303272020369567,
"grad_norm": 0.4036390318140732,
"learning_rate": 6.664119980238315e-06,
"loss": 0.44059133529663086,
"step": 2680
},
{
"epoch": 0.4547231408014707,
"grad_norm": 0.43302204039469716,
"learning_rate": 6.636269163743928e-06,
"loss": 0.44221057891845705,
"step": 2690
},
{
"epoch": 0.45641356139924566,
"grad_norm": 0.40358739632993024,
"learning_rate": 6.608361373193608e-06,
"loss": 0.44517173767089846,
"step": 2700
},
{
"epoch": 0.4581039819970206,
"grad_norm": 0.4358355709244199,
"learning_rate": 6.580397580322358e-06,
"loss": 0.42831597328186033,
"step": 2710
},
{
"epoch": 0.4597944025947956,
"grad_norm": 0.43280631627772653,
"learning_rate": 6.55237875881515e-06,
"loss": 0.4354698181152344,
"step": 2720
},
{
"epoch": 0.4614848231925706,
"grad_norm": 0.4245846691082156,
"learning_rate": 6.52430588427303e-06,
"loss": 0.4449951171875,
"step": 2730
},
{
"epoch": 0.4631752437903456,
"grad_norm": 0.4314018816383608,
"learning_rate": 6.49617993417914e-06,
"loss": 0.44388618469238283,
"step": 2740
},
{
"epoch": 0.46486566438812055,
"grad_norm": 0.44826365164777787,
"learning_rate": 6.468001887864688e-06,
"loss": 0.44153881072998047,
"step": 2750
},
{
"epoch": 0.46655608498589557,
"grad_norm": 0.43908236075177787,
"learning_rate": 6.43977272647484e-06,
"loss": 0.44250946044921874,
"step": 2760
},
{
"epoch": 0.46824650558367054,
"grad_norm": 0.3850389053146295,
"learning_rate": 6.4114934329345715e-06,
"loss": 0.4358978271484375,
"step": 2770
},
{
"epoch": 0.4699369261814455,
"grad_norm": 0.6097960037471896,
"learning_rate": 6.383164991914424e-06,
"loss": 0.43413305282592773,
"step": 2780
},
{
"epoch": 0.4716273467792205,
"grad_norm": 0.4461689157475998,
"learning_rate": 6.354788389796238e-06,
"loss": 0.4385429859161377,
"step": 2790
},
{
"epoch": 0.4733177673769955,
"grad_norm": 0.5441220360355755,
"learning_rate": 6.326364614638794e-06,
"loss": 0.43816003799438474,
"step": 2800
},
{
"epoch": 0.47500818797477046,
"grad_norm": 0.41572374444805754,
"learning_rate": 6.297894656143415e-06,
"loss": 0.438665771484375,
"step": 2810
},
{
"epoch": 0.4766986085725455,
"grad_norm": 0.4571323669570309,
"learning_rate": 6.269379505619504e-06,
"loss": 0.4369631767272949,
"step": 2820
},
{
"epoch": 0.47838902917032045,
"grad_norm": 0.4264556810440158,
"learning_rate": 6.240820155950027e-06,
"loss": 0.44220762252807616,
"step": 2830
},
{
"epoch": 0.4800794497680954,
"grad_norm": 0.397942977911585,
"learning_rate": 6.2122176015569405e-06,
"loss": 0.43413677215576174,
"step": 2840
},
{
"epoch": 0.48176987036587043,
"grad_norm": 0.45124864351563504,
"learning_rate": 6.183572838366572e-06,
"loss": 0.4400357246398926,
"step": 2850
},
{
"epoch": 0.4834602909636454,
"grad_norm": 0.42428517380688374,
"learning_rate": 6.1548868637749306e-06,
"loss": 0.4344414234161377,
"step": 2860
},
{
"epoch": 0.48515071156142037,
"grad_norm": 0.41383689495932036,
"learning_rate": 6.126160676612992e-06,
"loss": 0.4421424388885498,
"step": 2870
},
{
"epoch": 0.48684113215919533,
"grad_norm": 0.4529042500836241,
"learning_rate": 6.097395277111909e-06,
"loss": 0.43562684059143064,
"step": 2880
},
{
"epoch": 0.48853155275697036,
"grad_norm": 0.436461736013555,
"learning_rate": 6.0685916668681925e-06,
"loss": 0.4411266326904297,
"step": 2890
},
{
"epoch": 0.4902219733547453,
"grad_norm": 0.4660224260239127,
"learning_rate": 6.039750848808826e-06,
"loss": 0.4391491889953613,
"step": 2900
},
{
"epoch": 0.4919123939525203,
"grad_norm": 0.4265622760480254,
"learning_rate": 6.010873827156352e-06,
"loss": 0.43004140853881834,
"step": 2910
},
{
"epoch": 0.4936028145502953,
"grad_norm": 0.5891326466041507,
"learning_rate": 5.981961607393905e-06,
"loss": 0.43016576766967773,
"step": 2920
},
{
"epoch": 0.4952932351480703,
"grad_norm": 0.42731374205948136,
"learning_rate": 5.953015196230201e-06,
"loss": 0.437261962890625,
"step": 2930
},
{
"epoch": 0.49698365574584524,
"grad_norm": 0.40601940500233463,
"learning_rate": 5.924035601564478e-06,
"loss": 0.43418092727661134,
"step": 2940
},
{
"epoch": 0.49867407634362027,
"grad_norm": 0.44316397124344875,
"learning_rate": 5.895023832451414e-06,
"loss": 0.4333051681518555,
"step": 2950
},
{
"epoch": 0.5003644969413952,
"grad_norm": 0.42103355983145363,
"learning_rate": 5.865980899065979e-06,
"loss": 0.4366158485412598,
"step": 2960
},
{
"epoch": 0.5020549175391702,
"grad_norm": 0.4189038279835885,
"learning_rate": 5.836907812668267e-06,
"loss": 0.43808746337890625,
"step": 2970
},
{
"epoch": 0.5037453381369452,
"grad_norm": 0.4045771583980379,
"learning_rate": 5.8078055855682904e-06,
"loss": 0.4391347885131836,
"step": 2980
},
{
"epoch": 0.5054357587347201,
"grad_norm": 0.6123791815482257,
"learning_rate": 5.778675231090715e-06,
"loss": 0.4369372844696045,
"step": 2990
},
{
"epoch": 0.5071261793324952,
"grad_norm": 0.4504376197363758,
"learning_rate": 5.749517763539601e-06,
"loss": 0.42980470657348635,
"step": 3000
},
{
"epoch": 0.5088165999302702,
"grad_norm": 0.43408260936457443,
"learning_rate": 5.720334198163063e-06,
"loss": 0.4386304378509521,
"step": 3010
},
{
"epoch": 0.5105070205280451,
"grad_norm": 0.4225926173781409,
"learning_rate": 5.6911255511179295e-06,
"loss": 0.4303256034851074,
"step": 3020
},
{
"epoch": 0.5121974411258201,
"grad_norm": 0.4464137671609211,
"learning_rate": 5.661892839434362e-06,
"loss": 0.4334456443786621,
"step": 3030
},
{
"epoch": 0.5138878617235951,
"grad_norm": 0.4264014041223435,
"learning_rate": 5.63263708098044e-06,
"loss": 0.4284989833831787,
"step": 3040
},
{
"epoch": 0.51557828232137,
"grad_norm": 0.4469737217816344,
"learning_rate": 5.603359294426717e-06,
"loss": 0.4342545986175537,
"step": 3050
},
{
"epoch": 0.5172687029191451,
"grad_norm": 0.42589508183124813,
"learning_rate": 5.574060499210759e-06,
"loss": 0.4403389930725098,
"step": 3060
},
{
"epoch": 0.5189591235169201,
"grad_norm": 0.5297222464707912,
"learning_rate": 5.54474171550164e-06,
"loss": 0.435770320892334,
"step": 3070
},
{
"epoch": 0.520649544114695,
"grad_norm": 0.3800451330600584,
"learning_rate": 5.515403964164421e-06,
"loss": 0.43670501708984377,
"step": 3080
},
{
"epoch": 0.52233996471247,
"grad_norm": 0.5333744177944942,
"learning_rate": 5.486048266724609e-06,
"loss": 0.43523521423339845,
"step": 3090
},
{
"epoch": 0.524030385310245,
"grad_norm": 0.5434046403400835,
"learning_rate": 5.4566756453325835e-06,
"loss": 0.4394557952880859,
"step": 3100
},
{
"epoch": 0.52572080590802,
"grad_norm": 0.6610762094695175,
"learning_rate": 5.427287122728008e-06,
"loss": 0.4303136348724365,
"step": 3110
},
{
"epoch": 0.527411226505795,
"grad_norm": 0.4682270464152013,
"learning_rate": 5.39788372220422e-06,
"loss": 0.4351104736328125,
"step": 3120
},
{
"epoch": 0.52910164710357,
"grad_norm": 0.5276146962130529,
"learning_rate": 5.368466467572595e-06,
"loss": 0.42786803245544436,
"step": 3130
},
{
"epoch": 0.5307920677013449,
"grad_norm": 0.3898103195251585,
"learning_rate": 5.339036383126905e-06,
"loss": 0.4315065383911133,
"step": 3140
},
{
"epoch": 0.5324824882991199,
"grad_norm": 0.49019068218850664,
"learning_rate": 5.309594493607646e-06,
"loss": 0.4383066177368164,
"step": 3150
},
{
"epoch": 0.534172908896895,
"grad_norm": 0.4028365280209736,
"learning_rate": 5.280141824166363e-06,
"loss": 0.43825907707214357,
"step": 3160
},
{
"epoch": 0.5358633294946699,
"grad_norm": 0.4094965220781369,
"learning_rate": 5.250679400329953e-06,
"loss": 0.43253560066223146,
"step": 3170
},
{
"epoch": 0.5375537500924449,
"grad_norm": 0.40846275540289434,
"learning_rate": 5.221208247964951e-06,
"loss": 0.43136014938354494,
"step": 3180
},
{
"epoch": 0.5392441706902199,
"grad_norm": 0.49954149484857235,
"learning_rate": 5.191729393241822e-06,
"loss": 0.4329479694366455,
"step": 3190
},
{
"epoch": 0.5409345912879948,
"grad_norm": 0.3931028479875822,
"learning_rate": 5.162243862599221e-06,
"loss": 0.4355682373046875,
"step": 3200
},
{
"epoch": 0.5426250118857698,
"grad_norm": 0.42120237348360007,
"learning_rate": 5.132752682708252e-06,
"loss": 0.43157129287719725,
"step": 3210
},
{
"epoch": 0.5443154324835449,
"grad_norm": 0.43276553833311104,
"learning_rate": 5.103256880436724e-06,
"loss": 0.4270349025726318,
"step": 3220
},
{
"epoch": 0.5460058530813198,
"grad_norm": 0.5211667786145312,
"learning_rate": 5.073757482813397e-06,
"loss": 0.4324329853057861,
"step": 3230
},
{
"epoch": 0.5476962736790948,
"grad_norm": 0.4755287439381494,
"learning_rate": 5.044255516992218e-06,
"loss": 0.435423755645752,
"step": 3240
},
{
"epoch": 0.5493866942768698,
"grad_norm": 0.4637604896810374,
"learning_rate": 5.014752010216558e-06,
"loss": 0.4313016891479492,
"step": 3250
},
{
"epoch": 0.5510771148746447,
"grad_norm": 0.40550687413116615,
"learning_rate": 4.9852479897834424e-06,
"loss": 0.4341496467590332,
"step": 3260
},
{
"epoch": 0.5527675354724197,
"grad_norm": 0.4657009916810442,
"learning_rate": 4.955744483007784e-06,
"loss": 0.42819700241088865,
"step": 3270
},
{
"epoch": 0.5544579560701948,
"grad_norm": 0.4956322489349777,
"learning_rate": 4.926242517186603e-06,
"loss": 0.4234212875366211,
"step": 3280
},
{
"epoch": 0.5561483766679697,
"grad_norm": 0.40681714039435574,
"learning_rate": 4.896743119563279e-06,
"loss": 0.4332298278808594,
"step": 3290
},
{
"epoch": 0.5578387972657447,
"grad_norm": 0.38587035769576944,
"learning_rate": 4.867247317291751e-06,
"loss": 0.438218355178833,
"step": 3300
},
{
"epoch": 0.5595292178635196,
"grad_norm": 0.4551274587379102,
"learning_rate": 4.8377561374007805e-06,
"loss": 0.43701925277709963,
"step": 3310
},
{
"epoch": 0.5612196384612946,
"grad_norm": 0.4040304584035835,
"learning_rate": 4.808270606758179e-06,
"loss": 0.4318737030029297,
"step": 3320
},
{
"epoch": 0.5629100590590697,
"grad_norm": 0.4512054219017183,
"learning_rate": 4.77879175203505e-06,
"loss": 0.43484320640563967,
"step": 3330
},
{
"epoch": 0.5646004796568446,
"grad_norm": 0.4082489988390108,
"learning_rate": 4.74932059967005e-06,
"loss": 0.4284003734588623,
"step": 3340
},
{
"epoch": 0.5662909002546196,
"grad_norm": 0.42276817092848296,
"learning_rate": 4.7198581758336396e-06,
"loss": 0.43506608009338377,
"step": 3350
},
{
"epoch": 0.5679813208523946,
"grad_norm": 0.42621910508451183,
"learning_rate": 4.690405506392355e-06,
"loss": 0.43180079460144044,
"step": 3360
},
{
"epoch": 0.5696717414501695,
"grad_norm": 0.58603996179948,
"learning_rate": 4.660963616873096e-06,
"loss": 0.43529691696166994,
"step": 3370
},
{
"epoch": 0.5713621620479445,
"grad_norm": 0.43475498420426534,
"learning_rate": 4.631533532427405e-06,
"loss": 0.42524237632751466,
"step": 3380
},
{
"epoch": 0.5730525826457196,
"grad_norm": 0.44296635506788934,
"learning_rate": 4.6021162777957815e-06,
"loss": 0.42964849472045896,
"step": 3390
},
{
"epoch": 0.5747430032434945,
"grad_norm": 0.4739115865696253,
"learning_rate": 4.572712877271993e-06,
"loss": 0.4318963050842285,
"step": 3400
},
{
"epoch": 0.5764334238412695,
"grad_norm": 0.4251182604533514,
"learning_rate": 4.543324354667418e-06,
"loss": 0.429244327545166,
"step": 3410
},
{
"epoch": 0.5781238444390445,
"grad_norm": 0.4422586426616063,
"learning_rate": 4.513951733275395e-06,
"loss": 0.42932448387145994,
"step": 3420
},
{
"epoch": 0.5798142650368194,
"grad_norm": 0.4691091599178207,
"learning_rate": 4.48459603583558e-06,
"loss": 0.4259922027587891,
"step": 3430
},
{
"epoch": 0.5815046856345945,
"grad_norm": 0.5464764518023947,
"learning_rate": 4.455258284498363e-06,
"loss": 0.42319440841674805,
"step": 3440
},
{
"epoch": 0.5831951062323695,
"grad_norm": 0.41304172848458615,
"learning_rate": 4.42593950078924e-06,
"loss": 0.4340841293334961,
"step": 3450
},
{
"epoch": 0.5848855268301444,
"grad_norm": 0.44352336434522394,
"learning_rate": 4.396640705573284e-06,
"loss": 0.42516536712646485,
"step": 3460
},
{
"epoch": 0.5865759474279194,
"grad_norm": 0.4396217564214065,
"learning_rate": 4.367362919019561e-06,
"loss": 0.43398313522338866,
"step": 3470
},
{
"epoch": 0.5882663680256944,
"grad_norm": 0.4639850426028468,
"learning_rate": 4.338107160565639e-06,
"loss": 0.42090563774108886,
"step": 3480
},
{
"epoch": 0.5899567886234693,
"grad_norm": 0.49730143479919997,
"learning_rate": 4.308874448882072e-06,
"loss": 0.42863998413085935,
"step": 3490
},
{
"epoch": 0.5916472092212444,
"grad_norm": 0.4320500925423939,
"learning_rate": 4.279665801836938e-06,
"loss": 0.43216619491577146,
"step": 3500
},
{
"epoch": 0.5933376298190194,
"grad_norm": 0.38405843815880253,
"learning_rate": 4.250482236460399e-06,
"loss": 0.4306344985961914,
"step": 3510
},
{
"epoch": 0.5950280504167943,
"grad_norm": 0.41562127671231863,
"learning_rate": 4.2213247689092846e-06,
"loss": 0.4339097499847412,
"step": 3520
},
{
"epoch": 0.5967184710145693,
"grad_norm": 0.5206266110542977,
"learning_rate": 4.192194414431712e-06,
"loss": 0.42906818389892576,
"step": 3530
},
{
"epoch": 0.5984088916123443,
"grad_norm": 0.44079942953014034,
"learning_rate": 4.163092187331733e-06,
"loss": 0.42707481384277346,
"step": 3540
},
{
"epoch": 0.6000993122101193,
"grad_norm": 0.5283111002109292,
"learning_rate": 4.1340191009340215e-06,
"loss": 0.43108377456665037,
"step": 3550
},
{
"epoch": 0.6017897328078943,
"grad_norm": 0.497346167548402,
"learning_rate": 4.104976167548587e-06,
"loss": 0.42334275245666503,
"step": 3560
},
{
"epoch": 0.6034801534056693,
"grad_norm": 0.41994436685076686,
"learning_rate": 4.075964398435522e-06,
"loss": 0.4256128787994385,
"step": 3570
},
{
"epoch": 0.6051705740034442,
"grad_norm": 0.44264493338347205,
"learning_rate": 4.046984803769801e-06,
"loss": 0.42928495407104494,
"step": 3580
},
{
"epoch": 0.6068609946012192,
"grad_norm": 0.45103704784300114,
"learning_rate": 4.018038392606096e-06,
"loss": 0.4291574478149414,
"step": 3590
},
{
"epoch": 0.6085514151989942,
"grad_norm": 0.4052714162982127,
"learning_rate": 3.98912617284365e-06,
"loss": 0.42896642684936526,
"step": 3600
},
{
"epoch": 0.6102418357967692,
"grad_norm": 0.40164760162129637,
"learning_rate": 3.960249151191178e-06,
"loss": 0.4323906898498535,
"step": 3610
},
{
"epoch": 0.6119322563945442,
"grad_norm": 0.4505206662224979,
"learning_rate": 3.931408333131809e-06,
"loss": 0.42946605682373046,
"step": 3620
},
{
"epoch": 0.6136226769923192,
"grad_norm": 0.38809773635856676,
"learning_rate": 3.902604722888092e-06,
"loss": 0.4359119892120361,
"step": 3630
},
{
"epoch": 0.6153130975900941,
"grad_norm": 0.4515178914649961,
"learning_rate": 3.873839323387009e-06,
"loss": 0.42914657592773436,
"step": 3640
},
{
"epoch": 0.6170035181878691,
"grad_norm": 0.48547177051092294,
"learning_rate": 3.845113136225072e-06,
"loss": 0.43363237380981445,
"step": 3650
},
{
"epoch": 0.618693938785644,
"grad_norm": 0.4431323439189628,
"learning_rate": 3.81642716163343e-06,
"loss": 0.427580738067627,
"step": 3660
},
{
"epoch": 0.6203843593834191,
"grad_norm": 0.40919513437853744,
"learning_rate": 3.7877823984430608e-06,
"loss": 0.42395830154418945,
"step": 3670
},
{
"epoch": 0.6220747799811941,
"grad_norm": 0.445422416784889,
"learning_rate": 3.7591798440499755e-06,
"loss": 0.4330305099487305,
"step": 3680
},
{
"epoch": 0.623765200578969,
"grad_norm": 0.42604546747481925,
"learning_rate": 3.7306204943804973e-06,
"loss": 0.4383516311645508,
"step": 3690
},
{
"epoch": 0.625455621176744,
"grad_norm": 0.4655383013749577,
"learning_rate": 3.7021053438565863e-06,
"loss": 0.43866143226623533,
"step": 3700
},
{
"epoch": 0.627146041774519,
"grad_norm": 0.3989287897287261,
"learning_rate": 3.673635385361206e-06,
"loss": 0.43120598793029785,
"step": 3710
},
{
"epoch": 0.628836462372294,
"grad_norm": 0.43139571316850417,
"learning_rate": 3.6452116102037625e-06,
"loss": 0.43346233367919923,
"step": 3720
},
{
"epoch": 0.630526882970069,
"grad_norm": 0.43077204056512486,
"learning_rate": 3.6168350080855785e-06,
"loss": 0.4254899978637695,
"step": 3730
},
{
"epoch": 0.632217303567844,
"grad_norm": 0.4116188525169869,
"learning_rate": 3.5885065670654306e-06,
"loss": 0.43075990676879883,
"step": 3740
},
{
"epoch": 0.6339077241656189,
"grad_norm": 0.48980622755885445,
"learning_rate": 3.560227273525162e-06,
"loss": 0.43142094612121584,
"step": 3750
},
{
"epoch": 0.6355981447633939,
"grad_norm": 0.45257900417419006,
"learning_rate": 3.5319981121353133e-06,
"loss": 0.4283766746520996,
"step": 3760
},
{
"epoch": 0.637288565361169,
"grad_norm": 0.43598385649668175,
"learning_rate": 3.503820065820861e-06,
"loss": 0.4272482395172119,
"step": 3770
},
{
"epoch": 0.6389789859589439,
"grad_norm": 0.4436945578231624,
"learning_rate": 3.47569411572697e-06,
"loss": 0.4362170696258545,
"step": 3780
},
{
"epoch": 0.6406694065567189,
"grad_norm": 0.4423636015024226,
"learning_rate": 3.447621241184852e-06,
"loss": 0.432710075378418,
"step": 3790
},
{
"epoch": 0.6423598271544939,
"grad_norm": 0.5550229086437715,
"learning_rate": 3.4196024196776452e-06,
"loss": 0.42471537590026853,
"step": 3800
},
{
"epoch": 0.6440502477522688,
"grad_norm": 0.4441992666501525,
"learning_rate": 3.391638626806393e-06,
"loss": 0.429352855682373,
"step": 3810
},
{
"epoch": 0.6457406683500438,
"grad_norm": 0.3852461787826987,
"learning_rate": 3.363730836256074e-06,
"loss": 0.42467470169067384,
"step": 3820
},
{
"epoch": 0.6474310889478189,
"grad_norm": 0.47623628052793926,
"learning_rate": 3.3358800197616856e-06,
"loss": 0.4357749938964844,
"step": 3830
},
{
"epoch": 0.6491215095455938,
"grad_norm": 0.42486869443758596,
"learning_rate": 3.3080871470744273e-06,
"loss": 0.4325972557067871,
"step": 3840
},
{
"epoch": 0.6508119301433688,
"grad_norm": 0.5004616407501641,
"learning_rate": 3.280353185927918e-06,
"loss": 0.42637929916381834,
"step": 3850
},
{
"epoch": 0.6525023507411438,
"grad_norm": 0.4596076466050951,
"learning_rate": 3.252679102004509e-06,
"loss": 0.4192366600036621,
"step": 3860
},
{
"epoch": 0.6541927713389187,
"grad_norm": 0.4598817421019848,
"learning_rate": 3.225065858901658e-06,
"loss": 0.42298049926757814,
"step": 3870
},
{
"epoch": 0.6558831919366938,
"grad_norm": 0.4869637478788273,
"learning_rate": 3.197514418098375e-06,
"loss": 0.4236114501953125,
"step": 3880
},
{
"epoch": 0.6575736125344688,
"grad_norm": 0.4191779853522958,
"learning_rate": 3.170025738921748e-06,
"loss": 0.4369206428527832,
"step": 3890
},
{
"epoch": 0.6592640331322437,
"grad_norm": 0.46354651428620175,
"learning_rate": 3.142600778513534e-06,
"loss": 0.4308767795562744,
"step": 3900
},
{
"epoch": 0.6609544537300187,
"grad_norm": 0.433362558403198,
"learning_rate": 3.1152404917968376e-06,
"loss": 0.4263267517089844,
"step": 3910
},
{
"epoch": 0.6626448743277937,
"grad_norm": 0.4183955388560034,
"learning_rate": 3.087945831442859e-06,
"loss": 0.4287719249725342,
"step": 3920
},
{
"epoch": 0.6643352949255686,
"grad_norm": 0.4569797787084886,
"learning_rate": 3.0607177478377146e-06,
"loss": 0.42838282585144044,
"step": 3930
},
{
"epoch": 0.6660257155233437,
"grad_norm": 0.405528119203322,
"learning_rate": 3.033557189049367e-06,
"loss": 0.4259345054626465,
"step": 3940
},
{
"epoch": 0.6677161361211187,
"grad_norm": 0.4445510274119577,
"learning_rate": 3.006465100794583e-06,
"loss": 0.4231560230255127,
"step": 3950
},
{
"epoch": 0.6694065567188936,
"grad_norm": 0.3956379066918208,
"learning_rate": 2.979442426406034e-06,
"loss": 0.42560606002807616,
"step": 3960
},
{
"epoch": 0.6710969773166686,
"grad_norm": 0.487413886720085,
"learning_rate": 2.9524901067994238e-06,
"loss": 0.4272177696228027,
"step": 3970
},
{
"epoch": 0.6727873979144436,
"grad_norm": 0.4557490354463524,
"learning_rate": 2.9256090804407522e-06,
"loss": 0.4238264083862305,
"step": 3980
},
{
"epoch": 0.6744778185122186,
"grad_norm": 0.46762585507399806,
"learning_rate": 2.8988002833136114e-06,
"loss": 0.4286961555480957,
"step": 3990
},
{
"epoch": 0.6761682391099936,
"grad_norm": 0.459519945726867,
"learning_rate": 2.872064648886618e-06,
"loss": 0.4258302688598633,
"step": 4000
},
{
"epoch": 0.6778586597077685,
"grad_norm": 0.4158599550893325,
"learning_rate": 2.845403108080895e-06,
"loss": 0.42467894554138186,
"step": 4010
},
{
"epoch": 0.6795490803055435,
"grad_norm": 0.764661370612818,
"learning_rate": 2.8188165892376655e-06,
"loss": 0.42564783096313474,
"step": 4020
},
{
"epoch": 0.6812395009033185,
"grad_norm": 0.46090960281188625,
"learning_rate": 2.792306018085932e-06,
"loss": 0.4252346992492676,
"step": 4030
},
{
"epoch": 0.6829299215010934,
"grad_norm": 0.5052241362723152,
"learning_rate": 2.7658723177102243e-06,
"loss": 0.42507052421569824,
"step": 4040
},
{
"epoch": 0.6846203420988685,
"grad_norm": 0.4969762444434996,
"learning_rate": 2.73951640851848e-06,
"loss": 0.42835030555725095,
"step": 4050
},
{
"epoch": 0.6863107626966435,
"grad_norm": 0.3887380837718711,
"learning_rate": 2.713239208209989e-06,
"loss": 0.4258549690246582,
"step": 4060
},
{
"epoch": 0.6880011832944184,
"grad_norm": 0.3783428232134585,
"learning_rate": 2.6870416317434334e-06,
"loss": 0.4262125015258789,
"step": 4070
},
{
"epoch": 0.6896916038921934,
"grad_norm": 0.41965865205854214,
"learning_rate": 2.6609245913050345e-06,
"loss": 0.42023792266845705,
"step": 4080
},
{
"epoch": 0.6913820244899684,
"grad_norm": 0.40186299701928146,
"learning_rate": 2.63488899627679e-06,
"loss": 0.42939143180847167,
"step": 4090
},
{
"epoch": 0.6930724450877433,
"grad_norm": 0.5147185623048582,
"learning_rate": 2.6089357532048152e-06,
"loss": 0.42742137908935546,
"step": 4100
},
{
"epoch": 0.6947628656855184,
"grad_norm": 0.4004984558472973,
"learning_rate": 2.583065765767766e-06,
"loss": 0.42586841583251955,
"step": 4110
},
{
"epoch": 0.6964532862832934,
"grad_norm": 0.49746517756929526,
"learning_rate": 2.5572799347453813e-06,
"loss": 0.4294744968414307,
"step": 4120
},
{
"epoch": 0.6981437068810683,
"grad_norm": 0.45191836993834755,
"learning_rate": 2.531579157987119e-06,
"loss": 0.42096834182739257,
"step": 4130
},
{
"epoch": 0.6998341274788433,
"grad_norm": 0.4388169816996864,
"learning_rate": 2.505964330380886e-06,
"loss": 0.43329315185546874,
"step": 4140
},
{
"epoch": 0.7015245480766183,
"grad_norm": 0.4646683084770869,
"learning_rate": 2.480436343821892e-06,
"loss": 0.4317659378051758,
"step": 4150
},
{
"epoch": 0.7032149686743933,
"grad_norm": 0.4144813387224547,
"learning_rate": 2.4549960871815777e-06,
"loss": 0.4222762107849121,
"step": 4160
},
{
"epoch": 0.7049053892721683,
"grad_norm": 1.149880889208155,
"learning_rate": 2.4296444462766766e-06,
"loss": 0.426224422454834,
"step": 4170
},
{
"epoch": 0.7065958098699433,
"grad_norm": 0.41212484110906356,
"learning_rate": 2.4043823038383675e-06,
"loss": 0.43084254264831545,
"step": 4180
},
{
"epoch": 0.7082862304677182,
"grad_norm": 0.40025278610336024,
"learning_rate": 2.3792105394815347e-06,
"loss": 0.42110452651977537,
"step": 4190
},
{
"epoch": 0.7099766510654932,
"grad_norm": 0.4205397675832194,
"learning_rate": 2.3541300296741535e-06,
"loss": 0.42673492431640625,
"step": 4200
},
{
"epoch": 0.7116670716632683,
"grad_norm": 0.43501797735562275,
"learning_rate": 2.3291416477067493e-06,
"loss": 0.4395922660827637,
"step": 4210
},
{
"epoch": 0.7133574922610432,
"grad_norm": 0.4286409355798364,
"learning_rate": 2.304246263662014e-06,
"loss": 0.42594170570373535,
"step": 4220
},
{
"epoch": 0.7150479128588182,
"grad_norm": 0.39409267212647164,
"learning_rate": 2.2794447443844935e-06,
"loss": 0.42094078063964846,
"step": 4230
},
{
"epoch": 0.7167383334565932,
"grad_norm": 0.448306587655997,
"learning_rate": 2.254737953450413e-06,
"loss": 0.4300067901611328,
"step": 4240
},
{
"epoch": 0.7184287540543681,
"grad_norm": 0.41278204876846897,
"learning_rate": 2.230126751137604e-06,
"loss": 0.42948102951049805,
"step": 4250
},
{
"epoch": 0.7201191746521431,
"grad_norm": 0.8234588798535949,
"learning_rate": 2.2056119943955493e-06,
"loss": 0.423651123046875,
"step": 4260
},
{
"epoch": 0.7218095952499182,
"grad_norm": 0.4295386395064719,
"learning_rate": 2.181194536815553e-06,
"loss": 0.4270059585571289,
"step": 4270
},
{
"epoch": 0.7235000158476931,
"grad_norm": 0.4845164711653868,
"learning_rate": 2.1568752286010046e-06,
"loss": 0.42238831520080566,
"step": 4280
},
{
"epoch": 0.7251904364454681,
"grad_norm": 0.4188111308447989,
"learning_rate": 2.132654916537786e-06,
"loss": 0.42424306869506834,
"step": 4290
},
{
"epoch": 0.7268808570432431,
"grad_norm": 0.4083567150439788,
"learning_rate": 2.108534443964785e-06,
"loss": 0.4173469066619873,
"step": 4300
},
{
"epoch": 0.728571277641018,
"grad_norm": 0.49701750342474227,
"learning_rate": 2.0845146507445234e-06,
"loss": 0.4256436347961426,
"step": 4310
},
{
"epoch": 0.7302616982387931,
"grad_norm": 0.4556476468988333,
"learning_rate": 2.0605963732339294e-06,
"loss": 0.4249898433685303,
"step": 4320
},
{
"epoch": 0.7319521188365681,
"grad_norm": 0.390956985533463,
"learning_rate": 2.0367804442551987e-06,
"loss": 0.4315620422363281,
"step": 4330
},
{
"epoch": 0.733642539434343,
"grad_norm": 0.3993855335495715,
"learning_rate": 2.013067693066805e-06,
"loss": 0.4309099197387695,
"step": 4340
},
{
"epoch": 0.735332960032118,
"grad_norm": 0.4126264964735023,
"learning_rate": 1.989458945334623e-06,
"loss": 0.42850918769836427,
"step": 4350
},
{
"epoch": 0.7370233806298929,
"grad_norm": 0.44736633124886904,
"learning_rate": 1.9659550231031816e-06,
"loss": 0.4222278594970703,
"step": 4360
},
{
"epoch": 0.7387138012276679,
"grad_norm": 0.42368325672394164,
"learning_rate": 1.942556744767044e-06,
"loss": 0.42037811279296877,
"step": 4370
},
{
"epoch": 0.740404221825443,
"grad_norm": 0.5501466174054114,
"learning_rate": 1.919264925042295e-06,
"loss": 0.41927204132080076,
"step": 4380
},
{
"epoch": 0.7420946424232179,
"grad_norm": 0.39918622365681766,
"learning_rate": 1.8960803749381973e-06,
"loss": 0.42456836700439454,
"step": 4390
},
{
"epoch": 0.7437850630209929,
"grad_norm": 0.43304426432279913,
"learning_rate": 1.8730039017289326e-06,
"loss": 0.4330174446105957,
"step": 4400
},
{
"epoch": 0.7454754836187679,
"grad_norm": 0.39829603640659866,
"learning_rate": 1.8500363089255074e-06,
"loss": 0.4192543029785156,
"step": 4410
},
{
"epoch": 0.7471659042165428,
"grad_norm": 0.41399088851086324,
"learning_rate": 1.827178396247759e-06,
"loss": 0.42575607299804685,
"step": 4420
},
{
"epoch": 0.7488563248143179,
"grad_norm": 0.7360790909254504,
"learning_rate": 1.8044309595965225e-06,
"loss": 0.4194206237792969,
"step": 4430
},
{
"epoch": 0.7505467454120929,
"grad_norm": 0.44707313603093624,
"learning_rate": 1.7817947910259197e-06,
"loss": 0.42949066162109373,
"step": 4440
},
{
"epoch": 0.7522371660098678,
"grad_norm": 0.6041320702862265,
"learning_rate": 1.7592706787157682e-06,
"loss": 0.4255552291870117,
"step": 4450
},
{
"epoch": 0.7539275866076428,
"grad_norm": 0.45773641763660167,
"learning_rate": 1.7368594069441452e-06,
"loss": 0.43103628158569335,
"step": 4460
},
{
"epoch": 0.7556180072054178,
"grad_norm": 0.41301944176532956,
"learning_rate": 1.7145617560600775e-06,
"loss": 0.4204230785369873,
"step": 4470
},
{
"epoch": 0.7573084278031927,
"grad_norm": 0.6409284079564223,
"learning_rate": 1.6923785024563755e-06,
"loss": 0.43199663162231444,
"step": 4480
},
{
"epoch": 0.7589988484009678,
"grad_norm": 0.42176176532876736,
"learning_rate": 1.670310418542589e-06,
"loss": 0.4303304672241211,
"step": 4490
},
{
"epoch": 0.7606892689987428,
"grad_norm": 0.41067768414220995,
"learning_rate": 1.6483582727181203e-06,
"loss": 0.42124075889587403,
"step": 4500
},
{
"epoch": 0.7623796895965177,
"grad_norm": 0.4783665056517865,
"learning_rate": 1.626522829345466e-06,
"loss": 0.4269865989685059,
"step": 4510
},
{
"epoch": 0.7640701101942927,
"grad_norm": 0.5049031164710026,
"learning_rate": 1.604804848723603e-06,
"loss": 0.426270866394043,
"step": 4520
},
{
"epoch": 0.7657605307920677,
"grad_norm": 0.4857945955434734,
"learning_rate": 1.583205087061519e-06,
"loss": 0.4200442314147949,
"step": 4530
},
{
"epoch": 0.7674509513898427,
"grad_norm": 0.6317053106938714,
"learning_rate": 1.5617242964518737e-06,
"loss": 0.4251349925994873,
"step": 4540
},
{
"epoch": 0.7691413719876177,
"grad_norm": 0.44940467166388814,
"learning_rate": 1.5403632248448126e-06,
"loss": 0.4180570125579834,
"step": 4550
},
{
"epoch": 0.7708317925853927,
"grad_norm": 0.4779559868678119,
"learning_rate": 1.5191226160219353e-06,
"loss": 0.42661218643188475,
"step": 4560
},
{
"epoch": 0.7725222131831676,
"grad_norm": 0.48731681458704584,
"learning_rate": 1.4980032095703812e-06,
"loss": 0.4164596080780029,
"step": 4570
},
{
"epoch": 0.7742126337809426,
"grad_norm": 0.42493021443963513,
"learning_rate": 1.4770057408570932e-06,
"loss": 0.42499027252197263,
"step": 4580
},
{
"epoch": 0.7759030543787176,
"grad_norm": 0.4461752465394288,
"learning_rate": 1.4561309410031927e-06,
"loss": 0.4126904964447021,
"step": 4590
},
{
"epoch": 0.7775934749764926,
"grad_norm": 0.45608491001161766,
"learning_rate": 1.4353795368585455e-06,
"loss": 0.42144598960876467,
"step": 4600
},
{
"epoch": 0.7792838955742676,
"grad_norm": 0.49822249900794946,
"learning_rate": 1.4147522509764354e-06,
"loss": 0.43109354972839353,
"step": 4610
},
{
"epoch": 0.7809743161720426,
"grad_norm": 0.4695623975406333,
"learning_rate": 1.3942498015884148e-06,
"loss": 0.42487325668334963,
"step": 4620
},
{
"epoch": 0.7826647367698175,
"grad_norm": 0.6458133705476774,
"learning_rate": 1.3738729025792908e-06,
"loss": 0.4279775619506836,
"step": 4630
},
{
"epoch": 0.7843551573675925,
"grad_norm": 0.5664119160533859,
"learning_rate": 1.3536222634622704e-06,
"loss": 0.43117513656616213,
"step": 4640
},
{
"epoch": 0.7860455779653676,
"grad_norm": 0.42786217028357204,
"learning_rate": 1.3334985893542596e-06,
"loss": 0.42508134841918943,
"step": 4650
},
{
"epoch": 0.7877359985631425,
"grad_norm": 0.43637907827702244,
"learning_rate": 1.3135025809513047e-06,
"loss": 0.4202974796295166,
"step": 4660
},
{
"epoch": 0.7894264191609175,
"grad_norm": 0.438569542782335,
"learning_rate": 1.293634934504196e-06,
"loss": 0.4253946304321289,
"step": 4670
},
{
"epoch": 0.7911168397586925,
"grad_norm": 0.587103245107529,
"learning_rate": 1.273896341794229e-06,
"loss": 0.4194544792175293,
"step": 4680
},
{
"epoch": 0.7928072603564674,
"grad_norm": 0.4010620435372963,
"learning_rate": 1.2542874901091111e-06,
"loss": 0.4227456092834473,
"step": 4690
},
{
"epoch": 0.7944976809542424,
"grad_norm": 0.4027753025082721,
"learning_rate": 1.234809062219039e-06,
"loss": 0.4224235534667969,
"step": 4700
},
{
"epoch": 0.7961881015520174,
"grad_norm": 0.4860032717278702,
"learning_rate": 1.2154617363529126e-06,
"loss": 0.4228099822998047,
"step": 4710
},
{
"epoch": 0.7978785221497924,
"grad_norm": 0.48458357986452577,
"learning_rate": 1.1962461861747305e-06,
"loss": 0.4361612319946289,
"step": 4720
},
{
"epoch": 0.7995689427475674,
"grad_norm": 0.5128580678938933,
"learning_rate": 1.1771630807601287e-06,
"loss": 0.4273702144622803,
"step": 4730
},
{
"epoch": 0.8012593633453423,
"grad_norm": 0.4046622055593574,
"learning_rate": 1.1582130845730826e-06,
"loss": 0.4226740837097168,
"step": 4740
},
{
"epoch": 0.8029497839431173,
"grad_norm": 0.3997957153545625,
"learning_rate": 1.1393968574427744e-06,
"loss": 0.42818431854248046,
"step": 4750
},
{
"epoch": 0.8046402045408924,
"grad_norm": 0.4025063941387675,
"learning_rate": 1.1207150545406136e-06,
"loss": 0.425289249420166,
"step": 4760
},
{
"epoch": 0.8063306251386673,
"grad_norm": 0.4183154464205255,
"learning_rate": 1.1021683263574313e-06,
"loss": 0.4195976734161377,
"step": 4770
},
{
"epoch": 0.8080210457364423,
"grad_norm": 0.4384474275798556,
"learning_rate": 1.0837573186808214e-06,
"loss": 0.43142261505126955,
"step": 4780
},
{
"epoch": 0.8097114663342173,
"grad_norm": 0.4557749163813245,
"learning_rate": 1.0654826725726608e-06,
"loss": 0.4209465980529785,
"step": 4790
},
{
"epoch": 0.8114018869319922,
"grad_norm": 0.5432867266168557,
"learning_rate": 1.0473450243467865e-06,
"loss": 0.4263154029846191,
"step": 4800
},
{
"epoch": 0.8130923075297672,
"grad_norm": 0.5025593460716719,
"learning_rate": 1.0293450055468374e-06,
"loss": 0.4319735527038574,
"step": 4810
},
{
"epoch": 0.8147827281275423,
"grad_norm": 0.4755935024526056,
"learning_rate": 1.0114832429242705e-06,
"loss": 0.4187938690185547,
"step": 4820
},
{
"epoch": 0.8164731487253172,
"grad_norm": 0.4010757390976713,
"learning_rate": 9.93760358416529e-07,
"loss": 0.4261464595794678,
"step": 4830
},
{
"epoch": 0.8181635693230922,
"grad_norm": 0.7131694816309398,
"learning_rate": 9.761769691253931e-07,
"loss": 0.4193047046661377,
"step": 4840
},
{
"epoch": 0.8198539899208672,
"grad_norm": 0.5034513530726354,
"learning_rate": 9.587336872954906e-07,
"loss": 0.4203728199005127,
"step": 4850
},
{
"epoch": 0.8215444105186421,
"grad_norm": 0.42178444814812144,
"learning_rate": 9.414311202929771e-07,
"loss": 0.42111892700195314,
"step": 4860
},
{
"epoch": 0.8232348311164172,
"grad_norm": 0.5122204909729405,
"learning_rate": 9.242698705843961e-07,
"loss": 0.423065185546875,
"step": 4870
},
{
"epoch": 0.8249252517141922,
"grad_norm": 0.3989956407391952,
"learning_rate": 9.072505357156858e-07,
"loss": 0.42530975341796873,
"step": 4880
},
{
"epoch": 0.8266156723119671,
"grad_norm": 0.39938195227376233,
"learning_rate": 8.903737082913905e-07,
"loss": 0.4214590072631836,
"step": 4890
},
{
"epoch": 0.8283060929097421,
"grad_norm": 0.40185499788640217,
"learning_rate": 8.736399759540132e-07,
"loss": 0.4245802879333496,
"step": 4900
},
{
"epoch": 0.8299965135075171,
"grad_norm": 0.40431837535545917,
"learning_rate": 8.570499213635635e-07,
"loss": 0.41890692710876465,
"step": 4910
},
{
"epoch": 0.831686934105292,
"grad_norm": 0.4407892358374833,
"learning_rate": 8.406041221772593e-07,
"loss": 0.4315225124359131,
"step": 4920
},
{
"epoch": 0.8333773547030671,
"grad_norm": 0.587964877158027,
"learning_rate": 8.243031510294225e-07,
"loss": 0.4260035514831543,
"step": 4930
},
{
"epoch": 0.8350677753008421,
"grad_norm": 0.3867363003903258,
"learning_rate": 8.081475755115381e-07,
"loss": 0.4230846881866455,
"step": 4940
},
{
"epoch": 0.836758195898617,
"grad_norm": 0.4125267432837227,
"learning_rate": 7.921379581524879e-07,
"loss": 0.4201014518737793,
"step": 4950
},
{
"epoch": 0.838448616496392,
"grad_norm": 0.4145768688095069,
"learning_rate": 7.762748563989653e-07,
"loss": 0.42217350006103516,
"step": 4960
},
{
"epoch": 0.840139037094167,
"grad_norm": 0.3827238095008044,
"learning_rate": 7.605588225960631e-07,
"loss": 0.4181986808776855,
"step": 4970
},
{
"epoch": 0.841829457691942,
"grad_norm": 0.45519859678121805,
"learning_rate": 7.449904039680483e-07,
"loss": 0.42473936080932617,
"step": 4980
},
{
"epoch": 0.843519878289717,
"grad_norm": 0.38187166057209,
"learning_rate": 7.295701425992984e-07,
"loss": 0.42186822891235354,
"step": 4990
},
{
"epoch": 0.845210298887492,
"grad_norm": 0.4100653082076505,
"learning_rate": 7.142985754154336e-07,
"loss": 0.41666412353515625,
"step": 5000
},
{
"epoch": 0.8469007194852669,
"grad_norm": 0.3990265533675085,
"learning_rate": 6.991762341646163e-07,
"loss": 0.41996259689331056,
"step": 5010
},
{
"epoch": 0.8485911400830419,
"grad_norm": 0.42655171589794805,
"learning_rate": 6.842036453990386e-07,
"loss": 0.41867885589599607,
"step": 5020
},
{
"epoch": 0.850281560680817,
"grad_norm": 0.40280454432813595,
"learning_rate": 6.69381330456591e-07,
"loss": 0.42267436981201173,
"step": 5030
},
{
"epoch": 0.8519719812785919,
"grad_norm": 0.7423046244692998,
"learning_rate": 6.547098054427031e-07,
"loss": 0.42384800910949705,
"step": 5040
},
{
"epoch": 0.8536624018763669,
"grad_norm": 0.4385504136747162,
"learning_rate": 6.401895812123737e-07,
"loss": 0.42662858963012695,
"step": 5050
},
{
"epoch": 0.8553528224741418,
"grad_norm": 0.40221301509494206,
"learning_rate": 6.25821163352392e-07,
"loss": 0.42463250160217286,
"step": 5060
},
{
"epoch": 0.8570432430719168,
"grad_norm": 0.4678190928201736,
"learning_rate": 6.116050521637218e-07,
"loss": 0.4259337425231934,
"step": 5070
},
{
"epoch": 0.8587336636696918,
"grad_norm": 1.5908098668249737,
"learning_rate": 5.975417426440911e-07,
"loss": 0.42664356231689454,
"step": 5080
},
{
"epoch": 0.8604240842674667,
"grad_norm": 0.40958701025875693,
"learning_rate": 5.836317244707451e-07,
"loss": 0.42066402435302735,
"step": 5090
},
{
"epoch": 0.8621145048652418,
"grad_norm": 0.38142342331351625,
"learning_rate": 5.698754819834107e-07,
"loss": 0.42577228546142576,
"step": 5100
},
{
"epoch": 0.8638049254630168,
"grad_norm": 0.5995765733030113,
"learning_rate": 5.562734941674175e-07,
"loss": 0.42906174659729,
"step": 5110
},
{
"epoch": 0.8654953460607917,
"grad_norm": 0.4124816632193718,
"learning_rate": 5.428262346370305e-07,
"loss": 0.42333269119262695,
"step": 5120
},
{
"epoch": 0.8671857666585667,
"grad_norm": 0.43153283983780877,
"learning_rate": 5.295341716189522e-07,
"loss": 0.42881431579589846,
"step": 5130
},
{
"epoch": 0.8688761872563417,
"grad_norm": 0.9191108566472296,
"learning_rate": 5.163977679360221e-07,
"loss": 0.41905965805053713,
"step": 5140
},
{
"epoch": 0.8705666078541167,
"grad_norm": 0.4198392449021922,
"learning_rate": 5.034174809911042e-07,
"loss": 0.427170467376709,
"step": 5150
},
{
"epoch": 0.8722570284518917,
"grad_norm": 0.4444926311135468,
"learning_rate": 4.905937627511536e-07,
"loss": 0.4227573394775391,
"step": 5160
},
{
"epoch": 0.8739474490496667,
"grad_norm": 0.5606934658286145,
"learning_rate": 4.779270597314861e-07,
"loss": 0.4257050514221191,
"step": 5170
},
{
"epoch": 0.8756378696474416,
"grad_norm": 0.41569630734941043,
"learning_rate": 4.65417812980225e-07,
"loss": 0.4265610694885254,
"step": 5180
},
{
"epoch": 0.8773282902452166,
"grad_norm": 0.47494065883948217,
"learning_rate": 4.5306645806294904e-07,
"loss": 0.41692366600036623,
"step": 5190
},
{
"epoch": 0.8790187108429917,
"grad_norm": 0.45085269223305413,
"learning_rate": 4.4087342504752383e-07,
"loss": 0.4252819061279297,
"step": 5200
},
{
"epoch": 0.8807091314407666,
"grad_norm": 0.3913069944608277,
"learning_rate": 4.288391384891261e-07,
"loss": 0.42319507598876954,
"step": 5210
},
{
"epoch": 0.8823995520385416,
"grad_norm": 0.48873707062317545,
"learning_rate": 4.169640174154627e-07,
"loss": 0.4205298900604248,
"step": 5220
},
{
"epoch": 0.8840899726363166,
"grad_norm": 0.3939589445180343,
"learning_rate": 4.052484753121799e-07,
"loss": 0.4183074951171875,
"step": 5230
},
{
"epoch": 0.8857803932340915,
"grad_norm": 0.9063181567182339,
"learning_rate": 3.936929201084644e-07,
"loss": 0.42419729232788084,
"step": 5240
},
{
"epoch": 0.8874708138318665,
"grad_norm": 0.4205501036887155,
"learning_rate": 3.822977541628453e-07,
"loss": 0.424894905090332,
"step": 5250
},
{
"epoch": 0.8891612344296416,
"grad_norm": 0.43108338551767145,
"learning_rate": 3.7106337424917205e-07,
"loss": 0.42048206329345705,
"step": 5260
},
{
"epoch": 0.8908516550274165,
"grad_norm": 0.5544953283474947,
"learning_rate": 3.599901715428139e-07,
"loss": 0.4223438262939453,
"step": 5270
},
{
"epoch": 0.8925420756251915,
"grad_norm": 0.4304365854392619,
"learning_rate": 3.4907853160702777e-07,
"loss": 0.42336835861206057,
"step": 5280
},
{
"epoch": 0.8942324962229665,
"grad_norm": 0.3906588855234446,
"learning_rate": 3.38328834379541e-07,
"loss": 0.4154191970825195,
"step": 5290
},
{
"epoch": 0.8959229168207414,
"grad_norm": 0.5031541905142518,
"learning_rate": 3.277414541593144e-07,
"loss": 0.41904850006103517,
"step": 5300
},
{
"epoch": 0.8976133374185165,
"grad_norm": 0.4530444890179973,
"learning_rate": 3.173167595935156e-07,
"loss": 0.4230810165405273,
"step": 5310
},
{
"epoch": 0.8993037580162915,
"grad_norm": 0.48713249648075824,
"learning_rate": 3.0705511366468264e-07,
"loss": 0.4264675617218018,
"step": 5320
},
{
"epoch": 0.9009941786140664,
"grad_norm": 0.5208238749067577,
"learning_rate": 2.969568736780809e-07,
"loss": 0.4253951072692871,
"step": 5330
},
{
"epoch": 0.9026845992118414,
"grad_norm": 0.4822048723169926,
"learning_rate": 2.8702239124926536e-07,
"loss": 0.42444214820861814,
"step": 5340
},
{
"epoch": 0.9043750198096164,
"grad_norm": 0.4592549523630122,
"learning_rate": 2.7725201229183595e-07,
"loss": 0.41964020729064944,
"step": 5350
},
{
"epoch": 0.9060654404073913,
"grad_norm": 0.5273092393546448,
"learning_rate": 2.676460770053935e-07,
"loss": 0.4201678276062012,
"step": 5360
},
{
"epoch": 0.9077558610051664,
"grad_norm": 0.49079242898504394,
"learning_rate": 2.5820491986369655e-07,
"loss": 0.4261648654937744,
"step": 5370
},
{
"epoch": 0.9094462816029414,
"grad_norm": 0.39447657056029006,
"learning_rate": 2.4892886960300955e-07,
"loss": 0.42170066833496095,
"step": 5380
},
{
"epoch": 0.9111367022007163,
"grad_norm": 0.39984798676859423,
"learning_rate": 2.3981824921066264e-07,
"loss": 0.4172752857208252,
"step": 5390
},
{
"epoch": 0.9128271227984913,
"grad_norm": 0.42983400440300035,
"learning_rate": 2.3087337591379877e-07,
"loss": 0.42790851593017576,
"step": 5400
},
{
"epoch": 0.9145175433962662,
"grad_norm": 0.5322157941863958,
"learning_rate": 2.2209456116833726e-07,
"loss": 0.42197275161743164,
"step": 5410
},
{
"epoch": 0.9162079639940413,
"grad_norm": 0.7319643214256284,
"learning_rate": 2.1348211064811886e-07,
"loss": 0.4245607376098633,
"step": 5420
},
{
"epoch": 0.9178983845918163,
"grad_norm": 0.6154493827068899,
"learning_rate": 2.050363242342679e-07,
"loss": 0.4241465091705322,
"step": 5430
},
{
"epoch": 0.9195888051895912,
"grad_norm": 0.38677628278359527,
"learning_rate": 1.9675749600475137e-07,
"loss": 0.42714385986328124,
"step": 5440
},
{
"epoch": 0.9212792257873662,
"grad_norm": 0.4023137059518498,
"learning_rate": 1.8864591422413647e-07,
"loss": 0.4192478179931641,
"step": 5450
},
{
"epoch": 0.9229696463851412,
"grad_norm": 0.5187932907244228,
"learning_rate": 1.8070186133355482e-07,
"loss": 0.41727705001831056,
"step": 5460
},
{
"epoch": 0.9246600669829161,
"grad_norm": 0.965073641979697,
"learning_rate": 1.7292561394086638e-07,
"loss": 0.42743620872497556,
"step": 5470
},
{
"epoch": 0.9263504875806912,
"grad_norm": 0.7609466724404457,
"learning_rate": 1.6531744281103268e-07,
"loss": 0.42295517921447756,
"step": 5480
},
{
"epoch": 0.9280409081784662,
"grad_norm": 0.5111775114201492,
"learning_rate": 1.578776128566828e-07,
"loss": 0.4170988082885742,
"step": 5490
},
{
"epoch": 0.9297313287762411,
"grad_norm": 0.4545039552455224,
"learning_rate": 1.5060638312889288e-07,
"loss": 0.4168592929840088,
"step": 5500
},
{
"epoch": 0.9314217493740161,
"grad_norm": 0.3892891280788642,
"learning_rate": 1.4350400680816555e-07,
"loss": 0.4170875072479248,
"step": 5510
},
{
"epoch": 0.9331121699717911,
"grad_norm": 0.40500981225510563,
"learning_rate": 1.365707311956138e-07,
"loss": 0.4208053112030029,
"step": 5520
},
{
"epoch": 0.934802590569566,
"grad_norm": 0.41669067149487427,
"learning_rate": 1.29806797704351e-07,
"loss": 0.4162314414978027,
"step": 5530
},
{
"epoch": 0.9364930111673411,
"grad_norm": 0.45468975448954785,
"learning_rate": 1.2321244185108438e-07,
"loss": 0.42525811195373536,
"step": 5540
},
{
"epoch": 0.9381834317651161,
"grad_norm": 0.5324807640892435,
"learning_rate": 1.1678789324791385e-07,
"loss": 0.4258564949035645,
"step": 5550
},
{
"epoch": 0.939873852362891,
"grad_norm": 0.42891448398762166,
"learning_rate": 1.1053337559433774e-07,
"loss": 0.42623538970947267,
"step": 5560
},
{
"epoch": 0.941564272960666,
"grad_norm": 0.501340218520871,
"learning_rate": 1.0444910666946362e-07,
"loss": 0.42596940994262694,
"step": 5570
},
{
"epoch": 0.943254693558441,
"grad_norm": 0.4031637084492024,
"learning_rate": 9.853529832442643e-08,
"loss": 0.4168665885925293,
"step": 5580
},
{
"epoch": 0.944945114156216,
"grad_norm": 0.41610280627051877,
"learning_rate": 9.27921564750095e-08,
"loss": 0.4280057907104492,
"step": 5590
},
{
"epoch": 0.946635534753991,
"grad_norm": 0.3941913310019063,
"learning_rate": 8.721988109447632e-08,
"loss": 0.4269193172454834,
"step": 5600
},
{
"epoch": 0.948325955351766,
"grad_norm": 0.4025191194269793,
"learning_rate": 8.181866620660839e-08,
"loss": 0.42088003158569337,
"step": 5610
},
{
"epoch": 0.9500163759495409,
"grad_norm": 0.4071300779033532,
"learning_rate": 7.658869987894612e-08,
"loss": 0.41224422454833987,
"step": 5620
},
{
"epoch": 0.9517067965473159,
"grad_norm": 0.4950344979035128,
"learning_rate": 7.153016421624525e-08,
"loss": 0.4289411544799805,
"step": 5630
},
{
"epoch": 0.953397217145091,
"grad_norm": 0.42576170549234565,
"learning_rate": 6.66432353541302e-08,
"loss": 0.4195157527923584,
"step": 5640
},
{
"epoch": 0.9550876377428659,
"grad_norm": 0.41466397846386327,
"learning_rate": 6.192808345296786e-08,
"loss": 0.42493529319763185,
"step": 5650
},
{
"epoch": 0.9567780583406409,
"grad_norm": 0.3861387762178082,
"learning_rate": 5.7384872691936264e-08,
"loss": 0.42592926025390626,
"step": 5660
},
{
"epoch": 0.9584684789384159,
"grad_norm": 0.5844447829296148,
"learning_rate": 5.301376126331248e-08,
"loss": 0.42506136894226076,
"step": 5670
},
{
"epoch": 0.9601588995361908,
"grad_norm": 0.4583100629468275,
"learning_rate": 4.8814901366961985e-08,
"loss": 0.4225175857543945,
"step": 5680
},
{
"epoch": 0.9618493201339658,
"grad_norm": 0.4511824979451254,
"learning_rate": 4.478843920504017e-08,
"loss": 0.424249267578125,
"step": 5690
},
{
"epoch": 0.9635397407317409,
"grad_norm": 0.36704192082082593,
"learning_rate": 4.093451497690193e-08,
"loss": 0.42214536666870117,
"step": 5700
},
{
"epoch": 0.9652301613295158,
"grad_norm": 0.46415491364375805,
"learning_rate": 3.725326287421838e-08,
"loss": 0.42182073593139646,
"step": 5710
},
{
"epoch": 0.9669205819272908,
"grad_norm": 0.43678313014125836,
"learning_rate": 3.374481107630612e-08,
"loss": 0.4228252410888672,
"step": 5720
},
{
"epoch": 0.9686110025250658,
"grad_norm": 0.44999534894541443,
"learning_rate": 3.040928174566415e-08,
"loss": 0.4173929214477539,
"step": 5730
},
{
"epoch": 0.9703014231228407,
"grad_norm": 0.5166241536078265,
"learning_rate": 2.7246791023717854e-08,
"loss": 0.42366867065429686,
"step": 5740
},
{
"epoch": 0.9719918437206158,
"grad_norm": 0.49082550972111405,
"learning_rate": 2.42574490267794e-08,
"loss": 0.42436580657958983,
"step": 5750
},
{
"epoch": 0.9736822643183907,
"grad_norm": 0.41566861360225094,
"learning_rate": 2.1441359842206966e-08,
"loss": 0.41978960037231444,
"step": 5760
},
{
"epoch": 0.9753726849161657,
"grad_norm": 0.4467142353931868,
"learning_rate": 1.8798621524788173e-08,
"loss": 0.4175607681274414,
"step": 5770
},
{
"epoch": 0.9770631055139407,
"grad_norm": 0.41925536664649327,
"learning_rate": 1.6329326093320053e-08,
"loss": 0.4178473472595215,
"step": 5780
},
{
"epoch": 0.9787535261117156,
"grad_norm": 0.4142968221515882,
"learning_rate": 1.4033559527407703e-08,
"loss": 0.41806626319885254,
"step": 5790
},
{
"epoch": 0.9804439467094906,
"grad_norm": 0.48270508356543623,
"learning_rate": 1.1911401764468922e-08,
"loss": 0.41689310073852537,
"step": 5800
},
{
"epoch": 0.9821343673072657,
"grad_norm": 0.4223736800888985,
"learning_rate": 9.96292669695198e-09,
"loss": 0.418758487701416,
"step": 5810
},
{
"epoch": 0.9838247879050406,
"grad_norm": 0.835433326472655,
"learning_rate": 8.188202169763793e-09,
"loss": 0.4206347942352295,
"step": 5820
},
{
"epoch": 0.9855152085028156,
"grad_norm": 0.43870192086162735,
"learning_rate": 6.5872899779045875e-09,
"loss": 0.4244359016418457,
"step": 5830
},
{
"epoch": 0.9872056291005906,
"grad_norm": 0.40318943557648546,
"learning_rate": 5.160245864319069e-09,
"loss": 0.41902694702148435,
"step": 5840
},
{
"epoch": 0.9888960496983655,
"grad_norm": 0.44780330264589363,
"learning_rate": 3.907119517954083e-09,
"loss": 0.4193845748901367,
"step": 5850
},
{
"epoch": 0.9905864702961406,
"grad_norm": 0.46299907040473565,
"learning_rate": 2.827954572027225e-09,
"loss": 0.41649885177612306,
"step": 5860
},
{
"epoch": 0.9922768908939156,
"grad_norm": 0.7022500478758524,
"learning_rate": 1.922788602511938e-09,
"loss": 0.4247690200805664,
"step": 5870
},
{
"epoch": 0.9939673114916905,
"grad_norm": 0.4194367425429352,
"learning_rate": 1.1916531268230114e-09,
"loss": 0.4229640007019043,
"step": 5880
},
{
"epoch": 0.9956577320894655,
"grad_norm": 0.40623075402193076,
"learning_rate": 6.345736027257853e-10,
"loss": 0.42009563446044923,
"step": 5890
},
{
"epoch": 0.9973481526872405,
"grad_norm": 0.47810385384154763,
"learning_rate": 2.51569427442977e-10,
"loss": 0.4222869396209717,
"step": 5900
},
{
"epoch": 0.9990385732850154,
"grad_norm": 0.40176422604323225,
"learning_rate": 4.2653936984660136e-11,
"loss": 0.42395753860473634,
"step": 5910
},
{
"epoch": 1.0,
"step": 5916,
"total_flos": 1.962434144184251e+19,
"train_loss": 0.44837146779828335,
"train_runtime": 384303.2502,
"train_samples_per_second": 1.97,
"train_steps_per_second": 0.015
}
],
"logging_steps": 10,
"max_steps": 5916,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1479,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.962434144184251e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}