{ "best_global_step": 1380, "best_metric": 0.6777992248535156, "best_model_checkpoint": "saves/qwen3-4B/Qwen3-4B-SFT-science-1e-5/checkpoint-1380", "epoch": 3.0, "eval_steps": 230, "global_step": 2313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012976480129764801, "grad_norm": 8.159334182739258, "learning_rate": 0.0, "loss": 1.117659091949463, "step": 1 }, { "epoch": 0.0025952960259529602, "grad_norm": 7.678379535675049, "learning_rate": 8.620689655172414e-08, "loss": 1.0263863801956177, "step": 2 }, { "epoch": 0.0038929440389294406, "grad_norm": 8.245121002197266, "learning_rate": 1.7241379310344828e-07, "loss": 1.1220793724060059, "step": 3 }, { "epoch": 0.0051905920519059205, "grad_norm": 8.546252250671387, "learning_rate": 2.5862068965517245e-07, "loss": 1.18021821975708, "step": 4 }, { "epoch": 0.006488240064882401, "grad_norm": 7.886499404907227, "learning_rate": 3.4482758620689656e-07, "loss": 1.107445240020752, "step": 5 }, { "epoch": 0.007785888077858881, "grad_norm": 10.850175857543945, "learning_rate": 4.3103448275862073e-07, "loss": 1.099359393119812, "step": 6 }, { "epoch": 0.009083536090835361, "grad_norm": 8.097647666931152, "learning_rate": 5.172413793103449e-07, "loss": 1.0631245374679565, "step": 7 }, { "epoch": 0.010381184103811841, "grad_norm": 7.725368022918701, "learning_rate": 6.034482758620691e-07, "loss": 1.0364526510238647, "step": 8 }, { "epoch": 0.01167883211678832, "grad_norm": 7.084433555603027, "learning_rate": 6.896551724137931e-07, "loss": 0.977345883846283, "step": 9 }, { "epoch": 0.012976480129764802, "grad_norm": 7.370170593261719, "learning_rate": 7.758620689655173e-07, "loss": 1.0759401321411133, "step": 10 }, { "epoch": 0.014274128142741281, "grad_norm": 7.056736469268799, "learning_rate": 8.620689655172415e-07, "loss": 1.051821231842041, "step": 11 }, { "epoch": 0.015571776155717762, "grad_norm": 6.129208564758301, "learning_rate": 9.482758620689655e-07, "loss": 1.008002758026123, "step": 12 }, { "epoch": 0.01686942416869424, "grad_norm": 6.331120491027832, "learning_rate": 1.0344827586206898e-06, "loss": 1.0412415266036987, "step": 13 }, { "epoch": 0.018167072181670723, "grad_norm": 5.9186625480651855, "learning_rate": 1.120689655172414e-06, "loss": 1.0198311805725098, "step": 14 }, { "epoch": 0.019464720194647202, "grad_norm": 5.27198600769043, "learning_rate": 1.2068965517241381e-06, "loss": 1.0152095556259155, "step": 15 }, { "epoch": 0.020762368207623682, "grad_norm": 4.562581539154053, "learning_rate": 1.2931034482758623e-06, "loss": 0.9857317805290222, "step": 16 }, { "epoch": 0.02206001622060016, "grad_norm": 4.586100101470947, "learning_rate": 1.3793103448275862e-06, "loss": 1.0225930213928223, "step": 17 }, { "epoch": 0.02335766423357664, "grad_norm": 4.058810234069824, "learning_rate": 1.4655172413793104e-06, "loss": 0.9823198318481445, "step": 18 }, { "epoch": 0.024655312246553124, "grad_norm": 4.066655158996582, "learning_rate": 1.5517241379310346e-06, "loss": 0.9863596558570862, "step": 19 }, { "epoch": 0.025952960259529603, "grad_norm": 3.7554173469543457, "learning_rate": 1.6379310344827587e-06, "loss": 0.9025828838348389, "step": 20 }, { "epoch": 0.027250608272506083, "grad_norm": 2.6631460189819336, "learning_rate": 1.724137931034483e-06, "loss": 0.9907147288322449, "step": 21 }, { "epoch": 0.028548256285482562, "grad_norm": 2.3198695182800293, "learning_rate": 1.810344827586207e-06, "loss": 0.872843861579895, "step": 22 }, { "epoch": 0.02984590429845904, "grad_norm": 2.0851941108703613, "learning_rate": 1.896551724137931e-06, "loss": 0.896687388420105, "step": 23 }, { "epoch": 0.031143552311435525, "grad_norm": 1.9391196966171265, "learning_rate": 1.982758620689655e-06, "loss": 0.8471081852912903, "step": 24 }, { "epoch": 0.032441200324412, "grad_norm": 1.705809473991394, "learning_rate": 2.0689655172413796e-06, "loss": 0.8402453064918518, "step": 25 }, { "epoch": 0.03373884833738848, "grad_norm": 1.6870861053466797, "learning_rate": 2.1551724137931035e-06, "loss": 0.8823003768920898, "step": 26 }, { "epoch": 0.035036496350364967, "grad_norm": 1.63539719581604, "learning_rate": 2.241379310344828e-06, "loss": 0.8520861864089966, "step": 27 }, { "epoch": 0.036334144363341446, "grad_norm": 1.215566873550415, "learning_rate": 2.327586206896552e-06, "loss": 0.8638472557067871, "step": 28 }, { "epoch": 0.037631792376317925, "grad_norm": 1.3506380319595337, "learning_rate": 2.4137931034482762e-06, "loss": 0.8670657277107239, "step": 29 }, { "epoch": 0.038929440389294405, "grad_norm": 1.3755369186401367, "learning_rate": 2.5e-06, "loss": 0.8863908648490906, "step": 30 }, { "epoch": 0.040227088402270884, "grad_norm": 1.3734447956085205, "learning_rate": 2.5862068965517246e-06, "loss": 0.8129177093505859, "step": 31 }, { "epoch": 0.041524736415247364, "grad_norm": 1.294492244720459, "learning_rate": 2.672413793103448e-06, "loss": 0.8296308517456055, "step": 32 }, { "epoch": 0.04282238442822384, "grad_norm": 1.0568984746932983, "learning_rate": 2.7586206896551725e-06, "loss": 0.8287128210067749, "step": 33 }, { "epoch": 0.04412003244120032, "grad_norm": 0.9133521914482117, "learning_rate": 2.844827586206897e-06, "loss": 0.7776259183883667, "step": 34 }, { "epoch": 0.0454176804541768, "grad_norm": 0.9421447515487671, "learning_rate": 2.931034482758621e-06, "loss": 0.8422408103942871, "step": 35 }, { "epoch": 0.04671532846715328, "grad_norm": 0.9022809863090515, "learning_rate": 3.017241379310345e-06, "loss": 0.8629512190818787, "step": 36 }, { "epoch": 0.04801297648012977, "grad_norm": 0.780587375164032, "learning_rate": 3.103448275862069e-06, "loss": 0.781667947769165, "step": 37 }, { "epoch": 0.04931062449310625, "grad_norm": 0.7616261839866638, "learning_rate": 3.1896551724137935e-06, "loss": 0.7705612182617188, "step": 38 }, { "epoch": 0.05060827250608273, "grad_norm": 0.7669604420661926, "learning_rate": 3.2758620689655175e-06, "loss": 0.8600028157234192, "step": 39 }, { "epoch": 0.05190592051905921, "grad_norm": 0.8013553619384766, "learning_rate": 3.362068965517242e-06, "loss": 0.8316032886505127, "step": 40 }, { "epoch": 0.053203568532035686, "grad_norm": 0.760819673538208, "learning_rate": 3.448275862068966e-06, "loss": 0.8170580863952637, "step": 41 }, { "epoch": 0.054501216545012166, "grad_norm": 0.719124436378479, "learning_rate": 3.5344827586206898e-06, "loss": 0.7726640701293945, "step": 42 }, { "epoch": 0.055798864557988645, "grad_norm": 0.7333022952079773, "learning_rate": 3.620689655172414e-06, "loss": 0.7847077250480652, "step": 43 }, { "epoch": 0.057096512570965124, "grad_norm": 0.7520370483398438, "learning_rate": 3.7068965517241385e-06, "loss": 0.7839537858963013, "step": 44 }, { "epoch": 0.058394160583941604, "grad_norm": 0.7901465892791748, "learning_rate": 3.793103448275862e-06, "loss": 0.8387829065322876, "step": 45 }, { "epoch": 0.05969180859691808, "grad_norm": 0.7442818284034729, "learning_rate": 3.8793103448275865e-06, "loss": 0.7767361402511597, "step": 46 }, { "epoch": 0.06098945660989456, "grad_norm": 0.6601076722145081, "learning_rate": 3.96551724137931e-06, "loss": 0.7320765256881714, "step": 47 }, { "epoch": 0.06228710462287105, "grad_norm": 0.6948726773262024, "learning_rate": 4.051724137931034e-06, "loss": 0.8290716409683228, "step": 48 }, { "epoch": 0.06358475263584752, "grad_norm": 0.6669663190841675, "learning_rate": 4.137931034482759e-06, "loss": 0.7917401790618896, "step": 49 }, { "epoch": 0.064882400648824, "grad_norm": 0.6616993546485901, "learning_rate": 4.224137931034483e-06, "loss": 0.7917563915252686, "step": 50 }, { "epoch": 0.06618004866180048, "grad_norm": 0.6595159769058228, "learning_rate": 4.310344827586207e-06, "loss": 0.7899826765060425, "step": 51 }, { "epoch": 0.06747769667477696, "grad_norm": 0.6776856184005737, "learning_rate": 4.396551724137931e-06, "loss": 0.8258700370788574, "step": 52 }, { "epoch": 0.06877534468775345, "grad_norm": 0.7086785435676575, "learning_rate": 4.482758620689656e-06, "loss": 0.8281067609786987, "step": 53 }, { "epoch": 0.07007299270072993, "grad_norm": 0.6362385153770447, "learning_rate": 4.56896551724138e-06, "loss": 0.7703720331192017, "step": 54 }, { "epoch": 0.07137064071370641, "grad_norm": 1.0633333921432495, "learning_rate": 4.655172413793104e-06, "loss": 0.7698659896850586, "step": 55 }, { "epoch": 0.07266828872668289, "grad_norm": 0.6450533270835876, "learning_rate": 4.741379310344828e-06, "loss": 0.7988396286964417, "step": 56 }, { "epoch": 0.07396593673965937, "grad_norm": 0.6176488995552063, "learning_rate": 4.8275862068965525e-06, "loss": 0.7486166954040527, "step": 57 }, { "epoch": 0.07526358475263585, "grad_norm": 0.6564953327178955, "learning_rate": 4.9137931034482765e-06, "loss": 0.8380484580993652, "step": 58 }, { "epoch": 0.07656123276561233, "grad_norm": 1.4383426904678345, "learning_rate": 5e-06, "loss": 0.8178592920303345, "step": 59 }, { "epoch": 0.07785888077858881, "grad_norm": 0.6065345406532288, "learning_rate": 5.086206896551724e-06, "loss": 0.7402592897415161, "step": 60 }, { "epoch": 0.07915652879156529, "grad_norm": 0.6361149549484253, "learning_rate": 5.172413793103449e-06, "loss": 0.7722781300544739, "step": 61 }, { "epoch": 0.08045417680454177, "grad_norm": 0.6287536025047302, "learning_rate": 5.258620689655173e-06, "loss": 0.8166277408599854, "step": 62 }, { "epoch": 0.08175182481751825, "grad_norm": 0.6238293051719666, "learning_rate": 5.344827586206896e-06, "loss": 0.7863017320632935, "step": 63 }, { "epoch": 0.08304947283049473, "grad_norm": 0.6116371750831604, "learning_rate": 5.431034482758621e-06, "loss": 0.8139179944992065, "step": 64 }, { "epoch": 0.08434712084347121, "grad_norm": 0.6211651563644409, "learning_rate": 5.517241379310345e-06, "loss": 0.802246630191803, "step": 65 }, { "epoch": 0.08564476885644769, "grad_norm": 0.6179801821708679, "learning_rate": 5.603448275862069e-06, "loss": 0.8019924163818359, "step": 66 }, { "epoch": 0.08694241686942417, "grad_norm": 0.6304736733436584, "learning_rate": 5.689655172413794e-06, "loss": 0.797938346862793, "step": 67 }, { "epoch": 0.08824006488240065, "grad_norm": 0.5991215705871582, "learning_rate": 5.775862068965518e-06, "loss": 0.7311227917671204, "step": 68 }, { "epoch": 0.08953771289537713, "grad_norm": 0.6336483955383301, "learning_rate": 5.862068965517242e-06, "loss": 0.8222885131835938, "step": 69 }, { "epoch": 0.0908353609083536, "grad_norm": 0.6269424557685852, "learning_rate": 5.9482758620689665e-06, "loss": 0.7962170839309692, "step": 70 }, { "epoch": 0.09213300892133008, "grad_norm": 0.6373898983001709, "learning_rate": 6.03448275862069e-06, "loss": 0.8021715879440308, "step": 71 }, { "epoch": 0.09343065693430656, "grad_norm": 0.6345935463905334, "learning_rate": 6.1206896551724135e-06, "loss": 0.8776074647903442, "step": 72 }, { "epoch": 0.09472830494728304, "grad_norm": 0.6083796620368958, "learning_rate": 6.206896551724138e-06, "loss": 0.7513650059700012, "step": 73 }, { "epoch": 0.09602595296025954, "grad_norm": 0.6068538427352905, "learning_rate": 6.293103448275862e-06, "loss": 0.7684041857719421, "step": 74 }, { "epoch": 0.09732360097323602, "grad_norm": 0.6176103949546814, "learning_rate": 6.379310344827587e-06, "loss": 0.7645843029022217, "step": 75 }, { "epoch": 0.0986212489862125, "grad_norm": 0.6182767152786255, "learning_rate": 6.465517241379311e-06, "loss": 0.8177169561386108, "step": 76 }, { "epoch": 0.09991889699918897, "grad_norm": 0.6175945997238159, "learning_rate": 6.551724137931035e-06, "loss": 0.7822265625, "step": 77 }, { "epoch": 0.10121654501216545, "grad_norm": 0.6050496101379395, "learning_rate": 6.63793103448276e-06, "loss": 0.7576093673706055, "step": 78 }, { "epoch": 0.10251419302514193, "grad_norm": 0.7123962640762329, "learning_rate": 6.724137931034484e-06, "loss": 0.8231764435768127, "step": 79 }, { "epoch": 0.10381184103811841, "grad_norm": 0.61634361743927, "learning_rate": 6.810344827586207e-06, "loss": 0.7479314804077148, "step": 80 }, { "epoch": 0.10510948905109489, "grad_norm": 0.5944046378135681, "learning_rate": 6.896551724137932e-06, "loss": 0.7602187395095825, "step": 81 }, { "epoch": 0.10640713706407137, "grad_norm": 1.9641212224960327, "learning_rate": 6.982758620689656e-06, "loss": 0.7291417121887207, "step": 82 }, { "epoch": 0.10770478507704785, "grad_norm": 0.6604083776473999, "learning_rate": 7.0689655172413796e-06, "loss": 0.7462600469589233, "step": 83 }, { "epoch": 0.10900243309002433, "grad_norm": 0.6202764511108398, "learning_rate": 7.155172413793104e-06, "loss": 0.8041630983352661, "step": 84 }, { "epoch": 0.11030008110300081, "grad_norm": 0.6278896331787109, "learning_rate": 7.241379310344828e-06, "loss": 0.7589733600616455, "step": 85 }, { "epoch": 0.11159772911597729, "grad_norm": 0.5918757915496826, "learning_rate": 7.327586206896552e-06, "loss": 0.7733231782913208, "step": 86 }, { "epoch": 0.11289537712895377, "grad_norm": 0.6275747418403625, "learning_rate": 7.413793103448277e-06, "loss": 0.7821832299232483, "step": 87 }, { "epoch": 0.11419302514193025, "grad_norm": 0.5935595631599426, "learning_rate": 7.500000000000001e-06, "loss": 0.7410198450088501, "step": 88 }, { "epoch": 0.11549067315490673, "grad_norm": 0.6088429093360901, "learning_rate": 7.586206896551724e-06, "loss": 0.78556227684021, "step": 89 }, { "epoch": 0.11678832116788321, "grad_norm": 0.6014888286590576, "learning_rate": 7.672413793103449e-06, "loss": 0.7850443124771118, "step": 90 }, { "epoch": 0.11808596918085969, "grad_norm": 0.644192636013031, "learning_rate": 7.758620689655173e-06, "loss": 0.7852208614349365, "step": 91 }, { "epoch": 0.11938361719383617, "grad_norm": 0.6681314706802368, "learning_rate": 7.844827586206897e-06, "loss": 0.8286585211753845, "step": 92 }, { "epoch": 0.12068126520681265, "grad_norm": 0.6156536936759949, "learning_rate": 7.93103448275862e-06, "loss": 0.7740339040756226, "step": 93 }, { "epoch": 0.12197891321978913, "grad_norm": 0.5617393255233765, "learning_rate": 8.017241379310345e-06, "loss": 0.7133764028549194, "step": 94 }, { "epoch": 0.12327656123276562, "grad_norm": 0.6284353733062744, "learning_rate": 8.103448275862069e-06, "loss": 0.8572052121162415, "step": 95 }, { "epoch": 0.1245742092457421, "grad_norm": 0.6048849821090698, "learning_rate": 8.189655172413794e-06, "loss": 0.7354931831359863, "step": 96 }, { "epoch": 0.12587185725871858, "grad_norm": 0.717276930809021, "learning_rate": 8.275862068965518e-06, "loss": 0.7633223533630371, "step": 97 }, { "epoch": 0.12716950527169504, "grad_norm": 0.5850024223327637, "learning_rate": 8.362068965517242e-06, "loss": 0.7660566568374634, "step": 98 }, { "epoch": 0.12846715328467154, "grad_norm": 0.6040444374084473, "learning_rate": 8.448275862068966e-06, "loss": 0.687772274017334, "step": 99 }, { "epoch": 0.129764801297648, "grad_norm": 0.635793924331665, "learning_rate": 8.53448275862069e-06, "loss": 0.8365746736526489, "step": 100 }, { "epoch": 0.1310624493106245, "grad_norm": 0.681013822555542, "learning_rate": 8.620689655172414e-06, "loss": 0.8097229599952698, "step": 101 }, { "epoch": 0.13236009732360096, "grad_norm": 0.5976776480674744, "learning_rate": 8.706896551724138e-06, "loss": 0.7460197806358337, "step": 102 }, { "epoch": 0.13365774533657745, "grad_norm": 0.5931348204612732, "learning_rate": 8.793103448275862e-06, "loss": 0.7234000563621521, "step": 103 }, { "epoch": 0.13495539334955392, "grad_norm": 0.6787912845611572, "learning_rate": 8.879310344827588e-06, "loss": 0.8100739121437073, "step": 104 }, { "epoch": 0.1362530413625304, "grad_norm": 0.6532299518585205, "learning_rate": 8.965517241379312e-06, "loss": 0.8400453925132751, "step": 105 }, { "epoch": 0.1375506893755069, "grad_norm": 0.6569010615348816, "learning_rate": 9.051724137931036e-06, "loss": 0.8247137069702148, "step": 106 }, { "epoch": 0.13884833738848337, "grad_norm": 0.6199808716773987, "learning_rate": 9.13793103448276e-06, "loss": 0.7428423166275024, "step": 107 }, { "epoch": 0.14014598540145987, "grad_norm": 0.6075517535209656, "learning_rate": 9.224137931034484e-06, "loss": 0.7575728893280029, "step": 108 }, { "epoch": 0.14144363341443633, "grad_norm": 0.6420115232467651, "learning_rate": 9.310344827586207e-06, "loss": 0.8051052093505859, "step": 109 }, { "epoch": 0.14274128142741282, "grad_norm": 0.6138091683387756, "learning_rate": 9.396551724137931e-06, "loss": 0.8522422313690186, "step": 110 }, { "epoch": 0.1440389294403893, "grad_norm": 0.650187075138092, "learning_rate": 9.482758620689655e-06, "loss": 0.8301827907562256, "step": 111 }, { "epoch": 0.14533657745336578, "grad_norm": 0.6030973196029663, "learning_rate": 9.56896551724138e-06, "loss": 0.7207387089729309, "step": 112 }, { "epoch": 0.14663422546634225, "grad_norm": 0.622131884098053, "learning_rate": 9.655172413793105e-06, "loss": 0.7915451526641846, "step": 113 }, { "epoch": 0.14793187347931874, "grad_norm": 0.6085039377212524, "learning_rate": 9.741379310344829e-06, "loss": 0.7769342064857483, "step": 114 }, { "epoch": 0.1492295214922952, "grad_norm": 0.6578651666641235, "learning_rate": 9.827586206896553e-06, "loss": 0.7566852569580078, "step": 115 }, { "epoch": 0.1505271695052717, "grad_norm": 0.6066433787345886, "learning_rate": 9.913793103448277e-06, "loss": 0.7825925350189209, "step": 116 }, { "epoch": 0.15182481751824817, "grad_norm": 0.6409288644790649, "learning_rate": 1e-05, "loss": 0.8247882127761841, "step": 117 }, { "epoch": 0.15312246553122466, "grad_norm": 0.6675072312355042, "learning_rate": 9.99999488813276e-06, "loss": 0.8096261024475098, "step": 118 }, { "epoch": 0.15442011354420113, "grad_norm": 0.6444228887557983, "learning_rate": 9.999979552541496e-06, "loss": 0.7326732873916626, "step": 119 }, { "epoch": 0.15571776155717762, "grad_norm": 0.6155293583869934, "learning_rate": 9.99995399325756e-06, "loss": 0.7694077491760254, "step": 120 }, { "epoch": 0.15701540957015409, "grad_norm": 0.6370646953582764, "learning_rate": 9.999918210333219e-06, "loss": 0.8235340118408203, "step": 121 }, { "epoch": 0.15831305758313058, "grad_norm": 0.6056079864501953, "learning_rate": 9.999872203841635e-06, "loss": 0.7498428821563721, "step": 122 }, { "epoch": 0.15961070559610704, "grad_norm": 0.6514161825180054, "learning_rate": 9.999815973876888e-06, "loss": 0.772469162940979, "step": 123 }, { "epoch": 0.16090835360908354, "grad_norm": 0.6417706608772278, "learning_rate": 9.999749520553945e-06, "loss": 0.8074150085449219, "step": 124 }, { "epoch": 0.16220600162206, "grad_norm": 0.6162619590759277, "learning_rate": 9.99967284400869e-06, "loss": 0.7649105191230774, "step": 125 }, { "epoch": 0.1635036496350365, "grad_norm": 0.6231618523597717, "learning_rate": 9.99958594439791e-06, "loss": 0.7435484528541565, "step": 126 }, { "epoch": 0.164801297648013, "grad_norm": 0.6211341023445129, "learning_rate": 9.999488821899286e-06, "loss": 0.7700725793838501, "step": 127 }, { "epoch": 0.16609894566098946, "grad_norm": 0.6546758413314819, "learning_rate": 9.999381476711416e-06, "loss": 0.7208442091941833, "step": 128 }, { "epoch": 0.16739659367396595, "grad_norm": 0.6165010333061218, "learning_rate": 9.999263909053789e-06, "loss": 0.7380815148353577, "step": 129 }, { "epoch": 0.16869424168694241, "grad_norm": 0.7457146048545837, "learning_rate": 9.999136119166803e-06, "loss": 0.7085788249969482, "step": 130 }, { "epoch": 0.1699918896999189, "grad_norm": 0.6893863677978516, "learning_rate": 9.998998107311758e-06, "loss": 0.8248496055603027, "step": 131 }, { "epoch": 0.17128953771289537, "grad_norm": 0.6099883317947388, "learning_rate": 9.998849873770849e-06, "loss": 0.7661588191986084, "step": 132 }, { "epoch": 0.17258718572587187, "grad_norm": 0.5964142084121704, "learning_rate": 9.998691418847177e-06, "loss": 0.7037764191627502, "step": 133 }, { "epoch": 0.17388483373884833, "grad_norm": 0.6277547478675842, "learning_rate": 9.998522742864745e-06, "loss": 0.8015055060386658, "step": 134 }, { "epoch": 0.17518248175182483, "grad_norm": 0.6385223865509033, "learning_rate": 9.998343846168448e-06, "loss": 0.7598564028739929, "step": 135 }, { "epoch": 0.1764801297648013, "grad_norm": 0.6057168245315552, "learning_rate": 9.998154729124092e-06, "loss": 0.7190810441970825, "step": 136 }, { "epoch": 0.17777777777777778, "grad_norm": 0.6524573564529419, "learning_rate": 9.997955392118365e-06, "loss": 0.7655267715454102, "step": 137 }, { "epoch": 0.17907542579075425, "grad_norm": 0.593307614326477, "learning_rate": 9.997745835558867e-06, "loss": 0.6991128921508789, "step": 138 }, { "epoch": 0.18037307380373074, "grad_norm": 0.6667762994766235, "learning_rate": 9.997526059874086e-06, "loss": 0.7836197018623352, "step": 139 }, { "epoch": 0.1816707218167072, "grad_norm": 0.6364095211029053, "learning_rate": 9.997296065513405e-06, "loss": 0.7866847515106201, "step": 140 }, { "epoch": 0.1829683698296837, "grad_norm": 0.6693204641342163, "learning_rate": 9.997055852947109e-06, "loss": 0.8498630523681641, "step": 141 }, { "epoch": 0.18426601784266017, "grad_norm": 0.6703641414642334, "learning_rate": 9.996805422666367e-06, "loss": 0.7902424335479736, "step": 142 }, { "epoch": 0.18556366585563666, "grad_norm": 0.6226605772972107, "learning_rate": 9.99654477518325e-06, "loss": 0.7982854843139648, "step": 143 }, { "epoch": 0.18686131386861313, "grad_norm": 0.5963988304138184, "learning_rate": 9.996273911030714e-06, "loss": 0.7364012598991394, "step": 144 }, { "epoch": 0.18815896188158962, "grad_norm": 3.2399189472198486, "learning_rate": 9.995992830762608e-06, "loss": 0.8748813271522522, "step": 145 }, { "epoch": 0.18945660989456609, "grad_norm": 0.6035348773002625, "learning_rate": 9.99570153495367e-06, "loss": 0.7249287366867065, "step": 146 }, { "epoch": 0.19075425790754258, "grad_norm": 0.6258792877197266, "learning_rate": 9.995400024199526e-06, "loss": 0.7734540700912476, "step": 147 }, { "epoch": 0.19205190592051907, "grad_norm": 0.6568045020103455, "learning_rate": 9.99508829911669e-06, "loss": 0.8293142318725586, "step": 148 }, { "epoch": 0.19334955393349554, "grad_norm": 0.8624785542488098, "learning_rate": 9.994766360342557e-06, "loss": 0.8258950710296631, "step": 149 }, { "epoch": 0.19464720194647203, "grad_norm": 0.591865599155426, "learning_rate": 9.994434208535415e-06, "loss": 0.7743998765945435, "step": 150 }, { "epoch": 0.1959448499594485, "grad_norm": 0.6273242831230164, "learning_rate": 9.994091844374431e-06, "loss": 0.8304177522659302, "step": 151 }, { "epoch": 0.197242497972425, "grad_norm": 0.6169039011001587, "learning_rate": 9.993739268559648e-06, "loss": 0.8317509889602661, "step": 152 }, { "epoch": 0.19854014598540146, "grad_norm": 0.6500508785247803, "learning_rate": 9.993376481812001e-06, "loss": 0.8074177503585815, "step": 153 }, { "epoch": 0.19983779399837795, "grad_norm": 0.691698431968689, "learning_rate": 9.99300348487329e-06, "loss": 0.7966357469558716, "step": 154 }, { "epoch": 0.20113544201135442, "grad_norm": 0.6341277956962585, "learning_rate": 9.992620278506203e-06, "loss": 0.7922544479370117, "step": 155 }, { "epoch": 0.2024330900243309, "grad_norm": 0.5936447381973267, "learning_rate": 9.9922268634943e-06, "loss": 0.6732587218284607, "step": 156 }, { "epoch": 0.20373073803730737, "grad_norm": 0.6575024127960205, "learning_rate": 9.991823240642014e-06, "loss": 0.8733258247375488, "step": 157 }, { "epoch": 0.20502838605028387, "grad_norm": 0.6686046719551086, "learning_rate": 9.991409410774654e-06, "loss": 0.790815532207489, "step": 158 }, { "epoch": 0.20632603406326033, "grad_norm": 1.4253793954849243, "learning_rate": 9.990985374738396e-06, "loss": 0.7325870990753174, "step": 159 }, { "epoch": 0.20762368207623683, "grad_norm": 0.6524296998977661, "learning_rate": 9.990551133400284e-06, "loss": 0.7516152858734131, "step": 160 }, { "epoch": 0.2089213300892133, "grad_norm": 0.6569153666496277, "learning_rate": 9.990106687648234e-06, "loss": 0.7317984104156494, "step": 161 }, { "epoch": 0.21021897810218979, "grad_norm": 0.5729793906211853, "learning_rate": 9.989652038391025e-06, "loss": 0.7050694227218628, "step": 162 }, { "epoch": 0.21151662611516625, "grad_norm": 0.5924677848815918, "learning_rate": 9.9891871865583e-06, "loss": 0.7387759685516357, "step": 163 }, { "epoch": 0.21281427412814274, "grad_norm": 0.9845248460769653, "learning_rate": 9.988712133100563e-06, "loss": 0.8402718305587769, "step": 164 }, { "epoch": 0.2141119221411192, "grad_norm": 0.6559567451477051, "learning_rate": 9.988226878989178e-06, "loss": 0.7516730427742004, "step": 165 }, { "epoch": 0.2154095701540957, "grad_norm": 0.603742778301239, "learning_rate": 9.987731425216364e-06, "loss": 0.6687497496604919, "step": 166 }, { "epoch": 0.21670721816707217, "grad_norm": 0.6345369815826416, "learning_rate": 9.987225772795204e-06, "loss": 0.8063400387763977, "step": 167 }, { "epoch": 0.21800486618004866, "grad_norm": 0.6372174024581909, "learning_rate": 9.986709922759626e-06, "loss": 0.7703537940979004, "step": 168 }, { "epoch": 0.21930251419302516, "grad_norm": 0.607814371585846, "learning_rate": 9.986183876164412e-06, "loss": 0.6834731101989746, "step": 169 }, { "epoch": 0.22060016220600162, "grad_norm": 0.5630145072937012, "learning_rate": 9.985647634085197e-06, "loss": 0.7261765599250793, "step": 170 }, { "epoch": 0.22189781021897811, "grad_norm": 0.6719157695770264, "learning_rate": 9.985101197618456e-06, "loss": 0.7341983318328857, "step": 171 }, { "epoch": 0.22319545823195458, "grad_norm": 0.6283457279205322, "learning_rate": 9.98454456788152e-06, "loss": 0.7351614832878113, "step": 172 }, { "epoch": 0.22449310624493107, "grad_norm": 0.6344905495643616, "learning_rate": 9.983977746012547e-06, "loss": 0.7843720316886902, "step": 173 }, { "epoch": 0.22579075425790754, "grad_norm": 0.605237603187561, "learning_rate": 9.983400733170553e-06, "loss": 0.7114173769950867, "step": 174 }, { "epoch": 0.22708840227088403, "grad_norm": 0.626672089099884, "learning_rate": 9.982813530535377e-06, "loss": 0.7024215459823608, "step": 175 }, { "epoch": 0.2283860502838605, "grad_norm": 0.6185852885246277, "learning_rate": 9.982216139307705e-06, "loss": 0.8043787479400635, "step": 176 }, { "epoch": 0.229683698296837, "grad_norm": 0.5857049226760864, "learning_rate": 9.981608560709044e-06, "loss": 0.6755383014678955, "step": 177 }, { "epoch": 0.23098134630981346, "grad_norm": 0.6019972562789917, "learning_rate": 9.980990795981747e-06, "loss": 0.7932974100112915, "step": 178 }, { "epoch": 0.23227899432278995, "grad_norm": 0.6226310729980469, "learning_rate": 9.980362846388978e-06, "loss": 0.784454882144928, "step": 179 }, { "epoch": 0.23357664233576642, "grad_norm": 0.643936812877655, "learning_rate": 9.97972471321474e-06, "loss": 0.768436849117279, "step": 180 }, { "epoch": 0.2348742903487429, "grad_norm": 0.629254162311554, "learning_rate": 9.979076397763853e-06, "loss": 0.7261864542961121, "step": 181 }, { "epoch": 0.23617193836171937, "grad_norm": 0.6138353943824768, "learning_rate": 9.978417901361958e-06, "loss": 0.8290830254554749, "step": 182 }, { "epoch": 0.23746958637469587, "grad_norm": 0.6166982054710388, "learning_rate": 9.977749225355513e-06, "loss": 0.7295878529548645, "step": 183 }, { "epoch": 0.23876723438767233, "grad_norm": 0.5729910731315613, "learning_rate": 9.977070371111793e-06, "loss": 0.7391046285629272, "step": 184 }, { "epoch": 0.24006488240064883, "grad_norm": 0.6283906102180481, "learning_rate": 9.976381340018879e-06, "loss": 0.7741225957870483, "step": 185 }, { "epoch": 0.2413625304136253, "grad_norm": 0.5742847919464111, "learning_rate": 9.97568213348567e-06, "loss": 0.7565523386001587, "step": 186 }, { "epoch": 0.24266017842660179, "grad_norm": 0.5885831713676453, "learning_rate": 9.974972752941861e-06, "loss": 0.7079343199729919, "step": 187 }, { "epoch": 0.24395782643957825, "grad_norm": 0.6233158707618713, "learning_rate": 9.97425319983796e-06, "loss": 0.802773118019104, "step": 188 }, { "epoch": 0.24525547445255474, "grad_norm": 0.6107950210571289, "learning_rate": 9.97352347564527e-06, "loss": 0.7514665126800537, "step": 189 }, { "epoch": 0.24655312246553124, "grad_norm": 0.6127108335494995, "learning_rate": 9.972783581855894e-06, "loss": 0.766715943813324, "step": 190 }, { "epoch": 0.2478507704785077, "grad_norm": 0.5911589860916138, "learning_rate": 9.972033519982722e-06, "loss": 0.719687283039093, "step": 191 }, { "epoch": 0.2491484184914842, "grad_norm": 0.7104600071907043, "learning_rate": 9.971273291559447e-06, "loss": 0.7840068340301514, "step": 192 }, { "epoch": 0.25044606650446066, "grad_norm": 1.2322938442230225, "learning_rate": 9.97050289814054e-06, "loss": 0.7457755208015442, "step": 193 }, { "epoch": 0.25174371451743716, "grad_norm": 0.568343460559845, "learning_rate": 9.969722341301261e-06, "loss": 0.6806910037994385, "step": 194 }, { "epoch": 0.25304136253041365, "grad_norm": 0.6099660396575928, "learning_rate": 9.968931622637652e-06, "loss": 0.7885247468948364, "step": 195 }, { "epoch": 0.2543390105433901, "grad_norm": 0.5906837582588196, "learning_rate": 9.968130743766533e-06, "loss": 0.7320465445518494, "step": 196 }, { "epoch": 0.2556366585563666, "grad_norm": 0.5778429508209229, "learning_rate": 9.967319706325495e-06, "loss": 0.7082957029342651, "step": 197 }, { "epoch": 0.2569343065693431, "grad_norm": 0.5944257974624634, "learning_rate": 9.96649851197291e-06, "loss": 0.7171834707260132, "step": 198 }, { "epoch": 0.25823195458231957, "grad_norm": 0.8729922771453857, "learning_rate": 9.965667162387908e-06, "loss": 0.8201053142547607, "step": 199 }, { "epoch": 0.259529602595296, "grad_norm": 0.6156542897224426, "learning_rate": 9.964825659270391e-06, "loss": 0.7408115863800049, "step": 200 }, { "epoch": 0.2608272506082725, "grad_norm": 0.5976687669754028, "learning_rate": 9.963974004341019e-06, "loss": 0.7426021099090576, "step": 201 }, { "epoch": 0.262124898621249, "grad_norm": 0.6217131018638611, "learning_rate": 9.963112199341212e-06, "loss": 0.7804723978042603, "step": 202 }, { "epoch": 0.2634225466342255, "grad_norm": 0.5792650580406189, "learning_rate": 9.96224024603314e-06, "loss": 0.6894349455833435, "step": 203 }, { "epoch": 0.2647201946472019, "grad_norm": 0.6177152395248413, "learning_rate": 9.961358146199729e-06, "loss": 0.717537522315979, "step": 204 }, { "epoch": 0.2660178426601784, "grad_norm": 0.6125051975250244, "learning_rate": 9.960465901644651e-06, "loss": 0.774456799030304, "step": 205 }, { "epoch": 0.2673154906731549, "grad_norm": 0.6172115206718445, "learning_rate": 9.959563514192317e-06, "loss": 0.7355530261993408, "step": 206 }, { "epoch": 0.2686131386861314, "grad_norm": 0.6835010051727295, "learning_rate": 9.958650985687884e-06, "loss": 0.8002670407295227, "step": 207 }, { "epoch": 0.26991078669910784, "grad_norm": 0.6039808392524719, "learning_rate": 9.95772831799724e-06, "loss": 0.784502387046814, "step": 208 }, { "epoch": 0.27120843471208433, "grad_norm": 3.698056936264038, "learning_rate": 9.956795513007008e-06, "loss": 0.7473998069763184, "step": 209 }, { "epoch": 0.2725060827250608, "grad_norm": 0.6423486471176147, "learning_rate": 9.955852572624538e-06, "loss": 0.7945725917816162, "step": 210 }, { "epoch": 0.2738037307380373, "grad_norm": 0.5756685137748718, "learning_rate": 9.954899498777903e-06, "loss": 0.7909812927246094, "step": 211 }, { "epoch": 0.2751013787510138, "grad_norm": 0.5984244346618652, "learning_rate": 9.9539362934159e-06, "loss": 0.7091703414916992, "step": 212 }, { "epoch": 0.27639902676399025, "grad_norm": 0.6023333072662354, "learning_rate": 9.952962958508038e-06, "loss": 0.7251565456390381, "step": 213 }, { "epoch": 0.27769667477696675, "grad_norm": 0.6191360950469971, "learning_rate": 9.951979496044544e-06, "loss": 0.7646386027336121, "step": 214 }, { "epoch": 0.27899432278994324, "grad_norm": 0.6032703518867493, "learning_rate": 9.950985908036346e-06, "loss": 0.76767897605896, "step": 215 }, { "epoch": 0.28029197080291973, "grad_norm": 0.5847381949424744, "learning_rate": 9.94998219651508e-06, "loss": 0.7368282079696655, "step": 216 }, { "epoch": 0.28158961881589617, "grad_norm": 0.6057823896408081, "learning_rate": 9.948968363533085e-06, "loss": 0.7350323796272278, "step": 217 }, { "epoch": 0.28288726682887266, "grad_norm": 0.6186010241508484, "learning_rate": 9.947944411163391e-06, "loss": 0.7249234318733215, "step": 218 }, { "epoch": 0.28418491484184916, "grad_norm": 0.6159788370132446, "learning_rate": 9.946910341499722e-06, "loss": 0.761109471321106, "step": 219 }, { "epoch": 0.28548256285482565, "grad_norm": 0.5817273259162903, "learning_rate": 9.945866156656487e-06, "loss": 0.7725365161895752, "step": 220 }, { "epoch": 0.2867802108678021, "grad_norm": 0.655717134475708, "learning_rate": 9.944811858768782e-06, "loss": 0.7668634057044983, "step": 221 }, { "epoch": 0.2880778588807786, "grad_norm": 0.6457056403160095, "learning_rate": 9.943747449992379e-06, "loss": 0.7912311553955078, "step": 222 }, { "epoch": 0.2893755068937551, "grad_norm": 0.5742535591125488, "learning_rate": 9.942672932503722e-06, "loss": 0.7619901299476624, "step": 223 }, { "epoch": 0.29067315490673157, "grad_norm": 0.5950078964233398, "learning_rate": 9.941588308499932e-06, "loss": 0.7898773550987244, "step": 224 }, { "epoch": 0.291970802919708, "grad_norm": 0.6142423152923584, "learning_rate": 9.940493580198787e-06, "loss": 0.7200186252593994, "step": 225 }, { "epoch": 0.2932684509326845, "grad_norm": 0.6070595979690552, "learning_rate": 9.93938874983873e-06, "loss": 0.6990747451782227, "step": 226 }, { "epoch": 0.294566098945661, "grad_norm": 0.6014435887336731, "learning_rate": 9.93827381967886e-06, "loss": 0.7597475647926331, "step": 227 }, { "epoch": 0.2958637469586375, "grad_norm": 0.5983416438102722, "learning_rate": 9.937148791998926e-06, "loss": 0.738788366317749, "step": 228 }, { "epoch": 0.2971613949716139, "grad_norm": 2.7879600524902344, "learning_rate": 9.936013669099326e-06, "loss": 0.7541340589523315, "step": 229 }, { "epoch": 0.2984590429845904, "grad_norm": 0.6435497403144836, "learning_rate": 9.9348684533011e-06, "loss": 0.8065454959869385, "step": 230 }, { "epoch": 0.2984590429845904, "eval_loss": 0.7250053882598877, "eval_runtime": 73.3232, "eval_samples_per_second": 70.81, "eval_steps_per_second": 8.851, "step": 230 }, { "epoch": 0.2997566909975669, "grad_norm": 2.4210150241851807, "learning_rate": 9.93371314694592e-06, "loss": 0.7646887302398682, "step": 231 }, { "epoch": 0.3010543390105434, "grad_norm": 0.601508617401123, "learning_rate": 9.9325477523961e-06, "loss": 0.7489044070243835, "step": 232 }, { "epoch": 0.3023519870235199, "grad_norm": 0.5808404684066772, "learning_rate": 9.931372272034573e-06, "loss": 0.7624624371528625, "step": 233 }, { "epoch": 0.30364963503649633, "grad_norm": 1.0590876340866089, "learning_rate": 9.930186708264902e-06, "loss": 0.7188542485237122, "step": 234 }, { "epoch": 0.30494728304947283, "grad_norm": 0.6582311391830444, "learning_rate": 9.928991063511264e-06, "loss": 0.7417193055152893, "step": 235 }, { "epoch": 0.3062449310624493, "grad_norm": 0.5886158347129822, "learning_rate": 9.927785340218448e-06, "loss": 0.7227447032928467, "step": 236 }, { "epoch": 0.3075425790754258, "grad_norm": 0.8434078693389893, "learning_rate": 9.926569540851856e-06, "loss": 0.8079698085784912, "step": 237 }, { "epoch": 0.30884022708840225, "grad_norm": 0.7032890915870667, "learning_rate": 9.925343667897487e-06, "loss": 0.730448842048645, "step": 238 }, { "epoch": 0.31013787510137875, "grad_norm": 0.5958182215690613, "learning_rate": 9.924107723861944e-06, "loss": 0.7622323036193848, "step": 239 }, { "epoch": 0.31143552311435524, "grad_norm": 0.7387073040008545, "learning_rate": 9.922861711272417e-06, "loss": 0.8103834390640259, "step": 240 }, { "epoch": 0.31273317112733173, "grad_norm": 0.589846134185791, "learning_rate": 9.921605632676688e-06, "loss": 0.7218436002731323, "step": 241 }, { "epoch": 0.31403081914030817, "grad_norm": 1.18753182888031, "learning_rate": 9.920339490643119e-06, "loss": 0.6769864559173584, "step": 242 }, { "epoch": 0.31532846715328466, "grad_norm": 0.6063650250434875, "learning_rate": 9.91906328776065e-06, "loss": 0.6872894763946533, "step": 243 }, { "epoch": 0.31662611516626116, "grad_norm": 0.6060184240341187, "learning_rate": 9.917777026638794e-06, "loss": 0.7477156519889832, "step": 244 }, { "epoch": 0.31792376317923765, "grad_norm": 0.5981388092041016, "learning_rate": 9.916480709907626e-06, "loss": 0.6859747767448425, "step": 245 }, { "epoch": 0.3192214111922141, "grad_norm": 0.5809654593467712, "learning_rate": 9.91517434021779e-06, "loss": 0.7025295495986938, "step": 246 }, { "epoch": 0.3205190592051906, "grad_norm": 0.6036680340766907, "learning_rate": 9.913857920240481e-06, "loss": 0.8275207877159119, "step": 247 }, { "epoch": 0.3218167072181671, "grad_norm": 0.5851848125457764, "learning_rate": 9.912531452667441e-06, "loss": 0.7031136155128479, "step": 248 }, { "epoch": 0.32311435523114357, "grad_norm": 0.5534024238586426, "learning_rate": 9.911194940210964e-06, "loss": 0.7281129956245422, "step": 249 }, { "epoch": 0.32441200324412, "grad_norm": 0.6152268052101135, "learning_rate": 9.909848385603878e-06, "loss": 0.7846366167068481, "step": 250 }, { "epoch": 0.3257096512570965, "grad_norm": 0.5951406359672546, "learning_rate": 9.908491791599546e-06, "loss": 0.7278503179550171, "step": 251 }, { "epoch": 0.327007299270073, "grad_norm": 0.6011956334114075, "learning_rate": 9.90712516097186e-06, "loss": 0.7939674854278564, "step": 252 }, { "epoch": 0.3283049472830495, "grad_norm": 0.6651070713996887, "learning_rate": 9.905748496515235e-06, "loss": 0.772196888923645, "step": 253 }, { "epoch": 0.329602595296026, "grad_norm": 0.617461085319519, "learning_rate": 9.904361801044599e-06, "loss": 0.7933390140533447, "step": 254 }, { "epoch": 0.3309002433090024, "grad_norm": 0.5844789147377014, "learning_rate": 9.902965077395395e-06, "loss": 0.7286657691001892, "step": 255 }, { "epoch": 0.3321978913219789, "grad_norm": 0.6185967326164246, "learning_rate": 9.901558328423568e-06, "loss": 0.8058604001998901, "step": 256 }, { "epoch": 0.3334955393349554, "grad_norm": 0.6511676907539368, "learning_rate": 9.900141557005567e-06, "loss": 0.7281938195228577, "step": 257 }, { "epoch": 0.3347931873479319, "grad_norm": 0.6114381551742554, "learning_rate": 9.898714766038326e-06, "loss": 0.7546758651733398, "step": 258 }, { "epoch": 0.33609083536090834, "grad_norm": 0.5931724905967712, "learning_rate": 9.897277958439274e-06, "loss": 0.811058759689331, "step": 259 }, { "epoch": 0.33738848337388483, "grad_norm": 0.5811541080474854, "learning_rate": 9.895831137146319e-06, "loss": 0.764075517654419, "step": 260 }, { "epoch": 0.3386861313868613, "grad_norm": 0.5857120156288147, "learning_rate": 9.894374305117844e-06, "loss": 0.730948805809021, "step": 261 }, { "epoch": 0.3399837793998378, "grad_norm": 0.5755126476287842, "learning_rate": 9.892907465332702e-06, "loss": 0.7732649445533752, "step": 262 }, { "epoch": 0.34128142741281425, "grad_norm": 0.5852351784706116, "learning_rate": 9.891430620790208e-06, "loss": 0.6883482933044434, "step": 263 }, { "epoch": 0.34257907542579075, "grad_norm": 0.5931571125984192, "learning_rate": 9.889943774510136e-06, "loss": 0.7685630321502686, "step": 264 }, { "epoch": 0.34387672343876724, "grad_norm": 0.7222980260848999, "learning_rate": 9.888446929532712e-06, "loss": 0.7235557436943054, "step": 265 }, { "epoch": 0.34517437145174373, "grad_norm": 0.6728655695915222, "learning_rate": 9.886940088918601e-06, "loss": 0.7901487350463867, "step": 266 }, { "epoch": 0.34647201946472017, "grad_norm": 0.5990903973579407, "learning_rate": 9.885423255748916e-06, "loss": 0.7315446138381958, "step": 267 }, { "epoch": 0.34776966747769666, "grad_norm": 0.6058611869812012, "learning_rate": 9.883896433125193e-06, "loss": 0.748113751411438, "step": 268 }, { "epoch": 0.34906731549067316, "grad_norm": 0.6079699397087097, "learning_rate": 9.8823596241694e-06, "loss": 0.7346718907356262, "step": 269 }, { "epoch": 0.35036496350364965, "grad_norm": 0.5837222337722778, "learning_rate": 9.88081283202392e-06, "loss": 0.6944899559020996, "step": 270 }, { "epoch": 0.3516626115166261, "grad_norm": 0.5878487229347229, "learning_rate": 9.879256059851553e-06, "loss": 0.766356885433197, "step": 271 }, { "epoch": 0.3529602595296026, "grad_norm": 0.605903685092926, "learning_rate": 9.877689310835503e-06, "loss": 0.7980437278747559, "step": 272 }, { "epoch": 0.3542579075425791, "grad_norm": 0.5946698784828186, "learning_rate": 9.876112588179378e-06, "loss": 0.7276085019111633, "step": 273 }, { "epoch": 0.35555555555555557, "grad_norm": 0.5997035503387451, "learning_rate": 9.874525895107175e-06, "loss": 0.7429395318031311, "step": 274 }, { "epoch": 0.35685320356853206, "grad_norm": 0.5639536380767822, "learning_rate": 9.872929234863277e-06, "loss": 0.7452772855758667, "step": 275 }, { "epoch": 0.3581508515815085, "grad_norm": 0.5665518641471863, "learning_rate": 9.871322610712452e-06, "loss": 0.6850217580795288, "step": 276 }, { "epoch": 0.359448499594485, "grad_norm": 0.5540530681610107, "learning_rate": 9.869706025939843e-06, "loss": 0.6755887269973755, "step": 277 }, { "epoch": 0.3607461476074615, "grad_norm": 0.5980620384216309, "learning_rate": 9.868079483850955e-06, "loss": 0.7464824914932251, "step": 278 }, { "epoch": 0.362043795620438, "grad_norm": 0.619748055934906, "learning_rate": 9.86644298777165e-06, "loss": 0.778630793094635, "step": 279 }, { "epoch": 0.3633414436334144, "grad_norm": 0.5898886919021606, "learning_rate": 9.864796541048155e-06, "loss": 0.7965477705001831, "step": 280 }, { "epoch": 0.3646390916463909, "grad_norm": 0.5768588185310364, "learning_rate": 9.863140147047034e-06, "loss": 0.7540180087089539, "step": 281 }, { "epoch": 0.3659367396593674, "grad_norm": 0.6073225140571594, "learning_rate": 9.861473809155192e-06, "loss": 0.7069481015205383, "step": 282 }, { "epoch": 0.3672343876723439, "grad_norm": 0.853999137878418, "learning_rate": 9.859797530779871e-06, "loss": 0.6730421185493469, "step": 283 }, { "epoch": 0.36853203568532034, "grad_norm": 0.5999425649642944, "learning_rate": 9.858111315348633e-06, "loss": 0.7877826690673828, "step": 284 }, { "epoch": 0.36982968369829683, "grad_norm": 0.9857465624809265, "learning_rate": 9.856415166309365e-06, "loss": 0.7664862871170044, "step": 285 }, { "epoch": 0.3711273317112733, "grad_norm": 0.6046482920646667, "learning_rate": 9.854709087130261e-06, "loss": 0.7595510482788086, "step": 286 }, { "epoch": 0.3724249797242498, "grad_norm": 0.6335992217063904, "learning_rate": 9.852993081299821e-06, "loss": 0.7546533346176147, "step": 287 }, { "epoch": 0.37372262773722625, "grad_norm": 0.6080864667892456, "learning_rate": 9.851267152326842e-06, "loss": 0.7263352870941162, "step": 288 }, { "epoch": 0.37502027575020275, "grad_norm": 0.6323843598365784, "learning_rate": 9.849531303740414e-06, "loss": 0.7602711915969849, "step": 289 }, { "epoch": 0.37631792376317924, "grad_norm": 0.6081179976463318, "learning_rate": 9.847785539089904e-06, "loss": 0.740424633026123, "step": 290 }, { "epoch": 0.37761557177615573, "grad_norm": 0.6082411408424377, "learning_rate": 9.846029861944964e-06, "loss": 0.7497418522834778, "step": 291 }, { "epoch": 0.37891321978913217, "grad_norm": 2.8806638717651367, "learning_rate": 9.844264275895505e-06, "loss": 0.7668443918228149, "step": 292 }, { "epoch": 0.38021086780210867, "grad_norm": 0.6383978128433228, "learning_rate": 9.842488784551707e-06, "loss": 0.7615733742713928, "step": 293 }, { "epoch": 0.38150851581508516, "grad_norm": 0.589131772518158, "learning_rate": 9.840703391543999e-06, "loss": 0.6759642362594604, "step": 294 }, { "epoch": 0.38280616382806165, "grad_norm": 0.5658035278320312, "learning_rate": 9.838908100523056e-06, "loss": 0.6837214231491089, "step": 295 }, { "epoch": 0.38410381184103815, "grad_norm": 0.7991520166397095, "learning_rate": 9.837102915159797e-06, "loss": 0.6950873732566833, "step": 296 }, { "epoch": 0.3854014598540146, "grad_norm": 0.6660937666893005, "learning_rate": 9.835287839145366e-06, "loss": 0.7929595708847046, "step": 297 }, { "epoch": 0.3866991078669911, "grad_norm": 0.5755690336227417, "learning_rate": 9.833462876191138e-06, "loss": 0.7429145574569702, "step": 298 }, { "epoch": 0.38799675587996757, "grad_norm": 0.5845285654067993, "learning_rate": 9.831628030028698e-06, "loss": 0.673062801361084, "step": 299 }, { "epoch": 0.38929440389294406, "grad_norm": 0.6984291672706604, "learning_rate": 9.829783304409838e-06, "loss": 0.7271926403045654, "step": 300 }, { "epoch": 0.3905920519059205, "grad_norm": 0.6314187049865723, "learning_rate": 9.827928703106562e-06, "loss": 0.7842410206794739, "step": 301 }, { "epoch": 0.391889699918897, "grad_norm": 0.5774804353713989, "learning_rate": 9.826064229911056e-06, "loss": 0.7108284831047058, "step": 302 }, { "epoch": 0.3931873479318735, "grad_norm": 0.5863385200500488, "learning_rate": 9.824189888635699e-06, "loss": 0.6845728158950806, "step": 303 }, { "epoch": 0.39448499594485, "grad_norm": 0.6258076429367065, "learning_rate": 9.82230568311304e-06, "loss": 0.7528674602508545, "step": 304 }, { "epoch": 0.3957826439578264, "grad_norm": 0.5792856216430664, "learning_rate": 9.820411617195807e-06, "loss": 0.6762325763702393, "step": 305 }, { "epoch": 0.3970802919708029, "grad_norm": 0.6361887454986572, "learning_rate": 9.818507694756883e-06, "loss": 0.7917072176933289, "step": 306 }, { "epoch": 0.3983779399837794, "grad_norm": 0.5518248677253723, "learning_rate": 9.816593919689305e-06, "loss": 0.6964313387870789, "step": 307 }, { "epoch": 0.3996755879967559, "grad_norm": 0.5932815670967102, "learning_rate": 9.814670295906265e-06, "loss": 0.7426280975341797, "step": 308 }, { "epoch": 0.40097323600973234, "grad_norm": 0.6102697253227234, "learning_rate": 9.81273682734108e-06, "loss": 0.7797576189041138, "step": 309 }, { "epoch": 0.40227088402270883, "grad_norm": 0.5859159827232361, "learning_rate": 9.81079351794721e-06, "loss": 0.6963766813278198, "step": 310 }, { "epoch": 0.4035685320356853, "grad_norm": 0.6081574559211731, "learning_rate": 9.808840371698226e-06, "loss": 0.7762277722358704, "step": 311 }, { "epoch": 0.4048661800486618, "grad_norm": 0.5929109454154968, "learning_rate": 9.80687739258782e-06, "loss": 0.6928838491439819, "step": 312 }, { "epoch": 0.40616382806163825, "grad_norm": 0.6156943440437317, "learning_rate": 9.804904584629786e-06, "loss": 0.7755375504493713, "step": 313 }, { "epoch": 0.40746147607461475, "grad_norm": 0.6252034306526184, "learning_rate": 9.80292195185802e-06, "loss": 0.7410427927970886, "step": 314 }, { "epoch": 0.40875912408759124, "grad_norm": 0.5801575183868408, "learning_rate": 9.800929498326502e-06, "loss": 0.7257661819458008, "step": 315 }, { "epoch": 0.41005677210056773, "grad_norm": 0.6071752309799194, "learning_rate": 9.798927228109294e-06, "loss": 0.72821044921875, "step": 316 }, { "epoch": 0.41135442011354423, "grad_norm": 0.6007112264633179, "learning_rate": 9.796915145300534e-06, "loss": 0.7845569849014282, "step": 317 }, { "epoch": 0.41265206812652067, "grad_norm": 0.5841884016990662, "learning_rate": 9.794893254014421e-06, "loss": 0.7238840460777283, "step": 318 }, { "epoch": 0.41394971613949716, "grad_norm": 0.7773919701576233, "learning_rate": 9.792861558385212e-06, "loss": 0.7452490329742432, "step": 319 }, { "epoch": 0.41524736415247365, "grad_norm": 0.6115602254867554, "learning_rate": 9.790820062567208e-06, "loss": 0.769629716873169, "step": 320 }, { "epoch": 0.41654501216545015, "grad_norm": 0.597138524055481, "learning_rate": 9.788768770734753e-06, "loss": 0.7215956449508667, "step": 321 }, { "epoch": 0.4178426601784266, "grad_norm": 0.5886080265045166, "learning_rate": 9.78670768708222e-06, "loss": 0.6885201930999756, "step": 322 }, { "epoch": 0.4191403081914031, "grad_norm": 0.6041279435157776, "learning_rate": 9.784636815824003e-06, "loss": 0.748660147190094, "step": 323 }, { "epoch": 0.42043795620437957, "grad_norm": 0.6275052428245544, "learning_rate": 9.782556161194508e-06, "loss": 0.7351919412612915, "step": 324 }, { "epoch": 0.42173560421735606, "grad_norm": 0.6083272695541382, "learning_rate": 9.78046572744815e-06, "loss": 0.7183579206466675, "step": 325 }, { "epoch": 0.4230332522303325, "grad_norm": 0.5836600065231323, "learning_rate": 9.778365518859334e-06, "loss": 0.6470940113067627, "step": 326 }, { "epoch": 0.424330900243309, "grad_norm": 0.611179769039154, "learning_rate": 9.776255539722457e-06, "loss": 0.7807853817939758, "step": 327 }, { "epoch": 0.4256285482562855, "grad_norm": 0.5962700843811035, "learning_rate": 9.774135794351892e-06, "loss": 0.7775930166244507, "step": 328 }, { "epoch": 0.426926196269262, "grad_norm": 0.5820413827896118, "learning_rate": 9.77200628708198e-06, "loss": 0.6654623746871948, "step": 329 }, { "epoch": 0.4282238442822384, "grad_norm": 0.5713212490081787, "learning_rate": 9.769867022267028e-06, "loss": 0.7844803333282471, "step": 330 }, { "epoch": 0.4295214922952149, "grad_norm": 0.6236836314201355, "learning_rate": 9.767718004281288e-06, "loss": 0.7271528244018555, "step": 331 }, { "epoch": 0.4308191403081914, "grad_norm": 0.5810200572013855, "learning_rate": 9.765559237518958e-06, "loss": 0.6717958450317383, "step": 332 }, { "epoch": 0.4321167883211679, "grad_norm": 0.5980990529060364, "learning_rate": 9.763390726394171e-06, "loss": 0.7378814220428467, "step": 333 }, { "epoch": 0.43341443633414434, "grad_norm": 0.620817244052887, "learning_rate": 9.761212475340982e-06, "loss": 0.7411800026893616, "step": 334 }, { "epoch": 0.43471208434712083, "grad_norm": 0.5831018686294556, "learning_rate": 9.759024488813364e-06, "loss": 0.6943602561950684, "step": 335 }, { "epoch": 0.4360097323600973, "grad_norm": 0.6330239176750183, "learning_rate": 9.756826771285195e-06, "loss": 0.6916518211364746, "step": 336 }, { "epoch": 0.4373073803730738, "grad_norm": 0.5482841730117798, "learning_rate": 9.754619327250253e-06, "loss": 0.6894945502281189, "step": 337 }, { "epoch": 0.4386050283860503, "grad_norm": 0.5814421772956848, "learning_rate": 9.7524021612222e-06, "loss": 0.7126766443252563, "step": 338 }, { "epoch": 0.43990267639902675, "grad_norm": 0.6360822916030884, "learning_rate": 9.750175277734582e-06, "loss": 0.7301243543624878, "step": 339 }, { "epoch": 0.44120032441200324, "grad_norm": 0.5673643946647644, "learning_rate": 9.747938681340807e-06, "loss": 0.632249116897583, "step": 340 }, { "epoch": 0.44249797242497974, "grad_norm": 0.59381103515625, "learning_rate": 9.745692376614154e-06, "loss": 0.7363812923431396, "step": 341 }, { "epoch": 0.44379562043795623, "grad_norm": 0.5689446926116943, "learning_rate": 9.743436368147745e-06, "loss": 0.6463121175765991, "step": 342 }, { "epoch": 0.44509326845093267, "grad_norm": 0.5716972351074219, "learning_rate": 9.741170660554548e-06, "loss": 0.726833701133728, "step": 343 }, { "epoch": 0.44639091646390916, "grad_norm": 0.6090091466903687, "learning_rate": 9.73889525846736e-06, "loss": 0.7105214595794678, "step": 344 }, { "epoch": 0.44768856447688565, "grad_norm": 0.6220769286155701, "learning_rate": 9.736610166538802e-06, "loss": 0.7986119389533997, "step": 345 }, { "epoch": 0.44898621248986215, "grad_norm": 0.6415942311286926, "learning_rate": 9.73431538944131e-06, "loss": 0.8365704417228699, "step": 346 }, { "epoch": 0.4502838605028386, "grad_norm": 0.6018549203872681, "learning_rate": 9.73201093186712e-06, "loss": 0.754788875579834, "step": 347 }, { "epoch": 0.4515815085158151, "grad_norm": 0.6342391967773438, "learning_rate": 9.729696798528268e-06, "loss": 0.6986638307571411, "step": 348 }, { "epoch": 0.45287915652879157, "grad_norm": 0.6728231906890869, "learning_rate": 9.727372994156568e-06, "loss": 0.7003589272499084, "step": 349 }, { "epoch": 0.45417680454176806, "grad_norm": 0.5958974957466125, "learning_rate": 9.725039523503615e-06, "loss": 0.7366368770599365, "step": 350 }, { "epoch": 0.4554744525547445, "grad_norm": 0.5878227353096008, "learning_rate": 9.722696391340762e-06, "loss": 0.6686346530914307, "step": 351 }, { "epoch": 0.456772100567721, "grad_norm": 0.5995833277702332, "learning_rate": 9.720343602459123e-06, "loss": 0.720341682434082, "step": 352 }, { "epoch": 0.4580697485806975, "grad_norm": 0.5677472352981567, "learning_rate": 9.717981161669556e-06, "loss": 0.7040742039680481, "step": 353 }, { "epoch": 0.459367396593674, "grad_norm": 0.5821993350982666, "learning_rate": 9.715609073802653e-06, "loss": 0.7871376276016235, "step": 354 }, { "epoch": 0.4606650446066504, "grad_norm": 0.6043302416801453, "learning_rate": 9.713227343708737e-06, "loss": 0.6964189410209656, "step": 355 }, { "epoch": 0.4619626926196269, "grad_norm": 0.5885515213012695, "learning_rate": 9.71083597625784e-06, "loss": 0.6288225054740906, "step": 356 }, { "epoch": 0.4632603406326034, "grad_norm": 0.5931031703948975, "learning_rate": 9.708434976339704e-06, "loss": 0.7654111981391907, "step": 357 }, { "epoch": 0.4645579886455799, "grad_norm": 0.5929883122444153, "learning_rate": 9.706024348863766e-06, "loss": 0.7472108602523804, "step": 358 }, { "epoch": 0.4658556366585564, "grad_norm": 0.6003252267837524, "learning_rate": 9.703604098759148e-06, "loss": 0.7266678810119629, "step": 359 }, { "epoch": 0.46715328467153283, "grad_norm": 0.6148797869682312, "learning_rate": 9.70117423097465e-06, "loss": 0.6877753734588623, "step": 360 }, { "epoch": 0.4684509326845093, "grad_norm": 0.632279634475708, "learning_rate": 9.698734750478739e-06, "loss": 0.7512223720550537, "step": 361 }, { "epoch": 0.4697485806974858, "grad_norm": 0.5888375639915466, "learning_rate": 9.69628566225953e-06, "loss": 0.7822796702384949, "step": 362 }, { "epoch": 0.4710462287104623, "grad_norm": 0.6794424057006836, "learning_rate": 9.693826971324793e-06, "loss": 0.7204307317733765, "step": 363 }, { "epoch": 0.47234387672343875, "grad_norm": 0.5850203633308411, "learning_rate": 9.691358682701927e-06, "loss": 0.7395058870315552, "step": 364 }, { "epoch": 0.47364152473641524, "grad_norm": 0.947333574295044, "learning_rate": 9.688880801437957e-06, "loss": 0.7230464220046997, "step": 365 }, { "epoch": 0.47493917274939174, "grad_norm": 0.6044790744781494, "learning_rate": 9.686393332599525e-06, "loss": 0.7762792110443115, "step": 366 }, { "epoch": 0.47623682076236823, "grad_norm": 0.558193027973175, "learning_rate": 9.683896281272872e-06, "loss": 0.7202603816986084, "step": 367 }, { "epoch": 0.47753446877534467, "grad_norm": 0.6356004476547241, "learning_rate": 9.681389652563837e-06, "loss": 0.6806402206420898, "step": 368 }, { "epoch": 0.47883211678832116, "grad_norm": 0.5731885433197021, "learning_rate": 9.678873451597843e-06, "loss": 0.7234804630279541, "step": 369 }, { "epoch": 0.48012976480129765, "grad_norm": 0.6563818454742432, "learning_rate": 9.676347683519882e-06, "loss": 0.7021783590316772, "step": 370 }, { "epoch": 0.48142741281427415, "grad_norm": 0.632475733757019, "learning_rate": 9.673812353494513e-06, "loss": 0.7313486337661743, "step": 371 }, { "epoch": 0.4827250608272506, "grad_norm": 0.6746646761894226, "learning_rate": 9.671267466705841e-06, "loss": 0.7820821404457092, "step": 372 }, { "epoch": 0.4840227088402271, "grad_norm": 0.558120608329773, "learning_rate": 9.668713028357518e-06, "loss": 0.7215161323547363, "step": 373 }, { "epoch": 0.48532035685320357, "grad_norm": 0.5888929963111877, "learning_rate": 9.666149043672724e-06, "loss": 0.7091335654258728, "step": 374 }, { "epoch": 0.48661800486618007, "grad_norm": 7.202490329742432, "learning_rate": 9.663575517894155e-06, "loss": 0.7597553133964539, "step": 375 }, { "epoch": 0.4879156528791565, "grad_norm": 0.6477593183517456, "learning_rate": 9.660992456284024e-06, "loss": 0.6395682692527771, "step": 376 }, { "epoch": 0.489213300892133, "grad_norm": 0.6040880680084229, "learning_rate": 9.658399864124037e-06, "loss": 0.7132856249809265, "step": 377 }, { "epoch": 0.4905109489051095, "grad_norm": 0.6065711379051208, "learning_rate": 9.655797746715388e-06, "loss": 0.7926105260848999, "step": 378 }, { "epoch": 0.491808596918086, "grad_norm": 0.6568942666053772, "learning_rate": 9.65318610937875e-06, "loss": 0.7595465183258057, "step": 379 }, { "epoch": 0.4931062449310625, "grad_norm": 0.5950395464897156, "learning_rate": 9.650564957454258e-06, "loss": 0.7643356919288635, "step": 380 }, { "epoch": 0.4944038929440389, "grad_norm": 0.608245313167572, "learning_rate": 9.647934296301506e-06, "loss": 0.8734641075134277, "step": 381 }, { "epoch": 0.4957015409570154, "grad_norm": 0.6461122632026672, "learning_rate": 9.64529413129953e-06, "loss": 0.7460113167762756, "step": 382 }, { "epoch": 0.4969991889699919, "grad_norm": 0.5779212117195129, "learning_rate": 9.642644467846799e-06, "loss": 0.707379937171936, "step": 383 }, { "epoch": 0.4982968369829684, "grad_norm": 0.5882854461669922, "learning_rate": 9.639985311361202e-06, "loss": 0.74379563331604, "step": 384 }, { "epoch": 0.49959448499594483, "grad_norm": 0.6086680293083191, "learning_rate": 9.637316667280046e-06, "loss": 0.7925621271133423, "step": 385 }, { "epoch": 0.5008921330089213, "grad_norm": 0.5651184916496277, "learning_rate": 9.634638541060027e-06, "loss": 0.7554738521575928, "step": 386 }, { "epoch": 0.5021897810218978, "grad_norm": 0.5808055400848389, "learning_rate": 9.63195093817724e-06, "loss": 0.7644078731536865, "step": 387 }, { "epoch": 0.5034874290348743, "grad_norm": 0.6111287474632263, "learning_rate": 9.62925386412715e-06, "loss": 0.7364607453346252, "step": 388 }, { "epoch": 0.5047850770478508, "grad_norm": 0.6057661175727844, "learning_rate": 9.626547324424592e-06, "loss": 0.7212823629379272, "step": 389 }, { "epoch": 0.5060827250608273, "grad_norm": 0.6477599740028381, "learning_rate": 9.623831324603755e-06, "loss": 0.813086748123169, "step": 390 }, { "epoch": 0.5073803730738037, "grad_norm": 0.5950746536254883, "learning_rate": 9.621105870218167e-06, "loss": 0.7306693196296692, "step": 391 }, { "epoch": 0.5086780210867802, "grad_norm": 0.6298786401748657, "learning_rate": 9.618370966840698e-06, "loss": 0.7335579991340637, "step": 392 }, { "epoch": 0.5099756690997567, "grad_norm": 0.5998733639717102, "learning_rate": 9.615626620063531e-06, "loss": 0.6837765574455261, "step": 393 }, { "epoch": 0.5112733171127332, "grad_norm": 0.6094253659248352, "learning_rate": 9.61287283549816e-06, "loss": 0.7273898720741272, "step": 394 }, { "epoch": 0.5125709651257097, "grad_norm": 0.5919696092605591, "learning_rate": 9.610109618775379e-06, "loss": 0.7142295241355896, "step": 395 }, { "epoch": 0.5138686131386861, "grad_norm": 0.5768521428108215, "learning_rate": 9.607336975545264e-06, "loss": 0.6993876695632935, "step": 396 }, { "epoch": 0.5151662611516626, "grad_norm": 0.6359198689460754, "learning_rate": 9.604554911477173e-06, "loss": 0.751734733581543, "step": 397 }, { "epoch": 0.5164639091646391, "grad_norm": 0.612307071685791, "learning_rate": 9.601763432259716e-06, "loss": 0.7581944465637207, "step": 398 }, { "epoch": 0.5177615571776155, "grad_norm": 0.5969548225402832, "learning_rate": 9.59896254360077e-06, "loss": 0.7034813165664673, "step": 399 }, { "epoch": 0.519059205190592, "grad_norm": 0.5891065001487732, "learning_rate": 9.596152251227438e-06, "loss": 0.7002313137054443, "step": 400 }, { "epoch": 0.5203568532035685, "grad_norm": 0.5791100263595581, "learning_rate": 9.593332560886055e-06, "loss": 0.7138193845748901, "step": 401 }, { "epoch": 0.521654501216545, "grad_norm": 0.7952408790588379, "learning_rate": 9.59050347834218e-06, "loss": 0.6865421533584595, "step": 402 }, { "epoch": 0.5229521492295215, "grad_norm": 0.6096974015235901, "learning_rate": 9.587665009380565e-06, "loss": 0.7312819957733154, "step": 403 }, { "epoch": 0.524249797242498, "grad_norm": 0.6021596789360046, "learning_rate": 9.584817159805164e-06, "loss": 0.7670427560806274, "step": 404 }, { "epoch": 0.5255474452554745, "grad_norm": 0.6113924980163574, "learning_rate": 9.58195993543911e-06, "loss": 0.7259009480476379, "step": 405 }, { "epoch": 0.526845093268451, "grad_norm": 0.6386753916740417, "learning_rate": 9.579093342124699e-06, "loss": 0.7742621898651123, "step": 406 }, { "epoch": 0.5281427412814275, "grad_norm": 0.5846640467643738, "learning_rate": 9.576217385723391e-06, "loss": 0.6874604225158691, "step": 407 }, { "epoch": 0.5294403892944038, "grad_norm": 0.5714486241340637, "learning_rate": 9.57333207211579e-06, "loss": 0.6830397844314575, "step": 408 }, { "epoch": 0.5307380373073803, "grad_norm": 0.5846112370491028, "learning_rate": 9.57043740720163e-06, "loss": 0.7333765029907227, "step": 409 }, { "epoch": 0.5320356853203568, "grad_norm": 0.6309279799461365, "learning_rate": 9.567533396899769e-06, "loss": 0.698890209197998, "step": 410 }, { "epoch": 0.5333333333333333, "grad_norm": 0.5987696647644043, "learning_rate": 9.564620047148174e-06, "loss": 0.7424242496490479, "step": 411 }, { "epoch": 0.5346309813463098, "grad_norm": 0.5915178656578064, "learning_rate": 9.561697363903908e-06, "loss": 0.7625330090522766, "step": 412 }, { "epoch": 0.5359286293592863, "grad_norm": 0.6682938933372498, "learning_rate": 9.558765353143116e-06, "loss": 0.7808880805969238, "step": 413 }, { "epoch": 0.5372262773722628, "grad_norm": 0.5921300649642944, "learning_rate": 9.555824020861022e-06, "loss": 0.7293972969055176, "step": 414 }, { "epoch": 0.5385239253852393, "grad_norm": 0.6055417060852051, "learning_rate": 9.5528733730719e-06, "loss": 0.7130710482597351, "step": 415 }, { "epoch": 0.5398215733982157, "grad_norm": 1.2821067571640015, "learning_rate": 9.549913415809084e-06, "loss": 0.6902526617050171, "step": 416 }, { "epoch": 0.5411192214111922, "grad_norm": 0.5723661184310913, "learning_rate": 9.546944155124935e-06, "loss": 0.7237967252731323, "step": 417 }, { "epoch": 0.5424168694241687, "grad_norm": 0.5984989404678345, "learning_rate": 9.54396559709084e-06, "loss": 0.7385105490684509, "step": 418 }, { "epoch": 0.5437145174371452, "grad_norm": 0.6114164590835571, "learning_rate": 9.540977747797194e-06, "loss": 0.6872152090072632, "step": 419 }, { "epoch": 0.5450121654501217, "grad_norm": 0.585870087146759, "learning_rate": 9.537980613353392e-06, "loss": 0.7558926343917847, "step": 420 }, { "epoch": 0.5463098134630981, "grad_norm": 0.5969951748847961, "learning_rate": 9.53497419988782e-06, "loss": 0.7628536224365234, "step": 421 }, { "epoch": 0.5476074614760746, "grad_norm": 0.6526360511779785, "learning_rate": 9.531958513547832e-06, "loss": 0.7417917251586914, "step": 422 }, { "epoch": 0.5489051094890511, "grad_norm": 0.6217682361602783, "learning_rate": 9.52893356049974e-06, "loss": 0.7846866846084595, "step": 423 }, { "epoch": 0.5502027575020276, "grad_norm": 0.6098693013191223, "learning_rate": 9.525899346928809e-06, "loss": 0.7403139472007751, "step": 424 }, { "epoch": 0.551500405515004, "grad_norm": 0.6113680005073547, "learning_rate": 9.52285587903924e-06, "loss": 0.7699853181838989, "step": 425 }, { "epoch": 0.5527980535279805, "grad_norm": 0.5491748452186584, "learning_rate": 9.519803163054149e-06, "loss": 0.7141760587692261, "step": 426 }, { "epoch": 0.554095701540957, "grad_norm": 0.6018276214599609, "learning_rate": 9.51674120521557e-06, "loss": 0.7314755916595459, "step": 427 }, { "epoch": 0.5553933495539335, "grad_norm": 0.6114900708198547, "learning_rate": 9.513670011784435e-06, "loss": 0.7220840454101562, "step": 428 }, { "epoch": 0.55669099756691, "grad_norm": 0.5553966760635376, "learning_rate": 9.510589589040554e-06, "loss": 0.630115270614624, "step": 429 }, { "epoch": 0.5579886455798865, "grad_norm": 0.5907071232795715, "learning_rate": 9.507499943282613e-06, "loss": 0.6516691446304321, "step": 430 }, { "epoch": 0.559286293592863, "grad_norm": 0.5842899084091187, "learning_rate": 9.504401080828154e-06, "loss": 0.7031220197677612, "step": 431 }, { "epoch": 0.5605839416058395, "grad_norm": 0.5828782916069031, "learning_rate": 9.501293008013568e-06, "loss": 0.7107349038124084, "step": 432 }, { "epoch": 0.5618815896188158, "grad_norm": 0.5939279198646545, "learning_rate": 9.498175731194077e-06, "loss": 0.7517828941345215, "step": 433 }, { "epoch": 0.5631792376317923, "grad_norm": 0.6058377623558044, "learning_rate": 9.495049256743723e-06, "loss": 0.7890589237213135, "step": 434 }, { "epoch": 0.5644768856447688, "grad_norm": 0.6133562922477722, "learning_rate": 9.491913591055356e-06, "loss": 0.6695548892021179, "step": 435 }, { "epoch": 0.5657745336577453, "grad_norm": 0.6204050183296204, "learning_rate": 9.488768740540615e-06, "loss": 0.7749900817871094, "step": 436 }, { "epoch": 0.5670721816707218, "grad_norm": 0.5636538863182068, "learning_rate": 9.485614711629927e-06, "loss": 0.6592154502868652, "step": 437 }, { "epoch": 0.5683698296836983, "grad_norm": 0.5660319328308105, "learning_rate": 9.482451510772482e-06, "loss": 0.7120122313499451, "step": 438 }, { "epoch": 0.5696674776966748, "grad_norm": 0.574423611164093, "learning_rate": 9.479279144436224e-06, "loss": 0.7538824081420898, "step": 439 }, { "epoch": 0.5709651257096513, "grad_norm": 0.5769577622413635, "learning_rate": 9.47609761910784e-06, "loss": 0.6975010633468628, "step": 440 }, { "epoch": 0.5722627737226277, "grad_norm": 1.1428693532943726, "learning_rate": 9.472906941292746e-06, "loss": 0.7184154987335205, "step": 441 }, { "epoch": 0.5735604217356042, "grad_norm": 0.6155918836593628, "learning_rate": 9.469707117515068e-06, "loss": 0.7325999140739441, "step": 442 }, { "epoch": 0.5748580697485807, "grad_norm": 0.6040661931037903, "learning_rate": 9.466498154317635e-06, "loss": 0.6905105113983154, "step": 443 }, { "epoch": 0.5761557177615572, "grad_norm": 0.6275285482406616, "learning_rate": 9.463280058261965e-06, "loss": 0.7441266775131226, "step": 444 }, { "epoch": 0.5774533657745337, "grad_norm": 0.5689868927001953, "learning_rate": 9.460052835928254e-06, "loss": 0.6997857093811035, "step": 445 }, { "epoch": 0.5787510137875101, "grad_norm": 0.5860233902931213, "learning_rate": 9.45681649391535e-06, "loss": 0.6657996773719788, "step": 446 }, { "epoch": 0.5800486618004866, "grad_norm": 0.5518195629119873, "learning_rate": 9.453571038840755e-06, "loss": 0.6410640478134155, "step": 447 }, { "epoch": 0.5813463098134631, "grad_norm": 0.7139276266098022, "learning_rate": 9.450316477340602e-06, "loss": 0.7444489598274231, "step": 448 }, { "epoch": 0.5826439578264396, "grad_norm": 0.6063182950019836, "learning_rate": 9.447052816069648e-06, "loss": 0.7016487121582031, "step": 449 }, { "epoch": 0.583941605839416, "grad_norm": 0.5990587472915649, "learning_rate": 9.443780061701252e-06, "loss": 0.7742944359779358, "step": 450 }, { "epoch": 0.5852392538523925, "grad_norm": 0.5863263010978699, "learning_rate": 9.44049822092737e-06, "loss": 0.7078189253807068, "step": 451 }, { "epoch": 0.586536901865369, "grad_norm": 0.5597153902053833, "learning_rate": 9.437207300458535e-06, "loss": 0.7037616968154907, "step": 452 }, { "epoch": 0.5878345498783455, "grad_norm": 0.5865596532821655, "learning_rate": 9.433907307023845e-06, "loss": 0.7111040353775024, "step": 453 }, { "epoch": 0.589132197891322, "grad_norm": 0.595535397529602, "learning_rate": 9.430598247370955e-06, "loss": 0.6840265393257141, "step": 454 }, { "epoch": 0.5904298459042985, "grad_norm": 0.6209713816642761, "learning_rate": 9.427280128266049e-06, "loss": 0.6608985066413879, "step": 455 }, { "epoch": 0.591727493917275, "grad_norm": 0.7749186158180237, "learning_rate": 9.423952956493846e-06, "loss": 0.6757811307907104, "step": 456 }, { "epoch": 0.5930251419302515, "grad_norm": 0.6284626126289368, "learning_rate": 9.420616738857568e-06, "loss": 0.6912366151809692, "step": 457 }, { "epoch": 0.5943227899432278, "grad_norm": 0.5830085277557373, "learning_rate": 9.417271482178938e-06, "loss": 0.7678932547569275, "step": 458 }, { "epoch": 0.5956204379562043, "grad_norm": 0.5680383443832397, "learning_rate": 9.413917193298153e-06, "loss": 0.7322279810905457, "step": 459 }, { "epoch": 0.5969180859691808, "grad_norm": 0.5904244184494019, "learning_rate": 9.41055387907389e-06, "loss": 0.6763080358505249, "step": 460 }, { "epoch": 0.5969180859691808, "eval_loss": 0.7040426731109619, "eval_runtime": 73.0729, "eval_samples_per_second": 71.052, "eval_steps_per_second": 8.882, "step": 460 }, { "epoch": 0.5982157339821573, "grad_norm": 0.5804091691970825, "learning_rate": 9.407181546383275e-06, "loss": 0.7188655138015747, "step": 461 }, { "epoch": 0.5995133819951338, "grad_norm": 0.5912026166915894, "learning_rate": 9.403800202121873e-06, "loss": 0.6785882711410522, "step": 462 }, { "epoch": 0.6008110300081103, "grad_norm": 0.5554898381233215, "learning_rate": 9.400409853203677e-06, "loss": 0.7052475214004517, "step": 463 }, { "epoch": 0.6021086780210868, "grad_norm": 0.6723419427871704, "learning_rate": 9.397010506561096e-06, "loss": 0.6488598585128784, "step": 464 }, { "epoch": 0.6034063260340633, "grad_norm": 0.5925308465957642, "learning_rate": 9.393602169144929e-06, "loss": 0.7316585779190063, "step": 465 }, { "epoch": 0.6047039740470398, "grad_norm": 0.7151989936828613, "learning_rate": 9.390184847924366e-06, "loss": 0.7060757875442505, "step": 466 }, { "epoch": 0.6060016220600162, "grad_norm": 0.5946957468986511, "learning_rate": 9.386758549886964e-06, "loss": 0.7584104537963867, "step": 467 }, { "epoch": 0.6072992700729927, "grad_norm": 0.568766176700592, "learning_rate": 9.383323282038632e-06, "loss": 0.725806713104248, "step": 468 }, { "epoch": 0.6085969180859692, "grad_norm": 0.5797498226165771, "learning_rate": 9.379879051403627e-06, "loss": 0.6769331693649292, "step": 469 }, { "epoch": 0.6098945660989457, "grad_norm": 0.7914499640464783, "learning_rate": 9.376425865024527e-06, "loss": 0.7631534934043884, "step": 470 }, { "epoch": 0.6111922141119221, "grad_norm": 0.601610004901886, "learning_rate": 9.372963729962227e-06, "loss": 0.8109684586524963, "step": 471 }, { "epoch": 0.6124898621248986, "grad_norm": 0.6191813349723816, "learning_rate": 9.369492653295913e-06, "loss": 0.6854857206344604, "step": 472 }, { "epoch": 0.6137875101378751, "grad_norm": 0.8444225192070007, "learning_rate": 9.366012642123061e-06, "loss": 0.7072763442993164, "step": 473 }, { "epoch": 0.6150851581508516, "grad_norm": 0.5926432609558105, "learning_rate": 9.362523703559412e-06, "loss": 0.7057541012763977, "step": 474 }, { "epoch": 0.616382806163828, "grad_norm": 0.5982694625854492, "learning_rate": 9.359025844738962e-06, "loss": 0.7388914823532104, "step": 475 }, { "epoch": 0.6176804541768045, "grad_norm": 0.6068631410598755, "learning_rate": 9.355519072813946e-06, "loss": 0.7815642356872559, "step": 476 }, { "epoch": 0.618978102189781, "grad_norm": 0.5807543396949768, "learning_rate": 9.352003394954827e-06, "loss": 0.7441459894180298, "step": 477 }, { "epoch": 0.6202757502027575, "grad_norm": 0.5668230056762695, "learning_rate": 9.348478818350277e-06, "loss": 0.7281776666641235, "step": 478 }, { "epoch": 0.621573398215734, "grad_norm": 0.6428498029708862, "learning_rate": 9.34494535020716e-06, "loss": 0.754060685634613, "step": 479 }, { "epoch": 0.6228710462287105, "grad_norm": 0.6553912162780762, "learning_rate": 9.341402997750526e-06, "loss": 0.6970114707946777, "step": 480 }, { "epoch": 0.624168694241687, "grad_norm": 0.5876368880271912, "learning_rate": 9.337851768223589e-06, "loss": 0.7278268933296204, "step": 481 }, { "epoch": 0.6254663422546635, "grad_norm": 0.6632186770439148, "learning_rate": 9.334291668887716e-06, "loss": 0.724956750869751, "step": 482 }, { "epoch": 0.6267639902676398, "grad_norm": 0.582115113735199, "learning_rate": 9.330722707022406e-06, "loss": 0.7292401790618896, "step": 483 }, { "epoch": 0.6280616382806163, "grad_norm": 0.5983607769012451, "learning_rate": 9.327144889925286e-06, "loss": 0.7359820604324341, "step": 484 }, { "epoch": 0.6293592862935928, "grad_norm": 0.594374418258667, "learning_rate": 9.323558224912083e-06, "loss": 0.7724255323410034, "step": 485 }, { "epoch": 0.6306569343065693, "grad_norm": 0.5669406056404114, "learning_rate": 9.319962719316621e-06, "loss": 0.7348428964614868, "step": 486 }, { "epoch": 0.6319545823195458, "grad_norm": 0.6060366630554199, "learning_rate": 9.3163583804908e-06, "loss": 0.682552695274353, "step": 487 }, { "epoch": 0.6332522303325223, "grad_norm": 0.6307089328765869, "learning_rate": 9.312745215804577e-06, "loss": 0.8117605447769165, "step": 488 }, { "epoch": 0.6345498783454988, "grad_norm": 0.5955522656440735, "learning_rate": 9.309123232645963e-06, "loss": 0.7129393219947815, "step": 489 }, { "epoch": 0.6358475263584753, "grad_norm": 0.6481534242630005, "learning_rate": 9.305492438420995e-06, "loss": 0.6988842487335205, "step": 490 }, { "epoch": 0.6371451743714518, "grad_norm": 0.5734648108482361, "learning_rate": 9.301852840553728e-06, "loss": 0.678565263748169, "step": 491 }, { "epoch": 0.6384428223844282, "grad_norm": 0.5938750505447388, "learning_rate": 9.298204446486221e-06, "loss": 0.7267583608627319, "step": 492 }, { "epoch": 0.6397404703974047, "grad_norm": 0.5493259429931641, "learning_rate": 9.294547263678515e-06, "loss": 0.665608286857605, "step": 493 }, { "epoch": 0.6410381184103812, "grad_norm": 0.6349811553955078, "learning_rate": 9.29088129960862e-06, "loss": 0.7591350078582764, "step": 494 }, { "epoch": 0.6423357664233577, "grad_norm": 0.5922753214836121, "learning_rate": 9.28720656177251e-06, "loss": 0.6984656453132629, "step": 495 }, { "epoch": 0.6436334144363342, "grad_norm": 0.5910064578056335, "learning_rate": 9.28352305768409e-06, "loss": 0.7371819019317627, "step": 496 }, { "epoch": 0.6449310624493106, "grad_norm": 0.5690438151359558, "learning_rate": 9.279830794875194e-06, "loss": 0.7185039520263672, "step": 497 }, { "epoch": 0.6462287104622871, "grad_norm": 0.6163010597229004, "learning_rate": 9.276129780895566e-06, "loss": 0.6993834972381592, "step": 498 }, { "epoch": 0.6475263584752636, "grad_norm": 0.6288541555404663, "learning_rate": 9.272420023312843e-06, "loss": 0.8217408657073975, "step": 499 }, { "epoch": 0.64882400648824, "grad_norm": 0.620994508266449, "learning_rate": 9.268701529712541e-06, "loss": 0.7522677779197693, "step": 500 }, { "epoch": 0.6501216545012165, "grad_norm": 0.5998205542564392, "learning_rate": 9.264974307698034e-06, "loss": 0.6935300827026367, "step": 501 }, { "epoch": 0.651419302514193, "grad_norm": 0.8760928511619568, "learning_rate": 9.261238364890553e-06, "loss": 0.7158179879188538, "step": 502 }, { "epoch": 0.6527169505271695, "grad_norm": 0.6253861784934998, "learning_rate": 9.257493708929153e-06, "loss": 0.7684556841850281, "step": 503 }, { "epoch": 0.654014598540146, "grad_norm": 0.6935423016548157, "learning_rate": 9.253740347470708e-06, "loss": 0.778200626373291, "step": 504 }, { "epoch": 0.6553122465531225, "grad_norm": 0.6469247937202454, "learning_rate": 9.24997828818989e-06, "loss": 0.7509121894836426, "step": 505 }, { "epoch": 0.656609894566099, "grad_norm": 0.6015416979789734, "learning_rate": 9.246207538779162e-06, "loss": 0.7778556942939758, "step": 506 }, { "epoch": 0.6579075425790755, "grad_norm": 0.5774285793304443, "learning_rate": 9.242428106948748e-06, "loss": 0.7515290975570679, "step": 507 }, { "epoch": 0.659205190592052, "grad_norm": 0.5681214332580566, "learning_rate": 9.238640000426635e-06, "loss": 0.7492050528526306, "step": 508 }, { "epoch": 0.6605028386050283, "grad_norm": 0.5640445351600647, "learning_rate": 9.234843226958537e-06, "loss": 0.6927063465118408, "step": 509 }, { "epoch": 0.6618004866180048, "grad_norm": 0.6083568334579468, "learning_rate": 9.231037794307896e-06, "loss": 0.7587168216705322, "step": 510 }, { "epoch": 0.6630981346309813, "grad_norm": 0.5821657776832581, "learning_rate": 9.22722371025586e-06, "loss": 0.7126904726028442, "step": 511 }, { "epoch": 0.6643957826439578, "grad_norm": 2.4457342624664307, "learning_rate": 9.223400982601262e-06, "loss": 0.6615161895751953, "step": 512 }, { "epoch": 0.6656934306569343, "grad_norm": 0.6009355187416077, "learning_rate": 9.219569619160618e-06, "loss": 0.7299069166183472, "step": 513 }, { "epoch": 0.6669910786699108, "grad_norm": 0.6069469451904297, "learning_rate": 9.215729627768093e-06, "loss": 0.7864600419998169, "step": 514 }, { "epoch": 0.6682887266828873, "grad_norm": 0.6514759659767151, "learning_rate": 9.2118810162755e-06, "loss": 0.6937267184257507, "step": 515 }, { "epoch": 0.6695863746958638, "grad_norm": 0.795812726020813, "learning_rate": 9.20802379255227e-06, "loss": 0.704431414604187, "step": 516 }, { "epoch": 0.6708840227088402, "grad_norm": 0.6042063236236572, "learning_rate": 9.204157964485454e-06, "loss": 0.7550405263900757, "step": 517 }, { "epoch": 0.6721816707218167, "grad_norm": 0.6756092309951782, "learning_rate": 9.200283539979691e-06, "loss": 0.7409992218017578, "step": 518 }, { "epoch": 0.6734793187347932, "grad_norm": 0.7710636854171753, "learning_rate": 9.196400526957198e-06, "loss": 0.7560484409332275, "step": 519 }, { "epoch": 0.6747769667477697, "grad_norm": 0.7084681987762451, "learning_rate": 9.192508933357753e-06, "loss": 0.7406056523323059, "step": 520 }, { "epoch": 0.6760746147607462, "grad_norm": 0.6131231188774109, "learning_rate": 9.188608767138683e-06, "loss": 0.7801857590675354, "step": 521 }, { "epoch": 0.6773722627737226, "grad_norm": 0.6520926356315613, "learning_rate": 9.184700036274837e-06, "loss": 0.7538937926292419, "step": 522 }, { "epoch": 0.6786699107866991, "grad_norm": 0.5901785492897034, "learning_rate": 9.180782748758583e-06, "loss": 0.7579227089881897, "step": 523 }, { "epoch": 0.6799675587996756, "grad_norm": 0.5867577195167542, "learning_rate": 9.17685691259978e-06, "loss": 0.7785968780517578, "step": 524 }, { "epoch": 0.681265206812652, "grad_norm": 0.6682732105255127, "learning_rate": 9.172922535825772e-06, "loss": 0.6564942598342896, "step": 525 }, { "epoch": 0.6825628548256285, "grad_norm": 0.5923816561698914, "learning_rate": 9.168979626481364e-06, "loss": 0.7041895985603333, "step": 526 }, { "epoch": 0.683860502838605, "grad_norm": 0.5651242136955261, "learning_rate": 9.165028192628803e-06, "loss": 0.7024134397506714, "step": 527 }, { "epoch": 0.6851581508515815, "grad_norm": 0.6138148307800293, "learning_rate": 9.161068242347777e-06, "loss": 0.680936872959137, "step": 528 }, { "epoch": 0.686455798864558, "grad_norm": 0.5655775666236877, "learning_rate": 9.157099783735378e-06, "loss": 0.6618273854255676, "step": 529 }, { "epoch": 0.6877534468775345, "grad_norm": 0.6033377051353455, "learning_rate": 9.1531228249061e-06, "loss": 0.7136421203613281, "step": 530 }, { "epoch": 0.689051094890511, "grad_norm": 0.7331950068473816, "learning_rate": 9.149137373991819e-06, "loss": 0.7970547676086426, "step": 531 }, { "epoch": 0.6903487429034875, "grad_norm": 0.5791338682174683, "learning_rate": 9.145143439141771e-06, "loss": 0.6997847557067871, "step": 532 }, { "epoch": 0.691646390916464, "grad_norm": 0.578549325466156, "learning_rate": 9.141141028522544e-06, "loss": 0.7562875151634216, "step": 533 }, { "epoch": 0.6929440389294403, "grad_norm": 1.920037865638733, "learning_rate": 9.137130150318055e-06, "loss": 0.6756929755210876, "step": 534 }, { "epoch": 0.6942416869424168, "grad_norm": 0.6300271153450012, "learning_rate": 9.133110812729532e-06, "loss": 0.7216504216194153, "step": 535 }, { "epoch": 0.6955393349553933, "grad_norm": 0.6114068031311035, "learning_rate": 9.129083023975505e-06, "loss": 0.7115483283996582, "step": 536 }, { "epoch": 0.6968369829683698, "grad_norm": 0.6002055406570435, "learning_rate": 9.125046792291784e-06, "loss": 0.7236282229423523, "step": 537 }, { "epoch": 0.6981346309813463, "grad_norm": 0.6047035455703735, "learning_rate": 9.121002125931436e-06, "loss": 0.6811922788619995, "step": 538 }, { "epoch": 0.6994322789943228, "grad_norm": 0.6067850589752197, "learning_rate": 9.116949033164785e-06, "loss": 0.7463216781616211, "step": 539 }, { "epoch": 0.7007299270072993, "grad_norm": 0.5822233557701111, "learning_rate": 9.112887522279378e-06, "loss": 0.7334940433502197, "step": 540 }, { "epoch": 0.7020275750202758, "grad_norm": 0.5947557687759399, "learning_rate": 9.108817601579978e-06, "loss": 0.7504947185516357, "step": 541 }, { "epoch": 0.7033252230332522, "grad_norm": 0.6123725771903992, "learning_rate": 9.104739279388542e-06, "loss": 0.7778276205062866, "step": 542 }, { "epoch": 0.7046228710462287, "grad_norm": 0.6185777187347412, "learning_rate": 9.100652564044206e-06, "loss": 0.7200486660003662, "step": 543 }, { "epoch": 0.7059205190592052, "grad_norm": 1.0739803314208984, "learning_rate": 9.09655746390327e-06, "loss": 0.7538056969642639, "step": 544 }, { "epoch": 0.7072181670721817, "grad_norm": 0.5895283818244934, "learning_rate": 9.092453987339174e-06, "loss": 0.6963307857513428, "step": 545 }, { "epoch": 0.7085158150851582, "grad_norm": 0.5688499212265015, "learning_rate": 9.088342142742493e-06, "loss": 0.7032905220985413, "step": 546 }, { "epoch": 0.7098134630981346, "grad_norm": 0.6233918070793152, "learning_rate": 9.084221938520906e-06, "loss": 0.6713303923606873, "step": 547 }, { "epoch": 0.7111111111111111, "grad_norm": 0.7095353007316589, "learning_rate": 9.080093383099187e-06, "loss": 0.7268386483192444, "step": 548 }, { "epoch": 0.7124087591240876, "grad_norm": 0.6135478019714355, "learning_rate": 9.07595648491919e-06, "loss": 0.7246679663658142, "step": 549 }, { "epoch": 0.7137064071370641, "grad_norm": 0.582713782787323, "learning_rate": 9.071811252439823e-06, "loss": 0.691692590713501, "step": 550 }, { "epoch": 0.7150040551500405, "grad_norm": 0.542813241481781, "learning_rate": 9.067657694137038e-06, "loss": 0.7191475629806519, "step": 551 }, { "epoch": 0.716301703163017, "grad_norm": 0.6026738286018372, "learning_rate": 9.063495818503809e-06, "loss": 0.7817606925964355, "step": 552 }, { "epoch": 0.7175993511759935, "grad_norm": 0.8981631398200989, "learning_rate": 9.059325634050118e-06, "loss": 0.7415137887001038, "step": 553 }, { "epoch": 0.71889699918897, "grad_norm": 0.624947190284729, "learning_rate": 9.05514714930294e-06, "loss": 0.7271240949630737, "step": 554 }, { "epoch": 0.7201946472019465, "grad_norm": 0.5546719431877136, "learning_rate": 9.050960372806214e-06, "loss": 0.698599100112915, "step": 555 }, { "epoch": 0.721492295214923, "grad_norm": 0.5948834419250488, "learning_rate": 9.046765313120842e-06, "loss": 0.7756059169769287, "step": 556 }, { "epoch": 0.7227899432278995, "grad_norm": 0.5877026915550232, "learning_rate": 9.042561978824657e-06, "loss": 0.7625119090080261, "step": 557 }, { "epoch": 0.724087591240876, "grad_norm": 0.6063138246536255, "learning_rate": 9.038350378512417e-06, "loss": 0.7803001403808594, "step": 558 }, { "epoch": 0.7253852392538523, "grad_norm": 0.5974534153938293, "learning_rate": 9.034130520795774e-06, "loss": 0.716859757900238, "step": 559 }, { "epoch": 0.7266828872668288, "grad_norm": 0.5728408694267273, "learning_rate": 9.029902414303273e-06, "loss": 0.749966561794281, "step": 560 }, { "epoch": 0.7279805352798053, "grad_norm": 0.5723510384559631, "learning_rate": 9.025666067680319e-06, "loss": 0.6597641706466675, "step": 561 }, { "epoch": 0.7292781832927818, "grad_norm": 0.6084505915641785, "learning_rate": 9.021421489589169e-06, "loss": 0.710649847984314, "step": 562 }, { "epoch": 0.7305758313057583, "grad_norm": 0.5824548006057739, "learning_rate": 9.017168688708913e-06, "loss": 0.6628729104995728, "step": 563 }, { "epoch": 0.7318734793187348, "grad_norm": 0.594218373298645, "learning_rate": 9.01290767373545e-06, "loss": 0.730206310749054, "step": 564 }, { "epoch": 0.7331711273317113, "grad_norm": 0.7261629700660706, "learning_rate": 9.008638453381477e-06, "loss": 0.6241463422775269, "step": 565 }, { "epoch": 0.7344687753446878, "grad_norm": 0.6365723609924316, "learning_rate": 9.004361036376472e-06, "loss": 0.7979130148887634, "step": 566 }, { "epoch": 0.7357664233576642, "grad_norm": 0.6350899934768677, "learning_rate": 9.000075431466668e-06, "loss": 0.7318904399871826, "step": 567 }, { "epoch": 0.7370640713706407, "grad_norm": 0.5833107829093933, "learning_rate": 8.995781647415041e-06, "loss": 0.6889808177947998, "step": 568 }, { "epoch": 0.7383617193836172, "grad_norm": 1.110663652420044, "learning_rate": 8.991479693001296e-06, "loss": 0.7418273687362671, "step": 569 }, { "epoch": 0.7396593673965937, "grad_norm": 0.5860966444015503, "learning_rate": 8.987169577021838e-06, "loss": 0.7295401096343994, "step": 570 }, { "epoch": 0.7409570154095702, "grad_norm": 2.7430782318115234, "learning_rate": 8.982851308289765e-06, "loss": 0.7898417711257935, "step": 571 }, { "epoch": 0.7422546634225466, "grad_norm": 0.6228799223899841, "learning_rate": 8.978524895634842e-06, "loss": 0.7360432147979736, "step": 572 }, { "epoch": 0.7435523114355231, "grad_norm": 0.6052027344703674, "learning_rate": 8.974190347903491e-06, "loss": 0.7148642539978027, "step": 573 }, { "epoch": 0.7448499594484996, "grad_norm": 0.5462301969528198, "learning_rate": 8.96984767395876e-06, "loss": 0.6608201861381531, "step": 574 }, { "epoch": 0.7461476074614761, "grad_norm": 0.6186708211898804, "learning_rate": 8.965496882680322e-06, "loss": 0.7763011455535889, "step": 575 }, { "epoch": 0.7474452554744525, "grad_norm": 0.5678666830062866, "learning_rate": 8.961137982964445e-06, "loss": 0.6967377662658691, "step": 576 }, { "epoch": 0.748742903487429, "grad_norm": 0.5985408425331116, "learning_rate": 8.95677098372397e-06, "loss": 0.7348828911781311, "step": 577 }, { "epoch": 0.7500405515004055, "grad_norm": 0.5867311954498291, "learning_rate": 8.95239589388831e-06, "loss": 0.7279753684997559, "step": 578 }, { "epoch": 0.751338199513382, "grad_norm": 0.5872586369514465, "learning_rate": 8.948012722403417e-06, "loss": 0.7667936086654663, "step": 579 }, { "epoch": 0.7526358475263585, "grad_norm": 0.6062989234924316, "learning_rate": 8.943621478231764e-06, "loss": 0.7433009147644043, "step": 580 }, { "epoch": 0.753933495539335, "grad_norm": 0.5952759981155396, "learning_rate": 8.939222170352333e-06, "loss": 0.7213162183761597, "step": 581 }, { "epoch": 0.7552311435523115, "grad_norm": 0.6251077651977539, "learning_rate": 8.9348148077606e-06, "loss": 0.6798166632652283, "step": 582 }, { "epoch": 0.756528791565288, "grad_norm": 0.6643015742301941, "learning_rate": 8.9303993994685e-06, "loss": 0.697973370552063, "step": 583 }, { "epoch": 0.7578264395782643, "grad_norm": 0.614818274974823, "learning_rate": 8.925975954504432e-06, "loss": 0.6740398406982422, "step": 584 }, { "epoch": 0.7591240875912408, "grad_norm": 0.5874298214912415, "learning_rate": 8.921544481913218e-06, "loss": 0.6789122819900513, "step": 585 }, { "epoch": 0.7604217356042173, "grad_norm": 0.5964909791946411, "learning_rate": 8.917104990756096e-06, "loss": 0.7620725631713867, "step": 586 }, { "epoch": 0.7617193836171938, "grad_norm": 0.6049628853797913, "learning_rate": 8.912657490110705e-06, "loss": 0.7080841064453125, "step": 587 }, { "epoch": 0.7630170316301703, "grad_norm": 0.5781946778297424, "learning_rate": 8.908201989071055e-06, "loss": 0.7524607181549072, "step": 588 }, { "epoch": 0.7643146796431468, "grad_norm": 0.585602879524231, "learning_rate": 8.903738496747523e-06, "loss": 0.775031566619873, "step": 589 }, { "epoch": 0.7656123276561233, "grad_norm": 0.5722633004188538, "learning_rate": 8.899267022266815e-06, "loss": 0.7250426411628723, "step": 590 }, { "epoch": 0.7669099756690998, "grad_norm": 0.5955145359039307, "learning_rate": 8.894787574771968e-06, "loss": 0.7013397216796875, "step": 591 }, { "epoch": 0.7682076236820763, "grad_norm": 0.5935817956924438, "learning_rate": 8.890300163422319e-06, "loss": 0.7290763854980469, "step": 592 }, { "epoch": 0.7695052716950527, "grad_norm": 0.5822441577911377, "learning_rate": 8.885804797393484e-06, "loss": 0.7267876863479614, "step": 593 }, { "epoch": 0.7708029197080292, "grad_norm": 0.6610195636749268, "learning_rate": 8.881301485877355e-06, "loss": 0.7642419338226318, "step": 594 }, { "epoch": 0.7721005677210057, "grad_norm": 0.5827111005783081, "learning_rate": 8.87679023808206e-06, "loss": 0.6633021831512451, "step": 595 }, { "epoch": 0.7733982157339822, "grad_norm": 0.5982354283332825, "learning_rate": 8.87227106323196e-06, "loss": 0.7427453994750977, "step": 596 }, { "epoch": 0.7746958637469586, "grad_norm": 0.5927367210388184, "learning_rate": 8.867743970567625e-06, "loss": 0.6740269660949707, "step": 597 }, { "epoch": 0.7759935117599351, "grad_norm": 0.5812351703643799, "learning_rate": 8.86320896934581e-06, "loss": 0.7781720161437988, "step": 598 }, { "epoch": 0.7772911597729116, "grad_norm": 0.5589850544929504, "learning_rate": 8.858666068839447e-06, "loss": 0.6646384000778198, "step": 599 }, { "epoch": 0.7785888077858881, "grad_norm": 0.6152946352958679, "learning_rate": 8.85411527833762e-06, "loss": 0.7158241868019104, "step": 600 }, { "epoch": 0.7798864557988645, "grad_norm": 0.6571215987205505, "learning_rate": 8.849556607145541e-06, "loss": 0.6301259994506836, "step": 601 }, { "epoch": 0.781184103811841, "grad_norm": 0.650355339050293, "learning_rate": 8.84499006458454e-06, "loss": 0.7729838490486145, "step": 602 }, { "epoch": 0.7824817518248175, "grad_norm": 0.5668020844459534, "learning_rate": 8.840415659992038e-06, "loss": 0.7071006298065186, "step": 603 }, { "epoch": 0.783779399837794, "grad_norm": 0.5940731763839722, "learning_rate": 8.835833402721538e-06, "loss": 0.709991991519928, "step": 604 }, { "epoch": 0.7850770478507705, "grad_norm": 0.6069549918174744, "learning_rate": 8.831243302142595e-06, "loss": 0.7425503730773926, "step": 605 }, { "epoch": 0.786374695863747, "grad_norm": 0.6917547583580017, "learning_rate": 8.826645367640803e-06, "loss": 0.7509415149688721, "step": 606 }, { "epoch": 0.7876723438767235, "grad_norm": 0.5669399499893188, "learning_rate": 8.822039608617773e-06, "loss": 0.7422374486923218, "step": 607 }, { "epoch": 0.7889699918897, "grad_norm": 0.5998254418373108, "learning_rate": 8.81742603449112e-06, "loss": 0.6498250961303711, "step": 608 }, { "epoch": 0.7902676399026763, "grad_norm": 0.5784206390380859, "learning_rate": 8.81280465469443e-06, "loss": 0.7794440388679504, "step": 609 }, { "epoch": 0.7915652879156528, "grad_norm": 0.5644393563270569, "learning_rate": 8.808175478677261e-06, "loss": 0.697083055973053, "step": 610 }, { "epoch": 0.7928629359286293, "grad_norm": 0.5781574249267578, "learning_rate": 8.803538515905102e-06, "loss": 0.6970184445381165, "step": 611 }, { "epoch": 0.7941605839416058, "grad_norm": 0.585652768611908, "learning_rate": 8.79889377585937e-06, "loss": 0.7602633833885193, "step": 612 }, { "epoch": 0.7954582319545823, "grad_norm": 0.5716352462768555, "learning_rate": 8.79424126803738e-06, "loss": 0.717863142490387, "step": 613 }, { "epoch": 0.7967558799675588, "grad_norm": 0.5922728776931763, "learning_rate": 8.789581001952339e-06, "loss": 0.7333586812019348, "step": 614 }, { "epoch": 0.7980535279805353, "grad_norm": 0.7918326258659363, "learning_rate": 8.784912987133305e-06, "loss": 0.7329719066619873, "step": 615 }, { "epoch": 0.7993511759935118, "grad_norm": 0.6318597793579102, "learning_rate": 8.78023723312519e-06, "loss": 0.71714848279953, "step": 616 }, { "epoch": 0.8006488240064883, "grad_norm": 0.5931165814399719, "learning_rate": 8.775553749488729e-06, "loss": 0.6446089744567871, "step": 617 }, { "epoch": 0.8019464720194647, "grad_norm": 0.5699899196624756, "learning_rate": 8.770862545800459e-06, "loss": 0.6896922588348389, "step": 618 }, { "epoch": 0.8032441200324412, "grad_norm": 0.5788043141365051, "learning_rate": 8.766163631652702e-06, "loss": 0.7116216421127319, "step": 619 }, { "epoch": 0.8045417680454177, "grad_norm": 0.6152717471122742, "learning_rate": 8.76145701665355e-06, "loss": 0.7757282853126526, "step": 620 }, { "epoch": 0.8058394160583942, "grad_norm": 0.6117092967033386, "learning_rate": 8.756742710426842e-06, "loss": 0.6977071166038513, "step": 621 }, { "epoch": 0.8071370640713706, "grad_norm": 0.5893334150314331, "learning_rate": 8.752020722612135e-06, "loss": 0.7122848033905029, "step": 622 }, { "epoch": 0.8084347120843471, "grad_norm": 0.613097608089447, "learning_rate": 8.747291062864704e-06, "loss": 0.7448244094848633, "step": 623 }, { "epoch": 0.8097323600973236, "grad_norm": 0.5860653519630432, "learning_rate": 8.742553740855507e-06, "loss": 0.6702634692192078, "step": 624 }, { "epoch": 0.8110300081103001, "grad_norm": 0.6024116277694702, "learning_rate": 8.737808766271163e-06, "loss": 0.6898221969604492, "step": 625 }, { "epoch": 0.8123276561232765, "grad_norm": 0.5622679591178894, "learning_rate": 8.733056148813947e-06, "loss": 0.7181109189987183, "step": 626 }, { "epoch": 0.813625304136253, "grad_norm": 0.595656156539917, "learning_rate": 8.728295898201762e-06, "loss": 0.7352790832519531, "step": 627 }, { "epoch": 0.8149229521492295, "grad_norm": 0.5798142552375793, "learning_rate": 8.72352802416811e-06, "loss": 0.6691849231719971, "step": 628 }, { "epoch": 0.816220600162206, "grad_norm": 0.6328383088111877, "learning_rate": 8.718752536462089e-06, "loss": 0.7578571438789368, "step": 629 }, { "epoch": 0.8175182481751825, "grad_norm": 0.6140182018280029, "learning_rate": 8.713969444848365e-06, "loss": 0.8000912666320801, "step": 630 }, { "epoch": 0.818815896188159, "grad_norm": 0.5924091935157776, "learning_rate": 8.709178759107146e-06, "loss": 0.7412709593772888, "step": 631 }, { "epoch": 0.8201135442011355, "grad_norm": 0.5865992903709412, "learning_rate": 8.704380489034172e-06, "loss": 0.6817134022712708, "step": 632 }, { "epoch": 0.821411192214112, "grad_norm": 0.6066908240318298, "learning_rate": 8.699574644440696e-06, "loss": 0.7462890148162842, "step": 633 }, { "epoch": 0.8227088402270885, "grad_norm": 0.6996213793754578, "learning_rate": 8.694761235153446e-06, "loss": 0.7541388273239136, "step": 634 }, { "epoch": 0.8240064882400648, "grad_norm": 0.5837500691413879, "learning_rate": 8.689940271014631e-06, "loss": 0.7211518883705139, "step": 635 }, { "epoch": 0.8253041362530413, "grad_norm": 0.6041287183761597, "learning_rate": 8.685111761881902e-06, "loss": 0.7510079741477966, "step": 636 }, { "epoch": 0.8266017842660178, "grad_norm": 0.5609418153762817, "learning_rate": 8.680275717628336e-06, "loss": 0.7399103045463562, "step": 637 }, { "epoch": 0.8278994322789943, "grad_norm": 0.6362541913986206, "learning_rate": 8.675432148142423e-06, "loss": 0.7379388809204102, "step": 638 }, { "epoch": 0.8291970802919708, "grad_norm": 0.555855393409729, "learning_rate": 8.670581063328031e-06, "loss": 0.6878998279571533, "step": 639 }, { "epoch": 0.8304947283049473, "grad_norm": 0.5522022843360901, "learning_rate": 8.665722473104407e-06, "loss": 0.6912398338317871, "step": 640 }, { "epoch": 0.8317923763179238, "grad_norm": 0.6348553895950317, "learning_rate": 8.660856387406134e-06, "loss": 0.7144729495048523, "step": 641 }, { "epoch": 0.8330900243309003, "grad_norm": 0.5787035226821899, "learning_rate": 8.655982816183127e-06, "loss": 0.7252941727638245, "step": 642 }, { "epoch": 0.8343876723438767, "grad_norm": 1.6580746173858643, "learning_rate": 8.651101769400606e-06, "loss": 0.7200146913528442, "step": 643 }, { "epoch": 0.8356853203568532, "grad_norm": 1.0832597017288208, "learning_rate": 8.646213257039076e-06, "loss": 0.7684627771377563, "step": 644 }, { "epoch": 0.8369829683698297, "grad_norm": 1.513912320137024, "learning_rate": 8.641317289094306e-06, "loss": 0.7325241565704346, "step": 645 }, { "epoch": 0.8382806163828062, "grad_norm": 0.6023765802383423, "learning_rate": 8.636413875577314e-06, "loss": 0.74098801612854, "step": 646 }, { "epoch": 0.8395782643957826, "grad_norm": 0.6051165461540222, "learning_rate": 8.631503026514337e-06, "loss": 0.6847478151321411, "step": 647 }, { "epoch": 0.8408759124087591, "grad_norm": 0.5932079553604126, "learning_rate": 8.626584751946818e-06, "loss": 0.731514036655426, "step": 648 }, { "epoch": 0.8421735604217356, "grad_norm": 0.592435359954834, "learning_rate": 8.621659061931389e-06, "loss": 0.7055472731590271, "step": 649 }, { "epoch": 0.8434712084347121, "grad_norm": 2.370189905166626, "learning_rate": 8.616725966539831e-06, "loss": 0.6948425769805908, "step": 650 }, { "epoch": 0.8447688564476885, "grad_norm": 0.6067817807197571, "learning_rate": 8.611785475859083e-06, "loss": 0.7035855650901794, "step": 651 }, { "epoch": 0.846066504460665, "grad_norm": 0.6086214780807495, "learning_rate": 8.606837599991194e-06, "loss": 0.7720967531204224, "step": 652 }, { "epoch": 0.8473641524736415, "grad_norm": 0.5939242243766785, "learning_rate": 8.601882349053318e-06, "loss": 0.7347517609596252, "step": 653 }, { "epoch": 0.848661800486618, "grad_norm": 0.6451635360717773, "learning_rate": 8.596919733177692e-06, "loss": 0.6510732173919678, "step": 654 }, { "epoch": 0.8499594484995945, "grad_norm": 0.6460222601890564, "learning_rate": 8.591949762511606e-06, "loss": 0.6970388293266296, "step": 655 }, { "epoch": 0.851257096512571, "grad_norm": 0.5829662084579468, "learning_rate": 8.586972447217392e-06, "loss": 0.6706767678260803, "step": 656 }, { "epoch": 0.8525547445255475, "grad_norm": 0.5833383798599243, "learning_rate": 8.581987797472404e-06, "loss": 0.7589589357376099, "step": 657 }, { "epoch": 0.853852392538524, "grad_norm": 0.5842010974884033, "learning_rate": 8.576995823468984e-06, "loss": 0.7162166833877563, "step": 658 }, { "epoch": 0.8551500405515005, "grad_norm": 0.5614502429962158, "learning_rate": 8.571996535414457e-06, "loss": 0.6840311288833618, "step": 659 }, { "epoch": 0.8564476885644768, "grad_norm": 0.5722468495368958, "learning_rate": 8.566989943531106e-06, "loss": 0.7161433100700378, "step": 660 }, { "epoch": 0.8577453365774533, "grad_norm": 0.6029196977615356, "learning_rate": 8.561976058056138e-06, "loss": 0.7230268716812134, "step": 661 }, { "epoch": 0.8590429845904298, "grad_norm": 0.5787186622619629, "learning_rate": 8.556954889241682e-06, "loss": 0.7280833721160889, "step": 662 }, { "epoch": 0.8603406326034063, "grad_norm": 0.6488873362541199, "learning_rate": 8.551926447354759e-06, "loss": 0.6804985404014587, "step": 663 }, { "epoch": 0.8616382806163828, "grad_norm": 0.5842364430427551, "learning_rate": 8.546890742677259e-06, "loss": 0.669411301612854, "step": 664 }, { "epoch": 0.8629359286293593, "grad_norm": 0.5956006646156311, "learning_rate": 8.541847785505921e-06, "loss": 0.7321279048919678, "step": 665 }, { "epoch": 0.8642335766423358, "grad_norm": 3.8146164417266846, "learning_rate": 8.53679758615232e-06, "loss": 0.693459153175354, "step": 666 }, { "epoch": 0.8655312246553123, "grad_norm": 0.7075020670890808, "learning_rate": 8.531740154942834e-06, "loss": 0.6751031875610352, "step": 667 }, { "epoch": 0.8668288726682887, "grad_norm": 0.5840404629707336, "learning_rate": 8.526675502218629e-06, "loss": 0.7010972499847412, "step": 668 }, { "epoch": 0.8681265206812652, "grad_norm": 0.5663997530937195, "learning_rate": 8.521603638335638e-06, "loss": 0.7152513265609741, "step": 669 }, { "epoch": 0.8694241686942417, "grad_norm": 0.58479243516922, "learning_rate": 8.516524573664539e-06, "loss": 0.7431036233901978, "step": 670 }, { "epoch": 0.8707218167072182, "grad_norm": 0.5867894887924194, "learning_rate": 8.511438318590735e-06, "loss": 0.6411721706390381, "step": 671 }, { "epoch": 0.8720194647201946, "grad_norm": 0.595013439655304, "learning_rate": 8.506344883514328e-06, "loss": 0.6847820281982422, "step": 672 }, { "epoch": 0.8733171127331711, "grad_norm": 0.6092846989631653, "learning_rate": 8.501244278850105e-06, "loss": 0.7914074659347534, "step": 673 }, { "epoch": 0.8746147607461476, "grad_norm": 0.6108312606811523, "learning_rate": 8.496136515027511e-06, "loss": 0.7064344882965088, "step": 674 }, { "epoch": 0.8759124087591241, "grad_norm": 0.6098673343658447, "learning_rate": 8.491021602490632e-06, "loss": 0.7082339525222778, "step": 675 }, { "epoch": 0.8772100567721006, "grad_norm": 0.5852345824241638, "learning_rate": 8.485899551698166e-06, "loss": 0.6980363130569458, "step": 676 }, { "epoch": 0.878507704785077, "grad_norm": 0.60945725440979, "learning_rate": 8.480770373123415e-06, "loss": 0.7337608933448792, "step": 677 }, { "epoch": 0.8798053527980535, "grad_norm": 0.5622206926345825, "learning_rate": 8.475634077254248e-06, "loss": 0.7212387919425964, "step": 678 }, { "epoch": 0.88110300081103, "grad_norm": 1.9474778175354004, "learning_rate": 8.470490674593091e-06, "loss": 0.7507941722869873, "step": 679 }, { "epoch": 0.8824006488240065, "grad_norm": 0.5891706943511963, "learning_rate": 8.4653401756569e-06, "loss": 0.72685706615448, "step": 680 }, { "epoch": 0.883698296836983, "grad_norm": 0.5848804116249084, "learning_rate": 8.460182590977142e-06, "loss": 0.7391736507415771, "step": 681 }, { "epoch": 0.8849959448499595, "grad_norm": 0.5995469093322754, "learning_rate": 8.455017931099772e-06, "loss": 0.7077188491821289, "step": 682 }, { "epoch": 0.886293592862936, "grad_norm": 0.5778690576553345, "learning_rate": 8.449846206585211e-06, "loss": 0.7160015106201172, "step": 683 }, { "epoch": 0.8875912408759125, "grad_norm": 0.6114044785499573, "learning_rate": 8.44466742800833e-06, "loss": 0.7118149995803833, "step": 684 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5748172998428345, "learning_rate": 8.439481605958416e-06, "loss": 0.7232242822647095, "step": 685 }, { "epoch": 0.8901865369018653, "grad_norm": 0.9608264565467834, "learning_rate": 8.434288751039168e-06, "loss": 0.7293300032615662, "step": 686 }, { "epoch": 0.8914841849148418, "grad_norm": 0.5927110910415649, "learning_rate": 8.429088873868656e-06, "loss": 0.7629004716873169, "step": 687 }, { "epoch": 0.8927818329278183, "grad_norm": 0.5677574872970581, "learning_rate": 8.423881985079315e-06, "loss": 0.6493050456047058, "step": 688 }, { "epoch": 0.8940794809407948, "grad_norm": 0.5510875582695007, "learning_rate": 8.418668095317912e-06, "loss": 0.6685976386070251, "step": 689 }, { "epoch": 0.8953771289537713, "grad_norm": 0.5691307187080383, "learning_rate": 8.413447215245534e-06, "loss": 0.7029674053192139, "step": 690 }, { "epoch": 0.8953771289537713, "eval_loss": 0.6914051175117493, "eval_runtime": 73.0841, "eval_samples_per_second": 71.041, "eval_steps_per_second": 8.88, "step": 690 }, { "epoch": 0.8966747769667478, "grad_norm": 0.5947213172912598, "learning_rate": 8.408219355537557e-06, "loss": 0.7144750356674194, "step": 691 }, { "epoch": 0.8979724249797243, "grad_norm": 0.6758149266242981, "learning_rate": 8.402984526883635e-06, "loss": 0.7232916355133057, "step": 692 }, { "epoch": 0.8992700729927007, "grad_norm": 0.6068633198738098, "learning_rate": 8.397742739987664e-06, "loss": 0.6896466612815857, "step": 693 }, { "epoch": 0.9005677210056772, "grad_norm": 0.5855746865272522, "learning_rate": 8.392494005567773e-06, "loss": 0.7137375473976135, "step": 694 }, { "epoch": 0.9018653690186537, "grad_norm": 0.6378610134124756, "learning_rate": 8.387238334356294e-06, "loss": 0.6991242170333862, "step": 695 }, { "epoch": 0.9031630170316302, "grad_norm": 0.5615161657333374, "learning_rate": 8.381975737099745e-06, "loss": 0.7315720319747925, "step": 696 }, { "epoch": 0.9044606650446066, "grad_norm": 0.5945183634757996, "learning_rate": 8.376706224558807e-06, "loss": 0.7387629151344299, "step": 697 }, { "epoch": 0.9057583130575831, "grad_norm": 0.5757802724838257, "learning_rate": 8.3714298075083e-06, "loss": 0.769163191318512, "step": 698 }, { "epoch": 0.9070559610705596, "grad_norm": 0.6023557186126709, "learning_rate": 8.366146496737158e-06, "loss": 0.7032333016395569, "step": 699 }, { "epoch": 0.9083536090835361, "grad_norm": 0.5623191595077515, "learning_rate": 8.360856303048417e-06, "loss": 0.688059389591217, "step": 700 }, { "epoch": 0.9096512570965126, "grad_norm": 0.6660424470901489, "learning_rate": 8.355559237259181e-06, "loss": 0.6570596098899841, "step": 701 }, { "epoch": 0.910948905109489, "grad_norm": 0.6358682513237, "learning_rate": 8.350255310200611e-06, "loss": 0.6851440668106079, "step": 702 }, { "epoch": 0.9122465531224655, "grad_norm": 0.5915968418121338, "learning_rate": 8.344944532717898e-06, "loss": 0.7370898127555847, "step": 703 }, { "epoch": 0.913544201135442, "grad_norm": 0.6724914908409119, "learning_rate": 8.339626915670234e-06, "loss": 0.6419695615768433, "step": 704 }, { "epoch": 0.9148418491484185, "grad_norm": 0.5758830308914185, "learning_rate": 8.3343024699308e-06, "loss": 0.7100552320480347, "step": 705 }, { "epoch": 0.916139497161395, "grad_norm": 0.5856196284294128, "learning_rate": 8.328971206386742e-06, "loss": 0.7285655736923218, "step": 706 }, { "epoch": 0.9174371451743715, "grad_norm": 0.6096091270446777, "learning_rate": 8.323633135939145e-06, "loss": 0.7508881092071533, "step": 707 }, { "epoch": 0.918734793187348, "grad_norm": 0.5876352787017822, "learning_rate": 8.318288269503007e-06, "loss": 0.7147477865219116, "step": 708 }, { "epoch": 0.9200324412003245, "grad_norm": 0.5633363127708435, "learning_rate": 8.312936618007232e-06, "loss": 0.7191579937934875, "step": 709 }, { "epoch": 0.9213300892133008, "grad_norm": 0.6324480772018433, "learning_rate": 8.307578192394592e-06, "loss": 0.6980431079864502, "step": 710 }, { "epoch": 0.9226277372262773, "grad_norm": 0.559508740901947, "learning_rate": 8.30221300362171e-06, "loss": 0.6977928280830383, "step": 711 }, { "epoch": 0.9239253852392538, "grad_norm": 0.5924115180969238, "learning_rate": 8.29684106265904e-06, "loss": 0.7254680395126343, "step": 712 }, { "epoch": 0.9252230332522303, "grad_norm": 0.5572075843811035, "learning_rate": 8.291462380490842e-06, "loss": 0.7060861587524414, "step": 713 }, { "epoch": 0.9265206812652068, "grad_norm": 0.5710304975509644, "learning_rate": 8.286076968115158e-06, "loss": 0.6528699398040771, "step": 714 }, { "epoch": 0.9278183292781833, "grad_norm": 0.7677385210990906, "learning_rate": 8.280684836543794e-06, "loss": 0.7742418646812439, "step": 715 }, { "epoch": 0.9291159772911598, "grad_norm": 0.5909350514411926, "learning_rate": 8.275285996802293e-06, "loss": 0.7355895042419434, "step": 716 }, { "epoch": 0.9304136253041363, "grad_norm": 0.6246051788330078, "learning_rate": 8.269880459929919e-06, "loss": 0.7119331955909729, "step": 717 }, { "epoch": 0.9317112733171128, "grad_norm": 1.3237872123718262, "learning_rate": 8.264468236979626e-06, "loss": 0.724329948425293, "step": 718 }, { "epoch": 0.9330089213300892, "grad_norm": 0.6042487621307373, "learning_rate": 8.259049339018036e-06, "loss": 0.7507586479187012, "step": 719 }, { "epoch": 0.9343065693430657, "grad_norm": 0.6646915078163147, "learning_rate": 8.25362377712543e-06, "loss": 0.7630937695503235, "step": 720 }, { "epoch": 0.9356042173560422, "grad_norm": 1.2076338529586792, "learning_rate": 8.248191562395703e-06, "loss": 0.6889426708221436, "step": 721 }, { "epoch": 0.9369018653690186, "grad_norm": 0.7128719091415405, "learning_rate": 8.242752705936363e-06, "loss": 0.7193243503570557, "step": 722 }, { "epoch": 0.9381995133819951, "grad_norm": 0.5779634714126587, "learning_rate": 8.237307218868493e-06, "loss": 0.7252578735351562, "step": 723 }, { "epoch": 0.9394971613949716, "grad_norm": 0.5774085521697998, "learning_rate": 8.231855112326738e-06, "loss": 0.7056664228439331, "step": 724 }, { "epoch": 0.9407948094079481, "grad_norm": 0.5759864449501038, "learning_rate": 8.226396397459272e-06, "loss": 0.7182119488716125, "step": 725 }, { "epoch": 0.9420924574209246, "grad_norm": 0.5475362539291382, "learning_rate": 8.22093108542779e-06, "loss": 0.7100398540496826, "step": 726 }, { "epoch": 0.943390105433901, "grad_norm": 0.6080360412597656, "learning_rate": 8.215459187407468e-06, "loss": 0.7540023326873779, "step": 727 }, { "epoch": 0.9446877534468775, "grad_norm": 0.5985339283943176, "learning_rate": 8.209980714586955e-06, "loss": 0.7655041217803955, "step": 728 }, { "epoch": 0.945985401459854, "grad_norm": 0.5587835311889648, "learning_rate": 8.20449567816834e-06, "loss": 0.7308551669120789, "step": 729 }, { "epoch": 0.9472830494728305, "grad_norm": 0.5767388939857483, "learning_rate": 8.199004089367136e-06, "loss": 0.7747267484664917, "step": 730 }, { "epoch": 0.948580697485807, "grad_norm": 0.5542681217193604, "learning_rate": 8.193505959412246e-06, "loss": 0.7009122371673584, "step": 731 }, { "epoch": 0.9498783454987835, "grad_norm": 0.7035977244377136, "learning_rate": 8.188001299545963e-06, "loss": 0.7160595655441284, "step": 732 }, { "epoch": 0.95117599351176, "grad_norm": 3.6369824409484863, "learning_rate": 8.182490121023918e-06, "loss": 0.7146700620651245, "step": 733 }, { "epoch": 0.9524736415247365, "grad_norm": 0.6017202734947205, "learning_rate": 8.176972435115075e-06, "loss": 0.7427970170974731, "step": 734 }, { "epoch": 0.9537712895377128, "grad_norm": 0.5797709822654724, "learning_rate": 8.17144825310171e-06, "loss": 0.7534258365631104, "step": 735 }, { "epoch": 0.9550689375506893, "grad_norm": 0.6132066249847412, "learning_rate": 8.165917586279374e-06, "loss": 0.6742781400680542, "step": 736 }, { "epoch": 0.9563665855636658, "grad_norm": 0.5700656175613403, "learning_rate": 8.16038044595688e-06, "loss": 0.7190455794334412, "step": 737 }, { "epoch": 0.9576642335766423, "grad_norm": 0.5793234705924988, "learning_rate": 8.15483684345628e-06, "loss": 0.7258193492889404, "step": 738 }, { "epoch": 0.9589618815896188, "grad_norm": 0.589043378829956, "learning_rate": 8.149286790112838e-06, "loss": 0.6817978620529175, "step": 739 }, { "epoch": 0.9602595296025953, "grad_norm": 0.5883787870407104, "learning_rate": 8.143730297275008e-06, "loss": 0.6951944828033447, "step": 740 }, { "epoch": 0.9615571776155718, "grad_norm": 0.6058008074760437, "learning_rate": 8.138167376304411e-06, "loss": 0.7065063118934631, "step": 741 }, { "epoch": 0.9628548256285483, "grad_norm": 0.5645580291748047, "learning_rate": 8.132598038575814e-06, "loss": 0.6607494354248047, "step": 742 }, { "epoch": 0.9641524736415248, "grad_norm": 0.5984307527542114, "learning_rate": 8.1270222954771e-06, "loss": 0.7731702327728271, "step": 743 }, { "epoch": 0.9654501216545012, "grad_norm": 0.5940436124801636, "learning_rate": 8.121440158409255e-06, "loss": 0.7217580080032349, "step": 744 }, { "epoch": 0.9667477696674777, "grad_norm": 0.6139102578163147, "learning_rate": 8.115851638786335e-06, "loss": 0.761775553226471, "step": 745 }, { "epoch": 0.9680454176804542, "grad_norm": 0.5621196627616882, "learning_rate": 8.11025674803545e-06, "loss": 0.7084890007972717, "step": 746 }, { "epoch": 0.9693430656934306, "grad_norm": 0.634238064289093, "learning_rate": 8.104655497596734e-06, "loss": 0.7413675785064697, "step": 747 }, { "epoch": 0.9706407137064071, "grad_norm": 0.6062578558921814, "learning_rate": 8.099047898923326e-06, "loss": 0.6940469741821289, "step": 748 }, { "epoch": 0.9719383617193836, "grad_norm": 1.2983204126358032, "learning_rate": 8.093433963481348e-06, "loss": 0.7091077566146851, "step": 749 }, { "epoch": 0.9732360097323601, "grad_norm": 0.5655047297477722, "learning_rate": 8.087813702749873e-06, "loss": 0.7066688537597656, "step": 750 }, { "epoch": 0.9745336577453366, "grad_norm": 0.6067200303077698, "learning_rate": 8.082187128220918e-06, "loss": 0.7150874137878418, "step": 751 }, { "epoch": 0.975831305758313, "grad_norm": 0.5860595107078552, "learning_rate": 8.076554251399398e-06, "loss": 0.7268061637878418, "step": 752 }, { "epoch": 0.9771289537712895, "grad_norm": 0.5691843628883362, "learning_rate": 8.070915083803124e-06, "loss": 0.7130003571510315, "step": 753 }, { "epoch": 0.978426601784266, "grad_norm": 0.5511523485183716, "learning_rate": 8.065269636962765e-06, "loss": 0.7632818222045898, "step": 754 }, { "epoch": 0.9797242497972425, "grad_norm": 0.9720051884651184, "learning_rate": 8.059617922421832e-06, "loss": 0.6920190453529358, "step": 755 }, { "epoch": 0.981021897810219, "grad_norm": 0.9689953327178955, "learning_rate": 8.053959951736647e-06, "loss": 0.7026671171188354, "step": 756 }, { "epoch": 0.9823195458231955, "grad_norm": 0.5877639055252075, "learning_rate": 8.048295736476332e-06, "loss": 0.7458422780036926, "step": 757 }, { "epoch": 0.983617193836172, "grad_norm": 0.5555517077445984, "learning_rate": 8.042625288222774e-06, "loss": 0.6832958459854126, "step": 758 }, { "epoch": 0.9849148418491485, "grad_norm": 0.5778935551643372, "learning_rate": 8.036948618570601e-06, "loss": 0.6715413331985474, "step": 759 }, { "epoch": 0.986212489862125, "grad_norm": 0.5913302898406982, "learning_rate": 8.031265739127167e-06, "loss": 0.6345862150192261, "step": 760 }, { "epoch": 0.9875101378751013, "grad_norm": 0.5491726994514465, "learning_rate": 8.025576661512524e-06, "loss": 0.6723500490188599, "step": 761 }, { "epoch": 0.9888077858880778, "grad_norm": 0.5520846247673035, "learning_rate": 8.019881397359395e-06, "loss": 0.7205091118812561, "step": 762 }, { "epoch": 0.9901054339010543, "grad_norm": 0.5902574062347412, "learning_rate": 8.014179958313154e-06, "loss": 0.7127419114112854, "step": 763 }, { "epoch": 0.9914030819140308, "grad_norm": 0.5558638572692871, "learning_rate": 8.008472356031795e-06, "loss": 0.6300485134124756, "step": 764 }, { "epoch": 0.9927007299270073, "grad_norm": 0.5584984421730042, "learning_rate": 8.00275860218593e-06, "loss": 0.6915569305419922, "step": 765 }, { "epoch": 0.9939983779399838, "grad_norm": 0.5804587006568909, "learning_rate": 7.99703870845873e-06, "loss": 0.7401936054229736, "step": 766 }, { "epoch": 0.9952960259529603, "grad_norm": 0.562065064907074, "learning_rate": 7.991312686545939e-06, "loss": 0.6845479011535645, "step": 767 }, { "epoch": 0.9965936739659368, "grad_norm": 0.5887646079063416, "learning_rate": 7.985580548155814e-06, "loss": 0.7238905429840088, "step": 768 }, { "epoch": 0.9978913219789132, "grad_norm": 0.70610111951828, "learning_rate": 7.979842305009133e-06, "loss": 0.6573514342308044, "step": 769 }, { "epoch": 0.9991889699918897, "grad_norm": 0.5765895843505859, "learning_rate": 7.974097968839149e-06, "loss": 0.6816248297691345, "step": 770 }, { "epoch": 1.0, "grad_norm": 0.7250688672065735, "learning_rate": 7.968347551391574e-06, "loss": 0.6674489974975586, "step": 771 }, { "epoch": 1.0012976480129765, "grad_norm": 0.6792595982551575, "learning_rate": 7.962591064424558e-06, "loss": 0.6514409184455872, "step": 772 }, { "epoch": 1.002595296025953, "grad_norm": 0.7125512361526489, "learning_rate": 7.95682851970866e-06, "loss": 0.6212759613990784, "step": 773 }, { "epoch": 1.0038929440389295, "grad_norm": 0.6438767313957214, "learning_rate": 7.951059929026826e-06, "loss": 0.6282512545585632, "step": 774 }, { "epoch": 1.005190592051906, "grad_norm": 0.572353720664978, "learning_rate": 7.94528530417436e-06, "loss": 0.6370311379432678, "step": 775 }, { "epoch": 1.0064882400648825, "grad_norm": 0.5794159173965454, "learning_rate": 7.939504656958913e-06, "loss": 0.6351627707481384, "step": 776 }, { "epoch": 1.007785888077859, "grad_norm": 0.6709707379341125, "learning_rate": 7.933717999200442e-06, "loss": 0.7240197658538818, "step": 777 }, { "epoch": 1.0090835360908355, "grad_norm": 0.7591879963874817, "learning_rate": 7.927925342731202e-06, "loss": 0.662930428981781, "step": 778 }, { "epoch": 1.010381184103812, "grad_norm": 0.6731166243553162, "learning_rate": 7.922126699395705e-06, "loss": 0.6665748357772827, "step": 779 }, { "epoch": 1.0116788321167882, "grad_norm": 0.6249240040779114, "learning_rate": 7.916322081050708e-06, "loss": 0.6313880681991577, "step": 780 }, { "epoch": 1.0129764801297647, "grad_norm": 0.6070784330368042, "learning_rate": 7.910511499565192e-06, "loss": 0.5778607130050659, "step": 781 }, { "epoch": 1.0142741281427412, "grad_norm": 0.5682867765426636, "learning_rate": 7.90469496682032e-06, "loss": 0.5984998941421509, "step": 782 }, { "epoch": 1.0155717761557177, "grad_norm": 0.5944799184799194, "learning_rate": 7.89887249470943e-06, "loss": 0.6242648363113403, "step": 783 }, { "epoch": 1.0168694241686942, "grad_norm": 0.8286924958229065, "learning_rate": 7.89304409513801e-06, "loss": 0.612074613571167, "step": 784 }, { "epoch": 1.0181670721816707, "grad_norm": 0.6117927432060242, "learning_rate": 7.887209780023652e-06, "loss": 0.6674654483795166, "step": 785 }, { "epoch": 1.0194647201946472, "grad_norm": 0.6768798828125, "learning_rate": 7.881369561296061e-06, "loss": 0.6811670660972595, "step": 786 }, { "epoch": 1.0207623682076237, "grad_norm": 0.6664367914199829, "learning_rate": 7.875523450897004e-06, "loss": 0.638746440410614, "step": 787 }, { "epoch": 1.0220600162206002, "grad_norm": 1.1638799905776978, "learning_rate": 7.869671460780297e-06, "loss": 0.6403613090515137, "step": 788 }, { "epoch": 1.0233576642335767, "grad_norm": 0.5986616015434265, "learning_rate": 7.863813602911777e-06, "loss": 0.6099958419799805, "step": 789 }, { "epoch": 1.0246553122465532, "grad_norm": 1.8672071695327759, "learning_rate": 7.857949889269285e-06, "loss": 0.6486390829086304, "step": 790 }, { "epoch": 1.0259529602595296, "grad_norm": 0.6674206852912903, "learning_rate": 7.852080331842627e-06, "loss": 0.5824840664863586, "step": 791 }, { "epoch": 1.0272506082725061, "grad_norm": 0.6552616953849792, "learning_rate": 7.846204942633564e-06, "loss": 0.7385782599449158, "step": 792 }, { "epoch": 1.0285482562854826, "grad_norm": 0.636968195438385, "learning_rate": 7.84032373365578e-06, "loss": 0.6557282209396362, "step": 793 }, { "epoch": 1.0298459042984591, "grad_norm": 0.5769335627555847, "learning_rate": 7.834436716934859e-06, "loss": 0.5607404708862305, "step": 794 }, { "epoch": 1.0311435523114356, "grad_norm": 0.6747480034828186, "learning_rate": 7.828543904508258e-06, "loss": 0.6176875829696655, "step": 795 }, { "epoch": 1.0324412003244121, "grad_norm": 0.5826826691627502, "learning_rate": 7.82264530842529e-06, "loss": 0.6352604627609253, "step": 796 }, { "epoch": 1.0337388483373884, "grad_norm": 0.5748003721237183, "learning_rate": 7.816740940747089e-06, "loss": 0.5930640697479248, "step": 797 }, { "epoch": 1.0350364963503649, "grad_norm": 0.5976374745368958, "learning_rate": 7.810830813546594e-06, "loss": 0.6040553450584412, "step": 798 }, { "epoch": 1.0363341443633414, "grad_norm": 0.5924686789512634, "learning_rate": 7.80491493890852e-06, "loss": 0.6496337652206421, "step": 799 }, { "epoch": 1.0376317923763179, "grad_norm": 0.5696931481361389, "learning_rate": 7.798993328929328e-06, "loss": 0.6347925662994385, "step": 800 }, { "epoch": 1.0389294403892944, "grad_norm": 0.5750864148139954, "learning_rate": 7.793065995717217e-06, "loss": 0.6404843330383301, "step": 801 }, { "epoch": 1.0402270884022708, "grad_norm": 0.5975061058998108, "learning_rate": 7.787132951392082e-06, "loss": 0.5997766256332397, "step": 802 }, { "epoch": 1.0415247364152473, "grad_norm": 0.6157170534133911, "learning_rate": 7.781194208085495e-06, "loss": 0.6501672267913818, "step": 803 }, { "epoch": 1.0428223844282238, "grad_norm": 0.6032687425613403, "learning_rate": 7.775249777940685e-06, "loss": 0.6564816832542419, "step": 804 }, { "epoch": 1.0441200324412003, "grad_norm": 0.5874586701393127, "learning_rate": 7.769299673112507e-06, "loss": 0.6064618825912476, "step": 805 }, { "epoch": 1.0454176804541768, "grad_norm": 0.6239724159240723, "learning_rate": 7.76334390576742e-06, "loss": 0.6170182228088379, "step": 806 }, { "epoch": 1.0467153284671533, "grad_norm": 0.6056293845176697, "learning_rate": 7.757382488083458e-06, "loss": 0.7019131183624268, "step": 807 }, { "epoch": 1.0480129764801298, "grad_norm": 0.5994875431060791, "learning_rate": 7.751415432250213e-06, "loss": 0.6316931247711182, "step": 808 }, { "epoch": 1.0493106244931063, "grad_norm": 0.6516374945640564, "learning_rate": 7.745442750468803e-06, "loss": 0.649019718170166, "step": 809 }, { "epoch": 1.0506082725060828, "grad_norm": 0.5792532563209534, "learning_rate": 7.739464454951853e-06, "loss": 0.6500118374824524, "step": 810 }, { "epoch": 1.0519059205190593, "grad_norm": 0.745469331741333, "learning_rate": 7.733480557923464e-06, "loss": 0.5821675658226013, "step": 811 }, { "epoch": 1.0532035685320358, "grad_norm": 0.6124119162559509, "learning_rate": 7.727491071619186e-06, "loss": 0.6384508609771729, "step": 812 }, { "epoch": 1.0545012165450123, "grad_norm": 0.5831156969070435, "learning_rate": 7.72149600828601e-06, "loss": 0.6578410267829895, "step": 813 }, { "epoch": 1.0557988645579885, "grad_norm": 0.605689287185669, "learning_rate": 7.715495380182314e-06, "loss": 0.6352893710136414, "step": 814 }, { "epoch": 1.057096512570965, "grad_norm": 0.5769819617271423, "learning_rate": 7.709489199577874e-06, "loss": 0.5956138372421265, "step": 815 }, { "epoch": 1.0583941605839415, "grad_norm": 1.2673306465148926, "learning_rate": 7.7034774787538e-06, "loss": 0.6302381753921509, "step": 816 }, { "epoch": 1.059691808596918, "grad_norm": 0.5970334410667419, "learning_rate": 7.697460230002545e-06, "loss": 0.6213703751564026, "step": 817 }, { "epoch": 1.0609894566098945, "grad_norm": 0.5932973623275757, "learning_rate": 7.691437465627859e-06, "loss": 0.6656537652015686, "step": 818 }, { "epoch": 1.062287104622871, "grad_norm": 0.5778910517692566, "learning_rate": 7.685409197944768e-06, "loss": 0.6016901135444641, "step": 819 }, { "epoch": 1.0635847526358475, "grad_norm": 0.6970887780189514, "learning_rate": 7.679375439279557e-06, "loss": 0.6404139995574951, "step": 820 }, { "epoch": 1.064882400648824, "grad_norm": 0.8317319750785828, "learning_rate": 7.673336201969733e-06, "loss": 0.670491099357605, "step": 821 }, { "epoch": 1.0661800486618005, "grad_norm": 0.5904209613800049, "learning_rate": 7.667291498364009e-06, "loss": 0.697813868522644, "step": 822 }, { "epoch": 1.067477696674777, "grad_norm": 0.6368371844291687, "learning_rate": 7.661241340822274e-06, "loss": 0.6957151889801025, "step": 823 }, { "epoch": 1.0687753446877535, "grad_norm": 0.6323496103286743, "learning_rate": 7.655185741715569e-06, "loss": 0.6282387375831604, "step": 824 }, { "epoch": 1.07007299270073, "grad_norm": 0.582459568977356, "learning_rate": 7.64912471342606e-06, "loss": 0.6632883548736572, "step": 825 }, { "epoch": 1.0713706407137065, "grad_norm": 0.5815753936767578, "learning_rate": 7.643058268347015e-06, "loss": 0.6437957882881165, "step": 826 }, { "epoch": 1.072668288726683, "grad_norm": 0.5913931131362915, "learning_rate": 7.636986418882783e-06, "loss": 0.6558079719543457, "step": 827 }, { "epoch": 1.0739659367396595, "grad_norm": 0.5545955300331116, "learning_rate": 7.630909177448755e-06, "loss": 0.6246286630630493, "step": 828 }, { "epoch": 1.075263584752636, "grad_norm": 0.5951606631278992, "learning_rate": 7.624826556471354e-06, "loss": 0.6540351510047913, "step": 829 }, { "epoch": 1.0765612327656124, "grad_norm": 0.6533515453338623, "learning_rate": 7.618738568388e-06, "loss": 0.6222127676010132, "step": 830 }, { "epoch": 1.0778588807785887, "grad_norm": 0.5797233581542969, "learning_rate": 7.612645225647086e-06, "loss": 0.5815407037734985, "step": 831 }, { "epoch": 1.0791565287915652, "grad_norm": 0.6024124622344971, "learning_rate": 7.60654654070796e-06, "loss": 0.609170138835907, "step": 832 }, { "epoch": 1.0804541768045417, "grad_norm": 0.6007437109947205, "learning_rate": 7.600442526040883e-06, "loss": 0.6566615104675293, "step": 833 }, { "epoch": 1.0817518248175182, "grad_norm": 0.6132609844207764, "learning_rate": 7.594333194127025e-06, "loss": 0.6762999892234802, "step": 834 }, { "epoch": 1.0830494728304947, "grad_norm": 0.6206640005111694, "learning_rate": 7.58821855745842e-06, "loss": 0.6008488535881042, "step": 835 }, { "epoch": 1.0843471208434712, "grad_norm": 0.5727500319480896, "learning_rate": 7.582098628537955e-06, "loss": 0.6291306018829346, "step": 836 }, { "epoch": 1.0856447688564477, "grad_norm": 0.5835679769515991, "learning_rate": 7.5759734198793365e-06, "loss": 0.598922848701477, "step": 837 }, { "epoch": 1.0869424168694242, "grad_norm": 0.6435012817382812, "learning_rate": 7.5698429440070616e-06, "loss": 0.6742567420005798, "step": 838 }, { "epoch": 1.0882400648824007, "grad_norm": 0.6521117687225342, "learning_rate": 7.563707213456405e-06, "loss": 0.7133705615997314, "step": 839 }, { "epoch": 1.0895377128953772, "grad_norm": 0.6230207085609436, "learning_rate": 7.5575662407733815e-06, "loss": 0.6346240043640137, "step": 840 }, { "epoch": 1.0908353609083536, "grad_norm": 0.6041070818901062, "learning_rate": 7.551420038514726e-06, "loss": 0.5786027908325195, "step": 841 }, { "epoch": 1.0921330089213301, "grad_norm": 0.6142879724502563, "learning_rate": 7.54526861924787e-06, "loss": 0.689670205116272, "step": 842 }, { "epoch": 1.0934306569343066, "grad_norm": 0.5727767944335938, "learning_rate": 7.5391119955509026e-06, "loss": 0.6093534827232361, "step": 843 }, { "epoch": 1.0947283049472831, "grad_norm": 0.5920162796974182, "learning_rate": 7.532950180012564e-06, "loss": 0.6508292555809021, "step": 844 }, { "epoch": 1.0960259529602596, "grad_norm": 0.6140349507331848, "learning_rate": 7.526783185232208e-06, "loss": 0.6522685885429382, "step": 845 }, { "epoch": 1.0973236009732361, "grad_norm": 0.6111754179000854, "learning_rate": 7.520611023819779e-06, "loss": 0.6456558704376221, "step": 846 }, { "epoch": 1.0986212489862126, "grad_norm": 0.5693365931510925, "learning_rate": 7.514433708395783e-06, "loss": 0.6057475805282593, "step": 847 }, { "epoch": 1.0999188969991889, "grad_norm": 0.6043863892555237, "learning_rate": 7.508251251591266e-06, "loss": 0.6344411969184875, "step": 848 }, { "epoch": 1.1012165450121654, "grad_norm": 0.6892386078834534, "learning_rate": 7.5020636660477894e-06, "loss": 0.6500993371009827, "step": 849 }, { "epoch": 1.1025141930251419, "grad_norm": 0.6054773926734924, "learning_rate": 7.4958709644174e-06, "loss": 0.6792426109313965, "step": 850 }, { "epoch": 1.1038118410381184, "grad_norm": 0.6106455326080322, "learning_rate": 7.4896731593626015e-06, "loss": 0.648511528968811, "step": 851 }, { "epoch": 1.1051094890510949, "grad_norm": 0.5832105875015259, "learning_rate": 7.4834702635563395e-06, "loss": 0.6617711782455444, "step": 852 }, { "epoch": 1.1064071370640713, "grad_norm": 0.668353259563446, "learning_rate": 7.477262289681966e-06, "loss": 0.6955296397209167, "step": 853 }, { "epoch": 1.1077047850770478, "grad_norm": 0.5962719917297363, "learning_rate": 7.471049250433214e-06, "loss": 0.680686354637146, "step": 854 }, { "epoch": 1.1090024330900243, "grad_norm": 0.6140416860580444, "learning_rate": 7.464831158514179e-06, "loss": 0.6445127725601196, "step": 855 }, { "epoch": 1.1103000811030008, "grad_norm": 0.6690049171447754, "learning_rate": 7.458608026639285e-06, "loss": 0.6185108423233032, "step": 856 }, { "epoch": 1.1115977291159773, "grad_norm": 0.7241218090057373, "learning_rate": 7.45237986753326e-06, "loss": 0.6828392744064331, "step": 857 }, { "epoch": 1.1128953771289538, "grad_norm": 0.6075162887573242, "learning_rate": 7.446146693931111e-06, "loss": 0.6688688397407532, "step": 858 }, { "epoch": 1.1141930251419303, "grad_norm": 0.7877935767173767, "learning_rate": 7.439908518578105e-06, "loss": 0.6596081852912903, "step": 859 }, { "epoch": 1.1154906731549068, "grad_norm": 0.5754934549331665, "learning_rate": 7.433665354229731e-06, "loss": 0.655542254447937, "step": 860 }, { "epoch": 1.1167883211678833, "grad_norm": 0.6457986831665039, "learning_rate": 7.4274172136516766e-06, "loss": 0.6543152928352356, "step": 861 }, { "epoch": 1.1180859691808598, "grad_norm": 0.5904266238212585, "learning_rate": 7.421164109619809e-06, "loss": 0.6421469449996948, "step": 862 }, { "epoch": 1.119383617193836, "grad_norm": 0.5537955164909363, "learning_rate": 7.4149060549201455e-06, "loss": 0.609650194644928, "step": 863 }, { "epoch": 1.1206812652068125, "grad_norm": 0.5964105129241943, "learning_rate": 7.408643062348824e-06, "loss": 0.6043794751167297, "step": 864 }, { "epoch": 1.121978913219789, "grad_norm": 0.5994772911071777, "learning_rate": 7.402375144712075e-06, "loss": 0.6849918365478516, "step": 865 }, { "epoch": 1.1232765612327655, "grad_norm": 0.6322051286697388, "learning_rate": 7.396102314826207e-06, "loss": 0.6219741106033325, "step": 866 }, { "epoch": 1.124574209245742, "grad_norm": 0.5794394016265869, "learning_rate": 7.389824585517569e-06, "loss": 0.6507738828659058, "step": 867 }, { "epoch": 1.1258718572587185, "grad_norm": 0.6662233471870422, "learning_rate": 7.3835419696225275e-06, "loss": 0.6731002330780029, "step": 868 }, { "epoch": 1.127169505271695, "grad_norm": 0.5842033624649048, "learning_rate": 7.377254479987445e-06, "loss": 0.6546036005020142, "step": 869 }, { "epoch": 1.1284671532846715, "grad_norm": 2.6347815990448, "learning_rate": 7.370962129468642e-06, "loss": 0.61831134557724, "step": 870 }, { "epoch": 1.129764801297648, "grad_norm": 0.6191915273666382, "learning_rate": 7.364664930932385e-06, "loss": 0.682953953742981, "step": 871 }, { "epoch": 1.1310624493106245, "grad_norm": 0.6216323375701904, "learning_rate": 7.35836289725485e-06, "loss": 0.6735019087791443, "step": 872 }, { "epoch": 1.132360097323601, "grad_norm": 0.5958914756774902, "learning_rate": 7.352056041322103e-06, "loss": 0.6420754194259644, "step": 873 }, { "epoch": 1.1336577453365775, "grad_norm": 0.5970807671546936, "learning_rate": 7.345744376030066e-06, "loss": 0.6589509844779968, "step": 874 }, { "epoch": 1.134955393349554, "grad_norm": 0.6387295126914978, "learning_rate": 7.339427914284498e-06, "loss": 0.5913777351379395, "step": 875 }, { "epoch": 1.1362530413625305, "grad_norm": 1.3676766157150269, "learning_rate": 7.3331066690009644e-06, "loss": 0.6156778931617737, "step": 876 }, { "epoch": 1.137550689375507, "grad_norm": 0.5990293025970459, "learning_rate": 7.326780653104813e-06, "loss": 0.6320254802703857, "step": 877 }, { "epoch": 1.1388483373884835, "grad_norm": 0.6619262099266052, "learning_rate": 7.320449879531143e-06, "loss": 0.6741781830787659, "step": 878 }, { "epoch": 1.14014598540146, "grad_norm": 0.6091610193252563, "learning_rate": 7.314114361224785e-06, "loss": 0.6403502821922302, "step": 879 }, { "epoch": 1.1414436334144362, "grad_norm": 0.6015101075172424, "learning_rate": 7.30777411114027e-06, "loss": 0.6477581858634949, "step": 880 }, { "epoch": 1.142741281427413, "grad_norm": 0.5771135687828064, "learning_rate": 7.301429142241805e-06, "loss": 0.5903566479682922, "step": 881 }, { "epoch": 1.1440389294403892, "grad_norm": 0.571612536907196, "learning_rate": 7.295079467503247e-06, "loss": 0.5671682357788086, "step": 882 }, { "epoch": 1.1453365774533657, "grad_norm": 0.7478623390197754, "learning_rate": 7.288725099908071e-06, "loss": 0.6659491658210754, "step": 883 }, { "epoch": 1.1466342254663422, "grad_norm": 0.6303284764289856, "learning_rate": 7.282366052449351e-06, "loss": 0.7001731395721436, "step": 884 }, { "epoch": 1.1479318734793187, "grad_norm": 0.5829930901527405, "learning_rate": 7.276002338129731e-06, "loss": 0.632986843585968, "step": 885 }, { "epoch": 1.1492295214922952, "grad_norm": 0.6018064022064209, "learning_rate": 7.269633969961395e-06, "loss": 0.6848266124725342, "step": 886 }, { "epoch": 1.1505271695052717, "grad_norm": 0.7479543089866638, "learning_rate": 7.2632609609660456e-06, "loss": 0.6810072064399719, "step": 887 }, { "epoch": 1.1518248175182482, "grad_norm": 0.5979959964752197, "learning_rate": 7.256883324174871e-06, "loss": 0.59900963306427, "step": 888 }, { "epoch": 1.1531224655312247, "grad_norm": 0.608985424041748, "learning_rate": 7.250501072628524e-06, "loss": 0.6502770185470581, "step": 889 }, { "epoch": 1.1544201135442012, "grad_norm": 0.5771687626838684, "learning_rate": 7.2441142193770955e-06, "loss": 0.6427179574966431, "step": 890 }, { "epoch": 1.1557177615571776, "grad_norm": 0.7472683787345886, "learning_rate": 7.237722777480083e-06, "loss": 0.6853768825531006, "step": 891 }, { "epoch": 1.1570154095701541, "grad_norm": 0.5946991443634033, "learning_rate": 7.231326760006368e-06, "loss": 0.6969834566116333, "step": 892 }, { "epoch": 1.1583130575831306, "grad_norm": 0.6238925457000732, "learning_rate": 7.224926180034186e-06, "loss": 0.6919976472854614, "step": 893 }, { "epoch": 1.1596107055961071, "grad_norm": 0.6162919402122498, "learning_rate": 7.218521050651106e-06, "loss": 0.6636837720870972, "step": 894 }, { "epoch": 1.1609083536090836, "grad_norm": 0.5723338723182678, "learning_rate": 7.212111384953993e-06, "loss": 0.6149659156799316, "step": 895 }, { "epoch": 1.1622060016220601, "grad_norm": 0.6074439883232117, "learning_rate": 7.205697196048992e-06, "loss": 0.6255541443824768, "step": 896 }, { "epoch": 1.1635036496350364, "grad_norm": 0.6277779936790466, "learning_rate": 7.199278497051498e-06, "loss": 0.6648150086402893, "step": 897 }, { "epoch": 1.164801297648013, "grad_norm": 0.6254341006278992, "learning_rate": 7.192855301086123e-06, "loss": 0.6707339882850647, "step": 898 }, { "epoch": 1.1660989456609894, "grad_norm": 0.6244154572486877, "learning_rate": 7.186427621286678e-06, "loss": 0.6344256401062012, "step": 899 }, { "epoch": 1.1673965936739659, "grad_norm": 0.6074284911155701, "learning_rate": 7.179995470796141e-06, "loss": 0.6663004159927368, "step": 900 }, { "epoch": 1.1686942416869424, "grad_norm": 0.6512662768363953, "learning_rate": 7.1735588627666346e-06, "loss": 0.6009752154350281, "step": 901 }, { "epoch": 1.1699918896999189, "grad_norm": 0.6028872132301331, "learning_rate": 7.167117810359387e-06, "loss": 0.5874291062355042, "step": 902 }, { "epoch": 1.1712895377128953, "grad_norm": 0.6266588568687439, "learning_rate": 7.160672326744726e-06, "loss": 0.6230692267417908, "step": 903 }, { "epoch": 1.1725871857258718, "grad_norm": 3.8021433353424072, "learning_rate": 7.154222425102033e-06, "loss": 0.6242640018463135, "step": 904 }, { "epoch": 1.1738848337388483, "grad_norm": 0.6971346735954285, "learning_rate": 7.1477681186197225e-06, "loss": 0.6548742651939392, "step": 905 }, { "epoch": 1.1751824817518248, "grad_norm": 0.612678587436676, "learning_rate": 7.141309420495219e-06, "loss": 0.6528737545013428, "step": 906 }, { "epoch": 1.1764801297648013, "grad_norm": 0.6218580007553101, "learning_rate": 7.134846343934924e-06, "loss": 0.6845676898956299, "step": 907 }, { "epoch": 1.1777777777777778, "grad_norm": 0.6113817691802979, "learning_rate": 7.128378902154195e-06, "loss": 0.6958880424499512, "step": 908 }, { "epoch": 1.1790754257907543, "grad_norm": 0.6120286583900452, "learning_rate": 7.121907108377313e-06, "loss": 0.6543635725975037, "step": 909 }, { "epoch": 1.1803730738037308, "grad_norm": 0.6076055765151978, "learning_rate": 7.115430975837457e-06, "loss": 0.6869640946388245, "step": 910 }, { "epoch": 1.1816707218167073, "grad_norm": 0.6232397556304932, "learning_rate": 7.10895051777668e-06, "loss": 0.6338291764259338, "step": 911 }, { "epoch": 1.1829683698296838, "grad_norm": 0.6153266429901123, "learning_rate": 7.1024657474458795e-06, "loss": 0.6337912678718567, "step": 912 }, { "epoch": 1.1842660178426603, "grad_norm": 0.6057350039482117, "learning_rate": 7.095976678104768e-06, "loss": 0.6359199285507202, "step": 913 }, { "epoch": 1.1855636658556366, "grad_norm": 0.6107894778251648, "learning_rate": 7.089483323021851e-06, "loss": 0.6233211755752563, "step": 914 }, { "epoch": 1.186861313868613, "grad_norm": 0.5987040400505066, "learning_rate": 7.082985695474394e-06, "loss": 0.6974512338638306, "step": 915 }, { "epoch": 1.1881589618815895, "grad_norm": 0.5928195118904114, "learning_rate": 7.076483808748402e-06, "loss": 0.6281331777572632, "step": 916 }, { "epoch": 1.189456609894566, "grad_norm": 0.751203179359436, "learning_rate": 7.069977676138588e-06, "loss": 0.6113827228546143, "step": 917 }, { "epoch": 1.1907542579075425, "grad_norm": 0.6335259079933167, "learning_rate": 7.063467310948346e-06, "loss": 0.5900315046310425, "step": 918 }, { "epoch": 1.192051905920519, "grad_norm": 0.6231621503829956, "learning_rate": 7.0569527264897275e-06, "loss": 0.6505625247955322, "step": 919 }, { "epoch": 1.1933495539334955, "grad_norm": 0.6135134696960449, "learning_rate": 7.050433936083405e-06, "loss": 0.6122363805770874, "step": 920 }, { "epoch": 1.1933495539334955, "eval_loss": 0.68769770860672, "eval_runtime": 73.0979, "eval_samples_per_second": 71.028, "eval_steps_per_second": 8.879, "step": 920 }, { "epoch": 1.194647201946472, "grad_norm": 0.5773142576217651, "learning_rate": 7.043910953058657e-06, "loss": 0.5964255332946777, "step": 921 }, { "epoch": 1.1959448499594485, "grad_norm": 0.6031613945960999, "learning_rate": 7.037383790753333e-06, "loss": 0.662893533706665, "step": 922 }, { "epoch": 1.197242497972425, "grad_norm": 0.6189724206924438, "learning_rate": 7.030852462513827e-06, "loss": 0.6189711093902588, "step": 923 }, { "epoch": 1.1985401459854015, "grad_norm": 0.6367059946060181, "learning_rate": 7.024316981695053e-06, "loss": 0.6123430132865906, "step": 924 }, { "epoch": 1.199837793998378, "grad_norm": 0.6039940118789673, "learning_rate": 7.017777361660414e-06, "loss": 0.6341007947921753, "step": 925 }, { "epoch": 1.2011354420113545, "grad_norm": 0.7465354204177856, "learning_rate": 7.011233615781777e-06, "loss": 0.6174352765083313, "step": 926 }, { "epoch": 1.202433090024331, "grad_norm": 0.6807838678359985, "learning_rate": 7.004685757439449e-06, "loss": 0.7061627507209778, "step": 927 }, { "epoch": 1.2037307380373075, "grad_norm": 0.5960806012153625, "learning_rate": 6.99813380002214e-06, "loss": 0.6526781320571899, "step": 928 }, { "epoch": 1.205028386050284, "grad_norm": 0.5771905183792114, "learning_rate": 6.991577756926948e-06, "loss": 0.6951519250869751, "step": 929 }, { "epoch": 1.2063260340632604, "grad_norm": 0.632168710231781, "learning_rate": 6.9850176415593195e-06, "loss": 0.6279127597808838, "step": 930 }, { "epoch": 1.2076236820762367, "grad_norm": 0.6110833287239075, "learning_rate": 6.978453467333028e-06, "loss": 0.6424981355667114, "step": 931 }, { "epoch": 1.2089213300892132, "grad_norm": 0.5829861164093018, "learning_rate": 6.9718852476701535e-06, "loss": 0.6850586533546448, "step": 932 }, { "epoch": 1.2102189781021897, "grad_norm": 0.6042872071266174, "learning_rate": 6.965312996001038e-06, "loss": 0.628888726234436, "step": 933 }, { "epoch": 1.2115166261151662, "grad_norm": 0.641800045967102, "learning_rate": 6.958736725764275e-06, "loss": 0.6589823961257935, "step": 934 }, { "epoch": 1.2128142741281427, "grad_norm": 0.5857986211776733, "learning_rate": 6.952156450406673e-06, "loss": 0.5867838859558105, "step": 935 }, { "epoch": 1.2141119221411192, "grad_norm": 0.6070905923843384, "learning_rate": 6.945572183383229e-06, "loss": 0.6120666265487671, "step": 936 }, { "epoch": 1.2154095701540957, "grad_norm": 0.620799720287323, "learning_rate": 6.9389839381571025e-06, "loss": 0.6689779758453369, "step": 937 }, { "epoch": 1.2167072181670722, "grad_norm": 3.69341778755188, "learning_rate": 6.932391728199587e-06, "loss": 0.6268787384033203, "step": 938 }, { "epoch": 1.2180048661800487, "grad_norm": 0.6159505248069763, "learning_rate": 6.925795566990083e-06, "loss": 0.6517162322998047, "step": 939 }, { "epoch": 1.2193025141930252, "grad_norm": 0.6000729203224182, "learning_rate": 6.919195468016073e-06, "loss": 0.6077402234077454, "step": 940 }, { "epoch": 1.2206001622060016, "grad_norm": 0.5589438080787659, "learning_rate": 6.9125914447730865e-06, "loss": 0.596868634223938, "step": 941 }, { "epoch": 1.2218978102189781, "grad_norm": 2.3887641429901123, "learning_rate": 6.905983510764681e-06, "loss": 0.6510117053985596, "step": 942 }, { "epoch": 1.2231954582319546, "grad_norm": 0.5905357003211975, "learning_rate": 6.899371679502408e-06, "loss": 0.6385715007781982, "step": 943 }, { "epoch": 1.2244931062449311, "grad_norm": 0.6210343837738037, "learning_rate": 6.89275596450579e-06, "loss": 0.5893187522888184, "step": 944 }, { "epoch": 1.2257907542579076, "grad_norm": 0.5834376215934753, "learning_rate": 6.886136379302288e-06, "loss": 0.6301822662353516, "step": 945 }, { "epoch": 1.2270884022708841, "grad_norm": 0.6120421886444092, "learning_rate": 6.87951293742728e-06, "loss": 0.6227176189422607, "step": 946 }, { "epoch": 1.2283860502838606, "grad_norm": 0.5846749544143677, "learning_rate": 6.872885652424028e-06, "loss": 0.5956023931503296, "step": 947 }, { "epoch": 1.2296836982968369, "grad_norm": 0.6237694025039673, "learning_rate": 6.866254537843651e-06, "loss": 0.619324266910553, "step": 948 }, { "epoch": 1.2309813463098134, "grad_norm": 0.6295216679573059, "learning_rate": 6.859619607245102e-06, "loss": 0.6520287990570068, "step": 949 }, { "epoch": 1.2322789943227899, "grad_norm": 0.6216979026794434, "learning_rate": 6.852980874195132e-06, "loss": 0.6138555407524109, "step": 950 }, { "epoch": 1.2335766423357664, "grad_norm": 0.59978848695755, "learning_rate": 6.846338352268273e-06, "loss": 0.6959421038627625, "step": 951 }, { "epoch": 1.2348742903487429, "grad_norm": 0.6199280619621277, "learning_rate": 6.839692055046801e-06, "loss": 0.6330957412719727, "step": 952 }, { "epoch": 1.2361719383617193, "grad_norm": 0.6078975200653076, "learning_rate": 6.833041996120707e-06, "loss": 0.6647271513938904, "step": 953 }, { "epoch": 1.2374695863746958, "grad_norm": 0.6505293846130371, "learning_rate": 6.826388189087683e-06, "loss": 0.6796462535858154, "step": 954 }, { "epoch": 1.2387672343876723, "grad_norm": 2.935091257095337, "learning_rate": 6.819730647553079e-06, "loss": 0.6220841407775879, "step": 955 }, { "epoch": 1.2400648824006488, "grad_norm": 0.6445925831794739, "learning_rate": 6.813069385129883e-06, "loss": 0.5865710973739624, "step": 956 }, { "epoch": 1.2413625304136253, "grad_norm": 0.5919390320777893, "learning_rate": 6.806404415438689e-06, "loss": 0.6186652779579163, "step": 957 }, { "epoch": 1.2426601784266018, "grad_norm": 0.601252019405365, "learning_rate": 6.7997357521076735e-06, "loss": 0.6536276340484619, "step": 958 }, { "epoch": 1.2439578264395783, "grad_norm": 1.1728289127349854, "learning_rate": 6.793063408772565e-06, "loss": 0.6327337026596069, "step": 959 }, { "epoch": 1.2452554744525548, "grad_norm": 0.6600290536880493, "learning_rate": 6.78638739907662e-06, "loss": 0.6598416566848755, "step": 960 }, { "epoch": 1.2465531224655313, "grad_norm": 0.6247118711471558, "learning_rate": 6.779707736670585e-06, "loss": 0.6106679439544678, "step": 961 }, { "epoch": 1.2478507704785078, "grad_norm": 0.588431179523468, "learning_rate": 6.773024435212678e-06, "loss": 0.6234384775161743, "step": 962 }, { "epoch": 1.2491484184914843, "grad_norm": 0.6060811281204224, "learning_rate": 6.7663375083685635e-06, "loss": 0.6653448343276978, "step": 963 }, { "epoch": 1.2504460665044608, "grad_norm": 0.7780699729919434, "learning_rate": 6.759646969811311e-06, "loss": 0.7183551788330078, "step": 964 }, { "epoch": 1.251743714517437, "grad_norm": 0.6161801815032959, "learning_rate": 6.752952833221379e-06, "loss": 0.693482518196106, "step": 965 }, { "epoch": 1.2530413625304138, "grad_norm": 0.5934755802154541, "learning_rate": 6.7462551122865825e-06, "loss": 0.6136157512664795, "step": 966 }, { "epoch": 1.25433901054339, "grad_norm": 0.5638807415962219, "learning_rate": 6.739553820702067e-06, "loss": 0.6110460758209229, "step": 967 }, { "epoch": 1.2556366585563665, "grad_norm": 2.232645273208618, "learning_rate": 6.732848972170276e-06, "loss": 0.5771392583847046, "step": 968 }, { "epoch": 1.256934306569343, "grad_norm": 0.5793489217758179, "learning_rate": 6.726140580400928e-06, "loss": 0.637577474117279, "step": 969 }, { "epoch": 1.2582319545823195, "grad_norm": 0.6198015213012695, "learning_rate": 6.719428659110987e-06, "loss": 0.6566798686981201, "step": 970 }, { "epoch": 1.259529602595296, "grad_norm": 8.447957992553711, "learning_rate": 6.712713222024633e-06, "loss": 0.6350081562995911, "step": 971 }, { "epoch": 1.2608272506082725, "grad_norm": 0.6281896233558655, "learning_rate": 6.705994282873233e-06, "loss": 0.6955903172492981, "step": 972 }, { "epoch": 1.262124898621249, "grad_norm": 0.5929207801818848, "learning_rate": 6.699271855395321e-06, "loss": 0.6420506834983826, "step": 973 }, { "epoch": 1.2634225466342255, "grad_norm": 0.6053920388221741, "learning_rate": 6.6925459533365576e-06, "loss": 0.6596835851669312, "step": 974 }, { "epoch": 1.264720194647202, "grad_norm": 0.6256871819496155, "learning_rate": 6.685816590449708e-06, "loss": 0.7071737051010132, "step": 975 }, { "epoch": 1.2660178426601785, "grad_norm": 0.5950897336006165, "learning_rate": 6.67908378049462e-06, "loss": 0.656615674495697, "step": 976 }, { "epoch": 1.267315490673155, "grad_norm": 0.6450179815292358, "learning_rate": 6.672347537238183e-06, "loss": 0.6895189881324768, "step": 977 }, { "epoch": 1.2686131386861315, "grad_norm": 0.6535899639129639, "learning_rate": 6.665607874454311e-06, "loss": 0.6748580932617188, "step": 978 }, { "epoch": 1.269910786699108, "grad_norm": 3.30841326713562, "learning_rate": 6.658864805923909e-06, "loss": 0.6493468284606934, "step": 979 }, { "epoch": 1.2712084347120842, "grad_norm": 0.6671776175498962, "learning_rate": 6.652118345434844e-06, "loss": 0.6867607235908508, "step": 980 }, { "epoch": 1.272506082725061, "grad_norm": 0.623457670211792, "learning_rate": 6.64536850678192e-06, "loss": 0.6442928314208984, "step": 981 }, { "epoch": 1.2738037307380372, "grad_norm": 0.5984421372413635, "learning_rate": 6.638615303766849e-06, "loss": 0.5990972518920898, "step": 982 }, { "epoch": 1.275101378751014, "grad_norm": 0.7166045904159546, "learning_rate": 6.631858750198223e-06, "loss": 0.6415522694587708, "step": 983 }, { "epoch": 1.2763990267639902, "grad_norm": 0.6510207056999207, "learning_rate": 6.625098859891483e-06, "loss": 0.6367224454879761, "step": 984 }, { "epoch": 1.2776966747769667, "grad_norm": 0.6455490589141846, "learning_rate": 6.618335646668894e-06, "loss": 0.6474705934524536, "step": 985 }, { "epoch": 1.2789943227899432, "grad_norm": 0.6324385404586792, "learning_rate": 6.611569124359516e-06, "loss": 0.6616948843002319, "step": 986 }, { "epoch": 1.2802919708029197, "grad_norm": 0.6118378043174744, "learning_rate": 6.604799306799172e-06, "loss": 0.628074586391449, "step": 987 }, { "epoch": 1.2815896188158962, "grad_norm": 0.5939401984214783, "learning_rate": 6.598026207830428e-06, "loss": 0.6460234522819519, "step": 988 }, { "epoch": 1.2828872668288727, "grad_norm": 0.5931558609008789, "learning_rate": 6.591249841302555e-06, "loss": 0.7053772211074829, "step": 989 }, { "epoch": 1.2841849148418492, "grad_norm": 0.6080952882766724, "learning_rate": 6.58447022107151e-06, "loss": 0.6465653777122498, "step": 990 }, { "epoch": 1.2854825628548256, "grad_norm": 0.5909331440925598, "learning_rate": 6.577687360999898e-06, "loss": 0.6280587911605835, "step": 991 }, { "epoch": 1.2867802108678021, "grad_norm": 0.6082817912101746, "learning_rate": 6.5709012749569535e-06, "loss": 0.6570587158203125, "step": 992 }, { "epoch": 1.2880778588807786, "grad_norm": 0.5879994630813599, "learning_rate": 6.564111976818501e-06, "loss": 0.6010950803756714, "step": 993 }, { "epoch": 1.2893755068937551, "grad_norm": 0.6213524341583252, "learning_rate": 6.5573194804669416e-06, "loss": 0.7210543751716614, "step": 994 }, { "epoch": 1.2906731549067316, "grad_norm": 0.8193002343177795, "learning_rate": 6.550523799791207e-06, "loss": 0.6705042123794556, "step": 995 }, { "epoch": 1.2919708029197081, "grad_norm": 0.6038559079170227, "learning_rate": 6.543724948686747e-06, "loss": 0.6417216062545776, "step": 996 }, { "epoch": 1.2932684509326844, "grad_norm": 0.6030299067497253, "learning_rate": 6.53692294105549e-06, "loss": 0.6307570338249207, "step": 997 }, { "epoch": 1.294566098945661, "grad_norm": 0.6002436876296997, "learning_rate": 6.53011779080582e-06, "loss": 0.6394779086112976, "step": 998 }, { "epoch": 1.2958637469586374, "grad_norm": 0.6847420334815979, "learning_rate": 6.523309511852547e-06, "loss": 0.7355165481567383, "step": 999 }, { "epoch": 1.2971613949716139, "grad_norm": 0.6133946180343628, "learning_rate": 6.516498118116878e-06, "loss": 0.6960593461990356, "step": 1000 }, { "epoch": 1.2984590429845904, "grad_norm": 0.6106923222541809, "learning_rate": 6.5096836235263904e-06, "loss": 0.6673066020011902, "step": 1001 }, { "epoch": 1.2997566909975669, "grad_norm": 0.6132566928863525, "learning_rate": 6.502866042015e-06, "loss": 0.6237598657608032, "step": 1002 }, { "epoch": 1.3010543390105433, "grad_norm": 0.8997653126716614, "learning_rate": 6.496045387522934e-06, "loss": 0.6304394006729126, "step": 1003 }, { "epoch": 1.3023519870235198, "grad_norm": 0.5679188966751099, "learning_rate": 6.489221673996708e-06, "loss": 0.575568675994873, "step": 1004 }, { "epoch": 1.3036496350364963, "grad_norm": 0.6406558752059937, "learning_rate": 6.482394915389085e-06, "loss": 0.632392406463623, "step": 1005 }, { "epoch": 1.3049472830494728, "grad_norm": 0.6094868183135986, "learning_rate": 6.475565125659063e-06, "loss": 0.6548421382904053, "step": 1006 }, { "epoch": 1.3062449310624493, "grad_norm": 0.5837537050247192, "learning_rate": 6.4687323187718276e-06, "loss": 0.6500783562660217, "step": 1007 }, { "epoch": 1.3075425790754258, "grad_norm": 0.5676296353340149, "learning_rate": 6.461896508698744e-06, "loss": 0.5843409299850464, "step": 1008 }, { "epoch": 1.3088402270884023, "grad_norm": 0.5929064154624939, "learning_rate": 6.455057709417312e-06, "loss": 0.5786738395690918, "step": 1009 }, { "epoch": 1.3101378751013788, "grad_norm": 0.6186608672142029, "learning_rate": 6.448215934911145e-06, "loss": 0.7198565006256104, "step": 1010 }, { "epoch": 1.3114355231143553, "grad_norm": 0.573298454284668, "learning_rate": 6.441371199169942e-06, "loss": 0.6153538227081299, "step": 1011 }, { "epoch": 1.3127331711273318, "grad_norm": 0.6731165051460266, "learning_rate": 6.434523516189453e-06, "loss": 0.6571598052978516, "step": 1012 }, { "epoch": 1.3140308191403083, "grad_norm": 0.5842266082763672, "learning_rate": 6.427672899971457e-06, "loss": 0.6164257526397705, "step": 1013 }, { "epoch": 1.3153284671532846, "grad_norm": 0.6072558760643005, "learning_rate": 6.4208193645237314e-06, "loss": 0.6229099035263062, "step": 1014 }, { "epoch": 1.3166261151662613, "grad_norm": 0.6617994904518127, "learning_rate": 6.413962923860021e-06, "loss": 0.634198009967804, "step": 1015 }, { "epoch": 1.3179237631792375, "grad_norm": 5.200798511505127, "learning_rate": 6.407103592000009e-06, "loss": 0.6058683395385742, "step": 1016 }, { "epoch": 1.319221411192214, "grad_norm": 0.5821889042854309, "learning_rate": 6.400241382969297e-06, "loss": 0.6865833401679993, "step": 1017 }, { "epoch": 1.3205190592051905, "grad_norm": 0.5700265169143677, "learning_rate": 6.393376310799363e-06, "loss": 0.6534625291824341, "step": 1018 }, { "epoch": 1.321816707218167, "grad_norm": 0.5971737504005432, "learning_rate": 6.386508389527544e-06, "loss": 0.6178575158119202, "step": 1019 }, { "epoch": 1.3231143552311435, "grad_norm": 0.5835508108139038, "learning_rate": 6.379637633196999e-06, "loss": 0.6270486116409302, "step": 1020 }, { "epoch": 1.32441200324412, "grad_norm": 0.5576135516166687, "learning_rate": 6.3727640558566865e-06, "loss": 0.6197627782821655, "step": 1021 }, { "epoch": 1.3257096512570965, "grad_norm": 0.6085971593856812, "learning_rate": 6.3658876715613315e-06, "loss": 0.6738483309745789, "step": 1022 }, { "epoch": 1.327007299270073, "grad_norm": 0.6080042719841003, "learning_rate": 6.3590084943713995e-06, "loss": 0.6581575870513916, "step": 1023 }, { "epoch": 1.3283049472830495, "grad_norm": 0.5854855179786682, "learning_rate": 6.35212653835307e-06, "loss": 0.6252275705337524, "step": 1024 }, { "epoch": 1.329602595296026, "grad_norm": 0.5838765501976013, "learning_rate": 6.345241817578196e-06, "loss": 0.6577827334403992, "step": 1025 }, { "epoch": 1.3309002433090025, "grad_norm": 0.5933322310447693, "learning_rate": 6.3383543461242914e-06, "loss": 0.6144447326660156, "step": 1026 }, { "epoch": 1.332197891321979, "grad_norm": 0.6093854904174805, "learning_rate": 6.331464138074493e-06, "loss": 0.6428185701370239, "step": 1027 }, { "epoch": 1.3334955393349555, "grad_norm": 0.6086922287940979, "learning_rate": 6.32457120751753e-06, "loss": 0.6779354810714722, "step": 1028 }, { "epoch": 1.334793187347932, "grad_norm": 0.5752759575843811, "learning_rate": 6.317675568547704e-06, "loss": 0.6089493036270142, "step": 1029 }, { "epoch": 1.3360908353609084, "grad_norm": 0.5942736268043518, "learning_rate": 6.310777235264849e-06, "loss": 0.6579400300979614, "step": 1030 }, { "epoch": 1.3373884833738847, "grad_norm": 0.5779575705528259, "learning_rate": 6.303876221774311e-06, "loss": 0.6444313526153564, "step": 1031 }, { "epoch": 1.3386861313868614, "grad_norm": 0.6055609583854675, "learning_rate": 6.296972542186915e-06, "loss": 0.6654270887374878, "step": 1032 }, { "epoch": 1.3399837793998377, "grad_norm": 0.60945063829422, "learning_rate": 6.2900662106189415e-06, "loss": 0.661444902420044, "step": 1033 }, { "epoch": 1.3412814274128142, "grad_norm": 1.2685060501098633, "learning_rate": 6.283157241192087e-06, "loss": 0.6629235148429871, "step": 1034 }, { "epoch": 1.3425790754257907, "grad_norm": 0.6141192317008972, "learning_rate": 6.276245648033447e-06, "loss": 0.6560642719268799, "step": 1035 }, { "epoch": 1.3438767234387672, "grad_norm": 0.5949526429176331, "learning_rate": 6.2693314452754796e-06, "loss": 0.7151345014572144, "step": 1036 }, { "epoch": 1.3451743714517437, "grad_norm": 0.6198956370353699, "learning_rate": 6.26241464705598e-06, "loss": 0.6870914101600647, "step": 1037 }, { "epoch": 1.3464720194647202, "grad_norm": 0.6193254590034485, "learning_rate": 6.25549526751805e-06, "loss": 0.6167775392532349, "step": 1038 }, { "epoch": 1.3477696674776967, "grad_norm": 0.582747757434845, "learning_rate": 6.24857332081007e-06, "loss": 0.6421079635620117, "step": 1039 }, { "epoch": 1.3490673154906732, "grad_norm": 0.6055523753166199, "learning_rate": 6.241648821085666e-06, "loss": 0.642772376537323, "step": 1040 }, { "epoch": 1.3503649635036497, "grad_norm": 0.5949704051017761, "learning_rate": 6.23472178250369e-06, "loss": 0.6979063153266907, "step": 1041 }, { "epoch": 1.3516626115166261, "grad_norm": 0.6037271022796631, "learning_rate": 6.227792219228183e-06, "loss": 0.6815102100372314, "step": 1042 }, { "epoch": 1.3529602595296026, "grad_norm": 0.5929699540138245, "learning_rate": 6.220860145428347e-06, "loss": 0.6474612951278687, "step": 1043 }, { "epoch": 1.3542579075425791, "grad_norm": 0.612301230430603, "learning_rate": 6.213925575278518e-06, "loss": 0.669405460357666, "step": 1044 }, { "epoch": 1.3555555555555556, "grad_norm": 0.5837467908859253, "learning_rate": 6.206988522958135e-06, "loss": 0.5990941524505615, "step": 1045 }, { "epoch": 1.3568532035685321, "grad_norm": 0.614405632019043, "learning_rate": 6.200049002651718e-06, "loss": 0.6845515370368958, "step": 1046 }, { "epoch": 1.3581508515815086, "grad_norm": 0.713435709476471, "learning_rate": 6.19310702854883e-06, "loss": 0.5659395456314087, "step": 1047 }, { "epoch": 1.3594484995944849, "grad_norm": 0.6173283457756042, "learning_rate": 6.186162614844047e-06, "loss": 0.6531370282173157, "step": 1048 }, { "epoch": 1.3607461476074616, "grad_norm": 0.6224690675735474, "learning_rate": 6.17921577573694e-06, "loss": 0.6006350517272949, "step": 1049 }, { "epoch": 1.3620437956204379, "grad_norm": 0.5716680288314819, "learning_rate": 6.172266525432036e-06, "loss": 0.6007007360458374, "step": 1050 }, { "epoch": 1.3633414436334144, "grad_norm": 0.964508593082428, "learning_rate": 6.165314878138794e-06, "loss": 0.5759468674659729, "step": 1051 }, { "epoch": 1.3646390916463909, "grad_norm": 1.8147951364517212, "learning_rate": 6.1583608480715705e-06, "loss": 0.6763917207717896, "step": 1052 }, { "epoch": 1.3659367396593673, "grad_norm": 0.5682212710380554, "learning_rate": 6.1514044494496e-06, "loss": 0.5627442002296448, "step": 1053 }, { "epoch": 1.3672343876723438, "grad_norm": 0.6249387860298157, "learning_rate": 6.144445696496955e-06, "loss": 0.7233635187149048, "step": 1054 }, { "epoch": 1.3685320356853203, "grad_norm": 0.5967603921890259, "learning_rate": 6.137484603442524e-06, "loss": 0.60671067237854, "step": 1055 }, { "epoch": 1.3698296836982968, "grad_norm": 0.9533456563949585, "learning_rate": 6.130521184519983e-06, "loss": 0.6718368530273438, "step": 1056 }, { "epoch": 1.3711273317112733, "grad_norm": 0.577439546585083, "learning_rate": 6.123555453967759e-06, "loss": 0.6093976497650146, "step": 1057 }, { "epoch": 1.3724249797242498, "grad_norm": 0.5558829307556152, "learning_rate": 6.1165874260290074e-06, "loss": 0.6086419224739075, "step": 1058 }, { "epoch": 1.3737226277372263, "grad_norm": 0.6080211400985718, "learning_rate": 6.109617114951581e-06, "loss": 0.6369859576225281, "step": 1059 }, { "epoch": 1.3750202757502028, "grad_norm": 0.70982426404953, "learning_rate": 6.102644534988006e-06, "loss": 0.6179996728897095, "step": 1060 }, { "epoch": 1.3763179237631793, "grad_norm": 0.6002638936042786, "learning_rate": 6.0956697003954404e-06, "loss": 0.6171343326568604, "step": 1061 }, { "epoch": 1.3776155717761558, "grad_norm": 0.74629145860672, "learning_rate": 6.088692625435656e-06, "loss": 0.64389967918396, "step": 1062 }, { "epoch": 1.378913219789132, "grad_norm": 0.5946625471115112, "learning_rate": 6.0817133243750046e-06, "loss": 0.6315205097198486, "step": 1063 }, { "epoch": 1.3802108678021088, "grad_norm": 0.6307440996170044, "learning_rate": 6.074731811484391e-06, "loss": 0.6365832090377808, "step": 1064 }, { "epoch": 1.381508515815085, "grad_norm": 0.958493173122406, "learning_rate": 6.067748101039243e-06, "loss": 0.588029146194458, "step": 1065 }, { "epoch": 1.3828061638280618, "grad_norm": 2.589282512664795, "learning_rate": 6.060762207319479e-06, "loss": 0.6348222494125366, "step": 1066 }, { "epoch": 1.384103811841038, "grad_norm": 0.6122376322746277, "learning_rate": 6.053774144609484e-06, "loss": 0.6187014579772949, "step": 1067 }, { "epoch": 1.3854014598540145, "grad_norm": 0.6017574071884155, "learning_rate": 6.046783927198079e-06, "loss": 0.646289587020874, "step": 1068 }, { "epoch": 1.386699107866991, "grad_norm": 0.5894978046417236, "learning_rate": 6.039791569378488e-06, "loss": 0.6435679197311401, "step": 1069 }, { "epoch": 1.3879967558799675, "grad_norm": 0.5931923389434814, "learning_rate": 6.032797085448315e-06, "loss": 0.6404111981391907, "step": 1070 }, { "epoch": 1.389294403892944, "grad_norm": 0.5930508971214294, "learning_rate": 6.025800489709505e-06, "loss": 0.6763365268707275, "step": 1071 }, { "epoch": 1.3905920519059205, "grad_norm": 0.621198832988739, "learning_rate": 6.018801796468328e-06, "loss": 0.7032692432403564, "step": 1072 }, { "epoch": 1.391889699918897, "grad_norm": 0.6337350606918335, "learning_rate": 6.0118010200353396e-06, "loss": 0.7524909973144531, "step": 1073 }, { "epoch": 1.3931873479318735, "grad_norm": 0.5976428389549255, "learning_rate": 6.004798174725358e-06, "loss": 0.6851296424865723, "step": 1074 }, { "epoch": 1.39448499594485, "grad_norm": 0.9778940081596375, "learning_rate": 5.997793274857427e-06, "loss": 0.6498898267745972, "step": 1075 }, { "epoch": 1.3957826439578265, "grad_norm": 0.638106644153595, "learning_rate": 5.990786334754795e-06, "loss": 0.707371711730957, "step": 1076 }, { "epoch": 1.397080291970803, "grad_norm": 4.414336204528809, "learning_rate": 5.983777368744881e-06, "loss": 0.6448768973350525, "step": 1077 }, { "epoch": 1.3983779399837795, "grad_norm": 0.5891152620315552, "learning_rate": 5.9767663911592454e-06, "loss": 0.6236732602119446, "step": 1078 }, { "epoch": 1.399675587996756, "grad_norm": 0.59264075756073, "learning_rate": 5.9697534163335645e-06, "loss": 0.6284846663475037, "step": 1079 }, { "epoch": 1.4009732360097322, "grad_norm": 0.6076551675796509, "learning_rate": 5.9627384586075954e-06, "loss": 0.6464221477508545, "step": 1080 }, { "epoch": 1.402270884022709, "grad_norm": 0.6048544645309448, "learning_rate": 5.955721532325151e-06, "loss": 0.6747769713401794, "step": 1081 }, { "epoch": 1.4035685320356852, "grad_norm": 1.4065572023391724, "learning_rate": 5.94870265183407e-06, "loss": 0.6566921472549438, "step": 1082 }, { "epoch": 1.404866180048662, "grad_norm": 0.5989380478858948, "learning_rate": 5.941681831486188e-06, "loss": 0.65166175365448, "step": 1083 }, { "epoch": 1.4061638280616382, "grad_norm": 0.566685676574707, "learning_rate": 5.934659085637303e-06, "loss": 0.6065230369567871, "step": 1084 }, { "epoch": 1.4074614760746147, "grad_norm": 0.5942695736885071, "learning_rate": 5.927634428647154e-06, "loss": 0.6362863183021545, "step": 1085 }, { "epoch": 1.4087591240875912, "grad_norm": 0.6072388887405396, "learning_rate": 5.920607874879387e-06, "loss": 0.6389554738998413, "step": 1086 }, { "epoch": 1.4100567721005677, "grad_norm": 0.6188286542892456, "learning_rate": 5.913579438701525e-06, "loss": 0.7114623188972473, "step": 1087 }, { "epoch": 1.4113544201135442, "grad_norm": 0.639775812625885, "learning_rate": 5.906549134484943e-06, "loss": 0.6554163694381714, "step": 1088 }, { "epoch": 1.4126520681265207, "grad_norm": 0.5889431238174438, "learning_rate": 5.899516976604832e-06, "loss": 0.6516610383987427, "step": 1089 }, { "epoch": 1.4139497161394972, "grad_norm": 0.5683200359344482, "learning_rate": 5.892482979440175e-06, "loss": 0.6421197652816772, "step": 1090 }, { "epoch": 1.4152473641524737, "grad_norm": 0.6089890003204346, "learning_rate": 5.885447157373716e-06, "loss": 0.6774452924728394, "step": 1091 }, { "epoch": 1.4165450121654501, "grad_norm": 0.6220733523368835, "learning_rate": 5.878409524791931e-06, "loss": 0.6213857531547546, "step": 1092 }, { "epoch": 1.4178426601784266, "grad_norm": 0.6474610567092896, "learning_rate": 5.871370096084997e-06, "loss": 0.6641533970832825, "step": 1093 }, { "epoch": 1.4191403081914031, "grad_norm": 0.5982186198234558, "learning_rate": 5.864328885646764e-06, "loss": 0.6307400465011597, "step": 1094 }, { "epoch": 1.4204379562043796, "grad_norm": 1.0146141052246094, "learning_rate": 5.857285907874725e-06, "loss": 0.6501115560531616, "step": 1095 }, { "epoch": 1.4217356042173561, "grad_norm": 0.6026197671890259, "learning_rate": 5.850241177169986e-06, "loss": 0.6877589225769043, "step": 1096 }, { "epoch": 1.4230332522303324, "grad_norm": 0.6162115931510925, "learning_rate": 5.84319470793724e-06, "loss": 0.6401875019073486, "step": 1097 }, { "epoch": 1.424330900243309, "grad_norm": 0.5684193968772888, "learning_rate": 5.836146514584733e-06, "loss": 0.6159685850143433, "step": 1098 }, { "epoch": 1.4256285482562854, "grad_norm": 0.6261927485466003, "learning_rate": 5.829096611524235e-06, "loss": 0.6478676199913025, "step": 1099 }, { "epoch": 1.426926196269262, "grad_norm": 0.6026703119277954, "learning_rate": 5.822045013171015e-06, "loss": 0.6607078313827515, "step": 1100 }, { "epoch": 1.4282238442822384, "grad_norm": 0.602356493473053, "learning_rate": 5.814991733943805e-06, "loss": 0.6449368000030518, "step": 1101 }, { "epoch": 1.4295214922952149, "grad_norm": 0.5841164588928223, "learning_rate": 5.807936788264778e-06, "loss": 0.6442397236824036, "step": 1102 }, { "epoch": 1.4308191403081914, "grad_norm": 0.6046425104141235, "learning_rate": 5.800880190559511e-06, "loss": 0.6141000986099243, "step": 1103 }, { "epoch": 1.4321167883211678, "grad_norm": 0.6180353760719299, "learning_rate": 5.79382195525696e-06, "loss": 0.7307353019714355, "step": 1104 }, { "epoch": 1.4334144363341443, "grad_norm": 0.5996196269989014, "learning_rate": 5.786762096789431e-06, "loss": 0.6220886707305908, "step": 1105 }, { "epoch": 1.4347120843471208, "grad_norm": 0.6037473678588867, "learning_rate": 5.779700629592547e-06, "loss": 0.7145535945892334, "step": 1106 }, { "epoch": 1.4360097323600973, "grad_norm": 0.5726904273033142, "learning_rate": 5.7726375681052205e-06, "loss": 0.6307674646377563, "step": 1107 }, { "epoch": 1.4373073803730738, "grad_norm": 0.6289665102958679, "learning_rate": 5.765572926769625e-06, "loss": 0.7094706296920776, "step": 1108 }, { "epoch": 1.4386050283860503, "grad_norm": 0.5811914801597595, "learning_rate": 5.758506720031163e-06, "loss": 0.6041115522384644, "step": 1109 }, { "epoch": 1.4399026763990268, "grad_norm": 0.5376439094543457, "learning_rate": 5.751438962338441e-06, "loss": 0.5803889036178589, "step": 1110 }, { "epoch": 1.4412003244120033, "grad_norm": 0.5952728390693665, "learning_rate": 5.744369668143233e-06, "loss": 0.6684442758560181, "step": 1111 }, { "epoch": 1.4424979724249798, "grad_norm": 0.5791693329811096, "learning_rate": 5.737298851900457e-06, "loss": 0.6404840350151062, "step": 1112 }, { "epoch": 1.4437956204379563, "grad_norm": 0.6007118225097656, "learning_rate": 5.730226528068142e-06, "loss": 0.6698148846626282, "step": 1113 }, { "epoch": 1.4450932684509326, "grad_norm": 0.613433301448822, "learning_rate": 5.7231527111074e-06, "loss": 0.7007705569267273, "step": 1114 }, { "epoch": 1.4463909164639093, "grad_norm": 0.5919564962387085, "learning_rate": 5.716077415482398e-06, "loss": 0.6769901514053345, "step": 1115 }, { "epoch": 1.4476885644768855, "grad_norm": 0.5912166833877563, "learning_rate": 5.709000655660324e-06, "loss": 0.6436672210693359, "step": 1116 }, { "epoch": 1.4489862124898623, "grad_norm": 0.5603325366973877, "learning_rate": 5.7019224461113585e-06, "loss": 0.5793130993843079, "step": 1117 }, { "epoch": 1.4502838605028385, "grad_norm": 0.611814558506012, "learning_rate": 5.694842801308651e-06, "loss": 0.6368833780288696, "step": 1118 }, { "epoch": 1.451581508515815, "grad_norm": 0.5689136385917664, "learning_rate": 5.687761735728282e-06, "loss": 0.6261428594589233, "step": 1119 }, { "epoch": 1.4528791565287915, "grad_norm": 0.6117684245109558, "learning_rate": 5.680679263849241e-06, "loss": 0.6463526487350464, "step": 1120 }, { "epoch": 1.454176804541768, "grad_norm": 0.5878109931945801, "learning_rate": 5.673595400153385e-06, "loss": 0.6132445335388184, "step": 1121 }, { "epoch": 1.4554744525547445, "grad_norm": 0.5826682448387146, "learning_rate": 5.666510159125427e-06, "loss": 0.6556754112243652, "step": 1122 }, { "epoch": 1.456772100567721, "grad_norm": 0.5753729939460754, "learning_rate": 5.65942355525289e-06, "loss": 0.6176761388778687, "step": 1123 }, { "epoch": 1.4580697485806975, "grad_norm": 0.7028788924217224, "learning_rate": 5.652335603026084e-06, "loss": 0.5802330374717712, "step": 1124 }, { "epoch": 1.459367396593674, "grad_norm": 0.5847388505935669, "learning_rate": 5.645246316938082e-06, "loss": 0.6626067161560059, "step": 1125 }, { "epoch": 1.4606650446066505, "grad_norm": 1.399409294128418, "learning_rate": 5.638155711484674e-06, "loss": 0.6308712959289551, "step": 1126 }, { "epoch": 1.461962692619627, "grad_norm": 0.602827250957489, "learning_rate": 5.631063801164356e-06, "loss": 0.6493173241615295, "step": 1127 }, { "epoch": 1.4632603406326035, "grad_norm": 0.7403953075408936, "learning_rate": 5.62397060047829e-06, "loss": 0.620072603225708, "step": 1128 }, { "epoch": 1.46455798864558, "grad_norm": 0.6334176063537598, "learning_rate": 5.6168761239302745e-06, "loss": 0.665931761264801, "step": 1129 }, { "epoch": 1.4658556366585564, "grad_norm": 0.6131840944290161, "learning_rate": 5.609780386026721e-06, "loss": 0.6492164731025696, "step": 1130 }, { "epoch": 1.4671532846715327, "grad_norm": 0.6045870780944824, "learning_rate": 5.6026834012766155e-06, "loss": 0.6135592460632324, "step": 1131 }, { "epoch": 1.4684509326845094, "grad_norm": 0.650088906288147, "learning_rate": 5.595585184191496e-06, "loss": 0.7170080542564392, "step": 1132 }, { "epoch": 1.4697485806974857, "grad_norm": 0.5771186351776123, "learning_rate": 5.58848574928542e-06, "loss": 0.6513093709945679, "step": 1133 }, { "epoch": 1.4710462287104624, "grad_norm": 0.7128145694732666, "learning_rate": 5.5813851110749365e-06, "loss": 0.6579954624176025, "step": 1134 }, { "epoch": 1.4723438767234387, "grad_norm": 0.5734491348266602, "learning_rate": 5.574283284079049e-06, "loss": 0.6137959361076355, "step": 1135 }, { "epoch": 1.4736415247364152, "grad_norm": 0.5757655501365662, "learning_rate": 5.567180282819201e-06, "loss": 0.6633074283599854, "step": 1136 }, { "epoch": 1.4749391727493917, "grad_norm": 0.5958343148231506, "learning_rate": 5.560076121819229e-06, "loss": 0.6766320466995239, "step": 1137 }, { "epoch": 1.4762368207623682, "grad_norm": 0.5708390474319458, "learning_rate": 5.552970815605347e-06, "loss": 0.6593270897865295, "step": 1138 }, { "epoch": 1.4775344687753447, "grad_norm": 0.5592367649078369, "learning_rate": 5.545864378706106e-06, "loss": 0.6107625961303711, "step": 1139 }, { "epoch": 1.4788321167883212, "grad_norm": 0.5908456444740295, "learning_rate": 5.53875682565237e-06, "loss": 0.612775444984436, "step": 1140 }, { "epoch": 1.4801297648012977, "grad_norm": 0.7283220291137695, "learning_rate": 5.5316481709772886e-06, "loss": 0.6324783563613892, "step": 1141 }, { "epoch": 1.4814274128142741, "grad_norm": 0.5963947176933289, "learning_rate": 5.524538429216258e-06, "loss": 0.6906737089157104, "step": 1142 }, { "epoch": 1.4827250608272506, "grad_norm": 0.6059021949768066, "learning_rate": 5.517427614906906e-06, "loss": 0.6746259331703186, "step": 1143 }, { "epoch": 1.4840227088402271, "grad_norm": 0.5953018069267273, "learning_rate": 5.510315742589042e-06, "loss": 0.6834631562232971, "step": 1144 }, { "epoch": 1.4853203568532036, "grad_norm": 0.5694923996925354, "learning_rate": 5.503202826804647e-06, "loss": 0.6960294246673584, "step": 1145 }, { "epoch": 1.4866180048661801, "grad_norm": 0.6007208228111267, "learning_rate": 5.496088882097836e-06, "loss": 0.657875657081604, "step": 1146 }, { "epoch": 1.4879156528791566, "grad_norm": 0.6081047654151917, "learning_rate": 5.488973923014821e-06, "loss": 0.6561139225959778, "step": 1147 }, { "epoch": 1.4892133008921329, "grad_norm": 0.5819503664970398, "learning_rate": 5.4818579641038974e-06, "loss": 0.6176397204399109, "step": 1148 }, { "epoch": 1.4905109489051096, "grad_norm": 0.6077326536178589, "learning_rate": 5.474741019915395e-06, "loss": 0.6847512722015381, "step": 1149 }, { "epoch": 1.4918085969180859, "grad_norm": 0.6074263453483582, "learning_rate": 5.467623105001667e-06, "loss": 0.6360629200935364, "step": 1150 }, { "epoch": 1.4918085969180859, "eval_loss": 0.6826658844947815, "eval_runtime": 73.0405, "eval_samples_per_second": 71.084, "eval_steps_per_second": 8.885, "step": 1150 }, { "epoch": 1.4931062449310626, "grad_norm": 0.5855403542518616, "learning_rate": 5.460504233917047e-06, "loss": 0.6704986095428467, "step": 1151 }, { "epoch": 1.4944038929440389, "grad_norm": 0.6127449870109558, "learning_rate": 5.453384421217823e-06, "loss": 0.6719274520874023, "step": 1152 }, { "epoch": 1.4957015409570154, "grad_norm": 0.5484548211097717, "learning_rate": 5.446263681462213e-06, "loss": 0.6012224555015564, "step": 1153 }, { "epoch": 1.4969991889699918, "grad_norm": 0.5728206038475037, "learning_rate": 5.439142029210323e-06, "loss": 0.6711239218711853, "step": 1154 }, { "epoch": 1.4982968369829683, "grad_norm": 0.5789787769317627, "learning_rate": 5.4320194790241335e-06, "loss": 0.5949071645736694, "step": 1155 }, { "epoch": 1.4995944849959448, "grad_norm": 0.5778141021728516, "learning_rate": 5.424896045467455e-06, "loss": 0.6263710260391235, "step": 1156 }, { "epoch": 1.5008921330089213, "grad_norm": 0.5851665139198303, "learning_rate": 5.417771743105908e-06, "loss": 0.690178632736206, "step": 1157 }, { "epoch": 1.5021897810218978, "grad_norm": 0.620339035987854, "learning_rate": 5.4106465865068846e-06, "loss": 0.6553722620010376, "step": 1158 }, { "epoch": 1.5034874290348743, "grad_norm": 0.5484940409660339, "learning_rate": 5.403520590239527e-06, "loss": 0.5462528467178345, "step": 1159 }, { "epoch": 1.5047850770478508, "grad_norm": 0.62648606300354, "learning_rate": 5.396393768874696e-06, "loss": 0.7103927135467529, "step": 1160 }, { "epoch": 1.5060827250608273, "grad_norm": 0.5696239471435547, "learning_rate": 5.389266136984939e-06, "loss": 0.6234554648399353, "step": 1161 }, { "epoch": 1.5073803730738038, "grad_norm": 0.6027652025222778, "learning_rate": 5.382137709144454e-06, "loss": 0.6729198694229126, "step": 1162 }, { "epoch": 1.50867802108678, "grad_norm": 0.5693642497062683, "learning_rate": 5.3750084999290755e-06, "loss": 0.6457726359367371, "step": 1163 }, { "epoch": 1.5099756690997568, "grad_norm": 1.6674511432647705, "learning_rate": 5.3678785239162305e-06, "loss": 0.656345009803772, "step": 1164 }, { "epoch": 1.511273317112733, "grad_norm": 0.5577940344810486, "learning_rate": 5.360747795684916e-06, "loss": 0.5705595016479492, "step": 1165 }, { "epoch": 1.5125709651257098, "grad_norm": 0.5919134616851807, "learning_rate": 5.353616329815667e-06, "loss": 0.6972566246986389, "step": 1166 }, { "epoch": 1.513868613138686, "grad_norm": 0.6095024347305298, "learning_rate": 5.346484140890523e-06, "loss": 0.6107922196388245, "step": 1167 }, { "epoch": 1.5151662611516628, "grad_norm": 0.5990864634513855, "learning_rate": 5.339351243493008e-06, "loss": 0.5962531566619873, "step": 1168 }, { "epoch": 1.516463909164639, "grad_norm": 0.5995983481407166, "learning_rate": 5.332217652208093e-06, "loss": 0.6228233575820923, "step": 1169 }, { "epoch": 1.5177615571776155, "grad_norm": 0.5965218544006348, "learning_rate": 5.325083381622165e-06, "loss": 0.6963210105895996, "step": 1170 }, { "epoch": 1.519059205190592, "grad_norm": 0.5758861303329468, "learning_rate": 5.317948446322999e-06, "loss": 0.58036869764328, "step": 1171 }, { "epoch": 1.5203568532035685, "grad_norm": 0.5857213139533997, "learning_rate": 5.310812860899737e-06, "loss": 0.6398880481719971, "step": 1172 }, { "epoch": 1.521654501216545, "grad_norm": 0.706536054611206, "learning_rate": 5.303676639942841e-06, "loss": 0.6162217855453491, "step": 1173 }, { "epoch": 1.5229521492295215, "grad_norm": 0.5781589150428772, "learning_rate": 5.296539798044078e-06, "loss": 0.6084649562835693, "step": 1174 }, { "epoch": 1.524249797242498, "grad_norm": 0.5943130850791931, "learning_rate": 5.289402349796484e-06, "loss": 0.6497021913528442, "step": 1175 }, { "epoch": 1.5255474452554745, "grad_norm": 0.5641393065452576, "learning_rate": 5.282264309794334e-06, "loss": 0.5834084749221802, "step": 1176 }, { "epoch": 1.526845093268451, "grad_norm": 0.5564937591552734, "learning_rate": 5.2751256926331115e-06, "loss": 0.6279217004776001, "step": 1177 }, { "epoch": 1.5281427412814275, "grad_norm": 0.5945193767547607, "learning_rate": 5.267986512909484e-06, "loss": 0.6333688497543335, "step": 1178 }, { "epoch": 1.529440389294404, "grad_norm": 0.6081971526145935, "learning_rate": 5.2608467852212665e-06, "loss": 0.6803103685379028, "step": 1179 }, { "epoch": 1.5307380373073802, "grad_norm": 0.584886908531189, "learning_rate": 5.253706524167395e-06, "loss": 0.6653470993041992, "step": 1180 }, { "epoch": 1.532035685320357, "grad_norm": 0.8528439998626709, "learning_rate": 5.246565744347894e-06, "loss": 0.6093430519104004, "step": 1181 }, { "epoch": 1.5333333333333332, "grad_norm": 0.573440432548523, "learning_rate": 5.2394244603638536e-06, "loss": 0.6251604557037354, "step": 1182 }, { "epoch": 1.53463098134631, "grad_norm": 0.5646257996559143, "learning_rate": 5.232282686817392e-06, "loss": 0.5792976021766663, "step": 1183 }, { "epoch": 1.5359286293592862, "grad_norm": 0.5741854310035706, "learning_rate": 5.2251404383116265e-06, "loss": 0.6484105587005615, "step": 1184 }, { "epoch": 1.537226277372263, "grad_norm": 0.5606357455253601, "learning_rate": 5.217997729450649e-06, "loss": 0.6315451860427856, "step": 1185 }, { "epoch": 1.5385239253852392, "grad_norm": 0.5854267477989197, "learning_rate": 5.21085457483949e-06, "loss": 0.6010391712188721, "step": 1186 }, { "epoch": 1.5398215733982157, "grad_norm": 0.6120538711547852, "learning_rate": 5.203710989084093e-06, "loss": 0.6872812509536743, "step": 1187 }, { "epoch": 1.5411192214111922, "grad_norm": 0.6018205881118774, "learning_rate": 5.196566986791286e-06, "loss": 0.6842239499092102, "step": 1188 }, { "epoch": 1.5424168694241687, "grad_norm": 0.5673507452011108, "learning_rate": 5.189422582568742e-06, "loss": 0.6135258674621582, "step": 1189 }, { "epoch": 1.5437145174371452, "grad_norm": 0.5736320614814758, "learning_rate": 5.182277791024959e-06, "loss": 0.6442878246307373, "step": 1190 }, { "epoch": 1.5450121654501217, "grad_norm": 0.5806821584701538, "learning_rate": 5.175132626769229e-06, "loss": 0.6409611701965332, "step": 1191 }, { "epoch": 1.5463098134630981, "grad_norm": 0.6098542213439941, "learning_rate": 5.167987104411605e-06, "loss": 0.6895368695259094, "step": 1192 }, { "epoch": 1.5476074614760746, "grad_norm": 0.6138260364532471, "learning_rate": 5.160841238562872e-06, "loss": 0.6403982043266296, "step": 1193 }, { "epoch": 1.5489051094890511, "grad_norm": 0.5820956826210022, "learning_rate": 5.153695043834513e-06, "loss": 0.6204026937484741, "step": 1194 }, { "epoch": 1.5502027575020276, "grad_norm": 0.5773366093635559, "learning_rate": 5.146548534838691e-06, "loss": 0.645720899105072, "step": 1195 }, { "epoch": 1.5515004055150041, "grad_norm": 0.5759880542755127, "learning_rate": 5.139401726188208e-06, "loss": 0.5854007601737976, "step": 1196 }, { "epoch": 1.5527980535279804, "grad_norm": 0.584076464176178, "learning_rate": 5.132254632496477e-06, "loss": 0.662139892578125, "step": 1197 }, { "epoch": 1.554095701540957, "grad_norm": 0.6095874905586243, "learning_rate": 5.125107268377498e-06, "loss": 0.6662768125534058, "step": 1198 }, { "epoch": 1.5553933495539334, "grad_norm": 0.5676849484443665, "learning_rate": 5.117959648445821e-06, "loss": 0.593256413936615, "step": 1199 }, { "epoch": 1.55669099756691, "grad_norm": 0.9843289852142334, "learning_rate": 5.1108117873165175e-06, "loss": 0.6919536590576172, "step": 1200 }, { "epoch": 1.5579886455798864, "grad_norm": 0.5795591473579407, "learning_rate": 5.1036636996051556e-06, "loss": 0.6274605989456177, "step": 1201 }, { "epoch": 1.559286293592863, "grad_norm": 0.5853375196456909, "learning_rate": 5.096515399927767e-06, "loss": 0.6070197820663452, "step": 1202 }, { "epoch": 1.5605839416058394, "grad_norm": 0.6098043918609619, "learning_rate": 5.089366902900813e-06, "loss": 0.6619631052017212, "step": 1203 }, { "epoch": 1.5618815896188158, "grad_norm": 0.6142205595970154, "learning_rate": 5.082218223141162e-06, "loss": 0.6737958192825317, "step": 1204 }, { "epoch": 1.5631792376317923, "grad_norm": 0.5625759363174438, "learning_rate": 5.075069375266055e-06, "loss": 0.590381383895874, "step": 1205 }, { "epoch": 1.5644768856447688, "grad_norm": 0.8771416544914246, "learning_rate": 5.067920373893075e-06, "loss": 0.5482794046401978, "step": 1206 }, { "epoch": 1.5657745336577453, "grad_norm": 0.8982309699058533, "learning_rate": 5.060771233640122e-06, "loss": 0.6464008092880249, "step": 1207 }, { "epoch": 1.5670721816707218, "grad_norm": 0.6009715795516968, "learning_rate": 5.0536219691253776e-06, "loss": 0.5735194683074951, "step": 1208 }, { "epoch": 1.5683698296836983, "grad_norm": 0.5980544686317444, "learning_rate": 5.046472594967279e-06, "loss": 0.66939377784729, "step": 1209 }, { "epoch": 1.5696674776966748, "grad_norm": 0.6162261962890625, "learning_rate": 5.039323125784485e-06, "loss": 0.6994204521179199, "step": 1210 }, { "epoch": 1.5709651257096513, "grad_norm": 0.617485761642456, "learning_rate": 5.0321735761958515e-06, "loss": 0.633686363697052, "step": 1211 }, { "epoch": 1.5722627737226276, "grad_norm": 0.5924466848373413, "learning_rate": 5.025023960820399e-06, "loss": 0.6124377250671387, "step": 1212 }, { "epoch": 1.5735604217356043, "grad_norm": 0.6040006279945374, "learning_rate": 5.01787429427728e-06, "loss": 0.6491550207138062, "step": 1213 }, { "epoch": 1.5748580697485806, "grad_norm": 0.5883519649505615, "learning_rate": 5.010724591185752e-06, "loss": 0.6150457262992859, "step": 1214 }, { "epoch": 1.5761557177615573, "grad_norm": 0.6369590759277344, "learning_rate": 5.003574866165149e-06, "loss": 0.6079261898994446, "step": 1215 }, { "epoch": 1.5774533657745335, "grad_norm": 0.6027874946594238, "learning_rate": 4.9964251338348515e-06, "loss": 0.6851716637611389, "step": 1216 }, { "epoch": 1.5787510137875103, "grad_norm": 0.5862027406692505, "learning_rate": 4.989275408814251e-06, "loss": 0.5923515558242798, "step": 1217 }, { "epoch": 1.5800486618004865, "grad_norm": 0.6328719854354858, "learning_rate": 4.982125705722722e-06, "loss": 0.6643452644348145, "step": 1218 }, { "epoch": 1.5813463098134632, "grad_norm": 0.6100243330001831, "learning_rate": 4.974976039179604e-06, "loss": 0.6416760683059692, "step": 1219 }, { "epoch": 1.5826439578264395, "grad_norm": 0.5908761620521545, "learning_rate": 4.967826423804151e-06, "loss": 0.643882155418396, "step": 1220 }, { "epoch": 1.583941605839416, "grad_norm": 0.5938880443572998, "learning_rate": 4.960676874215518e-06, "loss": 0.6157772541046143, "step": 1221 }, { "epoch": 1.5852392538523925, "grad_norm": 0.5930868983268738, "learning_rate": 4.953527405032723e-06, "loss": 0.5862378478050232, "step": 1222 }, { "epoch": 1.586536901865369, "grad_norm": 0.589255690574646, "learning_rate": 4.946378030874625e-06, "loss": 0.6135423183441162, "step": 1223 }, { "epoch": 1.5878345498783455, "grad_norm": 0.5754698514938354, "learning_rate": 4.9392287663598785e-06, "loss": 0.6066054701805115, "step": 1224 }, { "epoch": 1.589132197891322, "grad_norm": 0.6168340444564819, "learning_rate": 4.932079626106926e-06, "loss": 0.683946967124939, "step": 1225 }, { "epoch": 1.5904298459042985, "grad_norm": 0.5985932350158691, "learning_rate": 4.924930624733947e-06, "loss": 0.6772314310073853, "step": 1226 }, { "epoch": 1.591727493917275, "grad_norm": 0.6024285554885864, "learning_rate": 4.91778177685884e-06, "loss": 0.652093768119812, "step": 1227 }, { "epoch": 1.5930251419302515, "grad_norm": 0.6394546627998352, "learning_rate": 4.910633097099188e-06, "loss": 0.6307955384254456, "step": 1228 }, { "epoch": 1.5943227899432277, "grad_norm": 0.5471766591072083, "learning_rate": 4.903484600072236e-06, "loss": 0.5805978775024414, "step": 1229 }, { "epoch": 1.5956204379562045, "grad_norm": 0.5722350478172302, "learning_rate": 4.896336300394845e-06, "loss": 0.6355024576187134, "step": 1230 }, { "epoch": 1.5969180859691807, "grad_norm": 0.6039298176765442, "learning_rate": 4.889188212683483e-06, "loss": 0.6441288590431213, "step": 1231 }, { "epoch": 1.5982157339821574, "grad_norm": 0.6237229704856873, "learning_rate": 4.882040351554181e-06, "loss": 0.6681591272354126, "step": 1232 }, { "epoch": 1.5995133819951337, "grad_norm": 0.6051374673843384, "learning_rate": 4.874892731622503e-06, "loss": 0.6615642309188843, "step": 1233 }, { "epoch": 1.6008110300081104, "grad_norm": 0.5937628746032715, "learning_rate": 4.867745367503524e-06, "loss": 0.6506084203720093, "step": 1234 }, { "epoch": 1.6021086780210867, "grad_norm": 0.5851325988769531, "learning_rate": 4.860598273811793e-06, "loss": 0.6443929076194763, "step": 1235 }, { "epoch": 1.6034063260340634, "grad_norm": 0.5777382850646973, "learning_rate": 4.8534514651613104e-06, "loss": 0.635840892791748, "step": 1236 }, { "epoch": 1.6047039740470397, "grad_norm": 0.5909644365310669, "learning_rate": 4.846304956165488e-06, "loss": 0.6581849455833435, "step": 1237 }, { "epoch": 1.6060016220600162, "grad_norm": 0.5992142558097839, "learning_rate": 4.83915876143713e-06, "loss": 0.6690875291824341, "step": 1238 }, { "epoch": 1.6072992700729927, "grad_norm": 1.2001910209655762, "learning_rate": 4.832012895588395e-06, "loss": 0.6264456510543823, "step": 1239 }, { "epoch": 1.6085969180859692, "grad_norm": 0.6141691207885742, "learning_rate": 4.824867373230772e-06, "loss": 0.670561671257019, "step": 1240 }, { "epoch": 1.6098945660989457, "grad_norm": 0.5834086537361145, "learning_rate": 4.817722208975041e-06, "loss": 0.6045785546302795, "step": 1241 }, { "epoch": 1.6111922141119221, "grad_norm": 0.6060406565666199, "learning_rate": 4.81057741743126e-06, "loss": 0.5803914666175842, "step": 1242 }, { "epoch": 1.6124898621248986, "grad_norm": 0.5703381299972534, "learning_rate": 4.8034330132087155e-06, "loss": 0.6377118825912476, "step": 1243 }, { "epoch": 1.6137875101378751, "grad_norm": 0.6010227203369141, "learning_rate": 4.7962890109159085e-06, "loss": 0.6981620788574219, "step": 1244 }, { "epoch": 1.6150851581508516, "grad_norm": 0.6107721924781799, "learning_rate": 4.789145425160511e-06, "loss": 0.6511063575744629, "step": 1245 }, { "epoch": 1.616382806163828, "grad_norm": 0.5982344150543213, "learning_rate": 4.782002270549354e-06, "loss": 0.6058223247528076, "step": 1246 }, { "epoch": 1.6176804541768046, "grad_norm": 0.7359511256217957, "learning_rate": 4.774859561688374e-06, "loss": 0.7255959510803223, "step": 1247 }, { "epoch": 1.6189781021897809, "grad_norm": 0.6240600347518921, "learning_rate": 4.767717313182611e-06, "loss": 0.695855975151062, "step": 1248 }, { "epoch": 1.6202757502027576, "grad_norm": 0.6217120885848999, "learning_rate": 4.760575539636147e-06, "loss": 0.7245144844055176, "step": 1249 }, { "epoch": 1.6215733982157339, "grad_norm": 0.6095402240753174, "learning_rate": 4.753434255652108e-06, "loss": 0.6345319151878357, "step": 1250 }, { "epoch": 1.6228710462287106, "grad_norm": 0.5852973461151123, "learning_rate": 4.746293475832607e-06, "loss": 0.7055230736732483, "step": 1251 }, { "epoch": 1.6241686942416869, "grad_norm": 0.5857930779457092, "learning_rate": 4.739153214778735e-06, "loss": 0.611079216003418, "step": 1252 }, { "epoch": 1.6254663422546636, "grad_norm": 0.5896874070167542, "learning_rate": 4.732013487090517e-06, "loss": 0.6760262250900269, "step": 1253 }, { "epoch": 1.6267639902676398, "grad_norm": 0.5715303421020508, "learning_rate": 4.72487430736689e-06, "loss": 0.6258687376976013, "step": 1254 }, { "epoch": 1.6280616382806163, "grad_norm": 0.6083521246910095, "learning_rate": 4.7177356902056675e-06, "loss": 0.6745297908782959, "step": 1255 }, { "epoch": 1.6293592862935928, "grad_norm": 0.5798436403274536, "learning_rate": 4.7105976502035175e-06, "loss": 0.5955469608306885, "step": 1256 }, { "epoch": 1.6306569343065693, "grad_norm": 0.5836136341094971, "learning_rate": 4.703460201955924e-06, "loss": 0.6397416591644287, "step": 1257 }, { "epoch": 1.6319545823195458, "grad_norm": 0.5983015894889832, "learning_rate": 4.696323360057162e-06, "loss": 0.6736359596252441, "step": 1258 }, { "epoch": 1.6332522303325223, "grad_norm": 0.5725530982017517, "learning_rate": 4.689187139100265e-06, "loss": 0.6878089904785156, "step": 1259 }, { "epoch": 1.6345498783454988, "grad_norm": 0.5805061459541321, "learning_rate": 4.682051553677001e-06, "loss": 0.6194028854370117, "step": 1260 }, { "epoch": 1.6358475263584753, "grad_norm": 0.6036574840545654, "learning_rate": 4.6749166183778375e-06, "loss": 0.634255588054657, "step": 1261 }, { "epoch": 1.6371451743714518, "grad_norm": 0.8983334898948669, "learning_rate": 4.667782347791908e-06, "loss": 0.6297205686569214, "step": 1262 }, { "epoch": 1.638442822384428, "grad_norm": 0.5956529378890991, "learning_rate": 4.660648756506993e-06, "loss": 0.6427313089370728, "step": 1263 }, { "epoch": 1.6397404703974048, "grad_norm": 0.5881230235099792, "learning_rate": 4.653515859109478e-06, "loss": 0.6450825929641724, "step": 1264 }, { "epoch": 1.641038118410381, "grad_norm": 0.5867661833763123, "learning_rate": 4.646383670184336e-06, "loss": 0.6814026832580566, "step": 1265 }, { "epoch": 1.6423357664233578, "grad_norm": 0.6160328388214111, "learning_rate": 4.639252204315086e-06, "loss": 0.6689074039459229, "step": 1266 }, { "epoch": 1.643633414436334, "grad_norm": 0.582465410232544, "learning_rate": 4.632121476083772e-06, "loss": 0.6467956304550171, "step": 1267 }, { "epoch": 1.6449310624493108, "grad_norm": 0.5506557822227478, "learning_rate": 4.624991500070925e-06, "loss": 0.6649973392486572, "step": 1268 }, { "epoch": 1.646228710462287, "grad_norm": 0.600159227848053, "learning_rate": 4.617862290855548e-06, "loss": 0.6144022345542908, "step": 1269 }, { "epoch": 1.6475263584752637, "grad_norm": 1.0451817512512207, "learning_rate": 4.610733863015063e-06, "loss": 0.6827117800712585, "step": 1270 }, { "epoch": 1.64882400648824, "grad_norm": 0.5652205944061279, "learning_rate": 4.6036062311253055e-06, "loss": 0.5971782207489014, "step": 1271 }, { "epoch": 1.6501216545012165, "grad_norm": 0.686071515083313, "learning_rate": 4.596479409760474e-06, "loss": 0.5615164041519165, "step": 1272 }, { "epoch": 1.651419302514193, "grad_norm": 0.5449540019035339, "learning_rate": 4.589353413493118e-06, "loss": 0.6300219297409058, "step": 1273 }, { "epoch": 1.6527169505271695, "grad_norm": 0.6144797205924988, "learning_rate": 4.582228256894093e-06, "loss": 0.6373116970062256, "step": 1274 }, { "epoch": 1.654014598540146, "grad_norm": 0.6170778274536133, "learning_rate": 4.575103954532547e-06, "loss": 0.6746265888214111, "step": 1275 }, { "epoch": 1.6553122465531225, "grad_norm": 0.5726920366287231, "learning_rate": 4.567980520975867e-06, "loss": 0.598582923412323, "step": 1276 }, { "epoch": 1.656609894566099, "grad_norm": 0.59462571144104, "learning_rate": 4.560857970789679e-06, "loss": 0.5969716906547546, "step": 1277 }, { "epoch": 1.6579075425790755, "grad_norm": 0.5755953192710876, "learning_rate": 4.553736318537789e-06, "loss": 0.6542321443557739, "step": 1278 }, { "epoch": 1.659205190592052, "grad_norm": 0.6138618588447571, "learning_rate": 4.546615578782178e-06, "loss": 0.6415365934371948, "step": 1279 }, { "epoch": 1.6605028386050282, "grad_norm": 0.5503448247909546, "learning_rate": 4.5394957660829554e-06, "loss": 0.6184664964675903, "step": 1280 }, { "epoch": 1.661800486618005, "grad_norm": 0.5893129110336304, "learning_rate": 4.532376894998335e-06, "loss": 0.6324410438537598, "step": 1281 }, { "epoch": 1.6630981346309812, "grad_norm": 0.6124705672264099, "learning_rate": 4.5252589800846054e-06, "loss": 0.6756390333175659, "step": 1282 }, { "epoch": 1.664395782643958, "grad_norm": 0.598412275314331, "learning_rate": 4.518142035896106e-06, "loss": 0.7126625776290894, "step": 1283 }, { "epoch": 1.6656934306569342, "grad_norm": 0.599096417427063, "learning_rate": 4.5110260769851804e-06, "loss": 0.6402862071990967, "step": 1284 }, { "epoch": 1.666991078669911, "grad_norm": 0.5952857136726379, "learning_rate": 4.503911117902167e-06, "loss": 0.6510819792747498, "step": 1285 }, { "epoch": 1.6682887266828872, "grad_norm": 0.5893689393997192, "learning_rate": 4.496797173195354e-06, "loss": 0.6236964464187622, "step": 1286 }, { "epoch": 1.669586374695864, "grad_norm": 0.5871599316596985, "learning_rate": 4.489684257410959e-06, "loss": 0.6143825054168701, "step": 1287 }, { "epoch": 1.6708840227088402, "grad_norm": 0.5756003260612488, "learning_rate": 4.482572385093096e-06, "loss": 0.6664775609970093, "step": 1288 }, { "epoch": 1.6721816707218167, "grad_norm": 0.6174732446670532, "learning_rate": 4.475461570783741e-06, "loss": 0.6171724200248718, "step": 1289 }, { "epoch": 1.6734793187347932, "grad_norm": 0.6114921569824219, "learning_rate": 4.468351829022713e-06, "loss": 0.7615275382995605, "step": 1290 }, { "epoch": 1.6747769667477697, "grad_norm": 0.6558356285095215, "learning_rate": 4.46124317434763e-06, "loss": 0.6879911422729492, "step": 1291 }, { "epoch": 1.6760746147607462, "grad_norm": 0.5599299669265747, "learning_rate": 4.454135621293895e-06, "loss": 0.6413300633430481, "step": 1292 }, { "epoch": 1.6773722627737226, "grad_norm": 0.5664532780647278, "learning_rate": 4.447029184394654e-06, "loss": 0.5328360795974731, "step": 1293 }, { "epoch": 1.6786699107866991, "grad_norm": 0.5689435005187988, "learning_rate": 4.439923878180772e-06, "loss": 0.6179879903793335, "step": 1294 }, { "epoch": 1.6799675587996756, "grad_norm": 0.7659060955047607, "learning_rate": 4.4328197171808e-06, "loss": 0.6246920824050903, "step": 1295 }, { "epoch": 1.6812652068126521, "grad_norm": 0.5884883403778076, "learning_rate": 4.425716715920952e-06, "loss": 0.6561876535415649, "step": 1296 }, { "epoch": 1.6825628548256284, "grad_norm": 0.604040801525116, "learning_rate": 4.418614888925064e-06, "loss": 0.6797306537628174, "step": 1297 }, { "epoch": 1.683860502838605, "grad_norm": 0.6084474921226501, "learning_rate": 4.4115142507145806e-06, "loss": 0.6703431606292725, "step": 1298 }, { "epoch": 1.6851581508515814, "grad_norm": 0.5863416790962219, "learning_rate": 4.4044148158085046e-06, "loss": 0.6162433624267578, "step": 1299 }, { "epoch": 1.686455798864558, "grad_norm": 0.6356022953987122, "learning_rate": 4.397316598723385e-06, "loss": 0.7044586539268494, "step": 1300 }, { "epoch": 1.6877534468775344, "grad_norm": 0.625541627407074, "learning_rate": 4.39021961397328e-06, "loss": 0.6772735714912415, "step": 1301 }, { "epoch": 1.689051094890511, "grad_norm": 0.6222056746482849, "learning_rate": 4.383123876069726e-06, "loss": 0.6994260549545288, "step": 1302 }, { "epoch": 1.6903487429034874, "grad_norm": 0.6140106916427612, "learning_rate": 4.376029399521711e-06, "loss": 0.6723775863647461, "step": 1303 }, { "epoch": 1.691646390916464, "grad_norm": 0.665780782699585, "learning_rate": 4.368936198835646e-06, "loss": 0.6295307278633118, "step": 1304 }, { "epoch": 1.6929440389294403, "grad_norm": 0.5935512781143188, "learning_rate": 4.361844288515327e-06, "loss": 0.6478678584098816, "step": 1305 }, { "epoch": 1.6942416869424168, "grad_norm": 0.6001803874969482, "learning_rate": 4.354753683061921e-06, "loss": 0.6501032710075378, "step": 1306 }, { "epoch": 1.6955393349553933, "grad_norm": 0.5884422063827515, "learning_rate": 4.347664396973917e-06, "loss": 0.5854666829109192, "step": 1307 }, { "epoch": 1.6968369829683698, "grad_norm": 0.5774276256561279, "learning_rate": 4.340576444747114e-06, "loss": 0.6706461906433105, "step": 1308 }, { "epoch": 1.6981346309813463, "grad_norm": 0.6317939162254333, "learning_rate": 4.333489840874575e-06, "loss": 0.6367801427841187, "step": 1309 }, { "epoch": 1.6994322789943228, "grad_norm": 0.5990278720855713, "learning_rate": 4.326404599846618e-06, "loss": 0.6113296747207642, "step": 1310 }, { "epoch": 1.7007299270072993, "grad_norm": 0.5930926203727722, "learning_rate": 4.319320736150762e-06, "loss": 0.658935546875, "step": 1311 }, { "epoch": 1.7020275750202758, "grad_norm": 0.5893100500106812, "learning_rate": 4.3122382642717196e-06, "loss": 0.6707964539527893, "step": 1312 }, { "epoch": 1.7033252230332523, "grad_norm": 0.6219534277915955, "learning_rate": 4.305157198691351e-06, "loss": 0.6915128231048584, "step": 1313 }, { "epoch": 1.7046228710462286, "grad_norm": 0.5844510197639465, "learning_rate": 4.298077553888644e-06, "loss": 0.6463670134544373, "step": 1314 }, { "epoch": 1.7059205190592053, "grad_norm": 0.590699315071106, "learning_rate": 4.290999344339678e-06, "loss": 0.6447714567184448, "step": 1315 }, { "epoch": 1.7072181670721815, "grad_norm": 0.7812482714653015, "learning_rate": 4.283922584517603e-06, "loss": 0.6600894927978516, "step": 1316 }, { "epoch": 1.7085158150851583, "grad_norm": 0.5863601565361023, "learning_rate": 4.276847288892601e-06, "loss": 0.6242765784263611, "step": 1317 }, { "epoch": 1.7098134630981345, "grad_norm": 0.5812450647354126, "learning_rate": 4.269773471931858e-06, "loss": 0.6475106477737427, "step": 1318 }, { "epoch": 1.7111111111111112, "grad_norm": 0.5987546443939209, "learning_rate": 4.262701148099544e-06, "loss": 0.6834150552749634, "step": 1319 }, { "epoch": 1.7124087591240875, "grad_norm": 0.5713450312614441, "learning_rate": 4.255630331856768e-06, "loss": 0.5877612829208374, "step": 1320 }, { "epoch": 1.7137064071370642, "grad_norm": 0.5582994818687439, "learning_rate": 4.248561037661561e-06, "loss": 0.5848795175552368, "step": 1321 }, { "epoch": 1.7150040551500405, "grad_norm": 0.5713660717010498, "learning_rate": 4.241493279968838e-06, "loss": 0.6386708617210388, "step": 1322 }, { "epoch": 1.716301703163017, "grad_norm": 0.5673105716705322, "learning_rate": 4.234427073230377e-06, "loss": 0.6179746389389038, "step": 1323 }, { "epoch": 1.7175993511759935, "grad_norm": 0.5679452419281006, "learning_rate": 4.22736243189478e-06, "loss": 0.641147255897522, "step": 1324 }, { "epoch": 1.71889699918897, "grad_norm": 0.608302652835846, "learning_rate": 4.220299370407454e-06, "loss": 0.6888396143913269, "step": 1325 }, { "epoch": 1.7201946472019465, "grad_norm": 0.5650665163993835, "learning_rate": 4.2132379032105695e-06, "loss": 0.651650607585907, "step": 1326 }, { "epoch": 1.721492295214923, "grad_norm": 0.561650812625885, "learning_rate": 4.206178044743041e-06, "loss": 0.6115202307701111, "step": 1327 }, { "epoch": 1.7227899432278995, "grad_norm": 0.5860607624053955, "learning_rate": 4.19911980944049e-06, "loss": 0.6547002792358398, "step": 1328 }, { "epoch": 1.724087591240876, "grad_norm": 0.7003436088562012, "learning_rate": 4.1920632117352235e-06, "loss": 0.6392462253570557, "step": 1329 }, { "epoch": 1.7253852392538525, "grad_norm": 0.5677862763404846, "learning_rate": 4.185008266056195e-06, "loss": 0.5821945667266846, "step": 1330 }, { "epoch": 1.7266828872668287, "grad_norm": 0.587795615196228, "learning_rate": 4.177954986828987e-06, "loss": 0.6519031524658203, "step": 1331 }, { "epoch": 1.7279805352798054, "grad_norm": 0.5895066857337952, "learning_rate": 4.170903388475766e-06, "loss": 0.6622262597084045, "step": 1332 }, { "epoch": 1.7292781832927817, "grad_norm": 0.5911295413970947, "learning_rate": 4.163853485415269e-06, "loss": 0.6385645866394043, "step": 1333 }, { "epoch": 1.7305758313057584, "grad_norm": 0.6040472984313965, "learning_rate": 4.156805292062762e-06, "loss": 0.6997763514518738, "step": 1334 }, { "epoch": 1.7318734793187347, "grad_norm": 0.6030855178833008, "learning_rate": 4.1497588228300165e-06, "loss": 0.6099704504013062, "step": 1335 }, { "epoch": 1.7331711273317114, "grad_norm": 0.5850874781608582, "learning_rate": 4.142714092125277e-06, "loss": 0.5748507380485535, "step": 1336 }, { "epoch": 1.7344687753446877, "grad_norm": 0.5881203413009644, "learning_rate": 4.135671114353239e-06, "loss": 0.6896364688873291, "step": 1337 }, { "epoch": 1.7357664233576642, "grad_norm": 0.5428244471549988, "learning_rate": 4.128629903915004e-06, "loss": 0.5673160552978516, "step": 1338 }, { "epoch": 1.7370640713706407, "grad_norm": 0.6348845362663269, "learning_rate": 4.121590475208071e-06, "loss": 0.6452966928482056, "step": 1339 }, { "epoch": 1.7383617193836172, "grad_norm": 0.5799127221107483, "learning_rate": 4.114552842626285e-06, "loss": 0.626937747001648, "step": 1340 }, { "epoch": 1.7396593673965937, "grad_norm": 0.5999795198440552, "learning_rate": 4.107517020559827e-06, "loss": 0.6316832900047302, "step": 1341 }, { "epoch": 1.7409570154095702, "grad_norm": 3.404263734817505, "learning_rate": 4.1004830233951696e-06, "loss": 0.6446040868759155, "step": 1342 }, { "epoch": 1.7422546634225466, "grad_norm": 0.5750575661659241, "learning_rate": 4.0934508655150585e-06, "loss": 0.6410173177719116, "step": 1343 }, { "epoch": 1.7435523114355231, "grad_norm": 0.612946093082428, "learning_rate": 4.086420561298476e-06, "loss": 0.7256200313568115, "step": 1344 }, { "epoch": 1.7448499594484996, "grad_norm": 0.5811024904251099, "learning_rate": 4.079392125120613e-06, "loss": 0.6546262502670288, "step": 1345 }, { "epoch": 1.7461476074614761, "grad_norm": 0.6089962124824524, "learning_rate": 4.072365571352847e-06, "loss": 0.5643700957298279, "step": 1346 }, { "epoch": 1.7474452554744526, "grad_norm": 0.5598763227462769, "learning_rate": 4.065340914362697e-06, "loss": 0.6210203170776367, "step": 1347 }, { "epoch": 1.748742903487429, "grad_norm": 0.5718949437141418, "learning_rate": 4.058318168513813e-06, "loss": 0.6246052980422974, "step": 1348 }, { "epoch": 1.7500405515004056, "grad_norm": 0.5816182494163513, "learning_rate": 4.05129734816593e-06, "loss": 0.6502724289894104, "step": 1349 }, { "epoch": 1.7513381995133819, "grad_norm": 0.6006066799163818, "learning_rate": 4.04427846767485e-06, "loss": 0.6196832060813904, "step": 1350 }, { "epoch": 1.7526358475263586, "grad_norm": 0.6209701299667358, "learning_rate": 4.037261541392405e-06, "loss": 0.6615033149719238, "step": 1351 }, { "epoch": 1.7539334955393349, "grad_norm": 0.5778906345367432, "learning_rate": 4.030246583666437e-06, "loss": 0.600303053855896, "step": 1352 }, { "epoch": 1.7552311435523116, "grad_norm": 0.5654350519180298, "learning_rate": 4.023233608840755e-06, "loss": 0.6526889801025391, "step": 1353 }, { "epoch": 1.7565287915652879, "grad_norm": 0.604720413684845, "learning_rate": 4.016222631255121e-06, "loss": 0.6632093191146851, "step": 1354 }, { "epoch": 1.7578264395782643, "grad_norm": 0.5776406526565552, "learning_rate": 4.0092136652452054e-06, "loss": 0.5856695175170898, "step": 1355 }, { "epoch": 1.7591240875912408, "grad_norm": 0.5833093523979187, "learning_rate": 4.0022067251425736e-06, "loss": 0.7012051939964294, "step": 1356 }, { "epoch": 1.7604217356042173, "grad_norm": 0.6321353912353516, "learning_rate": 3.9952018252746424e-06, "loss": 0.6692728996276855, "step": 1357 }, { "epoch": 1.7617193836171938, "grad_norm": 0.5867600440979004, "learning_rate": 3.988198979964662e-06, "loss": 0.6333553791046143, "step": 1358 }, { "epoch": 1.7630170316301703, "grad_norm": 0.5640849471092224, "learning_rate": 3.981198203531673e-06, "loss": 0.6600401401519775, "step": 1359 }, { "epoch": 1.7643146796431468, "grad_norm": 0.5749746561050415, "learning_rate": 3.974199510290498e-06, "loss": 0.600135087966919, "step": 1360 }, { "epoch": 1.7656123276561233, "grad_norm": 0.6021872162818909, "learning_rate": 3.967202914551688e-06, "loss": 0.6514877676963806, "step": 1361 }, { "epoch": 1.7669099756690998, "grad_norm": 1.1252561807632446, "learning_rate": 3.960208430621514e-06, "loss": 0.6247175931930542, "step": 1362 }, { "epoch": 1.7682076236820763, "grad_norm": 0.6089026927947998, "learning_rate": 3.953216072801922e-06, "loss": 0.6505289077758789, "step": 1363 }, { "epoch": 1.7695052716950528, "grad_norm": 0.613433301448822, "learning_rate": 3.946225855390518e-06, "loss": 0.6519597768783569, "step": 1364 }, { "epoch": 1.770802919708029, "grad_norm": 0.6230673789978027, "learning_rate": 3.9392377926805226e-06, "loss": 0.6527152061462402, "step": 1365 }, { "epoch": 1.7721005677210058, "grad_norm": 0.629035472869873, "learning_rate": 3.932251898960759e-06, "loss": 0.6801344156265259, "step": 1366 }, { "epoch": 1.773398215733982, "grad_norm": 0.586634635925293, "learning_rate": 3.925268188515611e-06, "loss": 0.6678798794746399, "step": 1367 }, { "epoch": 1.7746958637469588, "grad_norm": 0.691630482673645, "learning_rate": 3.918286675624998e-06, "loss": 0.6675139665603638, "step": 1368 }, { "epoch": 1.775993511759935, "grad_norm": 0.5624348521232605, "learning_rate": 3.911307374564346e-06, "loss": 0.5508803129196167, "step": 1369 }, { "epoch": 1.7772911597729117, "grad_norm": 0.9164373874664307, "learning_rate": 3.904330299604562e-06, "loss": 0.6670984625816345, "step": 1370 }, { "epoch": 1.778588807785888, "grad_norm": 0.620689868927002, "learning_rate": 3.897355465011996e-06, "loss": 0.6593863368034363, "step": 1371 }, { "epoch": 1.7798864557988645, "grad_norm": 0.5467659831047058, "learning_rate": 3.89038288504842e-06, "loss": 0.5556522607803345, "step": 1372 }, { "epoch": 1.781184103811841, "grad_norm": 0.5498706698417664, "learning_rate": 3.883412573970995e-06, "loss": 0.6222935914993286, "step": 1373 }, { "epoch": 1.7824817518248175, "grad_norm": 0.5786144137382507, "learning_rate": 3.876444546032242e-06, "loss": 0.6003856658935547, "step": 1374 }, { "epoch": 1.783779399837794, "grad_norm": 0.5900736451148987, "learning_rate": 3.8694788154800185e-06, "loss": 0.6151521801948547, "step": 1375 }, { "epoch": 1.7850770478507705, "grad_norm": 0.5880241394042969, "learning_rate": 3.862515396557476e-06, "loss": 0.6527180671691895, "step": 1376 }, { "epoch": 1.786374695863747, "grad_norm": 0.6083548069000244, "learning_rate": 3.855554303503047e-06, "loss": 0.6581445932388306, "step": 1377 }, { "epoch": 1.7876723438767235, "grad_norm": 0.5609106421470642, "learning_rate": 3.848595550550401e-06, "loss": 0.6590725779533386, "step": 1378 }, { "epoch": 1.7889699918897, "grad_norm": 0.6204782724380493, "learning_rate": 3.841639151928431e-06, "loss": 0.6809993386268616, "step": 1379 }, { "epoch": 1.7902676399026762, "grad_norm": 0.5831668972969055, "learning_rate": 3.834685121861208e-06, "loss": 0.6498827934265137, "step": 1380 }, { "epoch": 1.7902676399026762, "eval_loss": 0.6777992248535156, "eval_runtime": 73.0192, "eval_samples_per_second": 71.105, "eval_steps_per_second": 8.888, "step": 1380 }, { "epoch": 1.791565287915653, "grad_norm": 0.5954435467720032, "learning_rate": 3.827733474567966e-06, "loss": 0.6496779322624207, "step": 1381 }, { "epoch": 1.7928629359286292, "grad_norm": 0.5471308827400208, "learning_rate": 3.820784224263061e-06, "loss": 0.5941118001937866, "step": 1382 }, { "epoch": 1.794160583941606, "grad_norm": 0.5896412134170532, "learning_rate": 3.8138373851559546e-06, "loss": 0.6255256533622742, "step": 1383 }, { "epoch": 1.7954582319545822, "grad_norm": 0.9544134736061096, "learning_rate": 3.8068929714511716e-06, "loss": 0.6434448957443237, "step": 1384 }, { "epoch": 1.796755879967559, "grad_norm": 0.5609217882156372, "learning_rate": 3.799950997348283e-06, "loss": 0.6087275743484497, "step": 1385 }, { "epoch": 1.7980535279805352, "grad_norm": 4.44458532333374, "learning_rate": 3.7930114770418654e-06, "loss": 0.5713160037994385, "step": 1386 }, { "epoch": 1.799351175993512, "grad_norm": 0.5974010825157166, "learning_rate": 3.7860744247214853e-06, "loss": 0.6058465838432312, "step": 1387 }, { "epoch": 1.8006488240064882, "grad_norm": 0.5761491060256958, "learning_rate": 3.7791398545716552e-06, "loss": 0.619678258895874, "step": 1388 }, { "epoch": 1.8019464720194647, "grad_norm": 1.2458136081695557, "learning_rate": 3.7722077807718193e-06, "loss": 0.6886736750602722, "step": 1389 }, { "epoch": 1.8032441200324412, "grad_norm": 0.6204317212104797, "learning_rate": 3.7652782174963107e-06, "loss": 0.6285656690597534, "step": 1390 }, { "epoch": 1.8045417680454177, "grad_norm": 0.5791151523590088, "learning_rate": 3.758351178914336e-06, "loss": 0.6601356267929077, "step": 1391 }, { "epoch": 1.8058394160583942, "grad_norm": 0.5656175017356873, "learning_rate": 3.7514266791899324e-06, "loss": 0.5828202962875366, "step": 1392 }, { "epoch": 1.8071370640713706, "grad_norm": 0.6195251941680908, "learning_rate": 3.7445047324819517e-06, "loss": 0.7079391479492188, "step": 1393 }, { "epoch": 1.8084347120843471, "grad_norm": 0.5826953649520874, "learning_rate": 3.737585352944021e-06, "loss": 0.6261759996414185, "step": 1394 }, { "epoch": 1.8097323600973236, "grad_norm": 0.6581652760505676, "learning_rate": 3.7306685547245225e-06, "loss": 0.6573713421821594, "step": 1395 }, { "epoch": 1.8110300081103001, "grad_norm": 0.5666741728782654, "learning_rate": 3.7237543519665543e-06, "loss": 0.621452808380127, "step": 1396 }, { "epoch": 1.8123276561232764, "grad_norm": 0.5948919057846069, "learning_rate": 3.7168427588079153e-06, "loss": 0.6522223353385925, "step": 1397 }, { "epoch": 1.8136253041362531, "grad_norm": 0.5332669615745544, "learning_rate": 3.7099337893810593e-06, "loss": 0.650192141532898, "step": 1398 }, { "epoch": 1.8149229521492294, "grad_norm": 0.599592924118042, "learning_rate": 3.703027457813086e-06, "loss": 0.6094880700111389, "step": 1399 }, { "epoch": 1.816220600162206, "grad_norm": 0.6047189235687256, "learning_rate": 3.696123778225691e-06, "loss": 0.6866611838340759, "step": 1400 }, { "epoch": 1.8175182481751824, "grad_norm": 0.7004641890525818, "learning_rate": 3.6892227647351515e-06, "loss": 0.6755614280700684, "step": 1401 }, { "epoch": 1.818815896188159, "grad_norm": 0.5989522933959961, "learning_rate": 3.6823244314522966e-06, "loss": 0.6946245431900024, "step": 1402 }, { "epoch": 1.8201135442011354, "grad_norm": 0.579132080078125, "learning_rate": 3.67542879248247e-06, "loss": 0.6097831726074219, "step": 1403 }, { "epoch": 1.821411192214112, "grad_norm": 0.577029287815094, "learning_rate": 3.668535861925509e-06, "loss": 0.6218363046646118, "step": 1404 }, { "epoch": 1.8227088402270883, "grad_norm": 0.6415956020355225, "learning_rate": 3.661645653875709e-06, "loss": 0.6793798208236694, "step": 1405 }, { "epoch": 1.8240064882400648, "grad_norm": 0.603378415107727, "learning_rate": 3.6547581824218057e-06, "loss": 0.5855191946029663, "step": 1406 }, { "epoch": 1.8253041362530413, "grad_norm": 0.6317605376243591, "learning_rate": 3.6478734616469324e-06, "loss": 0.6648485660552979, "step": 1407 }, { "epoch": 1.8266017842660178, "grad_norm": 0.5663666725158691, "learning_rate": 3.6409915056286017e-06, "loss": 0.6257850527763367, "step": 1408 }, { "epoch": 1.8278994322789943, "grad_norm": 0.8109258413314819, "learning_rate": 3.6341123284386694e-06, "loss": 0.6545461416244507, "step": 1409 }, { "epoch": 1.8291970802919708, "grad_norm": 0.6355454325675964, "learning_rate": 3.627235944143315e-06, "loss": 0.68341463804245, "step": 1410 }, { "epoch": 1.8304947283049473, "grad_norm": 0.5834214091300964, "learning_rate": 3.620362366803001e-06, "loss": 0.6818444728851318, "step": 1411 }, { "epoch": 1.8317923763179238, "grad_norm": 0.5867376327514648, "learning_rate": 3.6134916104724573e-06, "loss": 0.6132810115814209, "step": 1412 }, { "epoch": 1.8330900243309003, "grad_norm": 0.5869424343109131, "learning_rate": 3.606623689200637e-06, "loss": 0.6913362741470337, "step": 1413 }, { "epoch": 1.8343876723438766, "grad_norm": 0.5870312452316284, "learning_rate": 3.599758617030704e-06, "loss": 0.6339567303657532, "step": 1414 }, { "epoch": 1.8356853203568533, "grad_norm": 0.6119568943977356, "learning_rate": 3.5928964079999907e-06, "loss": 0.6378414630889893, "step": 1415 }, { "epoch": 1.8369829683698295, "grad_norm": 0.5717766284942627, "learning_rate": 3.5860370761399814e-06, "loss": 0.6197869777679443, "step": 1416 }, { "epoch": 1.8382806163828063, "grad_norm": 0.626775860786438, "learning_rate": 3.5791806354762702e-06, "loss": 0.7052003145217896, "step": 1417 }, { "epoch": 1.8395782643957825, "grad_norm": 0.5812957286834717, "learning_rate": 3.572327100028545e-06, "loss": 0.66878342628479, "step": 1418 }, { "epoch": 1.8408759124087593, "grad_norm": 0.585649311542511, "learning_rate": 3.565476483810548e-06, "loss": 0.6272032260894775, "step": 1419 }, { "epoch": 1.8421735604217355, "grad_norm": 0.6118691563606262, "learning_rate": 3.55862880083006e-06, "loss": 0.6374541521072388, "step": 1420 }, { "epoch": 1.8434712084347122, "grad_norm": 0.5860823392868042, "learning_rate": 3.5517840650888564e-06, "loss": 0.6104147434234619, "step": 1421 }, { "epoch": 1.8447688564476885, "grad_norm": 0.5618652701377869, "learning_rate": 3.544942290582691e-06, "loss": 0.5710769891738892, "step": 1422 }, { "epoch": 1.846066504460665, "grad_norm": 0.5879126787185669, "learning_rate": 3.538103491301258e-06, "loss": 0.6456954479217529, "step": 1423 }, { "epoch": 1.8473641524736415, "grad_norm": 0.6192496418952942, "learning_rate": 3.531267681228175e-06, "loss": 0.6715401411056519, "step": 1424 }, { "epoch": 1.848661800486618, "grad_norm": 0.6261125802993774, "learning_rate": 3.5244348743409394e-06, "loss": 0.6905325055122375, "step": 1425 }, { "epoch": 1.8499594484995945, "grad_norm": 0.5808646082878113, "learning_rate": 3.517605084610917e-06, "loss": 0.6800282001495361, "step": 1426 }, { "epoch": 1.851257096512571, "grad_norm": 0.5866647362709045, "learning_rate": 3.510778326003294e-06, "loss": 0.6750452518463135, "step": 1427 }, { "epoch": 1.8525547445255475, "grad_norm": 0.5787751078605652, "learning_rate": 3.5039546124770675e-06, "loss": 0.6570975184440613, "step": 1428 }, { "epoch": 1.853852392538524, "grad_norm": 0.6095142960548401, "learning_rate": 3.4971339579850017e-06, "loss": 0.6344528198242188, "step": 1429 }, { "epoch": 1.8551500405515005, "grad_norm": 0.5892320871353149, "learning_rate": 3.4903163764736104e-06, "loss": 0.6722358465194702, "step": 1430 }, { "epoch": 1.8564476885644767, "grad_norm": 0.5868071913719177, "learning_rate": 3.4835018818831235e-06, "loss": 0.638904333114624, "step": 1431 }, { "epoch": 1.8577453365774534, "grad_norm": 0.6003979444503784, "learning_rate": 3.4766904881474535e-06, "loss": 0.6853640079498291, "step": 1432 }, { "epoch": 1.8590429845904297, "grad_norm": 0.555009663105011, "learning_rate": 3.4698822091941808e-06, "loss": 0.6409114599227905, "step": 1433 }, { "epoch": 1.8603406326034064, "grad_norm": 0.5608627796173096, "learning_rate": 3.463077058944511e-06, "loss": 0.6055079698562622, "step": 1434 }, { "epoch": 1.8616382806163827, "grad_norm": 0.6137329339981079, "learning_rate": 3.456275051313255e-06, "loss": 0.6407139897346497, "step": 1435 }, { "epoch": 1.8629359286293594, "grad_norm": 0.5606741905212402, "learning_rate": 3.4494762002087934e-06, "loss": 0.6254716515541077, "step": 1436 }, { "epoch": 1.8642335766423357, "grad_norm": 0.6578085422515869, "learning_rate": 3.4426805195330605e-06, "loss": 0.7003939151763916, "step": 1437 }, { "epoch": 1.8655312246553124, "grad_norm": 0.6054635047912598, "learning_rate": 3.4358880231814983e-06, "loss": 0.6616827845573425, "step": 1438 }, { "epoch": 1.8668288726682887, "grad_norm": 0.5833800435066223, "learning_rate": 3.4290987250430486e-06, "loss": 0.6554232835769653, "step": 1439 }, { "epoch": 1.8681265206812652, "grad_norm": 0.6048437356948853, "learning_rate": 3.4223126390001025e-06, "loss": 0.6970128417015076, "step": 1440 }, { "epoch": 1.8694241686942417, "grad_norm": 0.5701255202293396, "learning_rate": 3.415529778928492e-06, "loss": 0.6580668687820435, "step": 1441 }, { "epoch": 1.8707218167072182, "grad_norm": 0.553488552570343, "learning_rate": 3.408750158697445e-06, "loss": 0.5830860137939453, "step": 1442 }, { "epoch": 1.8720194647201946, "grad_norm": 0.5695835947990417, "learning_rate": 3.401973792169574e-06, "loss": 0.6223429441452026, "step": 1443 }, { "epoch": 1.8733171127331711, "grad_norm": 0.5780246257781982, "learning_rate": 3.39520069320083e-06, "loss": 0.6171367168426514, "step": 1444 }, { "epoch": 1.8746147607461476, "grad_norm": 0.5851401686668396, "learning_rate": 3.3884308756404873e-06, "loss": 0.648118257522583, "step": 1445 }, { "epoch": 1.8759124087591241, "grad_norm": 0.5909201502799988, "learning_rate": 3.381664353331107e-06, "loss": 0.6370965242385864, "step": 1446 }, { "epoch": 1.8772100567721006, "grad_norm": 0.5840253233909607, "learning_rate": 3.3749011401085185e-06, "loss": 0.637911856174469, "step": 1447 }, { "epoch": 1.878507704785077, "grad_norm": 0.5772621035575867, "learning_rate": 3.3681412498017773e-06, "loss": 0.6257845759391785, "step": 1448 }, { "epoch": 1.8798053527980536, "grad_norm": 0.5972771048545837, "learning_rate": 3.361384696233152e-06, "loss": 0.6612721085548401, "step": 1449 }, { "epoch": 1.8811030008110299, "grad_norm": 0.622917652130127, "learning_rate": 3.354631493218081e-06, "loss": 0.657785177230835, "step": 1450 }, { "epoch": 1.8824006488240066, "grad_norm": 0.581942081451416, "learning_rate": 3.347881654565159e-06, "loss": 0.6339654922485352, "step": 1451 }, { "epoch": 1.8836982968369829, "grad_norm": 0.5792364478111267, "learning_rate": 3.3411351940760924e-06, "loss": 0.606496274471283, "step": 1452 }, { "epoch": 1.8849959448499596, "grad_norm": 0.5994595289230347, "learning_rate": 3.3343921255456903e-06, "loss": 0.6079939603805542, "step": 1453 }, { "epoch": 1.8862935928629359, "grad_norm": 0.5667769908905029, "learning_rate": 3.3276524627618177e-06, "loss": 0.5945770740509033, "step": 1454 }, { "epoch": 1.8875912408759126, "grad_norm": 0.591791033744812, "learning_rate": 3.3209162195053825e-06, "loss": 0.620225727558136, "step": 1455 }, { "epoch": 1.8888888888888888, "grad_norm": 0.5802031755447388, "learning_rate": 3.314183409550293e-06, "loss": 0.614050567150116, "step": 1456 }, { "epoch": 1.8901865369018653, "grad_norm": 0.6020429134368896, "learning_rate": 3.3074540466634454e-06, "loss": 0.6691816449165344, "step": 1457 }, { "epoch": 1.8914841849148418, "grad_norm": 0.6074531674385071, "learning_rate": 3.300728144604681e-06, "loss": 0.6914318799972534, "step": 1458 }, { "epoch": 1.8927818329278183, "grad_norm": 0.5949025750160217, "learning_rate": 3.294005717126767e-06, "loss": 0.5819941163063049, "step": 1459 }, { "epoch": 1.8940794809407948, "grad_norm": 0.5953806638717651, "learning_rate": 3.287286777975369e-06, "loss": 0.6016311645507812, "step": 1460 }, { "epoch": 1.8953771289537713, "grad_norm": 0.6012862920761108, "learning_rate": 3.2805713408890134e-06, "loss": 0.62370765209198, "step": 1461 }, { "epoch": 1.8966747769667478, "grad_norm": 0.5692993402481079, "learning_rate": 3.2738594195990725e-06, "loss": 0.6124866604804993, "step": 1462 }, { "epoch": 1.8979724249797243, "grad_norm": 0.5979285836219788, "learning_rate": 3.267151027829725e-06, "loss": 0.6501439213752747, "step": 1463 }, { "epoch": 1.8992700729927008, "grad_norm": 0.579058825969696, "learning_rate": 3.2604461792979346e-06, "loss": 0.6591506004333496, "step": 1464 }, { "epoch": 1.900567721005677, "grad_norm": 0.5612583756446838, "learning_rate": 3.253744887713417e-06, "loss": 0.644995927810669, "step": 1465 }, { "epoch": 1.9018653690186538, "grad_norm": 0.5929267406463623, "learning_rate": 3.2470471667786217e-06, "loss": 0.6369574069976807, "step": 1466 }, { "epoch": 1.90316301703163, "grad_norm": 0.5371314287185669, "learning_rate": 3.2403530301886897e-06, "loss": 0.6427657604217529, "step": 1467 }, { "epoch": 1.9044606650446068, "grad_norm": 0.5879482626914978, "learning_rate": 3.2336624916314385e-06, "loss": 0.6144864559173584, "step": 1468 }, { "epoch": 1.905758313057583, "grad_norm": 0.5627234578132629, "learning_rate": 3.226975564787322e-06, "loss": 0.6070575714111328, "step": 1469 }, { "epoch": 1.9070559610705597, "grad_norm": 0.595919668674469, "learning_rate": 3.2202922633294178e-06, "loss": 0.6438186764717102, "step": 1470 }, { "epoch": 1.908353609083536, "grad_norm": 0.5860680937767029, "learning_rate": 3.2136126009233815e-06, "loss": 0.6168484091758728, "step": 1471 }, { "epoch": 1.9096512570965127, "grad_norm": 0.6082072257995605, "learning_rate": 3.2069365912274364e-06, "loss": 0.6607163548469543, "step": 1472 }, { "epoch": 1.910948905109489, "grad_norm": 0.6000680923461914, "learning_rate": 3.2002642478923273e-06, "loss": 0.6100636720657349, "step": 1473 }, { "epoch": 1.9122465531224655, "grad_norm": 0.5958935022354126, "learning_rate": 3.1935955845613138e-06, "loss": 0.6283643245697021, "step": 1474 }, { "epoch": 1.913544201135442, "grad_norm": 0.5999156832695007, "learning_rate": 3.1869306148701186e-06, "loss": 0.6624071002006531, "step": 1475 }, { "epoch": 1.9148418491484185, "grad_norm": 0.5659943222999573, "learning_rate": 3.1802693524469226e-06, "loss": 0.5978960990905762, "step": 1476 }, { "epoch": 1.916139497161395, "grad_norm": 0.6041963696479797, "learning_rate": 3.1736118109123183e-06, "loss": 0.6953626871109009, "step": 1477 }, { "epoch": 1.9174371451743715, "grad_norm": 0.5829861164093018, "learning_rate": 3.1669580038792953e-06, "loss": 0.6347401142120361, "step": 1478 }, { "epoch": 1.918734793187348, "grad_norm": 0.5910770297050476, "learning_rate": 3.1603079449532014e-06, "loss": 0.6252144575119019, "step": 1479 }, { "epoch": 1.9200324412003245, "grad_norm": 0.5840498208999634, "learning_rate": 3.1536616477317283e-06, "loss": 0.6821172833442688, "step": 1480 }, { "epoch": 1.921330089213301, "grad_norm": 0.5815771222114563, "learning_rate": 3.147019125804869e-06, "loss": 0.627813458442688, "step": 1481 }, { "epoch": 1.9226277372262772, "grad_norm": 0.6089122295379639, "learning_rate": 3.140380392754901e-06, "loss": 0.5848509073257446, "step": 1482 }, { "epoch": 1.923925385239254, "grad_norm": 0.5963802337646484, "learning_rate": 3.13374546215635e-06, "loss": 0.6434051990509033, "step": 1483 }, { "epoch": 1.9252230332522302, "grad_norm": 0.5844939351081848, "learning_rate": 3.1271143475759745e-06, "loss": 0.6818792819976807, "step": 1484 }, { "epoch": 1.926520681265207, "grad_norm": 0.5862755179405212, "learning_rate": 3.1204870625727216e-06, "loss": 0.6306114196777344, "step": 1485 }, { "epoch": 1.9278183292781832, "grad_norm": 0.5746100544929504, "learning_rate": 3.1138636206977147e-06, "loss": 0.649817705154419, "step": 1486 }, { "epoch": 1.92911597729116, "grad_norm": 0.7469968199729919, "learning_rate": 3.107244035494212e-06, "loss": 0.6348094940185547, "step": 1487 }, { "epoch": 1.9304136253041362, "grad_norm": 0.5893679857254028, "learning_rate": 3.100628320497592e-06, "loss": 0.6067320704460144, "step": 1488 }, { "epoch": 1.931711273317113, "grad_norm": 0.5654053688049316, "learning_rate": 3.0940164892353197e-06, "loss": 0.6475971937179565, "step": 1489 }, { "epoch": 1.9330089213300892, "grad_norm": 0.5734997987747192, "learning_rate": 3.087408555226914e-06, "loss": 0.61939537525177, "step": 1490 }, { "epoch": 1.9343065693430657, "grad_norm": 0.5849641561508179, "learning_rate": 3.0808045319839285e-06, "loss": 0.6628157496452332, "step": 1491 }, { "epoch": 1.9356042173560422, "grad_norm": 0.6002839803695679, "learning_rate": 3.0742044330099162e-06, "loss": 0.7149718403816223, "step": 1492 }, { "epoch": 1.9369018653690186, "grad_norm": 0.5984014272689819, "learning_rate": 3.067608271800414e-06, "loss": 0.6320532560348511, "step": 1493 }, { "epoch": 1.9381995133819951, "grad_norm": 0.5990681052207947, "learning_rate": 3.0610160618428987e-06, "loss": 0.7083904147148132, "step": 1494 }, { "epoch": 1.9394971613949716, "grad_norm": 0.5863717794418335, "learning_rate": 3.054427816616773e-06, "loss": 0.6290713548660278, "step": 1495 }, { "epoch": 1.9407948094079481, "grad_norm": 0.5699295401573181, "learning_rate": 3.0478435495933273e-06, "loss": 0.621793270111084, "step": 1496 }, { "epoch": 1.9420924574209246, "grad_norm": 0.5777533054351807, "learning_rate": 3.0412632742357263e-06, "loss": 0.6173816323280334, "step": 1497 }, { "epoch": 1.9433901054339011, "grad_norm": 0.6047410368919373, "learning_rate": 3.0346870039989618e-06, "loss": 0.6888694763183594, "step": 1498 }, { "epoch": 1.9446877534468774, "grad_norm": 0.5461248159408569, "learning_rate": 3.028114752329848e-06, "loss": 0.5872098207473755, "step": 1499 }, { "epoch": 1.945985401459854, "grad_norm": 0.6002129316329956, "learning_rate": 3.0215465326669724e-06, "loss": 0.6144348382949829, "step": 1500 }, { "epoch": 1.9472830494728304, "grad_norm": 0.5926127433776855, "learning_rate": 3.0149823584406834e-06, "loss": 0.5981168746948242, "step": 1501 }, { "epoch": 1.948580697485807, "grad_norm": 0.553831160068512, "learning_rate": 3.008422243073053e-06, "loss": 0.6507419943809509, "step": 1502 }, { "epoch": 1.9498783454987834, "grad_norm": 0.6168836951255798, "learning_rate": 3.001866199977861e-06, "loss": 0.6085610389709473, "step": 1503 }, { "epoch": 1.95117599351176, "grad_norm": 0.610622227191925, "learning_rate": 2.995314242560553e-06, "loss": 0.584296703338623, "step": 1504 }, { "epoch": 1.9524736415247363, "grad_norm": 0.598139762878418, "learning_rate": 2.988766384218225e-06, "loss": 0.6997476816177368, "step": 1505 }, { "epoch": 1.9537712895377128, "grad_norm": 0.5578987002372742, "learning_rate": 2.982222638339588e-06, "loss": 0.5938620567321777, "step": 1506 }, { "epoch": 1.9550689375506893, "grad_norm": 0.6006044745445251, "learning_rate": 2.9756830183049502e-06, "loss": 0.6362953186035156, "step": 1507 }, { "epoch": 1.9563665855636658, "grad_norm": 0.6040393710136414, "learning_rate": 2.969147537486175e-06, "loss": 0.5799316167831421, "step": 1508 }, { "epoch": 1.9576642335766423, "grad_norm": 0.5984890460968018, "learning_rate": 2.962616209246669e-06, "loss": 0.639271080493927, "step": 1509 }, { "epoch": 1.9589618815896188, "grad_norm": 0.7439842820167542, "learning_rate": 2.956089046941344e-06, "loss": 0.6323772072792053, "step": 1510 }, { "epoch": 1.9602595296025953, "grad_norm": 0.5876015424728394, "learning_rate": 2.9495660639165967e-06, "loss": 0.5763074159622192, "step": 1511 }, { "epoch": 1.9615571776155718, "grad_norm": 0.6764865517616272, "learning_rate": 2.9430472735102733e-06, "loss": 0.7091867923736572, "step": 1512 }, { "epoch": 1.9628548256285483, "grad_norm": 0.6067684292793274, "learning_rate": 2.9365326890516543e-06, "loss": 0.6496888995170593, "step": 1513 }, { "epoch": 1.9641524736415248, "grad_norm": 0.5764046311378479, "learning_rate": 2.9300223238614135e-06, "loss": 0.6311619281768799, "step": 1514 }, { "epoch": 1.9654501216545013, "grad_norm": 0.5956159234046936, "learning_rate": 2.923516191251601e-06, "loss": 0.6114912033081055, "step": 1515 }, { "epoch": 1.9667477696674776, "grad_norm": 0.5818417072296143, "learning_rate": 2.917014304525609e-06, "loss": 0.6572203636169434, "step": 1516 }, { "epoch": 1.9680454176804543, "grad_norm": 0.6058406233787537, "learning_rate": 2.91051667697815e-06, "loss": 0.6197275519371033, "step": 1517 }, { "epoch": 1.9693430656934305, "grad_norm": 0.6014067530632019, "learning_rate": 2.904023321895234e-06, "loss": 0.6693457365036011, "step": 1518 }, { "epoch": 1.9706407137064073, "grad_norm": 0.5446932315826416, "learning_rate": 2.8975342525541217e-06, "loss": 0.6219191551208496, "step": 1519 }, { "epoch": 1.9719383617193835, "grad_norm": 0.5773969292640686, "learning_rate": 2.8910494822233203e-06, "loss": 0.6279373168945312, "step": 1520 }, { "epoch": 1.9732360097323602, "grad_norm": 0.553596556186676, "learning_rate": 2.8845690241625437e-06, "loss": 0.5865894556045532, "step": 1521 }, { "epoch": 1.9745336577453365, "grad_norm": 0.5790948867797852, "learning_rate": 2.878092891622688e-06, "loss": 0.6192329525947571, "step": 1522 }, { "epoch": 1.975831305758313, "grad_norm": 0.5870917439460754, "learning_rate": 2.871621097845806e-06, "loss": 0.6201770305633545, "step": 1523 }, { "epoch": 1.9771289537712895, "grad_norm": 0.586599588394165, "learning_rate": 2.865153656065076e-06, "loss": 0.6979238390922546, "step": 1524 }, { "epoch": 1.978426601784266, "grad_norm": 0.5809787511825562, "learning_rate": 2.8586905795047813e-06, "loss": 0.6264389753341675, "step": 1525 }, { "epoch": 1.9797242497972425, "grad_norm": 0.5969094038009644, "learning_rate": 2.8522318813802796e-06, "loss": 0.6544374227523804, "step": 1526 }, { "epoch": 1.981021897810219, "grad_norm": 0.5875753164291382, "learning_rate": 2.8457775748979664e-06, "loss": 0.7151497006416321, "step": 1527 }, { "epoch": 1.9823195458231955, "grad_norm": 0.5887599587440491, "learning_rate": 2.8393276732552745e-06, "loss": 0.650242030620575, "step": 1528 }, { "epoch": 1.983617193836172, "grad_norm": 0.5730281472206116, "learning_rate": 2.8328821896406132e-06, "loss": 0.6076555252075195, "step": 1529 }, { "epoch": 1.9849148418491485, "grad_norm": 0.6394782662391663, "learning_rate": 2.826441137233368e-06, "loss": 0.6826823949813843, "step": 1530 }, { "epoch": 1.986212489862125, "grad_norm": 0.5790883302688599, "learning_rate": 2.8200045292038596e-06, "loss": 0.6138323545455933, "step": 1531 }, { "epoch": 1.9875101378751014, "grad_norm": 0.6426994800567627, "learning_rate": 2.8135723787133233e-06, "loss": 0.7073339223861694, "step": 1532 }, { "epoch": 1.9888077858880777, "grad_norm": 0.6070610880851746, "learning_rate": 2.8071446989138786e-06, "loss": 0.6867741346359253, "step": 1533 }, { "epoch": 1.9901054339010544, "grad_norm": 0.6205259561538696, "learning_rate": 2.800721502948506e-06, "loss": 0.6849797368049622, "step": 1534 }, { "epoch": 1.9914030819140307, "grad_norm": 0.6018499135971069, "learning_rate": 2.7943028039510085e-06, "loss": 0.6437822580337524, "step": 1535 }, { "epoch": 1.9927007299270074, "grad_norm": 0.6043044328689575, "learning_rate": 2.78788861504601e-06, "loss": 0.6022955775260925, "step": 1536 }, { "epoch": 1.9939983779399837, "grad_norm": 0.5917290449142456, "learning_rate": 2.7814789493488947e-06, "loss": 0.6646702885627747, "step": 1537 }, { "epoch": 1.9952960259529604, "grad_norm": 0.6160155534744263, "learning_rate": 2.7750738199658157e-06, "loss": 0.6750048398971558, "step": 1538 }, { "epoch": 1.9965936739659367, "grad_norm": 0.5714327096939087, "learning_rate": 2.7686732399936343e-06, "loss": 0.6445184946060181, "step": 1539 }, { "epoch": 1.9978913219789132, "grad_norm": 0.5985759496688843, "learning_rate": 2.762277222519919e-06, "loss": 0.6806057691574097, "step": 1540 }, { "epoch": 1.9991889699918897, "grad_norm": 0.5991272330284119, "learning_rate": 2.7558857806229066e-06, "loss": 0.6159195899963379, "step": 1541 }, { "epoch": 2.0, "grad_norm": 0.7430510520935059, "learning_rate": 2.749498927371478e-06, "loss": 0.5819271802902222, "step": 1542 }, { "epoch": 2.0012976480129763, "grad_norm": 0.6751839518547058, "learning_rate": 2.7431166758251317e-06, "loss": 0.5926187634468079, "step": 1543 }, { "epoch": 2.002595296025953, "grad_norm": 0.6562322378158569, "learning_rate": 2.7367390390339565e-06, "loss": 0.6589317321777344, "step": 1544 }, { "epoch": 2.0038929440389293, "grad_norm": 0.6393698453903198, "learning_rate": 2.730366030038606e-06, "loss": 0.544275164604187, "step": 1545 }, { "epoch": 2.005190592051906, "grad_norm": 0.5910435318946838, "learning_rate": 2.72399766187027e-06, "loss": 0.6208810210227966, "step": 1546 }, { "epoch": 2.0064882400648822, "grad_norm": 0.6248382925987244, "learning_rate": 2.7176339475506515e-06, "loss": 0.5592293739318848, "step": 1547 }, { "epoch": 2.007785888077859, "grad_norm": 0.6031874418258667, "learning_rate": 2.7112749000919304e-06, "loss": 0.5941007137298584, "step": 1548 }, { "epoch": 2.0090835360908352, "grad_norm": 0.598434567451477, "learning_rate": 2.704920532496756e-06, "loss": 0.5872475504875183, "step": 1549 }, { "epoch": 2.010381184103812, "grad_norm": 0.606324315071106, "learning_rate": 2.698570857758195e-06, "loss": 0.5607691407203674, "step": 1550 }, { "epoch": 2.011678832116788, "grad_norm": 0.6241020560264587, "learning_rate": 2.692225888859732e-06, "loss": 0.6537069082260132, "step": 1551 }, { "epoch": 2.012976480129765, "grad_norm": 0.6302763223648071, "learning_rate": 2.685885638775216e-06, "loss": 0.6311033964157104, "step": 1552 }, { "epoch": 2.014274128142741, "grad_norm": 0.7878178358078003, "learning_rate": 2.6795501204688586e-06, "loss": 0.6164021492004395, "step": 1553 }, { "epoch": 2.015571776155718, "grad_norm": 0.6297698616981506, "learning_rate": 2.6732193468951882e-06, "loss": 0.6132771968841553, "step": 1554 }, { "epoch": 2.016869424168694, "grad_norm": 0.6227217316627502, "learning_rate": 2.666893330999035e-06, "loss": 0.5441837310791016, "step": 1555 }, { "epoch": 2.018167072181671, "grad_norm": 0.639819860458374, "learning_rate": 2.6605720857155017e-06, "loss": 0.5625590682029724, "step": 1556 }, { "epoch": 2.019464720194647, "grad_norm": 0.6482471227645874, "learning_rate": 2.654255623969936e-06, "loss": 0.5997311472892761, "step": 1557 }, { "epoch": 2.020762368207624, "grad_norm": 0.6367791891098022, "learning_rate": 2.647943958677897e-06, "loss": 0.5890505313873291, "step": 1558 }, { "epoch": 2.0220600162206, "grad_norm": 0.6217620372772217, "learning_rate": 2.6416371027451514e-06, "loss": 0.5508283376693726, "step": 1559 }, { "epoch": 2.0233576642335764, "grad_norm": 0.651731014251709, "learning_rate": 2.635335069067617e-06, "loss": 0.6351226568222046, "step": 1560 }, { "epoch": 2.024655312246553, "grad_norm": 0.6955805420875549, "learning_rate": 2.62903787053136e-06, "loss": 0.6140905022621155, "step": 1561 }, { "epoch": 2.0259529602595294, "grad_norm": 0.5920689105987549, "learning_rate": 2.6227455200125575e-06, "loss": 0.5677257776260376, "step": 1562 }, { "epoch": 2.027250608272506, "grad_norm": 0.6131844520568848, "learning_rate": 2.6164580303774733e-06, "loss": 0.5954424142837524, "step": 1563 }, { "epoch": 2.0285482562854824, "grad_norm": 0.6266505122184753, "learning_rate": 2.6101754144824327e-06, "loss": 0.5571186542510986, "step": 1564 }, { "epoch": 2.029845904298459, "grad_norm": 0.609183669090271, "learning_rate": 2.603897685173794e-06, "loss": 0.61628657579422, "step": 1565 }, { "epoch": 2.0311435523114354, "grad_norm": 0.6080002784729004, "learning_rate": 2.5976248552879264e-06, "loss": 0.5877048969268799, "step": 1566 }, { "epoch": 2.032441200324412, "grad_norm": 0.6042158007621765, "learning_rate": 2.5913569376511806e-06, "loss": 0.5496143102645874, "step": 1567 }, { "epoch": 2.0337388483373884, "grad_norm": 0.6415978670120239, "learning_rate": 2.5850939450798553e-06, "loss": 0.6424070596694946, "step": 1568 }, { "epoch": 2.035036496350365, "grad_norm": 0.6292750239372253, "learning_rate": 2.5788358903801926e-06, "loss": 0.5802291631698608, "step": 1569 }, { "epoch": 2.0363341443633414, "grad_norm": 0.5823472738265991, "learning_rate": 2.572582786348326e-06, "loss": 0.5664765238761902, "step": 1570 }, { "epoch": 2.037631792376318, "grad_norm": 0.6012071371078491, "learning_rate": 2.566334645770272e-06, "loss": 0.5476250648498535, "step": 1571 }, { "epoch": 2.0389294403892944, "grad_norm": 0.6168148517608643, "learning_rate": 2.5600914814218963e-06, "loss": 0.5573870539665222, "step": 1572 }, { "epoch": 2.040227088402271, "grad_norm": 0.6200307011604309, "learning_rate": 2.553853306068888e-06, "loss": 0.5985852479934692, "step": 1573 }, { "epoch": 2.0415247364152473, "grad_norm": 0.5821851491928101, "learning_rate": 2.547620132466743e-06, "loss": 0.5544208288192749, "step": 1574 }, { "epoch": 2.042822384428224, "grad_norm": 0.5919877886772156, "learning_rate": 2.541391973360717e-06, "loss": 0.5700052976608276, "step": 1575 }, { "epoch": 2.0441200324412003, "grad_norm": 0.6059973835945129, "learning_rate": 2.535168841485821e-06, "loss": 0.6292803287506104, "step": 1576 }, { "epoch": 2.0454176804541766, "grad_norm": 0.580622136592865, "learning_rate": 2.5289507495667864e-06, "loss": 0.5648876428604126, "step": 1577 }, { "epoch": 2.0467153284671533, "grad_norm": 0.6086398363113403, "learning_rate": 2.5227377103180353e-06, "loss": 0.5471535921096802, "step": 1578 }, { "epoch": 2.0480129764801296, "grad_norm": 0.6052615642547607, "learning_rate": 2.516529736443661e-06, "loss": 0.5907412767410278, "step": 1579 }, { "epoch": 2.0493106244931063, "grad_norm": 0.6123395562171936, "learning_rate": 2.5103268406374002e-06, "loss": 0.5662798881530762, "step": 1580 }, { "epoch": 2.0506082725060826, "grad_norm": 0.5917913317680359, "learning_rate": 2.504129035582601e-06, "loss": 0.5825642943382263, "step": 1581 }, { "epoch": 2.0519059205190593, "grad_norm": 0.645075261592865, "learning_rate": 2.497936333952212e-06, "loss": 0.6213525533676147, "step": 1582 }, { "epoch": 2.0532035685320356, "grad_norm": 1.3204904794692993, "learning_rate": 2.491748748408735e-06, "loss": 0.5462846755981445, "step": 1583 }, { "epoch": 2.0545012165450123, "grad_norm": 0.5815834403038025, "learning_rate": 2.485566291604219e-06, "loss": 0.5608969926834106, "step": 1584 }, { "epoch": 2.0557988645579885, "grad_norm": 0.6155984401702881, "learning_rate": 2.4793889761802225e-06, "loss": 0.5753802061080933, "step": 1585 }, { "epoch": 2.0570965125709653, "grad_norm": 0.645876407623291, "learning_rate": 2.4732168147677927e-06, "loss": 0.5655276775360107, "step": 1586 }, { "epoch": 2.0583941605839415, "grad_norm": 0.6139212846755981, "learning_rate": 2.467049819987437e-06, "loss": 0.5936379432678223, "step": 1587 }, { "epoch": 2.0596918085969182, "grad_norm": 0.5790942311286926, "learning_rate": 2.460888004449099e-06, "loss": 0.5055116415023804, "step": 1588 }, { "epoch": 2.0609894566098945, "grad_norm": 0.5931289196014404, "learning_rate": 2.454731380752132e-06, "loss": 0.611015260219574, "step": 1589 }, { "epoch": 2.0622871046228712, "grad_norm": 0.5739728808403015, "learning_rate": 2.4485799614852755e-06, "loss": 0.5669503211975098, "step": 1590 }, { "epoch": 2.0635847526358475, "grad_norm": 0.6072220802307129, "learning_rate": 2.442433759226619e-06, "loss": 0.6242780685424805, "step": 1591 }, { "epoch": 2.0648824006488242, "grad_norm": 0.6013473868370056, "learning_rate": 2.4362927865435975e-06, "loss": 0.6564007997512817, "step": 1592 }, { "epoch": 2.0661800486618005, "grad_norm": 0.6308622360229492, "learning_rate": 2.4301570559929405e-06, "loss": 0.6350818276405334, "step": 1593 }, { "epoch": 2.0674776966747768, "grad_norm": 0.5770552754402161, "learning_rate": 2.4240265801206665e-06, "loss": 0.5588065981864929, "step": 1594 }, { "epoch": 2.0687753446877535, "grad_norm": 0.5862566828727722, "learning_rate": 2.4179013714620456e-06, "loss": 0.564478874206543, "step": 1595 }, { "epoch": 2.0700729927007298, "grad_norm": 0.6063327193260193, "learning_rate": 2.4117814425415803e-06, "loss": 0.5994401574134827, "step": 1596 }, { "epoch": 2.0713706407137065, "grad_norm": 0.7205548286437988, "learning_rate": 2.4056668058729766e-06, "loss": 0.5876675248146057, "step": 1597 }, { "epoch": 2.0726682887266827, "grad_norm": 0.6141117811203003, "learning_rate": 2.399557473959119e-06, "loss": 0.5730265974998474, "step": 1598 }, { "epoch": 2.0739659367396595, "grad_norm": 0.653766393661499, "learning_rate": 2.3934534592920416e-06, "loss": 0.5947611331939697, "step": 1599 }, { "epoch": 2.0752635847526357, "grad_norm": 0.6264708042144775, "learning_rate": 2.3873547743529157e-06, "loss": 0.597199559211731, "step": 1600 }, { "epoch": 2.0765612327656124, "grad_norm": 0.6255890727043152, "learning_rate": 2.3812614316120003e-06, "loss": 0.5970041155815125, "step": 1601 }, { "epoch": 2.0778588807785887, "grad_norm": 0.6051512956619263, "learning_rate": 2.375173443528646e-06, "loss": 0.5532850027084351, "step": 1602 }, { "epoch": 2.0791565287915654, "grad_norm": 0.6211998462677002, "learning_rate": 2.3690908225512464e-06, "loss": 0.5505103468894958, "step": 1603 }, { "epoch": 2.0804541768045417, "grad_norm": 0.6291670799255371, "learning_rate": 2.363013581117217e-06, "loss": 0.6287462711334229, "step": 1604 }, { "epoch": 2.0817518248175184, "grad_norm": 0.6058430075645447, "learning_rate": 2.356941731652986e-06, "loss": 0.6096627712249756, "step": 1605 }, { "epoch": 2.0830494728304947, "grad_norm": 0.6372430324554443, "learning_rate": 2.3508752865739425e-06, "loss": 0.6022605895996094, "step": 1606 }, { "epoch": 2.0843471208434714, "grad_norm": 0.6325316429138184, "learning_rate": 2.344814258284433e-06, "loss": 0.610370397567749, "step": 1607 }, { "epoch": 2.0856447688564477, "grad_norm": 0.6065165996551514, "learning_rate": 2.3387586591777274e-06, "loss": 0.5800055861473083, "step": 1608 }, { "epoch": 2.086942416869424, "grad_norm": 0.59498131275177, "learning_rate": 2.3327085016359912e-06, "loss": 0.5574961304664612, "step": 1609 }, { "epoch": 2.0882400648824007, "grad_norm": 0.6029080748558044, "learning_rate": 2.3266637980302677e-06, "loss": 0.5879454016685486, "step": 1610 }, { "epoch": 2.0882400648824007, "eval_loss": 0.6837871670722961, "eval_runtime": 72.9619, "eval_samples_per_second": 71.16, "eval_steps_per_second": 8.895, "step": 1610 }, { "epoch": 2.089537712895377, "grad_norm": 0.6146489381790161, "learning_rate": 2.320624560720446e-06, "loss": 0.5897351503372192, "step": 1611 }, { "epoch": 2.0908353609083536, "grad_norm": 0.6313148140907288, "learning_rate": 2.314590802055232e-06, "loss": 0.5991021990776062, "step": 1612 }, { "epoch": 2.09213300892133, "grad_norm": 0.578288197517395, "learning_rate": 2.308562534372144e-06, "loss": 0.5127542018890381, "step": 1613 }, { "epoch": 2.0934306569343066, "grad_norm": 0.6262894868850708, "learning_rate": 2.3025397699974555e-06, "loss": 0.6180716753005981, "step": 1614 }, { "epoch": 2.094728304947283, "grad_norm": 0.6143955588340759, "learning_rate": 2.296522521246202e-06, "loss": 0.6144124865531921, "step": 1615 }, { "epoch": 2.0960259529602596, "grad_norm": 0.6245327591896057, "learning_rate": 2.290510800422129e-06, "loss": 0.5791307687759399, "step": 1616 }, { "epoch": 2.097323600973236, "grad_norm": 0.6619604825973511, "learning_rate": 2.284504619817687e-06, "loss": 0.6063104271888733, "step": 1617 }, { "epoch": 2.0986212489862126, "grad_norm": 0.6063318848609924, "learning_rate": 2.2785039917139933e-06, "loss": 0.619540810585022, "step": 1618 }, { "epoch": 2.099918896999189, "grad_norm": 0.6290093660354614, "learning_rate": 2.272508928380815e-06, "loss": 0.5471513271331787, "step": 1619 }, { "epoch": 2.1012165450121656, "grad_norm": 0.6088972091674805, "learning_rate": 2.2665194420765386e-06, "loss": 0.673788845539093, "step": 1620 }, { "epoch": 2.102514193025142, "grad_norm": 0.6053624153137207, "learning_rate": 2.260535545048149e-06, "loss": 0.540647029876709, "step": 1621 }, { "epoch": 2.1038118410381186, "grad_norm": 0.6025784015655518, "learning_rate": 2.2545572495311966e-06, "loss": 0.5704219341278076, "step": 1622 }, { "epoch": 2.105109489051095, "grad_norm": 0.5917617678642273, "learning_rate": 2.2485845677497897e-06, "loss": 0.5879180431365967, "step": 1623 }, { "epoch": 2.1064071370640716, "grad_norm": 0.6286986470222473, "learning_rate": 2.2426175119165435e-06, "loss": 0.6564632058143616, "step": 1624 }, { "epoch": 2.107704785077048, "grad_norm": 0.7979365587234497, "learning_rate": 2.2366560942325833e-06, "loss": 0.5867825746536255, "step": 1625 }, { "epoch": 2.1090024330900246, "grad_norm": 0.6283751130104065, "learning_rate": 2.230700326887495e-06, "loss": 0.5519679188728333, "step": 1626 }, { "epoch": 2.110300081103001, "grad_norm": 0.6093899011611938, "learning_rate": 2.2247502220593164e-06, "loss": 0.578905463218689, "step": 1627 }, { "epoch": 2.111597729115977, "grad_norm": 0.694290816783905, "learning_rate": 2.218805791914507e-06, "loss": 0.5794886350631714, "step": 1628 }, { "epoch": 2.112895377128954, "grad_norm": 0.6268723607063293, "learning_rate": 2.21286704860792e-06, "loss": 0.5475939512252808, "step": 1629 }, { "epoch": 2.11419302514193, "grad_norm": 0.5893663167953491, "learning_rate": 2.2069340042827846e-06, "loss": 0.5644780397415161, "step": 1630 }, { "epoch": 2.115490673154907, "grad_norm": 0.6139518022537231, "learning_rate": 2.2010066710706734e-06, "loss": 0.5307568311691284, "step": 1631 }, { "epoch": 2.116788321167883, "grad_norm": 0.6323785781860352, "learning_rate": 2.1950850610914824e-06, "loss": 0.5611797571182251, "step": 1632 }, { "epoch": 2.11808596918086, "grad_norm": 0.5823566913604736, "learning_rate": 2.1891691864534065e-06, "loss": 0.5725387334823608, "step": 1633 }, { "epoch": 2.119383617193836, "grad_norm": 0.6572033762931824, "learning_rate": 2.1832590592529128e-06, "loss": 0.6158653497695923, "step": 1634 }, { "epoch": 2.1206812652068128, "grad_norm": 1.0890551805496216, "learning_rate": 2.1773546915747103e-06, "loss": 0.559654951095581, "step": 1635 }, { "epoch": 2.121978913219789, "grad_norm": 0.6277933120727539, "learning_rate": 2.1714560954917437e-06, "loss": 0.6304750442504883, "step": 1636 }, { "epoch": 2.1232765612327658, "grad_norm": 0.6458949446678162, "learning_rate": 2.165563283065142e-06, "loss": 0.6345778703689575, "step": 1637 }, { "epoch": 2.124574209245742, "grad_norm": 0.643680214881897, "learning_rate": 2.159676266344222e-06, "loss": 0.5876523852348328, "step": 1638 }, { "epoch": 2.1258718572587187, "grad_norm": 0.595977783203125, "learning_rate": 2.1537950573664372e-06, "loss": 0.6067019104957581, "step": 1639 }, { "epoch": 2.127169505271695, "grad_norm": 0.6042376160621643, "learning_rate": 2.1479196681573745e-06, "loss": 0.5710458159446716, "step": 1640 }, { "epoch": 2.1284671532846717, "grad_norm": 0.6172091960906982, "learning_rate": 2.142050110730716e-06, "loss": 0.5443819761276245, "step": 1641 }, { "epoch": 2.129764801297648, "grad_norm": 0.6249525547027588, "learning_rate": 2.136186397088223e-06, "loss": 0.6747730374336243, "step": 1642 }, { "epoch": 2.1310624493106243, "grad_norm": 0.6373762488365173, "learning_rate": 2.1303285392197043e-06, "loss": 0.6101464033126831, "step": 1643 }, { "epoch": 2.132360097323601, "grad_norm": 0.6049467921257019, "learning_rate": 2.1244765491029985e-06, "loss": 0.5729132890701294, "step": 1644 }, { "epoch": 2.1336577453365773, "grad_norm": 0.6222594380378723, "learning_rate": 2.118630438703939e-06, "loss": 0.6150310039520264, "step": 1645 }, { "epoch": 2.134955393349554, "grad_norm": 0.9498931169509888, "learning_rate": 2.1127902199763496e-06, "loss": 0.6144990921020508, "step": 1646 }, { "epoch": 2.1362530413625302, "grad_norm": 0.6177363991737366, "learning_rate": 2.1069559048619937e-06, "loss": 0.5762449502944946, "step": 1647 }, { "epoch": 2.137550689375507, "grad_norm": 0.59578537940979, "learning_rate": 2.10112750529057e-06, "loss": 0.6036182641983032, "step": 1648 }, { "epoch": 2.1388483373884832, "grad_norm": 0.6090502738952637, "learning_rate": 2.095305033179682e-06, "loss": 0.5963237285614014, "step": 1649 }, { "epoch": 2.14014598540146, "grad_norm": 5.44432258605957, "learning_rate": 2.0894885004348102e-06, "loss": 0.6094678640365601, "step": 1650 }, { "epoch": 2.141443633414436, "grad_norm": 0.6466519832611084, "learning_rate": 2.0836779189492925e-06, "loss": 0.6607776880264282, "step": 1651 }, { "epoch": 2.142741281427413, "grad_norm": 0.6259258985519409, "learning_rate": 2.077873300604297e-06, "loss": 0.6022912859916687, "step": 1652 }, { "epoch": 2.144038929440389, "grad_norm": 0.6033953428268433, "learning_rate": 2.0720746572687995e-06, "loss": 0.5635781288146973, "step": 1653 }, { "epoch": 2.145336577453366, "grad_norm": 0.5921186208724976, "learning_rate": 2.0662820007995592e-06, "loss": 0.5796300172805786, "step": 1654 }, { "epoch": 2.146634225466342, "grad_norm": 0.7194099426269531, "learning_rate": 2.060495343041087e-06, "loss": 0.5955857038497925, "step": 1655 }, { "epoch": 2.147931873479319, "grad_norm": 0.6012006998062134, "learning_rate": 2.0547146958256416e-06, "loss": 0.531291127204895, "step": 1656 }, { "epoch": 2.149229521492295, "grad_norm": 0.8573319911956787, "learning_rate": 2.048940070973177e-06, "loss": 0.5659847259521484, "step": 1657 }, { "epoch": 2.150527169505272, "grad_norm": 0.639750599861145, "learning_rate": 2.04317148029134e-06, "loss": 0.5485103130340576, "step": 1658 }, { "epoch": 2.151824817518248, "grad_norm": 0.6052505970001221, "learning_rate": 2.0374089355754434e-06, "loss": 0.6026275753974915, "step": 1659 }, { "epoch": 2.153122465531225, "grad_norm": 0.6007844805717468, "learning_rate": 2.031652448608428e-06, "loss": 0.5721523761749268, "step": 1660 }, { "epoch": 2.154420113544201, "grad_norm": 0.6320387125015259, "learning_rate": 2.025902031160853e-06, "loss": 0.5851036906242371, "step": 1661 }, { "epoch": 2.1557177615571774, "grad_norm": 0.8335509300231934, "learning_rate": 2.020157694990868e-06, "loss": 0.631894588470459, "step": 1662 }, { "epoch": 2.157015409570154, "grad_norm": 0.6097387075424194, "learning_rate": 2.014419451844186e-06, "loss": 0.6118210554122925, "step": 1663 }, { "epoch": 2.1583130575831304, "grad_norm": 0.6130866408348083, "learning_rate": 2.0086873134540626e-06, "loss": 0.5941121578216553, "step": 1664 }, { "epoch": 2.159610705596107, "grad_norm": 0.6047623753547668, "learning_rate": 2.002961291541269e-06, "loss": 0.592534065246582, "step": 1665 }, { "epoch": 2.1609083536090834, "grad_norm": 0.6416432857513428, "learning_rate": 1.997241397814071e-06, "loss": 0.6065758466720581, "step": 1666 }, { "epoch": 2.16220600162206, "grad_norm": 0.6395633816719055, "learning_rate": 1.9915276439682056e-06, "loss": 0.6400467157363892, "step": 1667 }, { "epoch": 2.1635036496350364, "grad_norm": 0.604591965675354, "learning_rate": 1.985820041686848e-06, "loss": 0.590105414390564, "step": 1668 }, { "epoch": 2.164801297648013, "grad_norm": 0.6412749886512756, "learning_rate": 1.9801186026406066e-06, "loss": 0.5925630927085876, "step": 1669 }, { "epoch": 2.1660989456609894, "grad_norm": 0.6263708472251892, "learning_rate": 1.9744233384874766e-06, "loss": 0.6293658018112183, "step": 1670 }, { "epoch": 2.167396593673966, "grad_norm": 0.6095645427703857, "learning_rate": 1.968734260872833e-06, "loss": 0.5433490872383118, "step": 1671 }, { "epoch": 2.1686942416869424, "grad_norm": 0.6286778450012207, "learning_rate": 1.9630513814294e-06, "loss": 0.637223482131958, "step": 1672 }, { "epoch": 2.169991889699919, "grad_norm": 0.6185746788978577, "learning_rate": 1.9573747117772272e-06, "loss": 0.5756215453147888, "step": 1673 }, { "epoch": 2.1712895377128953, "grad_norm": 0.63084876537323, "learning_rate": 1.951704263523668e-06, "loss": 0.5794859528541565, "step": 1674 }, { "epoch": 2.172587185725872, "grad_norm": 0.6249853372573853, "learning_rate": 1.9460400482633537e-06, "loss": 0.5887556672096252, "step": 1675 }, { "epoch": 2.1738848337388483, "grad_norm": 0.6094781756401062, "learning_rate": 1.9403820775781696e-06, "loss": 0.550574779510498, "step": 1676 }, { "epoch": 2.1751824817518246, "grad_norm": 0.6323735117912292, "learning_rate": 1.9347303630372373e-06, "loss": 0.6414695978164673, "step": 1677 }, { "epoch": 2.1764801297648013, "grad_norm": 0.6307917237281799, "learning_rate": 1.929084916196876e-06, "loss": 0.5808806419372559, "step": 1678 }, { "epoch": 2.1777777777777776, "grad_norm": 0.6933386921882629, "learning_rate": 1.923445748600603e-06, "loss": 0.6602625846862793, "step": 1679 }, { "epoch": 2.1790754257907543, "grad_norm": 0.6270908713340759, "learning_rate": 1.917812871779084e-06, "loss": 0.6303268074989319, "step": 1680 }, { "epoch": 2.1803730738037306, "grad_norm": 0.640339195728302, "learning_rate": 1.912186297250128e-06, "loss": 0.6451208591461182, "step": 1681 }, { "epoch": 2.1816707218167073, "grad_norm": 0.632722020149231, "learning_rate": 1.9065660365186545e-06, "loss": 0.6016892194747925, "step": 1682 }, { "epoch": 2.1829683698296836, "grad_norm": 0.6271699070930481, "learning_rate": 1.9009521010766756e-06, "loss": 0.5756760835647583, "step": 1683 }, { "epoch": 2.1842660178426603, "grad_norm": 0.6046696305274963, "learning_rate": 1.8953445024032679e-06, "loss": 0.6025729775428772, "step": 1684 }, { "epoch": 2.1855636658556366, "grad_norm": 0.62837815284729, "learning_rate": 1.889743251964553e-06, "loss": 0.5909950733184814, "step": 1685 }, { "epoch": 2.1868613138686133, "grad_norm": 0.6451588869094849, "learning_rate": 1.8841483612136658e-06, "loss": 0.6150632500648499, "step": 1686 }, { "epoch": 2.1881589618815895, "grad_norm": 0.6260155439376831, "learning_rate": 1.8785598415907464e-06, "loss": 0.5601434707641602, "step": 1687 }, { "epoch": 2.1894566098945663, "grad_norm": 0.6069033741950989, "learning_rate": 1.8729777045229009e-06, "loss": 0.508891761302948, "step": 1688 }, { "epoch": 2.1907542579075425, "grad_norm": 0.613555371761322, "learning_rate": 1.8674019614241879e-06, "loss": 0.5379388928413391, "step": 1689 }, { "epoch": 2.1920519059205192, "grad_norm": 0.6071879863739014, "learning_rate": 1.8618326236955908e-06, "loss": 0.5609877109527588, "step": 1690 }, { "epoch": 2.1933495539334955, "grad_norm": 0.6063624620437622, "learning_rate": 1.8562697027249921e-06, "loss": 0.5955809950828552, "step": 1691 }, { "epoch": 2.1946472019464722, "grad_norm": 0.6319783926010132, "learning_rate": 1.8507132098871633e-06, "loss": 0.5856696367263794, "step": 1692 }, { "epoch": 2.1959448499594485, "grad_norm": 0.6493479609489441, "learning_rate": 1.8451631565437211e-06, "loss": 0.6506030559539795, "step": 1693 }, { "epoch": 2.197242497972425, "grad_norm": 0.6016229391098022, "learning_rate": 1.8396195540431205e-06, "loss": 0.6117116212844849, "step": 1694 }, { "epoch": 2.1985401459854015, "grad_norm": 0.6247114539146423, "learning_rate": 1.834082413720627e-06, "loss": 0.6172184348106384, "step": 1695 }, { "epoch": 2.1998377939983778, "grad_norm": 0.6307165026664734, "learning_rate": 1.8285517468982905e-06, "loss": 0.589012622833252, "step": 1696 }, { "epoch": 2.2011354420113545, "grad_norm": 0.6177083253860474, "learning_rate": 1.8230275648849243e-06, "loss": 0.5813847780227661, "step": 1697 }, { "epoch": 2.2024330900243307, "grad_norm": 0.6012999415397644, "learning_rate": 1.8175098789760848e-06, "loss": 0.5948748588562012, "step": 1698 }, { "epoch": 2.2037307380373075, "grad_norm": 0.6464450359344482, "learning_rate": 1.8119987004540373e-06, "loss": 0.5775672197341919, "step": 1699 }, { "epoch": 2.2050283860502837, "grad_norm": 0.6167866587638855, "learning_rate": 1.8064940405877546e-06, "loss": 0.6011961698532104, "step": 1700 }, { "epoch": 2.2063260340632604, "grad_norm": 0.6373000741004944, "learning_rate": 1.8009959106328655e-06, "loss": 0.5679797530174255, "step": 1701 }, { "epoch": 2.2076236820762367, "grad_norm": 0.5966001152992249, "learning_rate": 1.7955043218316615e-06, "loss": 0.5757954120635986, "step": 1702 }, { "epoch": 2.2089213300892134, "grad_norm": 0.6121652722358704, "learning_rate": 1.7900192854130465e-06, "loss": 0.5717330574989319, "step": 1703 }, { "epoch": 2.2102189781021897, "grad_norm": 0.6737116575241089, "learning_rate": 1.7845408125925328e-06, "loss": 0.5650469064712524, "step": 1704 }, { "epoch": 2.2115166261151664, "grad_norm": 0.6384139060974121, "learning_rate": 1.7790689145722111e-06, "loss": 0.5935101509094238, "step": 1705 }, { "epoch": 2.2128142741281427, "grad_norm": 0.5904914140701294, "learning_rate": 1.7736036025407282e-06, "loss": 0.5071459412574768, "step": 1706 }, { "epoch": 2.2141119221411194, "grad_norm": 0.6069095730781555, "learning_rate": 1.7681448876732632e-06, "loss": 0.5586497783660889, "step": 1707 }, { "epoch": 2.2154095701540957, "grad_norm": 0.6180804967880249, "learning_rate": 1.7626927811315087e-06, "loss": 0.6200004816055298, "step": 1708 }, { "epoch": 2.2167072181670724, "grad_norm": 0.6510108113288879, "learning_rate": 1.7572472940636375e-06, "loss": 0.6552962064743042, "step": 1709 }, { "epoch": 2.2180048661800487, "grad_norm": 0.6048802137374878, "learning_rate": 1.7518084376042988e-06, "loss": 0.5669337511062622, "step": 1710 }, { "epoch": 2.219302514193025, "grad_norm": 0.6182539463043213, "learning_rate": 1.7463762228745728e-06, "loss": 0.5660184621810913, "step": 1711 }, { "epoch": 2.2206001622060016, "grad_norm": 0.8579080700874329, "learning_rate": 1.7409506609819648e-06, "loss": 0.5399761199951172, "step": 1712 }, { "epoch": 2.221897810218978, "grad_norm": 0.5763684511184692, "learning_rate": 1.735531763020376e-06, "loss": 0.5553586483001709, "step": 1713 }, { "epoch": 2.2231954582319546, "grad_norm": 0.5882948040962219, "learning_rate": 1.7301195400700815e-06, "loss": 0.5055762529373169, "step": 1714 }, { "epoch": 2.224493106244931, "grad_norm": 0.6292950510978699, "learning_rate": 1.7247140031977073e-06, "loss": 0.6296324133872986, "step": 1715 }, { "epoch": 2.2257907542579076, "grad_norm": 0.600114643573761, "learning_rate": 1.7193151634562071e-06, "loss": 0.5636775493621826, "step": 1716 }, { "epoch": 2.227088402270884, "grad_norm": 0.6101363301277161, "learning_rate": 1.7139230318848432e-06, "loss": 0.6061251163482666, "step": 1717 }, { "epoch": 2.2283860502838606, "grad_norm": 0.6061044335365295, "learning_rate": 1.7085376195091591e-06, "loss": 0.6004489660263062, "step": 1718 }, { "epoch": 2.229683698296837, "grad_norm": 0.6100283265113831, "learning_rate": 1.7031589373409596e-06, "loss": 0.571765661239624, "step": 1719 }, { "epoch": 2.2309813463098136, "grad_norm": 0.6510441303253174, "learning_rate": 1.6977869963782895e-06, "loss": 0.5853846073150635, "step": 1720 }, { "epoch": 2.23227899432279, "grad_norm": 0.635164201259613, "learning_rate": 1.6924218076054095e-06, "loss": 0.6079269647598267, "step": 1721 }, { "epoch": 2.2335766423357666, "grad_norm": 0.641042172908783, "learning_rate": 1.6870633819927672e-06, "loss": 0.7038273811340332, "step": 1722 }, { "epoch": 2.234874290348743, "grad_norm": 0.621701717376709, "learning_rate": 1.6817117304969944e-06, "loss": 0.5776299238204956, "step": 1723 }, { "epoch": 2.2361719383617196, "grad_norm": 0.5985130071640015, "learning_rate": 1.676366864060856e-06, "loss": 0.5792907476425171, "step": 1724 }, { "epoch": 2.237469586374696, "grad_norm": 0.6016199588775635, "learning_rate": 1.6710287936132592e-06, "loss": 0.518044650554657, "step": 1725 }, { "epoch": 2.238767234387672, "grad_norm": 0.5871309041976929, "learning_rate": 1.6656975300692008e-06, "loss": 0.5443193316459656, "step": 1726 }, { "epoch": 2.240064882400649, "grad_norm": 0.6189736723899841, "learning_rate": 1.660373084329767e-06, "loss": 0.6327258944511414, "step": 1727 }, { "epoch": 2.241362530413625, "grad_norm": 0.6076374650001526, "learning_rate": 1.6550554672821028e-06, "loss": 0.5638880729675293, "step": 1728 }, { "epoch": 2.242660178426602, "grad_norm": 0.6918789744377136, "learning_rate": 1.6497446897993885e-06, "loss": 0.6006242632865906, "step": 1729 }, { "epoch": 2.243957826439578, "grad_norm": 0.636972188949585, "learning_rate": 1.6444407627408194e-06, "loss": 0.5908925533294678, "step": 1730 }, { "epoch": 2.245255474452555, "grad_norm": 0.6132383942604065, "learning_rate": 1.639143696951586e-06, "loss": 0.5603156089782715, "step": 1731 }, { "epoch": 2.246553122465531, "grad_norm": 0.6161746382713318, "learning_rate": 1.6338535032628427e-06, "loss": 0.5923026204109192, "step": 1732 }, { "epoch": 2.247850770478508, "grad_norm": 0.6077067255973816, "learning_rate": 1.6285701924917025e-06, "loss": 0.5932834148406982, "step": 1733 }, { "epoch": 2.249148418491484, "grad_norm": 0.6137869954109192, "learning_rate": 1.6232937754411938e-06, "loss": 0.5695690512657166, "step": 1734 }, { "epoch": 2.2504460665044608, "grad_norm": 0.5874996185302734, "learning_rate": 1.6180242629002558e-06, "loss": 0.5515947341918945, "step": 1735 }, { "epoch": 2.251743714517437, "grad_norm": 0.5972124934196472, "learning_rate": 1.6127616656437078e-06, "loss": 0.6108847260475159, "step": 1736 }, { "epoch": 2.2530413625304138, "grad_norm": 0.6362358927726746, "learning_rate": 1.6075059944322297e-06, "loss": 0.5956808924674988, "step": 1737 }, { "epoch": 2.25433901054339, "grad_norm": 0.6626409888267517, "learning_rate": 1.6022572600123382e-06, "loss": 0.5291856527328491, "step": 1738 }, { "epoch": 2.2556366585563667, "grad_norm": 0.6440781354904175, "learning_rate": 1.5970154731163667e-06, "loss": 0.6244629621505737, "step": 1739 }, { "epoch": 2.256934306569343, "grad_norm": 0.598318874835968, "learning_rate": 1.5917806444624434e-06, "loss": 0.5915838479995728, "step": 1740 }, { "epoch": 2.2582319545823197, "grad_norm": 0.6128567457199097, "learning_rate": 1.5865527847544692e-06, "loss": 0.5356861352920532, "step": 1741 }, { "epoch": 2.259529602595296, "grad_norm": 0.6078605651855469, "learning_rate": 1.581331904682089e-06, "loss": 0.5974371433258057, "step": 1742 }, { "epoch": 2.2608272506082727, "grad_norm": 0.6011683344841003, "learning_rate": 1.576118014920688e-06, "loss": 0.5702426433563232, "step": 1743 }, { "epoch": 2.262124898621249, "grad_norm": 0.6138583421707153, "learning_rate": 1.5709111261313454e-06, "loss": 0.6526232361793518, "step": 1744 }, { "epoch": 2.2634225466342253, "grad_norm": 0.5757991075515747, "learning_rate": 1.5657112489608316e-06, "loss": 0.5384607315063477, "step": 1745 }, { "epoch": 2.264720194647202, "grad_norm": 0.5720049142837524, "learning_rate": 1.5605183940415842e-06, "loss": 0.5239338278770447, "step": 1746 }, { "epoch": 2.2660178426601782, "grad_norm": 0.6321298480033875, "learning_rate": 1.5553325719916717e-06, "loss": 0.5788372159004211, "step": 1747 }, { "epoch": 2.267315490673155, "grad_norm": 0.6393312215805054, "learning_rate": 1.5501537934147897e-06, "loss": 0.6262606978416443, "step": 1748 }, { "epoch": 2.2686131386861312, "grad_norm": 0.5900475978851318, "learning_rate": 1.5449820689002298e-06, "loss": 0.5757325887680054, "step": 1749 }, { "epoch": 2.269910786699108, "grad_norm": 0.9620885848999023, "learning_rate": 1.5398174090228595e-06, "loss": 0.5125218629837036, "step": 1750 }, { "epoch": 2.2712084347120842, "grad_norm": 0.611209511756897, "learning_rate": 1.534659824343101e-06, "loss": 0.5692592859268188, "step": 1751 }, { "epoch": 2.272506082725061, "grad_norm": 0.5884798169136047, "learning_rate": 1.5295093254069093e-06, "loss": 0.561367392539978, "step": 1752 }, { "epoch": 2.273803730738037, "grad_norm": 0.9114322662353516, "learning_rate": 1.524365922745752e-06, "loss": 0.5305287837982178, "step": 1753 }, { "epoch": 2.275101378751014, "grad_norm": 0.615672767162323, "learning_rate": 1.519229626876586e-06, "loss": 0.5678682923316956, "step": 1754 }, { "epoch": 2.27639902676399, "grad_norm": 0.596740186214447, "learning_rate": 1.5141004483018323e-06, "loss": 0.562171995639801, "step": 1755 }, { "epoch": 2.277696674776967, "grad_norm": 0.5916588306427002, "learning_rate": 1.5089783975093698e-06, "loss": 0.5581475496292114, "step": 1756 }, { "epoch": 2.278994322789943, "grad_norm": 0.5962932705879211, "learning_rate": 1.5038634849724898e-06, "loss": 0.5466150045394897, "step": 1757 }, { "epoch": 2.28029197080292, "grad_norm": 0.6148486137390137, "learning_rate": 1.4987557211498966e-06, "loss": 0.562313973903656, "step": 1758 }, { "epoch": 2.281589618815896, "grad_norm": 0.6286764740943909, "learning_rate": 1.4936551164856739e-06, "loss": 0.585920512676239, "step": 1759 }, { "epoch": 2.2828872668288724, "grad_norm": 0.5897656679153442, "learning_rate": 1.4885616814092673e-06, "loss": 0.5238120555877686, "step": 1760 }, { "epoch": 2.284184914841849, "grad_norm": 0.6284115314483643, "learning_rate": 1.4834754263354628e-06, "loss": 0.6318528652191162, "step": 1761 }, { "epoch": 2.285482562854826, "grad_norm": 0.6478753685951233, "learning_rate": 1.4783963616643654e-06, "loss": 0.6090703010559082, "step": 1762 }, { "epoch": 2.286780210867802, "grad_norm": 0.6065962314605713, "learning_rate": 1.4733244977813726e-06, "loss": 0.6407983303070068, "step": 1763 }, { "epoch": 2.2880778588807784, "grad_norm": 0.6130876541137695, "learning_rate": 1.468259845057169e-06, "loss": 0.5580013990402222, "step": 1764 }, { "epoch": 2.289375506893755, "grad_norm": 0.5975884199142456, "learning_rate": 1.4632024138476803e-06, "loss": 0.5697616338729858, "step": 1765 }, { "epoch": 2.2906731549067314, "grad_norm": 0.6038120985031128, "learning_rate": 1.4581522144940802e-06, "loss": 0.5938565731048584, "step": 1766 }, { "epoch": 2.291970802919708, "grad_norm": 0.6311531066894531, "learning_rate": 1.4531092573227434e-06, "loss": 0.5615339875221252, "step": 1767 }, { "epoch": 2.2932684509326844, "grad_norm": 0.7556526064872742, "learning_rate": 1.4480735526452427e-06, "loss": 0.6018495559692383, "step": 1768 }, { "epoch": 2.294566098945661, "grad_norm": 0.5966140627861023, "learning_rate": 1.4430451107583187e-06, "loss": 0.5482977628707886, "step": 1769 }, { "epoch": 2.2958637469586374, "grad_norm": 0.6495786309242249, "learning_rate": 1.4380239419438636e-06, "loss": 0.6411464810371399, "step": 1770 }, { "epoch": 2.297161394971614, "grad_norm": 0.6380159258842468, "learning_rate": 1.433010056468896e-06, "loss": 0.585355281829834, "step": 1771 }, { "epoch": 2.2984590429845904, "grad_norm": 0.6330418586730957, "learning_rate": 1.4280034645855429e-06, "loss": 0.6234038472175598, "step": 1772 }, { "epoch": 2.299756690997567, "grad_norm": 0.6164976358413696, "learning_rate": 1.4230041765310171e-06, "loss": 0.6070310473442078, "step": 1773 }, { "epoch": 2.3010543390105433, "grad_norm": 0.5887792706489563, "learning_rate": 1.4180122025275972e-06, "loss": 0.49864742159843445, "step": 1774 }, { "epoch": 2.30235198702352, "grad_norm": 0.5811118483543396, "learning_rate": 1.4130275527826077e-06, "loss": 0.6116331815719604, "step": 1775 }, { "epoch": 2.3036496350364963, "grad_norm": 0.6207771897315979, "learning_rate": 1.4080502374883947e-06, "loss": 0.6092080473899841, "step": 1776 }, { "epoch": 2.304947283049473, "grad_norm": 0.6548723578453064, "learning_rate": 1.4030802668223097e-06, "loss": 0.5866458415985107, "step": 1777 }, { "epoch": 2.3062449310624493, "grad_norm": 0.6081016659736633, "learning_rate": 1.398117650946681e-06, "loss": 0.5727241039276123, "step": 1778 }, { "epoch": 2.3075425790754256, "grad_norm": 0.6203863024711609, "learning_rate": 1.3931624000088073e-06, "loss": 0.5507431030273438, "step": 1779 }, { "epoch": 2.3088402270884023, "grad_norm": 0.61940598487854, "learning_rate": 1.3882145241409184e-06, "loss": 0.6124242544174194, "step": 1780 }, { "epoch": 2.3101378751013786, "grad_norm": 0.6263229250907898, "learning_rate": 1.3832740334601692e-06, "loss": 0.6032424569129944, "step": 1781 }, { "epoch": 2.3114355231143553, "grad_norm": 0.5885775685310364, "learning_rate": 1.3783409380686135e-06, "loss": 0.5357505083084106, "step": 1782 }, { "epoch": 2.3127331711273316, "grad_norm": 0.6165185570716858, "learning_rate": 1.3734152480531821e-06, "loss": 0.6190866231918335, "step": 1783 }, { "epoch": 2.3140308191403083, "grad_norm": 0.6044617295265198, "learning_rate": 1.3684969734856646e-06, "loss": 0.5655971765518188, "step": 1784 }, { "epoch": 2.3153284671532846, "grad_norm": 0.6254231929779053, "learning_rate": 1.363586124422689e-06, "loss": 0.5936893224716187, "step": 1785 }, { "epoch": 2.3166261151662613, "grad_norm": 0.6128689646720886, "learning_rate": 1.3586827109056944e-06, "loss": 0.5749369263648987, "step": 1786 }, { "epoch": 2.3179237631792375, "grad_norm": 0.6421996355056763, "learning_rate": 1.3537867429609263e-06, "loss": 0.5559523105621338, "step": 1787 }, { "epoch": 2.3192214111922143, "grad_norm": 0.6680915951728821, "learning_rate": 1.3488982305993942e-06, "loss": 0.5511724352836609, "step": 1788 }, { "epoch": 2.3205190592051905, "grad_norm": 0.6443539261817932, "learning_rate": 1.3440171838168743e-06, "loss": 0.5881825089454651, "step": 1789 }, { "epoch": 2.3218167072181672, "grad_norm": 0.612708568572998, "learning_rate": 1.3391436125938673e-06, "loss": 0.5950250625610352, "step": 1790 }, { "epoch": 2.3231143552311435, "grad_norm": 0.595385730266571, "learning_rate": 1.3342775268955943e-06, "loss": 0.5954742431640625, "step": 1791 }, { "epoch": 2.3244120032441202, "grad_norm": 0.6444376111030579, "learning_rate": 1.329418936671969e-06, "loss": 0.5775749087333679, "step": 1792 }, { "epoch": 2.3257096512570965, "grad_norm": 0.6064639687538147, "learning_rate": 1.3245678518575782e-06, "loss": 0.5845799446105957, "step": 1793 }, { "epoch": 2.3270072992700728, "grad_norm": 0.6051777601242065, "learning_rate": 1.319724282371664e-06, "loss": 0.5920668840408325, "step": 1794 }, { "epoch": 2.3283049472830495, "grad_norm": 0.6336135268211365, "learning_rate": 1.3148882381181e-06, "loss": 0.562667965888977, "step": 1795 }, { "epoch": 2.329602595296026, "grad_norm": 0.6154525876045227, "learning_rate": 1.3100597289853689e-06, "loss": 0.5847402811050415, "step": 1796 }, { "epoch": 2.3309002433090025, "grad_norm": 0.6278738379478455, "learning_rate": 1.3052387648465559e-06, "loss": 0.6408085823059082, "step": 1797 }, { "epoch": 2.3321978913219787, "grad_norm": 0.6477576494216919, "learning_rate": 1.3004253555593071e-06, "loss": 0.5616024732589722, "step": 1798 }, { "epoch": 2.3334955393349555, "grad_norm": 0.6108107566833496, "learning_rate": 1.2956195109658287e-06, "loss": 0.5326311588287354, "step": 1799 }, { "epoch": 2.3347931873479317, "grad_norm": 1.3087694644927979, "learning_rate": 1.2908212408928561e-06, "loss": 0.6685813069343567, "step": 1800 }, { "epoch": 2.3360908353609084, "grad_norm": 0.592055082321167, "learning_rate": 1.2860305551516355e-06, "loss": 0.6329461932182312, "step": 1801 }, { "epoch": 2.3373884833738847, "grad_norm": 0.6386983394622803, "learning_rate": 1.281247463537912e-06, "loss": 0.5208531618118286, "step": 1802 }, { "epoch": 2.3386861313868614, "grad_norm": 0.6252365112304688, "learning_rate": 1.276471975831891e-06, "loss": 0.5943001508712769, "step": 1803 }, { "epoch": 2.3399837793998377, "grad_norm": 0.6460595726966858, "learning_rate": 1.2717041017982396e-06, "loss": 0.6217683553695679, "step": 1804 }, { "epoch": 2.3412814274128144, "grad_norm": 0.6099584698677063, "learning_rate": 1.2669438511860527e-06, "loss": 0.5706977844238281, "step": 1805 }, { "epoch": 2.3425790754257907, "grad_norm": 0.6689403653144836, "learning_rate": 1.2621912337288372e-06, "loss": 0.551365077495575, "step": 1806 }, { "epoch": 2.3438767234387674, "grad_norm": 0.606182873249054, "learning_rate": 1.257446259144494e-06, "loss": 0.5419661998748779, "step": 1807 }, { "epoch": 2.3451743714517437, "grad_norm": 0.5901670455932617, "learning_rate": 1.2527089371352968e-06, "loss": 0.5732494592666626, "step": 1808 }, { "epoch": 2.34647201946472, "grad_norm": 0.6110414862632751, "learning_rate": 1.2479792773878647e-06, "loss": 0.6051602363586426, "step": 1809 }, { "epoch": 2.3477696674776967, "grad_norm": 0.6416681408882141, "learning_rate": 1.243257289573161e-06, "loss": 0.593826949596405, "step": 1810 }, { "epoch": 2.3490673154906734, "grad_norm": 0.6288197636604309, "learning_rate": 1.2385429833464513e-06, "loss": 0.5421499609947205, "step": 1811 }, { "epoch": 2.3503649635036497, "grad_norm": 0.6199961304664612, "learning_rate": 1.2338363683472998e-06, "loss": 0.5908663868904114, "step": 1812 }, { "epoch": 2.351662611516626, "grad_norm": 0.6229044198989868, "learning_rate": 1.2291374541995437e-06, "loss": 0.5933829545974731, "step": 1813 }, { "epoch": 2.3529602595296026, "grad_norm": 0.6609744429588318, "learning_rate": 1.224446250511272e-06, "loss": 0.594125509262085, "step": 1814 }, { "epoch": 2.354257907542579, "grad_norm": 0.6363682150840759, "learning_rate": 1.2197627668748101e-06, "loss": 0.5930228233337402, "step": 1815 }, { "epoch": 2.3555555555555556, "grad_norm": 0.6157255172729492, "learning_rate": 1.2150870128666959e-06, "loss": 0.5634854435920715, "step": 1816 }, { "epoch": 2.356853203568532, "grad_norm": 0.6403535604476929, "learning_rate": 1.2104189980476627e-06, "loss": 0.5946694612503052, "step": 1817 }, { "epoch": 2.3581508515815086, "grad_norm": 0.6029789447784424, "learning_rate": 1.2057587319626213e-06, "loss": 0.5258057713508606, "step": 1818 }, { "epoch": 2.359448499594485, "grad_norm": 0.6252802014350891, "learning_rate": 1.2011062241406313e-06, "loss": 0.5830211639404297, "step": 1819 }, { "epoch": 2.3607461476074616, "grad_norm": 0.608201801776886, "learning_rate": 1.1964614840949002e-06, "loss": 0.6013060212135315, "step": 1820 }, { "epoch": 2.362043795620438, "grad_norm": 0.6110815405845642, "learning_rate": 1.1918245213227408e-06, "loss": 0.576073169708252, "step": 1821 }, { "epoch": 2.3633414436334146, "grad_norm": 0.605087161064148, "learning_rate": 1.1871953453055707e-06, "loss": 0.6136230826377869, "step": 1822 }, { "epoch": 2.364639091646391, "grad_norm": 0.6053324341773987, "learning_rate": 1.182573965508882e-06, "loss": 0.5785141587257385, "step": 1823 }, { "epoch": 2.3659367396593676, "grad_norm": 0.6085898876190186, "learning_rate": 1.1779603913822274e-06, "loss": 0.5601797103881836, "step": 1824 }, { "epoch": 2.367234387672344, "grad_norm": 0.6608554124832153, "learning_rate": 1.1733546323591981e-06, "loss": 0.5785682797431946, "step": 1825 }, { "epoch": 2.3685320356853206, "grad_norm": 0.6056334972381592, "learning_rate": 1.168756697857406e-06, "loss": 0.5939031839370728, "step": 1826 }, { "epoch": 2.369829683698297, "grad_norm": 0.6553589105606079, "learning_rate": 1.1641665972784628e-06, "loss": 0.6532239317893982, "step": 1827 }, { "epoch": 2.371127331711273, "grad_norm": 0.6094745397567749, "learning_rate": 1.1595843400079636e-06, "loss": 0.5682094097137451, "step": 1828 }, { "epoch": 2.37242497972425, "grad_norm": 0.623717188835144, "learning_rate": 1.1550099354154615e-06, "loss": 0.6046154499053955, "step": 1829 }, { "epoch": 2.373722627737226, "grad_norm": 0.631445050239563, "learning_rate": 1.1504433928544594e-06, "loss": 0.6053498387336731, "step": 1830 }, { "epoch": 2.375020275750203, "grad_norm": 0.6280617117881775, "learning_rate": 1.1458847216623813e-06, "loss": 0.5817880630493164, "step": 1831 }, { "epoch": 2.376317923763179, "grad_norm": 0.6309313178062439, "learning_rate": 1.141333931160552e-06, "loss": 0.6206140518188477, "step": 1832 }, { "epoch": 2.377615571776156, "grad_norm": 0.6384704113006592, "learning_rate": 1.1367910306541918e-06, "loss": 0.6599752306938171, "step": 1833 }, { "epoch": 2.378913219789132, "grad_norm": 0.6254469752311707, "learning_rate": 1.1322560294323775e-06, "loss": 0.5889034271240234, "step": 1834 }, { "epoch": 2.3802108678021088, "grad_norm": 0.6390111446380615, "learning_rate": 1.1277289367680411e-06, "loss": 0.6020563840866089, "step": 1835 }, { "epoch": 2.381508515815085, "grad_norm": 0.6277632117271423, "learning_rate": 1.123209761917941e-06, "loss": 0.5417424440383911, "step": 1836 }, { "epoch": 2.3828061638280618, "grad_norm": 0.6135120987892151, "learning_rate": 1.1186985141226458e-06, "loss": 0.5558514595031738, "step": 1837 }, { "epoch": 2.384103811841038, "grad_norm": 0.6234643459320068, "learning_rate": 1.1141952026065156e-06, "loss": 0.6145384311676025, "step": 1838 }, { "epoch": 2.3854014598540147, "grad_norm": 0.6055371165275574, "learning_rate": 1.1096998365776828e-06, "loss": 0.5748616456985474, "step": 1839 }, { "epoch": 2.386699107866991, "grad_norm": 0.6127825379371643, "learning_rate": 1.1052124252280322e-06, "loss": 0.5389982461929321, "step": 1840 }, { "epoch": 2.386699107866991, "eval_loss": 0.6825700998306274, "eval_runtime": 72.9215, "eval_samples_per_second": 71.2, "eval_steps_per_second": 8.9, "step": 1840 }, { "epoch": 2.3879967558799677, "grad_norm": 0.6031513214111328, "learning_rate": 1.1007329777331866e-06, "loss": 0.5840494632720947, "step": 1841 }, { "epoch": 2.389294403892944, "grad_norm": 0.63483726978302, "learning_rate": 1.096261503252478e-06, "loss": 0.5311962366104126, "step": 1842 }, { "epoch": 2.3905920519059203, "grad_norm": 0.6125195622444153, "learning_rate": 1.0917980109289455e-06, "loss": 0.5285024046897888, "step": 1843 }, { "epoch": 2.391889699918897, "grad_norm": 0.5990893244743347, "learning_rate": 1.0873425098892964e-06, "loss": 0.5493112802505493, "step": 1844 }, { "epoch": 2.3931873479318737, "grad_norm": 0.6030960083007812, "learning_rate": 1.082895009243905e-06, "loss": 0.5796130895614624, "step": 1845 }, { "epoch": 2.39448499594485, "grad_norm": 0.6366276741027832, "learning_rate": 1.078455518086784e-06, "loss": 0.5433975458145142, "step": 1846 }, { "epoch": 2.3957826439578263, "grad_norm": 0.5901277661323547, "learning_rate": 1.0740240454955692e-06, "loss": 0.5538575649261475, "step": 1847 }, { "epoch": 2.397080291970803, "grad_norm": 0.6165037155151367, "learning_rate": 1.0696006005314996e-06, "loss": 0.5998971462249756, "step": 1848 }, { "epoch": 2.3983779399837792, "grad_norm": 0.6113094091415405, "learning_rate": 1.0651851922394035e-06, "loss": 0.570077121257782, "step": 1849 }, { "epoch": 2.399675587996756, "grad_norm": 0.6432837247848511, "learning_rate": 1.0607778296476679e-06, "loss": 0.6083425283432007, "step": 1850 }, { "epoch": 2.4009732360097322, "grad_norm": 0.5917057394981384, "learning_rate": 1.05637852176824e-06, "loss": 0.5251022577285767, "step": 1851 }, { "epoch": 2.402270884022709, "grad_norm": 0.6266626119613647, "learning_rate": 1.051987277596585e-06, "loss": 0.5856255292892456, "step": 1852 }, { "epoch": 2.403568532035685, "grad_norm": 0.610355019569397, "learning_rate": 1.0476041061116915e-06, "loss": 0.6004334688186646, "step": 1853 }, { "epoch": 2.404866180048662, "grad_norm": 0.5825424790382385, "learning_rate": 1.0432290162760311e-06, "loss": 0.5548322796821594, "step": 1854 }, { "epoch": 2.406163828061638, "grad_norm": 0.6335608959197998, "learning_rate": 1.038862017035558e-06, "loss": 0.5934311747550964, "step": 1855 }, { "epoch": 2.407461476074615, "grad_norm": 0.6018176078796387, "learning_rate": 1.0345031173196785e-06, "loss": 0.5377739071846008, "step": 1856 }, { "epoch": 2.408759124087591, "grad_norm": 0.6398853659629822, "learning_rate": 1.0301523260412405e-06, "loss": 0.6047654151916504, "step": 1857 }, { "epoch": 2.410056772100568, "grad_norm": 0.6761499643325806, "learning_rate": 1.025809652096511e-06, "loss": 0.6525087356567383, "step": 1858 }, { "epoch": 2.411354420113544, "grad_norm": 0.5981181859970093, "learning_rate": 1.0214751043651582e-06, "loss": 0.5705087184906006, "step": 1859 }, { "epoch": 2.412652068126521, "grad_norm": 0.6022308468818665, "learning_rate": 1.0171486917102357e-06, "loss": 0.5528420209884644, "step": 1860 }, { "epoch": 2.413949716139497, "grad_norm": 0.576118528842926, "learning_rate": 1.0128304229781622e-06, "loss": 0.572098970413208, "step": 1861 }, { "epoch": 2.4152473641524734, "grad_norm": 0.6066587567329407, "learning_rate": 1.008520306998706e-06, "loss": 0.5568013787269592, "step": 1862 }, { "epoch": 2.41654501216545, "grad_norm": 0.7212052345275879, "learning_rate": 1.0042183525849586e-06, "loss": 0.5123892426490784, "step": 1863 }, { "epoch": 2.4178426601784264, "grad_norm": 0.5919977426528931, "learning_rate": 9.999245685333342e-07, "loss": 0.5277501344680786, "step": 1864 }, { "epoch": 2.419140308191403, "grad_norm": 0.5896833539009094, "learning_rate": 9.95638963623528e-07, "loss": 0.5733782649040222, "step": 1865 }, { "epoch": 2.4204379562043794, "grad_norm": 0.6342105269432068, "learning_rate": 9.913615466185234e-07, "loss": 0.6013584136962891, "step": 1866 }, { "epoch": 2.421735604217356, "grad_norm": 0.5951900482177734, "learning_rate": 9.870923262645516e-07, "loss": 0.5315797328948975, "step": 1867 }, { "epoch": 2.4230332522303324, "grad_norm": 0.6201072931289673, "learning_rate": 9.828313112910887e-07, "loss": 0.5741020441055298, "step": 1868 }, { "epoch": 2.424330900243309, "grad_norm": 0.6206340193748474, "learning_rate": 9.78578510410832e-07, "loss": 0.5911818146705627, "step": 1869 }, { "epoch": 2.4256285482562854, "grad_norm": 0.6191825270652771, "learning_rate": 9.743339323196827e-07, "loss": 0.5818160772323608, "step": 1870 }, { "epoch": 2.426926196269262, "grad_norm": 0.6224012970924377, "learning_rate": 9.700975856967287e-07, "loss": 0.5667495727539062, "step": 1871 }, { "epoch": 2.4282238442822384, "grad_norm": 0.622602105140686, "learning_rate": 9.658694792042284e-07, "loss": 0.5867684483528137, "step": 1872 }, { "epoch": 2.429521492295215, "grad_norm": 0.6468759179115295, "learning_rate": 9.616496214875847e-07, "loss": 0.5605747699737549, "step": 1873 }, { "epoch": 2.4308191403081914, "grad_norm": 0.6025612950325012, "learning_rate": 9.574380211753442e-07, "loss": 0.5322221517562866, "step": 1874 }, { "epoch": 2.432116788321168, "grad_norm": 0.601256251335144, "learning_rate": 9.532346868791587e-07, "loss": 0.6136845350265503, "step": 1875 }, { "epoch": 2.4334144363341443, "grad_norm": 0.6094178557395935, "learning_rate": 9.490396271937879e-07, "loss": 0.6157099604606628, "step": 1876 }, { "epoch": 2.4347120843471206, "grad_norm": 0.6287171244621277, "learning_rate": 9.448528506970628e-07, "loss": 0.5530134439468384, "step": 1877 }, { "epoch": 2.4360097323600973, "grad_norm": 0.5963685512542725, "learning_rate": 9.406743659498829e-07, "loss": 0.5840374827384949, "step": 1878 }, { "epoch": 2.437307380373074, "grad_norm": 0.6349402070045471, "learning_rate": 9.365041814961928e-07, "loss": 0.5503448843955994, "step": 1879 }, { "epoch": 2.4386050283860503, "grad_norm": 0.6072769165039062, "learning_rate": 9.323423058629638e-07, "loss": 0.5658475756645203, "step": 1880 }, { "epoch": 2.4399026763990266, "grad_norm": 0.6268115043640137, "learning_rate": 9.281887475601775e-07, "loss": 0.6097016334533691, "step": 1881 }, { "epoch": 2.4412003244120033, "grad_norm": 0.5882371664047241, "learning_rate": 9.240435150808113e-07, "loss": 0.5780482292175293, "step": 1882 }, { "epoch": 2.4424979724249796, "grad_norm": 0.6373420357704163, "learning_rate": 9.19906616900813e-07, "loss": 0.6226140260696411, "step": 1883 }, { "epoch": 2.4437956204379563, "grad_norm": 0.6072852611541748, "learning_rate": 9.157780614790963e-07, "loss": 0.5743207335472107, "step": 1884 }, { "epoch": 2.4450932684509326, "grad_norm": 0.634705126285553, "learning_rate": 9.116578572575091e-07, "loss": 0.6267349720001221, "step": 1885 }, { "epoch": 2.4463909164639093, "grad_norm": 0.6120656132698059, "learning_rate": 9.075460126608271e-07, "loss": 0.6176955699920654, "step": 1886 }, { "epoch": 2.4476885644768855, "grad_norm": 0.5967820882797241, "learning_rate": 9.034425360967319e-07, "loss": 0.6183077096939087, "step": 1887 }, { "epoch": 2.4489862124898623, "grad_norm": 0.5987744331359863, "learning_rate": 8.993474359557936e-07, "loss": 0.5591214895248413, "step": 1888 }, { "epoch": 2.4502838605028385, "grad_norm": 0.6169969439506531, "learning_rate": 8.952607206114588e-07, "loss": 0.5904876589775085, "step": 1889 }, { "epoch": 2.4515815085158152, "grad_norm": 0.6008497476577759, "learning_rate": 8.911823984200219e-07, "loss": 0.5758087635040283, "step": 1890 }, { "epoch": 2.4528791565287915, "grad_norm": 0.6111242175102234, "learning_rate": 8.871124777206213e-07, "loss": 0.6324316263198853, "step": 1891 }, { "epoch": 2.4541768045417682, "grad_norm": 0.638118326663971, "learning_rate": 8.83050966835215e-07, "loss": 0.5944634079933167, "step": 1892 }, { "epoch": 2.4554744525547445, "grad_norm": 0.6154019832611084, "learning_rate": 8.789978740685646e-07, "loss": 0.5495239496231079, "step": 1893 }, { "epoch": 2.456772100567721, "grad_norm": 0.618356466293335, "learning_rate": 8.749532077082179e-07, "loss": 0.5651803016662598, "step": 1894 }, { "epoch": 2.4580697485806975, "grad_norm": 0.6217320561408997, "learning_rate": 8.709169760244968e-07, "loss": 0.6198887825012207, "step": 1895 }, { "epoch": 2.4593673965936738, "grad_norm": 0.6045297384262085, "learning_rate": 8.668891872704682e-07, "loss": 0.5438726544380188, "step": 1896 }, { "epoch": 2.4606650446066505, "grad_norm": 0.614281952381134, "learning_rate": 8.628698496819471e-07, "loss": 0.5607205629348755, "step": 1897 }, { "epoch": 2.4619626926196267, "grad_norm": 0.5984881520271301, "learning_rate": 8.58858971477457e-07, "loss": 0.6331669688224792, "step": 1898 }, { "epoch": 2.4632603406326035, "grad_norm": 0.6256738901138306, "learning_rate": 8.548565608582299e-07, "loss": 0.5844709873199463, "step": 1899 }, { "epoch": 2.4645579886455797, "grad_norm": 0.5857892036437988, "learning_rate": 8.508626260081826e-07, "loss": 0.5776396989822388, "step": 1900 }, { "epoch": 2.4658556366585564, "grad_norm": 0.6575695872306824, "learning_rate": 8.468771750939009e-07, "loss": 0.5862407684326172, "step": 1901 }, { "epoch": 2.4671532846715327, "grad_norm": 0.5867515206336975, "learning_rate": 8.429002162646233e-07, "loss": 0.5810645222663879, "step": 1902 }, { "epoch": 2.4684509326845094, "grad_norm": 0.6347371935844421, "learning_rate": 8.389317576522243e-07, "loss": 0.6229629516601562, "step": 1903 }, { "epoch": 2.4697485806974857, "grad_norm": 0.604457676410675, "learning_rate": 8.349718073711971e-07, "loss": 0.5473800897598267, "step": 1904 }, { "epoch": 2.4710462287104624, "grad_norm": 0.6130659580230713, "learning_rate": 8.310203735186384e-07, "loss": 0.6687853932380676, "step": 1905 }, { "epoch": 2.4723438767234387, "grad_norm": 0.6164904236793518, "learning_rate": 8.270774641742275e-07, "loss": 0.6242067217826843, "step": 1906 }, { "epoch": 2.4736415247364154, "grad_norm": 0.64787358045578, "learning_rate": 8.231430874002206e-07, "loss": 0.5970586538314819, "step": 1907 }, { "epoch": 2.4749391727493917, "grad_norm": 0.6561875939369202, "learning_rate": 8.192172512414187e-07, "loss": 0.5711146593093872, "step": 1908 }, { "epoch": 2.4762368207623684, "grad_norm": 0.6017801761627197, "learning_rate": 8.152999637251641e-07, "loss": 0.5429533123970032, "step": 1909 }, { "epoch": 2.4775344687753447, "grad_norm": 0.60152268409729, "learning_rate": 8.113912328613183e-07, "loss": 0.5184666514396667, "step": 1910 }, { "epoch": 2.478832116788321, "grad_norm": 0.598573625087738, "learning_rate": 8.074910666422475e-07, "loss": 0.5503566861152649, "step": 1911 }, { "epoch": 2.4801297648012977, "grad_norm": 0.6241352558135986, "learning_rate": 8.035994730428031e-07, "loss": 0.6021054983139038, "step": 1912 }, { "epoch": 2.4814274128142744, "grad_norm": 0.6195024251937866, "learning_rate": 7.997164600203111e-07, "loss": 0.5467978715896606, "step": 1913 }, { "epoch": 2.4827250608272506, "grad_norm": 0.6009840369224548, "learning_rate": 7.958420355145469e-07, "loss": 0.5863580703735352, "step": 1914 }, { "epoch": 2.484022708840227, "grad_norm": 0.6128111481666565, "learning_rate": 7.919762074477311e-07, "loss": 0.5403767824172974, "step": 1915 }, { "epoch": 2.4853203568532036, "grad_norm": 0.6071099042892456, "learning_rate": 7.881189837245024e-07, "loss": 0.5299487709999084, "step": 1916 }, { "epoch": 2.48661800486618, "grad_norm": 0.6704837083816528, "learning_rate": 7.842703722319073e-07, "loss": 0.6165317893028259, "step": 1917 }, { "epoch": 2.4879156528791566, "grad_norm": 0.6277005672454834, "learning_rate": 7.804303808393831e-07, "loss": 0.5439109206199646, "step": 1918 }, { "epoch": 2.489213300892133, "grad_norm": 0.6348392367362976, "learning_rate": 7.76599017398737e-07, "loss": 0.6694045662879944, "step": 1919 }, { "epoch": 2.4905109489051096, "grad_norm": 0.6145819425582886, "learning_rate": 7.727762897441421e-07, "loss": 0.550458550453186, "step": 1920 }, { "epoch": 2.491808596918086, "grad_norm": 0.61981600522995, "learning_rate": 7.689622056921053e-07, "loss": 0.594965934753418, "step": 1921 }, { "epoch": 2.4931062449310626, "grad_norm": 0.7170799374580383, "learning_rate": 7.65156773041465e-07, "loss": 0.6357606053352356, "step": 1922 }, { "epoch": 2.494403892944039, "grad_norm": 0.6079750061035156, "learning_rate": 7.613599995733667e-07, "loss": 0.5912356376647949, "step": 1923 }, { "epoch": 2.4957015409570156, "grad_norm": 0.6176713109016418, "learning_rate": 7.575718930512516e-07, "loss": 0.5135859847068787, "step": 1924 }, { "epoch": 2.496999188969992, "grad_norm": 0.6063299179077148, "learning_rate": 7.537924612208391e-07, "loss": 0.5870840549468994, "step": 1925 }, { "epoch": 2.4982968369829686, "grad_norm": 0.6175487041473389, "learning_rate": 7.500217118101106e-07, "loss": 0.5973732471466064, "step": 1926 }, { "epoch": 2.499594484995945, "grad_norm": 0.6008102893829346, "learning_rate": 7.462596525292937e-07, "loss": 0.5943004488945007, "step": 1927 }, { "epoch": 2.5008921330089215, "grad_norm": 0.6359487771987915, "learning_rate": 7.425062910708492e-07, "loss": 0.5653975009918213, "step": 1928 }, { "epoch": 2.502189781021898, "grad_norm": 0.6241583824157715, "learning_rate": 7.387616351094473e-07, "loss": 0.5532112121582031, "step": 1929 }, { "epoch": 2.503487429034874, "grad_norm": 0.6088744401931763, "learning_rate": 7.350256923019666e-07, "loss": 0.5315259695053101, "step": 1930 }, { "epoch": 2.504785077047851, "grad_norm": 0.6145752668380737, "learning_rate": 7.312984702874609e-07, "loss": 0.600688099861145, "step": 1931 }, { "epoch": 2.5060827250608275, "grad_norm": 0.6202653050422668, "learning_rate": 7.275799766871577e-07, "loss": 0.6020484566688538, "step": 1932 }, { "epoch": 2.507380373073804, "grad_norm": 0.6492214798927307, "learning_rate": 7.238702191044344e-07, "loss": 0.6212818622589111, "step": 1933 }, { "epoch": 2.50867802108678, "grad_norm": 0.5913106203079224, "learning_rate": 7.201692051248066e-07, "loss": 0.5435472726821899, "step": 1934 }, { "epoch": 2.509975669099757, "grad_norm": 0.6050302982330322, "learning_rate": 7.164769423159113e-07, "loss": 0.6042004823684692, "step": 1935 }, { "epoch": 2.511273317112733, "grad_norm": 0.6316038966178894, "learning_rate": 7.127934382274926e-07, "loss": 0.558472752571106, "step": 1936 }, { "epoch": 2.5125709651257098, "grad_norm": 0.6041384339332581, "learning_rate": 7.091187003913802e-07, "loss": 0.6053918600082397, "step": 1937 }, { "epoch": 2.513868613138686, "grad_norm": 0.6338528394699097, "learning_rate": 7.054527363214875e-07, "loss": 0.5851538777351379, "step": 1938 }, { "epoch": 2.5151662611516628, "grad_norm": 0.7164930105209351, "learning_rate": 7.017955535137788e-07, "loss": 0.5775594115257263, "step": 1939 }, { "epoch": 2.516463909164639, "grad_norm": 0.9809231758117676, "learning_rate": 6.981471594462719e-07, "loss": 0.6198115348815918, "step": 1940 }, { "epoch": 2.5177615571776153, "grad_norm": 0.6024364829063416, "learning_rate": 6.945075615790059e-07, "loss": 0.5934704542160034, "step": 1941 }, { "epoch": 2.519059205190592, "grad_norm": 0.6212522387504578, "learning_rate": 6.908767673540384e-07, "loss": 0.6180324554443359, "step": 1942 }, { "epoch": 2.5203568532035687, "grad_norm": 0.6258326172828674, "learning_rate": 6.872547841954241e-07, "loss": 0.5982950925827026, "step": 1943 }, { "epoch": 2.521654501216545, "grad_norm": 0.6158891320228577, "learning_rate": 6.836416195092021e-07, "loss": 0.5860976576805115, "step": 1944 }, { "epoch": 2.5229521492295213, "grad_norm": 0.6238812208175659, "learning_rate": 6.800372806833799e-07, "loss": 0.5936440229415894, "step": 1945 }, { "epoch": 2.524249797242498, "grad_norm": 0.5862494111061096, "learning_rate": 6.764417750879182e-07, "loss": 0.5802135467529297, "step": 1946 }, { "epoch": 2.5255474452554747, "grad_norm": 0.6118647456169128, "learning_rate": 6.728551100747155e-07, "loss": 0.5778954029083252, "step": 1947 }, { "epoch": 2.526845093268451, "grad_norm": 0.6207137703895569, "learning_rate": 6.692772929775943e-07, "loss": 0.6226284503936768, "step": 1948 }, { "epoch": 2.5281427412814272, "grad_norm": 0.6094867587089539, "learning_rate": 6.657083311122858e-07, "loss": 0.5938500761985779, "step": 1949 }, { "epoch": 2.529440389294404, "grad_norm": 0.6266283988952637, "learning_rate": 6.621482317764105e-07, "loss": 0.5501142740249634, "step": 1950 }, { "epoch": 2.5307380373073802, "grad_norm": 0.6360139846801758, "learning_rate": 6.585970022494748e-07, "loss": 0.6632074117660522, "step": 1951 }, { "epoch": 2.532035685320357, "grad_norm": 0.6052773594856262, "learning_rate": 6.550546497928401e-07, "loss": 0.5711944103240967, "step": 1952 }, { "epoch": 2.533333333333333, "grad_norm": 0.6809741258621216, "learning_rate": 6.515211816497247e-07, "loss": 0.5731922388076782, "step": 1953 }, { "epoch": 2.53463098134631, "grad_norm": 0.6013851761817932, "learning_rate": 6.479966050451736e-07, "loss": 0.572198748588562, "step": 1954 }, { "epoch": 2.535928629359286, "grad_norm": 0.6084575653076172, "learning_rate": 6.444809271860547e-07, "loss": 0.5986557006835938, "step": 1955 }, { "epoch": 2.537226277372263, "grad_norm": 0.6349742412567139, "learning_rate": 6.409741552610399e-07, "loss": 0.5914225578308105, "step": 1956 }, { "epoch": 2.538523925385239, "grad_norm": 0.6118656396865845, "learning_rate": 6.374762964405895e-07, "loss": 0.5655546188354492, "step": 1957 }, { "epoch": 2.539821573398216, "grad_norm": 0.6187875270843506, "learning_rate": 6.339873578769401e-07, "loss": 0.5871388912200928, "step": 1958 }, { "epoch": 2.541119221411192, "grad_norm": 48.24391555786133, "learning_rate": 6.305073467040884e-07, "loss": 0.5712297558784485, "step": 1959 }, { "epoch": 2.5424168694241684, "grad_norm": 0.6253454685211182, "learning_rate": 6.270362700377736e-07, "loss": 0.6522243022918701, "step": 1960 }, { "epoch": 2.543714517437145, "grad_norm": 0.5885297656059265, "learning_rate": 6.235741349754731e-07, "loss": 0.6240279078483582, "step": 1961 }, { "epoch": 2.545012165450122, "grad_norm": 0.600005030632019, "learning_rate": 6.201209485963744e-07, "loss": 0.6034828424453735, "step": 1962 }, { "epoch": 2.546309813463098, "grad_norm": 0.677692711353302, "learning_rate": 6.166767179613691e-07, "loss": 0.5885945558547974, "step": 1963 }, { "epoch": 2.5476074614760744, "grad_norm": 0.6142828464508057, "learning_rate": 6.132414501130385e-07, "loss": 0.5538769960403442, "step": 1964 }, { "epoch": 2.548905109489051, "grad_norm": 0.6016609072685242, "learning_rate": 6.098151520756357e-07, "loss": 0.5977665185928345, "step": 1965 }, { "epoch": 2.550202757502028, "grad_norm": 2.0037388801574707, "learning_rate": 6.063978308550722e-07, "loss": 0.612566351890564, "step": 1966 }, { "epoch": 2.551500405515004, "grad_norm": 0.602703869342804, "learning_rate": 6.029894934389058e-07, "loss": 0.5812326669692993, "step": 1967 }, { "epoch": 2.5527980535279804, "grad_norm": 0.5868345499038696, "learning_rate": 5.995901467963228e-07, "loss": 0.5142446160316467, "step": 1968 }, { "epoch": 2.554095701540957, "grad_norm": 0.625521719455719, "learning_rate": 5.961997978781292e-07, "loss": 0.5533977746963501, "step": 1969 }, { "epoch": 2.5553933495539334, "grad_norm": 0.6117697358131409, "learning_rate": 5.928184536167258e-07, "loss": 0.6049879789352417, "step": 1970 }, { "epoch": 2.55669099756691, "grad_norm": 0.6366870403289795, "learning_rate": 5.89446120926111e-07, "loss": 0.5416997671127319, "step": 1971 }, { "epoch": 2.5579886455798864, "grad_norm": 0.6090091466903687, "learning_rate": 5.860828067018481e-07, "loss": 0.5767660737037659, "step": 1972 }, { "epoch": 2.559286293592863, "grad_norm": 0.6263614892959595, "learning_rate": 5.82728517821064e-07, "loss": 0.5914768576622009, "step": 1973 }, { "epoch": 2.5605839416058394, "grad_norm": 0.6438020467758179, "learning_rate": 5.793832611424322e-07, "loss": 0.5773044228553772, "step": 1974 }, { "epoch": 2.5618815896188156, "grad_norm": 0.6195680499076843, "learning_rate": 5.760470435061533e-07, "loss": 0.5637648701667786, "step": 1975 }, { "epoch": 2.5631792376317923, "grad_norm": 1.289580225944519, "learning_rate": 5.727198717339511e-07, "loss": 0.6060294508934021, "step": 1976 }, { "epoch": 2.564476885644769, "grad_norm": 0.6049319505691528, "learning_rate": 5.694017526290468e-07, "loss": 0.5878962278366089, "step": 1977 }, { "epoch": 2.5657745336577453, "grad_norm": 0.6546334028244019, "learning_rate": 5.660926929761556e-07, "loss": 0.5719892382621765, "step": 1978 }, { "epoch": 2.5670721816707216, "grad_norm": 0.5887362957000732, "learning_rate": 5.627926995414662e-07, "loss": 0.5226088762283325, "step": 1979 }, { "epoch": 2.5683698296836983, "grad_norm": 0.6115890741348267, "learning_rate": 5.59501779072631e-07, "loss": 0.5784634947776794, "step": 1980 }, { "epoch": 2.569667477696675, "grad_norm": 0.6565897464752197, "learning_rate": 5.562199382987488e-07, "loss": 0.5947513580322266, "step": 1981 }, { "epoch": 2.5709651257096513, "grad_norm": 0.594465970993042, "learning_rate": 5.529471839303541e-07, "loss": 0.5367786884307861, "step": 1982 }, { "epoch": 2.5722627737226276, "grad_norm": 0.6155304908752441, "learning_rate": 5.496835226593983e-07, "loss": 0.6144155859947205, "step": 1983 }, { "epoch": 2.5735604217356043, "grad_norm": 0.6233793497085571, "learning_rate": 5.464289611592472e-07, "loss": 0.5667406916618347, "step": 1984 }, { "epoch": 2.5748580697485806, "grad_norm": 0.6025534272193909, "learning_rate": 5.431835060846519e-07, "loss": 0.5775101184844971, "step": 1985 }, { "epoch": 2.5761557177615573, "grad_norm": 0.6037949323654175, "learning_rate": 5.399471640717479e-07, "loss": 0.6155390739440918, "step": 1986 }, { "epoch": 2.5774533657745335, "grad_norm": 0.61771160364151, "learning_rate": 5.367199417380347e-07, "loss": 0.5459461808204651, "step": 1987 }, { "epoch": 2.5787510137875103, "grad_norm": 0.6559909582138062, "learning_rate": 5.335018456823665e-07, "loss": 0.6187810897827148, "step": 1988 }, { "epoch": 2.5800486618004865, "grad_norm": 0.6218096017837524, "learning_rate": 5.302928824849335e-07, "loss": 0.629378080368042, "step": 1989 }, { "epoch": 2.5813463098134632, "grad_norm": 0.5922935605049133, "learning_rate": 5.270930587072548e-07, "loss": 0.5435377359390259, "step": 1990 }, { "epoch": 2.5826439578264395, "grad_norm": 0.5918126106262207, "learning_rate": 5.239023808921595e-07, "loss": 0.5545147657394409, "step": 1991 }, { "epoch": 2.5839416058394162, "grad_norm": 0.6067506074905396, "learning_rate": 5.207208555637767e-07, "loss": 0.6249223351478577, "step": 1992 }, { "epoch": 2.5852392538523925, "grad_norm": 0.6125559210777283, "learning_rate": 5.175484892275184e-07, "loss": 0.5820242166519165, "step": 1993 }, { "epoch": 2.5865369018653688, "grad_norm": 0.5970590114593506, "learning_rate": 5.14385288370074e-07, "loss": 0.6091808080673218, "step": 1994 }, { "epoch": 2.5878345498783455, "grad_norm": 0.5902854204177856, "learning_rate": 5.11231259459386e-07, "loss": 0.5224129557609558, "step": 1995 }, { "epoch": 2.589132197891322, "grad_norm": 0.604062020778656, "learning_rate": 5.080864089446464e-07, "loss": 0.5258910655975342, "step": 1996 }, { "epoch": 2.5904298459042985, "grad_norm": 0.6816832423210144, "learning_rate": 5.049507432562778e-07, "loss": 0.5509624481201172, "step": 1997 }, { "epoch": 2.5917274939172747, "grad_norm": 0.6220773458480835, "learning_rate": 5.018242688059238e-07, "loss": 0.6509982943534851, "step": 1998 }, { "epoch": 2.5930251419302515, "grad_norm": 0.6238852143287659, "learning_rate": 4.987069919864329e-07, "loss": 0.6329154968261719, "step": 1999 }, { "epoch": 2.5943227899432277, "grad_norm": 0.6279301643371582, "learning_rate": 4.95598919171848e-07, "loss": 0.624962329864502, "step": 2000 }, { "epoch": 2.5956204379562045, "grad_norm": 0.6066421866416931, "learning_rate": 4.925000567173882e-07, "loss": 0.6009570360183716, "step": 2001 }, { "epoch": 2.5969180859691807, "grad_norm": 0.6097516417503357, "learning_rate": 4.894104109594466e-07, "loss": 0.5533030033111572, "step": 2002 }, { "epoch": 2.5982157339821574, "grad_norm": 0.60311359167099, "learning_rate": 4.863299882155659e-07, "loss": 0.5549200177192688, "step": 2003 }, { "epoch": 2.5995133819951337, "grad_norm": 0.6075156927108765, "learning_rate": 4.832587947844297e-07, "loss": 0.5541381239891052, "step": 2004 }, { "epoch": 2.6008110300081104, "grad_norm": 0.6099098324775696, "learning_rate": 4.801968369458531e-07, "loss": 0.6142464876174927, "step": 2005 }, { "epoch": 2.6021086780210867, "grad_norm": 0.6433584690093994, "learning_rate": 4.771441209607625e-07, "loss": 0.6120733022689819, "step": 2006 }, { "epoch": 2.6034063260340634, "grad_norm": 1.134731411933899, "learning_rate": 4.7410065307119167e-07, "loss": 0.6064984798431396, "step": 2007 }, { "epoch": 2.6047039740470397, "grad_norm": 0.6147306561470032, "learning_rate": 4.7106643950026067e-07, "loss": 0.5834633111953735, "step": 2008 }, { "epoch": 2.606001622060016, "grad_norm": 0.610374927520752, "learning_rate": 4.6804148645216873e-07, "loss": 0.5858355760574341, "step": 2009 }, { "epoch": 2.6072992700729927, "grad_norm": 0.6226435899734497, "learning_rate": 4.6502580011217934e-07, "loss": 0.5983865261077881, "step": 2010 }, { "epoch": 2.6085969180859694, "grad_norm": 0.6833674311637878, "learning_rate": 4.6201938664660775e-07, "loss": 0.6065071225166321, "step": 2011 }, { "epoch": 2.6098945660989457, "grad_norm": 0.6266833543777466, "learning_rate": 4.590222522028082e-07, "loss": 0.5968768000602722, "step": 2012 }, { "epoch": 2.611192214111922, "grad_norm": 0.6198201179504395, "learning_rate": 4.5603440290916347e-07, "loss": 0.6149097681045532, "step": 2013 }, { "epoch": 2.6124898621248986, "grad_norm": 0.6224921941757202, "learning_rate": 4.5305584487506605e-07, "loss": 0.6195799708366394, "step": 2014 }, { "epoch": 2.6137875101378754, "grad_norm": 0.5922067165374756, "learning_rate": 4.500865841909169e-07, "loss": 0.5795333385467529, "step": 2015 }, { "epoch": 2.6150851581508516, "grad_norm": 0.6451519727706909, "learning_rate": 4.471266269280994e-07, "loss": 0.6512206196784973, "step": 2016 }, { "epoch": 2.616382806163828, "grad_norm": 0.6207348108291626, "learning_rate": 4.441759791389799e-07, "loss": 0.6410412788391113, "step": 2017 }, { "epoch": 2.6176804541768046, "grad_norm": 0.6637576818466187, "learning_rate": 4.41234646856884e-07, "loss": 0.5507533550262451, "step": 2018 }, { "epoch": 2.618978102189781, "grad_norm": 0.6296217441558838, "learning_rate": 4.383026360960929e-07, "loss": 0.5853258371353149, "step": 2019 }, { "epoch": 2.6202757502027576, "grad_norm": 0.5993384122848511, "learning_rate": 4.35379952851826e-07, "loss": 0.5613459944725037, "step": 2020 }, { "epoch": 2.621573398215734, "grad_norm": 0.6372536420822144, "learning_rate": 4.324666031002311e-07, "loss": 0.563460111618042, "step": 2021 }, { "epoch": 2.6228710462287106, "grad_norm": 0.6129400134086609, "learning_rate": 4.29562592798371e-07, "loss": 0.6133362650871277, "step": 2022 }, { "epoch": 2.624168694241687, "grad_norm": 0.6232635974884033, "learning_rate": 4.266679278842123e-07, "loss": 0.5923752784729004, "step": 2023 }, { "epoch": 2.6254663422546636, "grad_norm": 0.6236964464187622, "learning_rate": 4.2378261427660994e-07, "loss": 0.5925074815750122, "step": 2024 }, { "epoch": 2.62676399026764, "grad_norm": 0.5997064113616943, "learning_rate": 4.209066578753035e-07, "loss": 0.5586100816726685, "step": 2025 }, { "epoch": 2.6280616382806166, "grad_norm": 0.6276852488517761, "learning_rate": 4.1804006456089174e-07, "loss": 0.5699270367622375, "step": 2026 }, { "epoch": 2.629359286293593, "grad_norm": 0.5818026065826416, "learning_rate": 4.1518284019483655e-07, "loss": 0.5539983510971069, "step": 2027 }, { "epoch": 2.630656934306569, "grad_norm": 0.6021342277526855, "learning_rate": 4.123349906194357e-07, "loss": 0.5571432709693909, "step": 2028 }, { "epoch": 2.631954582319546, "grad_norm": 0.6044632196426392, "learning_rate": 4.094965216578212e-07, "loss": 0.5815938711166382, "step": 2029 }, { "epoch": 2.6332522303325225, "grad_norm": 0.6218861937522888, "learning_rate": 4.066674391139458e-07, "loss": 0.5798450112342834, "step": 2030 }, { "epoch": 2.634549878345499, "grad_norm": 0.6776529550552368, "learning_rate": 4.038477487725645e-07, "loss": 0.5181751251220703, "step": 2031 }, { "epoch": 2.635847526358475, "grad_norm": 0.6296592354774475, "learning_rate": 4.0103745639923144e-07, "loss": 0.6052215695381165, "step": 2032 }, { "epoch": 2.637145174371452, "grad_norm": 0.6410042643547058, "learning_rate": 3.9823656774028386e-07, "loss": 0.5471499562263489, "step": 2033 }, { "epoch": 2.638442822384428, "grad_norm": 0.6148339509963989, "learning_rate": 3.9544508852282895e-07, "loss": 0.6046350002288818, "step": 2034 }, { "epoch": 2.639740470397405, "grad_norm": 0.6409063935279846, "learning_rate": 3.9266302445473634e-07, "loss": 0.5563018918037415, "step": 2035 }, { "epoch": 2.641038118410381, "grad_norm": 0.6377732157707214, "learning_rate": 3.89890381224623e-07, "loss": 0.5965743064880371, "step": 2036 }, { "epoch": 2.6423357664233578, "grad_norm": 0.6147736310958862, "learning_rate": 3.8712716450183985e-07, "loss": 0.558821439743042, "step": 2037 }, { "epoch": 2.643633414436334, "grad_norm": 0.5959088802337646, "learning_rate": 3.8437337993647017e-07, "loss": 0.6072096824645996, "step": 2038 }, { "epoch": 2.6449310624493108, "grad_norm": 0.5934545993804932, "learning_rate": 3.81629033159302e-07, "loss": 0.5585888028144836, "step": 2039 }, { "epoch": 2.646228710462287, "grad_norm": 0.6148179173469543, "learning_rate": 3.7889412978183324e-07, "loss": 0.6224203705787659, "step": 2040 }, { "epoch": 2.6475263584752637, "grad_norm": 0.6041895151138306, "learning_rate": 3.7616867539624733e-07, "loss": 0.5594790577888489, "step": 2041 }, { "epoch": 2.64882400648824, "grad_norm": 0.6036660075187683, "learning_rate": 3.734526755754092e-07, "loss": 0.5392581820487976, "step": 2042 }, { "epoch": 2.6501216545012163, "grad_norm": 0.6497801542282104, "learning_rate": 3.707461358728509e-07, "loss": 0.645263135433197, "step": 2043 }, { "epoch": 2.651419302514193, "grad_norm": 0.6202139258384705, "learning_rate": 3.680490618227611e-07, "loss": 0.6205359697341919, "step": 2044 }, { "epoch": 2.6527169505271697, "grad_norm": 0.5867362022399902, "learning_rate": 3.6536145893997346e-07, "loss": 0.5754397511482239, "step": 2045 }, { "epoch": 2.654014598540146, "grad_norm": 0.6415355205535889, "learning_rate": 3.626833327199564e-07, "loss": 0.6042582392692566, "step": 2046 }, { "epoch": 2.6553122465531223, "grad_norm": 0.6417367458343506, "learning_rate": 3.600146886387984e-07, "loss": 0.6140678524971008, "step": 2047 }, { "epoch": 2.656609894566099, "grad_norm": 0.6080589890480042, "learning_rate": 3.573555321532035e-07, "loss": 0.574844241142273, "step": 2048 }, { "epoch": 2.6579075425790757, "grad_norm": 0.6920068264007568, "learning_rate": 3.547058687004723e-07, "loss": 0.6025684475898743, "step": 2049 }, { "epoch": 2.659205190592052, "grad_norm": 0.6130858659744263, "learning_rate": 3.520657036984959e-07, "loss": 0.5683197379112244, "step": 2050 }, { "epoch": 2.6605028386050282, "grad_norm": 0.6280376315116882, "learning_rate": 3.494350425457438e-07, "loss": 0.5609173774719238, "step": 2051 }, { "epoch": 2.661800486618005, "grad_norm": 0.6326773166656494, "learning_rate": 3.46813890621252e-07, "loss": 0.5946630239486694, "step": 2052 }, { "epoch": 2.663098134630981, "grad_norm": 0.6118667721748352, "learning_rate": 3.4420225328461286e-07, "loss": 0.5908790826797485, "step": 2053 }, { "epoch": 2.664395782643958, "grad_norm": 0.6427050828933716, "learning_rate": 3.416001358759635e-07, "loss": 0.6200711727142334, "step": 2054 }, { "epoch": 2.665693430656934, "grad_norm": 0.6258965730667114, "learning_rate": 3.390075437159762e-07, "loss": 0.6091062426567078, "step": 2055 }, { "epoch": 2.666991078669911, "grad_norm": 0.8197891116142273, "learning_rate": 3.36424482105846e-07, "loss": 0.6184768676757812, "step": 2056 }, { "epoch": 2.668288726682887, "grad_norm": 0.6219103336334229, "learning_rate": 3.338509563272774e-07, "loss": 0.5699069499969482, "step": 2057 }, { "epoch": 2.669586374695864, "grad_norm": 0.6160385012626648, "learning_rate": 3.3128697164248213e-07, "loss": 0.6063632369041443, "step": 2058 }, { "epoch": 2.67088402270884, "grad_norm": 0.6377853155136108, "learning_rate": 3.2873253329415986e-07, "loss": 0.6303044557571411, "step": 2059 }, { "epoch": 2.672181670721817, "grad_norm": 0.6218414306640625, "learning_rate": 3.2618764650548806e-07, "loss": 0.5987715721130371, "step": 2060 }, { "epoch": 2.673479318734793, "grad_norm": 0.6107571125030518, "learning_rate": 3.236523164801192e-07, "loss": 0.5237259864807129, "step": 2061 }, { "epoch": 2.6747769667477694, "grad_norm": 0.6305319666862488, "learning_rate": 3.2112654840215863e-07, "loss": 0.6254755854606628, "step": 2062 }, { "epoch": 2.676074614760746, "grad_norm": 0.6144214868545532, "learning_rate": 3.186103474361646e-07, "loss": 0.6048131585121155, "step": 2063 }, { "epoch": 2.677372262773723, "grad_norm": 0.6124334335327148, "learning_rate": 3.161037187271304e-07, "loss": 0.5881555080413818, "step": 2064 }, { "epoch": 2.678669910786699, "grad_norm": 0.6141470670700073, "learning_rate": 3.136066674004773e-07, "loss": 0.5876516103744507, "step": 2065 }, { "epoch": 2.6799675587996754, "grad_norm": 0.5808926820755005, "learning_rate": 3.1111919856204373e-07, "loss": 0.5583111047744751, "step": 2066 }, { "epoch": 2.681265206812652, "grad_norm": 0.663599967956543, "learning_rate": 3.08641317298074e-07, "loss": 0.5772061944007874, "step": 2067 }, { "epoch": 2.6825628548256284, "grad_norm": 0.6320760846138, "learning_rate": 3.0617302867520736e-07, "loss": 0.5595476031303406, "step": 2068 }, { "epoch": 2.683860502838605, "grad_norm": 0.61170494556427, "learning_rate": 3.0371433774047056e-07, "loss": 0.6012779474258423, "step": 2069 }, { "epoch": 2.6851581508515814, "grad_norm": 0.6115148067474365, "learning_rate": 3.0126524952126203e-07, "loss": 0.6057910919189453, "step": 2070 }, { "epoch": 2.6851581508515814, "eval_loss": 0.6819512844085693, "eval_runtime": 72.9512, "eval_samples_per_second": 71.171, "eval_steps_per_second": 8.896, "step": 2070 }, { "epoch": 2.686455798864558, "grad_norm": 0.6251775026321411, "learning_rate": 2.988257690253504e-07, "loss": 0.6118081212043762, "step": 2071 }, { "epoch": 2.6877534468775344, "grad_norm": 0.6253253221511841, "learning_rate": 2.9639590124085296e-07, "loss": 0.6572234630584717, "step": 2072 }, { "epoch": 2.689051094890511, "grad_norm": 0.6017980575561523, "learning_rate": 2.939756511362357e-07, "loss": 0.5534753799438477, "step": 2073 }, { "epoch": 2.6903487429034874, "grad_norm": 0.6164457201957703, "learning_rate": 2.915650236602974e-07, "loss": 0.6046677827835083, "step": 2074 }, { "epoch": 2.691646390916464, "grad_norm": 0.6189885139465332, "learning_rate": 2.891640237421611e-07, "loss": 0.6001750826835632, "step": 2075 }, { "epoch": 2.6929440389294403, "grad_norm": 0.6118842959403992, "learning_rate": 2.8677265629126373e-07, "loss": 0.5822157263755798, "step": 2076 }, { "epoch": 2.6942416869424166, "grad_norm": 0.6505289673805237, "learning_rate": 2.8439092619734655e-07, "loss": 0.6047310829162598, "step": 2077 }, { "epoch": 2.6955393349553933, "grad_norm": 0.6261717081069946, "learning_rate": 2.820188383304451e-07, "loss": 0.5709232687950134, "step": 2078 }, { "epoch": 2.69683698296837, "grad_norm": 0.591399610042572, "learning_rate": 2.7965639754087893e-07, "loss": 0.5760236382484436, "step": 2079 }, { "epoch": 2.6981346309813463, "grad_norm": 0.6267626881599426, "learning_rate": 2.7730360865923954e-07, "loss": 0.627373218536377, "step": 2080 }, { "epoch": 2.6994322789943226, "grad_norm": 0.5880517959594727, "learning_rate": 2.7496047649638757e-07, "loss": 0.556127667427063, "step": 2081 }, { "epoch": 2.7007299270072993, "grad_norm": 0.6221486926078796, "learning_rate": 2.726270058434327e-07, "loss": 0.6388289332389832, "step": 2082 }, { "epoch": 2.702027575020276, "grad_norm": 0.6296391487121582, "learning_rate": 2.703032014717333e-07, "loss": 0.6471085548400879, "step": 2083 }, { "epoch": 2.7033252230332523, "grad_norm": 0.6119943261146545, "learning_rate": 2.6798906813288117e-07, "loss": 0.587184488773346, "step": 2084 }, { "epoch": 2.7046228710462286, "grad_norm": 0.5858760476112366, "learning_rate": 2.656846105586919e-07, "loss": 0.6001055836677551, "step": 2085 }, { "epoch": 2.7059205190592053, "grad_norm": 0.6214133501052856, "learning_rate": 2.633898334611995e-07, "loss": 0.6275671720504761, "step": 2086 }, { "epoch": 2.7072181670721815, "grad_norm": 0.5908603668212891, "learning_rate": 2.6110474153264176e-07, "loss": 0.5731199979782104, "step": 2087 }, { "epoch": 2.7085158150851583, "grad_norm": 0.5500771403312683, "learning_rate": 2.588293394454533e-07, "loss": 0.5535600781440735, "step": 2088 }, { "epoch": 2.7098134630981345, "grad_norm": 0.6212435364723206, "learning_rate": 2.565636318522552e-07, "loss": 0.6325974464416504, "step": 2089 }, { "epoch": 2.7111111111111112, "grad_norm": 0.5896530747413635, "learning_rate": 2.543076233858466e-07, "loss": 0.564407229423523, "step": 2090 }, { "epoch": 2.7124087591240875, "grad_norm": 0.6151485443115234, "learning_rate": 2.5206131865919303e-07, "loss": 0.5890393257141113, "step": 2091 }, { "epoch": 2.7137064071370642, "grad_norm": 0.5984410643577576, "learning_rate": 2.4982472226542045e-07, "loss": 0.5423193573951721, "step": 2092 }, { "epoch": 2.7150040551500405, "grad_norm": 0.6220104694366455, "learning_rate": 2.475978387778e-07, "loss": 0.5741702318191528, "step": 2093 }, { "epoch": 2.7163017031630172, "grad_norm": 0.64532470703125, "learning_rate": 2.453806727497482e-07, "loss": 0.578140914440155, "step": 2094 }, { "epoch": 2.7175993511759935, "grad_norm": 0.6362125277519226, "learning_rate": 2.431732287148053e-07, "loss": 0.6103841066360474, "step": 2095 }, { "epoch": 2.7188969991889698, "grad_norm": 0.6365206837654114, "learning_rate": 2.409755111866369e-07, "loss": 0.6380729079246521, "step": 2096 }, { "epoch": 2.7201946472019465, "grad_norm": 0.6440710425376892, "learning_rate": 2.387875246590193e-07, "loss": 0.5572207570075989, "step": 2097 }, { "epoch": 2.721492295214923, "grad_norm": 0.6295807361602783, "learning_rate": 2.3660927360583064e-07, "loss": 0.6024692058563232, "step": 2098 }, { "epoch": 2.7227899432278995, "grad_norm": 0.5711405873298645, "learning_rate": 2.3444076248104297e-07, "loss": 0.5038433074951172, "step": 2099 }, { "epoch": 2.7240875912408757, "grad_norm": 0.7995308637619019, "learning_rate": 2.322819957187139e-07, "loss": 0.6232460737228394, "step": 2100 }, { "epoch": 2.7253852392538525, "grad_norm": 0.5909203886985779, "learning_rate": 2.3013297773297306e-07, "loss": 0.5349663496017456, "step": 2101 }, { "epoch": 2.7266828872668287, "grad_norm": 0.6373469829559326, "learning_rate": 2.279937129180204e-07, "loss": 0.5974945425987244, "step": 2102 }, { "epoch": 2.7279805352798054, "grad_norm": 0.6128799915313721, "learning_rate": 2.2586420564810863e-07, "loss": 0.5850982069969177, "step": 2103 }, { "epoch": 2.7292781832927817, "grad_norm": 0.6667084097862244, "learning_rate": 2.2374446027754405e-07, "loss": 0.5952577590942383, "step": 2104 }, { "epoch": 2.7305758313057584, "grad_norm": 0.6103115081787109, "learning_rate": 2.2163448114066677e-07, "loss": 0.5764719247817993, "step": 2105 }, { "epoch": 2.7318734793187347, "grad_norm": 0.5843047499656677, "learning_rate": 2.1953427255185122e-07, "loss": 0.5831491947174072, "step": 2106 }, { "epoch": 2.7331711273317114, "grad_norm": 0.6300417184829712, "learning_rate": 2.174438388054928e-07, "loss": 0.5893597602844238, "step": 2107 }, { "epoch": 2.7344687753446877, "grad_norm": 0.601433515548706, "learning_rate": 2.1536318417599844e-07, "loss": 0.5604301691055298, "step": 2108 }, { "epoch": 2.7357664233576644, "grad_norm": 0.6220826506614685, "learning_rate": 2.1329231291778108e-07, "loss": 0.6189798712730408, "step": 2109 }, { "epoch": 2.7370640713706407, "grad_norm": 0.5895432233810425, "learning_rate": 2.1123122926524853e-07, "loss": 0.5561822652816772, "step": 2110 }, { "epoch": 2.738361719383617, "grad_norm": 0.8975700736045837, "learning_rate": 2.0917993743279297e-07, "loss": 0.552111029624939, "step": 2111 }, { "epoch": 2.7396593673965937, "grad_norm": 0.5886269211769104, "learning_rate": 2.0713844161479035e-07, "loss": 0.5910426378250122, "step": 2112 }, { "epoch": 2.7409570154095704, "grad_norm": 0.5890198945999146, "learning_rate": 2.0510674598558045e-07, "loss": 0.5544984936714172, "step": 2113 }, { "epoch": 2.7422546634225466, "grad_norm": 0.6140372157096863, "learning_rate": 2.0308485469946736e-07, "loss": 0.6121523380279541, "step": 2114 }, { "epoch": 2.743552311435523, "grad_norm": 0.5979804396629333, "learning_rate": 2.010727718907074e-07, "loss": 0.5417115688323975, "step": 2115 }, { "epoch": 2.7448499594484996, "grad_norm": 0.6019598841667175, "learning_rate": 1.9907050167349894e-07, "loss": 0.5624793171882629, "step": 2116 }, { "epoch": 2.7461476074614763, "grad_norm": 0.6011685132980347, "learning_rate": 1.9707804814198096e-07, "loss": 0.5510683655738831, "step": 2117 }, { "epoch": 2.7474452554744526, "grad_norm": 0.5924180746078491, "learning_rate": 1.9509541537021392e-07, "loss": 0.5276060104370117, "step": 2118 }, { "epoch": 2.748742903487429, "grad_norm": 0.6053572297096252, "learning_rate": 1.9312260741218114e-07, "loss": 0.5551567673683167, "step": 2119 }, { "epoch": 2.7500405515004056, "grad_norm": 0.620968222618103, "learning_rate": 1.911596283017747e-07, "loss": 0.5851413011550903, "step": 2120 }, { "epoch": 2.751338199513382, "grad_norm": 0.626413881778717, "learning_rate": 1.8920648205279113e-07, "loss": 0.5591974258422852, "step": 2121 }, { "epoch": 2.7526358475263586, "grad_norm": 0.615846574306488, "learning_rate": 1.8726317265891968e-07, "loss": 0.5918228626251221, "step": 2122 }, { "epoch": 2.753933495539335, "grad_norm": 0.645077645778656, "learning_rate": 1.8532970409373684e-07, "loss": 0.5714014172554016, "step": 2123 }, { "epoch": 2.7552311435523116, "grad_norm": 0.6882081031799316, "learning_rate": 1.8340608031069462e-07, "loss": 0.6177914142608643, "step": 2124 }, { "epoch": 2.756528791565288, "grad_norm": 0.6870415806770325, "learning_rate": 1.8149230524311944e-07, "loss": 0.6026558876037598, "step": 2125 }, { "epoch": 2.757826439578264, "grad_norm": 0.5801068544387817, "learning_rate": 1.7958838280419387e-07, "loss": 0.5492424964904785, "step": 2126 }, { "epoch": 2.759124087591241, "grad_norm": 0.6277424693107605, "learning_rate": 1.7769431688696048e-07, "loss": 0.5704351663589478, "step": 2127 }, { "epoch": 2.7604217356042176, "grad_norm": 0.6131430864334106, "learning_rate": 1.7581011136430238e-07, "loss": 0.6227852702140808, "step": 2128 }, { "epoch": 2.761719383617194, "grad_norm": 0.6621940732002258, "learning_rate": 1.739357700889438e-07, "loss": 0.5971069931983948, "step": 2129 }, { "epoch": 2.76301703163017, "grad_norm": 0.6566265225410461, "learning_rate": 1.720712968934385e-07, "loss": 0.6617914438247681, "step": 2130 }, { "epoch": 2.764314679643147, "grad_norm": 0.6165506839752197, "learning_rate": 1.7021669559016184e-07, "loss": 0.5680196285247803, "step": 2131 }, { "epoch": 2.7656123276561235, "grad_norm": 0.6046646237373352, "learning_rate": 1.6837196997130434e-07, "loss": 0.605772078037262, "step": 2132 }, { "epoch": 2.7669099756691, "grad_norm": 0.6838919520378113, "learning_rate": 1.6653712380886366e-07, "loss": 0.5754232406616211, "step": 2133 }, { "epoch": 2.768207623682076, "grad_norm": 0.6096740365028381, "learning_rate": 1.6471216085463372e-07, "loss": 0.5173358917236328, "step": 2134 }, { "epoch": 2.769505271695053, "grad_norm": 0.6066602468490601, "learning_rate": 1.6289708484020395e-07, "loss": 0.5950397253036499, "step": 2135 }, { "epoch": 2.770802919708029, "grad_norm": 0.609034538269043, "learning_rate": 1.6109189947694448e-07, "loss": 0.5427603721618652, "step": 2136 }, { "epoch": 2.7721005677210058, "grad_norm": 0.6451703906059265, "learning_rate": 1.5929660845600215e-07, "loss": 0.6046600341796875, "step": 2137 }, { "epoch": 2.773398215733982, "grad_norm": 0.5977014899253845, "learning_rate": 1.575112154482933e-07, "loss": 0.5849440693855286, "step": 2138 }, { "epoch": 2.7746958637469588, "grad_norm": 0.6242566108703613, "learning_rate": 1.557357241044949e-07, "loss": 0.6496338844299316, "step": 2139 }, { "epoch": 2.775993511759935, "grad_norm": 0.5988749265670776, "learning_rate": 1.539701380550368e-07, "loss": 0.5334508419036865, "step": 2140 }, { "epoch": 2.7772911597729117, "grad_norm": 0.7054303288459778, "learning_rate": 1.5221446091009618e-07, "loss": 0.4878901541233063, "step": 2141 }, { "epoch": 2.778588807785888, "grad_norm": 0.6645851731300354, "learning_rate": 1.504686962595875e-07, "loss": 0.6245031356811523, "step": 2142 }, { "epoch": 2.7798864557988647, "grad_norm": 0.6102975606918335, "learning_rate": 1.4873284767315864e-07, "loss": 0.5180703997612, "step": 2143 }, { "epoch": 2.781184103811841, "grad_norm": 0.6466278433799744, "learning_rate": 1.4700691870017991e-07, "loss": 0.5804831981658936, "step": 2144 }, { "epoch": 2.7824817518248173, "grad_norm": 0.639724612236023, "learning_rate": 1.4529091286973994e-07, "loss": 0.6196957230567932, "step": 2145 }, { "epoch": 2.783779399837794, "grad_norm": 0.6038338541984558, "learning_rate": 1.435848336906359e-07, "loss": 0.5739912986755371, "step": 2146 }, { "epoch": 2.7850770478507707, "grad_norm": 0.6094257831573486, "learning_rate": 1.418886846513673e-07, "loss": 0.6085304021835327, "step": 2147 }, { "epoch": 2.786374695863747, "grad_norm": 0.6370331048965454, "learning_rate": 1.4020246922013093e-07, "loss": 0.572968065738678, "step": 2148 }, { "epoch": 2.7876723438767232, "grad_norm": 0.5946716666221619, "learning_rate": 1.3852619084480933e-07, "loss": 0.5418939590454102, "step": 2149 }, { "epoch": 2.7889699918897, "grad_norm": 0.6075360774993896, "learning_rate": 1.3685985295296798e-07, "loss": 0.5994930267333984, "step": 2150 }, { "epoch": 2.7902676399026762, "grad_norm": 0.6279016733169556, "learning_rate": 1.3520345895184583e-07, "loss": 0.5570014715194702, "step": 2151 }, { "epoch": 2.791565287915653, "grad_norm": 0.6306800246238708, "learning_rate": 1.3355701222835026e-07, "loss": 0.5708903074264526, "step": 2152 }, { "epoch": 2.792862935928629, "grad_norm": 0.6170070171356201, "learning_rate": 1.3192051614904722e-07, "loss": 0.550320029258728, "step": 2153 }, { "epoch": 2.794160583941606, "grad_norm": 0.6288532018661499, "learning_rate": 1.302939740601572e-07, "loss": 0.613933801651001, "step": 2154 }, { "epoch": 2.795458231954582, "grad_norm": 0.6111281514167786, "learning_rate": 1.2867738928754703e-07, "loss": 0.5617604851722717, "step": 2155 }, { "epoch": 2.796755879967559, "grad_norm": 0.9522826075553894, "learning_rate": 1.2707076513672423e-07, "loss": 0.5882472395896912, "step": 2156 }, { "epoch": 2.798053527980535, "grad_norm": 0.6296880841255188, "learning_rate": 1.2547410489282708e-07, "loss": 0.559617280960083, "step": 2157 }, { "epoch": 2.799351175993512, "grad_norm": 0.6598941087722778, "learning_rate": 1.2388741182062348e-07, "loss": 0.5574393272399902, "step": 2158 }, { "epoch": 2.800648824006488, "grad_norm": 0.5911178588867188, "learning_rate": 1.2231068916449705e-07, "loss": 0.5624610185623169, "step": 2159 }, { "epoch": 2.8019464720194645, "grad_norm": 0.6504255533218384, "learning_rate": 1.2074394014844782e-07, "loss": 0.6260690689086914, "step": 2160 }, { "epoch": 2.803244120032441, "grad_norm": 0.6211426258087158, "learning_rate": 1.1918716797608087e-07, "loss": 0.6100113391876221, "step": 2161 }, { "epoch": 2.804541768045418, "grad_norm": 0.6233659386634827, "learning_rate": 1.1764037583060162e-07, "loss": 0.5747858285903931, "step": 2162 }, { "epoch": 2.805839416058394, "grad_norm": 0.6078013777732849, "learning_rate": 1.1610356687480728e-07, "loss": 0.5918527841567993, "step": 2163 }, { "epoch": 2.8071370640713704, "grad_norm": 0.6079197525978088, "learning_rate": 1.1457674425108478e-07, "loss": 0.5714898109436035, "step": 2164 }, { "epoch": 2.808434712084347, "grad_norm": 0.5850006937980652, "learning_rate": 1.1305991108139847e-07, "loss": 0.5996066927909851, "step": 2165 }, { "epoch": 2.809732360097324, "grad_norm": 0.6206707954406738, "learning_rate": 1.1155307046728958e-07, "loss": 0.55565345287323, "step": 2166 }, { "epoch": 2.8110300081103, "grad_norm": 0.6294933557510376, "learning_rate": 1.1005622548986406e-07, "loss": 0.5798709392547607, "step": 2167 }, { "epoch": 2.8123276561232764, "grad_norm": 0.6298512816429138, "learning_rate": 1.0856937920979305e-07, "loss": 0.5979269742965698, "step": 2168 }, { "epoch": 2.813625304136253, "grad_norm": 0.6053575277328491, "learning_rate": 1.0709253466729963e-07, "loss": 0.5668598413467407, "step": 2169 }, { "epoch": 2.8149229521492294, "grad_norm": 0.6343475580215454, "learning_rate": 1.0562569488215712e-07, "loss": 0.6248285174369812, "step": 2170 }, { "epoch": 2.816220600162206, "grad_norm": 0.6348695755004883, "learning_rate": 1.0416886285368188e-07, "loss": 0.5982720851898193, "step": 2171 }, { "epoch": 2.8175182481751824, "grad_norm": 0.6075454354286194, "learning_rate": 1.0272204156072663e-07, "loss": 0.580233097076416, "step": 2172 }, { "epoch": 2.818815896188159, "grad_norm": 0.6037595272064209, "learning_rate": 1.012852339616749e-07, "loss": 0.549045205116272, "step": 2173 }, { "epoch": 2.8201135442011354, "grad_norm": 0.6013658046722412, "learning_rate": 9.985844299443437e-08, "loss": 0.5709958672523499, "step": 2174 }, { "epoch": 2.821411192214112, "grad_norm": 0.6192932724952698, "learning_rate": 9.844167157643191e-08, "loss": 0.5936025381088257, "step": 2175 }, { "epoch": 2.8227088402270883, "grad_norm": 0.6013957858085632, "learning_rate": 9.703492260460578e-08, "loss": 0.5784536600112915, "step": 2176 }, { "epoch": 2.824006488240065, "grad_norm": 0.6452348232269287, "learning_rate": 9.563819895540172e-08, "loss": 0.6597691774368286, "step": 2177 }, { "epoch": 2.8253041362530413, "grad_norm": 0.6121287941932678, "learning_rate": 9.42515034847663e-08, "loss": 0.6041057705879211, "step": 2178 }, { "epoch": 2.8266017842660176, "grad_norm": 0.6265618801116943, "learning_rate": 9.287483902814087e-08, "loss": 0.5931543707847595, "step": 2179 }, { "epoch": 2.8278994322789943, "grad_norm": 0.6284413933753967, "learning_rate": 9.150820840045483e-08, "loss": 0.5969519019126892, "step": 2180 }, { "epoch": 2.829197080291971, "grad_norm": 0.6021496057510376, "learning_rate": 9.015161439612396e-08, "loss": 0.6106084585189819, "step": 2181 }, { "epoch": 2.8304947283049473, "grad_norm": 0.6177151203155518, "learning_rate": 8.880505978903719e-08, "loss": 0.6132292151451111, "step": 2182 }, { "epoch": 2.8317923763179236, "grad_norm": 0.6375380754470825, "learning_rate": 8.746854733255982e-08, "loss": 0.5775139927864075, "step": 2183 }, { "epoch": 2.8330900243309003, "grad_norm": 0.623674750328064, "learning_rate": 8.614207975952083e-08, "loss": 0.5772640705108643, "step": 2184 }, { "epoch": 2.8343876723438766, "grad_norm": 2.0252397060394287, "learning_rate": 8.482565978221002e-08, "loss": 0.6038268804550171, "step": 2185 }, { "epoch": 2.8356853203568533, "grad_norm": 0.6209124326705933, "learning_rate": 8.351929009237425e-08, "loss": 0.5768431425094604, "step": 2186 }, { "epoch": 2.8369829683698295, "grad_norm": 0.6141339540481567, "learning_rate": 8.222297336120844e-08, "loss": 0.6076856851577759, "step": 2187 }, { "epoch": 2.8382806163828063, "grad_norm": 0.615900456905365, "learning_rate": 8.093671223935118e-08, "loss": 0.5514330267906189, "step": 2188 }, { "epoch": 2.8395782643957825, "grad_norm": 0.6074119210243225, "learning_rate": 7.966050935688252e-08, "loss": 0.5663487911224365, "step": 2189 }, { "epoch": 2.8408759124087593, "grad_norm": 0.6119362711906433, "learning_rate": 7.839436732331285e-08, "loss": 0.5301929712295532, "step": 2190 }, { "epoch": 2.8421735604217355, "grad_norm": 0.6157346963882446, "learning_rate": 7.7138288727584e-08, "loss": 0.5847445130348206, "step": 2191 }, { "epoch": 2.8434712084347122, "grad_norm": 0.8324165940284729, "learning_rate": 7.589227613805705e-08, "loss": 0.6258946061134338, "step": 2192 }, { "epoch": 2.8447688564476885, "grad_norm": 0.6051367521286011, "learning_rate": 7.465633210251344e-08, "loss": 0.6049424409866333, "step": 2193 }, { "epoch": 2.846066504460665, "grad_norm": 0.6111598610877991, "learning_rate": 7.343045914814495e-08, "loss": 0.615462601184845, "step": 2194 }, { "epoch": 2.8473641524736415, "grad_norm": 0.6303842663764954, "learning_rate": 7.221465978155262e-08, "loss": 0.5582486987113953, "step": 2195 }, { "epoch": 2.848661800486618, "grad_norm": 0.6294355392456055, "learning_rate": 7.10089364887373e-08, "loss": 0.5927014946937561, "step": 2196 }, { "epoch": 2.8499594484995945, "grad_norm": 0.6469996571540833, "learning_rate": 6.981329173509909e-08, "loss": 0.639467179775238, "step": 2197 }, { "epoch": 2.8512570965125708, "grad_norm": 0.5986980199813843, "learning_rate": 6.862772796542794e-08, "loss": 0.6210333704948425, "step": 2198 }, { "epoch": 2.8525547445255475, "grad_norm": 0.6324379444122314, "learning_rate": 6.745224760390246e-08, "loss": 0.5866251587867737, "step": 2199 }, { "epoch": 2.853852392538524, "grad_norm": 0.6159996390342712, "learning_rate": 6.628685305408166e-08, "loss": 0.5464287996292114, "step": 2200 }, { "epoch": 2.8551500405515005, "grad_norm": 0.6313933730125427, "learning_rate": 6.513154669890221e-08, "loss": 0.5239887237548828, "step": 2201 }, { "epoch": 2.8564476885644767, "grad_norm": 0.6151242852210999, "learning_rate": 6.398633090067497e-08, "loss": 0.5513571500778198, "step": 2202 }, { "epoch": 2.8577453365774534, "grad_norm": 0.6272878646850586, "learning_rate": 6.285120800107402e-08, "loss": 0.5711073875427246, "step": 2203 }, { "epoch": 2.8590429845904297, "grad_norm": 0.6284655928611755, "learning_rate": 6.172618032114108e-08, "loss": 0.5585539937019348, "step": 2204 }, { "epoch": 2.8603406326034064, "grad_norm": 0.6207369565963745, "learning_rate": 6.061125016127045e-08, "loss": 0.6215085983276367, "step": 2205 }, { "epoch": 2.8616382806163827, "grad_norm": 0.5943953394889832, "learning_rate": 5.950641980121352e-08, "loss": 0.5761866569519043, "step": 2206 }, { "epoch": 2.8629359286293594, "grad_norm": 0.6034578680992126, "learning_rate": 5.84116915000682e-08, "loss": 0.5912197828292847, "step": 2207 }, { "epoch": 2.8642335766423357, "grad_norm": 0.612192690372467, "learning_rate": 5.732706749627726e-08, "loss": 0.5836058855056763, "step": 2208 }, { "epoch": 2.8655312246553124, "grad_norm": 0.5961986780166626, "learning_rate": 5.6252550007621645e-08, "loss": 0.6387939453125, "step": 2209 }, { "epoch": 2.8668288726682887, "grad_norm": 0.6482071876525879, "learning_rate": 5.518814123121885e-08, "loss": 0.5909046530723572, "step": 2210 }, { "epoch": 2.8681265206812654, "grad_norm": 0.6003962755203247, "learning_rate": 5.413384334351346e-08, "loss": 0.5739219188690186, "step": 2211 }, { "epoch": 2.8694241686942417, "grad_norm": 0.6164109110832214, "learning_rate": 5.308965850027992e-08, "loss": 0.5988886952400208, "step": 2212 }, { "epoch": 2.870721816707218, "grad_norm": 0.6417747139930725, "learning_rate": 5.205558883661033e-08, "loss": 0.6298974752426147, "step": 2213 }, { "epoch": 2.8720194647201946, "grad_norm": 0.6133490204811096, "learning_rate": 5.103163646691611e-08, "loss": 0.584977388381958, "step": 2214 }, { "epoch": 2.8733171127331714, "grad_norm": 0.611873984336853, "learning_rate": 5.00178034849208e-08, "loss": 0.5997759103775024, "step": 2215 }, { "epoch": 2.8746147607461476, "grad_norm": 0.5938137173652649, "learning_rate": 4.9014091963655584e-08, "loss": 0.5509130954742432, "step": 2216 }, { "epoch": 2.875912408759124, "grad_norm": 0.6484394073486328, "learning_rate": 4.802050395545765e-08, "loss": 0.6474854946136475, "step": 2217 }, { "epoch": 2.8772100567721006, "grad_norm": 0.619995653629303, "learning_rate": 4.703704149196187e-08, "loss": 0.5942093133926392, "step": 2218 }, { "epoch": 2.878507704785077, "grad_norm": 0.6322592496871948, "learning_rate": 4.6063706584100196e-08, "loss": 0.5504230856895447, "step": 2219 }, { "epoch": 2.8798053527980536, "grad_norm": 0.6172313094139099, "learning_rate": 4.5100501222097304e-08, "loss": 0.677121639251709, "step": 2220 }, { "epoch": 2.88110300081103, "grad_norm": 0.5940432548522949, "learning_rate": 4.414742737546274e-08, "loss": 0.593209981918335, "step": 2221 }, { "epoch": 2.8824006488240066, "grad_norm": 0.8577704429626465, "learning_rate": 4.320448699299262e-08, "loss": 0.705782949924469, "step": 2222 }, { "epoch": 2.883698296836983, "grad_norm": 0.6182291507720947, "learning_rate": 4.227168200276077e-08, "loss": 0.569422721862793, "step": 2223 }, { "epoch": 2.8849959448499596, "grad_norm": 0.6797721982002258, "learning_rate": 4.134901431211702e-08, "loss": 0.6029517650604248, "step": 2224 }, { "epoch": 2.886293592862936, "grad_norm": 0.5963630676269531, "learning_rate": 4.043648580768389e-08, "loss": 0.5859914422035217, "step": 2225 }, { "epoch": 2.8875912408759126, "grad_norm": 0.5913455486297607, "learning_rate": 3.953409835535049e-08, "loss": 0.5406662225723267, "step": 2226 }, { "epoch": 2.888888888888889, "grad_norm": 0.6106013655662537, "learning_rate": 3.8641853800271414e-08, "loss": 0.5755677223205566, "step": 2227 }, { "epoch": 2.890186536901865, "grad_norm": 0.6029745936393738, "learning_rate": 3.77597539668606e-08, "loss": 0.5609725713729858, "step": 2228 }, { "epoch": 2.891484184914842, "grad_norm": 0.6042892336845398, "learning_rate": 3.688780065878916e-08, "loss": 0.5605003237724304, "step": 2229 }, { "epoch": 2.8927818329278185, "grad_norm": 0.5953066945075989, "learning_rate": 3.602599565898091e-08, "loss": 0.5514798164367676, "step": 2230 }, { "epoch": 2.894079480940795, "grad_norm": 0.5983406901359558, "learning_rate": 3.517434072960901e-08, "loss": 0.6183291077613831, "step": 2231 }, { "epoch": 2.895377128953771, "grad_norm": 0.5932295918464661, "learning_rate": 3.433283761209161e-08, "loss": 0.6106539964675903, "step": 2232 }, { "epoch": 2.896674776966748, "grad_norm": 0.6509292721748352, "learning_rate": 3.3501488027090635e-08, "loss": 0.5615214109420776, "step": 2233 }, { "epoch": 2.8979724249797245, "grad_norm": 0.613764762878418, "learning_rate": 3.268029367450465e-08, "loss": 0.6054869294166565, "step": 2234 }, { "epoch": 2.899270072992701, "grad_norm": 0.6044638156890869, "learning_rate": 3.186925623346882e-08, "loss": 0.5691530704498291, "step": 2235 }, { "epoch": 2.900567721005677, "grad_norm": 0.6060168147087097, "learning_rate": 3.10683773623488e-08, "loss": 0.5762636065483093, "step": 2236 }, { "epoch": 2.9018653690186538, "grad_norm": 0.611011803150177, "learning_rate": 3.0277658698739665e-08, "loss": 0.5851128101348877, "step": 2237 }, { "epoch": 2.90316301703163, "grad_norm": 0.6304229497909546, "learning_rate": 2.9497101859460865e-08, "loss": 0.5497856140136719, "step": 2238 }, { "epoch": 2.9044606650446068, "grad_norm": 0.5783108472824097, "learning_rate": 2.872670844055403e-08, "loss": 0.5745448470115662, "step": 2239 }, { "epoch": 2.905758313057583, "grad_norm": 0.609293520450592, "learning_rate": 2.7966480017277974e-08, "loss": 0.5522551536560059, "step": 2240 }, { "epoch": 2.9070559610705597, "grad_norm": 0.5960776209831238, "learning_rate": 2.7216418144107583e-08, "loss": 0.5907799005508423, "step": 2241 }, { "epoch": 2.908353609083536, "grad_norm": 0.793721079826355, "learning_rate": 2.6476524354729917e-08, "loss": 0.55716872215271, "step": 2242 }, { "epoch": 2.9096512570965127, "grad_norm": 0.6245414614677429, "learning_rate": 2.5746800162040342e-08, "loss": 0.5835314989089966, "step": 2243 }, { "epoch": 2.910948905109489, "grad_norm": 0.616008996963501, "learning_rate": 2.5027247058139748e-08, "loss": 0.594428300857544, "step": 2244 }, { "epoch": 2.9122465531224657, "grad_norm": 0.5911674499511719, "learning_rate": 2.4317866514332322e-08, "loss": 0.5509923696517944, "step": 2245 }, { "epoch": 2.913544201135442, "grad_norm": 0.6335274577140808, "learning_rate": 2.361865998112223e-08, "loss": 0.6094061732292175, "step": 2246 }, { "epoch": 2.9148418491484183, "grad_norm": 0.6137773990631104, "learning_rate": 2.2929628888209156e-08, "loss": 0.6228293180465698, "step": 2247 }, { "epoch": 2.916139497161395, "grad_norm": 0.6228021383285522, "learning_rate": 2.2250774644487215e-08, "loss": 0.5877048969268799, "step": 2248 }, { "epoch": 2.9174371451743717, "grad_norm": 0.6152287125587463, "learning_rate": 2.158209863804217e-08, "loss": 0.6036567091941833, "step": 2249 }, { "epoch": 2.918734793187348, "grad_norm": 0.6172757744789124, "learning_rate": 2.0923602236146977e-08, "loss": 0.5865423083305359, "step": 2250 }, { "epoch": 2.9200324412003242, "grad_norm": 0.6071073412895203, "learning_rate": 2.0275286785260694e-08, "loss": 0.583999752998352, "step": 2251 }, { "epoch": 2.921330089213301, "grad_norm": 0.6244159936904907, "learning_rate": 1.9637153611022365e-08, "loss": 0.5794707536697388, "step": 2252 }, { "epoch": 2.9226277372262772, "grad_norm": 0.6465387940406799, "learning_rate": 1.9009204018255456e-08, "loss": 0.559209942817688, "step": 2253 }, { "epoch": 2.923925385239254, "grad_norm": 0.6284136176109314, "learning_rate": 1.839143929095566e-08, "loss": 0.562762975692749, "step": 2254 }, { "epoch": 2.92522303325223, "grad_norm": 0.6393802762031555, "learning_rate": 1.7783860692296982e-08, "loss": 0.6002349853515625, "step": 2255 }, { "epoch": 2.926520681265207, "grad_norm": 0.6242037415504456, "learning_rate": 1.718646946462288e-08, "loss": 0.593687117099762, "step": 2256 }, { "epoch": 2.927818329278183, "grad_norm": 0.6453087329864502, "learning_rate": 1.6599266829447902e-08, "loss": 0.6138840317726135, "step": 2257 }, { "epoch": 2.92911597729116, "grad_norm": 0.632391095161438, "learning_rate": 1.6022253987452717e-08, "loss": 0.5360509157180786, "step": 2258 }, { "epoch": 2.930413625304136, "grad_norm": 0.625159740447998, "learning_rate": 1.5455432118481884e-08, "loss": 0.6014057397842407, "step": 2259 }, { "epoch": 2.931711273317113, "grad_norm": 0.6160334944725037, "learning_rate": 1.4898802381543842e-08, "loss": 0.5864812135696411, "step": 2260 }, { "epoch": 2.933008921330089, "grad_norm": 0.6208499073982239, "learning_rate": 1.4352365914804822e-08, "loss": 0.5853984355926514, "step": 2261 }, { "epoch": 2.9343065693430654, "grad_norm": 0.6147589087486267, "learning_rate": 1.3816123835588835e-08, "loss": 0.6146311163902283, "step": 2262 }, { "epoch": 2.935604217356042, "grad_norm": 0.6171795129776001, "learning_rate": 1.3290077240375453e-08, "loss": 0.5833883285522461, "step": 2263 }, { "epoch": 2.936901865369019, "grad_norm": 0.5844340920448303, "learning_rate": 1.277422720479704e-08, "loss": 0.6002391576766968, "step": 2264 }, { "epoch": 2.938199513381995, "grad_norm": 0.6268512606620789, "learning_rate": 1.2268574783635968e-08, "loss": 0.6797309517860413, "step": 2265 }, { "epoch": 2.9394971613949714, "grad_norm": 0.5872271656990051, "learning_rate": 1.1773121010824063e-08, "loss": 0.5867947936058044, "step": 2266 }, { "epoch": 2.940794809407948, "grad_norm": 0.633165180683136, "learning_rate": 1.1287866899438171e-08, "loss": 0.6117358207702637, "step": 2267 }, { "epoch": 2.942092457420925, "grad_norm": 0.5991867184638977, "learning_rate": 1.081281344170071e-08, "loss": 0.5292370319366455, "step": 2268 }, { "epoch": 2.943390105433901, "grad_norm": 0.6432121396064758, "learning_rate": 1.0347961608975221e-08, "loss": 0.5962504148483276, "step": 2269 }, { "epoch": 2.9446877534468774, "grad_norm": 0.6073801517486572, "learning_rate": 9.893312351766382e-09, "loss": 0.6454894542694092, "step": 2270 }, { "epoch": 2.945985401459854, "grad_norm": 0.6156368851661682, "learning_rate": 9.448866599717221e-09, "loss": 0.5632429718971252, "step": 2271 }, { "epoch": 2.9472830494728304, "grad_norm": 0.7083485126495361, "learning_rate": 9.014625261605791e-09, "loss": 0.5813943147659302, "step": 2272 }, { "epoch": 2.948580697485807, "grad_norm": 0.6162700653076172, "learning_rate": 8.590589225346834e-09, "loss": 0.5752675533294678, "step": 2273 }, { "epoch": 2.9498783454987834, "grad_norm": 0.610639750957489, "learning_rate": 8.17675935798623e-09, "loss": 0.6433367133140564, "step": 2274 }, { "epoch": 2.95117599351176, "grad_norm": 0.5966771841049194, "learning_rate": 7.773136505700995e-09, "loss": 0.532874345779419, "step": 2275 }, { "epoch": 2.9524736415247363, "grad_norm": 0.6585695743560791, "learning_rate": 7.379721493798176e-09, "loss": 0.5892356634140015, "step": 2276 }, { "epoch": 2.9537712895377126, "grad_norm": 0.6081703901290894, "learning_rate": 6.996515126711511e-09, "loss": 0.5548315048217773, "step": 2277 }, { "epoch": 2.9550689375506893, "grad_norm": 0.6258850693702698, "learning_rate": 6.623518188001443e-09, "loss": 0.5927635431289673, "step": 2278 }, { "epoch": 2.956366585563666, "grad_norm": 0.6431419253349304, "learning_rate": 6.260731440351775e-09, "loss": 0.6057431101799011, "step": 2279 }, { "epoch": 2.9576642335766423, "grad_norm": 0.621634840965271, "learning_rate": 5.908155625570233e-09, "loss": 0.5803443789482117, "step": 2280 }, { "epoch": 2.9589618815896186, "grad_norm": 0.5794631838798523, "learning_rate": 5.56579146458458e-09, "loss": 0.5982474088668823, "step": 2281 }, { "epoch": 2.9602595296025953, "grad_norm": 0.5987969040870667, "learning_rate": 5.233639657443168e-09, "loss": 0.6081230640411377, "step": 2282 }, { "epoch": 2.961557177615572, "grad_norm": 0.6121331453323364, "learning_rate": 4.911700883312165e-09, "loss": 0.5589238405227661, "step": 2283 }, { "epoch": 2.9628548256285483, "grad_norm": 0.6170937418937683, "learning_rate": 4.599975800475553e-09, "loss": 0.575406014919281, "step": 2284 }, { "epoch": 2.9641524736415246, "grad_norm": 0.5928655862808228, "learning_rate": 4.298465046331246e-09, "loss": 0.588203489780426, "step": 2285 }, { "epoch": 2.9654501216545013, "grad_norm": 0.6178304553031921, "learning_rate": 4.007169237392749e-09, "loss": 0.5311431288719177, "step": 2286 }, { "epoch": 2.9667477696674776, "grad_norm": 0.6006078124046326, "learning_rate": 3.726088969286945e-09, "loss": 0.5917048454284668, "step": 2287 }, { "epoch": 2.9680454176804543, "grad_norm": 0.6022590398788452, "learning_rate": 3.4552248167507576e-09, "loss": 0.5889644026756287, "step": 2288 }, { "epoch": 2.9693430656934305, "grad_norm": 0.5813162922859192, "learning_rate": 3.1945773336333754e-09, "loss": 0.5726138353347778, "step": 2289 }, { "epoch": 2.9706407137064073, "grad_norm": 0.6178452372550964, "learning_rate": 2.9441470528929206e-09, "loss": 0.6099365949630737, "step": 2290 }, { "epoch": 2.9719383617193835, "grad_norm": 0.6197913289070129, "learning_rate": 2.703934486595894e-09, "loss": 0.6363242268562317, "step": 2291 }, { "epoch": 2.9732360097323602, "grad_norm": 0.6046017408370972, "learning_rate": 2.4739401259160635e-09, "loss": 0.5827226042747498, "step": 2292 }, { "epoch": 2.9745336577453365, "grad_norm": 0.64341801404953, "learning_rate": 2.2541644411344653e-09, "loss": 0.5797464847564697, "step": 2293 }, { "epoch": 2.9758313057583132, "grad_norm": 0.6010720133781433, "learning_rate": 2.0446078816355186e-09, "loss": 0.5213384628295898, "step": 2294 }, { "epoch": 2.9771289537712895, "grad_norm": 0.5899950265884399, "learning_rate": 1.8452708759097993e-09, "loss": 0.5917242765426636, "step": 2295 }, { "epoch": 2.9784266017842658, "grad_norm": 0.5960827469825745, "learning_rate": 1.656153831551821e-09, "loss": 0.5761323571205139, "step": 2296 }, { "epoch": 2.9797242497972425, "grad_norm": 0.641033411026001, "learning_rate": 1.4772571352567044e-09, "loss": 0.6058821678161621, "step": 2297 }, { "epoch": 2.981021897810219, "grad_norm": 0.6438850164413452, "learning_rate": 1.3085811528240622e-09, "loss": 0.6135293245315552, "step": 2298 }, { "epoch": 2.9823195458231955, "grad_norm": 0.652836263179779, "learning_rate": 1.1501262291530034e-09, "loss": 0.6278634667396545, "step": 2299 }, { "epoch": 2.9836171938361717, "grad_norm": 0.6056571006774902, "learning_rate": 1.0018926882443548e-09, "loss": 0.6097397208213806, "step": 2300 }, { "epoch": 2.9836171938361717, "eval_loss": 0.6816402673721313, "eval_runtime": 72.9022, "eval_samples_per_second": 71.219, "eval_steps_per_second": 8.902, "step": 2300 }, { "epoch": 2.9849148418491485, "grad_norm": 0.6095858812332153, "learning_rate": 8.638808331973281e-10, "loss": 0.5901839733123779, "step": 2301 }, { "epoch": 2.986212489862125, "grad_norm": 0.610837996006012, "learning_rate": 7.360909462111876e-10, "loss": 0.6008099913597107, "step": 2302 }, { "epoch": 2.9875101378751014, "grad_norm": 0.6182950735092163, "learning_rate": 6.185232885846937e-10, "loss": 0.599170446395874, "step": 2303 }, { "epoch": 2.9888077858880777, "grad_norm": 0.5876109600067139, "learning_rate": 5.111781007138827e-10, "loss": 0.5724647045135498, "step": 2304 }, { "epoch": 2.9901054339010544, "grad_norm": 0.6355734467506409, "learning_rate": 4.1405560209206716e-10, "loss": 0.5922134518623352, "step": 2305 }, { "epoch": 2.9914030819140307, "grad_norm": 0.613153338432312, "learning_rate": 3.2715599131039053e-10, "loss": 0.5836412906646729, "step": 2306 }, { "epoch": 2.9927007299270074, "grad_norm": 0.6345803737640381, "learning_rate": 2.5047944605616215e-10, "loss": 0.5756551623344421, "step": 2307 }, { "epoch": 2.9939983779399837, "grad_norm": 0.6199482679367065, "learning_rate": 1.840261231139673e-10, "loss": 0.5494982004165649, "step": 2308 }, { "epoch": 2.9952960259529604, "grad_norm": 0.62641441822052, "learning_rate": 1.2779615836455706e-10, "loss": 0.6009610295295715, "step": 2309 }, { "epoch": 2.9965936739659367, "grad_norm": 0.6289675831794739, "learning_rate": 8.17896667826279e-11, "loss": 0.6343727111816406, "step": 2310 }, { "epoch": 2.997891321978913, "grad_norm": 0.6318255662918091, "learning_rate": 4.600674244070735e-11, "loss": 0.5607834458351135, "step": 2311 }, { "epoch": 2.9991889699918897, "grad_norm": 0.6189204454421997, "learning_rate": 2.04474585052683e-11, "loss": 0.5686444044113159, "step": 2312 }, { "epoch": 3.0, "grad_norm": 0.7989046573638916, "learning_rate": 5.11186723950452e-12, "loss": 0.6789172887802124, "step": 2313 }, { "epoch": 3.0, "step": 2313, "total_flos": 8.852766725217714e+18, "train_loss": 0.6584745990833162, "train_runtime": 19013.667, "train_samples_per_second": 15.562, "train_steps_per_second": 0.122 } ], "logging_steps": 1.0, "max_steps": 2313, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 230, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.852766725217714e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }