Files
Qwen3-4B-SFT-science-1e-5/trainer_state.json
ModelHub XC afd3adcaa1 初始化项目,由ModelHub XC社区提供模型
Model: graf/Qwen3-4B-SFT-science-1e-5
Source: Original Platform
2026-05-30 05:11:19 +08:00

16315 lines
422 KiB
JSON

{
"best_global_step": 1380,
"best_metric": 0.6777992248535156,
"best_model_checkpoint": "saves/qwen3-4B/Qwen3-4B-SFT-science-1e-5/checkpoint-1380",
"epoch": 3.0,
"eval_steps": 230,
"global_step": 2313,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012976480129764801,
"grad_norm": 8.159334182739258,
"learning_rate": 0.0,
"loss": 1.117659091949463,
"step": 1
},
{
"epoch": 0.0025952960259529602,
"grad_norm": 7.678379535675049,
"learning_rate": 8.620689655172414e-08,
"loss": 1.0263863801956177,
"step": 2
},
{
"epoch": 0.0038929440389294406,
"grad_norm": 8.245121002197266,
"learning_rate": 1.7241379310344828e-07,
"loss": 1.1220793724060059,
"step": 3
},
{
"epoch": 0.0051905920519059205,
"grad_norm": 8.546252250671387,
"learning_rate": 2.5862068965517245e-07,
"loss": 1.18021821975708,
"step": 4
},
{
"epoch": 0.006488240064882401,
"grad_norm": 7.886499404907227,
"learning_rate": 3.4482758620689656e-07,
"loss": 1.107445240020752,
"step": 5
},
{
"epoch": 0.007785888077858881,
"grad_norm": 10.850175857543945,
"learning_rate": 4.3103448275862073e-07,
"loss": 1.099359393119812,
"step": 6
},
{
"epoch": 0.009083536090835361,
"grad_norm": 8.097647666931152,
"learning_rate": 5.172413793103449e-07,
"loss": 1.0631245374679565,
"step": 7
},
{
"epoch": 0.010381184103811841,
"grad_norm": 7.725368022918701,
"learning_rate": 6.034482758620691e-07,
"loss": 1.0364526510238647,
"step": 8
},
{
"epoch": 0.01167883211678832,
"grad_norm": 7.084433555603027,
"learning_rate": 6.896551724137931e-07,
"loss": 0.977345883846283,
"step": 9
},
{
"epoch": 0.012976480129764802,
"grad_norm": 7.370170593261719,
"learning_rate": 7.758620689655173e-07,
"loss": 1.0759401321411133,
"step": 10
},
{
"epoch": 0.014274128142741281,
"grad_norm": 7.056736469268799,
"learning_rate": 8.620689655172415e-07,
"loss": 1.051821231842041,
"step": 11
},
{
"epoch": 0.015571776155717762,
"grad_norm": 6.129208564758301,
"learning_rate": 9.482758620689655e-07,
"loss": 1.008002758026123,
"step": 12
},
{
"epoch": 0.01686942416869424,
"grad_norm": 6.331120491027832,
"learning_rate": 1.0344827586206898e-06,
"loss": 1.0412415266036987,
"step": 13
},
{
"epoch": 0.018167072181670723,
"grad_norm": 5.9186625480651855,
"learning_rate": 1.120689655172414e-06,
"loss": 1.0198311805725098,
"step": 14
},
{
"epoch": 0.019464720194647202,
"grad_norm": 5.27198600769043,
"learning_rate": 1.2068965517241381e-06,
"loss": 1.0152095556259155,
"step": 15
},
{
"epoch": 0.020762368207623682,
"grad_norm": 4.562581539154053,
"learning_rate": 1.2931034482758623e-06,
"loss": 0.9857317805290222,
"step": 16
},
{
"epoch": 0.02206001622060016,
"grad_norm": 4.586100101470947,
"learning_rate": 1.3793103448275862e-06,
"loss": 1.0225930213928223,
"step": 17
},
{
"epoch": 0.02335766423357664,
"grad_norm": 4.058810234069824,
"learning_rate": 1.4655172413793104e-06,
"loss": 0.9823198318481445,
"step": 18
},
{
"epoch": 0.024655312246553124,
"grad_norm": 4.066655158996582,
"learning_rate": 1.5517241379310346e-06,
"loss": 0.9863596558570862,
"step": 19
},
{
"epoch": 0.025952960259529603,
"grad_norm": 3.7554173469543457,
"learning_rate": 1.6379310344827587e-06,
"loss": 0.9025828838348389,
"step": 20
},
{
"epoch": 0.027250608272506083,
"grad_norm": 2.6631460189819336,
"learning_rate": 1.724137931034483e-06,
"loss": 0.9907147288322449,
"step": 21
},
{
"epoch": 0.028548256285482562,
"grad_norm": 2.3198695182800293,
"learning_rate": 1.810344827586207e-06,
"loss": 0.872843861579895,
"step": 22
},
{
"epoch": 0.02984590429845904,
"grad_norm": 2.0851941108703613,
"learning_rate": 1.896551724137931e-06,
"loss": 0.896687388420105,
"step": 23
},
{
"epoch": 0.031143552311435525,
"grad_norm": 1.9391196966171265,
"learning_rate": 1.982758620689655e-06,
"loss": 0.8471081852912903,
"step": 24
},
{
"epoch": 0.032441200324412,
"grad_norm": 1.705809473991394,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.8402453064918518,
"step": 25
},
{
"epoch": 0.03373884833738848,
"grad_norm": 1.6870861053466797,
"learning_rate": 2.1551724137931035e-06,
"loss": 0.8823003768920898,
"step": 26
},
{
"epoch": 0.035036496350364967,
"grad_norm": 1.63539719581604,
"learning_rate": 2.241379310344828e-06,
"loss": 0.8520861864089966,
"step": 27
},
{
"epoch": 0.036334144363341446,
"grad_norm": 1.215566873550415,
"learning_rate": 2.327586206896552e-06,
"loss": 0.8638472557067871,
"step": 28
},
{
"epoch": 0.037631792376317925,
"grad_norm": 1.3506380319595337,
"learning_rate": 2.4137931034482762e-06,
"loss": 0.8670657277107239,
"step": 29
},
{
"epoch": 0.038929440389294405,
"grad_norm": 1.3755369186401367,
"learning_rate": 2.5e-06,
"loss": 0.8863908648490906,
"step": 30
},
{
"epoch": 0.040227088402270884,
"grad_norm": 1.3734447956085205,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.8129177093505859,
"step": 31
},
{
"epoch": 0.041524736415247364,
"grad_norm": 1.294492244720459,
"learning_rate": 2.672413793103448e-06,
"loss": 0.8296308517456055,
"step": 32
},
{
"epoch": 0.04282238442822384,
"grad_norm": 1.0568984746932983,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.8287128210067749,
"step": 33
},
{
"epoch": 0.04412003244120032,
"grad_norm": 0.9133521914482117,
"learning_rate": 2.844827586206897e-06,
"loss": 0.7776259183883667,
"step": 34
},
{
"epoch": 0.0454176804541768,
"grad_norm": 0.9421447515487671,
"learning_rate": 2.931034482758621e-06,
"loss": 0.8422408103942871,
"step": 35
},
{
"epoch": 0.04671532846715328,
"grad_norm": 0.9022809863090515,
"learning_rate": 3.017241379310345e-06,
"loss": 0.8629512190818787,
"step": 36
},
{
"epoch": 0.04801297648012977,
"grad_norm": 0.780587375164032,
"learning_rate": 3.103448275862069e-06,
"loss": 0.781667947769165,
"step": 37
},
{
"epoch": 0.04931062449310625,
"grad_norm": 0.7616261839866638,
"learning_rate": 3.1896551724137935e-06,
"loss": 0.7705612182617188,
"step": 38
},
{
"epoch": 0.05060827250608273,
"grad_norm": 0.7669604420661926,
"learning_rate": 3.2758620689655175e-06,
"loss": 0.8600028157234192,
"step": 39
},
{
"epoch": 0.05190592051905921,
"grad_norm": 0.8013553619384766,
"learning_rate": 3.362068965517242e-06,
"loss": 0.8316032886505127,
"step": 40
},
{
"epoch": 0.053203568532035686,
"grad_norm": 0.760819673538208,
"learning_rate": 3.448275862068966e-06,
"loss": 0.8170580863952637,
"step": 41
},
{
"epoch": 0.054501216545012166,
"grad_norm": 0.719124436378479,
"learning_rate": 3.5344827586206898e-06,
"loss": 0.7726640701293945,
"step": 42
},
{
"epoch": 0.055798864557988645,
"grad_norm": 0.7333022952079773,
"learning_rate": 3.620689655172414e-06,
"loss": 0.7847077250480652,
"step": 43
},
{
"epoch": 0.057096512570965124,
"grad_norm": 0.7520370483398438,
"learning_rate": 3.7068965517241385e-06,
"loss": 0.7839537858963013,
"step": 44
},
{
"epoch": 0.058394160583941604,
"grad_norm": 0.7901465892791748,
"learning_rate": 3.793103448275862e-06,
"loss": 0.8387829065322876,
"step": 45
},
{
"epoch": 0.05969180859691808,
"grad_norm": 0.7442818284034729,
"learning_rate": 3.8793103448275865e-06,
"loss": 0.7767361402511597,
"step": 46
},
{
"epoch": 0.06098945660989456,
"grad_norm": 0.6601076722145081,
"learning_rate": 3.96551724137931e-06,
"loss": 0.7320765256881714,
"step": 47
},
{
"epoch": 0.06228710462287105,
"grad_norm": 0.6948726773262024,
"learning_rate": 4.051724137931034e-06,
"loss": 0.8290716409683228,
"step": 48
},
{
"epoch": 0.06358475263584752,
"grad_norm": 0.6669663190841675,
"learning_rate": 4.137931034482759e-06,
"loss": 0.7917401790618896,
"step": 49
},
{
"epoch": 0.064882400648824,
"grad_norm": 0.6616993546485901,
"learning_rate": 4.224137931034483e-06,
"loss": 0.7917563915252686,
"step": 50
},
{
"epoch": 0.06618004866180048,
"grad_norm": 0.6595159769058228,
"learning_rate": 4.310344827586207e-06,
"loss": 0.7899826765060425,
"step": 51
},
{
"epoch": 0.06747769667477696,
"grad_norm": 0.6776856184005737,
"learning_rate": 4.396551724137931e-06,
"loss": 0.8258700370788574,
"step": 52
},
{
"epoch": 0.06877534468775345,
"grad_norm": 0.7086785435676575,
"learning_rate": 4.482758620689656e-06,
"loss": 0.8281067609786987,
"step": 53
},
{
"epoch": 0.07007299270072993,
"grad_norm": 0.6362385153770447,
"learning_rate": 4.56896551724138e-06,
"loss": 0.7703720331192017,
"step": 54
},
{
"epoch": 0.07137064071370641,
"grad_norm": 1.0633333921432495,
"learning_rate": 4.655172413793104e-06,
"loss": 0.7698659896850586,
"step": 55
},
{
"epoch": 0.07266828872668289,
"grad_norm": 0.6450533270835876,
"learning_rate": 4.741379310344828e-06,
"loss": 0.7988396286964417,
"step": 56
},
{
"epoch": 0.07396593673965937,
"grad_norm": 0.6176488995552063,
"learning_rate": 4.8275862068965525e-06,
"loss": 0.7486166954040527,
"step": 57
},
{
"epoch": 0.07526358475263585,
"grad_norm": 0.6564953327178955,
"learning_rate": 4.9137931034482765e-06,
"loss": 0.8380484580993652,
"step": 58
},
{
"epoch": 0.07656123276561233,
"grad_norm": 1.4383426904678345,
"learning_rate": 5e-06,
"loss": 0.8178592920303345,
"step": 59
},
{
"epoch": 0.07785888077858881,
"grad_norm": 0.6065345406532288,
"learning_rate": 5.086206896551724e-06,
"loss": 0.7402592897415161,
"step": 60
},
{
"epoch": 0.07915652879156529,
"grad_norm": 0.6361149549484253,
"learning_rate": 5.172413793103449e-06,
"loss": 0.7722781300544739,
"step": 61
},
{
"epoch": 0.08045417680454177,
"grad_norm": 0.6287536025047302,
"learning_rate": 5.258620689655173e-06,
"loss": 0.8166277408599854,
"step": 62
},
{
"epoch": 0.08175182481751825,
"grad_norm": 0.6238293051719666,
"learning_rate": 5.344827586206896e-06,
"loss": 0.7863017320632935,
"step": 63
},
{
"epoch": 0.08304947283049473,
"grad_norm": 0.6116371750831604,
"learning_rate": 5.431034482758621e-06,
"loss": 0.8139179944992065,
"step": 64
},
{
"epoch": 0.08434712084347121,
"grad_norm": 0.6211651563644409,
"learning_rate": 5.517241379310345e-06,
"loss": 0.802246630191803,
"step": 65
},
{
"epoch": 0.08564476885644769,
"grad_norm": 0.6179801821708679,
"learning_rate": 5.603448275862069e-06,
"loss": 0.8019924163818359,
"step": 66
},
{
"epoch": 0.08694241686942417,
"grad_norm": 0.6304736733436584,
"learning_rate": 5.689655172413794e-06,
"loss": 0.797938346862793,
"step": 67
},
{
"epoch": 0.08824006488240065,
"grad_norm": 0.5991215705871582,
"learning_rate": 5.775862068965518e-06,
"loss": 0.7311227917671204,
"step": 68
},
{
"epoch": 0.08953771289537713,
"grad_norm": 0.6336483955383301,
"learning_rate": 5.862068965517242e-06,
"loss": 0.8222885131835938,
"step": 69
},
{
"epoch": 0.0908353609083536,
"grad_norm": 0.6269424557685852,
"learning_rate": 5.9482758620689665e-06,
"loss": 0.7962170839309692,
"step": 70
},
{
"epoch": 0.09213300892133008,
"grad_norm": 0.6373898983001709,
"learning_rate": 6.03448275862069e-06,
"loss": 0.8021715879440308,
"step": 71
},
{
"epoch": 0.09343065693430656,
"grad_norm": 0.6345935463905334,
"learning_rate": 6.1206896551724135e-06,
"loss": 0.8776074647903442,
"step": 72
},
{
"epoch": 0.09472830494728304,
"grad_norm": 0.6083796620368958,
"learning_rate": 6.206896551724138e-06,
"loss": 0.7513650059700012,
"step": 73
},
{
"epoch": 0.09602595296025954,
"grad_norm": 0.6068538427352905,
"learning_rate": 6.293103448275862e-06,
"loss": 0.7684041857719421,
"step": 74
},
{
"epoch": 0.09732360097323602,
"grad_norm": 0.6176103949546814,
"learning_rate": 6.379310344827587e-06,
"loss": 0.7645843029022217,
"step": 75
},
{
"epoch": 0.0986212489862125,
"grad_norm": 0.6182767152786255,
"learning_rate": 6.465517241379311e-06,
"loss": 0.8177169561386108,
"step": 76
},
{
"epoch": 0.09991889699918897,
"grad_norm": 0.6175945997238159,
"learning_rate": 6.551724137931035e-06,
"loss": 0.7822265625,
"step": 77
},
{
"epoch": 0.10121654501216545,
"grad_norm": 0.6050496101379395,
"learning_rate": 6.63793103448276e-06,
"loss": 0.7576093673706055,
"step": 78
},
{
"epoch": 0.10251419302514193,
"grad_norm": 0.7123962640762329,
"learning_rate": 6.724137931034484e-06,
"loss": 0.8231764435768127,
"step": 79
},
{
"epoch": 0.10381184103811841,
"grad_norm": 0.61634361743927,
"learning_rate": 6.810344827586207e-06,
"loss": 0.7479314804077148,
"step": 80
},
{
"epoch": 0.10510948905109489,
"grad_norm": 0.5944046378135681,
"learning_rate": 6.896551724137932e-06,
"loss": 0.7602187395095825,
"step": 81
},
{
"epoch": 0.10640713706407137,
"grad_norm": 1.9641212224960327,
"learning_rate": 6.982758620689656e-06,
"loss": 0.7291417121887207,
"step": 82
},
{
"epoch": 0.10770478507704785,
"grad_norm": 0.6604083776473999,
"learning_rate": 7.0689655172413796e-06,
"loss": 0.7462600469589233,
"step": 83
},
{
"epoch": 0.10900243309002433,
"grad_norm": 0.6202764511108398,
"learning_rate": 7.155172413793104e-06,
"loss": 0.8041630983352661,
"step": 84
},
{
"epoch": 0.11030008110300081,
"grad_norm": 0.6278896331787109,
"learning_rate": 7.241379310344828e-06,
"loss": 0.7589733600616455,
"step": 85
},
{
"epoch": 0.11159772911597729,
"grad_norm": 0.5918757915496826,
"learning_rate": 7.327586206896552e-06,
"loss": 0.7733231782913208,
"step": 86
},
{
"epoch": 0.11289537712895377,
"grad_norm": 0.6275747418403625,
"learning_rate": 7.413793103448277e-06,
"loss": 0.7821832299232483,
"step": 87
},
{
"epoch": 0.11419302514193025,
"grad_norm": 0.5935595631599426,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7410198450088501,
"step": 88
},
{
"epoch": 0.11549067315490673,
"grad_norm": 0.6088429093360901,
"learning_rate": 7.586206896551724e-06,
"loss": 0.78556227684021,
"step": 89
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.6014888286590576,
"learning_rate": 7.672413793103449e-06,
"loss": 0.7850443124771118,
"step": 90
},
{
"epoch": 0.11808596918085969,
"grad_norm": 0.644192636013031,
"learning_rate": 7.758620689655173e-06,
"loss": 0.7852208614349365,
"step": 91
},
{
"epoch": 0.11938361719383617,
"grad_norm": 0.6681314706802368,
"learning_rate": 7.844827586206897e-06,
"loss": 0.8286585211753845,
"step": 92
},
{
"epoch": 0.12068126520681265,
"grad_norm": 0.6156536936759949,
"learning_rate": 7.93103448275862e-06,
"loss": 0.7740339040756226,
"step": 93
},
{
"epoch": 0.12197891321978913,
"grad_norm": 0.5617393255233765,
"learning_rate": 8.017241379310345e-06,
"loss": 0.7133764028549194,
"step": 94
},
{
"epoch": 0.12327656123276562,
"grad_norm": 0.6284353733062744,
"learning_rate": 8.103448275862069e-06,
"loss": 0.8572052121162415,
"step": 95
},
{
"epoch": 0.1245742092457421,
"grad_norm": 0.6048849821090698,
"learning_rate": 8.189655172413794e-06,
"loss": 0.7354931831359863,
"step": 96
},
{
"epoch": 0.12587185725871858,
"grad_norm": 0.717276930809021,
"learning_rate": 8.275862068965518e-06,
"loss": 0.7633223533630371,
"step": 97
},
{
"epoch": 0.12716950527169504,
"grad_norm": 0.5850024223327637,
"learning_rate": 8.362068965517242e-06,
"loss": 0.7660566568374634,
"step": 98
},
{
"epoch": 0.12846715328467154,
"grad_norm": 0.6040444374084473,
"learning_rate": 8.448275862068966e-06,
"loss": 0.687772274017334,
"step": 99
},
{
"epoch": 0.129764801297648,
"grad_norm": 0.635793924331665,
"learning_rate": 8.53448275862069e-06,
"loss": 0.8365746736526489,
"step": 100
},
{
"epoch": 0.1310624493106245,
"grad_norm": 0.681013822555542,
"learning_rate": 8.620689655172414e-06,
"loss": 0.8097229599952698,
"step": 101
},
{
"epoch": 0.13236009732360096,
"grad_norm": 0.5976776480674744,
"learning_rate": 8.706896551724138e-06,
"loss": 0.7460197806358337,
"step": 102
},
{
"epoch": 0.13365774533657745,
"grad_norm": 0.5931348204612732,
"learning_rate": 8.793103448275862e-06,
"loss": 0.7234000563621521,
"step": 103
},
{
"epoch": 0.13495539334955392,
"grad_norm": 0.6787912845611572,
"learning_rate": 8.879310344827588e-06,
"loss": 0.8100739121437073,
"step": 104
},
{
"epoch": 0.1362530413625304,
"grad_norm": 0.6532299518585205,
"learning_rate": 8.965517241379312e-06,
"loss": 0.8400453925132751,
"step": 105
},
{
"epoch": 0.1375506893755069,
"grad_norm": 0.6569010615348816,
"learning_rate": 9.051724137931036e-06,
"loss": 0.8247137069702148,
"step": 106
},
{
"epoch": 0.13884833738848337,
"grad_norm": 0.6199808716773987,
"learning_rate": 9.13793103448276e-06,
"loss": 0.7428423166275024,
"step": 107
},
{
"epoch": 0.14014598540145987,
"grad_norm": 0.6075517535209656,
"learning_rate": 9.224137931034484e-06,
"loss": 0.7575728893280029,
"step": 108
},
{
"epoch": 0.14144363341443633,
"grad_norm": 0.6420115232467651,
"learning_rate": 9.310344827586207e-06,
"loss": 0.8051052093505859,
"step": 109
},
{
"epoch": 0.14274128142741282,
"grad_norm": 0.6138091683387756,
"learning_rate": 9.396551724137931e-06,
"loss": 0.8522422313690186,
"step": 110
},
{
"epoch": 0.1440389294403893,
"grad_norm": 0.650187075138092,
"learning_rate": 9.482758620689655e-06,
"loss": 0.8301827907562256,
"step": 111
},
{
"epoch": 0.14533657745336578,
"grad_norm": 0.6030973196029663,
"learning_rate": 9.56896551724138e-06,
"loss": 0.7207387089729309,
"step": 112
},
{
"epoch": 0.14663422546634225,
"grad_norm": 0.622131884098053,
"learning_rate": 9.655172413793105e-06,
"loss": 0.7915451526641846,
"step": 113
},
{
"epoch": 0.14793187347931874,
"grad_norm": 0.6085039377212524,
"learning_rate": 9.741379310344829e-06,
"loss": 0.7769342064857483,
"step": 114
},
{
"epoch": 0.1492295214922952,
"grad_norm": 0.6578651666641235,
"learning_rate": 9.827586206896553e-06,
"loss": 0.7566852569580078,
"step": 115
},
{
"epoch": 0.1505271695052717,
"grad_norm": 0.6066433787345886,
"learning_rate": 9.913793103448277e-06,
"loss": 0.7825925350189209,
"step": 116
},
{
"epoch": 0.15182481751824817,
"grad_norm": 0.6409288644790649,
"learning_rate": 1e-05,
"loss": 0.8247882127761841,
"step": 117
},
{
"epoch": 0.15312246553122466,
"grad_norm": 0.6675072312355042,
"learning_rate": 9.99999488813276e-06,
"loss": 0.8096261024475098,
"step": 118
},
{
"epoch": 0.15442011354420113,
"grad_norm": 0.6444228887557983,
"learning_rate": 9.999979552541496e-06,
"loss": 0.7326732873916626,
"step": 119
},
{
"epoch": 0.15571776155717762,
"grad_norm": 0.6155293583869934,
"learning_rate": 9.99995399325756e-06,
"loss": 0.7694077491760254,
"step": 120
},
{
"epoch": 0.15701540957015409,
"grad_norm": 0.6370646953582764,
"learning_rate": 9.999918210333219e-06,
"loss": 0.8235340118408203,
"step": 121
},
{
"epoch": 0.15831305758313058,
"grad_norm": 0.6056079864501953,
"learning_rate": 9.999872203841635e-06,
"loss": 0.7498428821563721,
"step": 122
},
{
"epoch": 0.15961070559610704,
"grad_norm": 0.6514161825180054,
"learning_rate": 9.999815973876888e-06,
"loss": 0.772469162940979,
"step": 123
},
{
"epoch": 0.16090835360908354,
"grad_norm": 0.6417706608772278,
"learning_rate": 9.999749520553945e-06,
"loss": 0.8074150085449219,
"step": 124
},
{
"epoch": 0.16220600162206,
"grad_norm": 0.6162619590759277,
"learning_rate": 9.99967284400869e-06,
"loss": 0.7649105191230774,
"step": 125
},
{
"epoch": 0.1635036496350365,
"grad_norm": 0.6231618523597717,
"learning_rate": 9.99958594439791e-06,
"loss": 0.7435484528541565,
"step": 126
},
{
"epoch": 0.164801297648013,
"grad_norm": 0.6211341023445129,
"learning_rate": 9.999488821899286e-06,
"loss": 0.7700725793838501,
"step": 127
},
{
"epoch": 0.16609894566098946,
"grad_norm": 0.6546758413314819,
"learning_rate": 9.999381476711416e-06,
"loss": 0.7208442091941833,
"step": 128
},
{
"epoch": 0.16739659367396595,
"grad_norm": 0.6165010333061218,
"learning_rate": 9.999263909053789e-06,
"loss": 0.7380815148353577,
"step": 129
},
{
"epoch": 0.16869424168694241,
"grad_norm": 0.7457146048545837,
"learning_rate": 9.999136119166803e-06,
"loss": 0.7085788249969482,
"step": 130
},
{
"epoch": 0.1699918896999189,
"grad_norm": 0.6893863677978516,
"learning_rate": 9.998998107311758e-06,
"loss": 0.8248496055603027,
"step": 131
},
{
"epoch": 0.17128953771289537,
"grad_norm": 0.6099883317947388,
"learning_rate": 9.998849873770849e-06,
"loss": 0.7661588191986084,
"step": 132
},
{
"epoch": 0.17258718572587187,
"grad_norm": 0.5964142084121704,
"learning_rate": 9.998691418847177e-06,
"loss": 0.7037764191627502,
"step": 133
},
{
"epoch": 0.17388483373884833,
"grad_norm": 0.6277547478675842,
"learning_rate": 9.998522742864745e-06,
"loss": 0.8015055060386658,
"step": 134
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.6385223865509033,
"learning_rate": 9.998343846168448e-06,
"loss": 0.7598564028739929,
"step": 135
},
{
"epoch": 0.1764801297648013,
"grad_norm": 0.6057168245315552,
"learning_rate": 9.998154729124092e-06,
"loss": 0.7190810441970825,
"step": 136
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.6524573564529419,
"learning_rate": 9.997955392118365e-06,
"loss": 0.7655267715454102,
"step": 137
},
{
"epoch": 0.17907542579075425,
"grad_norm": 0.593307614326477,
"learning_rate": 9.997745835558867e-06,
"loss": 0.6991128921508789,
"step": 138
},
{
"epoch": 0.18037307380373074,
"grad_norm": 0.6667762994766235,
"learning_rate": 9.997526059874086e-06,
"loss": 0.7836197018623352,
"step": 139
},
{
"epoch": 0.1816707218167072,
"grad_norm": 0.6364095211029053,
"learning_rate": 9.997296065513405e-06,
"loss": 0.7866847515106201,
"step": 140
},
{
"epoch": 0.1829683698296837,
"grad_norm": 0.6693204641342163,
"learning_rate": 9.997055852947109e-06,
"loss": 0.8498630523681641,
"step": 141
},
{
"epoch": 0.18426601784266017,
"grad_norm": 0.6703641414642334,
"learning_rate": 9.996805422666367e-06,
"loss": 0.7902424335479736,
"step": 142
},
{
"epoch": 0.18556366585563666,
"grad_norm": 0.6226605772972107,
"learning_rate": 9.99654477518325e-06,
"loss": 0.7982854843139648,
"step": 143
},
{
"epoch": 0.18686131386861313,
"grad_norm": 0.5963988304138184,
"learning_rate": 9.996273911030714e-06,
"loss": 0.7364012598991394,
"step": 144
},
{
"epoch": 0.18815896188158962,
"grad_norm": 3.2399189472198486,
"learning_rate": 9.995992830762608e-06,
"loss": 0.8748813271522522,
"step": 145
},
{
"epoch": 0.18945660989456609,
"grad_norm": 0.6035348773002625,
"learning_rate": 9.99570153495367e-06,
"loss": 0.7249287366867065,
"step": 146
},
{
"epoch": 0.19075425790754258,
"grad_norm": 0.6258792877197266,
"learning_rate": 9.995400024199526e-06,
"loss": 0.7734540700912476,
"step": 147
},
{
"epoch": 0.19205190592051907,
"grad_norm": 0.6568045020103455,
"learning_rate": 9.99508829911669e-06,
"loss": 0.8293142318725586,
"step": 148
},
{
"epoch": 0.19334955393349554,
"grad_norm": 0.8624785542488098,
"learning_rate": 9.994766360342557e-06,
"loss": 0.8258950710296631,
"step": 149
},
{
"epoch": 0.19464720194647203,
"grad_norm": 0.591865599155426,
"learning_rate": 9.994434208535415e-06,
"loss": 0.7743998765945435,
"step": 150
},
{
"epoch": 0.1959448499594485,
"grad_norm": 0.6273242831230164,
"learning_rate": 9.994091844374431e-06,
"loss": 0.8304177522659302,
"step": 151
},
{
"epoch": 0.197242497972425,
"grad_norm": 0.6169039011001587,
"learning_rate": 9.993739268559648e-06,
"loss": 0.8317509889602661,
"step": 152
},
{
"epoch": 0.19854014598540146,
"grad_norm": 0.6500508785247803,
"learning_rate": 9.993376481812001e-06,
"loss": 0.8074177503585815,
"step": 153
},
{
"epoch": 0.19983779399837795,
"grad_norm": 0.691698431968689,
"learning_rate": 9.99300348487329e-06,
"loss": 0.7966357469558716,
"step": 154
},
{
"epoch": 0.20113544201135442,
"grad_norm": 0.6341277956962585,
"learning_rate": 9.992620278506203e-06,
"loss": 0.7922544479370117,
"step": 155
},
{
"epoch": 0.2024330900243309,
"grad_norm": 0.5936447381973267,
"learning_rate": 9.9922268634943e-06,
"loss": 0.6732587218284607,
"step": 156
},
{
"epoch": 0.20373073803730737,
"grad_norm": 0.6575024127960205,
"learning_rate": 9.991823240642014e-06,
"loss": 0.8733258247375488,
"step": 157
},
{
"epoch": 0.20502838605028387,
"grad_norm": 0.6686046719551086,
"learning_rate": 9.991409410774654e-06,
"loss": 0.790815532207489,
"step": 158
},
{
"epoch": 0.20632603406326033,
"grad_norm": 1.4253793954849243,
"learning_rate": 9.990985374738396e-06,
"loss": 0.7325870990753174,
"step": 159
},
{
"epoch": 0.20762368207623683,
"grad_norm": 0.6524296998977661,
"learning_rate": 9.990551133400284e-06,
"loss": 0.7516152858734131,
"step": 160
},
{
"epoch": 0.2089213300892133,
"grad_norm": 0.6569153666496277,
"learning_rate": 9.990106687648234e-06,
"loss": 0.7317984104156494,
"step": 161
},
{
"epoch": 0.21021897810218979,
"grad_norm": 0.5729793906211853,
"learning_rate": 9.989652038391025e-06,
"loss": 0.7050694227218628,
"step": 162
},
{
"epoch": 0.21151662611516625,
"grad_norm": 0.5924677848815918,
"learning_rate": 9.9891871865583e-06,
"loss": 0.7387759685516357,
"step": 163
},
{
"epoch": 0.21281427412814274,
"grad_norm": 0.9845248460769653,
"learning_rate": 9.988712133100563e-06,
"loss": 0.8402718305587769,
"step": 164
},
{
"epoch": 0.2141119221411192,
"grad_norm": 0.6559567451477051,
"learning_rate": 9.988226878989178e-06,
"loss": 0.7516730427742004,
"step": 165
},
{
"epoch": 0.2154095701540957,
"grad_norm": 0.603742778301239,
"learning_rate": 9.987731425216364e-06,
"loss": 0.6687497496604919,
"step": 166
},
{
"epoch": 0.21670721816707217,
"grad_norm": 0.6345369815826416,
"learning_rate": 9.987225772795204e-06,
"loss": 0.8063400387763977,
"step": 167
},
{
"epoch": 0.21800486618004866,
"grad_norm": 0.6372174024581909,
"learning_rate": 9.986709922759626e-06,
"loss": 0.7703537940979004,
"step": 168
},
{
"epoch": 0.21930251419302516,
"grad_norm": 0.607814371585846,
"learning_rate": 9.986183876164412e-06,
"loss": 0.6834731101989746,
"step": 169
},
{
"epoch": 0.22060016220600162,
"grad_norm": 0.5630145072937012,
"learning_rate": 9.985647634085197e-06,
"loss": 0.7261765599250793,
"step": 170
},
{
"epoch": 0.22189781021897811,
"grad_norm": 0.6719157695770264,
"learning_rate": 9.985101197618456e-06,
"loss": 0.7341983318328857,
"step": 171
},
{
"epoch": 0.22319545823195458,
"grad_norm": 0.6283457279205322,
"learning_rate": 9.98454456788152e-06,
"loss": 0.7351614832878113,
"step": 172
},
{
"epoch": 0.22449310624493107,
"grad_norm": 0.6344905495643616,
"learning_rate": 9.983977746012547e-06,
"loss": 0.7843720316886902,
"step": 173
},
{
"epoch": 0.22579075425790754,
"grad_norm": 0.605237603187561,
"learning_rate": 9.983400733170553e-06,
"loss": 0.7114173769950867,
"step": 174
},
{
"epoch": 0.22708840227088403,
"grad_norm": 0.626672089099884,
"learning_rate": 9.982813530535377e-06,
"loss": 0.7024215459823608,
"step": 175
},
{
"epoch": 0.2283860502838605,
"grad_norm": 0.6185852885246277,
"learning_rate": 9.982216139307705e-06,
"loss": 0.8043787479400635,
"step": 176
},
{
"epoch": 0.229683698296837,
"grad_norm": 0.5857049226760864,
"learning_rate": 9.981608560709044e-06,
"loss": 0.6755383014678955,
"step": 177
},
{
"epoch": 0.23098134630981346,
"grad_norm": 0.6019972562789917,
"learning_rate": 9.980990795981747e-06,
"loss": 0.7932974100112915,
"step": 178
},
{
"epoch": 0.23227899432278995,
"grad_norm": 0.6226310729980469,
"learning_rate": 9.980362846388978e-06,
"loss": 0.784454882144928,
"step": 179
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.643936812877655,
"learning_rate": 9.97972471321474e-06,
"loss": 0.768436849117279,
"step": 180
},
{
"epoch": 0.2348742903487429,
"grad_norm": 0.629254162311554,
"learning_rate": 9.979076397763853e-06,
"loss": 0.7261864542961121,
"step": 181
},
{
"epoch": 0.23617193836171937,
"grad_norm": 0.6138353943824768,
"learning_rate": 9.978417901361958e-06,
"loss": 0.8290830254554749,
"step": 182
},
{
"epoch": 0.23746958637469587,
"grad_norm": 0.6166982054710388,
"learning_rate": 9.977749225355513e-06,
"loss": 0.7295878529548645,
"step": 183
},
{
"epoch": 0.23876723438767233,
"grad_norm": 0.5729910731315613,
"learning_rate": 9.977070371111793e-06,
"loss": 0.7391046285629272,
"step": 184
},
{
"epoch": 0.24006488240064883,
"grad_norm": 0.6283906102180481,
"learning_rate": 9.976381340018879e-06,
"loss": 0.7741225957870483,
"step": 185
},
{
"epoch": 0.2413625304136253,
"grad_norm": 0.5742847919464111,
"learning_rate": 9.97568213348567e-06,
"loss": 0.7565523386001587,
"step": 186
},
{
"epoch": 0.24266017842660179,
"grad_norm": 0.5885831713676453,
"learning_rate": 9.974972752941861e-06,
"loss": 0.7079343199729919,
"step": 187
},
{
"epoch": 0.24395782643957825,
"grad_norm": 0.6233158707618713,
"learning_rate": 9.97425319983796e-06,
"loss": 0.802773118019104,
"step": 188
},
{
"epoch": 0.24525547445255474,
"grad_norm": 0.6107950210571289,
"learning_rate": 9.97352347564527e-06,
"loss": 0.7514665126800537,
"step": 189
},
{
"epoch": 0.24655312246553124,
"grad_norm": 0.6127108335494995,
"learning_rate": 9.972783581855894e-06,
"loss": 0.766715943813324,
"step": 190
},
{
"epoch": 0.2478507704785077,
"grad_norm": 0.5911589860916138,
"learning_rate": 9.972033519982722e-06,
"loss": 0.719687283039093,
"step": 191
},
{
"epoch": 0.2491484184914842,
"grad_norm": 0.7104600071907043,
"learning_rate": 9.971273291559447e-06,
"loss": 0.7840068340301514,
"step": 192
},
{
"epoch": 0.25044606650446066,
"grad_norm": 1.2322938442230225,
"learning_rate": 9.97050289814054e-06,
"loss": 0.7457755208015442,
"step": 193
},
{
"epoch": 0.25174371451743716,
"grad_norm": 0.568343460559845,
"learning_rate": 9.969722341301261e-06,
"loss": 0.6806910037994385,
"step": 194
},
{
"epoch": 0.25304136253041365,
"grad_norm": 0.6099660396575928,
"learning_rate": 9.968931622637652e-06,
"loss": 0.7885247468948364,
"step": 195
},
{
"epoch": 0.2543390105433901,
"grad_norm": 0.5906837582588196,
"learning_rate": 9.968130743766533e-06,
"loss": 0.7320465445518494,
"step": 196
},
{
"epoch": 0.2556366585563666,
"grad_norm": 0.5778429508209229,
"learning_rate": 9.967319706325495e-06,
"loss": 0.7082957029342651,
"step": 197
},
{
"epoch": 0.2569343065693431,
"grad_norm": 0.5944257974624634,
"learning_rate": 9.96649851197291e-06,
"loss": 0.7171834707260132,
"step": 198
},
{
"epoch": 0.25823195458231957,
"grad_norm": 0.8729922771453857,
"learning_rate": 9.965667162387908e-06,
"loss": 0.8201053142547607,
"step": 199
},
{
"epoch": 0.259529602595296,
"grad_norm": 0.6156542897224426,
"learning_rate": 9.964825659270391e-06,
"loss": 0.7408115863800049,
"step": 200
},
{
"epoch": 0.2608272506082725,
"grad_norm": 0.5976687669754028,
"learning_rate": 9.963974004341019e-06,
"loss": 0.7426021099090576,
"step": 201
},
{
"epoch": 0.262124898621249,
"grad_norm": 0.6217131018638611,
"learning_rate": 9.963112199341212e-06,
"loss": 0.7804723978042603,
"step": 202
},
{
"epoch": 0.2634225466342255,
"grad_norm": 0.5792650580406189,
"learning_rate": 9.96224024603314e-06,
"loss": 0.6894349455833435,
"step": 203
},
{
"epoch": 0.2647201946472019,
"grad_norm": 0.6177152395248413,
"learning_rate": 9.961358146199729e-06,
"loss": 0.717537522315979,
"step": 204
},
{
"epoch": 0.2660178426601784,
"grad_norm": 0.6125051975250244,
"learning_rate": 9.960465901644651e-06,
"loss": 0.774456799030304,
"step": 205
},
{
"epoch": 0.2673154906731549,
"grad_norm": 0.6172115206718445,
"learning_rate": 9.959563514192317e-06,
"loss": 0.7355530261993408,
"step": 206
},
{
"epoch": 0.2686131386861314,
"grad_norm": 0.6835010051727295,
"learning_rate": 9.958650985687884e-06,
"loss": 0.8002670407295227,
"step": 207
},
{
"epoch": 0.26991078669910784,
"grad_norm": 0.6039808392524719,
"learning_rate": 9.95772831799724e-06,
"loss": 0.784502387046814,
"step": 208
},
{
"epoch": 0.27120843471208433,
"grad_norm": 3.698056936264038,
"learning_rate": 9.956795513007008e-06,
"loss": 0.7473998069763184,
"step": 209
},
{
"epoch": 0.2725060827250608,
"grad_norm": 0.6423486471176147,
"learning_rate": 9.955852572624538e-06,
"loss": 0.7945725917816162,
"step": 210
},
{
"epoch": 0.2738037307380373,
"grad_norm": 0.5756685137748718,
"learning_rate": 9.954899498777903e-06,
"loss": 0.7909812927246094,
"step": 211
},
{
"epoch": 0.2751013787510138,
"grad_norm": 0.5984244346618652,
"learning_rate": 9.9539362934159e-06,
"loss": 0.7091703414916992,
"step": 212
},
{
"epoch": 0.27639902676399025,
"grad_norm": 0.6023333072662354,
"learning_rate": 9.952962958508038e-06,
"loss": 0.7251565456390381,
"step": 213
},
{
"epoch": 0.27769667477696675,
"grad_norm": 0.6191360950469971,
"learning_rate": 9.951979496044544e-06,
"loss": 0.7646386027336121,
"step": 214
},
{
"epoch": 0.27899432278994324,
"grad_norm": 0.6032703518867493,
"learning_rate": 9.950985908036346e-06,
"loss": 0.76767897605896,
"step": 215
},
{
"epoch": 0.28029197080291973,
"grad_norm": 0.5847381949424744,
"learning_rate": 9.94998219651508e-06,
"loss": 0.7368282079696655,
"step": 216
},
{
"epoch": 0.28158961881589617,
"grad_norm": 0.6057823896408081,
"learning_rate": 9.948968363533085e-06,
"loss": 0.7350323796272278,
"step": 217
},
{
"epoch": 0.28288726682887266,
"grad_norm": 0.6186010241508484,
"learning_rate": 9.947944411163391e-06,
"loss": 0.7249234318733215,
"step": 218
},
{
"epoch": 0.28418491484184916,
"grad_norm": 0.6159788370132446,
"learning_rate": 9.946910341499722e-06,
"loss": 0.761109471321106,
"step": 219
},
{
"epoch": 0.28548256285482565,
"grad_norm": 0.5817273259162903,
"learning_rate": 9.945866156656487e-06,
"loss": 0.7725365161895752,
"step": 220
},
{
"epoch": 0.2867802108678021,
"grad_norm": 0.655717134475708,
"learning_rate": 9.944811858768782e-06,
"loss": 0.7668634057044983,
"step": 221
},
{
"epoch": 0.2880778588807786,
"grad_norm": 0.6457056403160095,
"learning_rate": 9.943747449992379e-06,
"loss": 0.7912311553955078,
"step": 222
},
{
"epoch": 0.2893755068937551,
"grad_norm": 0.5742535591125488,
"learning_rate": 9.942672932503722e-06,
"loss": 0.7619901299476624,
"step": 223
},
{
"epoch": 0.29067315490673157,
"grad_norm": 0.5950078964233398,
"learning_rate": 9.941588308499932e-06,
"loss": 0.7898773550987244,
"step": 224
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.6142423152923584,
"learning_rate": 9.940493580198787e-06,
"loss": 0.7200186252593994,
"step": 225
},
{
"epoch": 0.2932684509326845,
"grad_norm": 0.6070595979690552,
"learning_rate": 9.93938874983873e-06,
"loss": 0.6990747451782227,
"step": 226
},
{
"epoch": 0.294566098945661,
"grad_norm": 0.6014435887336731,
"learning_rate": 9.93827381967886e-06,
"loss": 0.7597475647926331,
"step": 227
},
{
"epoch": 0.2958637469586375,
"grad_norm": 0.5983416438102722,
"learning_rate": 9.937148791998926e-06,
"loss": 0.738788366317749,
"step": 228
},
{
"epoch": 0.2971613949716139,
"grad_norm": 2.7879600524902344,
"learning_rate": 9.936013669099326e-06,
"loss": 0.7541340589523315,
"step": 229
},
{
"epoch": 0.2984590429845904,
"grad_norm": 0.6435497403144836,
"learning_rate": 9.9348684533011e-06,
"loss": 0.8065454959869385,
"step": 230
},
{
"epoch": 0.2984590429845904,
"eval_loss": 0.7250053882598877,
"eval_runtime": 73.3232,
"eval_samples_per_second": 70.81,
"eval_steps_per_second": 8.851,
"step": 230
},
{
"epoch": 0.2997566909975669,
"grad_norm": 2.4210150241851807,
"learning_rate": 9.93371314694592e-06,
"loss": 0.7646887302398682,
"step": 231
},
{
"epoch": 0.3010543390105434,
"grad_norm": 0.601508617401123,
"learning_rate": 9.9325477523961e-06,
"loss": 0.7489044070243835,
"step": 232
},
{
"epoch": 0.3023519870235199,
"grad_norm": 0.5808404684066772,
"learning_rate": 9.931372272034573e-06,
"loss": 0.7624624371528625,
"step": 233
},
{
"epoch": 0.30364963503649633,
"grad_norm": 1.0590876340866089,
"learning_rate": 9.930186708264902e-06,
"loss": 0.7188542485237122,
"step": 234
},
{
"epoch": 0.30494728304947283,
"grad_norm": 0.6582311391830444,
"learning_rate": 9.928991063511264e-06,
"loss": 0.7417193055152893,
"step": 235
},
{
"epoch": 0.3062449310624493,
"grad_norm": 0.5886158347129822,
"learning_rate": 9.927785340218448e-06,
"loss": 0.7227447032928467,
"step": 236
},
{
"epoch": 0.3075425790754258,
"grad_norm": 0.8434078693389893,
"learning_rate": 9.926569540851856e-06,
"loss": 0.8079698085784912,
"step": 237
},
{
"epoch": 0.30884022708840225,
"grad_norm": 0.7032890915870667,
"learning_rate": 9.925343667897487e-06,
"loss": 0.730448842048645,
"step": 238
},
{
"epoch": 0.31013787510137875,
"grad_norm": 0.5958182215690613,
"learning_rate": 9.924107723861944e-06,
"loss": 0.7622323036193848,
"step": 239
},
{
"epoch": 0.31143552311435524,
"grad_norm": 0.7387073040008545,
"learning_rate": 9.922861711272417e-06,
"loss": 0.8103834390640259,
"step": 240
},
{
"epoch": 0.31273317112733173,
"grad_norm": 0.589846134185791,
"learning_rate": 9.921605632676688e-06,
"loss": 0.7218436002731323,
"step": 241
},
{
"epoch": 0.31403081914030817,
"grad_norm": 1.18753182888031,
"learning_rate": 9.920339490643119e-06,
"loss": 0.6769864559173584,
"step": 242
},
{
"epoch": 0.31532846715328466,
"grad_norm": 0.6063650250434875,
"learning_rate": 9.91906328776065e-06,
"loss": 0.6872894763946533,
"step": 243
},
{
"epoch": 0.31662611516626116,
"grad_norm": 0.6060184240341187,
"learning_rate": 9.917777026638794e-06,
"loss": 0.7477156519889832,
"step": 244
},
{
"epoch": 0.31792376317923765,
"grad_norm": 0.5981388092041016,
"learning_rate": 9.916480709907626e-06,
"loss": 0.6859747767448425,
"step": 245
},
{
"epoch": 0.3192214111922141,
"grad_norm": 0.5809654593467712,
"learning_rate": 9.91517434021779e-06,
"loss": 0.7025295495986938,
"step": 246
},
{
"epoch": 0.3205190592051906,
"grad_norm": 0.6036680340766907,
"learning_rate": 9.913857920240481e-06,
"loss": 0.8275207877159119,
"step": 247
},
{
"epoch": 0.3218167072181671,
"grad_norm": 0.5851848125457764,
"learning_rate": 9.912531452667441e-06,
"loss": 0.7031136155128479,
"step": 248
},
{
"epoch": 0.32311435523114357,
"grad_norm": 0.5534024238586426,
"learning_rate": 9.911194940210964e-06,
"loss": 0.7281129956245422,
"step": 249
},
{
"epoch": 0.32441200324412,
"grad_norm": 0.6152268052101135,
"learning_rate": 9.909848385603878e-06,
"loss": 0.7846366167068481,
"step": 250
},
{
"epoch": 0.3257096512570965,
"grad_norm": 0.5951406359672546,
"learning_rate": 9.908491791599546e-06,
"loss": 0.7278503179550171,
"step": 251
},
{
"epoch": 0.327007299270073,
"grad_norm": 0.6011956334114075,
"learning_rate": 9.90712516097186e-06,
"loss": 0.7939674854278564,
"step": 252
},
{
"epoch": 0.3283049472830495,
"grad_norm": 0.6651070713996887,
"learning_rate": 9.905748496515235e-06,
"loss": 0.772196888923645,
"step": 253
},
{
"epoch": 0.329602595296026,
"grad_norm": 0.617461085319519,
"learning_rate": 9.904361801044599e-06,
"loss": 0.7933390140533447,
"step": 254
},
{
"epoch": 0.3309002433090024,
"grad_norm": 0.5844789147377014,
"learning_rate": 9.902965077395395e-06,
"loss": 0.7286657691001892,
"step": 255
},
{
"epoch": 0.3321978913219789,
"grad_norm": 0.6185967326164246,
"learning_rate": 9.901558328423568e-06,
"loss": 0.8058604001998901,
"step": 256
},
{
"epoch": 0.3334955393349554,
"grad_norm": 0.6511676907539368,
"learning_rate": 9.900141557005567e-06,
"loss": 0.7281938195228577,
"step": 257
},
{
"epoch": 0.3347931873479319,
"grad_norm": 0.6114381551742554,
"learning_rate": 9.898714766038326e-06,
"loss": 0.7546758651733398,
"step": 258
},
{
"epoch": 0.33609083536090834,
"grad_norm": 0.5931724905967712,
"learning_rate": 9.897277958439274e-06,
"loss": 0.811058759689331,
"step": 259
},
{
"epoch": 0.33738848337388483,
"grad_norm": 0.5811541080474854,
"learning_rate": 9.895831137146319e-06,
"loss": 0.764075517654419,
"step": 260
},
{
"epoch": 0.3386861313868613,
"grad_norm": 0.5857120156288147,
"learning_rate": 9.894374305117844e-06,
"loss": 0.730948805809021,
"step": 261
},
{
"epoch": 0.3399837793998378,
"grad_norm": 0.5755126476287842,
"learning_rate": 9.892907465332702e-06,
"loss": 0.7732649445533752,
"step": 262
},
{
"epoch": 0.34128142741281425,
"grad_norm": 0.5852351784706116,
"learning_rate": 9.891430620790208e-06,
"loss": 0.6883482933044434,
"step": 263
},
{
"epoch": 0.34257907542579075,
"grad_norm": 0.5931571125984192,
"learning_rate": 9.889943774510136e-06,
"loss": 0.7685630321502686,
"step": 264
},
{
"epoch": 0.34387672343876724,
"grad_norm": 0.7222980260848999,
"learning_rate": 9.888446929532712e-06,
"loss": 0.7235557436943054,
"step": 265
},
{
"epoch": 0.34517437145174373,
"grad_norm": 0.6728655695915222,
"learning_rate": 9.886940088918601e-06,
"loss": 0.7901487350463867,
"step": 266
},
{
"epoch": 0.34647201946472017,
"grad_norm": 0.5990903973579407,
"learning_rate": 9.885423255748916e-06,
"loss": 0.7315446138381958,
"step": 267
},
{
"epoch": 0.34776966747769666,
"grad_norm": 0.6058611869812012,
"learning_rate": 9.883896433125193e-06,
"loss": 0.748113751411438,
"step": 268
},
{
"epoch": 0.34906731549067316,
"grad_norm": 0.6079699397087097,
"learning_rate": 9.8823596241694e-06,
"loss": 0.7346718907356262,
"step": 269
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.5837222337722778,
"learning_rate": 9.88081283202392e-06,
"loss": 0.6944899559020996,
"step": 270
},
{
"epoch": 0.3516626115166261,
"grad_norm": 0.5878487229347229,
"learning_rate": 9.879256059851553e-06,
"loss": 0.766356885433197,
"step": 271
},
{
"epoch": 0.3529602595296026,
"grad_norm": 0.605903685092926,
"learning_rate": 9.877689310835503e-06,
"loss": 0.7980437278747559,
"step": 272
},
{
"epoch": 0.3542579075425791,
"grad_norm": 0.5946698784828186,
"learning_rate": 9.876112588179378e-06,
"loss": 0.7276085019111633,
"step": 273
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5997035503387451,
"learning_rate": 9.874525895107175e-06,
"loss": 0.7429395318031311,
"step": 274
},
{
"epoch": 0.35685320356853206,
"grad_norm": 0.5639536380767822,
"learning_rate": 9.872929234863277e-06,
"loss": 0.7452772855758667,
"step": 275
},
{
"epoch": 0.3581508515815085,
"grad_norm": 0.5665518641471863,
"learning_rate": 9.871322610712452e-06,
"loss": 0.6850217580795288,
"step": 276
},
{
"epoch": 0.359448499594485,
"grad_norm": 0.5540530681610107,
"learning_rate": 9.869706025939843e-06,
"loss": 0.6755887269973755,
"step": 277
},
{
"epoch": 0.3607461476074615,
"grad_norm": 0.5980620384216309,
"learning_rate": 9.868079483850955e-06,
"loss": 0.7464824914932251,
"step": 278
},
{
"epoch": 0.362043795620438,
"grad_norm": 0.619748055934906,
"learning_rate": 9.86644298777165e-06,
"loss": 0.778630793094635,
"step": 279
},
{
"epoch": 0.3633414436334144,
"grad_norm": 0.5898886919021606,
"learning_rate": 9.864796541048155e-06,
"loss": 0.7965477705001831,
"step": 280
},
{
"epoch": 0.3646390916463909,
"grad_norm": 0.5768588185310364,
"learning_rate": 9.863140147047034e-06,
"loss": 0.7540180087089539,
"step": 281
},
{
"epoch": 0.3659367396593674,
"grad_norm": 0.6073225140571594,
"learning_rate": 9.861473809155192e-06,
"loss": 0.7069481015205383,
"step": 282
},
{
"epoch": 0.3672343876723439,
"grad_norm": 0.853999137878418,
"learning_rate": 9.859797530779871e-06,
"loss": 0.6730421185493469,
"step": 283
},
{
"epoch": 0.36853203568532034,
"grad_norm": 0.5999425649642944,
"learning_rate": 9.858111315348633e-06,
"loss": 0.7877826690673828,
"step": 284
},
{
"epoch": 0.36982968369829683,
"grad_norm": 0.9857465624809265,
"learning_rate": 9.856415166309365e-06,
"loss": 0.7664862871170044,
"step": 285
},
{
"epoch": 0.3711273317112733,
"grad_norm": 0.6046482920646667,
"learning_rate": 9.854709087130261e-06,
"loss": 0.7595510482788086,
"step": 286
},
{
"epoch": 0.3724249797242498,
"grad_norm": 0.6335992217063904,
"learning_rate": 9.852993081299821e-06,
"loss": 0.7546533346176147,
"step": 287
},
{
"epoch": 0.37372262773722625,
"grad_norm": 0.6080864667892456,
"learning_rate": 9.851267152326842e-06,
"loss": 0.7263352870941162,
"step": 288
},
{
"epoch": 0.37502027575020275,
"grad_norm": 0.6323843598365784,
"learning_rate": 9.849531303740414e-06,
"loss": 0.7602711915969849,
"step": 289
},
{
"epoch": 0.37631792376317924,
"grad_norm": 0.6081179976463318,
"learning_rate": 9.847785539089904e-06,
"loss": 0.740424633026123,
"step": 290
},
{
"epoch": 0.37761557177615573,
"grad_norm": 0.6082411408424377,
"learning_rate": 9.846029861944964e-06,
"loss": 0.7497418522834778,
"step": 291
},
{
"epoch": 0.37891321978913217,
"grad_norm": 2.8806638717651367,
"learning_rate": 9.844264275895505e-06,
"loss": 0.7668443918228149,
"step": 292
},
{
"epoch": 0.38021086780210867,
"grad_norm": 0.6383978128433228,
"learning_rate": 9.842488784551707e-06,
"loss": 0.7615733742713928,
"step": 293
},
{
"epoch": 0.38150851581508516,
"grad_norm": 0.589131772518158,
"learning_rate": 9.840703391543999e-06,
"loss": 0.6759642362594604,
"step": 294
},
{
"epoch": 0.38280616382806165,
"grad_norm": 0.5658035278320312,
"learning_rate": 9.838908100523056e-06,
"loss": 0.6837214231491089,
"step": 295
},
{
"epoch": 0.38410381184103815,
"grad_norm": 0.7991520166397095,
"learning_rate": 9.837102915159797e-06,
"loss": 0.6950873732566833,
"step": 296
},
{
"epoch": 0.3854014598540146,
"grad_norm": 0.6660937666893005,
"learning_rate": 9.835287839145366e-06,
"loss": 0.7929595708847046,
"step": 297
},
{
"epoch": 0.3866991078669911,
"grad_norm": 0.5755690336227417,
"learning_rate": 9.833462876191138e-06,
"loss": 0.7429145574569702,
"step": 298
},
{
"epoch": 0.38799675587996757,
"grad_norm": 0.5845285654067993,
"learning_rate": 9.831628030028698e-06,
"loss": 0.673062801361084,
"step": 299
},
{
"epoch": 0.38929440389294406,
"grad_norm": 0.6984291672706604,
"learning_rate": 9.829783304409838e-06,
"loss": 0.7271926403045654,
"step": 300
},
{
"epoch": 0.3905920519059205,
"grad_norm": 0.6314187049865723,
"learning_rate": 9.827928703106562e-06,
"loss": 0.7842410206794739,
"step": 301
},
{
"epoch": 0.391889699918897,
"grad_norm": 0.5774804353713989,
"learning_rate": 9.826064229911056e-06,
"loss": 0.7108284831047058,
"step": 302
},
{
"epoch": 0.3931873479318735,
"grad_norm": 0.5863385200500488,
"learning_rate": 9.824189888635699e-06,
"loss": 0.6845728158950806,
"step": 303
},
{
"epoch": 0.39448499594485,
"grad_norm": 0.6258076429367065,
"learning_rate": 9.82230568311304e-06,
"loss": 0.7528674602508545,
"step": 304
},
{
"epoch": 0.3957826439578264,
"grad_norm": 0.5792856216430664,
"learning_rate": 9.820411617195807e-06,
"loss": 0.6762325763702393,
"step": 305
},
{
"epoch": 0.3970802919708029,
"grad_norm": 0.6361887454986572,
"learning_rate": 9.818507694756883e-06,
"loss": 0.7917072176933289,
"step": 306
},
{
"epoch": 0.3983779399837794,
"grad_norm": 0.5518248677253723,
"learning_rate": 9.816593919689305e-06,
"loss": 0.6964313387870789,
"step": 307
},
{
"epoch": 0.3996755879967559,
"grad_norm": 0.5932815670967102,
"learning_rate": 9.814670295906265e-06,
"loss": 0.7426280975341797,
"step": 308
},
{
"epoch": 0.40097323600973234,
"grad_norm": 0.6102697253227234,
"learning_rate": 9.81273682734108e-06,
"loss": 0.7797576189041138,
"step": 309
},
{
"epoch": 0.40227088402270883,
"grad_norm": 0.5859159827232361,
"learning_rate": 9.81079351794721e-06,
"loss": 0.6963766813278198,
"step": 310
},
{
"epoch": 0.4035685320356853,
"grad_norm": 0.6081574559211731,
"learning_rate": 9.808840371698226e-06,
"loss": 0.7762277722358704,
"step": 311
},
{
"epoch": 0.4048661800486618,
"grad_norm": 0.5929109454154968,
"learning_rate": 9.80687739258782e-06,
"loss": 0.6928838491439819,
"step": 312
},
{
"epoch": 0.40616382806163825,
"grad_norm": 0.6156943440437317,
"learning_rate": 9.804904584629786e-06,
"loss": 0.7755375504493713,
"step": 313
},
{
"epoch": 0.40746147607461475,
"grad_norm": 0.6252034306526184,
"learning_rate": 9.80292195185802e-06,
"loss": 0.7410427927970886,
"step": 314
},
{
"epoch": 0.40875912408759124,
"grad_norm": 0.5801575183868408,
"learning_rate": 9.800929498326502e-06,
"loss": 0.7257661819458008,
"step": 315
},
{
"epoch": 0.41005677210056773,
"grad_norm": 0.6071752309799194,
"learning_rate": 9.798927228109294e-06,
"loss": 0.72821044921875,
"step": 316
},
{
"epoch": 0.41135442011354423,
"grad_norm": 0.6007112264633179,
"learning_rate": 9.796915145300534e-06,
"loss": 0.7845569849014282,
"step": 317
},
{
"epoch": 0.41265206812652067,
"grad_norm": 0.5841884016990662,
"learning_rate": 9.794893254014421e-06,
"loss": 0.7238840460777283,
"step": 318
},
{
"epoch": 0.41394971613949716,
"grad_norm": 0.7773919701576233,
"learning_rate": 9.792861558385212e-06,
"loss": 0.7452490329742432,
"step": 319
},
{
"epoch": 0.41524736415247365,
"grad_norm": 0.6115602254867554,
"learning_rate": 9.790820062567208e-06,
"loss": 0.769629716873169,
"step": 320
},
{
"epoch": 0.41654501216545015,
"grad_norm": 0.597138524055481,
"learning_rate": 9.788768770734753e-06,
"loss": 0.7215956449508667,
"step": 321
},
{
"epoch": 0.4178426601784266,
"grad_norm": 0.5886080265045166,
"learning_rate": 9.78670768708222e-06,
"loss": 0.6885201930999756,
"step": 322
},
{
"epoch": 0.4191403081914031,
"grad_norm": 0.6041279435157776,
"learning_rate": 9.784636815824003e-06,
"loss": 0.748660147190094,
"step": 323
},
{
"epoch": 0.42043795620437957,
"grad_norm": 0.6275052428245544,
"learning_rate": 9.782556161194508e-06,
"loss": 0.7351919412612915,
"step": 324
},
{
"epoch": 0.42173560421735606,
"grad_norm": 0.6083272695541382,
"learning_rate": 9.78046572744815e-06,
"loss": 0.7183579206466675,
"step": 325
},
{
"epoch": 0.4230332522303325,
"grad_norm": 0.5836600065231323,
"learning_rate": 9.778365518859334e-06,
"loss": 0.6470940113067627,
"step": 326
},
{
"epoch": 0.424330900243309,
"grad_norm": 0.611179769039154,
"learning_rate": 9.776255539722457e-06,
"loss": 0.7807853817939758,
"step": 327
},
{
"epoch": 0.4256285482562855,
"grad_norm": 0.5962700843811035,
"learning_rate": 9.774135794351892e-06,
"loss": 0.7775930166244507,
"step": 328
},
{
"epoch": 0.426926196269262,
"grad_norm": 0.5820413827896118,
"learning_rate": 9.77200628708198e-06,
"loss": 0.6654623746871948,
"step": 329
},
{
"epoch": 0.4282238442822384,
"grad_norm": 0.5713212490081787,
"learning_rate": 9.769867022267028e-06,
"loss": 0.7844803333282471,
"step": 330
},
{
"epoch": 0.4295214922952149,
"grad_norm": 0.6236836314201355,
"learning_rate": 9.767718004281288e-06,
"loss": 0.7271528244018555,
"step": 331
},
{
"epoch": 0.4308191403081914,
"grad_norm": 0.5810200572013855,
"learning_rate": 9.765559237518958e-06,
"loss": 0.6717958450317383,
"step": 332
},
{
"epoch": 0.4321167883211679,
"grad_norm": 0.5980990529060364,
"learning_rate": 9.763390726394171e-06,
"loss": 0.7378814220428467,
"step": 333
},
{
"epoch": 0.43341443633414434,
"grad_norm": 0.620817244052887,
"learning_rate": 9.761212475340982e-06,
"loss": 0.7411800026893616,
"step": 334
},
{
"epoch": 0.43471208434712083,
"grad_norm": 0.5831018686294556,
"learning_rate": 9.759024488813364e-06,
"loss": 0.6943602561950684,
"step": 335
},
{
"epoch": 0.4360097323600973,
"grad_norm": 0.6330239176750183,
"learning_rate": 9.756826771285195e-06,
"loss": 0.6916518211364746,
"step": 336
},
{
"epoch": 0.4373073803730738,
"grad_norm": 0.5482841730117798,
"learning_rate": 9.754619327250253e-06,
"loss": 0.6894945502281189,
"step": 337
},
{
"epoch": 0.4386050283860503,
"grad_norm": 0.5814421772956848,
"learning_rate": 9.7524021612222e-06,
"loss": 0.7126766443252563,
"step": 338
},
{
"epoch": 0.43990267639902675,
"grad_norm": 0.6360822916030884,
"learning_rate": 9.750175277734582e-06,
"loss": 0.7301243543624878,
"step": 339
},
{
"epoch": 0.44120032441200324,
"grad_norm": 0.5673643946647644,
"learning_rate": 9.747938681340807e-06,
"loss": 0.632249116897583,
"step": 340
},
{
"epoch": 0.44249797242497974,
"grad_norm": 0.59381103515625,
"learning_rate": 9.745692376614154e-06,
"loss": 0.7363812923431396,
"step": 341
},
{
"epoch": 0.44379562043795623,
"grad_norm": 0.5689446926116943,
"learning_rate": 9.743436368147745e-06,
"loss": 0.6463121175765991,
"step": 342
},
{
"epoch": 0.44509326845093267,
"grad_norm": 0.5716972351074219,
"learning_rate": 9.741170660554548e-06,
"loss": 0.726833701133728,
"step": 343
},
{
"epoch": 0.44639091646390916,
"grad_norm": 0.6090091466903687,
"learning_rate": 9.73889525846736e-06,
"loss": 0.7105214595794678,
"step": 344
},
{
"epoch": 0.44768856447688565,
"grad_norm": 0.6220769286155701,
"learning_rate": 9.736610166538802e-06,
"loss": 0.7986119389533997,
"step": 345
},
{
"epoch": 0.44898621248986215,
"grad_norm": 0.6415942311286926,
"learning_rate": 9.73431538944131e-06,
"loss": 0.8365704417228699,
"step": 346
},
{
"epoch": 0.4502838605028386,
"grad_norm": 0.6018549203872681,
"learning_rate": 9.73201093186712e-06,
"loss": 0.754788875579834,
"step": 347
},
{
"epoch": 0.4515815085158151,
"grad_norm": 0.6342391967773438,
"learning_rate": 9.729696798528268e-06,
"loss": 0.6986638307571411,
"step": 348
},
{
"epoch": 0.45287915652879157,
"grad_norm": 0.6728231906890869,
"learning_rate": 9.727372994156568e-06,
"loss": 0.7003589272499084,
"step": 349
},
{
"epoch": 0.45417680454176806,
"grad_norm": 0.5958974957466125,
"learning_rate": 9.725039523503615e-06,
"loss": 0.7366368770599365,
"step": 350
},
{
"epoch": 0.4554744525547445,
"grad_norm": 0.5878227353096008,
"learning_rate": 9.722696391340762e-06,
"loss": 0.6686346530914307,
"step": 351
},
{
"epoch": 0.456772100567721,
"grad_norm": 0.5995833277702332,
"learning_rate": 9.720343602459123e-06,
"loss": 0.720341682434082,
"step": 352
},
{
"epoch": 0.4580697485806975,
"grad_norm": 0.5677472352981567,
"learning_rate": 9.717981161669556e-06,
"loss": 0.7040742039680481,
"step": 353
},
{
"epoch": 0.459367396593674,
"grad_norm": 0.5821993350982666,
"learning_rate": 9.715609073802653e-06,
"loss": 0.7871376276016235,
"step": 354
},
{
"epoch": 0.4606650446066504,
"grad_norm": 0.6043302416801453,
"learning_rate": 9.713227343708737e-06,
"loss": 0.6964189410209656,
"step": 355
},
{
"epoch": 0.4619626926196269,
"grad_norm": 0.5885515213012695,
"learning_rate": 9.71083597625784e-06,
"loss": 0.6288225054740906,
"step": 356
},
{
"epoch": 0.4632603406326034,
"grad_norm": 0.5931031703948975,
"learning_rate": 9.708434976339704e-06,
"loss": 0.7654111981391907,
"step": 357
},
{
"epoch": 0.4645579886455799,
"grad_norm": 0.5929883122444153,
"learning_rate": 9.706024348863766e-06,
"loss": 0.7472108602523804,
"step": 358
},
{
"epoch": 0.4658556366585564,
"grad_norm": 0.6003252267837524,
"learning_rate": 9.703604098759148e-06,
"loss": 0.7266678810119629,
"step": 359
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.6148797869682312,
"learning_rate": 9.70117423097465e-06,
"loss": 0.6877753734588623,
"step": 360
},
{
"epoch": 0.4684509326845093,
"grad_norm": 0.632279634475708,
"learning_rate": 9.698734750478739e-06,
"loss": 0.7512223720550537,
"step": 361
},
{
"epoch": 0.4697485806974858,
"grad_norm": 0.5888375639915466,
"learning_rate": 9.69628566225953e-06,
"loss": 0.7822796702384949,
"step": 362
},
{
"epoch": 0.4710462287104623,
"grad_norm": 0.6794424057006836,
"learning_rate": 9.693826971324793e-06,
"loss": 0.7204307317733765,
"step": 363
},
{
"epoch": 0.47234387672343875,
"grad_norm": 0.5850203633308411,
"learning_rate": 9.691358682701927e-06,
"loss": 0.7395058870315552,
"step": 364
},
{
"epoch": 0.47364152473641524,
"grad_norm": 0.947333574295044,
"learning_rate": 9.688880801437957e-06,
"loss": 0.7230464220046997,
"step": 365
},
{
"epoch": 0.47493917274939174,
"grad_norm": 0.6044790744781494,
"learning_rate": 9.686393332599525e-06,
"loss": 0.7762792110443115,
"step": 366
},
{
"epoch": 0.47623682076236823,
"grad_norm": 0.558193027973175,
"learning_rate": 9.683896281272872e-06,
"loss": 0.7202603816986084,
"step": 367
},
{
"epoch": 0.47753446877534467,
"grad_norm": 0.6356004476547241,
"learning_rate": 9.681389652563837e-06,
"loss": 0.6806402206420898,
"step": 368
},
{
"epoch": 0.47883211678832116,
"grad_norm": 0.5731885433197021,
"learning_rate": 9.678873451597843e-06,
"loss": 0.7234804630279541,
"step": 369
},
{
"epoch": 0.48012976480129765,
"grad_norm": 0.6563818454742432,
"learning_rate": 9.676347683519882e-06,
"loss": 0.7021783590316772,
"step": 370
},
{
"epoch": 0.48142741281427415,
"grad_norm": 0.632475733757019,
"learning_rate": 9.673812353494513e-06,
"loss": 0.7313486337661743,
"step": 371
},
{
"epoch": 0.4827250608272506,
"grad_norm": 0.6746646761894226,
"learning_rate": 9.671267466705841e-06,
"loss": 0.7820821404457092,
"step": 372
},
{
"epoch": 0.4840227088402271,
"grad_norm": 0.558120608329773,
"learning_rate": 9.668713028357518e-06,
"loss": 0.7215161323547363,
"step": 373
},
{
"epoch": 0.48532035685320357,
"grad_norm": 0.5888929963111877,
"learning_rate": 9.666149043672724e-06,
"loss": 0.7091335654258728,
"step": 374
},
{
"epoch": 0.48661800486618007,
"grad_norm": 7.202490329742432,
"learning_rate": 9.663575517894155e-06,
"loss": 0.7597553133964539,
"step": 375
},
{
"epoch": 0.4879156528791565,
"grad_norm": 0.6477593183517456,
"learning_rate": 9.660992456284024e-06,
"loss": 0.6395682692527771,
"step": 376
},
{
"epoch": 0.489213300892133,
"grad_norm": 0.6040880680084229,
"learning_rate": 9.658399864124037e-06,
"loss": 0.7132856249809265,
"step": 377
},
{
"epoch": 0.4905109489051095,
"grad_norm": 0.6065711379051208,
"learning_rate": 9.655797746715388e-06,
"loss": 0.7926105260848999,
"step": 378
},
{
"epoch": 0.491808596918086,
"grad_norm": 0.6568942666053772,
"learning_rate": 9.65318610937875e-06,
"loss": 0.7595465183258057,
"step": 379
},
{
"epoch": 0.4931062449310625,
"grad_norm": 0.5950395464897156,
"learning_rate": 9.650564957454258e-06,
"loss": 0.7643356919288635,
"step": 380
},
{
"epoch": 0.4944038929440389,
"grad_norm": 0.608245313167572,
"learning_rate": 9.647934296301506e-06,
"loss": 0.8734641075134277,
"step": 381
},
{
"epoch": 0.4957015409570154,
"grad_norm": 0.6461122632026672,
"learning_rate": 9.64529413129953e-06,
"loss": 0.7460113167762756,
"step": 382
},
{
"epoch": 0.4969991889699919,
"grad_norm": 0.5779212117195129,
"learning_rate": 9.642644467846799e-06,
"loss": 0.707379937171936,
"step": 383
},
{
"epoch": 0.4982968369829684,
"grad_norm": 0.5882854461669922,
"learning_rate": 9.639985311361202e-06,
"loss": 0.74379563331604,
"step": 384
},
{
"epoch": 0.49959448499594483,
"grad_norm": 0.6086680293083191,
"learning_rate": 9.637316667280046e-06,
"loss": 0.7925621271133423,
"step": 385
},
{
"epoch": 0.5008921330089213,
"grad_norm": 0.5651184916496277,
"learning_rate": 9.634638541060027e-06,
"loss": 0.7554738521575928,
"step": 386
},
{
"epoch": 0.5021897810218978,
"grad_norm": 0.5808055400848389,
"learning_rate": 9.63195093817724e-06,
"loss": 0.7644078731536865,
"step": 387
},
{
"epoch": 0.5034874290348743,
"grad_norm": 0.6111287474632263,
"learning_rate": 9.62925386412715e-06,
"loss": 0.7364607453346252,
"step": 388
},
{
"epoch": 0.5047850770478508,
"grad_norm": 0.6057661175727844,
"learning_rate": 9.626547324424592e-06,
"loss": 0.7212823629379272,
"step": 389
},
{
"epoch": 0.5060827250608273,
"grad_norm": 0.6477599740028381,
"learning_rate": 9.623831324603755e-06,
"loss": 0.813086748123169,
"step": 390
},
{
"epoch": 0.5073803730738037,
"grad_norm": 0.5950746536254883,
"learning_rate": 9.621105870218167e-06,
"loss": 0.7306693196296692,
"step": 391
},
{
"epoch": 0.5086780210867802,
"grad_norm": 0.6298786401748657,
"learning_rate": 9.618370966840698e-06,
"loss": 0.7335579991340637,
"step": 392
},
{
"epoch": 0.5099756690997567,
"grad_norm": 0.5998733639717102,
"learning_rate": 9.615626620063531e-06,
"loss": 0.6837765574455261,
"step": 393
},
{
"epoch": 0.5112733171127332,
"grad_norm": 0.6094253659248352,
"learning_rate": 9.61287283549816e-06,
"loss": 0.7273898720741272,
"step": 394
},
{
"epoch": 0.5125709651257097,
"grad_norm": 0.5919696092605591,
"learning_rate": 9.610109618775379e-06,
"loss": 0.7142295241355896,
"step": 395
},
{
"epoch": 0.5138686131386861,
"grad_norm": 0.5768521428108215,
"learning_rate": 9.607336975545264e-06,
"loss": 0.6993876695632935,
"step": 396
},
{
"epoch": 0.5151662611516626,
"grad_norm": 0.6359198689460754,
"learning_rate": 9.604554911477173e-06,
"loss": 0.751734733581543,
"step": 397
},
{
"epoch": 0.5164639091646391,
"grad_norm": 0.612307071685791,
"learning_rate": 9.601763432259716e-06,
"loss": 0.7581944465637207,
"step": 398
},
{
"epoch": 0.5177615571776155,
"grad_norm": 0.5969548225402832,
"learning_rate": 9.59896254360077e-06,
"loss": 0.7034813165664673,
"step": 399
},
{
"epoch": 0.519059205190592,
"grad_norm": 0.5891065001487732,
"learning_rate": 9.596152251227438e-06,
"loss": 0.7002313137054443,
"step": 400
},
{
"epoch": 0.5203568532035685,
"grad_norm": 0.5791100263595581,
"learning_rate": 9.593332560886055e-06,
"loss": 0.7138193845748901,
"step": 401
},
{
"epoch": 0.521654501216545,
"grad_norm": 0.7952408790588379,
"learning_rate": 9.59050347834218e-06,
"loss": 0.6865421533584595,
"step": 402
},
{
"epoch": 0.5229521492295215,
"grad_norm": 0.6096974015235901,
"learning_rate": 9.587665009380565e-06,
"loss": 0.7312819957733154,
"step": 403
},
{
"epoch": 0.524249797242498,
"grad_norm": 0.6021596789360046,
"learning_rate": 9.584817159805164e-06,
"loss": 0.7670427560806274,
"step": 404
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.6113924980163574,
"learning_rate": 9.58195993543911e-06,
"loss": 0.7259009480476379,
"step": 405
},
{
"epoch": 0.526845093268451,
"grad_norm": 0.6386753916740417,
"learning_rate": 9.579093342124699e-06,
"loss": 0.7742621898651123,
"step": 406
},
{
"epoch": 0.5281427412814275,
"grad_norm": 0.5846640467643738,
"learning_rate": 9.576217385723391e-06,
"loss": 0.6874604225158691,
"step": 407
},
{
"epoch": 0.5294403892944038,
"grad_norm": 0.5714486241340637,
"learning_rate": 9.57333207211579e-06,
"loss": 0.6830397844314575,
"step": 408
},
{
"epoch": 0.5307380373073803,
"grad_norm": 0.5846112370491028,
"learning_rate": 9.57043740720163e-06,
"loss": 0.7333765029907227,
"step": 409
},
{
"epoch": 0.5320356853203568,
"grad_norm": 0.6309279799461365,
"learning_rate": 9.567533396899769e-06,
"loss": 0.698890209197998,
"step": 410
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.5987696647644043,
"learning_rate": 9.564620047148174e-06,
"loss": 0.7424242496490479,
"step": 411
},
{
"epoch": 0.5346309813463098,
"grad_norm": 0.5915178656578064,
"learning_rate": 9.561697363903908e-06,
"loss": 0.7625330090522766,
"step": 412
},
{
"epoch": 0.5359286293592863,
"grad_norm": 0.6682938933372498,
"learning_rate": 9.558765353143116e-06,
"loss": 0.7808880805969238,
"step": 413
},
{
"epoch": 0.5372262773722628,
"grad_norm": 0.5921300649642944,
"learning_rate": 9.555824020861022e-06,
"loss": 0.7293972969055176,
"step": 414
},
{
"epoch": 0.5385239253852393,
"grad_norm": 0.6055417060852051,
"learning_rate": 9.5528733730719e-06,
"loss": 0.7130710482597351,
"step": 415
},
{
"epoch": 0.5398215733982157,
"grad_norm": 1.2821067571640015,
"learning_rate": 9.549913415809084e-06,
"loss": 0.6902526617050171,
"step": 416
},
{
"epoch": 0.5411192214111922,
"grad_norm": 0.5723661184310913,
"learning_rate": 9.546944155124935e-06,
"loss": 0.7237967252731323,
"step": 417
},
{
"epoch": 0.5424168694241687,
"grad_norm": 0.5984989404678345,
"learning_rate": 9.54396559709084e-06,
"loss": 0.7385105490684509,
"step": 418
},
{
"epoch": 0.5437145174371452,
"grad_norm": 0.6114164590835571,
"learning_rate": 9.540977747797194e-06,
"loss": 0.6872152090072632,
"step": 419
},
{
"epoch": 0.5450121654501217,
"grad_norm": 0.585870087146759,
"learning_rate": 9.537980613353392e-06,
"loss": 0.7558926343917847,
"step": 420
},
{
"epoch": 0.5463098134630981,
"grad_norm": 0.5969951748847961,
"learning_rate": 9.53497419988782e-06,
"loss": 0.7628536224365234,
"step": 421
},
{
"epoch": 0.5476074614760746,
"grad_norm": 0.6526360511779785,
"learning_rate": 9.531958513547832e-06,
"loss": 0.7417917251586914,
"step": 422
},
{
"epoch": 0.5489051094890511,
"grad_norm": 0.6217682361602783,
"learning_rate": 9.52893356049974e-06,
"loss": 0.7846866846084595,
"step": 423
},
{
"epoch": 0.5502027575020276,
"grad_norm": 0.6098693013191223,
"learning_rate": 9.525899346928809e-06,
"loss": 0.7403139472007751,
"step": 424
},
{
"epoch": 0.551500405515004,
"grad_norm": 0.6113680005073547,
"learning_rate": 9.52285587903924e-06,
"loss": 0.7699853181838989,
"step": 425
},
{
"epoch": 0.5527980535279805,
"grad_norm": 0.5491748452186584,
"learning_rate": 9.519803163054149e-06,
"loss": 0.7141760587692261,
"step": 426
},
{
"epoch": 0.554095701540957,
"grad_norm": 0.6018276214599609,
"learning_rate": 9.51674120521557e-06,
"loss": 0.7314755916595459,
"step": 427
},
{
"epoch": 0.5553933495539335,
"grad_norm": 0.6114900708198547,
"learning_rate": 9.513670011784435e-06,
"loss": 0.7220840454101562,
"step": 428
},
{
"epoch": 0.55669099756691,
"grad_norm": 0.5553966760635376,
"learning_rate": 9.510589589040554e-06,
"loss": 0.630115270614624,
"step": 429
},
{
"epoch": 0.5579886455798865,
"grad_norm": 0.5907071232795715,
"learning_rate": 9.507499943282613e-06,
"loss": 0.6516691446304321,
"step": 430
},
{
"epoch": 0.559286293592863,
"grad_norm": 0.5842899084091187,
"learning_rate": 9.504401080828154e-06,
"loss": 0.7031220197677612,
"step": 431
},
{
"epoch": 0.5605839416058395,
"grad_norm": 0.5828782916069031,
"learning_rate": 9.501293008013568e-06,
"loss": 0.7107349038124084,
"step": 432
},
{
"epoch": 0.5618815896188158,
"grad_norm": 0.5939279198646545,
"learning_rate": 9.498175731194077e-06,
"loss": 0.7517828941345215,
"step": 433
},
{
"epoch": 0.5631792376317923,
"grad_norm": 0.6058377623558044,
"learning_rate": 9.495049256743723e-06,
"loss": 0.7890589237213135,
"step": 434
},
{
"epoch": 0.5644768856447688,
"grad_norm": 0.6133562922477722,
"learning_rate": 9.491913591055356e-06,
"loss": 0.6695548892021179,
"step": 435
},
{
"epoch": 0.5657745336577453,
"grad_norm": 0.6204050183296204,
"learning_rate": 9.488768740540615e-06,
"loss": 0.7749900817871094,
"step": 436
},
{
"epoch": 0.5670721816707218,
"grad_norm": 0.5636538863182068,
"learning_rate": 9.485614711629927e-06,
"loss": 0.6592154502868652,
"step": 437
},
{
"epoch": 0.5683698296836983,
"grad_norm": 0.5660319328308105,
"learning_rate": 9.482451510772482e-06,
"loss": 0.7120122313499451,
"step": 438
},
{
"epoch": 0.5696674776966748,
"grad_norm": 0.574423611164093,
"learning_rate": 9.479279144436224e-06,
"loss": 0.7538824081420898,
"step": 439
},
{
"epoch": 0.5709651257096513,
"grad_norm": 0.5769577622413635,
"learning_rate": 9.47609761910784e-06,
"loss": 0.6975010633468628,
"step": 440
},
{
"epoch": 0.5722627737226277,
"grad_norm": 1.1428693532943726,
"learning_rate": 9.472906941292746e-06,
"loss": 0.7184154987335205,
"step": 441
},
{
"epoch": 0.5735604217356042,
"grad_norm": 0.6155918836593628,
"learning_rate": 9.469707117515068e-06,
"loss": 0.7325999140739441,
"step": 442
},
{
"epoch": 0.5748580697485807,
"grad_norm": 0.6040661931037903,
"learning_rate": 9.466498154317635e-06,
"loss": 0.6905105113983154,
"step": 443
},
{
"epoch": 0.5761557177615572,
"grad_norm": 0.6275285482406616,
"learning_rate": 9.463280058261965e-06,
"loss": 0.7441266775131226,
"step": 444
},
{
"epoch": 0.5774533657745337,
"grad_norm": 0.5689868927001953,
"learning_rate": 9.460052835928254e-06,
"loss": 0.6997857093811035,
"step": 445
},
{
"epoch": 0.5787510137875101,
"grad_norm": 0.5860233902931213,
"learning_rate": 9.45681649391535e-06,
"loss": 0.6657996773719788,
"step": 446
},
{
"epoch": 0.5800486618004866,
"grad_norm": 0.5518195629119873,
"learning_rate": 9.453571038840755e-06,
"loss": 0.6410640478134155,
"step": 447
},
{
"epoch": 0.5813463098134631,
"grad_norm": 0.7139276266098022,
"learning_rate": 9.450316477340602e-06,
"loss": 0.7444489598274231,
"step": 448
},
{
"epoch": 0.5826439578264396,
"grad_norm": 0.6063182950019836,
"learning_rate": 9.447052816069648e-06,
"loss": 0.7016487121582031,
"step": 449
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.5990587472915649,
"learning_rate": 9.443780061701252e-06,
"loss": 0.7742944359779358,
"step": 450
},
{
"epoch": 0.5852392538523925,
"grad_norm": 0.5863263010978699,
"learning_rate": 9.44049822092737e-06,
"loss": 0.7078189253807068,
"step": 451
},
{
"epoch": 0.586536901865369,
"grad_norm": 0.5597153902053833,
"learning_rate": 9.437207300458535e-06,
"loss": 0.7037616968154907,
"step": 452
},
{
"epoch": 0.5878345498783455,
"grad_norm": 0.5865596532821655,
"learning_rate": 9.433907307023845e-06,
"loss": 0.7111040353775024,
"step": 453
},
{
"epoch": 0.589132197891322,
"grad_norm": 0.595535397529602,
"learning_rate": 9.430598247370955e-06,
"loss": 0.6840265393257141,
"step": 454
},
{
"epoch": 0.5904298459042985,
"grad_norm": 0.6209713816642761,
"learning_rate": 9.427280128266049e-06,
"loss": 0.6608985066413879,
"step": 455
},
{
"epoch": 0.591727493917275,
"grad_norm": 0.7749186158180237,
"learning_rate": 9.423952956493846e-06,
"loss": 0.6757811307907104,
"step": 456
},
{
"epoch": 0.5930251419302515,
"grad_norm": 0.6284626126289368,
"learning_rate": 9.420616738857568e-06,
"loss": 0.6912366151809692,
"step": 457
},
{
"epoch": 0.5943227899432278,
"grad_norm": 0.5830085277557373,
"learning_rate": 9.417271482178938e-06,
"loss": 0.7678932547569275,
"step": 458
},
{
"epoch": 0.5956204379562043,
"grad_norm": 0.5680383443832397,
"learning_rate": 9.413917193298153e-06,
"loss": 0.7322279810905457,
"step": 459
},
{
"epoch": 0.5969180859691808,
"grad_norm": 0.5904244184494019,
"learning_rate": 9.41055387907389e-06,
"loss": 0.6763080358505249,
"step": 460
},
{
"epoch": 0.5969180859691808,
"eval_loss": 0.7040426731109619,
"eval_runtime": 73.0729,
"eval_samples_per_second": 71.052,
"eval_steps_per_second": 8.882,
"step": 460
},
{
"epoch": 0.5982157339821573,
"grad_norm": 0.5804091691970825,
"learning_rate": 9.407181546383275e-06,
"loss": 0.7188655138015747,
"step": 461
},
{
"epoch": 0.5995133819951338,
"grad_norm": 0.5912026166915894,
"learning_rate": 9.403800202121873e-06,
"loss": 0.6785882711410522,
"step": 462
},
{
"epoch": 0.6008110300081103,
"grad_norm": 0.5554898381233215,
"learning_rate": 9.400409853203677e-06,
"loss": 0.7052475214004517,
"step": 463
},
{
"epoch": 0.6021086780210868,
"grad_norm": 0.6723419427871704,
"learning_rate": 9.397010506561096e-06,
"loss": 0.6488598585128784,
"step": 464
},
{
"epoch": 0.6034063260340633,
"grad_norm": 0.5925308465957642,
"learning_rate": 9.393602169144929e-06,
"loss": 0.7316585779190063,
"step": 465
},
{
"epoch": 0.6047039740470398,
"grad_norm": 0.7151989936828613,
"learning_rate": 9.390184847924366e-06,
"loss": 0.7060757875442505,
"step": 466
},
{
"epoch": 0.6060016220600162,
"grad_norm": 0.5946957468986511,
"learning_rate": 9.386758549886964e-06,
"loss": 0.7584104537963867,
"step": 467
},
{
"epoch": 0.6072992700729927,
"grad_norm": 0.568766176700592,
"learning_rate": 9.383323282038632e-06,
"loss": 0.725806713104248,
"step": 468
},
{
"epoch": 0.6085969180859692,
"grad_norm": 0.5797498226165771,
"learning_rate": 9.379879051403627e-06,
"loss": 0.6769331693649292,
"step": 469
},
{
"epoch": 0.6098945660989457,
"grad_norm": 0.7914499640464783,
"learning_rate": 9.376425865024527e-06,
"loss": 0.7631534934043884,
"step": 470
},
{
"epoch": 0.6111922141119221,
"grad_norm": 0.601610004901886,
"learning_rate": 9.372963729962227e-06,
"loss": 0.8109684586524963,
"step": 471
},
{
"epoch": 0.6124898621248986,
"grad_norm": 0.6191813349723816,
"learning_rate": 9.369492653295913e-06,
"loss": 0.6854857206344604,
"step": 472
},
{
"epoch": 0.6137875101378751,
"grad_norm": 0.8444225192070007,
"learning_rate": 9.366012642123061e-06,
"loss": 0.7072763442993164,
"step": 473
},
{
"epoch": 0.6150851581508516,
"grad_norm": 0.5926432609558105,
"learning_rate": 9.362523703559412e-06,
"loss": 0.7057541012763977,
"step": 474
},
{
"epoch": 0.616382806163828,
"grad_norm": 0.5982694625854492,
"learning_rate": 9.359025844738962e-06,
"loss": 0.7388914823532104,
"step": 475
},
{
"epoch": 0.6176804541768045,
"grad_norm": 0.6068631410598755,
"learning_rate": 9.355519072813946e-06,
"loss": 0.7815642356872559,
"step": 476
},
{
"epoch": 0.618978102189781,
"grad_norm": 0.5807543396949768,
"learning_rate": 9.352003394954827e-06,
"loss": 0.7441459894180298,
"step": 477
},
{
"epoch": 0.6202757502027575,
"grad_norm": 0.5668230056762695,
"learning_rate": 9.348478818350277e-06,
"loss": 0.7281776666641235,
"step": 478
},
{
"epoch": 0.621573398215734,
"grad_norm": 0.6428498029708862,
"learning_rate": 9.34494535020716e-06,
"loss": 0.754060685634613,
"step": 479
},
{
"epoch": 0.6228710462287105,
"grad_norm": 0.6553912162780762,
"learning_rate": 9.341402997750526e-06,
"loss": 0.6970114707946777,
"step": 480
},
{
"epoch": 0.624168694241687,
"grad_norm": 0.5876368880271912,
"learning_rate": 9.337851768223589e-06,
"loss": 0.7278268933296204,
"step": 481
},
{
"epoch": 0.6254663422546635,
"grad_norm": 0.6632186770439148,
"learning_rate": 9.334291668887716e-06,
"loss": 0.724956750869751,
"step": 482
},
{
"epoch": 0.6267639902676398,
"grad_norm": 0.582115113735199,
"learning_rate": 9.330722707022406e-06,
"loss": 0.7292401790618896,
"step": 483
},
{
"epoch": 0.6280616382806163,
"grad_norm": 0.5983607769012451,
"learning_rate": 9.327144889925286e-06,
"loss": 0.7359820604324341,
"step": 484
},
{
"epoch": 0.6293592862935928,
"grad_norm": 0.594374418258667,
"learning_rate": 9.323558224912083e-06,
"loss": 0.7724255323410034,
"step": 485
},
{
"epoch": 0.6306569343065693,
"grad_norm": 0.5669406056404114,
"learning_rate": 9.319962719316621e-06,
"loss": 0.7348428964614868,
"step": 486
},
{
"epoch": 0.6319545823195458,
"grad_norm": 0.6060366630554199,
"learning_rate": 9.3163583804908e-06,
"loss": 0.682552695274353,
"step": 487
},
{
"epoch": 0.6332522303325223,
"grad_norm": 0.6307089328765869,
"learning_rate": 9.312745215804577e-06,
"loss": 0.8117605447769165,
"step": 488
},
{
"epoch": 0.6345498783454988,
"grad_norm": 0.5955522656440735,
"learning_rate": 9.309123232645963e-06,
"loss": 0.7129393219947815,
"step": 489
},
{
"epoch": 0.6358475263584753,
"grad_norm": 0.6481534242630005,
"learning_rate": 9.305492438420995e-06,
"loss": 0.6988842487335205,
"step": 490
},
{
"epoch": 0.6371451743714518,
"grad_norm": 0.5734648108482361,
"learning_rate": 9.301852840553728e-06,
"loss": 0.678565263748169,
"step": 491
},
{
"epoch": 0.6384428223844282,
"grad_norm": 0.5938750505447388,
"learning_rate": 9.298204446486221e-06,
"loss": 0.7267583608627319,
"step": 492
},
{
"epoch": 0.6397404703974047,
"grad_norm": 0.5493259429931641,
"learning_rate": 9.294547263678515e-06,
"loss": 0.665608286857605,
"step": 493
},
{
"epoch": 0.6410381184103812,
"grad_norm": 0.6349811553955078,
"learning_rate": 9.29088129960862e-06,
"loss": 0.7591350078582764,
"step": 494
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.5922753214836121,
"learning_rate": 9.28720656177251e-06,
"loss": 0.6984656453132629,
"step": 495
},
{
"epoch": 0.6436334144363342,
"grad_norm": 0.5910064578056335,
"learning_rate": 9.28352305768409e-06,
"loss": 0.7371819019317627,
"step": 496
},
{
"epoch": 0.6449310624493106,
"grad_norm": 0.5690438151359558,
"learning_rate": 9.279830794875194e-06,
"loss": 0.7185039520263672,
"step": 497
},
{
"epoch": 0.6462287104622871,
"grad_norm": 0.6163010597229004,
"learning_rate": 9.276129780895566e-06,
"loss": 0.6993834972381592,
"step": 498
},
{
"epoch": 0.6475263584752636,
"grad_norm": 0.6288541555404663,
"learning_rate": 9.272420023312843e-06,
"loss": 0.8217408657073975,
"step": 499
},
{
"epoch": 0.64882400648824,
"grad_norm": 0.620994508266449,
"learning_rate": 9.268701529712541e-06,
"loss": 0.7522677779197693,
"step": 500
},
{
"epoch": 0.6501216545012165,
"grad_norm": 0.5998205542564392,
"learning_rate": 9.264974307698034e-06,
"loss": 0.6935300827026367,
"step": 501
},
{
"epoch": 0.651419302514193,
"grad_norm": 0.8760928511619568,
"learning_rate": 9.261238364890553e-06,
"loss": 0.7158179879188538,
"step": 502
},
{
"epoch": 0.6527169505271695,
"grad_norm": 0.6253861784934998,
"learning_rate": 9.257493708929153e-06,
"loss": 0.7684556841850281,
"step": 503
},
{
"epoch": 0.654014598540146,
"grad_norm": 0.6935423016548157,
"learning_rate": 9.253740347470708e-06,
"loss": 0.778200626373291,
"step": 504
},
{
"epoch": 0.6553122465531225,
"grad_norm": 0.6469247937202454,
"learning_rate": 9.24997828818989e-06,
"loss": 0.7509121894836426,
"step": 505
},
{
"epoch": 0.656609894566099,
"grad_norm": 0.6015416979789734,
"learning_rate": 9.246207538779162e-06,
"loss": 0.7778556942939758,
"step": 506
},
{
"epoch": 0.6579075425790755,
"grad_norm": 0.5774285793304443,
"learning_rate": 9.242428106948748e-06,
"loss": 0.7515290975570679,
"step": 507
},
{
"epoch": 0.659205190592052,
"grad_norm": 0.5681214332580566,
"learning_rate": 9.238640000426635e-06,
"loss": 0.7492050528526306,
"step": 508
},
{
"epoch": 0.6605028386050283,
"grad_norm": 0.5640445351600647,
"learning_rate": 9.234843226958537e-06,
"loss": 0.6927063465118408,
"step": 509
},
{
"epoch": 0.6618004866180048,
"grad_norm": 0.6083568334579468,
"learning_rate": 9.231037794307896e-06,
"loss": 0.7587168216705322,
"step": 510
},
{
"epoch": 0.6630981346309813,
"grad_norm": 0.5821657776832581,
"learning_rate": 9.22722371025586e-06,
"loss": 0.7126904726028442,
"step": 511
},
{
"epoch": 0.6643957826439578,
"grad_norm": 2.4457342624664307,
"learning_rate": 9.223400982601262e-06,
"loss": 0.6615161895751953,
"step": 512
},
{
"epoch": 0.6656934306569343,
"grad_norm": 0.6009355187416077,
"learning_rate": 9.219569619160618e-06,
"loss": 0.7299069166183472,
"step": 513
},
{
"epoch": 0.6669910786699108,
"grad_norm": 0.6069469451904297,
"learning_rate": 9.215729627768093e-06,
"loss": 0.7864600419998169,
"step": 514
},
{
"epoch": 0.6682887266828873,
"grad_norm": 0.6514759659767151,
"learning_rate": 9.2118810162755e-06,
"loss": 0.6937267184257507,
"step": 515
},
{
"epoch": 0.6695863746958638,
"grad_norm": 0.795812726020813,
"learning_rate": 9.20802379255227e-06,
"loss": 0.704431414604187,
"step": 516
},
{
"epoch": 0.6708840227088402,
"grad_norm": 0.6042063236236572,
"learning_rate": 9.204157964485454e-06,
"loss": 0.7550405263900757,
"step": 517
},
{
"epoch": 0.6721816707218167,
"grad_norm": 0.6756092309951782,
"learning_rate": 9.200283539979691e-06,
"loss": 0.7409992218017578,
"step": 518
},
{
"epoch": 0.6734793187347932,
"grad_norm": 0.7710636854171753,
"learning_rate": 9.196400526957198e-06,
"loss": 0.7560484409332275,
"step": 519
},
{
"epoch": 0.6747769667477697,
"grad_norm": 0.7084681987762451,
"learning_rate": 9.192508933357753e-06,
"loss": 0.7406056523323059,
"step": 520
},
{
"epoch": 0.6760746147607462,
"grad_norm": 0.6131231188774109,
"learning_rate": 9.188608767138683e-06,
"loss": 0.7801857590675354,
"step": 521
},
{
"epoch": 0.6773722627737226,
"grad_norm": 0.6520926356315613,
"learning_rate": 9.184700036274837e-06,
"loss": 0.7538937926292419,
"step": 522
},
{
"epoch": 0.6786699107866991,
"grad_norm": 0.5901785492897034,
"learning_rate": 9.180782748758583e-06,
"loss": 0.7579227089881897,
"step": 523
},
{
"epoch": 0.6799675587996756,
"grad_norm": 0.5867577195167542,
"learning_rate": 9.17685691259978e-06,
"loss": 0.7785968780517578,
"step": 524
},
{
"epoch": 0.681265206812652,
"grad_norm": 0.6682732105255127,
"learning_rate": 9.172922535825772e-06,
"loss": 0.6564942598342896,
"step": 525
},
{
"epoch": 0.6825628548256285,
"grad_norm": 0.5923816561698914,
"learning_rate": 9.168979626481364e-06,
"loss": 0.7041895985603333,
"step": 526
},
{
"epoch": 0.683860502838605,
"grad_norm": 0.5651242136955261,
"learning_rate": 9.165028192628803e-06,
"loss": 0.7024134397506714,
"step": 527
},
{
"epoch": 0.6851581508515815,
"grad_norm": 0.6138148307800293,
"learning_rate": 9.161068242347777e-06,
"loss": 0.680936872959137,
"step": 528
},
{
"epoch": 0.686455798864558,
"grad_norm": 0.5655775666236877,
"learning_rate": 9.157099783735378e-06,
"loss": 0.6618273854255676,
"step": 529
},
{
"epoch": 0.6877534468775345,
"grad_norm": 0.6033377051353455,
"learning_rate": 9.1531228249061e-06,
"loss": 0.7136421203613281,
"step": 530
},
{
"epoch": 0.689051094890511,
"grad_norm": 0.7331950068473816,
"learning_rate": 9.149137373991819e-06,
"loss": 0.7970547676086426,
"step": 531
},
{
"epoch": 0.6903487429034875,
"grad_norm": 0.5791338682174683,
"learning_rate": 9.145143439141771e-06,
"loss": 0.6997847557067871,
"step": 532
},
{
"epoch": 0.691646390916464,
"grad_norm": 0.578549325466156,
"learning_rate": 9.141141028522544e-06,
"loss": 0.7562875151634216,
"step": 533
},
{
"epoch": 0.6929440389294403,
"grad_norm": 1.920037865638733,
"learning_rate": 9.137130150318055e-06,
"loss": 0.6756929755210876,
"step": 534
},
{
"epoch": 0.6942416869424168,
"grad_norm": 0.6300271153450012,
"learning_rate": 9.133110812729532e-06,
"loss": 0.7216504216194153,
"step": 535
},
{
"epoch": 0.6955393349553933,
"grad_norm": 0.6114068031311035,
"learning_rate": 9.129083023975505e-06,
"loss": 0.7115483283996582,
"step": 536
},
{
"epoch": 0.6968369829683698,
"grad_norm": 0.6002055406570435,
"learning_rate": 9.125046792291784e-06,
"loss": 0.7236282229423523,
"step": 537
},
{
"epoch": 0.6981346309813463,
"grad_norm": 0.6047035455703735,
"learning_rate": 9.121002125931436e-06,
"loss": 0.6811922788619995,
"step": 538
},
{
"epoch": 0.6994322789943228,
"grad_norm": 0.6067850589752197,
"learning_rate": 9.116949033164785e-06,
"loss": 0.7463216781616211,
"step": 539
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.5822233557701111,
"learning_rate": 9.112887522279378e-06,
"loss": 0.7334940433502197,
"step": 540
},
{
"epoch": 0.7020275750202758,
"grad_norm": 0.5947557687759399,
"learning_rate": 9.108817601579978e-06,
"loss": 0.7504947185516357,
"step": 541
},
{
"epoch": 0.7033252230332522,
"grad_norm": 0.6123725771903992,
"learning_rate": 9.104739279388542e-06,
"loss": 0.7778276205062866,
"step": 542
},
{
"epoch": 0.7046228710462287,
"grad_norm": 0.6185777187347412,
"learning_rate": 9.100652564044206e-06,
"loss": 0.7200486660003662,
"step": 543
},
{
"epoch": 0.7059205190592052,
"grad_norm": 1.0739803314208984,
"learning_rate": 9.09655746390327e-06,
"loss": 0.7538056969642639,
"step": 544
},
{
"epoch": 0.7072181670721817,
"grad_norm": 0.5895283818244934,
"learning_rate": 9.092453987339174e-06,
"loss": 0.6963307857513428,
"step": 545
},
{
"epoch": 0.7085158150851582,
"grad_norm": 0.5688499212265015,
"learning_rate": 9.088342142742493e-06,
"loss": 0.7032905220985413,
"step": 546
},
{
"epoch": 0.7098134630981346,
"grad_norm": 0.6233918070793152,
"learning_rate": 9.084221938520906e-06,
"loss": 0.6713303923606873,
"step": 547
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.7095353007316589,
"learning_rate": 9.080093383099187e-06,
"loss": 0.7268386483192444,
"step": 548
},
{
"epoch": 0.7124087591240876,
"grad_norm": 0.6135478019714355,
"learning_rate": 9.07595648491919e-06,
"loss": 0.7246679663658142,
"step": 549
},
{
"epoch": 0.7137064071370641,
"grad_norm": 0.582713782787323,
"learning_rate": 9.071811252439823e-06,
"loss": 0.691692590713501,
"step": 550
},
{
"epoch": 0.7150040551500405,
"grad_norm": 0.542813241481781,
"learning_rate": 9.067657694137038e-06,
"loss": 0.7191475629806519,
"step": 551
},
{
"epoch": 0.716301703163017,
"grad_norm": 0.6026738286018372,
"learning_rate": 9.063495818503809e-06,
"loss": 0.7817606925964355,
"step": 552
},
{
"epoch": 0.7175993511759935,
"grad_norm": 0.8981631398200989,
"learning_rate": 9.059325634050118e-06,
"loss": 0.7415137887001038,
"step": 553
},
{
"epoch": 0.71889699918897,
"grad_norm": 0.624947190284729,
"learning_rate": 9.05514714930294e-06,
"loss": 0.7271240949630737,
"step": 554
},
{
"epoch": 0.7201946472019465,
"grad_norm": 0.5546719431877136,
"learning_rate": 9.050960372806214e-06,
"loss": 0.698599100112915,
"step": 555
},
{
"epoch": 0.721492295214923,
"grad_norm": 0.5948834419250488,
"learning_rate": 9.046765313120842e-06,
"loss": 0.7756059169769287,
"step": 556
},
{
"epoch": 0.7227899432278995,
"grad_norm": 0.5877026915550232,
"learning_rate": 9.042561978824657e-06,
"loss": 0.7625119090080261,
"step": 557
},
{
"epoch": 0.724087591240876,
"grad_norm": 0.6063138246536255,
"learning_rate": 9.038350378512417e-06,
"loss": 0.7803001403808594,
"step": 558
},
{
"epoch": 0.7253852392538523,
"grad_norm": 0.5974534153938293,
"learning_rate": 9.034130520795774e-06,
"loss": 0.716859757900238,
"step": 559
},
{
"epoch": 0.7266828872668288,
"grad_norm": 0.5728408694267273,
"learning_rate": 9.029902414303273e-06,
"loss": 0.749966561794281,
"step": 560
},
{
"epoch": 0.7279805352798053,
"grad_norm": 0.5723510384559631,
"learning_rate": 9.025666067680319e-06,
"loss": 0.6597641706466675,
"step": 561
},
{
"epoch": 0.7292781832927818,
"grad_norm": 0.6084505915641785,
"learning_rate": 9.021421489589169e-06,
"loss": 0.710649847984314,
"step": 562
},
{
"epoch": 0.7305758313057583,
"grad_norm": 0.5824548006057739,
"learning_rate": 9.017168688708913e-06,
"loss": 0.6628729104995728,
"step": 563
},
{
"epoch": 0.7318734793187348,
"grad_norm": 0.594218373298645,
"learning_rate": 9.01290767373545e-06,
"loss": 0.730206310749054,
"step": 564
},
{
"epoch": 0.7331711273317113,
"grad_norm": 0.7261629700660706,
"learning_rate": 9.008638453381477e-06,
"loss": 0.6241463422775269,
"step": 565
},
{
"epoch": 0.7344687753446878,
"grad_norm": 0.6365723609924316,
"learning_rate": 9.004361036376472e-06,
"loss": 0.7979130148887634,
"step": 566
},
{
"epoch": 0.7357664233576642,
"grad_norm": 0.6350899934768677,
"learning_rate": 9.000075431466668e-06,
"loss": 0.7318904399871826,
"step": 567
},
{
"epoch": 0.7370640713706407,
"grad_norm": 0.5833107829093933,
"learning_rate": 8.995781647415041e-06,
"loss": 0.6889808177947998,
"step": 568
},
{
"epoch": 0.7383617193836172,
"grad_norm": 1.110663652420044,
"learning_rate": 8.991479693001296e-06,
"loss": 0.7418273687362671,
"step": 569
},
{
"epoch": 0.7396593673965937,
"grad_norm": 0.5860966444015503,
"learning_rate": 8.987169577021838e-06,
"loss": 0.7295401096343994,
"step": 570
},
{
"epoch": 0.7409570154095702,
"grad_norm": 2.7430782318115234,
"learning_rate": 8.982851308289765e-06,
"loss": 0.7898417711257935,
"step": 571
},
{
"epoch": 0.7422546634225466,
"grad_norm": 0.6228799223899841,
"learning_rate": 8.978524895634842e-06,
"loss": 0.7360432147979736,
"step": 572
},
{
"epoch": 0.7435523114355231,
"grad_norm": 0.6052027344703674,
"learning_rate": 8.974190347903491e-06,
"loss": 0.7148642539978027,
"step": 573
},
{
"epoch": 0.7448499594484996,
"grad_norm": 0.5462301969528198,
"learning_rate": 8.96984767395876e-06,
"loss": 0.6608201861381531,
"step": 574
},
{
"epoch": 0.7461476074614761,
"grad_norm": 0.6186708211898804,
"learning_rate": 8.965496882680322e-06,
"loss": 0.7763011455535889,
"step": 575
},
{
"epoch": 0.7474452554744525,
"grad_norm": 0.5678666830062866,
"learning_rate": 8.961137982964445e-06,
"loss": 0.6967377662658691,
"step": 576
},
{
"epoch": 0.748742903487429,
"grad_norm": 0.5985408425331116,
"learning_rate": 8.95677098372397e-06,
"loss": 0.7348828911781311,
"step": 577
},
{
"epoch": 0.7500405515004055,
"grad_norm": 0.5867311954498291,
"learning_rate": 8.95239589388831e-06,
"loss": 0.7279753684997559,
"step": 578
},
{
"epoch": 0.751338199513382,
"grad_norm": 0.5872586369514465,
"learning_rate": 8.948012722403417e-06,
"loss": 0.7667936086654663,
"step": 579
},
{
"epoch": 0.7526358475263585,
"grad_norm": 0.6062989234924316,
"learning_rate": 8.943621478231764e-06,
"loss": 0.7433009147644043,
"step": 580
},
{
"epoch": 0.753933495539335,
"grad_norm": 0.5952759981155396,
"learning_rate": 8.939222170352333e-06,
"loss": 0.7213162183761597,
"step": 581
},
{
"epoch": 0.7552311435523115,
"grad_norm": 0.6251077651977539,
"learning_rate": 8.9348148077606e-06,
"loss": 0.6798166632652283,
"step": 582
},
{
"epoch": 0.756528791565288,
"grad_norm": 0.6643015742301941,
"learning_rate": 8.9303993994685e-06,
"loss": 0.697973370552063,
"step": 583
},
{
"epoch": 0.7578264395782643,
"grad_norm": 0.614818274974823,
"learning_rate": 8.925975954504432e-06,
"loss": 0.6740398406982422,
"step": 584
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.5874298214912415,
"learning_rate": 8.921544481913218e-06,
"loss": 0.6789122819900513,
"step": 585
},
{
"epoch": 0.7604217356042173,
"grad_norm": 0.5964909791946411,
"learning_rate": 8.917104990756096e-06,
"loss": 0.7620725631713867,
"step": 586
},
{
"epoch": 0.7617193836171938,
"grad_norm": 0.6049628853797913,
"learning_rate": 8.912657490110705e-06,
"loss": 0.7080841064453125,
"step": 587
},
{
"epoch": 0.7630170316301703,
"grad_norm": 0.5781946778297424,
"learning_rate": 8.908201989071055e-06,
"loss": 0.7524607181549072,
"step": 588
},
{
"epoch": 0.7643146796431468,
"grad_norm": 0.585602879524231,
"learning_rate": 8.903738496747523e-06,
"loss": 0.775031566619873,
"step": 589
},
{
"epoch": 0.7656123276561233,
"grad_norm": 0.5722633004188538,
"learning_rate": 8.899267022266815e-06,
"loss": 0.7250426411628723,
"step": 590
},
{
"epoch": 0.7669099756690998,
"grad_norm": 0.5955145359039307,
"learning_rate": 8.894787574771968e-06,
"loss": 0.7013397216796875,
"step": 591
},
{
"epoch": 0.7682076236820763,
"grad_norm": 0.5935817956924438,
"learning_rate": 8.890300163422319e-06,
"loss": 0.7290763854980469,
"step": 592
},
{
"epoch": 0.7695052716950527,
"grad_norm": 0.5822441577911377,
"learning_rate": 8.885804797393484e-06,
"loss": 0.7267876863479614,
"step": 593
},
{
"epoch": 0.7708029197080292,
"grad_norm": 0.6610195636749268,
"learning_rate": 8.881301485877355e-06,
"loss": 0.7642419338226318,
"step": 594
},
{
"epoch": 0.7721005677210057,
"grad_norm": 0.5827111005783081,
"learning_rate": 8.87679023808206e-06,
"loss": 0.6633021831512451,
"step": 595
},
{
"epoch": 0.7733982157339822,
"grad_norm": 0.5982354283332825,
"learning_rate": 8.87227106323196e-06,
"loss": 0.7427453994750977,
"step": 596
},
{
"epoch": 0.7746958637469586,
"grad_norm": 0.5927367210388184,
"learning_rate": 8.867743970567625e-06,
"loss": 0.6740269660949707,
"step": 597
},
{
"epoch": 0.7759935117599351,
"grad_norm": 0.5812351703643799,
"learning_rate": 8.86320896934581e-06,
"loss": 0.7781720161437988,
"step": 598
},
{
"epoch": 0.7772911597729116,
"grad_norm": 0.5589850544929504,
"learning_rate": 8.858666068839447e-06,
"loss": 0.6646384000778198,
"step": 599
},
{
"epoch": 0.7785888077858881,
"grad_norm": 0.6152946352958679,
"learning_rate": 8.85411527833762e-06,
"loss": 0.7158241868019104,
"step": 600
},
{
"epoch": 0.7798864557988645,
"grad_norm": 0.6571215987205505,
"learning_rate": 8.849556607145541e-06,
"loss": 0.6301259994506836,
"step": 601
},
{
"epoch": 0.781184103811841,
"grad_norm": 0.650355339050293,
"learning_rate": 8.84499006458454e-06,
"loss": 0.7729838490486145,
"step": 602
},
{
"epoch": 0.7824817518248175,
"grad_norm": 0.5668020844459534,
"learning_rate": 8.840415659992038e-06,
"loss": 0.7071006298065186,
"step": 603
},
{
"epoch": 0.783779399837794,
"grad_norm": 0.5940731763839722,
"learning_rate": 8.835833402721538e-06,
"loss": 0.709991991519928,
"step": 604
},
{
"epoch": 0.7850770478507705,
"grad_norm": 0.6069549918174744,
"learning_rate": 8.831243302142595e-06,
"loss": 0.7425503730773926,
"step": 605
},
{
"epoch": 0.786374695863747,
"grad_norm": 0.6917547583580017,
"learning_rate": 8.826645367640803e-06,
"loss": 0.7509415149688721,
"step": 606
},
{
"epoch": 0.7876723438767235,
"grad_norm": 0.5669399499893188,
"learning_rate": 8.822039608617773e-06,
"loss": 0.7422374486923218,
"step": 607
},
{
"epoch": 0.7889699918897,
"grad_norm": 0.5998254418373108,
"learning_rate": 8.81742603449112e-06,
"loss": 0.6498250961303711,
"step": 608
},
{
"epoch": 0.7902676399026763,
"grad_norm": 0.5784206390380859,
"learning_rate": 8.81280465469443e-06,
"loss": 0.7794440388679504,
"step": 609
},
{
"epoch": 0.7915652879156528,
"grad_norm": 0.5644393563270569,
"learning_rate": 8.808175478677261e-06,
"loss": 0.697083055973053,
"step": 610
},
{
"epoch": 0.7928629359286293,
"grad_norm": 0.5781574249267578,
"learning_rate": 8.803538515905102e-06,
"loss": 0.6970184445381165,
"step": 611
},
{
"epoch": 0.7941605839416058,
"grad_norm": 0.585652768611908,
"learning_rate": 8.79889377585937e-06,
"loss": 0.7602633833885193,
"step": 612
},
{
"epoch": 0.7954582319545823,
"grad_norm": 0.5716352462768555,
"learning_rate": 8.79424126803738e-06,
"loss": 0.717863142490387,
"step": 613
},
{
"epoch": 0.7967558799675588,
"grad_norm": 0.5922728776931763,
"learning_rate": 8.789581001952339e-06,
"loss": 0.7333586812019348,
"step": 614
},
{
"epoch": 0.7980535279805353,
"grad_norm": 0.7918326258659363,
"learning_rate": 8.784912987133305e-06,
"loss": 0.7329719066619873,
"step": 615
},
{
"epoch": 0.7993511759935118,
"grad_norm": 0.6318597793579102,
"learning_rate": 8.78023723312519e-06,
"loss": 0.71714848279953,
"step": 616
},
{
"epoch": 0.8006488240064883,
"grad_norm": 0.5931165814399719,
"learning_rate": 8.775553749488729e-06,
"loss": 0.6446089744567871,
"step": 617
},
{
"epoch": 0.8019464720194647,
"grad_norm": 0.5699899196624756,
"learning_rate": 8.770862545800459e-06,
"loss": 0.6896922588348389,
"step": 618
},
{
"epoch": 0.8032441200324412,
"grad_norm": 0.5788043141365051,
"learning_rate": 8.766163631652702e-06,
"loss": 0.7116216421127319,
"step": 619
},
{
"epoch": 0.8045417680454177,
"grad_norm": 0.6152717471122742,
"learning_rate": 8.76145701665355e-06,
"loss": 0.7757282853126526,
"step": 620
},
{
"epoch": 0.8058394160583942,
"grad_norm": 0.6117092967033386,
"learning_rate": 8.756742710426842e-06,
"loss": 0.6977071166038513,
"step": 621
},
{
"epoch": 0.8071370640713706,
"grad_norm": 0.5893334150314331,
"learning_rate": 8.752020722612135e-06,
"loss": 0.7122848033905029,
"step": 622
},
{
"epoch": 0.8084347120843471,
"grad_norm": 0.613097608089447,
"learning_rate": 8.747291062864704e-06,
"loss": 0.7448244094848633,
"step": 623
},
{
"epoch": 0.8097323600973236,
"grad_norm": 0.5860653519630432,
"learning_rate": 8.742553740855507e-06,
"loss": 0.6702634692192078,
"step": 624
},
{
"epoch": 0.8110300081103001,
"grad_norm": 0.6024116277694702,
"learning_rate": 8.737808766271163e-06,
"loss": 0.6898221969604492,
"step": 625
},
{
"epoch": 0.8123276561232765,
"grad_norm": 0.5622679591178894,
"learning_rate": 8.733056148813947e-06,
"loss": 0.7181109189987183,
"step": 626
},
{
"epoch": 0.813625304136253,
"grad_norm": 0.595656156539917,
"learning_rate": 8.728295898201762e-06,
"loss": 0.7352790832519531,
"step": 627
},
{
"epoch": 0.8149229521492295,
"grad_norm": 0.5798142552375793,
"learning_rate": 8.72352802416811e-06,
"loss": 0.6691849231719971,
"step": 628
},
{
"epoch": 0.816220600162206,
"grad_norm": 0.6328383088111877,
"learning_rate": 8.718752536462089e-06,
"loss": 0.7578571438789368,
"step": 629
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.6140182018280029,
"learning_rate": 8.713969444848365e-06,
"loss": 0.8000912666320801,
"step": 630
},
{
"epoch": 0.818815896188159,
"grad_norm": 0.5924091935157776,
"learning_rate": 8.709178759107146e-06,
"loss": 0.7412709593772888,
"step": 631
},
{
"epoch": 0.8201135442011355,
"grad_norm": 0.5865992903709412,
"learning_rate": 8.704380489034172e-06,
"loss": 0.6817134022712708,
"step": 632
},
{
"epoch": 0.821411192214112,
"grad_norm": 0.6066908240318298,
"learning_rate": 8.699574644440696e-06,
"loss": 0.7462890148162842,
"step": 633
},
{
"epoch": 0.8227088402270885,
"grad_norm": 0.6996213793754578,
"learning_rate": 8.694761235153446e-06,
"loss": 0.7541388273239136,
"step": 634
},
{
"epoch": 0.8240064882400648,
"grad_norm": 0.5837500691413879,
"learning_rate": 8.689940271014631e-06,
"loss": 0.7211518883705139,
"step": 635
},
{
"epoch": 0.8253041362530413,
"grad_norm": 0.6041287183761597,
"learning_rate": 8.685111761881902e-06,
"loss": 0.7510079741477966,
"step": 636
},
{
"epoch": 0.8266017842660178,
"grad_norm": 0.5609418153762817,
"learning_rate": 8.680275717628336e-06,
"loss": 0.7399103045463562,
"step": 637
},
{
"epoch": 0.8278994322789943,
"grad_norm": 0.6362541913986206,
"learning_rate": 8.675432148142423e-06,
"loss": 0.7379388809204102,
"step": 638
},
{
"epoch": 0.8291970802919708,
"grad_norm": 0.555855393409729,
"learning_rate": 8.670581063328031e-06,
"loss": 0.6878998279571533,
"step": 639
},
{
"epoch": 0.8304947283049473,
"grad_norm": 0.5522022843360901,
"learning_rate": 8.665722473104407e-06,
"loss": 0.6912398338317871,
"step": 640
},
{
"epoch": 0.8317923763179238,
"grad_norm": 0.6348553895950317,
"learning_rate": 8.660856387406134e-06,
"loss": 0.7144729495048523,
"step": 641
},
{
"epoch": 0.8330900243309003,
"grad_norm": 0.5787035226821899,
"learning_rate": 8.655982816183127e-06,
"loss": 0.7252941727638245,
"step": 642
},
{
"epoch": 0.8343876723438767,
"grad_norm": 1.6580746173858643,
"learning_rate": 8.651101769400606e-06,
"loss": 0.7200146913528442,
"step": 643
},
{
"epoch": 0.8356853203568532,
"grad_norm": 1.0832597017288208,
"learning_rate": 8.646213257039076e-06,
"loss": 0.7684627771377563,
"step": 644
},
{
"epoch": 0.8369829683698297,
"grad_norm": 1.513912320137024,
"learning_rate": 8.641317289094306e-06,
"loss": 0.7325241565704346,
"step": 645
},
{
"epoch": 0.8382806163828062,
"grad_norm": 0.6023765802383423,
"learning_rate": 8.636413875577314e-06,
"loss": 0.74098801612854,
"step": 646
},
{
"epoch": 0.8395782643957826,
"grad_norm": 0.6051165461540222,
"learning_rate": 8.631503026514337e-06,
"loss": 0.6847478151321411,
"step": 647
},
{
"epoch": 0.8408759124087591,
"grad_norm": 0.5932079553604126,
"learning_rate": 8.626584751946818e-06,
"loss": 0.731514036655426,
"step": 648
},
{
"epoch": 0.8421735604217356,
"grad_norm": 0.592435359954834,
"learning_rate": 8.621659061931389e-06,
"loss": 0.7055472731590271,
"step": 649
},
{
"epoch": 0.8434712084347121,
"grad_norm": 2.370189905166626,
"learning_rate": 8.616725966539831e-06,
"loss": 0.6948425769805908,
"step": 650
},
{
"epoch": 0.8447688564476885,
"grad_norm": 0.6067817807197571,
"learning_rate": 8.611785475859083e-06,
"loss": 0.7035855650901794,
"step": 651
},
{
"epoch": 0.846066504460665,
"grad_norm": 0.6086214780807495,
"learning_rate": 8.606837599991194e-06,
"loss": 0.7720967531204224,
"step": 652
},
{
"epoch": 0.8473641524736415,
"grad_norm": 0.5939242243766785,
"learning_rate": 8.601882349053318e-06,
"loss": 0.7347517609596252,
"step": 653
},
{
"epoch": 0.848661800486618,
"grad_norm": 0.6451635360717773,
"learning_rate": 8.596919733177692e-06,
"loss": 0.6510732173919678,
"step": 654
},
{
"epoch": 0.8499594484995945,
"grad_norm": 0.6460222601890564,
"learning_rate": 8.591949762511606e-06,
"loss": 0.6970388293266296,
"step": 655
},
{
"epoch": 0.851257096512571,
"grad_norm": 0.5829662084579468,
"learning_rate": 8.586972447217392e-06,
"loss": 0.6706767678260803,
"step": 656
},
{
"epoch": 0.8525547445255475,
"grad_norm": 0.5833383798599243,
"learning_rate": 8.581987797472404e-06,
"loss": 0.7589589357376099,
"step": 657
},
{
"epoch": 0.853852392538524,
"grad_norm": 0.5842010974884033,
"learning_rate": 8.576995823468984e-06,
"loss": 0.7162166833877563,
"step": 658
},
{
"epoch": 0.8551500405515005,
"grad_norm": 0.5614502429962158,
"learning_rate": 8.571996535414457e-06,
"loss": 0.6840311288833618,
"step": 659
},
{
"epoch": 0.8564476885644768,
"grad_norm": 0.5722468495368958,
"learning_rate": 8.566989943531106e-06,
"loss": 0.7161433100700378,
"step": 660
},
{
"epoch": 0.8577453365774533,
"grad_norm": 0.6029196977615356,
"learning_rate": 8.561976058056138e-06,
"loss": 0.7230268716812134,
"step": 661
},
{
"epoch": 0.8590429845904298,
"grad_norm": 0.5787186622619629,
"learning_rate": 8.556954889241682e-06,
"loss": 0.7280833721160889,
"step": 662
},
{
"epoch": 0.8603406326034063,
"grad_norm": 0.6488873362541199,
"learning_rate": 8.551926447354759e-06,
"loss": 0.6804985404014587,
"step": 663
},
{
"epoch": 0.8616382806163828,
"grad_norm": 0.5842364430427551,
"learning_rate": 8.546890742677259e-06,
"loss": 0.669411301612854,
"step": 664
},
{
"epoch": 0.8629359286293593,
"grad_norm": 0.5956006646156311,
"learning_rate": 8.541847785505921e-06,
"loss": 0.7321279048919678,
"step": 665
},
{
"epoch": 0.8642335766423358,
"grad_norm": 3.8146164417266846,
"learning_rate": 8.53679758615232e-06,
"loss": 0.693459153175354,
"step": 666
},
{
"epoch": 0.8655312246553123,
"grad_norm": 0.7075020670890808,
"learning_rate": 8.531740154942834e-06,
"loss": 0.6751031875610352,
"step": 667
},
{
"epoch": 0.8668288726682887,
"grad_norm": 0.5840404629707336,
"learning_rate": 8.526675502218629e-06,
"loss": 0.7010972499847412,
"step": 668
},
{
"epoch": 0.8681265206812652,
"grad_norm": 0.5663997530937195,
"learning_rate": 8.521603638335638e-06,
"loss": 0.7152513265609741,
"step": 669
},
{
"epoch": 0.8694241686942417,
"grad_norm": 0.58479243516922,
"learning_rate": 8.516524573664539e-06,
"loss": 0.7431036233901978,
"step": 670
},
{
"epoch": 0.8707218167072182,
"grad_norm": 0.5867894887924194,
"learning_rate": 8.511438318590735e-06,
"loss": 0.6411721706390381,
"step": 671
},
{
"epoch": 0.8720194647201946,
"grad_norm": 0.595013439655304,
"learning_rate": 8.506344883514328e-06,
"loss": 0.6847820281982422,
"step": 672
},
{
"epoch": 0.8733171127331711,
"grad_norm": 0.6092846989631653,
"learning_rate": 8.501244278850105e-06,
"loss": 0.7914074659347534,
"step": 673
},
{
"epoch": 0.8746147607461476,
"grad_norm": 0.6108312606811523,
"learning_rate": 8.496136515027511e-06,
"loss": 0.7064344882965088,
"step": 674
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.6098673343658447,
"learning_rate": 8.491021602490632e-06,
"loss": 0.7082339525222778,
"step": 675
},
{
"epoch": 0.8772100567721006,
"grad_norm": 0.5852345824241638,
"learning_rate": 8.485899551698166e-06,
"loss": 0.6980363130569458,
"step": 676
},
{
"epoch": 0.878507704785077,
"grad_norm": 0.60945725440979,
"learning_rate": 8.480770373123415e-06,
"loss": 0.7337608933448792,
"step": 677
},
{
"epoch": 0.8798053527980535,
"grad_norm": 0.5622206926345825,
"learning_rate": 8.475634077254248e-06,
"loss": 0.7212387919425964,
"step": 678
},
{
"epoch": 0.88110300081103,
"grad_norm": 1.9474778175354004,
"learning_rate": 8.470490674593091e-06,
"loss": 0.7507941722869873,
"step": 679
},
{
"epoch": 0.8824006488240065,
"grad_norm": 0.5891706943511963,
"learning_rate": 8.4653401756569e-06,
"loss": 0.72685706615448,
"step": 680
},
{
"epoch": 0.883698296836983,
"grad_norm": 0.5848804116249084,
"learning_rate": 8.460182590977142e-06,
"loss": 0.7391736507415771,
"step": 681
},
{
"epoch": 0.8849959448499595,
"grad_norm": 0.5995469093322754,
"learning_rate": 8.455017931099772e-06,
"loss": 0.7077188491821289,
"step": 682
},
{
"epoch": 0.886293592862936,
"grad_norm": 0.5778690576553345,
"learning_rate": 8.449846206585211e-06,
"loss": 0.7160015106201172,
"step": 683
},
{
"epoch": 0.8875912408759125,
"grad_norm": 0.6114044785499573,
"learning_rate": 8.44466742800833e-06,
"loss": 0.7118149995803833,
"step": 684
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5748172998428345,
"learning_rate": 8.439481605958416e-06,
"loss": 0.7232242822647095,
"step": 685
},
{
"epoch": 0.8901865369018653,
"grad_norm": 0.9608264565467834,
"learning_rate": 8.434288751039168e-06,
"loss": 0.7293300032615662,
"step": 686
},
{
"epoch": 0.8914841849148418,
"grad_norm": 0.5927110910415649,
"learning_rate": 8.429088873868656e-06,
"loss": 0.7629004716873169,
"step": 687
},
{
"epoch": 0.8927818329278183,
"grad_norm": 0.5677574872970581,
"learning_rate": 8.423881985079315e-06,
"loss": 0.6493050456047058,
"step": 688
},
{
"epoch": 0.8940794809407948,
"grad_norm": 0.5510875582695007,
"learning_rate": 8.418668095317912e-06,
"loss": 0.6685976386070251,
"step": 689
},
{
"epoch": 0.8953771289537713,
"grad_norm": 0.5691307187080383,
"learning_rate": 8.413447215245534e-06,
"loss": 0.7029674053192139,
"step": 690
},
{
"epoch": 0.8953771289537713,
"eval_loss": 0.6914051175117493,
"eval_runtime": 73.0841,
"eval_samples_per_second": 71.041,
"eval_steps_per_second": 8.88,
"step": 690
},
{
"epoch": 0.8966747769667478,
"grad_norm": 0.5947213172912598,
"learning_rate": 8.408219355537557e-06,
"loss": 0.7144750356674194,
"step": 691
},
{
"epoch": 0.8979724249797243,
"grad_norm": 0.6758149266242981,
"learning_rate": 8.402984526883635e-06,
"loss": 0.7232916355133057,
"step": 692
},
{
"epoch": 0.8992700729927007,
"grad_norm": 0.6068633198738098,
"learning_rate": 8.397742739987664e-06,
"loss": 0.6896466612815857,
"step": 693
},
{
"epoch": 0.9005677210056772,
"grad_norm": 0.5855746865272522,
"learning_rate": 8.392494005567773e-06,
"loss": 0.7137375473976135,
"step": 694
},
{
"epoch": 0.9018653690186537,
"grad_norm": 0.6378610134124756,
"learning_rate": 8.387238334356294e-06,
"loss": 0.6991242170333862,
"step": 695
},
{
"epoch": 0.9031630170316302,
"grad_norm": 0.5615161657333374,
"learning_rate": 8.381975737099745e-06,
"loss": 0.7315720319747925,
"step": 696
},
{
"epoch": 0.9044606650446066,
"grad_norm": 0.5945183634757996,
"learning_rate": 8.376706224558807e-06,
"loss": 0.7387629151344299,
"step": 697
},
{
"epoch": 0.9057583130575831,
"grad_norm": 0.5757802724838257,
"learning_rate": 8.3714298075083e-06,
"loss": 0.769163191318512,
"step": 698
},
{
"epoch": 0.9070559610705596,
"grad_norm": 0.6023557186126709,
"learning_rate": 8.366146496737158e-06,
"loss": 0.7032333016395569,
"step": 699
},
{
"epoch": 0.9083536090835361,
"grad_norm": 0.5623191595077515,
"learning_rate": 8.360856303048417e-06,
"loss": 0.688059389591217,
"step": 700
},
{
"epoch": 0.9096512570965126,
"grad_norm": 0.6660424470901489,
"learning_rate": 8.355559237259181e-06,
"loss": 0.6570596098899841,
"step": 701
},
{
"epoch": 0.910948905109489,
"grad_norm": 0.6358682513237,
"learning_rate": 8.350255310200611e-06,
"loss": 0.6851440668106079,
"step": 702
},
{
"epoch": 0.9122465531224655,
"grad_norm": 0.5915968418121338,
"learning_rate": 8.344944532717898e-06,
"loss": 0.7370898127555847,
"step": 703
},
{
"epoch": 0.913544201135442,
"grad_norm": 0.6724914908409119,
"learning_rate": 8.339626915670234e-06,
"loss": 0.6419695615768433,
"step": 704
},
{
"epoch": 0.9148418491484185,
"grad_norm": 0.5758830308914185,
"learning_rate": 8.3343024699308e-06,
"loss": 0.7100552320480347,
"step": 705
},
{
"epoch": 0.916139497161395,
"grad_norm": 0.5856196284294128,
"learning_rate": 8.328971206386742e-06,
"loss": 0.7285655736923218,
"step": 706
},
{
"epoch": 0.9174371451743715,
"grad_norm": 0.6096091270446777,
"learning_rate": 8.323633135939145e-06,
"loss": 0.7508881092071533,
"step": 707
},
{
"epoch": 0.918734793187348,
"grad_norm": 0.5876352787017822,
"learning_rate": 8.318288269503007e-06,
"loss": 0.7147477865219116,
"step": 708
},
{
"epoch": 0.9200324412003245,
"grad_norm": 0.5633363127708435,
"learning_rate": 8.312936618007232e-06,
"loss": 0.7191579937934875,
"step": 709
},
{
"epoch": 0.9213300892133008,
"grad_norm": 0.6324480772018433,
"learning_rate": 8.307578192394592e-06,
"loss": 0.6980431079864502,
"step": 710
},
{
"epoch": 0.9226277372262773,
"grad_norm": 0.559508740901947,
"learning_rate": 8.30221300362171e-06,
"loss": 0.6977928280830383,
"step": 711
},
{
"epoch": 0.9239253852392538,
"grad_norm": 0.5924115180969238,
"learning_rate": 8.29684106265904e-06,
"loss": 0.7254680395126343,
"step": 712
},
{
"epoch": 0.9252230332522303,
"grad_norm": 0.5572075843811035,
"learning_rate": 8.291462380490842e-06,
"loss": 0.7060861587524414,
"step": 713
},
{
"epoch": 0.9265206812652068,
"grad_norm": 0.5710304975509644,
"learning_rate": 8.286076968115158e-06,
"loss": 0.6528699398040771,
"step": 714
},
{
"epoch": 0.9278183292781833,
"grad_norm": 0.7677385210990906,
"learning_rate": 8.280684836543794e-06,
"loss": 0.7742418646812439,
"step": 715
},
{
"epoch": 0.9291159772911598,
"grad_norm": 0.5909350514411926,
"learning_rate": 8.275285996802293e-06,
"loss": 0.7355895042419434,
"step": 716
},
{
"epoch": 0.9304136253041363,
"grad_norm": 0.6246051788330078,
"learning_rate": 8.269880459929919e-06,
"loss": 0.7119331955909729,
"step": 717
},
{
"epoch": 0.9317112733171128,
"grad_norm": 1.3237872123718262,
"learning_rate": 8.264468236979626e-06,
"loss": 0.724329948425293,
"step": 718
},
{
"epoch": 0.9330089213300892,
"grad_norm": 0.6042487621307373,
"learning_rate": 8.259049339018036e-06,
"loss": 0.7507586479187012,
"step": 719
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.6646915078163147,
"learning_rate": 8.25362377712543e-06,
"loss": 0.7630937695503235,
"step": 720
},
{
"epoch": 0.9356042173560422,
"grad_norm": 1.2076338529586792,
"learning_rate": 8.248191562395703e-06,
"loss": 0.6889426708221436,
"step": 721
},
{
"epoch": 0.9369018653690186,
"grad_norm": 0.7128719091415405,
"learning_rate": 8.242752705936363e-06,
"loss": 0.7193243503570557,
"step": 722
},
{
"epoch": 0.9381995133819951,
"grad_norm": 0.5779634714126587,
"learning_rate": 8.237307218868493e-06,
"loss": 0.7252578735351562,
"step": 723
},
{
"epoch": 0.9394971613949716,
"grad_norm": 0.5774085521697998,
"learning_rate": 8.231855112326738e-06,
"loss": 0.7056664228439331,
"step": 724
},
{
"epoch": 0.9407948094079481,
"grad_norm": 0.5759864449501038,
"learning_rate": 8.226396397459272e-06,
"loss": 0.7182119488716125,
"step": 725
},
{
"epoch": 0.9420924574209246,
"grad_norm": 0.5475362539291382,
"learning_rate": 8.22093108542779e-06,
"loss": 0.7100398540496826,
"step": 726
},
{
"epoch": 0.943390105433901,
"grad_norm": 0.6080360412597656,
"learning_rate": 8.215459187407468e-06,
"loss": 0.7540023326873779,
"step": 727
},
{
"epoch": 0.9446877534468775,
"grad_norm": 0.5985339283943176,
"learning_rate": 8.209980714586955e-06,
"loss": 0.7655041217803955,
"step": 728
},
{
"epoch": 0.945985401459854,
"grad_norm": 0.5587835311889648,
"learning_rate": 8.20449567816834e-06,
"loss": 0.7308551669120789,
"step": 729
},
{
"epoch": 0.9472830494728305,
"grad_norm": 0.5767388939857483,
"learning_rate": 8.199004089367136e-06,
"loss": 0.7747267484664917,
"step": 730
},
{
"epoch": 0.948580697485807,
"grad_norm": 0.5542681217193604,
"learning_rate": 8.193505959412246e-06,
"loss": 0.7009122371673584,
"step": 731
},
{
"epoch": 0.9498783454987835,
"grad_norm": 0.7035977244377136,
"learning_rate": 8.188001299545963e-06,
"loss": 0.7160595655441284,
"step": 732
},
{
"epoch": 0.95117599351176,
"grad_norm": 3.6369824409484863,
"learning_rate": 8.182490121023918e-06,
"loss": 0.7146700620651245,
"step": 733
},
{
"epoch": 0.9524736415247365,
"grad_norm": 0.6017202734947205,
"learning_rate": 8.176972435115075e-06,
"loss": 0.7427970170974731,
"step": 734
},
{
"epoch": 0.9537712895377128,
"grad_norm": 0.5797709822654724,
"learning_rate": 8.17144825310171e-06,
"loss": 0.7534258365631104,
"step": 735
},
{
"epoch": 0.9550689375506893,
"grad_norm": 0.6132066249847412,
"learning_rate": 8.165917586279374e-06,
"loss": 0.6742781400680542,
"step": 736
},
{
"epoch": 0.9563665855636658,
"grad_norm": 0.5700656175613403,
"learning_rate": 8.16038044595688e-06,
"loss": 0.7190455794334412,
"step": 737
},
{
"epoch": 0.9576642335766423,
"grad_norm": 0.5793234705924988,
"learning_rate": 8.15483684345628e-06,
"loss": 0.7258193492889404,
"step": 738
},
{
"epoch": 0.9589618815896188,
"grad_norm": 0.589043378829956,
"learning_rate": 8.149286790112838e-06,
"loss": 0.6817978620529175,
"step": 739
},
{
"epoch": 0.9602595296025953,
"grad_norm": 0.5883787870407104,
"learning_rate": 8.143730297275008e-06,
"loss": 0.6951944828033447,
"step": 740
},
{
"epoch": 0.9615571776155718,
"grad_norm": 0.6058008074760437,
"learning_rate": 8.138167376304411e-06,
"loss": 0.7065063118934631,
"step": 741
},
{
"epoch": 0.9628548256285483,
"grad_norm": 0.5645580291748047,
"learning_rate": 8.132598038575814e-06,
"loss": 0.6607494354248047,
"step": 742
},
{
"epoch": 0.9641524736415248,
"grad_norm": 0.5984307527542114,
"learning_rate": 8.1270222954771e-06,
"loss": 0.7731702327728271,
"step": 743
},
{
"epoch": 0.9654501216545012,
"grad_norm": 0.5940436124801636,
"learning_rate": 8.121440158409255e-06,
"loss": 0.7217580080032349,
"step": 744
},
{
"epoch": 0.9667477696674777,
"grad_norm": 0.6139102578163147,
"learning_rate": 8.115851638786335e-06,
"loss": 0.761775553226471,
"step": 745
},
{
"epoch": 0.9680454176804542,
"grad_norm": 0.5621196627616882,
"learning_rate": 8.11025674803545e-06,
"loss": 0.7084890007972717,
"step": 746
},
{
"epoch": 0.9693430656934306,
"grad_norm": 0.634238064289093,
"learning_rate": 8.104655497596734e-06,
"loss": 0.7413675785064697,
"step": 747
},
{
"epoch": 0.9706407137064071,
"grad_norm": 0.6062578558921814,
"learning_rate": 8.099047898923326e-06,
"loss": 0.6940469741821289,
"step": 748
},
{
"epoch": 0.9719383617193836,
"grad_norm": 1.2983204126358032,
"learning_rate": 8.093433963481348e-06,
"loss": 0.7091077566146851,
"step": 749
},
{
"epoch": 0.9732360097323601,
"grad_norm": 0.5655047297477722,
"learning_rate": 8.087813702749873e-06,
"loss": 0.7066688537597656,
"step": 750
},
{
"epoch": 0.9745336577453366,
"grad_norm": 0.6067200303077698,
"learning_rate": 8.082187128220918e-06,
"loss": 0.7150874137878418,
"step": 751
},
{
"epoch": 0.975831305758313,
"grad_norm": 0.5860595107078552,
"learning_rate": 8.076554251399398e-06,
"loss": 0.7268061637878418,
"step": 752
},
{
"epoch": 0.9771289537712895,
"grad_norm": 0.5691843628883362,
"learning_rate": 8.070915083803124e-06,
"loss": 0.7130003571510315,
"step": 753
},
{
"epoch": 0.978426601784266,
"grad_norm": 0.5511523485183716,
"learning_rate": 8.065269636962765e-06,
"loss": 0.7632818222045898,
"step": 754
},
{
"epoch": 0.9797242497972425,
"grad_norm": 0.9720051884651184,
"learning_rate": 8.059617922421832e-06,
"loss": 0.6920190453529358,
"step": 755
},
{
"epoch": 0.981021897810219,
"grad_norm": 0.9689953327178955,
"learning_rate": 8.053959951736647e-06,
"loss": 0.7026671171188354,
"step": 756
},
{
"epoch": 0.9823195458231955,
"grad_norm": 0.5877639055252075,
"learning_rate": 8.048295736476332e-06,
"loss": 0.7458422780036926,
"step": 757
},
{
"epoch": 0.983617193836172,
"grad_norm": 0.5555517077445984,
"learning_rate": 8.042625288222774e-06,
"loss": 0.6832958459854126,
"step": 758
},
{
"epoch": 0.9849148418491485,
"grad_norm": 0.5778935551643372,
"learning_rate": 8.036948618570601e-06,
"loss": 0.6715413331985474,
"step": 759
},
{
"epoch": 0.986212489862125,
"grad_norm": 0.5913302898406982,
"learning_rate": 8.031265739127167e-06,
"loss": 0.6345862150192261,
"step": 760
},
{
"epoch": 0.9875101378751013,
"grad_norm": 0.5491726994514465,
"learning_rate": 8.025576661512524e-06,
"loss": 0.6723500490188599,
"step": 761
},
{
"epoch": 0.9888077858880778,
"grad_norm": 0.5520846247673035,
"learning_rate": 8.019881397359395e-06,
"loss": 0.7205091118812561,
"step": 762
},
{
"epoch": 0.9901054339010543,
"grad_norm": 0.5902574062347412,
"learning_rate": 8.014179958313154e-06,
"loss": 0.7127419114112854,
"step": 763
},
{
"epoch": 0.9914030819140308,
"grad_norm": 0.5558638572692871,
"learning_rate": 8.008472356031795e-06,
"loss": 0.6300485134124756,
"step": 764
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.5584984421730042,
"learning_rate": 8.00275860218593e-06,
"loss": 0.6915569305419922,
"step": 765
},
{
"epoch": 0.9939983779399838,
"grad_norm": 0.5804587006568909,
"learning_rate": 7.99703870845873e-06,
"loss": 0.7401936054229736,
"step": 766
},
{
"epoch": 0.9952960259529603,
"grad_norm": 0.562065064907074,
"learning_rate": 7.991312686545939e-06,
"loss": 0.6845479011535645,
"step": 767
},
{
"epoch": 0.9965936739659368,
"grad_norm": 0.5887646079063416,
"learning_rate": 7.985580548155814e-06,
"loss": 0.7238905429840088,
"step": 768
},
{
"epoch": 0.9978913219789132,
"grad_norm": 0.70610111951828,
"learning_rate": 7.979842305009133e-06,
"loss": 0.6573514342308044,
"step": 769
},
{
"epoch": 0.9991889699918897,
"grad_norm": 0.5765895843505859,
"learning_rate": 7.974097968839149e-06,
"loss": 0.6816248297691345,
"step": 770
},
{
"epoch": 1.0,
"grad_norm": 0.7250688672065735,
"learning_rate": 7.968347551391574e-06,
"loss": 0.6674489974975586,
"step": 771
},
{
"epoch": 1.0012976480129765,
"grad_norm": 0.6792595982551575,
"learning_rate": 7.962591064424558e-06,
"loss": 0.6514409184455872,
"step": 772
},
{
"epoch": 1.002595296025953,
"grad_norm": 0.7125512361526489,
"learning_rate": 7.95682851970866e-06,
"loss": 0.6212759613990784,
"step": 773
},
{
"epoch": 1.0038929440389295,
"grad_norm": 0.6438767313957214,
"learning_rate": 7.951059929026826e-06,
"loss": 0.6282512545585632,
"step": 774
},
{
"epoch": 1.005190592051906,
"grad_norm": 0.572353720664978,
"learning_rate": 7.94528530417436e-06,
"loss": 0.6370311379432678,
"step": 775
},
{
"epoch": 1.0064882400648825,
"grad_norm": 0.5794159173965454,
"learning_rate": 7.939504656958913e-06,
"loss": 0.6351627707481384,
"step": 776
},
{
"epoch": 1.007785888077859,
"grad_norm": 0.6709707379341125,
"learning_rate": 7.933717999200442e-06,
"loss": 0.7240197658538818,
"step": 777
},
{
"epoch": 1.0090835360908355,
"grad_norm": 0.7591879963874817,
"learning_rate": 7.927925342731202e-06,
"loss": 0.662930428981781,
"step": 778
},
{
"epoch": 1.010381184103812,
"grad_norm": 0.6731166243553162,
"learning_rate": 7.922126699395705e-06,
"loss": 0.6665748357772827,
"step": 779
},
{
"epoch": 1.0116788321167882,
"grad_norm": 0.6249240040779114,
"learning_rate": 7.916322081050708e-06,
"loss": 0.6313880681991577,
"step": 780
},
{
"epoch": 1.0129764801297647,
"grad_norm": 0.6070784330368042,
"learning_rate": 7.910511499565192e-06,
"loss": 0.5778607130050659,
"step": 781
},
{
"epoch": 1.0142741281427412,
"grad_norm": 0.5682867765426636,
"learning_rate": 7.90469496682032e-06,
"loss": 0.5984998941421509,
"step": 782
},
{
"epoch": 1.0155717761557177,
"grad_norm": 0.5944799184799194,
"learning_rate": 7.89887249470943e-06,
"loss": 0.6242648363113403,
"step": 783
},
{
"epoch": 1.0168694241686942,
"grad_norm": 0.8286924958229065,
"learning_rate": 7.89304409513801e-06,
"loss": 0.612074613571167,
"step": 784
},
{
"epoch": 1.0181670721816707,
"grad_norm": 0.6117927432060242,
"learning_rate": 7.887209780023652e-06,
"loss": 0.6674654483795166,
"step": 785
},
{
"epoch": 1.0194647201946472,
"grad_norm": 0.6768798828125,
"learning_rate": 7.881369561296061e-06,
"loss": 0.6811670660972595,
"step": 786
},
{
"epoch": 1.0207623682076237,
"grad_norm": 0.6664367914199829,
"learning_rate": 7.875523450897004e-06,
"loss": 0.638746440410614,
"step": 787
},
{
"epoch": 1.0220600162206002,
"grad_norm": 1.1638799905776978,
"learning_rate": 7.869671460780297e-06,
"loss": 0.6403613090515137,
"step": 788
},
{
"epoch": 1.0233576642335767,
"grad_norm": 0.5986616015434265,
"learning_rate": 7.863813602911777e-06,
"loss": 0.6099958419799805,
"step": 789
},
{
"epoch": 1.0246553122465532,
"grad_norm": 1.8672071695327759,
"learning_rate": 7.857949889269285e-06,
"loss": 0.6486390829086304,
"step": 790
},
{
"epoch": 1.0259529602595296,
"grad_norm": 0.6674206852912903,
"learning_rate": 7.852080331842627e-06,
"loss": 0.5824840664863586,
"step": 791
},
{
"epoch": 1.0272506082725061,
"grad_norm": 0.6552616953849792,
"learning_rate": 7.846204942633564e-06,
"loss": 0.7385782599449158,
"step": 792
},
{
"epoch": 1.0285482562854826,
"grad_norm": 0.636968195438385,
"learning_rate": 7.84032373365578e-06,
"loss": 0.6557282209396362,
"step": 793
},
{
"epoch": 1.0298459042984591,
"grad_norm": 0.5769335627555847,
"learning_rate": 7.834436716934859e-06,
"loss": 0.5607404708862305,
"step": 794
},
{
"epoch": 1.0311435523114356,
"grad_norm": 0.6747480034828186,
"learning_rate": 7.828543904508258e-06,
"loss": 0.6176875829696655,
"step": 795
},
{
"epoch": 1.0324412003244121,
"grad_norm": 0.5826826691627502,
"learning_rate": 7.82264530842529e-06,
"loss": 0.6352604627609253,
"step": 796
},
{
"epoch": 1.0337388483373884,
"grad_norm": 0.5748003721237183,
"learning_rate": 7.816740940747089e-06,
"loss": 0.5930640697479248,
"step": 797
},
{
"epoch": 1.0350364963503649,
"grad_norm": 0.5976374745368958,
"learning_rate": 7.810830813546594e-06,
"loss": 0.6040553450584412,
"step": 798
},
{
"epoch": 1.0363341443633414,
"grad_norm": 0.5924686789512634,
"learning_rate": 7.80491493890852e-06,
"loss": 0.6496337652206421,
"step": 799
},
{
"epoch": 1.0376317923763179,
"grad_norm": 0.5696931481361389,
"learning_rate": 7.798993328929328e-06,
"loss": 0.6347925662994385,
"step": 800
},
{
"epoch": 1.0389294403892944,
"grad_norm": 0.5750864148139954,
"learning_rate": 7.793065995717217e-06,
"loss": 0.6404843330383301,
"step": 801
},
{
"epoch": 1.0402270884022708,
"grad_norm": 0.5975061058998108,
"learning_rate": 7.787132951392082e-06,
"loss": 0.5997766256332397,
"step": 802
},
{
"epoch": 1.0415247364152473,
"grad_norm": 0.6157170534133911,
"learning_rate": 7.781194208085495e-06,
"loss": 0.6501672267913818,
"step": 803
},
{
"epoch": 1.0428223844282238,
"grad_norm": 0.6032687425613403,
"learning_rate": 7.775249777940685e-06,
"loss": 0.6564816832542419,
"step": 804
},
{
"epoch": 1.0441200324412003,
"grad_norm": 0.5874586701393127,
"learning_rate": 7.769299673112507e-06,
"loss": 0.6064618825912476,
"step": 805
},
{
"epoch": 1.0454176804541768,
"grad_norm": 0.6239724159240723,
"learning_rate": 7.76334390576742e-06,
"loss": 0.6170182228088379,
"step": 806
},
{
"epoch": 1.0467153284671533,
"grad_norm": 0.6056293845176697,
"learning_rate": 7.757382488083458e-06,
"loss": 0.7019131183624268,
"step": 807
},
{
"epoch": 1.0480129764801298,
"grad_norm": 0.5994875431060791,
"learning_rate": 7.751415432250213e-06,
"loss": 0.6316931247711182,
"step": 808
},
{
"epoch": 1.0493106244931063,
"grad_norm": 0.6516374945640564,
"learning_rate": 7.745442750468803e-06,
"loss": 0.649019718170166,
"step": 809
},
{
"epoch": 1.0506082725060828,
"grad_norm": 0.5792532563209534,
"learning_rate": 7.739464454951853e-06,
"loss": 0.6500118374824524,
"step": 810
},
{
"epoch": 1.0519059205190593,
"grad_norm": 0.745469331741333,
"learning_rate": 7.733480557923464e-06,
"loss": 0.5821675658226013,
"step": 811
},
{
"epoch": 1.0532035685320358,
"grad_norm": 0.6124119162559509,
"learning_rate": 7.727491071619186e-06,
"loss": 0.6384508609771729,
"step": 812
},
{
"epoch": 1.0545012165450123,
"grad_norm": 0.5831156969070435,
"learning_rate": 7.72149600828601e-06,
"loss": 0.6578410267829895,
"step": 813
},
{
"epoch": 1.0557988645579885,
"grad_norm": 0.605689287185669,
"learning_rate": 7.715495380182314e-06,
"loss": 0.6352893710136414,
"step": 814
},
{
"epoch": 1.057096512570965,
"grad_norm": 0.5769819617271423,
"learning_rate": 7.709489199577874e-06,
"loss": 0.5956138372421265,
"step": 815
},
{
"epoch": 1.0583941605839415,
"grad_norm": 1.2673306465148926,
"learning_rate": 7.7034774787538e-06,
"loss": 0.6302381753921509,
"step": 816
},
{
"epoch": 1.059691808596918,
"grad_norm": 0.5970334410667419,
"learning_rate": 7.697460230002545e-06,
"loss": 0.6213703751564026,
"step": 817
},
{
"epoch": 1.0609894566098945,
"grad_norm": 0.5932973623275757,
"learning_rate": 7.691437465627859e-06,
"loss": 0.6656537652015686,
"step": 818
},
{
"epoch": 1.062287104622871,
"grad_norm": 0.5778910517692566,
"learning_rate": 7.685409197944768e-06,
"loss": 0.6016901135444641,
"step": 819
},
{
"epoch": 1.0635847526358475,
"grad_norm": 0.6970887780189514,
"learning_rate": 7.679375439279557e-06,
"loss": 0.6404139995574951,
"step": 820
},
{
"epoch": 1.064882400648824,
"grad_norm": 0.8317319750785828,
"learning_rate": 7.673336201969733e-06,
"loss": 0.670491099357605,
"step": 821
},
{
"epoch": 1.0661800486618005,
"grad_norm": 0.5904209613800049,
"learning_rate": 7.667291498364009e-06,
"loss": 0.697813868522644,
"step": 822
},
{
"epoch": 1.067477696674777,
"grad_norm": 0.6368371844291687,
"learning_rate": 7.661241340822274e-06,
"loss": 0.6957151889801025,
"step": 823
},
{
"epoch": 1.0687753446877535,
"grad_norm": 0.6323496103286743,
"learning_rate": 7.655185741715569e-06,
"loss": 0.6282387375831604,
"step": 824
},
{
"epoch": 1.07007299270073,
"grad_norm": 0.582459568977356,
"learning_rate": 7.64912471342606e-06,
"loss": 0.6632883548736572,
"step": 825
},
{
"epoch": 1.0713706407137065,
"grad_norm": 0.5815753936767578,
"learning_rate": 7.643058268347015e-06,
"loss": 0.6437957882881165,
"step": 826
},
{
"epoch": 1.072668288726683,
"grad_norm": 0.5913931131362915,
"learning_rate": 7.636986418882783e-06,
"loss": 0.6558079719543457,
"step": 827
},
{
"epoch": 1.0739659367396595,
"grad_norm": 0.5545955300331116,
"learning_rate": 7.630909177448755e-06,
"loss": 0.6246286630630493,
"step": 828
},
{
"epoch": 1.075263584752636,
"grad_norm": 0.5951606631278992,
"learning_rate": 7.624826556471354e-06,
"loss": 0.6540351510047913,
"step": 829
},
{
"epoch": 1.0765612327656124,
"grad_norm": 0.6533515453338623,
"learning_rate": 7.618738568388e-06,
"loss": 0.6222127676010132,
"step": 830
},
{
"epoch": 1.0778588807785887,
"grad_norm": 0.5797233581542969,
"learning_rate": 7.612645225647086e-06,
"loss": 0.5815407037734985,
"step": 831
},
{
"epoch": 1.0791565287915652,
"grad_norm": 0.6024124622344971,
"learning_rate": 7.60654654070796e-06,
"loss": 0.609170138835907,
"step": 832
},
{
"epoch": 1.0804541768045417,
"grad_norm": 0.6007437109947205,
"learning_rate": 7.600442526040883e-06,
"loss": 0.6566615104675293,
"step": 833
},
{
"epoch": 1.0817518248175182,
"grad_norm": 0.6132609844207764,
"learning_rate": 7.594333194127025e-06,
"loss": 0.6762999892234802,
"step": 834
},
{
"epoch": 1.0830494728304947,
"grad_norm": 0.6206640005111694,
"learning_rate": 7.58821855745842e-06,
"loss": 0.6008488535881042,
"step": 835
},
{
"epoch": 1.0843471208434712,
"grad_norm": 0.5727500319480896,
"learning_rate": 7.582098628537955e-06,
"loss": 0.6291306018829346,
"step": 836
},
{
"epoch": 1.0856447688564477,
"grad_norm": 0.5835679769515991,
"learning_rate": 7.5759734198793365e-06,
"loss": 0.598922848701477,
"step": 837
},
{
"epoch": 1.0869424168694242,
"grad_norm": 0.6435012817382812,
"learning_rate": 7.5698429440070616e-06,
"loss": 0.6742567420005798,
"step": 838
},
{
"epoch": 1.0882400648824007,
"grad_norm": 0.6521117687225342,
"learning_rate": 7.563707213456405e-06,
"loss": 0.7133705615997314,
"step": 839
},
{
"epoch": 1.0895377128953772,
"grad_norm": 0.6230207085609436,
"learning_rate": 7.5575662407733815e-06,
"loss": 0.6346240043640137,
"step": 840
},
{
"epoch": 1.0908353609083536,
"grad_norm": 0.6041070818901062,
"learning_rate": 7.551420038514726e-06,
"loss": 0.5786027908325195,
"step": 841
},
{
"epoch": 1.0921330089213301,
"grad_norm": 0.6142879724502563,
"learning_rate": 7.54526861924787e-06,
"loss": 0.689670205116272,
"step": 842
},
{
"epoch": 1.0934306569343066,
"grad_norm": 0.5727767944335938,
"learning_rate": 7.5391119955509026e-06,
"loss": 0.6093534827232361,
"step": 843
},
{
"epoch": 1.0947283049472831,
"grad_norm": 0.5920162796974182,
"learning_rate": 7.532950180012564e-06,
"loss": 0.6508292555809021,
"step": 844
},
{
"epoch": 1.0960259529602596,
"grad_norm": 0.6140349507331848,
"learning_rate": 7.526783185232208e-06,
"loss": 0.6522685885429382,
"step": 845
},
{
"epoch": 1.0973236009732361,
"grad_norm": 0.6111754179000854,
"learning_rate": 7.520611023819779e-06,
"loss": 0.6456558704376221,
"step": 846
},
{
"epoch": 1.0986212489862126,
"grad_norm": 0.5693365931510925,
"learning_rate": 7.514433708395783e-06,
"loss": 0.6057475805282593,
"step": 847
},
{
"epoch": 1.0999188969991889,
"grad_norm": 0.6043863892555237,
"learning_rate": 7.508251251591266e-06,
"loss": 0.6344411969184875,
"step": 848
},
{
"epoch": 1.1012165450121654,
"grad_norm": 0.6892386078834534,
"learning_rate": 7.5020636660477894e-06,
"loss": 0.6500993371009827,
"step": 849
},
{
"epoch": 1.1025141930251419,
"grad_norm": 0.6054773926734924,
"learning_rate": 7.4958709644174e-06,
"loss": 0.6792426109313965,
"step": 850
},
{
"epoch": 1.1038118410381184,
"grad_norm": 0.6106455326080322,
"learning_rate": 7.4896731593626015e-06,
"loss": 0.648511528968811,
"step": 851
},
{
"epoch": 1.1051094890510949,
"grad_norm": 0.5832105875015259,
"learning_rate": 7.4834702635563395e-06,
"loss": 0.6617711782455444,
"step": 852
},
{
"epoch": 1.1064071370640713,
"grad_norm": 0.668353259563446,
"learning_rate": 7.477262289681966e-06,
"loss": 0.6955296397209167,
"step": 853
},
{
"epoch": 1.1077047850770478,
"grad_norm": 0.5962719917297363,
"learning_rate": 7.471049250433214e-06,
"loss": 0.680686354637146,
"step": 854
},
{
"epoch": 1.1090024330900243,
"grad_norm": 0.6140416860580444,
"learning_rate": 7.464831158514179e-06,
"loss": 0.6445127725601196,
"step": 855
},
{
"epoch": 1.1103000811030008,
"grad_norm": 0.6690049171447754,
"learning_rate": 7.458608026639285e-06,
"loss": 0.6185108423233032,
"step": 856
},
{
"epoch": 1.1115977291159773,
"grad_norm": 0.7241218090057373,
"learning_rate": 7.45237986753326e-06,
"loss": 0.6828392744064331,
"step": 857
},
{
"epoch": 1.1128953771289538,
"grad_norm": 0.6075162887573242,
"learning_rate": 7.446146693931111e-06,
"loss": 0.6688688397407532,
"step": 858
},
{
"epoch": 1.1141930251419303,
"grad_norm": 0.7877935767173767,
"learning_rate": 7.439908518578105e-06,
"loss": 0.6596081852912903,
"step": 859
},
{
"epoch": 1.1154906731549068,
"grad_norm": 0.5754934549331665,
"learning_rate": 7.433665354229731e-06,
"loss": 0.655542254447937,
"step": 860
},
{
"epoch": 1.1167883211678833,
"grad_norm": 0.6457986831665039,
"learning_rate": 7.4274172136516766e-06,
"loss": 0.6543152928352356,
"step": 861
},
{
"epoch": 1.1180859691808598,
"grad_norm": 0.5904266238212585,
"learning_rate": 7.421164109619809e-06,
"loss": 0.6421469449996948,
"step": 862
},
{
"epoch": 1.119383617193836,
"grad_norm": 0.5537955164909363,
"learning_rate": 7.4149060549201455e-06,
"loss": 0.609650194644928,
"step": 863
},
{
"epoch": 1.1206812652068125,
"grad_norm": 0.5964105129241943,
"learning_rate": 7.408643062348824e-06,
"loss": 0.6043794751167297,
"step": 864
},
{
"epoch": 1.121978913219789,
"grad_norm": 0.5994772911071777,
"learning_rate": 7.402375144712075e-06,
"loss": 0.6849918365478516,
"step": 865
},
{
"epoch": 1.1232765612327655,
"grad_norm": 0.6322051286697388,
"learning_rate": 7.396102314826207e-06,
"loss": 0.6219741106033325,
"step": 866
},
{
"epoch": 1.124574209245742,
"grad_norm": 0.5794394016265869,
"learning_rate": 7.389824585517569e-06,
"loss": 0.6507738828659058,
"step": 867
},
{
"epoch": 1.1258718572587185,
"grad_norm": 0.6662233471870422,
"learning_rate": 7.3835419696225275e-06,
"loss": 0.6731002330780029,
"step": 868
},
{
"epoch": 1.127169505271695,
"grad_norm": 0.5842033624649048,
"learning_rate": 7.377254479987445e-06,
"loss": 0.6546036005020142,
"step": 869
},
{
"epoch": 1.1284671532846715,
"grad_norm": 2.6347815990448,
"learning_rate": 7.370962129468642e-06,
"loss": 0.61831134557724,
"step": 870
},
{
"epoch": 1.129764801297648,
"grad_norm": 0.6191915273666382,
"learning_rate": 7.364664930932385e-06,
"loss": 0.682953953742981,
"step": 871
},
{
"epoch": 1.1310624493106245,
"grad_norm": 0.6216323375701904,
"learning_rate": 7.35836289725485e-06,
"loss": 0.6735019087791443,
"step": 872
},
{
"epoch": 1.132360097323601,
"grad_norm": 0.5958914756774902,
"learning_rate": 7.352056041322103e-06,
"loss": 0.6420754194259644,
"step": 873
},
{
"epoch": 1.1336577453365775,
"grad_norm": 0.5970807671546936,
"learning_rate": 7.345744376030066e-06,
"loss": 0.6589509844779968,
"step": 874
},
{
"epoch": 1.134955393349554,
"grad_norm": 0.6387295126914978,
"learning_rate": 7.339427914284498e-06,
"loss": 0.5913777351379395,
"step": 875
},
{
"epoch": 1.1362530413625305,
"grad_norm": 1.3676766157150269,
"learning_rate": 7.3331066690009644e-06,
"loss": 0.6156778931617737,
"step": 876
},
{
"epoch": 1.137550689375507,
"grad_norm": 0.5990293025970459,
"learning_rate": 7.326780653104813e-06,
"loss": 0.6320254802703857,
"step": 877
},
{
"epoch": 1.1388483373884835,
"grad_norm": 0.6619262099266052,
"learning_rate": 7.320449879531143e-06,
"loss": 0.6741781830787659,
"step": 878
},
{
"epoch": 1.14014598540146,
"grad_norm": 0.6091610193252563,
"learning_rate": 7.314114361224785e-06,
"loss": 0.6403502821922302,
"step": 879
},
{
"epoch": 1.1414436334144362,
"grad_norm": 0.6015101075172424,
"learning_rate": 7.30777411114027e-06,
"loss": 0.6477581858634949,
"step": 880
},
{
"epoch": 1.142741281427413,
"grad_norm": 0.5771135687828064,
"learning_rate": 7.301429142241805e-06,
"loss": 0.5903566479682922,
"step": 881
},
{
"epoch": 1.1440389294403892,
"grad_norm": 0.571612536907196,
"learning_rate": 7.295079467503247e-06,
"loss": 0.5671682357788086,
"step": 882
},
{
"epoch": 1.1453365774533657,
"grad_norm": 0.7478623390197754,
"learning_rate": 7.288725099908071e-06,
"loss": 0.6659491658210754,
"step": 883
},
{
"epoch": 1.1466342254663422,
"grad_norm": 0.6303284764289856,
"learning_rate": 7.282366052449351e-06,
"loss": 0.7001731395721436,
"step": 884
},
{
"epoch": 1.1479318734793187,
"grad_norm": 0.5829930901527405,
"learning_rate": 7.276002338129731e-06,
"loss": 0.632986843585968,
"step": 885
},
{
"epoch": 1.1492295214922952,
"grad_norm": 0.6018064022064209,
"learning_rate": 7.269633969961395e-06,
"loss": 0.6848266124725342,
"step": 886
},
{
"epoch": 1.1505271695052717,
"grad_norm": 0.7479543089866638,
"learning_rate": 7.2632609609660456e-06,
"loss": 0.6810072064399719,
"step": 887
},
{
"epoch": 1.1518248175182482,
"grad_norm": 0.5979959964752197,
"learning_rate": 7.256883324174871e-06,
"loss": 0.59900963306427,
"step": 888
},
{
"epoch": 1.1531224655312247,
"grad_norm": 0.608985424041748,
"learning_rate": 7.250501072628524e-06,
"loss": 0.6502770185470581,
"step": 889
},
{
"epoch": 1.1544201135442012,
"grad_norm": 0.5771687626838684,
"learning_rate": 7.2441142193770955e-06,
"loss": 0.6427179574966431,
"step": 890
},
{
"epoch": 1.1557177615571776,
"grad_norm": 0.7472683787345886,
"learning_rate": 7.237722777480083e-06,
"loss": 0.6853768825531006,
"step": 891
},
{
"epoch": 1.1570154095701541,
"grad_norm": 0.5946991443634033,
"learning_rate": 7.231326760006368e-06,
"loss": 0.6969834566116333,
"step": 892
},
{
"epoch": 1.1583130575831306,
"grad_norm": 0.6238925457000732,
"learning_rate": 7.224926180034186e-06,
"loss": 0.6919976472854614,
"step": 893
},
{
"epoch": 1.1596107055961071,
"grad_norm": 0.6162919402122498,
"learning_rate": 7.218521050651106e-06,
"loss": 0.6636837720870972,
"step": 894
},
{
"epoch": 1.1609083536090836,
"grad_norm": 0.5723338723182678,
"learning_rate": 7.212111384953993e-06,
"loss": 0.6149659156799316,
"step": 895
},
{
"epoch": 1.1622060016220601,
"grad_norm": 0.6074439883232117,
"learning_rate": 7.205697196048992e-06,
"loss": 0.6255541443824768,
"step": 896
},
{
"epoch": 1.1635036496350364,
"grad_norm": 0.6277779936790466,
"learning_rate": 7.199278497051498e-06,
"loss": 0.6648150086402893,
"step": 897
},
{
"epoch": 1.164801297648013,
"grad_norm": 0.6254341006278992,
"learning_rate": 7.192855301086123e-06,
"loss": 0.6707339882850647,
"step": 898
},
{
"epoch": 1.1660989456609894,
"grad_norm": 0.6244154572486877,
"learning_rate": 7.186427621286678e-06,
"loss": 0.6344256401062012,
"step": 899
},
{
"epoch": 1.1673965936739659,
"grad_norm": 0.6074284911155701,
"learning_rate": 7.179995470796141e-06,
"loss": 0.6663004159927368,
"step": 900
},
{
"epoch": 1.1686942416869424,
"grad_norm": 0.6512662768363953,
"learning_rate": 7.1735588627666346e-06,
"loss": 0.6009752154350281,
"step": 901
},
{
"epoch": 1.1699918896999189,
"grad_norm": 0.6028872132301331,
"learning_rate": 7.167117810359387e-06,
"loss": 0.5874291062355042,
"step": 902
},
{
"epoch": 1.1712895377128953,
"grad_norm": 0.6266588568687439,
"learning_rate": 7.160672326744726e-06,
"loss": 0.6230692267417908,
"step": 903
},
{
"epoch": 1.1725871857258718,
"grad_norm": 3.8021433353424072,
"learning_rate": 7.154222425102033e-06,
"loss": 0.6242640018463135,
"step": 904
},
{
"epoch": 1.1738848337388483,
"grad_norm": 0.6971346735954285,
"learning_rate": 7.1477681186197225e-06,
"loss": 0.6548742651939392,
"step": 905
},
{
"epoch": 1.1751824817518248,
"grad_norm": 0.612678587436676,
"learning_rate": 7.141309420495219e-06,
"loss": 0.6528737545013428,
"step": 906
},
{
"epoch": 1.1764801297648013,
"grad_norm": 0.6218580007553101,
"learning_rate": 7.134846343934924e-06,
"loss": 0.6845676898956299,
"step": 907
},
{
"epoch": 1.1777777777777778,
"grad_norm": 0.6113817691802979,
"learning_rate": 7.128378902154195e-06,
"loss": 0.6958880424499512,
"step": 908
},
{
"epoch": 1.1790754257907543,
"grad_norm": 0.6120286583900452,
"learning_rate": 7.121907108377313e-06,
"loss": 0.6543635725975037,
"step": 909
},
{
"epoch": 1.1803730738037308,
"grad_norm": 0.6076055765151978,
"learning_rate": 7.115430975837457e-06,
"loss": 0.6869640946388245,
"step": 910
},
{
"epoch": 1.1816707218167073,
"grad_norm": 0.6232397556304932,
"learning_rate": 7.10895051777668e-06,
"loss": 0.6338291764259338,
"step": 911
},
{
"epoch": 1.1829683698296838,
"grad_norm": 0.6153266429901123,
"learning_rate": 7.1024657474458795e-06,
"loss": 0.6337912678718567,
"step": 912
},
{
"epoch": 1.1842660178426603,
"grad_norm": 0.6057350039482117,
"learning_rate": 7.095976678104768e-06,
"loss": 0.6359199285507202,
"step": 913
},
{
"epoch": 1.1855636658556366,
"grad_norm": 0.6107894778251648,
"learning_rate": 7.089483323021851e-06,
"loss": 0.6233211755752563,
"step": 914
},
{
"epoch": 1.186861313868613,
"grad_norm": 0.5987040400505066,
"learning_rate": 7.082985695474394e-06,
"loss": 0.6974512338638306,
"step": 915
},
{
"epoch": 1.1881589618815895,
"grad_norm": 0.5928195118904114,
"learning_rate": 7.076483808748402e-06,
"loss": 0.6281331777572632,
"step": 916
},
{
"epoch": 1.189456609894566,
"grad_norm": 0.751203179359436,
"learning_rate": 7.069977676138588e-06,
"loss": 0.6113827228546143,
"step": 917
},
{
"epoch": 1.1907542579075425,
"grad_norm": 0.6335259079933167,
"learning_rate": 7.063467310948346e-06,
"loss": 0.5900315046310425,
"step": 918
},
{
"epoch": 1.192051905920519,
"grad_norm": 0.6231621503829956,
"learning_rate": 7.0569527264897275e-06,
"loss": 0.6505625247955322,
"step": 919
},
{
"epoch": 1.1933495539334955,
"grad_norm": 0.6135134696960449,
"learning_rate": 7.050433936083405e-06,
"loss": 0.6122363805770874,
"step": 920
},
{
"epoch": 1.1933495539334955,
"eval_loss": 0.68769770860672,
"eval_runtime": 73.0979,
"eval_samples_per_second": 71.028,
"eval_steps_per_second": 8.879,
"step": 920
},
{
"epoch": 1.194647201946472,
"grad_norm": 0.5773142576217651,
"learning_rate": 7.043910953058657e-06,
"loss": 0.5964255332946777,
"step": 921
},
{
"epoch": 1.1959448499594485,
"grad_norm": 0.6031613945960999,
"learning_rate": 7.037383790753333e-06,
"loss": 0.662893533706665,
"step": 922
},
{
"epoch": 1.197242497972425,
"grad_norm": 0.6189724206924438,
"learning_rate": 7.030852462513827e-06,
"loss": 0.6189711093902588,
"step": 923
},
{
"epoch": 1.1985401459854015,
"grad_norm": 0.6367059946060181,
"learning_rate": 7.024316981695053e-06,
"loss": 0.6123430132865906,
"step": 924
},
{
"epoch": 1.199837793998378,
"grad_norm": 0.6039940118789673,
"learning_rate": 7.017777361660414e-06,
"loss": 0.6341007947921753,
"step": 925
},
{
"epoch": 1.2011354420113545,
"grad_norm": 0.7465354204177856,
"learning_rate": 7.011233615781777e-06,
"loss": 0.6174352765083313,
"step": 926
},
{
"epoch": 1.202433090024331,
"grad_norm": 0.6807838678359985,
"learning_rate": 7.004685757439449e-06,
"loss": 0.7061627507209778,
"step": 927
},
{
"epoch": 1.2037307380373075,
"grad_norm": 0.5960806012153625,
"learning_rate": 6.99813380002214e-06,
"loss": 0.6526781320571899,
"step": 928
},
{
"epoch": 1.205028386050284,
"grad_norm": 0.5771905183792114,
"learning_rate": 6.991577756926948e-06,
"loss": 0.6951519250869751,
"step": 929
},
{
"epoch": 1.2063260340632604,
"grad_norm": 0.632168710231781,
"learning_rate": 6.9850176415593195e-06,
"loss": 0.6279127597808838,
"step": 930
},
{
"epoch": 1.2076236820762367,
"grad_norm": 0.6110833287239075,
"learning_rate": 6.978453467333028e-06,
"loss": 0.6424981355667114,
"step": 931
},
{
"epoch": 1.2089213300892132,
"grad_norm": 0.5829861164093018,
"learning_rate": 6.9718852476701535e-06,
"loss": 0.6850586533546448,
"step": 932
},
{
"epoch": 1.2102189781021897,
"grad_norm": 0.6042872071266174,
"learning_rate": 6.965312996001038e-06,
"loss": 0.628888726234436,
"step": 933
},
{
"epoch": 1.2115166261151662,
"grad_norm": 0.641800045967102,
"learning_rate": 6.958736725764275e-06,
"loss": 0.6589823961257935,
"step": 934
},
{
"epoch": 1.2128142741281427,
"grad_norm": 0.5857986211776733,
"learning_rate": 6.952156450406673e-06,
"loss": 0.5867838859558105,
"step": 935
},
{
"epoch": 1.2141119221411192,
"grad_norm": 0.6070905923843384,
"learning_rate": 6.945572183383229e-06,
"loss": 0.6120666265487671,
"step": 936
},
{
"epoch": 1.2154095701540957,
"grad_norm": 0.620799720287323,
"learning_rate": 6.9389839381571025e-06,
"loss": 0.6689779758453369,
"step": 937
},
{
"epoch": 1.2167072181670722,
"grad_norm": 3.69341778755188,
"learning_rate": 6.932391728199587e-06,
"loss": 0.6268787384033203,
"step": 938
},
{
"epoch": 1.2180048661800487,
"grad_norm": 0.6159505248069763,
"learning_rate": 6.925795566990083e-06,
"loss": 0.6517162322998047,
"step": 939
},
{
"epoch": 1.2193025141930252,
"grad_norm": 0.6000729203224182,
"learning_rate": 6.919195468016073e-06,
"loss": 0.6077402234077454,
"step": 940
},
{
"epoch": 1.2206001622060016,
"grad_norm": 0.5589438080787659,
"learning_rate": 6.9125914447730865e-06,
"loss": 0.596868634223938,
"step": 941
},
{
"epoch": 1.2218978102189781,
"grad_norm": 2.3887641429901123,
"learning_rate": 6.905983510764681e-06,
"loss": 0.6510117053985596,
"step": 942
},
{
"epoch": 1.2231954582319546,
"grad_norm": 0.5905357003211975,
"learning_rate": 6.899371679502408e-06,
"loss": 0.6385715007781982,
"step": 943
},
{
"epoch": 1.2244931062449311,
"grad_norm": 0.6210343837738037,
"learning_rate": 6.89275596450579e-06,
"loss": 0.5893187522888184,
"step": 944
},
{
"epoch": 1.2257907542579076,
"grad_norm": 0.5834376215934753,
"learning_rate": 6.886136379302288e-06,
"loss": 0.6301822662353516,
"step": 945
},
{
"epoch": 1.2270884022708841,
"grad_norm": 0.6120421886444092,
"learning_rate": 6.87951293742728e-06,
"loss": 0.6227176189422607,
"step": 946
},
{
"epoch": 1.2283860502838606,
"grad_norm": 0.5846749544143677,
"learning_rate": 6.872885652424028e-06,
"loss": 0.5956023931503296,
"step": 947
},
{
"epoch": 1.2296836982968369,
"grad_norm": 0.6237694025039673,
"learning_rate": 6.866254537843651e-06,
"loss": 0.619324266910553,
"step": 948
},
{
"epoch": 1.2309813463098134,
"grad_norm": 0.6295216679573059,
"learning_rate": 6.859619607245102e-06,
"loss": 0.6520287990570068,
"step": 949
},
{
"epoch": 1.2322789943227899,
"grad_norm": 0.6216979026794434,
"learning_rate": 6.852980874195132e-06,
"loss": 0.6138555407524109,
"step": 950
},
{
"epoch": 1.2335766423357664,
"grad_norm": 0.59978848695755,
"learning_rate": 6.846338352268273e-06,
"loss": 0.6959421038627625,
"step": 951
},
{
"epoch": 1.2348742903487429,
"grad_norm": 0.6199280619621277,
"learning_rate": 6.839692055046801e-06,
"loss": 0.6330957412719727,
"step": 952
},
{
"epoch": 1.2361719383617193,
"grad_norm": 0.6078975200653076,
"learning_rate": 6.833041996120707e-06,
"loss": 0.6647271513938904,
"step": 953
},
{
"epoch": 1.2374695863746958,
"grad_norm": 0.6505293846130371,
"learning_rate": 6.826388189087683e-06,
"loss": 0.6796462535858154,
"step": 954
},
{
"epoch": 1.2387672343876723,
"grad_norm": 2.935091257095337,
"learning_rate": 6.819730647553079e-06,
"loss": 0.6220841407775879,
"step": 955
},
{
"epoch": 1.2400648824006488,
"grad_norm": 0.6445925831794739,
"learning_rate": 6.813069385129883e-06,
"loss": 0.5865710973739624,
"step": 956
},
{
"epoch": 1.2413625304136253,
"grad_norm": 0.5919390320777893,
"learning_rate": 6.806404415438689e-06,
"loss": 0.6186652779579163,
"step": 957
},
{
"epoch": 1.2426601784266018,
"grad_norm": 0.601252019405365,
"learning_rate": 6.7997357521076735e-06,
"loss": 0.6536276340484619,
"step": 958
},
{
"epoch": 1.2439578264395783,
"grad_norm": 1.1728289127349854,
"learning_rate": 6.793063408772565e-06,
"loss": 0.6327337026596069,
"step": 959
},
{
"epoch": 1.2452554744525548,
"grad_norm": 0.6600290536880493,
"learning_rate": 6.78638739907662e-06,
"loss": 0.6598416566848755,
"step": 960
},
{
"epoch": 1.2465531224655313,
"grad_norm": 0.6247118711471558,
"learning_rate": 6.779707736670585e-06,
"loss": 0.6106679439544678,
"step": 961
},
{
"epoch": 1.2478507704785078,
"grad_norm": 0.588431179523468,
"learning_rate": 6.773024435212678e-06,
"loss": 0.6234384775161743,
"step": 962
},
{
"epoch": 1.2491484184914843,
"grad_norm": 0.6060811281204224,
"learning_rate": 6.7663375083685635e-06,
"loss": 0.6653448343276978,
"step": 963
},
{
"epoch": 1.2504460665044608,
"grad_norm": 0.7780699729919434,
"learning_rate": 6.759646969811311e-06,
"loss": 0.7183551788330078,
"step": 964
},
{
"epoch": 1.251743714517437,
"grad_norm": 0.6161801815032959,
"learning_rate": 6.752952833221379e-06,
"loss": 0.693482518196106,
"step": 965
},
{
"epoch": 1.2530413625304138,
"grad_norm": 0.5934755802154541,
"learning_rate": 6.7462551122865825e-06,
"loss": 0.6136157512664795,
"step": 966
},
{
"epoch": 1.25433901054339,
"grad_norm": 0.5638807415962219,
"learning_rate": 6.739553820702067e-06,
"loss": 0.6110460758209229,
"step": 967
},
{
"epoch": 1.2556366585563665,
"grad_norm": 2.232645273208618,
"learning_rate": 6.732848972170276e-06,
"loss": 0.5771392583847046,
"step": 968
},
{
"epoch": 1.256934306569343,
"grad_norm": 0.5793489217758179,
"learning_rate": 6.726140580400928e-06,
"loss": 0.637577474117279,
"step": 969
},
{
"epoch": 1.2582319545823195,
"grad_norm": 0.6198015213012695,
"learning_rate": 6.719428659110987e-06,
"loss": 0.6566798686981201,
"step": 970
},
{
"epoch": 1.259529602595296,
"grad_norm": 8.447957992553711,
"learning_rate": 6.712713222024633e-06,
"loss": 0.6350081562995911,
"step": 971
},
{
"epoch": 1.2608272506082725,
"grad_norm": 0.6281896233558655,
"learning_rate": 6.705994282873233e-06,
"loss": 0.6955903172492981,
"step": 972
},
{
"epoch": 1.262124898621249,
"grad_norm": 0.5929207801818848,
"learning_rate": 6.699271855395321e-06,
"loss": 0.6420506834983826,
"step": 973
},
{
"epoch": 1.2634225466342255,
"grad_norm": 0.6053920388221741,
"learning_rate": 6.6925459533365576e-06,
"loss": 0.6596835851669312,
"step": 974
},
{
"epoch": 1.264720194647202,
"grad_norm": 0.6256871819496155,
"learning_rate": 6.685816590449708e-06,
"loss": 0.7071737051010132,
"step": 975
},
{
"epoch": 1.2660178426601785,
"grad_norm": 0.5950897336006165,
"learning_rate": 6.67908378049462e-06,
"loss": 0.656615674495697,
"step": 976
},
{
"epoch": 1.267315490673155,
"grad_norm": 0.6450179815292358,
"learning_rate": 6.672347537238183e-06,
"loss": 0.6895189881324768,
"step": 977
},
{
"epoch": 1.2686131386861315,
"grad_norm": 0.6535899639129639,
"learning_rate": 6.665607874454311e-06,
"loss": 0.6748580932617188,
"step": 978
},
{
"epoch": 1.269910786699108,
"grad_norm": 3.30841326713562,
"learning_rate": 6.658864805923909e-06,
"loss": 0.6493468284606934,
"step": 979
},
{
"epoch": 1.2712084347120842,
"grad_norm": 0.6671776175498962,
"learning_rate": 6.652118345434844e-06,
"loss": 0.6867607235908508,
"step": 980
},
{
"epoch": 1.272506082725061,
"grad_norm": 0.623457670211792,
"learning_rate": 6.64536850678192e-06,
"loss": 0.6442928314208984,
"step": 981
},
{
"epoch": 1.2738037307380372,
"grad_norm": 0.5984421372413635,
"learning_rate": 6.638615303766849e-06,
"loss": 0.5990972518920898,
"step": 982
},
{
"epoch": 1.275101378751014,
"grad_norm": 0.7166045904159546,
"learning_rate": 6.631858750198223e-06,
"loss": 0.6415522694587708,
"step": 983
},
{
"epoch": 1.2763990267639902,
"grad_norm": 0.6510207056999207,
"learning_rate": 6.625098859891483e-06,
"loss": 0.6367224454879761,
"step": 984
},
{
"epoch": 1.2776966747769667,
"grad_norm": 0.6455490589141846,
"learning_rate": 6.618335646668894e-06,
"loss": 0.6474705934524536,
"step": 985
},
{
"epoch": 1.2789943227899432,
"grad_norm": 0.6324385404586792,
"learning_rate": 6.611569124359516e-06,
"loss": 0.6616948843002319,
"step": 986
},
{
"epoch": 1.2802919708029197,
"grad_norm": 0.6118378043174744,
"learning_rate": 6.604799306799172e-06,
"loss": 0.628074586391449,
"step": 987
},
{
"epoch": 1.2815896188158962,
"grad_norm": 0.5939401984214783,
"learning_rate": 6.598026207830428e-06,
"loss": 0.6460234522819519,
"step": 988
},
{
"epoch": 1.2828872668288727,
"grad_norm": 0.5931558609008789,
"learning_rate": 6.591249841302555e-06,
"loss": 0.7053772211074829,
"step": 989
},
{
"epoch": 1.2841849148418492,
"grad_norm": 0.6080952882766724,
"learning_rate": 6.58447022107151e-06,
"loss": 0.6465653777122498,
"step": 990
},
{
"epoch": 1.2854825628548256,
"grad_norm": 0.5909331440925598,
"learning_rate": 6.577687360999898e-06,
"loss": 0.6280587911605835,
"step": 991
},
{
"epoch": 1.2867802108678021,
"grad_norm": 0.6082817912101746,
"learning_rate": 6.5709012749569535e-06,
"loss": 0.6570587158203125,
"step": 992
},
{
"epoch": 1.2880778588807786,
"grad_norm": 0.5879994630813599,
"learning_rate": 6.564111976818501e-06,
"loss": 0.6010950803756714,
"step": 993
},
{
"epoch": 1.2893755068937551,
"grad_norm": 0.6213524341583252,
"learning_rate": 6.5573194804669416e-06,
"loss": 0.7210543751716614,
"step": 994
},
{
"epoch": 1.2906731549067316,
"grad_norm": 0.8193002343177795,
"learning_rate": 6.550523799791207e-06,
"loss": 0.6705042123794556,
"step": 995
},
{
"epoch": 1.2919708029197081,
"grad_norm": 0.6038559079170227,
"learning_rate": 6.543724948686747e-06,
"loss": 0.6417216062545776,
"step": 996
},
{
"epoch": 1.2932684509326844,
"grad_norm": 0.6030299067497253,
"learning_rate": 6.53692294105549e-06,
"loss": 0.6307570338249207,
"step": 997
},
{
"epoch": 1.294566098945661,
"grad_norm": 0.6002436876296997,
"learning_rate": 6.53011779080582e-06,
"loss": 0.6394779086112976,
"step": 998
},
{
"epoch": 1.2958637469586374,
"grad_norm": 0.6847420334815979,
"learning_rate": 6.523309511852547e-06,
"loss": 0.7355165481567383,
"step": 999
},
{
"epoch": 1.2971613949716139,
"grad_norm": 0.6133946180343628,
"learning_rate": 6.516498118116878e-06,
"loss": 0.6960593461990356,
"step": 1000
},
{
"epoch": 1.2984590429845904,
"grad_norm": 0.6106923222541809,
"learning_rate": 6.5096836235263904e-06,
"loss": 0.6673066020011902,
"step": 1001
},
{
"epoch": 1.2997566909975669,
"grad_norm": 0.6132566928863525,
"learning_rate": 6.502866042015e-06,
"loss": 0.6237598657608032,
"step": 1002
},
{
"epoch": 1.3010543390105433,
"grad_norm": 0.8997653126716614,
"learning_rate": 6.496045387522934e-06,
"loss": 0.6304394006729126,
"step": 1003
},
{
"epoch": 1.3023519870235198,
"grad_norm": 0.5679188966751099,
"learning_rate": 6.489221673996708e-06,
"loss": 0.575568675994873,
"step": 1004
},
{
"epoch": 1.3036496350364963,
"grad_norm": 0.6406558752059937,
"learning_rate": 6.482394915389085e-06,
"loss": 0.632392406463623,
"step": 1005
},
{
"epoch": 1.3049472830494728,
"grad_norm": 0.6094868183135986,
"learning_rate": 6.475565125659063e-06,
"loss": 0.6548421382904053,
"step": 1006
},
{
"epoch": 1.3062449310624493,
"grad_norm": 0.5837537050247192,
"learning_rate": 6.4687323187718276e-06,
"loss": 0.6500783562660217,
"step": 1007
},
{
"epoch": 1.3075425790754258,
"grad_norm": 0.5676296353340149,
"learning_rate": 6.461896508698744e-06,
"loss": 0.5843409299850464,
"step": 1008
},
{
"epoch": 1.3088402270884023,
"grad_norm": 0.5929064154624939,
"learning_rate": 6.455057709417312e-06,
"loss": 0.5786738395690918,
"step": 1009
},
{
"epoch": 1.3101378751013788,
"grad_norm": 0.6186608672142029,
"learning_rate": 6.448215934911145e-06,
"loss": 0.7198565006256104,
"step": 1010
},
{
"epoch": 1.3114355231143553,
"grad_norm": 0.573298454284668,
"learning_rate": 6.441371199169942e-06,
"loss": 0.6153538227081299,
"step": 1011
},
{
"epoch": 1.3127331711273318,
"grad_norm": 0.6731165051460266,
"learning_rate": 6.434523516189453e-06,
"loss": 0.6571598052978516,
"step": 1012
},
{
"epoch": 1.3140308191403083,
"grad_norm": 0.5842266082763672,
"learning_rate": 6.427672899971457e-06,
"loss": 0.6164257526397705,
"step": 1013
},
{
"epoch": 1.3153284671532846,
"grad_norm": 0.6072558760643005,
"learning_rate": 6.4208193645237314e-06,
"loss": 0.6229099035263062,
"step": 1014
},
{
"epoch": 1.3166261151662613,
"grad_norm": 0.6617994904518127,
"learning_rate": 6.413962923860021e-06,
"loss": 0.634198009967804,
"step": 1015
},
{
"epoch": 1.3179237631792375,
"grad_norm": 5.200798511505127,
"learning_rate": 6.407103592000009e-06,
"loss": 0.6058683395385742,
"step": 1016
},
{
"epoch": 1.319221411192214,
"grad_norm": 0.5821889042854309,
"learning_rate": 6.400241382969297e-06,
"loss": 0.6865833401679993,
"step": 1017
},
{
"epoch": 1.3205190592051905,
"grad_norm": 0.5700265169143677,
"learning_rate": 6.393376310799363e-06,
"loss": 0.6534625291824341,
"step": 1018
},
{
"epoch": 1.321816707218167,
"grad_norm": 0.5971737504005432,
"learning_rate": 6.386508389527544e-06,
"loss": 0.6178575158119202,
"step": 1019
},
{
"epoch": 1.3231143552311435,
"grad_norm": 0.5835508108139038,
"learning_rate": 6.379637633196999e-06,
"loss": 0.6270486116409302,
"step": 1020
},
{
"epoch": 1.32441200324412,
"grad_norm": 0.5576135516166687,
"learning_rate": 6.3727640558566865e-06,
"loss": 0.6197627782821655,
"step": 1021
},
{
"epoch": 1.3257096512570965,
"grad_norm": 0.6085971593856812,
"learning_rate": 6.3658876715613315e-06,
"loss": 0.6738483309745789,
"step": 1022
},
{
"epoch": 1.327007299270073,
"grad_norm": 0.6080042719841003,
"learning_rate": 6.3590084943713995e-06,
"loss": 0.6581575870513916,
"step": 1023
},
{
"epoch": 1.3283049472830495,
"grad_norm": 0.5854855179786682,
"learning_rate": 6.35212653835307e-06,
"loss": 0.6252275705337524,
"step": 1024
},
{
"epoch": 1.329602595296026,
"grad_norm": 0.5838765501976013,
"learning_rate": 6.345241817578196e-06,
"loss": 0.6577827334403992,
"step": 1025
},
{
"epoch": 1.3309002433090025,
"grad_norm": 0.5933322310447693,
"learning_rate": 6.3383543461242914e-06,
"loss": 0.6144447326660156,
"step": 1026
},
{
"epoch": 1.332197891321979,
"grad_norm": 0.6093854904174805,
"learning_rate": 6.331464138074493e-06,
"loss": 0.6428185701370239,
"step": 1027
},
{
"epoch": 1.3334955393349555,
"grad_norm": 0.6086922287940979,
"learning_rate": 6.32457120751753e-06,
"loss": 0.6779354810714722,
"step": 1028
},
{
"epoch": 1.334793187347932,
"grad_norm": 0.5752759575843811,
"learning_rate": 6.317675568547704e-06,
"loss": 0.6089493036270142,
"step": 1029
},
{
"epoch": 1.3360908353609084,
"grad_norm": 0.5942736268043518,
"learning_rate": 6.310777235264849e-06,
"loss": 0.6579400300979614,
"step": 1030
},
{
"epoch": 1.3373884833738847,
"grad_norm": 0.5779575705528259,
"learning_rate": 6.303876221774311e-06,
"loss": 0.6444313526153564,
"step": 1031
},
{
"epoch": 1.3386861313868614,
"grad_norm": 0.6055609583854675,
"learning_rate": 6.296972542186915e-06,
"loss": 0.6654270887374878,
"step": 1032
},
{
"epoch": 1.3399837793998377,
"grad_norm": 0.60945063829422,
"learning_rate": 6.2900662106189415e-06,
"loss": 0.661444902420044,
"step": 1033
},
{
"epoch": 1.3412814274128142,
"grad_norm": 1.2685060501098633,
"learning_rate": 6.283157241192087e-06,
"loss": 0.6629235148429871,
"step": 1034
},
{
"epoch": 1.3425790754257907,
"grad_norm": 0.6141192317008972,
"learning_rate": 6.276245648033447e-06,
"loss": 0.6560642719268799,
"step": 1035
},
{
"epoch": 1.3438767234387672,
"grad_norm": 0.5949526429176331,
"learning_rate": 6.2693314452754796e-06,
"loss": 0.7151345014572144,
"step": 1036
},
{
"epoch": 1.3451743714517437,
"grad_norm": 0.6198956370353699,
"learning_rate": 6.26241464705598e-06,
"loss": 0.6870914101600647,
"step": 1037
},
{
"epoch": 1.3464720194647202,
"grad_norm": 0.6193254590034485,
"learning_rate": 6.25549526751805e-06,
"loss": 0.6167775392532349,
"step": 1038
},
{
"epoch": 1.3477696674776967,
"grad_norm": 0.582747757434845,
"learning_rate": 6.24857332081007e-06,
"loss": 0.6421079635620117,
"step": 1039
},
{
"epoch": 1.3490673154906732,
"grad_norm": 0.6055523753166199,
"learning_rate": 6.241648821085666e-06,
"loss": 0.642772376537323,
"step": 1040
},
{
"epoch": 1.3503649635036497,
"grad_norm": 0.5949704051017761,
"learning_rate": 6.23472178250369e-06,
"loss": 0.6979063153266907,
"step": 1041
},
{
"epoch": 1.3516626115166261,
"grad_norm": 0.6037271022796631,
"learning_rate": 6.227792219228183e-06,
"loss": 0.6815102100372314,
"step": 1042
},
{
"epoch": 1.3529602595296026,
"grad_norm": 0.5929699540138245,
"learning_rate": 6.220860145428347e-06,
"loss": 0.6474612951278687,
"step": 1043
},
{
"epoch": 1.3542579075425791,
"grad_norm": 0.612301230430603,
"learning_rate": 6.213925575278518e-06,
"loss": 0.669405460357666,
"step": 1044
},
{
"epoch": 1.3555555555555556,
"grad_norm": 0.5837467908859253,
"learning_rate": 6.206988522958135e-06,
"loss": 0.5990941524505615,
"step": 1045
},
{
"epoch": 1.3568532035685321,
"grad_norm": 0.614405632019043,
"learning_rate": 6.200049002651718e-06,
"loss": 0.6845515370368958,
"step": 1046
},
{
"epoch": 1.3581508515815086,
"grad_norm": 0.713435709476471,
"learning_rate": 6.19310702854883e-06,
"loss": 0.5659395456314087,
"step": 1047
},
{
"epoch": 1.3594484995944849,
"grad_norm": 0.6173283457756042,
"learning_rate": 6.186162614844047e-06,
"loss": 0.6531370282173157,
"step": 1048
},
{
"epoch": 1.3607461476074616,
"grad_norm": 0.6224690675735474,
"learning_rate": 6.17921577573694e-06,
"loss": 0.6006350517272949,
"step": 1049
},
{
"epoch": 1.3620437956204379,
"grad_norm": 0.5716680288314819,
"learning_rate": 6.172266525432036e-06,
"loss": 0.6007007360458374,
"step": 1050
},
{
"epoch": 1.3633414436334144,
"grad_norm": 0.964508593082428,
"learning_rate": 6.165314878138794e-06,
"loss": 0.5759468674659729,
"step": 1051
},
{
"epoch": 1.3646390916463909,
"grad_norm": 1.8147951364517212,
"learning_rate": 6.1583608480715705e-06,
"loss": 0.6763917207717896,
"step": 1052
},
{
"epoch": 1.3659367396593673,
"grad_norm": 0.5682212710380554,
"learning_rate": 6.1514044494496e-06,
"loss": 0.5627442002296448,
"step": 1053
},
{
"epoch": 1.3672343876723438,
"grad_norm": 0.6249387860298157,
"learning_rate": 6.144445696496955e-06,
"loss": 0.7233635187149048,
"step": 1054
},
{
"epoch": 1.3685320356853203,
"grad_norm": 0.5967603921890259,
"learning_rate": 6.137484603442524e-06,
"loss": 0.60671067237854,
"step": 1055
},
{
"epoch": 1.3698296836982968,
"grad_norm": 0.9533456563949585,
"learning_rate": 6.130521184519983e-06,
"loss": 0.6718368530273438,
"step": 1056
},
{
"epoch": 1.3711273317112733,
"grad_norm": 0.577439546585083,
"learning_rate": 6.123555453967759e-06,
"loss": 0.6093976497650146,
"step": 1057
},
{
"epoch": 1.3724249797242498,
"grad_norm": 0.5558829307556152,
"learning_rate": 6.1165874260290074e-06,
"loss": 0.6086419224739075,
"step": 1058
},
{
"epoch": 1.3737226277372263,
"grad_norm": 0.6080211400985718,
"learning_rate": 6.109617114951581e-06,
"loss": 0.6369859576225281,
"step": 1059
},
{
"epoch": 1.3750202757502028,
"grad_norm": 0.70982426404953,
"learning_rate": 6.102644534988006e-06,
"loss": 0.6179996728897095,
"step": 1060
},
{
"epoch": 1.3763179237631793,
"grad_norm": 0.6002638936042786,
"learning_rate": 6.0956697003954404e-06,
"loss": 0.6171343326568604,
"step": 1061
},
{
"epoch": 1.3776155717761558,
"grad_norm": 0.74629145860672,
"learning_rate": 6.088692625435656e-06,
"loss": 0.64389967918396,
"step": 1062
},
{
"epoch": 1.378913219789132,
"grad_norm": 0.5946625471115112,
"learning_rate": 6.0817133243750046e-06,
"loss": 0.6315205097198486,
"step": 1063
},
{
"epoch": 1.3802108678021088,
"grad_norm": 0.6307440996170044,
"learning_rate": 6.074731811484391e-06,
"loss": 0.6365832090377808,
"step": 1064
},
{
"epoch": 1.381508515815085,
"grad_norm": 0.958493173122406,
"learning_rate": 6.067748101039243e-06,
"loss": 0.588029146194458,
"step": 1065
},
{
"epoch": 1.3828061638280618,
"grad_norm": 2.589282512664795,
"learning_rate": 6.060762207319479e-06,
"loss": 0.6348222494125366,
"step": 1066
},
{
"epoch": 1.384103811841038,
"grad_norm": 0.6122376322746277,
"learning_rate": 6.053774144609484e-06,
"loss": 0.6187014579772949,
"step": 1067
},
{
"epoch": 1.3854014598540145,
"grad_norm": 0.6017574071884155,
"learning_rate": 6.046783927198079e-06,
"loss": 0.646289587020874,
"step": 1068
},
{
"epoch": 1.386699107866991,
"grad_norm": 0.5894978046417236,
"learning_rate": 6.039791569378488e-06,
"loss": 0.6435679197311401,
"step": 1069
},
{
"epoch": 1.3879967558799675,
"grad_norm": 0.5931923389434814,
"learning_rate": 6.032797085448315e-06,
"loss": 0.6404111981391907,
"step": 1070
},
{
"epoch": 1.389294403892944,
"grad_norm": 0.5930508971214294,
"learning_rate": 6.025800489709505e-06,
"loss": 0.6763365268707275,
"step": 1071
},
{
"epoch": 1.3905920519059205,
"grad_norm": 0.621198832988739,
"learning_rate": 6.018801796468328e-06,
"loss": 0.7032692432403564,
"step": 1072
},
{
"epoch": 1.391889699918897,
"grad_norm": 0.6337350606918335,
"learning_rate": 6.0118010200353396e-06,
"loss": 0.7524909973144531,
"step": 1073
},
{
"epoch": 1.3931873479318735,
"grad_norm": 0.5976428389549255,
"learning_rate": 6.004798174725358e-06,
"loss": 0.6851296424865723,
"step": 1074
},
{
"epoch": 1.39448499594485,
"grad_norm": 0.9778940081596375,
"learning_rate": 5.997793274857427e-06,
"loss": 0.6498898267745972,
"step": 1075
},
{
"epoch": 1.3957826439578265,
"grad_norm": 0.638106644153595,
"learning_rate": 5.990786334754795e-06,
"loss": 0.707371711730957,
"step": 1076
},
{
"epoch": 1.397080291970803,
"grad_norm": 4.414336204528809,
"learning_rate": 5.983777368744881e-06,
"loss": 0.6448768973350525,
"step": 1077
},
{
"epoch": 1.3983779399837795,
"grad_norm": 0.5891152620315552,
"learning_rate": 5.9767663911592454e-06,
"loss": 0.6236732602119446,
"step": 1078
},
{
"epoch": 1.399675587996756,
"grad_norm": 0.59264075756073,
"learning_rate": 5.9697534163335645e-06,
"loss": 0.6284846663475037,
"step": 1079
},
{
"epoch": 1.4009732360097322,
"grad_norm": 0.6076551675796509,
"learning_rate": 5.9627384586075954e-06,
"loss": 0.6464221477508545,
"step": 1080
},
{
"epoch": 1.402270884022709,
"grad_norm": 0.6048544645309448,
"learning_rate": 5.955721532325151e-06,
"loss": 0.6747769713401794,
"step": 1081
},
{
"epoch": 1.4035685320356852,
"grad_norm": 1.4065572023391724,
"learning_rate": 5.94870265183407e-06,
"loss": 0.6566921472549438,
"step": 1082
},
{
"epoch": 1.404866180048662,
"grad_norm": 0.5989380478858948,
"learning_rate": 5.941681831486188e-06,
"loss": 0.65166175365448,
"step": 1083
},
{
"epoch": 1.4061638280616382,
"grad_norm": 0.566685676574707,
"learning_rate": 5.934659085637303e-06,
"loss": 0.6065230369567871,
"step": 1084
},
{
"epoch": 1.4074614760746147,
"grad_norm": 0.5942695736885071,
"learning_rate": 5.927634428647154e-06,
"loss": 0.6362863183021545,
"step": 1085
},
{
"epoch": 1.4087591240875912,
"grad_norm": 0.6072388887405396,
"learning_rate": 5.920607874879387e-06,
"loss": 0.6389554738998413,
"step": 1086
},
{
"epoch": 1.4100567721005677,
"grad_norm": 0.6188286542892456,
"learning_rate": 5.913579438701525e-06,
"loss": 0.7114623188972473,
"step": 1087
},
{
"epoch": 1.4113544201135442,
"grad_norm": 0.639775812625885,
"learning_rate": 5.906549134484943e-06,
"loss": 0.6554163694381714,
"step": 1088
},
{
"epoch": 1.4126520681265207,
"grad_norm": 0.5889431238174438,
"learning_rate": 5.899516976604832e-06,
"loss": 0.6516610383987427,
"step": 1089
},
{
"epoch": 1.4139497161394972,
"grad_norm": 0.5683200359344482,
"learning_rate": 5.892482979440175e-06,
"loss": 0.6421197652816772,
"step": 1090
},
{
"epoch": 1.4152473641524737,
"grad_norm": 0.6089890003204346,
"learning_rate": 5.885447157373716e-06,
"loss": 0.6774452924728394,
"step": 1091
},
{
"epoch": 1.4165450121654501,
"grad_norm": 0.6220733523368835,
"learning_rate": 5.878409524791931e-06,
"loss": 0.6213857531547546,
"step": 1092
},
{
"epoch": 1.4178426601784266,
"grad_norm": 0.6474610567092896,
"learning_rate": 5.871370096084997e-06,
"loss": 0.6641533970832825,
"step": 1093
},
{
"epoch": 1.4191403081914031,
"grad_norm": 0.5982186198234558,
"learning_rate": 5.864328885646764e-06,
"loss": 0.6307400465011597,
"step": 1094
},
{
"epoch": 1.4204379562043796,
"grad_norm": 1.0146141052246094,
"learning_rate": 5.857285907874725e-06,
"loss": 0.6501115560531616,
"step": 1095
},
{
"epoch": 1.4217356042173561,
"grad_norm": 0.6026197671890259,
"learning_rate": 5.850241177169986e-06,
"loss": 0.6877589225769043,
"step": 1096
},
{
"epoch": 1.4230332522303324,
"grad_norm": 0.6162115931510925,
"learning_rate": 5.84319470793724e-06,
"loss": 0.6401875019073486,
"step": 1097
},
{
"epoch": 1.424330900243309,
"grad_norm": 0.5684193968772888,
"learning_rate": 5.836146514584733e-06,
"loss": 0.6159685850143433,
"step": 1098
},
{
"epoch": 1.4256285482562854,
"grad_norm": 0.6261927485466003,
"learning_rate": 5.829096611524235e-06,
"loss": 0.6478676199913025,
"step": 1099
},
{
"epoch": 1.426926196269262,
"grad_norm": 0.6026703119277954,
"learning_rate": 5.822045013171015e-06,
"loss": 0.6607078313827515,
"step": 1100
},
{
"epoch": 1.4282238442822384,
"grad_norm": 0.602356493473053,
"learning_rate": 5.814991733943805e-06,
"loss": 0.6449368000030518,
"step": 1101
},
{
"epoch": 1.4295214922952149,
"grad_norm": 0.5841164588928223,
"learning_rate": 5.807936788264778e-06,
"loss": 0.6442397236824036,
"step": 1102
},
{
"epoch": 1.4308191403081914,
"grad_norm": 0.6046425104141235,
"learning_rate": 5.800880190559511e-06,
"loss": 0.6141000986099243,
"step": 1103
},
{
"epoch": 1.4321167883211678,
"grad_norm": 0.6180353760719299,
"learning_rate": 5.79382195525696e-06,
"loss": 0.7307353019714355,
"step": 1104
},
{
"epoch": 1.4334144363341443,
"grad_norm": 0.5996196269989014,
"learning_rate": 5.786762096789431e-06,
"loss": 0.6220886707305908,
"step": 1105
},
{
"epoch": 1.4347120843471208,
"grad_norm": 0.6037473678588867,
"learning_rate": 5.779700629592547e-06,
"loss": 0.7145535945892334,
"step": 1106
},
{
"epoch": 1.4360097323600973,
"grad_norm": 0.5726904273033142,
"learning_rate": 5.7726375681052205e-06,
"loss": 0.6307674646377563,
"step": 1107
},
{
"epoch": 1.4373073803730738,
"grad_norm": 0.6289665102958679,
"learning_rate": 5.765572926769625e-06,
"loss": 0.7094706296920776,
"step": 1108
},
{
"epoch": 1.4386050283860503,
"grad_norm": 0.5811914801597595,
"learning_rate": 5.758506720031163e-06,
"loss": 0.6041115522384644,
"step": 1109
},
{
"epoch": 1.4399026763990268,
"grad_norm": 0.5376439094543457,
"learning_rate": 5.751438962338441e-06,
"loss": 0.5803889036178589,
"step": 1110
},
{
"epoch": 1.4412003244120033,
"grad_norm": 0.5952728390693665,
"learning_rate": 5.744369668143233e-06,
"loss": 0.6684442758560181,
"step": 1111
},
{
"epoch": 1.4424979724249798,
"grad_norm": 0.5791693329811096,
"learning_rate": 5.737298851900457e-06,
"loss": 0.6404840350151062,
"step": 1112
},
{
"epoch": 1.4437956204379563,
"grad_norm": 0.6007118225097656,
"learning_rate": 5.730226528068142e-06,
"loss": 0.6698148846626282,
"step": 1113
},
{
"epoch": 1.4450932684509326,
"grad_norm": 0.613433301448822,
"learning_rate": 5.7231527111074e-06,
"loss": 0.7007705569267273,
"step": 1114
},
{
"epoch": 1.4463909164639093,
"grad_norm": 0.5919564962387085,
"learning_rate": 5.716077415482398e-06,
"loss": 0.6769901514053345,
"step": 1115
},
{
"epoch": 1.4476885644768855,
"grad_norm": 0.5912166833877563,
"learning_rate": 5.709000655660324e-06,
"loss": 0.6436672210693359,
"step": 1116
},
{
"epoch": 1.4489862124898623,
"grad_norm": 0.5603325366973877,
"learning_rate": 5.7019224461113585e-06,
"loss": 0.5793130993843079,
"step": 1117
},
{
"epoch": 1.4502838605028385,
"grad_norm": 0.611814558506012,
"learning_rate": 5.694842801308651e-06,
"loss": 0.6368833780288696,
"step": 1118
},
{
"epoch": 1.451581508515815,
"grad_norm": 0.5689136385917664,
"learning_rate": 5.687761735728282e-06,
"loss": 0.6261428594589233,
"step": 1119
},
{
"epoch": 1.4528791565287915,
"grad_norm": 0.6117684245109558,
"learning_rate": 5.680679263849241e-06,
"loss": 0.6463526487350464,
"step": 1120
},
{
"epoch": 1.454176804541768,
"grad_norm": 0.5878109931945801,
"learning_rate": 5.673595400153385e-06,
"loss": 0.6132445335388184,
"step": 1121
},
{
"epoch": 1.4554744525547445,
"grad_norm": 0.5826682448387146,
"learning_rate": 5.666510159125427e-06,
"loss": 0.6556754112243652,
"step": 1122
},
{
"epoch": 1.456772100567721,
"grad_norm": 0.5753729939460754,
"learning_rate": 5.65942355525289e-06,
"loss": 0.6176761388778687,
"step": 1123
},
{
"epoch": 1.4580697485806975,
"grad_norm": 0.7028788924217224,
"learning_rate": 5.652335603026084e-06,
"loss": 0.5802330374717712,
"step": 1124
},
{
"epoch": 1.459367396593674,
"grad_norm": 0.5847388505935669,
"learning_rate": 5.645246316938082e-06,
"loss": 0.6626067161560059,
"step": 1125
},
{
"epoch": 1.4606650446066505,
"grad_norm": 1.399409294128418,
"learning_rate": 5.638155711484674e-06,
"loss": 0.6308712959289551,
"step": 1126
},
{
"epoch": 1.461962692619627,
"grad_norm": 0.602827250957489,
"learning_rate": 5.631063801164356e-06,
"loss": 0.6493173241615295,
"step": 1127
},
{
"epoch": 1.4632603406326035,
"grad_norm": 0.7403953075408936,
"learning_rate": 5.62397060047829e-06,
"loss": 0.620072603225708,
"step": 1128
},
{
"epoch": 1.46455798864558,
"grad_norm": 0.6334176063537598,
"learning_rate": 5.6168761239302745e-06,
"loss": 0.665931761264801,
"step": 1129
},
{
"epoch": 1.4658556366585564,
"grad_norm": 0.6131840944290161,
"learning_rate": 5.609780386026721e-06,
"loss": 0.6492164731025696,
"step": 1130
},
{
"epoch": 1.4671532846715327,
"grad_norm": 0.6045870780944824,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.6135592460632324,
"step": 1131
},
{
"epoch": 1.4684509326845094,
"grad_norm": 0.650088906288147,
"learning_rate": 5.595585184191496e-06,
"loss": 0.7170080542564392,
"step": 1132
},
{
"epoch": 1.4697485806974857,
"grad_norm": 0.5771186351776123,
"learning_rate": 5.58848574928542e-06,
"loss": 0.6513093709945679,
"step": 1133
},
{
"epoch": 1.4710462287104624,
"grad_norm": 0.7128145694732666,
"learning_rate": 5.5813851110749365e-06,
"loss": 0.6579954624176025,
"step": 1134
},
{
"epoch": 1.4723438767234387,
"grad_norm": 0.5734491348266602,
"learning_rate": 5.574283284079049e-06,
"loss": 0.6137959361076355,
"step": 1135
},
{
"epoch": 1.4736415247364152,
"grad_norm": 0.5757655501365662,
"learning_rate": 5.567180282819201e-06,
"loss": 0.6633074283599854,
"step": 1136
},
{
"epoch": 1.4749391727493917,
"grad_norm": 0.5958343148231506,
"learning_rate": 5.560076121819229e-06,
"loss": 0.6766320466995239,
"step": 1137
},
{
"epoch": 1.4762368207623682,
"grad_norm": 0.5708390474319458,
"learning_rate": 5.552970815605347e-06,
"loss": 0.6593270897865295,
"step": 1138
},
{
"epoch": 1.4775344687753447,
"grad_norm": 0.5592367649078369,
"learning_rate": 5.545864378706106e-06,
"loss": 0.6107625961303711,
"step": 1139
},
{
"epoch": 1.4788321167883212,
"grad_norm": 0.5908456444740295,
"learning_rate": 5.53875682565237e-06,
"loss": 0.612775444984436,
"step": 1140
},
{
"epoch": 1.4801297648012977,
"grad_norm": 0.7283220291137695,
"learning_rate": 5.5316481709772886e-06,
"loss": 0.6324783563613892,
"step": 1141
},
{
"epoch": 1.4814274128142741,
"grad_norm": 0.5963947176933289,
"learning_rate": 5.524538429216258e-06,
"loss": 0.6906737089157104,
"step": 1142
},
{
"epoch": 1.4827250608272506,
"grad_norm": 0.6059021949768066,
"learning_rate": 5.517427614906906e-06,
"loss": 0.6746259331703186,
"step": 1143
},
{
"epoch": 1.4840227088402271,
"grad_norm": 0.5953018069267273,
"learning_rate": 5.510315742589042e-06,
"loss": 0.6834631562232971,
"step": 1144
},
{
"epoch": 1.4853203568532036,
"grad_norm": 0.5694923996925354,
"learning_rate": 5.503202826804647e-06,
"loss": 0.6960294246673584,
"step": 1145
},
{
"epoch": 1.4866180048661801,
"grad_norm": 0.6007208228111267,
"learning_rate": 5.496088882097836e-06,
"loss": 0.657875657081604,
"step": 1146
},
{
"epoch": 1.4879156528791566,
"grad_norm": 0.6081047654151917,
"learning_rate": 5.488973923014821e-06,
"loss": 0.6561139225959778,
"step": 1147
},
{
"epoch": 1.4892133008921329,
"grad_norm": 0.5819503664970398,
"learning_rate": 5.4818579641038974e-06,
"loss": 0.6176397204399109,
"step": 1148
},
{
"epoch": 1.4905109489051096,
"grad_norm": 0.6077326536178589,
"learning_rate": 5.474741019915395e-06,
"loss": 0.6847512722015381,
"step": 1149
},
{
"epoch": 1.4918085969180859,
"grad_norm": 0.6074263453483582,
"learning_rate": 5.467623105001667e-06,
"loss": 0.6360629200935364,
"step": 1150
},
{
"epoch": 1.4918085969180859,
"eval_loss": 0.6826658844947815,
"eval_runtime": 73.0405,
"eval_samples_per_second": 71.084,
"eval_steps_per_second": 8.885,
"step": 1150
},
{
"epoch": 1.4931062449310626,
"grad_norm": 0.5855403542518616,
"learning_rate": 5.460504233917047e-06,
"loss": 0.6704986095428467,
"step": 1151
},
{
"epoch": 1.4944038929440389,
"grad_norm": 0.6127449870109558,
"learning_rate": 5.453384421217823e-06,
"loss": 0.6719274520874023,
"step": 1152
},
{
"epoch": 1.4957015409570154,
"grad_norm": 0.5484548211097717,
"learning_rate": 5.446263681462213e-06,
"loss": 0.6012224555015564,
"step": 1153
},
{
"epoch": 1.4969991889699918,
"grad_norm": 0.5728206038475037,
"learning_rate": 5.439142029210323e-06,
"loss": 0.6711239218711853,
"step": 1154
},
{
"epoch": 1.4982968369829683,
"grad_norm": 0.5789787769317627,
"learning_rate": 5.4320194790241335e-06,
"loss": 0.5949071645736694,
"step": 1155
},
{
"epoch": 1.4995944849959448,
"grad_norm": 0.5778141021728516,
"learning_rate": 5.424896045467455e-06,
"loss": 0.6263710260391235,
"step": 1156
},
{
"epoch": 1.5008921330089213,
"grad_norm": 0.5851665139198303,
"learning_rate": 5.417771743105908e-06,
"loss": 0.690178632736206,
"step": 1157
},
{
"epoch": 1.5021897810218978,
"grad_norm": 0.620339035987854,
"learning_rate": 5.4106465865068846e-06,
"loss": 0.6553722620010376,
"step": 1158
},
{
"epoch": 1.5034874290348743,
"grad_norm": 0.5484940409660339,
"learning_rate": 5.403520590239527e-06,
"loss": 0.5462528467178345,
"step": 1159
},
{
"epoch": 1.5047850770478508,
"grad_norm": 0.62648606300354,
"learning_rate": 5.396393768874696e-06,
"loss": 0.7103927135467529,
"step": 1160
},
{
"epoch": 1.5060827250608273,
"grad_norm": 0.5696239471435547,
"learning_rate": 5.389266136984939e-06,
"loss": 0.6234554648399353,
"step": 1161
},
{
"epoch": 1.5073803730738038,
"grad_norm": 0.6027652025222778,
"learning_rate": 5.382137709144454e-06,
"loss": 0.6729198694229126,
"step": 1162
},
{
"epoch": 1.50867802108678,
"grad_norm": 0.5693642497062683,
"learning_rate": 5.3750084999290755e-06,
"loss": 0.6457726359367371,
"step": 1163
},
{
"epoch": 1.5099756690997568,
"grad_norm": 1.6674511432647705,
"learning_rate": 5.3678785239162305e-06,
"loss": 0.656345009803772,
"step": 1164
},
{
"epoch": 1.511273317112733,
"grad_norm": 0.5577940344810486,
"learning_rate": 5.360747795684916e-06,
"loss": 0.5705595016479492,
"step": 1165
},
{
"epoch": 1.5125709651257098,
"grad_norm": 0.5919134616851807,
"learning_rate": 5.353616329815667e-06,
"loss": 0.6972566246986389,
"step": 1166
},
{
"epoch": 1.513868613138686,
"grad_norm": 0.6095024347305298,
"learning_rate": 5.346484140890523e-06,
"loss": 0.6107922196388245,
"step": 1167
},
{
"epoch": 1.5151662611516628,
"grad_norm": 0.5990864634513855,
"learning_rate": 5.339351243493008e-06,
"loss": 0.5962531566619873,
"step": 1168
},
{
"epoch": 1.516463909164639,
"grad_norm": 0.5995983481407166,
"learning_rate": 5.332217652208093e-06,
"loss": 0.6228233575820923,
"step": 1169
},
{
"epoch": 1.5177615571776155,
"grad_norm": 0.5965218544006348,
"learning_rate": 5.325083381622165e-06,
"loss": 0.6963210105895996,
"step": 1170
},
{
"epoch": 1.519059205190592,
"grad_norm": 0.5758861303329468,
"learning_rate": 5.317948446322999e-06,
"loss": 0.58036869764328,
"step": 1171
},
{
"epoch": 1.5203568532035685,
"grad_norm": 0.5857213139533997,
"learning_rate": 5.310812860899737e-06,
"loss": 0.6398880481719971,
"step": 1172
},
{
"epoch": 1.521654501216545,
"grad_norm": 0.706536054611206,
"learning_rate": 5.303676639942841e-06,
"loss": 0.6162217855453491,
"step": 1173
},
{
"epoch": 1.5229521492295215,
"grad_norm": 0.5781589150428772,
"learning_rate": 5.296539798044078e-06,
"loss": 0.6084649562835693,
"step": 1174
},
{
"epoch": 1.524249797242498,
"grad_norm": 0.5943130850791931,
"learning_rate": 5.289402349796484e-06,
"loss": 0.6497021913528442,
"step": 1175
},
{
"epoch": 1.5255474452554745,
"grad_norm": 0.5641393065452576,
"learning_rate": 5.282264309794334e-06,
"loss": 0.5834084749221802,
"step": 1176
},
{
"epoch": 1.526845093268451,
"grad_norm": 0.5564937591552734,
"learning_rate": 5.2751256926331115e-06,
"loss": 0.6279217004776001,
"step": 1177
},
{
"epoch": 1.5281427412814275,
"grad_norm": 0.5945193767547607,
"learning_rate": 5.267986512909484e-06,
"loss": 0.6333688497543335,
"step": 1178
},
{
"epoch": 1.529440389294404,
"grad_norm": 0.6081971526145935,
"learning_rate": 5.2608467852212665e-06,
"loss": 0.6803103685379028,
"step": 1179
},
{
"epoch": 1.5307380373073802,
"grad_norm": 0.584886908531189,
"learning_rate": 5.253706524167395e-06,
"loss": 0.6653470993041992,
"step": 1180
},
{
"epoch": 1.532035685320357,
"grad_norm": 0.8528439998626709,
"learning_rate": 5.246565744347894e-06,
"loss": 0.6093430519104004,
"step": 1181
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.573440432548523,
"learning_rate": 5.2394244603638536e-06,
"loss": 0.6251604557037354,
"step": 1182
},
{
"epoch": 1.53463098134631,
"grad_norm": 0.5646257996559143,
"learning_rate": 5.232282686817392e-06,
"loss": 0.5792976021766663,
"step": 1183
},
{
"epoch": 1.5359286293592862,
"grad_norm": 0.5741854310035706,
"learning_rate": 5.2251404383116265e-06,
"loss": 0.6484105587005615,
"step": 1184
},
{
"epoch": 1.537226277372263,
"grad_norm": 0.5606357455253601,
"learning_rate": 5.217997729450649e-06,
"loss": 0.6315451860427856,
"step": 1185
},
{
"epoch": 1.5385239253852392,
"grad_norm": 0.5854267477989197,
"learning_rate": 5.21085457483949e-06,
"loss": 0.6010391712188721,
"step": 1186
},
{
"epoch": 1.5398215733982157,
"grad_norm": 0.6120538711547852,
"learning_rate": 5.203710989084093e-06,
"loss": 0.6872812509536743,
"step": 1187
},
{
"epoch": 1.5411192214111922,
"grad_norm": 0.6018205881118774,
"learning_rate": 5.196566986791286e-06,
"loss": 0.6842239499092102,
"step": 1188
},
{
"epoch": 1.5424168694241687,
"grad_norm": 0.5673507452011108,
"learning_rate": 5.189422582568742e-06,
"loss": 0.6135258674621582,
"step": 1189
},
{
"epoch": 1.5437145174371452,
"grad_norm": 0.5736320614814758,
"learning_rate": 5.182277791024959e-06,
"loss": 0.6442878246307373,
"step": 1190
},
{
"epoch": 1.5450121654501217,
"grad_norm": 0.5806821584701538,
"learning_rate": 5.175132626769229e-06,
"loss": 0.6409611701965332,
"step": 1191
},
{
"epoch": 1.5463098134630981,
"grad_norm": 0.6098542213439941,
"learning_rate": 5.167987104411605e-06,
"loss": 0.6895368695259094,
"step": 1192
},
{
"epoch": 1.5476074614760746,
"grad_norm": 0.6138260364532471,
"learning_rate": 5.160841238562872e-06,
"loss": 0.6403982043266296,
"step": 1193
},
{
"epoch": 1.5489051094890511,
"grad_norm": 0.5820956826210022,
"learning_rate": 5.153695043834513e-06,
"loss": 0.6204026937484741,
"step": 1194
},
{
"epoch": 1.5502027575020276,
"grad_norm": 0.5773366093635559,
"learning_rate": 5.146548534838691e-06,
"loss": 0.645720899105072,
"step": 1195
},
{
"epoch": 1.5515004055150041,
"grad_norm": 0.5759880542755127,
"learning_rate": 5.139401726188208e-06,
"loss": 0.5854007601737976,
"step": 1196
},
{
"epoch": 1.5527980535279804,
"grad_norm": 0.584076464176178,
"learning_rate": 5.132254632496477e-06,
"loss": 0.662139892578125,
"step": 1197
},
{
"epoch": 1.554095701540957,
"grad_norm": 0.6095874905586243,
"learning_rate": 5.125107268377498e-06,
"loss": 0.6662768125534058,
"step": 1198
},
{
"epoch": 1.5553933495539334,
"grad_norm": 0.5676849484443665,
"learning_rate": 5.117959648445821e-06,
"loss": 0.593256413936615,
"step": 1199
},
{
"epoch": 1.55669099756691,
"grad_norm": 0.9843289852142334,
"learning_rate": 5.1108117873165175e-06,
"loss": 0.6919536590576172,
"step": 1200
},
{
"epoch": 1.5579886455798864,
"grad_norm": 0.5795591473579407,
"learning_rate": 5.1036636996051556e-06,
"loss": 0.6274605989456177,
"step": 1201
},
{
"epoch": 1.559286293592863,
"grad_norm": 0.5853375196456909,
"learning_rate": 5.096515399927767e-06,
"loss": 0.6070197820663452,
"step": 1202
},
{
"epoch": 1.5605839416058394,
"grad_norm": 0.6098043918609619,
"learning_rate": 5.089366902900813e-06,
"loss": 0.6619631052017212,
"step": 1203
},
{
"epoch": 1.5618815896188158,
"grad_norm": 0.6142205595970154,
"learning_rate": 5.082218223141162e-06,
"loss": 0.6737958192825317,
"step": 1204
},
{
"epoch": 1.5631792376317923,
"grad_norm": 0.5625759363174438,
"learning_rate": 5.075069375266055e-06,
"loss": 0.590381383895874,
"step": 1205
},
{
"epoch": 1.5644768856447688,
"grad_norm": 0.8771416544914246,
"learning_rate": 5.067920373893075e-06,
"loss": 0.5482794046401978,
"step": 1206
},
{
"epoch": 1.5657745336577453,
"grad_norm": 0.8982309699058533,
"learning_rate": 5.060771233640122e-06,
"loss": 0.6464008092880249,
"step": 1207
},
{
"epoch": 1.5670721816707218,
"grad_norm": 0.6009715795516968,
"learning_rate": 5.0536219691253776e-06,
"loss": 0.5735194683074951,
"step": 1208
},
{
"epoch": 1.5683698296836983,
"grad_norm": 0.5980544686317444,
"learning_rate": 5.046472594967279e-06,
"loss": 0.66939377784729,
"step": 1209
},
{
"epoch": 1.5696674776966748,
"grad_norm": 0.6162261962890625,
"learning_rate": 5.039323125784485e-06,
"loss": 0.6994204521179199,
"step": 1210
},
{
"epoch": 1.5709651257096513,
"grad_norm": 0.617485761642456,
"learning_rate": 5.0321735761958515e-06,
"loss": 0.633686363697052,
"step": 1211
},
{
"epoch": 1.5722627737226276,
"grad_norm": 0.5924466848373413,
"learning_rate": 5.025023960820399e-06,
"loss": 0.6124377250671387,
"step": 1212
},
{
"epoch": 1.5735604217356043,
"grad_norm": 0.6040006279945374,
"learning_rate": 5.01787429427728e-06,
"loss": 0.6491550207138062,
"step": 1213
},
{
"epoch": 1.5748580697485806,
"grad_norm": 0.5883519649505615,
"learning_rate": 5.010724591185752e-06,
"loss": 0.6150457262992859,
"step": 1214
},
{
"epoch": 1.5761557177615573,
"grad_norm": 0.6369590759277344,
"learning_rate": 5.003574866165149e-06,
"loss": 0.6079261898994446,
"step": 1215
},
{
"epoch": 1.5774533657745335,
"grad_norm": 0.6027874946594238,
"learning_rate": 4.9964251338348515e-06,
"loss": 0.6851716637611389,
"step": 1216
},
{
"epoch": 1.5787510137875103,
"grad_norm": 0.5862027406692505,
"learning_rate": 4.989275408814251e-06,
"loss": 0.5923515558242798,
"step": 1217
},
{
"epoch": 1.5800486618004865,
"grad_norm": 0.6328719854354858,
"learning_rate": 4.982125705722722e-06,
"loss": 0.6643452644348145,
"step": 1218
},
{
"epoch": 1.5813463098134632,
"grad_norm": 0.6100243330001831,
"learning_rate": 4.974976039179604e-06,
"loss": 0.6416760683059692,
"step": 1219
},
{
"epoch": 1.5826439578264395,
"grad_norm": 0.5908761620521545,
"learning_rate": 4.967826423804151e-06,
"loss": 0.643882155418396,
"step": 1220
},
{
"epoch": 1.583941605839416,
"grad_norm": 0.5938880443572998,
"learning_rate": 4.960676874215518e-06,
"loss": 0.6157772541046143,
"step": 1221
},
{
"epoch": 1.5852392538523925,
"grad_norm": 0.5930868983268738,
"learning_rate": 4.953527405032723e-06,
"loss": 0.5862378478050232,
"step": 1222
},
{
"epoch": 1.586536901865369,
"grad_norm": 0.589255690574646,
"learning_rate": 4.946378030874625e-06,
"loss": 0.6135423183441162,
"step": 1223
},
{
"epoch": 1.5878345498783455,
"grad_norm": 0.5754698514938354,
"learning_rate": 4.9392287663598785e-06,
"loss": 0.6066054701805115,
"step": 1224
},
{
"epoch": 1.589132197891322,
"grad_norm": 0.6168340444564819,
"learning_rate": 4.932079626106926e-06,
"loss": 0.683946967124939,
"step": 1225
},
{
"epoch": 1.5904298459042985,
"grad_norm": 0.5985932350158691,
"learning_rate": 4.924930624733947e-06,
"loss": 0.6772314310073853,
"step": 1226
},
{
"epoch": 1.591727493917275,
"grad_norm": 0.6024285554885864,
"learning_rate": 4.91778177685884e-06,
"loss": 0.652093768119812,
"step": 1227
},
{
"epoch": 1.5930251419302515,
"grad_norm": 0.6394546627998352,
"learning_rate": 4.910633097099188e-06,
"loss": 0.6307955384254456,
"step": 1228
},
{
"epoch": 1.5943227899432277,
"grad_norm": 0.5471766591072083,
"learning_rate": 4.903484600072236e-06,
"loss": 0.5805978775024414,
"step": 1229
},
{
"epoch": 1.5956204379562045,
"grad_norm": 0.5722350478172302,
"learning_rate": 4.896336300394845e-06,
"loss": 0.6355024576187134,
"step": 1230
},
{
"epoch": 1.5969180859691807,
"grad_norm": 0.6039298176765442,
"learning_rate": 4.889188212683483e-06,
"loss": 0.6441288590431213,
"step": 1231
},
{
"epoch": 1.5982157339821574,
"grad_norm": 0.6237229704856873,
"learning_rate": 4.882040351554181e-06,
"loss": 0.6681591272354126,
"step": 1232
},
{
"epoch": 1.5995133819951337,
"grad_norm": 0.6051374673843384,
"learning_rate": 4.874892731622503e-06,
"loss": 0.6615642309188843,
"step": 1233
},
{
"epoch": 1.6008110300081104,
"grad_norm": 0.5937628746032715,
"learning_rate": 4.867745367503524e-06,
"loss": 0.6506084203720093,
"step": 1234
},
{
"epoch": 1.6021086780210867,
"grad_norm": 0.5851325988769531,
"learning_rate": 4.860598273811793e-06,
"loss": 0.6443929076194763,
"step": 1235
},
{
"epoch": 1.6034063260340634,
"grad_norm": 0.5777382850646973,
"learning_rate": 4.8534514651613104e-06,
"loss": 0.635840892791748,
"step": 1236
},
{
"epoch": 1.6047039740470397,
"grad_norm": 0.5909644365310669,
"learning_rate": 4.846304956165488e-06,
"loss": 0.6581849455833435,
"step": 1237
},
{
"epoch": 1.6060016220600162,
"grad_norm": 0.5992142558097839,
"learning_rate": 4.83915876143713e-06,
"loss": 0.6690875291824341,
"step": 1238
},
{
"epoch": 1.6072992700729927,
"grad_norm": 1.2001910209655762,
"learning_rate": 4.832012895588395e-06,
"loss": 0.6264456510543823,
"step": 1239
},
{
"epoch": 1.6085969180859692,
"grad_norm": 0.6141691207885742,
"learning_rate": 4.824867373230772e-06,
"loss": 0.670561671257019,
"step": 1240
},
{
"epoch": 1.6098945660989457,
"grad_norm": 0.5834086537361145,
"learning_rate": 4.817722208975041e-06,
"loss": 0.6045785546302795,
"step": 1241
},
{
"epoch": 1.6111922141119221,
"grad_norm": 0.6060406565666199,
"learning_rate": 4.81057741743126e-06,
"loss": 0.5803914666175842,
"step": 1242
},
{
"epoch": 1.6124898621248986,
"grad_norm": 0.5703381299972534,
"learning_rate": 4.8034330132087155e-06,
"loss": 0.6377118825912476,
"step": 1243
},
{
"epoch": 1.6137875101378751,
"grad_norm": 0.6010227203369141,
"learning_rate": 4.7962890109159085e-06,
"loss": 0.6981620788574219,
"step": 1244
},
{
"epoch": 1.6150851581508516,
"grad_norm": 0.6107721924781799,
"learning_rate": 4.789145425160511e-06,
"loss": 0.6511063575744629,
"step": 1245
},
{
"epoch": 1.616382806163828,
"grad_norm": 0.5982344150543213,
"learning_rate": 4.782002270549354e-06,
"loss": 0.6058223247528076,
"step": 1246
},
{
"epoch": 1.6176804541768046,
"grad_norm": 0.7359511256217957,
"learning_rate": 4.774859561688374e-06,
"loss": 0.7255959510803223,
"step": 1247
},
{
"epoch": 1.6189781021897809,
"grad_norm": 0.6240600347518921,
"learning_rate": 4.767717313182611e-06,
"loss": 0.695855975151062,
"step": 1248
},
{
"epoch": 1.6202757502027576,
"grad_norm": 0.6217120885848999,
"learning_rate": 4.760575539636147e-06,
"loss": 0.7245144844055176,
"step": 1249
},
{
"epoch": 1.6215733982157339,
"grad_norm": 0.6095402240753174,
"learning_rate": 4.753434255652108e-06,
"loss": 0.6345319151878357,
"step": 1250
},
{
"epoch": 1.6228710462287106,
"grad_norm": 0.5852973461151123,
"learning_rate": 4.746293475832607e-06,
"loss": 0.7055230736732483,
"step": 1251
},
{
"epoch": 1.6241686942416869,
"grad_norm": 0.5857930779457092,
"learning_rate": 4.739153214778735e-06,
"loss": 0.611079216003418,
"step": 1252
},
{
"epoch": 1.6254663422546636,
"grad_norm": 0.5896874070167542,
"learning_rate": 4.732013487090517e-06,
"loss": 0.6760262250900269,
"step": 1253
},
{
"epoch": 1.6267639902676398,
"grad_norm": 0.5715303421020508,
"learning_rate": 4.72487430736689e-06,
"loss": 0.6258687376976013,
"step": 1254
},
{
"epoch": 1.6280616382806163,
"grad_norm": 0.6083521246910095,
"learning_rate": 4.7177356902056675e-06,
"loss": 0.6745297908782959,
"step": 1255
},
{
"epoch": 1.6293592862935928,
"grad_norm": 0.5798436403274536,
"learning_rate": 4.7105976502035175e-06,
"loss": 0.5955469608306885,
"step": 1256
},
{
"epoch": 1.6306569343065693,
"grad_norm": 0.5836136341094971,
"learning_rate": 4.703460201955924e-06,
"loss": 0.6397416591644287,
"step": 1257
},
{
"epoch": 1.6319545823195458,
"grad_norm": 0.5983015894889832,
"learning_rate": 4.696323360057162e-06,
"loss": 0.6736359596252441,
"step": 1258
},
{
"epoch": 1.6332522303325223,
"grad_norm": 0.5725530982017517,
"learning_rate": 4.689187139100265e-06,
"loss": 0.6878089904785156,
"step": 1259
},
{
"epoch": 1.6345498783454988,
"grad_norm": 0.5805061459541321,
"learning_rate": 4.682051553677001e-06,
"loss": 0.6194028854370117,
"step": 1260
},
{
"epoch": 1.6358475263584753,
"grad_norm": 0.6036574840545654,
"learning_rate": 4.6749166183778375e-06,
"loss": 0.634255588054657,
"step": 1261
},
{
"epoch": 1.6371451743714518,
"grad_norm": 0.8983334898948669,
"learning_rate": 4.667782347791908e-06,
"loss": 0.6297205686569214,
"step": 1262
},
{
"epoch": 1.638442822384428,
"grad_norm": 0.5956529378890991,
"learning_rate": 4.660648756506993e-06,
"loss": 0.6427313089370728,
"step": 1263
},
{
"epoch": 1.6397404703974048,
"grad_norm": 0.5881230235099792,
"learning_rate": 4.653515859109478e-06,
"loss": 0.6450825929641724,
"step": 1264
},
{
"epoch": 1.641038118410381,
"grad_norm": 0.5867661833763123,
"learning_rate": 4.646383670184336e-06,
"loss": 0.6814026832580566,
"step": 1265
},
{
"epoch": 1.6423357664233578,
"grad_norm": 0.6160328388214111,
"learning_rate": 4.639252204315086e-06,
"loss": 0.6689074039459229,
"step": 1266
},
{
"epoch": 1.643633414436334,
"grad_norm": 0.582465410232544,
"learning_rate": 4.632121476083772e-06,
"loss": 0.6467956304550171,
"step": 1267
},
{
"epoch": 1.6449310624493108,
"grad_norm": 0.5506557822227478,
"learning_rate": 4.624991500070925e-06,
"loss": 0.6649973392486572,
"step": 1268
},
{
"epoch": 1.646228710462287,
"grad_norm": 0.600159227848053,
"learning_rate": 4.617862290855548e-06,
"loss": 0.6144022345542908,
"step": 1269
},
{
"epoch": 1.6475263584752637,
"grad_norm": 1.0451817512512207,
"learning_rate": 4.610733863015063e-06,
"loss": 0.6827117800712585,
"step": 1270
},
{
"epoch": 1.64882400648824,
"grad_norm": 0.5652205944061279,
"learning_rate": 4.6036062311253055e-06,
"loss": 0.5971782207489014,
"step": 1271
},
{
"epoch": 1.6501216545012165,
"grad_norm": 0.686071515083313,
"learning_rate": 4.596479409760474e-06,
"loss": 0.5615164041519165,
"step": 1272
},
{
"epoch": 1.651419302514193,
"grad_norm": 0.5449540019035339,
"learning_rate": 4.589353413493118e-06,
"loss": 0.6300219297409058,
"step": 1273
},
{
"epoch": 1.6527169505271695,
"grad_norm": 0.6144797205924988,
"learning_rate": 4.582228256894093e-06,
"loss": 0.6373116970062256,
"step": 1274
},
{
"epoch": 1.654014598540146,
"grad_norm": 0.6170778274536133,
"learning_rate": 4.575103954532547e-06,
"loss": 0.6746265888214111,
"step": 1275
},
{
"epoch": 1.6553122465531225,
"grad_norm": 0.5726920366287231,
"learning_rate": 4.567980520975867e-06,
"loss": 0.598582923412323,
"step": 1276
},
{
"epoch": 1.656609894566099,
"grad_norm": 0.59462571144104,
"learning_rate": 4.560857970789679e-06,
"loss": 0.5969716906547546,
"step": 1277
},
{
"epoch": 1.6579075425790755,
"grad_norm": 0.5755953192710876,
"learning_rate": 4.553736318537789e-06,
"loss": 0.6542321443557739,
"step": 1278
},
{
"epoch": 1.659205190592052,
"grad_norm": 0.6138618588447571,
"learning_rate": 4.546615578782178e-06,
"loss": 0.6415365934371948,
"step": 1279
},
{
"epoch": 1.6605028386050282,
"grad_norm": 0.5503448247909546,
"learning_rate": 4.5394957660829554e-06,
"loss": 0.6184664964675903,
"step": 1280
},
{
"epoch": 1.661800486618005,
"grad_norm": 0.5893129110336304,
"learning_rate": 4.532376894998335e-06,
"loss": 0.6324410438537598,
"step": 1281
},
{
"epoch": 1.6630981346309812,
"grad_norm": 0.6124705672264099,
"learning_rate": 4.5252589800846054e-06,
"loss": 0.6756390333175659,
"step": 1282
},
{
"epoch": 1.664395782643958,
"grad_norm": 0.598412275314331,
"learning_rate": 4.518142035896106e-06,
"loss": 0.7126625776290894,
"step": 1283
},
{
"epoch": 1.6656934306569342,
"grad_norm": 0.599096417427063,
"learning_rate": 4.5110260769851804e-06,
"loss": 0.6402862071990967,
"step": 1284
},
{
"epoch": 1.666991078669911,
"grad_norm": 0.5952857136726379,
"learning_rate": 4.503911117902167e-06,
"loss": 0.6510819792747498,
"step": 1285
},
{
"epoch": 1.6682887266828872,
"grad_norm": 0.5893689393997192,
"learning_rate": 4.496797173195354e-06,
"loss": 0.6236964464187622,
"step": 1286
},
{
"epoch": 1.669586374695864,
"grad_norm": 0.5871599316596985,
"learning_rate": 4.489684257410959e-06,
"loss": 0.6143825054168701,
"step": 1287
},
{
"epoch": 1.6708840227088402,
"grad_norm": 0.5756003260612488,
"learning_rate": 4.482572385093096e-06,
"loss": 0.6664775609970093,
"step": 1288
},
{
"epoch": 1.6721816707218167,
"grad_norm": 0.6174732446670532,
"learning_rate": 4.475461570783741e-06,
"loss": 0.6171724200248718,
"step": 1289
},
{
"epoch": 1.6734793187347932,
"grad_norm": 0.6114921569824219,
"learning_rate": 4.468351829022713e-06,
"loss": 0.7615275382995605,
"step": 1290
},
{
"epoch": 1.6747769667477697,
"grad_norm": 0.6558356285095215,
"learning_rate": 4.46124317434763e-06,
"loss": 0.6879911422729492,
"step": 1291
},
{
"epoch": 1.6760746147607462,
"grad_norm": 0.5599299669265747,
"learning_rate": 4.454135621293895e-06,
"loss": 0.6413300633430481,
"step": 1292
},
{
"epoch": 1.6773722627737226,
"grad_norm": 0.5664532780647278,
"learning_rate": 4.447029184394654e-06,
"loss": 0.5328360795974731,
"step": 1293
},
{
"epoch": 1.6786699107866991,
"grad_norm": 0.5689435005187988,
"learning_rate": 4.439923878180772e-06,
"loss": 0.6179879903793335,
"step": 1294
},
{
"epoch": 1.6799675587996756,
"grad_norm": 0.7659060955047607,
"learning_rate": 4.4328197171808e-06,
"loss": 0.6246920824050903,
"step": 1295
},
{
"epoch": 1.6812652068126521,
"grad_norm": 0.5884883403778076,
"learning_rate": 4.425716715920952e-06,
"loss": 0.6561876535415649,
"step": 1296
},
{
"epoch": 1.6825628548256284,
"grad_norm": 0.604040801525116,
"learning_rate": 4.418614888925064e-06,
"loss": 0.6797306537628174,
"step": 1297
},
{
"epoch": 1.683860502838605,
"grad_norm": 0.6084474921226501,
"learning_rate": 4.4115142507145806e-06,
"loss": 0.6703431606292725,
"step": 1298
},
{
"epoch": 1.6851581508515814,
"grad_norm": 0.5863416790962219,
"learning_rate": 4.4044148158085046e-06,
"loss": 0.6162433624267578,
"step": 1299
},
{
"epoch": 1.686455798864558,
"grad_norm": 0.6356022953987122,
"learning_rate": 4.397316598723385e-06,
"loss": 0.7044586539268494,
"step": 1300
},
{
"epoch": 1.6877534468775344,
"grad_norm": 0.625541627407074,
"learning_rate": 4.39021961397328e-06,
"loss": 0.6772735714912415,
"step": 1301
},
{
"epoch": 1.689051094890511,
"grad_norm": 0.6222056746482849,
"learning_rate": 4.383123876069726e-06,
"loss": 0.6994260549545288,
"step": 1302
},
{
"epoch": 1.6903487429034874,
"grad_norm": 0.6140106916427612,
"learning_rate": 4.376029399521711e-06,
"loss": 0.6723775863647461,
"step": 1303
},
{
"epoch": 1.691646390916464,
"grad_norm": 0.665780782699585,
"learning_rate": 4.368936198835646e-06,
"loss": 0.6295307278633118,
"step": 1304
},
{
"epoch": 1.6929440389294403,
"grad_norm": 0.5935512781143188,
"learning_rate": 4.361844288515327e-06,
"loss": 0.6478678584098816,
"step": 1305
},
{
"epoch": 1.6942416869424168,
"grad_norm": 0.6001803874969482,
"learning_rate": 4.354753683061921e-06,
"loss": 0.6501032710075378,
"step": 1306
},
{
"epoch": 1.6955393349553933,
"grad_norm": 0.5884422063827515,
"learning_rate": 4.347664396973917e-06,
"loss": 0.5854666829109192,
"step": 1307
},
{
"epoch": 1.6968369829683698,
"grad_norm": 0.5774276256561279,
"learning_rate": 4.340576444747114e-06,
"loss": 0.6706461906433105,
"step": 1308
},
{
"epoch": 1.6981346309813463,
"grad_norm": 0.6317939162254333,
"learning_rate": 4.333489840874575e-06,
"loss": 0.6367801427841187,
"step": 1309
},
{
"epoch": 1.6994322789943228,
"grad_norm": 0.5990278720855713,
"learning_rate": 4.326404599846618e-06,
"loss": 0.6113296747207642,
"step": 1310
},
{
"epoch": 1.7007299270072993,
"grad_norm": 0.5930926203727722,
"learning_rate": 4.319320736150762e-06,
"loss": 0.658935546875,
"step": 1311
},
{
"epoch": 1.7020275750202758,
"grad_norm": 0.5893100500106812,
"learning_rate": 4.3122382642717196e-06,
"loss": 0.6707964539527893,
"step": 1312
},
{
"epoch": 1.7033252230332523,
"grad_norm": 0.6219534277915955,
"learning_rate": 4.305157198691351e-06,
"loss": 0.6915128231048584,
"step": 1313
},
{
"epoch": 1.7046228710462286,
"grad_norm": 0.5844510197639465,
"learning_rate": 4.298077553888644e-06,
"loss": 0.6463670134544373,
"step": 1314
},
{
"epoch": 1.7059205190592053,
"grad_norm": 0.590699315071106,
"learning_rate": 4.290999344339678e-06,
"loss": 0.6447714567184448,
"step": 1315
},
{
"epoch": 1.7072181670721815,
"grad_norm": 0.7812482714653015,
"learning_rate": 4.283922584517603e-06,
"loss": 0.6600894927978516,
"step": 1316
},
{
"epoch": 1.7085158150851583,
"grad_norm": 0.5863601565361023,
"learning_rate": 4.276847288892601e-06,
"loss": 0.6242765784263611,
"step": 1317
},
{
"epoch": 1.7098134630981345,
"grad_norm": 0.5812450647354126,
"learning_rate": 4.269773471931858e-06,
"loss": 0.6475106477737427,
"step": 1318
},
{
"epoch": 1.7111111111111112,
"grad_norm": 0.5987546443939209,
"learning_rate": 4.262701148099544e-06,
"loss": 0.6834150552749634,
"step": 1319
},
{
"epoch": 1.7124087591240875,
"grad_norm": 0.5713450312614441,
"learning_rate": 4.255630331856768e-06,
"loss": 0.5877612829208374,
"step": 1320
},
{
"epoch": 1.7137064071370642,
"grad_norm": 0.5582994818687439,
"learning_rate": 4.248561037661561e-06,
"loss": 0.5848795175552368,
"step": 1321
},
{
"epoch": 1.7150040551500405,
"grad_norm": 0.5713660717010498,
"learning_rate": 4.241493279968838e-06,
"loss": 0.6386708617210388,
"step": 1322
},
{
"epoch": 1.716301703163017,
"grad_norm": 0.5673105716705322,
"learning_rate": 4.234427073230377e-06,
"loss": 0.6179746389389038,
"step": 1323
},
{
"epoch": 1.7175993511759935,
"grad_norm": 0.5679452419281006,
"learning_rate": 4.22736243189478e-06,
"loss": 0.641147255897522,
"step": 1324
},
{
"epoch": 1.71889699918897,
"grad_norm": 0.608302652835846,
"learning_rate": 4.220299370407454e-06,
"loss": 0.6888396143913269,
"step": 1325
},
{
"epoch": 1.7201946472019465,
"grad_norm": 0.5650665163993835,
"learning_rate": 4.2132379032105695e-06,
"loss": 0.651650607585907,
"step": 1326
},
{
"epoch": 1.721492295214923,
"grad_norm": 0.561650812625885,
"learning_rate": 4.206178044743041e-06,
"loss": 0.6115202307701111,
"step": 1327
},
{
"epoch": 1.7227899432278995,
"grad_norm": 0.5860607624053955,
"learning_rate": 4.19911980944049e-06,
"loss": 0.6547002792358398,
"step": 1328
},
{
"epoch": 1.724087591240876,
"grad_norm": 0.7003436088562012,
"learning_rate": 4.1920632117352235e-06,
"loss": 0.6392462253570557,
"step": 1329
},
{
"epoch": 1.7253852392538525,
"grad_norm": 0.5677862763404846,
"learning_rate": 4.185008266056195e-06,
"loss": 0.5821945667266846,
"step": 1330
},
{
"epoch": 1.7266828872668287,
"grad_norm": 0.587795615196228,
"learning_rate": 4.177954986828987e-06,
"loss": 0.6519031524658203,
"step": 1331
},
{
"epoch": 1.7279805352798054,
"grad_norm": 0.5895066857337952,
"learning_rate": 4.170903388475766e-06,
"loss": 0.6622262597084045,
"step": 1332
},
{
"epoch": 1.7292781832927817,
"grad_norm": 0.5911295413970947,
"learning_rate": 4.163853485415269e-06,
"loss": 0.6385645866394043,
"step": 1333
},
{
"epoch": 1.7305758313057584,
"grad_norm": 0.6040472984313965,
"learning_rate": 4.156805292062762e-06,
"loss": 0.6997763514518738,
"step": 1334
},
{
"epoch": 1.7318734793187347,
"grad_norm": 0.6030855178833008,
"learning_rate": 4.1497588228300165e-06,
"loss": 0.6099704504013062,
"step": 1335
},
{
"epoch": 1.7331711273317114,
"grad_norm": 0.5850874781608582,
"learning_rate": 4.142714092125277e-06,
"loss": 0.5748507380485535,
"step": 1336
},
{
"epoch": 1.7344687753446877,
"grad_norm": 0.5881203413009644,
"learning_rate": 4.135671114353239e-06,
"loss": 0.6896364688873291,
"step": 1337
},
{
"epoch": 1.7357664233576642,
"grad_norm": 0.5428244471549988,
"learning_rate": 4.128629903915004e-06,
"loss": 0.5673160552978516,
"step": 1338
},
{
"epoch": 1.7370640713706407,
"grad_norm": 0.6348845362663269,
"learning_rate": 4.121590475208071e-06,
"loss": 0.6452966928482056,
"step": 1339
},
{
"epoch": 1.7383617193836172,
"grad_norm": 0.5799127221107483,
"learning_rate": 4.114552842626285e-06,
"loss": 0.626937747001648,
"step": 1340
},
{
"epoch": 1.7396593673965937,
"grad_norm": 0.5999795198440552,
"learning_rate": 4.107517020559827e-06,
"loss": 0.6316832900047302,
"step": 1341
},
{
"epoch": 1.7409570154095702,
"grad_norm": 3.404263734817505,
"learning_rate": 4.1004830233951696e-06,
"loss": 0.6446040868759155,
"step": 1342
},
{
"epoch": 1.7422546634225466,
"grad_norm": 0.5750575661659241,
"learning_rate": 4.0934508655150585e-06,
"loss": 0.6410173177719116,
"step": 1343
},
{
"epoch": 1.7435523114355231,
"grad_norm": 0.612946093082428,
"learning_rate": 4.086420561298476e-06,
"loss": 0.7256200313568115,
"step": 1344
},
{
"epoch": 1.7448499594484996,
"grad_norm": 0.5811024904251099,
"learning_rate": 4.079392125120613e-06,
"loss": 0.6546262502670288,
"step": 1345
},
{
"epoch": 1.7461476074614761,
"grad_norm": 0.6089962124824524,
"learning_rate": 4.072365571352847e-06,
"loss": 0.5643700957298279,
"step": 1346
},
{
"epoch": 1.7474452554744526,
"grad_norm": 0.5598763227462769,
"learning_rate": 4.065340914362697e-06,
"loss": 0.6210203170776367,
"step": 1347
},
{
"epoch": 1.748742903487429,
"grad_norm": 0.5718949437141418,
"learning_rate": 4.058318168513813e-06,
"loss": 0.6246052980422974,
"step": 1348
},
{
"epoch": 1.7500405515004056,
"grad_norm": 0.5816182494163513,
"learning_rate": 4.05129734816593e-06,
"loss": 0.6502724289894104,
"step": 1349
},
{
"epoch": 1.7513381995133819,
"grad_norm": 0.6006066799163818,
"learning_rate": 4.04427846767485e-06,
"loss": 0.6196832060813904,
"step": 1350
},
{
"epoch": 1.7526358475263586,
"grad_norm": 0.6209701299667358,
"learning_rate": 4.037261541392405e-06,
"loss": 0.6615033149719238,
"step": 1351
},
{
"epoch": 1.7539334955393349,
"grad_norm": 0.5778906345367432,
"learning_rate": 4.030246583666437e-06,
"loss": 0.600303053855896,
"step": 1352
},
{
"epoch": 1.7552311435523116,
"grad_norm": 0.5654350519180298,
"learning_rate": 4.023233608840755e-06,
"loss": 0.6526889801025391,
"step": 1353
},
{
"epoch": 1.7565287915652879,
"grad_norm": 0.604720413684845,
"learning_rate": 4.016222631255121e-06,
"loss": 0.6632093191146851,
"step": 1354
},
{
"epoch": 1.7578264395782643,
"grad_norm": 0.5776406526565552,
"learning_rate": 4.0092136652452054e-06,
"loss": 0.5856695175170898,
"step": 1355
},
{
"epoch": 1.7591240875912408,
"grad_norm": 0.5833093523979187,
"learning_rate": 4.0022067251425736e-06,
"loss": 0.7012051939964294,
"step": 1356
},
{
"epoch": 1.7604217356042173,
"grad_norm": 0.6321353912353516,
"learning_rate": 3.9952018252746424e-06,
"loss": 0.6692728996276855,
"step": 1357
},
{
"epoch": 1.7617193836171938,
"grad_norm": 0.5867600440979004,
"learning_rate": 3.988198979964662e-06,
"loss": 0.6333553791046143,
"step": 1358
},
{
"epoch": 1.7630170316301703,
"grad_norm": 0.5640849471092224,
"learning_rate": 3.981198203531673e-06,
"loss": 0.6600401401519775,
"step": 1359
},
{
"epoch": 1.7643146796431468,
"grad_norm": 0.5749746561050415,
"learning_rate": 3.974199510290498e-06,
"loss": 0.600135087966919,
"step": 1360
},
{
"epoch": 1.7656123276561233,
"grad_norm": 0.6021872162818909,
"learning_rate": 3.967202914551688e-06,
"loss": 0.6514877676963806,
"step": 1361
},
{
"epoch": 1.7669099756690998,
"grad_norm": 1.1252561807632446,
"learning_rate": 3.960208430621514e-06,
"loss": 0.6247175931930542,
"step": 1362
},
{
"epoch": 1.7682076236820763,
"grad_norm": 0.6089026927947998,
"learning_rate": 3.953216072801922e-06,
"loss": 0.6505289077758789,
"step": 1363
},
{
"epoch": 1.7695052716950528,
"grad_norm": 0.613433301448822,
"learning_rate": 3.946225855390518e-06,
"loss": 0.6519597768783569,
"step": 1364
},
{
"epoch": 1.770802919708029,
"grad_norm": 0.6230673789978027,
"learning_rate": 3.9392377926805226e-06,
"loss": 0.6527152061462402,
"step": 1365
},
{
"epoch": 1.7721005677210058,
"grad_norm": 0.629035472869873,
"learning_rate": 3.932251898960759e-06,
"loss": 0.6801344156265259,
"step": 1366
},
{
"epoch": 1.773398215733982,
"grad_norm": 0.586634635925293,
"learning_rate": 3.925268188515611e-06,
"loss": 0.6678798794746399,
"step": 1367
},
{
"epoch": 1.7746958637469588,
"grad_norm": 0.691630482673645,
"learning_rate": 3.918286675624998e-06,
"loss": 0.6675139665603638,
"step": 1368
},
{
"epoch": 1.775993511759935,
"grad_norm": 0.5624348521232605,
"learning_rate": 3.911307374564346e-06,
"loss": 0.5508803129196167,
"step": 1369
},
{
"epoch": 1.7772911597729117,
"grad_norm": 0.9164373874664307,
"learning_rate": 3.904330299604562e-06,
"loss": 0.6670984625816345,
"step": 1370
},
{
"epoch": 1.778588807785888,
"grad_norm": 0.620689868927002,
"learning_rate": 3.897355465011996e-06,
"loss": 0.6593863368034363,
"step": 1371
},
{
"epoch": 1.7798864557988645,
"grad_norm": 0.5467659831047058,
"learning_rate": 3.89038288504842e-06,
"loss": 0.5556522607803345,
"step": 1372
},
{
"epoch": 1.781184103811841,
"grad_norm": 0.5498706698417664,
"learning_rate": 3.883412573970995e-06,
"loss": 0.6222935914993286,
"step": 1373
},
{
"epoch": 1.7824817518248175,
"grad_norm": 0.5786144137382507,
"learning_rate": 3.876444546032242e-06,
"loss": 0.6003856658935547,
"step": 1374
},
{
"epoch": 1.783779399837794,
"grad_norm": 0.5900736451148987,
"learning_rate": 3.8694788154800185e-06,
"loss": 0.6151521801948547,
"step": 1375
},
{
"epoch": 1.7850770478507705,
"grad_norm": 0.5880241394042969,
"learning_rate": 3.862515396557476e-06,
"loss": 0.6527180671691895,
"step": 1376
},
{
"epoch": 1.786374695863747,
"grad_norm": 0.6083548069000244,
"learning_rate": 3.855554303503047e-06,
"loss": 0.6581445932388306,
"step": 1377
},
{
"epoch": 1.7876723438767235,
"grad_norm": 0.5609106421470642,
"learning_rate": 3.848595550550401e-06,
"loss": 0.6590725779533386,
"step": 1378
},
{
"epoch": 1.7889699918897,
"grad_norm": 0.6204782724380493,
"learning_rate": 3.841639151928431e-06,
"loss": 0.6809993386268616,
"step": 1379
},
{
"epoch": 1.7902676399026762,
"grad_norm": 0.5831668972969055,
"learning_rate": 3.834685121861208e-06,
"loss": 0.6498827934265137,
"step": 1380
},
{
"epoch": 1.7902676399026762,
"eval_loss": 0.6777992248535156,
"eval_runtime": 73.0192,
"eval_samples_per_second": 71.105,
"eval_steps_per_second": 8.888,
"step": 1380
},
{
"epoch": 1.791565287915653,
"grad_norm": 0.5954435467720032,
"learning_rate": 3.827733474567966e-06,
"loss": 0.6496779322624207,
"step": 1381
},
{
"epoch": 1.7928629359286292,
"grad_norm": 0.5471308827400208,
"learning_rate": 3.820784224263061e-06,
"loss": 0.5941118001937866,
"step": 1382
},
{
"epoch": 1.794160583941606,
"grad_norm": 0.5896412134170532,
"learning_rate": 3.8138373851559546e-06,
"loss": 0.6255256533622742,
"step": 1383
},
{
"epoch": 1.7954582319545822,
"grad_norm": 0.9544134736061096,
"learning_rate": 3.8068929714511716e-06,
"loss": 0.6434448957443237,
"step": 1384
},
{
"epoch": 1.796755879967559,
"grad_norm": 0.5609217882156372,
"learning_rate": 3.799950997348283e-06,
"loss": 0.6087275743484497,
"step": 1385
},
{
"epoch": 1.7980535279805352,
"grad_norm": 4.44458532333374,
"learning_rate": 3.7930114770418654e-06,
"loss": 0.5713160037994385,
"step": 1386
},
{
"epoch": 1.799351175993512,
"grad_norm": 0.5974010825157166,
"learning_rate": 3.7860744247214853e-06,
"loss": 0.6058465838432312,
"step": 1387
},
{
"epoch": 1.8006488240064882,
"grad_norm": 0.5761491060256958,
"learning_rate": 3.7791398545716552e-06,
"loss": 0.619678258895874,
"step": 1388
},
{
"epoch": 1.8019464720194647,
"grad_norm": 1.2458136081695557,
"learning_rate": 3.7722077807718193e-06,
"loss": 0.6886736750602722,
"step": 1389
},
{
"epoch": 1.8032441200324412,
"grad_norm": 0.6204317212104797,
"learning_rate": 3.7652782174963107e-06,
"loss": 0.6285656690597534,
"step": 1390
},
{
"epoch": 1.8045417680454177,
"grad_norm": 0.5791151523590088,
"learning_rate": 3.758351178914336e-06,
"loss": 0.6601356267929077,
"step": 1391
},
{
"epoch": 1.8058394160583942,
"grad_norm": 0.5656175017356873,
"learning_rate": 3.7514266791899324e-06,
"loss": 0.5828202962875366,
"step": 1392
},
{
"epoch": 1.8071370640713706,
"grad_norm": 0.6195251941680908,
"learning_rate": 3.7445047324819517e-06,
"loss": 0.7079391479492188,
"step": 1393
},
{
"epoch": 1.8084347120843471,
"grad_norm": 0.5826953649520874,
"learning_rate": 3.737585352944021e-06,
"loss": 0.6261759996414185,
"step": 1394
},
{
"epoch": 1.8097323600973236,
"grad_norm": 0.6581652760505676,
"learning_rate": 3.7306685547245225e-06,
"loss": 0.6573713421821594,
"step": 1395
},
{
"epoch": 1.8110300081103001,
"grad_norm": 0.5666741728782654,
"learning_rate": 3.7237543519665543e-06,
"loss": 0.621452808380127,
"step": 1396
},
{
"epoch": 1.8123276561232764,
"grad_norm": 0.5948919057846069,
"learning_rate": 3.7168427588079153e-06,
"loss": 0.6522223353385925,
"step": 1397
},
{
"epoch": 1.8136253041362531,
"grad_norm": 0.5332669615745544,
"learning_rate": 3.7099337893810593e-06,
"loss": 0.650192141532898,
"step": 1398
},
{
"epoch": 1.8149229521492294,
"grad_norm": 0.599592924118042,
"learning_rate": 3.703027457813086e-06,
"loss": 0.6094880700111389,
"step": 1399
},
{
"epoch": 1.816220600162206,
"grad_norm": 0.6047189235687256,
"learning_rate": 3.696123778225691e-06,
"loss": 0.6866611838340759,
"step": 1400
},
{
"epoch": 1.8175182481751824,
"grad_norm": 0.7004641890525818,
"learning_rate": 3.6892227647351515e-06,
"loss": 0.6755614280700684,
"step": 1401
},
{
"epoch": 1.818815896188159,
"grad_norm": 0.5989522933959961,
"learning_rate": 3.6823244314522966e-06,
"loss": 0.6946245431900024,
"step": 1402
},
{
"epoch": 1.8201135442011354,
"grad_norm": 0.579132080078125,
"learning_rate": 3.67542879248247e-06,
"loss": 0.6097831726074219,
"step": 1403
},
{
"epoch": 1.821411192214112,
"grad_norm": 0.577029287815094,
"learning_rate": 3.668535861925509e-06,
"loss": 0.6218363046646118,
"step": 1404
},
{
"epoch": 1.8227088402270883,
"grad_norm": 0.6415956020355225,
"learning_rate": 3.661645653875709e-06,
"loss": 0.6793798208236694,
"step": 1405
},
{
"epoch": 1.8240064882400648,
"grad_norm": 0.603378415107727,
"learning_rate": 3.6547581824218057e-06,
"loss": 0.5855191946029663,
"step": 1406
},
{
"epoch": 1.8253041362530413,
"grad_norm": 0.6317605376243591,
"learning_rate": 3.6478734616469324e-06,
"loss": 0.6648485660552979,
"step": 1407
},
{
"epoch": 1.8266017842660178,
"grad_norm": 0.5663666725158691,
"learning_rate": 3.6409915056286017e-06,
"loss": 0.6257850527763367,
"step": 1408
},
{
"epoch": 1.8278994322789943,
"grad_norm": 0.8109258413314819,
"learning_rate": 3.6341123284386694e-06,
"loss": 0.6545461416244507,
"step": 1409
},
{
"epoch": 1.8291970802919708,
"grad_norm": 0.6355454325675964,
"learning_rate": 3.627235944143315e-06,
"loss": 0.68341463804245,
"step": 1410
},
{
"epoch": 1.8304947283049473,
"grad_norm": 0.5834214091300964,
"learning_rate": 3.620362366803001e-06,
"loss": 0.6818444728851318,
"step": 1411
},
{
"epoch": 1.8317923763179238,
"grad_norm": 0.5867376327514648,
"learning_rate": 3.6134916104724573e-06,
"loss": 0.6132810115814209,
"step": 1412
},
{
"epoch": 1.8330900243309003,
"grad_norm": 0.5869424343109131,
"learning_rate": 3.606623689200637e-06,
"loss": 0.6913362741470337,
"step": 1413
},
{
"epoch": 1.8343876723438766,
"grad_norm": 0.5870312452316284,
"learning_rate": 3.599758617030704e-06,
"loss": 0.6339567303657532,
"step": 1414
},
{
"epoch": 1.8356853203568533,
"grad_norm": 0.6119568943977356,
"learning_rate": 3.5928964079999907e-06,
"loss": 0.6378414630889893,
"step": 1415
},
{
"epoch": 1.8369829683698295,
"grad_norm": 0.5717766284942627,
"learning_rate": 3.5860370761399814e-06,
"loss": 0.6197869777679443,
"step": 1416
},
{
"epoch": 1.8382806163828063,
"grad_norm": 0.626775860786438,
"learning_rate": 3.5791806354762702e-06,
"loss": 0.7052003145217896,
"step": 1417
},
{
"epoch": 1.8395782643957825,
"grad_norm": 0.5812957286834717,
"learning_rate": 3.572327100028545e-06,
"loss": 0.66878342628479,
"step": 1418
},
{
"epoch": 1.8408759124087593,
"grad_norm": 0.585649311542511,
"learning_rate": 3.565476483810548e-06,
"loss": 0.6272032260894775,
"step": 1419
},
{
"epoch": 1.8421735604217355,
"grad_norm": 0.6118691563606262,
"learning_rate": 3.55862880083006e-06,
"loss": 0.6374541521072388,
"step": 1420
},
{
"epoch": 1.8434712084347122,
"grad_norm": 0.5860823392868042,
"learning_rate": 3.5517840650888564e-06,
"loss": 0.6104147434234619,
"step": 1421
},
{
"epoch": 1.8447688564476885,
"grad_norm": 0.5618652701377869,
"learning_rate": 3.544942290582691e-06,
"loss": 0.5710769891738892,
"step": 1422
},
{
"epoch": 1.846066504460665,
"grad_norm": 0.5879126787185669,
"learning_rate": 3.538103491301258e-06,
"loss": 0.6456954479217529,
"step": 1423
},
{
"epoch": 1.8473641524736415,
"grad_norm": 0.6192496418952942,
"learning_rate": 3.531267681228175e-06,
"loss": 0.6715401411056519,
"step": 1424
},
{
"epoch": 1.848661800486618,
"grad_norm": 0.6261125802993774,
"learning_rate": 3.5244348743409394e-06,
"loss": 0.6905325055122375,
"step": 1425
},
{
"epoch": 1.8499594484995945,
"grad_norm": 0.5808646082878113,
"learning_rate": 3.517605084610917e-06,
"loss": 0.6800282001495361,
"step": 1426
},
{
"epoch": 1.851257096512571,
"grad_norm": 0.5866647362709045,
"learning_rate": 3.510778326003294e-06,
"loss": 0.6750452518463135,
"step": 1427
},
{
"epoch": 1.8525547445255475,
"grad_norm": 0.5787751078605652,
"learning_rate": 3.5039546124770675e-06,
"loss": 0.6570975184440613,
"step": 1428
},
{
"epoch": 1.853852392538524,
"grad_norm": 0.6095142960548401,
"learning_rate": 3.4971339579850017e-06,
"loss": 0.6344528198242188,
"step": 1429
},
{
"epoch": 1.8551500405515005,
"grad_norm": 0.5892320871353149,
"learning_rate": 3.4903163764736104e-06,
"loss": 0.6722358465194702,
"step": 1430
},
{
"epoch": 1.8564476885644767,
"grad_norm": 0.5868071913719177,
"learning_rate": 3.4835018818831235e-06,
"loss": 0.638904333114624,
"step": 1431
},
{
"epoch": 1.8577453365774534,
"grad_norm": 0.6003979444503784,
"learning_rate": 3.4766904881474535e-06,
"loss": 0.6853640079498291,
"step": 1432
},
{
"epoch": 1.8590429845904297,
"grad_norm": 0.555009663105011,
"learning_rate": 3.4698822091941808e-06,
"loss": 0.6409114599227905,
"step": 1433
},
{
"epoch": 1.8603406326034064,
"grad_norm": 0.5608627796173096,
"learning_rate": 3.463077058944511e-06,
"loss": 0.6055079698562622,
"step": 1434
},
{
"epoch": 1.8616382806163827,
"grad_norm": 0.6137329339981079,
"learning_rate": 3.456275051313255e-06,
"loss": 0.6407139897346497,
"step": 1435
},
{
"epoch": 1.8629359286293594,
"grad_norm": 0.5606741905212402,
"learning_rate": 3.4494762002087934e-06,
"loss": 0.6254716515541077,
"step": 1436
},
{
"epoch": 1.8642335766423357,
"grad_norm": 0.6578085422515869,
"learning_rate": 3.4426805195330605e-06,
"loss": 0.7003939151763916,
"step": 1437
},
{
"epoch": 1.8655312246553124,
"grad_norm": 0.6054635047912598,
"learning_rate": 3.4358880231814983e-06,
"loss": 0.6616827845573425,
"step": 1438
},
{
"epoch": 1.8668288726682887,
"grad_norm": 0.5833800435066223,
"learning_rate": 3.4290987250430486e-06,
"loss": 0.6554232835769653,
"step": 1439
},
{
"epoch": 1.8681265206812652,
"grad_norm": 0.6048437356948853,
"learning_rate": 3.4223126390001025e-06,
"loss": 0.6970128417015076,
"step": 1440
},
{
"epoch": 1.8694241686942417,
"grad_norm": 0.5701255202293396,
"learning_rate": 3.415529778928492e-06,
"loss": 0.6580668687820435,
"step": 1441
},
{
"epoch": 1.8707218167072182,
"grad_norm": 0.553488552570343,
"learning_rate": 3.408750158697445e-06,
"loss": 0.5830860137939453,
"step": 1442
},
{
"epoch": 1.8720194647201946,
"grad_norm": 0.5695835947990417,
"learning_rate": 3.401973792169574e-06,
"loss": 0.6223429441452026,
"step": 1443
},
{
"epoch": 1.8733171127331711,
"grad_norm": 0.5780246257781982,
"learning_rate": 3.39520069320083e-06,
"loss": 0.6171367168426514,
"step": 1444
},
{
"epoch": 1.8746147607461476,
"grad_norm": 0.5851401686668396,
"learning_rate": 3.3884308756404873e-06,
"loss": 0.648118257522583,
"step": 1445
},
{
"epoch": 1.8759124087591241,
"grad_norm": 0.5909201502799988,
"learning_rate": 3.381664353331107e-06,
"loss": 0.6370965242385864,
"step": 1446
},
{
"epoch": 1.8772100567721006,
"grad_norm": 0.5840253233909607,
"learning_rate": 3.3749011401085185e-06,
"loss": 0.637911856174469,
"step": 1447
},
{
"epoch": 1.878507704785077,
"grad_norm": 0.5772621035575867,
"learning_rate": 3.3681412498017773e-06,
"loss": 0.6257845759391785,
"step": 1448
},
{
"epoch": 1.8798053527980536,
"grad_norm": 0.5972771048545837,
"learning_rate": 3.361384696233152e-06,
"loss": 0.6612721085548401,
"step": 1449
},
{
"epoch": 1.8811030008110299,
"grad_norm": 0.622917652130127,
"learning_rate": 3.354631493218081e-06,
"loss": 0.657785177230835,
"step": 1450
},
{
"epoch": 1.8824006488240066,
"grad_norm": 0.581942081451416,
"learning_rate": 3.347881654565159e-06,
"loss": 0.6339654922485352,
"step": 1451
},
{
"epoch": 1.8836982968369829,
"grad_norm": 0.5792364478111267,
"learning_rate": 3.3411351940760924e-06,
"loss": 0.606496274471283,
"step": 1452
},
{
"epoch": 1.8849959448499596,
"grad_norm": 0.5994595289230347,
"learning_rate": 3.3343921255456903e-06,
"loss": 0.6079939603805542,
"step": 1453
},
{
"epoch": 1.8862935928629359,
"grad_norm": 0.5667769908905029,
"learning_rate": 3.3276524627618177e-06,
"loss": 0.5945770740509033,
"step": 1454
},
{
"epoch": 1.8875912408759126,
"grad_norm": 0.591791033744812,
"learning_rate": 3.3209162195053825e-06,
"loss": 0.620225727558136,
"step": 1455
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.5802031755447388,
"learning_rate": 3.314183409550293e-06,
"loss": 0.614050567150116,
"step": 1456
},
{
"epoch": 1.8901865369018653,
"grad_norm": 0.6020429134368896,
"learning_rate": 3.3074540466634454e-06,
"loss": 0.6691816449165344,
"step": 1457
},
{
"epoch": 1.8914841849148418,
"grad_norm": 0.6074531674385071,
"learning_rate": 3.300728144604681e-06,
"loss": 0.6914318799972534,
"step": 1458
},
{
"epoch": 1.8927818329278183,
"grad_norm": 0.5949025750160217,
"learning_rate": 3.294005717126767e-06,
"loss": 0.5819941163063049,
"step": 1459
},
{
"epoch": 1.8940794809407948,
"grad_norm": 0.5953806638717651,
"learning_rate": 3.287286777975369e-06,
"loss": 0.6016311645507812,
"step": 1460
},
{
"epoch": 1.8953771289537713,
"grad_norm": 0.6012862920761108,
"learning_rate": 3.2805713408890134e-06,
"loss": 0.62370765209198,
"step": 1461
},
{
"epoch": 1.8966747769667478,
"grad_norm": 0.5692993402481079,
"learning_rate": 3.2738594195990725e-06,
"loss": 0.6124866604804993,
"step": 1462
},
{
"epoch": 1.8979724249797243,
"grad_norm": 0.5979285836219788,
"learning_rate": 3.267151027829725e-06,
"loss": 0.6501439213752747,
"step": 1463
},
{
"epoch": 1.8992700729927008,
"grad_norm": 0.579058825969696,
"learning_rate": 3.2604461792979346e-06,
"loss": 0.6591506004333496,
"step": 1464
},
{
"epoch": 1.900567721005677,
"grad_norm": 0.5612583756446838,
"learning_rate": 3.253744887713417e-06,
"loss": 0.644995927810669,
"step": 1465
},
{
"epoch": 1.9018653690186538,
"grad_norm": 0.5929267406463623,
"learning_rate": 3.2470471667786217e-06,
"loss": 0.6369574069976807,
"step": 1466
},
{
"epoch": 1.90316301703163,
"grad_norm": 0.5371314287185669,
"learning_rate": 3.2403530301886897e-06,
"loss": 0.6427657604217529,
"step": 1467
},
{
"epoch": 1.9044606650446068,
"grad_norm": 0.5879482626914978,
"learning_rate": 3.2336624916314385e-06,
"loss": 0.6144864559173584,
"step": 1468
},
{
"epoch": 1.905758313057583,
"grad_norm": 0.5627234578132629,
"learning_rate": 3.226975564787322e-06,
"loss": 0.6070575714111328,
"step": 1469
},
{
"epoch": 1.9070559610705597,
"grad_norm": 0.595919668674469,
"learning_rate": 3.2202922633294178e-06,
"loss": 0.6438186764717102,
"step": 1470
},
{
"epoch": 1.908353609083536,
"grad_norm": 0.5860680937767029,
"learning_rate": 3.2136126009233815e-06,
"loss": 0.6168484091758728,
"step": 1471
},
{
"epoch": 1.9096512570965127,
"grad_norm": 0.6082072257995605,
"learning_rate": 3.2069365912274364e-06,
"loss": 0.6607163548469543,
"step": 1472
},
{
"epoch": 1.910948905109489,
"grad_norm": 0.6000680923461914,
"learning_rate": 3.2002642478923273e-06,
"loss": 0.6100636720657349,
"step": 1473
},
{
"epoch": 1.9122465531224655,
"grad_norm": 0.5958935022354126,
"learning_rate": 3.1935955845613138e-06,
"loss": 0.6283643245697021,
"step": 1474
},
{
"epoch": 1.913544201135442,
"grad_norm": 0.5999156832695007,
"learning_rate": 3.1869306148701186e-06,
"loss": 0.6624071002006531,
"step": 1475
},
{
"epoch": 1.9148418491484185,
"grad_norm": 0.5659943222999573,
"learning_rate": 3.1802693524469226e-06,
"loss": 0.5978960990905762,
"step": 1476
},
{
"epoch": 1.916139497161395,
"grad_norm": 0.6041963696479797,
"learning_rate": 3.1736118109123183e-06,
"loss": 0.6953626871109009,
"step": 1477
},
{
"epoch": 1.9174371451743715,
"grad_norm": 0.5829861164093018,
"learning_rate": 3.1669580038792953e-06,
"loss": 0.6347401142120361,
"step": 1478
},
{
"epoch": 1.918734793187348,
"grad_norm": 0.5910770297050476,
"learning_rate": 3.1603079449532014e-06,
"loss": 0.6252144575119019,
"step": 1479
},
{
"epoch": 1.9200324412003245,
"grad_norm": 0.5840498208999634,
"learning_rate": 3.1536616477317283e-06,
"loss": 0.6821172833442688,
"step": 1480
},
{
"epoch": 1.921330089213301,
"grad_norm": 0.5815771222114563,
"learning_rate": 3.147019125804869e-06,
"loss": 0.627813458442688,
"step": 1481
},
{
"epoch": 1.9226277372262772,
"grad_norm": 0.6089122295379639,
"learning_rate": 3.140380392754901e-06,
"loss": 0.5848509073257446,
"step": 1482
},
{
"epoch": 1.923925385239254,
"grad_norm": 0.5963802337646484,
"learning_rate": 3.13374546215635e-06,
"loss": 0.6434051990509033,
"step": 1483
},
{
"epoch": 1.9252230332522302,
"grad_norm": 0.5844939351081848,
"learning_rate": 3.1271143475759745e-06,
"loss": 0.6818792819976807,
"step": 1484
},
{
"epoch": 1.926520681265207,
"grad_norm": 0.5862755179405212,
"learning_rate": 3.1204870625727216e-06,
"loss": 0.6306114196777344,
"step": 1485
},
{
"epoch": 1.9278183292781832,
"grad_norm": 0.5746100544929504,
"learning_rate": 3.1138636206977147e-06,
"loss": 0.649817705154419,
"step": 1486
},
{
"epoch": 1.92911597729116,
"grad_norm": 0.7469968199729919,
"learning_rate": 3.107244035494212e-06,
"loss": 0.6348094940185547,
"step": 1487
},
{
"epoch": 1.9304136253041362,
"grad_norm": 0.5893679857254028,
"learning_rate": 3.100628320497592e-06,
"loss": 0.6067320704460144,
"step": 1488
},
{
"epoch": 1.931711273317113,
"grad_norm": 0.5654053688049316,
"learning_rate": 3.0940164892353197e-06,
"loss": 0.6475971937179565,
"step": 1489
},
{
"epoch": 1.9330089213300892,
"grad_norm": 0.5734997987747192,
"learning_rate": 3.087408555226914e-06,
"loss": 0.61939537525177,
"step": 1490
},
{
"epoch": 1.9343065693430657,
"grad_norm": 0.5849641561508179,
"learning_rate": 3.0808045319839285e-06,
"loss": 0.6628157496452332,
"step": 1491
},
{
"epoch": 1.9356042173560422,
"grad_norm": 0.6002839803695679,
"learning_rate": 3.0742044330099162e-06,
"loss": 0.7149718403816223,
"step": 1492
},
{
"epoch": 1.9369018653690186,
"grad_norm": 0.5984014272689819,
"learning_rate": 3.067608271800414e-06,
"loss": 0.6320532560348511,
"step": 1493
},
{
"epoch": 1.9381995133819951,
"grad_norm": 0.5990681052207947,
"learning_rate": 3.0610160618428987e-06,
"loss": 0.7083904147148132,
"step": 1494
},
{
"epoch": 1.9394971613949716,
"grad_norm": 0.5863717794418335,
"learning_rate": 3.054427816616773e-06,
"loss": 0.6290713548660278,
"step": 1495
},
{
"epoch": 1.9407948094079481,
"grad_norm": 0.5699295401573181,
"learning_rate": 3.0478435495933273e-06,
"loss": 0.621793270111084,
"step": 1496
},
{
"epoch": 1.9420924574209246,
"grad_norm": 0.5777533054351807,
"learning_rate": 3.0412632742357263e-06,
"loss": 0.6173816323280334,
"step": 1497
},
{
"epoch": 1.9433901054339011,
"grad_norm": 0.6047410368919373,
"learning_rate": 3.0346870039989618e-06,
"loss": 0.6888694763183594,
"step": 1498
},
{
"epoch": 1.9446877534468774,
"grad_norm": 0.5461248159408569,
"learning_rate": 3.028114752329848e-06,
"loss": 0.5872098207473755,
"step": 1499
},
{
"epoch": 1.945985401459854,
"grad_norm": 0.6002129316329956,
"learning_rate": 3.0215465326669724e-06,
"loss": 0.6144348382949829,
"step": 1500
},
{
"epoch": 1.9472830494728304,
"grad_norm": 0.5926127433776855,
"learning_rate": 3.0149823584406834e-06,
"loss": 0.5981168746948242,
"step": 1501
},
{
"epoch": 1.948580697485807,
"grad_norm": 0.553831160068512,
"learning_rate": 3.008422243073053e-06,
"loss": 0.6507419943809509,
"step": 1502
},
{
"epoch": 1.9498783454987834,
"grad_norm": 0.6168836951255798,
"learning_rate": 3.001866199977861e-06,
"loss": 0.6085610389709473,
"step": 1503
},
{
"epoch": 1.95117599351176,
"grad_norm": 0.610622227191925,
"learning_rate": 2.995314242560553e-06,
"loss": 0.584296703338623,
"step": 1504
},
{
"epoch": 1.9524736415247363,
"grad_norm": 0.598139762878418,
"learning_rate": 2.988766384218225e-06,
"loss": 0.6997476816177368,
"step": 1505
},
{
"epoch": 1.9537712895377128,
"grad_norm": 0.5578987002372742,
"learning_rate": 2.982222638339588e-06,
"loss": 0.5938620567321777,
"step": 1506
},
{
"epoch": 1.9550689375506893,
"grad_norm": 0.6006044745445251,
"learning_rate": 2.9756830183049502e-06,
"loss": 0.6362953186035156,
"step": 1507
},
{
"epoch": 1.9563665855636658,
"grad_norm": 0.6040393710136414,
"learning_rate": 2.969147537486175e-06,
"loss": 0.5799316167831421,
"step": 1508
},
{
"epoch": 1.9576642335766423,
"grad_norm": 0.5984890460968018,
"learning_rate": 2.962616209246669e-06,
"loss": 0.639271080493927,
"step": 1509
},
{
"epoch": 1.9589618815896188,
"grad_norm": 0.7439842820167542,
"learning_rate": 2.956089046941344e-06,
"loss": 0.6323772072792053,
"step": 1510
},
{
"epoch": 1.9602595296025953,
"grad_norm": 0.5876015424728394,
"learning_rate": 2.9495660639165967e-06,
"loss": 0.5763074159622192,
"step": 1511
},
{
"epoch": 1.9615571776155718,
"grad_norm": 0.6764865517616272,
"learning_rate": 2.9430472735102733e-06,
"loss": 0.7091867923736572,
"step": 1512
},
{
"epoch": 1.9628548256285483,
"grad_norm": 0.6067684292793274,
"learning_rate": 2.9365326890516543e-06,
"loss": 0.6496888995170593,
"step": 1513
},
{
"epoch": 1.9641524736415248,
"grad_norm": 0.5764046311378479,
"learning_rate": 2.9300223238614135e-06,
"loss": 0.6311619281768799,
"step": 1514
},
{
"epoch": 1.9654501216545013,
"grad_norm": 0.5956159234046936,
"learning_rate": 2.923516191251601e-06,
"loss": 0.6114912033081055,
"step": 1515
},
{
"epoch": 1.9667477696674776,
"grad_norm": 0.5818417072296143,
"learning_rate": 2.917014304525609e-06,
"loss": 0.6572203636169434,
"step": 1516
},
{
"epoch": 1.9680454176804543,
"grad_norm": 0.6058406233787537,
"learning_rate": 2.91051667697815e-06,
"loss": 0.6197275519371033,
"step": 1517
},
{
"epoch": 1.9693430656934305,
"grad_norm": 0.6014067530632019,
"learning_rate": 2.904023321895234e-06,
"loss": 0.6693457365036011,
"step": 1518
},
{
"epoch": 1.9706407137064073,
"grad_norm": 0.5446932315826416,
"learning_rate": 2.8975342525541217e-06,
"loss": 0.6219191551208496,
"step": 1519
},
{
"epoch": 1.9719383617193835,
"grad_norm": 0.5773969292640686,
"learning_rate": 2.8910494822233203e-06,
"loss": 0.6279373168945312,
"step": 1520
},
{
"epoch": 1.9732360097323602,
"grad_norm": 0.553596556186676,
"learning_rate": 2.8845690241625437e-06,
"loss": 0.5865894556045532,
"step": 1521
},
{
"epoch": 1.9745336577453365,
"grad_norm": 0.5790948867797852,
"learning_rate": 2.878092891622688e-06,
"loss": 0.6192329525947571,
"step": 1522
},
{
"epoch": 1.975831305758313,
"grad_norm": 0.5870917439460754,
"learning_rate": 2.871621097845806e-06,
"loss": 0.6201770305633545,
"step": 1523
},
{
"epoch": 1.9771289537712895,
"grad_norm": 0.586599588394165,
"learning_rate": 2.865153656065076e-06,
"loss": 0.6979238390922546,
"step": 1524
},
{
"epoch": 1.978426601784266,
"grad_norm": 0.5809787511825562,
"learning_rate": 2.8586905795047813e-06,
"loss": 0.6264389753341675,
"step": 1525
},
{
"epoch": 1.9797242497972425,
"grad_norm": 0.5969094038009644,
"learning_rate": 2.8522318813802796e-06,
"loss": 0.6544374227523804,
"step": 1526
},
{
"epoch": 1.981021897810219,
"grad_norm": 0.5875753164291382,
"learning_rate": 2.8457775748979664e-06,
"loss": 0.7151497006416321,
"step": 1527
},
{
"epoch": 1.9823195458231955,
"grad_norm": 0.5887599587440491,
"learning_rate": 2.8393276732552745e-06,
"loss": 0.650242030620575,
"step": 1528
},
{
"epoch": 1.983617193836172,
"grad_norm": 0.5730281472206116,
"learning_rate": 2.8328821896406132e-06,
"loss": 0.6076555252075195,
"step": 1529
},
{
"epoch": 1.9849148418491485,
"grad_norm": 0.6394782662391663,
"learning_rate": 2.826441137233368e-06,
"loss": 0.6826823949813843,
"step": 1530
},
{
"epoch": 1.986212489862125,
"grad_norm": 0.5790883302688599,
"learning_rate": 2.8200045292038596e-06,
"loss": 0.6138323545455933,
"step": 1531
},
{
"epoch": 1.9875101378751014,
"grad_norm": 0.6426994800567627,
"learning_rate": 2.8135723787133233e-06,
"loss": 0.7073339223861694,
"step": 1532
},
{
"epoch": 1.9888077858880777,
"grad_norm": 0.6070610880851746,
"learning_rate": 2.8071446989138786e-06,
"loss": 0.6867741346359253,
"step": 1533
},
{
"epoch": 1.9901054339010544,
"grad_norm": 0.6205259561538696,
"learning_rate": 2.800721502948506e-06,
"loss": 0.6849797368049622,
"step": 1534
},
{
"epoch": 1.9914030819140307,
"grad_norm": 0.6018499135971069,
"learning_rate": 2.7943028039510085e-06,
"loss": 0.6437822580337524,
"step": 1535
},
{
"epoch": 1.9927007299270074,
"grad_norm": 0.6043044328689575,
"learning_rate": 2.78788861504601e-06,
"loss": 0.6022955775260925,
"step": 1536
},
{
"epoch": 1.9939983779399837,
"grad_norm": 0.5917290449142456,
"learning_rate": 2.7814789493488947e-06,
"loss": 0.6646702885627747,
"step": 1537
},
{
"epoch": 1.9952960259529604,
"grad_norm": 0.6160155534744263,
"learning_rate": 2.7750738199658157e-06,
"loss": 0.6750048398971558,
"step": 1538
},
{
"epoch": 1.9965936739659367,
"grad_norm": 0.5714327096939087,
"learning_rate": 2.7686732399936343e-06,
"loss": 0.6445184946060181,
"step": 1539
},
{
"epoch": 1.9978913219789132,
"grad_norm": 0.5985759496688843,
"learning_rate": 2.762277222519919e-06,
"loss": 0.6806057691574097,
"step": 1540
},
{
"epoch": 1.9991889699918897,
"grad_norm": 0.5991272330284119,
"learning_rate": 2.7558857806229066e-06,
"loss": 0.6159195899963379,
"step": 1541
},
{
"epoch": 2.0,
"grad_norm": 0.7430510520935059,
"learning_rate": 2.749498927371478e-06,
"loss": 0.5819271802902222,
"step": 1542
},
{
"epoch": 2.0012976480129763,
"grad_norm": 0.6751839518547058,
"learning_rate": 2.7431166758251317e-06,
"loss": 0.5926187634468079,
"step": 1543
},
{
"epoch": 2.002595296025953,
"grad_norm": 0.6562322378158569,
"learning_rate": 2.7367390390339565e-06,
"loss": 0.6589317321777344,
"step": 1544
},
{
"epoch": 2.0038929440389293,
"grad_norm": 0.6393698453903198,
"learning_rate": 2.730366030038606e-06,
"loss": 0.544275164604187,
"step": 1545
},
{
"epoch": 2.005190592051906,
"grad_norm": 0.5910435318946838,
"learning_rate": 2.72399766187027e-06,
"loss": 0.6208810210227966,
"step": 1546
},
{
"epoch": 2.0064882400648822,
"grad_norm": 0.6248382925987244,
"learning_rate": 2.7176339475506515e-06,
"loss": 0.5592293739318848,
"step": 1547
},
{
"epoch": 2.007785888077859,
"grad_norm": 0.6031874418258667,
"learning_rate": 2.7112749000919304e-06,
"loss": 0.5941007137298584,
"step": 1548
},
{
"epoch": 2.0090835360908352,
"grad_norm": 0.598434567451477,
"learning_rate": 2.704920532496756e-06,
"loss": 0.5872475504875183,
"step": 1549
},
{
"epoch": 2.010381184103812,
"grad_norm": 0.606324315071106,
"learning_rate": 2.698570857758195e-06,
"loss": 0.5607691407203674,
"step": 1550
},
{
"epoch": 2.011678832116788,
"grad_norm": 0.6241020560264587,
"learning_rate": 2.692225888859732e-06,
"loss": 0.6537069082260132,
"step": 1551
},
{
"epoch": 2.012976480129765,
"grad_norm": 0.6302763223648071,
"learning_rate": 2.685885638775216e-06,
"loss": 0.6311033964157104,
"step": 1552
},
{
"epoch": 2.014274128142741,
"grad_norm": 0.7878178358078003,
"learning_rate": 2.6795501204688586e-06,
"loss": 0.6164021492004395,
"step": 1553
},
{
"epoch": 2.015571776155718,
"grad_norm": 0.6297698616981506,
"learning_rate": 2.6732193468951882e-06,
"loss": 0.6132771968841553,
"step": 1554
},
{
"epoch": 2.016869424168694,
"grad_norm": 0.6227217316627502,
"learning_rate": 2.666893330999035e-06,
"loss": 0.5441837310791016,
"step": 1555
},
{
"epoch": 2.018167072181671,
"grad_norm": 0.639819860458374,
"learning_rate": 2.6605720857155017e-06,
"loss": 0.5625590682029724,
"step": 1556
},
{
"epoch": 2.019464720194647,
"grad_norm": 0.6482471227645874,
"learning_rate": 2.654255623969936e-06,
"loss": 0.5997311472892761,
"step": 1557
},
{
"epoch": 2.020762368207624,
"grad_norm": 0.6367791891098022,
"learning_rate": 2.647943958677897e-06,
"loss": 0.5890505313873291,
"step": 1558
},
{
"epoch": 2.0220600162206,
"grad_norm": 0.6217620372772217,
"learning_rate": 2.6416371027451514e-06,
"loss": 0.5508283376693726,
"step": 1559
},
{
"epoch": 2.0233576642335764,
"grad_norm": 0.651731014251709,
"learning_rate": 2.635335069067617e-06,
"loss": 0.6351226568222046,
"step": 1560
},
{
"epoch": 2.024655312246553,
"grad_norm": 0.6955805420875549,
"learning_rate": 2.62903787053136e-06,
"loss": 0.6140905022621155,
"step": 1561
},
{
"epoch": 2.0259529602595294,
"grad_norm": 0.5920689105987549,
"learning_rate": 2.6227455200125575e-06,
"loss": 0.5677257776260376,
"step": 1562
},
{
"epoch": 2.027250608272506,
"grad_norm": 0.6131844520568848,
"learning_rate": 2.6164580303774733e-06,
"loss": 0.5954424142837524,
"step": 1563
},
{
"epoch": 2.0285482562854824,
"grad_norm": 0.6266505122184753,
"learning_rate": 2.6101754144824327e-06,
"loss": 0.5571186542510986,
"step": 1564
},
{
"epoch": 2.029845904298459,
"grad_norm": 0.609183669090271,
"learning_rate": 2.603897685173794e-06,
"loss": 0.61628657579422,
"step": 1565
},
{
"epoch": 2.0311435523114354,
"grad_norm": 0.6080002784729004,
"learning_rate": 2.5976248552879264e-06,
"loss": 0.5877048969268799,
"step": 1566
},
{
"epoch": 2.032441200324412,
"grad_norm": 0.6042158007621765,
"learning_rate": 2.5913569376511806e-06,
"loss": 0.5496143102645874,
"step": 1567
},
{
"epoch": 2.0337388483373884,
"grad_norm": 0.6415978670120239,
"learning_rate": 2.5850939450798553e-06,
"loss": 0.6424070596694946,
"step": 1568
},
{
"epoch": 2.035036496350365,
"grad_norm": 0.6292750239372253,
"learning_rate": 2.5788358903801926e-06,
"loss": 0.5802291631698608,
"step": 1569
},
{
"epoch": 2.0363341443633414,
"grad_norm": 0.5823472738265991,
"learning_rate": 2.572582786348326e-06,
"loss": 0.5664765238761902,
"step": 1570
},
{
"epoch": 2.037631792376318,
"grad_norm": 0.6012071371078491,
"learning_rate": 2.566334645770272e-06,
"loss": 0.5476250648498535,
"step": 1571
},
{
"epoch": 2.0389294403892944,
"grad_norm": 0.6168148517608643,
"learning_rate": 2.5600914814218963e-06,
"loss": 0.5573870539665222,
"step": 1572
},
{
"epoch": 2.040227088402271,
"grad_norm": 0.6200307011604309,
"learning_rate": 2.553853306068888e-06,
"loss": 0.5985852479934692,
"step": 1573
},
{
"epoch": 2.0415247364152473,
"grad_norm": 0.5821851491928101,
"learning_rate": 2.547620132466743e-06,
"loss": 0.5544208288192749,
"step": 1574
},
{
"epoch": 2.042822384428224,
"grad_norm": 0.5919877886772156,
"learning_rate": 2.541391973360717e-06,
"loss": 0.5700052976608276,
"step": 1575
},
{
"epoch": 2.0441200324412003,
"grad_norm": 0.6059973835945129,
"learning_rate": 2.535168841485821e-06,
"loss": 0.6292803287506104,
"step": 1576
},
{
"epoch": 2.0454176804541766,
"grad_norm": 0.580622136592865,
"learning_rate": 2.5289507495667864e-06,
"loss": 0.5648876428604126,
"step": 1577
},
{
"epoch": 2.0467153284671533,
"grad_norm": 0.6086398363113403,
"learning_rate": 2.5227377103180353e-06,
"loss": 0.5471535921096802,
"step": 1578
},
{
"epoch": 2.0480129764801296,
"grad_norm": 0.6052615642547607,
"learning_rate": 2.516529736443661e-06,
"loss": 0.5907412767410278,
"step": 1579
},
{
"epoch": 2.0493106244931063,
"grad_norm": 0.6123395562171936,
"learning_rate": 2.5103268406374002e-06,
"loss": 0.5662798881530762,
"step": 1580
},
{
"epoch": 2.0506082725060826,
"grad_norm": 0.5917913317680359,
"learning_rate": 2.504129035582601e-06,
"loss": 0.5825642943382263,
"step": 1581
},
{
"epoch": 2.0519059205190593,
"grad_norm": 0.645075261592865,
"learning_rate": 2.497936333952212e-06,
"loss": 0.6213525533676147,
"step": 1582
},
{
"epoch": 2.0532035685320356,
"grad_norm": 1.3204904794692993,
"learning_rate": 2.491748748408735e-06,
"loss": 0.5462846755981445,
"step": 1583
},
{
"epoch": 2.0545012165450123,
"grad_norm": 0.5815834403038025,
"learning_rate": 2.485566291604219e-06,
"loss": 0.5608969926834106,
"step": 1584
},
{
"epoch": 2.0557988645579885,
"grad_norm": 0.6155984401702881,
"learning_rate": 2.4793889761802225e-06,
"loss": 0.5753802061080933,
"step": 1585
},
{
"epoch": 2.0570965125709653,
"grad_norm": 0.645876407623291,
"learning_rate": 2.4732168147677927e-06,
"loss": 0.5655276775360107,
"step": 1586
},
{
"epoch": 2.0583941605839415,
"grad_norm": 0.6139212846755981,
"learning_rate": 2.467049819987437e-06,
"loss": 0.5936379432678223,
"step": 1587
},
{
"epoch": 2.0596918085969182,
"grad_norm": 0.5790942311286926,
"learning_rate": 2.460888004449099e-06,
"loss": 0.5055116415023804,
"step": 1588
},
{
"epoch": 2.0609894566098945,
"grad_norm": 0.5931289196014404,
"learning_rate": 2.454731380752132e-06,
"loss": 0.611015260219574,
"step": 1589
},
{
"epoch": 2.0622871046228712,
"grad_norm": 0.5739728808403015,
"learning_rate": 2.4485799614852755e-06,
"loss": 0.5669503211975098,
"step": 1590
},
{
"epoch": 2.0635847526358475,
"grad_norm": 0.6072220802307129,
"learning_rate": 2.442433759226619e-06,
"loss": 0.6242780685424805,
"step": 1591
},
{
"epoch": 2.0648824006488242,
"grad_norm": 0.6013473868370056,
"learning_rate": 2.4362927865435975e-06,
"loss": 0.6564007997512817,
"step": 1592
},
{
"epoch": 2.0661800486618005,
"grad_norm": 0.6308622360229492,
"learning_rate": 2.4301570559929405e-06,
"loss": 0.6350818276405334,
"step": 1593
},
{
"epoch": 2.0674776966747768,
"grad_norm": 0.5770552754402161,
"learning_rate": 2.4240265801206665e-06,
"loss": 0.5588065981864929,
"step": 1594
},
{
"epoch": 2.0687753446877535,
"grad_norm": 0.5862566828727722,
"learning_rate": 2.4179013714620456e-06,
"loss": 0.564478874206543,
"step": 1595
},
{
"epoch": 2.0700729927007298,
"grad_norm": 0.6063327193260193,
"learning_rate": 2.4117814425415803e-06,
"loss": 0.5994401574134827,
"step": 1596
},
{
"epoch": 2.0713706407137065,
"grad_norm": 0.7205548286437988,
"learning_rate": 2.4056668058729766e-06,
"loss": 0.5876675248146057,
"step": 1597
},
{
"epoch": 2.0726682887266827,
"grad_norm": 0.6141117811203003,
"learning_rate": 2.399557473959119e-06,
"loss": 0.5730265974998474,
"step": 1598
},
{
"epoch": 2.0739659367396595,
"grad_norm": 0.653766393661499,
"learning_rate": 2.3934534592920416e-06,
"loss": 0.5947611331939697,
"step": 1599
},
{
"epoch": 2.0752635847526357,
"grad_norm": 0.6264708042144775,
"learning_rate": 2.3873547743529157e-06,
"loss": 0.597199559211731,
"step": 1600
},
{
"epoch": 2.0765612327656124,
"grad_norm": 0.6255890727043152,
"learning_rate": 2.3812614316120003e-06,
"loss": 0.5970041155815125,
"step": 1601
},
{
"epoch": 2.0778588807785887,
"grad_norm": 0.6051512956619263,
"learning_rate": 2.375173443528646e-06,
"loss": 0.5532850027084351,
"step": 1602
},
{
"epoch": 2.0791565287915654,
"grad_norm": 0.6211998462677002,
"learning_rate": 2.3690908225512464e-06,
"loss": 0.5505103468894958,
"step": 1603
},
{
"epoch": 2.0804541768045417,
"grad_norm": 0.6291670799255371,
"learning_rate": 2.363013581117217e-06,
"loss": 0.6287462711334229,
"step": 1604
},
{
"epoch": 2.0817518248175184,
"grad_norm": 0.6058430075645447,
"learning_rate": 2.356941731652986e-06,
"loss": 0.6096627712249756,
"step": 1605
},
{
"epoch": 2.0830494728304947,
"grad_norm": 0.6372430324554443,
"learning_rate": 2.3508752865739425e-06,
"loss": 0.6022605895996094,
"step": 1606
},
{
"epoch": 2.0843471208434714,
"grad_norm": 0.6325316429138184,
"learning_rate": 2.344814258284433e-06,
"loss": 0.610370397567749,
"step": 1607
},
{
"epoch": 2.0856447688564477,
"grad_norm": 0.6065165996551514,
"learning_rate": 2.3387586591777274e-06,
"loss": 0.5800055861473083,
"step": 1608
},
{
"epoch": 2.086942416869424,
"grad_norm": 0.59498131275177,
"learning_rate": 2.3327085016359912e-06,
"loss": 0.5574961304664612,
"step": 1609
},
{
"epoch": 2.0882400648824007,
"grad_norm": 0.6029080748558044,
"learning_rate": 2.3266637980302677e-06,
"loss": 0.5879454016685486,
"step": 1610
},
{
"epoch": 2.0882400648824007,
"eval_loss": 0.6837871670722961,
"eval_runtime": 72.9619,
"eval_samples_per_second": 71.16,
"eval_steps_per_second": 8.895,
"step": 1610
},
{
"epoch": 2.089537712895377,
"grad_norm": 0.6146489381790161,
"learning_rate": 2.320624560720446e-06,
"loss": 0.5897351503372192,
"step": 1611
},
{
"epoch": 2.0908353609083536,
"grad_norm": 0.6313148140907288,
"learning_rate": 2.314590802055232e-06,
"loss": 0.5991021990776062,
"step": 1612
},
{
"epoch": 2.09213300892133,
"grad_norm": 0.578288197517395,
"learning_rate": 2.308562534372144e-06,
"loss": 0.5127542018890381,
"step": 1613
},
{
"epoch": 2.0934306569343066,
"grad_norm": 0.6262894868850708,
"learning_rate": 2.3025397699974555e-06,
"loss": 0.6180716753005981,
"step": 1614
},
{
"epoch": 2.094728304947283,
"grad_norm": 0.6143955588340759,
"learning_rate": 2.296522521246202e-06,
"loss": 0.6144124865531921,
"step": 1615
},
{
"epoch": 2.0960259529602596,
"grad_norm": 0.6245327591896057,
"learning_rate": 2.290510800422129e-06,
"loss": 0.5791307687759399,
"step": 1616
},
{
"epoch": 2.097323600973236,
"grad_norm": 0.6619604825973511,
"learning_rate": 2.284504619817687e-06,
"loss": 0.6063104271888733,
"step": 1617
},
{
"epoch": 2.0986212489862126,
"grad_norm": 0.6063318848609924,
"learning_rate": 2.2785039917139933e-06,
"loss": 0.619540810585022,
"step": 1618
},
{
"epoch": 2.099918896999189,
"grad_norm": 0.6290093660354614,
"learning_rate": 2.272508928380815e-06,
"loss": 0.5471513271331787,
"step": 1619
},
{
"epoch": 2.1012165450121656,
"grad_norm": 0.6088972091674805,
"learning_rate": 2.2665194420765386e-06,
"loss": 0.673788845539093,
"step": 1620
},
{
"epoch": 2.102514193025142,
"grad_norm": 0.6053624153137207,
"learning_rate": 2.260535545048149e-06,
"loss": 0.540647029876709,
"step": 1621
},
{
"epoch": 2.1038118410381186,
"grad_norm": 0.6025784015655518,
"learning_rate": 2.2545572495311966e-06,
"loss": 0.5704219341278076,
"step": 1622
},
{
"epoch": 2.105109489051095,
"grad_norm": 0.5917617678642273,
"learning_rate": 2.2485845677497897e-06,
"loss": 0.5879180431365967,
"step": 1623
},
{
"epoch": 2.1064071370640716,
"grad_norm": 0.6286986470222473,
"learning_rate": 2.2426175119165435e-06,
"loss": 0.6564632058143616,
"step": 1624
},
{
"epoch": 2.107704785077048,
"grad_norm": 0.7979365587234497,
"learning_rate": 2.2366560942325833e-06,
"loss": 0.5867825746536255,
"step": 1625
},
{
"epoch": 2.1090024330900246,
"grad_norm": 0.6283751130104065,
"learning_rate": 2.230700326887495e-06,
"loss": 0.5519679188728333,
"step": 1626
},
{
"epoch": 2.110300081103001,
"grad_norm": 0.6093899011611938,
"learning_rate": 2.2247502220593164e-06,
"loss": 0.578905463218689,
"step": 1627
},
{
"epoch": 2.111597729115977,
"grad_norm": 0.694290816783905,
"learning_rate": 2.218805791914507e-06,
"loss": 0.5794886350631714,
"step": 1628
},
{
"epoch": 2.112895377128954,
"grad_norm": 0.6268723607063293,
"learning_rate": 2.21286704860792e-06,
"loss": 0.5475939512252808,
"step": 1629
},
{
"epoch": 2.11419302514193,
"grad_norm": 0.5893663167953491,
"learning_rate": 2.2069340042827846e-06,
"loss": 0.5644780397415161,
"step": 1630
},
{
"epoch": 2.115490673154907,
"grad_norm": 0.6139518022537231,
"learning_rate": 2.2010066710706734e-06,
"loss": 0.5307568311691284,
"step": 1631
},
{
"epoch": 2.116788321167883,
"grad_norm": 0.6323785781860352,
"learning_rate": 2.1950850610914824e-06,
"loss": 0.5611797571182251,
"step": 1632
},
{
"epoch": 2.11808596918086,
"grad_norm": 0.5823566913604736,
"learning_rate": 2.1891691864534065e-06,
"loss": 0.5725387334823608,
"step": 1633
},
{
"epoch": 2.119383617193836,
"grad_norm": 0.6572033762931824,
"learning_rate": 2.1832590592529128e-06,
"loss": 0.6158653497695923,
"step": 1634
},
{
"epoch": 2.1206812652068128,
"grad_norm": 1.0890551805496216,
"learning_rate": 2.1773546915747103e-06,
"loss": 0.559654951095581,
"step": 1635
},
{
"epoch": 2.121978913219789,
"grad_norm": 0.6277933120727539,
"learning_rate": 2.1714560954917437e-06,
"loss": 0.6304750442504883,
"step": 1636
},
{
"epoch": 2.1232765612327658,
"grad_norm": 0.6458949446678162,
"learning_rate": 2.165563283065142e-06,
"loss": 0.6345778703689575,
"step": 1637
},
{
"epoch": 2.124574209245742,
"grad_norm": 0.643680214881897,
"learning_rate": 2.159676266344222e-06,
"loss": 0.5876523852348328,
"step": 1638
},
{
"epoch": 2.1258718572587187,
"grad_norm": 0.595977783203125,
"learning_rate": 2.1537950573664372e-06,
"loss": 0.6067019104957581,
"step": 1639
},
{
"epoch": 2.127169505271695,
"grad_norm": 0.6042376160621643,
"learning_rate": 2.1479196681573745e-06,
"loss": 0.5710458159446716,
"step": 1640
},
{
"epoch": 2.1284671532846717,
"grad_norm": 0.6172091960906982,
"learning_rate": 2.142050110730716e-06,
"loss": 0.5443819761276245,
"step": 1641
},
{
"epoch": 2.129764801297648,
"grad_norm": 0.6249525547027588,
"learning_rate": 2.136186397088223e-06,
"loss": 0.6747730374336243,
"step": 1642
},
{
"epoch": 2.1310624493106243,
"grad_norm": 0.6373762488365173,
"learning_rate": 2.1303285392197043e-06,
"loss": 0.6101464033126831,
"step": 1643
},
{
"epoch": 2.132360097323601,
"grad_norm": 0.6049467921257019,
"learning_rate": 2.1244765491029985e-06,
"loss": 0.5729132890701294,
"step": 1644
},
{
"epoch": 2.1336577453365773,
"grad_norm": 0.6222594380378723,
"learning_rate": 2.118630438703939e-06,
"loss": 0.6150310039520264,
"step": 1645
},
{
"epoch": 2.134955393349554,
"grad_norm": 0.9498931169509888,
"learning_rate": 2.1127902199763496e-06,
"loss": 0.6144990921020508,
"step": 1646
},
{
"epoch": 2.1362530413625302,
"grad_norm": 0.6177363991737366,
"learning_rate": 2.1069559048619937e-06,
"loss": 0.5762449502944946,
"step": 1647
},
{
"epoch": 2.137550689375507,
"grad_norm": 0.59578537940979,
"learning_rate": 2.10112750529057e-06,
"loss": 0.6036182641983032,
"step": 1648
},
{
"epoch": 2.1388483373884832,
"grad_norm": 0.6090502738952637,
"learning_rate": 2.095305033179682e-06,
"loss": 0.5963237285614014,
"step": 1649
},
{
"epoch": 2.14014598540146,
"grad_norm": 5.44432258605957,
"learning_rate": 2.0894885004348102e-06,
"loss": 0.6094678640365601,
"step": 1650
},
{
"epoch": 2.141443633414436,
"grad_norm": 0.6466519832611084,
"learning_rate": 2.0836779189492925e-06,
"loss": 0.6607776880264282,
"step": 1651
},
{
"epoch": 2.142741281427413,
"grad_norm": 0.6259258985519409,
"learning_rate": 2.077873300604297e-06,
"loss": 0.6022912859916687,
"step": 1652
},
{
"epoch": 2.144038929440389,
"grad_norm": 0.6033953428268433,
"learning_rate": 2.0720746572687995e-06,
"loss": 0.5635781288146973,
"step": 1653
},
{
"epoch": 2.145336577453366,
"grad_norm": 0.5921186208724976,
"learning_rate": 2.0662820007995592e-06,
"loss": 0.5796300172805786,
"step": 1654
},
{
"epoch": 2.146634225466342,
"grad_norm": 0.7194099426269531,
"learning_rate": 2.060495343041087e-06,
"loss": 0.5955857038497925,
"step": 1655
},
{
"epoch": 2.147931873479319,
"grad_norm": 0.6012006998062134,
"learning_rate": 2.0547146958256416e-06,
"loss": 0.531291127204895,
"step": 1656
},
{
"epoch": 2.149229521492295,
"grad_norm": 0.8573319911956787,
"learning_rate": 2.048940070973177e-06,
"loss": 0.5659847259521484,
"step": 1657
},
{
"epoch": 2.150527169505272,
"grad_norm": 0.639750599861145,
"learning_rate": 2.04317148029134e-06,
"loss": 0.5485103130340576,
"step": 1658
},
{
"epoch": 2.151824817518248,
"grad_norm": 0.6052505970001221,
"learning_rate": 2.0374089355754434e-06,
"loss": 0.6026275753974915,
"step": 1659
},
{
"epoch": 2.153122465531225,
"grad_norm": 0.6007844805717468,
"learning_rate": 2.031652448608428e-06,
"loss": 0.5721523761749268,
"step": 1660
},
{
"epoch": 2.154420113544201,
"grad_norm": 0.6320387125015259,
"learning_rate": 2.025902031160853e-06,
"loss": 0.5851036906242371,
"step": 1661
},
{
"epoch": 2.1557177615571774,
"grad_norm": 0.8335509300231934,
"learning_rate": 2.020157694990868e-06,
"loss": 0.631894588470459,
"step": 1662
},
{
"epoch": 2.157015409570154,
"grad_norm": 0.6097387075424194,
"learning_rate": 2.014419451844186e-06,
"loss": 0.6118210554122925,
"step": 1663
},
{
"epoch": 2.1583130575831304,
"grad_norm": 0.6130866408348083,
"learning_rate": 2.0086873134540626e-06,
"loss": 0.5941121578216553,
"step": 1664
},
{
"epoch": 2.159610705596107,
"grad_norm": 0.6047623753547668,
"learning_rate": 2.002961291541269e-06,
"loss": 0.592534065246582,
"step": 1665
},
{
"epoch": 2.1609083536090834,
"grad_norm": 0.6416432857513428,
"learning_rate": 1.997241397814071e-06,
"loss": 0.6065758466720581,
"step": 1666
},
{
"epoch": 2.16220600162206,
"grad_norm": 0.6395633816719055,
"learning_rate": 1.9915276439682056e-06,
"loss": 0.6400467157363892,
"step": 1667
},
{
"epoch": 2.1635036496350364,
"grad_norm": 0.604591965675354,
"learning_rate": 1.985820041686848e-06,
"loss": 0.590105414390564,
"step": 1668
},
{
"epoch": 2.164801297648013,
"grad_norm": 0.6412749886512756,
"learning_rate": 1.9801186026406066e-06,
"loss": 0.5925630927085876,
"step": 1669
},
{
"epoch": 2.1660989456609894,
"grad_norm": 0.6263708472251892,
"learning_rate": 1.9744233384874766e-06,
"loss": 0.6293658018112183,
"step": 1670
},
{
"epoch": 2.167396593673966,
"grad_norm": 0.6095645427703857,
"learning_rate": 1.968734260872833e-06,
"loss": 0.5433490872383118,
"step": 1671
},
{
"epoch": 2.1686942416869424,
"grad_norm": 0.6286778450012207,
"learning_rate": 1.9630513814294e-06,
"loss": 0.637223482131958,
"step": 1672
},
{
"epoch": 2.169991889699919,
"grad_norm": 0.6185746788978577,
"learning_rate": 1.9573747117772272e-06,
"loss": 0.5756215453147888,
"step": 1673
},
{
"epoch": 2.1712895377128953,
"grad_norm": 0.63084876537323,
"learning_rate": 1.951704263523668e-06,
"loss": 0.5794859528541565,
"step": 1674
},
{
"epoch": 2.172587185725872,
"grad_norm": 0.6249853372573853,
"learning_rate": 1.9460400482633537e-06,
"loss": 0.5887556672096252,
"step": 1675
},
{
"epoch": 2.1738848337388483,
"grad_norm": 0.6094781756401062,
"learning_rate": 1.9403820775781696e-06,
"loss": 0.550574779510498,
"step": 1676
},
{
"epoch": 2.1751824817518246,
"grad_norm": 0.6323735117912292,
"learning_rate": 1.9347303630372373e-06,
"loss": 0.6414695978164673,
"step": 1677
},
{
"epoch": 2.1764801297648013,
"grad_norm": 0.6307917237281799,
"learning_rate": 1.929084916196876e-06,
"loss": 0.5808806419372559,
"step": 1678
},
{
"epoch": 2.1777777777777776,
"grad_norm": 0.6933386921882629,
"learning_rate": 1.923445748600603e-06,
"loss": 0.6602625846862793,
"step": 1679
},
{
"epoch": 2.1790754257907543,
"grad_norm": 0.6270908713340759,
"learning_rate": 1.917812871779084e-06,
"loss": 0.6303268074989319,
"step": 1680
},
{
"epoch": 2.1803730738037306,
"grad_norm": 0.640339195728302,
"learning_rate": 1.912186297250128e-06,
"loss": 0.6451208591461182,
"step": 1681
},
{
"epoch": 2.1816707218167073,
"grad_norm": 0.632722020149231,
"learning_rate": 1.9065660365186545e-06,
"loss": 0.6016892194747925,
"step": 1682
},
{
"epoch": 2.1829683698296836,
"grad_norm": 0.6271699070930481,
"learning_rate": 1.9009521010766756e-06,
"loss": 0.5756760835647583,
"step": 1683
},
{
"epoch": 2.1842660178426603,
"grad_norm": 0.6046696305274963,
"learning_rate": 1.8953445024032679e-06,
"loss": 0.6025729775428772,
"step": 1684
},
{
"epoch": 2.1855636658556366,
"grad_norm": 0.62837815284729,
"learning_rate": 1.889743251964553e-06,
"loss": 0.5909950733184814,
"step": 1685
},
{
"epoch": 2.1868613138686133,
"grad_norm": 0.6451588869094849,
"learning_rate": 1.8841483612136658e-06,
"loss": 0.6150632500648499,
"step": 1686
},
{
"epoch": 2.1881589618815895,
"grad_norm": 0.6260155439376831,
"learning_rate": 1.8785598415907464e-06,
"loss": 0.5601434707641602,
"step": 1687
},
{
"epoch": 2.1894566098945663,
"grad_norm": 0.6069033741950989,
"learning_rate": 1.8729777045229009e-06,
"loss": 0.508891761302948,
"step": 1688
},
{
"epoch": 2.1907542579075425,
"grad_norm": 0.613555371761322,
"learning_rate": 1.8674019614241879e-06,
"loss": 0.5379388928413391,
"step": 1689
},
{
"epoch": 2.1920519059205192,
"grad_norm": 0.6071879863739014,
"learning_rate": 1.8618326236955908e-06,
"loss": 0.5609877109527588,
"step": 1690
},
{
"epoch": 2.1933495539334955,
"grad_norm": 0.6063624620437622,
"learning_rate": 1.8562697027249921e-06,
"loss": 0.5955809950828552,
"step": 1691
},
{
"epoch": 2.1946472019464722,
"grad_norm": 0.6319783926010132,
"learning_rate": 1.8507132098871633e-06,
"loss": 0.5856696367263794,
"step": 1692
},
{
"epoch": 2.1959448499594485,
"grad_norm": 0.6493479609489441,
"learning_rate": 1.8451631565437211e-06,
"loss": 0.6506030559539795,
"step": 1693
},
{
"epoch": 2.197242497972425,
"grad_norm": 0.6016229391098022,
"learning_rate": 1.8396195540431205e-06,
"loss": 0.6117116212844849,
"step": 1694
},
{
"epoch": 2.1985401459854015,
"grad_norm": 0.6247114539146423,
"learning_rate": 1.834082413720627e-06,
"loss": 0.6172184348106384,
"step": 1695
},
{
"epoch": 2.1998377939983778,
"grad_norm": 0.6307165026664734,
"learning_rate": 1.8285517468982905e-06,
"loss": 0.589012622833252,
"step": 1696
},
{
"epoch": 2.2011354420113545,
"grad_norm": 0.6177083253860474,
"learning_rate": 1.8230275648849243e-06,
"loss": 0.5813847780227661,
"step": 1697
},
{
"epoch": 2.2024330900243307,
"grad_norm": 0.6012999415397644,
"learning_rate": 1.8175098789760848e-06,
"loss": 0.5948748588562012,
"step": 1698
},
{
"epoch": 2.2037307380373075,
"grad_norm": 0.6464450359344482,
"learning_rate": 1.8119987004540373e-06,
"loss": 0.5775672197341919,
"step": 1699
},
{
"epoch": 2.2050283860502837,
"grad_norm": 0.6167866587638855,
"learning_rate": 1.8064940405877546e-06,
"loss": 0.6011961698532104,
"step": 1700
},
{
"epoch": 2.2063260340632604,
"grad_norm": 0.6373000741004944,
"learning_rate": 1.8009959106328655e-06,
"loss": 0.5679797530174255,
"step": 1701
},
{
"epoch": 2.2076236820762367,
"grad_norm": 0.5966001152992249,
"learning_rate": 1.7955043218316615e-06,
"loss": 0.5757954120635986,
"step": 1702
},
{
"epoch": 2.2089213300892134,
"grad_norm": 0.6121652722358704,
"learning_rate": 1.7900192854130465e-06,
"loss": 0.5717330574989319,
"step": 1703
},
{
"epoch": 2.2102189781021897,
"grad_norm": 0.6737116575241089,
"learning_rate": 1.7845408125925328e-06,
"loss": 0.5650469064712524,
"step": 1704
},
{
"epoch": 2.2115166261151664,
"grad_norm": 0.6384139060974121,
"learning_rate": 1.7790689145722111e-06,
"loss": 0.5935101509094238,
"step": 1705
},
{
"epoch": 2.2128142741281427,
"grad_norm": 0.5904914140701294,
"learning_rate": 1.7736036025407282e-06,
"loss": 0.5071459412574768,
"step": 1706
},
{
"epoch": 2.2141119221411194,
"grad_norm": 0.6069095730781555,
"learning_rate": 1.7681448876732632e-06,
"loss": 0.5586497783660889,
"step": 1707
},
{
"epoch": 2.2154095701540957,
"grad_norm": 0.6180804967880249,
"learning_rate": 1.7626927811315087e-06,
"loss": 0.6200004816055298,
"step": 1708
},
{
"epoch": 2.2167072181670724,
"grad_norm": 0.6510108113288879,
"learning_rate": 1.7572472940636375e-06,
"loss": 0.6552962064743042,
"step": 1709
},
{
"epoch": 2.2180048661800487,
"grad_norm": 0.6048802137374878,
"learning_rate": 1.7518084376042988e-06,
"loss": 0.5669337511062622,
"step": 1710
},
{
"epoch": 2.219302514193025,
"grad_norm": 0.6182539463043213,
"learning_rate": 1.7463762228745728e-06,
"loss": 0.5660184621810913,
"step": 1711
},
{
"epoch": 2.2206001622060016,
"grad_norm": 0.8579080700874329,
"learning_rate": 1.7409506609819648e-06,
"loss": 0.5399761199951172,
"step": 1712
},
{
"epoch": 2.221897810218978,
"grad_norm": 0.5763684511184692,
"learning_rate": 1.735531763020376e-06,
"loss": 0.5553586483001709,
"step": 1713
},
{
"epoch": 2.2231954582319546,
"grad_norm": 0.5882948040962219,
"learning_rate": 1.7301195400700815e-06,
"loss": 0.5055762529373169,
"step": 1714
},
{
"epoch": 2.224493106244931,
"grad_norm": 0.6292950510978699,
"learning_rate": 1.7247140031977073e-06,
"loss": 0.6296324133872986,
"step": 1715
},
{
"epoch": 2.2257907542579076,
"grad_norm": 0.600114643573761,
"learning_rate": 1.7193151634562071e-06,
"loss": 0.5636775493621826,
"step": 1716
},
{
"epoch": 2.227088402270884,
"grad_norm": 0.6101363301277161,
"learning_rate": 1.7139230318848432e-06,
"loss": 0.6061251163482666,
"step": 1717
},
{
"epoch": 2.2283860502838606,
"grad_norm": 0.6061044335365295,
"learning_rate": 1.7085376195091591e-06,
"loss": 0.6004489660263062,
"step": 1718
},
{
"epoch": 2.229683698296837,
"grad_norm": 0.6100283265113831,
"learning_rate": 1.7031589373409596e-06,
"loss": 0.571765661239624,
"step": 1719
},
{
"epoch": 2.2309813463098136,
"grad_norm": 0.6510441303253174,
"learning_rate": 1.6977869963782895e-06,
"loss": 0.5853846073150635,
"step": 1720
},
{
"epoch": 2.23227899432279,
"grad_norm": 0.635164201259613,
"learning_rate": 1.6924218076054095e-06,
"loss": 0.6079269647598267,
"step": 1721
},
{
"epoch": 2.2335766423357666,
"grad_norm": 0.641042172908783,
"learning_rate": 1.6870633819927672e-06,
"loss": 0.7038273811340332,
"step": 1722
},
{
"epoch": 2.234874290348743,
"grad_norm": 0.621701717376709,
"learning_rate": 1.6817117304969944e-06,
"loss": 0.5776299238204956,
"step": 1723
},
{
"epoch": 2.2361719383617196,
"grad_norm": 0.5985130071640015,
"learning_rate": 1.676366864060856e-06,
"loss": 0.5792907476425171,
"step": 1724
},
{
"epoch": 2.237469586374696,
"grad_norm": 0.6016199588775635,
"learning_rate": 1.6710287936132592e-06,
"loss": 0.518044650554657,
"step": 1725
},
{
"epoch": 2.238767234387672,
"grad_norm": 0.5871309041976929,
"learning_rate": 1.6656975300692008e-06,
"loss": 0.5443193316459656,
"step": 1726
},
{
"epoch": 2.240064882400649,
"grad_norm": 0.6189736723899841,
"learning_rate": 1.660373084329767e-06,
"loss": 0.6327258944511414,
"step": 1727
},
{
"epoch": 2.241362530413625,
"grad_norm": 0.6076374650001526,
"learning_rate": 1.6550554672821028e-06,
"loss": 0.5638880729675293,
"step": 1728
},
{
"epoch": 2.242660178426602,
"grad_norm": 0.6918789744377136,
"learning_rate": 1.6497446897993885e-06,
"loss": 0.6006242632865906,
"step": 1729
},
{
"epoch": 2.243957826439578,
"grad_norm": 0.636972188949585,
"learning_rate": 1.6444407627408194e-06,
"loss": 0.5908925533294678,
"step": 1730
},
{
"epoch": 2.245255474452555,
"grad_norm": 0.6132383942604065,
"learning_rate": 1.639143696951586e-06,
"loss": 0.5603156089782715,
"step": 1731
},
{
"epoch": 2.246553122465531,
"grad_norm": 0.6161746382713318,
"learning_rate": 1.6338535032628427e-06,
"loss": 0.5923026204109192,
"step": 1732
},
{
"epoch": 2.247850770478508,
"grad_norm": 0.6077067255973816,
"learning_rate": 1.6285701924917025e-06,
"loss": 0.5932834148406982,
"step": 1733
},
{
"epoch": 2.249148418491484,
"grad_norm": 0.6137869954109192,
"learning_rate": 1.6232937754411938e-06,
"loss": 0.5695690512657166,
"step": 1734
},
{
"epoch": 2.2504460665044608,
"grad_norm": 0.5874996185302734,
"learning_rate": 1.6180242629002558e-06,
"loss": 0.5515947341918945,
"step": 1735
},
{
"epoch": 2.251743714517437,
"grad_norm": 0.5972124934196472,
"learning_rate": 1.6127616656437078e-06,
"loss": 0.6108847260475159,
"step": 1736
},
{
"epoch": 2.2530413625304138,
"grad_norm": 0.6362358927726746,
"learning_rate": 1.6075059944322297e-06,
"loss": 0.5956808924674988,
"step": 1737
},
{
"epoch": 2.25433901054339,
"grad_norm": 0.6626409888267517,
"learning_rate": 1.6022572600123382e-06,
"loss": 0.5291856527328491,
"step": 1738
},
{
"epoch": 2.2556366585563667,
"grad_norm": 0.6440781354904175,
"learning_rate": 1.5970154731163667e-06,
"loss": 0.6244629621505737,
"step": 1739
},
{
"epoch": 2.256934306569343,
"grad_norm": 0.598318874835968,
"learning_rate": 1.5917806444624434e-06,
"loss": 0.5915838479995728,
"step": 1740
},
{
"epoch": 2.2582319545823197,
"grad_norm": 0.6128567457199097,
"learning_rate": 1.5865527847544692e-06,
"loss": 0.5356861352920532,
"step": 1741
},
{
"epoch": 2.259529602595296,
"grad_norm": 0.6078605651855469,
"learning_rate": 1.581331904682089e-06,
"loss": 0.5974371433258057,
"step": 1742
},
{
"epoch": 2.2608272506082727,
"grad_norm": 0.6011683344841003,
"learning_rate": 1.576118014920688e-06,
"loss": 0.5702426433563232,
"step": 1743
},
{
"epoch": 2.262124898621249,
"grad_norm": 0.6138583421707153,
"learning_rate": 1.5709111261313454e-06,
"loss": 0.6526232361793518,
"step": 1744
},
{
"epoch": 2.2634225466342253,
"grad_norm": 0.5757991075515747,
"learning_rate": 1.5657112489608316e-06,
"loss": 0.5384607315063477,
"step": 1745
},
{
"epoch": 2.264720194647202,
"grad_norm": 0.5720049142837524,
"learning_rate": 1.5605183940415842e-06,
"loss": 0.5239338278770447,
"step": 1746
},
{
"epoch": 2.2660178426601782,
"grad_norm": 0.6321298480033875,
"learning_rate": 1.5553325719916717e-06,
"loss": 0.5788372159004211,
"step": 1747
},
{
"epoch": 2.267315490673155,
"grad_norm": 0.6393312215805054,
"learning_rate": 1.5501537934147897e-06,
"loss": 0.6262606978416443,
"step": 1748
},
{
"epoch": 2.2686131386861312,
"grad_norm": 0.5900475978851318,
"learning_rate": 1.5449820689002298e-06,
"loss": 0.5757325887680054,
"step": 1749
},
{
"epoch": 2.269910786699108,
"grad_norm": 0.9620885848999023,
"learning_rate": 1.5398174090228595e-06,
"loss": 0.5125218629837036,
"step": 1750
},
{
"epoch": 2.2712084347120842,
"grad_norm": 0.611209511756897,
"learning_rate": 1.534659824343101e-06,
"loss": 0.5692592859268188,
"step": 1751
},
{
"epoch": 2.272506082725061,
"grad_norm": 0.5884798169136047,
"learning_rate": 1.5295093254069093e-06,
"loss": 0.561367392539978,
"step": 1752
},
{
"epoch": 2.273803730738037,
"grad_norm": 0.9114322662353516,
"learning_rate": 1.524365922745752e-06,
"loss": 0.5305287837982178,
"step": 1753
},
{
"epoch": 2.275101378751014,
"grad_norm": 0.615672767162323,
"learning_rate": 1.519229626876586e-06,
"loss": 0.5678682923316956,
"step": 1754
},
{
"epoch": 2.27639902676399,
"grad_norm": 0.596740186214447,
"learning_rate": 1.5141004483018323e-06,
"loss": 0.562171995639801,
"step": 1755
},
{
"epoch": 2.277696674776967,
"grad_norm": 0.5916588306427002,
"learning_rate": 1.5089783975093698e-06,
"loss": 0.5581475496292114,
"step": 1756
},
{
"epoch": 2.278994322789943,
"grad_norm": 0.5962932705879211,
"learning_rate": 1.5038634849724898e-06,
"loss": 0.5466150045394897,
"step": 1757
},
{
"epoch": 2.28029197080292,
"grad_norm": 0.6148486137390137,
"learning_rate": 1.4987557211498966e-06,
"loss": 0.562313973903656,
"step": 1758
},
{
"epoch": 2.281589618815896,
"grad_norm": 0.6286764740943909,
"learning_rate": 1.4936551164856739e-06,
"loss": 0.585920512676239,
"step": 1759
},
{
"epoch": 2.2828872668288724,
"grad_norm": 0.5897656679153442,
"learning_rate": 1.4885616814092673e-06,
"loss": 0.5238120555877686,
"step": 1760
},
{
"epoch": 2.284184914841849,
"grad_norm": 0.6284115314483643,
"learning_rate": 1.4834754263354628e-06,
"loss": 0.6318528652191162,
"step": 1761
},
{
"epoch": 2.285482562854826,
"grad_norm": 0.6478753685951233,
"learning_rate": 1.4783963616643654e-06,
"loss": 0.6090703010559082,
"step": 1762
},
{
"epoch": 2.286780210867802,
"grad_norm": 0.6065962314605713,
"learning_rate": 1.4733244977813726e-06,
"loss": 0.6407983303070068,
"step": 1763
},
{
"epoch": 2.2880778588807784,
"grad_norm": 0.6130876541137695,
"learning_rate": 1.468259845057169e-06,
"loss": 0.5580013990402222,
"step": 1764
},
{
"epoch": 2.289375506893755,
"grad_norm": 0.5975884199142456,
"learning_rate": 1.4632024138476803e-06,
"loss": 0.5697616338729858,
"step": 1765
},
{
"epoch": 2.2906731549067314,
"grad_norm": 0.6038120985031128,
"learning_rate": 1.4581522144940802e-06,
"loss": 0.5938565731048584,
"step": 1766
},
{
"epoch": 2.291970802919708,
"grad_norm": 0.6311531066894531,
"learning_rate": 1.4531092573227434e-06,
"loss": 0.5615339875221252,
"step": 1767
},
{
"epoch": 2.2932684509326844,
"grad_norm": 0.7556526064872742,
"learning_rate": 1.4480735526452427e-06,
"loss": 0.6018495559692383,
"step": 1768
},
{
"epoch": 2.294566098945661,
"grad_norm": 0.5966140627861023,
"learning_rate": 1.4430451107583187e-06,
"loss": 0.5482977628707886,
"step": 1769
},
{
"epoch": 2.2958637469586374,
"grad_norm": 0.6495786309242249,
"learning_rate": 1.4380239419438636e-06,
"loss": 0.6411464810371399,
"step": 1770
},
{
"epoch": 2.297161394971614,
"grad_norm": 0.6380159258842468,
"learning_rate": 1.433010056468896e-06,
"loss": 0.585355281829834,
"step": 1771
},
{
"epoch": 2.2984590429845904,
"grad_norm": 0.6330418586730957,
"learning_rate": 1.4280034645855429e-06,
"loss": 0.6234038472175598,
"step": 1772
},
{
"epoch": 2.299756690997567,
"grad_norm": 0.6164976358413696,
"learning_rate": 1.4230041765310171e-06,
"loss": 0.6070310473442078,
"step": 1773
},
{
"epoch": 2.3010543390105433,
"grad_norm": 0.5887792706489563,
"learning_rate": 1.4180122025275972e-06,
"loss": 0.49864742159843445,
"step": 1774
},
{
"epoch": 2.30235198702352,
"grad_norm": 0.5811118483543396,
"learning_rate": 1.4130275527826077e-06,
"loss": 0.6116331815719604,
"step": 1775
},
{
"epoch": 2.3036496350364963,
"grad_norm": 0.6207771897315979,
"learning_rate": 1.4080502374883947e-06,
"loss": 0.6092080473899841,
"step": 1776
},
{
"epoch": 2.304947283049473,
"grad_norm": 0.6548723578453064,
"learning_rate": 1.4030802668223097e-06,
"loss": 0.5866458415985107,
"step": 1777
},
{
"epoch": 2.3062449310624493,
"grad_norm": 0.6081016659736633,
"learning_rate": 1.398117650946681e-06,
"loss": 0.5727241039276123,
"step": 1778
},
{
"epoch": 2.3075425790754256,
"grad_norm": 0.6203863024711609,
"learning_rate": 1.3931624000088073e-06,
"loss": 0.5507431030273438,
"step": 1779
},
{
"epoch": 2.3088402270884023,
"grad_norm": 0.61940598487854,
"learning_rate": 1.3882145241409184e-06,
"loss": 0.6124242544174194,
"step": 1780
},
{
"epoch": 2.3101378751013786,
"grad_norm": 0.6263229250907898,
"learning_rate": 1.3832740334601692e-06,
"loss": 0.6032424569129944,
"step": 1781
},
{
"epoch": 2.3114355231143553,
"grad_norm": 0.5885775685310364,
"learning_rate": 1.3783409380686135e-06,
"loss": 0.5357505083084106,
"step": 1782
},
{
"epoch": 2.3127331711273316,
"grad_norm": 0.6165185570716858,
"learning_rate": 1.3734152480531821e-06,
"loss": 0.6190866231918335,
"step": 1783
},
{
"epoch": 2.3140308191403083,
"grad_norm": 0.6044617295265198,
"learning_rate": 1.3684969734856646e-06,
"loss": 0.5655971765518188,
"step": 1784
},
{
"epoch": 2.3153284671532846,
"grad_norm": 0.6254231929779053,
"learning_rate": 1.363586124422689e-06,
"loss": 0.5936893224716187,
"step": 1785
},
{
"epoch": 2.3166261151662613,
"grad_norm": 0.6128689646720886,
"learning_rate": 1.3586827109056944e-06,
"loss": 0.5749369263648987,
"step": 1786
},
{
"epoch": 2.3179237631792375,
"grad_norm": 0.6421996355056763,
"learning_rate": 1.3537867429609263e-06,
"loss": 0.5559523105621338,
"step": 1787
},
{
"epoch": 2.3192214111922143,
"grad_norm": 0.6680915951728821,
"learning_rate": 1.3488982305993942e-06,
"loss": 0.5511724352836609,
"step": 1788
},
{
"epoch": 2.3205190592051905,
"grad_norm": 0.6443539261817932,
"learning_rate": 1.3440171838168743e-06,
"loss": 0.5881825089454651,
"step": 1789
},
{
"epoch": 2.3218167072181672,
"grad_norm": 0.612708568572998,
"learning_rate": 1.3391436125938673e-06,
"loss": 0.5950250625610352,
"step": 1790
},
{
"epoch": 2.3231143552311435,
"grad_norm": 0.595385730266571,
"learning_rate": 1.3342775268955943e-06,
"loss": 0.5954742431640625,
"step": 1791
},
{
"epoch": 2.3244120032441202,
"grad_norm": 0.6444376111030579,
"learning_rate": 1.329418936671969e-06,
"loss": 0.5775749087333679,
"step": 1792
},
{
"epoch": 2.3257096512570965,
"grad_norm": 0.6064639687538147,
"learning_rate": 1.3245678518575782e-06,
"loss": 0.5845799446105957,
"step": 1793
},
{
"epoch": 2.3270072992700728,
"grad_norm": 0.6051777601242065,
"learning_rate": 1.319724282371664e-06,
"loss": 0.5920668840408325,
"step": 1794
},
{
"epoch": 2.3283049472830495,
"grad_norm": 0.6336135268211365,
"learning_rate": 1.3148882381181e-06,
"loss": 0.562667965888977,
"step": 1795
},
{
"epoch": 2.329602595296026,
"grad_norm": 0.6154525876045227,
"learning_rate": 1.3100597289853689e-06,
"loss": 0.5847402811050415,
"step": 1796
},
{
"epoch": 2.3309002433090025,
"grad_norm": 0.6278738379478455,
"learning_rate": 1.3052387648465559e-06,
"loss": 0.6408085823059082,
"step": 1797
},
{
"epoch": 2.3321978913219787,
"grad_norm": 0.6477576494216919,
"learning_rate": 1.3004253555593071e-06,
"loss": 0.5616024732589722,
"step": 1798
},
{
"epoch": 2.3334955393349555,
"grad_norm": 0.6108107566833496,
"learning_rate": 1.2956195109658287e-06,
"loss": 0.5326311588287354,
"step": 1799
},
{
"epoch": 2.3347931873479317,
"grad_norm": 1.3087694644927979,
"learning_rate": 1.2908212408928561e-06,
"loss": 0.6685813069343567,
"step": 1800
},
{
"epoch": 2.3360908353609084,
"grad_norm": 0.592055082321167,
"learning_rate": 1.2860305551516355e-06,
"loss": 0.6329461932182312,
"step": 1801
},
{
"epoch": 2.3373884833738847,
"grad_norm": 0.6386983394622803,
"learning_rate": 1.281247463537912e-06,
"loss": 0.5208531618118286,
"step": 1802
},
{
"epoch": 2.3386861313868614,
"grad_norm": 0.6252365112304688,
"learning_rate": 1.276471975831891e-06,
"loss": 0.5943001508712769,
"step": 1803
},
{
"epoch": 2.3399837793998377,
"grad_norm": 0.6460595726966858,
"learning_rate": 1.2717041017982396e-06,
"loss": 0.6217683553695679,
"step": 1804
},
{
"epoch": 2.3412814274128144,
"grad_norm": 0.6099584698677063,
"learning_rate": 1.2669438511860527e-06,
"loss": 0.5706977844238281,
"step": 1805
},
{
"epoch": 2.3425790754257907,
"grad_norm": 0.6689403653144836,
"learning_rate": 1.2621912337288372e-06,
"loss": 0.551365077495575,
"step": 1806
},
{
"epoch": 2.3438767234387674,
"grad_norm": 0.606182873249054,
"learning_rate": 1.257446259144494e-06,
"loss": 0.5419661998748779,
"step": 1807
},
{
"epoch": 2.3451743714517437,
"grad_norm": 0.5901670455932617,
"learning_rate": 1.2527089371352968e-06,
"loss": 0.5732494592666626,
"step": 1808
},
{
"epoch": 2.34647201946472,
"grad_norm": 0.6110414862632751,
"learning_rate": 1.2479792773878647e-06,
"loss": 0.6051602363586426,
"step": 1809
},
{
"epoch": 2.3477696674776967,
"grad_norm": 0.6416681408882141,
"learning_rate": 1.243257289573161e-06,
"loss": 0.593826949596405,
"step": 1810
},
{
"epoch": 2.3490673154906734,
"grad_norm": 0.6288197636604309,
"learning_rate": 1.2385429833464513e-06,
"loss": 0.5421499609947205,
"step": 1811
},
{
"epoch": 2.3503649635036497,
"grad_norm": 0.6199961304664612,
"learning_rate": 1.2338363683472998e-06,
"loss": 0.5908663868904114,
"step": 1812
},
{
"epoch": 2.351662611516626,
"grad_norm": 0.6229044198989868,
"learning_rate": 1.2291374541995437e-06,
"loss": 0.5933829545974731,
"step": 1813
},
{
"epoch": 2.3529602595296026,
"grad_norm": 0.6609744429588318,
"learning_rate": 1.224446250511272e-06,
"loss": 0.594125509262085,
"step": 1814
},
{
"epoch": 2.354257907542579,
"grad_norm": 0.6363682150840759,
"learning_rate": 1.2197627668748101e-06,
"loss": 0.5930228233337402,
"step": 1815
},
{
"epoch": 2.3555555555555556,
"grad_norm": 0.6157255172729492,
"learning_rate": 1.2150870128666959e-06,
"loss": 0.5634854435920715,
"step": 1816
},
{
"epoch": 2.356853203568532,
"grad_norm": 0.6403535604476929,
"learning_rate": 1.2104189980476627e-06,
"loss": 0.5946694612503052,
"step": 1817
},
{
"epoch": 2.3581508515815086,
"grad_norm": 0.6029789447784424,
"learning_rate": 1.2057587319626213e-06,
"loss": 0.5258057713508606,
"step": 1818
},
{
"epoch": 2.359448499594485,
"grad_norm": 0.6252802014350891,
"learning_rate": 1.2011062241406313e-06,
"loss": 0.5830211639404297,
"step": 1819
},
{
"epoch": 2.3607461476074616,
"grad_norm": 0.608201801776886,
"learning_rate": 1.1964614840949002e-06,
"loss": 0.6013060212135315,
"step": 1820
},
{
"epoch": 2.362043795620438,
"grad_norm": 0.6110815405845642,
"learning_rate": 1.1918245213227408e-06,
"loss": 0.576073169708252,
"step": 1821
},
{
"epoch": 2.3633414436334146,
"grad_norm": 0.605087161064148,
"learning_rate": 1.1871953453055707e-06,
"loss": 0.6136230826377869,
"step": 1822
},
{
"epoch": 2.364639091646391,
"grad_norm": 0.6053324341773987,
"learning_rate": 1.182573965508882e-06,
"loss": 0.5785141587257385,
"step": 1823
},
{
"epoch": 2.3659367396593676,
"grad_norm": 0.6085898876190186,
"learning_rate": 1.1779603913822274e-06,
"loss": 0.5601797103881836,
"step": 1824
},
{
"epoch": 2.367234387672344,
"grad_norm": 0.6608554124832153,
"learning_rate": 1.1733546323591981e-06,
"loss": 0.5785682797431946,
"step": 1825
},
{
"epoch": 2.3685320356853206,
"grad_norm": 0.6056334972381592,
"learning_rate": 1.168756697857406e-06,
"loss": 0.5939031839370728,
"step": 1826
},
{
"epoch": 2.369829683698297,
"grad_norm": 0.6553589105606079,
"learning_rate": 1.1641665972784628e-06,
"loss": 0.6532239317893982,
"step": 1827
},
{
"epoch": 2.371127331711273,
"grad_norm": 0.6094745397567749,
"learning_rate": 1.1595843400079636e-06,
"loss": 0.5682094097137451,
"step": 1828
},
{
"epoch": 2.37242497972425,
"grad_norm": 0.623717188835144,
"learning_rate": 1.1550099354154615e-06,
"loss": 0.6046154499053955,
"step": 1829
},
{
"epoch": 2.373722627737226,
"grad_norm": 0.631445050239563,
"learning_rate": 1.1504433928544594e-06,
"loss": 0.6053498387336731,
"step": 1830
},
{
"epoch": 2.375020275750203,
"grad_norm": 0.6280617117881775,
"learning_rate": 1.1458847216623813e-06,
"loss": 0.5817880630493164,
"step": 1831
},
{
"epoch": 2.376317923763179,
"grad_norm": 0.6309313178062439,
"learning_rate": 1.141333931160552e-06,
"loss": 0.6206140518188477,
"step": 1832
},
{
"epoch": 2.377615571776156,
"grad_norm": 0.6384704113006592,
"learning_rate": 1.1367910306541918e-06,
"loss": 0.6599752306938171,
"step": 1833
},
{
"epoch": 2.378913219789132,
"grad_norm": 0.6254469752311707,
"learning_rate": 1.1322560294323775e-06,
"loss": 0.5889034271240234,
"step": 1834
},
{
"epoch": 2.3802108678021088,
"grad_norm": 0.6390111446380615,
"learning_rate": 1.1277289367680411e-06,
"loss": 0.6020563840866089,
"step": 1835
},
{
"epoch": 2.381508515815085,
"grad_norm": 0.6277632117271423,
"learning_rate": 1.123209761917941e-06,
"loss": 0.5417424440383911,
"step": 1836
},
{
"epoch": 2.3828061638280618,
"grad_norm": 0.6135120987892151,
"learning_rate": 1.1186985141226458e-06,
"loss": 0.5558514595031738,
"step": 1837
},
{
"epoch": 2.384103811841038,
"grad_norm": 0.6234643459320068,
"learning_rate": 1.1141952026065156e-06,
"loss": 0.6145384311676025,
"step": 1838
},
{
"epoch": 2.3854014598540147,
"grad_norm": 0.6055371165275574,
"learning_rate": 1.1096998365776828e-06,
"loss": 0.5748616456985474,
"step": 1839
},
{
"epoch": 2.386699107866991,
"grad_norm": 0.6127825379371643,
"learning_rate": 1.1052124252280322e-06,
"loss": 0.5389982461929321,
"step": 1840
},
{
"epoch": 2.386699107866991,
"eval_loss": 0.6825700998306274,
"eval_runtime": 72.9215,
"eval_samples_per_second": 71.2,
"eval_steps_per_second": 8.9,
"step": 1840
},
{
"epoch": 2.3879967558799677,
"grad_norm": 0.6031513214111328,
"learning_rate": 1.1007329777331866e-06,
"loss": 0.5840494632720947,
"step": 1841
},
{
"epoch": 2.389294403892944,
"grad_norm": 0.63483726978302,
"learning_rate": 1.096261503252478e-06,
"loss": 0.5311962366104126,
"step": 1842
},
{
"epoch": 2.3905920519059203,
"grad_norm": 0.6125195622444153,
"learning_rate": 1.0917980109289455e-06,
"loss": 0.5285024046897888,
"step": 1843
},
{
"epoch": 2.391889699918897,
"grad_norm": 0.5990893244743347,
"learning_rate": 1.0873425098892964e-06,
"loss": 0.5493112802505493,
"step": 1844
},
{
"epoch": 2.3931873479318737,
"grad_norm": 0.6030960083007812,
"learning_rate": 1.082895009243905e-06,
"loss": 0.5796130895614624,
"step": 1845
},
{
"epoch": 2.39448499594485,
"grad_norm": 0.6366276741027832,
"learning_rate": 1.078455518086784e-06,
"loss": 0.5433975458145142,
"step": 1846
},
{
"epoch": 2.3957826439578263,
"grad_norm": 0.5901277661323547,
"learning_rate": 1.0740240454955692e-06,
"loss": 0.5538575649261475,
"step": 1847
},
{
"epoch": 2.397080291970803,
"grad_norm": 0.6165037155151367,
"learning_rate": 1.0696006005314996e-06,
"loss": 0.5998971462249756,
"step": 1848
},
{
"epoch": 2.3983779399837792,
"grad_norm": 0.6113094091415405,
"learning_rate": 1.0651851922394035e-06,
"loss": 0.570077121257782,
"step": 1849
},
{
"epoch": 2.399675587996756,
"grad_norm": 0.6432837247848511,
"learning_rate": 1.0607778296476679e-06,
"loss": 0.6083425283432007,
"step": 1850
},
{
"epoch": 2.4009732360097322,
"grad_norm": 0.5917057394981384,
"learning_rate": 1.05637852176824e-06,
"loss": 0.5251022577285767,
"step": 1851
},
{
"epoch": 2.402270884022709,
"grad_norm": 0.6266626119613647,
"learning_rate": 1.051987277596585e-06,
"loss": 0.5856255292892456,
"step": 1852
},
{
"epoch": 2.403568532035685,
"grad_norm": 0.610355019569397,
"learning_rate": 1.0476041061116915e-06,
"loss": 0.6004334688186646,
"step": 1853
},
{
"epoch": 2.404866180048662,
"grad_norm": 0.5825424790382385,
"learning_rate": 1.0432290162760311e-06,
"loss": 0.5548322796821594,
"step": 1854
},
{
"epoch": 2.406163828061638,
"grad_norm": 0.6335608959197998,
"learning_rate": 1.038862017035558e-06,
"loss": 0.5934311747550964,
"step": 1855
},
{
"epoch": 2.407461476074615,
"grad_norm": 0.6018176078796387,
"learning_rate": 1.0345031173196785e-06,
"loss": 0.5377739071846008,
"step": 1856
},
{
"epoch": 2.408759124087591,
"grad_norm": 0.6398853659629822,
"learning_rate": 1.0301523260412405e-06,
"loss": 0.6047654151916504,
"step": 1857
},
{
"epoch": 2.410056772100568,
"grad_norm": 0.6761499643325806,
"learning_rate": 1.025809652096511e-06,
"loss": 0.6525087356567383,
"step": 1858
},
{
"epoch": 2.411354420113544,
"grad_norm": 0.5981181859970093,
"learning_rate": 1.0214751043651582e-06,
"loss": 0.5705087184906006,
"step": 1859
},
{
"epoch": 2.412652068126521,
"grad_norm": 0.6022308468818665,
"learning_rate": 1.0171486917102357e-06,
"loss": 0.5528420209884644,
"step": 1860
},
{
"epoch": 2.413949716139497,
"grad_norm": 0.576118528842926,
"learning_rate": 1.0128304229781622e-06,
"loss": 0.572098970413208,
"step": 1861
},
{
"epoch": 2.4152473641524734,
"grad_norm": 0.6066587567329407,
"learning_rate": 1.008520306998706e-06,
"loss": 0.5568013787269592,
"step": 1862
},
{
"epoch": 2.41654501216545,
"grad_norm": 0.7212052345275879,
"learning_rate": 1.0042183525849586e-06,
"loss": 0.5123892426490784,
"step": 1863
},
{
"epoch": 2.4178426601784264,
"grad_norm": 0.5919977426528931,
"learning_rate": 9.999245685333342e-07,
"loss": 0.5277501344680786,
"step": 1864
},
{
"epoch": 2.419140308191403,
"grad_norm": 0.5896833539009094,
"learning_rate": 9.95638963623528e-07,
"loss": 0.5733782649040222,
"step": 1865
},
{
"epoch": 2.4204379562043794,
"grad_norm": 0.6342105269432068,
"learning_rate": 9.913615466185234e-07,
"loss": 0.6013584136962891,
"step": 1866
},
{
"epoch": 2.421735604217356,
"grad_norm": 0.5951900482177734,
"learning_rate": 9.870923262645516e-07,
"loss": 0.5315797328948975,
"step": 1867
},
{
"epoch": 2.4230332522303324,
"grad_norm": 0.6201072931289673,
"learning_rate": 9.828313112910887e-07,
"loss": 0.5741020441055298,
"step": 1868
},
{
"epoch": 2.424330900243309,
"grad_norm": 0.6206340193748474,
"learning_rate": 9.78578510410832e-07,
"loss": 0.5911818146705627,
"step": 1869
},
{
"epoch": 2.4256285482562854,
"grad_norm": 0.6191825270652771,
"learning_rate": 9.743339323196827e-07,
"loss": 0.5818160772323608,
"step": 1870
},
{
"epoch": 2.426926196269262,
"grad_norm": 0.6224012970924377,
"learning_rate": 9.700975856967287e-07,
"loss": 0.5667495727539062,
"step": 1871
},
{
"epoch": 2.4282238442822384,
"grad_norm": 0.622602105140686,
"learning_rate": 9.658694792042284e-07,
"loss": 0.5867684483528137,
"step": 1872
},
{
"epoch": 2.429521492295215,
"grad_norm": 0.6468759179115295,
"learning_rate": 9.616496214875847e-07,
"loss": 0.5605747699737549,
"step": 1873
},
{
"epoch": 2.4308191403081914,
"grad_norm": 0.6025612950325012,
"learning_rate": 9.574380211753442e-07,
"loss": 0.5322221517562866,
"step": 1874
},
{
"epoch": 2.432116788321168,
"grad_norm": 0.601256251335144,
"learning_rate": 9.532346868791587e-07,
"loss": 0.6136845350265503,
"step": 1875
},
{
"epoch": 2.4334144363341443,
"grad_norm": 0.6094178557395935,
"learning_rate": 9.490396271937879e-07,
"loss": 0.6157099604606628,
"step": 1876
},
{
"epoch": 2.4347120843471206,
"grad_norm": 0.6287171244621277,
"learning_rate": 9.448528506970628e-07,
"loss": 0.5530134439468384,
"step": 1877
},
{
"epoch": 2.4360097323600973,
"grad_norm": 0.5963685512542725,
"learning_rate": 9.406743659498829e-07,
"loss": 0.5840374827384949,
"step": 1878
},
{
"epoch": 2.437307380373074,
"grad_norm": 0.6349402070045471,
"learning_rate": 9.365041814961928e-07,
"loss": 0.5503448843955994,
"step": 1879
},
{
"epoch": 2.4386050283860503,
"grad_norm": 0.6072769165039062,
"learning_rate": 9.323423058629638e-07,
"loss": 0.5658475756645203,
"step": 1880
},
{
"epoch": 2.4399026763990266,
"grad_norm": 0.6268115043640137,
"learning_rate": 9.281887475601775e-07,
"loss": 0.6097016334533691,
"step": 1881
},
{
"epoch": 2.4412003244120033,
"grad_norm": 0.5882371664047241,
"learning_rate": 9.240435150808113e-07,
"loss": 0.5780482292175293,
"step": 1882
},
{
"epoch": 2.4424979724249796,
"grad_norm": 0.6373420357704163,
"learning_rate": 9.19906616900813e-07,
"loss": 0.6226140260696411,
"step": 1883
},
{
"epoch": 2.4437956204379563,
"grad_norm": 0.6072852611541748,
"learning_rate": 9.157780614790963e-07,
"loss": 0.5743207335472107,
"step": 1884
},
{
"epoch": 2.4450932684509326,
"grad_norm": 0.634705126285553,
"learning_rate": 9.116578572575091e-07,
"loss": 0.6267349720001221,
"step": 1885
},
{
"epoch": 2.4463909164639093,
"grad_norm": 0.6120656132698059,
"learning_rate": 9.075460126608271e-07,
"loss": 0.6176955699920654,
"step": 1886
},
{
"epoch": 2.4476885644768855,
"grad_norm": 0.5967820882797241,
"learning_rate": 9.034425360967319e-07,
"loss": 0.6183077096939087,
"step": 1887
},
{
"epoch": 2.4489862124898623,
"grad_norm": 0.5987744331359863,
"learning_rate": 8.993474359557936e-07,
"loss": 0.5591214895248413,
"step": 1888
},
{
"epoch": 2.4502838605028385,
"grad_norm": 0.6169969439506531,
"learning_rate": 8.952607206114588e-07,
"loss": 0.5904876589775085,
"step": 1889
},
{
"epoch": 2.4515815085158152,
"grad_norm": 0.6008497476577759,
"learning_rate": 8.911823984200219e-07,
"loss": 0.5758087635040283,
"step": 1890
},
{
"epoch": 2.4528791565287915,
"grad_norm": 0.6111242175102234,
"learning_rate": 8.871124777206213e-07,
"loss": 0.6324316263198853,
"step": 1891
},
{
"epoch": 2.4541768045417682,
"grad_norm": 0.638118326663971,
"learning_rate": 8.83050966835215e-07,
"loss": 0.5944634079933167,
"step": 1892
},
{
"epoch": 2.4554744525547445,
"grad_norm": 0.6154019832611084,
"learning_rate": 8.789978740685646e-07,
"loss": 0.5495239496231079,
"step": 1893
},
{
"epoch": 2.456772100567721,
"grad_norm": 0.618356466293335,
"learning_rate": 8.749532077082179e-07,
"loss": 0.5651803016662598,
"step": 1894
},
{
"epoch": 2.4580697485806975,
"grad_norm": 0.6217320561408997,
"learning_rate": 8.709169760244968e-07,
"loss": 0.6198887825012207,
"step": 1895
},
{
"epoch": 2.4593673965936738,
"grad_norm": 0.6045297384262085,
"learning_rate": 8.668891872704682e-07,
"loss": 0.5438726544380188,
"step": 1896
},
{
"epoch": 2.4606650446066505,
"grad_norm": 0.614281952381134,
"learning_rate": 8.628698496819471e-07,
"loss": 0.5607205629348755,
"step": 1897
},
{
"epoch": 2.4619626926196267,
"grad_norm": 0.5984881520271301,
"learning_rate": 8.58858971477457e-07,
"loss": 0.6331669688224792,
"step": 1898
},
{
"epoch": 2.4632603406326035,
"grad_norm": 0.6256738901138306,
"learning_rate": 8.548565608582299e-07,
"loss": 0.5844709873199463,
"step": 1899
},
{
"epoch": 2.4645579886455797,
"grad_norm": 0.5857892036437988,
"learning_rate": 8.508626260081826e-07,
"loss": 0.5776396989822388,
"step": 1900
},
{
"epoch": 2.4658556366585564,
"grad_norm": 0.6575695872306824,
"learning_rate": 8.468771750939009e-07,
"loss": 0.5862407684326172,
"step": 1901
},
{
"epoch": 2.4671532846715327,
"grad_norm": 0.5867515206336975,
"learning_rate": 8.429002162646233e-07,
"loss": 0.5810645222663879,
"step": 1902
},
{
"epoch": 2.4684509326845094,
"grad_norm": 0.6347371935844421,
"learning_rate": 8.389317576522243e-07,
"loss": 0.6229629516601562,
"step": 1903
},
{
"epoch": 2.4697485806974857,
"grad_norm": 0.604457676410675,
"learning_rate": 8.349718073711971e-07,
"loss": 0.5473800897598267,
"step": 1904
},
{
"epoch": 2.4710462287104624,
"grad_norm": 0.6130659580230713,
"learning_rate": 8.310203735186384e-07,
"loss": 0.6687853932380676,
"step": 1905
},
{
"epoch": 2.4723438767234387,
"grad_norm": 0.6164904236793518,
"learning_rate": 8.270774641742275e-07,
"loss": 0.6242067217826843,
"step": 1906
},
{
"epoch": 2.4736415247364154,
"grad_norm": 0.64787358045578,
"learning_rate": 8.231430874002206e-07,
"loss": 0.5970586538314819,
"step": 1907
},
{
"epoch": 2.4749391727493917,
"grad_norm": 0.6561875939369202,
"learning_rate": 8.192172512414187e-07,
"loss": 0.5711146593093872,
"step": 1908
},
{
"epoch": 2.4762368207623684,
"grad_norm": 0.6017801761627197,
"learning_rate": 8.152999637251641e-07,
"loss": 0.5429533123970032,
"step": 1909
},
{
"epoch": 2.4775344687753447,
"grad_norm": 0.60152268409729,
"learning_rate": 8.113912328613183e-07,
"loss": 0.5184666514396667,
"step": 1910
},
{
"epoch": 2.478832116788321,
"grad_norm": 0.598573625087738,
"learning_rate": 8.074910666422475e-07,
"loss": 0.5503566861152649,
"step": 1911
},
{
"epoch": 2.4801297648012977,
"grad_norm": 0.6241352558135986,
"learning_rate": 8.035994730428031e-07,
"loss": 0.6021054983139038,
"step": 1912
},
{
"epoch": 2.4814274128142744,
"grad_norm": 0.6195024251937866,
"learning_rate": 7.997164600203111e-07,
"loss": 0.5467978715896606,
"step": 1913
},
{
"epoch": 2.4827250608272506,
"grad_norm": 0.6009840369224548,
"learning_rate": 7.958420355145469e-07,
"loss": 0.5863580703735352,
"step": 1914
},
{
"epoch": 2.484022708840227,
"grad_norm": 0.6128111481666565,
"learning_rate": 7.919762074477311e-07,
"loss": 0.5403767824172974,
"step": 1915
},
{
"epoch": 2.4853203568532036,
"grad_norm": 0.6071099042892456,
"learning_rate": 7.881189837245024e-07,
"loss": 0.5299487709999084,
"step": 1916
},
{
"epoch": 2.48661800486618,
"grad_norm": 0.6704837083816528,
"learning_rate": 7.842703722319073e-07,
"loss": 0.6165317893028259,
"step": 1917
},
{
"epoch": 2.4879156528791566,
"grad_norm": 0.6277005672454834,
"learning_rate": 7.804303808393831e-07,
"loss": 0.5439109206199646,
"step": 1918
},
{
"epoch": 2.489213300892133,
"grad_norm": 0.6348392367362976,
"learning_rate": 7.76599017398737e-07,
"loss": 0.6694045662879944,
"step": 1919
},
{
"epoch": 2.4905109489051096,
"grad_norm": 0.6145819425582886,
"learning_rate": 7.727762897441421e-07,
"loss": 0.550458550453186,
"step": 1920
},
{
"epoch": 2.491808596918086,
"grad_norm": 0.61981600522995,
"learning_rate": 7.689622056921053e-07,
"loss": 0.594965934753418,
"step": 1921
},
{
"epoch": 2.4931062449310626,
"grad_norm": 0.7170799374580383,
"learning_rate": 7.65156773041465e-07,
"loss": 0.6357606053352356,
"step": 1922
},
{
"epoch": 2.494403892944039,
"grad_norm": 0.6079750061035156,
"learning_rate": 7.613599995733667e-07,
"loss": 0.5912356376647949,
"step": 1923
},
{
"epoch": 2.4957015409570156,
"grad_norm": 0.6176713109016418,
"learning_rate": 7.575718930512516e-07,
"loss": 0.5135859847068787,
"step": 1924
},
{
"epoch": 2.496999188969992,
"grad_norm": 0.6063299179077148,
"learning_rate": 7.537924612208391e-07,
"loss": 0.5870840549468994,
"step": 1925
},
{
"epoch": 2.4982968369829686,
"grad_norm": 0.6175487041473389,
"learning_rate": 7.500217118101106e-07,
"loss": 0.5973732471466064,
"step": 1926
},
{
"epoch": 2.499594484995945,
"grad_norm": 0.6008102893829346,
"learning_rate": 7.462596525292937e-07,
"loss": 0.5943004488945007,
"step": 1927
},
{
"epoch": 2.5008921330089215,
"grad_norm": 0.6359487771987915,
"learning_rate": 7.425062910708492e-07,
"loss": 0.5653975009918213,
"step": 1928
},
{
"epoch": 2.502189781021898,
"grad_norm": 0.6241583824157715,
"learning_rate": 7.387616351094473e-07,
"loss": 0.5532112121582031,
"step": 1929
},
{
"epoch": 2.503487429034874,
"grad_norm": 0.6088744401931763,
"learning_rate": 7.350256923019666e-07,
"loss": 0.5315259695053101,
"step": 1930
},
{
"epoch": 2.504785077047851,
"grad_norm": 0.6145752668380737,
"learning_rate": 7.312984702874609e-07,
"loss": 0.600688099861145,
"step": 1931
},
{
"epoch": 2.5060827250608275,
"grad_norm": 0.6202653050422668,
"learning_rate": 7.275799766871577e-07,
"loss": 0.6020484566688538,
"step": 1932
},
{
"epoch": 2.507380373073804,
"grad_norm": 0.6492214798927307,
"learning_rate": 7.238702191044344e-07,
"loss": 0.6212818622589111,
"step": 1933
},
{
"epoch": 2.50867802108678,
"grad_norm": 0.5913106203079224,
"learning_rate": 7.201692051248066e-07,
"loss": 0.5435472726821899,
"step": 1934
},
{
"epoch": 2.509975669099757,
"grad_norm": 0.6050302982330322,
"learning_rate": 7.164769423159113e-07,
"loss": 0.6042004823684692,
"step": 1935
},
{
"epoch": 2.511273317112733,
"grad_norm": 0.6316038966178894,
"learning_rate": 7.127934382274926e-07,
"loss": 0.558472752571106,
"step": 1936
},
{
"epoch": 2.5125709651257098,
"grad_norm": 0.6041384339332581,
"learning_rate": 7.091187003913802e-07,
"loss": 0.6053918600082397,
"step": 1937
},
{
"epoch": 2.513868613138686,
"grad_norm": 0.6338528394699097,
"learning_rate": 7.054527363214875e-07,
"loss": 0.5851538777351379,
"step": 1938
},
{
"epoch": 2.5151662611516628,
"grad_norm": 0.7164930105209351,
"learning_rate": 7.017955535137788e-07,
"loss": 0.5775594115257263,
"step": 1939
},
{
"epoch": 2.516463909164639,
"grad_norm": 0.9809231758117676,
"learning_rate": 6.981471594462719e-07,
"loss": 0.6198115348815918,
"step": 1940
},
{
"epoch": 2.5177615571776153,
"grad_norm": 0.6024364829063416,
"learning_rate": 6.945075615790059e-07,
"loss": 0.5934704542160034,
"step": 1941
},
{
"epoch": 2.519059205190592,
"grad_norm": 0.6212522387504578,
"learning_rate": 6.908767673540384e-07,
"loss": 0.6180324554443359,
"step": 1942
},
{
"epoch": 2.5203568532035687,
"grad_norm": 0.6258326172828674,
"learning_rate": 6.872547841954241e-07,
"loss": 0.5982950925827026,
"step": 1943
},
{
"epoch": 2.521654501216545,
"grad_norm": 0.6158891320228577,
"learning_rate": 6.836416195092021e-07,
"loss": 0.5860976576805115,
"step": 1944
},
{
"epoch": 2.5229521492295213,
"grad_norm": 0.6238812208175659,
"learning_rate": 6.800372806833799e-07,
"loss": 0.5936440229415894,
"step": 1945
},
{
"epoch": 2.524249797242498,
"grad_norm": 0.5862494111061096,
"learning_rate": 6.764417750879182e-07,
"loss": 0.5802135467529297,
"step": 1946
},
{
"epoch": 2.5255474452554747,
"grad_norm": 0.6118647456169128,
"learning_rate": 6.728551100747155e-07,
"loss": 0.5778954029083252,
"step": 1947
},
{
"epoch": 2.526845093268451,
"grad_norm": 0.6207137703895569,
"learning_rate": 6.692772929775943e-07,
"loss": 0.6226284503936768,
"step": 1948
},
{
"epoch": 2.5281427412814272,
"grad_norm": 0.6094867587089539,
"learning_rate": 6.657083311122858e-07,
"loss": 0.5938500761985779,
"step": 1949
},
{
"epoch": 2.529440389294404,
"grad_norm": 0.6266283988952637,
"learning_rate": 6.621482317764105e-07,
"loss": 0.5501142740249634,
"step": 1950
},
{
"epoch": 2.5307380373073802,
"grad_norm": 0.6360139846801758,
"learning_rate": 6.585970022494748e-07,
"loss": 0.6632074117660522,
"step": 1951
},
{
"epoch": 2.532035685320357,
"grad_norm": 0.6052773594856262,
"learning_rate": 6.550546497928401e-07,
"loss": 0.5711944103240967,
"step": 1952
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.6809741258621216,
"learning_rate": 6.515211816497247e-07,
"loss": 0.5731922388076782,
"step": 1953
},
{
"epoch": 2.53463098134631,
"grad_norm": 0.6013851761817932,
"learning_rate": 6.479966050451736e-07,
"loss": 0.572198748588562,
"step": 1954
},
{
"epoch": 2.535928629359286,
"grad_norm": 0.6084575653076172,
"learning_rate": 6.444809271860547e-07,
"loss": 0.5986557006835938,
"step": 1955
},
{
"epoch": 2.537226277372263,
"grad_norm": 0.6349742412567139,
"learning_rate": 6.409741552610399e-07,
"loss": 0.5914225578308105,
"step": 1956
},
{
"epoch": 2.538523925385239,
"grad_norm": 0.6118656396865845,
"learning_rate": 6.374762964405895e-07,
"loss": 0.5655546188354492,
"step": 1957
},
{
"epoch": 2.539821573398216,
"grad_norm": 0.6187875270843506,
"learning_rate": 6.339873578769401e-07,
"loss": 0.5871388912200928,
"step": 1958
},
{
"epoch": 2.541119221411192,
"grad_norm": 48.24391555786133,
"learning_rate": 6.305073467040884e-07,
"loss": 0.5712297558784485,
"step": 1959
},
{
"epoch": 2.5424168694241684,
"grad_norm": 0.6253454685211182,
"learning_rate": 6.270362700377736e-07,
"loss": 0.6522243022918701,
"step": 1960
},
{
"epoch": 2.543714517437145,
"grad_norm": 0.5885297656059265,
"learning_rate": 6.235741349754731e-07,
"loss": 0.6240279078483582,
"step": 1961
},
{
"epoch": 2.545012165450122,
"grad_norm": 0.600005030632019,
"learning_rate": 6.201209485963744e-07,
"loss": 0.6034828424453735,
"step": 1962
},
{
"epoch": 2.546309813463098,
"grad_norm": 0.677692711353302,
"learning_rate": 6.166767179613691e-07,
"loss": 0.5885945558547974,
"step": 1963
},
{
"epoch": 2.5476074614760744,
"grad_norm": 0.6142828464508057,
"learning_rate": 6.132414501130385e-07,
"loss": 0.5538769960403442,
"step": 1964
},
{
"epoch": 2.548905109489051,
"grad_norm": 0.6016609072685242,
"learning_rate": 6.098151520756357e-07,
"loss": 0.5977665185928345,
"step": 1965
},
{
"epoch": 2.550202757502028,
"grad_norm": 2.0037388801574707,
"learning_rate": 6.063978308550722e-07,
"loss": 0.612566351890564,
"step": 1966
},
{
"epoch": 2.551500405515004,
"grad_norm": 0.602703869342804,
"learning_rate": 6.029894934389058e-07,
"loss": 0.5812326669692993,
"step": 1967
},
{
"epoch": 2.5527980535279804,
"grad_norm": 0.5868345499038696,
"learning_rate": 5.995901467963228e-07,
"loss": 0.5142446160316467,
"step": 1968
},
{
"epoch": 2.554095701540957,
"grad_norm": 0.625521719455719,
"learning_rate": 5.961997978781292e-07,
"loss": 0.5533977746963501,
"step": 1969
},
{
"epoch": 2.5553933495539334,
"grad_norm": 0.6117697358131409,
"learning_rate": 5.928184536167258e-07,
"loss": 0.6049879789352417,
"step": 1970
},
{
"epoch": 2.55669099756691,
"grad_norm": 0.6366870403289795,
"learning_rate": 5.89446120926111e-07,
"loss": 0.5416997671127319,
"step": 1971
},
{
"epoch": 2.5579886455798864,
"grad_norm": 0.6090091466903687,
"learning_rate": 5.860828067018481e-07,
"loss": 0.5767660737037659,
"step": 1972
},
{
"epoch": 2.559286293592863,
"grad_norm": 0.6263614892959595,
"learning_rate": 5.82728517821064e-07,
"loss": 0.5914768576622009,
"step": 1973
},
{
"epoch": 2.5605839416058394,
"grad_norm": 0.6438020467758179,
"learning_rate": 5.793832611424322e-07,
"loss": 0.5773044228553772,
"step": 1974
},
{
"epoch": 2.5618815896188156,
"grad_norm": 0.6195680499076843,
"learning_rate": 5.760470435061533e-07,
"loss": 0.5637648701667786,
"step": 1975
},
{
"epoch": 2.5631792376317923,
"grad_norm": 1.289580225944519,
"learning_rate": 5.727198717339511e-07,
"loss": 0.6060294508934021,
"step": 1976
},
{
"epoch": 2.564476885644769,
"grad_norm": 0.6049319505691528,
"learning_rate": 5.694017526290468e-07,
"loss": 0.5878962278366089,
"step": 1977
},
{
"epoch": 2.5657745336577453,
"grad_norm": 0.6546334028244019,
"learning_rate": 5.660926929761556e-07,
"loss": 0.5719892382621765,
"step": 1978
},
{
"epoch": 2.5670721816707216,
"grad_norm": 0.5887362957000732,
"learning_rate": 5.627926995414662e-07,
"loss": 0.5226088762283325,
"step": 1979
},
{
"epoch": 2.5683698296836983,
"grad_norm": 0.6115890741348267,
"learning_rate": 5.59501779072631e-07,
"loss": 0.5784634947776794,
"step": 1980
},
{
"epoch": 2.569667477696675,
"grad_norm": 0.6565897464752197,
"learning_rate": 5.562199382987488e-07,
"loss": 0.5947513580322266,
"step": 1981
},
{
"epoch": 2.5709651257096513,
"grad_norm": 0.594465970993042,
"learning_rate": 5.529471839303541e-07,
"loss": 0.5367786884307861,
"step": 1982
},
{
"epoch": 2.5722627737226276,
"grad_norm": 0.6155304908752441,
"learning_rate": 5.496835226593983e-07,
"loss": 0.6144155859947205,
"step": 1983
},
{
"epoch": 2.5735604217356043,
"grad_norm": 0.6233793497085571,
"learning_rate": 5.464289611592472e-07,
"loss": 0.5667406916618347,
"step": 1984
},
{
"epoch": 2.5748580697485806,
"grad_norm": 0.6025534272193909,
"learning_rate": 5.431835060846519e-07,
"loss": 0.5775101184844971,
"step": 1985
},
{
"epoch": 2.5761557177615573,
"grad_norm": 0.6037949323654175,
"learning_rate": 5.399471640717479e-07,
"loss": 0.6155390739440918,
"step": 1986
},
{
"epoch": 2.5774533657745335,
"grad_norm": 0.61771160364151,
"learning_rate": 5.367199417380347e-07,
"loss": 0.5459461808204651,
"step": 1987
},
{
"epoch": 2.5787510137875103,
"grad_norm": 0.6559909582138062,
"learning_rate": 5.335018456823665e-07,
"loss": 0.6187810897827148,
"step": 1988
},
{
"epoch": 2.5800486618004865,
"grad_norm": 0.6218096017837524,
"learning_rate": 5.302928824849335e-07,
"loss": 0.629378080368042,
"step": 1989
},
{
"epoch": 2.5813463098134632,
"grad_norm": 0.5922935605049133,
"learning_rate": 5.270930587072548e-07,
"loss": 0.5435377359390259,
"step": 1990
},
{
"epoch": 2.5826439578264395,
"grad_norm": 0.5918126106262207,
"learning_rate": 5.239023808921595e-07,
"loss": 0.5545147657394409,
"step": 1991
},
{
"epoch": 2.5839416058394162,
"grad_norm": 0.6067506074905396,
"learning_rate": 5.207208555637767e-07,
"loss": 0.6249223351478577,
"step": 1992
},
{
"epoch": 2.5852392538523925,
"grad_norm": 0.6125559210777283,
"learning_rate": 5.175484892275184e-07,
"loss": 0.5820242166519165,
"step": 1993
},
{
"epoch": 2.5865369018653688,
"grad_norm": 0.5970590114593506,
"learning_rate": 5.14385288370074e-07,
"loss": 0.6091808080673218,
"step": 1994
},
{
"epoch": 2.5878345498783455,
"grad_norm": 0.5902854204177856,
"learning_rate": 5.11231259459386e-07,
"loss": 0.5224129557609558,
"step": 1995
},
{
"epoch": 2.589132197891322,
"grad_norm": 0.604062020778656,
"learning_rate": 5.080864089446464e-07,
"loss": 0.5258910655975342,
"step": 1996
},
{
"epoch": 2.5904298459042985,
"grad_norm": 0.6816832423210144,
"learning_rate": 5.049507432562778e-07,
"loss": 0.5509624481201172,
"step": 1997
},
{
"epoch": 2.5917274939172747,
"grad_norm": 0.6220773458480835,
"learning_rate": 5.018242688059238e-07,
"loss": 0.6509982943534851,
"step": 1998
},
{
"epoch": 2.5930251419302515,
"grad_norm": 0.6238852143287659,
"learning_rate": 4.987069919864329e-07,
"loss": 0.6329154968261719,
"step": 1999
},
{
"epoch": 2.5943227899432277,
"grad_norm": 0.6279301643371582,
"learning_rate": 4.95598919171848e-07,
"loss": 0.624962329864502,
"step": 2000
},
{
"epoch": 2.5956204379562045,
"grad_norm": 0.6066421866416931,
"learning_rate": 4.925000567173882e-07,
"loss": 0.6009570360183716,
"step": 2001
},
{
"epoch": 2.5969180859691807,
"grad_norm": 0.6097516417503357,
"learning_rate": 4.894104109594466e-07,
"loss": 0.5533030033111572,
"step": 2002
},
{
"epoch": 2.5982157339821574,
"grad_norm": 0.60311359167099,
"learning_rate": 4.863299882155659e-07,
"loss": 0.5549200177192688,
"step": 2003
},
{
"epoch": 2.5995133819951337,
"grad_norm": 0.6075156927108765,
"learning_rate": 4.832587947844297e-07,
"loss": 0.5541381239891052,
"step": 2004
},
{
"epoch": 2.6008110300081104,
"grad_norm": 0.6099098324775696,
"learning_rate": 4.801968369458531e-07,
"loss": 0.6142464876174927,
"step": 2005
},
{
"epoch": 2.6021086780210867,
"grad_norm": 0.6433584690093994,
"learning_rate": 4.771441209607625e-07,
"loss": 0.6120733022689819,
"step": 2006
},
{
"epoch": 2.6034063260340634,
"grad_norm": 1.134731411933899,
"learning_rate": 4.7410065307119167e-07,
"loss": 0.6064984798431396,
"step": 2007
},
{
"epoch": 2.6047039740470397,
"grad_norm": 0.6147306561470032,
"learning_rate": 4.7106643950026067e-07,
"loss": 0.5834633111953735,
"step": 2008
},
{
"epoch": 2.606001622060016,
"grad_norm": 0.610374927520752,
"learning_rate": 4.6804148645216873e-07,
"loss": 0.5858355760574341,
"step": 2009
},
{
"epoch": 2.6072992700729927,
"grad_norm": 0.6226435899734497,
"learning_rate": 4.6502580011217934e-07,
"loss": 0.5983865261077881,
"step": 2010
},
{
"epoch": 2.6085969180859694,
"grad_norm": 0.6833674311637878,
"learning_rate": 4.6201938664660775e-07,
"loss": 0.6065071225166321,
"step": 2011
},
{
"epoch": 2.6098945660989457,
"grad_norm": 0.6266833543777466,
"learning_rate": 4.590222522028082e-07,
"loss": 0.5968768000602722,
"step": 2012
},
{
"epoch": 2.611192214111922,
"grad_norm": 0.6198201179504395,
"learning_rate": 4.5603440290916347e-07,
"loss": 0.6149097681045532,
"step": 2013
},
{
"epoch": 2.6124898621248986,
"grad_norm": 0.6224921941757202,
"learning_rate": 4.5305584487506605e-07,
"loss": 0.6195799708366394,
"step": 2014
},
{
"epoch": 2.6137875101378754,
"grad_norm": 0.5922067165374756,
"learning_rate": 4.500865841909169e-07,
"loss": 0.5795333385467529,
"step": 2015
},
{
"epoch": 2.6150851581508516,
"grad_norm": 0.6451519727706909,
"learning_rate": 4.471266269280994e-07,
"loss": 0.6512206196784973,
"step": 2016
},
{
"epoch": 2.616382806163828,
"grad_norm": 0.6207348108291626,
"learning_rate": 4.441759791389799e-07,
"loss": 0.6410412788391113,
"step": 2017
},
{
"epoch": 2.6176804541768046,
"grad_norm": 0.6637576818466187,
"learning_rate": 4.41234646856884e-07,
"loss": 0.5507533550262451,
"step": 2018
},
{
"epoch": 2.618978102189781,
"grad_norm": 0.6296217441558838,
"learning_rate": 4.383026360960929e-07,
"loss": 0.5853258371353149,
"step": 2019
},
{
"epoch": 2.6202757502027576,
"grad_norm": 0.5993384122848511,
"learning_rate": 4.35379952851826e-07,
"loss": 0.5613459944725037,
"step": 2020
},
{
"epoch": 2.621573398215734,
"grad_norm": 0.6372536420822144,
"learning_rate": 4.324666031002311e-07,
"loss": 0.563460111618042,
"step": 2021
},
{
"epoch": 2.6228710462287106,
"grad_norm": 0.6129400134086609,
"learning_rate": 4.29562592798371e-07,
"loss": 0.6133362650871277,
"step": 2022
},
{
"epoch": 2.624168694241687,
"grad_norm": 0.6232635974884033,
"learning_rate": 4.266679278842123e-07,
"loss": 0.5923752784729004,
"step": 2023
},
{
"epoch": 2.6254663422546636,
"grad_norm": 0.6236964464187622,
"learning_rate": 4.2378261427660994e-07,
"loss": 0.5925074815750122,
"step": 2024
},
{
"epoch": 2.62676399026764,
"grad_norm": 0.5997064113616943,
"learning_rate": 4.209066578753035e-07,
"loss": 0.5586100816726685,
"step": 2025
},
{
"epoch": 2.6280616382806166,
"grad_norm": 0.6276852488517761,
"learning_rate": 4.1804006456089174e-07,
"loss": 0.5699270367622375,
"step": 2026
},
{
"epoch": 2.629359286293593,
"grad_norm": 0.5818026065826416,
"learning_rate": 4.1518284019483655e-07,
"loss": 0.5539983510971069,
"step": 2027
},
{
"epoch": 2.630656934306569,
"grad_norm": 0.6021342277526855,
"learning_rate": 4.123349906194357e-07,
"loss": 0.5571432709693909,
"step": 2028
},
{
"epoch": 2.631954582319546,
"grad_norm": 0.6044632196426392,
"learning_rate": 4.094965216578212e-07,
"loss": 0.5815938711166382,
"step": 2029
},
{
"epoch": 2.6332522303325225,
"grad_norm": 0.6218861937522888,
"learning_rate": 4.066674391139458e-07,
"loss": 0.5798450112342834,
"step": 2030
},
{
"epoch": 2.634549878345499,
"grad_norm": 0.6776529550552368,
"learning_rate": 4.038477487725645e-07,
"loss": 0.5181751251220703,
"step": 2031
},
{
"epoch": 2.635847526358475,
"grad_norm": 0.6296592354774475,
"learning_rate": 4.0103745639923144e-07,
"loss": 0.6052215695381165,
"step": 2032
},
{
"epoch": 2.637145174371452,
"grad_norm": 0.6410042643547058,
"learning_rate": 3.9823656774028386e-07,
"loss": 0.5471499562263489,
"step": 2033
},
{
"epoch": 2.638442822384428,
"grad_norm": 0.6148339509963989,
"learning_rate": 3.9544508852282895e-07,
"loss": 0.6046350002288818,
"step": 2034
},
{
"epoch": 2.639740470397405,
"grad_norm": 0.6409063935279846,
"learning_rate": 3.9266302445473634e-07,
"loss": 0.5563018918037415,
"step": 2035
},
{
"epoch": 2.641038118410381,
"grad_norm": 0.6377732157707214,
"learning_rate": 3.89890381224623e-07,
"loss": 0.5965743064880371,
"step": 2036
},
{
"epoch": 2.6423357664233578,
"grad_norm": 0.6147736310958862,
"learning_rate": 3.8712716450183985e-07,
"loss": 0.558821439743042,
"step": 2037
},
{
"epoch": 2.643633414436334,
"grad_norm": 0.5959088802337646,
"learning_rate": 3.8437337993647017e-07,
"loss": 0.6072096824645996,
"step": 2038
},
{
"epoch": 2.6449310624493108,
"grad_norm": 0.5934545993804932,
"learning_rate": 3.81629033159302e-07,
"loss": 0.5585888028144836,
"step": 2039
},
{
"epoch": 2.646228710462287,
"grad_norm": 0.6148179173469543,
"learning_rate": 3.7889412978183324e-07,
"loss": 0.6224203705787659,
"step": 2040
},
{
"epoch": 2.6475263584752637,
"grad_norm": 0.6041895151138306,
"learning_rate": 3.7616867539624733e-07,
"loss": 0.5594790577888489,
"step": 2041
},
{
"epoch": 2.64882400648824,
"grad_norm": 0.6036660075187683,
"learning_rate": 3.734526755754092e-07,
"loss": 0.5392581820487976,
"step": 2042
},
{
"epoch": 2.6501216545012163,
"grad_norm": 0.6497801542282104,
"learning_rate": 3.707461358728509e-07,
"loss": 0.645263135433197,
"step": 2043
},
{
"epoch": 2.651419302514193,
"grad_norm": 0.6202139258384705,
"learning_rate": 3.680490618227611e-07,
"loss": 0.6205359697341919,
"step": 2044
},
{
"epoch": 2.6527169505271697,
"grad_norm": 0.5867362022399902,
"learning_rate": 3.6536145893997346e-07,
"loss": 0.5754397511482239,
"step": 2045
},
{
"epoch": 2.654014598540146,
"grad_norm": 0.6415355205535889,
"learning_rate": 3.626833327199564e-07,
"loss": 0.6042582392692566,
"step": 2046
},
{
"epoch": 2.6553122465531223,
"grad_norm": 0.6417367458343506,
"learning_rate": 3.600146886387984e-07,
"loss": 0.6140678524971008,
"step": 2047
},
{
"epoch": 2.656609894566099,
"grad_norm": 0.6080589890480042,
"learning_rate": 3.573555321532035e-07,
"loss": 0.574844241142273,
"step": 2048
},
{
"epoch": 2.6579075425790757,
"grad_norm": 0.6920068264007568,
"learning_rate": 3.547058687004723e-07,
"loss": 0.6025684475898743,
"step": 2049
},
{
"epoch": 2.659205190592052,
"grad_norm": 0.6130858659744263,
"learning_rate": 3.520657036984959e-07,
"loss": 0.5683197379112244,
"step": 2050
},
{
"epoch": 2.6605028386050282,
"grad_norm": 0.6280376315116882,
"learning_rate": 3.494350425457438e-07,
"loss": 0.5609173774719238,
"step": 2051
},
{
"epoch": 2.661800486618005,
"grad_norm": 0.6326773166656494,
"learning_rate": 3.46813890621252e-07,
"loss": 0.5946630239486694,
"step": 2052
},
{
"epoch": 2.663098134630981,
"grad_norm": 0.6118667721748352,
"learning_rate": 3.4420225328461286e-07,
"loss": 0.5908790826797485,
"step": 2053
},
{
"epoch": 2.664395782643958,
"grad_norm": 0.6427050828933716,
"learning_rate": 3.416001358759635e-07,
"loss": 0.6200711727142334,
"step": 2054
},
{
"epoch": 2.665693430656934,
"grad_norm": 0.6258965730667114,
"learning_rate": 3.390075437159762e-07,
"loss": 0.6091062426567078,
"step": 2055
},
{
"epoch": 2.666991078669911,
"grad_norm": 0.8197891116142273,
"learning_rate": 3.36424482105846e-07,
"loss": 0.6184768676757812,
"step": 2056
},
{
"epoch": 2.668288726682887,
"grad_norm": 0.6219103336334229,
"learning_rate": 3.338509563272774e-07,
"loss": 0.5699069499969482,
"step": 2057
},
{
"epoch": 2.669586374695864,
"grad_norm": 0.6160385012626648,
"learning_rate": 3.3128697164248213e-07,
"loss": 0.6063632369041443,
"step": 2058
},
{
"epoch": 2.67088402270884,
"grad_norm": 0.6377853155136108,
"learning_rate": 3.2873253329415986e-07,
"loss": 0.6303044557571411,
"step": 2059
},
{
"epoch": 2.672181670721817,
"grad_norm": 0.6218414306640625,
"learning_rate": 3.2618764650548806e-07,
"loss": 0.5987715721130371,
"step": 2060
},
{
"epoch": 2.673479318734793,
"grad_norm": 0.6107571125030518,
"learning_rate": 3.236523164801192e-07,
"loss": 0.5237259864807129,
"step": 2061
},
{
"epoch": 2.6747769667477694,
"grad_norm": 0.6305319666862488,
"learning_rate": 3.2112654840215863e-07,
"loss": 0.6254755854606628,
"step": 2062
},
{
"epoch": 2.676074614760746,
"grad_norm": 0.6144214868545532,
"learning_rate": 3.186103474361646e-07,
"loss": 0.6048131585121155,
"step": 2063
},
{
"epoch": 2.677372262773723,
"grad_norm": 0.6124334335327148,
"learning_rate": 3.161037187271304e-07,
"loss": 0.5881555080413818,
"step": 2064
},
{
"epoch": 2.678669910786699,
"grad_norm": 0.6141470670700073,
"learning_rate": 3.136066674004773e-07,
"loss": 0.5876516103744507,
"step": 2065
},
{
"epoch": 2.6799675587996754,
"grad_norm": 0.5808926820755005,
"learning_rate": 3.1111919856204373e-07,
"loss": 0.5583111047744751,
"step": 2066
},
{
"epoch": 2.681265206812652,
"grad_norm": 0.663599967956543,
"learning_rate": 3.08641317298074e-07,
"loss": 0.5772061944007874,
"step": 2067
},
{
"epoch": 2.6825628548256284,
"grad_norm": 0.6320760846138,
"learning_rate": 3.0617302867520736e-07,
"loss": 0.5595476031303406,
"step": 2068
},
{
"epoch": 2.683860502838605,
"grad_norm": 0.61170494556427,
"learning_rate": 3.0371433774047056e-07,
"loss": 0.6012779474258423,
"step": 2069
},
{
"epoch": 2.6851581508515814,
"grad_norm": 0.6115148067474365,
"learning_rate": 3.0126524952126203e-07,
"loss": 0.6057910919189453,
"step": 2070
},
{
"epoch": 2.6851581508515814,
"eval_loss": 0.6819512844085693,
"eval_runtime": 72.9512,
"eval_samples_per_second": 71.171,
"eval_steps_per_second": 8.896,
"step": 2070
},
{
"epoch": 2.686455798864558,
"grad_norm": 0.6251775026321411,
"learning_rate": 2.988257690253504e-07,
"loss": 0.6118081212043762,
"step": 2071
},
{
"epoch": 2.6877534468775344,
"grad_norm": 0.6253253221511841,
"learning_rate": 2.9639590124085296e-07,
"loss": 0.6572234630584717,
"step": 2072
},
{
"epoch": 2.689051094890511,
"grad_norm": 0.6017980575561523,
"learning_rate": 2.939756511362357e-07,
"loss": 0.5534753799438477,
"step": 2073
},
{
"epoch": 2.6903487429034874,
"grad_norm": 0.6164457201957703,
"learning_rate": 2.915650236602974e-07,
"loss": 0.6046677827835083,
"step": 2074
},
{
"epoch": 2.691646390916464,
"grad_norm": 0.6189885139465332,
"learning_rate": 2.891640237421611e-07,
"loss": 0.6001750826835632,
"step": 2075
},
{
"epoch": 2.6929440389294403,
"grad_norm": 0.6118842959403992,
"learning_rate": 2.8677265629126373e-07,
"loss": 0.5822157263755798,
"step": 2076
},
{
"epoch": 2.6942416869424166,
"grad_norm": 0.6505289673805237,
"learning_rate": 2.8439092619734655e-07,
"loss": 0.6047310829162598,
"step": 2077
},
{
"epoch": 2.6955393349553933,
"grad_norm": 0.6261717081069946,
"learning_rate": 2.820188383304451e-07,
"loss": 0.5709232687950134,
"step": 2078
},
{
"epoch": 2.69683698296837,
"grad_norm": 0.591399610042572,
"learning_rate": 2.7965639754087893e-07,
"loss": 0.5760236382484436,
"step": 2079
},
{
"epoch": 2.6981346309813463,
"grad_norm": 0.6267626881599426,
"learning_rate": 2.7730360865923954e-07,
"loss": 0.627373218536377,
"step": 2080
},
{
"epoch": 2.6994322789943226,
"grad_norm": 0.5880517959594727,
"learning_rate": 2.7496047649638757e-07,
"loss": 0.556127667427063,
"step": 2081
},
{
"epoch": 2.7007299270072993,
"grad_norm": 0.6221486926078796,
"learning_rate": 2.726270058434327e-07,
"loss": 0.6388289332389832,
"step": 2082
},
{
"epoch": 2.702027575020276,
"grad_norm": 0.6296391487121582,
"learning_rate": 2.703032014717333e-07,
"loss": 0.6471085548400879,
"step": 2083
},
{
"epoch": 2.7033252230332523,
"grad_norm": 0.6119943261146545,
"learning_rate": 2.6798906813288117e-07,
"loss": 0.587184488773346,
"step": 2084
},
{
"epoch": 2.7046228710462286,
"grad_norm": 0.5858760476112366,
"learning_rate": 2.656846105586919e-07,
"loss": 0.6001055836677551,
"step": 2085
},
{
"epoch": 2.7059205190592053,
"grad_norm": 0.6214133501052856,
"learning_rate": 2.633898334611995e-07,
"loss": 0.6275671720504761,
"step": 2086
},
{
"epoch": 2.7072181670721815,
"grad_norm": 0.5908603668212891,
"learning_rate": 2.6110474153264176e-07,
"loss": 0.5731199979782104,
"step": 2087
},
{
"epoch": 2.7085158150851583,
"grad_norm": 0.5500771403312683,
"learning_rate": 2.588293394454533e-07,
"loss": 0.5535600781440735,
"step": 2088
},
{
"epoch": 2.7098134630981345,
"grad_norm": 0.6212435364723206,
"learning_rate": 2.565636318522552e-07,
"loss": 0.6325974464416504,
"step": 2089
},
{
"epoch": 2.7111111111111112,
"grad_norm": 0.5896530747413635,
"learning_rate": 2.543076233858466e-07,
"loss": 0.564407229423523,
"step": 2090
},
{
"epoch": 2.7124087591240875,
"grad_norm": 0.6151485443115234,
"learning_rate": 2.5206131865919303e-07,
"loss": 0.5890393257141113,
"step": 2091
},
{
"epoch": 2.7137064071370642,
"grad_norm": 0.5984410643577576,
"learning_rate": 2.4982472226542045e-07,
"loss": 0.5423193573951721,
"step": 2092
},
{
"epoch": 2.7150040551500405,
"grad_norm": 0.6220104694366455,
"learning_rate": 2.475978387778e-07,
"loss": 0.5741702318191528,
"step": 2093
},
{
"epoch": 2.7163017031630172,
"grad_norm": 0.64532470703125,
"learning_rate": 2.453806727497482e-07,
"loss": 0.578140914440155,
"step": 2094
},
{
"epoch": 2.7175993511759935,
"grad_norm": 0.6362125277519226,
"learning_rate": 2.431732287148053e-07,
"loss": 0.6103841066360474,
"step": 2095
},
{
"epoch": 2.7188969991889698,
"grad_norm": 0.6365206837654114,
"learning_rate": 2.409755111866369e-07,
"loss": 0.6380729079246521,
"step": 2096
},
{
"epoch": 2.7201946472019465,
"grad_norm": 0.6440710425376892,
"learning_rate": 2.387875246590193e-07,
"loss": 0.5572207570075989,
"step": 2097
},
{
"epoch": 2.721492295214923,
"grad_norm": 0.6295807361602783,
"learning_rate": 2.3660927360583064e-07,
"loss": 0.6024692058563232,
"step": 2098
},
{
"epoch": 2.7227899432278995,
"grad_norm": 0.5711405873298645,
"learning_rate": 2.3444076248104297e-07,
"loss": 0.5038433074951172,
"step": 2099
},
{
"epoch": 2.7240875912408757,
"grad_norm": 0.7995308637619019,
"learning_rate": 2.322819957187139e-07,
"loss": 0.6232460737228394,
"step": 2100
},
{
"epoch": 2.7253852392538525,
"grad_norm": 0.5909203886985779,
"learning_rate": 2.3013297773297306e-07,
"loss": 0.5349663496017456,
"step": 2101
},
{
"epoch": 2.7266828872668287,
"grad_norm": 0.6373469829559326,
"learning_rate": 2.279937129180204e-07,
"loss": 0.5974945425987244,
"step": 2102
},
{
"epoch": 2.7279805352798054,
"grad_norm": 0.6128799915313721,
"learning_rate": 2.2586420564810863e-07,
"loss": 0.5850982069969177,
"step": 2103
},
{
"epoch": 2.7292781832927817,
"grad_norm": 0.6667084097862244,
"learning_rate": 2.2374446027754405e-07,
"loss": 0.5952577590942383,
"step": 2104
},
{
"epoch": 2.7305758313057584,
"grad_norm": 0.6103115081787109,
"learning_rate": 2.2163448114066677e-07,
"loss": 0.5764719247817993,
"step": 2105
},
{
"epoch": 2.7318734793187347,
"grad_norm": 0.5843047499656677,
"learning_rate": 2.1953427255185122e-07,
"loss": 0.5831491947174072,
"step": 2106
},
{
"epoch": 2.7331711273317114,
"grad_norm": 0.6300417184829712,
"learning_rate": 2.174438388054928e-07,
"loss": 0.5893597602844238,
"step": 2107
},
{
"epoch": 2.7344687753446877,
"grad_norm": 0.601433515548706,
"learning_rate": 2.1536318417599844e-07,
"loss": 0.5604301691055298,
"step": 2108
},
{
"epoch": 2.7357664233576644,
"grad_norm": 0.6220826506614685,
"learning_rate": 2.1329231291778108e-07,
"loss": 0.6189798712730408,
"step": 2109
},
{
"epoch": 2.7370640713706407,
"grad_norm": 0.5895432233810425,
"learning_rate": 2.1123122926524853e-07,
"loss": 0.5561822652816772,
"step": 2110
},
{
"epoch": 2.738361719383617,
"grad_norm": 0.8975700736045837,
"learning_rate": 2.0917993743279297e-07,
"loss": 0.552111029624939,
"step": 2111
},
{
"epoch": 2.7396593673965937,
"grad_norm": 0.5886269211769104,
"learning_rate": 2.0713844161479035e-07,
"loss": 0.5910426378250122,
"step": 2112
},
{
"epoch": 2.7409570154095704,
"grad_norm": 0.5890198945999146,
"learning_rate": 2.0510674598558045e-07,
"loss": 0.5544984936714172,
"step": 2113
},
{
"epoch": 2.7422546634225466,
"grad_norm": 0.6140372157096863,
"learning_rate": 2.0308485469946736e-07,
"loss": 0.6121523380279541,
"step": 2114
},
{
"epoch": 2.743552311435523,
"grad_norm": 0.5979804396629333,
"learning_rate": 2.010727718907074e-07,
"loss": 0.5417115688323975,
"step": 2115
},
{
"epoch": 2.7448499594484996,
"grad_norm": 0.6019598841667175,
"learning_rate": 1.9907050167349894e-07,
"loss": 0.5624793171882629,
"step": 2116
},
{
"epoch": 2.7461476074614763,
"grad_norm": 0.6011685132980347,
"learning_rate": 1.9707804814198096e-07,
"loss": 0.5510683655738831,
"step": 2117
},
{
"epoch": 2.7474452554744526,
"grad_norm": 0.5924180746078491,
"learning_rate": 1.9509541537021392e-07,
"loss": 0.5276060104370117,
"step": 2118
},
{
"epoch": 2.748742903487429,
"grad_norm": 0.6053572297096252,
"learning_rate": 1.9312260741218114e-07,
"loss": 0.5551567673683167,
"step": 2119
},
{
"epoch": 2.7500405515004056,
"grad_norm": 0.620968222618103,
"learning_rate": 1.911596283017747e-07,
"loss": 0.5851413011550903,
"step": 2120
},
{
"epoch": 2.751338199513382,
"grad_norm": 0.626413881778717,
"learning_rate": 1.8920648205279113e-07,
"loss": 0.5591974258422852,
"step": 2121
},
{
"epoch": 2.7526358475263586,
"grad_norm": 0.615846574306488,
"learning_rate": 1.8726317265891968e-07,
"loss": 0.5918228626251221,
"step": 2122
},
{
"epoch": 2.753933495539335,
"grad_norm": 0.645077645778656,
"learning_rate": 1.8532970409373684e-07,
"loss": 0.5714014172554016,
"step": 2123
},
{
"epoch": 2.7552311435523116,
"grad_norm": 0.6882081031799316,
"learning_rate": 1.8340608031069462e-07,
"loss": 0.6177914142608643,
"step": 2124
},
{
"epoch": 2.756528791565288,
"grad_norm": 0.6870415806770325,
"learning_rate": 1.8149230524311944e-07,
"loss": 0.6026558876037598,
"step": 2125
},
{
"epoch": 2.757826439578264,
"grad_norm": 0.5801068544387817,
"learning_rate": 1.7958838280419387e-07,
"loss": 0.5492424964904785,
"step": 2126
},
{
"epoch": 2.759124087591241,
"grad_norm": 0.6277424693107605,
"learning_rate": 1.7769431688696048e-07,
"loss": 0.5704351663589478,
"step": 2127
},
{
"epoch": 2.7604217356042176,
"grad_norm": 0.6131430864334106,
"learning_rate": 1.7581011136430238e-07,
"loss": 0.6227852702140808,
"step": 2128
},
{
"epoch": 2.761719383617194,
"grad_norm": 0.6621940732002258,
"learning_rate": 1.739357700889438e-07,
"loss": 0.5971069931983948,
"step": 2129
},
{
"epoch": 2.76301703163017,
"grad_norm": 0.6566265225410461,
"learning_rate": 1.720712968934385e-07,
"loss": 0.6617914438247681,
"step": 2130
},
{
"epoch": 2.764314679643147,
"grad_norm": 0.6165506839752197,
"learning_rate": 1.7021669559016184e-07,
"loss": 0.5680196285247803,
"step": 2131
},
{
"epoch": 2.7656123276561235,
"grad_norm": 0.6046646237373352,
"learning_rate": 1.6837196997130434e-07,
"loss": 0.605772078037262,
"step": 2132
},
{
"epoch": 2.7669099756691,
"grad_norm": 0.6838919520378113,
"learning_rate": 1.6653712380886366e-07,
"loss": 0.5754232406616211,
"step": 2133
},
{
"epoch": 2.768207623682076,
"grad_norm": 0.6096740365028381,
"learning_rate": 1.6471216085463372e-07,
"loss": 0.5173358917236328,
"step": 2134
},
{
"epoch": 2.769505271695053,
"grad_norm": 0.6066602468490601,
"learning_rate": 1.6289708484020395e-07,
"loss": 0.5950397253036499,
"step": 2135
},
{
"epoch": 2.770802919708029,
"grad_norm": 0.609034538269043,
"learning_rate": 1.6109189947694448e-07,
"loss": 0.5427603721618652,
"step": 2136
},
{
"epoch": 2.7721005677210058,
"grad_norm": 0.6451703906059265,
"learning_rate": 1.5929660845600215e-07,
"loss": 0.6046600341796875,
"step": 2137
},
{
"epoch": 2.773398215733982,
"grad_norm": 0.5977014899253845,
"learning_rate": 1.575112154482933e-07,
"loss": 0.5849440693855286,
"step": 2138
},
{
"epoch": 2.7746958637469588,
"grad_norm": 0.6242566108703613,
"learning_rate": 1.557357241044949e-07,
"loss": 0.6496338844299316,
"step": 2139
},
{
"epoch": 2.775993511759935,
"grad_norm": 0.5988749265670776,
"learning_rate": 1.539701380550368e-07,
"loss": 0.5334508419036865,
"step": 2140
},
{
"epoch": 2.7772911597729117,
"grad_norm": 0.7054303288459778,
"learning_rate": 1.5221446091009618e-07,
"loss": 0.4878901541233063,
"step": 2141
},
{
"epoch": 2.778588807785888,
"grad_norm": 0.6645851731300354,
"learning_rate": 1.504686962595875e-07,
"loss": 0.6245031356811523,
"step": 2142
},
{
"epoch": 2.7798864557988647,
"grad_norm": 0.6102975606918335,
"learning_rate": 1.4873284767315864e-07,
"loss": 0.5180703997612,
"step": 2143
},
{
"epoch": 2.781184103811841,
"grad_norm": 0.6466278433799744,
"learning_rate": 1.4700691870017991e-07,
"loss": 0.5804831981658936,
"step": 2144
},
{
"epoch": 2.7824817518248173,
"grad_norm": 0.639724612236023,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.6196957230567932,
"step": 2145
},
{
"epoch": 2.783779399837794,
"grad_norm": 0.6038338541984558,
"learning_rate": 1.435848336906359e-07,
"loss": 0.5739912986755371,
"step": 2146
},
{
"epoch": 2.7850770478507707,
"grad_norm": 0.6094257831573486,
"learning_rate": 1.418886846513673e-07,
"loss": 0.6085304021835327,
"step": 2147
},
{
"epoch": 2.786374695863747,
"grad_norm": 0.6370331048965454,
"learning_rate": 1.4020246922013093e-07,
"loss": 0.572968065738678,
"step": 2148
},
{
"epoch": 2.7876723438767232,
"grad_norm": 0.5946716666221619,
"learning_rate": 1.3852619084480933e-07,
"loss": 0.5418939590454102,
"step": 2149
},
{
"epoch": 2.7889699918897,
"grad_norm": 0.6075360774993896,
"learning_rate": 1.3685985295296798e-07,
"loss": 0.5994930267333984,
"step": 2150
},
{
"epoch": 2.7902676399026762,
"grad_norm": 0.6279016733169556,
"learning_rate": 1.3520345895184583e-07,
"loss": 0.5570014715194702,
"step": 2151
},
{
"epoch": 2.791565287915653,
"grad_norm": 0.6306800246238708,
"learning_rate": 1.3355701222835026e-07,
"loss": 0.5708903074264526,
"step": 2152
},
{
"epoch": 2.792862935928629,
"grad_norm": 0.6170070171356201,
"learning_rate": 1.3192051614904722e-07,
"loss": 0.550320029258728,
"step": 2153
},
{
"epoch": 2.794160583941606,
"grad_norm": 0.6288532018661499,
"learning_rate": 1.302939740601572e-07,
"loss": 0.613933801651001,
"step": 2154
},
{
"epoch": 2.795458231954582,
"grad_norm": 0.6111281514167786,
"learning_rate": 1.2867738928754703e-07,
"loss": 0.5617604851722717,
"step": 2155
},
{
"epoch": 2.796755879967559,
"grad_norm": 0.9522826075553894,
"learning_rate": 1.2707076513672423e-07,
"loss": 0.5882472395896912,
"step": 2156
},
{
"epoch": 2.798053527980535,
"grad_norm": 0.6296880841255188,
"learning_rate": 1.2547410489282708e-07,
"loss": 0.559617280960083,
"step": 2157
},
{
"epoch": 2.799351175993512,
"grad_norm": 0.6598941087722778,
"learning_rate": 1.2388741182062348e-07,
"loss": 0.5574393272399902,
"step": 2158
},
{
"epoch": 2.800648824006488,
"grad_norm": 0.5911178588867188,
"learning_rate": 1.2231068916449705e-07,
"loss": 0.5624610185623169,
"step": 2159
},
{
"epoch": 2.8019464720194645,
"grad_norm": 0.6504255533218384,
"learning_rate": 1.2074394014844782e-07,
"loss": 0.6260690689086914,
"step": 2160
},
{
"epoch": 2.803244120032441,
"grad_norm": 0.6211426258087158,
"learning_rate": 1.1918716797608087e-07,
"loss": 0.6100113391876221,
"step": 2161
},
{
"epoch": 2.804541768045418,
"grad_norm": 0.6233659386634827,
"learning_rate": 1.1764037583060162e-07,
"loss": 0.5747858285903931,
"step": 2162
},
{
"epoch": 2.805839416058394,
"grad_norm": 0.6078013777732849,
"learning_rate": 1.1610356687480728e-07,
"loss": 0.5918527841567993,
"step": 2163
},
{
"epoch": 2.8071370640713704,
"grad_norm": 0.6079197525978088,
"learning_rate": 1.1457674425108478e-07,
"loss": 0.5714898109436035,
"step": 2164
},
{
"epoch": 2.808434712084347,
"grad_norm": 0.5850006937980652,
"learning_rate": 1.1305991108139847e-07,
"loss": 0.5996066927909851,
"step": 2165
},
{
"epoch": 2.809732360097324,
"grad_norm": 0.6206707954406738,
"learning_rate": 1.1155307046728958e-07,
"loss": 0.55565345287323,
"step": 2166
},
{
"epoch": 2.8110300081103,
"grad_norm": 0.6294933557510376,
"learning_rate": 1.1005622548986406e-07,
"loss": 0.5798709392547607,
"step": 2167
},
{
"epoch": 2.8123276561232764,
"grad_norm": 0.6298512816429138,
"learning_rate": 1.0856937920979305e-07,
"loss": 0.5979269742965698,
"step": 2168
},
{
"epoch": 2.813625304136253,
"grad_norm": 0.6053575277328491,
"learning_rate": 1.0709253466729963e-07,
"loss": 0.5668598413467407,
"step": 2169
},
{
"epoch": 2.8149229521492294,
"grad_norm": 0.6343475580215454,
"learning_rate": 1.0562569488215712e-07,
"loss": 0.6248285174369812,
"step": 2170
},
{
"epoch": 2.816220600162206,
"grad_norm": 0.6348695755004883,
"learning_rate": 1.0416886285368188e-07,
"loss": 0.5982720851898193,
"step": 2171
},
{
"epoch": 2.8175182481751824,
"grad_norm": 0.6075454354286194,
"learning_rate": 1.0272204156072663e-07,
"loss": 0.580233097076416,
"step": 2172
},
{
"epoch": 2.818815896188159,
"grad_norm": 0.6037595272064209,
"learning_rate": 1.012852339616749e-07,
"loss": 0.549045205116272,
"step": 2173
},
{
"epoch": 2.8201135442011354,
"grad_norm": 0.6013658046722412,
"learning_rate": 9.985844299443437e-08,
"loss": 0.5709958672523499,
"step": 2174
},
{
"epoch": 2.821411192214112,
"grad_norm": 0.6192932724952698,
"learning_rate": 9.844167157643191e-08,
"loss": 0.5936025381088257,
"step": 2175
},
{
"epoch": 2.8227088402270883,
"grad_norm": 0.6013957858085632,
"learning_rate": 9.703492260460578e-08,
"loss": 0.5784536600112915,
"step": 2176
},
{
"epoch": 2.824006488240065,
"grad_norm": 0.6452348232269287,
"learning_rate": 9.563819895540172e-08,
"loss": 0.6597691774368286,
"step": 2177
},
{
"epoch": 2.8253041362530413,
"grad_norm": 0.6121287941932678,
"learning_rate": 9.42515034847663e-08,
"loss": 0.6041057705879211,
"step": 2178
},
{
"epoch": 2.8266017842660176,
"grad_norm": 0.6265618801116943,
"learning_rate": 9.287483902814087e-08,
"loss": 0.5931543707847595,
"step": 2179
},
{
"epoch": 2.8278994322789943,
"grad_norm": 0.6284413933753967,
"learning_rate": 9.150820840045483e-08,
"loss": 0.5969519019126892,
"step": 2180
},
{
"epoch": 2.829197080291971,
"grad_norm": 0.6021496057510376,
"learning_rate": 9.015161439612396e-08,
"loss": 0.6106084585189819,
"step": 2181
},
{
"epoch": 2.8304947283049473,
"grad_norm": 0.6177151203155518,
"learning_rate": 8.880505978903719e-08,
"loss": 0.6132292151451111,
"step": 2182
},
{
"epoch": 2.8317923763179236,
"grad_norm": 0.6375380754470825,
"learning_rate": 8.746854733255982e-08,
"loss": 0.5775139927864075,
"step": 2183
},
{
"epoch": 2.8330900243309003,
"grad_norm": 0.623674750328064,
"learning_rate": 8.614207975952083e-08,
"loss": 0.5772640705108643,
"step": 2184
},
{
"epoch": 2.8343876723438766,
"grad_norm": 2.0252397060394287,
"learning_rate": 8.482565978221002e-08,
"loss": 0.6038268804550171,
"step": 2185
},
{
"epoch": 2.8356853203568533,
"grad_norm": 0.6209124326705933,
"learning_rate": 8.351929009237425e-08,
"loss": 0.5768431425094604,
"step": 2186
},
{
"epoch": 2.8369829683698295,
"grad_norm": 0.6141339540481567,
"learning_rate": 8.222297336120844e-08,
"loss": 0.6076856851577759,
"step": 2187
},
{
"epoch": 2.8382806163828063,
"grad_norm": 0.615900456905365,
"learning_rate": 8.093671223935118e-08,
"loss": 0.5514330267906189,
"step": 2188
},
{
"epoch": 2.8395782643957825,
"grad_norm": 0.6074119210243225,
"learning_rate": 7.966050935688252e-08,
"loss": 0.5663487911224365,
"step": 2189
},
{
"epoch": 2.8408759124087593,
"grad_norm": 0.6119362711906433,
"learning_rate": 7.839436732331285e-08,
"loss": 0.5301929712295532,
"step": 2190
},
{
"epoch": 2.8421735604217355,
"grad_norm": 0.6157346963882446,
"learning_rate": 7.7138288727584e-08,
"loss": 0.5847445130348206,
"step": 2191
},
{
"epoch": 2.8434712084347122,
"grad_norm": 0.8324165940284729,
"learning_rate": 7.589227613805705e-08,
"loss": 0.6258946061134338,
"step": 2192
},
{
"epoch": 2.8447688564476885,
"grad_norm": 0.6051367521286011,
"learning_rate": 7.465633210251344e-08,
"loss": 0.6049424409866333,
"step": 2193
},
{
"epoch": 2.846066504460665,
"grad_norm": 0.6111598610877991,
"learning_rate": 7.343045914814495e-08,
"loss": 0.615462601184845,
"step": 2194
},
{
"epoch": 2.8473641524736415,
"grad_norm": 0.6303842663764954,
"learning_rate": 7.221465978155262e-08,
"loss": 0.5582486987113953,
"step": 2195
},
{
"epoch": 2.848661800486618,
"grad_norm": 0.6294355392456055,
"learning_rate": 7.10089364887373e-08,
"loss": 0.5927014946937561,
"step": 2196
},
{
"epoch": 2.8499594484995945,
"grad_norm": 0.6469996571540833,
"learning_rate": 6.981329173509909e-08,
"loss": 0.639467179775238,
"step": 2197
},
{
"epoch": 2.8512570965125708,
"grad_norm": 0.5986980199813843,
"learning_rate": 6.862772796542794e-08,
"loss": 0.6210333704948425,
"step": 2198
},
{
"epoch": 2.8525547445255475,
"grad_norm": 0.6324379444122314,
"learning_rate": 6.745224760390246e-08,
"loss": 0.5866251587867737,
"step": 2199
},
{
"epoch": 2.853852392538524,
"grad_norm": 0.6159996390342712,
"learning_rate": 6.628685305408166e-08,
"loss": 0.5464287996292114,
"step": 2200
},
{
"epoch": 2.8551500405515005,
"grad_norm": 0.6313933730125427,
"learning_rate": 6.513154669890221e-08,
"loss": 0.5239887237548828,
"step": 2201
},
{
"epoch": 2.8564476885644767,
"grad_norm": 0.6151242852210999,
"learning_rate": 6.398633090067497e-08,
"loss": 0.5513571500778198,
"step": 2202
},
{
"epoch": 2.8577453365774534,
"grad_norm": 0.6272878646850586,
"learning_rate": 6.285120800107402e-08,
"loss": 0.5711073875427246,
"step": 2203
},
{
"epoch": 2.8590429845904297,
"grad_norm": 0.6284655928611755,
"learning_rate": 6.172618032114108e-08,
"loss": 0.5585539937019348,
"step": 2204
},
{
"epoch": 2.8603406326034064,
"grad_norm": 0.6207369565963745,
"learning_rate": 6.061125016127045e-08,
"loss": 0.6215085983276367,
"step": 2205
},
{
"epoch": 2.8616382806163827,
"grad_norm": 0.5943953394889832,
"learning_rate": 5.950641980121352e-08,
"loss": 0.5761866569519043,
"step": 2206
},
{
"epoch": 2.8629359286293594,
"grad_norm": 0.6034578680992126,
"learning_rate": 5.84116915000682e-08,
"loss": 0.5912197828292847,
"step": 2207
},
{
"epoch": 2.8642335766423357,
"grad_norm": 0.612192690372467,
"learning_rate": 5.732706749627726e-08,
"loss": 0.5836058855056763,
"step": 2208
},
{
"epoch": 2.8655312246553124,
"grad_norm": 0.5961986780166626,
"learning_rate": 5.6252550007621645e-08,
"loss": 0.6387939453125,
"step": 2209
},
{
"epoch": 2.8668288726682887,
"grad_norm": 0.6482071876525879,
"learning_rate": 5.518814123121885e-08,
"loss": 0.5909046530723572,
"step": 2210
},
{
"epoch": 2.8681265206812654,
"grad_norm": 0.6003962755203247,
"learning_rate": 5.413384334351346e-08,
"loss": 0.5739219188690186,
"step": 2211
},
{
"epoch": 2.8694241686942417,
"grad_norm": 0.6164109110832214,
"learning_rate": 5.308965850027992e-08,
"loss": 0.5988886952400208,
"step": 2212
},
{
"epoch": 2.870721816707218,
"grad_norm": 0.6417747139930725,
"learning_rate": 5.205558883661033e-08,
"loss": 0.6298974752426147,
"step": 2213
},
{
"epoch": 2.8720194647201946,
"grad_norm": 0.6133490204811096,
"learning_rate": 5.103163646691611e-08,
"loss": 0.584977388381958,
"step": 2214
},
{
"epoch": 2.8733171127331714,
"grad_norm": 0.611873984336853,
"learning_rate": 5.00178034849208e-08,
"loss": 0.5997759103775024,
"step": 2215
},
{
"epoch": 2.8746147607461476,
"grad_norm": 0.5938137173652649,
"learning_rate": 4.9014091963655584e-08,
"loss": 0.5509130954742432,
"step": 2216
},
{
"epoch": 2.875912408759124,
"grad_norm": 0.6484394073486328,
"learning_rate": 4.802050395545765e-08,
"loss": 0.6474854946136475,
"step": 2217
},
{
"epoch": 2.8772100567721006,
"grad_norm": 0.619995653629303,
"learning_rate": 4.703704149196187e-08,
"loss": 0.5942093133926392,
"step": 2218
},
{
"epoch": 2.878507704785077,
"grad_norm": 0.6322592496871948,
"learning_rate": 4.6063706584100196e-08,
"loss": 0.5504230856895447,
"step": 2219
},
{
"epoch": 2.8798053527980536,
"grad_norm": 0.6172313094139099,
"learning_rate": 4.5100501222097304e-08,
"loss": 0.677121639251709,
"step": 2220
},
{
"epoch": 2.88110300081103,
"grad_norm": 0.5940432548522949,
"learning_rate": 4.414742737546274e-08,
"loss": 0.593209981918335,
"step": 2221
},
{
"epoch": 2.8824006488240066,
"grad_norm": 0.8577704429626465,
"learning_rate": 4.320448699299262e-08,
"loss": 0.705782949924469,
"step": 2222
},
{
"epoch": 2.883698296836983,
"grad_norm": 0.6182291507720947,
"learning_rate": 4.227168200276077e-08,
"loss": 0.569422721862793,
"step": 2223
},
{
"epoch": 2.8849959448499596,
"grad_norm": 0.6797721982002258,
"learning_rate": 4.134901431211702e-08,
"loss": 0.6029517650604248,
"step": 2224
},
{
"epoch": 2.886293592862936,
"grad_norm": 0.5963630676269531,
"learning_rate": 4.043648580768389e-08,
"loss": 0.5859914422035217,
"step": 2225
},
{
"epoch": 2.8875912408759126,
"grad_norm": 0.5913455486297607,
"learning_rate": 3.953409835535049e-08,
"loss": 0.5406662225723267,
"step": 2226
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.6106013655662537,
"learning_rate": 3.8641853800271414e-08,
"loss": 0.5755677223205566,
"step": 2227
},
{
"epoch": 2.890186536901865,
"grad_norm": 0.6029745936393738,
"learning_rate": 3.77597539668606e-08,
"loss": 0.5609725713729858,
"step": 2228
},
{
"epoch": 2.891484184914842,
"grad_norm": 0.6042892336845398,
"learning_rate": 3.688780065878916e-08,
"loss": 0.5605003237724304,
"step": 2229
},
{
"epoch": 2.8927818329278185,
"grad_norm": 0.5953066945075989,
"learning_rate": 3.602599565898091e-08,
"loss": 0.5514798164367676,
"step": 2230
},
{
"epoch": 2.894079480940795,
"grad_norm": 0.5983406901359558,
"learning_rate": 3.517434072960901e-08,
"loss": 0.6183291077613831,
"step": 2231
},
{
"epoch": 2.895377128953771,
"grad_norm": 0.5932295918464661,
"learning_rate": 3.433283761209161e-08,
"loss": 0.6106539964675903,
"step": 2232
},
{
"epoch": 2.896674776966748,
"grad_norm": 0.6509292721748352,
"learning_rate": 3.3501488027090635e-08,
"loss": 0.5615214109420776,
"step": 2233
},
{
"epoch": 2.8979724249797245,
"grad_norm": 0.613764762878418,
"learning_rate": 3.268029367450465e-08,
"loss": 0.6054869294166565,
"step": 2234
},
{
"epoch": 2.899270072992701,
"grad_norm": 0.6044638156890869,
"learning_rate": 3.186925623346882e-08,
"loss": 0.5691530704498291,
"step": 2235
},
{
"epoch": 2.900567721005677,
"grad_norm": 0.6060168147087097,
"learning_rate": 3.10683773623488e-08,
"loss": 0.5762636065483093,
"step": 2236
},
{
"epoch": 2.9018653690186538,
"grad_norm": 0.611011803150177,
"learning_rate": 3.0277658698739665e-08,
"loss": 0.5851128101348877,
"step": 2237
},
{
"epoch": 2.90316301703163,
"grad_norm": 0.6304229497909546,
"learning_rate": 2.9497101859460865e-08,
"loss": 0.5497856140136719,
"step": 2238
},
{
"epoch": 2.9044606650446068,
"grad_norm": 0.5783108472824097,
"learning_rate": 2.872670844055403e-08,
"loss": 0.5745448470115662,
"step": 2239
},
{
"epoch": 2.905758313057583,
"grad_norm": 0.609293520450592,
"learning_rate": 2.7966480017277974e-08,
"loss": 0.5522551536560059,
"step": 2240
},
{
"epoch": 2.9070559610705597,
"grad_norm": 0.5960776209831238,
"learning_rate": 2.7216418144107583e-08,
"loss": 0.5907799005508423,
"step": 2241
},
{
"epoch": 2.908353609083536,
"grad_norm": 0.793721079826355,
"learning_rate": 2.6476524354729917e-08,
"loss": 0.55716872215271,
"step": 2242
},
{
"epoch": 2.9096512570965127,
"grad_norm": 0.6245414614677429,
"learning_rate": 2.5746800162040342e-08,
"loss": 0.5835314989089966,
"step": 2243
},
{
"epoch": 2.910948905109489,
"grad_norm": 0.616008996963501,
"learning_rate": 2.5027247058139748e-08,
"loss": 0.594428300857544,
"step": 2244
},
{
"epoch": 2.9122465531224657,
"grad_norm": 0.5911674499511719,
"learning_rate": 2.4317866514332322e-08,
"loss": 0.5509923696517944,
"step": 2245
},
{
"epoch": 2.913544201135442,
"grad_norm": 0.6335274577140808,
"learning_rate": 2.361865998112223e-08,
"loss": 0.6094061732292175,
"step": 2246
},
{
"epoch": 2.9148418491484183,
"grad_norm": 0.6137773990631104,
"learning_rate": 2.2929628888209156e-08,
"loss": 0.6228293180465698,
"step": 2247
},
{
"epoch": 2.916139497161395,
"grad_norm": 0.6228021383285522,
"learning_rate": 2.2250774644487215e-08,
"loss": 0.5877048969268799,
"step": 2248
},
{
"epoch": 2.9174371451743717,
"grad_norm": 0.6152287125587463,
"learning_rate": 2.158209863804217e-08,
"loss": 0.6036567091941833,
"step": 2249
},
{
"epoch": 2.918734793187348,
"grad_norm": 0.6172757744789124,
"learning_rate": 2.0923602236146977e-08,
"loss": 0.5865423083305359,
"step": 2250
},
{
"epoch": 2.9200324412003242,
"grad_norm": 0.6071073412895203,
"learning_rate": 2.0275286785260694e-08,
"loss": 0.583999752998352,
"step": 2251
},
{
"epoch": 2.921330089213301,
"grad_norm": 0.6244159936904907,
"learning_rate": 1.9637153611022365e-08,
"loss": 0.5794707536697388,
"step": 2252
},
{
"epoch": 2.9226277372262772,
"grad_norm": 0.6465387940406799,
"learning_rate": 1.9009204018255456e-08,
"loss": 0.559209942817688,
"step": 2253
},
{
"epoch": 2.923925385239254,
"grad_norm": 0.6284136176109314,
"learning_rate": 1.839143929095566e-08,
"loss": 0.562762975692749,
"step": 2254
},
{
"epoch": 2.92522303325223,
"grad_norm": 0.6393802762031555,
"learning_rate": 1.7783860692296982e-08,
"loss": 0.6002349853515625,
"step": 2255
},
{
"epoch": 2.926520681265207,
"grad_norm": 0.6242037415504456,
"learning_rate": 1.718646946462288e-08,
"loss": 0.593687117099762,
"step": 2256
},
{
"epoch": 2.927818329278183,
"grad_norm": 0.6453087329864502,
"learning_rate": 1.6599266829447902e-08,
"loss": 0.6138840317726135,
"step": 2257
},
{
"epoch": 2.92911597729116,
"grad_norm": 0.632391095161438,
"learning_rate": 1.6022253987452717e-08,
"loss": 0.5360509157180786,
"step": 2258
},
{
"epoch": 2.930413625304136,
"grad_norm": 0.625159740447998,
"learning_rate": 1.5455432118481884e-08,
"loss": 0.6014057397842407,
"step": 2259
},
{
"epoch": 2.931711273317113,
"grad_norm": 0.6160334944725037,
"learning_rate": 1.4898802381543842e-08,
"loss": 0.5864812135696411,
"step": 2260
},
{
"epoch": 2.933008921330089,
"grad_norm": 0.6208499073982239,
"learning_rate": 1.4352365914804822e-08,
"loss": 0.5853984355926514,
"step": 2261
},
{
"epoch": 2.9343065693430654,
"grad_norm": 0.6147589087486267,
"learning_rate": 1.3816123835588835e-08,
"loss": 0.6146311163902283,
"step": 2262
},
{
"epoch": 2.935604217356042,
"grad_norm": 0.6171795129776001,
"learning_rate": 1.3290077240375453e-08,
"loss": 0.5833883285522461,
"step": 2263
},
{
"epoch": 2.936901865369019,
"grad_norm": 0.5844340920448303,
"learning_rate": 1.277422720479704e-08,
"loss": 0.6002391576766968,
"step": 2264
},
{
"epoch": 2.938199513381995,
"grad_norm": 0.6268512606620789,
"learning_rate": 1.2268574783635968e-08,
"loss": 0.6797309517860413,
"step": 2265
},
{
"epoch": 2.9394971613949714,
"grad_norm": 0.5872271656990051,
"learning_rate": 1.1773121010824063e-08,
"loss": 0.5867947936058044,
"step": 2266
},
{
"epoch": 2.940794809407948,
"grad_norm": 0.633165180683136,
"learning_rate": 1.1287866899438171e-08,
"loss": 0.6117358207702637,
"step": 2267
},
{
"epoch": 2.942092457420925,
"grad_norm": 0.5991867184638977,
"learning_rate": 1.081281344170071e-08,
"loss": 0.5292370319366455,
"step": 2268
},
{
"epoch": 2.943390105433901,
"grad_norm": 0.6432121396064758,
"learning_rate": 1.0347961608975221e-08,
"loss": 0.5962504148483276,
"step": 2269
},
{
"epoch": 2.9446877534468774,
"grad_norm": 0.6073801517486572,
"learning_rate": 9.893312351766382e-09,
"loss": 0.6454894542694092,
"step": 2270
},
{
"epoch": 2.945985401459854,
"grad_norm": 0.6156368851661682,
"learning_rate": 9.448866599717221e-09,
"loss": 0.5632429718971252,
"step": 2271
},
{
"epoch": 2.9472830494728304,
"grad_norm": 0.7083485126495361,
"learning_rate": 9.014625261605791e-09,
"loss": 0.5813943147659302,
"step": 2272
},
{
"epoch": 2.948580697485807,
"grad_norm": 0.6162700653076172,
"learning_rate": 8.590589225346834e-09,
"loss": 0.5752675533294678,
"step": 2273
},
{
"epoch": 2.9498783454987834,
"grad_norm": 0.610639750957489,
"learning_rate": 8.17675935798623e-09,
"loss": 0.6433367133140564,
"step": 2274
},
{
"epoch": 2.95117599351176,
"grad_norm": 0.5966771841049194,
"learning_rate": 7.773136505700995e-09,
"loss": 0.532874345779419,
"step": 2275
},
{
"epoch": 2.9524736415247363,
"grad_norm": 0.6585695743560791,
"learning_rate": 7.379721493798176e-09,
"loss": 0.5892356634140015,
"step": 2276
},
{
"epoch": 2.9537712895377126,
"grad_norm": 0.6081703901290894,
"learning_rate": 6.996515126711511e-09,
"loss": 0.5548315048217773,
"step": 2277
},
{
"epoch": 2.9550689375506893,
"grad_norm": 0.6258850693702698,
"learning_rate": 6.623518188001443e-09,
"loss": 0.5927635431289673,
"step": 2278
},
{
"epoch": 2.956366585563666,
"grad_norm": 0.6431419253349304,
"learning_rate": 6.260731440351775e-09,
"loss": 0.6057431101799011,
"step": 2279
},
{
"epoch": 2.9576642335766423,
"grad_norm": 0.621634840965271,
"learning_rate": 5.908155625570233e-09,
"loss": 0.5803443789482117,
"step": 2280
},
{
"epoch": 2.9589618815896186,
"grad_norm": 0.5794631838798523,
"learning_rate": 5.56579146458458e-09,
"loss": 0.5982474088668823,
"step": 2281
},
{
"epoch": 2.9602595296025953,
"grad_norm": 0.5987969040870667,
"learning_rate": 5.233639657443168e-09,
"loss": 0.6081230640411377,
"step": 2282
},
{
"epoch": 2.961557177615572,
"grad_norm": 0.6121331453323364,
"learning_rate": 4.911700883312165e-09,
"loss": 0.5589238405227661,
"step": 2283
},
{
"epoch": 2.9628548256285483,
"grad_norm": 0.6170937418937683,
"learning_rate": 4.599975800475553e-09,
"loss": 0.575406014919281,
"step": 2284
},
{
"epoch": 2.9641524736415246,
"grad_norm": 0.5928655862808228,
"learning_rate": 4.298465046331246e-09,
"loss": 0.588203489780426,
"step": 2285
},
{
"epoch": 2.9654501216545013,
"grad_norm": 0.6178304553031921,
"learning_rate": 4.007169237392749e-09,
"loss": 0.5311431288719177,
"step": 2286
},
{
"epoch": 2.9667477696674776,
"grad_norm": 0.6006078124046326,
"learning_rate": 3.726088969286945e-09,
"loss": 0.5917048454284668,
"step": 2287
},
{
"epoch": 2.9680454176804543,
"grad_norm": 0.6022590398788452,
"learning_rate": 3.4552248167507576e-09,
"loss": 0.5889644026756287,
"step": 2288
},
{
"epoch": 2.9693430656934305,
"grad_norm": 0.5813162922859192,
"learning_rate": 3.1945773336333754e-09,
"loss": 0.5726138353347778,
"step": 2289
},
{
"epoch": 2.9706407137064073,
"grad_norm": 0.6178452372550964,
"learning_rate": 2.9441470528929206e-09,
"loss": 0.6099365949630737,
"step": 2290
},
{
"epoch": 2.9719383617193835,
"grad_norm": 0.6197913289070129,
"learning_rate": 2.703934486595894e-09,
"loss": 0.6363242268562317,
"step": 2291
},
{
"epoch": 2.9732360097323602,
"grad_norm": 0.6046017408370972,
"learning_rate": 2.4739401259160635e-09,
"loss": 0.5827226042747498,
"step": 2292
},
{
"epoch": 2.9745336577453365,
"grad_norm": 0.64341801404953,
"learning_rate": 2.2541644411344653e-09,
"loss": 0.5797464847564697,
"step": 2293
},
{
"epoch": 2.9758313057583132,
"grad_norm": 0.6010720133781433,
"learning_rate": 2.0446078816355186e-09,
"loss": 0.5213384628295898,
"step": 2294
},
{
"epoch": 2.9771289537712895,
"grad_norm": 0.5899950265884399,
"learning_rate": 1.8452708759097993e-09,
"loss": 0.5917242765426636,
"step": 2295
},
{
"epoch": 2.9784266017842658,
"grad_norm": 0.5960827469825745,
"learning_rate": 1.656153831551821e-09,
"loss": 0.5761323571205139,
"step": 2296
},
{
"epoch": 2.9797242497972425,
"grad_norm": 0.641033411026001,
"learning_rate": 1.4772571352567044e-09,
"loss": 0.6058821678161621,
"step": 2297
},
{
"epoch": 2.981021897810219,
"grad_norm": 0.6438850164413452,
"learning_rate": 1.3085811528240622e-09,
"loss": 0.6135293245315552,
"step": 2298
},
{
"epoch": 2.9823195458231955,
"grad_norm": 0.652836263179779,
"learning_rate": 1.1501262291530034e-09,
"loss": 0.6278634667396545,
"step": 2299
},
{
"epoch": 2.9836171938361717,
"grad_norm": 0.6056571006774902,
"learning_rate": 1.0018926882443548e-09,
"loss": 0.6097397208213806,
"step": 2300
},
{
"epoch": 2.9836171938361717,
"eval_loss": 0.6816402673721313,
"eval_runtime": 72.9022,
"eval_samples_per_second": 71.219,
"eval_steps_per_second": 8.902,
"step": 2300
},
{
"epoch": 2.9849148418491485,
"grad_norm": 0.6095858812332153,
"learning_rate": 8.638808331973281e-10,
"loss": 0.5901839733123779,
"step": 2301
},
{
"epoch": 2.986212489862125,
"grad_norm": 0.610837996006012,
"learning_rate": 7.360909462111876e-10,
"loss": 0.6008099913597107,
"step": 2302
},
{
"epoch": 2.9875101378751014,
"grad_norm": 0.6182950735092163,
"learning_rate": 6.185232885846937e-10,
"loss": 0.599170446395874,
"step": 2303
},
{
"epoch": 2.9888077858880777,
"grad_norm": 0.5876109600067139,
"learning_rate": 5.111781007138827e-10,
"loss": 0.5724647045135498,
"step": 2304
},
{
"epoch": 2.9901054339010544,
"grad_norm": 0.6355734467506409,
"learning_rate": 4.1405560209206716e-10,
"loss": 0.5922134518623352,
"step": 2305
},
{
"epoch": 2.9914030819140307,
"grad_norm": 0.613153338432312,
"learning_rate": 3.2715599131039053e-10,
"loss": 0.5836412906646729,
"step": 2306
},
{
"epoch": 2.9927007299270074,
"grad_norm": 0.6345803737640381,
"learning_rate": 2.5047944605616215e-10,
"loss": 0.5756551623344421,
"step": 2307
},
{
"epoch": 2.9939983779399837,
"grad_norm": 0.6199482679367065,
"learning_rate": 1.840261231139673e-10,
"loss": 0.5494982004165649,
"step": 2308
},
{
"epoch": 2.9952960259529604,
"grad_norm": 0.62641441822052,
"learning_rate": 1.2779615836455706e-10,
"loss": 0.6009610295295715,
"step": 2309
},
{
"epoch": 2.9965936739659367,
"grad_norm": 0.6289675831794739,
"learning_rate": 8.17896667826279e-11,
"loss": 0.6343727111816406,
"step": 2310
},
{
"epoch": 2.997891321978913,
"grad_norm": 0.6318255662918091,
"learning_rate": 4.600674244070735e-11,
"loss": 0.5607834458351135,
"step": 2311
},
{
"epoch": 2.9991889699918897,
"grad_norm": 0.6189204454421997,
"learning_rate": 2.04474585052683e-11,
"loss": 0.5686444044113159,
"step": 2312
},
{
"epoch": 3.0,
"grad_norm": 0.7989046573638916,
"learning_rate": 5.11186723950452e-12,
"loss": 0.6789172887802124,
"step": 2313
},
{
"epoch": 3.0,
"step": 2313,
"total_flos": 8.852766725217714e+18,
"train_loss": 0.6584745990833162,
"train_runtime": 19013.667,
"train_samples_per_second": 15.562,
"train_steps_per_second": 0.122
}
],
"logging_steps": 1.0,
"max_steps": 2313,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 230,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.852766725217714e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}