Files
XtraGPT-3B/trainer_state.json
ModelHub XC fcfa06473f 初始化项目,由ModelHub XC社区提供模型
Model: Xtra-Computing/XtraGPT-3B
Source: Original Platform
2026-05-22 21:25:37 +08:00

15434 lines
373 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6490066225165565,
"eval_steps": 500,
"global_step": 22000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012040939193257074,
"grad_norm": 9.316359519958496,
"learning_rate": 3.0102347983142685e-09,
"loss": 0.9861,
"step": 10
},
{
"epoch": 0.002408187838651415,
"grad_norm": 9.306455612182617,
"learning_rate": 6.020469596628537e-09,
"loss": 0.9917,
"step": 20
},
{
"epoch": 0.003612281757977122,
"grad_norm": 10.237154006958008,
"learning_rate": 9.030704394942806e-09,
"loss": 0.9875,
"step": 30
},
{
"epoch": 0.00481637567730283,
"grad_norm": 10.9087553024292,
"learning_rate": 1.2040939193257074e-08,
"loss": 1.0501,
"step": 40
},
{
"epoch": 0.006020469596628537,
"grad_norm": 8.430444717407227,
"learning_rate": 1.5051173991571343e-08,
"loss": 0.9665,
"step": 50
},
{
"epoch": 0.007224563515954244,
"grad_norm": 8.640472412109375,
"learning_rate": 1.8061408789885613e-08,
"loss": 1.0249,
"step": 60
},
{
"epoch": 0.008428657435279952,
"grad_norm": 8.656485557556152,
"learning_rate": 2.107164358819988e-08,
"loss": 0.9942,
"step": 70
},
{
"epoch": 0.00963275135460566,
"grad_norm": 11.827771186828613,
"learning_rate": 2.4081878386514148e-08,
"loss": 1.0359,
"step": 80
},
{
"epoch": 0.010836845273931367,
"grad_norm": 7.204784870147705,
"learning_rate": 2.7092113184828417e-08,
"loss": 0.9532,
"step": 90
},
{
"epoch": 0.012040939193257074,
"grad_norm": 7.546351432800293,
"learning_rate": 3.010234798314269e-08,
"loss": 1.0236,
"step": 100
},
{
"epoch": 0.013245033112582781,
"grad_norm": 10.156384468078613,
"learning_rate": 3.311258278145695e-08,
"loss": 1.0031,
"step": 110
},
{
"epoch": 0.014449127031908489,
"grad_norm": 9.753625869750977,
"learning_rate": 3.6122817579771225e-08,
"loss": 1.016,
"step": 120
},
{
"epoch": 0.015653220951234198,
"grad_norm": 8.019911766052246,
"learning_rate": 3.9133052378085485e-08,
"loss": 0.9972,
"step": 130
},
{
"epoch": 0.016857314870559904,
"grad_norm": 8.1049165725708,
"learning_rate": 4.214328717639976e-08,
"loss": 0.984,
"step": 140
},
{
"epoch": 0.018061408789885613,
"grad_norm": 8.9203462600708,
"learning_rate": 4.5153521974714023e-08,
"loss": 0.9811,
"step": 150
},
{
"epoch": 0.01926550270921132,
"grad_norm": 8.825779914855957,
"learning_rate": 4.8163756773028296e-08,
"loss": 1.0207,
"step": 160
},
{
"epoch": 0.020469596628537028,
"grad_norm": 8.64220142364502,
"learning_rate": 5.117399157134256e-08,
"loss": 0.9971,
"step": 170
},
{
"epoch": 0.021673690547862733,
"grad_norm": 8.120660781860352,
"learning_rate": 5.4184226369656835e-08,
"loss": 0.9606,
"step": 180
},
{
"epoch": 0.022877784467188442,
"grad_norm": 8.181641578674316,
"learning_rate": 5.71944611679711e-08,
"loss": 0.9516,
"step": 190
},
{
"epoch": 0.024081878386514148,
"grad_norm": 7.607439994812012,
"learning_rate": 6.020469596628537e-08,
"loss": 0.9919,
"step": 200
},
{
"epoch": 0.025285972305839857,
"grad_norm": 7.434635162353516,
"learning_rate": 6.321493076459963e-08,
"loss": 0.9991,
"step": 210
},
{
"epoch": 0.026490066225165563,
"grad_norm": 7.569486141204834,
"learning_rate": 6.62251655629139e-08,
"loss": 0.984,
"step": 220
},
{
"epoch": 0.027694160144491272,
"grad_norm": 7.499971389770508,
"learning_rate": 6.923540036122818e-08,
"loss": 0.9598,
"step": 230
},
{
"epoch": 0.028898254063816978,
"grad_norm": 6.992701053619385,
"learning_rate": 7.224563515954245e-08,
"loss": 0.905,
"step": 240
},
{
"epoch": 0.030102347983142687,
"grad_norm": 6.3157877922058105,
"learning_rate": 7.525586995785671e-08,
"loss": 0.9493,
"step": 250
},
{
"epoch": 0.031306441902468396,
"grad_norm": 6.263482570648193,
"learning_rate": 7.826610475617097e-08,
"loss": 0.9501,
"step": 260
},
{
"epoch": 0.0325105358217941,
"grad_norm": 6.178393840789795,
"learning_rate": 8.127633955448524e-08,
"loss": 0.9056,
"step": 270
},
{
"epoch": 0.03371462974111981,
"grad_norm": 4.896974086761475,
"learning_rate": 8.428657435279951e-08,
"loss": 0.8679,
"step": 280
},
{
"epoch": 0.034918723660445516,
"grad_norm": 5.896145820617676,
"learning_rate": 8.729680915111379e-08,
"loss": 0.8658,
"step": 290
},
{
"epoch": 0.036122817579771226,
"grad_norm": 5.6855573654174805,
"learning_rate": 9.030704394942805e-08,
"loss": 0.9227,
"step": 300
},
{
"epoch": 0.03732691149909693,
"grad_norm": 4.907613277435303,
"learning_rate": 9.331727874774232e-08,
"loss": 0.8581,
"step": 310
},
{
"epoch": 0.03853100541842264,
"grad_norm": 6.029637336730957,
"learning_rate": 9.632751354605659e-08,
"loss": 0.8184,
"step": 320
},
{
"epoch": 0.039735099337748346,
"grad_norm": 5.0958333015441895,
"learning_rate": 9.933774834437085e-08,
"loss": 0.8524,
"step": 330
},
{
"epoch": 0.040939193257074055,
"grad_norm": 6.18320369720459,
"learning_rate": 1.0234798314268512e-07,
"loss": 0.8371,
"step": 340
},
{
"epoch": 0.04214328717639976,
"grad_norm": 4.874738693237305,
"learning_rate": 1.0535821794099938e-07,
"loss": 0.8348,
"step": 350
},
{
"epoch": 0.04334738109572547,
"grad_norm": 5.273070812225342,
"learning_rate": 1.0836845273931367e-07,
"loss": 0.8286,
"step": 360
},
{
"epoch": 0.044551475015051176,
"grad_norm": 5.052524089813232,
"learning_rate": 1.1137868753762793e-07,
"loss": 0.7585,
"step": 370
},
{
"epoch": 0.045755568934376885,
"grad_norm": 4.216408729553223,
"learning_rate": 1.143889223359422e-07,
"loss": 0.7869,
"step": 380
},
{
"epoch": 0.04695966285370259,
"grad_norm": 5.456339359283447,
"learning_rate": 1.1739915713425646e-07,
"loss": 0.7699,
"step": 390
},
{
"epoch": 0.048163756773028296,
"grad_norm": 4.809760093688965,
"learning_rate": 1.2040939193257075e-07,
"loss": 0.763,
"step": 400
},
{
"epoch": 0.049367850692354005,
"grad_norm": 4.933152675628662,
"learning_rate": 1.23419626730885e-07,
"loss": 0.7192,
"step": 410
},
{
"epoch": 0.050571944611679714,
"grad_norm": 5.025005340576172,
"learning_rate": 1.2642986152919927e-07,
"loss": 0.7092,
"step": 420
},
{
"epoch": 0.05177603853100542,
"grad_norm": 4.338512420654297,
"learning_rate": 1.2944009632751355e-07,
"loss": 0.7346,
"step": 430
},
{
"epoch": 0.052980132450331126,
"grad_norm": 4.557036399841309,
"learning_rate": 1.324503311258278e-07,
"loss": 0.723,
"step": 440
},
{
"epoch": 0.054184226369656835,
"grad_norm": 4.911799907684326,
"learning_rate": 1.3546056592414207e-07,
"loss": 0.7673,
"step": 450
},
{
"epoch": 0.055388320288982544,
"grad_norm": 4.063588619232178,
"learning_rate": 1.3847080072245636e-07,
"loss": 0.708,
"step": 460
},
{
"epoch": 0.056592414208308246,
"grad_norm": 4.037914752960205,
"learning_rate": 1.4148103552077062e-07,
"loss": 0.7641,
"step": 470
},
{
"epoch": 0.057796508127633955,
"grad_norm": 4.673463344573975,
"learning_rate": 1.444912703190849e-07,
"loss": 0.7196,
"step": 480
},
{
"epoch": 0.059000602046959665,
"grad_norm": 5.096141815185547,
"learning_rate": 1.4750150511739913e-07,
"loss": 0.689,
"step": 490
},
{
"epoch": 0.060204695966285374,
"grad_norm": 4.904088973999023,
"learning_rate": 1.5051173991571342e-07,
"loss": 0.7211,
"step": 500
},
{
"epoch": 0.061408789885611076,
"grad_norm": 5.234721660614014,
"learning_rate": 1.535219747140277e-07,
"loss": 0.7091,
"step": 510
},
{
"epoch": 0.06261288380493679,
"grad_norm": 4.105505466461182,
"learning_rate": 1.5653220951234194e-07,
"loss": 0.7607,
"step": 520
},
{
"epoch": 0.0638169777242625,
"grad_norm": 4.666725158691406,
"learning_rate": 1.5954244431065622e-07,
"loss": 0.7158,
"step": 530
},
{
"epoch": 0.0650210716435882,
"grad_norm": 4.976656436920166,
"learning_rate": 1.6255267910897048e-07,
"loss": 0.7401,
"step": 540
},
{
"epoch": 0.06622516556291391,
"grad_norm": 5.044974327087402,
"learning_rate": 1.6556291390728477e-07,
"loss": 0.6685,
"step": 550
},
{
"epoch": 0.06742925948223961,
"grad_norm": 4.7259368896484375,
"learning_rate": 1.6857314870559903e-07,
"loss": 0.6866,
"step": 560
},
{
"epoch": 0.06863335340156532,
"grad_norm": 5.358945369720459,
"learning_rate": 1.715833835039133e-07,
"loss": 0.7059,
"step": 570
},
{
"epoch": 0.06983744732089103,
"grad_norm": 5.156592845916748,
"learning_rate": 1.7459361830222757e-07,
"loss": 0.7111,
"step": 580
},
{
"epoch": 0.07104154124021674,
"grad_norm": 4.320924282073975,
"learning_rate": 1.7760385310054183e-07,
"loss": 0.6862,
"step": 590
},
{
"epoch": 0.07224563515954245,
"grad_norm": 4.59999418258667,
"learning_rate": 1.806140878988561e-07,
"loss": 0.6893,
"step": 600
},
{
"epoch": 0.07344972907886815,
"grad_norm": 4.4846391677856445,
"learning_rate": 1.8362432269717038e-07,
"loss": 0.7128,
"step": 610
},
{
"epoch": 0.07465382299819386,
"grad_norm": 5.029007911682129,
"learning_rate": 1.8663455749548464e-07,
"loss": 0.7114,
"step": 620
},
{
"epoch": 0.07585791691751957,
"grad_norm": 4.288726806640625,
"learning_rate": 1.896447922937989e-07,
"loss": 0.6848,
"step": 630
},
{
"epoch": 0.07706201083684527,
"grad_norm": 4.063099384307861,
"learning_rate": 1.9265502709211318e-07,
"loss": 0.659,
"step": 640
},
{
"epoch": 0.07826610475617098,
"grad_norm": 4.031120300292969,
"learning_rate": 1.9566526189042744e-07,
"loss": 0.6802,
"step": 650
},
{
"epoch": 0.07947019867549669,
"grad_norm": 5.234511852264404,
"learning_rate": 1.986754966887417e-07,
"loss": 0.6685,
"step": 660
},
{
"epoch": 0.0806742925948224,
"grad_norm": 5.434250831604004,
"learning_rate": 2.01685731487056e-07,
"loss": 0.6762,
"step": 670
},
{
"epoch": 0.08187838651414811,
"grad_norm": 5.326634407043457,
"learning_rate": 2.0469596628537025e-07,
"loss": 0.6951,
"step": 680
},
{
"epoch": 0.08308248043347381,
"grad_norm": 3.630930185317993,
"learning_rate": 2.0770620108368453e-07,
"loss": 0.6445,
"step": 690
},
{
"epoch": 0.08428657435279951,
"grad_norm": 5.273288726806641,
"learning_rate": 2.1071643588199877e-07,
"loss": 0.65,
"step": 700
},
{
"epoch": 0.08549066827212523,
"grad_norm": 4.212562084197998,
"learning_rate": 2.1372667068031305e-07,
"loss": 0.6485,
"step": 710
},
{
"epoch": 0.08669476219145093,
"grad_norm": 4.293779373168945,
"learning_rate": 2.1673690547862734e-07,
"loss": 0.6734,
"step": 720
},
{
"epoch": 0.08789885611077664,
"grad_norm": 4.917520999908447,
"learning_rate": 2.1974714027694157e-07,
"loss": 0.6593,
"step": 730
},
{
"epoch": 0.08910295003010235,
"grad_norm": 4.624716281890869,
"learning_rate": 2.2275737507525586e-07,
"loss": 0.6712,
"step": 740
},
{
"epoch": 0.09030704394942805,
"grad_norm": 5.3648552894592285,
"learning_rate": 2.2576760987357014e-07,
"loss": 0.6481,
"step": 750
},
{
"epoch": 0.09151113786875377,
"grad_norm": 4.328650951385498,
"learning_rate": 2.287778446718844e-07,
"loss": 0.6418,
"step": 760
},
{
"epoch": 0.09271523178807947,
"grad_norm": 4.933085918426514,
"learning_rate": 2.3178807947019866e-07,
"loss": 0.6622,
"step": 770
},
{
"epoch": 0.09391932570740517,
"grad_norm": 4.703038215637207,
"learning_rate": 2.3479831426851292e-07,
"loss": 0.6317,
"step": 780
},
{
"epoch": 0.09512341962673089,
"grad_norm": 4.468968391418457,
"learning_rate": 2.378085490668272e-07,
"loss": 0.6431,
"step": 790
},
{
"epoch": 0.09632751354605659,
"grad_norm": 4.49053430557251,
"learning_rate": 2.408187838651415e-07,
"loss": 0.6212,
"step": 800
},
{
"epoch": 0.0975316074653823,
"grad_norm": 4.350528240203857,
"learning_rate": 2.438290186634557e-07,
"loss": 0.623,
"step": 810
},
{
"epoch": 0.09873570138470801,
"grad_norm": 4.772309303283691,
"learning_rate": 2.4683925346177e-07,
"loss": 0.6268,
"step": 820
},
{
"epoch": 0.09993979530403371,
"grad_norm": 5.437624931335449,
"learning_rate": 2.498494882600843e-07,
"loss": 0.6497,
"step": 830
},
{
"epoch": 0.10114388922335943,
"grad_norm": 4.474155902862549,
"learning_rate": 2.5285972305839853e-07,
"loss": 0.6092,
"step": 840
},
{
"epoch": 0.10234798314268513,
"grad_norm": 4.417370796203613,
"learning_rate": 2.558699578567128e-07,
"loss": 0.6587,
"step": 850
},
{
"epoch": 0.10355207706201083,
"grad_norm": 5.5498456954956055,
"learning_rate": 2.588801926550271e-07,
"loss": 0.6564,
"step": 860
},
{
"epoch": 0.10475617098133655,
"grad_norm": 5.326292514801025,
"learning_rate": 2.6189042745334134e-07,
"loss": 0.6333,
"step": 870
},
{
"epoch": 0.10596026490066225,
"grad_norm": 4.284069538116455,
"learning_rate": 2.649006622516556e-07,
"loss": 0.6325,
"step": 880
},
{
"epoch": 0.10716435881998795,
"grad_norm": 4.672844886779785,
"learning_rate": 2.679108970499699e-07,
"loss": 0.6046,
"step": 890
},
{
"epoch": 0.10836845273931367,
"grad_norm": 4.223860740661621,
"learning_rate": 2.7092113184828414e-07,
"loss": 0.6094,
"step": 900
},
{
"epoch": 0.10957254665863937,
"grad_norm": 4.813838005065918,
"learning_rate": 2.739313666465984e-07,
"loss": 0.6332,
"step": 910
},
{
"epoch": 0.11077664057796509,
"grad_norm": 3.5245296955108643,
"learning_rate": 2.769416014449127e-07,
"loss": 0.6161,
"step": 920
},
{
"epoch": 0.11198073449729079,
"grad_norm": 4.577372074127197,
"learning_rate": 2.7995183624322695e-07,
"loss": 0.6254,
"step": 930
},
{
"epoch": 0.11318482841661649,
"grad_norm": 4.295224666595459,
"learning_rate": 2.8296207104154123e-07,
"loss": 0.6089,
"step": 940
},
{
"epoch": 0.11438892233594221,
"grad_norm": 4.899755477905273,
"learning_rate": 2.8597230583985546e-07,
"loss": 0.6255,
"step": 950
},
{
"epoch": 0.11559301625526791,
"grad_norm": 5.047530651092529,
"learning_rate": 2.889825406381698e-07,
"loss": 0.6273,
"step": 960
},
{
"epoch": 0.11679711017459361,
"grad_norm": 4.75305700302124,
"learning_rate": 2.9199277543648404e-07,
"loss": 0.6277,
"step": 970
},
{
"epoch": 0.11800120409391933,
"grad_norm": 5.476251602172852,
"learning_rate": 2.9500301023479827e-07,
"loss": 0.6121,
"step": 980
},
{
"epoch": 0.11920529801324503,
"grad_norm": 6.20451021194458,
"learning_rate": 2.980132450331126e-07,
"loss": 0.6121,
"step": 990
},
{
"epoch": 0.12040939193257075,
"grad_norm": 4.61058235168457,
"learning_rate": 3.0102347983142684e-07,
"loss": 0.5862,
"step": 1000
},
{
"epoch": 0.12161348585189645,
"grad_norm": 4.537725925445557,
"learning_rate": 3.0403371462974107e-07,
"loss": 0.6186,
"step": 1010
},
{
"epoch": 0.12281757977122215,
"grad_norm": 4.347688674926758,
"learning_rate": 3.070439494280554e-07,
"loss": 0.6127,
"step": 1020
},
{
"epoch": 0.12402167369054787,
"grad_norm": 4.965167045593262,
"learning_rate": 3.1005418422636965e-07,
"loss": 0.6026,
"step": 1030
},
{
"epoch": 0.12522576760987358,
"grad_norm": 4.610491752624512,
"learning_rate": 3.130644190246839e-07,
"loss": 0.6327,
"step": 1040
},
{
"epoch": 0.12642986152919927,
"grad_norm": 5.292304039001465,
"learning_rate": 3.160746538229982e-07,
"loss": 0.5972,
"step": 1050
},
{
"epoch": 0.127633955448525,
"grad_norm": 6.372762680053711,
"learning_rate": 3.1908488862131245e-07,
"loss": 0.6393,
"step": 1060
},
{
"epoch": 0.1288380493678507,
"grad_norm": 5.56066370010376,
"learning_rate": 3.220951234196267e-07,
"loss": 0.6196,
"step": 1070
},
{
"epoch": 0.1300421432871764,
"grad_norm": 4.777896881103516,
"learning_rate": 3.2510535821794097e-07,
"loss": 0.6217,
"step": 1080
},
{
"epoch": 0.1312462372065021,
"grad_norm": 4.9745683670043945,
"learning_rate": 3.2811559301625525e-07,
"loss": 0.6189,
"step": 1090
},
{
"epoch": 0.13245033112582782,
"grad_norm": 3.71576189994812,
"learning_rate": 3.3112582781456954e-07,
"loss": 0.5893,
"step": 1100
},
{
"epoch": 0.1336544250451535,
"grad_norm": 4.458312034606934,
"learning_rate": 3.3413606261288377e-07,
"loss": 0.5951,
"step": 1110
},
{
"epoch": 0.13485851896447923,
"grad_norm": 4.835500240325928,
"learning_rate": 3.3714629741119806e-07,
"loss": 0.5791,
"step": 1120
},
{
"epoch": 0.13606261288380495,
"grad_norm": 4.516515254974365,
"learning_rate": 3.4015653220951235e-07,
"loss": 0.5759,
"step": 1130
},
{
"epoch": 0.13726670680313063,
"grad_norm": 5.564052104949951,
"learning_rate": 3.431667670078266e-07,
"loss": 0.5791,
"step": 1140
},
{
"epoch": 0.13847080072245635,
"grad_norm": 5.586264610290527,
"learning_rate": 3.4617700180614086e-07,
"loss": 0.6142,
"step": 1150
},
{
"epoch": 0.13967489464178207,
"grad_norm": 4.408708572387695,
"learning_rate": 3.4918723660445515e-07,
"loss": 0.617,
"step": 1160
},
{
"epoch": 0.14087898856110775,
"grad_norm": 4.4068403244018555,
"learning_rate": 3.521974714027694e-07,
"loss": 0.6099,
"step": 1170
},
{
"epoch": 0.14208308248043347,
"grad_norm": 3.947399854660034,
"learning_rate": 3.5520770620108367e-07,
"loss": 0.5555,
"step": 1180
},
{
"epoch": 0.1432871763997592,
"grad_norm": 5.264540195465088,
"learning_rate": 3.5821794099939795e-07,
"loss": 0.5556,
"step": 1190
},
{
"epoch": 0.1444912703190849,
"grad_norm": 4.486605644226074,
"learning_rate": 3.612281757977122e-07,
"loss": 0.5997,
"step": 1200
},
{
"epoch": 0.1456953642384106,
"grad_norm": 6.195891857147217,
"learning_rate": 3.642384105960264e-07,
"loss": 0.6104,
"step": 1210
},
{
"epoch": 0.1468994581577363,
"grad_norm": 4.5443572998046875,
"learning_rate": 3.6724864539434076e-07,
"loss": 0.5806,
"step": 1220
},
{
"epoch": 0.14810355207706202,
"grad_norm": 4.380715370178223,
"learning_rate": 3.70258880192655e-07,
"loss": 0.5759,
"step": 1230
},
{
"epoch": 0.1493076459963877,
"grad_norm": 5.033191680908203,
"learning_rate": 3.732691149909693e-07,
"loss": 0.5782,
"step": 1240
},
{
"epoch": 0.15051173991571343,
"grad_norm": 4.244385719299316,
"learning_rate": 3.7627934978928356e-07,
"loss": 0.5658,
"step": 1250
},
{
"epoch": 0.15171583383503914,
"grad_norm": 4.332985877990723,
"learning_rate": 3.792895845875978e-07,
"loss": 0.5702,
"step": 1260
},
{
"epoch": 0.15291992775436483,
"grad_norm": 4.5175628662109375,
"learning_rate": 3.822998193859121e-07,
"loss": 0.5588,
"step": 1270
},
{
"epoch": 0.15412402167369055,
"grad_norm": 4.519990921020508,
"learning_rate": 3.8531005418422637e-07,
"loss": 0.5871,
"step": 1280
},
{
"epoch": 0.15532811559301626,
"grad_norm": 4.500414848327637,
"learning_rate": 3.883202889825406e-07,
"loss": 0.5977,
"step": 1290
},
{
"epoch": 0.15653220951234195,
"grad_norm": 4.714526653289795,
"learning_rate": 3.913305237808549e-07,
"loss": 0.5647,
"step": 1300
},
{
"epoch": 0.15773630343166767,
"grad_norm": 4.869201183319092,
"learning_rate": 3.9434075857916917e-07,
"loss": 0.5816,
"step": 1310
},
{
"epoch": 0.15894039735099338,
"grad_norm": 5.167849540710449,
"learning_rate": 3.973509933774834e-07,
"loss": 0.5633,
"step": 1320
},
{
"epoch": 0.16014449127031907,
"grad_norm": 4.805886745452881,
"learning_rate": 4.003612281757977e-07,
"loss": 0.5858,
"step": 1330
},
{
"epoch": 0.1613485851896448,
"grad_norm": 4.569708824157715,
"learning_rate": 4.03371462974112e-07,
"loss": 0.5729,
"step": 1340
},
{
"epoch": 0.1625526791089705,
"grad_norm": 4.649074554443359,
"learning_rate": 4.0638169777242626e-07,
"loss": 0.5904,
"step": 1350
},
{
"epoch": 0.16375677302829622,
"grad_norm": 4.956695556640625,
"learning_rate": 4.093919325707405e-07,
"loss": 0.5743,
"step": 1360
},
{
"epoch": 0.1649608669476219,
"grad_norm": 5.056834697723389,
"learning_rate": 4.1240216736905473e-07,
"loss": 0.5903,
"step": 1370
},
{
"epoch": 0.16616496086694763,
"grad_norm": 4.751232624053955,
"learning_rate": 4.1541240216736907e-07,
"loss": 0.5697,
"step": 1380
},
{
"epoch": 0.16736905478627334,
"grad_norm": 4.0161027908325195,
"learning_rate": 4.184226369656833e-07,
"loss": 0.5588,
"step": 1390
},
{
"epoch": 0.16857314870559903,
"grad_norm": 4.591194152832031,
"learning_rate": 4.2143287176399753e-07,
"loss": 0.5792,
"step": 1400
},
{
"epoch": 0.16977724262492475,
"grad_norm": 5.218972206115723,
"learning_rate": 4.2444310656231187e-07,
"loss": 0.5793,
"step": 1410
},
{
"epoch": 0.17098133654425046,
"grad_norm": 4.32102108001709,
"learning_rate": 4.274533413606261e-07,
"loss": 0.57,
"step": 1420
},
{
"epoch": 0.17218543046357615,
"grad_norm": 4.359175205230713,
"learning_rate": 4.3046357615894034e-07,
"loss": 0.5675,
"step": 1430
},
{
"epoch": 0.17338952438290187,
"grad_norm": 5.192026615142822,
"learning_rate": 4.334738109572547e-07,
"loss": 0.5668,
"step": 1440
},
{
"epoch": 0.17459361830222758,
"grad_norm": 4.002780914306641,
"learning_rate": 4.364840457555689e-07,
"loss": 0.5787,
"step": 1450
},
{
"epoch": 0.17579771222155327,
"grad_norm": 5.319111347198486,
"learning_rate": 4.3949428055388314e-07,
"loss": 0.5734,
"step": 1460
},
{
"epoch": 0.177001806140879,
"grad_norm": 4.700523376464844,
"learning_rate": 4.425045153521975e-07,
"loss": 0.5754,
"step": 1470
},
{
"epoch": 0.1782059000602047,
"grad_norm": 4.4386372566223145,
"learning_rate": 4.455147501505117e-07,
"loss": 0.5459,
"step": 1480
},
{
"epoch": 0.1794099939795304,
"grad_norm": 4.084826946258545,
"learning_rate": 4.48524984948826e-07,
"loss": 0.5399,
"step": 1490
},
{
"epoch": 0.1806140878988561,
"grad_norm": 4.401342391967773,
"learning_rate": 4.515352197471403e-07,
"loss": 0.573,
"step": 1500
},
{
"epoch": 0.18181818181818182,
"grad_norm": 4.5059685707092285,
"learning_rate": 4.545454545454545e-07,
"loss": 0.5724,
"step": 1510
},
{
"epoch": 0.18302227573750754,
"grad_norm": 5.070437431335449,
"learning_rate": 4.575556893437688e-07,
"loss": 0.5711,
"step": 1520
},
{
"epoch": 0.18422636965683323,
"grad_norm": 4.188956260681152,
"learning_rate": 4.6056592414208304e-07,
"loss": 0.5498,
"step": 1530
},
{
"epoch": 0.18543046357615894,
"grad_norm": 4.391158580780029,
"learning_rate": 4.635761589403973e-07,
"loss": 0.5602,
"step": 1540
},
{
"epoch": 0.18663455749548466,
"grad_norm": 5.272259712219238,
"learning_rate": 4.665863937387116e-07,
"loss": 0.5748,
"step": 1550
},
{
"epoch": 0.18783865141481035,
"grad_norm": 4.982473373413086,
"learning_rate": 4.6959662853702584e-07,
"loss": 0.5584,
"step": 1560
},
{
"epoch": 0.18904274533413606,
"grad_norm": 5.263506889343262,
"learning_rate": 4.7260686333534013e-07,
"loss": 0.5828,
"step": 1570
},
{
"epoch": 0.19024683925346178,
"grad_norm": 4.1373724937438965,
"learning_rate": 4.756170981336544e-07,
"loss": 0.5494,
"step": 1580
},
{
"epoch": 0.19145093317278747,
"grad_norm": 4.439697265625,
"learning_rate": 4.786273329319686e-07,
"loss": 0.5522,
"step": 1590
},
{
"epoch": 0.19265502709211318,
"grad_norm": 4.79713249206543,
"learning_rate": 4.81637567730283e-07,
"loss": 0.5058,
"step": 1600
},
{
"epoch": 0.1938591210114389,
"grad_norm": 3.973453998565674,
"learning_rate": 4.846478025285972e-07,
"loss": 0.5471,
"step": 1610
},
{
"epoch": 0.1950632149307646,
"grad_norm": 4.748741149902344,
"learning_rate": 4.876580373269115e-07,
"loss": 0.5768,
"step": 1620
},
{
"epoch": 0.1962673088500903,
"grad_norm": 5.98441743850708,
"learning_rate": 4.906682721252258e-07,
"loss": 0.5497,
"step": 1630
},
{
"epoch": 0.19747140276941602,
"grad_norm": 5.55325174331665,
"learning_rate": 4.9367850692354e-07,
"loss": 0.5595,
"step": 1640
},
{
"epoch": 0.1986754966887417,
"grad_norm": 5.114386081695557,
"learning_rate": 4.966887417218543e-07,
"loss": 0.5635,
"step": 1650
},
{
"epoch": 0.19987959060806743,
"grad_norm": 4.869389533996582,
"learning_rate": 4.996989765201686e-07,
"loss": 0.5409,
"step": 1660
},
{
"epoch": 0.20108368452739314,
"grad_norm": 4.4507222175598145,
"learning_rate": 5.027092113184828e-07,
"loss": 0.5598,
"step": 1670
},
{
"epoch": 0.20228777844671886,
"grad_norm": 4.574100494384766,
"learning_rate": 5.057194461167971e-07,
"loss": 0.5432,
"step": 1680
},
{
"epoch": 0.20349187236604455,
"grad_norm": 4.581476211547852,
"learning_rate": 5.087296809151114e-07,
"loss": 0.5509,
"step": 1690
},
{
"epoch": 0.20469596628537026,
"grad_norm": 4.631548881530762,
"learning_rate": 5.117399157134256e-07,
"loss": 0.5712,
"step": 1700
},
{
"epoch": 0.20590006020469598,
"grad_norm": 5.006454944610596,
"learning_rate": 5.147501505117399e-07,
"loss": 0.5586,
"step": 1710
},
{
"epoch": 0.20710415412402167,
"grad_norm": 4.4788408279418945,
"learning_rate": 5.177603853100542e-07,
"loss": 0.5543,
"step": 1720
},
{
"epoch": 0.20830824804334738,
"grad_norm": 4.614450931549072,
"learning_rate": 5.207706201083684e-07,
"loss": 0.5677,
"step": 1730
},
{
"epoch": 0.2095123419626731,
"grad_norm": 4.377712249755859,
"learning_rate": 5.237808549066827e-07,
"loss": 0.5399,
"step": 1740
},
{
"epoch": 0.2107164358819988,
"grad_norm": 6.157577991485596,
"learning_rate": 5.26791089704997e-07,
"loss": 0.5288,
"step": 1750
},
{
"epoch": 0.2119205298013245,
"grad_norm": 4.206299781799316,
"learning_rate": 5.298013245033112e-07,
"loss": 0.5308,
"step": 1760
},
{
"epoch": 0.21312462372065022,
"grad_norm": 4.296496868133545,
"learning_rate": 5.328115593016255e-07,
"loss": 0.552,
"step": 1770
},
{
"epoch": 0.2143287176399759,
"grad_norm": 4.474640846252441,
"learning_rate": 5.358217940999398e-07,
"loss": 0.5505,
"step": 1780
},
{
"epoch": 0.21553281155930162,
"grad_norm": 4.762406349182129,
"learning_rate": 5.38832028898254e-07,
"loss": 0.5669,
"step": 1790
},
{
"epoch": 0.21673690547862734,
"grad_norm": 4.40052604675293,
"learning_rate": 5.418422636965683e-07,
"loss": 0.5386,
"step": 1800
},
{
"epoch": 0.21794099939795303,
"grad_norm": 4.364424228668213,
"learning_rate": 5.448524984948826e-07,
"loss": 0.5446,
"step": 1810
},
{
"epoch": 0.21914509331727874,
"grad_norm": 5.686670780181885,
"learning_rate": 5.478627332931969e-07,
"loss": 0.5708,
"step": 1820
},
{
"epoch": 0.22034918723660446,
"grad_norm": 6.244655132293701,
"learning_rate": 5.508729680915111e-07,
"loss": 0.5353,
"step": 1830
},
{
"epoch": 0.22155328115593018,
"grad_norm": 5.4936323165893555,
"learning_rate": 5.538832028898254e-07,
"loss": 0.5486,
"step": 1840
},
{
"epoch": 0.22275737507525586,
"grad_norm": 4.955344200134277,
"learning_rate": 5.568934376881397e-07,
"loss": 0.5142,
"step": 1850
},
{
"epoch": 0.22396146899458158,
"grad_norm": 4.333896636962891,
"learning_rate": 5.599036724864539e-07,
"loss": 0.5432,
"step": 1860
},
{
"epoch": 0.2251655629139073,
"grad_norm": 4.568367958068848,
"learning_rate": 5.629139072847681e-07,
"loss": 0.5351,
"step": 1870
},
{
"epoch": 0.22636965683323299,
"grad_norm": 5.548391342163086,
"learning_rate": 5.659241420830825e-07,
"loss": 0.5053,
"step": 1880
},
{
"epoch": 0.2275737507525587,
"grad_norm": 4.526470184326172,
"learning_rate": 5.689343768813967e-07,
"loss": 0.5494,
"step": 1890
},
{
"epoch": 0.22877784467188442,
"grad_norm": 4.453249454498291,
"learning_rate": 5.719446116797109e-07,
"loss": 0.5397,
"step": 1900
},
{
"epoch": 0.2299819385912101,
"grad_norm": 7.503538131713867,
"learning_rate": 5.749548464780253e-07,
"loss": 0.5232,
"step": 1910
},
{
"epoch": 0.23118603251053582,
"grad_norm": 5.740428924560547,
"learning_rate": 5.779650812763396e-07,
"loss": 0.5426,
"step": 1920
},
{
"epoch": 0.23239012642986154,
"grad_norm": 5.185967445373535,
"learning_rate": 5.809753160746537e-07,
"loss": 0.5277,
"step": 1930
},
{
"epoch": 0.23359422034918723,
"grad_norm": 5.1867547035217285,
"learning_rate": 5.839855508729681e-07,
"loss": 0.5233,
"step": 1940
},
{
"epoch": 0.23479831426851294,
"grad_norm": 4.812213897705078,
"learning_rate": 5.869957856712824e-07,
"loss": 0.535,
"step": 1950
},
{
"epoch": 0.23600240818783866,
"grad_norm": 5.038625240325928,
"learning_rate": 5.900060204695965e-07,
"loss": 0.5365,
"step": 1960
},
{
"epoch": 0.23720650210716435,
"grad_norm": 4.050044536590576,
"learning_rate": 5.930162552679109e-07,
"loss": 0.5145,
"step": 1970
},
{
"epoch": 0.23841059602649006,
"grad_norm": 4.956125736236572,
"learning_rate": 5.960264900662252e-07,
"loss": 0.5141,
"step": 1980
},
{
"epoch": 0.23961468994581578,
"grad_norm": 4.40023136138916,
"learning_rate": 5.990367248645393e-07,
"loss": 0.544,
"step": 1990
},
{
"epoch": 0.2408187838651415,
"grad_norm": 5.268930912017822,
"learning_rate": 6.020469596628537e-07,
"loss": 0.5514,
"step": 2000
},
{
"epoch": 0.24202287778446718,
"grad_norm": 3.9441418647766113,
"learning_rate": 6.05057194461168e-07,
"loss": 0.5368,
"step": 2010
},
{
"epoch": 0.2432269717037929,
"grad_norm": 4.060418605804443,
"learning_rate": 6.080674292594821e-07,
"loss": 0.5228,
"step": 2020
},
{
"epoch": 0.24443106562311862,
"grad_norm": 4.1477861404418945,
"learning_rate": 6.110776640577965e-07,
"loss": 0.5221,
"step": 2030
},
{
"epoch": 0.2456351595424443,
"grad_norm": 5.319125175476074,
"learning_rate": 6.140878988561108e-07,
"loss": 0.5441,
"step": 2040
},
{
"epoch": 0.24683925346177002,
"grad_norm": 4.920033931732178,
"learning_rate": 6.17098133654425e-07,
"loss": 0.5307,
"step": 2050
},
{
"epoch": 0.24804334738109574,
"grad_norm": 5.167773246765137,
"learning_rate": 6.201083684527393e-07,
"loss": 0.5304,
"step": 2060
},
{
"epoch": 0.24924744130042142,
"grad_norm": 5.3018879890441895,
"learning_rate": 6.231186032510536e-07,
"loss": 0.5356,
"step": 2070
},
{
"epoch": 0.25045153521974717,
"grad_norm": 4.822166919708252,
"learning_rate": 6.261288380493678e-07,
"loss": 0.513,
"step": 2080
},
{
"epoch": 0.25165562913907286,
"grad_norm": 4.957582473754883,
"learning_rate": 6.291390728476821e-07,
"loss": 0.5069,
"step": 2090
},
{
"epoch": 0.25285972305839854,
"grad_norm": 6.180065155029297,
"learning_rate": 6.321493076459964e-07,
"loss": 0.5329,
"step": 2100
},
{
"epoch": 0.2540638169777243,
"grad_norm": 5.123517990112305,
"learning_rate": 6.351595424443106e-07,
"loss": 0.5169,
"step": 2110
},
{
"epoch": 0.25526791089705,
"grad_norm": 5.372180938720703,
"learning_rate": 6.381697772426249e-07,
"loss": 0.508,
"step": 2120
},
{
"epoch": 0.25647200481637566,
"grad_norm": 3.907548189163208,
"learning_rate": 6.411800120409392e-07,
"loss": 0.5082,
"step": 2130
},
{
"epoch": 0.2576760987357014,
"grad_norm": 4.107047080993652,
"learning_rate": 6.441902468392534e-07,
"loss": 0.5257,
"step": 2140
},
{
"epoch": 0.2588801926550271,
"grad_norm": 5.055625915527344,
"learning_rate": 6.472004816375677e-07,
"loss": 0.5458,
"step": 2150
},
{
"epoch": 0.2600842865743528,
"grad_norm": 5.573007106781006,
"learning_rate": 6.502107164358819e-07,
"loss": 0.5178,
"step": 2160
},
{
"epoch": 0.26128838049367853,
"grad_norm": 4.955606460571289,
"learning_rate": 6.532209512341962e-07,
"loss": 0.5355,
"step": 2170
},
{
"epoch": 0.2624924744130042,
"grad_norm": 4.537413120269775,
"learning_rate": 6.562311860325105e-07,
"loss": 0.5393,
"step": 2180
},
{
"epoch": 0.2636965683323299,
"grad_norm": 5.761811256408691,
"learning_rate": 6.592414208308247e-07,
"loss": 0.5497,
"step": 2190
},
{
"epoch": 0.26490066225165565,
"grad_norm": 3.865335464477539,
"learning_rate": 6.622516556291391e-07,
"loss": 0.4914,
"step": 2200
},
{
"epoch": 0.26610475617098134,
"grad_norm": 4.600432872772217,
"learning_rate": 6.652618904274533e-07,
"loss": 0.5099,
"step": 2210
},
{
"epoch": 0.267308850090307,
"grad_norm": 4.737097263336182,
"learning_rate": 6.682721252257675e-07,
"loss": 0.5236,
"step": 2220
},
{
"epoch": 0.26851294400963277,
"grad_norm": 4.7886247634887695,
"learning_rate": 6.712823600240819e-07,
"loss": 0.5152,
"step": 2230
},
{
"epoch": 0.26971703792895846,
"grad_norm": 6.00905179977417,
"learning_rate": 6.742925948223961e-07,
"loss": 0.5369,
"step": 2240
},
{
"epoch": 0.27092113184828415,
"grad_norm": 5.080295085906982,
"learning_rate": 6.773028296207104e-07,
"loss": 0.5135,
"step": 2250
},
{
"epoch": 0.2721252257676099,
"grad_norm": 5.130943775177002,
"learning_rate": 6.803130644190247e-07,
"loss": 0.4921,
"step": 2260
},
{
"epoch": 0.2733293196869356,
"grad_norm": 4.8161187171936035,
"learning_rate": 6.833232992173389e-07,
"loss": 0.5243,
"step": 2270
},
{
"epoch": 0.27453341360626127,
"grad_norm": 5.960630416870117,
"learning_rate": 6.863335340156532e-07,
"loss": 0.525,
"step": 2280
},
{
"epoch": 0.275737507525587,
"grad_norm": 6.012716770172119,
"learning_rate": 6.893437688139675e-07,
"loss": 0.5126,
"step": 2290
},
{
"epoch": 0.2769416014449127,
"grad_norm": 4.913167476654053,
"learning_rate": 6.923540036122817e-07,
"loss": 0.531,
"step": 2300
},
{
"epoch": 0.2781456953642384,
"grad_norm": 5.190576076507568,
"learning_rate": 6.95364238410596e-07,
"loss": 0.5147,
"step": 2310
},
{
"epoch": 0.27934978928356413,
"grad_norm": 4.0760602951049805,
"learning_rate": 6.983744732089103e-07,
"loss": 0.5135,
"step": 2320
},
{
"epoch": 0.2805538832028898,
"grad_norm": 4.385684490203857,
"learning_rate": 7.013847080072245e-07,
"loss": 0.5196,
"step": 2330
},
{
"epoch": 0.2817579771222155,
"grad_norm": 4.470118045806885,
"learning_rate": 7.043949428055388e-07,
"loss": 0.502,
"step": 2340
},
{
"epoch": 0.28296207104154125,
"grad_norm": 4.798367023468018,
"learning_rate": 7.074051776038531e-07,
"loss": 0.5078,
"step": 2350
},
{
"epoch": 0.28416616496086694,
"grad_norm": 4.64969539642334,
"learning_rate": 7.104154124021673e-07,
"loss": 0.5126,
"step": 2360
},
{
"epoch": 0.28537025888019263,
"grad_norm": 5.035313606262207,
"learning_rate": 7.134256472004816e-07,
"loss": 0.5068,
"step": 2370
},
{
"epoch": 0.2865743527995184,
"grad_norm": 3.7338409423828125,
"learning_rate": 7.164358819987959e-07,
"loss": 0.4956,
"step": 2380
},
{
"epoch": 0.28777844671884406,
"grad_norm": 5.102356910705566,
"learning_rate": 7.194461167971101e-07,
"loss": 0.5128,
"step": 2390
},
{
"epoch": 0.2889825406381698,
"grad_norm": 5.0710320472717285,
"learning_rate": 7.224563515954244e-07,
"loss": 0.5064,
"step": 2400
},
{
"epoch": 0.2901866345574955,
"grad_norm": 5.2054667472839355,
"learning_rate": 7.254665863937387e-07,
"loss": 0.5318,
"step": 2410
},
{
"epoch": 0.2913907284768212,
"grad_norm": 4.590500831604004,
"learning_rate": 7.284768211920528e-07,
"loss": 0.5352,
"step": 2420
},
{
"epoch": 0.2925948223961469,
"grad_norm": 5.737983226776123,
"learning_rate": 7.314870559903672e-07,
"loss": 0.5047,
"step": 2430
},
{
"epoch": 0.2937989163154726,
"grad_norm": 5.184499263763428,
"learning_rate": 7.344972907886815e-07,
"loss": 0.4998,
"step": 2440
},
{
"epoch": 0.2950030102347983,
"grad_norm": 5.553317070007324,
"learning_rate": 7.375075255869959e-07,
"loss": 0.5099,
"step": 2450
},
{
"epoch": 0.29620710415412405,
"grad_norm": 4.864592552185059,
"learning_rate": 7.4051776038531e-07,
"loss": 0.5071,
"step": 2460
},
{
"epoch": 0.29741119807344973,
"grad_norm": 4.1055803298950195,
"learning_rate": 7.435279951836243e-07,
"loss": 0.4985,
"step": 2470
},
{
"epoch": 0.2986152919927754,
"grad_norm": 5.875371932983398,
"learning_rate": 7.465382299819386e-07,
"loss": 0.4982,
"step": 2480
},
{
"epoch": 0.29981938591210117,
"grad_norm": 4.417768955230713,
"learning_rate": 7.495484647802528e-07,
"loss": 0.4999,
"step": 2490
},
{
"epoch": 0.30102347983142685,
"grad_norm": 4.034854888916016,
"learning_rate": 7.525586995785671e-07,
"loss": 0.5063,
"step": 2500
},
{
"epoch": 0.30222757375075254,
"grad_norm": 4.711478233337402,
"learning_rate": 7.555689343768814e-07,
"loss": 0.5194,
"step": 2510
},
{
"epoch": 0.3034316676700783,
"grad_norm": 4.778373718261719,
"learning_rate": 7.585791691751956e-07,
"loss": 0.5178,
"step": 2520
},
{
"epoch": 0.304635761589404,
"grad_norm": 3.896817922592163,
"learning_rate": 7.615894039735099e-07,
"loss": 0.5073,
"step": 2530
},
{
"epoch": 0.30583985550872966,
"grad_norm": 4.729064464569092,
"learning_rate": 7.645996387718242e-07,
"loss": 0.5083,
"step": 2540
},
{
"epoch": 0.3070439494280554,
"grad_norm": 4.760159015655518,
"learning_rate": 7.676098735701384e-07,
"loss": 0.5108,
"step": 2550
},
{
"epoch": 0.3082480433473811,
"grad_norm": 4.362825870513916,
"learning_rate": 7.706201083684527e-07,
"loss": 0.5027,
"step": 2560
},
{
"epoch": 0.3094521372667068,
"grad_norm": 4.749810695648193,
"learning_rate": 7.73630343166767e-07,
"loss": 0.5051,
"step": 2570
},
{
"epoch": 0.3106562311860325,
"grad_norm": 4.157332897186279,
"learning_rate": 7.766405779650812e-07,
"loss": 0.5,
"step": 2580
},
{
"epoch": 0.3118603251053582,
"grad_norm": 4.272891044616699,
"learning_rate": 7.796508127633955e-07,
"loss": 0.4946,
"step": 2590
},
{
"epoch": 0.3130644190246839,
"grad_norm": 4.159026145935059,
"learning_rate": 7.826610475617098e-07,
"loss": 0.4992,
"step": 2600
},
{
"epoch": 0.31426851294400965,
"grad_norm": 5.095447063446045,
"learning_rate": 7.85671282360024e-07,
"loss": 0.4968,
"step": 2610
},
{
"epoch": 0.31547260686333534,
"grad_norm": 4.606817722320557,
"learning_rate": 7.886815171583383e-07,
"loss": 0.5018,
"step": 2620
},
{
"epoch": 0.316676700782661,
"grad_norm": 4.154166221618652,
"learning_rate": 7.916917519566526e-07,
"loss": 0.4848,
"step": 2630
},
{
"epoch": 0.31788079470198677,
"grad_norm": 4.749946117401123,
"learning_rate": 7.947019867549668e-07,
"loss": 0.4955,
"step": 2640
},
{
"epoch": 0.31908488862131246,
"grad_norm": 6.158957481384277,
"learning_rate": 7.977122215532812e-07,
"loss": 0.5088,
"step": 2650
},
{
"epoch": 0.32028898254063815,
"grad_norm": 4.356431484222412,
"learning_rate": 8.007224563515954e-07,
"loss": 0.5071,
"step": 2660
},
{
"epoch": 0.3214930764599639,
"grad_norm": 5.454282760620117,
"learning_rate": 8.037326911499096e-07,
"loss": 0.518,
"step": 2670
},
{
"epoch": 0.3226971703792896,
"grad_norm": 4.323178291320801,
"learning_rate": 8.06742925948224e-07,
"loss": 0.5077,
"step": 2680
},
{
"epoch": 0.32390126429861527,
"grad_norm": 5.352051258087158,
"learning_rate": 8.097531607465382e-07,
"loss": 0.5042,
"step": 2690
},
{
"epoch": 0.325105358217941,
"grad_norm": 4.680684566497803,
"learning_rate": 8.127633955448525e-07,
"loss": 0.5006,
"step": 2700
},
{
"epoch": 0.3263094521372667,
"grad_norm": 5.054072380065918,
"learning_rate": 8.157736303431668e-07,
"loss": 0.5005,
"step": 2710
},
{
"epoch": 0.32751354605659244,
"grad_norm": 4.090258598327637,
"learning_rate": 8.18783865141481e-07,
"loss": 0.4694,
"step": 2720
},
{
"epoch": 0.32871763997591813,
"grad_norm": 4.663838863372803,
"learning_rate": 8.217940999397953e-07,
"loss": 0.502,
"step": 2730
},
{
"epoch": 0.3299217338952438,
"grad_norm": 4.440493106842041,
"learning_rate": 8.248043347381095e-07,
"loss": 0.4933,
"step": 2740
},
{
"epoch": 0.33112582781456956,
"grad_norm": 5.184099197387695,
"learning_rate": 8.278145695364238e-07,
"loss": 0.5088,
"step": 2750
},
{
"epoch": 0.33232992173389525,
"grad_norm": 4.647283554077148,
"learning_rate": 8.308248043347381e-07,
"loss": 0.4909,
"step": 2760
},
{
"epoch": 0.33353401565322094,
"grad_norm": 4.6232500076293945,
"learning_rate": 8.338350391330523e-07,
"loss": 0.4929,
"step": 2770
},
{
"epoch": 0.3347381095725467,
"grad_norm": 5.234133720397949,
"learning_rate": 8.368452739313666e-07,
"loss": 0.5287,
"step": 2780
},
{
"epoch": 0.33594220349187237,
"grad_norm": 4.967161178588867,
"learning_rate": 8.398555087296809e-07,
"loss": 0.5041,
"step": 2790
},
{
"epoch": 0.33714629741119806,
"grad_norm": 4.8062591552734375,
"learning_rate": 8.428657435279951e-07,
"loss": 0.4878,
"step": 2800
},
{
"epoch": 0.3383503913305238,
"grad_norm": 5.188631534576416,
"learning_rate": 8.458759783263094e-07,
"loss": 0.4907,
"step": 2810
},
{
"epoch": 0.3395544852498495,
"grad_norm": 4.293895244598389,
"learning_rate": 8.488862131246237e-07,
"loss": 0.4952,
"step": 2820
},
{
"epoch": 0.3407585791691752,
"grad_norm": 5.219202041625977,
"learning_rate": 8.518964479229379e-07,
"loss": 0.5046,
"step": 2830
},
{
"epoch": 0.3419626730885009,
"grad_norm": 4.529453754425049,
"learning_rate": 8.549066827212522e-07,
"loss": 0.4951,
"step": 2840
},
{
"epoch": 0.3431667670078266,
"grad_norm": 4.706615924835205,
"learning_rate": 8.579169175195666e-07,
"loss": 0.5083,
"step": 2850
},
{
"epoch": 0.3443708609271523,
"grad_norm": 5.135066986083984,
"learning_rate": 8.609271523178807e-07,
"loss": 0.4823,
"step": 2860
},
{
"epoch": 0.34557495484647804,
"grad_norm": 4.977953910827637,
"learning_rate": 8.63937387116195e-07,
"loss": 0.4845,
"step": 2870
},
{
"epoch": 0.34677904876580373,
"grad_norm": 4.964434623718262,
"learning_rate": 8.669476219145094e-07,
"loss": 0.5008,
"step": 2880
},
{
"epoch": 0.3479831426851294,
"grad_norm": 4.28712797164917,
"learning_rate": 8.699578567128235e-07,
"loss": 0.4819,
"step": 2890
},
{
"epoch": 0.34918723660445516,
"grad_norm": 4.125621318817139,
"learning_rate": 8.729680915111378e-07,
"loss": 0.505,
"step": 2900
},
{
"epoch": 0.35039133052378085,
"grad_norm": 4.779543399810791,
"learning_rate": 8.759783263094522e-07,
"loss": 0.5002,
"step": 2910
},
{
"epoch": 0.35159542444310654,
"grad_norm": 4.9358320236206055,
"learning_rate": 8.789885611077663e-07,
"loss": 0.4854,
"step": 2920
},
{
"epoch": 0.3527995183624323,
"grad_norm": 5.439524173736572,
"learning_rate": 8.819987959060806e-07,
"loss": 0.4893,
"step": 2930
},
{
"epoch": 0.354003612281758,
"grad_norm": 5.939353942871094,
"learning_rate": 8.85009030704395e-07,
"loss": 0.4876,
"step": 2940
},
{
"epoch": 0.35520770620108366,
"grad_norm": 5.600659370422363,
"learning_rate": 8.880192655027092e-07,
"loss": 0.4916,
"step": 2950
},
{
"epoch": 0.3564118001204094,
"grad_norm": 6.2792134284973145,
"learning_rate": 8.910295003010234e-07,
"loss": 0.5139,
"step": 2960
},
{
"epoch": 0.3576158940397351,
"grad_norm": 5.060665130615234,
"learning_rate": 8.940397350993378e-07,
"loss": 0.5138,
"step": 2970
},
{
"epoch": 0.3588199879590608,
"grad_norm": 5.271560192108154,
"learning_rate": 8.97049969897652e-07,
"loss": 0.4971,
"step": 2980
},
{
"epoch": 0.3600240818783865,
"grad_norm": 4.9547014236450195,
"learning_rate": 9.000602046959662e-07,
"loss": 0.4767,
"step": 2990
},
{
"epoch": 0.3612281757977122,
"grad_norm": 5.039198398590088,
"learning_rate": 9.030704394942806e-07,
"loss": 0.5038,
"step": 3000
},
{
"epoch": 0.3624322697170379,
"grad_norm": 3.5281832218170166,
"learning_rate": 9.060806742925948e-07,
"loss": 0.4837,
"step": 3010
},
{
"epoch": 0.36363636363636365,
"grad_norm": 4.734562873840332,
"learning_rate": 9.09090909090909e-07,
"loss": 0.4925,
"step": 3020
},
{
"epoch": 0.36484045755568933,
"grad_norm": 4.400488376617432,
"learning_rate": 9.121011438892233e-07,
"loss": 0.4819,
"step": 3030
},
{
"epoch": 0.3660445514750151,
"grad_norm": 4.797727584838867,
"learning_rate": 9.151113786875376e-07,
"loss": 0.4779,
"step": 3040
},
{
"epoch": 0.36724864539434077,
"grad_norm": 4.852715492248535,
"learning_rate": 9.181216134858518e-07,
"loss": 0.4581,
"step": 3050
},
{
"epoch": 0.36845273931366646,
"grad_norm": 4.8324971199035645,
"learning_rate": 9.211318482841661e-07,
"loss": 0.5075,
"step": 3060
},
{
"epoch": 0.3696568332329922,
"grad_norm": 4.099527835845947,
"learning_rate": 9.241420830824804e-07,
"loss": 0.4926,
"step": 3070
},
{
"epoch": 0.3708609271523179,
"grad_norm": 4.540558338165283,
"learning_rate": 9.271523178807946e-07,
"loss": 0.4901,
"step": 3080
},
{
"epoch": 0.3720650210716436,
"grad_norm": 4.567551612854004,
"learning_rate": 9.301625526791089e-07,
"loss": 0.4781,
"step": 3090
},
{
"epoch": 0.3732691149909693,
"grad_norm": 5.362119674682617,
"learning_rate": 9.331727874774232e-07,
"loss": 0.4784,
"step": 3100
},
{
"epoch": 0.374473208910295,
"grad_norm": 4.974254131317139,
"learning_rate": 9.361830222757375e-07,
"loss": 0.4985,
"step": 3110
},
{
"epoch": 0.3756773028296207,
"grad_norm": 4.490511417388916,
"learning_rate": 9.391932570740517e-07,
"loss": 0.4619,
"step": 3120
},
{
"epoch": 0.37688139674894644,
"grad_norm": 4.691735744476318,
"learning_rate": 9.42203491872366e-07,
"loss": 0.4892,
"step": 3130
},
{
"epoch": 0.37808549066827213,
"grad_norm": 5.031266689300537,
"learning_rate": 9.452137266706803e-07,
"loss": 0.4653,
"step": 3140
},
{
"epoch": 0.3792895845875978,
"grad_norm": 6.112424850463867,
"learning_rate": 9.482239614689945e-07,
"loss": 0.4887,
"step": 3150
},
{
"epoch": 0.38049367850692356,
"grad_norm": 4.281744480133057,
"learning_rate": 9.512341962673088e-07,
"loss": 0.4828,
"step": 3160
},
{
"epoch": 0.38169777242624925,
"grad_norm": 4.672320365905762,
"learning_rate": 9.54244431065623e-07,
"loss": 0.4807,
"step": 3170
},
{
"epoch": 0.38290186634557494,
"grad_norm": 4.8247528076171875,
"learning_rate": 9.572546658639373e-07,
"loss": 0.4652,
"step": 3180
},
{
"epoch": 0.3841059602649007,
"grad_norm": 4.806872844696045,
"learning_rate": 9.602649006622515e-07,
"loss": 0.4687,
"step": 3190
},
{
"epoch": 0.38531005418422637,
"grad_norm": 4.877020835876465,
"learning_rate": 9.63275135460566e-07,
"loss": 0.4954,
"step": 3200
},
{
"epoch": 0.38651414810355206,
"grad_norm": 5.005871295928955,
"learning_rate": 9.662853702588802e-07,
"loss": 0.5117,
"step": 3210
},
{
"epoch": 0.3877182420228778,
"grad_norm": 4.2746100425720215,
"learning_rate": 9.692956050571944e-07,
"loss": 0.472,
"step": 3220
},
{
"epoch": 0.3889223359422035,
"grad_norm": 4.155144691467285,
"learning_rate": 9.723058398555087e-07,
"loss": 0.4882,
"step": 3230
},
{
"epoch": 0.3901264298615292,
"grad_norm": 4.557404041290283,
"learning_rate": 9.75316074653823e-07,
"loss": 0.4845,
"step": 3240
},
{
"epoch": 0.3913305237808549,
"grad_norm": 4.442798614501953,
"learning_rate": 9.783263094521371e-07,
"loss": 0.4822,
"step": 3250
},
{
"epoch": 0.3925346177001806,
"grad_norm": 5.363224029541016,
"learning_rate": 9.813365442504516e-07,
"loss": 0.4808,
"step": 3260
},
{
"epoch": 0.3937387116195063,
"grad_norm": 4.809715747833252,
"learning_rate": 9.843467790487658e-07,
"loss": 0.4834,
"step": 3270
},
{
"epoch": 0.39494280553883204,
"grad_norm": 4.954145431518555,
"learning_rate": 9.8735701384708e-07,
"loss": 0.4796,
"step": 3280
},
{
"epoch": 0.39614689945815773,
"grad_norm": 4.381477355957031,
"learning_rate": 9.903672486453943e-07,
"loss": 0.465,
"step": 3290
},
{
"epoch": 0.3973509933774834,
"grad_norm": 5.086960315704346,
"learning_rate": 9.933774834437085e-07,
"loss": 0.4996,
"step": 3300
},
{
"epoch": 0.39855508729680916,
"grad_norm": 5.4834303855896,
"learning_rate": 9.963877182420227e-07,
"loss": 0.4854,
"step": 3310
},
{
"epoch": 0.39975918121613485,
"grad_norm": 4.411494255065918,
"learning_rate": 9.993979530403372e-07,
"loss": 0.4882,
"step": 3320
},
{
"epoch": 0.40096327513546054,
"grad_norm": 3.9291751384735107,
"learning_rate": 9.999998233411383e-07,
"loss": 0.4975,
"step": 3330
},
{
"epoch": 0.4021673690547863,
"grad_norm": 4.288562774658203,
"learning_rate": 9.999991056647273e-07,
"loss": 0.4712,
"step": 3340
},
{
"epoch": 0.40337146297411197,
"grad_norm": 4.603250026702881,
"learning_rate": 9.999978359303795e-07,
"loss": 0.4933,
"step": 3350
},
{
"epoch": 0.4045755568934377,
"grad_norm": 4.753664970397949,
"learning_rate": 9.999960141394973e-07,
"loss": 0.4748,
"step": 3360
},
{
"epoch": 0.4057796508127634,
"grad_norm": 4.143571376800537,
"learning_rate": 9.99993640294092e-07,
"loss": 0.46,
"step": 3370
},
{
"epoch": 0.4069837447320891,
"grad_norm": 5.25679874420166,
"learning_rate": 9.99990714396784e-07,
"loss": 0.4859,
"step": 3380
},
{
"epoch": 0.40818783865141484,
"grad_norm": 5.903568744659424,
"learning_rate": 9.999872364508047e-07,
"loss": 0.4942,
"step": 3390
},
{
"epoch": 0.4093919325707405,
"grad_norm": 4.5355939865112305,
"learning_rate": 9.999832064599938e-07,
"loss": 0.4713,
"step": 3400
},
{
"epoch": 0.4105960264900662,
"grad_norm": 4.297218322753906,
"learning_rate": 9.999786244288008e-07,
"loss": 0.4701,
"step": 3410
},
{
"epoch": 0.41180012040939196,
"grad_norm": 4.364749908447266,
"learning_rate": 9.99973490362285e-07,
"loss": 0.4805,
"step": 3420
},
{
"epoch": 0.41300421432871764,
"grad_norm": 5.253974914550781,
"learning_rate": 9.999678042661147e-07,
"loss": 0.4728,
"step": 3430
},
{
"epoch": 0.41420830824804333,
"grad_norm": 3.7505037784576416,
"learning_rate": 9.999615661465685e-07,
"loss": 0.4666,
"step": 3440
},
{
"epoch": 0.4154124021673691,
"grad_norm": 4.56821346282959,
"learning_rate": 9.999547760105335e-07,
"loss": 0.4654,
"step": 3450
},
{
"epoch": 0.41661649608669477,
"grad_norm": 5.777834415435791,
"learning_rate": 9.999474338655073e-07,
"loss": 0.4708,
"step": 3460
},
{
"epoch": 0.41782059000602045,
"grad_norm": 4.463301181793213,
"learning_rate": 9.999395397195961e-07,
"loss": 0.4736,
"step": 3470
},
{
"epoch": 0.4190246839253462,
"grad_norm": 4.7559494972229,
"learning_rate": 9.999310935815165e-07,
"loss": 0.4858,
"step": 3480
},
{
"epoch": 0.4202287778446719,
"grad_norm": 5.451569557189941,
"learning_rate": 9.999220954605932e-07,
"loss": 0.4945,
"step": 3490
},
{
"epoch": 0.4214328717639976,
"grad_norm": 4.072139739990234,
"learning_rate": 9.99912545366762e-07,
"loss": 0.4685,
"step": 3500
},
{
"epoch": 0.4226369656833233,
"grad_norm": 5.299817085266113,
"learning_rate": 9.999024433105666e-07,
"loss": 0.4782,
"step": 3510
},
{
"epoch": 0.423841059602649,
"grad_norm": 4.960267543792725,
"learning_rate": 9.998917893031615e-07,
"loss": 0.4766,
"step": 3520
},
{
"epoch": 0.4250451535219747,
"grad_norm": 5.582713603973389,
"learning_rate": 9.998805833563096e-07,
"loss": 0.4737,
"step": 3530
},
{
"epoch": 0.42624924744130044,
"grad_norm": 4.434458255767822,
"learning_rate": 9.998688254823836e-07,
"loss": 0.4679,
"step": 3540
},
{
"epoch": 0.4274533413606261,
"grad_norm": 4.943469524383545,
"learning_rate": 9.99856515694366e-07,
"loss": 0.4754,
"step": 3550
},
{
"epoch": 0.4286574352799518,
"grad_norm": 5.145878314971924,
"learning_rate": 9.998436540058476e-07,
"loss": 0.4855,
"step": 3560
},
{
"epoch": 0.42986152919927756,
"grad_norm": 4.884524822235107,
"learning_rate": 9.998302404310296e-07,
"loss": 0.4801,
"step": 3570
},
{
"epoch": 0.43106562311860325,
"grad_norm": 4.950911045074463,
"learning_rate": 9.998162749847223e-07,
"loss": 0.51,
"step": 3580
},
{
"epoch": 0.43226971703792894,
"grad_norm": 4.5520148277282715,
"learning_rate": 9.99801757682345e-07,
"loss": 0.4887,
"step": 3590
},
{
"epoch": 0.4334738109572547,
"grad_norm": 5.745821952819824,
"learning_rate": 9.997866885399265e-07,
"loss": 0.4934,
"step": 3600
},
{
"epoch": 0.43467790487658037,
"grad_norm": 4.750070095062256,
"learning_rate": 9.997710675741049e-07,
"loss": 0.4611,
"step": 3610
},
{
"epoch": 0.43588199879590606,
"grad_norm": 4.3570966720581055,
"learning_rate": 9.997548948021275e-07,
"loss": 0.4819,
"step": 3620
},
{
"epoch": 0.4370860927152318,
"grad_norm": 3.810598373413086,
"learning_rate": 9.997381702418513e-07,
"loss": 0.4514,
"step": 3630
},
{
"epoch": 0.4382901866345575,
"grad_norm": 4.763775825500488,
"learning_rate": 9.997208939117418e-07,
"loss": 0.4686,
"step": 3640
},
{
"epoch": 0.4394942805538832,
"grad_norm": 4.3974385261535645,
"learning_rate": 9.997030658308745e-07,
"loss": 0.4763,
"step": 3650
},
{
"epoch": 0.4406983744732089,
"grad_norm": 4.901960372924805,
"learning_rate": 9.996846860189332e-07,
"loss": 0.4649,
"step": 3660
},
{
"epoch": 0.4419024683925346,
"grad_norm": 3.764139175415039,
"learning_rate": 9.996657544962118e-07,
"loss": 0.4752,
"step": 3670
},
{
"epoch": 0.44310656231186035,
"grad_norm": 4.972975730895996,
"learning_rate": 9.996462712836126e-07,
"loss": 0.4736,
"step": 3680
},
{
"epoch": 0.44431065623118604,
"grad_norm": 3.928086757659912,
"learning_rate": 9.996262364026477e-07,
"loss": 0.4939,
"step": 3690
},
{
"epoch": 0.44551475015051173,
"grad_norm": 4.017699718475342,
"learning_rate": 9.99605649875438e-07,
"loss": 0.4693,
"step": 3700
},
{
"epoch": 0.4467188440698375,
"grad_norm": 6.103999137878418,
"learning_rate": 9.995845117247129e-07,
"loss": 0.4774,
"step": 3710
},
{
"epoch": 0.44792293798916316,
"grad_norm": 6.031617641448975,
"learning_rate": 9.99562821973812e-07,
"loss": 0.4528,
"step": 3720
},
{
"epoch": 0.44912703190848885,
"grad_norm": 4.691218852996826,
"learning_rate": 9.99540580646683e-07,
"loss": 0.4646,
"step": 3730
},
{
"epoch": 0.4503311258278146,
"grad_norm": 4.680331230163574,
"learning_rate": 9.995177877678832e-07,
"loss": 0.469,
"step": 3740
},
{
"epoch": 0.4515352197471403,
"grad_norm": 4.436509132385254,
"learning_rate": 9.994944433625784e-07,
"loss": 0.4619,
"step": 3750
},
{
"epoch": 0.45273931366646597,
"grad_norm": 4.72512149810791,
"learning_rate": 9.994705474565435e-07,
"loss": 0.4404,
"step": 3760
},
{
"epoch": 0.4539434075857917,
"grad_norm": 4.427882194519043,
"learning_rate": 9.994461000761627e-07,
"loss": 0.4826,
"step": 3770
},
{
"epoch": 0.4551475015051174,
"grad_norm": 4.025267124176025,
"learning_rate": 9.994211012484285e-07,
"loss": 0.4671,
"step": 3780
},
{
"epoch": 0.4563515954244431,
"grad_norm": 5.315865516662598,
"learning_rate": 9.99395551000943e-07,
"loss": 0.4922,
"step": 3790
},
{
"epoch": 0.45755568934376883,
"grad_norm": 5.362889289855957,
"learning_rate": 9.993694493619162e-07,
"loss": 0.4554,
"step": 3800
},
{
"epoch": 0.4587597832630945,
"grad_norm": 3.8804094791412354,
"learning_rate": 9.993427963601674e-07,
"loss": 0.4558,
"step": 3810
},
{
"epoch": 0.4599638771824202,
"grad_norm": 3.8259241580963135,
"learning_rate": 9.99315592025125e-07,
"loss": 0.4756,
"step": 3820
},
{
"epoch": 0.46116797110174595,
"grad_norm": 3.806236505508423,
"learning_rate": 9.992878363868256e-07,
"loss": 0.4801,
"step": 3830
},
{
"epoch": 0.46237206502107164,
"grad_norm": 4.628232002258301,
"learning_rate": 9.992595294759147e-07,
"loss": 0.4953,
"step": 3840
},
{
"epoch": 0.46357615894039733,
"grad_norm": 4.719220161437988,
"learning_rate": 9.992306713236465e-07,
"loss": 0.4658,
"step": 3850
},
{
"epoch": 0.4647802528597231,
"grad_norm": 4.918371200561523,
"learning_rate": 9.992012619618838e-07,
"loss": 0.4691,
"step": 3860
},
{
"epoch": 0.46598434677904876,
"grad_norm": 4.425540447235107,
"learning_rate": 9.991713014230981e-07,
"loss": 0.4648,
"step": 3870
},
{
"epoch": 0.46718844069837445,
"grad_norm": 3.687819480895996,
"learning_rate": 9.99140789740369e-07,
"loss": 0.4714,
"step": 3880
},
{
"epoch": 0.4683925346177002,
"grad_norm": 4.835513591766357,
"learning_rate": 9.991097269473852e-07,
"loss": 0.4866,
"step": 3890
},
{
"epoch": 0.4695966285370259,
"grad_norm": 4.215537071228027,
"learning_rate": 9.990781130784437e-07,
"loss": 0.4697,
"step": 3900
},
{
"epoch": 0.4708007224563516,
"grad_norm": 4.371738433837891,
"learning_rate": 9.990459481684504e-07,
"loss": 0.4655,
"step": 3910
},
{
"epoch": 0.4720048163756773,
"grad_norm": 4.469852924346924,
"learning_rate": 9.990132322529181e-07,
"loss": 0.4416,
"step": 3920
},
{
"epoch": 0.473208910295003,
"grad_norm": 4.61678409576416,
"learning_rate": 9.989799653679701e-07,
"loss": 0.4625,
"step": 3930
},
{
"epoch": 0.4744130042143287,
"grad_norm": 5.12364387512207,
"learning_rate": 9.989461475503362e-07,
"loss": 0.4515,
"step": 3940
},
{
"epoch": 0.47561709813365444,
"grad_norm": 5.4315924644470215,
"learning_rate": 9.989117788373558e-07,
"loss": 0.4773,
"step": 3950
},
{
"epoch": 0.4768211920529801,
"grad_norm": 4.474724769592285,
"learning_rate": 9.988768592669756e-07,
"loss": 0.445,
"step": 3960
},
{
"epoch": 0.4780252859723058,
"grad_norm": 4.433851718902588,
"learning_rate": 9.98841388877751e-07,
"loss": 0.4667,
"step": 3970
},
{
"epoch": 0.47922937989163156,
"grad_norm": 4.388487815856934,
"learning_rate": 9.988053677088456e-07,
"loss": 0.443,
"step": 3980
},
{
"epoch": 0.48043347381095725,
"grad_norm": 5.400040149688721,
"learning_rate": 9.987687958000314e-07,
"loss": 0.4702,
"step": 3990
},
{
"epoch": 0.481637567730283,
"grad_norm": 4.436804294586182,
"learning_rate": 9.987316731916872e-07,
"loss": 0.4568,
"step": 4000
},
{
"epoch": 0.4828416616496087,
"grad_norm": 5.063580513000488,
"learning_rate": 9.986939999248014e-07,
"loss": 0.4877,
"step": 4010
},
{
"epoch": 0.48404575556893437,
"grad_norm": 4.696618556976318,
"learning_rate": 9.986557760409694e-07,
"loss": 0.464,
"step": 4020
},
{
"epoch": 0.4852498494882601,
"grad_norm": 5.019808292388916,
"learning_rate": 9.98617001582395e-07,
"loss": 0.4533,
"step": 4030
},
{
"epoch": 0.4864539434075858,
"grad_norm": 4.419073104858398,
"learning_rate": 9.9857767659189e-07,
"loss": 0.4416,
"step": 4040
},
{
"epoch": 0.4876580373269115,
"grad_norm": 4.31454610824585,
"learning_rate": 9.985378011128736e-07,
"loss": 0.458,
"step": 4050
},
{
"epoch": 0.48886213124623723,
"grad_norm": 5.41327428817749,
"learning_rate": 9.98497375189373e-07,
"loss": 0.4669,
"step": 4060
},
{
"epoch": 0.4900662251655629,
"grad_norm": 4.439949035644531,
"learning_rate": 9.98456398866023e-07,
"loss": 0.4532,
"step": 4070
},
{
"epoch": 0.4912703190848886,
"grad_norm": 4.076527118682861,
"learning_rate": 9.98414872188067e-07,
"loss": 0.4565,
"step": 4080
},
{
"epoch": 0.49247441300421435,
"grad_norm": 4.239142894744873,
"learning_rate": 9.983727952013545e-07,
"loss": 0.4686,
"step": 4090
},
{
"epoch": 0.49367850692354004,
"grad_norm": 4.340599060058594,
"learning_rate": 9.98330167952344e-07,
"loss": 0.4654,
"step": 4100
},
{
"epoch": 0.4948826008428657,
"grad_norm": 4.37545108795166,
"learning_rate": 9.982869904881007e-07,
"loss": 0.4634,
"step": 4110
},
{
"epoch": 0.49608669476219147,
"grad_norm": 4.235968112945557,
"learning_rate": 9.982432628562976e-07,
"loss": 0.4537,
"step": 4120
},
{
"epoch": 0.49729078868151716,
"grad_norm": 5.080899715423584,
"learning_rate": 9.981989851052153e-07,
"loss": 0.4675,
"step": 4130
},
{
"epoch": 0.49849488260084285,
"grad_norm": 4.327193260192871,
"learning_rate": 9.98154157283742e-07,
"loss": 0.4336,
"step": 4140
},
{
"epoch": 0.4996989765201686,
"grad_norm": 4.647739887237549,
"learning_rate": 9.981087794413721e-07,
"loss": 0.4547,
"step": 4150
},
{
"epoch": 0.5009030704394943,
"grad_norm": 4.411125659942627,
"learning_rate": 9.980628516282088e-07,
"loss": 0.4453,
"step": 4160
},
{
"epoch": 0.50210716435882,
"grad_norm": 4.8657026290893555,
"learning_rate": 9.980163738949615e-07,
"loss": 0.4714,
"step": 4170
},
{
"epoch": 0.5033112582781457,
"grad_norm": 4.7668776512146,
"learning_rate": 9.97969346292947e-07,
"loss": 0.4472,
"step": 4180
},
{
"epoch": 0.5045153521974715,
"grad_norm": 5.490717887878418,
"learning_rate": 9.979217688740895e-07,
"loss": 0.4767,
"step": 4190
},
{
"epoch": 0.5057194461167971,
"grad_norm": 4.896997928619385,
"learning_rate": 9.978736416909201e-07,
"loss": 0.4714,
"step": 4200
},
{
"epoch": 0.5069235400361228,
"grad_norm": 4.777568340301514,
"learning_rate": 9.978249647965768e-07,
"loss": 0.4608,
"step": 4210
},
{
"epoch": 0.5081276339554486,
"grad_norm": 4.839885711669922,
"learning_rate": 9.977757382448047e-07,
"loss": 0.4798,
"step": 4220
},
{
"epoch": 0.5093317278747742,
"grad_norm": 4.311272144317627,
"learning_rate": 9.977259620899557e-07,
"loss": 0.4347,
"step": 4230
},
{
"epoch": 0.5105358217941,
"grad_norm": 4.5723772048950195,
"learning_rate": 9.976756363869883e-07,
"loss": 0.4485,
"step": 4240
},
{
"epoch": 0.5117399157134257,
"grad_norm": 4.344234943389893,
"learning_rate": 9.976247611914681e-07,
"loss": 0.4623,
"step": 4250
},
{
"epoch": 0.5129440096327513,
"grad_norm": 4.216832160949707,
"learning_rate": 9.975733365595678e-07,
"loss": 0.4587,
"step": 4260
},
{
"epoch": 0.5141481035520771,
"grad_norm": 4.828461647033691,
"learning_rate": 9.975213625480656e-07,
"loss": 0.4616,
"step": 4270
},
{
"epoch": 0.5153521974714028,
"grad_norm": 4.608251571655273,
"learning_rate": 9.974688392143473e-07,
"loss": 0.4537,
"step": 4280
},
{
"epoch": 0.5165562913907285,
"grad_norm": 5.024391174316406,
"learning_rate": 9.974157666164047e-07,
"loss": 0.4596,
"step": 4290
},
{
"epoch": 0.5177603853100542,
"grad_norm": 4.869425296783447,
"learning_rate": 9.973621448128362e-07,
"loss": 0.468,
"step": 4300
},
{
"epoch": 0.5189644792293799,
"grad_norm": 4.599194526672363,
"learning_rate": 9.973079738628466e-07,
"loss": 0.4475,
"step": 4310
},
{
"epoch": 0.5201685731487056,
"grad_norm": 4.410305500030518,
"learning_rate": 9.972532538262473e-07,
"loss": 0.4684,
"step": 4320
},
{
"epoch": 0.5213726670680313,
"grad_norm": 3.9566409587860107,
"learning_rate": 9.971979847634552e-07,
"loss": 0.4472,
"step": 4330
},
{
"epoch": 0.5225767609873571,
"grad_norm": 4.608943462371826,
"learning_rate": 9.971421667354944e-07,
"loss": 0.4591,
"step": 4340
},
{
"epoch": 0.5237808549066827,
"grad_norm": 4.722293853759766,
"learning_rate": 9.97085799803994e-07,
"loss": 0.4529,
"step": 4350
},
{
"epoch": 0.5249849488260084,
"grad_norm": 4.868890762329102,
"learning_rate": 9.9702888403119e-07,
"loss": 0.4742,
"step": 4360
},
{
"epoch": 0.5261890427453342,
"grad_norm": 4.125800132751465,
"learning_rate": 9.969714194799243e-07,
"loss": 0.4501,
"step": 4370
},
{
"epoch": 0.5273931366646598,
"grad_norm": 4.570892810821533,
"learning_rate": 9.969134062136442e-07,
"loss": 0.4392,
"step": 4380
},
{
"epoch": 0.5285972305839856,
"grad_norm": 3.8944973945617676,
"learning_rate": 9.968548442964033e-07,
"loss": 0.4525,
"step": 4390
},
{
"epoch": 0.5298013245033113,
"grad_norm": 4.27981424331665,
"learning_rate": 9.96795733792861e-07,
"loss": 0.4607,
"step": 4400
},
{
"epoch": 0.5310054184226369,
"grad_norm": 4.3153300285339355,
"learning_rate": 9.96736074768282e-07,
"loss": 0.4709,
"step": 4410
},
{
"epoch": 0.5322095123419627,
"grad_norm": 5.543158531188965,
"learning_rate": 9.966758672885373e-07,
"loss": 0.4234,
"step": 4420
},
{
"epoch": 0.5334136062612884,
"grad_norm": 3.463160991668701,
"learning_rate": 9.966151114201027e-07,
"loss": 0.4684,
"step": 4430
},
{
"epoch": 0.534617700180614,
"grad_norm": 3.8580965995788574,
"learning_rate": 9.965538072300598e-07,
"loss": 0.4662,
"step": 4440
},
{
"epoch": 0.5358217940999398,
"grad_norm": 4.317717552185059,
"learning_rate": 9.96491954786096e-07,
"loss": 0.441,
"step": 4450
},
{
"epoch": 0.5370258880192655,
"grad_norm": 4.992043495178223,
"learning_rate": 9.964295541565035e-07,
"loss": 0.4575,
"step": 4460
},
{
"epoch": 0.5382299819385912,
"grad_norm": 4.042685031890869,
"learning_rate": 9.963666054101797e-07,
"loss": 0.421,
"step": 4470
},
{
"epoch": 0.5394340758579169,
"grad_norm": 4.4409260749816895,
"learning_rate": 9.96303108616628e-07,
"loss": 0.4684,
"step": 4480
},
{
"epoch": 0.5406381697772427,
"grad_norm": 4.652424335479736,
"learning_rate": 9.96239063845956e-07,
"loss": 0.4562,
"step": 4490
},
{
"epoch": 0.5418422636965683,
"grad_norm": 3.927960157394409,
"learning_rate": 9.961744711688765e-07,
"loss": 0.4636,
"step": 4500
},
{
"epoch": 0.543046357615894,
"grad_norm": 4.20367956161499,
"learning_rate": 9.961093306567074e-07,
"loss": 0.4629,
"step": 4510
},
{
"epoch": 0.5442504515352198,
"grad_norm": 5.0242791175842285,
"learning_rate": 9.960436423813721e-07,
"loss": 0.4699,
"step": 4520
},
{
"epoch": 0.5454545454545454,
"grad_norm": 4.339791297912598,
"learning_rate": 9.959774064153975e-07,
"loss": 0.4393,
"step": 4530
},
{
"epoch": 0.5466586393738712,
"grad_norm": 3.955888509750366,
"learning_rate": 9.959106228319164e-07,
"loss": 0.4419,
"step": 4540
},
{
"epoch": 0.5478627332931969,
"grad_norm": 4.508617401123047,
"learning_rate": 9.958432917046656e-07,
"loss": 0.4534,
"step": 4550
},
{
"epoch": 0.5490668272125225,
"grad_norm": 4.84667444229126,
"learning_rate": 9.957754131079865e-07,
"loss": 0.4621,
"step": 4560
},
{
"epoch": 0.5502709211318483,
"grad_norm": 4.65517520904541,
"learning_rate": 9.957069871168252e-07,
"loss": 0.4644,
"step": 4570
},
{
"epoch": 0.551475015051174,
"grad_norm": 4.428783416748047,
"learning_rate": 9.95638013806732e-07,
"loss": 0.4285,
"step": 4580
},
{
"epoch": 0.5526791089704997,
"grad_norm": 5.219538688659668,
"learning_rate": 9.955684932538615e-07,
"loss": 0.4342,
"step": 4590
},
{
"epoch": 0.5538832028898254,
"grad_norm": 4.356168270111084,
"learning_rate": 9.954984255349726e-07,
"loss": 0.4502,
"step": 4600
},
{
"epoch": 0.5550872968091511,
"grad_norm": 4.607705116271973,
"learning_rate": 9.954278107274286e-07,
"loss": 0.4397,
"step": 4610
},
{
"epoch": 0.5562913907284768,
"grad_norm": 4.667281150817871,
"learning_rate": 9.95356648909196e-07,
"loss": 0.4749,
"step": 4620
},
{
"epoch": 0.5574954846478025,
"grad_norm": 5.4144673347473145,
"learning_rate": 9.952849401588464e-07,
"loss": 0.4516,
"step": 4630
},
{
"epoch": 0.5586995785671283,
"grad_norm": 4.449268817901611,
"learning_rate": 9.952126845555544e-07,
"loss": 0.467,
"step": 4640
},
{
"epoch": 0.5599036724864539,
"grad_norm": 4.58141565322876,
"learning_rate": 9.951398821790988e-07,
"loss": 0.4674,
"step": 4650
},
{
"epoch": 0.5611077664057796,
"grad_norm": 4.779237747192383,
"learning_rate": 9.95066533109862e-07,
"loss": 0.4486,
"step": 4660
},
{
"epoch": 0.5623118603251054,
"grad_norm": 4.009070873260498,
"learning_rate": 9.949926374288298e-07,
"loss": 0.4466,
"step": 4670
},
{
"epoch": 0.563515954244431,
"grad_norm": 4.913680553436279,
"learning_rate": 9.949181952175922e-07,
"loss": 0.4574,
"step": 4680
},
{
"epoch": 0.5647200481637568,
"grad_norm": 4.114124774932861,
"learning_rate": 9.94843206558342e-07,
"loss": 0.4556,
"step": 4690
},
{
"epoch": 0.5659241420830825,
"grad_norm": 4.208637237548828,
"learning_rate": 9.94767671533875e-07,
"loss": 0.4446,
"step": 4700
},
{
"epoch": 0.5671282360024081,
"grad_norm": 4.362401962280273,
"learning_rate": 9.946915902275914e-07,
"loss": 0.4591,
"step": 4710
},
{
"epoch": 0.5683323299217339,
"grad_norm": 4.419969081878662,
"learning_rate": 9.946149627234939e-07,
"loss": 0.4352,
"step": 4720
},
{
"epoch": 0.5695364238410596,
"grad_norm": 5.162231922149658,
"learning_rate": 9.94537789106188e-07,
"loss": 0.4613,
"step": 4730
},
{
"epoch": 0.5707405177603853,
"grad_norm": 4.270598411560059,
"learning_rate": 9.944600694608825e-07,
"loss": 0.4628,
"step": 4740
},
{
"epoch": 0.571944611679711,
"grad_norm": 4.181495666503906,
"learning_rate": 9.943818038733891e-07,
"loss": 0.4391,
"step": 4750
},
{
"epoch": 0.5731487055990367,
"grad_norm": 4.3339033126831055,
"learning_rate": 9.943029924301225e-07,
"loss": 0.4406,
"step": 4760
},
{
"epoch": 0.5743527995183624,
"grad_norm": 4.909811496734619,
"learning_rate": 9.942236352180996e-07,
"loss": 0.4575,
"step": 4770
},
{
"epoch": 0.5755568934376881,
"grad_norm": 4.58059549331665,
"learning_rate": 9.941437323249398e-07,
"loss": 0.4613,
"step": 4780
},
{
"epoch": 0.5767609873570139,
"grad_norm": 3.9194531440734863,
"learning_rate": 9.94063283838866e-07,
"loss": 0.4449,
"step": 4790
},
{
"epoch": 0.5779650812763396,
"grad_norm": 4.602609634399414,
"learning_rate": 9.93982289848702e-07,
"loss": 0.4622,
"step": 4800
},
{
"epoch": 0.5791691751956652,
"grad_norm": 4.630181789398193,
"learning_rate": 9.939007504438754e-07,
"loss": 0.442,
"step": 4810
},
{
"epoch": 0.580373269114991,
"grad_norm": 3.903799057006836,
"learning_rate": 9.938186657144149e-07,
"loss": 0.4624,
"step": 4820
},
{
"epoch": 0.5815773630343167,
"grad_norm": 5.423624515533447,
"learning_rate": 9.937360357509522e-07,
"loss": 0.4372,
"step": 4830
},
{
"epoch": 0.5827814569536424,
"grad_norm": 4.571367263793945,
"learning_rate": 9.936528606447198e-07,
"loss": 0.4521,
"step": 4840
},
{
"epoch": 0.5839855508729681,
"grad_norm": 3.8848462104797363,
"learning_rate": 9.935691404875534e-07,
"loss": 0.4399,
"step": 4850
},
{
"epoch": 0.5851896447922939,
"grad_norm": 4.659217357635498,
"learning_rate": 9.934848753718896e-07,
"loss": 0.4345,
"step": 4860
},
{
"epoch": 0.5863937387116195,
"grad_norm": 5.5009026527404785,
"learning_rate": 9.934000653907672e-07,
"loss": 0.4173,
"step": 4870
},
{
"epoch": 0.5875978326309452,
"grad_norm": 3.984834671020508,
"learning_rate": 9.933147106378263e-07,
"loss": 0.4354,
"step": 4880
},
{
"epoch": 0.588801926550271,
"grad_norm": 4.0750346183776855,
"learning_rate": 9.932288112073086e-07,
"loss": 0.4447,
"step": 4890
},
{
"epoch": 0.5900060204695966,
"grad_norm": 4.871407985687256,
"learning_rate": 9.931423671940575e-07,
"loss": 0.4501,
"step": 4900
},
{
"epoch": 0.5912101143889223,
"grad_norm": 4.388524055480957,
"learning_rate": 9.93055378693517e-07,
"loss": 0.4421,
"step": 4910
},
{
"epoch": 0.5924142083082481,
"grad_norm": 4.511969566345215,
"learning_rate": 9.929678458017328e-07,
"loss": 0.4431,
"step": 4920
},
{
"epoch": 0.5936183022275737,
"grad_norm": 4.788571834564209,
"learning_rate": 9.928797686153514e-07,
"loss": 0.4621,
"step": 4930
},
{
"epoch": 0.5948223961468995,
"grad_norm": 5.144417762756348,
"learning_rate": 9.927911472316205e-07,
"loss": 0.4418,
"step": 4940
},
{
"epoch": 0.5960264900662252,
"grad_norm": 4.649743556976318,
"learning_rate": 9.927019817483887e-07,
"loss": 0.4639,
"step": 4950
},
{
"epoch": 0.5972305839855508,
"grad_norm": 4.76192045211792,
"learning_rate": 9.92612272264105e-07,
"loss": 0.4646,
"step": 4960
},
{
"epoch": 0.5984346779048766,
"grad_norm": 4.137574195861816,
"learning_rate": 9.925220188778193e-07,
"loss": 0.4537,
"step": 4970
},
{
"epoch": 0.5996387718242023,
"grad_norm": 4.616219997406006,
"learning_rate": 9.924312216891819e-07,
"loss": 0.4451,
"step": 4980
},
{
"epoch": 0.600842865743528,
"grad_norm": 4.623941421508789,
"learning_rate": 9.923398807984438e-07,
"loss": 0.4441,
"step": 4990
},
{
"epoch": 0.6020469596628537,
"grad_norm": 4.540246486663818,
"learning_rate": 9.92247996306456e-07,
"loss": 0.4477,
"step": 5000
},
{
"epoch": 0.6032510535821795,
"grad_norm": 4.742766380310059,
"learning_rate": 9.921555683146695e-07,
"loss": 0.4672,
"step": 5010
},
{
"epoch": 0.6044551475015051,
"grad_norm": 5.316002368927002,
"learning_rate": 9.920625969251364e-07,
"loss": 0.4593,
"step": 5020
},
{
"epoch": 0.6056592414208308,
"grad_norm": 4.386168003082275,
"learning_rate": 9.919690822405074e-07,
"loss": 0.4438,
"step": 5030
},
{
"epoch": 0.6068633353401566,
"grad_norm": 3.9734067916870117,
"learning_rate": 9.91875024364034e-07,
"loss": 0.4428,
"step": 5040
},
{
"epoch": 0.6080674292594822,
"grad_norm": 4.917031764984131,
"learning_rate": 9.917804233995673e-07,
"loss": 0.4622,
"step": 5050
},
{
"epoch": 0.609271523178808,
"grad_norm": 4.690892696380615,
"learning_rate": 9.916852794515575e-07,
"loss": 0.4513,
"step": 5060
},
{
"epoch": 0.6104756170981337,
"grad_norm": 4.1330952644348145,
"learning_rate": 9.915895926250552e-07,
"loss": 0.4523,
"step": 5070
},
{
"epoch": 0.6116797110174593,
"grad_norm": 4.932434558868408,
"learning_rate": 9.9149336302571e-07,
"loss": 0.4407,
"step": 5080
},
{
"epoch": 0.6128838049367851,
"grad_norm": 4.421885967254639,
"learning_rate": 9.913965907597702e-07,
"loss": 0.4332,
"step": 5090
},
{
"epoch": 0.6140878988561108,
"grad_norm": 5.199044704437256,
"learning_rate": 9.91299275934084e-07,
"loss": 0.426,
"step": 5100
},
{
"epoch": 0.6152919927754364,
"grad_norm": 4.189499855041504,
"learning_rate": 9.912014186560984e-07,
"loss": 0.4326,
"step": 5110
},
{
"epoch": 0.6164960866947622,
"grad_norm": 4.297112464904785,
"learning_rate": 9.911030190338597e-07,
"loss": 0.4622,
"step": 5120
},
{
"epoch": 0.6177001806140879,
"grad_norm": 3.9968087673187256,
"learning_rate": 9.910040771760122e-07,
"loss": 0.447,
"step": 5130
},
{
"epoch": 0.6189042745334136,
"grad_norm": 4.857995510101318,
"learning_rate": 9.909045931917998e-07,
"loss": 0.4343,
"step": 5140
},
{
"epoch": 0.6201083684527393,
"grad_norm": 3.741711378097534,
"learning_rate": 9.908045671910642e-07,
"loss": 0.4366,
"step": 5150
},
{
"epoch": 0.621312462372065,
"grad_norm": 4.424086093902588,
"learning_rate": 9.907039992842461e-07,
"loss": 0.448,
"step": 5160
},
{
"epoch": 0.6225165562913907,
"grad_norm": 5.499582767486572,
"learning_rate": 9.906028895823842e-07,
"loss": 0.4546,
"step": 5170
},
{
"epoch": 0.6237206502107164,
"grad_norm": 4.836984634399414,
"learning_rate": 9.905012381971157e-07,
"loss": 0.4605,
"step": 5180
},
{
"epoch": 0.6249247441300422,
"grad_norm": 4.31553316116333,
"learning_rate": 9.903990452406756e-07,
"loss": 0.4302,
"step": 5190
},
{
"epoch": 0.6261288380493678,
"grad_norm": 4.909146785736084,
"learning_rate": 9.902963108258968e-07,
"loss": 0.4445,
"step": 5200
},
{
"epoch": 0.6273329319686936,
"grad_norm": 4.295082092285156,
"learning_rate": 9.901930350662103e-07,
"loss": 0.4364,
"step": 5210
},
{
"epoch": 0.6285370258880193,
"grad_norm": 4.154002666473389,
"learning_rate": 9.90089218075645e-07,
"loss": 0.4526,
"step": 5220
},
{
"epoch": 0.6297411198073449,
"grad_norm": 4.30592679977417,
"learning_rate": 9.89984859968827e-07,
"loss": 0.4442,
"step": 5230
},
{
"epoch": 0.6309452137266707,
"grad_norm": 5.334674835205078,
"learning_rate": 9.898799608609795e-07,
"loss": 0.4415,
"step": 5240
},
{
"epoch": 0.6321493076459964,
"grad_norm": 4.136261940002441,
"learning_rate": 9.897745208679239e-07,
"loss": 0.4442,
"step": 5250
},
{
"epoch": 0.633353401565322,
"grad_norm": 4.585081577301025,
"learning_rate": 9.896685401060782e-07,
"loss": 0.4565,
"step": 5260
},
{
"epoch": 0.6345574954846478,
"grad_norm": 4.742111682891846,
"learning_rate": 9.895620186924578e-07,
"loss": 0.4393,
"step": 5270
},
{
"epoch": 0.6357615894039735,
"grad_norm": 3.9798941612243652,
"learning_rate": 9.894549567446748e-07,
"loss": 0.4255,
"step": 5280
},
{
"epoch": 0.6369656833232992,
"grad_norm": 4.722369194030762,
"learning_rate": 9.893473543809383e-07,
"loss": 0.4377,
"step": 5290
},
{
"epoch": 0.6381697772426249,
"grad_norm": 4.399467945098877,
"learning_rate": 9.892392117200536e-07,
"loss": 0.4215,
"step": 5300
},
{
"epoch": 0.6393738711619507,
"grad_norm": 4.718751430511475,
"learning_rate": 9.891305288814235e-07,
"loss": 0.4372,
"step": 5310
},
{
"epoch": 0.6405779650812763,
"grad_norm": 4.376132488250732,
"learning_rate": 9.890213059850465e-07,
"loss": 0.4567,
"step": 5320
},
{
"epoch": 0.641782059000602,
"grad_norm": 5.186975955963135,
"learning_rate": 9.889115431515173e-07,
"loss": 0.4414,
"step": 5330
},
{
"epoch": 0.6429861529199278,
"grad_norm": 4.560245037078857,
"learning_rate": 9.888012405020271e-07,
"loss": 0.4329,
"step": 5340
},
{
"epoch": 0.6441902468392534,
"grad_norm": 5.553184986114502,
"learning_rate": 9.886903981583632e-07,
"loss": 0.4472,
"step": 5350
},
{
"epoch": 0.6453943407585792,
"grad_norm": 5.126540660858154,
"learning_rate": 9.885790162429086e-07,
"loss": 0.4577,
"step": 5360
},
{
"epoch": 0.6465984346779049,
"grad_norm": 5.031693935394287,
"learning_rate": 9.884670948786417e-07,
"loss": 0.4608,
"step": 5370
},
{
"epoch": 0.6478025285972305,
"grad_norm": 4.265883445739746,
"learning_rate": 9.883546341891373e-07,
"loss": 0.4335,
"step": 5380
},
{
"epoch": 0.6490066225165563,
"grad_norm": 3.7793495655059814,
"learning_rate": 9.88241634298565e-07,
"loss": 0.4481,
"step": 5390
},
{
"epoch": 0.650210716435882,
"grad_norm": 4.184829235076904,
"learning_rate": 9.881280953316903e-07,
"loss": 0.4351,
"step": 5400
},
{
"epoch": 0.6514148103552077,
"grad_norm": 5.431835174560547,
"learning_rate": 9.880140174138735e-07,
"loss": 0.4739,
"step": 5410
},
{
"epoch": 0.6526189042745334,
"grad_norm": 5.218166828155518,
"learning_rate": 9.878994006710695e-07,
"loss": 0.4547,
"step": 5420
},
{
"epoch": 0.6538229981938591,
"grad_norm": 5.319456100463867,
"learning_rate": 9.877842452298293e-07,
"loss": 0.453,
"step": 5430
},
{
"epoch": 0.6550270921131849,
"grad_norm": 4.373801231384277,
"learning_rate": 9.876685512172979e-07,
"loss": 0.4245,
"step": 5440
},
{
"epoch": 0.6562311860325105,
"grad_norm": 4.274784088134766,
"learning_rate": 9.875523187612153e-07,
"loss": 0.4327,
"step": 5450
},
{
"epoch": 0.6574352799518363,
"grad_norm": 5.235876560211182,
"learning_rate": 9.874355479899157e-07,
"loss": 0.4365,
"step": 5460
},
{
"epoch": 0.658639373871162,
"grad_norm": 4.505414962768555,
"learning_rate": 9.873182390323275e-07,
"loss": 0.4236,
"step": 5470
},
{
"epoch": 0.6598434677904876,
"grad_norm": 5.843977451324463,
"learning_rate": 9.87200392017974e-07,
"loss": 0.4482,
"step": 5480
},
{
"epoch": 0.6610475617098134,
"grad_norm": 4.754218578338623,
"learning_rate": 9.870820070769723e-07,
"loss": 0.4526,
"step": 5490
},
{
"epoch": 0.6622516556291391,
"grad_norm": 4.734755992889404,
"learning_rate": 9.869630843400329e-07,
"loss": 0.4286,
"step": 5500
},
{
"epoch": 0.6634557495484648,
"grad_norm": 4.781942367553711,
"learning_rate": 9.868436239384608e-07,
"loss": 0.4395,
"step": 5510
},
{
"epoch": 0.6646598434677905,
"grad_norm": 4.710615634918213,
"learning_rate": 9.86723626004154e-07,
"loss": 0.4437,
"step": 5520
},
{
"epoch": 0.6658639373871162,
"grad_norm": 3.9797275066375732,
"learning_rate": 9.86603090669605e-07,
"loss": 0.4285,
"step": 5530
},
{
"epoch": 0.6670680313064419,
"grad_norm": 5.289978981018066,
"learning_rate": 9.864820180678984e-07,
"loss": 0.4482,
"step": 5540
},
{
"epoch": 0.6682721252257676,
"grad_norm": 3.6335768699645996,
"learning_rate": 9.86360408332713e-07,
"loss": 0.4578,
"step": 5550
},
{
"epoch": 0.6694762191450934,
"grad_norm": 3.998011589050293,
"learning_rate": 9.862382615983201e-07,
"loss": 0.439,
"step": 5560
},
{
"epoch": 0.670680313064419,
"grad_norm": 4.6308369636535645,
"learning_rate": 9.861155779995843e-07,
"loss": 0.4416,
"step": 5570
},
{
"epoch": 0.6718844069837447,
"grad_norm": 4.869227409362793,
"learning_rate": 9.859923576719623e-07,
"loss": 0.4271,
"step": 5580
},
{
"epoch": 0.6730885009030705,
"grad_norm": 4.426019668579102,
"learning_rate": 9.858686007515043e-07,
"loss": 0.424,
"step": 5590
},
{
"epoch": 0.6742925948223961,
"grad_norm": 4.659002304077148,
"learning_rate": 9.857443073748526e-07,
"loss": 0.4419,
"step": 5600
},
{
"epoch": 0.6754966887417219,
"grad_norm": 3.8600122928619385,
"learning_rate": 9.856194776792412e-07,
"loss": 0.4397,
"step": 5610
},
{
"epoch": 0.6767007826610476,
"grad_norm": 4.6182756423950195,
"learning_rate": 9.854941118024973e-07,
"loss": 0.454,
"step": 5620
},
{
"epoch": 0.6779048765803732,
"grad_norm": 4.149092674255371,
"learning_rate": 9.853682098830392e-07,
"loss": 0.426,
"step": 5630
},
{
"epoch": 0.679108970499699,
"grad_norm": 4.583498954772949,
"learning_rate": 9.852417720598778e-07,
"loss": 0.4226,
"step": 5640
},
{
"epoch": 0.6803130644190247,
"grad_norm": 4.789090633392334,
"learning_rate": 9.851147984726152e-07,
"loss": 0.4506,
"step": 5650
},
{
"epoch": 0.6815171583383504,
"grad_norm": 3.850926160812378,
"learning_rate": 9.849872892614452e-07,
"loss": 0.4149,
"step": 5660
},
{
"epoch": 0.6827212522576761,
"grad_norm": 4.576216697692871,
"learning_rate": 9.848592445671532e-07,
"loss": 0.4364,
"step": 5670
},
{
"epoch": 0.6839253461770018,
"grad_norm": 5.302231311798096,
"learning_rate": 9.847306645311152e-07,
"loss": 0.4529,
"step": 5680
},
{
"epoch": 0.6851294400963275,
"grad_norm": 4.6318864822387695,
"learning_rate": 9.846015492952993e-07,
"loss": 0.4299,
"step": 5690
},
{
"epoch": 0.6863335340156532,
"grad_norm": 4.18743896484375,
"learning_rate": 9.844718990022634e-07,
"loss": 0.4567,
"step": 5700
},
{
"epoch": 0.687537627934979,
"grad_norm": 4.45042610168457,
"learning_rate": 9.84341713795157e-07,
"loss": 0.4461,
"step": 5710
},
{
"epoch": 0.6887417218543046,
"grad_norm": 4.0155415534973145,
"learning_rate": 9.842109938177197e-07,
"loss": 0.4422,
"step": 5720
},
{
"epoch": 0.6899458157736303,
"grad_norm": 4.72194242477417,
"learning_rate": 9.840797392142819e-07,
"loss": 0.4499,
"step": 5730
},
{
"epoch": 0.6911499096929561,
"grad_norm": 4.1018595695495605,
"learning_rate": 9.83947950129764e-07,
"loss": 0.4305,
"step": 5740
},
{
"epoch": 0.6923540036122817,
"grad_norm": 4.466518402099609,
"learning_rate": 9.838156267096772e-07,
"loss": 0.437,
"step": 5750
},
{
"epoch": 0.6935580975316075,
"grad_norm": 4.084195137023926,
"learning_rate": 9.836827691001215e-07,
"loss": 0.4571,
"step": 5760
},
{
"epoch": 0.6947621914509332,
"grad_norm": 4.3810319900512695,
"learning_rate": 9.835493774477876e-07,
"loss": 0.4358,
"step": 5770
},
{
"epoch": 0.6959662853702588,
"grad_norm": 4.7473464012146,
"learning_rate": 9.834154518999558e-07,
"loss": 0.4307,
"step": 5780
},
{
"epoch": 0.6971703792895846,
"grad_norm": 4.240455627441406,
"learning_rate": 9.832809926044953e-07,
"loss": 0.4456,
"step": 5790
},
{
"epoch": 0.6983744732089103,
"grad_norm": 4.3158087730407715,
"learning_rate": 9.831459997098653e-07,
"loss": 0.4268,
"step": 5800
},
{
"epoch": 0.699578567128236,
"grad_norm": 4.3610005378723145,
"learning_rate": 9.83010473365114e-07,
"loss": 0.4334,
"step": 5810
},
{
"epoch": 0.7007826610475617,
"grad_norm": 4.417696952819824,
"learning_rate": 9.828744137198778e-07,
"loss": 0.4451,
"step": 5820
},
{
"epoch": 0.7019867549668874,
"grad_norm": 4.091536998748779,
"learning_rate": 9.827378209243833e-07,
"loss": 0.4277,
"step": 5830
},
{
"epoch": 0.7031908488862131,
"grad_norm": 5.2131028175354,
"learning_rate": 9.826006951294448e-07,
"loss": 0.4353,
"step": 5840
},
{
"epoch": 0.7043949428055388,
"grad_norm": 4.724157810211182,
"learning_rate": 9.824630364864653e-07,
"loss": 0.4379,
"step": 5850
},
{
"epoch": 0.7055990367248646,
"grad_norm": 3.924499034881592,
"learning_rate": 9.82324845147436e-07,
"loss": 0.4341,
"step": 5860
},
{
"epoch": 0.7068031306441902,
"grad_norm": 3.9886951446533203,
"learning_rate": 9.821861212649367e-07,
"loss": 0.4458,
"step": 5870
},
{
"epoch": 0.708007224563516,
"grad_norm": 5.176059246063232,
"learning_rate": 9.820468649921348e-07,
"loss": 0.4277,
"step": 5880
},
{
"epoch": 0.7092113184828417,
"grad_norm": 5.795221328735352,
"learning_rate": 9.819070764827856e-07,
"loss": 0.4608,
"step": 5890
},
{
"epoch": 0.7104154124021673,
"grad_norm": 4.0651702880859375,
"learning_rate": 9.81766755891232e-07,
"loss": 0.4349,
"step": 5900
},
{
"epoch": 0.7116195063214931,
"grad_norm": 4.822697162628174,
"learning_rate": 9.816259033724051e-07,
"loss": 0.4368,
"step": 5910
},
{
"epoch": 0.7128236002408188,
"grad_norm": 3.429680585861206,
"learning_rate": 9.814845190818218e-07,
"loss": 0.4119,
"step": 5920
},
{
"epoch": 0.7140276941601444,
"grad_norm": 4.649044513702393,
"learning_rate": 9.813426031755873e-07,
"loss": 0.431,
"step": 5930
},
{
"epoch": 0.7152317880794702,
"grad_norm": 4.576180458068848,
"learning_rate": 9.812001558103937e-07,
"loss": 0.4478,
"step": 5940
},
{
"epoch": 0.7164358819987959,
"grad_norm": 4.996614933013916,
"learning_rate": 9.810571771435196e-07,
"loss": 0.4013,
"step": 5950
},
{
"epoch": 0.7176399759181216,
"grad_norm": 5.006197929382324,
"learning_rate": 9.809136673328305e-07,
"loss": 0.4275,
"step": 5960
},
{
"epoch": 0.7188440698374473,
"grad_norm": 3.766942024230957,
"learning_rate": 9.807696265367776e-07,
"loss": 0.4377,
"step": 5970
},
{
"epoch": 0.720048163756773,
"grad_norm": 4.086816787719727,
"learning_rate": 9.806250549143992e-07,
"loss": 0.4384,
"step": 5980
},
{
"epoch": 0.7212522576760987,
"grad_norm": 5.5871734619140625,
"learning_rate": 9.804799526253196e-07,
"loss": 0.4511,
"step": 5990
},
{
"epoch": 0.7224563515954244,
"grad_norm": 4.023412704467773,
"learning_rate": 9.803343198297484e-07,
"loss": 0.4446,
"step": 6000
},
{
"epoch": 0.7236604455147502,
"grad_norm": 4.708857536315918,
"learning_rate": 9.80188156688482e-07,
"loss": 0.4395,
"step": 6010
},
{
"epoch": 0.7248645394340758,
"grad_norm": 3.879977226257324,
"learning_rate": 9.80041463362901e-07,
"loss": 0.4434,
"step": 6020
},
{
"epoch": 0.7260686333534015,
"grad_norm": 4.743607997894287,
"learning_rate": 9.798942400149726e-07,
"loss": 0.4365,
"step": 6030
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.6438701152801514,
"learning_rate": 9.797464868072486e-07,
"loss": 0.447,
"step": 6040
},
{
"epoch": 0.7284768211920529,
"grad_norm": 4.472813129425049,
"learning_rate": 9.79598203902866e-07,
"loss": 0.443,
"step": 6050
},
{
"epoch": 0.7296809151113787,
"grad_norm": 5.6175312995910645,
"learning_rate": 9.794493914655467e-07,
"loss": 0.4207,
"step": 6060
},
{
"epoch": 0.7308850090307044,
"grad_norm": 4.9606404304504395,
"learning_rate": 9.793000496595966e-07,
"loss": 0.4279,
"step": 6070
},
{
"epoch": 0.7320891029500302,
"grad_norm": 4.130514144897461,
"learning_rate": 9.791501786499074e-07,
"loss": 0.4183,
"step": 6080
},
{
"epoch": 0.7332931968693558,
"grad_norm": 2.9547371864318848,
"learning_rate": 9.78999778601954e-07,
"loss": 0.4038,
"step": 6090
},
{
"epoch": 0.7344972907886815,
"grad_norm": 4.06984281539917,
"learning_rate": 9.788488496817958e-07,
"loss": 0.4333,
"step": 6100
},
{
"epoch": 0.7357013847080073,
"grad_norm": 3.900606870651245,
"learning_rate": 9.78697392056076e-07,
"loss": 0.418,
"step": 6110
},
{
"epoch": 0.7369054786273329,
"grad_norm": 4.396324157714844,
"learning_rate": 9.78545405892022e-07,
"loss": 0.435,
"step": 6120
},
{
"epoch": 0.7381095725466587,
"grad_norm": 4.068949222564697,
"learning_rate": 9.78392891357444e-07,
"loss": 0.4138,
"step": 6130
},
{
"epoch": 0.7393136664659844,
"grad_norm": 4.090792655944824,
"learning_rate": 9.782398486207364e-07,
"loss": 0.4106,
"step": 6140
},
{
"epoch": 0.74051776038531,
"grad_norm": 5.222830295562744,
"learning_rate": 9.780862778508762e-07,
"loss": 0.4534,
"step": 6150
},
{
"epoch": 0.7417218543046358,
"grad_norm": 3.9300661087036133,
"learning_rate": 9.779321792174238e-07,
"loss": 0.4436,
"step": 6160
},
{
"epoch": 0.7429259482239615,
"grad_norm": 4.139192581176758,
"learning_rate": 9.77777552890522e-07,
"loss": 0.4384,
"step": 6170
},
{
"epoch": 0.7441300421432872,
"grad_norm": 4.677849292755127,
"learning_rate": 9.776223990408969e-07,
"loss": 0.4338,
"step": 6180
},
{
"epoch": 0.7453341360626129,
"grad_norm": 4.7174391746521,
"learning_rate": 9.77466717839856e-07,
"loss": 0.4265,
"step": 6190
},
{
"epoch": 0.7465382299819386,
"grad_norm": 4.314562797546387,
"learning_rate": 9.773105094592903e-07,
"loss": 0.4389,
"step": 6200
},
{
"epoch": 0.7477423239012643,
"grad_norm": 4.679368495941162,
"learning_rate": 9.77153774071672e-07,
"loss": 0.4177,
"step": 6210
},
{
"epoch": 0.74894641782059,
"grad_norm": 4.037609577178955,
"learning_rate": 9.769965118500554e-07,
"loss": 0.4376,
"step": 6220
},
{
"epoch": 0.7501505117399158,
"grad_norm": 4.8901448249816895,
"learning_rate": 9.768387229680765e-07,
"loss": 0.4597,
"step": 6230
},
{
"epoch": 0.7513546056592414,
"grad_norm": 4.4093122482299805,
"learning_rate": 9.76680407599953e-07,
"loss": 0.4332,
"step": 6240
},
{
"epoch": 0.7525586995785671,
"grad_norm": 4.720508575439453,
"learning_rate": 9.765215659204837e-07,
"loss": 0.4579,
"step": 6250
},
{
"epoch": 0.7537627934978929,
"grad_norm": 4.316104412078857,
"learning_rate": 9.763621981050486e-07,
"loss": 0.4499,
"step": 6260
},
{
"epoch": 0.7549668874172185,
"grad_norm": 4.805814743041992,
"learning_rate": 9.762023043296082e-07,
"loss": 0.4229,
"step": 6270
},
{
"epoch": 0.7561709813365443,
"grad_norm": 4.259012699127197,
"learning_rate": 9.760418847707042e-07,
"loss": 0.4307,
"step": 6280
},
{
"epoch": 0.75737507525587,
"grad_norm": 4.74151086807251,
"learning_rate": 9.75880939605459e-07,
"loss": 0.4039,
"step": 6290
},
{
"epoch": 0.7585791691751956,
"grad_norm": 4.7510294914245605,
"learning_rate": 9.757194690115747e-07,
"loss": 0.4302,
"step": 6300
},
{
"epoch": 0.7597832630945214,
"grad_norm": 5.057920455932617,
"learning_rate": 9.75557473167334e-07,
"loss": 0.4196,
"step": 6310
},
{
"epoch": 0.7609873570138471,
"grad_norm": 4.428061485290527,
"learning_rate": 9.753949522515992e-07,
"loss": 0.4271,
"step": 6320
},
{
"epoch": 0.7621914509331728,
"grad_norm": 4.023929595947266,
"learning_rate": 9.75231906443813e-07,
"loss": 0.4125,
"step": 6330
},
{
"epoch": 0.7633955448524985,
"grad_norm": 4.456701755523682,
"learning_rate": 9.75068335923997e-07,
"loss": 0.4177,
"step": 6340
},
{
"epoch": 0.7645996387718242,
"grad_norm": 4.046926975250244,
"learning_rate": 9.749042408727517e-07,
"loss": 0.4172,
"step": 6350
},
{
"epoch": 0.7658037326911499,
"grad_norm": 4.5811944007873535,
"learning_rate": 9.747396214712584e-07,
"loss": 0.4165,
"step": 6360
},
{
"epoch": 0.7670078266104756,
"grad_norm": 3.6832375526428223,
"learning_rate": 9.745744779012757e-07,
"loss": 0.4183,
"step": 6370
},
{
"epoch": 0.7682119205298014,
"grad_norm": 4.535373210906982,
"learning_rate": 9.744088103451417e-07,
"loss": 0.4205,
"step": 6380
},
{
"epoch": 0.769416014449127,
"grad_norm": 4.3140363693237305,
"learning_rate": 9.742426189857729e-07,
"loss": 0.4414,
"step": 6390
},
{
"epoch": 0.7706201083684527,
"grad_norm": 4.968809604644775,
"learning_rate": 9.74075904006664e-07,
"loss": 0.4421,
"step": 6400
},
{
"epoch": 0.7718242022877785,
"grad_norm": 4.488393783569336,
"learning_rate": 9.739086655918883e-07,
"loss": 0.441,
"step": 6410
},
{
"epoch": 0.7730282962071041,
"grad_norm": 4.255595684051514,
"learning_rate": 9.737409039260966e-07,
"loss": 0.4211,
"step": 6420
},
{
"epoch": 0.7742323901264299,
"grad_norm": 4.285024642944336,
"learning_rate": 9.735726191945175e-07,
"loss": 0.42,
"step": 6430
},
{
"epoch": 0.7754364840457556,
"grad_norm": 4.8813347816467285,
"learning_rate": 9.734038115829571e-07,
"loss": 0.433,
"step": 6440
},
{
"epoch": 0.7766405779650812,
"grad_norm": 3.9893128871917725,
"learning_rate": 9.732344812777987e-07,
"loss": 0.3902,
"step": 6450
},
{
"epoch": 0.777844671884407,
"grad_norm": 4.2948784828186035,
"learning_rate": 9.730646284660035e-07,
"loss": 0.4094,
"step": 6460
},
{
"epoch": 0.7790487658037327,
"grad_norm": 4.328617572784424,
"learning_rate": 9.728942533351087e-07,
"loss": 0.4412,
"step": 6470
},
{
"epoch": 0.7802528597230584,
"grad_norm": 4.67041015625,
"learning_rate": 9.727233560732286e-07,
"loss": 0.4157,
"step": 6480
},
{
"epoch": 0.7814569536423841,
"grad_norm": 4.249061584472656,
"learning_rate": 9.725519368690538e-07,
"loss": 0.4398,
"step": 6490
},
{
"epoch": 0.7826610475617098,
"grad_norm": 5.444673538208008,
"learning_rate": 9.723799959118513e-07,
"loss": 0.4299,
"step": 6500
},
{
"epoch": 0.7838651414810355,
"grad_norm": 4.813880920410156,
"learning_rate": 9.722075333914642e-07,
"loss": 0.4483,
"step": 6510
},
{
"epoch": 0.7850692354003612,
"grad_norm": 3.9406328201293945,
"learning_rate": 9.720345494983116e-07,
"loss": 0.4101,
"step": 6520
},
{
"epoch": 0.786273329319687,
"grad_norm": 5.169934272766113,
"learning_rate": 9.718610444233878e-07,
"loss": 0.4284,
"step": 6530
},
{
"epoch": 0.7874774232390126,
"grad_norm": 4.304941177368164,
"learning_rate": 9.71687018358263e-07,
"loss": 0.4232,
"step": 6540
},
{
"epoch": 0.7886815171583383,
"grad_norm": 4.452000141143799,
"learning_rate": 9.715124714950827e-07,
"loss": 0.4506,
"step": 6550
},
{
"epoch": 0.7898856110776641,
"grad_norm": 3.7503676414489746,
"learning_rate": 9.713374040265668e-07,
"loss": 0.4246,
"step": 6560
},
{
"epoch": 0.7910897049969897,
"grad_norm": 4.534003257751465,
"learning_rate": 9.71161816146011e-07,
"loss": 0.4247,
"step": 6570
},
{
"epoch": 0.7922937989163155,
"grad_norm": 5.637129306793213,
"learning_rate": 9.709857080472845e-07,
"loss": 0.4419,
"step": 6580
},
{
"epoch": 0.7934978928356412,
"grad_norm": 3.844273805618286,
"learning_rate": 9.708090799248313e-07,
"loss": 0.4042,
"step": 6590
},
{
"epoch": 0.7947019867549668,
"grad_norm": 4.556625843048096,
"learning_rate": 9.706319319736703e-07,
"loss": 0.4384,
"step": 6600
},
{
"epoch": 0.7959060806742926,
"grad_norm": 4.6486053466796875,
"learning_rate": 9.70454264389393e-07,
"loss": 0.4091,
"step": 6610
},
{
"epoch": 0.7971101745936183,
"grad_norm": 4.751596927642822,
"learning_rate": 9.702760773681658e-07,
"loss": 0.428,
"step": 6620
},
{
"epoch": 0.798314268512944,
"grad_norm": 4.64603328704834,
"learning_rate": 9.700973711067282e-07,
"loss": 0.4376,
"step": 6630
},
{
"epoch": 0.7995183624322697,
"grad_norm": 4.823798656463623,
"learning_rate": 9.699181458023927e-07,
"loss": 0.4057,
"step": 6640
},
{
"epoch": 0.8007224563515954,
"grad_norm": 5.07472562789917,
"learning_rate": 9.697384016530451e-07,
"loss": 0.4103,
"step": 6650
},
{
"epoch": 0.8019265502709211,
"grad_norm": 5.586597442626953,
"learning_rate": 9.695581388571444e-07,
"loss": 0.4401,
"step": 6660
},
{
"epoch": 0.8031306441902468,
"grad_norm": 5.10539436340332,
"learning_rate": 9.693773576137219e-07,
"loss": 0.4298,
"step": 6670
},
{
"epoch": 0.8043347381095726,
"grad_norm": 5.036708354949951,
"learning_rate": 9.691960581223815e-07,
"loss": 0.4299,
"step": 6680
},
{
"epoch": 0.8055388320288982,
"grad_norm": 4.794188499450684,
"learning_rate": 9.690142405832988e-07,
"loss": 0.4296,
"step": 6690
},
{
"epoch": 0.8067429259482239,
"grad_norm": 4.483447074890137,
"learning_rate": 9.688319051972223e-07,
"loss": 0.4063,
"step": 6700
},
{
"epoch": 0.8079470198675497,
"grad_norm": 4.88456916809082,
"learning_rate": 9.686490521654713e-07,
"loss": 0.4548,
"step": 6710
},
{
"epoch": 0.8091511137868754,
"grad_norm": 4.166242599487305,
"learning_rate": 9.684656816899374e-07,
"loss": 0.4344,
"step": 6720
},
{
"epoch": 0.8103552077062011,
"grad_norm": 4.282528877258301,
"learning_rate": 9.682817939730831e-07,
"loss": 0.4143,
"step": 6730
},
{
"epoch": 0.8115593016255268,
"grad_norm": 4.342618942260742,
"learning_rate": 9.680973892179423e-07,
"loss": 0.4224,
"step": 6740
},
{
"epoch": 0.8127633955448526,
"grad_norm": 4.768647193908691,
"learning_rate": 9.679124676281195e-07,
"loss": 0.4251,
"step": 6750
},
{
"epoch": 0.8139674894641782,
"grad_norm": 4.024239540100098,
"learning_rate": 9.677270294077896e-07,
"loss": 0.4415,
"step": 6760
},
{
"epoch": 0.8151715833835039,
"grad_norm": 3.9242262840270996,
"learning_rate": 9.675410747616984e-07,
"loss": 0.4475,
"step": 6770
},
{
"epoch": 0.8163756773028297,
"grad_norm": 4.580953121185303,
"learning_rate": 9.67354603895162e-07,
"loss": 0.4067,
"step": 6780
},
{
"epoch": 0.8175797712221553,
"grad_norm": 4.859120845794678,
"learning_rate": 9.67167617014066e-07,
"loss": 0.4311,
"step": 6790
},
{
"epoch": 0.818783865141481,
"grad_norm": 4.1437835693359375,
"learning_rate": 9.66980114324866e-07,
"loss": 0.4135,
"step": 6800
},
{
"epoch": 0.8199879590608068,
"grad_norm": 4.027251243591309,
"learning_rate": 9.667920960345872e-07,
"loss": 0.4021,
"step": 6810
},
{
"epoch": 0.8211920529801324,
"grad_norm": 4.283502101898193,
"learning_rate": 9.666035623508237e-07,
"loss": 0.4207,
"step": 6820
},
{
"epoch": 0.8223961468994582,
"grad_norm": 4.910589694976807,
"learning_rate": 9.66414513481739e-07,
"loss": 0.4474,
"step": 6830
},
{
"epoch": 0.8236002408187839,
"grad_norm": 5.238614559173584,
"learning_rate": 9.662249496360653e-07,
"loss": 0.4294,
"step": 6840
},
{
"epoch": 0.8248043347381095,
"grad_norm": 4.113722801208496,
"learning_rate": 9.660348710231036e-07,
"loss": 0.4145,
"step": 6850
},
{
"epoch": 0.8260084286574353,
"grad_norm": 4.979987144470215,
"learning_rate": 9.65844277852723e-07,
"loss": 0.421,
"step": 6860
},
{
"epoch": 0.827212522576761,
"grad_norm": 5.396749973297119,
"learning_rate": 9.656531703353608e-07,
"loss": 0.4444,
"step": 6870
},
{
"epoch": 0.8284166164960867,
"grad_norm": 4.567556858062744,
"learning_rate": 9.654615486820222e-07,
"loss": 0.4198,
"step": 6880
},
{
"epoch": 0.8296207104154124,
"grad_norm": 5.2882304191589355,
"learning_rate": 9.6526941310428e-07,
"loss": 0.4274,
"step": 6890
},
{
"epoch": 0.8308248043347382,
"grad_norm": 4.51816987991333,
"learning_rate": 9.650767638142746e-07,
"loss": 0.4465,
"step": 6900
},
{
"epoch": 0.8320288982540638,
"grad_norm": 3.9410834312438965,
"learning_rate": 9.648836010247137e-07,
"loss": 0.4182,
"step": 6910
},
{
"epoch": 0.8332329921733895,
"grad_norm": 4.620553493499756,
"learning_rate": 9.646899249488714e-07,
"loss": 0.4206,
"step": 6920
},
{
"epoch": 0.8344370860927153,
"grad_norm": 4.430214881896973,
"learning_rate": 9.644957358005892e-07,
"loss": 0.4313,
"step": 6930
},
{
"epoch": 0.8356411800120409,
"grad_norm": 4.277939796447754,
"learning_rate": 9.643010337942747e-07,
"loss": 0.4313,
"step": 6940
},
{
"epoch": 0.8368452739313667,
"grad_norm": 5.185015678405762,
"learning_rate": 9.64105819144902e-07,
"loss": 0.4225,
"step": 6950
},
{
"epoch": 0.8380493678506924,
"grad_norm": 4.402646541595459,
"learning_rate": 9.63910092068011e-07,
"loss": 0.417,
"step": 6960
},
{
"epoch": 0.839253461770018,
"grad_norm": 3.664020538330078,
"learning_rate": 9.637138527797074e-07,
"loss": 0.4337,
"step": 6970
},
{
"epoch": 0.8404575556893438,
"grad_norm": 4.9388041496276855,
"learning_rate": 9.635171014966625e-07,
"loss": 0.412,
"step": 6980
},
{
"epoch": 0.8416616496086695,
"grad_norm": 4.200076103210449,
"learning_rate": 9.63319838436113e-07,
"loss": 0.4212,
"step": 6990
},
{
"epoch": 0.8428657435279951,
"grad_norm": 4.56259822845459,
"learning_rate": 9.631220638158605e-07,
"loss": 0.4316,
"step": 7000
},
{
"epoch": 0.8440698374473209,
"grad_norm": 3.910545587539673,
"learning_rate": 9.629237778542714e-07,
"loss": 0.4,
"step": 7010
},
{
"epoch": 0.8452739313666466,
"grad_norm": 4.639405250549316,
"learning_rate": 9.62724980770277e-07,
"loss": 0.4084,
"step": 7020
},
{
"epoch": 0.8464780252859723,
"grad_norm": 4.84975528717041,
"learning_rate": 9.625256727833725e-07,
"loss": 0.4331,
"step": 7030
},
{
"epoch": 0.847682119205298,
"grad_norm": 3.9190306663513184,
"learning_rate": 9.623258541136175e-07,
"loss": 0.4171,
"step": 7040
},
{
"epoch": 0.8488862131246238,
"grad_norm": 4.248600482940674,
"learning_rate": 9.621255249816353e-07,
"loss": 0.4255,
"step": 7050
},
{
"epoch": 0.8500903070439494,
"grad_norm": 4.055094242095947,
"learning_rate": 9.61924685608613e-07,
"loss": 0.4257,
"step": 7060
},
{
"epoch": 0.8512944009632751,
"grad_norm": 4.14054536819458,
"learning_rate": 9.617233362163007e-07,
"loss": 0.4046,
"step": 7070
},
{
"epoch": 0.8524984948826009,
"grad_norm": 5.480048179626465,
"learning_rate": 9.61521477027012e-07,
"loss": 0.4007,
"step": 7080
},
{
"epoch": 0.8537025888019265,
"grad_norm": 4.100722312927246,
"learning_rate": 9.613191082636232e-07,
"loss": 0.4148,
"step": 7090
},
{
"epoch": 0.8549066827212523,
"grad_norm": 3.739861011505127,
"learning_rate": 9.611162301495735e-07,
"loss": 0.4156,
"step": 7100
},
{
"epoch": 0.856110776640578,
"grad_norm": 4.769533634185791,
"learning_rate": 9.60912842908864e-07,
"loss": 0.4356,
"step": 7110
},
{
"epoch": 0.8573148705599036,
"grad_norm": 4.347903728485107,
"learning_rate": 9.60708946766058e-07,
"loss": 0.4509,
"step": 7120
},
{
"epoch": 0.8585189644792294,
"grad_norm": 4.265124797821045,
"learning_rate": 9.605045419462813e-07,
"loss": 0.4231,
"step": 7130
},
{
"epoch": 0.8597230583985551,
"grad_norm": 5.108783721923828,
"learning_rate": 9.602996286752206e-07,
"loss": 0.4363,
"step": 7140
},
{
"epoch": 0.8609271523178808,
"grad_norm": 5.001750946044922,
"learning_rate": 9.600942071791248e-07,
"loss": 0.4223,
"step": 7150
},
{
"epoch": 0.8621312462372065,
"grad_norm": 4.6718668937683105,
"learning_rate": 9.598882776848025e-07,
"loss": 0.4206,
"step": 7160
},
{
"epoch": 0.8633353401565322,
"grad_norm": 4.35657262802124,
"learning_rate": 9.596818404196249e-07,
"loss": 0.4136,
"step": 7170
},
{
"epoch": 0.8645394340758579,
"grad_norm": 4.119489669799805,
"learning_rate": 9.59474895611523e-07,
"loss": 0.4254,
"step": 7180
},
{
"epoch": 0.8657435279951836,
"grad_norm": 4.4842047691345215,
"learning_rate": 9.59267443488988e-07,
"loss": 0.4279,
"step": 7190
},
{
"epoch": 0.8669476219145094,
"grad_norm": 4.105453014373779,
"learning_rate": 9.590594842810714e-07,
"loss": 0.4031,
"step": 7200
},
{
"epoch": 0.868151715833835,
"grad_norm": 4.400485992431641,
"learning_rate": 9.58851018217385e-07,
"loss": 0.4098,
"step": 7210
},
{
"epoch": 0.8693558097531607,
"grad_norm": 4.673033714294434,
"learning_rate": 9.586420455280998e-07,
"loss": 0.4299,
"step": 7220
},
{
"epoch": 0.8705599036724865,
"grad_norm": 4.483117580413818,
"learning_rate": 9.584325664439463e-07,
"loss": 0.438,
"step": 7230
},
{
"epoch": 0.8717639975918121,
"grad_norm": 5.068016052246094,
"learning_rate": 9.58222581196214e-07,
"loss": 0.4162,
"step": 7240
},
{
"epoch": 0.8729680915111379,
"grad_norm": 4.488113880157471,
"learning_rate": 9.580120900167513e-07,
"loss": 0.4196,
"step": 7250
},
{
"epoch": 0.8741721854304636,
"grad_norm": 4.887204647064209,
"learning_rate": 9.578010931379654e-07,
"loss": 0.439,
"step": 7260
},
{
"epoch": 0.8753762793497892,
"grad_norm": 4.7396159172058105,
"learning_rate": 9.575895907928217e-07,
"loss": 0.4202,
"step": 7270
},
{
"epoch": 0.876580373269115,
"grad_norm": 4.224496364593506,
"learning_rate": 9.573775832148438e-07,
"loss": 0.4027,
"step": 7280
},
{
"epoch": 0.8777844671884407,
"grad_norm": 5.062420845031738,
"learning_rate": 9.57165070638113e-07,
"loss": 0.4123,
"step": 7290
},
{
"epoch": 0.8789885611077664,
"grad_norm": 3.75753116607666,
"learning_rate": 9.569520532972678e-07,
"loss": 0.4066,
"step": 7300
},
{
"epoch": 0.8801926550270921,
"grad_norm": 4.535136699676514,
"learning_rate": 9.567385314275054e-07,
"loss": 0.4067,
"step": 7310
},
{
"epoch": 0.8813967489464178,
"grad_norm": 4.068704128265381,
"learning_rate": 9.56524505264578e-07,
"loss": 0.4238,
"step": 7320
},
{
"epoch": 0.8826008428657435,
"grad_norm": 5.032285690307617,
"learning_rate": 9.563099750447965e-07,
"loss": 0.4392,
"step": 7330
},
{
"epoch": 0.8838049367850692,
"grad_norm": 4.432474136352539,
"learning_rate": 9.560949410050274e-07,
"loss": 0.4394,
"step": 7340
},
{
"epoch": 0.885009030704395,
"grad_norm": 3.7745227813720703,
"learning_rate": 9.558794033826933e-07,
"loss": 0.4228,
"step": 7350
},
{
"epoch": 0.8862131246237207,
"grad_norm": 4.947648525238037,
"learning_rate": 9.556633624157734e-07,
"loss": 0.4324,
"step": 7360
},
{
"epoch": 0.8874172185430463,
"grad_norm": 3.695946216583252,
"learning_rate": 9.554468183428025e-07,
"loss": 0.407,
"step": 7370
},
{
"epoch": 0.8886213124623721,
"grad_norm": 4.399337291717529,
"learning_rate": 9.552297714028703e-07,
"loss": 0.4313,
"step": 7380
},
{
"epoch": 0.8898254063816978,
"grad_norm": 4.042302131652832,
"learning_rate": 9.550122218356227e-07,
"loss": 0.4183,
"step": 7390
},
{
"epoch": 0.8910295003010235,
"grad_norm": 4.341307163238525,
"learning_rate": 9.5479416988126e-07,
"loss": 0.4335,
"step": 7400
},
{
"epoch": 0.8922335942203492,
"grad_norm": 3.7946054935455322,
"learning_rate": 9.545756157805367e-07,
"loss": 0.4123,
"step": 7410
},
{
"epoch": 0.893437688139675,
"grad_norm": 5.04152250289917,
"learning_rate": 9.543565597747632e-07,
"loss": 0.4139,
"step": 7420
},
{
"epoch": 0.8946417820590006,
"grad_norm": 3.8958561420440674,
"learning_rate": 9.541370021058023e-07,
"loss": 0.4084,
"step": 7430
},
{
"epoch": 0.8958458759783263,
"grad_norm": 3.7490954399108887,
"learning_rate": 9.53916943016072e-07,
"loss": 0.4048,
"step": 7440
},
{
"epoch": 0.8970499698976521,
"grad_norm": 4.4821858406066895,
"learning_rate": 9.536963827485434e-07,
"loss": 0.3984,
"step": 7450
},
{
"epoch": 0.8982540638169777,
"grad_norm": 4.666491985321045,
"learning_rate": 9.53475321546741e-07,
"loss": 0.4098,
"step": 7460
},
{
"epoch": 0.8994581577363034,
"grad_norm": 4.890908718109131,
"learning_rate": 9.532537596547423e-07,
"loss": 0.3982,
"step": 7470
},
{
"epoch": 0.9006622516556292,
"grad_norm": 4.651495933532715,
"learning_rate": 9.53031697317178e-07,
"loss": 0.418,
"step": 7480
},
{
"epoch": 0.9018663455749548,
"grad_norm": 4.55120849609375,
"learning_rate": 9.528091347792308e-07,
"loss": 0.4187,
"step": 7490
},
{
"epoch": 0.9030704394942806,
"grad_norm": 5.57934045791626,
"learning_rate": 9.525860722866362e-07,
"loss": 0.4156,
"step": 7500
},
{
"epoch": 0.9042745334136063,
"grad_norm": 3.860431432723999,
"learning_rate": 9.523625100856813e-07,
"loss": 0.4078,
"step": 7510
},
{
"epoch": 0.9054786273329319,
"grad_norm": 4.670098781585693,
"learning_rate": 9.521384484232054e-07,
"loss": 0.4088,
"step": 7520
},
{
"epoch": 0.9066827212522577,
"grad_norm": 4.332681655883789,
"learning_rate": 9.519138875465986e-07,
"loss": 0.422,
"step": 7530
},
{
"epoch": 0.9078868151715834,
"grad_norm": 4.745145797729492,
"learning_rate": 9.516888277038029e-07,
"loss": 0.409,
"step": 7540
},
{
"epoch": 0.9090909090909091,
"grad_norm": 4.109555721282959,
"learning_rate": 9.514632691433106e-07,
"loss": 0.4177,
"step": 7550
},
{
"epoch": 0.9102950030102348,
"grad_norm": 5.039947032928467,
"learning_rate": 9.512372121141652e-07,
"loss": 0.4132,
"step": 7560
},
{
"epoch": 0.9114990969295605,
"grad_norm": 4.389688968658447,
"learning_rate": 9.510106568659599e-07,
"loss": 0.4176,
"step": 7570
},
{
"epoch": 0.9127031908488862,
"grad_norm": 4.67106819152832,
"learning_rate": 9.50783603648839e-07,
"loss": 0.4441,
"step": 7580
},
{
"epoch": 0.9139072847682119,
"grad_norm": 3.6345438957214355,
"learning_rate": 9.505560527134956e-07,
"loss": 0.395,
"step": 7590
},
{
"epoch": 0.9151113786875377,
"grad_norm": 4.544852256774902,
"learning_rate": 9.503280043111728e-07,
"loss": 0.4291,
"step": 7600
},
{
"epoch": 0.9163154726068633,
"grad_norm": 5.17853307723999,
"learning_rate": 9.50099458693663e-07,
"loss": 0.42,
"step": 7610
},
{
"epoch": 0.917519566526189,
"grad_norm": 4.111993789672852,
"learning_rate": 9.498704161133073e-07,
"loss": 0.4086,
"step": 7620
},
{
"epoch": 0.9187236604455148,
"grad_norm": 3.93930721282959,
"learning_rate": 9.49640876822996e-07,
"loss": 0.4128,
"step": 7630
},
{
"epoch": 0.9199277543648404,
"grad_norm": 4.442197322845459,
"learning_rate": 9.494108410761672e-07,
"loss": 0.4107,
"step": 7640
},
{
"epoch": 0.9211318482841662,
"grad_norm": 4.266764163970947,
"learning_rate": 9.491803091268077e-07,
"loss": 0.4093,
"step": 7650
},
{
"epoch": 0.9223359422034919,
"grad_norm": 4.633232593536377,
"learning_rate": 9.48949281229452e-07,
"loss": 0.4152,
"step": 7660
},
{
"epoch": 0.9235400361228175,
"grad_norm": 4.4745073318481445,
"learning_rate": 9.487177576391818e-07,
"loss": 0.4423,
"step": 7670
},
{
"epoch": 0.9247441300421433,
"grad_norm": 3.795365333557129,
"learning_rate": 9.484857386116268e-07,
"loss": 0.4013,
"step": 7680
},
{
"epoch": 0.925948223961469,
"grad_norm": 4.76974630355835,
"learning_rate": 9.48253224402963e-07,
"loss": 0.4084,
"step": 7690
},
{
"epoch": 0.9271523178807947,
"grad_norm": 4.584947109222412,
"learning_rate": 9.48020215269914e-07,
"loss": 0.4237,
"step": 7700
},
{
"epoch": 0.9283564118001204,
"grad_norm": 4.877064228057861,
"learning_rate": 9.477867114697486e-07,
"loss": 0.409,
"step": 7710
},
{
"epoch": 0.9295605057194462,
"grad_norm": 4.372793674468994,
"learning_rate": 9.475527132602832e-07,
"loss": 0.4142,
"step": 7720
},
{
"epoch": 0.9307645996387718,
"grad_norm": 4.198723316192627,
"learning_rate": 9.473182208998792e-07,
"loss": 0.4057,
"step": 7730
},
{
"epoch": 0.9319686935580975,
"grad_norm": 4.460008144378662,
"learning_rate": 9.470832346474435e-07,
"loss": 0.4235,
"step": 7740
},
{
"epoch": 0.9331727874774233,
"grad_norm": 4.3058905601501465,
"learning_rate": 9.468477547624289e-07,
"loss": 0.4307,
"step": 7750
},
{
"epoch": 0.9343768813967489,
"grad_norm": 4.6467132568359375,
"learning_rate": 9.466117815048329e-07,
"loss": 0.4127,
"step": 7760
},
{
"epoch": 0.9355809753160746,
"grad_norm": 5.1491217613220215,
"learning_rate": 9.463753151351978e-07,
"loss": 0.4181,
"step": 7770
},
{
"epoch": 0.9367850692354004,
"grad_norm": 5.166205883026123,
"learning_rate": 9.461383559146102e-07,
"loss": 0.4102,
"step": 7780
},
{
"epoch": 0.937989163154726,
"grad_norm": 4.453047275543213,
"learning_rate": 9.459009041047012e-07,
"loss": 0.4135,
"step": 7790
},
{
"epoch": 0.9391932570740518,
"grad_norm": 5.151276111602783,
"learning_rate": 9.456629599676456e-07,
"loss": 0.4072,
"step": 7800
},
{
"epoch": 0.9403973509933775,
"grad_norm": 3.8332607746124268,
"learning_rate": 9.454245237661615e-07,
"loss": 0.4363,
"step": 7810
},
{
"epoch": 0.9416014449127031,
"grad_norm": 4.51285982131958,
"learning_rate": 9.451855957635108e-07,
"loss": 0.4265,
"step": 7820
},
{
"epoch": 0.9428055388320289,
"grad_norm": 4.756032466888428,
"learning_rate": 9.449461762234981e-07,
"loss": 0.4322,
"step": 7830
},
{
"epoch": 0.9440096327513546,
"grad_norm": 3.7539730072021484,
"learning_rate": 9.447062654104707e-07,
"loss": 0.4052,
"step": 7840
},
{
"epoch": 0.9452137266706803,
"grad_norm": 4.208081245422363,
"learning_rate": 9.444658635893186e-07,
"loss": 0.4101,
"step": 7850
},
{
"epoch": 0.946417820590006,
"grad_norm": 3.338568925857544,
"learning_rate": 9.442249710254737e-07,
"loss": 0.4195,
"step": 7860
},
{
"epoch": 0.9476219145093318,
"grad_norm": 4.421904563903809,
"learning_rate": 9.439835879849096e-07,
"loss": 0.4232,
"step": 7870
},
{
"epoch": 0.9488260084286574,
"grad_norm": 4.675938129425049,
"learning_rate": 9.437417147341417e-07,
"loss": 0.4171,
"step": 7880
},
{
"epoch": 0.9500301023479831,
"grad_norm": 5.047989845275879,
"learning_rate": 9.434993515402267e-07,
"loss": 0.4083,
"step": 7890
},
{
"epoch": 0.9512341962673089,
"grad_norm": 4.1763916015625,
"learning_rate": 9.432564986707621e-07,
"loss": 0.3946,
"step": 7900
},
{
"epoch": 0.9524382901866345,
"grad_norm": 4.706401348114014,
"learning_rate": 9.43013156393886e-07,
"loss": 0.4147,
"step": 7910
},
{
"epoch": 0.9536423841059603,
"grad_norm": 4.3355255126953125,
"learning_rate": 9.427693249782769e-07,
"loss": 0.4244,
"step": 7920
},
{
"epoch": 0.954846478025286,
"grad_norm": 5.126685619354248,
"learning_rate": 9.425250046931537e-07,
"loss": 0.4148,
"step": 7930
},
{
"epoch": 0.9560505719446116,
"grad_norm": 3.4599716663360596,
"learning_rate": 9.422801958082744e-07,
"loss": 0.4237,
"step": 7940
},
{
"epoch": 0.9572546658639374,
"grad_norm": 4.331906795501709,
"learning_rate": 9.420348985939371e-07,
"loss": 0.4097,
"step": 7950
},
{
"epoch": 0.9584587597832631,
"grad_norm": 4.4911370277404785,
"learning_rate": 9.417891133209787e-07,
"loss": 0.4029,
"step": 7960
},
{
"epoch": 0.9596628537025887,
"grad_norm": 4.601186275482178,
"learning_rate": 9.415428402607754e-07,
"loss": 0.4194,
"step": 7970
},
{
"epoch": 0.9608669476219145,
"grad_norm": 4.048129558563232,
"learning_rate": 9.412960796852412e-07,
"loss": 0.4205,
"step": 7980
},
{
"epoch": 0.9620710415412402,
"grad_norm": 4.8655571937561035,
"learning_rate": 9.410488318668292e-07,
"loss": 0.4229,
"step": 7990
},
{
"epoch": 0.963275135460566,
"grad_norm": 3.7495744228363037,
"learning_rate": 9.408010970785302e-07,
"loss": 0.3761,
"step": 8000
},
{
"epoch": 0.9644792293798916,
"grad_norm": 5.3356499671936035,
"learning_rate": 9.405528755938725e-07,
"loss": 0.4093,
"step": 8010
},
{
"epoch": 0.9656833232992174,
"grad_norm": 5.407442569732666,
"learning_rate": 9.403041676869217e-07,
"loss": 0.4066,
"step": 8020
},
{
"epoch": 0.9668874172185431,
"grad_norm": 3.860828161239624,
"learning_rate": 9.400549736322807e-07,
"loss": 0.3982,
"step": 8030
},
{
"epoch": 0.9680915111378687,
"grad_norm": 4.087296962738037,
"learning_rate": 9.398052937050892e-07,
"loss": 0.3951,
"step": 8040
},
{
"epoch": 0.9692956050571945,
"grad_norm": 4.309443473815918,
"learning_rate": 9.395551281810233e-07,
"loss": 0.4025,
"step": 8050
},
{
"epoch": 0.9704996989765202,
"grad_norm": 4.655600547790527,
"learning_rate": 9.39304477336295e-07,
"loss": 0.4187,
"step": 8060
},
{
"epoch": 0.9717037928958459,
"grad_norm": 4.34591007232666,
"learning_rate": 9.390533414476527e-07,
"loss": 0.4164,
"step": 8070
},
{
"epoch": 0.9729078868151716,
"grad_norm": 4.547005653381348,
"learning_rate": 9.388017207923798e-07,
"loss": 0.4124,
"step": 8080
},
{
"epoch": 0.9741119807344973,
"grad_norm": 5.021882057189941,
"learning_rate": 9.385496156482953e-07,
"loss": 0.4289,
"step": 8090
},
{
"epoch": 0.975316074653823,
"grad_norm": 4.165801525115967,
"learning_rate": 9.382970262937526e-07,
"loss": 0.4058,
"step": 8100
},
{
"epoch": 0.9765201685731487,
"grad_norm": 4.876884460449219,
"learning_rate": 9.380439530076407e-07,
"loss": 0.43,
"step": 8110
},
{
"epoch": 0.9777242624924745,
"grad_norm": 4.928086757659912,
"learning_rate": 9.377903960693818e-07,
"loss": 0.423,
"step": 8120
},
{
"epoch": 0.9789283564118001,
"grad_norm": 5.006045341491699,
"learning_rate": 9.375363557589331e-07,
"loss": 0.4354,
"step": 8130
},
{
"epoch": 0.9801324503311258,
"grad_norm": 3.8796417713165283,
"learning_rate": 9.372818323567846e-07,
"loss": 0.4132,
"step": 8140
},
{
"epoch": 0.9813365442504516,
"grad_norm": 4.275393962860107,
"learning_rate": 9.370268261439604e-07,
"loss": 0.4071,
"step": 8150
},
{
"epoch": 0.9825406381697772,
"grad_norm": 5.467378616333008,
"learning_rate": 9.367713374020174e-07,
"loss": 0.4049,
"step": 8160
},
{
"epoch": 0.983744732089103,
"grad_norm": 3.720611095428467,
"learning_rate": 9.365153664130453e-07,
"loss": 0.4008,
"step": 8170
},
{
"epoch": 0.9849488260084287,
"grad_norm": 4.539004802703857,
"learning_rate": 9.362589134596661e-07,
"loss": 0.4118,
"step": 8180
},
{
"epoch": 0.9861529199277543,
"grad_norm": 3.776636838912964,
"learning_rate": 9.360019788250342e-07,
"loss": 0.4334,
"step": 8190
},
{
"epoch": 0.9873570138470801,
"grad_norm": 3.8309648036956787,
"learning_rate": 9.357445627928355e-07,
"loss": 0.4179,
"step": 8200
},
{
"epoch": 0.9885611077664058,
"grad_norm": 4.798840045928955,
"learning_rate": 9.354866656472881e-07,
"loss": 0.4154,
"step": 8210
},
{
"epoch": 0.9897652016857315,
"grad_norm": 4.182796955108643,
"learning_rate": 9.352282876731403e-07,
"loss": 0.4196,
"step": 8220
},
{
"epoch": 0.9909692956050572,
"grad_norm": 4.6675801277160645,
"learning_rate": 9.349694291556723e-07,
"loss": 0.4182,
"step": 8230
},
{
"epoch": 0.9921733895243829,
"grad_norm": 4.432309627532959,
"learning_rate": 9.347100903806941e-07,
"loss": 0.4206,
"step": 8240
},
{
"epoch": 0.9933774834437086,
"grad_norm": 4.616915702819824,
"learning_rate": 9.344502716345463e-07,
"loss": 0.4153,
"step": 8250
},
{
"epoch": 0.9945815773630343,
"grad_norm": 4.290421485900879,
"learning_rate": 9.341899732040994e-07,
"loss": 0.4162,
"step": 8260
},
{
"epoch": 0.9957856712823601,
"grad_norm": 4.533810138702393,
"learning_rate": 9.339291953767539e-07,
"loss": 0.4113,
"step": 8270
},
{
"epoch": 0.9969897652016857,
"grad_norm": 4.271683692932129,
"learning_rate": 9.336679384404387e-07,
"loss": 0.4166,
"step": 8280
},
{
"epoch": 0.9981938591210114,
"grad_norm": 5.167937755584717,
"learning_rate": 9.334062026836127e-07,
"loss": 0.385,
"step": 8290
},
{
"epoch": 0.9993979530403372,
"grad_norm": 4.525483131408691,
"learning_rate": 9.331439883952628e-07,
"loss": 0.3977,
"step": 8300
},
{
"epoch": 1.0006020469596628,
"grad_norm": 4.6253533363342285,
"learning_rate": 9.328812958649044e-07,
"loss": 0.4123,
"step": 8310
},
{
"epoch": 1.0018061408789887,
"grad_norm": 5.165332317352295,
"learning_rate": 9.326181253825812e-07,
"loss": 0.3842,
"step": 8320
},
{
"epoch": 1.0030102347983143,
"grad_norm": 3.894192934036255,
"learning_rate": 9.323544772388645e-07,
"loss": 0.3528,
"step": 8330
},
{
"epoch": 1.00421432871764,
"grad_norm": 3.8034422397613525,
"learning_rate": 9.320903517248527e-07,
"loss": 0.3817,
"step": 8340
},
{
"epoch": 1.0054184226369658,
"grad_norm": 4.677804946899414,
"learning_rate": 9.318257491321714e-07,
"loss": 0.3772,
"step": 8350
},
{
"epoch": 1.0066225165562914,
"grad_norm": 4.256035327911377,
"learning_rate": 9.315606697529733e-07,
"loss": 0.3858,
"step": 8360
},
{
"epoch": 1.007826610475617,
"grad_norm": 4.362122058868408,
"learning_rate": 9.312951138799371e-07,
"loss": 0.3702,
"step": 8370
},
{
"epoch": 1.009030704394943,
"grad_norm": 4.146007537841797,
"learning_rate": 9.310290818062681e-07,
"loss": 0.3869,
"step": 8380
},
{
"epoch": 1.0102347983142685,
"grad_norm": 4.480301856994629,
"learning_rate": 9.307625738256967e-07,
"loss": 0.4082,
"step": 8390
},
{
"epoch": 1.0114388922335942,
"grad_norm": 4.406433582305908,
"learning_rate": 9.304955902324793e-07,
"loss": 0.3846,
"step": 8400
},
{
"epoch": 1.01264298615292,
"grad_norm": 4.386068820953369,
"learning_rate": 9.302281313213972e-07,
"loss": 0.3806,
"step": 8410
},
{
"epoch": 1.0138470800722457,
"grad_norm": 4.706192970275879,
"learning_rate": 9.299601973877566e-07,
"loss": 0.385,
"step": 8420
},
{
"epoch": 1.0150511739915713,
"grad_norm": 5.003023624420166,
"learning_rate": 9.29691788727388e-07,
"loss": 0.3785,
"step": 8430
},
{
"epoch": 1.0162552679108972,
"grad_norm": 4.118617534637451,
"learning_rate": 9.294229056366463e-07,
"loss": 0.3649,
"step": 8440
},
{
"epoch": 1.0174593618302228,
"grad_norm": 4.070971488952637,
"learning_rate": 9.291535484124101e-07,
"loss": 0.3897,
"step": 8450
},
{
"epoch": 1.0186634557495484,
"grad_norm": 4.141367435455322,
"learning_rate": 9.288837173520814e-07,
"loss": 0.3712,
"step": 8460
},
{
"epoch": 1.0198675496688743,
"grad_norm": 4.00056791305542,
"learning_rate": 9.286134127535859e-07,
"loss": 0.372,
"step": 8470
},
{
"epoch": 1.0210716435882,
"grad_norm": 4.618954658508301,
"learning_rate": 9.283426349153711e-07,
"loss": 0.3708,
"step": 8480
},
{
"epoch": 1.0222757375075255,
"grad_norm": 4.50954008102417,
"learning_rate": 9.280713841364083e-07,
"loss": 0.3831,
"step": 8490
},
{
"epoch": 1.0234798314268514,
"grad_norm": 4.025129795074463,
"learning_rate": 9.277996607161898e-07,
"loss": 0.3807,
"step": 8500
},
{
"epoch": 1.024683925346177,
"grad_norm": 4.727366924285889,
"learning_rate": 9.275274649547307e-07,
"loss": 0.3707,
"step": 8510
},
{
"epoch": 1.0258880192655027,
"grad_norm": 4.731372833251953,
"learning_rate": 9.272547971525669e-07,
"loss": 0.3655,
"step": 8520
},
{
"epoch": 1.0270921131848285,
"grad_norm": 4.237710475921631,
"learning_rate": 9.269816576107559e-07,
"loss": 0.365,
"step": 8530
},
{
"epoch": 1.0282962071041541,
"grad_norm": 4.294924736022949,
"learning_rate": 9.267080466308758e-07,
"loss": 0.3774,
"step": 8540
},
{
"epoch": 1.0295003010234798,
"grad_norm": 4.249452590942383,
"learning_rate": 9.264339645150256e-07,
"loss": 0.372,
"step": 8550
},
{
"epoch": 1.0307043949428056,
"grad_norm": 4.078114986419678,
"learning_rate": 9.26159411565824e-07,
"loss": 0.3736,
"step": 8560
},
{
"epoch": 1.0319084888621313,
"grad_norm": 5.815018177032471,
"learning_rate": 9.258843880864101e-07,
"loss": 0.3708,
"step": 8570
},
{
"epoch": 1.033112582781457,
"grad_norm": 4.562671184539795,
"learning_rate": 9.256088943804421e-07,
"loss": 0.3926,
"step": 8580
},
{
"epoch": 1.0343166767007828,
"grad_norm": 5.159687042236328,
"learning_rate": 9.253329307520974e-07,
"loss": 0.3754,
"step": 8590
},
{
"epoch": 1.0355207706201084,
"grad_norm": 4.418034076690674,
"learning_rate": 9.250564975060725e-07,
"loss": 0.3756,
"step": 8600
},
{
"epoch": 1.036724864539434,
"grad_norm": 4.661262035369873,
"learning_rate": 9.247795949475823e-07,
"loss": 0.3854,
"step": 8610
},
{
"epoch": 1.0379289584587599,
"grad_norm": 4.768362522125244,
"learning_rate": 9.245022233823598e-07,
"loss": 0.3798,
"step": 8620
},
{
"epoch": 1.0391330523780855,
"grad_norm": 4.090877056121826,
"learning_rate": 9.242243831166558e-07,
"loss": 0.3883,
"step": 8630
},
{
"epoch": 1.0403371462974111,
"grad_norm": 4.2338032722473145,
"learning_rate": 9.23946074457239e-07,
"loss": 0.3784,
"step": 8640
},
{
"epoch": 1.041541240216737,
"grad_norm": 4.812314510345459,
"learning_rate": 9.236672977113947e-07,
"loss": 0.3938,
"step": 8650
},
{
"epoch": 1.0427453341360626,
"grad_norm": 5.055131435394287,
"learning_rate": 9.233880531869253e-07,
"loss": 0.3784,
"step": 8660
},
{
"epoch": 1.0439494280553883,
"grad_norm": 4.143119812011719,
"learning_rate": 9.231083411921497e-07,
"loss": 0.368,
"step": 8670
},
{
"epoch": 1.0451535219747141,
"grad_norm": 4.840368270874023,
"learning_rate": 9.228281620359029e-07,
"loss": 0.3771,
"step": 8680
},
{
"epoch": 1.0463576158940397,
"grad_norm": 4.708595275878906,
"learning_rate": 9.225475160275358e-07,
"loss": 0.3572,
"step": 8690
},
{
"epoch": 1.0475617098133654,
"grad_norm": 4.715826511383057,
"learning_rate": 9.222664034769145e-07,
"loss": 0.3929,
"step": 8700
},
{
"epoch": 1.0487658037326912,
"grad_norm": 4.969057559967041,
"learning_rate": 9.219848246944205e-07,
"loss": 0.3895,
"step": 8710
},
{
"epoch": 1.0499698976520169,
"grad_norm": 3.3560914993286133,
"learning_rate": 9.217027799909499e-07,
"loss": 0.379,
"step": 8720
},
{
"epoch": 1.0511739915713425,
"grad_norm": 4.196804046630859,
"learning_rate": 9.214202696779134e-07,
"loss": 0.3692,
"step": 8730
},
{
"epoch": 1.0523780854906684,
"grad_norm": 4.214865684509277,
"learning_rate": 9.211372940672355e-07,
"loss": 0.3673,
"step": 8740
},
{
"epoch": 1.053582179409994,
"grad_norm": 4.642685890197754,
"learning_rate": 9.208538534713548e-07,
"loss": 0.3961,
"step": 8750
},
{
"epoch": 1.0547862733293196,
"grad_norm": 4.921828269958496,
"learning_rate": 9.20569948203223e-07,
"loss": 0.3616,
"step": 8760
},
{
"epoch": 1.0559903672486455,
"grad_norm": 3.9251582622528076,
"learning_rate": 9.202855785763051e-07,
"loss": 0.3958,
"step": 8770
},
{
"epoch": 1.0571944611679711,
"grad_norm": 4.475203990936279,
"learning_rate": 9.200007449045785e-07,
"loss": 0.3782,
"step": 8780
},
{
"epoch": 1.0583985550872967,
"grad_norm": 4.735462665557861,
"learning_rate": 9.197154475025333e-07,
"loss": 0.3571,
"step": 8790
},
{
"epoch": 1.0596026490066226,
"grad_norm": 4.720487117767334,
"learning_rate": 9.194296866851712e-07,
"loss": 0.3632,
"step": 8800
},
{
"epoch": 1.0608067429259482,
"grad_norm": 4.291871547698975,
"learning_rate": 9.191434627680063e-07,
"loss": 0.3722,
"step": 8810
},
{
"epoch": 1.0620108368452739,
"grad_norm": 4.449291229248047,
"learning_rate": 9.188567760670631e-07,
"loss": 0.3857,
"step": 8820
},
{
"epoch": 1.0632149307645997,
"grad_norm": 4.42001485824585,
"learning_rate": 9.185696268988776e-07,
"loss": 0.3798,
"step": 8830
},
{
"epoch": 1.0644190246839254,
"grad_norm": 4.68118953704834,
"learning_rate": 9.182820155804965e-07,
"loss": 0.364,
"step": 8840
},
{
"epoch": 1.065623118603251,
"grad_norm": 4.831759929656982,
"learning_rate": 9.179939424294763e-07,
"loss": 0.3656,
"step": 8850
},
{
"epoch": 1.0668272125225768,
"grad_norm": 4.51068115234375,
"learning_rate": 9.177054077638839e-07,
"loss": 0.3779,
"step": 8860
},
{
"epoch": 1.0680313064419025,
"grad_norm": 4.588883399963379,
"learning_rate": 9.174164119022956e-07,
"loss": 0.3766,
"step": 8870
},
{
"epoch": 1.069235400361228,
"grad_norm": 4.487590789794922,
"learning_rate": 9.171269551637968e-07,
"loss": 0.3676,
"step": 8880
},
{
"epoch": 1.070439494280554,
"grad_norm": 5.2501702308654785,
"learning_rate": 9.168370378679819e-07,
"loss": 0.3764,
"step": 8890
},
{
"epoch": 1.0716435881998796,
"grad_norm": 4.199159145355225,
"learning_rate": 9.165466603349539e-07,
"loss": 0.3736,
"step": 8900
},
{
"epoch": 1.0728476821192052,
"grad_norm": 4.138830184936523,
"learning_rate": 9.162558228853235e-07,
"loss": 0.3745,
"step": 8910
},
{
"epoch": 1.074051776038531,
"grad_norm": 4.139305114746094,
"learning_rate": 9.159645258402095e-07,
"loss": 0.3693,
"step": 8920
},
{
"epoch": 1.0752558699578567,
"grad_norm": 5.9480438232421875,
"learning_rate": 9.156727695212386e-07,
"loss": 0.3644,
"step": 8930
},
{
"epoch": 1.0764599638771823,
"grad_norm": 4.251008987426758,
"learning_rate": 9.153805542505438e-07,
"loss": 0.3844,
"step": 8940
},
{
"epoch": 1.0776640577965082,
"grad_norm": 4.630239486694336,
"learning_rate": 9.150878803507654e-07,
"loss": 0.3699,
"step": 8950
},
{
"epoch": 1.0788681517158338,
"grad_norm": 5.171538829803467,
"learning_rate": 9.147947481450498e-07,
"loss": 0.4026,
"step": 8960
},
{
"epoch": 1.0800722456351595,
"grad_norm": 4.777914524078369,
"learning_rate": 9.145011579570491e-07,
"loss": 0.3642,
"step": 8970
},
{
"epoch": 1.0812763395544853,
"grad_norm": 5.336880207061768,
"learning_rate": 9.142071101109224e-07,
"loss": 0.3926,
"step": 8980
},
{
"epoch": 1.082480433473811,
"grad_norm": 3.8747503757476807,
"learning_rate": 9.139126049313321e-07,
"loss": 0.3792,
"step": 8990
},
{
"epoch": 1.0836845273931366,
"grad_norm": 4.528430461883545,
"learning_rate": 9.136176427434475e-07,
"loss": 0.3735,
"step": 9000
},
{
"epoch": 1.0848886213124624,
"grad_norm": 5.05435848236084,
"learning_rate": 9.133222238729412e-07,
"loss": 0.3604,
"step": 9010
},
{
"epoch": 1.086092715231788,
"grad_norm": 4.354115962982178,
"learning_rate": 9.130263486459904e-07,
"loss": 0.3995,
"step": 9020
},
{
"epoch": 1.0872968091511137,
"grad_norm": 5.124173164367676,
"learning_rate": 9.127300173892763e-07,
"loss": 0.3622,
"step": 9030
},
{
"epoch": 1.0885009030704396,
"grad_norm": 4.644625186920166,
"learning_rate": 9.124332304299838e-07,
"loss": 0.3704,
"step": 9040
},
{
"epoch": 1.0897049969897652,
"grad_norm": 4.276961803436279,
"learning_rate": 9.121359880958002e-07,
"loss": 0.3771,
"step": 9050
},
{
"epoch": 1.0909090909090908,
"grad_norm": 4.1808648109436035,
"learning_rate": 9.118382907149163e-07,
"loss": 0.3638,
"step": 9060
},
{
"epoch": 1.0921131848284167,
"grad_norm": 4.921030521392822,
"learning_rate": 9.115401386160251e-07,
"loss": 0.3633,
"step": 9070
},
{
"epoch": 1.0933172787477423,
"grad_norm": 4.0871663093566895,
"learning_rate": 9.112415321283217e-07,
"loss": 0.358,
"step": 9080
},
{
"epoch": 1.094521372667068,
"grad_norm": 3.419311046600342,
"learning_rate": 9.10942471581503e-07,
"loss": 0.3601,
"step": 9090
},
{
"epoch": 1.0957254665863938,
"grad_norm": 4.138514518737793,
"learning_rate": 9.106429573057666e-07,
"loss": 0.3764,
"step": 9100
},
{
"epoch": 1.0969295605057194,
"grad_norm": 5.0829691886901855,
"learning_rate": 9.10342989631812e-07,
"loss": 0.3756,
"step": 9110
},
{
"epoch": 1.098133654425045,
"grad_norm": 4.330390930175781,
"learning_rate": 9.100425688908386e-07,
"loss": 0.3587,
"step": 9120
},
{
"epoch": 1.099337748344371,
"grad_norm": 5.1065592765808105,
"learning_rate": 9.097416954145465e-07,
"loss": 0.38,
"step": 9130
},
{
"epoch": 1.1005418422636966,
"grad_norm": 4.509856224060059,
"learning_rate": 9.094403695351352e-07,
"loss": 0.38,
"step": 9140
},
{
"epoch": 1.1017459361830222,
"grad_norm": 5.324617862701416,
"learning_rate": 9.091385915853042e-07,
"loss": 0.3658,
"step": 9150
},
{
"epoch": 1.102950030102348,
"grad_norm": 5.061591148376465,
"learning_rate": 9.088363618982521e-07,
"loss": 0.3723,
"step": 9160
},
{
"epoch": 1.1041541240216737,
"grad_norm": 5.028870582580566,
"learning_rate": 9.085336808076758e-07,
"loss": 0.3837,
"step": 9170
},
{
"epoch": 1.1053582179409993,
"grad_norm": 4.214852809906006,
"learning_rate": 9.082305486477708e-07,
"loss": 0.3681,
"step": 9180
},
{
"epoch": 1.1065623118603252,
"grad_norm": 4.787420272827148,
"learning_rate": 9.079269657532311e-07,
"loss": 0.3843,
"step": 9190
},
{
"epoch": 1.1077664057796508,
"grad_norm": 3.78640079498291,
"learning_rate": 9.076229324592477e-07,
"loss": 0.3747,
"step": 9200
},
{
"epoch": 1.1089704996989764,
"grad_norm": 4.786212921142578,
"learning_rate": 9.073184491015094e-07,
"loss": 0.3684,
"step": 9210
},
{
"epoch": 1.1101745936183023,
"grad_norm": 3.932164430618286,
"learning_rate": 9.070135160162015e-07,
"loss": 0.3662,
"step": 9220
},
{
"epoch": 1.111378687537628,
"grad_norm": 4.249774932861328,
"learning_rate": 9.067081335400061e-07,
"loss": 0.3722,
"step": 9230
},
{
"epoch": 1.1125827814569536,
"grad_norm": 4.269323348999023,
"learning_rate": 9.064023020101015e-07,
"loss": 0.3765,
"step": 9240
},
{
"epoch": 1.1137868753762794,
"grad_norm": 4.183831214904785,
"learning_rate": 9.060960217641617e-07,
"loss": 0.3657,
"step": 9250
},
{
"epoch": 1.114990969295605,
"grad_norm": 4.336716175079346,
"learning_rate": 9.057892931403563e-07,
"loss": 0.3869,
"step": 9260
},
{
"epoch": 1.1161950632149307,
"grad_norm": 4.948883533477783,
"learning_rate": 9.054821164773498e-07,
"loss": 0.3823,
"step": 9270
},
{
"epoch": 1.1173991571342565,
"grad_norm": 4.687775611877441,
"learning_rate": 9.051744921143014e-07,
"loss": 0.3853,
"step": 9280
},
{
"epoch": 1.1186032510535822,
"grad_norm": 4.803307056427002,
"learning_rate": 9.048664203908647e-07,
"loss": 0.3609,
"step": 9290
},
{
"epoch": 1.1198073449729078,
"grad_norm": 4.377987861633301,
"learning_rate": 9.045579016471871e-07,
"loss": 0.3873,
"step": 9300
},
{
"epoch": 1.1210114388922336,
"grad_norm": 4.264991760253906,
"learning_rate": 9.042489362239096e-07,
"loss": 0.3663,
"step": 9310
},
{
"epoch": 1.1222155328115593,
"grad_norm": 4.69897985458374,
"learning_rate": 9.039395244621667e-07,
"loss": 0.3797,
"step": 9320
},
{
"epoch": 1.123419626730885,
"grad_norm": 4.6573357582092285,
"learning_rate": 9.036296667035853e-07,
"loss": 0.3774,
"step": 9330
},
{
"epoch": 1.1246237206502108,
"grad_norm": 4.6396307945251465,
"learning_rate": 9.033193632902848e-07,
"loss": 0.3708,
"step": 9340
},
{
"epoch": 1.1258278145695364,
"grad_norm": 4.781702518463135,
"learning_rate": 9.030086145648767e-07,
"loss": 0.366,
"step": 9350
},
{
"epoch": 1.127031908488862,
"grad_norm": 3.859081745147705,
"learning_rate": 9.026974208704645e-07,
"loss": 0.3592,
"step": 9360
},
{
"epoch": 1.1282360024081879,
"grad_norm": 3.917964220046997,
"learning_rate": 9.023857825506425e-07,
"loss": 0.3828,
"step": 9370
},
{
"epoch": 1.1294400963275135,
"grad_norm": 4.249654293060303,
"learning_rate": 9.020736999494962e-07,
"loss": 0.3816,
"step": 9380
},
{
"epoch": 1.1306441902468394,
"grad_norm": 4.181410789489746,
"learning_rate": 9.017611734116015e-07,
"loss": 0.3881,
"step": 9390
},
{
"epoch": 1.131848284166165,
"grad_norm": 4.529959678649902,
"learning_rate": 9.014482032820245e-07,
"loss": 0.3866,
"step": 9400
},
{
"epoch": 1.1330523780854906,
"grad_norm": 4.115703105926514,
"learning_rate": 9.011347899063212e-07,
"loss": 0.4017,
"step": 9410
},
{
"epoch": 1.1342564720048163,
"grad_norm": 5.330405235290527,
"learning_rate": 9.008209336305369e-07,
"loss": 0.382,
"step": 9420
},
{
"epoch": 1.1354605659241421,
"grad_norm": 4.53489351272583,
"learning_rate": 9.005066348012058e-07,
"loss": 0.4002,
"step": 9430
},
{
"epoch": 1.1366646598434678,
"grad_norm": 4.984791278839111,
"learning_rate": 9.00191893765351e-07,
"loss": 0.3699,
"step": 9440
},
{
"epoch": 1.1378687537627936,
"grad_norm": 4.83209753036499,
"learning_rate": 8.998767108704836e-07,
"loss": 0.3612,
"step": 9450
},
{
"epoch": 1.1390728476821192,
"grad_norm": 4.549959659576416,
"learning_rate": 8.995610864646029e-07,
"loss": 0.3552,
"step": 9460
},
{
"epoch": 1.1402769416014449,
"grad_norm": 4.30760383605957,
"learning_rate": 8.992450208961949e-07,
"loss": 0.3796,
"step": 9470
},
{
"epoch": 1.1414810355207705,
"grad_norm": 4.3470234870910645,
"learning_rate": 8.989285145142338e-07,
"loss": 0.3868,
"step": 9480
},
{
"epoch": 1.1426851294400964,
"grad_norm": 4.755895614624023,
"learning_rate": 8.986115676681796e-07,
"loss": 0.3867,
"step": 9490
},
{
"epoch": 1.143889223359422,
"grad_norm": 4.874184608459473,
"learning_rate": 8.982941807079791e-07,
"loss": 0.3866,
"step": 9500
},
{
"epoch": 1.1450933172787479,
"grad_norm": 4.068636894226074,
"learning_rate": 8.979763539840649e-07,
"loss": 0.3558,
"step": 9510
},
{
"epoch": 1.1462974111980735,
"grad_norm": 4.380646705627441,
"learning_rate": 8.976580878473552e-07,
"loss": 0.3704,
"step": 9520
},
{
"epoch": 1.1475015051173991,
"grad_norm": 4.3028950691223145,
"learning_rate": 8.973393826492531e-07,
"loss": 0.3995,
"step": 9530
},
{
"epoch": 1.1487055990367248,
"grad_norm": 4.423670768737793,
"learning_rate": 8.97020238741647e-07,
"loss": 0.38,
"step": 9540
},
{
"epoch": 1.1499096929560506,
"grad_norm": 4.808249473571777,
"learning_rate": 8.967006564769093e-07,
"loss": 0.3779,
"step": 9550
},
{
"epoch": 1.1511137868753762,
"grad_norm": 5.734920501708984,
"learning_rate": 8.963806362078963e-07,
"loss": 0.3713,
"step": 9560
},
{
"epoch": 1.152317880794702,
"grad_norm": 4.730371952056885,
"learning_rate": 8.960601782879483e-07,
"loss": 0.3583,
"step": 9570
},
{
"epoch": 1.1535219747140277,
"grad_norm": 5.035944938659668,
"learning_rate": 8.957392830708886e-07,
"loss": 0.39,
"step": 9580
},
{
"epoch": 1.1547260686333534,
"grad_norm": 4.2402119636535645,
"learning_rate": 8.95417950911023e-07,
"loss": 0.3655,
"step": 9590
},
{
"epoch": 1.155930162552679,
"grad_norm": 3.995563507080078,
"learning_rate": 8.950961821631406e-07,
"loss": 0.3657,
"step": 9600
},
{
"epoch": 1.1571342564720049,
"grad_norm": 5.285823822021484,
"learning_rate": 8.947739771825117e-07,
"loss": 0.3825,
"step": 9610
},
{
"epoch": 1.1583383503913305,
"grad_norm": 4.332102298736572,
"learning_rate": 8.944513363248885e-07,
"loss": 0.3808,
"step": 9620
},
{
"epoch": 1.1595424443106563,
"grad_norm": 4.714332103729248,
"learning_rate": 8.941282599465047e-07,
"loss": 0.3904,
"step": 9630
},
{
"epoch": 1.160746538229982,
"grad_norm": 3.8975484371185303,
"learning_rate": 8.938047484040748e-07,
"loss": 0.3559,
"step": 9640
},
{
"epoch": 1.1619506321493076,
"grad_norm": 4.700948238372803,
"learning_rate": 8.934808020547935e-07,
"loss": 0.3676,
"step": 9650
},
{
"epoch": 1.1631547260686332,
"grad_norm": 4.926019191741943,
"learning_rate": 8.931564212563356e-07,
"loss": 0.3913,
"step": 9660
},
{
"epoch": 1.164358819987959,
"grad_norm": 4.402989864349365,
"learning_rate": 8.92831606366856e-07,
"loss": 0.3672,
"step": 9670
},
{
"epoch": 1.1655629139072847,
"grad_norm": 4.371270656585693,
"learning_rate": 8.925063577449886e-07,
"loss": 0.3529,
"step": 9680
},
{
"epoch": 1.1667670078266106,
"grad_norm": 5.072457790374756,
"learning_rate": 8.92180675749846e-07,
"loss": 0.3703,
"step": 9690
},
{
"epoch": 1.1679711017459362,
"grad_norm": 5.789607524871826,
"learning_rate": 8.918545607410197e-07,
"loss": 0.3618,
"step": 9700
},
{
"epoch": 1.1691751956652618,
"grad_norm": 4.929603576660156,
"learning_rate": 8.91528013078579e-07,
"loss": 0.3632,
"step": 9710
},
{
"epoch": 1.1703792895845875,
"grad_norm": 4.385134220123291,
"learning_rate": 8.91201033123071e-07,
"loss": 0.3726,
"step": 9720
},
{
"epoch": 1.1715833835039133,
"grad_norm": 4.493896961212158,
"learning_rate": 8.908736212355201e-07,
"loss": 0.396,
"step": 9730
},
{
"epoch": 1.172787477423239,
"grad_norm": 5.4288859367370605,
"learning_rate": 8.905457777774278e-07,
"loss": 0.3693,
"step": 9740
},
{
"epoch": 1.1739915713425648,
"grad_norm": 4.925263404846191,
"learning_rate": 8.902175031107717e-07,
"loss": 0.3809,
"step": 9750
},
{
"epoch": 1.1751956652618905,
"grad_norm": 4.450766086578369,
"learning_rate": 8.898887975980058e-07,
"loss": 0.3747,
"step": 9760
},
{
"epoch": 1.176399759181216,
"grad_norm": 5.003162860870361,
"learning_rate": 8.895596616020595e-07,
"loss": 0.3763,
"step": 9770
},
{
"epoch": 1.1776038531005417,
"grad_norm": 5.204108238220215,
"learning_rate": 8.89230095486338e-07,
"loss": 0.3983,
"step": 9780
},
{
"epoch": 1.1788079470198676,
"grad_norm": 5.1089372634887695,
"learning_rate": 8.889000996147213e-07,
"loss": 0.3757,
"step": 9790
},
{
"epoch": 1.1800120409391932,
"grad_norm": 5.394412994384766,
"learning_rate": 8.885696743515632e-07,
"loss": 0.3764,
"step": 9800
},
{
"epoch": 1.181216134858519,
"grad_norm": 4.811611175537109,
"learning_rate": 8.882388200616926e-07,
"loss": 0.3686,
"step": 9810
},
{
"epoch": 1.1824202287778447,
"grad_norm": 4.908543109893799,
"learning_rate": 8.879075371104113e-07,
"loss": 0.368,
"step": 9820
},
{
"epoch": 1.1836243226971703,
"grad_norm": 4.540360450744629,
"learning_rate": 8.875758258634949e-07,
"loss": 0.3698,
"step": 9830
},
{
"epoch": 1.1848284166164962,
"grad_norm": 4.033935546875,
"learning_rate": 8.872436866871917e-07,
"loss": 0.3522,
"step": 9840
},
{
"epoch": 1.1860325105358218,
"grad_norm": 5.225256443023682,
"learning_rate": 8.869111199482225e-07,
"loss": 0.3837,
"step": 9850
},
{
"epoch": 1.1872366044551474,
"grad_norm": 4.02462100982666,
"learning_rate": 8.865781260137801e-07,
"loss": 0.381,
"step": 9860
},
{
"epoch": 1.1884406983744733,
"grad_norm": 4.905768871307373,
"learning_rate": 8.862447052515291e-07,
"loss": 0.384,
"step": 9870
},
{
"epoch": 1.189644792293799,
"grad_norm": 4.620838642120361,
"learning_rate": 8.859108580296053e-07,
"loss": 0.3533,
"step": 9880
},
{
"epoch": 1.1908488862131246,
"grad_norm": 4.312672138214111,
"learning_rate": 8.855765847166154e-07,
"loss": 0.3591,
"step": 9890
},
{
"epoch": 1.1920529801324504,
"grad_norm": 4.337918758392334,
"learning_rate": 8.852418856816365e-07,
"loss": 0.374,
"step": 9900
},
{
"epoch": 1.193257074051776,
"grad_norm": 4.154960632324219,
"learning_rate": 8.849067612942158e-07,
"loss": 0.3551,
"step": 9910
},
{
"epoch": 1.1944611679711017,
"grad_norm": 4.451188564300537,
"learning_rate": 8.845712119243701e-07,
"loss": 0.3699,
"step": 9920
},
{
"epoch": 1.1956652618904275,
"grad_norm": 5.723966598510742,
"learning_rate": 8.842352379425853e-07,
"loss": 0.3875,
"step": 9930
},
{
"epoch": 1.1968693558097532,
"grad_norm": 4.982749938964844,
"learning_rate": 8.838988397198166e-07,
"loss": 0.375,
"step": 9940
},
{
"epoch": 1.1980734497290788,
"grad_norm": 4.661801338195801,
"learning_rate": 8.835620176274869e-07,
"loss": 0.3721,
"step": 9950
},
{
"epoch": 1.1992775436484047,
"grad_norm": 5.228112697601318,
"learning_rate": 8.832247720374879e-07,
"loss": 0.366,
"step": 9960
},
{
"epoch": 1.2004816375677303,
"grad_norm": 4.082928657531738,
"learning_rate": 8.828871033221782e-07,
"loss": 0.3621,
"step": 9970
},
{
"epoch": 1.201685731487056,
"grad_norm": 3.532892942428589,
"learning_rate": 8.82549011854384e-07,
"loss": 0.365,
"step": 9980
},
{
"epoch": 1.2028898254063818,
"grad_norm": 4.03758430480957,
"learning_rate": 8.822104980073978e-07,
"loss": 0.3786,
"step": 9990
},
{
"epoch": 1.2040939193257074,
"grad_norm": 4.233405590057373,
"learning_rate": 8.818715621549792e-07,
"loss": 0.3664,
"step": 10000
},
{
"epoch": 1.205298013245033,
"grad_norm": 4.029031753540039,
"learning_rate": 8.815322046713531e-07,
"loss": 0.3655,
"step": 10010
},
{
"epoch": 1.206502107164359,
"grad_norm": 4.398824691772461,
"learning_rate": 8.811924259312102e-07,
"loss": 0.3818,
"step": 10020
},
{
"epoch": 1.2077062010836845,
"grad_norm": 4.394994258880615,
"learning_rate": 8.808522263097063e-07,
"loss": 0.3875,
"step": 10030
},
{
"epoch": 1.2089102950030102,
"grad_norm": 4.941735744476318,
"learning_rate": 8.805116061824617e-07,
"loss": 0.3635,
"step": 10040
},
{
"epoch": 1.210114388922336,
"grad_norm": 4.183002471923828,
"learning_rate": 8.801705659255616e-07,
"loss": 0.3718,
"step": 10050
},
{
"epoch": 1.2113184828416617,
"grad_norm": 3.9239907264709473,
"learning_rate": 8.798291059155541e-07,
"loss": 0.3562,
"step": 10060
},
{
"epoch": 1.2125225767609873,
"grad_norm": 4.399021625518799,
"learning_rate": 8.794872265294516e-07,
"loss": 0.3577,
"step": 10070
},
{
"epoch": 1.2137266706803131,
"grad_norm": 3.739692211151123,
"learning_rate": 8.791449281447291e-07,
"loss": 0.3715,
"step": 10080
},
{
"epoch": 1.2149307645996388,
"grad_norm": 6.101430416107178,
"learning_rate": 8.788022111393245e-07,
"loss": 0.3791,
"step": 10090
},
{
"epoch": 1.2161348585189644,
"grad_norm": 4.473653793334961,
"learning_rate": 8.784590758916377e-07,
"loss": 0.3733,
"step": 10100
},
{
"epoch": 1.2173389524382903,
"grad_norm": 5.723465919494629,
"learning_rate": 8.781155227805304e-07,
"loss": 0.376,
"step": 10110
},
{
"epoch": 1.218543046357616,
"grad_norm": 6.045252323150635,
"learning_rate": 8.777715521853257e-07,
"loss": 0.383,
"step": 10120
},
{
"epoch": 1.2197471402769415,
"grad_norm": 4.978476524353027,
"learning_rate": 8.774271644858078e-07,
"loss": 0.3902,
"step": 10130
},
{
"epoch": 1.2209512341962674,
"grad_norm": 4.655144691467285,
"learning_rate": 8.770823600622212e-07,
"loss": 0.3832,
"step": 10140
},
{
"epoch": 1.222155328115593,
"grad_norm": 4.3407883644104,
"learning_rate": 8.767371392952708e-07,
"loss": 0.3582,
"step": 10150
},
{
"epoch": 1.2233594220349187,
"grad_norm": 4.6942596435546875,
"learning_rate": 8.763915025661206e-07,
"loss": 0.3755,
"step": 10160
},
{
"epoch": 1.2245635159542445,
"grad_norm": 4.285218715667725,
"learning_rate": 8.760454502563947e-07,
"loss": 0.3776,
"step": 10170
},
{
"epoch": 1.2257676098735701,
"grad_norm": 4.890243053436279,
"learning_rate": 8.756989827481755e-07,
"loss": 0.37,
"step": 10180
},
{
"epoch": 1.2269717037928958,
"grad_norm": 4.752533912658691,
"learning_rate": 8.753521004240038e-07,
"loss": 0.3717,
"step": 10190
},
{
"epoch": 1.2281757977122216,
"grad_norm": 4.077126502990723,
"learning_rate": 8.750048036668789e-07,
"loss": 0.3811,
"step": 10200
},
{
"epoch": 1.2293798916315473,
"grad_norm": 3.9369449615478516,
"learning_rate": 8.74657092860257e-07,
"loss": 0.3737,
"step": 10210
},
{
"epoch": 1.230583985550873,
"grad_norm": 4.381350040435791,
"learning_rate": 8.74308968388052e-07,
"loss": 0.3528,
"step": 10220
},
{
"epoch": 1.2317880794701987,
"grad_norm": 4.581336975097656,
"learning_rate": 8.739604306346342e-07,
"loss": 0.3728,
"step": 10230
},
{
"epoch": 1.2329921733895244,
"grad_norm": 5.837801933288574,
"learning_rate": 8.736114799848306e-07,
"loss": 0.3812,
"step": 10240
},
{
"epoch": 1.23419626730885,
"grad_norm": 4.347848892211914,
"learning_rate": 8.732621168239236e-07,
"loss": 0.3818,
"step": 10250
},
{
"epoch": 1.2354003612281759,
"grad_norm": 4.717700004577637,
"learning_rate": 8.729123415376514e-07,
"loss": 0.3516,
"step": 10260
},
{
"epoch": 1.2366044551475015,
"grad_norm": 4.809170722961426,
"learning_rate": 8.725621545122072e-07,
"loss": 0.3642,
"step": 10270
},
{
"epoch": 1.2378085490668271,
"grad_norm": 4.547823905944824,
"learning_rate": 8.722115561342387e-07,
"loss": 0.3791,
"step": 10280
},
{
"epoch": 1.239012642986153,
"grad_norm": 4.235891819000244,
"learning_rate": 8.718605467908478e-07,
"loss": 0.3663,
"step": 10290
},
{
"epoch": 1.2402167369054786,
"grad_norm": 4.648200035095215,
"learning_rate": 8.715091268695901e-07,
"loss": 0.3623,
"step": 10300
},
{
"epoch": 1.2414208308248043,
"grad_norm": 4.6003737449646,
"learning_rate": 8.711572967584747e-07,
"loss": 0.378,
"step": 10310
},
{
"epoch": 1.24262492474413,
"grad_norm": 4.921525001525879,
"learning_rate": 8.708050568459635e-07,
"loss": 0.3602,
"step": 10320
},
{
"epoch": 1.2438290186634557,
"grad_norm": 4.075355052947998,
"learning_rate": 8.704524075209709e-07,
"loss": 0.3698,
"step": 10330
},
{
"epoch": 1.2450331125827814,
"grad_norm": 5.707545280456543,
"learning_rate": 8.700993491728634e-07,
"loss": 0.3538,
"step": 10340
},
{
"epoch": 1.2462372065021072,
"grad_norm": 4.669870853424072,
"learning_rate": 8.697458821914587e-07,
"loss": 0.3685,
"step": 10350
},
{
"epoch": 1.2474413004214329,
"grad_norm": 4.101998329162598,
"learning_rate": 8.693920069670264e-07,
"loss": 0.3823,
"step": 10360
},
{
"epoch": 1.2486453943407585,
"grad_norm": 4.307315349578857,
"learning_rate": 8.690377238902862e-07,
"loss": 0.3718,
"step": 10370
},
{
"epoch": 1.2498494882600844,
"grad_norm": 4.498570442199707,
"learning_rate": 8.686830333524084e-07,
"loss": 0.3894,
"step": 10380
},
{
"epoch": 1.25105358217941,
"grad_norm": 4.348161697387695,
"learning_rate": 8.68327935745013e-07,
"loss": 0.3661,
"step": 10390
},
{
"epoch": 1.2522576760987358,
"grad_norm": 4.509785175323486,
"learning_rate": 8.679724314601701e-07,
"loss": 0.3691,
"step": 10400
},
{
"epoch": 1.2534617700180615,
"grad_norm": 4.251500606536865,
"learning_rate": 8.676165208903978e-07,
"loss": 0.3489,
"step": 10410
},
{
"epoch": 1.254665863937387,
"grad_norm": 3.91599702835083,
"learning_rate": 8.672602044286637e-07,
"loss": 0.3835,
"step": 10420
},
{
"epoch": 1.2558699578567127,
"grad_norm": 4.641791820526123,
"learning_rate": 8.66903482468383e-07,
"loss": 0.3676,
"step": 10430
},
{
"epoch": 1.2570740517760386,
"grad_norm": 6.0034499168396,
"learning_rate": 8.665463554034187e-07,
"loss": 0.3728,
"step": 10440
},
{
"epoch": 1.2582781456953642,
"grad_norm": 5.09488582611084,
"learning_rate": 8.661888236280813e-07,
"loss": 0.3718,
"step": 10450
},
{
"epoch": 1.25948223961469,
"grad_norm": 5.368484020233154,
"learning_rate": 8.658308875371279e-07,
"loss": 0.3908,
"step": 10460
},
{
"epoch": 1.2606863335340157,
"grad_norm": 5.200775623321533,
"learning_rate": 8.654725475257621e-07,
"loss": 0.3655,
"step": 10470
},
{
"epoch": 1.2618904274533413,
"grad_norm": 4.358388900756836,
"learning_rate": 8.651138039896338e-07,
"loss": 0.3748,
"step": 10480
},
{
"epoch": 1.263094521372667,
"grad_norm": 4.452842712402344,
"learning_rate": 8.647546573248377e-07,
"loss": 0.3731,
"step": 10490
},
{
"epoch": 1.2642986152919928,
"grad_norm": 4.0504584312438965,
"learning_rate": 8.643951079279144e-07,
"loss": 0.3767,
"step": 10500
},
{
"epoch": 1.2655027092113185,
"grad_norm": 5.186153411865234,
"learning_rate": 8.640351561958486e-07,
"loss": 0.362,
"step": 10510
},
{
"epoch": 1.2667068031306443,
"grad_norm": 4.57370662689209,
"learning_rate": 8.636748025260696e-07,
"loss": 0.3766,
"step": 10520
},
{
"epoch": 1.26791089704997,
"grad_norm": 5.416035175323486,
"learning_rate": 8.633140473164502e-07,
"loss": 0.3653,
"step": 10530
},
{
"epoch": 1.2691149909692956,
"grad_norm": 4.351581573486328,
"learning_rate": 8.629528909653065e-07,
"loss": 0.3556,
"step": 10540
},
{
"epoch": 1.2703190848886212,
"grad_norm": 5.305721759796143,
"learning_rate": 8.625913338713982e-07,
"loss": 0.3873,
"step": 10550
},
{
"epoch": 1.271523178807947,
"grad_norm": 3.8972630500793457,
"learning_rate": 8.622293764339264e-07,
"loss": 0.3812,
"step": 10560
},
{
"epoch": 1.2727272727272727,
"grad_norm": 5.005763530731201,
"learning_rate": 8.61867019052535e-07,
"loss": 0.3761,
"step": 10570
},
{
"epoch": 1.2739313666465986,
"grad_norm": 4.1513848304748535,
"learning_rate": 8.615042621273093e-07,
"loss": 0.3525,
"step": 10580
},
{
"epoch": 1.2751354605659242,
"grad_norm": 5.166493892669678,
"learning_rate": 8.611411060587757e-07,
"loss": 0.3866,
"step": 10590
},
{
"epoch": 1.2763395544852498,
"grad_norm": 4.168553352355957,
"learning_rate": 8.60777551247901e-07,
"loss": 0.3735,
"step": 10600
},
{
"epoch": 1.2775436484045755,
"grad_norm": 4.891838550567627,
"learning_rate": 8.60413598096093e-07,
"loss": 0.3603,
"step": 10610
},
{
"epoch": 1.2787477423239013,
"grad_norm": 4.317160606384277,
"learning_rate": 8.600492470051983e-07,
"loss": 0.3765,
"step": 10620
},
{
"epoch": 1.279951836243227,
"grad_norm": 4.056015968322754,
"learning_rate": 8.59684498377504e-07,
"loss": 0.3704,
"step": 10630
},
{
"epoch": 1.2811559301625528,
"grad_norm": 4.8416242599487305,
"learning_rate": 8.593193526157354e-07,
"loss": 0.3475,
"step": 10640
},
{
"epoch": 1.2823600240818784,
"grad_norm": 5.178276062011719,
"learning_rate": 8.589538101230564e-07,
"loss": 0.3823,
"step": 10650
},
{
"epoch": 1.283564118001204,
"grad_norm": 4.507132053375244,
"learning_rate": 8.58587871303069e-07,
"loss": 0.3597,
"step": 10660
},
{
"epoch": 1.2847682119205297,
"grad_norm": 4.44130277633667,
"learning_rate": 8.582215365598127e-07,
"loss": 0.3748,
"step": 10670
},
{
"epoch": 1.2859723058398556,
"grad_norm": 4.559373378753662,
"learning_rate": 8.578548062977644e-07,
"loss": 0.3684,
"step": 10680
},
{
"epoch": 1.2871763997591812,
"grad_norm": 4.59391450881958,
"learning_rate": 8.574876809218374e-07,
"loss": 0.3729,
"step": 10690
},
{
"epoch": 1.288380493678507,
"grad_norm": 4.64610481262207,
"learning_rate": 8.571201608373815e-07,
"loss": 0.367,
"step": 10700
},
{
"epoch": 1.2895845875978327,
"grad_norm": 5.637624740600586,
"learning_rate": 8.56752246450182e-07,
"loss": 0.3799,
"step": 10710
},
{
"epoch": 1.2907886815171583,
"grad_norm": 4.1183271408081055,
"learning_rate": 8.563839381664599e-07,
"loss": 0.3744,
"step": 10720
},
{
"epoch": 1.291992775436484,
"grad_norm": 5.679279327392578,
"learning_rate": 8.560152363928709e-07,
"loss": 0.3636,
"step": 10730
},
{
"epoch": 1.2931968693558098,
"grad_norm": 4.73154878616333,
"learning_rate": 8.556461415365052e-07,
"loss": 0.3772,
"step": 10740
},
{
"epoch": 1.2944009632751354,
"grad_norm": 4.206639289855957,
"learning_rate": 8.552766540048871e-07,
"loss": 0.3652,
"step": 10750
},
{
"epoch": 1.2956050571944613,
"grad_norm": 4.551361083984375,
"learning_rate": 8.549067742059741e-07,
"loss": 0.36,
"step": 10760
},
{
"epoch": 1.296809151113787,
"grad_norm": 4.472609043121338,
"learning_rate": 8.545365025481574e-07,
"loss": 0.3949,
"step": 10770
},
{
"epoch": 1.2980132450331126,
"grad_norm": 3.9386298656463623,
"learning_rate": 8.541658394402605e-07,
"loss": 0.3736,
"step": 10780
},
{
"epoch": 1.2992173389524382,
"grad_norm": 5.128427505493164,
"learning_rate": 8.537947852915388e-07,
"loss": 0.3708,
"step": 10790
},
{
"epoch": 1.300421432871764,
"grad_norm": 4.362430095672607,
"learning_rate": 8.534233405116804e-07,
"loss": 0.3707,
"step": 10800
},
{
"epoch": 1.3016255267910897,
"grad_norm": 5.032322883605957,
"learning_rate": 8.530515055108036e-07,
"loss": 0.3694,
"step": 10810
},
{
"epoch": 1.3028296207104155,
"grad_norm": 3.745659828186035,
"learning_rate": 8.526792806994585e-07,
"loss": 0.3531,
"step": 10820
},
{
"epoch": 1.3040337146297412,
"grad_norm": 3.8410699367523193,
"learning_rate": 8.523066664886248e-07,
"loss": 0.3591,
"step": 10830
},
{
"epoch": 1.3052378085490668,
"grad_norm": 6.065695285797119,
"learning_rate": 8.519336632897128e-07,
"loss": 0.3748,
"step": 10840
},
{
"epoch": 1.3064419024683924,
"grad_norm": 4.5033464431762695,
"learning_rate": 8.515602715145615e-07,
"loss": 0.3661,
"step": 10850
},
{
"epoch": 1.3076459963877183,
"grad_norm": 4.6679558753967285,
"learning_rate": 8.511864915754399e-07,
"loss": 0.3835,
"step": 10860
},
{
"epoch": 1.308850090307044,
"grad_norm": 4.266571998596191,
"learning_rate": 8.50812323885045e-07,
"loss": 0.3799,
"step": 10870
},
{
"epoch": 1.3100541842263698,
"grad_norm": 4.90196418762207,
"learning_rate": 8.504377688565019e-07,
"loss": 0.3551,
"step": 10880
},
{
"epoch": 1.3112582781456954,
"grad_norm": 4.301276683807373,
"learning_rate": 8.500628269033635e-07,
"loss": 0.3825,
"step": 10890
},
{
"epoch": 1.312462372065021,
"grad_norm": 4.9276580810546875,
"learning_rate": 8.4968749843961e-07,
"loss": 0.37,
"step": 10900
},
{
"epoch": 1.3136664659843467,
"grad_norm": 4.929906845092773,
"learning_rate": 8.493117838796482e-07,
"loss": 0.3751,
"step": 10910
},
{
"epoch": 1.3148705599036725,
"grad_norm": 4.179794788360596,
"learning_rate": 8.489356836383112e-07,
"loss": 0.3714,
"step": 10920
},
{
"epoch": 1.3160746538229982,
"grad_norm": 4.671365261077881,
"learning_rate": 8.485591981308583e-07,
"loss": 0.3665,
"step": 10930
},
{
"epoch": 1.317278747742324,
"grad_norm": 4.073710918426514,
"learning_rate": 8.481823277729734e-07,
"loss": 0.3602,
"step": 10940
},
{
"epoch": 1.3184828416616496,
"grad_norm": 4.633068084716797,
"learning_rate": 8.478050729807663e-07,
"loss": 0.3682,
"step": 10950
},
{
"epoch": 1.3196869355809753,
"grad_norm": 5.233600616455078,
"learning_rate": 8.474274341707701e-07,
"loss": 0.3781,
"step": 10960
},
{
"epoch": 1.320891029500301,
"grad_norm": 4.329504013061523,
"learning_rate": 8.470494117599431e-07,
"loss": 0.3763,
"step": 10970
},
{
"epoch": 1.3220951234196268,
"grad_norm": 4.211668968200684,
"learning_rate": 8.466710061656664e-07,
"loss": 0.3325,
"step": 10980
},
{
"epoch": 1.3232992173389524,
"grad_norm": 4.388267993927002,
"learning_rate": 8.462922178057443e-07,
"loss": 0.3709,
"step": 10990
},
{
"epoch": 1.3245033112582782,
"grad_norm": 5.167718887329102,
"learning_rate": 8.45913047098404e-07,
"loss": 0.362,
"step": 11000
},
{
"epoch": 1.3257074051776039,
"grad_norm": 4.614595890045166,
"learning_rate": 8.455334944622945e-07,
"loss": 0.3549,
"step": 11010
},
{
"epoch": 1.3269114990969295,
"grad_norm": 4.618056774139404,
"learning_rate": 8.451535603164864e-07,
"loss": 0.3773,
"step": 11020
},
{
"epoch": 1.3281155930162551,
"grad_norm": 4.563729763031006,
"learning_rate": 8.447732450804723e-07,
"loss": 0.3688,
"step": 11030
},
{
"epoch": 1.329319686935581,
"grad_norm": 4.429327011108398,
"learning_rate": 8.443925491741646e-07,
"loss": 0.3429,
"step": 11040
},
{
"epoch": 1.3305237808549066,
"grad_norm": 4.474249362945557,
"learning_rate": 8.440114730178966e-07,
"loss": 0.3879,
"step": 11050
},
{
"epoch": 1.3317278747742325,
"grad_norm": 4.212963581085205,
"learning_rate": 8.436300170324215e-07,
"loss": 0.349,
"step": 11060
},
{
"epoch": 1.3329319686935581,
"grad_norm": 4.393470287322998,
"learning_rate": 8.432481816389112e-07,
"loss": 0.3609,
"step": 11070
},
{
"epoch": 1.3341360626128838,
"grad_norm": 4.512639045715332,
"learning_rate": 8.428659672589574e-07,
"loss": 0.3446,
"step": 11080
},
{
"epoch": 1.3353401565322094,
"grad_norm": 5.399291515350342,
"learning_rate": 8.424833743145696e-07,
"loss": 0.3643,
"step": 11090
},
{
"epoch": 1.3365442504515352,
"grad_norm": 4.692162990570068,
"learning_rate": 8.421004032281756e-07,
"loss": 0.3782,
"step": 11100
},
{
"epoch": 1.3377483443708609,
"grad_norm": 4.4849677085876465,
"learning_rate": 8.417170544226203e-07,
"loss": 0.36,
"step": 11110
},
{
"epoch": 1.3389524382901867,
"grad_norm": 4.692328453063965,
"learning_rate": 8.413333283211664e-07,
"loss": 0.3626,
"step": 11120
},
{
"epoch": 1.3401565322095124,
"grad_norm": 4.903812408447266,
"learning_rate": 8.409492253474925e-07,
"loss": 0.3576,
"step": 11130
},
{
"epoch": 1.341360626128838,
"grad_norm": 4.484142780303955,
"learning_rate": 8.405647459256937e-07,
"loss": 0.3611,
"step": 11140
},
{
"epoch": 1.3425647200481636,
"grad_norm": 4.777652263641357,
"learning_rate": 8.401798904802804e-07,
"loss": 0.3654,
"step": 11150
},
{
"epoch": 1.3437688139674895,
"grad_norm": 4.49363374710083,
"learning_rate": 8.397946594361785e-07,
"loss": 0.3684,
"step": 11160
},
{
"epoch": 1.3449729078868151,
"grad_norm": 5.207254886627197,
"learning_rate": 8.394090532187284e-07,
"loss": 0.3706,
"step": 11170
},
{
"epoch": 1.346177001806141,
"grad_norm": 5.246047496795654,
"learning_rate": 8.390230722536849e-07,
"loss": 0.365,
"step": 11180
},
{
"epoch": 1.3473810957254666,
"grad_norm": 4.5202317237854,
"learning_rate": 8.386367169672164e-07,
"loss": 0.3549,
"step": 11190
},
{
"epoch": 1.3485851896447922,
"grad_norm": 5.0257368087768555,
"learning_rate": 8.382499877859046e-07,
"loss": 0.3765,
"step": 11200
},
{
"epoch": 1.3497892835641179,
"grad_norm": 3.513502597808838,
"learning_rate": 8.378628851367441e-07,
"loss": 0.3435,
"step": 11210
},
{
"epoch": 1.3509933774834437,
"grad_norm": 4.943020820617676,
"learning_rate": 8.374754094471421e-07,
"loss": 0.3754,
"step": 11220
},
{
"epoch": 1.3521974714027694,
"grad_norm": 4.6621012687683105,
"learning_rate": 8.37087561144917e-07,
"loss": 0.3823,
"step": 11230
},
{
"epoch": 1.3534015653220952,
"grad_norm": 3.8831217288970947,
"learning_rate": 8.366993406582996e-07,
"loss": 0.3606,
"step": 11240
},
{
"epoch": 1.3546056592414208,
"grad_norm": 4.315981388092041,
"learning_rate": 8.363107484159305e-07,
"loss": 0.3647,
"step": 11250
},
{
"epoch": 1.3558097531607465,
"grad_norm": 4.6641011238098145,
"learning_rate": 8.359217848468616e-07,
"loss": 0.377,
"step": 11260
},
{
"epoch": 1.357013847080072,
"grad_norm": 4.609387397766113,
"learning_rate": 8.355324503805545e-07,
"loss": 0.369,
"step": 11270
},
{
"epoch": 1.358217940999398,
"grad_norm": 4.37289571762085,
"learning_rate": 8.351427454468805e-07,
"loss": 0.3594,
"step": 11280
},
{
"epoch": 1.3594220349187236,
"grad_norm": 5.407008171081543,
"learning_rate": 8.347526704761192e-07,
"loss": 0.3732,
"step": 11290
},
{
"epoch": 1.3606261288380495,
"grad_norm": 4.5802083015441895,
"learning_rate": 8.3436222589896e-07,
"loss": 0.3506,
"step": 11300
},
{
"epoch": 1.361830222757375,
"grad_norm": 4.10429048538208,
"learning_rate": 8.339714121464994e-07,
"loss": 0.3917,
"step": 11310
},
{
"epoch": 1.3630343166767007,
"grad_norm": 4.250566005706787,
"learning_rate": 8.335802296502419e-07,
"loss": 0.3515,
"step": 11320
},
{
"epoch": 1.3642384105960264,
"grad_norm": 5.012816429138184,
"learning_rate": 8.33188678842099e-07,
"loss": 0.354,
"step": 11330
},
{
"epoch": 1.3654425045153522,
"grad_norm": 4.53849983215332,
"learning_rate": 8.327967601543891e-07,
"loss": 0.3612,
"step": 11340
},
{
"epoch": 1.3666465984346778,
"grad_norm": 4.784470081329346,
"learning_rate": 8.324044740198364e-07,
"loss": 0.356,
"step": 11350
},
{
"epoch": 1.3678506923540037,
"grad_norm": 4.100750923156738,
"learning_rate": 8.320118208715714e-07,
"loss": 0.3769,
"step": 11360
},
{
"epoch": 1.3690547862733293,
"grad_norm": 5.738262176513672,
"learning_rate": 8.316188011431291e-07,
"loss": 0.3797,
"step": 11370
},
{
"epoch": 1.370258880192655,
"grad_norm": 4.102308750152588,
"learning_rate": 8.312254152684495e-07,
"loss": 0.3723,
"step": 11380
},
{
"epoch": 1.3714629741119808,
"grad_norm": 3.786195993423462,
"learning_rate": 8.308316636818773e-07,
"loss": 0.3638,
"step": 11390
},
{
"epoch": 1.3726670680313064,
"grad_norm": 4.1659159660339355,
"learning_rate": 8.304375468181606e-07,
"loss": 0.3487,
"step": 11400
},
{
"epoch": 1.373871161950632,
"grad_norm": 4.081630229949951,
"learning_rate": 8.300430651124505e-07,
"loss": 0.3602,
"step": 11410
},
{
"epoch": 1.375075255869958,
"grad_norm": 4.725644111633301,
"learning_rate": 8.296482190003019e-07,
"loss": 0.3746,
"step": 11420
},
{
"epoch": 1.3762793497892836,
"grad_norm": 4.421098709106445,
"learning_rate": 8.292530089176709e-07,
"loss": 0.3632,
"step": 11430
},
{
"epoch": 1.3774834437086092,
"grad_norm": 4.213558197021484,
"learning_rate": 8.288574353009164e-07,
"loss": 0.3748,
"step": 11440
},
{
"epoch": 1.378687537627935,
"grad_norm": 5.2602458000183105,
"learning_rate": 8.284614985867979e-07,
"loss": 0.355,
"step": 11450
},
{
"epoch": 1.3798916315472607,
"grad_norm": 4.735654354095459,
"learning_rate": 8.280651992124766e-07,
"loss": 0.3619,
"step": 11460
},
{
"epoch": 1.3810957254665863,
"grad_norm": 5.071203708648682,
"learning_rate": 8.276685376155133e-07,
"loss": 0.3693,
"step": 11470
},
{
"epoch": 1.3822998193859122,
"grad_norm": 4.431037902832031,
"learning_rate": 8.272715142338694e-07,
"loss": 0.3652,
"step": 11480
},
{
"epoch": 1.3835039133052378,
"grad_norm": 4.460841178894043,
"learning_rate": 8.268741295059056e-07,
"loss": 0.3732,
"step": 11490
},
{
"epoch": 1.3847080072245634,
"grad_norm": 5.048714637756348,
"learning_rate": 8.264763838703812e-07,
"loss": 0.364,
"step": 11500
},
{
"epoch": 1.3859121011438893,
"grad_norm": 4.322780132293701,
"learning_rate": 8.260782777664544e-07,
"loss": 0.3606,
"step": 11510
},
{
"epoch": 1.387116195063215,
"grad_norm": 4.763073921203613,
"learning_rate": 8.256798116336813e-07,
"loss": 0.3885,
"step": 11520
},
{
"epoch": 1.3883202889825406,
"grad_norm": 4.54296350479126,
"learning_rate": 8.252809859120153e-07,
"loss": 0.3629,
"step": 11530
},
{
"epoch": 1.3895243829018664,
"grad_norm": 4.481988430023193,
"learning_rate": 8.248818010418073e-07,
"loss": 0.3641,
"step": 11540
},
{
"epoch": 1.390728476821192,
"grad_norm": 4.431914806365967,
"learning_rate": 8.244822574638041e-07,
"loss": 0.3591,
"step": 11550
},
{
"epoch": 1.3919325707405177,
"grad_norm": 4.374257564544678,
"learning_rate": 8.240823556191489e-07,
"loss": 0.3634,
"step": 11560
},
{
"epoch": 1.3931366646598435,
"grad_norm": 3.9488606452941895,
"learning_rate": 8.23682095949381e-07,
"loss": 0.3466,
"step": 11570
},
{
"epoch": 1.3943407585791692,
"grad_norm": 4.069718837738037,
"learning_rate": 8.232814788964336e-07,
"loss": 0.3286,
"step": 11580
},
{
"epoch": 1.3955448524984948,
"grad_norm": 4.749855995178223,
"learning_rate": 8.228805049026355e-07,
"loss": 0.3546,
"step": 11590
},
{
"epoch": 1.3967489464178207,
"grad_norm": 3.9409117698669434,
"learning_rate": 8.224791744107089e-07,
"loss": 0.3663,
"step": 11600
},
{
"epoch": 1.3979530403371463,
"grad_norm": 4.028295993804932,
"learning_rate": 8.220774878637704e-07,
"loss": 0.3705,
"step": 11610
},
{
"epoch": 1.399157134256472,
"grad_norm": 4.911005973815918,
"learning_rate": 8.21675445705329e-07,
"loss": 0.3691,
"step": 11620
},
{
"epoch": 1.4003612281757978,
"grad_norm": 4.403053283691406,
"learning_rate": 8.212730483792868e-07,
"loss": 0.3736,
"step": 11630
},
{
"epoch": 1.4015653220951234,
"grad_norm": 4.316033840179443,
"learning_rate": 8.208702963299376e-07,
"loss": 0.373,
"step": 11640
},
{
"epoch": 1.402769416014449,
"grad_norm": 5.129039764404297,
"learning_rate": 8.204671900019676e-07,
"loss": 0.37,
"step": 11650
},
{
"epoch": 1.403973509933775,
"grad_norm": 4.374388694763184,
"learning_rate": 8.200637298404531e-07,
"loss": 0.3621,
"step": 11660
},
{
"epoch": 1.4051776038531005,
"grad_norm": 4.385969161987305,
"learning_rate": 8.19659916290862e-07,
"loss": 0.3744,
"step": 11670
},
{
"epoch": 1.4063816977724262,
"grad_norm": 4.8797149658203125,
"learning_rate": 8.192557497990521e-07,
"loss": 0.3554,
"step": 11680
},
{
"epoch": 1.407585791691752,
"grad_norm": 3.9471144676208496,
"learning_rate": 8.188512308112707e-07,
"loss": 0.3702,
"step": 11690
},
{
"epoch": 1.4087898856110777,
"grad_norm": 4.70519495010376,
"learning_rate": 8.184463597741544e-07,
"loss": 0.3422,
"step": 11700
},
{
"epoch": 1.4099939795304035,
"grad_norm": 5.044809818267822,
"learning_rate": 8.180411371347287e-07,
"loss": 0.3702,
"step": 11710
},
{
"epoch": 1.4111980734497291,
"grad_norm": 4.174710273742676,
"learning_rate": 8.17635563340407e-07,
"loss": 0.3513,
"step": 11720
},
{
"epoch": 1.4124021673690548,
"grad_norm": 4.635099411010742,
"learning_rate": 8.172296388389907e-07,
"loss": 0.3779,
"step": 11730
},
{
"epoch": 1.4136062612883804,
"grad_norm": 5.230491638183594,
"learning_rate": 8.168233640786682e-07,
"loss": 0.3601,
"step": 11740
},
{
"epoch": 1.4148103552077063,
"grad_norm": 4.704545497894287,
"learning_rate": 8.164167395080149e-07,
"loss": 0.3569,
"step": 11750
},
{
"epoch": 1.416014449127032,
"grad_norm": 4.232817649841309,
"learning_rate": 8.160097655759917e-07,
"loss": 0.374,
"step": 11760
},
{
"epoch": 1.4172185430463577,
"grad_norm": 5.304251670837402,
"learning_rate": 8.156024427319463e-07,
"loss": 0.3668,
"step": 11770
},
{
"epoch": 1.4184226369656834,
"grad_norm": 4.5971245765686035,
"learning_rate": 8.151947714256111e-07,
"loss": 0.3778,
"step": 11780
},
{
"epoch": 1.419626730885009,
"grad_norm": 4.492901802062988,
"learning_rate": 8.14786752107103e-07,
"loss": 0.3418,
"step": 11790
},
{
"epoch": 1.4208308248043346,
"grad_norm": 4.80876350402832,
"learning_rate": 8.143783852269237e-07,
"loss": 0.3633,
"step": 11800
},
{
"epoch": 1.4220349187236605,
"grad_norm": 3.9827497005462646,
"learning_rate": 8.13969671235958e-07,
"loss": 0.3649,
"step": 11810
},
{
"epoch": 1.4232390126429861,
"grad_norm": 4.20520544052124,
"learning_rate": 8.135606105854747e-07,
"loss": 0.3495,
"step": 11820
},
{
"epoch": 1.424443106562312,
"grad_norm": 4.29602575302124,
"learning_rate": 8.131512037271247e-07,
"loss": 0.3678,
"step": 11830
},
{
"epoch": 1.4256472004816376,
"grad_norm": 4.648280143737793,
"learning_rate": 8.127414511129416e-07,
"loss": 0.3789,
"step": 11840
},
{
"epoch": 1.4268512944009633,
"grad_norm": 4.162654399871826,
"learning_rate": 8.123313531953404e-07,
"loss": 0.372,
"step": 11850
},
{
"epoch": 1.4280553883202889,
"grad_norm": 4.688777446746826,
"learning_rate": 8.119209104271176e-07,
"loss": 0.3576,
"step": 11860
},
{
"epoch": 1.4292594822396147,
"grad_norm": 4.464323997497559,
"learning_rate": 8.115101232614506e-07,
"loss": 0.3817,
"step": 11870
},
{
"epoch": 1.4304635761589404,
"grad_norm": 4.280879974365234,
"learning_rate": 8.110989921518965e-07,
"loss": 0.3604,
"step": 11880
},
{
"epoch": 1.4316676700782662,
"grad_norm": 3.778425693511963,
"learning_rate": 8.106875175523926e-07,
"loss": 0.3553,
"step": 11890
},
{
"epoch": 1.4328717639975919,
"grad_norm": 4.960265159606934,
"learning_rate": 8.102756999172554e-07,
"loss": 0.3723,
"step": 11900
},
{
"epoch": 1.4340758579169175,
"grad_norm": 4.935343265533447,
"learning_rate": 8.098635397011802e-07,
"loss": 0.3714,
"step": 11910
},
{
"epoch": 1.4352799518362431,
"grad_norm": 4.417319297790527,
"learning_rate": 8.094510373592402e-07,
"loss": 0.3612,
"step": 11920
},
{
"epoch": 1.436484045755569,
"grad_norm": 4.819094181060791,
"learning_rate": 8.090381933468868e-07,
"loss": 0.3602,
"step": 11930
},
{
"epoch": 1.4376881396748946,
"grad_norm": 4.769229888916016,
"learning_rate": 8.086250081199484e-07,
"loss": 0.3597,
"step": 11940
},
{
"epoch": 1.4388922335942205,
"grad_norm": 4.872611999511719,
"learning_rate": 8.082114821346302e-07,
"loss": 0.3698,
"step": 11950
},
{
"epoch": 1.440096327513546,
"grad_norm": 4.3483686447143555,
"learning_rate": 8.077976158475135e-07,
"loss": 0.366,
"step": 11960
},
{
"epoch": 1.4413004214328717,
"grad_norm": 4.28345251083374,
"learning_rate": 8.073834097155555e-07,
"loss": 0.3564,
"step": 11970
},
{
"epoch": 1.4425045153521974,
"grad_norm": 4.1988606452941895,
"learning_rate": 8.069688641960888e-07,
"loss": 0.3557,
"step": 11980
},
{
"epoch": 1.4437086092715232,
"grad_norm": 4.156854152679443,
"learning_rate": 8.065539797468201e-07,
"loss": 0.3631,
"step": 11990
},
{
"epoch": 1.4449127031908489,
"grad_norm": 5.002780914306641,
"learning_rate": 8.061387568258312e-07,
"loss": 0.362,
"step": 12000
},
{
"epoch": 1.4461167971101747,
"grad_norm": 4.551509380340576,
"learning_rate": 8.057231958915767e-07,
"loss": 0.3545,
"step": 12010
},
{
"epoch": 1.4473208910295003,
"grad_norm": 3.529510498046875,
"learning_rate": 8.053072974028851e-07,
"loss": 0.3698,
"step": 12020
},
{
"epoch": 1.448524984948826,
"grad_norm": 5.073483467102051,
"learning_rate": 8.048910618189573e-07,
"loss": 0.3762,
"step": 12030
},
{
"epoch": 1.4497290788681516,
"grad_norm": 4.148519992828369,
"learning_rate": 8.044744895993665e-07,
"loss": 0.3714,
"step": 12040
},
{
"epoch": 1.4509331727874775,
"grad_norm": 5.03234338760376,
"learning_rate": 8.040575812040574e-07,
"loss": 0.3651,
"step": 12050
},
{
"epoch": 1.452137266706803,
"grad_norm": 4.286599159240723,
"learning_rate": 8.03640337093346e-07,
"loss": 0.3646,
"step": 12060
},
{
"epoch": 1.453341360626129,
"grad_norm": 5.805792808532715,
"learning_rate": 8.03222757727919e-07,
"loss": 0.3662,
"step": 12070
},
{
"epoch": 1.4545454545454546,
"grad_norm": 5.614697456359863,
"learning_rate": 8.028048435688333e-07,
"loss": 0.3661,
"step": 12080
},
{
"epoch": 1.4557495484647802,
"grad_norm": 4.117318630218506,
"learning_rate": 8.023865950775153e-07,
"loss": 0.3611,
"step": 12090
},
{
"epoch": 1.4569536423841059,
"grad_norm": 4.437227249145508,
"learning_rate": 8.019680127157606e-07,
"loss": 0.3551,
"step": 12100
},
{
"epoch": 1.4581577363034317,
"grad_norm": 4.852316856384277,
"learning_rate": 8.015490969457337e-07,
"loss": 0.3738,
"step": 12110
},
{
"epoch": 1.4593618302227573,
"grad_norm": 4.06812047958374,
"learning_rate": 8.011298482299666e-07,
"loss": 0.3535,
"step": 12120
},
{
"epoch": 1.4605659241420832,
"grad_norm": 4.921239376068115,
"learning_rate": 8.007102670313595e-07,
"loss": 0.3586,
"step": 12130
},
{
"epoch": 1.4617700180614088,
"grad_norm": 3.9317848682403564,
"learning_rate": 8.002903538131794e-07,
"loss": 0.3527,
"step": 12140
},
{
"epoch": 1.4629741119807345,
"grad_norm": 5.692650318145752,
"learning_rate": 7.998701090390601e-07,
"loss": 0.364,
"step": 12150
},
{
"epoch": 1.46417820590006,
"grad_norm": 4.238543510437012,
"learning_rate": 7.994495331730013e-07,
"loss": 0.3516,
"step": 12160
},
{
"epoch": 1.465382299819386,
"grad_norm": 4.356393814086914,
"learning_rate": 7.990286266793685e-07,
"loss": 0.3464,
"step": 12170
},
{
"epoch": 1.4665863937387116,
"grad_norm": 4.616797924041748,
"learning_rate": 7.986073900228916e-07,
"loss": 0.3465,
"step": 12180
},
{
"epoch": 1.4677904876580374,
"grad_norm": 3.8541862964630127,
"learning_rate": 7.981858236686661e-07,
"loss": 0.3546,
"step": 12190
},
{
"epoch": 1.468994581577363,
"grad_norm": 5.685515880584717,
"learning_rate": 7.977639280821505e-07,
"loss": 0.3563,
"step": 12200
},
{
"epoch": 1.4701986754966887,
"grad_norm": 4.1002702713012695,
"learning_rate": 7.973417037291672e-07,
"loss": 0.3771,
"step": 12210
},
{
"epoch": 1.4714027694160143,
"grad_norm": 4.752336025238037,
"learning_rate": 7.969191510759019e-07,
"loss": 0.366,
"step": 12220
},
{
"epoch": 1.4726068633353402,
"grad_norm": 4.7561774253845215,
"learning_rate": 7.964962705889027e-07,
"loss": 0.3621,
"step": 12230
},
{
"epoch": 1.4738109572546658,
"grad_norm": 4.569270133972168,
"learning_rate": 7.96073062735079e-07,
"loss": 0.3662,
"step": 12240
},
{
"epoch": 1.4750150511739917,
"grad_norm": 3.9785332679748535,
"learning_rate": 7.956495279817025e-07,
"loss": 0.3711,
"step": 12250
},
{
"epoch": 1.4762191450933173,
"grad_norm": 4.953578948974609,
"learning_rate": 7.952256667964053e-07,
"loss": 0.3671,
"step": 12260
},
{
"epoch": 1.477423239012643,
"grad_norm": 4.805257320404053,
"learning_rate": 7.948014796471802e-07,
"loss": 0.3707,
"step": 12270
},
{
"epoch": 1.4786273329319686,
"grad_norm": 4.094834804534912,
"learning_rate": 7.943769670023799e-07,
"loss": 0.3699,
"step": 12280
},
{
"epoch": 1.4798314268512944,
"grad_norm": 5.696323394775391,
"learning_rate": 7.939521293307161e-07,
"loss": 0.3753,
"step": 12290
},
{
"epoch": 1.48103552077062,
"grad_norm": 4.848500728607178,
"learning_rate": 7.935269671012599e-07,
"loss": 0.3643,
"step": 12300
},
{
"epoch": 1.482239614689946,
"grad_norm": 4.916533946990967,
"learning_rate": 7.931014807834404e-07,
"loss": 0.3621,
"step": 12310
},
{
"epoch": 1.4834437086092715,
"grad_norm": 4.234400272369385,
"learning_rate": 7.926756708470447e-07,
"loss": 0.3464,
"step": 12320
},
{
"epoch": 1.4846478025285972,
"grad_norm": 4.844507217407227,
"learning_rate": 7.922495377622171e-07,
"loss": 0.3535,
"step": 12330
},
{
"epoch": 1.4858518964479228,
"grad_norm": 5.471369743347168,
"learning_rate": 7.918230819994588e-07,
"loss": 0.3592,
"step": 12340
},
{
"epoch": 1.4870559903672487,
"grad_norm": 5.131628036499023,
"learning_rate": 7.913963040296272e-07,
"loss": 0.376,
"step": 12350
},
{
"epoch": 1.4882600842865743,
"grad_norm": 4.308112144470215,
"learning_rate": 7.909692043239353e-07,
"loss": 0.3526,
"step": 12360
},
{
"epoch": 1.4894641782059002,
"grad_norm": 4.5161356925964355,
"learning_rate": 7.905417833539518e-07,
"loss": 0.3548,
"step": 12370
},
{
"epoch": 1.4906682721252258,
"grad_norm": 4.657468795776367,
"learning_rate": 7.901140415915995e-07,
"loss": 0.3727,
"step": 12380
},
{
"epoch": 1.4918723660445514,
"grad_norm": 4.615851879119873,
"learning_rate": 7.896859795091562e-07,
"loss": 0.3728,
"step": 12390
},
{
"epoch": 1.493076459963877,
"grad_norm": 3.6912169456481934,
"learning_rate": 7.892575975792523e-07,
"loss": 0.3646,
"step": 12400
},
{
"epoch": 1.494280553883203,
"grad_norm": 4.871870517730713,
"learning_rate": 7.888288962748723e-07,
"loss": 0.3416,
"step": 12410
},
{
"epoch": 1.4954846478025285,
"grad_norm": 4.7089385986328125,
"learning_rate": 7.883998760693529e-07,
"loss": 0.3883,
"step": 12420
},
{
"epoch": 1.4966887417218544,
"grad_norm": 4.376954078674316,
"learning_rate": 7.87970537436383e-07,
"loss": 0.3427,
"step": 12430
},
{
"epoch": 1.49789283564118,
"grad_norm": 4.280700206756592,
"learning_rate": 7.875408808500028e-07,
"loss": 0.3651,
"step": 12440
},
{
"epoch": 1.4990969295605057,
"grad_norm": 4.794469356536865,
"learning_rate": 7.871109067846041e-07,
"loss": 0.3731,
"step": 12450
},
{
"epoch": 1.5003010234798313,
"grad_norm": 4.945312023162842,
"learning_rate": 7.86680615714929e-07,
"loss": 0.3586,
"step": 12460
},
{
"epoch": 1.5015051173991572,
"grad_norm": 3.5225555896759033,
"learning_rate": 7.862500081160692e-07,
"loss": 0.3595,
"step": 12470
},
{
"epoch": 1.502709211318483,
"grad_norm": 4.152462005615234,
"learning_rate": 7.858190844634664e-07,
"loss": 0.3777,
"step": 12480
},
{
"epoch": 1.5039133052378086,
"grad_norm": 5.704073905944824,
"learning_rate": 7.853878452329113e-07,
"loss": 0.375,
"step": 12490
},
{
"epoch": 1.5051173991571343,
"grad_norm": 5.431835174560547,
"learning_rate": 7.849562909005425e-07,
"loss": 0.3596,
"step": 12500
},
{
"epoch": 1.50632149307646,
"grad_norm": 4.785493850708008,
"learning_rate": 7.845244219428469e-07,
"loss": 0.3888,
"step": 12510
},
{
"epoch": 1.5075255869957855,
"grad_norm": 4.297571182250977,
"learning_rate": 7.84092238836659e-07,
"loss": 0.3576,
"step": 12520
},
{
"epoch": 1.5087296809151114,
"grad_norm": 4.948373317718506,
"learning_rate": 7.836597420591595e-07,
"loss": 0.3766,
"step": 12530
},
{
"epoch": 1.5099337748344372,
"grad_norm": 4.502270698547363,
"learning_rate": 7.832269320878762e-07,
"loss": 0.3624,
"step": 12540
},
{
"epoch": 1.5111378687537629,
"grad_norm": 3.8894035816192627,
"learning_rate": 7.827938094006821e-07,
"loss": 0.3743,
"step": 12550
},
{
"epoch": 1.5123419626730885,
"grad_norm": 4.615447044372559,
"learning_rate": 7.823603744757956e-07,
"loss": 0.3586,
"step": 12560
},
{
"epoch": 1.5135460565924141,
"grad_norm": 4.9232401847839355,
"learning_rate": 7.8192662779178e-07,
"loss": 0.3488,
"step": 12570
},
{
"epoch": 1.5147501505117398,
"grad_norm": 4.241856575012207,
"learning_rate": 7.81492569827543e-07,
"loss": 0.355,
"step": 12580
},
{
"epoch": 1.5159542444310656,
"grad_norm": 5.041738986968994,
"learning_rate": 7.810582010623354e-07,
"loss": 0.3755,
"step": 12590
},
{
"epoch": 1.5171583383503915,
"grad_norm": 4.944552421569824,
"learning_rate": 7.806235219757518e-07,
"loss": 0.3643,
"step": 12600
},
{
"epoch": 1.5183624322697171,
"grad_norm": 5.554732799530029,
"learning_rate": 7.801885330477289e-07,
"loss": 0.3687,
"step": 12610
},
{
"epoch": 1.5195665261890428,
"grad_norm": 6.034419059753418,
"learning_rate": 7.797532347585459e-07,
"loss": 0.3595,
"step": 12620
},
{
"epoch": 1.5207706201083684,
"grad_norm": 4.2550048828125,
"learning_rate": 7.793176275888231e-07,
"loss": 0.3727,
"step": 12630
},
{
"epoch": 1.521974714027694,
"grad_norm": 4.084836006164551,
"learning_rate": 7.788817120195226e-07,
"loss": 0.3646,
"step": 12640
},
{
"epoch": 1.5231788079470199,
"grad_norm": 4.183859825134277,
"learning_rate": 7.784454885319464e-07,
"loss": 0.3846,
"step": 12650
},
{
"epoch": 1.5243829018663457,
"grad_norm": 4.216065406799316,
"learning_rate": 7.780089576077364e-07,
"loss": 0.3794,
"step": 12660
},
{
"epoch": 1.5255869957856714,
"grad_norm": 4.975666522979736,
"learning_rate": 7.775721197288744e-07,
"loss": 0.3903,
"step": 12670
},
{
"epoch": 1.526791089704997,
"grad_norm": 4.360125541687012,
"learning_rate": 7.77134975377681e-07,
"loss": 0.3481,
"step": 12680
},
{
"epoch": 1.5279951836243226,
"grad_norm": 5.113675594329834,
"learning_rate": 7.766975250368149e-07,
"loss": 0.3624,
"step": 12690
},
{
"epoch": 1.5291992775436483,
"grad_norm": 4.466128349304199,
"learning_rate": 7.76259769189273e-07,
"loss": 0.3619,
"step": 12700
},
{
"epoch": 1.5304033714629741,
"grad_norm": 4.945206165313721,
"learning_rate": 7.758217083183891e-07,
"loss": 0.358,
"step": 12710
},
{
"epoch": 1.5316074653823,
"grad_norm": 4.3737287521362305,
"learning_rate": 7.753833429078342e-07,
"loss": 0.3566,
"step": 12720
},
{
"epoch": 1.5328115593016256,
"grad_norm": 4.813685894012451,
"learning_rate": 7.749446734416152e-07,
"loss": 0.344,
"step": 12730
},
{
"epoch": 1.5340156532209512,
"grad_norm": 3.858191728591919,
"learning_rate": 7.745057004040751e-07,
"loss": 0.3461,
"step": 12740
},
{
"epoch": 1.5352197471402769,
"grad_norm": 4.396629333496094,
"learning_rate": 7.740664242798919e-07,
"loss": 0.3496,
"step": 12750
},
{
"epoch": 1.5364238410596025,
"grad_norm": 4.17794132232666,
"learning_rate": 7.73626845554078e-07,
"loss": 0.3584,
"step": 12760
},
{
"epoch": 1.5376279349789284,
"grad_norm": 6.110503673553467,
"learning_rate": 7.731869647119801e-07,
"loss": 0.3741,
"step": 12770
},
{
"epoch": 1.5388320288982542,
"grad_norm": 4.858775615692139,
"learning_rate": 7.727467822392787e-07,
"loss": 0.3489,
"step": 12780
},
{
"epoch": 1.5400361228175798,
"grad_norm": 4.899129390716553,
"learning_rate": 7.723062986219871e-07,
"loss": 0.3574,
"step": 12790
},
{
"epoch": 1.5412402167369055,
"grad_norm": 4.589954853057861,
"learning_rate": 7.718655143464508e-07,
"loss": 0.3697,
"step": 12800
},
{
"epoch": 1.542444310656231,
"grad_norm": 4.615177154541016,
"learning_rate": 7.71424429899348e-07,
"loss": 0.3574,
"step": 12810
},
{
"epoch": 1.5436484045755567,
"grad_norm": 5.081363201141357,
"learning_rate": 7.709830457676876e-07,
"loss": 0.3793,
"step": 12820
},
{
"epoch": 1.5448524984948826,
"grad_norm": 5.210774898529053,
"learning_rate": 7.7054136243881e-07,
"loss": 0.3562,
"step": 12830
},
{
"epoch": 1.5460565924142085,
"grad_norm": 4.458885192871094,
"learning_rate": 7.700993804003855e-07,
"loss": 0.3619,
"step": 12840
},
{
"epoch": 1.547260686333534,
"grad_norm": 4.320379734039307,
"learning_rate": 7.696571001404142e-07,
"loss": 0.3629,
"step": 12850
},
{
"epoch": 1.5484647802528597,
"grad_norm": 4.779387474060059,
"learning_rate": 7.692145221472258e-07,
"loss": 0.3633,
"step": 12860
},
{
"epoch": 1.5496688741721854,
"grad_norm": 4.924083709716797,
"learning_rate": 7.687716469094786e-07,
"loss": 0.3624,
"step": 12870
},
{
"epoch": 1.550872968091511,
"grad_norm": 5.194228649139404,
"learning_rate": 7.68328474916159e-07,
"loss": 0.3592,
"step": 12880
},
{
"epoch": 1.5520770620108368,
"grad_norm": 4.606070041656494,
"learning_rate": 7.67885006656581e-07,
"loss": 0.3686,
"step": 12890
},
{
"epoch": 1.5532811559301627,
"grad_norm": 4.206083297729492,
"learning_rate": 7.674412426203859e-07,
"loss": 0.3551,
"step": 12900
},
{
"epoch": 1.5544852498494883,
"grad_norm": 4.67086124420166,
"learning_rate": 7.669971832975416e-07,
"loss": 0.3569,
"step": 12910
},
{
"epoch": 1.555689343768814,
"grad_norm": 5.904470443725586,
"learning_rate": 7.665528291783417e-07,
"loss": 0.3407,
"step": 12920
},
{
"epoch": 1.5568934376881396,
"grad_norm": 4.242117404937744,
"learning_rate": 7.661081807534058e-07,
"loss": 0.3422,
"step": 12930
},
{
"epoch": 1.5580975316074652,
"grad_norm": 4.790373802185059,
"learning_rate": 7.656632385136778e-07,
"loss": 0.3573,
"step": 12940
},
{
"epoch": 1.559301625526791,
"grad_norm": 4.904318809509277,
"learning_rate": 7.652180029504268e-07,
"loss": 0.3606,
"step": 12950
},
{
"epoch": 1.560505719446117,
"grad_norm": 4.863579750061035,
"learning_rate": 7.64772474555245e-07,
"loss": 0.361,
"step": 12960
},
{
"epoch": 1.5617098133654426,
"grad_norm": 5.459078311920166,
"learning_rate": 7.643266538200483e-07,
"loss": 0.3577,
"step": 12970
},
{
"epoch": 1.5629139072847682,
"grad_norm": 5.426388740539551,
"learning_rate": 7.638805412370755e-07,
"loss": 0.3725,
"step": 12980
},
{
"epoch": 1.5641180012040938,
"grad_norm": 4.903288841247559,
"learning_rate": 7.634341372988872e-07,
"loss": 0.3562,
"step": 12990
},
{
"epoch": 1.5653220951234195,
"grad_norm": 4.128101825714111,
"learning_rate": 7.629874424983664e-07,
"loss": 0.3405,
"step": 13000
},
{
"epoch": 1.5665261890427453,
"grad_norm": 4.6488213539123535,
"learning_rate": 7.625404573287163e-07,
"loss": 0.3731,
"step": 13010
},
{
"epoch": 1.5677302829620712,
"grad_norm": 4.610156059265137,
"learning_rate": 7.620931822834614e-07,
"loss": 0.3575,
"step": 13020
},
{
"epoch": 1.5689343768813968,
"grad_norm": 5.422335147857666,
"learning_rate": 7.616456178564462e-07,
"loss": 0.3833,
"step": 13030
},
{
"epoch": 1.5701384708007224,
"grad_norm": 4.844593048095703,
"learning_rate": 7.611977645418343e-07,
"loss": 0.3647,
"step": 13040
},
{
"epoch": 1.571342564720048,
"grad_norm": 4.274131774902344,
"learning_rate": 7.607496228341088e-07,
"loss": 0.3542,
"step": 13050
},
{
"epoch": 1.5725466586393737,
"grad_norm": 4.641569137573242,
"learning_rate": 7.60301193228071e-07,
"loss": 0.3704,
"step": 13060
},
{
"epoch": 1.5737507525586996,
"grad_norm": 4.771531105041504,
"learning_rate": 7.598524762188395e-07,
"loss": 0.3529,
"step": 13070
},
{
"epoch": 1.5749548464780254,
"grad_norm": 5.63432502746582,
"learning_rate": 7.594034723018514e-07,
"loss": 0.3554,
"step": 13080
},
{
"epoch": 1.576158940397351,
"grad_norm": 3.5664002895355225,
"learning_rate": 7.589541819728596e-07,
"loss": 0.3617,
"step": 13090
},
{
"epoch": 1.5773630343166767,
"grad_norm": 4.43233060836792,
"learning_rate": 7.585046057279337e-07,
"loss": 0.3795,
"step": 13100
},
{
"epoch": 1.5785671282360023,
"grad_norm": 4.293588638305664,
"learning_rate": 7.580547440634587e-07,
"loss": 0.3361,
"step": 13110
},
{
"epoch": 1.5797712221553282,
"grad_norm": 4.606287479400635,
"learning_rate": 7.576045974761351e-07,
"loss": 0.3573,
"step": 13120
},
{
"epoch": 1.5809753160746538,
"grad_norm": 4.9702558517456055,
"learning_rate": 7.571541664629775e-07,
"loss": 0.3718,
"step": 13130
},
{
"epoch": 1.5821794099939797,
"grad_norm": 4.685069561004639,
"learning_rate": 7.567034515213151e-07,
"loss": 0.3704,
"step": 13140
},
{
"epoch": 1.5833835039133053,
"grad_norm": 4.804528713226318,
"learning_rate": 7.562524531487902e-07,
"loss": 0.3511,
"step": 13150
},
{
"epoch": 1.584587597832631,
"grad_norm": 5.332268714904785,
"learning_rate": 7.558011718433582e-07,
"loss": 0.3573,
"step": 13160
},
{
"epoch": 1.5857916917519566,
"grad_norm": 4.4862284660339355,
"learning_rate": 7.553496081032867e-07,
"loss": 0.3423,
"step": 13170
},
{
"epoch": 1.5869957856712824,
"grad_norm": 4.632198810577393,
"learning_rate": 7.548977624271555e-07,
"loss": 0.3719,
"step": 13180
},
{
"epoch": 1.588199879590608,
"grad_norm": 4.4371137619018555,
"learning_rate": 7.544456353138553e-07,
"loss": 0.3515,
"step": 13190
},
{
"epoch": 1.589403973509934,
"grad_norm": 4.162461757659912,
"learning_rate": 7.539932272625879e-07,
"loss": 0.363,
"step": 13200
},
{
"epoch": 1.5906080674292595,
"grad_norm": 4.980907917022705,
"learning_rate": 7.535405387728648e-07,
"loss": 0.362,
"step": 13210
},
{
"epoch": 1.5918121613485852,
"grad_norm": 4.321689128875732,
"learning_rate": 7.530875703445077e-07,
"loss": 0.3441,
"step": 13220
},
{
"epoch": 1.5930162552679108,
"grad_norm": 4.930966854095459,
"learning_rate": 7.526343224776471e-07,
"loss": 0.3505,
"step": 13230
},
{
"epoch": 1.5942203491872367,
"grad_norm": 4.267889499664307,
"learning_rate": 7.52180795672722e-07,
"loss": 0.3678,
"step": 13240
},
{
"epoch": 1.5954244431065623,
"grad_norm": 3.8834383487701416,
"learning_rate": 7.517269904304794e-07,
"loss": 0.3648,
"step": 13250
},
{
"epoch": 1.5966285370258881,
"grad_norm": 4.397730827331543,
"learning_rate": 7.512729072519739e-07,
"loss": 0.3601,
"step": 13260
},
{
"epoch": 1.5978326309452138,
"grad_norm": 4.559187889099121,
"learning_rate": 7.508185466385666e-07,
"loss": 0.3508,
"step": 13270
},
{
"epoch": 1.5990367248645394,
"grad_norm": 4.514613628387451,
"learning_rate": 7.503639090919255e-07,
"loss": 0.3578,
"step": 13280
},
{
"epoch": 1.600240818783865,
"grad_norm": 4.5233073234558105,
"learning_rate": 7.499089951140237e-07,
"loss": 0.3516,
"step": 13290
},
{
"epoch": 1.601444912703191,
"grad_norm": 4.616694450378418,
"learning_rate": 7.494538052071402e-07,
"loss": 0.3616,
"step": 13300
},
{
"epoch": 1.6026490066225165,
"grad_norm": 4.6488518714904785,
"learning_rate": 7.489983398738579e-07,
"loss": 0.3582,
"step": 13310
},
{
"epoch": 1.6038531005418424,
"grad_norm": 4.645969390869141,
"learning_rate": 7.485425996170644e-07,
"loss": 0.3548,
"step": 13320
},
{
"epoch": 1.605057194461168,
"grad_norm": 5.864965438842773,
"learning_rate": 7.480865849399507e-07,
"loss": 0.3587,
"step": 13330
},
{
"epoch": 1.6062612883804936,
"grad_norm": 4.283803939819336,
"learning_rate": 7.476302963460108e-07,
"loss": 0.3626,
"step": 13340
},
{
"epoch": 1.6074653822998193,
"grad_norm": 4.545533657073975,
"learning_rate": 7.47173734339041e-07,
"loss": 0.3526,
"step": 13350
},
{
"epoch": 1.6086694762191451,
"grad_norm": 4.885293483734131,
"learning_rate": 7.467168994231393e-07,
"loss": 0.3685,
"step": 13360
},
{
"epoch": 1.6098735701384708,
"grad_norm": 4.112198829650879,
"learning_rate": 7.462597921027056e-07,
"loss": 0.3727,
"step": 13370
},
{
"epoch": 1.6110776640577966,
"grad_norm": 4.272058963775635,
"learning_rate": 7.458024128824403e-07,
"loss": 0.3567,
"step": 13380
},
{
"epoch": 1.6122817579771223,
"grad_norm": 4.891336441040039,
"learning_rate": 7.453447622673438e-07,
"loss": 0.3566,
"step": 13390
},
{
"epoch": 1.6134858518964479,
"grad_norm": 5.003636837005615,
"learning_rate": 7.448868407627163e-07,
"loss": 0.3717,
"step": 13400
},
{
"epoch": 1.6146899458157735,
"grad_norm": 3.9844002723693848,
"learning_rate": 7.444286488741571e-07,
"loss": 0.3537,
"step": 13410
},
{
"epoch": 1.6158940397350994,
"grad_norm": 4.326488018035889,
"learning_rate": 7.439701871075641e-07,
"loss": 0.3353,
"step": 13420
},
{
"epoch": 1.617098133654425,
"grad_norm": 4.168161392211914,
"learning_rate": 7.435114559691333e-07,
"loss": 0.3506,
"step": 13430
},
{
"epoch": 1.6183022275737509,
"grad_norm": 5.062152862548828,
"learning_rate": 7.430524559653575e-07,
"loss": 0.3536,
"step": 13440
},
{
"epoch": 1.6195063214930765,
"grad_norm": 5.29563570022583,
"learning_rate": 7.425931876030272e-07,
"loss": 0.359,
"step": 13450
},
{
"epoch": 1.6207104154124021,
"grad_norm": 4.655216693878174,
"learning_rate": 7.421336513892284e-07,
"loss": 0.3459,
"step": 13460
},
{
"epoch": 1.6219145093317278,
"grad_norm": 4.558264255523682,
"learning_rate": 7.416738478313438e-07,
"loss": 0.3603,
"step": 13470
},
{
"epoch": 1.6231186032510536,
"grad_norm": 4.36596155166626,
"learning_rate": 7.412137774370501e-07,
"loss": 0.3632,
"step": 13480
},
{
"epoch": 1.6243226971703792,
"grad_norm": 4.248297214508057,
"learning_rate": 7.407534407143198e-07,
"loss": 0.3575,
"step": 13490
},
{
"epoch": 1.625526791089705,
"grad_norm": 4.935293197631836,
"learning_rate": 7.402928381714184e-07,
"loss": 0.3583,
"step": 13500
},
{
"epoch": 1.6267308850090307,
"grad_norm": 4.29832649230957,
"learning_rate": 7.398319703169057e-07,
"loss": 0.3593,
"step": 13510
},
{
"epoch": 1.6279349789283564,
"grad_norm": 4.707507610321045,
"learning_rate": 7.39370837659634e-07,
"loss": 0.3486,
"step": 13520
},
{
"epoch": 1.629139072847682,
"grad_norm": 4.7867326736450195,
"learning_rate": 7.389094407087481e-07,
"loss": 0.3708,
"step": 13530
},
{
"epoch": 1.6303431667670079,
"grad_norm": 5.004173755645752,
"learning_rate": 7.384477799736847e-07,
"loss": 0.3693,
"step": 13540
},
{
"epoch": 1.6315472606863335,
"grad_norm": 4.378966331481934,
"learning_rate": 7.379858559641716e-07,
"loss": 0.3792,
"step": 13550
},
{
"epoch": 1.6327513546056593,
"grad_norm": 4.35708475112915,
"learning_rate": 7.375236691902272e-07,
"loss": 0.357,
"step": 13560
},
{
"epoch": 1.633955448524985,
"grad_norm": 4.158879280090332,
"learning_rate": 7.370612201621606e-07,
"loss": 0.3705,
"step": 13570
},
{
"epoch": 1.6351595424443106,
"grad_norm": 4.620648384094238,
"learning_rate": 7.365985093905693e-07,
"loss": 0.3288,
"step": 13580
},
{
"epoch": 1.6363636363636362,
"grad_norm": 4.588129997253418,
"learning_rate": 7.361355373863413e-07,
"loss": 0.3545,
"step": 13590
},
{
"epoch": 1.637567730282962,
"grad_norm": 4.273639678955078,
"learning_rate": 7.356723046606517e-07,
"loss": 0.3597,
"step": 13600
},
{
"epoch": 1.6387718242022877,
"grad_norm": 4.793459415435791,
"learning_rate": 7.352088117249644e-07,
"loss": 0.3532,
"step": 13610
},
{
"epoch": 1.6399759181216136,
"grad_norm": 4.27385950088501,
"learning_rate": 7.347450590910299e-07,
"loss": 0.3787,
"step": 13620
},
{
"epoch": 1.6411800120409392,
"grad_norm": 4.229093551635742,
"learning_rate": 7.34281047270886e-07,
"loss": 0.3592,
"step": 13630
},
{
"epoch": 1.6423841059602649,
"grad_norm": 4.402678489685059,
"learning_rate": 7.338167767768564e-07,
"loss": 0.3612,
"step": 13640
},
{
"epoch": 1.6435881998795905,
"grad_norm": 4.09978723526001,
"learning_rate": 7.333522481215503e-07,
"loss": 0.3571,
"step": 13650
},
{
"epoch": 1.6447922937989163,
"grad_norm": 4.659477710723877,
"learning_rate": 7.32887461817862e-07,
"loss": 0.3725,
"step": 13660
},
{
"epoch": 1.645996387718242,
"grad_norm": 4.500072002410889,
"learning_rate": 7.324224183789707e-07,
"loss": 0.3458,
"step": 13670
},
{
"epoch": 1.6472004816375678,
"grad_norm": 5.1016526222229,
"learning_rate": 7.319571183183388e-07,
"loss": 0.3734,
"step": 13680
},
{
"epoch": 1.6484045755568935,
"grad_norm": 4.819193363189697,
"learning_rate": 7.314915621497129e-07,
"loss": 0.3601,
"step": 13690
},
{
"epoch": 1.649608669476219,
"grad_norm": 4.4075026512146,
"learning_rate": 7.310257503871214e-07,
"loss": 0.3556,
"step": 13700
},
{
"epoch": 1.6508127633955447,
"grad_norm": 4.471024036407471,
"learning_rate": 7.305596835448753e-07,
"loss": 0.3625,
"step": 13710
},
{
"epoch": 1.6520168573148706,
"grad_norm": 4.29016637802124,
"learning_rate": 7.300933621375676e-07,
"loss": 0.3619,
"step": 13720
},
{
"epoch": 1.6532209512341962,
"grad_norm": 4.514208793640137,
"learning_rate": 7.296267866800722e-07,
"loss": 0.3622,
"step": 13730
},
{
"epoch": 1.654425045153522,
"grad_norm": 4.275468826293945,
"learning_rate": 7.291599576875432e-07,
"loss": 0.3667,
"step": 13740
},
{
"epoch": 1.6556291390728477,
"grad_norm": 4.0805559158325195,
"learning_rate": 7.286928756754148e-07,
"loss": 0.371,
"step": 13750
},
{
"epoch": 1.6568332329921733,
"grad_norm": 4.84345006942749,
"learning_rate": 7.282255411594006e-07,
"loss": 0.3696,
"step": 13760
},
{
"epoch": 1.658037326911499,
"grad_norm": 4.703734874725342,
"learning_rate": 7.277579546554931e-07,
"loss": 0.3673,
"step": 13770
},
{
"epoch": 1.6592414208308248,
"grad_norm": 4.18894624710083,
"learning_rate": 7.272901166799627e-07,
"loss": 0.3365,
"step": 13780
},
{
"epoch": 1.6604455147501507,
"grad_norm": 4.9901204109191895,
"learning_rate": 7.268220277493578e-07,
"loss": 0.3588,
"step": 13790
},
{
"epoch": 1.6616496086694763,
"grad_norm": 4.896132946014404,
"learning_rate": 7.263536883805039e-07,
"loss": 0.3659,
"step": 13800
},
{
"epoch": 1.662853702588802,
"grad_norm": 4.311833381652832,
"learning_rate": 7.258850990905025e-07,
"loss": 0.3707,
"step": 13810
},
{
"epoch": 1.6640577965081276,
"grad_norm": 4.157628059387207,
"learning_rate": 7.254162603967317e-07,
"loss": 0.3498,
"step": 13820
},
{
"epoch": 1.6652618904274532,
"grad_norm": 5.240469932556152,
"learning_rate": 7.249471728168443e-07,
"loss": 0.3559,
"step": 13830
},
{
"epoch": 1.666465984346779,
"grad_norm": 4.077708721160889,
"learning_rate": 7.244778368687687e-07,
"loss": 0.3745,
"step": 13840
},
{
"epoch": 1.667670078266105,
"grad_norm": 4.9550395011901855,
"learning_rate": 7.240082530707069e-07,
"loss": 0.3563,
"step": 13850
},
{
"epoch": 1.6688741721854305,
"grad_norm": 5.530270576477051,
"learning_rate": 7.235384219411348e-07,
"loss": 0.3764,
"step": 13860
},
{
"epoch": 1.6700782661047562,
"grad_norm": 4.50790548324585,
"learning_rate": 7.230683439988012e-07,
"loss": 0.3471,
"step": 13870
},
{
"epoch": 1.6712823600240818,
"grad_norm": 4.373943328857422,
"learning_rate": 7.225980197627277e-07,
"loss": 0.3601,
"step": 13880
},
{
"epoch": 1.6724864539434074,
"grad_norm": 3.9449055194854736,
"learning_rate": 7.221274497522076e-07,
"loss": 0.3533,
"step": 13890
},
{
"epoch": 1.6736905478627333,
"grad_norm": 4.625890254974365,
"learning_rate": 7.216566344868058e-07,
"loss": 0.3771,
"step": 13900
},
{
"epoch": 1.6748946417820592,
"grad_norm": 4.7843475341796875,
"learning_rate": 7.211855744863577e-07,
"loss": 0.3477,
"step": 13910
},
{
"epoch": 1.6760987357013848,
"grad_norm": 4.275618076324463,
"learning_rate": 7.207142702709688e-07,
"loss": 0.3452,
"step": 13920
},
{
"epoch": 1.6773028296207104,
"grad_norm": 5.26132869720459,
"learning_rate": 7.202427223610152e-07,
"loss": 0.3568,
"step": 13930
},
{
"epoch": 1.678506923540036,
"grad_norm": 4.528031826019287,
"learning_rate": 7.197709312771406e-07,
"loss": 0.347,
"step": 13940
},
{
"epoch": 1.6797110174593617,
"grad_norm": 4.68961763381958,
"learning_rate": 7.192988975402583e-07,
"loss": 0.3687,
"step": 13950
},
{
"epoch": 1.6809151113786875,
"grad_norm": 4.3820719718933105,
"learning_rate": 7.188266216715493e-07,
"loss": 0.3572,
"step": 13960
},
{
"epoch": 1.6821192052980134,
"grad_norm": 3.974177598953247,
"learning_rate": 7.183541041924616e-07,
"loss": 0.34,
"step": 13970
},
{
"epoch": 1.683323299217339,
"grad_norm": 4.8562331199646,
"learning_rate": 7.178813456247102e-07,
"loss": 0.3532,
"step": 13980
},
{
"epoch": 1.6845273931366647,
"grad_norm": 3.9439549446105957,
"learning_rate": 7.174083464902763e-07,
"loss": 0.3459,
"step": 13990
},
{
"epoch": 1.6857314870559903,
"grad_norm": 4.226308345794678,
"learning_rate": 7.16935107311407e-07,
"loss": 0.3352,
"step": 14000
},
{
"epoch": 1.686935580975316,
"grad_norm": 4.850135326385498,
"learning_rate": 7.164616286106135e-07,
"loss": 0.3661,
"step": 14010
},
{
"epoch": 1.6881396748946418,
"grad_norm": 4.845891952514648,
"learning_rate": 7.159879109106725e-07,
"loss": 0.3868,
"step": 14020
},
{
"epoch": 1.6893437688139676,
"grad_norm": 5.063507556915283,
"learning_rate": 7.155139547346242e-07,
"loss": 0.3628,
"step": 14030
},
{
"epoch": 1.6905478627332933,
"grad_norm": 4.6817216873168945,
"learning_rate": 7.15039760605772e-07,
"loss": 0.3744,
"step": 14040
},
{
"epoch": 1.691751956652619,
"grad_norm": 4.315075874328613,
"learning_rate": 7.145653290476819e-07,
"loss": 0.3613,
"step": 14050
},
{
"epoch": 1.6929560505719445,
"grad_norm": 4.234760284423828,
"learning_rate": 7.140906605841825e-07,
"loss": 0.3733,
"step": 14060
},
{
"epoch": 1.6941601444912702,
"grad_norm": 5.843511581420898,
"learning_rate": 7.136157557393637e-07,
"loss": 0.3443,
"step": 14070
},
{
"epoch": 1.695364238410596,
"grad_norm": 4.704221248626709,
"learning_rate": 7.131406150375762e-07,
"loss": 0.3384,
"step": 14080
},
{
"epoch": 1.6965683323299219,
"grad_norm": 4.1078200340271,
"learning_rate": 7.126652390034316e-07,
"loss": 0.3554,
"step": 14090
},
{
"epoch": 1.6977724262492475,
"grad_norm": 4.6124773025512695,
"learning_rate": 7.12189628161801e-07,
"loss": 0.3323,
"step": 14100
},
{
"epoch": 1.6989765201685731,
"grad_norm": 3.9569902420043945,
"learning_rate": 7.117137830378146e-07,
"loss": 0.3581,
"step": 14110
},
{
"epoch": 1.7001806140878988,
"grad_norm": 4.327024459838867,
"learning_rate": 7.112377041568617e-07,
"loss": 0.3605,
"step": 14120
},
{
"epoch": 1.7013847080072244,
"grad_norm": 4.041974067687988,
"learning_rate": 7.107613920445895e-07,
"loss": 0.3514,
"step": 14130
},
{
"epoch": 1.7025888019265503,
"grad_norm": 4.295658588409424,
"learning_rate": 7.102848472269026e-07,
"loss": 0.3489,
"step": 14140
},
{
"epoch": 1.7037928958458761,
"grad_norm": 4.117722988128662,
"learning_rate": 7.098080702299628e-07,
"loss": 0.3382,
"step": 14150
},
{
"epoch": 1.7049969897652018,
"grad_norm": 5.249290943145752,
"learning_rate": 7.093310615801879e-07,
"loss": 0.3696,
"step": 14160
},
{
"epoch": 1.7062010836845274,
"grad_norm": 3.8647286891937256,
"learning_rate": 7.088538218042518e-07,
"loss": 0.3403,
"step": 14170
},
{
"epoch": 1.707405177603853,
"grad_norm": 4.454891204833984,
"learning_rate": 7.083763514290834e-07,
"loss": 0.3743,
"step": 14180
},
{
"epoch": 1.7086092715231787,
"grad_norm": 4.183931827545166,
"learning_rate": 7.078986509818662e-07,
"loss": 0.3493,
"step": 14190
},
{
"epoch": 1.7098133654425045,
"grad_norm": 3.9510889053344727,
"learning_rate": 7.074207209900379e-07,
"loss": 0.3469,
"step": 14200
},
{
"epoch": 1.7110174593618304,
"grad_norm": 4.839264869689941,
"learning_rate": 7.069425619812896e-07,
"loss": 0.3444,
"step": 14210
},
{
"epoch": 1.712221553281156,
"grad_norm": 4.237350940704346,
"learning_rate": 7.064641744835649e-07,
"loss": 0.3474,
"step": 14220
},
{
"epoch": 1.7134256472004816,
"grad_norm": 4.17114782333374,
"learning_rate": 7.059855590250603e-07,
"loss": 0.3465,
"step": 14230
},
{
"epoch": 1.7146297411198073,
"grad_norm": 4.114003658294678,
"learning_rate": 7.055067161342233e-07,
"loss": 0.3674,
"step": 14240
},
{
"epoch": 1.715833835039133,
"grad_norm": 4.886813640594482,
"learning_rate": 7.050276463397533e-07,
"loss": 0.3848,
"step": 14250
},
{
"epoch": 1.7170379289584587,
"grad_norm": 4.069955348968506,
"learning_rate": 7.045483501705996e-07,
"loss": 0.3493,
"step": 14260
},
{
"epoch": 1.7182420228777846,
"grad_norm": 4.502857685089111,
"learning_rate": 7.040688281559617e-07,
"loss": 0.3548,
"step": 14270
},
{
"epoch": 1.7194461167971102,
"grad_norm": 4.283501148223877,
"learning_rate": 7.035890808252884e-07,
"loss": 0.3571,
"step": 14280
},
{
"epoch": 1.7206502107164359,
"grad_norm": 4.563022136688232,
"learning_rate": 7.031091087082772e-07,
"loss": 0.3485,
"step": 14290
},
{
"epoch": 1.7218543046357615,
"grad_norm": 4.165189266204834,
"learning_rate": 7.02628912334874e-07,
"loss": 0.3417,
"step": 14300
},
{
"epoch": 1.7230583985550871,
"grad_norm": 4.657063961029053,
"learning_rate": 7.021484922352721e-07,
"loss": 0.3611,
"step": 14310
},
{
"epoch": 1.724262492474413,
"grad_norm": 6.094346046447754,
"learning_rate": 7.016678489399121e-07,
"loss": 0.3371,
"step": 14320
},
{
"epoch": 1.7254665863937388,
"grad_norm": 4.576262474060059,
"learning_rate": 7.011869829794806e-07,
"loss": 0.3624,
"step": 14330
},
{
"epoch": 1.7266706803130645,
"grad_norm": 5.231967449188232,
"learning_rate": 7.007058948849105e-07,
"loss": 0.3745,
"step": 14340
},
{
"epoch": 1.72787477423239,
"grad_norm": 4.39863395690918,
"learning_rate": 7.002245851873794e-07,
"loss": 0.3545,
"step": 14350
},
{
"epoch": 1.7290788681517157,
"grad_norm": 4.428983211517334,
"learning_rate": 6.997430544183103e-07,
"loss": 0.3534,
"step": 14360
},
{
"epoch": 1.7302829620710414,
"grad_norm": 5.451033115386963,
"learning_rate": 6.992613031093698e-07,
"loss": 0.3584,
"step": 14370
},
{
"epoch": 1.7314870559903672,
"grad_norm": 4.715031147003174,
"learning_rate": 6.987793317924682e-07,
"loss": 0.3643,
"step": 14380
},
{
"epoch": 1.732691149909693,
"grad_norm": 4.199245452880859,
"learning_rate": 6.982971409997583e-07,
"loss": 0.3539,
"step": 14390
},
{
"epoch": 1.7338952438290187,
"grad_norm": 5.606119632720947,
"learning_rate": 6.97814731263636e-07,
"loss": 0.3613,
"step": 14400
},
{
"epoch": 1.7350993377483444,
"grad_norm": 5.036284923553467,
"learning_rate": 6.973321031167382e-07,
"loss": 0.3679,
"step": 14410
},
{
"epoch": 1.73630343166767,
"grad_norm": 4.951879978179932,
"learning_rate": 6.968492570919434e-07,
"loss": 0.3572,
"step": 14420
},
{
"epoch": 1.7375075255869958,
"grad_norm": 4.428969860076904,
"learning_rate": 6.963661937223703e-07,
"loss": 0.3538,
"step": 14430
},
{
"epoch": 1.7387116195063215,
"grad_norm": 3.7024569511413574,
"learning_rate": 6.958829135413782e-07,
"loss": 0.3644,
"step": 14440
},
{
"epoch": 1.7399157134256473,
"grad_norm": 4.4168381690979,
"learning_rate": 6.95399417082565e-07,
"loss": 0.3498,
"step": 14450
},
{
"epoch": 1.741119807344973,
"grad_norm": 4.818751335144043,
"learning_rate": 6.949157048797678e-07,
"loss": 0.3726,
"step": 14460
},
{
"epoch": 1.7423239012642986,
"grad_norm": 5.769382953643799,
"learning_rate": 6.944317774670622e-07,
"loss": 0.3517,
"step": 14470
},
{
"epoch": 1.7435279951836242,
"grad_norm": 4.914524078369141,
"learning_rate": 6.939476353787607e-07,
"loss": 0.349,
"step": 14480
},
{
"epoch": 1.74473208910295,
"grad_norm": 4.6800456047058105,
"learning_rate": 6.934632791494134e-07,
"loss": 0.3725,
"step": 14490
},
{
"epoch": 1.7459361830222757,
"grad_norm": 4.627834796905518,
"learning_rate": 6.929787093138067e-07,
"loss": 0.359,
"step": 14500
},
{
"epoch": 1.7471402769416016,
"grad_norm": 5.098109245300293,
"learning_rate": 6.924939264069626e-07,
"loss": 0.3502,
"step": 14510
},
{
"epoch": 1.7483443708609272,
"grad_norm": 4.18192720413208,
"learning_rate": 6.920089309641388e-07,
"loss": 0.3448,
"step": 14520
},
{
"epoch": 1.7495484647802528,
"grad_norm": 4.4052815437316895,
"learning_rate": 6.915237235208274e-07,
"loss": 0.3459,
"step": 14530
},
{
"epoch": 1.7507525586995785,
"grad_norm": 5.557136058807373,
"learning_rate": 6.910383046127544e-07,
"loss": 0.355,
"step": 14540
},
{
"epoch": 1.7519566526189043,
"grad_norm": 5.7654128074646,
"learning_rate": 6.905526747758796e-07,
"loss": 0.3624,
"step": 14550
},
{
"epoch": 1.75316074653823,
"grad_norm": 5.040695667266846,
"learning_rate": 6.900668345463957e-07,
"loss": 0.3513,
"step": 14560
},
{
"epoch": 1.7543648404575558,
"grad_norm": 4.529175758361816,
"learning_rate": 6.895807844607274e-07,
"loss": 0.348,
"step": 14570
},
{
"epoch": 1.7555689343768814,
"grad_norm": 4.473850727081299,
"learning_rate": 6.890945250555312e-07,
"loss": 0.3708,
"step": 14580
},
{
"epoch": 1.756773028296207,
"grad_norm": 4.2242865562438965,
"learning_rate": 6.88608056867695e-07,
"loss": 0.3536,
"step": 14590
},
{
"epoch": 1.7579771222155327,
"grad_norm": 4.953219413757324,
"learning_rate": 6.881213804343369e-07,
"loss": 0.3564,
"step": 14600
},
{
"epoch": 1.7591812161348586,
"grad_norm": 4.626575469970703,
"learning_rate": 6.876344962928051e-07,
"loss": 0.3624,
"step": 14610
},
{
"epoch": 1.7603853100541842,
"grad_norm": 5.615645408630371,
"learning_rate": 6.87147404980677e-07,
"loss": 0.3711,
"step": 14620
},
{
"epoch": 1.76158940397351,
"grad_norm": 4.350038051605225,
"learning_rate": 6.866601070357587e-07,
"loss": 0.3517,
"step": 14630
},
{
"epoch": 1.7627934978928357,
"grad_norm": 4.5289387702941895,
"learning_rate": 6.861726029960849e-07,
"loss": 0.3602,
"step": 14640
},
{
"epoch": 1.7639975918121613,
"grad_norm": 5.127388954162598,
"learning_rate": 6.856848933999173e-07,
"loss": 0.345,
"step": 14650
},
{
"epoch": 1.765201685731487,
"grad_norm": 4.675601482391357,
"learning_rate": 6.851969787857447e-07,
"loss": 0.3484,
"step": 14660
},
{
"epoch": 1.7664057796508128,
"grad_norm": 3.9305527210235596,
"learning_rate": 6.847088596922824e-07,
"loss": 0.3478,
"step": 14670
},
{
"epoch": 1.7676098735701384,
"grad_norm": 4.547889709472656,
"learning_rate": 6.842205366584715e-07,
"loss": 0.3627,
"step": 14680
},
{
"epoch": 1.7688139674894643,
"grad_norm": 5.042651653289795,
"learning_rate": 6.837320102234781e-07,
"loss": 0.3595,
"step": 14690
},
{
"epoch": 1.77001806140879,
"grad_norm": 4.645577907562256,
"learning_rate": 6.832432809266928e-07,
"loss": 0.3417,
"step": 14700
},
{
"epoch": 1.7712221553281156,
"grad_norm": 5.52669095993042,
"learning_rate": 6.827543493077306e-07,
"loss": 0.352,
"step": 14710
},
{
"epoch": 1.7724262492474412,
"grad_norm": 4.48500394821167,
"learning_rate": 6.822652159064293e-07,
"loss": 0.3427,
"step": 14720
},
{
"epoch": 1.773630343166767,
"grad_norm": 4.676848411560059,
"learning_rate": 6.817758812628503e-07,
"loss": 0.3568,
"step": 14730
},
{
"epoch": 1.7748344370860927,
"grad_norm": 4.112384796142578,
"learning_rate": 6.812863459172764e-07,
"loss": 0.3626,
"step": 14740
},
{
"epoch": 1.7760385310054185,
"grad_norm": 4.3355326652526855,
"learning_rate": 6.807966104102122e-07,
"loss": 0.3408,
"step": 14750
},
{
"epoch": 1.7772426249247442,
"grad_norm": 4.12075138092041,
"learning_rate": 6.803066752823837e-07,
"loss": 0.3516,
"step": 14760
},
{
"epoch": 1.7784467188440698,
"grad_norm": 4.14115571975708,
"learning_rate": 6.79816541074737e-07,
"loss": 0.3442,
"step": 14770
},
{
"epoch": 1.7796508127633954,
"grad_norm": 4.440965175628662,
"learning_rate": 6.793262083284377e-07,
"loss": 0.348,
"step": 14780
},
{
"epoch": 1.7808549066827213,
"grad_norm": 4.727054595947266,
"learning_rate": 6.788356775848712e-07,
"loss": 0.3545,
"step": 14790
},
{
"epoch": 1.782059000602047,
"grad_norm": 4.421995639801025,
"learning_rate": 6.783449493856411e-07,
"loss": 0.3584,
"step": 14800
},
{
"epoch": 1.7832630945213728,
"grad_norm": 4.619497776031494,
"learning_rate": 6.778540242725695e-07,
"loss": 0.3621,
"step": 14810
},
{
"epoch": 1.7844671884406984,
"grad_norm": 4.975179672241211,
"learning_rate": 6.773629027876952e-07,
"loss": 0.3433,
"step": 14820
},
{
"epoch": 1.785671282360024,
"grad_norm": 4.3249030113220215,
"learning_rate": 6.768715854732743e-07,
"loss": 0.362,
"step": 14830
},
{
"epoch": 1.7868753762793497,
"grad_norm": 4.467803001403809,
"learning_rate": 6.763800728717792e-07,
"loss": 0.3589,
"step": 14840
},
{
"epoch": 1.7880794701986755,
"grad_norm": 5.496029376983643,
"learning_rate": 6.758883655258976e-07,
"loss": 0.3395,
"step": 14850
},
{
"epoch": 1.7892835641180012,
"grad_norm": 4.524773120880127,
"learning_rate": 6.753964639785321e-07,
"loss": 0.3544,
"step": 14860
},
{
"epoch": 1.790487658037327,
"grad_norm": 4.625549793243408,
"learning_rate": 6.749043687728005e-07,
"loss": 0.3721,
"step": 14870
},
{
"epoch": 1.7916917519566526,
"grad_norm": 5.1430230140686035,
"learning_rate": 6.744120804520335e-07,
"loss": 0.3516,
"step": 14880
},
{
"epoch": 1.7928958458759783,
"grad_norm": 5.0784173011779785,
"learning_rate": 6.739195995597757e-07,
"loss": 0.3579,
"step": 14890
},
{
"epoch": 1.794099939795304,
"grad_norm": 4.529468536376953,
"learning_rate": 6.734269266397836e-07,
"loss": 0.3573,
"step": 14900
},
{
"epoch": 1.7953040337146298,
"grad_norm": 4.950248718261719,
"learning_rate": 6.729340622360267e-07,
"loss": 0.3615,
"step": 14910
},
{
"epoch": 1.7965081276339554,
"grad_norm": 3.968449831008911,
"learning_rate": 6.724410068926852e-07,
"loss": 0.3361,
"step": 14920
},
{
"epoch": 1.7977122215532813,
"grad_norm": 4.806743144989014,
"learning_rate": 6.7194776115415e-07,
"loss": 0.3497,
"step": 14930
},
{
"epoch": 1.7989163154726069,
"grad_norm": 4.263092517852783,
"learning_rate": 6.714543255650229e-07,
"loss": 0.3659,
"step": 14940
},
{
"epoch": 1.8001204093919325,
"grad_norm": 4.752941131591797,
"learning_rate": 6.709607006701148e-07,
"loss": 0.3363,
"step": 14950
},
{
"epoch": 1.8013245033112582,
"grad_norm": 5.102241516113281,
"learning_rate": 6.704668870144458e-07,
"loss": 0.3487,
"step": 14960
},
{
"epoch": 1.802528597230584,
"grad_norm": 3.8051202297210693,
"learning_rate": 6.699728851432442e-07,
"loss": 0.3373,
"step": 14970
},
{
"epoch": 1.8037326911499096,
"grad_norm": 4.386908054351807,
"learning_rate": 6.694786956019467e-07,
"loss": 0.3646,
"step": 14980
},
{
"epoch": 1.8049367850692355,
"grad_norm": 4.566622257232666,
"learning_rate": 6.689843189361962e-07,
"loss": 0.3698,
"step": 14990
},
{
"epoch": 1.8061408789885611,
"grad_norm": 4.474935054779053,
"learning_rate": 6.684897556918434e-07,
"loss": 0.3567,
"step": 15000
},
{
"epoch": 1.8073449729078868,
"grad_norm": 4.712069034576416,
"learning_rate": 6.67995006414944e-07,
"loss": 0.3573,
"step": 15010
},
{
"epoch": 1.8085490668272124,
"grad_norm": 4.497696876525879,
"learning_rate": 6.675000716517595e-07,
"loss": 0.3373,
"step": 15020
},
{
"epoch": 1.8097531607465382,
"grad_norm": 4.327920436859131,
"learning_rate": 6.670049519487565e-07,
"loss": 0.3689,
"step": 15030
},
{
"epoch": 1.810957254665864,
"grad_norm": 6.609139919281006,
"learning_rate": 6.665096478526053e-07,
"loss": 0.3465,
"step": 15040
},
{
"epoch": 1.8121613485851897,
"grad_norm": 4.8396196365356445,
"learning_rate": 6.6601415991018e-07,
"loss": 0.3628,
"step": 15050
},
{
"epoch": 1.8133654425045154,
"grad_norm": 5.569112777709961,
"learning_rate": 6.655184886685577e-07,
"loss": 0.3484,
"step": 15060
},
{
"epoch": 1.814569536423841,
"grad_norm": 4.458260536193848,
"learning_rate": 6.650226346750178e-07,
"loss": 0.3523,
"step": 15070
},
{
"epoch": 1.8157736303431666,
"grad_norm": 4.671230316162109,
"learning_rate": 6.645265984770417e-07,
"loss": 0.3501,
"step": 15080
},
{
"epoch": 1.8169777242624925,
"grad_norm": 4.7510504722595215,
"learning_rate": 6.640303806223116e-07,
"loss": 0.3565,
"step": 15090
},
{
"epoch": 1.8181818181818183,
"grad_norm": 4.930042266845703,
"learning_rate": 6.635339816587108e-07,
"loss": 0.3519,
"step": 15100
},
{
"epoch": 1.819385912101144,
"grad_norm": 4.401383876800537,
"learning_rate": 6.63037402134322e-07,
"loss": 0.3444,
"step": 15110
},
{
"epoch": 1.8205900060204696,
"grad_norm": 4.55552864074707,
"learning_rate": 6.625406425974277e-07,
"loss": 0.3593,
"step": 15120
},
{
"epoch": 1.8217940999397952,
"grad_norm": 4.647222995758057,
"learning_rate": 6.620437035965088e-07,
"loss": 0.3513,
"step": 15130
},
{
"epoch": 1.8229981938591209,
"grad_norm": 4.750911235809326,
"learning_rate": 6.615465856802446e-07,
"loss": 0.3754,
"step": 15140
},
{
"epoch": 1.8242022877784467,
"grad_norm": 3.9289968013763428,
"learning_rate": 6.610492893975117e-07,
"loss": 0.3511,
"step": 15150
},
{
"epoch": 1.8254063816977726,
"grad_norm": 3.834213972091675,
"learning_rate": 6.605518152973842e-07,
"loss": 0.3446,
"step": 15160
},
{
"epoch": 1.8266104756170982,
"grad_norm": 5.1060075759887695,
"learning_rate": 6.600541639291316e-07,
"loss": 0.3548,
"step": 15170
},
{
"epoch": 1.8278145695364238,
"grad_norm": 4.696617603302002,
"learning_rate": 6.595563358422202e-07,
"loss": 0.3576,
"step": 15180
},
{
"epoch": 1.8290186634557495,
"grad_norm": 4.141697883605957,
"learning_rate": 6.590583315863105e-07,
"loss": 0.3513,
"step": 15190
},
{
"epoch": 1.8302227573750751,
"grad_norm": 5.357382774353027,
"learning_rate": 6.58560151711258e-07,
"loss": 0.3508,
"step": 15200
},
{
"epoch": 1.831426851294401,
"grad_norm": 4.808011054992676,
"learning_rate": 6.58061796767112e-07,
"loss": 0.3568,
"step": 15210
},
{
"epoch": 1.8326309452137268,
"grad_norm": 4.633763790130615,
"learning_rate": 6.575632673041151e-07,
"loss": 0.355,
"step": 15220
},
{
"epoch": 1.8338350391330525,
"grad_norm": 4.953246116638184,
"learning_rate": 6.570645638727026e-07,
"loss": 0.3604,
"step": 15230
},
{
"epoch": 1.835039133052378,
"grad_norm": 4.354135513305664,
"learning_rate": 6.565656870235019e-07,
"loss": 0.337,
"step": 15240
},
{
"epoch": 1.8362432269717037,
"grad_norm": 5.245918273925781,
"learning_rate": 6.560666373073316e-07,
"loss": 0.3711,
"step": 15250
},
{
"epoch": 1.8374473208910294,
"grad_norm": 5.532114028930664,
"learning_rate": 6.555674152752016e-07,
"loss": 0.3618,
"step": 15260
},
{
"epoch": 1.8386514148103552,
"grad_norm": 5.3348212242126465,
"learning_rate": 6.55068021478312e-07,
"loss": 0.3646,
"step": 15270
},
{
"epoch": 1.839855508729681,
"grad_norm": 4.423579216003418,
"learning_rate": 6.54568456468052e-07,
"loss": 0.3522,
"step": 15280
},
{
"epoch": 1.8410596026490067,
"grad_norm": 4.966454982757568,
"learning_rate": 6.540687207960005e-07,
"loss": 0.3592,
"step": 15290
},
{
"epoch": 1.8422636965683323,
"grad_norm": 4.406902313232422,
"learning_rate": 6.535688150139246e-07,
"loss": 0.3637,
"step": 15300
},
{
"epoch": 1.843467790487658,
"grad_norm": 4.565004348754883,
"learning_rate": 6.530687396737791e-07,
"loss": 0.343,
"step": 15310
},
{
"epoch": 1.8446718844069836,
"grad_norm": 4.898248672485352,
"learning_rate": 6.525684953277061e-07,
"loss": 0.3589,
"step": 15320
},
{
"epoch": 1.8458759783263095,
"grad_norm": 4.416904449462891,
"learning_rate": 6.520680825280344e-07,
"loss": 0.3297,
"step": 15330
},
{
"epoch": 1.8470800722456353,
"grad_norm": 4.844006538391113,
"learning_rate": 6.515675018272786e-07,
"loss": 0.3692,
"step": 15340
},
{
"epoch": 1.848284166164961,
"grad_norm": 4.351726531982422,
"learning_rate": 6.510667537781389e-07,
"loss": 0.3627,
"step": 15350
},
{
"epoch": 1.8494882600842866,
"grad_norm": 4.276306629180908,
"learning_rate": 6.505658389335e-07,
"loss": 0.3581,
"step": 15360
},
{
"epoch": 1.8506923540036122,
"grad_norm": 4.866278648376465,
"learning_rate": 6.500647578464311e-07,
"loss": 0.3756,
"step": 15370
},
{
"epoch": 1.8518964479229378,
"grad_norm": 4.005789279937744,
"learning_rate": 6.495635110701847e-07,
"loss": 0.3551,
"step": 15380
},
{
"epoch": 1.8531005418422637,
"grad_norm": 4.069939136505127,
"learning_rate": 6.490620991581963e-07,
"loss": 0.3426,
"step": 15390
},
{
"epoch": 1.8543046357615895,
"grad_norm": 5.377545356750488,
"learning_rate": 6.485605226640836e-07,
"loss": 0.363,
"step": 15400
},
{
"epoch": 1.8555087296809152,
"grad_norm": 4.171127796173096,
"learning_rate": 6.480587821416465e-07,
"loss": 0.3601,
"step": 15410
},
{
"epoch": 1.8567128236002408,
"grad_norm": 4.944298267364502,
"learning_rate": 6.475568781448654e-07,
"loss": 0.3445,
"step": 15420
},
{
"epoch": 1.8579169175195664,
"grad_norm": 4.719433784484863,
"learning_rate": 6.470548112279015e-07,
"loss": 0.349,
"step": 15430
},
{
"epoch": 1.859121011438892,
"grad_norm": 4.289638042449951,
"learning_rate": 6.465525819450959e-07,
"loss": 0.3675,
"step": 15440
},
{
"epoch": 1.860325105358218,
"grad_norm": 4.580896377563477,
"learning_rate": 6.46050190850969e-07,
"loss": 0.362,
"step": 15450
},
{
"epoch": 1.8615291992775438,
"grad_norm": 4.68642520904541,
"learning_rate": 6.455476385002195e-07,
"loss": 0.3544,
"step": 15460
},
{
"epoch": 1.8627332931968694,
"grad_norm": 4.221519470214844,
"learning_rate": 6.450449254477246e-07,
"loss": 0.3557,
"step": 15470
},
{
"epoch": 1.863937387116195,
"grad_norm": 5.103092670440674,
"learning_rate": 6.445420522485387e-07,
"loss": 0.3575,
"step": 15480
},
{
"epoch": 1.8651414810355207,
"grad_norm": 5.300514221191406,
"learning_rate": 6.440390194578933e-07,
"loss": 0.3655,
"step": 15490
},
{
"epoch": 1.8663455749548463,
"grad_norm": 5.2280049324035645,
"learning_rate": 6.435358276311955e-07,
"loss": 0.3615,
"step": 15500
},
{
"epoch": 1.8675496688741722,
"grad_norm": 4.393173694610596,
"learning_rate": 6.430324773240287e-07,
"loss": 0.3617,
"step": 15510
},
{
"epoch": 1.868753762793498,
"grad_norm": 3.9914498329162598,
"learning_rate": 6.425289690921508e-07,
"loss": 0.3482,
"step": 15520
},
{
"epoch": 1.8699578567128237,
"grad_norm": 4.967134475708008,
"learning_rate": 6.420253034914943e-07,
"loss": 0.3635,
"step": 15530
},
{
"epoch": 1.8711619506321493,
"grad_norm": 4.27791166305542,
"learning_rate": 6.415214810781653e-07,
"loss": 0.3508,
"step": 15540
},
{
"epoch": 1.872366044551475,
"grad_norm": 4.6500163078308105,
"learning_rate": 6.410175024084431e-07,
"loss": 0.3589,
"step": 15550
},
{
"epoch": 1.8735701384708006,
"grad_norm": 4.22102689743042,
"learning_rate": 6.405133680387797e-07,
"loss": 0.3558,
"step": 15560
},
{
"epoch": 1.8747742323901264,
"grad_norm": 4.9325947761535645,
"learning_rate": 6.400090785257987e-07,
"loss": 0.3696,
"step": 15570
},
{
"epoch": 1.8759783263094523,
"grad_norm": 3.8292155265808105,
"learning_rate": 6.395046344262951e-07,
"loss": 0.356,
"step": 15580
},
{
"epoch": 1.877182420228778,
"grad_norm": 4.739902973175049,
"learning_rate": 6.390000362972348e-07,
"loss": 0.3407,
"step": 15590
},
{
"epoch": 1.8783865141481035,
"grad_norm": 3.770754814147949,
"learning_rate": 6.384952846957535e-07,
"loss": 0.3502,
"step": 15600
},
{
"epoch": 1.8795906080674292,
"grad_norm": 4.367559432983398,
"learning_rate": 6.379903801791566e-07,
"loss": 0.3566,
"step": 15610
},
{
"epoch": 1.8807947019867548,
"grad_norm": 5.16295862197876,
"learning_rate": 6.374853233049182e-07,
"loss": 0.3668,
"step": 15620
},
{
"epoch": 1.8819987959060807,
"grad_norm": 4.346946716308594,
"learning_rate": 6.369801146306802e-07,
"loss": 0.3483,
"step": 15630
},
{
"epoch": 1.8832028898254065,
"grad_norm": 4.716429710388184,
"learning_rate": 6.36474754714253e-07,
"loss": 0.3452,
"step": 15640
},
{
"epoch": 1.8844069837447321,
"grad_norm": 4.5193891525268555,
"learning_rate": 6.359692441136131e-07,
"loss": 0.361,
"step": 15650
},
{
"epoch": 1.8856110776640578,
"grad_norm": 3.9874355792999268,
"learning_rate": 6.354635833869042e-07,
"loss": 0.358,
"step": 15660
},
{
"epoch": 1.8868151715833834,
"grad_norm": 4.598703861236572,
"learning_rate": 6.349577730924349e-07,
"loss": 0.35,
"step": 15670
},
{
"epoch": 1.8880192655027093,
"grad_norm": 5.374682426452637,
"learning_rate": 6.344518137886798e-07,
"loss": 0.3639,
"step": 15680
},
{
"epoch": 1.889223359422035,
"grad_norm": 6.002275466918945,
"learning_rate": 6.339457060342772e-07,
"loss": 0.3546,
"step": 15690
},
{
"epoch": 1.8904274533413608,
"grad_norm": 4.864243984222412,
"learning_rate": 6.3343945038803e-07,
"loss": 0.3543,
"step": 15700
},
{
"epoch": 1.8916315472606864,
"grad_norm": 3.9879305362701416,
"learning_rate": 6.329330474089039e-07,
"loss": 0.3549,
"step": 15710
},
{
"epoch": 1.892835641180012,
"grad_norm": 4.457694053649902,
"learning_rate": 6.324264976560277e-07,
"loss": 0.3584,
"step": 15720
},
{
"epoch": 1.8940397350993377,
"grad_norm": 3.741135835647583,
"learning_rate": 6.319198016886918e-07,
"loss": 0.3618,
"step": 15730
},
{
"epoch": 1.8952438290186635,
"grad_norm": 4.002588272094727,
"learning_rate": 6.314129600663484e-07,
"loss": 0.3492,
"step": 15740
},
{
"epoch": 1.8964479229379891,
"grad_norm": 4.551817893981934,
"learning_rate": 6.309059733486102e-07,
"loss": 0.3567,
"step": 15750
},
{
"epoch": 1.897652016857315,
"grad_norm": 4.268725872039795,
"learning_rate": 6.303988420952505e-07,
"loss": 0.3591,
"step": 15760
},
{
"epoch": 1.8988561107766406,
"grad_norm": 4.963777542114258,
"learning_rate": 6.298915668662017e-07,
"loss": 0.3551,
"step": 15770
},
{
"epoch": 1.9000602046959663,
"grad_norm": 4.293519973754883,
"learning_rate": 6.293841482215558e-07,
"loss": 0.3586,
"step": 15780
},
{
"epoch": 1.901264298615292,
"grad_norm": 4.556762218475342,
"learning_rate": 6.288765867215625e-07,
"loss": 0.3538,
"step": 15790
},
{
"epoch": 1.9024683925346177,
"grad_norm": 3.792178153991699,
"learning_rate": 6.283688829266297e-07,
"loss": 0.3331,
"step": 15800
},
{
"epoch": 1.9036724864539434,
"grad_norm": 5.197310447692871,
"learning_rate": 6.278610373973219e-07,
"loss": 0.3515,
"step": 15810
},
{
"epoch": 1.9048765803732692,
"grad_norm": 5.082350730895996,
"learning_rate": 6.273530506943609e-07,
"loss": 0.3389,
"step": 15820
},
{
"epoch": 1.9060806742925949,
"grad_norm": 4.892045021057129,
"learning_rate": 6.268449233786236e-07,
"loss": 0.3531,
"step": 15830
},
{
"epoch": 1.9072847682119205,
"grad_norm": 4.555123805999756,
"learning_rate": 6.263366560111423e-07,
"loss": 0.3414,
"step": 15840
},
{
"epoch": 1.9084888621312461,
"grad_norm": 4.728994846343994,
"learning_rate": 6.258282491531043e-07,
"loss": 0.3556,
"step": 15850
},
{
"epoch": 1.909692956050572,
"grad_norm": 4.745967388153076,
"learning_rate": 6.253197033658507e-07,
"loss": 0.343,
"step": 15860
},
{
"epoch": 1.9108970499698976,
"grad_norm": 4.600861072540283,
"learning_rate": 6.248110192108757e-07,
"loss": 0.3475,
"step": 15870
},
{
"epoch": 1.9121011438892235,
"grad_norm": 4.099234580993652,
"learning_rate": 6.243021972498269e-07,
"loss": 0.3624,
"step": 15880
},
{
"epoch": 1.913305237808549,
"grad_norm": 4.272284030914307,
"learning_rate": 6.237932380445034e-07,
"loss": 0.3565,
"step": 15890
},
{
"epoch": 1.9145093317278747,
"grad_norm": 3.7602131366729736,
"learning_rate": 6.232841421568565e-07,
"loss": 0.3499,
"step": 15900
},
{
"epoch": 1.9157134256472004,
"grad_norm": 4.971080303192139,
"learning_rate": 6.227749101489877e-07,
"loss": 0.3701,
"step": 15910
},
{
"epoch": 1.9169175195665262,
"grad_norm": 5.319652080535889,
"learning_rate": 6.222655425831495e-07,
"loss": 0.3451,
"step": 15920
},
{
"epoch": 1.9181216134858519,
"grad_norm": 4.283812522888184,
"learning_rate": 6.217560400217433e-07,
"loss": 0.3559,
"step": 15930
},
{
"epoch": 1.9193257074051777,
"grad_norm": 5.055164813995361,
"learning_rate": 6.212464030273204e-07,
"loss": 0.3562,
"step": 15940
},
{
"epoch": 1.9205298013245033,
"grad_norm": 4.813416004180908,
"learning_rate": 6.207366321625798e-07,
"loss": 0.3606,
"step": 15950
},
{
"epoch": 1.921733895243829,
"grad_norm": 4.402296543121338,
"learning_rate": 6.202267279903686e-07,
"loss": 0.353,
"step": 15960
},
{
"epoch": 1.9229379891631546,
"grad_norm": 4.458485126495361,
"learning_rate": 6.197166910736814e-07,
"loss": 0.3523,
"step": 15970
},
{
"epoch": 1.9241420830824805,
"grad_norm": 3.5323286056518555,
"learning_rate": 6.192065219756587e-07,
"loss": 0.357,
"step": 15980
},
{
"epoch": 1.925346177001806,
"grad_norm": 4.047741413116455,
"learning_rate": 6.186962212595876e-07,
"loss": 0.3513,
"step": 15990
},
{
"epoch": 1.926550270921132,
"grad_norm": 4.608432769775391,
"learning_rate": 6.181857894889e-07,
"loss": 0.3556,
"step": 16000
},
{
"epoch": 1.9277543648404576,
"grad_norm": 4.246164321899414,
"learning_rate": 6.17675227227173e-07,
"loss": 0.3274,
"step": 16010
},
{
"epoch": 1.9289584587597832,
"grad_norm": 4.55797004699707,
"learning_rate": 6.171645350381272e-07,
"loss": 0.3537,
"step": 16020
},
{
"epoch": 1.9301625526791089,
"grad_norm": 4.349902629852295,
"learning_rate": 6.166537134856272e-07,
"loss": 0.3454,
"step": 16030
},
{
"epoch": 1.9313666465984347,
"grad_norm": 4.9922614097595215,
"learning_rate": 6.161427631336799e-07,
"loss": 0.3377,
"step": 16040
},
{
"epoch": 1.9325707405177603,
"grad_norm": 4.467525005340576,
"learning_rate": 6.156316845464351e-07,
"loss": 0.345,
"step": 16050
},
{
"epoch": 1.9337748344370862,
"grad_norm": 4.589630603790283,
"learning_rate": 6.151204782881835e-07,
"loss": 0.3393,
"step": 16060
},
{
"epoch": 1.9349789283564118,
"grad_norm": 4.475553035736084,
"learning_rate": 6.146091449233571e-07,
"loss": 0.3544,
"step": 16070
},
{
"epoch": 1.9361830222757375,
"grad_norm": 4.827112197875977,
"learning_rate": 6.140976850165283e-07,
"loss": 0.3447,
"step": 16080
},
{
"epoch": 1.937387116195063,
"grad_norm": 3.81062388420105,
"learning_rate": 6.135860991324092e-07,
"loss": 0.3493,
"step": 16090
},
{
"epoch": 1.938591210114389,
"grad_norm": 4.450663089752197,
"learning_rate": 6.130743878358505e-07,
"loss": 0.3601,
"step": 16100
},
{
"epoch": 1.9397953040337146,
"grad_norm": 3.878636598587036,
"learning_rate": 6.125625516918421e-07,
"loss": 0.3638,
"step": 16110
},
{
"epoch": 1.9409993979530404,
"grad_norm": 4.681748390197754,
"learning_rate": 6.120505912655114e-07,
"loss": 0.3542,
"step": 16120
},
{
"epoch": 1.942203491872366,
"grad_norm": 5.228558540344238,
"learning_rate": 6.115385071221231e-07,
"loss": 0.3538,
"step": 16130
},
{
"epoch": 1.9434075857916917,
"grad_norm": 5.1694488525390625,
"learning_rate": 6.110262998270781e-07,
"loss": 0.3689,
"step": 16140
},
{
"epoch": 1.9446116797110173,
"grad_norm": 4.253943920135498,
"learning_rate": 6.10513969945914e-07,
"loss": 0.3518,
"step": 16150
},
{
"epoch": 1.9458157736303432,
"grad_norm": 4.636354446411133,
"learning_rate": 6.100015180443031e-07,
"loss": 0.3643,
"step": 16160
},
{
"epoch": 1.9470198675496688,
"grad_norm": 3.8941125869750977,
"learning_rate": 6.094889446880529e-07,
"loss": 0.3444,
"step": 16170
},
{
"epoch": 1.9482239614689947,
"grad_norm": 4.6928391456604,
"learning_rate": 6.089762504431046e-07,
"loss": 0.3541,
"step": 16180
},
{
"epoch": 1.9494280553883203,
"grad_norm": 4.19013786315918,
"learning_rate": 6.084634358755334e-07,
"loss": 0.357,
"step": 16190
},
{
"epoch": 1.950632149307646,
"grad_norm": 4.565307140350342,
"learning_rate": 6.079505015515465e-07,
"loss": 0.3419,
"step": 16200
},
{
"epoch": 1.9518362432269716,
"grad_norm": 5.345344543457031,
"learning_rate": 6.074374480374843e-07,
"loss": 0.3569,
"step": 16210
},
{
"epoch": 1.9530403371462974,
"grad_norm": 4.672290802001953,
"learning_rate": 6.069242758998181e-07,
"loss": 0.3564,
"step": 16220
},
{
"epoch": 1.954244431065623,
"grad_norm": 4.522906303405762,
"learning_rate": 6.064109857051505e-07,
"loss": 0.35,
"step": 16230
},
{
"epoch": 1.955448524984949,
"grad_norm": 4.692704200744629,
"learning_rate": 6.058975780202143e-07,
"loss": 0.334,
"step": 16240
},
{
"epoch": 1.9566526189042746,
"grad_norm": 4.350996971130371,
"learning_rate": 6.053840534118722e-07,
"loss": 0.3512,
"step": 16250
},
{
"epoch": 1.9578567128236002,
"grad_norm": 4.869346618652344,
"learning_rate": 6.04870412447116e-07,
"loss": 0.3415,
"step": 16260
},
{
"epoch": 1.9590608067429258,
"grad_norm": 4.5982818603515625,
"learning_rate": 6.043566556930655e-07,
"loss": 0.3697,
"step": 16270
},
{
"epoch": 1.9602649006622517,
"grad_norm": 4.133756637573242,
"learning_rate": 6.038427837169688e-07,
"loss": 0.3498,
"step": 16280
},
{
"epoch": 1.9614689945815773,
"grad_norm": 4.6877546310424805,
"learning_rate": 6.033287970862013e-07,
"loss": 0.3622,
"step": 16290
},
{
"epoch": 1.9626730885009032,
"grad_norm": 5.100693702697754,
"learning_rate": 6.028146963682648e-07,
"loss": 0.3571,
"step": 16300
},
{
"epoch": 1.9638771824202288,
"grad_norm": 5.0933685302734375,
"learning_rate": 6.023004821307867e-07,
"loss": 0.3247,
"step": 16310
},
{
"epoch": 1.9650812763395544,
"grad_norm": 3.7194926738739014,
"learning_rate": 6.017861549415207e-07,
"loss": 0.3519,
"step": 16320
},
{
"epoch": 1.96628537025888,
"grad_norm": 4.424744606018066,
"learning_rate": 6.012717153683442e-07,
"loss": 0.3401,
"step": 16330
},
{
"epoch": 1.967489464178206,
"grad_norm": 3.9198262691497803,
"learning_rate": 6.007571639792593e-07,
"loss": 0.3434,
"step": 16340
},
{
"epoch": 1.9686935580975318,
"grad_norm": 3.9350152015686035,
"learning_rate": 6.002425013423913e-07,
"loss": 0.3447,
"step": 16350
},
{
"epoch": 1.9698976520168574,
"grad_norm": 4.852246284484863,
"learning_rate": 5.997277280259885e-07,
"loss": 0.3457,
"step": 16360
},
{
"epoch": 1.971101745936183,
"grad_norm": 4.658691883087158,
"learning_rate": 5.992128445984212e-07,
"loss": 0.3692,
"step": 16370
},
{
"epoch": 1.9723058398555087,
"grad_norm": 4.637414932250977,
"learning_rate": 5.986978516281815e-07,
"loss": 0.3555,
"step": 16380
},
{
"epoch": 1.9735099337748343,
"grad_norm": 4.982326984405518,
"learning_rate": 5.981827496838822e-07,
"loss": 0.3526,
"step": 16390
},
{
"epoch": 1.9747140276941602,
"grad_norm": 4.729382514953613,
"learning_rate": 5.976675393342566e-07,
"loss": 0.3558,
"step": 16400
},
{
"epoch": 1.975918121613486,
"grad_norm": 4.774322509765625,
"learning_rate": 5.971522211481575e-07,
"loss": 0.358,
"step": 16410
},
{
"epoch": 1.9771222155328116,
"grad_norm": 4.948471546173096,
"learning_rate": 5.966367956945572e-07,
"loss": 0.359,
"step": 16420
},
{
"epoch": 1.9783263094521373,
"grad_norm": 4.0199198722839355,
"learning_rate": 5.961212635425459e-07,
"loss": 0.3423,
"step": 16430
},
{
"epoch": 1.979530403371463,
"grad_norm": 4.141156196594238,
"learning_rate": 5.956056252613319e-07,
"loss": 0.3475,
"step": 16440
},
{
"epoch": 1.9807344972907885,
"grad_norm": 4.316824913024902,
"learning_rate": 5.950898814202407e-07,
"loss": 0.3436,
"step": 16450
},
{
"epoch": 1.9819385912101144,
"grad_norm": 5.594763278961182,
"learning_rate": 5.945740325887144e-07,
"loss": 0.3435,
"step": 16460
},
{
"epoch": 1.9831426851294403,
"grad_norm": 4.995075702667236,
"learning_rate": 5.940580793363105e-07,
"loss": 0.3539,
"step": 16470
},
{
"epoch": 1.9843467790487659,
"grad_norm": 4.139880180358887,
"learning_rate": 5.935420222327028e-07,
"loss": 0.3544,
"step": 16480
},
{
"epoch": 1.9855508729680915,
"grad_norm": 3.917797088623047,
"learning_rate": 5.930258618476785e-07,
"loss": 0.3331,
"step": 16490
},
{
"epoch": 1.9867549668874172,
"grad_norm": 5.234194755554199,
"learning_rate": 5.9250959875114e-07,
"loss": 0.3477,
"step": 16500
},
{
"epoch": 1.9879590608067428,
"grad_norm": 4.324552059173584,
"learning_rate": 5.919932335131022e-07,
"loss": 0.341,
"step": 16510
},
{
"epoch": 1.9891631547260686,
"grad_norm": 5.321447849273682,
"learning_rate": 5.914767667036936e-07,
"loss": 0.3606,
"step": 16520
},
{
"epoch": 1.9903672486453945,
"grad_norm": 4.159404277801514,
"learning_rate": 5.90960198893154e-07,
"loss": 0.3484,
"step": 16530
},
{
"epoch": 1.9915713425647201,
"grad_norm": 4.632839202880859,
"learning_rate": 5.904435306518354e-07,
"loss": 0.35,
"step": 16540
},
{
"epoch": 1.9927754364840458,
"grad_norm": 4.1767168045043945,
"learning_rate": 5.899267625502004e-07,
"loss": 0.356,
"step": 16550
},
{
"epoch": 1.9939795304033714,
"grad_norm": 4.770878314971924,
"learning_rate": 5.894098951588218e-07,
"loss": 0.3338,
"step": 16560
},
{
"epoch": 1.995183624322697,
"grad_norm": 4.481430530548096,
"learning_rate": 5.888929290483821e-07,
"loss": 0.3569,
"step": 16570
},
{
"epoch": 1.9963877182420229,
"grad_norm": 4.496611595153809,
"learning_rate": 5.883758647896729e-07,
"loss": 0.3602,
"step": 16580
},
{
"epoch": 1.9975918121613487,
"grad_norm": 3.9505410194396973,
"learning_rate": 5.878587029535942e-07,
"loss": 0.3403,
"step": 16590
},
{
"epoch": 1.9987959060806744,
"grad_norm": 4.308087348937988,
"learning_rate": 5.873414441111532e-07,
"loss": 0.3556,
"step": 16600
},
{
"epoch": 2.0,
"grad_norm": 4.440168857574463,
"learning_rate": 5.868240888334652e-07,
"loss": 0.3312,
"step": 16610
},
{
"epoch": 2.0012040939193256,
"grad_norm": 4.038889408111572,
"learning_rate": 5.863066376917508e-07,
"loss": 0.3224,
"step": 16620
},
{
"epoch": 2.0024081878386513,
"grad_norm": 4.833006381988525,
"learning_rate": 5.857890912573376e-07,
"loss": 0.3001,
"step": 16630
},
{
"epoch": 2.0036122817579773,
"grad_norm": 4.160131931304932,
"learning_rate": 5.852714501016572e-07,
"loss": 0.2985,
"step": 16640
},
{
"epoch": 2.004816375677303,
"grad_norm": 5.080901622772217,
"learning_rate": 5.84753714796247e-07,
"loss": 0.3228,
"step": 16650
},
{
"epoch": 2.0060204695966286,
"grad_norm": 4.37393856048584,
"learning_rate": 5.842358859127478e-07,
"loss": 0.3036,
"step": 16660
},
{
"epoch": 2.0072245635159542,
"grad_norm": 4.473939895629883,
"learning_rate": 5.837179640229032e-07,
"loss": 0.3135,
"step": 16670
},
{
"epoch": 2.00842865743528,
"grad_norm": 5.297366619110107,
"learning_rate": 5.831999496985605e-07,
"loss": 0.3059,
"step": 16680
},
{
"epoch": 2.0096327513546055,
"grad_norm": 5.174331188201904,
"learning_rate": 5.826818435116683e-07,
"loss": 0.3123,
"step": 16690
},
{
"epoch": 2.0108368452739316,
"grad_norm": 4.679065704345703,
"learning_rate": 5.821636460342769e-07,
"loss": 0.3232,
"step": 16700
},
{
"epoch": 2.012040939193257,
"grad_norm": 4.446617126464844,
"learning_rate": 5.816453578385375e-07,
"loss": 0.3063,
"step": 16710
},
{
"epoch": 2.013245033112583,
"grad_norm": 5.05123233795166,
"learning_rate": 5.811269794967014e-07,
"loss": 0.3095,
"step": 16720
},
{
"epoch": 2.0144491270319085,
"grad_norm": 4.649383544921875,
"learning_rate": 5.806085115811191e-07,
"loss": 0.309,
"step": 16730
},
{
"epoch": 2.015653220951234,
"grad_norm": 4.328246116638184,
"learning_rate": 5.800899546642406e-07,
"loss": 0.2981,
"step": 16740
},
{
"epoch": 2.0168573148705597,
"grad_norm": 4.504574775695801,
"learning_rate": 5.795713093186136e-07,
"loss": 0.3162,
"step": 16750
},
{
"epoch": 2.018061408789886,
"grad_norm": 4.636085033416748,
"learning_rate": 5.790525761168839e-07,
"loss": 0.318,
"step": 16760
},
{
"epoch": 2.0192655027092115,
"grad_norm": 5.4193291664123535,
"learning_rate": 5.785337556317938e-07,
"loss": 0.3216,
"step": 16770
},
{
"epoch": 2.020469596628537,
"grad_norm": 4.318239212036133,
"learning_rate": 5.780148484361826e-07,
"loss": 0.3018,
"step": 16780
},
{
"epoch": 2.0216736905478627,
"grad_norm": 4.4032087326049805,
"learning_rate": 5.774958551029847e-07,
"loss": 0.3078,
"step": 16790
},
{
"epoch": 2.0228777844671884,
"grad_norm": 4.946054458618164,
"learning_rate": 5.769767762052301e-07,
"loss": 0.3155,
"step": 16800
},
{
"epoch": 2.024081878386514,
"grad_norm": 4.1051344871521,
"learning_rate": 5.764576123160429e-07,
"loss": 0.3183,
"step": 16810
},
{
"epoch": 2.02528597230584,
"grad_norm": 4.6641459465026855,
"learning_rate": 5.759383640086415e-07,
"loss": 0.3063,
"step": 16820
},
{
"epoch": 2.0264900662251657,
"grad_norm": 4.728779315948486,
"learning_rate": 5.75419031856337e-07,
"loss": 0.3153,
"step": 16830
},
{
"epoch": 2.0276941601444913,
"grad_norm": 5.103392124176025,
"learning_rate": 5.748996164325331e-07,
"loss": 0.304,
"step": 16840
},
{
"epoch": 2.028898254063817,
"grad_norm": 5.283243656158447,
"learning_rate": 5.743801183107261e-07,
"loss": 0.3188,
"step": 16850
},
{
"epoch": 2.0301023479831426,
"grad_norm": 4.704992294311523,
"learning_rate": 5.73860538064503e-07,
"loss": 0.306,
"step": 16860
},
{
"epoch": 2.0313064419024682,
"grad_norm": 5.523532390594482,
"learning_rate": 5.733408762675414e-07,
"loss": 0.3164,
"step": 16870
},
{
"epoch": 2.0325105358217943,
"grad_norm": 4.29448127746582,
"learning_rate": 5.728211334936093e-07,
"loss": 0.3011,
"step": 16880
},
{
"epoch": 2.03371462974112,
"grad_norm": 4.910971164703369,
"learning_rate": 5.723013103165642e-07,
"loss": 0.3093,
"step": 16890
},
{
"epoch": 2.0349187236604456,
"grad_norm": 4.527739524841309,
"learning_rate": 5.717814073103519e-07,
"loss": 0.2994,
"step": 16900
},
{
"epoch": 2.036122817579771,
"grad_norm": 4.409666061401367,
"learning_rate": 5.712614250490064e-07,
"loss": 0.3165,
"step": 16910
},
{
"epoch": 2.037326911499097,
"grad_norm": 4.129342079162598,
"learning_rate": 5.707413641066497e-07,
"loss": 0.3159,
"step": 16920
},
{
"epoch": 2.0385310054184225,
"grad_norm": 4.361571788787842,
"learning_rate": 5.702212250574904e-07,
"loss": 0.3008,
"step": 16930
},
{
"epoch": 2.0397350993377485,
"grad_norm": 4.482879638671875,
"learning_rate": 5.697010084758232e-07,
"loss": 0.3169,
"step": 16940
},
{
"epoch": 2.040939193257074,
"grad_norm": 4.7954535484313965,
"learning_rate": 5.691807149360285e-07,
"loss": 0.3057,
"step": 16950
},
{
"epoch": 2.0421432871764,
"grad_norm": 4.840571403503418,
"learning_rate": 5.686603450125717e-07,
"loss": 0.2973,
"step": 16960
},
{
"epoch": 2.0433473810957254,
"grad_norm": 4.597223281860352,
"learning_rate": 5.681398992800024e-07,
"loss": 0.3144,
"step": 16970
},
{
"epoch": 2.044551475015051,
"grad_norm": 4.794790744781494,
"learning_rate": 5.676193783129542e-07,
"loss": 0.3087,
"step": 16980
},
{
"epoch": 2.0457555689343767,
"grad_norm": 4.340571403503418,
"learning_rate": 5.670987826861435e-07,
"loss": 0.3083,
"step": 16990
},
{
"epoch": 2.046959662853703,
"grad_norm": 4.629497051239014,
"learning_rate": 5.665781129743693e-07,
"loss": 0.3088,
"step": 17000
},
{
"epoch": 2.0481637567730284,
"grad_norm": 4.827451229095459,
"learning_rate": 5.660573697525121e-07,
"loss": 0.3039,
"step": 17010
},
{
"epoch": 2.049367850692354,
"grad_norm": 4.8336381912231445,
"learning_rate": 5.655365535955342e-07,
"loss": 0.306,
"step": 17020
},
{
"epoch": 2.0505719446116797,
"grad_norm": 5.4790940284729,
"learning_rate": 5.650156650784777e-07,
"loss": 0.3129,
"step": 17030
},
{
"epoch": 2.0517760385310053,
"grad_norm": 3.705552577972412,
"learning_rate": 5.64494704776465e-07,
"loss": 0.3062,
"step": 17040
},
{
"epoch": 2.052980132450331,
"grad_norm": 4.869053840637207,
"learning_rate": 5.639736732646976e-07,
"loss": 0.3169,
"step": 17050
},
{
"epoch": 2.054184226369657,
"grad_norm": 4.759436130523682,
"learning_rate": 5.634525711184556e-07,
"loss": 0.3129,
"step": 17060
},
{
"epoch": 2.0553883202889827,
"grad_norm": 4.388055324554443,
"learning_rate": 5.629313989130975e-07,
"loss": 0.3026,
"step": 17070
},
{
"epoch": 2.0565924142083083,
"grad_norm": 5.617096900939941,
"learning_rate": 5.624101572240587e-07,
"loss": 0.3064,
"step": 17080
},
{
"epoch": 2.057796508127634,
"grad_norm": 4.787253379821777,
"learning_rate": 5.618888466268513e-07,
"loss": 0.3174,
"step": 17090
},
{
"epoch": 2.0590006020469596,
"grad_norm": 4.347087383270264,
"learning_rate": 5.613674676970638e-07,
"loss": 0.3028,
"step": 17100
},
{
"epoch": 2.060204695966285,
"grad_norm": 4.601030349731445,
"learning_rate": 5.608460210103598e-07,
"loss": 0.3136,
"step": 17110
},
{
"epoch": 2.0614087898856113,
"grad_norm": 4.6767048835754395,
"learning_rate": 5.603245071424783e-07,
"loss": 0.3126,
"step": 17120
},
{
"epoch": 2.062612883804937,
"grad_norm": 5.636801719665527,
"learning_rate": 5.598029266692315e-07,
"loss": 0.3107,
"step": 17130
},
{
"epoch": 2.0638169777242625,
"grad_norm": 5.514817714691162,
"learning_rate": 5.592812801665061e-07,
"loss": 0.3191,
"step": 17140
},
{
"epoch": 2.065021071643588,
"grad_norm": 4.12761116027832,
"learning_rate": 5.587595682102611e-07,
"loss": 0.3119,
"step": 17150
},
{
"epoch": 2.066225165562914,
"grad_norm": 4.940089702606201,
"learning_rate": 5.582377913765283e-07,
"loss": 0.3072,
"step": 17160
},
{
"epoch": 2.0674292594822394,
"grad_norm": 4.235925674438477,
"learning_rate": 5.577159502414103e-07,
"loss": 0.3168,
"step": 17170
},
{
"epoch": 2.0686333534015655,
"grad_norm": 5.036463260650635,
"learning_rate": 5.57194045381082e-07,
"loss": 0.3236,
"step": 17180
},
{
"epoch": 2.069837447320891,
"grad_norm": 3.9009006023406982,
"learning_rate": 5.56672077371787e-07,
"loss": 0.3111,
"step": 17190
},
{
"epoch": 2.0710415412402168,
"grad_norm": 4.592634677886963,
"learning_rate": 5.5615004678984e-07,
"loss": 0.3001,
"step": 17200
},
{
"epoch": 2.0722456351595424,
"grad_norm": 4.5537004470825195,
"learning_rate": 5.556279542116242e-07,
"loss": 0.305,
"step": 17210
},
{
"epoch": 2.073449729078868,
"grad_norm": 4.557441711425781,
"learning_rate": 5.551058002135913e-07,
"loss": 0.2978,
"step": 17220
},
{
"epoch": 2.0746538229981937,
"grad_norm": 3.7024407386779785,
"learning_rate": 5.545835853722608e-07,
"loss": 0.3134,
"step": 17230
},
{
"epoch": 2.0758579169175198,
"grad_norm": 5.503789901733398,
"learning_rate": 5.540613102642195e-07,
"loss": 0.3217,
"step": 17240
},
{
"epoch": 2.0770620108368454,
"grad_norm": 4.864404678344727,
"learning_rate": 5.535389754661208e-07,
"loss": 0.2983,
"step": 17250
},
{
"epoch": 2.078266104756171,
"grad_norm": 5.232902526855469,
"learning_rate": 5.530165815546835e-07,
"loss": 0.3154,
"step": 17260
},
{
"epoch": 2.0794701986754967,
"grad_norm": 4.34998083114624,
"learning_rate": 5.524941291066923e-07,
"loss": 0.3078,
"step": 17270
},
{
"epoch": 2.0806742925948223,
"grad_norm": 4.243396282196045,
"learning_rate": 5.519716186989962e-07,
"loss": 0.2971,
"step": 17280
},
{
"epoch": 2.081878386514148,
"grad_norm": 4.376738548278809,
"learning_rate": 5.514490509085083e-07,
"loss": 0.3081,
"step": 17290
},
{
"epoch": 2.083082480433474,
"grad_norm": 4.597198486328125,
"learning_rate": 5.50926426312205e-07,
"loss": 0.3279,
"step": 17300
},
{
"epoch": 2.0842865743527996,
"grad_norm": 4.825913906097412,
"learning_rate": 5.504037454871258e-07,
"loss": 0.3164,
"step": 17310
},
{
"epoch": 2.0854906682721253,
"grad_norm": 4.312431812286377,
"learning_rate": 5.498810090103711e-07,
"loss": 0.29,
"step": 17320
},
{
"epoch": 2.086694762191451,
"grad_norm": 4.7181854248046875,
"learning_rate": 5.493582174591045e-07,
"loss": 0.2962,
"step": 17330
},
{
"epoch": 2.0878988561107765,
"grad_norm": 5.4123759269714355,
"learning_rate": 5.488353714105488e-07,
"loss": 0.3044,
"step": 17340
},
{
"epoch": 2.089102950030102,
"grad_norm": 4.742303371429443,
"learning_rate": 5.48312471441988e-07,
"loss": 0.287,
"step": 17350
},
{
"epoch": 2.0903070439494282,
"grad_norm": 3.8717334270477295,
"learning_rate": 5.477895181307651e-07,
"loss": 0.3205,
"step": 17360
},
{
"epoch": 2.091511137868754,
"grad_norm": 4.724112510681152,
"learning_rate": 5.472665120542824e-07,
"loss": 0.2851,
"step": 17370
},
{
"epoch": 2.0927152317880795,
"grad_norm": 5.797724723815918,
"learning_rate": 5.4674345379e-07,
"loss": 0.3136,
"step": 17380
},
{
"epoch": 2.093919325707405,
"grad_norm": 4.77787446975708,
"learning_rate": 5.462203439154361e-07,
"loss": 0.3059,
"step": 17390
},
{
"epoch": 2.0951234196267308,
"grad_norm": 4.670202732086182,
"learning_rate": 5.456971830081655e-07,
"loss": 0.3219,
"step": 17400
},
{
"epoch": 2.0963275135460564,
"grad_norm": 4.7208099365234375,
"learning_rate": 5.451739716458195e-07,
"loss": 0.3146,
"step": 17410
},
{
"epoch": 2.0975316074653825,
"grad_norm": 4.647831439971924,
"learning_rate": 5.446507104060851e-07,
"loss": 0.3266,
"step": 17420
},
{
"epoch": 2.098735701384708,
"grad_norm": 4.2992987632751465,
"learning_rate": 5.441273998667046e-07,
"loss": 0.3091,
"step": 17430
},
{
"epoch": 2.0999397953040337,
"grad_norm": 4.718204975128174,
"learning_rate": 5.436040406054742e-07,
"loss": 0.3103,
"step": 17440
},
{
"epoch": 2.1011438892233594,
"grad_norm": 4.716932773590088,
"learning_rate": 5.430806332002443e-07,
"loss": 0.3044,
"step": 17450
},
{
"epoch": 2.102347983142685,
"grad_norm": 4.856298923492432,
"learning_rate": 5.425571782289185e-07,
"loss": 0.3039,
"step": 17460
},
{
"epoch": 2.1035520770620106,
"grad_norm": 5.1161208152771,
"learning_rate": 5.420336762694524e-07,
"loss": 0.3014,
"step": 17470
},
{
"epoch": 2.1047561709813367,
"grad_norm": 4.895595550537109,
"learning_rate": 5.415101278998543e-07,
"loss": 0.3113,
"step": 17480
},
{
"epoch": 2.1059602649006623,
"grad_norm": 4.259979248046875,
"learning_rate": 5.409865336981832e-07,
"loss": 0.3158,
"step": 17490
},
{
"epoch": 2.107164358819988,
"grad_norm": 5.523928642272949,
"learning_rate": 5.404628942425484e-07,
"loss": 0.3293,
"step": 17500
},
{
"epoch": 2.1083684527393136,
"grad_norm": 5.490001201629639,
"learning_rate": 5.399392101111102e-07,
"loss": 0.3253,
"step": 17510
},
{
"epoch": 2.1095725466586392,
"grad_norm": 4.070251941680908,
"learning_rate": 5.39415481882077e-07,
"loss": 0.3341,
"step": 17520
},
{
"epoch": 2.110776640577965,
"grad_norm": 4.516000270843506,
"learning_rate": 5.388917101337069e-07,
"loss": 0.3115,
"step": 17530
},
{
"epoch": 2.111980734497291,
"grad_norm": 4.881539821624756,
"learning_rate": 5.383678954443056e-07,
"loss": 0.2962,
"step": 17540
},
{
"epoch": 2.1131848284166166,
"grad_norm": 4.361866474151611,
"learning_rate": 5.378440383922261e-07,
"loss": 0.2959,
"step": 17550
},
{
"epoch": 2.1143889223359422,
"grad_norm": 4.218469619750977,
"learning_rate": 5.373201395558683e-07,
"loss": 0.3004,
"step": 17560
},
{
"epoch": 2.115593016255268,
"grad_norm": 5.058506488800049,
"learning_rate": 5.367961995136782e-07,
"loss": 0.3177,
"step": 17570
},
{
"epoch": 2.1167971101745935,
"grad_norm": 5.340724468231201,
"learning_rate": 5.362722188441476e-07,
"loss": 0.3116,
"step": 17580
},
{
"epoch": 2.118001204093919,
"grad_norm": 4.867612361907959,
"learning_rate": 5.357481981258128e-07,
"loss": 0.3287,
"step": 17590
},
{
"epoch": 2.119205298013245,
"grad_norm": 4.499852180480957,
"learning_rate": 5.352241379372545e-07,
"loss": 0.3057,
"step": 17600
},
{
"epoch": 2.120409391932571,
"grad_norm": 5.446403980255127,
"learning_rate": 5.347000388570966e-07,
"loss": 0.3206,
"step": 17610
},
{
"epoch": 2.1216134858518965,
"grad_norm": 4.157654762268066,
"learning_rate": 5.341759014640067e-07,
"loss": 0.2985,
"step": 17620
},
{
"epoch": 2.122817579771222,
"grad_norm": 5.162617206573486,
"learning_rate": 5.336517263366939e-07,
"loss": 0.3057,
"step": 17630
},
{
"epoch": 2.1240216736905477,
"grad_norm": 4.874579906463623,
"learning_rate": 5.331275140539094e-07,
"loss": 0.3096,
"step": 17640
},
{
"epoch": 2.125225767609874,
"grad_norm": 4.7379350662231445,
"learning_rate": 5.326032651944453e-07,
"loss": 0.3178,
"step": 17650
},
{
"epoch": 2.1264298615291994,
"grad_norm": 4.660308361053467,
"learning_rate": 5.320789803371344e-07,
"loss": 0.3121,
"step": 17660
},
{
"epoch": 2.127633955448525,
"grad_norm": 4.264311790466309,
"learning_rate": 5.315546600608486e-07,
"loss": 0.3041,
"step": 17670
},
{
"epoch": 2.1288380493678507,
"grad_norm": 5.007218360900879,
"learning_rate": 5.310303049444995e-07,
"loss": 0.3133,
"step": 17680
},
{
"epoch": 2.1300421432871763,
"grad_norm": 4.878419399261475,
"learning_rate": 5.305059155670369e-07,
"loss": 0.307,
"step": 17690
},
{
"epoch": 2.131246237206502,
"grad_norm": 4.373286724090576,
"learning_rate": 5.299814925074485e-07,
"loss": 0.2988,
"step": 17700
},
{
"epoch": 2.1324503311258276,
"grad_norm": 4.705572128295898,
"learning_rate": 5.294570363447589e-07,
"loss": 0.3101,
"step": 17710
},
{
"epoch": 2.1336544250451537,
"grad_norm": 5.6706461906433105,
"learning_rate": 5.2893254765803e-07,
"loss": 0.3182,
"step": 17720
},
{
"epoch": 2.1348585189644793,
"grad_norm": 4.4038896560668945,
"learning_rate": 5.284080270263586e-07,
"loss": 0.3055,
"step": 17730
},
{
"epoch": 2.136062612883805,
"grad_norm": 4.746342658996582,
"learning_rate": 5.278834750288776e-07,
"loss": 0.3098,
"step": 17740
},
{
"epoch": 2.1372667068031306,
"grad_norm": 4.472485065460205,
"learning_rate": 5.273588922447543e-07,
"loss": 0.3192,
"step": 17750
},
{
"epoch": 2.138470800722456,
"grad_norm": 5.553606033325195,
"learning_rate": 5.268342792531897e-07,
"loss": 0.3328,
"step": 17760
},
{
"epoch": 2.1396748946417823,
"grad_norm": 5.298537731170654,
"learning_rate": 5.263096366334183e-07,
"loss": 0.3072,
"step": 17770
},
{
"epoch": 2.140878988561108,
"grad_norm": 4.98936128616333,
"learning_rate": 5.257849649647077e-07,
"loss": 0.3131,
"step": 17780
},
{
"epoch": 2.1420830824804336,
"grad_norm": 4.389891147613525,
"learning_rate": 5.252602648263569e-07,
"loss": 0.3142,
"step": 17790
},
{
"epoch": 2.143287176399759,
"grad_norm": 4.614076614379883,
"learning_rate": 5.24735536797697e-07,
"loss": 0.3075,
"step": 17800
},
{
"epoch": 2.144491270319085,
"grad_norm": 5.098964214324951,
"learning_rate": 5.242107814580893e-07,
"loss": 0.3125,
"step": 17810
},
{
"epoch": 2.1456953642384105,
"grad_norm": 4.502909183502197,
"learning_rate": 5.236859993869258e-07,
"loss": 0.2986,
"step": 17820
},
{
"epoch": 2.146899458157736,
"grad_norm": 5.02591609954834,
"learning_rate": 5.231611911636276e-07,
"loss": 0.294,
"step": 17830
},
{
"epoch": 2.148103552077062,
"grad_norm": 4.412136077880859,
"learning_rate": 5.226363573676447e-07,
"loss": 0.3085,
"step": 17840
},
{
"epoch": 2.149307645996388,
"grad_norm": 4.393168926239014,
"learning_rate": 5.221114985784558e-07,
"loss": 0.3145,
"step": 17850
},
{
"epoch": 2.1505117399157134,
"grad_norm": 4.741860389709473,
"learning_rate": 5.215866153755666e-07,
"loss": 0.3194,
"step": 17860
},
{
"epoch": 2.151715833835039,
"grad_norm": 4.4850006103515625,
"learning_rate": 5.210617083385101e-07,
"loss": 0.3015,
"step": 17870
},
{
"epoch": 2.1529199277543647,
"grad_norm": 5.466598033905029,
"learning_rate": 5.205367780468455e-07,
"loss": 0.311,
"step": 17880
},
{
"epoch": 2.1541240216736908,
"grad_norm": 5.164214611053467,
"learning_rate": 5.200118250801578e-07,
"loss": 0.3161,
"step": 17890
},
{
"epoch": 2.1553281155930164,
"grad_norm": 4.714061737060547,
"learning_rate": 5.194868500180567e-07,
"loss": 0.3171,
"step": 17900
},
{
"epoch": 2.156532209512342,
"grad_norm": 4.755367279052734,
"learning_rate": 5.189618534401768e-07,
"loss": 0.3059,
"step": 17910
},
{
"epoch": 2.1577363034316677,
"grad_norm": 4.605241298675537,
"learning_rate": 5.184368359261761e-07,
"loss": 0.3207,
"step": 17920
},
{
"epoch": 2.1589403973509933,
"grad_norm": 5.180820465087891,
"learning_rate": 5.179117980557357e-07,
"loss": 0.3097,
"step": 17930
},
{
"epoch": 2.160144491270319,
"grad_norm": 5.053746700286865,
"learning_rate": 5.173867404085594e-07,
"loss": 0.3208,
"step": 17940
},
{
"epoch": 2.1613485851896446,
"grad_norm": 4.809300899505615,
"learning_rate": 5.168616635643728e-07,
"loss": 0.3009,
"step": 17950
},
{
"epoch": 2.1625526791089706,
"grad_norm": 4.434291839599609,
"learning_rate": 5.163365681029224e-07,
"loss": 0.3118,
"step": 17960
},
{
"epoch": 2.1637567730282963,
"grad_norm": 3.94570255279541,
"learning_rate": 5.158114546039756e-07,
"loss": 0.3081,
"step": 17970
},
{
"epoch": 2.164960866947622,
"grad_norm": 4.972118854522705,
"learning_rate": 5.152863236473195e-07,
"loss": 0.3,
"step": 17980
},
{
"epoch": 2.1661649608669475,
"grad_norm": 5.422942161560059,
"learning_rate": 5.147611758127608e-07,
"loss": 0.3039,
"step": 17990
},
{
"epoch": 2.167369054786273,
"grad_norm": 4.45037317276001,
"learning_rate": 5.142360116801242e-07,
"loss": 0.3158,
"step": 18000
},
{
"epoch": 2.1685731487055993,
"grad_norm": 5.098633289337158,
"learning_rate": 5.137108318292533e-07,
"loss": 0.2949,
"step": 18010
},
{
"epoch": 2.169777242624925,
"grad_norm": 5.256601810455322,
"learning_rate": 5.131856368400082e-07,
"loss": 0.3037,
"step": 18020
},
{
"epoch": 2.1709813365442505,
"grad_norm": 5.189584732055664,
"learning_rate": 5.126604272922659e-07,
"loss": 0.3256,
"step": 18030
},
{
"epoch": 2.172185430463576,
"grad_norm": 4.259381294250488,
"learning_rate": 5.121352037659201e-07,
"loss": 0.3051,
"step": 18040
},
{
"epoch": 2.173389524382902,
"grad_norm": 4.795348644256592,
"learning_rate": 5.116099668408791e-07,
"loss": 0.3002,
"step": 18050
},
{
"epoch": 2.1745936183022274,
"grad_norm": 5.63735818862915,
"learning_rate": 5.110847170970665e-07,
"loss": 0.313,
"step": 18060
},
{
"epoch": 2.175797712221553,
"grad_norm": 6.581758975982666,
"learning_rate": 5.1055945511442e-07,
"loss": 0.3014,
"step": 18070
},
{
"epoch": 2.177001806140879,
"grad_norm": 5.026032447814941,
"learning_rate": 5.100341814728904e-07,
"loss": 0.3009,
"step": 18080
},
{
"epoch": 2.1782059000602048,
"grad_norm": 4.6837263107299805,
"learning_rate": 5.095088967524423e-07,
"loss": 0.3251,
"step": 18090
},
{
"epoch": 2.1794099939795304,
"grad_norm": 4.637839317321777,
"learning_rate": 5.089836015330513e-07,
"loss": 0.3177,
"step": 18100
},
{
"epoch": 2.180614087898856,
"grad_norm": 4.267435550689697,
"learning_rate": 5.084582963947057e-07,
"loss": 0.3003,
"step": 18110
},
{
"epoch": 2.1818181818181817,
"grad_norm": 4.481462001800537,
"learning_rate": 5.07932981917404e-07,
"loss": 0.3084,
"step": 18120
},
{
"epoch": 2.1830222757375077,
"grad_norm": 5.001600742340088,
"learning_rate": 5.074076586811554e-07,
"loss": 0.3117,
"step": 18130
},
{
"epoch": 2.1842263696568334,
"grad_norm": 4.785762310028076,
"learning_rate": 5.068823272659785e-07,
"loss": 0.3044,
"step": 18140
},
{
"epoch": 2.185430463576159,
"grad_norm": 4.241122245788574,
"learning_rate": 5.063569882519014e-07,
"loss": 0.3114,
"step": 18150
},
{
"epoch": 2.1866345574954846,
"grad_norm": 4.614393711090088,
"learning_rate": 5.0583164221896e-07,
"loss": 0.3143,
"step": 18160
},
{
"epoch": 2.1878386514148103,
"grad_norm": 5.790137767791748,
"learning_rate": 5.053062897471985e-07,
"loss": 0.3086,
"step": 18170
},
{
"epoch": 2.189042745334136,
"grad_norm": 5.027008056640625,
"learning_rate": 5.047809314166677e-07,
"loss": 0.2996,
"step": 18180
},
{
"epoch": 2.190246839253462,
"grad_norm": 4.725672245025635,
"learning_rate": 5.042555678074251e-07,
"loss": 0.3101,
"step": 18190
},
{
"epoch": 2.1914509331727876,
"grad_norm": 4.756001949310303,
"learning_rate": 5.037301994995342e-07,
"loss": 0.2892,
"step": 18200
},
{
"epoch": 2.1926550270921132,
"grad_norm": 3.9560751914978027,
"learning_rate": 5.032048270730634e-07,
"loss": 0.3118,
"step": 18210
},
{
"epoch": 2.193859121011439,
"grad_norm": 4.681294918060303,
"learning_rate": 5.026794511080859e-07,
"loss": 0.306,
"step": 18220
},
{
"epoch": 2.1950632149307645,
"grad_norm": 5.220909118652344,
"learning_rate": 5.021540721846787e-07,
"loss": 0.3089,
"step": 18230
},
{
"epoch": 2.19626730885009,
"grad_norm": 4.095883369445801,
"learning_rate": 5.016286908829218e-07,
"loss": 0.3179,
"step": 18240
},
{
"epoch": 2.197471402769416,
"grad_norm": 4.485768795013428,
"learning_rate": 5.011033077828982e-07,
"loss": 0.3037,
"step": 18250
},
{
"epoch": 2.198675496688742,
"grad_norm": 4.850970268249512,
"learning_rate": 5.00577923464693e-07,
"loss": 0.3098,
"step": 18260
},
{
"epoch": 2.1998795906080675,
"grad_norm": 4.3276848793029785,
"learning_rate": 5.000525385083919e-07,
"loss": 0.3117,
"step": 18270
},
{
"epoch": 2.201083684527393,
"grad_norm": 4.39775276184082,
"learning_rate": 4.995271534940823e-07,
"loss": 0.3185,
"step": 18280
},
{
"epoch": 2.2022877784467187,
"grad_norm": 4.972282409667969,
"learning_rate": 4.99001769001851e-07,
"loss": 0.3131,
"step": 18290
},
{
"epoch": 2.2034918723660444,
"grad_norm": 4.450355052947998,
"learning_rate": 4.984763856117842e-07,
"loss": 0.3052,
"step": 18300
},
{
"epoch": 2.2046959662853705,
"grad_norm": 4.771944046020508,
"learning_rate": 4.979510039039674e-07,
"loss": 0.3087,
"step": 18310
},
{
"epoch": 2.205900060204696,
"grad_norm": 4.077056407928467,
"learning_rate": 4.974256244584838e-07,
"loss": 0.2991,
"step": 18320
},
{
"epoch": 2.2071041541240217,
"grad_norm": 4.485861778259277,
"learning_rate": 4.969002478554139e-07,
"loss": 0.3117,
"step": 18330
},
{
"epoch": 2.2083082480433474,
"grad_norm": 4.26900053024292,
"learning_rate": 4.963748746748358e-07,
"loss": 0.299,
"step": 18340
},
{
"epoch": 2.209512341962673,
"grad_norm": 5.258630752563477,
"learning_rate": 4.958495054968235e-07,
"loss": 0.3109,
"step": 18350
},
{
"epoch": 2.2107164358819986,
"grad_norm": 5.4050774574279785,
"learning_rate": 4.953241409014459e-07,
"loss": 0.3263,
"step": 18360
},
{
"epoch": 2.2119205298013247,
"grad_norm": 4.431223392486572,
"learning_rate": 4.947987814687679e-07,
"loss": 0.3131,
"step": 18370
},
{
"epoch": 2.2131246237206503,
"grad_norm": 5.015274524688721,
"learning_rate": 4.942734277788481e-07,
"loss": 0.3122,
"step": 18380
},
{
"epoch": 2.214328717639976,
"grad_norm": 5.460362911224365,
"learning_rate": 4.937480804117392e-07,
"loss": 0.3049,
"step": 18390
},
{
"epoch": 2.2155328115593016,
"grad_norm": 4.469453811645508,
"learning_rate": 4.93222739947486e-07,
"loss": 0.3109,
"step": 18400
},
{
"epoch": 2.2167369054786272,
"grad_norm": 4.560921669006348,
"learning_rate": 4.926974069661265e-07,
"loss": 0.3155,
"step": 18410
},
{
"epoch": 2.217940999397953,
"grad_norm": 4.696376800537109,
"learning_rate": 4.921720820476904e-07,
"loss": 0.3256,
"step": 18420
},
{
"epoch": 2.219145093317279,
"grad_norm": 4.80272102355957,
"learning_rate": 4.916467657721984e-07,
"loss": 0.3172,
"step": 18430
},
{
"epoch": 2.2203491872366046,
"grad_norm": 4.686549663543701,
"learning_rate": 4.911214587196612e-07,
"loss": 0.3044,
"step": 18440
},
{
"epoch": 2.22155328115593,
"grad_norm": 4.5141921043396,
"learning_rate": 4.9059616147008e-07,
"loss": 0.296,
"step": 18450
},
{
"epoch": 2.222757375075256,
"grad_norm": 4.311396598815918,
"learning_rate": 4.900708746034446e-07,
"loss": 0.3052,
"step": 18460
},
{
"epoch": 2.2239614689945815,
"grad_norm": 4.644687175750732,
"learning_rate": 4.895455986997341e-07,
"loss": 0.3091,
"step": 18470
},
{
"epoch": 2.225165562913907,
"grad_norm": 4.708485126495361,
"learning_rate": 4.890203343389144e-07,
"loss": 0.3126,
"step": 18480
},
{
"epoch": 2.226369656833233,
"grad_norm": 4.648069381713867,
"learning_rate": 4.884950821009394e-07,
"loss": 0.3303,
"step": 18490
},
{
"epoch": 2.227573750752559,
"grad_norm": 5.3636555671691895,
"learning_rate": 4.8796984256575e-07,
"loss": 0.308,
"step": 18500
},
{
"epoch": 2.2287778446718844,
"grad_norm": 4.061014652252197,
"learning_rate": 4.874446163132719e-07,
"loss": 0.2957,
"step": 18510
},
{
"epoch": 2.22998193859121,
"grad_norm": 6.169346332550049,
"learning_rate": 4.869194039234169e-07,
"loss": 0.318,
"step": 18520
},
{
"epoch": 2.2311860325105357,
"grad_norm": 4.9474053382873535,
"learning_rate": 4.863942059760817e-07,
"loss": 0.3112,
"step": 18530
},
{
"epoch": 2.2323901264298613,
"grad_norm": 4.635356903076172,
"learning_rate": 4.858690230511465e-07,
"loss": 0.3006,
"step": 18540
},
{
"epoch": 2.2335942203491874,
"grad_norm": 4.872357368469238,
"learning_rate": 4.85343855728475e-07,
"loss": 0.315,
"step": 18550
},
{
"epoch": 2.234798314268513,
"grad_norm": 4.909818172454834,
"learning_rate": 4.848187045879141e-07,
"loss": 0.2983,
"step": 18560
},
{
"epoch": 2.2360024081878387,
"grad_norm": 5.507841110229492,
"learning_rate": 4.842935702092923e-07,
"loss": 0.2919,
"step": 18570
},
{
"epoch": 2.2372065021071643,
"grad_norm": 4.438649654388428,
"learning_rate": 4.837684531724202e-07,
"loss": 0.3012,
"step": 18580
},
{
"epoch": 2.23841059602649,
"grad_norm": 4.70427942276001,
"learning_rate": 4.832433540570885e-07,
"loss": 0.3076,
"step": 18590
},
{
"epoch": 2.2396146899458156,
"grad_norm": 4.81848669052124,
"learning_rate": 4.827182734430687e-07,
"loss": 0.3021,
"step": 18600
},
{
"epoch": 2.2408187838651417,
"grad_norm": 4.911860466003418,
"learning_rate": 4.821932119101116e-07,
"loss": 0.3109,
"step": 18610
},
{
"epoch": 2.2420228777844673,
"grad_norm": 5.092623233795166,
"learning_rate": 4.816681700379472e-07,
"loss": 0.3243,
"step": 18620
},
{
"epoch": 2.243226971703793,
"grad_norm": 4.224728584289551,
"learning_rate": 4.811431484062832e-07,
"loss": 0.3128,
"step": 18630
},
{
"epoch": 2.2444310656231186,
"grad_norm": 4.93331241607666,
"learning_rate": 4.806181475948057e-07,
"loss": 0.3147,
"step": 18640
},
{
"epoch": 2.245635159542444,
"grad_norm": 6.220354080200195,
"learning_rate": 4.800931681831773e-07,
"loss": 0.2964,
"step": 18650
},
{
"epoch": 2.24683925346177,
"grad_norm": 5.004923343658447,
"learning_rate": 4.795682107510375e-07,
"loss": 0.3172,
"step": 18660
},
{
"epoch": 2.248043347381096,
"grad_norm": 5.164400577545166,
"learning_rate": 4.790432758780005e-07,
"loss": 0.3063,
"step": 18670
},
{
"epoch": 2.2492474413004215,
"grad_norm": 5.098756313323975,
"learning_rate": 4.785183641436569e-07,
"loss": 0.3045,
"step": 18680
},
{
"epoch": 2.250451535219747,
"grad_norm": 4.363048553466797,
"learning_rate": 4.779934761275706e-07,
"loss": 0.3084,
"step": 18690
},
{
"epoch": 2.251655629139073,
"grad_norm": 5.233163833618164,
"learning_rate": 4.774686124092804e-07,
"loss": 0.316,
"step": 18700
},
{
"epoch": 2.2528597230583984,
"grad_norm": 4.870039463043213,
"learning_rate": 4.769437735682972e-07,
"loss": 0.3008,
"step": 18710
},
{
"epoch": 2.254063816977724,
"grad_norm": 5.44446325302124,
"learning_rate": 4.7641896018410506e-07,
"loss": 0.3139,
"step": 18720
},
{
"epoch": 2.25526791089705,
"grad_norm": 4.950879096984863,
"learning_rate": 4.758941728361599e-07,
"loss": 0.3108,
"step": 18730
},
{
"epoch": 2.2564720048163758,
"grad_norm": 4.887548446655273,
"learning_rate": 4.7536941210388895e-07,
"loss": 0.3195,
"step": 18740
},
{
"epoch": 2.2576760987357014,
"grad_norm": 6.180630207061768,
"learning_rate": 4.7484467856668946e-07,
"loss": 0.3112,
"step": 18750
},
{
"epoch": 2.258880192655027,
"grad_norm": 5.481302738189697,
"learning_rate": 4.743199728039294e-07,
"loss": 0.3124,
"step": 18760
},
{
"epoch": 2.2600842865743527,
"grad_norm": 4.6261677742004395,
"learning_rate": 4.737952953949457e-07,
"loss": 0.3058,
"step": 18770
},
{
"epoch": 2.2612883804936788,
"grad_norm": 4.097585201263428,
"learning_rate": 4.732706469190442e-07,
"loss": 0.3271,
"step": 18780
},
{
"epoch": 2.2624924744130044,
"grad_norm": 5.000282287597656,
"learning_rate": 4.7274602795549836e-07,
"loss": 0.317,
"step": 18790
},
{
"epoch": 2.26369656833233,
"grad_norm": 4.3350958824157715,
"learning_rate": 4.7222143908354943e-07,
"loss": 0.3083,
"step": 18800
},
{
"epoch": 2.2649006622516556,
"grad_norm": 4.336573123931885,
"learning_rate": 4.7169688088240555e-07,
"loss": 0.3139,
"step": 18810
},
{
"epoch": 2.2661047561709813,
"grad_norm": 4.1952900886535645,
"learning_rate": 4.7117235393124064e-07,
"loss": 0.294,
"step": 18820
},
{
"epoch": 2.267308850090307,
"grad_norm": 5.418072700500488,
"learning_rate": 4.7064785880919414e-07,
"loss": 0.3185,
"step": 18830
},
{
"epoch": 2.2685129440096325,
"grad_norm": 5.001430511474609,
"learning_rate": 4.701233960953708e-07,
"loss": 0.3108,
"step": 18840
},
{
"epoch": 2.2697170379289586,
"grad_norm": 5.28980827331543,
"learning_rate": 4.69598966368839e-07,
"loss": 0.3149,
"step": 18850
},
{
"epoch": 2.2709211318482843,
"grad_norm": 5.221833229064941,
"learning_rate": 4.6907457020863095e-07,
"loss": 0.3106,
"step": 18860
},
{
"epoch": 2.27212522576761,
"grad_norm": 4.259886264801025,
"learning_rate": 4.6855020819374196e-07,
"loss": 0.3159,
"step": 18870
},
{
"epoch": 2.2733293196869355,
"grad_norm": 5.210353851318359,
"learning_rate": 4.680258809031293e-07,
"loss": 0.306,
"step": 18880
},
{
"epoch": 2.274533413606261,
"grad_norm": 4.933556079864502,
"learning_rate": 4.6750158891571246e-07,
"loss": 0.2988,
"step": 18890
},
{
"epoch": 2.2757375075255872,
"grad_norm": 5.060166358947754,
"learning_rate": 4.669773328103712e-07,
"loss": 0.3298,
"step": 18900
},
{
"epoch": 2.276941601444913,
"grad_norm": 5.316260814666748,
"learning_rate": 4.664531131659461e-07,
"loss": 0.3193,
"step": 18910
},
{
"epoch": 2.2781456953642385,
"grad_norm": 4.371904373168945,
"learning_rate": 4.659289305612375e-07,
"loss": 0.3181,
"step": 18920
},
{
"epoch": 2.279349789283564,
"grad_norm": 4.114840984344482,
"learning_rate": 4.65404785575005e-07,
"loss": 0.3089,
"step": 18930
},
{
"epoch": 2.2805538832028898,
"grad_norm": 4.94135046005249,
"learning_rate": 4.64880678785966e-07,
"loss": 0.3158,
"step": 18940
},
{
"epoch": 2.2817579771222154,
"grad_norm": 5.033153057098389,
"learning_rate": 4.6435661077279633e-07,
"loss": 0.3087,
"step": 18950
},
{
"epoch": 2.282962071041541,
"grad_norm": 4.434708595275879,
"learning_rate": 4.638325821141289e-07,
"loss": 0.3031,
"step": 18960
},
{
"epoch": 2.284166164960867,
"grad_norm": 4.674195766448975,
"learning_rate": 4.6330859338855325e-07,
"loss": 0.3227,
"step": 18970
},
{
"epoch": 2.2853702588801927,
"grad_norm": 4.624505043029785,
"learning_rate": 4.6278464517461434e-07,
"loss": 0.2994,
"step": 18980
},
{
"epoch": 2.2865743527995184,
"grad_norm": 4.435290336608887,
"learning_rate": 4.622607380508129e-07,
"loss": 0.3125,
"step": 18990
},
{
"epoch": 2.287778446718844,
"grad_norm": 4.538943767547607,
"learning_rate": 4.6173687259560417e-07,
"loss": 0.3166,
"step": 19000
},
{
"epoch": 2.2889825406381696,
"grad_norm": 5.1769890785217285,
"learning_rate": 4.6121304938739754e-07,
"loss": 0.2978,
"step": 19010
},
{
"epoch": 2.2901866345574957,
"grad_norm": 4.897463321685791,
"learning_rate": 4.606892690045551e-07,
"loss": 0.2857,
"step": 19020
},
{
"epoch": 2.2913907284768213,
"grad_norm": 5.332199573516846,
"learning_rate": 4.601655320253924e-07,
"loss": 0.3082,
"step": 19030
},
{
"epoch": 2.292594822396147,
"grad_norm": 4.842720985412598,
"learning_rate": 4.5964183902817677e-07,
"loss": 0.3003,
"step": 19040
},
{
"epoch": 2.2937989163154726,
"grad_norm": 4.277060031890869,
"learning_rate": 4.5911819059112724e-07,
"loss": 0.3027,
"step": 19050
},
{
"epoch": 2.2950030102347982,
"grad_norm": 4.499503135681152,
"learning_rate": 4.5859458729241287e-07,
"loss": 0.311,
"step": 19060
},
{
"epoch": 2.296207104154124,
"grad_norm": 5.2861762046813965,
"learning_rate": 4.580710297101537e-07,
"loss": 0.3197,
"step": 19070
},
{
"epoch": 2.2974111980734495,
"grad_norm": 4.3773112297058105,
"learning_rate": 4.5754751842241905e-07,
"loss": 0.3113,
"step": 19080
},
{
"epoch": 2.2986152919927756,
"grad_norm": 4.447787284851074,
"learning_rate": 4.5702405400722703e-07,
"loss": 0.3037,
"step": 19090
},
{
"epoch": 2.299819385912101,
"grad_norm": 5.014771938323975,
"learning_rate": 4.5650063704254395e-07,
"loss": 0.3018,
"step": 19100
},
{
"epoch": 2.301023479831427,
"grad_norm": 4.333285331726074,
"learning_rate": 4.55977268106284e-07,
"loss": 0.3176,
"step": 19110
},
{
"epoch": 2.3022275737507525,
"grad_norm": 6.291433334350586,
"learning_rate": 4.5545394777630786e-07,
"loss": 0.3335,
"step": 19120
},
{
"epoch": 2.303431667670078,
"grad_norm": 4.657562255859375,
"learning_rate": 4.5493067663042325e-07,
"loss": 0.3059,
"step": 19130
},
{
"epoch": 2.304635761589404,
"grad_norm": 4.472227573394775,
"learning_rate": 4.544074552463829e-07,
"loss": 0.3074,
"step": 19140
},
{
"epoch": 2.30583985550873,
"grad_norm": 5.011964797973633,
"learning_rate": 4.5388428420188486e-07,
"loss": 0.3036,
"step": 19150
},
{
"epoch": 2.3070439494280555,
"grad_norm": 5.620879173278809,
"learning_rate": 4.533611640745718e-07,
"loss": 0.31,
"step": 19160
},
{
"epoch": 2.308248043347381,
"grad_norm": 5.25240421295166,
"learning_rate": 4.5283809544202996e-07,
"loss": 0.328,
"step": 19170
},
{
"epoch": 2.3094521372667067,
"grad_norm": 4.3917317390441895,
"learning_rate": 4.5231507888178856e-07,
"loss": 0.3129,
"step": 19180
},
{
"epoch": 2.3106562311860324,
"grad_norm": 4.568994998931885,
"learning_rate": 4.517921149713196e-07,
"loss": 0.3057,
"step": 19190
},
{
"epoch": 2.311860325105358,
"grad_norm": 4.5026726722717285,
"learning_rate": 4.512692042880372e-07,
"loss": 0.2997,
"step": 19200
},
{
"epoch": 2.313064419024684,
"grad_norm": 3.986133098602295,
"learning_rate": 4.507463474092959e-07,
"loss": 0.2952,
"step": 19210
},
{
"epoch": 2.3142685129440097,
"grad_norm": 4.367317199707031,
"learning_rate": 4.5022354491239145e-07,
"loss": 0.3036,
"step": 19220
},
{
"epoch": 2.3154726068633353,
"grad_norm": 5.649072170257568,
"learning_rate": 4.497007973745595e-07,
"loss": 0.3173,
"step": 19230
},
{
"epoch": 2.316676700782661,
"grad_norm": 5.655643463134766,
"learning_rate": 4.4917810537297514e-07,
"loss": 0.327,
"step": 19240
},
{
"epoch": 2.3178807947019866,
"grad_norm": 5.137732982635498,
"learning_rate": 4.4865546948475147e-07,
"loss": 0.3065,
"step": 19250
},
{
"epoch": 2.3190848886213127,
"grad_norm": 4.715443134307861,
"learning_rate": 4.481328902869404e-07,
"loss": 0.3207,
"step": 19260
},
{
"epoch": 2.3202889825406383,
"grad_norm": 3.9082722663879395,
"learning_rate": 4.476103683565308e-07,
"loss": 0.3074,
"step": 19270
},
{
"epoch": 2.321493076459964,
"grad_norm": 4.448252201080322,
"learning_rate": 4.4708790427044887e-07,
"loss": 0.3063,
"step": 19280
},
{
"epoch": 2.3226971703792896,
"grad_norm": 4.547604560852051,
"learning_rate": 4.465654986055559e-07,
"loss": 0.3098,
"step": 19290
},
{
"epoch": 2.323901264298615,
"grad_norm": 5.669996738433838,
"learning_rate": 4.460431519386497e-07,
"loss": 0.3188,
"step": 19300
},
{
"epoch": 2.325105358217941,
"grad_norm": 5.271092891693115,
"learning_rate": 4.4552086484646246e-07,
"loss": 0.2948,
"step": 19310
},
{
"epoch": 2.3263094521372665,
"grad_norm": 5.6719231605529785,
"learning_rate": 4.4499863790566087e-07,
"loss": 0.3089,
"step": 19320
},
{
"epoch": 2.3275135460565926,
"grad_norm": 5.9080657958984375,
"learning_rate": 4.444764716928447e-07,
"loss": 0.3195,
"step": 19330
},
{
"epoch": 2.328717639975918,
"grad_norm": 5.201897144317627,
"learning_rate": 4.43954366784547e-07,
"loss": 0.2979,
"step": 19340
},
{
"epoch": 2.329921733895244,
"grad_norm": 4.319961071014404,
"learning_rate": 4.4343232375723343e-07,
"loss": 0.3059,
"step": 19350
},
{
"epoch": 2.3311258278145695,
"grad_norm": 4.492523670196533,
"learning_rate": 4.4291034318730086e-07,
"loss": 0.2941,
"step": 19360
},
{
"epoch": 2.332329921733895,
"grad_norm": 5.589833736419678,
"learning_rate": 4.4238842565107715e-07,
"loss": 0.3089,
"step": 19370
},
{
"epoch": 2.333534015653221,
"grad_norm": 4.234698295593262,
"learning_rate": 4.4186657172482105e-07,
"loss": 0.3012,
"step": 19380
},
{
"epoch": 2.334738109572547,
"grad_norm": 4.777867317199707,
"learning_rate": 4.413447819847206e-07,
"loss": 0.3083,
"step": 19390
},
{
"epoch": 2.3359422034918724,
"grad_norm": 5.0551533699035645,
"learning_rate": 4.4082305700689334e-07,
"loss": 0.3056,
"step": 19400
},
{
"epoch": 2.337146297411198,
"grad_norm": 4.407803535461426,
"learning_rate": 4.40301397367385e-07,
"loss": 0.3137,
"step": 19410
},
{
"epoch": 2.3383503913305237,
"grad_norm": 4.408458709716797,
"learning_rate": 4.3977980364216925e-07,
"loss": 0.3234,
"step": 19420
},
{
"epoch": 2.3395544852498493,
"grad_norm": 5.100025653839111,
"learning_rate": 4.392582764071471e-07,
"loss": 0.3053,
"step": 19430
},
{
"epoch": 2.340758579169175,
"grad_norm": 4.870809078216553,
"learning_rate": 4.3873681623814634e-07,
"loss": 0.2973,
"step": 19440
},
{
"epoch": 2.341962673088501,
"grad_norm": 5.078246116638184,
"learning_rate": 4.3821542371092e-07,
"loss": 0.3042,
"step": 19450
},
{
"epoch": 2.3431667670078267,
"grad_norm": 4.400288105010986,
"learning_rate": 4.3769409940114706e-07,
"loss": 0.3012,
"step": 19460
},
{
"epoch": 2.3443708609271523,
"grad_norm": 5.289750576019287,
"learning_rate": 4.3717284388443123e-07,
"loss": 0.3149,
"step": 19470
},
{
"epoch": 2.345574954846478,
"grad_norm": 4.133148670196533,
"learning_rate": 4.3665165773629955e-07,
"loss": 0.311,
"step": 19480
},
{
"epoch": 2.3467790487658036,
"grad_norm": 4.689704418182373,
"learning_rate": 4.361305415322032e-07,
"loss": 0.2985,
"step": 19490
},
{
"epoch": 2.3479831426851296,
"grad_norm": 5.3425822257995605,
"learning_rate": 4.35609495847516e-07,
"loss": 0.3252,
"step": 19500
},
{
"epoch": 2.3491872366044553,
"grad_norm": 4.8020524978637695,
"learning_rate": 4.350885212575338e-07,
"loss": 0.3017,
"step": 19510
},
{
"epoch": 2.350391330523781,
"grad_norm": 3.823481798171997,
"learning_rate": 4.345676183374737e-07,
"loss": 0.3163,
"step": 19520
},
{
"epoch": 2.3515954244431065,
"grad_norm": 5.067866802215576,
"learning_rate": 4.3404678766247393e-07,
"loss": 0.2985,
"step": 19530
},
{
"epoch": 2.352799518362432,
"grad_norm": 4.470125198364258,
"learning_rate": 4.335260298075931e-07,
"loss": 0.3215,
"step": 19540
},
{
"epoch": 2.354003612281758,
"grad_norm": 4.854072093963623,
"learning_rate": 4.330053453478094e-07,
"loss": 0.3139,
"step": 19550
},
{
"epoch": 2.3552077062010834,
"grad_norm": 4.061732292175293,
"learning_rate": 4.3248473485801943e-07,
"loss": 0.2944,
"step": 19560
},
{
"epoch": 2.3564118001204095,
"grad_norm": 4.881399154663086,
"learning_rate": 4.319641989130387e-07,
"loss": 0.2958,
"step": 19570
},
{
"epoch": 2.357615894039735,
"grad_norm": 4.650146007537842,
"learning_rate": 4.3144373808760026e-07,
"loss": 0.3092,
"step": 19580
},
{
"epoch": 2.358819987959061,
"grad_norm": 5.014580249786377,
"learning_rate": 4.3092335295635444e-07,
"loss": 0.3143,
"step": 19590
},
{
"epoch": 2.3600240818783864,
"grad_norm": 5.064713478088379,
"learning_rate": 4.304030440938673e-07,
"loss": 0.3106,
"step": 19600
},
{
"epoch": 2.361228175797712,
"grad_norm": 4.044290065765381,
"learning_rate": 4.298828120746213e-07,
"loss": 0.3024,
"step": 19610
},
{
"epoch": 2.362432269717038,
"grad_norm": 5.447383403778076,
"learning_rate": 4.29362657473014e-07,
"loss": 0.3147,
"step": 19620
},
{
"epoch": 2.3636363636363638,
"grad_norm": 4.447105884552002,
"learning_rate": 4.2884258086335745e-07,
"loss": 0.303,
"step": 19630
},
{
"epoch": 2.3648404575556894,
"grad_norm": 4.2513957023620605,
"learning_rate": 4.2832258281987724e-07,
"loss": 0.3107,
"step": 19640
},
{
"epoch": 2.366044551475015,
"grad_norm": 5.619822025299072,
"learning_rate": 4.2780266391671277e-07,
"loss": 0.3212,
"step": 19650
},
{
"epoch": 2.3672486453943407,
"grad_norm": 5.056023597717285,
"learning_rate": 4.272828247279155e-07,
"loss": 0.298,
"step": 19660
},
{
"epoch": 2.3684527393136663,
"grad_norm": 4.584505558013916,
"learning_rate": 4.267630658274495e-07,
"loss": 0.3069,
"step": 19670
},
{
"epoch": 2.3696568332329924,
"grad_norm": 5.227287292480469,
"learning_rate": 4.2624338778918964e-07,
"loss": 0.296,
"step": 19680
},
{
"epoch": 2.370860927152318,
"grad_norm": 4.425261974334717,
"learning_rate": 4.2572379118692155e-07,
"loss": 0.3093,
"step": 19690
},
{
"epoch": 2.3720650210716436,
"grad_norm": 4.10771369934082,
"learning_rate": 4.2520427659434134e-07,
"loss": 0.295,
"step": 19700
},
{
"epoch": 2.3732691149909693,
"grad_norm": 4.561648845672607,
"learning_rate": 4.2468484458505456e-07,
"loss": 0.3006,
"step": 19710
},
{
"epoch": 2.374473208910295,
"grad_norm": 3.9050345420837402,
"learning_rate": 4.241654957325748e-07,
"loss": 0.3016,
"step": 19720
},
{
"epoch": 2.3756773028296205,
"grad_norm": 5.106329917907715,
"learning_rate": 4.2364623061032477e-07,
"loss": 0.3043,
"step": 19730
},
{
"epoch": 2.3768813967489466,
"grad_norm": 6.0447211265563965,
"learning_rate": 4.231270497916343e-07,
"loss": 0.3114,
"step": 19740
},
{
"epoch": 2.3780854906682722,
"grad_norm": 4.171956539154053,
"learning_rate": 4.2260795384974037e-07,
"loss": 0.3033,
"step": 19750
},
{
"epoch": 2.379289584587598,
"grad_norm": 4.500546932220459,
"learning_rate": 4.2208894335778573e-07,
"loss": 0.3066,
"step": 19760
},
{
"epoch": 2.3804936785069235,
"grad_norm": 5.30014181137085,
"learning_rate": 4.215700188888192e-07,
"loss": 0.3008,
"step": 19770
},
{
"epoch": 2.381697772426249,
"grad_norm": 4.23181676864624,
"learning_rate": 4.2105118101579497e-07,
"loss": 0.2925,
"step": 19780
},
{
"epoch": 2.3829018663455748,
"grad_norm": 4.446700096130371,
"learning_rate": 4.205324303115706e-07,
"loss": 0.3142,
"step": 19790
},
{
"epoch": 2.384105960264901,
"grad_norm": 5.344078063964844,
"learning_rate": 4.2001376734890824e-07,
"loss": 0.3053,
"step": 19800
},
{
"epoch": 2.3853100541842265,
"grad_norm": 5.066955089569092,
"learning_rate": 4.1949519270047295e-07,
"loss": 0.3071,
"step": 19810
},
{
"epoch": 2.386514148103552,
"grad_norm": 4.834653377532959,
"learning_rate": 4.1897670693883255e-07,
"loss": 0.3039,
"step": 19820
},
{
"epoch": 2.3877182420228777,
"grad_norm": 4.982695579528809,
"learning_rate": 4.1845831063645586e-07,
"loss": 0.3007,
"step": 19830
},
{
"epoch": 2.3889223359422034,
"grad_norm": 5.261125564575195,
"learning_rate": 4.1794000436571374e-07,
"loss": 0.3121,
"step": 19840
},
{
"epoch": 2.390126429861529,
"grad_norm": 5.1389570236206055,
"learning_rate": 4.174217886988775e-07,
"loss": 0.3058,
"step": 19850
},
{
"epoch": 2.391330523780855,
"grad_norm": 4.307366371154785,
"learning_rate": 4.169036642081183e-07,
"loss": 0.3008,
"step": 19860
},
{
"epoch": 2.3925346177001807,
"grad_norm": 5.068446636199951,
"learning_rate": 4.163856314655064e-07,
"loss": 0.3145,
"step": 19870
},
{
"epoch": 2.3937387116195064,
"grad_norm": 5.377712249755859,
"learning_rate": 4.1586769104301124e-07,
"loss": 0.3047,
"step": 19880
},
{
"epoch": 2.394942805538832,
"grad_norm": 5.161853313446045,
"learning_rate": 4.153498435124999e-07,
"loss": 0.3111,
"step": 19890
},
{
"epoch": 2.3961468994581576,
"grad_norm": 4.217031002044678,
"learning_rate": 4.1483208944573745e-07,
"loss": 0.2886,
"step": 19900
},
{
"epoch": 2.3973509933774833,
"grad_norm": 4.948873996734619,
"learning_rate": 4.1431442941438486e-07,
"loss": 0.3138,
"step": 19910
},
{
"epoch": 2.3985550872968093,
"grad_norm": 5.304249286651611,
"learning_rate": 4.1379686399000016e-07,
"loss": 0.3013,
"step": 19920
},
{
"epoch": 2.399759181216135,
"grad_norm": 5.372039318084717,
"learning_rate": 4.132793937440365e-07,
"loss": 0.316,
"step": 19930
},
{
"epoch": 2.4009632751354606,
"grad_norm": 5.1526265144348145,
"learning_rate": 4.127620192478421e-07,
"loss": 0.3177,
"step": 19940
},
{
"epoch": 2.4021673690547862,
"grad_norm": 4.650707244873047,
"learning_rate": 4.122447410726591e-07,
"loss": 0.3014,
"step": 19950
},
{
"epoch": 2.403371462974112,
"grad_norm": 4.576737403869629,
"learning_rate": 4.1172755978962395e-07,
"loss": 0.3069,
"step": 19960
},
{
"epoch": 2.4045755568934375,
"grad_norm": 5.201079845428467,
"learning_rate": 4.1121047596976534e-07,
"loss": 0.3151,
"step": 19970
},
{
"epoch": 2.4057796508127636,
"grad_norm": 4.859030723571777,
"learning_rate": 4.1069349018400503e-07,
"loss": 0.298,
"step": 19980
},
{
"epoch": 2.406983744732089,
"grad_norm": 5.44400691986084,
"learning_rate": 4.101766030031562e-07,
"loss": 0.303,
"step": 19990
},
{
"epoch": 2.408187838651415,
"grad_norm": 4.533078193664551,
"learning_rate": 4.0965981499792307e-07,
"loss": 0.3055,
"step": 20000
},
{
"epoch": 2.4093919325707405,
"grad_norm": 5.147141456604004,
"learning_rate": 4.0914312673890054e-07,
"loss": 0.3141,
"step": 20010
},
{
"epoch": 2.410596026490066,
"grad_norm": 4.530623912811279,
"learning_rate": 4.0862653879657373e-07,
"loss": 0.3205,
"step": 20020
},
{
"epoch": 2.411800120409392,
"grad_norm": 4.804474830627441,
"learning_rate": 4.08110051741316e-07,
"loss": 0.3113,
"step": 20030
},
{
"epoch": 2.413004214328718,
"grad_norm": 4.642183780670166,
"learning_rate": 4.0759366614339015e-07,
"loss": 0.3115,
"step": 20040
},
{
"epoch": 2.4142083082480434,
"grad_norm": 4.975921630859375,
"learning_rate": 4.0707738257294685e-07,
"loss": 0.3165,
"step": 20050
},
{
"epoch": 2.415412402167369,
"grad_norm": 4.621540546417236,
"learning_rate": 4.065612016000241e-07,
"loss": 0.2914,
"step": 20060
},
{
"epoch": 2.4166164960866947,
"grad_norm": 4.194451808929443,
"learning_rate": 4.060451237945462e-07,
"loss": 0.3035,
"step": 20070
},
{
"epoch": 2.4178205900060203,
"grad_norm": 4.82729959487915,
"learning_rate": 4.05529149726324e-07,
"loss": 0.3068,
"step": 20080
},
{
"epoch": 2.419024683925346,
"grad_norm": 5.17459774017334,
"learning_rate": 4.050132799650538e-07,
"loss": 0.3092,
"step": 20090
},
{
"epoch": 2.420228777844672,
"grad_norm": 5.787187576293945,
"learning_rate": 4.0449751508031666e-07,
"loss": 0.3168,
"step": 20100
},
{
"epoch": 2.4214328717639977,
"grad_norm": 4.466209411621094,
"learning_rate": 4.039818556415775e-07,
"loss": 0.296,
"step": 20110
},
{
"epoch": 2.4226369656833233,
"grad_norm": 4.929852485656738,
"learning_rate": 4.034663022181852e-07,
"loss": 0.3135,
"step": 20120
},
{
"epoch": 2.423841059602649,
"grad_norm": 4.523739337921143,
"learning_rate": 4.029508553793718e-07,
"loss": 0.288,
"step": 20130
},
{
"epoch": 2.4250451535219746,
"grad_norm": 7.000367641448975,
"learning_rate": 4.0243551569425095e-07,
"loss": 0.3105,
"step": 20140
},
{
"epoch": 2.4262492474413007,
"grad_norm": 6.229575157165527,
"learning_rate": 4.019202837318185e-07,
"loss": 0.3166,
"step": 20150
},
{
"epoch": 2.4274533413606263,
"grad_norm": 5.243337154388428,
"learning_rate": 4.0140516006095134e-07,
"loss": 0.3046,
"step": 20160
},
{
"epoch": 2.428657435279952,
"grad_norm": 4.598159313201904,
"learning_rate": 4.0089014525040685e-07,
"loss": 0.3064,
"step": 20170
},
{
"epoch": 2.4298615291992776,
"grad_norm": 4.482394695281982,
"learning_rate": 4.003752398688218e-07,
"loss": 0.3097,
"step": 20180
},
{
"epoch": 2.431065623118603,
"grad_norm": 5.39198637008667,
"learning_rate": 3.9986044448471244e-07,
"loss": 0.3112,
"step": 20190
},
{
"epoch": 2.432269717037929,
"grad_norm": 4.356963634490967,
"learning_rate": 3.9934575966647375e-07,
"loss": 0.3006,
"step": 20200
},
{
"epoch": 2.4334738109572545,
"grad_norm": 4.211975574493408,
"learning_rate": 3.9883118598237837e-07,
"loss": 0.2989,
"step": 20210
},
{
"epoch": 2.4346779048765805,
"grad_norm": 5.301422119140625,
"learning_rate": 3.9831672400057605e-07,
"loss": 0.3178,
"step": 20220
},
{
"epoch": 2.435881998795906,
"grad_norm": 4.181766510009766,
"learning_rate": 3.978023742890937e-07,
"loss": 0.3066,
"step": 20230
},
{
"epoch": 2.437086092715232,
"grad_norm": 5.18208122253418,
"learning_rate": 3.9728813741583383e-07,
"loss": 0.3001,
"step": 20240
},
{
"epoch": 2.4382901866345574,
"grad_norm": 5.382752418518066,
"learning_rate": 3.967740139485748e-07,
"loss": 0.3088,
"step": 20250
},
{
"epoch": 2.439494280553883,
"grad_norm": 5.215182304382324,
"learning_rate": 3.9626000445496934e-07,
"loss": 0.2882,
"step": 20260
},
{
"epoch": 2.440698374473209,
"grad_norm": 5.133399963378906,
"learning_rate": 3.957461095025444e-07,
"loss": 0.3303,
"step": 20270
},
{
"epoch": 2.4419024683925348,
"grad_norm": 5.194669246673584,
"learning_rate": 3.952323296587007e-07,
"loss": 0.3172,
"step": 20280
},
{
"epoch": 2.4431065623118604,
"grad_norm": 4.95144510269165,
"learning_rate": 3.947186654907119e-07,
"loss": 0.3138,
"step": 20290
},
{
"epoch": 2.444310656231186,
"grad_norm": 5.0588812828063965,
"learning_rate": 3.9420511756572346e-07,
"loss": 0.3058,
"step": 20300
},
{
"epoch": 2.4455147501505117,
"grad_norm": 5.033606052398682,
"learning_rate": 3.936916864507529e-07,
"loss": 0.3161,
"step": 20310
},
{
"epoch": 2.4467188440698373,
"grad_norm": 5.006187915802002,
"learning_rate": 3.9317837271268876e-07,
"loss": 0.2993,
"step": 20320
},
{
"epoch": 2.447922937989163,
"grad_norm": 4.955638408660889,
"learning_rate": 3.926651769182901e-07,
"loss": 0.3023,
"step": 20330
},
{
"epoch": 2.449127031908489,
"grad_norm": 4.786928653717041,
"learning_rate": 3.9215209963418513e-07,
"loss": 0.3207,
"step": 20340
},
{
"epoch": 2.4503311258278146,
"grad_norm": 4.456767559051514,
"learning_rate": 3.9163914142687177e-07,
"loss": 0.3142,
"step": 20350
},
{
"epoch": 2.4515352197471403,
"grad_norm": 5.671106338500977,
"learning_rate": 3.911263028627164e-07,
"loss": 0.3125,
"step": 20360
},
{
"epoch": 2.452739313666466,
"grad_norm": 5.525556564331055,
"learning_rate": 3.9061358450795344e-07,
"loss": 0.2972,
"step": 20370
},
{
"epoch": 2.4539434075857915,
"grad_norm": 4.18988561630249,
"learning_rate": 3.9010098692868397e-07,
"loss": 0.2971,
"step": 20380
},
{
"epoch": 2.4551475015051176,
"grad_norm": 5.705048561096191,
"learning_rate": 3.895885106908763e-07,
"loss": 0.3094,
"step": 20390
},
{
"epoch": 2.4563515954244433,
"grad_norm": 5.453742980957031,
"learning_rate": 3.890761563603647e-07,
"loss": 0.3079,
"step": 20400
},
{
"epoch": 2.457555689343769,
"grad_norm": 4.007357120513916,
"learning_rate": 3.885639245028488e-07,
"loss": 0.3119,
"step": 20410
},
{
"epoch": 2.4587597832630945,
"grad_norm": 5.247729301452637,
"learning_rate": 3.8805181568389255e-07,
"loss": 0.3047,
"step": 20420
},
{
"epoch": 2.45996387718242,
"grad_norm": 4.143746852874756,
"learning_rate": 3.8753983046892465e-07,
"loss": 0.3062,
"step": 20430
},
{
"epoch": 2.461167971101746,
"grad_norm": 4.356471538543701,
"learning_rate": 3.8702796942323736e-07,
"loss": 0.3095,
"step": 20440
},
{
"epoch": 2.4623720650210714,
"grad_norm": 4.553625106811523,
"learning_rate": 3.8651623311198516e-07,
"loss": 0.3117,
"step": 20450
},
{
"epoch": 2.4635761589403975,
"grad_norm": 4.882122039794922,
"learning_rate": 3.860046221001855e-07,
"loss": 0.322,
"step": 20460
},
{
"epoch": 2.464780252859723,
"grad_norm": 5.218991756439209,
"learning_rate": 3.854931369527172e-07,
"loss": 0.3138,
"step": 20470
},
{
"epoch": 2.4659843467790488,
"grad_norm": 5.427024841308594,
"learning_rate": 3.849817782343201e-07,
"loss": 0.3125,
"step": 20480
},
{
"epoch": 2.4671884406983744,
"grad_norm": 4.729675769805908,
"learning_rate": 3.8447054650959447e-07,
"loss": 0.2925,
"step": 20490
},
{
"epoch": 2.4683925346177,
"grad_norm": 5.330557346343994,
"learning_rate": 3.8395944234300053e-07,
"loss": 0.2968,
"step": 20500
},
{
"epoch": 2.469596628537026,
"grad_norm": 4.960201740264893,
"learning_rate": 3.834484662988573e-07,
"loss": 0.3147,
"step": 20510
},
{
"epoch": 2.4708007224563517,
"grad_norm": 4.888551235198975,
"learning_rate": 3.829376189413427e-07,
"loss": 0.3098,
"step": 20520
},
{
"epoch": 2.4720048163756774,
"grad_norm": 4.717561721801758,
"learning_rate": 3.824269008344924e-07,
"loss": 0.3018,
"step": 20530
},
{
"epoch": 2.473208910295003,
"grad_norm": 4.666635990142822,
"learning_rate": 3.8191631254219927e-07,
"loss": 0.2942,
"step": 20540
},
{
"epoch": 2.4744130042143286,
"grad_norm": 5.138599872589111,
"learning_rate": 3.8140585462821296e-07,
"loss": 0.2922,
"step": 20550
},
{
"epoch": 2.4756170981336543,
"grad_norm": 5.150256633758545,
"learning_rate": 3.808955276561395e-07,
"loss": 0.3039,
"step": 20560
},
{
"epoch": 2.47682119205298,
"grad_norm": 5.677982807159424,
"learning_rate": 3.8038533218943954e-07,
"loss": 0.2928,
"step": 20570
},
{
"epoch": 2.478025285972306,
"grad_norm": 4.552664756774902,
"learning_rate": 3.798752687914292e-07,
"loss": 0.3108,
"step": 20580
},
{
"epoch": 2.4792293798916316,
"grad_norm": 4.48048210144043,
"learning_rate": 3.7936533802527855e-07,
"loss": 0.3159,
"step": 20590
},
{
"epoch": 2.4804334738109572,
"grad_norm": 4.3352370262146,
"learning_rate": 3.7885554045401147e-07,
"loss": 0.3079,
"step": 20600
},
{
"epoch": 2.481637567730283,
"grad_norm": 4.1587653160095215,
"learning_rate": 3.783458766405042e-07,
"loss": 0.3036,
"step": 20610
},
{
"epoch": 2.4828416616496085,
"grad_norm": 4.668213844299316,
"learning_rate": 3.7783634714748584e-07,
"loss": 0.3003,
"step": 20620
},
{
"epoch": 2.4840457555689346,
"grad_norm": 4.186696529388428,
"learning_rate": 3.7732695253753697e-07,
"loss": 0.3192,
"step": 20630
},
{
"epoch": 2.48524984948826,
"grad_norm": 4.841115951538086,
"learning_rate": 3.7681769337308954e-07,
"loss": 0.3064,
"step": 20640
},
{
"epoch": 2.486453943407586,
"grad_norm": 4.4625020027160645,
"learning_rate": 3.7630857021642514e-07,
"loss": 0.3059,
"step": 20650
},
{
"epoch": 2.4876580373269115,
"grad_norm": 4.459711074829102,
"learning_rate": 3.757995836296761e-07,
"loss": 0.2925,
"step": 20660
},
{
"epoch": 2.488862131246237,
"grad_norm": 4.983307361602783,
"learning_rate": 3.7529073417482345e-07,
"loss": 0.2961,
"step": 20670
},
{
"epoch": 2.4900662251655628,
"grad_norm": 4.813161373138428,
"learning_rate": 3.747820224136973e-07,
"loss": 0.3138,
"step": 20680
},
{
"epoch": 2.4912703190848884,
"grad_norm": 4.922794342041016,
"learning_rate": 3.742734489079748e-07,
"loss": 0.3219,
"step": 20690
},
{
"epoch": 2.4924744130042145,
"grad_norm": 5.428676128387451,
"learning_rate": 3.737650142191814e-07,
"loss": 0.3077,
"step": 20700
},
{
"epoch": 2.49367850692354,
"grad_norm": 4.670940399169922,
"learning_rate": 3.7325671890868895e-07,
"loss": 0.3035,
"step": 20710
},
{
"epoch": 2.4948826008428657,
"grad_norm": 4.245230674743652,
"learning_rate": 3.727485635377153e-07,
"loss": 0.3102,
"step": 20720
},
{
"epoch": 2.4960866947621914,
"grad_norm": 4.281071186065674,
"learning_rate": 3.7224054866732366e-07,
"loss": 0.2848,
"step": 20730
},
{
"epoch": 2.497290788681517,
"grad_norm": 4.969486236572266,
"learning_rate": 3.717326748584227e-07,
"loss": 0.3109,
"step": 20740
},
{
"epoch": 2.498494882600843,
"grad_norm": 6.3518500328063965,
"learning_rate": 3.712249426717647e-07,
"loss": 0.321,
"step": 20750
},
{
"epoch": 2.4996989765201687,
"grad_norm": 4.896385192871094,
"learning_rate": 3.707173526679458e-07,
"loss": 0.3096,
"step": 20760
},
{
"epoch": 2.5009030704394943,
"grad_norm": 4.546391487121582,
"learning_rate": 3.702099054074054e-07,
"loss": 0.3153,
"step": 20770
},
{
"epoch": 2.50210716435882,
"grad_norm": 4.817781925201416,
"learning_rate": 3.6970260145042475e-07,
"loss": 0.3072,
"step": 20780
},
{
"epoch": 2.5033112582781456,
"grad_norm": 4.495319366455078,
"learning_rate": 3.691954413571276e-07,
"loss": 0.316,
"step": 20790
},
{
"epoch": 2.5045153521974717,
"grad_norm": 4.200586318969727,
"learning_rate": 3.6868842568747826e-07,
"loss": 0.3146,
"step": 20800
},
{
"epoch": 2.505719446116797,
"grad_norm": 5.999356269836426,
"learning_rate": 3.681815550012816e-07,
"loss": 0.3087,
"step": 20810
},
{
"epoch": 2.506923540036123,
"grad_norm": 4.140690326690674,
"learning_rate": 3.676748298581828e-07,
"loss": 0.2786,
"step": 20820
},
{
"epoch": 2.5081276339554486,
"grad_norm": 4.519384384155273,
"learning_rate": 3.6716825081766634e-07,
"loss": 0.3073,
"step": 20830
},
{
"epoch": 2.509331727874774,
"grad_norm": 4.580509185791016,
"learning_rate": 3.6666181843905477e-07,
"loss": 0.3224,
"step": 20840
},
{
"epoch": 2.5105358217941,
"grad_norm": 4.371671676635742,
"learning_rate": 3.661555332815092e-07,
"loss": 0.303,
"step": 20850
},
{
"epoch": 2.5117399157134255,
"grad_norm": 5.235719680786133,
"learning_rate": 3.656493959040283e-07,
"loss": 0.3104,
"step": 20860
},
{
"epoch": 2.5129440096327516,
"grad_norm": 5.564718246459961,
"learning_rate": 3.651434068654474e-07,
"loss": 0.3111,
"step": 20870
},
{
"epoch": 2.514148103552077,
"grad_norm": 4.76020622253418,
"learning_rate": 3.646375667244378e-07,
"loss": 0.3153,
"step": 20880
},
{
"epoch": 2.515352197471403,
"grad_norm": 4.534407138824463,
"learning_rate": 3.6413187603950667e-07,
"loss": 0.305,
"step": 20890
},
{
"epoch": 2.5165562913907285,
"grad_norm": 5.413814067840576,
"learning_rate": 3.636263353689962e-07,
"loss": 0.3088,
"step": 20900
},
{
"epoch": 2.517760385310054,
"grad_norm": 5.003753185272217,
"learning_rate": 3.6312094527108307e-07,
"loss": 0.3146,
"step": 20910
},
{
"epoch": 2.51896447922938,
"grad_norm": 5.368070125579834,
"learning_rate": 3.6261570630377713e-07,
"loss": 0.3131,
"step": 20920
},
{
"epoch": 2.5201685731487053,
"grad_norm": 5.054159641265869,
"learning_rate": 3.621106190249219e-07,
"loss": 0.2967,
"step": 20930
},
{
"epoch": 2.5213726670680314,
"grad_norm": 5.523135185241699,
"learning_rate": 3.616056839921932e-07,
"loss": 0.3154,
"step": 20940
},
{
"epoch": 2.522576760987357,
"grad_norm": 5.352376937866211,
"learning_rate": 3.6110090176309914e-07,
"loss": 0.3033,
"step": 20950
},
{
"epoch": 2.5237808549066827,
"grad_norm": 3.677163600921631,
"learning_rate": 3.605962728949783e-07,
"loss": 0.3198,
"step": 20960
},
{
"epoch": 2.5249849488260083,
"grad_norm": 4.4316840171813965,
"learning_rate": 3.6009179794500067e-07,
"loss": 0.304,
"step": 20970
},
{
"epoch": 2.526189042745334,
"grad_norm": 4.927300453186035,
"learning_rate": 3.5958747747016603e-07,
"loss": 0.3221,
"step": 20980
},
{
"epoch": 2.52739313666466,
"grad_norm": 5.448822975158691,
"learning_rate": 3.590833120273038e-07,
"loss": 0.3186,
"step": 20990
},
{
"epoch": 2.5285972305839857,
"grad_norm": 4.188570022583008,
"learning_rate": 3.5857930217307163e-07,
"loss": 0.3015,
"step": 21000
},
{
"epoch": 2.5298013245033113,
"grad_norm": 4.157015323638916,
"learning_rate": 3.580754484639561e-07,
"loss": 0.2909,
"step": 21010
},
{
"epoch": 2.531005418422637,
"grad_norm": 4.773519992828369,
"learning_rate": 3.5757175145627107e-07,
"loss": 0.3034,
"step": 21020
},
{
"epoch": 2.5322095123419626,
"grad_norm": 5.435080051422119,
"learning_rate": 3.570682117061573e-07,
"loss": 0.3148,
"step": 21030
},
{
"epoch": 2.5334136062612886,
"grad_norm": 4.959787368774414,
"learning_rate": 3.56564829769582e-07,
"loss": 0.3115,
"step": 21040
},
{
"epoch": 2.534617700180614,
"grad_norm": 4.7358880043029785,
"learning_rate": 3.5606160620233815e-07,
"loss": 0.3078,
"step": 21050
},
{
"epoch": 2.53582179409994,
"grad_norm": 4.220034599304199,
"learning_rate": 3.5555854156004404e-07,
"loss": 0.298,
"step": 21060
},
{
"epoch": 2.5370258880192655,
"grad_norm": 4.433871746063232,
"learning_rate": 3.550556363981422e-07,
"loss": 0.2809,
"step": 21070
},
{
"epoch": 2.538229981938591,
"grad_norm": 4.491239070892334,
"learning_rate": 3.5455289127189907e-07,
"loss": 0.3179,
"step": 21080
},
{
"epoch": 2.539434075857917,
"grad_norm": 4.969503879547119,
"learning_rate": 3.540503067364047e-07,
"loss": 0.3018,
"step": 21090
},
{
"epoch": 2.5406381697772424,
"grad_norm": 4.266849040985107,
"learning_rate": 3.535478833465717e-07,
"loss": 0.3121,
"step": 21100
},
{
"epoch": 2.5418422636965685,
"grad_norm": 4.8507771492004395,
"learning_rate": 3.5304562165713435e-07,
"loss": 0.317,
"step": 21110
},
{
"epoch": 2.543046357615894,
"grad_norm": 4.610383987426758,
"learning_rate": 3.525435222226491e-07,
"loss": 0.3083,
"step": 21120
},
{
"epoch": 2.54425045153522,
"grad_norm": 4.408012390136719,
"learning_rate": 3.5204158559749275e-07,
"loss": 0.3141,
"step": 21130
},
{
"epoch": 2.5454545454545454,
"grad_norm": 5.178010940551758,
"learning_rate": 3.5153981233586274e-07,
"loss": 0.3106,
"step": 21140
},
{
"epoch": 2.546658639373871,
"grad_norm": 4.6306681632995605,
"learning_rate": 3.5103820299177535e-07,
"loss": 0.3086,
"step": 21150
},
{
"epoch": 2.547862733293197,
"grad_norm": 5.366611003875732,
"learning_rate": 3.505367581190668e-07,
"loss": 0.2985,
"step": 21160
},
{
"epoch": 2.5490668272125223,
"grad_norm": 5.572306156158447,
"learning_rate": 3.5003547827139125e-07,
"loss": 0.2976,
"step": 21170
},
{
"epoch": 2.5502709211318484,
"grad_norm": 5.326085090637207,
"learning_rate": 3.495343640022209e-07,
"loss": 0.2971,
"step": 21180
},
{
"epoch": 2.551475015051174,
"grad_norm": 7.600101947784424,
"learning_rate": 3.4903341586484456e-07,
"loss": 0.2961,
"step": 21190
},
{
"epoch": 2.5526791089704997,
"grad_norm": 4.568670272827148,
"learning_rate": 3.4853263441236834e-07,
"loss": 0.3142,
"step": 21200
},
{
"epoch": 2.5538832028898253,
"grad_norm": 4.9445695877075195,
"learning_rate": 3.480320201977138e-07,
"loss": 0.2988,
"step": 21210
},
{
"epoch": 2.555087296809151,
"grad_norm": 5.26786994934082,
"learning_rate": 3.475315737736183e-07,
"loss": 0.3074,
"step": 21220
},
{
"epoch": 2.556291390728477,
"grad_norm": 4.316328525543213,
"learning_rate": 3.4703129569263323e-07,
"loss": 0.2917,
"step": 21230
},
{
"epoch": 2.5574954846478026,
"grad_norm": 4.018758773803711,
"learning_rate": 3.465311865071248e-07,
"loss": 0.2967,
"step": 21240
},
{
"epoch": 2.5586995785671283,
"grad_norm": 5.121528625488281,
"learning_rate": 3.460312467692725e-07,
"loss": 0.3061,
"step": 21250
},
{
"epoch": 2.559903672486454,
"grad_norm": 4.710129261016846,
"learning_rate": 3.4553147703106886e-07,
"loss": 0.3074,
"step": 21260
},
{
"epoch": 2.5611077664057795,
"grad_norm": 4.447737216949463,
"learning_rate": 3.4503187784431825e-07,
"loss": 0.3062,
"step": 21270
},
{
"epoch": 2.5623118603251056,
"grad_norm": 4.8179612159729,
"learning_rate": 3.445324497606372e-07,
"loss": 0.3007,
"step": 21280
},
{
"epoch": 2.563515954244431,
"grad_norm": 4.53162956237793,
"learning_rate": 3.440331933314532e-07,
"loss": 0.3103,
"step": 21290
},
{
"epoch": 2.564720048163757,
"grad_norm": 4.889903545379639,
"learning_rate": 3.435341091080042e-07,
"loss": 0.3109,
"step": 21300
},
{
"epoch": 2.5659241420830825,
"grad_norm": 4.858291149139404,
"learning_rate": 3.430351976413378e-07,
"loss": 0.3191,
"step": 21310
},
{
"epoch": 2.567128236002408,
"grad_norm": 4.58107852935791,
"learning_rate": 3.425364594823114e-07,
"loss": 0.2853,
"step": 21320
},
{
"epoch": 2.5683323299217338,
"grad_norm": 5.6206207275390625,
"learning_rate": 3.420378951815903e-07,
"loss": 0.3081,
"step": 21330
},
{
"epoch": 2.5695364238410594,
"grad_norm": 5.069255352020264,
"learning_rate": 3.4153950528964866e-07,
"loss": 0.3034,
"step": 21340
},
{
"epoch": 2.5707405177603855,
"grad_norm": 5.086771488189697,
"learning_rate": 3.4104129035676743e-07,
"loss": 0.318,
"step": 21350
},
{
"epoch": 2.571944611679711,
"grad_norm": 5.416161060333252,
"learning_rate": 3.4054325093303447e-07,
"loss": 0.3062,
"step": 21360
},
{
"epoch": 2.5731487055990367,
"grad_norm": 4.536307334899902,
"learning_rate": 3.4004538756834415e-07,
"loss": 0.3028,
"step": 21370
},
{
"epoch": 2.5743527995183624,
"grad_norm": 4.512822151184082,
"learning_rate": 3.3954770081239657e-07,
"loss": 0.3046,
"step": 21380
},
{
"epoch": 2.575556893437688,
"grad_norm": 5.5262322425842285,
"learning_rate": 3.39050191214696e-07,
"loss": 0.3012,
"step": 21390
},
{
"epoch": 2.576760987357014,
"grad_norm": 5.3342509269714355,
"learning_rate": 3.38552859324552e-07,
"loss": 0.3046,
"step": 21400
},
{
"epoch": 2.5779650812763397,
"grad_norm": 4.271503925323486,
"learning_rate": 3.380557056910778e-07,
"loss": 0.3097,
"step": 21410
},
{
"epoch": 2.5791691751956654,
"grad_norm": 4.600352764129639,
"learning_rate": 3.375587308631891e-07,
"loss": 0.3094,
"step": 21420
},
{
"epoch": 2.580373269114991,
"grad_norm": 4.630692958831787,
"learning_rate": 3.3706193538960493e-07,
"loss": 0.3117,
"step": 21430
},
{
"epoch": 2.5815773630343166,
"grad_norm": 4.425769329071045,
"learning_rate": 3.3656531981884604e-07,
"loss": 0.3097,
"step": 21440
},
{
"epoch": 2.5827814569536423,
"grad_norm": 4.963135242462158,
"learning_rate": 3.3606888469923474e-07,
"loss": 0.3079,
"step": 21450
},
{
"epoch": 2.583985550872968,
"grad_norm": 5.204167366027832,
"learning_rate": 3.3557263057889344e-07,
"loss": 0.2965,
"step": 21460
},
{
"epoch": 2.585189644792294,
"grad_norm": 4.431160926818848,
"learning_rate": 3.3507655800574554e-07,
"loss": 0.2973,
"step": 21470
},
{
"epoch": 2.5863937387116196,
"grad_norm": 5.386955261230469,
"learning_rate": 3.345806675275134e-07,
"loss": 0.3035,
"step": 21480
},
{
"epoch": 2.5875978326309452,
"grad_norm": 4.363948345184326,
"learning_rate": 3.340849596917189e-07,
"loss": 0.2848,
"step": 21490
},
{
"epoch": 2.588801926550271,
"grad_norm": 4.813036918640137,
"learning_rate": 3.3358943504568147e-07,
"loss": 0.3086,
"step": 21500
},
{
"epoch": 2.5900060204695965,
"grad_norm": 4.847212791442871,
"learning_rate": 3.3309409413651895e-07,
"loss": 0.2939,
"step": 21510
},
{
"epoch": 2.5912101143889226,
"grad_norm": 6.291325569152832,
"learning_rate": 3.3259893751114606e-07,
"loss": 0.3117,
"step": 21520
},
{
"epoch": 2.592414208308248,
"grad_norm": 5.317537307739258,
"learning_rate": 3.321039657162742e-07,
"loss": 0.3222,
"step": 21530
},
{
"epoch": 2.593618302227574,
"grad_norm": 4.0502190589904785,
"learning_rate": 3.3160917929841027e-07,
"loss": 0.2994,
"step": 21540
},
{
"epoch": 2.5948223961468995,
"grad_norm": 5.079105377197266,
"learning_rate": 3.3111457880385686e-07,
"loss": 0.3002,
"step": 21550
},
{
"epoch": 2.596026490066225,
"grad_norm": 5.073225975036621,
"learning_rate": 3.3062016477871147e-07,
"loss": 0.2969,
"step": 21560
},
{
"epoch": 2.5972305839855507,
"grad_norm": 5.702369689941406,
"learning_rate": 3.3012593776886524e-07,
"loss": 0.3229,
"step": 21570
},
{
"epoch": 2.5984346779048764,
"grad_norm": 5.685046672821045,
"learning_rate": 3.296318983200028e-07,
"loss": 0.3149,
"step": 21580
},
{
"epoch": 2.5996387718242024,
"grad_norm": 5.351219654083252,
"learning_rate": 3.2913804697760244e-07,
"loss": 0.3116,
"step": 21590
},
{
"epoch": 2.600842865743528,
"grad_norm": 4.610897541046143,
"learning_rate": 3.286443842869338e-07,
"loss": 0.3092,
"step": 21600
},
{
"epoch": 2.6020469596628537,
"grad_norm": 4.982673168182373,
"learning_rate": 3.2815091079305895e-07,
"loss": 0.2942,
"step": 21610
},
{
"epoch": 2.6032510535821793,
"grad_norm": 5.005990982055664,
"learning_rate": 3.2765762704083067e-07,
"loss": 0.311,
"step": 21620
},
{
"epoch": 2.604455147501505,
"grad_norm": 4.512310028076172,
"learning_rate": 3.271645335748923e-07,
"loss": 0.3267,
"step": 21630
},
{
"epoch": 2.605659241420831,
"grad_norm": 4.117137432098389,
"learning_rate": 3.2667163093967716e-07,
"loss": 0.3003,
"step": 21640
},
{
"epoch": 2.6068633353401567,
"grad_norm": 5.019242763519287,
"learning_rate": 3.2617891967940806e-07,
"loss": 0.2979,
"step": 21650
},
{
"epoch": 2.6080674292594823,
"grad_norm": 4.304302215576172,
"learning_rate": 3.2568640033809597e-07,
"loss": 0.3009,
"step": 21660
},
{
"epoch": 2.609271523178808,
"grad_norm": 5.543119430541992,
"learning_rate": 3.2519407345954043e-07,
"loss": 0.3085,
"step": 21670
},
{
"epoch": 2.6104756170981336,
"grad_norm": 4.892364025115967,
"learning_rate": 3.247019395873283e-07,
"loss": 0.2965,
"step": 21680
},
{
"epoch": 2.611679711017459,
"grad_norm": 3.9560534954071045,
"learning_rate": 3.242099992648336e-07,
"loss": 0.2994,
"step": 21690
},
{
"epoch": 2.612883804936785,
"grad_norm": 4.653574466705322,
"learning_rate": 3.2371825303521604e-07,
"loss": 0.3072,
"step": 21700
},
{
"epoch": 2.614087898856111,
"grad_norm": 4.340296268463135,
"learning_rate": 3.232267014414216e-07,
"loss": 0.2965,
"step": 21710
},
{
"epoch": 2.6152919927754366,
"grad_norm": 3.889099597930908,
"learning_rate": 3.2273534502618136e-07,
"loss": 0.3212,
"step": 21720
},
{
"epoch": 2.616496086694762,
"grad_norm": 4.952009201049805,
"learning_rate": 3.2224418433201033e-07,
"loss": 0.3121,
"step": 21730
},
{
"epoch": 2.617700180614088,
"grad_norm": 5.229816913604736,
"learning_rate": 3.2175321990120797e-07,
"loss": 0.304,
"step": 21740
},
{
"epoch": 2.6189042745334135,
"grad_norm": 4.951354503631592,
"learning_rate": 3.2126245227585693e-07,
"loss": 0.3024,
"step": 21750
},
{
"epoch": 2.6201083684527395,
"grad_norm": 5.034163475036621,
"learning_rate": 3.2077188199782257e-07,
"loss": 0.3057,
"step": 21760
},
{
"epoch": 2.621312462372065,
"grad_norm": 5.984414100646973,
"learning_rate": 3.20281509608752e-07,
"loss": 0.3209,
"step": 21770
},
{
"epoch": 2.622516556291391,
"grad_norm": 4.373472213745117,
"learning_rate": 3.1979133565007434e-07,
"loss": 0.2947,
"step": 21780
},
{
"epoch": 2.6237206502107164,
"grad_norm": 4.750053405761719,
"learning_rate": 3.193013606629994e-07,
"loss": 0.3196,
"step": 21790
},
{
"epoch": 2.624924744130042,
"grad_norm": 4.528110027313232,
"learning_rate": 3.188115851885174e-07,
"loss": 0.3053,
"step": 21800
},
{
"epoch": 2.6261288380493677,
"grad_norm": 4.8642072677612305,
"learning_rate": 3.1832200976739786e-07,
"loss": 0.3328,
"step": 21810
},
{
"epoch": 2.6273329319686933,
"grad_norm": 4.624762535095215,
"learning_rate": 3.1783263494019e-07,
"loss": 0.3123,
"step": 21820
},
{
"epoch": 2.6285370258880194,
"grad_norm": 4.700741767883301,
"learning_rate": 3.1734346124722135e-07,
"loss": 0.3011,
"step": 21830
},
{
"epoch": 2.629741119807345,
"grad_norm": 5.0118021965026855,
"learning_rate": 3.1685448922859716e-07,
"loss": 0.3163,
"step": 21840
},
{
"epoch": 2.6309452137266707,
"grad_norm": 5.321165084838867,
"learning_rate": 3.1636571942420014e-07,
"loss": 0.3019,
"step": 21850
},
{
"epoch": 2.6321493076459963,
"grad_norm": 5.864070892333984,
"learning_rate": 3.1587715237368996e-07,
"loss": 0.3027,
"step": 21860
},
{
"epoch": 2.633353401565322,
"grad_norm": 4.458745956420898,
"learning_rate": 3.1538878861650194e-07,
"loss": 0.3152,
"step": 21870
},
{
"epoch": 2.634557495484648,
"grad_norm": 4.945919036865234,
"learning_rate": 3.149006286918474e-07,
"loss": 0.3238,
"step": 21880
},
{
"epoch": 2.6357615894039736,
"grad_norm": 4.671433448791504,
"learning_rate": 3.144126731387126e-07,
"loss": 0.2941,
"step": 21890
},
{
"epoch": 2.6369656833232993,
"grad_norm": 5.389127731323242,
"learning_rate": 3.1392492249585744e-07,
"loss": 0.3223,
"step": 21900
},
{
"epoch": 2.638169777242625,
"grad_norm": 5.42547607421875,
"learning_rate": 3.134373773018165e-07,
"loss": 0.305,
"step": 21910
},
{
"epoch": 2.6393738711619505,
"grad_norm": 5.633350849151611,
"learning_rate": 3.129500380948973e-07,
"loss": 0.296,
"step": 21920
},
{
"epoch": 2.640577965081276,
"grad_norm": 4.668237209320068,
"learning_rate": 3.1246290541317937e-07,
"loss": 0.3032,
"step": 21930
},
{
"epoch": 2.641782059000602,
"grad_norm": 4.56117057800293,
"learning_rate": 3.119759797945147e-07,
"loss": 0.3036,
"step": 21940
},
{
"epoch": 2.642986152919928,
"grad_norm": 5.208002090454102,
"learning_rate": 3.114892617765266e-07,
"loss": 0.2983,
"step": 21950
},
{
"epoch": 2.6441902468392535,
"grad_norm": 4.775214195251465,
"learning_rate": 3.110027518966094e-07,
"loss": 0.3104,
"step": 21960
},
{
"epoch": 2.645394340758579,
"grad_norm": 4.55642032623291,
"learning_rate": 3.1051645069192675e-07,
"loss": 0.3162,
"step": 21970
},
{
"epoch": 2.646598434677905,
"grad_norm": 4.810263156890869,
"learning_rate": 3.1003035869941295e-07,
"loss": 0.2958,
"step": 21980
},
{
"epoch": 2.6478025285972304,
"grad_norm": 4.988792896270752,
"learning_rate": 3.0954447645577063e-07,
"loss": 0.308,
"step": 21990
},
{
"epoch": 2.6490066225165565,
"grad_norm": 4.394057273864746,
"learning_rate": 3.0905880449747134e-07,
"loss": 0.2995,
"step": 22000
}
],
"logging_steps": 10,
"max_steps": 33220,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.886664442836628e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}