{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996635488863468, "eval_steps": 60, "global_step": 1857, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005383217818450979, "grad_norm": 109.41815851913972, "learning_rate": 1.7857142857142858e-07, "loss": 1.8223, "step": 1 }, { "epoch": 0.0010766435636901958, "grad_norm": 6.467874390653583, "learning_rate": 3.5714285714285716e-07, "loss": 0.995, "step": 2 }, { "epoch": 0.0016149653455352936, "grad_norm": 4802.638302957245, "learning_rate": 5.357142857142857e-07, "loss": 1.8859, "step": 3 }, { "epoch": 0.0021532871273803916, "grad_norm": 16.08026403474417, "learning_rate": 7.142857142857143e-07, "loss": 0.953, "step": 4 }, { "epoch": 0.0026916089092254895, "grad_norm": 57.36353411210698, "learning_rate": 8.928571428571429e-07, "loss": 0.9429, "step": 5 }, { "epoch": 0.0032299306910705873, "grad_norm": 19675.34780096024, "learning_rate": 1.0714285714285714e-06, "loss": 2.6006, "step": 6 }, { "epoch": 0.0037682524729156855, "grad_norm": 818.3276445922152, "learning_rate": 1.25e-06, "loss": 0.9918, "step": 7 }, { "epoch": 0.004306574254760783, "grad_norm": 104.38551556669266, "learning_rate": 1.4285714285714286e-06, "loss": 0.9549, "step": 8 }, { "epoch": 0.004844896036605881, "grad_norm": 367.74940177761715, "learning_rate": 1.6071428571428574e-06, "loss": 2.68, "step": 9 }, { "epoch": 0.005383217818450979, "grad_norm": 2496.9575083108416, "learning_rate": 1.7857142857142859e-06, "loss": 1.0041, "step": 10 }, { "epoch": 0.005921539600296077, "grad_norm": 8.38602908383877, "learning_rate": 1.9642857142857144e-06, "loss": 0.9457, "step": 11 }, { "epoch": 0.0064598613821411745, "grad_norm": 8.592616439259098, "learning_rate": 2.1428571428571427e-06, "loss": 0.8897, "step": 12 }, { "epoch": 0.006998183163986273, "grad_norm": 438.07218134142477, "learning_rate": 2.321428571428572e-06, "loss": 0.9019, "step": 13 }, { "epoch": 0.007536504945831371, "grad_norm": 9.994776961708206, "learning_rate": 2.5e-06, "loss": 0.825, "step": 14 }, { "epoch": 0.008074826727676468, "grad_norm": 10.743498710130337, "learning_rate": 2.6785714285714285e-06, "loss": 0.849, "step": 15 }, { "epoch": 0.008613148509521567, "grad_norm": 15.967903678723232, "learning_rate": 2.8571428571428573e-06, "loss": 0.8012, "step": 16 }, { "epoch": 0.009151470291366665, "grad_norm": 4.1668693518220055, "learning_rate": 3.0357142857142856e-06, "loss": 0.7744, "step": 17 }, { "epoch": 0.009689792073211762, "grad_norm": 6.926360685293587, "learning_rate": 3.2142857142857147e-06, "loss": 0.692, "step": 18 }, { "epoch": 0.010228113855056861, "grad_norm": 3.2346473797515074, "learning_rate": 3.3928571428571435e-06, "loss": 0.6831, "step": 19 }, { "epoch": 0.010766435636901958, "grad_norm": 3.7571978637997177, "learning_rate": 3.5714285714285718e-06, "loss": 0.6537, "step": 20 }, { "epoch": 0.011304757418747056, "grad_norm": 7.559105627558757, "learning_rate": 3.7500000000000005e-06, "loss": 0.6141, "step": 21 }, { "epoch": 0.011843079200592153, "grad_norm": 2.387397312463083, "learning_rate": 3.928571428571429e-06, "loss": 0.695, "step": 22 }, { "epoch": 0.012381400982437252, "grad_norm": 4.391512430632287, "learning_rate": 4.107142857142857e-06, "loss": 0.6185, "step": 23 }, { "epoch": 0.012919722764282349, "grad_norm": 4.15230181408785, "learning_rate": 4.2857142857142855e-06, "loss": 0.5936, "step": 24 }, { "epoch": 0.013458044546127448, "grad_norm": 2.0653396060051605, "learning_rate": 4.464285714285715e-06, "loss": 0.5898, "step": 25 }, { "epoch": 0.013996366327972546, "grad_norm": 17.979519982673633, "learning_rate": 4.642857142857144e-06, "loss": 0.5905, "step": 26 }, { "epoch": 0.014534688109817643, "grad_norm": 8.151818005754823, "learning_rate": 4.821428571428572e-06, "loss": 0.5533, "step": 27 }, { "epoch": 0.015073009891662742, "grad_norm": 5.987974608299902, "learning_rate": 5e-06, "loss": 0.6107, "step": 28 }, { "epoch": 0.015611331673507839, "grad_norm": 2.330194497601682, "learning_rate": 5.1785714285714296e-06, "loss": 0.5506, "step": 29 }, { "epoch": 0.016149653455352936, "grad_norm": 12.794302959608173, "learning_rate": 5.357142857142857e-06, "loss": 0.5766, "step": 30 }, { "epoch": 0.016687975237198036, "grad_norm": 1.9007212740143298, "learning_rate": 5.535714285714286e-06, "loss": 0.5627, "step": 31 }, { "epoch": 0.017226297019043133, "grad_norm": 1.9146904127163438, "learning_rate": 5.7142857142857145e-06, "loss": 0.5883, "step": 32 }, { "epoch": 0.01776461880088823, "grad_norm": 2.187675355948441, "learning_rate": 5.892857142857144e-06, "loss": 0.5035, "step": 33 }, { "epoch": 0.01830294058273333, "grad_norm": 2.6806082565798124, "learning_rate": 6.071428571428571e-06, "loss": 0.5542, "step": 34 }, { "epoch": 0.018841262364578427, "grad_norm": 3.3951990046554323, "learning_rate": 6.25e-06, "loss": 0.538, "step": 35 }, { "epoch": 0.019379584146423524, "grad_norm": 2.5099129162892853, "learning_rate": 6.4285714285714295e-06, "loss": 0.5251, "step": 36 }, { "epoch": 0.01991790592826862, "grad_norm": 2.0412677432451627, "learning_rate": 6.607142857142858e-06, "loss": 0.5121, "step": 37 }, { "epoch": 0.020456227710113722, "grad_norm": 2.781712184570042, "learning_rate": 6.785714285714287e-06, "loss": 0.5254, "step": 38 }, { "epoch": 0.02099454949195882, "grad_norm": 6.805805985669443, "learning_rate": 6.964285714285714e-06, "loss": 0.5831, "step": 39 }, { "epoch": 0.021532871273803916, "grad_norm": 2.0490434598423652, "learning_rate": 7.1428571428571436e-06, "loss": 0.5393, "step": 40 }, { "epoch": 0.022071193055649013, "grad_norm": 1.9372367290098516, "learning_rate": 7.321428571428572e-06, "loss": 0.5972, "step": 41 }, { "epoch": 0.022609514837494113, "grad_norm": 2.7227356306942165, "learning_rate": 7.500000000000001e-06, "loss": 0.5369, "step": 42 }, { "epoch": 0.02314783661933921, "grad_norm": 2.011860407760454, "learning_rate": 7.67857142857143e-06, "loss": 0.5164, "step": 43 }, { "epoch": 0.023686158401184307, "grad_norm": 2.380752182458038, "learning_rate": 7.857142857142858e-06, "loss": 0.4715, "step": 44 }, { "epoch": 0.024224480183029407, "grad_norm": 2.0112153484283537, "learning_rate": 8.035714285714286e-06, "loss": 0.4943, "step": 45 }, { "epoch": 0.024762801964874504, "grad_norm": 1.7657871236862508, "learning_rate": 8.214285714285714e-06, "loss": 0.5792, "step": 46 }, { "epoch": 0.0253011237467196, "grad_norm": 2.012306508738324, "learning_rate": 8.392857142857144e-06, "loss": 0.5704, "step": 47 }, { "epoch": 0.025839445528564698, "grad_norm": 2.0657223159326743, "learning_rate": 8.571428571428571e-06, "loss": 0.5145, "step": 48 }, { "epoch": 0.0263777673104098, "grad_norm": 2.137310846323582, "learning_rate": 8.750000000000001e-06, "loss": 0.5067, "step": 49 }, { "epoch": 0.026916089092254895, "grad_norm": 2.2166052489861534, "learning_rate": 8.92857142857143e-06, "loss": 0.5799, "step": 50 }, { "epoch": 0.027454410874099992, "grad_norm": 2.029493952864758, "learning_rate": 9.107142857142858e-06, "loss": 0.5817, "step": 51 }, { "epoch": 0.027992732655945093, "grad_norm": 1.5628607433382145, "learning_rate": 9.285714285714288e-06, "loss": 0.4722, "step": 52 }, { "epoch": 0.02853105443779019, "grad_norm": 1.686683459837313, "learning_rate": 9.464285714285714e-06, "loss": 0.5233, "step": 53 }, { "epoch": 0.029069376219635287, "grad_norm": 1.7287851726495882, "learning_rate": 9.642857142857144e-06, "loss": 0.5744, "step": 54 }, { "epoch": 0.029607698001480384, "grad_norm": 2.246853321344625, "learning_rate": 9.821428571428573e-06, "loss": 0.4972, "step": 55 }, { "epoch": 0.030146019783325484, "grad_norm": 1.9175548738162544, "learning_rate": 1e-05, "loss": 0.535, "step": 56 }, { "epoch": 0.03068434156517058, "grad_norm": 2.169109901402676, "learning_rate": 9.999992393020984e-06, "loss": 0.5429, "step": 57 }, { "epoch": 0.031222663347015678, "grad_norm": 2.260825281362616, "learning_rate": 9.99996957210708e-06, "loss": 0.521, "step": 58 }, { "epoch": 0.031760985128860775, "grad_norm": 1.660309077201794, "learning_rate": 9.999931537327727e-06, "loss": 0.531, "step": 59 }, { "epoch": 0.03229930691070587, "grad_norm": 2.069841458563405, "learning_rate": 9.999878288798659e-06, "loss": 0.5661, "step": 60 }, { "epoch": 0.03229930691070587, "eval_loss": 0.5262647271156311, "eval_runtime": 1569.0341, "eval_samples_per_second": 15.94, "eval_steps_per_second": 0.498, "step": 60 }, { "epoch": 0.032837628692550976, "grad_norm": 2.6347591222570577, "learning_rate": 9.999809826681898e-06, "loss": 0.544, "step": 61 }, { "epoch": 0.03337595047439607, "grad_norm": 2.286499156997404, "learning_rate": 9.999726151185762e-06, "loss": 0.5387, "step": 62 }, { "epoch": 0.03391427225624117, "grad_norm": 1.8415858956026085, "learning_rate": 9.999627262564856e-06, "loss": 0.5148, "step": 63 }, { "epoch": 0.034452594038086266, "grad_norm": 1.6900844200859937, "learning_rate": 9.999513161120078e-06, "loss": 0.5291, "step": 64 }, { "epoch": 0.03499091581993136, "grad_norm": 1.7125448582732223, "learning_rate": 9.999383847198618e-06, "loss": 0.5535, "step": 65 }, { "epoch": 0.03552923760177646, "grad_norm": 1.9111631206584763, "learning_rate": 9.999239321193946e-06, "loss": 0.5146, "step": 66 }, { "epoch": 0.03606755938362156, "grad_norm": 1.5772484080951499, "learning_rate": 9.999079583545829e-06, "loss": 0.4713, "step": 67 }, { "epoch": 0.03660588116546666, "grad_norm": 1.8895632782472054, "learning_rate": 9.998904634740313e-06, "loss": 0.5802, "step": 68 }, { "epoch": 0.03714420294731176, "grad_norm": 1.7764047564754841, "learning_rate": 9.998714475309733e-06, "loss": 0.4893, "step": 69 }, { "epoch": 0.037682524729156855, "grad_norm": 1.6552020383306354, "learning_rate": 9.9985091058327e-06, "loss": 0.5265, "step": 70 }, { "epoch": 0.03822084651100195, "grad_norm": 1.6488442266603467, "learning_rate": 9.998288526934115e-06, "loss": 0.5231, "step": 71 }, { "epoch": 0.03875916829284705, "grad_norm": 2.563488205094923, "learning_rate": 9.998052739285151e-06, "loss": 0.5305, "step": 72 }, { "epoch": 0.039297490074692146, "grad_norm": 1.7898615543554037, "learning_rate": 9.997801743603264e-06, "loss": 0.5237, "step": 73 }, { "epoch": 0.03983581185653724, "grad_norm": 1.7633259864675677, "learning_rate": 9.997535540652177e-06, "loss": 0.5502, "step": 74 }, { "epoch": 0.04037413363838234, "grad_norm": 1.8121416043404328, "learning_rate": 9.997254131241893e-06, "loss": 0.4952, "step": 75 }, { "epoch": 0.040912455420227443, "grad_norm": 1.5652647418073986, "learning_rate": 9.996957516228682e-06, "loss": 0.4945, "step": 76 }, { "epoch": 0.04145077720207254, "grad_norm": 2.048844737679617, "learning_rate": 9.996645696515082e-06, "loss": 0.5123, "step": 77 }, { "epoch": 0.04198909898391764, "grad_norm": 1.6687520157181732, "learning_rate": 9.996318673049893e-06, "loss": 0.5443, "step": 78 }, { "epoch": 0.042527420765762734, "grad_norm": 1.66167477759581, "learning_rate": 9.995976446828182e-06, "loss": 0.5029, "step": 79 }, { "epoch": 0.04306574254760783, "grad_norm": 1.5077402156848434, "learning_rate": 9.99561901889127e-06, "loss": 0.5197, "step": 80 }, { "epoch": 0.04360406432945293, "grad_norm": 1.8622381731018631, "learning_rate": 9.995246390326739e-06, "loss": 0.5048, "step": 81 }, { "epoch": 0.044142386111298025, "grad_norm": 1.6038417564132132, "learning_rate": 9.994858562268415e-06, "loss": 0.5779, "step": 82 }, { "epoch": 0.04468070789314313, "grad_norm": 2.2450492036773126, "learning_rate": 9.994455535896383e-06, "loss": 0.5407, "step": 83 }, { "epoch": 0.045219029674988226, "grad_norm": 1.7319893085330837, "learning_rate": 9.994037312436963e-06, "loss": 0.4857, "step": 84 }, { "epoch": 0.04575735145683332, "grad_norm": 1.6718459312817726, "learning_rate": 9.99360389316273e-06, "loss": 0.4815, "step": 85 }, { "epoch": 0.04629567323867842, "grad_norm": 2.7232264171397276, "learning_rate": 9.993155279392479e-06, "loss": 0.5877, "step": 86 }, { "epoch": 0.04683399502052352, "grad_norm": 1.9404135244552454, "learning_rate": 9.992691472491253e-06, "loss": 0.5062, "step": 87 }, { "epoch": 0.047372316802368614, "grad_norm": 1.9213426547558368, "learning_rate": 9.99221247387032e-06, "loss": 0.5188, "step": 88 }, { "epoch": 0.04791063858421371, "grad_norm": 1.5451598644824311, "learning_rate": 9.991718284987173e-06, "loss": 0.5397, "step": 89 }, { "epoch": 0.048448960366058814, "grad_norm": 2.5679521016629385, "learning_rate": 9.991208907345524e-06, "loss": 0.541, "step": 90 }, { "epoch": 0.04898728214790391, "grad_norm": 2.98985646242629, "learning_rate": 9.990684342495304e-06, "loss": 0.4854, "step": 91 }, { "epoch": 0.04952560392974901, "grad_norm": 1.9886055940456542, "learning_rate": 9.990144592032657e-06, "loss": 0.5256, "step": 92 }, { "epoch": 0.050063925711594105, "grad_norm": 2.083677922083048, "learning_rate": 9.989589657599927e-06, "loss": 0.4859, "step": 93 }, { "epoch": 0.0506022474934392, "grad_norm": 1.5145771411744222, "learning_rate": 9.989019540885664e-06, "loss": 0.4744, "step": 94 }, { "epoch": 0.0511405692752843, "grad_norm": 1.655565898472542, "learning_rate": 9.98843424362462e-06, "loss": 0.4615, "step": 95 }, { "epoch": 0.051678891057129396, "grad_norm": 1.9814143121579568, "learning_rate": 9.987833767597726e-06, "loss": 0.4806, "step": 96 }, { "epoch": 0.0522172128389745, "grad_norm": 1.5166169599719224, "learning_rate": 9.987218114632109e-06, "loss": 0.5279, "step": 97 }, { "epoch": 0.0527555346208196, "grad_norm": 1.7338166251896456, "learning_rate": 9.98658728660107e-06, "loss": 0.4885, "step": 98 }, { "epoch": 0.053293856402664694, "grad_norm": 2.059909188253357, "learning_rate": 9.98594128542409e-06, "loss": 0.4878, "step": 99 }, { "epoch": 0.05383217818450979, "grad_norm": 1.946469408161261, "learning_rate": 9.985280113066816e-06, "loss": 0.5423, "step": 100 }, { "epoch": 0.05437049996635489, "grad_norm": 2.2782083747319333, "learning_rate": 9.984603771541055e-06, "loss": 0.5132, "step": 101 }, { "epoch": 0.054908821748199985, "grad_norm": 2.057010956887204, "learning_rate": 9.983912262904775e-06, "loss": 0.5092, "step": 102 }, { "epoch": 0.05544714353004508, "grad_norm": 1.7498707830077607, "learning_rate": 9.983205589262093e-06, "loss": 0.4711, "step": 103 }, { "epoch": 0.055985465311890185, "grad_norm": 2.08857966446578, "learning_rate": 9.98248375276327e-06, "loss": 0.5405, "step": 104 }, { "epoch": 0.05652378709373528, "grad_norm": 1.6492587393982439, "learning_rate": 9.981746755604703e-06, "loss": 0.5346, "step": 105 }, { "epoch": 0.05706210887558038, "grad_norm": 2.4884932019084203, "learning_rate": 9.980994600028919e-06, "loss": 0.4979, "step": 106 }, { "epoch": 0.057600430657425476, "grad_norm": 2.357643749019895, "learning_rate": 9.980227288324576e-06, "loss": 0.547, "step": 107 }, { "epoch": 0.05813875243927057, "grad_norm": 1.7013608238808469, "learning_rate": 9.979444822826438e-06, "loss": 0.4984, "step": 108 }, { "epoch": 0.05867707422111567, "grad_norm": 1.6424667181868076, "learning_rate": 9.978647205915386e-06, "loss": 0.5501, "step": 109 }, { "epoch": 0.05921539600296077, "grad_norm": 1.8701509501400961, "learning_rate": 9.977834440018406e-06, "loss": 0.5478, "step": 110 }, { "epoch": 0.05975371778480587, "grad_norm": 1.8496243899167086, "learning_rate": 9.977006527608569e-06, "loss": 0.4782, "step": 111 }, { "epoch": 0.06029203956665097, "grad_norm": 1.6878413932010692, "learning_rate": 9.976163471205045e-06, "loss": 0.4832, "step": 112 }, { "epoch": 0.060830361348496065, "grad_norm": 1.9099800850936837, "learning_rate": 9.975305273373075e-06, "loss": 0.515, "step": 113 }, { "epoch": 0.06136868313034116, "grad_norm": 1.5649119566569916, "learning_rate": 9.974431936723979e-06, "loss": 0.4561, "step": 114 }, { "epoch": 0.06190700491218626, "grad_norm": 1.7341754469580601, "learning_rate": 9.973543463915139e-06, "loss": 0.5348, "step": 115 }, { "epoch": 0.062445326694031356, "grad_norm": 1.7476560123562952, "learning_rate": 9.972639857649989e-06, "loss": 0.5287, "step": 116 }, { "epoch": 0.06298364847587645, "grad_norm": 2.0434137346621624, "learning_rate": 9.971721120678018e-06, "loss": 0.5932, "step": 117 }, { "epoch": 0.06352197025772155, "grad_norm": 1.62299849715006, "learning_rate": 9.97078725579475e-06, "loss": 0.5077, "step": 118 }, { "epoch": 0.06406029203956665, "grad_norm": 1.7228929187523507, "learning_rate": 9.969838265841739e-06, "loss": 0.5859, "step": 119 }, { "epoch": 0.06459861382141174, "grad_norm": 1.6625474372880666, "learning_rate": 9.968874153706567e-06, "loss": 0.4655, "step": 120 }, { "epoch": 0.06459861382141174, "eval_loss": 0.5072533488273621, "eval_runtime": 1577.1777, "eval_samples_per_second": 15.857, "eval_steps_per_second": 0.496, "step": 120 }, { "epoch": 0.06513693560325684, "grad_norm": 2.0716206061611486, "learning_rate": 9.967894922322824e-06, "loss": 0.539, "step": 121 }, { "epoch": 0.06567525738510195, "grad_norm": 1.6205145916384769, "learning_rate": 9.96690057467011e-06, "loss": 0.5478, "step": 122 }, { "epoch": 0.06621357916694705, "grad_norm": 1.587372514164151, "learning_rate": 9.965891113774015e-06, "loss": 0.538, "step": 123 }, { "epoch": 0.06675190094879214, "grad_norm": 1.4772510136765666, "learning_rate": 9.964866542706119e-06, "loss": 0.5349, "step": 124 }, { "epoch": 0.06729022273063724, "grad_norm": 1.7801746551956565, "learning_rate": 9.963826864583979e-06, "loss": 0.4909, "step": 125 }, { "epoch": 0.06782854451248234, "grad_norm": 5.729919312521928, "learning_rate": 9.962772082571115e-06, "loss": 0.6005, "step": 126 }, { "epoch": 0.06836686629432744, "grad_norm": 1.6619105967880943, "learning_rate": 9.961702199877014e-06, "loss": 0.4715, "step": 127 }, { "epoch": 0.06890518807617253, "grad_norm": 1.5987631874828743, "learning_rate": 9.960617219757105e-06, "loss": 0.4807, "step": 128 }, { "epoch": 0.06944350985801763, "grad_norm": 1.625681174655454, "learning_rate": 9.959517145512754e-06, "loss": 0.535, "step": 129 }, { "epoch": 0.06998183163986273, "grad_norm": 2.100345459551234, "learning_rate": 9.958401980491259e-06, "loss": 0.5264, "step": 130 }, { "epoch": 0.07052015342170782, "grad_norm": 1.7787800977162425, "learning_rate": 9.957271728085836e-06, "loss": 0.5171, "step": 131 }, { "epoch": 0.07105847520355292, "grad_norm": 1.6985346393670706, "learning_rate": 9.956126391735605e-06, "loss": 0.5016, "step": 132 }, { "epoch": 0.07159679698539802, "grad_norm": 1.3787117088478043, "learning_rate": 9.954965974925586e-06, "loss": 0.502, "step": 133 }, { "epoch": 0.07213511876724311, "grad_norm": 1.547259961768447, "learning_rate": 9.953790481186689e-06, "loss": 0.5046, "step": 134 }, { "epoch": 0.07267344054908821, "grad_norm": 1.7755359789986371, "learning_rate": 9.952599914095692e-06, "loss": 0.5385, "step": 135 }, { "epoch": 0.07321176233093332, "grad_norm": 1.5896819627160363, "learning_rate": 9.951394277275247e-06, "loss": 0.4749, "step": 136 }, { "epoch": 0.07375008411277842, "grad_norm": 1.6875256792153286, "learning_rate": 9.950173574393853e-06, "loss": 0.4763, "step": 137 }, { "epoch": 0.07428840589462352, "grad_norm": 1.437266797535168, "learning_rate": 9.948937809165853e-06, "loss": 0.4833, "step": 138 }, { "epoch": 0.07482672767646861, "grad_norm": 1.7282025114929471, "learning_rate": 9.947686985351427e-06, "loss": 0.4767, "step": 139 }, { "epoch": 0.07536504945831371, "grad_norm": 1.8616012721247828, "learning_rate": 9.946421106756568e-06, "loss": 0.5093, "step": 140 }, { "epoch": 0.0759033712401588, "grad_norm": 1.8460263465465812, "learning_rate": 9.94514017723308e-06, "loss": 0.517, "step": 141 }, { "epoch": 0.0764416930220039, "grad_norm": 2.0057873955643823, "learning_rate": 9.94384420067857e-06, "loss": 0.5154, "step": 142 }, { "epoch": 0.076980014803849, "grad_norm": 1.65882505385735, "learning_rate": 9.94253318103642e-06, "loss": 0.4701, "step": 143 }, { "epoch": 0.0775183365856941, "grad_norm": 2.3628830084290806, "learning_rate": 9.941207122295789e-06, "loss": 0.5405, "step": 144 }, { "epoch": 0.0780566583675392, "grad_norm": 1.6577450103892044, "learning_rate": 9.9398660284916e-06, "loss": 0.4927, "step": 145 }, { "epoch": 0.07859498014938429, "grad_norm": 1.4186036899765784, "learning_rate": 9.938509903704521e-06, "loss": 0.4898, "step": 146 }, { "epoch": 0.07913330193122939, "grad_norm": 1.544561300695159, "learning_rate": 9.937138752060958e-06, "loss": 0.4893, "step": 147 }, { "epoch": 0.07967162371307449, "grad_norm": 2.396784154476515, "learning_rate": 9.935752577733038e-06, "loss": 0.5326, "step": 148 }, { "epoch": 0.08020994549491958, "grad_norm": 1.6617814624124967, "learning_rate": 9.9343513849386e-06, "loss": 0.5131, "step": 149 }, { "epoch": 0.08074826727676468, "grad_norm": 1.7862849588167096, "learning_rate": 9.932935177941185e-06, "loss": 0.571, "step": 150 }, { "epoch": 0.08128658905860979, "grad_norm": 1.4319233814203582, "learning_rate": 9.931503961050012e-06, "loss": 0.5017, "step": 151 }, { "epoch": 0.08182491084045489, "grad_norm": 4.306871831666418, "learning_rate": 9.93005773861998e-06, "loss": 0.4935, "step": 152 }, { "epoch": 0.08236323262229998, "grad_norm": 2.160758045969246, "learning_rate": 9.928596515051639e-06, "loss": 0.4985, "step": 153 }, { "epoch": 0.08290155440414508, "grad_norm": 1.5540015811422117, "learning_rate": 9.927120294791188e-06, "loss": 0.4575, "step": 154 }, { "epoch": 0.08343987618599018, "grad_norm": 1.5794711992375656, "learning_rate": 9.92562908233046e-06, "loss": 0.5031, "step": 155 }, { "epoch": 0.08397819796783527, "grad_norm": 2.034943473794147, "learning_rate": 9.9241228822069e-06, "loss": 0.4829, "step": 156 }, { "epoch": 0.08451651974968037, "grad_norm": 1.878275757652009, "learning_rate": 9.922601699003567e-06, "loss": 0.5468, "step": 157 }, { "epoch": 0.08505484153152547, "grad_norm": 1.8197718876914466, "learning_rate": 9.921065537349097e-06, "loss": 0.5228, "step": 158 }, { "epoch": 0.08559316331337057, "grad_norm": 1.850901219005824, "learning_rate": 9.919514401917717e-06, "loss": 0.4894, "step": 159 }, { "epoch": 0.08613148509521566, "grad_norm": 1.6912529326600465, "learning_rate": 9.917948297429202e-06, "loss": 0.4783, "step": 160 }, { "epoch": 0.08666980687706076, "grad_norm": 1.9572290713193328, "learning_rate": 9.916367228648887e-06, "loss": 0.4889, "step": 161 }, { "epoch": 0.08720812865890586, "grad_norm": 2.2412763350776497, "learning_rate": 9.914771200387634e-06, "loss": 0.5196, "step": 162 }, { "epoch": 0.08774645044075095, "grad_norm": 2.0096075056146527, "learning_rate": 9.913160217501822e-06, "loss": 0.5098, "step": 163 }, { "epoch": 0.08828477222259605, "grad_norm": 1.561955725348752, "learning_rate": 9.911534284893336e-06, "loss": 0.4993, "step": 164 }, { "epoch": 0.08882309400444116, "grad_norm": 2.2239745440823113, "learning_rate": 9.909893407509554e-06, "loss": 0.5189, "step": 165 }, { "epoch": 0.08936141578628626, "grad_norm": 2.1956593936333606, "learning_rate": 9.90823759034332e-06, "loss": 0.4956, "step": 166 }, { "epoch": 0.08989973756813135, "grad_norm": 1.7245617400478288, "learning_rate": 9.906566838432943e-06, "loss": 0.5076, "step": 167 }, { "epoch": 0.09043805934997645, "grad_norm": 1.6846599680454537, "learning_rate": 9.904881156862172e-06, "loss": 0.4546, "step": 168 }, { "epoch": 0.09097638113182155, "grad_norm": 1.713604562000994, "learning_rate": 9.903180550760184e-06, "loss": 0.5622, "step": 169 }, { "epoch": 0.09151470291366665, "grad_norm": 1.4559714724478827, "learning_rate": 9.901465025301571e-06, "loss": 0.499, "step": 170 }, { "epoch": 0.09205302469551174, "grad_norm": 1.748975091207079, "learning_rate": 9.899734585706316e-06, "loss": 0.4823, "step": 171 }, { "epoch": 0.09259134647735684, "grad_norm": 1.6268147978199312, "learning_rate": 9.89798923723979e-06, "loss": 0.5452, "step": 172 }, { "epoch": 0.09312966825920194, "grad_norm": 1.7343158101478648, "learning_rate": 9.896228985212722e-06, "loss": 0.4359, "step": 173 }, { "epoch": 0.09366799004104703, "grad_norm": 2.07042169826696, "learning_rate": 9.894453834981194e-06, "loss": 0.511, "step": 174 }, { "epoch": 0.09420631182289213, "grad_norm": 1.791222622400255, "learning_rate": 9.892663791946617e-06, "loss": 0.5451, "step": 175 }, { "epoch": 0.09474463360473723, "grad_norm": 2.20105621306618, "learning_rate": 9.890858861555719e-06, "loss": 0.5144, "step": 176 }, { "epoch": 0.09528295538658232, "grad_norm": 1.6902715423027703, "learning_rate": 9.889039049300526e-06, "loss": 0.5445, "step": 177 }, { "epoch": 0.09582127716842742, "grad_norm": 1.6384822244675972, "learning_rate": 9.88720436071835e-06, "loss": 0.5164, "step": 178 }, { "epoch": 0.09635959895027253, "grad_norm": 1.486764051130488, "learning_rate": 9.885354801391764e-06, "loss": 0.478, "step": 179 }, { "epoch": 0.09689792073211763, "grad_norm": 1.701132133672937, "learning_rate": 9.883490376948593e-06, "loss": 0.5027, "step": 180 }, { "epoch": 0.09689792073211763, "eval_loss": 0.49806535243988037, "eval_runtime": 1515.9148, "eval_samples_per_second": 16.498, "eval_steps_per_second": 0.516, "step": 180 }, { "epoch": 0.09743624251396273, "grad_norm": 1.9402448136247314, "learning_rate": 9.881611093061891e-06, "loss": 0.5127, "step": 181 }, { "epoch": 0.09797456429580782, "grad_norm": 1.7830082860168288, "learning_rate": 9.879716955449927e-06, "loss": 0.4977, "step": 182 }, { "epoch": 0.09851288607765292, "grad_norm": 1.8728338162339362, "learning_rate": 9.877807969876167e-06, "loss": 0.5303, "step": 183 }, { "epoch": 0.09905120785949802, "grad_norm": 1.9418905923773875, "learning_rate": 9.875884142149258e-06, "loss": 0.4924, "step": 184 }, { "epoch": 0.09958952964134311, "grad_norm": 1.7198468996934395, "learning_rate": 9.873945478123006e-06, "loss": 0.4753, "step": 185 }, { "epoch": 0.10012785142318821, "grad_norm": 1.9960103116925314, "learning_rate": 9.87199198369636e-06, "loss": 0.5277, "step": 186 }, { "epoch": 0.10066617320503331, "grad_norm": 1.627744057918891, "learning_rate": 9.870023664813399e-06, "loss": 0.46, "step": 187 }, { "epoch": 0.1012044949868784, "grad_norm": 1.689952574264165, "learning_rate": 9.868040527463305e-06, "loss": 0.4994, "step": 188 }, { "epoch": 0.1017428167687235, "grad_norm": 1.5603624594142342, "learning_rate": 9.866042577680354e-06, "loss": 0.5304, "step": 189 }, { "epoch": 0.1022811385505686, "grad_norm": 1.748472496778829, "learning_rate": 9.86402982154389e-06, "loss": 0.4964, "step": 190 }, { "epoch": 0.1028194603324137, "grad_norm": 1.7431819106596798, "learning_rate": 9.862002265178308e-06, "loss": 0.4783, "step": 191 }, { "epoch": 0.10335778211425879, "grad_norm": 1.837418537016329, "learning_rate": 9.859959914753042e-06, "loss": 0.4862, "step": 192 }, { "epoch": 0.1038961038961039, "grad_norm": 2.596761998177084, "learning_rate": 9.857902776482538e-06, "loss": 0.5261, "step": 193 }, { "epoch": 0.104434425677949, "grad_norm": 1.893467433056967, "learning_rate": 9.85583085662624e-06, "loss": 0.5324, "step": 194 }, { "epoch": 0.1049727474597941, "grad_norm": 1.5311561663354358, "learning_rate": 9.853744161488568e-06, "loss": 0.4934, "step": 195 }, { "epoch": 0.1055110692416392, "grad_norm": 1.573948338119931, "learning_rate": 9.851642697418898e-06, "loss": 0.5137, "step": 196 }, { "epoch": 0.10604939102348429, "grad_norm": 1.7486390517463863, "learning_rate": 9.84952647081155e-06, "loss": 0.535, "step": 197 }, { "epoch": 0.10658771280532939, "grad_norm": 1.589021194069147, "learning_rate": 9.847395488105761e-06, "loss": 0.443, "step": 198 }, { "epoch": 0.10712603458717448, "grad_norm": 1.9185393015026924, "learning_rate": 9.845249755785665e-06, "loss": 0.5281, "step": 199 }, { "epoch": 0.10766435636901958, "grad_norm": 2.3792026849321704, "learning_rate": 9.84308928038028e-06, "loss": 0.5031, "step": 200 }, { "epoch": 0.10820267815086468, "grad_norm": 1.9165328926467609, "learning_rate": 9.840914068463482e-06, "loss": 0.5557, "step": 201 }, { "epoch": 0.10874099993270978, "grad_norm": 2.5946215311840315, "learning_rate": 9.838724126653987e-06, "loss": 0.4922, "step": 202 }, { "epoch": 0.10927932171455487, "grad_norm": 2.13076319151747, "learning_rate": 9.836519461615331e-06, "loss": 0.5781, "step": 203 }, { "epoch": 0.10981764349639997, "grad_norm": 1.663228941320188, "learning_rate": 9.834300080055854e-06, "loss": 0.484, "step": 204 }, { "epoch": 0.11035596527824507, "grad_norm": 2.225077581890442, "learning_rate": 9.832065988728667e-06, "loss": 0.4869, "step": 205 }, { "epoch": 0.11089428706009016, "grad_norm": 1.4816502494413102, "learning_rate": 9.829817194431646e-06, "loss": 0.4782, "step": 206 }, { "epoch": 0.11143260884193526, "grad_norm": 1.9584675295393534, "learning_rate": 9.827553704007403e-06, "loss": 0.4572, "step": 207 }, { "epoch": 0.11197093062378037, "grad_norm": 1.4348786359320973, "learning_rate": 9.82527552434327e-06, "loss": 0.4682, "step": 208 }, { "epoch": 0.11250925240562547, "grad_norm": 1.836643464151516, "learning_rate": 9.82298266237127e-06, "loss": 0.475, "step": 209 }, { "epoch": 0.11304757418747056, "grad_norm": 1.6780795457698512, "learning_rate": 9.820675125068105e-06, "loss": 0.4903, "step": 210 }, { "epoch": 0.11358589596931566, "grad_norm": 2.0824594091852124, "learning_rate": 9.818352919455133e-06, "loss": 0.5396, "step": 211 }, { "epoch": 0.11412421775116076, "grad_norm": 1.7381485522277624, "learning_rate": 9.816016052598336e-06, "loss": 0.536, "step": 212 }, { "epoch": 0.11466253953300586, "grad_norm": 1.7730039428627105, "learning_rate": 9.813664531608319e-06, "loss": 0.5344, "step": 213 }, { "epoch": 0.11520086131485095, "grad_norm": 1.726577182888005, "learning_rate": 9.811298363640265e-06, "loss": 0.4686, "step": 214 }, { "epoch": 0.11573918309669605, "grad_norm": 1.4284226913661735, "learning_rate": 9.808917555893934e-06, "loss": 0.417, "step": 215 }, { "epoch": 0.11627750487854115, "grad_norm": 1.8490676859358208, "learning_rate": 9.806522115613624e-06, "loss": 0.4734, "step": 216 }, { "epoch": 0.11681582666038624, "grad_norm": 1.9252320315263673, "learning_rate": 9.804112050088164e-06, "loss": 0.5216, "step": 217 }, { "epoch": 0.11735414844223134, "grad_norm": 2.039324491259981, "learning_rate": 9.801687366650882e-06, "loss": 0.5209, "step": 218 }, { "epoch": 0.11789247022407644, "grad_norm": 2.9773699463269168, "learning_rate": 9.799248072679581e-06, "loss": 0.5341, "step": 219 }, { "epoch": 0.11843079200592153, "grad_norm": 2.742476530553411, "learning_rate": 9.796794175596526e-06, "loss": 0.5013, "step": 220 }, { "epoch": 0.11896911378776663, "grad_norm": 1.7756468554357536, "learning_rate": 9.794325682868413e-06, "loss": 0.4789, "step": 221 }, { "epoch": 0.11950743556961174, "grad_norm": 1.6809704903695406, "learning_rate": 9.791842602006355e-06, "loss": 0.4661, "step": 222 }, { "epoch": 0.12004575735145684, "grad_norm": 1.5983552620095136, "learning_rate": 9.789344940565844e-06, "loss": 0.4525, "step": 223 }, { "epoch": 0.12058407913330194, "grad_norm": 1.6785718872740183, "learning_rate": 9.786832706146745e-06, "loss": 0.5614, "step": 224 }, { "epoch": 0.12112240091514703, "grad_norm": 1.8472396669798028, "learning_rate": 9.784305906393266e-06, "loss": 0.5442, "step": 225 }, { "epoch": 0.12166072269699213, "grad_norm": 2.233728320756155, "learning_rate": 9.781764548993932e-06, "loss": 0.5065, "step": 226 }, { "epoch": 0.12219904447883723, "grad_norm": 1.7583669595786098, "learning_rate": 9.77920864168156e-06, "loss": 0.5031, "step": 227 }, { "epoch": 0.12273736626068232, "grad_norm": 1.856107901761449, "learning_rate": 9.77663819223325e-06, "loss": 0.5218, "step": 228 }, { "epoch": 0.12327568804252742, "grad_norm": 1.5999284716572806, "learning_rate": 9.774053208470338e-06, "loss": 0.447, "step": 229 }, { "epoch": 0.12381400982437252, "grad_norm": 3.170181526472491, "learning_rate": 9.771453698258392e-06, "loss": 0.4549, "step": 230 }, { "epoch": 0.12435233160621761, "grad_norm": 1.7567006972999655, "learning_rate": 9.768839669507185e-06, "loss": 0.5203, "step": 231 }, { "epoch": 0.12489065338806271, "grad_norm": 1.6024823185860628, "learning_rate": 9.766211130170653e-06, "loss": 0.5035, "step": 232 }, { "epoch": 0.1254289751699078, "grad_norm": 1.9234982966827474, "learning_rate": 9.7635680882469e-06, "loss": 0.5742, "step": 233 }, { "epoch": 0.1259672969517529, "grad_norm": 1.526400617412084, "learning_rate": 9.760910551778149e-06, "loss": 0.4953, "step": 234 }, { "epoch": 0.126505618733598, "grad_norm": 1.7460568880199783, "learning_rate": 9.758238528850733e-06, "loss": 0.4705, "step": 235 }, { "epoch": 0.1270439405154431, "grad_norm": 5.681983754980635, "learning_rate": 9.755552027595055e-06, "loss": 0.5499, "step": 236 }, { "epoch": 0.1275822622972882, "grad_norm": 1.9059517301514561, "learning_rate": 9.752851056185583e-06, "loss": 0.5016, "step": 237 }, { "epoch": 0.1281205840791333, "grad_norm": 2.032081768465102, "learning_rate": 9.750135622840811e-06, "loss": 0.4761, "step": 238 }, { "epoch": 0.1286589058609784, "grad_norm": 2.044888486278771, "learning_rate": 9.747405735823232e-06, "loss": 0.535, "step": 239 }, { "epoch": 0.1291972276428235, "grad_norm": 1.7814262228625417, "learning_rate": 9.744661403439328e-06, "loss": 0.5524, "step": 240 }, { "epoch": 0.1291972276428235, "eval_loss": 0.4923091232776642, "eval_runtime": 1516.8995, "eval_samples_per_second": 16.488, "eval_steps_per_second": 0.516, "step": 240 }, { "epoch": 0.12973554942466858, "grad_norm": 3.1298270206538, "learning_rate": 9.74190263403953e-06, "loss": 0.4938, "step": 241 }, { "epoch": 0.13027387120651368, "grad_norm": 1.4984946811035116, "learning_rate": 9.739129436018193e-06, "loss": 0.4417, "step": 242 }, { "epoch": 0.1308121929883588, "grad_norm": 1.364613667269671, "learning_rate": 9.736341817813586e-06, "loss": 0.4698, "step": 243 }, { "epoch": 0.1313505147702039, "grad_norm": 1.4558332152005662, "learning_rate": 9.733539787907851e-06, "loss": 0.51, "step": 244 }, { "epoch": 0.131888836552049, "grad_norm": 1.605378069117634, "learning_rate": 9.730723354826978e-06, "loss": 0.4502, "step": 245 }, { "epoch": 0.1324271583338941, "grad_norm": 1.6741314580897366, "learning_rate": 9.727892527140787e-06, "loss": 0.4445, "step": 246 }, { "epoch": 0.1329654801157392, "grad_norm": 2.306950410094544, "learning_rate": 9.725047313462897e-06, "loss": 0.541, "step": 247 }, { "epoch": 0.1335038018975843, "grad_norm": 2.110791301537649, "learning_rate": 9.722187722450699e-06, "loss": 0.5105, "step": 248 }, { "epoch": 0.1340421236794294, "grad_norm": 1.8250944708952, "learning_rate": 9.719313762805334e-06, "loss": 0.5233, "step": 249 }, { "epoch": 0.13458044546127448, "grad_norm": 1.5279014760068415, "learning_rate": 9.716425443271663e-06, "loss": 0.4978, "step": 250 }, { "epoch": 0.13511876724311958, "grad_norm": 1.6155139379634116, "learning_rate": 9.713522772638238e-06, "loss": 0.489, "step": 251 }, { "epoch": 0.13565708902496468, "grad_norm": 1.7541916143762504, "learning_rate": 9.710605759737281e-06, "loss": 0.5058, "step": 252 }, { "epoch": 0.13619541080680977, "grad_norm": 2.0770411769433914, "learning_rate": 9.707674413444658e-06, "loss": 0.4765, "step": 253 }, { "epoch": 0.13673373258865487, "grad_norm": 2.20017292136363, "learning_rate": 9.70472874267984e-06, "loss": 0.5073, "step": 254 }, { "epoch": 0.13727205437049997, "grad_norm": 2.5155355882755495, "learning_rate": 9.701768756405894e-06, "loss": 0.5271, "step": 255 }, { "epoch": 0.13781037615234507, "grad_norm": 1.6203966463313373, "learning_rate": 9.698794463629438e-06, "loss": 0.5328, "step": 256 }, { "epoch": 0.13834869793419016, "grad_norm": 1.776204296227151, "learning_rate": 9.695805873400627e-06, "loss": 0.4975, "step": 257 }, { "epoch": 0.13888701971603526, "grad_norm": 1.817996887986963, "learning_rate": 9.692802994813117e-06, "loss": 0.5076, "step": 258 }, { "epoch": 0.13942534149788036, "grad_norm": 1.5387316388819356, "learning_rate": 9.68978583700404e-06, "loss": 0.4783, "step": 259 }, { "epoch": 0.13996366327972545, "grad_norm": 1.4525191587799346, "learning_rate": 9.686754409153984e-06, "loss": 0.4541, "step": 260 }, { "epoch": 0.14050198506157055, "grad_norm": 2.5072786042500286, "learning_rate": 9.683708720486947e-06, "loss": 0.4321, "step": 261 }, { "epoch": 0.14104030684341565, "grad_norm": 1.928234336171056, "learning_rate": 9.680648780270327e-06, "loss": 0.5026, "step": 262 }, { "epoch": 0.14157862862526074, "grad_norm": 1.9095002820990152, "learning_rate": 9.677574597814884e-06, "loss": 0.5048, "step": 263 }, { "epoch": 0.14211695040710584, "grad_norm": 2.7537047870453777, "learning_rate": 9.674486182474716e-06, "loss": 0.5202, "step": 264 }, { "epoch": 0.14265527218895094, "grad_norm": 1.5411698281683408, "learning_rate": 9.671383543647225e-06, "loss": 0.473, "step": 265 }, { "epoch": 0.14319359397079603, "grad_norm": 1.6351867542673815, "learning_rate": 9.668266690773094e-06, "loss": 0.4734, "step": 266 }, { "epoch": 0.14373191575264113, "grad_norm": 1.8884810300636565, "learning_rate": 9.66513563333626e-06, "loss": 0.5014, "step": 267 }, { "epoch": 0.14427023753448623, "grad_norm": 1.6743904016832571, "learning_rate": 9.661990380863876e-06, "loss": 0.4782, "step": 268 }, { "epoch": 0.14480855931633133, "grad_norm": 1.9090758165263444, "learning_rate": 9.658830942926291e-06, "loss": 0.5003, "step": 269 }, { "epoch": 0.14534688109817642, "grad_norm": 1.4937405913115736, "learning_rate": 9.655657329137015e-06, "loss": 0.4432, "step": 270 }, { "epoch": 0.14588520288002152, "grad_norm": 1.9026943182309153, "learning_rate": 9.652469549152695e-06, "loss": 0.529, "step": 271 }, { "epoch": 0.14642352466186664, "grad_norm": 1.8186943886881364, "learning_rate": 9.649267612673079e-06, "loss": 0.4737, "step": 272 }, { "epoch": 0.14696184644371174, "grad_norm": 1.8259823260308685, "learning_rate": 9.646051529440993e-06, "loss": 0.4985, "step": 273 }, { "epoch": 0.14750016822555684, "grad_norm": 1.9385932273349529, "learning_rate": 9.64282130924231e-06, "loss": 0.4838, "step": 274 }, { "epoch": 0.14803849000740193, "grad_norm": 2.04013899262351, "learning_rate": 9.639576961905915e-06, "loss": 0.5434, "step": 275 }, { "epoch": 0.14857681178924703, "grad_norm": 1.4822512590060632, "learning_rate": 9.636318497303679e-06, "loss": 0.5105, "step": 276 }, { "epoch": 0.14911513357109213, "grad_norm": 1.580055299090581, "learning_rate": 9.633045925350436e-06, "loss": 0.5236, "step": 277 }, { "epoch": 0.14965345535293723, "grad_norm": 1.947058506268201, "learning_rate": 9.629759256003936e-06, "loss": 0.517, "step": 278 }, { "epoch": 0.15019177713478232, "grad_norm": 2.09097300966892, "learning_rate": 9.626458499264833e-06, "loss": 0.4795, "step": 279 }, { "epoch": 0.15073009891662742, "grad_norm": 1.9281815370039999, "learning_rate": 9.623143665176636e-06, "loss": 0.5091, "step": 280 }, { "epoch": 0.15126842069847252, "grad_norm": 1.8942765435710498, "learning_rate": 9.6198147638257e-06, "loss": 0.486, "step": 281 }, { "epoch": 0.1518067424803176, "grad_norm": 1.5680877122601742, "learning_rate": 9.616471805341175e-06, "loss": 0.5756, "step": 282 }, { "epoch": 0.1523450642621627, "grad_norm": 1.8187589637332664, "learning_rate": 9.613114799894989e-06, "loss": 0.4848, "step": 283 }, { "epoch": 0.1528833860440078, "grad_norm": 2.845269186548161, "learning_rate": 9.609743757701806e-06, "loss": 0.5196, "step": 284 }, { "epoch": 0.1534217078258529, "grad_norm": 1.6573799451128552, "learning_rate": 9.60635868901901e-06, "loss": 0.5256, "step": 285 }, { "epoch": 0.153960029607698, "grad_norm": 1.403409672767778, "learning_rate": 9.602959604146658e-06, "loss": 0.4591, "step": 286 }, { "epoch": 0.1544983513895431, "grad_norm": 1.5756224710697608, "learning_rate": 9.599546513427455e-06, "loss": 0.4499, "step": 287 }, { "epoch": 0.1550366731713882, "grad_norm": 1.8561161081867996, "learning_rate": 9.596119427246727e-06, "loss": 0.514, "step": 288 }, { "epoch": 0.1555749949532333, "grad_norm": 1.6430886050709819, "learning_rate": 9.592678356032382e-06, "loss": 0.4916, "step": 289 }, { "epoch": 0.1561133167350784, "grad_norm": 1.5608831001537813, "learning_rate": 9.589223310254881e-06, "loss": 0.4845, "step": 290 }, { "epoch": 0.15665163851692349, "grad_norm": 2.041472319934021, "learning_rate": 9.58575430042721e-06, "loss": 0.5105, "step": 291 }, { "epoch": 0.15718996029876858, "grad_norm": 1.879252835980779, "learning_rate": 9.582271337104844e-06, "loss": 0.5254, "step": 292 }, { "epoch": 0.15772828208061368, "grad_norm": 1.7353738362985391, "learning_rate": 9.578774430885714e-06, "loss": 0.545, "step": 293 }, { "epoch": 0.15826660386245878, "grad_norm": 1.6167983704567415, "learning_rate": 9.575263592410176e-06, "loss": 0.484, "step": 294 }, { "epoch": 0.15880492564430387, "grad_norm": 1.6983057165346465, "learning_rate": 9.571738832360979e-06, "loss": 0.5001, "step": 295 }, { "epoch": 0.15934324742614897, "grad_norm": 2.081190213763369, "learning_rate": 9.568200161463237e-06, "loss": 0.4722, "step": 296 }, { "epoch": 0.15988156920799407, "grad_norm": 2.246655796617688, "learning_rate": 9.564647590484384e-06, "loss": 0.5171, "step": 297 }, { "epoch": 0.16041989098983916, "grad_norm": 1.4481263563444773, "learning_rate": 9.561081130234155e-06, "loss": 0.471, "step": 298 }, { "epoch": 0.16095821277168426, "grad_norm": 1.6254902571476582, "learning_rate": 9.557500791564545e-06, "loss": 0.4709, "step": 299 }, { "epoch": 0.16149653455352936, "grad_norm": 1.6522030181707457, "learning_rate": 9.55390658536978e-06, "loss": 0.4314, "step": 300 }, { "epoch": 0.16149653455352936, "eval_loss": 0.48600396513938904, "eval_runtime": 1525.5556, "eval_samples_per_second": 16.394, "eval_steps_per_second": 0.513, "step": 300 }, { "epoch": 0.16203485633537448, "grad_norm": 1.6735119675316397, "learning_rate": 9.550298522586277e-06, "loss": 0.4981, "step": 301 }, { "epoch": 0.16257317811721958, "grad_norm": 1.7492206784400102, "learning_rate": 9.546676614192623e-06, "loss": 0.5166, "step": 302 }, { "epoch": 0.16311149989906468, "grad_norm": 1.8716369675908593, "learning_rate": 9.543040871209528e-06, "loss": 0.4587, "step": 303 }, { "epoch": 0.16364982168090977, "grad_norm": 1.5260344735318792, "learning_rate": 9.5393913046998e-06, "loss": 0.4637, "step": 304 }, { "epoch": 0.16418814346275487, "grad_norm": 1.9514934425079693, "learning_rate": 9.535727925768312e-06, "loss": 0.5018, "step": 305 }, { "epoch": 0.16472646524459997, "grad_norm": 1.9239888955973004, "learning_rate": 9.53205074556196e-06, "loss": 0.5156, "step": 306 }, { "epoch": 0.16526478702644506, "grad_norm": 1.4397611201745624, "learning_rate": 9.528359775269637e-06, "loss": 0.4876, "step": 307 }, { "epoch": 0.16580310880829016, "grad_norm": 1.6314792528136741, "learning_rate": 9.524655026122199e-06, "loss": 0.4466, "step": 308 }, { "epoch": 0.16634143059013526, "grad_norm": 1.7046994741333183, "learning_rate": 9.520936509392425e-06, "loss": 0.5137, "step": 309 }, { "epoch": 0.16687975237198036, "grad_norm": 1.6773498230286716, "learning_rate": 9.517204236394983e-06, "loss": 0.4857, "step": 310 }, { "epoch": 0.16741807415382545, "grad_norm": 1.9407453364887826, "learning_rate": 9.513458218486404e-06, "loss": 0.569, "step": 311 }, { "epoch": 0.16795639593567055, "grad_norm": 2.3596815310352355, "learning_rate": 9.509698467065042e-06, "loss": 0.4823, "step": 312 }, { "epoch": 0.16849471771751565, "grad_norm": 1.491461623274511, "learning_rate": 9.505924993571037e-06, "loss": 0.4814, "step": 313 }, { "epoch": 0.16903303949936074, "grad_norm": 1.755984194501031, "learning_rate": 9.502137809486277e-06, "loss": 0.4953, "step": 314 }, { "epoch": 0.16957136128120584, "grad_norm": 1.4330639099631888, "learning_rate": 9.49833692633438e-06, "loss": 0.4566, "step": 315 }, { "epoch": 0.17010968306305094, "grad_norm": 2.8224430252996413, "learning_rate": 9.49452235568064e-06, "loss": 0.5356, "step": 316 }, { "epoch": 0.17064800484489603, "grad_norm": 1.6038158256481398, "learning_rate": 9.490694109131997e-06, "loss": 0.4667, "step": 317 }, { "epoch": 0.17118632662674113, "grad_norm": 1.5264996881581228, "learning_rate": 9.486852198337013e-06, "loss": 0.5066, "step": 318 }, { "epoch": 0.17172464840858623, "grad_norm": 2.1960133726792987, "learning_rate": 9.482996634985818e-06, "loss": 0.51, "step": 319 }, { "epoch": 0.17226297019043132, "grad_norm": 1.8025162435130595, "learning_rate": 9.479127430810087e-06, "loss": 0.4542, "step": 320 }, { "epoch": 0.17280129197227642, "grad_norm": 1.573351382907097, "learning_rate": 9.475244597583007e-06, "loss": 0.4932, "step": 321 }, { "epoch": 0.17333961375412152, "grad_norm": 1.8667569419712537, "learning_rate": 9.471348147119226e-06, "loss": 0.5095, "step": 322 }, { "epoch": 0.17387793553596662, "grad_norm": 1.7668055772396445, "learning_rate": 9.467438091274831e-06, "loss": 0.5407, "step": 323 }, { "epoch": 0.1744162573178117, "grad_norm": 1.8953472452582216, "learning_rate": 9.46351444194731e-06, "loss": 0.5128, "step": 324 }, { "epoch": 0.1749545790996568, "grad_norm": 1.4178882398027213, "learning_rate": 9.459577211075505e-06, "loss": 0.4783, "step": 325 }, { "epoch": 0.1754929008815019, "grad_norm": 2.0556054399757833, "learning_rate": 9.455626410639595e-06, "loss": 0.4883, "step": 326 }, { "epoch": 0.176031222663347, "grad_norm": 1.7326020245251583, "learning_rate": 9.451662052661042e-06, "loss": 0.5118, "step": 327 }, { "epoch": 0.1765695444451921, "grad_norm": 4.171939008569256, "learning_rate": 9.447684149202555e-06, "loss": 0.5034, "step": 328 }, { "epoch": 0.17710786622703723, "grad_norm": 1.4094510294695572, "learning_rate": 9.44369271236807e-06, "loss": 0.485, "step": 329 }, { "epoch": 0.17764618800888232, "grad_norm": 1.7412556596004685, "learning_rate": 9.4396877543027e-06, "loss": 0.5202, "step": 330 }, { "epoch": 0.17818450979072742, "grad_norm": 2.605859372043168, "learning_rate": 9.435669287192691e-06, "loss": 0.4685, "step": 331 }, { "epoch": 0.17872283157257252, "grad_norm": 1.751047130574041, "learning_rate": 9.431637323265406e-06, "loss": 0.5435, "step": 332 }, { "epoch": 0.1792611533544176, "grad_norm": 1.6979113314955865, "learning_rate": 9.42759187478927e-06, "loss": 0.5082, "step": 333 }, { "epoch": 0.1797994751362627, "grad_norm": 1.655193667961951, "learning_rate": 9.423532954073737e-06, "loss": 0.52, "step": 334 }, { "epoch": 0.1803377969181078, "grad_norm": 1.715183078111553, "learning_rate": 9.419460573469262e-06, "loss": 0.4876, "step": 335 }, { "epoch": 0.1808761186999529, "grad_norm": 1.755206515543788, "learning_rate": 9.415374745367245e-06, "loss": 0.4826, "step": 336 }, { "epoch": 0.181414440481798, "grad_norm": 1.530238277234238, "learning_rate": 9.411275482200015e-06, "loss": 0.5227, "step": 337 }, { "epoch": 0.1819527622636431, "grad_norm": 1.4873212835334444, "learning_rate": 9.40716279644077e-06, "loss": 0.4784, "step": 338 }, { "epoch": 0.1824910840454882, "grad_norm": 1.4713841358562554, "learning_rate": 9.403036700603561e-06, "loss": 0.4872, "step": 339 }, { "epoch": 0.1830294058273333, "grad_norm": 1.5551919063027968, "learning_rate": 9.398897207243232e-06, "loss": 0.4817, "step": 340 }, { "epoch": 0.1835677276091784, "grad_norm": 1.8717050820441055, "learning_rate": 9.394744328955403e-06, "loss": 0.5002, "step": 341 }, { "epoch": 0.18410604939102349, "grad_norm": 1.9843100820794195, "learning_rate": 9.390578078376417e-06, "loss": 0.4799, "step": 342 }, { "epoch": 0.18464437117286858, "grad_norm": 2.156998251608843, "learning_rate": 9.386398468183304e-06, "loss": 0.4469, "step": 343 }, { "epoch": 0.18518269295471368, "grad_norm": 1.7123477834586953, "learning_rate": 9.38220551109375e-06, "loss": 0.5312, "step": 344 }, { "epoch": 0.18572101473655878, "grad_norm": 1.862901860663747, "learning_rate": 9.377999219866046e-06, "loss": 0.5146, "step": 345 }, { "epoch": 0.18625933651840387, "grad_norm": 1.8400145206055536, "learning_rate": 9.373779607299061e-06, "loss": 0.498, "step": 346 }, { "epoch": 0.18679765830024897, "grad_norm": 1.4419967374301528, "learning_rate": 9.369546686232199e-06, "loss": 0.491, "step": 347 }, { "epoch": 0.18733598008209407, "grad_norm": 1.6800971553110484, "learning_rate": 9.365300469545352e-06, "loss": 0.453, "step": 348 }, { "epoch": 0.18787430186393916, "grad_norm": 1.4414646625492236, "learning_rate": 9.361040970158876e-06, "loss": 0.4844, "step": 349 }, { "epoch": 0.18841262364578426, "grad_norm": 1.4693828151901231, "learning_rate": 9.356768201033542e-06, "loss": 0.4846, "step": 350 }, { "epoch": 0.18895094542762936, "grad_norm": 1.6213301090422854, "learning_rate": 9.35248217517049e-06, "loss": 0.4528, "step": 351 }, { "epoch": 0.18948926720947445, "grad_norm": 1.3998204036117714, "learning_rate": 9.348182905611209e-06, "loss": 0.4677, "step": 352 }, { "epoch": 0.19002758899131955, "grad_norm": 1.4713366703366633, "learning_rate": 9.343870405437477e-06, "loss": 0.4292, "step": 353 }, { "epoch": 0.19056591077316465, "grad_norm": 1.941068700941172, "learning_rate": 9.339544687771334e-06, "loss": 0.5102, "step": 354 }, { "epoch": 0.19110423255500975, "grad_norm": 1.828849112653357, "learning_rate": 9.335205765775039e-06, "loss": 0.4638, "step": 355 }, { "epoch": 0.19164255433685484, "grad_norm": 1.6885129161638754, "learning_rate": 9.330853652651026e-06, "loss": 0.4391, "step": 356 }, { "epoch": 0.19218087611869994, "grad_norm": 1.7268115477491062, "learning_rate": 9.326488361641867e-06, "loss": 0.4557, "step": 357 }, { "epoch": 0.19271919790054506, "grad_norm": 1.369390489248521, "learning_rate": 9.322109906030237e-06, "loss": 0.4451, "step": 358 }, { "epoch": 0.19325751968239016, "grad_norm": 1.653269795096283, "learning_rate": 9.31771829913886e-06, "loss": 0.4466, "step": 359 }, { "epoch": 0.19379584146423526, "grad_norm": 1.6015504141518857, "learning_rate": 9.313313554330484e-06, "loss": 0.4977, "step": 360 }, { "epoch": 0.19379584146423526, "eval_loss": 0.4812440574169159, "eval_runtime": 1528.9254, "eval_samples_per_second": 16.358, "eval_steps_per_second": 0.511, "step": 360 }, { "epoch": 0.19433416324608035, "grad_norm": 1.6899547102686612, "learning_rate": 9.308895685007824e-06, "loss": 0.5404, "step": 361 }, { "epoch": 0.19487248502792545, "grad_norm": 1.8153441873291498, "learning_rate": 9.304464704613541e-06, "loss": 0.5128, "step": 362 }, { "epoch": 0.19541080680977055, "grad_norm": 1.6094259149494354, "learning_rate": 9.300020626630184e-06, "loss": 0.4854, "step": 363 }, { "epoch": 0.19594912859161565, "grad_norm": 1.726004590201776, "learning_rate": 9.295563464580153e-06, "loss": 0.4827, "step": 364 }, { "epoch": 0.19648745037346074, "grad_norm": 1.7917006550897865, "learning_rate": 9.29109323202567e-06, "loss": 0.4689, "step": 365 }, { "epoch": 0.19702577215530584, "grad_norm": 2.067420755566304, "learning_rate": 9.286609942568712e-06, "loss": 0.4411, "step": 366 }, { "epoch": 0.19756409393715094, "grad_norm": 1.9439738397276571, "learning_rate": 9.282113609851002e-06, "loss": 0.4748, "step": 367 }, { "epoch": 0.19810241571899603, "grad_norm": 1.6206588657538272, "learning_rate": 9.277604247553939e-06, "loss": 0.5215, "step": 368 }, { "epoch": 0.19864073750084113, "grad_norm": 2.0968303117516136, "learning_rate": 9.273081869398577e-06, "loss": 0.4466, "step": 369 }, { "epoch": 0.19917905928268623, "grad_norm": 1.5483077144548956, "learning_rate": 9.268546489145566e-06, "loss": 0.5042, "step": 370 }, { "epoch": 0.19971738106453132, "grad_norm": 1.6430391903483688, "learning_rate": 9.263998120595124e-06, "loss": 0.4798, "step": 371 }, { "epoch": 0.20025570284637642, "grad_norm": 1.451263876582638, "learning_rate": 9.259436777586991e-06, "loss": 0.4498, "step": 372 }, { "epoch": 0.20079402462822152, "grad_norm": 1.924895097651951, "learning_rate": 9.25486247400038e-06, "loss": 0.4971, "step": 373 }, { "epoch": 0.20133234641006661, "grad_norm": 1.5044716731151997, "learning_rate": 9.250275223753948e-06, "loss": 0.4761, "step": 374 }, { "epoch": 0.2018706681919117, "grad_norm": 1.8105401635317677, "learning_rate": 9.245675040805738e-06, "loss": 0.4645, "step": 375 }, { "epoch": 0.2024089899737568, "grad_norm": 1.4400001043179194, "learning_rate": 9.241061939153146e-06, "loss": 0.5052, "step": 376 }, { "epoch": 0.2029473117556019, "grad_norm": 2.1898160128283046, "learning_rate": 9.236435932832883e-06, "loss": 0.4571, "step": 377 }, { "epoch": 0.203485633537447, "grad_norm": 1.728102995146478, "learning_rate": 9.231797035920921e-06, "loss": 0.459, "step": 378 }, { "epoch": 0.2040239553192921, "grad_norm": 1.5484346370702677, "learning_rate": 9.227145262532458e-06, "loss": 0.5106, "step": 379 }, { "epoch": 0.2045622771011372, "grad_norm": 1.5623742217769747, "learning_rate": 9.222480626821868e-06, "loss": 0.444, "step": 380 }, { "epoch": 0.2051005988829823, "grad_norm": 1.7091436440987169, "learning_rate": 9.217803142982668e-06, "loss": 0.4732, "step": 381 }, { "epoch": 0.2056389206648274, "grad_norm": 1.4196906974845203, "learning_rate": 9.213112825247466e-06, "loss": 0.4779, "step": 382 }, { "epoch": 0.2061772424466725, "grad_norm": 1.5167704426292719, "learning_rate": 9.20840968788792e-06, "loss": 0.4967, "step": 383 }, { "epoch": 0.20671556422851758, "grad_norm": 1.4170871947038493, "learning_rate": 9.203693745214698e-06, "loss": 0.491, "step": 384 }, { "epoch": 0.20725388601036268, "grad_norm": 1.5152939794668674, "learning_rate": 9.19896501157743e-06, "loss": 0.4541, "step": 385 }, { "epoch": 0.2077922077922078, "grad_norm": 1.9536536833455793, "learning_rate": 9.19422350136467e-06, "loss": 0.4799, "step": 386 }, { "epoch": 0.2083305295740529, "grad_norm": 2.316326510948496, "learning_rate": 9.18946922900384e-06, "loss": 0.4658, "step": 387 }, { "epoch": 0.208868851355898, "grad_norm": 1.2922243986398827, "learning_rate": 9.184702208961204e-06, "loss": 0.4057, "step": 388 }, { "epoch": 0.2094071731377431, "grad_norm": 1.8303479595554093, "learning_rate": 9.179922455741812e-06, "loss": 0.4427, "step": 389 }, { "epoch": 0.2099454949195882, "grad_norm": 1.541720900007236, "learning_rate": 9.175129983889452e-06, "loss": 0.516, "step": 390 }, { "epoch": 0.2104838167014333, "grad_norm": 1.9307101459341938, "learning_rate": 9.17032480798662e-06, "loss": 0.4349, "step": 391 }, { "epoch": 0.2110221384832784, "grad_norm": 1.3922182421272982, "learning_rate": 9.165506942654468e-06, "loss": 0.4816, "step": 392 }, { "epoch": 0.21156046026512348, "grad_norm": 1.6974151932118977, "learning_rate": 9.16067640255275e-06, "loss": 0.4812, "step": 393 }, { "epoch": 0.21209878204696858, "grad_norm": 1.4726854167474133, "learning_rate": 9.155833202379798e-06, "loss": 0.4717, "step": 394 }, { "epoch": 0.21263710382881368, "grad_norm": 1.8790922445419658, "learning_rate": 9.150977356872456e-06, "loss": 0.4885, "step": 395 }, { "epoch": 0.21317542561065878, "grad_norm": 1.9084443087840661, "learning_rate": 9.146108880806056e-06, "loss": 0.4633, "step": 396 }, { "epoch": 0.21371374739250387, "grad_norm": 1.6996601490386696, "learning_rate": 9.141227788994348e-06, "loss": 0.4453, "step": 397 }, { "epoch": 0.21425206917434897, "grad_norm": 1.7127514086857762, "learning_rate": 9.136334096289485e-06, "loss": 0.5144, "step": 398 }, { "epoch": 0.21479039095619407, "grad_norm": 1.4183339048304517, "learning_rate": 9.131427817581953e-06, "loss": 0.476, "step": 399 }, { "epoch": 0.21532871273803916, "grad_norm": 1.5688801517253075, "learning_rate": 9.12650896780053e-06, "loss": 0.4657, "step": 400 }, { "epoch": 0.21586703451988426, "grad_norm": 1.391080609496865, "learning_rate": 9.121577561912256e-06, "loss": 0.5043, "step": 401 }, { "epoch": 0.21640535630172936, "grad_norm": 3.302547702490585, "learning_rate": 9.11663361492237e-06, "loss": 0.497, "step": 402 }, { "epoch": 0.21694367808357445, "grad_norm": 1.7874988296563226, "learning_rate": 9.111677141874273e-06, "loss": 0.4465, "step": 403 }, { "epoch": 0.21748199986541955, "grad_norm": 1.830004021479594, "learning_rate": 9.106708157849478e-06, "loss": 0.5088, "step": 404 }, { "epoch": 0.21802032164726465, "grad_norm": 2.4236747379642267, "learning_rate": 9.101726677967569e-06, "loss": 0.4922, "step": 405 }, { "epoch": 0.21855864342910974, "grad_norm": 1.5488577176317244, "learning_rate": 9.096732717386152e-06, "loss": 0.497, "step": 406 }, { "epoch": 0.21909696521095484, "grad_norm": 2.3263014189367306, "learning_rate": 9.091726291300806e-06, "loss": 0.4791, "step": 407 }, { "epoch": 0.21963528699279994, "grad_norm": 1.7243223143837634, "learning_rate": 9.086707414945044e-06, "loss": 0.5192, "step": 408 }, { "epoch": 0.22017360877464504, "grad_norm": 1.3667216442420331, "learning_rate": 9.08167610359026e-06, "loss": 0.4816, "step": 409 }, { "epoch": 0.22071193055649013, "grad_norm": 1.4675898960533509, "learning_rate": 9.076632372545688e-06, "loss": 0.4694, "step": 410 }, { "epoch": 0.22125025233833523, "grad_norm": 1.725309532729321, "learning_rate": 9.071576237158348e-06, "loss": 0.5097, "step": 411 }, { "epoch": 0.22178857412018033, "grad_norm": 1.48659542538949, "learning_rate": 9.066507712813009e-06, "loss": 0.445, "step": 412 }, { "epoch": 0.22232689590202542, "grad_norm": 1.6287270540094485, "learning_rate": 9.06142681493213e-06, "loss": 0.4948, "step": 413 }, { "epoch": 0.22286521768387052, "grad_norm": 1.5275233090165254, "learning_rate": 9.056333558975828e-06, "loss": 0.4556, "step": 414 }, { "epoch": 0.22340353946571564, "grad_norm": 1.6620168630066545, "learning_rate": 9.051227960441819e-06, "loss": 0.4652, "step": 415 }, { "epoch": 0.22394186124756074, "grad_norm": 2.059601149156459, "learning_rate": 9.046110034865374e-06, "loss": 0.5085, "step": 416 }, { "epoch": 0.22448018302940584, "grad_norm": 1.762324556385875, "learning_rate": 9.040979797819275e-06, "loss": 0.4461, "step": 417 }, { "epoch": 0.22501850481125094, "grad_norm": 1.7567357923246754, "learning_rate": 9.035837264913764e-06, "loss": 0.4732, "step": 418 }, { "epoch": 0.22555682659309603, "grad_norm": 1.6696886078675257, "learning_rate": 9.030682451796497e-06, "loss": 0.4642, "step": 419 }, { "epoch": 0.22609514837494113, "grad_norm": 1.8175306322549967, "learning_rate": 9.025515374152498e-06, "loss": 0.4613, "step": 420 }, { "epoch": 0.22609514837494113, "eval_loss": 0.4776149392127991, "eval_runtime": 1533.2316, "eval_samples_per_second": 16.312, "eval_steps_per_second": 0.51, "step": 420 }, { "epoch": 0.22663347015678623, "grad_norm": 1.7934239843519915, "learning_rate": 9.020336047704105e-06, "loss": 0.516, "step": 421 }, { "epoch": 0.22717179193863132, "grad_norm": 1.5310720805604554, "learning_rate": 9.015144488210927e-06, "loss": 0.489, "step": 422 }, { "epoch": 0.22771011372047642, "grad_norm": 1.48774951332565, "learning_rate": 9.009940711469804e-06, "loss": 0.5009, "step": 423 }, { "epoch": 0.22824843550232152, "grad_norm": 2.4756529462562145, "learning_rate": 9.004724733314738e-06, "loss": 0.4406, "step": 424 }, { "epoch": 0.22878675728416661, "grad_norm": 1.4505668733407078, "learning_rate": 8.999496569616867e-06, "loss": 0.4554, "step": 425 }, { "epoch": 0.2293250790660117, "grad_norm": 1.7945762191089136, "learning_rate": 8.994256236284402e-06, "loss": 0.4632, "step": 426 }, { "epoch": 0.2298634008478568, "grad_norm": 1.6376843185311614, "learning_rate": 8.989003749262587e-06, "loss": 0.4885, "step": 427 }, { "epoch": 0.2304017226297019, "grad_norm": 1.8830741232863908, "learning_rate": 8.983739124533644e-06, "loss": 0.5075, "step": 428 }, { "epoch": 0.230940044411547, "grad_norm": 1.3195150579928587, "learning_rate": 8.978462378116729e-06, "loss": 0.4708, "step": 429 }, { "epoch": 0.2314783661933921, "grad_norm": 3.7495214134368977, "learning_rate": 8.973173526067883e-06, "loss": 0.4286, "step": 430 }, { "epoch": 0.2320166879752372, "grad_norm": 2.359888838059791, "learning_rate": 8.967872584479977e-06, "loss": 0.5009, "step": 431 }, { "epoch": 0.2325550097570823, "grad_norm": 2.307039087438763, "learning_rate": 8.962559569482677e-06, "loss": 0.5676, "step": 432 }, { "epoch": 0.2330933315389274, "grad_norm": 1.6816015759212095, "learning_rate": 8.957234497242378e-06, "loss": 0.4741, "step": 433 }, { "epoch": 0.2336316533207725, "grad_norm": 1.322921614998224, "learning_rate": 8.951897383962163e-06, "loss": 0.4688, "step": 434 }, { "epoch": 0.23416997510261758, "grad_norm": 1.4430047272258668, "learning_rate": 8.946548245881758e-06, "loss": 0.4711, "step": 435 }, { "epoch": 0.23470829688446268, "grad_norm": 1.5731159349637571, "learning_rate": 8.941187099277475e-06, "loss": 0.5128, "step": 436 }, { "epoch": 0.23524661866630778, "grad_norm": 1.7731819377906834, "learning_rate": 8.935813960462166e-06, "loss": 0.4669, "step": 437 }, { "epoch": 0.23578494044815287, "grad_norm": 1.5736170200351274, "learning_rate": 8.930428845785171e-06, "loss": 0.5151, "step": 438 }, { "epoch": 0.23632326222999797, "grad_norm": 1.9488876650276103, "learning_rate": 8.925031771632273e-06, "loss": 0.449, "step": 439 }, { "epoch": 0.23686158401184307, "grad_norm": 1.8677275264654012, "learning_rate": 8.919622754425645e-06, "loss": 0.4758, "step": 440 }, { "epoch": 0.23739990579368817, "grad_norm": 1.6185523790901868, "learning_rate": 8.914201810623796e-06, "loss": 0.4539, "step": 441 }, { "epoch": 0.23793822757553326, "grad_norm": 1.7808483857096469, "learning_rate": 8.908768956721535e-06, "loss": 0.5022, "step": 442 }, { "epoch": 0.2384765493573784, "grad_norm": 1.5766134824810658, "learning_rate": 8.903324209249895e-06, "loss": 0.448, "step": 443 }, { "epoch": 0.23901487113922348, "grad_norm": 1.734675342226781, "learning_rate": 8.897867584776114e-06, "loss": 0.4646, "step": 444 }, { "epoch": 0.23955319292106858, "grad_norm": 1.5790149541067802, "learning_rate": 8.892399099903564e-06, "loss": 0.4786, "step": 445 }, { "epoch": 0.24009151470291368, "grad_norm": 1.4746994503206987, "learning_rate": 8.8869187712717e-06, "loss": 0.5055, "step": 446 }, { "epoch": 0.24062983648475877, "grad_norm": 1.629202002564735, "learning_rate": 8.881426615556023e-06, "loss": 0.4572, "step": 447 }, { "epoch": 0.24116815826660387, "grad_norm": 2.060742412650639, "learning_rate": 8.875922649468019e-06, "loss": 0.5032, "step": 448 }, { "epoch": 0.24170648004844897, "grad_norm": 1.5621749237333817, "learning_rate": 8.87040688975511e-06, "loss": 0.4654, "step": 449 }, { "epoch": 0.24224480183029407, "grad_norm": 1.4674899116105513, "learning_rate": 8.864879353200599e-06, "loss": 0.4747, "step": 450 }, { "epoch": 0.24278312361213916, "grad_norm": 1.5183875651941505, "learning_rate": 8.859340056623632e-06, "loss": 0.4982, "step": 451 }, { "epoch": 0.24332144539398426, "grad_norm": 1.5706370531453442, "learning_rate": 8.853789016879134e-06, "loss": 0.4667, "step": 452 }, { "epoch": 0.24385976717582936, "grad_norm": 1.6305623278282155, "learning_rate": 8.84822625085776e-06, "loss": 0.456, "step": 453 }, { "epoch": 0.24439808895767445, "grad_norm": 1.6523301690172285, "learning_rate": 8.842651775485848e-06, "loss": 0.5383, "step": 454 }, { "epoch": 0.24493641073951955, "grad_norm": 1.5998220743266833, "learning_rate": 8.837065607725368e-06, "loss": 0.4829, "step": 455 }, { "epoch": 0.24547473252136465, "grad_norm": 1.7862569885991761, "learning_rate": 8.831467764573863e-06, "loss": 0.5101, "step": 456 }, { "epoch": 0.24601305430320974, "grad_norm": 1.704691179868801, "learning_rate": 8.8258582630644e-06, "loss": 0.4627, "step": 457 }, { "epoch": 0.24655137608505484, "grad_norm": 1.7756811764982563, "learning_rate": 8.820237120265526e-06, "loss": 0.5079, "step": 458 }, { "epoch": 0.24708969786689994, "grad_norm": 1.3696742776597963, "learning_rate": 8.814604353281206e-06, "loss": 0.4393, "step": 459 }, { "epoch": 0.24762801964874503, "grad_norm": 2.7637461827933083, "learning_rate": 8.80895997925078e-06, "loss": 0.4548, "step": 460 }, { "epoch": 0.24816634143059013, "grad_norm": 1.9115795242982947, "learning_rate": 8.803304015348894e-06, "loss": 0.4805, "step": 461 }, { "epoch": 0.24870466321243523, "grad_norm": 1.6805506691737162, "learning_rate": 8.797636478785475e-06, "loss": 0.4786, "step": 462 }, { "epoch": 0.24924298499428033, "grad_norm": 1.865661091263274, "learning_rate": 8.791957386805651e-06, "loss": 0.4722, "step": 463 }, { "epoch": 0.24978130677612542, "grad_norm": 1.9405317358586787, "learning_rate": 8.78626675668972e-06, "loss": 0.4705, "step": 464 }, { "epoch": 0.2503196285579705, "grad_norm": 1.4415009315383829, "learning_rate": 8.78056460575308e-06, "loss": 0.4301, "step": 465 }, { "epoch": 0.2508579503398156, "grad_norm": 1.6060330602526178, "learning_rate": 8.774850951346188e-06, "loss": 0.4114, "step": 466 }, { "epoch": 0.2513962721216607, "grad_norm": 1.7567677906852937, "learning_rate": 8.769125810854504e-06, "loss": 0.4922, "step": 467 }, { "epoch": 0.2519345939035058, "grad_norm": 1.4281502602519498, "learning_rate": 8.763389201698438e-06, "loss": 0.4426, "step": 468 }, { "epoch": 0.2524729156853509, "grad_norm": 1.787920776798679, "learning_rate": 8.757641141333296e-06, "loss": 0.4451, "step": 469 }, { "epoch": 0.253011237467196, "grad_norm": 1.4246034781799948, "learning_rate": 8.751881647249228e-06, "loss": 0.4353, "step": 470 }, { "epoch": 0.2535495592490411, "grad_norm": 1.6679185342871934, "learning_rate": 8.746110736971175e-06, "loss": 0.4573, "step": 471 }, { "epoch": 0.2540878810308862, "grad_norm": 1.6765594656197593, "learning_rate": 8.740328428058813e-06, "loss": 0.4797, "step": 472 }, { "epoch": 0.2546262028127313, "grad_norm": 1.7826390062476167, "learning_rate": 8.734534738106503e-06, "loss": 0.473, "step": 473 }, { "epoch": 0.2551645245945764, "grad_norm": 2.195730177211015, "learning_rate": 8.728729684743238e-06, "loss": 0.4648, "step": 474 }, { "epoch": 0.2557028463764215, "grad_norm": 1.475566632306908, "learning_rate": 8.722913285632584e-06, "loss": 0.4845, "step": 475 }, { "epoch": 0.2562411681582666, "grad_norm": 1.7347583810505152, "learning_rate": 8.717085558472631e-06, "loss": 0.4708, "step": 476 }, { "epoch": 0.2567794899401117, "grad_norm": 1.6902146229456119, "learning_rate": 8.71124652099594e-06, "loss": 0.4817, "step": 477 }, { "epoch": 0.2573178117219568, "grad_norm": 1.7071042054828858, "learning_rate": 8.705396190969484e-06, "loss": 0.4712, "step": 478 }, { "epoch": 0.2578561335038019, "grad_norm": 1.729348975756144, "learning_rate": 8.699534586194598e-06, "loss": 0.4881, "step": 479 }, { "epoch": 0.258394455285647, "grad_norm": 1.4614872127177663, "learning_rate": 8.693661724506924e-06, "loss": 0.457, "step": 480 }, { "epoch": 0.258394455285647, "eval_loss": 0.4751787483692169, "eval_runtime": 1539.7899, "eval_samples_per_second": 16.242, "eval_steps_per_second": 0.508, "step": 480 }, { "epoch": 0.25893277706749207, "grad_norm": 2.1154756500873977, "learning_rate": 8.687777623776357e-06, "loss": 0.4842, "step": 481 }, { "epoch": 0.25947109884933717, "grad_norm": 1.5862460419373354, "learning_rate": 8.681882301906988e-06, "loss": 0.4432, "step": 482 }, { "epoch": 0.26000942063118226, "grad_norm": 1.796404843665338, "learning_rate": 8.675975776837053e-06, "loss": 0.4759, "step": 483 }, { "epoch": 0.26054774241302736, "grad_norm": 1.5555927859924092, "learning_rate": 8.67005806653888e-06, "loss": 0.509, "step": 484 }, { "epoch": 0.26108606419487246, "grad_norm": 2.1699720622194354, "learning_rate": 8.664129189018826e-06, "loss": 0.5334, "step": 485 }, { "epoch": 0.2616243859767176, "grad_norm": 1.690073634180223, "learning_rate": 8.658189162317226e-06, "loss": 0.4356, "step": 486 }, { "epoch": 0.2621627077585627, "grad_norm": 1.8294975401345657, "learning_rate": 8.65223800450835e-06, "loss": 0.4387, "step": 487 }, { "epoch": 0.2627010295404078, "grad_norm": 2.5288130694594337, "learning_rate": 8.646275733700327e-06, "loss": 0.4567, "step": 488 }, { "epoch": 0.2632393513222529, "grad_norm": 1.957861459161194, "learning_rate": 8.640302368035105e-06, "loss": 0.4614, "step": 489 }, { "epoch": 0.263777673104098, "grad_norm": 1.5304950580333017, "learning_rate": 8.634317925688392e-06, "loss": 0.4655, "step": 490 }, { "epoch": 0.2643159948859431, "grad_norm": 1.667011172421826, "learning_rate": 8.628322424869599e-06, "loss": 0.4834, "step": 491 }, { "epoch": 0.2648543166677882, "grad_norm": 2.1636641173694464, "learning_rate": 8.622315883821783e-06, "loss": 0.4776, "step": 492 }, { "epoch": 0.2653926384496333, "grad_norm": 1.46798046973594, "learning_rate": 8.616298320821601e-06, "loss": 0.4272, "step": 493 }, { "epoch": 0.2659309602314784, "grad_norm": 1.861178177564276, "learning_rate": 8.61026975417924e-06, "loss": 0.4784, "step": 494 }, { "epoch": 0.2664692820133235, "grad_norm": 1.6268110739530368, "learning_rate": 8.604230202238373e-06, "loss": 0.5029, "step": 495 }, { "epoch": 0.2670076037951686, "grad_norm": 1.5680263307618678, "learning_rate": 8.598179683376098e-06, "loss": 0.4225, "step": 496 }, { "epoch": 0.2675459255770137, "grad_norm": 1.5774347517397593, "learning_rate": 8.592118216002883e-06, "loss": 0.4879, "step": 497 }, { "epoch": 0.2680842473588588, "grad_norm": 2.670832440569625, "learning_rate": 8.586045818562508e-06, "loss": 0.4667, "step": 498 }, { "epoch": 0.26862256914070387, "grad_norm": 2.2055704035459787, "learning_rate": 8.579962509532016e-06, "loss": 0.4331, "step": 499 }, { "epoch": 0.26916089092254897, "grad_norm": 1.4435727148058994, "learning_rate": 8.573868307421648e-06, "loss": 0.4894, "step": 500 }, { "epoch": 0.26969921270439406, "grad_norm": 1.6814136996880347, "learning_rate": 8.567763230774789e-06, "loss": 0.4697, "step": 501 }, { "epoch": 0.27023753448623916, "grad_norm": 1.5774141123551826, "learning_rate": 8.561647298167918e-06, "loss": 0.503, "step": 502 }, { "epoch": 0.27077585626808426, "grad_norm": 1.5778826165083357, "learning_rate": 8.555520528210541e-06, "loss": 0.4535, "step": 503 }, { "epoch": 0.27131417804992936, "grad_norm": 1.7129721491097367, "learning_rate": 8.549382939545143e-06, "loss": 0.4494, "step": 504 }, { "epoch": 0.27185249983177445, "grad_norm": 1.8943346844828264, "learning_rate": 8.543234550847128e-06, "loss": 0.5063, "step": 505 }, { "epoch": 0.27239082161361955, "grad_norm": 1.5886936361058726, "learning_rate": 8.537075380824761e-06, "loss": 0.4652, "step": 506 }, { "epoch": 0.27292914339546465, "grad_norm": 1.4831172032030655, "learning_rate": 8.530905448219112e-06, "loss": 0.4243, "step": 507 }, { "epoch": 0.27346746517730974, "grad_norm": 1.7919686995453996, "learning_rate": 8.524724771804001e-06, "loss": 0.5049, "step": 508 }, { "epoch": 0.27400578695915484, "grad_norm": 1.7505822684442558, "learning_rate": 8.518533370385939e-06, "loss": 0.4423, "step": 509 }, { "epoch": 0.27454410874099994, "grad_norm": 1.5798026347891434, "learning_rate": 8.512331262804069e-06, "loss": 0.4866, "step": 510 }, { "epoch": 0.27508243052284503, "grad_norm": 1.8464155171834333, "learning_rate": 8.506118467930112e-06, "loss": 0.4708, "step": 511 }, { "epoch": 0.27562075230469013, "grad_norm": 1.6897436623195476, "learning_rate": 8.499895004668308e-06, "loss": 0.4903, "step": 512 }, { "epoch": 0.27615907408653523, "grad_norm": 1.7863457448170967, "learning_rate": 8.49366089195536e-06, "loss": 0.5092, "step": 513 }, { "epoch": 0.2766973958683803, "grad_norm": 1.7320740104134424, "learning_rate": 8.487416148760375e-06, "loss": 0.48, "step": 514 }, { "epoch": 0.2772357176502254, "grad_norm": 1.7064456081649735, "learning_rate": 8.481160794084799e-06, "loss": 0.4754, "step": 515 }, { "epoch": 0.2777740394320705, "grad_norm": 1.7525756365837095, "learning_rate": 8.47489484696238e-06, "loss": 0.427, "step": 516 }, { "epoch": 0.2783123612139156, "grad_norm": 2.058946941055886, "learning_rate": 8.468618326459086e-06, "loss": 0.4847, "step": 517 }, { "epoch": 0.2788506829957607, "grad_norm": 2.0477477556261467, "learning_rate": 8.46233125167306e-06, "loss": 0.4579, "step": 518 }, { "epoch": 0.2793890047776058, "grad_norm": 1.783616738245662, "learning_rate": 8.456033641734562e-06, "loss": 0.4858, "step": 519 }, { "epoch": 0.2799273265594509, "grad_norm": 2.0513841896237444, "learning_rate": 8.449725515805907e-06, "loss": 0.5352, "step": 520 }, { "epoch": 0.280465648341296, "grad_norm": 1.6372025528727123, "learning_rate": 8.443406893081406e-06, "loss": 0.4618, "step": 521 }, { "epoch": 0.2810039701231411, "grad_norm": 1.5571805104955587, "learning_rate": 8.437077792787314e-06, "loss": 0.4038, "step": 522 }, { "epoch": 0.2815422919049862, "grad_norm": 1.75233105631481, "learning_rate": 8.43073823418176e-06, "loss": 0.4845, "step": 523 }, { "epoch": 0.2820806136868313, "grad_norm": 1.6881033261753147, "learning_rate": 8.424388236554704e-06, "loss": 0.4865, "step": 524 }, { "epoch": 0.2826189354686764, "grad_norm": 1.796069079351986, "learning_rate": 8.418027819227861e-06, "loss": 0.4538, "step": 525 }, { "epoch": 0.2831572572505215, "grad_norm": 1.24349614978993, "learning_rate": 8.41165700155466e-06, "loss": 0.4166, "step": 526 }, { "epoch": 0.2836955790323666, "grad_norm": 1.932274887854439, "learning_rate": 8.405275802920168e-06, "loss": 0.5061, "step": 527 }, { "epoch": 0.2842339008142117, "grad_norm": 1.5593268393001998, "learning_rate": 8.398884242741045e-06, "loss": 0.4894, "step": 528 }, { "epoch": 0.2847722225960568, "grad_norm": 1.7069043502360113, "learning_rate": 8.392482340465475e-06, "loss": 0.4485, "step": 529 }, { "epoch": 0.2853105443779019, "grad_norm": 1.5063144141336193, "learning_rate": 8.386070115573115e-06, "loss": 0.4175, "step": 530 }, { "epoch": 0.285848866159747, "grad_norm": 1.4364305869165457, "learning_rate": 8.379647587575026e-06, "loss": 0.4766, "step": 531 }, { "epoch": 0.28638718794159207, "grad_norm": 1.3932649525614649, "learning_rate": 8.373214776013625e-06, "loss": 0.406, "step": 532 }, { "epoch": 0.28692550972343717, "grad_norm": 1.5523357464392091, "learning_rate": 8.366771700462615e-06, "loss": 0.508, "step": 533 }, { "epoch": 0.28746383150528226, "grad_norm": 2.1213305217928613, "learning_rate": 8.360318380526932e-06, "loss": 0.4985, "step": 534 }, { "epoch": 0.28800215328712736, "grad_norm": 1.5873480547904262, "learning_rate": 8.353854835842685e-06, "loss": 0.4919, "step": 535 }, { "epoch": 0.28854047506897246, "grad_norm": 1.5670280821673355, "learning_rate": 8.347381086077095e-06, "loss": 0.4708, "step": 536 }, { "epoch": 0.28907879685081755, "grad_norm": 1.6763746949820768, "learning_rate": 8.34089715092843e-06, "loss": 0.4165, "step": 537 }, { "epoch": 0.28961711863266265, "grad_norm": 1.5717106133141925, "learning_rate": 8.334403050125956e-06, "loss": 0.4554, "step": 538 }, { "epoch": 0.29015544041450775, "grad_norm": 1.9743994746638458, "learning_rate": 8.327898803429866e-06, "loss": 0.4695, "step": 539 }, { "epoch": 0.29069376219635285, "grad_norm": 1.5473676266482859, "learning_rate": 8.32138443063123e-06, "loss": 0.4712, "step": 540 }, { "epoch": 0.29069376219635285, "eval_loss": 0.47182729840278625, "eval_runtime": 1553.992, "eval_samples_per_second": 16.094, "eval_steps_per_second": 0.503, "step": 540 }, { "epoch": 0.29123208397819794, "grad_norm": 1.4425882953477511, "learning_rate": 8.314859951551926e-06, "loss": 0.4837, "step": 541 }, { "epoch": 0.29177040576004304, "grad_norm": 1.3326493426074462, "learning_rate": 8.308325386044583e-06, "loss": 0.4814, "step": 542 }, { "epoch": 0.2923087275418882, "grad_norm": 1.6128638362772016, "learning_rate": 8.301780753992523e-06, "loss": 0.4575, "step": 543 }, { "epoch": 0.2928470493237333, "grad_norm": 1.4423693981211698, "learning_rate": 8.295226075309697e-06, "loss": 0.4633, "step": 544 }, { "epoch": 0.2933853711055784, "grad_norm": 1.6198600771922913, "learning_rate": 8.288661369940627e-06, "loss": 0.4463, "step": 545 }, { "epoch": 0.2939236928874235, "grad_norm": 1.5249628074643904, "learning_rate": 8.282086657860342e-06, "loss": 0.4668, "step": 546 }, { "epoch": 0.2944620146692686, "grad_norm": 1.8125904384120293, "learning_rate": 8.275501959074325e-06, "loss": 0.4825, "step": 547 }, { "epoch": 0.2950003364511137, "grad_norm": 1.9606743516276068, "learning_rate": 8.268907293618437e-06, "loss": 0.4684, "step": 548 }, { "epoch": 0.2955386582329588, "grad_norm": 1.494990763192773, "learning_rate": 8.262302681558872e-06, "loss": 0.4664, "step": 549 }, { "epoch": 0.29607698001480387, "grad_norm": 1.8337579001893594, "learning_rate": 8.255688142992089e-06, "loss": 0.4699, "step": 550 }, { "epoch": 0.29661530179664897, "grad_norm": 1.779841389754219, "learning_rate": 8.24906369804475e-06, "loss": 0.4857, "step": 551 }, { "epoch": 0.29715362357849406, "grad_norm": 1.6593925240524081, "learning_rate": 8.242429366873663e-06, "loss": 0.5038, "step": 552 }, { "epoch": 0.29769194536033916, "grad_norm": 1.9956877344800352, "learning_rate": 8.235785169665711e-06, "loss": 0.4911, "step": 553 }, { "epoch": 0.29823026714218426, "grad_norm": 1.579568204329291, "learning_rate": 8.229131126637804e-06, "loss": 0.4552, "step": 554 }, { "epoch": 0.29876858892402935, "grad_norm": 1.5989428055850947, "learning_rate": 8.222467258036808e-06, "loss": 0.5177, "step": 555 }, { "epoch": 0.29930691070587445, "grad_norm": 2.349536199541145, "learning_rate": 8.215793584139485e-06, "loss": 0.4911, "step": 556 }, { "epoch": 0.29984523248771955, "grad_norm": 1.9403593317863332, "learning_rate": 8.209110125252435e-06, "loss": 0.5061, "step": 557 }, { "epoch": 0.30038355426956465, "grad_norm": 1.7346564666609186, "learning_rate": 8.202416901712033e-06, "loss": 0.4357, "step": 558 }, { "epoch": 0.30092187605140974, "grad_norm": 1.710471255918245, "learning_rate": 8.195713933884359e-06, "loss": 0.5015, "step": 559 }, { "epoch": 0.30146019783325484, "grad_norm": 2.207816727293276, "learning_rate": 8.189001242165151e-06, "loss": 0.527, "step": 560 }, { "epoch": 0.30199851961509994, "grad_norm": 1.428363458277829, "learning_rate": 8.182278846979728e-06, "loss": 0.4983, "step": 561 }, { "epoch": 0.30253684139694503, "grad_norm": 1.77069966551508, "learning_rate": 8.175546768782938e-06, "loss": 0.4996, "step": 562 }, { "epoch": 0.30307516317879013, "grad_norm": 1.631420375855133, "learning_rate": 8.168805028059095e-06, "loss": 0.4899, "step": 563 }, { "epoch": 0.3036134849606352, "grad_norm": 1.6234744365340297, "learning_rate": 8.162053645321908e-06, "loss": 0.4275, "step": 564 }, { "epoch": 0.3041518067424803, "grad_norm": 1.7151129037835051, "learning_rate": 8.15529264111443e-06, "loss": 0.4628, "step": 565 }, { "epoch": 0.3046901285243254, "grad_norm": 1.6757537025608307, "learning_rate": 8.148522036008985e-06, "loss": 0.4636, "step": 566 }, { "epoch": 0.3052284503061705, "grad_norm": 1.157809434742461, "learning_rate": 8.141741850607117e-06, "loss": 0.3868, "step": 567 }, { "epoch": 0.3057667720880156, "grad_norm": 1.4360027236144732, "learning_rate": 8.134952105539515e-06, "loss": 0.4725, "step": 568 }, { "epoch": 0.3063050938698607, "grad_norm": 1.6762158717929798, "learning_rate": 8.128152821465957e-06, "loss": 0.4818, "step": 569 }, { "epoch": 0.3068434156517058, "grad_norm": 1.6736535469921034, "learning_rate": 8.121344019075253e-06, "loss": 0.4805, "step": 570 }, { "epoch": 0.3073817374335509, "grad_norm": 1.5918931966460608, "learning_rate": 8.114525719085163e-06, "loss": 0.5152, "step": 571 }, { "epoch": 0.307920059215396, "grad_norm": 1.4169517878992852, "learning_rate": 8.107697942242356e-06, "loss": 0.4731, "step": 572 }, { "epoch": 0.3084583809972411, "grad_norm": 1.5959353428431666, "learning_rate": 8.100860709322334e-06, "loss": 0.4463, "step": 573 }, { "epoch": 0.3089967027790862, "grad_norm": 1.4569323564340282, "learning_rate": 8.094014041129373e-06, "loss": 0.4046, "step": 574 }, { "epoch": 0.3095350245609313, "grad_norm": 1.5558748525412556, "learning_rate": 8.087157958496456e-06, "loss": 0.4644, "step": 575 }, { "epoch": 0.3100733463427764, "grad_norm": 1.6641076139574378, "learning_rate": 8.080292482285213e-06, "loss": 0.5064, "step": 576 }, { "epoch": 0.3106116681246215, "grad_norm": 1.5793644667521578, "learning_rate": 8.07341763338586e-06, "loss": 0.515, "step": 577 }, { "epoch": 0.3111499899064666, "grad_norm": 1.895774618714942, "learning_rate": 8.066533432717127e-06, "loss": 0.4763, "step": 578 }, { "epoch": 0.3116883116883117, "grad_norm": 1.6689610869771314, "learning_rate": 8.059639901226203e-06, "loss": 0.4487, "step": 579 }, { "epoch": 0.3122266334701568, "grad_norm": 1.4289516860868958, "learning_rate": 8.05273705988867e-06, "loss": 0.426, "step": 580 }, { "epoch": 0.3127649552520019, "grad_norm": 1.448460429863824, "learning_rate": 8.04582492970843e-06, "loss": 0.4622, "step": 581 }, { "epoch": 0.31330327703384697, "grad_norm": 1.562340995796949, "learning_rate": 8.038903531717662e-06, "loss": 0.4644, "step": 582 }, { "epoch": 0.31384159881569207, "grad_norm": 1.4837986133941243, "learning_rate": 8.031972886976731e-06, "loss": 0.4845, "step": 583 }, { "epoch": 0.31437992059753717, "grad_norm": 1.696043847539263, "learning_rate": 8.025033016574148e-06, "loss": 0.4631, "step": 584 }, { "epoch": 0.31491824237938226, "grad_norm": 1.8636443570370922, "learning_rate": 8.018083941626494e-06, "loss": 0.4582, "step": 585 }, { "epoch": 0.31545656416122736, "grad_norm": 1.6588060343624296, "learning_rate": 8.011125683278351e-06, "loss": 0.4118, "step": 586 }, { "epoch": 0.31599488594307246, "grad_norm": 2.064927405044272, "learning_rate": 8.004158262702253e-06, "loss": 0.5307, "step": 587 }, { "epoch": 0.31653320772491755, "grad_norm": 1.7599540523459494, "learning_rate": 7.997181701098608e-06, "loss": 0.4542, "step": 588 }, { "epoch": 0.31707152950676265, "grad_norm": 1.679120614548226, "learning_rate": 7.99019601969564e-06, "loss": 0.4462, "step": 589 }, { "epoch": 0.31760985128860775, "grad_norm": 1.6748781594901945, "learning_rate": 7.983201239749321e-06, "loss": 0.4435, "step": 590 }, { "epoch": 0.31814817307045284, "grad_norm": 1.6895768411385892, "learning_rate": 7.976197382543306e-06, "loss": 0.5043, "step": 591 }, { "epoch": 0.31868649485229794, "grad_norm": 1.4551705590923076, "learning_rate": 7.969184469388877e-06, "loss": 0.4992, "step": 592 }, { "epoch": 0.31922481663414304, "grad_norm": 1.8224446520059305, "learning_rate": 7.962162521624865e-06, "loss": 0.5242, "step": 593 }, { "epoch": 0.31976313841598814, "grad_norm": 1.5471915857747345, "learning_rate": 7.955131560617595e-06, "loss": 0.4672, "step": 594 }, { "epoch": 0.32030146019783323, "grad_norm": 1.943277469873626, "learning_rate": 7.948091607760815e-06, "loss": 0.4817, "step": 595 }, { "epoch": 0.32083978197967833, "grad_norm": 1.361762394527565, "learning_rate": 7.941042684475635e-06, "loss": 0.4341, "step": 596 }, { "epoch": 0.3213781037615234, "grad_norm": 1.578768861245864, "learning_rate": 7.933984812210459e-06, "loss": 0.452, "step": 597 }, { "epoch": 0.3219164255433685, "grad_norm": 1.3732353872225034, "learning_rate": 7.926918012440923e-06, "loss": 0.4349, "step": 598 }, { "epoch": 0.3224547473252136, "grad_norm": 1.8064334973816905, "learning_rate": 7.919842306669825e-06, "loss": 0.4499, "step": 599 }, { "epoch": 0.3229930691070587, "grad_norm": 1.582853458222087, "learning_rate": 7.912757716427062e-06, "loss": 0.4865, "step": 600 }, { "epoch": 0.3229930691070587, "eval_loss": 0.4672350585460663, "eval_runtime": 1563.3319, "eval_samples_per_second": 15.998, "eval_steps_per_second": 0.5, "step": 600 }, { "epoch": 0.32353139088890387, "grad_norm": 1.6009402167895466, "learning_rate": 7.905664263269567e-06, "loss": 0.4576, "step": 601 }, { "epoch": 0.32406971267074897, "grad_norm": 1.6832973254975117, "learning_rate": 7.898561968781242e-06, "loss": 0.457, "step": 602 }, { "epoch": 0.32460803445259406, "grad_norm": 4.046599916473538, "learning_rate": 7.891450854572884e-06, "loss": 0.49, "step": 603 }, { "epoch": 0.32514635623443916, "grad_norm": 1.5254137578843718, "learning_rate": 7.884330942282136e-06, "loss": 0.4533, "step": 604 }, { "epoch": 0.32568467801628426, "grad_norm": 1.5392402810831298, "learning_rate": 7.877202253573404e-06, "loss": 0.4566, "step": 605 }, { "epoch": 0.32622299979812935, "grad_norm": 1.5838863815714255, "learning_rate": 7.870064810137806e-06, "loss": 0.4224, "step": 606 }, { "epoch": 0.32676132157997445, "grad_norm": 1.5112598539099842, "learning_rate": 7.862918633693091e-06, "loss": 0.4537, "step": 607 }, { "epoch": 0.32729964336181955, "grad_norm": 1.7380984306062113, "learning_rate": 7.855763745983588e-06, "loss": 0.5168, "step": 608 }, { "epoch": 0.32783796514366464, "grad_norm": 1.3686616623355445, "learning_rate": 7.848600168780127e-06, "loss": 0.4774, "step": 609 }, { "epoch": 0.32837628692550974, "grad_norm": 1.8037345014596735, "learning_rate": 7.841427923879982e-06, "loss": 0.4841, "step": 610 }, { "epoch": 0.32891460870735484, "grad_norm": 1.5578093278723995, "learning_rate": 7.834247033106798e-06, "loss": 0.4494, "step": 611 }, { "epoch": 0.32945293048919994, "grad_norm": 1.7470526074648303, "learning_rate": 7.827057518310532e-06, "loss": 0.4316, "step": 612 }, { "epoch": 0.32999125227104503, "grad_norm": 1.344635684714144, "learning_rate": 7.819859401367376e-06, "loss": 0.4277, "step": 613 }, { "epoch": 0.33052957405289013, "grad_norm": 1.6142148463610868, "learning_rate": 7.8126527041797e-06, "loss": 0.4732, "step": 614 }, { "epoch": 0.3310678958347352, "grad_norm": 1.4894686294102883, "learning_rate": 7.805437448675986e-06, "loss": 0.4804, "step": 615 }, { "epoch": 0.3316062176165803, "grad_norm": 1.959553525810308, "learning_rate": 7.798213656810747e-06, "loss": 0.5052, "step": 616 }, { "epoch": 0.3321445393984254, "grad_norm": 1.5799236754205312, "learning_rate": 7.790981350564482e-06, "loss": 0.432, "step": 617 }, { "epoch": 0.3326828611802705, "grad_norm": 1.82490515289263, "learning_rate": 7.783740551943586e-06, "loss": 0.4394, "step": 618 }, { "epoch": 0.3332211829621156, "grad_norm": 1.5031228288941465, "learning_rate": 7.776491282980305e-06, "loss": 0.5064, "step": 619 }, { "epoch": 0.3337595047439607, "grad_norm": 1.4329349118783261, "learning_rate": 7.76923356573265e-06, "loss": 0.489, "step": 620 }, { "epoch": 0.3342978265258058, "grad_norm": 1.4961946186338742, "learning_rate": 7.761967422284347e-06, "loss": 0.4704, "step": 621 }, { "epoch": 0.3348361483076509, "grad_norm": 1.7319823672043928, "learning_rate": 7.754692874744752e-06, "loss": 0.4621, "step": 622 }, { "epoch": 0.335374470089496, "grad_norm": 2.0507693298974035, "learning_rate": 7.747409945248797e-06, "loss": 0.502, "step": 623 }, { "epoch": 0.3359127918713411, "grad_norm": 1.4817353671174234, "learning_rate": 7.74011865595692e-06, "loss": 0.4975, "step": 624 }, { "epoch": 0.3364511136531862, "grad_norm": 1.5154706925154366, "learning_rate": 7.732819029054999e-06, "loss": 0.4819, "step": 625 }, { "epoch": 0.3369894354350313, "grad_norm": 2.9866409096863507, "learning_rate": 7.725511086754269e-06, "loss": 0.4947, "step": 626 }, { "epoch": 0.3375277572168764, "grad_norm": 1.7699700957236326, "learning_rate": 7.718194851291284e-06, "loss": 0.4703, "step": 627 }, { "epoch": 0.3380660789987215, "grad_norm": 2.371528841529566, "learning_rate": 7.710870344927817e-06, "loss": 0.5458, "step": 628 }, { "epoch": 0.3386044007805666, "grad_norm": 1.5200234564971724, "learning_rate": 7.703537589950819e-06, "loss": 0.4562, "step": 629 }, { "epoch": 0.3391427225624117, "grad_norm": 1.371146036616362, "learning_rate": 7.696196608672333e-06, "loss": 0.4196, "step": 630 }, { "epoch": 0.3396810443442568, "grad_norm": 1.5627852767313657, "learning_rate": 7.688847423429434e-06, "loss": 0.505, "step": 631 }, { "epoch": 0.3402193661261019, "grad_norm": 1.3089486655111793, "learning_rate": 7.68149005658417e-06, "loss": 0.4532, "step": 632 }, { "epoch": 0.34075768790794697, "grad_norm": 1.72862210074593, "learning_rate": 7.674124530523461e-06, "loss": 0.5431, "step": 633 }, { "epoch": 0.34129600968979207, "grad_norm": 1.397330557638678, "learning_rate": 7.666750867659078e-06, "loss": 0.46, "step": 634 }, { "epoch": 0.34183433147163717, "grad_norm": 1.5822930242940645, "learning_rate": 7.659369090427537e-06, "loss": 0.5183, "step": 635 }, { "epoch": 0.34237265325348226, "grad_norm": 1.517257101602274, "learning_rate": 7.651979221290049e-06, "loss": 0.4847, "step": 636 }, { "epoch": 0.34291097503532736, "grad_norm": 1.569552765274582, "learning_rate": 7.644581282732445e-06, "loss": 0.5237, "step": 637 }, { "epoch": 0.34344929681717246, "grad_norm": 1.5173887839906304, "learning_rate": 7.637175297265109e-06, "loss": 0.444, "step": 638 }, { "epoch": 0.34398761859901755, "grad_norm": 2.6037607041595883, "learning_rate": 7.629761287422915e-06, "loss": 0.4271, "step": 639 }, { "epoch": 0.34452594038086265, "grad_norm": 1.6900192017878133, "learning_rate": 7.622339275765147e-06, "loss": 0.4631, "step": 640 }, { "epoch": 0.34506426216270775, "grad_norm": 1.6204089265699804, "learning_rate": 7.61490928487544e-06, "loss": 0.4798, "step": 641 }, { "epoch": 0.34560258394455284, "grad_norm": 2.072148397739707, "learning_rate": 7.6074713373617094e-06, "loss": 0.5169, "step": 642 }, { "epoch": 0.34614090572639794, "grad_norm": 1.4489303833679512, "learning_rate": 7.600025455856078e-06, "loss": 0.4477, "step": 643 }, { "epoch": 0.34667922750824304, "grad_norm": 1.808968142318587, "learning_rate": 7.592571663014811e-06, "loss": 0.4591, "step": 644 }, { "epoch": 0.34721754929008813, "grad_norm": 1.4861828747421941, "learning_rate": 7.5851099815182505e-06, "loss": 0.4792, "step": 645 }, { "epoch": 0.34775587107193323, "grad_norm": 1.6729126421729203, "learning_rate": 7.577640434070734e-06, "loss": 0.4832, "step": 646 }, { "epoch": 0.34829419285377833, "grad_norm": 1.871195222211602, "learning_rate": 7.5701630434005405e-06, "loss": 0.4417, "step": 647 }, { "epoch": 0.3488325146356234, "grad_norm": 1.51735945461571, "learning_rate": 7.56267783225981e-06, "loss": 0.4741, "step": 648 }, { "epoch": 0.3493708364174685, "grad_norm": 2.071142969866682, "learning_rate": 7.555184823424479e-06, "loss": 0.4127, "step": 649 }, { "epoch": 0.3499091581993136, "grad_norm": 1.910282433363155, "learning_rate": 7.547684039694216e-06, "loss": 0.4531, "step": 650 }, { "epoch": 0.3504474799811587, "grad_norm": 1.9652818314978835, "learning_rate": 7.54017550389234e-06, "loss": 0.5085, "step": 651 }, { "epoch": 0.3509858017630038, "grad_norm": 1.6117024086203307, "learning_rate": 7.5326592388657605e-06, "loss": 0.5148, "step": 652 }, { "epoch": 0.3515241235448489, "grad_norm": 1.4960314880258612, "learning_rate": 7.525135267484906e-06, "loss": 0.4629, "step": 653 }, { "epoch": 0.352062445326694, "grad_norm": 1.604228922752054, "learning_rate": 7.517603612643653e-06, "loss": 0.5117, "step": 654 }, { "epoch": 0.3526007671085391, "grad_norm": 2.136019956641433, "learning_rate": 7.5100642972592606e-06, "loss": 0.4629, "step": 655 }, { "epoch": 0.3531390888903842, "grad_norm": 1.4857693238664922, "learning_rate": 7.50251734427229e-06, "loss": 0.4671, "step": 656 }, { "epoch": 0.3536774106722293, "grad_norm": 1.4380772688023766, "learning_rate": 7.494962776646549e-06, "loss": 0.428, "step": 657 }, { "epoch": 0.35421573245407445, "grad_norm": 1.7510803552126726, "learning_rate": 7.487400617369013e-06, "loss": 0.4417, "step": 658 }, { "epoch": 0.35475405423591955, "grad_norm": 1.8718328199464012, "learning_rate": 7.479830889449754e-06, "loss": 0.4489, "step": 659 }, { "epoch": 0.35529237601776464, "grad_norm": 1.3987482870509058, "learning_rate": 7.472253615921878e-06, "loss": 0.5121, "step": 660 }, { "epoch": 0.35529237601776464, "eval_loss": 0.4641415774822235, "eval_runtime": 1581.4987, "eval_samples_per_second": 15.814, "eval_steps_per_second": 0.494, "step": 660 }, { "epoch": 0.35583069779960974, "grad_norm": 1.5856953831241587, "learning_rate": 7.464668819841453e-06, "loss": 0.4429, "step": 661 }, { "epoch": 0.35636901958145484, "grad_norm": 1.648655956667231, "learning_rate": 7.457076524287426e-06, "loss": 0.4794, "step": 662 }, { "epoch": 0.35690734136329993, "grad_norm": 1.8056054836187343, "learning_rate": 7.4494767523615754e-06, "loss": 0.4488, "step": 663 }, { "epoch": 0.35744566314514503, "grad_norm": 1.7062432057396102, "learning_rate": 7.441869527188421e-06, "loss": 0.4506, "step": 664 }, { "epoch": 0.35798398492699013, "grad_norm": 1.4819375518870144, "learning_rate": 7.434254871915166e-06, "loss": 0.4135, "step": 665 }, { "epoch": 0.3585223067088352, "grad_norm": 1.734074823822691, "learning_rate": 7.426632809711617e-06, "loss": 0.4744, "step": 666 }, { "epoch": 0.3590606284906803, "grad_norm": 1.5235626105944915, "learning_rate": 7.4190033637701216e-06, "loss": 0.4646, "step": 667 }, { "epoch": 0.3595989502725254, "grad_norm": 1.9128329967338416, "learning_rate": 7.411366557305495e-06, "loss": 0.4626, "step": 668 }, { "epoch": 0.3601372720543705, "grad_norm": 2.5022708068016097, "learning_rate": 7.403722413554947e-06, "loss": 0.4959, "step": 669 }, { "epoch": 0.3606755938362156, "grad_norm": 1.8966801972869858, "learning_rate": 7.396070955778013e-06, "loss": 0.45, "step": 670 }, { "epoch": 0.3612139156180607, "grad_norm": 2.061313497940433, "learning_rate": 7.388412207256486e-06, "loss": 0.4961, "step": 671 }, { "epoch": 0.3617522373999058, "grad_norm": 1.6720715956995327, "learning_rate": 7.380746191294341e-06, "loss": 0.4667, "step": 672 }, { "epoch": 0.3622905591817509, "grad_norm": 1.5487990630837682, "learning_rate": 7.373072931217669e-06, "loss": 0.527, "step": 673 }, { "epoch": 0.362828880963596, "grad_norm": 1.4996736955806738, "learning_rate": 7.365392450374598e-06, "loss": 0.4353, "step": 674 }, { "epoch": 0.3633672027454411, "grad_norm": 1.6372189463929279, "learning_rate": 7.357704772135231e-06, "loss": 0.469, "step": 675 }, { "epoch": 0.3639055245272862, "grad_norm": 1.5447454253844684, "learning_rate": 7.350009919891574e-06, "loss": 0.4278, "step": 676 }, { "epoch": 0.3644438463091313, "grad_norm": 1.4107385578994651, "learning_rate": 7.342307917057457e-06, "loss": 0.44, "step": 677 }, { "epoch": 0.3649821680909764, "grad_norm": 1.4950963156286234, "learning_rate": 7.334598787068469e-06, "loss": 0.4529, "step": 678 }, { "epoch": 0.3655204898728215, "grad_norm": 2.047196931688194, "learning_rate": 7.326882553381886e-06, "loss": 0.4993, "step": 679 }, { "epoch": 0.3660588116546666, "grad_norm": 1.8078116478641435, "learning_rate": 7.319159239476601e-06, "loss": 0.4903, "step": 680 }, { "epoch": 0.3665971334365117, "grad_norm": 1.6585777335125267, "learning_rate": 7.311428868853047e-06, "loss": 0.449, "step": 681 }, { "epoch": 0.3671354552183568, "grad_norm": 1.644551492901717, "learning_rate": 7.30369146503313e-06, "loss": 0.4359, "step": 682 }, { "epoch": 0.3676737770002019, "grad_norm": 1.566051715226832, "learning_rate": 7.29594705156016e-06, "loss": 0.5171, "step": 683 }, { "epoch": 0.36821209878204697, "grad_norm": 1.860361723636211, "learning_rate": 7.288195651998772e-06, "loss": 0.5058, "step": 684 }, { "epoch": 0.36875042056389207, "grad_norm": 1.479824820585221, "learning_rate": 7.280437289934858e-06, "loss": 0.5082, "step": 685 }, { "epoch": 0.36928874234573716, "grad_norm": 1.5621912841951935, "learning_rate": 7.272671988975499e-06, "loss": 0.4861, "step": 686 }, { "epoch": 0.36982706412758226, "grad_norm": 1.6260728405178757, "learning_rate": 7.264899772748889e-06, "loss": 0.5003, "step": 687 }, { "epoch": 0.37036538590942736, "grad_norm": 1.5646367035382582, "learning_rate": 7.2571206649042584e-06, "loss": 0.4559, "step": 688 }, { "epoch": 0.37090370769127246, "grad_norm": 1.7472551729015091, "learning_rate": 7.249334689111814e-06, "loss": 0.4541, "step": 689 }, { "epoch": 0.37144202947311755, "grad_norm": 1.6362939723396042, "learning_rate": 7.241541869062656e-06, "loss": 0.4733, "step": 690 }, { "epoch": 0.37198035125496265, "grad_norm": 1.4710809281537391, "learning_rate": 7.2337422284687135e-06, "loss": 0.4523, "step": 691 }, { "epoch": 0.37251867303680775, "grad_norm": 1.6849371563467512, "learning_rate": 7.225935791062665e-06, "loss": 0.4976, "step": 692 }, { "epoch": 0.37305699481865284, "grad_norm": 1.7850003378424297, "learning_rate": 7.2181225805978745e-06, "loss": 0.4482, "step": 693 }, { "epoch": 0.37359531660049794, "grad_norm": 2.355398835881447, "learning_rate": 7.210302620848315e-06, "loss": 0.4599, "step": 694 }, { "epoch": 0.37413363838234304, "grad_norm": 1.617194741699657, "learning_rate": 7.20247593560849e-06, "loss": 0.4543, "step": 695 }, { "epoch": 0.37467196016418813, "grad_norm": 1.4733355105927, "learning_rate": 7.1946425486933755e-06, "loss": 0.4125, "step": 696 }, { "epoch": 0.37521028194603323, "grad_norm": 1.4512303803275823, "learning_rate": 7.186802483938333e-06, "loss": 0.4515, "step": 697 }, { "epoch": 0.3757486037278783, "grad_norm": 1.4829224037632613, "learning_rate": 7.178955765199048e-06, "loss": 0.475, "step": 698 }, { "epoch": 0.3762869255097234, "grad_norm": 1.4882203445110318, "learning_rate": 7.171102416351448e-06, "loss": 0.4485, "step": 699 }, { "epoch": 0.3768252472915685, "grad_norm": 1.6613200067557963, "learning_rate": 7.163242461291639e-06, "loss": 0.4402, "step": 700 }, { "epoch": 0.3773635690734136, "grad_norm": 1.7483634690103926, "learning_rate": 7.155375923935826e-06, "loss": 0.4936, "step": 701 }, { "epoch": 0.3779018908552587, "grad_norm": 1.6616671629226913, "learning_rate": 7.14750282822024e-06, "loss": 0.4644, "step": 702 }, { "epoch": 0.3784402126371038, "grad_norm": 1.5260208283942596, "learning_rate": 7.139623198101073e-06, "loss": 0.489, "step": 703 }, { "epoch": 0.3789785344189489, "grad_norm": 1.361965813750003, "learning_rate": 7.131737057554399e-06, "loss": 0.3901, "step": 704 }, { "epoch": 0.379516856200794, "grad_norm": 1.620874046214403, "learning_rate": 7.1238444305760975e-06, "loss": 0.458, "step": 705 }, { "epoch": 0.3800551779826391, "grad_norm": 1.7744718469804224, "learning_rate": 7.115945341181789e-06, "loss": 0.4585, "step": 706 }, { "epoch": 0.3805934997644842, "grad_norm": 1.4959797567409379, "learning_rate": 7.108039813406755e-06, "loss": 0.4497, "step": 707 }, { "epoch": 0.3811318215463293, "grad_norm": 1.645088668489625, "learning_rate": 7.10012787130587e-06, "loss": 0.4419, "step": 708 }, { "epoch": 0.3816701433281744, "grad_norm": 1.5908205648141605, "learning_rate": 7.092209538953527e-06, "loss": 0.4768, "step": 709 }, { "epoch": 0.3822084651100195, "grad_norm": 1.2865059891101038, "learning_rate": 7.0842848404435574e-06, "loss": 0.4432, "step": 710 }, { "epoch": 0.3827467868918646, "grad_norm": 1.438686585698748, "learning_rate": 7.07635379988917e-06, "loss": 0.463, "step": 711 }, { "epoch": 0.3832851086737097, "grad_norm": 1.5810030390346108, "learning_rate": 7.068416441422867e-06, "loss": 0.4324, "step": 712 }, { "epoch": 0.3838234304555548, "grad_norm": 1.8920886247581228, "learning_rate": 7.060472789196378e-06, "loss": 0.4513, "step": 713 }, { "epoch": 0.3843617522373999, "grad_norm": 1.4721512319324748, "learning_rate": 7.052522867380578e-06, "loss": 0.4794, "step": 714 }, { "epoch": 0.38490007401924503, "grad_norm": 1.8748283518664401, "learning_rate": 7.044566700165426e-06, "loss": 0.5359, "step": 715 }, { "epoch": 0.38543839580109013, "grad_norm": 2.1664339926414247, "learning_rate": 7.036604311759879e-06, "loss": 0.4696, "step": 716 }, { "epoch": 0.3859767175829352, "grad_norm": 1.599064767192068, "learning_rate": 7.028635726391826e-06, "loss": 0.5009, "step": 717 }, { "epoch": 0.3865150393647803, "grad_norm": 1.658951664965314, "learning_rate": 7.020660968308011e-06, "loss": 0.526, "step": 718 }, { "epoch": 0.3870533611466254, "grad_norm": 1.5566803387570707, "learning_rate": 7.012680061773962e-06, "loss": 0.4944, "step": 719 }, { "epoch": 0.3875916829284705, "grad_norm": 1.5561052872784167, "learning_rate": 7.0046930310739145e-06, "loss": 0.4023, "step": 720 }, { "epoch": 0.3875916829284705, "eval_loss": 0.4598337709903717, "eval_runtime": 1512.3789, "eval_samples_per_second": 16.537, "eval_steps_per_second": 0.517, "step": 720 }, { "epoch": 0.3881300047103156, "grad_norm": 1.5343444055056177, "learning_rate": 6.996699900510736e-06, "loss": 0.4661, "step": 721 }, { "epoch": 0.3886683264921607, "grad_norm": 1.5835711750557553, "learning_rate": 6.988700694405861e-06, "loss": 0.5243, "step": 722 }, { "epoch": 0.3892066482740058, "grad_norm": 1.739458700941234, "learning_rate": 6.980695437099203e-06, "loss": 0.468, "step": 723 }, { "epoch": 0.3897449700558509, "grad_norm": 1.4597418259308022, "learning_rate": 6.972684152949095e-06, "loss": 0.4312, "step": 724 }, { "epoch": 0.390283291837696, "grad_norm": 1.4822140659700849, "learning_rate": 6.964666866332202e-06, "loss": 0.4171, "step": 725 }, { "epoch": 0.3908216136195411, "grad_norm": 2.219448742321713, "learning_rate": 6.956643601643459e-06, "loss": 0.4682, "step": 726 }, { "epoch": 0.3913599354013862, "grad_norm": 1.6249675680199915, "learning_rate": 6.948614383295988e-06, "loss": 0.467, "step": 727 }, { "epoch": 0.3918982571832313, "grad_norm": 2.5331886913847916, "learning_rate": 6.940579235721027e-06, "loss": 0.5046, "step": 728 }, { "epoch": 0.3924365789650764, "grad_norm": 1.651989792055275, "learning_rate": 6.932538183367854e-06, "loss": 0.4432, "step": 729 }, { "epoch": 0.3929749007469215, "grad_norm": 1.4451051204854284, "learning_rate": 6.924491250703716e-06, "loss": 0.436, "step": 730 }, { "epoch": 0.3935132225287666, "grad_norm": 1.6726948542569147, "learning_rate": 6.916438462213756e-06, "loss": 0.4701, "step": 731 }, { "epoch": 0.3940515443106117, "grad_norm": 1.3458270610890806, "learning_rate": 6.908379842400926e-06, "loss": 0.461, "step": 732 }, { "epoch": 0.3945898660924568, "grad_norm": 1.8671906958135296, "learning_rate": 6.90031541578593e-06, "loss": 0.4621, "step": 733 }, { "epoch": 0.3951281878743019, "grad_norm": 1.6937643401491398, "learning_rate": 6.892245206907136e-06, "loss": 0.4403, "step": 734 }, { "epoch": 0.39566650965614697, "grad_norm": 1.6011629978962008, "learning_rate": 6.88416924032051e-06, "loss": 0.4832, "step": 735 }, { "epoch": 0.39620483143799207, "grad_norm": 1.7023847640279732, "learning_rate": 6.876087540599532e-06, "loss": 0.4871, "step": 736 }, { "epoch": 0.39674315321983716, "grad_norm": 1.5639503808317925, "learning_rate": 6.868000132335132e-06, "loss": 0.504, "step": 737 }, { "epoch": 0.39728147500168226, "grad_norm": 1.6209519657967315, "learning_rate": 6.859907040135609e-06, "loss": 0.4947, "step": 738 }, { "epoch": 0.39781979678352736, "grad_norm": 1.4902231086791655, "learning_rate": 6.851808288626554e-06, "loss": 0.4329, "step": 739 }, { "epoch": 0.39835811856537245, "grad_norm": 1.4751989923406863, "learning_rate": 6.843703902450781e-06, "loss": 0.469, "step": 740 }, { "epoch": 0.39889644034721755, "grad_norm": 1.7318655949983495, "learning_rate": 6.8355939062682485e-06, "loss": 0.4646, "step": 741 }, { "epoch": 0.39943476212906265, "grad_norm": 2.0477062374958312, "learning_rate": 6.827478324755986e-06, "loss": 0.4527, "step": 742 }, { "epoch": 0.39997308391090775, "grad_norm": 1.5357049173396753, "learning_rate": 6.819357182608014e-06, "loss": 0.4119, "step": 743 }, { "epoch": 0.40051140569275284, "grad_norm": 1.6669074072618764, "learning_rate": 6.811230504535276e-06, "loss": 0.4123, "step": 744 }, { "epoch": 0.40104972747459794, "grad_norm": 2.0238793916536095, "learning_rate": 6.803098315265563e-06, "loss": 0.4607, "step": 745 }, { "epoch": 0.40158804925644304, "grad_norm": 1.7302550872159141, "learning_rate": 6.7949606395434294e-06, "loss": 0.5252, "step": 746 }, { "epoch": 0.40212637103828813, "grad_norm": 1.5575167275155066, "learning_rate": 6.786817502130127e-06, "loss": 0.4484, "step": 747 }, { "epoch": 0.40266469282013323, "grad_norm": 1.3960320100955355, "learning_rate": 6.778668927803526e-06, "loss": 0.444, "step": 748 }, { "epoch": 0.4032030146019783, "grad_norm": 1.5537207671933355, "learning_rate": 6.770514941358041e-06, "loss": 0.4522, "step": 749 }, { "epoch": 0.4037413363838234, "grad_norm": 1.6191186519608955, "learning_rate": 6.762355567604553e-06, "loss": 0.489, "step": 750 }, { "epoch": 0.4042796581656685, "grad_norm": 1.7320364851332162, "learning_rate": 6.7541908313703355e-06, "loss": 0.4746, "step": 751 }, { "epoch": 0.4048179799475136, "grad_norm": 1.5268044530623444, "learning_rate": 6.746020757498979e-06, "loss": 0.4138, "step": 752 }, { "epoch": 0.4053563017293587, "grad_norm": 1.522928297135606, "learning_rate": 6.737845370850317e-06, "loss": 0.4938, "step": 753 }, { "epoch": 0.4058946235112038, "grad_norm": 1.567608770456755, "learning_rate": 6.729664696300347e-06, "loss": 0.4745, "step": 754 }, { "epoch": 0.4064329452930489, "grad_norm": 1.5048680773669196, "learning_rate": 6.721478758741155e-06, "loss": 0.4714, "step": 755 }, { "epoch": 0.406971267074894, "grad_norm": 1.7508536934704277, "learning_rate": 6.713287583080845e-06, "loss": 0.4778, "step": 756 }, { "epoch": 0.4075095888567391, "grad_norm": 1.6217945250756625, "learning_rate": 6.70509119424346e-06, "loss": 0.4529, "step": 757 }, { "epoch": 0.4080479106385842, "grad_norm": 1.6092594479977214, "learning_rate": 6.696889617168897e-06, "loss": 0.4674, "step": 758 }, { "epoch": 0.4085862324204293, "grad_norm": 1.5153766468742507, "learning_rate": 6.688682876812851e-06, "loss": 0.4612, "step": 759 }, { "epoch": 0.4091245542022744, "grad_norm": 1.6200362705011053, "learning_rate": 6.6804709981467195e-06, "loss": 0.4812, "step": 760 }, { "epoch": 0.4096628759841195, "grad_norm": 1.6047382022765324, "learning_rate": 6.672254006157541e-06, "loss": 0.4758, "step": 761 }, { "epoch": 0.4102011977659646, "grad_norm": 1.8520426373676713, "learning_rate": 6.664031925847908e-06, "loss": 0.4184, "step": 762 }, { "epoch": 0.4107395195478097, "grad_norm": 2.2658987317474195, "learning_rate": 6.6558047822358975e-06, "loss": 0.5178, "step": 763 }, { "epoch": 0.4112778413296548, "grad_norm": 1.580321228406977, "learning_rate": 6.6475726003549934e-06, "loss": 0.4249, "step": 764 }, { "epoch": 0.4118161631114999, "grad_norm": 1.4077736219835957, "learning_rate": 6.639335405254008e-06, "loss": 0.4586, "step": 765 }, { "epoch": 0.412354484893345, "grad_norm": 1.5112139801178681, "learning_rate": 6.631093221997012e-06, "loss": 0.4316, "step": 766 }, { "epoch": 0.41289280667519007, "grad_norm": 1.4529648200398257, "learning_rate": 6.6228460756632496e-06, "loss": 0.4571, "step": 767 }, { "epoch": 0.41343112845703517, "grad_norm": 1.826148495373045, "learning_rate": 6.61459399134707e-06, "loss": 0.4278, "step": 768 }, { "epoch": 0.41396945023888027, "grad_norm": 1.5179851185666227, "learning_rate": 6.6063369941578445e-06, "loss": 0.4622, "step": 769 }, { "epoch": 0.41450777202072536, "grad_norm": 1.3529363726674315, "learning_rate": 6.5980751092198955e-06, "loss": 0.4215, "step": 770 }, { "epoch": 0.41504609380257046, "grad_norm": 2.0731132539429944, "learning_rate": 6.589808361672417e-06, "loss": 0.484, "step": 771 }, { "epoch": 0.4155844155844156, "grad_norm": 1.4870501106627148, "learning_rate": 6.581536776669402e-06, "loss": 0.4863, "step": 772 }, { "epoch": 0.4161227373662607, "grad_norm": 1.9062099501037697, "learning_rate": 6.5732603793795535e-06, "loss": 0.4238, "step": 773 }, { "epoch": 0.4166610591481058, "grad_norm": 1.5565227999579219, "learning_rate": 6.564979194986229e-06, "loss": 0.4524, "step": 774 }, { "epoch": 0.4171993809299509, "grad_norm": 2.306172957615922, "learning_rate": 6.5566932486873455e-06, "loss": 0.4964, "step": 775 }, { "epoch": 0.417737702711796, "grad_norm": 1.401583156601946, "learning_rate": 6.54840256569531e-06, "loss": 0.4304, "step": 776 }, { "epoch": 0.4182760244936411, "grad_norm": 1.749412909981746, "learning_rate": 6.540107171236943e-06, "loss": 0.4844, "step": 777 }, { "epoch": 0.4188143462754862, "grad_norm": 1.6322807652870075, "learning_rate": 6.531807090553402e-06, "loss": 0.4853, "step": 778 }, { "epoch": 0.4193526680573313, "grad_norm": 1.2479234535295218, "learning_rate": 6.5235023489001046e-06, "loss": 0.4491, "step": 779 }, { "epoch": 0.4198909898391764, "grad_norm": 1.5833625576839316, "learning_rate": 6.515192971546645e-06, "loss": 0.4171, "step": 780 }, { "epoch": 0.4198909898391764, "eval_loss": 0.4564184546470642, "eval_runtime": 1517.3821, "eval_samples_per_second": 16.482, "eval_steps_per_second": 0.515, "step": 780 }, { "epoch": 0.4204293116210215, "grad_norm": 1.5809122747897906, "learning_rate": 6.50687898377673e-06, "loss": 0.4087, "step": 781 }, { "epoch": 0.4209676334028666, "grad_norm": 1.5387429096209948, "learning_rate": 6.49856041088809e-06, "loss": 0.4414, "step": 782 }, { "epoch": 0.4215059551847117, "grad_norm": 1.6020701369523538, "learning_rate": 6.49023727819241e-06, "loss": 0.4237, "step": 783 }, { "epoch": 0.4220442769665568, "grad_norm": 1.6896383664306511, "learning_rate": 6.481909611015249e-06, "loss": 0.5049, "step": 784 }, { "epoch": 0.42258259874840187, "grad_norm": 1.4623261927757227, "learning_rate": 6.47357743469596e-06, "loss": 0.4513, "step": 785 }, { "epoch": 0.42312092053024697, "grad_norm": 1.8063028002015338, "learning_rate": 6.465240774587623e-06, "loss": 0.4917, "step": 786 }, { "epoch": 0.42365924231209207, "grad_norm": 1.639390083578586, "learning_rate": 6.4568996560569515e-06, "loss": 0.4578, "step": 787 }, { "epoch": 0.42419756409393716, "grad_norm": 1.337761070121856, "learning_rate": 6.448554104484236e-06, "loss": 0.4523, "step": 788 }, { "epoch": 0.42473588587578226, "grad_norm": 1.518872556678575, "learning_rate": 6.44020414526325e-06, "loss": 0.4384, "step": 789 }, { "epoch": 0.42527420765762736, "grad_norm": 1.491028002743192, "learning_rate": 6.431849803801179e-06, "loss": 0.451, "step": 790 }, { "epoch": 0.42581252943947245, "grad_norm": 2.093042650030991, "learning_rate": 6.423491105518542e-06, "loss": 0.4656, "step": 791 }, { "epoch": 0.42635085122131755, "grad_norm": 1.9063256309499805, "learning_rate": 6.415128075849118e-06, "loss": 0.4848, "step": 792 }, { "epoch": 0.42688917300316265, "grad_norm": 1.7660120890204227, "learning_rate": 6.4067607402398625e-06, "loss": 0.4451, "step": 793 }, { "epoch": 0.42742749478500774, "grad_norm": 1.577961253859089, "learning_rate": 6.398389124150832e-06, "loss": 0.485, "step": 794 }, { "epoch": 0.42796581656685284, "grad_norm": 1.6746798086361996, "learning_rate": 6.3900132530551125e-06, "loss": 0.4521, "step": 795 }, { "epoch": 0.42850413834869794, "grad_norm": 1.696615006593536, "learning_rate": 6.381633152438733e-06, "loss": 0.4406, "step": 796 }, { "epoch": 0.42904246013054304, "grad_norm": 3.213801364228645, "learning_rate": 6.373248847800595e-06, "loss": 0.5115, "step": 797 }, { "epoch": 0.42958078191238813, "grad_norm": 1.719986070739237, "learning_rate": 6.364860364652388e-06, "loss": 0.4237, "step": 798 }, { "epoch": 0.43011910369423323, "grad_norm": 1.778509802687885, "learning_rate": 6.3564677285185196e-06, "loss": 0.4568, "step": 799 }, { "epoch": 0.4306574254760783, "grad_norm": 1.5260126863179546, "learning_rate": 6.348070964936032e-06, "loss": 0.4337, "step": 800 }, { "epoch": 0.4311957472579234, "grad_norm": 1.5937231247097972, "learning_rate": 6.339670099454526e-06, "loss": 0.4642, "step": 801 }, { "epoch": 0.4317340690397685, "grad_norm": 2.9535392042792465, "learning_rate": 6.3312651576360866e-06, "loss": 0.4434, "step": 802 }, { "epoch": 0.4322723908216136, "grad_norm": 1.49472223900728, "learning_rate": 6.322856165055198e-06, "loss": 0.4125, "step": 803 }, { "epoch": 0.4328107126034587, "grad_norm": 2.242176131558003, "learning_rate": 6.314443147298675e-06, "loss": 0.49, "step": 804 }, { "epoch": 0.4333490343853038, "grad_norm": 1.681655235385771, "learning_rate": 6.306026129965573e-06, "loss": 0.4245, "step": 805 }, { "epoch": 0.4338873561671489, "grad_norm": 1.5909295811480582, "learning_rate": 6.297605138667127e-06, "loss": 0.4748, "step": 806 }, { "epoch": 0.434425677948994, "grad_norm": 1.5145278838582474, "learning_rate": 6.289180199026654e-06, "loss": 0.4578, "step": 807 }, { "epoch": 0.4349639997308391, "grad_norm": 1.459737051246134, "learning_rate": 6.280751336679495e-06, "loss": 0.4637, "step": 808 }, { "epoch": 0.4355023215126842, "grad_norm": 1.6191142290587295, "learning_rate": 6.2723185772729166e-06, "loss": 0.4582, "step": 809 }, { "epoch": 0.4360406432945293, "grad_norm": 2.0040844342157422, "learning_rate": 6.263881946466049e-06, "loss": 0.4783, "step": 810 }, { "epoch": 0.4365789650763744, "grad_norm": 1.7322826082498741, "learning_rate": 6.255441469929804e-06, "loss": 0.5002, "step": 811 }, { "epoch": 0.4371172868582195, "grad_norm": 1.4894619670010198, "learning_rate": 6.2469971733467925e-06, "loss": 0.4253, "step": 812 }, { "epoch": 0.4376556086400646, "grad_norm": 1.6488111913669299, "learning_rate": 6.238549082411247e-06, "loss": 0.4539, "step": 813 }, { "epoch": 0.4381939304219097, "grad_norm": 1.3488898562178637, "learning_rate": 6.230097222828949e-06, "loss": 0.4623, "step": 814 }, { "epoch": 0.4387322522037548, "grad_norm": 1.6423043283763479, "learning_rate": 6.221641620317147e-06, "loss": 0.4921, "step": 815 }, { "epoch": 0.4392705739855999, "grad_norm": 1.9335639612379423, "learning_rate": 6.2131823006044756e-06, "loss": 0.4453, "step": 816 }, { "epoch": 0.439808895767445, "grad_norm": 1.389152591337612, "learning_rate": 6.2047192894308815e-06, "loss": 0.4413, "step": 817 }, { "epoch": 0.44034721754929007, "grad_norm": 1.983305422880984, "learning_rate": 6.196252612547545e-06, "loss": 0.5093, "step": 818 }, { "epoch": 0.44088553933113517, "grad_norm": 2.053814295705837, "learning_rate": 6.187782295716802e-06, "loss": 0.4381, "step": 819 }, { "epoch": 0.44142386111298026, "grad_norm": 1.547864349515979, "learning_rate": 6.179308364712056e-06, "loss": 0.4932, "step": 820 }, { "epoch": 0.44196218289482536, "grad_norm": 1.4111506897228125, "learning_rate": 6.170830845317717e-06, "loss": 0.4695, "step": 821 }, { "epoch": 0.44250050467667046, "grad_norm": 2.5994615269947485, "learning_rate": 6.162349763329109e-06, "loss": 0.5318, "step": 822 }, { "epoch": 0.44303882645851556, "grad_norm": 1.5802737203663468, "learning_rate": 6.153865144552398e-06, "loss": 0.4676, "step": 823 }, { "epoch": 0.44357714824036065, "grad_norm": 1.4711770748421387, "learning_rate": 6.145377014804509e-06, "loss": 0.4687, "step": 824 }, { "epoch": 0.44411547002220575, "grad_norm": 1.3383114582462243, "learning_rate": 6.136885399913052e-06, "loss": 0.4514, "step": 825 }, { "epoch": 0.44465379180405085, "grad_norm": 1.375700143244168, "learning_rate": 6.1283903257162434e-06, "loss": 0.4581, "step": 826 }, { "epoch": 0.44519211358589594, "grad_norm": 1.6933351988143874, "learning_rate": 6.119891818062822e-06, "loss": 0.4399, "step": 827 }, { "epoch": 0.44573043536774104, "grad_norm": 1.4137670063234855, "learning_rate": 6.1113899028119764e-06, "loss": 0.4298, "step": 828 }, { "epoch": 0.4462687571495862, "grad_norm": 1.8781325581931287, "learning_rate": 6.102884605833262e-06, "loss": 0.4921, "step": 829 }, { "epoch": 0.4468070789314313, "grad_norm": 1.5329498351981126, "learning_rate": 6.094375953006527e-06, "loss": 0.4518, "step": 830 }, { "epoch": 0.4473454007132764, "grad_norm": 1.6692806133274172, "learning_rate": 6.085863970221827e-06, "loss": 0.5337, "step": 831 }, { "epoch": 0.4478837224951215, "grad_norm": 1.5092683621943173, "learning_rate": 6.077348683379351e-06, "loss": 0.4578, "step": 832 }, { "epoch": 0.4484220442769666, "grad_norm": 1.6510945855973929, "learning_rate": 6.068830118389345e-06, "loss": 0.479, "step": 833 }, { "epoch": 0.4489603660588117, "grad_norm": 2.639396623007194, "learning_rate": 6.060308301172026e-06, "loss": 0.451, "step": 834 }, { "epoch": 0.4494986878406568, "grad_norm": 1.8709014826106682, "learning_rate": 6.051783257657508e-06, "loss": 0.5109, "step": 835 }, { "epoch": 0.45003700962250187, "grad_norm": 2.1325245569205284, "learning_rate": 6.04325501378572e-06, "loss": 0.4874, "step": 836 }, { "epoch": 0.45057533140434697, "grad_norm": 1.4972184191802396, "learning_rate": 6.034723595506334e-06, "loss": 0.4671, "step": 837 }, { "epoch": 0.45111365318619207, "grad_norm": 1.3179174814289414, "learning_rate": 6.026189028778675e-06, "loss": 0.4078, "step": 838 }, { "epoch": 0.45165197496803716, "grad_norm": 1.521198968359238, "learning_rate": 6.017651339571652e-06, "loss": 0.4456, "step": 839 }, { "epoch": 0.45219029674988226, "grad_norm": 1.4836797423023151, "learning_rate": 6.009110553863674e-06, "loss": 0.4497, "step": 840 }, { "epoch": 0.45219029674988226, "eval_loss": 0.4534289836883545, "eval_runtime": 1525.9354, "eval_samples_per_second": 16.39, "eval_steps_per_second": 0.512, "step": 840 }, { "epoch": 0.45272861853172736, "grad_norm": 1.808617433298175, "learning_rate": 6.000566697642575e-06, "loss": 0.435, "step": 841 }, { "epoch": 0.45326694031357245, "grad_norm": 2.008290454012663, "learning_rate": 5.992019796905524e-06, "loss": 0.4626, "step": 842 }, { "epoch": 0.45380526209541755, "grad_norm": 1.7710157949578111, "learning_rate": 5.9834698776589614e-06, "loss": 0.4311, "step": 843 }, { "epoch": 0.45434358387726265, "grad_norm": 1.6230775011015806, "learning_rate": 5.9749169659185104e-06, "loss": 0.4693, "step": 844 }, { "epoch": 0.45488190565910774, "grad_norm": 1.3639464284433171, "learning_rate": 5.966361087708898e-06, "loss": 0.4658, "step": 845 }, { "epoch": 0.45542022744095284, "grad_norm": 1.8137146027163404, "learning_rate": 5.957802269063878e-06, "loss": 0.4567, "step": 846 }, { "epoch": 0.45595854922279794, "grad_norm": 1.6758956331351547, "learning_rate": 5.949240536026153e-06, "loss": 0.467, "step": 847 }, { "epoch": 0.45649687100464303, "grad_norm": 1.5131926980070547, "learning_rate": 5.940675914647293e-06, "loss": 0.4106, "step": 848 }, { "epoch": 0.45703519278648813, "grad_norm": 1.5046633719884865, "learning_rate": 5.9321084309876555e-06, "loss": 0.4282, "step": 849 }, { "epoch": 0.45757351456833323, "grad_norm": 1.6481158877878923, "learning_rate": 5.923538111116307e-06, "loss": 0.4414, "step": 850 }, { "epoch": 0.4581118363501783, "grad_norm": 2.175705374474076, "learning_rate": 5.914964981110944e-06, "loss": 0.5038, "step": 851 }, { "epoch": 0.4586501581320234, "grad_norm": 1.748850851161863, "learning_rate": 5.906389067057819e-06, "loss": 0.4603, "step": 852 }, { "epoch": 0.4591884799138685, "grad_norm": 1.5440809581743327, "learning_rate": 5.897810395051646e-06, "loss": 0.4697, "step": 853 }, { "epoch": 0.4597268016957136, "grad_norm": 1.5332714275032744, "learning_rate": 5.889228991195539e-06, "loss": 0.4549, "step": 854 }, { "epoch": 0.4602651234775587, "grad_norm": 1.6246537267152152, "learning_rate": 5.880644881600921e-06, "loss": 0.4413, "step": 855 }, { "epoch": 0.4608034452594038, "grad_norm": 1.7384003721983572, "learning_rate": 5.872058092387449e-06, "loss": 0.5178, "step": 856 }, { "epoch": 0.4613417670412489, "grad_norm": 1.4306474231507047, "learning_rate": 5.863468649682933e-06, "loss": 0.4584, "step": 857 }, { "epoch": 0.461880088823094, "grad_norm": 1.7487008875581123, "learning_rate": 5.8548765796232565e-06, "loss": 0.4775, "step": 858 }, { "epoch": 0.4624184106049391, "grad_norm": 1.6200058585564832, "learning_rate": 5.846281908352299e-06, "loss": 0.4718, "step": 859 }, { "epoch": 0.4629567323867842, "grad_norm": 1.4993582658806037, "learning_rate": 5.837684662021856e-06, "loss": 0.4367, "step": 860 }, { "epoch": 0.4634950541686293, "grad_norm": 1.6215871681690963, "learning_rate": 5.829084866791551e-06, "loss": 0.4891, "step": 861 }, { "epoch": 0.4640333759504744, "grad_norm": 1.6479378578126422, "learning_rate": 5.820482548828773e-06, "loss": 0.4701, "step": 862 }, { "epoch": 0.4645716977323195, "grad_norm": 1.709497613352161, "learning_rate": 5.811877734308583e-06, "loss": 0.4314, "step": 863 }, { "epoch": 0.4651100195141646, "grad_norm": 1.850585526202356, "learning_rate": 5.803270449413636e-06, "loss": 0.4399, "step": 864 }, { "epoch": 0.4656483412960097, "grad_norm": 1.4300437023045451, "learning_rate": 5.7946607203341075e-06, "loss": 0.4434, "step": 865 }, { "epoch": 0.4661866630778548, "grad_norm": 1.4799373263095972, "learning_rate": 5.786048573267608e-06, "loss": 0.4065, "step": 866 }, { "epoch": 0.4667249848596999, "grad_norm": 1.8869037850434587, "learning_rate": 5.777434034419111e-06, "loss": 0.4823, "step": 867 }, { "epoch": 0.467263306641545, "grad_norm": 1.720619241457494, "learning_rate": 5.768817130000857e-06, "loss": 0.4444, "step": 868 }, { "epoch": 0.46780162842339007, "grad_norm": 1.3809501342652182, "learning_rate": 5.760197886232292e-06, "loss": 0.4058, "step": 869 }, { "epoch": 0.46833995020523517, "grad_norm": 1.6474446895806825, "learning_rate": 5.75157632933998e-06, "loss": 0.4244, "step": 870 }, { "epoch": 0.46887827198708026, "grad_norm": 1.3347455312904397, "learning_rate": 5.7429524855575216e-06, "loss": 0.4509, "step": 871 }, { "epoch": 0.46941659376892536, "grad_norm": 2.4700574740497583, "learning_rate": 5.7343263811254746e-06, "loss": 0.4078, "step": 872 }, { "epoch": 0.46995491555077046, "grad_norm": 1.6808144924631037, "learning_rate": 5.725698042291279e-06, "loss": 0.445, "step": 873 }, { "epoch": 0.47049323733261555, "grad_norm": 1.6561338534624221, "learning_rate": 5.717067495309172e-06, "loss": 0.4626, "step": 874 }, { "epoch": 0.47103155911446065, "grad_norm": 1.4357104359447126, "learning_rate": 5.708434766440109e-06, "loss": 0.4253, "step": 875 }, { "epoch": 0.47156988089630575, "grad_norm": 1.5584705980730198, "learning_rate": 5.699799881951684e-06, "loss": 0.4326, "step": 876 }, { "epoch": 0.47210820267815085, "grad_norm": 1.6134096232268902, "learning_rate": 5.691162868118052e-06, "loss": 0.4361, "step": 877 }, { "epoch": 0.47264652445999594, "grad_norm": 1.4597620039500387, "learning_rate": 5.682523751219846e-06, "loss": 0.4009, "step": 878 }, { "epoch": 0.47318484624184104, "grad_norm": 1.6065681327100592, "learning_rate": 5.673882557544098e-06, "loss": 0.4859, "step": 879 }, { "epoch": 0.47372316802368614, "grad_norm": 1.5207533993363942, "learning_rate": 5.665239313384161e-06, "loss": 0.4281, "step": 880 }, { "epoch": 0.47426148980553123, "grad_norm": 1.4714029139534557, "learning_rate": 5.656594045039623e-06, "loss": 0.4364, "step": 881 }, { "epoch": 0.47479981158737633, "grad_norm": 1.7055967072229654, "learning_rate": 5.647946778816238e-06, "loss": 0.5044, "step": 882 }, { "epoch": 0.4753381333692214, "grad_norm": 1.7261543220071143, "learning_rate": 5.639297541025831e-06, "loss": 0.486, "step": 883 }, { "epoch": 0.4758764551510665, "grad_norm": 1.6626927738024924, "learning_rate": 5.630646357986232e-06, "loss": 0.5142, "step": 884 }, { "epoch": 0.4764147769329116, "grad_norm": 1.5653946306822688, "learning_rate": 5.621993256021188e-06, "loss": 0.4364, "step": 885 }, { "epoch": 0.4769530987147568, "grad_norm": 1.8026208698346797, "learning_rate": 5.613338261460287e-06, "loss": 0.4538, "step": 886 }, { "epoch": 0.47749142049660187, "grad_norm": 1.6799784860946594, "learning_rate": 5.6046814006388705e-06, "loss": 0.4644, "step": 887 }, { "epoch": 0.47802974227844697, "grad_norm": 1.4364276865950356, "learning_rate": 5.596022699897963e-06, "loss": 0.4051, "step": 888 }, { "epoch": 0.47856806406029206, "grad_norm": 1.6914469502870713, "learning_rate": 5.587362185584189e-06, "loss": 0.4871, "step": 889 }, { "epoch": 0.47910638584213716, "grad_norm": 1.4415518156055118, "learning_rate": 5.578699884049683e-06, "loss": 0.4429, "step": 890 }, { "epoch": 0.47964470762398226, "grad_norm": 1.4674935937695475, "learning_rate": 5.570035821652029e-06, "loss": 0.426, "step": 891 }, { "epoch": 0.48018302940582736, "grad_norm": 2.1147351198112982, "learning_rate": 5.561370024754161e-06, "loss": 0.4789, "step": 892 }, { "epoch": 0.48072135118767245, "grad_norm": 1.4253127193278772, "learning_rate": 5.552702519724294e-06, "loss": 0.4346, "step": 893 }, { "epoch": 0.48125967296951755, "grad_norm": 3.7503200169998676, "learning_rate": 5.544033332935838e-06, "loss": 0.4393, "step": 894 }, { "epoch": 0.48179799475136265, "grad_norm": 2.1079137772003818, "learning_rate": 5.535362490767323e-06, "loss": 0.5118, "step": 895 }, { "epoch": 0.48233631653320774, "grad_norm": 2.2185325950005477, "learning_rate": 5.526690019602315e-06, "loss": 0.3894, "step": 896 }, { "epoch": 0.48287463831505284, "grad_norm": 1.5274617672885367, "learning_rate": 5.518015945829337e-06, "loss": 0.42, "step": 897 }, { "epoch": 0.48341296009689794, "grad_norm": 1.622273471984762, "learning_rate": 5.509340295841785e-06, "loss": 0.5112, "step": 898 }, { "epoch": 0.48395128187874303, "grad_norm": 1.5776105686627353, "learning_rate": 5.500663096037856e-06, "loss": 0.4577, "step": 899 }, { "epoch": 0.48448960366058813, "grad_norm": 1.4494216604414056, "learning_rate": 5.491984372820461e-06, "loss": 0.4585, "step": 900 }, { "epoch": 0.48448960366058813, "eval_loss": 0.4497644305229187, "eval_runtime": 1526.5252, "eval_samples_per_second": 16.384, "eval_steps_per_second": 0.512, "step": 900 }, { "epoch": 0.4850279254424332, "grad_norm": 1.5164622603897875, "learning_rate": 5.483304152597145e-06, "loss": 0.4488, "step": 901 }, { "epoch": 0.4855662472242783, "grad_norm": 1.5363015107046971, "learning_rate": 5.474622461780011e-06, "loss": 0.424, "step": 902 }, { "epoch": 0.4861045690061234, "grad_norm": 1.5955517741757022, "learning_rate": 5.465939326785634e-06, "loss": 0.4544, "step": 903 }, { "epoch": 0.4866428907879685, "grad_norm": 1.879614888686265, "learning_rate": 5.457254774034983e-06, "loss": 0.5032, "step": 904 }, { "epoch": 0.4871812125698136, "grad_norm": 1.5621620080191398, "learning_rate": 5.448568829953344e-06, "loss": 0.4675, "step": 905 }, { "epoch": 0.4877195343516587, "grad_norm": 1.463009731317384, "learning_rate": 5.439881520970234e-06, "loss": 0.5112, "step": 906 }, { "epoch": 0.4882578561335038, "grad_norm": 1.4309448662315376, "learning_rate": 5.431192873519326e-06, "loss": 0.4532, "step": 907 }, { "epoch": 0.4887961779153489, "grad_norm": 1.8077348129923718, "learning_rate": 5.422502914038359e-06, "loss": 0.4498, "step": 908 }, { "epoch": 0.489334499697194, "grad_norm": 1.770786349097794, "learning_rate": 5.413811668969072e-06, "loss": 0.5081, "step": 909 }, { "epoch": 0.4898728214790391, "grad_norm": 1.911624959064584, "learning_rate": 5.4051191647571126e-06, "loss": 0.4297, "step": 910 }, { "epoch": 0.4904111432608842, "grad_norm": 2.238598280094612, "learning_rate": 5.396425427851958e-06, "loss": 0.4722, "step": 911 }, { "epoch": 0.4909494650427293, "grad_norm": 1.7184560772593453, "learning_rate": 5.387730484706839e-06, "loss": 0.4778, "step": 912 }, { "epoch": 0.4914877868245744, "grad_norm": 1.452205930174256, "learning_rate": 5.3790343617786555e-06, "loss": 0.4233, "step": 913 }, { "epoch": 0.4920261086064195, "grad_norm": 1.6315132839706739, "learning_rate": 5.3703370855278995e-06, "loss": 0.4429, "step": 914 }, { "epoch": 0.4925644303882646, "grad_norm": 2.1202501474227984, "learning_rate": 5.361638682418565e-06, "loss": 0.461, "step": 915 }, { "epoch": 0.4931027521701097, "grad_norm": 1.4850726589476337, "learning_rate": 5.352939178918084e-06, "loss": 0.5053, "step": 916 }, { "epoch": 0.4936410739519548, "grad_norm": 2.5715760460764505, "learning_rate": 5.344238601497231e-06, "loss": 0.523, "step": 917 }, { "epoch": 0.4941793957337999, "grad_norm": 1.6641597075498922, "learning_rate": 5.335536976630052e-06, "loss": 0.4452, "step": 918 }, { "epoch": 0.494717717515645, "grad_norm": 1.579954501546705, "learning_rate": 5.326834330793775e-06, "loss": 0.4365, "step": 919 }, { "epoch": 0.49525603929749007, "grad_norm": 1.8639771696751175, "learning_rate": 5.318130690468741e-06, "loss": 0.4956, "step": 920 }, { "epoch": 0.49579436107933517, "grad_norm": 1.6264721082016091, "learning_rate": 5.309426082138311e-06, "loss": 0.4592, "step": 921 }, { "epoch": 0.49633268286118026, "grad_norm": 1.624012882860616, "learning_rate": 5.300720532288798e-06, "loss": 0.437, "step": 922 }, { "epoch": 0.49687100464302536, "grad_norm": 1.6131788103239653, "learning_rate": 5.29201406740937e-06, "loss": 0.4335, "step": 923 }, { "epoch": 0.49740932642487046, "grad_norm": 1.4350753111666732, "learning_rate": 5.28330671399199e-06, "loss": 0.4462, "step": 924 }, { "epoch": 0.49794764820671555, "grad_norm": 1.9075044926150524, "learning_rate": 5.274598498531318e-06, "loss": 0.5123, "step": 925 }, { "epoch": 0.49848596998856065, "grad_norm": 2.2955162228107233, "learning_rate": 5.265889447524641e-06, "loss": 0.4649, "step": 926 }, { "epoch": 0.49902429177040575, "grad_norm": 1.8752294916309997, "learning_rate": 5.257179587471784e-06, "loss": 0.4339, "step": 927 }, { "epoch": 0.49956261355225084, "grad_norm": 1.776206864828494, "learning_rate": 5.248468944875036e-06, "loss": 0.4047, "step": 928 }, { "epoch": 0.5001009353340959, "grad_norm": 1.6863520776370677, "learning_rate": 5.239757546239069e-06, "loss": 0.4041, "step": 929 }, { "epoch": 0.500639257115941, "grad_norm": 1.6004117617835396, "learning_rate": 5.231045418070852e-06, "loss": 0.4026, "step": 930 }, { "epoch": 0.5011775788977861, "grad_norm": 1.6497898215404967, "learning_rate": 5.222332586879576e-06, "loss": 0.4953, "step": 931 }, { "epoch": 0.5017159006796312, "grad_norm": 1.6264336562152901, "learning_rate": 5.2136190791765714e-06, "loss": 0.4697, "step": 932 }, { "epoch": 0.5022542224614763, "grad_norm": 1.4687648507656423, "learning_rate": 5.204904921475226e-06, "loss": 0.4608, "step": 933 }, { "epoch": 0.5027925442433214, "grad_norm": 1.555407852307028, "learning_rate": 5.196190140290905e-06, "loss": 0.4191, "step": 934 }, { "epoch": 0.5033308660251665, "grad_norm": 1.6926089059266405, "learning_rate": 5.1874747621408705e-06, "loss": 0.4034, "step": 935 }, { "epoch": 0.5038691878070116, "grad_norm": 1.5853166612648868, "learning_rate": 5.178758813544203e-06, "loss": 0.4288, "step": 936 }, { "epoch": 0.5044075095888567, "grad_norm": 1.5462488708677307, "learning_rate": 5.170042321021721e-06, "loss": 0.5049, "step": 937 }, { "epoch": 0.5049458313707018, "grad_norm": 1.6860561151031408, "learning_rate": 5.161325311095889e-06, "loss": 0.4673, "step": 938 }, { "epoch": 0.5054841531525469, "grad_norm": 1.603506680608381, "learning_rate": 5.1526078102907565e-06, "loss": 0.4613, "step": 939 }, { "epoch": 0.506022474934392, "grad_norm": 1.7493626988274396, "learning_rate": 5.143889845131859e-06, "loss": 0.4563, "step": 940 }, { "epoch": 0.5065607967162371, "grad_norm": 1.7677497007408356, "learning_rate": 5.135171442146147e-06, "loss": 0.4389, "step": 941 }, { "epoch": 0.5070991184980822, "grad_norm": 1.7686507376112643, "learning_rate": 5.126452627861906e-06, "loss": 0.469, "step": 942 }, { "epoch": 0.5076374402799273, "grad_norm": 2.03881052798833, "learning_rate": 5.117733428808671e-06, "loss": 0.473, "step": 943 }, { "epoch": 0.5081757620617724, "grad_norm": 1.5924723958151055, "learning_rate": 5.109013871517148e-06, "loss": 0.4449, "step": 944 }, { "epoch": 0.5087140838436175, "grad_norm": 1.787982594535362, "learning_rate": 5.10029398251913e-06, "loss": 0.4575, "step": 945 }, { "epoch": 0.5092524056254626, "grad_norm": 1.8443122029947836, "learning_rate": 5.091573788347424e-06, "loss": 0.4825, "step": 946 }, { "epoch": 0.5097907274073077, "grad_norm": 1.5660114035251782, "learning_rate": 5.082853315535764e-06, "loss": 0.4705, "step": 947 }, { "epoch": 0.5103290491891528, "grad_norm": 1.4015195298555256, "learning_rate": 5.074132590618731e-06, "loss": 0.4222, "step": 948 }, { "epoch": 0.5108673709709979, "grad_norm": 1.6261999654731143, "learning_rate": 5.065411640131672e-06, "loss": 0.4172, "step": 949 }, { "epoch": 0.511405692752843, "grad_norm": 1.6580955314247148, "learning_rate": 5.0566904906106254e-06, "loss": 0.4803, "step": 950 }, { "epoch": 0.5119440145346881, "grad_norm": 1.6882580545035042, "learning_rate": 5.047969168592229e-06, "loss": 0.4959, "step": 951 }, { "epoch": 0.5124823363165332, "grad_norm": 1.2734853203083423, "learning_rate": 5.039247700613649e-06, "loss": 0.4532, "step": 952 }, { "epoch": 0.5130206580983783, "grad_norm": 1.6598696282615735, "learning_rate": 5.030526113212494e-06, "loss": 0.4443, "step": 953 }, { "epoch": 0.5135589798802234, "grad_norm": 1.555381309193185, "learning_rate": 5.021804432926739e-06, "loss": 0.4704, "step": 954 }, { "epoch": 0.5140973016620685, "grad_norm": 1.5525351037863324, "learning_rate": 5.013082686294639e-06, "loss": 0.4373, "step": 955 }, { "epoch": 0.5146356234439136, "grad_norm": 1.5575470355469987, "learning_rate": 5.00436089985465e-06, "loss": 0.4242, "step": 956 }, { "epoch": 0.5151739452257587, "grad_norm": 1.7457061624641392, "learning_rate": 4.995639100145352e-06, "loss": 0.4685, "step": 957 }, { "epoch": 0.5157122670076038, "grad_norm": 1.6284837184280405, "learning_rate": 4.9869173137053625e-06, "loss": 0.4702, "step": 958 }, { "epoch": 0.5162505887894488, "grad_norm": 2.191085743474062, "learning_rate": 4.978195567073262e-06, "loss": 0.5185, "step": 959 }, { "epoch": 0.516788910571294, "grad_norm": 1.5407588424547343, "learning_rate": 4.969473886787507e-06, "loss": 0.505, "step": 960 }, { "epoch": 0.516788910571294, "eval_loss": 0.44528621435165405, "eval_runtime": 1532.2971, "eval_samples_per_second": 16.322, "eval_steps_per_second": 0.51, "step": 960 }, { "epoch": 0.517327232353139, "grad_norm": 1.7214959560480187, "learning_rate": 4.960752299386353e-06, "loss": 0.4826, "step": 961 }, { "epoch": 0.5178655541349841, "grad_norm": 1.5649628360297678, "learning_rate": 4.9520308314077726e-06, "loss": 0.4224, "step": 962 }, { "epoch": 0.5184038759168292, "grad_norm": 1.6424636557347856, "learning_rate": 4.943309509389377e-06, "loss": 0.4148, "step": 963 }, { "epoch": 0.5189421976986743, "grad_norm": 1.98993484637264, "learning_rate": 4.934588359868329e-06, "loss": 0.4307, "step": 964 }, { "epoch": 0.5194805194805194, "grad_norm": 2.0804456077787123, "learning_rate": 4.92586740938127e-06, "loss": 0.4108, "step": 965 }, { "epoch": 0.5200188412623645, "grad_norm": 1.748710199317067, "learning_rate": 4.917146684464238e-06, "loss": 0.4567, "step": 966 }, { "epoch": 0.5205571630442096, "grad_norm": 1.4755067360374794, "learning_rate": 4.908426211652577e-06, "loss": 0.4523, "step": 967 }, { "epoch": 0.5210954848260547, "grad_norm": 1.6340640272431366, "learning_rate": 4.899706017480872e-06, "loss": 0.4697, "step": 968 }, { "epoch": 0.5216338066078998, "grad_norm": 1.5338487326156454, "learning_rate": 4.890986128482854e-06, "loss": 0.4108, "step": 969 }, { "epoch": 0.5221721283897449, "grad_norm": 1.4204187507894679, "learning_rate": 4.88226657119133e-06, "loss": 0.4175, "step": 970 }, { "epoch": 0.52271045017159, "grad_norm": 1.4916766712552136, "learning_rate": 4.873547372138095e-06, "loss": 0.4274, "step": 971 }, { "epoch": 0.5232487719534352, "grad_norm": 1.514306526603469, "learning_rate": 4.864828557853854e-06, "loss": 0.4745, "step": 972 }, { "epoch": 0.5237870937352803, "grad_norm": 1.774262113242822, "learning_rate": 4.856110154868143e-06, "loss": 0.4172, "step": 973 }, { "epoch": 0.5243254155171254, "grad_norm": 1.4311594537408503, "learning_rate": 4.847392189709246e-06, "loss": 0.4499, "step": 974 }, { "epoch": 0.5248637372989705, "grad_norm": 2.045966100772589, "learning_rate": 4.8386746889041116e-06, "loss": 0.496, "step": 975 }, { "epoch": 0.5254020590808156, "grad_norm": 1.3914439869095196, "learning_rate": 4.82995767897828e-06, "loss": 0.4068, "step": 976 }, { "epoch": 0.5259403808626607, "grad_norm": 1.3260222946498679, "learning_rate": 4.8212411864557975e-06, "loss": 0.4344, "step": 977 }, { "epoch": 0.5264787026445058, "grad_norm": 1.7672350290368148, "learning_rate": 4.812525237859131e-06, "loss": 0.4647, "step": 978 }, { "epoch": 0.5270170244263509, "grad_norm": 1.5287264304361414, "learning_rate": 4.803809859709097e-06, "loss": 0.4406, "step": 979 }, { "epoch": 0.527555346208196, "grad_norm": 1.5180822455976997, "learning_rate": 4.795095078524775e-06, "loss": 0.4462, "step": 980 }, { "epoch": 0.5280936679900411, "grad_norm": 1.5390017294524125, "learning_rate": 4.78638092082343e-06, "loss": 0.4427, "step": 981 }, { "epoch": 0.5286319897718862, "grad_norm": 1.8490518419390272, "learning_rate": 4.777667413120425e-06, "loss": 0.4716, "step": 982 }, { "epoch": 0.5291703115537313, "grad_norm": 1.9241747880139426, "learning_rate": 4.7689545819291484e-06, "loss": 0.4471, "step": 983 }, { "epoch": 0.5297086333355764, "grad_norm": 1.5723366516079713, "learning_rate": 4.760242453760932e-06, "loss": 0.3616, "step": 984 }, { "epoch": 0.5302469551174215, "grad_norm": 2.125474240340618, "learning_rate": 4.751531055124965e-06, "loss": 0.4567, "step": 985 }, { "epoch": 0.5307852768992666, "grad_norm": 1.5872857045985345, "learning_rate": 4.742820412528217e-06, "loss": 0.4311, "step": 986 }, { "epoch": 0.5313235986811117, "grad_norm": 1.5991351116825514, "learning_rate": 4.73411055247536e-06, "loss": 0.4572, "step": 987 }, { "epoch": 0.5318619204629568, "grad_norm": 1.5620726404348677, "learning_rate": 4.725401501468683e-06, "loss": 0.4299, "step": 988 }, { "epoch": 0.5324002422448019, "grad_norm": 1.6599112973852914, "learning_rate": 4.716693286008011e-06, "loss": 0.4444, "step": 989 }, { "epoch": 0.532938564026647, "grad_norm": 1.7825302359359856, "learning_rate": 4.707985932590631e-06, "loss": 0.4321, "step": 990 }, { "epoch": 0.5334768858084921, "grad_norm": 1.5739707930921258, "learning_rate": 4.699279467711204e-06, "loss": 0.4567, "step": 991 }, { "epoch": 0.5340152075903372, "grad_norm": 1.5857670482566744, "learning_rate": 4.69057391786169e-06, "loss": 0.4312, "step": 992 }, { "epoch": 0.5345535293721823, "grad_norm": 1.3615110605746865, "learning_rate": 4.68186930953126e-06, "loss": 0.376, "step": 993 }, { "epoch": 0.5350918511540274, "grad_norm": 1.4263273424189502, "learning_rate": 4.673165669206226e-06, "loss": 0.4424, "step": 994 }, { "epoch": 0.5356301729358725, "grad_norm": 2.8748098476059933, "learning_rate": 4.6644630233699495e-06, "loss": 0.4828, "step": 995 }, { "epoch": 0.5361684947177175, "grad_norm": 1.7530111025052908, "learning_rate": 4.65576139850277e-06, "loss": 0.4565, "step": 996 }, { "epoch": 0.5367068164995626, "grad_norm": 1.625700838321751, "learning_rate": 4.647060821081918e-06, "loss": 0.4397, "step": 997 }, { "epoch": 0.5372451382814077, "grad_norm": 1.7382100638812064, "learning_rate": 4.638361317581437e-06, "loss": 0.4701, "step": 998 }, { "epoch": 0.5377834600632528, "grad_norm": 2.153555864190946, "learning_rate": 4.629662914472103e-06, "loss": 0.45, "step": 999 }, { "epoch": 0.5383217818450979, "grad_norm": 1.6756544006397587, "learning_rate": 4.620965638221346e-06, "loss": 0.4373, "step": 1000 }, { "epoch": 0.538860103626943, "grad_norm": 2.115872641463188, "learning_rate": 4.612269515293162e-06, "loss": 0.4807, "step": 1001 }, { "epoch": 0.5393984254087881, "grad_norm": 1.7162266935661588, "learning_rate": 4.603574572148043e-06, "loss": 0.4231, "step": 1002 }, { "epoch": 0.5399367471906332, "grad_norm": 1.828685276454168, "learning_rate": 4.59488083524289e-06, "loss": 0.4405, "step": 1003 }, { "epoch": 0.5404750689724783, "grad_norm": 1.6864896839159536, "learning_rate": 4.58618833103093e-06, "loss": 0.4144, "step": 1004 }, { "epoch": 0.5410133907543234, "grad_norm": 1.4876643937775926, "learning_rate": 4.5774970859616426e-06, "loss": 0.4628, "step": 1005 }, { "epoch": 0.5415517125361685, "grad_norm": 1.5038750034441302, "learning_rate": 4.568807126480676e-06, "loss": 0.4595, "step": 1006 }, { "epoch": 0.5420900343180136, "grad_norm": 1.3366252716503892, "learning_rate": 4.560118479029768e-06, "loss": 0.4447, "step": 1007 }, { "epoch": 0.5426283560998587, "grad_norm": 1.5955474786951926, "learning_rate": 4.5514311700466575e-06, "loss": 0.4731, "step": 1008 }, { "epoch": 0.5431666778817038, "grad_norm": 1.415371321661975, "learning_rate": 4.5427452259650185e-06, "loss": 0.4565, "step": 1009 }, { "epoch": 0.5437049996635489, "grad_norm": 1.414837591715847, "learning_rate": 4.534060673214367e-06, "loss": 0.439, "step": 1010 }, { "epoch": 0.544243321445394, "grad_norm": 1.6390543819341332, "learning_rate": 4.525377538219991e-06, "loss": 0.4434, "step": 1011 }, { "epoch": 0.5447816432272391, "grad_norm": 1.9027726313032218, "learning_rate": 4.516695847402857e-06, "loss": 0.4841, "step": 1012 }, { "epoch": 0.5453199650090842, "grad_norm": 1.6549184700101718, "learning_rate": 4.50801562717954e-06, "loss": 0.4187, "step": 1013 }, { "epoch": 0.5458582867909293, "grad_norm": 1.672495923944031, "learning_rate": 4.499336903962146e-06, "loss": 0.461, "step": 1014 }, { "epoch": 0.5463966085727744, "grad_norm": 1.9002456572131434, "learning_rate": 4.490659704158218e-06, "loss": 0.4305, "step": 1015 }, { "epoch": 0.5469349303546195, "grad_norm": 1.3438622389285284, "learning_rate": 4.481984054170666e-06, "loss": 0.4569, "step": 1016 }, { "epoch": 0.5474732521364646, "grad_norm": 1.6738782134152472, "learning_rate": 4.473309980397686e-06, "loss": 0.4574, "step": 1017 }, { "epoch": 0.5480115739183097, "grad_norm": 1.410079098904291, "learning_rate": 4.464637509232679e-06, "loss": 0.4616, "step": 1018 }, { "epoch": 0.5485498957001548, "grad_norm": 1.5059024241541985, "learning_rate": 4.455966667064164e-06, "loss": 0.4257, "step": 1019 }, { "epoch": 0.5490882174819999, "grad_norm": 1.8743979543800648, "learning_rate": 4.447297480275708e-06, "loss": 0.4468, "step": 1020 }, { "epoch": 0.5490882174819999, "eval_loss": 0.44231292605400085, "eval_runtime": 1542.3429, "eval_samples_per_second": 16.216, "eval_steps_per_second": 0.507, "step": 1020 }, { "epoch": 0.549626539263845, "grad_norm": 2.326652305551719, "learning_rate": 4.4386299752458405e-06, "loss": 0.5123, "step": 1021 }, { "epoch": 0.5501648610456901, "grad_norm": 1.5214313173590028, "learning_rate": 4.429964178347973e-06, "loss": 0.4525, "step": 1022 }, { "epoch": 0.5507031828275352, "grad_norm": 1.578588355929213, "learning_rate": 4.4213001159503185e-06, "loss": 0.4511, "step": 1023 }, { "epoch": 0.5512415046093803, "grad_norm": 1.5736153928065848, "learning_rate": 4.4126378144158145e-06, "loss": 0.402, "step": 1024 }, { "epoch": 0.5517798263912254, "grad_norm": 1.4881049360513776, "learning_rate": 4.4039773001020394e-06, "loss": 0.4312, "step": 1025 }, { "epoch": 0.5523181481730705, "grad_norm": 1.5453517436989277, "learning_rate": 4.395318599361133e-06, "loss": 0.4297, "step": 1026 }, { "epoch": 0.5528564699549156, "grad_norm": 1.7401645944762647, "learning_rate": 4.386661738539716e-06, "loss": 0.4021, "step": 1027 }, { "epoch": 0.5533947917367606, "grad_norm": 1.6594295806955806, "learning_rate": 4.3780067439788125e-06, "loss": 0.3936, "step": 1028 }, { "epoch": 0.5539331135186057, "grad_norm": 1.4018911995650016, "learning_rate": 4.3693536420137704e-06, "loss": 0.4208, "step": 1029 }, { "epoch": 0.5544714353004508, "grad_norm": 1.554369257290078, "learning_rate": 4.360702458974172e-06, "loss": 0.3869, "step": 1030 }, { "epoch": 0.5550097570822959, "grad_norm": 1.7013778785431986, "learning_rate": 4.3520532211837645e-06, "loss": 0.4557, "step": 1031 }, { "epoch": 0.555548078864141, "grad_norm": 1.5141795112180816, "learning_rate": 4.343405954960378e-06, "loss": 0.437, "step": 1032 }, { "epoch": 0.5560864006459861, "grad_norm": 1.6876343830074998, "learning_rate": 4.334760686615842e-06, "loss": 0.4632, "step": 1033 }, { "epoch": 0.5566247224278312, "grad_norm": 1.7137409506750598, "learning_rate": 4.326117442455904e-06, "loss": 0.451, "step": 1034 }, { "epoch": 0.5571630442096763, "grad_norm": 2.2054388725094993, "learning_rate": 4.3174762487801554e-06, "loss": 0.4845, "step": 1035 }, { "epoch": 0.5577013659915214, "grad_norm": 1.4514781472802996, "learning_rate": 4.30883713188195e-06, "loss": 0.4713, "step": 1036 }, { "epoch": 0.5582396877733665, "grad_norm": 1.3155208362445518, "learning_rate": 4.300200118048318e-06, "loss": 0.4048, "step": 1037 }, { "epoch": 0.5587780095552116, "grad_norm": 1.7594624250292574, "learning_rate": 4.291565233559893e-06, "loss": 0.4719, "step": 1038 }, { "epoch": 0.5593163313370567, "grad_norm": 1.5899320924503517, "learning_rate": 4.282932504690829e-06, "loss": 0.4889, "step": 1039 }, { "epoch": 0.5598546531189018, "grad_norm": 1.5400899090595648, "learning_rate": 4.274301957708723e-06, "loss": 0.48, "step": 1040 }, { "epoch": 0.5603929749007469, "grad_norm": 1.9340975529821163, "learning_rate": 4.265673618874527e-06, "loss": 0.4558, "step": 1041 }, { "epoch": 0.560931296682592, "grad_norm": 1.1875057467361612, "learning_rate": 4.257047514442481e-06, "loss": 0.4308, "step": 1042 }, { "epoch": 0.5614696184644371, "grad_norm": 1.7255919834039524, "learning_rate": 4.248423670660022e-06, "loss": 0.4637, "step": 1043 }, { "epoch": 0.5620079402462822, "grad_norm": 1.552937296818888, "learning_rate": 4.239802113767711e-06, "loss": 0.5167, "step": 1044 }, { "epoch": 0.5625462620281273, "grad_norm": 1.4241418668403774, "learning_rate": 4.231182869999146e-06, "loss": 0.4262, "step": 1045 }, { "epoch": 0.5630845838099724, "grad_norm": 1.4079020132555902, "learning_rate": 4.222565965580892e-06, "loss": 0.4527, "step": 1046 }, { "epoch": 0.5636229055918175, "grad_norm": 1.3617602268653886, "learning_rate": 4.2139514267323925e-06, "loss": 0.4546, "step": 1047 }, { "epoch": 0.5641612273736626, "grad_norm": 1.5838734348735288, "learning_rate": 4.205339279665895e-06, "loss": 0.3903, "step": 1048 }, { "epoch": 0.5646995491555077, "grad_norm": 1.451984176062728, "learning_rate": 4.196729550586367e-06, "loss": 0.4211, "step": 1049 }, { "epoch": 0.5652378709373528, "grad_norm": 1.5454288468811321, "learning_rate": 4.18812226569142e-06, "loss": 0.3856, "step": 1050 }, { "epoch": 0.5657761927191979, "grad_norm": 1.6143068691418476, "learning_rate": 4.17951745117123e-06, "loss": 0.4137, "step": 1051 }, { "epoch": 0.566314514501043, "grad_norm": 1.5780823976901985, "learning_rate": 4.170915133208452e-06, "loss": 0.4402, "step": 1052 }, { "epoch": 0.5668528362828881, "grad_norm": 1.4482990847613153, "learning_rate": 4.162315337978148e-06, "loss": 0.5056, "step": 1053 }, { "epoch": 0.5673911580647332, "grad_norm": 1.534829858260644, "learning_rate": 4.153718091647702e-06, "loss": 0.4212, "step": 1054 }, { "epoch": 0.5679294798465783, "grad_norm": 1.6872941151721794, "learning_rate": 4.145123420376745e-06, "loss": 0.4604, "step": 1055 }, { "epoch": 0.5684678016284234, "grad_norm": 1.3923901318290877, "learning_rate": 4.136531350317069e-06, "loss": 0.4608, "step": 1056 }, { "epoch": 0.5690061234102685, "grad_norm": 1.7627677860939457, "learning_rate": 4.127941907612553e-06, "loss": 0.4345, "step": 1057 }, { "epoch": 0.5695444451921136, "grad_norm": 1.6236383393521263, "learning_rate": 4.11935511839908e-06, "loss": 0.4599, "step": 1058 }, { "epoch": 0.5700827669739587, "grad_norm": 1.5390392661613181, "learning_rate": 4.110771008804463e-06, "loss": 0.4822, "step": 1059 }, { "epoch": 0.5706210887558038, "grad_norm": 1.6460116304075034, "learning_rate": 4.102189604948356e-06, "loss": 0.4277, "step": 1060 }, { "epoch": 0.5711594105376488, "grad_norm": 1.4089445870425645, "learning_rate": 4.093610932942184e-06, "loss": 0.4055, "step": 1061 }, { "epoch": 0.571697732319494, "grad_norm": 1.4912945610802475, "learning_rate": 4.085035018889058e-06, "loss": 0.4081, "step": 1062 }, { "epoch": 0.572236054101339, "grad_norm": 1.7313554326427134, "learning_rate": 4.076461888883696e-06, "loss": 0.4516, "step": 1063 }, { "epoch": 0.5727743758831841, "grad_norm": 1.438398770463997, "learning_rate": 4.067891569012347e-06, "loss": 0.4591, "step": 1064 }, { "epoch": 0.5733126976650292, "grad_norm": 1.2911877198700585, "learning_rate": 4.059324085352709e-06, "loss": 0.3877, "step": 1065 }, { "epoch": 0.5738510194468743, "grad_norm": 1.4799665950387828, "learning_rate": 4.050759463973849e-06, "loss": 0.4027, "step": 1066 }, { "epoch": 0.5743893412287194, "grad_norm": 1.31856553741587, "learning_rate": 4.042197730936124e-06, "loss": 0.4385, "step": 1067 }, { "epoch": 0.5749276630105645, "grad_norm": 1.4681673368671948, "learning_rate": 4.033638912291104e-06, "loss": 0.4699, "step": 1068 }, { "epoch": 0.5754659847924096, "grad_norm": 1.8186933987892613, "learning_rate": 4.025083034081492e-06, "loss": 0.474, "step": 1069 }, { "epoch": 0.5760043065742547, "grad_norm": 1.7243406009536202, "learning_rate": 4.016530122341039e-06, "loss": 0.4664, "step": 1070 }, { "epoch": 0.5765426283560998, "grad_norm": 1.7574219154990909, "learning_rate": 4.007980203094476e-06, "loss": 0.412, "step": 1071 }, { "epoch": 0.5770809501379449, "grad_norm": 3.3723520725361325, "learning_rate": 3.999433302357427e-06, "loss": 0.3745, "step": 1072 }, { "epoch": 0.57761927191979, "grad_norm": 1.470644839329035, "learning_rate": 3.990889446136326e-06, "loss": 0.4192, "step": 1073 }, { "epoch": 0.5781575937016351, "grad_norm": 1.8064402874305607, "learning_rate": 3.982348660428349e-06, "loss": 0.4633, "step": 1074 }, { "epoch": 0.5786959154834802, "grad_norm": 1.5560108586108519, "learning_rate": 3.9738109712213255e-06, "loss": 0.4554, "step": 1075 }, { "epoch": 0.5792342372653253, "grad_norm": 1.390022072661602, "learning_rate": 3.965276404493667e-06, "loss": 0.4468, "step": 1076 }, { "epoch": 0.5797725590471704, "grad_norm": 1.5485174930428875, "learning_rate": 3.956744986214281e-06, "loss": 0.4406, "step": 1077 }, { "epoch": 0.5803108808290155, "grad_norm": 1.377328803064819, "learning_rate": 3.948216742342492e-06, "loss": 0.3914, "step": 1078 }, { "epoch": 0.5808492026108606, "grad_norm": 1.7377815121930535, "learning_rate": 3.939691698827975e-06, "loss": 0.4409, "step": 1079 }, { "epoch": 0.5813875243927057, "grad_norm": 1.584949416405362, "learning_rate": 3.931169881610655e-06, "loss": 0.4909, "step": 1080 }, { "epoch": 0.5813875243927057, "eval_loss": 0.43915173411369324, "eval_runtime": 1551.2876, "eval_samples_per_second": 16.122, "eval_steps_per_second": 0.504, "step": 1080 }, { "epoch": 0.5819258461745508, "grad_norm": 1.4259479318176305, "learning_rate": 3.922651316620648e-06, "loss": 0.419, "step": 1081 }, { "epoch": 0.5824641679563959, "grad_norm": 1.883836889268125, "learning_rate": 3.914136029778173e-06, "loss": 0.4847, "step": 1082 }, { "epoch": 0.583002489738241, "grad_norm": 1.5440830790183266, "learning_rate": 3.905624046993474e-06, "loss": 0.4484, "step": 1083 }, { "epoch": 0.5835408115200861, "grad_norm": 1.711059696428319, "learning_rate": 3.897115394166738e-06, "loss": 0.4682, "step": 1084 }, { "epoch": 0.5840791333019312, "grad_norm": 1.8908190002251042, "learning_rate": 3.8886100971880235e-06, "loss": 0.4325, "step": 1085 }, { "epoch": 0.5846174550837764, "grad_norm": 1.5374015806352503, "learning_rate": 3.880108181937178e-06, "loss": 0.4434, "step": 1086 }, { "epoch": 0.5851557768656215, "grad_norm": 1.864521131460447, "learning_rate": 3.871609674283757e-06, "loss": 0.4649, "step": 1087 }, { "epoch": 0.5856940986474666, "grad_norm": 1.9214802187823141, "learning_rate": 3.863114600086948e-06, "loss": 0.452, "step": 1088 }, { "epoch": 0.5862324204293117, "grad_norm": 1.3598584887277212, "learning_rate": 3.854622985195492e-06, "loss": 0.466, "step": 1089 }, { "epoch": 0.5867707422111568, "grad_norm": 1.6127091744766286, "learning_rate": 3.846134855447602e-06, "loss": 0.4627, "step": 1090 }, { "epoch": 0.5873090639930019, "grad_norm": 1.4648349504902127, "learning_rate": 3.837650236670892e-06, "loss": 0.3967, "step": 1091 }, { "epoch": 0.587847385774847, "grad_norm": 1.8146408700451369, "learning_rate": 3.829169154682283e-06, "loss": 0.4271, "step": 1092 }, { "epoch": 0.5883857075566921, "grad_norm": 1.7751846942753446, "learning_rate": 3.8206916352879446e-06, "loss": 0.4464, "step": 1093 }, { "epoch": 0.5889240293385372, "grad_norm": 1.6612024138612147, "learning_rate": 3.8122177042832e-06, "loss": 0.4107, "step": 1094 }, { "epoch": 0.5894623511203823, "grad_norm": 2.812616379162355, "learning_rate": 3.8037473874524542e-06, "loss": 0.4584, "step": 1095 }, { "epoch": 0.5900006729022274, "grad_norm": 1.3709537212409602, "learning_rate": 3.7952807105691185e-06, "loss": 0.4356, "step": 1096 }, { "epoch": 0.5905389946840724, "grad_norm": 1.2984038273503478, "learning_rate": 3.7868176993955253e-06, "loss": 0.426, "step": 1097 }, { "epoch": 0.5910773164659175, "grad_norm": 1.6589883894837865, "learning_rate": 3.7783583796828543e-06, "loss": 0.4449, "step": 1098 }, { "epoch": 0.5916156382477626, "grad_norm": 1.66006556219293, "learning_rate": 3.769902777171051e-06, "loss": 0.493, "step": 1099 }, { "epoch": 0.5921539600296077, "grad_norm": 1.5937225644555308, "learning_rate": 3.761450917588753e-06, "loss": 0.4723, "step": 1100 }, { "epoch": 0.5926922818114528, "grad_norm": 1.3456146090228862, "learning_rate": 3.7530028266532074e-06, "loss": 0.4137, "step": 1101 }, { "epoch": 0.5932306035932979, "grad_norm": 1.679198037724048, "learning_rate": 3.744558530070196e-06, "loss": 0.4261, "step": 1102 }, { "epoch": 0.593768925375143, "grad_norm": 1.581894355411804, "learning_rate": 3.7361180535339504e-06, "loss": 0.4612, "step": 1103 }, { "epoch": 0.5943072471569881, "grad_norm": 1.4999393803804146, "learning_rate": 3.7276814227270842e-06, "loss": 0.4242, "step": 1104 }, { "epoch": 0.5948455689388332, "grad_norm": 1.6700110113661726, "learning_rate": 3.719248663320506e-06, "loss": 0.4536, "step": 1105 }, { "epoch": 0.5953838907206783, "grad_norm": 1.4628534581538355, "learning_rate": 3.7108198009733454e-06, "loss": 0.3885, "step": 1106 }, { "epoch": 0.5959222125025234, "grad_norm": 1.5174908060004981, "learning_rate": 3.7023948613328736e-06, "loss": 0.4688, "step": 1107 }, { "epoch": 0.5964605342843685, "grad_norm": 1.6277090494975097, "learning_rate": 3.6939738700344264e-06, "loss": 0.4404, "step": 1108 }, { "epoch": 0.5969988560662136, "grad_norm": 2.5097831655290954, "learning_rate": 3.6855568527013273e-06, "loss": 0.4608, "step": 1109 }, { "epoch": 0.5975371778480587, "grad_norm": 1.4992012722834578, "learning_rate": 3.677143834944803e-06, "loss": 0.4446, "step": 1110 }, { "epoch": 0.5980754996299038, "grad_norm": 1.4139401580995998, "learning_rate": 3.6687348423639147e-06, "loss": 0.4098, "step": 1111 }, { "epoch": 0.5986138214117489, "grad_norm": 2.0752058550686585, "learning_rate": 3.6603299005454744e-06, "loss": 0.4234, "step": 1112 }, { "epoch": 0.599152143193594, "grad_norm": 1.6967487088214965, "learning_rate": 3.6519290350639697e-06, "loss": 0.4348, "step": 1113 }, { "epoch": 0.5996904649754391, "grad_norm": 1.7094622508466781, "learning_rate": 3.6435322714814813e-06, "loss": 0.4584, "step": 1114 }, { "epoch": 0.6002287867572842, "grad_norm": 1.5333043053128887, "learning_rate": 3.635139635347612e-06, "loss": 0.4211, "step": 1115 }, { "epoch": 0.6007671085391293, "grad_norm": 1.447440380533825, "learning_rate": 3.626751152199406e-06, "loss": 0.4392, "step": 1116 }, { "epoch": 0.6013054303209744, "grad_norm": 1.558545230893266, "learning_rate": 3.6183668475612665e-06, "loss": 0.4553, "step": 1117 }, { "epoch": 0.6018437521028195, "grad_norm": 1.7341397982742823, "learning_rate": 3.6099867469448874e-06, "loss": 0.4521, "step": 1118 }, { "epoch": 0.6023820738846646, "grad_norm": 3.5577384559068075, "learning_rate": 3.601610875849168e-06, "loss": 0.4999, "step": 1119 }, { "epoch": 0.6029203956665097, "grad_norm": 1.3499033786926813, "learning_rate": 3.5932392597601396e-06, "loss": 0.4273, "step": 1120 }, { "epoch": 0.6034587174483548, "grad_norm": 1.49775810523526, "learning_rate": 3.584871924150883e-06, "loss": 0.4275, "step": 1121 }, { "epoch": 0.6039970392301999, "grad_norm": 1.4867216376875734, "learning_rate": 3.576508894481458e-06, "loss": 0.443, "step": 1122 }, { "epoch": 0.604535361012045, "grad_norm": 1.8077118144262816, "learning_rate": 3.5681501961988212e-06, "loss": 0.408, "step": 1123 }, { "epoch": 0.6050736827938901, "grad_norm": 2.0530433441295535, "learning_rate": 3.5597958547367507e-06, "loss": 0.3988, "step": 1124 }, { "epoch": 0.6056120045757352, "grad_norm": 1.4118492293118154, "learning_rate": 3.551445895515765e-06, "loss": 0.477, "step": 1125 }, { "epoch": 0.6061503263575803, "grad_norm": 1.7018214299556869, "learning_rate": 3.5431003439430493e-06, "loss": 0.4441, "step": 1126 }, { "epoch": 0.6066886481394254, "grad_norm": 1.434018580532193, "learning_rate": 3.5347592254123795e-06, "loss": 0.4539, "step": 1127 }, { "epoch": 0.6072269699212705, "grad_norm": 1.4867130289511963, "learning_rate": 3.526422565304042e-06, "loss": 0.4158, "step": 1128 }, { "epoch": 0.6077652917031156, "grad_norm": 1.4715457603229556, "learning_rate": 3.518090388984753e-06, "loss": 0.425, "step": 1129 }, { "epoch": 0.6083036134849606, "grad_norm": 1.4891631829297116, "learning_rate": 3.5097627218075905e-06, "loss": 0.4551, "step": 1130 }, { "epoch": 0.6088419352668057, "grad_norm": 1.38559309859237, "learning_rate": 3.5014395891119112e-06, "loss": 0.3903, "step": 1131 }, { "epoch": 0.6093802570486508, "grad_norm": 1.5211311736282844, "learning_rate": 3.4931210162232716e-06, "loss": 0.474, "step": 1132 }, { "epoch": 0.6099185788304959, "grad_norm": 3.910273590345733, "learning_rate": 3.484807028453356e-06, "loss": 0.4386, "step": 1133 }, { "epoch": 0.610456900612341, "grad_norm": 1.21915593287012, "learning_rate": 3.476497651099897e-06, "loss": 0.4214, "step": 1134 }, { "epoch": 0.6109952223941861, "grad_norm": 7.218438211629208, "learning_rate": 3.4681929094465987e-06, "loss": 0.4368, "step": 1135 }, { "epoch": 0.6115335441760312, "grad_norm": 1.5885679173464573, "learning_rate": 3.4598928287630585e-06, "loss": 0.4304, "step": 1136 }, { "epoch": 0.6120718659578763, "grad_norm": 1.6276966755475062, "learning_rate": 3.451597434304692e-06, "loss": 0.4303, "step": 1137 }, { "epoch": 0.6126101877397214, "grad_norm": 2.4974771072637227, "learning_rate": 3.443306751312656e-06, "loss": 0.4812, "step": 1138 }, { "epoch": 0.6131485095215665, "grad_norm": 1.8523418655749138, "learning_rate": 3.435020805013773e-06, "loss": 0.4464, "step": 1139 }, { "epoch": 0.6136868313034116, "grad_norm": 1.6153961476534389, "learning_rate": 3.4267396206204477e-06, "loss": 0.4258, "step": 1140 }, { "epoch": 0.6136868313034116, "eval_loss": 0.4358210265636444, "eval_runtime": 1559.0889, "eval_samples_per_second": 16.041, "eval_steps_per_second": 0.502, "step": 1140 }, { "epoch": 0.6142251530852567, "grad_norm": 1.5200314946583775, "learning_rate": 3.4184632233306004e-06, "loss": 0.4328, "step": 1141 }, { "epoch": 0.6147634748671018, "grad_norm": 1.753239287330404, "learning_rate": 3.4101916383275836e-06, "loss": 0.4164, "step": 1142 }, { "epoch": 0.6153017966489469, "grad_norm": 1.3784614615536817, "learning_rate": 3.4019248907801058e-06, "loss": 0.407, "step": 1143 }, { "epoch": 0.615840118430792, "grad_norm": 1.4916546024442217, "learning_rate": 3.3936630058421567e-06, "loss": 0.4449, "step": 1144 }, { "epoch": 0.6163784402126371, "grad_norm": 1.411016335795447, "learning_rate": 3.385406008652931e-06, "loss": 0.4137, "step": 1145 }, { "epoch": 0.6169167619944822, "grad_norm": 1.969929829038151, "learning_rate": 3.3771539243367517e-06, "loss": 0.4569, "step": 1146 }, { "epoch": 0.6174550837763273, "grad_norm": 1.4268646662770854, "learning_rate": 3.3689067780029895e-06, "loss": 0.4399, "step": 1147 }, { "epoch": 0.6179934055581724, "grad_norm": 1.4858645297475759, "learning_rate": 3.3606645947459933e-06, "loss": 0.4318, "step": 1148 }, { "epoch": 0.6185317273400175, "grad_norm": 2.07970165108201, "learning_rate": 3.3524273996450087e-06, "loss": 0.4804, "step": 1149 }, { "epoch": 0.6190700491218626, "grad_norm": 1.5524399522642343, "learning_rate": 3.3441952177641046e-06, "loss": 0.448, "step": 1150 }, { "epoch": 0.6196083709037077, "grad_norm": 1.5025047668730835, "learning_rate": 3.335968074152094e-06, "loss": 0.4229, "step": 1151 }, { "epoch": 0.6201466926855528, "grad_norm": 1.51932290948172, "learning_rate": 3.32774599384246e-06, "loss": 0.4238, "step": 1152 }, { "epoch": 0.6206850144673979, "grad_norm": 1.4003637291864899, "learning_rate": 3.319529001853282e-06, "loss": 0.4618, "step": 1153 }, { "epoch": 0.621223336249243, "grad_norm": 1.3792399628540106, "learning_rate": 3.311317123187151e-06, "loss": 0.4052, "step": 1154 }, { "epoch": 0.6217616580310881, "grad_norm": 1.4341824487711958, "learning_rate": 3.3031103828311044e-06, "loss": 0.4452, "step": 1155 }, { "epoch": 0.6222999798129332, "grad_norm": 1.8890388921678993, "learning_rate": 3.294908805756543e-06, "loss": 0.4311, "step": 1156 }, { "epoch": 0.6228383015947783, "grad_norm": 1.6873174271659632, "learning_rate": 3.286712416919156e-06, "loss": 0.465, "step": 1157 }, { "epoch": 0.6233766233766234, "grad_norm": 2.113957712483436, "learning_rate": 3.2785212412588464e-06, "loss": 0.4103, "step": 1158 }, { "epoch": 0.6239149451584685, "grad_norm": 1.6169473829408894, "learning_rate": 3.2703353036996553e-06, "loss": 0.4042, "step": 1159 }, { "epoch": 0.6244532669403136, "grad_norm": 1.6678579140480474, "learning_rate": 3.262154629149684e-06, "loss": 0.4849, "step": 1160 }, { "epoch": 0.6249915887221587, "grad_norm": 1.5133551741537392, "learning_rate": 3.253979242501023e-06, "loss": 0.4479, "step": 1161 }, { "epoch": 0.6255299105040037, "grad_norm": 1.5463516633606489, "learning_rate": 3.2458091686296666e-06, "loss": 0.4589, "step": 1162 }, { "epoch": 0.6260682322858488, "grad_norm": 1.3908513399535982, "learning_rate": 3.2376444323954487e-06, "loss": 0.407, "step": 1163 }, { "epoch": 0.6266065540676939, "grad_norm": 1.4911824388993882, "learning_rate": 3.2294850586419603e-06, "loss": 0.4016, "step": 1164 }, { "epoch": 0.627144875849539, "grad_norm": 1.4342504928355473, "learning_rate": 3.2213310721964753e-06, "loss": 0.4269, "step": 1165 }, { "epoch": 0.6276831976313841, "grad_norm": 1.5982636474188436, "learning_rate": 3.2131824978698744e-06, "loss": 0.4532, "step": 1166 }, { "epoch": 0.6282215194132292, "grad_norm": 1.3672342575621805, "learning_rate": 3.2050393604565722e-06, "loss": 0.3972, "step": 1167 }, { "epoch": 0.6287598411950743, "grad_norm": 1.6874817093257244, "learning_rate": 3.196901684734439e-06, "loss": 0.457, "step": 1168 }, { "epoch": 0.6292981629769194, "grad_norm": 1.5723777384143767, "learning_rate": 3.188769495464725e-06, "loss": 0.3892, "step": 1169 }, { "epoch": 0.6298364847587645, "grad_norm": 1.601524939347794, "learning_rate": 3.180642817391988e-06, "loss": 0.4433, "step": 1170 }, { "epoch": 0.6303748065406096, "grad_norm": 2.25805654454037, "learning_rate": 3.172521675244016e-06, "loss": 0.4322, "step": 1171 }, { "epoch": 0.6309131283224547, "grad_norm": 1.5555079250741115, "learning_rate": 3.1644060937317523e-06, "loss": 0.391, "step": 1172 }, { "epoch": 0.6314514501042998, "grad_norm": 1.4992699551350894, "learning_rate": 3.1562960975492194e-06, "loss": 0.4044, "step": 1173 }, { "epoch": 0.6319897718861449, "grad_norm": 1.5799132322735037, "learning_rate": 3.1481917113734474e-06, "loss": 0.3812, "step": 1174 }, { "epoch": 0.63252809366799, "grad_norm": 1.7698333563655604, "learning_rate": 3.140092959864392e-06, "loss": 0.4353, "step": 1175 }, { "epoch": 0.6330664154498351, "grad_norm": 1.568455528145148, "learning_rate": 3.1319998676648695e-06, "loss": 0.4307, "step": 1176 }, { "epoch": 0.6336047372316802, "grad_norm": 1.6539679705814518, "learning_rate": 3.12391245940047e-06, "loss": 0.4269, "step": 1177 }, { "epoch": 0.6341430590135253, "grad_norm": 1.7204853297231233, "learning_rate": 3.115830759679492e-06, "loss": 0.4857, "step": 1178 }, { "epoch": 0.6346813807953704, "grad_norm": 1.6626863719528417, "learning_rate": 3.1077547930928652e-06, "loss": 0.4681, "step": 1179 }, { "epoch": 0.6352197025772155, "grad_norm": 1.6842711637823262, "learning_rate": 3.0996845842140716e-06, "loss": 0.4312, "step": 1180 }, { "epoch": 0.6357580243590606, "grad_norm": 1.7431784823037149, "learning_rate": 3.091620157599075e-06, "loss": 0.4206, "step": 1181 }, { "epoch": 0.6362963461409057, "grad_norm": 1.7565059915579697, "learning_rate": 3.0835615377862453e-06, "loss": 0.4787, "step": 1182 }, { "epoch": 0.6368346679227508, "grad_norm": 1.5940508036600212, "learning_rate": 3.0755087492962844e-06, "loss": 0.3977, "step": 1183 }, { "epoch": 0.6373729897045959, "grad_norm": 1.4265440236436624, "learning_rate": 3.0674618166321477e-06, "loss": 0.4455, "step": 1184 }, { "epoch": 0.637911311486441, "grad_norm": 1.5203806820148102, "learning_rate": 3.059420764278975e-06, "loss": 0.4421, "step": 1185 }, { "epoch": 0.6384496332682861, "grad_norm": 1.7485388075672719, "learning_rate": 3.0513856167040123e-06, "loss": 0.4337, "step": 1186 }, { "epoch": 0.6389879550501312, "grad_norm": 1.5758916072812403, "learning_rate": 3.0433563983565415e-06, "loss": 0.483, "step": 1187 }, { "epoch": 0.6395262768319763, "grad_norm": 1.7757740619316615, "learning_rate": 3.0353331336677984e-06, "loss": 0.402, "step": 1188 }, { "epoch": 0.6400645986138214, "grad_norm": 1.5639356203741708, "learning_rate": 3.027315847050906e-06, "loss": 0.4588, "step": 1189 }, { "epoch": 0.6406029203956665, "grad_norm": 1.900913903628273, "learning_rate": 3.0193045629007982e-06, "loss": 0.4318, "step": 1190 }, { "epoch": 0.6411412421775116, "grad_norm": 1.7813979669008324, "learning_rate": 3.011299305594141e-06, "loss": 0.4444, "step": 1191 }, { "epoch": 0.6416795639593567, "grad_norm": 1.4267787696799576, "learning_rate": 3.0033000994892646e-06, "loss": 0.4394, "step": 1192 }, { "epoch": 0.6422178857412018, "grad_norm": 1.425734282167891, "learning_rate": 2.995306968926087e-06, "loss": 0.4729, "step": 1193 }, { "epoch": 0.6427562075230469, "grad_norm": 1.6415657973276232, "learning_rate": 2.98731993822604e-06, "loss": 0.4644, "step": 1194 }, { "epoch": 0.643294529304892, "grad_norm": 1.8314597950910743, "learning_rate": 2.97933903169199e-06, "loss": 0.5308, "step": 1195 }, { "epoch": 0.643832851086737, "grad_norm": 1.5314208582263587, "learning_rate": 2.9713642736081755e-06, "loss": 0.4539, "step": 1196 }, { "epoch": 0.6443711728685821, "grad_norm": 1.7043966331574372, "learning_rate": 2.9633956882401215e-06, "loss": 0.4478, "step": 1197 }, { "epoch": 0.6449094946504272, "grad_norm": 1.3896380014466228, "learning_rate": 2.955433299834576e-06, "loss": 0.4274, "step": 1198 }, { "epoch": 0.6454478164322723, "grad_norm": 1.328466975562685, "learning_rate": 2.947477132619423e-06, "loss": 0.4151, "step": 1199 }, { "epoch": 0.6459861382141174, "grad_norm": 1.4947495053829816, "learning_rate": 2.939527210803624e-06, "loss": 0.4225, "step": 1200 }, { "epoch": 0.6459861382141174, "eval_loss": 0.43335118889808655, "eval_runtime": 1568.1591, "eval_samples_per_second": 15.949, "eval_steps_per_second": 0.499, "step": 1200 }, { "epoch": 0.6465244599959626, "grad_norm": 1.7770419353679783, "learning_rate": 2.9315835585771334e-06, "loss": 0.4443, "step": 1201 }, { "epoch": 0.6470627817778077, "grad_norm": 1.509257884926516, "learning_rate": 2.923646200110832e-06, "loss": 0.403, "step": 1202 }, { "epoch": 0.6476011035596528, "grad_norm": 1.413359799607147, "learning_rate": 2.915715159556444e-06, "loss": 0.3995, "step": 1203 }, { "epoch": 0.6481394253414979, "grad_norm": 1.4051405846579907, "learning_rate": 2.9077904610464745e-06, "loss": 0.3597, "step": 1204 }, { "epoch": 0.648677747123343, "grad_norm": 1.5857210618229394, "learning_rate": 2.89987212869413e-06, "loss": 0.448, "step": 1205 }, { "epoch": 0.6492160689051881, "grad_norm": 1.3723187404527468, "learning_rate": 2.8919601865932456e-06, "loss": 0.4522, "step": 1206 }, { "epoch": 0.6497543906870332, "grad_norm": 1.3511061410304184, "learning_rate": 2.884054658818214e-06, "loss": 0.3792, "step": 1207 }, { "epoch": 0.6502927124688783, "grad_norm": 1.387760091675675, "learning_rate": 2.8761555694239046e-06, "loss": 0.4515, "step": 1208 }, { "epoch": 0.6508310342507234, "grad_norm": 1.4247593593472396, "learning_rate": 2.868262942445603e-06, "loss": 0.4489, "step": 1209 }, { "epoch": 0.6513693560325685, "grad_norm": 1.600671347691334, "learning_rate": 2.8603768018989275e-06, "loss": 0.3944, "step": 1210 }, { "epoch": 0.6519076778144136, "grad_norm": 1.4284428882228806, "learning_rate": 2.852497171779761e-06, "loss": 0.432, "step": 1211 }, { "epoch": 0.6524459995962587, "grad_norm": 1.8170320001458748, "learning_rate": 2.8446240760641762e-06, "loss": 0.483, "step": 1212 }, { "epoch": 0.6529843213781038, "grad_norm": 1.872300633931277, "learning_rate": 2.836757538708362e-06, "loss": 0.4226, "step": 1213 }, { "epoch": 0.6535226431599489, "grad_norm": 1.5545253276420463, "learning_rate": 2.8288975836485523e-06, "loss": 0.4452, "step": 1214 }, { "epoch": 0.654060964941794, "grad_norm": 1.4689119979210103, "learning_rate": 2.8210442348009543e-06, "loss": 0.4206, "step": 1215 }, { "epoch": 0.6545992867236391, "grad_norm": 1.495722266239985, "learning_rate": 2.8131975160616686e-06, "loss": 0.4555, "step": 1216 }, { "epoch": 0.6551376085054842, "grad_norm": 1.4286754464458904, "learning_rate": 2.805357451306626e-06, "loss": 0.4531, "step": 1217 }, { "epoch": 0.6556759302873293, "grad_norm": 1.6604089854519999, "learning_rate": 2.797524064391511e-06, "loss": 0.4351, "step": 1218 }, { "epoch": 0.6562142520691744, "grad_norm": 1.677727217993553, "learning_rate": 2.7896973791516867e-06, "loss": 0.4797, "step": 1219 }, { "epoch": 0.6567525738510195, "grad_norm": 1.8188528752490087, "learning_rate": 2.781877419402126e-06, "loss": 0.3942, "step": 1220 }, { "epoch": 0.6572908956328646, "grad_norm": 1.518304729497582, "learning_rate": 2.7740642089373356e-06, "loss": 0.4567, "step": 1221 }, { "epoch": 0.6578292174147097, "grad_norm": 1.9076520179847476, "learning_rate": 2.76625777153129e-06, "loss": 0.4761, "step": 1222 }, { "epoch": 0.6583675391965548, "grad_norm": 1.6501027454283104, "learning_rate": 2.758458130937346e-06, "loss": 0.4568, "step": 1223 }, { "epoch": 0.6589058609783999, "grad_norm": 1.4971909664683323, "learning_rate": 2.7506653108881885e-06, "loss": 0.4534, "step": 1224 }, { "epoch": 0.659444182760245, "grad_norm": 1.8216935826384455, "learning_rate": 2.742879335095743e-06, "loss": 0.4872, "step": 1225 }, { "epoch": 0.6599825045420901, "grad_norm": 1.441369836777809, "learning_rate": 2.735100227251113e-06, "loss": 0.3857, "step": 1226 }, { "epoch": 0.6605208263239352, "grad_norm": 1.3907320663098741, "learning_rate": 2.7273280110245e-06, "loss": 0.4055, "step": 1227 }, { "epoch": 0.6610591481057803, "grad_norm": 1.3629302314750185, "learning_rate": 2.719562710065142e-06, "loss": 0.4059, "step": 1228 }, { "epoch": 0.6615974698876254, "grad_norm": 1.5181251515722511, "learning_rate": 2.711804348001231e-06, "loss": 0.4927, "step": 1229 }, { "epoch": 0.6621357916694705, "grad_norm": 1.583461554714453, "learning_rate": 2.704052948439842e-06, "loss": 0.4139, "step": 1230 }, { "epoch": 0.6626741134513155, "grad_norm": 1.597683792644596, "learning_rate": 2.6963085349668718e-06, "loss": 0.4299, "step": 1231 }, { "epoch": 0.6632124352331606, "grad_norm": 1.4538764746820028, "learning_rate": 2.6885711311469547e-06, "loss": 0.4238, "step": 1232 }, { "epoch": 0.6637507570150057, "grad_norm": 1.5760098860778269, "learning_rate": 2.6808407605234006e-06, "loss": 0.4605, "step": 1233 }, { "epoch": 0.6642890787968508, "grad_norm": 1.8819638022647283, "learning_rate": 2.673117446618114e-06, "loss": 0.4176, "step": 1234 }, { "epoch": 0.6648274005786959, "grad_norm": 1.7467867886896942, "learning_rate": 2.665401212931532e-06, "loss": 0.4284, "step": 1235 }, { "epoch": 0.665365722360541, "grad_norm": 1.3582161008888671, "learning_rate": 2.6576920829425434e-06, "loss": 0.449, "step": 1236 }, { "epoch": 0.6659040441423861, "grad_norm": 1.7112669988534182, "learning_rate": 2.6499900801084283e-06, "loss": 0.4702, "step": 1237 }, { "epoch": 0.6664423659242312, "grad_norm": 2.099925951296545, "learning_rate": 2.6422952278647705e-06, "loss": 0.4592, "step": 1238 }, { "epoch": 0.6669806877060763, "grad_norm": 1.4352705146813356, "learning_rate": 2.6346075496254054e-06, "loss": 0.384, "step": 1239 }, { "epoch": 0.6675190094879214, "grad_norm": 1.89895053480487, "learning_rate": 2.6269270687823337e-06, "loss": 0.4632, "step": 1240 }, { "epoch": 0.6680573312697665, "grad_norm": 1.527126991788229, "learning_rate": 2.619253808705661e-06, "loss": 0.4304, "step": 1241 }, { "epoch": 0.6685956530516116, "grad_norm": 1.9088122860113825, "learning_rate": 2.6115877927435152e-06, "loss": 0.4615, "step": 1242 }, { "epoch": 0.6691339748334567, "grad_norm": 1.5152814714510374, "learning_rate": 2.6039290442219884e-06, "loss": 0.4019, "step": 1243 }, { "epoch": 0.6696722966153018, "grad_norm": 1.490222426325067, "learning_rate": 2.5962775864450563e-06, "loss": 0.425, "step": 1244 }, { "epoch": 0.6702106183971469, "grad_norm": 1.5269175130136061, "learning_rate": 2.588633442694508e-06, "loss": 0.3988, "step": 1245 }, { "epoch": 0.670748940178992, "grad_norm": 1.4416954872355545, "learning_rate": 2.5809966362298805e-06, "loss": 0.4603, "step": 1246 }, { "epoch": 0.6712872619608371, "grad_norm": 2.6364873275752014, "learning_rate": 2.573367190288385e-06, "loss": 0.4648, "step": 1247 }, { "epoch": 0.6718255837426822, "grad_norm": 1.788546820645697, "learning_rate": 2.5657451280848355e-06, "loss": 0.4635, "step": 1248 }, { "epoch": 0.6723639055245273, "grad_norm": 1.3806063124644692, "learning_rate": 2.5581304728115797e-06, "loss": 0.4943, "step": 1249 }, { "epoch": 0.6729022273063724, "grad_norm": 1.402487270939909, "learning_rate": 2.550523247638426e-06, "loss": 0.4006, "step": 1250 }, { "epoch": 0.6734405490882175, "grad_norm": 1.910681275697032, "learning_rate": 2.542923475712574e-06, "loss": 0.4609, "step": 1251 }, { "epoch": 0.6739788708700626, "grad_norm": 1.446121535462886, "learning_rate": 2.5353311801585507e-06, "loss": 0.4092, "step": 1252 }, { "epoch": 0.6745171926519077, "grad_norm": 1.6008122915794563, "learning_rate": 2.5277463840781236e-06, "loss": 0.4648, "step": 1253 }, { "epoch": 0.6750555144337528, "grad_norm": 1.8052193116478468, "learning_rate": 2.520169110550248e-06, "loss": 0.4325, "step": 1254 }, { "epoch": 0.6755938362155979, "grad_norm": 2.0544496666589245, "learning_rate": 2.5125993826309904e-06, "loss": 0.4102, "step": 1255 }, { "epoch": 0.676132157997443, "grad_norm": 1.5511129757696938, "learning_rate": 2.5050372233534526e-06, "loss": 0.4443, "step": 1256 }, { "epoch": 0.6766704797792881, "grad_norm": 1.8672906417068529, "learning_rate": 2.4974826557277115e-06, "loss": 0.4516, "step": 1257 }, { "epoch": 0.6772088015611332, "grad_norm": 1.4831806217941237, "learning_rate": 2.489935702740741e-06, "loss": 0.4347, "step": 1258 }, { "epoch": 0.6777471233429783, "grad_norm": 1.5986607931002996, "learning_rate": 2.4823963873563487e-06, "loss": 0.427, "step": 1259 }, { "epoch": 0.6782854451248234, "grad_norm": 1.481767434298922, "learning_rate": 2.4748647325150966e-06, "loss": 0.4135, "step": 1260 }, { "epoch": 0.6782854451248234, "eval_loss": 0.43108630180358887, "eval_runtime": 1581.7954, "eval_samples_per_second": 15.811, "eval_steps_per_second": 0.494, "step": 1260 }, { "epoch": 0.6788237669066685, "grad_norm": 1.491812080960543, "learning_rate": 2.467340761134242e-06, "loss": 0.4392, "step": 1261 }, { "epoch": 0.6793620886885136, "grad_norm": 1.5403059882131847, "learning_rate": 2.459824496107662e-06, "loss": 0.4631, "step": 1262 }, { "epoch": 0.6799004104703587, "grad_norm": 1.4488066174399352, "learning_rate": 2.4523159603057858e-06, "loss": 0.4401, "step": 1263 }, { "epoch": 0.6804387322522037, "grad_norm": 1.6997928715987718, "learning_rate": 2.444815176575521e-06, "loss": 0.4671, "step": 1264 }, { "epoch": 0.6809770540340488, "grad_norm": 1.6242395825984155, "learning_rate": 2.4373221677401916e-06, "loss": 0.4227, "step": 1265 }, { "epoch": 0.6815153758158939, "grad_norm": 1.3272959133305353, "learning_rate": 2.429836956599463e-06, "loss": 0.3586, "step": 1266 }, { "epoch": 0.682053697597739, "grad_norm": 1.723455688742321, "learning_rate": 2.422359565929268e-06, "loss": 0.4275, "step": 1267 }, { "epoch": 0.6825920193795841, "grad_norm": 1.3911086482449566, "learning_rate": 2.414890018481752e-06, "loss": 0.4383, "step": 1268 }, { "epoch": 0.6831303411614292, "grad_norm": 1.515918050738459, "learning_rate": 2.40742833698519e-06, "loss": 0.4342, "step": 1269 }, { "epoch": 0.6836686629432743, "grad_norm": 1.6928322026664087, "learning_rate": 2.3999745441439243e-06, "loss": 0.4156, "step": 1270 }, { "epoch": 0.6842069847251194, "grad_norm": 1.3632558682947689, "learning_rate": 2.3925286626382926e-06, "loss": 0.3914, "step": 1271 }, { "epoch": 0.6847453065069645, "grad_norm": 3.139130094162036, "learning_rate": 2.385090715124562e-06, "loss": 0.4637, "step": 1272 }, { "epoch": 0.6852836282888096, "grad_norm": 1.434440598705869, "learning_rate": 2.3776607242348547e-06, "loss": 0.437, "step": 1273 }, { "epoch": 0.6858219500706547, "grad_norm": 1.5144260531076574, "learning_rate": 2.3702387125770882e-06, "loss": 0.4234, "step": 1274 }, { "epoch": 0.6863602718524998, "grad_norm": 1.693660818176695, "learning_rate": 2.362824702734893e-06, "loss": 0.4164, "step": 1275 }, { "epoch": 0.6868985936343449, "grad_norm": 1.3894626651308215, "learning_rate": 2.355418717267558e-06, "loss": 0.4221, "step": 1276 }, { "epoch": 0.68743691541619, "grad_norm": 1.697033782203384, "learning_rate": 2.3480207787099534e-06, "loss": 0.4383, "step": 1277 }, { "epoch": 0.6879752371980351, "grad_norm": 1.4858347246883488, "learning_rate": 2.340630909572465e-06, "loss": 0.4265, "step": 1278 }, { "epoch": 0.6885135589798802, "grad_norm": 1.500359176091357, "learning_rate": 2.3332491323409234e-06, "loss": 0.4481, "step": 1279 }, { "epoch": 0.6890518807617253, "grad_norm": 1.5297356725220441, "learning_rate": 2.32587546947654e-06, "loss": 0.4348, "step": 1280 }, { "epoch": 0.6895902025435704, "grad_norm": 2.508398158502729, "learning_rate": 2.3185099434158352e-06, "loss": 0.4437, "step": 1281 }, { "epoch": 0.6901285243254155, "grad_norm": 1.523641981004582, "learning_rate": 2.311152576570566e-06, "loss": 0.4575, "step": 1282 }, { "epoch": 0.6906668461072606, "grad_norm": 1.6114434265747755, "learning_rate": 2.303803391327669e-06, "loss": 0.4378, "step": 1283 }, { "epoch": 0.6912051678891057, "grad_norm": 1.4928444150803868, "learning_rate": 2.296462410049183e-06, "loss": 0.4411, "step": 1284 }, { "epoch": 0.6917434896709508, "grad_norm": 1.5345549032626111, "learning_rate": 2.289129655072185e-06, "loss": 0.4324, "step": 1285 }, { "epoch": 0.6922818114527959, "grad_norm": 1.4298368477097725, "learning_rate": 2.2818051487087183e-06, "loss": 0.426, "step": 1286 }, { "epoch": 0.692820133234641, "grad_norm": 1.8725369506254443, "learning_rate": 2.2744889132457314e-06, "loss": 0.4541, "step": 1287 }, { "epoch": 0.6933584550164861, "grad_norm": 1.77702449875276, "learning_rate": 2.267180970945003e-06, "loss": 0.432, "step": 1288 }, { "epoch": 0.6938967767983312, "grad_norm": 1.4563290123647166, "learning_rate": 2.259881344043081e-06, "loss": 0.3832, "step": 1289 }, { "epoch": 0.6944350985801763, "grad_norm": 1.3449801230990073, "learning_rate": 2.252590054751205e-06, "loss": 0.3962, "step": 1290 }, { "epoch": 0.6949734203620214, "grad_norm": 1.8854534900995603, "learning_rate": 2.2453071252552515e-06, "loss": 0.4807, "step": 1291 }, { "epoch": 0.6955117421438665, "grad_norm": 1.762423954535133, "learning_rate": 2.238032577715656e-06, "loss": 0.384, "step": 1292 }, { "epoch": 0.6960500639257116, "grad_norm": 1.476803369543656, "learning_rate": 2.2307664342673506e-06, "loss": 0.4539, "step": 1293 }, { "epoch": 0.6965883857075567, "grad_norm": 1.4854619250041479, "learning_rate": 2.2235087170196966e-06, "loss": 0.4396, "step": 1294 }, { "epoch": 0.6971267074894018, "grad_norm": 1.41098403179678, "learning_rate": 2.2162594480564155e-06, "loss": 0.4005, "step": 1295 }, { "epoch": 0.6976650292712469, "grad_norm": 1.2989632950912373, "learning_rate": 2.2090186494355203e-06, "loss": 0.4151, "step": 1296 }, { "epoch": 0.698203351053092, "grad_norm": 1.6133874577700047, "learning_rate": 2.2017863431892534e-06, "loss": 0.4285, "step": 1297 }, { "epoch": 0.698741672834937, "grad_norm": 1.333799397613619, "learning_rate": 2.1945625513240154e-06, "loss": 0.4041, "step": 1298 }, { "epoch": 0.6992799946167821, "grad_norm": 1.4390186504294415, "learning_rate": 2.1873472958202997e-06, "loss": 0.4365, "step": 1299 }, { "epoch": 0.6998183163986272, "grad_norm": 1.2866738586576456, "learning_rate": 2.1801405986326245e-06, "loss": 0.4665, "step": 1300 }, { "epoch": 0.7003566381804723, "grad_norm": 2.2273828713275865, "learning_rate": 2.1729424816894685e-06, "loss": 0.4564, "step": 1301 }, { "epoch": 0.7008949599623174, "grad_norm": 1.4546138888578992, "learning_rate": 2.165752966893203e-06, "loss": 0.4051, "step": 1302 }, { "epoch": 0.7014332817441625, "grad_norm": 1.3514329197218915, "learning_rate": 2.158572076120019e-06, "loss": 0.4154, "step": 1303 }, { "epoch": 0.7019716035260076, "grad_norm": 1.3870510485604055, "learning_rate": 2.1513998312198734e-06, "loss": 0.4269, "step": 1304 }, { "epoch": 0.7025099253078527, "grad_norm": 1.6439661727082362, "learning_rate": 2.1442362540164123e-06, "loss": 0.4472, "step": 1305 }, { "epoch": 0.7030482470896978, "grad_norm": 2.036208978375709, "learning_rate": 2.1370813663069086e-06, "loss": 0.4952, "step": 1306 }, { "epoch": 0.7035865688715429, "grad_norm": 1.4306434260587932, "learning_rate": 2.1299351898621938e-06, "loss": 0.3815, "step": 1307 }, { "epoch": 0.704124890653388, "grad_norm": 1.5518498802370642, "learning_rate": 2.122797746426595e-06, "loss": 0.4656, "step": 1308 }, { "epoch": 0.7046632124352331, "grad_norm": 1.353149193018473, "learning_rate": 2.1156690577178657e-06, "loss": 0.4414, "step": 1309 }, { "epoch": 0.7052015342170782, "grad_norm": 1.3081505827837419, "learning_rate": 2.108549145427117e-06, "loss": 0.4355, "step": 1310 }, { "epoch": 0.7057398559989233, "grad_norm": 1.5741831120177514, "learning_rate": 2.1014380312187593e-06, "loss": 0.4396, "step": 1311 }, { "epoch": 0.7062781777807684, "grad_norm": 1.5628460516936316, "learning_rate": 2.094335736730433e-06, "loss": 0.3687, "step": 1312 }, { "epoch": 0.7068164995626135, "grad_norm": 3.0284027392779986, "learning_rate": 2.0872422835729384e-06, "loss": 0.4463, "step": 1313 }, { "epoch": 0.7073548213444586, "grad_norm": 1.3447501399327724, "learning_rate": 2.0801576933301757e-06, "loss": 0.4371, "step": 1314 }, { "epoch": 0.7078931431263038, "grad_norm": 1.8116776445346612, "learning_rate": 2.073081987559077e-06, "loss": 0.4109, "step": 1315 }, { "epoch": 0.7084314649081489, "grad_norm": 1.571648134209876, "learning_rate": 2.06601518778954e-06, "loss": 0.432, "step": 1316 }, { "epoch": 0.708969786689994, "grad_norm": 1.596166756734421, "learning_rate": 2.0589573155243663e-06, "loss": 0.4291, "step": 1317 }, { "epoch": 0.7095081084718391, "grad_norm": 1.4446289087866433, "learning_rate": 2.051908392239186e-06, "loss": 0.4094, "step": 1318 }, { "epoch": 0.7100464302536842, "grad_norm": 1.377063116073787, "learning_rate": 2.044868439382406e-06, "loss": 0.4696, "step": 1319 }, { "epoch": 0.7105847520355293, "grad_norm": 1.3694098512093758, "learning_rate": 2.0378374783751352e-06, "loss": 0.402, "step": 1320 }, { "epoch": 0.7105847520355293, "eval_loss": 0.4282020330429077, "eval_runtime": 1515.7705, "eval_samples_per_second": 16.5, "eval_steps_per_second": 0.516, "step": 1320 }, { "epoch": 0.7111230738173744, "grad_norm": 1.929826065439873, "learning_rate": 2.030815530611123e-06, "loss": 0.4159, "step": 1321 }, { "epoch": 0.7116613955992195, "grad_norm": 1.4082500795847726, "learning_rate": 2.023802617456694e-06, "loss": 0.3941, "step": 1322 }, { "epoch": 0.7121997173810646, "grad_norm": 1.8816103595399847, "learning_rate": 2.01679876025068e-06, "loss": 0.4244, "step": 1323 }, { "epoch": 0.7127380391629097, "grad_norm": 1.5683369901785116, "learning_rate": 2.0098039803043612e-06, "loss": 0.4332, "step": 1324 }, { "epoch": 0.7132763609447548, "grad_norm": 1.4453103994083734, "learning_rate": 2.0028182989013923e-06, "loss": 0.3945, "step": 1325 }, { "epoch": 0.7138146827265999, "grad_norm": 1.6267798252157584, "learning_rate": 1.9958417372977474e-06, "loss": 0.4528, "step": 1326 }, { "epoch": 0.714353004508445, "grad_norm": 1.6214655041789812, "learning_rate": 1.9888743167216493e-06, "loss": 0.4074, "step": 1327 }, { "epoch": 0.7148913262902901, "grad_norm": 1.8595682807437428, "learning_rate": 1.9819160583735077e-06, "loss": 0.4494, "step": 1328 }, { "epoch": 0.7154296480721352, "grad_norm": 1.4662467013475076, "learning_rate": 1.974966983425852e-06, "loss": 0.4066, "step": 1329 }, { "epoch": 0.7159679698539803, "grad_norm": 2.5261174973160716, "learning_rate": 1.9680271130232693e-06, "loss": 0.4394, "step": 1330 }, { "epoch": 0.7165062916358254, "grad_norm": 1.8084272539130577, "learning_rate": 1.9610964682823407e-06, "loss": 0.4601, "step": 1331 }, { "epoch": 0.7170446134176705, "grad_norm": 1.820018846201368, "learning_rate": 1.9541750702915706e-06, "loss": 0.4446, "step": 1332 }, { "epoch": 0.7175829351995155, "grad_norm": 1.3923517314522877, "learning_rate": 1.9472629401113325e-06, "loss": 0.3857, "step": 1333 }, { "epoch": 0.7181212569813606, "grad_norm": 1.527238991242769, "learning_rate": 1.9403600987737976e-06, "loss": 0.4381, "step": 1334 }, { "epoch": 0.7186595787632057, "grad_norm": 1.4006251254778943, "learning_rate": 1.9334665672828736e-06, "loss": 0.4332, "step": 1335 }, { "epoch": 0.7191979005450508, "grad_norm": 2.1367769390904, "learning_rate": 1.926582366614141e-06, "loss": 0.4331, "step": 1336 }, { "epoch": 0.7197362223268959, "grad_norm": 1.661348731930383, "learning_rate": 1.9197075177147866e-06, "loss": 0.4877, "step": 1337 }, { "epoch": 0.720274544108741, "grad_norm": 1.4928525414429736, "learning_rate": 1.9128420415035442e-06, "loss": 0.4239, "step": 1338 }, { "epoch": 0.7208128658905861, "grad_norm": 1.533499882863047, "learning_rate": 1.9059859588706287e-06, "loss": 0.3951, "step": 1339 }, { "epoch": 0.7213511876724312, "grad_norm": 1.8392687775713348, "learning_rate": 1.8991392906776668e-06, "loss": 0.4395, "step": 1340 }, { "epoch": 0.7218895094542763, "grad_norm": 1.573889490157054, "learning_rate": 1.8923020577576452e-06, "loss": 0.4162, "step": 1341 }, { "epoch": 0.7224278312361214, "grad_norm": 1.5526149616819422, "learning_rate": 1.885474280914838e-06, "loss": 0.4579, "step": 1342 }, { "epoch": 0.7229661530179665, "grad_norm": 1.5191810245344743, "learning_rate": 1.8786559809247485e-06, "loss": 0.4216, "step": 1343 }, { "epoch": 0.7235044747998116, "grad_norm": 1.5555786435185341, "learning_rate": 1.8718471785340414e-06, "loss": 0.4122, "step": 1344 }, { "epoch": 0.7240427965816567, "grad_norm": 1.3557551585285899, "learning_rate": 1.8650478944604844e-06, "loss": 0.3932, "step": 1345 }, { "epoch": 0.7245811183635018, "grad_norm": 1.4728885839955113, "learning_rate": 1.8582581493928837e-06, "loss": 0.4934, "step": 1346 }, { "epoch": 0.7251194401453469, "grad_norm": 1.5560703862712066, "learning_rate": 1.8514779639910152e-06, "loss": 0.4565, "step": 1347 }, { "epoch": 0.725657761927192, "grad_norm": 1.4005810948444959, "learning_rate": 1.8447073588855707e-06, "loss": 0.45, "step": 1348 }, { "epoch": 0.7261960837090371, "grad_norm": 1.4372886671511238, "learning_rate": 1.8379463546780923e-06, "loss": 0.4076, "step": 1349 }, { "epoch": 0.7267344054908822, "grad_norm": 1.3561213817272149, "learning_rate": 1.8311949719409056e-06, "loss": 0.3991, "step": 1350 }, { "epoch": 0.7272727272727273, "grad_norm": 1.592180627183088, "learning_rate": 1.824453231217062e-06, "loss": 0.4395, "step": 1351 }, { "epoch": 0.7278110490545724, "grad_norm": 1.674234401633556, "learning_rate": 1.8177211530202733e-06, "loss": 0.5076, "step": 1352 }, { "epoch": 0.7283493708364175, "grad_norm": 1.3869830990008478, "learning_rate": 1.8109987578348504e-06, "loss": 0.3823, "step": 1353 }, { "epoch": 0.7288876926182626, "grad_norm": 1.8958736579636137, "learning_rate": 1.8042860661156425e-06, "loss": 0.4283, "step": 1354 }, { "epoch": 0.7294260144001077, "grad_norm": 2.277391563720137, "learning_rate": 1.7975830982879688e-06, "loss": 0.4344, "step": 1355 }, { "epoch": 0.7299643361819528, "grad_norm": 1.3788436987213148, "learning_rate": 1.7908898747475656e-06, "loss": 0.42, "step": 1356 }, { "epoch": 0.7305026579637979, "grad_norm": 1.472584181988221, "learning_rate": 1.784206415860516e-06, "loss": 0.4554, "step": 1357 }, { "epoch": 0.731040979745643, "grad_norm": 1.441497867695086, "learning_rate": 1.7775327419631938e-06, "loss": 0.3914, "step": 1358 }, { "epoch": 0.7315793015274881, "grad_norm": 1.413962400530734, "learning_rate": 1.7708688733621971e-06, "loss": 0.4271, "step": 1359 }, { "epoch": 0.7321176233093332, "grad_norm": 1.467777866704718, "learning_rate": 1.7642148303342894e-06, "loss": 0.4613, "step": 1360 }, { "epoch": 0.7326559450911783, "grad_norm": 1.4588809601870538, "learning_rate": 1.7575706331263392e-06, "loss": 0.3732, "step": 1361 }, { "epoch": 0.7331942668730234, "grad_norm": 1.9984141502445067, "learning_rate": 1.7509363019552506e-06, "loss": 0.4337, "step": 1362 }, { "epoch": 0.7337325886548685, "grad_norm": 1.7211596185425657, "learning_rate": 1.744311857007912e-06, "loss": 0.4237, "step": 1363 }, { "epoch": 0.7342709104367136, "grad_norm": 1.3275340316554045, "learning_rate": 1.7376973184411294e-06, "loss": 0.4026, "step": 1364 }, { "epoch": 0.7348092322185587, "grad_norm": 1.3704150312314805, "learning_rate": 1.7310927063815647e-06, "loss": 0.4221, "step": 1365 }, { "epoch": 0.7353475540004037, "grad_norm": 1.6240778919766734, "learning_rate": 1.7244980409256768e-06, "loss": 0.3956, "step": 1366 }, { "epoch": 0.7358858757822488, "grad_norm": 1.5916150137066967, "learning_rate": 1.7179133421396571e-06, "loss": 0.449, "step": 1367 }, { "epoch": 0.7364241975640939, "grad_norm": 1.3674325981426028, "learning_rate": 1.7113386300593749e-06, "loss": 0.469, "step": 1368 }, { "epoch": 0.736962519345939, "grad_norm": 1.823579935483228, "learning_rate": 1.7047739246903044e-06, "loss": 0.4256, "step": 1369 }, { "epoch": 0.7375008411277841, "grad_norm": 1.5992570631473233, "learning_rate": 1.6982192460074787e-06, "loss": 0.4364, "step": 1370 }, { "epoch": 0.7380391629096292, "grad_norm": 1.83556587779534, "learning_rate": 1.6916746139554186e-06, "loss": 0.462, "step": 1371 }, { "epoch": 0.7385774846914743, "grad_norm": 1.63962319033326, "learning_rate": 1.6851400484480757e-06, "loss": 0.4647, "step": 1372 }, { "epoch": 0.7391158064733194, "grad_norm": 1.489565256988372, "learning_rate": 1.6786155693687712e-06, "loss": 0.4391, "step": 1373 }, { "epoch": 0.7396541282551645, "grad_norm": 1.8781762497357959, "learning_rate": 1.6721011965701344e-06, "loss": 0.4429, "step": 1374 }, { "epoch": 0.7401924500370096, "grad_norm": 1.394724821422672, "learning_rate": 1.6655969498740455e-06, "loss": 0.3781, "step": 1375 }, { "epoch": 0.7407307718188547, "grad_norm": 1.7954529740174663, "learning_rate": 1.6591028490715722e-06, "loss": 0.4437, "step": 1376 }, { "epoch": 0.7412690936006998, "grad_norm": 1.5625366322113399, "learning_rate": 1.6526189139229072e-06, "loss": 0.4221, "step": 1377 }, { "epoch": 0.7418074153825449, "grad_norm": 1.49000718617141, "learning_rate": 1.6461451641573156e-06, "loss": 0.3824, "step": 1378 }, { "epoch": 0.74234573716439, "grad_norm": 1.5501486593751905, "learning_rate": 1.639681619473069e-06, "loss": 0.4316, "step": 1379 }, { "epoch": 0.7428840589462351, "grad_norm": 1.6012264627466746, "learning_rate": 1.6332282995373867e-06, "loss": 0.4414, "step": 1380 }, { "epoch": 0.7428840589462351, "eval_loss": 0.4260067939758301, "eval_runtime": 1520.5135, "eval_samples_per_second": 16.448, "eval_steps_per_second": 0.514, "step": 1380 }, { "epoch": 0.7434223807280802, "grad_norm": 1.3868379821786618, "learning_rate": 1.6267852239863763e-06, "loss": 0.3962, "step": 1381 }, { "epoch": 0.7439607025099253, "grad_norm": 1.563201406467786, "learning_rate": 1.6203524124249742e-06, "loss": 0.4359, "step": 1382 }, { "epoch": 0.7444990242917704, "grad_norm": 2.0744885451879895, "learning_rate": 1.613929884426887e-06, "loss": 0.472, "step": 1383 }, { "epoch": 0.7450373460736155, "grad_norm": 1.7165383734256863, "learning_rate": 1.607517659534526e-06, "loss": 0.4449, "step": 1384 }, { "epoch": 0.7455756678554606, "grad_norm": 1.420966932605389, "learning_rate": 1.6011157572589565e-06, "loss": 0.4594, "step": 1385 }, { "epoch": 0.7461139896373057, "grad_norm": 1.3843843466818937, "learning_rate": 1.5947241970798332e-06, "loss": 0.4021, "step": 1386 }, { "epoch": 0.7466523114191508, "grad_norm": 2.021869994898455, "learning_rate": 1.588342998445342e-06, "loss": 0.4973, "step": 1387 }, { "epoch": 0.7471906332009959, "grad_norm": 1.6308202289723368, "learning_rate": 1.58197218077214e-06, "loss": 0.4448, "step": 1388 }, { "epoch": 0.747728954982841, "grad_norm": 1.5609319044422376, "learning_rate": 1.5756117634452977e-06, "loss": 0.4512, "step": 1389 }, { "epoch": 0.7482672767646861, "grad_norm": 1.3798571945954525, "learning_rate": 1.5692617658182402e-06, "loss": 0.4332, "step": 1390 }, { "epoch": 0.7488055985465312, "grad_norm": 1.5464889993436788, "learning_rate": 1.5629222072126888e-06, "loss": 0.4716, "step": 1391 }, { "epoch": 0.7493439203283763, "grad_norm": 1.7517747662085987, "learning_rate": 1.5565931069185946e-06, "loss": 0.4305, "step": 1392 }, { "epoch": 0.7498822421102214, "grad_norm": 1.5029346054542445, "learning_rate": 1.5502744841940936e-06, "loss": 0.4657, "step": 1393 }, { "epoch": 0.7504205638920665, "grad_norm": 1.3544718143048395, "learning_rate": 1.543966358265438e-06, "loss": 0.418, "step": 1394 }, { "epoch": 0.7509588856739116, "grad_norm": 1.52275975192662, "learning_rate": 1.5376687483269404e-06, "loss": 0.3732, "step": 1395 }, { "epoch": 0.7514972074557567, "grad_norm": 1.691512607761959, "learning_rate": 1.5313816735409148e-06, "loss": 0.4606, "step": 1396 }, { "epoch": 0.7520355292376018, "grad_norm": 1.6421517222533963, "learning_rate": 1.5251051530376199e-06, "loss": 0.413, "step": 1397 }, { "epoch": 0.7525738510194468, "grad_norm": 1.7994036447279773, "learning_rate": 1.518839205915202e-06, "loss": 0.4167, "step": 1398 }, { "epoch": 0.753112172801292, "grad_norm": 1.4116743542426848, "learning_rate": 1.5125838512396278e-06, "loss": 0.4502, "step": 1399 }, { "epoch": 0.753650494583137, "grad_norm": 2.9318193198163414, "learning_rate": 1.5063391080446404e-06, "loss": 0.4523, "step": 1400 }, { "epoch": 0.7541888163649821, "grad_norm": 1.3582596783082035, "learning_rate": 1.500104995331692e-06, "loss": 0.3758, "step": 1401 }, { "epoch": 0.7547271381468272, "grad_norm": 2.1921211591651435, "learning_rate": 1.493881532069889e-06, "loss": 0.4725, "step": 1402 }, { "epoch": 0.7552654599286723, "grad_norm": 1.5078767590789557, "learning_rate": 1.487668737195932e-06, "loss": 0.4137, "step": 1403 }, { "epoch": 0.7558037817105174, "grad_norm": 1.7747344554372293, "learning_rate": 1.4814666296140617e-06, "loss": 0.4519, "step": 1404 }, { "epoch": 0.7563421034923625, "grad_norm": 1.4869616706516326, "learning_rate": 1.4752752281960003e-06, "loss": 0.3805, "step": 1405 }, { "epoch": 0.7568804252742076, "grad_norm": 1.688795973706041, "learning_rate": 1.4690945517808897e-06, "loss": 0.4993, "step": 1406 }, { "epoch": 0.7574187470560527, "grad_norm": 1.583736337415557, "learning_rate": 1.4629246191752406e-06, "loss": 0.4382, "step": 1407 }, { "epoch": 0.7579570688378978, "grad_norm": 1.405921968173557, "learning_rate": 1.4567654491528732e-06, "loss": 0.3952, "step": 1408 }, { "epoch": 0.7584953906197429, "grad_norm": 1.3449184128012615, "learning_rate": 1.4506170604548575e-06, "loss": 0.4443, "step": 1409 }, { "epoch": 0.759033712401588, "grad_norm": 1.5849926738123288, "learning_rate": 1.4444794717894596e-06, "loss": 0.4131, "step": 1410 }, { "epoch": 0.7595720341834331, "grad_norm": 1.6555281403636608, "learning_rate": 1.4383527018320825e-06, "loss": 0.4414, "step": 1411 }, { "epoch": 0.7601103559652782, "grad_norm": 1.6263621942357136, "learning_rate": 1.432236769225211e-06, "loss": 0.4346, "step": 1412 }, { "epoch": 0.7606486777471233, "grad_norm": 2.0460094225135044, "learning_rate": 1.426131692578354e-06, "loss": 0.4493, "step": 1413 }, { "epoch": 0.7611869995289684, "grad_norm": 1.472378438798274, "learning_rate": 1.4200374904679853e-06, "loss": 0.4562, "step": 1414 }, { "epoch": 0.7617253213108135, "grad_norm": 1.7242311556580157, "learning_rate": 1.413954181437493e-06, "loss": 0.4043, "step": 1415 }, { "epoch": 0.7622636430926586, "grad_norm": 1.6120964716761355, "learning_rate": 1.4078817839971193e-06, "loss": 0.4815, "step": 1416 }, { "epoch": 0.7628019648745037, "grad_norm": 2.00633033152504, "learning_rate": 1.4018203166239032e-06, "loss": 0.5084, "step": 1417 }, { "epoch": 0.7633402866563488, "grad_norm": 1.593451139015103, "learning_rate": 1.3957697977616275e-06, "loss": 0.4089, "step": 1418 }, { "epoch": 0.7638786084381939, "grad_norm": 1.520947317999593, "learning_rate": 1.38973024582076e-06, "loss": 0.4204, "step": 1419 }, { "epoch": 0.764416930220039, "grad_norm": 1.5671907812915762, "learning_rate": 1.3837016791784002e-06, "loss": 0.4011, "step": 1420 }, { "epoch": 0.7649552520018841, "grad_norm": 2.3136360187940435, "learning_rate": 1.3776841161782174e-06, "loss": 0.5217, "step": 1421 }, { "epoch": 0.7654935737837292, "grad_norm": 1.6259616459954453, "learning_rate": 1.3716775751304024e-06, "loss": 0.4094, "step": 1422 }, { "epoch": 0.7660318955655743, "grad_norm": 1.2851781752532265, "learning_rate": 1.365682074311609e-06, "loss": 0.4371, "step": 1423 }, { "epoch": 0.7665702173474194, "grad_norm": 1.6356127807123704, "learning_rate": 1.3596976319648957e-06, "loss": 0.4305, "step": 1424 }, { "epoch": 0.7671085391292645, "grad_norm": 1.7847217896835836, "learning_rate": 1.3537242662996741e-06, "loss": 0.4228, "step": 1425 }, { "epoch": 0.7676468609111096, "grad_norm": 1.9347446509271482, "learning_rate": 1.347761995491651e-06, "loss": 0.3528, "step": 1426 }, { "epoch": 0.7681851826929547, "grad_norm": 1.7975930657160712, "learning_rate": 1.3418108376827738e-06, "loss": 0.4782, "step": 1427 }, { "epoch": 0.7687235044747998, "grad_norm": 1.4744627345322843, "learning_rate": 1.3358708109811775e-06, "loss": 0.3919, "step": 1428 }, { "epoch": 0.769261826256645, "grad_norm": 2.7855979759464926, "learning_rate": 1.3299419334611213e-06, "loss": 0.4646, "step": 1429 }, { "epoch": 0.7698001480384901, "grad_norm": 1.4805916259048137, "learning_rate": 1.324024223162947e-06, "loss": 0.3906, "step": 1430 }, { "epoch": 0.7703384698203352, "grad_norm": 1.7443733531704324, "learning_rate": 1.3181176980930133e-06, "loss": 0.4046, "step": 1431 }, { "epoch": 0.7708767916021803, "grad_norm": 1.3403811088010225, "learning_rate": 1.3122223762236446e-06, "loss": 0.4585, "step": 1432 }, { "epoch": 0.7714151133840254, "grad_norm": 1.8083215069181602, "learning_rate": 1.306338275493077e-06, "loss": 0.4488, "step": 1433 }, { "epoch": 0.7719534351658704, "grad_norm": 2.257570529751952, "learning_rate": 1.3004654138054035e-06, "loss": 0.4411, "step": 1434 }, { "epoch": 0.7724917569477155, "grad_norm": 1.5282453915471157, "learning_rate": 1.2946038090305186e-06, "loss": 0.3982, "step": 1435 }, { "epoch": 0.7730300787295606, "grad_norm": 1.3350543760395588, "learning_rate": 1.2887534790040623e-06, "loss": 0.3529, "step": 1436 }, { "epoch": 0.7735684005114057, "grad_norm": 1.5872897107277366, "learning_rate": 1.2829144415273703e-06, "loss": 0.4175, "step": 1437 }, { "epoch": 0.7741067222932508, "grad_norm": 1.461133941363055, "learning_rate": 1.2770867143674176e-06, "loss": 0.4225, "step": 1438 }, { "epoch": 0.7746450440750959, "grad_norm": 1.977273812214763, "learning_rate": 1.2712703152567634e-06, "loss": 0.3955, "step": 1439 }, { "epoch": 0.775183365856941, "grad_norm": 1.6743349069669249, "learning_rate": 1.2654652618934977e-06, "loss": 0.3861, "step": 1440 }, { "epoch": 0.775183365856941, "eval_loss": 0.42436715960502625, "eval_runtime": 1522.7354, "eval_samples_per_second": 16.424, "eval_steps_per_second": 0.514, "step": 1440 }, { "epoch": 0.7757216876387861, "grad_norm": 1.499262565396223, "learning_rate": 1.2596715719411877e-06, "loss": 0.4024, "step": 1441 }, { "epoch": 0.7762600094206312, "grad_norm": 1.6235233768215886, "learning_rate": 1.253889263028827e-06, "loss": 0.3789, "step": 1442 }, { "epoch": 0.7767983312024763, "grad_norm": 1.4115144384917186, "learning_rate": 1.2481183527507734e-06, "loss": 0.4605, "step": 1443 }, { "epoch": 0.7773366529843214, "grad_norm": 1.4061010836073027, "learning_rate": 1.2423588586667058e-06, "loss": 0.394, "step": 1444 }, { "epoch": 0.7778749747661665, "grad_norm": 1.4756730352326592, "learning_rate": 1.2366107983015636e-06, "loss": 0.3997, "step": 1445 }, { "epoch": 0.7784132965480116, "grad_norm": 1.7767670811956109, "learning_rate": 1.2308741891454978e-06, "loss": 0.4388, "step": 1446 }, { "epoch": 0.7789516183298567, "grad_norm": 1.9567881229548667, "learning_rate": 1.2251490486538143e-06, "loss": 0.4457, "step": 1447 }, { "epoch": 0.7794899401117018, "grad_norm": 1.7149877959759003, "learning_rate": 1.2194353942469217e-06, "loss": 0.4482, "step": 1448 }, { "epoch": 0.7800282618935469, "grad_norm": 1.5521839437257912, "learning_rate": 1.2137332433102806e-06, "loss": 0.469, "step": 1449 }, { "epoch": 0.780566583675392, "grad_norm": 2.688209146479993, "learning_rate": 1.2080426131943496e-06, "loss": 0.3849, "step": 1450 }, { "epoch": 0.7811049054572371, "grad_norm": 1.4274278905750635, "learning_rate": 1.2023635212145262e-06, "loss": 0.3923, "step": 1451 }, { "epoch": 0.7816432272390822, "grad_norm": 1.5796240111966617, "learning_rate": 1.1966959846511068e-06, "loss": 0.4567, "step": 1452 }, { "epoch": 0.7821815490209273, "grad_norm": 2.368565849047706, "learning_rate": 1.191040020749223e-06, "loss": 0.3885, "step": 1453 }, { "epoch": 0.7827198708027724, "grad_norm": 1.7831232578884653, "learning_rate": 1.1853956467187943e-06, "loss": 0.3873, "step": 1454 }, { "epoch": 0.7832581925846175, "grad_norm": 2.2089394022551363, "learning_rate": 1.1797628797344752e-06, "loss": 0.4341, "step": 1455 }, { "epoch": 0.7837965143664626, "grad_norm": 1.7921663918566133, "learning_rate": 1.1741417369356011e-06, "loss": 0.4138, "step": 1456 }, { "epoch": 0.7843348361483077, "grad_norm": 1.503278809860387, "learning_rate": 1.1685322354261402e-06, "loss": 0.4608, "step": 1457 }, { "epoch": 0.7848731579301528, "grad_norm": 1.567305564830315, "learning_rate": 1.1629343922746334e-06, "loss": 0.4444, "step": 1458 }, { "epoch": 0.7854114797119979, "grad_norm": 1.4431401966395603, "learning_rate": 1.1573482245141525e-06, "loss": 0.4353, "step": 1459 }, { "epoch": 0.785949801493843, "grad_norm": 1.7031469874820835, "learning_rate": 1.1517737491422415e-06, "loss": 0.4433, "step": 1460 }, { "epoch": 0.7864881232756881, "grad_norm": 1.9609977211459744, "learning_rate": 1.1462109831208679e-06, "loss": 0.4482, "step": 1461 }, { "epoch": 0.7870264450575332, "grad_norm": 2.150596318263902, "learning_rate": 1.1406599433763694e-06, "loss": 0.4755, "step": 1462 }, { "epoch": 0.7875647668393783, "grad_norm": 1.3265638431410287, "learning_rate": 1.1351206467994018e-06, "loss": 0.4102, "step": 1463 }, { "epoch": 0.7881030886212234, "grad_norm": 4.188075621147485, "learning_rate": 1.129593110244892e-06, "loss": 0.3644, "step": 1464 }, { "epoch": 0.7886414104030685, "grad_norm": 1.5439643283706193, "learning_rate": 1.1240773505319824e-06, "loss": 0.4707, "step": 1465 }, { "epoch": 0.7891797321849136, "grad_norm": 1.695949064351043, "learning_rate": 1.1185733844439778e-06, "loss": 0.4506, "step": 1466 }, { "epoch": 0.7897180539667586, "grad_norm": 1.4925323276596911, "learning_rate": 1.113081228728301e-06, "loss": 0.4062, "step": 1467 }, { "epoch": 0.7902563757486037, "grad_norm": 1.810916777909123, "learning_rate": 1.1076009000964384e-06, "loss": 0.4617, "step": 1468 }, { "epoch": 0.7907946975304488, "grad_norm": 1.5391006325796759, "learning_rate": 1.102132415223886e-06, "loss": 0.4341, "step": 1469 }, { "epoch": 0.7913330193122939, "grad_norm": 1.3539603638585116, "learning_rate": 1.0966757907501058e-06, "loss": 0.4045, "step": 1470 }, { "epoch": 0.791871341094139, "grad_norm": 1.585969494802185, "learning_rate": 1.0912310432784673e-06, "loss": 0.4889, "step": 1471 }, { "epoch": 0.7924096628759841, "grad_norm": 1.3636312861290756, "learning_rate": 1.0857981893762048e-06, "loss": 0.4352, "step": 1472 }, { "epoch": 0.7929479846578292, "grad_norm": 1.5823372906311277, "learning_rate": 1.0803772455743572e-06, "loss": 0.398, "step": 1473 }, { "epoch": 0.7934863064396743, "grad_norm": 1.5278694836184388, "learning_rate": 1.0749682283677288e-06, "loss": 0.4228, "step": 1474 }, { "epoch": 0.7940246282215194, "grad_norm": 1.1652690918407183, "learning_rate": 1.0695711542148313e-06, "loss": 0.3811, "step": 1475 }, { "epoch": 0.7945629500033645, "grad_norm": 1.4886602129753284, "learning_rate": 1.0641860395378367e-06, "loss": 0.4037, "step": 1476 }, { "epoch": 0.7951012717852096, "grad_norm": 1.5390850918633818, "learning_rate": 1.0588129007225266e-06, "loss": 0.3754, "step": 1477 }, { "epoch": 0.7956395935670547, "grad_norm": 1.676720868561217, "learning_rate": 1.0534517541182431e-06, "loss": 0.4599, "step": 1478 }, { "epoch": 0.7961779153488998, "grad_norm": 1.676144009500296, "learning_rate": 1.0481026160378394e-06, "loss": 0.4203, "step": 1479 }, { "epoch": 0.7967162371307449, "grad_norm": 1.3949722623692342, "learning_rate": 1.042765502757625e-06, "loss": 0.4149, "step": 1480 }, { "epoch": 0.79725455891259, "grad_norm": 1.6398344004557446, "learning_rate": 1.0374404305173247e-06, "loss": 0.4215, "step": 1481 }, { "epoch": 0.7977928806944351, "grad_norm": 1.6715940485370635, "learning_rate": 1.0321274155200234e-06, "loss": 0.4393, "step": 1482 }, { "epoch": 0.7983312024762802, "grad_norm": 1.395308837290767, "learning_rate": 1.0268264739321194e-06, "loss": 0.4398, "step": 1483 }, { "epoch": 0.7988695242581253, "grad_norm": 1.6597231226511682, "learning_rate": 1.0215376218832723e-06, "loss": 0.4185, "step": 1484 }, { "epoch": 0.7994078460399704, "grad_norm": 1.5059702316944186, "learning_rate": 1.0162608754663572e-06, "loss": 0.4428, "step": 1485 }, { "epoch": 0.7999461678218155, "grad_norm": 1.774717767949121, "learning_rate": 1.0109962507374139e-06, "loss": 0.456, "step": 1486 }, { "epoch": 0.8004844896036606, "grad_norm": 1.5763966693479707, "learning_rate": 1.0057437637155997e-06, "loss": 0.4742, "step": 1487 }, { "epoch": 0.8010228113855057, "grad_norm": 1.66961890257069, "learning_rate": 1.0005034303831352e-06, "loss": 0.4479, "step": 1488 }, { "epoch": 0.8015611331673508, "grad_norm": 1.4312052717987154, "learning_rate": 9.95275266685264e-07, "loss": 0.3894, "step": 1489 }, { "epoch": 0.8020994549491959, "grad_norm": 1.5395533368166758, "learning_rate": 9.900592885301986e-07, "loss": 0.433, "step": 1490 }, { "epoch": 0.802637776731041, "grad_norm": 1.7267038818610854, "learning_rate": 9.848555117890734e-07, "loss": 0.4399, "step": 1491 }, { "epoch": 0.8031760985128861, "grad_norm": 1.588155903799363, "learning_rate": 9.796639522958972e-07, "loss": 0.4662, "step": 1492 }, { "epoch": 0.8037144202947312, "grad_norm": 1.278378381771794, "learning_rate": 9.744846258475032e-07, "loss": 0.4023, "step": 1493 }, { "epoch": 0.8042527420765763, "grad_norm": 1.630276962177858, "learning_rate": 9.693175482035038e-07, "loss": 0.4352, "step": 1494 }, { "epoch": 0.8047910638584214, "grad_norm": 1.7375887913272672, "learning_rate": 9.641627350862371e-07, "loss": 0.4451, "step": 1495 }, { "epoch": 0.8053293856402665, "grad_norm": 1.5671830810820253, "learning_rate": 9.590202021807266e-07, "loss": 0.4944, "step": 1496 }, { "epoch": 0.8058677074221116, "grad_norm": 1.5984498803682108, "learning_rate": 9.538899651346278e-07, "loss": 0.4171, "step": 1497 }, { "epoch": 0.8064060292039567, "grad_norm": 1.4646889528560627, "learning_rate": 9.487720395581829e-07, "loss": 0.3802, "step": 1498 }, { "epoch": 0.8069443509858018, "grad_norm": 1.3512741257951366, "learning_rate": 9.436664410241736e-07, "loss": 0.4309, "step": 1499 }, { "epoch": 0.8074826727676468, "grad_norm": 1.5243040161927932, "learning_rate": 9.385731850678714e-07, "loss": 0.4321, "step": 1500 }, { "epoch": 0.8074826727676468, "eval_loss": 0.42280885577201843, "eval_runtime": 1525.8015, "eval_samples_per_second": 16.391, "eval_steps_per_second": 0.513, "step": 1500 }, { "epoch": 0.8080209945494919, "grad_norm": 1.7335916518675676, "learning_rate": 9.334922871869933e-07, "loss": 0.4613, "step": 1501 }, { "epoch": 0.808559316331337, "grad_norm": 1.4183990627505498, "learning_rate": 9.284237628416537e-07, "loss": 0.4245, "step": 1502 }, { "epoch": 0.8090976381131821, "grad_norm": 1.6705452727321846, "learning_rate": 9.233676274543141e-07, "loss": 0.4186, "step": 1503 }, { "epoch": 0.8096359598950272, "grad_norm": 1.6195072788491132, "learning_rate": 9.183238964097408e-07, "loss": 0.4606, "step": 1504 }, { "epoch": 0.8101742816768723, "grad_norm": 1.5392537994753088, "learning_rate": 9.132925850549573e-07, "loss": 0.4261, "step": 1505 }, { "epoch": 0.8107126034587174, "grad_norm": 1.5937406024477896, "learning_rate": 9.082737086991955e-07, "loss": 0.378, "step": 1506 }, { "epoch": 0.8112509252405625, "grad_norm": 1.6757621701627432, "learning_rate": 9.0326728261385e-07, "loss": 0.4782, "step": 1507 }, { "epoch": 0.8117892470224076, "grad_norm": 2.005066048659624, "learning_rate": 8.982733220324319e-07, "loss": 0.4419, "step": 1508 }, { "epoch": 0.8123275688042527, "grad_norm": 1.5506134684388948, "learning_rate": 8.932918421505244e-07, "loss": 0.4669, "step": 1509 }, { "epoch": 0.8128658905860978, "grad_norm": 1.8474324824508042, "learning_rate": 8.883228581257297e-07, "loss": 0.4416, "step": 1510 }, { "epoch": 0.8134042123679429, "grad_norm": 1.5536434524734581, "learning_rate": 8.83366385077632e-07, "loss": 0.4377, "step": 1511 }, { "epoch": 0.813942534149788, "grad_norm": 1.399796692285853, "learning_rate": 8.784224380877454e-07, "loss": 0.4392, "step": 1512 }, { "epoch": 0.8144808559316331, "grad_norm": 1.5556950965685121, "learning_rate": 8.734910321994717e-07, "loss": 0.406, "step": 1513 }, { "epoch": 0.8150191777134782, "grad_norm": 1.5480188724931883, "learning_rate": 8.685721824180499e-07, "loss": 0.4433, "step": 1514 }, { "epoch": 0.8155574994953233, "grad_norm": 1.4971651714962706, "learning_rate": 8.636659037105149e-07, "loss": 0.3966, "step": 1515 }, { "epoch": 0.8160958212771684, "grad_norm": 1.6155911416639859, "learning_rate": 8.587722110056529e-07, "loss": 0.4212, "step": 1516 }, { "epoch": 0.8166341430590135, "grad_norm": 1.976217129048654, "learning_rate": 8.538911191939475e-07, "loss": 0.4107, "step": 1517 }, { "epoch": 0.8171724648408586, "grad_norm": 1.9846803772964912, "learning_rate": 8.490226431275456e-07, "loss": 0.4094, "step": 1518 }, { "epoch": 0.8177107866227037, "grad_norm": 3.0586074935315133, "learning_rate": 8.441667976202045e-07, "loss": 0.4492, "step": 1519 }, { "epoch": 0.8182491084045488, "grad_norm": 1.6149445557914077, "learning_rate": 8.393235974472497e-07, "loss": 0.4361, "step": 1520 }, { "epoch": 0.8187874301863939, "grad_norm": 1.4631036764406664, "learning_rate": 8.344930573455323e-07, "loss": 0.4343, "step": 1521 }, { "epoch": 0.819325751968239, "grad_norm": 1.3342306529935604, "learning_rate": 8.296751920133794e-07, "loss": 0.3546, "step": 1522 }, { "epoch": 0.8198640737500841, "grad_norm": 2.0226246030817356, "learning_rate": 8.248700161105483e-07, "loss": 0.4281, "step": 1523 }, { "epoch": 0.8204023955319292, "grad_norm": 1.9696807317895189, "learning_rate": 8.200775442581893e-07, "loss": 0.4215, "step": 1524 }, { "epoch": 0.8209407173137743, "grad_norm": 1.4820095683603027, "learning_rate": 8.152977910387955e-07, "loss": 0.4928, "step": 1525 }, { "epoch": 0.8214790390956194, "grad_norm": 1.5809021302001485, "learning_rate": 8.105307709961602e-07, "loss": 0.442, "step": 1526 }, { "epoch": 0.8220173608774645, "grad_norm": 1.3682019844229378, "learning_rate": 8.057764986353317e-07, "loss": 0.448, "step": 1527 }, { "epoch": 0.8225556826593096, "grad_norm": 1.6136391165039332, "learning_rate": 8.010349884225699e-07, "loss": 0.4458, "step": 1528 }, { "epoch": 0.8230940044411547, "grad_norm": 1.2595845723052967, "learning_rate": 7.963062547853023e-07, "loss": 0.4014, "step": 1529 }, { "epoch": 0.8236323262229998, "grad_norm": 2.650357568288943, "learning_rate": 7.915903121120816e-07, "loss": 0.4475, "step": 1530 }, { "epoch": 0.8241706480048449, "grad_norm": 1.5993270434912978, "learning_rate": 7.868871747525353e-07, "loss": 0.3952, "step": 1531 }, { "epoch": 0.82470896978669, "grad_norm": 1.5445035783730348, "learning_rate": 7.821968570173321e-07, "loss": 0.4546, "step": 1532 }, { "epoch": 0.825247291568535, "grad_norm": 1.7600163478435773, "learning_rate": 7.775193731781316e-07, "loss": 0.3925, "step": 1533 }, { "epoch": 0.8257856133503801, "grad_norm": 1.9376227278838558, "learning_rate": 7.728547374675421e-07, "loss": 0.4142, "step": 1534 }, { "epoch": 0.8263239351322252, "grad_norm": 1.5661272939035957, "learning_rate": 7.682029640790783e-07, "loss": 0.408, "step": 1535 }, { "epoch": 0.8268622569140703, "grad_norm": 1.7751314318755442, "learning_rate": 7.635640671671168e-07, "loss": 0.4748, "step": 1536 }, { "epoch": 0.8274005786959154, "grad_norm": 1.4328800747976576, "learning_rate": 7.589380608468549e-07, "loss": 0.445, "step": 1537 }, { "epoch": 0.8279389004777605, "grad_norm": 1.770544068666416, "learning_rate": 7.543249591942647e-07, "loss": 0.3877, "step": 1538 }, { "epoch": 0.8284772222596056, "grad_norm": 1.4644257793154838, "learning_rate": 7.497247762460535e-07, "loss": 0.4729, "step": 1539 }, { "epoch": 0.8290155440414507, "grad_norm": 2.0251569316621354, "learning_rate": 7.451375259996196e-07, "loss": 0.3926, "step": 1540 }, { "epoch": 0.8295538658232958, "grad_norm": 1.5659705563939743, "learning_rate": 7.405632224130094e-07, "loss": 0.3978, "step": 1541 }, { "epoch": 0.8300921876051409, "grad_norm": 1.5791357169071338, "learning_rate": 7.360018794048757e-07, "loss": 0.4482, "step": 1542 }, { "epoch": 0.830630509386986, "grad_norm": 1.5219436138787439, "learning_rate": 7.314535108544346e-07, "loss": 0.3993, "step": 1543 }, { "epoch": 0.8311688311688312, "grad_norm": 1.5116221556805869, "learning_rate": 7.26918130601425e-07, "loss": 0.4431, "step": 1544 }, { "epoch": 0.8317071529506763, "grad_norm": 1.5355423700033741, "learning_rate": 7.223957524460612e-07, "loss": 0.3847, "step": 1545 }, { "epoch": 0.8322454747325214, "grad_norm": 1.6301347275924607, "learning_rate": 7.17886390148999e-07, "loss": 0.4149, "step": 1546 }, { "epoch": 0.8327837965143665, "grad_norm": 1.39164969438826, "learning_rate": 7.133900574312885e-07, "loss": 0.444, "step": 1547 }, { "epoch": 0.8333221182962116, "grad_norm": 1.6360359120384138, "learning_rate": 7.089067679743322e-07, "loss": 0.4387, "step": 1548 }, { "epoch": 0.8338604400780567, "grad_norm": 1.1463330927551836, "learning_rate": 7.044365354198462e-07, "loss": 0.367, "step": 1549 }, { "epoch": 0.8343987618599018, "grad_norm": 1.3951952353250727, "learning_rate": 6.999793733698168e-07, "loss": 0.4537, "step": 1550 }, { "epoch": 0.8349370836417469, "grad_norm": 1.444313279525601, "learning_rate": 6.955352953864592e-07, "loss": 0.4517, "step": 1551 }, { "epoch": 0.835475405423592, "grad_norm": 1.4922885632634126, "learning_rate": 6.91104314992177e-07, "loss": 0.4182, "step": 1552 }, { "epoch": 0.8360137272054371, "grad_norm": 1.361490120387784, "learning_rate": 6.866864456695189e-07, "loss": 0.3819, "step": 1553 }, { "epoch": 0.8365520489872822, "grad_norm": 1.3785822196112183, "learning_rate": 6.822817008611409e-07, "loss": 0.4315, "step": 1554 }, { "epoch": 0.8370903707691273, "grad_norm": 1.786812938484116, "learning_rate": 6.778900939697642e-07, "loss": 0.4352, "step": 1555 }, { "epoch": 0.8376286925509724, "grad_norm": 1.51980814160385, "learning_rate": 6.735116383581325e-07, "loss": 0.4681, "step": 1556 }, { "epoch": 0.8381670143328175, "grad_norm": 1.6909398106864937, "learning_rate": 6.691463473489751e-07, "loss": 0.3764, "step": 1557 }, { "epoch": 0.8387053361146626, "grad_norm": 1.3032028525505768, "learning_rate": 6.647942342249619e-07, "loss": 0.4571, "step": 1558 }, { "epoch": 0.8392436578965077, "grad_norm": 2.673478994173862, "learning_rate": 6.604553122286672e-07, "loss": 0.4424, "step": 1559 }, { "epoch": 0.8397819796783528, "grad_norm": 1.8774151039134228, "learning_rate": 6.561295945625246e-07, "loss": 0.4289, "step": 1560 }, { "epoch": 0.8397819796783528, "eval_loss": 0.42163270711898804, "eval_runtime": 1532.1805, "eval_samples_per_second": 16.323, "eval_steps_per_second": 0.51, "step": 1560 }, { "epoch": 0.8403203014601979, "grad_norm": 1.3658795551777532, "learning_rate": 6.51817094388793e-07, "loss": 0.4041, "step": 1561 }, { "epoch": 0.840858623242043, "grad_norm": 2.0775682420189683, "learning_rate": 6.475178248295111e-07, "loss": 0.4626, "step": 1562 }, { "epoch": 0.8413969450238881, "grad_norm": 2.0811838469436137, "learning_rate": 6.432317989664599e-07, "loss": 0.4316, "step": 1563 }, { "epoch": 0.8419352668057332, "grad_norm": 1.6387122228577398, "learning_rate": 6.389590298411236e-07, "loss": 0.4198, "step": 1564 }, { "epoch": 0.8424735885875783, "grad_norm": 1.6679858558099225, "learning_rate": 6.346995304546482e-07, "loss": 0.3999, "step": 1565 }, { "epoch": 0.8430119103694234, "grad_norm": 1.4149904617289844, "learning_rate": 6.304533137678026e-07, "loss": 0.418, "step": 1566 }, { "epoch": 0.8435502321512685, "grad_norm": 1.58157239985269, "learning_rate": 6.262203927009403e-07, "loss": 0.4279, "step": 1567 }, { "epoch": 0.8440885539331136, "grad_norm": 1.7638599414290634, "learning_rate": 6.220007801339562e-07, "loss": 0.4042, "step": 1568 }, { "epoch": 0.8446268757149586, "grad_norm": 1.5007385916657803, "learning_rate": 6.17794488906252e-07, "loss": 0.4402, "step": 1569 }, { "epoch": 0.8451651974968037, "grad_norm": 1.345366896432651, "learning_rate": 6.136015318166966e-07, "loss": 0.3642, "step": 1570 }, { "epoch": 0.8457035192786488, "grad_norm": 1.5235663558748846, "learning_rate": 6.094219216235841e-07, "loss": 0.3964, "step": 1571 }, { "epoch": 0.8462418410604939, "grad_norm": 1.3657476470037149, "learning_rate": 6.052556710445972e-07, "loss": 0.3748, "step": 1572 }, { "epoch": 0.846780162842339, "grad_norm": 1.4394596688138968, "learning_rate": 6.011027927567681e-07, "loss": 0.441, "step": 1573 }, { "epoch": 0.8473184846241841, "grad_norm": 1.5318361149430813, "learning_rate": 5.969632993964414e-07, "loss": 0.4621, "step": 1574 }, { "epoch": 0.8478568064060292, "grad_norm": 1.6075753885114712, "learning_rate": 5.928372035592306e-07, "loss": 0.4645, "step": 1575 }, { "epoch": 0.8483951281878743, "grad_norm": 1.5722006469692726, "learning_rate": 5.887245177999867e-07, "loss": 0.4446, "step": 1576 }, { "epoch": 0.8489334499697194, "grad_norm": 1.4551383751314828, "learning_rate": 5.846252546327547e-07, "loss": 0.43, "step": 1577 }, { "epoch": 0.8494717717515645, "grad_norm": 1.4487392657122655, "learning_rate": 5.805394265307391e-07, "loss": 0.4032, "step": 1578 }, { "epoch": 0.8500100935334096, "grad_norm": 1.6691803468661808, "learning_rate": 5.764670459262622e-07, "loss": 0.4328, "step": 1579 }, { "epoch": 0.8505484153152547, "grad_norm": 1.6197190610235175, "learning_rate": 5.724081252107311e-07, "loss": 0.4045, "step": 1580 }, { "epoch": 0.8510867370970998, "grad_norm": 1.6633094952520224, "learning_rate": 5.683626767345951e-07, "loss": 0.4271, "step": 1581 }, { "epoch": 0.8516250588789449, "grad_norm": 1.3383638616282105, "learning_rate": 5.6433071280731e-07, "loss": 0.3742, "step": 1582 }, { "epoch": 0.85216338066079, "grad_norm": 1.3573201978569531, "learning_rate": 5.60312245697302e-07, "loss": 0.355, "step": 1583 }, { "epoch": 0.8527017024426351, "grad_norm": 1.5087600985731158, "learning_rate": 5.563072876319292e-07, "loss": 0.4275, "step": 1584 }, { "epoch": 0.8532400242244802, "grad_norm": 1.9174671861368988, "learning_rate": 5.523158507974452e-07, "loss": 0.4523, "step": 1585 }, { "epoch": 0.8537783460063253, "grad_norm": 1.2701535232392451, "learning_rate": 5.483379473389599e-07, "loss": 0.4157, "step": 1586 }, { "epoch": 0.8543166677881704, "grad_norm": 1.3648674048032239, "learning_rate": 5.443735893604041e-07, "loss": 0.443, "step": 1587 }, { "epoch": 0.8548549895700155, "grad_norm": 1.7303772028968518, "learning_rate": 5.404227889244939e-07, "loss": 0.3945, "step": 1588 }, { "epoch": 0.8553933113518606, "grad_norm": 1.4650825399074572, "learning_rate": 5.364855580526923e-07, "loss": 0.4183, "step": 1589 }, { "epoch": 0.8559316331337057, "grad_norm": 1.7612420028556155, "learning_rate": 5.325619087251704e-07, "loss": 0.4472, "step": 1590 }, { "epoch": 0.8564699549155508, "grad_norm": 1.6090688100302808, "learning_rate": 5.28651852880776e-07, "loss": 0.4348, "step": 1591 }, { "epoch": 0.8570082766973959, "grad_norm": 1.59025634923398, "learning_rate": 5.247554024169949e-07, "loss": 0.4132, "step": 1592 }, { "epoch": 0.857546598479241, "grad_norm": 1.8249117304980227, "learning_rate": 5.20872569189913e-07, "loss": 0.415, "step": 1593 }, { "epoch": 0.8580849202610861, "grad_norm": 1.3724204134525155, "learning_rate": 5.170033650141837e-07, "loss": 0.4645, "step": 1594 }, { "epoch": 0.8586232420429312, "grad_norm": 2.066798117946357, "learning_rate": 5.131478016629888e-07, "loss": 0.4225, "step": 1595 }, { "epoch": 0.8591615638247763, "grad_norm": 2.780252323052268, "learning_rate": 5.093058908680043e-07, "loss": 0.4048, "step": 1596 }, { "epoch": 0.8596998856066214, "grad_norm": 1.4726854180656292, "learning_rate": 5.054776443193626e-07, "loss": 0.4337, "step": 1597 }, { "epoch": 0.8602382073884665, "grad_norm": 1.7991832445280496, "learning_rate": 5.016630736656213e-07, "loss": 0.3871, "step": 1598 }, { "epoch": 0.8607765291703116, "grad_norm": 1.6803342666413155, "learning_rate": 4.978621905137238e-07, "loss": 0.4332, "step": 1599 }, { "epoch": 0.8613148509521567, "grad_norm": 1.4355251448306459, "learning_rate": 4.940750064289657e-07, "loss": 0.3924, "step": 1600 }, { "epoch": 0.8618531727340017, "grad_norm": 1.3604897046592517, "learning_rate": 4.903015329349581e-07, "loss": 0.4057, "step": 1601 }, { "epoch": 0.8623914945158468, "grad_norm": 1.6598958205265515, "learning_rate": 4.865417815135958e-07, "loss": 0.3885, "step": 1602 }, { "epoch": 0.8629298162976919, "grad_norm": 1.4613049538096838, "learning_rate": 4.827957636050179e-07, "loss": 0.3922, "step": 1603 }, { "epoch": 0.863468138079537, "grad_norm": 1.5965664706849296, "learning_rate": 4.790634906075775e-07, "loss": 0.4828, "step": 1604 }, { "epoch": 0.8640064598613821, "grad_norm": 1.8120189192545764, "learning_rate": 4.753449738778021e-07, "loss": 0.429, "step": 1605 }, { "epoch": 0.8645447816432272, "grad_norm": 1.8371969884713577, "learning_rate": 4.716402247303631e-07, "loss": 0.4074, "step": 1606 }, { "epoch": 0.8650831034250723, "grad_norm": 1.5256250240541858, "learning_rate": 4.6794925443804097e-07, "loss": 0.4015, "step": 1607 }, { "epoch": 0.8656214252069174, "grad_norm": 1.6504131905617414, "learning_rate": 4.642720742316886e-07, "loss": 0.4619, "step": 1608 }, { "epoch": 0.8661597469887625, "grad_norm": 1.7464812669613627, "learning_rate": 4.6060869530019983e-07, "loss": 0.4537, "step": 1609 }, { "epoch": 0.8666980687706076, "grad_norm": 1.8767060082708276, "learning_rate": 4.569591287904723e-07, "loss": 0.4612, "step": 1610 }, { "epoch": 0.8672363905524527, "grad_norm": 1.3070105173969313, "learning_rate": 4.5332338580737824e-07, "loss": 0.3629, "step": 1611 }, { "epoch": 0.8677747123342978, "grad_norm": 4.572221630177869, "learning_rate": 4.4970147741372315e-07, "loss": 0.4587, "step": 1612 }, { "epoch": 0.8683130341161429, "grad_norm": 1.4960042467223587, "learning_rate": 4.460934146302215e-07, "loss": 0.4734, "step": 1613 }, { "epoch": 0.868851355897988, "grad_norm": 1.9121190508560355, "learning_rate": 4.424992084354551e-07, "loss": 0.4016, "step": 1614 }, { "epoch": 0.8693896776798331, "grad_norm": 1.706342167134769, "learning_rate": 4.389188697658453e-07, "loss": 0.4207, "step": 1615 }, { "epoch": 0.8699279994616782, "grad_norm": 1.5621521598790504, "learning_rate": 4.3535240951561695e-07, "loss": 0.4101, "step": 1616 }, { "epoch": 0.8704663212435233, "grad_norm": 1.4806315484210542, "learning_rate": 4.3179983853676386e-07, "loss": 0.4608, "step": 1617 }, { "epoch": 0.8710046430253684, "grad_norm": 1.526083402719131, "learning_rate": 4.2826116763902135e-07, "loss": 0.4183, "step": 1618 }, { "epoch": 0.8715429648072135, "grad_norm": 1.6689772565592038, "learning_rate": 4.247364075898258e-07, "loss": 0.4288, "step": 1619 }, { "epoch": 0.8720812865890586, "grad_norm": 1.3834588776364911, "learning_rate": 4.2122556911428744e-07, "loss": 0.4032, "step": 1620 }, { "epoch": 0.8720812865890586, "eval_loss": 0.42079228162765503, "eval_runtime": 1541.5294, "eval_samples_per_second": 16.224, "eval_steps_per_second": 0.507, "step": 1620 }, { "epoch": 0.8726196083709037, "grad_norm": 1.5791149363732657, "learning_rate": 4.177286628951566e-07, "loss": 0.4388, "step": 1621 }, { "epoch": 0.8731579301527488, "grad_norm": 1.7565308716827732, "learning_rate": 4.142456995727906e-07, "loss": 0.4403, "step": 1622 }, { "epoch": 0.8736962519345939, "grad_norm": 1.8536625820585364, "learning_rate": 4.107766897451204e-07, "loss": 0.377, "step": 1623 }, { "epoch": 0.874234573716439, "grad_norm": 1.557798623706775, "learning_rate": 4.073216439676203e-07, "loss": 0.4099, "step": 1624 }, { "epoch": 0.8747728954982841, "grad_norm": 1.5848805929742247, "learning_rate": 4.0388057275327466e-07, "loss": 0.4127, "step": 1625 }, { "epoch": 0.8753112172801292, "grad_norm": 1.4737469672067065, "learning_rate": 4.004534865725462e-07, "loss": 0.4125, "step": 1626 }, { "epoch": 0.8758495390619743, "grad_norm": 1.4866822244945306, "learning_rate": 3.970403958533436e-07, "loss": 0.4081, "step": 1627 }, { "epoch": 0.8763878608438194, "grad_norm": 1.6255821682103373, "learning_rate": 3.936413109809906e-07, "loss": 0.4465, "step": 1628 }, { "epoch": 0.8769261826256645, "grad_norm": 1.4642881317646486, "learning_rate": 3.902562422981937e-07, "loss": 0.4286, "step": 1629 }, { "epoch": 0.8774645044075096, "grad_norm": 1.580573409189922, "learning_rate": 3.8688520010501276e-07, "loss": 0.4527, "step": 1630 }, { "epoch": 0.8780028261893547, "grad_norm": 2.0543315708956387, "learning_rate": 3.835281946588254e-07, "loss": 0.4377, "step": 1631 }, { "epoch": 0.8785411479711998, "grad_norm": 1.5115782436115135, "learning_rate": 3.801852361743008e-07, "loss": 0.4525, "step": 1632 }, { "epoch": 0.8790794697530449, "grad_norm": 1.8374746527735237, "learning_rate": 3.7685633482336504e-07, "loss": 0.4242, "step": 1633 }, { "epoch": 0.87961779153489, "grad_norm": 1.5036770046647692, "learning_rate": 3.7354150073516947e-07, "loss": 0.4474, "step": 1634 }, { "epoch": 0.880156113316735, "grad_norm": 1.658882270187231, "learning_rate": 3.702407439960648e-07, "loss": 0.4321, "step": 1635 }, { "epoch": 0.8806944350985801, "grad_norm": 1.6020319338410256, "learning_rate": 3.669540746495653e-07, "loss": 0.4212, "step": 1636 }, { "epoch": 0.8812327568804252, "grad_norm": 1.7415071086793177, "learning_rate": 3.636815026963214e-07, "loss": 0.4229, "step": 1637 }, { "epoch": 0.8817710786622703, "grad_norm": 1.328144623680027, "learning_rate": 3.604230380940871e-07, "loss": 0.4135, "step": 1638 }, { "epoch": 0.8823094004441154, "grad_norm": 1.8361744282067538, "learning_rate": 3.5717869075769187e-07, "loss": 0.4448, "step": 1639 }, { "epoch": 0.8828477222259605, "grad_norm": 1.4454157174291669, "learning_rate": 3.5394847055900794e-07, "loss": 0.4339, "step": 1640 }, { "epoch": 0.8833860440078056, "grad_norm": 1.6322475345286311, "learning_rate": 3.5073238732692305e-07, "loss": 0.4176, "step": 1641 }, { "epoch": 0.8839243657896507, "grad_norm": 1.445292085363601, "learning_rate": 3.475304508473071e-07, "loss": 0.4554, "step": 1642 }, { "epoch": 0.8844626875714958, "grad_norm": 1.4938616353672438, "learning_rate": 3.44342670862986e-07, "loss": 0.4088, "step": 1643 }, { "epoch": 0.8850010093533409, "grad_norm": 1.47760594711673, "learning_rate": 3.411690570737097e-07, "loss": 0.3793, "step": 1644 }, { "epoch": 0.885539331135186, "grad_norm": 1.6041036008050786, "learning_rate": 3.3800961913612427e-07, "loss": 0.4648, "step": 1645 }, { "epoch": 0.8860776529170311, "grad_norm": 1.6055085861001368, "learning_rate": 3.3486436666374024e-07, "loss": 0.3958, "step": 1646 }, { "epoch": 0.8866159746988762, "grad_norm": 1.592597656491022, "learning_rate": 3.3173330922690594e-07, "loss": 0.4534, "step": 1647 }, { "epoch": 0.8871542964807213, "grad_norm": 1.3972942678399092, "learning_rate": 3.2861645635277715e-07, "loss": 0.4075, "step": 1648 }, { "epoch": 0.8876926182625664, "grad_norm": 1.299571800868061, "learning_rate": 3.255138175252859e-07, "loss": 0.4322, "step": 1649 }, { "epoch": 0.8882309400444115, "grad_norm": 1.6074089216828915, "learning_rate": 3.22425402185117e-07, "loss": 0.4442, "step": 1650 }, { "epoch": 0.8887692618262566, "grad_norm": 1.6515277192815747, "learning_rate": 3.1935121972967387e-07, "loss": 0.3974, "step": 1651 }, { "epoch": 0.8893075836081017, "grad_norm": 1.9560867162587892, "learning_rate": 3.1629127951305407e-07, "loss": 0.4419, "step": 1652 }, { "epoch": 0.8898459053899468, "grad_norm": 1.4109620050170866, "learning_rate": 3.132455908460175e-07, "loss": 0.4006, "step": 1653 }, { "epoch": 0.8903842271717919, "grad_norm": 1.3778369174445322, "learning_rate": 3.1021416299595985e-07, "loss": 0.3917, "step": 1654 }, { "epoch": 0.890922548953637, "grad_norm": 1.7547858079840999, "learning_rate": 3.0719700518688447e-07, "loss": 0.4698, "step": 1655 }, { "epoch": 0.8914608707354821, "grad_norm": 1.5659476763978994, "learning_rate": 3.0419412659937477e-07, "loss": 0.4172, "step": 1656 }, { "epoch": 0.8919991925173272, "grad_norm": 3.093400384631848, "learning_rate": 3.0120553637056293e-07, "loss": 0.3883, "step": 1657 }, { "epoch": 0.8925375142991724, "grad_norm": 1.4466790084982413, "learning_rate": 2.9823124359410706e-07, "loss": 0.391, "step": 1658 }, { "epoch": 0.8930758360810175, "grad_norm": 1.2602029099448362, "learning_rate": 2.9527125732015995e-07, "loss": 0.41, "step": 1659 }, { "epoch": 0.8936141578628626, "grad_norm": 1.5682198116188635, "learning_rate": 2.923255865553432e-07, "loss": 0.4361, "step": 1660 }, { "epoch": 0.8941524796447077, "grad_norm": 1.7284038118874672, "learning_rate": 2.8939424026271923e-07, "loss": 0.4248, "step": 1661 }, { "epoch": 0.8946908014265528, "grad_norm": 1.4256983828332148, "learning_rate": 2.8647722736176333e-07, "loss": 0.4291, "step": 1662 }, { "epoch": 0.8952291232083979, "grad_norm": 1.4976102627551229, "learning_rate": 2.8357455672833933e-07, "loss": 0.3813, "step": 1663 }, { "epoch": 0.895767444990243, "grad_norm": 1.8854495681463317, "learning_rate": 2.8068623719466725e-07, "loss": 0.4516, "step": 1664 }, { "epoch": 0.8963057667720881, "grad_norm": 1.5693149002013742, "learning_rate": 2.7781227754930253e-07, "loss": 0.4585, "step": 1665 }, { "epoch": 0.8968440885539332, "grad_norm": 1.573734503341506, "learning_rate": 2.7495268653710493e-07, "loss": 0.4483, "step": 1666 }, { "epoch": 0.8973824103357783, "grad_norm": 1.5481263062327042, "learning_rate": 2.7210747285921435e-07, "loss": 0.4468, "step": 1667 }, { "epoch": 0.8979207321176234, "grad_norm": 1.7822442462595496, "learning_rate": 2.692766451730233e-07, "loss": 0.4234, "step": 1668 }, { "epoch": 0.8984590538994685, "grad_norm": 1.8797060608535148, "learning_rate": 2.6646021209215003e-07, "loss": 0.4063, "step": 1669 }, { "epoch": 0.8989973756813135, "grad_norm": 1.4047802142985153, "learning_rate": 2.636581821864148e-07, "loss": 0.3933, "step": 1670 }, { "epoch": 0.8995356974631586, "grad_norm": 1.9919594742667397, "learning_rate": 2.6087056398180823e-07, "loss": 0.4259, "step": 1671 }, { "epoch": 0.9000740192450037, "grad_norm": 1.439697905572551, "learning_rate": 2.580973659604735e-07, "loss": 0.4234, "step": 1672 }, { "epoch": 0.9006123410268488, "grad_norm": 1.4340034850095604, "learning_rate": 2.553385965606736e-07, "loss": 0.4011, "step": 1673 }, { "epoch": 0.9011506628086939, "grad_norm": 1.6008407880111504, "learning_rate": 2.525942641767687e-07, "loss": 0.4064, "step": 1674 }, { "epoch": 0.901688984590539, "grad_norm": 1.393769083088064, "learning_rate": 2.498643771591908e-07, "loss": 0.3878, "step": 1675 }, { "epoch": 0.9022273063723841, "grad_norm": 1.5473000323872435, "learning_rate": 2.47148943814417e-07, "loss": 0.4125, "step": 1676 }, { "epoch": 0.9027656281542292, "grad_norm": 1.504947787937997, "learning_rate": 2.4444797240494533e-07, "loss": 0.4328, "step": 1677 }, { "epoch": 0.9033039499360743, "grad_norm": 1.8071042005817233, "learning_rate": 2.4176147114927e-07, "loss": 0.4429, "step": 1678 }, { "epoch": 0.9038422717179194, "grad_norm": 1.5975781936612632, "learning_rate": 2.3908944822185144e-07, "loss": 0.4279, "step": 1679 }, { "epoch": 0.9043805934997645, "grad_norm": 1.4408734852067904, "learning_rate": 2.364319117531011e-07, "loss": 0.404, "step": 1680 }, { "epoch": 0.9043805934997645, "eval_loss": 0.42025431990623474, "eval_runtime": 1550.3923, "eval_samples_per_second": 16.131, "eval_steps_per_second": 0.504, "step": 1680 }, { "epoch": 0.9049189152816096, "grad_norm": 1.6629310324181896, "learning_rate": 2.3378886982934778e-07, "loss": 0.4876, "step": 1681 }, { "epoch": 0.9054572370634547, "grad_norm": 1.5275509334845596, "learning_rate": 2.311603304928173e-07, "loss": 0.4428, "step": 1682 }, { "epoch": 0.9059955588452998, "grad_norm": 1.6372832685609333, "learning_rate": 2.285463017416073e-07, "loss": 0.4815, "step": 1683 }, { "epoch": 0.9065338806271449, "grad_norm": 1.846596894090347, "learning_rate": 2.2594679152966258e-07, "loss": 0.4724, "step": 1684 }, { "epoch": 0.90707220240899, "grad_norm": 1.7091710123282846, "learning_rate": 2.2336180776675154e-07, "loss": 0.4447, "step": 1685 }, { "epoch": 0.9076105241908351, "grad_norm": 1.4759554995733482, "learning_rate": 2.2079135831843956e-07, "loss": 0.4421, "step": 1686 }, { "epoch": 0.9081488459726802, "grad_norm": 1.4044547819882969, "learning_rate": 2.1823545100606914e-07, "loss": 0.4438, "step": 1687 }, { "epoch": 0.9086871677545253, "grad_norm": 1.6839786445608516, "learning_rate": 2.1569409360673422e-07, "loss": 0.4295, "step": 1688 }, { "epoch": 0.9092254895363704, "grad_norm": 1.695687328944884, "learning_rate": 2.131672938532553e-07, "loss": 0.4001, "step": 1689 }, { "epoch": 0.9097638113182155, "grad_norm": 1.6064285368620497, "learning_rate": 2.1065505943415775e-07, "loss": 0.426, "step": 1690 }, { "epoch": 0.9103021331000606, "grad_norm": 1.805677873651136, "learning_rate": 2.0815739799364743e-07, "loss": 0.4109, "step": 1691 }, { "epoch": 0.9108404548819057, "grad_norm": 1.6393066274059234, "learning_rate": 2.0567431713158726e-07, "loss": 0.4377, "step": 1692 }, { "epoch": 0.9113787766637508, "grad_norm": 1.6183131956225818, "learning_rate": 2.032058244034757e-07, "loss": 0.4412, "step": 1693 }, { "epoch": 0.9119170984455959, "grad_norm": 1.5002695967364554, "learning_rate": 2.007519273204206e-07, "loss": 0.4437, "step": 1694 }, { "epoch": 0.912455420227441, "grad_norm": 1.647362717510626, "learning_rate": 1.9831263334911977e-07, "loss": 0.4808, "step": 1695 }, { "epoch": 0.9129937420092861, "grad_norm": 1.5964438963275278, "learning_rate": 1.95887949911836e-07, "loss": 0.4393, "step": 1696 }, { "epoch": 0.9135320637911312, "grad_norm": 1.8713869106599383, "learning_rate": 1.934778843863766e-07, "loss": 0.434, "step": 1697 }, { "epoch": 0.9140703855729763, "grad_norm": 1.9039547376831083, "learning_rate": 1.9108244410606823e-07, "loss": 0.4364, "step": 1698 }, { "epoch": 0.9146087073548214, "grad_norm": 1.5450254177283191, "learning_rate": 1.887016363597366e-07, "loss": 0.4589, "step": 1699 }, { "epoch": 0.9151470291366665, "grad_norm": 1.543879530191546, "learning_rate": 1.8633546839168403e-07, "loss": 0.4064, "step": 1700 }, { "epoch": 0.9156853509185116, "grad_norm": 1.5304353330893454, "learning_rate": 1.839839474016658e-07, "loss": 0.442, "step": 1701 }, { "epoch": 0.9162236727003567, "grad_norm": 2.3452574340826233, "learning_rate": 1.8164708054487002e-07, "loss": 0.422, "step": 1702 }, { "epoch": 0.9167619944822017, "grad_norm": 1.9150867244566236, "learning_rate": 1.7932487493189598e-07, "loss": 0.4294, "step": 1703 }, { "epoch": 0.9173003162640468, "grad_norm": 1.6124806051656038, "learning_rate": 1.7701733762873152e-07, "loss": 0.428, "step": 1704 }, { "epoch": 0.9178386380458919, "grad_norm": 1.4187608860726189, "learning_rate": 1.7472447565673177e-07, "loss": 0.4038, "step": 1705 }, { "epoch": 0.918376959827737, "grad_norm": 1.4661931221135862, "learning_rate": 1.7244629599259767e-07, "loss": 0.3848, "step": 1706 }, { "epoch": 0.9189152816095821, "grad_norm": 1.6206434175751971, "learning_rate": 1.7018280556835632e-07, "loss": 0.3851, "step": 1707 }, { "epoch": 0.9194536033914272, "grad_norm": 1.8423442465927384, "learning_rate": 1.6793401127133513e-07, "loss": 0.4079, "step": 1708 }, { "epoch": 0.9199919251732723, "grad_norm": 1.3950233471823357, "learning_rate": 1.6569991994414835e-07, "loss": 0.3994, "step": 1709 }, { "epoch": 0.9205302469551174, "grad_norm": 1.5142214065755961, "learning_rate": 1.6348053838466937e-07, "loss": 0.4189, "step": 1710 }, { "epoch": 0.9210685687369625, "grad_norm": 1.5917351975615364, "learning_rate": 1.6127587334601458e-07, "loss": 0.4314, "step": 1711 }, { "epoch": 0.9216068905188076, "grad_norm": 1.605064219083874, "learning_rate": 1.5908593153651952e-07, "loss": 0.4237, "step": 1712 }, { "epoch": 0.9221452123006527, "grad_norm": 1.7341654884483175, "learning_rate": 1.5691071961972116e-07, "loss": 0.4131, "step": 1713 }, { "epoch": 0.9226835340824978, "grad_norm": 1.6343186301580133, "learning_rate": 1.547502442143356e-07, "loss": 0.4233, "step": 1714 }, { "epoch": 0.9232218558643429, "grad_norm": 1.5099995374537671, "learning_rate": 1.526045118942404e-07, "loss": 0.3982, "step": 1715 }, { "epoch": 0.923760177646188, "grad_norm": 1.7958348974891065, "learning_rate": 1.504735291884507e-07, "loss": 0.4331, "step": 1716 }, { "epoch": 0.9242984994280331, "grad_norm": 1.7356588334735397, "learning_rate": 1.4835730258110303e-07, "loss": 0.4357, "step": 1717 }, { "epoch": 0.9248368212098782, "grad_norm": 2.500196744283525, "learning_rate": 1.4625583851143432e-07, "loss": 0.3799, "step": 1718 }, { "epoch": 0.9253751429917233, "grad_norm": 1.3646453068750661, "learning_rate": 1.4416914337376132e-07, "loss": 0.4128, "step": 1719 }, { "epoch": 0.9259134647735684, "grad_norm": 1.642640642870041, "learning_rate": 1.420972235174628e-07, "loss": 0.4506, "step": 1720 }, { "epoch": 0.9264517865554135, "grad_norm": 1.592814733182936, "learning_rate": 1.4004008524695912e-07, "loss": 0.4296, "step": 1721 }, { "epoch": 0.9269901083372586, "grad_norm": 1.4652552983592342, "learning_rate": 1.3799773482169378e-07, "loss": 0.4233, "step": 1722 }, { "epoch": 0.9275284301191037, "grad_norm": 1.7410090898687602, "learning_rate": 1.3597017845611181e-07, "loss": 0.4594, "step": 1723 }, { "epoch": 0.9280667519009488, "grad_norm": 1.559448064084867, "learning_rate": 1.3395742231964658e-07, "loss": 0.4336, "step": 1724 }, { "epoch": 0.9286050736827939, "grad_norm": 1.9623398348887997, "learning_rate": 1.3195947253669518e-07, "loss": 0.4724, "step": 1725 }, { "epoch": 0.929143395464639, "grad_norm": 1.4765323135961603, "learning_rate": 1.2997633518660125e-07, "loss": 0.4122, "step": 1726 }, { "epoch": 0.9296817172464841, "grad_norm": 1.9030353185015407, "learning_rate": 1.2800801630364013e-07, "loss": 0.4414, "step": 1727 }, { "epoch": 0.9302200390283292, "grad_norm": 1.3486307498615422, "learning_rate": 1.2605452187699484e-07, "loss": 0.4799, "step": 1728 }, { "epoch": 0.9307583608101743, "grad_norm": 1.4474994381201687, "learning_rate": 1.2411585785074232e-07, "loss": 0.4353, "step": 1729 }, { "epoch": 0.9312966825920194, "grad_norm": 1.460955137197927, "learning_rate": 1.221920301238333e-07, "loss": 0.4248, "step": 1730 }, { "epoch": 0.9318350043738645, "grad_norm": 1.8140612572363009, "learning_rate": 1.2028304455007412e-07, "loss": 0.3888, "step": 1731 }, { "epoch": 0.9323733261557096, "grad_norm": 1.4724419135884532, "learning_rate": 1.1838890693811055e-07, "loss": 0.3868, "step": 1732 }, { "epoch": 0.9329116479375547, "grad_norm": 1.4562877473919869, "learning_rate": 1.1650962305140845e-07, "loss": 0.4305, "step": 1733 }, { "epoch": 0.9334499697193998, "grad_norm": 2.0045234339432763, "learning_rate": 1.1464519860823698e-07, "loss": 0.5062, "step": 1734 }, { "epoch": 0.9339882915012448, "grad_norm": 1.8962618785171959, "learning_rate": 1.1279563928165094e-07, "loss": 0.4049, "step": 1735 }, { "epoch": 0.93452661328309, "grad_norm": 1.580337734175196, "learning_rate": 1.1096095069947466e-07, "loss": 0.4465, "step": 1736 }, { "epoch": 0.935064935064935, "grad_norm": 1.6703156179249958, "learning_rate": 1.091411384442831e-07, "loss": 0.4174, "step": 1737 }, { "epoch": 0.9356032568467801, "grad_norm": 1.4707795804039079, "learning_rate": 1.0733620805338462e-07, "loss": 0.3582, "step": 1738 }, { "epoch": 0.9361415786286252, "grad_norm": 1.5443607495595517, "learning_rate": 1.0554616501880722e-07, "loss": 0.4322, "step": 1739 }, { "epoch": 0.9366799004104703, "grad_norm": 1.647874029047969, "learning_rate": 1.0377101478727835e-07, "loss": 0.4465, "step": 1740 }, { "epoch": 0.9366799004104703, "eval_loss": 0.41988879442214966, "eval_runtime": 1559.0337, "eval_samples_per_second": 16.042, "eval_steps_per_second": 0.502, "step": 1740 }, { "epoch": 0.9372182221923154, "grad_norm": 1.6210033117188805, "learning_rate": 1.0201076276021072e-07, "loss": 0.4432, "step": 1741 }, { "epoch": 0.9377565439741605, "grad_norm": 1.9123170938822815, "learning_rate": 1.0026541429368431e-07, "loss": 0.4024, "step": 1742 }, { "epoch": 0.9382948657560056, "grad_norm": 2.5680416907462864, "learning_rate": 9.853497469843043e-08, "loss": 0.3973, "step": 1743 }, { "epoch": 0.9388331875378507, "grad_norm": 1.462242975230514, "learning_rate": 9.681944923981724e-08, "loss": 0.455, "step": 1744 }, { "epoch": 0.9393715093196958, "grad_norm": 1.4330622858448745, "learning_rate": 9.511884313782915e-08, "loss": 0.409, "step": 1745 }, { "epoch": 0.9399098311015409, "grad_norm": 1.5924131568344673, "learning_rate": 9.343316156705751e-08, "loss": 0.4709, "step": 1746 }, { "epoch": 0.940448152883386, "grad_norm": 2.1748083360521, "learning_rate": 9.176240965668049e-08, "loss": 0.4975, "step": 1747 }, { "epoch": 0.9409864746652311, "grad_norm": 2.240808535802813, "learning_rate": 9.01065924904465e-08, "loss": 0.4817, "step": 1748 }, { "epoch": 0.9415247964470762, "grad_norm": 1.7231015704313604, "learning_rate": 8.846571510666369e-08, "loss": 0.4094, "step": 1749 }, { "epoch": 0.9420631182289213, "grad_norm": 1.4693480082476622, "learning_rate": 8.683978249817981e-08, "loss": 0.4453, "step": 1750 }, { "epoch": 0.9426014400107664, "grad_norm": 1.6509935540008158, "learning_rate": 8.52287996123674e-08, "loss": 0.4065, "step": 1751 }, { "epoch": 0.9431397617926115, "grad_norm": 1.6701873629796138, "learning_rate": 8.363277135111314e-08, "loss": 0.3761, "step": 1752 }, { "epoch": 0.9436780835744566, "grad_norm": 1.2809352240300242, "learning_rate": 8.205170257079786e-08, "loss": 0.4159, "step": 1753 }, { "epoch": 0.9442164053563017, "grad_norm": 1.62872520153001, "learning_rate": 8.048559808228496e-08, "loss": 0.3973, "step": 1754 }, { "epoch": 0.9447547271381468, "grad_norm": 1.6888413344801536, "learning_rate": 7.89344626509031e-08, "loss": 0.4219, "step": 1755 }, { "epoch": 0.9452930489199919, "grad_norm": 1.6223202323912347, "learning_rate": 7.739830099643464e-08, "loss": 0.4303, "step": 1756 }, { "epoch": 0.945831370701837, "grad_norm": 1.2810729846885742, "learning_rate": 7.587711779309947e-08, "loss": 0.3868, "step": 1757 }, { "epoch": 0.9463696924836821, "grad_norm": 1.6840497326805903, "learning_rate": 7.437091766954119e-08, "loss": 0.434, "step": 1758 }, { "epoch": 0.9469080142655272, "grad_norm": 1.765752446431431, "learning_rate": 7.287970520881205e-08, "loss": 0.4461, "step": 1759 }, { "epoch": 0.9474463360473723, "grad_norm": 1.4694297184744327, "learning_rate": 7.140348494836191e-08, "loss": 0.4374, "step": 1760 }, { "epoch": 0.9479846578292174, "grad_norm": 1.456090878683348, "learning_rate": 6.994226138002047e-08, "loss": 0.4204, "step": 1761 }, { "epoch": 0.9485229796110625, "grad_norm": 1.5114503786906142, "learning_rate": 6.849603894998725e-08, "loss": 0.4431, "step": 1762 }, { "epoch": 0.9490613013929076, "grad_norm": 1.9303693867033398, "learning_rate": 6.706482205881548e-08, "loss": 0.4292, "step": 1763 }, { "epoch": 0.9495996231747527, "grad_norm": 1.3436489528854563, "learning_rate": 6.564861506139996e-08, "loss": 0.3854, "step": 1764 }, { "epoch": 0.9501379449565978, "grad_norm": 1.3843500014884988, "learning_rate": 6.424742226696312e-08, "loss": 0.3969, "step": 1765 }, { "epoch": 0.9506762667384429, "grad_norm": 1.3401735876692071, "learning_rate": 6.286124793904336e-08, "loss": 0.4183, "step": 1766 }, { "epoch": 0.951214588520288, "grad_norm": 1.685672633138118, "learning_rate": 6.149009629547897e-08, "loss": 0.4468, "step": 1767 }, { "epoch": 0.951752910302133, "grad_norm": 1.8943339017606036, "learning_rate": 6.013397150839983e-08, "loss": 0.4361, "step": 1768 }, { "epoch": 0.9522912320839781, "grad_norm": 1.7967244404705551, "learning_rate": 5.8792877704211274e-08, "loss": 0.4491, "step": 1769 }, { "epoch": 0.9528295538658232, "grad_norm": 1.4606147240071112, "learning_rate": 5.746681896358131e-08, "loss": 0.4019, "step": 1770 }, { "epoch": 0.9533678756476683, "grad_norm": 1.455938194249448, "learning_rate": 5.615579932143067e-08, "loss": 0.3948, "step": 1771 }, { "epoch": 0.9539061974295135, "grad_norm": 1.2759206549407909, "learning_rate": 5.485982276691892e-08, "loss": 0.3949, "step": 1772 }, { "epoch": 0.9544445192113586, "grad_norm": 1.5731889340664074, "learning_rate": 5.35788932434328e-08, "loss": 0.4422, "step": 1773 }, { "epoch": 0.9549828409932037, "grad_norm": 1.4900834870938766, "learning_rate": 5.2313014648573966e-08, "loss": 0.3651, "step": 1774 }, { "epoch": 0.9555211627750488, "grad_norm": 1.3653648358156305, "learning_rate": 5.1062190834146875e-08, "loss": 0.403, "step": 1775 }, { "epoch": 0.9560594845568939, "grad_norm": 1.5012692588758656, "learning_rate": 4.9826425606148145e-08, "loss": 0.4056, "step": 1776 }, { "epoch": 0.956597806338739, "grad_norm": 1.7114437223613954, "learning_rate": 4.860572272475384e-08, "loss": 0.4219, "step": 1777 }, { "epoch": 0.9571361281205841, "grad_norm": 1.5710449681536929, "learning_rate": 4.740008590430778e-08, "loss": 0.4504, "step": 1778 }, { "epoch": 0.9576744499024292, "grad_norm": 1.5334464777855485, "learning_rate": 4.620951881331215e-08, "loss": 0.4078, "step": 1779 }, { "epoch": 0.9582127716842743, "grad_norm": 1.665311340751073, "learning_rate": 4.5034025074414124e-08, "loss": 0.388, "step": 1780 }, { "epoch": 0.9587510934661194, "grad_norm": 1.6819133415223784, "learning_rate": 4.3873608264397014e-08, "loss": 0.4318, "step": 1781 }, { "epoch": 0.9592894152479645, "grad_norm": 2.1910803064926947, "learning_rate": 4.272827191416584e-08, "loss": 0.3862, "step": 1782 }, { "epoch": 0.9598277370298096, "grad_norm": 1.3743310605178427, "learning_rate": 4.159801950874176e-08, "loss": 0.382, "step": 1783 }, { "epoch": 0.9603660588116547, "grad_norm": 1.753291691489888, "learning_rate": 4.048285448724709e-08, "loss": 0.4677, "step": 1784 }, { "epoch": 0.9609043805934998, "grad_norm": 1.4424214242693971, "learning_rate": 3.938278024289644e-08, "loss": 0.4012, "step": 1785 }, { "epoch": 0.9614427023753449, "grad_norm": 1.4573151134275804, "learning_rate": 3.829780012298612e-08, "loss": 0.4058, "step": 1786 }, { "epoch": 0.96198102415719, "grad_norm": 1.4245212432098524, "learning_rate": 3.722791742888476e-08, "loss": 0.3958, "step": 1787 }, { "epoch": 0.9625193459390351, "grad_norm": 1.533496999870574, "learning_rate": 3.617313541602274e-08, "loss": 0.4195, "step": 1788 }, { "epoch": 0.9630576677208802, "grad_norm": 1.854726516234056, "learning_rate": 3.5133457293881626e-08, "loss": 0.4376, "step": 1789 }, { "epoch": 0.9635959895027253, "grad_norm": 1.9373159151394588, "learning_rate": 3.410888622598585e-08, "loss": 0.4312, "step": 1790 }, { "epoch": 0.9641343112845704, "grad_norm": 2.153201724460075, "learning_rate": 3.3099425329890525e-08, "loss": 0.4494, "step": 1791 }, { "epoch": 0.9646726330664155, "grad_norm": 1.4498518000265068, "learning_rate": 3.210507767717586e-08, "loss": 0.4199, "step": 1792 }, { "epoch": 0.9652109548482606, "grad_norm": 1.6032986767797375, "learning_rate": 3.1125846293433846e-08, "loss": 0.3771, "step": 1793 }, { "epoch": 0.9657492766301057, "grad_norm": 2.1622319654687057, "learning_rate": 3.0161734158261625e-08, "loss": 0.4214, "step": 1794 }, { "epoch": 0.9662875984119508, "grad_norm": 1.4345400536711836, "learning_rate": 2.9212744205252553e-08, "loss": 0.3797, "step": 1795 }, { "epoch": 0.9668259201937959, "grad_norm": 1.6565073229021858, "learning_rate": 2.8278879321983477e-08, "loss": 0.3874, "step": 1796 }, { "epoch": 0.967364241975641, "grad_norm": 2.0557097314570196, "learning_rate": 2.736014235001194e-08, "loss": 0.4341, "step": 1797 }, { "epoch": 0.9679025637574861, "grad_norm": 1.64490095462292, "learning_rate": 2.6456536084862872e-08, "loss": 0.3979, "step": 1798 }, { "epoch": 0.9684408855393312, "grad_norm": 1.6729564375619899, "learning_rate": 2.5568063276021347e-08, "loss": 0.397, "step": 1799 }, { "epoch": 0.9689792073211763, "grad_norm": 1.5597222162662605, "learning_rate": 2.4694726626925403e-08, "loss": 0.432, "step": 1800 }, { "epoch": 0.9689792073211763, "eval_loss": 0.4197918474674225, "eval_runtime": 1571.0705, "eval_samples_per_second": 15.919, "eval_steps_per_second": 0.498, "step": 1800 }, { "epoch": 0.9695175291030214, "grad_norm": 1.4076281710448164, "learning_rate": 2.383652879495657e-08, "loss": 0.3963, "step": 1801 }, { "epoch": 0.9700558508848665, "grad_norm": 1.645367632025504, "learning_rate": 2.299347239143157e-08, "loss": 0.4272, "step": 1802 }, { "epoch": 0.9705941726667116, "grad_norm": 1.3956889574044051, "learning_rate": 2.2165559981595642e-08, "loss": 0.429, "step": 1803 }, { "epoch": 0.9711324944485566, "grad_norm": 1.4793349281728767, "learning_rate": 2.1352794084613658e-08, "loss": 0.4479, "step": 1804 }, { "epoch": 0.9716708162304017, "grad_norm": 1.580535608856093, "learning_rate": 2.0555177173562925e-08, "loss": 0.431, "step": 1805 }, { "epoch": 0.9722091380122468, "grad_norm": 1.7015563233283766, "learning_rate": 1.9772711675425937e-08, "loss": 0.3984, "step": 1806 }, { "epoch": 0.9727474597940919, "grad_norm": 1.5158636017258738, "learning_rate": 1.9005399971080974e-08, "loss": 0.4166, "step": 1807 }, { "epoch": 0.973285781575937, "grad_norm": 1.4220838677616172, "learning_rate": 1.8253244395298186e-08, "loss": 0.3988, "step": 1808 }, { "epoch": 0.9738241033577821, "grad_norm": 1.3963959999222404, "learning_rate": 1.7516247236731288e-08, "loss": 0.4224, "step": 1809 }, { "epoch": 0.9743624251396272, "grad_norm": 1.7337278360138024, "learning_rate": 1.679441073790755e-08, "loss": 0.4738, "step": 1810 }, { "epoch": 0.9749007469214723, "grad_norm": 1.4861221398216466, "learning_rate": 1.6087737095225598e-08, "loss": 0.4449, "step": 1811 }, { "epoch": 0.9754390687033174, "grad_norm": 1.3145810749185178, "learning_rate": 1.539622845894595e-08, "loss": 0.3885, "step": 1812 }, { "epoch": 0.9759773904851625, "grad_norm": 1.3176971825763986, "learning_rate": 1.471988693318549e-08, "loss": 0.4232, "step": 1813 }, { "epoch": 0.9765157122670076, "grad_norm": 1.442309770679218, "learning_rate": 1.4058714575910238e-08, "loss": 0.4328, "step": 1814 }, { "epoch": 0.9770540340488527, "grad_norm": 1.5157478456952573, "learning_rate": 1.3412713398930355e-08, "loss": 0.3911, "step": 1815 }, { "epoch": 0.9775923558306978, "grad_norm": 1.779840899462066, "learning_rate": 1.2781885367892377e-08, "loss": 0.4179, "step": 1816 }, { "epoch": 0.9781306776125429, "grad_norm": 1.6067561255260123, "learning_rate": 1.2166232402275325e-08, "loss": 0.3987, "step": 1817 }, { "epoch": 0.978668999394388, "grad_norm": 1.4429159861518235, "learning_rate": 1.156575637538182e-08, "loss": 0.3752, "step": 1818 }, { "epoch": 0.9792073211762331, "grad_norm": 1.6134101059886168, "learning_rate": 1.0980459114335318e-08, "loss": 0.4491, "step": 1819 }, { "epoch": 0.9797456429580782, "grad_norm": 1.3430032688894593, "learning_rate": 1.0410342400073992e-08, "loss": 0.4446, "step": 1820 }, { "epoch": 0.9802839647399233, "grad_norm": 1.5854543749606242, "learning_rate": 9.855407967344078e-09, "loss": 0.4022, "step": 1821 }, { "epoch": 0.9808222865217684, "grad_norm": 1.3429626400579588, "learning_rate": 9.31565750469543e-09, "loss": 0.4173, "step": 1822 }, { "epoch": 0.9813606083036135, "grad_norm": 1.8181594324695687, "learning_rate": 8.791092654476529e-09, "loss": 0.4699, "step": 1823 }, { "epoch": 0.9818989300854586, "grad_norm": 1.3189784151442827, "learning_rate": 8.281715012827817e-09, "loss": 0.3847, "step": 1824 }, { "epoch": 0.9824372518673037, "grad_norm": 1.29942395236663, "learning_rate": 7.78752612968059e-09, "loss": 0.3989, "step": 1825 }, { "epoch": 0.9829755736491488, "grad_norm": 1.6481398184837366, "learning_rate": 7.3085275087475535e-09, "loss": 0.385, "step": 1826 }, { "epoch": 0.9835138954309939, "grad_norm": 1.2097016930732503, "learning_rate": 6.844720607522282e-09, "loss": 0.4635, "step": 1827 }, { "epoch": 0.984052217212839, "grad_norm": 1.3353672523995217, "learning_rate": 6.3961068372725425e-09, "loss": 0.4659, "step": 1828 }, { "epoch": 0.9845905389946841, "grad_norm": 1.6604758834668205, "learning_rate": 5.962687563036418e-09, "loss": 0.4182, "step": 1829 }, { "epoch": 0.9851288607765292, "grad_norm": 1.365766973195823, "learning_rate": 5.544464103618419e-09, "loss": 0.4496, "step": 1830 }, { "epoch": 0.9856671825583743, "grad_norm": 1.7311791534397065, "learning_rate": 5.1414377315855965e-09, "loss": 0.4091, "step": 1831 }, { "epoch": 0.9862055043402194, "grad_norm": 1.6223056568910816, "learning_rate": 4.753609673263104e-09, "loss": 0.435, "step": 1832 }, { "epoch": 0.9867438261220645, "grad_norm": 1.4811187708876057, "learning_rate": 4.380981108730309e-09, "loss": 0.4229, "step": 1833 }, { "epoch": 0.9872821479039096, "grad_norm": 1.5639619332709622, "learning_rate": 4.023553171819128e-09, "loss": 0.4434, "step": 1834 }, { "epoch": 0.9878204696857547, "grad_norm": 1.4607336838401341, "learning_rate": 3.681326950107922e-09, "loss": 0.3892, "step": 1835 }, { "epoch": 0.9883587914675998, "grad_norm": 1.4459818740856154, "learning_rate": 3.3543034849192746e-09, "loss": 0.4613, "step": 1836 }, { "epoch": 0.9888971132494448, "grad_norm": 1.727956071768554, "learning_rate": 3.0424837713188825e-09, "loss": 0.4321, "step": 1837 }, { "epoch": 0.98943543503129, "grad_norm": 1.4250494159267046, "learning_rate": 2.7458687581072284e-09, "loss": 0.4361, "step": 1838 }, { "epoch": 0.989973756813135, "grad_norm": 1.6825614414547043, "learning_rate": 2.4644593478240218e-09, "loss": 0.4247, "step": 1839 }, { "epoch": 0.9905120785949801, "grad_norm": 1.3394226647545722, "learning_rate": 2.1982563967376525e-09, "loss": 0.4224, "step": 1840 }, { "epoch": 0.9910504003768252, "grad_norm": 1.3878090062249357, "learning_rate": 1.9472607148490752e-09, "loss": 0.4671, "step": 1841 }, { "epoch": 0.9915887221586703, "grad_norm": 1.8045067084462034, "learning_rate": 1.71147306588626e-09, "loss": 0.4093, "step": 1842 }, { "epoch": 0.9921270439405154, "grad_norm": 1.6487465697670387, "learning_rate": 1.4908941673008604e-09, "loss": 0.4768, "step": 1843 }, { "epoch": 0.9926653657223605, "grad_norm": 1.3894142004683563, "learning_rate": 1.2855246902693241e-09, "loss": 0.4126, "step": 1844 }, { "epoch": 0.9932036875042056, "grad_norm": 1.5382669595746958, "learning_rate": 1.0953652596878972e-09, "loss": 0.4662, "step": 1845 }, { "epoch": 0.9937420092860507, "grad_norm": 1.5055759777025033, "learning_rate": 9.204164541720683e-10, "loss": 0.3911, "step": 1846 }, { "epoch": 0.9942803310678958, "grad_norm": 1.4883627722190473, "learning_rate": 7.606788060543491e-10, "loss": 0.4005, "step": 1847 }, { "epoch": 0.9948186528497409, "grad_norm": 1.7929841052447726, "learning_rate": 6.16152801383163e-10, "loss": 0.4239, "step": 1848 }, { "epoch": 0.995356974631586, "grad_norm": 1.3514634100350202, "learning_rate": 4.86838879921736e-10, "loss": 0.4122, "step": 1849 }, { "epoch": 0.9958952964134311, "grad_norm": 1.5688583282415778, "learning_rate": 3.7273743514476544e-10, "loss": 0.3613, "step": 1850 }, { "epoch": 0.9964336181952762, "grad_norm": 1.3790895255701852, "learning_rate": 2.73848814238975e-10, "loss": 0.3974, "step": 1851 }, { "epoch": 0.9969719399771213, "grad_norm": 1.4609310145673613, "learning_rate": 1.9017331810256002e-10, "loss": 0.4287, "step": 1852 }, { "epoch": 0.9975102617589664, "grad_norm": 1.6915446904327818, "learning_rate": 1.2171120134185643e-10, "loss": 0.4238, "step": 1853 }, { "epoch": 0.9980485835408115, "grad_norm": 1.636253995850887, "learning_rate": 6.846267227356152e-11, "loss": 0.4105, "step": 1854 }, { "epoch": 0.9985869053226566, "grad_norm": 1.3210272324277625, "learning_rate": 3.042789292140302e-11, "loss": 0.3978, "step": 1855 }, { "epoch": 0.9991252271045017, "grad_norm": 1.7798971238230394, "learning_rate": 7.606979016694383e-12, "loss": 0.4537, "step": 1856 }, { "epoch": 0.9996635488863468, "grad_norm": 1.6132079869080023, "learning_rate": 0.0, "loss": 0.4395, "step": 1857 }, { "epoch": 0.9996635488863468, "step": 1857, "total_flos": 1.243798906601472e+16, "train_loss": 0.0, "train_runtime": 0.4818, "train_samples_per_second": 987062.335, "train_steps_per_second": 3854.561 } ], "logging_steps": 1.0, "max_steps": 1857, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.243798906601472e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }