{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 2737, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025575447570332483, "grad_norm": 2.9635716191319874, "learning_rate": 7.299270072992701e-08, "loss": 1.5218, "step": 1 }, { "epoch": 0.005115089514066497, "grad_norm": 2.9570039035815743, "learning_rate": 1.4598540145985402e-07, "loss": 1.4755, "step": 2 }, { "epoch": 0.0076726342710997444, "grad_norm": 3.017937072257941, "learning_rate": 2.1897810218978106e-07, "loss": 1.4935, "step": 3 }, { "epoch": 0.010230179028132993, "grad_norm": 2.960891290072101, "learning_rate": 2.9197080291970804e-07, "loss": 1.4813, "step": 4 }, { "epoch": 0.01278772378516624, "grad_norm": 2.976019939455323, "learning_rate": 3.6496350364963505e-07, "loss": 1.4941, "step": 5 }, { "epoch": 0.015345268542199489, "grad_norm": 3.0149006457959886, "learning_rate": 4.379562043795621e-07, "loss": 1.5066, "step": 6 }, { "epoch": 0.017902813299232736, "grad_norm": 2.9237260833122214, "learning_rate": 5.109489051094891e-07, "loss": 1.478, "step": 7 }, { "epoch": 0.020460358056265986, "grad_norm": 2.9640674426484077, "learning_rate": 5.839416058394161e-07, "loss": 1.4882, "step": 8 }, { "epoch": 0.023017902813299233, "grad_norm": 2.883080870578686, "learning_rate": 6.569343065693432e-07, "loss": 1.5219, "step": 9 }, { "epoch": 0.02557544757033248, "grad_norm": 2.8912016510708844, "learning_rate": 7.299270072992701e-07, "loss": 1.5149, "step": 10 }, { "epoch": 0.028132992327365727, "grad_norm": 2.8525137837011734, "learning_rate": 8.029197080291971e-07, "loss": 1.5065, "step": 11 }, { "epoch": 0.030690537084398978, "grad_norm": 2.6980401328828734, "learning_rate": 8.759124087591242e-07, "loss": 1.47, "step": 12 }, { "epoch": 0.03324808184143223, "grad_norm": 2.6499759522230795, "learning_rate": 9.489051094890511e-07, "loss": 1.5126, "step": 13 }, { "epoch": 0.03580562659846547, "grad_norm": 2.646192888612826, "learning_rate": 1.0218978102189781e-06, "loss": 1.4605, "step": 14 }, { "epoch": 0.03836317135549872, "grad_norm": 2.584050631976731, "learning_rate": 1.0948905109489052e-06, "loss": 1.4985, "step": 15 }, { "epoch": 0.04092071611253197, "grad_norm": 2.3627571129305425, "learning_rate": 1.1678832116788322e-06, "loss": 1.4523, "step": 16 }, { "epoch": 0.043478260869565216, "grad_norm": 2.052553239445229, "learning_rate": 1.2408759124087592e-06, "loss": 1.4734, "step": 17 }, { "epoch": 0.04603580562659847, "grad_norm": 2.0014770644457442, "learning_rate": 1.3138686131386864e-06, "loss": 1.479, "step": 18 }, { "epoch": 0.04859335038363171, "grad_norm": 1.9847838678835794, "learning_rate": 1.3868613138686132e-06, "loss": 1.4702, "step": 19 }, { "epoch": 0.05115089514066496, "grad_norm": 1.9111274600693329, "learning_rate": 1.4598540145985402e-06, "loss": 1.4617, "step": 20 }, { "epoch": 0.05370843989769821, "grad_norm": 1.870897574722989, "learning_rate": 1.5328467153284674e-06, "loss": 1.4463, "step": 21 }, { "epoch": 0.056265984654731455, "grad_norm": 1.4296640142109796, "learning_rate": 1.6058394160583942e-06, "loss": 1.4599, "step": 22 }, { "epoch": 0.058823529411764705, "grad_norm": 1.4790607914654283, "learning_rate": 1.6788321167883212e-06, "loss": 1.4157, "step": 23 }, { "epoch": 0.061381074168797956, "grad_norm": 1.6141927865863235, "learning_rate": 1.7518248175182485e-06, "loss": 1.4439, "step": 24 }, { "epoch": 0.0639386189258312, "grad_norm": 1.599753856171314, "learning_rate": 1.8248175182481753e-06, "loss": 1.4218, "step": 25 }, { "epoch": 0.06649616368286446, "grad_norm": 1.4847704184111228, "learning_rate": 1.8978102189781023e-06, "loss": 1.4269, "step": 26 }, { "epoch": 0.06905370843989769, "grad_norm": 1.3521166489305316, "learning_rate": 1.9708029197080293e-06, "loss": 1.4158, "step": 27 }, { "epoch": 0.07161125319693094, "grad_norm": 1.2579545228076663, "learning_rate": 2.0437956204379563e-06, "loss": 1.4405, "step": 28 }, { "epoch": 0.0741687979539642, "grad_norm": 1.009619956209423, "learning_rate": 2.1167883211678833e-06, "loss": 1.4151, "step": 29 }, { "epoch": 0.07672634271099744, "grad_norm": 1.1838282966029092, "learning_rate": 2.1897810218978103e-06, "loss": 1.419, "step": 30 }, { "epoch": 0.0792838874680307, "grad_norm": 1.2384598412642265, "learning_rate": 2.2627737226277373e-06, "loss": 1.412, "step": 31 }, { "epoch": 0.08184143222506395, "grad_norm": 1.1754182466507677, "learning_rate": 2.3357664233576643e-06, "loss": 1.3866, "step": 32 }, { "epoch": 0.08439897698209718, "grad_norm": 1.0614055850869524, "learning_rate": 2.4087591240875918e-06, "loss": 1.4127, "step": 33 }, { "epoch": 0.08695652173913043, "grad_norm": 1.0576160445761484, "learning_rate": 2.4817518248175183e-06, "loss": 1.4281, "step": 34 }, { "epoch": 0.08951406649616368, "grad_norm": 1.0117252925259892, "learning_rate": 2.5547445255474458e-06, "loss": 1.3731, "step": 35 }, { "epoch": 0.09207161125319693, "grad_norm": 0.9022593000895403, "learning_rate": 2.627737226277373e-06, "loss": 1.3866, "step": 36 }, { "epoch": 0.09462915601023018, "grad_norm": 0.8340755212001483, "learning_rate": 2.7007299270072994e-06, "loss": 1.4026, "step": 37 }, { "epoch": 0.09718670076726342, "grad_norm": 0.7261384916519003, "learning_rate": 2.7737226277372264e-06, "loss": 1.372, "step": 38 }, { "epoch": 0.09974424552429667, "grad_norm": 0.6484685338282444, "learning_rate": 2.8467153284671534e-06, "loss": 1.3914, "step": 39 }, { "epoch": 0.10230179028132992, "grad_norm": 0.5852202685330168, "learning_rate": 2.9197080291970804e-06, "loss": 1.328, "step": 40 }, { "epoch": 0.10485933503836317, "grad_norm": 0.7534890308070339, "learning_rate": 2.992700729927008e-06, "loss": 1.3525, "step": 41 }, { "epoch": 0.10741687979539642, "grad_norm": 0.851146761403294, "learning_rate": 3.065693430656935e-06, "loss": 1.3478, "step": 42 }, { "epoch": 0.10997442455242967, "grad_norm": 0.7827817647570426, "learning_rate": 3.1386861313868614e-06, "loss": 1.3191, "step": 43 }, { "epoch": 0.11253196930946291, "grad_norm": 0.664689408470926, "learning_rate": 3.2116788321167884e-06, "loss": 1.3222, "step": 44 }, { "epoch": 0.11508951406649616, "grad_norm": 0.5490554622557167, "learning_rate": 3.2846715328467155e-06, "loss": 1.3238, "step": 45 }, { "epoch": 0.11764705882352941, "grad_norm": 0.5108750790400686, "learning_rate": 3.3576642335766425e-06, "loss": 1.3436, "step": 46 }, { "epoch": 0.12020460358056266, "grad_norm": 0.5445952611951665, "learning_rate": 3.43065693430657e-06, "loss": 1.3458, "step": 47 }, { "epoch": 0.12276214833759591, "grad_norm": 0.5697581064671751, "learning_rate": 3.503649635036497e-06, "loss": 1.3132, "step": 48 }, { "epoch": 0.12531969309462915, "grad_norm": 0.578411430323597, "learning_rate": 3.576642335766424e-06, "loss": 1.3268, "step": 49 }, { "epoch": 0.1278772378516624, "grad_norm": 0.5601792557806415, "learning_rate": 3.6496350364963505e-06, "loss": 1.2966, "step": 50 }, { "epoch": 0.13043478260869565, "grad_norm": 0.5306373264311374, "learning_rate": 3.7226277372262775e-06, "loss": 1.3004, "step": 51 }, { "epoch": 0.1329923273657289, "grad_norm": 0.4661660429983145, "learning_rate": 3.7956204379562045e-06, "loss": 1.2812, "step": 52 }, { "epoch": 0.13554987212276215, "grad_norm": 0.42244352277225405, "learning_rate": 3.868613138686132e-06, "loss": 1.2774, "step": 53 }, { "epoch": 0.13810741687979539, "grad_norm": 0.39129018686480066, "learning_rate": 3.9416058394160585e-06, "loss": 1.3168, "step": 54 }, { "epoch": 0.14066496163682865, "grad_norm": 0.3485115346190062, "learning_rate": 4.014598540145986e-06, "loss": 1.3283, "step": 55 }, { "epoch": 0.1432225063938619, "grad_norm": 0.3976730412907507, "learning_rate": 4.0875912408759126e-06, "loss": 1.3135, "step": 56 }, { "epoch": 0.14578005115089515, "grad_norm": 0.4153119646875293, "learning_rate": 4.16058394160584e-06, "loss": 1.2989, "step": 57 }, { "epoch": 0.1483375959079284, "grad_norm": 0.42065859451204163, "learning_rate": 4.233576642335767e-06, "loss": 1.3137, "step": 58 }, { "epoch": 0.15089514066496162, "grad_norm": 0.35014086468112804, "learning_rate": 4.306569343065693e-06, "loss": 1.2743, "step": 59 }, { "epoch": 0.1534526854219949, "grad_norm": 0.32228235531527744, "learning_rate": 4.379562043795621e-06, "loss": 1.2987, "step": 60 }, { "epoch": 0.15601023017902813, "grad_norm": 0.33710245284823415, "learning_rate": 4.452554744525548e-06, "loss": 1.2869, "step": 61 }, { "epoch": 0.1585677749360614, "grad_norm": 0.34426470471374965, "learning_rate": 4.525547445255475e-06, "loss": 1.3199, "step": 62 }, { "epoch": 0.16112531969309463, "grad_norm": 0.334431341569014, "learning_rate": 4.598540145985402e-06, "loss": 1.2972, "step": 63 }, { "epoch": 0.1636828644501279, "grad_norm": 0.33024914298061436, "learning_rate": 4.671532846715329e-06, "loss": 1.2928, "step": 64 }, { "epoch": 0.16624040920716113, "grad_norm": 0.3058316278280544, "learning_rate": 4.744525547445255e-06, "loss": 1.2861, "step": 65 }, { "epoch": 0.16879795396419436, "grad_norm": 0.292869194083437, "learning_rate": 4.8175182481751835e-06, "loss": 1.2461, "step": 66 }, { "epoch": 0.17135549872122763, "grad_norm": 0.24971695111221698, "learning_rate": 4.89051094890511e-06, "loss": 1.2661, "step": 67 }, { "epoch": 0.17391304347826086, "grad_norm": 0.26954765363549843, "learning_rate": 4.963503649635037e-06, "loss": 1.2467, "step": 68 }, { "epoch": 0.17647058823529413, "grad_norm": 0.25356010222488795, "learning_rate": 5.036496350364964e-06, "loss": 1.2303, "step": 69 }, { "epoch": 0.17902813299232737, "grad_norm": 0.2339589024717998, "learning_rate": 5.1094890510948916e-06, "loss": 1.2399, "step": 70 }, { "epoch": 0.1815856777493606, "grad_norm": 0.22823462929167784, "learning_rate": 5.182481751824818e-06, "loss": 1.2498, "step": 71 }, { "epoch": 0.18414322250639387, "grad_norm": 0.24948571250389207, "learning_rate": 5.255474452554746e-06, "loss": 1.2643, "step": 72 }, { "epoch": 0.1867007672634271, "grad_norm": 0.2298632960982471, "learning_rate": 5.328467153284672e-06, "loss": 1.2958, "step": 73 }, { "epoch": 0.18925831202046037, "grad_norm": 0.22223759951095107, "learning_rate": 5.401459854014599e-06, "loss": 1.2422, "step": 74 }, { "epoch": 0.1918158567774936, "grad_norm": 0.23124679789968172, "learning_rate": 5.474452554744526e-06, "loss": 1.2407, "step": 75 }, { "epoch": 0.19437340153452684, "grad_norm": 0.2221181062125986, "learning_rate": 5.547445255474453e-06, "loss": 1.2456, "step": 76 }, { "epoch": 0.1969309462915601, "grad_norm": 0.1998449044080008, "learning_rate": 5.62043795620438e-06, "loss": 1.2514, "step": 77 }, { "epoch": 0.19948849104859334, "grad_norm": 0.19727362882566524, "learning_rate": 5.693430656934307e-06, "loss": 1.2335, "step": 78 }, { "epoch": 0.2020460358056266, "grad_norm": 0.20659124094509168, "learning_rate": 5.766423357664233e-06, "loss": 1.2276, "step": 79 }, { "epoch": 0.20460358056265984, "grad_norm": 0.22959713985782182, "learning_rate": 5.839416058394161e-06, "loss": 1.2435, "step": 80 }, { "epoch": 0.2071611253196931, "grad_norm": 0.19904222253631854, "learning_rate": 5.912408759124088e-06, "loss": 1.2266, "step": 81 }, { "epoch": 0.20971867007672634, "grad_norm": 0.19344151897086864, "learning_rate": 5.985401459854016e-06, "loss": 1.2261, "step": 82 }, { "epoch": 0.21227621483375958, "grad_norm": 0.19302417663791685, "learning_rate": 6.058394160583942e-06, "loss": 1.2384, "step": 83 }, { "epoch": 0.21483375959079284, "grad_norm": 0.21396454463521547, "learning_rate": 6.13138686131387e-06, "loss": 1.235, "step": 84 }, { "epoch": 0.21739130434782608, "grad_norm": 0.1913859035516872, "learning_rate": 6.204379562043796e-06, "loss": 1.2838, "step": 85 }, { "epoch": 0.21994884910485935, "grad_norm": 0.17510278677847252, "learning_rate": 6.277372262773723e-06, "loss": 1.2358, "step": 86 }, { "epoch": 0.22250639386189258, "grad_norm": 0.19863525132725016, "learning_rate": 6.35036496350365e-06, "loss": 1.2419, "step": 87 }, { "epoch": 0.22506393861892582, "grad_norm": 0.19478563516185365, "learning_rate": 6.423357664233577e-06, "loss": 1.2641, "step": 88 }, { "epoch": 0.22762148337595908, "grad_norm": 0.17875499154062388, "learning_rate": 6.496350364963504e-06, "loss": 1.2239, "step": 89 }, { "epoch": 0.23017902813299232, "grad_norm": 0.1751251099110654, "learning_rate": 6.569343065693431e-06, "loss": 1.2524, "step": 90 }, { "epoch": 0.23273657289002558, "grad_norm": 0.1869390091762672, "learning_rate": 6.6423357664233575e-06, "loss": 1.2494, "step": 91 }, { "epoch": 0.23529411764705882, "grad_norm": 0.17676974553290642, "learning_rate": 6.715328467153285e-06, "loss": 1.2537, "step": 92 }, { "epoch": 0.23785166240409208, "grad_norm": 0.1806189007928041, "learning_rate": 6.7883211678832115e-06, "loss": 1.2349, "step": 93 }, { "epoch": 0.24040920716112532, "grad_norm": 0.18193990233718968, "learning_rate": 6.86131386861314e-06, "loss": 1.2583, "step": 94 }, { "epoch": 0.24296675191815856, "grad_norm": 0.19012671201766562, "learning_rate": 6.934306569343066e-06, "loss": 1.2029, "step": 95 }, { "epoch": 0.24552429667519182, "grad_norm": 0.16857838785815454, "learning_rate": 7.007299270072994e-06, "loss": 1.2423, "step": 96 }, { "epoch": 0.24808184143222506, "grad_norm": 0.18952785901605423, "learning_rate": 7.08029197080292e-06, "loss": 1.2394, "step": 97 }, { "epoch": 0.2506393861892583, "grad_norm": 0.18078294692872968, "learning_rate": 7.153284671532848e-06, "loss": 1.2122, "step": 98 }, { "epoch": 0.2531969309462916, "grad_norm": 0.17487368586515217, "learning_rate": 7.2262773722627744e-06, "loss": 1.2117, "step": 99 }, { "epoch": 0.2557544757033248, "grad_norm": 0.17732077203789362, "learning_rate": 7.299270072992701e-06, "loss": 1.2041, "step": 100 }, { "epoch": 0.25831202046035806, "grad_norm": 0.18421840800752218, "learning_rate": 7.3722627737226285e-06, "loss": 1.2231, "step": 101 }, { "epoch": 0.2608695652173913, "grad_norm": 0.1768000076239069, "learning_rate": 7.445255474452555e-06, "loss": 1.2325, "step": 102 }, { "epoch": 0.26342710997442453, "grad_norm": 0.16984854034130697, "learning_rate": 7.5182481751824825e-06, "loss": 1.2026, "step": 103 }, { "epoch": 0.2659846547314578, "grad_norm": 0.16277787684968492, "learning_rate": 7.591240875912409e-06, "loss": 1.193, "step": 104 }, { "epoch": 0.26854219948849106, "grad_norm": 0.17357111549131551, "learning_rate": 7.664233576642336e-06, "loss": 1.2009, "step": 105 }, { "epoch": 0.2710997442455243, "grad_norm": 0.1800163972852127, "learning_rate": 7.737226277372264e-06, "loss": 1.1909, "step": 106 }, { "epoch": 0.27365728900255754, "grad_norm": 0.1681574320113801, "learning_rate": 7.810218978102191e-06, "loss": 1.2194, "step": 107 }, { "epoch": 0.27621483375959077, "grad_norm": 0.16885285400717157, "learning_rate": 7.883211678832117e-06, "loss": 1.1985, "step": 108 }, { "epoch": 0.27877237851662406, "grad_norm": 0.17914067468814437, "learning_rate": 7.956204379562045e-06, "loss": 1.2218, "step": 109 }, { "epoch": 0.2813299232736573, "grad_norm": 0.16706925568533235, "learning_rate": 8.029197080291972e-06, "loss": 1.222, "step": 110 }, { "epoch": 0.28388746803069054, "grad_norm": 0.1641264132835115, "learning_rate": 8.1021897810219e-06, "loss": 1.2242, "step": 111 }, { "epoch": 0.2864450127877238, "grad_norm": 0.18443514799994437, "learning_rate": 8.175182481751825e-06, "loss": 1.2118, "step": 112 }, { "epoch": 0.289002557544757, "grad_norm": 0.17675822272527503, "learning_rate": 8.248175182481753e-06, "loss": 1.1849, "step": 113 }, { "epoch": 0.2915601023017903, "grad_norm": 0.1880451995042565, "learning_rate": 8.32116788321168e-06, "loss": 1.2103, "step": 114 }, { "epoch": 0.29411764705882354, "grad_norm": 0.16598375442205784, "learning_rate": 8.394160583941606e-06, "loss": 1.1937, "step": 115 }, { "epoch": 0.2966751918158568, "grad_norm": 0.190898911263414, "learning_rate": 8.467153284671533e-06, "loss": 1.2028, "step": 116 }, { "epoch": 0.29923273657289, "grad_norm": 0.18881369445042054, "learning_rate": 8.54014598540146e-06, "loss": 1.1976, "step": 117 }, { "epoch": 0.30179028132992325, "grad_norm": 0.20907040258575316, "learning_rate": 8.613138686131386e-06, "loss": 1.2476, "step": 118 }, { "epoch": 0.30434782608695654, "grad_norm": 0.1704000017989476, "learning_rate": 8.686131386861315e-06, "loss": 1.2087, "step": 119 }, { "epoch": 0.3069053708439898, "grad_norm": 0.19455649517228424, "learning_rate": 8.759124087591241e-06, "loss": 1.214, "step": 120 }, { "epoch": 0.309462915601023, "grad_norm": 0.18574238206663096, "learning_rate": 8.832116788321169e-06, "loss": 1.2276, "step": 121 }, { "epoch": 0.31202046035805625, "grad_norm": 0.19290426166252228, "learning_rate": 8.905109489051096e-06, "loss": 1.1805, "step": 122 }, { "epoch": 0.3145780051150895, "grad_norm": 0.1995598501375803, "learning_rate": 8.978102189781024e-06, "loss": 1.2007, "step": 123 }, { "epoch": 0.3171355498721228, "grad_norm": 0.17673439222358, "learning_rate": 9.05109489051095e-06, "loss": 1.1966, "step": 124 }, { "epoch": 0.319693094629156, "grad_norm": 0.1966681987874607, "learning_rate": 9.124087591240877e-06, "loss": 1.1739, "step": 125 }, { "epoch": 0.32225063938618925, "grad_norm": 0.20745524723498263, "learning_rate": 9.197080291970804e-06, "loss": 1.2309, "step": 126 }, { "epoch": 0.3248081841432225, "grad_norm": 0.20371417264487574, "learning_rate": 9.27007299270073e-06, "loss": 1.1718, "step": 127 }, { "epoch": 0.3273657289002558, "grad_norm": 0.20142192992356361, "learning_rate": 9.343065693430657e-06, "loss": 1.1981, "step": 128 }, { "epoch": 0.329923273657289, "grad_norm": 0.18157695452516256, "learning_rate": 9.416058394160585e-06, "loss": 1.187, "step": 129 }, { "epoch": 0.33248081841432225, "grad_norm": 0.18405529622418393, "learning_rate": 9.48905109489051e-06, "loss": 1.2154, "step": 130 }, { "epoch": 0.3350383631713555, "grad_norm": 0.18826966568044085, "learning_rate": 9.56204379562044e-06, "loss": 1.1823, "step": 131 }, { "epoch": 0.3375959079283887, "grad_norm": 0.17870276101242044, "learning_rate": 9.635036496350367e-06, "loss": 1.2399, "step": 132 }, { "epoch": 0.340153452685422, "grad_norm": 0.18386831261657108, "learning_rate": 9.708029197080293e-06, "loss": 1.2114, "step": 133 }, { "epoch": 0.34271099744245526, "grad_norm": 0.1795896309939293, "learning_rate": 9.78102189781022e-06, "loss": 1.1832, "step": 134 }, { "epoch": 0.3452685421994885, "grad_norm": 0.21827425129513728, "learning_rate": 9.854014598540148e-06, "loss": 1.2389, "step": 135 }, { "epoch": 0.34782608695652173, "grad_norm": 0.1768309026825683, "learning_rate": 9.927007299270073e-06, "loss": 1.1965, "step": 136 }, { "epoch": 0.35038363171355497, "grad_norm": 0.20302569863881262, "learning_rate": 1e-05, "loss": 1.2094, "step": 137 }, { "epoch": 0.35294117647058826, "grad_norm": 0.19427846203063504, "learning_rate": 1.0072992700729928e-05, "loss": 1.1974, "step": 138 }, { "epoch": 0.3554987212276215, "grad_norm": 0.17339331224519358, "learning_rate": 1.0145985401459854e-05, "loss": 1.1736, "step": 139 }, { "epoch": 0.35805626598465473, "grad_norm": 0.2466539718194467, "learning_rate": 1.0218978102189783e-05, "loss": 1.2279, "step": 140 }, { "epoch": 0.36061381074168797, "grad_norm": 0.21241110450455392, "learning_rate": 1.0291970802919709e-05, "loss": 1.1409, "step": 141 }, { "epoch": 0.3631713554987212, "grad_norm": 0.18293508498426997, "learning_rate": 1.0364963503649636e-05, "loss": 1.1957, "step": 142 }, { "epoch": 0.3657289002557545, "grad_norm": 0.19790775478208397, "learning_rate": 1.0437956204379562e-05, "loss": 1.2193, "step": 143 }, { "epoch": 0.36828644501278773, "grad_norm": 0.20929660856991877, "learning_rate": 1.0510948905109491e-05, "loss": 1.1866, "step": 144 }, { "epoch": 0.37084398976982097, "grad_norm": 0.1926018989518869, "learning_rate": 1.0583941605839417e-05, "loss": 1.2015, "step": 145 }, { "epoch": 0.3734015345268542, "grad_norm": 0.19192914492955238, "learning_rate": 1.0656934306569344e-05, "loss": 1.1886, "step": 146 }, { "epoch": 0.37595907928388744, "grad_norm": 0.20322534422512073, "learning_rate": 1.072992700729927e-05, "loss": 1.2199, "step": 147 }, { "epoch": 0.37851662404092073, "grad_norm": 0.18947938981971202, "learning_rate": 1.0802919708029198e-05, "loss": 1.1829, "step": 148 }, { "epoch": 0.38107416879795397, "grad_norm": 0.2154696847726249, "learning_rate": 1.0875912408759123e-05, "loss": 1.1655, "step": 149 }, { "epoch": 0.3836317135549872, "grad_norm": 0.20859256059256231, "learning_rate": 1.0948905109489052e-05, "loss": 1.1815, "step": 150 }, { "epoch": 0.38618925831202044, "grad_norm": 0.20565139563521717, "learning_rate": 1.102189781021898e-05, "loss": 1.1848, "step": 151 }, { "epoch": 0.3887468030690537, "grad_norm": 0.21340531513272162, "learning_rate": 1.1094890510948906e-05, "loss": 1.188, "step": 152 }, { "epoch": 0.391304347826087, "grad_norm": 0.22952365545919354, "learning_rate": 1.1167883211678833e-05, "loss": 1.1772, "step": 153 }, { "epoch": 0.3938618925831202, "grad_norm": 0.21489457470648385, "learning_rate": 1.124087591240876e-05, "loss": 1.1807, "step": 154 }, { "epoch": 0.39641943734015345, "grad_norm": 0.22932079381688553, "learning_rate": 1.1313868613138688e-05, "loss": 1.1949, "step": 155 }, { "epoch": 0.3989769820971867, "grad_norm": 0.23209900752946952, "learning_rate": 1.1386861313868614e-05, "loss": 1.1996, "step": 156 }, { "epoch": 0.40153452685422, "grad_norm": 0.22388173844283388, "learning_rate": 1.1459854014598541e-05, "loss": 1.2097, "step": 157 }, { "epoch": 0.4040920716112532, "grad_norm": 0.21380373488801446, "learning_rate": 1.1532846715328467e-05, "loss": 1.2082, "step": 158 }, { "epoch": 0.40664961636828645, "grad_norm": 0.21817873889647327, "learning_rate": 1.1605839416058396e-05, "loss": 1.1586, "step": 159 }, { "epoch": 0.4092071611253197, "grad_norm": 0.2450535248536084, "learning_rate": 1.1678832116788322e-05, "loss": 1.1765, "step": 160 }, { "epoch": 0.4117647058823529, "grad_norm": 0.24576894425899287, "learning_rate": 1.1751824817518249e-05, "loss": 1.1701, "step": 161 }, { "epoch": 0.4143222506393862, "grad_norm": 0.2781533359151788, "learning_rate": 1.1824817518248176e-05, "loss": 1.1686, "step": 162 }, { "epoch": 0.41687979539641945, "grad_norm": 0.23249844406377174, "learning_rate": 1.1897810218978102e-05, "loss": 1.169, "step": 163 }, { "epoch": 0.4194373401534527, "grad_norm": 0.2425823032194627, "learning_rate": 1.1970802919708031e-05, "loss": 1.1821, "step": 164 }, { "epoch": 0.4219948849104859, "grad_norm": 0.18932993548929591, "learning_rate": 1.2043795620437957e-05, "loss": 1.1538, "step": 165 }, { "epoch": 0.42455242966751916, "grad_norm": 0.2884159065917926, "learning_rate": 1.2116788321167885e-05, "loss": 1.1787, "step": 166 }, { "epoch": 0.42710997442455245, "grad_norm": 0.2667378207082784, "learning_rate": 1.218978102189781e-05, "loss": 1.1774, "step": 167 }, { "epoch": 0.4296675191815857, "grad_norm": 0.24644746723371008, "learning_rate": 1.226277372262774e-05, "loss": 1.1823, "step": 168 }, { "epoch": 0.4322250639386189, "grad_norm": 0.3049603900188157, "learning_rate": 1.2335766423357665e-05, "loss": 1.1808, "step": 169 }, { "epoch": 0.43478260869565216, "grad_norm": 0.24091240924103605, "learning_rate": 1.2408759124087593e-05, "loss": 1.1646, "step": 170 }, { "epoch": 0.4373401534526854, "grad_norm": 0.31462619972433453, "learning_rate": 1.2481751824817518e-05, "loss": 1.1742, "step": 171 }, { "epoch": 0.4398976982097187, "grad_norm": 0.25976500149808457, "learning_rate": 1.2554744525547446e-05, "loss": 1.1741, "step": 172 }, { "epoch": 0.4424552429667519, "grad_norm": 0.22869248627416927, "learning_rate": 1.2627737226277371e-05, "loss": 1.1927, "step": 173 }, { "epoch": 0.44501278772378516, "grad_norm": 0.27204853892769404, "learning_rate": 1.27007299270073e-05, "loss": 1.199, "step": 174 }, { "epoch": 0.4475703324808184, "grad_norm": 0.22922656795364751, "learning_rate": 1.2773722627737228e-05, "loss": 1.1742, "step": 175 }, { "epoch": 0.45012787723785164, "grad_norm": 0.3018012418428905, "learning_rate": 1.2846715328467154e-05, "loss": 1.2027, "step": 176 }, { "epoch": 0.45268542199488493, "grad_norm": 0.2578612414340434, "learning_rate": 1.2919708029197083e-05, "loss": 1.1757, "step": 177 }, { "epoch": 0.45524296675191817, "grad_norm": 0.25636745613132944, "learning_rate": 1.2992700729927009e-05, "loss": 1.1716, "step": 178 }, { "epoch": 0.4578005115089514, "grad_norm": 0.2715386790093217, "learning_rate": 1.3065693430656936e-05, "loss": 1.1583, "step": 179 }, { "epoch": 0.46035805626598464, "grad_norm": 0.2891675384844315, "learning_rate": 1.3138686131386862e-05, "loss": 1.1657, "step": 180 }, { "epoch": 0.4629156010230179, "grad_norm": 0.23385863111978508, "learning_rate": 1.321167883211679e-05, "loss": 1.1922, "step": 181 }, { "epoch": 0.46547314578005117, "grad_norm": 0.22994123507129197, "learning_rate": 1.3284671532846715e-05, "loss": 1.1717, "step": 182 }, { "epoch": 0.4680306905370844, "grad_norm": 0.23612727422353394, "learning_rate": 1.3357664233576644e-05, "loss": 1.1801, "step": 183 }, { "epoch": 0.47058823529411764, "grad_norm": 0.2069010463077349, "learning_rate": 1.343065693430657e-05, "loss": 1.177, "step": 184 }, { "epoch": 0.4731457800511509, "grad_norm": 0.2588170825534718, "learning_rate": 1.3503649635036497e-05, "loss": 1.1808, "step": 185 }, { "epoch": 0.47570332480818417, "grad_norm": 0.2157790774775731, "learning_rate": 1.3576642335766423e-05, "loss": 1.1821, "step": 186 }, { "epoch": 0.4782608695652174, "grad_norm": 0.23223470081294634, "learning_rate": 1.3649635036496352e-05, "loss": 1.1615, "step": 187 }, { "epoch": 0.48081841432225064, "grad_norm": 0.21725466354040374, "learning_rate": 1.372262773722628e-05, "loss": 1.1912, "step": 188 }, { "epoch": 0.4833759590792839, "grad_norm": 0.211538836700456, "learning_rate": 1.3795620437956205e-05, "loss": 1.1678, "step": 189 }, { "epoch": 0.4859335038363171, "grad_norm": 0.25537726955126566, "learning_rate": 1.3868613138686133e-05, "loss": 1.1745, "step": 190 }, { "epoch": 0.4884910485933504, "grad_norm": 0.28371208474889603, "learning_rate": 1.3941605839416059e-05, "loss": 1.1193, "step": 191 }, { "epoch": 0.49104859335038364, "grad_norm": 0.26303907455029885, "learning_rate": 1.4014598540145988e-05, "loss": 1.1622, "step": 192 }, { "epoch": 0.4936061381074169, "grad_norm": 0.2799114044156544, "learning_rate": 1.4087591240875913e-05, "loss": 1.136, "step": 193 }, { "epoch": 0.4961636828644501, "grad_norm": 0.24139333187754325, "learning_rate": 1.416058394160584e-05, "loss": 1.1306, "step": 194 }, { "epoch": 0.49872122762148335, "grad_norm": 0.2793729959544077, "learning_rate": 1.4233576642335767e-05, "loss": 1.2086, "step": 195 }, { "epoch": 0.5012787723785166, "grad_norm": 0.27570376951402886, "learning_rate": 1.4306569343065696e-05, "loss": 1.1628, "step": 196 }, { "epoch": 0.5038363171355499, "grad_norm": 0.32786685913286884, "learning_rate": 1.4379562043795621e-05, "loss": 1.1518, "step": 197 }, { "epoch": 0.5063938618925832, "grad_norm": 0.45385237120867455, "learning_rate": 1.4452554744525549e-05, "loss": 1.1856, "step": 198 }, { "epoch": 0.5089514066496164, "grad_norm": 0.41272427110721904, "learning_rate": 1.4525547445255475e-05, "loss": 1.1483, "step": 199 }, { "epoch": 0.5115089514066496, "grad_norm": 0.2841480764999212, "learning_rate": 1.4598540145985402e-05, "loss": 1.1629, "step": 200 }, { "epoch": 0.5140664961636828, "grad_norm": 0.27714909479279093, "learning_rate": 1.4671532846715331e-05, "loss": 1.1442, "step": 201 }, { "epoch": 0.5166240409207161, "grad_norm": 0.403242161588326, "learning_rate": 1.4744525547445257e-05, "loss": 1.1385, "step": 202 }, { "epoch": 0.5191815856777494, "grad_norm": 0.337013121025594, "learning_rate": 1.4817518248175184e-05, "loss": 1.171, "step": 203 }, { "epoch": 0.5217391304347826, "grad_norm": 0.4040109170859878, "learning_rate": 1.489051094890511e-05, "loss": 1.1418, "step": 204 }, { "epoch": 0.5242966751918159, "grad_norm": 0.48665453956547733, "learning_rate": 1.4963503649635038e-05, "loss": 1.164, "step": 205 }, { "epoch": 0.5268542199488491, "grad_norm": 0.24722444184837292, "learning_rate": 1.5036496350364965e-05, "loss": 1.1535, "step": 206 }, { "epoch": 0.5294117647058824, "grad_norm": 0.329077822667812, "learning_rate": 1.5109489051094892e-05, "loss": 1.1704, "step": 207 }, { "epoch": 0.5319693094629157, "grad_norm": 0.41651469422399784, "learning_rate": 1.5182481751824818e-05, "loss": 1.1559, "step": 208 }, { "epoch": 0.5345268542199488, "grad_norm": 0.32960667919190284, "learning_rate": 1.5255474452554746e-05, "loss": 1.1495, "step": 209 }, { "epoch": 0.5370843989769821, "grad_norm": 0.4781321369544006, "learning_rate": 1.5328467153284673e-05, "loss": 1.1387, "step": 210 }, { "epoch": 0.5396419437340153, "grad_norm": 0.43671817015361414, "learning_rate": 1.54014598540146e-05, "loss": 1.1607, "step": 211 }, { "epoch": 0.5421994884910486, "grad_norm": 0.32190848339790007, "learning_rate": 1.5474452554744528e-05, "loss": 1.1286, "step": 212 }, { "epoch": 0.5447570332480819, "grad_norm": 0.28497016967310845, "learning_rate": 1.5547445255474454e-05, "loss": 1.1701, "step": 213 }, { "epoch": 0.5473145780051151, "grad_norm": 0.30316718930544045, "learning_rate": 1.5620437956204383e-05, "loss": 1.1236, "step": 214 }, { "epoch": 0.5498721227621484, "grad_norm": 0.26835985072996216, "learning_rate": 1.569343065693431e-05, "loss": 1.1289, "step": 215 }, { "epoch": 0.5524296675191815, "grad_norm": 0.3009095238514411, "learning_rate": 1.5766423357664234e-05, "loss": 1.1636, "step": 216 }, { "epoch": 0.5549872122762148, "grad_norm": 0.3065933942839116, "learning_rate": 1.583941605839416e-05, "loss": 1.1368, "step": 217 }, { "epoch": 0.5575447570332481, "grad_norm": 0.26109719009183135, "learning_rate": 1.591240875912409e-05, "loss": 1.1077, "step": 218 }, { "epoch": 0.5601023017902813, "grad_norm": 0.3164778738084223, "learning_rate": 1.5985401459854015e-05, "loss": 1.1333, "step": 219 }, { "epoch": 0.5626598465473146, "grad_norm": 0.35400248747839075, "learning_rate": 1.6058394160583944e-05, "loss": 1.1865, "step": 220 }, { "epoch": 0.5652173913043478, "grad_norm": 0.28805686893200677, "learning_rate": 1.613138686131387e-05, "loss": 1.1293, "step": 221 }, { "epoch": 0.5677749360613811, "grad_norm": 0.30523736515745126, "learning_rate": 1.62043795620438e-05, "loss": 1.1296, "step": 222 }, { "epoch": 0.5703324808184144, "grad_norm": 0.4190076909483638, "learning_rate": 1.6277372262773725e-05, "loss": 1.1344, "step": 223 }, { "epoch": 0.5728900255754475, "grad_norm": 0.42243425644304494, "learning_rate": 1.635036496350365e-05, "loss": 1.1665, "step": 224 }, { "epoch": 0.5754475703324808, "grad_norm": 0.33398927440080145, "learning_rate": 1.642335766423358e-05, "loss": 1.1616, "step": 225 }, { "epoch": 0.578005115089514, "grad_norm": 0.31042126932738984, "learning_rate": 1.6496350364963505e-05, "loss": 1.1346, "step": 226 }, { "epoch": 0.5805626598465473, "grad_norm": 0.4022933069927679, "learning_rate": 1.6569343065693434e-05, "loss": 1.1474, "step": 227 }, { "epoch": 0.5831202046035806, "grad_norm": 0.34778708873533665, "learning_rate": 1.664233576642336e-05, "loss": 1.1328, "step": 228 }, { "epoch": 0.5856777493606138, "grad_norm": 0.35235801712692716, "learning_rate": 1.6715328467153286e-05, "loss": 1.1507, "step": 229 }, { "epoch": 0.5882352941176471, "grad_norm": 0.3378264430318775, "learning_rate": 1.678832116788321e-05, "loss": 1.1556, "step": 230 }, { "epoch": 0.5907928388746803, "grad_norm": 0.3260621828817585, "learning_rate": 1.686131386861314e-05, "loss": 1.152, "step": 231 }, { "epoch": 0.5933503836317136, "grad_norm": 0.39226471807556507, "learning_rate": 1.6934306569343066e-05, "loss": 1.1398, "step": 232 }, { "epoch": 0.5959079283887468, "grad_norm": 0.4562478952465355, "learning_rate": 1.7007299270072995e-05, "loss": 1.1447, "step": 233 }, { "epoch": 0.59846547314578, "grad_norm": 0.3451241092677777, "learning_rate": 1.708029197080292e-05, "loss": 1.1005, "step": 234 }, { "epoch": 0.6010230179028133, "grad_norm": 0.35647792283371854, "learning_rate": 1.7153284671532847e-05, "loss": 1.1227, "step": 235 }, { "epoch": 0.6035805626598465, "grad_norm": 0.4594520420622475, "learning_rate": 1.7226277372262773e-05, "loss": 1.1505, "step": 236 }, { "epoch": 0.6061381074168798, "grad_norm": 0.45224289985329424, "learning_rate": 1.7299270072992702e-05, "loss": 1.1308, "step": 237 }, { "epoch": 0.6086956521739131, "grad_norm": 0.40418344343634116, "learning_rate": 1.737226277372263e-05, "loss": 1.1181, "step": 238 }, { "epoch": 0.6112531969309463, "grad_norm": 0.3386408460236669, "learning_rate": 1.7445255474452557e-05, "loss": 1.1584, "step": 239 }, { "epoch": 0.6138107416879796, "grad_norm": 0.26946506842987866, "learning_rate": 1.7518248175182482e-05, "loss": 1.1264, "step": 240 }, { "epoch": 0.6163682864450127, "grad_norm": 0.36854128324837004, "learning_rate": 1.7591240875912408e-05, "loss": 1.1234, "step": 241 }, { "epoch": 0.618925831202046, "grad_norm": 0.40766745885420824, "learning_rate": 1.7664233576642337e-05, "loss": 1.1473, "step": 242 }, { "epoch": 0.6214833759590793, "grad_norm": 0.34418627419066, "learning_rate": 1.7737226277372263e-05, "loss": 1.1443, "step": 243 }, { "epoch": 0.6240409207161125, "grad_norm": 0.3132419041181749, "learning_rate": 1.7810218978102192e-05, "loss": 1.1898, "step": 244 }, { "epoch": 0.6265984654731458, "grad_norm": 0.3133703026128217, "learning_rate": 1.7883211678832118e-05, "loss": 1.1501, "step": 245 }, { "epoch": 0.629156010230179, "grad_norm": 0.3441898164827929, "learning_rate": 1.7956204379562047e-05, "loss": 1.1452, "step": 246 }, { "epoch": 0.6317135549872123, "grad_norm": 0.33750686928448953, "learning_rate": 1.8029197080291973e-05, "loss": 1.1359, "step": 247 }, { "epoch": 0.6342710997442456, "grad_norm": 0.374020584176404, "learning_rate": 1.81021897810219e-05, "loss": 1.1823, "step": 248 }, { "epoch": 0.6368286445012787, "grad_norm": 0.3514782831462071, "learning_rate": 1.8175182481751824e-05, "loss": 1.1632, "step": 249 }, { "epoch": 0.639386189258312, "grad_norm": 0.3606450922286876, "learning_rate": 1.8248175182481753e-05, "loss": 1.1409, "step": 250 }, { "epoch": 0.6419437340153452, "grad_norm": 0.261265823171208, "learning_rate": 1.8321167883211683e-05, "loss": 1.1499, "step": 251 }, { "epoch": 0.6445012787723785, "grad_norm": 0.42167995133388064, "learning_rate": 1.8394160583941608e-05, "loss": 1.154, "step": 252 }, { "epoch": 0.6470588235294118, "grad_norm": 0.3940819685714755, "learning_rate": 1.8467153284671534e-05, "loss": 1.1355, "step": 253 }, { "epoch": 0.649616368286445, "grad_norm": 0.3265578920410715, "learning_rate": 1.854014598540146e-05, "loss": 1.1874, "step": 254 }, { "epoch": 0.6521739130434783, "grad_norm": 0.39035015686633145, "learning_rate": 1.861313868613139e-05, "loss": 1.1374, "step": 255 }, { "epoch": 0.6547314578005116, "grad_norm": 0.41589276832005634, "learning_rate": 1.8686131386861315e-05, "loss": 1.1289, "step": 256 }, { "epoch": 0.6572890025575447, "grad_norm": 0.45228952583155346, "learning_rate": 1.8759124087591244e-05, "loss": 1.1646, "step": 257 }, { "epoch": 0.659846547314578, "grad_norm": 0.5348752543777668, "learning_rate": 1.883211678832117e-05, "loss": 1.1268, "step": 258 }, { "epoch": 0.6624040920716112, "grad_norm": 0.6021227056854751, "learning_rate": 1.8905109489051095e-05, "loss": 1.1593, "step": 259 }, { "epoch": 0.6649616368286445, "grad_norm": 0.5171238656799629, "learning_rate": 1.897810218978102e-05, "loss": 1.1374, "step": 260 }, { "epoch": 0.6675191815856778, "grad_norm": 0.4416261577215247, "learning_rate": 1.905109489051095e-05, "loss": 1.1093, "step": 261 }, { "epoch": 0.670076726342711, "grad_norm": 0.569218554097933, "learning_rate": 1.912408759124088e-05, "loss": 1.1232, "step": 262 }, { "epoch": 0.6726342710997443, "grad_norm": 0.6811617127901143, "learning_rate": 1.9197080291970805e-05, "loss": 1.1682, "step": 263 }, { "epoch": 0.6751918158567775, "grad_norm": 0.749600012327492, "learning_rate": 1.9270072992700734e-05, "loss": 1.1484, "step": 264 }, { "epoch": 0.6777493606138107, "grad_norm": 0.5547245978393044, "learning_rate": 1.934306569343066e-05, "loss": 1.1746, "step": 265 }, { "epoch": 0.680306905370844, "grad_norm": 0.29516123217758117, "learning_rate": 1.9416058394160586e-05, "loss": 1.1414, "step": 266 }, { "epoch": 0.6828644501278772, "grad_norm": 0.5616443320978407, "learning_rate": 1.948905109489051e-05, "loss": 1.096, "step": 267 }, { "epoch": 0.6854219948849105, "grad_norm": 0.7110950485565922, "learning_rate": 1.956204379562044e-05, "loss": 1.1383, "step": 268 }, { "epoch": 0.6879795396419437, "grad_norm": 0.5747864084575326, "learning_rate": 1.9635036496350366e-05, "loss": 1.1157, "step": 269 }, { "epoch": 0.690537084398977, "grad_norm": 0.5551996423553552, "learning_rate": 1.9708029197080295e-05, "loss": 1.1569, "step": 270 }, { "epoch": 0.6930946291560103, "grad_norm": 0.7165225607672224, "learning_rate": 1.978102189781022e-05, "loss": 1.1551, "step": 271 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7036255091082283, "learning_rate": 1.9854014598540147e-05, "loss": 1.1155, "step": 272 }, { "epoch": 0.6982097186700768, "grad_norm": 0.37416829306334026, "learning_rate": 1.9927007299270073e-05, "loss": 1.1293, "step": 273 }, { "epoch": 0.7007672634271099, "grad_norm": 0.5000491272477234, "learning_rate": 2e-05, "loss": 1.1495, "step": 274 }, { "epoch": 0.7033248081841432, "grad_norm": 0.7162752485719868, "learning_rate": 1.9999991865312627e-05, "loss": 1.1267, "step": 275 }, { "epoch": 0.7058823529411765, "grad_norm": 0.6356341002049819, "learning_rate": 1.9999967461263736e-05, "loss": 1.1469, "step": 276 }, { "epoch": 0.7084398976982097, "grad_norm": 0.46429306768513406, "learning_rate": 1.9999926787893038e-05, "loss": 1.1605, "step": 277 }, { "epoch": 0.710997442455243, "grad_norm": 0.42193730725900314, "learning_rate": 1.99998698452667e-05, "loss": 1.1291, "step": 278 }, { "epoch": 0.7135549872122762, "grad_norm": 0.45111683276082, "learning_rate": 1.999979663347736e-05, "loss": 1.1594, "step": 279 }, { "epoch": 0.7161125319693095, "grad_norm": 0.48963964069881194, "learning_rate": 1.9999707152644143e-05, "loss": 1.1245, "step": 280 }, { "epoch": 0.7186700767263428, "grad_norm": 0.4979629650586617, "learning_rate": 1.999960140291262e-05, "loss": 1.119, "step": 281 }, { "epoch": 0.7212276214833759, "grad_norm": 0.4664713962878264, "learning_rate": 1.9999479384454838e-05, "loss": 1.1468, "step": 282 }, { "epoch": 0.7237851662404092, "grad_norm": 0.3844942432737082, "learning_rate": 1.9999341097469313e-05, "loss": 1.075, "step": 283 }, { "epoch": 0.7263427109974424, "grad_norm": 0.3748435881073205, "learning_rate": 1.9999186542181038e-05, "loss": 1.1388, "step": 284 }, { "epoch": 0.7289002557544757, "grad_norm": 0.37537611839818713, "learning_rate": 1.9999015718841453e-05, "loss": 1.1204, "step": 285 }, { "epoch": 0.731457800511509, "grad_norm": 0.2604152551489964, "learning_rate": 1.9998828627728483e-05, "loss": 1.1441, "step": 286 }, { "epoch": 0.7340153452685422, "grad_norm": 0.3500229133794647, "learning_rate": 1.9998625269146515e-05, "loss": 1.1418, "step": 287 }, { "epoch": 0.7365728900255755, "grad_norm": 0.40870411685426555, "learning_rate": 1.9998405643426398e-05, "loss": 1.107, "step": 288 }, { "epoch": 0.7391304347826086, "grad_norm": 0.4142193267583776, "learning_rate": 1.999816975092545e-05, "loss": 1.1386, "step": 289 }, { "epoch": 0.7416879795396419, "grad_norm": 0.3984615533147621, "learning_rate": 1.9997917592027455e-05, "loss": 1.1478, "step": 290 }, { "epoch": 0.7442455242966752, "grad_norm": 0.33486990703650343, "learning_rate": 1.9997649167142654e-05, "loss": 1.1322, "step": 291 }, { "epoch": 0.7468030690537084, "grad_norm": 0.34307927675012156, "learning_rate": 1.9997364476707765e-05, "loss": 1.0975, "step": 292 }, { "epoch": 0.7493606138107417, "grad_norm": 0.32862273663424796, "learning_rate": 1.9997063521185956e-05, "loss": 1.1234, "step": 293 }, { "epoch": 0.7519181585677749, "grad_norm": 0.3832334389775187, "learning_rate": 1.9996746301066867e-05, "loss": 1.1204, "step": 294 }, { "epoch": 0.7544757033248082, "grad_norm": 0.37651748057590684, "learning_rate": 1.999641281686659e-05, "loss": 1.1101, "step": 295 }, { "epoch": 0.7570332480818415, "grad_norm": 0.3987512509485477, "learning_rate": 1.999606306912769e-05, "loss": 1.1182, "step": 296 }, { "epoch": 0.7595907928388747, "grad_norm": 0.3135294282014092, "learning_rate": 1.999569705841918e-05, "loss": 1.1576, "step": 297 }, { "epoch": 0.7621483375959079, "grad_norm": 0.310570536991235, "learning_rate": 1.9995314785336534e-05, "loss": 1.1329, "step": 298 }, { "epoch": 0.7647058823529411, "grad_norm": 0.28886285275015344, "learning_rate": 1.999491625050169e-05, "loss": 1.1486, "step": 299 }, { "epoch": 0.7672634271099744, "grad_norm": 0.2810916108747155, "learning_rate": 1.9994501454563046e-05, "loss": 1.1067, "step": 300 }, { "epoch": 0.7698209718670077, "grad_norm": 0.2641826394714093, "learning_rate": 1.9994070398195437e-05, "loss": 1.1391, "step": 301 }, { "epoch": 0.7723785166240409, "grad_norm": 0.23992392919351505, "learning_rate": 1.999362308210017e-05, "loss": 1.1387, "step": 302 }, { "epoch": 0.7749360613810742, "grad_norm": 0.24856265925820004, "learning_rate": 1.9993159507005e-05, "loss": 1.1084, "step": 303 }, { "epoch": 0.7774936061381074, "grad_norm": 0.22572823705824116, "learning_rate": 1.9992679673664136e-05, "loss": 1.1134, "step": 304 }, { "epoch": 0.7800511508951407, "grad_norm": 0.27595626439843796, "learning_rate": 1.9992183582858233e-05, "loss": 1.1269, "step": 305 }, { "epoch": 0.782608695652174, "grad_norm": 0.33828817219220914, "learning_rate": 1.9991671235394404e-05, "loss": 1.1211, "step": 306 }, { "epoch": 0.7851662404092071, "grad_norm": 0.23908198593915184, "learning_rate": 1.9991142632106205e-05, "loss": 1.0874, "step": 307 }, { "epoch": 0.7877237851662404, "grad_norm": 0.32916775113793656, "learning_rate": 1.999059777385364e-05, "loss": 1.1189, "step": 308 }, { "epoch": 0.7902813299232737, "grad_norm": 0.4164086722930908, "learning_rate": 1.9990036661523162e-05, "loss": 1.1368, "step": 309 }, { "epoch": 0.7928388746803069, "grad_norm": 0.4356985530787425, "learning_rate": 1.998945929602766e-05, "loss": 1.1041, "step": 310 }, { "epoch": 0.7953964194373402, "grad_norm": 0.32329800121359825, "learning_rate": 1.9988865678306476e-05, "loss": 1.1381, "step": 311 }, { "epoch": 0.7979539641943734, "grad_norm": 0.28030048685966436, "learning_rate": 1.998825580932539e-05, "loss": 1.1505, "step": 312 }, { "epoch": 0.8005115089514067, "grad_norm": 0.3736128210505236, "learning_rate": 1.9987629690076615e-05, "loss": 1.116, "step": 313 }, { "epoch": 0.80306905370844, "grad_norm": 0.3711938440381308, "learning_rate": 1.998698732157881e-05, "loss": 1.1233, "step": 314 }, { "epoch": 0.8056265984654731, "grad_norm": 0.283799635820317, "learning_rate": 1.998632870487707e-05, "loss": 1.1112, "step": 315 }, { "epoch": 0.8081841432225064, "grad_norm": 0.29982174151777125, "learning_rate": 1.9985653841042926e-05, "loss": 1.1089, "step": 316 }, { "epoch": 0.8107416879795396, "grad_norm": 0.33144242270715973, "learning_rate": 1.9984962731174336e-05, "loss": 1.1387, "step": 317 }, { "epoch": 0.8132992327365729, "grad_norm": 0.33991853938376265, "learning_rate": 1.998425537639569e-05, "loss": 1.1292, "step": 318 }, { "epoch": 0.8158567774936062, "grad_norm": 0.342802086067408, "learning_rate": 1.9983531777857817e-05, "loss": 1.0907, "step": 319 }, { "epoch": 0.8184143222506394, "grad_norm": 0.3083367366680541, "learning_rate": 1.998279193673796e-05, "loss": 1.1157, "step": 320 }, { "epoch": 0.8209718670076727, "grad_norm": 0.32536985414256364, "learning_rate": 1.9982035854239793e-05, "loss": 1.0971, "step": 321 }, { "epoch": 0.8235294117647058, "grad_norm": 0.3810630836606236, "learning_rate": 1.9981263531593422e-05, "loss": 1.1236, "step": 322 }, { "epoch": 0.8260869565217391, "grad_norm": 0.36452300722278047, "learning_rate": 1.9980474970055367e-05, "loss": 1.1438, "step": 323 }, { "epoch": 0.8286445012787724, "grad_norm": 0.2795921565060519, "learning_rate": 1.997967017090856e-05, "loss": 1.1465, "step": 324 }, { "epoch": 0.8312020460358056, "grad_norm": 0.2986081929713523, "learning_rate": 1.9978849135462367e-05, "loss": 1.1061, "step": 325 }, { "epoch": 0.8337595907928389, "grad_norm": 0.3054440401423343, "learning_rate": 1.9978011865052554e-05, "loss": 1.1146, "step": 326 }, { "epoch": 0.8363171355498721, "grad_norm": 0.32318950453837997, "learning_rate": 1.9977158361041317e-05, "loss": 1.1554, "step": 327 }, { "epoch": 0.8388746803069054, "grad_norm": 0.30472902927491496, "learning_rate": 1.997628862481725e-05, "loss": 1.1274, "step": 328 }, { "epoch": 0.8414322250639387, "grad_norm": 0.4042829285862421, "learning_rate": 1.9975402657795355e-05, "loss": 1.1669, "step": 329 }, { "epoch": 0.8439897698209718, "grad_norm": 0.2804285578799784, "learning_rate": 1.997450046141705e-05, "loss": 1.1361, "step": 330 }, { "epoch": 0.8465473145780051, "grad_norm": 0.3569177728816469, "learning_rate": 1.997358203715015e-05, "loss": 1.1095, "step": 331 }, { "epoch": 0.8491048593350383, "grad_norm": 0.4230090216431553, "learning_rate": 1.9972647386488873e-05, "loss": 1.1016, "step": 332 }, { "epoch": 0.8516624040920716, "grad_norm": 0.37021286913388013, "learning_rate": 1.997169651095384e-05, "loss": 1.1475, "step": 333 }, { "epoch": 0.8542199488491049, "grad_norm": 0.3317123580055209, "learning_rate": 1.9970729412092064e-05, "loss": 1.0813, "step": 334 }, { "epoch": 0.8567774936061381, "grad_norm": 0.273842287695835, "learning_rate": 1.9969746091476955e-05, "loss": 1.1067, "step": 335 }, { "epoch": 0.8593350383631714, "grad_norm": 0.2673820670815786, "learning_rate": 1.9968746550708313e-05, "loss": 1.1069, "step": 336 }, { "epoch": 0.8618925831202046, "grad_norm": 0.2979937548082758, "learning_rate": 1.996773079141233e-05, "loss": 1.1279, "step": 337 }, { "epoch": 0.8644501278772379, "grad_norm": 0.37172355657034833, "learning_rate": 1.9966698815241583e-05, "loss": 1.1339, "step": 338 }, { "epoch": 0.8670076726342711, "grad_norm": 0.506903869952954, "learning_rate": 1.9965650623875034e-05, "loss": 1.1039, "step": 339 }, { "epoch": 0.8695652173913043, "grad_norm": 0.4279498776163848, "learning_rate": 1.9964586219018018e-05, "loss": 1.1425, "step": 340 }, { "epoch": 0.8721227621483376, "grad_norm": 0.36753587770795587, "learning_rate": 1.9963505602402263e-05, "loss": 1.0978, "step": 341 }, { "epoch": 0.8746803069053708, "grad_norm": 0.3648609772451092, "learning_rate": 1.996240877578586e-05, "loss": 1.1242, "step": 342 }, { "epoch": 0.8772378516624041, "grad_norm": 0.37366918011434086, "learning_rate": 1.996129574095328e-05, "loss": 1.1191, "step": 343 }, { "epoch": 0.8797953964194374, "grad_norm": 0.3879756302273747, "learning_rate": 1.996016649971536e-05, "loss": 1.1253, "step": 344 }, { "epoch": 0.8823529411764706, "grad_norm": 0.35306903326209926, "learning_rate": 1.9959021053909304e-05, "loss": 1.1097, "step": 345 }, { "epoch": 0.8849104859335039, "grad_norm": 0.3497813112371213, "learning_rate": 1.995785940539868e-05, "loss": 1.1751, "step": 346 }, { "epoch": 0.887468030690537, "grad_norm": 0.31991011885719, "learning_rate": 1.995668155607342e-05, "loss": 1.06, "step": 347 }, { "epoch": 0.8900255754475703, "grad_norm": 0.33100033800466955, "learning_rate": 1.9955487507849815e-05, "loss": 1.1217, "step": 348 }, { "epoch": 0.8925831202046036, "grad_norm": 0.3302462532169077, "learning_rate": 1.9954277262670497e-05, "loss": 1.1016, "step": 349 }, { "epoch": 0.8951406649616368, "grad_norm": 0.2988617813500731, "learning_rate": 1.9953050822504466e-05, "loss": 1.1259, "step": 350 }, { "epoch": 0.8976982097186701, "grad_norm": 0.2467443109516983, "learning_rate": 1.995180818934706e-05, "loss": 1.1449, "step": 351 }, { "epoch": 0.9002557544757033, "grad_norm": 0.2862819186333417, "learning_rate": 1.995054936521997e-05, "loss": 1.1, "step": 352 }, { "epoch": 0.9028132992327366, "grad_norm": 0.3386935579478213, "learning_rate": 1.9949274352171218e-05, "loss": 1.1215, "step": 353 }, { "epoch": 0.9053708439897699, "grad_norm": 0.377267345773294, "learning_rate": 1.9947983152275175e-05, "loss": 1.1151, "step": 354 }, { "epoch": 0.907928388746803, "grad_norm": 0.26418004315541993, "learning_rate": 1.9946675767632545e-05, "loss": 1.0909, "step": 355 }, { "epoch": 0.9104859335038363, "grad_norm": 0.3036950602266219, "learning_rate": 1.9945352200370352e-05, "loss": 1.1065, "step": 356 }, { "epoch": 0.9130434782608695, "grad_norm": 0.2847990677396293, "learning_rate": 1.9944012452641966e-05, "loss": 1.1187, "step": 357 }, { "epoch": 0.9156010230179028, "grad_norm": 0.3155239647410138, "learning_rate": 1.994265652662707e-05, "loss": 1.1402, "step": 358 }, { "epoch": 0.9181585677749361, "grad_norm": 0.3011564965680371, "learning_rate": 1.9941284424531668e-05, "loss": 1.1232, "step": 359 }, { "epoch": 0.9207161125319693, "grad_norm": 0.3119452115804441, "learning_rate": 1.9939896148588086e-05, "loss": 1.0879, "step": 360 }, { "epoch": 0.9232736572890026, "grad_norm": 0.33133352515569403, "learning_rate": 1.9938491701054965e-05, "loss": 1.1384, "step": 361 }, { "epoch": 0.9258312020460358, "grad_norm": 0.2085194934877816, "learning_rate": 1.9937071084217254e-05, "loss": 1.0616, "step": 362 }, { "epoch": 0.928388746803069, "grad_norm": 0.27348539950003964, "learning_rate": 1.99356343003862e-05, "loss": 1.127, "step": 363 }, { "epoch": 0.9309462915601023, "grad_norm": 0.314231043083254, "learning_rate": 1.9934181351899365e-05, "loss": 1.1075, "step": 364 }, { "epoch": 0.9335038363171355, "grad_norm": 0.3354380584507947, "learning_rate": 1.9932712241120606e-05, "loss": 1.1272, "step": 365 }, { "epoch": 0.9360613810741688, "grad_norm": 0.28703321632472045, "learning_rate": 1.9931226970440075e-05, "loss": 1.1469, "step": 366 }, { "epoch": 0.9386189258312021, "grad_norm": 0.3426859912220677, "learning_rate": 1.9929725542274215e-05, "loss": 1.1278, "step": 367 }, { "epoch": 0.9411764705882353, "grad_norm": 0.29299540193881474, "learning_rate": 1.992820795906575e-05, "loss": 1.1187, "step": 368 }, { "epoch": 0.9437340153452686, "grad_norm": 0.39295341923846966, "learning_rate": 1.99266742232837e-05, "loss": 1.1126, "step": 369 }, { "epoch": 0.9462915601023018, "grad_norm": 0.35353202277391543, "learning_rate": 1.9925124337423356e-05, "loss": 1.1139, "step": 370 }, { "epoch": 0.948849104859335, "grad_norm": 0.3311467211582019, "learning_rate": 1.9923558304006283e-05, "loss": 1.138, "step": 371 }, { "epoch": 0.9514066496163683, "grad_norm": 0.3816152174441759, "learning_rate": 1.992197612558032e-05, "loss": 1.1176, "step": 372 }, { "epoch": 0.9539641943734015, "grad_norm": 0.36605913254516786, "learning_rate": 1.9920377804719573e-05, "loss": 1.1221, "step": 373 }, { "epoch": 0.9565217391304348, "grad_norm": 0.36097755733897396, "learning_rate": 1.991876334402441e-05, "loss": 1.1198, "step": 374 }, { "epoch": 0.959079283887468, "grad_norm": 0.34895670740815254, "learning_rate": 1.9917132746121454e-05, "loss": 1.1438, "step": 375 }, { "epoch": 0.9616368286445013, "grad_norm": 0.2817987248252719, "learning_rate": 1.9915486013663595e-05, "loss": 1.0946, "step": 376 }, { "epoch": 0.9641943734015346, "grad_norm": 0.2440543185648296, "learning_rate": 1.9913823149329952e-05, "loss": 1.1257, "step": 377 }, { "epoch": 0.9667519181585678, "grad_norm": 0.29938424755141774, "learning_rate": 1.9912144155825913e-05, "loss": 1.1315, "step": 378 }, { "epoch": 0.969309462915601, "grad_norm": 0.3042211939245891, "learning_rate": 1.9910449035883086e-05, "loss": 1.1005, "step": 379 }, { "epoch": 0.9718670076726342, "grad_norm": 0.3662935173068649, "learning_rate": 1.990873779225933e-05, "loss": 1.0831, "step": 380 }, { "epoch": 0.9744245524296675, "grad_norm": 0.34290782200372855, "learning_rate": 1.990701042773873e-05, "loss": 1.1116, "step": 381 }, { "epoch": 0.9769820971867008, "grad_norm": 0.2659876511429978, "learning_rate": 1.99052669451316e-05, "loss": 1.1172, "step": 382 }, { "epoch": 0.979539641943734, "grad_norm": 0.2656583663382276, "learning_rate": 1.9903507347274478e-05, "loss": 1.1243, "step": 383 }, { "epoch": 0.9820971867007673, "grad_norm": 0.35197356004646674, "learning_rate": 1.9901731637030123e-05, "loss": 1.0751, "step": 384 }, { "epoch": 0.9846547314578005, "grad_norm": 0.4123186710230891, "learning_rate": 1.9899939817287494e-05, "loss": 1.1572, "step": 385 }, { "epoch": 0.9872122762148338, "grad_norm": 0.48886837110572706, "learning_rate": 1.989813189096178e-05, "loss": 1.1109, "step": 386 }, { "epoch": 0.989769820971867, "grad_norm": 0.4200898181195607, "learning_rate": 1.989630786099436e-05, "loss": 1.1243, "step": 387 }, { "epoch": 0.9923273657289002, "grad_norm": 0.36473186521348727, "learning_rate": 1.9894467730352817e-05, "loss": 1.1379, "step": 388 }, { "epoch": 0.9948849104859335, "grad_norm": 0.33106729200219565, "learning_rate": 1.9892611502030932e-05, "loss": 1.1183, "step": 389 }, { "epoch": 0.9974424552429667, "grad_norm": 0.28859949847448485, "learning_rate": 1.9890739179048666e-05, "loss": 1.1019, "step": 390 }, { "epoch": 1.0, "grad_norm": 0.32343067044443596, "learning_rate": 1.9888850764452177e-05, "loss": 1.1315, "step": 391 }, { "epoch": 1.0025575447570332, "grad_norm": 0.2946752191785302, "learning_rate": 1.988694626131379e-05, "loss": 1.1027, "step": 392 }, { "epoch": 1.0051150895140666, "grad_norm": 0.2840956310037306, "learning_rate": 1.9885025672732024e-05, "loss": 1.1255, "step": 393 }, { "epoch": 1.0076726342710998, "grad_norm": 0.3834929641779387, "learning_rate": 1.9883089001831545e-05, "loss": 1.0926, "step": 394 }, { "epoch": 1.010230179028133, "grad_norm": 0.37119046465058125, "learning_rate": 1.9881136251763203e-05, "loss": 1.1024, "step": 395 }, { "epoch": 1.0127877237851663, "grad_norm": 0.3481999615848297, "learning_rate": 1.9879167425703998e-05, "loss": 1.1177, "step": 396 }, { "epoch": 1.0153452685421995, "grad_norm": 0.4174534154279672, "learning_rate": 1.9877182526857086e-05, "loss": 1.1194, "step": 397 }, { "epoch": 1.0179028132992327, "grad_norm": 0.428283352237624, "learning_rate": 1.9875181558451774e-05, "loss": 1.1126, "step": 398 }, { "epoch": 1.020460358056266, "grad_norm": 0.34788898984052513, "learning_rate": 1.9873164523743517e-05, "loss": 1.0826, "step": 399 }, { "epoch": 1.0230179028132993, "grad_norm": 0.3235948349939345, "learning_rate": 1.9871131426013894e-05, "loss": 1.137, "step": 400 }, { "epoch": 1.0255754475703325, "grad_norm": 0.3661886910233816, "learning_rate": 1.9869082268570637e-05, "loss": 1.1135, "step": 401 }, { "epoch": 1.0281329923273657, "grad_norm": 0.3844357019706309, "learning_rate": 1.9867017054747593e-05, "loss": 1.1316, "step": 402 }, { "epoch": 1.030690537084399, "grad_norm": 0.3351625771872402, "learning_rate": 1.9864935787904734e-05, "loss": 1.1009, "step": 403 }, { "epoch": 1.0332480818414322, "grad_norm": 0.34602161255624664, "learning_rate": 1.986283847142816e-05, "loss": 1.1047, "step": 404 }, { "epoch": 1.0358056265984654, "grad_norm": 0.3709821493330784, "learning_rate": 1.9860725108730065e-05, "loss": 1.1031, "step": 405 }, { "epoch": 1.0383631713554988, "grad_norm": 0.37774483264562303, "learning_rate": 1.9858595703248755e-05, "loss": 1.137, "step": 406 }, { "epoch": 1.040920716112532, "grad_norm": 0.3599825369273542, "learning_rate": 1.985645025844865e-05, "loss": 1.0707, "step": 407 }, { "epoch": 1.0434782608695652, "grad_norm": 0.39966584857588405, "learning_rate": 1.9854288777820246e-05, "loss": 1.1033, "step": 408 }, { "epoch": 1.0460358056265984, "grad_norm": 0.40289071310305824, "learning_rate": 1.9852111264880145e-05, "loss": 1.0806, "step": 409 }, { "epoch": 1.0485933503836318, "grad_norm": 0.47128238325065436, "learning_rate": 1.984991772317102e-05, "loss": 1.0756, "step": 410 }, { "epoch": 1.051150895140665, "grad_norm": 0.5298917118212448, "learning_rate": 1.9847708156261622e-05, "loss": 1.1055, "step": 411 }, { "epoch": 1.0537084398976981, "grad_norm": 0.47297356768421134, "learning_rate": 1.9845482567746783e-05, "loss": 1.0836, "step": 412 }, { "epoch": 1.0562659846547315, "grad_norm": 0.38344561089251955, "learning_rate": 1.9843240961247398e-05, "loss": 1.0904, "step": 413 }, { "epoch": 1.0588235294117647, "grad_norm": 0.27676602705991193, "learning_rate": 1.9840983340410414e-05, "loss": 1.1402, "step": 414 }, { "epoch": 1.061381074168798, "grad_norm": 0.4125473070163219, "learning_rate": 1.9838709708908848e-05, "loss": 1.1108, "step": 415 }, { "epoch": 1.0639386189258313, "grad_norm": 0.39100913652365626, "learning_rate": 1.983642007044175e-05, "loss": 1.0894, "step": 416 }, { "epoch": 1.0664961636828645, "grad_norm": 0.3635147529725554, "learning_rate": 1.983411442873422e-05, "loss": 1.0751, "step": 417 }, { "epoch": 1.0690537084398977, "grad_norm": 0.3157457311508148, "learning_rate": 1.983179278753739e-05, "loss": 1.0867, "step": 418 }, { "epoch": 1.0716112531969308, "grad_norm": 0.3380507668468239, "learning_rate": 1.9829455150628432e-05, "loss": 1.1428, "step": 419 }, { "epoch": 1.0741687979539642, "grad_norm": 0.3531121689418475, "learning_rate": 1.982710152181053e-05, "loss": 1.0877, "step": 420 }, { "epoch": 1.0767263427109974, "grad_norm": 0.2800940522052926, "learning_rate": 1.982473190491289e-05, "loss": 1.1025, "step": 421 }, { "epoch": 1.0792838874680306, "grad_norm": 0.3045440051536889, "learning_rate": 1.9822346303790732e-05, "loss": 1.0954, "step": 422 }, { "epoch": 1.081841432225064, "grad_norm": 0.2875179180998631, "learning_rate": 1.9819944722325283e-05, "loss": 1.0799, "step": 423 }, { "epoch": 1.0843989769820972, "grad_norm": 0.3671466904640979, "learning_rate": 1.981752716442376e-05, "loss": 1.1239, "step": 424 }, { "epoch": 1.0869565217391304, "grad_norm": 0.310905332933887, "learning_rate": 1.9815093634019384e-05, "loss": 1.0885, "step": 425 }, { "epoch": 1.0895140664961638, "grad_norm": 0.34866191023824383, "learning_rate": 1.9812644135071358e-05, "loss": 1.0789, "step": 426 }, { "epoch": 1.092071611253197, "grad_norm": 0.3670206738107968, "learning_rate": 1.9810178671564853e-05, "loss": 1.1051, "step": 427 }, { "epoch": 1.0946291560102301, "grad_norm": 0.46475258100798056, "learning_rate": 1.980769724751104e-05, "loss": 1.0838, "step": 428 }, { "epoch": 1.0971867007672633, "grad_norm": 0.3157024370545657, "learning_rate": 1.9805199866947026e-05, "loss": 1.114, "step": 429 }, { "epoch": 1.0997442455242967, "grad_norm": 0.29958992335623563, "learning_rate": 1.9802686533935903e-05, "loss": 1.0909, "step": 430 }, { "epoch": 1.10230179028133, "grad_norm": 0.3045539331442299, "learning_rate": 1.9800157252566698e-05, "loss": 1.119, "step": 431 }, { "epoch": 1.104859335038363, "grad_norm": 0.35388881893166907, "learning_rate": 1.97976120269544e-05, "loss": 1.1357, "step": 432 }, { "epoch": 1.1074168797953965, "grad_norm": 0.4072658855507119, "learning_rate": 1.9795050861239932e-05, "loss": 1.1153, "step": 433 }, { "epoch": 1.1099744245524297, "grad_norm": 0.3515081652084557, "learning_rate": 1.9792473759590148e-05, "loss": 1.1051, "step": 434 }, { "epoch": 1.1125319693094629, "grad_norm": 0.30513537117496636, "learning_rate": 1.978988072619783e-05, "loss": 1.0943, "step": 435 }, { "epoch": 1.1150895140664963, "grad_norm": 0.5088746516427844, "learning_rate": 1.9787271765281684e-05, "loss": 1.0947, "step": 436 }, { "epoch": 1.1176470588235294, "grad_norm": 0.6682126794134292, "learning_rate": 1.9784646881086327e-05, "loss": 1.0743, "step": 437 }, { "epoch": 1.1202046035805626, "grad_norm": 0.5551640593749172, "learning_rate": 1.9782006077882282e-05, "loss": 1.0861, "step": 438 }, { "epoch": 1.1227621483375958, "grad_norm": 0.3278866812808205, "learning_rate": 1.9779349359965966e-05, "loss": 1.1069, "step": 439 }, { "epoch": 1.1253196930946292, "grad_norm": 0.38591224008325814, "learning_rate": 1.9776676731659695e-05, "loss": 1.0849, "step": 440 }, { "epoch": 1.1278772378516624, "grad_norm": 0.35719651550677206, "learning_rate": 1.977398819731167e-05, "loss": 1.1053, "step": 441 }, { "epoch": 1.1304347826086956, "grad_norm": 0.4232965403621678, "learning_rate": 1.9771283761295966e-05, "loss": 1.0848, "step": 442 }, { "epoch": 1.132992327365729, "grad_norm": 0.2697343671368354, "learning_rate": 1.9768563428012536e-05, "loss": 1.1091, "step": 443 }, { "epoch": 1.1355498721227621, "grad_norm": 0.3193367309932036, "learning_rate": 1.9765827201887183e-05, "loss": 1.0767, "step": 444 }, { "epoch": 1.1381074168797953, "grad_norm": 0.36846576847881124, "learning_rate": 1.9763075087371583e-05, "loss": 1.0996, "step": 445 }, { "epoch": 1.1406649616368287, "grad_norm": 0.31668666427159936, "learning_rate": 1.9760307088943254e-05, "loss": 1.0713, "step": 446 }, { "epoch": 1.143222506393862, "grad_norm": 0.35150116619841826, "learning_rate": 1.9757523211105555e-05, "loss": 1.0564, "step": 447 }, { "epoch": 1.145780051150895, "grad_norm": 0.429831549745095, "learning_rate": 1.975472345838768e-05, "loss": 1.0907, "step": 448 }, { "epoch": 1.1483375959079285, "grad_norm": 0.44872565734771747, "learning_rate": 1.9751907835344654e-05, "loss": 1.0817, "step": 449 }, { "epoch": 1.1508951406649617, "grad_norm": 0.33913236381932554, "learning_rate": 1.9749076346557318e-05, "loss": 1.129, "step": 450 }, { "epoch": 1.1534526854219949, "grad_norm": 0.33115586128973074, "learning_rate": 1.9746228996632326e-05, "loss": 1.1034, "step": 451 }, { "epoch": 1.156010230179028, "grad_norm": 0.3057185791661933, "learning_rate": 1.974336579020214e-05, "loss": 1.1076, "step": 452 }, { "epoch": 1.1585677749360614, "grad_norm": 0.43316526036175457, "learning_rate": 1.9740486731925022e-05, "loss": 1.1224, "step": 453 }, { "epoch": 1.1611253196930946, "grad_norm": 0.5066112837446138, "learning_rate": 1.9737591826485013e-05, "loss": 1.0962, "step": 454 }, { "epoch": 1.1636828644501278, "grad_norm": 0.4014502906289108, "learning_rate": 1.9734681078591943e-05, "loss": 1.0905, "step": 455 }, { "epoch": 1.1662404092071612, "grad_norm": 0.30247128311625804, "learning_rate": 1.9731754492981423e-05, "loss": 1.0812, "step": 456 }, { "epoch": 1.1687979539641944, "grad_norm": 0.31145252945008656, "learning_rate": 1.9728812074414822e-05, "loss": 1.0729, "step": 457 }, { "epoch": 1.1713554987212276, "grad_norm": 0.33968915375934183, "learning_rate": 1.9725853827679266e-05, "loss": 1.078, "step": 458 }, { "epoch": 1.1739130434782608, "grad_norm": 0.27618072861680876, "learning_rate": 1.9722879757587647e-05, "loss": 1.0864, "step": 459 }, { "epoch": 1.1764705882352942, "grad_norm": 0.28234315821124384, "learning_rate": 1.9719889868978582e-05, "loss": 1.1135, "step": 460 }, { "epoch": 1.1790281329923273, "grad_norm": 0.29884726287169866, "learning_rate": 1.971688416671644e-05, "loss": 1.1363, "step": 461 }, { "epoch": 1.1815856777493605, "grad_norm": 0.27600448666706423, "learning_rate": 1.9713862655691302e-05, "loss": 1.0791, "step": 462 }, { "epoch": 1.184143222506394, "grad_norm": 0.2803813788615088, "learning_rate": 1.971082534081899e-05, "loss": 1.0718, "step": 463 }, { "epoch": 1.186700767263427, "grad_norm": 0.2696501099289663, "learning_rate": 1.970777222704101e-05, "loss": 1.0961, "step": 464 }, { "epoch": 1.1892583120204603, "grad_norm": 0.3010556872116562, "learning_rate": 1.97047033193246e-05, "loss": 1.1038, "step": 465 }, { "epoch": 1.1918158567774937, "grad_norm": 0.28235325514025905, "learning_rate": 1.970161862266268e-05, "loss": 1.1054, "step": 466 }, { "epoch": 1.1943734015345269, "grad_norm": 0.28808186970685423, "learning_rate": 1.969851814207385e-05, "loss": 1.0807, "step": 467 }, { "epoch": 1.19693094629156, "grad_norm": 0.33258411208957883, "learning_rate": 1.9695401882602406e-05, "loss": 1.1296, "step": 468 }, { "epoch": 1.1994884910485935, "grad_norm": 0.3318703383183081, "learning_rate": 1.9692269849318303e-05, "loss": 1.0936, "step": 469 }, { "epoch": 1.2020460358056266, "grad_norm": 0.30178464518160203, "learning_rate": 1.9689122047317166e-05, "loss": 1.1155, "step": 470 }, { "epoch": 1.2046035805626598, "grad_norm": 0.30521273043475255, "learning_rate": 1.968595848172027e-05, "loss": 1.0896, "step": 471 }, { "epoch": 1.207161125319693, "grad_norm": 0.34614634138914463, "learning_rate": 1.968277915767454e-05, "loss": 1.0452, "step": 472 }, { "epoch": 1.2097186700767264, "grad_norm": 0.32741746531886684, "learning_rate": 1.9679584080352537e-05, "loss": 1.1045, "step": 473 }, { "epoch": 1.2122762148337596, "grad_norm": 0.2615489309131341, "learning_rate": 1.967637325495245e-05, "loss": 1.0855, "step": 474 }, { "epoch": 1.2148337595907928, "grad_norm": 0.27476592859150684, "learning_rate": 1.9673146686698093e-05, "loss": 1.1001, "step": 475 }, { "epoch": 1.2173913043478262, "grad_norm": 0.3421071933190777, "learning_rate": 1.9669904380838892e-05, "loss": 1.0729, "step": 476 }, { "epoch": 1.2199488491048593, "grad_norm": 0.3598257915245131, "learning_rate": 1.966664634264987e-05, "loss": 1.1242, "step": 477 }, { "epoch": 1.2225063938618925, "grad_norm": 0.32107570559715254, "learning_rate": 1.9663372577431663e-05, "loss": 1.1087, "step": 478 }, { "epoch": 1.2250639386189257, "grad_norm": 0.341209086018264, "learning_rate": 1.966008309051047e-05, "loss": 1.1167, "step": 479 }, { "epoch": 1.227621483375959, "grad_norm": 0.29733249941263845, "learning_rate": 1.965677788723809e-05, "loss": 1.07, "step": 480 }, { "epoch": 1.2301790281329923, "grad_norm": 0.26502862394927407, "learning_rate": 1.9653456972991877e-05, "loss": 1.0775, "step": 481 }, { "epoch": 1.2327365728900257, "grad_norm": 0.28986896788872485, "learning_rate": 1.965012035317475e-05, "loss": 1.0967, "step": 482 }, { "epoch": 1.2352941176470589, "grad_norm": 0.33295845795202056, "learning_rate": 1.9646768033215183e-05, "loss": 1.0879, "step": 483 }, { "epoch": 1.237851662404092, "grad_norm": 0.3705619524001342, "learning_rate": 1.9643400018567195e-05, "loss": 1.1019, "step": 484 }, { "epoch": 1.2404092071611252, "grad_norm": 0.3266347911273673, "learning_rate": 1.9640016314710323e-05, "loss": 1.1084, "step": 485 }, { "epoch": 1.2429667519181586, "grad_norm": 0.3761069051897771, "learning_rate": 1.9636616927149655e-05, "loss": 1.1029, "step": 486 }, { "epoch": 1.2455242966751918, "grad_norm": 0.2621662577070755, "learning_rate": 1.9633201861415773e-05, "loss": 1.0735, "step": 487 }, { "epoch": 1.248081841432225, "grad_norm": 0.266376960810325, "learning_rate": 1.9629771123064784e-05, "loss": 1.0948, "step": 488 }, { "epoch": 1.2506393861892584, "grad_norm": 0.3408438115021644, "learning_rate": 1.9626324717678275e-05, "loss": 1.0984, "step": 489 }, { "epoch": 1.2531969309462916, "grad_norm": 0.3255066954002719, "learning_rate": 1.962286265086334e-05, "loss": 1.1213, "step": 490 }, { "epoch": 1.2557544757033248, "grad_norm": 0.3765758476751633, "learning_rate": 1.961938492825254e-05, "loss": 1.0909, "step": 491 }, { "epoch": 1.258312020460358, "grad_norm": 0.3109670040308706, "learning_rate": 1.9615891555503914e-05, "loss": 1.1164, "step": 492 }, { "epoch": 1.2608695652173914, "grad_norm": 0.28523527744811616, "learning_rate": 1.961238253830096e-05, "loss": 1.0834, "step": 493 }, { "epoch": 1.2634271099744245, "grad_norm": 0.3472113617037474, "learning_rate": 1.9608857882352636e-05, "loss": 1.0823, "step": 494 }, { "epoch": 1.265984654731458, "grad_norm": 0.45214384592951995, "learning_rate": 1.9605317593393326e-05, "loss": 1.1084, "step": 495 }, { "epoch": 1.2685421994884911, "grad_norm": 0.3401855972965097, "learning_rate": 1.9601761677182868e-05, "loss": 1.0978, "step": 496 }, { "epoch": 1.2710997442455243, "grad_norm": 0.3025957486994177, "learning_rate": 1.959819013950651e-05, "loss": 1.0889, "step": 497 }, { "epoch": 1.2736572890025575, "grad_norm": 0.29140422941812544, "learning_rate": 1.9594602986174923e-05, "loss": 1.0792, "step": 498 }, { "epoch": 1.2762148337595907, "grad_norm": 0.3620688439157377, "learning_rate": 1.959100022302418e-05, "loss": 1.092, "step": 499 }, { "epoch": 1.278772378516624, "grad_norm": 0.3498507983384518, "learning_rate": 1.9587381855915754e-05, "loss": 1.0652, "step": 500 }, { "epoch": 1.2813299232736572, "grad_norm": 0.34633148833870603, "learning_rate": 1.95837478907365e-05, "loss": 1.0859, "step": 501 }, { "epoch": 1.2838874680306906, "grad_norm": 0.28466962730903933, "learning_rate": 1.958009833339865e-05, "loss": 1.0912, "step": 502 }, { "epoch": 1.2864450127877238, "grad_norm": 0.26890207030009217, "learning_rate": 1.9576433189839807e-05, "loss": 1.1088, "step": 503 }, { "epoch": 1.289002557544757, "grad_norm": 0.273263645379487, "learning_rate": 1.957275246602293e-05, "loss": 1.0837, "step": 504 }, { "epoch": 1.2915601023017902, "grad_norm": 0.2716148540613851, "learning_rate": 1.9569056167936332e-05, "loss": 1.105, "step": 505 }, { "epoch": 1.2941176470588236, "grad_norm": 0.24370260465489227, "learning_rate": 1.956534430159365e-05, "loss": 1.0726, "step": 506 }, { "epoch": 1.2966751918158568, "grad_norm": 0.2620730046771573, "learning_rate": 1.9561616873033867e-05, "loss": 1.1079, "step": 507 }, { "epoch": 1.29923273657289, "grad_norm": 0.3135544306790673, "learning_rate": 1.955787388832127e-05, "loss": 1.0697, "step": 508 }, { "epoch": 1.3017902813299234, "grad_norm": 0.26135639483849105, "learning_rate": 1.9554115353545464e-05, "loss": 1.1016, "step": 509 }, { "epoch": 1.3043478260869565, "grad_norm": 0.25771344651987327, "learning_rate": 1.9550341274821348e-05, "loss": 1.0727, "step": 510 }, { "epoch": 1.3069053708439897, "grad_norm": 0.3167223084832456, "learning_rate": 1.9546551658289113e-05, "loss": 1.0792, "step": 511 }, { "epoch": 1.309462915601023, "grad_norm": 0.37857845074967256, "learning_rate": 1.954274651011423e-05, "loss": 1.1143, "step": 512 }, { "epoch": 1.3120204603580563, "grad_norm": 0.2580494189739856, "learning_rate": 1.9538925836487436e-05, "loss": 1.0674, "step": 513 }, { "epoch": 1.3145780051150895, "grad_norm": 0.39297270925108346, "learning_rate": 1.953508964362473e-05, "loss": 1.0885, "step": 514 }, { "epoch": 1.317135549872123, "grad_norm": 0.4568937813346712, "learning_rate": 1.9531237937767352e-05, "loss": 1.0807, "step": 515 }, { "epoch": 1.319693094629156, "grad_norm": 0.4182414922758871, "learning_rate": 1.9527370725181793e-05, "loss": 1.0766, "step": 516 }, { "epoch": 1.3222506393861893, "grad_norm": 0.4402863172879326, "learning_rate": 1.9523488012159762e-05, "loss": 1.0712, "step": 517 }, { "epoch": 1.3248081841432224, "grad_norm": 0.3810424193074309, "learning_rate": 1.9519589805018187e-05, "loss": 1.0888, "step": 518 }, { "epoch": 1.3273657289002558, "grad_norm": 0.4051938816832732, "learning_rate": 1.951567611009922e-05, "loss": 1.0801, "step": 519 }, { "epoch": 1.329923273657289, "grad_norm": 0.3260440045944625, "learning_rate": 1.9511746933770186e-05, "loss": 1.1149, "step": 520 }, { "epoch": 1.3324808184143222, "grad_norm": 0.31554258651135036, "learning_rate": 1.9507802282423612e-05, "loss": 1.1202, "step": 521 }, { "epoch": 1.3350383631713556, "grad_norm": 0.2622342243824476, "learning_rate": 1.9503842162477205e-05, "loss": 1.1006, "step": 522 }, { "epoch": 1.3375959079283888, "grad_norm": 0.3015423536266443, "learning_rate": 1.9499866580373826e-05, "loss": 1.0873, "step": 523 }, { "epoch": 1.340153452685422, "grad_norm": 0.3920165036339574, "learning_rate": 1.94958755425815e-05, "loss": 1.1154, "step": 524 }, { "epoch": 1.3427109974424551, "grad_norm": 0.2769409471650046, "learning_rate": 1.9491869055593392e-05, "loss": 1.0867, "step": 525 }, { "epoch": 1.3452685421994885, "grad_norm": 0.30161940340621723, "learning_rate": 1.9487847125927814e-05, "loss": 1.1126, "step": 526 }, { "epoch": 1.3478260869565217, "grad_norm": 0.41990580701086677, "learning_rate": 1.948380976012819e-05, "loss": 1.0625, "step": 527 }, { "epoch": 1.350383631713555, "grad_norm": 0.3940286196901995, "learning_rate": 1.9479756964763062e-05, "loss": 1.1262, "step": 528 }, { "epoch": 1.3529411764705883, "grad_norm": 0.3683443524857737, "learning_rate": 1.9475688746426075e-05, "loss": 1.0865, "step": 529 }, { "epoch": 1.3554987212276215, "grad_norm": 0.2675607272032647, "learning_rate": 1.9471605111735964e-05, "loss": 1.0594, "step": 530 }, { "epoch": 1.3580562659846547, "grad_norm": 0.30194225210114733, "learning_rate": 1.9467506067336554e-05, "loss": 1.0955, "step": 531 }, { "epoch": 1.3606138107416879, "grad_norm": 0.32576735510414695, "learning_rate": 1.946339161989672e-05, "loss": 1.0824, "step": 532 }, { "epoch": 1.3631713554987213, "grad_norm": 0.3598150497292756, "learning_rate": 1.9459261776110426e-05, "loss": 1.1215, "step": 533 }, { "epoch": 1.3657289002557544, "grad_norm": 0.30585802865605916, "learning_rate": 1.945511654269666e-05, "loss": 1.086, "step": 534 }, { "epoch": 1.3682864450127878, "grad_norm": 0.2832294529242309, "learning_rate": 1.945095592639946e-05, "loss": 1.0992, "step": 535 }, { "epoch": 1.370843989769821, "grad_norm": 0.29056128095513195, "learning_rate": 1.944677993398789e-05, "loss": 1.1311, "step": 536 }, { "epoch": 1.3734015345268542, "grad_norm": 0.2598885076655647, "learning_rate": 1.944258857225603e-05, "loss": 1.0869, "step": 537 }, { "epoch": 1.3759590792838874, "grad_norm": 0.29819735030908995, "learning_rate": 1.943838184802296e-05, "loss": 1.1034, "step": 538 }, { "epoch": 1.3785166240409208, "grad_norm": 0.27354562935410204, "learning_rate": 1.9434159768132762e-05, "loss": 1.0834, "step": 539 }, { "epoch": 1.381074168797954, "grad_norm": 0.3164865864885613, "learning_rate": 1.9429922339454486e-05, "loss": 1.0952, "step": 540 }, { "epoch": 1.3836317135549872, "grad_norm": 0.34458030079305596, "learning_rate": 1.9425669568882175e-05, "loss": 1.1195, "step": 541 }, { "epoch": 1.3861892583120206, "grad_norm": 0.2973996932273863, "learning_rate": 1.942140146333481e-05, "loss": 1.1082, "step": 542 }, { "epoch": 1.3887468030690537, "grad_norm": 0.41583952226086746, "learning_rate": 1.9417118029756342e-05, "loss": 1.0664, "step": 543 }, { "epoch": 1.391304347826087, "grad_norm": 0.33101469656406096, "learning_rate": 1.9412819275115648e-05, "loss": 1.087, "step": 544 }, { "epoch": 1.39386189258312, "grad_norm": 0.2709972180594455, "learning_rate": 1.9408505206406526e-05, "loss": 1.078, "step": 545 }, { "epoch": 1.3964194373401535, "grad_norm": 0.3358832525629651, "learning_rate": 1.9404175830647703e-05, "loss": 1.0549, "step": 546 }, { "epoch": 1.3989769820971867, "grad_norm": 0.2987798463061033, "learning_rate": 1.93998311548828e-05, "loss": 1.0946, "step": 547 }, { "epoch": 1.40153452685422, "grad_norm": 0.3337061384486843, "learning_rate": 1.939547118618033e-05, "loss": 1.0898, "step": 548 }, { "epoch": 1.4040920716112533, "grad_norm": 0.3217064113312768, "learning_rate": 1.9391095931633694e-05, "loss": 1.1098, "step": 549 }, { "epoch": 1.4066496163682864, "grad_norm": 0.2752108304141071, "learning_rate": 1.9386705398361156e-05, "loss": 1.0469, "step": 550 }, { "epoch": 1.4092071611253196, "grad_norm": 0.25580623137423647, "learning_rate": 1.938229959350584e-05, "loss": 1.0616, "step": 551 }, { "epoch": 1.4117647058823528, "grad_norm": 0.3326332518112022, "learning_rate": 1.937787852423571e-05, "loss": 1.1083, "step": 552 }, { "epoch": 1.4143222506393862, "grad_norm": 0.28662569595039195, "learning_rate": 1.937344219774358e-05, "loss": 1.0908, "step": 553 }, { "epoch": 1.4168797953964194, "grad_norm": 0.27173135593182157, "learning_rate": 1.9368990621247062e-05, "loss": 1.102, "step": 554 }, { "epoch": 1.4194373401534528, "grad_norm": 0.2468084134139675, "learning_rate": 1.9364523801988606e-05, "loss": 1.1147, "step": 555 }, { "epoch": 1.421994884910486, "grad_norm": 0.2709546209917836, "learning_rate": 1.9360041747235437e-05, "loss": 1.0962, "step": 556 }, { "epoch": 1.4245524296675192, "grad_norm": 0.2653203619472685, "learning_rate": 1.9355544464279587e-05, "loss": 1.0864, "step": 557 }, { "epoch": 1.4271099744245523, "grad_norm": 0.28467968268797966, "learning_rate": 1.9351031960437848e-05, "loss": 1.0747, "step": 558 }, { "epoch": 1.4296675191815857, "grad_norm": 0.31847968792917525, "learning_rate": 1.934650424305178e-05, "loss": 1.0731, "step": 559 }, { "epoch": 1.432225063938619, "grad_norm": 0.3091639351747145, "learning_rate": 1.9341961319487704e-05, "loss": 1.0598, "step": 560 }, { "epoch": 1.434782608695652, "grad_norm": 0.26120102379692217, "learning_rate": 1.9337403197136663e-05, "loss": 1.0712, "step": 561 }, { "epoch": 1.4373401534526855, "grad_norm": 0.283165316308832, "learning_rate": 1.9332829883414444e-05, "loss": 1.0883, "step": 562 }, { "epoch": 1.4398976982097187, "grad_norm": 0.2767794060421261, "learning_rate": 1.932824138576154e-05, "loss": 1.1141, "step": 563 }, { "epoch": 1.4424552429667519, "grad_norm": 0.3027787955580307, "learning_rate": 1.9323637711643147e-05, "loss": 1.1109, "step": 564 }, { "epoch": 1.445012787723785, "grad_norm": 0.32071961002527666, "learning_rate": 1.9319018868549165e-05, "loss": 1.1192, "step": 565 }, { "epoch": 1.4475703324808185, "grad_norm": 0.33467873672280385, "learning_rate": 1.931438486399415e-05, "loss": 1.0817, "step": 566 }, { "epoch": 1.4501278772378516, "grad_norm": 0.30569240173237483, "learning_rate": 1.930973570551735e-05, "loss": 1.0607, "step": 567 }, { "epoch": 1.452685421994885, "grad_norm": 0.298726423982734, "learning_rate": 1.9305071400682644e-05, "loss": 1.0914, "step": 568 }, { "epoch": 1.4552429667519182, "grad_norm": 0.3038529339878212, "learning_rate": 1.9300391957078564e-05, "loss": 1.0834, "step": 569 }, { "epoch": 1.4578005115089514, "grad_norm": 0.30563450154931243, "learning_rate": 1.9295697382318286e-05, "loss": 1.0733, "step": 570 }, { "epoch": 1.4603580562659846, "grad_norm": 0.3808106030288731, "learning_rate": 1.9290987684039576e-05, "loss": 1.0955, "step": 571 }, { "epoch": 1.4629156010230178, "grad_norm": 0.32964679230942334, "learning_rate": 1.9286262869904827e-05, "loss": 1.0977, "step": 572 }, { "epoch": 1.4654731457800512, "grad_norm": 0.3576744350781661, "learning_rate": 1.928152294760101e-05, "loss": 1.0826, "step": 573 }, { "epoch": 1.4680306905370843, "grad_norm": 0.3442477800849191, "learning_rate": 1.9276767924839687e-05, "loss": 1.0693, "step": 574 }, { "epoch": 1.4705882352941178, "grad_norm": 0.4177409226360097, "learning_rate": 1.927199780935698e-05, "loss": 1.1031, "step": 575 }, { "epoch": 1.473145780051151, "grad_norm": 0.5022744214347684, "learning_rate": 1.926721260891357e-05, "loss": 1.1081, "step": 576 }, { "epoch": 1.4757033248081841, "grad_norm": 0.5089458782552098, "learning_rate": 1.9262412331294677e-05, "loss": 1.0984, "step": 577 }, { "epoch": 1.4782608695652173, "grad_norm": 0.28913442828013464, "learning_rate": 1.9257596984310055e-05, "loss": 1.0907, "step": 578 }, { "epoch": 1.4808184143222507, "grad_norm": 0.36385701502207274, "learning_rate": 1.925276657579397e-05, "loss": 1.0667, "step": 579 }, { "epoch": 1.4833759590792839, "grad_norm": 0.39854637256040343, "learning_rate": 1.9247921113605197e-05, "loss": 1.0814, "step": 580 }, { "epoch": 1.485933503836317, "grad_norm": 0.3421920326108303, "learning_rate": 1.9243060605626995e-05, "loss": 1.0984, "step": 581 }, { "epoch": 1.4884910485933505, "grad_norm": 0.2806970145004491, "learning_rate": 1.9238185059767116e-05, "loss": 1.0903, "step": 582 }, { "epoch": 1.4910485933503836, "grad_norm": 0.458875989536999, "learning_rate": 1.9233294483957758e-05, "loss": 1.1135, "step": 583 }, { "epoch": 1.4936061381074168, "grad_norm": 0.5204446417118193, "learning_rate": 1.922838888615559e-05, "loss": 1.1228, "step": 584 }, { "epoch": 1.49616368286445, "grad_norm": 0.4574878580551403, "learning_rate": 1.922346827434171e-05, "loss": 1.0595, "step": 585 }, { "epoch": 1.4987212276214834, "grad_norm": 0.26814443608722427, "learning_rate": 1.921853265652164e-05, "loss": 1.0742, "step": 586 }, { "epoch": 1.5012787723785166, "grad_norm": 0.4321843380909753, "learning_rate": 1.9213582040725333e-05, "loss": 1.0823, "step": 587 }, { "epoch": 1.50383631713555, "grad_norm": 0.3998584041466985, "learning_rate": 1.9208616435007124e-05, "loss": 1.1113, "step": 588 }, { "epoch": 1.5063938618925832, "grad_norm": 0.36340166424292447, "learning_rate": 1.9203635847445743e-05, "loss": 1.0495, "step": 589 }, { "epoch": 1.5089514066496164, "grad_norm": 0.30341924814307153, "learning_rate": 1.9198640286144296e-05, "loss": 1.0778, "step": 590 }, { "epoch": 1.5115089514066495, "grad_norm": 0.3549252043532506, "learning_rate": 1.9193629759230252e-05, "loss": 1.0526, "step": 591 }, { "epoch": 1.5140664961636827, "grad_norm": 0.3706707482911529, "learning_rate": 1.9188604274855417e-05, "loss": 1.1082, "step": 592 }, { "epoch": 1.5166240409207161, "grad_norm": 0.3221161365565599, "learning_rate": 1.9183563841195948e-05, "loss": 1.0358, "step": 593 }, { "epoch": 1.5191815856777495, "grad_norm": 0.35561020647213454, "learning_rate": 1.917850846645231e-05, "loss": 1.1016, "step": 594 }, { "epoch": 1.5217391304347827, "grad_norm": 0.3891453948051964, "learning_rate": 1.917343815884929e-05, "loss": 1.0723, "step": 595 }, { "epoch": 1.5242966751918159, "grad_norm": 0.293218650160261, "learning_rate": 1.9168352926635948e-05, "loss": 1.0842, "step": 596 }, { "epoch": 1.526854219948849, "grad_norm": 0.331624086856979, "learning_rate": 1.9163252778085646e-05, "loss": 1.0928, "step": 597 }, { "epoch": 1.5294117647058822, "grad_norm": 0.36005628746389595, "learning_rate": 1.9158137721496014e-05, "loss": 1.0954, "step": 598 }, { "epoch": 1.5319693094629157, "grad_norm": 0.25854576697363735, "learning_rate": 1.9153007765188918e-05, "loss": 1.0703, "step": 599 }, { "epoch": 1.5345268542199488, "grad_norm": 0.3178892680337157, "learning_rate": 1.914786291751048e-05, "loss": 1.1178, "step": 600 }, { "epoch": 1.5370843989769822, "grad_norm": 0.3276728285320476, "learning_rate": 1.9142703186831044e-05, "loss": 1.0711, "step": 601 }, { "epoch": 1.5396419437340154, "grad_norm": 0.34402306746609335, "learning_rate": 1.9137528581545172e-05, "loss": 1.0669, "step": 602 }, { "epoch": 1.5421994884910486, "grad_norm": 0.3658697294408855, "learning_rate": 1.9132339110071623e-05, "loss": 1.0738, "step": 603 }, { "epoch": 1.5447570332480818, "grad_norm": 0.33272997926321957, "learning_rate": 1.9127134780853343e-05, "loss": 1.0891, "step": 604 }, { "epoch": 1.547314578005115, "grad_norm": 0.26256059097959605, "learning_rate": 1.9121915602357447e-05, "loss": 1.0752, "step": 605 }, { "epoch": 1.5498721227621484, "grad_norm": 0.29698212652722755, "learning_rate": 1.9116681583075215e-05, "loss": 1.0531, "step": 606 }, { "epoch": 1.5524296675191815, "grad_norm": 0.3308461220455405, "learning_rate": 1.9111432731522067e-05, "loss": 1.0775, "step": 607 }, { "epoch": 1.554987212276215, "grad_norm": 0.28434303668023103, "learning_rate": 1.910616905623756e-05, "loss": 1.0989, "step": 608 }, { "epoch": 1.5575447570332481, "grad_norm": 0.2949610693246568, "learning_rate": 1.910089056578536e-05, "loss": 1.0942, "step": 609 }, { "epoch": 1.5601023017902813, "grad_norm": 0.26028511630293355, "learning_rate": 1.9095597268753243e-05, "loss": 1.0639, "step": 610 }, { "epoch": 1.5626598465473145, "grad_norm": 0.2736816450940113, "learning_rate": 1.9090289173753077e-05, "loss": 1.1013, "step": 611 }, { "epoch": 1.5652173913043477, "grad_norm": 0.24169212652369965, "learning_rate": 1.908496628942079e-05, "loss": 1.0904, "step": 612 }, { "epoch": 1.567774936061381, "grad_norm": 0.2790060046832418, "learning_rate": 1.907962862441639e-05, "loss": 1.0789, "step": 613 }, { "epoch": 1.5703324808184145, "grad_norm": 0.25148763709880523, "learning_rate": 1.9074276187423925e-05, "loss": 1.083, "step": 614 }, { "epoch": 1.5728900255754477, "grad_norm": 0.260089635225582, "learning_rate": 1.906890898715147e-05, "loss": 1.1052, "step": 615 }, { "epoch": 1.5754475703324808, "grad_norm": 0.24239290344853867, "learning_rate": 1.9063527032331128e-05, "loss": 1.0587, "step": 616 }, { "epoch": 1.578005115089514, "grad_norm": 0.31033949728422483, "learning_rate": 1.9058130331719002e-05, "loss": 1.0906, "step": 617 }, { "epoch": 1.5805626598465472, "grad_norm": 0.29694640873919886, "learning_rate": 1.9052718894095183e-05, "loss": 1.0828, "step": 618 }, { "epoch": 1.5831202046035806, "grad_norm": 0.268458744450183, "learning_rate": 1.904729272826375e-05, "loss": 1.0697, "step": 619 }, { "epoch": 1.5856777493606138, "grad_norm": 0.3328538025026265, "learning_rate": 1.9041851843052727e-05, "loss": 1.0556, "step": 620 }, { "epoch": 1.5882352941176472, "grad_norm": 0.4354576423430095, "learning_rate": 1.90363962473141e-05, "loss": 1.0888, "step": 621 }, { "epoch": 1.5907928388746804, "grad_norm": 0.4488970201166202, "learning_rate": 1.9030925949923777e-05, "loss": 1.0991, "step": 622 }, { "epoch": 1.5933503836317136, "grad_norm": 0.30850235477610843, "learning_rate": 1.9025440959781593e-05, "loss": 1.0721, "step": 623 }, { "epoch": 1.5959079283887467, "grad_norm": 0.24306011770668454, "learning_rate": 1.9019941285811284e-05, "loss": 1.1146, "step": 624 }, { "epoch": 1.59846547314578, "grad_norm": 0.31927732953474425, "learning_rate": 1.9014426936960477e-05, "loss": 1.1386, "step": 625 }, { "epoch": 1.6010230179028133, "grad_norm": 0.30395309199867215, "learning_rate": 1.900889792220067e-05, "loss": 1.0651, "step": 626 }, { "epoch": 1.6035805626598465, "grad_norm": 0.2641664347228699, "learning_rate": 1.9003354250527225e-05, "loss": 1.0737, "step": 627 }, { "epoch": 1.60613810741688, "grad_norm": 0.2541673904415416, "learning_rate": 1.899779593095935e-05, "loss": 1.1093, "step": 628 }, { "epoch": 1.608695652173913, "grad_norm": 0.248114384702292, "learning_rate": 1.8992222972540083e-05, "loss": 1.0631, "step": 629 }, { "epoch": 1.6112531969309463, "grad_norm": 0.27098670487834897, "learning_rate": 1.8986635384336275e-05, "loss": 1.0684, "step": 630 }, { "epoch": 1.6138107416879794, "grad_norm": 0.2707047290641469, "learning_rate": 1.8981033175438593e-05, "loss": 1.0793, "step": 631 }, { "epoch": 1.6163682864450126, "grad_norm": 0.2248022175811438, "learning_rate": 1.897541635496147e-05, "loss": 1.0741, "step": 632 }, { "epoch": 1.618925831202046, "grad_norm": 0.33046089699268805, "learning_rate": 1.896978493204313e-05, "loss": 1.0536, "step": 633 }, { "epoch": 1.6214833759590794, "grad_norm": 0.2897890506100947, "learning_rate": 1.896413891584554e-05, "loss": 1.1041, "step": 634 }, { "epoch": 1.6240409207161126, "grad_norm": 0.24423929651462964, "learning_rate": 1.8958478315554414e-05, "loss": 1.0554, "step": 635 }, { "epoch": 1.6265984654731458, "grad_norm": 0.2824637389915044, "learning_rate": 1.8952803140379198e-05, "loss": 1.105, "step": 636 }, { "epoch": 1.629156010230179, "grad_norm": 0.34172319194434536, "learning_rate": 1.894711339955305e-05, "loss": 1.0966, "step": 637 }, { "epoch": 1.6317135549872122, "grad_norm": 0.2986624598202099, "learning_rate": 1.8941409102332818e-05, "loss": 1.0801, "step": 638 }, { "epoch": 1.6342710997442456, "grad_norm": 0.35330551163337126, "learning_rate": 1.893569025799904e-05, "loss": 1.1168, "step": 639 }, { "epoch": 1.6368286445012787, "grad_norm": 0.37997527154753075, "learning_rate": 1.8929956875855913e-05, "loss": 1.044, "step": 640 }, { "epoch": 1.6393861892583121, "grad_norm": 0.3987670557181093, "learning_rate": 1.89242089652313e-05, "loss": 1.0678, "step": 641 }, { "epoch": 1.6419437340153453, "grad_norm": 0.4164983853962145, "learning_rate": 1.8918446535476683e-05, "loss": 1.0713, "step": 642 }, { "epoch": 1.6445012787723785, "grad_norm": 0.36634278907361967, "learning_rate": 1.8912669595967182e-05, "loss": 1.0845, "step": 643 }, { "epoch": 1.6470588235294117, "grad_norm": 0.3377854105852521, "learning_rate": 1.890687815610151e-05, "loss": 1.1325, "step": 644 }, { "epoch": 1.6496163682864449, "grad_norm": 0.2921364211079459, "learning_rate": 1.8901072225301983e-05, "loss": 1.0417, "step": 645 }, { "epoch": 1.6521739130434783, "grad_norm": 0.40803324585389733, "learning_rate": 1.8895251813014486e-05, "loss": 1.0985, "step": 646 }, { "epoch": 1.6547314578005117, "grad_norm": 0.4777584379650545, "learning_rate": 1.8889416928708465e-05, "loss": 1.0579, "step": 647 }, { "epoch": 1.6572890025575449, "grad_norm": 0.4575863335013247, "learning_rate": 1.8883567581876913e-05, "loss": 1.075, "step": 648 }, { "epoch": 1.659846547314578, "grad_norm": 0.44868767506108537, "learning_rate": 1.887770378203635e-05, "loss": 1.082, "step": 649 }, { "epoch": 1.6624040920716112, "grad_norm": 0.3990360823870846, "learning_rate": 1.8871825538726815e-05, "loss": 1.0618, "step": 650 }, { "epoch": 1.6649616368286444, "grad_norm": 0.384455268117493, "learning_rate": 1.8865932861511836e-05, "loss": 1.0883, "step": 651 }, { "epoch": 1.6675191815856778, "grad_norm": 0.4308655650983798, "learning_rate": 1.8860025759978436e-05, "loss": 1.1136, "step": 652 }, { "epoch": 1.670076726342711, "grad_norm": 0.5161027640726775, "learning_rate": 1.8854104243737096e-05, "loss": 1.0876, "step": 653 }, { "epoch": 1.6726342710997444, "grad_norm": 0.5710337903727111, "learning_rate": 1.8848168322421756e-05, "loss": 1.0921, "step": 654 }, { "epoch": 1.6751918158567776, "grad_norm": 0.4680011964164238, "learning_rate": 1.884221800568979e-05, "loss": 1.0817, "step": 655 }, { "epoch": 1.6777493606138107, "grad_norm": 0.273509418810932, "learning_rate": 1.8836253303221985e-05, "loss": 1.0676, "step": 656 }, { "epoch": 1.680306905370844, "grad_norm": 0.36238937602325755, "learning_rate": 1.8830274224722544e-05, "loss": 1.0694, "step": 657 }, { "epoch": 1.682864450127877, "grad_norm": 0.4331370312585361, "learning_rate": 1.8824280779919055e-05, "loss": 1.0939, "step": 658 }, { "epoch": 1.6854219948849105, "grad_norm": 0.42161084086226236, "learning_rate": 1.8818272978562472e-05, "loss": 1.0949, "step": 659 }, { "epoch": 1.6879795396419437, "grad_norm": 0.42114600096809945, "learning_rate": 1.8812250830427116e-05, "loss": 1.1071, "step": 660 }, { "epoch": 1.690537084398977, "grad_norm": 0.2580305989521523, "learning_rate": 1.8806214345310648e-05, "loss": 1.0884, "step": 661 }, { "epoch": 1.6930946291560103, "grad_norm": 0.2790098578226022, "learning_rate": 1.8800163533034048e-05, "loss": 1.0786, "step": 662 }, { "epoch": 1.6956521739130435, "grad_norm": 0.3952483114126335, "learning_rate": 1.879409840344161e-05, "loss": 1.1025, "step": 663 }, { "epoch": 1.6982097186700766, "grad_norm": 0.34837002184241345, "learning_rate": 1.8788018966400923e-05, "loss": 1.0862, "step": 664 }, { "epoch": 1.7007672634271098, "grad_norm": 0.23347425632455518, "learning_rate": 1.878192523180285e-05, "loss": 1.0903, "step": 665 }, { "epoch": 1.7033248081841432, "grad_norm": 0.258084870513599, "learning_rate": 1.877581720956151e-05, "loss": 1.0659, "step": 666 }, { "epoch": 1.7058823529411766, "grad_norm": 0.2955310030807304, "learning_rate": 1.876969490961428e-05, "loss": 1.0803, "step": 667 }, { "epoch": 1.7084398976982098, "grad_norm": 0.34485101895191056, "learning_rate": 1.8763558341921762e-05, "loss": 1.0729, "step": 668 }, { "epoch": 1.710997442455243, "grad_norm": 0.25932977011662367, "learning_rate": 1.8757407516467762e-05, "loss": 1.1017, "step": 669 }, { "epoch": 1.7135549872122762, "grad_norm": 0.23771298856204617, "learning_rate": 1.8751242443259286e-05, "loss": 1.0771, "step": 670 }, { "epoch": 1.7161125319693094, "grad_norm": 0.3403000739473665, "learning_rate": 1.874506313232653e-05, "loss": 1.0972, "step": 671 }, { "epoch": 1.7186700767263428, "grad_norm": 0.36624614786635146, "learning_rate": 1.873886959372284e-05, "loss": 1.0948, "step": 672 }, { "epoch": 1.721227621483376, "grad_norm": 0.23241780598609607, "learning_rate": 1.8732661837524722e-05, "loss": 1.0726, "step": 673 }, { "epoch": 1.7237851662404093, "grad_norm": 0.27573330219222747, "learning_rate": 1.8726439873831803e-05, "loss": 1.1154, "step": 674 }, { "epoch": 1.7263427109974425, "grad_norm": 0.3289571952505283, "learning_rate": 1.8720203712766833e-05, "loss": 1.0855, "step": 675 }, { "epoch": 1.7289002557544757, "grad_norm": 0.26315983835648826, "learning_rate": 1.8713953364475654e-05, "loss": 1.0561, "step": 676 }, { "epoch": 1.7314578005115089, "grad_norm": 0.2933737539222408, "learning_rate": 1.8707688839127187e-05, "loss": 1.0717, "step": 677 }, { "epoch": 1.734015345268542, "grad_norm": 0.24075336640916348, "learning_rate": 1.8701410146913427e-05, "loss": 1.0733, "step": 678 }, { "epoch": 1.7365728900255755, "grad_norm": 0.2969635924636881, "learning_rate": 1.869511729804942e-05, "loss": 1.0736, "step": 679 }, { "epoch": 1.7391304347826086, "grad_norm": 0.2302120367596696, "learning_rate": 1.8688810302773225e-05, "loss": 1.0718, "step": 680 }, { "epoch": 1.741687979539642, "grad_norm": 0.31123990252305606, "learning_rate": 1.8682489171345942e-05, "loss": 1.0633, "step": 681 }, { "epoch": 1.7442455242966752, "grad_norm": 0.25671775642481637, "learning_rate": 1.8676153914051648e-05, "loss": 1.1055, "step": 682 }, { "epoch": 1.7468030690537084, "grad_norm": 0.2731165902037635, "learning_rate": 1.866980454119741e-05, "loss": 1.1019, "step": 683 }, { "epoch": 1.7493606138107416, "grad_norm": 0.29946202186623655, "learning_rate": 1.8663441063113266e-05, "loss": 1.0856, "step": 684 }, { "epoch": 1.7519181585677748, "grad_norm": 0.2743108383298565, "learning_rate": 1.8657063490152193e-05, "loss": 1.0797, "step": 685 }, { "epoch": 1.7544757033248082, "grad_norm": 0.2910690805954212, "learning_rate": 1.8650671832690106e-05, "loss": 1.1068, "step": 686 }, { "epoch": 1.7570332480818416, "grad_norm": 0.25617556691443527, "learning_rate": 1.864426610112583e-05, "loss": 1.0801, "step": 687 }, { "epoch": 1.7595907928388748, "grad_norm": 0.2446643852273966, "learning_rate": 1.8637846305881092e-05, "loss": 1.0712, "step": 688 }, { "epoch": 1.762148337595908, "grad_norm": 0.24853300895824507, "learning_rate": 1.8631412457400494e-05, "loss": 1.0518, "step": 689 }, { "epoch": 1.7647058823529411, "grad_norm": 0.2250526521940477, "learning_rate": 1.862496456615151e-05, "loss": 1.0802, "step": 690 }, { "epoch": 1.7672634271099743, "grad_norm": 0.23033386861703295, "learning_rate": 1.861850264262445e-05, "loss": 1.0921, "step": 691 }, { "epoch": 1.7698209718670077, "grad_norm": 0.22393185289398734, "learning_rate": 1.8612026697332466e-05, "loss": 1.0824, "step": 692 }, { "epoch": 1.772378516624041, "grad_norm": 0.24371247518659098, "learning_rate": 1.860553674081151e-05, "loss": 1.0958, "step": 693 }, { "epoch": 1.7749360613810743, "grad_norm": 0.21684995978781324, "learning_rate": 1.859903278362034e-05, "loss": 1.0511, "step": 694 }, { "epoch": 1.7774936061381075, "grad_norm": 0.24359803588661344, "learning_rate": 1.8592514836340485e-05, "loss": 1.064, "step": 695 }, { "epoch": 1.7800511508951407, "grad_norm": 0.2806613621237684, "learning_rate": 1.8585982909576243e-05, "loss": 1.0974, "step": 696 }, { "epoch": 1.7826086956521738, "grad_norm": 0.2951317541501585, "learning_rate": 1.857943701395464e-05, "loss": 1.0745, "step": 697 }, { "epoch": 1.785166240409207, "grad_norm": 0.2602691127905397, "learning_rate": 1.857287716012545e-05, "loss": 1.094, "step": 698 }, { "epoch": 1.7877237851662404, "grad_norm": 0.2878865850607815, "learning_rate": 1.8566303358761134e-05, "loss": 1.0764, "step": 699 }, { "epoch": 1.7902813299232738, "grad_norm": 0.25826524614522556, "learning_rate": 1.8559715620556865e-05, "loss": 1.095, "step": 700 }, { "epoch": 1.792838874680307, "grad_norm": 0.3113734244197743, "learning_rate": 1.855311395623048e-05, "loss": 1.0636, "step": 701 }, { "epoch": 1.7953964194373402, "grad_norm": 0.32545837268145317, "learning_rate": 1.854649837652247e-05, "loss": 1.0836, "step": 702 }, { "epoch": 1.7979539641943734, "grad_norm": 0.285984682125429, "learning_rate": 1.8539868892195972e-05, "loss": 1.0848, "step": 703 }, { "epoch": 1.8005115089514065, "grad_norm": 0.27758608852953665, "learning_rate": 1.8533225514036742e-05, "loss": 1.0663, "step": 704 }, { "epoch": 1.80306905370844, "grad_norm": 0.27148772448252917, "learning_rate": 1.852656825285314e-05, "loss": 1.094, "step": 705 }, { "epoch": 1.8056265984654731, "grad_norm": 0.30810009717755804, "learning_rate": 1.8519897119476115e-05, "loss": 1.0455, "step": 706 }, { "epoch": 1.8081841432225065, "grad_norm": 0.2763175632842481, "learning_rate": 1.8513212124759185e-05, "loss": 1.0525, "step": 707 }, { "epoch": 1.8107416879795397, "grad_norm": 0.2555077301269018, "learning_rate": 1.8506513279578415e-05, "loss": 1.0708, "step": 708 }, { "epoch": 1.813299232736573, "grad_norm": 0.2861828394638753, "learning_rate": 1.849980059483241e-05, "loss": 1.0269, "step": 709 }, { "epoch": 1.815856777493606, "grad_norm": 0.32694363610851984, "learning_rate": 1.849307408144229e-05, "loss": 1.0742, "step": 710 }, { "epoch": 1.8184143222506393, "grad_norm": 0.33550420038638934, "learning_rate": 1.8486333750351668e-05, "loss": 1.1291, "step": 711 }, { "epoch": 1.8209718670076727, "grad_norm": 0.30494475043620173, "learning_rate": 1.8479579612526642e-05, "loss": 1.0754, "step": 712 }, { "epoch": 1.8235294117647058, "grad_norm": 0.2449819480488345, "learning_rate": 1.8472811678955773e-05, "loss": 1.083, "step": 713 }, { "epoch": 1.8260869565217392, "grad_norm": 0.26042670531487994, "learning_rate": 1.8466029960650066e-05, "loss": 1.0749, "step": 714 }, { "epoch": 1.8286445012787724, "grad_norm": 0.3057228350277353, "learning_rate": 1.845923446864295e-05, "loss": 1.0549, "step": 715 }, { "epoch": 1.8312020460358056, "grad_norm": 0.2500852141764497, "learning_rate": 1.845242521399027e-05, "loss": 1.0721, "step": 716 }, { "epoch": 1.8337595907928388, "grad_norm": 0.2675252870460311, "learning_rate": 1.8445602207770254e-05, "loss": 1.0449, "step": 717 }, { "epoch": 1.836317135549872, "grad_norm": 0.2836719734304398, "learning_rate": 1.8438765461083504e-05, "loss": 1.0905, "step": 718 }, { "epoch": 1.8388746803069054, "grad_norm": 0.34699165997108533, "learning_rate": 1.843191498505299e-05, "loss": 1.0901, "step": 719 }, { "epoch": 1.8414322250639388, "grad_norm": 0.2722070954863811, "learning_rate": 1.8425050790823994e-05, "loss": 1.0964, "step": 720 }, { "epoch": 1.843989769820972, "grad_norm": 0.258368289769939, "learning_rate": 1.8418172889564145e-05, "loss": 1.0962, "step": 721 }, { "epoch": 1.8465473145780051, "grad_norm": 0.25936143701246717, "learning_rate": 1.8411281292463345e-05, "loss": 1.0545, "step": 722 }, { "epoch": 1.8491048593350383, "grad_norm": 0.3060957581043503, "learning_rate": 1.8404376010733802e-05, "loss": 1.0815, "step": 723 }, { "epoch": 1.8516624040920715, "grad_norm": 0.2815365945528782, "learning_rate": 1.8397457055609973e-05, "loss": 1.0759, "step": 724 }, { "epoch": 1.854219948849105, "grad_norm": 0.2745951540225352, "learning_rate": 1.8390524438348565e-05, "loss": 1.1021, "step": 725 }, { "epoch": 1.856777493606138, "grad_norm": 0.27846031555437806, "learning_rate": 1.8383578170228514e-05, "loss": 1.0248, "step": 726 }, { "epoch": 1.8593350383631715, "grad_norm": 0.2938959273434096, "learning_rate": 1.8376618262550966e-05, "loss": 1.0528, "step": 727 }, { "epoch": 1.8618925831202047, "grad_norm": 0.2993316558221603, "learning_rate": 1.836964472663925e-05, "loss": 1.058, "step": 728 }, { "epoch": 1.8644501278772379, "grad_norm": 0.28817201575308804, "learning_rate": 1.8362657573838874e-05, "loss": 1.1157, "step": 729 }, { "epoch": 1.867007672634271, "grad_norm": 0.22467467671098768, "learning_rate": 1.8355656815517505e-05, "loss": 1.0711, "step": 730 }, { "epoch": 1.8695652173913042, "grad_norm": 0.29149108866988305, "learning_rate": 1.8348642463064937e-05, "loss": 1.0414, "step": 731 }, { "epoch": 1.8721227621483376, "grad_norm": 0.39401431973372464, "learning_rate": 1.8341614527893077e-05, "loss": 1.0791, "step": 732 }, { "epoch": 1.8746803069053708, "grad_norm": 0.4335182479065654, "learning_rate": 1.833457302143594e-05, "loss": 1.0878, "step": 733 }, { "epoch": 1.8772378516624042, "grad_norm": 0.43497766670833005, "learning_rate": 1.832751795514962e-05, "loss": 1.0484, "step": 734 }, { "epoch": 1.8797953964194374, "grad_norm": 0.2997553952148685, "learning_rate": 1.832044934051226e-05, "loss": 1.0762, "step": 735 }, { "epoch": 1.8823529411764706, "grad_norm": 0.23441660095601177, "learning_rate": 1.8313367189024065e-05, "loss": 1.1082, "step": 736 }, { "epoch": 1.8849104859335037, "grad_norm": 0.23816717696848114, "learning_rate": 1.8306271512207242e-05, "loss": 1.0834, "step": 737 }, { "epoch": 1.887468030690537, "grad_norm": 0.29809886717421774, "learning_rate": 1.829916232160602e-05, "loss": 1.087, "step": 738 }, { "epoch": 1.8900255754475703, "grad_norm": 0.36580006827207345, "learning_rate": 1.829203962878661e-05, "loss": 1.0718, "step": 739 }, { "epoch": 1.8925831202046037, "grad_norm": 0.36472500474679165, "learning_rate": 1.8284903445337184e-05, "loss": 1.0435, "step": 740 }, { "epoch": 1.895140664961637, "grad_norm": 0.2569898458683152, "learning_rate": 1.8277753782867865e-05, "loss": 1.0569, "step": 741 }, { "epoch": 1.89769820971867, "grad_norm": 0.2807015519670205, "learning_rate": 1.8270590653010706e-05, "loss": 1.0623, "step": 742 }, { "epoch": 1.9002557544757033, "grad_norm": 0.2706420270561887, "learning_rate": 1.8263414067419676e-05, "loss": 1.101, "step": 743 }, { "epoch": 1.9028132992327365, "grad_norm": 0.28562929161394046, "learning_rate": 1.8256224037770628e-05, "loss": 1.0524, "step": 744 }, { "epoch": 1.9053708439897699, "grad_norm": 0.2774733347803849, "learning_rate": 1.824902057576129e-05, "loss": 1.0511, "step": 745 }, { "epoch": 1.907928388746803, "grad_norm": 0.22198709105225659, "learning_rate": 1.8241803693111245e-05, "loss": 1.075, "step": 746 }, { "epoch": 1.9104859335038364, "grad_norm": 0.287788512970941, "learning_rate": 1.8234573401561914e-05, "loss": 1.0665, "step": 747 }, { "epoch": 1.9130434782608696, "grad_norm": 0.2909301551397291, "learning_rate": 1.8227329712876525e-05, "loss": 1.0802, "step": 748 }, { "epoch": 1.9156010230179028, "grad_norm": 0.25392349276614573, "learning_rate": 1.8220072638840105e-05, "loss": 1.1035, "step": 749 }, { "epoch": 1.918158567774936, "grad_norm": 0.22821936416155694, "learning_rate": 1.8212802191259465e-05, "loss": 1.0571, "step": 750 }, { "epoch": 1.9207161125319692, "grad_norm": 0.3130516886250542, "learning_rate": 1.8205518381963165e-05, "loss": 1.1095, "step": 751 }, { "epoch": 1.9232736572890026, "grad_norm": 0.3857586516868388, "learning_rate": 1.8198221222801506e-05, "loss": 1.06, "step": 752 }, { "epoch": 1.9258312020460358, "grad_norm": 0.315792024279407, "learning_rate": 1.8190910725646512e-05, "loss": 1.0772, "step": 753 }, { "epoch": 1.9283887468030692, "grad_norm": 0.26686727973038904, "learning_rate": 1.8183586902391905e-05, "loss": 1.0708, "step": 754 }, { "epoch": 1.9309462915601023, "grad_norm": 0.3669775155609857, "learning_rate": 1.8176249764953088e-05, "loss": 1.0393, "step": 755 }, { "epoch": 1.9335038363171355, "grad_norm": 0.3411186812565117, "learning_rate": 1.8168899325267122e-05, "loss": 1.0777, "step": 756 }, { "epoch": 1.9360613810741687, "grad_norm": 0.29525106020949826, "learning_rate": 1.8161535595292717e-05, "loss": 1.0738, "step": 757 }, { "epoch": 1.938618925831202, "grad_norm": 0.2431416087312154, "learning_rate": 1.8154158587010195e-05, "loss": 1.0552, "step": 758 }, { "epoch": 1.9411764705882353, "grad_norm": 0.2528824918629993, "learning_rate": 1.8146768312421495e-05, "loss": 1.1049, "step": 759 }, { "epoch": 1.9437340153452687, "grad_norm": 0.27274199937217425, "learning_rate": 1.8139364783550128e-05, "loss": 1.11, "step": 760 }, { "epoch": 1.9462915601023019, "grad_norm": 0.27694326525936447, "learning_rate": 1.813194801244117e-05, "loss": 1.1085, "step": 761 }, { "epoch": 1.948849104859335, "grad_norm": 0.26284036778935943, "learning_rate": 1.8124518011161246e-05, "loss": 1.0817, "step": 762 }, { "epoch": 1.9514066496163682, "grad_norm": 0.34628694859076536, "learning_rate": 1.8117074791798503e-05, "loss": 1.0723, "step": 763 }, { "epoch": 1.9539641943734014, "grad_norm": 0.3205449398285809, "learning_rate": 1.8109618366462597e-05, "loss": 1.0878, "step": 764 }, { "epoch": 1.9565217391304348, "grad_norm": 0.2930907660937919, "learning_rate": 1.8102148747284662e-05, "loss": 1.0194, "step": 765 }, { "epoch": 1.959079283887468, "grad_norm": 0.3199378305398446, "learning_rate": 1.8094665946417304e-05, "loss": 1.0818, "step": 766 }, { "epoch": 1.9616368286445014, "grad_norm": 0.3147442131814513, "learning_rate": 1.8087169976034568e-05, "loss": 1.0524, "step": 767 }, { "epoch": 1.9641943734015346, "grad_norm": 0.29010540377698546, "learning_rate": 1.807966084833193e-05, "loss": 1.0804, "step": 768 }, { "epoch": 1.9667519181585678, "grad_norm": 0.2830375710975825, "learning_rate": 1.8072138575526277e-05, "loss": 1.0876, "step": 769 }, { "epoch": 1.969309462915601, "grad_norm": 0.29912181409924526, "learning_rate": 1.806460316985587e-05, "loss": 1.0674, "step": 770 }, { "epoch": 1.9718670076726341, "grad_norm": 0.280637494020639, "learning_rate": 1.8057054643580347e-05, "loss": 1.059, "step": 771 }, { "epoch": 1.9744245524296675, "grad_norm": 0.25437147169201857, "learning_rate": 1.8049493008980685e-05, "loss": 1.076, "step": 772 }, { "epoch": 1.976982097186701, "grad_norm": 0.260015840044801, "learning_rate": 1.8041918278359194e-05, "loss": 1.0884, "step": 773 }, { "epoch": 1.979539641943734, "grad_norm": 0.23338451398624144, "learning_rate": 1.8034330464039485e-05, "loss": 1.0564, "step": 774 }, { "epoch": 1.9820971867007673, "grad_norm": 0.27240262637273416, "learning_rate": 1.8026729578366457e-05, "loss": 1.0653, "step": 775 }, { "epoch": 1.9846547314578005, "grad_norm": 0.2658428330726454, "learning_rate": 1.801911563370628e-05, "loss": 1.0847, "step": 776 }, { "epoch": 1.9872122762148337, "grad_norm": 0.24259844645380865, "learning_rate": 1.801148864244636e-05, "loss": 1.0617, "step": 777 }, { "epoch": 1.989769820971867, "grad_norm": 0.274423591955145, "learning_rate": 1.8003848616995333e-05, "loss": 1.1046, "step": 778 }, { "epoch": 1.9923273657289002, "grad_norm": 0.270074412347766, "learning_rate": 1.7996195569783053e-05, "loss": 1.0841, "step": 779 }, { "epoch": 1.9948849104859336, "grad_norm": 0.32727342222060607, "learning_rate": 1.798852951326054e-05, "loss": 1.064, "step": 780 }, { "epoch": 1.9974424552429668, "grad_norm": 0.28041604224998723, "learning_rate": 1.7980850459899997e-05, "loss": 1.0748, "step": 781 }, { "epoch": 2.0, "grad_norm": 0.230649257113214, "learning_rate": 1.7973158422194754e-05, "loss": 1.0504, "step": 782 }, { "epoch": 2.002557544757033, "grad_norm": 0.27721442928112094, "learning_rate": 1.7965453412659284e-05, "loss": 1.0561, "step": 783 }, { "epoch": 2.0051150895140664, "grad_norm": 0.3484629274944669, "learning_rate": 1.795773544382915e-05, "loss": 1.0484, "step": 784 }, { "epoch": 2.0076726342710995, "grad_norm": 0.35248757109292245, "learning_rate": 1.795000452826101e-05, "loss": 1.0494, "step": 785 }, { "epoch": 2.010230179028133, "grad_norm": 0.31602726514395096, "learning_rate": 1.794226067853257e-05, "loss": 1.1343, "step": 786 }, { "epoch": 2.0127877237851663, "grad_norm": 0.30632695925595954, "learning_rate": 1.79345039072426e-05, "loss": 1.0648, "step": 787 }, { "epoch": 2.0153452685421995, "grad_norm": 0.33328827891250323, "learning_rate": 1.7926734227010876e-05, "loss": 1.0801, "step": 788 }, { "epoch": 2.0179028132992327, "grad_norm": 0.35618373914463364, "learning_rate": 1.7918951650478188e-05, "loss": 1.0613, "step": 789 }, { "epoch": 2.020460358056266, "grad_norm": 0.3085542598082131, "learning_rate": 1.7911156190306296e-05, "loss": 1.0476, "step": 790 }, { "epoch": 2.023017902813299, "grad_norm": 0.22686489493321832, "learning_rate": 1.7903347859177926e-05, "loss": 1.0486, "step": 791 }, { "epoch": 2.0255754475703327, "grad_norm": 0.2750201664093288, "learning_rate": 1.7895526669796747e-05, "loss": 1.0543, "step": 792 }, { "epoch": 2.028132992327366, "grad_norm": 0.2998881689120612, "learning_rate": 1.7887692634887345e-05, "loss": 1.0434, "step": 793 }, { "epoch": 2.030690537084399, "grad_norm": 0.260904922673988, "learning_rate": 1.7879845767195204e-05, "loss": 1.0443, "step": 794 }, { "epoch": 2.0332480818414322, "grad_norm": 0.2465816351987358, "learning_rate": 1.787198607948669e-05, "loss": 1.0516, "step": 795 }, { "epoch": 2.0358056265984654, "grad_norm": 0.23239060808440448, "learning_rate": 1.786411358454902e-05, "loss": 1.0588, "step": 796 }, { "epoch": 2.0383631713554986, "grad_norm": 0.26101630597920855, "learning_rate": 1.785622829519025e-05, "loss": 1.0835, "step": 797 }, { "epoch": 2.040920716112532, "grad_norm": 0.3040971752066545, "learning_rate": 1.7848330224239256e-05, "loss": 1.0563, "step": 798 }, { "epoch": 2.0434782608695654, "grad_norm": 0.26487253530894395, "learning_rate": 1.7840419384545706e-05, "loss": 1.0579, "step": 799 }, { "epoch": 2.0460358056265986, "grad_norm": 0.2689601096947907, "learning_rate": 1.7832495788980035e-05, "loss": 1.1015, "step": 800 }, { "epoch": 2.0485933503836318, "grad_norm": 0.25525460785840065, "learning_rate": 1.7824559450433446e-05, "loss": 1.0537, "step": 801 }, { "epoch": 2.051150895140665, "grad_norm": 0.345599384998098, "learning_rate": 1.7816610381817864e-05, "loss": 1.0604, "step": 802 }, { "epoch": 2.053708439897698, "grad_norm": 0.3359389407416057, "learning_rate": 1.780864859606592e-05, "loss": 1.0664, "step": 803 }, { "epoch": 2.0562659846547313, "grad_norm": 0.2813553104050823, "learning_rate": 1.780067410613095e-05, "loss": 1.0937, "step": 804 }, { "epoch": 2.0588235294117645, "grad_norm": 0.2548220560875847, "learning_rate": 1.7792686924986946e-05, "loss": 1.0441, "step": 805 }, { "epoch": 2.061381074168798, "grad_norm": 0.28792647000401994, "learning_rate": 1.7784687065628554e-05, "loss": 1.058, "step": 806 }, { "epoch": 2.0639386189258313, "grad_norm": 0.2603601267230107, "learning_rate": 1.777667454107104e-05, "loss": 1.0992, "step": 807 }, { "epoch": 2.0664961636828645, "grad_norm": 0.2583588654263776, "learning_rate": 1.776864936435029e-05, "loss": 1.0735, "step": 808 }, { "epoch": 2.0690537084398977, "grad_norm": 0.30719716854376583, "learning_rate": 1.7760611548522755e-05, "loss": 1.0498, "step": 809 }, { "epoch": 2.071611253196931, "grad_norm": 0.30807492892970295, "learning_rate": 1.7752561106665463e-05, "loss": 1.0548, "step": 810 }, { "epoch": 2.074168797953964, "grad_norm": 0.3210704099635407, "learning_rate": 1.7744498051875984e-05, "loss": 1.077, "step": 811 }, { "epoch": 2.0767263427109977, "grad_norm": 0.4282126010865939, "learning_rate": 1.7736422397272396e-05, "loss": 1.0494, "step": 812 }, { "epoch": 2.079283887468031, "grad_norm": 0.4051125030459934, "learning_rate": 1.772833415599329e-05, "loss": 1.0511, "step": 813 }, { "epoch": 2.081841432225064, "grad_norm": 0.2991528183767012, "learning_rate": 1.7720233341197726e-05, "loss": 1.1121, "step": 814 }, { "epoch": 2.084398976982097, "grad_norm": 0.22783217071200507, "learning_rate": 1.7712119966065225e-05, "loss": 1.0383, "step": 815 }, { "epoch": 2.0869565217391304, "grad_norm": 0.3516616820022178, "learning_rate": 1.770399404379574e-05, "loss": 1.0498, "step": 816 }, { "epoch": 2.0895140664961636, "grad_norm": 0.2606641623626611, "learning_rate": 1.7695855587609637e-05, "loss": 1.0594, "step": 817 }, { "epoch": 2.0920716112531967, "grad_norm": 0.269085192714615, "learning_rate": 1.7687704610747676e-05, "loss": 1.0419, "step": 818 }, { "epoch": 2.0946291560102304, "grad_norm": 0.28768629596697776, "learning_rate": 1.767954112647099e-05, "loss": 1.0435, "step": 819 }, { "epoch": 2.0971867007672635, "grad_norm": 0.27429737921035624, "learning_rate": 1.7671365148061053e-05, "loss": 1.0458, "step": 820 }, { "epoch": 2.0997442455242967, "grad_norm": 0.29736519534073375, "learning_rate": 1.7663176688819673e-05, "loss": 1.0566, "step": 821 }, { "epoch": 2.10230179028133, "grad_norm": 0.26021437570192907, "learning_rate": 1.765497576206896e-05, "loss": 1.0422, "step": 822 }, { "epoch": 2.104859335038363, "grad_norm": 0.2783440308095714, "learning_rate": 1.764676238115131e-05, "loss": 1.0776, "step": 823 }, { "epoch": 2.1074168797953963, "grad_norm": 0.3339846285282316, "learning_rate": 1.763853655942938e-05, "loss": 1.0674, "step": 824 }, { "epoch": 2.10997442455243, "grad_norm": 0.2223362385153581, "learning_rate": 1.7630298310286065e-05, "loss": 1.0699, "step": 825 }, { "epoch": 2.112531969309463, "grad_norm": 0.33059613735162624, "learning_rate": 1.7622047647124488e-05, "loss": 1.0634, "step": 826 }, { "epoch": 2.1150895140664963, "grad_norm": 0.3414911305158879, "learning_rate": 1.761378458336796e-05, "loss": 1.0548, "step": 827 }, { "epoch": 2.1176470588235294, "grad_norm": 0.32041930375116484, "learning_rate": 1.760550913245996e-05, "loss": 1.0621, "step": 828 }, { "epoch": 2.1202046035805626, "grad_norm": 0.2971788267573472, "learning_rate": 1.7597221307864142e-05, "loss": 1.0704, "step": 829 }, { "epoch": 2.122762148337596, "grad_norm": 0.27537162097267065, "learning_rate": 1.7588921123064273e-05, "loss": 1.0961, "step": 830 }, { "epoch": 2.125319693094629, "grad_norm": 0.29232241446373336, "learning_rate": 1.7580608591564233e-05, "loss": 1.0916, "step": 831 }, { "epoch": 2.1278772378516626, "grad_norm": 0.3815701080685027, "learning_rate": 1.757228372688799e-05, "loss": 1.0848, "step": 832 }, { "epoch": 2.130434782608696, "grad_norm": 0.33830135607419565, "learning_rate": 1.7563946542579584e-05, "loss": 1.0824, "step": 833 }, { "epoch": 2.132992327365729, "grad_norm": 0.26436755888688523, "learning_rate": 1.7555597052203088e-05, "loss": 1.0424, "step": 834 }, { "epoch": 2.135549872122762, "grad_norm": 0.2204259325114956, "learning_rate": 1.7547235269342602e-05, "loss": 1.0749, "step": 835 }, { "epoch": 2.1381074168797953, "grad_norm": 0.31500508880378464, "learning_rate": 1.7538861207602225e-05, "loss": 1.0871, "step": 836 }, { "epoch": 2.1406649616368285, "grad_norm": 0.33104625224299034, "learning_rate": 1.753047488060603e-05, "loss": 1.0257, "step": 837 }, { "epoch": 2.1432225063938617, "grad_norm": 0.2325551980906377, "learning_rate": 1.7522076301998048e-05, "loss": 1.0907, "step": 838 }, { "epoch": 2.1457800511508953, "grad_norm": 0.2464976826758584, "learning_rate": 1.7513665485442238e-05, "loss": 1.067, "step": 839 }, { "epoch": 2.1483375959079285, "grad_norm": 0.25290511781194314, "learning_rate": 1.750524244462248e-05, "loss": 1.0893, "step": 840 }, { "epoch": 2.1508951406649617, "grad_norm": 0.3247901788745791, "learning_rate": 1.7496807193242528e-05, "loss": 1.0638, "step": 841 }, { "epoch": 2.153452685421995, "grad_norm": 0.34958915516133227, "learning_rate": 1.748835974502601e-05, "loss": 1.0825, "step": 842 }, { "epoch": 2.156010230179028, "grad_norm": 0.24243104695456325, "learning_rate": 1.7479900113716398e-05, "loss": 1.0537, "step": 843 }, { "epoch": 2.1585677749360612, "grad_norm": 0.2734369268109971, "learning_rate": 1.7471428313076984e-05, "loss": 1.1031, "step": 844 }, { "epoch": 2.1611253196930944, "grad_norm": 0.3380184912512867, "learning_rate": 1.7462944356890853e-05, "loss": 1.0589, "step": 845 }, { "epoch": 2.163682864450128, "grad_norm": 0.3625402818137926, "learning_rate": 1.7454448258960877e-05, "loss": 1.0561, "step": 846 }, { "epoch": 2.166240409207161, "grad_norm": 0.34638148620089215, "learning_rate": 1.744594003310967e-05, "loss": 1.0186, "step": 847 }, { "epoch": 2.1687979539641944, "grad_norm": 0.24740728690176142, "learning_rate": 1.743741969317959e-05, "loss": 1.1099, "step": 848 }, { "epoch": 2.1713554987212276, "grad_norm": 0.287155398140135, "learning_rate": 1.7428887253032695e-05, "loss": 1.0691, "step": 849 }, { "epoch": 2.1739130434782608, "grad_norm": 0.3566062867329238, "learning_rate": 1.7420342726550728e-05, "loss": 1.0701, "step": 850 }, { "epoch": 2.176470588235294, "grad_norm": 0.3096727205958978, "learning_rate": 1.74117861276351e-05, "loss": 1.0716, "step": 851 }, { "epoch": 2.1790281329923276, "grad_norm": 0.25874536932280473, "learning_rate": 1.740321747020687e-05, "loss": 1.0893, "step": 852 }, { "epoch": 2.1815856777493607, "grad_norm": 0.21538442833683963, "learning_rate": 1.7394636768206702e-05, "loss": 1.0266, "step": 853 }, { "epoch": 2.184143222506394, "grad_norm": 0.2871943030157397, "learning_rate": 1.738604403559486e-05, "loss": 1.0085, "step": 854 }, { "epoch": 2.186700767263427, "grad_norm": 0.2851621085345804, "learning_rate": 1.7377439286351184e-05, "loss": 1.0622, "step": 855 }, { "epoch": 2.1892583120204603, "grad_norm": 0.26228336638762867, "learning_rate": 1.736882253447506e-05, "loss": 1.083, "step": 856 }, { "epoch": 2.1918158567774935, "grad_norm": 0.26992050889733915, "learning_rate": 1.736019379398542e-05, "loss": 1.1006, "step": 857 }, { "epoch": 2.1943734015345266, "grad_norm": 0.23555655653113924, "learning_rate": 1.7351553078920665e-05, "loss": 1.0914, "step": 858 }, { "epoch": 2.1969309462915603, "grad_norm": 0.30209071932451825, "learning_rate": 1.734290040333871e-05, "loss": 1.0873, "step": 859 }, { "epoch": 2.1994884910485935, "grad_norm": 0.23936877597438264, "learning_rate": 1.733423578131691e-05, "loss": 1.0835, "step": 860 }, { "epoch": 2.2020460358056266, "grad_norm": 0.3366403647300894, "learning_rate": 1.732555922695207e-05, "loss": 1.0743, "step": 861 }, { "epoch": 2.20460358056266, "grad_norm": 0.30248308613139313, "learning_rate": 1.73168707543604e-05, "loss": 1.0482, "step": 862 }, { "epoch": 2.207161125319693, "grad_norm": 0.26759196361130394, "learning_rate": 1.73081703776775e-05, "loss": 1.0686, "step": 863 }, { "epoch": 2.209718670076726, "grad_norm": 0.2424062745806639, "learning_rate": 1.7299458111058336e-05, "loss": 1.0738, "step": 864 }, { "epoch": 2.21227621483376, "grad_norm": 0.24086304886593904, "learning_rate": 1.7290733968677226e-05, "loss": 1.0313, "step": 865 }, { "epoch": 2.214833759590793, "grad_norm": 0.30184358263466177, "learning_rate": 1.7281997964727803e-05, "loss": 1.0602, "step": 866 }, { "epoch": 2.217391304347826, "grad_norm": 0.2366294082979442, "learning_rate": 1.7273250113423e-05, "loss": 1.1046, "step": 867 }, { "epoch": 2.2199488491048593, "grad_norm": 0.26905581826310315, "learning_rate": 1.726449042899502e-05, "loss": 1.0437, "step": 868 }, { "epoch": 2.2225063938618925, "grad_norm": 0.36508543225667806, "learning_rate": 1.725571892569533e-05, "loss": 1.0809, "step": 869 }, { "epoch": 2.2250639386189257, "grad_norm": 0.30221117179280654, "learning_rate": 1.7246935617794608e-05, "loss": 1.0664, "step": 870 }, { "epoch": 2.227621483375959, "grad_norm": 0.2269380846996494, "learning_rate": 1.723814051958275e-05, "loss": 1.045, "step": 871 }, { "epoch": 2.2301790281329925, "grad_norm": 0.3848192034817777, "learning_rate": 1.7229333645368834e-05, "loss": 1.0661, "step": 872 }, { "epoch": 2.2327365728900257, "grad_norm": 0.4724477310420707, "learning_rate": 1.722051500948109e-05, "loss": 1.0846, "step": 873 }, { "epoch": 2.235294117647059, "grad_norm": 0.3561338471365552, "learning_rate": 1.7211684626266887e-05, "loss": 1.0718, "step": 874 }, { "epoch": 2.237851662404092, "grad_norm": 0.24533531015000096, "learning_rate": 1.7202842510092706e-05, "loss": 1.0428, "step": 875 }, { "epoch": 2.2404092071611252, "grad_norm": 0.2999534454935499, "learning_rate": 1.7193988675344125e-05, "loss": 1.0598, "step": 876 }, { "epoch": 2.2429667519181584, "grad_norm": 0.3931502655829081, "learning_rate": 1.7185123136425775e-05, "loss": 1.0486, "step": 877 }, { "epoch": 2.2455242966751916, "grad_norm": 0.4099239641868052, "learning_rate": 1.7176245907761327e-05, "loss": 1.0567, "step": 878 }, { "epoch": 2.2480818414322252, "grad_norm": 0.2859379832887241, "learning_rate": 1.7167357003793485e-05, "loss": 1.0567, "step": 879 }, { "epoch": 2.2506393861892584, "grad_norm": 0.29262327466969734, "learning_rate": 1.7158456438983934e-05, "loss": 1.0299, "step": 880 }, { "epoch": 2.2531969309462916, "grad_norm": 0.43158299248544585, "learning_rate": 1.7149544227813343e-05, "loss": 1.05, "step": 881 }, { "epoch": 2.2557544757033248, "grad_norm": 0.3011090401640172, "learning_rate": 1.7140620384781316e-05, "loss": 1.0166, "step": 882 }, { "epoch": 2.258312020460358, "grad_norm": 0.2826762526500697, "learning_rate": 1.7131684924406392e-05, "loss": 1.0561, "step": 883 }, { "epoch": 2.260869565217391, "grad_norm": 0.40076272547936787, "learning_rate": 1.7122737861226007e-05, "loss": 1.0536, "step": 884 }, { "epoch": 2.2634271099744243, "grad_norm": 0.3893952639906247, "learning_rate": 1.711377920979647e-05, "loss": 1.0717, "step": 885 }, { "epoch": 2.265984654731458, "grad_norm": 0.2701415754560129, "learning_rate": 1.7104808984692946e-05, "loss": 1.0788, "step": 886 }, { "epoch": 2.268542199488491, "grad_norm": 0.3118978955533469, "learning_rate": 1.7095827200509436e-05, "loss": 1.0358, "step": 887 }, { "epoch": 2.2710997442455243, "grad_norm": 0.4681497183113763, "learning_rate": 1.7086833871858735e-05, "loss": 1.0405, "step": 888 }, { "epoch": 2.2736572890025575, "grad_norm": 0.44886562710116457, "learning_rate": 1.707782901337243e-05, "loss": 1.0635, "step": 889 }, { "epoch": 2.2762148337595907, "grad_norm": 0.24326783713209693, "learning_rate": 1.7068812639700862e-05, "loss": 1.0995, "step": 890 }, { "epoch": 2.2787723785166243, "grad_norm": 0.34628521799460377, "learning_rate": 1.7059784765513106e-05, "loss": 1.0772, "step": 891 }, { "epoch": 2.2813299232736575, "grad_norm": 0.3903166631143913, "learning_rate": 1.705074540549695e-05, "loss": 1.0609, "step": 892 }, { "epoch": 2.2838874680306906, "grad_norm": 0.3263912141551758, "learning_rate": 1.704169457435887e-05, "loss": 1.0661, "step": 893 }, { "epoch": 2.286445012787724, "grad_norm": 0.2566336981081094, "learning_rate": 1.7032632286823995e-05, "loss": 1.0853, "step": 894 }, { "epoch": 2.289002557544757, "grad_norm": 0.36154048413903833, "learning_rate": 1.702355855763611e-05, "loss": 1.0723, "step": 895 }, { "epoch": 2.29156010230179, "grad_norm": 0.2971617301340999, "learning_rate": 1.70144734015576e-05, "loss": 1.0619, "step": 896 }, { "epoch": 2.2941176470588234, "grad_norm": 0.2572103383141402, "learning_rate": 1.700537683336944e-05, "loss": 1.0589, "step": 897 }, { "epoch": 2.296675191815857, "grad_norm": 0.37750177979394905, "learning_rate": 1.699626886787119e-05, "loss": 1.0361, "step": 898 }, { "epoch": 2.29923273657289, "grad_norm": 0.35765757522418873, "learning_rate": 1.698714951988093e-05, "loss": 1.071, "step": 899 }, { "epoch": 2.3017902813299234, "grad_norm": 0.30989044748347006, "learning_rate": 1.6978018804235278e-05, "loss": 1.0555, "step": 900 }, { "epoch": 2.3043478260869565, "grad_norm": 0.24476809290635856, "learning_rate": 1.6968876735789326e-05, "loss": 1.0483, "step": 901 }, { "epoch": 2.3069053708439897, "grad_norm": 0.308551372008468, "learning_rate": 1.695972332941666e-05, "loss": 1.0551, "step": 902 }, { "epoch": 2.309462915601023, "grad_norm": 0.37111491476604536, "learning_rate": 1.695055860000929e-05, "loss": 1.0743, "step": 903 }, { "epoch": 2.312020460358056, "grad_norm": 0.29147416337800386, "learning_rate": 1.6941382562477664e-05, "loss": 1.0003, "step": 904 }, { "epoch": 2.3145780051150897, "grad_norm": 0.26326878890729166, "learning_rate": 1.6932195231750616e-05, "loss": 1.0351, "step": 905 }, { "epoch": 2.317135549872123, "grad_norm": 0.29839767577203885, "learning_rate": 1.6922996622775363e-05, "loss": 1.0445, "step": 906 }, { "epoch": 2.319693094629156, "grad_norm": 0.23637128109675618, "learning_rate": 1.691378675051747e-05, "loss": 1.0519, "step": 907 }, { "epoch": 2.3222506393861893, "grad_norm": 0.25442257071130125, "learning_rate": 1.6904565629960814e-05, "loss": 1.0902, "step": 908 }, { "epoch": 2.3248081841432224, "grad_norm": 0.3303656891744051, "learning_rate": 1.6895333276107588e-05, "loss": 1.0265, "step": 909 }, { "epoch": 2.3273657289002556, "grad_norm": 0.2612217404110996, "learning_rate": 1.688608970397825e-05, "loss": 1.1046, "step": 910 }, { "epoch": 2.329923273657289, "grad_norm": 0.271721798226581, "learning_rate": 1.6876834928611524e-05, "loss": 1.0784, "step": 911 }, { "epoch": 2.3324808184143224, "grad_norm": 0.22229862393309946, "learning_rate": 1.6867568965064336e-05, "loss": 1.0364, "step": 912 }, { "epoch": 2.3350383631713556, "grad_norm": 0.23741009658476048, "learning_rate": 1.685829182841184e-05, "loss": 1.0707, "step": 913 }, { "epoch": 2.337595907928389, "grad_norm": 0.28874176637750065, "learning_rate": 1.684900353374735e-05, "loss": 1.0702, "step": 914 }, { "epoch": 2.340153452685422, "grad_norm": 0.30379227509184065, "learning_rate": 1.683970409618235e-05, "loss": 1.0689, "step": 915 }, { "epoch": 2.342710997442455, "grad_norm": 0.2726310509927992, "learning_rate": 1.683039353084644e-05, "loss": 1.0905, "step": 916 }, { "epoch": 2.3452685421994883, "grad_norm": 0.2713331067951481, "learning_rate": 1.6821071852887322e-05, "loss": 1.0317, "step": 917 }, { "epoch": 2.3478260869565215, "grad_norm": 0.3293005148131402, "learning_rate": 1.681173907747079e-05, "loss": 1.0572, "step": 918 }, { "epoch": 2.350383631713555, "grad_norm": 0.2660221814623652, "learning_rate": 1.680239521978068e-05, "loss": 1.0429, "step": 919 }, { "epoch": 2.3529411764705883, "grad_norm": 0.2412158860005583, "learning_rate": 1.679304029501887e-05, "loss": 1.0452, "step": 920 }, { "epoch": 2.3554987212276215, "grad_norm": 0.33605356950268017, "learning_rate": 1.6783674318405233e-05, "loss": 1.0496, "step": 921 }, { "epoch": 2.3580562659846547, "grad_norm": 0.29348949393829404, "learning_rate": 1.677429730517763e-05, "loss": 1.0471, "step": 922 }, { "epoch": 2.360613810741688, "grad_norm": 0.27205789977362044, "learning_rate": 1.6764909270591875e-05, "loss": 1.049, "step": 923 }, { "epoch": 2.363171355498721, "grad_norm": 0.24380065073942686, "learning_rate": 1.6755510229921713e-05, "loss": 1.0568, "step": 924 }, { "epoch": 2.3657289002557547, "grad_norm": 0.2607905003163443, "learning_rate": 1.6746100198458795e-05, "loss": 1.0447, "step": 925 }, { "epoch": 2.368286445012788, "grad_norm": 0.25646849705097663, "learning_rate": 1.673667919151266e-05, "loss": 1.0213, "step": 926 }, { "epoch": 2.370843989769821, "grad_norm": 0.24557852833345492, "learning_rate": 1.6727247224410686e-05, "loss": 1.079, "step": 927 }, { "epoch": 2.373401534526854, "grad_norm": 0.2536896072712956, "learning_rate": 1.67178043124981e-05, "loss": 1.0864, "step": 928 }, { "epoch": 2.3759590792838874, "grad_norm": 0.2921088303385537, "learning_rate": 1.6708350471137927e-05, "loss": 1.0564, "step": 929 }, { "epoch": 2.3785166240409206, "grad_norm": 0.20366681064359315, "learning_rate": 1.669888571571098e-05, "loss": 1.0815, "step": 930 }, { "epoch": 2.381074168797954, "grad_norm": 0.2708885776774786, "learning_rate": 1.6689410061615823e-05, "loss": 1.0453, "step": 931 }, { "epoch": 2.3836317135549874, "grad_norm": 0.26422900568518476, "learning_rate": 1.6679923524268748e-05, "loss": 1.0691, "step": 932 }, { "epoch": 2.3861892583120206, "grad_norm": 0.24062139672551194, "learning_rate": 1.6670426119103762e-05, "loss": 1.0527, "step": 933 }, { "epoch": 2.3887468030690537, "grad_norm": 0.2440568759213169, "learning_rate": 1.666091786157255e-05, "loss": 1.039, "step": 934 }, { "epoch": 2.391304347826087, "grad_norm": 0.24192631220648755, "learning_rate": 1.6651398767144454e-05, "loss": 1.0368, "step": 935 }, { "epoch": 2.39386189258312, "grad_norm": 0.3094662604619502, "learning_rate": 1.664186885130644e-05, "loss": 1.0612, "step": 936 }, { "epoch": 2.3964194373401533, "grad_norm": 0.22698815376801923, "learning_rate": 1.6632328129563088e-05, "loss": 1.0573, "step": 937 }, { "epoch": 2.398976982097187, "grad_norm": 0.25713439762667506, "learning_rate": 1.6622776617436556e-05, "loss": 1.0689, "step": 938 }, { "epoch": 2.40153452685422, "grad_norm": 0.21070288001877646, "learning_rate": 1.6613214330466557e-05, "loss": 1.0514, "step": 939 }, { "epoch": 2.4040920716112533, "grad_norm": 0.2650104302111488, "learning_rate": 1.6603641284210335e-05, "loss": 1.0607, "step": 940 }, { "epoch": 2.4066496163682864, "grad_norm": 0.24280091189228237, "learning_rate": 1.6594057494242634e-05, "loss": 1.0526, "step": 941 }, { "epoch": 2.4092071611253196, "grad_norm": 0.2255724092281544, "learning_rate": 1.6584462976155683e-05, "loss": 1.0584, "step": 942 }, { "epoch": 2.411764705882353, "grad_norm": 0.2704536970024839, "learning_rate": 1.6574857745559168e-05, "loss": 1.0621, "step": 943 }, { "epoch": 2.414322250639386, "grad_norm": 0.29272610932834264, "learning_rate": 1.656524181808019e-05, "loss": 1.0625, "step": 944 }, { "epoch": 2.4168797953964196, "grad_norm": 0.28911787491946217, "learning_rate": 1.655561520936327e-05, "loss": 1.0165, "step": 945 }, { "epoch": 2.419437340153453, "grad_norm": 0.2532789709507061, "learning_rate": 1.6545977935070293e-05, "loss": 1.036, "step": 946 }, { "epoch": 2.421994884910486, "grad_norm": 0.2522741919476773, "learning_rate": 1.6536330010880502e-05, "loss": 1.0879, "step": 947 }, { "epoch": 2.424552429667519, "grad_norm": 0.2902148618078098, "learning_rate": 1.652667145249047e-05, "loss": 1.0447, "step": 948 }, { "epoch": 2.4271099744245523, "grad_norm": 0.2266116217612757, "learning_rate": 1.6517002275614062e-05, "loss": 1.0603, "step": 949 }, { "epoch": 2.4296675191815855, "grad_norm": 0.2855681782290051, "learning_rate": 1.6507322495982433e-05, "loss": 1.0415, "step": 950 }, { "epoch": 2.4322250639386187, "grad_norm": 0.2666978671553076, "learning_rate": 1.6497632129343964e-05, "loss": 1.057, "step": 951 }, { "epoch": 2.4347826086956523, "grad_norm": 0.25398223147396237, "learning_rate": 1.6487931191464293e-05, "loss": 1.0225, "step": 952 }, { "epoch": 2.4373401534526855, "grad_norm": 0.27478774153195795, "learning_rate": 1.647821969812623e-05, "loss": 1.0743, "step": 953 }, { "epoch": 2.4398976982097187, "grad_norm": 0.2548269730970245, "learning_rate": 1.6468497665129767e-05, "loss": 1.0753, "step": 954 }, { "epoch": 2.442455242966752, "grad_norm": 0.2531646552603803, "learning_rate": 1.645876510829205e-05, "loss": 1.0502, "step": 955 }, { "epoch": 2.445012787723785, "grad_norm": 0.2716259730414166, "learning_rate": 1.6449022043447333e-05, "loss": 1.0604, "step": 956 }, { "epoch": 2.4475703324808182, "grad_norm": 0.2759652629992187, "learning_rate": 1.6439268486446982e-05, "loss": 1.0307, "step": 957 }, { "epoch": 2.4501278772378514, "grad_norm": 0.284229730108131, "learning_rate": 1.642950445315941e-05, "loss": 1.0244, "step": 958 }, { "epoch": 2.452685421994885, "grad_norm": 0.2857191939202473, "learning_rate": 1.6419729959470107e-05, "loss": 1.0475, "step": 959 }, { "epoch": 2.455242966751918, "grad_norm": 0.24411876551827455, "learning_rate": 1.6409945021281547e-05, "loss": 1.0205, "step": 960 }, { "epoch": 2.4578005115089514, "grad_norm": 0.2839219346381256, "learning_rate": 1.6400149654513224e-05, "loss": 1.0902, "step": 961 }, { "epoch": 2.4603580562659846, "grad_norm": 0.290894600450773, "learning_rate": 1.6390343875101582e-05, "loss": 1.0655, "step": 962 }, { "epoch": 2.4629156010230178, "grad_norm": 0.25018640254339125, "learning_rate": 1.6380527699000012e-05, "loss": 1.075, "step": 963 }, { "epoch": 2.4654731457800514, "grad_norm": 0.314947984707885, "learning_rate": 1.6370701142178815e-05, "loss": 1.0802, "step": 964 }, { "epoch": 2.4680306905370846, "grad_norm": 0.23513441288297676, "learning_rate": 1.636086422062519e-05, "loss": 1.0315, "step": 965 }, { "epoch": 2.4705882352941178, "grad_norm": 0.26967522371119773, "learning_rate": 1.635101695034319e-05, "loss": 1.0454, "step": 966 }, { "epoch": 2.473145780051151, "grad_norm": 0.2673917447835626, "learning_rate": 1.6341159347353714e-05, "loss": 1.0577, "step": 967 }, { "epoch": 2.475703324808184, "grad_norm": 0.24623838061921519, "learning_rate": 1.633129142769446e-05, "loss": 1.0607, "step": 968 }, { "epoch": 2.4782608695652173, "grad_norm": 0.5975989314807109, "learning_rate": 1.6321413207419915e-05, "loss": 1.0624, "step": 969 }, { "epoch": 2.4808184143222505, "grad_norm": 0.2783985268403012, "learning_rate": 1.6311524702601328e-05, "loss": 1.0277, "step": 970 }, { "epoch": 2.483375959079284, "grad_norm": 0.2948227168148377, "learning_rate": 1.6301625929326682e-05, "loss": 1.0509, "step": 971 }, { "epoch": 2.4859335038363173, "grad_norm": 0.25464495418366273, "learning_rate": 1.6291716903700657e-05, "loss": 1.0743, "step": 972 }, { "epoch": 2.4884910485933505, "grad_norm": 0.32267891042610297, "learning_rate": 1.6281797641844615e-05, "loss": 1.0528, "step": 973 }, { "epoch": 2.4910485933503836, "grad_norm": 0.24461174022768228, "learning_rate": 1.6271868159896583e-05, "loss": 1.0536, "step": 974 }, { "epoch": 2.493606138107417, "grad_norm": 0.3184259095166065, "learning_rate": 1.6261928474011205e-05, "loss": 1.0295, "step": 975 }, { "epoch": 2.49616368286445, "grad_norm": 0.31223168542424856, "learning_rate": 1.6251978600359727e-05, "loss": 1.0611, "step": 976 }, { "epoch": 2.498721227621483, "grad_norm": 0.24470883821957645, "learning_rate": 1.6242018555129968e-05, "loss": 1.0501, "step": 977 }, { "epoch": 2.501278772378517, "grad_norm": 0.263841680832215, "learning_rate": 1.6232048354526305e-05, "loss": 1.0632, "step": 978 }, { "epoch": 2.50383631713555, "grad_norm": 0.2799350053468126, "learning_rate": 1.6222068014769626e-05, "loss": 1.0669, "step": 979 }, { "epoch": 2.506393861892583, "grad_norm": 0.23708656285849256, "learning_rate": 1.6212077552097326e-05, "loss": 1.0242, "step": 980 }, { "epoch": 2.5089514066496164, "grad_norm": 0.32106303705514144, "learning_rate": 1.6202076982763258e-05, "loss": 1.038, "step": 981 }, { "epoch": 2.5115089514066495, "grad_norm": 0.32641459248285415, "learning_rate": 1.6192066323037723e-05, "loss": 1.0192, "step": 982 }, { "epoch": 2.5140664961636827, "grad_norm": 0.2374782294678397, "learning_rate": 1.618204558920744e-05, "loss": 1.0317, "step": 983 }, { "epoch": 2.516624040920716, "grad_norm": 0.2669950742681541, "learning_rate": 1.6172014797575512e-05, "loss": 1.0604, "step": 984 }, { "epoch": 2.5191815856777495, "grad_norm": 0.3289018657957539, "learning_rate": 1.616197396446142e-05, "loss": 1.0558, "step": 985 }, { "epoch": 2.5217391304347827, "grad_norm": 0.30014120894320534, "learning_rate": 1.6151923106200964e-05, "loss": 1.0282, "step": 986 }, { "epoch": 2.524296675191816, "grad_norm": 0.22934126760741957, "learning_rate": 1.6141862239146263e-05, "loss": 1.0442, "step": 987 }, { "epoch": 2.526854219948849, "grad_norm": 0.3082443169061738, "learning_rate": 1.613179137966572e-05, "loss": 1.0671, "step": 988 }, { "epoch": 2.5294117647058822, "grad_norm": 0.34264852115767747, "learning_rate": 1.612171054414399e-05, "loss": 1.0659, "step": 989 }, { "epoch": 2.531969309462916, "grad_norm": 0.28840855857878017, "learning_rate": 1.6111619748981967e-05, "loss": 1.0757, "step": 990 }, { "epoch": 2.5345268542199486, "grad_norm": 0.29679625325903564, "learning_rate": 1.610151901059674e-05, "loss": 1.0574, "step": 991 }, { "epoch": 2.5370843989769822, "grad_norm": 0.2701305485919972, "learning_rate": 1.6091408345421583e-05, "loss": 1.076, "step": 992 }, { "epoch": 2.5396419437340154, "grad_norm": 0.27772319714999755, "learning_rate": 1.6081287769905914e-05, "loss": 1.0557, "step": 993 }, { "epoch": 2.5421994884910486, "grad_norm": 0.2575298835482317, "learning_rate": 1.6071157300515274e-05, "loss": 1.0371, "step": 994 }, { "epoch": 2.544757033248082, "grad_norm": 0.2434229348885953, "learning_rate": 1.6061016953731307e-05, "loss": 1.0293, "step": 995 }, { "epoch": 2.547314578005115, "grad_norm": 0.24931228820010734, "learning_rate": 1.6050866746051722e-05, "loss": 1.0497, "step": 996 }, { "epoch": 2.5498721227621486, "grad_norm": 0.24970615225374868, "learning_rate": 1.6040706693990272e-05, "loss": 1.0507, "step": 997 }, { "epoch": 2.5524296675191813, "grad_norm": 0.2705848075384666, "learning_rate": 1.6030536814076722e-05, "loss": 1.051, "step": 998 }, { "epoch": 2.554987212276215, "grad_norm": 0.2645976951028759, "learning_rate": 1.602035712285684e-05, "loss": 1.044, "step": 999 }, { "epoch": 2.557544757033248, "grad_norm": 0.25280588284501737, "learning_rate": 1.6010167636892338e-05, "loss": 1.0466, "step": 1000 }, { "epoch": 2.5601023017902813, "grad_norm": 0.23309975174376094, "learning_rate": 1.5999968372760882e-05, "loss": 1.0503, "step": 1001 }, { "epoch": 2.5626598465473145, "grad_norm": 0.24003131974818753, "learning_rate": 1.5989759347056028e-05, "loss": 1.0428, "step": 1002 }, { "epoch": 2.5652173913043477, "grad_norm": 0.22803670250684518, "learning_rate": 1.5979540576387226e-05, "loss": 1.067, "step": 1003 }, { "epoch": 2.5677749360613813, "grad_norm": 0.23366692767216873, "learning_rate": 1.596931207737978e-05, "loss": 1.0735, "step": 1004 }, { "epoch": 2.5703324808184145, "grad_norm": 0.2514628572179653, "learning_rate": 1.5959073866674812e-05, "loss": 1.0683, "step": 1005 }, { "epoch": 2.5728900255754477, "grad_norm": 0.2647695835957155, "learning_rate": 1.594882596092926e-05, "loss": 1.006, "step": 1006 }, { "epoch": 2.575447570332481, "grad_norm": 0.2705206567562451, "learning_rate": 1.5938568376815816e-05, "loss": 1.0815, "step": 1007 }, { "epoch": 2.578005115089514, "grad_norm": 0.26218100830771535, "learning_rate": 1.5928301131022933e-05, "loss": 1.0712, "step": 1008 }, { "epoch": 2.580562659846547, "grad_norm": 0.24704018764157912, "learning_rate": 1.5918024240254778e-05, "loss": 1.069, "step": 1009 }, { "epoch": 2.5831202046035804, "grad_norm": 0.3099818232532923, "learning_rate": 1.5907737721231205e-05, "loss": 1.0485, "step": 1010 }, { "epoch": 2.585677749360614, "grad_norm": 0.2976698121714401, "learning_rate": 1.5897441590687747e-05, "loss": 1.0577, "step": 1011 }, { "epoch": 2.588235294117647, "grad_norm": 0.25285713641828206, "learning_rate": 1.5887135865375552e-05, "loss": 1.0603, "step": 1012 }, { "epoch": 2.5907928388746804, "grad_norm": 0.2526446484384057, "learning_rate": 1.5876820562061402e-05, "loss": 1.0433, "step": 1013 }, { "epoch": 2.5933503836317136, "grad_norm": 0.29067294932967996, "learning_rate": 1.586649569752765e-05, "loss": 1.0616, "step": 1014 }, { "epoch": 2.5959079283887467, "grad_norm": 0.282910218177146, "learning_rate": 1.5856161288572195e-05, "loss": 1.0413, "step": 1015 }, { "epoch": 2.59846547314578, "grad_norm": 0.2268843181296163, "learning_rate": 1.5845817352008485e-05, "loss": 1.0407, "step": 1016 }, { "epoch": 2.601023017902813, "grad_norm": 0.22762472803069236, "learning_rate": 1.583546390466545e-05, "loss": 1.0536, "step": 1017 }, { "epoch": 2.6035805626598467, "grad_norm": 0.23603794648210832, "learning_rate": 1.58251009633875e-05, "loss": 1.0571, "step": 1018 }, { "epoch": 2.60613810741688, "grad_norm": 0.2676423332930833, "learning_rate": 1.5814728545034503e-05, "loss": 1.0297, "step": 1019 }, { "epoch": 2.608695652173913, "grad_norm": 0.25371119273646303, "learning_rate": 1.5804346666481728e-05, "loss": 1.037, "step": 1020 }, { "epoch": 2.6112531969309463, "grad_norm": 0.23765073500378178, "learning_rate": 1.5793955344619846e-05, "loss": 1.0493, "step": 1021 }, { "epoch": 2.6138107416879794, "grad_norm": 0.28479895070770733, "learning_rate": 1.5783554596354885e-05, "loss": 1.0428, "step": 1022 }, { "epoch": 2.6163682864450126, "grad_norm": 0.2610596840924324, "learning_rate": 1.577314443860821e-05, "loss": 1.0659, "step": 1023 }, { "epoch": 2.618925831202046, "grad_norm": 0.24670717715351206, "learning_rate": 1.57627248883165e-05, "loss": 1.0434, "step": 1024 }, { "epoch": 2.6214833759590794, "grad_norm": 0.22640840073229135, "learning_rate": 1.575229596243171e-05, "loss": 1.043, "step": 1025 }, { "epoch": 2.6240409207161126, "grad_norm": 0.25314200985521523, "learning_rate": 1.574185767792106e-05, "loss": 1.0494, "step": 1026 }, { "epoch": 2.626598465473146, "grad_norm": 0.21470094174624627, "learning_rate": 1.573141005176697e-05, "loss": 1.0568, "step": 1027 }, { "epoch": 2.629156010230179, "grad_norm": 0.23151889692704267, "learning_rate": 1.5720953100967085e-05, "loss": 1.0648, "step": 1028 }, { "epoch": 2.631713554987212, "grad_norm": 0.21397184877158426, "learning_rate": 1.5710486842534206e-05, "loss": 1.0663, "step": 1029 }, { "epoch": 2.634271099744246, "grad_norm": 0.22192997813660584, "learning_rate": 1.5700011293496285e-05, "loss": 1.0534, "step": 1030 }, { "epoch": 2.6368286445012785, "grad_norm": 0.21407356154899657, "learning_rate": 1.568952647089638e-05, "loss": 1.059, "step": 1031 }, { "epoch": 2.639386189258312, "grad_norm": 0.21832618515669033, "learning_rate": 1.5679032391792648e-05, "loss": 1.0221, "step": 1032 }, { "epoch": 2.6419437340153453, "grad_norm": 0.24431871394272658, "learning_rate": 1.5668529073258298e-05, "loss": 1.0858, "step": 1033 }, { "epoch": 2.6445012787723785, "grad_norm": 0.31234951434869057, "learning_rate": 1.5658016532381565e-05, "loss": 1.06, "step": 1034 }, { "epoch": 2.6470588235294117, "grad_norm": 0.2080542192295102, "learning_rate": 1.5647494786265705e-05, "loss": 1.0651, "step": 1035 }, { "epoch": 2.649616368286445, "grad_norm": 0.24670278561413833, "learning_rate": 1.5636963852028936e-05, "loss": 1.0373, "step": 1036 }, { "epoch": 2.6521739130434785, "grad_norm": 0.23750220801463004, "learning_rate": 1.5626423746804433e-05, "loss": 1.0426, "step": 1037 }, { "epoch": 2.6547314578005117, "grad_norm": 0.24041568140574793, "learning_rate": 1.5615874487740287e-05, "loss": 1.0504, "step": 1038 }, { "epoch": 2.657289002557545, "grad_norm": 0.2389633958150457, "learning_rate": 1.560531609199948e-05, "loss": 1.0572, "step": 1039 }, { "epoch": 2.659846547314578, "grad_norm": 0.2770548151196396, "learning_rate": 1.559474857675986e-05, "loss": 1.068, "step": 1040 }, { "epoch": 2.662404092071611, "grad_norm": 0.266725154908083, "learning_rate": 1.5584171959214126e-05, "loss": 1.0449, "step": 1041 }, { "epoch": 2.6649616368286444, "grad_norm": 0.25482885945652345, "learning_rate": 1.557358625656976e-05, "loss": 1.0784, "step": 1042 }, { "epoch": 2.6675191815856776, "grad_norm": 0.264472394184579, "learning_rate": 1.5562991486049045e-05, "loss": 1.0118, "step": 1043 }, { "epoch": 2.670076726342711, "grad_norm": 0.2848797989882817, "learning_rate": 1.555238766488901e-05, "loss": 1.0555, "step": 1044 }, { "epoch": 2.6726342710997444, "grad_norm": 0.24695033243914596, "learning_rate": 1.5541774810341404e-05, "loss": 1.0402, "step": 1045 }, { "epoch": 2.6751918158567776, "grad_norm": 0.20315866222350132, "learning_rate": 1.5531152939672683e-05, "loss": 1.0251, "step": 1046 }, { "epoch": 2.6777493606138107, "grad_norm": 0.2608581931242649, "learning_rate": 1.5520522070163962e-05, "loss": 1.0549, "step": 1047 }, { "epoch": 2.680306905370844, "grad_norm": 0.3085807293166213, "learning_rate": 1.550988221911101e-05, "loss": 1.0586, "step": 1048 }, { "epoch": 2.682864450127877, "grad_norm": 0.22686082652143869, "learning_rate": 1.549923340382419e-05, "loss": 1.0315, "step": 1049 }, { "epoch": 2.6854219948849103, "grad_norm": 0.23840859030860576, "learning_rate": 1.548857564162846e-05, "loss": 1.0542, "step": 1050 }, { "epoch": 2.687979539641944, "grad_norm": 0.2828144148836396, "learning_rate": 1.5477908949863335e-05, "loss": 1.0546, "step": 1051 }, { "epoch": 2.690537084398977, "grad_norm": 0.24462451577997144, "learning_rate": 1.5467233345882858e-05, "loss": 1.05, "step": 1052 }, { "epoch": 2.6930946291560103, "grad_norm": 0.2608389325913873, "learning_rate": 1.5456548847055565e-05, "loss": 1.0582, "step": 1053 }, { "epoch": 2.6956521739130435, "grad_norm": 0.2341653521141245, "learning_rate": 1.5445855470764467e-05, "loss": 1.0227, "step": 1054 }, { "epoch": 2.6982097186700766, "grad_norm": 0.2001748409496552, "learning_rate": 1.5435153234407023e-05, "loss": 1.0361, "step": 1055 }, { "epoch": 2.70076726342711, "grad_norm": 0.24778418959062198, "learning_rate": 1.5424442155395095e-05, "loss": 1.0556, "step": 1056 }, { "epoch": 2.703324808184143, "grad_norm": 0.23891064433631373, "learning_rate": 1.5413722251154947e-05, "loss": 1.0583, "step": 1057 }, { "epoch": 2.7058823529411766, "grad_norm": 0.18730639273619554, "learning_rate": 1.540299353912719e-05, "loss": 1.0461, "step": 1058 }, { "epoch": 2.70843989769821, "grad_norm": 0.22764007423409213, "learning_rate": 1.5392256036766767e-05, "loss": 1.0723, "step": 1059 }, { "epoch": 2.710997442455243, "grad_norm": 0.2161337514937876, "learning_rate": 1.5381509761542925e-05, "loss": 1.0303, "step": 1060 }, { "epoch": 2.713554987212276, "grad_norm": 0.23665490844389125, "learning_rate": 1.537075473093918e-05, "loss": 1.072, "step": 1061 }, { "epoch": 2.7161125319693094, "grad_norm": 0.2171745194472315, "learning_rate": 1.535999096245329e-05, "loss": 1.0609, "step": 1062 }, { "epoch": 2.718670076726343, "grad_norm": 0.27479490086390757, "learning_rate": 1.5349218473597244e-05, "loss": 1.0976, "step": 1063 }, { "epoch": 2.7212276214833757, "grad_norm": 0.23802159891837593, "learning_rate": 1.5338437281897196e-05, "loss": 1.0561, "step": 1064 }, { "epoch": 2.7237851662404093, "grad_norm": 0.23413108216980624, "learning_rate": 1.532764740489348e-05, "loss": 1.0249, "step": 1065 }, { "epoch": 2.7263427109974425, "grad_norm": 0.23839123328370654, "learning_rate": 1.5316848860140545e-05, "loss": 1.0448, "step": 1066 }, { "epoch": 2.7289002557544757, "grad_norm": 0.26889749126936374, "learning_rate": 1.530604166520695e-05, "loss": 1.0538, "step": 1067 }, { "epoch": 2.731457800511509, "grad_norm": 0.23104275616772496, "learning_rate": 1.529522583767533e-05, "loss": 1.0709, "step": 1068 }, { "epoch": 2.734015345268542, "grad_norm": 0.26947945752974595, "learning_rate": 1.5284401395142356e-05, "loss": 1.0476, "step": 1069 }, { "epoch": 2.7365728900255757, "grad_norm": 0.2650970504236315, "learning_rate": 1.5273568355218714e-05, "loss": 1.0906, "step": 1070 }, { "epoch": 2.7391304347826084, "grad_norm": 0.2426600100365933, "learning_rate": 1.5262726735529096e-05, "loss": 1.0421, "step": 1071 }, { "epoch": 2.741687979539642, "grad_norm": 0.2565653498953779, "learning_rate": 1.5251876553712129e-05, "loss": 1.0714, "step": 1072 }, { "epoch": 2.7442455242966752, "grad_norm": 0.2590844357725753, "learning_rate": 1.5241017827420379e-05, "loss": 1.0529, "step": 1073 }, { "epoch": 2.7468030690537084, "grad_norm": 0.2661157616076656, "learning_rate": 1.523015057432032e-05, "loss": 1.0413, "step": 1074 }, { "epoch": 2.7493606138107416, "grad_norm": 0.2316877382855349, "learning_rate": 1.5219274812092297e-05, "loss": 1.0965, "step": 1075 }, { "epoch": 2.7519181585677748, "grad_norm": 0.281689753856549, "learning_rate": 1.5208390558430486e-05, "loss": 1.0506, "step": 1076 }, { "epoch": 2.7544757033248084, "grad_norm": 0.25889609476509934, "learning_rate": 1.5197497831042891e-05, "loss": 1.0701, "step": 1077 }, { "epoch": 2.7570332480818416, "grad_norm": 0.25370938447354224, "learning_rate": 1.5186596647651299e-05, "loss": 1.0344, "step": 1078 }, { "epoch": 2.7595907928388748, "grad_norm": 0.21590996086487077, "learning_rate": 1.5175687025991254e-05, "loss": 1.0111, "step": 1079 }, { "epoch": 2.762148337595908, "grad_norm": 0.25136209115240976, "learning_rate": 1.5164768983812031e-05, "loss": 1.0594, "step": 1080 }, { "epoch": 2.764705882352941, "grad_norm": 0.2296309073317973, "learning_rate": 1.5153842538876595e-05, "loss": 1.0195, "step": 1081 }, { "epoch": 2.7672634271099743, "grad_norm": 0.2188880236827278, "learning_rate": 1.5142907708961594e-05, "loss": 1.0563, "step": 1082 }, { "epoch": 2.7698209718670075, "grad_norm": 0.29043124524993463, "learning_rate": 1.5131964511857307e-05, "loss": 1.0579, "step": 1083 }, { "epoch": 2.772378516624041, "grad_norm": 0.23042976434473456, "learning_rate": 1.512101296536764e-05, "loss": 1.0594, "step": 1084 }, { "epoch": 2.7749360613810743, "grad_norm": 0.3064542379695439, "learning_rate": 1.5110053087310067e-05, "loss": 1.0347, "step": 1085 }, { "epoch": 2.7774936061381075, "grad_norm": 0.2990911954190306, "learning_rate": 1.5099084895515633e-05, "loss": 1.0872, "step": 1086 }, { "epoch": 2.7800511508951407, "grad_norm": 0.30238830537129957, "learning_rate": 1.5088108407828887e-05, "loss": 1.0102, "step": 1087 }, { "epoch": 2.782608695652174, "grad_norm": 0.22800852447745912, "learning_rate": 1.5077123642107901e-05, "loss": 1.0373, "step": 1088 }, { "epoch": 2.785166240409207, "grad_norm": 0.26466118290058793, "learning_rate": 1.5066130616224194e-05, "loss": 1.0601, "step": 1089 }, { "epoch": 2.78772378516624, "grad_norm": 0.3134236905423725, "learning_rate": 1.5055129348062733e-05, "loss": 1.0282, "step": 1090 }, { "epoch": 2.790281329923274, "grad_norm": 0.30040919493276264, "learning_rate": 1.5044119855521899e-05, "loss": 1.0028, "step": 1091 }, { "epoch": 2.792838874680307, "grad_norm": 0.3018437088485077, "learning_rate": 1.5033102156513442e-05, "loss": 1.0642, "step": 1092 }, { "epoch": 2.79539641943734, "grad_norm": 0.2594288455529522, "learning_rate": 1.5022076268962474e-05, "loss": 1.0651, "step": 1093 }, { "epoch": 2.7979539641943734, "grad_norm": 0.2427672329241251, "learning_rate": 1.5011042210807416e-05, "loss": 1.0499, "step": 1094 }, { "epoch": 2.8005115089514065, "grad_norm": 0.2753688016374087, "learning_rate": 1.5000000000000002e-05, "loss": 1.0441, "step": 1095 }, { "epoch": 2.80306905370844, "grad_norm": 0.333646004575826, "learning_rate": 1.4988949654505212e-05, "loss": 1.0954, "step": 1096 }, { "epoch": 2.805626598465473, "grad_norm": 0.24884374092942535, "learning_rate": 1.4977891192301266e-05, "loss": 1.0616, "step": 1097 }, { "epoch": 2.8081841432225065, "grad_norm": 0.25576802318021363, "learning_rate": 1.4966824631379595e-05, "loss": 1.0767, "step": 1098 }, { "epoch": 2.8107416879795397, "grad_norm": 0.2726811004318987, "learning_rate": 1.49557499897448e-05, "loss": 1.0629, "step": 1099 }, { "epoch": 2.813299232736573, "grad_norm": 0.2490020562964201, "learning_rate": 1.4944667285414629e-05, "loss": 1.0401, "step": 1100 }, { "epoch": 2.815856777493606, "grad_norm": 0.230153454763048, "learning_rate": 1.4933576536419951e-05, "loss": 1.0681, "step": 1101 }, { "epoch": 2.8184143222506393, "grad_norm": 0.29290021173573333, "learning_rate": 1.492247776080472e-05, "loss": 1.0478, "step": 1102 }, { "epoch": 2.820971867007673, "grad_norm": 0.22373455728798555, "learning_rate": 1.4911370976625951e-05, "loss": 1.0646, "step": 1103 }, { "epoch": 2.8235294117647056, "grad_norm": 0.2867670697761132, "learning_rate": 1.4900256201953686e-05, "loss": 1.0395, "step": 1104 }, { "epoch": 2.8260869565217392, "grad_norm": 0.2580511336465639, "learning_rate": 1.488913345487097e-05, "loss": 1.0299, "step": 1105 }, { "epoch": 2.8286445012787724, "grad_norm": 0.30823901300584283, "learning_rate": 1.4878002753473814e-05, "loss": 1.0588, "step": 1106 }, { "epoch": 2.8312020460358056, "grad_norm": 0.26061529857491966, "learning_rate": 1.486686411587118e-05, "loss": 1.0544, "step": 1107 }, { "epoch": 2.833759590792839, "grad_norm": 0.3411340236384177, "learning_rate": 1.4855717560184925e-05, "loss": 1.0673, "step": 1108 }, { "epoch": 2.836317135549872, "grad_norm": 0.3112034427743734, "learning_rate": 1.4844563104549808e-05, "loss": 1.0702, "step": 1109 }, { "epoch": 2.8388746803069056, "grad_norm": 0.26159448325094614, "learning_rate": 1.4833400767113425e-05, "loss": 1.0518, "step": 1110 }, { "epoch": 2.8414322250639388, "grad_norm": 0.24843885045239295, "learning_rate": 1.48222305660362e-05, "loss": 1.0519, "step": 1111 }, { "epoch": 2.843989769820972, "grad_norm": 0.34052436576940476, "learning_rate": 1.4811052519491358e-05, "loss": 1.0621, "step": 1112 }, { "epoch": 2.846547314578005, "grad_norm": 0.25035667041534276, "learning_rate": 1.4799866645664875e-05, "loss": 1.0495, "step": 1113 }, { "epoch": 2.8491048593350383, "grad_norm": 0.23950107492766087, "learning_rate": 1.4788672962755474e-05, "loss": 1.0474, "step": 1114 }, { "epoch": 2.8516624040920715, "grad_norm": 0.2228748439468561, "learning_rate": 1.4777471488974573e-05, "loss": 1.056, "step": 1115 }, { "epoch": 2.8542199488491047, "grad_norm": 0.21686894636285, "learning_rate": 1.476626224254627e-05, "loss": 1.0473, "step": 1116 }, { "epoch": 2.8567774936061383, "grad_norm": 0.21336673271033718, "learning_rate": 1.475504524170731e-05, "loss": 1.0327, "step": 1117 }, { "epoch": 2.8593350383631715, "grad_norm": 0.2412247096897979, "learning_rate": 1.4743820504707054e-05, "loss": 1.0603, "step": 1118 }, { "epoch": 2.8618925831202047, "grad_norm": 0.20338495510222906, "learning_rate": 1.4732588049807442e-05, "loss": 1.0345, "step": 1119 }, { "epoch": 2.864450127877238, "grad_norm": 0.2224056939046196, "learning_rate": 1.4721347895282977e-05, "loss": 1.0932, "step": 1120 }, { "epoch": 2.867007672634271, "grad_norm": 0.21219190570803861, "learning_rate": 1.4710100059420693e-05, "loss": 1.0577, "step": 1121 }, { "epoch": 2.869565217391304, "grad_norm": 0.23417177032958478, "learning_rate": 1.4698844560520107e-05, "loss": 1.04, "step": 1122 }, { "epoch": 2.8721227621483374, "grad_norm": 0.21756710346483277, "learning_rate": 1.4687581416893218e-05, "loss": 1.0115, "step": 1123 }, { "epoch": 2.874680306905371, "grad_norm": 0.27116811809019226, "learning_rate": 1.4676310646864455e-05, "loss": 1.0925, "step": 1124 }, { "epoch": 2.877237851662404, "grad_norm": 0.20359779513752466, "learning_rate": 1.4665032268770656e-05, "loss": 1.0662, "step": 1125 }, { "epoch": 2.8797953964194374, "grad_norm": 0.25086860996163834, "learning_rate": 1.4653746300961037e-05, "loss": 1.0615, "step": 1126 }, { "epoch": 2.8823529411764706, "grad_norm": 0.21619154701357268, "learning_rate": 1.4642452761797166e-05, "loss": 1.028, "step": 1127 }, { "epoch": 2.8849104859335037, "grad_norm": 0.23657771626030477, "learning_rate": 1.4631151669652917e-05, "loss": 1.0339, "step": 1128 }, { "epoch": 2.887468030690537, "grad_norm": 0.25435410320469787, "learning_rate": 1.4619843042914466e-05, "loss": 1.0382, "step": 1129 }, { "epoch": 2.89002557544757, "grad_norm": 0.3165858987447032, "learning_rate": 1.4608526899980238e-05, "loss": 1.0631, "step": 1130 }, { "epoch": 2.8925831202046037, "grad_norm": 0.3059530735276844, "learning_rate": 1.4597203259260893e-05, "loss": 1.0742, "step": 1131 }, { "epoch": 2.895140664961637, "grad_norm": 0.23231123365328338, "learning_rate": 1.4585872139179284e-05, "loss": 1.0108, "step": 1132 }, { "epoch": 2.89769820971867, "grad_norm": 0.32159788413714113, "learning_rate": 1.457453355817044e-05, "loss": 1.0343, "step": 1133 }, { "epoch": 2.9002557544757033, "grad_norm": 0.2624561212579556, "learning_rate": 1.456318753468152e-05, "loss": 1.0344, "step": 1134 }, { "epoch": 2.9028132992327365, "grad_norm": 0.21340797781295, "learning_rate": 1.455183408717179e-05, "loss": 1.0582, "step": 1135 }, { "epoch": 2.90537084398977, "grad_norm": 0.27498982896150626, "learning_rate": 1.4540473234112607e-05, "loss": 1.0319, "step": 1136 }, { "epoch": 2.907928388746803, "grad_norm": 0.26787413886350847, "learning_rate": 1.4529104993987364e-05, "loss": 1.094, "step": 1137 }, { "epoch": 2.9104859335038364, "grad_norm": 0.22411507204789752, "learning_rate": 1.4517729385291479e-05, "loss": 1.0289, "step": 1138 }, { "epoch": 2.9130434782608696, "grad_norm": 0.3186727715150146, "learning_rate": 1.4506346426532356e-05, "loss": 1.0474, "step": 1139 }, { "epoch": 2.915601023017903, "grad_norm": 0.23017658335190225, "learning_rate": 1.4494956136229356e-05, "loss": 1.0406, "step": 1140 }, { "epoch": 2.918158567774936, "grad_norm": 0.2469732487522561, "learning_rate": 1.448355853291377e-05, "loss": 1.0545, "step": 1141 }, { "epoch": 2.920716112531969, "grad_norm": 0.34257461951959434, "learning_rate": 1.4472153635128787e-05, "loss": 1.0649, "step": 1142 }, { "epoch": 2.923273657289003, "grad_norm": 0.26582238607210484, "learning_rate": 1.4460741461429457e-05, "loss": 1.0643, "step": 1143 }, { "epoch": 2.9258312020460355, "grad_norm": 0.238713886041743, "learning_rate": 1.4449322030382681e-05, "loss": 1.0375, "step": 1144 }, { "epoch": 2.928388746803069, "grad_norm": 0.28544164960503227, "learning_rate": 1.4437895360567156e-05, "loss": 1.0459, "step": 1145 }, { "epoch": 2.9309462915601023, "grad_norm": 0.30617216188801405, "learning_rate": 1.4426461470573358e-05, "loss": 1.0352, "step": 1146 }, { "epoch": 2.9335038363171355, "grad_norm": 0.23250706835607923, "learning_rate": 1.4415020379003513e-05, "loss": 1.0547, "step": 1147 }, { "epoch": 2.9360613810741687, "grad_norm": 0.23449213816934886, "learning_rate": 1.4403572104471559e-05, "loss": 1.0506, "step": 1148 }, { "epoch": 2.938618925831202, "grad_norm": 0.26285727807721854, "learning_rate": 1.4392116665603123e-05, "loss": 1.067, "step": 1149 }, { "epoch": 2.9411764705882355, "grad_norm": 0.25864228967500363, "learning_rate": 1.4380654081035492e-05, "loss": 1.0566, "step": 1150 }, { "epoch": 2.9437340153452687, "grad_norm": 0.2197313587417355, "learning_rate": 1.4369184369417573e-05, "loss": 1.069, "step": 1151 }, { "epoch": 2.946291560102302, "grad_norm": 0.22175625255078796, "learning_rate": 1.4357707549409865e-05, "loss": 1.0393, "step": 1152 }, { "epoch": 2.948849104859335, "grad_norm": 0.20734806987916835, "learning_rate": 1.4346223639684445e-05, "loss": 1.0629, "step": 1153 }, { "epoch": 2.9514066496163682, "grad_norm": 0.20844980678105798, "learning_rate": 1.4334732658924906e-05, "loss": 1.0683, "step": 1154 }, { "epoch": 2.9539641943734014, "grad_norm": 0.1986457605182691, "learning_rate": 1.4323234625826363e-05, "loss": 1.082, "step": 1155 }, { "epoch": 2.9565217391304346, "grad_norm": 0.2173974733436024, "learning_rate": 1.4311729559095391e-05, "loss": 1.0579, "step": 1156 }, { "epoch": 2.959079283887468, "grad_norm": 0.23569051033252647, "learning_rate": 1.430021747745002e-05, "loss": 1.0501, "step": 1157 }, { "epoch": 2.9616368286445014, "grad_norm": 0.1958953354096487, "learning_rate": 1.4288698399619682e-05, "loss": 1.0423, "step": 1158 }, { "epoch": 2.9641943734015346, "grad_norm": 0.24550680925330018, "learning_rate": 1.4277172344345203e-05, "loss": 1.0429, "step": 1159 }, { "epoch": 2.9667519181585678, "grad_norm": 0.22335624269922177, "learning_rate": 1.4265639330378751e-05, "loss": 1.0637, "step": 1160 }, { "epoch": 2.969309462915601, "grad_norm": 0.19207777433952558, "learning_rate": 1.4254099376483814e-05, "loss": 1.032, "step": 1161 }, { "epoch": 2.971867007672634, "grad_norm": 0.21933228277599973, "learning_rate": 1.424255250143518e-05, "loss": 1.0399, "step": 1162 }, { "epoch": 2.9744245524296673, "grad_norm": 0.2042696972237095, "learning_rate": 1.423099872401889e-05, "loss": 1.082, "step": 1163 }, { "epoch": 2.976982097186701, "grad_norm": 0.23521017440946976, "learning_rate": 1.4219438063032223e-05, "loss": 1.0337, "step": 1164 }, { "epoch": 2.979539641943734, "grad_norm": 0.23773407464153606, "learning_rate": 1.4207870537283645e-05, "loss": 1.0464, "step": 1165 }, { "epoch": 2.9820971867007673, "grad_norm": 0.19999456866670134, "learning_rate": 1.4196296165592804e-05, "loss": 1.0738, "step": 1166 }, { "epoch": 2.9846547314578005, "grad_norm": 0.24196149952728568, "learning_rate": 1.4184714966790472e-05, "loss": 1.0515, "step": 1167 }, { "epoch": 2.9872122762148337, "grad_norm": 0.2078635385282362, "learning_rate": 1.4173126959718542e-05, "loss": 1.0685, "step": 1168 }, { "epoch": 2.9897698209718673, "grad_norm": 0.22519888128468324, "learning_rate": 1.416153216322997e-05, "loss": 1.0406, "step": 1169 }, { "epoch": 2.9923273657289, "grad_norm": 0.23526057180385235, "learning_rate": 1.4149930596188768e-05, "loss": 1.0388, "step": 1170 }, { "epoch": 2.9948849104859336, "grad_norm": 0.23228687433861023, "learning_rate": 1.4138322277469962e-05, "loss": 1.035, "step": 1171 }, { "epoch": 2.997442455242967, "grad_norm": 0.23799687340205392, "learning_rate": 1.412670722595956e-05, "loss": 1.0798, "step": 1172 }, { "epoch": 3.0, "grad_norm": 0.22605319189413042, "learning_rate": 1.4115085460554524e-05, "loss": 1.0724, "step": 1173 }, { "epoch": 3.002557544757033, "grad_norm": 0.22583372556086656, "learning_rate": 1.410345700016274e-05, "loss": 1.0653, "step": 1174 }, { "epoch": 3.0051150895140664, "grad_norm": 0.20810235633737204, "learning_rate": 1.4091821863702983e-05, "loss": 1.0641, "step": 1175 }, { "epoch": 3.0076726342710995, "grad_norm": 0.20645828983892262, "learning_rate": 1.4080180070104897e-05, "loss": 1.0426, "step": 1176 }, { "epoch": 3.010230179028133, "grad_norm": 0.20345366792505884, "learning_rate": 1.406853163830895e-05, "loss": 1.0849, "step": 1177 }, { "epoch": 3.0127877237851663, "grad_norm": 0.21212291453565033, "learning_rate": 1.4056876587266413e-05, "loss": 1.0687, "step": 1178 }, { "epoch": 3.0153452685421995, "grad_norm": 0.19908450369242628, "learning_rate": 1.4045214935939323e-05, "loss": 1.0193, "step": 1179 }, { "epoch": 3.0179028132992327, "grad_norm": 0.22127953869549283, "learning_rate": 1.4033546703300465e-05, "loss": 1.027, "step": 1180 }, { "epoch": 3.020460358056266, "grad_norm": 0.2284795334598278, "learning_rate": 1.402187190833331e-05, "loss": 1.041, "step": 1181 }, { "epoch": 3.023017902813299, "grad_norm": 0.2062329065326131, "learning_rate": 1.4010190570032034e-05, "loss": 1.0371, "step": 1182 }, { "epoch": 3.0255754475703327, "grad_norm": 0.19478100964489237, "learning_rate": 1.3998502707401437e-05, "loss": 1.0578, "step": 1183 }, { "epoch": 3.028132992327366, "grad_norm": 0.22168971452412287, "learning_rate": 1.398680833945694e-05, "loss": 1.023, "step": 1184 }, { "epoch": 3.030690537084399, "grad_norm": 0.2040809628837293, "learning_rate": 1.3975107485224552e-05, "loss": 1.0382, "step": 1185 }, { "epoch": 3.0332480818414322, "grad_norm": 0.2051983553640489, "learning_rate": 1.3963400163740828e-05, "loss": 1.0186, "step": 1186 }, { "epoch": 3.0358056265984654, "grad_norm": 0.2350671015231016, "learning_rate": 1.395168639405285e-05, "loss": 1.0455, "step": 1187 }, { "epoch": 3.0383631713554986, "grad_norm": 0.22621448501355076, "learning_rate": 1.3939966195218188e-05, "loss": 1.0074, "step": 1188 }, { "epoch": 3.040920716112532, "grad_norm": 0.23737640534776971, "learning_rate": 1.3928239586304873e-05, "loss": 1.0437, "step": 1189 }, { "epoch": 3.0434782608695654, "grad_norm": 0.2323257168547048, "learning_rate": 1.3916506586391364e-05, "loss": 1.0327, "step": 1190 }, { "epoch": 3.0460358056265986, "grad_norm": 0.22305161499533654, "learning_rate": 1.390476721456652e-05, "loss": 1.0099, "step": 1191 }, { "epoch": 3.0485933503836318, "grad_norm": 0.23535858097990897, "learning_rate": 1.3893021489929564e-05, "loss": 1.051, "step": 1192 }, { "epoch": 3.051150895140665, "grad_norm": 0.20326385048979087, "learning_rate": 1.3881269431590052e-05, "loss": 1.057, "step": 1193 }, { "epoch": 3.053708439897698, "grad_norm": 0.21150204467244554, "learning_rate": 1.3869511058667855e-05, "loss": 1.0296, "step": 1194 }, { "epoch": 3.0562659846547313, "grad_norm": 0.2227085234968232, "learning_rate": 1.3857746390293106e-05, "loss": 1.0342, "step": 1195 }, { "epoch": 3.0588235294117645, "grad_norm": 0.24189335016155378, "learning_rate": 1.3845975445606184e-05, "loss": 1.0491, "step": 1196 }, { "epoch": 3.061381074168798, "grad_norm": 0.21700679291777608, "learning_rate": 1.383419824375768e-05, "loss": 1.0458, "step": 1197 }, { "epoch": 3.0639386189258313, "grad_norm": 0.2325789506958363, "learning_rate": 1.382241480390837e-05, "loss": 1.0451, "step": 1198 }, { "epoch": 3.0664961636828645, "grad_norm": 0.21783084381710976, "learning_rate": 1.3810625145229174e-05, "loss": 1.0621, "step": 1199 }, { "epoch": 3.0690537084398977, "grad_norm": 0.259978348441225, "learning_rate": 1.3798829286901122e-05, "loss": 1.0216, "step": 1200 }, { "epoch": 3.071611253196931, "grad_norm": 0.2531231166315013, "learning_rate": 1.3787027248115341e-05, "loss": 1.0344, "step": 1201 }, { "epoch": 3.074168797953964, "grad_norm": 0.25693037958499804, "learning_rate": 1.3775219048073011e-05, "loss": 1.0571, "step": 1202 }, { "epoch": 3.0767263427109977, "grad_norm": 0.22329447917802453, "learning_rate": 1.376340470598534e-05, "loss": 1.0621, "step": 1203 }, { "epoch": 3.079283887468031, "grad_norm": 0.24363305905238922, "learning_rate": 1.3751584241073517e-05, "loss": 1.0627, "step": 1204 }, { "epoch": 3.081841432225064, "grad_norm": 0.252245006887946, "learning_rate": 1.3739757672568703e-05, "loss": 1.0619, "step": 1205 }, { "epoch": 3.084398976982097, "grad_norm": 0.24187527332738293, "learning_rate": 1.3727925019711981e-05, "loss": 1.0324, "step": 1206 }, { "epoch": 3.0869565217391304, "grad_norm": 0.2140650570738505, "learning_rate": 1.3716086301754343e-05, "loss": 1.0538, "step": 1207 }, { "epoch": 3.0895140664961636, "grad_norm": 0.26828049735013604, "learning_rate": 1.3704241537956643e-05, "loss": 1.0806, "step": 1208 }, { "epoch": 3.0920716112531967, "grad_norm": 0.20662196910585112, "learning_rate": 1.3692390747589564e-05, "loss": 1.0272, "step": 1209 }, { "epoch": 3.0946291560102304, "grad_norm": 0.23564415225665816, "learning_rate": 1.3680533949933607e-05, "loss": 1.0499, "step": 1210 }, { "epoch": 3.0971867007672635, "grad_norm": 0.20991526952221617, "learning_rate": 1.3668671164279039e-05, "loss": 1.0514, "step": 1211 }, { "epoch": 3.0997442455242967, "grad_norm": 0.22870151484413298, "learning_rate": 1.3656802409925874e-05, "loss": 1.0134, "step": 1212 }, { "epoch": 3.10230179028133, "grad_norm": 0.21877781759998727, "learning_rate": 1.3644927706183824e-05, "loss": 1.0851, "step": 1213 }, { "epoch": 3.104859335038363, "grad_norm": 0.2327125173805525, "learning_rate": 1.3633047072372301e-05, "loss": 1.0311, "step": 1214 }, { "epoch": 3.1074168797953963, "grad_norm": 0.22202571636713042, "learning_rate": 1.3621160527820343e-05, "loss": 1.0737, "step": 1215 }, { "epoch": 3.10997442455243, "grad_norm": 0.2154525697553689, "learning_rate": 1.3609268091866621e-05, "loss": 1.0298, "step": 1216 }, { "epoch": 3.112531969309463, "grad_norm": 0.24440602961960542, "learning_rate": 1.3597369783859385e-05, "loss": 1.0637, "step": 1217 }, { "epoch": 3.1150895140664963, "grad_norm": 0.22947504540372743, "learning_rate": 1.3585465623156434e-05, "loss": 1.0358, "step": 1218 }, { "epoch": 3.1176470588235294, "grad_norm": 0.20546693748205078, "learning_rate": 1.3573555629125097e-05, "loss": 1.0531, "step": 1219 }, { "epoch": 3.1202046035805626, "grad_norm": 0.2376207772257609, "learning_rate": 1.3561639821142187e-05, "loss": 1.0422, "step": 1220 }, { "epoch": 3.122762148337596, "grad_norm": 0.2075906124157621, "learning_rate": 1.3549718218593982e-05, "loss": 1.0373, "step": 1221 }, { "epoch": 3.125319693094629, "grad_norm": 0.2710877805734423, "learning_rate": 1.3537790840876179e-05, "loss": 0.9867, "step": 1222 }, { "epoch": 3.1278772378516626, "grad_norm": 0.21873389694947254, "learning_rate": 1.3525857707393878e-05, "loss": 1.0493, "step": 1223 }, { "epoch": 3.130434782608696, "grad_norm": 0.23140954420047274, "learning_rate": 1.3513918837561544e-05, "loss": 1.0192, "step": 1224 }, { "epoch": 3.132992327365729, "grad_norm": 0.21413826548960174, "learning_rate": 1.3501974250802967e-05, "loss": 1.0233, "step": 1225 }, { "epoch": 3.135549872122762, "grad_norm": 0.2211593381832046, "learning_rate": 1.3490023966551249e-05, "loss": 1.0415, "step": 1226 }, { "epoch": 3.1381074168797953, "grad_norm": 0.23108631631913867, "learning_rate": 1.3478068004248747e-05, "loss": 1.0399, "step": 1227 }, { "epoch": 3.1406649616368285, "grad_norm": 0.22275279756167513, "learning_rate": 1.346610638334707e-05, "loss": 1.0596, "step": 1228 }, { "epoch": 3.1432225063938617, "grad_norm": 0.2524231602837744, "learning_rate": 1.3454139123307023e-05, "loss": 1.065, "step": 1229 }, { "epoch": 3.1457800511508953, "grad_norm": 0.2196098109454718, "learning_rate": 1.3442166243598598e-05, "loss": 1.0497, "step": 1230 }, { "epoch": 3.1483375959079285, "grad_norm": 0.2392235318659055, "learning_rate": 1.3430187763700914e-05, "loss": 1.0579, "step": 1231 }, { "epoch": 3.1508951406649617, "grad_norm": 0.2252882411678263, "learning_rate": 1.341820370310221e-05, "loss": 1.037, "step": 1232 }, { "epoch": 3.153452685421995, "grad_norm": 0.21957606499611643, "learning_rate": 1.3406214081299807e-05, "loss": 1.077, "step": 1233 }, { "epoch": 3.156010230179028, "grad_norm": 0.2158883136158835, "learning_rate": 1.3394218917800064e-05, "loss": 1.0576, "step": 1234 }, { "epoch": 3.1585677749360612, "grad_norm": 0.23206630107006462, "learning_rate": 1.3382218232118367e-05, "loss": 1.046, "step": 1235 }, { "epoch": 3.1611253196930944, "grad_norm": 0.22650165934718894, "learning_rate": 1.3370212043779078e-05, "loss": 1.0513, "step": 1236 }, { "epoch": 3.163682864450128, "grad_norm": 0.2146494581025888, "learning_rate": 1.335820037231552e-05, "loss": 1.0418, "step": 1237 }, { "epoch": 3.166240409207161, "grad_norm": 0.22693672785502703, "learning_rate": 1.3346183237269925e-05, "loss": 1.044, "step": 1238 }, { "epoch": 3.1687979539641944, "grad_norm": 0.24944388113412067, "learning_rate": 1.3334160658193425e-05, "loss": 1.0085, "step": 1239 }, { "epoch": 3.1713554987212276, "grad_norm": 0.2323240702756201, "learning_rate": 1.3322132654646003e-05, "loss": 1.0348, "step": 1240 }, { "epoch": 3.1739130434782608, "grad_norm": 0.23314120380593967, "learning_rate": 1.3310099246196466e-05, "loss": 1.0255, "step": 1241 }, { "epoch": 3.176470588235294, "grad_norm": 0.22959022702139156, "learning_rate": 1.3298060452422421e-05, "loss": 1.0303, "step": 1242 }, { "epoch": 3.1790281329923276, "grad_norm": 0.1945764817333214, "learning_rate": 1.3286016292910229e-05, "loss": 1.0366, "step": 1243 }, { "epoch": 3.1815856777493607, "grad_norm": 0.2049881448552149, "learning_rate": 1.327396678725499e-05, "loss": 1.0224, "step": 1244 }, { "epoch": 3.184143222506394, "grad_norm": 0.245199876694944, "learning_rate": 1.3261911955060493e-05, "loss": 0.9968, "step": 1245 }, { "epoch": 3.186700767263427, "grad_norm": 0.19541276884697034, "learning_rate": 1.3249851815939197e-05, "loss": 1.0502, "step": 1246 }, { "epoch": 3.1892583120204603, "grad_norm": 0.22313066289223873, "learning_rate": 1.3237786389512191e-05, "loss": 1.0577, "step": 1247 }, { "epoch": 3.1918158567774935, "grad_norm": 0.23691814508572034, "learning_rate": 1.3225715695409171e-05, "loss": 1.0407, "step": 1248 }, { "epoch": 3.1943734015345266, "grad_norm": 0.19364764369376442, "learning_rate": 1.3213639753268406e-05, "loss": 1.0289, "step": 1249 }, { "epoch": 3.1969309462915603, "grad_norm": 0.19636310287160377, "learning_rate": 1.3201558582736693e-05, "loss": 1.0389, "step": 1250 }, { "epoch": 3.1994884910485935, "grad_norm": 0.1876664287484004, "learning_rate": 1.3189472203469347e-05, "loss": 1.0167, "step": 1251 }, { "epoch": 3.2020460358056266, "grad_norm": 0.19365316134612506, "learning_rate": 1.3177380635130144e-05, "loss": 1.0522, "step": 1252 }, { "epoch": 3.20460358056266, "grad_norm": 0.17412371216897868, "learning_rate": 1.3165283897391315e-05, "loss": 1.0125, "step": 1253 }, { "epoch": 3.207161125319693, "grad_norm": 0.21377597350657065, "learning_rate": 1.3153182009933495e-05, "loss": 1.035, "step": 1254 }, { "epoch": 3.209718670076726, "grad_norm": 0.18072951551049465, "learning_rate": 1.3141074992445695e-05, "loss": 1.0354, "step": 1255 }, { "epoch": 3.21227621483376, "grad_norm": 0.21819804516231073, "learning_rate": 1.3128962864625281e-05, "loss": 1.0288, "step": 1256 }, { "epoch": 3.214833759590793, "grad_norm": 0.22829327535687294, "learning_rate": 1.3116845646177923e-05, "loss": 1.0329, "step": 1257 }, { "epoch": 3.217391304347826, "grad_norm": 0.22096551556124827, "learning_rate": 1.3104723356817582e-05, "loss": 1.0272, "step": 1258 }, { "epoch": 3.2199488491048593, "grad_norm": 0.19427368545567542, "learning_rate": 1.309259601626646e-05, "loss": 1.0757, "step": 1259 }, { "epoch": 3.2225063938618925, "grad_norm": 0.2517142880283656, "learning_rate": 1.3080463644254986e-05, "loss": 1.0449, "step": 1260 }, { "epoch": 3.2250639386189257, "grad_norm": 0.21438511450639225, "learning_rate": 1.3068326260521769e-05, "loss": 1.0253, "step": 1261 }, { "epoch": 3.227621483375959, "grad_norm": 0.23939604240119217, "learning_rate": 1.3056183884813568e-05, "loss": 1.0055, "step": 1262 }, { "epoch": 3.2301790281329925, "grad_norm": 0.24913816729402657, "learning_rate": 1.3044036536885284e-05, "loss": 1.0305, "step": 1263 }, { "epoch": 3.2327365728900257, "grad_norm": 0.22985968452270927, "learning_rate": 1.3031884236499877e-05, "loss": 1.0356, "step": 1264 }, { "epoch": 3.235294117647059, "grad_norm": 0.2432127136491896, "learning_rate": 1.3019727003428387e-05, "loss": 1.0327, "step": 1265 }, { "epoch": 3.237851662404092, "grad_norm": 0.21511626506563813, "learning_rate": 1.300756485744987e-05, "loss": 1.0351, "step": 1266 }, { "epoch": 3.2404092071611252, "grad_norm": 0.21620331140589194, "learning_rate": 1.2995397818351381e-05, "loss": 1.0272, "step": 1267 }, { "epoch": 3.2429667519181584, "grad_norm": 0.24918797088173247, "learning_rate": 1.2983225905927924e-05, "loss": 0.9923, "step": 1268 }, { "epoch": 3.2455242966751916, "grad_norm": 0.2033868759774891, "learning_rate": 1.2971049139982448e-05, "loss": 1.0526, "step": 1269 }, { "epoch": 3.2480818414322252, "grad_norm": 0.24065409839804014, "learning_rate": 1.2958867540325785e-05, "loss": 1.0283, "step": 1270 }, { "epoch": 3.2506393861892584, "grad_norm": 0.23975735377063542, "learning_rate": 1.294668112677664e-05, "loss": 1.0467, "step": 1271 }, { "epoch": 3.2531969309462916, "grad_norm": 0.20321738007355677, "learning_rate": 1.2934489919161541e-05, "loss": 1.0292, "step": 1272 }, { "epoch": 3.2557544757033248, "grad_norm": 0.22563988593724132, "learning_rate": 1.292229393731482e-05, "loss": 1.0273, "step": 1273 }, { "epoch": 3.258312020460358, "grad_norm": 0.2108784426288754, "learning_rate": 1.2910093201078584e-05, "loss": 1.041, "step": 1274 }, { "epoch": 3.260869565217391, "grad_norm": 0.25182826531670705, "learning_rate": 1.289788773030266e-05, "loss": 1.0507, "step": 1275 }, { "epoch": 3.2634271099744243, "grad_norm": 0.23260866121986465, "learning_rate": 1.2885677544844592e-05, "loss": 1.0073, "step": 1276 }, { "epoch": 3.265984654731458, "grad_norm": 0.20778832907058722, "learning_rate": 1.2873462664569583e-05, "loss": 1.063, "step": 1277 }, { "epoch": 3.268542199488491, "grad_norm": 0.24704017386773852, "learning_rate": 1.2861243109350485e-05, "loss": 1.0275, "step": 1278 }, { "epoch": 3.2710997442455243, "grad_norm": 0.20143011397018976, "learning_rate": 1.2849018899067746e-05, "loss": 1.0786, "step": 1279 }, { "epoch": 3.2736572890025575, "grad_norm": 0.19780957370773475, "learning_rate": 1.2836790053609396e-05, "loss": 1.0475, "step": 1280 }, { "epoch": 3.2762148337595907, "grad_norm": 0.21001290371983408, "learning_rate": 1.2824556592870993e-05, "loss": 1.0544, "step": 1281 }, { "epoch": 3.2787723785166243, "grad_norm": 0.2314545925289747, "learning_rate": 1.2812318536755624e-05, "loss": 1.0432, "step": 1282 }, { "epoch": 3.2813299232736575, "grad_norm": 0.21988256589877733, "learning_rate": 1.2800075905173834e-05, "loss": 1.0432, "step": 1283 }, { "epoch": 3.2838874680306906, "grad_norm": 0.26832633674704665, "learning_rate": 1.2787828718043622e-05, "loss": 1.0379, "step": 1284 }, { "epoch": 3.286445012787724, "grad_norm": 0.2234222589374059, "learning_rate": 1.2775576995290397e-05, "loss": 1.0421, "step": 1285 }, { "epoch": 3.289002557544757, "grad_norm": 0.20516563803916263, "learning_rate": 1.276332075684694e-05, "loss": 1.0392, "step": 1286 }, { "epoch": 3.29156010230179, "grad_norm": 0.2404590656925125, "learning_rate": 1.2751060022653393e-05, "loss": 1.0283, "step": 1287 }, { "epoch": 3.2941176470588234, "grad_norm": 0.19864113603292302, "learning_rate": 1.2738794812657194e-05, "loss": 1.0144, "step": 1288 }, { "epoch": 3.296675191815857, "grad_norm": 0.2323436030300969, "learning_rate": 1.2726525146813078e-05, "loss": 1.0151, "step": 1289 }, { "epoch": 3.29923273657289, "grad_norm": 0.24929371156784427, "learning_rate": 1.2714251045083028e-05, "loss": 1.0137, "step": 1290 }, { "epoch": 3.3017902813299234, "grad_norm": 0.20413376158858587, "learning_rate": 1.2701972527436235e-05, "loss": 1.0233, "step": 1291 }, { "epoch": 3.3043478260869565, "grad_norm": 0.21637513281635873, "learning_rate": 1.2689689613849083e-05, "loss": 1.0586, "step": 1292 }, { "epoch": 3.3069053708439897, "grad_norm": 0.18194714637573692, "learning_rate": 1.2677402324305099e-05, "loss": 0.994, "step": 1293 }, { "epoch": 3.309462915601023, "grad_norm": 0.19606411156722506, "learning_rate": 1.266511067879494e-05, "loss": 1.0283, "step": 1294 }, { "epoch": 3.312020460358056, "grad_norm": 0.19517256802808283, "learning_rate": 1.265281469731634e-05, "loss": 1.0373, "step": 1295 }, { "epoch": 3.3145780051150897, "grad_norm": 0.17867307264513901, "learning_rate": 1.2640514399874095e-05, "loss": 1.0517, "step": 1296 }, { "epoch": 3.317135549872123, "grad_norm": 0.19814474828943063, "learning_rate": 1.2628209806480024e-05, "loss": 1.0068, "step": 1297 }, { "epoch": 3.319693094629156, "grad_norm": 0.21270750338094424, "learning_rate": 1.2615900937152923e-05, "loss": 1.0236, "step": 1298 }, { "epoch": 3.3222506393861893, "grad_norm": 0.21625825452151415, "learning_rate": 1.2603587811918558e-05, "loss": 1.0495, "step": 1299 }, { "epoch": 3.3248081841432224, "grad_norm": 0.23776899893360745, "learning_rate": 1.2591270450809612e-05, "loss": 1.0741, "step": 1300 }, { "epoch": 3.3273657289002556, "grad_norm": 0.22428186293001376, "learning_rate": 1.2578948873865662e-05, "loss": 1.0132, "step": 1301 }, { "epoch": 3.329923273657289, "grad_norm": 0.20864902455184137, "learning_rate": 1.2566623101133144e-05, "loss": 1.0464, "step": 1302 }, { "epoch": 3.3324808184143224, "grad_norm": 0.2685355350833958, "learning_rate": 1.2554293152665316e-05, "loss": 1.0247, "step": 1303 }, { "epoch": 3.3350383631713556, "grad_norm": 0.2527986356697781, "learning_rate": 1.2541959048522239e-05, "loss": 1.0399, "step": 1304 }, { "epoch": 3.337595907928389, "grad_norm": 0.22197339925214596, "learning_rate": 1.2529620808770723e-05, "loss": 1.0157, "step": 1305 }, { "epoch": 3.340153452685422, "grad_norm": 0.3107261506811511, "learning_rate": 1.251727845348432e-05, "loss": 1.0495, "step": 1306 }, { "epoch": 3.342710997442455, "grad_norm": 0.2643689123746537, "learning_rate": 1.2504932002743262e-05, "loss": 1.001, "step": 1307 }, { "epoch": 3.3452685421994883, "grad_norm": 0.2364739279711792, "learning_rate": 1.2492581476634458e-05, "loss": 1.045, "step": 1308 }, { "epoch": 3.3478260869565215, "grad_norm": 0.28136518049730547, "learning_rate": 1.2480226895251439e-05, "loss": 1.0285, "step": 1309 }, { "epoch": 3.350383631713555, "grad_norm": 0.2523350080360508, "learning_rate": 1.2467868278694342e-05, "loss": 1.0658, "step": 1310 }, { "epoch": 3.3529411764705883, "grad_norm": 0.20529584681597104, "learning_rate": 1.245550564706986e-05, "loss": 1.0372, "step": 1311 }, { "epoch": 3.3554987212276215, "grad_norm": 0.26187724014211844, "learning_rate": 1.2443139020491216e-05, "loss": 1.0295, "step": 1312 }, { "epoch": 3.3580562659846547, "grad_norm": 0.2759180573007528, "learning_rate": 1.2430768419078143e-05, "loss": 1.0312, "step": 1313 }, { "epoch": 3.360613810741688, "grad_norm": 0.2020495956799633, "learning_rate": 1.2418393862956837e-05, "loss": 1.0419, "step": 1314 }, { "epoch": 3.363171355498721, "grad_norm": 0.2369272520944126, "learning_rate": 1.2406015372259925e-05, "loss": 1.0122, "step": 1315 }, { "epoch": 3.3657289002557547, "grad_norm": 0.2184979100214276, "learning_rate": 1.2393632967126441e-05, "loss": 1.0327, "step": 1316 }, { "epoch": 3.368286445012788, "grad_norm": 0.23858603204557072, "learning_rate": 1.2381246667701781e-05, "loss": 1.0475, "step": 1317 }, { "epoch": 3.370843989769821, "grad_norm": 0.26756479784593945, "learning_rate": 1.236885649413768e-05, "loss": 1.0426, "step": 1318 }, { "epoch": 3.373401534526854, "grad_norm": 0.1892302039091279, "learning_rate": 1.2356462466592177e-05, "loss": 1.0412, "step": 1319 }, { "epoch": 3.3759590792838874, "grad_norm": 0.29335988888765785, "learning_rate": 1.2344064605229577e-05, "loss": 1.0175, "step": 1320 }, { "epoch": 3.3785166240409206, "grad_norm": 0.21447038773497848, "learning_rate": 1.2331662930220424e-05, "loss": 1.018, "step": 1321 }, { "epoch": 3.381074168797954, "grad_norm": 0.24164773212365756, "learning_rate": 1.2319257461741478e-05, "loss": 1.029, "step": 1322 }, { "epoch": 3.3836317135549874, "grad_norm": 0.23724415736018667, "learning_rate": 1.2306848219975649e-05, "loss": 1.017, "step": 1323 }, { "epoch": 3.3861892583120206, "grad_norm": 0.2146728306264026, "learning_rate": 1.2294435225112005e-05, "loss": 1.0301, "step": 1324 }, { "epoch": 3.3887468030690537, "grad_norm": 0.18212095256468025, "learning_rate": 1.2282018497345705e-05, "loss": 1.0361, "step": 1325 }, { "epoch": 3.391304347826087, "grad_norm": 0.23148682510609303, "learning_rate": 1.2269598056877996e-05, "loss": 1.0385, "step": 1326 }, { "epoch": 3.39386189258312, "grad_norm": 0.20473257376707585, "learning_rate": 1.2257173923916154e-05, "loss": 1.0208, "step": 1327 }, { "epoch": 3.3964194373401533, "grad_norm": 0.20995062344103757, "learning_rate": 1.2244746118673467e-05, "loss": 1.0116, "step": 1328 }, { "epoch": 3.398976982097187, "grad_norm": 0.23774156769953378, "learning_rate": 1.22323146613692e-05, "loss": 1.0742, "step": 1329 }, { "epoch": 3.40153452685422, "grad_norm": 0.20830692559875352, "learning_rate": 1.2219879572228555e-05, "loss": 1.0565, "step": 1330 }, { "epoch": 3.4040920716112533, "grad_norm": 0.2147028468697588, "learning_rate": 1.2207440871482644e-05, "loss": 1.0294, "step": 1331 }, { "epoch": 3.4066496163682864, "grad_norm": 0.24756067918436106, "learning_rate": 1.2194998579368451e-05, "loss": 1.0479, "step": 1332 }, { "epoch": 3.4092071611253196, "grad_norm": 0.2056045421373826, "learning_rate": 1.2182552716128818e-05, "loss": 1.0236, "step": 1333 }, { "epoch": 3.411764705882353, "grad_norm": 0.2079215269898909, "learning_rate": 1.2170103302012374e-05, "loss": 1.0513, "step": 1334 }, { "epoch": 3.414322250639386, "grad_norm": 0.19554068307435188, "learning_rate": 1.2157650357273547e-05, "loss": 1.0389, "step": 1335 }, { "epoch": 3.4168797953964196, "grad_norm": 0.20840944979090947, "learning_rate": 1.2145193902172496e-05, "loss": 1.0355, "step": 1336 }, { "epoch": 3.419437340153453, "grad_norm": 0.21130712097196197, "learning_rate": 1.2132733956975093e-05, "loss": 1.0322, "step": 1337 }, { "epoch": 3.421994884910486, "grad_norm": 0.17958150894777242, "learning_rate": 1.2120270541952892e-05, "loss": 1.0227, "step": 1338 }, { "epoch": 3.424552429667519, "grad_norm": 0.2225571229441682, "learning_rate": 1.210780367738309e-05, "loss": 1.0285, "step": 1339 }, { "epoch": 3.4271099744245523, "grad_norm": 0.1885954682977986, "learning_rate": 1.2095333383548495e-05, "loss": 1.0812, "step": 1340 }, { "epoch": 3.4296675191815855, "grad_norm": 0.2099948092443905, "learning_rate": 1.2082859680737495e-05, "loss": 1.0716, "step": 1341 }, { "epoch": 3.4322250639386187, "grad_norm": 0.2256939428442792, "learning_rate": 1.2070382589244026e-05, "loss": 1.0311, "step": 1342 }, { "epoch": 3.4347826086956523, "grad_norm": 0.23072791297771425, "learning_rate": 1.2057902129367536e-05, "loss": 1.0467, "step": 1343 }, { "epoch": 3.4373401534526855, "grad_norm": 0.2057602125391487, "learning_rate": 1.204541832141295e-05, "loss": 1.028, "step": 1344 }, { "epoch": 3.4398976982097187, "grad_norm": 0.2520074046407619, "learning_rate": 1.2032931185690646e-05, "loss": 1.0163, "step": 1345 }, { "epoch": 3.442455242966752, "grad_norm": 0.2421964192866277, "learning_rate": 1.202044074251641e-05, "loss": 1.063, "step": 1346 }, { "epoch": 3.445012787723785, "grad_norm": 0.20429551187516548, "learning_rate": 1.2007947012211419e-05, "loss": 1.0361, "step": 1347 }, { "epoch": 3.4475703324808182, "grad_norm": 0.2520787216839294, "learning_rate": 1.199545001510218e-05, "loss": 1.054, "step": 1348 }, { "epoch": 3.4501278772378514, "grad_norm": 0.24681543428956615, "learning_rate": 1.1982949771520535e-05, "loss": 1.0605, "step": 1349 }, { "epoch": 3.452685421994885, "grad_norm": 0.20282034999970464, "learning_rate": 1.1970446301803598e-05, "loss": 1.0461, "step": 1350 }, { "epoch": 3.455242966751918, "grad_norm": 0.22677677047988842, "learning_rate": 1.1957939626293726e-05, "loss": 1.0459, "step": 1351 }, { "epoch": 3.4578005115089514, "grad_norm": 0.23929950706752162, "learning_rate": 1.1945429765338507e-05, "loss": 1.0531, "step": 1352 }, { "epoch": 3.4603580562659846, "grad_norm": 0.2096490071983182, "learning_rate": 1.1932916739290694e-05, "loss": 1.0148, "step": 1353 }, { "epoch": 3.4629156010230178, "grad_norm": 0.20618185619438542, "learning_rate": 1.1920400568508201e-05, "loss": 1.0375, "step": 1354 }, { "epoch": 3.4654731457800514, "grad_norm": 0.23186283780985562, "learning_rate": 1.1907881273354059e-05, "loss": 1.0276, "step": 1355 }, { "epoch": 3.4680306905370846, "grad_norm": 0.21691929515578598, "learning_rate": 1.1895358874196377e-05, "loss": 1.0368, "step": 1356 }, { "epoch": 3.4705882352941178, "grad_norm": 0.20410519325755752, "learning_rate": 1.188283339140831e-05, "loss": 1.038, "step": 1357 }, { "epoch": 3.473145780051151, "grad_norm": 0.22863334112386996, "learning_rate": 1.1870304845368043e-05, "loss": 1.0433, "step": 1358 }, { "epoch": 3.475703324808184, "grad_norm": 0.2126661663430652, "learning_rate": 1.1857773256458732e-05, "loss": 1.0605, "step": 1359 }, { "epoch": 3.4782608695652173, "grad_norm": 0.24272298207990836, "learning_rate": 1.184523864506849e-05, "loss": 1.0476, "step": 1360 }, { "epoch": 3.4808184143222505, "grad_norm": 0.20098243757734405, "learning_rate": 1.1832701031590345e-05, "loss": 1.032, "step": 1361 }, { "epoch": 3.483375959079284, "grad_norm": 0.2516527217412891, "learning_rate": 1.1820160436422213e-05, "loss": 1.0392, "step": 1362 }, { "epoch": 3.4859335038363173, "grad_norm": 0.22312520765078486, "learning_rate": 1.1807616879966856e-05, "loss": 1.0549, "step": 1363 }, { "epoch": 3.4884910485933505, "grad_norm": 0.23508194911007732, "learning_rate": 1.1795070382631856e-05, "loss": 1.0257, "step": 1364 }, { "epoch": 3.4910485933503836, "grad_norm": 0.2056219883277526, "learning_rate": 1.1782520964829583e-05, "loss": 1.0616, "step": 1365 }, { "epoch": 3.493606138107417, "grad_norm": 0.22297849379676427, "learning_rate": 1.1769968646977148e-05, "loss": 1.08, "step": 1366 }, { "epoch": 3.49616368286445, "grad_norm": 0.1917605236627194, "learning_rate": 1.1757413449496393e-05, "loss": 1.0582, "step": 1367 }, { "epoch": 3.498721227621483, "grad_norm": 0.22264832355995012, "learning_rate": 1.174485539281384e-05, "loss": 0.9999, "step": 1368 }, { "epoch": 3.501278772378517, "grad_norm": 0.18053830121135175, "learning_rate": 1.1732294497360658e-05, "loss": 1.0481, "step": 1369 }, { "epoch": 3.50383631713555, "grad_norm": 0.25413658020729973, "learning_rate": 1.1719730783572645e-05, "loss": 1.0526, "step": 1370 }, { "epoch": 3.506393861892583, "grad_norm": 0.20438148687464178, "learning_rate": 1.1707164271890168e-05, "loss": 1.0465, "step": 1371 }, { "epoch": 3.5089514066496164, "grad_norm": 0.27411869672391553, "learning_rate": 1.1694594982758164e-05, "loss": 1.0672, "step": 1372 }, { "epoch": 3.5115089514066495, "grad_norm": 0.27020394951486204, "learning_rate": 1.1682022936626076e-05, "loss": 1.0249, "step": 1373 }, { "epoch": 3.5140664961636827, "grad_norm": 0.20542313494356507, "learning_rate": 1.166944815394784e-05, "loss": 1.0444, "step": 1374 }, { "epoch": 3.516624040920716, "grad_norm": 0.2696771035530231, "learning_rate": 1.165687065518184e-05, "loss": 1.0164, "step": 1375 }, { "epoch": 3.5191815856777495, "grad_norm": 0.21834933315057503, "learning_rate": 1.1644290460790879e-05, "loss": 1.0231, "step": 1376 }, { "epoch": 3.5217391304347827, "grad_norm": 0.25602165129241816, "learning_rate": 1.163170759124215e-05, "loss": 1.0499, "step": 1377 }, { "epoch": 3.524296675191816, "grad_norm": 0.2466307590095287, "learning_rate": 1.161912206700719e-05, "loss": 1.0179, "step": 1378 }, { "epoch": 3.526854219948849, "grad_norm": 0.1990877095514582, "learning_rate": 1.1606533908561866e-05, "loss": 1.0825, "step": 1379 }, { "epoch": 3.5294117647058822, "grad_norm": 0.2262880860449741, "learning_rate": 1.1593943136386316e-05, "loss": 1.0239, "step": 1380 }, { "epoch": 3.531969309462916, "grad_norm": 0.23639713675723853, "learning_rate": 1.1581349770964946e-05, "loss": 1.0797, "step": 1381 }, { "epoch": 3.5345268542199486, "grad_norm": 0.19143592758217978, "learning_rate": 1.1568753832786376e-05, "loss": 1.0482, "step": 1382 }, { "epoch": 3.5370843989769822, "grad_norm": 0.21395077968188803, "learning_rate": 1.1556155342343405e-05, "loss": 1.0341, "step": 1383 }, { "epoch": 3.5396419437340154, "grad_norm": 0.20517427967195068, "learning_rate": 1.154355432013299e-05, "loss": 1.0657, "step": 1384 }, { "epoch": 3.5421994884910486, "grad_norm": 0.19022344547536582, "learning_rate": 1.1530950786656205e-05, "loss": 1.0428, "step": 1385 }, { "epoch": 3.544757033248082, "grad_norm": 0.24857892965208156, "learning_rate": 1.1518344762418216e-05, "loss": 1.0614, "step": 1386 }, { "epoch": 3.547314578005115, "grad_norm": 0.17434032950673256, "learning_rate": 1.150573626792823e-05, "loss": 1.0119, "step": 1387 }, { "epoch": 3.5498721227621486, "grad_norm": 0.221669736437551, "learning_rate": 1.1493125323699486e-05, "loss": 1.0325, "step": 1388 }, { "epoch": 3.5524296675191813, "grad_norm": 0.19550877444868983, "learning_rate": 1.1480511950249195e-05, "loss": 1.0621, "step": 1389 }, { "epoch": 3.554987212276215, "grad_norm": 0.20320983764425946, "learning_rate": 1.1467896168098533e-05, "loss": 1.0688, "step": 1390 }, { "epoch": 3.557544757033248, "grad_norm": 0.21236236447911172, "learning_rate": 1.1455277997772585e-05, "loss": 0.9992, "step": 1391 }, { "epoch": 3.5601023017902813, "grad_norm": 0.1946876189282923, "learning_rate": 1.1442657459800323e-05, "loss": 1.0298, "step": 1392 }, { "epoch": 3.5626598465473145, "grad_norm": 0.20833695509734265, "learning_rate": 1.143003457471458e-05, "loss": 1.0481, "step": 1393 }, { "epoch": 3.5652173913043477, "grad_norm": 0.19849397670530705, "learning_rate": 1.1417409363051992e-05, "loss": 1.0508, "step": 1394 }, { "epoch": 3.5677749360613813, "grad_norm": 0.1862173592034928, "learning_rate": 1.1404781845352999e-05, "loss": 1.0586, "step": 1395 }, { "epoch": 3.5703324808184145, "grad_norm": 0.20151362231655162, "learning_rate": 1.1392152042161774e-05, "loss": 1.0319, "step": 1396 }, { "epoch": 3.5728900255754477, "grad_norm": 0.23404342439834142, "learning_rate": 1.1379519974026226e-05, "loss": 1.0151, "step": 1397 }, { "epoch": 3.575447570332481, "grad_norm": 0.18584316354206787, "learning_rate": 1.136688566149793e-05, "loss": 1.0516, "step": 1398 }, { "epoch": 3.578005115089514, "grad_norm": 0.2357364264338847, "learning_rate": 1.1354249125132131e-05, "loss": 1.0558, "step": 1399 }, { "epoch": 3.580562659846547, "grad_norm": 0.255370311471337, "learning_rate": 1.1341610385487677e-05, "loss": 1.0159, "step": 1400 }, { "epoch": 3.5831202046035804, "grad_norm": 0.2015566724373594, "learning_rate": 1.1328969463127009e-05, "loss": 1.0256, "step": 1401 }, { "epoch": 3.585677749360614, "grad_norm": 0.2717588011458947, "learning_rate": 1.1316326378616121e-05, "loss": 1.0452, "step": 1402 }, { "epoch": 3.588235294117647, "grad_norm": 0.226800697503035, "learning_rate": 1.1303681152524514e-05, "loss": 1.0417, "step": 1403 }, { "epoch": 3.5907928388746804, "grad_norm": 0.20628829171202948, "learning_rate": 1.129103380542519e-05, "loss": 1.0483, "step": 1404 }, { "epoch": 3.5933503836317136, "grad_norm": 0.2260665953032841, "learning_rate": 1.1278384357894585e-05, "loss": 1.0407, "step": 1405 }, { "epoch": 3.5959079283887467, "grad_norm": 0.20513785218039995, "learning_rate": 1.1265732830512561e-05, "loss": 1.0391, "step": 1406 }, { "epoch": 3.59846547314578, "grad_norm": 0.21444285296757887, "learning_rate": 1.125307924386236e-05, "loss": 1.0456, "step": 1407 }, { "epoch": 3.601023017902813, "grad_norm": 0.2652819565444848, "learning_rate": 1.1240423618530578e-05, "loss": 1.0501, "step": 1408 }, { "epoch": 3.6035805626598467, "grad_norm": 0.23632809050025924, "learning_rate": 1.122776597510713e-05, "loss": 1.0294, "step": 1409 }, { "epoch": 3.60613810741688, "grad_norm": 0.2185806876530497, "learning_rate": 1.1215106334185201e-05, "loss": 1.0024, "step": 1410 }, { "epoch": 3.608695652173913, "grad_norm": 0.24854116957417377, "learning_rate": 1.1202444716361247e-05, "loss": 1.0451, "step": 1411 }, { "epoch": 3.6112531969309463, "grad_norm": 0.2045525689869136, "learning_rate": 1.1189781142234917e-05, "loss": 1.0635, "step": 1412 }, { "epoch": 3.6138107416879794, "grad_norm": 0.2399433598230184, "learning_rate": 1.1177115632409064e-05, "loss": 1.0177, "step": 1413 }, { "epoch": 3.6163682864450126, "grad_norm": 0.2415017313404832, "learning_rate": 1.1164448207489673e-05, "loss": 1.0379, "step": 1414 }, { "epoch": 3.618925831202046, "grad_norm": 0.21319360249943278, "learning_rate": 1.1151778888085856e-05, "loss": 1.0179, "step": 1415 }, { "epoch": 3.6214833759590794, "grad_norm": 0.24881166658392342, "learning_rate": 1.1139107694809806e-05, "loss": 1.0392, "step": 1416 }, { "epoch": 3.6240409207161126, "grad_norm": 0.19415985264760977, "learning_rate": 1.1126434648276756e-05, "loss": 1.0124, "step": 1417 }, { "epoch": 3.626598465473146, "grad_norm": 0.25642703103922565, "learning_rate": 1.1113759769104965e-05, "loss": 1.0496, "step": 1418 }, { "epoch": 3.629156010230179, "grad_norm": 0.2492878689877881, "learning_rate": 1.1101083077915667e-05, "loss": 1.043, "step": 1419 }, { "epoch": 3.631713554987212, "grad_norm": 0.1983125579481505, "learning_rate": 1.1088404595333046e-05, "loss": 1.0449, "step": 1420 }, { "epoch": 3.634271099744246, "grad_norm": 0.21827713474511093, "learning_rate": 1.1075724341984201e-05, "loss": 1.0622, "step": 1421 }, { "epoch": 3.6368286445012785, "grad_norm": 0.23619084555258635, "learning_rate": 1.1063042338499113e-05, "loss": 1.015, "step": 1422 }, { "epoch": 3.639386189258312, "grad_norm": 0.20336660531825468, "learning_rate": 1.1050358605510606e-05, "loss": 1.0413, "step": 1423 }, { "epoch": 3.6419437340153453, "grad_norm": 0.2421386235557971, "learning_rate": 1.1037673163654321e-05, "loss": 1.0307, "step": 1424 }, { "epoch": 3.6445012787723785, "grad_norm": 0.22360499286457716, "learning_rate": 1.1024986033568683e-05, "loss": 1.0605, "step": 1425 }, { "epoch": 3.6470588235294117, "grad_norm": 0.2378376933825962, "learning_rate": 1.101229723589485e-05, "loss": 1.0192, "step": 1426 }, { "epoch": 3.649616368286445, "grad_norm": 0.22968460013912853, "learning_rate": 1.099960679127671e-05, "loss": 1.0349, "step": 1427 }, { "epoch": 3.6521739130434785, "grad_norm": 0.23158540102865127, "learning_rate": 1.0986914720360821e-05, "loss": 1.0253, "step": 1428 }, { "epoch": 3.6547314578005117, "grad_norm": 0.22013393117978197, "learning_rate": 1.097422104379639e-05, "loss": 1.018, "step": 1429 }, { "epoch": 3.657289002557545, "grad_norm": 0.22220097208242998, "learning_rate": 1.0961525782235233e-05, "loss": 1.0473, "step": 1430 }, { "epoch": 3.659846547314578, "grad_norm": 0.22194116899976712, "learning_rate": 1.0948828956331752e-05, "loss": 1.0424, "step": 1431 }, { "epoch": 3.662404092071611, "grad_norm": 0.1983453396349903, "learning_rate": 1.0936130586742881e-05, "loss": 1.0453, "step": 1432 }, { "epoch": 3.6649616368286444, "grad_norm": 0.2327743943604014, "learning_rate": 1.0923430694128074e-05, "loss": 1.0193, "step": 1433 }, { "epoch": 3.6675191815856776, "grad_norm": 0.21867884439727386, "learning_rate": 1.091072929914927e-05, "loss": 1.0256, "step": 1434 }, { "epoch": 3.670076726342711, "grad_norm": 0.23080732244405422, "learning_rate": 1.0898026422470838e-05, "loss": 1.0232, "step": 1435 }, { "epoch": 3.6726342710997444, "grad_norm": 0.22857566907679472, "learning_rate": 1.0885322084759566e-05, "loss": 1.0536, "step": 1436 }, { "epoch": 3.6751918158567776, "grad_norm": 0.2520804757587095, "learning_rate": 1.0872616306684616e-05, "loss": 1.0287, "step": 1437 }, { "epoch": 3.6777493606138107, "grad_norm": 0.2469698171523125, "learning_rate": 1.0859909108917497e-05, "loss": 1.0909, "step": 1438 }, { "epoch": 3.680306905370844, "grad_norm": 0.2327692634720372, "learning_rate": 1.084720051213202e-05, "loss": 1.0193, "step": 1439 }, { "epoch": 3.682864450127877, "grad_norm": 0.23658961049768784, "learning_rate": 1.0834490537004286e-05, "loss": 1.0212, "step": 1440 }, { "epoch": 3.6854219948849103, "grad_norm": 0.20942394628132058, "learning_rate": 1.0821779204212623e-05, "loss": 1.0249, "step": 1441 }, { "epoch": 3.687979539641944, "grad_norm": 0.23145657493822064, "learning_rate": 1.0809066534437576e-05, "loss": 1.0179, "step": 1442 }, { "epoch": 3.690537084398977, "grad_norm": 0.1999453161376075, "learning_rate": 1.0796352548361863e-05, "loss": 1.0026, "step": 1443 }, { "epoch": 3.6930946291560103, "grad_norm": 0.22035660036843002, "learning_rate": 1.0783637266670348e-05, "loss": 1.0287, "step": 1444 }, { "epoch": 3.6956521739130435, "grad_norm": 0.19317194516834582, "learning_rate": 1.0770920710049997e-05, "loss": 1.0507, "step": 1445 }, { "epoch": 3.6982097186700766, "grad_norm": 0.2457010945328612, "learning_rate": 1.0758202899189852e-05, "loss": 1.0135, "step": 1446 }, { "epoch": 3.70076726342711, "grad_norm": 0.18287871278152357, "learning_rate": 1.0745483854780996e-05, "loss": 1.0408, "step": 1447 }, { "epoch": 3.703324808184143, "grad_norm": 0.23748668263508885, "learning_rate": 1.073276359751652e-05, "loss": 1.0642, "step": 1448 }, { "epoch": 3.7058823529411766, "grad_norm": 0.22123508756316554, "learning_rate": 1.0720042148091487e-05, "loss": 1.0136, "step": 1449 }, { "epoch": 3.70843989769821, "grad_norm": 0.23936061656812962, "learning_rate": 1.0707319527202902e-05, "loss": 1.0297, "step": 1450 }, { "epoch": 3.710997442455243, "grad_norm": 0.27579723622779695, "learning_rate": 1.0694595755549668e-05, "loss": 1.0088, "step": 1451 }, { "epoch": 3.713554987212276, "grad_norm": 0.2295449569053256, "learning_rate": 1.0681870853832572e-05, "loss": 1.0411, "step": 1452 }, { "epoch": 3.7161125319693094, "grad_norm": 0.21165912842223478, "learning_rate": 1.066914484275423e-05, "loss": 1.0237, "step": 1453 }, { "epoch": 3.718670076726343, "grad_norm": 0.22373624538155187, "learning_rate": 1.0656417743019065e-05, "loss": 1.0661, "step": 1454 }, { "epoch": 3.7212276214833757, "grad_norm": 0.18604305862261736, "learning_rate": 1.0643689575333276e-05, "loss": 1.0205, "step": 1455 }, { "epoch": 3.7237851662404093, "grad_norm": 0.22160309843387682, "learning_rate": 1.0630960360404793e-05, "loss": 1.0179, "step": 1456 }, { "epoch": 3.7263427109974425, "grad_norm": 0.1910813020463846, "learning_rate": 1.061823011894326e-05, "loss": 1.0622, "step": 1457 }, { "epoch": 3.7289002557544757, "grad_norm": 0.22862715748972842, "learning_rate": 1.0605498871659974e-05, "loss": 1.0185, "step": 1458 }, { "epoch": 3.731457800511509, "grad_norm": 0.20341936295394042, "learning_rate": 1.0592766639267885e-05, "loss": 1.0534, "step": 1459 }, { "epoch": 3.734015345268542, "grad_norm": 0.2403253522185079, "learning_rate": 1.0580033442481532e-05, "loss": 1.0384, "step": 1460 }, { "epoch": 3.7365728900255757, "grad_norm": 0.22338961464147264, "learning_rate": 1.0567299302017038e-05, "loss": 1.0143, "step": 1461 }, { "epoch": 3.7391304347826084, "grad_norm": 0.2117212049005623, "learning_rate": 1.0554564238592051e-05, "loss": 1.021, "step": 1462 }, { "epoch": 3.741687979539642, "grad_norm": 0.2254372260082909, "learning_rate": 1.0541828272925721e-05, "loss": 1.0292, "step": 1463 }, { "epoch": 3.7442455242966752, "grad_norm": 0.1922734992717323, "learning_rate": 1.0529091425738669e-05, "loss": 1.0489, "step": 1464 }, { "epoch": 3.7468030690537084, "grad_norm": 0.21486062627786348, "learning_rate": 1.0516353717752947e-05, "loss": 1.0359, "step": 1465 }, { "epoch": 3.7493606138107416, "grad_norm": 0.19407217948842267, "learning_rate": 1.0503615169692012e-05, "loss": 1.0342, "step": 1466 }, { "epoch": 3.7519181585677748, "grad_norm": 0.1785805281257786, "learning_rate": 1.0490875802280685e-05, "loss": 1.0353, "step": 1467 }, { "epoch": 3.7544757033248084, "grad_norm": 0.20291577459751503, "learning_rate": 1.0478135636245122e-05, "loss": 1.0306, "step": 1468 }, { "epoch": 3.7570332480818416, "grad_norm": 0.1982096205595046, "learning_rate": 1.046539469231277e-05, "loss": 1.0548, "step": 1469 }, { "epoch": 3.7595907928388748, "grad_norm": 0.20930042720158404, "learning_rate": 1.0452652991212357e-05, "loss": 1.0094, "step": 1470 }, { "epoch": 3.762148337595908, "grad_norm": 0.19919273397375814, "learning_rate": 1.0439910553673829e-05, "loss": 1.0439, "step": 1471 }, { "epoch": 3.764705882352941, "grad_norm": 0.22254826567261315, "learning_rate": 1.0427167400428331e-05, "loss": 1.0373, "step": 1472 }, { "epoch": 3.7672634271099743, "grad_norm": 0.22854611711688827, "learning_rate": 1.0414423552208184e-05, "loss": 1.0199, "step": 1473 }, { "epoch": 3.7698209718670075, "grad_norm": 0.3654589035727414, "learning_rate": 1.0401679029746828e-05, "loss": 1.0311, "step": 1474 }, { "epoch": 3.772378516624041, "grad_norm": 0.19477682817923897, "learning_rate": 1.038893385377881e-05, "loss": 1.0445, "step": 1475 }, { "epoch": 3.7749360613810743, "grad_norm": 0.2035068833502665, "learning_rate": 1.0376188045039723e-05, "loss": 1.035, "step": 1476 }, { "epoch": 3.7774936061381075, "grad_norm": 0.20207740056727894, "learning_rate": 1.0363441624266213e-05, "loss": 1.0054, "step": 1477 }, { "epoch": 3.7800511508951407, "grad_norm": 0.23108316839210677, "learning_rate": 1.0350694612195905e-05, "loss": 1.0299, "step": 1478 }, { "epoch": 3.782608695652174, "grad_norm": 0.19921910618488686, "learning_rate": 1.0337947029567388e-05, "loss": 1.013, "step": 1479 }, { "epoch": 3.785166240409207, "grad_norm": 0.19609376442655463, "learning_rate": 1.0325198897120183e-05, "loss": 1.0239, "step": 1480 }, { "epoch": 3.78772378516624, "grad_norm": 0.2039103534692172, "learning_rate": 1.0312450235594706e-05, "loss": 1.0262, "step": 1481 }, { "epoch": 3.790281329923274, "grad_norm": 0.19686683259289736, "learning_rate": 1.0299701065732235e-05, "loss": 1.0444, "step": 1482 }, { "epoch": 3.792838874680307, "grad_norm": 0.2031103792356114, "learning_rate": 1.0286951408274865e-05, "loss": 1.0993, "step": 1483 }, { "epoch": 3.79539641943734, "grad_norm": 0.2263801739639009, "learning_rate": 1.0274201283965497e-05, "loss": 1.0409, "step": 1484 }, { "epoch": 3.7979539641943734, "grad_norm": 0.17572315424279408, "learning_rate": 1.0261450713547785e-05, "loss": 1.075, "step": 1485 }, { "epoch": 3.8005115089514065, "grad_norm": 0.27023491274755906, "learning_rate": 1.0248699717766107e-05, "loss": 1.0679, "step": 1486 }, { "epoch": 3.80306905370844, "grad_norm": 0.1713633148592625, "learning_rate": 1.023594831736554e-05, "loss": 1.0484, "step": 1487 }, { "epoch": 3.805626598465473, "grad_norm": 0.2367623046752298, "learning_rate": 1.0223196533091813e-05, "loss": 1.0287, "step": 1488 }, { "epoch": 3.8081841432225065, "grad_norm": 0.1984118987646221, "learning_rate": 1.0210444385691282e-05, "loss": 1.0373, "step": 1489 }, { "epoch": 3.8107416879795397, "grad_norm": 0.19013291547902408, "learning_rate": 1.0197691895910895e-05, "loss": 1.0396, "step": 1490 }, { "epoch": 3.813299232736573, "grad_norm": 0.2262690201508357, "learning_rate": 1.0184939084498153e-05, "loss": 1.0383, "step": 1491 }, { "epoch": 3.815856777493606, "grad_norm": 0.21345095926753077, "learning_rate": 1.0172185972201082e-05, "loss": 1.0341, "step": 1492 }, { "epoch": 3.8184143222506393, "grad_norm": 0.18180827453898485, "learning_rate": 1.01594325797682e-05, "loss": 1.0419, "step": 1493 }, { "epoch": 3.820971867007673, "grad_norm": 0.23760325057681905, "learning_rate": 1.0146678927948484e-05, "loss": 1.0178, "step": 1494 }, { "epoch": 3.8235294117647056, "grad_norm": 0.18084043730292876, "learning_rate": 1.013392503749132e-05, "loss": 1.0701, "step": 1495 }, { "epoch": 3.8260869565217392, "grad_norm": 0.18619103410121773, "learning_rate": 1.0121170929146493e-05, "loss": 1.0359, "step": 1496 }, { "epoch": 3.8286445012787724, "grad_norm": 0.1814058213229099, "learning_rate": 1.0108416623664142e-05, "loss": 1.0483, "step": 1497 }, { "epoch": 3.8312020460358056, "grad_norm": 0.17659823284048892, "learning_rate": 1.0095662141794725e-05, "loss": 1.0167, "step": 1498 }, { "epoch": 3.833759590792839, "grad_norm": 0.18093838446366517, "learning_rate": 1.0082907504288977e-05, "loss": 1.0271, "step": 1499 }, { "epoch": 3.836317135549872, "grad_norm": 0.19401662423230362, "learning_rate": 1.0070152731897911e-05, "loss": 1.0525, "step": 1500 }, { "epoch": 3.8388746803069056, "grad_norm": 0.17897896363370017, "learning_rate": 1.0057397845372734e-05, "loss": 1.0354, "step": 1501 }, { "epoch": 3.8414322250639388, "grad_norm": 0.18581636595029996, "learning_rate": 1.004464286546485e-05, "loss": 1.0439, "step": 1502 }, { "epoch": 3.843989769820972, "grad_norm": 0.17458922536736418, "learning_rate": 1.0031887812925818e-05, "loss": 1.0073, "step": 1503 }, { "epoch": 3.846547314578005, "grad_norm": 0.18401279215992355, "learning_rate": 1.0019132708507307e-05, "loss": 1.0549, "step": 1504 }, { "epoch": 3.8491048593350383, "grad_norm": 0.17886260918603583, "learning_rate": 1.0006377572961075e-05, "loss": 1.056, "step": 1505 }, { "epoch": 3.8516624040920715, "grad_norm": 0.18640944420175584, "learning_rate": 9.99362242703893e-06, "loss": 1.0317, "step": 1506 }, { "epoch": 3.8542199488491047, "grad_norm": 0.1724777242125077, "learning_rate": 9.980867291492697e-06, "loss": 1.0496, "step": 1507 }, { "epoch": 3.8567774936061383, "grad_norm": 0.17736614296923925, "learning_rate": 9.968112187074187e-06, "loss": 1.0321, "step": 1508 }, { "epoch": 3.8593350383631715, "grad_norm": 0.18919776197181185, "learning_rate": 9.955357134535153e-06, "loss": 1.0612, "step": 1509 }, { "epoch": 3.8618925831202047, "grad_norm": 0.17013450287572257, "learning_rate": 9.94260215462727e-06, "loss": 1.0371, "step": 1510 }, { "epoch": 3.864450127877238, "grad_norm": 0.1795391930284376, "learning_rate": 9.929847268102092e-06, "loss": 1.0116, "step": 1511 }, { "epoch": 3.867007672634271, "grad_norm": 0.18010343872623125, "learning_rate": 9.917092495711023e-06, "loss": 0.9975, "step": 1512 }, { "epoch": 3.869565217391304, "grad_norm": 0.2018143041172149, "learning_rate": 9.904337858205282e-06, "loss": 1.0261, "step": 1513 }, { "epoch": 3.8721227621483374, "grad_norm": 0.20189193249637963, "learning_rate": 9.891583376335861e-06, "loss": 1.036, "step": 1514 }, { "epoch": 3.874680306905371, "grad_norm": 0.18604316403857601, "learning_rate": 9.87882907085351e-06, "loss": 1.0353, "step": 1515 }, { "epoch": 3.877237851662404, "grad_norm": 0.1764086076077849, "learning_rate": 9.866074962508684e-06, "loss": 1.048, "step": 1516 }, { "epoch": 3.8797953964194374, "grad_norm": 0.18861859299069214, "learning_rate": 9.85332107205152e-06, "loss": 1.0719, "step": 1517 }, { "epoch": 3.8823529411764706, "grad_norm": 0.1729886347071538, "learning_rate": 9.840567420231802e-06, "loss": 1.0436, "step": 1518 }, { "epoch": 3.8849104859335037, "grad_norm": 0.20230041478663247, "learning_rate": 9.82781402779892e-06, "loss": 1.0611, "step": 1519 }, { "epoch": 3.887468030690537, "grad_norm": 0.19599063188718716, "learning_rate": 9.815060915501852e-06, "loss": 1.0517, "step": 1520 }, { "epoch": 3.89002557544757, "grad_norm": 0.20556197980895194, "learning_rate": 9.802308104089109e-06, "loss": 1.0249, "step": 1521 }, { "epoch": 3.8925831202046037, "grad_norm": 0.21413593644142717, "learning_rate": 9.789555614308721e-06, "loss": 0.9947, "step": 1522 }, { "epoch": 3.895140664961637, "grad_norm": 0.20287758208508144, "learning_rate": 9.77680346690819e-06, "loss": 1.0352, "step": 1523 }, { "epoch": 3.89769820971867, "grad_norm": 0.19248950316327032, "learning_rate": 9.764051682634462e-06, "loss": 1.0275, "step": 1524 }, { "epoch": 3.9002557544757033, "grad_norm": 0.22258046212032104, "learning_rate": 9.751300282233895e-06, "loss": 1.0534, "step": 1525 }, { "epoch": 3.9028132992327365, "grad_norm": 0.21347571901775975, "learning_rate": 9.738549286452218e-06, "loss": 1.038, "step": 1526 }, { "epoch": 3.90537084398977, "grad_norm": 0.2280185995042673, "learning_rate": 9.725798716034507e-06, "loss": 1.0286, "step": 1527 }, { "epoch": 3.907928388746803, "grad_norm": 0.20202933779134605, "learning_rate": 9.713048591725138e-06, "loss": 1.0448, "step": 1528 }, { "epoch": 3.9104859335038364, "grad_norm": 0.20920944736139577, "learning_rate": 9.700298934267766e-06, "loss": 1.0069, "step": 1529 }, { "epoch": 3.9130434782608696, "grad_norm": 0.19240200507914293, "learning_rate": 9.687549764405296e-06, "loss": 1.0376, "step": 1530 }, { "epoch": 3.915601023017903, "grad_norm": 0.20292905124684749, "learning_rate": 9.674801102879817e-06, "loss": 1.0274, "step": 1531 }, { "epoch": 3.918158567774936, "grad_norm": 0.19062905855598355, "learning_rate": 9.662052970432617e-06, "loss": 1.0407, "step": 1532 }, { "epoch": 3.920716112531969, "grad_norm": 0.21406493946615143, "learning_rate": 9.6493053878041e-06, "loss": 1.0401, "step": 1533 }, { "epoch": 3.923273657289003, "grad_norm": 0.19190236583371453, "learning_rate": 9.63655837573379e-06, "loss": 1.0521, "step": 1534 }, { "epoch": 3.9258312020460355, "grad_norm": 0.22868484745745557, "learning_rate": 9.623811954960279e-06, "loss": 1.0396, "step": 1535 }, { "epoch": 3.928388746803069, "grad_norm": 0.1896213962401851, "learning_rate": 9.611066146221192e-06, "loss": 1.0272, "step": 1536 }, { "epoch": 3.9309462915601023, "grad_norm": 0.208558000446644, "learning_rate": 9.598320970253175e-06, "loss": 1.0263, "step": 1537 }, { "epoch": 3.9335038363171355, "grad_norm": 0.18215621037833685, "learning_rate": 9.585576447791817e-06, "loss": 1.044, "step": 1538 }, { "epoch": 3.9360613810741687, "grad_norm": 0.17351304593560926, "learning_rate": 9.572832599571674e-06, "loss": 1.0268, "step": 1539 }, { "epoch": 3.938618925831202, "grad_norm": 0.22389061474679745, "learning_rate": 9.560089446326175e-06, "loss": 1.0313, "step": 1540 }, { "epoch": 3.9411764705882355, "grad_norm": 0.17547633776625562, "learning_rate": 9.547347008787648e-06, "loss": 1.0321, "step": 1541 }, { "epoch": 3.9437340153452687, "grad_norm": 0.21231411571444475, "learning_rate": 9.534605307687233e-06, "loss": 1.0027, "step": 1542 }, { "epoch": 3.946291560102302, "grad_norm": 0.1792239552721382, "learning_rate": 9.52186436375488e-06, "loss": 1.0272, "step": 1543 }, { "epoch": 3.948849104859335, "grad_norm": 0.21595336710565813, "learning_rate": 9.509124197719317e-06, "loss": 1.0074, "step": 1544 }, { "epoch": 3.9514066496163682, "grad_norm": 0.20310879984969743, "learning_rate": 9.496384830307988e-06, "loss": 1.0481, "step": 1545 }, { "epoch": 3.9539641943734014, "grad_norm": 0.20949639165674833, "learning_rate": 9.483646282247056e-06, "loss": 1.0167, "step": 1546 }, { "epoch": 3.9565217391304346, "grad_norm": 0.23427285497954728, "learning_rate": 9.470908574261333e-06, "loss": 1.0478, "step": 1547 }, { "epoch": 3.959079283887468, "grad_norm": 0.1881836520862583, "learning_rate": 9.458171727074284e-06, "loss": 1.0257, "step": 1548 }, { "epoch": 3.9616368286445014, "grad_norm": 0.22079043196824938, "learning_rate": 9.44543576140795e-06, "loss": 1.0904, "step": 1549 }, { "epoch": 3.9641943734015346, "grad_norm": 0.18959168411837335, "learning_rate": 9.432700697982962e-06, "loss": 1.0562, "step": 1550 }, { "epoch": 3.9667519181585678, "grad_norm": 0.1881932409897208, "learning_rate": 9.419966557518472e-06, "loss": 1.048, "step": 1551 }, { "epoch": 3.969309462915601, "grad_norm": 0.20694575807793056, "learning_rate": 9.407233360732119e-06, "loss": 1.0453, "step": 1552 }, { "epoch": 3.971867007672634, "grad_norm": 0.21141511803194477, "learning_rate": 9.39450112834003e-06, "loss": 1.0416, "step": 1553 }, { "epoch": 3.9744245524296673, "grad_norm": 0.19924380600743072, "learning_rate": 9.381769881056744e-06, "loss": 1.0302, "step": 1554 }, { "epoch": 3.976982097186701, "grad_norm": 0.18443702573710982, "learning_rate": 9.36903963959521e-06, "loss": 1.0509, "step": 1555 }, { "epoch": 3.979539641943734, "grad_norm": 0.2130900807101153, "learning_rate": 9.356310424666725e-06, "loss": 1.0674, "step": 1556 }, { "epoch": 3.9820971867007673, "grad_norm": 0.18076464736813797, "learning_rate": 9.343582256980937e-06, "loss": 1.0327, "step": 1557 }, { "epoch": 3.9846547314578005, "grad_norm": 0.19770573119978005, "learning_rate": 9.330855157245776e-06, "loss": 1.049, "step": 1558 }, { "epoch": 3.9872122762148337, "grad_norm": 0.18941088064084555, "learning_rate": 9.318129146167432e-06, "loss": 1.0285, "step": 1559 }, { "epoch": 3.9897698209718673, "grad_norm": 0.21949442372495884, "learning_rate": 9.305404244450337e-06, "loss": 1.0447, "step": 1560 }, { "epoch": 3.9923273657289, "grad_norm": 0.19665403880426255, "learning_rate": 9.292680472797101e-06, "loss": 1.0411, "step": 1561 }, { "epoch": 3.9948849104859336, "grad_norm": 0.19058036356127872, "learning_rate": 9.279957851908513e-06, "loss": 1.0535, "step": 1562 }, { "epoch": 3.997442455242967, "grad_norm": 0.18814319318672243, "learning_rate": 9.267236402483482e-06, "loss": 1.036, "step": 1563 }, { "epoch": 4.0, "grad_norm": 0.1865356816625339, "learning_rate": 9.254516145219006e-06, "loss": 1.0435, "step": 1564 }, { "epoch": 4.002557544757034, "grad_norm": 0.19230450271770366, "learning_rate": 9.241797100810152e-06, "loss": 1.0143, "step": 1565 }, { "epoch": 4.005115089514066, "grad_norm": 0.19899721133072965, "learning_rate": 9.229079289950005e-06, "loss": 1.0249, "step": 1566 }, { "epoch": 4.0076726342711, "grad_norm": 0.21185878359559354, "learning_rate": 9.216362733329657e-06, "loss": 0.9987, "step": 1567 }, { "epoch": 4.010230179028133, "grad_norm": 0.1985629222033457, "learning_rate": 9.203647451638138e-06, "loss": 1.0198, "step": 1568 }, { "epoch": 4.012787723785166, "grad_norm": 0.1930121039553769, "learning_rate": 9.190933465562426e-06, "loss": 1.0328, "step": 1569 }, { "epoch": 4.015345268542199, "grad_norm": 0.2189356848452908, "learning_rate": 9.17822079578738e-06, "loss": 1.0358, "step": 1570 }, { "epoch": 4.017902813299233, "grad_norm": 0.18197666560197398, "learning_rate": 9.165509462995716e-06, "loss": 1.0312, "step": 1571 }, { "epoch": 4.020460358056266, "grad_norm": 0.22141370700870244, "learning_rate": 9.152799487867981e-06, "loss": 1.0167, "step": 1572 }, { "epoch": 4.023017902813299, "grad_norm": 0.2061928144217363, "learning_rate": 9.140090891082506e-06, "loss": 1.0173, "step": 1573 }, { "epoch": 4.025575447570333, "grad_norm": 0.1855420730525284, "learning_rate": 9.127383693315387e-06, "loss": 1.0122, "step": 1574 }, { "epoch": 4.028132992327365, "grad_norm": 0.19054702381827276, "learning_rate": 9.114677915240436e-06, "loss": 1.0207, "step": 1575 }, { "epoch": 4.030690537084399, "grad_norm": 0.17786433578081798, "learning_rate": 9.101973577529164e-06, "loss": 1.0339, "step": 1576 }, { "epoch": 4.033248081841432, "grad_norm": 0.18910562787321678, "learning_rate": 9.089270700850733e-06, "loss": 1.0007, "step": 1577 }, { "epoch": 4.035805626598465, "grad_norm": 0.18519350419636166, "learning_rate": 9.076569305871926e-06, "loss": 1.0314, "step": 1578 }, { "epoch": 4.038363171355499, "grad_norm": 0.21754655747857035, "learning_rate": 9.063869413257124e-06, "loss": 1.0302, "step": 1579 }, { "epoch": 4.040920716112532, "grad_norm": 0.18004679417947927, "learning_rate": 9.051171043668251e-06, "loss": 1.0476, "step": 1580 }, { "epoch": 4.043478260869565, "grad_norm": 0.2168920363400877, "learning_rate": 9.038474217764768e-06, "loss": 1.025, "step": 1581 }, { "epoch": 4.046035805626598, "grad_norm": 0.19274796431055907, "learning_rate": 9.025778956203611e-06, "loss": 1.0098, "step": 1582 }, { "epoch": 4.048593350383632, "grad_norm": 0.19201028214018007, "learning_rate": 9.013085279639178e-06, "loss": 1.0017, "step": 1583 }, { "epoch": 4.051150895140665, "grad_norm": 0.19629486524205142, "learning_rate": 9.000393208723291e-06, "loss": 1.0219, "step": 1584 }, { "epoch": 4.053708439897698, "grad_norm": 0.19752451256428386, "learning_rate": 8.987702764105151e-06, "loss": 1.0177, "step": 1585 }, { "epoch": 4.056265984654732, "grad_norm": 0.20166118830323768, "learning_rate": 8.975013966431323e-06, "loss": 1.0601, "step": 1586 }, { "epoch": 4.0588235294117645, "grad_norm": 0.17326861120237855, "learning_rate": 8.96232683634568e-06, "loss": 0.9847, "step": 1587 }, { "epoch": 4.061381074168798, "grad_norm": 0.1898245941021511, "learning_rate": 8.949641394489399e-06, "loss": 1.0099, "step": 1588 }, { "epoch": 4.063938618925831, "grad_norm": 0.1700392821316134, "learning_rate": 8.93695766150089e-06, "loss": 1.0538, "step": 1589 }, { "epoch": 4.0664961636828645, "grad_norm": 0.1682061615806585, "learning_rate": 8.9242756580158e-06, "loss": 1.0172, "step": 1590 }, { "epoch": 4.069053708439898, "grad_norm": 0.19303997092308417, "learning_rate": 8.911595404666957e-06, "loss": 1.0546, "step": 1591 }, { "epoch": 4.071611253196931, "grad_norm": 0.1654939906619837, "learning_rate": 8.898916922084336e-06, "loss": 1.0464, "step": 1592 }, { "epoch": 4.0741687979539645, "grad_norm": 0.18143405806846177, "learning_rate": 8.88624023089504e-06, "loss": 1.0545, "step": 1593 }, { "epoch": 4.076726342710997, "grad_norm": 0.20747010533584376, "learning_rate": 8.873565351723249e-06, "loss": 1.0589, "step": 1594 }, { "epoch": 4.079283887468031, "grad_norm": 0.15953653305890375, "learning_rate": 8.8608923051902e-06, "loss": 1.0179, "step": 1595 }, { "epoch": 4.081841432225064, "grad_norm": 0.2035902582767619, "learning_rate": 8.848221111914147e-06, "loss": 1.0447, "step": 1596 }, { "epoch": 4.084398976982097, "grad_norm": 0.15347759439362155, "learning_rate": 8.835551792510329e-06, "loss": 1.0307, "step": 1597 }, { "epoch": 4.086956521739131, "grad_norm": 0.20574769500088766, "learning_rate": 8.822884367590941e-06, "loss": 0.9952, "step": 1598 }, { "epoch": 4.089514066496164, "grad_norm": 0.1835496415175651, "learning_rate": 8.810218857765085e-06, "loss": 1.0005, "step": 1599 }, { "epoch": 4.092071611253197, "grad_norm": 0.20530099186755948, "learning_rate": 8.79755528363876e-06, "loss": 1.0361, "step": 1600 }, { "epoch": 4.09462915601023, "grad_norm": 0.2026938929869877, "learning_rate": 8.7848936658148e-06, "loss": 1.0328, "step": 1601 }, { "epoch": 4.0971867007672635, "grad_norm": 0.1907662170906002, "learning_rate": 8.772234024892872e-06, "loss": 1.0133, "step": 1602 }, { "epoch": 4.099744245524296, "grad_norm": 0.19617684565754476, "learning_rate": 8.759576381469425e-06, "loss": 1.0027, "step": 1603 }, { "epoch": 4.10230179028133, "grad_norm": 0.17534476994793663, "learning_rate": 8.746920756137642e-06, "loss": 1.0437, "step": 1604 }, { "epoch": 4.1048593350383635, "grad_norm": 0.20521166727954332, "learning_rate": 8.734267169487444e-06, "loss": 1.0265, "step": 1605 }, { "epoch": 4.107416879795396, "grad_norm": 0.17225400361630142, "learning_rate": 8.721615642105417e-06, "loss": 1.0338, "step": 1606 }, { "epoch": 4.10997442455243, "grad_norm": 0.21382338032724127, "learning_rate": 8.708966194574814e-06, "loss": 1.0083, "step": 1607 }, { "epoch": 4.112531969309463, "grad_norm": 0.16180422908572098, "learning_rate": 8.696318847475487e-06, "loss": 1.0169, "step": 1608 }, { "epoch": 4.115089514066496, "grad_norm": 0.23650182130816144, "learning_rate": 8.68367362138388e-06, "loss": 1.0323, "step": 1609 }, { "epoch": 4.117647058823529, "grad_norm": 0.18535588146645351, "learning_rate": 8.671030536872995e-06, "loss": 1.0299, "step": 1610 }, { "epoch": 4.120204603580563, "grad_norm": 0.17955290128121904, "learning_rate": 8.658389614512325e-06, "loss": 1.0189, "step": 1611 }, { "epoch": 4.122762148337596, "grad_norm": 0.1782288851096717, "learning_rate": 8.645750874867876e-06, "loss": 1.0134, "step": 1612 }, { "epoch": 4.125319693094629, "grad_norm": 0.18693604034380645, "learning_rate": 8.633114338502073e-06, "loss": 1.0403, "step": 1613 }, { "epoch": 4.127877237851663, "grad_norm": 0.18248123513699424, "learning_rate": 8.62048002597378e-06, "loss": 1.0288, "step": 1614 }, { "epoch": 4.130434782608695, "grad_norm": 0.18165634630490243, "learning_rate": 8.607847957838227e-06, "loss": 1.0301, "step": 1615 }, { "epoch": 4.132992327365729, "grad_norm": 0.1803487141905229, "learning_rate": 8.595218154647001e-06, "loss": 1.0301, "step": 1616 }, { "epoch": 4.135549872122763, "grad_norm": 0.18173901474688528, "learning_rate": 8.58259063694801e-06, "loss": 1.0222, "step": 1617 }, { "epoch": 4.138107416879795, "grad_norm": 0.18078862560079437, "learning_rate": 8.56996542528542e-06, "loss": 1.0235, "step": 1618 }, { "epoch": 4.140664961636829, "grad_norm": 0.1803693056043885, "learning_rate": 8.55734254019968e-06, "loss": 0.9988, "step": 1619 }, { "epoch": 4.143222506393862, "grad_norm": 0.1865048325076587, "learning_rate": 8.544722002227417e-06, "loss": 1.0538, "step": 1620 }, { "epoch": 4.145780051150895, "grad_norm": 0.17978097814336544, "learning_rate": 8.532103831901472e-06, "loss": 1.035, "step": 1621 }, { "epoch": 4.148337595907928, "grad_norm": 0.23624978152806544, "learning_rate": 8.519488049750808e-06, "loss": 1.0298, "step": 1622 }, { "epoch": 4.150895140664962, "grad_norm": 0.16381055698474817, "learning_rate": 8.506874676300514e-06, "loss": 1.0485, "step": 1623 }, { "epoch": 4.153452685421995, "grad_norm": 0.19963138199162672, "learning_rate": 8.494263732071772e-06, "loss": 1.0092, "step": 1624 }, { "epoch": 4.156010230179028, "grad_norm": 0.19251260911612733, "learning_rate": 8.481655237581785e-06, "loss": 1.0209, "step": 1625 }, { "epoch": 4.158567774936062, "grad_norm": 0.17091450724555518, "learning_rate": 8.469049213343798e-06, "loss": 1.0358, "step": 1626 }, { "epoch": 4.161125319693094, "grad_norm": 0.18111441891291247, "learning_rate": 8.456445679867013e-06, "loss": 1.0235, "step": 1627 }, { "epoch": 4.163682864450128, "grad_norm": 0.1742001195215167, "learning_rate": 8.443844657656596e-06, "loss": 1.0436, "step": 1628 }, { "epoch": 4.166240409207161, "grad_norm": 0.17755175605855264, "learning_rate": 8.431246167213627e-06, "loss": 1.0444, "step": 1629 }, { "epoch": 4.168797953964194, "grad_norm": 0.17719860198513576, "learning_rate": 8.418650229035054e-06, "loss": 1.0321, "step": 1630 }, { "epoch": 4.171355498721228, "grad_norm": 0.1606826181735471, "learning_rate": 8.406056863613689e-06, "loss": 1.0539, "step": 1631 }, { "epoch": 4.173913043478261, "grad_norm": 0.1739885726513299, "learning_rate": 8.393466091438139e-06, "loss": 1.0282, "step": 1632 }, { "epoch": 4.176470588235294, "grad_norm": 0.18218865497775108, "learning_rate": 8.380877932992815e-06, "loss": 1.0239, "step": 1633 }, { "epoch": 4.179028132992327, "grad_norm": 0.16523774532642985, "learning_rate": 8.368292408757853e-06, "loss": 1.02, "step": 1634 }, { "epoch": 4.181585677749361, "grad_norm": 0.17345180693087728, "learning_rate": 8.355709539209121e-06, "loss": 1.0392, "step": 1635 }, { "epoch": 4.1841432225063935, "grad_norm": 0.17255097246631376, "learning_rate": 8.343129344818162e-06, "loss": 1.0714, "step": 1636 }, { "epoch": 4.186700767263427, "grad_norm": 0.1814224170983909, "learning_rate": 8.33055184605216e-06, "loss": 1.0217, "step": 1637 }, { "epoch": 4.189258312020461, "grad_norm": 0.1748560906889792, "learning_rate": 8.317977063373925e-06, "loss": 1.0391, "step": 1638 }, { "epoch": 4.1918158567774935, "grad_norm": 0.18435771096605524, "learning_rate": 8.305405017241837e-06, "loss": 1.0215, "step": 1639 }, { "epoch": 4.194373401534527, "grad_norm": 0.16909940397166726, "learning_rate": 8.292835728109835e-06, "loss": 1.0141, "step": 1640 }, { "epoch": 4.19693094629156, "grad_norm": 0.16864611479976394, "learning_rate": 8.28026921642736e-06, "loss": 0.995, "step": 1641 }, { "epoch": 4.1994884910485935, "grad_norm": 0.1832641724885349, "learning_rate": 8.267705502639342e-06, "loss": 1.0443, "step": 1642 }, { "epoch": 4.202046035805626, "grad_norm": 0.15678971891456242, "learning_rate": 8.255144607186161e-06, "loss": 0.9988, "step": 1643 }, { "epoch": 4.20460358056266, "grad_norm": 0.17026684913571113, "learning_rate": 8.242586550503607e-06, "loss": 1.0413, "step": 1644 }, { "epoch": 4.207161125319693, "grad_norm": 0.17089179054567286, "learning_rate": 8.230031353022855e-06, "loss": 1.0305, "step": 1645 }, { "epoch": 4.209718670076726, "grad_norm": 0.17613488393658056, "learning_rate": 8.217479035170422e-06, "loss": 1.0075, "step": 1646 }, { "epoch": 4.21227621483376, "grad_norm": 0.15804554349273428, "learning_rate": 8.204929617368147e-06, "loss": 1.0119, "step": 1647 }, { "epoch": 4.2148337595907925, "grad_norm": 0.20718638597658195, "learning_rate": 8.192383120033147e-06, "loss": 1.0239, "step": 1648 }, { "epoch": 4.217391304347826, "grad_norm": 0.1845223450299457, "learning_rate": 8.179839563577789e-06, "loss": 1.0044, "step": 1649 }, { "epoch": 4.21994884910486, "grad_norm": 0.1740911877816002, "learning_rate": 8.167298968409658e-06, "loss": 1.0114, "step": 1650 }, { "epoch": 4.2225063938618925, "grad_norm": 0.17787524858695802, "learning_rate": 8.154761354931513e-06, "loss": 1.0342, "step": 1651 }, { "epoch": 4.225063938618926, "grad_norm": 0.17981590233123262, "learning_rate": 8.142226743541273e-06, "loss": 1.0196, "step": 1652 }, { "epoch": 4.227621483375959, "grad_norm": 0.15945346875306546, "learning_rate": 8.12969515463196e-06, "loss": 1.0319, "step": 1653 }, { "epoch": 4.2301790281329925, "grad_norm": 0.1782254652095104, "learning_rate": 8.117166608591693e-06, "loss": 1.027, "step": 1654 }, { "epoch": 4.232736572890025, "grad_norm": 0.16769675527664904, "learning_rate": 8.104641125803628e-06, "loss": 1.0512, "step": 1655 }, { "epoch": 4.235294117647059, "grad_norm": 0.17673772312426278, "learning_rate": 8.092118726645943e-06, "loss": 1.0289, "step": 1656 }, { "epoch": 4.2378516624040925, "grad_norm": 0.17775412310787495, "learning_rate": 8.0795994314918e-06, "loss": 1.0134, "step": 1657 }, { "epoch": 4.240409207161125, "grad_norm": 0.165083768711067, "learning_rate": 8.067083260709309e-06, "loss": 1.0482, "step": 1658 }, { "epoch": 4.242966751918159, "grad_norm": 0.19604799862438058, "learning_rate": 8.054570234661498e-06, "loss": 1.0317, "step": 1659 }, { "epoch": 4.245524296675192, "grad_norm": 0.16528010613818045, "learning_rate": 8.042060373706275e-06, "loss": 1.0348, "step": 1660 }, { "epoch": 4.248081841432225, "grad_norm": 0.1804031281677697, "learning_rate": 8.029553698196405e-06, "loss": 1.0401, "step": 1661 }, { "epoch": 4.250639386189258, "grad_norm": 0.176393933273107, "learning_rate": 8.017050228479467e-06, "loss": 1.0356, "step": 1662 }, { "epoch": 4.253196930946292, "grad_norm": 0.19395943497159726, "learning_rate": 8.004549984897822e-06, "loss": 1.0191, "step": 1663 }, { "epoch": 4.255754475703325, "grad_norm": 0.17246963598612605, "learning_rate": 7.992052987788586e-06, "loss": 1.0162, "step": 1664 }, { "epoch": 4.258312020460358, "grad_norm": 0.18066442113845643, "learning_rate": 7.979559257483591e-06, "loss": 1.0229, "step": 1665 }, { "epoch": 4.260869565217392, "grad_norm": 0.1680697165366633, "learning_rate": 7.967068814309359e-06, "loss": 1.0202, "step": 1666 }, { "epoch": 4.263427109974424, "grad_norm": 0.17705957749246876, "learning_rate": 7.954581678587054e-06, "loss": 1.0324, "step": 1667 }, { "epoch": 4.265984654731458, "grad_norm": 0.16130768348650035, "learning_rate": 7.942097870632467e-06, "loss": 0.9793, "step": 1668 }, { "epoch": 4.268542199488491, "grad_norm": 0.17498237044992782, "learning_rate": 7.929617410755977e-06, "loss": 1.0249, "step": 1669 }, { "epoch": 4.271099744245524, "grad_norm": 0.1925424733299812, "learning_rate": 7.917140319262507e-06, "loss": 1.0365, "step": 1670 }, { "epoch": 4.273657289002558, "grad_norm": 0.18797309789320532, "learning_rate": 7.90466661645151e-06, "loss": 1.0118, "step": 1671 }, { "epoch": 4.276214833759591, "grad_norm": 0.16573297446104532, "learning_rate": 7.892196322616912e-06, "loss": 1.0247, "step": 1672 }, { "epoch": 4.278772378516624, "grad_norm": 0.1925991067748996, "learning_rate": 7.879729458047111e-06, "loss": 0.978, "step": 1673 }, { "epoch": 4.281329923273657, "grad_norm": 0.1758834459188358, "learning_rate": 7.86726604302491e-06, "loss": 1.0175, "step": 1674 }, { "epoch": 4.283887468030691, "grad_norm": 0.16487956982839647, "learning_rate": 7.854806097827507e-06, "loss": 1.0288, "step": 1675 }, { "epoch": 4.286445012787723, "grad_norm": 0.1787793037572042, "learning_rate": 7.842349642726458e-06, "loss": 1.0166, "step": 1676 }, { "epoch": 4.289002557544757, "grad_norm": 0.1841366036398648, "learning_rate": 7.829896697987627e-06, "loss": 1.0348, "step": 1677 }, { "epoch": 4.291560102301791, "grad_norm": 0.1576001038888875, "learning_rate": 7.817447283871187e-06, "loss": 1.0342, "step": 1678 }, { "epoch": 4.294117647058823, "grad_norm": 0.17981916810192364, "learning_rate": 7.80500142063155e-06, "loss": 1.0214, "step": 1679 }, { "epoch": 4.296675191815857, "grad_norm": 0.17518421051117097, "learning_rate": 7.792559128517363e-06, "loss": 1.0404, "step": 1680 }, { "epoch": 4.29923273657289, "grad_norm": 0.16823487687822244, "learning_rate": 7.780120427771449e-06, "loss": 1.0112, "step": 1681 }, { "epoch": 4.301790281329923, "grad_norm": 0.16558738219755195, "learning_rate": 7.7676853386308e-06, "loss": 1.0605, "step": 1682 }, { "epoch": 4.304347826086957, "grad_norm": 0.17794613732094552, "learning_rate": 7.755253881326535e-06, "loss": 1.0371, "step": 1683 }, { "epoch": 4.30690537084399, "grad_norm": 0.19300577747925785, "learning_rate": 7.742826076083848e-06, "loss": 1.06, "step": 1684 }, { "epoch": 4.309462915601023, "grad_norm": 0.16066023211525512, "learning_rate": 7.730401943122007e-06, "loss": 1.0084, "step": 1685 }, { "epoch": 4.312020460358056, "grad_norm": 0.1947539405327399, "learning_rate": 7.717981502654297e-06, "loss": 1.0418, "step": 1686 }, { "epoch": 4.31457800511509, "grad_norm": 0.16039175830465094, "learning_rate": 7.705564774888001e-06, "loss": 1.0039, "step": 1687 }, { "epoch": 4.3171355498721224, "grad_norm": 0.18746085529738926, "learning_rate": 7.693151780024354e-06, "loss": 1.0041, "step": 1688 }, { "epoch": 4.319693094629156, "grad_norm": 0.17014035483962622, "learning_rate": 7.680742538258524e-06, "loss": 1.0087, "step": 1689 }, { "epoch": 4.322250639386189, "grad_norm": 0.19178845859382257, "learning_rate": 7.668337069779577e-06, "loss": 1.0716, "step": 1690 }, { "epoch": 4.324808184143222, "grad_norm": 0.16691270419041054, "learning_rate": 7.655935394770425e-06, "loss": 1.0185, "step": 1691 }, { "epoch": 4.327365728900256, "grad_norm": 0.17518851447109943, "learning_rate": 7.643537533407828e-06, "loss": 1.0173, "step": 1692 }, { "epoch": 4.329923273657289, "grad_norm": 0.16145421958943196, "learning_rate": 7.631143505862325e-06, "loss": 1.0351, "step": 1693 }, { "epoch": 4.332480818414322, "grad_norm": 0.37204295825399436, "learning_rate": 7.618753332298219e-06, "loss": 1.0303, "step": 1694 }, { "epoch": 4.335038363171355, "grad_norm": 0.15830617963945456, "learning_rate": 7.606367032873562e-06, "loss": 1.0129, "step": 1695 }, { "epoch": 4.337595907928389, "grad_norm": 0.18979652677231215, "learning_rate": 7.593984627740075e-06, "loss": 1.0526, "step": 1696 }, { "epoch": 4.340153452685422, "grad_norm": 0.1876359842591056, "learning_rate": 7.5816061370431674e-06, "loss": 1.0181, "step": 1697 }, { "epoch": 4.342710997442455, "grad_norm": 0.18251068037823034, "learning_rate": 7.569231580921858e-06, "loss": 0.996, "step": 1698 }, { "epoch": 4.345268542199489, "grad_norm": 0.17542644898051862, "learning_rate": 7.556860979508791e-06, "loss": 1.0301, "step": 1699 }, { "epoch": 4.3478260869565215, "grad_norm": 0.1927803590994827, "learning_rate": 7.544494352930145e-06, "loss": 1.03, "step": 1700 }, { "epoch": 4.350383631713555, "grad_norm": 0.16917148556319608, "learning_rate": 7.532131721305659e-06, "loss": 0.9895, "step": 1701 }, { "epoch": 4.352941176470588, "grad_norm": 0.18346223176780307, "learning_rate": 7.519773104748562e-06, "loss": 1.0428, "step": 1702 }, { "epoch": 4.3554987212276215, "grad_norm": 0.1628922532881499, "learning_rate": 7.507418523365542e-06, "loss": 1.058, "step": 1703 }, { "epoch": 4.358056265984655, "grad_norm": 0.1876763139643933, "learning_rate": 7.495067997256742e-06, "loss": 1.0112, "step": 1704 }, { "epoch": 4.360613810741688, "grad_norm": 0.15693274545823557, "learning_rate": 7.482721546515683e-06, "loss": 1.0281, "step": 1705 }, { "epoch": 4.3631713554987215, "grad_norm": 0.18630090934243648, "learning_rate": 7.47037919122928e-06, "loss": 1.0418, "step": 1706 }, { "epoch": 4.365728900255754, "grad_norm": 0.16500214550907966, "learning_rate": 7.458040951477763e-06, "loss": 1.0279, "step": 1707 }, { "epoch": 4.368286445012788, "grad_norm": 0.18494529984039387, "learning_rate": 7.4457068473346836e-06, "loss": 1.0155, "step": 1708 }, { "epoch": 4.370843989769821, "grad_norm": 0.19216574362796557, "learning_rate": 7.43337689886686e-06, "loss": 1.0423, "step": 1709 }, { "epoch": 4.373401534526854, "grad_norm": 0.16751025476175924, "learning_rate": 7.42105112613434e-06, "loss": 1.0317, "step": 1710 }, { "epoch": 4.375959079283888, "grad_norm": 0.20151154222401438, "learning_rate": 7.408729549190393e-06, "loss": 1.0536, "step": 1711 }, { "epoch": 4.378516624040921, "grad_norm": 0.18065737789912834, "learning_rate": 7.3964121880814445e-06, "loss": 1.0549, "step": 1712 }, { "epoch": 4.381074168797954, "grad_norm": 0.17160881413407147, "learning_rate": 7.3840990628470824e-06, "loss": 1.0168, "step": 1713 }, { "epoch": 4.383631713554987, "grad_norm": 0.1786512550850061, "learning_rate": 7.371790193519979e-06, "loss": 1.0435, "step": 1714 }, { "epoch": 4.3861892583120206, "grad_norm": 0.19232717850899114, "learning_rate": 7.359485600125904e-06, "loss": 1.0389, "step": 1715 }, { "epoch": 4.388746803069053, "grad_norm": 0.18440121677046997, "learning_rate": 7.347185302683662e-06, "loss": 1.0264, "step": 1716 }, { "epoch": 4.391304347826087, "grad_norm": 0.19371415512946702, "learning_rate": 7.334889321205063e-06, "loss": 1.0622, "step": 1717 }, { "epoch": 4.3938618925831205, "grad_norm": 0.19249478474991424, "learning_rate": 7.322597675694904e-06, "loss": 1.0029, "step": 1718 }, { "epoch": 4.396419437340153, "grad_norm": 0.19009338152933727, "learning_rate": 7.31031038615092e-06, "loss": 1.0165, "step": 1719 }, { "epoch": 4.398976982097187, "grad_norm": 0.18669974928276975, "learning_rate": 7.298027472563768e-06, "loss": 1.0357, "step": 1720 }, { "epoch": 4.40153452685422, "grad_norm": 0.1650051526675111, "learning_rate": 7.285748954916973e-06, "loss": 1.0562, "step": 1721 }, { "epoch": 4.404092071611253, "grad_norm": 0.1917534223305165, "learning_rate": 7.273474853186922e-06, "loss": 1.0409, "step": 1722 }, { "epoch": 4.406649616368286, "grad_norm": 0.17737384077233112, "learning_rate": 7.261205187342809e-06, "loss": 1.0464, "step": 1723 }, { "epoch": 4.40920716112532, "grad_norm": 0.17939864900718247, "learning_rate": 7.248939977346612e-06, "loss": 1.0153, "step": 1724 }, { "epoch": 4.411764705882353, "grad_norm": 0.16822250340936998, "learning_rate": 7.236679243153062e-06, "loss": 1.0274, "step": 1725 }, { "epoch": 4.414322250639386, "grad_norm": 0.2012483436702938, "learning_rate": 7.224423004709607e-06, "loss": 1.0302, "step": 1726 }, { "epoch": 4.41687979539642, "grad_norm": 0.16437642340196237, "learning_rate": 7.212171281956377e-06, "loss": 1.0173, "step": 1727 }, { "epoch": 4.419437340153452, "grad_norm": 0.18420316116672247, "learning_rate": 7.199924094826167e-06, "loss": 1.0154, "step": 1728 }, { "epoch": 4.421994884910486, "grad_norm": 0.17063629208523548, "learning_rate": 7.187681463244377e-06, "loss": 1.0252, "step": 1729 }, { "epoch": 4.42455242966752, "grad_norm": 0.2071747152600751, "learning_rate": 7.175443407129008e-06, "loss": 1.0643, "step": 1730 }, { "epoch": 4.427109974424552, "grad_norm": 0.1596268627900996, "learning_rate": 7.163209946390608e-06, "loss": 1.0094, "step": 1731 }, { "epoch": 4.429667519181586, "grad_norm": 0.17222832212411637, "learning_rate": 7.1509811009322574e-06, "loss": 1.0011, "step": 1732 }, { "epoch": 4.432225063938619, "grad_norm": 0.18768984848570255, "learning_rate": 7.138756890649516e-06, "loss": 1.0344, "step": 1733 }, { "epoch": 4.434782608695652, "grad_norm": 0.20394581557700622, "learning_rate": 7.126537335430417e-06, "loss": 1.0187, "step": 1734 }, { "epoch": 4.437340153452685, "grad_norm": 0.1930227044611592, "learning_rate": 7.1143224551554115e-06, "loss": 1.0391, "step": 1735 }, { "epoch": 4.439897698209719, "grad_norm": 0.19780011138369127, "learning_rate": 7.102112269697341e-06, "loss": 1.0599, "step": 1736 }, { "epoch": 4.442455242966752, "grad_norm": 0.18641195549148987, "learning_rate": 7.08990679892142e-06, "loss": 1.0205, "step": 1737 }, { "epoch": 4.445012787723785, "grad_norm": 0.1745033043017393, "learning_rate": 7.077706062685181e-06, "loss": 1.0254, "step": 1738 }, { "epoch": 4.447570332480819, "grad_norm": 0.1875404190434515, "learning_rate": 7.065510080838465e-06, "loss": 1.0375, "step": 1739 }, { "epoch": 4.450127877237851, "grad_norm": 0.17560201588299784, "learning_rate": 7.053318873223365e-06, "loss": 0.9962, "step": 1740 }, { "epoch": 4.452685421994885, "grad_norm": 0.16337995441327988, "learning_rate": 7.041132459674216e-06, "loss": 1.0151, "step": 1741 }, { "epoch": 4.455242966751918, "grad_norm": 0.17910647034147473, "learning_rate": 7.028950860017555e-06, "loss": 1.059, "step": 1742 }, { "epoch": 4.457800511508951, "grad_norm": 0.1645714876947052, "learning_rate": 7.016774094072077e-06, "loss": 1.0151, "step": 1743 }, { "epoch": 4.460358056265985, "grad_norm": 0.18052975261895468, "learning_rate": 7.004602181648626e-06, "loss": 1.0226, "step": 1744 }, { "epoch": 4.462915601023018, "grad_norm": 0.15506591744701947, "learning_rate": 6.992435142550133e-06, "loss": 1.0315, "step": 1745 }, { "epoch": 4.465473145780051, "grad_norm": 0.18883014972610887, "learning_rate": 6.980272996571617e-06, "loss": 1.035, "step": 1746 }, { "epoch": 4.468030690537084, "grad_norm": 0.17244955302277767, "learning_rate": 6.968115763500127e-06, "loss": 1.0212, "step": 1747 }, { "epoch": 4.470588235294118, "grad_norm": 0.17237420999484432, "learning_rate": 6.95596346311472e-06, "loss": 1.0262, "step": 1748 }, { "epoch": 4.4731457800511505, "grad_norm": 0.18044664054131004, "learning_rate": 6.943816115186432e-06, "loss": 1.0285, "step": 1749 }, { "epoch": 4.475703324808184, "grad_norm": 0.16838623145296286, "learning_rate": 6.931673739478235e-06, "loss": 1.0526, "step": 1750 }, { "epoch": 4.478260869565218, "grad_norm": 0.16324922887275686, "learning_rate": 6.919536355745018e-06, "loss": 1.0174, "step": 1751 }, { "epoch": 4.4808184143222505, "grad_norm": 0.16440559510930122, "learning_rate": 6.907403983733543e-06, "loss": 1.035, "step": 1752 }, { "epoch": 4.483375959079284, "grad_norm": 0.15720327328308067, "learning_rate": 6.895276643182423e-06, "loss": 1.047, "step": 1753 }, { "epoch": 4.485933503836317, "grad_norm": 0.16163765669193314, "learning_rate": 6.883154353822079e-06, "loss": 1.0465, "step": 1754 }, { "epoch": 4.4884910485933505, "grad_norm": 0.17497015050920636, "learning_rate": 6.871037135374722e-06, "loss": 1.0184, "step": 1755 }, { "epoch": 4.491048593350383, "grad_norm": 0.15908864283642854, "learning_rate": 6.858925007554308e-06, "loss": 1.0307, "step": 1756 }, { "epoch": 4.493606138107417, "grad_norm": 0.18008191707505186, "learning_rate": 6.8468179900665095e-06, "loss": 1.0363, "step": 1757 }, { "epoch": 4.4961636828644505, "grad_norm": 0.1854747706459379, "learning_rate": 6.834716102608689e-06, "loss": 1.0083, "step": 1758 }, { "epoch": 4.498721227621483, "grad_norm": 0.1919413504278039, "learning_rate": 6.8226193648698605e-06, "loss": 0.996, "step": 1759 }, { "epoch": 4.501278772378517, "grad_norm": 0.16472038994778412, "learning_rate": 6.810527796530655e-06, "loss": 1.0476, "step": 1760 }, { "epoch": 4.5038363171355495, "grad_norm": 0.1877483916121461, "learning_rate": 6.798441417263311e-06, "loss": 1.042, "step": 1761 }, { "epoch": 4.506393861892583, "grad_norm": 0.1524530347847294, "learning_rate": 6.786360246731595e-06, "loss": 1.0535, "step": 1762 }, { "epoch": 4.508951406649617, "grad_norm": 0.16736289193940437, "learning_rate": 6.774284304590832e-06, "loss": 1.0384, "step": 1763 }, { "epoch": 4.5115089514066495, "grad_norm": 0.1509168260512166, "learning_rate": 6.762213610487813e-06, "loss": 1.0124, "step": 1764 }, { "epoch": 4.514066496163683, "grad_norm": 0.15987500159184168, "learning_rate": 6.75014818406081e-06, "loss": 1.0282, "step": 1765 }, { "epoch": 4.516624040920716, "grad_norm": 0.16208604821494524, "learning_rate": 6.7380880449395105e-06, "loss": 1.017, "step": 1766 }, { "epoch": 4.5191815856777495, "grad_norm": 0.1750240175749838, "learning_rate": 6.726033212745009e-06, "loss": 1.0448, "step": 1767 }, { "epoch": 4.521739130434782, "grad_norm": 0.17627152188563489, "learning_rate": 6.713983707089773e-06, "loss": 1.0431, "step": 1768 }, { "epoch": 4.524296675191816, "grad_norm": 0.172403571140956, "learning_rate": 6.7019395475775805e-06, "loss": 1.0014, "step": 1769 }, { "epoch": 4.526854219948849, "grad_norm": 0.16551799261888245, "learning_rate": 6.6899007538035376e-06, "loss": 1.0277, "step": 1770 }, { "epoch": 4.529411764705882, "grad_norm": 0.17935995088209722, "learning_rate": 6.6778673453539984e-06, "loss": 1.0214, "step": 1771 }, { "epoch": 4.531969309462916, "grad_norm": 0.14762155206935834, "learning_rate": 6.66583934180658e-06, "loss": 1.0254, "step": 1772 }, { "epoch": 4.534526854219949, "grad_norm": 0.18205952935739028, "learning_rate": 6.653816762730079e-06, "loss": 1.0128, "step": 1773 }, { "epoch": 4.537084398976982, "grad_norm": 0.16531567285520288, "learning_rate": 6.641799627684481e-06, "loss": 1.0117, "step": 1774 }, { "epoch": 4.539641943734015, "grad_norm": 0.1761641546294807, "learning_rate": 6.629787956220924e-06, "loss": 1.0047, "step": 1775 }, { "epoch": 4.542199488491049, "grad_norm": 0.16044890357588265, "learning_rate": 6.617781767881635e-06, "loss": 1.0193, "step": 1776 }, { "epoch": 4.544757033248082, "grad_norm": 0.159801416179025, "learning_rate": 6.6057810821999406e-06, "loss": 1.0344, "step": 1777 }, { "epoch": 4.547314578005115, "grad_norm": 0.18194045342283055, "learning_rate": 6.593785918700197e-06, "loss": 1.046, "step": 1778 }, { "epoch": 4.549872122762149, "grad_norm": 0.15701008924351048, "learning_rate": 6.581796296897795e-06, "loss": 1.0264, "step": 1779 }, { "epoch": 4.552429667519181, "grad_norm": 0.16610935282488204, "learning_rate": 6.569812236299089e-06, "loss": 1.0207, "step": 1780 }, { "epoch": 4.554987212276215, "grad_norm": 0.15940091408671517, "learning_rate": 6.557833756401404e-06, "loss": 1.049, "step": 1781 }, { "epoch": 4.557544757033249, "grad_norm": 0.16618353240025538, "learning_rate": 6.545860876692979e-06, "loss": 1.0266, "step": 1782 }, { "epoch": 4.560102301790281, "grad_norm": 0.17022750553375388, "learning_rate": 6.533893616652932e-06, "loss": 1.0791, "step": 1783 }, { "epoch": 4.562659846547315, "grad_norm": 0.17223278557669286, "learning_rate": 6.521931995751258e-06, "loss": 1.001, "step": 1784 }, { "epoch": 4.565217391304348, "grad_norm": 0.18588830803208972, "learning_rate": 6.509976033448755e-06, "loss": 1.0029, "step": 1785 }, { "epoch": 4.567774936061381, "grad_norm": 0.15803052054999583, "learning_rate": 6.498025749197036e-06, "loss": 1.0085, "step": 1786 }, { "epoch": 4.570332480818414, "grad_norm": 0.17758373561683846, "learning_rate": 6.486081162438458e-06, "loss": 1.0215, "step": 1787 }, { "epoch": 4.572890025575448, "grad_norm": 0.1675050184516244, "learning_rate": 6.4741422926061225e-06, "loss": 1.0101, "step": 1788 }, { "epoch": 4.57544757033248, "grad_norm": 0.1802049784719144, "learning_rate": 6.462209159123825e-06, "loss": 1.0594, "step": 1789 }, { "epoch": 4.578005115089514, "grad_norm": 0.15407960949128488, "learning_rate": 6.450281781406022e-06, "loss": 1.0351, "step": 1790 }, { "epoch": 4.580562659846548, "grad_norm": 0.17251700051840302, "learning_rate": 6.438360178857818e-06, "loss": 1.0237, "step": 1791 }, { "epoch": 4.58312020460358, "grad_norm": 0.17736986767063925, "learning_rate": 6.426444370874906e-06, "loss": 1.0262, "step": 1792 }, { "epoch": 4.585677749360614, "grad_norm": 0.18476336736016494, "learning_rate": 6.414534376843566e-06, "loss": 1.018, "step": 1793 }, { "epoch": 4.588235294117647, "grad_norm": 0.17911429354068129, "learning_rate": 6.402630216140618e-06, "loss": 1.0286, "step": 1794 }, { "epoch": 4.59079283887468, "grad_norm": 0.17311984725832297, "learning_rate": 6.39073190813338e-06, "loss": 1.0103, "step": 1795 }, { "epoch": 4.593350383631714, "grad_norm": 0.1621278479186866, "learning_rate": 6.37883947217966e-06, "loss": 1.0228, "step": 1796 }, { "epoch": 4.595907928388747, "grad_norm": 0.18444591716270403, "learning_rate": 6.366952927627703e-06, "loss": 1.0306, "step": 1797 }, { "epoch": 4.59846547314578, "grad_norm": 0.1659804117379894, "learning_rate": 6.355072293816178e-06, "loss": 1.0072, "step": 1798 }, { "epoch": 4.601023017902813, "grad_norm": 0.16571291930690385, "learning_rate": 6.34319759007413e-06, "loss": 1.0122, "step": 1799 }, { "epoch": 4.603580562659847, "grad_norm": 0.1720471422264511, "learning_rate": 6.331328835720961e-06, "loss": 1.0465, "step": 1800 }, { "epoch": 4.6061381074168795, "grad_norm": 0.16256427527474918, "learning_rate": 6.319466050066395e-06, "loss": 1.0069, "step": 1801 }, { "epoch": 4.608695652173913, "grad_norm": 0.16289290458169317, "learning_rate": 6.307609252410438e-06, "loss": 0.9955, "step": 1802 }, { "epoch": 4.611253196930946, "grad_norm": 0.16420344005471815, "learning_rate": 6.295758462043362e-06, "loss": 1.021, "step": 1803 }, { "epoch": 4.6138107416879794, "grad_norm": 0.16431618715461352, "learning_rate": 6.283913698245659e-06, "loss": 0.9887, "step": 1804 }, { "epoch": 4.616368286445013, "grad_norm": 0.162477757683666, "learning_rate": 6.272074980288021e-06, "loss": 1.0315, "step": 1805 }, { "epoch": 4.618925831202046, "grad_norm": 0.1420949863331362, "learning_rate": 6.2602423274313e-06, "loss": 0.9946, "step": 1806 }, { "epoch": 4.621483375959079, "grad_norm": 0.1617352765159284, "learning_rate": 6.248415758926485e-06, "loss": 1.0247, "step": 1807 }, { "epoch": 4.624040920716112, "grad_norm": 0.14727458038419122, "learning_rate": 6.236595294014662e-06, "loss": 1.0695, "step": 1808 }, { "epoch": 4.626598465473146, "grad_norm": 0.15513852752076332, "learning_rate": 6.22478095192699e-06, "loss": 1.0361, "step": 1809 }, { "epoch": 4.629156010230179, "grad_norm": 0.15023148854538287, "learning_rate": 6.212972751884663e-06, "loss": 1.0263, "step": 1810 }, { "epoch": 4.631713554987212, "grad_norm": 0.16087300720694614, "learning_rate": 6.201170713098883e-06, "loss": 1.0248, "step": 1811 }, { "epoch": 4.634271099744246, "grad_norm": 0.15834981601790443, "learning_rate": 6.189374854770832e-06, "loss": 1.053, "step": 1812 }, { "epoch": 4.6368286445012785, "grad_norm": 0.1573655447598696, "learning_rate": 6.177585196091631e-06, "loss": 0.9904, "step": 1813 }, { "epoch": 4.639386189258312, "grad_norm": 0.158683133829273, "learning_rate": 6.16580175624232e-06, "loss": 1.0595, "step": 1814 }, { "epoch": 4.641943734015345, "grad_norm": 0.1597812398342448, "learning_rate": 6.15402455439382e-06, "loss": 1.0517, "step": 1815 }, { "epoch": 4.6445012787723785, "grad_norm": 0.15551450371650033, "learning_rate": 6.142253609706898e-06, "loss": 1.054, "step": 1816 }, { "epoch": 4.647058823529412, "grad_norm": 0.19632917660508345, "learning_rate": 6.130488941332151e-06, "loss": 1.0512, "step": 1817 }, { "epoch": 4.649616368286445, "grad_norm": 0.15643968941800954, "learning_rate": 6.118730568409951e-06, "loss": 1.039, "step": 1818 }, { "epoch": 4.6521739130434785, "grad_norm": 0.20652844984094032, "learning_rate": 6.106978510070443e-06, "loss": 1.0129, "step": 1819 }, { "epoch": 4.654731457800511, "grad_norm": 0.15097637750201956, "learning_rate": 6.095232785433485e-06, "loss": 1.0003, "step": 1820 }, { "epoch": 4.657289002557545, "grad_norm": 0.20892906717171159, "learning_rate": 6.083493413608639e-06, "loss": 1.0032, "step": 1821 }, { "epoch": 4.659846547314578, "grad_norm": 0.14676895460609313, "learning_rate": 6.0717604136951315e-06, "loss": 1.0575, "step": 1822 }, { "epoch": 4.662404092071611, "grad_norm": 0.1744598380072282, "learning_rate": 6.0600338047818155e-06, "loss": 1.0012, "step": 1823 }, { "epoch": 4.664961636828645, "grad_norm": 0.15898084906509888, "learning_rate": 6.048313605947153e-06, "loss": 1.0152, "step": 1824 }, { "epoch": 4.667519181585678, "grad_norm": 0.18500242483627394, "learning_rate": 6.036599836259175e-06, "loss": 1.0202, "step": 1825 }, { "epoch": 4.670076726342711, "grad_norm": 0.17586881973502083, "learning_rate": 6.024892514775451e-06, "loss": 1.0152, "step": 1826 }, { "epoch": 4.672634271099744, "grad_norm": 0.1751917297897623, "learning_rate": 6.013191660543063e-06, "loss": 1.0185, "step": 1827 }, { "epoch": 4.675191815856778, "grad_norm": 0.16539844174921248, "learning_rate": 6.001497292598566e-06, "loss": 1.0091, "step": 1828 }, { "epoch": 4.677749360613811, "grad_norm": 0.16305138932194513, "learning_rate": 5.98980942996797e-06, "loss": 1.0171, "step": 1829 }, { "epoch": 4.680306905370844, "grad_norm": 0.1978081666622713, "learning_rate": 5.97812809166669e-06, "loss": 1.04, "step": 1830 }, { "epoch": 4.6828644501278776, "grad_norm": 0.14529737115947974, "learning_rate": 5.966453296699541e-06, "loss": 1.0219, "step": 1831 }, { "epoch": 4.68542199488491, "grad_norm": 0.19132792503166993, "learning_rate": 5.954785064060678e-06, "loss": 1.0466, "step": 1832 }, { "epoch": 4.687979539641944, "grad_norm": 0.14925809757481498, "learning_rate": 5.943123412733587e-06, "loss": 1.0168, "step": 1833 }, { "epoch": 4.690537084398977, "grad_norm": 0.19480783069632648, "learning_rate": 5.931468361691053e-06, "loss": 1.074, "step": 1834 }, { "epoch": 4.69309462915601, "grad_norm": 0.1597024405029427, "learning_rate": 5.919819929895106e-06, "loss": 1.0365, "step": 1835 }, { "epoch": 4.695652173913043, "grad_norm": 0.179287834985346, "learning_rate": 5.9081781362970205e-06, "loss": 1.0461, "step": 1836 }, { "epoch": 4.698209718670077, "grad_norm": 0.16882218098581764, "learning_rate": 5.896542999837265e-06, "loss": 1.0305, "step": 1837 }, { "epoch": 4.70076726342711, "grad_norm": 0.14058129617791865, "learning_rate": 5.8849145394454806e-06, "loss": 0.9987, "step": 1838 }, { "epoch": 4.703324808184143, "grad_norm": 0.18349693674349288, "learning_rate": 5.873292774040442e-06, "loss": 0.9943, "step": 1839 }, { "epoch": 4.705882352941177, "grad_norm": 0.1610970199108, "learning_rate": 5.861677722530037e-06, "loss": 1.0579, "step": 1840 }, { "epoch": 4.708439897698209, "grad_norm": 0.166987113555728, "learning_rate": 5.850069403811235e-06, "loss": 1.0181, "step": 1841 }, { "epoch": 4.710997442455243, "grad_norm": 0.1677755864894642, "learning_rate": 5.8384678367700325e-06, "loss": 1.0125, "step": 1842 }, { "epoch": 4.713554987212277, "grad_norm": 0.1779899568878102, "learning_rate": 5.826873040281462e-06, "loss": 1.0157, "step": 1843 }, { "epoch": 4.716112531969309, "grad_norm": 0.16348039752545065, "learning_rate": 5.81528503320953e-06, "loss": 1.0343, "step": 1844 }, { "epoch": 4.718670076726343, "grad_norm": 0.1670971620135551, "learning_rate": 5.8037038344072e-06, "loss": 1.0318, "step": 1845 }, { "epoch": 4.721227621483376, "grad_norm": 0.18617223073968917, "learning_rate": 5.792129462716355e-06, "loss": 1.0219, "step": 1846 }, { "epoch": 4.723785166240409, "grad_norm": 0.15449905092529612, "learning_rate": 5.780561936967779e-06, "loss": 1.0272, "step": 1847 }, { "epoch": 4.726342710997442, "grad_norm": 0.1750868480359856, "learning_rate": 5.769001275981112e-06, "loss": 1.0565, "step": 1848 }, { "epoch": 4.728900255754476, "grad_norm": 0.1663229129876114, "learning_rate": 5.757447498564821e-06, "loss": 1.0535, "step": 1849 }, { "epoch": 4.731457800511509, "grad_norm": 0.15809631122185844, "learning_rate": 5.745900623516189e-06, "loss": 1.021, "step": 1850 }, { "epoch": 4.734015345268542, "grad_norm": 0.16459750473842777, "learning_rate": 5.734360669621255e-06, "loss": 1.0248, "step": 1851 }, { "epoch": 4.736572890025576, "grad_norm": 0.15287249372875325, "learning_rate": 5.722827655654801e-06, "loss": 1.0156, "step": 1852 }, { "epoch": 4.739130434782608, "grad_norm": 0.1605211421637796, "learning_rate": 5.711301600380317e-06, "loss": 1.0569, "step": 1853 }, { "epoch": 4.741687979539642, "grad_norm": 0.14939498740260876, "learning_rate": 5.699782522549983e-06, "loss": 1.0509, "step": 1854 }, { "epoch": 4.744245524296675, "grad_norm": 0.16398542522125342, "learning_rate": 5.688270440904613e-06, "loss": 1.0273, "step": 1855 }, { "epoch": 4.746803069053708, "grad_norm": 0.16733173044314129, "learning_rate": 5.6767653741736405e-06, "loss": 0.9938, "step": 1856 }, { "epoch": 4.749360613810742, "grad_norm": 0.1505426061615439, "learning_rate": 5.665267341075098e-06, "loss": 1.0144, "step": 1857 }, { "epoch": 4.751918158567775, "grad_norm": 0.1527851077672571, "learning_rate": 5.653776360315562e-06, "loss": 1.0478, "step": 1858 }, { "epoch": 4.754475703324808, "grad_norm": 0.16913240191236387, "learning_rate": 5.642292450590134e-06, "loss": 1.0122, "step": 1859 }, { "epoch": 4.757033248081841, "grad_norm": 0.158875356158748, "learning_rate": 5.630815630582429e-06, "loss": 1.0413, "step": 1860 }, { "epoch": 4.759590792838875, "grad_norm": 0.14953756040104652, "learning_rate": 5.61934591896451e-06, "loss": 1.0337, "step": 1861 }, { "epoch": 4.762148337595908, "grad_norm": 0.17219828313172605, "learning_rate": 5.60788333439688e-06, "loss": 1.0287, "step": 1862 }, { "epoch": 4.764705882352941, "grad_norm": 0.1659776610530445, "learning_rate": 5.596427895528443e-06, "loss": 1.0443, "step": 1863 }, { "epoch": 4.767263427109975, "grad_norm": 0.1676484186832149, "learning_rate": 5.584979620996491e-06, "loss": 1.0489, "step": 1864 }, { "epoch": 4.7698209718670075, "grad_norm": 0.1623795959715509, "learning_rate": 5.573538529426645e-06, "loss": 1.0144, "step": 1865 }, { "epoch": 4.772378516624041, "grad_norm": 0.16256260144035772, "learning_rate": 5.562104639432845e-06, "loss": 1.0427, "step": 1866 }, { "epoch": 4.774936061381074, "grad_norm": 0.17175961986303814, "learning_rate": 5.550677969617319e-06, "loss": 1.0162, "step": 1867 }, { "epoch": 4.7774936061381075, "grad_norm": 0.1542050330321217, "learning_rate": 5.539258538570544e-06, "loss": 1.0164, "step": 1868 }, { "epoch": 4.78005115089514, "grad_norm": 0.15918533657676529, "learning_rate": 5.527846364871219e-06, "loss": 1.0309, "step": 1869 }, { "epoch": 4.782608695652174, "grad_norm": 0.1403676241793028, "learning_rate": 5.516441467086231e-06, "loss": 1.0228, "step": 1870 }, { "epoch": 4.7851662404092075, "grad_norm": 0.14773251181856192, "learning_rate": 5.505043863770646e-06, "loss": 1.0734, "step": 1871 }, { "epoch": 4.78772378516624, "grad_norm": 0.16196858898805197, "learning_rate": 5.493653573467647e-06, "loss": 1.0048, "step": 1872 }, { "epoch": 4.790281329923274, "grad_norm": 0.15355301379517172, "learning_rate": 5.4822706147085205e-06, "loss": 1.0125, "step": 1873 }, { "epoch": 4.792838874680307, "grad_norm": 0.18982539717495267, "learning_rate": 5.470895006012637e-06, "loss": 0.9959, "step": 1874 }, { "epoch": 4.79539641943734, "grad_norm": 0.1573171337655545, "learning_rate": 5.459526765887397e-06, "loss": 1.0297, "step": 1875 }, { "epoch": 4.797953964194374, "grad_norm": 0.16351573968402464, "learning_rate": 5.448165912828214e-06, "loss": 0.9945, "step": 1876 }, { "epoch": 4.8005115089514065, "grad_norm": 0.18629349709548856, "learning_rate": 5.4368124653184835e-06, "loss": 1.0363, "step": 1877 }, { "epoch": 4.80306905370844, "grad_norm": 0.17008978855695026, "learning_rate": 5.4254664418295634e-06, "loss": 1.0273, "step": 1878 }, { "epoch": 4.805626598465473, "grad_norm": 0.16524085689648021, "learning_rate": 5.414127860820719e-06, "loss": 1.0098, "step": 1879 }, { "epoch": 4.8081841432225065, "grad_norm": 0.18739927868121126, "learning_rate": 5.402796740739109e-06, "loss": 1.0057, "step": 1880 }, { "epoch": 4.810741687979539, "grad_norm": 0.17551431540439197, "learning_rate": 5.391473100019767e-06, "loss": 1.0378, "step": 1881 }, { "epoch": 4.813299232736573, "grad_norm": 0.20076574431883742, "learning_rate": 5.380156957085536e-06, "loss": 1.0054, "step": 1882 }, { "epoch": 4.8158567774936065, "grad_norm": 0.1633457331284817, "learning_rate": 5.3688483303470895e-06, "loss": 0.9945, "step": 1883 }, { "epoch": 4.818414322250639, "grad_norm": 0.18981752589117254, "learning_rate": 5.3575472382028386e-06, "loss": 1.018, "step": 1884 }, { "epoch": 4.820971867007673, "grad_norm": 0.1796254125656967, "learning_rate": 5.346253699038966e-06, "loss": 1.0175, "step": 1885 }, { "epoch": 4.823529411764706, "grad_norm": 0.18612504881053146, "learning_rate": 5.334967731229348e-06, "loss": 1.0343, "step": 1886 }, { "epoch": 4.826086956521739, "grad_norm": 0.1896503989682664, "learning_rate": 5.323689353135546e-06, "loss": 1.033, "step": 1887 }, { "epoch": 4.828644501278772, "grad_norm": 0.17351769644886408, "learning_rate": 5.312418583106784e-06, "loss": 1.0341, "step": 1888 }, { "epoch": 4.831202046035806, "grad_norm": 0.19813048664100952, "learning_rate": 5.301155439479893e-06, "loss": 1.0189, "step": 1889 }, { "epoch": 4.833759590792839, "grad_norm": 0.17414587401870055, "learning_rate": 5.289899940579315e-06, "loss": 0.9979, "step": 1890 }, { "epoch": 4.836317135549872, "grad_norm": 0.17954394790720937, "learning_rate": 5.278652104717026e-06, "loss": 1.033, "step": 1891 }, { "epoch": 4.838874680306906, "grad_norm": 0.18225354012614833, "learning_rate": 5.267411950192558e-06, "loss": 1.0006, "step": 1892 }, { "epoch": 4.841432225063938, "grad_norm": 0.19171250300846782, "learning_rate": 5.256179495292953e-06, "loss": 0.976, "step": 1893 }, { "epoch": 4.843989769820972, "grad_norm": 0.16560762200333132, "learning_rate": 5.244954758292691e-06, "loss": 1.03, "step": 1894 }, { "epoch": 4.846547314578006, "grad_norm": 0.17384349031638302, "learning_rate": 5.233737757453733e-06, "loss": 1.017, "step": 1895 }, { "epoch": 4.849104859335038, "grad_norm": 0.18200737855014837, "learning_rate": 5.222528511025429e-06, "loss": 1.0544, "step": 1896 }, { "epoch": 4.851662404092072, "grad_norm": 0.1674383880489774, "learning_rate": 5.2113270372445334e-06, "loss": 1.0199, "step": 1897 }, { "epoch": 4.854219948849105, "grad_norm": 0.16206185822222566, "learning_rate": 5.200133354335129e-06, "loss": 1.0297, "step": 1898 }, { "epoch": 4.856777493606138, "grad_norm": 0.16330979230562037, "learning_rate": 5.188947480508644e-06, "loss": 1.0618, "step": 1899 }, { "epoch": 4.859335038363171, "grad_norm": 0.1641289208809162, "learning_rate": 5.177769433963801e-06, "loss": 1.0095, "step": 1900 }, { "epoch": 4.861892583120205, "grad_norm": 0.16857653947800838, "learning_rate": 5.166599232886579e-06, "loss": 1.0132, "step": 1901 }, { "epoch": 4.864450127877237, "grad_norm": 0.15123752972525892, "learning_rate": 5.155436895450197e-06, "loss": 1.0231, "step": 1902 }, { "epoch": 4.867007672634271, "grad_norm": 0.18007827051394826, "learning_rate": 5.144282439815075e-06, "loss": 1.0299, "step": 1903 }, { "epoch": 4.869565217391305, "grad_norm": 0.17145491388315698, "learning_rate": 5.133135884128828e-06, "loss": 1.0426, "step": 1904 }, { "epoch": 4.872122762148337, "grad_norm": 0.15111451411798363, "learning_rate": 5.121997246526188e-06, "loss": 1.0335, "step": 1905 }, { "epoch": 4.874680306905371, "grad_norm": 0.17562740075351813, "learning_rate": 5.110866545129031e-06, "loss": 1.0226, "step": 1906 }, { "epoch": 4.877237851662404, "grad_norm": 0.14883986205754957, "learning_rate": 5.099743798046315e-06, "loss": 1.03, "step": 1907 }, { "epoch": 4.879795396419437, "grad_norm": 0.16425606815927463, "learning_rate": 5.088629023374052e-06, "loss": 1.0524, "step": 1908 }, { "epoch": 4.882352941176471, "grad_norm": 0.15699998164150683, "learning_rate": 5.0775222391952826e-06, "loss": 1.0598, "step": 1909 }, { "epoch": 4.884910485933504, "grad_norm": 0.16747367530556498, "learning_rate": 5.06642346358005e-06, "loss": 1.0197, "step": 1910 }, { "epoch": 4.887468030690537, "grad_norm": 0.19072243056188606, "learning_rate": 5.055332714585372e-06, "loss": 1.001, "step": 1911 }, { "epoch": 4.89002557544757, "grad_norm": 0.16853967810789172, "learning_rate": 5.044250010255202e-06, "loss": 1.0432, "step": 1912 }, { "epoch": 4.892583120204604, "grad_norm": 0.17828385119329374, "learning_rate": 5.033175368620406e-06, "loss": 1.0314, "step": 1913 }, { "epoch": 4.8951406649616365, "grad_norm": 0.15062414843555882, "learning_rate": 5.022108807698735e-06, "loss": 1.0358, "step": 1914 }, { "epoch": 4.89769820971867, "grad_norm": 0.17399854674836523, "learning_rate": 5.0110503454947926e-06, "loss": 1.0265, "step": 1915 }, { "epoch": 4.900255754475703, "grad_norm": 0.16505478849391259, "learning_rate": 5.000000000000003e-06, "loss": 1.0495, "step": 1916 }, { "epoch": 4.9028132992327365, "grad_norm": 0.1446909805445552, "learning_rate": 4.988957789192583e-06, "loss": 1.0044, "step": 1917 }, { "epoch": 4.90537084398977, "grad_norm": 0.16047225013403066, "learning_rate": 4.97792373103753e-06, "loss": 0.977, "step": 1918 }, { "epoch": 4.907928388746803, "grad_norm": 0.15267602057033672, "learning_rate": 4.966897843486561e-06, "loss": 1.0563, "step": 1919 }, { "epoch": 4.910485933503836, "grad_norm": 0.14094891470116488, "learning_rate": 4.955880144478101e-06, "loss": 1.0172, "step": 1920 }, { "epoch": 4.913043478260869, "grad_norm": 0.16225336285064607, "learning_rate": 4.944870651937267e-06, "loss": 1.0332, "step": 1921 }, { "epoch": 4.915601023017903, "grad_norm": 0.15352807995544615, "learning_rate": 4.933869383775809e-06, "loss": 1.0285, "step": 1922 }, { "epoch": 4.918158567774936, "grad_norm": 0.14893755036217834, "learning_rate": 4.922876357892103e-06, "loss": 1.0082, "step": 1923 }, { "epoch": 4.920716112531969, "grad_norm": 0.17251988177114058, "learning_rate": 4.911891592171113e-06, "loss": 1.0131, "step": 1924 }, { "epoch": 4.923273657289003, "grad_norm": 0.15340872718806947, "learning_rate": 4.900915104484372e-06, "loss": 1.0502, "step": 1925 }, { "epoch": 4.9258312020460355, "grad_norm": 0.16259551968874744, "learning_rate": 4.889946912689936e-06, "loss": 1.0457, "step": 1926 }, { "epoch": 4.928388746803069, "grad_norm": 0.15432669889294595, "learning_rate": 4.878987034632361e-06, "loss": 1.0491, "step": 1927 }, { "epoch": 4.930946291560103, "grad_norm": 0.16399149074989694, "learning_rate": 4.8680354881426935e-06, "loss": 1.011, "step": 1928 }, { "epoch": 4.9335038363171355, "grad_norm": 0.17537267004354543, "learning_rate": 4.857092291038411e-06, "loss": 1.0356, "step": 1929 }, { "epoch": 4.936061381074169, "grad_norm": 0.15804425089068397, "learning_rate": 4.846157461123411e-06, "loss": 1.0556, "step": 1930 }, { "epoch": 4.938618925831202, "grad_norm": 0.1644217524312441, "learning_rate": 4.8352310161879724e-06, "loss": 1.0521, "step": 1931 }, { "epoch": 4.9411764705882355, "grad_norm": 0.166490586450367, "learning_rate": 4.824312974008748e-06, "loss": 1.0348, "step": 1932 }, { "epoch": 4.943734015345268, "grad_norm": 0.15262614264530625, "learning_rate": 4.813403352348703e-06, "loss": 1.003, "step": 1933 }, { "epoch": 4.946291560102302, "grad_norm": 0.16914604106371434, "learning_rate": 4.8025021689571095e-06, "loss": 1.0261, "step": 1934 }, { "epoch": 4.948849104859335, "grad_norm": 0.14949420788516232, "learning_rate": 4.791609441569517e-06, "loss": 1.013, "step": 1935 }, { "epoch": 4.951406649616368, "grad_norm": 0.18410232609002486, "learning_rate": 4.780725187907707e-06, "loss": 1.0211, "step": 1936 }, { "epoch": 4.953964194373402, "grad_norm": 0.14300056243568887, "learning_rate": 4.769849425679683e-06, "loss": 1.0222, "step": 1937 }, { "epoch": 4.956521739130435, "grad_norm": 0.17246451645014146, "learning_rate": 4.758982172579621e-06, "loss": 0.9967, "step": 1938 }, { "epoch": 4.959079283887468, "grad_norm": 0.17259140226193048, "learning_rate": 4.748123446287875e-06, "loss": 1.0321, "step": 1939 }, { "epoch": 4.961636828644501, "grad_norm": 1.1109363534677956, "learning_rate": 4.737273264470909e-06, "loss": 1.0923, "step": 1940 }, { "epoch": 4.964194373401535, "grad_norm": 0.17074890567417172, "learning_rate": 4.726431644781284e-06, "loss": 1.0245, "step": 1941 }, { "epoch": 4.966751918158568, "grad_norm": 0.15432050773937248, "learning_rate": 4.715598604857648e-06, "loss": 1.0378, "step": 1942 }, { "epoch": 4.969309462915601, "grad_norm": 0.15888604747270782, "learning_rate": 4.704774162324673e-06, "loss": 1.0287, "step": 1943 }, { "epoch": 4.971867007672635, "grad_norm": 0.17597082523498278, "learning_rate": 4.6939583347930525e-06, "loss": 1.0024, "step": 1944 }, { "epoch": 4.974424552429667, "grad_norm": 0.15465610920055028, "learning_rate": 4.6831511398594574e-06, "loss": 1.0216, "step": 1945 }, { "epoch": 4.976982097186701, "grad_norm": 0.16914400485984177, "learning_rate": 4.672352595106525e-06, "loss": 1.0595, "step": 1946 }, { "epoch": 4.979539641943734, "grad_norm": 0.17772012019293779, "learning_rate": 4.661562718102808e-06, "loss": 1.0056, "step": 1947 }, { "epoch": 4.982097186700767, "grad_norm": 0.14226899552306443, "learning_rate": 4.65078152640276e-06, "loss": 1.0221, "step": 1948 }, { "epoch": 4.9846547314578, "grad_norm": 0.14866025187075746, "learning_rate": 4.640009037546711e-06, "loss": 1.0534, "step": 1949 }, { "epoch": 4.987212276214834, "grad_norm": 0.18309163357147787, "learning_rate": 4.629245269060826e-06, "loss": 1.046, "step": 1950 }, { "epoch": 4.989769820971867, "grad_norm": 0.14195791571684566, "learning_rate": 4.61849023845708e-06, "loss": 1.0119, "step": 1951 }, { "epoch": 4.9923273657289, "grad_norm": 0.15240227847957083, "learning_rate": 4.607743963233233e-06, "loss": 1.0373, "step": 1952 }, { "epoch": 4.994884910485934, "grad_norm": 0.1706260447764414, "learning_rate": 4.5970064608728085e-06, "loss": 0.9995, "step": 1953 }, { "epoch": 4.997442455242966, "grad_norm": 0.16263531281395652, "learning_rate": 4.586277748845056e-06, "loss": 1.0053, "step": 1954 }, { "epoch": 5.0, "grad_norm": 0.15411495644560275, "learning_rate": 4.575557844604905e-06, "loss": 1.0268, "step": 1955 }, { "epoch": 5.002557544757034, "grad_norm": 0.15615925966080388, "learning_rate": 4.5648467655929815e-06, "loss": 1.0199, "step": 1956 }, { "epoch": 5.005115089514066, "grad_norm": 0.16045903540647527, "learning_rate": 4.554144529235537e-06, "loss": 1.0277, "step": 1957 }, { "epoch": 5.0076726342711, "grad_norm": 0.16031341969126212, "learning_rate": 4.543451152944438e-06, "loss": 1.0562, "step": 1958 }, { "epoch": 5.010230179028133, "grad_norm": 0.1429706019310508, "learning_rate": 4.532766654117146e-06, "loss": 1.031, "step": 1959 }, { "epoch": 5.012787723785166, "grad_norm": 0.15753846906492294, "learning_rate": 4.5220910501366635e-06, "loss": 1.0368, "step": 1960 }, { "epoch": 5.015345268542199, "grad_norm": 0.14579202507979455, "learning_rate": 4.511424358371544e-06, "loss": 1.0358, "step": 1961 }, { "epoch": 5.017902813299233, "grad_norm": 0.15694921661063782, "learning_rate": 4.500766596175813e-06, "loss": 1.0037, "step": 1962 }, { "epoch": 5.020460358056266, "grad_norm": 0.16268209756361607, "learning_rate": 4.490117780888994e-06, "loss": 1.0191, "step": 1963 }, { "epoch": 5.023017902813299, "grad_norm": 0.13601692002794843, "learning_rate": 4.479477929836039e-06, "loss": 1.0225, "step": 1964 }, { "epoch": 5.025575447570333, "grad_norm": 0.1513485213042126, "learning_rate": 4.4688470603273184e-06, "loss": 0.9987, "step": 1965 }, { "epoch": 5.028132992327365, "grad_norm": 0.14505501997147888, "learning_rate": 4.458225189658598e-06, "loss": 1.0244, "step": 1966 }, { "epoch": 5.030690537084399, "grad_norm": 0.15866972934335427, "learning_rate": 4.447612335110991e-06, "loss": 1.0147, "step": 1967 }, { "epoch": 5.033248081841432, "grad_norm": 0.15717036214065513, "learning_rate": 4.43700851395096e-06, "loss": 1.0056, "step": 1968 }, { "epoch": 5.035805626598465, "grad_norm": 0.15634999112536652, "learning_rate": 4.426413743430241e-06, "loss": 1.0486, "step": 1969 }, { "epoch": 5.038363171355499, "grad_norm": 0.1549586768650421, "learning_rate": 4.415828040785877e-06, "loss": 1.0046, "step": 1970 }, { "epoch": 5.040920716112532, "grad_norm": 0.1643495461245206, "learning_rate": 4.405251423240138e-06, "loss": 1.0158, "step": 1971 }, { "epoch": 5.043478260869565, "grad_norm": 0.14558675280550004, "learning_rate": 4.3946839080005236e-06, "loss": 1.0167, "step": 1972 }, { "epoch": 5.046035805626598, "grad_norm": 0.16057769002475886, "learning_rate": 4.384125512259718e-06, "loss": 1.0412, "step": 1973 }, { "epoch": 5.048593350383632, "grad_norm": 0.1589654545230765, "learning_rate": 4.373576253195568e-06, "loss": 1.0058, "step": 1974 }, { "epoch": 5.051150895140665, "grad_norm": 0.14004326798784272, "learning_rate": 4.363036147971069e-06, "loss": 0.9958, "step": 1975 }, { "epoch": 5.053708439897698, "grad_norm": 0.16704739125788623, "learning_rate": 4.352505213734298e-06, "loss": 1.0202, "step": 1976 }, { "epoch": 5.056265984654732, "grad_norm": 0.15270263482532218, "learning_rate": 4.3419834676184395e-06, "loss": 1.0221, "step": 1977 }, { "epoch": 5.0588235294117645, "grad_norm": 0.15264750560420307, "learning_rate": 4.331470926741707e-06, "loss": 1.0264, "step": 1978 }, { "epoch": 5.061381074168798, "grad_norm": 0.1675831575968936, "learning_rate": 4.320967608207354e-06, "loss": 1.0256, "step": 1979 }, { "epoch": 5.063938618925831, "grad_norm": 0.15506176173449848, "learning_rate": 4.3104735291036214e-06, "loss": 1.0246, "step": 1980 }, { "epoch": 5.0664961636828645, "grad_norm": 0.147438074557832, "learning_rate": 4.299988706503716e-06, "loss": 0.9895, "step": 1981 }, { "epoch": 5.069053708439898, "grad_norm": 0.13712823238173896, "learning_rate": 4.289513157465796e-06, "loss": 1.0069, "step": 1982 }, { "epoch": 5.071611253196931, "grad_norm": 0.1530445973165712, "learning_rate": 4.279046899032918e-06, "loss": 1.028, "step": 1983 }, { "epoch": 5.0741687979539645, "grad_norm": 0.1487111811647309, "learning_rate": 4.268589948233034e-06, "loss": 0.9806, "step": 1984 }, { "epoch": 5.076726342710997, "grad_norm": 0.1536495899212468, "learning_rate": 4.258142322078944e-06, "loss": 1.0141, "step": 1985 }, { "epoch": 5.079283887468031, "grad_norm": 0.1420705753526825, "learning_rate": 4.247704037568289e-06, "loss": 1.0484, "step": 1986 }, { "epoch": 5.081841432225064, "grad_norm": 0.14854933088338998, "learning_rate": 4.237275111683502e-06, "loss": 1.0176, "step": 1987 }, { "epoch": 5.084398976982097, "grad_norm": 0.15085396882702742, "learning_rate": 4.226855561391792e-06, "loss": 1.0241, "step": 1988 }, { "epoch": 5.086956521739131, "grad_norm": 0.13480571166529362, "learning_rate": 4.2164454036451185e-06, "loss": 1.0105, "step": 1989 }, { "epoch": 5.089514066496164, "grad_norm": 0.15439478858765343, "learning_rate": 4.2060446553801585e-06, "loss": 1.0571, "step": 1990 }, { "epoch": 5.092071611253197, "grad_norm": 0.14887589003918353, "learning_rate": 4.195653333518271e-06, "loss": 1.0309, "step": 1991 }, { "epoch": 5.09462915601023, "grad_norm": 0.14823587280930983, "learning_rate": 4.1852714549654985e-06, "loss": 1.0286, "step": 1992 }, { "epoch": 5.0971867007672635, "grad_norm": 0.1502816473196306, "learning_rate": 4.1748990366125005e-06, "loss": 1.0092, "step": 1993 }, { "epoch": 5.099744245524296, "grad_norm": 0.13426636004437947, "learning_rate": 4.164536095334557e-06, "loss": 1.0055, "step": 1994 }, { "epoch": 5.10230179028133, "grad_norm": 0.14869672831898953, "learning_rate": 4.154182647991519e-06, "loss": 1.0492, "step": 1995 }, { "epoch": 5.1048593350383635, "grad_norm": 0.15755018419795028, "learning_rate": 4.143838711427808e-06, "loss": 1.0103, "step": 1996 }, { "epoch": 5.107416879795396, "grad_norm": 0.1503017786383216, "learning_rate": 4.133504302472356e-06, "loss": 1.0015, "step": 1997 }, { "epoch": 5.10997442455243, "grad_norm": 0.14022700208845976, "learning_rate": 4.123179437938596e-06, "loss": 1.0394, "step": 1998 }, { "epoch": 5.112531969309463, "grad_norm": 0.149747082086179, "learning_rate": 4.112864134624447e-06, "loss": 1.0406, "step": 1999 }, { "epoch": 5.115089514066496, "grad_norm": 0.15174138196167658, "learning_rate": 4.102558409312256e-06, "loss": 1.022, "step": 2000 }, { "epoch": 5.117647058823529, "grad_norm": 0.14846170493390945, "learning_rate": 4.092262278768797e-06, "loss": 1.0132, "step": 2001 }, { "epoch": 5.120204603580563, "grad_norm": 0.14541949365283377, "learning_rate": 4.0819757597452246e-06, "loss": 1.0328, "step": 2002 }, { "epoch": 5.122762148337596, "grad_norm": 0.16073985913183766, "learning_rate": 4.0716988689770695e-06, "loss": 1.0067, "step": 2003 }, { "epoch": 5.125319693094629, "grad_norm": 0.14371815787004755, "learning_rate": 4.061431623184188e-06, "loss": 1.0289, "step": 2004 }, { "epoch": 5.127877237851663, "grad_norm": 0.14339076964243316, "learning_rate": 4.051174039070742e-06, "loss": 0.9812, "step": 2005 }, { "epoch": 5.130434782608695, "grad_norm": 0.1437711220903366, "learning_rate": 4.040926133325188e-06, "loss": 1.0059, "step": 2006 }, { "epoch": 5.132992327365729, "grad_norm": 0.1432806446083087, "learning_rate": 4.030687922620223e-06, "loss": 1.0183, "step": 2007 }, { "epoch": 5.135549872122763, "grad_norm": 0.14407049755074497, "learning_rate": 4.020459423612777e-06, "loss": 1.0328, "step": 2008 }, { "epoch": 5.138107416879795, "grad_norm": 0.14311456671607106, "learning_rate": 4.010240652943974e-06, "loss": 1.0247, "step": 2009 }, { "epoch": 5.140664961636829, "grad_norm": 0.14651674275116736, "learning_rate": 4.000031627239123e-06, "loss": 1.0271, "step": 2010 }, { "epoch": 5.143222506393862, "grad_norm": 0.14244659447949104, "learning_rate": 3.989832363107664e-06, "loss": 0.9729, "step": 2011 }, { "epoch": 5.145780051150895, "grad_norm": 0.1474525383109307, "learning_rate": 3.9796428771431625e-06, "loss": 1.0208, "step": 2012 }, { "epoch": 5.148337595907928, "grad_norm": 0.14684653759057748, "learning_rate": 3.96946318592328e-06, "loss": 0.9944, "step": 2013 }, { "epoch": 5.150895140664962, "grad_norm": 0.14793817657477276, "learning_rate": 3.959293306009734e-06, "loss": 1.0606, "step": 2014 }, { "epoch": 5.153452685421995, "grad_norm": 0.13847357302909763, "learning_rate": 3.949133253948284e-06, "loss": 1.0035, "step": 2015 }, { "epoch": 5.156010230179028, "grad_norm": 0.14747847539008258, "learning_rate": 3.938983046268695e-06, "loss": 0.9869, "step": 2016 }, { "epoch": 5.158567774936062, "grad_norm": 0.14511374476416694, "learning_rate": 3.9288426994847285e-06, "loss": 1.0238, "step": 2017 }, { "epoch": 5.161125319693094, "grad_norm": 0.15030414965811079, "learning_rate": 3.918712230094091e-06, "loss": 1.0521, "step": 2018 }, { "epoch": 5.163682864450128, "grad_norm": 0.14420923408617164, "learning_rate": 3.908591654578417e-06, "loss": 0.9878, "step": 2019 }, { "epoch": 5.166240409207161, "grad_norm": 0.1369795797536583, "learning_rate": 3.89848098940326e-06, "loss": 1.0203, "step": 2020 }, { "epoch": 5.168797953964194, "grad_norm": 0.15862135307508646, "learning_rate": 3.888380251018035e-06, "loss": 1.0112, "step": 2021 }, { "epoch": 5.171355498721228, "grad_norm": 0.13968732984433663, "learning_rate": 3.878289455856013e-06, "loss": 1.0589, "step": 2022 }, { "epoch": 5.173913043478261, "grad_norm": 0.14444481777607088, "learning_rate": 3.868208620334282e-06, "loss": 1.0065, "step": 2023 }, { "epoch": 5.176470588235294, "grad_norm": 0.14184611750434217, "learning_rate": 3.858137760853737e-06, "loss": 1.0189, "step": 2024 }, { "epoch": 5.179028132992327, "grad_norm": 0.14923144029216218, "learning_rate": 3.84807689379904e-06, "loss": 1.0052, "step": 2025 }, { "epoch": 5.181585677749361, "grad_norm": 0.15459564247502722, "learning_rate": 3.838026035538581e-06, "loss": 0.9946, "step": 2026 }, { "epoch": 5.1841432225063935, "grad_norm": 0.1418795966374483, "learning_rate": 3.827985202424488e-06, "loss": 1.0234, "step": 2027 }, { "epoch": 5.186700767263427, "grad_norm": 0.1553154903132494, "learning_rate": 3.817954410792565e-06, "loss": 1.0137, "step": 2028 }, { "epoch": 5.189258312020461, "grad_norm": 0.14275503896178632, "learning_rate": 3.8079336769622834e-06, "loss": 1.0289, "step": 2029 }, { "epoch": 5.1918158567774935, "grad_norm": 0.13897565956134958, "learning_rate": 3.7979230172367453e-06, "loss": 1.0148, "step": 2030 }, { "epoch": 5.194373401534527, "grad_norm": 0.14252828284486727, "learning_rate": 3.7879224479026745e-06, "loss": 1.0068, "step": 2031 }, { "epoch": 5.19693094629156, "grad_norm": 0.1517901716492953, "learning_rate": 3.7779319852303766e-06, "loss": 1.0572, "step": 2032 }, { "epoch": 5.1994884910485935, "grad_norm": 0.1439259357160915, "learning_rate": 3.7679516454736977e-06, "loss": 1.0446, "step": 2033 }, { "epoch": 5.202046035805626, "grad_norm": 0.1371345617669485, "learning_rate": 3.757981444870035e-06, "loss": 0.9957, "step": 2034 }, { "epoch": 5.20460358056266, "grad_norm": 0.16004739713130242, "learning_rate": 3.748021399640279e-06, "loss": 1.0276, "step": 2035 }, { "epoch": 5.207161125319693, "grad_norm": 0.1441426514349444, "learning_rate": 3.7380715259888e-06, "loss": 1.0344, "step": 2036 }, { "epoch": 5.209718670076726, "grad_norm": 0.14152534835692054, "learning_rate": 3.7281318401034183e-06, "loss": 0.9949, "step": 2037 }, { "epoch": 5.21227621483376, "grad_norm": 0.1481149663167974, "learning_rate": 3.718202358155384e-06, "loss": 1.0545, "step": 2038 }, { "epoch": 5.2148337595907925, "grad_norm": 0.13716666870403715, "learning_rate": 3.7082830962993497e-06, "loss": 1.0388, "step": 2039 }, { "epoch": 5.217391304347826, "grad_norm": 0.1427599492035968, "learning_rate": 3.6983740706733207e-06, "loss": 0.9945, "step": 2040 }, { "epoch": 5.21994884910486, "grad_norm": 0.14437989757241948, "learning_rate": 3.688475297398674e-06, "loss": 1.037, "step": 2041 }, { "epoch": 5.2225063938618925, "grad_norm": 0.1407689885502161, "learning_rate": 3.6785867925800856e-06, "loss": 1.0019, "step": 2042 }, { "epoch": 5.225063938618926, "grad_norm": 0.1381622930416597, "learning_rate": 3.668708572305546e-06, "loss": 1.0384, "step": 2043 }, { "epoch": 5.227621483375959, "grad_norm": 0.13975927307572164, "learning_rate": 3.658840652646287e-06, "loss": 1.0018, "step": 2044 }, { "epoch": 5.2301790281329925, "grad_norm": 0.15578171256673842, "learning_rate": 3.6489830496568067e-06, "loss": 1.0221, "step": 2045 }, { "epoch": 5.232736572890025, "grad_norm": 0.14587450260403836, "learning_rate": 3.639135779374813e-06, "loss": 1.0462, "step": 2046 }, { "epoch": 5.235294117647059, "grad_norm": 0.14336907869458113, "learning_rate": 3.6292988578211863e-06, "loss": 1.0242, "step": 2047 }, { "epoch": 5.2378516624040925, "grad_norm": 0.13614785911809554, "learning_rate": 3.619472300999992e-06, "loss": 1.002, "step": 2048 }, { "epoch": 5.240409207161125, "grad_norm": 0.14654873047839187, "learning_rate": 3.6096561248984186e-06, "loss": 1.0365, "step": 2049 }, { "epoch": 5.242966751918159, "grad_norm": 0.14832735168435557, "learning_rate": 3.5998503454867807e-06, "loss": 1.0206, "step": 2050 }, { "epoch": 5.245524296675192, "grad_norm": 0.15182549845090051, "learning_rate": 3.5900549787184534e-06, "loss": 1.0086, "step": 2051 }, { "epoch": 5.248081841432225, "grad_norm": 0.15218834374865772, "learning_rate": 3.580270040529894e-06, "loss": 1.0457, "step": 2052 }, { "epoch": 5.250639386189258, "grad_norm": 0.1386445311628316, "learning_rate": 3.570495546840591e-06, "loss": 1.0316, "step": 2053 }, { "epoch": 5.253196930946292, "grad_norm": 0.1415172130548022, "learning_rate": 3.560731513553022e-06, "loss": 1.033, "step": 2054 }, { "epoch": 5.255754475703325, "grad_norm": 0.134688736061587, "learning_rate": 3.5509779565526683e-06, "loss": 1.0341, "step": 2055 }, { "epoch": 5.258312020460358, "grad_norm": 0.14665953403303808, "learning_rate": 3.5412348917079507e-06, "loss": 1.0621, "step": 2056 }, { "epoch": 5.260869565217392, "grad_norm": 0.13619183573807261, "learning_rate": 3.5315023348702325e-06, "loss": 1.0366, "step": 2057 }, { "epoch": 5.263427109974424, "grad_norm": 0.13658849089622857, "learning_rate": 3.521780301873773e-06, "loss": 1.0008, "step": 2058 }, { "epoch": 5.265984654731458, "grad_norm": 0.14630387436677678, "learning_rate": 3.512068808535707e-06, "loss": 1.0147, "step": 2059 }, { "epoch": 5.268542199488491, "grad_norm": 0.13734073999332427, "learning_rate": 3.502367870656035e-06, "loss": 1.028, "step": 2060 }, { "epoch": 5.271099744245524, "grad_norm": 0.1355644028489033, "learning_rate": 3.492677504017573e-06, "loss": 1.0026, "step": 2061 }, { "epoch": 5.273657289002558, "grad_norm": 0.14119902993384847, "learning_rate": 3.4829977243859414e-06, "loss": 1.0093, "step": 2062 }, { "epoch": 5.276214833759591, "grad_norm": 0.14118557253626327, "learning_rate": 3.4733285475095324e-06, "loss": 1.0255, "step": 2063 }, { "epoch": 5.278772378516624, "grad_norm": 0.13630213438701977, "learning_rate": 3.4636699891195e-06, "loss": 1.0176, "step": 2064 }, { "epoch": 5.281329923273657, "grad_norm": 0.1355438862392238, "learning_rate": 3.454022064929711e-06, "loss": 1.0355, "step": 2065 }, { "epoch": 5.283887468030691, "grad_norm": 0.1335405410237401, "learning_rate": 3.4443847906367313e-06, "loss": 0.9999, "step": 2066 }, { "epoch": 5.286445012787723, "grad_norm": 0.13568542243072879, "learning_rate": 3.4347581819198095e-06, "loss": 1.0069, "step": 2067 }, { "epoch": 5.289002557544757, "grad_norm": 0.14279750042804518, "learning_rate": 3.425142254440835e-06, "loss": 1.0316, "step": 2068 }, { "epoch": 5.291560102301791, "grad_norm": 0.1421562223189775, "learning_rate": 3.4155370238443185e-06, "loss": 0.9929, "step": 2069 }, { "epoch": 5.294117647058823, "grad_norm": 0.13090998129388792, "learning_rate": 3.405942505757367e-06, "loss": 1.0235, "step": 2070 }, { "epoch": 5.296675191815857, "grad_norm": 0.1447611334505954, "learning_rate": 3.3963587157896694e-06, "loss": 0.9883, "step": 2071 }, { "epoch": 5.29923273657289, "grad_norm": 0.1486460622906693, "learning_rate": 3.386785669533447e-06, "loss": 1.0614, "step": 2072 }, { "epoch": 5.301790281329923, "grad_norm": 0.13082209863415079, "learning_rate": 3.377223382563446e-06, "loss": 1.019, "step": 2073 }, { "epoch": 5.304347826086957, "grad_norm": 0.14431855838963542, "learning_rate": 3.367671870436915e-06, "loss": 1.0744, "step": 2074 }, { "epoch": 5.30690537084399, "grad_norm": 0.13501366283453947, "learning_rate": 3.358131148693564e-06, "loss": 1.0204, "step": 2075 }, { "epoch": 5.309462915601023, "grad_norm": 0.13647498103708036, "learning_rate": 3.3486012328555505e-06, "loss": 1.0361, "step": 2076 }, { "epoch": 5.312020460358056, "grad_norm": 0.13678423051822214, "learning_rate": 3.33908213842745e-06, "loss": 1.0416, "step": 2077 }, { "epoch": 5.31457800511509, "grad_norm": 0.15117370323671084, "learning_rate": 3.3295738808962388e-06, "loss": 1.0398, "step": 2078 }, { "epoch": 5.3171355498721224, "grad_norm": 0.13218102548293045, "learning_rate": 3.3200764757312555e-06, "loss": 1.0211, "step": 2079 }, { "epoch": 5.319693094629156, "grad_norm": 0.13875158228376064, "learning_rate": 3.310589938384179e-06, "loss": 1.0246, "step": 2080 }, { "epoch": 5.322250639386189, "grad_norm": 0.1390888027343779, "learning_rate": 3.301114284289021e-06, "loss": 1.0228, "step": 2081 }, { "epoch": 5.324808184143222, "grad_norm": 0.14311106791965889, "learning_rate": 3.291649528862074e-06, "loss": 1.0366, "step": 2082 }, { "epoch": 5.327365728900256, "grad_norm": 0.1329482436934704, "learning_rate": 3.2821956875019045e-06, "loss": 0.9983, "step": 2083 }, { "epoch": 5.329923273657289, "grad_norm": 0.1353254341465528, "learning_rate": 3.272752775589316e-06, "loss": 1.0262, "step": 2084 }, { "epoch": 5.332480818414322, "grad_norm": 0.14279181335598803, "learning_rate": 3.2633208084873445e-06, "loss": 1.0214, "step": 2085 }, { "epoch": 5.335038363171355, "grad_norm": 0.14938681808695, "learning_rate": 3.253899801541206e-06, "loss": 1.0458, "step": 2086 }, { "epoch": 5.337595907928389, "grad_norm": 0.13903091402439763, "learning_rate": 3.244489770078286e-06, "loss": 1.0699, "step": 2087 }, { "epoch": 5.340153452685422, "grad_norm": 0.14447995472723943, "learning_rate": 3.2350907294081258e-06, "loss": 0.9936, "step": 2088 }, { "epoch": 5.342710997442455, "grad_norm": 0.14276869094442168, "learning_rate": 3.2257026948223726e-06, "loss": 1.0565, "step": 2089 }, { "epoch": 5.345268542199489, "grad_norm": 0.14335515694613532, "learning_rate": 3.2163256815947674e-06, "loss": 0.9993, "step": 2090 }, { "epoch": 5.3478260869565215, "grad_norm": 0.14665513927933138, "learning_rate": 3.206959704981133e-06, "loss": 1.0555, "step": 2091 }, { "epoch": 5.350383631713555, "grad_norm": 0.1322833527352921, "learning_rate": 3.197604780219323e-06, "loss": 0.9652, "step": 2092 }, { "epoch": 5.352941176470588, "grad_norm": 0.13906561826785738, "learning_rate": 3.188260922529215e-06, "loss": 1.0432, "step": 2093 }, { "epoch": 5.3554987212276215, "grad_norm": 0.14254937224329012, "learning_rate": 3.1789281471126786e-06, "loss": 1.0175, "step": 2094 }, { "epoch": 5.358056265984655, "grad_norm": 0.14911195774932937, "learning_rate": 3.1696064691535634e-06, "loss": 1.0024, "step": 2095 }, { "epoch": 5.360613810741688, "grad_norm": 0.1296333526942248, "learning_rate": 3.1602959038176516e-06, "loss": 1.016, "step": 2096 }, { "epoch": 5.3631713554987215, "grad_norm": 0.14492528039945102, "learning_rate": 3.1509964662526484e-06, "loss": 1.0072, "step": 2097 }, { "epoch": 5.365728900255754, "grad_norm": 0.14261896658846623, "learning_rate": 3.1417081715881623e-06, "loss": 0.997, "step": 2098 }, { "epoch": 5.368286445012788, "grad_norm": 0.15062841301973245, "learning_rate": 3.132431034935667e-06, "loss": 1.0286, "step": 2099 }, { "epoch": 5.370843989769821, "grad_norm": 0.14079332067477582, "learning_rate": 3.1231650713884832e-06, "loss": 1.0331, "step": 2100 }, { "epoch": 5.373401534526854, "grad_norm": 0.13555419460898196, "learning_rate": 3.1139102960217493e-06, "loss": 1.0041, "step": 2101 }, { "epoch": 5.375959079283888, "grad_norm": 0.13880524146849596, "learning_rate": 3.1046667238924155e-06, "loss": 1.0423, "step": 2102 }, { "epoch": 5.378516624040921, "grad_norm": 0.1511402878049476, "learning_rate": 3.0954343700391897e-06, "loss": 1.0349, "step": 2103 }, { "epoch": 5.381074168797954, "grad_norm": 0.14254863702344298, "learning_rate": 3.0862132494825325e-06, "loss": 1.026, "step": 2104 }, { "epoch": 5.383631713554987, "grad_norm": 0.1352194409726658, "learning_rate": 3.0770033772246376e-06, "loss": 0.9938, "step": 2105 }, { "epoch": 5.3861892583120206, "grad_norm": 0.14319029352124846, "learning_rate": 3.067804768249386e-06, "loss": 0.9968, "step": 2106 }, { "epoch": 5.388746803069053, "grad_norm": 0.1348404188548053, "learning_rate": 3.058617437522342e-06, "loss": 1.0166, "step": 2107 }, { "epoch": 5.391304347826087, "grad_norm": 0.14010852729827156, "learning_rate": 3.0494413999907125e-06, "loss": 1.0066, "step": 2108 }, { "epoch": 5.3938618925831205, "grad_norm": 0.1351055036158788, "learning_rate": 3.0402766705833455e-06, "loss": 1.0052, "step": 2109 }, { "epoch": 5.396419437340153, "grad_norm": 0.13186613064153313, "learning_rate": 3.0311232642106768e-06, "loss": 0.9969, "step": 2110 }, { "epoch": 5.398976982097187, "grad_norm": 0.1408809630359071, "learning_rate": 3.021981195764726e-06, "loss": 1.0283, "step": 2111 }, { "epoch": 5.40153452685422, "grad_norm": 0.12965889759923607, "learning_rate": 3.0128504801190716e-06, "loss": 1.0179, "step": 2112 }, { "epoch": 5.404092071611253, "grad_norm": 0.13945206906826596, "learning_rate": 3.003731132128811e-06, "loss": 1.0099, "step": 2113 }, { "epoch": 5.406649616368286, "grad_norm": 0.1400549514773388, "learning_rate": 2.9946231666305627e-06, "loss": 0.998, "step": 2114 }, { "epoch": 5.40920716112532, "grad_norm": 0.13519306803227119, "learning_rate": 2.9855265984424042e-06, "loss": 1.0069, "step": 2115 }, { "epoch": 5.411764705882353, "grad_norm": 0.12988356378358373, "learning_rate": 2.976441442363893e-06, "loss": 0.9928, "step": 2116 }, { "epoch": 5.414322250639386, "grad_norm": 0.13225437647406532, "learning_rate": 2.967367713176007e-06, "loss": 1.0082, "step": 2117 }, { "epoch": 5.41687979539642, "grad_norm": 0.13453763452834291, "learning_rate": 2.9583054256411326e-06, "loss": 0.9779, "step": 2118 }, { "epoch": 5.419437340153452, "grad_norm": 0.13933174777230192, "learning_rate": 2.9492545945030517e-06, "loss": 0.9947, "step": 2119 }, { "epoch": 5.421994884910486, "grad_norm": 0.13265772100907866, "learning_rate": 2.940215234486894e-06, "loss": 1.0304, "step": 2120 }, { "epoch": 5.42455242966752, "grad_norm": 0.13461066684644984, "learning_rate": 2.9311873602991435e-06, "loss": 1.0265, "step": 2121 }, { "epoch": 5.427109974424552, "grad_norm": 0.13302962430701365, "learning_rate": 2.922170986627573e-06, "loss": 0.9907, "step": 2122 }, { "epoch": 5.429667519181586, "grad_norm": 0.1372156107097446, "learning_rate": 2.913166128141265e-06, "loss": 1.0362, "step": 2123 }, { "epoch": 5.432225063938619, "grad_norm": 0.13526418969755188, "learning_rate": 2.9041727994905686e-06, "loss": 1.0335, "step": 2124 }, { "epoch": 5.434782608695652, "grad_norm": 0.14056788233892892, "learning_rate": 2.895191015307055e-06, "loss": 0.9863, "step": 2125 }, { "epoch": 5.437340153452685, "grad_norm": 0.13830914570568487, "learning_rate": 2.8862207902035334e-06, "loss": 1.0279, "step": 2126 }, { "epoch": 5.439897698209719, "grad_norm": 0.13255464251905436, "learning_rate": 2.877262138773994e-06, "loss": 1.0074, "step": 2127 }, { "epoch": 5.442455242966752, "grad_norm": 0.13094809127879986, "learning_rate": 2.8683150755936107e-06, "loss": 1.0007, "step": 2128 }, { "epoch": 5.445012787723785, "grad_norm": 0.13969902391137623, "learning_rate": 2.859379615218685e-06, "loss": 1.0183, "step": 2129 }, { "epoch": 5.447570332480819, "grad_norm": 0.13298200813066383, "learning_rate": 2.850455772186658e-06, "loss": 1.0553, "step": 2130 }, { "epoch": 5.450127877237851, "grad_norm": 0.13752465215056384, "learning_rate": 2.8415435610160667e-06, "loss": 1.0029, "step": 2131 }, { "epoch": 5.452685421994885, "grad_norm": 0.13776730476333435, "learning_rate": 2.8326429962065184e-06, "loss": 1.0591, "step": 2132 }, { "epoch": 5.455242966751918, "grad_norm": 0.15290697841832607, "learning_rate": 2.8237540922386764e-06, "loss": 1.0234, "step": 2133 }, { "epoch": 5.457800511508951, "grad_norm": 0.1435647245473299, "learning_rate": 2.8148768635742286e-06, "loss": 1.0408, "step": 2134 }, { "epoch": 5.460358056265985, "grad_norm": 0.1348972282036283, "learning_rate": 2.8060113246558783e-06, "loss": 1.0582, "step": 2135 }, { "epoch": 5.462915601023018, "grad_norm": 0.14312694503231538, "learning_rate": 2.7971574899072938e-06, "loss": 1.0557, "step": 2136 }, { "epoch": 5.465473145780051, "grad_norm": 0.14626596664710145, "learning_rate": 2.7883153737331136e-06, "loss": 1.0213, "step": 2137 }, { "epoch": 5.468030690537084, "grad_norm": 0.12723321182479033, "learning_rate": 2.7794849905189138e-06, "loss": 1.0258, "step": 2138 }, { "epoch": 5.470588235294118, "grad_norm": 0.1297835067922189, "learning_rate": 2.7706663546311705e-06, "loss": 0.9791, "step": 2139 }, { "epoch": 5.4731457800511505, "grad_norm": 0.14065052834912603, "learning_rate": 2.761859480417255e-06, "loss": 1.0364, "step": 2140 }, { "epoch": 5.475703324808184, "grad_norm": 0.14903101964341123, "learning_rate": 2.753064382205396e-06, "loss": 1.046, "step": 2141 }, { "epoch": 5.478260869565218, "grad_norm": 0.12884063957129957, "learning_rate": 2.7442810743046742e-06, "loss": 1.0377, "step": 2142 }, { "epoch": 5.4808184143222505, "grad_norm": 0.13327063753076238, "learning_rate": 2.735509571004982e-06, "loss": 1.0095, "step": 2143 }, { "epoch": 5.483375959079284, "grad_norm": 0.1571390786677921, "learning_rate": 2.7267498865770005e-06, "loss": 0.9769, "step": 2144 }, { "epoch": 5.485933503836317, "grad_norm": 0.1320156220064998, "learning_rate": 2.718002035272197e-06, "loss": 1.0057, "step": 2145 }, { "epoch": 5.4884910485933505, "grad_norm": 0.1360636747597633, "learning_rate": 2.7092660313227748e-06, "loss": 1.0064, "step": 2146 }, { "epoch": 5.491048593350383, "grad_norm": 0.13394654726028757, "learning_rate": 2.700541888941667e-06, "loss": 1.0025, "step": 2147 }, { "epoch": 5.493606138107417, "grad_norm": 0.1460012982176339, "learning_rate": 2.6918296223225026e-06, "loss": 1.0227, "step": 2148 }, { "epoch": 5.4961636828644505, "grad_norm": 0.13049152775591077, "learning_rate": 2.683129245639603e-06, "loss": 1.0393, "step": 2149 }, { "epoch": 5.498721227621483, "grad_norm": 0.15254103744247385, "learning_rate": 2.6744407730479325e-06, "loss": 1.0279, "step": 2150 }, { "epoch": 5.501278772378517, "grad_norm": 0.1440023793657765, "learning_rate": 2.66576421868309e-06, "loss": 1.0295, "step": 2151 }, { "epoch": 5.5038363171355495, "grad_norm": 0.13606809517331622, "learning_rate": 2.6570995966612945e-06, "loss": 1.0299, "step": 2152 }, { "epoch": 5.506393861892583, "grad_norm": 0.13926181662872325, "learning_rate": 2.6484469210793384e-06, "loss": 1.037, "step": 2153 }, { "epoch": 5.508951406649617, "grad_norm": 0.14473456019169403, "learning_rate": 2.6398062060145867e-06, "loss": 1.017, "step": 2154 }, { "epoch": 5.5115089514066495, "grad_norm": 0.13272081994045937, "learning_rate": 2.631177465524938e-06, "loss": 1.0217, "step": 2155 }, { "epoch": 5.514066496163683, "grad_norm": 0.14026203110310534, "learning_rate": 2.6225607136488194e-06, "loss": 1.0021, "step": 2156 }, { "epoch": 5.516624040920716, "grad_norm": 0.13205919977316974, "learning_rate": 2.613955964405146e-06, "loss": 1.052, "step": 2157 }, { "epoch": 5.5191815856777495, "grad_norm": 0.13360379756882199, "learning_rate": 2.605363231793302e-06, "loss": 1.0499, "step": 2158 }, { "epoch": 5.521739130434782, "grad_norm": 0.14208435941220482, "learning_rate": 2.5967825297931328e-06, "loss": 1.0172, "step": 2159 }, { "epoch": 5.524296675191816, "grad_norm": 0.13295870010362018, "learning_rate": 2.5882138723649018e-06, "loss": 1.0334, "step": 2160 }, { "epoch": 5.526854219948849, "grad_norm": 0.12489034371588933, "learning_rate": 2.5796572734492777e-06, "loss": 1.0103, "step": 2161 }, { "epoch": 5.529411764705882, "grad_norm": 0.13244599397256537, "learning_rate": 2.571112746967309e-06, "loss": 1.0218, "step": 2162 }, { "epoch": 5.531969309462916, "grad_norm": 0.15003256070846932, "learning_rate": 2.5625803068204126e-06, "loss": 1.0759, "step": 2163 }, { "epoch": 5.534526854219949, "grad_norm": 0.1356632599292978, "learning_rate": 2.554059966890332e-06, "loss": 1.0042, "step": 2164 }, { "epoch": 5.537084398976982, "grad_norm": 0.15088785982749878, "learning_rate": 2.545551741039125e-06, "loss": 1.0084, "step": 2165 }, { "epoch": 5.539641943734015, "grad_norm": 0.13549191741444538, "learning_rate": 2.5370556431091486e-06, "loss": 1.0447, "step": 2166 }, { "epoch": 5.542199488491049, "grad_norm": 0.1345097927774657, "learning_rate": 2.5285716869230192e-06, "loss": 1.0352, "step": 2167 }, { "epoch": 5.544757033248082, "grad_norm": 0.1377603438588639, "learning_rate": 2.5200998862836044e-06, "loss": 1.0456, "step": 2168 }, { "epoch": 5.547314578005115, "grad_norm": 0.13719837282442893, "learning_rate": 2.5116402549739904e-06, "loss": 1.0111, "step": 2169 }, { "epoch": 5.549872122762149, "grad_norm": 0.12784774794791698, "learning_rate": 2.503192806757474e-06, "loss": 1.0555, "step": 2170 }, { "epoch": 5.552429667519181, "grad_norm": 0.1377625979101254, "learning_rate": 2.494757555377524e-06, "loss": 1.0217, "step": 2171 }, { "epoch": 5.554987212276215, "grad_norm": 0.13849942681054245, "learning_rate": 2.486334514557761e-06, "loss": 1.0175, "step": 2172 }, { "epoch": 5.557544757033249, "grad_norm": 0.14070221787371265, "learning_rate": 2.477923698001955e-06, "loss": 1.03, "step": 2173 }, { "epoch": 5.560102301790281, "grad_norm": 0.12917105115289923, "learning_rate": 2.469525119393974e-06, "loss": 1.0316, "step": 2174 }, { "epoch": 5.562659846547315, "grad_norm": 0.14393204543904917, "learning_rate": 2.461138792397779e-06, "loss": 1.0429, "step": 2175 }, { "epoch": 5.565217391304348, "grad_norm": 0.1350830986575781, "learning_rate": 2.4527647306574e-06, "loss": 1.0005, "step": 2176 }, { "epoch": 5.567774936061381, "grad_norm": 0.1272869817285887, "learning_rate": 2.4444029477969157e-06, "loss": 1.0083, "step": 2177 }, { "epoch": 5.570332480818414, "grad_norm": 0.1329875176980315, "learning_rate": 2.4360534574204196e-06, "loss": 1.0064, "step": 2178 }, { "epoch": 5.572890025575448, "grad_norm": 0.13284521850316935, "learning_rate": 2.427716273112011e-06, "loss": 1.026, "step": 2179 }, { "epoch": 5.57544757033248, "grad_norm": 0.13655729094534802, "learning_rate": 2.4193914084357708e-06, "loss": 1.0311, "step": 2180 }, { "epoch": 5.578005115089514, "grad_norm": 0.13249886049800538, "learning_rate": 2.4110788769357305e-06, "loss": 1.0245, "step": 2181 }, { "epoch": 5.580562659846548, "grad_norm": 0.14032611666517894, "learning_rate": 2.402778692135861e-06, "loss": 1.0218, "step": 2182 }, { "epoch": 5.58312020460358, "grad_norm": 0.13366091002172387, "learning_rate": 2.394490867540039e-06, "loss": 1.0275, "step": 2183 }, { "epoch": 5.585677749360614, "grad_norm": 0.13700684117392312, "learning_rate": 2.3862154166320417e-06, "loss": 1.0055, "step": 2184 }, { "epoch": 5.588235294117647, "grad_norm": 0.13884798487973146, "learning_rate": 2.3779523528755143e-06, "loss": 1.0298, "step": 2185 }, { "epoch": 5.59079283887468, "grad_norm": 0.14068128211510497, "learning_rate": 2.3697016897139345e-06, "loss": 1.0568, "step": 2186 }, { "epoch": 5.593350383631714, "grad_norm": 0.1367538445761975, "learning_rate": 2.361463440570623e-06, "loss": 1.0211, "step": 2187 }, { "epoch": 5.595907928388747, "grad_norm": 0.137882423029852, "learning_rate": 2.353237618848695e-06, "loss": 1.0388, "step": 2188 }, { "epoch": 5.59846547314578, "grad_norm": 0.13627762962811446, "learning_rate": 2.3450242379310427e-06, "loss": 1.0423, "step": 2189 }, { "epoch": 5.601023017902813, "grad_norm": 0.13080557028764447, "learning_rate": 2.3368233111803305e-06, "loss": 1.0209, "step": 2190 }, { "epoch": 5.603580562659847, "grad_norm": 0.13373365809565754, "learning_rate": 2.328634851938949e-06, "loss": 1.0548, "step": 2191 }, { "epoch": 5.6061381074168795, "grad_norm": 0.14670903806258018, "learning_rate": 2.3204588735290155e-06, "loss": 1.0283, "step": 2192 }, { "epoch": 5.608695652173913, "grad_norm": 0.1351316953465856, "learning_rate": 2.312295389252326e-06, "loss": 1.0253, "step": 2193 }, { "epoch": 5.611253196930946, "grad_norm": 0.14536763822784776, "learning_rate": 2.304144412390367e-06, "loss": 1.0289, "step": 2194 }, { "epoch": 5.6138107416879794, "grad_norm": 0.1373151541315976, "learning_rate": 2.2960059562042647e-06, "loss": 1.0227, "step": 2195 }, { "epoch": 5.616368286445013, "grad_norm": 0.12983515898716327, "learning_rate": 2.2878800339347763e-06, "loss": 1.0256, "step": 2196 }, { "epoch": 5.618925831202046, "grad_norm": 0.12825544867685706, "learning_rate": 2.279766658802275e-06, "loss": 1.0468, "step": 2197 }, { "epoch": 5.621483375959079, "grad_norm": 0.14977773117762613, "learning_rate": 2.2716658440067085e-06, "loss": 1.0045, "step": 2198 }, { "epoch": 5.624040920716112, "grad_norm": 0.163815240244753, "learning_rate": 2.2635776027276056e-06, "loss": 1.0211, "step": 2199 }, { "epoch": 5.626598465473146, "grad_norm": 0.1311668589781632, "learning_rate": 2.255501948124017e-06, "loss": 1.0318, "step": 2200 }, { "epoch": 5.629156010230179, "grad_norm": 0.13085196604157895, "learning_rate": 2.247438893334537e-06, "loss": 1.0219, "step": 2201 }, { "epoch": 5.631713554987212, "grad_norm": 0.1273903714267332, "learning_rate": 2.2393884514772457e-06, "loss": 0.9929, "step": 2202 }, { "epoch": 5.634271099744246, "grad_norm": 0.13914897324377146, "learning_rate": 2.231350635649713e-06, "loss": 1.0452, "step": 2203 }, { "epoch": 5.6368286445012785, "grad_norm": 0.13636448611829766, "learning_rate": 2.223325458928961e-06, "loss": 1.0078, "step": 2204 }, { "epoch": 5.639386189258312, "grad_norm": 0.13875063448351502, "learning_rate": 2.2153129343714484e-06, "loss": 1.044, "step": 2205 }, { "epoch": 5.641943734015345, "grad_norm": 0.1268762090418032, "learning_rate": 2.207313075013059e-06, "loss": 1.021, "step": 2206 }, { "epoch": 5.6445012787723785, "grad_norm": 0.14115564139986136, "learning_rate": 2.1993258938690533e-06, "loss": 0.9935, "step": 2207 }, { "epoch": 5.647058823529412, "grad_norm": 0.13114159318824248, "learning_rate": 2.191351403934082e-06, "loss": 1.0314, "step": 2208 }, { "epoch": 5.649616368286445, "grad_norm": 0.12884976286632582, "learning_rate": 2.183389618182139e-06, "loss": 1.0046, "step": 2209 }, { "epoch": 5.6521739130434785, "grad_norm": 0.12995582182420992, "learning_rate": 2.1754405495665553e-06, "loss": 1.0373, "step": 2210 }, { "epoch": 5.654731457800511, "grad_norm": 0.13421458626767693, "learning_rate": 2.1675042110199664e-06, "loss": 1.016, "step": 2211 }, { "epoch": 5.657289002557545, "grad_norm": 0.13511795554454278, "learning_rate": 2.1595806154542965e-06, "loss": 1.0203, "step": 2212 }, { "epoch": 5.659846547314578, "grad_norm": 0.12526718028345482, "learning_rate": 2.1516697757607464e-06, "loss": 1.048, "step": 2213 }, { "epoch": 5.662404092071611, "grad_norm": 0.13609131915375153, "learning_rate": 2.143771704809753e-06, "loss": 1.0221, "step": 2214 }, { "epoch": 5.664961636828645, "grad_norm": 0.13389453092548842, "learning_rate": 2.1358864154509838e-06, "loss": 0.995, "step": 2215 }, { "epoch": 5.667519181585678, "grad_norm": 0.13120531247951384, "learning_rate": 2.128013920513311e-06, "loss": 1.002, "step": 2216 }, { "epoch": 5.670076726342711, "grad_norm": 0.12595917765897047, "learning_rate": 2.1201542328047965e-06, "loss": 1.0307, "step": 2217 }, { "epoch": 5.672634271099744, "grad_norm": 0.1327291524503786, "learning_rate": 2.112307365112657e-06, "loss": 1.0042, "step": 2218 }, { "epoch": 5.675191815856778, "grad_norm": 0.14073038841763177, "learning_rate": 2.1044733302032527e-06, "loss": 1.0089, "step": 2219 }, { "epoch": 5.677749360613811, "grad_norm": 0.13145348857222067, "learning_rate": 2.0966521408220753e-06, "loss": 1.0191, "step": 2220 }, { "epoch": 5.680306905370844, "grad_norm": 0.13758179967194598, "learning_rate": 2.088843809693708e-06, "loss": 1.0389, "step": 2221 }, { "epoch": 5.6828644501278776, "grad_norm": 0.12934306601192186, "learning_rate": 2.081048349521814e-06, "loss": 1.0386, "step": 2222 }, { "epoch": 5.68542199488491, "grad_norm": 0.12132994106171455, "learning_rate": 2.0732657729891236e-06, "loss": 1.0237, "step": 2223 }, { "epoch": 5.687979539641944, "grad_norm": 0.12639844337210293, "learning_rate": 2.065496092757403e-06, "loss": 1.0039, "step": 2224 }, { "epoch": 5.690537084398977, "grad_norm": 0.1397408236378054, "learning_rate": 2.0577393214674335e-06, "loss": 1.0782, "step": 2225 }, { "epoch": 5.69309462915601, "grad_norm": 0.12975569414651789, "learning_rate": 2.049995471738995e-06, "loss": 1.029, "step": 2226 }, { "epoch": 5.695652173913043, "grad_norm": 0.13025784101096557, "learning_rate": 2.042264556170853e-06, "loss": 0.9846, "step": 2227 }, { "epoch": 5.698209718670077, "grad_norm": 0.1282941793591346, "learning_rate": 2.034546587340719e-06, "loss": 1.0143, "step": 2228 }, { "epoch": 5.70076726342711, "grad_norm": 0.13236558983338137, "learning_rate": 2.026841577805245e-06, "loss": 1.0534, "step": 2229 }, { "epoch": 5.703324808184143, "grad_norm": 0.13423342295188723, "learning_rate": 2.019149540100005e-06, "loss": 1.0568, "step": 2230 }, { "epoch": 5.705882352941177, "grad_norm": 0.13468947441049006, "learning_rate": 2.0114704867394598e-06, "loss": 1.014, "step": 2231 }, { "epoch": 5.708439897698209, "grad_norm": 0.13388666927274886, "learning_rate": 2.0038044302169492e-06, "loss": 1.0246, "step": 2232 }, { "epoch": 5.710997442455243, "grad_norm": 0.13458582769078975, "learning_rate": 1.9961513830046663e-06, "loss": 1.0335, "step": 2233 }, { "epoch": 5.713554987212277, "grad_norm": 0.1334530516759338, "learning_rate": 1.988511357553644e-06, "loss": 1.0107, "step": 2234 }, { "epoch": 5.716112531969309, "grad_norm": 0.13432155143391286, "learning_rate": 1.980884366293725e-06, "loss": 1.002, "step": 2235 }, { "epoch": 5.718670076726343, "grad_norm": 0.1321302038455819, "learning_rate": 1.973270421633543e-06, "loss": 1.0281, "step": 2236 }, { "epoch": 5.721227621483376, "grad_norm": 0.13482083547904436, "learning_rate": 1.965669535960516e-06, "loss": 1.0032, "step": 2237 }, { "epoch": 5.723785166240409, "grad_norm": 0.1362582011621695, "learning_rate": 1.9580817216408075e-06, "loss": 1.0151, "step": 2238 }, { "epoch": 5.726342710997442, "grad_norm": 0.13381683599607858, "learning_rate": 1.9505069910193164e-06, "loss": 0.9876, "step": 2239 }, { "epoch": 5.728900255754476, "grad_norm": 0.12202356902109507, "learning_rate": 1.9429453564196543e-06, "loss": 1.0203, "step": 2240 }, { "epoch": 5.731457800511509, "grad_norm": 0.12193705736628206, "learning_rate": 1.9353968301441306e-06, "loss": 0.9752, "step": 2241 }, { "epoch": 5.734015345268542, "grad_norm": 0.1264989543927549, "learning_rate": 1.927861424473726e-06, "loss": 1.025, "step": 2242 }, { "epoch": 5.736572890025576, "grad_norm": 0.14123473229613026, "learning_rate": 1.920339151668069e-06, "loss": 1.0125, "step": 2243 }, { "epoch": 5.739130434782608, "grad_norm": 0.12538976213285152, "learning_rate": 1.9128300239654353e-06, "loss": 1.0103, "step": 2244 }, { "epoch": 5.741687979539642, "grad_norm": 0.12777815103030538, "learning_rate": 1.9053340535827004e-06, "loss": 1.0365, "step": 2245 }, { "epoch": 5.744245524296675, "grad_norm": 0.9983046758036718, "learning_rate": 1.8978512527153414e-06, "loss": 1.0208, "step": 2246 }, { "epoch": 5.746803069053708, "grad_norm": 0.13869698166830857, "learning_rate": 1.8903816335374048e-06, "loss": 1.0092, "step": 2247 }, { "epoch": 5.749360613810742, "grad_norm": 0.13909895674572456, "learning_rate": 1.882925208201498e-06, "loss": 0.9976, "step": 2248 }, { "epoch": 5.751918158567775, "grad_norm": 0.13223029843900272, "learning_rate": 1.8754819888387576e-06, "loss": 1.0226, "step": 2249 }, { "epoch": 5.754475703324808, "grad_norm": 0.1355611449623982, "learning_rate": 1.868051987558832e-06, "loss": 1.0547, "step": 2250 }, { "epoch": 5.757033248081841, "grad_norm": 0.1335592612771471, "learning_rate": 1.8606352164498754e-06, "loss": 1.022, "step": 2251 }, { "epoch": 5.759590792838875, "grad_norm": 0.13517321815446315, "learning_rate": 1.8532316875785084e-06, "loss": 1.059, "step": 2252 }, { "epoch": 5.762148337595908, "grad_norm": 0.12900109188000092, "learning_rate": 1.8458414129898072e-06, "loss": 1.0121, "step": 2253 }, { "epoch": 5.764705882352941, "grad_norm": 0.13164593690766663, "learning_rate": 1.8384644047072864e-06, "loss": 1.0363, "step": 2254 }, { "epoch": 5.767263427109975, "grad_norm": 0.12836234729861262, "learning_rate": 1.8311006747328775e-06, "loss": 1.0342, "step": 2255 }, { "epoch": 5.7698209718670075, "grad_norm": 0.13352486032417052, "learning_rate": 1.8237502350469161e-06, "loss": 1.028, "step": 2256 }, { "epoch": 5.772378516624041, "grad_norm": 0.12666547237956713, "learning_rate": 1.8164130976080962e-06, "loss": 0.9998, "step": 2257 }, { "epoch": 5.774936061381074, "grad_norm": 0.12597408036958038, "learning_rate": 1.8090892743534904e-06, "loss": 0.9861, "step": 2258 }, { "epoch": 5.7774936061381075, "grad_norm": 0.13091969265184827, "learning_rate": 1.8017787771984973e-06, "loss": 1.0196, "step": 2259 }, { "epoch": 5.78005115089514, "grad_norm": 0.1328229090332335, "learning_rate": 1.7944816180368408e-06, "loss": 1.0422, "step": 2260 }, { "epoch": 5.782608695652174, "grad_norm": 0.12677176745235394, "learning_rate": 1.7871978087405384e-06, "loss": 1.0097, "step": 2261 }, { "epoch": 5.7851662404092075, "grad_norm": 0.12437893059639113, "learning_rate": 1.7799273611598943e-06, "loss": 1.0121, "step": 2262 }, { "epoch": 5.78772378516624, "grad_norm": 0.1251367564202301, "learning_rate": 1.772670287123479e-06, "loss": 0.9939, "step": 2263 }, { "epoch": 5.790281329923274, "grad_norm": 0.1302978820127013, "learning_rate": 1.765426598438088e-06, "loss": 1.0377, "step": 2264 }, { "epoch": 5.792838874680307, "grad_norm": 0.12296911765019702, "learning_rate": 1.7581963068887554e-06, "loss": 1.0082, "step": 2265 }, { "epoch": 5.79539641943734, "grad_norm": 0.1310292740348814, "learning_rate": 1.7509794242387135e-06, "loss": 1.0455, "step": 2266 }, { "epoch": 5.797953964194374, "grad_norm": 0.11962773068304663, "learning_rate": 1.7437759622293771e-06, "loss": 1.0301, "step": 2267 }, { "epoch": 5.8005115089514065, "grad_norm": 0.1338997971252641, "learning_rate": 1.7365859325803269e-06, "loss": 1.028, "step": 2268 }, { "epoch": 5.80306905370844, "grad_norm": 0.12161266269112997, "learning_rate": 1.7294093469892948e-06, "loss": 1.0253, "step": 2269 }, { "epoch": 5.805626598465473, "grad_norm": 0.12194546591797659, "learning_rate": 1.7222462171321397e-06, "loss": 1.0112, "step": 2270 }, { "epoch": 5.8081841432225065, "grad_norm": 0.12690399558973253, "learning_rate": 1.7150965546628184e-06, "loss": 1.0168, "step": 2271 }, { "epoch": 5.810741687979539, "grad_norm": 0.1329159422591136, "learning_rate": 1.7079603712133908e-06, "loss": 0.9867, "step": 2272 }, { "epoch": 5.813299232736573, "grad_norm": 0.12116530026113131, "learning_rate": 1.7008376783939772e-06, "loss": 1.0085, "step": 2273 }, { "epoch": 5.8158567774936065, "grad_norm": 0.12935715986878404, "learning_rate": 1.6937284877927596e-06, "loss": 1.0162, "step": 2274 }, { "epoch": 5.818414322250639, "grad_norm": 0.12690629229315065, "learning_rate": 1.6866328109759377e-06, "loss": 0.9794, "step": 2275 }, { "epoch": 5.820971867007673, "grad_norm": 0.12407793133570494, "learning_rate": 1.6795506594877388e-06, "loss": 1.031, "step": 2276 }, { "epoch": 5.823529411764706, "grad_norm": 0.12704984040936246, "learning_rate": 1.6724820448503852e-06, "loss": 1.0204, "step": 2277 }, { "epoch": 5.826086956521739, "grad_norm": 0.13001027110393584, "learning_rate": 1.6654269785640608e-06, "loss": 1.0448, "step": 2278 }, { "epoch": 5.828644501278772, "grad_norm": 0.11915860756194478, "learning_rate": 1.658385472106926e-06, "loss": 1.0146, "step": 2279 }, { "epoch": 5.831202046035806, "grad_norm": 0.12897358959587038, "learning_rate": 1.6513575369350654e-06, "loss": 1.021, "step": 2280 }, { "epoch": 5.833759590792839, "grad_norm": 0.13505425066582885, "learning_rate": 1.6443431844824975e-06, "loss": 1.0002, "step": 2281 }, { "epoch": 5.836317135549872, "grad_norm": 0.12555260697675938, "learning_rate": 1.637342426161126e-06, "loss": 1.0013, "step": 2282 }, { "epoch": 5.838874680306906, "grad_norm": 0.1276721077986895, "learning_rate": 1.630355273360752e-06, "loss": 1.0083, "step": 2283 }, { "epoch": 5.841432225063938, "grad_norm": 0.12628248303483217, "learning_rate": 1.623381737449038e-06, "loss": 1.0495, "step": 2284 }, { "epoch": 5.843989769820972, "grad_norm": 0.13396531513865312, "learning_rate": 1.6164218297714884e-06, "loss": 0.9778, "step": 2285 }, { "epoch": 5.846547314578006, "grad_norm": 0.13405119018709796, "learning_rate": 1.609475561651438e-06, "loss": 0.9882, "step": 2286 }, { "epoch": 5.849104859335038, "grad_norm": 0.11946775190358987, "learning_rate": 1.6025429443900286e-06, "loss": 1.0402, "step": 2287 }, { "epoch": 5.851662404092072, "grad_norm": 0.1286546110791319, "learning_rate": 1.5956239892661995e-06, "loss": 1.0323, "step": 2288 }, { "epoch": 5.854219948849105, "grad_norm": 0.12706067523411144, "learning_rate": 1.588718707536656e-06, "loss": 1.0153, "step": 2289 }, { "epoch": 5.856777493606138, "grad_norm": 0.12632255275977317, "learning_rate": 1.5818271104358574e-06, "loss": 1.0359, "step": 2290 }, { "epoch": 5.859335038363171, "grad_norm": 0.12022429130741803, "learning_rate": 1.5749492091760054e-06, "loss": 1.0272, "step": 2291 }, { "epoch": 5.861892583120205, "grad_norm": 0.12754203390815988, "learning_rate": 1.5680850149470139e-06, "loss": 1.0141, "step": 2292 }, { "epoch": 5.864450127877237, "grad_norm": 0.12789955923845803, "learning_rate": 1.5612345389164974e-06, "loss": 1.0213, "step": 2293 }, { "epoch": 5.867007672634271, "grad_norm": 0.13105545311215508, "learning_rate": 1.5543977922297494e-06, "loss": 1.0203, "step": 2294 }, { "epoch": 5.869565217391305, "grad_norm": 0.12692375648838364, "learning_rate": 1.5475747860097335e-06, "loss": 1.0175, "step": 2295 }, { "epoch": 5.872122762148337, "grad_norm": 0.12758413074272634, "learning_rate": 1.5407655313570525e-06, "loss": 1.0187, "step": 2296 }, { "epoch": 5.874680306905371, "grad_norm": 0.1347266986438743, "learning_rate": 1.5339700393499357e-06, "loss": 0.978, "step": 2297 }, { "epoch": 5.877237851662404, "grad_norm": 0.1286412634763229, "learning_rate": 1.5271883210442285e-06, "loss": 1.0243, "step": 2298 }, { "epoch": 5.879795396419437, "grad_norm": 0.13598473504010955, "learning_rate": 1.5204203874733604e-06, "loss": 1.0458, "step": 2299 }, { "epoch": 5.882352941176471, "grad_norm": 0.12217909066335947, "learning_rate": 1.5136662496483346e-06, "loss": 1.0159, "step": 2300 }, { "epoch": 5.884910485933504, "grad_norm": 0.13697298325476193, "learning_rate": 1.5069259185577112e-06, "loss": 1.0234, "step": 2301 }, { "epoch": 5.887468030690537, "grad_norm": 0.12856950834935316, "learning_rate": 1.5001994051675894e-06, "loss": 1.0005, "step": 2302 }, { "epoch": 5.89002557544757, "grad_norm": 0.12272037964597306, "learning_rate": 1.4934867204215864e-06, "loss": 1.0182, "step": 2303 }, { "epoch": 5.892583120204604, "grad_norm": 0.12396363368680077, "learning_rate": 1.486787875240816e-06, "loss": 1.0023, "step": 2304 }, { "epoch": 5.8951406649616365, "grad_norm": 0.12822276354353365, "learning_rate": 1.480102880523886e-06, "loss": 1.0114, "step": 2305 }, { "epoch": 5.89769820971867, "grad_norm": 0.12823957750976692, "learning_rate": 1.4734317471468618e-06, "loss": 1.0279, "step": 2306 }, { "epoch": 5.900255754475703, "grad_norm": 0.12481205791568802, "learning_rate": 1.4667744859632615e-06, "loss": 0.9748, "step": 2307 }, { "epoch": 5.9028132992327365, "grad_norm": 0.12376259417000356, "learning_rate": 1.4601311078040304e-06, "loss": 1.0291, "step": 2308 }, { "epoch": 5.90537084398977, "grad_norm": 0.12039082706987389, "learning_rate": 1.4535016234775324e-06, "loss": 0.9835, "step": 2309 }, { "epoch": 5.907928388746803, "grad_norm": 0.1278580324817726, "learning_rate": 1.4468860437695243e-06, "loss": 1.0276, "step": 2310 }, { "epoch": 5.910485933503836, "grad_norm": 0.12971723157693313, "learning_rate": 1.4402843794431354e-06, "loss": 1.0085, "step": 2311 }, { "epoch": 5.913043478260869, "grad_norm": 0.12766208083651814, "learning_rate": 1.4336966412388674e-06, "loss": 1.0392, "step": 2312 }, { "epoch": 5.915601023017903, "grad_norm": 0.12363722996422528, "learning_rate": 1.4271228398745552e-06, "loss": 1.0063, "step": 2313 }, { "epoch": 5.918158567774936, "grad_norm": 0.12491762028888559, "learning_rate": 1.4205629860453641e-06, "loss": 1.0598, "step": 2314 }, { "epoch": 5.920716112531969, "grad_norm": 0.12614418988739717, "learning_rate": 1.4140170904237616e-06, "loss": 1.0078, "step": 2315 }, { "epoch": 5.923273657289003, "grad_norm": 0.12871200444350614, "learning_rate": 1.4074851636595165e-06, "loss": 0.9912, "step": 2316 }, { "epoch": 5.9258312020460355, "grad_norm": 0.12176341068010405, "learning_rate": 1.400967216379663e-06, "loss": 1.0023, "step": 2317 }, { "epoch": 5.928388746803069, "grad_norm": 0.12736989149935335, "learning_rate": 1.394463259188491e-06, "loss": 1.0097, "step": 2318 }, { "epoch": 5.930946291560103, "grad_norm": 0.12401472625813548, "learning_rate": 1.3879733026675367e-06, "loss": 1.036, "step": 2319 }, { "epoch": 5.9335038363171355, "grad_norm": 0.12937517228342466, "learning_rate": 1.3814973573755518e-06, "loss": 1.036, "step": 2320 }, { "epoch": 5.936061381074169, "grad_norm": 0.127613205394154, "learning_rate": 1.3750354338484916e-06, "loss": 0.9881, "step": 2321 }, { "epoch": 5.938618925831202, "grad_norm": 0.12739173803258835, "learning_rate": 1.3685875425995064e-06, "loss": 1.0191, "step": 2322 }, { "epoch": 5.9411764705882355, "grad_norm": 0.13795008867321654, "learning_rate": 1.3621536941189107e-06, "loss": 1.0144, "step": 2323 }, { "epoch": 5.943734015345268, "grad_norm": 0.12984194360371934, "learning_rate": 1.355733898874173e-06, "loss": 1.049, "step": 2324 }, { "epoch": 5.946291560102302, "grad_norm": 0.13129623864662363, "learning_rate": 1.3493281673098956e-06, "loss": 1.015, "step": 2325 }, { "epoch": 5.948849104859335, "grad_norm": 0.12793818903871373, "learning_rate": 1.3429365098478087e-06, "loss": 0.9981, "step": 2326 }, { "epoch": 5.951406649616368, "grad_norm": 0.1255755665233896, "learning_rate": 1.3365589368867371e-06, "loss": 0.9794, "step": 2327 }, { "epoch": 5.953964194373402, "grad_norm": 0.1279352390496069, "learning_rate": 1.330195458802591e-06, "loss": 1.0249, "step": 2328 }, { "epoch": 5.956521739130435, "grad_norm": 0.128293917496119, "learning_rate": 1.323846085948356e-06, "loss": 0.9898, "step": 2329 }, { "epoch": 5.959079283887468, "grad_norm": 0.12767639872018413, "learning_rate": 1.3175108286540617e-06, "loss": 1.0352, "step": 2330 }, { "epoch": 5.961636828644501, "grad_norm": 0.12662645466299385, "learning_rate": 1.3111896972267768e-06, "loss": 1.0055, "step": 2331 }, { "epoch": 5.964194373401535, "grad_norm": 0.12253304775794958, "learning_rate": 1.3048827019505828e-06, "loss": 0.9892, "step": 2332 }, { "epoch": 5.966751918158568, "grad_norm": 0.13233724231669944, "learning_rate": 1.2985898530865736e-06, "loss": 0.9883, "step": 2333 }, { "epoch": 5.969309462915601, "grad_norm": 0.12275354609893704, "learning_rate": 1.2923111608728168e-06, "loss": 1.0221, "step": 2334 }, { "epoch": 5.971867007672635, "grad_norm": 0.13544461017695578, "learning_rate": 1.2860466355243506e-06, "loss": 1.0587, "step": 2335 }, { "epoch": 5.974424552429667, "grad_norm": 0.125504059793445, "learning_rate": 1.2797962872331693e-06, "loss": 1.0096, "step": 2336 }, { "epoch": 5.976982097186701, "grad_norm": 0.13226317160144294, "learning_rate": 1.2735601261681985e-06, "loss": 1.0489, "step": 2337 }, { "epoch": 5.979539641943734, "grad_norm": 0.12803280744387227, "learning_rate": 1.2673381624752813e-06, "loss": 1.0307, "step": 2338 }, { "epoch": 5.982097186700767, "grad_norm": 0.12863654527584692, "learning_rate": 1.2611304062771613e-06, "loss": 1.017, "step": 2339 }, { "epoch": 5.9846547314578, "grad_norm": 0.12401870969986709, "learning_rate": 1.254936867673474e-06, "loss": 1.0056, "step": 2340 }, { "epoch": 5.987212276214834, "grad_norm": 0.11891932350440772, "learning_rate": 1.2487575567407184e-06, "loss": 0.9998, "step": 2341 }, { "epoch": 5.989769820971867, "grad_norm": 0.12341714944406178, "learning_rate": 1.2425924835322422e-06, "loss": 1.0247, "step": 2342 }, { "epoch": 5.9923273657289, "grad_norm": 0.1229416512376773, "learning_rate": 1.2364416580782413e-06, "loss": 1.0195, "step": 2343 }, { "epoch": 5.994884910485934, "grad_norm": 0.12303637728566778, "learning_rate": 1.2303050903857195e-06, "loss": 1.0156, "step": 2344 }, { "epoch": 5.997442455242966, "grad_norm": 0.13561743214244987, "learning_rate": 1.2241827904384928e-06, "loss": 1.0304, "step": 2345 }, { "epoch": 6.0, "grad_norm": 0.11664031093263695, "learning_rate": 1.2180747681971539e-06, "loss": 1.0047, "step": 2346 }, { "epoch": 6.002557544757034, "grad_norm": 0.1230389316598828, "learning_rate": 1.211981033599079e-06, "loss": 1.0416, "step": 2347 }, { "epoch": 6.005115089514066, "grad_norm": 0.12948288079807183, "learning_rate": 1.2059015965583908e-06, "loss": 1.0123, "step": 2348 }, { "epoch": 6.0076726342711, "grad_norm": 0.1207876296019636, "learning_rate": 1.1998364669659524e-06, "loss": 0.9796, "step": 2349 }, { "epoch": 6.010230179028133, "grad_norm": 0.1191785329656778, "learning_rate": 1.1937856546893533e-06, "loss": 0.9862, "step": 2350 }, { "epoch": 6.012787723785166, "grad_norm": 0.12106597514269477, "learning_rate": 1.1877491695728827e-06, "loss": 1.0181, "step": 2351 }, { "epoch": 6.015345268542199, "grad_norm": 0.12714775517717014, "learning_rate": 1.181727021437531e-06, "loss": 0.9901, "step": 2352 }, { "epoch": 6.017902813299233, "grad_norm": 0.12314221662217836, "learning_rate": 1.1757192200809487e-06, "loss": 1.0139, "step": 2353 }, { "epoch": 6.020460358056266, "grad_norm": 0.1205656248704543, "learning_rate": 1.1697257752774581e-06, "loss": 1.0064, "step": 2354 }, { "epoch": 6.023017902813299, "grad_norm": 0.12375532206452915, "learning_rate": 1.1637466967780186e-06, "loss": 1.0055, "step": 2355 }, { "epoch": 6.025575447570333, "grad_norm": 0.13727612152509278, "learning_rate": 1.1577819943102132e-06, "loss": 1.0334, "step": 2356 }, { "epoch": 6.028132992327365, "grad_norm": 0.13743682672187252, "learning_rate": 1.1518316775782456e-06, "loss": 1.063, "step": 2357 }, { "epoch": 6.030690537084399, "grad_norm": 0.1269152481030464, "learning_rate": 1.1458957562629048e-06, "loss": 1.0245, "step": 2358 }, { "epoch": 6.033248081841432, "grad_norm": 0.12054742496527425, "learning_rate": 1.1399742400215685e-06, "loss": 1.016, "step": 2359 }, { "epoch": 6.035805626598465, "grad_norm": 0.11563655740461991, "learning_rate": 1.1340671384881664e-06, "loss": 1.0034, "step": 2360 }, { "epoch": 6.038363171355499, "grad_norm": 0.12654719374228424, "learning_rate": 1.128174461273187e-06, "loss": 1.0303, "step": 2361 }, { "epoch": 6.040920716112532, "grad_norm": 0.13400791982749355, "learning_rate": 1.122296217963651e-06, "loss": 0.9908, "step": 2362 }, { "epoch": 6.043478260869565, "grad_norm": 0.13721318190820386, "learning_rate": 1.116432418123088e-06, "loss": 1.0143, "step": 2363 }, { "epoch": 6.046035805626598, "grad_norm": 0.1331473057560735, "learning_rate": 1.1105830712915355e-06, "loss": 1.0389, "step": 2364 }, { "epoch": 6.048593350383632, "grad_norm": 0.12186052033355585, "learning_rate": 1.1047481869855136e-06, "loss": 0.9923, "step": 2365 }, { "epoch": 6.051150895140665, "grad_norm": 0.130398414275441, "learning_rate": 1.0989277746980186e-06, "loss": 0.9989, "step": 2366 }, { "epoch": 6.053708439897698, "grad_norm": 0.1212752348474763, "learning_rate": 1.0931218438984903e-06, "loss": 1.0002, "step": 2367 }, { "epoch": 6.056265984654732, "grad_norm": 0.12066129403697316, "learning_rate": 1.0873304040328193e-06, "loss": 0.9855, "step": 2368 }, { "epoch": 6.0588235294117645, "grad_norm": 0.12980745503624036, "learning_rate": 1.0815534645233182e-06, "loss": 1.0108, "step": 2369 }, { "epoch": 6.061381074168798, "grad_norm": 0.12190895753762201, "learning_rate": 1.075791034768704e-06, "loss": 1.0134, "step": 2370 }, { "epoch": 6.063938618925831, "grad_norm": 0.11736296572501317, "learning_rate": 1.0700431241440888e-06, "loss": 0.9819, "step": 2371 }, { "epoch": 6.0664961636828645, "grad_norm": 0.11803134631202541, "learning_rate": 1.064309742000963e-06, "loss": 0.999, "step": 2372 }, { "epoch": 6.069053708439898, "grad_norm": 0.12274428069266924, "learning_rate": 1.0585908976671844e-06, "loss": 1.0263, "step": 2373 }, { "epoch": 6.071611253196931, "grad_norm": 0.1280904409678555, "learning_rate": 1.052886600446954e-06, "loss": 0.9989, "step": 2374 }, { "epoch": 6.0741687979539645, "grad_norm": 0.13800491036101872, "learning_rate": 1.0471968596208026e-06, "loss": 1.0168, "step": 2375 }, { "epoch": 6.076726342710997, "grad_norm": 0.125255996087832, "learning_rate": 1.0415216844455889e-06, "loss": 1.0016, "step": 2376 }, { "epoch": 6.079283887468031, "grad_norm": 0.12500402095406113, "learning_rate": 1.0358610841544657e-06, "loss": 1.0207, "step": 2377 }, { "epoch": 6.081841432225064, "grad_norm": 0.12102753345414748, "learning_rate": 1.0302150679568745e-06, "loss": 0.9889, "step": 2378 }, { "epoch": 6.084398976982097, "grad_norm": 0.1263965580697967, "learning_rate": 1.0245836450385304e-06, "loss": 1.0278, "step": 2379 }, { "epoch": 6.086956521739131, "grad_norm": 0.12426986420829644, "learning_rate": 1.0189668245614092e-06, "loss": 1.0024, "step": 2380 }, { "epoch": 6.089514066496164, "grad_norm": 0.12124987678343191, "learning_rate": 1.0133646156637244e-06, "loss": 1.0346, "step": 2381 }, { "epoch": 6.092071611253197, "grad_norm": 0.11760759251820775, "learning_rate": 1.0077770274599187e-06, "loss": 1.0176, "step": 2382 }, { "epoch": 6.09462915601023, "grad_norm": 0.11882704515829542, "learning_rate": 1.002204069040652e-06, "loss": 0.9894, "step": 2383 }, { "epoch": 6.0971867007672635, "grad_norm": 0.12369290549039276, "learning_rate": 9.966457494727777e-07, "loss": 1.04, "step": 2384 }, { "epoch": 6.099744245524296, "grad_norm": 0.12345493397851956, "learning_rate": 9.91102077799333e-07, "loss": 1.0049, "step": 2385 }, { "epoch": 6.10230179028133, "grad_norm": 0.12872126244712379, "learning_rate": 9.855730630395244e-07, "loss": 0.9933, "step": 2386 }, { "epoch": 6.1048593350383635, "grad_norm": 0.11772835201472491, "learning_rate": 9.800587141887173e-07, "loss": 1.0285, "step": 2387 }, { "epoch": 6.107416879795396, "grad_norm": 0.12252902927138364, "learning_rate": 9.745590402184092e-07, "loss": 1.0134, "step": 2388 }, { "epoch": 6.10997442455243, "grad_norm": 0.12214679346044635, "learning_rate": 9.690740500762241e-07, "loss": 0.9778, "step": 2389 }, { "epoch": 6.112531969309463, "grad_norm": 0.12270563199721099, "learning_rate": 9.636037526859032e-07, "loss": 1.0048, "step": 2390 }, { "epoch": 6.115089514066496, "grad_norm": 0.13289561214559903, "learning_rate": 9.58148156947276e-07, "loss": 1.0355, "step": 2391 }, { "epoch": 6.117647058823529, "grad_norm": 0.124015797218616, "learning_rate": 9.52707271736254e-07, "loss": 0.9894, "step": 2392 }, { "epoch": 6.120204603580563, "grad_norm": 0.12869746602968873, "learning_rate": 9.472811059048182e-07, "loss": 1.034, "step": 2393 }, { "epoch": 6.122762148337596, "grad_norm": 0.11502225665357182, "learning_rate": 9.418696682810014e-07, "loss": 1.0279, "step": 2394 }, { "epoch": 6.125319693094629, "grad_norm": 0.12442843747682036, "learning_rate": 9.364729676688755e-07, "loss": 1.0346, "step": 2395 }, { "epoch": 6.127877237851663, "grad_norm": 0.12203934311867798, "learning_rate": 9.310910128485317e-07, "loss": 1.0042, "step": 2396 }, { "epoch": 6.130434782608695, "grad_norm": 0.13225053449453802, "learning_rate": 9.257238125760781e-07, "loss": 0.9979, "step": 2397 }, { "epoch": 6.132992327365729, "grad_norm": 0.11626249473093271, "learning_rate": 9.203713755836108e-07, "loss": 1.0151, "step": 2398 }, { "epoch": 6.135549872122763, "grad_norm": 0.12565196489418815, "learning_rate": 9.150337105792129e-07, "loss": 1.0003, "step": 2399 }, { "epoch": 6.138107416879795, "grad_norm": 0.1176707888425743, "learning_rate": 9.097108262469268e-07, "loss": 1.0174, "step": 2400 }, { "epoch": 6.140664961636829, "grad_norm": 0.1254506125476653, "learning_rate": 9.044027312467574e-07, "loss": 1.024, "step": 2401 }, { "epoch": 6.143222506393862, "grad_norm": 0.12040306772801906, "learning_rate": 8.991094342146423e-07, "loss": 1.0238, "step": 2402 }, { "epoch": 6.145780051150895, "grad_norm": 0.12003711394998114, "learning_rate": 8.938309437624415e-07, "loss": 1.0361, "step": 2403 }, { "epoch": 6.148337595907928, "grad_norm": 0.1222116778211444, "learning_rate": 8.885672684779345e-07, "loss": 1.0195, "step": 2404 }, { "epoch": 6.150895140664962, "grad_norm": 0.12213600424627216, "learning_rate": 8.833184169247877e-07, "loss": 1.0147, "step": 2405 }, { "epoch": 6.153452685421995, "grad_norm": 0.11882499943476486, "learning_rate": 8.780843976425568e-07, "loss": 1.0443, "step": 2406 }, { "epoch": 6.156010230179028, "grad_norm": 0.11944071935758879, "learning_rate": 8.728652191466602e-07, "loss": 1.0269, "step": 2407 }, { "epoch": 6.158567774936062, "grad_norm": 0.12479032723786981, "learning_rate": 8.676608899283789e-07, "loss": 1.0407, "step": 2408 }, { "epoch": 6.161125319693094, "grad_norm": 0.1232368778241773, "learning_rate": 8.62471418454831e-07, "loss": 0.998, "step": 2409 }, { "epoch": 6.163682864450128, "grad_norm": 0.12380002645622601, "learning_rate": 8.572968131689585e-07, "loss": 1.0215, "step": 2410 }, { "epoch": 6.166240409207161, "grad_norm": 0.11990258505813678, "learning_rate": 8.521370824895236e-07, "loss": 1.0362, "step": 2411 }, { "epoch": 6.168797953964194, "grad_norm": 0.12763582460814127, "learning_rate": 8.469922348110871e-07, "loss": 1.0005, "step": 2412 }, { "epoch": 6.171355498721228, "grad_norm": 0.12048771338001237, "learning_rate": 8.41862278503991e-07, "loss": 1.0154, "step": 2413 }, { "epoch": 6.173913043478261, "grad_norm": 0.11110330026915051, "learning_rate": 8.367472219143524e-07, "loss": 0.9864, "step": 2414 }, { "epoch": 6.176470588235294, "grad_norm": 0.12274015937027666, "learning_rate": 8.316470733640525e-07, "loss": 1.01, "step": 2415 }, { "epoch": 6.179028132992327, "grad_norm": 0.11875414799502092, "learning_rate": 8.265618411507148e-07, "loss": 1.0349, "step": 2416 }, { "epoch": 6.181585677749361, "grad_norm": 0.12112785116554001, "learning_rate": 8.214915335476892e-07, "loss": 1.0108, "step": 2417 }, { "epoch": 6.1841432225063935, "grad_norm": 0.11843273179000395, "learning_rate": 8.164361588040526e-07, "loss": 1.0316, "step": 2418 }, { "epoch": 6.186700767263427, "grad_norm": 0.12171206599055973, "learning_rate": 8.113957251445837e-07, "loss": 1.0181, "step": 2419 }, { "epoch": 6.189258312020461, "grad_norm": 0.1332901069553243, "learning_rate": 8.063702407697515e-07, "loss": 1.0163, "step": 2420 }, { "epoch": 6.1918158567774935, "grad_norm": 0.12665149802988054, "learning_rate": 8.013597138557039e-07, "loss": 1.0316, "step": 2421 }, { "epoch": 6.194373401534527, "grad_norm": 0.11748240466353733, "learning_rate": 7.963641525542564e-07, "loss": 1.0295, "step": 2422 }, { "epoch": 6.19693094629156, "grad_norm": 0.12263136155853388, "learning_rate": 7.913835649928792e-07, "loss": 1.0443, "step": 2423 }, { "epoch": 6.1994884910485935, "grad_norm": 0.12057268564537553, "learning_rate": 7.864179592746679e-07, "loss": 0.9758, "step": 2424 }, { "epoch": 6.202046035805626, "grad_norm": 0.11757878694680841, "learning_rate": 7.814673434783604e-07, "loss": 0.998, "step": 2425 }, { "epoch": 6.20460358056266, "grad_norm": 0.18582779787648557, "learning_rate": 7.765317256582949e-07, "loss": 1.0115, "step": 2426 }, { "epoch": 6.207161125319693, "grad_norm": 0.13582232353707813, "learning_rate": 7.716111138444115e-07, "loss": 1.0459, "step": 2427 }, { "epoch": 6.209718670076726, "grad_norm": 0.13389475712289786, "learning_rate": 7.667055160422432e-07, "loss": 1.0274, "step": 2428 }, { "epoch": 6.21227621483376, "grad_norm": 0.12673104354118297, "learning_rate": 7.618149402328867e-07, "loss": 1.0011, "step": 2429 }, { "epoch": 6.2148337595907925, "grad_norm": 0.12765584122890725, "learning_rate": 7.569393943730064e-07, "loss": 1.0635, "step": 2430 }, { "epoch": 6.217391304347826, "grad_norm": 0.11473857666105772, "learning_rate": 7.52078886394807e-07, "loss": 0.9878, "step": 2431 }, { "epoch": 6.21994884910486, "grad_norm": 0.12228794360420046, "learning_rate": 7.472334242060331e-07, "loss": 1.0316, "step": 2432 }, { "epoch": 6.2225063938618925, "grad_norm": 0.12426451417815787, "learning_rate": 7.424030156899475e-07, "loss": 1.0098, "step": 2433 }, { "epoch": 6.225063938618926, "grad_norm": 0.11800919098475897, "learning_rate": 7.375876687053252e-07, "loss": 1.0508, "step": 2434 }, { "epoch": 6.227621483375959, "grad_norm": 0.1309293626602563, "learning_rate": 7.327873910864325e-07, "loss": 1.0265, "step": 2435 }, { "epoch": 6.2301790281329925, "grad_norm": 0.12364264713239634, "learning_rate": 7.280021906430201e-07, "loss": 1.038, "step": 2436 }, { "epoch": 6.232736572890025, "grad_norm": 0.12731230734269985, "learning_rate": 7.23232075160315e-07, "loss": 0.9938, "step": 2437 }, { "epoch": 6.235294117647059, "grad_norm": 0.11754730324986598, "learning_rate": 7.184770523989904e-07, "loss": 1.0209, "step": 2438 }, { "epoch": 6.2378516624040925, "grad_norm": 0.12687711722398867, "learning_rate": 7.137371300951746e-07, "loss": 1.0369, "step": 2439 }, { "epoch": 6.240409207161125, "grad_norm": 0.1226944492744433, "learning_rate": 7.090123159604234e-07, "loss": 1.0417, "step": 2440 }, { "epoch": 6.242966751918159, "grad_norm": 0.11721843519340895, "learning_rate": 7.043026176817158e-07, "loss": 0.99, "step": 2441 }, { "epoch": 6.245524296675192, "grad_norm": 0.12080675281454777, "learning_rate": 6.996080429214347e-07, "loss": 1.0065, "step": 2442 }, { "epoch": 6.248081841432225, "grad_norm": 0.12010992913398671, "learning_rate": 6.949285993173593e-07, "loss": 1.0359, "step": 2443 }, { "epoch": 6.250639386189258, "grad_norm": 0.11624614678372433, "learning_rate": 6.902642944826544e-07, "loss": 0.97, "step": 2444 }, { "epoch": 6.253196930946292, "grad_norm": 0.12257573737475404, "learning_rate": 6.856151360058505e-07, "loss": 1.0192, "step": 2445 }, { "epoch": 6.255754475703325, "grad_norm": 0.1201829684398593, "learning_rate": 6.809811314508386e-07, "loss": 1.0466, "step": 2446 }, { "epoch": 6.258312020460358, "grad_norm": 0.12401967000820303, "learning_rate": 6.763622883568521e-07, "loss": 1.0356, "step": 2447 }, { "epoch": 6.260869565217392, "grad_norm": 0.11778396980454381, "learning_rate": 6.717586142384624e-07, "loss": 1.036, "step": 2448 }, { "epoch": 6.263427109974424, "grad_norm": 0.12185872889499474, "learning_rate": 6.671701165855593e-07, "loss": 1.0261, "step": 2449 }, { "epoch": 6.265984654731458, "grad_norm": 0.1201489344194391, "learning_rate": 6.625968028633389e-07, "loss": 1.0119, "step": 2450 }, { "epoch": 6.268542199488491, "grad_norm": 0.11988021977061444, "learning_rate": 6.580386805122996e-07, "loss": 1.021, "step": 2451 }, { "epoch": 6.271099744245524, "grad_norm": 0.11792524228657224, "learning_rate": 6.534957569482214e-07, "loss": 1.0635, "step": 2452 }, { "epoch": 6.273657289002558, "grad_norm": 0.11687466392592072, "learning_rate": 6.489680395621556e-07, "loss": 1.0129, "step": 2453 }, { "epoch": 6.276214833759591, "grad_norm": 0.12220153331468454, "learning_rate": 6.444555357204152e-07, "loss": 0.9876, "step": 2454 }, { "epoch": 6.278772378516624, "grad_norm": 0.11658584388896727, "learning_rate": 6.39958252764562e-07, "loss": 1.0258, "step": 2455 }, { "epoch": 6.281329923273657, "grad_norm": 0.11595243705777233, "learning_rate": 6.354761980113966e-07, "loss": 1.0364, "step": 2456 }, { "epoch": 6.283887468030691, "grad_norm": 0.11948349789713839, "learning_rate": 6.31009378752937e-07, "loss": 1.0295, "step": 2457 }, { "epoch": 6.286445012787723, "grad_norm": 0.11578209417911318, "learning_rate": 6.265578022564233e-07, "loss": 1.003, "step": 2458 }, { "epoch": 6.289002557544757, "grad_norm": 0.11954141892522423, "learning_rate": 6.221214757642901e-07, "loss": 1.0186, "step": 2459 }, { "epoch": 6.291560102301791, "grad_norm": 0.1214032884466788, "learning_rate": 6.177004064941616e-07, "loss": 1.0325, "step": 2460 }, { "epoch": 6.294117647058823, "grad_norm": 0.11798550854551848, "learning_rate": 6.132946016388453e-07, "loss": 1.0034, "step": 2461 }, { "epoch": 6.296675191815857, "grad_norm": 0.12025821516068275, "learning_rate": 6.089040683663083e-07, "loss": 0.9823, "step": 2462 }, { "epoch": 6.29923273657289, "grad_norm": 0.11951253909474888, "learning_rate": 6.045288138196725e-07, "loss": 1.0409, "step": 2463 }, { "epoch": 6.301790281329923, "grad_norm": 0.11418311978255119, "learning_rate": 6.001688451172027e-07, "loss": 1.0022, "step": 2464 }, { "epoch": 6.304347826086957, "grad_norm": 0.11934858308797691, "learning_rate": 5.958241693522993e-07, "loss": 1.0107, "step": 2465 }, { "epoch": 6.30690537084399, "grad_norm": 0.12241414028875457, "learning_rate": 5.914947935934756e-07, "loss": 0.9971, "step": 2466 }, { "epoch": 6.309462915601023, "grad_norm": 0.11903591318763888, "learning_rate": 5.871807248843542e-07, "loss": 1.0117, "step": 2467 }, { "epoch": 6.312020460358056, "grad_norm": 0.11896713837542751, "learning_rate": 5.828819702436573e-07, "loss": 1.0199, "step": 2468 }, { "epoch": 6.31457800511509, "grad_norm": 0.12256891371488562, "learning_rate": 5.785985366651892e-07, "loss": 1.003, "step": 2469 }, { "epoch": 6.3171355498721224, "grad_norm": 0.1224791957117775, "learning_rate": 5.743304311178289e-07, "loss": 1.0067, "step": 2470 }, { "epoch": 6.319693094629156, "grad_norm": 0.12119833550268867, "learning_rate": 5.70077660545515e-07, "loss": 1.0196, "step": 2471 }, { "epoch": 6.322250639386189, "grad_norm": 0.11520605275376457, "learning_rate": 5.658402318672418e-07, "loss": 1.0127, "step": 2472 }, { "epoch": 6.324808184143222, "grad_norm": 0.11525398133510434, "learning_rate": 5.616181519770414e-07, "loss": 1.0161, "step": 2473 }, { "epoch": 6.327365728900256, "grad_norm": 0.12176149506861418, "learning_rate": 5.574114277439702e-07, "loss": 1.0216, "step": 2474 }, { "epoch": 6.329923273657289, "grad_norm": 0.12541686899065785, "learning_rate": 5.53220066012109e-07, "loss": 1.0263, "step": 2475 }, { "epoch": 6.332480818414322, "grad_norm": 0.12958665943781433, "learning_rate": 5.490440736005397e-07, "loss": 1.0737, "step": 2476 }, { "epoch": 6.335038363171355, "grad_norm": 0.1273940622092984, "learning_rate": 5.448834573033424e-07, "loss": 1.028, "step": 2477 }, { "epoch": 6.337595907928389, "grad_norm": 0.11799709709320902, "learning_rate": 5.407382238895765e-07, "loss": 0.9949, "step": 2478 }, { "epoch": 6.340153452685422, "grad_norm": 0.1220634348791913, "learning_rate": 5.366083801032806e-07, "loss": 1.0422, "step": 2479 }, { "epoch": 6.342710997442455, "grad_norm": 0.11889607141087616, "learning_rate": 5.324939326634515e-07, "loss": 1.0017, "step": 2480 }, { "epoch": 6.345268542199489, "grad_norm": 0.12002156059223426, "learning_rate": 5.283948882640355e-07, "loss": 1.0181, "step": 2481 }, { "epoch": 6.3478260869565215, "grad_norm": 0.11596540294437355, "learning_rate": 5.24311253573927e-07, "loss": 1.0346, "step": 2482 }, { "epoch": 6.350383631713555, "grad_norm": 0.11502520531650343, "learning_rate": 5.202430352369392e-07, "loss": 1.0135, "step": 2483 }, { "epoch": 6.352941176470588, "grad_norm": 0.12267491898314155, "learning_rate": 5.161902398718121e-07, "loss": 1.0435, "step": 2484 }, { "epoch": 6.3554987212276215, "grad_norm": 0.12185761812901445, "learning_rate": 5.121528740721871e-07, "loss": 1.0377, "step": 2485 }, { "epoch": 6.358056265984655, "grad_norm": 0.11976615175350093, "learning_rate": 5.081309444066085e-07, "loss": 1.034, "step": 2486 }, { "epoch": 6.360613810741688, "grad_norm": 0.116555412280644, "learning_rate": 5.041244574185056e-07, "loss": 1.011, "step": 2487 }, { "epoch": 6.3631713554987215, "grad_norm": 0.12515368166748755, "learning_rate": 5.001334196261776e-07, "loss": 0.9861, "step": 2488 }, { "epoch": 6.365728900255754, "grad_norm": 0.11814447264484773, "learning_rate": 4.961578375227982e-07, "loss": 1.0146, "step": 2489 }, { "epoch": 6.368286445012788, "grad_norm": 0.12245094109059326, "learning_rate": 4.921977175763881e-07, "loss": 1.0204, "step": 2490 }, { "epoch": 6.370843989769821, "grad_norm": 0.12283694751475284, "learning_rate": 4.882530662298168e-07, "loss": 1.0313, "step": 2491 }, { "epoch": 6.373401534526854, "grad_norm": 0.12224108783096758, "learning_rate": 4.843238899007829e-07, "loss": 1.032, "step": 2492 }, { "epoch": 6.375959079283888, "grad_norm": 0.11751909048944272, "learning_rate": 4.804101949818119e-07, "loss": 1.0037, "step": 2493 }, { "epoch": 6.378516624040921, "grad_norm": 0.1189722841334927, "learning_rate": 4.765119878402424e-07, "loss": 1.0218, "step": 2494 }, { "epoch": 6.381074168797954, "grad_norm": 0.12188011601377355, "learning_rate": 4.726292748182104e-07, "loss": 1.0235, "step": 2495 }, { "epoch": 6.383631713554987, "grad_norm": 0.11601162144284871, "learning_rate": 4.687620622326505e-07, "loss": 1.0095, "step": 2496 }, { "epoch": 6.3861892583120206, "grad_norm": 0.11794823628283956, "learning_rate": 4.6491035637527437e-07, "loss": 1.0211, "step": 2497 }, { "epoch": 6.388746803069053, "grad_norm": 0.12080963912657082, "learning_rate": 4.6107416351256595e-07, "loss": 0.996, "step": 2498 }, { "epoch": 6.391304347826087, "grad_norm": 0.11852593163423941, "learning_rate": 4.5725348988577057e-07, "loss": 1.0473, "step": 2499 }, { "epoch": 6.3938618925831205, "grad_norm": 0.1154582217572824, "learning_rate": 4.5344834171088594e-07, "loss": 0.9916, "step": 2500 }, { "epoch": 6.396419437340153, "grad_norm": 0.12611349351005327, "learning_rate": 4.496587251786544e-07, "loss": 1.0537, "step": 2501 }, { "epoch": 6.398976982097187, "grad_norm": 0.11841147140282605, "learning_rate": 4.4588464645453856e-07, "loss": 1.0354, "step": 2502 }, { "epoch": 6.40153452685422, "grad_norm": 0.11761246404197793, "learning_rate": 4.421261116787323e-07, "loss": 1.0056, "step": 2503 }, { "epoch": 6.404092071611253, "grad_norm": 0.116833267265145, "learning_rate": 4.383831269661343e-07, "loss": 0.9983, "step": 2504 }, { "epoch": 6.406649616368286, "grad_norm": 0.12485584628194238, "learning_rate": 4.3465569840635105e-07, "loss": 1.0276, "step": 2505 }, { "epoch": 6.40920716112532, "grad_norm": 0.11771747761741529, "learning_rate": 4.309438320636705e-07, "loss": 1.0119, "step": 2506 }, { "epoch": 6.411764705882353, "grad_norm": 0.1167766752899283, "learning_rate": 4.272475339770699e-07, "loss": 1.0257, "step": 2507 }, { "epoch": 6.414322250639386, "grad_norm": 0.11997899496687212, "learning_rate": 4.235668101601964e-07, "loss": 0.9887, "step": 2508 }, { "epoch": 6.41687979539642, "grad_norm": 0.11897278858577053, "learning_rate": 4.199016666013533e-07, "loss": 1.0162, "step": 2509 }, { "epoch": 6.419437340153452, "grad_norm": 0.1213013490317867, "learning_rate": 4.1625210926350413e-07, "loss": 1.0141, "step": 2510 }, { "epoch": 6.421994884910486, "grad_norm": 0.12533002989447992, "learning_rate": 4.1261814408424806e-07, "loss": 1.0251, "step": 2511 }, { "epoch": 6.42455242966752, "grad_norm": 0.12196478149472252, "learning_rate": 4.089997769758225e-07, "loss": 1.0365, "step": 2512 }, { "epoch": 6.427109974424552, "grad_norm": 0.12143791187790264, "learning_rate": 4.0539701382507847e-07, "loss": 1.0032, "step": 2513 }, { "epoch": 6.429667519181586, "grad_norm": 0.11682750481108217, "learning_rate": 4.018098604934906e-07, "loss": 1.0045, "step": 2514 }, { "epoch": 6.432225063938619, "grad_norm": 0.11654420434670919, "learning_rate": 3.982383228171338e-07, "loss": 1.0122, "step": 2515 }, { "epoch": 6.434782608695652, "grad_norm": 0.12087376970393812, "learning_rate": 3.946824066066757e-07, "loss": 1.0091, "step": 2516 }, { "epoch": 6.437340153452685, "grad_norm": 0.11198028929740504, "learning_rate": 3.9114211764736843e-07, "loss": 0.9916, "step": 2517 }, { "epoch": 6.439897698209719, "grad_norm": 0.117876547438714, "learning_rate": 3.876174616990402e-07, "loss": 0.9688, "step": 2518 }, { "epoch": 6.442455242966752, "grad_norm": 0.11691097425539704, "learning_rate": 3.8410844449608966e-07, "loss": 1.0262, "step": 2519 }, { "epoch": 6.445012787723785, "grad_norm": 0.12067476965271878, "learning_rate": 3.8061507174746326e-07, "loss": 1.0357, "step": 2520 }, { "epoch": 6.447570332480819, "grad_norm": 0.11448044711242149, "learning_rate": 3.7713734913666254e-07, "loss": 1.0278, "step": 2521 }, { "epoch": 6.450127877237851, "grad_norm": 0.11900503374045875, "learning_rate": 3.73675282321726e-07, "loss": 1.0293, "step": 2522 }, { "epoch": 6.452685421994885, "grad_norm": 0.1237852363860751, "learning_rate": 3.7022887693521914e-07, "loss": 1.0432, "step": 2523 }, { "epoch": 6.455242966751918, "grad_norm": 0.11395769439497158, "learning_rate": 3.6679813858422673e-07, "loss": 1.0451, "step": 2524 }, { "epoch": 6.457800511508951, "grad_norm": 0.11755851431433859, "learning_rate": 3.6338307285034626e-07, "loss": 1.0166, "step": 2525 }, { "epoch": 6.460358056265985, "grad_norm": 0.11537719335337888, "learning_rate": 3.5998368528967764e-07, "loss": 1.0221, "step": 2526 }, { "epoch": 6.462915601023018, "grad_norm": 0.12098800578611382, "learning_rate": 3.5659998143281027e-07, "loss": 1.0474, "step": 2527 }, { "epoch": 6.465473145780051, "grad_norm": 0.11989356063597686, "learning_rate": 3.532319667848172e-07, "loss": 1.0187, "step": 2528 }, { "epoch": 6.468030690537084, "grad_norm": 0.1156244817453119, "learning_rate": 3.498796468252508e-07, "loss": 0.9894, "step": 2529 }, { "epoch": 6.470588235294118, "grad_norm": 0.11213145863456157, "learning_rate": 3.46543027008126e-07, "loss": 1.0331, "step": 2530 }, { "epoch": 6.4731457800511505, "grad_norm": 0.11707883319628067, "learning_rate": 3.4322211276191176e-07, "loss": 1.0259, "step": 2531 }, { "epoch": 6.475703324808184, "grad_norm": 0.11350670721406404, "learning_rate": 3.399169094895294e-07, "loss": 1.0065, "step": 2532 }, { "epoch": 6.478260869565218, "grad_norm": 0.11452239943111842, "learning_rate": 3.366274225683397e-07, "loss": 1.0382, "step": 2533 }, { "epoch": 6.4808184143222505, "grad_norm": 0.11645854358551593, "learning_rate": 3.3335365735012947e-07, "loss": 0.9849, "step": 2534 }, { "epoch": 6.483375959079284, "grad_norm": 0.1150643632230636, "learning_rate": 3.3009561916111045e-07, "loss": 1.0441, "step": 2535 }, { "epoch": 6.485933503836317, "grad_norm": 0.11565843726243669, "learning_rate": 3.2685331330190916e-07, "loss": 1.0256, "step": 2536 }, { "epoch": 6.4884910485933505, "grad_norm": 0.12172892123412701, "learning_rate": 3.2362674504755385e-07, "loss": 1.0006, "step": 2537 }, { "epoch": 6.491048593350383, "grad_norm": 0.11416395245772691, "learning_rate": 3.2041591964746767e-07, "loss": 0.9981, "step": 2538 }, { "epoch": 6.493606138107417, "grad_norm": 0.11099012627200047, "learning_rate": 3.17220842325463e-07, "loss": 0.9971, "step": 2539 }, { "epoch": 6.4961636828644505, "grad_norm": 0.12666071845516697, "learning_rate": 3.14041518279733e-07, "loss": 1.019, "step": 2540 }, { "epoch": 6.498721227621483, "grad_norm": 0.11694427326316041, "learning_rate": 3.108779526828365e-07, "loss": 1.048, "step": 2541 }, { "epoch": 6.501278772378517, "grad_norm": 0.11663277776194486, "learning_rate": 3.0773015068169876e-07, "loss": 1.0205, "step": 2542 }, { "epoch": 6.5038363171355495, "grad_norm": 0.11421370105035522, "learning_rate": 3.045981173975965e-07, "loss": 1.0062, "step": 2543 }, { "epoch": 6.506393861892583, "grad_norm": 0.11416247400561318, "learning_rate": 3.0148185792615137e-07, "loss": 1.0221, "step": 2544 }, { "epoch": 6.508951406649617, "grad_norm": 0.12004167269390631, "learning_rate": 2.9838137733732343e-07, "loss": 1.0336, "step": 2545 }, { "epoch": 6.5115089514066495, "grad_norm": 0.12185027359479889, "learning_rate": 2.9529668067539986e-07, "loss": 1.0085, "step": 2546 }, { "epoch": 6.514066496163683, "grad_norm": 0.11920181864869182, "learning_rate": 2.922277729589906e-07, "loss": 1.0212, "step": 2547 }, { "epoch": 6.516624040920716, "grad_norm": 0.11457206340363568, "learning_rate": 2.891746591810152e-07, "loss": 1.0062, "step": 2548 }, { "epoch": 6.5191815856777495, "grad_norm": 0.11396161204686395, "learning_rate": 2.86137344308699e-07, "loss": 1.0269, "step": 2549 }, { "epoch": 6.521739130434782, "grad_norm": 0.11716042134956894, "learning_rate": 2.8311583328356485e-07, "loss": 1.0513, "step": 2550 }, { "epoch": 6.524296675191816, "grad_norm": 0.11082138416428153, "learning_rate": 2.801101310214205e-07, "loss": 1.0133, "step": 2551 }, { "epoch": 6.526854219948849, "grad_norm": 0.11831445098631707, "learning_rate": 2.7712024241235757e-07, "loss": 1.0184, "step": 2552 }, { "epoch": 6.529411764705882, "grad_norm": 0.11918281125426747, "learning_rate": 2.7414617232073505e-07, "loss": 1.0344, "step": 2553 }, { "epoch": 6.531969309462916, "grad_norm": 0.11681313613977624, "learning_rate": 2.7118792558518237e-07, "loss": 1.0219, "step": 2554 }, { "epoch": 6.534526854219949, "grad_norm": 0.12570449518559115, "learning_rate": 2.6824550701857966e-07, "loss": 1.0192, "step": 2555 }, { "epoch": 6.537084398976982, "grad_norm": 0.11631595597156608, "learning_rate": 2.653189214080576e-07, "loss": 0.9885, "step": 2556 }, { "epoch": 6.539641943734015, "grad_norm": 0.11976742856004091, "learning_rate": 2.624081735149897e-07, "loss": 1.0225, "step": 2557 }, { "epoch": 6.542199488491049, "grad_norm": 0.11687676414472607, "learning_rate": 2.5951326807498123e-07, "loss": 1.0051, "step": 2558 }, { "epoch": 6.544757033248082, "grad_norm": 0.11626243542745685, "learning_rate": 2.5663420979785915e-07, "loss": 1.0256, "step": 2559 }, { "epoch": 6.547314578005115, "grad_norm": 0.11473271542819383, "learning_rate": 2.5377100336767547e-07, "loss": 1.0134, "step": 2560 }, { "epoch": 6.549872122762149, "grad_norm": 0.11617767916671155, "learning_rate": 2.509236534426851e-07, "loss": 1.0045, "step": 2561 }, { "epoch": 6.552429667519181, "grad_norm": 0.11177045938404909, "learning_rate": 2.4809216465534913e-07, "loss": 1.0377, "step": 2562 }, { "epoch": 6.554987212276215, "grad_norm": 0.11344781404055954, "learning_rate": 2.4527654161232153e-07, "loss": 1.0037, "step": 2563 }, { "epoch": 6.557544757033249, "grad_norm": 0.12399390000812018, "learning_rate": 2.424767888944468e-07, "loss": 1.0462, "step": 2564 }, { "epoch": 6.560102301790281, "grad_norm": 0.11847061868510626, "learning_rate": 2.3969291105674805e-07, "loss": 0.9959, "step": 2565 }, { "epoch": 6.562659846547315, "grad_norm": 0.116920831153564, "learning_rate": 2.3692491262841788e-07, "loss": 0.9783, "step": 2566 }, { "epoch": 6.565217391304348, "grad_norm": 0.12018087616989655, "learning_rate": 2.3417279811281947e-07, "loss": 0.9778, "step": 2567 }, { "epoch": 6.567774936061381, "grad_norm": 0.11727845557913934, "learning_rate": 2.3143657198746893e-07, "loss": 1.042, "step": 2568 }, { "epoch": 6.570332480818414, "grad_norm": 0.1156893274747709, "learning_rate": 2.2871623870403649e-07, "loss": 1.0302, "step": 2569 }, { "epoch": 6.572890025575448, "grad_norm": 0.11720330890092409, "learning_rate": 2.260118026883318e-07, "loss": 1.0267, "step": 2570 }, { "epoch": 6.57544757033248, "grad_norm": 0.11688767903985245, "learning_rate": 2.233232683403075e-07, "loss": 1.0292, "step": 2571 }, { "epoch": 6.578005115089514, "grad_norm": 0.11603026043379294, "learning_rate": 2.206506400340369e-07, "loss": 1.0017, "step": 2572 }, { "epoch": 6.580562659846548, "grad_norm": 0.11389458080146765, "learning_rate": 2.1799392211772074e-07, "loss": 1.0082, "step": 2573 }, { "epoch": 6.58312020460358, "grad_norm": 0.1161474107114186, "learning_rate": 2.1535311891367373e-07, "loss": 1.0219, "step": 2574 }, { "epoch": 6.585677749360614, "grad_norm": 0.11523869949699879, "learning_rate": 2.1272823471831573e-07, "loss": 1.0048, "step": 2575 }, { "epoch": 6.588235294117647, "grad_norm": 0.11447790591214169, "learning_rate": 2.101192738021718e-07, "loss": 1.0116, "step": 2576 }, { "epoch": 6.59079283887468, "grad_norm": 0.11643651666513412, "learning_rate": 2.0752624040985436e-07, "loss": 1.0117, "step": 2577 }, { "epoch": 6.593350383631714, "grad_norm": 0.12040988081003166, "learning_rate": 2.0494913876007105e-07, "loss": 1.0255, "step": 2578 }, { "epoch": 6.595907928388747, "grad_norm": 0.11872708662460554, "learning_rate": 2.0238797304560243e-07, "loss": 1.0241, "step": 2579 }, { "epoch": 6.59846547314578, "grad_norm": 0.10983144316407795, "learning_rate": 1.9984274743330424e-07, "loss": 1.0106, "step": 2580 }, { "epoch": 6.601023017902813, "grad_norm": 0.112895943367732, "learning_rate": 1.9731346606410185e-07, "loss": 1.0405, "step": 2581 }, { "epoch": 6.603580562659847, "grad_norm": 0.11309181158689928, "learning_rate": 1.9480013305297585e-07, "loss": 1.0286, "step": 2582 }, { "epoch": 6.6061381074168795, "grad_norm": 0.11579577875848088, "learning_rate": 1.9230275248896425e-07, "loss": 1.0137, "step": 2583 }, { "epoch": 6.608695652173913, "grad_norm": 0.11932271374275923, "learning_rate": 1.8982132843514577e-07, "loss": 1.0352, "step": 2584 }, { "epoch": 6.611253196930946, "grad_norm": 0.1187240263728754, "learning_rate": 1.8735586492864556e-07, "loss": 0.9899, "step": 2585 }, { "epoch": 6.6138107416879794, "grad_norm": 0.12010362235501355, "learning_rate": 1.8490636598061605e-07, "loss": 1.0202, "step": 2586 }, { "epoch": 6.616368286445013, "grad_norm": 0.11896072789581243, "learning_rate": 1.8247283557624062e-07, "loss": 1.0801, "step": 2587 }, { "epoch": 6.618925831202046, "grad_norm": 0.11269695438058397, "learning_rate": 1.8005527767471998e-07, "loss": 1.0323, "step": 2588 }, { "epoch": 6.621483375959079, "grad_norm": 0.11595014960172056, "learning_rate": 1.7765369620926899e-07, "loss": 1.0247, "step": 2589 }, { "epoch": 6.624040920716112, "grad_norm": 0.11457210948093192, "learning_rate": 1.752680950871144e-07, "loss": 1.0561, "step": 2590 }, { "epoch": 6.626598465473146, "grad_norm": 0.11577860483951284, "learning_rate": 1.7289847818947492e-07, "loss": 1.0182, "step": 2591 }, { "epoch": 6.629156010230179, "grad_norm": 0.11240383490721378, "learning_rate": 1.7054484937157112e-07, "loss": 1.0255, "step": 2592 }, { "epoch": 6.631713554987212, "grad_norm": 0.11631232042116323, "learning_rate": 1.6820721246261106e-07, "loss": 1.0299, "step": 2593 }, { "epoch": 6.634271099744246, "grad_norm": 0.11273655621311057, "learning_rate": 1.6588557126578365e-07, "loss": 1.0407, "step": 2594 }, { "epoch": 6.6368286445012785, "grad_norm": 0.11767164102993428, "learning_rate": 1.6357992955825297e-07, "loss": 1.0145, "step": 2595 }, { "epoch": 6.639386189258312, "grad_norm": 0.11534695075999606, "learning_rate": 1.6129029109115401e-07, "loss": 1.0106, "step": 2596 }, { "epoch": 6.641943734015345, "grad_norm": 0.11539400507669376, "learning_rate": 1.59016659589587e-07, "loss": 0.9862, "step": 2597 }, { "epoch": 6.6445012787723785, "grad_norm": 0.11483047616375414, "learning_rate": 1.567590387526041e-07, "loss": 1.0301, "step": 2598 }, { "epoch": 6.647058823529412, "grad_norm": 0.11260638212850177, "learning_rate": 1.5451743225321726e-07, "loss": 1.0088, "step": 2599 }, { "epoch": 6.649616368286445, "grad_norm": 0.11619144848069289, "learning_rate": 1.5229184373837912e-07, "loss": 1.0117, "step": 2600 }, { "epoch": 6.6521739130434785, "grad_norm": 0.12170161725444163, "learning_rate": 1.5008227682898337e-07, "loss": 1.0345, "step": 2601 }, { "epoch": 6.654731457800511, "grad_norm": 0.11009879990340311, "learning_rate": 1.4788873511985656e-07, "loss": 1.0074, "step": 2602 }, { "epoch": 6.657289002557545, "grad_norm": 0.11242257451547451, "learning_rate": 1.4571122217975298e-07, "loss": 1.0295, "step": 2603 }, { "epoch": 6.659846547314578, "grad_norm": 0.11604613398078274, "learning_rate": 1.4354974155135203e-07, "loss": 1.0287, "step": 2604 }, { "epoch": 6.662404092071611, "grad_norm": 0.11447891191608152, "learning_rate": 1.4140429675124633e-07, "loss": 1.0059, "step": 2605 }, { "epoch": 6.664961636828645, "grad_norm": 0.11195548180186611, "learning_rate": 1.3927489126993932e-07, "loss": 1.0347, "step": 2606 }, { "epoch": 6.667519181585678, "grad_norm": 0.11445065696070437, "learning_rate": 1.3716152857184306e-07, "loss": 1.012, "step": 2607 }, { "epoch": 6.670076726342711, "grad_norm": 0.11614977059279803, "learning_rate": 1.350642120952661e-07, "loss": 0.9918, "step": 2608 }, { "epoch": 6.672634271099744, "grad_norm": 0.11871269418863775, "learning_rate": 1.3298294525241008e-07, "loss": 1.0269, "step": 2609 }, { "epoch": 6.675191815856778, "grad_norm": 0.10866128338893077, "learning_rate": 1.3091773142936525e-07, "loss": 1.0334, "step": 2610 }, { "epoch": 6.677749360613811, "grad_norm": 0.12041795104852608, "learning_rate": 1.2886857398610731e-07, "loss": 0.9974, "step": 2611 }, { "epoch": 6.680306905370844, "grad_norm": 0.11406194376177828, "learning_rate": 1.2683547625648718e-07, "loss": 1.0222, "step": 2612 }, { "epoch": 6.6828644501278776, "grad_norm": 0.11240623577621248, "learning_rate": 1.2481844154822565e-07, "loss": 0.9952, "step": 2613 }, { "epoch": 6.68542199488491, "grad_norm": 0.11514164915047609, "learning_rate": 1.2281747314291437e-07, "loss": 1.0026, "step": 2614 }, { "epoch": 6.687979539641944, "grad_norm": 0.11222335726022206, "learning_rate": 1.208325742960037e-07, "loss": 1.0056, "step": 2615 }, { "epoch": 6.690537084398977, "grad_norm": 0.11243016039454592, "learning_rate": 1.1886374823679825e-07, "loss": 1.0492, "step": 2616 }, { "epoch": 6.69309462915601, "grad_norm": 0.11317201484958644, "learning_rate": 1.1691099816845574e-07, "loss": 1.0213, "step": 2617 }, { "epoch": 6.695652173913043, "grad_norm": 0.1170626311837824, "learning_rate": 1.149743272679793e-07, "loss": 0.9974, "step": 2618 }, { "epoch": 6.698209718670077, "grad_norm": 0.12262867677149476, "learning_rate": 1.1305373868620961e-07, "loss": 0.9967, "step": 2619 }, { "epoch": 6.70076726342711, "grad_norm": 0.11396022297257247, "learning_rate": 1.1114923554782608e-07, "loss": 0.9956, "step": 2620 }, { "epoch": 6.703324808184143, "grad_norm": 0.11735281558425238, "learning_rate": 1.0926082095133572e-07, "loss": 1.0193, "step": 2621 }, { "epoch": 6.705882352941177, "grad_norm": 0.12029512917783149, "learning_rate": 1.0738849796907091e-07, "loss": 1.0473, "step": 2622 }, { "epoch": 6.708439897698209, "grad_norm": 0.11312555151340069, "learning_rate": 1.0553226964718277e-07, "loss": 1.008, "step": 2623 }, { "epoch": 6.710997442455243, "grad_norm": 0.11541322342299927, "learning_rate": 1.0369213900564001e-07, "loss": 1.0029, "step": 2624 }, { "epoch": 6.713554987212277, "grad_norm": 0.11302071428638145, "learning_rate": 1.0186810903822119e-07, "loss": 0.9623, "step": 2625 }, { "epoch": 6.716112531969309, "grad_norm": 0.11291140484686953, "learning_rate": 1.0006018271250695e-07, "loss": 1.0305, "step": 2626 }, { "epoch": 6.718670076726343, "grad_norm": 0.11524487387426563, "learning_rate": 9.826836296988107e-08, "loss": 1.0596, "step": 2627 }, { "epoch": 6.721227621483376, "grad_norm": 0.11543260535666969, "learning_rate": 9.649265272552277e-08, "loss": 1.0237, "step": 2628 }, { "epoch": 6.723785166240409, "grad_norm": 0.11302904037284935, "learning_rate": 9.473305486840112e-08, "loss": 1.0177, "step": 2629 }, { "epoch": 6.726342710997442, "grad_norm": 0.11210024116892857, "learning_rate": 9.29895722612717e-08, "loss": 1.0284, "step": 2630 }, { "epoch": 6.728900255754476, "grad_norm": 0.11611360048557691, "learning_rate": 9.126220774067218e-08, "loss": 1.0313, "step": 2631 }, { "epoch": 6.731457800511509, "grad_norm": 0.11281080704543008, "learning_rate": 8.955096411691566e-08, "loss": 1.0156, "step": 2632 }, { "epoch": 6.734015345268542, "grad_norm": 0.11192307343079083, "learning_rate": 8.785584417409065e-08, "loss": 1.0173, "step": 2633 }, { "epoch": 6.736572890025576, "grad_norm": 0.11483249975315203, "learning_rate": 8.617685067004777e-08, "loss": 1.0269, "step": 2634 }, { "epoch": 6.739130434782608, "grad_norm": 0.11652633110386056, "learning_rate": 8.451398633640861e-08, "loss": 0.9978, "step": 2635 }, { "epoch": 6.741687979539642, "grad_norm": 0.11193935061569056, "learning_rate": 8.286725387854689e-08, "loss": 1.0166, "step": 2636 }, { "epoch": 6.744245524296675, "grad_norm": 0.1132575109344062, "learning_rate": 8.123665597559393e-08, "loss": 1.03, "step": 2637 }, { "epoch": 6.746803069053708, "grad_norm": 0.10909141114205528, "learning_rate": 7.962219528042991e-08, "loss": 0.9843, "step": 2638 }, { "epoch": 6.749360613810742, "grad_norm": 0.11510554903103819, "learning_rate": 7.802387441968262e-08, "loss": 1.0058, "step": 2639 }, { "epoch": 6.751918158567775, "grad_norm": 0.1126125629269261, "learning_rate": 7.644169599371975e-08, "loss": 1.0451, "step": 2640 }, { "epoch": 6.754475703324808, "grad_norm": 0.11361718582807691, "learning_rate": 7.487566257664558e-08, "loss": 1.0447, "step": 2641 }, { "epoch": 6.757033248081841, "grad_norm": 0.11201362418480085, "learning_rate": 7.332577671629982e-08, "loss": 1.0003, "step": 2642 }, { "epoch": 6.759590792838875, "grad_norm": 0.11250812055949669, "learning_rate": 7.179204093424985e-08, "loss": 1.0152, "step": 2643 }, { "epoch": 6.762148337595908, "grad_norm": 0.11340595916397253, "learning_rate": 7.027445772578856e-08, "loss": 1.0136, "step": 2644 }, { "epoch": 6.764705882352941, "grad_norm": 0.11043173067397596, "learning_rate": 6.877302955992649e-08, "loss": 1.0039, "step": 2645 }, { "epoch": 6.767263427109975, "grad_norm": 0.11320152606971275, "learning_rate": 6.72877588793952e-08, "loss": 1.0263, "step": 2646 }, { "epoch": 6.7698209718670075, "grad_norm": 0.11555065643180781, "learning_rate": 6.581864810063732e-08, "loss": 1.0095, "step": 2647 }, { "epoch": 6.772378516624041, "grad_norm": 0.1114703182443358, "learning_rate": 6.436569961380313e-08, "loss": 1.0014, "step": 2648 }, { "epoch": 6.774936061381074, "grad_norm": 0.11945044598900786, "learning_rate": 6.292891578275063e-08, "loss": 1.0308, "step": 2649 }, { "epoch": 6.7774936061381075, "grad_norm": 0.11250868328242511, "learning_rate": 6.150829894503662e-08, "loss": 1.0107, "step": 2650 }, { "epoch": 6.78005115089514, "grad_norm": 0.11491638958663465, "learning_rate": 6.010385141191455e-08, "loss": 1.0279, "step": 2651 }, { "epoch": 6.782608695652174, "grad_norm": 0.1160903563132126, "learning_rate": 5.8715575468333286e-08, "loss": 1.0067, "step": 2652 }, { "epoch": 6.7851662404092075, "grad_norm": 0.11673880519657757, "learning_rate": 5.734347337293167e-08, "loss": 1.0253, "step": 2653 }, { "epoch": 6.78772378516624, "grad_norm": 0.11345092121417273, "learning_rate": 5.598754735803513e-08, "loss": 1.0256, "step": 2654 }, { "epoch": 6.790281329923274, "grad_norm": 0.11245719320265857, "learning_rate": 5.464779962964795e-08, "loss": 1.023, "step": 2655 }, { "epoch": 6.792838874680307, "grad_norm": 0.11318781711220266, "learning_rate": 5.332423236745765e-08, "loss": 0.9817, "step": 2656 }, { "epoch": 6.79539641943734, "grad_norm": 0.11393255984182678, "learning_rate": 5.201684772482507e-08, "loss": 0.9919, "step": 2657 }, { "epoch": 6.797953964194374, "grad_norm": 0.1114106983420887, "learning_rate": 5.0725647828783196e-08, "loss": 0.9949, "step": 2658 }, { "epoch": 6.8005115089514065, "grad_norm": 0.11613702586163382, "learning_rate": 4.945063478003276e-08, "loss": 1.0246, "step": 2659 }, { "epoch": 6.80306905370844, "grad_norm": 0.11426036986413816, "learning_rate": 4.8191810652941096e-08, "loss": 1.0434, "step": 2660 }, { "epoch": 6.805626598465473, "grad_norm": 0.11654706791098739, "learning_rate": 4.694917749553663e-08, "loss": 1.0256, "step": 2661 }, { "epoch": 6.8081841432225065, "grad_norm": 0.10999242921563646, "learning_rate": 4.5722737329505495e-08, "loss": 0.9802, "step": 2662 }, { "epoch": 6.810741687979539, "grad_norm": 0.11948231260555445, "learning_rate": 4.451249215018827e-08, "loss": 1.0593, "step": 2663 }, { "epoch": 6.813299232736573, "grad_norm": 0.11285924950704992, "learning_rate": 4.331844392657991e-08, "loss": 1.026, "step": 2664 }, { "epoch": 6.8158567774936065, "grad_norm": 0.11299230638204774, "learning_rate": 4.2140594601320915e-08, "loss": 1.0162, "step": 2665 }, { "epoch": 6.818414322250639, "grad_norm": 0.11595950299690573, "learning_rate": 4.097894609069841e-08, "loss": 0.9853, "step": 2666 }, { "epoch": 6.820971867007673, "grad_norm": 0.1155751170348188, "learning_rate": 3.983350028464283e-08, "loss": 1.0022, "step": 2667 }, { "epoch": 6.823529411764706, "grad_norm": 0.11385145480733656, "learning_rate": 3.870425904672237e-08, "loss": 1.0905, "step": 2668 }, { "epoch": 6.826086956521739, "grad_norm": 0.11441498941945787, "learning_rate": 3.7591224214141855e-08, "loss": 1.032, "step": 2669 }, { "epoch": 6.828644501278772, "grad_norm": 0.11622957694085463, "learning_rate": 3.649439759773943e-08, "loss": 1.0273, "step": 2670 }, { "epoch": 6.831202046035806, "grad_norm": 0.11310752492427763, "learning_rate": 3.541378098198323e-08, "loss": 1.0202, "step": 2671 }, { "epoch": 6.833759590792839, "grad_norm": 0.11205928985871322, "learning_rate": 3.4349376124969136e-08, "loss": 0.9919, "step": 2672 }, { "epoch": 6.836317135549872, "grad_norm": 0.11055590937853152, "learning_rate": 3.330118475841859e-08, "loss": 1.019, "step": 2673 }, { "epoch": 6.838874680306906, "grad_norm": 0.11098200209047006, "learning_rate": 3.22692085876708e-08, "loss": 0.9972, "step": 2674 }, { "epoch": 6.841432225063938, "grad_norm": 0.11522340948350532, "learning_rate": 3.125344929168828e-08, "loss": 1.0004, "step": 2675 }, { "epoch": 6.843989769820972, "grad_norm": 0.11422976235509531, "learning_rate": 3.025390852304688e-08, "loss": 1.0273, "step": 2676 }, { "epoch": 6.846547314578006, "grad_norm": 0.11018216168196639, "learning_rate": 2.927058790793802e-08, "loss": 1.0102, "step": 2677 }, { "epoch": 6.849104859335038, "grad_norm": 0.10995140569223621, "learning_rate": 2.830348904616198e-08, "loss": 0.991, "step": 2678 }, { "epoch": 6.851662404092072, "grad_norm": 0.11543991907521552, "learning_rate": 2.7352613511127946e-08, "loss": 1.0338, "step": 2679 }, { "epoch": 6.854219948849105, "grad_norm": 0.11129720513762761, "learning_rate": 2.6417962849852875e-08, "loss": 1.0094, "step": 2680 }, { "epoch": 6.856777493606138, "grad_norm": 0.1097107046759256, "learning_rate": 2.549953858295262e-08, "loss": 1.0208, "step": 2681 }, { "epoch": 6.859335038363171, "grad_norm": 0.1181695445768175, "learning_rate": 2.459734220464638e-08, "loss": 1.0015, "step": 2682 }, { "epoch": 6.861892583120205, "grad_norm": 0.11107816598809478, "learning_rate": 2.3711375182753347e-08, "loss": 1.0261, "step": 2683 }, { "epoch": 6.864450127877237, "grad_norm": 0.10839159774339671, "learning_rate": 2.2841638958683855e-08, "loss": 1.0135, "step": 2684 }, { "epoch": 6.867007672634271, "grad_norm": 0.1121417586939987, "learning_rate": 2.1988134947446004e-08, "loss": 1.0035, "step": 2685 }, { "epoch": 6.869565217391305, "grad_norm": 0.11209845991644457, "learning_rate": 2.1150864537636817e-08, "loss": 1.0321, "step": 2686 }, { "epoch": 6.872122762148337, "grad_norm": 0.11303462530389491, "learning_rate": 2.032982909144332e-08, "loss": 1.012, "step": 2687 }, { "epoch": 6.874680306905371, "grad_norm": 0.11117791002965544, "learning_rate": 1.9525029944637008e-08, "loss": 0.9929, "step": 2688 }, { "epoch": 6.877237851662404, "grad_norm": 0.1089777437805983, "learning_rate": 1.8736468406579388e-08, "loss": 0.9931, "step": 2689 }, { "epoch": 6.879795396419437, "grad_norm": 0.11251100033934079, "learning_rate": 1.796414576020755e-08, "loss": 1.0153, "step": 2690 }, { "epoch": 6.882352941176471, "grad_norm": 0.11299998984552379, "learning_rate": 1.720806326204305e-08, "loss": 1.005, "step": 2691 }, { "epoch": 6.884910485933504, "grad_norm": 0.11290626743296132, "learning_rate": 1.646822214218524e-08, "loss": 1.049, "step": 2692 }, { "epoch": 6.887468030690537, "grad_norm": 0.11186130749976496, "learning_rate": 1.5744623604310172e-08, "loss": 1.003, "step": 2693 }, { "epoch": 6.89002557544757, "grad_norm": 0.11028332749990057, "learning_rate": 1.503726882566503e-08, "loss": 0.9892, "step": 2694 }, { "epoch": 6.892583120204604, "grad_norm": 0.11457205700764143, "learning_rate": 1.4346158957073696e-08, "loss": 1.0261, "step": 2695 }, { "epoch": 6.8951406649616365, "grad_norm": 0.11434079231719742, "learning_rate": 1.3671295122928974e-08, "loss": 1.0118, "step": 2696 }, { "epoch": 6.89769820971867, "grad_norm": 0.11590548541933458, "learning_rate": 1.3012678421191471e-08, "loss": 1.0397, "step": 2697 }, { "epoch": 6.900255754475703, "grad_norm": 0.11241776946007812, "learning_rate": 1.2370309923388501e-08, "loss": 1.0214, "step": 2698 }, { "epoch": 6.9028132992327365, "grad_norm": 0.11386908312296881, "learning_rate": 1.1744190674614076e-08, "loss": 1.0249, "step": 2699 }, { "epoch": 6.90537084398977, "grad_norm": 0.1111155708841944, "learning_rate": 1.1134321693525574e-08, "loss": 1.0013, "step": 2700 }, { "epoch": 6.907928388746803, "grad_norm": 0.11383791079341445, "learning_rate": 1.0540703972341525e-08, "loss": 1.0148, "step": 2701 }, { "epoch": 6.910485933503836, "grad_norm": 0.11458774717785482, "learning_rate": 9.963338476840501e-09, "loss": 1.029, "step": 2702 }, { "epoch": 6.913043478260869, "grad_norm": 0.11295695096599505, "learning_rate": 9.402226146361104e-09, "loss": 1.0136, "step": 2703 }, { "epoch": 6.915601023017903, "grad_norm": 0.11389257052620162, "learning_rate": 8.857367893796431e-09, "loss": 0.9989, "step": 2704 }, { "epoch": 6.918158567774936, "grad_norm": 0.11405136091559014, "learning_rate": 8.328764605597395e-09, "loss": 1.0239, "step": 2705 }, { "epoch": 6.920716112531969, "grad_norm": 0.11514239271194625, "learning_rate": 7.816417141768284e-09, "loss": 1.041, "step": 2706 }, { "epoch": 6.923273657289003, "grad_norm": 0.11236159186101047, "learning_rate": 7.3203263358678775e-09, "loss": 1.0297, "step": 2707 }, { "epoch": 6.9258312020460355, "grad_norm": 0.112779013609661, "learning_rate": 6.840492995002779e-09, "loss": 1.0177, "step": 2708 }, { "epoch": 6.928388746803069, "grad_norm": 0.11154163182583252, "learning_rate": 6.376917899832968e-09, "loss": 1.0262, "step": 2709 }, { "epoch": 6.930946291560103, "grad_norm": 0.11358295898234577, "learning_rate": 5.929601804566254e-09, "loss": 1.0057, "step": 2710 }, { "epoch": 6.9335038363171355, "grad_norm": 0.11003717187565273, "learning_rate": 5.498545436957159e-09, "loss": 1.0269, "step": 2711 }, { "epoch": 6.936061381074169, "grad_norm": 0.10600474645039837, "learning_rate": 5.0837494983091425e-09, "loss": 0.9854, "step": 2712 }, { "epoch": 6.938618925831202, "grad_norm": 0.10929642667614789, "learning_rate": 4.6852146634668304e-09, "loss": 1.0149, "step": 2713 }, { "epoch": 6.9411764705882355, "grad_norm": 0.11582392789733863, "learning_rate": 4.302941580823783e-09, "loss": 0.9864, "step": 2714 }, { "epoch": 6.943734015345268, "grad_norm": 0.11406855862931596, "learning_rate": 3.936930872312506e-09, "loss": 1.0296, "step": 2715 }, { "epoch": 6.946291560102302, "grad_norm": 0.11629050448797144, "learning_rate": 3.5871831334099992e-09, "loss": 1.0319, "step": 2716 }, { "epoch": 6.948849104859335, "grad_norm": 0.11235711633426523, "learning_rate": 3.2536989331355406e-09, "loss": 1.0061, "step": 2717 }, { "epoch": 6.951406649616368, "grad_norm": 0.11339029722347495, "learning_rate": 2.9364788140451296e-09, "loss": 1.0558, "step": 2718 }, { "epoch": 6.953964194373402, "grad_norm": 0.1122327401431765, "learning_rate": 2.635523292237041e-09, "loss": 1.043, "step": 2719 }, { "epoch": 6.956521739130435, "grad_norm": 0.1150922652013077, "learning_rate": 2.3508328573462745e-09, "loss": 1.0157, "step": 2720 }, { "epoch": 6.959079283887468, "grad_norm": 0.11034749878838018, "learning_rate": 2.082407972547884e-09, "loss": 1.0172, "step": 2721 }, { "epoch": 6.961636828644501, "grad_norm": 0.11414568906111035, "learning_rate": 1.8302490745503166e-09, "loss": 1.0294, "step": 2722 }, { "epoch": 6.964194373401535, "grad_norm": 0.11166620944982035, "learning_rate": 1.5943565736020739e-09, "loss": 1.0242, "step": 2723 }, { "epoch": 6.966751918158568, "grad_norm": 0.11672921275884213, "learning_rate": 1.3747308534850512e-09, "loss": 1.0372, "step": 2724 }, { "epoch": 6.969309462915601, "grad_norm": 0.11540312400728218, "learning_rate": 1.1713722715167575e-09, "loss": 1.0515, "step": 2725 }, { "epoch": 6.971867007672635, "grad_norm": 0.11588267312835213, "learning_rate": 9.84281158548095e-10, "loss": 1.0291, "step": 2726 }, { "epoch": 6.974424552429667, "grad_norm": 0.11642536438528109, "learning_rate": 8.134578189644692e-10, "loss": 1.013, "step": 2727 }, { "epoch": 6.976982097186701, "grad_norm": 0.11741237126233431, "learning_rate": 6.589025306869002e-10, "loss": 1.0054, "step": 2728 }, { "epoch": 6.979539641943734, "grad_norm": 0.1116075879721608, "learning_rate": 5.206155451642491e-10, "loss": 1.0299, "step": 2729 }, { "epoch": 6.982097186700767, "grad_norm": 0.11444442287287329, "learning_rate": 3.985970873821021e-10, "loss": 1.0413, "step": 2730 }, { "epoch": 6.9846547314578, "grad_norm": 0.12160291833827606, "learning_rate": 2.928473558583278e-10, "loss": 1.0317, "step": 2731 }, { "epoch": 6.987212276214834, "grad_norm": 0.1124635627813877, "learning_rate": 2.033665226386372e-10, "loss": 1.0144, "step": 2732 }, { "epoch": 6.989769820971867, "grad_norm": 0.11276149081438312, "learning_rate": 1.301547333032449e-10, "loss": 1.0007, "step": 2733 }, { "epoch": 6.9923273657289, "grad_norm": 0.10984392228143453, "learning_rate": 7.321210696464853e-11, "loss": 0.9763, "step": 2734 }, { "epoch": 6.994884910485934, "grad_norm": 0.11019543161726779, "learning_rate": 3.253873626429816e-11, "loss": 1.0013, "step": 2735 }, { "epoch": 6.997442455242966, "grad_norm": 0.11197749059770203, "learning_rate": 8.134687374816708e-12, "loss": 1.0472, "step": 2736 }, { "epoch": 7.0, "grad_norm": 0.11208987109779546, "learning_rate": 0.0, "loss": 0.9774, "step": 2737 }, { "epoch": 7.0, "step": 2737, "total_flos": 9969287656374272.0, "train_loss": 1.063590354692078, "train_runtime": 97730.0822, "train_samples_per_second": 7.163, "train_steps_per_second": 0.028 } ], "logging_steps": 1.0, "max_steps": 2737, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9969287656374272.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }