{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025575447570332483, "grad_norm": 2.9563186457899664, "learning_rate": 8.163265306122449e-07, "loss": 1.5213, "step": 1 }, { "epoch": 0.005115089514066497, "grad_norm": 2.9570403686006705, "learning_rate": 1.6326530612244897e-06, "loss": 1.4742, "step": 2 }, { "epoch": 0.0076726342710997444, "grad_norm": 3.010165733072805, "learning_rate": 2.4489795918367347e-06, "loss": 1.4946, "step": 3 }, { "epoch": 0.010230179028132993, "grad_norm": 2.868210172096221, "learning_rate": 3.2653061224489794e-06, "loss": 1.482, "step": 4 }, { "epoch": 0.01278772378516624, "grad_norm": 2.6518374719836477, "learning_rate": 4.081632653061225e-06, "loss": 1.4866, "step": 5 }, { "epoch": 0.015345268542199489, "grad_norm": 2.0719187075670176, "learning_rate": 4.897959183673469e-06, "loss": 1.4844, "step": 6 }, { "epoch": 0.017902813299232736, "grad_norm": 1.8691931463314044, "learning_rate": 5.7142857142857145e-06, "loss": 1.4533, "step": 7 }, { "epoch": 0.020460358056265986, "grad_norm": 1.8092896927352589, "learning_rate": 6.530612244897959e-06, "loss": 1.4433, "step": 8 }, { "epoch": 0.023017902813299233, "grad_norm": 1.7543993002445608, "learning_rate": 7.346938775510205e-06, "loss": 1.4744, "step": 9 }, { "epoch": 0.02557544757033248, "grad_norm": 1.6606628173638305, "learning_rate": 8.16326530612245e-06, "loss": 1.4586, "step": 10 }, { "epoch": 0.028132992327365727, "grad_norm": 2.197985553952399, "learning_rate": 8.979591836734695e-06, "loss": 1.4315, "step": 11 }, { "epoch": 0.030690537084398978, "grad_norm": 2.096672912966444, "learning_rate": 9.795918367346939e-06, "loss": 1.3907, "step": 12 }, { "epoch": 0.03324808184143223, "grad_norm": 1.7669816182231157, "learning_rate": 1.0612244897959186e-05, "loss": 1.4234, "step": 13 }, { "epoch": 0.03580562659846547, "grad_norm": 1.3020764177290665, "learning_rate": 1.1428571428571429e-05, "loss": 1.3478, "step": 14 }, { "epoch": 0.03836317135549872, "grad_norm": 1.2917276833945952, "learning_rate": 1.2244897959183674e-05, "loss": 1.378, "step": 15 }, { "epoch": 0.04092071611253197, "grad_norm": 0.9647900041095249, "learning_rate": 1.3061224489795918e-05, "loss": 1.3273, "step": 16 }, { "epoch": 0.043478260869565216, "grad_norm": 0.998986811649884, "learning_rate": 1.3877551020408165e-05, "loss": 1.3424, "step": 17 }, { "epoch": 0.04603580562659847, "grad_norm": 0.8293785359427173, "learning_rate": 1.469387755102041e-05, "loss": 1.3354, "step": 18 }, { "epoch": 0.04859335038363171, "grad_norm": 0.7442208693017255, "learning_rate": 1.5510204081632655e-05, "loss": 1.3216, "step": 19 }, { "epoch": 0.05115089514066496, "grad_norm": 0.8334097235660463, "learning_rate": 1.63265306122449e-05, "loss": 1.305, "step": 20 }, { "epoch": 0.05370843989769821, "grad_norm": 0.7133053870238929, "learning_rate": 1.7142857142857142e-05, "loss": 1.2863, "step": 21 }, { "epoch": 0.056265984654731455, "grad_norm": 0.5994613004850937, "learning_rate": 1.795918367346939e-05, "loss": 1.31, "step": 22 }, { "epoch": 0.058823529411764705, "grad_norm": 0.6168319603979278, "learning_rate": 1.8775510204081636e-05, "loss": 1.2652, "step": 23 }, { "epoch": 0.061381074168797956, "grad_norm": 0.5934674503101482, "learning_rate": 1.9591836734693877e-05, "loss": 1.2848, "step": 24 }, { "epoch": 0.0639386189258312, "grad_norm": 0.5809141308410171, "learning_rate": 2.0408163265306126e-05, "loss": 1.2605, "step": 25 }, { "epoch": 0.06649616368286446, "grad_norm": 0.5544963829723922, "learning_rate": 2.122448979591837e-05, "loss": 1.2663, "step": 26 }, { "epoch": 0.06905370843989769, "grad_norm": 0.5109525040926751, "learning_rate": 2.2040816326530613e-05, "loss": 1.2493, "step": 27 }, { "epoch": 0.07161125319693094, "grad_norm": 0.47071086900075043, "learning_rate": 2.2857142857142858e-05, "loss": 1.2725, "step": 28 }, { "epoch": 0.0741687979539642, "grad_norm": 0.47760033429842697, "learning_rate": 2.3673469387755103e-05, "loss": 1.2493, "step": 29 }, { "epoch": 0.07672634271099744, "grad_norm": 0.47942640683455684, "learning_rate": 2.448979591836735e-05, "loss": 1.2635, "step": 30 }, { "epoch": 0.0792838874680307, "grad_norm": 0.3817784984018378, "learning_rate": 2.5306122448979597e-05, "loss": 1.2581, "step": 31 }, { "epoch": 0.08184143222506395, "grad_norm": 0.41863028873772656, "learning_rate": 2.6122448979591835e-05, "loss": 1.2319, "step": 32 }, { "epoch": 0.08439897698209718, "grad_norm": 0.4561646749370822, "learning_rate": 2.6938775510204084e-05, "loss": 1.2647, "step": 33 }, { "epoch": 0.08695652173913043, "grad_norm": 0.32944852639889954, "learning_rate": 2.775510204081633e-05, "loss": 1.2828, "step": 34 }, { "epoch": 0.08951406649616368, "grad_norm": 0.36090683632534276, "learning_rate": 2.8571428571428574e-05, "loss": 1.2245, "step": 35 }, { "epoch": 0.09207161125319693, "grad_norm": 0.36952861081098753, "learning_rate": 2.938775510204082e-05, "loss": 1.2383, "step": 36 }, { "epoch": 0.09462915601023018, "grad_norm": 0.39714992118388376, "learning_rate": 3.020408163265306e-05, "loss": 1.2524, "step": 37 }, { "epoch": 0.09718670076726342, "grad_norm": 0.3567290279003148, "learning_rate": 3.102040816326531e-05, "loss": 1.229, "step": 38 }, { "epoch": 0.09974424552429667, "grad_norm": 0.3806643799838351, "learning_rate": 3.183673469387755e-05, "loss": 1.2438, "step": 39 }, { "epoch": 0.10230179028132992, "grad_norm": 0.407422548294049, "learning_rate": 3.26530612244898e-05, "loss": 1.1862, "step": 40 }, { "epoch": 0.10485933503836317, "grad_norm": 0.34463209168828013, "learning_rate": 3.346938775510204e-05, "loss": 1.2127, "step": 41 }, { "epoch": 0.10741687979539642, "grad_norm": 0.36477387999624367, "learning_rate": 3.4285714285714284e-05, "loss": 1.2118, "step": 42 }, { "epoch": 0.10997442455242967, "grad_norm": 0.33681318596769666, "learning_rate": 3.510204081632653e-05, "loss": 1.1849, "step": 43 }, { "epoch": 0.11253196930946291, "grad_norm": 0.3683055012446813, "learning_rate": 3.591836734693878e-05, "loss": 1.1965, "step": 44 }, { "epoch": 0.11508951406649616, "grad_norm": 0.3236097196989051, "learning_rate": 3.673469387755102e-05, "loss": 1.1973, "step": 45 }, { "epoch": 0.11764705882352941, "grad_norm": 0.45336744047964317, "learning_rate": 3.755102040816327e-05, "loss": 1.219, "step": 46 }, { "epoch": 0.12020460358056266, "grad_norm": 0.6485049911187234, "learning_rate": 3.836734693877551e-05, "loss": 1.22, "step": 47 }, { "epoch": 0.12276214833759591, "grad_norm": 0.7308887737693851, "learning_rate": 3.9183673469387755e-05, "loss": 1.1927, "step": 48 }, { "epoch": 0.12531969309462915, "grad_norm": 0.7412779741523179, "learning_rate": 4e-05, "loss": 1.207, "step": 49 }, { "epoch": 0.1278772378516624, "grad_norm": 0.61907561491782, "learning_rate": 4.081632653061225e-05, "loss": 1.1761, "step": 50 }, { "epoch": 0.13043478260869565, "grad_norm": 0.5645180027937694, "learning_rate": 4.1632653061224494e-05, "loss": 1.1828, "step": 51 }, { "epoch": 0.1329923273657289, "grad_norm": 0.6097938476878244, "learning_rate": 4.244897959183674e-05, "loss": 1.1645, "step": 52 }, { "epoch": 0.13554987212276215, "grad_norm": 0.68105585214221, "learning_rate": 4.3265306122448984e-05, "loss": 1.1663, "step": 53 }, { "epoch": 0.13810741687979539, "grad_norm": 0.5148592364190684, "learning_rate": 4.4081632653061226e-05, "loss": 1.2013, "step": 54 }, { "epoch": 0.14066496163682865, "grad_norm": 0.6290537917728678, "learning_rate": 4.489795918367347e-05, "loss": 1.2142, "step": 55 }, { "epoch": 0.1432225063938619, "grad_norm": 0.8770682994258979, "learning_rate": 4.5714285714285716e-05, "loss": 1.2031, "step": 56 }, { "epoch": 0.14578005115089515, "grad_norm": 1.211521452597314, "learning_rate": 4.6530612244897965e-05, "loss": 1.1872, "step": 57 }, { "epoch": 0.1483375959079284, "grad_norm": 1.1706192692433377, "learning_rate": 4.7346938775510206e-05, "loss": 1.2026, "step": 58 }, { "epoch": 0.15089514066496162, "grad_norm": 1.0347528096815952, "learning_rate": 4.8163265306122455e-05, "loss": 1.1698, "step": 59 }, { "epoch": 0.1534526854219949, "grad_norm": 0.8917967843832559, "learning_rate": 4.89795918367347e-05, "loss": 1.1935, "step": 60 }, { "epoch": 0.15601023017902813, "grad_norm": 0.8447536110052303, "learning_rate": 4.9795918367346945e-05, "loss": 1.1816, "step": 61 }, { "epoch": 0.1585677749360614, "grad_norm": 1.0808910383761972, "learning_rate": 5.0612244897959194e-05, "loss": 1.2148, "step": 62 }, { "epoch": 0.16112531969309463, "grad_norm": 1.0232789536513451, "learning_rate": 5.1428571428571436e-05, "loss": 1.1974, "step": 63 }, { "epoch": 0.1636828644501279, "grad_norm": 0.870169282881004, "learning_rate": 5.224489795918367e-05, "loss": 1.1914, "step": 64 }, { "epoch": 0.16624040920716113, "grad_norm": 0.7292663989493176, "learning_rate": 5.306122448979592e-05, "loss": 1.183, "step": 65 }, { "epoch": 0.16879795396419436, "grad_norm": 0.8315009268144099, "learning_rate": 5.387755102040817e-05, "loss": 1.1457, "step": 66 }, { "epoch": 0.17135549872122763, "grad_norm": 1.04261775715331, "learning_rate": 5.469387755102041e-05, "loss": 1.1724, "step": 67 }, { "epoch": 0.17391304347826086, "grad_norm": 1.0040970248925822, "learning_rate": 5.551020408163266e-05, "loss": 1.1469, "step": 68 }, { "epoch": 0.17647058823529413, "grad_norm": 1.0399514999943609, "learning_rate": 5.63265306122449e-05, "loss": 1.1335, "step": 69 }, { "epoch": 0.17902813299232737, "grad_norm": 0.9541534570834667, "learning_rate": 5.714285714285715e-05, "loss": 1.1475, "step": 70 }, { "epoch": 0.1815856777493606, "grad_norm": 1.155886502502828, "learning_rate": 5.79591836734694e-05, "loss": 1.1595, "step": 71 }, { "epoch": 0.18414322250639387, "grad_norm": 1.4920355778823207, "learning_rate": 5.877551020408164e-05, "loss": 1.1764, "step": 72 }, { "epoch": 0.1867007672634271, "grad_norm": 0.8392580472572768, "learning_rate": 5.959183673469389e-05, "loss": 1.2046, "step": 73 }, { "epoch": 0.18925831202046037, "grad_norm": 1.3327976055634758, "learning_rate": 6.040816326530612e-05, "loss": 1.1601, "step": 74 }, { "epoch": 0.1918158567774936, "grad_norm": 1.2349989957797203, "learning_rate": 6.122448979591836e-05, "loss": 1.1524, "step": 75 }, { "epoch": 0.19437340153452684, "grad_norm": 1.1978584662405511, "learning_rate": 6.204081632653062e-05, "loss": 1.1559, "step": 76 }, { "epoch": 0.1969309462915601, "grad_norm": 1.0353931821191475, "learning_rate": 6.285714285714286e-05, "loss": 1.1657, "step": 77 }, { "epoch": 0.19948849104859334, "grad_norm": 0.9094148187384907, "learning_rate": 6.36734693877551e-05, "loss": 1.1471, "step": 78 }, { "epoch": 0.2020460358056266, "grad_norm": 1.187032715727395, "learning_rate": 6.448979591836736e-05, "loss": 1.1408, "step": 79 }, { "epoch": 0.20460358056265984, "grad_norm": 1.0732720468700825, "learning_rate": 6.53061224489796e-05, "loss": 1.1565, "step": 80 }, { "epoch": 0.2071611253196931, "grad_norm": 0.8103161887096414, "learning_rate": 6.612244897959184e-05, "loss": 1.1394, "step": 81 }, { "epoch": 0.20971867007672634, "grad_norm": 1.0683800405517745, "learning_rate": 6.693877551020408e-05, "loss": 1.1366, "step": 82 }, { "epoch": 0.21227621483375958, "grad_norm": 1.0570001635125388, "learning_rate": 6.775510204081634e-05, "loss": 1.1521, "step": 83 }, { "epoch": 0.21483375959079284, "grad_norm": 1.0038253962694932, "learning_rate": 6.857142857142857e-05, "loss": 1.1485, "step": 84 }, { "epoch": 0.21739130434782608, "grad_norm": 1.1691956641199828, "learning_rate": 6.938775510204082e-05, "loss": 1.199, "step": 85 }, { "epoch": 0.21994884910485935, "grad_norm": 1.160205747766507, "learning_rate": 7.020408163265306e-05, "loss": 1.15, "step": 86 }, { "epoch": 0.22250639386189258, "grad_norm": 1.0403901594667788, "learning_rate": 7.10204081632653e-05, "loss": 1.1547, "step": 87 }, { "epoch": 0.22506393861892582, "grad_norm": 1.253302691517826, "learning_rate": 7.183673469387756e-05, "loss": 1.1808, "step": 88 }, { "epoch": 0.22762148337595908, "grad_norm": 1.0181115029064822, "learning_rate": 7.26530612244898e-05, "loss": 1.1399, "step": 89 }, { "epoch": 0.23017902813299232, "grad_norm": 1.179029120883534, "learning_rate": 7.346938775510205e-05, "loss": 1.1709, "step": 90 }, { "epoch": 0.23273657289002558, "grad_norm": 0.8065046535786934, "learning_rate": 7.42857142857143e-05, "loss": 1.1649, "step": 91 }, { "epoch": 0.23529411764705882, "grad_norm": 0.9920804997259105, "learning_rate": 7.510204081632654e-05, "loss": 1.1693, "step": 92 }, { "epoch": 0.23785166240409208, "grad_norm": 1.4041632222361236, "learning_rate": 7.591836734693878e-05, "loss": 1.1525, "step": 93 }, { "epoch": 0.24040920716112532, "grad_norm": 1.1202267325769892, "learning_rate": 7.673469387755103e-05, "loss": 1.1727, "step": 94 }, { "epoch": 0.24296675191815856, "grad_norm": 0.9214700487119486, "learning_rate": 7.755102040816327e-05, "loss": 1.1193, "step": 95 }, { "epoch": 0.24552429667519182, "grad_norm": 0.9731714014969046, "learning_rate": 7.836734693877551e-05, "loss": 1.1605, "step": 96 }, { "epoch": 0.24808184143222506, "grad_norm": 1.2370098654676154, "learning_rate": 7.918367346938776e-05, "loss": 1.1663, "step": 97 }, { "epoch": 0.2506393861892583, "grad_norm": 0.856182416906324, "learning_rate": 8e-05, "loss": 1.134, "step": 98 }, { "epoch": 0.2531969309462916, "grad_norm": 1.0086218583881408, "learning_rate": 8.081632653061225e-05, "loss": 1.1307, "step": 99 }, { "epoch": 0.2557544757033248, "grad_norm": 1.3360855997576195, "learning_rate": 8.16326530612245e-05, "loss": 1.1283, "step": 100 }, { "epoch": 0.25831202046035806, "grad_norm": 1.1442169715816302, "learning_rate": 8.244897959183673e-05, "loss": 1.1486, "step": 101 }, { "epoch": 0.2608695652173913, "grad_norm": 0.9528964485268216, "learning_rate": 8.326530612244899e-05, "loss": 1.1539, "step": 102 }, { "epoch": 0.26342710997442453, "grad_norm": 1.2014260964822987, "learning_rate": 8.408163265306123e-05, "loss": 1.1246, "step": 103 }, { "epoch": 0.2659846547314578, "grad_norm": 1.2896301281378582, "learning_rate": 8.489795918367348e-05, "loss": 1.1193, "step": 104 }, { "epoch": 0.26854219948849106, "grad_norm": 1.2365104040466046, "learning_rate": 8.571428571428571e-05, "loss": 1.1257, "step": 105 }, { "epoch": 0.2710997442455243, "grad_norm": 0.8909578987791607, "learning_rate": 8.653061224489797e-05, "loss": 1.1127, "step": 106 }, { "epoch": 0.27365728900255754, "grad_norm": 1.170325095506981, "learning_rate": 8.734693877551021e-05, "loss": 1.1441, "step": 107 }, { "epoch": 0.27621483375959077, "grad_norm": 0.8299590325351531, "learning_rate": 8.816326530612245e-05, "loss": 1.1199, "step": 108 }, { "epoch": 0.27877237851662406, "grad_norm": 1.0039851893132474, "learning_rate": 8.897959183673471e-05, "loss": 1.1454, "step": 109 }, { "epoch": 0.2813299232736573, "grad_norm": 1.309094467948393, "learning_rate": 8.979591836734694e-05, "loss": 1.1534, "step": 110 }, { "epoch": 0.28388746803069054, "grad_norm": 1.030513843956652, "learning_rate": 9.061224489795919e-05, "loss": 1.1518, "step": 111 }, { "epoch": 0.2864450127877238, "grad_norm": 1.1548472835092134, "learning_rate": 9.142857142857143e-05, "loss": 1.1422, "step": 112 }, { "epoch": 0.289002557544757, "grad_norm": 1.0781950243182032, "learning_rate": 9.224489795918369e-05, "loss": 1.1125, "step": 113 }, { "epoch": 0.2915601023017903, "grad_norm": 1.4696697800626741, "learning_rate": 9.306122448979593e-05, "loss": 1.147, "step": 114 }, { "epoch": 0.29411764705882354, "grad_norm": 0.8932168550895682, "learning_rate": 9.387755102040817e-05, "loss": 1.1225, "step": 115 }, { "epoch": 0.2966751918158568, "grad_norm": 1.4609624921794502, "learning_rate": 9.469387755102041e-05, "loss": 1.1402, "step": 116 }, { "epoch": 0.29923273657289, "grad_norm": 1.1608303447004447, "learning_rate": 9.551020408163267e-05, "loss": 1.1268, "step": 117 }, { "epoch": 0.30179028132992325, "grad_norm": 1.3699566135342083, "learning_rate": 9.632653061224491e-05, "loss": 1.1782, "step": 118 }, { "epoch": 0.30434782608695654, "grad_norm": 1.0951988856065036, "learning_rate": 9.714285714285714e-05, "loss": 1.1383, "step": 119 }, { "epoch": 0.3069053708439898, "grad_norm": 1.311071103466961, "learning_rate": 9.79591836734694e-05, "loss": 1.1485, "step": 120 }, { "epoch": 0.309462915601023, "grad_norm": 0.8986951965704776, "learning_rate": 9.877551020408164e-05, "loss": 1.1604, "step": 121 }, { "epoch": 0.31202046035805625, "grad_norm": 1.2243542530871734, "learning_rate": 9.959183673469389e-05, "loss": 1.1129, "step": 122 }, { "epoch": 0.3145780051150895, "grad_norm": 1.3033780963392814, "learning_rate": 0.00010040816326530613, "loss": 1.1344, "step": 123 }, { "epoch": 0.3171355498721228, "grad_norm": 1.1948786110977876, "learning_rate": 0.00010122448979591839, "loss": 1.1269, "step": 124 }, { "epoch": 0.319693094629156, "grad_norm": 1.142953671137177, "learning_rate": 0.00010204081632653062, "loss": 1.1078, "step": 125 }, { "epoch": 0.32225063938618925, "grad_norm": 1.1524456987304121, "learning_rate": 0.00010285714285714287, "loss": 1.1653, "step": 126 }, { "epoch": 0.3248081841432225, "grad_norm": 1.1658601331325984, "learning_rate": 0.00010367346938775511, "loss": 1.1088, "step": 127 }, { "epoch": 0.3273657289002558, "grad_norm": 1.72409589259486, "learning_rate": 0.00010448979591836734, "loss": 1.1289, "step": 128 }, { "epoch": 0.329923273657289, "grad_norm": 0.7330929431707807, "learning_rate": 0.0001053061224489796, "loss": 1.1191, "step": 129 }, { "epoch": 0.33248081841432225, "grad_norm": 1.2646590423606026, "learning_rate": 0.00010612244897959184, "loss": 1.1527, "step": 130 }, { "epoch": 0.3350383631713555, "grad_norm": 1.723349785426215, "learning_rate": 0.0001069387755102041, "loss": 1.118, "step": 131 }, { "epoch": 0.3375959079283887, "grad_norm": 0.8139452555798524, "learning_rate": 0.00010775510204081634, "loss": 1.1702, "step": 132 }, { "epoch": 0.340153452685422, "grad_norm": 1.1603914477308341, "learning_rate": 0.00010857142857142859, "loss": 1.1467, "step": 133 }, { "epoch": 0.34271099744245526, "grad_norm": 1.2110578835398869, "learning_rate": 0.00010938775510204082, "loss": 1.1174, "step": 134 }, { "epoch": 0.3452685421994885, "grad_norm": 1.3576198261777483, "learning_rate": 0.00011020408163265307, "loss": 1.1746, "step": 135 }, { "epoch": 0.34782608695652173, "grad_norm": 1.0573462588351146, "learning_rate": 0.00011102040816326532, "loss": 1.1333, "step": 136 }, { "epoch": 0.35038363171355497, "grad_norm": 1.3340765255200633, "learning_rate": 0.00011183673469387757, "loss": 1.1482, "step": 137 }, { "epoch": 0.35294117647058826, "grad_norm": 0.9284689786425085, "learning_rate": 0.0001126530612244898, "loss": 1.1304, "step": 138 }, { "epoch": 0.3554987212276215, "grad_norm": 1.4254480759114776, "learning_rate": 0.00011346938775510204, "loss": 1.1106, "step": 139 }, { "epoch": 0.35805626598465473, "grad_norm": 1.3594890583091455, "learning_rate": 0.0001142857142857143, "loss": 1.1664, "step": 140 }, { "epoch": 0.36061381074168797, "grad_norm": 1.086678024627712, "learning_rate": 0.00011510204081632654, "loss": 1.0802, "step": 141 }, { "epoch": 0.3631713554987212, "grad_norm": 1.533977830454029, "learning_rate": 0.0001159183673469388, "loss": 1.1332, "step": 142 }, { "epoch": 0.3657289002557545, "grad_norm": 0.940287237501315, "learning_rate": 0.00011673469387755102, "loss": 1.1573, "step": 143 }, { "epoch": 0.36828644501278773, "grad_norm": 1.2572408225100642, "learning_rate": 0.00011755102040816328, "loss": 1.1292, "step": 144 }, { "epoch": 0.37084398976982097, "grad_norm": 0.9995509690787548, "learning_rate": 0.00011836734693877552, "loss": 1.1375, "step": 145 }, { "epoch": 0.3734015345268542, "grad_norm": 1.6478855533912629, "learning_rate": 0.00011918367346938777, "loss": 1.1281, "step": 146 }, { "epoch": 0.37595907928388744, "grad_norm": 0.9807964464883856, "learning_rate": 0.00012000000000000002, "loss": 1.1604, "step": 147 }, { "epoch": 0.37851662404092073, "grad_norm": 1.3424151204814954, "learning_rate": 0.00012081632653061224, "loss": 1.1247, "step": 148 }, { "epoch": 0.38107416879795397, "grad_norm": 1.1827965041877697, "learning_rate": 0.0001216326530612245, "loss": 1.1087, "step": 149 }, { "epoch": 0.3836317135549872, "grad_norm": 1.374289317317436, "learning_rate": 0.00012244897959183673, "loss": 1.1174, "step": 150 }, { "epoch": 0.38618925831202044, "grad_norm": 1.4462982798920152, "learning_rate": 0.00012326530612244898, "loss": 1.1243, "step": 151 }, { "epoch": 0.3887468030690537, "grad_norm": 1.2338591594860693, "learning_rate": 0.00012408163265306124, "loss": 1.127, "step": 152 }, { "epoch": 0.391304347826087, "grad_norm": 0.9926991217212723, "learning_rate": 0.0001248979591836735, "loss": 1.1152, "step": 153 }, { "epoch": 0.3938618925831202, "grad_norm": 1.6602432782777794, "learning_rate": 0.00012571428571428572, "loss": 1.129, "step": 154 }, { "epoch": 0.39641943734015345, "grad_norm": 1.0710563936657969, "learning_rate": 0.00012653061224489798, "loss": 1.1347, "step": 155 }, { "epoch": 0.3989769820971867, "grad_norm": 1.0203164310897854, "learning_rate": 0.0001273469387755102, "loss": 1.1398, "step": 156 }, { "epoch": 0.40153452685422, "grad_norm": 1.4486120558817688, "learning_rate": 0.00012816326530612246, "loss": 1.1572, "step": 157 }, { "epoch": 0.4040920716112532, "grad_norm": 1.0665461325193415, "learning_rate": 0.00012897959183673472, "loss": 1.1443, "step": 158 }, { "epoch": 0.40664961636828645, "grad_norm": 1.6999184553867208, "learning_rate": 0.00012979591836734695, "loss": 1.1027, "step": 159 }, { "epoch": 0.4092071611253197, "grad_norm": 1.0289801155197138, "learning_rate": 0.0001306122448979592, "loss": 1.1194, "step": 160 }, { "epoch": 0.4117647058823529, "grad_norm": 1.5775539926551432, "learning_rate": 0.00013142857142857143, "loss": 1.1209, "step": 161 }, { "epoch": 0.4143222506393862, "grad_norm": 0.9132827293227751, "learning_rate": 0.00013224489795918368, "loss": 1.1115, "step": 162 }, { "epoch": 0.41687979539641945, "grad_norm": 1.8502610336806449, "learning_rate": 0.00013306122448979594, "loss": 1.1237, "step": 163 }, { "epoch": 0.4194373401534527, "grad_norm": 1.3709322904356605, "learning_rate": 0.00013387755102040817, "loss": 1.1353, "step": 164 }, { "epoch": 0.4219948849104859, "grad_norm": 1.1361849330749851, "learning_rate": 0.00013469387755102042, "loss": 1.1043, "step": 165 }, { "epoch": 0.42455242966751916, "grad_norm": 1.1401579492886242, "learning_rate": 0.00013551020408163268, "loss": 1.1252, "step": 166 }, { "epoch": 0.42710997442455245, "grad_norm": 1.171525164401231, "learning_rate": 0.0001363265306122449, "loss": 1.1226, "step": 167 }, { "epoch": 0.4296675191815857, "grad_norm": 1.7103135890270424, "learning_rate": 0.00013714285714285713, "loss": 1.1323, "step": 168 }, { "epoch": 0.4322250639386189, "grad_norm": 1.0590485560747558, "learning_rate": 0.0001379591836734694, "loss": 1.1318, "step": 169 }, { "epoch": 0.43478260869565216, "grad_norm": 1.1381323068879685, "learning_rate": 0.00013877551020408165, "loss": 1.1093, "step": 170 }, { "epoch": 0.4373401534526854, "grad_norm": 1.8095148756504853, "learning_rate": 0.0001395918367346939, "loss": 1.1297, "step": 171 }, { "epoch": 0.4398976982097187, "grad_norm": 1.022630524722603, "learning_rate": 0.00014040816326530613, "loss": 1.1217, "step": 172 }, { "epoch": 0.4424552429667519, "grad_norm": 1.3822427448836618, "learning_rate": 0.00014122448979591838, "loss": 1.145, "step": 173 }, { "epoch": 0.44501278772378516, "grad_norm": 1.3577366143882001, "learning_rate": 0.0001420408163265306, "loss": 1.151, "step": 174 }, { "epoch": 0.4475703324808184, "grad_norm": 1.1001324929653025, "learning_rate": 0.00014285714285714287, "loss": 1.1209, "step": 175 }, { "epoch": 0.45012787723785164, "grad_norm": 1.7043306971887084, "learning_rate": 0.00014367346938775512, "loss": 1.155, "step": 176 }, { "epoch": 0.45268542199488493, "grad_norm": 0.8908531162714106, "learning_rate": 0.00014448979591836735, "loss": 1.1264, "step": 177 }, { "epoch": 0.45524296675191817, "grad_norm": 2.0064867394818577, "learning_rate": 0.0001453061224489796, "loss": 1.1339, "step": 178 }, { "epoch": 0.4578005115089514, "grad_norm": 1.272017394425661, "learning_rate": 0.00014612244897959183, "loss": 1.1179, "step": 179 }, { "epoch": 0.46035805626598464, "grad_norm": 1.6809099682132984, "learning_rate": 0.0001469387755102041, "loss": 1.1306, "step": 180 }, { "epoch": 0.4629156010230179, "grad_norm": 1.2729546637062361, "learning_rate": 0.00014775510204081635, "loss": 1.1547, "step": 181 }, { "epoch": 0.46547314578005117, "grad_norm": 1.2637405257695475, "learning_rate": 0.0001485714285714286, "loss": 1.1234, "step": 182 }, { "epoch": 0.4680306905370844, "grad_norm": 1.3792667256601667, "learning_rate": 0.00014938775510204083, "loss": 1.1384, "step": 183 }, { "epoch": 0.47058823529411764, "grad_norm": 1.0581158807496975, "learning_rate": 0.00015020408163265308, "loss": 1.1308, "step": 184 }, { "epoch": 0.4731457800511509, "grad_norm": 1.2395276036732317, "learning_rate": 0.0001510204081632653, "loss": 1.142, "step": 185 }, { "epoch": 0.47570332480818417, "grad_norm": 1.1474988241030795, "learning_rate": 0.00015183673469387757, "loss": 1.1399, "step": 186 }, { "epoch": 0.4782608695652174, "grad_norm": 1.4488607840873033, "learning_rate": 0.0001526530612244898, "loss": 1.1247, "step": 187 }, { "epoch": 0.48081841432225064, "grad_norm": 0.9895262383072666, "learning_rate": 0.00015346938775510205, "loss": 1.1439, "step": 188 }, { "epoch": 0.4833759590792839, "grad_norm": 1.509540789570866, "learning_rate": 0.0001542857142857143, "loss": 1.1268, "step": 189 }, { "epoch": 0.4859335038363171, "grad_norm": 1.2634220572499701, "learning_rate": 0.00015510204081632654, "loss": 1.1315, "step": 190 }, { "epoch": 0.4884910485933504, "grad_norm": 2.03411519572473, "learning_rate": 0.0001559183673469388, "loss": 1.0859, "step": 191 }, { "epoch": 0.49104859335038364, "grad_norm": 1.1783378998438716, "learning_rate": 0.00015673469387755102, "loss": 1.122, "step": 192 }, { "epoch": 0.4936061381074169, "grad_norm": 1.869178693106169, "learning_rate": 0.00015755102040816327, "loss": 1.0953, "step": 193 }, { "epoch": 0.4961636828644501, "grad_norm": 1.4133576585465655, "learning_rate": 0.00015836734693877553, "loss": 1.0973, "step": 194 }, { "epoch": 0.49872122762148335, "grad_norm": 1.1007402607506083, "learning_rate": 0.00015918367346938778, "loss": 1.1666, "step": 195 }, { "epoch": 0.5012787723785166, "grad_norm": 1.0455333445001125, "learning_rate": 0.00016, "loss": 1.1244, "step": 196 }, { "epoch": 0.5038363171355499, "grad_norm": 1.1414091012657146, "learning_rate": 0.00015999987240667874, "loss": 1.118, "step": 197 }, { "epoch": 0.5063938618925832, "grad_norm": 1.1934725533176622, "learning_rate": 0.0001599994896271219, "loss": 1.1489, "step": 198 }, { "epoch": 0.5089514066496164, "grad_norm": 1.3418673611629677, "learning_rate": 0.0001599988516625505, "loss": 1.1172, "step": 199 }, { "epoch": 0.5115089514066496, "grad_norm": 1.2281301450926736, "learning_rate": 0.00015999795851499954, "loss": 1.124, "step": 200 }, { "epoch": 0.5140664961636828, "grad_norm": 1.4232277874832118, "learning_rate": 0.000159996810187318, "loss": 1.1087, "step": 201 }, { "epoch": 0.5166240409207161, "grad_norm": 1.2445810609035501, "learning_rate": 0.0001599954066831689, "loss": 1.0977, "step": 202 }, { "epoch": 0.5191815856777494, "grad_norm": 1.4902156849341144, "learning_rate": 0.00015999374800702916, "loss": 1.1278, "step": 203 }, { "epoch": 0.5217391304347826, "grad_norm": 0.9117749926569193, "learning_rate": 0.00015999183416418963, "loss": 1.0978, "step": 204 }, { "epoch": 0.5242966751918159, "grad_norm": 1.521914055307176, "learning_rate": 0.0001599896651607552, "loss": 1.1255, "step": 205 }, { "epoch": 0.5268542199488491, "grad_norm": 1.675086821646465, "learning_rate": 0.00015998724100364464, "loss": 1.1117, "step": 206 }, { "epoch": 0.5294117647058824, "grad_norm": 1.0370916213463357, "learning_rate": 0.00015998456170059059, "loss": 1.1269, "step": 207 }, { "epoch": 0.5319693094629157, "grad_norm": 1.4543936507994073, "learning_rate": 0.00015998162726013954, "loss": 1.1159, "step": 208 }, { "epoch": 0.5345268542199488, "grad_norm": 1.628168132567413, "learning_rate": 0.00015997843769165193, "loss": 1.1025, "step": 209 }, { "epoch": 0.5370843989769821, "grad_norm": 1.114123127352084, "learning_rate": 0.0001599749930053019, "loss": 1.0962, "step": 210 }, { "epoch": 0.5396419437340153, "grad_norm": 1.7051681399590384, "learning_rate": 0.00015997129321207747, "loss": 1.1216, "step": 211 }, { "epoch": 0.5421994884910486, "grad_norm": 0.9137353240287979, "learning_rate": 0.00015996733832378032, "loss": 1.0845, "step": 212 }, { "epoch": 0.5447570332480819, "grad_norm": 1.3585376285654678, "learning_rate": 0.00015996312835302593, "loss": 1.1337, "step": 213 }, { "epoch": 0.5473145780051151, "grad_norm": 0.986649874454745, "learning_rate": 0.00015995866331324334, "loss": 1.0791, "step": 214 }, { "epoch": 0.5498721227621484, "grad_norm": 1.4872086766761456, "learning_rate": 0.00015995394321867534, "loss": 1.0898, "step": 215 }, { "epoch": 0.5524296675191815, "grad_norm": 1.3583123340693906, "learning_rate": 0.0001599489680843782, "loss": 1.1221, "step": 216 }, { "epoch": 0.5549872122762148, "grad_norm": 1.1209846232833984, "learning_rate": 0.00015994373792622182, "loss": 1.0914, "step": 217 }, { "epoch": 0.5575447570332481, "grad_norm": 1.1159100799958372, "learning_rate": 0.0001599382527608895, "loss": 1.0659, "step": 218 }, { "epoch": 0.5601023017902813, "grad_norm": 1.014792737157986, "learning_rate": 0.00015993251260587796, "loss": 1.0895, "step": 219 }, { "epoch": 0.5626598465473146, "grad_norm": 1.3514884114926682, "learning_rate": 0.00015992651747949742, "loss": 1.1447, "step": 220 }, { "epoch": 0.5652173913043478, "grad_norm": 1.3662814180004041, "learning_rate": 0.00015992026740087125, "loss": 1.082, "step": 221 }, { "epoch": 0.5677749360613811, "grad_norm": 1.1729073479593213, "learning_rate": 0.00015991376238993623, "loss": 1.0858, "step": 222 }, { "epoch": 0.5703324808184144, "grad_norm": 1.098894416827083, "learning_rate": 0.0001599070024674422, "loss": 1.0903, "step": 223 }, { "epoch": 0.5728900255754475, "grad_norm": 0.975594652798118, "learning_rate": 0.0001598999876549522, "loss": 1.1162, "step": 224 }, { "epoch": 0.5754475703324808, "grad_norm": 1.0143269006614197, "learning_rate": 0.00015989271797484236, "loss": 1.1131, "step": 225 }, { "epoch": 0.578005115089514, "grad_norm": 1.3483287924450105, "learning_rate": 0.00015988519345030167, "loss": 1.0896, "step": 226 }, { "epoch": 0.5805626598465473, "grad_norm": 0.7520971748388883, "learning_rate": 0.00015987741410533217, "loss": 1.0953, "step": 227 }, { "epoch": 0.5831202046035806, "grad_norm": 1.3201762056381772, "learning_rate": 0.0001598693799647486, "loss": 1.0837, "step": 228 }, { "epoch": 0.5856777493606138, "grad_norm": 1.2193125892583727, "learning_rate": 0.00015986109105417862, "loss": 1.1026, "step": 229 }, { "epoch": 0.5882352941176471, "grad_norm": 1.3892856581992825, "learning_rate": 0.0001598525474000624, "loss": 1.1069, "step": 230 }, { "epoch": 0.5907928388746803, "grad_norm": 0.8831793540357707, "learning_rate": 0.00015984374902965284, "loss": 1.1079, "step": 231 }, { "epoch": 0.5933503836317136, "grad_norm": 0.8405263869404558, "learning_rate": 0.00015983469597101517, "loss": 1.088, "step": 232 }, { "epoch": 0.5959079283887468, "grad_norm": 0.8048081062282874, "learning_rate": 0.0001598253882530272, "loss": 1.0947, "step": 233 }, { "epoch": 0.59846547314578, "grad_norm": 1.1026453527649267, "learning_rate": 0.00015981582590537897, "loss": 1.0527, "step": 234 }, { "epoch": 0.6010230179028133, "grad_norm": 1.945124480668707, "learning_rate": 0.0001598060089585728, "loss": 1.0747, "step": 235 }, { "epoch": 0.6035805626598465, "grad_norm": 0.6633926296437849, "learning_rate": 0.00015979593744392312, "loss": 1.1013, "step": 236 }, { "epoch": 0.6061381074168798, "grad_norm": 1.9149178380903846, "learning_rate": 0.00015978561139355635, "loss": 1.0967, "step": 237 }, { "epoch": 0.6086956521739131, "grad_norm": 1.3222885863625786, "learning_rate": 0.00015977503084041087, "loss": 1.0733, "step": 238 }, { "epoch": 0.6112531969309463, "grad_norm": 1.0130031801765467, "learning_rate": 0.00015976419581823688, "loss": 1.1196, "step": 239 }, { "epoch": 0.6138107416879796, "grad_norm": 1.5551163600364186, "learning_rate": 0.00015975310636159632, "loss": 1.088, "step": 240 }, { "epoch": 0.6163682864450127, "grad_norm": 1.2158294095692619, "learning_rate": 0.00015974176250586265, "loss": 1.0768, "step": 241 }, { "epoch": 0.618925831202046, "grad_norm": 1.0765542476008974, "learning_rate": 0.00015973016428722094, "loss": 1.106, "step": 242 }, { "epoch": 0.6214833759590793, "grad_norm": 1.1132699812581053, "learning_rate": 0.0001597183117426675, "loss": 1.1002, "step": 243 }, { "epoch": 0.6240409207161125, "grad_norm": 1.3600712766399181, "learning_rate": 0.00015970620491001004, "loss": 1.1445, "step": 244 }, { "epoch": 0.6265984654731458, "grad_norm": 1.0416236386170334, "learning_rate": 0.00015969384382786729, "loss": 1.1019, "step": 245 }, { "epoch": 0.629156010230179, "grad_norm": 1.3027622469497735, "learning_rate": 0.00015968122853566905, "loss": 1.1002, "step": 246 }, { "epoch": 0.6317135549872123, "grad_norm": 0.8037304289524585, "learning_rate": 0.000159668359073656, "loss": 1.0892, "step": 247 }, { "epoch": 0.6342710997442456, "grad_norm": 0.9188404876547497, "learning_rate": 0.00015965523548287956, "loss": 1.1395, "step": 248 }, { "epoch": 0.6368286445012787, "grad_norm": 1.1903100937742757, "learning_rate": 0.0001596418578052018, "loss": 1.1157, "step": 249 }, { "epoch": 0.639386189258312, "grad_norm": 1.134136870599723, "learning_rate": 0.0001596282260832953, "loss": 1.0961, "step": 250 }, { "epoch": 0.6419437340153452, "grad_norm": 1.1666299453160198, "learning_rate": 0.00015961434036064294, "loss": 1.1019, "step": 251 }, { "epoch": 0.6445012787723785, "grad_norm": 0.8723696508206527, "learning_rate": 0.00015960020068153785, "loss": 1.1053, "step": 252 }, { "epoch": 0.6470588235294118, "grad_norm": 0.9568431382175138, "learning_rate": 0.00015958580709108332, "loss": 1.0848, "step": 253 }, { "epoch": 0.649616368286445, "grad_norm": 1.1129808719393837, "learning_rate": 0.00015957115963519244, "loss": 1.136, "step": 254 }, { "epoch": 0.6521739130434783, "grad_norm": 1.3963252311082919, "learning_rate": 0.00015955625836058815, "loss": 1.0952, "step": 255 }, { "epoch": 0.6547314578005116, "grad_norm": 0.9298685363556572, "learning_rate": 0.00015954110331480302, "loss": 1.0809, "step": 256 }, { "epoch": 0.6572890025575447, "grad_norm": 0.7001103257159264, "learning_rate": 0.00015952569454617916, "loss": 1.116, "step": 257 }, { "epoch": 0.659846547314578, "grad_norm": 0.9441648189630093, "learning_rate": 0.00015951003210386793, "loss": 1.0784, "step": 258 }, { "epoch": 0.6624040920716112, "grad_norm": 1.4002615649377306, "learning_rate": 0.0001594941160378299, "loss": 1.1071, "step": 259 }, { "epoch": 0.6649616368286445, "grad_norm": 0.8178386113146091, "learning_rate": 0.00015947794639883473, "loss": 1.087, "step": 260 }, { "epoch": 0.6675191815856778, "grad_norm": 1.452979203118016, "learning_rate": 0.0001594615232384608, "loss": 1.0604, "step": 261 }, { "epoch": 0.670076726342711, "grad_norm": 0.6774046196617319, "learning_rate": 0.00015944484660909523, "loss": 1.076, "step": 262 }, { "epoch": 0.6726342710997443, "grad_norm": 0.7670969521082094, "learning_rate": 0.00015942791656393376, "loss": 1.1204, "step": 263 }, { "epoch": 0.6751918158567775, "grad_norm": 1.0850513811767653, "learning_rate": 0.00015941073315698035, "loss": 1.0986, "step": 264 }, { "epoch": 0.6777493606138107, "grad_norm": 1.472017968872445, "learning_rate": 0.00015939329644304724, "loss": 1.1274, "step": 265 }, { "epoch": 0.680306905370844, "grad_norm": 0.9702787550395545, "learning_rate": 0.0001593756064777546, "loss": 1.0934, "step": 266 }, { "epoch": 0.6828644501278772, "grad_norm": 1.0584827946044062, "learning_rate": 0.00015935766331753049, "loss": 1.0471, "step": 267 }, { "epoch": 0.6854219948849105, "grad_norm": 0.8089889110807604, "learning_rate": 0.00015933946701961055, "loss": 1.0887, "step": 268 }, { "epoch": 0.6879795396419437, "grad_norm": 1.0320882417148256, "learning_rate": 0.000159321017642038, "loss": 1.0667, "step": 269 }, { "epoch": 0.690537084398977, "grad_norm": 1.4674982303373638, "learning_rate": 0.00015930231524366326, "loss": 1.1073, "step": 270 }, { "epoch": 0.6930946291560103, "grad_norm": 0.7320918729382444, "learning_rate": 0.0001592833598841438, "loss": 1.1053, "step": 271 }, { "epoch": 0.6956521739130435, "grad_norm": 0.8289503109780553, "learning_rate": 0.00015926415162394414, "loss": 1.0707, "step": 272 }, { "epoch": 0.6982097186700768, "grad_norm": 1.130825151382903, "learning_rate": 0.00015924469052433534, "loss": 1.0878, "step": 273 }, { "epoch": 0.7007672634271099, "grad_norm": 0.9816938036576663, "learning_rate": 0.00015922497664739508, "loss": 1.1036, "step": 274 }, { "epoch": 0.7033248081841432, "grad_norm": 1.1744231549177595, "learning_rate": 0.0001592050100560074, "loss": 1.0826, "step": 275 }, { "epoch": 0.7058823529411765, "grad_norm": 1.1244228971801966, "learning_rate": 0.0001591847908138623, "loss": 1.0992, "step": 276 }, { "epoch": 0.7084398976982097, "grad_norm": 1.0273673884618308, "learning_rate": 0.00015916431898545583, "loss": 1.1122, "step": 277 }, { "epoch": 0.710997442455243, "grad_norm": 1.3019719478481941, "learning_rate": 0.0001591435946360897, "loss": 1.0797, "step": 278 }, { "epoch": 0.7135549872122762, "grad_norm": 0.9179007336169464, "learning_rate": 0.00015912261783187113, "loss": 1.1083, "step": 279 }, { "epoch": 0.7161125319693095, "grad_norm": 1.3938652199122237, "learning_rate": 0.00015910138863971265, "loss": 1.0768, "step": 280 }, { "epoch": 0.7186700767263428, "grad_norm": 0.8460589876687793, "learning_rate": 0.00015907990712733176, "loss": 1.0675, "step": 281 }, { "epoch": 0.7212276214833759, "grad_norm": 1.2311027949600852, "learning_rate": 0.00015905817336325098, "loss": 1.095, "step": 282 }, { "epoch": 0.7237851662404092, "grad_norm": 0.5637046057878358, "learning_rate": 0.00015903618741679735, "loss": 1.0227, "step": 283 }, { "epoch": 0.7263427109974424, "grad_norm": 0.8864195638565602, "learning_rate": 0.00015901394935810236, "loss": 1.0894, "step": 284 }, { "epoch": 0.7289002557544757, "grad_norm": 1.118154448385255, "learning_rate": 0.00015899145925810172, "loss": 1.0708, "step": 285 }, { "epoch": 0.731457800511509, "grad_norm": 0.8797417608904688, "learning_rate": 0.0001589687171885351, "loss": 1.0973, "step": 286 }, { "epoch": 0.7340153452685422, "grad_norm": 1.2417892204976435, "learning_rate": 0.0001589457232219459, "loss": 1.0959, "step": 287 }, { "epoch": 0.7365728900255755, "grad_norm": 1.3823792436001885, "learning_rate": 0.000158922477431681, "loss": 1.0588, "step": 288 }, { "epoch": 0.7391304347826086, "grad_norm": 0.5914973374896305, "learning_rate": 0.00015889897989189065, "loss": 1.0877, "step": 289 }, { "epoch": 0.7416879795396419, "grad_norm": 0.6894697219091279, "learning_rate": 0.00015887523067752805, "loss": 1.0987, "step": 290 }, { "epoch": 0.7442455242966752, "grad_norm": 0.9378104999898202, "learning_rate": 0.0001588512298643492, "loss": 1.0813, "step": 291 }, { "epoch": 0.7468030690537084, "grad_norm": 1.5924222953617497, "learning_rate": 0.00015882697752891273, "loss": 1.0493, "step": 292 }, { "epoch": 0.7493606138107417, "grad_norm": 0.8644236985398326, "learning_rate": 0.0001588024737485795, "loss": 1.0745, "step": 293 }, { "epoch": 0.7519181585677749, "grad_norm": 1.2617771174370838, "learning_rate": 0.00015877771860151255, "loss": 1.0756, "step": 294 }, { "epoch": 0.7544757033248082, "grad_norm": 0.6053221801377883, "learning_rate": 0.00015875271216667658, "loss": 1.0624, "step": 295 }, { "epoch": 0.7570332480818415, "grad_norm": 0.8733719684486176, "learning_rate": 0.00015872745452383797, "loss": 1.0713, "step": 296 }, { "epoch": 0.7595907928388747, "grad_norm": 1.0570673007983702, "learning_rate": 0.00015870194575356444, "loss": 1.1115, "step": 297 }, { "epoch": 0.7621483375959079, "grad_norm": 0.7325728255149376, "learning_rate": 0.00015867618593722464, "loss": 1.0871, "step": 298 }, { "epoch": 0.7647058823529411, "grad_norm": 0.7340524897043603, "learning_rate": 0.00015865017515698807, "loss": 1.0979, "step": 299 }, { "epoch": 0.7672634271099744, "grad_norm": 1.1656279626023016, "learning_rate": 0.00015862391349582484, "loss": 1.0597, "step": 300 }, { "epoch": 0.7698209718670077, "grad_norm": 0.9978239568565908, "learning_rate": 0.00015859740103750522, "loss": 1.0932, "step": 301 }, { "epoch": 0.7723785166240409, "grad_norm": 1.878442480743071, "learning_rate": 0.00015857063786659954, "loss": 1.0938, "step": 302 }, { "epoch": 0.7749360613810742, "grad_norm": 0.6117011045915516, "learning_rate": 0.00015854362406847786, "loss": 1.0623, "step": 303 }, { "epoch": 0.7774936061381074, "grad_norm": 1.8420720325784072, "learning_rate": 0.00015851635972930967, "loss": 1.0699, "step": 304 }, { "epoch": 0.7800511508951407, "grad_norm": 1.002131752478182, "learning_rate": 0.00015848884493606367, "loss": 1.0826, "step": 305 }, { "epoch": 0.782608695652174, "grad_norm": 1.2471718061674597, "learning_rate": 0.00015846107977650743, "loss": 1.0755, "step": 306 }, { "epoch": 0.7851662404092071, "grad_norm": 0.9634733361160541, "learning_rate": 0.0001584330643392072, "loss": 1.0416, "step": 307 }, { "epoch": 0.7877237851662404, "grad_norm": 1.790526532103535, "learning_rate": 0.00015840479871352754, "loss": 1.0754, "step": 308 }, { "epoch": 0.7902813299232737, "grad_norm": 0.8667875735812341, "learning_rate": 0.00015837628298963105, "loss": 1.0934, "step": 309 }, { "epoch": 0.7928388746803069, "grad_norm": 1.4536288271279978, "learning_rate": 0.00015834751725847816, "loss": 1.0632, "step": 310 }, { "epoch": 0.7953964194373402, "grad_norm": 1.3777516183353187, "learning_rate": 0.00015831850161182677, "loss": 1.0956, "step": 311 }, { "epoch": 0.7979539641943734, "grad_norm": 0.7721449298753891, "learning_rate": 0.0001582892361422319, "loss": 1.1069, "step": 312 }, { "epoch": 0.8005115089514067, "grad_norm": 1.174156872017157, "learning_rate": 0.00015825972094304555, "loss": 1.0728, "step": 313 }, { "epoch": 0.80306905370844, "grad_norm": 1.2588808228888746, "learning_rate": 0.00015822995610841623, "loss": 1.0772, "step": 314 }, { "epoch": 0.8056265984654731, "grad_norm": 0.8720000426242472, "learning_rate": 0.00015819994173328885, "loss": 1.0654, "step": 315 }, { "epoch": 0.8081841432225064, "grad_norm": 0.923631788770043, "learning_rate": 0.00015816967791340417, "loss": 1.0668, "step": 316 }, { "epoch": 0.8107416879795396, "grad_norm": 1.1357229877804957, "learning_rate": 0.00015813916474529885, "loss": 1.0911, "step": 317 }, { "epoch": 0.8132992327365729, "grad_norm": 0.8907121901474587, "learning_rate": 0.0001581084023263047, "loss": 1.0826, "step": 318 }, { "epoch": 0.8158567774936062, "grad_norm": 1.0350783431396418, "learning_rate": 0.00015807739075454874, "loss": 1.0426, "step": 319 }, { "epoch": 0.8184143222506394, "grad_norm": 1.2795269410097496, "learning_rate": 0.00015804613012895268, "loss": 1.0731, "step": 320 }, { "epoch": 0.8209718670076727, "grad_norm": 0.8440033467786482, "learning_rate": 0.0001580146205492327, "loss": 1.0491, "step": 321 }, { "epoch": 0.8235294117647058, "grad_norm": 0.9336906509179427, "learning_rate": 0.00015798286211589916, "loss": 1.0796, "step": 322 }, { "epoch": 0.8260869565217391, "grad_norm": 1.243210147279451, "learning_rate": 0.00015795085493025608, "loss": 1.0998, "step": 323 }, { "epoch": 0.8286445012787724, "grad_norm": 0.985781736568132, "learning_rate": 0.00015791859909440107, "loss": 1.097, "step": 324 }, { "epoch": 0.8312020460358056, "grad_norm": 1.115722030381177, "learning_rate": 0.00015788609471122485, "loss": 1.0594, "step": 325 }, { "epoch": 0.8337595907928389, "grad_norm": 0.6317177707367972, "learning_rate": 0.000157853341884411, "loss": 1.0672, "step": 326 }, { "epoch": 0.8363171355498721, "grad_norm": 0.7614994384747567, "learning_rate": 0.00015782034071843557, "loss": 1.1076, "step": 327 }, { "epoch": 0.8388746803069054, "grad_norm": 0.6788203373242645, "learning_rate": 0.00015778709131856675, "loss": 1.0794, "step": 328 }, { "epoch": 0.8414322250639387, "grad_norm": 0.6573621171258895, "learning_rate": 0.00015775359379086455, "loss": 1.1175, "step": 329 }, { "epoch": 0.8439897698209718, "grad_norm": 0.865009547315977, "learning_rate": 0.00015771984824218053, "loss": 1.0893, "step": 330 }, { "epoch": 0.8465473145780051, "grad_norm": 1.0982989183876286, "learning_rate": 0.00015768585478015732, "loss": 1.0628, "step": 331 }, { "epoch": 0.8491048593350383, "grad_norm": 1.5816845014682415, "learning_rate": 0.00015765161351322845, "loss": 1.0553, "step": 332 }, { "epoch": 0.8516624040920716, "grad_norm": 0.5583122236625028, "learning_rate": 0.0001576171245506178, "loss": 1.1007, "step": 333 }, { "epoch": 0.8542199488491049, "grad_norm": 1.4589646002026686, "learning_rate": 0.00015758238800233937, "loss": 1.0354, "step": 334 }, { "epoch": 0.8567774936061381, "grad_norm": 1.1988373358126654, "learning_rate": 0.00015754740397919703, "loss": 1.0609, "step": 335 }, { "epoch": 0.8593350383631714, "grad_norm": 0.7798431918437426, "learning_rate": 0.0001575121725927839, "loss": 1.0599, "step": 336 }, { "epoch": 0.8618925831202046, "grad_norm": 0.8001399476748517, "learning_rate": 0.00015747669395548228, "loss": 1.0825, "step": 337 }, { "epoch": 0.8644501278772379, "grad_norm": 0.9268381518772149, "learning_rate": 0.00015744096818046306, "loss": 1.0867, "step": 338 }, { "epoch": 0.8670076726342711, "grad_norm": 0.8482506857320948, "learning_rate": 0.00015740499538168548, "loss": 1.0519, "step": 339 }, { "epoch": 0.8695652173913043, "grad_norm": 1.1051027320167537, "learning_rate": 0.00015736877567389682, "loss": 1.0926, "step": 340 }, { "epoch": 0.8721227621483376, "grad_norm": 1.1295814345497992, "learning_rate": 0.00015733230917263182, "loss": 1.0485, "step": 341 }, { "epoch": 0.8746803069053708, "grad_norm": 0.8381578992561258, "learning_rate": 0.00015729559599421262, "loss": 1.0742, "step": 342 }, { "epoch": 0.8772378516624041, "grad_norm": 1.1355285501553987, "learning_rate": 0.00015725863625574808, "loss": 1.0731, "step": 343 }, { "epoch": 0.8797953964194374, "grad_norm": 1.2716344612482289, "learning_rate": 0.0001572214300751336, "loss": 1.0818, "step": 344 }, { "epoch": 0.8823529411764706, "grad_norm": 0.7977797928903454, "learning_rate": 0.00015718397757105072, "loss": 1.0592, "step": 345 }, { "epoch": 0.8849104859335039, "grad_norm": 0.5888833117266756, "learning_rate": 0.0001571462788629666, "loss": 1.124, "step": 346 }, { "epoch": 0.887468030690537, "grad_norm": 0.7277724084604381, "learning_rate": 0.00015710833407113386, "loss": 1.0076, "step": 347 }, { "epoch": 0.8900255754475703, "grad_norm": 0.7175876926395411, "learning_rate": 0.00015707014331659008, "loss": 1.0735, "step": 348 }, { "epoch": 0.8925831202046036, "grad_norm": 0.8127426786215441, "learning_rate": 0.00015703170672115737, "loss": 1.0582, "step": 349 }, { "epoch": 0.8951406649616368, "grad_norm": 1.0648976192629485, "learning_rate": 0.00015699302440744202, "loss": 1.0788, "step": 350 }, { "epoch": 0.8976982097186701, "grad_norm": 1.2133128800930093, "learning_rate": 0.00015695409649883418, "loss": 1.0986, "step": 351 }, { "epoch": 0.9002557544757033, "grad_norm": 0.946491692276404, "learning_rate": 0.0001569149231195074, "loss": 1.0522, "step": 352 }, { "epoch": 0.9028132992327366, "grad_norm": 1.2375939940771874, "learning_rate": 0.0001568755043944182, "loss": 1.077, "step": 353 }, { "epoch": 0.9053708439897699, "grad_norm": 0.7734830655451521, "learning_rate": 0.00015683584044930572, "loss": 1.0659, "step": 354 }, { "epoch": 0.907928388746803, "grad_norm": 0.6097683019560797, "learning_rate": 0.00015679593141069132, "loss": 1.0446, "step": 355 }, { "epoch": 0.9104859335038363, "grad_norm": 0.5759587093662797, "learning_rate": 0.0001567557774058782, "loss": 1.0577, "step": 356 }, { "epoch": 0.9130434782608695, "grad_norm": 0.5878753626840652, "learning_rate": 0.0001567153785629509, "loss": 1.0675, "step": 357 }, { "epoch": 0.9156010230179028, "grad_norm": 0.6653732754348032, "learning_rate": 0.000156674735010775, "loss": 1.0891, "step": 358 }, { "epoch": 0.9181585677749361, "grad_norm": 0.768263015413779, "learning_rate": 0.00015663384687899663, "loss": 1.0715, "step": 359 }, { "epoch": 0.9207161125319693, "grad_norm": 0.9765055577703315, "learning_rate": 0.00015659271429804215, "loss": 1.0396, "step": 360 }, { "epoch": 0.9232736572890026, "grad_norm": 1.4554265699809417, "learning_rate": 0.00015655133739911757, "loss": 1.0919, "step": 361 }, { "epoch": 0.9258312020460358, "grad_norm": 0.7208280463855818, "learning_rate": 0.0001565097163142083, "loss": 1.0151, "step": 362 }, { "epoch": 0.928388746803069, "grad_norm": 0.8611710190483517, "learning_rate": 0.00015646785117607865, "loss": 1.0796, "step": 363 }, { "epoch": 0.9309462915601023, "grad_norm": 1.1291766944081427, "learning_rate": 0.00015642574211827142, "loss": 1.0651, "step": 364 }, { "epoch": 0.9335038363171355, "grad_norm": 1.0023408896760695, "learning_rate": 0.00015638338927510752, "loss": 1.0785, "step": 365 }, { "epoch": 0.9360613810741688, "grad_norm": 1.2325468393537922, "learning_rate": 0.00015634079278168542, "loss": 1.1032, "step": 366 }, { "epoch": 0.9386189258312021, "grad_norm": 0.8116887550297889, "learning_rate": 0.00015629795277388077, "loss": 1.0784, "step": 367 }, { "epoch": 0.9411764705882353, "grad_norm": 0.8465793191190484, "learning_rate": 0.00015625486938834613, "loss": 1.0729, "step": 368 }, { "epoch": 0.9437340153452686, "grad_norm": 0.8630348039771475, "learning_rate": 0.00015621154276251024, "loss": 1.0676, "step": 369 }, { "epoch": 0.9462915601023018, "grad_norm": 0.8909789093135501, "learning_rate": 0.00015616797303457782, "loss": 1.0626, "step": 370 }, { "epoch": 0.948849104859335, "grad_norm": 1.3639686895279477, "learning_rate": 0.00015612416034352906, "loss": 1.0935, "step": 371 }, { "epoch": 0.9514066496163683, "grad_norm": 0.7547937680438821, "learning_rate": 0.00015608010482911908, "loss": 1.0714, "step": 372 }, { "epoch": 0.9539641943734015, "grad_norm": 0.6097577881338234, "learning_rate": 0.00015603580663187765, "loss": 1.0757, "step": 373 }, { "epoch": 0.9565217391304348, "grad_norm": 0.7408592240149442, "learning_rate": 0.00015599126589310857, "loss": 1.0762, "step": 374 }, { "epoch": 0.959079283887468, "grad_norm": 0.8123009573402776, "learning_rate": 0.00015594648275488944, "loss": 1.0991, "step": 375 }, { "epoch": 0.9616368286445013, "grad_norm": 0.8997010834862542, "learning_rate": 0.00015590145736007091, "loss": 1.0493, "step": 376 }, { "epoch": 0.9641943734015346, "grad_norm": 1.211365253216414, "learning_rate": 0.00015585618985227657, "loss": 1.0845, "step": 377 }, { "epoch": 0.9667519181585678, "grad_norm": 1.1546641796621098, "learning_rate": 0.00015581068037590212, "loss": 1.0851, "step": 378 }, { "epoch": 0.969309462915601, "grad_norm": 1.1673337321688009, "learning_rate": 0.00015576492907611524, "loss": 1.054, "step": 379 }, { "epoch": 0.9718670076726342, "grad_norm": 0.6737544031199463, "learning_rate": 0.00015571893609885493, "loss": 1.0377, "step": 380 }, { "epoch": 0.9744245524296675, "grad_norm": 0.8151328439701532, "learning_rate": 0.00015567270159083107, "loss": 1.0698, "step": 381 }, { "epoch": 0.9769820971867008, "grad_norm": 0.9445758081131683, "learning_rate": 0.00015562622569952408, "loss": 1.0723, "step": 382 }, { "epoch": 0.979539641943734, "grad_norm": 1.0143687259241263, "learning_rate": 0.00015557950857318425, "loss": 1.0753, "step": 383 }, { "epoch": 0.9820971867007673, "grad_norm": 1.0909144236610384, "learning_rate": 0.00015553255036083145, "loss": 1.0301, "step": 384 }, { "epoch": 0.9846547314578005, "grad_norm": 1.2562026829762518, "learning_rate": 0.0001554853512122545, "loss": 1.1103, "step": 385 }, { "epoch": 0.9872122762148338, "grad_norm": 0.7752538678352305, "learning_rate": 0.00015543791127801084, "loss": 1.0633, "step": 386 }, { "epoch": 0.989769820971867, "grad_norm": 0.6480828071883595, "learning_rate": 0.0001553902307094259, "loss": 1.0769, "step": 387 }, { "epoch": 0.9923273657289002, "grad_norm": 0.8764236095011647, "learning_rate": 0.00015534230965859276, "loss": 1.0905, "step": 388 }, { "epoch": 0.9948849104859335, "grad_norm": 1.1982183014384076, "learning_rate": 0.00015529414827837156, "loss": 1.0737, "step": 389 }, { "epoch": 0.9974424552429667, "grad_norm": 1.0015924584874194, "learning_rate": 0.00015524574672238906, "loss": 1.0539, "step": 390 }, { "epoch": 1.0, "grad_norm": 1.3714997731388885, "learning_rate": 0.00015519710514503814, "loss": 1.0846, "step": 391 }, { "epoch": 1.0025575447570332, "grad_norm": 0.5566435857743947, "learning_rate": 0.00015514822370147732, "loss": 1.0432, "step": 392 }, { "epoch": 1.0051150895140666, "grad_norm": 0.7918387632633654, "learning_rate": 0.00015509910254763023, "loss": 1.0578, "step": 393 }, { "epoch": 1.0076726342710998, "grad_norm": 1.256938009132569, "learning_rate": 0.0001550497418401852, "loss": 1.0306, "step": 394 }, { "epoch": 1.010230179028133, "grad_norm": 1.2314520681198668, "learning_rate": 0.00015500014173659457, "loss": 1.0383, "step": 395 }, { "epoch": 1.0127877237851663, "grad_norm": 0.923069995672888, "learning_rate": 0.00015495030239507442, "loss": 1.0573, "step": 396 }, { "epoch": 1.0153452685421995, "grad_norm": 0.936236903889318, "learning_rate": 0.00015490022397460392, "loss": 1.0573, "step": 397 }, { "epoch": 1.0179028132992327, "grad_norm": 0.6628420746065794, "learning_rate": 0.0001548499066349249, "loss": 1.0474, "step": 398 }, { "epoch": 1.020460358056266, "grad_norm": 0.47759016557709666, "learning_rate": 0.00015479935053654126, "loss": 1.0175, "step": 399 }, { "epoch": 1.0230179028132993, "grad_norm": 0.61072929455943, "learning_rate": 0.00015474855584071847, "loss": 1.0724, "step": 400 }, { "epoch": 1.0255754475703325, "grad_norm": 0.607075351205747, "learning_rate": 0.0001546975227094832, "loss": 1.0527, "step": 401 }, { "epoch": 1.0281329923273657, "grad_norm": 0.5993295243529821, "learning_rate": 0.00015464625130562256, "loss": 1.0695, "step": 402 }, { "epoch": 1.030690537084399, "grad_norm": 0.9177173231285568, "learning_rate": 0.0001545947417926838, "loss": 1.0344, "step": 403 }, { "epoch": 1.0332480818414322, "grad_norm": 1.4911897806007488, "learning_rate": 0.00015454299433497362, "loss": 1.0443, "step": 404 }, { "epoch": 1.0358056265984654, "grad_norm": 0.6069008914687445, "learning_rate": 0.00015449100909755784, "loss": 1.0393, "step": 405 }, { "epoch": 1.0383631713554988, "grad_norm": 0.9163856494121054, "learning_rate": 0.00015443878624626066, "loss": 1.0737, "step": 406 }, { "epoch": 1.040920716112532, "grad_norm": 1.369010227838881, "learning_rate": 0.0001543863259476642, "loss": 1.0106, "step": 407 }, { "epoch": 1.0434782608695652, "grad_norm": 0.8651156065397383, "learning_rate": 0.00015433362836910817, "loss": 1.0399, "step": 408 }, { "epoch": 1.0460358056265984, "grad_norm": 0.8527058058258006, "learning_rate": 0.00015428069367868892, "loss": 1.0222, "step": 409 }, { "epoch": 1.0485933503836318, "grad_norm": 0.7680613356197566, "learning_rate": 0.00015422752204525937, "loss": 1.0161, "step": 410 }, { "epoch": 1.051150895140665, "grad_norm": 1.0745283772693792, "learning_rate": 0.0001541741136384281, "loss": 1.0446, "step": 411 }, { "epoch": 1.0537084398976981, "grad_norm": 1.0936408809378098, "learning_rate": 0.00015412046862855902, "loss": 1.0245, "step": 412 }, { "epoch": 1.0562659846547315, "grad_norm": 0.9926125079651018, "learning_rate": 0.00015406658718677076, "loss": 1.0308, "step": 413 }, { "epoch": 1.0588235294117647, "grad_norm": 1.1175953083121093, "learning_rate": 0.00015401246948493612, "loss": 1.0768, "step": 414 }, { "epoch": 1.061381074168798, "grad_norm": 0.8210085027845057, "learning_rate": 0.00015395811569568154, "loss": 1.0473, "step": 415 }, { "epoch": 1.0639386189258313, "grad_norm": 0.9226634652720442, "learning_rate": 0.00015390352599238655, "loss": 1.0299, "step": 416 }, { "epoch": 1.0664961636828645, "grad_norm": 1.2471786951586945, "learning_rate": 0.00015384870054918314, "loss": 1.0139, "step": 417 }, { "epoch": 1.0690537084398977, "grad_norm": 0.8806851237766041, "learning_rate": 0.00015379363954095535, "loss": 1.0237, "step": 418 }, { "epoch": 1.0716112531969308, "grad_norm": 0.727069173053958, "learning_rate": 0.0001537383431433386, "loss": 1.0786, "step": 419 }, { "epoch": 1.0741687979539642, "grad_norm": 0.6337579771769642, "learning_rate": 0.00015368281153271918, "loss": 1.0264, "step": 420 }, { "epoch": 1.0767263427109974, "grad_norm": 0.8868138217653037, "learning_rate": 0.0001536270448862336, "loss": 1.0413, "step": 421 }, { "epoch": 1.0792838874680306, "grad_norm": 0.8013668539540468, "learning_rate": 0.00015357104338176823, "loss": 1.0305, "step": 422 }, { "epoch": 1.081841432225064, "grad_norm": 1.0111414586274687, "learning_rate": 0.00015351480719795845, "loss": 1.0177, "step": 423 }, { "epoch": 1.0843989769820972, "grad_norm": 1.3128642093201517, "learning_rate": 0.00015345833651418835, "loss": 1.0663, "step": 424 }, { "epoch": 1.0869565217391304, "grad_norm": 0.7074818377117421, "learning_rate": 0.00015340163151058997, "loss": 1.0262, "step": 425 }, { "epoch": 1.0895140664961638, "grad_norm": 0.7476417982075203, "learning_rate": 0.00015334469236804278, "loss": 1.0166, "step": 426 }, { "epoch": 1.092071611253197, "grad_norm": 0.7163607115802371, "learning_rate": 0.00015328751926817314, "loss": 1.041, "step": 427 }, { "epoch": 1.0946291560102301, "grad_norm": 1.0614664295591614, "learning_rate": 0.0001532301123933537, "loss": 1.0236, "step": 428 }, { "epoch": 1.0971867007672633, "grad_norm": 1.265439568931787, "learning_rate": 0.00015317247192670282, "loss": 1.0528, "step": 429 }, { "epoch": 1.0997442455242967, "grad_norm": 0.7025263297795912, "learning_rate": 0.00015311459805208397, "loss": 1.0277, "step": 430 }, { "epoch": 1.10230179028133, "grad_norm": 0.8167641509021383, "learning_rate": 0.0001530564909541051, "loss": 1.0582, "step": 431 }, { "epoch": 1.104859335038363, "grad_norm": 0.8716549745993203, "learning_rate": 0.0001529981508181182, "loss": 1.077, "step": 432 }, { "epoch": 1.1074168797953965, "grad_norm": 0.7246028123611893, "learning_rate": 0.00015293957783021854, "loss": 1.0542, "step": 433 }, { "epoch": 1.1099744245524297, "grad_norm": 0.6784199036145839, "learning_rate": 0.0001528807721772442, "loss": 1.0418, "step": 434 }, { "epoch": 1.1125319693094629, "grad_norm": 0.8506075875171634, "learning_rate": 0.00015282173404677533, "loss": 1.0343, "step": 435 }, { "epoch": 1.1150895140664963, "grad_norm": 0.8375757880980345, "learning_rate": 0.00015276246362713375, "loss": 1.0341, "step": 436 }, { "epoch": 1.1176470588235294, "grad_norm": 0.7540319449850698, "learning_rate": 0.00015270296110738221, "loss": 1.014, "step": 437 }, { "epoch": 1.1202046035805626, "grad_norm": 0.9166441931706429, "learning_rate": 0.0001526432266773238, "loss": 1.0269, "step": 438 }, { "epoch": 1.1227621483375958, "grad_norm": 1.0822305273066126, "learning_rate": 0.0001525832605275014, "loss": 1.0472, "step": 439 }, { "epoch": 1.1253196930946292, "grad_norm": 0.9450917972251209, "learning_rate": 0.000152523062849197, "loss": 1.024, "step": 440 }, { "epoch": 1.1278772378516624, "grad_norm": 1.1333566165350994, "learning_rate": 0.0001524626338344311, "loss": 1.0448, "step": 441 }, { "epoch": 1.1304347826086956, "grad_norm": 1.177581998734778, "learning_rate": 0.00015240197367596226, "loss": 1.0244, "step": 442 }, { "epoch": 1.132992327365729, "grad_norm": 0.8866480092962395, "learning_rate": 0.00015234108256728616, "loss": 1.0499, "step": 443 }, { "epoch": 1.1355498721227621, "grad_norm": 0.6882160288370965, "learning_rate": 0.00015227996070263535, "loss": 1.0151, "step": 444 }, { "epoch": 1.1381074168797953, "grad_norm": 0.7419397568748587, "learning_rate": 0.00015221860827697832, "loss": 1.0345, "step": 445 }, { "epoch": 1.1406649616368287, "grad_norm": 0.854881931061872, "learning_rate": 0.00015215702548601907, "loss": 1.008, "step": 446 }, { "epoch": 1.143222506393862, "grad_norm": 0.8138274292487687, "learning_rate": 0.00015209521252619644, "loss": 0.9962, "step": 447 }, { "epoch": 1.145780051150895, "grad_norm": 0.7536271031473499, "learning_rate": 0.00015203316959468344, "loss": 1.0299, "step": 448 }, { "epoch": 1.1483375959079285, "grad_norm": 0.9110426205382722, "learning_rate": 0.0001519708968893867, "loss": 1.019, "step": 449 }, { "epoch": 1.1508951406649617, "grad_norm": 1.2088991550402766, "learning_rate": 0.00015190839460894567, "loss": 1.0708, "step": 450 }, { "epoch": 1.1534526854219949, "grad_norm": 0.8573913285400658, "learning_rate": 0.00015184566295273227, "loss": 1.0417, "step": 451 }, { "epoch": 1.156010230179028, "grad_norm": 0.6951469442919158, "learning_rate": 0.00015178270212084995, "loss": 1.0464, "step": 452 }, { "epoch": 1.1585677749360614, "grad_norm": 0.6419948195410027, "learning_rate": 0.00015171951231413328, "loss": 1.0612, "step": 453 }, { "epoch": 1.1611253196930946, "grad_norm": 0.6841619518854335, "learning_rate": 0.00015165609373414722, "loss": 1.0325, "step": 454 }, { "epoch": 1.1636828644501278, "grad_norm": 0.8037291566188051, "learning_rate": 0.0001515924465831864, "loss": 1.0295, "step": 455 }, { "epoch": 1.1662404092071612, "grad_norm": 1.1795212959071533, "learning_rate": 0.00015152857106427462, "loss": 1.0231, "step": 456 }, { "epoch": 1.1687979539641944, "grad_norm": 1.1007425485117117, "learning_rate": 0.00015146446738116412, "loss": 1.015, "step": 457 }, { "epoch": 1.1713554987212276, "grad_norm": 1.072656472389329, "learning_rate": 0.00015140013573833498, "loss": 1.0195, "step": 458 }, { "epoch": 1.1739130434782608, "grad_norm": 0.9339605123999745, "learning_rate": 0.00015133557634099435, "loss": 1.026, "step": 459 }, { "epoch": 1.1764705882352942, "grad_norm": 0.8580962355846978, "learning_rate": 0.00015127078939507595, "loss": 1.055, "step": 460 }, { "epoch": 1.1790281329923273, "grad_norm": 1.028703820245517, "learning_rate": 0.00015120577510723934, "loss": 1.0768, "step": 461 }, { "epoch": 1.1815856777493605, "grad_norm": 1.1535909770008528, "learning_rate": 0.00015114053368486919, "loss": 1.0227, "step": 462 }, { "epoch": 1.184143222506394, "grad_norm": 0.7549525724152655, "learning_rate": 0.0001510750653360748, "loss": 1.0101, "step": 463 }, { "epoch": 1.186700767263427, "grad_norm": 0.6560485854233202, "learning_rate": 0.00015100937026968922, "loss": 1.0372, "step": 464 }, { "epoch": 1.1892583120204603, "grad_norm": 0.5946694031246916, "learning_rate": 0.0001509434486952688, "loss": 1.0471, "step": 465 }, { "epoch": 1.1918158567774937, "grad_norm": 0.5311919492244818, "learning_rate": 0.00015087730082309232, "loss": 1.0431, "step": 466 }, { "epoch": 1.1943734015345269, "grad_norm": 0.5154174371307244, "learning_rate": 0.00015081092686416043, "loss": 1.0199, "step": 467 }, { "epoch": 1.19693094629156, "grad_norm": 0.505383670902881, "learning_rate": 0.00015074432703019504, "loss": 1.0706, "step": 468 }, { "epoch": 1.1994884910485935, "grad_norm": 0.4907682209551291, "learning_rate": 0.00015067750153363845, "loss": 1.0346, "step": 469 }, { "epoch": 1.2020460358056266, "grad_norm": 0.39066205442828883, "learning_rate": 0.00015061045058765282, "loss": 1.0554, "step": 470 }, { "epoch": 1.2046035805626598, "grad_norm": 0.34420579713251814, "learning_rate": 0.0001505431744061195, "loss": 1.0279, "step": 471 }, { "epoch": 1.207161125319693, "grad_norm": 0.43688810183174753, "learning_rate": 0.0001504756732036383, "loss": 0.9885, "step": 472 }, { "epoch": 1.2097186700767264, "grad_norm": 0.4751633909038584, "learning_rate": 0.00015040794719552676, "loss": 1.0432, "step": 473 }, { "epoch": 1.2122762148337596, "grad_norm": 0.5269656781598262, "learning_rate": 0.00015033999659781953, "loss": 1.027, "step": 474 }, { "epoch": 1.2148337595907928, "grad_norm": 0.5712060191776948, "learning_rate": 0.00015027182162726769, "loss": 1.0421, "step": 475 }, { "epoch": 1.2173913043478262, "grad_norm": 0.6411090148779058, "learning_rate": 0.000150203422501338, "loss": 1.013, "step": 476 }, { "epoch": 1.2199488491048593, "grad_norm": 0.922985318540642, "learning_rate": 0.00015013479943821225, "loss": 1.0671, "step": 477 }, { "epoch": 1.2225063938618925, "grad_norm": 1.411342942170953, "learning_rate": 0.00015006595265678655, "loss": 1.0506, "step": 478 }, { "epoch": 1.2250639386189257, "grad_norm": 0.7044934707287243, "learning_rate": 0.00014999688237667065, "loss": 1.058, "step": 479 }, { "epoch": 1.227621483375959, "grad_norm": 0.844446069080729, "learning_rate": 0.00014992758881818722, "loss": 1.0112, "step": 480 }, { "epoch": 1.2301790281329923, "grad_norm": 0.863795773273135, "learning_rate": 0.00014985807220237112, "loss": 1.0223, "step": 481 }, { "epoch": 1.2327365728900257, "grad_norm": 1.1955253111068895, "learning_rate": 0.00014978833275096872, "loss": 1.0437, "step": 482 }, { "epoch": 1.2352941176470589, "grad_norm": 0.9710436321082059, "learning_rate": 0.00014971837068643732, "loss": 1.0331, "step": 483 }, { "epoch": 1.237851662404092, "grad_norm": 0.9838152365395039, "learning_rate": 0.00014964818623194412, "loss": 1.0503, "step": 484 }, { "epoch": 1.2404092071611252, "grad_norm": 1.3111101164937617, "learning_rate": 0.00014957777961136588, "loss": 1.0536, "step": 485 }, { "epoch": 1.2429667519181586, "grad_norm": 0.9426881648292104, "learning_rate": 0.00014950715104928794, "loss": 1.0452, "step": 486 }, { "epoch": 1.2455242966751918, "grad_norm": 0.9708865131907598, "learning_rate": 0.0001494363007710036, "loss": 1.0205, "step": 487 }, { "epoch": 1.248081841432225, "grad_norm": 0.735118260321914, "learning_rate": 0.00014936522900251348, "loss": 1.0355, "step": 488 }, { "epoch": 1.2506393861892584, "grad_norm": 0.8962772386972064, "learning_rate": 0.00014929393597052458, "loss": 1.0455, "step": 489 }, { "epoch": 1.2531969309462916, "grad_norm": 0.6546912235303116, "learning_rate": 0.00014922242190244981, "loss": 1.0625, "step": 490 }, { "epoch": 1.2557544757033248, "grad_norm": 0.5383201135001036, "learning_rate": 0.0001491506870264071, "loss": 1.0346, "step": 491 }, { "epoch": 1.258312020460358, "grad_norm": 0.8097960021561659, "learning_rate": 0.00014907873157121875, "loss": 1.0605, "step": 492 }, { "epoch": 1.2608695652173914, "grad_norm": 0.670808763781411, "learning_rate": 0.00014900655576641057, "loss": 1.0282, "step": 493 }, { "epoch": 1.2634271099744245, "grad_norm": 0.7979394762122887, "learning_rate": 0.00014893415984221141, "loss": 1.0264, "step": 494 }, { "epoch": 1.265984654731458, "grad_norm": 1.026770422301297, "learning_rate": 0.00014886154402955217, "loss": 1.0514, "step": 495 }, { "epoch": 1.2685421994884911, "grad_norm": 1.032280976957703, "learning_rate": 0.00014878870856006513, "loss": 1.0408, "step": 496 }, { "epoch": 1.2710997442455243, "grad_norm": 1.1296018012465836, "learning_rate": 0.00014871565366608329, "loss": 1.0338, "step": 497 }, { "epoch": 1.2736572890025575, "grad_norm": 0.9749313409863054, "learning_rate": 0.0001486423795806396, "loss": 1.0193, "step": 498 }, { "epoch": 1.2762148337595907, "grad_norm": 0.8177048634676223, "learning_rate": 0.00014856888653746607, "loss": 1.0324, "step": 499 }, { "epoch": 1.278772378516624, "grad_norm": 0.7747012524305006, "learning_rate": 0.00014849517477099334, "loss": 1.0076, "step": 500 }, { "epoch": 1.2813299232736572, "grad_norm": 0.8429034680075405, "learning_rate": 0.00014842124451634956, "loss": 1.0266, "step": 501 }, { "epoch": 1.2838874680306906, "grad_norm": 1.0704964042478793, "learning_rate": 0.00014834709600935995, "loss": 1.033, "step": 502 }, { "epoch": 1.2864450127877238, "grad_norm": 1.1030823411998563, "learning_rate": 0.00014827272948654584, "loss": 1.0519, "step": 503 }, { "epoch": 1.289002557544757, "grad_norm": 0.7099638951621647, "learning_rate": 0.00014819814518512403, "loss": 1.0258, "step": 504 }, { "epoch": 1.2915601023017902, "grad_norm": 0.5286675820388321, "learning_rate": 0.000148123343343006, "loss": 1.0398, "step": 505 }, { "epoch": 1.2941176470588236, "grad_norm": 0.5306607233732565, "learning_rate": 0.0001480483241987971, "loss": 1.0155, "step": 506 }, { "epoch": 1.2966751918158568, "grad_norm": 0.6060078277369222, "learning_rate": 0.0001479730879917959, "loss": 1.0486, "step": 507 }, { "epoch": 1.29923273657289, "grad_norm": 0.8537119327365599, "learning_rate": 0.00014789763496199335, "loss": 1.0115, "step": 508 }, { "epoch": 1.3017902813299234, "grad_norm": 1.0701098672995177, "learning_rate": 0.00014782196535007198, "loss": 1.0449, "step": 509 }, { "epoch": 1.3043478260869565, "grad_norm": 1.0452113870678157, "learning_rate": 0.00014774607939740524, "loss": 1.0132, "step": 510 }, { "epoch": 1.3069053708439897, "grad_norm": 1.0085703377598065, "learning_rate": 0.0001476699773460567, "loss": 1.0229, "step": 511 }, { "epoch": 1.309462915601023, "grad_norm": 0.8918712650363909, "learning_rate": 0.00014759365943877906, "loss": 1.0509, "step": 512 }, { "epoch": 1.3120204603580563, "grad_norm": 0.839691736422046, "learning_rate": 0.00014751712591901385, "loss": 1.0078, "step": 513 }, { "epoch": 1.3145780051150895, "grad_norm": 0.7023292683764998, "learning_rate": 0.00014744037703089014, "loss": 1.0289, "step": 514 }, { "epoch": 1.317135549872123, "grad_norm": 0.686332323144994, "learning_rate": 0.00014736341301922406, "loss": 1.0213, "step": 515 }, { "epoch": 1.319693094629156, "grad_norm": 0.5991056794621004, "learning_rate": 0.00014728623412951802, "loss": 1.0164, "step": 516 }, { "epoch": 1.3222506393861893, "grad_norm": 0.7507696949786656, "learning_rate": 0.00014720884060795975, "loss": 1.0119, "step": 517 }, { "epoch": 1.3248081841432224, "grad_norm": 0.8658712614342154, "learning_rate": 0.00014713123270142163, "loss": 1.0295, "step": 518 }, { "epoch": 1.3273657289002558, "grad_norm": 0.6119299788578647, "learning_rate": 0.00014705341065745999, "loss": 1.0197, "step": 519 }, { "epoch": 1.329923273657289, "grad_norm": 0.4927851179899278, "learning_rate": 0.00014697537472431411, "loss": 1.0624, "step": 520 }, { "epoch": 1.3324808184143222, "grad_norm": 0.4167468121183674, "learning_rate": 0.0001468971251509056, "loss": 1.0647, "step": 521 }, { "epoch": 1.3350383631713556, "grad_norm": 0.47586787480372, "learning_rate": 0.00014681866218683757, "loss": 1.0402, "step": 522 }, { "epoch": 1.3375959079283888, "grad_norm": 0.5745122439927115, "learning_rate": 0.0001467399860823937, "loss": 1.0304, "step": 523 }, { "epoch": 1.340153452685422, "grad_norm": 0.7552655303578069, "learning_rate": 0.00014666109708853767, "loss": 1.0548, "step": 524 }, { "epoch": 1.3427109974424551, "grad_norm": 1.06908823148847, "learning_rate": 0.00014658199545691222, "loss": 1.0287, "step": 525 }, { "epoch": 1.3452685421994885, "grad_norm": 1.1444185918054413, "learning_rate": 0.0001465026814398383, "loss": 1.0539, "step": 526 }, { "epoch": 1.3478260869565217, "grad_norm": 0.7989998085879703, "learning_rate": 0.00014642315529031442, "loss": 1.0035, "step": 527 }, { "epoch": 1.350383631713555, "grad_norm": 0.6352155319789643, "learning_rate": 0.00014634341726201572, "loss": 1.0659, "step": 528 }, { "epoch": 1.3529411764705883, "grad_norm": 0.5614215368601074, "learning_rate": 0.00014626346760929316, "loss": 1.0282, "step": 529 }, { "epoch": 1.3554987212276215, "grad_norm": 0.5422618777488837, "learning_rate": 0.00014618330658717278, "loss": 1.0002, "step": 530 }, { "epoch": 1.3580562659846547, "grad_norm": 0.4783637133302247, "learning_rate": 0.00014610293445135492, "loss": 1.0377, "step": 531 }, { "epoch": 1.3606138107416879, "grad_norm": 0.4390483950197236, "learning_rate": 0.00014602235145821322, "loss": 1.023, "step": 532 }, { "epoch": 1.3631713554987213, "grad_norm": 0.4768466306371761, "learning_rate": 0.00014594155786479398, "loss": 1.0601, "step": 533 }, { "epoch": 1.3657289002557544, "grad_norm": 0.7582418871164014, "learning_rate": 0.00014586055392881527, "loss": 1.0292, "step": 534 }, { "epoch": 1.3682864450127878, "grad_norm": 1.0430189228296438, "learning_rate": 0.00014577933990866617, "loss": 1.0397, "step": 535 }, { "epoch": 1.370843989769821, "grad_norm": 1.2646327577842662, "learning_rate": 0.00014569791606340577, "loss": 1.0749, "step": 536 }, { "epoch": 1.3734015345268542, "grad_norm": 0.6922891659849906, "learning_rate": 0.00014561628265276257, "loss": 1.0293, "step": 537 }, { "epoch": 1.3759590792838874, "grad_norm": 0.44386889614919295, "learning_rate": 0.00014553443993713355, "loss": 1.0398, "step": 538 }, { "epoch": 1.3785166240409208, "grad_norm": 0.5439717030086442, "learning_rate": 0.00014545238817758327, "loss": 1.0268, "step": 539 }, { "epoch": 1.381074168797954, "grad_norm": 0.8373630963710572, "learning_rate": 0.00014537012763584316, "loss": 1.0354, "step": 540 }, { "epoch": 1.3836317135549872, "grad_norm": 1.3266757684220118, "learning_rate": 0.0001452876585743106, "loss": 1.0642, "step": 541 }, { "epoch": 1.3861892583120206, "grad_norm": 0.7488029622406787, "learning_rate": 0.00014520498125604814, "loss": 1.0534, "step": 542 }, { "epoch": 1.3887468030690537, "grad_norm": 0.7282698103684015, "learning_rate": 0.00014512209594478263, "loss": 1.01, "step": 543 }, { "epoch": 1.391304347826087, "grad_norm": 0.7969771518742094, "learning_rate": 0.00014503900290490436, "loss": 1.0307, "step": 544 }, { "epoch": 1.39386189258312, "grad_norm": 0.9263524028660353, "learning_rate": 0.00014495570240146625, "loss": 1.0211, "step": 545 }, { "epoch": 1.3964194373401535, "grad_norm": 1.1608361715103017, "learning_rate": 0.000144872194700183, "loss": 1.0005, "step": 546 }, { "epoch": 1.3989769820971867, "grad_norm": 0.836914057851843, "learning_rate": 0.00014478848006743022, "loss": 1.0387, "step": 547 }, { "epoch": 1.40153452685422, "grad_norm": 0.6826412525653701, "learning_rate": 0.00014470455877024365, "loss": 1.0292, "step": 548 }, { "epoch": 1.4040920716112533, "grad_norm": 0.48703773893723834, "learning_rate": 0.00014462043107631818, "loss": 1.0511, "step": 549 }, { "epoch": 1.4066496163682864, "grad_norm": 0.6223475644721191, "learning_rate": 0.00014453609725400713, "loss": 0.9925, "step": 550 }, { "epoch": 1.4092071611253196, "grad_norm": 0.8882232962821335, "learning_rate": 0.0001444515575723213, "loss": 1.0061, "step": 551 }, { "epoch": 1.4117647058823528, "grad_norm": 1.1304081971561695, "learning_rate": 0.00014436681230092815, "loss": 1.0488, "step": 552 }, { "epoch": 1.4143222506393862, "grad_norm": 0.8848381914341709, "learning_rate": 0.00014428186171015097, "loss": 1.0324, "step": 553 }, { "epoch": 1.4168797953964194, "grad_norm": 0.7483522323458203, "learning_rate": 0.00014419670607096791, "loss": 1.0422, "step": 554 }, { "epoch": 1.4194373401534528, "grad_norm": 0.7721209602826212, "learning_rate": 0.00014411134565501133, "loss": 1.056, "step": 555 }, { "epoch": 1.421994884910486, "grad_norm": 0.8535777213626637, "learning_rate": 0.00014402578073456661, "loss": 1.0408, "step": 556 }, { "epoch": 1.4245524296675192, "grad_norm": 0.6959036355749549, "learning_rate": 0.00014394001158257163, "loss": 1.0271, "step": 557 }, { "epoch": 1.4271099744245523, "grad_norm": 0.6014343484373971, "learning_rate": 0.00014385403847261562, "loss": 1.0193, "step": 558 }, { "epoch": 1.4296675191815857, "grad_norm": 0.7106873814775013, "learning_rate": 0.00014376786167893846, "loss": 1.0122, "step": 559 }, { "epoch": 1.432225063938619, "grad_norm": 0.8444210941994957, "learning_rate": 0.00014368148147642974, "loss": 1.0045, "step": 560 }, { "epoch": 1.434782608695652, "grad_norm": 0.8805969266684864, "learning_rate": 0.00014359489814062788, "loss": 1.0144, "step": 561 }, { "epoch": 1.4373401534526855, "grad_norm": 1.009450224204603, "learning_rate": 0.00014350811194771928, "loss": 1.0287, "step": 562 }, { "epoch": 1.4398976982097187, "grad_norm": 1.2351992837125931, "learning_rate": 0.00014342112317453738, "loss": 1.0566, "step": 563 }, { "epoch": 1.4424552429667519, "grad_norm": 0.6573457770192163, "learning_rate": 0.00014333393209856182, "loss": 1.052, "step": 564 }, { "epoch": 1.445012787723785, "grad_norm": 0.5070847718255479, "learning_rate": 0.00014324653899791765, "loss": 1.0608, "step": 565 }, { "epoch": 1.4475703324808185, "grad_norm": 0.6935855951791632, "learning_rate": 0.00014315894415137416, "loss": 1.0234, "step": 566 }, { "epoch": 1.4501278772378516, "grad_norm": 0.7956146938043426, "learning_rate": 0.00014307114783834442, "loss": 1.0048, "step": 567 }, { "epoch": 1.452685421994885, "grad_norm": 0.9003410836319078, "learning_rate": 0.0001429831503388839, "loss": 1.0363, "step": 568 }, { "epoch": 1.4552429667519182, "grad_norm": 1.0643618726104027, "learning_rate": 0.00014289495193368996, "loss": 1.0269, "step": 569 }, { "epoch": 1.4578005115089514, "grad_norm": 0.9080907950888324, "learning_rate": 0.0001428065529041008, "loss": 1.017, "step": 570 }, { "epoch": 1.4603580562659846, "grad_norm": 0.8536436997073572, "learning_rate": 0.00014271795353209456, "loss": 1.0375, "step": 571 }, { "epoch": 1.4629156010230178, "grad_norm": 0.9398461282489688, "learning_rate": 0.00014262915410028848, "loss": 1.0434, "step": 572 }, { "epoch": 1.4654731457800512, "grad_norm": 0.9631928132083718, "learning_rate": 0.00014254015489193782, "loss": 1.0292, "step": 573 }, { "epoch": 1.4680306905370843, "grad_norm": 0.9076791954370104, "learning_rate": 0.00014245095619093532, "loss": 1.0159, "step": 574 }, { "epoch": 1.4705882352941178, "grad_norm": 0.9587339014454659, "learning_rate": 0.00014236155828180983, "loss": 1.0484, "step": 575 }, { "epoch": 1.473145780051151, "grad_norm": 0.8891566782622077, "learning_rate": 0.00014227196144972582, "loss": 1.0508, "step": 576 }, { "epoch": 1.4757033248081841, "grad_norm": 0.6581614104684226, "learning_rate": 0.0001421821659804822, "loss": 1.0403, "step": 577 }, { "epoch": 1.4782608695652173, "grad_norm": 0.5861192400584929, "learning_rate": 0.00014209217216051156, "loss": 1.0304, "step": 578 }, { "epoch": 1.4808184143222507, "grad_norm": 0.5774127863656433, "learning_rate": 0.00014200198027687912, "loss": 1.0102, "step": 579 }, { "epoch": 1.4833759590792839, "grad_norm": 0.6502157171768282, "learning_rate": 0.00014191159061728193, "loss": 1.0253, "step": 580 }, { "epoch": 1.485933503836317, "grad_norm": 0.5386614139768452, "learning_rate": 0.00014182100347004793, "loss": 1.044, "step": 581 }, { "epoch": 1.4884910485933505, "grad_norm": 0.4786011997004328, "learning_rate": 0.000141730219124135, "loss": 1.0322, "step": 582 }, { "epoch": 1.4910485933503836, "grad_norm": 0.5755235187273994, "learning_rate": 0.00014163923786913004, "loss": 1.0572, "step": 583 }, { "epoch": 1.4936061381074168, "grad_norm": 0.641263771557679, "learning_rate": 0.00014154805999524802, "loss": 1.0627, "step": 584 }, { "epoch": 1.49616368286445, "grad_norm": 0.798665776000645, "learning_rate": 0.0001414566857933312, "loss": 1.0017, "step": 585 }, { "epoch": 1.4987212276214834, "grad_norm": 0.8759678129527348, "learning_rate": 0.00014136511555484798, "loss": 1.0168, "step": 586 }, { "epoch": 1.5012787723785166, "grad_norm": 0.7904395533793586, "learning_rate": 0.00014127334957189219, "loss": 1.0253, "step": 587 }, { "epoch": 1.50383631713555, "grad_norm": 0.6451046472087583, "learning_rate": 0.00014118138813718192, "loss": 1.0523, "step": 588 }, { "epoch": 1.5063938618925832, "grad_norm": 0.5705461372803496, "learning_rate": 0.0001410892315440588, "loss": 0.9921, "step": 589 }, { "epoch": 1.5089514066496164, "grad_norm": 0.6000400371240294, "learning_rate": 0.00014099688008648703, "loss": 1.0219, "step": 590 }, { "epoch": 1.5115089514066495, "grad_norm": 0.6112952152068515, "learning_rate": 0.0001409043340590523, "loss": 0.9963, "step": 591 }, { "epoch": 1.5140664961636827, "grad_norm": 0.5886324573188866, "learning_rate": 0.00014081159375696102, "loss": 1.0484, "step": 592 }, { "epoch": 1.5166240409207161, "grad_norm": 0.5048817308801855, "learning_rate": 0.00014071865947603922, "loss": 0.978, "step": 593 }, { "epoch": 1.5191815856777495, "grad_norm": 0.5000111304078102, "learning_rate": 0.00014062553151273177, "loss": 1.0431, "step": 594 }, { "epoch": 1.5217391304347827, "grad_norm": 0.47701322805085783, "learning_rate": 0.0001405322101641013, "loss": 1.0157, "step": 595 }, { "epoch": 1.5242966751918159, "grad_norm": 0.45047959305759844, "learning_rate": 0.00014043869572782737, "loss": 1.026, "step": 596 }, { "epoch": 1.526854219948849, "grad_norm": 0.37562193605886857, "learning_rate": 0.00014034498850220537, "loss": 1.0334, "step": 597 }, { "epoch": 1.5294117647058822, "grad_norm": 0.44055163797782626, "learning_rate": 0.00014025108878614576, "loss": 1.0353, "step": 598 }, { "epoch": 1.5319693094629157, "grad_norm": 0.39725606847915634, "learning_rate": 0.0001401569968791729, "loss": 1.0115, "step": 599 }, { "epoch": 1.5345268542199488, "grad_norm": 0.39650786805208904, "learning_rate": 0.00014006271308142433, "loss": 1.0604, "step": 600 }, { "epoch": 1.5370843989769822, "grad_norm": 0.32569926641458746, "learning_rate": 0.0001399682376936495, "loss": 1.0096, "step": 601 }, { "epoch": 1.5396419437340154, "grad_norm": 0.43543100187257516, "learning_rate": 0.00013987357101720929, "loss": 1.0059, "step": 602 }, { "epoch": 1.5421994884910486, "grad_norm": 0.458695174168892, "learning_rate": 0.00013977871335407445, "loss": 1.0197, "step": 603 }, { "epoch": 1.5447570332480818, "grad_norm": 0.43690410697330667, "learning_rate": 0.00013968366500682514, "loss": 1.0302, "step": 604 }, { "epoch": 1.547314578005115, "grad_norm": 0.4143725631119223, "learning_rate": 0.00013958842627864975, "loss": 1.0167, "step": 605 }, { "epoch": 1.5498721227621484, "grad_norm": 0.36509470245988934, "learning_rate": 0.00013949299747334387, "loss": 0.994, "step": 606 }, { "epoch": 1.5524296675191815, "grad_norm": 0.42997115738098735, "learning_rate": 0.00013939737889530948, "loss": 1.0182, "step": 607 }, { "epoch": 1.554987212276215, "grad_norm": 0.519737904298238, "learning_rate": 0.00013930157084955387, "loss": 1.0432, "step": 608 }, { "epoch": 1.5575447570332481, "grad_norm": 0.5413718715320616, "learning_rate": 0.00013920557364168872, "loss": 1.0392, "step": 609 }, { "epoch": 1.5601023017902813, "grad_norm": 0.4622784565390988, "learning_rate": 0.00013910938757792911, "loss": 1.0089, "step": 610 }, { "epoch": 1.5626598465473145, "grad_norm": 0.517572135003303, "learning_rate": 0.00013901301296509247, "loss": 1.0433, "step": 611 }, { "epoch": 1.5652173913043477, "grad_norm": 0.6472771877158792, "learning_rate": 0.00013891645011059774, "loss": 1.033, "step": 612 }, { "epoch": 1.567774936061381, "grad_norm": 0.73777975779115, "learning_rate": 0.00013881969932246434, "loss": 1.0233, "step": 613 }, { "epoch": 1.5703324808184145, "grad_norm": 0.6556752106938734, "learning_rate": 0.00013872276090931112, "loss": 1.0283, "step": 614 }, { "epoch": 1.5728900255754477, "grad_norm": 0.647001672639268, "learning_rate": 0.0001386256351803554, "loss": 1.0449, "step": 615 }, { "epoch": 1.5754475703324808, "grad_norm": 0.755466796600313, "learning_rate": 0.00013852832244541207, "loss": 1.0005, "step": 616 }, { "epoch": 1.578005115089514, "grad_norm": 0.9067726592525303, "learning_rate": 0.00013843082301489247, "loss": 1.034, "step": 617 }, { "epoch": 1.5805626598465472, "grad_norm": 1.205016289595881, "learning_rate": 0.00013833313719980358, "loss": 1.0292, "step": 618 }, { "epoch": 1.5831202046035806, "grad_norm": 0.8478168612376876, "learning_rate": 0.00013823526531174675, "loss": 1.0142, "step": 619 }, { "epoch": 1.5856777493606138, "grad_norm": 0.7403592560784086, "learning_rate": 0.000138137207662917, "loss": 1.0019, "step": 620 }, { "epoch": 1.5882352941176472, "grad_norm": 0.6403376151233803, "learning_rate": 0.00013803896456610187, "loss": 1.0308, "step": 621 }, { "epoch": 1.5907928388746804, "grad_norm": 0.712308710605845, "learning_rate": 0.0001379405363346804, "loss": 1.0455, "step": 622 }, { "epoch": 1.5933503836317136, "grad_norm": 0.6512025986675177, "learning_rate": 0.00013784192328262227, "loss": 1.018, "step": 623 }, { "epoch": 1.5959079283887467, "grad_norm": 0.6467882755688008, "learning_rate": 0.00013774312572448658, "loss": 1.0566, "step": 624 }, { "epoch": 1.59846547314578, "grad_norm": 0.7409770827879977, "learning_rate": 0.00013764414397542113, "loss": 1.0759, "step": 625 }, { "epoch": 1.6010230179028133, "grad_norm": 0.8147656835217053, "learning_rate": 0.0001375449783511611, "loss": 1.0041, "step": 626 }, { "epoch": 1.6035805626598465, "grad_norm": 0.9034624506464588, "learning_rate": 0.0001374456291680283, "loss": 1.0141, "step": 627 }, { "epoch": 1.60613810741688, "grad_norm": 1.0050570938199166, "learning_rate": 0.00013734609674293001, "loss": 1.0532, "step": 628 }, { "epoch": 1.608695652173913, "grad_norm": 0.9807521253903259, "learning_rate": 0.00013724638139335808, "loss": 1.0079, "step": 629 }, { "epoch": 1.6112531969309463, "grad_norm": 1.0251289878636651, "learning_rate": 0.00013714648343738785, "loss": 1.014, "step": 630 }, { "epoch": 1.6138107416879794, "grad_norm": 1.1145588268761022, "learning_rate": 0.00013704640319367706, "loss": 1.0217, "step": 631 }, { "epoch": 1.6163682864450126, "grad_norm": 0.9024588644594059, "learning_rate": 0.000136946140981465, "loss": 1.0151, "step": 632 }, { "epoch": 1.618925831202046, "grad_norm": 0.7164435145214515, "learning_rate": 0.00013684569712057141, "loss": 0.9972, "step": 633 }, { "epoch": 1.6214833759590794, "grad_norm": 0.40989603024156007, "learning_rate": 0.0001367450719313954, "loss": 1.0438, "step": 634 }, { "epoch": 1.6240409207161126, "grad_norm": 0.4621187072292993, "learning_rate": 0.00013664426573491454, "loss": 0.9964, "step": 635 }, { "epoch": 1.6265984654731458, "grad_norm": 0.7796243265332405, "learning_rate": 0.0001365432788526838, "loss": 1.0428, "step": 636 }, { "epoch": 1.629156010230179, "grad_norm": 0.9807118313427811, "learning_rate": 0.0001364421116068344, "loss": 1.0374, "step": 637 }, { "epoch": 1.6317135549872122, "grad_norm": 1.0521751456854462, "learning_rate": 0.00013634076432007298, "loss": 1.022, "step": 638 }, { "epoch": 1.6342710997442456, "grad_norm": 1.014819808376515, "learning_rate": 0.00013623923731568053, "loss": 1.0555, "step": 639 }, { "epoch": 1.6368286445012787, "grad_norm": 0.8908217824529507, "learning_rate": 0.00013613753091751117, "loss": 0.9896, "step": 640 }, { "epoch": 1.6393861892583121, "grad_norm": 0.7338590542416318, "learning_rate": 0.00013603564544999134, "loss": 1.0104, "step": 641 }, { "epoch": 1.6419437340153453, "grad_norm": 0.4947515917010355, "learning_rate": 0.00013593358123811873, "loss": 1.013, "step": 642 }, { "epoch": 1.6445012787723785, "grad_norm": 0.3613565103885808, "learning_rate": 0.00013583133860746102, "loss": 1.0285, "step": 643 }, { "epoch": 1.6470588235294117, "grad_norm": 0.44918465574622884, "learning_rate": 0.00013572891788415526, "loss": 1.0735, "step": 644 }, { "epoch": 1.6496163682864449, "grad_norm": 0.6919277753013154, "learning_rate": 0.00013562631939490638, "loss": 0.9838, "step": 645 }, { "epoch": 1.6521739130434783, "grad_norm": 0.998596135317296, "learning_rate": 0.00013552354346698644, "loss": 1.0407, "step": 646 }, { "epoch": 1.6547314578005117, "grad_norm": 1.1274200277350097, "learning_rate": 0.0001354205904282335, "loss": 0.9994, "step": 647 }, { "epoch": 1.6572890025575449, "grad_norm": 0.7298162047765786, "learning_rate": 0.0001353174606070505, "loss": 1.0158, "step": 648 }, { "epoch": 1.659846547314578, "grad_norm": 0.4959923867676345, "learning_rate": 0.00013521415433240448, "loss": 1.0223, "step": 649 }, { "epoch": 1.6624040920716112, "grad_norm": 0.4028073795408234, "learning_rate": 0.0001351106719338251, "loss": 1.0048, "step": 650 }, { "epoch": 1.6649616368286444, "grad_norm": 0.4151895967851957, "learning_rate": 0.000135007013741404, "loss": 1.031, "step": 651 }, { "epoch": 1.6675191815856778, "grad_norm": 0.493296338959119, "learning_rate": 0.0001349031800857934, "loss": 1.0551, "step": 652 }, { "epoch": 1.670076726342711, "grad_norm": 0.5474927271625798, "learning_rate": 0.00013479917129820547, "loss": 1.0296, "step": 653 }, { "epoch": 1.6726342710997444, "grad_norm": 0.6314250125042725, "learning_rate": 0.00013469498771041078, "loss": 1.0355, "step": 654 }, { "epoch": 1.6751918158567776, "grad_norm": 0.7183033795455095, "learning_rate": 0.0001345906296547376, "loss": 1.0239, "step": 655 }, { "epoch": 1.6777493606138107, "grad_norm": 0.6627049343116693, "learning_rate": 0.00013448609746407076, "loss": 1.0107, "step": 656 }, { "epoch": 1.680306905370844, "grad_norm": 0.8323267890128159, "learning_rate": 0.0001343813914718504, "loss": 1.0132, "step": 657 }, { "epoch": 1.682864450127877, "grad_norm": 1.0100396544553614, "learning_rate": 0.0001342765120120712, "loss": 1.034, "step": 658 }, { "epoch": 1.6854219948849105, "grad_norm": 0.9397586944756832, "learning_rate": 0.0001341714594192811, "loss": 1.0359, "step": 659 }, { "epoch": 1.6879795396419437, "grad_norm": 0.60948367814948, "learning_rate": 0.00013406623402858038, "loss": 1.0515, "step": 660 }, { "epoch": 1.690537084398977, "grad_norm": 0.4064851961480879, "learning_rate": 0.00013396083617562041, "loss": 1.0295, "step": 661 }, { "epoch": 1.6930946291560103, "grad_norm": 0.4835321670487211, "learning_rate": 0.0001338552661966028, "loss": 1.0218, "step": 662 }, { "epoch": 1.6956521739130435, "grad_norm": 0.5087590456762057, "learning_rate": 0.00013374952442827813, "loss": 1.0438, "step": 663 }, { "epoch": 1.6982097186700766, "grad_norm": 0.487251739240553, "learning_rate": 0.00013364361120794495, "loss": 1.0293, "step": 664 }, { "epoch": 1.7007672634271098, "grad_norm": 0.5712982739684782, "learning_rate": 0.00013353752687344882, "loss": 1.0332, "step": 665 }, { "epoch": 1.7033248081841432, "grad_norm": 0.7033661782388088, "learning_rate": 0.000133431271763181, "loss": 1.0053, "step": 666 }, { "epoch": 1.7058823529411766, "grad_norm": 0.6935444307133046, "learning_rate": 0.00013332484621607758, "loss": 1.0262, "step": 667 }, { "epoch": 1.7084398976982098, "grad_norm": 0.7341105705188075, "learning_rate": 0.00013321825057161825, "loss": 1.0156, "step": 668 }, { "epoch": 1.710997442455243, "grad_norm": 0.7907280681410083, "learning_rate": 0.00013311148516982534, "loss": 1.0413, "step": 669 }, { "epoch": 1.7135549872122762, "grad_norm": 0.7112672488330658, "learning_rate": 0.00013300455035126268, "loss": 1.0199, "step": 670 }, { "epoch": 1.7161125319693094, "grad_norm": 0.5766576717286938, "learning_rate": 0.00013289744645703444, "loss": 1.0361, "step": 671 }, { "epoch": 1.7186700767263428, "grad_norm": 0.5059688666618373, "learning_rate": 0.0001327901738287842, "loss": 1.0385, "step": 672 }, { "epoch": 1.721227621483376, "grad_norm": 0.45263501963427877, "learning_rate": 0.0001326827328086937, "loss": 1.0163, "step": 673 }, { "epoch": 1.7237851662404093, "grad_norm": 0.5156404930129397, "learning_rate": 0.00013257512373948186, "loss": 1.0592, "step": 674 }, { "epoch": 1.7263427109974425, "grad_norm": 0.6373966994332245, "learning_rate": 0.00013246734696440368, "loss": 1.0303, "step": 675 }, { "epoch": 1.7289002557544757, "grad_norm": 0.6497706378399105, "learning_rate": 0.000132359402827249, "loss": 0.9963, "step": 676 }, { "epoch": 1.7314578005115089, "grad_norm": 0.6649205635237081, "learning_rate": 0.0001322512916723417, "loss": 1.0133, "step": 677 }, { "epoch": 1.734015345268542, "grad_norm": 0.7302337459964975, "learning_rate": 0.00013214301384453824, "loss": 1.0143, "step": 678 }, { "epoch": 1.7365728900255755, "grad_norm": 0.7742690150052379, "learning_rate": 0.00013203456968922684, "loss": 1.0164, "step": 679 }, { "epoch": 1.7391304347826086, "grad_norm": 0.6798309822233196, "learning_rate": 0.0001319259595523262, "loss": 1.0172, "step": 680 }, { "epoch": 1.741687979539642, "grad_norm": 0.5208733748449712, "learning_rate": 0.0001318171837802846, "loss": 1.0048, "step": 681 }, { "epoch": 1.7442455242966752, "grad_norm": 0.41856841228081965, "learning_rate": 0.00013170824272007854, "loss": 1.0508, "step": 682 }, { "epoch": 1.7468030690537084, "grad_norm": 0.41744052183195546, "learning_rate": 0.00013159913671921184, "loss": 1.0433, "step": 683 }, { "epoch": 1.7493606138107416, "grad_norm": 0.45034351237029546, "learning_rate": 0.00013148986612571438, "loss": 1.0281, "step": 684 }, { "epoch": 1.7519181585677748, "grad_norm": 0.5021896906440644, "learning_rate": 0.00013138043128814114, "loss": 1.0207, "step": 685 }, { "epoch": 1.7544757033248082, "grad_norm": 0.6367316434278153, "learning_rate": 0.000131270832555571, "loss": 1.0509, "step": 686 }, { "epoch": 1.7570332480818416, "grad_norm": 0.9449450079946309, "learning_rate": 0.00013116107027760557, "loss": 1.0263, "step": 687 }, { "epoch": 1.7595907928388748, "grad_norm": 1.2671861813793404, "learning_rate": 0.00013105114480436823, "loss": 1.015, "step": 688 }, { "epoch": 1.762148337595908, "grad_norm": 0.6133472053088566, "learning_rate": 0.00013094105648650285, "loss": 0.9964, "step": 689 }, { "epoch": 1.7647058823529411, "grad_norm": 0.5563333895443464, "learning_rate": 0.00013083080567517284, "loss": 1.0221, "step": 690 }, { "epoch": 1.7672634271099743, "grad_norm": 0.8984060988722041, "learning_rate": 0.0001307203927220598, "loss": 1.0333, "step": 691 }, { "epoch": 1.7698209718670077, "grad_norm": 1.1600459077736829, "learning_rate": 0.0001306098179793627, "loss": 1.0281, "step": 692 }, { "epoch": 1.772378516624041, "grad_norm": 0.8749748158295617, "learning_rate": 0.00013049908179979644, "loss": 1.0414, "step": 693 }, { "epoch": 1.7749360613810743, "grad_norm": 0.6456013771393564, "learning_rate": 0.00013038818453659098, "loss": 0.9934, "step": 694 }, { "epoch": 1.7774936061381075, "grad_norm": 0.4834000513881869, "learning_rate": 0.00013027712654349003, "loss": 1.0077, "step": 695 }, { "epoch": 1.7800511508951407, "grad_norm": 0.46969762642929197, "learning_rate": 0.0001301659081747501, "loss": 1.0408, "step": 696 }, { "epoch": 1.7826086956521738, "grad_norm": 0.5147779689056563, "learning_rate": 0.0001300545297851392, "loss": 1.0186, "step": 697 }, { "epoch": 1.785166240409207, "grad_norm": 0.55729153001615, "learning_rate": 0.0001299429917299358, "loss": 1.0329, "step": 698 }, { "epoch": 1.7877237851662404, "grad_norm": 0.5260414108398854, "learning_rate": 0.00012983129436492763, "loss": 1.0233, "step": 699 }, { "epoch": 1.7902813299232738, "grad_norm": 0.5427361149590243, "learning_rate": 0.00012971943804641068, "loss": 1.0409, "step": 700 }, { "epoch": 1.792838874680307, "grad_norm": 0.5405520825559765, "learning_rate": 0.0001296074231311879, "loss": 1.0066, "step": 701 }, { "epoch": 1.7953964194373402, "grad_norm": 0.6297890907155308, "learning_rate": 0.0001294952499765682, "loss": 1.0254, "step": 702 }, { "epoch": 1.7979539641943734, "grad_norm": 0.6644546067252105, "learning_rate": 0.00012938291894036522, "loss": 1.0285, "step": 703 }, { "epoch": 1.8005115089514065, "grad_norm": 0.683427488866508, "learning_rate": 0.00012927043038089616, "loss": 1.0091, "step": 704 }, { "epoch": 1.80306905370844, "grad_norm": 0.6319295334248269, "learning_rate": 0.00012915778465698077, "loss": 1.0397, "step": 705 }, { "epoch": 1.8056265984654731, "grad_norm": 0.5438735087695892, "learning_rate": 0.00012904498212794007, "loss": 0.991, "step": 706 }, { "epoch": 1.8081841432225065, "grad_norm": 0.5047705166677889, "learning_rate": 0.00012893202315359537, "loss": 0.9944, "step": 707 }, { "epoch": 1.8107416879795397, "grad_norm": 0.5361496724146492, "learning_rate": 0.00012881890809426688, "loss": 1.0212, "step": 708 }, { "epoch": 1.813299232736573, "grad_norm": 0.4758891777297796, "learning_rate": 0.00012870563731077277, "loss": 0.9717, "step": 709 }, { "epoch": 1.815856777493606, "grad_norm": 0.41562952895729655, "learning_rate": 0.0001285922111644279, "loss": 1.0162, "step": 710 }, { "epoch": 1.8184143222506393, "grad_norm": 0.4923656957788762, "learning_rate": 0.00012847863001704278, "loss": 1.0685, "step": 711 }, { "epoch": 1.8209718670076727, "grad_norm": 0.43817036243213936, "learning_rate": 0.00012836489423092225, "loss": 1.0166, "step": 712 }, { "epoch": 1.8235294117647058, "grad_norm": 0.36194875273904087, "learning_rate": 0.00012825100416886454, "loss": 1.0255, "step": 713 }, { "epoch": 1.8260869565217392, "grad_norm": 0.5507986270387409, "learning_rate": 0.0001281369601941599, "loss": 1.0135, "step": 714 }, { "epoch": 1.8286445012787724, "grad_norm": 0.685338916623197, "learning_rate": 0.00012802276267058957, "loss": 0.999, "step": 715 }, { "epoch": 1.8312020460358056, "grad_norm": 0.5568312967518175, "learning_rate": 0.00012790841196242458, "loss": 1.0153, "step": 716 }, { "epoch": 1.8337595907928388, "grad_norm": 0.4401729278401454, "learning_rate": 0.00012779390843442462, "loss": 0.9855, "step": 717 }, { "epoch": 1.836317135549872, "grad_norm": 0.4249893778808539, "learning_rate": 0.00012767925245183676, "loss": 1.0351, "step": 718 }, { "epoch": 1.8388746803069054, "grad_norm": 0.47539299147834413, "learning_rate": 0.00012756444438039453, "loss": 1.035, "step": 719 }, { "epoch": 1.8414322250639388, "grad_norm": 0.5475741371560751, "learning_rate": 0.00012744948458631646, "loss": 1.0412, "step": 720 }, { "epoch": 1.843989769820972, "grad_norm": 0.5751955332609484, "learning_rate": 0.0001273343734363051, "loss": 1.0419, "step": 721 }, { "epoch": 1.8465473145780051, "grad_norm": 0.5673429560849089, "learning_rate": 0.00012721911129754578, "loss": 0.9993, "step": 722 }, { "epoch": 1.8491048593350383, "grad_norm": 0.475786389030356, "learning_rate": 0.0001271036985377055, "loss": 1.0255, "step": 723 }, { "epoch": 1.8516624040920715, "grad_norm": 0.4435215042959613, "learning_rate": 0.00012698813552493174, "loss": 1.0159, "step": 724 }, { "epoch": 1.854219948849105, "grad_norm": 0.6384652673350472, "learning_rate": 0.00012687242262785116, "loss": 1.0468, "step": 725 }, { "epoch": 1.856777493606138, "grad_norm": 0.660707948092585, "learning_rate": 0.00012675656021556855, "loss": 0.9702, "step": 726 }, { "epoch": 1.8593350383631715, "grad_norm": 0.5190779530078301, "learning_rate": 0.00012664054865766573, "loss": 0.9959, "step": 727 }, { "epoch": 1.8618925831202047, "grad_norm": 0.59002541889049, "learning_rate": 0.00012652438832420017, "loss": 1.0009, "step": 728 }, { "epoch": 1.8644501278772379, "grad_norm": 0.724406502768554, "learning_rate": 0.00012640807958570394, "loss": 1.0572, "step": 729 }, { "epoch": 1.867007672634271, "grad_norm": 0.606082979636232, "learning_rate": 0.00012629162281318248, "loss": 1.0123, "step": 730 }, { "epoch": 1.8695652173913042, "grad_norm": 0.3890444487309348, "learning_rate": 0.00012617501837811347, "loss": 0.9835, "step": 731 }, { "epoch": 1.8721227621483376, "grad_norm": 0.4748189131220067, "learning_rate": 0.00012605826665244559, "loss": 1.0206, "step": 732 }, { "epoch": 1.8746803069053708, "grad_norm": 0.5894024279814004, "learning_rate": 0.00012594136800859733, "loss": 1.0312, "step": 733 }, { "epoch": 1.8772378516624042, "grad_norm": 0.8812294314944346, "learning_rate": 0.00012582432281945587, "loss": 0.9929, "step": 734 }, { "epoch": 1.8797953964194374, "grad_norm": 1.2695722544281176, "learning_rate": 0.0001257071314583758, "loss": 1.0232, "step": 735 }, { "epoch": 1.8823529411764706, "grad_norm": 0.7877721338048511, "learning_rate": 0.00012558979429917803, "loss": 1.0528, "step": 736 }, { "epoch": 1.8849104859335037, "grad_norm": 0.6479567586178989, "learning_rate": 0.00012547231171614845, "loss": 1.0262, "step": 737 }, { "epoch": 1.887468030690537, "grad_norm": 0.6844520570754378, "learning_rate": 0.00012535468408403697, "loss": 1.0333, "step": 738 }, { "epoch": 1.8900255754475703, "grad_norm": 0.6085957966970293, "learning_rate": 0.00012523691177805597, "loss": 1.0168, "step": 739 }, { "epoch": 1.8925831202046037, "grad_norm": 0.5254572324853038, "learning_rate": 0.00012511899517387955, "loss": 0.9883, "step": 740 }, { "epoch": 1.895140664961637, "grad_norm": 0.6139364866532532, "learning_rate": 0.00012500093464764197, "loss": 0.9977, "step": 741 }, { "epoch": 1.89769820971867, "grad_norm": 0.6998963267481692, "learning_rate": 0.00012488273057593654, "loss": 1.0044, "step": 742 }, { "epoch": 1.9002557544757033, "grad_norm": 0.5270554785542413, "learning_rate": 0.00012476438333581456, "loss": 1.0412, "step": 743 }, { "epoch": 1.9028132992327365, "grad_norm": 0.5157043265448235, "learning_rate": 0.00012464589330478398, "loss": 0.9978, "step": 744 }, { "epoch": 1.9053708439897699, "grad_norm": 0.5631065206891138, "learning_rate": 0.0001245272608608082, "loss": 0.9944, "step": 745 }, { "epoch": 1.907928388746803, "grad_norm": 0.4807212257749526, "learning_rate": 0.00012440848638230485, "loss": 1.0184, "step": 746 }, { "epoch": 1.9104859335038364, "grad_norm": 0.42670701279562534, "learning_rate": 0.00012428957024814477, "loss": 1.0105, "step": 747 }, { "epoch": 1.9130434782608696, "grad_norm": 0.41188284810782877, "learning_rate": 0.00012417051283765055, "loss": 1.0256, "step": 748 }, { "epoch": 1.9156010230179028, "grad_norm": 0.39912216267661754, "learning_rate": 0.0001240513145305954, "loss": 1.0479, "step": 749 }, { "epoch": 1.918158567774936, "grad_norm": 0.40181896505552256, "learning_rate": 0.00012393197570720208, "loss": 1.0006, "step": 750 }, { "epoch": 1.9207161125319692, "grad_norm": 0.4686514718132313, "learning_rate": 0.0001238124967481415, "loss": 1.0527, "step": 751 }, { "epoch": 1.9232736572890026, "grad_norm": 0.4847458570755899, "learning_rate": 0.00012369287803453156, "loss": 1.0039, "step": 752 }, { "epoch": 1.9258312020460358, "grad_norm": 0.5873940841619928, "learning_rate": 0.00012357311994793603, "loss": 1.0191, "step": 753 }, { "epoch": 1.9283887468030692, "grad_norm": 0.6710549953392281, "learning_rate": 0.00012345322287036315, "loss": 1.014, "step": 754 }, { "epoch": 1.9309462915601023, "grad_norm": 0.7897611598340533, "learning_rate": 0.0001233331871842646, "loss": 0.9853, "step": 755 }, { "epoch": 1.9335038363171355, "grad_norm": 0.870069888372245, "learning_rate": 0.0001232130132725342, "loss": 1.022, "step": 756 }, { "epoch": 1.9360613810741687, "grad_norm": 1.0698935466826593, "learning_rate": 0.00012309270151850666, "loss": 1.0199, "step": 757 }, { "epoch": 1.938618925831202, "grad_norm": 1.0318153691478889, "learning_rate": 0.00012297225230595637, "loss": 1.0008, "step": 758 }, { "epoch": 1.9411764705882353, "grad_norm": 0.8031059628622865, "learning_rate": 0.0001228516660190962, "loss": 1.0464, "step": 759 }, { "epoch": 1.9437340153452687, "grad_norm": 0.4432470641559668, "learning_rate": 0.00012273094304257633, "loss": 1.0486, "step": 760 }, { "epoch": 1.9462915601023019, "grad_norm": 0.4413834236432169, "learning_rate": 0.00012261008376148282, "loss": 1.0483, "step": 761 }, { "epoch": 1.948849104859335, "grad_norm": 0.5753204802658383, "learning_rate": 0.0001224890885613366, "loss": 1.026, "step": 762 }, { "epoch": 1.9514066496163682, "grad_norm": 0.6330964706251369, "learning_rate": 0.00012236795782809225, "loss": 1.017, "step": 763 }, { "epoch": 1.9539641943734014, "grad_norm": 0.6869010778127252, "learning_rate": 0.00012224669194813647, "loss": 1.031, "step": 764 }, { "epoch": 1.9565217391304348, "grad_norm": 0.7455335150670086, "learning_rate": 0.00012212529130828725, "loss": 0.9639, "step": 765 }, { "epoch": 1.959079283887468, "grad_norm": 0.6598851148094896, "learning_rate": 0.00012200375629579234, "loss": 1.0298, "step": 766 }, { "epoch": 1.9616368286445014, "grad_norm": 0.44847708135640946, "learning_rate": 0.0001218820872983281, "loss": 0.9979, "step": 767 }, { "epoch": 1.9641943734015346, "grad_norm": 0.4421542384496395, "learning_rate": 0.00012176028470399836, "loss": 1.0219, "step": 768 }, { "epoch": 1.9667519181585678, "grad_norm": 0.5551681283301225, "learning_rate": 0.00012163834890133303, "loss": 1.0321, "step": 769 }, { "epoch": 1.969309462915601, "grad_norm": 0.5433680138372817, "learning_rate": 0.000121516280279287, "loss": 1.0152, "step": 770 }, { "epoch": 1.9718670076726341, "grad_norm": 0.3927534411279976, "learning_rate": 0.00012139407922723875, "loss": 1.0056, "step": 771 }, { "epoch": 1.9744245524296675, "grad_norm": 0.3504638375301521, "learning_rate": 0.00012127174613498925, "loss": 1.0211, "step": 772 }, { "epoch": 1.976982097186701, "grad_norm": 0.5235226714465111, "learning_rate": 0.00012114928139276064, "loss": 1.0298, "step": 773 }, { "epoch": 1.979539641943734, "grad_norm": 0.47218634270204046, "learning_rate": 0.00012102668539119501, "loss": 0.997, "step": 774 }, { "epoch": 1.9820971867007673, "grad_norm": 0.3909468495312419, "learning_rate": 0.00012090395852135314, "loss": 1.008, "step": 775 }, { "epoch": 1.9846547314578005, "grad_norm": 0.3354579546285365, "learning_rate": 0.0001207811011747132, "loss": 1.0247, "step": 776 }, { "epoch": 1.9872122762148337, "grad_norm": 0.3467079716757078, "learning_rate": 0.00012065811374316966, "loss": 1.0049, "step": 777 }, { "epoch": 1.989769820971867, "grad_norm": 0.3407603167118022, "learning_rate": 0.0001205349966190319, "loss": 1.0454, "step": 778 }, { "epoch": 1.9923273657289002, "grad_norm": 0.3172074392515775, "learning_rate": 0.00012041175019502295, "loss": 1.0269, "step": 779 }, { "epoch": 1.9948849104859336, "grad_norm": 0.38289682905322714, "learning_rate": 0.00012028837486427837, "loss": 1.0085, "step": 780 }, { "epoch": 1.9974424552429668, "grad_norm": 0.3409699287203162, "learning_rate": 0.00012016487102034482, "loss": 1.0151, "step": 781 }, { "epoch": 2.0, "grad_norm": 0.4841721621140613, "learning_rate": 0.00012004123905717898, "loss": 0.9888, "step": 782 }, { "epoch": 2.002557544757033, "grad_norm": 0.5947034995797379, "learning_rate": 0.00011991747936914614, "loss": 0.98, "step": 783 }, { "epoch": 2.0051150895140664, "grad_norm": 0.5314717777356649, "learning_rate": 0.00011979359235101906, "loss": 0.966, "step": 784 }, { "epoch": 2.0076726342710995, "grad_norm": 0.4148615363763489, "learning_rate": 0.00011966957839797664, "loss": 0.9695, "step": 785 }, { "epoch": 2.010230179028133, "grad_norm": 0.4001599305252567, "learning_rate": 0.00011954543790560267, "loss": 1.0493, "step": 786 }, { "epoch": 2.0127877237851663, "grad_norm": 0.43752065357850173, "learning_rate": 0.00011942117126988461, "loss": 0.9883, "step": 787 }, { "epoch": 2.0153452685421995, "grad_norm": 0.5092717368916159, "learning_rate": 0.00011929677888721227, "loss": 0.9984, "step": 788 }, { "epoch": 2.0179028132992327, "grad_norm": 0.5840375290444557, "learning_rate": 0.00011917226115437656, "loss": 0.9833, "step": 789 }, { "epoch": 2.020460358056266, "grad_norm": 0.573138093028074, "learning_rate": 0.00011904761846856831, "loss": 0.9724, "step": 790 }, { "epoch": 2.023017902813299, "grad_norm": 0.5890770850578259, "learning_rate": 0.00011892285122737683, "loss": 0.9699, "step": 791 }, { "epoch": 2.0255754475703327, "grad_norm": 0.5692021165096304, "learning_rate": 0.00011879795982878883, "loss": 0.9741, "step": 792 }, { "epoch": 2.028132992327366, "grad_norm": 0.6399550167383995, "learning_rate": 0.00011867294467118698, "loss": 0.9682, "step": 793 }, { "epoch": 2.030690537084399, "grad_norm": 0.7338640869363395, "learning_rate": 0.00011854780615334875, "loss": 0.9683, "step": 794 }, { "epoch": 2.0332480818414322, "grad_norm": 0.806906500405086, "learning_rate": 0.00011842254467444517, "loss": 0.9756, "step": 795 }, { "epoch": 2.0358056265984654, "grad_norm": 0.7925351913713344, "learning_rate": 0.0001182971606340394, "loss": 0.9853, "step": 796 }, { "epoch": 2.0383631713554986, "grad_norm": 0.6258347835444797, "learning_rate": 0.00011817165443208562, "loss": 1.0054, "step": 797 }, { "epoch": 2.040920716112532, "grad_norm": 0.4512585898690294, "learning_rate": 0.00011804602646892762, "loss": 0.9792, "step": 798 }, { "epoch": 2.0434782608695654, "grad_norm": 0.3681772077619349, "learning_rate": 0.00011792027714529767, "loss": 0.9788, "step": 799 }, { "epoch": 2.0460358056265986, "grad_norm": 0.4769785686846811, "learning_rate": 0.0001177944068623151, "loss": 1.023, "step": 800 }, { "epoch": 2.0485933503836318, "grad_norm": 0.5513670753501893, "learning_rate": 0.00011766841602148507, "loss": 0.9758, "step": 801 }, { "epoch": 2.051150895140665, "grad_norm": 0.5343242524485008, "learning_rate": 0.00011754230502469739, "loss": 0.9828, "step": 802 }, { "epoch": 2.053708439897698, "grad_norm": 0.3790786798266737, "learning_rate": 0.00011741607427422502, "loss": 0.9891, "step": 803 }, { "epoch": 2.0562659846547313, "grad_norm": 0.3356594047836669, "learning_rate": 0.000117289724172723, "loss": 1.0182, "step": 804 }, { "epoch": 2.0588235294117645, "grad_norm": 0.4979916614188739, "learning_rate": 0.00011716325512322707, "loss": 0.9653, "step": 805 }, { "epoch": 2.061381074168798, "grad_norm": 0.5917115439040083, "learning_rate": 0.00011703666752915235, "loss": 0.9779, "step": 806 }, { "epoch": 2.0639386189258313, "grad_norm": 0.7711282568070231, "learning_rate": 0.00011690996179429219, "loss": 1.0192, "step": 807 }, { "epoch": 2.0664961636828645, "grad_norm": 0.9738458712850159, "learning_rate": 0.00011678313832281664, "loss": 0.9929, "step": 808 }, { "epoch": 2.0690537084398977, "grad_norm": 1.0543246508556696, "learning_rate": 0.00011665619751927146, "loss": 0.9711, "step": 809 }, { "epoch": 2.071611253196931, "grad_norm": 0.7273546848221022, "learning_rate": 0.00011652913978857664, "loss": 0.9732, "step": 810 }, { "epoch": 2.074168797953964, "grad_norm": 0.5119256334998138, "learning_rate": 0.00011640196553602505, "loss": 0.9955, "step": 811 }, { "epoch": 2.0767263427109977, "grad_norm": 0.36268273560962566, "learning_rate": 0.00011627467516728138, "loss": 0.9706, "step": 812 }, { "epoch": 2.079283887468031, "grad_norm": 0.40355937427082544, "learning_rate": 0.00011614726908838063, "loss": 0.9712, "step": 813 }, { "epoch": 2.081841432225064, "grad_norm": 0.5018343946579583, "learning_rate": 0.00011601974770572692, "loss": 1.0314, "step": 814 }, { "epoch": 2.084398976982097, "grad_norm": 0.49570234160885446, "learning_rate": 0.0001158921114260922, "loss": 0.961, "step": 815 }, { "epoch": 2.0869565217391304, "grad_norm": 0.5836483164644858, "learning_rate": 0.00011576436065661484, "loss": 0.9732, "step": 816 }, { "epoch": 2.0895140664961636, "grad_norm": 0.562651886144191, "learning_rate": 0.00011563649580479848, "loss": 0.9827, "step": 817 }, { "epoch": 2.0920716112531967, "grad_norm": 0.3634053027085326, "learning_rate": 0.00011550851727851067, "loss": 0.9634, "step": 818 }, { "epoch": 2.0946291560102304, "grad_norm": 0.35421206748470696, "learning_rate": 0.00011538042548598154, "loss": 0.9674, "step": 819 }, { "epoch": 2.0971867007672635, "grad_norm": 0.34410099266933664, "learning_rate": 0.00011525222083580247, "loss": 0.9682, "step": 820 }, { "epoch": 2.0997442455242967, "grad_norm": 0.36019738429870557, "learning_rate": 0.00011512390373692495, "loss": 0.98, "step": 821 }, { "epoch": 2.10230179028133, "grad_norm": 0.4497160405180852, "learning_rate": 0.00011499547459865908, "loss": 0.9658, "step": 822 }, { "epoch": 2.104859335038363, "grad_norm": 0.48924052145081715, "learning_rate": 0.00011486693383067234, "loss": 0.9961, "step": 823 }, { "epoch": 2.1074168797953963, "grad_norm": 0.51728675513698, "learning_rate": 0.0001147382818429884, "loss": 0.9886, "step": 824 }, { "epoch": 2.10997442455243, "grad_norm": 0.48298534091718054, "learning_rate": 0.0001146095190459855, "loss": 0.99, "step": 825 }, { "epoch": 2.112531969309463, "grad_norm": 0.3873329201691133, "learning_rate": 0.00011448064585039555, "loss": 0.9855, "step": 826 }, { "epoch": 2.1150895140664963, "grad_norm": 0.36617676835976043, "learning_rate": 0.0001143516626673025, "loss": 0.9784, "step": 827 }, { "epoch": 2.1176470588235294, "grad_norm": 0.39303542839485295, "learning_rate": 0.00011422256990814115, "loss": 0.9884, "step": 828 }, { "epoch": 2.1202046035805626, "grad_norm": 0.5159106405133932, "learning_rate": 0.0001140933679846959, "loss": 0.9926, "step": 829 }, { "epoch": 2.122762148337596, "grad_norm": 0.7469560811887815, "learning_rate": 0.00011396405730909925, "loss": 1.0183, "step": 830 }, { "epoch": 2.125319693094629, "grad_norm": 0.7327464479712988, "learning_rate": 0.00011383463829383071, "loss": 1.0098, "step": 831 }, { "epoch": 2.1278772378516626, "grad_norm": 0.5977082749289835, "learning_rate": 0.00011370511135171532, "loss": 1.0071, "step": 832 }, { "epoch": 2.130434782608696, "grad_norm": 0.4052295767189102, "learning_rate": 0.00011357547689592237, "loss": 1.0049, "step": 833 }, { "epoch": 2.132992327365729, "grad_norm": 0.5292207555015371, "learning_rate": 0.00011344573533996417, "loss": 0.9656, "step": 834 }, { "epoch": 2.135549872122762, "grad_norm": 0.4549224765225602, "learning_rate": 0.0001133158870976946, "loss": 0.9968, "step": 835 }, { "epoch": 2.1381074168797953, "grad_norm": 0.4460508304219039, "learning_rate": 0.00011318593258330785, "loss": 1.0134, "step": 836 }, { "epoch": 2.1406649616368285, "grad_norm": 0.46592246024671363, "learning_rate": 0.00011305587221133718, "loss": 0.9522, "step": 837 }, { "epoch": 2.1432225063938617, "grad_norm": 0.4489945484428353, "learning_rate": 0.00011292570639665342, "loss": 1.0104, "step": 838 }, { "epoch": 2.1457800511508953, "grad_norm": 0.46784938019320965, "learning_rate": 0.00011279543555446379, "loss": 0.988, "step": 839 }, { "epoch": 2.1483375959079285, "grad_norm": 0.4200222134898951, "learning_rate": 0.00011266506010031052, "loss": 1.0119, "step": 840 }, { "epoch": 2.1508951406649617, "grad_norm": 0.3655050664603677, "learning_rate": 0.00011253458045006955, "loss": 0.9895, "step": 841 }, { "epoch": 2.153452685421995, "grad_norm": 0.3022642865356664, "learning_rate": 0.00011240399701994919, "loss": 1.001, "step": 842 }, { "epoch": 2.156010230179028, "grad_norm": 0.3188747440198214, "learning_rate": 0.00011227331022648877, "loss": 0.9773, "step": 843 }, { "epoch": 2.1585677749360612, "grad_norm": 0.41190200456297044, "learning_rate": 0.00011214252048655733, "loss": 1.024, "step": 844 }, { "epoch": 2.1611253196930944, "grad_norm": 0.33803198230453474, "learning_rate": 0.00011201162821735228, "loss": 0.9843, "step": 845 }, { "epoch": 2.163682864450128, "grad_norm": 0.36583158073668925, "learning_rate": 0.00011188063383639817, "loss": 0.9809, "step": 846 }, { "epoch": 2.166240409207161, "grad_norm": 0.39675634848639996, "learning_rate": 0.00011174953776154516, "loss": 0.942, "step": 847 }, { "epoch": 2.1687979539641944, "grad_norm": 0.4164372273567332, "learning_rate": 0.00011161834041096782, "loss": 1.0337, "step": 848 }, { "epoch": 2.1713554987212276, "grad_norm": 0.42306948681428896, "learning_rate": 0.00011148704220316387, "loss": 0.9913, "step": 849 }, { "epoch": 2.1739130434782608, "grad_norm": 0.374454297267049, "learning_rate": 0.0001113556435569526, "loss": 0.9928, "step": 850 }, { "epoch": 2.176470588235294, "grad_norm": 0.31767286286037444, "learning_rate": 0.00011122414489147376, "loss": 0.9972, "step": 851 }, { "epoch": 2.1790281329923276, "grad_norm": 0.36673595005863613, "learning_rate": 0.00011109254662618616, "loss": 1.0105, "step": 852 }, { "epoch": 2.1815856777493607, "grad_norm": 0.5025085408193712, "learning_rate": 0.00011096084918086626, "loss": 0.9508, "step": 853 }, { "epoch": 2.184143222506394, "grad_norm": 0.5453118752197188, "learning_rate": 0.00011082905297560697, "loss": 0.9354, "step": 854 }, { "epoch": 2.186700767263427, "grad_norm": 0.535508310533172, "learning_rate": 0.00011069715843081613, "loss": 0.986, "step": 855 }, { "epoch": 2.1892583120204603, "grad_norm": 0.5550105153386212, "learning_rate": 0.00011056516596721534, "loss": 1.0047, "step": 856 }, { "epoch": 2.1918158567774935, "grad_norm": 0.5522958050937595, "learning_rate": 0.00011043307600583854, "loss": 1.0204, "step": 857 }, { "epoch": 2.1943734015345266, "grad_norm": 0.514732209947304, "learning_rate": 0.0001103008889680306, "loss": 1.0137, "step": 858 }, { "epoch": 2.1969309462915603, "grad_norm": 0.5281211410564769, "learning_rate": 0.00011016860527544616, "loss": 1.0085, "step": 859 }, { "epoch": 2.1994884910485935, "grad_norm": 0.46959816689384604, "learning_rate": 0.00011003622535004806, "loss": 1.0058, "step": 860 }, { "epoch": 2.2020460358056266, "grad_norm": 0.3407338275520536, "learning_rate": 0.0001099037496141062, "loss": 0.9986, "step": 861 }, { "epoch": 2.20460358056266, "grad_norm": 0.47884582066611536, "learning_rate": 0.00010977117849019604, "loss": 0.9707, "step": 862 }, { "epoch": 2.207161125319693, "grad_norm": 0.6169099163617163, "learning_rate": 0.00010963851240119731, "loss": 0.9957, "step": 863 }, { "epoch": 2.209718670076726, "grad_norm": 0.5842777084702644, "learning_rate": 0.00010950575177029271, "loss": 0.9971, "step": 864 }, { "epoch": 2.21227621483376, "grad_norm": 0.5415512252484223, "learning_rate": 0.00010937289702096648, "loss": 0.955, "step": 865 }, { "epoch": 2.214833759590793, "grad_norm": 0.5584987591506012, "learning_rate": 0.00010923994857700308, "loss": 0.9858, "step": 866 }, { "epoch": 2.217391304347826, "grad_norm": 0.5438681169787357, "learning_rate": 0.00010910690686248587, "loss": 1.0272, "step": 867 }, { "epoch": 2.2199488491048593, "grad_norm": 0.45923876211266634, "learning_rate": 0.00010897377230179568, "loss": 0.9689, "step": 868 }, { "epoch": 2.2225063938618925, "grad_norm": 0.344989298275585, "learning_rate": 0.00010884054531960956, "loss": 1.005, "step": 869 }, { "epoch": 2.2250639386189257, "grad_norm": 0.3203832886307522, "learning_rate": 0.00010870722634089927, "loss": 0.9904, "step": 870 }, { "epoch": 2.227621483375959, "grad_norm": 0.4050058894119621, "learning_rate": 0.0001085738157909302, "loss": 0.9716, "step": 871 }, { "epoch": 2.2301790281329925, "grad_norm": 0.5042105083367587, "learning_rate": 0.00010844031409525962, "loss": 0.9921, "step": 872 }, { "epoch": 2.2327365728900257, "grad_norm": 0.5771976233792036, "learning_rate": 0.00010830672167973572, "loss": 1.0081, "step": 873 }, { "epoch": 2.235294117647059, "grad_norm": 0.6444239077326948, "learning_rate": 0.00010817303897049597, "loss": 0.9961, "step": 874 }, { "epoch": 2.237851662404092, "grad_norm": 0.6303091061510789, "learning_rate": 0.0001080392663939659, "loss": 0.9648, "step": 875 }, { "epoch": 2.2404092071611252, "grad_norm": 0.5383211537711221, "learning_rate": 0.00010790540437685771, "loss": 0.9835, "step": 876 }, { "epoch": 2.2429667519181584, "grad_norm": 0.4021404516007495, "learning_rate": 0.00010777145334616884, "loss": 0.9732, "step": 877 }, { "epoch": 2.2455242966751916, "grad_norm": 0.31439318271272565, "learning_rate": 0.00010763741372918076, "loss": 0.9799, "step": 878 }, { "epoch": 2.2480818414322252, "grad_norm": 0.4404091457741591, "learning_rate": 0.00010750328595345744, "loss": 0.9798, "step": 879 }, { "epoch": 2.2506393861892584, "grad_norm": 0.5676899676174939, "learning_rate": 0.00010736907044684409, "loss": 0.956, "step": 880 }, { "epoch": 2.2531969309462916, "grad_norm": 0.6251515987816799, "learning_rate": 0.00010723476763746578, "loss": 0.9766, "step": 881 }, { "epoch": 2.2557544757033248, "grad_norm": 0.6188152066667294, "learning_rate": 0.00010710037795372604, "loss": 0.9436, "step": 882 }, { "epoch": 2.258312020460358, "grad_norm": 0.561619175816319, "learning_rate": 0.00010696590182430552, "loss": 0.9829, "step": 883 }, { "epoch": 2.260869565217391, "grad_norm": 0.42915411587906266, "learning_rate": 0.00010683133967816062, "loss": 0.9776, "step": 884 }, { "epoch": 2.2634271099744243, "grad_norm": 0.3524127037006637, "learning_rate": 0.00010669669194452213, "loss": 0.9966, "step": 885 }, { "epoch": 2.265984654731458, "grad_norm": 0.3537805903644639, "learning_rate": 0.00010656195905289382, "loss": 1.0042, "step": 886 }, { "epoch": 2.268542199488491, "grad_norm": 0.38907067845530163, "learning_rate": 0.00010642714143305115, "loss": 0.9591, "step": 887 }, { "epoch": 2.2710997442455243, "grad_norm": 0.4388187336605131, "learning_rate": 0.00010629223951503975, "loss": 0.9657, "step": 888 }, { "epoch": 2.2736572890025575, "grad_norm": 0.5259226887120563, "learning_rate": 0.00010615725372917429, "loss": 0.9902, "step": 889 }, { "epoch": 2.2762148337595907, "grad_norm": 0.5228861897572435, "learning_rate": 0.00010602218450603687, "loss": 1.0222, "step": 890 }, { "epoch": 2.2787723785166243, "grad_norm": 0.5036534202887699, "learning_rate": 0.00010588703227647573, "loss": 1.0003, "step": 891 }, { "epoch": 2.2813299232736575, "grad_norm": 0.3581923819862395, "learning_rate": 0.00010575179747160391, "loss": 0.9834, "step": 892 }, { "epoch": 2.2838874680306906, "grad_norm": 0.3410033765731837, "learning_rate": 0.00010561648052279792, "loss": 0.9893, "step": 893 }, { "epoch": 2.286445012787724, "grad_norm": 0.48497621648344247, "learning_rate": 0.00010548108186169619, "loss": 1.0097, "step": 894 }, { "epoch": 2.289002557544757, "grad_norm": 0.4811056602507645, "learning_rate": 0.00010534560192019784, "loss": 0.9987, "step": 895 }, { "epoch": 2.29156010230179, "grad_norm": 0.5430558900686754, "learning_rate": 0.00010521004113046126, "loss": 0.9863, "step": 896 }, { "epoch": 2.2941176470588234, "grad_norm": 0.5520225619306299, "learning_rate": 0.00010507439992490274, "loss": 0.9854, "step": 897 }, { "epoch": 2.296675191815857, "grad_norm": 0.5368891057768155, "learning_rate": 0.00010493867873619509, "loss": 0.962, "step": 898 }, { "epoch": 2.29923273657289, "grad_norm": 0.45785580350946786, "learning_rate": 0.00010480287799726624, "loss": 0.9951, "step": 899 }, { "epoch": 2.3017902813299234, "grad_norm": 0.3134044741551554, "learning_rate": 0.00010466699814129784, "loss": 0.9808, "step": 900 }, { "epoch": 2.3043478260869565, "grad_norm": 0.3718160522616458, "learning_rate": 0.00010453103960172399, "loss": 0.9722, "step": 901 }, { "epoch": 2.3069053708439897, "grad_norm": 0.42777708592376057, "learning_rate": 0.0001043950028122297, "loss": 0.9778, "step": 902 }, { "epoch": 2.309462915601023, "grad_norm": 0.5114598924445181, "learning_rate": 0.00010425888820674964, "loss": 0.9999, "step": 903 }, { "epoch": 2.312020460358056, "grad_norm": 0.42665599355653705, "learning_rate": 0.00010412269621946664, "loss": 0.9277, "step": 904 }, { "epoch": 2.3145780051150897, "grad_norm": 0.32425667546420855, "learning_rate": 0.0001039864272848104, "loss": 0.9623, "step": 905 }, { "epoch": 2.317135549872123, "grad_norm": 0.278767997134977, "learning_rate": 0.00010385008183745614, "loss": 0.9709, "step": 906 }, { "epoch": 2.319693094629156, "grad_norm": 0.2973268406415685, "learning_rate": 0.00010371366031232298, "loss": 0.9752, "step": 907 }, { "epoch": 2.3222506393861893, "grad_norm": 0.32805655210523665, "learning_rate": 0.00010357716314457286, "loss": 1.0151, "step": 908 }, { "epoch": 2.3248081841432224, "grad_norm": 0.3136457006720511, "learning_rate": 0.00010344059076960893, "loss": 0.9525, "step": 909 }, { "epoch": 2.3273657289002556, "grad_norm": 0.36706796314794027, "learning_rate": 0.00010330394362307426, "loss": 1.0263, "step": 910 }, { "epoch": 2.329923273657289, "grad_norm": 0.3628334304816528, "learning_rate": 0.00010316722214085048, "loss": 1.0032, "step": 911 }, { "epoch": 2.3324808184143224, "grad_norm": 0.4614008122870428, "learning_rate": 0.00010303042675905623, "loss": 0.9655, "step": 912 }, { "epoch": 2.3350383631713556, "grad_norm": 0.5091780040539386, "learning_rate": 0.00010289355791404597, "loss": 0.9963, "step": 913 }, { "epoch": 2.337595907928389, "grad_norm": 0.4886959522852251, "learning_rate": 0.00010275661604240844, "loss": 0.9959, "step": 914 }, { "epoch": 2.340153452685422, "grad_norm": 0.3477812096500851, "learning_rate": 0.00010261960158096538, "loss": 0.9923, "step": 915 }, { "epoch": 2.342710997442455, "grad_norm": 0.3003617995320152, "learning_rate": 0.00010248251496677002, "loss": 1.0133, "step": 916 }, { "epoch": 2.3452685421994883, "grad_norm": 0.3907656568645366, "learning_rate": 0.00010234535663710578, "loss": 0.9559, "step": 917 }, { "epoch": 2.3478260869565215, "grad_norm": 0.44450800877616453, "learning_rate": 0.00010220812702948483, "loss": 0.9839, "step": 918 }, { "epoch": 2.350383631713555, "grad_norm": 0.41444476133681435, "learning_rate": 0.00010207082658164668, "loss": 0.9695, "step": 919 }, { "epoch": 2.3529411764705883, "grad_norm": 0.3486015741078046, "learning_rate": 0.00010193345573155686, "loss": 0.9699, "step": 920 }, { "epoch": 2.3554987212276215, "grad_norm": 0.305313779906682, "learning_rate": 0.00010179601491740546, "loss": 0.9737, "step": 921 }, { "epoch": 2.3580562659846547, "grad_norm": 0.3210944860271877, "learning_rate": 0.00010165850457760569, "loss": 0.9734, "step": 922 }, { "epoch": 2.360613810741688, "grad_norm": 0.33354001864174027, "learning_rate": 0.00010152092515079263, "loss": 0.9758, "step": 923 }, { "epoch": 2.363171355498721, "grad_norm": 0.3630435985390137, "learning_rate": 0.00010138327707582161, "loss": 0.9843, "step": 924 }, { "epoch": 2.3657289002557547, "grad_norm": 0.3068154551503405, "learning_rate": 0.00010124556079176705, "loss": 0.9718, "step": 925 }, { "epoch": 2.368286445012788, "grad_norm": 0.3145375023118287, "learning_rate": 0.0001011077767379209, "loss": 0.9485, "step": 926 }, { "epoch": 2.370843989769821, "grad_norm": 0.4562062846091247, "learning_rate": 0.00010096992535379125, "loss": 1.0041, "step": 927 }, { "epoch": 2.373401534526854, "grad_norm": 0.4613854636034836, "learning_rate": 0.00010083200707910109, "loss": 1.0095, "step": 928 }, { "epoch": 2.3759590792838874, "grad_norm": 0.5020460478647006, "learning_rate": 0.00010069402235378657, "loss": 0.9793, "step": 929 }, { "epoch": 2.3785166240409206, "grad_norm": 0.47032502181209285, "learning_rate": 0.000100555971617996, "loss": 1.003, "step": 930 }, { "epoch": 2.381074168797954, "grad_norm": 0.37153265133623853, "learning_rate": 0.00010041785531208813, "loss": 0.9707, "step": 931 }, { "epoch": 2.3836317135549874, "grad_norm": 0.2954908430723523, "learning_rate": 0.00010027967387663098, "loss": 0.9943, "step": 932 }, { "epoch": 2.3861892583120206, "grad_norm": 0.2860326087524264, "learning_rate": 0.00010014142775240018, "loss": 0.978, "step": 933 }, { "epoch": 2.3887468030690537, "grad_norm": 0.36670864980970264, "learning_rate": 0.00010000311738037786, "loss": 0.9654, "step": 934 }, { "epoch": 2.391304347826087, "grad_norm": 0.39639852002586273, "learning_rate": 9.986474320175097e-05, "loss": 0.964, "step": 935 }, { "epoch": 2.39386189258312, "grad_norm": 0.3585981520256939, "learning_rate": 9.972630565791003e-05, "loss": 0.9825, "step": 936 }, { "epoch": 2.3964194373401533, "grad_norm": 0.3189834091257556, "learning_rate": 9.958780519044772e-05, "loss": 0.9851, "step": 937 }, { "epoch": 2.398976982097187, "grad_norm": 0.3049358905004256, "learning_rate": 9.944924224115737e-05, "loss": 0.9939, "step": 938 }, { "epoch": 2.40153452685422, "grad_norm": 0.2622458924767327, "learning_rate": 9.931061725203167e-05, "loss": 0.9781, "step": 939 }, { "epoch": 2.4040920716112533, "grad_norm": 0.2924257759631161, "learning_rate": 9.917193066526122e-05, "loss": 0.9868, "step": 940 }, { "epoch": 2.4066496163682864, "grad_norm": 0.3604978006726876, "learning_rate": 9.903318292323301e-05, "loss": 0.9754, "step": 941 }, { "epoch": 2.4092071611253196, "grad_norm": 0.29745498369836404, "learning_rate": 9.889437446852923e-05, "loss": 0.9859, "step": 942 }, { "epoch": 2.411764705882353, "grad_norm": 0.37371862497237623, "learning_rate": 9.875550574392565e-05, "loss": 0.9896, "step": 943 }, { "epoch": 2.414322250639386, "grad_norm": 0.38638295584959187, "learning_rate": 9.86165771923903e-05, "loss": 0.9881, "step": 944 }, { "epoch": 2.4168797953964196, "grad_norm": 0.4041126989806797, "learning_rate": 9.84775892570821e-05, "loss": 0.9428, "step": 945 }, { "epoch": 2.419437340153453, "grad_norm": 0.395096912214402, "learning_rate": 9.833854238134931e-05, "loss": 0.9622, "step": 946 }, { "epoch": 2.421994884910486, "grad_norm": 0.3464290247147215, "learning_rate": 9.819943700872828e-05, "loss": 1.0125, "step": 947 }, { "epoch": 2.424552429667519, "grad_norm": 0.28843985739584715, "learning_rate": 9.806027358294195e-05, "loss": 0.9712, "step": 948 }, { "epoch": 2.4271099744245523, "grad_norm": 0.38051542261971155, "learning_rate": 9.792105254789834e-05, "loss": 0.9851, "step": 949 }, { "epoch": 2.4296675191815855, "grad_norm": 0.4466310758086544, "learning_rate": 9.778177434768935e-05, "loss": 0.9683, "step": 950 }, { "epoch": 2.4322250639386187, "grad_norm": 0.4692147641165216, "learning_rate": 9.764243942658919e-05, "loss": 0.9841, "step": 951 }, { "epoch": 2.4347826086956523, "grad_norm": 0.35373867138680226, "learning_rate": 9.750304822905297e-05, "loss": 0.9492, "step": 952 }, { "epoch": 2.4373401534526855, "grad_norm": 0.28385300113252654, "learning_rate": 9.736360119971537e-05, "loss": 0.9996, "step": 953 }, { "epoch": 2.4398976982097187, "grad_norm": 0.2937003946020655, "learning_rate": 9.722409878338908e-05, "loss": 1.0015, "step": 954 }, { "epoch": 2.442455242966752, "grad_norm": 0.3969860787197417, "learning_rate": 9.708454142506354e-05, "loss": 0.9774, "step": 955 }, { "epoch": 2.445012787723785, "grad_norm": 0.5498839614052679, "learning_rate": 9.694492956990345e-05, "loss": 0.9847, "step": 956 }, { "epoch": 2.4475703324808182, "grad_norm": 0.5513989094448135, "learning_rate": 9.680526366324726e-05, "loss": 0.9565, "step": 957 }, { "epoch": 2.4501278772378514, "grad_norm": 0.506905247181652, "learning_rate": 9.666554415060596e-05, "loss": 0.9517, "step": 958 }, { "epoch": 2.452685421994885, "grad_norm": 0.44474310752723095, "learning_rate": 9.652577147766142e-05, "loss": 0.9743, "step": 959 }, { "epoch": 2.455242966751918, "grad_norm": 0.37097475676427244, "learning_rate": 9.638594609026515e-05, "loss": 0.9506, "step": 960 }, { "epoch": 2.4578005115089514, "grad_norm": 0.2734924283931777, "learning_rate": 9.624606843443675e-05, "loss": 1.0158, "step": 961 }, { "epoch": 2.4603580562659846, "grad_norm": 0.31804819233085263, "learning_rate": 9.610613895636263e-05, "loss": 0.992, "step": 962 }, { "epoch": 2.4629156010230178, "grad_norm": 0.41664714320663915, "learning_rate": 9.596615810239445e-05, "loss": 0.999, "step": 963 }, { "epoch": 2.4654731457800514, "grad_norm": 0.5523065515247985, "learning_rate": 9.582612631904779e-05, "loss": 1.0055, "step": 964 }, { "epoch": 2.4680306905370846, "grad_norm": 0.4671305490762141, "learning_rate": 9.568604405300062e-05, "loss": 0.9579, "step": 965 }, { "epoch": 2.4705882352941178, "grad_norm": 0.3279722497396409, "learning_rate": 9.554591175109194e-05, "loss": 0.9731, "step": 966 }, { "epoch": 2.473145780051151, "grad_norm": 0.25846610901040445, "learning_rate": 9.54057298603205e-05, "loss": 0.9817, "step": 967 }, { "epoch": 2.475703324808184, "grad_norm": 0.3730225408971352, "learning_rate": 9.526549882784305e-05, "loss": 0.9874, "step": 968 }, { "epoch": 2.4782608695652173, "grad_norm": 0.7271461728885226, "learning_rate": 9.512521910097316e-05, "loss": 1.0348, "step": 969 }, { "epoch": 2.4808184143222505, "grad_norm": 0.32875046425746846, "learning_rate": 9.49848911271798e-05, "loss": 0.9565, "step": 970 }, { "epoch": 2.483375959079284, "grad_norm": 0.3205410594330121, "learning_rate": 9.484451535408572e-05, "loss": 0.9784, "step": 971 }, { "epoch": 2.4859335038363173, "grad_norm": 0.26205949445440796, "learning_rate": 9.470409222946623e-05, "loss": 0.9983, "step": 972 }, { "epoch": 2.4884910485933505, "grad_norm": 0.3237027571460551, "learning_rate": 9.456362220124766e-05, "loss": 0.98, "step": 973 }, { "epoch": 2.4910485933503836, "grad_norm": 0.35272232039199597, "learning_rate": 9.442310571750588e-05, "loss": 0.9779, "step": 974 }, { "epoch": 2.493606138107417, "grad_norm": 0.305939353717968, "learning_rate": 9.42825432264651e-05, "loss": 0.9581, "step": 975 }, { "epoch": 2.49616368286445, "grad_norm": 0.2932577303248136, "learning_rate": 9.414193517649614e-05, "loss": 0.9855, "step": 976 }, { "epoch": 2.498721227621483, "grad_norm": 0.30059710492898495, "learning_rate": 9.400128201611521e-05, "loss": 0.9754, "step": 977 }, { "epoch": 2.501278772378517, "grad_norm": 0.2973031341519278, "learning_rate": 9.386058419398243e-05, "loss": 0.9909, "step": 978 }, { "epoch": 2.50383631713555, "grad_norm": 0.3722883437832787, "learning_rate": 9.371984215890032e-05, "loss": 0.9946, "step": 979 }, { "epoch": 2.506393861892583, "grad_norm": 0.3473263838445932, "learning_rate": 9.357905635981251e-05, "loss": 0.9543, "step": 980 }, { "epoch": 2.5089514066496164, "grad_norm": 0.2867570028047222, "learning_rate": 9.34382272458022e-05, "loss": 0.9638, "step": 981 }, { "epoch": 2.5115089514066495, "grad_norm": 0.30564756429493334, "learning_rate": 9.329735526609071e-05, "loss": 0.9464, "step": 982 }, { "epoch": 2.5140664961636827, "grad_norm": 0.277493802953859, "learning_rate": 9.315644087003614e-05, "loss": 0.9565, "step": 983 }, { "epoch": 2.516624040920716, "grad_norm": 0.32107200459340096, "learning_rate": 9.301548450713193e-05, "loss": 0.987, "step": 984 }, { "epoch": 2.5191815856777495, "grad_norm": 0.34282165398687586, "learning_rate": 9.28744866270053e-05, "loss": 0.985, "step": 985 }, { "epoch": 2.5217391304347827, "grad_norm": 0.32220988156237623, "learning_rate": 9.273344767941595e-05, "loss": 0.958, "step": 986 }, { "epoch": 2.524296675191816, "grad_norm": 0.2659763342921004, "learning_rate": 9.259236811425458e-05, "loss": 0.9693, "step": 987 }, { "epoch": 2.526854219948849, "grad_norm": 0.31738841820079255, "learning_rate": 9.245124838154145e-05, "loss": 0.9938, "step": 988 }, { "epoch": 2.5294117647058822, "grad_norm": 0.32830918791297703, "learning_rate": 9.231008893142496e-05, "loss": 0.9934, "step": 989 }, { "epoch": 2.531969309462916, "grad_norm": 0.3402708856013208, "learning_rate": 9.216889021418015e-05, "loss": 1.0013, "step": 990 }, { "epoch": 2.5345268542199486, "grad_norm": 0.4044102426145664, "learning_rate": 9.202765268020734e-05, "loss": 0.9831, "step": 991 }, { "epoch": 2.5370843989769822, "grad_norm": 0.42862262278596586, "learning_rate": 9.188637678003078e-05, "loss": 0.9997, "step": 992 }, { "epoch": 2.5396419437340154, "grad_norm": 0.4484266743548927, "learning_rate": 9.17450629642969e-05, "loss": 0.9828, "step": 993 }, { "epoch": 2.5421994884910486, "grad_norm": 0.3265912580211292, "learning_rate": 9.160371168377322e-05, "loss": 0.9643, "step": 994 }, { "epoch": 2.544757033248082, "grad_norm": 0.32534751123207517, "learning_rate": 9.146232338934671e-05, "loss": 0.9582, "step": 995 }, { "epoch": 2.547314578005115, "grad_norm": 0.38239024918470127, "learning_rate": 9.132089853202243e-05, "loss": 0.9744, "step": 996 }, { "epoch": 2.5498721227621486, "grad_norm": 0.46563347602108834, "learning_rate": 9.117943756292208e-05, "loss": 0.9792, "step": 997 }, { "epoch": 2.5524296675191813, "grad_norm": 0.39461054417861174, "learning_rate": 9.103794093328248e-05, "loss": 0.9755, "step": 998 }, { "epoch": 2.554987212276215, "grad_norm": 0.3125908044097884, "learning_rate": 9.089640909445431e-05, "loss": 0.9716, "step": 999 }, { "epoch": 2.557544757033248, "grad_norm": 0.2684368877044592, "learning_rate": 9.075484249790048e-05, "loss": 0.9747, "step": 1000 }, { "epoch": 2.5601023017902813, "grad_norm": 0.28891578856074146, "learning_rate": 9.061324159519476e-05, "loss": 0.9762, "step": 1001 }, { "epoch": 2.5626598465473145, "grad_norm": 0.3034677475712927, "learning_rate": 9.047160683802046e-05, "loss": 0.9674, "step": 1002 }, { "epoch": 2.5652173913043477, "grad_norm": 0.31908253316340884, "learning_rate": 9.032993867816876e-05, "loss": 0.9942, "step": 1003 }, { "epoch": 2.5677749360613813, "grad_norm": 0.2544491678916064, "learning_rate": 9.018823756753746e-05, "loss": 1.0001, "step": 1004 }, { "epoch": 2.5703324808184145, "grad_norm": 0.2995352776229395, "learning_rate": 9.00465039581294e-05, "loss": 0.9929, "step": 1005 }, { "epoch": 2.5728900255754477, "grad_norm": 0.35913882534331126, "learning_rate": 8.990473830205118e-05, "loss": 0.9318, "step": 1006 }, { "epoch": 2.575447570332481, "grad_norm": 0.37010668314829087, "learning_rate": 8.976294105151154e-05, "loss": 1.0079, "step": 1007 }, { "epoch": 2.578005115089514, "grad_norm": 0.2570784147501355, "learning_rate": 8.962111265882006e-05, "loss": 0.9952, "step": 1008 }, { "epoch": 2.580562659846547, "grad_norm": 0.3149539278736431, "learning_rate": 8.947925357638561e-05, "loss": 0.9941, "step": 1009 }, { "epoch": 2.5831202046035804, "grad_norm": 0.2855340149405739, "learning_rate": 8.933736425671495e-05, "loss": 0.9816, "step": 1010 }, { "epoch": 2.585677749360614, "grad_norm": 0.25345884892793763, "learning_rate": 8.91954451524114e-05, "loss": 0.9818, "step": 1011 }, { "epoch": 2.588235294117647, "grad_norm": 0.29694516426804485, "learning_rate": 8.905349671617313e-05, "loss": 0.9876, "step": 1012 }, { "epoch": 2.5907928388746804, "grad_norm": 0.3052840810260173, "learning_rate": 8.891151940079198e-05, "loss": 0.9702, "step": 1013 }, { "epoch": 2.5933503836317136, "grad_norm": 0.2661838830871243, "learning_rate": 8.87695136591519e-05, "loss": 0.9877, "step": 1014 }, { "epoch": 2.5959079283887467, "grad_norm": 0.2986390559549456, "learning_rate": 8.862747994422744e-05, "loss": 0.9707, "step": 1015 }, { "epoch": 2.59846547314578, "grad_norm": 0.3613476612681819, "learning_rate": 8.848541870908248e-05, "loss": 0.9703, "step": 1016 }, { "epoch": 2.601023017902813, "grad_norm": 0.33024018130732985, "learning_rate": 8.834333040686867e-05, "loss": 0.979, "step": 1017 }, { "epoch": 2.6035805626598467, "grad_norm": 0.31187166502347763, "learning_rate": 8.820121549082389e-05, "loss": 0.9829, "step": 1018 }, { "epoch": 2.60613810741688, "grad_norm": 0.3469288630004611, "learning_rate": 8.805907441427107e-05, "loss": 0.9558, "step": 1019 }, { "epoch": 2.608695652173913, "grad_norm": 0.3134454892157028, "learning_rate": 8.791690763061646e-05, "loss": 0.9644, "step": 1020 }, { "epoch": 2.6112531969309463, "grad_norm": 0.30922058220600745, "learning_rate": 8.777471559334835e-05, "loss": 0.9769, "step": 1021 }, { "epoch": 2.6138107416879794, "grad_norm": 0.3164613704707754, "learning_rate": 8.763249875603568e-05, "loss": 0.9699, "step": 1022 }, { "epoch": 2.6163682864450126, "grad_norm": 0.3937696035168064, "learning_rate": 8.74902575723263e-05, "loss": 0.9913, "step": 1023 }, { "epoch": 2.618925831202046, "grad_norm": 0.3269757525342128, "learning_rate": 8.734799249594593e-05, "loss": 0.9714, "step": 1024 }, { "epoch": 2.6214833759590794, "grad_norm": 0.3137372841061025, "learning_rate": 8.720570398069639e-05, "loss": 0.9667, "step": 1025 }, { "epoch": 2.6240409207161126, "grad_norm": 0.296905098424126, "learning_rate": 8.706339248045425e-05, "loss": 0.9748, "step": 1026 }, { "epoch": 2.626598465473146, "grad_norm": 0.3341447796223413, "learning_rate": 8.692105844916946e-05, "loss": 0.9813, "step": 1027 }, { "epoch": 2.629156010230179, "grad_norm": 0.3756191138022281, "learning_rate": 8.677870234086383e-05, "loss": 0.9908, "step": 1028 }, { "epoch": 2.631713554987212, "grad_norm": 0.3559465468948902, "learning_rate": 8.663632460962956e-05, "loss": 0.9936, "step": 1029 }, { "epoch": 2.634271099744246, "grad_norm": 0.300711572823478, "learning_rate": 8.649392570962781e-05, "loss": 0.9795, "step": 1030 }, { "epoch": 2.6368286445012785, "grad_norm": 0.3320572865051935, "learning_rate": 8.635150609508733e-05, "loss": 0.984, "step": 1031 }, { "epoch": 2.639386189258312, "grad_norm": 0.3635828441982571, "learning_rate": 8.620906622030292e-05, "loss": 0.9536, "step": 1032 }, { "epoch": 2.6419437340153453, "grad_norm": 0.3278411915419061, "learning_rate": 8.6066606539634e-05, "loss": 1.0088, "step": 1033 }, { "epoch": 2.6445012787723785, "grad_norm": 0.32767767702958833, "learning_rate": 8.592412750750312e-05, "loss": 0.9876, "step": 1034 }, { "epoch": 2.6470588235294117, "grad_norm": 0.35097964529502185, "learning_rate": 8.578162957839462e-05, "loss": 0.9915, "step": 1035 }, { "epoch": 2.649616368286445, "grad_norm": 0.31991735732581283, "learning_rate": 8.563911320685312e-05, "loss": 0.9638, "step": 1036 }, { "epoch": 2.6521739130434785, "grad_norm": 0.23787926653601094, "learning_rate": 8.549657884748205e-05, "loss": 0.9713, "step": 1037 }, { "epoch": 2.6547314578005117, "grad_norm": 0.32244485030641373, "learning_rate": 8.535402695494221e-05, "loss": 0.9772, "step": 1038 }, { "epoch": 2.657289002557545, "grad_norm": 0.312950136510117, "learning_rate": 8.521145798395035e-05, "loss": 0.9841, "step": 1039 }, { "epoch": 2.659846547314578, "grad_norm": 0.26212781885375047, "learning_rate": 8.506887238927764e-05, "loss": 0.9955, "step": 1040 }, { "epoch": 2.662404092071611, "grad_norm": 0.34105099182259796, "learning_rate": 8.492627062574837e-05, "loss": 0.9729, "step": 1041 }, { "epoch": 2.6649616368286444, "grad_norm": 0.297943326170416, "learning_rate": 8.478365314823831e-05, "loss": 1.0041, "step": 1042 }, { "epoch": 2.6675191815856776, "grad_norm": 0.23653735859455993, "learning_rate": 8.464102041167343e-05, "loss": 0.9385, "step": 1043 }, { "epoch": 2.670076726342711, "grad_norm": 0.24103662980964566, "learning_rate": 8.449837287102837e-05, "loss": 0.9798, "step": 1044 }, { "epoch": 2.6726342710997444, "grad_norm": 0.3266522540557997, "learning_rate": 8.43557109813249e-05, "loss": 0.9664, "step": 1045 }, { "epoch": 2.6751918158567776, "grad_norm": 0.34157505937073707, "learning_rate": 8.421303519763067e-05, "loss": 0.9512, "step": 1046 }, { "epoch": 2.6777493606138107, "grad_norm": 0.32745487240393034, "learning_rate": 8.407034597505762e-05, "loss": 0.9847, "step": 1047 }, { "epoch": 2.680306905370844, "grad_norm": 0.30390244215100753, "learning_rate": 8.392764376876049e-05, "loss": 0.9847, "step": 1048 }, { "epoch": 2.682864450127877, "grad_norm": 0.28021611753279574, "learning_rate": 8.378492903393555e-05, "loss": 0.9592, "step": 1049 }, { "epoch": 2.6854219948849103, "grad_norm": 0.3320556275827844, "learning_rate": 8.364220222581896e-05, "loss": 0.9846, "step": 1050 }, { "epoch": 2.687979539641944, "grad_norm": 0.3136101711766941, "learning_rate": 8.34994637996854e-05, "loss": 0.9811, "step": 1051 }, { "epoch": 2.690537084398977, "grad_norm": 0.2618192450012102, "learning_rate": 8.335671421084661e-05, "loss": 0.9744, "step": 1052 }, { "epoch": 2.6930946291560103, "grad_norm": 0.3220025314640929, "learning_rate": 8.321395391464995e-05, "loss": 0.9868, "step": 1053 }, { "epoch": 2.6956521739130435, "grad_norm": 0.3598315892247714, "learning_rate": 8.307118336647694e-05, "loss": 0.951, "step": 1054 }, { "epoch": 2.6982097186700766, "grad_norm": 0.4106007096012368, "learning_rate": 8.292840302174178e-05, "loss": 0.9643, "step": 1055 }, { "epoch": 2.70076726342711, "grad_norm": 0.2548097195613678, "learning_rate": 8.278561333588993e-05, "loss": 0.9841, "step": 1056 }, { "epoch": 2.703324808184143, "grad_norm": 0.3371557483370203, "learning_rate": 8.264281476439662e-05, "loss": 0.984, "step": 1057 }, { "epoch": 2.7058823529411766, "grad_norm": 0.38976688577634183, "learning_rate": 8.250000776276551e-05, "loss": 0.9731, "step": 1058 }, { "epoch": 2.70843989769821, "grad_norm": 0.2695308176694805, "learning_rate": 8.235719278652704e-05, "loss": 1.0008, "step": 1059 }, { "epoch": 2.710997442455243, "grad_norm": 0.2799834287903197, "learning_rate": 8.221437029123715e-05, "loss": 0.96, "step": 1060 }, { "epoch": 2.713554987212276, "grad_norm": 0.3887662531222578, "learning_rate": 8.20715407324758e-05, "loss": 1.0134, "step": 1061 }, { "epoch": 2.7161125319693094, "grad_norm": 0.36475843384332224, "learning_rate": 8.192870456584536e-05, "loss": 0.9869, "step": 1062 }, { "epoch": 2.718670076726343, "grad_norm": 0.3842950619442295, "learning_rate": 8.178586224696938e-05, "loss": 1.0191, "step": 1063 }, { "epoch": 2.7212276214833757, "grad_norm": 0.29521526511075435, "learning_rate": 8.164301423149104e-05, "loss": 0.9847, "step": 1064 }, { "epoch": 2.7237851662404093, "grad_norm": 0.2510688717518455, "learning_rate": 8.150016097507161e-05, "loss": 0.9537, "step": 1065 }, { "epoch": 2.7263427109974425, "grad_norm": 0.31175386208986516, "learning_rate": 8.135730293338918e-05, "loss": 0.9715, "step": 1066 }, { "epoch": 2.7289002557544757, "grad_norm": 0.2969969026627777, "learning_rate": 8.121444056213698e-05, "loss": 0.9778, "step": 1067 }, { "epoch": 2.731457800511509, "grad_norm": 0.316196872282454, "learning_rate": 8.107157431702219e-05, "loss": 0.9979, "step": 1068 }, { "epoch": 2.734015345268542, "grad_norm": 0.2677096371345643, "learning_rate": 8.092870465376422e-05, "loss": 0.972, "step": 1069 }, { "epoch": 2.7365728900255757, "grad_norm": 0.25111395109245066, "learning_rate": 8.078583202809347e-05, "loss": 1.0173, "step": 1070 }, { "epoch": 2.7391304347826084, "grad_norm": 0.23618007037740435, "learning_rate": 8.064295689574979e-05, "loss": 0.9681, "step": 1071 }, { "epoch": 2.741687979539642, "grad_norm": 0.2462154966468633, "learning_rate": 8.050007971248095e-05, "loss": 0.9977, "step": 1072 }, { "epoch": 2.7442455242966752, "grad_norm": 0.2396576027964869, "learning_rate": 8.035720093404133e-05, "loss": 0.9817, "step": 1073 }, { "epoch": 2.7468030690537084, "grad_norm": 0.23288900252567163, "learning_rate": 8.021432101619034e-05, "loss": 0.9677, "step": 1074 }, { "epoch": 2.7493606138107416, "grad_norm": 0.309943456329605, "learning_rate": 8.007144041469111e-05, "loss": 1.0198, "step": 1075 }, { "epoch": 2.7519181585677748, "grad_norm": 0.2438257902275988, "learning_rate": 7.992855958530893e-05, "loss": 0.9774, "step": 1076 }, { "epoch": 2.7544757033248084, "grad_norm": 0.24225939294568138, "learning_rate": 7.978567898380968e-05, "loss": 0.9975, "step": 1077 }, { "epoch": 2.7570332480818416, "grad_norm": 0.2557453042666024, "learning_rate": 7.96427990659587e-05, "loss": 0.9601, "step": 1078 }, { "epoch": 2.7595907928388748, "grad_norm": 0.25399744095479343, "learning_rate": 7.949992028751908e-05, "loss": 0.94, "step": 1079 }, { "epoch": 2.762148337595908, "grad_norm": 0.25806395609838956, "learning_rate": 7.935704310425022e-05, "loss": 0.9856, "step": 1080 }, { "epoch": 2.764705882352941, "grad_norm": 0.2778516319437345, "learning_rate": 7.921416797190653e-05, "loss": 0.9485, "step": 1081 }, { "epoch": 2.7672634271099743, "grad_norm": 0.2652382709743763, "learning_rate": 7.90712953462358e-05, "loss": 0.9852, "step": 1082 }, { "epoch": 2.7698209718670075, "grad_norm": 0.3078124836381294, "learning_rate": 7.892842568297784e-05, "loss": 0.9843, "step": 1083 }, { "epoch": 2.772378516624041, "grad_norm": 0.2630029283693419, "learning_rate": 7.878555943786304e-05, "loss": 0.9866, "step": 1084 }, { "epoch": 2.7749360613810743, "grad_norm": 0.3230772942242779, "learning_rate": 7.864269706661084e-05, "loss": 0.9617, "step": 1085 }, { "epoch": 2.7774936061381075, "grad_norm": 0.33688102829350425, "learning_rate": 7.84998390249284e-05, "loss": 1.0151, "step": 1086 }, { "epoch": 2.7800511508951407, "grad_norm": 0.27010473360932136, "learning_rate": 7.8356985768509e-05, "loss": 0.9416, "step": 1087 }, { "epoch": 2.782608695652174, "grad_norm": 0.3216032949279463, "learning_rate": 7.821413775303063e-05, "loss": 0.9677, "step": 1088 }, { "epoch": 2.785166240409207, "grad_norm": 0.3184797598775921, "learning_rate": 7.807129543415467e-05, "loss": 0.9878, "step": 1089 }, { "epoch": 2.78772378516624, "grad_norm": 0.26980179286312655, "learning_rate": 7.792845926752422e-05, "loss": 0.9559, "step": 1090 }, { "epoch": 2.790281329923274, "grad_norm": 0.2788560924053536, "learning_rate": 7.778562970876285e-05, "loss": 0.9315, "step": 1091 }, { "epoch": 2.792838874680307, "grad_norm": 0.34225351537345716, "learning_rate": 7.764280721347296e-05, "loss": 0.9905, "step": 1092 }, { "epoch": 2.79539641943734, "grad_norm": 0.3181751957801659, "learning_rate": 7.749999223723451e-05, "loss": 0.992, "step": 1093 }, { "epoch": 2.7979539641943734, "grad_norm": 0.2617895154207013, "learning_rate": 7.73571852356034e-05, "loss": 0.976, "step": 1094 }, { "epoch": 2.8005115089514065, "grad_norm": 0.26160435542511723, "learning_rate": 7.72143866641101e-05, "loss": 0.9717, "step": 1095 }, { "epoch": 2.80306905370844, "grad_norm": 0.3005466825228635, "learning_rate": 7.707159697825824e-05, "loss": 1.019, "step": 1096 }, { "epoch": 2.805626598465473, "grad_norm": 0.2737567544420114, "learning_rate": 7.692881663352306e-05, "loss": 0.9877, "step": 1097 }, { "epoch": 2.8081841432225065, "grad_norm": 0.25383083364525466, "learning_rate": 7.678604608535007e-05, "loss": 1.0, "step": 1098 }, { "epoch": 2.8107416879795397, "grad_norm": 0.24966621455789795, "learning_rate": 7.664328578915341e-05, "loss": 0.9913, "step": 1099 }, { "epoch": 2.813299232736573, "grad_norm": 0.26731325577468995, "learning_rate": 7.650053620031461e-05, "loss": 0.9667, "step": 1100 }, { "epoch": 2.815856777493606, "grad_norm": 0.24369512341274932, "learning_rate": 7.635779777418105e-05, "loss": 0.9941, "step": 1101 }, { "epoch": 2.8184143222506393, "grad_norm": 0.22967457166848224, "learning_rate": 7.621507096606445e-05, "loss": 0.9755, "step": 1102 }, { "epoch": 2.820971867007673, "grad_norm": 0.2571549233122558, "learning_rate": 7.607235623123952e-05, "loss": 0.9896, "step": 1103 }, { "epoch": 2.8235294117647056, "grad_norm": 0.21308122874558627, "learning_rate": 7.592965402494242e-05, "loss": 0.9671, "step": 1104 }, { "epoch": 2.8260869565217392, "grad_norm": 0.23965692093466115, "learning_rate": 7.578696480236935e-05, "loss": 0.9572, "step": 1105 }, { "epoch": 2.8286445012787724, "grad_norm": 0.20206088609556147, "learning_rate": 7.564428901867512e-05, "loss": 0.9874, "step": 1106 }, { "epoch": 2.8312020460358056, "grad_norm": 0.24456595967971878, "learning_rate": 7.550162712897166e-05, "loss": 0.9834, "step": 1107 }, { "epoch": 2.833759590792839, "grad_norm": 0.2395628798306672, "learning_rate": 7.535897958832657e-05, "loss": 0.9932, "step": 1108 }, { "epoch": 2.836317135549872, "grad_norm": 0.24488788117262922, "learning_rate": 7.521634685176171e-05, "loss": 0.9976, "step": 1109 }, { "epoch": 2.8388746803069056, "grad_norm": 0.2475079536458042, "learning_rate": 7.507372937425166e-05, "loss": 0.979, "step": 1110 }, { "epoch": 2.8414322250639388, "grad_norm": 0.25103418982918085, "learning_rate": 7.493112761072238e-05, "loss": 0.9784, "step": 1111 }, { "epoch": 2.843989769820972, "grad_norm": 0.21080156526173952, "learning_rate": 7.478854201604967e-05, "loss": 0.9861, "step": 1112 }, { "epoch": 2.846547314578005, "grad_norm": 0.2636072879534979, "learning_rate": 7.464597304505779e-05, "loss": 0.9767, "step": 1113 }, { "epoch": 2.8491048593350383, "grad_norm": 0.3447559742850428, "learning_rate": 7.450342115251793e-05, "loss": 0.9763, "step": 1114 }, { "epoch": 2.8516624040920715, "grad_norm": 0.3554201272513753, "learning_rate": 7.436088679314689e-05, "loss": 0.9814, "step": 1115 }, { "epoch": 2.8542199488491047, "grad_norm": 0.2338897866384284, "learning_rate": 7.42183704216054e-05, "loss": 0.9737, "step": 1116 }, { "epoch": 2.8567774936061383, "grad_norm": 0.3005337593534035, "learning_rate": 7.407587249249691e-05, "loss": 0.9593, "step": 1117 }, { "epoch": 2.8593350383631715, "grad_norm": 0.28306065139483866, "learning_rate": 7.393339346036604e-05, "loss": 0.9912, "step": 1118 }, { "epoch": 2.8618925831202047, "grad_norm": 0.32462258403513267, "learning_rate": 7.379093377969708e-05, "loss": 0.9636, "step": 1119 }, { "epoch": 2.864450127877238, "grad_norm": 0.23458466619854929, "learning_rate": 7.364849390491269e-05, "loss": 1.0179, "step": 1120 }, { "epoch": 2.867007672634271, "grad_norm": 0.26599173050846503, "learning_rate": 7.350607429037222e-05, "loss": 0.9865, "step": 1121 }, { "epoch": 2.869565217391304, "grad_norm": 0.28672176422376533, "learning_rate": 7.336367539037047e-05, "loss": 0.9697, "step": 1122 }, { "epoch": 2.8721227621483374, "grad_norm": 0.38174167324236646, "learning_rate": 7.32212976591362e-05, "loss": 0.9394, "step": 1123 }, { "epoch": 2.874680306905371, "grad_norm": 0.3008937451500426, "learning_rate": 7.307894155083054e-05, "loss": 1.0193, "step": 1124 }, { "epoch": 2.877237851662404, "grad_norm": 0.2647744376072329, "learning_rate": 7.293660751954576e-05, "loss": 0.9959, "step": 1125 }, { "epoch": 2.8797953964194374, "grad_norm": 0.3361184185105208, "learning_rate": 7.279429601930365e-05, "loss": 0.9886, "step": 1126 }, { "epoch": 2.8823529411764706, "grad_norm": 0.28703805124273124, "learning_rate": 7.265200750405408e-05, "loss": 0.9552, "step": 1127 }, { "epoch": 2.8849104859335037, "grad_norm": 0.2282314607084684, "learning_rate": 7.250974242767372e-05, "loss": 0.9613, "step": 1128 }, { "epoch": 2.887468030690537, "grad_norm": 0.2492748754541012, "learning_rate": 7.236750124396435e-05, "loss": 0.9668, "step": 1129 }, { "epoch": 2.89002557544757, "grad_norm": 0.25888788395575085, "learning_rate": 7.222528440665167e-05, "loss": 0.9925, "step": 1130 }, { "epoch": 2.8925831202046037, "grad_norm": 0.24496080625420605, "learning_rate": 7.20830923693836e-05, "loss": 1.0041, "step": 1131 }, { "epoch": 2.895140664961637, "grad_norm": 0.23733176427430222, "learning_rate": 7.194092558572897e-05, "loss": 0.9425, "step": 1132 }, { "epoch": 2.89769820971867, "grad_norm": 0.27037826071655174, "learning_rate": 7.179878450917613e-05, "loss": 0.9618, "step": 1133 }, { "epoch": 2.9002557544757033, "grad_norm": 0.2110486047552461, "learning_rate": 7.165666959313135e-05, "loss": 0.9625, "step": 1134 }, { "epoch": 2.9028132992327365, "grad_norm": 0.2356138250996952, "learning_rate": 7.151458129091752e-05, "loss": 0.9868, "step": 1135 }, { "epoch": 2.90537084398977, "grad_norm": 0.2507648626394698, "learning_rate": 7.137252005577256e-05, "loss": 0.9579, "step": 1136 }, { "epoch": 2.907928388746803, "grad_norm": 0.21729817798268314, "learning_rate": 7.123048634084815e-05, "loss": 1.0193, "step": 1137 }, { "epoch": 2.9104859335038364, "grad_norm": 0.25511738825377567, "learning_rate": 7.108848059920805e-05, "loss": 0.9594, "step": 1138 }, { "epoch": 2.9130434782608696, "grad_norm": 0.25447395942517514, "learning_rate": 7.09465032838269e-05, "loss": 0.9746, "step": 1139 }, { "epoch": 2.915601023017903, "grad_norm": 0.24784365067022293, "learning_rate": 7.080455484758863e-05, "loss": 0.9659, "step": 1140 }, { "epoch": 2.918158567774936, "grad_norm": 0.2730224277035152, "learning_rate": 7.066263574328505e-05, "loss": 0.9818, "step": 1141 }, { "epoch": 2.920716112531969, "grad_norm": 0.30594100479026, "learning_rate": 7.052074642361444e-05, "loss": 0.9915, "step": 1142 }, { "epoch": 2.923273657289003, "grad_norm": 0.32054932862442914, "learning_rate": 7.037888734117998e-05, "loss": 0.9882, "step": 1143 }, { "epoch": 2.9258312020460355, "grad_norm": 0.23958919561701653, "learning_rate": 7.023705894848848e-05, "loss": 0.9666, "step": 1144 }, { "epoch": 2.928388746803069, "grad_norm": 0.27076318118261017, "learning_rate": 7.009526169794885e-05, "loss": 0.9746, "step": 1145 }, { "epoch": 2.9309462915601023, "grad_norm": 0.2729574133461879, "learning_rate": 6.995349604187061e-05, "loss": 0.9624, "step": 1146 }, { "epoch": 2.9335038363171355, "grad_norm": 0.3259725455577868, "learning_rate": 6.981176243246257e-05, "loss": 0.9795, "step": 1147 }, { "epoch": 2.9360613810741687, "grad_norm": 0.34256481150449963, "learning_rate": 6.967006132183127e-05, "loss": 0.977, "step": 1148 }, { "epoch": 2.938618925831202, "grad_norm": 0.2828018012599345, "learning_rate": 6.952839316197956e-05, "loss": 0.9928, "step": 1149 }, { "epoch": 2.9411764705882355, "grad_norm": 0.2397889702793678, "learning_rate": 6.938675840480525e-05, "loss": 0.9822, "step": 1150 }, { "epoch": 2.9437340153452687, "grad_norm": 0.331164422112377, "learning_rate": 6.924515750209954e-05, "loss": 0.9973, "step": 1151 }, { "epoch": 2.946291560102302, "grad_norm": 0.2704740780802998, "learning_rate": 6.910359090554572e-05, "loss": 0.9685, "step": 1152 }, { "epoch": 2.948849104859335, "grad_norm": 0.2437699512495755, "learning_rate": 6.896205906671755e-05, "loss": 0.9896, "step": 1153 }, { "epoch": 2.9514066496163682, "grad_norm": 0.24008371878492457, "learning_rate": 6.882056243707796e-05, "loss": 0.9948, "step": 1154 }, { "epoch": 2.9539641943734014, "grad_norm": 0.2714718735118312, "learning_rate": 6.86791014679776e-05, "loss": 1.0107, "step": 1155 }, { "epoch": 2.9565217391304346, "grad_norm": 0.2689100345729253, "learning_rate": 6.85376766106533e-05, "loss": 0.9844, "step": 1156 }, { "epoch": 2.959079283887468, "grad_norm": 0.217002318039709, "learning_rate": 6.839628831622681e-05, "loss": 0.9748, "step": 1157 }, { "epoch": 2.9616368286445014, "grad_norm": 0.2919920400101465, "learning_rate": 6.825493703570311e-05, "loss": 0.9699, "step": 1158 }, { "epoch": 2.9641943734015346, "grad_norm": 0.3490734108048557, "learning_rate": 6.811362321996926e-05, "loss": 0.9694, "step": 1159 }, { "epoch": 2.9667519181585678, "grad_norm": 0.3103643754348234, "learning_rate": 6.797234731979267e-05, "loss": 0.991, "step": 1160 }, { "epoch": 2.969309462915601, "grad_norm": 0.1939069857875497, "learning_rate": 6.783110978581989e-05, "loss": 0.9614, "step": 1161 }, { "epoch": 2.971867007672634, "grad_norm": 0.2495187824732926, "learning_rate": 6.768991106857508e-05, "loss": 0.9656, "step": 1162 }, { "epoch": 2.9744245524296673, "grad_norm": 0.3034345894428266, "learning_rate": 6.754875161845855e-05, "loss": 1.0069, "step": 1163 }, { "epoch": 2.976982097186701, "grad_norm": 0.3567922857742952, "learning_rate": 6.740763188574546e-05, "loss": 0.9612, "step": 1164 }, { "epoch": 2.979539641943734, "grad_norm": 0.25891106467169334, "learning_rate": 6.726655232058409e-05, "loss": 0.9696, "step": 1165 }, { "epoch": 2.9820971867007673, "grad_norm": 0.25153156564503487, "learning_rate": 6.712551337299473e-05, "loss": 1.0014, "step": 1166 }, { "epoch": 2.9846547314578005, "grad_norm": 0.32964252932862226, "learning_rate": 6.69845154928681e-05, "loss": 0.9773, "step": 1167 }, { "epoch": 2.9872122762148337, "grad_norm": 0.2917177962042733, "learning_rate": 6.684355912996386e-05, "loss": 0.9911, "step": 1168 }, { "epoch": 2.9897698209718673, "grad_norm": 0.2002913243087303, "learning_rate": 6.670264473390931e-05, "loss": 0.9683, "step": 1169 }, { "epoch": 2.9923273657289, "grad_norm": 0.26813771266232983, "learning_rate": 6.656177275419785e-05, "loss": 0.967, "step": 1170 }, { "epoch": 2.9948849104859336, "grad_norm": 0.2590485360645914, "learning_rate": 6.64209436401875e-05, "loss": 0.9638, "step": 1171 }, { "epoch": 2.997442455242967, "grad_norm": 0.26357426110685056, "learning_rate": 6.62801578410997e-05, "loss": 1.0056, "step": 1172 }, { "epoch": 3.0, "grad_norm": 0.22456837673610008, "learning_rate": 6.61394158060176e-05, "loss": 0.9933, "step": 1173 }, { "epoch": 3.002557544757033, "grad_norm": 0.22123515970304183, "learning_rate": 6.59987179838848e-05, "loss": 0.9712, "step": 1174 }, { "epoch": 3.0051150895140664, "grad_norm": 0.2497098271402969, "learning_rate": 6.58580648235039e-05, "loss": 0.9701, "step": 1175 }, { "epoch": 3.0076726342710995, "grad_norm": 0.2264514281442564, "learning_rate": 6.571745677353492e-05, "loss": 0.9498, "step": 1176 }, { "epoch": 3.010230179028133, "grad_norm": 0.24110920081950274, "learning_rate": 6.557689428249414e-05, "loss": 0.9841, "step": 1177 }, { "epoch": 3.0127877237851663, "grad_norm": 0.28882150068726187, "learning_rate": 6.543637779875237e-05, "loss": 0.9728, "step": 1178 }, { "epoch": 3.0153452685421995, "grad_norm": 0.22165888817736834, "learning_rate": 6.529590777053378e-05, "loss": 0.9263, "step": 1179 }, { "epoch": 3.0179028132992327, "grad_norm": 0.2715939791147568, "learning_rate": 6.515548464591428e-05, "loss": 0.9353, "step": 1180 }, { "epoch": 3.020460358056266, "grad_norm": 0.3321798212445876, "learning_rate": 6.501510887282024e-05, "loss": 0.948, "step": 1181 }, { "epoch": 3.023017902813299, "grad_norm": 0.2852631687681614, "learning_rate": 6.487478089902685e-05, "loss": 0.9406, "step": 1182 }, { "epoch": 3.0255754475703327, "grad_norm": 0.23938138232215803, "learning_rate": 6.473450117215699e-05, "loss": 0.9612, "step": 1183 }, { "epoch": 3.028132992327366, "grad_norm": 0.2897634546793638, "learning_rate": 6.459427013967953e-05, "loss": 0.93, "step": 1184 }, { "epoch": 3.030690537084399, "grad_norm": 0.28668995967161215, "learning_rate": 6.445408824890805e-05, "loss": 0.943, "step": 1185 }, { "epoch": 3.0332480818414322, "grad_norm": 0.23250708905243717, "learning_rate": 6.431395594699943e-05, "loss": 0.9264, "step": 1186 }, { "epoch": 3.0358056265984654, "grad_norm": 0.3127461016723165, "learning_rate": 6.417387368095225e-05, "loss": 0.9492, "step": 1187 }, { "epoch": 3.0383631713554986, "grad_norm": 0.26702473205124055, "learning_rate": 6.403384189760556e-05, "loss": 0.9173, "step": 1188 }, { "epoch": 3.040920716112532, "grad_norm": 0.2692197582092417, "learning_rate": 6.389386104363738e-05, "loss": 0.9483, "step": 1189 }, { "epoch": 3.0434782608695654, "grad_norm": 0.29389458281034464, "learning_rate": 6.375393156556325e-05, "loss": 0.938, "step": 1190 }, { "epoch": 3.0460358056265986, "grad_norm": 0.24003231343808254, "learning_rate": 6.361405390973489e-05, "loss": 0.9174, "step": 1191 }, { "epoch": 3.0485933503836318, "grad_norm": 0.25208756985944336, "learning_rate": 6.347422852233862e-05, "loss": 0.9542, "step": 1192 }, { "epoch": 3.051150895140665, "grad_norm": 0.24466794377181064, "learning_rate": 6.333445584939407e-05, "loss": 0.9617, "step": 1193 }, { "epoch": 3.053708439897698, "grad_norm": 0.23317237737554486, "learning_rate": 6.319473633675275e-05, "loss": 0.9349, "step": 1194 }, { "epoch": 3.0562659846547313, "grad_norm": 0.24590715837760968, "learning_rate": 6.305507043009657e-05, "loss": 0.9414, "step": 1195 }, { "epoch": 3.0588235294117645, "grad_norm": 0.21035477411097228, "learning_rate": 6.291545857493645e-05, "loss": 0.9512, "step": 1196 }, { "epoch": 3.061381074168798, "grad_norm": 0.2248505455887991, "learning_rate": 6.277590121661098e-05, "loss": 0.9522, "step": 1197 }, { "epoch": 3.0639386189258313, "grad_norm": 0.2471462687532793, "learning_rate": 6.263639880028468e-05, "loss": 0.9493, "step": 1198 }, { "epoch": 3.0664961636828645, "grad_norm": 0.22868376945738234, "learning_rate": 6.249695177094707e-05, "loss": 0.9668, "step": 1199 }, { "epoch": 3.0690537084398977, "grad_norm": 0.23527194146680278, "learning_rate": 6.235756057341084e-05, "loss": 0.9279, "step": 1200 }, { "epoch": 3.071611253196931, "grad_norm": 0.2513612868250463, "learning_rate": 6.221822565231066e-05, "loss": 0.9403, "step": 1201 }, { "epoch": 3.074168797953964, "grad_norm": 0.22860913544864897, "learning_rate": 6.207894745210168e-05, "loss": 0.9616, "step": 1202 }, { "epoch": 3.0767263427109977, "grad_norm": 0.24014291985565175, "learning_rate": 6.193972641705809e-05, "loss": 0.9664, "step": 1203 }, { "epoch": 3.079283887468031, "grad_norm": 0.22572397342217615, "learning_rate": 6.180056299127174e-05, "loss": 0.9663, "step": 1204 }, { "epoch": 3.081841432225064, "grad_norm": 0.25121933762619786, "learning_rate": 6.16614576186507e-05, "loss": 0.9676, "step": 1205 }, { "epoch": 3.084398976982097, "grad_norm": 0.21264743561877053, "learning_rate": 6.152241074291791e-05, "loss": 0.9385, "step": 1206 }, { "epoch": 3.0869565217391304, "grad_norm": 0.2110657205113156, "learning_rate": 6.13834228076097e-05, "loss": 0.9593, "step": 1207 }, { "epoch": 3.0895140664961636, "grad_norm": 0.23064076505093895, "learning_rate": 6.12444942560744e-05, "loss": 0.9859, "step": 1208 }, { "epoch": 3.0920716112531967, "grad_norm": 0.2327889001545048, "learning_rate": 6.110562553147078e-05, "loss": 0.9343, "step": 1209 }, { "epoch": 3.0946291560102304, "grad_norm": 0.22081121627352496, "learning_rate": 6.0966817076767e-05, "loss": 0.9572, "step": 1210 }, { "epoch": 3.0971867007672635, "grad_norm": 0.21410596357542921, "learning_rate": 6.08280693347388e-05, "loss": 0.9577, "step": 1211 }, { "epoch": 3.0997442455242967, "grad_norm": 0.22670771449737367, "learning_rate": 6.068938274796834e-05, "loss": 0.9253, "step": 1212 }, { "epoch": 3.10230179028133, "grad_norm": 0.205343189542066, "learning_rate": 6.055075775884263e-05, "loss": 0.9896, "step": 1213 }, { "epoch": 3.104859335038363, "grad_norm": 0.22769741326879356, "learning_rate": 6.0412194809552316e-05, "loss": 0.9387, "step": 1214 }, { "epoch": 3.1074168797953963, "grad_norm": 0.19822402152888394, "learning_rate": 6.027369434208999e-05, "loss": 0.9808, "step": 1215 }, { "epoch": 3.10997442455243, "grad_norm": 0.23051970557462004, "learning_rate": 6.0135256798249047e-05, "loss": 0.933, "step": 1216 }, { "epoch": 3.112531969309463, "grad_norm": 0.20329115598362008, "learning_rate": 5.999688261962216e-05, "loss": 0.9684, "step": 1217 }, { "epoch": 3.1150895140664963, "grad_norm": 0.21036340816499827, "learning_rate": 5.985857224759981e-05, "loss": 0.944, "step": 1218 }, { "epoch": 3.1176470588235294, "grad_norm": 0.20307590074585102, "learning_rate": 5.972032612336906e-05, "loss": 0.9598, "step": 1219 }, { "epoch": 3.1202046035805626, "grad_norm": 0.2259792004822342, "learning_rate": 5.958214468791189e-05, "loss": 0.9483, "step": 1220 }, { "epoch": 3.122762148337596, "grad_norm": 0.21243681629633632, "learning_rate": 5.944402838200404e-05, "loss": 0.9455, "step": 1221 }, { "epoch": 3.125319693094629, "grad_norm": 0.21205256563770825, "learning_rate": 5.930597764621347e-05, "loss": 0.8963, "step": 1222 }, { "epoch": 3.1278772378516626, "grad_norm": 0.19717448713959743, "learning_rate": 5.916799292089895e-05, "loss": 0.9564, "step": 1223 }, { "epoch": 3.130434782608696, "grad_norm": 0.2244196417767959, "learning_rate": 5.9030074646208745e-05, "loss": 0.9272, "step": 1224 }, { "epoch": 3.132992327365729, "grad_norm": 0.21563385011040548, "learning_rate": 5.8892223262079144e-05, "loss": 0.9316, "step": 1225 }, { "epoch": 3.135549872122762, "grad_norm": 0.2350946628160643, "learning_rate": 5.875443920823297e-05, "loss": 0.9487, "step": 1226 }, { "epoch": 3.1381074168797953, "grad_norm": 0.2865769039296874, "learning_rate": 5.861672292417842e-05, "loss": 0.9492, "step": 1227 }, { "epoch": 3.1406649616368285, "grad_norm": 0.23430970345425967, "learning_rate": 5.84790748492074e-05, "loss": 0.966, "step": 1228 }, { "epoch": 3.1432225063938617, "grad_norm": 0.2467472265535791, "learning_rate": 5.834149542239431e-05, "loss": 0.9708, "step": 1229 }, { "epoch": 3.1457800511508953, "grad_norm": 0.26772393728125105, "learning_rate": 5.8203985082594575e-05, "loss": 0.9557, "step": 1230 }, { "epoch": 3.1483375959079285, "grad_norm": 0.2338023529317996, "learning_rate": 5.806654426844315e-05, "loss": 0.9638, "step": 1231 }, { "epoch": 3.1508951406649617, "grad_norm": 0.2523069016121197, "learning_rate": 5.792917341835335e-05, "loss": 0.9434, "step": 1232 }, { "epoch": 3.153452685421995, "grad_norm": 0.2766552697496739, "learning_rate": 5.77918729705152e-05, "loss": 0.9809, "step": 1233 }, { "epoch": 3.156010230179028, "grad_norm": 0.22646812781120942, "learning_rate": 5.765464336289424e-05, "loss": 0.9639, "step": 1234 }, { "epoch": 3.1585677749360612, "grad_norm": 0.2205961359884855, "learning_rate": 5.751748503322999e-05, "loss": 0.954, "step": 1235 }, { "epoch": 3.1611253196930944, "grad_norm": 0.2701811323136191, "learning_rate": 5.7380398419034644e-05, "loss": 0.9589, "step": 1236 }, { "epoch": 3.163682864450128, "grad_norm": 0.2081039558632908, "learning_rate": 5.7243383957591586e-05, "loss": 0.9471, "step": 1237 }, { "epoch": 3.166240409207161, "grad_norm": 0.19643865068397245, "learning_rate": 5.7106442085954045e-05, "loss": 0.9518, "step": 1238 }, { "epoch": 3.1687979539641944, "grad_norm": 0.30921257471256036, "learning_rate": 5.69695732409438e-05, "loss": 0.9242, "step": 1239 }, { "epoch": 3.1713554987212276, "grad_norm": 0.24583021366711547, "learning_rate": 5.6832777859149536e-05, "loss": 0.9423, "step": 1240 }, { "epoch": 3.1739130434782608, "grad_norm": 0.18950822302407402, "learning_rate": 5.669605637692575e-05, "loss": 0.932, "step": 1241 }, { "epoch": 3.176470588235294, "grad_norm": 0.25157456578331905, "learning_rate": 5.655940923039111e-05, "loss": 0.9379, "step": 1242 }, { "epoch": 3.1790281329923276, "grad_norm": 0.18343916898513093, "learning_rate": 5.642283685542717e-05, "loss": 0.9456, "step": 1243 }, { "epoch": 3.1815856777493607, "grad_norm": 0.19560349844702873, "learning_rate": 5.6286339687677044e-05, "loss": 0.9328, "step": 1244 }, { "epoch": 3.184143222506394, "grad_norm": 0.189610936953741, "learning_rate": 5.614991816254388e-05, "loss": 0.9109, "step": 1245 }, { "epoch": 3.186700767263427, "grad_norm": 0.18320058939508785, "learning_rate": 5.601357271518959e-05, "loss": 0.9584, "step": 1246 }, { "epoch": 3.1892583120204603, "grad_norm": 0.17494234166851327, "learning_rate": 5.587730378053339e-05, "loss": 0.9656, "step": 1247 }, { "epoch": 3.1918158567774935, "grad_norm": 0.19092078945148688, "learning_rate": 5.574111179325039e-05, "loss": 0.9487, "step": 1248 }, { "epoch": 3.1943734015345266, "grad_norm": 0.1860857981568226, "learning_rate": 5.560499718777031e-05, "loss": 0.9372, "step": 1249 }, { "epoch": 3.1969309462915603, "grad_norm": 0.18572653447801232, "learning_rate": 5.5468960398276014e-05, "loss": 0.9459, "step": 1250 }, { "epoch": 3.1994884910485935, "grad_norm": 0.19107345846336404, "learning_rate": 5.5333001858702164e-05, "loss": 0.9255, "step": 1251 }, { "epoch": 3.2020460358056266, "grad_norm": 0.20057541760798753, "learning_rate": 5.519712200273381e-05, "loss": 0.9615, "step": 1252 }, { "epoch": 3.20460358056266, "grad_norm": 0.20198119736904155, "learning_rate": 5.5061321263804933e-05, "loss": 0.9204, "step": 1253 }, { "epoch": 3.207161125319693, "grad_norm": 0.21942879387381486, "learning_rate": 5.4925600075097285e-05, "loss": 0.945, "step": 1254 }, { "epoch": 3.209718670076726, "grad_norm": 0.19469068958831684, "learning_rate": 5.4789958869538756e-05, "loss": 0.9435, "step": 1255 }, { "epoch": 3.21227621483376, "grad_norm": 0.20250937006123632, "learning_rate": 5.4654398079802183e-05, "loss": 0.9364, "step": 1256 }, { "epoch": 3.214833759590793, "grad_norm": 0.19846072138477766, "learning_rate": 5.451891813830382e-05, "loss": 0.94, "step": 1257 }, { "epoch": 3.217391304347826, "grad_norm": 0.20425114535656635, "learning_rate": 5.4383519477202103e-05, "loss": 0.9363, "step": 1258 }, { "epoch": 3.2199488491048593, "grad_norm": 0.185008322081447, "learning_rate": 5.42482025283961e-05, "loss": 0.9815, "step": 1259 }, { "epoch": 3.2225063938618925, "grad_norm": 0.2151529732841821, "learning_rate": 5.41129677235243e-05, "loss": 0.9498, "step": 1260 }, { "epoch": 3.2250639386189257, "grad_norm": 0.1885448397273564, "learning_rate": 5.397781549396316e-05, "loss": 0.9337, "step": 1261 }, { "epoch": 3.227621483375959, "grad_norm": 0.21418784649002942, "learning_rate": 5.3842746270825705e-05, "loss": 0.9171, "step": 1262 }, { "epoch": 3.2301790281329925, "grad_norm": 0.20068889946827412, "learning_rate": 5.370776048496026e-05, "loss": 0.9376, "step": 1263 }, { "epoch": 3.2327365728900257, "grad_norm": 0.24899426008654885, "learning_rate": 5.357285856694891e-05, "loss": 0.9429, "step": 1264 }, { "epoch": 3.235294117647059, "grad_norm": 0.19686757692012147, "learning_rate": 5.34380409471062e-05, "loss": 0.9377, "step": 1265 }, { "epoch": 3.237851662404092, "grad_norm": 0.24870949090788627, "learning_rate": 5.33033080554779e-05, "loss": 0.945, "step": 1266 }, { "epoch": 3.2404092071611252, "grad_norm": 0.20621519140618658, "learning_rate": 5.3168660321839386e-05, "loss": 0.9379, "step": 1267 }, { "epoch": 3.2429667519181584, "grad_norm": 0.21652792479122668, "learning_rate": 5.303409817569449e-05, "loss": 0.9021, "step": 1268 }, { "epoch": 3.2455242966751916, "grad_norm": 0.19103019263904417, "learning_rate": 5.2899622046274e-05, "loss": 0.9613, "step": 1269 }, { "epoch": 3.2480818414322252, "grad_norm": 0.21245341007957305, "learning_rate": 5.276523236253425e-05, "loss": 0.9387, "step": 1270 }, { "epoch": 3.2506393861892584, "grad_norm": 0.2106216561170891, "learning_rate": 5.263092955315595e-05, "loss": 0.9546, "step": 1271 }, { "epoch": 3.2531969309462916, "grad_norm": 0.197972453520414, "learning_rate": 5.2496714046542583e-05, "loss": 0.9391, "step": 1272 }, { "epoch": 3.2557544757033248, "grad_norm": 0.199650022114146, "learning_rate": 5.2362586270819256e-05, "loss": 0.9386, "step": 1273 }, { "epoch": 3.258312020460358, "grad_norm": 0.18979777369555925, "learning_rate": 5.222854665383116e-05, "loss": 0.9495, "step": 1274 }, { "epoch": 3.260869565217391, "grad_norm": 0.2173804109344821, "learning_rate": 5.2094595623142326e-05, "loss": 0.9588, "step": 1275 }, { "epoch": 3.2634271099744243, "grad_norm": 0.2016383197459456, "learning_rate": 5.1960733606034126e-05, "loss": 0.9151, "step": 1276 }, { "epoch": 3.265984654731458, "grad_norm": 0.2047292724222713, "learning_rate": 5.182696102950404e-05, "loss": 0.9686, "step": 1277 }, { "epoch": 3.268542199488491, "grad_norm": 0.2065833579125683, "learning_rate": 5.1693278320264304e-05, "loss": 0.9384, "step": 1278 }, { "epoch": 3.2710997442455243, "grad_norm": 0.20569255957459082, "learning_rate": 5.1559685904740386e-05, "loss": 0.9869, "step": 1279 }, { "epoch": 3.2736572890025575, "grad_norm": 0.19840584494069785, "learning_rate": 5.142618420906985e-05, "loss": 0.9557, "step": 1280 }, { "epoch": 3.2762148337595907, "grad_norm": 0.20387885459079644, "learning_rate": 5.1292773659100755e-05, "loss": 0.9642, "step": 1281 }, { "epoch": 3.2787723785166243, "grad_norm": 0.2101778694530114, "learning_rate": 5.115945468039048e-05, "loss": 0.9509, "step": 1282 }, { "epoch": 3.2813299232736575, "grad_norm": 0.2155780933816927, "learning_rate": 5.1026227698204335e-05, "loss": 0.9499, "step": 1283 }, { "epoch": 3.2838874680306906, "grad_norm": 0.24104255752130535, "learning_rate": 5.089309313751415e-05, "loss": 0.9458, "step": 1284 }, { "epoch": 3.286445012787724, "grad_norm": 0.2121724580915078, "learning_rate": 5.0760051422996925e-05, "loss": 0.9499, "step": 1285 }, { "epoch": 3.289002557544757, "grad_norm": 0.20440164305922942, "learning_rate": 5.0627102979033546e-05, "loss": 0.9458, "step": 1286 }, { "epoch": 3.29156010230179, "grad_norm": 0.21910653895674295, "learning_rate": 5.049424822970731e-05, "loss": 0.9379, "step": 1287 }, { "epoch": 3.2941176470588234, "grad_norm": 0.17657372919405595, "learning_rate": 5.036148759880272e-05, "loss": 0.9249, "step": 1288 }, { "epoch": 3.296675191815857, "grad_norm": 0.22994935624931387, "learning_rate": 5.0228821509803984e-05, "loss": 0.9247, "step": 1289 }, { "epoch": 3.29923273657289, "grad_norm": 0.18809716520389427, "learning_rate": 5.0096250385893825e-05, "loss": 0.9236, "step": 1290 }, { "epoch": 3.3017902813299234, "grad_norm": 0.20395108123985592, "learning_rate": 4.9963774649951975e-05, "loss": 0.9351, "step": 1291 }, { "epoch": 3.3043478260869565, "grad_norm": 0.21017478598124728, "learning_rate": 4.983139472455387e-05, "loss": 0.9603, "step": 1292 }, { "epoch": 3.3069053708439897, "grad_norm": 0.21877137266724161, "learning_rate": 4.969911103196942e-05, "loss": 0.9067, "step": 1293 }, { "epoch": 3.309462915601023, "grad_norm": 0.18726348177523444, "learning_rate": 4.956692399416149e-05, "loss": 0.9368, "step": 1294 }, { "epoch": 3.312020460358056, "grad_norm": 0.2241750270363803, "learning_rate": 4.943483403278468e-05, "loss": 0.947, "step": 1295 }, { "epoch": 3.3145780051150897, "grad_norm": 0.20581443285806397, "learning_rate": 4.9302841569183884e-05, "loss": 0.9575, "step": 1296 }, { "epoch": 3.317135549872123, "grad_norm": 0.17452182993008977, "learning_rate": 4.9170947024393074e-05, "loss": 0.9156, "step": 1297 }, { "epoch": 3.319693094629156, "grad_norm": 0.198949333785195, "learning_rate": 4.9039150819133775e-05, "loss": 0.9348, "step": 1298 }, { "epoch": 3.3222506393861893, "grad_norm": 0.16601657169918604, "learning_rate": 4.890745337381388e-05, "loss": 0.9587, "step": 1299 }, { "epoch": 3.3248081841432224, "grad_norm": 0.23036877304791145, "learning_rate": 4.877585510852627e-05, "loss": 0.9792, "step": 1300 }, { "epoch": 3.3273657289002556, "grad_norm": 0.18765197640496664, "learning_rate": 4.864435644304742e-05, "loss": 0.9253, "step": 1301 }, { "epoch": 3.329923273657289, "grad_norm": 0.19041731553942576, "learning_rate": 4.851295779683616e-05, "loss": 0.9535, "step": 1302 }, { "epoch": 3.3324808184143224, "grad_norm": 0.2087435808060436, "learning_rate": 4.8381659589032186e-05, "loss": 0.9338, "step": 1303 }, { "epoch": 3.3350383631713556, "grad_norm": 0.1903448069067344, "learning_rate": 4.825046223845486e-05, "loss": 0.9499, "step": 1304 }, { "epoch": 3.337595907928389, "grad_norm": 0.21308090181205586, "learning_rate": 4.811936616360186e-05, "loss": 0.9256, "step": 1305 }, { "epoch": 3.340153452685422, "grad_norm": 0.2023342708755437, "learning_rate": 4.798837178264772e-05, "loss": 0.9582, "step": 1306 }, { "epoch": 3.342710997442455, "grad_norm": 0.21619791962247753, "learning_rate": 4.78574795134427e-05, "loss": 0.9125, "step": 1307 }, { "epoch": 3.3452685421994883, "grad_norm": 0.2487539660815107, "learning_rate": 4.772668977351128e-05, "loss": 0.9537, "step": 1308 }, { "epoch": 3.3478260869565215, "grad_norm": 0.2240156883350933, "learning_rate": 4.7596002980050834e-05, "loss": 0.9401, "step": 1309 }, { "epoch": 3.350383631713555, "grad_norm": 0.2251746608186689, "learning_rate": 4.7465419549930476e-05, "loss": 0.9782, "step": 1310 }, { "epoch": 3.3529411764705883, "grad_norm": 0.22881310384597994, "learning_rate": 4.733493989968949e-05, "loss": 0.9458, "step": 1311 }, { "epoch": 3.3554987212276215, "grad_norm": 0.2141099007638843, "learning_rate": 4.7204564445536234e-05, "loss": 0.9396, "step": 1312 }, { "epoch": 3.3580562659846547, "grad_norm": 0.1882802550926345, "learning_rate": 4.707429360334662e-05, "loss": 0.942, "step": 1313 }, { "epoch": 3.360613810741688, "grad_norm": 0.2179119833942681, "learning_rate": 4.694412778866285e-05, "loss": 0.9504, "step": 1314 }, { "epoch": 3.363171355498721, "grad_norm": 0.16843886415285414, "learning_rate": 4.681406741669216e-05, "loss": 0.9221, "step": 1315 }, { "epoch": 3.3657289002557547, "grad_norm": 0.21980007814521796, "learning_rate": 4.668411290230543e-05, "loss": 0.944, "step": 1316 }, { "epoch": 3.368286445012788, "grad_norm": 0.1510130725197139, "learning_rate": 4.655426466003586e-05, "loss": 0.9563, "step": 1317 }, { "epoch": 3.370843989769821, "grad_norm": 0.19586517189701522, "learning_rate": 4.6424523104077654e-05, "loss": 0.9508, "step": 1318 }, { "epoch": 3.373401534526854, "grad_norm": 0.1995467600478656, "learning_rate": 4.629488864828472e-05, "loss": 0.9502, "step": 1319 }, { "epoch": 3.3759590792838874, "grad_norm": 0.1742993616386661, "learning_rate": 4.6165361706169325e-05, "loss": 0.9268, "step": 1320 }, { "epoch": 3.3785166240409206, "grad_norm": 0.2067544794585532, "learning_rate": 4.603594269090078e-05, "loss": 0.9268, "step": 1321 }, { "epoch": 3.381074168797954, "grad_norm": 0.2227068577818483, "learning_rate": 4.5906632015304116e-05, "loss": 0.9358, "step": 1322 }, { "epoch": 3.3836317135549874, "grad_norm": 0.2034466989052333, "learning_rate": 4.5777430091858855e-05, "loss": 0.9302, "step": 1323 }, { "epoch": 3.3861892583120206, "grad_norm": 0.20709571806774676, "learning_rate": 4.564833733269755e-05, "loss": 0.9427, "step": 1324 }, { "epoch": 3.3887468030690537, "grad_norm": 0.22013092566675613, "learning_rate": 4.5519354149604474e-05, "loss": 0.9437, "step": 1325 }, { "epoch": 3.391304347826087, "grad_norm": 0.18450541197105383, "learning_rate": 4.539048095401452e-05, "loss": 0.9466, "step": 1326 }, { "epoch": 3.39386189258312, "grad_norm": 0.22548387813850762, "learning_rate": 4.526171815701165e-05, "loss": 0.9336, "step": 1327 }, { "epoch": 3.3964194373401533, "grad_norm": 0.1820733823905873, "learning_rate": 4.513306616932764e-05, "loss": 0.9215, "step": 1328 }, { "epoch": 3.398976982097187, "grad_norm": 0.21404349632115405, "learning_rate": 4.5004525401340915e-05, "loss": 0.9801, "step": 1329 }, { "epoch": 3.40153452685422, "grad_norm": 0.18377817821243256, "learning_rate": 4.487609626307508e-05, "loss": 0.9655, "step": 1330 }, { "epoch": 3.4040920716112533, "grad_norm": 0.1923893878636668, "learning_rate": 4.4747779164197535e-05, "loss": 0.9382, "step": 1331 }, { "epoch": 3.4066496163682864, "grad_norm": 0.19516009680845245, "learning_rate": 4.4619574514018486e-05, "loss": 0.9557, "step": 1332 }, { "epoch": 3.4092071611253196, "grad_norm": 0.19144644869283248, "learning_rate": 4.449148272148934e-05, "loss": 0.9345, "step": 1333 }, { "epoch": 3.411764705882353, "grad_norm": 0.1817955488888704, "learning_rate": 4.436350419520154e-05, "loss": 0.9608, "step": 1334 }, { "epoch": 3.414322250639386, "grad_norm": 0.2056911128568184, "learning_rate": 4.423563934338519e-05, "loss": 0.9458, "step": 1335 }, { "epoch": 3.4168797953964196, "grad_norm": 0.1693771378014072, "learning_rate": 4.410788857390785e-05, "loss": 0.9466, "step": 1336 }, { "epoch": 3.419437340153453, "grad_norm": 0.20830311663566495, "learning_rate": 4.39802522942731e-05, "loss": 0.9408, "step": 1337 }, { "epoch": 3.421994884910486, "grad_norm": 0.1698790309922409, "learning_rate": 4.385273091161937e-05, "loss": 0.9305, "step": 1338 }, { "epoch": 3.424552429667519, "grad_norm": 0.19474240897387077, "learning_rate": 4.372532483271863e-05, "loss": 0.9375, "step": 1339 }, { "epoch": 3.4271099744245523, "grad_norm": 0.2059429092680418, "learning_rate": 4.3598034463974966e-05, "loss": 0.9869, "step": 1340 }, { "epoch": 3.4296675191815855, "grad_norm": 0.19031026060303782, "learning_rate": 4.347086021142339e-05, "loss": 0.9765, "step": 1341 }, { "epoch": 3.4322250639386187, "grad_norm": 0.19960933133782244, "learning_rate": 4.3343802480728544e-05, "loss": 0.9431, "step": 1342 }, { "epoch": 3.4347826086956523, "grad_norm": 0.1924073308227482, "learning_rate": 4.321686167718337e-05, "loss": 0.9545, "step": 1343 }, { "epoch": 3.4373401534526855, "grad_norm": 0.2028658725938022, "learning_rate": 4.309003820570785e-05, "loss": 0.9377, "step": 1344 }, { "epoch": 3.4398976982097187, "grad_norm": 0.2106823975486889, "learning_rate": 4.296333247084764e-05, "loss": 0.9283, "step": 1345 }, { "epoch": 3.442455242966752, "grad_norm": 0.21370019365379003, "learning_rate": 4.283674487677297e-05, "loss": 0.9663, "step": 1346 }, { "epoch": 3.445012787723785, "grad_norm": 0.20381679039668288, "learning_rate": 4.271027582727703e-05, "loss": 0.9425, "step": 1347 }, { "epoch": 3.4475703324808182, "grad_norm": 0.2465303759456818, "learning_rate": 4.2583925725774996e-05, "loss": 0.963, "step": 1348 }, { "epoch": 3.4501278772378514, "grad_norm": 0.2017710128697274, "learning_rate": 4.2457694975302625e-05, "loss": 0.969, "step": 1349 }, { "epoch": 3.452685421994885, "grad_norm": 0.2599485575517086, "learning_rate": 4.233158397851494e-05, "loss": 0.9578, "step": 1350 }, { "epoch": 3.455242966751918, "grad_norm": 0.20994916380961168, "learning_rate": 4.220559313768492e-05, "loss": 0.9517, "step": 1351 }, { "epoch": 3.4578005115089514, "grad_norm": 0.25562334357376887, "learning_rate": 4.207972285470236e-05, "loss": 0.9593, "step": 1352 }, { "epoch": 3.4603580562659846, "grad_norm": 0.2018942765243476, "learning_rate": 4.1953973531072403e-05, "loss": 0.9238, "step": 1353 }, { "epoch": 3.4629156010230178, "grad_norm": 0.23893893502461097, "learning_rate": 4.1828345567914426e-05, "loss": 0.9463, "step": 1354 }, { "epoch": 3.4654731457800514, "grad_norm": 0.2377570507765394, "learning_rate": 4.17028393659606e-05, "loss": 0.9379, "step": 1355 }, { "epoch": 3.4680306905370846, "grad_norm": 0.21617110584103066, "learning_rate": 4.157745532555484e-05, "loss": 0.9445, "step": 1356 }, { "epoch": 3.4705882352941178, "grad_norm": 0.20973373939841763, "learning_rate": 4.145219384665128e-05, "loss": 0.9471, "step": 1357 }, { "epoch": 3.473145780051151, "grad_norm": 0.19248666440528944, "learning_rate": 4.1327055328813036e-05, "loss": 0.9492, "step": 1358 }, { "epoch": 3.475703324808184, "grad_norm": 0.19782620860430303, "learning_rate": 4.1202040171211195e-05, "loss": 0.9677, "step": 1359 }, { "epoch": 3.4782608695652173, "grad_norm": 0.18288110899297144, "learning_rate": 4.107714877262318e-05, "loss": 0.9574, "step": 1360 }, { "epoch": 3.4808184143222505, "grad_norm": 0.18982354052970898, "learning_rate": 4.0952381531431716e-05, "loss": 0.9411, "step": 1361 }, { "epoch": 3.483375959079284, "grad_norm": 0.19047078322563796, "learning_rate": 4.082773884562342e-05, "loss": 0.9465, "step": 1362 }, { "epoch": 3.4859335038363173, "grad_norm": 0.20024490556690386, "learning_rate": 4.0703221112787774e-05, "loss": 0.9631, "step": 1363 }, { "epoch": 3.4884910485933505, "grad_norm": 0.18855297057246742, "learning_rate": 4.057882873011543e-05, "loss": 0.9333, "step": 1364 }, { "epoch": 3.4910485933503836, "grad_norm": 0.18121257314529818, "learning_rate": 4.045456209439734e-05, "loss": 0.9683, "step": 1365 }, { "epoch": 3.493606138107417, "grad_norm": 0.19866185503250056, "learning_rate": 4.033042160202337e-05, "loss": 0.9872, "step": 1366 }, { "epoch": 3.49616368286445, "grad_norm": 0.17010036933663283, "learning_rate": 4.020640764898096e-05, "loss": 0.9685, "step": 1367 }, { "epoch": 3.498721227621483, "grad_norm": 0.18176622769606524, "learning_rate": 4.0082520630853865e-05, "loss": 0.9112, "step": 1368 }, { "epoch": 3.501278772378517, "grad_norm": 0.1861883153790341, "learning_rate": 3.995876094282104e-05, "loss": 0.9585, "step": 1369 }, { "epoch": 3.50383631713555, "grad_norm": 0.19579755858911602, "learning_rate": 3.983512897965519e-05, "loss": 0.959, "step": 1370 }, { "epoch": 3.506393861892583, "grad_norm": 0.18488711544490097, "learning_rate": 3.9711625135721664e-05, "loss": 0.9555, "step": 1371 }, { "epoch": 3.5089514066496164, "grad_norm": 0.2073614939639127, "learning_rate": 3.958824980497704e-05, "loss": 0.9744, "step": 1372 }, { "epoch": 3.5115089514066495, "grad_norm": 0.17154095562950622, "learning_rate": 3.946500338096811e-05, "loss": 0.9353, "step": 1373 }, { "epoch": 3.5140664961636827, "grad_norm": 0.20478213377969626, "learning_rate": 3.934188625683037e-05, "loss": 0.9568, "step": 1374 }, { "epoch": 3.516624040920716, "grad_norm": 0.18373687324276738, "learning_rate": 3.9218898825286806e-05, "loss": 0.9279, "step": 1375 }, { "epoch": 3.5191815856777495, "grad_norm": 0.1716453870437831, "learning_rate": 3.9096041478646885e-05, "loss": 0.9342, "step": 1376 }, { "epoch": 3.5217391304347827, "grad_norm": 0.18268819201544698, "learning_rate": 3.8973314608805e-05, "loss": 0.962, "step": 1377 }, { "epoch": 3.524296675191816, "grad_norm": 0.16258821810908097, "learning_rate": 3.885071860723937e-05, "loss": 0.9293, "step": 1378 }, { "epoch": 3.526854219948849, "grad_norm": 0.165376063640211, "learning_rate": 3.8728253865010765e-05, "loss": 0.9895, "step": 1379 }, { "epoch": 3.5294117647058822, "grad_norm": 0.16721193942916188, "learning_rate": 3.8605920772761274e-05, "loss": 0.9328, "step": 1380 }, { "epoch": 3.531969309462916, "grad_norm": 0.16130857457103082, "learning_rate": 3.848371972071304e-05, "loss": 0.9859, "step": 1381 }, { "epoch": 3.5345268542199486, "grad_norm": 0.16278759213568428, "learning_rate": 3.8361651098666967e-05, "loss": 0.9569, "step": 1382 }, { "epoch": 3.5370843989769822, "grad_norm": 0.17183294163130294, "learning_rate": 3.8239715296001654e-05, "loss": 0.9418, "step": 1383 }, { "epoch": 3.5396419437340154, "grad_norm": 0.155240959003008, "learning_rate": 3.8117912701671905e-05, "loss": 0.9696, "step": 1384 }, { "epoch": 3.5421994884910486, "grad_norm": 0.17273359598041008, "learning_rate": 3.7996243704207686e-05, "loss": 0.9502, "step": 1385 }, { "epoch": 3.544757033248082, "grad_norm": 0.1703572907276737, "learning_rate": 3.787470869171277e-05, "loss": 0.9673, "step": 1386 }, { "epoch": 3.547314578005115, "grad_norm": 0.163047329660931, "learning_rate": 3.7753308051863534e-05, "loss": 0.9244, "step": 1387 }, { "epoch": 3.5498721227621486, "grad_norm": 0.16125670043718637, "learning_rate": 3.763204217190778e-05, "loss": 0.9414, "step": 1388 }, { "epoch": 3.5524296675191813, "grad_norm": 0.17450887360011574, "learning_rate": 3.751091143866338e-05, "loss": 0.9677, "step": 1389 }, { "epoch": 3.554987212276215, "grad_norm": 0.15580595508138104, "learning_rate": 3.7389916238517224e-05, "loss": 0.9758, "step": 1390 }, { "epoch": 3.557544757033248, "grad_norm": 0.17069367779408143, "learning_rate": 3.726905695742372e-05, "loss": 0.9142, "step": 1391 }, { "epoch": 3.5601023017902813, "grad_norm": 0.16910211167776398, "learning_rate": 3.7148333980903796e-05, "loss": 0.9389, "step": 1392 }, { "epoch": 3.5626598465473145, "grad_norm": 0.1663225487056752, "learning_rate": 3.7027747694043645e-05, "loss": 0.9557, "step": 1393 }, { "epoch": 3.5652173913043477, "grad_norm": 0.16804185773204355, "learning_rate": 3.690729848149335e-05, "loss": 0.9588, "step": 1394 }, { "epoch": 3.5677749360613813, "grad_norm": 0.16402784688128466, "learning_rate": 3.678698672746581e-05, "loss": 0.964, "step": 1395 }, { "epoch": 3.5703324808184145, "grad_norm": 0.18174268933477528, "learning_rate": 3.6666812815735424e-05, "loss": 0.9433, "step": 1396 }, { "epoch": 3.5728900255754477, "grad_norm": 0.15614453400715234, "learning_rate": 3.6546777129636886e-05, "loss": 0.9252, "step": 1397 }, { "epoch": 3.575447570332481, "grad_norm": 0.16700607138470522, "learning_rate": 3.6426880052064026e-05, "loss": 0.9636, "step": 1398 }, { "epoch": 3.578005115089514, "grad_norm": 0.20568461367374485, "learning_rate": 3.630712196546844e-05, "loss": 0.9649, "step": 1399 }, { "epoch": 3.580562659846547, "grad_norm": 0.14660657078481024, "learning_rate": 3.6187503251858505e-05, "loss": 0.9267, "step": 1400 }, { "epoch": 3.5831202046035804, "grad_norm": 0.16935747703951526, "learning_rate": 3.6068024292797945e-05, "loss": 0.9356, "step": 1401 }, { "epoch": 3.585677749360614, "grad_norm": 0.15782075450424704, "learning_rate": 3.59486854694046e-05, "loss": 0.9548, "step": 1402 }, { "epoch": 3.588235294117647, "grad_norm": 0.17132410907270623, "learning_rate": 3.582948716234948e-05, "loss": 0.9493, "step": 1403 }, { "epoch": 3.5907928388746804, "grad_norm": 0.16858095077712948, "learning_rate": 3.571042975185524e-05, "loss": 0.9552, "step": 1404 }, { "epoch": 3.5933503836317136, "grad_norm": 0.1634251285228488, "learning_rate": 3.559151361769517e-05, "loss": 0.9466, "step": 1405 }, { "epoch": 3.5959079283887467, "grad_norm": 0.1729430282795056, "learning_rate": 3.547273913919182e-05, "loss": 0.95, "step": 1406 }, { "epoch": 3.59846547314578, "grad_norm": 0.1821907434145911, "learning_rate": 3.535410669521605e-05, "loss": 0.9588, "step": 1407 }, { "epoch": 3.601023017902813, "grad_norm": 0.15781654283531932, "learning_rate": 3.5235616664185465e-05, "loss": 0.9591, "step": 1408 }, { "epoch": 3.6035805626598467, "grad_norm": 0.1677674098580371, "learning_rate": 3.5117269424063466e-05, "loss": 0.9372, "step": 1409 }, { "epoch": 3.60613810741688, "grad_norm": 0.1668467714604029, "learning_rate": 3.4999065352358055e-05, "loss": 0.9128, "step": 1410 }, { "epoch": 3.608695652173913, "grad_norm": 0.16023804099695482, "learning_rate": 3.488100482612046e-05, "loss": 0.9533, "step": 1411 }, { "epoch": 3.6112531969309463, "grad_norm": 0.17448057130149636, "learning_rate": 3.476308822194404e-05, "loss": 0.9696, "step": 1412 }, { "epoch": 3.6138107416879794, "grad_norm": 0.17176757036978785, "learning_rate": 3.4645315915963085e-05, "loss": 0.9295, "step": 1413 }, { "epoch": 3.6163682864450126, "grad_norm": 0.16582442582314796, "learning_rate": 3.452768828385156e-05, "loss": 0.9478, "step": 1414 }, { "epoch": 3.618925831202046, "grad_norm": 0.16508960150611576, "learning_rate": 3.4410205700822e-05, "loss": 0.9267, "step": 1415 }, { "epoch": 3.6214833759590794, "grad_norm": 0.15842544276922507, "learning_rate": 3.42928685416242e-05, "loss": 0.9487, "step": 1416 }, { "epoch": 3.6240409207161126, "grad_norm": 0.16737847990453103, "learning_rate": 3.417567718054413e-05, "loss": 0.9257, "step": 1417 }, { "epoch": 3.626598465473146, "grad_norm": 0.16179442819088455, "learning_rate": 3.405863199140271e-05, "loss": 0.9594, "step": 1418 }, { "epoch": 3.629156010230179, "grad_norm": 0.17740705653386357, "learning_rate": 3.3941733347554434e-05, "loss": 0.954, "step": 1419 }, { "epoch": 3.631713554987212, "grad_norm": 0.1745105989485467, "learning_rate": 3.3824981621886545e-05, "loss": 0.9536, "step": 1420 }, { "epoch": 3.634271099744246, "grad_norm": 0.1927262004385616, "learning_rate": 3.370837718681754e-05, "loss": 0.9685, "step": 1421 }, { "epoch": 3.6368286445012785, "grad_norm": 0.15752590578867717, "learning_rate": 3.3591920414296094e-05, "loss": 0.9248, "step": 1422 }, { "epoch": 3.639386189258312, "grad_norm": 0.21240595387549532, "learning_rate": 3.347561167579986e-05, "loss": 0.9521, "step": 1423 }, { "epoch": 3.6419437340153453, "grad_norm": 0.17508530317965004, "learning_rate": 3.3359451342334306e-05, "loss": 0.9431, "step": 1424 }, { "epoch": 3.6445012787723785, "grad_norm": 0.21738581132916354, "learning_rate": 3.324343978443148e-05, "loss": 0.9716, "step": 1425 }, { "epoch": 3.6470588235294117, "grad_norm": 0.16746773638107448, "learning_rate": 3.3127577372148874e-05, "loss": 0.9322, "step": 1426 }, { "epoch": 3.649616368286445, "grad_norm": 0.2122059201301744, "learning_rate": 3.301186447506827e-05, "loss": 0.9422, "step": 1427 }, { "epoch": 3.6521739130434785, "grad_norm": 0.15741451467355758, "learning_rate": 3.289630146229449e-05, "loss": 0.9366, "step": 1428 }, { "epoch": 3.6547314578005117, "grad_norm": 0.19813994445803942, "learning_rate": 3.278088870245423e-05, "loss": 0.9286, "step": 1429 }, { "epoch": 3.657289002557545, "grad_norm": 0.16851843081939155, "learning_rate": 3.2665626563694937e-05, "loss": 0.9572, "step": 1430 }, { "epoch": 3.659846547314578, "grad_norm": 0.20717471275600138, "learning_rate": 3.2550515413683574e-05, "loss": 0.9512, "step": 1431 }, { "epoch": 3.662404092071611, "grad_norm": 0.16245953402744545, "learning_rate": 3.2435555619605504e-05, "loss": 0.9542, "step": 1432 }, { "epoch": 3.6649616368286444, "grad_norm": 0.19641538640030912, "learning_rate": 3.232074754816323e-05, "loss": 0.9306, "step": 1433 }, { "epoch": 3.6675191815856776, "grad_norm": 0.1594631052144963, "learning_rate": 3.220609156557544e-05, "loss": 0.9363, "step": 1434 }, { "epoch": 3.670076726342711, "grad_norm": 0.18455147659478868, "learning_rate": 3.209158803757546e-05, "loss": 0.9321, "step": 1435 }, { "epoch": 3.6726342710997444, "grad_norm": 0.1790498881096886, "learning_rate": 3.1977237329410446e-05, "loss": 0.9608, "step": 1436 }, { "epoch": 3.6751918158567776, "grad_norm": 0.1870454897435218, "learning_rate": 3.186303980584012e-05, "loss": 0.9389, "step": 1437 }, { "epoch": 3.6777493606138107, "grad_norm": 0.20530561810770268, "learning_rate": 3.174899583113548e-05, "loss": 0.9945, "step": 1438 }, { "epoch": 3.680306905370844, "grad_norm": 0.18019213638281067, "learning_rate": 3.1635105769077766e-05, "loss": 0.9307, "step": 1439 }, { "epoch": 3.682864450127877, "grad_norm": 0.20610761052130405, "learning_rate": 3.152136998295727e-05, "loss": 0.9321, "step": 1440 }, { "epoch": 3.6854219948849103, "grad_norm": 0.17985929842660886, "learning_rate": 3.140778883557213e-05, "loss": 0.932, "step": 1441 }, { "epoch": 3.687979539641944, "grad_norm": 0.20013068677532989, "learning_rate": 3.129436268922728e-05, "loss": 0.9324, "step": 1442 }, { "epoch": 3.690537084398977, "grad_norm": 0.17562501633026537, "learning_rate": 3.118109190573313e-05, "loss": 0.9145, "step": 1443 }, { "epoch": 3.6930946291560103, "grad_norm": 0.18827294282018908, "learning_rate": 3.106797684640464e-05, "loss": 0.9402, "step": 1444 }, { "epoch": 3.6956521739130435, "grad_norm": 0.20170283801470837, "learning_rate": 3.0955017872059956e-05, "loss": 0.9591, "step": 1445 }, { "epoch": 3.6982097186700766, "grad_norm": 0.15387225427234089, "learning_rate": 3.084221534301926e-05, "loss": 0.9253, "step": 1446 }, { "epoch": 3.70076726342711, "grad_norm": 0.24032338349831264, "learning_rate": 3.0729569619103876e-05, "loss": 0.9501, "step": 1447 }, { "epoch": 3.703324808184143, "grad_norm": 0.1613801252077293, "learning_rate": 3.061708105963481e-05, "loss": 0.9706, "step": 1448 }, { "epoch": 3.7058823529411766, "grad_norm": 0.18342909310635377, "learning_rate": 3.0504750023431787e-05, "loss": 0.9268, "step": 1449 }, { "epoch": 3.70843989769821, "grad_norm": 0.1656531219879725, "learning_rate": 3.039257686881209e-05, "loss": 0.9385, "step": 1450 }, { "epoch": 3.710997442455243, "grad_norm": 0.1781080191407481, "learning_rate": 3.028056195358936e-05, "loss": 0.9201, "step": 1451 }, { "epoch": 3.713554987212276, "grad_norm": 0.1682926250161123, "learning_rate": 3.016870563507241e-05, "loss": 0.9486, "step": 1452 }, { "epoch": 3.7161125319693094, "grad_norm": 0.17403568022524737, "learning_rate": 3.0057008270064226e-05, "loss": 0.9326, "step": 1453 }, { "epoch": 3.718670076726343, "grad_norm": 0.17412534323602966, "learning_rate": 2.9945470214860815e-05, "loss": 0.9737, "step": 1454 }, { "epoch": 3.7212276214833757, "grad_norm": 0.2012938530305388, "learning_rate": 2.9834091825249908e-05, "loss": 0.9319, "step": 1455 }, { "epoch": 3.7237851662404093, "grad_norm": 0.15521247782508635, "learning_rate": 2.9722873456509985e-05, "loss": 0.9289, "step": 1456 }, { "epoch": 3.7263427109974425, "grad_norm": 0.15552821509875525, "learning_rate": 2.961181546340906e-05, "loss": 0.9707, "step": 1457 }, { "epoch": 3.7289002557544757, "grad_norm": 0.19037886779641314, "learning_rate": 2.95009182002036e-05, "loss": 0.9313, "step": 1458 }, { "epoch": 3.731457800511509, "grad_norm": 0.16615970202045902, "learning_rate": 2.939018202063732e-05, "loss": 0.9647, "step": 1459 }, { "epoch": 3.734015345268542, "grad_norm": 0.17646317393385902, "learning_rate": 2.9279607277940196e-05, "loss": 0.9474, "step": 1460 }, { "epoch": 3.7365728900255757, "grad_norm": 0.16080135640987508, "learning_rate": 2.9169194324827183e-05, "loss": 0.926, "step": 1461 }, { "epoch": 3.7391304347826084, "grad_norm": 0.17325852442311754, "learning_rate": 2.9058943513497158e-05, "loss": 0.9312, "step": 1462 }, { "epoch": 3.741687979539642, "grad_norm": 0.2657172615999172, "learning_rate": 2.8948855195631797e-05, "loss": 0.9417, "step": 1463 }, { "epoch": 3.7442455242966752, "grad_norm": 0.18232454995244132, "learning_rate": 2.883892972239445e-05, "loss": 0.9596, "step": 1464 }, { "epoch": 3.7468030690537084, "grad_norm": 0.15153887237658853, "learning_rate": 2.8729167444429042e-05, "loss": 0.9476, "step": 1465 }, { "epoch": 3.7493606138107416, "grad_norm": 0.17675913819692224, "learning_rate": 2.8619568711858858e-05, "loss": 0.945, "step": 1466 }, { "epoch": 3.7519181585677748, "grad_norm": 0.16206615280321732, "learning_rate": 2.8510133874285633e-05, "loss": 0.9462, "step": 1467 }, { "epoch": 3.7544757033248084, "grad_norm": 0.1553778010776279, "learning_rate": 2.8400863280788207e-05, "loss": 0.9407, "step": 1468 }, { "epoch": 3.7570332480818416, "grad_norm": 0.16829547679009138, "learning_rate": 2.829175727992147e-05, "loss": 0.963, "step": 1469 }, { "epoch": 3.7595907928388748, "grad_norm": 0.13746655170307476, "learning_rate": 2.818281621971541e-05, "loss": 0.9221, "step": 1470 }, { "epoch": 3.762148337595908, "grad_norm": 0.16271667131621254, "learning_rate": 2.8074040447673794e-05, "loss": 0.9535, "step": 1471 }, { "epoch": 3.764705882352941, "grad_norm": 0.16318435465235073, "learning_rate": 2.7965430310773184e-05, "loss": 0.9475, "step": 1472 }, { "epoch": 3.7672634271099743, "grad_norm": 0.16520541373584413, "learning_rate": 2.7856986155461777e-05, "loss": 0.9315, "step": 1473 }, { "epoch": 3.7698209718670075, "grad_norm": 0.32117889861607873, "learning_rate": 2.7748708327658317e-05, "loss": 0.9455, "step": 1474 }, { "epoch": 3.772378516624041, "grad_norm": 0.17314463246020131, "learning_rate": 2.7640597172751004e-05, "loss": 0.9525, "step": 1475 }, { "epoch": 3.7749360613810743, "grad_norm": 0.15225032038812816, "learning_rate": 2.7532653035596336e-05, "loss": 0.9453, "step": 1476 }, { "epoch": 3.7774936061381075, "grad_norm": 0.17247417052786013, "learning_rate": 2.7424876260518146e-05, "loss": 0.9152, "step": 1477 }, { "epoch": 3.7800511508951407, "grad_norm": 0.15503112719134568, "learning_rate": 2.7317267191306318e-05, "loss": 0.9398, "step": 1478 }, { "epoch": 3.782608695652174, "grad_norm": 0.1631084235061464, "learning_rate": 2.7209826171215827e-05, "loss": 0.9246, "step": 1479 }, { "epoch": 3.785166240409207, "grad_norm": 0.15506280568530903, "learning_rate": 2.7102553542965577e-05, "loss": 0.936, "step": 1480 }, { "epoch": 3.78772378516624, "grad_norm": 0.1404687271754989, "learning_rate": 2.6995449648737343e-05, "loss": 0.9359, "step": 1481 }, { "epoch": 3.790281329923274, "grad_norm": 0.1557007128341937, "learning_rate": 2.6888514830174678e-05, "loss": 0.954, "step": 1482 }, { "epoch": 3.792838874680307, "grad_norm": 0.16612555940333462, "learning_rate": 2.6781749428381752e-05, "loss": 1.0034, "step": 1483 }, { "epoch": 3.79539641943734, "grad_norm": 0.1733496961568388, "learning_rate": 2.6675153783922457e-05, "loss": 0.9518, "step": 1484 }, { "epoch": 3.7979539641943734, "grad_norm": 0.15940418283478483, "learning_rate": 2.6568728236819023e-05, "loss": 0.9817, "step": 1485 }, { "epoch": 3.8005115089514065, "grad_norm": 0.19079011728203774, "learning_rate": 2.6462473126551187e-05, "loss": 0.9735, "step": 1486 }, { "epoch": 3.80306905370844, "grad_norm": 0.16130729906636684, "learning_rate": 2.635638879205504e-05, "loss": 0.9579, "step": 1487 }, { "epoch": 3.805626598465473, "grad_norm": 0.1745866503183891, "learning_rate": 2.625047557172189e-05, "loss": 0.9402, "step": 1488 }, { "epoch": 3.8081841432225065, "grad_norm": 0.18057372768582713, "learning_rate": 2.6144733803397212e-05, "loss": 0.9474, "step": 1489 }, { "epoch": 3.8107416879795397, "grad_norm": 0.1560777993171654, "learning_rate": 2.6039163824379588e-05, "loss": 0.9506, "step": 1490 }, { "epoch": 3.813299232736573, "grad_norm": 0.1674616567029557, "learning_rate": 2.5933765971419647e-05, "loss": 0.9488, "step": 1491 }, { "epoch": 3.815856777493606, "grad_norm": 0.15672982172497663, "learning_rate": 2.582854058071892e-05, "loss": 0.9458, "step": 1492 }, { "epoch": 3.8184143222506393, "grad_norm": 0.1558200464104945, "learning_rate": 2.5723487987928817e-05, "loss": 0.9518, "step": 1493 }, { "epoch": 3.820971867007673, "grad_norm": 0.14208299213871128, "learning_rate": 2.5618608528149614e-05, "loss": 0.93, "step": 1494 }, { "epoch": 3.8235294117647056, "grad_norm": 0.16087610572734629, "learning_rate": 2.5513902535929288e-05, "loss": 0.9763, "step": 1495 }, { "epoch": 3.8260869565217392, "grad_norm": 0.1493299114392072, "learning_rate": 2.5409370345262385e-05, "loss": 0.9471, "step": 1496 }, { "epoch": 3.8286445012787724, "grad_norm": 0.15214002644065255, "learning_rate": 2.5305012289589223e-05, "loss": 0.9588, "step": 1497 }, { "epoch": 3.8312020460358056, "grad_norm": 0.15727057443971326, "learning_rate": 2.5200828701794543e-05, "loss": 0.9294, "step": 1498 }, { "epoch": 3.833759590792839, "grad_norm": 0.14966978310373255, "learning_rate": 2.5096819914206592e-05, "loss": 0.9372, "step": 1499 }, { "epoch": 3.836317135549872, "grad_norm": 0.160200304381001, "learning_rate": 2.4992986258596023e-05, "loss": 0.9648, "step": 1500 }, { "epoch": 3.8388746803069056, "grad_norm": 0.1364407301299318, "learning_rate": 2.4889328066174932e-05, "loss": 0.9458, "step": 1501 }, { "epoch": 3.8414322250639388, "grad_norm": 0.15554384512550426, "learning_rate": 2.4785845667595565e-05, "loss": 0.9532, "step": 1502 }, { "epoch": 3.843989769820972, "grad_norm": 0.14270917443883158, "learning_rate": 2.4682539392949494e-05, "loss": 0.9194, "step": 1503 }, { "epoch": 3.846547314578005, "grad_norm": 0.15315949958673647, "learning_rate": 2.4579409571766543e-05, "loss": 0.9619, "step": 1504 }, { "epoch": 3.8491048593350383, "grad_norm": 0.14236120859618645, "learning_rate": 2.4476456533013597e-05, "loss": 0.9637, "step": 1505 }, { "epoch": 3.8516624040920715, "grad_norm": 0.14065482492078218, "learning_rate": 2.437368060509365e-05, "loss": 0.9406, "step": 1506 }, { "epoch": 3.8542199488491047, "grad_norm": 0.13361767868605823, "learning_rate": 2.427108211584476e-05, "loss": 0.9595, "step": 1507 }, { "epoch": 3.8567774936061383, "grad_norm": 0.13594955260031957, "learning_rate": 2.4168661392538982e-05, "loss": 0.9421, "step": 1508 }, { "epoch": 3.8593350383631715, "grad_norm": 0.13851801316117543, "learning_rate": 2.4066418761881308e-05, "loss": 0.9687, "step": 1509 }, { "epoch": 3.8618925831202047, "grad_norm": 0.13380711931983305, "learning_rate": 2.396435455000864e-05, "loss": 0.9468, "step": 1510 }, { "epoch": 3.864450127877238, "grad_norm": 0.13649849585417867, "learning_rate": 2.386246908248883e-05, "loss": 0.9228, "step": 1511 }, { "epoch": 3.867007672634271, "grad_norm": 0.13210578639270845, "learning_rate": 2.3760762684319508e-05, "loss": 0.9094, "step": 1512 }, { "epoch": 3.869565217391304, "grad_norm": 0.14259288669579517, "learning_rate": 2.3659235679927016e-05, "loss": 0.9351, "step": 1513 }, { "epoch": 3.8721227621483374, "grad_norm": 0.1388101682540646, "learning_rate": 2.3557888393165627e-05, "loss": 0.9454, "step": 1514 }, { "epoch": 3.874680306905371, "grad_norm": 0.12901592134412895, "learning_rate": 2.345672114731624e-05, "loss": 0.9481, "step": 1515 }, { "epoch": 3.877237851662404, "grad_norm": 0.13894304934030247, "learning_rate": 2.335573426508547e-05, "loss": 0.9583, "step": 1516 }, { "epoch": 3.8797953964194374, "grad_norm": 0.1370325882290817, "learning_rate": 2.325492806860462e-05, "loss": 0.9799, "step": 1517 }, { "epoch": 3.8823529411764706, "grad_norm": 0.13421409804749201, "learning_rate": 2.315430287942862e-05, "loss": 0.9533, "step": 1518 }, { "epoch": 3.8849104859335037, "grad_norm": 0.13298313283238028, "learning_rate": 2.3053859018535026e-05, "loss": 0.9709, "step": 1519 }, { "epoch": 3.887468030690537, "grad_norm": 0.1361450777437208, "learning_rate": 2.295359680632295e-05, "loss": 0.9615, "step": 1520 }, { "epoch": 3.89002557544757, "grad_norm": 0.1486100399377403, "learning_rate": 2.2853516562612173e-05, "loss": 0.9376, "step": 1521 }, { "epoch": 3.8925831202046037, "grad_norm": 0.13690524401965368, "learning_rate": 2.2753618606641928e-05, "loss": 0.9092, "step": 1522 }, { "epoch": 3.895140664961637, "grad_norm": 0.15669583951357616, "learning_rate": 2.2653903257070012e-05, "loss": 0.9443, "step": 1523 }, { "epoch": 3.89769820971867, "grad_norm": 0.12931778250099024, "learning_rate": 2.2554370831971743e-05, "loss": 0.9406, "step": 1524 }, { "epoch": 3.9002557544757033, "grad_norm": 0.17258200785982056, "learning_rate": 2.2455021648838935e-05, "loss": 0.9614, "step": 1525 }, { "epoch": 3.9028132992327365, "grad_norm": 0.1521157336174598, "learning_rate": 2.235585602457891e-05, "loss": 0.9487, "step": 1526 }, { "epoch": 3.90537084398977, "grad_norm": 0.14390268768179504, "learning_rate": 2.225687427551341e-05, "loss": 0.9401, "step": 1527 }, { "epoch": 3.907928388746803, "grad_norm": 0.16337966447000044, "learning_rate": 2.2158076717377765e-05, "loss": 0.9536, "step": 1528 }, { "epoch": 3.9104859335038364, "grad_norm": 0.15324748802477992, "learning_rate": 2.2059463665319623e-05, "loss": 0.9198, "step": 1529 }, { "epoch": 3.9130434782608696, "grad_norm": 0.14907378875032545, "learning_rate": 2.196103543389815e-05, "loss": 0.9481, "step": 1530 }, { "epoch": 3.915601023017903, "grad_norm": 0.14207939797213323, "learning_rate": 2.1862792337083017e-05, "loss": 0.9387, "step": 1531 }, { "epoch": 3.918158567774936, "grad_norm": 0.13959510597089575, "learning_rate": 2.176473468825328e-05, "loss": 0.9536, "step": 1532 }, { "epoch": 3.920716112531969, "grad_norm": 0.14016454333503284, "learning_rate": 2.1666862800196454e-05, "loss": 0.9491, "step": 1533 }, { "epoch": 3.923273657289003, "grad_norm": 0.14885818803453518, "learning_rate": 2.1569176985107535e-05, "loss": 0.9612, "step": 1534 }, { "epoch": 3.9258312020460355, "grad_norm": 0.14403866973582788, "learning_rate": 2.1471677554587958e-05, "loss": 0.9511, "step": 1535 }, { "epoch": 3.928388746803069, "grad_norm": 0.13223516573639468, "learning_rate": 2.1374364819644623e-05, "loss": 0.9373, "step": 1536 }, { "epoch": 3.9309462915601023, "grad_norm": 0.14036184466315108, "learning_rate": 2.1277239090688894e-05, "loss": 0.9353, "step": 1537 }, { "epoch": 3.9335038363171355, "grad_norm": 0.1396968491520172, "learning_rate": 2.1180300677535655e-05, "loss": 0.9531, "step": 1538 }, { "epoch": 3.9360613810741687, "grad_norm": 0.13659743962984422, "learning_rate": 2.108354988940228e-05, "loss": 0.936, "step": 1539 }, { "epoch": 3.938618925831202, "grad_norm": 0.1508626854215839, "learning_rate": 2.0986987034907554e-05, "loss": 0.9452, "step": 1540 }, { "epoch": 3.9411764705882355, "grad_norm": 0.14129695624224084, "learning_rate": 2.089061242207092e-05, "loss": 0.9369, "step": 1541 }, { "epoch": 3.9437340153452687, "grad_norm": 0.1428765331179949, "learning_rate": 2.0794426358311294e-05, "loss": 0.9142, "step": 1542 }, { "epoch": 3.946291560102302, "grad_norm": 0.1330347524331098, "learning_rate": 2.069842915044614e-05, "loss": 0.9381, "step": 1543 }, { "epoch": 3.948849104859335, "grad_norm": 0.14069953111767788, "learning_rate": 2.0602621104690517e-05, "loss": 0.921, "step": 1544 }, { "epoch": 3.9514066496163682, "grad_norm": 0.1456949051715094, "learning_rate": 2.050700252665615e-05, "loss": 0.9549, "step": 1545 }, { "epoch": 3.9539641943734014, "grad_norm": 0.13746866783044756, "learning_rate": 2.041157372135028e-05, "loss": 0.9287, "step": 1546 }, { "epoch": 3.9565217391304346, "grad_norm": 0.15606889468360874, "learning_rate": 2.0316334993174856e-05, "loss": 0.9555, "step": 1547 }, { "epoch": 3.959079283887468, "grad_norm": 0.14118323164397703, "learning_rate": 2.0221286645925558e-05, "loss": 0.9343, "step": 1548 }, { "epoch": 3.9616368286445014, "grad_norm": 0.1363380304979579, "learning_rate": 2.012642898279074e-05, "loss": 0.9961, "step": 1549 }, { "epoch": 3.9641943734015346, "grad_norm": 0.14317404024733354, "learning_rate": 2.003176230635049e-05, "loss": 0.9647, "step": 1550 }, { "epoch": 3.9667519181585678, "grad_norm": 0.14674699824614082, "learning_rate": 1.9937286918575713e-05, "loss": 0.9541, "step": 1551 }, { "epoch": 3.969309462915601, "grad_norm": 0.1392728526341487, "learning_rate": 1.984300312082711e-05, "loss": 0.9549, "step": 1552 }, { "epoch": 3.971867007672634, "grad_norm": 0.1388687318173855, "learning_rate": 1.9748911213854267e-05, "loss": 0.9538, "step": 1553 }, { "epoch": 3.9744245524296673, "grad_norm": 0.13901730161036177, "learning_rate": 1.9655011497794616e-05, "loss": 0.9426, "step": 1554 }, { "epoch": 3.976982097186701, "grad_norm": 0.13747089636524243, "learning_rate": 1.9561304272172644e-05, "loss": 0.9639, "step": 1555 }, { "epoch": 3.979539641943734, "grad_norm": 0.1395863657318075, "learning_rate": 1.946778983589873e-05, "loss": 0.9733, "step": 1556 }, { "epoch": 3.9820971867007673, "grad_norm": 0.1388892460599247, "learning_rate": 1.9374468487268254e-05, "loss": 0.944, "step": 1557 }, { "epoch": 3.9846547314578005, "grad_norm": 0.1542426182338673, "learning_rate": 1.9281340523960806e-05, "loss": 0.9575, "step": 1558 }, { "epoch": 3.9872122762148337, "grad_norm": 0.14702194394411322, "learning_rate": 1.9188406243039015e-05, "loss": 0.939, "step": 1559 }, { "epoch": 3.9897698209718673, "grad_norm": 0.15088719580788107, "learning_rate": 1.9095665940947717e-05, "loss": 0.9523, "step": 1560 }, { "epoch": 3.9923273657289, "grad_norm": 0.13979637370531914, "learning_rate": 1.9003119913512992e-05, "loss": 0.9518, "step": 1561 }, { "epoch": 3.9948849104859336, "grad_norm": 0.13293457854923818, "learning_rate": 1.891076845594122e-05, "loss": 0.966, "step": 1562 }, { "epoch": 3.997442455242967, "grad_norm": 0.1330659091048459, "learning_rate": 1.881861186281813e-05, "loss": 0.9425, "step": 1563 }, { "epoch": 4.0, "grad_norm": 0.15532958865697588, "learning_rate": 1.872665042810784e-05, "loss": 0.9491, "step": 1564 }, { "epoch": 4.002557544757034, "grad_norm": 0.172134213325208, "learning_rate": 1.863488444515203e-05, "loss": 0.9131, "step": 1565 }, { "epoch": 4.005115089514066, "grad_norm": 0.15705142364202992, "learning_rate": 1.854331420666882e-05, "loss": 0.9254, "step": 1566 }, { "epoch": 4.0076726342711, "grad_norm": 0.16319791463669756, "learning_rate": 1.845194000475199e-05, "loss": 0.9005, "step": 1567 }, { "epoch": 4.010230179028133, "grad_norm": 0.16550445546270565, "learning_rate": 1.836076213087e-05, "loss": 0.9177, "step": 1568 }, { "epoch": 4.012787723785166, "grad_norm": 0.17000604940332, "learning_rate": 1.826978087586502e-05, "loss": 0.9288, "step": 1569 }, { "epoch": 4.015345268542199, "grad_norm": 0.17439370178321326, "learning_rate": 1.8178996529952088e-05, "loss": 0.9302, "step": 1570 }, { "epoch": 4.017902813299233, "grad_norm": 0.16621808084873166, "learning_rate": 1.808840938271807e-05, "loss": 0.9277, "step": 1571 }, { "epoch": 4.020460358056266, "grad_norm": 0.1502855048809297, "learning_rate": 1.799801972312092e-05, "loss": 0.9146, "step": 1572 }, { "epoch": 4.023017902813299, "grad_norm": 0.15792591947199125, "learning_rate": 1.7907827839488474e-05, "loss": 0.9175, "step": 1573 }, { "epoch": 4.025575447570333, "grad_norm": 0.1563775392864349, "learning_rate": 1.7817834019517805e-05, "loss": 0.9128, "step": 1574 }, { "epoch": 4.028132992327365, "grad_norm": 0.14597718440990778, "learning_rate": 1.7728038550274193e-05, "loss": 0.9185, "step": 1575 }, { "epoch": 4.030690537084399, "grad_norm": 0.1569564550463153, "learning_rate": 1.7638441718190192e-05, "loss": 0.9296, "step": 1576 }, { "epoch": 4.033248081841432, "grad_norm": 0.15089755959303894, "learning_rate": 1.7549043809064697e-05, "loss": 0.9011, "step": 1577 }, { "epoch": 4.035805626598465, "grad_norm": 0.14320940233490406, "learning_rate": 1.74598451080622e-05, "loss": 0.9301, "step": 1578 }, { "epoch": 4.038363171355499, "grad_norm": 0.1640364740345872, "learning_rate": 1.737084589971157e-05, "loss": 0.9294, "step": 1579 }, { "epoch": 4.040920716112532, "grad_norm": 0.15372462860199906, "learning_rate": 1.728204646790544e-05, "loss": 0.9464, "step": 1580 }, { "epoch": 4.043478260869565, "grad_norm": 0.14792763942080298, "learning_rate": 1.7193447095899206e-05, "loss": 0.9224, "step": 1581 }, { "epoch": 4.046035805626598, "grad_norm": 0.13951058738523123, "learning_rate": 1.710504806631005e-05, "loss": 0.9087, "step": 1582 }, { "epoch": 4.048593350383632, "grad_norm": 0.13260882878617228, "learning_rate": 1.701684966111615e-05, "loss": 0.9036, "step": 1583 }, { "epoch": 4.051150895140665, "grad_norm": 0.14125256658288957, "learning_rate": 1.6928852161655616e-05, "loss": 0.92, "step": 1584 }, { "epoch": 4.053708439897698, "grad_norm": 0.13237438231494236, "learning_rate": 1.684105584862584e-05, "loss": 0.9156, "step": 1585 }, { "epoch": 4.056265984654732, "grad_norm": 0.1359119819403516, "learning_rate": 1.6753461002082395e-05, "loss": 0.9554, "step": 1586 }, { "epoch": 4.0588235294117645, "grad_norm": 0.136943228077222, "learning_rate": 1.6666067901438178e-05, "loss": 0.8844, "step": 1587 }, { "epoch": 4.061381074168798, "grad_norm": 0.14746043096646916, "learning_rate": 1.657887682546264e-05, "loss": 0.9091, "step": 1588 }, { "epoch": 4.063938618925831, "grad_norm": 0.13289891251117492, "learning_rate": 1.649188805228076e-05, "loss": 0.9462, "step": 1589 }, { "epoch": 4.0664961636828645, "grad_norm": 0.14117852752538673, "learning_rate": 1.6405101859372123e-05, "loss": 0.9153, "step": 1590 }, { "epoch": 4.069053708439898, "grad_norm": 0.12613455462183037, "learning_rate": 1.631851852357026e-05, "loss": 0.9519, "step": 1591 }, { "epoch": 4.071611253196931, "grad_norm": 0.1396860703236042, "learning_rate": 1.6232138321061544e-05, "loss": 0.9412, "step": 1592 }, { "epoch": 4.0741687979539645, "grad_norm": 0.1360638603818121, "learning_rate": 1.6145961527384395e-05, "loss": 0.9517, "step": 1593 }, { "epoch": 4.076726342710997, "grad_norm": 0.1324923155606263, "learning_rate": 1.6059988417428396e-05, "loss": 0.9513, "step": 1594 }, { "epoch": 4.079283887468031, "grad_norm": 0.14265745538296148, "learning_rate": 1.5974219265433406e-05, "loss": 0.9154, "step": 1595 }, { "epoch": 4.081841432225064, "grad_norm": 0.14492559140570338, "learning_rate": 1.58886543449887e-05, "loss": 0.9394, "step": 1596 }, { "epoch": 4.084398976982097, "grad_norm": 0.12579546842676975, "learning_rate": 1.5803293929032078e-05, "loss": 0.9281, "step": 1597 }, { "epoch": 4.086956521739131, "grad_norm": 0.14549537683931857, "learning_rate": 1.5718138289849055e-05, "loss": 0.8957, "step": 1598 }, { "epoch": 4.089514066496164, "grad_norm": 0.14813650458162753, "learning_rate": 1.563318769907187e-05, "loss": 0.9004, "step": 1599 }, { "epoch": 4.092071611253197, "grad_norm": 0.12523568970989923, "learning_rate": 1.554844242767872e-05, "loss": 0.9311, "step": 1600 }, { "epoch": 4.09462915601023, "grad_norm": 0.13296174952051867, "learning_rate": 1.546390274599289e-05, "loss": 0.9256, "step": 1601 }, { "epoch": 4.0971867007672635, "grad_norm": 0.12809367590620266, "learning_rate": 1.5379568923681833e-05, "loss": 0.9136, "step": 1602 }, { "epoch": 4.099744245524296, "grad_norm": 0.13109260024902633, "learning_rate": 1.5295441229756364e-05, "loss": 0.9007, "step": 1603 }, { "epoch": 4.10230179028133, "grad_norm": 0.12407094954940708, "learning_rate": 1.521151993256977e-05, "loss": 0.9406, "step": 1604 }, { "epoch": 4.1048593350383635, "grad_norm": 0.1298161922376652, "learning_rate": 1.5127805299817025e-05, "loss": 0.9264, "step": 1605 }, { "epoch": 4.107416879795396, "grad_norm": 0.1481163518427539, "learning_rate": 1.5044297598533777e-05, "loss": 0.9285, "step": 1606 }, { "epoch": 4.10997442455243, "grad_norm": 0.12078740228639545, "learning_rate": 1.496099709509565e-05, "loss": 0.9078, "step": 1607 }, { "epoch": 4.112531969309463, "grad_norm": 0.13027908099413282, "learning_rate": 1.4877904055217376e-05, "loss": 0.9149, "step": 1608 }, { "epoch": 4.115089514066496, "grad_norm": 0.1468019204651356, "learning_rate": 1.4795018743951857e-05, "loss": 0.9304, "step": 1609 }, { "epoch": 4.117647058823529, "grad_norm": 0.1349316946630024, "learning_rate": 1.4712341425689406e-05, "loss": 0.926, "step": 1610 }, { "epoch": 4.120204603580563, "grad_norm": 0.1228754724620514, "learning_rate": 1.4629872364156854e-05, "loss": 0.9185, "step": 1611 }, { "epoch": 4.122762148337596, "grad_norm": 0.14313419206388078, "learning_rate": 1.4547611822416748e-05, "loss": 0.9126, "step": 1612 }, { "epoch": 4.125319693094629, "grad_norm": 0.14531581013669995, "learning_rate": 1.446556006286648e-05, "loss": 0.9372, "step": 1613 }, { "epoch": 4.127877237851663, "grad_norm": 0.12636103579388067, "learning_rate": 1.4383717347237425e-05, "loss": 0.9255, "step": 1614 }, { "epoch": 4.130434782608695, "grad_norm": 0.13484501378576969, "learning_rate": 1.4302083936594247e-05, "loss": 0.9267, "step": 1615 }, { "epoch": 4.132992327365729, "grad_norm": 0.1306495047012211, "learning_rate": 1.4220660091333875e-05, "loss": 0.9237, "step": 1616 }, { "epoch": 4.135549872122763, "grad_norm": 0.12979097348457122, "learning_rate": 1.4139446071184737e-05, "loss": 0.9197, "step": 1617 }, { "epoch": 4.138107416879795, "grad_norm": 0.13739201337062779, "learning_rate": 1.405844213520604e-05, "loss": 0.9197, "step": 1618 }, { "epoch": 4.140664961636829, "grad_norm": 0.1294644982423319, "learning_rate": 1.3977648541786804e-05, "loss": 0.896, "step": 1619 }, { "epoch": 4.143222506393862, "grad_norm": 0.12588348274914363, "learning_rate": 1.3897065548645104e-05, "loss": 0.9453, "step": 1620 }, { "epoch": 4.145780051150895, "grad_norm": 0.15398362387202247, "learning_rate": 1.381669341282721e-05, "loss": 0.9317, "step": 1621 }, { "epoch": 4.148337595907928, "grad_norm": 0.13197721364304257, "learning_rate": 1.3736532390706878e-05, "loss": 0.9279, "step": 1622 }, { "epoch": 4.150895140664962, "grad_norm": 0.12322044737512756, "learning_rate": 1.3656582737984318e-05, "loss": 0.9439, "step": 1623 }, { "epoch": 4.153452685421995, "grad_norm": 0.12440470950789576, "learning_rate": 1.3576844709685583e-05, "loss": 0.9088, "step": 1624 }, { "epoch": 4.156010230179028, "grad_norm": 0.12465116010990127, "learning_rate": 1.3497318560161704e-05, "loss": 0.9211, "step": 1625 }, { "epoch": 4.158567774936062, "grad_norm": 0.13358086347052778, "learning_rate": 1.3418004543087792e-05, "loss": 0.9312, "step": 1626 }, { "epoch": 4.161125319693094, "grad_norm": 0.1224560124714394, "learning_rate": 1.3338902911462336e-05, "loss": 0.9253, "step": 1627 }, { "epoch": 4.163682864450128, "grad_norm": 0.12240140914681184, "learning_rate": 1.3260013917606319e-05, "loss": 0.9383, "step": 1628 }, { "epoch": 4.166240409207161, "grad_norm": 0.12945740752464988, "learning_rate": 1.318133781316247e-05, "loss": 0.9416, "step": 1629 }, { "epoch": 4.168797953964194, "grad_norm": 0.13087100044291045, "learning_rate": 1.3102874849094414e-05, "loss": 0.9316, "step": 1630 }, { "epoch": 4.171355498721228, "grad_norm": 0.14189296661844325, "learning_rate": 1.3024625275685891e-05, "loss": 0.9465, "step": 1631 }, { "epoch": 4.173913043478261, "grad_norm": 0.1297951759919457, "learning_rate": 1.2946589342540023e-05, "loss": 0.9275, "step": 1632 }, { "epoch": 4.176470588235294, "grad_norm": 0.11911786087772278, "learning_rate": 1.2868767298578395e-05, "loss": 0.9225, "step": 1633 }, { "epoch": 4.179028132992327, "grad_norm": 0.12225398214034955, "learning_rate": 1.2791159392040275e-05, "loss": 0.9196, "step": 1634 }, { "epoch": 4.181585677749361, "grad_norm": 0.1310216078232746, "learning_rate": 1.2713765870481995e-05, "loss": 0.9353, "step": 1635 }, { "epoch": 4.1841432225063935, "grad_norm": 0.12742055135018454, "learning_rate": 1.2636586980775945e-05, "loss": 0.9666, "step": 1636 }, { "epoch": 4.186700767263427, "grad_norm": 0.12384487664186089, "learning_rate": 1.2559622969109886e-05, "loss": 0.9209, "step": 1637 }, { "epoch": 4.189258312020461, "grad_norm": 0.1340544434519516, "learning_rate": 1.2482874080986176e-05, "loss": 0.9377, "step": 1638 }, { "epoch": 4.1918158567774935, "grad_norm": 0.13746772119236356, "learning_rate": 1.2406340561220947e-05, "loss": 0.9207, "step": 1639 }, { "epoch": 4.194373401534527, "grad_norm": 0.1280603990954687, "learning_rate": 1.2330022653943358e-05, "loss": 0.914, "step": 1640 }, { "epoch": 4.19693094629156, "grad_norm": 0.12374468420399631, "learning_rate": 1.2253920602594759e-05, "loss": 0.8923, "step": 1641 }, { "epoch": 4.1994884910485935, "grad_norm": 0.12384342114389504, "learning_rate": 1.2178034649928034e-05, "loss": 0.9396, "step": 1642 }, { "epoch": 4.202046035805626, "grad_norm": 0.1230247461338335, "learning_rate": 1.2102365038006672e-05, "loss": 0.8981, "step": 1643 }, { "epoch": 4.20460358056266, "grad_norm": 0.12441020446608941, "learning_rate": 1.2026912008204117e-05, "loss": 0.9395, "step": 1644 }, { "epoch": 4.207161125319693, "grad_norm": 0.1207928603043833, "learning_rate": 1.195167580120292e-05, "loss": 0.9257, "step": 1645 }, { "epoch": 4.209718670076726, "grad_norm": 0.12168214916803673, "learning_rate": 1.1876656656994032e-05, "loss": 0.907, "step": 1646 }, { "epoch": 4.21227621483376, "grad_norm": 0.12409121363381591, "learning_rate": 1.180185481487599e-05, "loss": 0.9082, "step": 1647 }, { "epoch": 4.2148337595907925, "grad_norm": 0.12218546237016087, "learning_rate": 1.1727270513454161e-05, "loss": 0.9207, "step": 1648 }, { "epoch": 4.217391304347826, "grad_norm": 0.1373741099688316, "learning_rate": 1.1652903990640075e-05, "loss": 0.9041, "step": 1649 }, { "epoch": 4.21994884910486, "grad_norm": 0.126043833861761, "learning_rate": 1.1578755483650465e-05, "loss": 0.9071, "step": 1650 }, { "epoch": 4.2225063938618925, "grad_norm": 0.12907468546494064, "learning_rate": 1.150482522900668e-05, "loss": 0.9267, "step": 1651 }, { "epoch": 4.225063938618926, "grad_norm": 0.11696490881508001, "learning_rate": 1.1431113462533942e-05, "loss": 0.9188, "step": 1652 }, { "epoch": 4.227621483375959, "grad_norm": 0.1219772936698238, "learning_rate": 1.1357620419360438e-05, "loss": 0.93, "step": 1653 }, { "epoch": 4.2301790281329925, "grad_norm": 0.12317189729882781, "learning_rate": 1.128434633391673e-05, "loss": 0.9248, "step": 1654 }, { "epoch": 4.232736572890025, "grad_norm": 0.12135967777000363, "learning_rate": 1.121129143993489e-05, "loss": 0.9482, "step": 1655 }, { "epoch": 4.235294117647059, "grad_norm": 0.12569146595438008, "learning_rate": 1.1138455970447857e-05, "loss": 0.9237, "step": 1656 }, { "epoch": 4.2378516624040925, "grad_norm": 0.12009749843054457, "learning_rate": 1.1065840157788599e-05, "loss": 0.9117, "step": 1657 }, { "epoch": 4.240409207161125, "grad_norm": 0.12262206120182582, "learning_rate": 1.099344423358943e-05, "loss": 0.944, "step": 1658 }, { "epoch": 4.242966751918159, "grad_norm": 0.12739673009436395, "learning_rate": 1.0921268428781277e-05, "loss": 0.928, "step": 1659 }, { "epoch": 4.245524296675192, "grad_norm": 0.12049563257356445, "learning_rate": 1.084931297359293e-05, "loss": 0.9307, "step": 1660 }, { "epoch": 4.248081841432225, "grad_norm": 0.1268732696430339, "learning_rate": 1.0777578097550206e-05, "loss": 0.938, "step": 1661 }, { "epoch": 4.250639386189258, "grad_norm": 0.1302689278877736, "learning_rate": 1.0706064029475436e-05, "loss": 0.9339, "step": 1662 }, { "epoch": 4.253196930946292, "grad_norm": 0.1207622169109695, "learning_rate": 1.0634770997486546e-05, "loss": 0.9153, "step": 1663 }, { "epoch": 4.255754475703325, "grad_norm": 0.11706181174774555, "learning_rate": 1.0563699228996405e-05, "loss": 0.9129, "step": 1664 }, { "epoch": 4.258312020460358, "grad_norm": 0.11849875702011481, "learning_rate": 1.0492848950712067e-05, "loss": 0.9183, "step": 1665 }, { "epoch": 4.260869565217392, "grad_norm": 0.12286048694545573, "learning_rate": 1.0422220388634145e-05, "loss": 0.9194, "step": 1666 }, { "epoch": 4.263427109974424, "grad_norm": 0.12106155524848677, "learning_rate": 1.03518137680559e-05, "loss": 0.93, "step": 1667 }, { "epoch": 4.265984654731458, "grad_norm": 0.11931612070623257, "learning_rate": 1.0281629313562704e-05, "loss": 0.8812, "step": 1668 }, { "epoch": 4.268542199488491, "grad_norm": 0.12412002218869622, "learning_rate": 1.0211667249031278e-05, "loss": 0.9211, "step": 1669 }, { "epoch": 4.271099744245524, "grad_norm": 0.11050129272365039, "learning_rate": 1.0141927797628913e-05, "loss": 0.9346, "step": 1670 }, { "epoch": 4.273657289002558, "grad_norm": 0.11696142916514798, "learning_rate": 1.0072411181812805e-05, "loss": 0.9103, "step": 1671 }, { "epoch": 4.276214833759591, "grad_norm": 0.12523114611535077, "learning_rate": 1.0003117623329373e-05, "loss": 0.9188, "step": 1672 }, { "epoch": 4.278772378516624, "grad_norm": 0.1211246626009557, "learning_rate": 9.934047343213468e-06, "loss": 0.8779, "step": 1673 }, { "epoch": 4.281329923273657, "grad_norm": 0.11896385138151676, "learning_rate": 9.865200561787779e-06, "loss": 0.916, "step": 1674 }, { "epoch": 4.283887468030691, "grad_norm": 0.12907351319734606, "learning_rate": 9.796577498662017e-06, "loss": 0.9316, "step": 1675 }, { "epoch": 4.286445012787723, "grad_norm": 0.1175024733129538, "learning_rate": 9.728178372732323e-06, "loss": 0.9175, "step": 1676 }, { "epoch": 4.289002557544757, "grad_norm": 0.11765409328640529, "learning_rate": 9.660003402180495e-06, "loss": 0.9322, "step": 1677 }, { "epoch": 4.291560102301791, "grad_norm": 0.11606048414482627, "learning_rate": 9.592052804473248e-06, "loss": 0.9338, "step": 1678 }, { "epoch": 4.294117647058823, "grad_norm": 0.12217997194310143, "learning_rate": 9.524326796361704e-06, "loss": 0.9198, "step": 1679 }, { "epoch": 4.296675191815857, "grad_norm": 0.13681552209998984, "learning_rate": 9.456825593880502e-06, "loss": 0.9381, "step": 1680 }, { "epoch": 4.29923273657289, "grad_norm": 0.11707040245774833, "learning_rate": 9.389549412347204e-06, "loss": 0.9114, "step": 1681 }, { "epoch": 4.301790281329923, "grad_norm": 0.11739134713610266, "learning_rate": 9.322498466361574e-06, "loss": 0.9564, "step": 1682 }, { "epoch": 4.304347826086957, "grad_norm": 0.11490889884017837, "learning_rate": 9.25567296980499e-06, "loss": 0.9372, "step": 1683 }, { "epoch": 4.30690537084399, "grad_norm": 0.13548343430667473, "learning_rate": 9.18907313583958e-06, "loss": 0.9571, "step": 1684 }, { "epoch": 4.309462915601023, "grad_norm": 0.1169879093609689, "learning_rate": 9.122699176907699e-06, "loss": 0.91, "step": 1685 }, { "epoch": 4.312020460358056, "grad_norm": 0.12181883918771313, "learning_rate": 9.056551304731216e-06, "loss": 0.9403, "step": 1686 }, { "epoch": 4.31457800511509, "grad_norm": 0.11516301601447926, "learning_rate": 8.990629730310787e-06, "loss": 0.9045, "step": 1687 }, { "epoch": 4.3171355498721224, "grad_norm": 0.1130886469711019, "learning_rate": 8.924934663925228e-06, "loss": 0.9005, "step": 1688 }, { "epoch": 4.319693094629156, "grad_norm": 0.12056683149234801, "learning_rate": 8.859466315130833e-06, "loss": 0.905, "step": 1689 }, { "epoch": 4.322250639386189, "grad_norm": 0.12131053610936289, "learning_rate": 8.794224892760694e-06, "loss": 0.964, "step": 1690 }, { "epoch": 4.324808184143222, "grad_norm": 0.11072666373506544, "learning_rate": 8.729210604924075e-06, "loss": 0.9168, "step": 1691 }, { "epoch": 4.327365728900256, "grad_norm": 0.11419375138008123, "learning_rate": 8.66442365900566e-06, "loss": 0.9155, "step": 1692 }, { "epoch": 4.329923273657289, "grad_norm": 0.11067325544749756, "learning_rate": 8.599864261665032e-06, "loss": 0.929, "step": 1693 }, { "epoch": 4.332480818414322, "grad_norm": 0.13119769270640452, "learning_rate": 8.535532618835894e-06, "loss": 0.9196, "step": 1694 }, { "epoch": 4.335038363171355, "grad_norm": 0.12122259309350006, "learning_rate": 8.471428935725394e-06, "loss": 0.9097, "step": 1695 }, { "epoch": 4.337595907928389, "grad_norm": 0.1186567073290791, "learning_rate": 8.407553416813621e-06, "loss": 0.9486, "step": 1696 }, { "epoch": 4.340153452685422, "grad_norm": 0.13863787273855152, "learning_rate": 8.343906265852806e-06, "loss": 0.9194, "step": 1697 }, { "epoch": 4.342710997442455, "grad_norm": 0.11736813648606277, "learning_rate": 8.280487685866707e-06, "loss": 0.8964, "step": 1698 }, { "epoch": 4.345268542199489, "grad_norm": 0.11874382513666652, "learning_rate": 8.217297879150065e-06, "loss": 0.9305, "step": 1699 }, { "epoch": 4.3478260869565215, "grad_norm": 0.12096917615982158, "learning_rate": 8.154337047267763e-06, "loss": 0.926, "step": 1700 }, { "epoch": 4.350383631713555, "grad_norm": 0.12459874607610563, "learning_rate": 8.091605391054354e-06, "loss": 0.8922, "step": 1701 }, { "epoch": 4.352941176470588, "grad_norm": 0.12221739613538536, "learning_rate": 8.02910311061333e-06, "loss": 0.9401, "step": 1702 }, { "epoch": 4.3554987212276215, "grad_norm": 0.12254645629749011, "learning_rate": 7.966830405316561e-06, "loss": 0.9547, "step": 1703 }, { "epoch": 4.358056265984655, "grad_norm": 0.12001133797508247, "learning_rate": 7.90478747380357e-06, "loss": 0.9103, "step": 1704 }, { "epoch": 4.360613810741688, "grad_norm": 0.12199519070925526, "learning_rate": 7.842974513980946e-06, "loss": 0.9271, "step": 1705 }, { "epoch": 4.3631713554987215, "grad_norm": 0.11295241635294967, "learning_rate": 7.781391723021711e-06, "loss": 0.9363, "step": 1706 }, { "epoch": 4.365728900255754, "grad_norm": 0.12686526411244078, "learning_rate": 7.720039297364681e-06, "loss": 0.9274, "step": 1707 }, { "epoch": 4.368286445012788, "grad_norm": 0.1333081116381865, "learning_rate": 7.658917432713839e-06, "loss": 0.9172, "step": 1708 }, { "epoch": 4.370843989769821, "grad_norm": 0.12577470275328256, "learning_rate": 7.598026324037762e-06, "loss": 0.939, "step": 1709 }, { "epoch": 4.373401534526854, "grad_norm": 0.12345544691397578, "learning_rate": 7.537366165568909e-06, "loss": 0.9288, "step": 1710 }, { "epoch": 4.375959079283888, "grad_norm": 0.11948532376497799, "learning_rate": 7.476937150803025e-06, "loss": 0.9497, "step": 1711 }, { "epoch": 4.378516624040921, "grad_norm": 0.12876903997603817, "learning_rate": 7.416739472498613e-06, "loss": 0.9479, "step": 1712 }, { "epoch": 4.381074168797954, "grad_norm": 0.11529385831506739, "learning_rate": 7.356773322676205e-06, "loss": 0.9158, "step": 1713 }, { "epoch": 4.383631713554987, "grad_norm": 0.11078825541988917, "learning_rate": 7.2970388926178045e-06, "loss": 0.937, "step": 1714 }, { "epoch": 4.3861892583120206, "grad_norm": 0.11173435690628004, "learning_rate": 7.237536372866247e-06, "loss": 0.9327, "step": 1715 }, { "epoch": 4.388746803069053, "grad_norm": 0.1223612229123131, "learning_rate": 7.178265953224701e-06, "loss": 0.9227, "step": 1716 }, { "epoch": 4.391304347826087, "grad_norm": 0.12507251852936713, "learning_rate": 7.119227822755843e-06, "loss": 0.9571, "step": 1717 }, { "epoch": 4.3938618925831205, "grad_norm": 0.11397092222799754, "learning_rate": 7.060422169781467e-06, "loss": 0.9041, "step": 1718 }, { "epoch": 4.396419437340153, "grad_norm": 0.10753667090584995, "learning_rate": 7.001849181881808e-06, "loss": 0.9166, "step": 1719 }, { "epoch": 4.398976982097187, "grad_norm": 0.12054572854799732, "learning_rate": 6.943509045894905e-06, "loss": 0.9341, "step": 1720 }, { "epoch": 4.40153452685422, "grad_norm": 0.11185867845020742, "learning_rate": 6.885401947916048e-06, "loss": 0.9514, "step": 1721 }, { "epoch": 4.404092071611253, "grad_norm": 0.11085335077105966, "learning_rate": 6.827528073297185e-06, "loss": 0.9382, "step": 1722 }, { "epoch": 4.406649616368286, "grad_norm": 0.11479224410155166, "learning_rate": 6.769887606646306e-06, "loss": 0.9414, "step": 1723 }, { "epoch": 4.40920716112532, "grad_norm": 0.11417555802279347, "learning_rate": 6.712480731826878e-06, "loss": 0.912, "step": 1724 }, { "epoch": 4.411764705882353, "grad_norm": 0.11413292812828428, "learning_rate": 6.6553076319572394e-06, "loss": 0.9268, "step": 1725 }, { "epoch": 4.414322250639386, "grad_norm": 0.10996848327532169, "learning_rate": 6.59836848941005e-06, "loss": 0.9253, "step": 1726 }, { "epoch": 4.41687979539642, "grad_norm": 0.12150368369219573, "learning_rate": 6.541663485811667e-06, "loss": 0.915, "step": 1727 }, { "epoch": 4.419437340153452, "grad_norm": 0.11980533715997778, "learning_rate": 6.485192802041553e-06, "loss": 0.9156, "step": 1728 }, { "epoch": 4.421994884910486, "grad_norm": 0.11392894414591724, "learning_rate": 6.428956618231788e-06, "loss": 0.9197, "step": 1729 }, { "epoch": 4.42455242966752, "grad_norm": 0.11760332661995491, "learning_rate": 6.3729551137664055e-06, "loss": 0.9545, "step": 1730 }, { "epoch": 4.427109974424552, "grad_norm": 0.10904085632244291, "learning_rate": 6.3171884672808524e-06, "loss": 0.9103, "step": 1731 }, { "epoch": 4.429667519181586, "grad_norm": 0.10863502669554059, "learning_rate": 6.26165685666142e-06, "loss": 0.9016, "step": 1732 }, { "epoch": 4.432225063938619, "grad_norm": 0.11509438949225145, "learning_rate": 6.206360459044671e-06, "loss": 0.931, "step": 1733 }, { "epoch": 4.434782608695652, "grad_norm": 0.11748690634314717, "learning_rate": 6.15129945081689e-06, "loss": 0.9151, "step": 1734 }, { "epoch": 4.437340153452685, "grad_norm": 0.11639698873895774, "learning_rate": 6.096474007613476e-06, "loss": 0.9365, "step": 1735 }, { "epoch": 4.439897698209719, "grad_norm": 0.11159987657775047, "learning_rate": 6.0418843043184636e-06, "loss": 0.9552, "step": 1736 }, { "epoch": 4.442455242966752, "grad_norm": 0.10952923402441073, "learning_rate": 5.987530515063889e-06, "loss": 0.9194, "step": 1737 }, { "epoch": 4.445012787723785, "grad_norm": 0.11072771958857656, "learning_rate": 5.933412813229256e-06, "loss": 0.9189, "step": 1738 }, { "epoch": 4.447570332480819, "grad_norm": 0.11775592911375234, "learning_rate": 5.879531371440994e-06, "loss": 0.9388, "step": 1739 }, { "epoch": 4.450127877237851, "grad_norm": 0.11460729784468633, "learning_rate": 5.825886361571922e-06, "loss": 0.8945, "step": 1740 }, { "epoch": 4.452685421994885, "grad_norm": 0.11581761610879335, "learning_rate": 5.772477954740652e-06, "loss": 0.9126, "step": 1741 }, { "epoch": 4.455242966751918, "grad_norm": 0.11118413455302595, "learning_rate": 5.719306321311075e-06, "loss": 0.9565, "step": 1742 }, { "epoch": 4.457800511508951, "grad_norm": 0.10749836975161339, "learning_rate": 5.666371630891858e-06, "loss": 0.9127, "step": 1743 }, { "epoch": 4.460358056265985, "grad_norm": 0.10944652966346073, "learning_rate": 5.613674052335798e-06, "loss": 0.9184, "step": 1744 }, { "epoch": 4.462915601023018, "grad_norm": 0.11540805854208941, "learning_rate": 5.561213753739356e-06, "loss": 0.9281, "step": 1745 }, { "epoch": 4.465473145780051, "grad_norm": 0.11318814770450754, "learning_rate": 5.5089909024421685e-06, "loss": 0.9327, "step": 1746 }, { "epoch": 4.468030690537084, "grad_norm": 0.11689654113549015, "learning_rate": 5.4570056650263784e-06, "loss": 0.9196, "step": 1747 }, { "epoch": 4.470588235294118, "grad_norm": 0.11410697533075874, "learning_rate": 5.405258207316228e-06, "loss": 0.9248, "step": 1748 }, { "epoch": 4.4731457800511505, "grad_norm": 0.11032997359153394, "learning_rate": 5.3537486943774674e-06, "loss": 0.9278, "step": 1749 }, { "epoch": 4.475703324808184, "grad_norm": 0.11362254544830364, "learning_rate": 5.302477290516832e-06, "loss": 0.9508, "step": 1750 }, { "epoch": 4.478260869565218, "grad_norm": 0.114903272001298, "learning_rate": 5.251444159281551e-06, "loss": 0.9177, "step": 1751 }, { "epoch": 4.4808184143222505, "grad_norm": 0.11311594662750116, "learning_rate": 5.200649463458769e-06, "loss": 0.9315, "step": 1752 }, { "epoch": 4.483375959079284, "grad_norm": 0.1080019715192445, "learning_rate": 5.150093365075117e-06, "loss": 0.9423, "step": 1753 }, { "epoch": 4.485933503836317, "grad_norm": 0.11099521632078349, "learning_rate": 5.0997760253961036e-06, "loss": 0.9432, "step": 1754 }, { "epoch": 4.4884910485933505, "grad_norm": 0.1115281668793938, "learning_rate": 5.049697604925605e-06, "loss": 0.9201, "step": 1755 }, { "epoch": 4.491048593350383, "grad_norm": 0.11559474894332394, "learning_rate": 4.999858263405468e-06, "loss": 0.9335, "step": 1756 }, { "epoch": 4.493606138107417, "grad_norm": 0.10752469888696953, "learning_rate": 4.9502581598148425e-06, "loss": 0.9326, "step": 1757 }, { "epoch": 4.4961636828644505, "grad_norm": 0.11823364858584975, "learning_rate": 4.900897452369782e-06, "loss": 0.9085, "step": 1758 }, { "epoch": 4.498721227621483, "grad_norm": 0.12367303838985884, "learning_rate": 4.851776298522692e-06, "loss": 0.8962, "step": 1759 }, { "epoch": 4.501278772378517, "grad_norm": 0.11649199224229981, "learning_rate": 4.802894854961882e-06, "loss": 0.945, "step": 1760 }, { "epoch": 4.5038363171355495, "grad_norm": 0.10951836253938066, "learning_rate": 4.754253277610969e-06, "loss": 0.9362, "step": 1761 }, { "epoch": 4.506393861892583, "grad_norm": 0.11824940633958814, "learning_rate": 4.705851721628465e-06, "loss": 0.9489, "step": 1762 }, { "epoch": 4.508951406649617, "grad_norm": 0.11623129349141179, "learning_rate": 4.6576903414072576e-06, "loss": 0.9345, "step": 1763 }, { "epoch": 4.5115089514066495, "grad_norm": 0.10609179613886349, "learning_rate": 4.6097692905741194e-06, "loss": 0.912, "step": 1764 }, { "epoch": 4.514066496163683, "grad_norm": 0.1110236313063869, "learning_rate": 4.562088721989178e-06, "loss": 0.9263, "step": 1765 }, { "epoch": 4.516624040920716, "grad_norm": 0.10545968825146992, "learning_rate": 4.514648787745506e-06, "loss": 0.9132, "step": 1766 }, { "epoch": 4.5191815856777495, "grad_norm": 0.11497860724139544, "learning_rate": 4.467449639168564e-06, "loss": 0.9435, "step": 1767 }, { "epoch": 4.521739130434782, "grad_norm": 0.11514110122345275, "learning_rate": 4.420491426815758e-06, "loss": 0.9405, "step": 1768 }, { "epoch": 4.524296675191816, "grad_norm": 0.1123546579246865, "learning_rate": 4.373774300475928e-06, "loss": 0.9013, "step": 1769 }, { "epoch": 4.526854219948849, "grad_norm": 0.10434900776877028, "learning_rate": 4.327298409168928e-06, "loss": 0.9234, "step": 1770 }, { "epoch": 4.529411764705882, "grad_norm": 0.10753377323226707, "learning_rate": 4.281063901145102e-06, "loss": 0.9191, "step": 1771 }, { "epoch": 4.531969309462916, "grad_norm": 0.10990039699899636, "learning_rate": 4.235070923884772e-06, "loss": 0.9218, "step": 1772 }, { "epoch": 4.534526854219949, "grad_norm": 0.10914742733757979, "learning_rate": 4.18931962409789e-06, "loss": 0.9109, "step": 1773 }, { "epoch": 4.537084398976982, "grad_norm": 0.10959258250347798, "learning_rate": 4.143810147723448e-06, "loss": 0.9152, "step": 1774 }, { "epoch": 4.539641943734015, "grad_norm": 0.11106116826490182, "learning_rate": 4.098542639929086e-06, "loss": 0.9046, "step": 1775 }, { "epoch": 4.542199488491049, "grad_norm": 0.10748546841476085, "learning_rate": 4.0535172451105785e-06, "loss": 0.9128, "step": 1776 }, { "epoch": 4.544757033248082, "grad_norm": 0.11225561412585737, "learning_rate": 4.008734106891439e-06, "loss": 0.929, "step": 1777 }, { "epoch": 4.547314578005115, "grad_norm": 0.10831404168834766, "learning_rate": 3.964193368122384e-06, "loss": 0.9397, "step": 1778 }, { "epoch": 4.549872122762149, "grad_norm": 0.11033594472176086, "learning_rate": 3.919895170880938e-06, "loss": 0.9252, "step": 1779 }, { "epoch": 4.552429667519181, "grad_norm": 0.10441833953450541, "learning_rate": 3.875839656470959e-06, "loss": 0.9182, "step": 1780 }, { "epoch": 4.554987212276215, "grad_norm": 0.11080119595164395, "learning_rate": 3.832026965422184e-06, "loss": 0.949, "step": 1781 }, { "epoch": 4.557544757033249, "grad_norm": 0.11022335632664775, "learning_rate": 3.788457237489773e-06, "loss": 0.9238, "step": 1782 }, { "epoch": 4.560102301790281, "grad_norm": 0.11308201432747443, "learning_rate": 3.7451306116538867e-06, "loss": 0.9711, "step": 1783 }, { "epoch": 4.562659846547315, "grad_norm": 0.1028220418076954, "learning_rate": 3.7020472261192253e-06, "loss": 0.9005, "step": 1784 }, { "epoch": 4.565217391304348, "grad_norm": 0.10528950924867539, "learning_rate": 3.6592072183146043e-06, "loss": 0.9014, "step": 1785 }, { "epoch": 4.567774936061381, "grad_norm": 0.10885389205625104, "learning_rate": 3.616610724892473e-06, "loss": 0.9105, "step": 1786 }, { "epoch": 4.570332480818414, "grad_norm": 0.10574673017545647, "learning_rate": 3.5742578817285777e-06, "loss": 0.9193, "step": 1787 }, { "epoch": 4.572890025575448, "grad_norm": 0.1117883112559058, "learning_rate": 3.532148823921375e-06, "loss": 0.91, "step": 1788 }, { "epoch": 4.57544757033248, "grad_norm": 0.1096961353796292, "learning_rate": 3.490283685791722e-06, "loss": 0.9594, "step": 1789 }, { "epoch": 4.578005115089514, "grad_norm": 0.11161221492802147, "learning_rate": 3.4486626008824575e-06, "loss": 0.9327, "step": 1790 }, { "epoch": 4.580562659846548, "grad_norm": 0.10744759992585007, "learning_rate": 3.4072857019578787e-06, "loss": 0.9219, "step": 1791 }, { "epoch": 4.58312020460358, "grad_norm": 0.10620450789029019, "learning_rate": 3.3661531210033684e-06, "loss": 0.9256, "step": 1792 }, { "epoch": 4.585677749360614, "grad_norm": 0.11017512262461532, "learning_rate": 3.3252649892250123e-06, "loss": 0.9188, "step": 1793 }, { "epoch": 4.588235294117647, "grad_norm": 0.10649203584062787, "learning_rate": 3.2846214370491114e-06, "loss": 0.9286, "step": 1794 }, { "epoch": 4.59079283887468, "grad_norm": 0.10775649571843056, "learning_rate": 3.2442225941218175e-06, "loss": 0.91, "step": 1795 }, { "epoch": 4.593350383631714, "grad_norm": 0.10474409566182012, "learning_rate": 3.20406858930868e-06, "loss": 0.9187, "step": 1796 }, { "epoch": 4.595907928388747, "grad_norm": 0.10901379780591824, "learning_rate": 3.164159550694299e-06, "loss": 0.9268, "step": 1797 }, { "epoch": 4.59846547314578, "grad_norm": 0.10466246579829651, "learning_rate": 3.12449560558183e-06, "loss": 0.9045, "step": 1798 }, { "epoch": 4.601023017902813, "grad_norm": 0.10734422633494305, "learning_rate": 3.085076880492608e-06, "loss": 0.9131, "step": 1799 }, { "epoch": 4.603580562659847, "grad_norm": 0.1102245685075459, "learning_rate": 3.045903501165821e-06, "loss": 0.9456, "step": 1800 }, { "epoch": 4.6061381074168795, "grad_norm": 0.10268613459994491, "learning_rate": 3.0069755925579945e-06, "loss": 0.9068, "step": 1801 }, { "epoch": 4.608695652173913, "grad_norm": 0.1041191008417218, "learning_rate": 2.9682932788426622e-06, "loss": 0.8961, "step": 1802 }, { "epoch": 4.611253196930946, "grad_norm": 0.10864214050559602, "learning_rate": 2.9298566834099307e-06, "loss": 0.9196, "step": 1803 }, { "epoch": 4.6138107416879794, "grad_norm": 0.10289987799334356, "learning_rate": 2.891665928866152e-06, "loss": 0.8891, "step": 1804 }, { "epoch": 4.616368286445013, "grad_norm": 0.10627932552480018, "learning_rate": 2.853721137033425e-06, "loss": 0.9309, "step": 1805 }, { "epoch": 4.618925831202046, "grad_norm": 0.10976448315029629, "learning_rate": 2.816022428949303e-06, "loss": 0.8956, "step": 1806 }, { "epoch": 4.621483375959079, "grad_norm": 0.10383428088111558, "learning_rate": 2.7785699248663946e-06, "loss": 0.9245, "step": 1807 }, { "epoch": 4.624040920716112, "grad_norm": 0.10746935820829795, "learning_rate": 2.741363744251917e-06, "loss": 0.9641, "step": 1808 }, { "epoch": 4.626598465473146, "grad_norm": 0.1077084422715649, "learning_rate": 2.70440400578738e-06, "loss": 0.936, "step": 1809 }, { "epoch": 4.629156010230179, "grad_norm": 0.10619050887196295, "learning_rate": 2.6676908273681745e-06, "loss": 0.9236, "step": 1810 }, { "epoch": 4.631713554987212, "grad_norm": 0.09868786010783248, "learning_rate": 2.63122432610321e-06, "loss": 0.9235, "step": 1811 }, { "epoch": 4.634271099744246, "grad_norm": 0.10946907000550939, "learning_rate": 2.5950046183145315e-06, "loss": 0.9477, "step": 1812 }, { "epoch": 4.6368286445012785, "grad_norm": 0.10911271296863308, "learning_rate": 2.559031819536966e-06, "loss": 0.8923, "step": 1813 }, { "epoch": 4.639386189258312, "grad_norm": 0.1057852003057491, "learning_rate": 2.523306044517737e-06, "loss": 0.9575, "step": 1814 }, { "epoch": 4.641943734015345, "grad_norm": 0.10597129201414962, "learning_rate": 2.4878274072161147e-06, "loss": 0.9478, "step": 1815 }, { "epoch": 4.6445012787723785, "grad_norm": 0.10530345780753828, "learning_rate": 2.4525960208029843e-06, "loss": 0.9468, "step": 1816 }, { "epoch": 4.647058823529412, "grad_norm": 0.11128520568838593, "learning_rate": 2.417611997660636e-06, "loss": 0.9441, "step": 1817 }, { "epoch": 4.649616368286445, "grad_norm": 0.10763480468498407, "learning_rate": 2.3828754493822315e-06, "loss": 0.9342, "step": 1818 }, { "epoch": 4.6521739130434785, "grad_norm": 0.10157629367738297, "learning_rate": 2.348386486771572e-06, "loss": 0.9121, "step": 1819 }, { "epoch": 4.654731457800511, "grad_norm": 0.10471609831813257, "learning_rate": 2.314145219842683e-06, "loss": 0.8991, "step": 1820 }, { "epoch": 4.657289002557545, "grad_norm": 0.10785688490272143, "learning_rate": 2.2801517578194997e-06, "loss": 0.9023, "step": 1821 }, { "epoch": 4.659846547314578, "grad_norm": 0.10437430915631776, "learning_rate": 2.246406209135481e-06, "loss": 0.9526, "step": 1822 }, { "epoch": 4.662404092071611, "grad_norm": 0.09976754454013415, "learning_rate": 2.212908681433286e-06, "loss": 0.9032, "step": 1823 }, { "epoch": 4.664961636828645, "grad_norm": 0.10687421431181417, "learning_rate": 2.179659281564446e-06, "loss": 0.9164, "step": 1824 }, { "epoch": 4.667519181585678, "grad_norm": 0.10095706529924005, "learning_rate": 2.146658115589002e-06, "loss": 0.9191, "step": 1825 }, { "epoch": 4.670076726342711, "grad_norm": 0.10132269971777201, "learning_rate": 2.113905288775149e-06, "loss": 0.9155, "step": 1826 }, { "epoch": 4.672634271099744, "grad_norm": 0.10307251320208077, "learning_rate": 2.0814009055989403e-06, "loss": 0.9165, "step": 1827 }, { "epoch": 4.675191815856778, "grad_norm": 0.10286096825987698, "learning_rate": 2.0491450697439362e-06, "loss": 0.9101, "step": 1828 }, { "epoch": 4.677749360613811, "grad_norm": 0.11262366728295894, "learning_rate": 2.017137884100855e-06, "loss": 0.914, "step": 1829 }, { "epoch": 4.680306905370844, "grad_norm": 0.11116962011162274, "learning_rate": 1.9853794507672885e-06, "loss": 0.9376, "step": 1830 }, { "epoch": 4.6828644501278776, "grad_norm": 0.1040833044448223, "learning_rate": 1.9538698710473404e-06, "loss": 0.9236, "step": 1831 }, { "epoch": 4.68542199488491, "grad_norm": 0.10541970140434043, "learning_rate": 1.9226092454512945e-06, "loss": 0.9449, "step": 1832 }, { "epoch": 4.687979539641944, "grad_norm": 0.10066677117893352, "learning_rate": 1.8915976736953157e-06, "loss": 0.9138, "step": 1833 }, { "epoch": 4.690537084398977, "grad_norm": 0.10836258727940289, "learning_rate": 1.8608352547011722e-06, "loss": 0.9687, "step": 1834 }, { "epoch": 4.69309462915601, "grad_norm": 0.11074221672096896, "learning_rate": 1.8303220865958194e-06, "loss": 0.9331, "step": 1835 }, { "epoch": 4.695652173913043, "grad_norm": 0.10768331106543749, "learning_rate": 1.8000582667111777e-06, "loss": 0.945, "step": 1836 }, { "epoch": 4.698209718670077, "grad_norm": 0.11098771435258944, "learning_rate": 1.7700438915837858e-06, "loss": 0.9284, "step": 1837 }, { "epoch": 4.70076726342711, "grad_norm": 0.10799063090442731, "learning_rate": 1.7402790569544813e-06, "loss": 0.9, "step": 1838 }, { "epoch": 4.703324808184143, "grad_norm": 0.1063256441527157, "learning_rate": 1.7107638577681073e-06, "loss": 0.8962, "step": 1839 }, { "epoch": 4.705882352941177, "grad_norm": 0.1040346093959911, "learning_rate": 1.681498388173246e-06, "loss": 0.9516, "step": 1840 }, { "epoch": 4.708439897698209, "grad_norm": 0.10335093559260676, "learning_rate": 1.652482741521837e-06, "loss": 0.9131, "step": 1841 }, { "epoch": 4.710997442455243, "grad_norm": 0.10497266871186595, "learning_rate": 1.6237170103689547e-06, "loss": 0.9119, "step": 1842 }, { "epoch": 4.713554987212277, "grad_norm": 0.09874397507531227, "learning_rate": 1.5952012864724898e-06, "loss": 0.9141, "step": 1843 }, { "epoch": 4.716112531969309, "grad_norm": 0.10588059236614217, "learning_rate": 1.5669356607928188e-06, "loss": 0.9331, "step": 1844 }, { "epoch": 4.718670076726343, "grad_norm": 0.10070088788493103, "learning_rate": 1.5389202234925837e-06, "loss": 0.929, "step": 1845 }, { "epoch": 4.721227621483376, "grad_norm": 0.10575607673396381, "learning_rate": 1.5111550639363447e-06, "loss": 0.9195, "step": 1846 }, { "epoch": 4.723785166240409, "grad_norm": 0.1052143724728097, "learning_rate": 1.483640270690332e-06, "loss": 0.9236, "step": 1847 }, { "epoch": 4.726342710997442, "grad_norm": 0.10525748489261051, "learning_rate": 1.4563759315221515e-06, "loss": 0.9515, "step": 1848 }, { "epoch": 4.728900255754476, "grad_norm": 0.10259868287875906, "learning_rate": 1.4293621334004581e-06, "loss": 0.9522, "step": 1849 }, { "epoch": 4.731457800511509, "grad_norm": 0.10136041128342929, "learning_rate": 1.4025989624947856e-06, "loss": 0.9207, "step": 1850 }, { "epoch": 4.734015345268542, "grad_norm": 0.09781638687367422, "learning_rate": 1.3760865041751736e-06, "loss": 0.9226, "step": 1851 }, { "epoch": 4.736572890025576, "grad_norm": 0.10175570288516775, "learning_rate": 1.3498248430119465e-06, "loss": 0.9141, "step": 1852 }, { "epoch": 4.739130434782608, "grad_norm": 0.10920419786681472, "learning_rate": 1.3238140627754014e-06, "loss": 0.9544, "step": 1853 }, { "epoch": 4.741687979539642, "grad_norm": 0.10426566657693524, "learning_rate": 1.2980542464355962e-06, "loss": 0.9492, "step": 1854 }, { "epoch": 4.744245524296675, "grad_norm": 0.10161986714655702, "learning_rate": 1.272545476162037e-06, "loss": 0.9253, "step": 1855 }, { "epoch": 4.746803069053708, "grad_norm": 0.10568474804520346, "learning_rate": 1.2472878333234407e-06, "loss": 0.895, "step": 1856 }, { "epoch": 4.749360613810742, "grad_norm": 0.10079844884131213, "learning_rate": 1.2222813984874749e-06, "loss": 0.9146, "step": 1857 }, { "epoch": 4.751918158567775, "grad_norm": 0.09772653572503225, "learning_rate": 1.197526251420502e-06, "loss": 0.9434, "step": 1858 }, { "epoch": 4.754475703324808, "grad_norm": 0.10521061309223152, "learning_rate": 1.1730224710872862e-06, "loss": 0.917, "step": 1859 }, { "epoch": 4.757033248081841, "grad_norm": 0.10102811382690155, "learning_rate": 1.148770135650814e-06, "loss": 0.9402, "step": 1860 }, { "epoch": 4.759590792838875, "grad_norm": 0.10184925109076563, "learning_rate": 1.1247693224719768e-06, "loss": 0.9341, "step": 1861 }, { "epoch": 4.762148337595908, "grad_norm": 0.10416605640976224, "learning_rate": 1.1010201081093653e-06, "loss": 0.9258, "step": 1862 }, { "epoch": 4.764705882352941, "grad_norm": 0.10242702305319981, "learning_rate": 1.0775225683190027e-06, "loss": 0.9401, "step": 1863 }, { "epoch": 4.767263427109975, "grad_norm": 0.1054355472195325, "learning_rate": 1.0542767780541242e-06, "loss": 0.9452, "step": 1864 }, { "epoch": 4.7698209718670075, "grad_norm": 0.09850748287302327, "learning_rate": 1.0312828114649175e-06, "loss": 0.9147, "step": 1865 }, { "epoch": 4.772378516624041, "grad_norm": 0.10426914175715249, "learning_rate": 1.008540741898285e-06, "loss": 0.9364, "step": 1866 }, { "epoch": 4.774936061381074, "grad_norm": 0.10421190980413071, "learning_rate": 9.860506418976556e-07, "loss": 0.9155, "step": 1867 }, { "epoch": 4.7774936061381075, "grad_norm": 0.09974968560728949, "learning_rate": 9.638125832026658e-07, "loss": 0.9164, "step": 1868 }, { "epoch": 4.78005115089514, "grad_norm": 0.10323506252287525, "learning_rate": 9.418266367490347e-07, "loss": 0.9294, "step": 1869 }, { "epoch": 4.782608695652174, "grad_norm": 0.10057988567304277, "learning_rate": 9.200928726682456e-07, "loss": 0.9198, "step": 1870 }, { "epoch": 4.7851662404092075, "grad_norm": 0.10109533674227822, "learning_rate": 8.986113602873758e-07, "loss": 0.9696, "step": 1871 }, { "epoch": 4.78772378516624, "grad_norm": 0.10248654252247842, "learning_rate": 8.773821681288752e-07, "loss": 0.9059, "step": 1872 }, { "epoch": 4.790281329923274, "grad_norm": 0.10623698814695832, "learning_rate": 8.564053639103087e-07, "loss": 0.9104, "step": 1873 }, { "epoch": 4.792838874680307, "grad_norm": 0.10184589368398628, "learning_rate": 8.356810145441874e-07, "loss": 0.8999, "step": 1874 }, { "epoch": 4.79539641943734, "grad_norm": 0.09973933906653507, "learning_rate": 8.152091861377198e-07, "loss": 0.9281, "step": 1875 }, { "epoch": 4.797953964194374, "grad_norm": 0.0965602895068992, "learning_rate": 7.949899439926345e-07, "loss": 0.8972, "step": 1876 }, { "epoch": 4.8005115089514065, "grad_norm": 0.09817984542309073, "learning_rate": 7.750233526049222e-07, "loss": 0.9374, "step": 1877 }, { "epoch": 4.80306905370844, "grad_norm": 0.10767556941660049, "learning_rate": 7.553094756646761e-07, "loss": 0.922, "step": 1878 }, { "epoch": 4.805626598465473, "grad_norm": 0.09968854723854502, "learning_rate": 7.358483760558877e-07, "loss": 0.9092, "step": 1879 }, { "epoch": 4.8081841432225065, "grad_norm": 0.10013368895859236, "learning_rate": 7.166401158561886e-07, "loss": 0.9053, "step": 1880 }, { "epoch": 4.810741687979539, "grad_norm": 0.10050188953527933, "learning_rate": 6.976847563367539e-07, "loss": 0.9342, "step": 1881 }, { "epoch": 4.813299232736573, "grad_norm": 0.10572001540704473, "learning_rate": 6.789823579619992e-07, "loss": 0.9055, "step": 1882 }, { "epoch": 4.8158567774936065, "grad_norm": 0.0958884248641111, "learning_rate": 6.605329803894389e-07, "loss": 0.8971, "step": 1883 }, { "epoch": 4.818414322250639, "grad_norm": 0.10042711105691594, "learning_rate": 6.423366824695265e-07, "loss": 0.9176, "step": 1884 }, { "epoch": 4.820971867007673, "grad_norm": 0.10511225981510647, "learning_rate": 6.243935222454145e-07, "loss": 0.9176, "step": 1885 }, { "epoch": 4.823529411764706, "grad_norm": 0.09696941259664335, "learning_rate": 6.067035569527768e-07, "loss": 0.9336, "step": 1886 }, { "epoch": 4.826086956521739, "grad_norm": 0.09743670957958701, "learning_rate": 5.89266843019658e-07, "loss": 0.9335, "step": 1887 }, { "epoch": 4.828644501278772, "grad_norm": 0.10334868098940422, "learning_rate": 5.720834360662597e-07, "loss": 0.9302, "step": 1888 }, { "epoch": 4.831202046035806, "grad_norm": 0.10567530011947436, "learning_rate": 5.551533909047812e-07, "loss": 0.9173, "step": 1889 }, { "epoch": 4.833759590792839, "grad_norm": 0.10109569243664909, "learning_rate": 5.384767615392328e-07, "loss": 0.8973, "step": 1890 }, { "epoch": 4.836317135549872, "grad_norm": 0.10107099176370515, "learning_rate": 5.220536011652933e-07, "loss": 0.9327, "step": 1891 }, { "epoch": 4.838874680306906, "grad_norm": 0.09592817542499839, "learning_rate": 5.058839621700973e-07, "loss": 0.8986, "step": 1892 }, { "epoch": 4.841432225063938, "grad_norm": 0.10402134439975212, "learning_rate": 4.899678961320842e-07, "loss": 0.8783, "step": 1893 }, { "epoch": 4.843989769820972, "grad_norm": 0.09879349396951775, "learning_rate": 4.743054538208558e-07, "loss": 0.9265, "step": 1894 }, { "epoch": 4.846547314578006, "grad_norm": 0.10801219003494308, "learning_rate": 4.5889668519698117e-07, "loss": 0.917, "step": 1895 }, { "epoch": 4.849104859335038, "grad_norm": 0.10336628048777474, "learning_rate": 4.437416394118721e-07, "loss": 0.9475, "step": 1896 }, { "epoch": 4.851662404092072, "grad_norm": 0.09915519846574018, "learning_rate": 4.2884036480757896e-07, "loss": 0.9136, "step": 1897 }, { "epoch": 4.854219948849105, "grad_norm": 0.10488853611936978, "learning_rate": 4.1419290891669293e-07, "loss": 0.9276, "step": 1898 }, { "epoch": 4.856777493606138, "grad_norm": 0.10257283710076046, "learning_rate": 3.997993184621418e-07, "loss": 0.9584, "step": 1899 }, { "epoch": 4.859335038363171, "grad_norm": 0.10288770850501508, "learning_rate": 3.856596393570744e-07, "loss": 0.9128, "step": 1900 }, { "epoch": 4.861892583120205, "grad_norm": 0.09729119851077626, "learning_rate": 3.717739167047185e-07, "loss": 0.912, "step": 1901 }, { "epoch": 4.864450127877237, "grad_norm": 0.1024901619430387, "learning_rate": 3.581421947982122e-07, "loss": 0.9166, "step": 1902 }, { "epoch": 4.867007672634271, "grad_norm": 0.10281823220549692, "learning_rate": 3.447645171204528e-07, "loss": 0.9308, "step": 1903 }, { "epoch": 4.869565217391305, "grad_norm": 0.1014220238267167, "learning_rate": 3.316409263440168e-07, "loss": 0.9401, "step": 1904 }, { "epoch": 4.872122762148337, "grad_norm": 0.10082233886495114, "learning_rate": 3.1877146433095584e-07, "loss": 0.9349, "step": 1905 }, { "epoch": 4.874680306905371, "grad_norm": 0.09966232794121334, "learning_rate": 3.0615617213271664e-07, "loss": 0.9218, "step": 1906 }, { "epoch": 4.877237851662404, "grad_norm": 0.09941244859685047, "learning_rate": 2.937950899899633e-07, "loss": 0.9278, "step": 1907 }, { "epoch": 4.879795396419437, "grad_norm": 0.09951897237383148, "learning_rate": 2.816882573324886e-07, "loss": 0.949, "step": 1908 }, { "epoch": 4.882352941176471, "grad_norm": 0.10401741016384587, "learning_rate": 2.6983571277907184e-07, "loss": 0.9563, "step": 1909 }, { "epoch": 4.884910485933504, "grad_norm": 0.09725714975876674, "learning_rate": 2.582374941373456e-07, "loss": 0.9211, "step": 1910 }, { "epoch": 4.887468030690537, "grad_norm": 0.10133318561817573, "learning_rate": 2.468936384036891e-07, "loss": 0.9013, "step": 1911 }, { "epoch": 4.89002557544757, "grad_norm": 0.10119524228199774, "learning_rate": 2.3580418176311293e-07, "loss": 0.9417, "step": 1912 }, { "epoch": 4.892583120204604, "grad_norm": 0.09951712783614965, "learning_rate": 2.2496915958913458e-07, "loss": 0.9253, "step": 1913 }, { "epoch": 4.8951406649616365, "grad_norm": 0.0988058097334845, "learning_rate": 2.143886064436629e-07, "loss": 0.9344, "step": 1914 }, { "epoch": 4.89769820971867, "grad_norm": 0.0988533205503812, "learning_rate": 2.0406255607688274e-07, "loss": 0.9258, "step": 1915 }, { "epoch": 4.900255754475703, "grad_norm": 0.09899535759420186, "learning_rate": 1.9399104142719283e-07, "loss": 0.9484, "step": 1916 }, { "epoch": 4.9028132992327365, "grad_norm": 0.10153569163687459, "learning_rate": 1.8417409462102798e-07, "loss": 0.9073, "step": 1917 }, { "epoch": 4.90537084398977, "grad_norm": 0.09957601677253938, "learning_rate": 1.746117469728148e-07, "loss": 0.8841, "step": 1918 }, { "epoch": 4.907928388746803, "grad_norm": 0.10184723073884586, "learning_rate": 1.6530402898484733e-07, "loss": 0.9525, "step": 1919 }, { "epoch": 4.910485933503836, "grad_norm": 0.09694091907819868, "learning_rate": 1.5625097034719815e-07, "loss": 0.9193, "step": 1920 }, { "epoch": 4.913043478260869, "grad_norm": 0.10383046531826044, "learning_rate": 1.474525999375942e-07, "loss": 0.9339, "step": 1921 }, { "epoch": 4.915601023017903, "grad_norm": 0.09727962611523398, "learning_rate": 1.3890894582138103e-07, "loss": 0.9271, "step": 1922 }, { "epoch": 4.918158567774936, "grad_norm": 0.10045856203495888, "learning_rate": 1.3062003525138089e-07, "loss": 0.9129, "step": 1923 }, { "epoch": 4.920716112531969, "grad_norm": 0.09953247096750498, "learning_rate": 1.225858946678393e-07, "loss": 0.9149, "step": 1924 }, { "epoch": 4.923273657289003, "grad_norm": 0.10381806462155738, "learning_rate": 1.1480654969833638e-07, "loss": 0.9473, "step": 1925 }, { "epoch": 4.9258312020460355, "grad_norm": 0.09951540982333777, "learning_rate": 1.0728202515766228e-07, "loss": 0.9452, "step": 1926 }, { "epoch": 4.928388746803069, "grad_norm": 0.09714908717583805, "learning_rate": 1.0001234504779966e-07, "loss": 0.9478, "step": 1927 }, { "epoch": 4.930946291560103, "grad_norm": 0.10355673013634514, "learning_rate": 9.299753255781696e-08, "loss": 0.9113, "step": 1928 }, { "epoch": 4.9335038363171355, "grad_norm": 0.1010600576834511, "learning_rate": 8.623761006379738e-08, "loss": 0.9322, "step": 1929 }, { "epoch": 4.936061381074169, "grad_norm": 0.09937740112494577, "learning_rate": 7.973259912875897e-08, "loss": 0.9529, "step": 1930 }, { "epoch": 4.938618925831202, "grad_norm": 0.10172138015837517, "learning_rate": 7.348252050261018e-08, "loss": 0.9516, "step": 1931 }, { "epoch": 4.9411764705882355, "grad_norm": 0.10153203845561144, "learning_rate": 6.748739412205218e-08, "loss": 0.9327, "step": 1932 }, { "epoch": 4.943734015345268, "grad_norm": 0.09630467933849142, "learning_rate": 6.174723911053449e-08, "loss": 0.9033, "step": 1933 }, { "epoch": 4.946291560102302, "grad_norm": 0.09792982830145779, "learning_rate": 5.6262073778192705e-08, "loss": 0.9289, "step": 1934 }, { "epoch": 4.948849104859335, "grad_norm": 0.10137971801200332, "learning_rate": 5.1031915621795325e-08, "loss": 0.9127, "step": 1935 }, { "epoch": 4.951406649616368, "grad_norm": 0.09867014858433792, "learning_rate": 4.605678132467262e-08, "loss": 0.9195, "step": 1936 }, { "epoch": 4.953964194373402, "grad_norm": 0.09945447399480298, "learning_rate": 4.133668675666336e-08, "loss": 0.9235, "step": 1937 }, { "epoch": 4.956521739130435, "grad_norm": 0.09740241154451518, "learning_rate": 3.687164697408818e-08, "loss": 0.8983, "step": 1938 }, { "epoch": 4.959079283887468, "grad_norm": 0.10216904139394242, "learning_rate": 3.266167621967853e-08, "loss": 0.9333, "step": 1939 }, { "epoch": 4.961636828644501, "grad_norm": 1.7447830173428402, "learning_rate": 2.8706787922541112e-08, "loss": 0.9677, "step": 1940 }, { "epoch": 4.964194373401535, "grad_norm": 0.10248140850999501, "learning_rate": 2.5006994698095754e-08, "loss": 0.9205, "step": 1941 }, { "epoch": 4.966751918158568, "grad_norm": 0.10291780599089813, "learning_rate": 2.156230834808426e-08, "loss": 0.9314, "step": 1942 }, { "epoch": 4.969309462915601, "grad_norm": 0.09792527077121264, "learning_rate": 1.837273986046384e-08, "loss": 0.9289, "step": 1943 }, { "epoch": 4.971867007672635, "grad_norm": 0.0960164691356107, "learning_rate": 1.5438299409433755e-08, "loss": 0.9013, "step": 1944 }, { "epoch": 4.974424552429667, "grad_norm": 0.09979959822032446, "learning_rate": 1.2758996355373144e-08, "loss": 0.9203, "step": 1945 }, { "epoch": 4.976982097186701, "grad_norm": 0.10827315260460384, "learning_rate": 1.0334839244805495e-08, "loss": 0.9541, "step": 1946 }, { "epoch": 4.979539641943734, "grad_norm": 0.0988359933652592, "learning_rate": 8.165835810389766e-09, "loss": 0.9064, "step": 1947 }, { "epoch": 4.982097186700767, "grad_norm": 0.09820054319763678, "learning_rate": 6.251992970875975e-09, "loss": 0.9214, "step": 1948 }, { "epoch": 4.9846547314578, "grad_norm": 0.10015641951197356, "learning_rate": 4.5933168311140805e-09, "loss": 0.9461, "step": 1949 }, { "epoch": 4.987212276214834, "grad_norm": 0.10040227257081992, "learning_rate": 3.1898126820006924e-09, "loss": 0.9465, "step": 1950 }, { "epoch": 4.989769820971867, "grad_norm": 0.09609050872126598, "learning_rate": 2.041485000479071e-09, "loss": 0.9108, "step": 1951 }, { "epoch": 4.9923273657289, "grad_norm": 0.09913441529294063, "learning_rate": 1.148337449521364e-09, "loss": 0.9356, "step": 1952 }, { "epoch": 4.994884910485934, "grad_norm": 0.09800757849537761, "learning_rate": 5.103728781197248e-10, "loss": 0.9002, "step": 1953 }, { "epoch": 4.997442455242966, "grad_norm": 0.09827002033578132, "learning_rate": 1.275933212774305e-10, "loss": 0.9081, "step": 1954 }, { "epoch": 5.0, "grad_norm": 0.10411131044626397, "learning_rate": 0.0, "loss": 0.9254, "step": 1955 }, { "epoch": 5.0, "step": 1955, "total_flos": 7122204608430080.0, "train_loss": 1.0036099467436066, "train_runtime": 36219.8634, "train_samples_per_second": 13.805, "train_steps_per_second": 0.054 } ], "logging_steps": 1.0, "max_steps": 1955, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7122204608430080.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }