[ { "loss": 0.311, "grad_norm": 7.0026350021362305, "learning_rate": 0.0, "epoch": 0.0008019246190858059, "step": 1 }, { "loss": 0.4209, "grad_norm": 7.763463973999023, "learning_rate": 3.1746031746031746e-06, "epoch": 0.0016038492381716118, "step": 2 }, { "loss": 0.5177, "grad_norm": 7.721052646636963, "learning_rate": 6.349206349206349e-06, "epoch": 0.0024057738572574178, "step": 3 }, { "loss": 0.3841, "grad_norm": 5.636883735656738, "learning_rate": 9.523809523809523e-06, "epoch": 0.0032076984763432237, "step": 4 }, { "loss": 0.3934, "grad_norm": 5.894153118133545, "learning_rate": 1.2698412698412699e-05, "epoch": 0.00400962309542903, "step": 5 }, { "loss": 0.2577, "grad_norm": 3.8658251762390137, "learning_rate": 1.5873015873015872e-05, "epoch": 0.0048115477145148355, "step": 6 }, { "loss": 0.1507, "grad_norm": 3.220764398574829, "learning_rate": 1.9047619047619046e-05, "epoch": 0.0056134723336006415, "step": 7 }, { "loss": 0.1547, "grad_norm": 1.97222101688385, "learning_rate": 2.2222222222222223e-05, "epoch": 0.006415396952686447, "step": 8 }, { "loss": 0.1619, "grad_norm": 2.004807472229004, "learning_rate": 2.5396825396825397e-05, "epoch": 0.007217321571772253, "step": 9 }, { "loss": 0.2521, "grad_norm": 2.6470654010772705, "learning_rate": 2.857142857142857e-05, "epoch": 0.00801924619085806, "step": 10 }, { "loss": 0.3305, "grad_norm": 3.026132106781006, "learning_rate": 3.1746031746031745e-05, "epoch": 0.008821170809943865, "step": 11 }, { "loss": 0.2042, "grad_norm": 2.123467206954956, "learning_rate": 3.492063492063492e-05, "epoch": 0.009623095429029671, "step": 12 }, { "loss": 0.2047, "grad_norm": 1.958135962486267, "learning_rate": 3.809523809523809e-05, "epoch": 0.010425020048115477, "step": 13 }, { "loss": 0.1499, "grad_norm": 1.2876746654510498, "learning_rate": 4.126984126984127e-05, "epoch": 0.011226944667201283, "step": 14 }, { "loss": 0.0589, "grad_norm": 0.7209349870681763, "learning_rate": 4.4444444444444447e-05, "epoch": 0.012028869286287089, "step": 15 }, { "loss": 0.1404, "grad_norm": 1.2799328565597534, "learning_rate": 4.761904761904762e-05, "epoch": 0.012830793905372895, "step": 16 }, { "loss": 0.1594, "grad_norm": 1.2897180318832397, "learning_rate": 5.0793650793650794e-05, "epoch": 0.0136327185244587, "step": 17 }, { "loss": 0.1236, "grad_norm": 0.907631516456604, "learning_rate": 5.396825396825397e-05, "epoch": 0.014434643143544507, "step": 18 }, { "loss": 0.0899, "grad_norm": 0.7336040139198303, "learning_rate": 5.714285714285714e-05, "epoch": 0.015236567762630313, "step": 19 }, { "loss": 0.1434, "grad_norm": 1.4779671430587769, "learning_rate": 6.0317460317460316e-05, "epoch": 0.01603849238171612, "step": 20 }, { "loss": 0.1072, "grad_norm": 0.6834859251976013, "learning_rate": 6.349206349206349e-05, "epoch": 0.016840417000801924, "step": 21 }, { "loss": 0.1077, "grad_norm": 0.9278278946876526, "learning_rate": 6.666666666666667e-05, "epoch": 0.01764234161988773, "step": 22 }, { "loss": 0.1463, "grad_norm": 1.041062593460083, "learning_rate": 6.984126984126984e-05, "epoch": 0.018444266238973536, "step": 23 }, { "loss": 0.1311, "grad_norm": 1.007616639137268, "learning_rate": 7.301587301587302e-05, "epoch": 0.019246190858059342, "step": 24 }, { "loss": 0.1652, "grad_norm": 1.5278170108795166, "learning_rate": 7.619047619047618e-05, "epoch": 0.020048115477145148, "step": 25 }, { "loss": 0.2241, "grad_norm": 1.5930604934692383, "learning_rate": 7.936507936507937e-05, "epoch": 0.020850040096230954, "step": 26 }, { "loss": 0.0831, "grad_norm": 0.7199026942253113, "learning_rate": 8.253968253968255e-05, "epoch": 0.02165196471531676, "step": 27 }, { "loss": 0.1204, "grad_norm": 0.986321210861206, "learning_rate": 8.571428571428571e-05, "epoch": 0.022453889334402566, "step": 28 }, { "loss": 0.1153, "grad_norm": 1.234464168548584, "learning_rate": 8.888888888888889e-05, "epoch": 0.023255813953488372, "step": 29 }, { "loss": 0.081, "grad_norm": 1.0115418434143066, "learning_rate": 9.206349206349206e-05, "epoch": 0.024057738572574178, "step": 30 }, { "loss": 0.103, "grad_norm": 1.4726132154464722, "learning_rate": 9.523809523809524e-05, "epoch": 0.024859663191659984, "step": 31 }, { "loss": 0.0841, "grad_norm": 0.7434117197990417, "learning_rate": 9.841269841269841e-05, "epoch": 0.02566158781074579, "step": 32 }, { "loss": 0.0806, "grad_norm": 0.6968424916267395, "learning_rate": 0.00010158730158730159, "epoch": 0.026463512429831595, "step": 33 }, { "loss": 0.1339, "grad_norm": 1.8305174112319946, "learning_rate": 0.00010476190476190477, "epoch": 0.0272654370489174, "step": 34 }, { "loss": 0.148, "grad_norm": 1.3083935976028442, "learning_rate": 0.00010793650793650794, "epoch": 0.028067361668003207, "step": 35 }, { "loss": 0.0659, "grad_norm": 0.5363959074020386, "learning_rate": 0.00011111111111111112, "epoch": 0.028869286287089013, "step": 36 }, { "loss": 0.0761, "grad_norm": 0.7278910875320435, "learning_rate": 0.00011428571428571428, "epoch": 0.02967121090617482, "step": 37 }, { "loss": 0.0628, "grad_norm": 0.5862115621566772, "learning_rate": 0.00011746031746031746, "epoch": 0.030473135525260625, "step": 38 }, { "loss": 0.0892, "grad_norm": 0.8882272243499756, "learning_rate": 0.00012063492063492063, "epoch": 0.03127506014434643, "step": 39 }, { "loss": 0.0907, "grad_norm": 0.8315787315368652, "learning_rate": 0.0001238095238095238, "epoch": 0.03207698476343224, "step": 40 }, { "loss": 0.048, "grad_norm": 0.6063331365585327, "learning_rate": 0.00012698412698412698, "epoch": 0.03287890938251804, "step": 41 }, { "loss": 0.066, "grad_norm": 0.6467223167419434, "learning_rate": 0.00013015873015873017, "epoch": 0.03368083400160385, "step": 42 }, { "loss": 0.1351, "grad_norm": 1.2565680742263794, "learning_rate": 0.00013333333333333334, "epoch": 0.034482758620689655, "step": 43 }, { "loss": 0.0867, "grad_norm": 0.8123145699501038, "learning_rate": 0.0001365079365079365, "epoch": 0.03528468323977546, "step": 44 }, { "loss": 0.1072, "grad_norm": 0.8433717489242554, "learning_rate": 0.00013968253968253967, "epoch": 0.03608660785886127, "step": 45 }, { "loss": 0.0438, "grad_norm": 0.5360523462295532, "learning_rate": 0.00014285714285714287, "epoch": 0.03688853247794707, "step": 46 }, { "loss": 0.0866, "grad_norm": 0.644867479801178, "learning_rate": 0.00014603174603174603, "epoch": 0.03769045709703288, "step": 47 }, { "loss": 0.0839, "grad_norm": 0.8485159873962402, "learning_rate": 0.00014920634920634923, "epoch": 0.038492381716118684, "step": 48 }, { "loss": 0.0774, "grad_norm": 0.5638540387153625, "learning_rate": 0.00015238095238095237, "epoch": 0.03929430633520449, "step": 49 }, { "loss": 0.0611, "grad_norm": 0.7566853761672974, "learning_rate": 0.00015555555555555556, "epoch": 0.040096230954290296, "step": 50 }, { "loss": 0.1411, "grad_norm": 1.0959564447402954, "learning_rate": 0.00015873015873015873, "epoch": 0.0408981555733761, "step": 51 }, { "loss": 0.0594, "grad_norm": 0.6066744923591614, "learning_rate": 0.00016190476190476192, "epoch": 0.04170008019246191, "step": 52 }, { "loss": 0.0337, "grad_norm": 0.5505036115646362, "learning_rate": 0.0001650793650793651, "epoch": 0.042502004811547714, "step": 53 }, { "loss": 0.0572, "grad_norm": 0.6075869798660278, "learning_rate": 0.00016825396825396826, "epoch": 0.04330392943063352, "step": 54 }, { "loss": 0.0869, "grad_norm": 0.9212067723274231, "learning_rate": 0.00017142857142857143, "epoch": 0.044105854049719326, "step": 55 }, { "loss": 0.033, "grad_norm": 0.4611626863479614, "learning_rate": 0.00017460317460317462, "epoch": 0.04490777866880513, "step": 56 }, { "loss": 0.0711, "grad_norm": 0.8158572912216187, "learning_rate": 0.00017777777777777779, "epoch": 0.04570970328789094, "step": 57 }, { "loss": 0.1482, "grad_norm": 1.3836172819137573, "learning_rate": 0.00018095238095238095, "epoch": 0.046511627906976744, "step": 58 }, { "loss": 0.0953, "grad_norm": 0.6279105544090271, "learning_rate": 0.00018412698412698412, "epoch": 0.04731355252606255, "step": 59 }, { "loss": 0.1133, "grad_norm": 1.3958708047866821, "learning_rate": 0.00018730158730158731, "epoch": 0.048115477145148355, "step": 60 }, { "loss": 0.1674, "grad_norm": 1.2703611850738525, "learning_rate": 0.00019047619047619048, "epoch": 0.04891740176423416, "step": 61 }, { "loss": 0.0756, "grad_norm": 0.8350338935852051, "learning_rate": 0.00019365079365079365, "epoch": 0.04971932638331997, "step": 62 }, { "loss": 0.0776, "grad_norm": 0.7750063538551331, "learning_rate": 0.00019682539682539682, "epoch": 0.05052125100240577, "step": 63 }, { "loss": 0.0258, "grad_norm": 0.4177851974964142, "learning_rate": 0.0002, "epoch": 0.05132317562149158, "step": 64 }, { "loss": 0.0743, "grad_norm": 0.9661064743995667, "learning_rate": 0.00019999964798101197, "epoch": 0.052125100240577385, "step": 65 }, { "loss": 0.1386, "grad_norm": 1.2234452962875366, "learning_rate": 0.0001999985919265261, "epoch": 0.05292702485966319, "step": 66 }, { "loss": 0.0466, "grad_norm": 0.3697403073310852, "learning_rate": 0.00019999683184397752, "epoch": 0.053728949478749, "step": 67 }, { "loss": 0.1913, "grad_norm": 1.1906723976135254, "learning_rate": 0.0001999943677457578, "epoch": 0.0545308740978348, "step": 68 }, { "loss": 0.0784, "grad_norm": 0.6538499593734741, "learning_rate": 0.0001999911996492152, "epoch": 0.05533279871692061, "step": 69 }, { "loss": 0.0477, "grad_norm": 0.7172570824623108, "learning_rate": 0.00019998732757665427, "epoch": 0.056134723336006415, "step": 70 }, { "loss": 0.0691, "grad_norm": 0.5724918842315674, "learning_rate": 0.00019998275155533587, "epoch": 0.05693664795509222, "step": 71 }, { "loss": 0.0746, "grad_norm": 0.8900654911994934, "learning_rate": 0.00019997747161747695, "epoch": 0.057738572574178026, "step": 72 }, { "loss": 0.0356, "grad_norm": 0.43228501081466675, "learning_rate": 0.00019997148780025027, "epoch": 0.05854049719326383, "step": 73 }, { "loss": 0.0341, "grad_norm": 0.4748842716217041, "learning_rate": 0.0001999648001457842, "epoch": 0.05934242181234964, "step": 74 }, { "loss": 0.0751, "grad_norm": 0.5790648460388184, "learning_rate": 0.00019995740870116233, "epoch": 0.060144346431435444, "step": 75 }, { "loss": 0.1419, "grad_norm": 0.8083305358886719, "learning_rate": 0.00019994931351842327, "epoch": 0.06094627105052125, "step": 76 }, { "loss": 0.0738, "grad_norm": 1.0509905815124512, "learning_rate": 0.00019994051465456014, "epoch": 0.061748195669607056, "step": 77 }, { "loss": 0.09, "grad_norm": 1.006076693534851, "learning_rate": 0.00019993101217152028, "epoch": 0.06255012028869286, "step": 78 }, { "loss": 0.0671, "grad_norm": 0.7131031155586243, "learning_rate": 0.00019992080613620485, "epoch": 0.06335204490777867, "step": 79 }, { "loss": 0.0476, "grad_norm": 0.6254774332046509, "learning_rate": 0.00019990989662046818, "epoch": 0.06415396952686447, "step": 80 }, { "loss": 0.0313, "grad_norm": 0.5143452882766724, "learning_rate": 0.00019989828370111737, "epoch": 0.06495589414595028, "step": 81 }, { "loss": 0.0727, "grad_norm": 0.616113007068634, "learning_rate": 0.00019988596745991179, "epoch": 0.06575781876503609, "step": 82 }, { "loss": 0.0693, "grad_norm": 1.8450731039047241, "learning_rate": 0.00019987294798356247, "epoch": 0.06655974338412189, "step": 83 }, { "loss": 0.1301, "grad_norm": 0.8279522657394409, "learning_rate": 0.00019985922536373146, "epoch": 0.0673616680032077, "step": 84 }, { "loss": 0.0457, "grad_norm": 0.6411037445068359, "learning_rate": 0.00019984479969703127, "epoch": 0.0681635926222935, "step": 85 }, { "loss": 0.0636, "grad_norm": 0.5541757941246033, "learning_rate": 0.000199829671085024, "epoch": 0.06896551724137931, "step": 86 }, { "loss": 0.0644, "grad_norm": 0.5471921563148499, "learning_rate": 0.00019981383963422087, "epoch": 0.06976744186046512, "step": 87 }, { "loss": 0.0486, "grad_norm": 0.7092999219894409, "learning_rate": 0.00019979730545608126, "epoch": 0.07056936647955092, "step": 88 }, { "loss": 0.124, "grad_norm": 1.2980421781539917, "learning_rate": 0.00019978006866701211, "epoch": 0.07137129109863673, "step": 89 }, { "loss": 0.1298, "grad_norm": 0.778945803642273, "learning_rate": 0.0001997621293883669, "epoch": 0.07217321571772253, "step": 90 }, { "loss": 0.0542, "grad_norm": 0.4509424865245819, "learning_rate": 0.00019974348774644501, "epoch": 0.07297514033680834, "step": 91 }, { "loss": 0.0468, "grad_norm": 0.6056888103485107, "learning_rate": 0.00019972414387249072, "epoch": 0.07377706495589414, "step": 92 }, { "loss": 0.0815, "grad_norm": 0.7726762294769287, "learning_rate": 0.00019970409790269215, "epoch": 0.07457898957497995, "step": 93 }, { "loss": 0.0668, "grad_norm": 0.6297205090522766, "learning_rate": 0.00019968334997818064, "epoch": 0.07538091419406576, "step": 94 }, { "loss": 0.1778, "grad_norm": 1.166032075881958, "learning_rate": 0.00019966190024502939, "epoch": 0.07618283881315156, "step": 95 }, { "loss": 0.0662, "grad_norm": 0.7612900733947754, "learning_rate": 0.00019963974885425266, "epoch": 0.07698476343223737, "step": 96 }, { "loss": 0.0433, "grad_norm": 0.43482959270477295, "learning_rate": 0.00019961689596180467, "epoch": 0.07778668805132317, "step": 97 }, { "loss": 0.0616, "grad_norm": 0.5207836627960205, "learning_rate": 0.0001995933417285785, "epoch": 0.07858861267040898, "step": 98 }, { "loss": 0.0523, "grad_norm": 0.6553881764411926, "learning_rate": 0.0001995690863204049, "epoch": 0.07939053728949479, "step": 99 }, { "loss": 0.1302, "grad_norm": 1.2842791080474854, "learning_rate": 0.00019954412990805107, "epoch": 0.08019246190858059, "step": 100 }, { "loss": 0.0922, "grad_norm": 0.5699795484542847, "learning_rate": 0.0001995184726672197, "epoch": 0.0809943865276664, "step": 101 }, { "loss": 0.0807, "grad_norm": 0.5272155404090881, "learning_rate": 0.00019949211477854749, "epoch": 0.0817963111467522, "step": 102 }, { "loss": 0.1362, "grad_norm": 0.6196130514144897, "learning_rate": 0.00019946505642760398, "epoch": 0.08259823576583801, "step": 103 }, { "loss": 0.0705, "grad_norm": 0.6336621046066284, "learning_rate": 0.00019943729780489027, "epoch": 0.08340016038492382, "step": 104 }, { "loss": 0.0652, "grad_norm": 0.7032070755958557, "learning_rate": 0.00019940883910583756, "epoch": 0.08420208500400962, "step": 105 }, { "loss": 0.1116, "grad_norm": 0.908371090888977, "learning_rate": 0.0001993796805308059, "epoch": 0.08500400962309543, "step": 106 }, { "loss": 0.0818, "grad_norm": 0.7326153516769409, "learning_rate": 0.00019934982228508278, "epoch": 0.08580593424218123, "step": 107 }, { "loss": 0.1018, "grad_norm": 0.8321508169174194, "learning_rate": 0.00019931926457888156, "epoch": 0.08660785886126704, "step": 108 }, { "loss": 0.0259, "grad_norm": 0.2848133146762848, "learning_rate": 0.00019928800762734005, "epoch": 0.08740978348035285, "step": 109 }, { "loss": 0.0884, "grad_norm": 1.1061406135559082, "learning_rate": 0.00019925605165051918, "epoch": 0.08821170809943865, "step": 110 }, { "loss": 0.0813, "grad_norm": 0.5895913243293762, "learning_rate": 0.000199223396873401, "epoch": 0.08901363271852446, "step": 111 }, { "loss": 0.1933, "grad_norm": 1.0626415014266968, "learning_rate": 0.00019919004352588767, "epoch": 0.08981555733761026, "step": 112 }, { "loss": 0.0619, "grad_norm": 0.5373443365097046, "learning_rate": 0.00019915599184279942, "epoch": 0.09061748195669607, "step": 113 }, { "loss": 0.1071, "grad_norm": 0.6781280636787415, "learning_rate": 0.00019912124206387295, "epoch": 0.09141940657578188, "step": 114 }, { "loss": 0.0754, "grad_norm": 0.42521488666534424, "learning_rate": 0.00019908579443375996, "epoch": 0.09222133119486768, "step": 115 }, { "loss": 0.0705, "grad_norm": 0.5241889357566833, "learning_rate": 0.0001990496492020252, "epoch": 0.09302325581395349, "step": 116 }, { "loss": 0.1076, "grad_norm": 0.6329948902130127, "learning_rate": 0.00019901280662314484, "epoch": 0.09382518043303929, "step": 117 }, { "loss": 0.0346, "grad_norm": 0.3218804597854614, "learning_rate": 0.0001989752669565046, "epoch": 0.0946271050521251, "step": 118 }, { "loss": 0.1206, "grad_norm": 0.5836507081985474, "learning_rate": 0.00019893703046639804, "epoch": 0.0954290296712109, "step": 119 }, { "loss": 0.0955, "grad_norm": 0.6629716157913208, "learning_rate": 0.00019889809742202455, "epoch": 0.09623095429029671, "step": 120 }, { "loss": 0.1089, "grad_norm": 0.9768190383911133, "learning_rate": 0.00019885846809748753, "epoch": 0.09703287890938252, "step": 121 }, { "loss": 0.0323, "grad_norm": 0.27991437911987305, "learning_rate": 0.00019881814277179248, "epoch": 0.09783480352846832, "step": 122 }, { "loss": 0.0466, "grad_norm": 0.5002017617225647, "learning_rate": 0.00019877712172884502, "epoch": 0.09863672814755413, "step": 123 }, { "loss": 0.0311, "grad_norm": 0.4994860589504242, "learning_rate": 0.00019873540525744887, "epoch": 0.09943865276663993, "step": 124 }, { "eval_loss": 0.06782178580760956, "eval_runtime": 50.5786, "eval_samples_per_second": 20.76, "eval_steps_per_second": 5.2, "epoch": 0.09943865276663993, "step": 124 }, { "loss": 0.0705, "grad_norm": 0.6123189926147461, "learning_rate": 0.00019869299365130383, "epoch": 0.10024057738572574, "step": 125 }, { "loss": 0.0841, "grad_norm": 0.7357730865478516, "learning_rate": 0.00019864988720900368, "epoch": 0.10104250200481155, "step": 126 }, { "loss": 0.0674, "grad_norm": 0.46437254548072815, "learning_rate": 0.0001986060862340342, "epoch": 0.10184442662389735, "step": 127 }, { "loss": 0.0376, "grad_norm": 0.5510705709457397, "learning_rate": 0.00019856159103477086, "epoch": 0.10264635124298316, "step": 128 }, { "loss": 0.0396, "grad_norm": 0.5313072204589844, "learning_rate": 0.00019851640192447673, "epoch": 0.10344827586206896, "step": 129 }, { "loss": 0.0801, "grad_norm": 0.6203364133834839, "learning_rate": 0.00019847051922130038, "epoch": 0.10425020048115477, "step": 130 }, { "loss": 0.0499, "grad_norm": 0.3568151891231537, "learning_rate": 0.00019842394324827341, "epoch": 0.10505212510024058, "step": 131 }, { "loss": 0.0561, "grad_norm": 0.3815423548221588, "learning_rate": 0.00019837667433330838, "epoch": 0.10585404971932638, "step": 132 }, { "loss": 0.0722, "grad_norm": 0.4797166585922241, "learning_rate": 0.00019832871280919635, "epoch": 0.10665597433841219, "step": 133 }, { "loss": 0.0516, "grad_norm": 0.47701454162597656, "learning_rate": 0.00019828005901360475, "epoch": 0.107457898957498, "step": 134 }, { "loss": 0.0309, "grad_norm": 0.37124770879745483, "learning_rate": 0.00019823071328907473, "epoch": 0.1082598235765838, "step": 135 }, { "loss": 0.0707, "grad_norm": 0.6959102749824524, "learning_rate": 0.0001981806759830189, "epoch": 0.1090617481956696, "step": 136 }, { "loss": 0.1417, "grad_norm": 0.7774357795715332, "learning_rate": 0.00019812994744771898, "epoch": 0.10986367281475541, "step": 137 }, { "loss": 0.0405, "grad_norm": 0.4280378818511963, "learning_rate": 0.00019807852804032305, "epoch": 0.11066559743384122, "step": 138 }, { "loss": 0.0506, "grad_norm": 0.5292235016822815, "learning_rate": 0.00019802641812284328, "epoch": 0.11146752205292702, "step": 139 }, { "loss": 0.0624, "grad_norm": 0.5091221332550049, "learning_rate": 0.00019797361806215332, "epoch": 0.11226944667201283, "step": 140 }, { "loss": 0.0598, "grad_norm": 0.5391169786453247, "learning_rate": 0.0001979201282299856, "epoch": 0.11307137129109864, "step": 141 }, { "loss": 0.0861, "grad_norm": 0.7957108616828918, "learning_rate": 0.00019786594900292887, "epoch": 0.11387329591018444, "step": 142 }, { "loss": 0.0698, "grad_norm": 0.5378095507621765, "learning_rate": 0.00019781108076242547, "epoch": 0.11467522052927025, "step": 143 }, { "loss": 0.0612, "grad_norm": 0.5657555460929871, "learning_rate": 0.00019775552389476864, "epoch": 0.11547714514835605, "step": 144 }, { "loss": 0.1241, "grad_norm": 0.8074794411659241, "learning_rate": 0.00019769927879109982, "epoch": 0.11627906976744186, "step": 145 }, { "loss": 0.0831, "grad_norm": 0.5241571068763733, "learning_rate": 0.0001976423458474059, "epoch": 0.11708099438652766, "step": 146 }, { "loss": 0.0462, "grad_norm": 0.3452630043029785, "learning_rate": 0.00019758472546451645, "epoch": 0.11788291900561347, "step": 147 }, { "loss": 0.0656, "grad_norm": 0.38813871145248413, "learning_rate": 0.00019752641804810084, "epoch": 0.11868484362469928, "step": 148 }, { "loss": 0.0512, "grad_norm": 0.5402405261993408, "learning_rate": 0.0001974674240086654, "epoch": 0.11948676824378508, "step": 149 }, { "loss": 0.039, "grad_norm": 0.35998794436454773, "learning_rate": 0.00019740774376155061, "epoch": 0.12028869286287089, "step": 150 }, { "loss": 0.0275, "grad_norm": 0.2361939251422882, "learning_rate": 0.000197347377726928, "epoch": 0.1210906174819567, "step": 151 }, { "loss": 0.0476, "grad_norm": 0.48203134536743164, "learning_rate": 0.00019728632632979746, "epoch": 0.1218925421010425, "step": 152 }, { "loss": 0.0255, "grad_norm": 0.2733021676540375, "learning_rate": 0.00019722458999998398, "epoch": 0.1226944667201283, "step": 153 }, { "loss": 0.0506, "grad_norm": 0.3442985713481903, "learning_rate": 0.00019716216917213476, "epoch": 0.12349639133921411, "step": 154 }, { "loss": 0.0441, "grad_norm": 0.46731194853782654, "learning_rate": 0.00019709906428571616, "epoch": 0.12429831595829992, "step": 155 }, { "loss": 0.0794, "grad_norm": 0.9103071689605713, "learning_rate": 0.0001970352757850105, "epoch": 0.12510024057738572, "step": 156 }, { "loss": 0.0394, "grad_norm": 0.46548521518707275, "learning_rate": 0.0001969708041191131, "epoch": 0.12590216519647154, "step": 157 }, { "loss": 0.0485, "grad_norm": 0.4331950545310974, "learning_rate": 0.00019690564974192892, "epoch": 0.12670408981555734, "step": 158 }, { "loss": 0.0417, "grad_norm": 0.4089224636554718, "learning_rate": 0.00019683981311216959, "epoch": 0.12750601443464316, "step": 159 }, { "loss": 0.067, "grad_norm": 0.7565222978591919, "learning_rate": 0.0001967732946933499, "epoch": 0.12830793905372895, "step": 160 }, { "loss": 0.1036, "grad_norm": 0.5942509174346924, "learning_rate": 0.00019670609495378482, "epoch": 0.12910986367281477, "step": 161 }, { "loss": 0.079, "grad_norm": 0.6143490672111511, "learning_rate": 0.00019663821436658604, "epoch": 0.12991178829190056, "step": 162 }, { "loss": 0.0825, "grad_norm": 0.4321056306362152, "learning_rate": 0.0001965696534096587, "epoch": 0.13071371291098638, "step": 163 }, { "loss": 0.0583, "grad_norm": 0.5038022398948669, "learning_rate": 0.00019650041256569792, "epoch": 0.13151563753007217, "step": 164 }, { "loss": 0.0422, "grad_norm": 0.31969794631004333, "learning_rate": 0.00019643049232218553, "epoch": 0.132317562149158, "step": 165 }, { "loss": 0.0574, "grad_norm": 0.33682748675346375, "learning_rate": 0.00019635989317138666, "epoch": 0.13311948676824378, "step": 166 }, { "loss": 0.1009, "grad_norm": 0.8040818572044373, "learning_rate": 0.0001962886156103461, "epoch": 0.1339214113873296, "step": 167 }, { "loss": 0.0342, "grad_norm": 0.3678284287452698, "learning_rate": 0.00019621666014088494, "epoch": 0.1347233360064154, "step": 168 }, { "loss": 0.1214, "grad_norm": 0.5396429300308228, "learning_rate": 0.00019614402726959705, "epoch": 0.13552526062550121, "step": 169 }, { "loss": 0.0595, "grad_norm": 0.46996235847473145, "learning_rate": 0.0001960707175078454, "epoch": 0.136327185244587, "step": 170 }, { "loss": 0.0714, "grad_norm": 0.5289244055747986, "learning_rate": 0.00019599673137175855, "epoch": 0.13712910986367283, "step": 171 }, { "loss": 0.0443, "grad_norm": 0.45096877217292786, "learning_rate": 0.00019592206938222703, "epoch": 0.13793103448275862, "step": 172 }, { "loss": 0.0552, "grad_norm": 0.451477587223053, "learning_rate": 0.00019584673206489954, "epoch": 0.13873295910184444, "step": 173 }, { "loss": 0.0539, "grad_norm": 0.37521955370903015, "learning_rate": 0.00019577071995017945, "epoch": 0.13953488372093023, "step": 174 }, { "loss": 0.0612, "grad_norm": 0.34417349100112915, "learning_rate": 0.0001956940335732209, "epoch": 0.14033680834001605, "step": 175 }, { "loss": 0.0488, "grad_norm": 0.30773675441741943, "learning_rate": 0.00019561667347392508, "epoch": 0.14113873295910184, "step": 176 }, { "loss": 0.0713, "grad_norm": 0.633940577507019, "learning_rate": 0.00019553864019693652, "epoch": 0.14194065757818766, "step": 177 }, { "loss": 0.046, "grad_norm": 0.5169980525970459, "learning_rate": 0.00019545993429163913, "epoch": 0.14274258219727345, "step": 178 }, { "loss": 0.1141, "grad_norm": 0.6768614053726196, "learning_rate": 0.0001953805563121523, "epoch": 0.14354450681635927, "step": 179 }, { "loss": 0.0451, "grad_norm": 0.4514835476875305, "learning_rate": 0.0001953005068173272, "epoch": 0.14434643143544507, "step": 180 }, { "loss": 0.0473, "grad_norm": 0.4141976237297058, "learning_rate": 0.00019521978637074267, "epoch": 0.14514835605453089, "step": 181 }, { "loss": 0.0551, "grad_norm": 0.5272877812385559, "learning_rate": 0.0001951383955407013, "epoch": 0.14595028067361668, "step": 182 }, { "loss": 0.0714, "grad_norm": 0.9530414342880249, "learning_rate": 0.00019505633490022546, "epoch": 0.1467522052927025, "step": 183 }, { "loss": 0.1042, "grad_norm": 0.8545625805854797, "learning_rate": 0.0001949736050270532, "epoch": 0.1475541299117883, "step": 184 }, { "loss": 0.0809, "grad_norm": 0.4434387683868408, "learning_rate": 0.00019489020650363426, "epoch": 0.1483560545308741, "step": 185 }, { "loss": 0.0775, "grad_norm": 0.6959827542304993, "learning_rate": 0.00019480613991712588, "epoch": 0.1491579791499599, "step": 186 }, { "loss": 0.0479, "grad_norm": 0.6551741361618042, "learning_rate": 0.00019472140585938882, "epoch": 0.14995990376904572, "step": 187 }, { "loss": 0.0719, "grad_norm": 0.6390430331230164, "learning_rate": 0.00019463600492698296, "epoch": 0.1507618283881315, "step": 188 }, { "loss": 0.0513, "grad_norm": 0.4449121356010437, "learning_rate": 0.00019454993772116336, "epoch": 0.15156375300721733, "step": 189 }, { "loss": 0.0419, "grad_norm": 0.3439355492591858, "learning_rate": 0.00019446320484787575, "epoch": 0.15236567762630313, "step": 190 }, { "loss": 0.0594, "grad_norm": 0.3776263892650604, "learning_rate": 0.00019437580691775258, "epoch": 0.15316760224538895, "step": 191 }, { "loss": 0.0544, "grad_norm": 0.5031057000160217, "learning_rate": 0.00019428774454610843, "epoch": 0.15396952686447474, "step": 192 }, { "loss": 0.0451, "grad_norm": 0.33334505558013916, "learning_rate": 0.00019419901835293583, "epoch": 0.15477145148356056, "step": 193 }, { "loss": 0.0433, "grad_norm": 0.28815487027168274, "learning_rate": 0.0001941096289629009, "epoch": 0.15557337610264635, "step": 194 }, { "loss": 0.0648, "grad_norm": 0.5981271266937256, "learning_rate": 0.00019401957700533888, "epoch": 0.15637530072173217, "step": 195 }, { "loss": 0.0469, "grad_norm": 0.3985323905944824, "learning_rate": 0.00019392886311424973, "epoch": 0.15717722534081796, "step": 196 }, { "loss": 0.0467, "grad_norm": 0.3379111886024475, "learning_rate": 0.00019383748792829372, "epoch": 0.15797914995990378, "step": 197 }, { "loss": 0.0467, "grad_norm": 0.4567181468009949, "learning_rate": 0.00019374545209078687, "epoch": 0.15878107457898957, "step": 198 }, { "loss": 0.0494, "grad_norm": 0.38965797424316406, "learning_rate": 0.0001936527562496964, "epoch": 0.1595829991980754, "step": 199 }, { "loss": 0.0552, "grad_norm": 0.5935775637626648, "learning_rate": 0.0001935594010576362, "epoch": 0.16038492381716118, "step": 200 }, { "loss": 0.0558, "grad_norm": 0.4197022318840027, "learning_rate": 0.0001934653871718624, "epoch": 0.161186848436247, "step": 201 }, { "loss": 0.0582, "grad_norm": 0.46609750390052795, "learning_rate": 0.0001933707152542683, "epoch": 0.1619887730553328, "step": 202 }, { "loss": 0.0686, "grad_norm": 0.6024537086486816, "learning_rate": 0.00019327538597138029, "epoch": 0.16279069767441862, "step": 203 }, { "loss": 0.0394, "grad_norm": 0.36054351925849915, "learning_rate": 0.0001931793999943526, "epoch": 0.1635926222935044, "step": 204 }, { "loss": 0.0756, "grad_norm": 0.6744378805160522, "learning_rate": 0.0001930827579989631, "epoch": 0.16439454691259023, "step": 205 }, { "loss": 0.0492, "grad_norm": 0.3806591331958771, "learning_rate": 0.00019298546066560802, "epoch": 0.16519647153167602, "step": 206 }, { "loss": 0.0189, "grad_norm": 0.30359482765197754, "learning_rate": 0.00019288750867929756, "epoch": 0.16599839615076184, "step": 207 }, { "loss": 0.0396, "grad_norm": 0.4475793242454529, "learning_rate": 0.00019278890272965096, "epoch": 0.16680032076984763, "step": 208 }, { "loss": 0.0935, "grad_norm": 0.6485787630081177, "learning_rate": 0.00019268964351089148, "epoch": 0.16760224538893345, "step": 209 }, { "loss": 0.0615, "grad_norm": 0.4646718502044678, "learning_rate": 0.00019258973172184174, "epoch": 0.16840417000801924, "step": 210 }, { "loss": 0.0414, "grad_norm": 0.49095892906188965, "learning_rate": 0.0001924891680659187, "epoch": 0.16920609462710506, "step": 211 }, { "loss": 0.0265, "grad_norm": 0.835785984992981, "learning_rate": 0.0001923879532511287, "epoch": 0.17000801924619086, "step": 212 }, { "loss": 0.0696, "grad_norm": 0.43860942125320435, "learning_rate": 0.0001922860879900624, "epoch": 0.17080994386527668, "step": 213 }, { "loss": 0.0578, "grad_norm": 0.5574386119842529, "learning_rate": 0.00019218357299988998, "epoch": 0.17161186848436247, "step": 214 }, { "loss": 0.0454, "grad_norm": 0.3218346834182739, "learning_rate": 0.0001920804090023559, "epoch": 0.1724137931034483, "step": 215 }, { "loss": 0.0453, "grad_norm": 0.4190017879009247, "learning_rate": 0.0001919765967237739, "epoch": 0.17321571772253408, "step": 216 }, { "loss": 0.0449, "grad_norm": 0.35313868522644043, "learning_rate": 0.00019187213689502176, "epoch": 0.1740176423416199, "step": 217 }, { "loss": 0.0813, "grad_norm": 0.44302183389663696, "learning_rate": 0.00019176703025153643, "epoch": 0.1748195669607057, "step": 218 }, { "loss": 0.0276, "grad_norm": 0.2679917812347412, "learning_rate": 0.00019166127753330857, "epoch": 0.1756214915797915, "step": 219 }, { "loss": 0.0323, "grad_norm": 0.2973562777042389, "learning_rate": 0.00019155487948487748, "epoch": 0.1764234161988773, "step": 220 }, { "loss": 0.044, "grad_norm": 0.3215465247631073, "learning_rate": 0.00019144783685532578, "epoch": 0.17722534081796312, "step": 221 }, { "loss": 0.0353, "grad_norm": 0.3549197018146515, "learning_rate": 0.00019134015039827431, "epoch": 0.17802726543704891, "step": 222 }, { "loss": 0.051, "grad_norm": 0.42532142996788025, "learning_rate": 0.00019123182087187656, "epoch": 0.17882919005613473, "step": 223 }, { "loss": 0.0479, "grad_norm": 0.39455631375312805, "learning_rate": 0.0001911228490388136, "epoch": 0.17963111467522053, "step": 224 }, { "loss": 0.0573, "grad_norm": 0.45477625727653503, "learning_rate": 0.00019101323566628843, "epoch": 0.18043303929430635, "step": 225 }, { "loss": 0.0342, "grad_norm": 0.2890731692314148, "learning_rate": 0.0001909029815260209, "epoch": 0.18123496391339214, "step": 226 }, { "loss": 0.0361, "grad_norm": 0.34329336881637573, "learning_rate": 0.00019079208739424197, "epoch": 0.18203688853247796, "step": 227 }, { "loss": 0.0714, "grad_norm": 0.6790952682495117, "learning_rate": 0.0001906805540516885, "epoch": 0.18283881315156375, "step": 228 }, { "loss": 0.032, "grad_norm": 0.26758837699890137, "learning_rate": 0.00019056838228359753, "epoch": 0.18364073777064957, "step": 229 }, { "loss": 0.0401, "grad_norm": 0.5012450218200684, "learning_rate": 0.0001904555728797009, "epoch": 0.18444266238973536, "step": 230 }, { "loss": 0.0301, "grad_norm": 0.28922465443611145, "learning_rate": 0.00019034212663421969, "epoch": 0.18524458700882118, "step": 231 }, { "loss": 0.0639, "grad_norm": 0.44703420996665955, "learning_rate": 0.00019022804434585852, "epoch": 0.18604651162790697, "step": 232 }, { "loss": 0.0446, "grad_norm": 0.25191769003868103, "learning_rate": 0.00019011332681780006, "epoch": 0.1868484362469928, "step": 233 }, { "loss": 0.0623, "grad_norm": 0.46206915378570557, "learning_rate": 0.00018999797485769925, "epoch": 0.18765036086607859, "step": 234 }, { "loss": 0.0379, "grad_norm": 0.25015994906425476, "learning_rate": 0.0001898819892776777, "epoch": 0.1884522854851644, "step": 235 }, { "loss": 0.0602, "grad_norm": 0.3543962836265564, "learning_rate": 0.0001897653708943179, "epoch": 0.1892542101042502, "step": 236 }, { "loss": 0.0483, "grad_norm": 0.29529276490211487, "learning_rate": 0.00018964812052865764, "epoch": 0.19005613472333602, "step": 237 }, { "loss": 0.1344, "grad_norm": 0.5841286778450012, "learning_rate": 0.00018953023900618397, "epoch": 0.1908580593424218, "step": 238 }, { "loss": 0.0651, "grad_norm": 0.45970141887664795, "learning_rate": 0.00018941172715682757, "epoch": 0.19165998396150763, "step": 239 }, { "loss": 0.0552, "grad_norm": 0.4103776514530182, "learning_rate": 0.00018929258581495685, "epoch": 0.19246190858059342, "step": 240 }, { "loss": 0.0525, "grad_norm": 0.3026215434074402, "learning_rate": 0.00018917281581937214, "epoch": 0.19326383319967924, "step": 241 }, { "loss": 0.0343, "grad_norm": 0.28369593620300293, "learning_rate": 0.00018905241801329972, "epoch": 0.19406575781876503, "step": 242 }, { "loss": 0.05, "grad_norm": 0.36326268315315247, "learning_rate": 0.00018893139324438577, "epoch": 0.19486768243785085, "step": 243 }, { "loss": 0.0539, "grad_norm": 0.38554129004478455, "learning_rate": 0.0001888097423646907, "epoch": 0.19566960705693665, "step": 244 }, { "loss": 0.0202, "grad_norm": 0.16676409542560577, "learning_rate": 0.00018868746623068293, "epoch": 0.19647153167602247, "step": 245 }, { "loss": 0.0282, "grad_norm": 0.37308400869369507, "learning_rate": 0.00018856456570323277, "epoch": 0.19727345629510826, "step": 246 }, { "loss": 0.0517, "grad_norm": 0.38957253098487854, "learning_rate": 0.0001884410416476067, "epoch": 0.19807538091419408, "step": 247 }, { "loss": 0.0236, "grad_norm": 0.33424702286720276, "learning_rate": 0.00018831689493346095, "epoch": 0.19887730553327987, "step": 248 }, { "eval_loss": 0.056792400777339935, "eval_runtime": 32.4803, "eval_samples_per_second": 32.327, "eval_steps_per_second": 8.097, "epoch": 0.19887730553327987, "step": 248 }, { "loss": 0.0303, "grad_norm": 0.308700829744339, "learning_rate": 0.0001881921264348355, "epoch": 0.1996792301523657, "step": 249 }, { "loss": 0.0619, "grad_norm": 0.31528714299201965, "learning_rate": 0.00018806673703014804, "epoch": 0.20048115477145148, "step": 250 }, { "loss": 0.0917, "grad_norm": 0.6735418438911438, "learning_rate": 0.00018794072760218753, "epoch": 0.2012830793905373, "step": 251 }, { "loss": 0.0882, "grad_norm": 0.793260931968689, "learning_rate": 0.00018781409903810821, "epoch": 0.2020850040096231, "step": 252 }, { "loss": 0.0455, "grad_norm": 0.39877378940582275, "learning_rate": 0.0001876868522294233, "epoch": 0.2028869286287089, "step": 253 }, { "loss": 0.0959, "grad_norm": 0.931326687335968, "learning_rate": 0.00018755898807199856, "epoch": 0.2036888532477947, "step": 254 }, { "loss": 0.0983, "grad_norm": 0.8125079274177551, "learning_rate": 0.00018743050746604633, "epoch": 0.20449077786688052, "step": 255 }, { "loss": 0.0523, "grad_norm": 0.4794807434082031, "learning_rate": 0.00018730141131611882, "epoch": 0.20529270248596632, "step": 256 }, { "loss": 0.0311, "grad_norm": 0.24229726195335388, "learning_rate": 0.00018717170053110196, "epoch": 0.20609462710505214, "step": 257 }, { "loss": 0.073, "grad_norm": 0.3876228630542755, "learning_rate": 0.0001870413760242089, "epoch": 0.20689655172413793, "step": 258 }, { "loss": 0.0837, "grad_norm": 0.3997587263584137, "learning_rate": 0.0001869104387129737, "epoch": 0.20769847634322375, "step": 259 }, { "loss": 0.0395, "grad_norm": 0.3965926468372345, "learning_rate": 0.00018677888951924474, "epoch": 0.20850040096230954, "step": 260 }, { "loss": 0.0554, "grad_norm": 0.4688607156276703, "learning_rate": 0.00018664672936917828, "epoch": 0.20930232558139536, "step": 261 }, { "loss": 0.1299, "grad_norm": 0.9481903910636902, "learning_rate": 0.00018651395919323202, "epoch": 0.21010425020048115, "step": 262 }, { "loss": 0.0503, "grad_norm": 0.29419174790382385, "learning_rate": 0.00018638057992615838, "epoch": 0.21090617481956697, "step": 263 }, { "loss": 0.0969, "grad_norm": 0.5171178579330444, "learning_rate": 0.00018624659250699805, "epoch": 0.21170809943865276, "step": 264 }, { "loss": 0.0345, "grad_norm": 0.3265765309333801, "learning_rate": 0.00018611199787907338, "epoch": 0.21251002405773858, "step": 265 }, { "loss": 0.0362, "grad_norm": 0.33063560724258423, "learning_rate": 0.00018597679698998163, "epoch": 0.21331194867682438, "step": 266 }, { "loss": 0.0511, "grad_norm": 0.660375714302063, "learning_rate": 0.00018584099079158842, "epoch": 0.2141138732959102, "step": 267 }, { "loss": 0.0815, "grad_norm": 0.580894410610199, "learning_rate": 0.00018570458024002093, "epoch": 0.214915797914996, "step": 268 }, { "loss": 0.1139, "grad_norm": 0.7811892032623291, "learning_rate": 0.0001855675662956613, "epoch": 0.2157177225340818, "step": 269 }, { "loss": 0.0454, "grad_norm": 0.40047529339790344, "learning_rate": 0.0001854299499231397, "epoch": 0.2165196471531676, "step": 270 }, { "loss": 0.0691, "grad_norm": 0.6034380197525024, "learning_rate": 0.0001852917320913276, "epoch": 0.21732157177225342, "step": 271 }, { "loss": 0.0392, "grad_norm": 0.3567689061164856, "learning_rate": 0.00018515291377333112, "epoch": 0.2181234963913392, "step": 272 }, { "loss": 0.0426, "grad_norm": 0.346510648727417, "learning_rate": 0.00018501349594648395, "epoch": 0.21892542101042503, "step": 273 }, { "loss": 0.0326, "grad_norm": 0.5042007565498352, "learning_rate": 0.0001848734795923404, "epoch": 0.21972734562951082, "step": 274 }, { "loss": 0.0446, "grad_norm": 0.5788083076477051, "learning_rate": 0.0001847328656966689, "epoch": 0.22052927024859664, "step": 275 }, { "loss": 0.0573, "grad_norm": 0.3450917601585388, "learning_rate": 0.0001845916552494446, "epoch": 0.22133119486768243, "step": 276 }, { "loss": 0.0495, "grad_norm": 0.36637723445892334, "learning_rate": 0.00018444984924484277, "epoch": 0.22213311948676825, "step": 277 }, { "loss": 0.0342, "grad_norm": 0.24668528139591217, "learning_rate": 0.00018430744868123145, "epoch": 0.22293504410585405, "step": 278 }, { "loss": 0.0499, "grad_norm": 0.49873360991477966, "learning_rate": 0.0001841644545611647, "epoch": 0.22373696872493987, "step": 279 }, { "loss": 0.1, "grad_norm": 1.0184354782104492, "learning_rate": 0.00018402086789137546, "epoch": 0.22453889334402566, "step": 280 }, { "loss": 0.0401, "grad_norm": 0.43986013531684875, "learning_rate": 0.00018387668968276836, "epoch": 0.22534081796311148, "step": 281 }, { "loss": 0.0451, "grad_norm": 0.3782620131969452, "learning_rate": 0.0001837319209504128, "epoch": 0.22614274258219727, "step": 282 }, { "loss": 0.0494, "grad_norm": 0.396990031003952, "learning_rate": 0.00018358656271353559, "epoch": 0.2269446672012831, "step": 283 }, { "loss": 0.0262, "grad_norm": 0.34569329023361206, "learning_rate": 0.00018344061599551398, "epoch": 0.22774659182036888, "step": 284 }, { "loss": 0.049, "grad_norm": 0.36551395058631897, "learning_rate": 0.0001832940818238682, "epoch": 0.2285485164394547, "step": 285 }, { "loss": 0.0493, "grad_norm": 0.3316669166088104, "learning_rate": 0.00018314696123025454, "epoch": 0.2293504410585405, "step": 286 }, { "loss": 0.0974, "grad_norm": 0.9379229545593262, "learning_rate": 0.0001829992552504578, "epoch": 0.2301523656776263, "step": 287 }, { "loss": 0.035, "grad_norm": 0.4309346079826355, "learning_rate": 0.00018285096492438424, "epoch": 0.2309542902967121, "step": 288 }, { "loss": 0.0573, "grad_norm": 0.47338199615478516, "learning_rate": 0.00018270209129605397, "epoch": 0.23175621491579793, "step": 289 }, { "loss": 0.026, "grad_norm": 0.3351285457611084, "learning_rate": 0.00018255263541359397, "epoch": 0.23255813953488372, "step": 290 }, { "loss": 0.0291, "grad_norm": 0.2552240192890167, "learning_rate": 0.00018240259832923034, "epoch": 0.23336006415396954, "step": 291 }, { "loss": 0.02, "grad_norm": 0.23985892534255981, "learning_rate": 0.00018225198109928114, "epoch": 0.23416198877305533, "step": 292 }, { "loss": 0.0195, "grad_norm": 0.26093894243240356, "learning_rate": 0.00018210078478414894, "epoch": 0.23496391339214115, "step": 293 }, { "loss": 0.0669, "grad_norm": 0.5353745222091675, "learning_rate": 0.00018194901044831313, "epoch": 0.23576583801122694, "step": 294 }, { "loss": 0.0193, "grad_norm": 0.25396963953971863, "learning_rate": 0.00018179665916032273, "epoch": 0.23656776263031276, "step": 295 }, { "loss": 0.0656, "grad_norm": 0.3989141881465912, "learning_rate": 0.00018164373199278856, "epoch": 0.23736968724939855, "step": 296 }, { "loss": 0.0297, "grad_norm": 0.31333601474761963, "learning_rate": 0.00018149023002237612, "epoch": 0.23817161186848437, "step": 297 }, { "loss": 0.0471, "grad_norm": 0.8698596954345703, "learning_rate": 0.00018133615432979744, "epoch": 0.23897353648757017, "step": 298 }, { "loss": 0.081, "grad_norm": 0.46993565559387207, "learning_rate": 0.00018118150599980397, "epoch": 0.23977546110665598, "step": 299 }, { "loss": 0.049, "grad_norm": 0.5059134364128113, "learning_rate": 0.00018102628612117865, "epoch": 0.24057738572574178, "step": 300 }, { "loss": 0.0653, "grad_norm": 0.5506439805030823, "learning_rate": 0.00018087049578672845, "epoch": 0.2413793103448276, "step": 301 }, { "loss": 0.0554, "grad_norm": 0.5644898414611816, "learning_rate": 0.00018071413609327638, "epoch": 0.2421812349639134, "step": 302 }, { "loss": 0.0536, "grad_norm": 0.4158555269241333, "learning_rate": 0.00018055720814165414, "epoch": 0.2429831595829992, "step": 303 }, { "loss": 0.0593, "grad_norm": 0.4376695454120636, "learning_rate": 0.00018039971303669407, "epoch": 0.243785084202085, "step": 304 }, { "loss": 0.0698, "grad_norm": 0.5507003664970398, "learning_rate": 0.00018024165188722151, "epoch": 0.24458700882117082, "step": 305 }, { "loss": 0.0219, "grad_norm": 0.25363439321517944, "learning_rate": 0.000180083025806047, "epoch": 0.2453889334402566, "step": 306 }, { "loss": 0.0134, "grad_norm": 0.2080700397491455, "learning_rate": 0.00017992383590995838, "epoch": 0.24619085805934243, "step": 307 }, { "loss": 0.0656, "grad_norm": 0.421975702047348, "learning_rate": 0.00017976408331971298, "epoch": 0.24699278267842822, "step": 308 }, { "loss": 0.027, "grad_norm": 0.3046298921108246, "learning_rate": 0.00017960376916002972, "epoch": 0.24779470729751404, "step": 309 }, { "loss": 0.0534, "grad_norm": 0.3668377995491028, "learning_rate": 0.00017944289455958112, "epoch": 0.24859663191659984, "step": 310 }, { "loss": 0.0553, "grad_norm": 0.4287368059158325, "learning_rate": 0.0001792814606509855, "epoch": 0.24939855653568566, "step": 311 }, { "loss": 0.0487, "grad_norm": 0.370373010635376, "learning_rate": 0.00017911946857079888, "epoch": 0.25020048115477145, "step": 312 }, { "loss": 0.0422, "grad_norm": 0.4405466616153717, "learning_rate": 0.00017895691945950696, "epoch": 0.25100240577385724, "step": 313 }, { "loss": 0.1081, "grad_norm": 0.6124715805053711, "learning_rate": 0.0001787938144615173, "epoch": 0.2518043303929431, "step": 314 }, { "loss": 0.0403, "grad_norm": 0.22574079036712646, "learning_rate": 0.000178630154725151, "epoch": 0.2526062550120289, "step": 315 }, { "loss": 0.1046, "grad_norm": 0.5598015189170837, "learning_rate": 0.00017846594140263474, "epoch": 0.25340817963111467, "step": 316 }, { "loss": 0.0544, "grad_norm": 0.3449535667896271, "learning_rate": 0.0001783011756500927, "epoch": 0.25421010425020046, "step": 317 }, { "loss": 0.058, "grad_norm": 0.40914788842201233, "learning_rate": 0.0001781358586275383, "epoch": 0.2550120288692863, "step": 318 }, { "loss": 0.1132, "grad_norm": 0.7423124313354492, "learning_rate": 0.0001779699914988662, "epoch": 0.2558139534883721, "step": 319 }, { "loss": 0.0601, "grad_norm": 0.6021925210952759, "learning_rate": 0.00017780357543184397, "epoch": 0.2566158781074579, "step": 320 }, { "loss": 0.1014, "grad_norm": 0.48059457540512085, "learning_rate": 0.0001776366115981039, "epoch": 0.2574178027265437, "step": 321 }, { "loss": 0.0548, "grad_norm": 0.5897157788276672, "learning_rate": 0.00017746910117313482, "epoch": 0.25821972734562953, "step": 322 }, { "loss": 0.0483, "grad_norm": 0.36229458451271057, "learning_rate": 0.0001773010453362737, "epoch": 0.2590216519647153, "step": 323 }, { "loss": 0.0632, "grad_norm": 0.49136513471603394, "learning_rate": 0.0001771324452706975, "epoch": 0.2598235765838011, "step": 324 }, { "loss": 0.0324, "grad_norm": 0.6286053657531738, "learning_rate": 0.00017696330216341463, "epoch": 0.2606255012028869, "step": 325 }, { "loss": 0.054, "grad_norm": 0.49283909797668457, "learning_rate": 0.0001767936172052569, "epoch": 0.26142742582197276, "step": 326 }, { "loss": 0.0256, "grad_norm": 0.2010183483362198, "learning_rate": 0.00017662339159087078, "epoch": 0.26222935044105855, "step": 327 }, { "loss": 0.0451, "grad_norm": 0.39567244052886963, "learning_rate": 0.00017645262651870926, "epoch": 0.26303127506014434, "step": 328 }, { "loss": 0.1059, "grad_norm": 0.5877751708030701, "learning_rate": 0.00017628132319102332, "epoch": 0.26383319967923013, "step": 329 }, { "loss": 0.0314, "grad_norm": 0.28202834725379944, "learning_rate": 0.0001761094828138534, "epoch": 0.264635124298316, "step": 330 }, { "loss": 0.0241, "grad_norm": 0.3100980222225189, "learning_rate": 0.00017593710659702104, "epoch": 0.2654370489174018, "step": 331 }, { "loss": 0.0515, "grad_norm": 0.28590792417526245, "learning_rate": 0.0001757641957541203, "epoch": 0.26623897353648757, "step": 332 }, { "loss": 0.0691, "grad_norm": 0.7025532126426697, "learning_rate": 0.0001755907515025091, "epoch": 0.26704089815557336, "step": 333 }, { "loss": 0.0398, "grad_norm": 0.3652035593986511, "learning_rate": 0.0001754167750633009, "epoch": 0.2678428227746592, "step": 334 }, { "loss": 0.0597, "grad_norm": 0.41364148259162903, "learning_rate": 0.00017524226766135588, "epoch": 0.268644747393745, "step": 335 }, { "loss": 0.0582, "grad_norm": 0.3338804244995117, "learning_rate": 0.00017506723052527242, "epoch": 0.2694466720128308, "step": 336 }, { "loss": 0.0833, "grad_norm": 0.9465529322624207, "learning_rate": 0.00017489166488737846, "epoch": 0.2702485966319166, "step": 337 }, { "loss": 0.0848, "grad_norm": 0.44553694128990173, "learning_rate": 0.00017471557198372274, "epoch": 0.27105052125100243, "step": 338 }, { "loss": 0.1273, "grad_norm": 1.1346548795700073, "learning_rate": 0.00017453895305406616, "epoch": 0.2718524458700882, "step": 339 }, { "loss": 0.0736, "grad_norm": 0.4856693744659424, "learning_rate": 0.00017436180934187308, "epoch": 0.272654370489174, "step": 340 }, { "loss": 0.0296, "grad_norm": 0.27393412590026855, "learning_rate": 0.0001741841420943025, "epoch": 0.2734562951082598, "step": 341 }, { "loss": 0.0504, "grad_norm": 0.3282850384712219, "learning_rate": 0.00017400595256219928, "epoch": 0.27425821972734565, "step": 342 }, { "loss": 0.0501, "grad_norm": 0.3622792363166809, "learning_rate": 0.00017382724200008546, "epoch": 0.27506014434643145, "step": 343 }, { "loss": 0.0875, "grad_norm": 0.5967736840248108, "learning_rate": 0.00017364801166615124, "epoch": 0.27586206896551724, "step": 344 }, { "loss": 0.0797, "grad_norm": 0.665009617805481, "learning_rate": 0.0001734682628222462, "epoch": 0.27666399358460303, "step": 345 }, { "loss": 0.0367, "grad_norm": 0.31664225459098816, "learning_rate": 0.0001732879967338705, "epoch": 0.2774659182036889, "step": 346 }, { "loss": 0.078, "grad_norm": 0.613771915435791, "learning_rate": 0.00017310721467016587, "epoch": 0.27826784282277467, "step": 347 }, { "loss": 0.0309, "grad_norm": 0.29217079281806946, "learning_rate": 0.00017292591790390665, "epoch": 0.27906976744186046, "step": 348 }, { "loss": 0.0147, "grad_norm": 0.1654537171125412, "learning_rate": 0.00017274410771149094, "epoch": 0.27987169206094625, "step": 349 }, { "loss": 0.0372, "grad_norm": 0.2641878128051758, "learning_rate": 0.0001725617853729316, "epoch": 0.2806736166800321, "step": 350 }, { "loss": 0.0438, "grad_norm": 0.4984488785266876, "learning_rate": 0.00017237895217184703, "epoch": 0.2814755412991179, "step": 351 }, { "loss": 0.0641, "grad_norm": 0.4201189875602722, "learning_rate": 0.00017219560939545246, "epoch": 0.2822774659182037, "step": 352 }, { "loss": 0.0582, "grad_norm": 0.3273194134235382, "learning_rate": 0.00017201175833455066, "epoch": 0.2830793905372895, "step": 353 }, { "loss": 0.0358, "grad_norm": 0.2902083396911621, "learning_rate": 0.0001718274002835229, "epoch": 0.2838813151563753, "step": 354 }, { "loss": 0.0383, "grad_norm": 0.1811976581811905, "learning_rate": 0.00017164253654031986, "epoch": 0.2846832397754611, "step": 355 }, { "loss": 0.0519, "grad_norm": 0.4728938639163971, "learning_rate": 0.00017145716840645254, "epoch": 0.2854851643945469, "step": 356 }, { "loss": 0.0437, "grad_norm": 0.48397713899612427, "learning_rate": 0.00017127129718698297, "epoch": 0.2862870890136327, "step": 357 }, { "loss": 0.0416, "grad_norm": 0.3491261303424835, "learning_rate": 0.0001710849241905151, "epoch": 0.28708901363271855, "step": 358 }, { "loss": 0.0688, "grad_norm": 0.4765617251396179, "learning_rate": 0.00017089805072918567, "epoch": 0.28789093825180434, "step": 359 }, { "loss": 0.0959, "grad_norm": 0.7366757988929749, "learning_rate": 0.00017071067811865476, "epoch": 0.28869286287089013, "step": 360 }, { "loss": 0.033, "grad_norm": 0.3149030804634094, "learning_rate": 0.00017052280767809673, "epoch": 0.2894947874899759, "step": 361 }, { "loss": 0.0458, "grad_norm": 0.3187673091888428, "learning_rate": 0.00017033444073019077, "epoch": 0.29029671210906177, "step": 362 }, { "loss": 0.0446, "grad_norm": 0.3986169099807739, "learning_rate": 0.0001701455786011118, "epoch": 0.29109863672814756, "step": 363 }, { "loss": 0.0341, "grad_norm": 0.3107149302959442, "learning_rate": 0.00016995622262052092, "epoch": 0.29190056134723336, "step": 364 }, { "loss": 0.0392, "grad_norm": 0.38037049770355225, "learning_rate": 0.00016976637412155612, "epoch": 0.29270248596631915, "step": 365 }, { "loss": 0.0406, "grad_norm": 0.35384100675582886, "learning_rate": 0.00016957603444082295, "epoch": 0.293504410585405, "step": 366 }, { "loss": 0.0676, "grad_norm": 0.6596208810806274, "learning_rate": 0.000169385204918385, "epoch": 0.2943063352044908, "step": 367 }, { "loss": 0.0496, "grad_norm": 0.3856953978538513, "learning_rate": 0.00016919388689775464, "epoch": 0.2951082598235766, "step": 368 }, { "loss": 0.0541, "grad_norm": 0.3974038362503052, "learning_rate": 0.00016900208172588332, "epoch": 0.29591018444266237, "step": 369 }, { "loss": 0.0419, "grad_norm": 0.40319862961769104, "learning_rate": 0.00016880979075315237, "epoch": 0.2967121090617482, "step": 370 }, { "loss": 0.0565, "grad_norm": 0.27359071373939514, "learning_rate": 0.00016861701533336322, "epoch": 0.297514033680834, "step": 371 }, { "loss": 0.0262, "grad_norm": 0.351244181394577, "learning_rate": 0.00016842375682372805, "epoch": 0.2983159582999198, "step": 372 }, { "eval_loss": 0.05102652311325073, "eval_runtime": 31.7718, "eval_samples_per_second": 33.048, "eval_steps_per_second": 8.278, "epoch": 0.2983159582999198, "step": 372 }, { "loss": 0.0428, "grad_norm": 0.42074060440063477, "learning_rate": 0.00016823001658486012, "epoch": 0.2991178829190056, "step": 373 }, { "loss": 0.0224, "grad_norm": 0.2260231077671051, "learning_rate": 0.00016803579598076432, "epoch": 0.29991980753809144, "step": 374 }, { "loss": 0.0492, "grad_norm": 0.47774842381477356, "learning_rate": 0.0001678410963788275, "epoch": 0.30072173215717724, "step": 375 }, { "loss": 0.0638, "grad_norm": 0.5587054491043091, "learning_rate": 0.0001676459191498087, "epoch": 0.301523656776263, "step": 376 }, { "loss": 0.0707, "grad_norm": 0.4895194172859192, "learning_rate": 0.0001674502656678298, "epoch": 0.3023255813953488, "step": 377 }, { "loss": 0.0279, "grad_norm": 0.24737556278705597, "learning_rate": 0.00016725413731036561, "epoch": 0.30312750601443467, "step": 378 }, { "loss": 0.0305, "grad_norm": 0.35510316491127014, "learning_rate": 0.00016705753545823423, "epoch": 0.30392943063352046, "step": 379 }, { "loss": 0.0204, "grad_norm": 0.253121942281723, "learning_rate": 0.00016686046149558736, "epoch": 0.30473135525260625, "step": 380 }, { "loss": 0.0226, "grad_norm": 0.25866273045539856, "learning_rate": 0.00016666291680990055, "epoch": 0.30553327987169204, "step": 381 }, { "loss": 0.0829, "grad_norm": 0.4675450325012207, "learning_rate": 0.00016646490279196343, "epoch": 0.3063352044907779, "step": 382 }, { "loss": 0.0203, "grad_norm": 0.30080100893974304, "learning_rate": 0.00016626642083586985, "epoch": 0.3071371291098637, "step": 383 }, { "loss": 0.0454, "grad_norm": 0.5222088694572449, "learning_rate": 0.00016606747233900815, "epoch": 0.3079390537289495, "step": 384 }, { "loss": 0.0502, "grad_norm": 0.32578209042549133, "learning_rate": 0.00016586805870205134, "epoch": 0.30874097834803527, "step": 385 }, { "loss": 0.0567, "grad_norm": 0.3294476568698883, "learning_rate": 0.0001656681813289471, "epoch": 0.3095429029671211, "step": 386 }, { "loss": 0.0817, "grad_norm": 0.7187215685844421, "learning_rate": 0.0001654678416269081, "epoch": 0.3103448275862069, "step": 387 }, { "loss": 0.0305, "grad_norm": 0.31030380725860596, "learning_rate": 0.0001652670410064019, "epoch": 0.3111467522052927, "step": 388 }, { "loss": 0.0614, "grad_norm": 0.5844921469688416, "learning_rate": 0.00016506578088114107, "epoch": 0.3119486768243785, "step": 389 }, { "loss": 0.0291, "grad_norm": 0.2818225622177124, "learning_rate": 0.00016486406266807345, "epoch": 0.31275060144346434, "step": 390 }, { "loss": 0.1276, "grad_norm": 0.6056419610977173, "learning_rate": 0.0001646618877873717, "epoch": 0.31355252606255013, "step": 391 }, { "loss": 0.0534, "grad_norm": 0.36668699979782104, "learning_rate": 0.00016445925766242391, "epoch": 0.3143544506816359, "step": 392 }, { "loss": 0.031, "grad_norm": 0.34223347902297974, "learning_rate": 0.00016425617371982303, "epoch": 0.3151563753007217, "step": 393 }, { "loss": 0.0862, "grad_norm": 0.394709050655365, "learning_rate": 0.00016405263738935718, "epoch": 0.31595829991980756, "step": 394 }, { "loss": 0.0489, "grad_norm": 0.41530197858810425, "learning_rate": 0.00016384865010399935, "epoch": 0.31676022453889335, "step": 395 }, { "loss": 0.1056, "grad_norm": 0.450509637594223, "learning_rate": 0.00016364421329989755, "epoch": 0.31756214915797915, "step": 396 }, { "loss": 0.0511, "grad_norm": 0.4890766441822052, "learning_rate": 0.00016343932841636456, "epoch": 0.31836407377706494, "step": 397 }, { "loss": 0.16, "grad_norm": 0.6917940974235535, "learning_rate": 0.00016323399689586768, "epoch": 0.3191659983961508, "step": 398 }, { "loss": 0.0584, "grad_norm": 0.4217245280742645, "learning_rate": 0.00016302822018401884, "epoch": 0.3199679230152366, "step": 399 }, { "loss": 0.0471, "grad_norm": 0.33742472529411316, "learning_rate": 0.00016282199972956425, "epoch": 0.32076984763432237, "step": 400 }, { "loss": 0.0561, "grad_norm": 0.30320796370506287, "learning_rate": 0.00016261533698437418, "epoch": 0.32157177225340816, "step": 401 }, { "loss": 0.0504, "grad_norm": 0.41129252314567566, "learning_rate": 0.00016240823340343285, "epoch": 0.322373696872494, "step": 402 }, { "loss": 0.0191, "grad_norm": 0.21539658308029175, "learning_rate": 0.00016220069044482814, "epoch": 0.3231756214915798, "step": 403 }, { "loss": 0.0846, "grad_norm": 0.5003443360328674, "learning_rate": 0.00016199270956974128, "epoch": 0.3239775461106656, "step": 404 }, { "loss": 0.0821, "grad_norm": 0.3936382532119751, "learning_rate": 0.00016178429224243663, "epoch": 0.3247794707297514, "step": 405 }, { "loss": 0.1342, "grad_norm": 1.055274248123169, "learning_rate": 0.00016157543993025134, "epoch": 0.32558139534883723, "step": 406 }, { "loss": 0.0784, "grad_norm": 0.33087801933288574, "learning_rate": 0.00016136615410358493, "epoch": 0.326383319967923, "step": 407 }, { "loss": 0.0415, "grad_norm": 0.27356383204460144, "learning_rate": 0.00016115643623588915, "epoch": 0.3271852445870088, "step": 408 }, { "loss": 0.0449, "grad_norm": 0.39037784934043884, "learning_rate": 0.00016094628780365743, "epoch": 0.3279871692060946, "step": 409 }, { "loss": 0.0643, "grad_norm": 0.3727872967720032, "learning_rate": 0.00016073571028641452, "epoch": 0.32878909382518046, "step": 410 }, { "loss": 0.0366, "grad_norm": 0.30508482456207275, "learning_rate": 0.0001605247051667061, "epoch": 0.32959101844426625, "step": 411 }, { "loss": 0.0384, "grad_norm": 0.313531756401062, "learning_rate": 0.00016031327393008845, "epoch": 0.33039294306335204, "step": 412 }, { "loss": 0.034, "grad_norm": 0.3675989806652069, "learning_rate": 0.00016010141806511766, "epoch": 0.33119486768243783, "step": 413 }, { "loss": 0.0383, "grad_norm": 0.2861047685146332, "learning_rate": 0.00015988913906333946, "epoch": 0.3319967923015237, "step": 414 }, { "loss": 0.0232, "grad_norm": 0.30425795912742615, "learning_rate": 0.0001596764384192787, "epoch": 0.33279871692060947, "step": 415 }, { "loss": 0.0582, "grad_norm": 0.5757021307945251, "learning_rate": 0.00015946331763042867, "epoch": 0.33360064153969526, "step": 416 }, { "loss": 0.0341, "grad_norm": 0.2700221538543701, "learning_rate": 0.00015924977819724068, "epoch": 0.33440256615878106, "step": 417 }, { "loss": 0.0458, "grad_norm": 0.3216298818588257, "learning_rate": 0.00015903582162311337, "epoch": 0.3352044907778669, "step": 418 }, { "loss": 0.0497, "grad_norm": 0.2954160273075104, "learning_rate": 0.00015882144941438233, "epoch": 0.3360064153969527, "step": 419 }, { "loss": 0.0345, "grad_norm": 0.30057498812675476, "learning_rate": 0.00015860666308030932, "epoch": 0.3368083400160385, "step": 420 }, { "loss": 0.046, "grad_norm": 0.31479984521865845, "learning_rate": 0.00015839146413307165, "epoch": 0.3376102646351243, "step": 421 }, { "loss": 0.0455, "grad_norm": 0.31836825609207153, "learning_rate": 0.00015817585408775168, "epoch": 0.3384121892542101, "step": 422 }, { "loss": 0.021, "grad_norm": 0.24365834891796112, "learning_rate": 0.000157959834462326, "epoch": 0.3392141138732959, "step": 423 }, { "loss": 0.0354, "grad_norm": 0.38124316930770874, "learning_rate": 0.0001577434067776548, "epoch": 0.3400160384923817, "step": 424 }, { "loss": 0.0853, "grad_norm": 0.6972952485084534, "learning_rate": 0.00015752657255747122, "epoch": 0.3408179631114675, "step": 425 }, { "loss": 0.0189, "grad_norm": 0.2013692706823349, "learning_rate": 0.00015730933332837045, "epoch": 0.34161988773055335, "step": 426 }, { "loss": 0.0561, "grad_norm": 0.3334507346153259, "learning_rate": 0.00015709169061979913, "epoch": 0.34242181234963914, "step": 427 }, { "loss": 0.0652, "grad_norm": 0.8858683109283447, "learning_rate": 0.0001568736459640447, "epoch": 0.34322373696872494, "step": 428 }, { "loss": 0.0267, "grad_norm": 0.2540907561779022, "learning_rate": 0.00015665520089622423, "epoch": 0.3440256615878107, "step": 429 }, { "loss": 0.0475, "grad_norm": 0.3518412709236145, "learning_rate": 0.00015643635695427403, "epoch": 0.3448275862068966, "step": 430 }, { "loss": 0.0311, "grad_norm": 0.17859573662281036, "learning_rate": 0.00015621711567893854, "epoch": 0.34562951082598237, "step": 431 }, { "loss": 0.0202, "grad_norm": 0.3969719409942627, "learning_rate": 0.00015599747861375955, "epoch": 0.34643143544506816, "step": 432 }, { "loss": 0.0249, "grad_norm": 0.21450327336788177, "learning_rate": 0.00015577744730506545, "epoch": 0.34723336006415395, "step": 433 }, { "loss": 0.0416, "grad_norm": 0.37466296553611755, "learning_rate": 0.00015555702330196023, "epoch": 0.3480352846832398, "step": 434 }, { "loss": 0.064, "grad_norm": 0.5470214486122131, "learning_rate": 0.00015533620815631256, "epoch": 0.3488372093023256, "step": 435 }, { "loss": 0.0988, "grad_norm": 0.6237538456916809, "learning_rate": 0.0001551150034227449, "epoch": 0.3496391339214114, "step": 436 }, { "loss": 0.1344, "grad_norm": 0.5647206902503967, "learning_rate": 0.0001548934106586226, "epoch": 0.3504410585404972, "step": 437 }, { "loss": 0.0561, "grad_norm": 0.326889306306839, "learning_rate": 0.0001546714314240429, "epoch": 0.351242983159583, "step": 438 }, { "loss": 0.0785, "grad_norm": 0.4708334803581238, "learning_rate": 0.00015444906728182385, "epoch": 0.3520449077786688, "step": 439 }, { "loss": 0.0392, "grad_norm": 0.4006723165512085, "learning_rate": 0.00015422631979749354, "epoch": 0.3528468323977546, "step": 440 }, { "loss": 0.0291, "grad_norm": 0.25906902551651, "learning_rate": 0.00015400319053927874, "epoch": 0.3536487570168404, "step": 441 }, { "loss": 0.0242, "grad_norm": 0.31759947538375854, "learning_rate": 0.00015377968107809425, "epoch": 0.35445068163592625, "step": 442 }, { "loss": 0.0368, "grad_norm": 0.2436400055885315, "learning_rate": 0.00015355579298753153, "epoch": 0.35525260625501204, "step": 443 }, { "loss": 0.0704, "grad_norm": 0.4932403564453125, "learning_rate": 0.00015333152784384777, "epoch": 0.35605453087409783, "step": 444 }, { "loss": 0.0529, "grad_norm": 0.4474373757839203, "learning_rate": 0.00015310688722595473, "epoch": 0.3568564554931836, "step": 445 }, { "loss": 0.0773, "grad_norm": 0.5451852679252625, "learning_rate": 0.00015288187271540767, "epoch": 0.35765838011226947, "step": 446 }, { "loss": 0.0567, "grad_norm": 0.3486538231372833, "learning_rate": 0.00015265648589639423, "epoch": 0.35846030473135526, "step": 447 }, { "loss": 0.0445, "grad_norm": 0.33438971638679504, "learning_rate": 0.00015243072835572318, "epoch": 0.35926222935044105, "step": 448 }, { "loss": 0.0656, "grad_norm": 0.6021797060966492, "learning_rate": 0.00015220460168281335, "epoch": 0.36006415396952685, "step": 449 }, { "loss": 0.0539, "grad_norm": 0.2629101276397705, "learning_rate": 0.0001519781074696824, "epoch": 0.3608660785886127, "step": 450 }, { "loss": 0.0559, "grad_norm": 0.38639259338378906, "learning_rate": 0.00015175124731093553, "epoch": 0.3616680032076985, "step": 451 }, { "loss": 0.0632, "grad_norm": 0.40031421184539795, "learning_rate": 0.00015152402280375454, "epoch": 0.3624699278267843, "step": 452 }, { "loss": 0.0196, "grad_norm": 0.24561044573783875, "learning_rate": 0.00015129643554788612, "epoch": 0.36327185244587007, "step": 453 }, { "loss": 0.0563, "grad_norm": 0.8373734354972839, "learning_rate": 0.00015106848714563112, "epoch": 0.3640737770649559, "step": 454 }, { "loss": 0.0388, "grad_norm": 0.38167354464530945, "learning_rate": 0.00015084017920183272, "epoch": 0.3648757016840417, "step": 455 }, { "loss": 0.0506, "grad_norm": 0.46959736943244934, "learning_rate": 0.00015061151332386566, "epoch": 0.3656776263031275, "step": 456 }, { "loss": 0.0314, "grad_norm": 0.30401480197906494, "learning_rate": 0.00015038249112162445, "epoch": 0.3664795509222133, "step": 457 }, { "loss": 0.0466, "grad_norm": 0.2866615355014801, "learning_rate": 0.00015015311420751244, "epoch": 0.36728147554129914, "step": 458 }, { "loss": 0.0963, "grad_norm": 0.787212073802948, "learning_rate": 0.00014992338419643022, "epoch": 0.36808340016038493, "step": 459 }, { "loss": 0.0872, "grad_norm": 0.49554625153541565, "learning_rate": 0.00014969330270576427, "epoch": 0.3688853247794707, "step": 460 }, { "loss": 0.0268, "grad_norm": 0.2807726562023163, "learning_rate": 0.0001494628713553757, "epoch": 0.3696872493985565, "step": 461 }, { "loss": 0.0488, "grad_norm": 0.28138288855552673, "learning_rate": 0.0001492320917675887, "epoch": 0.37048917401764236, "step": 462 }, { "loss": 0.0582, "grad_norm": 0.3524348735809326, "learning_rate": 0.0001490009655671792, "epoch": 0.37129109863672816, "step": 463 }, { "loss": 0.0627, "grad_norm": 0.38492485880851746, "learning_rate": 0.00014876949438136347, "epoch": 0.37209302325581395, "step": 464 }, { "loss": 0.0581, "grad_norm": 0.31561005115509033, "learning_rate": 0.0001485376798397865, "epoch": 0.37289494787489974, "step": 465 }, { "loss": 0.0437, "grad_norm": 0.30238181352615356, "learning_rate": 0.00014830552357451076, "epoch": 0.3736968724939856, "step": 466 }, { "loss": 0.0498, "grad_norm": 0.3918459117412567, "learning_rate": 0.00014807302722000447, "epoch": 0.3744987971130714, "step": 467 }, { "loss": 0.0245, "grad_norm": 0.20536094903945923, "learning_rate": 0.00014784019241313026, "epoch": 0.37530072173215717, "step": 468 }, { "loss": 0.0327, "grad_norm": 0.2256690412759781, "learning_rate": 0.0001476070207931336, "epoch": 0.37610264635124296, "step": 469 }, { "loss": 0.0626, "grad_norm": 0.42872869968414307, "learning_rate": 0.00014737351400163128, "epoch": 0.3769045709703288, "step": 470 }, { "loss": 0.0555, "grad_norm": 0.3690952658653259, "learning_rate": 0.0001471396736825998, "epoch": 0.3777064955894146, "step": 471 }, { "loss": 0.0675, "grad_norm": 0.4958707094192505, "learning_rate": 0.0001469055014823637, "epoch": 0.3785084202085004, "step": 472 }, { "loss": 0.0505, "grad_norm": 0.319414883852005, "learning_rate": 0.0001466709990495843, "epoch": 0.3793103448275862, "step": 473 }, { "loss": 0.0455, "grad_norm": 0.34806713461875916, "learning_rate": 0.00014643616803524778, "epoch": 0.38011226944667204, "step": 474 }, { "loss": 0.0284, "grad_norm": 0.25858795642852783, "learning_rate": 0.0001462010100926536, "epoch": 0.3809141940657578, "step": 475 }, { "loss": 0.0587, "grad_norm": 0.39808589220046997, "learning_rate": 0.00014596552687740302, "epoch": 0.3817161186848436, "step": 476 }, { "loss": 0.0476, "grad_norm": 0.4907149374485016, "learning_rate": 0.00014572972004738732, "epoch": 0.3825180433039294, "step": 477 }, { "loss": 0.0506, "grad_norm": 0.30614417791366577, "learning_rate": 0.00014549359126277608, "epoch": 0.38331996792301526, "step": 478 }, { "loss": 0.0259, "grad_norm": 0.3281151354312897, "learning_rate": 0.00014525714218600565, "epoch": 0.38412189254210105, "step": 479 }, { "loss": 0.0376, "grad_norm": 0.34824758768081665, "learning_rate": 0.00014502037448176734, "epoch": 0.38492381716118684, "step": 480 }, { "loss": 0.0425, "grad_norm": 0.2705196440219879, "learning_rate": 0.00014478328981699568, "epoch": 0.38572574178027264, "step": 481 }, { "loss": 0.0466, "grad_norm": 0.2696325480937958, "learning_rate": 0.00014454588986085676, "epoch": 0.3865276663993585, "step": 482 }, { "loss": 0.0372, "grad_norm": 0.3687107264995575, "learning_rate": 0.00014430817628473638, "epoch": 0.3873295910184443, "step": 483 }, { "loss": 0.0554, "grad_norm": 0.3724960684776306, "learning_rate": 0.00014407015076222846, "epoch": 0.38813151563753007, "step": 484 }, { "loss": 0.079, "grad_norm": 0.5664525032043457, "learning_rate": 0.000143831814969123, "epoch": 0.38893344025661586, "step": 485 }, { "loss": 0.0318, "grad_norm": 0.20477205514907837, "learning_rate": 0.00014359317058339457, "epoch": 0.3897353648757017, "step": 486 }, { "loss": 0.0456, "grad_norm": 0.3792808949947357, "learning_rate": 0.0001433542192851902, "epoch": 0.3905372894947875, "step": 487 }, { "loss": 0.0253, "grad_norm": 0.26179176568984985, "learning_rate": 0.00014311496275681783, "epoch": 0.3913392141138733, "step": 488 }, { "loss": 0.0398, "grad_norm": 0.29624319076538086, "learning_rate": 0.00014287540268273426, "epoch": 0.3921411387329591, "step": 489 }, { "loss": 0.0425, "grad_norm": 0.3284585773944855, "learning_rate": 0.00014263554074953337, "epoch": 0.39294306335204493, "step": 490 }, { "loss": 0.0277, "grad_norm": 0.23194313049316406, "learning_rate": 0.00014239537864593432, "epoch": 0.3937449879711307, "step": 491 }, { "loss": 0.047, "grad_norm": 0.557132363319397, "learning_rate": 0.00014215491806276944, "epoch": 0.3945469125902165, "step": 492 }, { "loss": 0.0495, "grad_norm": 0.3186132311820984, "learning_rate": 0.0001419141606929726, "epoch": 0.3953488372093023, "step": 493 }, { "loss": 0.0476, "grad_norm": 0.4139769375324249, "learning_rate": 0.0001416731082315671, "epoch": 0.39615076182838815, "step": 494 }, { "loss": 0.0707, "grad_norm": 0.6908156275749207, "learning_rate": 0.00014143176237565387, "epoch": 0.39695268644747395, "step": 495 }, { "loss": 0.0328, "grad_norm": 0.47614389657974243, "learning_rate": 0.0001411901248243993, "epoch": 0.39775461106655974, "step": 496 }, { "eval_loss": 0.04790589585900307, "eval_runtime": 31.9045, "eval_samples_per_second": 32.911, "eval_steps_per_second": 8.243, "epoch": 0.39775461106655974, "step": 496 }, { "loss": 0.0491, "grad_norm": 0.4075859487056732, "learning_rate": 0.00014094819727902353, "epoch": 0.39855653568564553, "step": 497 }, { "loss": 0.0679, "grad_norm": 0.2855551838874817, "learning_rate": 0.0001407059814427884, "epoch": 0.3993584603047314, "step": 498 }, { "loss": 0.0366, "grad_norm": 0.7473935484886169, "learning_rate": 0.00014046347902098535, "epoch": 0.40016038492381717, "step": 499 }, { "loss": 0.0177, "grad_norm": 0.16580775380134583, "learning_rate": 0.00014022069172092352, "epoch": 0.40096230954290296, "step": 500 }, { "loss": 0.04, "grad_norm": 0.3346802592277527, "learning_rate": 0.00013997762125191773, "epoch": 0.40176423416198875, "step": 501 }, { "loss": 0.065, "grad_norm": 0.5194714069366455, "learning_rate": 0.00013973426932527636, "epoch": 0.4025661587810746, "step": 502 }, { "loss": 0.0412, "grad_norm": 0.25542134046554565, "learning_rate": 0.00013949063765428943, "epoch": 0.4033680834001604, "step": 503 }, { "loss": 0.0768, "grad_norm": 0.46887674927711487, "learning_rate": 0.00013924672795421637, "epoch": 0.4041700080192462, "step": 504 }, { "loss": 0.0508, "grad_norm": 0.3275587558746338, "learning_rate": 0.00013900254194227415, "epoch": 0.404971932638332, "step": 505 }, { "loss": 0.0253, "grad_norm": 0.2020861655473709, "learning_rate": 0.000138758081337625, "epoch": 0.4057738572574178, "step": 506 }, { "loss": 0.0322, "grad_norm": 0.5022090673446655, "learning_rate": 0.0001385133478613644, "epoch": 0.4065757818765036, "step": 507 }, { "loss": 0.121, "grad_norm": 1.1316415071487427, "learning_rate": 0.000138268343236509, "epoch": 0.4073777064955894, "step": 508 }, { "loss": 0.0239, "grad_norm": 0.29626041650772095, "learning_rate": 0.00013802306918798437, "epoch": 0.4081796311146752, "step": 509 }, { "loss": 0.0354, "grad_norm": 0.19514746963977814, "learning_rate": 0.00013777752744261295, "epoch": 0.40898155573376105, "step": 510 }, { "loss": 0.0692, "grad_norm": 0.4436163902282715, "learning_rate": 0.0001375317197291019, "epoch": 0.40978348035284684, "step": 511 }, { "loss": 0.0427, "grad_norm": 0.36557817459106445, "learning_rate": 0.00013728564777803088, "epoch": 0.41058540497193263, "step": 512 }, { "loss": 0.0407, "grad_norm": 0.3514234721660614, "learning_rate": 0.00013703931332183987, "epoch": 0.4113873295910184, "step": 513 }, { "loss": 0.0235, "grad_norm": 0.24922512471675873, "learning_rate": 0.00013679271809481693, "epoch": 0.41218925421010427, "step": 514 }, { "loss": 0.0492, "grad_norm": 0.4417109787464142, "learning_rate": 0.00013654586383308619, "epoch": 0.41299117882919006, "step": 515 }, { "loss": 0.0973, "grad_norm": 0.5984606146812439, "learning_rate": 0.00013629875227459532, "epoch": 0.41379310344827586, "step": 516 }, { "loss": 0.0597, "grad_norm": 0.5426322221755981, "learning_rate": 0.0001360513851591036, "epoch": 0.41459502806736165, "step": 517 }, { "loss": 0.081, "grad_norm": 0.7733796238899231, "learning_rate": 0.00013580376422816945, "epoch": 0.4153969526864475, "step": 518 }, { "loss": 0.031, "grad_norm": 0.33183905482292175, "learning_rate": 0.00013555589122513827, "epoch": 0.4161988773055333, "step": 519 }, { "loss": 0.0592, "grad_norm": 0.4072870910167694, "learning_rate": 0.0001353077678951301, "epoch": 0.4170008019246191, "step": 520 }, { "loss": 0.0523, "grad_norm": 0.3927518427371979, "learning_rate": 0.0001350593959850274, "epoch": 0.41780272654370487, "step": 521 }, { "loss": 0.0332, "grad_norm": 0.3755587637424469, "learning_rate": 0.00013481077724346278, "epoch": 0.4186046511627907, "step": 522 }, { "loss": 0.1049, "grad_norm": 0.5004737377166748, "learning_rate": 0.0001345619134208066, "epoch": 0.4194065757818765, "step": 523 }, { "loss": 0.0878, "grad_norm": 0.3315165042877197, "learning_rate": 0.00013431280626915467, "epoch": 0.4202085004009623, "step": 524 }, { "loss": 0.0339, "grad_norm": 0.27768945693969727, "learning_rate": 0.00013406345754231588, "epoch": 0.4210104250200481, "step": 525 }, { "loss": 0.0433, "grad_norm": 0.3195447325706482, "learning_rate": 0.00013381386899580003, "epoch": 0.42181234963913394, "step": 526 }, { "loss": 0.028, "grad_norm": 0.2721582055091858, "learning_rate": 0.00013356404238680527, "epoch": 0.42261427425821974, "step": 527 }, { "loss": 0.0324, "grad_norm": 0.2353498488664627, "learning_rate": 0.00013331397947420576, "epoch": 0.4234161988773055, "step": 528 }, { "loss": 0.0572, "grad_norm": 0.49510321021080017, "learning_rate": 0.0001330636820185394, "epoch": 0.4242181234963913, "step": 529 }, { "loss": 0.0586, "grad_norm": 0.5035674571990967, "learning_rate": 0.00013281315178199536, "epoch": 0.42502004811547717, "step": 530 }, { "loss": 0.0337, "grad_norm": 0.761020839214325, "learning_rate": 0.00013256239052840155, "epoch": 0.42582197273456296, "step": 531 }, { "loss": 0.0587, "grad_norm": 0.2618282735347748, "learning_rate": 0.00013231140002321253, "epoch": 0.42662389735364875, "step": 532 }, { "loss": 0.0257, "grad_norm": 0.2896956503391266, "learning_rate": 0.0001320601820334967, "epoch": 0.42742582197273454, "step": 533 }, { "loss": 0.0461, "grad_norm": 0.48962509632110596, "learning_rate": 0.00013180873832792416, "epoch": 0.4282277465918204, "step": 534 }, { "loss": 0.0093, "grad_norm": 0.13504081964492798, "learning_rate": 0.00013155707067675406, "epoch": 0.4290296712109062, "step": 535 }, { "loss": 0.0417, "grad_norm": 0.3743266463279724, "learning_rate": 0.00013130518085182225, "epoch": 0.429831595829992, "step": 536 }, { "loss": 0.0343, "grad_norm": 0.29630181193351746, "learning_rate": 0.00013105307062652872, "epoch": 0.43063352044907777, "step": 537 }, { "loss": 0.0291, "grad_norm": 0.25488558411598206, "learning_rate": 0.00013080074177582526, "epoch": 0.4314354450681636, "step": 538 }, { "loss": 0.091, "grad_norm": 0.4586013853549957, "learning_rate": 0.00013054819607620274, "epoch": 0.4322373696872494, "step": 539 }, { "loss": 0.1163, "grad_norm": 0.7305994033813477, "learning_rate": 0.00013029543530567884, "epoch": 0.4330392943063352, "step": 540 }, { "loss": 0.0339, "grad_norm": 0.234614759683609, "learning_rate": 0.00013004246124378535, "epoch": 0.433841218925421, "step": 541 }, { "loss": 0.0321, "grad_norm": 0.2804659903049469, "learning_rate": 0.00012978927567155573, "epoch": 0.43464314354450684, "step": 542 }, { "loss": 0.0514, "grad_norm": 0.5687031745910645, "learning_rate": 0.0001295358803715126, "epoch": 0.43544506816359263, "step": 543 }, { "loss": 0.0824, "grad_norm": 0.583227276802063, "learning_rate": 0.00012928227712765504, "epoch": 0.4362469927826784, "step": 544 }, { "loss": 0.0453, "grad_norm": 0.31921252608299255, "learning_rate": 0.00012902846772544624, "epoch": 0.4370489174017642, "step": 545 }, { "loss": 0.0495, "grad_norm": 0.4188879430294037, "learning_rate": 0.00012877445395180078, "epoch": 0.43785084202085006, "step": 546 }, { "loss": 0.0393, "grad_norm": 0.2866995334625244, "learning_rate": 0.00012852023759507203, "epoch": 0.43865276663993585, "step": 547 }, { "loss": 0.0772, "grad_norm": 0.48335814476013184, "learning_rate": 0.00012826582044503978, "epoch": 0.43945469125902165, "step": 548 }, { "loss": 0.0537, "grad_norm": 0.3400033414363861, "learning_rate": 0.0001280112042928973, "epoch": 0.44025661587810744, "step": 549 }, { "loss": 0.0503, "grad_norm": 0.43847382068634033, "learning_rate": 0.00012775639093123907, "epoch": 0.4410585404971933, "step": 550 }, { "loss": 0.0659, "grad_norm": 0.3055131137371063, "learning_rate": 0.00012750138215404782, "epoch": 0.4418604651162791, "step": 551 }, { "loss": 0.0532, "grad_norm": 0.31449994444847107, "learning_rate": 0.0001272461797566823, "epoch": 0.44266238973536487, "step": 552 }, { "loss": 0.0273, "grad_norm": 0.39831122756004333, "learning_rate": 0.00012699078553586422, "epoch": 0.44346431435445066, "step": 553 }, { "loss": 0.04, "grad_norm": 0.464834600687027, "learning_rate": 0.00012673520128966592, "epoch": 0.4442662389735365, "step": 554 }, { "loss": 0.0679, "grad_norm": 0.3944595158100128, "learning_rate": 0.00012647942881749755, "epoch": 0.4450681635926223, "step": 555 }, { "loss": 0.0271, "grad_norm": 0.21679094433784485, "learning_rate": 0.00012622346992009447, "epoch": 0.4458700882117081, "step": 556 }, { "loss": 0.0349, "grad_norm": 0.34640711545944214, "learning_rate": 0.00012596732639950442, "epoch": 0.4466720128307939, "step": 557 }, { "loss": 0.0445, "grad_norm": 0.5096455216407776, "learning_rate": 0.00012571100005907523, "epoch": 0.44747393744987973, "step": 558 }, { "loss": 0.0544, "grad_norm": 0.35034018754959106, "learning_rate": 0.0001254544927034415, "epoch": 0.4482758620689655, "step": 559 }, { "loss": 0.1161, "grad_norm": 0.4701795279979706, "learning_rate": 0.00012519780613851254, "epoch": 0.4490777866880513, "step": 560 }, { "loss": 0.0259, "grad_norm": 0.25175973773002625, "learning_rate": 0.00012494094217145918, "epoch": 0.4498797113071371, "step": 561 }, { "loss": 0.0431, "grad_norm": 0.30269894003868103, "learning_rate": 0.00012468390261070138, "epoch": 0.45068163592622296, "step": 562 }, { "loss": 0.0234, "grad_norm": 0.23327726125717163, "learning_rate": 0.0001244266892658952, "epoch": 0.45148356054530875, "step": 563 }, { "loss": 0.0433, "grad_norm": 0.26909253001213074, "learning_rate": 0.00012416930394792026, "epoch": 0.45228548516439454, "step": 564 }, { "loss": 0.0676, "grad_norm": 0.4461866319179535, "learning_rate": 0.00012391174846886698, "epoch": 0.45308740978348033, "step": 565 }, { "loss": 0.0461, "grad_norm": 0.4100785553455353, "learning_rate": 0.0001236540246420237, "epoch": 0.4538893344025662, "step": 566 }, { "loss": 0.0338, "grad_norm": 0.35902178287506104, "learning_rate": 0.00012339613428186407, "epoch": 0.454691259021652, "step": 567 }, { "loss": 0.0544, "grad_norm": 0.43561217188835144, "learning_rate": 0.00012313807920403419, "epoch": 0.45549318364073776, "step": 568 }, { "loss": 0.0476, "grad_norm": 0.34299418330192566, "learning_rate": 0.0001228798612253397, "epoch": 0.45629510825982356, "step": 569 }, { "loss": 0.1276, "grad_norm": 0.5789246559143066, "learning_rate": 0.00012262148216373331, "epoch": 0.4570970328789094, "step": 570 }, { "loss": 0.0243, "grad_norm": 0.42919760942459106, "learning_rate": 0.00012236294383830175, "epoch": 0.4578989574979952, "step": 571 }, { "loss": 0.0459, "grad_norm": 0.24285271763801575, "learning_rate": 0.00012210424806925301, "epoch": 0.458700882117081, "step": 572 }, { "loss": 0.0573, "grad_norm": 0.46728515625, "learning_rate": 0.00012184539667790349, "epoch": 0.4595028067361668, "step": 573 }, { "loss": 0.0543, "grad_norm": 0.2979477643966675, "learning_rate": 0.00012158639148666534, "epoch": 0.4603047313552526, "step": 574 }, { "loss": 0.0613, "grad_norm": 0.35671502351760864, "learning_rate": 0.00012132723431903341, "epoch": 0.4611066559743384, "step": 575 }, { "loss": 0.0328, "grad_norm": 0.279118150472641, "learning_rate": 0.00012106792699957263, "epoch": 0.4619085805934242, "step": 576 }, { "loss": 0.0595, "grad_norm": 0.6142110824584961, "learning_rate": 0.000120808471353905, "epoch": 0.46271050521251, "step": 577 }, { "loss": 0.0691, "grad_norm": 0.7308236956596375, "learning_rate": 0.00012054886920869681, "epoch": 0.46351242983159585, "step": 578 }, { "loss": 0.0528, "grad_norm": 0.45223355293273926, "learning_rate": 0.00012028912239164569, "epoch": 0.46431435445068164, "step": 579 }, { "loss": 0.0373, "grad_norm": 0.2948494255542755, "learning_rate": 0.00012002923273146794, "epoch": 0.46511627906976744, "step": 580 }, { "loss": 0.0414, "grad_norm": 0.27661287784576416, "learning_rate": 0.00011976920205788542, "epoch": 0.4659182036888532, "step": 581 }, { "loss": 0.0578, "grad_norm": 0.4644034504890442, "learning_rate": 0.00011950903220161285, "epoch": 0.4667201283079391, "step": 582 }, { "loss": 0.0565, "grad_norm": 0.6451210379600525, "learning_rate": 0.00011924872499434479, "epoch": 0.46752205292702487, "step": 583 }, { "loss": 0.0231, "grad_norm": 0.21448062360286713, "learning_rate": 0.00011898828226874284, "epoch": 0.46832397754611066, "step": 584 }, { "loss": 0.0166, "grad_norm": 0.15424512326717377, "learning_rate": 0.00011872770585842273, "epoch": 0.46912590216519645, "step": 585 }, { "loss": 0.0473, "grad_norm": 0.31540054082870483, "learning_rate": 0.0001184669975979413, "epoch": 0.4699278267842823, "step": 586 }, { "loss": 0.0165, "grad_norm": 0.13097421824932098, "learning_rate": 0.00011820615932278374, "epoch": 0.4707297514033681, "step": 587 }, { "loss": 0.0318, "grad_norm": 0.308799684047699, "learning_rate": 0.00011794519286935055, "epoch": 0.4715316760224539, "step": 588 }, { "loss": 0.0471, "grad_norm": 0.2947872579097748, "learning_rate": 0.00011768410007494466, "epoch": 0.4723336006415397, "step": 589 }, { "loss": 0.0516, "grad_norm": 0.22661037743091583, "learning_rate": 0.0001174228827777585, "epoch": 0.4731355252606255, "step": 590 }, { "loss": 0.03, "grad_norm": 0.24548248946666718, "learning_rate": 0.00011716154281686105, "epoch": 0.4739374498797113, "step": 591 }, { "loss": 0.0365, "grad_norm": 0.2837478220462799, "learning_rate": 0.00011690008203218493, "epoch": 0.4747393744987971, "step": 592 }, { "loss": 0.0538, "grad_norm": 0.3481287360191345, "learning_rate": 0.00011663850226451327, "epoch": 0.4755412991178829, "step": 593 }, { "loss": 0.048, "grad_norm": 0.4488002061843872, "learning_rate": 0.000116376805355467, "epoch": 0.47634322373696875, "step": 594 }, { "loss": 0.015, "grad_norm": 0.16303379833698273, "learning_rate": 0.00011611499314749177, "epoch": 0.47714514835605454, "step": 595 }, { "loss": 0.0246, "grad_norm": 0.22950126230716705, "learning_rate": 0.0001158530674838449, "epoch": 0.47794707297514033, "step": 596 }, { "loss": 0.0116, "grad_norm": 0.1625395268201828, "learning_rate": 0.0001155910302085826, "epoch": 0.4787489975942261, "step": 597 }, { "loss": 0.0301, "grad_norm": 0.23239369690418243, "learning_rate": 0.00011532888316654675, "epoch": 0.47955092221331197, "step": 598 }, { "loss": 0.1168, "grad_norm": 0.7024423480033875, "learning_rate": 0.00011506662820335208, "epoch": 0.48035284683239776, "step": 599 }, { "loss": 0.0615, "grad_norm": 0.31283116340637207, "learning_rate": 0.00011480426716537315, "epoch": 0.48115477145148355, "step": 600 }, { "loss": 0.0387, "grad_norm": 0.22865501046180725, "learning_rate": 0.0001145418018997313, "epoch": 0.48195669607056935, "step": 601 }, { "loss": 0.0189, "grad_norm": 0.2138299196958542, "learning_rate": 0.00011427923425428164, "epoch": 0.4827586206896552, "step": 602 }, { "loss": 0.0703, "grad_norm": 0.3493439257144928, "learning_rate": 0.00011401656607760015, "epoch": 0.483560545308741, "step": 603 }, { "loss": 0.0178, "grad_norm": 0.2075956165790558, "learning_rate": 0.00011375379921897051, "epoch": 0.4843624699278268, "step": 604 }, { "loss": 0.0456, "grad_norm": 0.4019928276538849, "learning_rate": 0.0001134909355283712, "epoch": 0.48516439454691257, "step": 605 }, { "loss": 0.0332, "grad_norm": 0.3662348687648773, "learning_rate": 0.00011322797685646242, "epoch": 0.4859663191659984, "step": 606 }, { "loss": 0.0289, "grad_norm": 0.26660025119781494, "learning_rate": 0.00011296492505457314, "epoch": 0.4867682437850842, "step": 607 }, { "loss": 0.0251, "grad_norm": 0.1749676614999771, "learning_rate": 0.00011270178197468789, "epoch": 0.48757016840417, "step": 608 }, { "loss": 0.0338, "grad_norm": 0.2791067957878113, "learning_rate": 0.00011243854946943388, "epoch": 0.4883720930232558, "step": 609 }, { "loss": 0.0395, "grad_norm": 0.21187956631183624, "learning_rate": 0.00011217522939206795, "epoch": 0.48917401764234164, "step": 610 }, { "loss": 0.0596, "grad_norm": 0.4193437695503235, "learning_rate": 0.00011191182359646337, "epoch": 0.48997594226142743, "step": 611 }, { "loss": 0.0437, "grad_norm": 0.42110878229141235, "learning_rate": 0.00011164833393709706, "epoch": 0.4907778668805132, "step": 612 }, { "loss": 0.0364, "grad_norm": 0.3795287013053894, "learning_rate": 0.00011138476226903625, "epoch": 0.491579791499599, "step": 613 }, { "loss": 0.0537, "grad_norm": 0.307650089263916, "learning_rate": 0.00011112111044792557, "epoch": 0.49238171611868486, "step": 614 }, { "loss": 0.044, "grad_norm": 0.33749890327453613, "learning_rate": 0.00011085738032997398, "epoch": 0.49318364073777066, "step": 615 }, { "loss": 0.0479, "grad_norm": 0.3227038085460663, "learning_rate": 0.00011059357377194161, "epoch": 0.49398556535685645, "step": 616 }, { "loss": 0.0452, "grad_norm": 0.3482477068901062, "learning_rate": 0.00011032969263112688, "epoch": 0.49478748997594224, "step": 617 }, { "loss": 0.0315, "grad_norm": 0.27159547805786133, "learning_rate": 0.00011006573876535322, "epoch": 0.4955894145950281, "step": 618 }, { "loss": 0.0637, "grad_norm": 0.40270885825157166, "learning_rate": 0.0001098017140329561, "epoch": 0.4963913392141139, "step": 619 }, { "loss": 0.022, "grad_norm": 0.21836791932582855, "learning_rate": 0.00010953762029276982, "epoch": 0.4971932638331997, "step": 620 }, { "eval_loss": 0.04481621831655502, "eval_runtime": 32.1222, "eval_samples_per_second": 32.688, "eval_steps_per_second": 8.187, "epoch": 0.4971932638331997, "step": 620 }, { "loss": 0.0203, "grad_norm": 0.1830679029226303, "learning_rate": 0.00010927345940411467, "epoch": 0.49799518845228546, "step": 621 }, { "loss": 0.0607, "grad_norm": 0.4090077579021454, "learning_rate": 0.00010900923322678364, "epoch": 0.4987971130713713, "step": 622 }, { "loss": 0.0342, "grad_norm": 0.28506171703338623, "learning_rate": 0.00010874494362102931, "epoch": 0.4995990376904571, "step": 623 }, { "loss": 0.0318, "grad_norm": 0.31976205110549927, "learning_rate": 0.00010848059244755093, "epoch": 0.5004009623095429, "step": 624 }, { "loss": 0.0556, "grad_norm": 0.2998436391353607, "learning_rate": 0.0001082161815674811, "epoch": 0.5012028869286287, "step": 625 }, { "loss": 0.021, "grad_norm": 0.22129428386688232, "learning_rate": 0.00010795171284237284, "epoch": 0.5020048115477145, "step": 626 }, { "loss": 0.0266, "grad_norm": 0.2941289246082306, "learning_rate": 0.00010768718813418644, "epoch": 0.5028067361668003, "step": 627 }, { "loss": 0.0568, "grad_norm": 0.3848710358142853, "learning_rate": 0.00010742260930527625, "epoch": 0.5036086607858862, "step": 628 }, { "loss": 0.0388, "grad_norm": 0.33324113488197327, "learning_rate": 0.00010715797821837776, "epoch": 0.504410585404972, "step": 629 }, { "loss": 0.0357, "grad_norm": 0.350759893655777, "learning_rate": 0.00010689329673659429, "epoch": 0.5052125100240578, "step": 630 }, { "loss": 0.0111, "grad_norm": 0.15862928330898285, "learning_rate": 0.00010662856672338397, "epoch": 0.5060144346431436, "step": 631 }, { "loss": 0.0382, "grad_norm": 0.26137423515319824, "learning_rate": 0.00010636379004254664, "epoch": 0.5068163592622293, "step": 632 }, { "loss": 0.0244, "grad_norm": 0.28696557879447937, "learning_rate": 0.00010609896855821068, "epoch": 0.5076182838813151, "step": 633 }, { "loss": 0.0497, "grad_norm": 0.3603985905647278, "learning_rate": 0.00010583410413481994, "epoch": 0.5084202085004009, "step": 634 }, { "loss": 0.0837, "grad_norm": 0.653423011302948, "learning_rate": 0.00010556919863712054, "epoch": 0.5092221331194867, "step": 635 }, { "loss": 0.0243, "grad_norm": 0.23948614299297333, "learning_rate": 0.00010530425393014774, "epoch": 0.5100240577385726, "step": 636 }, { "loss": 0.0271, "grad_norm": 0.22972430288791656, "learning_rate": 0.00010503927187921292, "epoch": 0.5108259823576584, "step": 637 }, { "loss": 0.047, "grad_norm": 0.4855923354625702, "learning_rate": 0.00010477425434989036, "epoch": 0.5116279069767442, "step": 638 }, { "loss": 0.0319, "grad_norm": 0.3573042154312134, "learning_rate": 0.0001045092032080041, "epoch": 0.51242983159583, "step": 639 }, { "loss": 0.0679, "grad_norm": 0.4812779426574707, "learning_rate": 0.00010424412031961484, "epoch": 0.5132317562149158, "step": 640 }, { "loss": 0.016, "grad_norm": 0.21666432917118073, "learning_rate": 0.00010397900755100678, "epoch": 0.5140336808340016, "step": 641 }, { "loss": 0.018, "grad_norm": 0.19402359426021576, "learning_rate": 0.00010371386676867447, "epoch": 0.5148356054530874, "step": 642 }, { "loss": 0.0917, "grad_norm": 0.5789539217948914, "learning_rate": 0.00010344869983930974, "epoch": 0.5156375300721732, "step": 643 }, { "loss": 0.02, "grad_norm": 0.19617126882076263, "learning_rate": 0.00010318350862978848, "epoch": 0.5164394546912591, "step": 644 }, { "loss": 0.0407, "grad_norm": 0.33302173018455505, "learning_rate": 0.00010291829500715744, "epoch": 0.5172413793103449, "step": 645 }, { "loss": 0.0685, "grad_norm": 0.4327728748321533, "learning_rate": 0.00010265306083862134, "epoch": 0.5180433039294307, "step": 646 }, { "loss": 0.029, "grad_norm": 0.3352719843387604, "learning_rate": 0.00010238780799152938, "epoch": 0.5188452285485164, "step": 647 }, { "loss": 0.0195, "grad_norm": 0.20400014519691467, "learning_rate": 0.00010212253833336237, "epoch": 0.5196471531676022, "step": 648 }, { "loss": 0.052, "grad_norm": 0.5209816098213196, "learning_rate": 0.00010185725373171942, "epoch": 0.520449077786688, "step": 649 }, { "loss": 0.0123, "grad_norm": 0.2923823595046997, "learning_rate": 0.0001015919560543049, "epoch": 0.5212510024057738, "step": 650 }, { "loss": 0.0412, "grad_norm": 0.3393188714981079, "learning_rate": 0.0001013266471689152, "epoch": 0.5220529270248596, "step": 651 }, { "loss": 0.0188, "grad_norm": 0.24097828567028046, "learning_rate": 0.00010106132894342564, "epoch": 0.5228548516439455, "step": 652 }, { "loss": 0.0686, "grad_norm": 0.44344210624694824, "learning_rate": 0.00010079600324577722, "epoch": 0.5236567762630313, "step": 653 }, { "loss": 0.0143, "grad_norm": 0.2262842059135437, "learning_rate": 0.0001005306719439637, "epoch": 0.5244587008821171, "step": 654 }, { "loss": 0.0288, "grad_norm": 0.2735036611557007, "learning_rate": 0.00010026533690601814, "epoch": 0.5252606255012029, "step": 655 }, { "loss": 0.0554, "grad_norm": 0.5491762757301331, "learning_rate": 0.0001, "epoch": 0.5260625501202887, "step": 656 }, { "loss": 0.052, "grad_norm": 0.3667290508747101, "learning_rate": 9.973466309398187e-05, "epoch": 0.5268644747393745, "step": 657 }, { "loss": 0.029, "grad_norm": 0.24463889002799988, "learning_rate": 9.946932805603635e-05, "epoch": 0.5276663993584603, "step": 658 }, { "loss": 0.0305, "grad_norm": 0.34307271242141724, "learning_rate": 9.92039967542228e-05, "epoch": 0.5284683239775461, "step": 659 }, { "loss": 0.0543, "grad_norm": 0.32049161195755005, "learning_rate": 9.89386710565744e-05, "epoch": 0.529270248596632, "step": 660 }, { "loss": 0.0745, "grad_norm": 0.5253795981407166, "learning_rate": 9.867335283108479e-05, "epoch": 0.5300721732157178, "step": 661 }, { "loss": 0.0205, "grad_norm": 0.22634099423885345, "learning_rate": 9.840804394569513e-05, "epoch": 0.5308740978348035, "step": 662 }, { "loss": 0.0478, "grad_norm": 0.3835356831550598, "learning_rate": 9.81427462682806e-05, "epoch": 0.5316760224538893, "step": 663 }, { "loss": 0.0198, "grad_norm": 0.25156858563423157, "learning_rate": 9.787746166663764e-05, "epoch": 0.5324779470729751, "step": 664 }, { "loss": 0.066, "grad_norm": 0.577354907989502, "learning_rate": 9.761219200847065e-05, "epoch": 0.5332798716920609, "step": 665 }, { "loss": 0.0769, "grad_norm": 0.5115137696266174, "learning_rate": 9.73469391613787e-05, "epoch": 0.5340817963111467, "step": 666 }, { "loss": 0.0391, "grad_norm": 0.3202758729457855, "learning_rate": 9.708170499284256e-05, "epoch": 0.5348837209302325, "step": 667 }, { "loss": 0.0445, "grad_norm": 0.422722727060318, "learning_rate": 9.681649137021158e-05, "epoch": 0.5356856455493184, "step": 668 }, { "loss": 0.0446, "grad_norm": 0.32844579219818115, "learning_rate": 9.655130016069028e-05, "epoch": 0.5364875701684042, "step": 669 }, { "loss": 0.045, "grad_norm": 0.3552158772945404, "learning_rate": 9.628613323132554e-05, "epoch": 0.53728949478749, "step": 670 }, { "loss": 0.0516, "grad_norm": 0.37886497378349304, "learning_rate": 9.602099244899323e-05, "epoch": 0.5380914194065758, "step": 671 }, { "loss": 0.0224, "grad_norm": 0.25544053316116333, "learning_rate": 9.57558796803852e-05, "epoch": 0.5388933440256616, "step": 672 }, { "loss": 0.0433, "grad_norm": 0.2606353163719177, "learning_rate": 9.549079679199592e-05, "epoch": 0.5396952686447474, "step": 673 }, { "loss": 0.053, "grad_norm": 0.3851439654827118, "learning_rate": 9.522574565010965e-05, "epoch": 0.5404971932638332, "step": 674 }, { "loss": 0.0314, "grad_norm": 0.26221150159835815, "learning_rate": 9.496072812078712e-05, "epoch": 0.541299117882919, "step": 675 }, { "loss": 0.0569, "grad_norm": 0.5227025747299194, "learning_rate": 9.46957460698523e-05, "epoch": 0.5421010425020049, "step": 676 }, { "loss": 0.0181, "grad_norm": 0.20920135080814362, "learning_rate": 9.44308013628795e-05, "epoch": 0.5429029671210907, "step": 677 }, { "loss": 0.0326, "grad_norm": 0.2929348647594452, "learning_rate": 9.416589586518008e-05, "epoch": 0.5437048917401764, "step": 678 }, { "loss": 0.0346, "grad_norm": 0.38871344923973083, "learning_rate": 9.390103144178932e-05, "epoch": 0.5445068163592622, "step": 679 }, { "loss": 0.0603, "grad_norm": 0.392945259809494, "learning_rate": 9.363620995745337e-05, "epoch": 0.545308740978348, "step": 680 }, { "loss": 0.0579, "grad_norm": 0.6106362342834473, "learning_rate": 9.337143327661604e-05, "epoch": 0.5461106655974338, "step": 681 }, { "loss": 0.1305, "grad_norm": 0.6625472903251648, "learning_rate": 9.310670326340576e-05, "epoch": 0.5469125902165196, "step": 682 }, { "loss": 0.0954, "grad_norm": 0.5873953104019165, "learning_rate": 9.284202178162226e-05, "epoch": 0.5477145148356054, "step": 683 }, { "loss": 0.0214, "grad_norm": 0.2383047342300415, "learning_rate": 9.257739069472374e-05, "epoch": 0.5485164394546913, "step": 684 }, { "loss": 0.0395, "grad_norm": 0.46583423018455505, "learning_rate": 9.23128118658136e-05, "epoch": 0.5493183640737771, "step": 685 }, { "loss": 0.0847, "grad_norm": 0.42172953486442566, "learning_rate": 9.204828715762718e-05, "epoch": 0.5501202886928629, "step": 686 }, { "loss": 0.0182, "grad_norm": 0.17326125502586365, "learning_rate": 9.178381843251891e-05, "epoch": 0.5509222133119487, "step": 687 }, { "loss": 0.0706, "grad_norm": 0.4465944468975067, "learning_rate": 9.151940755244912e-05, "epoch": 0.5517241379310345, "step": 688 }, { "loss": 0.0361, "grad_norm": 0.3605600893497467, "learning_rate": 9.12550563789707e-05, "epoch": 0.5525260625501203, "step": 689 }, { "loss": 0.0637, "grad_norm": 0.5488521456718445, "learning_rate": 9.099076677321638e-05, "epoch": 0.5533279871692061, "step": 690 }, { "loss": 0.0413, "grad_norm": 0.3144517242908478, "learning_rate": 9.072654059588533e-05, "epoch": 0.5541299117882919, "step": 691 }, { "loss": 0.0469, "grad_norm": 0.356842041015625, "learning_rate": 9.04623797072302e-05, "epoch": 0.5549318364073778, "step": 692 }, { "loss": 0.0976, "grad_norm": 0.5099210143089294, "learning_rate": 9.019828596704394e-05, "epoch": 0.5557337610264635, "step": 693 }, { "loss": 0.0491, "grad_norm": 0.5059170126914978, "learning_rate": 8.99342612346468e-05, "epoch": 0.5565356856455493, "step": 694 }, { "loss": 0.0378, "grad_norm": 0.29008913040161133, "learning_rate": 8.967030736887314e-05, "epoch": 0.5573376102646351, "step": 695 }, { "loss": 0.0998, "grad_norm": 0.6845918297767639, "learning_rate": 8.94064262280584e-05, "epoch": 0.5581395348837209, "step": 696 }, { "loss": 0.0582, "grad_norm": 0.3744989335536957, "learning_rate": 8.914261967002605e-05, "epoch": 0.5589414595028067, "step": 697 }, { "loss": 0.0581, "grad_norm": 0.467715859413147, "learning_rate": 8.887888955207444e-05, "epoch": 0.5597433841218925, "step": 698 }, { "loss": 0.0444, "grad_norm": 0.3465082347393036, "learning_rate": 8.861523773096378e-05, "epoch": 0.5605453087409783, "step": 699 }, { "loss": 0.0616, "grad_norm": 0.4096762537956238, "learning_rate": 8.835166606290295e-05, "epoch": 0.5613472333600642, "step": 700 }, { "loss": 0.0286, "grad_norm": 0.3438918888568878, "learning_rate": 8.808817640353661e-05, "epoch": 0.56214915797915, "step": 701 }, { "loss": 0.0796, "grad_norm": 0.503362774848938, "learning_rate": 8.782477060793211e-05, "epoch": 0.5629510825982358, "step": 702 }, { "loss": 0.0403, "grad_norm": 0.36747029423713684, "learning_rate": 8.756145053056615e-05, "epoch": 0.5637530072173216, "step": 703 }, { "loss": 0.0339, "grad_norm": 0.2829087972640991, "learning_rate": 8.729821802531212e-05, "epoch": 0.5645549318364074, "step": 704 }, { "loss": 0.0333, "grad_norm": 0.3334031105041504, "learning_rate": 8.703507494542691e-05, "epoch": 0.5653568564554932, "step": 705 }, { "loss": 0.0437, "grad_norm": 0.38484475016593933, "learning_rate": 8.67720231435376e-05, "epoch": 0.566158781074579, "step": 706 }, { "loss": 0.0485, "grad_norm": 0.3287144601345062, "learning_rate": 8.650906447162884e-05, "epoch": 0.5669607056936647, "step": 707 }, { "loss": 0.0255, "grad_norm": 0.22345122694969177, "learning_rate": 8.624620078102951e-05, "epoch": 0.5677626303127506, "step": 708 }, { "loss": 0.1315, "grad_norm": 0.7388908267021179, "learning_rate": 8.598343392239989e-05, "epoch": 0.5685645549318364, "step": 709 }, { "loss": 0.0334, "grad_norm": 0.2406347393989563, "learning_rate": 8.572076574571838e-05, "epoch": 0.5693664795509222, "step": 710 }, { "loss": 0.0225, "grad_norm": 0.3055616021156311, "learning_rate": 8.545819810026871e-05, "epoch": 0.570168404170008, "step": 711 }, { "loss": 0.0139, "grad_norm": 0.23999330401420593, "learning_rate": 8.519573283462687e-05, "epoch": 0.5709703287890938, "step": 712 }, { "loss": 0.0783, "grad_norm": 0.6496703624725342, "learning_rate": 8.493337179664793e-05, "epoch": 0.5717722534081796, "step": 713 }, { "loss": 0.0103, "grad_norm": 0.2168056070804596, "learning_rate": 8.467111683345326e-05, "epoch": 0.5725741780272654, "step": 714 }, { "loss": 0.1149, "grad_norm": 0.9031127095222473, "learning_rate": 8.440896979141744e-05, "epoch": 0.5733761026463512, "step": 715 }, { "loss": 0.0517, "grad_norm": 0.3651449680328369, "learning_rate": 8.414693251615512e-05, "epoch": 0.5741780272654371, "step": 716 }, { "loss": 0.0603, "grad_norm": 0.35386982560157776, "learning_rate": 8.388500685250827e-05, "epoch": 0.5749799518845229, "step": 717 }, { "loss": 0.0563, "grad_norm": 0.3657480478286743, "learning_rate": 8.3623194644533e-05, "epoch": 0.5757818765036087, "step": 718 }, { "loss": 0.0391, "grad_norm": 0.2397533357143402, "learning_rate": 8.336149773548678e-05, "epoch": 0.5765838011226945, "step": 719 }, { "loss": 0.0399, "grad_norm": 0.33155348896980286, "learning_rate": 8.309991796781511e-05, "epoch": 0.5773857257417803, "step": 720 }, { "loss": 0.0514, "grad_norm": 0.4915727972984314, "learning_rate": 8.283845718313894e-05, "epoch": 0.5781876503608661, "step": 721 }, { "loss": 0.026, "grad_norm": 0.22791197896003723, "learning_rate": 8.257711722224152e-05, "epoch": 0.5789895749799518, "step": 722 }, { "loss": 0.0435, "grad_norm": 0.40722930431365967, "learning_rate": 8.231589992505536e-05, "epoch": 0.5797914995990376, "step": 723 }, { "loss": 0.0217, "grad_norm": 0.24059796333312988, "learning_rate": 8.205480713064946e-05, "epoch": 0.5805934242181235, "step": 724 }, { "loss": 0.0339, "grad_norm": 0.40672048926353455, "learning_rate": 8.179384067721631e-05, "epoch": 0.5813953488372093, "step": 725 }, { "loss": 0.0744, "grad_norm": 0.3509446680545807, "learning_rate": 8.153300240205873e-05, "epoch": 0.5821972734562951, "step": 726 }, { "loss": 0.0644, "grad_norm": 0.28646859526634216, "learning_rate": 8.12722941415773e-05, "epoch": 0.5829991980753809, "step": 727 }, { "loss": 0.0163, "grad_norm": 0.222028449177742, "learning_rate": 8.101171773125716e-05, "epoch": 0.5838011226944667, "step": 728 }, { "loss": 0.0215, "grad_norm": 0.23310942947864532, "learning_rate": 8.075127500565525e-05, "epoch": 0.5846030473135525, "step": 729 }, { "loss": 0.0297, "grad_norm": 0.2620904743671417, "learning_rate": 8.049096779838719e-05, "epoch": 0.5854049719326383, "step": 730 }, { "loss": 0.0773, "grad_norm": 0.6896341443061829, "learning_rate": 8.023079794211459e-05, "epoch": 0.5862068965517241, "step": 731 }, { "loss": 0.0654, "grad_norm": 0.3588181138038635, "learning_rate": 7.99707672685321e-05, "epoch": 0.58700882117081, "step": 732 }, { "loss": 0.0348, "grad_norm": 0.2889043390750885, "learning_rate": 7.971087760835432e-05, "epoch": 0.5878107457898958, "step": 733 }, { "loss": 0.0174, "grad_norm": 0.5972622632980347, "learning_rate": 7.945113079130323e-05, "epoch": 0.5886126704089816, "step": 734 }, { "loss": 0.0359, "grad_norm": 0.25957322120666504, "learning_rate": 7.919152864609499e-05, "epoch": 0.5894145950280674, "step": 735 }, { "loss": 0.0433, "grad_norm": 0.3756544888019562, "learning_rate": 7.89320730004274e-05, "epoch": 0.5902165196471532, "step": 736 }, { "loss": 0.0429, "grad_norm": 0.25527504086494446, "learning_rate": 7.867276568096662e-05, "epoch": 0.591018444266239, "step": 737 }, { "loss": 0.0727, "grad_norm": 0.49652037024497986, "learning_rate": 7.84136085133347e-05, "epoch": 0.5918203688853247, "step": 738 }, { "loss": 0.0555, "grad_norm": 0.46329352259635925, "learning_rate": 7.815460332209656e-05, "epoch": 0.5926222935044105, "step": 739 }, { "loss": 0.0231, "grad_norm": 0.217621847987175, "learning_rate": 7.789575193074704e-05, "epoch": 0.5934242181234964, "step": 740 }, { "loss": 0.0689, "grad_norm": 0.5665069818496704, "learning_rate": 7.763705616169825e-05, "epoch": 0.5942261427425822, "step": 741 }, { "loss": 0.0463, "grad_norm": 0.42257973551750183, "learning_rate": 7.737851783626671e-05, "epoch": 0.595028067361668, "step": 742 }, { "loss": 0.0348, "grad_norm": 0.34942853450775146, "learning_rate": 7.712013877466032e-05, "epoch": 0.5958299919807538, "step": 743 }, { "loss": 0.0189, "grad_norm": 0.2506210505962372, "learning_rate": 7.686192079596586e-05, "epoch": 0.5966319165998396, "step": 744 }, { "eval_loss": 0.042267050594091415, "eval_runtime": 31.7194, "eval_samples_per_second": 33.103, "eval_steps_per_second": 8.291, "epoch": 0.5966319165998396, "step": 744 }, { "loss": 0.0516, "grad_norm": 0.3015764355659485, "learning_rate": 7.660386571813593e-05, "epoch": 0.5974338412189254, "step": 745 }, { "loss": 0.04, "grad_norm": 0.333032488822937, "learning_rate": 7.634597535797633e-05, "epoch": 0.5982357658380112, "step": 746 }, { "loss": 0.058, "grad_norm": 0.34520605206489563, "learning_rate": 7.608825153113305e-05, "epoch": 0.599037690457097, "step": 747 }, { "loss": 0.0177, "grad_norm": 0.24561840295791626, "learning_rate": 7.583069605207975e-05, "epoch": 0.5998396150761829, "step": 748 }, { "loss": 0.0406, "grad_norm": 0.30027586221694946, "learning_rate": 7.557331073410485e-05, "epoch": 0.6006415396952687, "step": 749 }, { "loss": 0.0641, "grad_norm": 0.41032275557518005, "learning_rate": 7.531609738929865e-05, "epoch": 0.6014434643143545, "step": 750 }, { "loss": 0.0209, "grad_norm": 0.20874442160129547, "learning_rate": 7.505905782854081e-05, "epoch": 0.6022453889334403, "step": 751 }, { "loss": 0.0501, "grad_norm": 0.3524108827114105, "learning_rate": 7.48021938614875e-05, "epoch": 0.603047313552526, "step": 752 }, { "loss": 0.0323, "grad_norm": 0.3698127269744873, "learning_rate": 7.454550729655852e-05, "epoch": 0.6038492381716118, "step": 753 }, { "loss": 0.047, "grad_norm": 0.40356680750846863, "learning_rate": 7.428899994092483e-05, "epoch": 0.6046511627906976, "step": 754 }, { "loss": 0.024, "grad_norm": 0.2525324523448944, "learning_rate": 7.403267360049556e-05, "epoch": 0.6054530874097834, "step": 755 }, { "loss": 0.0143, "grad_norm": 0.416182279586792, "learning_rate": 7.37765300799056e-05, "epoch": 0.6062550120288693, "step": 756 }, { "loss": 0.0715, "grad_norm": 0.4480084478855133, "learning_rate": 7.352057118250246e-05, "epoch": 0.6070569366479551, "step": 757 }, { "loss": 0.01, "grad_norm": 0.22036206722259521, "learning_rate": 7.326479871033409e-05, "epoch": 0.6078588612670409, "step": 758 }, { "loss": 0.0329, "grad_norm": 0.2710481882095337, "learning_rate": 7.300921446413583e-05, "epoch": 0.6086607858861267, "step": 759 }, { "loss": 0.0332, "grad_norm": 0.241096630692482, "learning_rate": 7.275382024331772e-05, "epoch": 0.6094627105052125, "step": 760 }, { "loss": 0.0367, "grad_norm": 0.37980324029922485, "learning_rate": 7.249861784595217e-05, "epoch": 0.6102646351242983, "step": 761 }, { "loss": 0.0499, "grad_norm": 0.4780760407447815, "learning_rate": 7.2243609068761e-05, "epoch": 0.6110665597433841, "step": 762 }, { "loss": 0.0261, "grad_norm": 0.21910789608955383, "learning_rate": 7.198879570710272e-05, "epoch": 0.6118684843624699, "step": 763 }, { "loss": 0.047, "grad_norm": 0.30522310733795166, "learning_rate": 7.173417955496024e-05, "epoch": 0.6126704089815558, "step": 764 }, { "loss": 0.0361, "grad_norm": 0.4247373044490814, "learning_rate": 7.147976240492795e-05, "epoch": 0.6134723336006416, "step": 765 }, { "loss": 0.0543, "grad_norm": 0.30531254410743713, "learning_rate": 7.122554604819925e-05, "epoch": 0.6142742582197274, "step": 766 }, { "loss": 0.0394, "grad_norm": 0.3051380515098572, "learning_rate": 7.097153227455379e-05, "epoch": 0.6150761828388132, "step": 767 }, { "loss": 0.0556, "grad_norm": 0.3333624601364136, "learning_rate": 7.071772287234497e-05, "epoch": 0.615878107457899, "step": 768 }, { "loss": 0.032, "grad_norm": 0.2435581535100937, "learning_rate": 7.046411962848744e-05, "epoch": 0.6166800320769847, "step": 769 }, { "loss": 0.0175, "grad_norm": 0.21923010051250458, "learning_rate": 7.021072432844426e-05, "epoch": 0.6174819566960705, "step": 770 }, { "loss": 0.0308, "grad_norm": 0.344446063041687, "learning_rate": 6.995753875621464e-05, "epoch": 0.6182838813151563, "step": 771 }, { "loss": 0.0579, "grad_norm": 0.6894804835319519, "learning_rate": 6.970456469432117e-05, "epoch": 0.6190858059342422, "step": 772 }, { "loss": 0.0412, "grad_norm": 0.9697020053863525, "learning_rate": 6.945180392379729e-05, "epoch": 0.619887730553328, "step": 773 }, { "loss": 0.0313, "grad_norm": 0.30235642194747925, "learning_rate": 6.919925822417476e-05, "epoch": 0.6206896551724138, "step": 774 }, { "loss": 0.058, "grad_norm": 0.42743489146232605, "learning_rate": 6.894692937347127e-05, "epoch": 0.6214915797914996, "step": 775 }, { "loss": 0.0405, "grad_norm": 0.38457682728767395, "learning_rate": 6.869481914817779e-05, "epoch": 0.6222935044105854, "step": 776 }, { "loss": 0.0319, "grad_norm": 0.31749409437179565, "learning_rate": 6.844292932324597e-05, "epoch": 0.6230954290296712, "step": 777 }, { "loss": 0.0363, "grad_norm": 0.4263424575328827, "learning_rate": 6.819126167207585e-05, "epoch": 0.623897353648757, "step": 778 }, { "loss": 0.0393, "grad_norm": 0.25529760122299194, "learning_rate": 6.793981796650333e-05, "epoch": 0.6246992782678428, "step": 779 }, { "loss": 0.0294, "grad_norm": 0.203300341963768, "learning_rate": 6.768859997678751e-05, "epoch": 0.6255012028869287, "step": 780 }, { "loss": 0.0714, "grad_norm": 0.43434929847717285, "learning_rate": 6.743760947159846e-05, "epoch": 0.6263031275060145, "step": 781 }, { "loss": 0.0236, "grad_norm": 0.3486297130584717, "learning_rate": 6.718684821800467e-05, "epoch": 0.6271050521251003, "step": 782 }, { "loss": 0.0401, "grad_norm": 0.36812183260917664, "learning_rate": 6.69363179814606e-05, "epoch": 0.627906976744186, "step": 783 }, { "loss": 0.0466, "grad_norm": 0.40551620721817017, "learning_rate": 6.668602052579424e-05, "epoch": 0.6287089013632718, "step": 784 }, { "loss": 0.0548, "grad_norm": 0.39897987246513367, "learning_rate": 6.643595761319474e-05, "epoch": 0.6295108259823576, "step": 785 }, { "loss": 0.0287, "grad_norm": 0.23864711821079254, "learning_rate": 6.61861310042e-05, "epoch": 0.6303127506014434, "step": 786 }, { "loss": 0.0342, "grad_norm": 0.32459014654159546, "learning_rate": 6.593654245768415e-05, "epoch": 0.6311146752205292, "step": 787 }, { "loss": 0.103, "grad_norm": 0.8521727323532104, "learning_rate": 6.568719373084538e-05, "epoch": 0.6319165998396151, "step": 788 }, { "loss": 0.0183, "grad_norm": 0.20950952172279358, "learning_rate": 6.543808657919345e-05, "epoch": 0.6327185244587009, "step": 789 }, { "loss": 0.0399, "grad_norm": 0.41553550958633423, "learning_rate": 6.518922275653724e-05, "epoch": 0.6335204490777867, "step": 790 }, { "loss": 0.0347, "grad_norm": 0.2640535831451416, "learning_rate": 6.494060401497261e-05, "epoch": 0.6343223736968725, "step": 791 }, { "loss": 0.0288, "grad_norm": 0.2901599407196045, "learning_rate": 6.469223210486992e-05, "epoch": 0.6351242983159583, "step": 792 }, { "loss": 0.036, "grad_norm": 0.30714696645736694, "learning_rate": 6.444410877486178e-05, "epoch": 0.6359262229350441, "step": 793 }, { "loss": 0.0168, "grad_norm": 0.16659529507160187, "learning_rate": 6.419623577183056e-05, "epoch": 0.6367281475541299, "step": 794 }, { "loss": 0.0391, "grad_norm": 0.2610877454280853, "learning_rate": 6.394861484089641e-05, "epoch": 0.6375300721732157, "step": 795 }, { "loss": 0.0113, "grad_norm": 0.14762139320373535, "learning_rate": 6.370124772540469e-05, "epoch": 0.6383319967923016, "step": 796 }, { "loss": 0.1048, "grad_norm": 0.5695735216140747, "learning_rate": 6.345413616691385e-05, "epoch": 0.6391339214113874, "step": 797 }, { "loss": 0.0379, "grad_norm": 0.2888137996196747, "learning_rate": 6.320728190518308e-05, "epoch": 0.6399358460304732, "step": 798 }, { "loss": 0.0287, "grad_norm": 0.3997354805469513, "learning_rate": 6.29606866781602e-05, "epoch": 0.640737770649559, "step": 799 }, { "loss": 0.0347, "grad_norm": 0.23028384149074554, "learning_rate": 6.271435222196916e-05, "epoch": 0.6415396952686447, "step": 800 }, { "loss": 0.0375, "grad_norm": 0.332156240940094, "learning_rate": 6.246828027089811e-05, "epoch": 0.6423416198877305, "step": 801 }, { "loss": 0.0492, "grad_norm": 0.41977575421333313, "learning_rate": 6.222247255738706e-05, "epoch": 0.6431435445068163, "step": 802 }, { "loss": 0.0199, "grad_norm": 0.24224106967449188, "learning_rate": 6.197693081201567e-05, "epoch": 0.6439454691259021, "step": 803 }, { "loss": 0.0513, "grad_norm": 0.46784040331840515, "learning_rate": 6.173165676349103e-05, "epoch": 0.644747393744988, "step": 804 }, { "loss": 0.0472, "grad_norm": 0.38110026717185974, "learning_rate": 6.14866521386356e-05, "epoch": 0.6455493183640738, "step": 805 }, { "loss": 0.0649, "grad_norm": 0.3705803453922272, "learning_rate": 6.124191866237504e-05, "epoch": 0.6463512429831596, "step": 806 }, { "loss": 0.0437, "grad_norm": 0.28756698966026306, "learning_rate": 6.0997458057725877e-05, "epoch": 0.6471531676022454, "step": 807 }, { "loss": 0.0297, "grad_norm": 0.3769364356994629, "learning_rate": 6.0753272045783625e-05, "epoch": 0.6479550922213312, "step": 808 }, { "loss": 0.0351, "grad_norm": 0.2772417962551117, "learning_rate": 6.0509362345710585e-05, "epoch": 0.648757016840417, "step": 809 }, { "loss": 0.0276, "grad_norm": 0.20303967595100403, "learning_rate": 6.026573067472366e-05, "epoch": 0.6495589414595028, "step": 810 }, { "loss": 0.0409, "grad_norm": 0.26352015137672424, "learning_rate": 6.00223787480823e-05, "epoch": 0.6503608660785886, "step": 811 }, { "loss": 0.0392, "grad_norm": 0.4463076591491699, "learning_rate": 5.977930827907649e-05, "epoch": 0.6511627906976745, "step": 812 }, { "loss": 0.0556, "grad_norm": 0.3590082824230194, "learning_rate": 5.9536520979014676e-05, "epoch": 0.6519647153167603, "step": 813 }, { "loss": 0.0316, "grad_norm": 0.38499733805656433, "learning_rate": 5.929401855721162e-05, "epoch": 0.652766639935846, "step": 814 }, { "loss": 0.0456, "grad_norm": 1.1676615476608276, "learning_rate": 5.905180272097648e-05, "epoch": 0.6535685645549318, "step": 815 }, { "loss": 0.0783, "grad_norm": 0.5431867837905884, "learning_rate": 5.880987517560075e-05, "epoch": 0.6543704891740176, "step": 816 }, { "loss": 0.0501, "grad_norm": 0.35964494943618774, "learning_rate": 5.856823762434618e-05, "epoch": 0.6551724137931034, "step": 817 }, { "loss": 0.0408, "grad_norm": 0.323234498500824, "learning_rate": 5.832689176843291e-05, "epoch": 0.6559743384121892, "step": 818 }, { "loss": 0.0442, "grad_norm": 1.2825475931167603, "learning_rate": 5.808583930702739e-05, "epoch": 0.656776263031275, "step": 819 }, { "loss": 0.0291, "grad_norm": 0.19290927052497864, "learning_rate": 5.784508193723057e-05, "epoch": 0.6575781876503609, "step": 820 }, { "loss": 0.0176, "grad_norm": 0.18903003633022308, "learning_rate": 5.76046213540657e-05, "epoch": 0.6583801122694467, "step": 821 }, { "loss": 0.0336, "grad_norm": 0.39771515130996704, "learning_rate": 5.7364459250466596e-05, "epoch": 0.6591820368885325, "step": 822 }, { "loss": 0.0294, "grad_norm": 0.3263964354991913, "learning_rate": 5.712459731726577e-05, "epoch": 0.6599839615076183, "step": 823 }, { "loss": 0.0371, "grad_norm": 0.2918822765350342, "learning_rate": 5.688503724318217e-05, "epoch": 0.6607858861267041, "step": 824 }, { "loss": 0.0395, "grad_norm": 0.24919533729553223, "learning_rate": 5.6645780714809814e-05, "epoch": 0.6615878107457899, "step": 825 }, { "loss": 0.0374, "grad_norm": 0.23720526695251465, "learning_rate": 5.640682941660547e-05, "epoch": 0.6623897353648757, "step": 826 }, { "loss": 0.0199, "grad_norm": 0.1959155648946762, "learning_rate": 5.616818503087704e-05, "epoch": 0.6631916599839615, "step": 827 }, { "loss": 0.0641, "grad_norm": 2.0196421146392822, "learning_rate": 5.5929849237771556e-05, "epoch": 0.6639935846030474, "step": 828 }, { "loss": 0.0397, "grad_norm": 0.24733805656433105, "learning_rate": 5.569182371526365e-05, "epoch": 0.6647955092221332, "step": 829 }, { "loss": 0.0359, "grad_norm": 0.24535015225410461, "learning_rate": 5.545411013914329e-05, "epoch": 0.6655974338412189, "step": 830 }, { "loss": 0.0384, "grad_norm": 0.4017760753631592, "learning_rate": 5.521671018300436e-05, "epoch": 0.6663993584603047, "step": 831 }, { "loss": 0.0287, "grad_norm": 0.2186603546142578, "learning_rate": 5.497962551823266e-05, "epoch": 0.6672012830793905, "step": 832 }, { "loss": 0.0885, "grad_norm": 0.4830259680747986, "learning_rate": 5.4742857813994356e-05, "epoch": 0.6680032076984763, "step": 833 }, { "loss": 0.0302, "grad_norm": 0.3544902205467224, "learning_rate": 5.450640873722395e-05, "epoch": 0.6688051323175621, "step": 834 }, { "loss": 0.0764, "grad_norm": 0.3852503299713135, "learning_rate": 5.427027995261269e-05, "epoch": 0.6696070569366479, "step": 835 }, { "loss": 0.0466, "grad_norm": 0.32173559069633484, "learning_rate": 5.403447312259702e-05, "epoch": 0.6704089815557338, "step": 836 }, { "loss": 0.0149, "grad_norm": 0.19790256023406982, "learning_rate": 5.379898990734641e-05, "epoch": 0.6712109061748196, "step": 837 }, { "loss": 0.0309, "grad_norm": 0.2427252233028412, "learning_rate": 5.356383196475225e-05, "epoch": 0.6720128307939054, "step": 838 }, { "loss": 0.0637, "grad_norm": 0.45702651143074036, "learning_rate": 5.332900095041569e-05, "epoch": 0.6728147554129912, "step": 839 }, { "loss": 0.0258, "grad_norm": 0.21773581206798553, "learning_rate": 5.309449851763633e-05, "epoch": 0.673616680032077, "step": 840 }, { "loss": 0.0383, "grad_norm": 0.34996020793914795, "learning_rate": 5.286032631740023e-05, "epoch": 0.6744186046511628, "step": 841 }, { "loss": 0.053, "grad_norm": 0.3601475656032562, "learning_rate": 5.2626485998368726e-05, "epoch": 0.6752205292702486, "step": 842 }, { "loss": 0.0334, "grad_norm": 0.2879583537578583, "learning_rate": 5.239297920686641e-05, "epoch": 0.6760224538893344, "step": 843 }, { "loss": 0.025, "grad_norm": 0.3214558959007263, "learning_rate": 5.215980758686978e-05, "epoch": 0.6768243785084203, "step": 844 }, { "loss": 0.038, "grad_norm": 0.33000731468200684, "learning_rate": 5.1926972779995564e-05, "epoch": 0.677626303127506, "step": 845 }, { "loss": 0.0325, "grad_norm": 0.45079219341278076, "learning_rate": 5.169447642548928e-05, "epoch": 0.6784282277465918, "step": 846 }, { "loss": 0.0186, "grad_norm": 0.27335742115974426, "learning_rate": 5.146232016021353e-05, "epoch": 0.6792301523656776, "step": 847 }, { "loss": 0.0674, "grad_norm": 0.480881929397583, "learning_rate": 5.123050561863657e-05, "epoch": 0.6800320769847634, "step": 848 }, { "loss": 0.0358, "grad_norm": 0.26150617003440857, "learning_rate": 5.099903443282079e-05, "epoch": 0.6808340016038492, "step": 849 }, { "loss": 0.0358, "grad_norm": 0.4229785203933716, "learning_rate": 5.0767908232411306e-05, "epoch": 0.681635926222935, "step": 850 }, { "loss": 0.0964, "grad_norm": 0.6302306652069092, "learning_rate": 5.053712864462432e-05, "epoch": 0.6824378508420208, "step": 851 }, { "loss": 0.0529, "grad_norm": 0.40250223875045776, "learning_rate": 5.0306697294235714e-05, "epoch": 0.6832397754611067, "step": 852 }, { "loss": 0.0349, "grad_norm": 0.40601104497909546, "learning_rate": 5.007661580356982e-05, "epoch": 0.6840417000801925, "step": 853 }, { "loss": 0.0264, "grad_norm": 0.20523907244205475, "learning_rate": 4.984688579248756e-05, "epoch": 0.6848436246992783, "step": 854 }, { "loss": 0.0233, "grad_norm": 0.2532117962837219, "learning_rate": 4.961750887837557e-05, "epoch": 0.6856455493183641, "step": 855 }, { "loss": 0.0222, "grad_norm": 0.23107284307479858, "learning_rate": 4.938848667613436e-05, "epoch": 0.6864474739374499, "step": 856 }, { "loss": 0.0243, "grad_norm": 0.2529151141643524, "learning_rate": 4.915982079816732e-05, "epoch": 0.6872493985565357, "step": 857 }, { "loss": 0.0391, "grad_norm": 0.2575894892215729, "learning_rate": 4.8931512854368913e-05, "epoch": 0.6880513231756215, "step": 858 }, { "loss": 0.0773, "grad_norm": 0.6415811777114868, "learning_rate": 4.870356445211388e-05, "epoch": 0.6888532477947072, "step": 859 }, { "loss": 0.0391, "grad_norm": 0.4123080372810364, "learning_rate": 4.8475977196245504e-05, "epoch": 0.6896551724137931, "step": 860 }, { "loss": 0.0457, "grad_norm": 0.31931477785110474, "learning_rate": 4.8248752689064494e-05, "epoch": 0.6904570970328789, "step": 861 }, { "loss": 0.0268, "grad_norm": 0.2256850302219391, "learning_rate": 4.802189253031764e-05, "epoch": 0.6912590216519647, "step": 862 }, { "loss": 0.029, "grad_norm": 0.4967258870601654, "learning_rate": 4.779539831718668e-05, "epoch": 0.6920609462710505, "step": 863 }, { "loss": 0.0175, "grad_norm": 0.18003036081790924, "learning_rate": 4.756927164427685e-05, "epoch": 0.6928628708901363, "step": 864 }, { "loss": 0.0215, "grad_norm": 0.2862027585506439, "learning_rate": 4.7343514103605767e-05, "epoch": 0.6936647955092221, "step": 865 }, { "loss": 0.0455, "grad_norm": 0.32969915866851807, "learning_rate": 4.711812728459233e-05, "epoch": 0.6944667201283079, "step": 866 }, { "loss": 0.0135, "grad_norm": 0.16625012457370758, "learning_rate": 4.689311277404529e-05, "epoch": 0.6952686447473937, "step": 867 }, { "loss": 0.0223, "grad_norm": 0.4092782139778137, "learning_rate": 4.666847215615226e-05, "epoch": 0.6960705693664796, "step": 868 }, { "eval_loss": 0.04034050926566124, "eval_runtime": 31.8203, "eval_samples_per_second": 32.998, "eval_steps_per_second": 8.265, "epoch": 0.6960705693664796, "step": 868 }, { "loss": 0.0779, "grad_norm": 0.5360198616981506, "learning_rate": 4.6444207012468465e-05, "epoch": 0.6968724939855654, "step": 869 }, { "loss": 0.0513, "grad_norm": 0.49572035670280457, "learning_rate": 4.622031892190579e-05, "epoch": 0.6976744186046512, "step": 870 }, { "loss": 0.0337, "grad_norm": 0.27240556478500366, "learning_rate": 4.599680946072127e-05, "epoch": 0.698476343223737, "step": 871 }, { "loss": 0.0367, "grad_norm": 0.3386692702770233, "learning_rate": 4.57736802025065e-05, "epoch": 0.6992782678428228, "step": 872 }, { "loss": 0.0502, "grad_norm": 0.47822192311286926, "learning_rate": 4.555093271817616e-05, "epoch": 0.7000801924619086, "step": 873 }, { "loss": 0.0752, "grad_norm": 0.6390823125839233, "learning_rate": 4.532856857595714e-05, "epoch": 0.7008821170809943, "step": 874 }, { "loss": 0.0452, "grad_norm": 0.37735825777053833, "learning_rate": 4.5106589341377394e-05, "epoch": 0.7016840417000801, "step": 875 }, { "loss": 0.021, "grad_norm": 0.18500256538391113, "learning_rate": 4.488499657725511e-05, "epoch": 0.702485966319166, "step": 876 }, { "loss": 0.0621, "grad_norm": 0.43604958057403564, "learning_rate": 4.466379184368747e-05, "epoch": 0.7032878909382518, "step": 877 }, { "loss": 0.0204, "grad_norm": 0.5190374851226807, "learning_rate": 4.444297669803981e-05, "epoch": 0.7040898155573376, "step": 878 }, { "loss": 0.0131, "grad_norm": 0.3594497740268707, "learning_rate": 4.422255269493455e-05, "epoch": 0.7048917401764234, "step": 879 }, { "loss": 0.0234, "grad_norm": 0.26429349184036255, "learning_rate": 4.4002521386240466e-05, "epoch": 0.7056936647955092, "step": 880 }, { "loss": 0.0205, "grad_norm": 0.3388163447380066, "learning_rate": 4.37828843210615e-05, "epoch": 0.706495589414595, "step": 881 }, { "loss": 0.0264, "grad_norm": 0.21950915455818176, "learning_rate": 4.3563643045725964e-05, "epoch": 0.7072975140336808, "step": 882 }, { "loss": 0.0542, "grad_norm": 0.40110042691230774, "learning_rate": 4.334479910377577e-05, "epoch": 0.7080994386527666, "step": 883 }, { "loss": 0.078, "grad_norm": 0.4152352511882782, "learning_rate": 4.312635403595532e-05, "epoch": 0.7089013632718525, "step": 884 }, { "loss": 0.0244, "grad_norm": 0.20890304446220398, "learning_rate": 4.290830938020087e-05, "epoch": 0.7097032878909383, "step": 885 }, { "loss": 0.0318, "grad_norm": 0.32398372888565063, "learning_rate": 4.269066667162956e-05, "epoch": 0.7105052125100241, "step": 886 }, { "loss": 0.0615, "grad_norm": 0.3690579831600189, "learning_rate": 4.247342744252883e-05, "epoch": 0.7113071371291099, "step": 887 }, { "loss": 0.0299, "grad_norm": 0.4021519422531128, "learning_rate": 4.2256593222345185e-05, "epoch": 0.7121090617481957, "step": 888 }, { "loss": 0.0228, "grad_norm": 0.24381564557552338, "learning_rate": 4.2040165537674006e-05, "epoch": 0.7129109863672815, "step": 889 }, { "loss": 0.0786, "grad_norm": 0.5315597057342529, "learning_rate": 4.182414591224833e-05, "epoch": 0.7137129109863672, "step": 890 }, { "loss": 0.0306, "grad_norm": 0.29537150263786316, "learning_rate": 4.160853586692839e-05, "epoch": 0.714514835605453, "step": 891 }, { "loss": 0.029, "grad_norm": 0.3600987195968628, "learning_rate": 4.139333691969071e-05, "epoch": 0.7153167602245389, "step": 892 }, { "loss": 0.0235, "grad_norm": 0.23906612396240234, "learning_rate": 4.117855058561769e-05, "epoch": 0.7161186848436247, "step": 893 }, { "loss": 0.1348, "grad_norm": 0.7623001337051392, "learning_rate": 4.096417837688666e-05, "epoch": 0.7169206094627105, "step": 894 }, { "loss": 0.0216, "grad_norm": 0.33109530806541443, "learning_rate": 4.075022180275935e-05, "epoch": 0.7177225340817963, "step": 895 }, { "loss": 0.0213, "grad_norm": 0.172570139169693, "learning_rate": 4.053668236957134e-05, "epoch": 0.7185244587008821, "step": 896 }, { "loss": 0.0213, "grad_norm": 0.27047714591026306, "learning_rate": 4.032356158072131e-05, "epoch": 0.7193263833199679, "step": 897 }, { "loss": 0.0791, "grad_norm": 0.4213772118091583, "learning_rate": 4.0110860936660566e-05, "epoch": 0.7201283079390537, "step": 898 }, { "loss": 0.0458, "grad_norm": 0.38493579626083374, "learning_rate": 3.989858193488236e-05, "epoch": 0.7209302325581395, "step": 899 }, { "loss": 0.022, "grad_norm": 0.23332200944423676, "learning_rate": 3.96867260699116e-05, "epoch": 0.7217321571772254, "step": 900 }, { "loss": 0.0438, "grad_norm": 0.3719151020050049, "learning_rate": 3.947529483329387e-05, "epoch": 0.7225340817963112, "step": 901 }, { "loss": 0.0294, "grad_norm": 0.18766042590141296, "learning_rate": 3.92642897135855e-05, "epoch": 0.723336006415397, "step": 902 }, { "loss": 0.0163, "grad_norm": 0.17008039355278015, "learning_rate": 3.905371219634257e-05, "epoch": 0.7241379310344828, "step": 903 }, { "loss": 0.0293, "grad_norm": 0.2763400673866272, "learning_rate": 3.884356376411089e-05, "epoch": 0.7249398556535686, "step": 904 }, { "loss": 0.0286, "grad_norm": 0.3425106704235077, "learning_rate": 3.863384589641509e-05, "epoch": 0.7257417802726543, "step": 905 }, { "loss": 0.0221, "grad_norm": 0.3655487596988678, "learning_rate": 3.8424560069748706e-05, "epoch": 0.7265437048917401, "step": 906 }, { "loss": 0.0191, "grad_norm": 0.22211404144763947, "learning_rate": 3.821570775756339e-05, "epoch": 0.7273456295108259, "step": 907 }, { "loss": 0.0415, "grad_norm": 0.3968844711780548, "learning_rate": 3.800729043025871e-05, "epoch": 0.7281475541299118, "step": 908 }, { "loss": 0.0477, "grad_norm": 0.30855193734169006, "learning_rate": 3.779930955517187e-05, "epoch": 0.7289494787489976, "step": 909 }, { "loss": 0.0186, "grad_norm": 0.20964409410953522, "learning_rate": 3.759176659656717e-05, "epoch": 0.7297514033680834, "step": 910 }, { "loss": 0.0169, "grad_norm": 0.20416317880153656, "learning_rate": 3.7384663015625854e-05, "epoch": 0.7305533279871692, "step": 911 }, { "loss": 0.1093, "grad_norm": 0.6007756590843201, "learning_rate": 3.717800027043576e-05, "epoch": 0.731355252606255, "step": 912 }, { "loss": 0.0393, "grad_norm": 0.4740281403064728, "learning_rate": 3.697177981598115e-05, "epoch": 0.7321571772253408, "step": 913 }, { "loss": 0.0507, "grad_norm": 0.39300405979156494, "learning_rate": 3.676600310413233e-05, "epoch": 0.7329591018444266, "step": 914 }, { "loss": 0.0255, "grad_norm": 0.29988205432891846, "learning_rate": 3.6560671583635467e-05, "epoch": 0.7337610264635124, "step": 915 }, { "loss": 0.0219, "grad_norm": 0.22536736726760864, "learning_rate": 3.635578670010242e-05, "epoch": 0.7345629510825983, "step": 916 }, { "loss": 0.0398, "grad_norm": 0.29492881894111633, "learning_rate": 3.615134989600069e-05, "epoch": 0.7353648757016841, "step": 917 }, { "loss": 0.0413, "grad_norm": 0.3680134415626526, "learning_rate": 3.5947362610642854e-05, "epoch": 0.7361668003207699, "step": 918 }, { "loss": 0.0451, "grad_norm": 0.2880399525165558, "learning_rate": 3.5743826280177e-05, "epoch": 0.7369687249398557, "step": 919 }, { "loss": 0.0463, "grad_norm": 0.38011434674263, "learning_rate": 3.554074233757608e-05, "epoch": 0.7377706495589414, "step": 920 }, { "loss": 0.0236, "grad_norm": 0.21362242102622986, "learning_rate": 3.533811221262833e-05, "epoch": 0.7385725741780272, "step": 921 }, { "loss": 0.0611, "grad_norm": 0.49550414085388184, "learning_rate": 3.5135937331926596e-05, "epoch": 0.739374498797113, "step": 922 }, { "loss": 0.0312, "grad_norm": 0.2971956133842468, "learning_rate": 3.4934219118858936e-05, "epoch": 0.7401764234161988, "step": 923 }, { "loss": 0.0168, "grad_norm": 0.21751493215560913, "learning_rate": 3.4732958993598154e-05, "epoch": 0.7409783480352847, "step": 924 }, { "loss": 0.0316, "grad_norm": 0.25100478529930115, "learning_rate": 3.453215837309192e-05, "epoch": 0.7417802726543705, "step": 925 }, { "loss": 0.0355, "grad_norm": 0.23625293374061584, "learning_rate": 3.4331818671052906e-05, "epoch": 0.7425821972734563, "step": 926 }, { "loss": 0.0158, "grad_norm": 0.4031226336956024, "learning_rate": 3.413194129794869e-05, "epoch": 0.7433841218925421, "step": 927 }, { "loss": 0.0269, "grad_norm": 0.27065587043762207, "learning_rate": 3.393252766099187e-05, "epoch": 0.7441860465116279, "step": 928 }, { "loss": 0.0348, "grad_norm": 0.2262876033782959, "learning_rate": 3.373357916413016e-05, "epoch": 0.7449879711307137, "step": 929 }, { "loss": 0.0519, "grad_norm": 0.4427652359008789, "learning_rate": 3.353509720803658e-05, "epoch": 0.7457898957497995, "step": 930 }, { "loss": 0.0686, "grad_norm": 0.46217551827430725, "learning_rate": 3.333708319009945e-05, "epoch": 0.7465918203688853, "step": 931 }, { "loss": 0.0278, "grad_norm": 0.3490634262561798, "learning_rate": 3.313953850441266e-05, "epoch": 0.7473937449879712, "step": 932 }, { "loss": 0.0184, "grad_norm": 0.23873376846313477, "learning_rate": 3.294246454176577e-05, "epoch": 0.748195669607057, "step": 933 }, { "loss": 0.0297, "grad_norm": 0.35058480501174927, "learning_rate": 3.274586268963443e-05, "epoch": 0.7489975942261428, "step": 934 }, { "loss": 0.0616, "grad_norm": 0.7911800742149353, "learning_rate": 3.254973433217021e-05, "epoch": 0.7497995188452286, "step": 935 }, { "loss": 0.0355, "grad_norm": 0.2944418489933014, "learning_rate": 3.2354080850191324e-05, "epoch": 0.7506014434643143, "step": 936 }, { "loss": 0.0705, "grad_norm": 0.495128333568573, "learning_rate": 3.215890362117256e-05, "epoch": 0.7514033680834001, "step": 937 }, { "loss": 0.0297, "grad_norm": 0.3445483446121216, "learning_rate": 3.196420401923566e-05, "epoch": 0.7522052927024859, "step": 938 }, { "loss": 0.025, "grad_norm": 0.28738442063331604, "learning_rate": 3.176998341513989e-05, "epoch": 0.7530072173215717, "step": 939 }, { "loss": 0.0215, "grad_norm": 0.24774937331676483, "learning_rate": 3.157624317627195e-05, "epoch": 0.7538091419406576, "step": 940 }, { "loss": 0.0215, "grad_norm": 0.2146318256855011, "learning_rate": 3.138298466663681e-05, "epoch": 0.7546110665597434, "step": 941 }, { "loss": 0.0433, "grad_norm": 0.27725639939308167, "learning_rate": 3.119020924684762e-05, "epoch": 0.7554129911788292, "step": 942 }, { "loss": 0.0978, "grad_norm": 0.4043017029762268, "learning_rate": 3.099791827411668e-05, "epoch": 0.756214915797915, "step": 943 }, { "loss": 0.0499, "grad_norm": 0.33598214387893677, "learning_rate": 3.080611310224539e-05, "epoch": 0.7570168404170008, "step": 944 }, { "loss": 0.0422, "grad_norm": 0.5092307925224304, "learning_rate": 3.061479508161502e-05, "epoch": 0.7578187650360866, "step": 945 }, { "loss": 0.0192, "grad_norm": 0.37134242057800293, "learning_rate": 3.042396555917707e-05, "epoch": 0.7586206896551724, "step": 946 }, { "loss": 0.0539, "grad_norm": 0.36531612277030945, "learning_rate": 3.0233625878443927e-05, "epoch": 0.7594226142742582, "step": 947 }, { "loss": 0.0405, "grad_norm": 0.3020681142807007, "learning_rate": 3.0043777379479098e-05, "epoch": 0.7602245388933441, "step": 948 }, { "loss": 0.093, "grad_norm": 0.45718201994895935, "learning_rate": 2.985442139888821e-05, "epoch": 0.7610264635124299, "step": 949 }, { "loss": 0.0479, "grad_norm": 0.26453983783721924, "learning_rate": 2.9665559269809217e-05, "epoch": 0.7618283881315157, "step": 950 }, { "loss": 0.06, "grad_norm": 0.3121758699417114, "learning_rate": 2.9477192321903292e-05, "epoch": 0.7626303127506014, "step": 951 }, { "loss": 0.0204, "grad_norm": 0.20015697181224823, "learning_rate": 2.9289321881345254e-05, "epoch": 0.7634322373696872, "step": 952 }, { "loss": 0.0371, "grad_norm": 0.36291614174842834, "learning_rate": 2.9101949270814344e-05, "epoch": 0.764234161988773, "step": 953 }, { "loss": 0.0229, "grad_norm": 0.2591784596443176, "learning_rate": 2.8915075809484904e-05, "epoch": 0.7650360866078588, "step": 954 }, { "loss": 0.0261, "grad_norm": 0.2614014744758606, "learning_rate": 2.872870281301704e-05, "epoch": 0.7658380112269446, "step": 955 }, { "loss": 0.0228, "grad_norm": 0.3347266912460327, "learning_rate": 2.854283159354748e-05, "epoch": 0.7666399358460305, "step": 956 }, { "loss": 0.0368, "grad_norm": 0.3613988757133484, "learning_rate": 2.835746345968012e-05, "epoch": 0.7674418604651163, "step": 957 }, { "loss": 0.0175, "grad_norm": 0.2561342716217041, "learning_rate": 2.8172599716477143e-05, "epoch": 0.7682437850842021, "step": 958 }, { "loss": 0.0396, "grad_norm": 0.2864450216293335, "learning_rate": 2.7988241665449354e-05, "epoch": 0.7690457097032879, "step": 959 }, { "loss": 0.0294, "grad_norm": 0.2632593512535095, "learning_rate": 2.7804390604547557e-05, "epoch": 0.7698476343223737, "step": 960 }, { "loss": 0.0252, "grad_norm": 0.2442079782485962, "learning_rate": 2.7621047828153e-05, "epoch": 0.7706495589414595, "step": 961 }, { "loss": 0.027, "grad_norm": 0.30140507221221924, "learning_rate": 2.7438214627068448e-05, "epoch": 0.7714514835605453, "step": 962 }, { "loss": 0.0602, "grad_norm": 0.4731035828590393, "learning_rate": 2.7255892288509043e-05, "epoch": 0.7722534081796311, "step": 963 }, { "loss": 0.0201, "grad_norm": 0.25631648302078247, "learning_rate": 2.707408209609339e-05, "epoch": 0.773055332798717, "step": 964 }, { "loss": 0.0316, "grad_norm": 0.2207869291305542, "learning_rate": 2.689278532983416e-05, "epoch": 0.7738572574178028, "step": 965 }, { "loss": 0.0997, "grad_norm": 0.574215292930603, "learning_rate": 2.6712003266129525e-05, "epoch": 0.7746591820368885, "step": 966 }, { "loss": 0.107, "grad_norm": 0.5775609612464905, "learning_rate": 2.65317371777538e-05, "epoch": 0.7754611066559743, "step": 967 }, { "loss": 0.028, "grad_norm": 0.37875795364379883, "learning_rate": 2.6351988333848788e-05, "epoch": 0.7762630312750601, "step": 968 }, { "loss": 0.026, "grad_norm": 0.2766773998737335, "learning_rate": 2.6172757999914554e-05, "epoch": 0.7770649558941459, "step": 969 }, { "loss": 0.066, "grad_norm": 0.3758871555328369, "learning_rate": 2.5994047437800706e-05, "epoch": 0.7778668805132317, "step": 970 }, { "loss": 0.019, "grad_norm": 0.2001071274280548, "learning_rate": 2.5815857905697548e-05, "epoch": 0.7786688051323175, "step": 971 }, { "loss": 0.0469, "grad_norm": 0.35508066415786743, "learning_rate": 2.5638190658126938e-05, "epoch": 0.7794707297514034, "step": 972 }, { "loss": 0.05, "grad_norm": 0.4158821403980255, "learning_rate": 2.5461046945933854e-05, "epoch": 0.7802726543704892, "step": 973 }, { "loss": 0.0471, "grad_norm": 0.4256257712841034, "learning_rate": 2.5284428016277284e-05, "epoch": 0.781074578989575, "step": 974 }, { "loss": 0.0345, "grad_norm": 0.4111097753047943, "learning_rate": 2.5108335112621562e-05, "epoch": 0.7818765036086608, "step": 975 }, { "loss": 0.0287, "grad_norm": 0.2494903802871704, "learning_rate": 2.493276947472756e-05, "epoch": 0.7826784282277466, "step": 976 }, { "loss": 0.0141, "grad_norm": 0.3150325119495392, "learning_rate": 2.4757732338644124e-05, "epoch": 0.7834803528468324, "step": 977 }, { "loss": 0.0281, "grad_norm": 0.33352166414260864, "learning_rate": 2.458322493669911e-05, "epoch": 0.7842822774659182, "step": 978 }, { "loss": 0.0404, "grad_norm": 0.2655154764652252, "learning_rate": 2.4409248497490922e-05, "epoch": 0.785084202085004, "step": 979 }, { "loss": 0.0338, "grad_norm": 0.4300474524497986, "learning_rate": 2.4235804245879723e-05, "epoch": 0.7858861267040899, "step": 980 }, { "loss": 0.0173, "grad_norm": 0.18177032470703125, "learning_rate": 2.4062893402978958e-05, "epoch": 0.7866880513231757, "step": 981 }, { "loss": 0.0422, "grad_norm": 0.3139914572238922, "learning_rate": 2.389051718614662e-05, "epoch": 0.7874899759422614, "step": 982 }, { "loss": 0.0139, "grad_norm": 0.16391263902187347, "learning_rate": 2.371867680897668e-05, "epoch": 0.7882919005613472, "step": 983 }, { "loss": 0.0496, "grad_norm": 0.3264078199863434, "learning_rate": 2.354737348129077e-05, "epoch": 0.789093825180433, "step": 984 }, { "loss": 0.0342, "grad_norm": 0.3129443824291229, "learning_rate": 2.337660840912923e-05, "epoch": 0.7898957497995188, "step": 985 }, { "loss": 0.0892, "grad_norm": 0.5504446029663086, "learning_rate": 2.320638279474312e-05, "epoch": 0.7906976744186046, "step": 986 }, { "loss": 0.0212, "grad_norm": 0.2645837664604187, "learning_rate": 2.3036697836585353e-05, "epoch": 0.7914995990376904, "step": 987 }, { "loss": 0.0374, "grad_norm": 0.3089625835418701, "learning_rate": 2.2867554729302542e-05, "epoch": 0.7923015236567763, "step": 988 }, { "loss": 0.068, "grad_norm": 0.5308516025543213, "learning_rate": 2.26989546637263e-05, "epoch": 0.7931034482758621, "step": 989 }, { "loss": 0.0077, "grad_norm": 0.20258042216300964, "learning_rate": 2.25308988268652e-05, "epoch": 0.7939053728949479, "step": 990 }, { "loss": 0.0377, "grad_norm": 0.3660711348056793, "learning_rate": 2.2363388401896124e-05, "epoch": 0.7947072975140337, "step": 991 }, { "loss": 0.037, "grad_norm": 0.3499682545661926, "learning_rate": 2.2196424568156073e-05, "epoch": 0.7955092221331195, "step": 992 }, { "eval_loss": 0.039402980357408524, "eval_runtime": 31.7538, "eval_samples_per_second": 33.067, "eval_steps_per_second": 8.282, "epoch": 0.7955092221331195, "step": 992 }, { "loss": 0.0333, "grad_norm": 0.259440541267395, "learning_rate": 2.2030008501133815e-05, "epoch": 0.7963111467522053, "step": 993 }, { "loss": 0.0071, "grad_norm": 0.16091641783714294, "learning_rate": 2.186414137246172e-05, "epoch": 0.7971130713712911, "step": 994 }, { "loss": 0.0539, "grad_norm": 0.4330938160419464, "learning_rate": 2.1698824349907344e-05, "epoch": 0.7979149959903769, "step": 995 }, { "loss": 0.0287, "grad_norm": 0.2377873659133911, "learning_rate": 2.153405859736528e-05, "epoch": 0.7987169206094628, "step": 996 }, { "loss": 0.0286, "grad_norm": 0.30016374588012695, "learning_rate": 2.136984527484901e-05, "epoch": 0.7995188452285485, "step": 997 }, { "loss": 0.0644, "grad_norm": 0.3702533543109894, "learning_rate": 2.1206185538482703e-05, "epoch": 0.8003207698476343, "step": 998 }, { "loss": 0.0425, "grad_norm": 0.31753045320510864, "learning_rate": 2.1043080540493056e-05, "epoch": 0.8011226944667201, "step": 999 }, { "loss": 0.0495, "grad_norm": 0.26273587346076965, "learning_rate": 2.0880531429201145e-05, "epoch": 0.8019246190858059, "step": 1000 }, { "loss": 0.0328, "grad_norm": 0.2284964621067047, "learning_rate": 2.0718539349014544e-05, "epoch": 0.8027265437048917, "step": 1001 }, { "loss": 0.0299, "grad_norm": 0.31368839740753174, "learning_rate": 2.05571054404189e-05, "epoch": 0.8035284683239775, "step": 1002 }, { "loss": 0.0734, "grad_norm": 0.5880581736564636, "learning_rate": 2.039623083997031e-05, "epoch": 0.8043303929430633, "step": 1003 }, { "loss": 0.0626, "grad_norm": 0.6813879609107971, "learning_rate": 2.0235916680287015e-05, "epoch": 0.8051323175621492, "step": 1004 }, { "loss": 0.0493, "grad_norm": 0.8034153580665588, "learning_rate": 2.007616409004165e-05, "epoch": 0.805934242181235, "step": 1005 }, { "loss": 0.0275, "grad_norm": 0.2636951208114624, "learning_rate": 1.991697419395301e-05, "epoch": 0.8067361668003208, "step": 1006 }, { "loss": 0.0725, "grad_norm": 0.7305841445922852, "learning_rate": 1.97583481127785e-05, "epoch": 0.8075380914194066, "step": 1007 }, { "loss": 0.0484, "grad_norm": 0.3095945119857788, "learning_rate": 1.9600286963305957e-05, "epoch": 0.8083400160384924, "step": 1008 }, { "loss": 0.0204, "grad_norm": 0.21153652667999268, "learning_rate": 1.9442791858345887e-05, "epoch": 0.8091419406575782, "step": 1009 }, { "loss": 0.0456, "grad_norm": 0.2735914885997772, "learning_rate": 1.928586390672361e-05, "epoch": 0.809943865276664, "step": 1010 }, { "loss": 0.0288, "grad_norm": 0.2760597765445709, "learning_rate": 1.9129504213271564e-05, "epoch": 0.8107457898957497, "step": 1011 }, { "loss": 0.0254, "grad_norm": 0.25427863001823425, "learning_rate": 1.897371387882134e-05, "epoch": 0.8115477145148356, "step": 1012 }, { "loss": 0.0204, "grad_norm": 0.23352181911468506, "learning_rate": 1.881849400019602e-05, "epoch": 0.8123496391339214, "step": 1013 }, { "loss": 0.0979, "grad_norm": 0.5435330867767334, "learning_rate": 1.8663845670202563e-05, "epoch": 0.8131515637530072, "step": 1014 }, { "loss": 0.025, "grad_norm": 0.2618383467197418, "learning_rate": 1.85097699776239e-05, "epoch": 0.813953488372093, "step": 1015 }, { "loss": 0.024, "grad_norm": 0.26069387793540955, "learning_rate": 1.835626800721144e-05, "epoch": 0.8147554129911788, "step": 1016 }, { "loss": 0.0237, "grad_norm": 0.2411407083272934, "learning_rate": 1.8203340839677308e-05, "epoch": 0.8155573376102646, "step": 1017 }, { "loss": 0.0388, "grad_norm": 0.4038242697715759, "learning_rate": 1.8050989551686914e-05, "epoch": 0.8163592622293504, "step": 1018 }, { "loss": 0.068, "grad_norm": 0.5744785070419312, "learning_rate": 1.7899215215851084e-05, "epoch": 0.8171611868484362, "step": 1019 }, { "loss": 0.0195, "grad_norm": 0.23403626680374146, "learning_rate": 1.7748018900718854e-05, "epoch": 0.8179631114675221, "step": 1020 }, { "loss": 0.0095, "grad_norm": 0.11110510677099228, "learning_rate": 1.7597401670769685e-05, "epoch": 0.8187650360866079, "step": 1021 }, { "loss": 0.0321, "grad_norm": 0.38874614238739014, "learning_rate": 1.7447364586406066e-05, "epoch": 0.8195669607056937, "step": 1022 }, { "loss": 0.083, "grad_norm": 0.46676599979400635, "learning_rate": 1.729790870394603e-05, "epoch": 0.8203688853247795, "step": 1023 }, { "loss": 0.0314, "grad_norm": 0.4288753569126129, "learning_rate": 1.7149035075615794e-05, "epoch": 0.8211708099438653, "step": 1024 }, { "loss": 0.0651, "grad_norm": 0.29615238308906555, "learning_rate": 1.7000744749542208e-05, "epoch": 0.8219727345629511, "step": 1025 }, { "loss": 0.068, "grad_norm": 0.4459689259529114, "learning_rate": 1.6853038769745467e-05, "epoch": 0.8227746591820368, "step": 1026 }, { "loss": 0.0259, "grad_norm": 0.22963477671146393, "learning_rate": 1.670591817613181e-05, "epoch": 0.8235765838011226, "step": 1027 }, { "loss": 0.0222, "grad_norm": 0.2704809010028839, "learning_rate": 1.6559384004486055e-05, "epoch": 0.8243785084202085, "step": 1028 }, { "loss": 0.028, "grad_norm": 0.28117361664772034, "learning_rate": 1.6413437286464417e-05, "epoch": 0.8251804330392943, "step": 1029 }, { "loss": 0.0303, "grad_norm": 0.22778946161270142, "learning_rate": 1.6268079049587203e-05, "epoch": 0.8259823576583801, "step": 1030 }, { "loss": 0.1011, "grad_norm": 0.7209060788154602, "learning_rate": 1.6123310317231643e-05, "epoch": 0.8267842822774659, "step": 1031 }, { "loss": 0.0223, "grad_norm": 0.383091002702713, "learning_rate": 1.5979132108624574e-05, "epoch": 0.8275862068965517, "step": 1032 }, { "loss": 0.0628, "grad_norm": 0.5048542618751526, "learning_rate": 1.583554543883532e-05, "epoch": 0.8283881315156375, "step": 1033 }, { "loss": 0.0255, "grad_norm": 0.21313592791557312, "learning_rate": 1.5692551318768556e-05, "epoch": 0.8291900561347233, "step": 1034 }, { "loss": 0.0251, "grad_norm": 0.2026532143354416, "learning_rate": 1.5550150755157268e-05, "epoch": 0.8299919807538091, "step": 1035 }, { "loss": 0.0779, "grad_norm": 0.3825243413448334, "learning_rate": 1.5408344750555383e-05, "epoch": 0.830793905372895, "step": 1036 }, { "loss": 0.0298, "grad_norm": 0.31827080249786377, "learning_rate": 1.5267134303331122e-05, "epoch": 0.8315958299919808, "step": 1037 }, { "loss": 0.0338, "grad_norm": 0.25245165824890137, "learning_rate": 1.5126520407659617e-05, "epoch": 0.8323977546110666, "step": 1038 }, { "loss": 0.026, "grad_norm": 0.22130174934864044, "learning_rate": 1.4986504053516105e-05, "epoch": 0.8331996792301524, "step": 1039 }, { "loss": 0.0215, "grad_norm": 0.22145700454711914, "learning_rate": 1.4847086226668872e-05, "epoch": 0.8340016038492382, "step": 1040 }, { "loss": 0.0759, "grad_norm": 0.42060795426368713, "learning_rate": 1.4708267908672401e-05, "epoch": 0.834803528468324, "step": 1041 }, { "loss": 0.0294, "grad_norm": 0.32257431745529175, "learning_rate": 1.4570050076860342e-05, "epoch": 0.8356054530874097, "step": 1042 }, { "loss": 0.0242, "grad_norm": 0.16260656714439392, "learning_rate": 1.4432433704338722e-05, "epoch": 0.8364073777064955, "step": 1043 }, { "loss": 0.0232, "grad_norm": 0.2972884476184845, "learning_rate": 1.429541975997908e-05, "epoch": 0.8372093023255814, "step": 1044 }, { "loss": 0.0321, "grad_norm": 0.3135718107223511, "learning_rate": 1.415900920841161e-05, "epoch": 0.8380112269446672, "step": 1045 }, { "loss": 0.0203, "grad_norm": 0.21164710819721222, "learning_rate": 1.4023203010018394e-05, "epoch": 0.838813151563753, "step": 1046 }, { "loss": 0.021, "grad_norm": 0.2070372849702835, "learning_rate": 1.3888002120926623e-05, "epoch": 0.8396150761828388, "step": 1047 }, { "loss": 0.0428, "grad_norm": 0.39735156297683716, "learning_rate": 1.3753407493001968e-05, "epoch": 0.8404170008019246, "step": 1048 }, { "loss": 0.02, "grad_norm": 0.23848125338554382, "learning_rate": 1.3619420073841637e-05, "epoch": 0.8412189254210104, "step": 1049 }, { "loss": 0.056, "grad_norm": 0.3926337957382202, "learning_rate": 1.3486040806767996e-05, "epoch": 0.8420208500400962, "step": 1050 }, { "loss": 0.0319, "grad_norm": 0.2710540294647217, "learning_rate": 1.3353270630821712e-05, "epoch": 0.842822774659182, "step": 1051 }, { "loss": 0.0402, "grad_norm": 0.41882696747779846, "learning_rate": 1.3221110480755305e-05, "epoch": 0.8436246992782679, "step": 1052 }, { "loss": 0.0475, "grad_norm": 0.4259793162345886, "learning_rate": 1.3089561287026319e-05, "epoch": 0.8444266238973537, "step": 1053 }, { "loss": 0.0314, "grad_norm": 0.26140275597572327, "learning_rate": 1.2958623975791118e-05, "epoch": 0.8452285485164395, "step": 1054 }, { "loss": 0.0476, "grad_norm": 0.3547419309616089, "learning_rate": 1.2828299468898076e-05, "epoch": 0.8460304731355253, "step": 1055 }, { "loss": 0.0493, "grad_norm": 0.40817224979400635, "learning_rate": 1.2698588683881186e-05, "epoch": 0.846832397754611, "step": 1056 }, { "loss": 0.0312, "grad_norm": 0.2400854080915451, "learning_rate": 1.2569492533953665e-05, "epoch": 0.8476343223736968, "step": 1057 }, { "loss": 0.0251, "grad_norm": 0.3751298785209656, "learning_rate": 1.2441011928001433e-05, "epoch": 0.8484362469927826, "step": 1058 }, { "loss": 0.0223, "grad_norm": 0.2388581931591034, "learning_rate": 1.2313147770576749e-05, "epoch": 0.8492381716118684, "step": 1059 }, { "loss": 0.0415, "grad_norm": 0.3184750974178314, "learning_rate": 1.2185900961891794e-05, "epoch": 0.8500400962309543, "step": 1060 }, { "loss": 0.0818, "grad_norm": 0.5252367258071899, "learning_rate": 1.2059272397812493e-05, "epoch": 0.8508420208500401, "step": 1061 }, { "loss": 0.0277, "grad_norm": 0.3043983578681946, "learning_rate": 1.1933262969851988e-05, "epoch": 0.8516439454691259, "step": 1062 }, { "loss": 0.0106, "grad_norm": 0.09968919306993484, "learning_rate": 1.1807873565164506e-05, "epoch": 0.8524458700882117, "step": 1063 }, { "loss": 0.0365, "grad_norm": 0.27977362275123596, "learning_rate": 1.1683105066539068e-05, "epoch": 0.8532477947072975, "step": 1064 }, { "loss": 0.0589, "grad_norm": 0.3934069275856018, "learning_rate": 1.1558958352393334e-05, "epoch": 0.8540497193263833, "step": 1065 }, { "loss": 0.0272, "grad_norm": 0.24241115152835846, "learning_rate": 1.1435434296767233e-05, "epoch": 0.8548516439454691, "step": 1066 }, { "loss": 0.0502, "grad_norm": 0.31850865483283997, "learning_rate": 1.1312533769317103e-05, "epoch": 0.8556535685645549, "step": 1067 }, { "loss": 0.0295, "grad_norm": 0.2685650885105133, "learning_rate": 1.1190257635309275e-05, "epoch": 0.8564554931836408, "step": 1068 }, { "loss": 0.0346, "grad_norm": 0.20911841094493866, "learning_rate": 1.106860675561424e-05, "epoch": 0.8572574178027266, "step": 1069 }, { "loss": 0.0187, "grad_norm": 0.24455945193767548, "learning_rate": 1.0947581986700306e-05, "epoch": 0.8580593424218124, "step": 1070 }, { "loss": 0.0396, "grad_norm": 0.3328882157802582, "learning_rate": 1.0827184180627858e-05, "epoch": 0.8588612670408982, "step": 1071 }, { "loss": 0.0463, "grad_norm": 0.4071497917175293, "learning_rate": 1.0707414185043163e-05, "epoch": 0.859663191659984, "step": 1072 }, { "loss": 0.0366, "grad_norm": 0.411491334438324, "learning_rate": 1.0588272843172454e-05, "epoch": 0.8604651162790697, "step": 1073 }, { "loss": 0.0312, "grad_norm": 0.4028700888156891, "learning_rate": 1.0469760993816057e-05, "epoch": 0.8612670408981555, "step": 1074 }, { "loss": 0.0297, "grad_norm": 0.17155224084854126, "learning_rate": 1.0351879471342374e-05, "epoch": 0.8620689655172413, "step": 1075 }, { "loss": 0.047, "grad_norm": 0.3104127049446106, "learning_rate": 1.0234629105682103e-05, "epoch": 0.8628708901363272, "step": 1076 }, { "loss": 0.0207, "grad_norm": 0.2133323699235916, "learning_rate": 1.0118010722322314e-05, "epoch": 0.863672814755413, "step": 1077 }, { "loss": 0.0143, "grad_norm": 0.17464157938957214, "learning_rate": 1.0002025142300765e-05, "epoch": 0.8644747393744988, "step": 1078 }, { "loss": 0.0289, "grad_norm": 0.3865419328212738, "learning_rate": 9.886673182199957e-06, "epoch": 0.8652766639935846, "step": 1079 }, { "loss": 0.0185, "grad_norm": 0.2113240659236908, "learning_rate": 9.771955654141496e-06, "epoch": 0.8660785886126704, "step": 1080 }, { "loss": 0.0135, "grad_norm": 0.138031005859375, "learning_rate": 9.657873365780323e-06, "epoch": 0.8668805132317562, "step": 1081 }, { "loss": 0.0308, "grad_norm": 0.3399095833301544, "learning_rate": 9.544427120299138e-06, "epoch": 0.867682437850842, "step": 1082 }, { "loss": 0.0146, "grad_norm": 0.26471027731895447, "learning_rate": 9.431617716402507e-06, "epoch": 0.8684843624699278, "step": 1083 }, { "loss": 0.0202, "grad_norm": 0.19661951065063477, "learning_rate": 9.319445948311534e-06, "epoch": 0.8692862870890137, "step": 1084 }, { "loss": 0.0636, "grad_norm": 0.49121007323265076, "learning_rate": 9.207912605758052e-06, "epoch": 0.8700882117080995, "step": 1085 }, { "loss": 0.0532, "grad_norm": 0.4061635434627533, "learning_rate": 9.097018473979124e-06, "epoch": 0.8708901363271853, "step": 1086 }, { "loss": 0.0411, "grad_norm": 0.29139432311058044, "learning_rate": 8.986764333711584e-06, "epoch": 0.871692060946271, "step": 1087 }, { "loss": 0.0191, "grad_norm": 0.14276857674121857, "learning_rate": 8.87715096118642e-06, "epoch": 0.8724939855653568, "step": 1088 }, { "loss": 0.0147, "grad_norm": 0.19102653861045837, "learning_rate": 8.768179128123455e-06, "epoch": 0.8732959101844426, "step": 1089 }, { "loss": 0.106, "grad_norm": 0.6417858004570007, "learning_rate": 8.659849601725701e-06, "epoch": 0.8740978348035284, "step": 1090 }, { "loss": 0.0448, "grad_norm": 0.3078593313694, "learning_rate": 8.55216314467422e-06, "epoch": 0.8748997594226142, "step": 1091 }, { "loss": 0.0169, "grad_norm": 0.22024403512477875, "learning_rate": 8.445120515122551e-06, "epoch": 0.8757016840417001, "step": 1092 }, { "loss": 0.0637, "grad_norm": 0.4088142514228821, "learning_rate": 8.338722466691451e-06, "epoch": 0.8765036086607859, "step": 1093 }, { "loss": 0.0353, "grad_norm": 0.2129702866077423, "learning_rate": 8.23296974846357e-06, "epoch": 0.8773055332798717, "step": 1094 }, { "loss": 0.0619, "grad_norm": 0.438853919506073, "learning_rate": 8.127863104978261e-06, "epoch": 0.8781074578989575, "step": 1095 }, { "loss": 0.0339, "grad_norm": 0.30360084772109985, "learning_rate": 8.023403276226126e-06, "epoch": 0.8789093825180433, "step": 1096 }, { "loss": 0.0257, "grad_norm": 0.2785508930683136, "learning_rate": 7.91959099764411e-06, "epoch": 0.8797113071371291, "step": 1097 }, { "loss": 0.0355, "grad_norm": 0.2872433066368103, "learning_rate": 7.816427000110015e-06, "epoch": 0.8805132317562149, "step": 1098 }, { "loss": 0.0103, "grad_norm": 0.11933287978172302, "learning_rate": 7.713912009937608e-06, "epoch": 0.8813151563753007, "step": 1099 }, { "loss": 0.0392, "grad_norm": 0.26215773820877075, "learning_rate": 7.612046748871327e-06, "epoch": 0.8821170809943866, "step": 1100 }, { "loss": 0.0147, "grad_norm": 1.0445473194122314, "learning_rate": 7.5108319340813085e-06, "epoch": 0.8829190056134724, "step": 1101 }, { "loss": 0.0212, "grad_norm": 0.18120186030864716, "learning_rate": 7.410268278158272e-06, "epoch": 0.8837209302325582, "step": 1102 }, { "loss": 0.0471, "grad_norm": 0.37671101093292236, "learning_rate": 7.310356489108538e-06, "epoch": 0.884522854851644, "step": 1103 }, { "loss": 0.0455, "grad_norm": 0.38250431418418884, "learning_rate": 7.211097270349066e-06, "epoch": 0.8853247794707297, "step": 1104 }, { "loss": 0.0233, "grad_norm": 0.36999034881591797, "learning_rate": 7.112491320702441e-06, "epoch": 0.8861267040898155, "step": 1105 }, { "loss": 0.0264, "grad_norm": 0.2825574278831482, "learning_rate": 7.014539334392012e-06, "epoch": 0.8869286287089013, "step": 1106 }, { "loss": 0.064, "grad_norm": 0.35153815150260925, "learning_rate": 6.917242001036917e-06, "epoch": 0.8877305533279871, "step": 1107 }, { "loss": 0.0188, "grad_norm": 0.14561624825000763, "learning_rate": 6.820600005647382e-06, "epoch": 0.888532477947073, "step": 1108 }, { "loss": 0.0561, "grad_norm": 0.463888555765152, "learning_rate": 6.7246140286197355e-06, "epoch": 0.8893344025661588, "step": 1109 }, { "loss": 0.0041, "grad_norm": 0.0676012933254242, "learning_rate": 6.629284745731701e-06, "epoch": 0.8901363271852446, "step": 1110 }, { "loss": 0.0159, "grad_norm": 0.18029426038265228, "learning_rate": 6.5346128281376204e-06, "epoch": 0.8909382518043304, "step": 1111 }, { "loss": 0.063, "grad_norm": 0.35441911220550537, "learning_rate": 6.440598942363796e-06, "epoch": 0.8917401764234162, "step": 1112 }, { "loss": 0.0502, "grad_norm": 0.37630465626716614, "learning_rate": 6.347243750303622e-06, "epoch": 0.892542101042502, "step": 1113 }, { "loss": 0.0604, "grad_norm": 0.3076138496398926, "learning_rate": 6.254547909213149e-06, "epoch": 0.8933440256615878, "step": 1114 }, { "loss": 0.0382, "grad_norm": 0.2812439203262329, "learning_rate": 6.162512071706272e-06, "epoch": 0.8941459502806736, "step": 1115 }, { "loss": 0.0421, "grad_norm": 0.26998552680015564, "learning_rate": 6.071136885750272e-06, "epoch": 0.8949478748997595, "step": 1116 }, { "eval_loss": 0.03864981606602669, "eval_runtime": 31.6055, "eval_samples_per_second": 33.222, "eval_steps_per_second": 8.321, "epoch": 0.8949478748997595, "step": 1116 }, { "loss": 0.0632, "grad_norm": 0.4778763949871063, "learning_rate": 5.980422994661139e-06, "epoch": 0.8957497995188453, "step": 1117 }, { "loss": 0.0152, "grad_norm": 0.18333400785923004, "learning_rate": 5.890371037099107e-06, "epoch": 0.896551724137931, "step": 1118 }, { "loss": 0.027, "grad_norm": 0.28018802404403687, "learning_rate": 5.800981647064186e-06, "epoch": 0.8973536487570168, "step": 1119 }, { "loss": 0.0281, "grad_norm": 0.2619428336620331, "learning_rate": 5.71225545389158e-06, "epoch": 0.8981555733761026, "step": 1120 }, { "loss": 0.0274, "grad_norm": 0.2673223912715912, "learning_rate": 5.624193082247431e-06, "epoch": 0.8989574979951884, "step": 1121 }, { "loss": 0.0482, "grad_norm": 0.2983281910419464, "learning_rate": 5.536795152124252e-06, "epoch": 0.8997594226142742, "step": 1122 }, { "loss": 0.0384, "grad_norm": 0.30985018610954285, "learning_rate": 5.450062278836677e-06, "epoch": 0.90056134723336, "step": 1123 }, { "loss": 0.0169, "grad_norm": 0.1679602563381195, "learning_rate": 5.363995073017047e-06, "epoch": 0.9013632718524459, "step": 1124 }, { "loss": 0.0396, "grad_norm": 0.29093822836875916, "learning_rate": 5.278594140611204e-06, "epoch": 0.9021651964715317, "step": 1125 }, { "loss": 0.0218, "grad_norm": 0.26348739862442017, "learning_rate": 5.193860082874125e-06, "epoch": 0.9029671210906175, "step": 1126 }, { "loss": 0.0458, "grad_norm": 0.5017328262329102, "learning_rate": 5.1097934963657665e-06, "epoch": 0.9037690457097033, "step": 1127 }, { "loss": 0.0321, "grad_norm": 0.3663092851638794, "learning_rate": 5.026394972946813e-06, "epoch": 0.9045709703287891, "step": 1128 }, { "loss": 0.0308, "grad_norm": 0.28317686915397644, "learning_rate": 4.943665099774553e-06, "epoch": 0.9053728949478749, "step": 1129 }, { "loss": 0.0492, "grad_norm": 0.4701959490776062, "learning_rate": 4.861604459298696e-06, "epoch": 0.9061748195669607, "step": 1130 }, { "loss": 0.0677, "grad_norm": 0.5933964252471924, "learning_rate": 4.780213629257324e-06, "epoch": 0.9069767441860465, "step": 1131 }, { "loss": 0.0532, "grad_norm": 0.35850825905799866, "learning_rate": 4.69949318267281e-06, "epoch": 0.9077786688051324, "step": 1132 }, { "loss": 0.0429, "grad_norm": 0.3367816209793091, "learning_rate": 4.619443687847702e-06, "epoch": 0.9085805934242182, "step": 1133 }, { "loss": 0.0223, "grad_norm": 0.3213984966278076, "learning_rate": 4.540065708360886e-06, "epoch": 0.909382518043304, "step": 1134 }, { "loss": 0.0452, "grad_norm": 0.4436606168746948, "learning_rate": 4.461359803063458e-06, "epoch": 0.9101844426623897, "step": 1135 }, { "loss": 0.0179, "grad_norm": 0.3105975389480591, "learning_rate": 4.383326526074916e-06, "epoch": 0.9109863672814755, "step": 1136 }, { "loss": 0.0313, "grad_norm": 0.3733506202697754, "learning_rate": 4.305966426779118e-06, "epoch": 0.9117882919005613, "step": 1137 }, { "loss": 0.0142, "grad_norm": 0.28305160999298096, "learning_rate": 4.229280049820561e-06, "epoch": 0.9125902165196471, "step": 1138 }, { "loss": 0.0351, "grad_norm": 0.431251585483551, "learning_rate": 4.15326793510048e-06, "epoch": 0.9133921411387329, "step": 1139 }, { "loss": 0.0231, "grad_norm": 0.20681129395961761, "learning_rate": 4.077930617773007e-06, "epoch": 0.9141940657578188, "step": 1140 }, { "loss": 0.0448, "grad_norm": 0.314126193523407, "learning_rate": 4.003268628241452e-06, "epoch": 0.9149959903769046, "step": 1141 }, { "loss": 0.0495, "grad_norm": 0.3883209228515625, "learning_rate": 3.929282492154607e-06, "epoch": 0.9157979149959904, "step": 1142 }, { "loss": 0.0204, "grad_norm": 0.28020593523979187, "learning_rate": 3.855972730402968e-06, "epoch": 0.9165998396150762, "step": 1143 }, { "loss": 0.0161, "grad_norm": 0.23583762347698212, "learning_rate": 3.783339859115065e-06, "epoch": 0.917401764234162, "step": 1144 }, { "loss": 0.0306, "grad_norm": 0.2325585037469864, "learning_rate": 3.711384389653916e-06, "epoch": 0.9182036888532478, "step": 1145 }, { "loss": 0.0235, "grad_norm": 0.24619098007678986, "learning_rate": 3.6401068286133542e-06, "epoch": 0.9190056134723336, "step": 1146 }, { "loss": 0.033, "grad_norm": 0.3259875476360321, "learning_rate": 3.5695076778144875e-06, "epoch": 0.9198075380914194, "step": 1147 }, { "loss": 0.0304, "grad_norm": 0.25106412172317505, "learning_rate": 3.4995874343021094e-06, "epoch": 0.9206094627105053, "step": 1148 }, { "loss": 0.0469, "grad_norm": 0.4743681252002716, "learning_rate": 3.430346590341338e-06, "epoch": 0.921411387329591, "step": 1149 }, { "loss": 0.0539, "grad_norm": 0.34995362162590027, "learning_rate": 3.3617856334139607e-06, "epoch": 0.9222133119486768, "step": 1150 }, { "loss": 0.0662, "grad_norm": 0.7248356342315674, "learning_rate": 3.2939050462151953e-06, "epoch": 0.9230152365677626, "step": 1151 }, { "loss": 0.014, "grad_norm": 0.2265823483467102, "learning_rate": 3.226705306650113e-06, "epoch": 0.9238171611868484, "step": 1152 }, { "loss": 0.013, "grad_norm": 0.20024491846561432, "learning_rate": 3.1601868878304406e-06, "epoch": 0.9246190858059342, "step": 1153 }, { "loss": 0.0347, "grad_norm": 0.25708380341529846, "learning_rate": 3.0943502580710772e-06, "epoch": 0.92542101042502, "step": 1154 }, { "loss": 0.0149, "grad_norm": 0.2992137372493744, "learning_rate": 3.0291958808869037e-06, "epoch": 0.9262229350441058, "step": 1155 }, { "loss": 0.0162, "grad_norm": 0.24897243082523346, "learning_rate": 2.9647242149895006e-06, "epoch": 0.9270248596631917, "step": 1156 }, { "loss": 0.0193, "grad_norm": 0.19664904475212097, "learning_rate": 2.9009357142838477e-06, "epoch": 0.9278267842822775, "step": 1157 }, { "loss": 0.0288, "grad_norm": 0.40675321221351624, "learning_rate": 2.8378308278652288e-06, "epoch": 0.9286287089013633, "step": 1158 }, { "loss": 0.029, "grad_norm": 0.2802353501319885, "learning_rate": 2.775410000016021e-06, "epoch": 0.9294306335204491, "step": 1159 }, { "loss": 0.0561, "grad_norm": 0.9122768044471741, "learning_rate": 2.7136736702025433e-06, "epoch": 0.9302325581395349, "step": 1160 }, { "loss": 0.032, "grad_norm": 0.36887556314468384, "learning_rate": 2.652622273072003e-06, "epoch": 0.9310344827586207, "step": 1161 }, { "loss": 0.0285, "grad_norm": 0.31024497747421265, "learning_rate": 2.5922562384494196e-06, "epoch": 0.9318364073777065, "step": 1162 }, { "loss": 0.0499, "grad_norm": 0.27006012201309204, "learning_rate": 2.532575991334618e-06, "epoch": 0.9326383319967922, "step": 1163 }, { "loss": 0.0422, "grad_norm": 0.3135731816291809, "learning_rate": 2.473581951899184e-06, "epoch": 0.9334402566158782, "step": 1164 }, { "loss": 0.0249, "grad_norm": 0.23915302753448486, "learning_rate": 2.415274535483547e-06, "epoch": 0.9342421812349639, "step": 1165 }, { "loss": 0.0736, "grad_norm": 0.5155062675476074, "learning_rate": 2.357654152594113e-06, "epoch": 0.9350441058540497, "step": 1166 }, { "loss": 0.0379, "grad_norm": 0.28162410855293274, "learning_rate": 2.3007212089001916e-06, "epoch": 0.9358460304731355, "step": 1167 }, { "loss": 0.0247, "grad_norm": 0.2652982771396637, "learning_rate": 2.2444761052313856e-06, "epoch": 0.9366479550922213, "step": 1168 }, { "loss": 0.0219, "grad_norm": 0.1709524244070053, "learning_rate": 2.1889192375745494e-06, "epoch": 0.9374498797113071, "step": 1169 }, { "loss": 0.0725, "grad_norm": 0.5203680396080017, "learning_rate": 2.1340509970711466e-06, "epoch": 0.9382518043303929, "step": 1170 }, { "loss": 0.0466, "grad_norm": 0.5651960372924805, "learning_rate": 2.0798717700144077e-06, "epoch": 0.9390537289494787, "step": 1171 }, { "loss": 0.0316, "grad_norm": 0.219880610704422, "learning_rate": 2.0263819378466884e-06, "epoch": 0.9398556535685646, "step": 1172 }, { "loss": 0.037, "grad_norm": 0.3648744225502014, "learning_rate": 1.973581877156716e-06, "epoch": 0.9406575781876504, "step": 1173 }, { "loss": 0.0337, "grad_norm": 0.35861659049987793, "learning_rate": 1.921471959676957e-06, "epoch": 0.9414595028067362, "step": 1174 }, { "loss": 0.0255, "grad_norm": 0.3175615072250366, "learning_rate": 1.870052552281032e-06, "epoch": 0.942261427425822, "step": 1175 }, { "loss": 0.0521, "grad_norm": 0.41771504282951355, "learning_rate": 1.8193240169810943e-06, "epoch": 0.9430633520449078, "step": 1176 }, { "loss": 0.113, "grad_norm": 0.5672475695610046, "learning_rate": 1.7692867109252886e-06, "epoch": 0.9438652766639936, "step": 1177 }, { "loss": 0.0231, "grad_norm": 0.24217240512371063, "learning_rate": 1.7199409863952521e-06, "epoch": 0.9446672012830793, "step": 1178 }, { "loss": 0.0286, "grad_norm": 0.34596091508865356, "learning_rate": 1.6712871908036387e-06, "epoch": 0.9454691259021651, "step": 1179 }, { "loss": 0.0096, "grad_norm": 0.1532655507326126, "learning_rate": 1.623325666691644e-06, "epoch": 0.946271050521251, "step": 1180 }, { "loss": 0.0107, "grad_norm": 0.14835034310817719, "learning_rate": 1.5760567517266066e-06, "epoch": 0.9470729751403368, "step": 1181 }, { "loss": 0.0378, "grad_norm": 0.31077146530151367, "learning_rate": 1.5294807786996213e-06, "epoch": 0.9478748997594226, "step": 1182 }, { "loss": 0.0459, "grad_norm": 0.33775973320007324, "learning_rate": 1.4835980755232626e-06, "epoch": 0.9486768243785084, "step": 1183 }, { "loss": 0.0722, "grad_norm": 0.3435156047344208, "learning_rate": 1.4384089652291543e-06, "epoch": 0.9494787489975942, "step": 1184 }, { "loss": 0.0259, "grad_norm": 0.29428747296333313, "learning_rate": 1.3939137659658153e-06, "epoch": 0.95028067361668, "step": 1185 }, { "loss": 0.0189, "grad_norm": 0.21608836948871613, "learning_rate": 1.3501127909963274e-06, "epoch": 0.9510825982357658, "step": 1186 }, { "loss": 0.0352, "grad_norm": 0.35777002573013306, "learning_rate": 1.3070063486961936e-06, "epoch": 0.9518845228548516, "step": 1187 }, { "loss": 0.0284, "grad_norm": 0.2871112525463104, "learning_rate": 1.2645947425511395e-06, "epoch": 0.9526864474739375, "step": 1188 }, { "loss": 0.0225, "grad_norm": 0.22989165782928467, "learning_rate": 1.2228782711549924e-06, "epoch": 0.9534883720930233, "step": 1189 }, { "loss": 0.027, "grad_norm": 0.3213796317577362, "learning_rate": 1.181857228207539e-06, "epoch": 0.9542902967121091, "step": 1190 }, { "loss": 0.0232, "grad_norm": 0.27565819025039673, "learning_rate": 1.1415319025124938e-06, "epoch": 0.9550922213311949, "step": 1191 }, { "loss": 0.0385, "grad_norm": 0.5389794111251831, "learning_rate": 1.1019025779754666e-06, "epoch": 0.9558941459502807, "step": 1192 }, { "loss": 0.0419, "grad_norm": 0.27501630783081055, "learning_rate": 1.0629695336019763e-06, "epoch": 0.9566960705693665, "step": 1193 }, { "loss": 0.0423, "grad_norm": 0.2711651027202606, "learning_rate": 1.0247330434954071e-06, "epoch": 0.9574979951884522, "step": 1194 }, { "loss": 0.0222, "grad_norm": 0.20319171249866486, "learning_rate": 9.871933768551888e-07, "epoch": 0.958299919807538, "step": 1195 }, { "loss": 0.0718, "grad_norm": 0.3775697648525238, "learning_rate": 9.503507979748305e-07, "epoch": 0.9591018444266239, "step": 1196 }, { "loss": 0.0313, "grad_norm": 0.27909061312675476, "learning_rate": 9.142055662400673e-07, "epoch": 0.9599037690457097, "step": 1197 }, { "loss": 0.0754, "grad_norm": 0.5719237923622131, "learning_rate": 8.787579361270614e-07, "epoch": 0.9607056936647955, "step": 1198 }, { "loss": 0.0706, "grad_norm": 0.6314537525177002, "learning_rate": 8.440081572005931e-07, "epoch": 0.9615076182838813, "step": 1199 }, { "loss": 0.038, "grad_norm": 0.45612236857414246, "learning_rate": 8.099564741123166e-07, "epoch": 0.9623095429029671, "step": 1200 }, { "loss": 0.0525, "grad_norm": 0.3892579674720764, "learning_rate": 7.766031265989849e-07, "epoch": 0.9631114675220529, "step": 1201 }, { "loss": 0.0146, "grad_norm": 0.20876550674438477, "learning_rate": 7.439483494808497e-07, "epoch": 0.9639133921411387, "step": 1202 }, { "loss": 0.0245, "grad_norm": 0.2424009144306183, "learning_rate": 7.11992372659942e-07, "epoch": 0.9647153167602245, "step": 1203 }, { "loss": 0.0171, "grad_norm": 0.20193764567375183, "learning_rate": 6.807354211184613e-07, "epoch": 0.9655172413793104, "step": 1204 }, { "loss": 0.0357, "grad_norm": 0.3591613471508026, "learning_rate": 6.501777149172328e-07, "epoch": 0.9663191659983962, "step": 1205 }, { "loss": 0.0481, "grad_norm": 0.40879741311073303, "learning_rate": 6.203194691940972e-07, "epoch": 0.967121090617482, "step": 1206 }, { "loss": 0.0316, "grad_norm": 0.2834092080593109, "learning_rate": 5.91160894162468e-07, "epoch": 0.9679230152365678, "step": 1207 }, { "loss": 0.0357, "grad_norm": 0.23049277067184448, "learning_rate": 5.627021951097545e-07, "epoch": 0.9687249398556536, "step": 1208 }, { "loss": 0.0289, "grad_norm": 0.2661738693714142, "learning_rate": 5.349435723960183e-07, "epoch": 0.9695268644747393, "step": 1209 }, { "loss": 0.0295, "grad_norm": 0.2500978708267212, "learning_rate": 5.078852214525198e-07, "epoch": 0.9703287890938251, "step": 1210 }, { "loss": 0.0253, "grad_norm": 0.20364578068256378, "learning_rate": 4.815273327803182e-07, "epoch": 0.9711307137129109, "step": 1211 }, { "loss": 0.0444, "grad_norm": 0.33421790599823, "learning_rate": 4.5587009194894004e-07, "epoch": 0.9719326383319968, "step": 1212 }, { "loss": 0.0181, "grad_norm": 0.2810249626636505, "learning_rate": 4.3091367959512407e-07, "epoch": 0.9727345629510826, "step": 1213 }, { "loss": 0.0252, "grad_norm": 0.24068693816661835, "learning_rate": 4.066582714214895e-07, "epoch": 0.9735364875701684, "step": 1214 }, { "loss": 0.0529, "grad_norm": 0.3940003514289856, "learning_rate": 3.831040381953144e-07, "epoch": 0.9743384121892542, "step": 1215 }, { "loss": 0.0359, "grad_norm": 0.7913379073143005, "learning_rate": 3.6025114574734785e-07, "epoch": 0.97514033680834, "step": 1216 }, { "loss": 0.1689, "grad_norm": 0.7005312442779541, "learning_rate": 3.380997549706444e-07, "epoch": 0.9759422614274258, "step": 1217 }, { "loss": 0.0493, "grad_norm": 0.38180509209632874, "learning_rate": 3.166500218193758e-07, "epoch": 0.9767441860465116, "step": 1218 }, { "loss": 0.0252, "grad_norm": 0.20567728579044342, "learning_rate": 2.9590209730784304e-07, "epoch": 0.9775461106655974, "step": 1219 }, { "loss": 0.0352, "grad_norm": 0.6058043241500854, "learning_rate": 2.758561275092886e-07, "epoch": 0.9783480352846833, "step": 1220 }, { "loss": 0.0272, "grad_norm": 0.32482075691223145, "learning_rate": 2.5651225355497464e-07, "epoch": 0.9791499599037691, "step": 1221 }, { "loss": 0.031, "grad_norm": 0.16501711308956146, "learning_rate": 2.378706116330953e-07, "epoch": 0.9799518845228549, "step": 1222 }, { "loss": 0.0579, "grad_norm": 0.3395942747592926, "learning_rate": 2.1993133298791046e-07, "epoch": 0.9807538091419407, "step": 1223 }, { "loss": 0.013, "grad_norm": 0.18358808755874634, "learning_rate": 2.0269454391874666e-07, "epoch": 0.9815557337610264, "step": 1224 }, { "loss": 0.0441, "grad_norm": 0.4380914866924286, "learning_rate": 1.861603657791422e-07, "epoch": 0.9823576583801122, "step": 1225 }, { "loss": 0.0216, "grad_norm": 0.19042056798934937, "learning_rate": 1.7032891497600345e-07, "epoch": 0.983159582999198, "step": 1226 }, { "loss": 0.0208, "grad_norm": 0.21681898832321167, "learning_rate": 1.5520030296873877e-07, "epoch": 0.9839615076182838, "step": 1227 }, { "loss": 0.0664, "grad_norm": 0.4365503489971161, "learning_rate": 1.4077463626852582e-07, "epoch": 0.9847634322373697, "step": 1228 }, { "loss": 0.0224, "grad_norm": 0.18103234469890594, "learning_rate": 1.270520164375344e-07, "epoch": 0.9855653568564555, "step": 1229 }, { "loss": 0.029, "grad_norm": 0.26580461859703064, "learning_rate": 1.1403254008822695e-07, "epoch": 0.9863672814755413, "step": 1230 }, { "loss": 0.0414, "grad_norm": 0.2631921172142029, "learning_rate": 1.0171629888265921e-07, "epoch": 0.9871692060946271, "step": 1231 }, { "loss": 0.0267, "grad_norm": 0.34322279691696167, "learning_rate": 9.010337953185843e-08, "epoch": 0.9879711307137129, "step": 1232 }, { "loss": 0.0245, "grad_norm": 0.24382710456848145, "learning_rate": 7.919386379515726e-08, "epoch": 0.9887730553327987, "step": 1233 }, { "loss": 0.0652, "grad_norm": 0.5472224354743958, "learning_rate": 6.89878284797163e-08, "epoch": 0.9895749799518845, "step": 1234 }, { "loss": 0.0412, "grad_norm": 0.2964751720428467, "learning_rate": 5.948534543988027e-08, "epoch": 0.9903769045709703, "step": 1235 }, { "loss": 0.0436, "grad_norm": 0.3399409353733063, "learning_rate": 5.068648157675604e-08, "epoch": 0.9911788291900562, "step": 1236 }, { "loss": 0.0481, "grad_norm": 0.346693754196167, "learning_rate": 4.259129883767976e-08, "epoch": 0.991980753809142, "step": 1237 }, { "loss": 0.098, "grad_norm": 0.5714817047119141, "learning_rate": 3.5199854215817176e-08, "epoch": 0.9927826784282278, "step": 1238 }, { "loss": 0.0611, "grad_norm": 0.36500847339630127, "learning_rate": 2.8512199749730628e-08, "epoch": 0.9935846030473136, "step": 1239 }, { "loss": 0.0238, "grad_norm": 0.21105419099330902, "learning_rate": 2.2528382523057113e-08, "epoch": 0.9943865276663993, "step": 1240 }, { "eval_loss": 0.038508880883455276, "eval_runtime": 31.7523, "eval_samples_per_second": 33.068, "eval_steps_per_second": 8.283, "epoch": 0.9943865276663993, "step": 1240 }, { "loss": 0.0441, "grad_norm": 0.3713122010231018, "learning_rate": 1.7248444664141884e-08, "epoch": 0.9951884522854851, "step": 1241 }, { "loss": 0.0319, "grad_norm": 0.1977764517068863, "learning_rate": 1.2672423345760909e-08, "epoch": 0.9959903769045709, "step": 1242 }, { "loss": 0.0478, "grad_norm": 0.42995980381965637, "learning_rate": 8.80035078482111e-09, "epoch": 0.9967923015236567, "step": 1243 }, { "loss": 0.0306, "grad_norm": 0.39976274967193604, "learning_rate": 5.6322542422049266e-09, "epoch": 0.9975942261427426, "step": 1244 }, { "loss": 0.0413, "grad_norm": 0.28021112084388733, "learning_rate": 3.1681560225038654e-09, "epoch": 0.9983961507618284, "step": 1245 }, { "loss": 0.0335, "grad_norm": 0.25943997502326965, "learning_rate": 1.4080734739074786e-09, "epoch": 0.9991980753809142, "step": 1246 }, { "loss": 0.0384, "grad_norm": 0.45012426376342773, "learning_rate": 3.52018988059033e-10, "epoch": 1.0, "step": 1247 }, { "train_runtime": 1545.9245, "train_samples_per_second": 12.901, "train_steps_per_second": 0.807, "total_flos": 1.433925787776e+16, "train_loss": 0.05218265543086118, "epoch": 1.0, "step": 1247 } ]