{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.546440521518403, "eval_steps": 92, "global_step": 1380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001846082842967578, "grad_norm": 1.943859338760376, "learning_rate": 0.0, "loss": 0.0781, "step": 1 }, { "epoch": 0.003692165685935156, "grad_norm": 2.365267753601074, "learning_rate": 1.226993865030675e-07, "loss": 0.0565, "step": 2 }, { "epoch": 0.005538248528902735, "grad_norm": 2.2332890033721924, "learning_rate": 2.45398773006135e-07, "loss": 0.0418, "step": 3 }, { "epoch": 0.007384331371870312, "grad_norm": 1.7750693559646606, "learning_rate": 3.680981595092025e-07, "loss": 0.0424, "step": 4 }, { "epoch": 0.009230414214837892, "grad_norm": 2.268247604370117, "learning_rate": 4.9079754601227e-07, "loss": 0.0743, "step": 5 }, { "epoch": 0.01107649705780547, "grad_norm": 2.219888210296631, "learning_rate": 6.134969325153375e-07, "loss": 0.0636, "step": 6 }, { "epoch": 0.012922579900773046, "grad_norm": 2.5264432430267334, "learning_rate": 7.36196319018405e-07, "loss": 0.0736, "step": 7 }, { "epoch": 0.014768662743740625, "grad_norm": 1.7726325988769531, "learning_rate": 8.588957055214725e-07, "loss": 0.0595, "step": 8 }, { "epoch": 0.016614745586708203, "grad_norm": 2.682034969329834, "learning_rate": 9.8159509202454e-07, "loss": 0.0871, "step": 9 }, { "epoch": 0.018460828429675783, "grad_norm": 0.9481790661811829, "learning_rate": 1.1042944785276075e-06, "loss": 0.0404, "step": 10 }, { "epoch": 0.02030691127264336, "grad_norm": 1.4664937257766724, "learning_rate": 1.226993865030675e-06, "loss": 0.0381, "step": 11 }, { "epoch": 0.02215299411561094, "grad_norm": 1.3055261373519897, "learning_rate": 1.3496932515337425e-06, "loss": 0.0468, "step": 12 }, { "epoch": 0.023999076958578516, "grad_norm": 0.9701064229011536, "learning_rate": 1.47239263803681e-06, "loss": 0.0306, "step": 13 }, { "epoch": 0.025845159801546093, "grad_norm": 0.937262773513794, "learning_rate": 1.5950920245398775e-06, "loss": 0.0396, "step": 14 }, { "epoch": 0.027691242644513673, "grad_norm": 1.1696547269821167, "learning_rate": 1.717791411042945e-06, "loss": 0.0501, "step": 15 }, { "epoch": 0.02953732548748125, "grad_norm": 0.7118489146232605, "learning_rate": 1.8404907975460124e-06, "loss": 0.0434, "step": 16 }, { "epoch": 0.03138340833044883, "grad_norm": 1.077755093574524, "learning_rate": 1.96319018404908e-06, "loss": 0.029, "step": 17 }, { "epoch": 0.033229491173416406, "grad_norm": 1.2112401723861694, "learning_rate": 2.085889570552147e-06, "loss": 0.019, "step": 18 }, { "epoch": 0.03507557401638398, "grad_norm": 0.7316231727600098, "learning_rate": 2.208588957055215e-06, "loss": 0.0331, "step": 19 }, { "epoch": 0.036921656859351566, "grad_norm": 0.5681014657020569, "learning_rate": 2.331288343558282e-06, "loss": 0.0272, "step": 20 }, { "epoch": 0.03876773970231914, "grad_norm": 0.49111905694007874, "learning_rate": 2.45398773006135e-06, "loss": 0.0509, "step": 21 }, { "epoch": 0.04061382254528672, "grad_norm": 0.3748130202293396, "learning_rate": 2.5766871165644175e-06, "loss": 0.0143, "step": 22 }, { "epoch": 0.042459905388254296, "grad_norm": 0.5386937260627747, "learning_rate": 2.699386503067485e-06, "loss": 0.0212, "step": 23 }, { "epoch": 0.04430598823122188, "grad_norm": 0.3053921163082123, "learning_rate": 2.822085889570552e-06, "loss": 0.0205, "step": 24 }, { "epoch": 0.046152071074189456, "grad_norm": 1.0585299730300903, "learning_rate": 2.94478527607362e-06, "loss": 0.0166, "step": 25 }, { "epoch": 0.04799815391715703, "grad_norm": 0.41786885261535645, "learning_rate": 3.0674846625766875e-06, "loss": 0.0199, "step": 26 }, { "epoch": 0.04984423676012461, "grad_norm": 0.3514385223388672, "learning_rate": 3.190184049079755e-06, "loss": 0.038, "step": 27 }, { "epoch": 0.051690319603092186, "grad_norm": 0.3386166989803314, "learning_rate": 3.312883435582822e-06, "loss": 0.0206, "step": 28 }, { "epoch": 0.05353640244605977, "grad_norm": 0.22080476582050323, "learning_rate": 3.43558282208589e-06, "loss": 0.0108, "step": 29 }, { "epoch": 0.055382485289027346, "grad_norm": 0.2033078372478485, "learning_rate": 3.5582822085889574e-06, "loss": 0.0133, "step": 30 }, { "epoch": 0.05722856813199492, "grad_norm": 0.27421537041664124, "learning_rate": 3.680981595092025e-06, "loss": 0.0195, "step": 31 }, { "epoch": 0.0590746509749625, "grad_norm": 0.20013940334320068, "learning_rate": 3.8036809815950928e-06, "loss": 0.0117, "step": 32 }, { "epoch": 0.06092073381793008, "grad_norm": 0.22301937639713287, "learning_rate": 3.92638036809816e-06, "loss": 0.0225, "step": 33 }, { "epoch": 0.06276681666089766, "grad_norm": 0.19992490112781525, "learning_rate": 4.049079754601227e-06, "loss": 0.0198, "step": 34 }, { "epoch": 0.06461289950386524, "grad_norm": 0.1544710397720337, "learning_rate": 4.171779141104294e-06, "loss": 0.0113, "step": 35 }, { "epoch": 0.06645898234683281, "grad_norm": 0.1416219025850296, "learning_rate": 4.294478527607362e-06, "loss": 0.0111, "step": 36 }, { "epoch": 0.0683050651898004, "grad_norm": 0.18002353608608246, "learning_rate": 4.41717791411043e-06, "loss": 0.0138, "step": 37 }, { "epoch": 0.07015114803276797, "grad_norm": 0.24586960673332214, "learning_rate": 4.539877300613497e-06, "loss": 0.0261, "step": 38 }, { "epoch": 0.07199723087573555, "grad_norm": 0.23644472658634186, "learning_rate": 4.662576687116564e-06, "loss": 0.0083, "step": 39 }, { "epoch": 0.07384331371870313, "grad_norm": 0.2607399821281433, "learning_rate": 4.785276073619632e-06, "loss": 0.0112, "step": 40 }, { "epoch": 0.0756893965616707, "grad_norm": 0.2863721251487732, "learning_rate": 4.9079754601227e-06, "loss": 0.061, "step": 41 }, { "epoch": 0.07753547940463829, "grad_norm": 0.2519051730632782, "learning_rate": 5.030674846625767e-06, "loss": 0.0168, "step": 42 }, { "epoch": 0.07938156224760586, "grad_norm": 0.1219317764043808, "learning_rate": 5.153374233128835e-06, "loss": 0.0201, "step": 43 }, { "epoch": 0.08122764509057344, "grad_norm": 0.20760662853717804, "learning_rate": 5.276073619631902e-06, "loss": 0.0079, "step": 44 }, { "epoch": 0.08307372793354102, "grad_norm": 0.1823451817035675, "learning_rate": 5.39877300613497e-06, "loss": 0.0139, "step": 45 }, { "epoch": 0.08491981077650859, "grad_norm": 0.2157115638256073, "learning_rate": 5.521472392638038e-06, "loss": 0.0122, "step": 46 }, { "epoch": 0.08676589361947618, "grad_norm": 0.2477337270975113, "learning_rate": 5.644171779141104e-06, "loss": 0.0506, "step": 47 }, { "epoch": 0.08861197646244376, "grad_norm": 0.1235961839556694, "learning_rate": 5.766871165644172e-06, "loss": 0.0066, "step": 48 }, { "epoch": 0.09045805930541133, "grad_norm": 0.19829532504081726, "learning_rate": 5.88957055214724e-06, "loss": 0.0348, "step": 49 }, { "epoch": 0.09230414214837891, "grad_norm": 0.22850783169269562, "learning_rate": 6.012269938650307e-06, "loss": 0.0162, "step": 50 }, { "epoch": 0.09415022499134648, "grad_norm": 0.1670844852924347, "learning_rate": 6.134969325153375e-06, "loss": 0.0152, "step": 51 }, { "epoch": 0.09599630783431407, "grad_norm": 0.27324262261390686, "learning_rate": 6.257668711656443e-06, "loss": 0.0143, "step": 52 }, { "epoch": 0.09784239067728165, "grad_norm": 0.20399723947048187, "learning_rate": 6.38036809815951e-06, "loss": 0.0121, "step": 53 }, { "epoch": 0.09968847352024922, "grad_norm": 0.1744169145822525, "learning_rate": 6.503067484662578e-06, "loss": 0.0079, "step": 54 }, { "epoch": 0.1015345563632168, "grad_norm": 0.15908847749233246, "learning_rate": 6.625766871165644e-06, "loss": 0.0093, "step": 55 }, { "epoch": 0.10338063920618437, "grad_norm": 0.11401887983083725, "learning_rate": 6.748466257668712e-06, "loss": 0.0064, "step": 56 }, { "epoch": 0.10522672204915196, "grad_norm": 0.16666960716247559, "learning_rate": 6.87116564417178e-06, "loss": 0.0095, "step": 57 }, { "epoch": 0.10707280489211954, "grad_norm": 0.1956368088722229, "learning_rate": 6.993865030674847e-06, "loss": 0.0272, "step": 58 }, { "epoch": 0.10891888773508711, "grad_norm": 0.2012910097837448, "learning_rate": 7.116564417177915e-06, "loss": 0.008, "step": 59 }, { "epoch": 0.11076497057805469, "grad_norm": 0.1742282509803772, "learning_rate": 7.239263803680983e-06, "loss": 0.0131, "step": 60 }, { "epoch": 0.11261105342102228, "grad_norm": 0.22643107175827026, "learning_rate": 7.36196319018405e-06, "loss": 0.0297, "step": 61 }, { "epoch": 0.11445713626398984, "grad_norm": 0.25800758600234985, "learning_rate": 7.484662576687118e-06, "loss": 0.0232, "step": 62 }, { "epoch": 0.11630321910695743, "grad_norm": 0.20819664001464844, "learning_rate": 7.6073619631901856e-06, "loss": 0.0148, "step": 63 }, { "epoch": 0.118149301949925, "grad_norm": 0.3256385028362274, "learning_rate": 7.730061349693252e-06, "loss": 0.0171, "step": 64 }, { "epoch": 0.11999538479289258, "grad_norm": 0.1211743876338005, "learning_rate": 7.85276073619632e-06, "loss": 0.0069, "step": 65 }, { "epoch": 0.12184146763586017, "grad_norm": 0.11064834147691727, "learning_rate": 7.975460122699386e-06, "loss": 0.0096, "step": 66 }, { "epoch": 0.12368755047882773, "grad_norm": 0.3300691843032837, "learning_rate": 8.098159509202455e-06, "loss": 0.0217, "step": 67 }, { "epoch": 0.12553363332179532, "grad_norm": 0.11716436594724655, "learning_rate": 8.220858895705522e-06, "loss": 0.007, "step": 68 }, { "epoch": 0.1273797161647629, "grad_norm": 0.19859378039836884, "learning_rate": 8.343558282208589e-06, "loss": 0.0105, "step": 69 }, { "epoch": 0.12922579900773049, "grad_norm": 0.24976450204849243, "learning_rate": 8.466257668711658e-06, "loss": 0.0371, "step": 70 }, { "epoch": 0.13107188185069804, "grad_norm": 0.26116761565208435, "learning_rate": 8.588957055214725e-06, "loss": 0.017, "step": 71 }, { "epoch": 0.13291796469366562, "grad_norm": 0.15941867232322693, "learning_rate": 8.711656441717792e-06, "loss": 0.0166, "step": 72 }, { "epoch": 0.1347640475366332, "grad_norm": 0.28257545828819275, "learning_rate": 8.83435582822086e-06, "loss": 0.0088, "step": 73 }, { "epoch": 0.1366101303796008, "grad_norm": 0.1715109497308731, "learning_rate": 8.957055214723927e-06, "loss": 0.0106, "step": 74 }, { "epoch": 0.13845621322256838, "grad_norm": 0.16345541179180145, "learning_rate": 9.079754601226994e-06, "loss": 0.0228, "step": 75 }, { "epoch": 0.14030229606553593, "grad_norm": 0.1364922970533371, "learning_rate": 9.202453987730062e-06, "loss": 0.0136, "step": 76 }, { "epoch": 0.14214837890850351, "grad_norm": 0.13153736293315887, "learning_rate": 9.325153374233129e-06, "loss": 0.0064, "step": 77 }, { "epoch": 0.1439944617514711, "grad_norm": 0.16387970745563507, "learning_rate": 9.447852760736197e-06, "loss": 0.0076, "step": 78 }, { "epoch": 0.14584054459443868, "grad_norm": 0.3637576997280121, "learning_rate": 9.570552147239264e-06, "loss": 0.0116, "step": 79 }, { "epoch": 0.14768662743740626, "grad_norm": 0.12230011820793152, "learning_rate": 9.693251533742331e-06, "loss": 0.0058, "step": 80 }, { "epoch": 0.14953271028037382, "grad_norm": 0.12101858854293823, "learning_rate": 9.8159509202454e-06, "loss": 0.0043, "step": 81 }, { "epoch": 0.1513787931233414, "grad_norm": 0.22512343525886536, "learning_rate": 9.938650306748467e-06, "loss": 0.0114, "step": 82 }, { "epoch": 0.153224875966309, "grad_norm": 0.10917024314403534, "learning_rate": 1.0061349693251534e-05, "loss": 0.0062, "step": 83 }, { "epoch": 0.15507095880927657, "grad_norm": 0.1253615766763687, "learning_rate": 1.0184049079754601e-05, "loss": 0.0053, "step": 84 }, { "epoch": 0.15691704165224415, "grad_norm": 0.12273586541414261, "learning_rate": 1.030674846625767e-05, "loss": 0.0052, "step": 85 }, { "epoch": 0.1587631244952117, "grad_norm": 0.161468505859375, "learning_rate": 1.0429447852760737e-05, "loss": 0.0138, "step": 86 }, { "epoch": 0.1606092073381793, "grad_norm": 0.13661441206932068, "learning_rate": 1.0552147239263804e-05, "loss": 0.0059, "step": 87 }, { "epoch": 0.16245529018114688, "grad_norm": 0.24264393746852875, "learning_rate": 1.0674846625766873e-05, "loss": 0.0117, "step": 88 }, { "epoch": 0.16430137302411446, "grad_norm": 0.19432157278060913, "learning_rate": 1.079754601226994e-05, "loss": 0.0152, "step": 89 }, { "epoch": 0.16614745586708204, "grad_norm": 0.13859547674655914, "learning_rate": 1.0920245398773005e-05, "loss": 0.0063, "step": 90 }, { "epoch": 0.1679935387100496, "grad_norm": 0.22563207149505615, "learning_rate": 1.1042944785276076e-05, "loss": 0.0109, "step": 91 }, { "epoch": 0.16983962155301718, "grad_norm": 0.14970119297504425, "learning_rate": 1.1165644171779141e-05, "loss": 0.0056, "step": 92 }, { "epoch": 0.16983962155301718, "eval_loss": 0.009778047911822796, "eval_runtime": 91.7649, "eval_samples_per_second": 9.949, "eval_steps_per_second": 4.98, "step": 92 }, { "epoch": 0.17168570439598477, "grad_norm": 0.19423732161521912, "learning_rate": 1.1288343558282208e-05, "loss": 0.0312, "step": 93 }, { "epoch": 0.17353178723895235, "grad_norm": 0.1405217945575714, "learning_rate": 1.1411042944785277e-05, "loss": 0.0059, "step": 94 }, { "epoch": 0.17537787008191993, "grad_norm": 0.16466659307479858, "learning_rate": 1.1533742331288344e-05, "loss": 0.0117, "step": 95 }, { "epoch": 0.17722395292488752, "grad_norm": 0.15254826843738556, "learning_rate": 1.1656441717791411e-05, "loss": 0.0116, "step": 96 }, { "epoch": 0.17907003576785507, "grad_norm": 0.2417498081922531, "learning_rate": 1.177914110429448e-05, "loss": 0.0055, "step": 97 }, { "epoch": 0.18091611861082266, "grad_norm": 0.15393143892288208, "learning_rate": 1.1901840490797547e-05, "loss": 0.0072, "step": 98 }, { "epoch": 0.18276220145379024, "grad_norm": 0.09935597330331802, "learning_rate": 1.2024539877300614e-05, "loss": 0.0041, "step": 99 }, { "epoch": 0.18460828429675782, "grad_norm": 0.2075463831424713, "learning_rate": 1.2147239263803683e-05, "loss": 0.0225, "step": 100 }, { "epoch": 0.1864543671397254, "grad_norm": 0.1333729475736618, "learning_rate": 1.226993865030675e-05, "loss": 0.0065, "step": 101 }, { "epoch": 0.18830044998269296, "grad_norm": 0.15017342567443848, "learning_rate": 1.2392638036809817e-05, "loss": 0.0068, "step": 102 }, { "epoch": 0.19014653282566055, "grad_norm": 0.10981283336877823, "learning_rate": 1.2515337423312886e-05, "loss": 0.0059, "step": 103 }, { "epoch": 0.19199261566862813, "grad_norm": 0.12825554609298706, "learning_rate": 1.2638036809815953e-05, "loss": 0.0131, "step": 104 }, { "epoch": 0.19383869851159571, "grad_norm": 0.14562994241714478, "learning_rate": 1.276073619631902e-05, "loss": 0.0207, "step": 105 }, { "epoch": 0.1956847813545633, "grad_norm": 0.1667003035545349, "learning_rate": 1.2883435582822085e-05, "loss": 0.0081, "step": 106 }, { "epoch": 0.19753086419753085, "grad_norm": 0.13632525503635406, "learning_rate": 1.3006134969325156e-05, "loss": 0.0086, "step": 107 }, { "epoch": 0.19937694704049844, "grad_norm": 0.11093810200691223, "learning_rate": 1.3128834355828221e-05, "loss": 0.0052, "step": 108 }, { "epoch": 0.20122302988346602, "grad_norm": 0.1157720610499382, "learning_rate": 1.3251533742331288e-05, "loss": 0.0056, "step": 109 }, { "epoch": 0.2030691127264336, "grad_norm": 0.11683616042137146, "learning_rate": 1.3374233128834357e-05, "loss": 0.011, "step": 110 }, { "epoch": 0.2049151955694012, "grad_norm": 0.16558395326137543, "learning_rate": 1.3496932515337424e-05, "loss": 0.0126, "step": 111 }, { "epoch": 0.20676127841236874, "grad_norm": 0.11443828046321869, "learning_rate": 1.3619631901840491e-05, "loss": 0.003, "step": 112 }, { "epoch": 0.20860736125533633, "grad_norm": 0.17744530737400055, "learning_rate": 1.374233128834356e-05, "loss": 0.0128, "step": 113 }, { "epoch": 0.2104534440983039, "grad_norm": 0.1902821660041809, "learning_rate": 1.3865030674846627e-05, "loss": 0.0117, "step": 114 }, { "epoch": 0.2122995269412715, "grad_norm": 0.1485978364944458, "learning_rate": 1.3987730061349694e-05, "loss": 0.0115, "step": 115 }, { "epoch": 0.21414560978423908, "grad_norm": 0.1651626080274582, "learning_rate": 1.4110429447852763e-05, "loss": 0.0073, "step": 116 }, { "epoch": 0.21599169262720663, "grad_norm": 0.1463606357574463, "learning_rate": 1.423312883435583e-05, "loss": 0.0078, "step": 117 }, { "epoch": 0.21783777547017422, "grad_norm": 0.16828493773937225, "learning_rate": 1.4355828220858897e-05, "loss": 0.0062, "step": 118 }, { "epoch": 0.2196838583131418, "grad_norm": 0.1243981420993805, "learning_rate": 1.4478527607361965e-05, "loss": 0.0078, "step": 119 }, { "epoch": 0.22152994115610938, "grad_norm": 0.25788137316703796, "learning_rate": 1.4601226993865032e-05, "loss": 0.0103, "step": 120 }, { "epoch": 0.22337602399907697, "grad_norm": 0.2088393270969391, "learning_rate": 1.47239263803681e-05, "loss": 0.0123, "step": 121 }, { "epoch": 0.22522210684204455, "grad_norm": 0.11959819495677948, "learning_rate": 1.4846625766871168e-05, "loss": 0.0067, "step": 122 }, { "epoch": 0.2270681896850121, "grad_norm": 11.672961235046387, "learning_rate": 1.4969325153374235e-05, "loss": 0.0367, "step": 123 }, { "epoch": 0.2289142725279797, "grad_norm": 0.11650065332651138, "learning_rate": 1.50920245398773e-05, "loss": 0.0039, "step": 124 }, { "epoch": 0.23076035537094727, "grad_norm": 0.14429336786270142, "learning_rate": 1.5214723926380371e-05, "loss": 0.0071, "step": 125 }, { "epoch": 0.23260643821391486, "grad_norm": 1.8753544092178345, "learning_rate": 1.5337423312883436e-05, "loss": 0.0162, "step": 126 }, { "epoch": 0.23445252105688244, "grad_norm": 0.12293099611997604, "learning_rate": 1.5460122699386504e-05, "loss": 0.0093, "step": 127 }, { "epoch": 0.23629860389985, "grad_norm": 0.1450912058353424, "learning_rate": 1.5582822085889574e-05, "loss": 0.0061, "step": 128 }, { "epoch": 0.23814468674281758, "grad_norm": 0.26840445399284363, "learning_rate": 1.570552147239264e-05, "loss": 0.0127, "step": 129 }, { "epoch": 0.23999076958578516, "grad_norm": 0.33744606375694275, "learning_rate": 1.5828220858895708e-05, "loss": 0.0196, "step": 130 }, { "epoch": 0.24183685242875275, "grad_norm": 0.18890263140201569, "learning_rate": 1.5950920245398772e-05, "loss": 0.0107, "step": 131 }, { "epoch": 0.24368293527172033, "grad_norm": 0.15780992805957794, "learning_rate": 1.6073619631901842e-05, "loss": 0.0081, "step": 132 }, { "epoch": 0.24552901811468789, "grad_norm": 0.13231074810028076, "learning_rate": 1.619631901840491e-05, "loss": 0.0068, "step": 133 }, { "epoch": 0.24737510095765547, "grad_norm": 0.13381816446781158, "learning_rate": 1.6319018404907976e-05, "loss": 0.0095, "step": 134 }, { "epoch": 0.24922118380062305, "grad_norm": 0.18281026184558868, "learning_rate": 1.6441717791411043e-05, "loss": 0.0104, "step": 135 }, { "epoch": 0.25106726664359064, "grad_norm": 0.5789101719856262, "learning_rate": 1.656441717791411e-05, "loss": 0.0074, "step": 136 }, { "epoch": 0.2529133494865582, "grad_norm": 0.1189756840467453, "learning_rate": 1.6687116564417178e-05, "loss": 0.0074, "step": 137 }, { "epoch": 0.2547594323295258, "grad_norm": 0.11586418002843857, "learning_rate": 1.6809815950920248e-05, "loss": 0.0042, "step": 138 }, { "epoch": 0.25660551517249336, "grad_norm": 0.23946896195411682, "learning_rate": 1.6932515337423315e-05, "loss": 0.0346, "step": 139 }, { "epoch": 0.25845159801546097, "grad_norm": 0.16509361565113068, "learning_rate": 1.7055214723926382e-05, "loss": 0.0068, "step": 140 }, { "epoch": 0.2602976808584285, "grad_norm": 0.29569926857948303, "learning_rate": 1.717791411042945e-05, "loss": 0.0438, "step": 141 }, { "epoch": 0.2621437637013961, "grad_norm": 0.24651439487934113, "learning_rate": 1.7300613496932516e-05, "loss": 0.0079, "step": 142 }, { "epoch": 0.2639898465443637, "grad_norm": 0.14485400915145874, "learning_rate": 1.7423312883435583e-05, "loss": 0.0075, "step": 143 }, { "epoch": 0.26583592938733125, "grad_norm": 0.12196393311023712, "learning_rate": 1.7546012269938654e-05, "loss": 0.0069, "step": 144 }, { "epoch": 0.26768201223029886, "grad_norm": 0.10613025724887848, "learning_rate": 1.766871165644172e-05, "loss": 0.0047, "step": 145 }, { "epoch": 0.2695280950732664, "grad_norm": 0.2299281656742096, "learning_rate": 1.7791411042944788e-05, "loss": 0.0152, "step": 146 }, { "epoch": 0.27137417791623397, "grad_norm": 0.16602513194084167, "learning_rate": 1.7914110429447855e-05, "loss": 0.0111, "step": 147 }, { "epoch": 0.2732202607592016, "grad_norm": 0.1011560782790184, "learning_rate": 1.8036809815950922e-05, "loss": 0.0036, "step": 148 }, { "epoch": 0.27506634360216914, "grad_norm": 0.12293390929698944, "learning_rate": 1.815950920245399e-05, "loss": 0.006, "step": 149 }, { "epoch": 0.27691242644513675, "grad_norm": 0.09763162583112717, "learning_rate": 1.828220858895706e-05, "loss": 0.0062, "step": 150 }, { "epoch": 0.2787585092881043, "grad_norm": 0.12489021569490433, "learning_rate": 1.8404907975460123e-05, "loss": 0.0057, "step": 151 }, { "epoch": 0.28060459213107186, "grad_norm": 0.18567755818367004, "learning_rate": 1.852760736196319e-05, "loss": 0.0197, "step": 152 }, { "epoch": 0.2824506749740395, "grad_norm": 0.3939504325389862, "learning_rate": 1.8650306748466257e-05, "loss": 0.0085, "step": 153 }, { "epoch": 0.28429675781700703, "grad_norm": 0.3876646161079407, "learning_rate": 1.8773006134969328e-05, "loss": 0.0312, "step": 154 }, { "epoch": 0.28614284065997464, "grad_norm": 0.32117509841918945, "learning_rate": 1.8895705521472395e-05, "loss": 0.0257, "step": 155 }, { "epoch": 0.2879889235029422, "grad_norm": 0.1188175231218338, "learning_rate": 1.9018404907975462e-05, "loss": 0.0058, "step": 156 }, { "epoch": 0.28983500634590975, "grad_norm": 0.20765401422977448, "learning_rate": 1.914110429447853e-05, "loss": 0.0379, "step": 157 }, { "epoch": 0.29168108918887736, "grad_norm": 0.16120277345180511, "learning_rate": 1.9263803680981596e-05, "loss": 0.0085, "step": 158 }, { "epoch": 0.2935271720318449, "grad_norm": 0.2422802895307541, "learning_rate": 1.9386503067484663e-05, "loss": 0.0183, "step": 159 }, { "epoch": 0.29537325487481253, "grad_norm": 0.1309472918510437, "learning_rate": 1.9509202453987733e-05, "loss": 0.0092, "step": 160 }, { "epoch": 0.2972193377177801, "grad_norm": 0.18968728184700012, "learning_rate": 1.96319018404908e-05, "loss": 0.0076, "step": 161 }, { "epoch": 0.29906542056074764, "grad_norm": 0.5624260902404785, "learning_rate": 1.9754601226993868e-05, "loss": 0.0162, "step": 162 }, { "epoch": 0.30091150340371525, "grad_norm": 0.18030855059623718, "learning_rate": 1.9877300613496935e-05, "loss": 0.0109, "step": 163 }, { "epoch": 0.3027575862466828, "grad_norm": 0.20087158679962158, "learning_rate": 2e-05, "loss": 0.0082, "step": 164 }, { "epoch": 0.3046036690896504, "grad_norm": 0.13237029314041138, "learning_rate": 1.9999976944161012e-05, "loss": 0.0056, "step": 165 }, { "epoch": 0.306449751932618, "grad_norm": 0.2552473843097687, "learning_rate": 1.9999907776750355e-05, "loss": 0.0239, "step": 166 }, { "epoch": 0.30829583477558553, "grad_norm": 0.09501585364341736, "learning_rate": 1.9999792498086977e-05, "loss": 0.0074, "step": 167 }, { "epoch": 0.31014191761855314, "grad_norm": 0.2037140280008316, "learning_rate": 1.9999631108702447e-05, "loss": 0.0108, "step": 168 }, { "epoch": 0.3119880004615207, "grad_norm": 0.2303272783756256, "learning_rate": 1.9999423609340957e-05, "loss": 0.0156, "step": 169 }, { "epoch": 0.3138340833044883, "grad_norm": 0.3203318417072296, "learning_rate": 1.9999170000959317e-05, "loss": 0.009, "step": 170 }, { "epoch": 0.31568016614745587, "grad_norm": 0.15373510122299194, "learning_rate": 1.9998870284726968e-05, "loss": 0.0114, "step": 171 }, { "epoch": 0.3175262489904234, "grad_norm": 0.2735464870929718, "learning_rate": 1.9998524462025943e-05, "loss": 0.0071, "step": 172 }, { "epoch": 0.31937233183339103, "grad_norm": 0.26551553606987, "learning_rate": 1.9998132534450893e-05, "loss": 0.0193, "step": 173 }, { "epoch": 0.3212184146763586, "grad_norm": 0.13136360049247742, "learning_rate": 1.9997694503809058e-05, "loss": 0.0069, "step": 174 }, { "epoch": 0.3230644975193262, "grad_norm": 0.14918763935565948, "learning_rate": 1.9997210372120276e-05, "loss": 0.009, "step": 175 }, { "epoch": 0.32491058036229375, "grad_norm": 0.1694311797618866, "learning_rate": 1.9996680141616956e-05, "loss": 0.0091, "step": 176 }, { "epoch": 0.3267566632052613, "grad_norm": 0.18462465703487396, "learning_rate": 1.999610381474408e-05, "loss": 0.0114, "step": 177 }, { "epoch": 0.3286027460482289, "grad_norm": 0.12807750701904297, "learning_rate": 1.999548139415919e-05, "loss": 0.0074, "step": 178 }, { "epoch": 0.3304488288911965, "grad_norm": 0.21351908147335052, "learning_rate": 1.9994812882732364e-05, "loss": 0.0184, "step": 179 }, { "epoch": 0.3322949117341641, "grad_norm": 0.1473976969718933, "learning_rate": 1.9994098283546234e-05, "loss": 0.0123, "step": 180 }, { "epoch": 0.33414099457713164, "grad_norm": 0.12017160654067993, "learning_rate": 1.9993337599895925e-05, "loss": 0.0053, "step": 181 }, { "epoch": 0.3359870774200992, "grad_norm": 0.24068444967269897, "learning_rate": 1.999253083528908e-05, "loss": 0.0116, "step": 182 }, { "epoch": 0.3378331602630668, "grad_norm": 0.21027851104736328, "learning_rate": 1.9991677993445832e-05, "loss": 0.0032, "step": 183 }, { "epoch": 0.33967924310603437, "grad_norm": 0.12754715979099274, "learning_rate": 1.999077907829877e-05, "loss": 0.0115, "step": 184 }, { "epoch": 0.33967924310603437, "eval_loss": 0.01073263119906187, "eval_runtime": 260.2018, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.756, "step": 184 }, { "epoch": 0.341525325949002, "grad_norm": 0.1997697800397873, "learning_rate": 1.9989834093992945e-05, "loss": 0.0137, "step": 185 }, { "epoch": 0.34337140879196953, "grad_norm": 0.11237557232379913, "learning_rate": 1.998884304488584e-05, "loss": 0.004, "step": 186 }, { "epoch": 0.3452174916349371, "grad_norm": 0.3121223747730255, "learning_rate": 1.9987805935547347e-05, "loss": 0.0228, "step": 187 }, { "epoch": 0.3470635744779047, "grad_norm": 0.25514844059944153, "learning_rate": 1.998672277075975e-05, "loss": 0.0074, "step": 188 }, { "epoch": 0.34890965732087226, "grad_norm": 0.10668331384658813, "learning_rate": 1.998559355551771e-05, "loss": 0.0075, "step": 189 }, { "epoch": 0.35075574016383987, "grad_norm": 0.14544381201267242, "learning_rate": 1.9984418295028217e-05, "loss": 0.0082, "step": 190 }, { "epoch": 0.3526018230068074, "grad_norm": 0.22068175673484802, "learning_rate": 1.998319699471061e-05, "loss": 0.0221, "step": 191 }, { "epoch": 0.35444790584977504, "grad_norm": 0.1968519538640976, "learning_rate": 1.9981929660196492e-05, "loss": 0.0134, "step": 192 }, { "epoch": 0.3562939886927426, "grad_norm": 0.09916682541370392, "learning_rate": 1.9980616297329764e-05, "loss": 0.0062, "step": 193 }, { "epoch": 0.35814007153571015, "grad_norm": 0.1616455614566803, "learning_rate": 1.9979256912166565e-05, "loss": 0.0313, "step": 194 }, { "epoch": 0.35998615437867776, "grad_norm": 0.10900194197893143, "learning_rate": 1.9977851510975244e-05, "loss": 0.0063, "step": 195 }, { "epoch": 0.3618322372216453, "grad_norm": 0.16465668380260468, "learning_rate": 1.997640010023634e-05, "loss": 0.0187, "step": 196 }, { "epoch": 0.3636783200646129, "grad_norm": 0.09527245908975601, "learning_rate": 1.997490268664256e-05, "loss": 0.003, "step": 197 }, { "epoch": 0.3655244029075805, "grad_norm": 0.12054583430290222, "learning_rate": 1.997335927709872e-05, "loss": 0.0051, "step": 198 }, { "epoch": 0.36737048575054804, "grad_norm": 0.17793945968151093, "learning_rate": 1.9971769878721747e-05, "loss": 0.0214, "step": 199 }, { "epoch": 0.36921656859351565, "grad_norm": 0.43558791279792786, "learning_rate": 1.9970134498840617e-05, "loss": 0.0165, "step": 200 }, { "epoch": 0.3710626514364832, "grad_norm": 0.12647399306297302, "learning_rate": 1.9968453144996345e-05, "loss": 0.0081, "step": 201 }, { "epoch": 0.3729087342794508, "grad_norm": 0.12886571884155273, "learning_rate": 1.9966725824941933e-05, "loss": 0.0086, "step": 202 }, { "epoch": 0.37475481712241837, "grad_norm": 0.12100410461425781, "learning_rate": 1.996495254664235e-05, "loss": 0.005, "step": 203 }, { "epoch": 0.3766008999653859, "grad_norm": 0.15734834969043732, "learning_rate": 1.9963133318274475e-05, "loss": 0.0154, "step": 204 }, { "epoch": 0.37844698280835354, "grad_norm": 0.20007383823394775, "learning_rate": 1.9961268148227077e-05, "loss": 0.0099, "step": 205 }, { "epoch": 0.3802930656513211, "grad_norm": 0.13359728455543518, "learning_rate": 1.9959357045100764e-05, "loss": 0.0065, "step": 206 }, { "epoch": 0.3821391484942887, "grad_norm": 0.20117157697677612, "learning_rate": 1.995740001770796e-05, "loss": 0.0165, "step": 207 }, { "epoch": 0.38398523133725626, "grad_norm": 0.3189203441143036, "learning_rate": 1.995539707507284e-05, "loss": 0.0104, "step": 208 }, { "epoch": 0.3858313141802238, "grad_norm": 0.13761986792087555, "learning_rate": 1.995334822643131e-05, "loss": 0.0342, "step": 209 }, { "epoch": 0.38767739702319143, "grad_norm": 0.15731483697891235, "learning_rate": 1.9951253481230955e-05, "loss": 0.0125, "step": 210 }, { "epoch": 0.389523479866159, "grad_norm": 0.11964155733585358, "learning_rate": 1.9949112849131005e-05, "loss": 0.026, "step": 211 }, { "epoch": 0.3913695627091266, "grad_norm": 0.3471106290817261, "learning_rate": 1.9946926340002262e-05, "loss": 0.0171, "step": 212 }, { "epoch": 0.39321564555209415, "grad_norm": 0.08175747841596603, "learning_rate": 1.9944693963927092e-05, "loss": 0.0044, "step": 213 }, { "epoch": 0.3950617283950617, "grad_norm": 0.11287415027618408, "learning_rate": 1.9942415731199357e-05, "loss": 0.0062, "step": 214 }, { "epoch": 0.3969078112380293, "grad_norm": 0.2272588163614273, "learning_rate": 1.9940091652324363e-05, "loss": 0.019, "step": 215 }, { "epoch": 0.3987538940809969, "grad_norm": 0.19430284202098846, "learning_rate": 1.993772173801884e-05, "loss": 0.0076, "step": 216 }, { "epoch": 0.4005999769239645, "grad_norm": 0.34528452157974243, "learning_rate": 1.993530599921085e-05, "loss": 0.0296, "step": 217 }, { "epoch": 0.40244605976693204, "grad_norm": 0.25836917757987976, "learning_rate": 1.9932844447039775e-05, "loss": 0.0142, "step": 218 }, { "epoch": 0.4042921426098996, "grad_norm": 0.10283704102039337, "learning_rate": 1.9930337092856243e-05, "loss": 0.0044, "step": 219 }, { "epoch": 0.4061382254528672, "grad_norm": 0.2654596269130707, "learning_rate": 1.9927783948222084e-05, "loss": 0.008, "step": 220 }, { "epoch": 0.40798430829583476, "grad_norm": 0.17902354896068573, "learning_rate": 1.992518502491028e-05, "loss": 0.017, "step": 221 }, { "epoch": 0.4098303911388024, "grad_norm": 0.23217414319515228, "learning_rate": 1.9922540334904898e-05, "loss": 0.0115, "step": 222 }, { "epoch": 0.41167647398176993, "grad_norm": 0.1511717438697815, "learning_rate": 1.991984989040105e-05, "loss": 0.0108, "step": 223 }, { "epoch": 0.4135225568247375, "grad_norm": 0.24292099475860596, "learning_rate": 1.9917113703804828e-05, "loss": 0.0085, "step": 224 }, { "epoch": 0.4153686396677051, "grad_norm": 0.16115987300872803, "learning_rate": 1.9914331787733246e-05, "loss": 0.0053, "step": 225 }, { "epoch": 0.41721472251067265, "grad_norm": 0.11056244373321533, "learning_rate": 1.9911504155014187e-05, "loss": 0.0074, "step": 226 }, { "epoch": 0.41906080535364026, "grad_norm": 0.1903485357761383, "learning_rate": 1.990863081868634e-05, "loss": 0.0187, "step": 227 }, { "epoch": 0.4209068881966078, "grad_norm": 0.15223729610443115, "learning_rate": 1.9905711791999135e-05, "loss": 0.0054, "step": 228 }, { "epoch": 0.4227529710395754, "grad_norm": 0.2494320571422577, "learning_rate": 1.9902747088412703e-05, "loss": 0.0211, "step": 229 }, { "epoch": 0.424599053882543, "grad_norm": 0.1578684151172638, "learning_rate": 1.9899736721597787e-05, "loss": 0.018, "step": 230 }, { "epoch": 0.42644513672551054, "grad_norm": 0.10511214286088943, "learning_rate": 1.989668070543569e-05, "loss": 0.0074, "step": 231 }, { "epoch": 0.42829121956847815, "grad_norm": 0.15655289590358734, "learning_rate": 1.9893579054018216e-05, "loss": 0.0234, "step": 232 }, { "epoch": 0.4301373024114457, "grad_norm": 0.27272099256515503, "learning_rate": 1.98904317816476e-05, "loss": 0.0177, "step": 233 }, { "epoch": 0.43198338525441327, "grad_norm": 0.09095671772956848, "learning_rate": 1.988723890283645e-05, "loss": 0.0034, "step": 234 }, { "epoch": 0.4338294680973809, "grad_norm": 0.14508356153964996, "learning_rate": 1.9884000432307657e-05, "loss": 0.0086, "step": 235 }, { "epoch": 0.43567555094034843, "grad_norm": 0.21108239889144897, "learning_rate": 1.9880716384994355e-05, "loss": 0.0131, "step": 236 }, { "epoch": 0.43752163378331604, "grad_norm": 0.14441920816898346, "learning_rate": 1.987738677603984e-05, "loss": 0.0082, "step": 237 }, { "epoch": 0.4393677166262836, "grad_norm": 0.10114026069641113, "learning_rate": 1.9874011620797494e-05, "loss": 0.0091, "step": 238 }, { "epoch": 0.44121379946925116, "grad_norm": 0.11774880439043045, "learning_rate": 1.9870590934830726e-05, "loss": 0.0037, "step": 239 }, { "epoch": 0.44305988231221877, "grad_norm": 0.2125924825668335, "learning_rate": 1.986712473391289e-05, "loss": 0.01, "step": 240 }, { "epoch": 0.4449059651551863, "grad_norm": 0.13634435832500458, "learning_rate": 1.9863613034027224e-05, "loss": 0.0077, "step": 241 }, { "epoch": 0.44675204799815393, "grad_norm": 0.09352454543113708, "learning_rate": 1.9860055851366768e-05, "loss": 0.0039, "step": 242 }, { "epoch": 0.4485981308411215, "grad_norm": 0.15363694727420807, "learning_rate": 1.9856453202334277e-05, "loss": 0.0092, "step": 243 }, { "epoch": 0.4504442136840891, "grad_norm": 0.16866520047187805, "learning_rate": 1.985280510354218e-05, "loss": 0.0164, "step": 244 }, { "epoch": 0.45229029652705666, "grad_norm": 0.16478340327739716, "learning_rate": 1.984911157181247e-05, "loss": 0.0094, "step": 245 }, { "epoch": 0.4541363793700242, "grad_norm": 0.12157045304775238, "learning_rate": 1.9845372624176646e-05, "loss": 0.0101, "step": 246 }, { "epoch": 0.4559824622129918, "grad_norm": 0.1132885068655014, "learning_rate": 1.9841588277875613e-05, "loss": 0.0062, "step": 247 }, { "epoch": 0.4578285450559594, "grad_norm": 0.10305500775575638, "learning_rate": 1.9837758550359637e-05, "loss": 0.0098, "step": 248 }, { "epoch": 0.459674627898927, "grad_norm": 0.11010725051164627, "learning_rate": 1.9833883459288223e-05, "loss": 0.0079, "step": 249 }, { "epoch": 0.46152071074189455, "grad_norm": 0.138060063123703, "learning_rate": 1.9829963022530077e-05, "loss": 0.0206, "step": 250 }, { "epoch": 0.4633667935848621, "grad_norm": 0.08483293652534485, "learning_rate": 1.982599725816299e-05, "loss": 0.0049, "step": 251 }, { "epoch": 0.4652128764278297, "grad_norm": 0.1332448273897171, "learning_rate": 1.9821986184473757e-05, "loss": 0.0085, "step": 252 }, { "epoch": 0.46705895927079727, "grad_norm": 0.10986470431089401, "learning_rate": 1.981792981995812e-05, "loss": 0.0085, "step": 253 }, { "epoch": 0.4689050421137649, "grad_norm": 0.09792988747358322, "learning_rate": 1.9813828183320654e-05, "loss": 0.0111, "step": 254 }, { "epoch": 0.47075112495673244, "grad_norm": 0.10201071947813034, "learning_rate": 1.9809681293474693e-05, "loss": 0.0062, "step": 255 }, { "epoch": 0.4725972077997, "grad_norm": 0.17517031729221344, "learning_rate": 1.9805489169542245e-05, "loss": 0.0295, "step": 256 }, { "epoch": 0.4744432906426676, "grad_norm": 0.07551269978284836, "learning_rate": 1.9801251830853895e-05, "loss": 0.0032, "step": 257 }, { "epoch": 0.47628937348563516, "grad_norm": 0.12221430987119675, "learning_rate": 1.9796969296948723e-05, "loss": 0.0056, "step": 258 }, { "epoch": 0.47813545632860277, "grad_norm": 0.2373015284538269, "learning_rate": 1.9792641587574212e-05, "loss": 0.0184, "step": 259 }, { "epoch": 0.4799815391715703, "grad_norm": 0.1124010682106018, "learning_rate": 1.9788268722686153e-05, "loss": 0.012, "step": 260 }, { "epoch": 0.4818276220145379, "grad_norm": 0.10452152788639069, "learning_rate": 1.978385072244857e-05, "loss": 0.0183, "step": 261 }, { "epoch": 0.4836737048575055, "grad_norm": 0.13262808322906494, "learning_rate": 1.9779387607233587e-05, "loss": 0.008, "step": 262 }, { "epoch": 0.48551978770047305, "grad_norm": 0.10975097864866257, "learning_rate": 1.9774879397621387e-05, "loss": 0.0043, "step": 263 }, { "epoch": 0.48736587054344066, "grad_norm": 0.1568657010793686, "learning_rate": 1.977032611440008e-05, "loss": 0.0137, "step": 264 }, { "epoch": 0.4892119533864082, "grad_norm": 0.12431374937295914, "learning_rate": 1.976572777856562e-05, "loss": 0.0053, "step": 265 }, { "epoch": 0.49105803622937577, "grad_norm": 0.10776858776807785, "learning_rate": 1.9761084411321706e-05, "loss": 0.0058, "step": 266 }, { "epoch": 0.4929041190723434, "grad_norm": 0.1078772023320198, "learning_rate": 1.9756396034079678e-05, "loss": 0.0089, "step": 267 }, { "epoch": 0.49475020191531094, "grad_norm": 0.20356160402297974, "learning_rate": 1.9751662668458434e-05, "loss": 0.0394, "step": 268 }, { "epoch": 0.49659628475827855, "grad_norm": 0.11029177159070969, "learning_rate": 1.9746884336284316e-05, "loss": 0.0073, "step": 269 }, { "epoch": 0.4984423676012461, "grad_norm": 0.15297511219978333, "learning_rate": 1.974206105959102e-05, "loss": 0.0094, "step": 270 }, { "epoch": 0.5002884504442137, "grad_norm": 0.1340862512588501, "learning_rate": 1.9737192860619477e-05, "loss": 0.0115, "step": 271 }, { "epoch": 0.5021345332871813, "grad_norm": 0.12668649852275848, "learning_rate": 1.9732279761817774e-05, "loss": 0.0079, "step": 272 }, { "epoch": 0.5039806161301489, "grad_norm": 0.07369110733270645, "learning_rate": 1.9727321785841028e-05, "loss": 0.0023, "step": 273 }, { "epoch": 0.5058266989731164, "grad_norm": 0.11879657953977585, "learning_rate": 1.9722318955551307e-05, "loss": 0.0065, "step": 274 }, { "epoch": 0.507672781816084, "grad_norm": 0.11271411180496216, "learning_rate": 1.9717271294017495e-05, "loss": 0.0201, "step": 275 }, { "epoch": 0.5095188646590516, "grad_norm": 0.12968410551548004, "learning_rate": 1.971217882451521e-05, "loss": 0.015, "step": 276 }, { "epoch": 0.5095188646590516, "eval_loss": 0.009394334629178047, "eval_runtime": 91.3081, "eval_samples_per_second": 9.999, "eval_steps_per_second": 5.005, "step": 276 }, { "epoch": 0.5113649475020191, "grad_norm": 0.14326775074005127, "learning_rate": 1.970704157052668e-05, "loss": 0.0132, "step": 277 }, { "epoch": 0.5132110303449867, "grad_norm": 0.0933668315410614, "learning_rate": 1.9701859555740647e-05, "loss": 0.0054, "step": 278 }, { "epoch": 0.5150571131879543, "grad_norm": 0.12458498030900955, "learning_rate": 1.969663280405225e-05, "loss": 0.0115, "step": 279 }, { "epoch": 0.5169031960309219, "grad_norm": 0.16223905980587006, "learning_rate": 1.9691361339562917e-05, "loss": 0.029, "step": 280 }, { "epoch": 0.5187492788738894, "grad_norm": 0.15162405371665955, "learning_rate": 1.9686045186580258e-05, "loss": 0.0144, "step": 281 }, { "epoch": 0.520595361716857, "grad_norm": 0.21614527702331543, "learning_rate": 1.9680684369617947e-05, "loss": 0.0047, "step": 282 }, { "epoch": 0.5224414445598247, "grad_norm": 0.5095790028572083, "learning_rate": 1.9675278913395605e-05, "loss": 0.0223, "step": 283 }, { "epoch": 0.5242875274027922, "grad_norm": 0.07105882465839386, "learning_rate": 1.96698288428387e-05, "loss": 0.0029, "step": 284 }, { "epoch": 0.5261336102457598, "grad_norm": 0.16313835978507996, "learning_rate": 1.966433418307843e-05, "loss": 0.0098, "step": 285 }, { "epoch": 0.5279796930887274, "grad_norm": 0.15185286104679108, "learning_rate": 1.9658794959451583e-05, "loss": 0.026, "step": 286 }, { "epoch": 0.5298257759316949, "grad_norm": 0.14729946851730347, "learning_rate": 1.9653211197500447e-05, "loss": 0.0058, "step": 287 }, { "epoch": 0.5316718587746625, "grad_norm": 0.15539248287677765, "learning_rate": 1.9647582922972696e-05, "loss": 0.0161, "step": 288 }, { "epoch": 0.5335179416176301, "grad_norm": 0.12063451111316681, "learning_rate": 1.964191016182124e-05, "loss": 0.0066, "step": 289 }, { "epoch": 0.5353640244605977, "grad_norm": 0.10961133986711502, "learning_rate": 1.9636192940204134e-05, "loss": 0.0049, "step": 290 }, { "epoch": 0.5372101073035652, "grad_norm": 0.1027185246348381, "learning_rate": 1.9630431284484447e-05, "loss": 0.0075, "step": 291 }, { "epoch": 0.5390561901465328, "grad_norm": 0.0939934030175209, "learning_rate": 1.9624625221230146e-05, "loss": 0.0038, "step": 292 }, { "epoch": 0.5409022729895004, "grad_norm": 0.15825892984867096, "learning_rate": 1.9618774777213954e-05, "loss": 0.008, "step": 293 }, { "epoch": 0.5427483558324679, "grad_norm": 0.15646050870418549, "learning_rate": 1.9612879979413252e-05, "loss": 0.0165, "step": 294 }, { "epoch": 0.5445944386754356, "grad_norm": 0.11154604703187943, "learning_rate": 1.9606940855009944e-05, "loss": 0.0128, "step": 295 }, { "epoch": 0.5464405215184032, "grad_norm": 0.07165487110614777, "learning_rate": 1.960095743139033e-05, "loss": 0.0031, "step": 296 }, { "epoch": 0.5482866043613707, "grad_norm": 0.18987427651882172, "learning_rate": 1.9594929736144978e-05, "loss": 0.0186, "step": 297 }, { "epoch": 0.5501326872043383, "grad_norm": 0.1137097179889679, "learning_rate": 1.9588857797068602e-05, "loss": 0.0049, "step": 298 }, { "epoch": 0.5519787700473059, "grad_norm": 0.10796220600605011, "learning_rate": 1.9582741642159933e-05, "loss": 0.0046, "step": 299 }, { "epoch": 0.5538248528902735, "grad_norm": 0.07906807959079742, "learning_rate": 1.9576581299621587e-05, "loss": 0.005, "step": 300 }, { "epoch": 0.555670935733241, "grad_norm": 0.09325362741947174, "learning_rate": 1.957037679785994e-05, "loss": 0.0063, "step": 301 }, { "epoch": 0.5575170185762086, "grad_norm": 0.11685798317193985, "learning_rate": 1.9564128165484987e-05, "loss": 0.0071, "step": 302 }, { "epoch": 0.5593631014191762, "grad_norm": 0.14980854094028473, "learning_rate": 1.955783543131022e-05, "loss": 0.017, "step": 303 }, { "epoch": 0.5612091842621437, "grad_norm": 0.14807415008544922, "learning_rate": 1.9551498624352497e-05, "loss": 0.0155, "step": 304 }, { "epoch": 0.5630552671051113, "grad_norm": 0.13163696229457855, "learning_rate": 1.9545117773831893e-05, "loss": 0.0082, "step": 305 }, { "epoch": 0.564901349948079, "grad_norm": 0.21672053635120392, "learning_rate": 1.953869290917158e-05, "loss": 0.0076, "step": 306 }, { "epoch": 0.5667474327910464, "grad_norm": 0.1977837234735489, "learning_rate": 1.9532224059997693e-05, "loss": 0.0242, "step": 307 }, { "epoch": 0.5685935156340141, "grad_norm": 0.1022099182009697, "learning_rate": 1.952571125613918e-05, "loss": 0.005, "step": 308 }, { "epoch": 0.5704395984769817, "grad_norm": 0.20348013937473297, "learning_rate": 1.9519154527627667e-05, "loss": 0.0046, "step": 309 }, { "epoch": 0.5722856813199493, "grad_norm": 0.14997074007987976, "learning_rate": 1.9512553904697332e-05, "loss": 0.0182, "step": 310 }, { "epoch": 0.5741317641629168, "grad_norm": 0.0894525796175003, "learning_rate": 1.9505909417784758e-05, "loss": 0.0048, "step": 311 }, { "epoch": 0.5759778470058844, "grad_norm": 0.07030344009399414, "learning_rate": 1.9499221097528785e-05, "loss": 0.0037, "step": 312 }, { "epoch": 0.577823929848852, "grad_norm": 0.08951806277036667, "learning_rate": 1.949248897477038e-05, "loss": 0.0044, "step": 313 }, { "epoch": 0.5796700126918195, "grad_norm": 0.10578414797782898, "learning_rate": 1.9485713080552492e-05, "loss": 0.0047, "step": 314 }, { "epoch": 0.5815160955347871, "grad_norm": 0.05251196399331093, "learning_rate": 1.9478893446119905e-05, "loss": 0.0023, "step": 315 }, { "epoch": 0.5833621783777547, "grad_norm": 0.11290792375802994, "learning_rate": 1.9472030102919102e-05, "loss": 0.0176, "step": 316 }, { "epoch": 0.5852082612207222, "grad_norm": 0.12746615707874298, "learning_rate": 1.9465123082598107e-05, "loss": 0.0105, "step": 317 }, { "epoch": 0.5870543440636898, "grad_norm": 0.0989518091082573, "learning_rate": 1.9458172417006347e-05, "loss": 0.0047, "step": 318 }, { "epoch": 0.5889004269066574, "grad_norm": 0.1288129687309265, "learning_rate": 1.9451178138194514e-05, "loss": 0.0086, "step": 319 }, { "epoch": 0.5907465097496251, "grad_norm": 0.1171145960688591, "learning_rate": 1.9444140278414395e-05, "loss": 0.0066, "step": 320 }, { "epoch": 0.5925925925925926, "grad_norm": 0.1379767507314682, "learning_rate": 1.9437058870118745e-05, "loss": 0.0069, "step": 321 }, { "epoch": 0.5944386754355602, "grad_norm": 0.059574488550424576, "learning_rate": 1.9429933945961126e-05, "loss": 0.0058, "step": 322 }, { "epoch": 0.5962847582785278, "grad_norm": 0.08852468430995941, "learning_rate": 1.9422765538795758e-05, "loss": 0.0073, "step": 323 }, { "epoch": 0.5981308411214953, "grad_norm": 0.08040236681699753, "learning_rate": 1.941555368167737e-05, "loss": 0.0085, "step": 324 }, { "epoch": 0.5999769239644629, "grad_norm": 0.43157750368118286, "learning_rate": 1.9408298407861045e-05, "loss": 0.0172, "step": 325 }, { "epoch": 0.6018230068074305, "grad_norm": 0.06634240597486496, "learning_rate": 1.940099975080207e-05, "loss": 0.0057, "step": 326 }, { "epoch": 0.6036690896503981, "grad_norm": 0.13536123931407928, "learning_rate": 1.939365774415577e-05, "loss": 0.014, "step": 327 }, { "epoch": 0.6055151724933656, "grad_norm": 0.14865273237228394, "learning_rate": 1.938627242177738e-05, "loss": 0.0172, "step": 328 }, { "epoch": 0.6073612553363332, "grad_norm": 0.21095162630081177, "learning_rate": 1.9378843817721856e-05, "loss": 0.0107, "step": 329 }, { "epoch": 0.6092073381793008, "grad_norm": 0.1606818437576294, "learning_rate": 1.9371371966243734e-05, "loss": 0.0073, "step": 330 }, { "epoch": 0.6110534210222683, "grad_norm": 0.1389794498682022, "learning_rate": 1.9363856901796984e-05, "loss": 0.0085, "step": 331 }, { "epoch": 0.612899503865236, "grad_norm": 0.13808107376098633, "learning_rate": 1.935629865903482e-05, "loss": 0.0334, "step": 332 }, { "epoch": 0.6147455867082036, "grad_norm": 0.11393231898546219, "learning_rate": 1.9348697272809568e-05, "loss": 0.0054, "step": 333 }, { "epoch": 0.6165916695511711, "grad_norm": 0.14643581211566925, "learning_rate": 1.9341052778172505e-05, "loss": 0.0069, "step": 334 }, { "epoch": 0.6184377523941387, "grad_norm": 0.08248923718929291, "learning_rate": 1.9333365210373668e-05, "loss": 0.0082, "step": 335 }, { "epoch": 0.6202838352371063, "grad_norm": 0.16498494148254395, "learning_rate": 1.9325634604861728e-05, "loss": 0.0073, "step": 336 }, { "epoch": 0.6221299180800739, "grad_norm": 0.3315902054309845, "learning_rate": 1.9317860997283803e-05, "loss": 0.0156, "step": 337 }, { "epoch": 0.6239760009230414, "grad_norm": 0.13608397543430328, "learning_rate": 1.9310044423485303e-05, "loss": 0.006, "step": 338 }, { "epoch": 0.625822083766009, "grad_norm": 0.1520407497882843, "learning_rate": 1.9302184919509758e-05, "loss": 0.0086, "step": 339 }, { "epoch": 0.6276681666089766, "grad_norm": 0.09407825767993927, "learning_rate": 1.929428252159866e-05, "loss": 0.0059, "step": 340 }, { "epoch": 0.6295142494519441, "grad_norm": 0.08588697016239166, "learning_rate": 1.9286337266191295e-05, "loss": 0.0044, "step": 341 }, { "epoch": 0.6313603322949117, "grad_norm": 0.11330454796552658, "learning_rate": 1.9278349189924565e-05, "loss": 0.0067, "step": 342 }, { "epoch": 0.6332064151378793, "grad_norm": 0.1503840535879135, "learning_rate": 1.9270318329632833e-05, "loss": 0.0073, "step": 343 }, { "epoch": 0.6350524979808468, "grad_norm": 0.11648617684841156, "learning_rate": 1.9262244722347746e-05, "loss": 0.0064, "step": 344 }, { "epoch": 0.6368985808238145, "grad_norm": 0.10481298714876175, "learning_rate": 1.9254128405298054e-05, "loss": 0.0054, "step": 345 }, { "epoch": 0.6387446636667821, "grad_norm": 0.08493243157863617, "learning_rate": 1.9245969415909464e-05, "loss": 0.0186, "step": 346 }, { "epoch": 0.6405907465097497, "grad_norm": 0.12519818544387817, "learning_rate": 1.923776779180444e-05, "loss": 0.0157, "step": 347 }, { "epoch": 0.6424368293527172, "grad_norm": 0.25253045558929443, "learning_rate": 1.922952357080205e-05, "loss": 0.022, "step": 348 }, { "epoch": 0.6442829121956848, "grad_norm": 0.0882834643125534, "learning_rate": 1.9221236790917784e-05, "loss": 0.0091, "step": 349 }, { "epoch": 0.6461289950386524, "grad_norm": 0.10768232494592667, "learning_rate": 1.9212907490363365e-05, "loss": 0.0119, "step": 350 }, { "epoch": 0.6479750778816199, "grad_norm": 0.6699041724205017, "learning_rate": 1.9204535707546602e-05, "loss": 0.0176, "step": 351 }, { "epoch": 0.6498211607245875, "grad_norm": 0.16267555952072144, "learning_rate": 1.919612148107119e-05, "loss": 0.015, "step": 352 }, { "epoch": 0.6516672435675551, "grad_norm": 0.10609513521194458, "learning_rate": 1.9187664849736542e-05, "loss": 0.0062, "step": 353 }, { "epoch": 0.6535133264105226, "grad_norm": 0.16212430596351624, "learning_rate": 1.9179165852537596e-05, "loss": 0.0379, "step": 354 }, { "epoch": 0.6553594092534902, "grad_norm": 0.20922721922397614, "learning_rate": 1.9170624528664658e-05, "loss": 0.051, "step": 355 }, { "epoch": 0.6572054920964578, "grad_norm": 0.10107281059026718, "learning_rate": 1.916204091750321e-05, "loss": 0.0081, "step": 356 }, { "epoch": 0.6590515749394255, "grad_norm": 0.11072515696287155, "learning_rate": 1.9153415058633714e-05, "loss": 0.0066, "step": 357 }, { "epoch": 0.660897657782393, "grad_norm": 0.2692252993583679, "learning_rate": 1.9144746991831463e-05, "loss": 0.0248, "step": 358 }, { "epoch": 0.6627437406253606, "grad_norm": 0.15129469335079193, "learning_rate": 1.9136036757066362e-05, "loss": 0.0057, "step": 359 }, { "epoch": 0.6645898234683282, "grad_norm": 0.1859070360660553, "learning_rate": 1.9127284394502765e-05, "loss": 0.0055, "step": 360 }, { "epoch": 0.6664359063112957, "grad_norm": 0.14683200418949127, "learning_rate": 1.9118489944499287e-05, "loss": 0.0077, "step": 361 }, { "epoch": 0.6682819891542633, "grad_norm": 0.4112631678581238, "learning_rate": 1.9109653447608607e-05, "loss": 0.0186, "step": 362 }, { "epoch": 0.6701280719972309, "grad_norm": 0.11482436209917068, "learning_rate": 1.9100774944577303e-05, "loss": 0.0035, "step": 363 }, { "epoch": 0.6719741548401984, "grad_norm": 0.18928466737270355, "learning_rate": 1.9091854476345634e-05, "loss": 0.0113, "step": 364 }, { "epoch": 0.673820237683166, "grad_norm": 0.09932154417037964, "learning_rate": 1.9082892084047384e-05, "loss": 0.0066, "step": 365 }, { "epoch": 0.6756663205261336, "grad_norm": 0.08523999899625778, "learning_rate": 1.907388780900964e-05, "loss": 0.0096, "step": 366 }, { "epoch": 0.6775124033691012, "grad_norm": 0.3460504710674286, "learning_rate": 1.906484169275263e-05, "loss": 0.0067, "step": 367 }, { "epoch": 0.6793584862120687, "grad_norm": 0.16920697689056396, "learning_rate": 1.9055753776989516e-05, "loss": 0.0082, "step": 368 }, { "epoch": 0.6793584862120687, "eval_loss": 0.010424941778182983, "eval_runtime": 91.3619, "eval_samples_per_second": 9.993, "eval_steps_per_second": 5.002, "step": 368 }, { "epoch": 0.6812045690550363, "grad_norm": 0.4086015820503235, "learning_rate": 1.9046624103626194e-05, "loss": 0.0137, "step": 369 }, { "epoch": 0.683050651898004, "grad_norm": 0.126896932721138, "learning_rate": 1.903745271476113e-05, "loss": 0.003, "step": 370 }, { "epoch": 0.6848967347409715, "grad_norm": 0.7828288674354553, "learning_rate": 1.902823965268513e-05, "loss": 0.0134, "step": 371 }, { "epoch": 0.6867428175839391, "grad_norm": 0.1559770405292511, "learning_rate": 1.901898495988117e-05, "loss": 0.0064, "step": 372 }, { "epoch": 0.6885889004269067, "grad_norm": 0.07461792975664139, "learning_rate": 1.900968867902419e-05, "loss": 0.0028, "step": 373 }, { "epoch": 0.6904349832698742, "grad_norm": 0.13255064189434052, "learning_rate": 1.900035085298091e-05, "loss": 0.0098, "step": 374 }, { "epoch": 0.6922810661128418, "grad_norm": 0.11017435044050217, "learning_rate": 1.8990971524809602e-05, "loss": 0.0108, "step": 375 }, { "epoch": 0.6941271489558094, "grad_norm": 0.49764785170555115, "learning_rate": 1.8981550737759932e-05, "loss": 0.0131, "step": 376 }, { "epoch": 0.695973231798777, "grad_norm": 0.10128708928823471, "learning_rate": 1.8972088535272718e-05, "loss": 0.0044, "step": 377 }, { "epoch": 0.6978193146417445, "grad_norm": 0.1349850744009018, "learning_rate": 1.896258496097977e-05, "loss": 0.0099, "step": 378 }, { "epoch": 0.6996653974847121, "grad_norm": 0.14308737218379974, "learning_rate": 1.8953040058703668e-05, "loss": 0.0124, "step": 379 }, { "epoch": 0.7015114803276797, "grad_norm": 0.14602790772914886, "learning_rate": 1.894345387245755e-05, "loss": 0.0072, "step": 380 }, { "epoch": 0.7033575631706472, "grad_norm": 0.0991426333785057, "learning_rate": 1.8933826446444933e-05, "loss": 0.0044, "step": 381 }, { "epoch": 0.7052036460136148, "grad_norm": 0.10930776596069336, "learning_rate": 1.8924157825059496e-05, "loss": 0.0099, "step": 382 }, { "epoch": 0.7070497288565825, "grad_norm": 0.8534165620803833, "learning_rate": 1.891444805288487e-05, "loss": 0.0103, "step": 383 }, { "epoch": 0.7088958116995501, "grad_norm": 0.31674516201019287, "learning_rate": 1.8904697174694447e-05, "loss": 0.0108, "step": 384 }, { "epoch": 0.7107418945425176, "grad_norm": 1.4162273406982422, "learning_rate": 1.8894905235451163e-05, "loss": 0.0121, "step": 385 }, { "epoch": 0.7125879773854852, "grad_norm": 0.17984510958194733, "learning_rate": 1.888507228030729e-05, "loss": 0.0222, "step": 386 }, { "epoch": 0.7144340602284528, "grad_norm": 1.0672144889831543, "learning_rate": 1.887519835460423e-05, "loss": 0.0074, "step": 387 }, { "epoch": 0.7162801430714203, "grad_norm": 0.2030014991760254, "learning_rate": 1.8865283503872325e-05, "loss": 0.0161, "step": 388 }, { "epoch": 0.7181262259143879, "grad_norm": 0.07635032385587692, "learning_rate": 1.8855327773830604e-05, "loss": 0.0038, "step": 389 }, { "epoch": 0.7199723087573555, "grad_norm": 0.09135939180850983, "learning_rate": 1.8845331210386608e-05, "loss": 0.0058, "step": 390 }, { "epoch": 0.721818391600323, "grad_norm": 0.13557861745357513, "learning_rate": 1.8835293859636177e-05, "loss": 0.0034, "step": 391 }, { "epoch": 0.7236644744432906, "grad_norm": 0.20347817242145538, "learning_rate": 1.8825215767863215e-05, "loss": 0.0161, "step": 392 }, { "epoch": 0.7255105572862582, "grad_norm": 0.986000120639801, "learning_rate": 1.8815096981539494e-05, "loss": 0.0242, "step": 393 }, { "epoch": 0.7273566401292259, "grad_norm": 0.3647541105747223, "learning_rate": 1.8804937547324435e-05, "loss": 0.0057, "step": 394 }, { "epoch": 0.7292027229721934, "grad_norm": 0.09787865728139877, "learning_rate": 1.879473751206489e-05, "loss": 0.0039, "step": 395 }, { "epoch": 0.731048805815161, "grad_norm": 0.16595730185508728, "learning_rate": 1.8784496922794947e-05, "loss": 0.0044, "step": 396 }, { "epoch": 0.7328948886581286, "grad_norm": 0.17809519171714783, "learning_rate": 1.8774215826735664e-05, "loss": 0.0091, "step": 397 }, { "epoch": 0.7347409715010961, "grad_norm": 0.13774670660495758, "learning_rate": 1.8763894271294914e-05, "loss": 0.0073, "step": 398 }, { "epoch": 0.7365870543440637, "grad_norm": 0.2830973267555237, "learning_rate": 1.875353230406711e-05, "loss": 0.0103, "step": 399 }, { "epoch": 0.7384331371870313, "grad_norm": 0.15855129063129425, "learning_rate": 1.8743129972833033e-05, "loss": 0.0073, "step": 400 }, { "epoch": 0.7402792200299988, "grad_norm": 0.7160941362380981, "learning_rate": 1.873268732555957e-05, "loss": 0.0264, "step": 401 }, { "epoch": 0.7421253028729664, "grad_norm": 0.239408940076828, "learning_rate": 1.8722204410399524e-05, "loss": 0.0132, "step": 402 }, { "epoch": 0.743971385715934, "grad_norm": 0.1071736142039299, "learning_rate": 1.8711681275691366e-05, "loss": 0.0073, "step": 403 }, { "epoch": 0.7458174685589016, "grad_norm": 1.5305790901184082, "learning_rate": 1.870111796995905e-05, "loss": 0.0061, "step": 404 }, { "epoch": 0.7476635514018691, "grad_norm": 0.22047969698905945, "learning_rate": 1.8690514541911746e-05, "loss": 0.0071, "step": 405 }, { "epoch": 0.7495096342448367, "grad_norm": 0.15638285875320435, "learning_rate": 1.8679871040443632e-05, "loss": 0.0078, "step": 406 }, { "epoch": 0.7513557170878044, "grad_norm": 0.23750250041484833, "learning_rate": 1.866918751463369e-05, "loss": 0.0074, "step": 407 }, { "epoch": 0.7532017999307719, "grad_norm": 0.18000538647174835, "learning_rate": 1.8658464013745443e-05, "loss": 0.0194, "step": 408 }, { "epoch": 0.7550478827737395, "grad_norm": 0.14077042043209076, "learning_rate": 1.864770058722676e-05, "loss": 0.0331, "step": 409 }, { "epoch": 0.7568939656167071, "grad_norm": 0.1602960228919983, "learning_rate": 1.86368972847096e-05, "loss": 0.0149, "step": 410 }, { "epoch": 0.7587400484596746, "grad_norm": 0.1850017011165619, "learning_rate": 1.8626054156009807e-05, "loss": 0.0073, "step": 411 }, { "epoch": 0.7605861313026422, "grad_norm": 0.37113574147224426, "learning_rate": 1.8615171251126866e-05, "loss": 0.0063, "step": 412 }, { "epoch": 0.7624322141456098, "grad_norm": 0.20095576345920563, "learning_rate": 1.8604248620243682e-05, "loss": 0.0075, "step": 413 }, { "epoch": 0.7642782969885774, "grad_norm": 0.16010351479053497, "learning_rate": 1.8593286313726332e-05, "loss": 0.0065, "step": 414 }, { "epoch": 0.7661243798315449, "grad_norm": 0.23952090740203857, "learning_rate": 1.8582284382123853e-05, "loss": 0.0678, "step": 415 }, { "epoch": 0.7679704626745125, "grad_norm": 0.6012998819351196, "learning_rate": 1.8571242876167995e-05, "loss": 0.0151, "step": 416 }, { "epoch": 0.7698165455174801, "grad_norm": 0.22734975814819336, "learning_rate": 1.8560161846773002e-05, "loss": 0.0132, "step": 417 }, { "epoch": 0.7716626283604476, "grad_norm": 0.3496147096157074, "learning_rate": 1.8549041345035354e-05, "loss": 0.0198, "step": 418 }, { "epoch": 0.7735087112034152, "grad_norm": 0.7178294658660889, "learning_rate": 1.8537881422233553e-05, "loss": 0.0232, "step": 419 }, { "epoch": 0.7753547940463829, "grad_norm": 0.8831137418746948, "learning_rate": 1.8526682129827875e-05, "loss": 0.0084, "step": 420 }, { "epoch": 0.7772008768893504, "grad_norm": 0.13973954319953918, "learning_rate": 1.851544351946014e-05, "loss": 0.0126, "step": 421 }, { "epoch": 0.779046959732318, "grad_norm": 0.21449421346187592, "learning_rate": 1.8504165642953456e-05, "loss": 0.0055, "step": 422 }, { "epoch": 0.7808930425752856, "grad_norm": 0.17940129339694977, "learning_rate": 1.8492848552312016e-05, "loss": 0.0091, "step": 423 }, { "epoch": 0.7827391254182532, "grad_norm": 0.142044335603714, "learning_rate": 1.8481492299720817e-05, "loss": 0.0143, "step": 424 }, { "epoch": 0.7845852082612207, "grad_norm": 0.18737457692623138, "learning_rate": 1.8470096937545445e-05, "loss": 0.0178, "step": 425 }, { "epoch": 0.7864312911041883, "grad_norm": 0.17866647243499756, "learning_rate": 1.845866251833183e-05, "loss": 0.016, "step": 426 }, { "epoch": 0.7882773739471559, "grad_norm": 0.13905107975006104, "learning_rate": 1.8447189094805997e-05, "loss": 0.0192, "step": 427 }, { "epoch": 0.7901234567901234, "grad_norm": 0.23687602579593658, "learning_rate": 1.8435676719873828e-05, "loss": 0.0059, "step": 428 }, { "epoch": 0.791969539633091, "grad_norm": 0.35541704297065735, "learning_rate": 1.8424125446620812e-05, "loss": 0.0176, "step": 429 }, { "epoch": 0.7938156224760586, "grad_norm": 0.22285489737987518, "learning_rate": 1.8412535328311813e-05, "loss": 0.0059, "step": 430 }, { "epoch": 0.7956617053190262, "grad_norm": 0.6638950109481812, "learning_rate": 1.8400906418390808e-05, "loss": 0.0112, "step": 431 }, { "epoch": 0.7975077881619937, "grad_norm": 0.1354030817747116, "learning_rate": 1.8389238770480655e-05, "loss": 0.0108, "step": 432 }, { "epoch": 0.7993538710049614, "grad_norm": 0.2083008885383606, "learning_rate": 1.837753243838283e-05, "loss": 0.0104, "step": 433 }, { "epoch": 0.801199953847929, "grad_norm": 0.17043934762477875, "learning_rate": 1.83657874760772e-05, "loss": 0.0231, "step": 434 }, { "epoch": 0.8030460366908965, "grad_norm": 0.27301961183547974, "learning_rate": 1.8354003937721755e-05, "loss": 0.0065, "step": 435 }, { "epoch": 0.8048921195338641, "grad_norm": 0.15334510803222656, "learning_rate": 1.834218187765237e-05, "loss": 0.0194, "step": 436 }, { "epoch": 0.8067382023768317, "grad_norm": 0.13855521380901337, "learning_rate": 1.8330321350382545e-05, "loss": 0.0041, "step": 437 }, { "epoch": 0.8085842852197992, "grad_norm": 0.1874759942293167, "learning_rate": 1.8318422410603162e-05, "loss": 0.0073, "step": 438 }, { "epoch": 0.8104303680627668, "grad_norm": 0.13221128284931183, "learning_rate": 1.830648511318223e-05, "loss": 0.0116, "step": 439 }, { "epoch": 0.8122764509057344, "grad_norm": 0.14108803868293762, "learning_rate": 1.8294509513164632e-05, "loss": 0.0057, "step": 440 }, { "epoch": 0.814122533748702, "grad_norm": 0.18224814534187317, "learning_rate": 1.8282495665771864e-05, "loss": 0.0057, "step": 441 }, { "epoch": 0.8159686165916695, "grad_norm": 0.11267901211977005, "learning_rate": 1.8270443626401798e-05, "loss": 0.0051, "step": 442 }, { "epoch": 0.8178146994346371, "grad_norm": 0.11145651340484619, "learning_rate": 1.8258353450628402e-05, "loss": 0.0047, "step": 443 }, { "epoch": 0.8196607822776047, "grad_norm": 0.10886496305465698, "learning_rate": 1.8246225194201517e-05, "loss": 0.01, "step": 444 }, { "epoch": 0.8215068651205722, "grad_norm": 0.13111929595470428, "learning_rate": 1.823405891304656e-05, "loss": 0.0117, "step": 445 }, { "epoch": 0.8233529479635399, "grad_norm": 0.1536678671836853, "learning_rate": 1.8221854663264294e-05, "loss": 0.0093, "step": 446 }, { "epoch": 0.8251990308065075, "grad_norm": 0.13959679007530212, "learning_rate": 1.8209612501130566e-05, "loss": 0.007, "step": 447 }, { "epoch": 0.827045113649475, "grad_norm": 0.1471884697675705, "learning_rate": 1.819733248309604e-05, "loss": 0.0209, "step": 448 }, { "epoch": 0.8288911964924426, "grad_norm": 0.4662853479385376, "learning_rate": 1.8185014665785936e-05, "loss": 0.0088, "step": 449 }, { "epoch": 0.8307372793354102, "grad_norm": 0.16569823026657104, "learning_rate": 1.817265910599978e-05, "loss": 0.0097, "step": 450 }, { "epoch": 0.8325833621783778, "grad_norm": 0.1576852947473526, "learning_rate": 1.8160265860711134e-05, "loss": 0.0092, "step": 451 }, { "epoch": 0.8344294450213453, "grad_norm": 0.14117415249347687, "learning_rate": 1.8147834987067327e-05, "loss": 0.0043, "step": 452 }, { "epoch": 0.8362755278643129, "grad_norm": 0.18965789675712585, "learning_rate": 1.8135366542389202e-05, "loss": 0.0137, "step": 453 }, { "epoch": 0.8381216107072805, "grad_norm": 0.17018474638462067, "learning_rate": 1.8122860584170854e-05, "loss": 0.0111, "step": 454 }, { "epoch": 0.839967693550248, "grad_norm": 0.12502162158489227, "learning_rate": 1.8110317170079355e-05, "loss": 0.0073, "step": 455 }, { "epoch": 0.8418137763932156, "grad_norm": 0.19878755509853363, "learning_rate": 1.8097736357954487e-05, "loss": 0.0139, "step": 456 }, { "epoch": 0.8436598592361833, "grad_norm": 0.13912063837051392, "learning_rate": 1.808511820580849e-05, "loss": 0.0066, "step": 457 }, { "epoch": 0.8455059420791508, "grad_norm": 0.08366572856903076, "learning_rate": 1.807246277182578e-05, "loss": 0.0045, "step": 458 }, { "epoch": 0.8473520249221184, "grad_norm": 5.883597373962402, "learning_rate": 1.8059770114362686e-05, "loss": 0.0109, "step": 459 }, { "epoch": 0.849198107765086, "grad_norm": 0.12818843126296997, "learning_rate": 1.804704029194718e-05, "loss": 0.0094, "step": 460 }, { "epoch": 0.849198107765086, "eval_loss": 0.00954136997461319, "eval_runtime": 91.3222, "eval_samples_per_second": 9.998, "eval_steps_per_second": 5.004, "step": 460 }, { "epoch": 0.8510441906080536, "grad_norm": 0.09318236261606216, "learning_rate": 1.8034273363278615e-05, "loss": 0.0028, "step": 461 }, { "epoch": 0.8528902734510211, "grad_norm": 0.25973886251449585, "learning_rate": 1.8021469387227433e-05, "loss": 0.0191, "step": 462 }, { "epoch": 0.8547363562939887, "grad_norm": 0.1217271238565445, "learning_rate": 1.8008628422834923e-05, "loss": 0.0156, "step": 463 }, { "epoch": 0.8565824391369563, "grad_norm": 0.1459590196609497, "learning_rate": 1.7995750529312923e-05, "loss": 0.0171, "step": 464 }, { "epoch": 0.8584285219799238, "grad_norm": 0.09007342159748077, "learning_rate": 1.798283576604356e-05, "loss": 0.0047, "step": 465 }, { "epoch": 0.8602746048228914, "grad_norm": 0.13479134440422058, "learning_rate": 1.7969884192578977e-05, "loss": 0.0154, "step": 466 }, { "epoch": 0.862120687665859, "grad_norm": 0.1289559155702591, "learning_rate": 1.7956895868641053e-05, "loss": 0.0122, "step": 467 }, { "epoch": 0.8639667705088265, "grad_norm": 0.10503463447093964, "learning_rate": 1.7943870854121126e-05, "loss": 0.0067, "step": 468 }, { "epoch": 0.8658128533517941, "grad_norm": 0.1160660907626152, "learning_rate": 1.7930809209079728e-05, "loss": 0.0065, "step": 469 }, { "epoch": 0.8676589361947618, "grad_norm": 0.14017629623413086, "learning_rate": 1.791771099374629e-05, "loss": 0.0068, "step": 470 }, { "epoch": 0.8695050190377294, "grad_norm": 0.12286079674959183, "learning_rate": 1.7904576268518886e-05, "loss": 0.0084, "step": 471 }, { "epoch": 0.8713511018806969, "grad_norm": 0.1592610627412796, "learning_rate": 1.789140509396394e-05, "loss": 0.0099, "step": 472 }, { "epoch": 0.8731971847236645, "grad_norm": 0.09278839081525803, "learning_rate": 1.787819753081594e-05, "loss": 0.0031, "step": 473 }, { "epoch": 0.8750432675666321, "grad_norm": 0.11374642699956894, "learning_rate": 1.7864953639977177e-05, "loss": 0.0134, "step": 474 }, { "epoch": 0.8768893504095996, "grad_norm": 0.11769058555364609, "learning_rate": 1.7851673482517458e-05, "loss": 0.0087, "step": 475 }, { "epoch": 0.8787354332525672, "grad_norm": 0.1978936493396759, "learning_rate": 1.783835711967382e-05, "loss": 0.0171, "step": 476 }, { "epoch": 0.8805815160955348, "grad_norm": 0.17544765770435333, "learning_rate": 1.7825004612850242e-05, "loss": 0.0075, "step": 477 }, { "epoch": 0.8824275989385023, "grad_norm": 0.07806729525327682, "learning_rate": 1.781161602361737e-05, "loss": 0.0038, "step": 478 }, { "epoch": 0.8842736817814699, "grad_norm": 0.230962336063385, "learning_rate": 1.7798191413712244e-05, "loss": 0.037, "step": 479 }, { "epoch": 0.8861197646244375, "grad_norm": 0.18117716908454895, "learning_rate": 1.778473084503799e-05, "loss": 0.0114, "step": 480 }, { "epoch": 0.8879658474674051, "grad_norm": 0.08140026032924652, "learning_rate": 1.7771234379663545e-05, "loss": 0.0043, "step": 481 }, { "epoch": 0.8898119303103726, "grad_norm": 0.0821491926908493, "learning_rate": 1.775770207982338e-05, "loss": 0.0043, "step": 482 }, { "epoch": 0.8916580131533403, "grad_norm": 0.1920117437839508, "learning_rate": 1.7744134007917195e-05, "loss": 0.0074, "step": 483 }, { "epoch": 0.8935040959963079, "grad_norm": 0.09095922112464905, "learning_rate": 1.7730530226509652e-05, "loss": 0.0052, "step": 484 }, { "epoch": 0.8953501788392754, "grad_norm": 0.10795634239912033, "learning_rate": 1.7716890798330066e-05, "loss": 0.0061, "step": 485 }, { "epoch": 0.897196261682243, "grad_norm": 0.18433086574077606, "learning_rate": 1.770321578627213e-05, "loss": 0.0055, "step": 486 }, { "epoch": 0.8990423445252106, "grad_norm": 0.09844094514846802, "learning_rate": 1.768950525339362e-05, "loss": 0.0057, "step": 487 }, { "epoch": 0.9008884273681782, "grad_norm": 0.07778719812631607, "learning_rate": 1.7675759262916105e-05, "loss": 0.0062, "step": 488 }, { "epoch": 0.9027345102111457, "grad_norm": 0.10741523653268814, "learning_rate": 1.7661977878224653e-05, "loss": 0.0137, "step": 489 }, { "epoch": 0.9045805930541133, "grad_norm": 0.08387381583452225, "learning_rate": 1.7648161162867537e-05, "loss": 0.0035, "step": 490 }, { "epoch": 0.9064266758970809, "grad_norm": 0.1607184261083603, "learning_rate": 1.763430918055595e-05, "loss": 0.0107, "step": 491 }, { "epoch": 0.9082727587400484, "grad_norm": 0.21685011684894562, "learning_rate": 1.7620421995163718e-05, "loss": 0.0215, "step": 492 }, { "epoch": 0.910118841583016, "grad_norm": 0.07172352075576782, "learning_rate": 1.7606499670726972e-05, "loss": 0.004, "step": 493 }, { "epoch": 0.9119649244259836, "grad_norm": 0.06467904895544052, "learning_rate": 1.7592542271443888e-05, "loss": 0.0035, "step": 494 }, { "epoch": 0.9138110072689511, "grad_norm": 0.09848224371671677, "learning_rate": 1.7578549861674378e-05, "loss": 0.0046, "step": 495 }, { "epoch": 0.9156570901119188, "grad_norm": 0.19907814264297485, "learning_rate": 1.756452250593979e-05, "loss": 0.0265, "step": 496 }, { "epoch": 0.9175031729548864, "grad_norm": 0.19014231860637665, "learning_rate": 1.7550460268922615e-05, "loss": 0.0132, "step": 497 }, { "epoch": 0.919349255797854, "grad_norm": 0.13054688274860382, "learning_rate": 1.753636321546619e-05, "loss": 0.0067, "step": 498 }, { "epoch": 0.9211953386408215, "grad_norm": 0.13842567801475525, "learning_rate": 1.752223141057439e-05, "loss": 0.0279, "step": 499 }, { "epoch": 0.9230414214837891, "grad_norm": 0.08342625945806503, "learning_rate": 1.7508064919411344e-05, "loss": 0.0039, "step": 500 }, { "epoch": 0.9248875043267567, "grad_norm": 0.17037242650985718, "learning_rate": 1.7493863807301116e-05, "loss": 0.0142, "step": 501 }, { "epoch": 0.9267335871697242, "grad_norm": 0.09089969098567963, "learning_rate": 1.7479628139727417e-05, "loss": 0.0066, "step": 502 }, { "epoch": 0.9285796700126918, "grad_norm": 0.1631207913160324, "learning_rate": 1.7465357982333294e-05, "loss": 0.01, "step": 503 }, { "epoch": 0.9304257528556594, "grad_norm": 0.19571319222450256, "learning_rate": 1.745105340092085e-05, "loss": 0.0472, "step": 504 }, { "epoch": 0.9322718356986269, "grad_norm": 0.11733975261449814, "learning_rate": 1.74367144614509e-05, "loss": 0.0084, "step": 505 }, { "epoch": 0.9341179185415945, "grad_norm": 0.09824282675981522, "learning_rate": 1.74223412300427e-05, "loss": 0.0046, "step": 506 }, { "epoch": 0.9359640013845621, "grad_norm": 0.20196324586868286, "learning_rate": 1.7407933772973638e-05, "loss": 0.0066, "step": 507 }, { "epoch": 0.9378100842275298, "grad_norm": 0.15926776826381683, "learning_rate": 1.739349215667891e-05, "loss": 0.0146, "step": 508 }, { "epoch": 0.9396561670704973, "grad_norm": 0.09395015984773636, "learning_rate": 1.737901644775124e-05, "loss": 0.0052, "step": 509 }, { "epoch": 0.9415022499134649, "grad_norm": 0.10329575836658478, "learning_rate": 1.736450671294054e-05, "loss": 0.0214, "step": 510 }, { "epoch": 0.9433483327564325, "grad_norm": 0.06971684098243713, "learning_rate": 1.7349963019153638e-05, "loss": 0.0037, "step": 511 }, { "epoch": 0.9451944155994, "grad_norm": 0.08213736116886139, "learning_rate": 1.7335385433453948e-05, "loss": 0.0048, "step": 512 }, { "epoch": 0.9470404984423676, "grad_norm": 0.4655965268611908, "learning_rate": 1.732077402306116e-05, "loss": 0.0059, "step": 513 }, { "epoch": 0.9488865812853352, "grad_norm": 0.08121432363986969, "learning_rate": 1.730612885535094e-05, "loss": 0.0048, "step": 514 }, { "epoch": 0.9507326641283027, "grad_norm": 0.05632347986102104, "learning_rate": 1.729144999785462e-05, "loss": 0.0039, "step": 515 }, { "epoch": 0.9525787469712703, "grad_norm": 0.08921120315790176, "learning_rate": 1.7276737518258865e-05, "loss": 0.0048, "step": 516 }, { "epoch": 0.9544248298142379, "grad_norm": 0.41118451952934265, "learning_rate": 1.726199148440539e-05, "loss": 0.0116, "step": 517 }, { "epoch": 0.9562709126572055, "grad_norm": 0.14769668877124786, "learning_rate": 1.7247211964290635e-05, "loss": 0.005, "step": 518 }, { "epoch": 0.958116995500173, "grad_norm": 0.14160354435443878, "learning_rate": 1.7232399026065445e-05, "loss": 0.0309, "step": 519 }, { "epoch": 0.9599630783431407, "grad_norm": 0.20757490396499634, "learning_rate": 1.7217552738034763e-05, "loss": 0.0281, "step": 520 }, { "epoch": 0.9618091611861083, "grad_norm": 1.3972127437591553, "learning_rate": 1.7202673168657318e-05, "loss": 0.0721, "step": 521 }, { "epoch": 0.9636552440290758, "grad_norm": 0.14340125024318695, "learning_rate": 1.7187760386545297e-05, "loss": 0.0115, "step": 522 }, { "epoch": 0.9655013268720434, "grad_norm": 0.13766130805015564, "learning_rate": 1.717281446046404e-05, "loss": 0.0222, "step": 523 }, { "epoch": 0.967347409715011, "grad_norm": 0.14681874215602875, "learning_rate": 1.7157835459331726e-05, "loss": 0.0125, "step": 524 }, { "epoch": 0.9691934925579785, "grad_norm": 0.33441978693008423, "learning_rate": 1.7142823452219036e-05, "loss": 0.0348, "step": 525 }, { "epoch": 0.9710395754009461, "grad_norm": 0.15065835416316986, "learning_rate": 1.7127778508348858e-05, "loss": 0.0078, "step": 526 }, { "epoch": 0.9728856582439137, "grad_norm": 0.12596829235553741, "learning_rate": 1.7112700697095955e-05, "loss": 0.0095, "step": 527 }, { "epoch": 0.9747317410868813, "grad_norm": 0.20254209637641907, "learning_rate": 1.709759008798663e-05, "loss": 0.0082, "step": 528 }, { "epoch": 0.9765778239298488, "grad_norm": 0.12069059908390045, "learning_rate": 1.708244675069846e-05, "loss": 0.0057, "step": 529 }, { "epoch": 0.9784239067728164, "grad_norm": 0.09891531616449356, "learning_rate": 1.7067270755059897e-05, "loss": 0.0065, "step": 530 }, { "epoch": 0.980269989615784, "grad_norm": 0.11425697803497314, "learning_rate": 1.7052062171050008e-05, "loss": 0.0087, "step": 531 }, { "epoch": 0.9821160724587515, "grad_norm": 0.07714996486902237, "learning_rate": 1.7036821068798127e-05, "loss": 0.0052, "step": 532 }, { "epoch": 0.9839621553017192, "grad_norm": 0.08949927240610123, "learning_rate": 1.7021547518583536e-05, "loss": 0.0049, "step": 533 }, { "epoch": 0.9858082381446868, "grad_norm": 0.24043439328670502, "learning_rate": 1.7006241590835136e-05, "loss": 0.015, "step": 534 }, { "epoch": 0.9876543209876543, "grad_norm": 0.1122710183262825, "learning_rate": 1.6990903356131125e-05, "loss": 0.0052, "step": 535 }, { "epoch": 0.9895004038306219, "grad_norm": 0.13079215586185455, "learning_rate": 1.6975532885198678e-05, "loss": 0.0085, "step": 536 }, { "epoch": 0.9913464866735895, "grad_norm": 0.12605947256088257, "learning_rate": 1.696013024891362e-05, "loss": 0.0076, "step": 537 }, { "epoch": 0.9931925695165571, "grad_norm": 0.13878796994686127, "learning_rate": 1.6944695518300087e-05, "loss": 0.0127, "step": 538 }, { "epoch": 0.9950386523595246, "grad_norm": 0.1519862711429596, "learning_rate": 1.6929228764530214e-05, "loss": 0.0076, "step": 539 }, { "epoch": 0.9968847352024922, "grad_norm": 0.11886841803789139, "learning_rate": 1.69137300589238e-05, "loss": 0.0087, "step": 540 }, { "epoch": 0.9987308180454598, "grad_norm": 0.09037019312381744, "learning_rate": 1.6898199472947972e-05, "loss": 0.0108, "step": 541 }, { "epoch": 1.0, "grad_norm": 0.16914087533950806, "learning_rate": 1.6882637078216867e-05, "loss": 0.0111, "step": 542 }, { "epoch": 1.0018460828429676, "grad_norm": 0.0852038711309433, "learning_rate": 1.6867042946491306e-05, "loss": 0.0123, "step": 543 }, { "epoch": 1.0036921656859352, "grad_norm": 0.13573428988456726, "learning_rate": 1.6851417149678442e-05, "loss": 0.0238, "step": 544 }, { "epoch": 1.0055382485289028, "grad_norm": 0.143440842628479, "learning_rate": 1.6835759759831448e-05, "loss": 0.0081, "step": 545 }, { "epoch": 1.0073843313718702, "grad_norm": 0.10767962038516998, "learning_rate": 1.6820070849149174e-05, "loss": 0.0057, "step": 546 }, { "epoch": 1.0092304142148378, "grad_norm": 0.10234127938747406, "learning_rate": 1.680435048997582e-05, "loss": 0.0026, "step": 547 }, { "epoch": 1.0110764970578054, "grad_norm": 0.054232899099588394, "learning_rate": 1.6788598754800602e-05, "loss": 0.002, "step": 548 }, { "epoch": 1.012922579900773, "grad_norm": 0.10567190498113632, "learning_rate": 1.6772815716257414e-05, "loss": 0.0118, "step": 549 }, { "epoch": 1.0147686627437407, "grad_norm": 0.09728987514972687, "learning_rate": 1.6757001447124486e-05, "loss": 0.0133, "step": 550 }, { "epoch": 1.0166147455867083, "grad_norm": 0.07242578268051147, "learning_rate": 1.6741156020324086e-05, "loss": 0.0026, "step": 551 }, { "epoch": 1.018460828429676, "grad_norm": 0.06415614485740662, "learning_rate": 1.6725279508922114e-05, "loss": 0.0038, "step": 552 }, { "epoch": 1.018460828429676, "eval_loss": 0.00856359489262104, "eval_runtime": 91.3138, "eval_samples_per_second": 9.998, "eval_steps_per_second": 5.005, "step": 552 }, { "epoch": 1.0203069112726433, "grad_norm": 0.10766559839248657, "learning_rate": 1.6709371986127846e-05, "loss": 0.0084, "step": 553 }, { "epoch": 1.022152994115611, "grad_norm": 0.08855008333921432, "learning_rate": 1.6693433525293525e-05, "loss": 0.0024, "step": 554 }, { "epoch": 1.0239990769585785, "grad_norm": 0.09996992349624634, "learning_rate": 1.6677464199914076e-05, "loss": 0.004, "step": 555 }, { "epoch": 1.0258451598015461, "grad_norm": 0.1220189779996872, "learning_rate": 1.6661464083626734e-05, "loss": 0.0101, "step": 556 }, { "epoch": 1.0276912426445137, "grad_norm": 0.0515366792678833, "learning_rate": 1.6645433250210726e-05, "loss": 0.0018, "step": 557 }, { "epoch": 1.0295373254874813, "grad_norm": 0.09471988677978516, "learning_rate": 1.662937177358691e-05, "loss": 0.0031, "step": 558 }, { "epoch": 1.0313834083304487, "grad_norm": 0.08005911856889725, "learning_rate": 1.661327972781745e-05, "loss": 0.0106, "step": 559 }, { "epoch": 1.0332294911734163, "grad_norm": 0.7784668207168579, "learning_rate": 1.6597157187105475e-05, "loss": 0.012, "step": 560 }, { "epoch": 1.035075574016384, "grad_norm": 0.07025888562202454, "learning_rate": 1.6581004225794715e-05, "loss": 0.0038, "step": 561 }, { "epoch": 1.0369216568593516, "grad_norm": 0.06929878145456314, "learning_rate": 1.6564820918369194e-05, "loss": 0.0026, "step": 562 }, { "epoch": 1.0387677397023192, "grad_norm": 0.2543495297431946, "learning_rate": 1.6548607339452853e-05, "loss": 0.0099, "step": 563 }, { "epoch": 1.0406138225452868, "grad_norm": 0.15400093793869019, "learning_rate": 1.6532363563809226e-05, "loss": 0.0043, "step": 564 }, { "epoch": 1.0424599053882544, "grad_norm": 0.17001113295555115, "learning_rate": 1.651608966634109e-05, "loss": 0.0078, "step": 565 }, { "epoch": 1.0443059882312218, "grad_norm": 0.03794068843126297, "learning_rate": 1.649978572209012e-05, "loss": 0.0012, "step": 566 }, { "epoch": 1.0461520710741894, "grad_norm": 0.1631583869457245, "learning_rate": 1.648345180623653e-05, "loss": 0.0276, "step": 567 }, { "epoch": 1.047998153917157, "grad_norm": 0.3792674243450165, "learning_rate": 1.6467087994098753e-05, "loss": 0.0051, "step": 568 }, { "epoch": 1.0498442367601246, "grad_norm": 0.14960213005542755, "learning_rate": 1.6450694361133068e-05, "loss": 0.0114, "step": 569 }, { "epoch": 1.0516903196030922, "grad_norm": 0.05881274864077568, "learning_rate": 1.6434270982933272e-05, "loss": 0.0027, "step": 570 }, { "epoch": 1.0535364024460598, "grad_norm": 0.13894124329090118, "learning_rate": 1.6417817935230318e-05, "loss": 0.0032, "step": 571 }, { "epoch": 1.0553824852890274, "grad_norm": 4.140699863433838, "learning_rate": 1.6401335293891966e-05, "loss": 0.0341, "step": 572 }, { "epoch": 1.0572285681319948, "grad_norm": 0.32235807180404663, "learning_rate": 1.6384823134922444e-05, "loss": 0.0069, "step": 573 }, { "epoch": 1.0590746509749625, "grad_norm": 1.4171650409698486, "learning_rate": 1.6368281534462088e-05, "loss": 0.0134, "step": 574 }, { "epoch": 1.06092073381793, "grad_norm": 0.09332925081253052, "learning_rate": 1.635171056878699e-05, "loss": 0.0049, "step": 575 }, { "epoch": 1.0627668166608977, "grad_norm": 0.08895128965377808, "learning_rate": 1.6335110314308654e-05, "loss": 0.0049, "step": 576 }, { "epoch": 1.0646128995038653, "grad_norm": 0.15676097571849823, "learning_rate": 1.631848084757364e-05, "loss": 0.0064, "step": 577 }, { "epoch": 1.066458982346833, "grad_norm": 0.15743549168109894, "learning_rate": 1.6301822245263212e-05, "loss": 0.004, "step": 578 }, { "epoch": 1.0683050651898003, "grad_norm": 0.47084152698516846, "learning_rate": 1.6285134584192976e-05, "loss": 0.0033, "step": 579 }, { "epoch": 1.070151148032768, "grad_norm": 0.11432501673698425, "learning_rate": 1.626841794131254e-05, "loss": 0.0148, "step": 580 }, { "epoch": 1.0719972308757355, "grad_norm": 0.25219109654426575, "learning_rate": 1.6251672393705155e-05, "loss": 0.0033, "step": 581 }, { "epoch": 1.0738433137187031, "grad_norm": 0.11923202127218246, "learning_rate": 1.6234898018587336e-05, "loss": 0.0038, "step": 582 }, { "epoch": 1.0756893965616707, "grad_norm": 0.21197491884231567, "learning_rate": 1.6218094893308553e-05, "loss": 0.0071, "step": 583 }, { "epoch": 1.0775354794046383, "grad_norm": 0.15348570048809052, "learning_rate": 1.6201263095350833e-05, "loss": 0.0089, "step": 584 }, { "epoch": 1.079381562247606, "grad_norm": 0.15847565233707428, "learning_rate": 1.6184402702328426e-05, "loss": 0.004, "step": 585 }, { "epoch": 1.0812276450905733, "grad_norm": 0.2338961660861969, "learning_rate": 1.6167513791987423e-05, "loss": 0.0155, "step": 586 }, { "epoch": 1.083073727933541, "grad_norm": 0.10159553587436676, "learning_rate": 1.615059644220543e-05, "loss": 0.0039, "step": 587 }, { "epoch": 1.0849198107765086, "grad_norm": 0.23770081996917725, "learning_rate": 1.6133650730991183e-05, "loss": 0.0054, "step": 588 }, { "epoch": 1.0867658936194762, "grad_norm": 0.08259685337543488, "learning_rate": 1.6116676736484206e-05, "loss": 0.0025, "step": 589 }, { "epoch": 1.0886119764624438, "grad_norm": 0.11127261817455292, "learning_rate": 1.6099674536954426e-05, "loss": 0.0049, "step": 590 }, { "epoch": 1.0904580593054114, "grad_norm": 0.0952722355723381, "learning_rate": 1.6082644210801846e-05, "loss": 0.0022, "step": 591 }, { "epoch": 1.092304142148379, "grad_norm": 0.19041772186756134, "learning_rate": 1.6065585836556152e-05, "loss": 0.0102, "step": 592 }, { "epoch": 1.0941502249913464, "grad_norm": 0.14780960977077484, "learning_rate": 1.6048499492876378e-05, "loss": 0.0075, "step": 593 }, { "epoch": 1.095996307834314, "grad_norm": 0.11060582101345062, "learning_rate": 1.603138525855051e-05, "loss": 0.0056, "step": 594 }, { "epoch": 1.0978423906772816, "grad_norm": 0.10337001830339432, "learning_rate": 1.6014243212495167e-05, "loss": 0.0064, "step": 595 }, { "epoch": 1.0996884735202492, "grad_norm": 0.18508298695087433, "learning_rate": 1.5997073433755187e-05, "loss": 0.0036, "step": 596 }, { "epoch": 1.1015345563632168, "grad_norm": 0.17404358088970184, "learning_rate": 1.597987600150331e-05, "loss": 0.0091, "step": 597 }, { "epoch": 1.1033806392061845, "grad_norm": 0.13263043761253357, "learning_rate": 1.5962650995039783e-05, "loss": 0.0035, "step": 598 }, { "epoch": 1.105226722049152, "grad_norm": 0.20240873098373413, "learning_rate": 1.594539849379199e-05, "loss": 0.0257, "step": 599 }, { "epoch": 1.1070728048921195, "grad_norm": 0.1413458287715912, "learning_rate": 1.5928118577314123e-05, "loss": 0.0123, "step": 600 }, { "epoch": 1.108918887735087, "grad_norm": 0.11439207196235657, "learning_rate": 1.5910811325286768e-05, "loss": 0.0027, "step": 601 }, { "epoch": 1.1107649705780547, "grad_norm": 0.05116959661245346, "learning_rate": 1.5893476817516567e-05, "loss": 0.0015, "step": 602 }, { "epoch": 1.1126110534210223, "grad_norm": 0.10003647208213806, "learning_rate": 1.587611513393585e-05, "loss": 0.0045, "step": 603 }, { "epoch": 1.11445713626399, "grad_norm": 0.11144063621759415, "learning_rate": 1.5858726354602248e-05, "loss": 0.0063, "step": 604 }, { "epoch": 1.1163032191069575, "grad_norm": 0.15549401938915253, "learning_rate": 1.5841310559698346e-05, "loss": 0.0143, "step": 605 }, { "epoch": 1.118149301949925, "grad_norm": 0.13532917201519012, "learning_rate": 1.582386782953129e-05, "loss": 0.007, "step": 606 }, { "epoch": 1.1199953847928925, "grad_norm": 0.10869234800338745, "learning_rate": 1.580639824453244e-05, "loss": 0.0077, "step": 607 }, { "epoch": 1.1218414676358601, "grad_norm": 0.05746113136410713, "learning_rate": 1.5788901885256983e-05, "loss": 0.0021, "step": 608 }, { "epoch": 1.1236875504788277, "grad_norm": 0.07222943007946014, "learning_rate": 1.577137883238357e-05, "loss": 0.0028, "step": 609 }, { "epoch": 1.1255336333217953, "grad_norm": 0.07145451754331589, "learning_rate": 1.575382916671393e-05, "loss": 0.0056, "step": 610 }, { "epoch": 1.127379716164763, "grad_norm": 0.19978775084018707, "learning_rate": 1.5736252969172522e-05, "loss": 0.0064, "step": 611 }, { "epoch": 1.1292257990077306, "grad_norm": 0.158962681889534, "learning_rate": 1.5718650320806145e-05, "loss": 0.0194, "step": 612 }, { "epoch": 1.131071881850698, "grad_norm": 0.14478765428066254, "learning_rate": 1.5701021302783557e-05, "loss": 0.0214, "step": 613 }, { "epoch": 1.1329179646936656, "grad_norm": 0.1333470642566681, "learning_rate": 1.5683365996395123e-05, "loss": 0.0156, "step": 614 }, { "epoch": 1.1347640475366332, "grad_norm": 0.08896146714687347, "learning_rate": 1.5665684483052425e-05, "loss": 0.0138, "step": 615 }, { "epoch": 1.1366101303796008, "grad_norm": 0.48797956109046936, "learning_rate": 1.5647976844287884e-05, "loss": 0.0076, "step": 616 }, { "epoch": 1.1384562132225684, "grad_norm": 0.1988457441329956, "learning_rate": 1.5630243161754395e-05, "loss": 0.0208, "step": 617 }, { "epoch": 1.140302296065536, "grad_norm": 0.08605816215276718, "learning_rate": 1.5612483517224942e-05, "loss": 0.0086, "step": 618 }, { "epoch": 1.1421483789085034, "grad_norm": 0.09312114864587784, "learning_rate": 1.5594697992592232e-05, "loss": 0.0042, "step": 619 }, { "epoch": 1.143994461751471, "grad_norm": 0.08398724347352982, "learning_rate": 1.5576886669868297e-05, "loss": 0.0051, "step": 620 }, { "epoch": 1.1458405445944386, "grad_norm": 0.2768036127090454, "learning_rate": 1.5559049631184136e-05, "loss": 0.0041, "step": 621 }, { "epoch": 1.1476866274374062, "grad_norm": 0.16800187528133392, "learning_rate": 1.5541186958789327e-05, "loss": 0.0029, "step": 622 }, { "epoch": 1.1495327102803738, "grad_norm": 0.09099100530147552, "learning_rate": 1.5523298735051657e-05, "loss": 0.004, "step": 623 }, { "epoch": 1.1513787931233415, "grad_norm": 0.08173321932554245, "learning_rate": 1.5505385042456715e-05, "loss": 0.0043, "step": 624 }, { "epoch": 1.153224875966309, "grad_norm": 0.10678403079509735, "learning_rate": 1.5487445963607554e-05, "loss": 0.0026, "step": 625 }, { "epoch": 1.1550709588092767, "grad_norm": 0.08033821731805801, "learning_rate": 1.5469481581224274e-05, "loss": 0.0044, "step": 626 }, { "epoch": 1.156917041652244, "grad_norm": 0.4525659382343292, "learning_rate": 1.545149197814365e-05, "loss": 0.0082, "step": 627 }, { "epoch": 1.1587631244952117, "grad_norm": 0.07487817108631134, "learning_rate": 1.5433477237318765e-05, "loss": 0.0027, "step": 628 }, { "epoch": 1.1606092073381793, "grad_norm": 0.09451524913311005, "learning_rate": 1.5415437441818615e-05, "loss": 0.0064, "step": 629 }, { "epoch": 1.162455290181147, "grad_norm": 0.2998914122581482, "learning_rate": 1.5397372674827723e-05, "loss": 0.0242, "step": 630 }, { "epoch": 1.1643013730241145, "grad_norm": 0.3896186947822571, "learning_rate": 1.5379283019645757e-05, "loss": 0.0076, "step": 631 }, { "epoch": 1.1661474558670821, "grad_norm": 0.1330152451992035, "learning_rate": 1.5361168559687158e-05, "loss": 0.0032, "step": 632 }, { "epoch": 1.1679935387100495, "grad_norm": 0.11637061834335327, "learning_rate": 1.5343029378480733e-05, "loss": 0.0025, "step": 633 }, { "epoch": 1.1698396215530171, "grad_norm": 0.11787394434213638, "learning_rate": 1.5324865559669295e-05, "loss": 0.0065, "step": 634 }, { "epoch": 1.1716857043959847, "grad_norm": 0.23343242704868317, "learning_rate": 1.5306677187009263e-05, "loss": 0.0064, "step": 635 }, { "epoch": 1.1735317872389524, "grad_norm": 0.17973926663398743, "learning_rate": 1.5288464344370267e-05, "loss": 0.0056, "step": 636 }, { "epoch": 1.17537787008192, "grad_norm": 0.09165448695421219, "learning_rate": 1.527022711573479e-05, "loss": 0.016, "step": 637 }, { "epoch": 1.1772239529248876, "grad_norm": 0.10070207715034485, "learning_rate": 1.5251965585197748e-05, "loss": 0.0036, "step": 638 }, { "epoch": 1.1790700357678552, "grad_norm": 0.11805471032857895, "learning_rate": 1.5233679836966122e-05, "loss": 0.0079, "step": 639 }, { "epoch": 1.1809161186108226, "grad_norm": 0.11095066368579865, "learning_rate": 1.5215369955358568e-05, "loss": 0.007, "step": 640 }, { "epoch": 1.1827622014537902, "grad_norm": 0.1736016422510147, "learning_rate": 1.5197036024805018e-05, "loss": 0.0071, "step": 641 }, { "epoch": 1.1846082842967578, "grad_norm": 0.07015091180801392, "learning_rate": 1.5178678129846311e-05, "loss": 0.0024, "step": 642 }, { "epoch": 1.1864543671397254, "grad_norm": 0.17164985835552216, "learning_rate": 1.5160296355133773e-05, "loss": 0.0084, "step": 643 }, { "epoch": 1.188300449982693, "grad_norm": 0.0814700499176979, "learning_rate": 1.5141890785428855e-05, "loss": 0.0029, "step": 644 }, { "epoch": 1.188300449982693, "eval_loss": 0.009450200945138931, "eval_runtime": 91.4476, "eval_samples_per_second": 9.984, "eval_steps_per_second": 4.997, "step": 644 }, { "epoch": 1.1901465328256606, "grad_norm": 0.1890951693058014, "learning_rate": 1.5123461505602728e-05, "loss": 0.0093, "step": 645 }, { "epoch": 1.191992615668628, "grad_norm": 0.07871969789266586, "learning_rate": 1.5105008600635888e-05, "loss": 0.0038, "step": 646 }, { "epoch": 1.1938386985115956, "grad_norm": 0.3163944184780121, "learning_rate": 1.5086532155617785e-05, "loss": 0.0046, "step": 647 }, { "epoch": 1.1956847813545632, "grad_norm": 0.10958370566368103, "learning_rate": 1.50680322557464e-05, "loss": 0.0086, "step": 648 }, { "epoch": 1.1975308641975309, "grad_norm": 0.08893739432096481, "learning_rate": 1.5049508986327879e-05, "loss": 0.0059, "step": 649 }, { "epoch": 1.1993769470404985, "grad_norm": 0.09862252324819565, "learning_rate": 1.5030962432776126e-05, "loss": 0.0114, "step": 650 }, { "epoch": 1.201223029883466, "grad_norm": 0.1269105225801468, "learning_rate": 1.5012392680612408e-05, "loss": 0.0085, "step": 651 }, { "epoch": 1.2030691127264337, "grad_norm": 0.17242087423801422, "learning_rate": 1.499379981546497e-05, "loss": 0.0049, "step": 652 }, { "epoch": 1.2049151955694013, "grad_norm": 0.05630020052194595, "learning_rate": 1.4975183923068637e-05, "loss": 0.0027, "step": 653 }, { "epoch": 1.2067612784123687, "grad_norm": 0.08747876435518265, "learning_rate": 1.4956545089264408e-05, "loss": 0.0087, "step": 654 }, { "epoch": 1.2086073612553363, "grad_norm": 0.34542739391326904, "learning_rate": 1.493788339999907e-05, "loss": 0.0352, "step": 655 }, { "epoch": 1.210453444098304, "grad_norm": 0.10722552984952927, "learning_rate": 1.4919198941324813e-05, "loss": 0.005, "step": 656 }, { "epoch": 1.2122995269412715, "grad_norm": 0.09170122444629669, "learning_rate": 1.4900491799398802e-05, "loss": 0.0026, "step": 657 }, { "epoch": 1.2141456097842391, "grad_norm": 0.11170299351215363, "learning_rate": 1.4881762060482814e-05, "loss": 0.0065, "step": 658 }, { "epoch": 1.2159916926272065, "grad_norm": 0.12743814289569855, "learning_rate": 1.4863009810942814e-05, "loss": 0.0055, "step": 659 }, { "epoch": 1.2178377754701741, "grad_norm": 0.16897545754909515, "learning_rate": 1.4844235137248575e-05, "loss": 0.0117, "step": 660 }, { "epoch": 1.2196838583131417, "grad_norm": 0.055120084434747696, "learning_rate": 1.4825438125973263e-05, "loss": 0.0017, "step": 661 }, { "epoch": 1.2215299411561094, "grad_norm": 0.0961981788277626, "learning_rate": 1.4806618863793057e-05, "loss": 0.0081, "step": 662 }, { "epoch": 1.223376023999077, "grad_norm": 0.11256881803274155, "learning_rate": 1.4787777437486723e-05, "loss": 0.0067, "step": 663 }, { "epoch": 1.2252221068420446, "grad_norm": 0.1181740015745163, "learning_rate": 1.4768913933935249e-05, "loss": 0.0052, "step": 664 }, { "epoch": 1.2270681896850122, "grad_norm": 0.11756162345409393, "learning_rate": 1.475002844012141e-05, "loss": 0.0119, "step": 665 }, { "epoch": 1.2289142725279798, "grad_norm": 0.11867949366569519, "learning_rate": 1.4731121043129392e-05, "loss": 0.0042, "step": 666 }, { "epoch": 1.2307603553709472, "grad_norm": 0.1038561463356018, "learning_rate": 1.4712191830144369e-05, "loss": 0.0083, "step": 667 }, { "epoch": 1.2326064382139148, "grad_norm": 0.13451151549816132, "learning_rate": 1.4693240888452121e-05, "loss": 0.0064, "step": 668 }, { "epoch": 1.2344525210568824, "grad_norm": 0.18300770223140717, "learning_rate": 1.4674268305438624e-05, "loss": 0.007, "step": 669 }, { "epoch": 1.23629860389985, "grad_norm": 0.09086968749761581, "learning_rate": 1.4655274168589635e-05, "loss": 0.0031, "step": 670 }, { "epoch": 1.2381446867428176, "grad_norm": 0.031037267297506332, "learning_rate": 1.4636258565490304e-05, "loss": 0.0013, "step": 671 }, { "epoch": 1.2399907695857852, "grad_norm": 0.10484003275632858, "learning_rate": 1.461722158382478e-05, "loss": 0.0082, "step": 672 }, { "epoch": 1.2418368524287526, "grad_norm": 0.11549515277147293, "learning_rate": 1.459816331137577e-05, "loss": 0.0089, "step": 673 }, { "epoch": 1.2436829352717202, "grad_norm": 0.09996828436851501, "learning_rate": 1.4579083836024171e-05, "loss": 0.0023, "step": 674 }, { "epoch": 1.2455290181146879, "grad_norm": 0.11347710341215134, "learning_rate": 1.4559983245748639e-05, "loss": 0.0115, "step": 675 }, { "epoch": 1.2473751009576555, "grad_norm": 0.08734507858753204, "learning_rate": 1.4540861628625207e-05, "loss": 0.0057, "step": 676 }, { "epoch": 1.249221183800623, "grad_norm": 0.08014446496963501, "learning_rate": 1.4521719072826858e-05, "loss": 0.0032, "step": 677 }, { "epoch": 1.2510672666435907, "grad_norm": 0.1405206322669983, "learning_rate": 1.450255566662313e-05, "loss": 0.0086, "step": 678 }, { "epoch": 1.2529133494865583, "grad_norm": 0.1792028844356537, "learning_rate": 1.4483371498379702e-05, "loss": 0.0139, "step": 679 }, { "epoch": 1.254759432329526, "grad_norm": 0.08750908821821213, "learning_rate": 1.4464166656557997e-05, "loss": 0.0024, "step": 680 }, { "epoch": 1.2566055151724933, "grad_norm": 0.1796724945306778, "learning_rate": 1.444494122971476e-05, "loss": 0.0102, "step": 681 }, { "epoch": 1.258451598015461, "grad_norm": 0.07849518209695816, "learning_rate": 1.4425695306501656e-05, "loss": 0.0035, "step": 682 }, { "epoch": 1.2602976808584285, "grad_norm": 0.09903024882078171, "learning_rate": 1.4406428975664875e-05, "loss": 0.0115, "step": 683 }, { "epoch": 1.2621437637013961, "grad_norm": 0.12543067336082458, "learning_rate": 1.4387142326044696e-05, "loss": 0.0099, "step": 684 }, { "epoch": 1.2639898465443637, "grad_norm": 0.12240198999643326, "learning_rate": 1.43678354465751e-05, "loss": 0.0124, "step": 685 }, { "epoch": 1.2658359293873311, "grad_norm": 0.1332004815340042, "learning_rate": 1.4348508426283342e-05, "loss": 0.0054, "step": 686 }, { "epoch": 1.2676820122302987, "grad_norm": 0.14584915339946747, "learning_rate": 1.4329161354289562e-05, "loss": 0.0044, "step": 687 }, { "epoch": 1.2695280950732664, "grad_norm": 0.08272266387939453, "learning_rate": 1.4309794319806356e-05, "loss": 0.0035, "step": 688 }, { "epoch": 1.271374177916234, "grad_norm": 0.1179232969880104, "learning_rate": 1.4290407412138365e-05, "loss": 0.0066, "step": 689 }, { "epoch": 1.2732202607592016, "grad_norm": 0.12980088591575623, "learning_rate": 1.4271000720681874e-05, "loss": 0.0075, "step": 690 }, { "epoch": 1.2750663436021692, "grad_norm": 0.09224485605955124, "learning_rate": 1.4251574334924395e-05, "loss": 0.0066, "step": 691 }, { "epoch": 1.2769124264451368, "grad_norm": 0.07015877217054367, "learning_rate": 1.4232128344444251e-05, "loss": 0.0019, "step": 692 }, { "epoch": 1.2787585092881044, "grad_norm": 0.09918259084224701, "learning_rate": 1.421266283891017e-05, "loss": 0.0046, "step": 693 }, { "epoch": 1.2806045921310718, "grad_norm": 0.12773172557353973, "learning_rate": 1.419317790808086e-05, "loss": 0.0171, "step": 694 }, { "epoch": 1.2824506749740394, "grad_norm": 0.15679265558719635, "learning_rate": 1.417367364180461e-05, "loss": 0.0351, "step": 695 }, { "epoch": 1.284296757817007, "grad_norm": 0.04852623865008354, "learning_rate": 1.4154150130018867e-05, "loss": 0.0017, "step": 696 }, { "epoch": 1.2861428406599746, "grad_norm": 0.3174004554748535, "learning_rate": 1.4134607462749814e-05, "loss": 0.0347, "step": 697 }, { "epoch": 1.2879889235029423, "grad_norm": 0.11431937664747238, "learning_rate": 1.411504573011197e-05, "loss": 0.012, "step": 698 }, { "epoch": 1.2898350063459096, "grad_norm": 0.12127841264009476, "learning_rate": 1.409546502230777e-05, "loss": 0.0292, "step": 699 }, { "epoch": 1.2916810891888773, "grad_norm": 0.11356043070554733, "learning_rate": 1.4075865429627143e-05, "loss": 0.0082, "step": 700 }, { "epoch": 1.2935271720318449, "grad_norm": 0.0658322125673294, "learning_rate": 1.4056247042447096e-05, "loss": 0.0034, "step": 701 }, { "epoch": 1.2953732548748125, "grad_norm": 0.10870260745286942, "learning_rate": 1.4036609951231307e-05, "loss": 0.0033, "step": 702 }, { "epoch": 1.29721933771778, "grad_norm": 0.08417758345603943, "learning_rate": 1.4016954246529697e-05, "loss": 0.0038, "step": 703 }, { "epoch": 1.2990654205607477, "grad_norm": 0.05838305503129959, "learning_rate": 1.3997280018978018e-05, "loss": 0.0019, "step": 704 }, { "epoch": 1.3009115034037153, "grad_norm": 0.09693298488855362, "learning_rate": 1.397758735929744e-05, "loss": 0.005, "step": 705 }, { "epoch": 1.302757586246683, "grad_norm": 0.08534782379865646, "learning_rate": 1.3957876358294115e-05, "loss": 0.0044, "step": 706 }, { "epoch": 1.3046036690896505, "grad_norm": 0.08009322732686996, "learning_rate": 1.3938147106858776e-05, "loss": 0.0034, "step": 707 }, { "epoch": 1.306449751932618, "grad_norm": 0.08357948064804077, "learning_rate": 1.391839969596632e-05, "loss": 0.004, "step": 708 }, { "epoch": 1.3082958347755855, "grad_norm": 0.14750374853610992, "learning_rate": 1.3898634216675362e-05, "loss": 0.0058, "step": 709 }, { "epoch": 1.3101419176185531, "grad_norm": 0.06477048248052597, "learning_rate": 1.3878850760127848e-05, "loss": 0.0057, "step": 710 }, { "epoch": 1.3119880004615208, "grad_norm": 0.07694050669670105, "learning_rate": 1.385904941754862e-05, "loss": 0.0039, "step": 711 }, { "epoch": 1.3138340833044884, "grad_norm": 0.1097872257232666, "learning_rate": 1.3839230280244984e-05, "loss": 0.0032, "step": 712 }, { "epoch": 1.3156801661474558, "grad_norm": 0.24045175313949585, "learning_rate": 1.3819393439606313e-05, "loss": 0.0028, "step": 713 }, { "epoch": 1.3175262489904234, "grad_norm": 0.05547857657074928, "learning_rate": 1.37995389871036e-05, "loss": 0.0026, "step": 714 }, { "epoch": 1.319372331833391, "grad_norm": 0.057202987372875214, "learning_rate": 1.3779667014289067e-05, "loss": 0.0025, "step": 715 }, { "epoch": 1.3212184146763586, "grad_norm": 0.06509260088205338, "learning_rate": 1.375977761279571e-05, "loss": 0.0023, "step": 716 }, { "epoch": 1.3230644975193262, "grad_norm": 0.09592520445585251, "learning_rate": 1.3739870874336898e-05, "loss": 0.0085, "step": 717 }, { "epoch": 1.3249105803622938, "grad_norm": 0.06174939498305321, "learning_rate": 1.371994689070594e-05, "loss": 0.0025, "step": 718 }, { "epoch": 1.3267566632052614, "grad_norm": 0.06818564236164093, "learning_rate": 1.3700005753775671e-05, "loss": 0.0022, "step": 719 }, { "epoch": 1.328602746048229, "grad_norm": 0.10227657854557037, "learning_rate": 1.3680047555498017e-05, "loss": 0.0039, "step": 720 }, { "epoch": 1.3304488288911964, "grad_norm": 0.12304170429706573, "learning_rate": 1.366007238790358e-05, "loss": 0.0055, "step": 721 }, { "epoch": 1.332294911734164, "grad_norm": 0.03807997703552246, "learning_rate": 1.3640080343101209e-05, "loss": 0.0015, "step": 722 }, { "epoch": 1.3341409945771316, "grad_norm": 0.058405570685863495, "learning_rate": 1.362007151327758e-05, "loss": 0.0059, "step": 723 }, { "epoch": 1.3359870774200993, "grad_norm": 0.1530926525592804, "learning_rate": 1.3600045990696762e-05, "loss": 0.0178, "step": 724 }, { "epoch": 1.3378331602630669, "grad_norm": 0.07261884212493896, "learning_rate": 1.3580003867699801e-05, "loss": 0.0027, "step": 725 }, { "epoch": 1.3396792431060343, "grad_norm": 0.11661682277917862, "learning_rate": 1.3559945236704286e-05, "loss": 0.0039, "step": 726 }, { "epoch": 1.3415253259490019, "grad_norm": 0.09348852932453156, "learning_rate": 1.3539870190203937e-05, "loss": 0.0098, "step": 727 }, { "epoch": 1.3433714087919695, "grad_norm": 0.06435918807983398, "learning_rate": 1.3519778820768157e-05, "loss": 0.0013, "step": 728 }, { "epoch": 1.345217491634937, "grad_norm": 0.13823622465133667, "learning_rate": 1.349967122104162e-05, "loss": 0.0086, "step": 729 }, { "epoch": 1.3470635744779047, "grad_norm": 0.0859057605266571, "learning_rate": 1.3479547483743847e-05, "loss": 0.0078, "step": 730 }, { "epoch": 1.3489096573208723, "grad_norm": 0.05262494832277298, "learning_rate": 1.3459407701668762e-05, "loss": 0.0021, "step": 731 }, { "epoch": 1.35075574016384, "grad_norm": 0.16662868857383728, "learning_rate": 1.3439251967684288e-05, "loss": 0.0301, "step": 732 }, { "epoch": 1.3526018230068075, "grad_norm": 0.1268492341041565, "learning_rate": 1.3419080374731889e-05, "loss": 0.0079, "step": 733 }, { "epoch": 1.3544479058497751, "grad_norm": 0.09013361483812332, "learning_rate": 1.3398893015826166e-05, "loss": 0.0044, "step": 734 }, { "epoch": 1.3562939886927425, "grad_norm": 0.1784420609474182, "learning_rate": 1.3378689984054426e-05, "loss": 0.0095, "step": 735 }, { "epoch": 1.3581400715357101, "grad_norm": 0.08392225950956345, "learning_rate": 1.3358471372576229e-05, "loss": 0.01, "step": 736 }, { "epoch": 1.3581400715357101, "eval_loss": 0.008186421357095242, "eval_runtime": 91.7934, "eval_samples_per_second": 9.946, "eval_steps_per_second": 4.979, "step": 736 }, { "epoch": 1.3599861543786778, "grad_norm": 0.0762593075633049, "learning_rate": 1.3338237274622983e-05, "loss": 0.0025, "step": 737 }, { "epoch": 1.3618322372216454, "grad_norm": 0.07320329546928406, "learning_rate": 1.331798778349752e-05, "loss": 0.0082, "step": 738 }, { "epoch": 1.363678320064613, "grad_norm": 0.07280085980892181, "learning_rate": 1.3297722992573636e-05, "loss": 0.0027, "step": 739 }, { "epoch": 1.3655244029075804, "grad_norm": 0.05512760952115059, "learning_rate": 1.327744299529568e-05, "loss": 0.0028, "step": 740 }, { "epoch": 1.367370485750548, "grad_norm": 0.0936584621667862, "learning_rate": 1.3257147885178125e-05, "loss": 0.0033, "step": 741 }, { "epoch": 1.3692165685935156, "grad_norm": 0.08296520262956619, "learning_rate": 1.3236837755805127e-05, "loss": 0.0037, "step": 742 }, { "epoch": 1.3710626514364832, "grad_norm": 0.16028916835784912, "learning_rate": 1.3216512700830104e-05, "loss": 0.0044, "step": 743 }, { "epoch": 1.3729087342794508, "grad_norm": 0.054944444447755814, "learning_rate": 1.3196172813975294e-05, "loss": 0.0018, "step": 744 }, { "epoch": 1.3747548171224184, "grad_norm": 0.08457396924495697, "learning_rate": 1.3175818189031326e-05, "loss": 0.0037, "step": 745 }, { "epoch": 1.376600899965386, "grad_norm": 0.12011455744504929, "learning_rate": 1.3155448919856792e-05, "loss": 0.0079, "step": 746 }, { "epoch": 1.3784469828083536, "grad_norm": 0.10577619820833206, "learning_rate": 1.3135065100377816e-05, "loss": 0.0033, "step": 747 }, { "epoch": 1.380293065651321, "grad_norm": 0.10273445397615433, "learning_rate": 1.31146668245876e-05, "loss": 0.0185, "step": 748 }, { "epoch": 1.3821391484942886, "grad_norm": 0.16664770245552063, "learning_rate": 1.3094254186546018e-05, "loss": 0.0137, "step": 749 }, { "epoch": 1.3839852313372563, "grad_norm": 0.08871475607156754, "learning_rate": 1.3073827280379177e-05, "loss": 0.0033, "step": 750 }, { "epoch": 1.3858313141802239, "grad_norm": 0.06521476805210114, "learning_rate": 1.3053386200278963e-05, "loss": 0.0051, "step": 751 }, { "epoch": 1.3876773970231915, "grad_norm": 0.11643893271684647, "learning_rate": 1.3032931040502627e-05, "loss": 0.0087, "step": 752 }, { "epoch": 1.3895234798661589, "grad_norm": 0.10361618548631668, "learning_rate": 1.3012461895372343e-05, "loss": 0.0061, "step": 753 }, { "epoch": 1.3913695627091265, "grad_norm": 0.2820046544075012, "learning_rate": 1.2991978859274776e-05, "loss": 0.002, "step": 754 }, { "epoch": 1.393215645552094, "grad_norm": 0.10074072331190109, "learning_rate": 1.2971482026660642e-05, "loss": 0.0035, "step": 755 }, { "epoch": 1.3950617283950617, "grad_norm": 0.047872625291347504, "learning_rate": 1.2950971492044272e-05, "loss": 0.0018, "step": 756 }, { "epoch": 1.3969078112380293, "grad_norm": 0.10103340446949005, "learning_rate": 1.2930447350003186e-05, "loss": 0.0087, "step": 757 }, { "epoch": 1.398753894080997, "grad_norm": 0.11095941811800003, "learning_rate": 1.2909909695177647e-05, "loss": 0.0059, "step": 758 }, { "epoch": 1.4005999769239645, "grad_norm": 0.07271669059991837, "learning_rate": 1.2889358622270225e-05, "loss": 0.003, "step": 759 }, { "epoch": 1.4024460597669322, "grad_norm": 0.11334390938282013, "learning_rate": 1.2868794226045367e-05, "loss": 0.0085, "step": 760 }, { "epoch": 1.4042921426098995, "grad_norm": 0.06165868788957596, "learning_rate": 1.2848216601328958e-05, "loss": 0.003, "step": 761 }, { "epoch": 1.4061382254528672, "grad_norm": 0.11465856432914734, "learning_rate": 1.2827625843007871e-05, "loss": 0.002, "step": 762 }, { "epoch": 1.4079843082958348, "grad_norm": 0.1449616253376007, "learning_rate": 1.2807022046029556e-05, "loss": 0.014, "step": 763 }, { "epoch": 1.4098303911388024, "grad_norm": 0.04920601472258568, "learning_rate": 1.278640530540157e-05, "loss": 0.0024, "step": 764 }, { "epoch": 1.41167647398177, "grad_norm": 0.07521408796310425, "learning_rate": 1.276577571619117e-05, "loss": 0.0023, "step": 765 }, { "epoch": 1.4135225568247374, "grad_norm": 0.0906362533569336, "learning_rate": 1.2745133373524855e-05, "loss": 0.0054, "step": 766 }, { "epoch": 1.415368639667705, "grad_norm": 0.14911456406116486, "learning_rate": 1.2724478372587921e-05, "loss": 0.0016, "step": 767 }, { "epoch": 1.4172147225106726, "grad_norm": 0.059913989156484604, "learning_rate": 1.2703810808624051e-05, "loss": 0.0029, "step": 768 }, { "epoch": 1.4190608053536402, "grad_norm": 0.10398763418197632, "learning_rate": 1.268313077693485e-05, "loss": 0.0094, "step": 769 }, { "epoch": 1.4209068881966078, "grad_norm": 0.16045577824115753, "learning_rate": 1.2662438372879409e-05, "loss": 0.0032, "step": 770 }, { "epoch": 1.4227529710395754, "grad_norm": 0.0918615460395813, "learning_rate": 1.2641733691873884e-05, "loss": 0.0045, "step": 771 }, { "epoch": 1.424599053882543, "grad_norm": 0.18795117735862732, "learning_rate": 1.2621016829391022e-05, "loss": 0.0192, "step": 772 }, { "epoch": 1.4264451367255107, "grad_norm": 0.0472252257168293, "learning_rate": 1.2600287880959762e-05, "loss": 0.0014, "step": 773 }, { "epoch": 1.4282912195684783, "grad_norm": 0.10216383635997772, "learning_rate": 1.2579546942164762e-05, "loss": 0.0038, "step": 774 }, { "epoch": 1.4301373024114457, "grad_norm": 0.1046018898487091, "learning_rate": 1.2558794108645966e-05, "loss": 0.0045, "step": 775 }, { "epoch": 1.4319833852544133, "grad_norm": 0.08859994262456894, "learning_rate": 1.2538029476098175e-05, "loss": 0.0065, "step": 776 }, { "epoch": 1.4338294680973809, "grad_norm": 0.06777019798755646, "learning_rate": 1.2517253140270595e-05, "loss": 0.0024, "step": 777 }, { "epoch": 1.4356755509403485, "grad_norm": 0.08122848719358444, "learning_rate": 1.2496465196966393e-05, "loss": 0.0025, "step": 778 }, { "epoch": 1.437521633783316, "grad_norm": 0.07865536212921143, "learning_rate": 1.2475665742042269e-05, "loss": 0.0033, "step": 779 }, { "epoch": 1.4393677166262835, "grad_norm": 0.07937873899936676, "learning_rate": 1.2454854871407993e-05, "loss": 0.0109, "step": 780 }, { "epoch": 1.441213799469251, "grad_norm": 0.04683367908000946, "learning_rate": 1.2434032681025986e-05, "loss": 0.0022, "step": 781 }, { "epoch": 1.4430598823122187, "grad_norm": 0.08413892239332199, "learning_rate": 1.2413199266910865e-05, "loss": 0.0018, "step": 782 }, { "epoch": 1.4449059651551863, "grad_norm": 0.04942731931805611, "learning_rate": 1.239235472512899e-05, "loss": 0.0022, "step": 783 }, { "epoch": 1.446752047998154, "grad_norm": 0.06107890605926514, "learning_rate": 1.2371499151798046e-05, "loss": 0.0022, "step": 784 }, { "epoch": 1.4485981308411215, "grad_norm": 0.05639895424246788, "learning_rate": 1.2350632643086583e-05, "loss": 0.0019, "step": 785 }, { "epoch": 1.4504442136840892, "grad_norm": 0.03176642209291458, "learning_rate": 1.2329755295213568e-05, "loss": 0.0013, "step": 786 }, { "epoch": 1.4522902965270568, "grad_norm": 0.08829797804355621, "learning_rate": 1.2308867204447958e-05, "loss": 0.0015, "step": 787 }, { "epoch": 1.4541363793700242, "grad_norm": 0.09004661440849304, "learning_rate": 1.228796846710825e-05, "loss": 0.0105, "step": 788 }, { "epoch": 1.4559824622129918, "grad_norm": 0.09356413036584854, "learning_rate": 1.226705917956202e-05, "loss": 0.0118, "step": 789 }, { "epoch": 1.4578285450559594, "grad_norm": 0.08679736405611038, "learning_rate": 1.2246139438225509e-05, "loss": 0.0036, "step": 790 }, { "epoch": 1.459674627898927, "grad_norm": 0.05207599699497223, "learning_rate": 1.2225209339563144e-05, "loss": 0.0038, "step": 791 }, { "epoch": 1.4615207107418946, "grad_norm": 0.1354241818189621, "learning_rate": 1.2204268980087132e-05, "loss": 0.0215, "step": 792 }, { "epoch": 1.463366793584862, "grad_norm": 0.17882613837718964, "learning_rate": 1.2183318456356984e-05, "loss": 0.0118, "step": 793 }, { "epoch": 1.4652128764278296, "grad_norm": 0.06782796233892441, "learning_rate": 1.2162357864979073e-05, "loss": 0.0019, "step": 794 }, { "epoch": 1.4670589592707972, "grad_norm": 0.1393979787826538, "learning_rate": 1.214138730260621e-05, "loss": 0.0063, "step": 795 }, { "epoch": 1.4689050421137648, "grad_norm": 0.20122197270393372, "learning_rate": 1.2120406865937174e-05, "loss": 0.0104, "step": 796 }, { "epoch": 1.4707511249567324, "grad_norm": 0.11681712418794632, "learning_rate": 1.2099416651716277e-05, "loss": 0.003, "step": 797 }, { "epoch": 1.4725972077997, "grad_norm": 0.07837650179862976, "learning_rate": 1.2078416756732925e-05, "loss": 0.0018, "step": 798 }, { "epoch": 1.4744432906426677, "grad_norm": 0.055501487106084824, "learning_rate": 1.2057407277821148e-05, "loss": 0.0023, "step": 799 }, { "epoch": 1.4762893734856353, "grad_norm": 0.0729304850101471, "learning_rate": 1.2036388311859189e-05, "loss": 0.0085, "step": 800 }, { "epoch": 1.4781354563286029, "grad_norm": 0.05123327299952507, "learning_rate": 1.2015359955769021e-05, "loss": 0.002, "step": 801 }, { "epoch": 1.4799815391715703, "grad_norm": 0.06856680661439896, "learning_rate": 1.1994322306515926e-05, "loss": 0.0027, "step": 802 }, { "epoch": 1.4818276220145379, "grad_norm": 0.08618495613336563, "learning_rate": 1.1973275461108027e-05, "loss": 0.003, "step": 803 }, { "epoch": 1.4836737048575055, "grad_norm": 0.11988022923469543, "learning_rate": 1.1952219516595868e-05, "loss": 0.0027, "step": 804 }, { "epoch": 1.485519787700473, "grad_norm": 0.08761744946241379, "learning_rate": 1.193115457007194e-05, "loss": 0.0022, "step": 805 }, { "epoch": 1.4873658705434407, "grad_norm": 0.06701692938804626, "learning_rate": 1.1910080718670246e-05, "loss": 0.0045, "step": 806 }, { "epoch": 1.489211953386408, "grad_norm": 0.05863157659769058, "learning_rate": 1.1888998059565848e-05, "loss": 0.0021, "step": 807 }, { "epoch": 1.4910580362293757, "grad_norm": 0.08547432720661163, "learning_rate": 1.186790668997443e-05, "loss": 0.0038, "step": 808 }, { "epoch": 1.4929041190723433, "grad_norm": 0.13616196811199188, "learning_rate": 1.1846806707151832e-05, "loss": 0.0042, "step": 809 }, { "epoch": 1.494750201915311, "grad_norm": 0.05642635375261307, "learning_rate": 1.182569820839362e-05, "loss": 0.0036, "step": 810 }, { "epoch": 1.4965962847582785, "grad_norm": 0.11068624258041382, "learning_rate": 1.1804581291034615e-05, "loss": 0.0046, "step": 811 }, { "epoch": 1.4984423676012462, "grad_norm": 0.07298606634140015, "learning_rate": 1.1783456052448476e-05, "loss": 0.0031, "step": 812 }, { "epoch": 1.5002884504442138, "grad_norm": 0.1301308572292328, "learning_rate": 1.176232259004722e-05, "loss": 0.0139, "step": 813 }, { "epoch": 1.5021345332871814, "grad_norm": 0.14820291101932526, "learning_rate": 1.1741181001280783e-05, "loss": 0.0079, "step": 814 }, { "epoch": 1.503980616130149, "grad_norm": 0.0516553595662117, "learning_rate": 1.1720031383636585e-05, "loss": 0.0015, "step": 815 }, { "epoch": 1.5058266989731164, "grad_norm": 0.09537000954151154, "learning_rate": 1.169887383463906e-05, "loss": 0.0099, "step": 816 }, { "epoch": 1.507672781816084, "grad_norm": 0.21092131733894348, "learning_rate": 1.1677708451849214e-05, "loss": 0.0663, "step": 817 }, { "epoch": 1.5095188646590516, "grad_norm": 0.10161333531141281, "learning_rate": 1.165653533286418e-05, "loss": 0.0043, "step": 818 }, { "epoch": 1.511364947502019, "grad_norm": 0.17274737358093262, "learning_rate": 1.1635354575316765e-05, "loss": 0.0243, "step": 819 }, { "epoch": 1.5132110303449866, "grad_norm": 0.09211444109678268, "learning_rate": 1.1614166276874994e-05, "loss": 0.0044, "step": 820 }, { "epoch": 1.5150571131879542, "grad_norm": 0.10178298503160477, "learning_rate": 1.1592970535241668e-05, "loss": 0.0046, "step": 821 }, { "epoch": 1.5169031960309218, "grad_norm": 0.14009442925453186, "learning_rate": 1.15717674481539e-05, "loss": 0.0164, "step": 822 }, { "epoch": 1.5187492788738894, "grad_norm": 0.07609976083040237, "learning_rate": 1.1550557113382697e-05, "loss": 0.0027, "step": 823 }, { "epoch": 1.520595361716857, "grad_norm": 0.08188093453645706, "learning_rate": 1.1529339628732462e-05, "loss": 0.0068, "step": 824 }, { "epoch": 1.5224414445598247, "grad_norm": 0.10872019082307816, "learning_rate": 1.1508115092040577e-05, "loss": 0.0066, "step": 825 }, { "epoch": 1.5242875274027923, "grad_norm": 0.11876392364501953, "learning_rate": 1.1486883601176944e-05, "loss": 0.005, "step": 826 }, { "epoch": 1.5261336102457599, "grad_norm": 0.08055685460567474, "learning_rate": 1.146564525404353e-05, "loss": 0.0033, "step": 827 }, { "epoch": 1.5279796930887275, "grad_norm": 0.055356502532958984, "learning_rate": 1.1444400148573918e-05, "loss": 0.0019, "step": 828 }, { "epoch": 1.5279796930887275, "eval_loss": 0.008090285584330559, "eval_runtime": 91.6507, "eval_samples_per_second": 9.962, "eval_steps_per_second": 4.986, "step": 828 }, { "epoch": 1.5298257759316949, "grad_norm": 0.11040205508470535, "learning_rate": 1.1423148382732854e-05, "loss": 0.0074, "step": 829 }, { "epoch": 1.5316718587746625, "grad_norm": 0.05763715133070946, "learning_rate": 1.1401890054515792e-05, "loss": 0.0019, "step": 830 }, { "epoch": 1.53351794161763, "grad_norm": 0.13855917751789093, "learning_rate": 1.1380625261948458e-05, "loss": 0.0046, "step": 831 }, { "epoch": 1.5353640244605977, "grad_norm": 0.07250423729419708, "learning_rate": 1.1359354103086377e-05, "loss": 0.0029, "step": 832 }, { "epoch": 1.537210107303565, "grad_norm": 0.07645686715841293, "learning_rate": 1.1338076676014427e-05, "loss": 0.0048, "step": 833 }, { "epoch": 1.5390561901465327, "grad_norm": 0.08407016843557358, "learning_rate": 1.1316793078846395e-05, "loss": 0.0047, "step": 834 }, { "epoch": 1.5409022729895003, "grad_norm": 0.08176873624324799, "learning_rate": 1.1295503409724526e-05, "loss": 0.0053, "step": 835 }, { "epoch": 1.542748355832468, "grad_norm": 0.07881695032119751, "learning_rate": 1.127420776681905e-05, "loss": 0.0075, "step": 836 }, { "epoch": 1.5445944386754356, "grad_norm": 0.052021872252225876, "learning_rate": 1.1252906248327753e-05, "loss": 0.0022, "step": 837 }, { "epoch": 1.5464405215184032, "grad_norm": 0.12975731492042542, "learning_rate": 1.1231598952475504e-05, "loss": 0.0109, "step": 838 }, { "epoch": 1.5482866043613708, "grad_norm": 0.06862396746873856, "learning_rate": 1.1210285977513833e-05, "loss": 0.003, "step": 839 }, { "epoch": 1.5501326872043384, "grad_norm": 0.10307830572128296, "learning_rate": 1.1188967421720434e-05, "loss": 0.0068, "step": 840 }, { "epoch": 1.551978770047306, "grad_norm": 0.11857514083385468, "learning_rate": 1.1167643383398746e-05, "loss": 0.0075, "step": 841 }, { "epoch": 1.5538248528902736, "grad_norm": 0.057690635323524475, "learning_rate": 1.1146313960877486e-05, "loss": 0.0018, "step": 842 }, { "epoch": 1.555670935733241, "grad_norm": 0.0859212726354599, "learning_rate": 1.1124979252510209e-05, "loss": 0.0054, "step": 843 }, { "epoch": 1.5575170185762086, "grad_norm": 0.14271004498004913, "learning_rate": 1.1103639356674825e-05, "loss": 0.0175, "step": 844 }, { "epoch": 1.5593631014191762, "grad_norm": 0.0606565847992897, "learning_rate": 1.1082294371773182e-05, "loss": 0.0033, "step": 845 }, { "epoch": 1.5612091842621436, "grad_norm": 0.05779292806982994, "learning_rate": 1.1060944396230583e-05, "loss": 0.0028, "step": 846 }, { "epoch": 1.5630552671051112, "grad_norm": 0.0749453604221344, "learning_rate": 1.1039589528495347e-05, "loss": 0.005, "step": 847 }, { "epoch": 1.5649013499480788, "grad_norm": 0.14807872474193573, "learning_rate": 1.1018229867038358e-05, "loss": 0.0031, "step": 848 }, { "epoch": 1.5667474327910464, "grad_norm": 0.08591480553150177, "learning_rate": 1.099686551035259e-05, "loss": 0.0039, "step": 849 }, { "epoch": 1.568593515634014, "grad_norm": 0.06698640435934067, "learning_rate": 1.0975496556952683e-05, "loss": 0.0026, "step": 850 }, { "epoch": 1.5704395984769817, "grad_norm": 0.06228471174836159, "learning_rate": 1.0954123105374468e-05, "loss": 0.002, "step": 851 }, { "epoch": 1.5722856813199493, "grad_norm": 0.29273462295532227, "learning_rate": 1.0932745254174512e-05, "loss": 0.0175, "step": 852 }, { "epoch": 1.574131764162917, "grad_norm": 0.07816620916128159, "learning_rate": 1.0911363101929677e-05, "loss": 0.0057, "step": 853 }, { "epoch": 1.5759778470058845, "grad_norm": 0.1502242088317871, "learning_rate": 1.0889976747236657e-05, "loss": 0.0067, "step": 854 }, { "epoch": 1.5778239298488521, "grad_norm": 0.0736665204167366, "learning_rate": 1.0868586288711515e-05, "loss": 0.003, "step": 855 }, { "epoch": 1.5796700126918195, "grad_norm": 0.04509057104587555, "learning_rate": 1.0847191824989252e-05, "loss": 0.0053, "step": 856 }, { "epoch": 1.5815160955347871, "grad_norm": 0.05116039142012596, "learning_rate": 1.0825793454723325e-05, "loss": 0.0021, "step": 857 }, { "epoch": 1.5833621783777547, "grad_norm": 0.05853905901312828, "learning_rate": 1.080439127658521e-05, "loss": 0.007, "step": 858 }, { "epoch": 1.5852082612207221, "grad_norm": 0.08044451475143433, "learning_rate": 1.078298538926395e-05, "loss": 0.0045, "step": 859 }, { "epoch": 1.5870543440636897, "grad_norm": 0.09421449154615402, "learning_rate": 1.076157589146567e-05, "loss": 0.0093, "step": 860 }, { "epoch": 1.5889004269066573, "grad_norm": 0.15502449870109558, "learning_rate": 1.0740162881913165e-05, "loss": 0.0184, "step": 861 }, { "epoch": 1.590746509749625, "grad_norm": 0.08580180257558823, "learning_rate": 1.0718746459345415e-05, "loss": 0.0048, "step": 862 }, { "epoch": 1.5925925925925926, "grad_norm": 0.07770812511444092, "learning_rate": 1.0697326722517137e-05, "loss": 0.006, "step": 863 }, { "epoch": 1.5944386754355602, "grad_norm": 0.15370580554008484, "learning_rate": 1.0675903770198333e-05, "loss": 0.0326, "step": 864 }, { "epoch": 1.5962847582785278, "grad_norm": 0.08603129535913467, "learning_rate": 1.0654477701173824e-05, "loss": 0.0037, "step": 865 }, { "epoch": 1.5981308411214954, "grad_norm": 0.1838487982749939, "learning_rate": 1.0633048614242817e-05, "loss": 0.0274, "step": 866 }, { "epoch": 1.599976923964463, "grad_norm": 0.08164787292480469, "learning_rate": 1.0611616608218429e-05, "loss": 0.0028, "step": 867 }, { "epoch": 1.6018230068074306, "grad_norm": 0.05253343656659126, "learning_rate": 1.0590181781927229e-05, "loss": 0.0049, "step": 868 }, { "epoch": 1.6036690896503982, "grad_norm": 0.0916278213262558, "learning_rate": 1.05687442342088e-05, "loss": 0.0053, "step": 869 }, { "epoch": 1.6055151724933656, "grad_norm": 0.06521543860435486, "learning_rate": 1.0547304063915277e-05, "loss": 0.0058, "step": 870 }, { "epoch": 1.6073612553363332, "grad_norm": 0.08294078707695007, "learning_rate": 1.0525861369910877e-05, "loss": 0.0047, "step": 871 }, { "epoch": 1.6092073381793008, "grad_norm": 0.09736410528421402, "learning_rate": 1.0504416251071463e-05, "loss": 0.0041, "step": 872 }, { "epoch": 1.6110534210222682, "grad_norm": 0.1033322662115097, "learning_rate": 1.0482968806284073e-05, "loss": 0.0088, "step": 873 }, { "epoch": 1.6128995038652358, "grad_norm": 0.054253265261650085, "learning_rate": 1.0461519134446477e-05, "loss": 0.0035, "step": 874 }, { "epoch": 1.6147455867082035, "grad_norm": 0.08290667086839676, "learning_rate": 1.0440067334466712e-05, "loss": 0.0101, "step": 875 }, { "epoch": 1.616591669551171, "grad_norm": 0.13517652451992035, "learning_rate": 1.0418613505262623e-05, "loss": 0.0126, "step": 876 }, { "epoch": 1.6184377523941387, "grad_norm": 0.06763678044080734, "learning_rate": 1.0397157745761419e-05, "loss": 0.0025, "step": 877 }, { "epoch": 1.6202838352371063, "grad_norm": 0.07910499721765518, "learning_rate": 1.0375700154899208e-05, "loss": 0.006, "step": 878 }, { "epoch": 1.622129918080074, "grad_norm": 0.111945740878582, "learning_rate": 1.0354240831620542e-05, "loss": 0.0046, "step": 879 }, { "epoch": 1.6239760009230415, "grad_norm": 0.0739910826086998, "learning_rate": 1.0332779874877959e-05, "loss": 0.0049, "step": 880 }, { "epoch": 1.6258220837660091, "grad_norm": 0.11454188078641891, "learning_rate": 1.0311317383631532e-05, "loss": 0.01, "step": 881 }, { "epoch": 1.6276681666089767, "grad_norm": 0.0890415832400322, "learning_rate": 1.028985345684841e-05, "loss": 0.0072, "step": 882 }, { "epoch": 1.6295142494519441, "grad_norm": 0.06510549783706665, "learning_rate": 1.0268388193502365e-05, "loss": 0.0047, "step": 883 }, { "epoch": 1.6313603322949117, "grad_norm": 0.060889944434165955, "learning_rate": 1.0246921692573322e-05, "loss": 0.0024, "step": 884 }, { "epoch": 1.6332064151378793, "grad_norm": 0.14020121097564697, "learning_rate": 1.0225454053046922e-05, "loss": 0.0044, "step": 885 }, { "epoch": 1.6350524979808467, "grad_norm": 0.08631158620119095, "learning_rate": 1.0203985373914056e-05, "loss": 0.003, "step": 886 }, { "epoch": 1.6368985808238143, "grad_norm": 0.17379646003246307, "learning_rate": 1.0182515754170402e-05, "loss": 0.0051, "step": 887 }, { "epoch": 1.638744663666782, "grad_norm": 0.08604667335748672, "learning_rate": 1.0161045292815974e-05, "loss": 0.0053, "step": 888 }, { "epoch": 1.6405907465097496, "grad_norm": 0.09943891316652298, "learning_rate": 1.0139574088854682e-05, "loss": 0.0073, "step": 889 }, { "epoch": 1.6424368293527172, "grad_norm": 0.1661536991596222, "learning_rate": 1.0118102241293848e-05, "loss": 0.0056, "step": 890 }, { "epoch": 1.6442829121956848, "grad_norm": 0.062387865036726, "learning_rate": 1.0096629849143757e-05, "loss": 0.0053, "step": 891 }, { "epoch": 1.6461289950386524, "grad_norm": 0.07064353674650192, "learning_rate": 1.007515701141722e-05, "loss": 0.0036, "step": 892 }, { "epoch": 1.64797507788162, "grad_norm": 0.11467552930116653, "learning_rate": 1.0053683827129091e-05, "loss": 0.0134, "step": 893 }, { "epoch": 1.6498211607245876, "grad_norm": 0.14931882917881012, "learning_rate": 1.0032210395295829e-05, "loss": 0.0165, "step": 894 }, { "epoch": 1.6516672435675552, "grad_norm": 0.06062651425600052, "learning_rate": 1.001073681493503e-05, "loss": 0.0021, "step": 895 }, { "epoch": 1.6535133264105226, "grad_norm": 0.09045737981796265, "learning_rate": 9.989263185064974e-06, "loss": 0.0023, "step": 896 }, { "epoch": 1.6553594092534902, "grad_norm": 0.15048907697200775, "learning_rate": 9.967789604704173e-06, "loss": 0.0048, "step": 897 }, { "epoch": 1.6572054920964578, "grad_norm": 0.05222946032881737, "learning_rate": 9.946316172870909e-06, "loss": 0.0019, "step": 898 }, { "epoch": 1.6590515749394255, "grad_norm": 0.09345798194408417, "learning_rate": 9.924842988582783e-06, "loss": 0.0055, "step": 899 }, { "epoch": 1.6608976577823928, "grad_norm": 0.0894194170832634, "learning_rate": 9.903370150856245e-06, "loss": 0.005, "step": 900 }, { "epoch": 1.6627437406253605, "grad_norm": 0.10677597671747208, "learning_rate": 9.881897758706155e-06, "loss": 0.0069, "step": 901 }, { "epoch": 1.664589823468328, "grad_norm": 0.06564165651798248, "learning_rate": 9.860425911145323e-06, "loss": 0.0072, "step": 902 }, { "epoch": 1.6664359063112957, "grad_norm": 0.056849405169487, "learning_rate": 9.83895470718403e-06, "loss": 0.0018, "step": 903 }, { "epoch": 1.6682819891542633, "grad_norm": 0.05910499766469002, "learning_rate": 9.817484245829603e-06, "loss": 0.0031, "step": 904 }, { "epoch": 1.670128071997231, "grad_norm": 0.1368527114391327, "learning_rate": 9.79601462608595e-06, "loss": 0.0171, "step": 905 }, { "epoch": 1.6719741548401985, "grad_norm": 0.48427149653434753, "learning_rate": 9.77454594695308e-06, "loss": 0.0134, "step": 906 }, { "epoch": 1.6738202376831661, "grad_norm": 0.045465849339962006, "learning_rate": 9.75307830742668e-06, "loss": 0.0016, "step": 907 }, { "epoch": 1.6756663205261337, "grad_norm": 0.29681748151779175, "learning_rate": 9.731611806497637e-06, "loss": 0.006, "step": 908 }, { "epoch": 1.6775124033691013, "grad_norm": 0.12164149433374405, "learning_rate": 9.710146543151593e-06, "loss": 0.0058, "step": 909 }, { "epoch": 1.6793584862120687, "grad_norm": 0.0662166029214859, "learning_rate": 9.688682616368471e-06, "loss": 0.0028, "step": 910 }, { "epoch": 1.6812045690550363, "grad_norm": 0.10360967367887497, "learning_rate": 9.667220125122044e-06, "loss": 0.0069, "step": 911 }, { "epoch": 1.683050651898004, "grad_norm": 0.06868352741003036, "learning_rate": 9.645759168379463e-06, "loss": 0.0028, "step": 912 }, { "epoch": 1.6848967347409713, "grad_norm": 0.07601200044155121, "learning_rate": 9.624299845100795e-06, "loss": 0.0079, "step": 913 }, { "epoch": 1.686742817583939, "grad_norm": 0.08922822028398514, "learning_rate": 9.602842254238583e-06, "loss": 0.0035, "step": 914 }, { "epoch": 1.6885889004269066, "grad_norm": 0.267914742231369, "learning_rate": 9.58138649473738e-06, "loss": 0.0137, "step": 915 }, { "epoch": 1.6904349832698742, "grad_norm": 0.10150199383497238, "learning_rate": 9.559932665533291e-06, "loss": 0.0044, "step": 916 }, { "epoch": 1.6922810661128418, "grad_norm": 0.08167190849781036, "learning_rate": 9.538480865553523e-06, "loss": 0.0028, "step": 917 }, { "epoch": 1.6941271489558094, "grad_norm": 0.08491844683885574, "learning_rate": 9.51703119371593e-06, "loss": 0.0046, "step": 918 }, { "epoch": 1.695973231798777, "grad_norm": 0.1229473352432251, "learning_rate": 9.495583748928539e-06, "loss": 0.0069, "step": 919 }, { "epoch": 1.6978193146417446, "grad_norm": 0.10284758359193802, "learning_rate": 9.474138630089124e-06, "loss": 0.0045, "step": 920 }, { "epoch": 1.6978193146417446, "eval_loss": 0.008002725429832935, "eval_runtime": 92.1065, "eval_samples_per_second": 9.912, "eval_steps_per_second": 4.962, "step": 920 }, { "epoch": 1.6996653974847122, "grad_norm": 0.1596386730670929, "learning_rate": 9.452695936084728e-06, "loss": 0.0124, "step": 921 }, { "epoch": 1.7015114803276798, "grad_norm": 0.11229660362005234, "learning_rate": 9.431255765791201e-06, "loss": 0.0156, "step": 922 }, { "epoch": 1.7033575631706472, "grad_norm": 0.05473716929554939, "learning_rate": 9.409818218072774e-06, "loss": 0.0021, "step": 923 }, { "epoch": 1.7052036460136148, "grad_norm": 0.06023913249373436, "learning_rate": 9.388383391781576e-06, "loss": 0.0029, "step": 924 }, { "epoch": 1.7070497288565825, "grad_norm": 0.06623807549476624, "learning_rate": 9.366951385757184e-06, "loss": 0.0024, "step": 925 }, { "epoch": 1.70889581169955, "grad_norm": 0.08034256845712662, "learning_rate": 9.345522298826177e-06, "loss": 0.0042, "step": 926 }, { "epoch": 1.7107418945425175, "grad_norm": 0.06799780577421188, "learning_rate": 9.324096229801673e-06, "loss": 0.0074, "step": 927 }, { "epoch": 1.712587977385485, "grad_norm": 0.24874067306518555, "learning_rate": 9.302673277482867e-06, "loss": 0.0075, "step": 928 }, { "epoch": 1.7144340602284527, "grad_norm": 0.06625612080097198, "learning_rate": 9.281253540654586e-06, "loss": 0.005, "step": 929 }, { "epoch": 1.7162801430714203, "grad_norm": 0.1352481245994568, "learning_rate": 9.259837118086837e-06, "loss": 0.0113, "step": 930 }, { "epoch": 1.718126225914388, "grad_norm": 0.10778124630451202, "learning_rate": 9.238424108534333e-06, "loss": 0.0048, "step": 931 }, { "epoch": 1.7199723087573555, "grad_norm": 0.060022782534360886, "learning_rate": 9.217014610736054e-06, "loss": 0.0025, "step": 932 }, { "epoch": 1.7218183916003231, "grad_norm": 0.07783634960651398, "learning_rate": 9.19560872341479e-06, "loss": 0.0079, "step": 933 }, { "epoch": 1.7236644744432907, "grad_norm": 0.10427862405776978, "learning_rate": 9.174206545276678e-06, "loss": 0.0072, "step": 934 }, { "epoch": 1.7255105572862584, "grad_norm": 0.10971418768167496, "learning_rate": 9.15280817501075e-06, "loss": 0.0032, "step": 935 }, { "epoch": 1.727356640129226, "grad_norm": 0.057751502841711044, "learning_rate": 9.131413711288485e-06, "loss": 0.0052, "step": 936 }, { "epoch": 1.7292027229721934, "grad_norm": 0.08130896091461182, "learning_rate": 9.110023252763348e-06, "loss": 0.0039, "step": 937 }, { "epoch": 1.731048805815161, "grad_norm": 0.07428565621376038, "learning_rate": 9.088636898070326e-06, "loss": 0.0028, "step": 938 }, { "epoch": 1.7328948886581286, "grad_norm": 0.08366493880748749, "learning_rate": 9.067254745825488e-06, "loss": 0.0027, "step": 939 }, { "epoch": 1.734740971501096, "grad_norm": 0.057682104408741, "learning_rate": 9.045876894625537e-06, "loss": 0.0023, "step": 940 }, { "epoch": 1.7365870543440636, "grad_norm": 0.13296572864055634, "learning_rate": 9.024503443047318e-06, "loss": 0.0064, "step": 941 }, { "epoch": 1.7384331371870312, "grad_norm": 0.10311231017112732, "learning_rate": 9.003134489647412e-06, "loss": 0.0057, "step": 942 }, { "epoch": 1.7402792200299988, "grad_norm": 0.0860329195857048, "learning_rate": 8.981770132961649e-06, "loss": 0.0053, "step": 943 }, { "epoch": 1.7421253028729664, "grad_norm": 0.07848010212182999, "learning_rate": 8.960410471504656e-06, "loss": 0.0029, "step": 944 }, { "epoch": 1.743971385715934, "grad_norm": 0.053934577852487564, "learning_rate": 8.93905560376942e-06, "loss": 0.0041, "step": 945 }, { "epoch": 1.7458174685589016, "grad_norm": 0.0708983913064003, "learning_rate": 8.917705628226823e-06, "loss": 0.0033, "step": 946 }, { "epoch": 1.7476635514018692, "grad_norm": 0.1002105101943016, "learning_rate": 8.896360643325177e-06, "loss": 0.0037, "step": 947 }, { "epoch": 1.7495096342448369, "grad_norm": 0.09933286160230637, "learning_rate": 8.875020747489795e-06, "loss": 0.0147, "step": 948 }, { "epoch": 1.7513557170878045, "grad_norm": 0.10095307976007462, "learning_rate": 8.853686039122519e-06, "loss": 0.0165, "step": 949 }, { "epoch": 1.7532017999307719, "grad_norm": 0.08462752401828766, "learning_rate": 8.83235661660126e-06, "loss": 0.0058, "step": 950 }, { "epoch": 1.7550478827737395, "grad_norm": 0.06106647104024887, "learning_rate": 8.81103257827957e-06, "loss": 0.0048, "step": 951 }, { "epoch": 1.756893965616707, "grad_norm": 0.07836233079433441, "learning_rate": 8.789714022486168e-06, "loss": 0.0026, "step": 952 }, { "epoch": 1.7587400484596745, "grad_norm": 0.09076899290084839, "learning_rate": 8.768401047524498e-06, "loss": 0.0131, "step": 953 }, { "epoch": 1.760586131302642, "grad_norm": 0.07738377153873444, "learning_rate": 8.74709375167225e-06, "loss": 0.0061, "step": 954 }, { "epoch": 1.7624322141456097, "grad_norm": 0.0879320502281189, "learning_rate": 8.72579223318095e-06, "loss": 0.0043, "step": 955 }, { "epoch": 1.7642782969885773, "grad_norm": 0.24148541688919067, "learning_rate": 8.704496590275479e-06, "loss": 0.0047, "step": 956 }, { "epoch": 1.766124379831545, "grad_norm": 0.16469478607177734, "learning_rate": 8.683206921153607e-06, "loss": 0.0042, "step": 957 }, { "epoch": 1.7679704626745125, "grad_norm": 0.24314740300178528, "learning_rate": 8.661923323985576e-06, "loss": 0.0118, "step": 958 }, { "epoch": 1.7698165455174801, "grad_norm": 0.11575371026992798, "learning_rate": 8.640645896913628e-06, "loss": 0.0142, "step": 959 }, { "epoch": 1.7716626283604477, "grad_norm": 0.08918927609920502, "learning_rate": 8.619374738051543e-06, "loss": 0.0181, "step": 960 }, { "epoch": 1.7735087112034154, "grad_norm": 0.05675291642546654, "learning_rate": 8.598109945484208e-06, "loss": 0.0029, "step": 961 }, { "epoch": 1.775354794046383, "grad_norm": 0.07499504834413528, "learning_rate": 8.576851617267151e-06, "loss": 0.0025, "step": 962 }, { "epoch": 1.7772008768893504, "grad_norm": 0.056793201714754105, "learning_rate": 8.555599851426086e-06, "loss": 0.0026, "step": 963 }, { "epoch": 1.779046959732318, "grad_norm": 0.05858364701271057, "learning_rate": 8.534354745956472e-06, "loss": 0.0038, "step": 964 }, { "epoch": 1.7808930425752856, "grad_norm": 0.07139363139867783, "learning_rate": 8.51311639882306e-06, "loss": 0.003, "step": 965 }, { "epoch": 1.7827391254182532, "grad_norm": 0.09954049438238144, "learning_rate": 8.491884907959426e-06, "loss": 0.0118, "step": 966 }, { "epoch": 1.7845852082612206, "grad_norm": 0.09860999137163162, "learning_rate": 8.47066037126754e-06, "loss": 0.0109, "step": 967 }, { "epoch": 1.7864312911041882, "grad_norm": 0.1161302700638771, "learning_rate": 8.449442886617308e-06, "loss": 0.0068, "step": 968 }, { "epoch": 1.7882773739471558, "grad_norm": 0.13166776299476624, "learning_rate": 8.428232551846101e-06, "loss": 0.0317, "step": 969 }, { "epoch": 1.7901234567901234, "grad_norm": 0.11878406256437302, "learning_rate": 8.407029464758335e-06, "loss": 0.0171, "step": 970 }, { "epoch": 1.791969539633091, "grad_norm": 0.06158612668514252, "learning_rate": 8.385833723125006e-06, "loss": 0.0022, "step": 971 }, { "epoch": 1.7938156224760586, "grad_norm": 0.10814138501882553, "learning_rate": 8.364645424683237e-06, "loss": 0.0276, "step": 972 }, { "epoch": 1.7956617053190262, "grad_norm": 0.10428439825773239, "learning_rate": 8.343464667135821e-06, "loss": 0.0101, "step": 973 }, { "epoch": 1.7975077881619939, "grad_norm": 0.061348170042037964, "learning_rate": 8.322291548150786e-06, "loss": 0.0024, "step": 974 }, { "epoch": 1.7993538710049615, "grad_norm": 0.04177645966410637, "learning_rate": 8.301126165360944e-06, "loss": 0.0021, "step": 975 }, { "epoch": 1.801199953847929, "grad_norm": 0.08466877043247223, "learning_rate": 8.279968616363417e-06, "loss": 0.0077, "step": 976 }, { "epoch": 1.8030460366908965, "grad_norm": 0.18225808441638947, "learning_rate": 8.258818998719218e-06, "loss": 0.0136, "step": 977 }, { "epoch": 1.804892119533864, "grad_norm": 0.06195460259914398, "learning_rate": 8.237677409952784e-06, "loss": 0.0022, "step": 978 }, { "epoch": 1.8067382023768317, "grad_norm": 0.1816849410533905, "learning_rate": 8.216543947551525e-06, "loss": 0.0341, "step": 979 }, { "epoch": 1.808584285219799, "grad_norm": 0.09929855912923813, "learning_rate": 8.195418708965386e-06, "loss": 0.006, "step": 980 }, { "epoch": 1.8104303680627667, "grad_norm": 0.043719321489334106, "learning_rate": 8.174301791606384e-06, "loss": 0.0023, "step": 981 }, { "epoch": 1.8122764509057343, "grad_norm": 0.07386981695890427, "learning_rate": 8.15319329284817e-06, "loss": 0.0059, "step": 982 }, { "epoch": 1.814122533748702, "grad_norm": 0.05119425803422928, "learning_rate": 8.132093310025572e-06, "loss": 0.0019, "step": 983 }, { "epoch": 1.8159686165916695, "grad_norm": 0.06272052228450775, "learning_rate": 8.111001940434156e-06, "loss": 0.0036, "step": 984 }, { "epoch": 1.8178146994346371, "grad_norm": 0.06586525589227676, "learning_rate": 8.089919281329756e-06, "loss": 0.0021, "step": 985 }, { "epoch": 1.8196607822776047, "grad_norm": 0.05978992208838463, "learning_rate": 8.06884542992806e-06, "loss": 0.0021, "step": 986 }, { "epoch": 1.8215068651205724, "grad_norm": 0.09013515710830688, "learning_rate": 8.047780483404135e-06, "loss": 0.0052, "step": 987 }, { "epoch": 1.82335294796354, "grad_norm": 0.1136859878897667, "learning_rate": 8.026724538891976e-06, "loss": 0.0063, "step": 988 }, { "epoch": 1.8251990308065076, "grad_norm": 0.09807315468788147, "learning_rate": 8.005677693484077e-06, "loss": 0.0063, "step": 989 }, { "epoch": 1.827045113649475, "grad_norm": 0.07622791081666946, "learning_rate": 7.984640044230984e-06, "loss": 0.0064, "step": 990 }, { "epoch": 1.8288911964924426, "grad_norm": 0.057751450687646866, "learning_rate": 7.963611688140814e-06, "loss": 0.0062, "step": 991 }, { "epoch": 1.8307372793354102, "grad_norm": 0.14268364012241364, "learning_rate": 7.942592722178853e-06, "loss": 0.0094, "step": 992 }, { "epoch": 1.8325833621783778, "grad_norm": 0.04325347766280174, "learning_rate": 7.921583243267079e-06, "loss": 0.0017, "step": 993 }, { "epoch": 1.8344294450213452, "grad_norm": 0.06997363269329071, "learning_rate": 7.900583348283726e-06, "loss": 0.0031, "step": 994 }, { "epoch": 1.8362755278643128, "grad_norm": 0.18991568684577942, "learning_rate": 7.879593134062828e-06, "loss": 0.0057, "step": 995 }, { "epoch": 1.8381216107072804, "grad_norm": 0.07828588038682938, "learning_rate": 7.858612697393792e-06, "loss": 0.0031, "step": 996 }, { "epoch": 1.839967693550248, "grad_norm": 0.0916314348578453, "learning_rate": 7.837642135020929e-06, "loss": 0.0079, "step": 997 }, { "epoch": 1.8418137763932156, "grad_norm": 0.09158594161272049, "learning_rate": 7.816681543643019e-06, "loss": 0.0054, "step": 998 }, { "epoch": 1.8436598592361833, "grad_norm": 0.10289250314235687, "learning_rate": 7.795731019912867e-06, "loss": 0.0082, "step": 999 }, { "epoch": 1.8455059420791509, "grad_norm": 0.07108542323112488, "learning_rate": 7.774790660436857e-06, "loss": 0.0069, "step": 1000 }, { "epoch": 1.8473520249221185, "grad_norm": 0.08768683671951294, "learning_rate": 7.753860561774495e-06, "loss": 0.0158, "step": 1001 }, { "epoch": 1.849198107765086, "grad_norm": 0.06569988280534744, "learning_rate": 7.73294082043798e-06, "loss": 0.0021, "step": 1002 }, { "epoch": 1.8510441906080537, "grad_norm": 0.0680694505572319, "learning_rate": 7.712031532891754e-06, "loss": 0.0031, "step": 1003 }, { "epoch": 1.852890273451021, "grad_norm": 0.07440797984600067, "learning_rate": 7.691132795552044e-06, "loss": 0.0085, "step": 1004 }, { "epoch": 1.8547363562939887, "grad_norm": 0.06806052476167679, "learning_rate": 7.670244704786432e-06, "loss": 0.0031, "step": 1005 }, { "epoch": 1.8565824391369563, "grad_norm": 0.0569663941860199, "learning_rate": 7.649367356913422e-06, "loss": 0.0021, "step": 1006 }, { "epoch": 1.8584285219799237, "grad_norm": 0.23129606246948242, "learning_rate": 7.628500848201956e-06, "loss": 0.0428, "step": 1007 }, { "epoch": 1.8602746048228913, "grad_norm": 0.06253650039434433, "learning_rate": 7.607645274871013e-06, "loss": 0.0027, "step": 1008 }, { "epoch": 1.862120687665859, "grad_norm": 0.10426635295152664, "learning_rate": 7.58680073308914e-06, "loss": 0.0046, "step": 1009 }, { "epoch": 1.8639667705088265, "grad_norm": 0.07335297763347626, "learning_rate": 7.565967318974015e-06, "loss": 0.0028, "step": 1010 }, { "epoch": 1.8658128533517941, "grad_norm": 0.18146410584449768, "learning_rate": 7.545145128592009e-06, "loss": 0.0128, "step": 1011 }, { "epoch": 1.8676589361947618, "grad_norm": 0.17065131664276123, "learning_rate": 7.524334257957737e-06, "loss": 0.0091, "step": 1012 }, { "epoch": 1.8676589361947618, "eval_loss": 0.007683510426431894, "eval_runtime": 91.4577, "eval_samples_per_second": 9.983, "eval_steps_per_second": 4.997, "step": 1012 }, { "epoch": 1.8695050190377294, "grad_norm": 0.07308690249919891, "learning_rate": 7.50353480303361e-06, "loss": 0.0035, "step": 1013 }, { "epoch": 1.871351101880697, "grad_norm": 0.07680094987154007, "learning_rate": 7.482746859729408e-06, "loss": 0.0048, "step": 1014 }, { "epoch": 1.8731971847236646, "grad_norm": 0.05260724946856499, "learning_rate": 7.461970523901827e-06, "loss": 0.0026, "step": 1015 }, { "epoch": 1.8750432675666322, "grad_norm": 0.15840424597263336, "learning_rate": 7.441205891354037e-06, "loss": 0.0063, "step": 1016 }, { "epoch": 1.8768893504095996, "grad_norm": 0.060153163969516754, "learning_rate": 7.42045305783524e-06, "loss": 0.0019, "step": 1017 }, { "epoch": 1.8787354332525672, "grad_norm": 0.13796140253543854, "learning_rate": 7.3997121190402375e-06, "loss": 0.0153, "step": 1018 }, { "epoch": 1.8805815160955348, "grad_norm": 0.15525425970554352, "learning_rate": 7.378983170608982e-06, "loss": 0.0092, "step": 1019 }, { "epoch": 1.8824275989385022, "grad_norm": 0.09056207537651062, "learning_rate": 7.3582663081261195e-06, "loss": 0.0037, "step": 1020 }, { "epoch": 1.8842736817814698, "grad_norm": 0.18960314989089966, "learning_rate": 7.337561627120591e-06, "loss": 0.0053, "step": 1021 }, { "epoch": 1.8861197646244374, "grad_norm": 0.08146478235721588, "learning_rate": 7.316869223065156e-06, "loss": 0.0094, "step": 1022 }, { "epoch": 1.887965847467405, "grad_norm": 0.12083683162927628, "learning_rate": 7.296189191375953e-06, "loss": 0.0023, "step": 1023 }, { "epoch": 1.8898119303103726, "grad_norm": 0.06863249838352203, "learning_rate": 7.275521627412082e-06, "loss": 0.0043, "step": 1024 }, { "epoch": 1.8916580131533403, "grad_norm": 0.05190117284655571, "learning_rate": 7.254866626475152e-06, "loss": 0.0015, "step": 1025 }, { "epoch": 1.8935040959963079, "grad_norm": 0.10676594078540802, "learning_rate": 7.234224283808832e-06, "loss": 0.0146, "step": 1026 }, { "epoch": 1.8953501788392755, "grad_norm": 0.06794177740812302, "learning_rate": 7.213594694598432e-06, "loss": 0.0076, "step": 1027 }, { "epoch": 1.897196261682243, "grad_norm": 0.04282936453819275, "learning_rate": 7.192977953970448e-06, "loss": 0.0018, "step": 1028 }, { "epoch": 1.8990423445252107, "grad_norm": 0.0630037784576416, "learning_rate": 7.172374156992131e-06, "loss": 0.0023, "step": 1029 }, { "epoch": 1.9008884273681783, "grad_norm": 0.069447822868824, "learning_rate": 7.151783398671046e-06, "loss": 0.0057, "step": 1030 }, { "epoch": 1.9027345102111457, "grad_norm": 0.11327061057090759, "learning_rate": 7.131205773954636e-06, "loss": 0.0118, "step": 1031 }, { "epoch": 1.9045805930541133, "grad_norm": 0.12302399426698685, "learning_rate": 7.110641377729778e-06, "loss": 0.0057, "step": 1032 }, { "epoch": 1.906426675897081, "grad_norm": 0.09334293752908707, "learning_rate": 7.090090304822356e-06, "loss": 0.0038, "step": 1033 }, { "epoch": 1.9082727587400483, "grad_norm": 0.05566679313778877, "learning_rate": 7.069552649996819e-06, "loss": 0.0026, "step": 1034 }, { "epoch": 1.910118841583016, "grad_norm": 0.0628797709941864, "learning_rate": 7.049028507955731e-06, "loss": 0.0021, "step": 1035 }, { "epoch": 1.9119649244259835, "grad_norm": 0.09773414582014084, "learning_rate": 7.028517973339361e-06, "loss": 0.0033, "step": 1036 }, { "epoch": 1.9138110072689511, "grad_norm": 0.06920082867145538, "learning_rate": 7.008021140725224e-06, "loss": 0.0119, "step": 1037 }, { "epoch": 1.9156570901119188, "grad_norm": 0.07204987853765488, "learning_rate": 6.9875381046276605e-06, "loss": 0.0026, "step": 1038 }, { "epoch": 1.9175031729548864, "grad_norm": 0.298231303691864, "learning_rate": 6.967068959497376e-06, "loss": 0.0151, "step": 1039 }, { "epoch": 1.919349255797854, "grad_norm": 0.07963436096906662, "learning_rate": 6.946613799721038e-06, "loss": 0.0056, "step": 1040 }, { "epoch": 1.9211953386408216, "grad_norm": 0.10884180665016174, "learning_rate": 6.926172719620827e-06, "loss": 0.0104, "step": 1041 }, { "epoch": 1.9230414214837892, "grad_norm": 0.06557092815637589, "learning_rate": 6.905745813453983e-06, "loss": 0.003, "step": 1042 }, { "epoch": 1.9248875043267568, "grad_norm": 0.08651768416166306, "learning_rate": 6.885333175412406e-06, "loss": 0.0096, "step": 1043 }, { "epoch": 1.9267335871697242, "grad_norm": 0.05259367451071739, "learning_rate": 6.864934899622191e-06, "loss": 0.0034, "step": 1044 }, { "epoch": 1.9285796700126918, "grad_norm": 0.13659609854221344, "learning_rate": 6.844551080143209e-06, "loss": 0.0301, "step": 1045 }, { "epoch": 1.9304257528556594, "grad_norm": 0.10816258192062378, "learning_rate": 6.824181810968675e-06, "loss": 0.0061, "step": 1046 }, { "epoch": 1.9322718356986268, "grad_norm": 0.1165551021695137, "learning_rate": 6.80382718602471e-06, "loss": 0.0042, "step": 1047 }, { "epoch": 1.9341179185415944, "grad_norm": 0.047005485743284225, "learning_rate": 6.783487299169897e-06, "loss": 0.0018, "step": 1048 }, { "epoch": 1.935964001384562, "grad_norm": 0.048299964517354965, "learning_rate": 6.763162244194874e-06, "loss": 0.002, "step": 1049 }, { "epoch": 1.9378100842275297, "grad_norm": 0.1873732954263687, "learning_rate": 6.74285211482188e-06, "loss": 0.0249, "step": 1050 }, { "epoch": 1.9396561670704973, "grad_norm": 0.08936057239770889, "learning_rate": 6.722557004704322e-06, "loss": 0.0042, "step": 1051 }, { "epoch": 1.9415022499134649, "grad_norm": 0.09909237921237946, "learning_rate": 6.702277007426365e-06, "loss": 0.0102, "step": 1052 }, { "epoch": 1.9433483327564325, "grad_norm": 0.06664317846298218, "learning_rate": 6.6820122165024845e-06, "loss": 0.0066, "step": 1053 }, { "epoch": 1.9451944155994, "grad_norm": 0.0672774612903595, "learning_rate": 6.661762725377019e-06, "loss": 0.0082, "step": 1054 }, { "epoch": 1.9470404984423677, "grad_norm": 0.05301275476813316, "learning_rate": 6.6415286274237744e-06, "loss": 0.002, "step": 1055 }, { "epoch": 1.9488865812853353, "grad_norm": 0.06802228093147278, "learning_rate": 6.62131001594558e-06, "loss": 0.0094, "step": 1056 }, { "epoch": 1.9507326641283027, "grad_norm": 0.15557558834552765, "learning_rate": 6.601106984173835e-06, "loss": 0.0035, "step": 1057 }, { "epoch": 1.9525787469712703, "grad_norm": 0.06863709539175034, "learning_rate": 6.580919625268114e-06, "loss": 0.0032, "step": 1058 }, { "epoch": 1.954424829814238, "grad_norm": 0.05429449677467346, "learning_rate": 6.560748032315713e-06, "loss": 0.0037, "step": 1059 }, { "epoch": 1.9562709126572055, "grad_norm": 0.09654875099658966, "learning_rate": 6.540592298331239e-06, "loss": 0.0087, "step": 1060 }, { "epoch": 1.958116995500173, "grad_norm": 0.08214404433965683, "learning_rate": 6.520452516256157e-06, "loss": 0.0134, "step": 1061 }, { "epoch": 1.9599630783431405, "grad_norm": 0.09481552988290787, "learning_rate": 6.5003287789583825e-06, "loss": 0.0033, "step": 1062 }, { "epoch": 1.9618091611861082, "grad_norm": 0.05937489867210388, "learning_rate": 6.480221179231849e-06, "loss": 0.005, "step": 1063 }, { "epoch": 1.9636552440290758, "grad_norm": 0.06381800770759583, "learning_rate": 6.460129809796067e-06, "loss": 0.0025, "step": 1064 }, { "epoch": 1.9655013268720434, "grad_norm": 0.10020963102579117, "learning_rate": 6.440054763295714e-06, "loss": 0.0042, "step": 1065 }, { "epoch": 1.967347409715011, "grad_norm": 0.10043366998434067, "learning_rate": 6.419996132300203e-06, "loss": 0.0062, "step": 1066 }, { "epoch": 1.9691934925579786, "grad_norm": 0.07305952906608582, "learning_rate": 6.3999540093032396e-06, "loss": 0.0058, "step": 1067 }, { "epoch": 1.9710395754009462, "grad_norm": 0.052628278732299805, "learning_rate": 6.379928486722421e-06, "loss": 0.0027, "step": 1068 }, { "epoch": 1.9728856582439138, "grad_norm": 0.051156193017959595, "learning_rate": 6.359919656898794e-06, "loss": 0.002, "step": 1069 }, { "epoch": 1.9747317410868814, "grad_norm": 0.13866794109344482, "learning_rate": 6.3399276120964235e-06, "loss": 0.0098, "step": 1070 }, { "epoch": 1.9765778239298488, "grad_norm": 0.12971021234989166, "learning_rate": 6.319952444501984e-06, "loss": 0.0115, "step": 1071 }, { "epoch": 1.9784239067728164, "grad_norm": 0.05518823862075806, "learning_rate": 6.2999942462243345e-06, "loss": 0.0034, "step": 1072 }, { "epoch": 1.980269989615784, "grad_norm": 0.07717905938625336, "learning_rate": 6.280053109294064e-06, "loss": 0.0083, "step": 1073 }, { "epoch": 1.9821160724587514, "grad_norm": 0.053649693727493286, "learning_rate": 6.260129125663106e-06, "loss": 0.0016, "step": 1074 }, { "epoch": 1.983962155301719, "grad_norm": 0.12298066914081573, "learning_rate": 6.240222387204293e-06, "loss": 0.0122, "step": 1075 }, { "epoch": 1.9858082381446867, "grad_norm": 0.10608681291341782, "learning_rate": 6.220332985710936e-06, "loss": 0.0037, "step": 1076 }, { "epoch": 1.9876543209876543, "grad_norm": 0.07757773250341415, "learning_rate": 6.200461012896401e-06, "loss": 0.0027, "step": 1077 }, { "epoch": 1.9895004038306219, "grad_norm": 0.14791713654994965, "learning_rate": 6.180606560393694e-06, "loss": 0.008, "step": 1078 }, { "epoch": 1.9913464866735895, "grad_norm": 0.046395193785429, "learning_rate": 6.16076971975502e-06, "loss": 0.0019, "step": 1079 }, { "epoch": 1.993192569516557, "grad_norm": 0.11292968690395355, "learning_rate": 6.140950582451384e-06, "loss": 0.0063, "step": 1080 }, { "epoch": 1.9950386523595247, "grad_norm": 0.04149056598544121, "learning_rate": 6.121149239872151e-06, "loss": 0.0008, "step": 1081 }, { "epoch": 1.9968847352024923, "grad_norm": 0.10369537770748138, "learning_rate": 6.1013657833246396e-06, "loss": 0.0128, "step": 1082 }, { "epoch": 1.99873081804546, "grad_norm": 0.09148327261209488, "learning_rate": 6.081600304033682e-06, "loss": 0.0045, "step": 1083 }, { "epoch": 2.0, "grad_norm": 0.1143283098936081, "learning_rate": 6.061852893141222e-06, "loss": 0.0087, "step": 1084 }, { "epoch": 2.0018460828429676, "grad_norm": 0.059561945497989655, "learning_rate": 6.04212364170589e-06, "loss": 0.0019, "step": 1085 }, { "epoch": 2.0036921656859352, "grad_norm": 0.07360690087080002, "learning_rate": 6.0224126407025616e-06, "loss": 0.0008, "step": 1086 }, { "epoch": 2.005538248528903, "grad_norm": 0.0354192815721035, "learning_rate": 6.002719981021982e-06, "loss": 0.0014, "step": 1087 }, { "epoch": 2.0073843313718704, "grad_norm": 0.0898759588599205, "learning_rate": 5.983045753470308e-06, "loss": 0.0093, "step": 1088 }, { "epoch": 2.009230414214838, "grad_norm": 0.049619074910879135, "learning_rate": 5.963390048768698e-06, "loss": 0.0017, "step": 1089 }, { "epoch": 2.0110764970578057, "grad_norm": 0.031979288905858994, "learning_rate": 5.9437529575529085e-06, "loss": 0.0011, "step": 1090 }, { "epoch": 2.012922579900773, "grad_norm": 0.031964387744665146, "learning_rate": 5.924134570372863e-06, "loss": 0.0021, "step": 1091 }, { "epoch": 2.0147686627437404, "grad_norm": 0.028327271342277527, "learning_rate": 5.9045349776922335e-06, "loss": 0.0008, "step": 1092 }, { "epoch": 2.016614745586708, "grad_norm": 0.05546770617365837, "learning_rate": 5.884954269888032e-06, "loss": 0.0011, "step": 1093 }, { "epoch": 2.0184608284296757, "grad_norm": 0.11671288311481476, "learning_rate": 5.865392537250191e-06, "loss": 0.0037, "step": 1094 }, { "epoch": 2.0203069112726433, "grad_norm": 0.09133674204349518, "learning_rate": 5.845849869981137e-06, "loss": 0.0126, "step": 1095 }, { "epoch": 2.022152994115611, "grad_norm": 0.06473570317029953, "learning_rate": 5.826326358195391e-06, "loss": 0.0032, "step": 1096 }, { "epoch": 2.0239990769585785, "grad_norm": 0.05737852305173874, "learning_rate": 5.806822091919143e-06, "loss": 0.0035, "step": 1097 }, { "epoch": 2.025845159801546, "grad_norm": 0.05501372739672661, "learning_rate": 5.787337161089836e-06, "loss": 0.0037, "step": 1098 }, { "epoch": 2.0276912426445137, "grad_norm": 0.03991509601473808, "learning_rate": 5.7678716555557515e-06, "loss": 0.0011, "step": 1099 }, { "epoch": 2.0295373254874813, "grad_norm": 0.042931295931339264, "learning_rate": 5.74842566507561e-06, "loss": 0.0011, "step": 1100 }, { "epoch": 2.031383408330449, "grad_norm": 0.09508123248815536, "learning_rate": 5.728999279318131e-06, "loss": 0.0144, "step": 1101 }, { "epoch": 2.0332294911734166, "grad_norm": 0.04815695434808731, "learning_rate": 5.709592587861637e-06, "loss": 0.0026, "step": 1102 }, { "epoch": 2.035075574016384, "grad_norm": 0.05574629828333855, "learning_rate": 5.690205680193647e-06, "loss": 0.0011, "step": 1103 }, { "epoch": 2.036921656859352, "grad_norm": 0.10294865071773529, "learning_rate": 5.670838645710439e-06, "loss": 0.0057, "step": 1104 }, { "epoch": 2.036921656859352, "eval_loss": 0.008138884790241718, "eval_runtime": 91.5583, "eval_samples_per_second": 9.972, "eval_steps_per_second": 4.991, "step": 1104 }, { "epoch": 2.038767739702319, "grad_norm": 0.03519001603126526, "learning_rate": 5.651491573716657e-06, "loss": 0.0028, "step": 1105 }, { "epoch": 2.0406138225452866, "grad_norm": 0.046490173786878586, "learning_rate": 5.632164553424904e-06, "loss": 0.0032, "step": 1106 }, { "epoch": 2.042459905388254, "grad_norm": 0.044620878994464874, "learning_rate": 5.612857673955308e-06, "loss": 0.0012, "step": 1107 }, { "epoch": 2.044305988231222, "grad_norm": 0.0896461084485054, "learning_rate": 5.593571024335126e-06, "loss": 0.0033, "step": 1108 }, { "epoch": 2.0461520710741894, "grad_norm": 0.06016547977924347, "learning_rate": 5.574304693498346e-06, "loss": 0.0055, "step": 1109 }, { "epoch": 2.047998153917157, "grad_norm": 0.13048522174358368, "learning_rate": 5.5550587702852465e-06, "loss": 0.0086, "step": 1110 }, { "epoch": 2.0498442367601246, "grad_norm": 0.05786297842860222, "learning_rate": 5.5358333434420054e-06, "loss": 0.0048, "step": 1111 }, { "epoch": 2.0516903196030922, "grad_norm": 0.03565092384815216, "learning_rate": 5.516628501620299e-06, "loss": 0.0011, "step": 1112 }, { "epoch": 2.05353640244606, "grad_norm": 0.05448061600327492, "learning_rate": 5.497444333376874e-06, "loss": 0.0038, "step": 1113 }, { "epoch": 2.0553824852890274, "grad_norm": 0.04243125393986702, "learning_rate": 5.478280927173145e-06, "loss": 0.0013, "step": 1114 }, { "epoch": 2.057228568131995, "grad_norm": 0.054316967725753784, "learning_rate": 5.459138371374795e-06, "loss": 0.002, "step": 1115 }, { "epoch": 2.0590746509749627, "grad_norm": 0.03207079693675041, "learning_rate": 5.440016754251364e-06, "loss": 0.0012, "step": 1116 }, { "epoch": 2.0609207338179303, "grad_norm": 0.20276927947998047, "learning_rate": 5.420916163975836e-06, "loss": 0.0013, "step": 1117 }, { "epoch": 2.0627668166608975, "grad_norm": 0.04171600192785263, "learning_rate": 5.401836688624231e-06, "loss": 0.0014, "step": 1118 }, { "epoch": 2.064612899503865, "grad_norm": 0.060076795518398285, "learning_rate": 5.382778416175223e-06, "loss": 0.003, "step": 1119 }, { "epoch": 2.0664589823468327, "grad_norm": 0.0577683262526989, "learning_rate": 5.363741434509697e-06, "loss": 0.0011, "step": 1120 }, { "epoch": 2.0683050651898003, "grad_norm": 0.1408967226743698, "learning_rate": 5.344725831410369e-06, "loss": 0.0154, "step": 1121 }, { "epoch": 2.070151148032768, "grad_norm": 0.05401252210140228, "learning_rate": 5.32573169456138e-06, "loss": 0.0009, "step": 1122 }, { "epoch": 2.0719972308757355, "grad_norm": 0.05760827288031578, "learning_rate": 5.306759111547881e-06, "loss": 0.0013, "step": 1123 }, { "epoch": 2.073843313718703, "grad_norm": 0.04372551664710045, "learning_rate": 5.28780816985563e-06, "loss": 0.0035, "step": 1124 }, { "epoch": 2.0756893965616707, "grad_norm": 0.1505114585161209, "learning_rate": 5.26887895687061e-06, "loss": 0.0045, "step": 1125 }, { "epoch": 2.0775354794046383, "grad_norm": 0.033471547067165375, "learning_rate": 5.24997155987859e-06, "loss": 0.0007, "step": 1126 }, { "epoch": 2.079381562247606, "grad_norm": 0.05673528090119362, "learning_rate": 5.231086066064751e-06, "loss": 0.0015, "step": 1127 }, { "epoch": 2.0812276450905736, "grad_norm": 0.18660074472427368, "learning_rate": 5.212222562513278e-06, "loss": 0.0058, "step": 1128 }, { "epoch": 2.083073727933541, "grad_norm": 0.10123515874147415, "learning_rate": 5.193381136206948e-06, "loss": 0.0122, "step": 1129 }, { "epoch": 2.084919810776509, "grad_norm": 0.0788915827870369, "learning_rate": 5.174561874026741e-06, "loss": 0.0036, "step": 1130 }, { "epoch": 2.086765893619476, "grad_norm": 0.10579413175582886, "learning_rate": 5.155764862751427e-06, "loss": 0.0023, "step": 1131 }, { "epoch": 2.0886119764624436, "grad_norm": 0.039463870227336884, "learning_rate": 5.136990189057187e-06, "loss": 0.0025, "step": 1132 }, { "epoch": 2.090458059305411, "grad_norm": 0.09689252078533173, "learning_rate": 5.11823793951719e-06, "loss": 0.0055, "step": 1133 }, { "epoch": 2.092304142148379, "grad_norm": 0.09919244050979614, "learning_rate": 5.099508200601198e-06, "loss": 0.0045, "step": 1134 }, { "epoch": 2.0941502249913464, "grad_norm": 0.06431836634874344, "learning_rate": 5.080801058675191e-06, "loss": 0.0087, "step": 1135 }, { "epoch": 2.095996307834314, "grad_norm": 0.0821138471364975, "learning_rate": 5.062116600000933e-06, "loss": 0.0017, "step": 1136 }, { "epoch": 2.0978423906772816, "grad_norm": 0.050396766513586044, "learning_rate": 5.043454910735595e-06, "loss": 0.0009, "step": 1137 }, { "epoch": 2.0996884735202492, "grad_norm": 0.0441858284175396, "learning_rate": 5.024816076931366e-06, "loss": 0.0015, "step": 1138 }, { "epoch": 2.101534556363217, "grad_norm": 0.057779595255851746, "learning_rate": 5.006200184535033e-06, "loss": 0.0025, "step": 1139 }, { "epoch": 2.1033806392061845, "grad_norm": 0.06962257623672485, "learning_rate": 4.987607319387593e-06, "loss": 0.0019, "step": 1140 }, { "epoch": 2.105226722049152, "grad_norm": 0.0638086274266243, "learning_rate": 4.969037567223881e-06, "loss": 0.0033, "step": 1141 }, { "epoch": 2.1070728048921197, "grad_norm": 0.054150376468896866, "learning_rate": 4.950491013672124e-06, "loss": 0.0014, "step": 1142 }, { "epoch": 2.1089188877350873, "grad_norm": 0.06254181265830994, "learning_rate": 4.931967744253601e-06, "loss": 0.0024, "step": 1143 }, { "epoch": 2.110764970578055, "grad_norm": 0.04108688607811928, "learning_rate": 4.913467844382217e-06, "loss": 0.0017, "step": 1144 }, { "epoch": 2.112611053421022, "grad_norm": 0.05044626444578171, "learning_rate": 4.894991399364113e-06, "loss": 0.0013, "step": 1145 }, { "epoch": 2.1144571362639897, "grad_norm": 0.0502905435860157, "learning_rate": 4.876538494397274e-06, "loss": 0.0009, "step": 1146 }, { "epoch": 2.1163032191069573, "grad_norm": 0.043060798197984695, "learning_rate": 4.8581092145711466e-06, "loss": 0.0009, "step": 1147 }, { "epoch": 2.118149301949925, "grad_norm": 0.06760820746421814, "learning_rate": 4.839703644866228e-06, "loss": 0.0023, "step": 1148 }, { "epoch": 2.1199953847928925, "grad_norm": 0.06913480162620544, "learning_rate": 4.821321870153692e-06, "loss": 0.0062, "step": 1149 }, { "epoch": 2.12184146763586, "grad_norm": 0.10108120739459991, "learning_rate": 4.802963975194981e-06, "loss": 0.012, "step": 1150 }, { "epoch": 2.1236875504788277, "grad_norm": 0.028255267068743706, "learning_rate": 4.784630044641435e-06, "loss": 0.0006, "step": 1151 }, { "epoch": 2.1255336333217953, "grad_norm": 0.1202707514166832, "learning_rate": 4.766320163033882e-06, "loss": 0.0026, "step": 1152 }, { "epoch": 2.127379716164763, "grad_norm": 0.05136001482605934, "learning_rate": 4.7480344148022535e-06, "loss": 0.0021, "step": 1153 }, { "epoch": 2.1292257990077306, "grad_norm": 0.04602188244462013, "learning_rate": 4.729772884265212e-06, "loss": 0.0011, "step": 1154 }, { "epoch": 2.131071881850698, "grad_norm": 0.06541673839092255, "learning_rate": 4.711535655629735e-06, "loss": 0.0023, "step": 1155 }, { "epoch": 2.132917964693666, "grad_norm": 0.03694295138120651, "learning_rate": 4.6933228129907395e-06, "loss": 0.0009, "step": 1156 }, { "epoch": 2.1347640475366334, "grad_norm": 0.06101083382964134, "learning_rate": 4.675134440330706e-06, "loss": 0.0006, "step": 1157 }, { "epoch": 2.1366101303796006, "grad_norm": 0.028277406468987465, "learning_rate": 4.65697062151927e-06, "loss": 0.0006, "step": 1158 }, { "epoch": 2.138456213222568, "grad_norm": 0.08642231673002243, "learning_rate": 4.638831440312844e-06, "loss": 0.0026, "step": 1159 }, { "epoch": 2.140302296065536, "grad_norm": 0.07297013700008392, "learning_rate": 4.620716980354248e-06, "loss": 0.0017, "step": 1160 }, { "epoch": 2.1421483789085034, "grad_norm": 0.06735076755285263, "learning_rate": 4.602627325172279e-06, "loss": 0.0021, "step": 1161 }, { "epoch": 2.143994461751471, "grad_norm": 0.14374354481697083, "learning_rate": 4.584562558181384e-06, "loss": 0.0115, "step": 1162 }, { "epoch": 2.1458405445944386, "grad_norm": 0.08532073348760605, "learning_rate": 4.566522762681239e-06, "loss": 0.0102, "step": 1163 }, { "epoch": 2.1476866274374062, "grad_norm": 0.10443772375583649, "learning_rate": 4.548508021856354e-06, "loss": 0.0072, "step": 1164 }, { "epoch": 2.149532710280374, "grad_norm": 0.043913282454013824, "learning_rate": 4.530518418775734e-06, "loss": 0.0027, "step": 1165 }, { "epoch": 2.1513787931233415, "grad_norm": 0.08159793168306351, "learning_rate": 4.512554036392448e-06, "loss": 0.0014, "step": 1166 }, { "epoch": 2.153224875966309, "grad_norm": 0.14023910462856293, "learning_rate": 4.494614957543286e-06, "loss": 0.0083, "step": 1167 }, { "epoch": 2.1550709588092767, "grad_norm": 0.07015207409858704, "learning_rate": 4.4767012649483484e-06, "loss": 0.0056, "step": 1168 }, { "epoch": 2.1569170416522443, "grad_norm": 0.027905292809009552, "learning_rate": 4.458813041210672e-06, "loss": 0.0008, "step": 1169 }, { "epoch": 2.158763124495212, "grad_norm": 0.08288539946079254, "learning_rate": 4.440950368815866e-06, "loss": 0.0015, "step": 1170 }, { "epoch": 2.1606092073381795, "grad_norm": 0.042242277413606644, "learning_rate": 4.423113330131708e-06, "loss": 0.001, "step": 1171 }, { "epoch": 2.1624552901811467, "grad_norm": 0.04000399261713028, "learning_rate": 4.40530200740777e-06, "loss": 0.0007, "step": 1172 }, { "epoch": 2.1643013730241143, "grad_norm": 0.07605951279401779, "learning_rate": 4.387516482775058e-06, "loss": 0.0015, "step": 1173 }, { "epoch": 2.166147455867082, "grad_norm": 0.15174829959869385, "learning_rate": 4.369756838245608e-06, "loss": 0.0057, "step": 1174 }, { "epoch": 2.1679935387100495, "grad_norm": 0.0797356367111206, "learning_rate": 4.352023155712116e-06, "loss": 0.0085, "step": 1175 }, { "epoch": 2.169839621553017, "grad_norm": 0.04051666334271431, "learning_rate": 4.33431551694758e-06, "loss": 0.0007, "step": 1176 }, { "epoch": 2.1716857043959847, "grad_norm": 0.06101857125759125, "learning_rate": 4.316634003604878e-06, "loss": 0.0046, "step": 1177 }, { "epoch": 2.1735317872389524, "grad_norm": 0.16660486161708832, "learning_rate": 4.298978697216442e-06, "loss": 0.0049, "step": 1178 }, { "epoch": 2.17537787008192, "grad_norm": 0.12328508496284485, "learning_rate": 4.281349679193862e-06, "loss": 0.0075, "step": 1179 }, { "epoch": 2.1772239529248876, "grad_norm": 0.0972929298877716, "learning_rate": 4.263747030827481e-06, "loss": 0.0027, "step": 1180 }, { "epoch": 2.179070035767855, "grad_norm": 0.0562286414206028, "learning_rate": 4.246170833286075e-06, "loss": 0.0045, "step": 1181 }, { "epoch": 2.180916118610823, "grad_norm": 0.0480145663022995, "learning_rate": 4.228621167616438e-06, "loss": 0.0014, "step": 1182 }, { "epoch": 2.1827622014537904, "grad_norm": 0.0439535528421402, "learning_rate": 4.21109811474302e-06, "loss": 0.0021, "step": 1183 }, { "epoch": 2.184608284296758, "grad_norm": 0.03982429951429367, "learning_rate": 4.1936017554675635e-06, "loss": 0.001, "step": 1184 }, { "epoch": 2.186454367139725, "grad_norm": 0.04404020681977272, "learning_rate": 4.176132170468714e-06, "loss": 0.0011, "step": 1185 }, { "epoch": 2.188300449982693, "grad_norm": 0.05249952897429466, "learning_rate": 4.1586894403016576e-06, "loss": 0.0024, "step": 1186 }, { "epoch": 2.1901465328256604, "grad_norm": 0.06995794177055359, "learning_rate": 4.1412736453977545e-06, "loss": 0.0021, "step": 1187 }, { "epoch": 2.191992615668628, "grad_norm": 0.09189380705356598, "learning_rate": 4.1238848660641504e-06, "loss": 0.005, "step": 1188 }, { "epoch": 2.1938386985115956, "grad_norm": 0.09258892387151718, "learning_rate": 4.106523182483434e-06, "loss": 0.0037, "step": 1189 }, { "epoch": 2.1956847813545632, "grad_norm": 0.07874837517738342, "learning_rate": 4.0891886747132356e-06, "loss": 0.0041, "step": 1190 }, { "epoch": 2.197530864197531, "grad_norm": 0.05509480461478233, "learning_rate": 4.071881422685877e-06, "loss": 0.0013, "step": 1191 }, { "epoch": 2.1993769470404985, "grad_norm": 0.09273144602775574, "learning_rate": 4.054601506208009e-06, "loss": 0.0012, "step": 1192 }, { "epoch": 2.201223029883466, "grad_norm": 0.0319955050945282, "learning_rate": 4.03734900496022e-06, "loss": 0.0005, "step": 1193 }, { "epoch": 2.2030691127264337, "grad_norm": 0.06351076811552048, "learning_rate": 4.020123998496688e-06, "loss": 0.0041, "step": 1194 }, { "epoch": 2.2049151955694013, "grad_norm": 0.11725535243749619, "learning_rate": 4.002926566244816e-06, "loss": 0.004, "step": 1195 }, { "epoch": 2.206761278412369, "grad_norm": 0.034534893929958344, "learning_rate": 3.985756787504837e-06, "loss": 0.0006, "step": 1196 }, { "epoch": 2.206761278412369, "eval_loss": 0.008584747090935707, "eval_runtime": 91.2554, "eval_samples_per_second": 10.005, "eval_steps_per_second": 5.008, "step": 1196 }, { "epoch": 2.2086073612553365, "grad_norm": 0.052446335554122925, "learning_rate": 3.968614741449488e-06, "loss": 0.0039, "step": 1197 }, { "epoch": 2.210453444098304, "grad_norm": 0.05869259312748909, "learning_rate": 3.9515005071236274e-06, "loss": 0.0044, "step": 1198 }, { "epoch": 2.2122995269412713, "grad_norm": 0.030158422887325287, "learning_rate": 3.9344141634438484e-06, "loss": 0.0007, "step": 1199 }, { "epoch": 2.214145609784239, "grad_norm": 0.05609900876879692, "learning_rate": 3.917355789198157e-06, "loss": 0.0047, "step": 1200 }, { "epoch": 2.2159916926272065, "grad_norm": 0.09441249072551727, "learning_rate": 3.9003254630455775e-06, "loss": 0.007, "step": 1201 }, { "epoch": 2.217837775470174, "grad_norm": 0.03854013606905937, "learning_rate": 3.883323263515798e-06, "loss": 0.001, "step": 1202 }, { "epoch": 2.2196838583131417, "grad_norm": 0.10601000487804413, "learning_rate": 3.866349269008819e-06, "loss": 0.002, "step": 1203 }, { "epoch": 2.2215299411561094, "grad_norm": 0.05536085367202759, "learning_rate": 3.8494035577945745e-06, "loss": 0.0028, "step": 1204 }, { "epoch": 2.223376023999077, "grad_norm": 0.08182524889707565, "learning_rate": 3.832486208012579e-06, "loss": 0.0053, "step": 1205 }, { "epoch": 2.2252221068420446, "grad_norm": 0.046531178057193756, "learning_rate": 3.815597297671578e-06, "loss": 0.0029, "step": 1206 }, { "epoch": 2.227068189685012, "grad_norm": 0.04520846903324127, "learning_rate": 3.7987369046491684e-06, "loss": 0.0013, "step": 1207 }, { "epoch": 2.22891427252798, "grad_norm": 0.0656999722123146, "learning_rate": 3.781905106691447e-06, "loss": 0.001, "step": 1208 }, { "epoch": 2.2307603553709474, "grad_norm": 0.048656027764081955, "learning_rate": 3.7651019814126656e-06, "loss": 0.0015, "step": 1209 }, { "epoch": 2.232606438213915, "grad_norm": 0.05681651830673218, "learning_rate": 3.748327606294848e-06, "loss": 0.0018, "step": 1210 }, { "epoch": 2.2344525210568826, "grad_norm": 0.09261146187782288, "learning_rate": 3.731582058687462e-06, "loss": 0.0039, "step": 1211 }, { "epoch": 2.23629860389985, "grad_norm": 0.13084052503108978, "learning_rate": 3.714865415807024e-06, "loss": 0.0133, "step": 1212 }, { "epoch": 2.2381446867428174, "grad_norm": 0.06434764713048935, "learning_rate": 3.698177754736787e-06, "loss": 0.0018, "step": 1213 }, { "epoch": 2.239990769585785, "grad_norm": 0.07721427083015442, "learning_rate": 3.6815191524263628e-06, "loss": 0.0024, "step": 1214 }, { "epoch": 2.2418368524287526, "grad_norm": 0.08833146095275879, "learning_rate": 3.6648896856913483e-06, "loss": 0.0015, "step": 1215 }, { "epoch": 2.2436829352717202, "grad_norm": 0.11038446426391602, "learning_rate": 3.6482894312130146e-06, "loss": 0.0104, "step": 1216 }, { "epoch": 2.245529018114688, "grad_norm": 0.14379984140396118, "learning_rate": 3.631718465537918e-06, "loss": 0.0016, "step": 1217 }, { "epoch": 2.2473751009576555, "grad_norm": 0.048045579344034195, "learning_rate": 3.6151768650775577e-06, "loss": 0.001, "step": 1218 }, { "epoch": 2.249221183800623, "grad_norm": 0.04289795458316803, "learning_rate": 3.598664706108037e-06, "loss": 0.0007, "step": 1219 }, { "epoch": 2.2510672666435907, "grad_norm": 0.060146868228912354, "learning_rate": 3.582182064769687e-06, "loss": 0.0017, "step": 1220 }, { "epoch": 2.2529133494865583, "grad_norm": 0.08081227540969849, "learning_rate": 3.565729017066729e-06, "loss": 0.0047, "step": 1221 }, { "epoch": 2.254759432329526, "grad_norm": 0.0641130805015564, "learning_rate": 3.5493056388669356e-06, "loss": 0.0012, "step": 1222 }, { "epoch": 2.2566055151724935, "grad_norm": 0.034692347049713135, "learning_rate": 3.5329120059012536e-06, "loss": 0.0009, "step": 1223 }, { "epoch": 2.258451598015461, "grad_norm": 0.08040868490934372, "learning_rate": 3.516548193763474e-06, "loss": 0.0043, "step": 1224 }, { "epoch": 2.2602976808584287, "grad_norm": 0.5710757374763489, "learning_rate": 3.5002142779098857e-06, "loss": 0.0057, "step": 1225 }, { "epoch": 2.262143763701396, "grad_norm": 0.03892490640282631, "learning_rate": 3.483910333658913e-06, "loss": 0.0011, "step": 1226 }, { "epoch": 2.2639898465443635, "grad_norm": 0.07377569377422333, "learning_rate": 3.4676364361907777e-06, "loss": 0.002, "step": 1227 }, { "epoch": 2.265835929387331, "grad_norm": 0.06678915023803711, "learning_rate": 3.4513926605471504e-06, "loss": 0.0067, "step": 1228 }, { "epoch": 2.2676820122302987, "grad_norm": 0.055075231939554214, "learning_rate": 3.4351790816308074e-06, "loss": 0.0013, "step": 1229 }, { "epoch": 2.2695280950732664, "grad_norm": 0.057380709797143936, "learning_rate": 3.4189957742052894e-06, "loss": 0.0007, "step": 1230 }, { "epoch": 2.271374177916234, "grad_norm": 0.08206542581319809, "learning_rate": 3.402842812894529e-06, "loss": 0.0042, "step": 1231 }, { "epoch": 2.2732202607592016, "grad_norm": 0.04653835669159889, "learning_rate": 3.3867202721825474e-06, "loss": 0.0017, "step": 1232 }, { "epoch": 2.275066343602169, "grad_norm": 0.07135917246341705, "learning_rate": 3.370628226413093e-06, "loss": 0.0026, "step": 1233 }, { "epoch": 2.276912426445137, "grad_norm": 0.08944186568260193, "learning_rate": 3.3545667497892755e-06, "loss": 0.0083, "step": 1234 }, { "epoch": 2.2787585092881044, "grad_norm": 0.07027611136436462, "learning_rate": 3.338535916373267e-06, "loss": 0.0065, "step": 1235 }, { "epoch": 2.280604592131072, "grad_norm": 0.04656380042433739, "learning_rate": 3.3225358000859287e-06, "loss": 0.0023, "step": 1236 }, { "epoch": 2.2824506749740396, "grad_norm": 0.07539285719394684, "learning_rate": 3.3065664747064775e-06, "loss": 0.0042, "step": 1237 }, { "epoch": 2.284296757817007, "grad_norm": 0.05385654792189598, "learning_rate": 3.290628013872159e-06, "loss": 0.0013, "step": 1238 }, { "epoch": 2.2861428406599744, "grad_norm": 0.05064902454614639, "learning_rate": 3.2747204910778886e-06, "loss": 0.0018, "step": 1239 }, { "epoch": 2.287988923502942, "grad_norm": 0.03831757605075836, "learning_rate": 3.2588439796759175e-06, "loss": 0.0007, "step": 1240 }, { "epoch": 2.2898350063459096, "grad_norm": 0.055462419986724854, "learning_rate": 3.2429985528755127e-06, "loss": 0.001, "step": 1241 }, { "epoch": 2.2916810891888773, "grad_norm": 0.09444908797740936, "learning_rate": 3.2271842837425917e-06, "loss": 0.003, "step": 1242 }, { "epoch": 2.293527172031845, "grad_norm": 0.02971065044403076, "learning_rate": 3.211401245199398e-06, "loss": 0.0005, "step": 1243 }, { "epoch": 2.2953732548748125, "grad_norm": 0.039957791566848755, "learning_rate": 3.1956495100241813e-06, "loss": 0.0012, "step": 1244 }, { "epoch": 2.29721933771778, "grad_norm": 0.04119617119431496, "learning_rate": 3.179929150850829e-06, "loss": 0.0009, "step": 1245 }, { "epoch": 2.2990654205607477, "grad_norm": 0.037797797471284866, "learning_rate": 3.1642402401685557e-06, "loss": 0.001, "step": 1246 }, { "epoch": 2.3009115034037153, "grad_norm": 0.18547865748405457, "learning_rate": 3.1485828503215588e-06, "loss": 0.0183, "step": 1247 }, { "epoch": 2.302757586246683, "grad_norm": 0.06199440360069275, "learning_rate": 3.132957053508696e-06, "loss": 0.0027, "step": 1248 }, { "epoch": 2.3046036690896505, "grad_norm": 0.0491969920694828, "learning_rate": 3.1173629217831345e-06, "loss": 0.0013, "step": 1249 }, { "epoch": 2.306449751932618, "grad_norm": 0.02562612295150757, "learning_rate": 3.101800527052031e-06, "loss": 0.0005, "step": 1250 }, { "epoch": 2.3082958347755858, "grad_norm": 0.05772295966744423, "learning_rate": 3.0862699410762043e-06, "loss": 0.0027, "step": 1251 }, { "epoch": 2.3101419176185534, "grad_norm": 0.1610301434993744, "learning_rate": 3.0707712354697884e-06, "loss": 0.0386, "step": 1252 }, { "epoch": 2.3119880004615205, "grad_norm": 0.6296937465667725, "learning_rate": 3.0553044816999133e-06, "loss": 0.0226, "step": 1253 }, { "epoch": 2.313834083304488, "grad_norm": 0.09471811354160309, "learning_rate": 3.039869751086383e-06, "loss": 0.0056, "step": 1254 }, { "epoch": 2.3156801661474558, "grad_norm": 0.0495307631790638, "learning_rate": 3.024467114801325e-06, "loss": 0.0006, "step": 1255 }, { "epoch": 2.3175262489904234, "grad_norm": 0.059218164533376694, "learning_rate": 3.0090966438688774e-06, "loss": 0.0021, "step": 1256 }, { "epoch": 2.319372331833391, "grad_norm": 0.1365593820810318, "learning_rate": 2.9937584091648676e-06, "loss": 0.0049, "step": 1257 }, { "epoch": 2.3212184146763586, "grad_norm": 0.15948103368282318, "learning_rate": 2.9784524814164673e-06, "loss": 0.0103, "step": 1258 }, { "epoch": 2.323064497519326, "grad_norm": 0.05989250913262367, "learning_rate": 2.9631789312018723e-06, "loss": 0.0045, "step": 1259 }, { "epoch": 2.324910580362294, "grad_norm": 0.03901425749063492, "learning_rate": 2.9479378289499925e-06, "loss": 0.0015, "step": 1260 }, { "epoch": 2.3267566632052614, "grad_norm": 0.03326350450515747, "learning_rate": 2.9327292449401067e-06, "loss": 0.0006, "step": 1261 }, { "epoch": 2.328602746048229, "grad_norm": 0.07044060528278351, "learning_rate": 2.9175532493015445e-06, "loss": 0.0019, "step": 1262 }, { "epoch": 2.3304488288911966, "grad_norm": 0.049290288239717484, "learning_rate": 2.9024099120133674e-06, "loss": 0.0022, "step": 1263 }, { "epoch": 2.3322949117341643, "grad_norm": 0.0378446988761425, "learning_rate": 2.8872993029040506e-06, "loss": 0.0015, "step": 1264 }, { "epoch": 2.3341409945771314, "grad_norm": 0.05854736641049385, "learning_rate": 2.8722214916511446e-06, "loss": 0.0012, "step": 1265 }, { "epoch": 2.335987077420099, "grad_norm": 0.04649446904659271, "learning_rate": 2.8571765477809645e-06, "loss": 0.0013, "step": 1266 }, { "epoch": 2.3378331602630666, "grad_norm": 0.08969879150390625, "learning_rate": 2.842164540668276e-06, "loss": 0.0017, "step": 1267 }, { "epoch": 2.3396792431060343, "grad_norm": 0.053399041295051575, "learning_rate": 2.8271855395359613e-06, "loss": 0.0023, "step": 1268 }, { "epoch": 2.341525325949002, "grad_norm": 0.03856360912322998, "learning_rate": 2.8122396134547038e-06, "loss": 0.0011, "step": 1269 }, { "epoch": 2.3433714087919695, "grad_norm": 0.0647839829325676, "learning_rate": 2.7973268313426836e-06, "loss": 0.0032, "step": 1270 }, { "epoch": 2.345217491634937, "grad_norm": 0.06961486488580704, "learning_rate": 2.7824472619652386e-06, "loss": 0.0063, "step": 1271 }, { "epoch": 2.3470635744779047, "grad_norm": 0.028931181877851486, "learning_rate": 2.7676009739345556e-06, "loss": 0.0007, "step": 1272 }, { "epoch": 2.3489096573208723, "grad_norm": 0.04555844888091087, "learning_rate": 2.7527880357093673e-06, "loss": 0.0011, "step": 1273 }, { "epoch": 2.35075574016384, "grad_norm": 0.06969105452299118, "learning_rate": 2.7380085155946124e-06, "loss": 0.0011, "step": 1274 }, { "epoch": 2.3526018230068075, "grad_norm": 0.05253869295120239, "learning_rate": 2.723262481741138e-06, "loss": 0.0065, "step": 1275 }, { "epoch": 2.354447905849775, "grad_norm": 0.036371052265167236, "learning_rate": 2.7085500021453838e-06, "loss": 0.0006, "step": 1276 }, { "epoch": 2.3562939886927428, "grad_norm": 0.03346690908074379, "learning_rate": 2.6938711446490607e-06, "loss": 0.0008, "step": 1277 }, { "epoch": 2.3581400715357104, "grad_norm": 0.06123751774430275, "learning_rate": 2.6792259769388394e-06, "loss": 0.0014, "step": 1278 }, { "epoch": 2.359986154378678, "grad_norm": 0.024573156610131264, "learning_rate": 2.6646145665460533e-06, "loss": 0.0004, "step": 1279 }, { "epoch": 2.361832237221645, "grad_norm": 0.033585041761398315, "learning_rate": 2.6500369808463633e-06, "loss": 0.0012, "step": 1280 }, { "epoch": 2.3636783200646128, "grad_norm": 0.06921981275081635, "learning_rate": 2.635493287059464e-06, "loss": 0.0026, "step": 1281 }, { "epoch": 2.3655244029075804, "grad_norm": 0.0719681903719902, "learning_rate": 2.620983552248764e-06, "loss": 0.002, "step": 1282 }, { "epoch": 2.367370485750548, "grad_norm": 0.09403691440820694, "learning_rate": 2.6065078433210913e-06, "loss": 0.0059, "step": 1283 }, { "epoch": 2.3692165685935156, "grad_norm": 0.08925064653158188, "learning_rate": 2.5920662270263653e-06, "loss": 0.0103, "step": 1284 }, { "epoch": 2.371062651436483, "grad_norm": 0.08255797624588013, "learning_rate": 2.5776587699573007e-06, "loss": 0.0018, "step": 1285 }, { "epoch": 2.372908734279451, "grad_norm": 0.06867097318172455, "learning_rate": 2.563285538549104e-06, "loss": 0.0046, "step": 1286 }, { "epoch": 2.3747548171224184, "grad_norm": 0.07086781412363052, "learning_rate": 2.5489465990791552e-06, "loss": 0.0019, "step": 1287 }, { "epoch": 2.376600899965386, "grad_norm": 0.20935611426830292, "learning_rate": 2.5346420176667052e-06, "loss": 0.0075, "step": 1288 }, { "epoch": 2.376600899965386, "eval_loss": 0.008777663111686707, "eval_runtime": 91.3412, "eval_samples_per_second": 9.995, "eval_steps_per_second": 5.003, "step": 1288 }, { "epoch": 2.3784469828083536, "grad_norm": 0.09720870107412338, "learning_rate": 2.520371860272588e-06, "loss": 0.0054, "step": 1289 }, { "epoch": 2.3802930656513213, "grad_norm": 0.10902188718318939, "learning_rate": 2.506136192698889e-06, "loss": 0.001, "step": 1290 }, { "epoch": 2.382139148494289, "grad_norm": 0.08954726904630661, "learning_rate": 2.491935080588658e-06, "loss": 0.0036, "step": 1291 }, { "epoch": 2.383985231337256, "grad_norm": 0.07845090329647064, "learning_rate": 2.477768589425613e-06, "loss": 0.0007, "step": 1292 }, { "epoch": 2.3858313141802237, "grad_norm": 0.08264074474573135, "learning_rate": 2.463636784533813e-06, "loss": 0.0014, "step": 1293 }, { "epoch": 2.3876773970231913, "grad_norm": 0.12997566163539886, "learning_rate": 2.449539731077385e-06, "loss": 0.0053, "step": 1294 }, { "epoch": 2.389523479866159, "grad_norm": 0.05489185079932213, "learning_rate": 2.435477494060211e-06, "loss": 0.0013, "step": 1295 }, { "epoch": 2.3913695627091265, "grad_norm": 0.048362959176301956, "learning_rate": 2.421450138325625e-06, "loss": 0.0012, "step": 1296 }, { "epoch": 2.393215645552094, "grad_norm": 0.05003335326910019, "learning_rate": 2.407457728556115e-06, "loss": 0.0013, "step": 1297 }, { "epoch": 2.3950617283950617, "grad_norm": 0.07448890805244446, "learning_rate": 2.3935003292730295e-06, "loss": 0.001, "step": 1298 }, { "epoch": 2.3969078112380293, "grad_norm": 0.0375174954533577, "learning_rate": 2.3795780048362836e-06, "loss": 0.0008, "step": 1299 }, { "epoch": 2.398753894080997, "grad_norm": 0.0948011577129364, "learning_rate": 2.3656908194440485e-06, "loss": 0.0016, "step": 1300 }, { "epoch": 2.4005999769239645, "grad_norm": 0.05007372424006462, "learning_rate": 2.351838837132464e-06, "loss": 0.0018, "step": 1301 }, { "epoch": 2.402446059766932, "grad_norm": 0.06444990634918213, "learning_rate": 2.3380221217753507e-06, "loss": 0.0014, "step": 1302 }, { "epoch": 2.4042921426098998, "grad_norm": 0.05734981223940849, "learning_rate": 2.324240737083897e-06, "loss": 0.0013, "step": 1303 }, { "epoch": 2.4061382254528674, "grad_norm": 0.1687023639678955, "learning_rate": 2.3104947466063785e-06, "loss": 0.0157, "step": 1304 }, { "epoch": 2.407984308295835, "grad_norm": 0.1404077708721161, "learning_rate": 2.2967842137278706e-06, "loss": 0.0125, "step": 1305 }, { "epoch": 2.4098303911388026, "grad_norm": 0.09310151636600494, "learning_rate": 2.283109201669936e-06, "loss": 0.0074, "step": 1306 }, { "epoch": 2.4116764739817698, "grad_norm": 0.2597910463809967, "learning_rate": 2.269469773490349e-06, "loss": 0.0348, "step": 1307 }, { "epoch": 2.4135225568247374, "grad_norm": 0.09952439367771149, "learning_rate": 2.2558659920828095e-06, "loss": 0.0071, "step": 1308 }, { "epoch": 2.415368639667705, "grad_norm": 0.05615253001451492, "learning_rate": 2.2422979201766248e-06, "loss": 0.0006, "step": 1309 }, { "epoch": 2.4172147225106726, "grad_norm": 0.046665746718645096, "learning_rate": 2.228765620336456e-06, "loss": 0.0015, "step": 1310 }, { "epoch": 2.41906080535364, "grad_norm": 0.049411892890930176, "learning_rate": 2.2152691549620155e-06, "loss": 0.0008, "step": 1311 }, { "epoch": 2.420906888196608, "grad_norm": 0.03854452446103096, "learning_rate": 2.201808586287757e-06, "loss": 0.0007, "step": 1312 }, { "epoch": 2.4227529710395754, "grad_norm": 0.07337196916341782, "learning_rate": 2.1883839763826285e-06, "loss": 0.0014, "step": 1313 }, { "epoch": 2.424599053882543, "grad_norm": 0.04901151359081268, "learning_rate": 2.1749953871497633e-06, "loss": 0.0016, "step": 1314 }, { "epoch": 2.4264451367255107, "grad_norm": 0.055691029876470566, "learning_rate": 2.1616428803261825e-06, "loss": 0.0015, "step": 1315 }, { "epoch": 2.4282912195684783, "grad_norm": 0.045411136001348495, "learning_rate": 2.148326517482543e-06, "loss": 0.0014, "step": 1316 }, { "epoch": 2.430137302411446, "grad_norm": 0.03926033154129982, "learning_rate": 2.1350463600228232e-06, "loss": 0.0009, "step": 1317 }, { "epoch": 2.431983385254413, "grad_norm": 0.031893983483314514, "learning_rate": 2.1218024691840646e-06, "loss": 0.0007, "step": 1318 }, { "epoch": 2.4338294680973807, "grad_norm": 0.05034171789884567, "learning_rate": 2.1085949060360654e-06, "loss": 0.0018, "step": 1319 }, { "epoch": 2.4356755509403483, "grad_norm": 0.1299831122159958, "learning_rate": 2.095423731481113e-06, "loss": 0.0107, "step": 1320 }, { "epoch": 2.437521633783316, "grad_norm": 0.08864670991897583, "learning_rate": 2.0822890062537106e-06, "loss": 0.0035, "step": 1321 }, { "epoch": 2.4393677166262835, "grad_norm": 0.06742891669273376, "learning_rate": 2.0691907909202745e-06, "loss": 0.0023, "step": 1322 }, { "epoch": 2.441213799469251, "grad_norm": 2.7612528800964355, "learning_rate": 2.0561291458788736e-06, "loss": 0.0422, "step": 1323 }, { "epoch": 2.4430598823122187, "grad_norm": 0.09638822823762894, "learning_rate": 2.0431041313589482e-06, "loss": 0.004, "step": 1324 }, { "epoch": 2.4449059651551863, "grad_norm": 0.045905083417892456, "learning_rate": 2.0301158074210246e-06, "loss": 0.0015, "step": 1325 }, { "epoch": 2.446752047998154, "grad_norm": 0.037349555641412735, "learning_rate": 2.01716423395644e-06, "loss": 0.0014, "step": 1326 }, { "epoch": 2.4485981308411215, "grad_norm": 0.12282802909612656, "learning_rate": 2.0042494706870817e-06, "loss": 0.006, "step": 1327 }, { "epoch": 2.450444213684089, "grad_norm": 0.16453926265239716, "learning_rate": 1.9913715771650798e-06, "loss": 0.0006, "step": 1328 }, { "epoch": 2.4522902965270568, "grad_norm": 0.037392500787973404, "learning_rate": 1.9785306127725668e-06, "loss": 0.0008, "step": 1329 }, { "epoch": 2.4541363793700244, "grad_norm": 0.05177134647965431, "learning_rate": 1.96572663672139e-06, "loss": 0.0016, "step": 1330 }, { "epoch": 2.455982462212992, "grad_norm": 0.16529984772205353, "learning_rate": 1.9529597080528207e-06, "loss": 0.0098, "step": 1331 }, { "epoch": 2.4578285450559596, "grad_norm": 0.05820747837424278, "learning_rate": 1.940229885637317e-06, "loss": 0.0016, "step": 1332 }, { "epoch": 2.459674627898927, "grad_norm": 0.06154194101691246, "learning_rate": 1.9275372281742242e-06, "loss": 0.0009, "step": 1333 }, { "epoch": 2.4615207107418944, "grad_norm": 0.1000288873910904, "learning_rate": 1.914881794191512e-06, "loss": 0.0041, "step": 1334 }, { "epoch": 2.463366793584862, "grad_norm": 0.03698883578181267, "learning_rate": 1.9022636420455153e-06, "loss": 0.0011, "step": 1335 }, { "epoch": 2.4652128764278296, "grad_norm": 0.12048456072807312, "learning_rate": 1.8896828299206494e-06, "loss": 0.021, "step": 1336 }, { "epoch": 2.467058959270797, "grad_norm": 0.06396915018558502, "learning_rate": 1.8771394158291467e-06, "loss": 0.001, "step": 1337 }, { "epoch": 2.468905042113765, "grad_norm": 0.07608433812856674, "learning_rate": 1.8646334576107993e-06, "loss": 0.0106, "step": 1338 }, { "epoch": 2.4707511249567324, "grad_norm": 0.07956137508153915, "learning_rate": 1.852165012932674e-06, "loss": 0.0033, "step": 1339 }, { "epoch": 2.4725972077997, "grad_norm": 0.054058585315942764, "learning_rate": 1.8397341392888679e-06, "loss": 0.001, "step": 1340 }, { "epoch": 2.4744432906426677, "grad_norm": 0.05576934665441513, "learning_rate": 1.8273408940002202e-06, "loss": 0.0017, "step": 1341 }, { "epoch": 2.4762893734856353, "grad_norm": 0.06315232813358307, "learning_rate": 1.8149853342140644e-06, "loss": 0.0018, "step": 1342 }, { "epoch": 2.478135456328603, "grad_norm": 0.04733707383275032, "learning_rate": 1.8026675169039654e-06, "loss": 0.0007, "step": 1343 }, { "epoch": 2.4799815391715705, "grad_norm": 0.03141649812459946, "learning_rate": 1.790387498869437e-06, "loss": 0.0014, "step": 1344 }, { "epoch": 2.4818276220145377, "grad_norm": 0.09324819594621658, "learning_rate": 1.7781453367357081e-06, "loss": 0.0027, "step": 1345 }, { "epoch": 2.4836737048575053, "grad_norm": 0.09851463884115219, "learning_rate": 1.7659410869534466e-06, "loss": 0.0028, "step": 1346 }, { "epoch": 2.485519787700473, "grad_norm": 0.05877411738038063, "learning_rate": 1.7537748057984861e-06, "loss": 0.0017, "step": 1347 }, { "epoch": 2.4873658705434405, "grad_norm": 0.0785236656665802, "learning_rate": 1.7416465493715984e-06, "loss": 0.0024, "step": 1348 }, { "epoch": 2.489211953386408, "grad_norm": 0.18520614504814148, "learning_rate": 1.7295563735982068e-06, "loss": 0.0094, "step": 1349 }, { "epoch": 2.4910580362293757, "grad_norm": 0.12147017568349838, "learning_rate": 1.7175043342281372e-06, "loss": 0.0065, "step": 1350 }, { "epoch": 2.4929041190723433, "grad_norm": 0.058972734957933426, "learning_rate": 1.7054904868353717e-06, "loss": 0.0014, "step": 1351 }, { "epoch": 2.494750201915311, "grad_norm": 0.027336135506629944, "learning_rate": 1.693514886817772e-06, "loss": 0.0006, "step": 1352 }, { "epoch": 2.4965962847582785, "grad_norm": 0.04267744719982147, "learning_rate": 1.681577589396839e-06, "loss": 0.001, "step": 1353 }, { "epoch": 2.498442367601246, "grad_norm": 0.06234278529882431, "learning_rate": 1.6696786496174578e-06, "loss": 0.0033, "step": 1354 }, { "epoch": 2.5002884504442138, "grad_norm": 0.050173934549093246, "learning_rate": 1.657818122347634e-06, "loss": 0.0005, "step": 1355 }, { "epoch": 2.5021345332871814, "grad_norm": 0.06779942661523819, "learning_rate": 1.6459960622782466e-06, "loss": 0.0012, "step": 1356 }, { "epoch": 2.503980616130149, "grad_norm": 0.045062899589538574, "learning_rate": 1.6342125239228034e-06, "loss": 0.0012, "step": 1357 }, { "epoch": 2.5058266989731166, "grad_norm": 0.07291844487190247, "learning_rate": 1.6224675616171737e-06, "loss": 0.001, "step": 1358 }, { "epoch": 2.507672781816084, "grad_norm": 0.07481212168931961, "learning_rate": 1.6107612295193487e-06, "loss": 0.0035, "step": 1359 }, { "epoch": 2.509518864659052, "grad_norm": 0.08346392214298248, "learning_rate": 1.5990935816091945e-06, "loss": 0.0019, "step": 1360 }, { "epoch": 2.511364947502019, "grad_norm": 0.1303265392780304, "learning_rate": 1.587464671688187e-06, "loss": 0.0096, "step": 1361 }, { "epoch": 2.5132110303449866, "grad_norm": 0.23590275645256042, "learning_rate": 1.5758745533791897e-06, "loss": 0.0032, "step": 1362 }, { "epoch": 2.515057113187954, "grad_norm": 0.0556265152990818, "learning_rate": 1.5643232801261731e-06, "loss": 0.0029, "step": 1363 }, { "epoch": 2.516903196030922, "grad_norm": 0.028027968481183052, "learning_rate": 1.5528109051940021e-06, "loss": 0.0008, "step": 1364 }, { "epoch": 2.5187492788738894, "grad_norm": 0.19072595238685608, "learning_rate": 1.5413374816681725e-06, "loss": 0.0196, "step": 1365 }, { "epoch": 2.520595361716857, "grad_norm": 0.041838761419057846, "learning_rate": 1.5299030624545563e-06, "loss": 0.0009, "step": 1366 }, { "epoch": 2.5224414445598247, "grad_norm": 0.10718919336795807, "learning_rate": 1.518507700279186e-06, "loss": 0.0154, "step": 1367 }, { "epoch": 2.5242875274027923, "grad_norm": 0.04322516545653343, "learning_rate": 1.5071514476879878e-06, "loss": 0.001, "step": 1368 }, { "epoch": 2.52613361024576, "grad_norm": 0.06105953827500343, "learning_rate": 1.495834357046544e-06, "loss": 0.0016, "step": 1369 }, { "epoch": 2.5279796930887275, "grad_norm": 0.06509777903556824, "learning_rate": 1.4845564805398637e-06, "loss": 0.0014, "step": 1370 }, { "epoch": 2.5298257759316947, "grad_norm": 0.08758215606212616, "learning_rate": 1.4733178701721262e-06, "loss": 0.0036, "step": 1371 }, { "epoch": 2.5316718587746623, "grad_norm": 0.06143510341644287, "learning_rate": 1.462118577766447e-06, "loss": 0.0016, "step": 1372 }, { "epoch": 2.53351794161763, "grad_norm": 0.031274110078811646, "learning_rate": 1.450958654964647e-06, "loss": 0.001, "step": 1373 }, { "epoch": 2.5353640244605975, "grad_norm": 0.04649297147989273, "learning_rate": 1.4398381532270001e-06, "loss": 0.0012, "step": 1374 }, { "epoch": 2.537210107303565, "grad_norm": 0.04706169664859772, "learning_rate": 1.4287571238320053e-06, "loss": 0.0022, "step": 1375 }, { "epoch": 2.5390561901465327, "grad_norm": 0.0643034353852272, "learning_rate": 1.4177156178761508e-06, "loss": 0.0012, "step": 1376 }, { "epoch": 2.5409022729895003, "grad_norm": 0.053322724997997284, "learning_rate": 1.4067136862736718e-06, "loss": 0.0015, "step": 1377 }, { "epoch": 2.542748355832468, "grad_norm": 0.03393815830349922, "learning_rate": 1.3957513797563227e-06, "loss": 0.0009, "step": 1378 }, { "epoch": 2.5445944386754356, "grad_norm": 0.10828398168087006, "learning_rate": 1.3848287488731338e-06, "loss": 0.0083, "step": 1379 }, { "epoch": 2.546440521518403, "grad_norm": 1.0546766519546509, "learning_rate": 1.373945843990192e-06, "loss": 0.0065, "step": 1380 }, { "epoch": 2.546440521518403, "eval_loss": 0.008712568320333958, "eval_runtime": 91.2965, "eval_samples_per_second": 10.0, "eval_steps_per_second": 5.006, "step": 1380 } ], "logging_steps": 1, "max_steps": 1626, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 92, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.706259767524327e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }