{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008947845247016454, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.1184806558770566e-05, "grad_norm": 9.17889404296875, "learning_rate": 0.00019999999993826567, "loss": 4.6752, "step": 1 }, { "epoch": 2.236961311754113e-05, "grad_norm": 15.023734092712402, "learning_rate": 0.0001999999997530627, "loss": 4.8816, "step": 2 }, { "epoch": 3.35544196763117e-05, "grad_norm": 4.711775302886963, "learning_rate": 0.00019999999944439107, "loss": 4.3122, "step": 3 }, { "epoch": 4.473922623508226e-05, "grad_norm": 5.1041460037231445, "learning_rate": 0.0001999999990122508, "loss": 4.0207, "step": 4 }, { "epoch": 5.5924032793852833e-05, "grad_norm": 11.579492568969727, "learning_rate": 0.0001999999984566419, "loss": 3.656, "step": 5 }, { "epoch": 6.71088393526234e-05, "grad_norm": 4.573488235473633, "learning_rate": 0.00019999999777756431, "loss": 3.3136, "step": 6 }, { "epoch": 7.829364591139397e-05, "grad_norm": 3.6844234466552734, "learning_rate": 0.0001999999969750181, "loss": 3.0363, "step": 7 }, { "epoch": 8.947845247016453e-05, "grad_norm": 2.9362566471099854, "learning_rate": 0.00019999999604900323, "loss": 2.7911, "step": 8 }, { "epoch": 0.0001006632590289351, "grad_norm": 2.6654202938079834, "learning_rate": 0.0001999999949995197, "loss": 2.6176, "step": 9 }, { "epoch": 0.00011184806558770567, "grad_norm": 1.8987364768981934, "learning_rate": 0.00019999999382656758, "loss": 2.5172, "step": 10 }, { "epoch": 0.00012303287214647624, "grad_norm": 2.596072196960449, "learning_rate": 0.0001999999925301468, "loss": 2.3978, "step": 11 }, { "epoch": 0.0001342176787052468, "grad_norm": 1.6658835411071777, "learning_rate": 0.00019999999111025733, "loss": 2.2888, "step": 12 }, { "epoch": 0.00014540248526401735, "grad_norm": 1.5891242027282715, "learning_rate": 0.00019999998956689926, "loss": 2.1966, "step": 13 }, { "epoch": 0.00015658729182278794, "grad_norm": 1.7551047801971436, "learning_rate": 0.00019999998790007256, "loss": 2.1286, "step": 14 }, { "epoch": 0.0001677720983815585, "grad_norm": 1.515837550163269, "learning_rate": 0.0001999999861097772, "loss": 2.0329, "step": 15 }, { "epoch": 0.00017895690494032905, "grad_norm": 2.0875966548919678, "learning_rate": 0.0001999999841960132, "loss": 1.9836, "step": 16 }, { "epoch": 0.00019014171149909964, "grad_norm": 1.3364766836166382, "learning_rate": 0.0001999999821587806, "loss": 1.9412, "step": 17 }, { "epoch": 0.0002013265180578702, "grad_norm": 0.9586036205291748, "learning_rate": 0.00019999997999807934, "loss": 1.9021, "step": 18 }, { "epoch": 0.00021251132461664075, "grad_norm": 0.610419750213623, "learning_rate": 0.00019999997771390947, "loss": 1.9278, "step": 19 }, { "epoch": 0.00022369613117541133, "grad_norm": 0.6592239141464233, "learning_rate": 0.000199999975306271, "loss": 1.8956, "step": 20 }, { "epoch": 0.0002348809377341819, "grad_norm": 0.7091565132141113, "learning_rate": 0.00019999997277516388, "loss": 1.8629, "step": 21 }, { "epoch": 0.0002460657442929525, "grad_norm": 0.6687048077583313, "learning_rate": 0.00019999997012058819, "loss": 1.818, "step": 22 }, { "epoch": 0.000257250550851723, "grad_norm": 0.29321762919425964, "learning_rate": 0.00019999996734254382, "loss": 1.8024, "step": 23 }, { "epoch": 0.0002684353574104936, "grad_norm": 0.6186531186103821, "learning_rate": 0.00019999996444103086, "loss": 1.7958, "step": 24 }, { "epoch": 0.0002796201639692642, "grad_norm": 0.4960622489452362, "learning_rate": 0.0001999999614160493, "loss": 1.7714, "step": 25 }, { "epoch": 0.0002908049705280347, "grad_norm": 0.25318390130996704, "learning_rate": 0.00019999995826759916, "loss": 1.7419, "step": 26 }, { "epoch": 0.0003019897770868053, "grad_norm": 0.5521177649497986, "learning_rate": 0.0001999999549956804, "loss": 1.7336, "step": 27 }, { "epoch": 0.0003131745836455759, "grad_norm": 0.3085158169269562, "learning_rate": 0.00019999995160029305, "loss": 1.7304, "step": 28 }, { "epoch": 0.0003243593902043464, "grad_norm": 0.2978903353214264, "learning_rate": 0.0001999999480814371, "loss": 1.7283, "step": 29 }, { "epoch": 0.000335544196763117, "grad_norm": 0.40339481830596924, "learning_rate": 0.00019999994443911258, "loss": 1.7577, "step": 30 }, { "epoch": 0.00034672900332188757, "grad_norm": 0.13451404869556427, "learning_rate": 0.00019999994067331945, "loss": 1.7435, "step": 31 }, { "epoch": 0.0003579138098806581, "grad_norm": 0.3141914904117584, "learning_rate": 0.0001999999367840578, "loss": 1.7479, "step": 32 }, { "epoch": 0.0003690986164394287, "grad_norm": 0.12182258069515228, "learning_rate": 0.00019999993277132754, "loss": 1.7391, "step": 33 }, { "epoch": 0.00038028342299819927, "grad_norm": 0.3160305917263031, "learning_rate": 0.00019999992863512872, "loss": 1.7231, "step": 34 }, { "epoch": 0.0003914682295569698, "grad_norm": 0.18215563893318176, "learning_rate": 0.00019999992437546134, "loss": 1.7067, "step": 35 }, { "epoch": 0.0004026530361157404, "grad_norm": 0.24103401601314545, "learning_rate": 0.0001999999199923254, "loss": 1.6877, "step": 36 }, { "epoch": 0.00041383784267451097, "grad_norm": 0.17353500425815582, "learning_rate": 0.0001999999154857209, "loss": 1.6746, "step": 37 }, { "epoch": 0.0004250226492332815, "grad_norm": 0.19149154424667358, "learning_rate": 0.00019999991085564784, "loss": 1.6734, "step": 38 }, { "epoch": 0.0004362074557920521, "grad_norm": 0.15810631215572357, "learning_rate": 0.0001999999061021063, "loss": 1.6773, "step": 39 }, { "epoch": 0.00044739226235082267, "grad_norm": 0.14770038425922394, "learning_rate": 0.00019999990122509614, "loss": 1.6967, "step": 40 }, { "epoch": 0.0004585770689095932, "grad_norm": 0.15101520717144012, "learning_rate": 0.0001999998962246175, "loss": 1.6816, "step": 41 }, { "epoch": 0.0004697618754683638, "grad_norm": 0.1085171177983284, "learning_rate": 0.00019999989110067035, "loss": 1.6875, "step": 42 }, { "epoch": 0.00048094668202713437, "grad_norm": 0.13066136837005615, "learning_rate": 0.00019999988585325468, "loss": 1.6768, "step": 43 }, { "epoch": 0.000492131488585905, "grad_norm": 0.09783171862363815, "learning_rate": 0.0001999998804823705, "loss": 1.6559, "step": 44 }, { "epoch": 0.0005033162951446755, "grad_norm": 0.15001484751701355, "learning_rate": 0.00019999987498801777, "loss": 1.6662, "step": 45 }, { "epoch": 0.000514501101703446, "grad_norm": 0.06484243273735046, "learning_rate": 0.0001999998693701966, "loss": 1.6528, "step": 46 }, { "epoch": 0.0005256859082622166, "grad_norm": 0.12908180058002472, "learning_rate": 0.00019999986362890693, "loss": 1.67, "step": 47 }, { "epoch": 0.0005368707148209872, "grad_norm": 0.05700545758008957, "learning_rate": 0.00019999985776414877, "loss": 1.6643, "step": 48 }, { "epoch": 0.0005480555213797577, "grad_norm": 0.10586538165807724, "learning_rate": 0.00019999985177592211, "loss": 1.6725, "step": 49 }, { "epoch": 0.0005592403279385283, "grad_norm": 0.05411362275481224, "learning_rate": 0.00019999984566422703, "loss": 1.6495, "step": 50 }, { "epoch": 0.0005704251344972989, "grad_norm": 0.08841974288225174, "learning_rate": 0.00019999983942906347, "loss": 1.6397, "step": 51 }, { "epoch": 0.0005816099410560694, "grad_norm": 0.049202822148799896, "learning_rate": 0.00019999983307043145, "loss": 1.6601, "step": 52 }, { "epoch": 0.00059279474761484, "grad_norm": 0.06537283957004547, "learning_rate": 0.00019999982658833098, "loss": 1.6405, "step": 53 }, { "epoch": 0.0006039795541736106, "grad_norm": 0.04358899965882301, "learning_rate": 0.0001999998199827621, "loss": 1.6586, "step": 54 }, { "epoch": 0.0006151643607323811, "grad_norm": 0.05924156308174133, "learning_rate": 0.0001999998132537248, "loss": 1.6609, "step": 55 }, { "epoch": 0.0006263491672911517, "grad_norm": 0.047364529222249985, "learning_rate": 0.00019999980640121904, "loss": 1.6561, "step": 56 }, { "epoch": 0.0006375339738499223, "grad_norm": 0.05860909819602966, "learning_rate": 0.00019999979942524488, "loss": 1.644, "step": 57 }, { "epoch": 0.0006487187804086928, "grad_norm": 0.058639075607061386, "learning_rate": 0.00019999979232580235, "loss": 1.6582, "step": 58 }, { "epoch": 0.0006599035869674634, "grad_norm": 0.049288246780633926, "learning_rate": 0.00019999978510289138, "loss": 1.6828, "step": 59 }, { "epoch": 0.000671088393526234, "grad_norm": 0.06293074041604996, "learning_rate": 0.00019999977775651207, "loss": 1.6852, "step": 60 }, { "epoch": 0.0006822732000850045, "grad_norm": 0.03848947212100029, "learning_rate": 0.00019999977028666436, "loss": 1.6792, "step": 61 }, { "epoch": 0.0006934580066437751, "grad_norm": 0.1217992901802063, "learning_rate": 0.00019999976269334828, "loss": 1.7142, "step": 62 }, { "epoch": 0.0007046428132025457, "grad_norm": 0.04006423428654671, "learning_rate": 0.00019999975497656384, "loss": 1.7108, "step": 63 }, { "epoch": 0.0007158276197613162, "grad_norm": 0.057927753776311874, "learning_rate": 0.0001999997471363111, "loss": 1.6902, "step": 64 }, { "epoch": 0.0007270124263200868, "grad_norm": 0.04537767171859741, "learning_rate": 0.00019999973917258997, "loss": 1.6627, "step": 65 }, { "epoch": 0.0007381972328788574, "grad_norm": 0.057307057082653046, "learning_rate": 0.00019999973108540052, "loss": 1.6561, "step": 66 }, { "epoch": 0.0007493820394376279, "grad_norm": 0.044002167880535126, "learning_rate": 0.00019999972287474272, "loss": 1.6595, "step": 67 }, { "epoch": 0.0007605668459963985, "grad_norm": 0.03991294279694557, "learning_rate": 0.00019999971454061666, "loss": 1.6563, "step": 68 }, { "epoch": 0.0007717516525551691, "grad_norm": 0.044279683381319046, "learning_rate": 0.0001999997060830223, "loss": 1.6352, "step": 69 }, { "epoch": 0.0007829364591139396, "grad_norm": 0.04319776967167854, "learning_rate": 0.00019999969750195967, "loss": 1.6319, "step": 70 }, { "epoch": 0.0007941212656727102, "grad_norm": 0.04436005651950836, "learning_rate": 0.00019999968879742873, "loss": 1.6556, "step": 71 }, { "epoch": 0.0008053060722314808, "grad_norm": 0.0416998416185379, "learning_rate": 0.00019999967996942952, "loss": 1.6646, "step": 72 }, { "epoch": 0.0008164908787902513, "grad_norm": 0.03601493313908577, "learning_rate": 0.00019999967101796208, "loss": 1.6605, "step": 73 }, { "epoch": 0.0008276756853490219, "grad_norm": 0.03957024961709976, "learning_rate": 0.00019999966194302637, "loss": 1.6714, "step": 74 }, { "epoch": 0.0008388604919077925, "grad_norm": 0.04401829466223717, "learning_rate": 0.00019999965274462245, "loss": 1.6497, "step": 75 }, { "epoch": 0.000850045298466563, "grad_norm": 0.04251580312848091, "learning_rate": 0.0001999996434227503, "loss": 1.6567, "step": 76 }, { "epoch": 0.0008612301050253336, "grad_norm": 0.038915786892175674, "learning_rate": 0.00019999963397740995, "loss": 1.6515, "step": 77 }, { "epoch": 0.0008724149115841042, "grad_norm": 0.04063812270760536, "learning_rate": 0.00019999962440860137, "loss": 1.6764, "step": 78 }, { "epoch": 0.0008835997181428747, "grad_norm": 0.04114954546093941, "learning_rate": 0.00019999961471632463, "loss": 1.6553, "step": 79 }, { "epoch": 0.0008947845247016453, "grad_norm": 0.039563097059726715, "learning_rate": 0.0001999996049005797, "loss": 1.6595, "step": 80 }, { "epoch": 0.0009059693312604159, "grad_norm": 0.03477632254362106, "learning_rate": 0.00019999959496136663, "loss": 1.6612, "step": 81 }, { "epoch": 0.0009171541378191864, "grad_norm": 0.04104992002248764, "learning_rate": 0.0001999995848986854, "loss": 1.6476, "step": 82 }, { "epoch": 0.000928338944377957, "grad_norm": 0.03909602388739586, "learning_rate": 0.00019999957471253602, "loss": 1.6445, "step": 83 }, { "epoch": 0.0009395237509367276, "grad_norm": 0.039677415043115616, "learning_rate": 0.00019999956440291855, "loss": 1.6368, "step": 84 }, { "epoch": 0.0009507085574954981, "grad_norm": 0.03493763506412506, "learning_rate": 0.00019999955396983292, "loss": 1.6343, "step": 85 }, { "epoch": 0.0009618933640542687, "grad_norm": 0.03530074283480644, "learning_rate": 0.0001999995434132792, "loss": 1.6843, "step": 86 }, { "epoch": 0.0009730781706130393, "grad_norm": 0.037156637758016586, "learning_rate": 0.00019999953273325743, "loss": 1.6721, "step": 87 }, { "epoch": 0.00098426297717181, "grad_norm": 0.04006032645702362, "learning_rate": 0.00019999952192976755, "loss": 1.6724, "step": 88 }, { "epoch": 0.0009954477837305804, "grad_norm": 0.03743763267993927, "learning_rate": 0.0001999995110028096, "loss": 1.6613, "step": 89 }, { "epoch": 0.001006632590289351, "grad_norm": 0.04100384563207626, "learning_rate": 0.00019999949995238369, "loss": 1.6444, "step": 90 }, { "epoch": 0.0010178173968481215, "grad_norm": 0.03799253702163696, "learning_rate": 0.00019999948877848965, "loss": 1.6641, "step": 91 }, { "epoch": 0.001029002203406892, "grad_norm": 0.040163811296224594, "learning_rate": 0.00019999947748112763, "loss": 1.6411, "step": 92 }, { "epoch": 0.0010401870099656626, "grad_norm": 0.03576591610908508, "learning_rate": 0.0001999994660602976, "loss": 1.6378, "step": 93 }, { "epoch": 0.0010513718165244333, "grad_norm": 0.03735070303082466, "learning_rate": 0.00019999945451599957, "loss": 1.644, "step": 94 }, { "epoch": 0.0010625566230832038, "grad_norm": 0.04157353192567825, "learning_rate": 0.00019999944284823358, "loss": 1.6532, "step": 95 }, { "epoch": 0.0010737414296419744, "grad_norm": 0.046478450298309326, "learning_rate": 0.0001999994310569996, "loss": 1.6744, "step": 96 }, { "epoch": 0.0010849262362007449, "grad_norm": 0.04813043400645256, "learning_rate": 0.0001999994191422977, "loss": 1.6716, "step": 97 }, { "epoch": 0.0010961110427595154, "grad_norm": 0.03780042380094528, "learning_rate": 0.00019999940710412788, "loss": 1.6502, "step": 98 }, { "epoch": 0.001107295849318286, "grad_norm": 0.03811471536755562, "learning_rate": 0.0001999993949424901, "loss": 1.6405, "step": 99 }, { "epoch": 0.0011184806558770567, "grad_norm": 0.03838631138205528, "learning_rate": 0.00019999938265738445, "loss": 1.6202, "step": 100 }, { "epoch": 0.0011296654624358272, "grad_norm": 0.04033266752958298, "learning_rate": 0.0001999993702488109, "loss": 1.6305, "step": 101 }, { "epoch": 0.0011408502689945978, "grad_norm": 0.038872625678777695, "learning_rate": 0.0001999993577167695, "loss": 1.6243, "step": 102 }, { "epoch": 0.0011520350755533683, "grad_norm": 0.047068770974874496, "learning_rate": 0.0001999993450612602, "loss": 1.6126, "step": 103 }, { "epoch": 0.0011632198821121388, "grad_norm": 0.038774486631155014, "learning_rate": 0.0001999993322822831, "loss": 1.6125, "step": 104 }, { "epoch": 0.0011744046886709093, "grad_norm": 0.046706534922122955, "learning_rate": 0.00019999931937983814, "loss": 1.6062, "step": 105 }, { "epoch": 0.00118558949522968, "grad_norm": 0.038454532623291016, "learning_rate": 0.00019999930635392538, "loss": 1.6145, "step": 106 }, { "epoch": 0.0011967743017884506, "grad_norm": 0.042938027530908585, "learning_rate": 0.0001999992932045448, "loss": 1.6064, "step": 107 }, { "epoch": 0.0012079591083472212, "grad_norm": 0.03728878125548363, "learning_rate": 0.00019999927993169652, "loss": 1.6203, "step": 108 }, { "epoch": 0.0012191439149059917, "grad_norm": 0.03831510245800018, "learning_rate": 0.00019999926653538043, "loss": 1.6214, "step": 109 }, { "epoch": 0.0012303287214647622, "grad_norm": 0.04032002389431, "learning_rate": 0.00019999925301559659, "loss": 1.6116, "step": 110 }, { "epoch": 0.0012415135280235327, "grad_norm": 0.04648670554161072, "learning_rate": 0.00019999923937234505, "loss": 1.6228, "step": 111 }, { "epoch": 0.0012526983345823035, "grad_norm": 0.03572435304522514, "learning_rate": 0.00019999922560562575, "loss": 1.6382, "step": 112 }, { "epoch": 0.001263883141141074, "grad_norm": 0.03520497307181358, "learning_rate": 0.0001999992117154388, "loss": 1.642, "step": 113 }, { "epoch": 0.0012750679476998446, "grad_norm": 0.037833768874406815, "learning_rate": 0.00019999919770178414, "loss": 1.6809, "step": 114 }, { "epoch": 0.001286252754258615, "grad_norm": 0.04116043448448181, "learning_rate": 0.00019999918356466186, "loss": 1.6669, "step": 115 }, { "epoch": 0.0012974375608173856, "grad_norm": 0.03686891868710518, "learning_rate": 0.00019999916930407192, "loss": 1.6336, "step": 116 }, { "epoch": 0.0013086223673761561, "grad_norm": 0.04540928825736046, "learning_rate": 0.00019999915492001434, "loss": 1.636, "step": 117 }, { "epoch": 0.0013198071739349269, "grad_norm": 0.03894014656543732, "learning_rate": 0.00019999914041248917, "loss": 1.642, "step": 118 }, { "epoch": 0.0013309919804936974, "grad_norm": 0.03868821635842323, "learning_rate": 0.0001999991257814964, "loss": 1.6459, "step": 119 }, { "epoch": 0.001342176787052468, "grad_norm": 0.05047065392136574, "learning_rate": 0.00019999911102703606, "loss": 1.6566, "step": 120 }, { "epoch": 0.0013533615936112385, "grad_norm": 0.037026163190603256, "learning_rate": 0.0001999990961491082, "loss": 1.644, "step": 121 }, { "epoch": 0.001364546400170009, "grad_norm": 0.05025867745280266, "learning_rate": 0.00019999908114771278, "loss": 1.6375, "step": 122 }, { "epoch": 0.0013757312067287795, "grad_norm": 0.05618799850344658, "learning_rate": 0.0001999990660228498, "loss": 1.6414, "step": 123 }, { "epoch": 0.0013869160132875503, "grad_norm": 0.04012198746204376, "learning_rate": 0.0001999990507745194, "loss": 1.6338, "step": 124 }, { "epoch": 0.0013981008198463208, "grad_norm": 0.03692522644996643, "learning_rate": 0.00019999903540272147, "loss": 1.6338, "step": 125 }, { "epoch": 0.0014092856264050913, "grad_norm": 0.04653295874595642, "learning_rate": 0.0001999990199074561, "loss": 1.6242, "step": 126 }, { "epoch": 0.0014204704329638619, "grad_norm": 0.038410138338804245, "learning_rate": 0.0001999990042887233, "loss": 1.6535, "step": 127 }, { "epoch": 0.0014316552395226324, "grad_norm": 0.03855321556329727, "learning_rate": 0.00019999898854652307, "loss": 1.6622, "step": 128 }, { "epoch": 0.001442840046081403, "grad_norm": 0.039161138236522675, "learning_rate": 0.00019999897268085543, "loss": 1.6596, "step": 129 }, { "epoch": 0.0014540248526401737, "grad_norm": 0.04229150339961052, "learning_rate": 0.00019999895669172042, "loss": 1.6044, "step": 130 }, { "epoch": 0.0014652096591989442, "grad_norm": 0.04695671424269676, "learning_rate": 0.00019999894057911804, "loss": 1.593, "step": 131 }, { "epoch": 0.0014763944657577147, "grad_norm": 0.05007043853402138, "learning_rate": 0.00019999892434304832, "loss": 1.6371, "step": 132 }, { "epoch": 0.0014875792723164853, "grad_norm": 0.04140834882855415, "learning_rate": 0.00019999890798351127, "loss": 1.6333, "step": 133 }, { "epoch": 0.0014987640788752558, "grad_norm": 0.03771176561713219, "learning_rate": 0.0001999988915005069, "loss": 1.6253, "step": 134 }, { "epoch": 0.0015099488854340263, "grad_norm": 0.03835882246494293, "learning_rate": 0.00019999887489403532, "loss": 1.6096, "step": 135 }, { "epoch": 0.001521133691992797, "grad_norm": 0.042071614414453506, "learning_rate": 0.00019999885816409643, "loss": 1.6158, "step": 136 }, { "epoch": 0.0015323184985515676, "grad_norm": 0.04736172780394554, "learning_rate": 0.0001999988413106903, "loss": 1.6076, "step": 137 }, { "epoch": 0.0015435033051103381, "grad_norm": 0.045473724603652954, "learning_rate": 0.00019999882433381695, "loss": 1.6002, "step": 138 }, { "epoch": 0.0015546881116691087, "grad_norm": 0.038239121437072754, "learning_rate": 0.0001999988072334764, "loss": 1.6093, "step": 139 }, { "epoch": 0.0015658729182278792, "grad_norm": 0.03806670382618904, "learning_rate": 0.0001999987900096687, "loss": 1.5958, "step": 140 }, { "epoch": 0.0015770577247866497, "grad_norm": 0.034363262355327606, "learning_rate": 0.00019999877266239382, "loss": 1.6182, "step": 141 }, { "epoch": 0.0015882425313454205, "grad_norm": 0.03781922906637192, "learning_rate": 0.0001999987551916518, "loss": 1.6041, "step": 142 }, { "epoch": 0.001599427337904191, "grad_norm": 0.037946417927742004, "learning_rate": 0.00019999873759744268, "loss": 1.6154, "step": 143 }, { "epoch": 0.0016106121444629615, "grad_norm": 0.03221859410405159, "learning_rate": 0.00019999871987976645, "loss": 1.6205, "step": 144 }, { "epoch": 0.001621796951021732, "grad_norm": 0.034399550408124924, "learning_rate": 0.00019999870203862318, "loss": 1.6077, "step": 145 }, { "epoch": 0.0016329817575805026, "grad_norm": 0.037965498864650726, "learning_rate": 0.00019999868407401285, "loss": 1.6226, "step": 146 }, { "epoch": 0.0016441665641392731, "grad_norm": 0.034158267080783844, "learning_rate": 0.00019999866598593549, "loss": 1.6364, "step": 147 }, { "epoch": 0.0016553513706980439, "grad_norm": 0.035179782658815384, "learning_rate": 0.00019999864777439113, "loss": 1.6015, "step": 148 }, { "epoch": 0.0016665361772568144, "grad_norm": 0.07380399852991104, "learning_rate": 0.00019999862943937977, "loss": 1.6036, "step": 149 }, { "epoch": 0.001677720983815585, "grad_norm": 0.055961351841688156, "learning_rate": 0.00019999861098090146, "loss": 1.6209, "step": 150 }, { "epoch": 0.0016889057903743555, "grad_norm": 0.06556452065706253, "learning_rate": 0.00019999859239895623, "loss": 1.6179, "step": 151 }, { "epoch": 0.001700090596933126, "grad_norm": 0.0406390018761158, "learning_rate": 0.0001999985736935441, "loss": 1.6251, "step": 152 }, { "epoch": 0.0017112754034918965, "grad_norm": 0.04337646812200546, "learning_rate": 0.00019999855486466504, "loss": 1.6229, "step": 153 }, { "epoch": 0.0017224602100506673, "grad_norm": 0.03773214668035507, "learning_rate": 0.00019999853591231914, "loss": 1.6163, "step": 154 }, { "epoch": 0.0017336450166094378, "grad_norm": 0.04170192405581474, "learning_rate": 0.0001999985168365064, "loss": 1.6187, "step": 155 }, { "epoch": 0.0017448298231682083, "grad_norm": 0.046784352511167526, "learning_rate": 0.00019999849763722684, "loss": 1.6313, "step": 156 }, { "epoch": 0.0017560146297269789, "grad_norm": 0.04641957953572273, "learning_rate": 0.00019999847831448048, "loss": 1.6353, "step": 157 }, { "epoch": 0.0017671994362857494, "grad_norm": 0.04125691205263138, "learning_rate": 0.00019999845886826736, "loss": 1.6136, "step": 158 }, { "epoch": 0.00177838424284452, "grad_norm": 0.03794560208916664, "learning_rate": 0.00019999843929858748, "loss": 1.6124, "step": 159 }, { "epoch": 0.0017895690494032907, "grad_norm": 0.04494895413517952, "learning_rate": 0.00019999841960544087, "loss": 1.6038, "step": 160 }, { "epoch": 0.0018007538559620612, "grad_norm": 0.0605354905128479, "learning_rate": 0.00019999839978882756, "loss": 1.6129, "step": 161 }, { "epoch": 0.0018119386625208317, "grad_norm": 0.03629598766565323, "learning_rate": 0.0001999983798487476, "loss": 1.6136, "step": 162 }, { "epoch": 0.0018231234690796023, "grad_norm": 0.051288772374391556, "learning_rate": 0.00019999835978520099, "loss": 1.6121, "step": 163 }, { "epoch": 0.0018343082756383728, "grad_norm": 0.0646372139453888, "learning_rate": 0.00019999833959818774, "loss": 1.6097, "step": 164 }, { "epoch": 0.0018454930821971433, "grad_norm": 0.05430466681718826, "learning_rate": 0.00019999831928770788, "loss": 1.6336, "step": 165 }, { "epoch": 0.001856677888755914, "grad_norm": 0.04446660354733467, "learning_rate": 0.00019999829885376146, "loss": 1.6446, "step": 166 }, { "epoch": 0.0018678626953146846, "grad_norm": 0.05089535191655159, "learning_rate": 0.0001999982782963485, "loss": 1.6295, "step": 167 }, { "epoch": 0.0018790475018734551, "grad_norm": 0.04738520458340645, "learning_rate": 0.000199998257615469, "loss": 1.6129, "step": 168 }, { "epoch": 0.0018902323084322257, "grad_norm": 0.06705950200557709, "learning_rate": 0.00019999823681112296, "loss": 1.6091, "step": 169 }, { "epoch": 0.0019014171149909962, "grad_norm": 0.08080725371837616, "learning_rate": 0.0001999982158833105, "loss": 1.6041, "step": 170 }, { "epoch": 0.0019126019215497667, "grad_norm": 0.08998562395572662, "learning_rate": 0.00019999819483203162, "loss": 1.61, "step": 171 }, { "epoch": 0.0019237867281085375, "grad_norm": 0.0561324767768383, "learning_rate": 0.00019999817365728626, "loss": 1.6019, "step": 172 }, { "epoch": 0.001934971534667308, "grad_norm": 0.06384976953268051, "learning_rate": 0.00019999815235907453, "loss": 1.6045, "step": 173 }, { "epoch": 0.0019461563412260785, "grad_norm": 0.08231502771377563, "learning_rate": 0.00019999813093739643, "loss": 1.5851, "step": 174 }, { "epoch": 0.001957341147784849, "grad_norm": 0.06544940173625946, "learning_rate": 0.00019999810939225196, "loss": 1.5902, "step": 175 }, { "epoch": 0.00196852595434362, "grad_norm": 0.05647379904985428, "learning_rate": 0.0001999980877236412, "loss": 1.5972, "step": 176 }, { "epoch": 0.00197971076090239, "grad_norm": 0.04889946058392525, "learning_rate": 0.00019999806593156417, "loss": 1.6039, "step": 177 }, { "epoch": 0.001990895567461161, "grad_norm": 0.04563042148947716, "learning_rate": 0.00019999804401602087, "loss": 1.6137, "step": 178 }, { "epoch": 0.002002080374019931, "grad_norm": 0.057878173887729645, "learning_rate": 0.0001999980219770113, "loss": 1.603, "step": 179 }, { "epoch": 0.002013265180578702, "grad_norm": 0.04965892806649208, "learning_rate": 0.00019999799981453554, "loss": 1.6016, "step": 180 }, { "epoch": 0.0020244499871374727, "grad_norm": 0.042930275201797485, "learning_rate": 0.00019999797752859362, "loss": 1.5872, "step": 181 }, { "epoch": 0.002035634793696243, "grad_norm": 0.07251135259866714, "learning_rate": 0.00019999795511918553, "loss": 1.5973, "step": 182 }, { "epoch": 0.0020468196002550137, "grad_norm": 0.053799793124198914, "learning_rate": 0.00019999793258631133, "loss": 1.6177, "step": 183 }, { "epoch": 0.002058004406813784, "grad_norm": 0.05833510681986809, "learning_rate": 0.00019999790992997101, "loss": 1.5898, "step": 184 }, { "epoch": 0.002069189213372555, "grad_norm": 0.06137058511376381, "learning_rate": 0.0001999978871501646, "loss": 1.5836, "step": 185 }, { "epoch": 0.002080374019931325, "grad_norm": 0.04289720579981804, "learning_rate": 0.0001999978642468922, "loss": 1.5893, "step": 186 }, { "epoch": 0.002091558826490096, "grad_norm": 0.054850902408361435, "learning_rate": 0.00019999784122015375, "loss": 1.5894, "step": 187 }, { "epoch": 0.0021027436330488666, "grad_norm": 0.06448347866535187, "learning_rate": 0.00019999781806994934, "loss": 1.5862, "step": 188 }, { "epoch": 0.002113928439607637, "grad_norm": 0.06109186261892319, "learning_rate": 0.00019999779479627897, "loss": 1.5951, "step": 189 }, { "epoch": 0.0021251132461664077, "grad_norm": 0.05638093128800392, "learning_rate": 0.00019999777139914263, "loss": 1.5929, "step": 190 }, { "epoch": 0.002136298052725178, "grad_norm": 0.05871524661779404, "learning_rate": 0.00019999774787854047, "loss": 1.5979, "step": 191 }, { "epoch": 0.0021474828592839487, "grad_norm": 0.05536816641688347, "learning_rate": 0.00019999772423447238, "loss": 1.598, "step": 192 }, { "epoch": 0.0021586676658427195, "grad_norm": 0.04985928162932396, "learning_rate": 0.00019999770046693848, "loss": 1.6067, "step": 193 }, { "epoch": 0.0021698524724014898, "grad_norm": 0.03956759348511696, "learning_rate": 0.00019999767657593874, "loss": 1.6101, "step": 194 }, { "epoch": 0.0021810372789602605, "grad_norm": 0.043052803725004196, "learning_rate": 0.00019999765256147324, "loss": 1.5925, "step": 195 }, { "epoch": 0.002192222085519031, "grad_norm": 0.049073830246925354, "learning_rate": 0.000199997628423542, "loss": 1.5997, "step": 196 }, { "epoch": 0.0022034068920778016, "grad_norm": 0.0505862683057785, "learning_rate": 0.00019999760416214503, "loss": 1.5976, "step": 197 }, { "epoch": 0.002214591698636572, "grad_norm": 0.04482674226164818, "learning_rate": 0.00019999757977728235, "loss": 1.5762, "step": 198 }, { "epoch": 0.0022257765051953426, "grad_norm": 0.03707127273082733, "learning_rate": 0.00019999755526895402, "loss": 1.5805, "step": 199 }, { "epoch": 0.0022369613117541134, "grad_norm": 0.041122857481241226, "learning_rate": 0.00019999753063716002, "loss": 1.6041, "step": 200 }, { "epoch": 0.0022481461183128837, "grad_norm": 0.05945773050189018, "learning_rate": 0.00019999750588190046, "loss": 1.6068, "step": 201 }, { "epoch": 0.0022593309248716545, "grad_norm": 0.058612506836652756, "learning_rate": 0.00019999748100317532, "loss": 1.6, "step": 202 }, { "epoch": 0.0022705157314304248, "grad_norm": 0.05361739546060562, "learning_rate": 0.00019999745600098466, "loss": 1.5891, "step": 203 }, { "epoch": 0.0022817005379891955, "grad_norm": 0.045548051595687866, "learning_rate": 0.00019999743087532846, "loss": 1.6161, "step": 204 }, { "epoch": 0.002292885344547966, "grad_norm": 0.04524560272693634, "learning_rate": 0.00019999740562620682, "loss": 1.6092, "step": 205 }, { "epoch": 0.0023040701511067366, "grad_norm": 0.04346180334687233, "learning_rate": 0.0001999973802536197, "loss": 1.6076, "step": 206 }, { "epoch": 0.0023152549576655073, "grad_norm": 0.047505974769592285, "learning_rate": 0.00019999735475756717, "loss": 1.5825, "step": 207 }, { "epoch": 0.0023264397642242776, "grad_norm": 0.03851678594946861, "learning_rate": 0.00019999732913804927, "loss": 1.5991, "step": 208 }, { "epoch": 0.0023376245707830484, "grad_norm": 0.051913902163505554, "learning_rate": 0.000199997303395066, "loss": 1.6281, "step": 209 }, { "epoch": 0.0023488093773418187, "grad_norm": 0.06070960685610771, "learning_rate": 0.0001999972775286174, "loss": 1.641, "step": 210 }, { "epoch": 0.0023599941839005894, "grad_norm": 0.13532494008541107, "learning_rate": 0.00019999725153870354, "loss": 1.6009, "step": 211 }, { "epoch": 0.00237117899045936, "grad_norm": 0.30072659254074097, "learning_rate": 0.00019999722542532442, "loss": 1.6163, "step": 212 }, { "epoch": 0.0023823637970181305, "grad_norm": 0.3511681854724884, "learning_rate": 0.00019999719918848004, "loss": 1.6465, "step": 213 }, { "epoch": 0.0023935486035769012, "grad_norm": 0.2487097680568695, "learning_rate": 0.00019999717282817052, "loss": 1.6367, "step": 214 }, { "epoch": 0.0024047334101356716, "grad_norm": 0.23582719266414642, "learning_rate": 0.00019999714634439582, "loss": 1.6202, "step": 215 }, { "epoch": 0.0024159182166944423, "grad_norm": 0.17389710247516632, "learning_rate": 0.000199997119737156, "loss": 1.6308, "step": 216 }, { "epoch": 0.0024271030232532126, "grad_norm": 0.14822594821453094, "learning_rate": 0.00019999709300645105, "loss": 1.6175, "step": 217 }, { "epoch": 0.0024382878298119834, "grad_norm": 0.12898804247379303, "learning_rate": 0.00019999706615228107, "loss": 1.6069, "step": 218 }, { "epoch": 0.002449472636370754, "grad_norm": 0.16721111536026, "learning_rate": 0.00019999703917464605, "loss": 1.6206, "step": 219 }, { "epoch": 0.0024606574429295244, "grad_norm": 0.08022027462720871, "learning_rate": 0.00019999701207354606, "loss": 1.6082, "step": 220 }, { "epoch": 0.002471842249488295, "grad_norm": 0.1281793862581253, "learning_rate": 0.0001999969848489811, "loss": 1.5928, "step": 221 }, { "epoch": 0.0024830270560470655, "grad_norm": 0.10240975767374039, "learning_rate": 0.00019999695750095117, "loss": 1.6058, "step": 222 }, { "epoch": 0.0024942118626058362, "grad_norm": 0.08368588238954544, "learning_rate": 0.00019999693002945642, "loss": 1.6193, "step": 223 }, { "epoch": 0.002505396669164607, "grad_norm": 0.09998615831136703, "learning_rate": 0.00019999690243449676, "loss": 1.6096, "step": 224 }, { "epoch": 0.0025165814757233773, "grad_norm": 0.08170673996210098, "learning_rate": 0.00019999687471607228, "loss": 1.5874, "step": 225 }, { "epoch": 0.002527766282282148, "grad_norm": 0.09015469253063202, "learning_rate": 0.000199996846874183, "loss": 1.5903, "step": 226 }, { "epoch": 0.0025389510888409184, "grad_norm": 0.07382986694574356, "learning_rate": 0.000199996818908829, "loss": 1.5958, "step": 227 }, { "epoch": 0.002550135895399689, "grad_norm": 0.06669515371322632, "learning_rate": 0.00019999679082001023, "loss": 1.5881, "step": 228 }, { "epoch": 0.0025613207019584594, "grad_norm": 0.0616774745285511, "learning_rate": 0.0001999967626077268, "loss": 1.6038, "step": 229 }, { "epoch": 0.00257250550851723, "grad_norm": 0.05451146885752678, "learning_rate": 0.00019999673427197872, "loss": 1.6021, "step": 230 }, { "epoch": 0.002583690315076001, "grad_norm": 0.05640149116516113, "learning_rate": 0.000199996705812766, "loss": 1.6022, "step": 231 }, { "epoch": 0.0025948751216347712, "grad_norm": 0.06033660098910332, "learning_rate": 0.00019999667723008871, "loss": 1.6002, "step": 232 }, { "epoch": 0.002606059928193542, "grad_norm": 0.07876270264387131, "learning_rate": 0.0001999966485239469, "loss": 1.6043, "step": 233 }, { "epoch": 0.0026172447347523123, "grad_norm": 0.0829700380563736, "learning_rate": 0.00019999661969434055, "loss": 1.5915, "step": 234 }, { "epoch": 0.002628429541311083, "grad_norm": 0.05654975026845932, "learning_rate": 0.0001999965907412697, "loss": 1.5893, "step": 235 }, { "epoch": 0.0026396143478698538, "grad_norm": 0.06751634925603867, "learning_rate": 0.00019999656166473444, "loss": 1.5757, "step": 236 }, { "epoch": 0.002650799154428624, "grad_norm": 0.06081743538379669, "learning_rate": 0.00019999653246473477, "loss": 1.5702, "step": 237 }, { "epoch": 0.002661983960987395, "grad_norm": 0.0666998103260994, "learning_rate": 0.00019999650314127075, "loss": 1.568, "step": 238 }, { "epoch": 0.002673168767546165, "grad_norm": 0.04934430122375488, "learning_rate": 0.00019999647369434235, "loss": 1.6017, "step": 239 }, { "epoch": 0.002684353574104936, "grad_norm": 0.0574209988117218, "learning_rate": 0.00019999644412394972, "loss": 1.5935, "step": 240 }, { "epoch": 0.002695538380663706, "grad_norm": 0.04870286583900452, "learning_rate": 0.00019999641443009278, "loss": 1.6035, "step": 241 }, { "epoch": 0.002706723187222477, "grad_norm": 0.04176439344882965, "learning_rate": 0.00019999638461277162, "loss": 1.598, "step": 242 }, { "epoch": 0.0027179079937812477, "grad_norm": 0.05534802004694939, "learning_rate": 0.0001999963546719863, "loss": 1.5945, "step": 243 }, { "epoch": 0.002729092800340018, "grad_norm": 0.04214160889387131, "learning_rate": 0.0001999963246077368, "loss": 1.6019, "step": 244 }, { "epoch": 0.0027402776068987888, "grad_norm": 0.04326852038502693, "learning_rate": 0.00019999629442002322, "loss": 1.604, "step": 245 }, { "epoch": 0.002751462413457559, "grad_norm": 0.04295732453465462, "learning_rate": 0.00019999626410884553, "loss": 1.6136, "step": 246 }, { "epoch": 0.00276264722001633, "grad_norm": 0.038508690893650055, "learning_rate": 0.00019999623367420385, "loss": 1.5904, "step": 247 }, { "epoch": 0.0027738320265751006, "grad_norm": 0.040281713008880615, "learning_rate": 0.0001999962031160981, "loss": 1.5955, "step": 248 }, { "epoch": 0.002785016833133871, "grad_norm": 0.041424721479415894, "learning_rate": 0.00019999617243452844, "loss": 1.608, "step": 249 }, { "epoch": 0.0027962016396926416, "grad_norm": 0.03804994374513626, "learning_rate": 0.00019999614162949484, "loss": 1.6125, "step": 250 }, { "epoch": 0.002807386446251412, "grad_norm": 0.04370785504579544, "learning_rate": 0.0001999961107009974, "loss": 1.6125, "step": 251 }, { "epoch": 0.0028185712528101827, "grad_norm": 0.047021038830280304, "learning_rate": 0.000199996079649036, "loss": 1.5931, "step": 252 }, { "epoch": 0.002829756059368953, "grad_norm": 0.036128588020801544, "learning_rate": 0.0001999960484736109, "loss": 1.5707, "step": 253 }, { "epoch": 0.0028409408659277238, "grad_norm": 0.04315561056137085, "learning_rate": 0.00019999601717472199, "loss": 1.5902, "step": 254 }, { "epoch": 0.0028521256724864945, "grad_norm": 0.04395722597837448, "learning_rate": 0.00019999598575236934, "loss": 1.5995, "step": 255 }, { "epoch": 0.002863310479045265, "grad_norm": 0.038929786533117294, "learning_rate": 0.000199995954206553, "loss": 1.5854, "step": 256 }, { "epoch": 0.0028744952856040356, "grad_norm": 0.041567280888557434, "learning_rate": 0.00019999592253727299, "loss": 1.5783, "step": 257 }, { "epoch": 0.002885680092162806, "grad_norm": 0.03894374892115593, "learning_rate": 0.0001999958907445294, "loss": 1.5846, "step": 258 }, { "epoch": 0.0028968648987215766, "grad_norm": 0.04269428178668022, "learning_rate": 0.00019999585882832222, "loss": 1.6023, "step": 259 }, { "epoch": 0.0029080497052803474, "grad_norm": 0.04121831804513931, "learning_rate": 0.00019999582678865147, "loss": 1.6051, "step": 260 }, { "epoch": 0.0029192345118391177, "grad_norm": 0.038076166063547134, "learning_rate": 0.00019999579462551728, "loss": 1.6136, "step": 261 }, { "epoch": 0.0029304193183978884, "grad_norm": 0.042008642107248306, "learning_rate": 0.0001999957623389196, "loss": 1.6063, "step": 262 }, { "epoch": 0.0029416041249566587, "grad_norm": 0.042438406497240067, "learning_rate": 0.0001999957299288585, "loss": 1.6033, "step": 263 }, { "epoch": 0.0029527889315154295, "grad_norm": 0.041119206696748734, "learning_rate": 0.000199995697395334, "loss": 1.6031, "step": 264 }, { "epoch": 0.0029639737380742, "grad_norm": 0.045258279889822006, "learning_rate": 0.00019999566473834622, "loss": 1.5853, "step": 265 }, { "epoch": 0.0029751585446329705, "grad_norm": 0.04734019562602043, "learning_rate": 0.0001999956319578951, "loss": 1.5921, "step": 266 }, { "epoch": 0.0029863433511917413, "grad_norm": 0.04389064759016037, "learning_rate": 0.00019999559905398072, "loss": 1.581, "step": 267 }, { "epoch": 0.0029975281577505116, "grad_norm": 0.04582642391324043, "learning_rate": 0.00019999556602660318, "loss": 1.5928, "step": 268 }, { "epoch": 0.0030087129643092824, "grad_norm": 0.04518941417336464, "learning_rate": 0.00019999553287576238, "loss": 1.5809, "step": 269 }, { "epoch": 0.0030198977708680527, "grad_norm": 0.04687381908297539, "learning_rate": 0.0001999954996014585, "loss": 1.5999, "step": 270 }, { "epoch": 0.0030310825774268234, "grad_norm": 0.04809357225894928, "learning_rate": 0.00019999546620369152, "loss": 1.5972, "step": 271 }, { "epoch": 0.003042267383985594, "grad_norm": 0.05907173454761505, "learning_rate": 0.0001999954326824615, "loss": 1.5862, "step": 272 }, { "epoch": 0.0030534521905443645, "grad_norm": 0.06583942472934723, "learning_rate": 0.00019999539903776842, "loss": 1.5846, "step": 273 }, { "epoch": 0.0030646369971031352, "grad_norm": 0.07557905465364456, "learning_rate": 0.0001999953652696124, "loss": 1.5962, "step": 274 }, { "epoch": 0.0030758218036619055, "grad_norm": 0.07007817178964615, "learning_rate": 0.00019999533137799347, "loss": 1.5951, "step": 275 }, { "epoch": 0.0030870066102206763, "grad_norm": 0.05711887776851654, "learning_rate": 0.00019999529736291162, "loss": 1.5932, "step": 276 }, { "epoch": 0.0030981914167794466, "grad_norm": 0.04450292885303497, "learning_rate": 0.00019999526322436696, "loss": 1.6071, "step": 277 }, { "epoch": 0.0031093762233382173, "grad_norm": 0.04260997474193573, "learning_rate": 0.00019999522896235947, "loss": 1.6038, "step": 278 }, { "epoch": 0.003120561029896988, "grad_norm": 0.05689796805381775, "learning_rate": 0.00019999519457688925, "loss": 1.5688, "step": 279 }, { "epoch": 0.0031317458364557584, "grad_norm": 0.06330379098653793, "learning_rate": 0.0001999951600679563, "loss": 1.5617, "step": 280 }, { "epoch": 0.003142930643014529, "grad_norm": 0.06195618584752083, "learning_rate": 0.00019999512543556066, "loss": 1.5602, "step": 281 }, { "epoch": 0.0031541154495732995, "grad_norm": 0.06677111238241196, "learning_rate": 0.0001999950906797024, "loss": 1.5659, "step": 282 }, { "epoch": 0.00316530025613207, "grad_norm": 0.05750421807169914, "learning_rate": 0.00019999505580038153, "loss": 1.5759, "step": 283 }, { "epoch": 0.003176485062690841, "grad_norm": 0.04907039552927017, "learning_rate": 0.00019999502079759817, "loss": 1.5833, "step": 284 }, { "epoch": 0.0031876698692496113, "grad_norm": 0.048877034336328506, "learning_rate": 0.00019999498567135223, "loss": 1.5836, "step": 285 }, { "epoch": 0.003198854675808382, "grad_norm": 0.05494236946105957, "learning_rate": 0.0001999949504216439, "loss": 1.5837, "step": 286 }, { "epoch": 0.0032100394823671523, "grad_norm": 0.04953937977552414, "learning_rate": 0.00019999491504847313, "loss": 1.5794, "step": 287 }, { "epoch": 0.003221224288925923, "grad_norm": 0.05240803211927414, "learning_rate": 0.00019999487955184, "loss": 1.5894, "step": 288 }, { "epoch": 0.0032324090954846934, "grad_norm": 0.05633338540792465, "learning_rate": 0.0001999948439317445, "loss": 1.5755, "step": 289 }, { "epoch": 0.003243593902043464, "grad_norm": 0.06563600897789001, "learning_rate": 0.00019999480818818675, "loss": 1.5912, "step": 290 }, { "epoch": 0.003254778708602235, "grad_norm": 0.05903002619743347, "learning_rate": 0.00019999477232116676, "loss": 1.5856, "step": 291 }, { "epoch": 0.003265963515161005, "grad_norm": 0.03582334890961647, "learning_rate": 0.00019999473633068457, "loss": 1.6079, "step": 292 }, { "epoch": 0.003277148321719776, "grad_norm": 0.05011364817619324, "learning_rate": 0.00019999470021674025, "loss": 1.5907, "step": 293 }, { "epoch": 0.0032883331282785463, "grad_norm": 0.0577118918299675, "learning_rate": 0.0001999946639793338, "loss": 1.5955, "step": 294 }, { "epoch": 0.003299517934837317, "grad_norm": 0.05170518904924393, "learning_rate": 0.00019999462761846528, "loss": 1.5884, "step": 295 }, { "epoch": 0.0033107027413960878, "grad_norm": 0.05011725425720215, "learning_rate": 0.00019999459113413475, "loss": 1.6046, "step": 296 }, { "epoch": 0.003321887547954858, "grad_norm": 0.05645633116364479, "learning_rate": 0.00019999455452634224, "loss": 1.596, "step": 297 }, { "epoch": 0.003333072354513629, "grad_norm": 0.05705921724438667, "learning_rate": 0.0001999945177950878, "loss": 1.6136, "step": 298 }, { "epoch": 0.003344257161072399, "grad_norm": 0.05761184170842171, "learning_rate": 0.0001999944809403715, "loss": 1.6095, "step": 299 }, { "epoch": 0.00335544196763117, "grad_norm": 0.0613851472735405, "learning_rate": 0.00019999444396219337, "loss": 1.6141, "step": 300 }, { "epoch": 0.00336662677418994, "grad_norm": 0.06220489367842674, "learning_rate": 0.00019999440686055344, "loss": 1.5988, "step": 301 }, { "epoch": 0.003377811580748711, "grad_norm": 0.062393296509981155, "learning_rate": 0.00019999436963545177, "loss": 1.6075, "step": 302 }, { "epoch": 0.0033889963873074817, "grad_norm": 0.0625912994146347, "learning_rate": 0.00019999433228688838, "loss": 1.6102, "step": 303 }, { "epoch": 0.003400181193866252, "grad_norm": 0.06049802899360657, "learning_rate": 0.00019999429481486335, "loss": 1.5968, "step": 304 }, { "epoch": 0.0034113660004250227, "grad_norm": 0.05767315998673439, "learning_rate": 0.00019999425721937674, "loss": 1.5906, "step": 305 }, { "epoch": 0.003422550806983793, "grad_norm": 0.049920015037059784, "learning_rate": 0.00019999421950042854, "loss": 1.5694, "step": 306 }, { "epoch": 0.003433735613542564, "grad_norm": 0.04852724075317383, "learning_rate": 0.0001999941816580188, "loss": 1.5609, "step": 307 }, { "epoch": 0.0034449204201013345, "grad_norm": 0.05249037966132164, "learning_rate": 0.00019999414369214767, "loss": 1.5675, "step": 308 }, { "epoch": 0.003456105226660105, "grad_norm": 0.05167357623577118, "learning_rate": 0.00019999410560281506, "loss": 1.5645, "step": 309 }, { "epoch": 0.0034672900332188756, "grad_norm": 0.05197747051715851, "learning_rate": 0.00019999406739002108, "loss": 1.5707, "step": 310 }, { "epoch": 0.003478474839777646, "grad_norm": 0.05140923336148262, "learning_rate": 0.00019999402905376582, "loss": 1.5745, "step": 311 }, { "epoch": 0.0034896596463364167, "grad_norm": 0.05357779935002327, "learning_rate": 0.00019999399059404923, "loss": 1.5824, "step": 312 }, { "epoch": 0.003500844452895187, "grad_norm": 0.04196302220225334, "learning_rate": 0.00019999395201087143, "loss": 1.5685, "step": 313 }, { "epoch": 0.0035120292594539577, "grad_norm": 0.04698769748210907, "learning_rate": 0.00019999391330423246, "loss": 1.5645, "step": 314 }, { "epoch": 0.0035232140660127285, "grad_norm": 0.055174414068460464, "learning_rate": 0.00019999387447413236, "loss": 1.5702, "step": 315 }, { "epoch": 0.003534398872571499, "grad_norm": 0.05560048297047615, "learning_rate": 0.00019999383552057114, "loss": 1.5746, "step": 316 }, { "epoch": 0.0035455836791302695, "grad_norm": 0.059730809181928635, "learning_rate": 0.0001999937964435489, "loss": 1.5698, "step": 317 }, { "epoch": 0.00355676848568904, "grad_norm": 0.06850636750459671, "learning_rate": 0.00019999375724306568, "loss": 1.5837, "step": 318 }, { "epoch": 0.0035679532922478106, "grad_norm": 0.0658111497759819, "learning_rate": 0.00019999371791912148, "loss": 1.5759, "step": 319 }, { "epoch": 0.0035791380988065813, "grad_norm": 0.05440279841423035, "learning_rate": 0.00019999367847171643, "loss": 1.5873, "step": 320 }, { "epoch": 0.0035903229053653517, "grad_norm": 0.06169600412249565, "learning_rate": 0.0001999936389008505, "loss": 1.5791, "step": 321 }, { "epoch": 0.0036015077119241224, "grad_norm": 0.06897033751010895, "learning_rate": 0.0001999935992065238, "loss": 1.5815, "step": 322 }, { "epoch": 0.0036126925184828927, "grad_norm": 0.06641620397567749, "learning_rate": 0.00019999355938873635, "loss": 1.5826, "step": 323 }, { "epoch": 0.0036238773250416635, "grad_norm": 0.057002220302820206, "learning_rate": 0.00019999351944748818, "loss": 1.5716, "step": 324 }, { "epoch": 0.0036350621316004338, "grad_norm": 0.06431427597999573, "learning_rate": 0.00019999347938277938, "loss": 1.5626, "step": 325 }, { "epoch": 0.0036462469381592045, "grad_norm": 0.06504250317811966, "learning_rate": 0.00019999343919460997, "loss": 1.574, "step": 326 }, { "epoch": 0.0036574317447179753, "grad_norm": 0.06940289586782455, "learning_rate": 0.00019999339888298004, "loss": 1.5676, "step": 327 }, { "epoch": 0.0036686165512767456, "grad_norm": 0.06492604315280914, "learning_rate": 0.00019999335844788957, "loss": 1.5676, "step": 328 }, { "epoch": 0.0036798013578355163, "grad_norm": 0.07069146633148193, "learning_rate": 0.0001999933178893387, "loss": 1.5601, "step": 329 }, { "epoch": 0.0036909861643942866, "grad_norm": 0.07502440363168716, "learning_rate": 0.00019999327720732736, "loss": 1.5651, "step": 330 }, { "epoch": 0.0037021709709530574, "grad_norm": 0.06407099217176437, "learning_rate": 0.00019999323640185573, "loss": 1.5818, "step": 331 }, { "epoch": 0.003713355777511828, "grad_norm": 0.05621904134750366, "learning_rate": 0.00019999319547292377, "loss": 1.5975, "step": 332 }, { "epoch": 0.0037245405840705985, "grad_norm": 0.053219810128211975, "learning_rate": 0.00019999315442053157, "loss": 1.5925, "step": 333 }, { "epoch": 0.003735725390629369, "grad_norm": 0.0538603812456131, "learning_rate": 0.00019999311324467919, "loss": 1.5784, "step": 334 }, { "epoch": 0.0037469101971881395, "grad_norm": 0.05239463597536087, "learning_rate": 0.00019999307194536664, "loss": 1.5782, "step": 335 }, { "epoch": 0.0037580950037469103, "grad_norm": 0.053746242076158524, "learning_rate": 0.00019999303052259398, "loss": 1.5802, "step": 336 }, { "epoch": 0.0037692798103056806, "grad_norm": 0.04721551761031151, "learning_rate": 0.0001999929889763613, "loss": 1.5739, "step": 337 }, { "epoch": 0.0037804646168644513, "grad_norm": 0.04483070224523544, "learning_rate": 0.00019999294730666862, "loss": 1.5823, "step": 338 }, { "epoch": 0.003791649423423222, "grad_norm": 0.05224015936255455, "learning_rate": 0.000199992905513516, "loss": 1.5559, "step": 339 }, { "epoch": 0.0038028342299819924, "grad_norm": 0.05772995948791504, "learning_rate": 0.0001999928635969035, "loss": 1.5574, "step": 340 }, { "epoch": 0.003814019036540763, "grad_norm": 0.059287503361701965, "learning_rate": 0.00019999282155683116, "loss": 1.5694, "step": 341 }, { "epoch": 0.0038252038430995334, "grad_norm": 0.050815433263778687, "learning_rate": 0.00019999277939329902, "loss": 1.5655, "step": 342 }, { "epoch": 0.003836388649658304, "grad_norm": 0.047384679317474365, "learning_rate": 0.00019999273710630714, "loss": 1.5704, "step": 343 }, { "epoch": 0.003847573456217075, "grad_norm": 0.04918666183948517, "learning_rate": 0.00019999269469585555, "loss": 1.5694, "step": 344 }, { "epoch": 0.0038587582627758452, "grad_norm": 0.058182474225759506, "learning_rate": 0.0001999926521619444, "loss": 1.5971, "step": 345 }, { "epoch": 0.003869943069334616, "grad_norm": 0.07194909453392029, "learning_rate": 0.00019999260950457362, "loss": 1.6033, "step": 346 }, { "epoch": 0.0038811278758933863, "grad_norm": 0.08051355183124542, "learning_rate": 0.0001999925667237433, "loss": 1.5969, "step": 347 }, { "epoch": 0.003892312682452157, "grad_norm": 0.09640984982252121, "learning_rate": 0.00019999252381945357, "loss": 1.6414, "step": 348 }, { "epoch": 0.0039034974890109274, "grad_norm": 0.11357203125953674, "learning_rate": 0.00019999248079170437, "loss": 1.6522, "step": 349 }, { "epoch": 0.003914682295569698, "grad_norm": 0.14623580873012543, "learning_rate": 0.00019999243764049586, "loss": 1.6482, "step": 350 }, { "epoch": 0.003925867102128468, "grad_norm": 0.158810093998909, "learning_rate": 0.00019999239436582796, "loss": 1.5814, "step": 351 }, { "epoch": 0.00393705190868724, "grad_norm": 0.11811669170856476, "learning_rate": 0.00019999235096770086, "loss": 1.5782, "step": 352 }, { "epoch": 0.00394823671524601, "grad_norm": 0.09518411755561829, "learning_rate": 0.0001999923074461145, "loss": 1.5852, "step": 353 }, { "epoch": 0.00395942152180478, "grad_norm": 0.1165471076965332, "learning_rate": 0.00019999226380106906, "loss": 1.5766, "step": 354 }, { "epoch": 0.0039706063283635505, "grad_norm": 0.09517768025398254, "learning_rate": 0.00019999222003256448, "loss": 1.5829, "step": 355 }, { "epoch": 0.003981791134922322, "grad_norm": 0.06006886065006256, "learning_rate": 0.00019999217614060085, "loss": 1.5708, "step": 356 }, { "epoch": 0.003992975941481092, "grad_norm": 0.08933499455451965, "learning_rate": 0.00019999213212517825, "loss": 1.5718, "step": 357 }, { "epoch": 0.004004160748039862, "grad_norm": 0.0867113471031189, "learning_rate": 0.0001999920879862967, "loss": 1.5711, "step": 358 }, { "epoch": 0.0040153455545986335, "grad_norm": 0.06603030860424042, "learning_rate": 0.00019999204372395628, "loss": 1.5778, "step": 359 }, { "epoch": 0.004026530361157404, "grad_norm": 0.07514684647321701, "learning_rate": 0.00019999199933815702, "loss": 1.5738, "step": 360 }, { "epoch": 0.004037715167716174, "grad_norm": 0.060728590935468674, "learning_rate": 0.00019999195482889897, "loss": 1.5694, "step": 361 }, { "epoch": 0.004048899974274945, "grad_norm": 0.06927715986967087, "learning_rate": 0.00019999191019618224, "loss": 1.5664, "step": 362 }, { "epoch": 0.004060084780833716, "grad_norm": 0.0576176755130291, "learning_rate": 0.00019999186544000685, "loss": 1.5704, "step": 363 }, { "epoch": 0.004071269587392486, "grad_norm": 0.047579534351825714, "learning_rate": 0.00019999182056037285, "loss": 1.5732, "step": 364 }, { "epoch": 0.004082454393951256, "grad_norm": 0.06114533543586731, "learning_rate": 0.00019999177555728027, "loss": 1.5776, "step": 365 }, { "epoch": 0.0040936392005100275, "grad_norm": 0.05183887854218483, "learning_rate": 0.0001999917304307292, "loss": 1.5697, "step": 366 }, { "epoch": 0.004104824007068798, "grad_norm": 0.05595005676150322, "learning_rate": 0.0001999916851807197, "loss": 1.5611, "step": 367 }, { "epoch": 0.004116008813627568, "grad_norm": 0.04884869232773781, "learning_rate": 0.00019999163980725183, "loss": 1.5754, "step": 368 }, { "epoch": 0.004127193620186339, "grad_norm": 0.050160013139247894, "learning_rate": 0.00019999159431032562, "loss": 1.5818, "step": 369 }, { "epoch": 0.00413837842674511, "grad_norm": 0.0458194725215435, "learning_rate": 0.00019999154868994111, "loss": 1.5821, "step": 370 }, { "epoch": 0.00414956323330388, "grad_norm": 0.04877634719014168, "learning_rate": 0.00019999150294609845, "loss": 1.5828, "step": 371 }, { "epoch": 0.00416074803986265, "grad_norm": 0.045320361852645874, "learning_rate": 0.00019999145707879758, "loss": 1.5946, "step": 372 }, { "epoch": 0.004171932846421421, "grad_norm": 0.05053291842341423, "learning_rate": 0.00019999141108803864, "loss": 1.6002, "step": 373 }, { "epoch": 0.004183117652980192, "grad_norm": 0.053211431950330734, "learning_rate": 0.0001999913649738216, "loss": 1.5914, "step": 374 }, { "epoch": 0.004194302459538962, "grad_norm": 0.045671332627534866, "learning_rate": 0.00019999131873614664, "loss": 1.5806, "step": 375 }, { "epoch": 0.004205487266097733, "grad_norm": 0.051272232085466385, "learning_rate": 0.0001999912723750137, "loss": 1.5898, "step": 376 }, { "epoch": 0.0042166720726565035, "grad_norm": 0.05297670140862465, "learning_rate": 0.0001999912258904229, "loss": 1.6003, "step": 377 }, { "epoch": 0.004227856879215274, "grad_norm": 0.044414643198251724, "learning_rate": 0.00019999117928237427, "loss": 1.6069, "step": 378 }, { "epoch": 0.004239041685774044, "grad_norm": 0.04553841054439545, "learning_rate": 0.0001999911325508679, "loss": 1.5798, "step": 379 }, { "epoch": 0.004250226492332815, "grad_norm": 0.05364730581641197, "learning_rate": 0.00019999108569590383, "loss": 1.5856, "step": 380 }, { "epoch": 0.004261411298891586, "grad_norm": 0.05173739790916443, "learning_rate": 0.0001999910387174821, "loss": 1.5764, "step": 381 }, { "epoch": 0.004272596105450356, "grad_norm": 0.05577515438199043, "learning_rate": 0.00019999099161560282, "loss": 1.5855, "step": 382 }, { "epoch": 0.004283780912009127, "grad_norm": 0.057436104863882065, "learning_rate": 0.00019999094439026598, "loss": 1.5785, "step": 383 }, { "epoch": 0.0042949657185678974, "grad_norm": 0.03927776962518692, "learning_rate": 0.00019999089704147166, "loss": 1.5735, "step": 384 }, { "epoch": 0.004306150525126668, "grad_norm": 0.04739474132657051, "learning_rate": 0.00019999084956921997, "loss": 1.5805, "step": 385 }, { "epoch": 0.004317335331685439, "grad_norm": 0.04832485690712929, "learning_rate": 0.0001999908019735109, "loss": 1.6164, "step": 386 }, { "epoch": 0.004328520138244209, "grad_norm": 0.049625612795352936, "learning_rate": 0.00019999075425434452, "loss": 1.6468, "step": 387 }, { "epoch": 0.0043397049448029796, "grad_norm": 0.04835371673107147, "learning_rate": 0.00019999070641172094, "loss": 1.6438, "step": 388 }, { "epoch": 0.00435088975136175, "grad_norm": 0.05029625818133354, "learning_rate": 0.00019999065844564018, "loss": 1.6103, "step": 389 }, { "epoch": 0.004362074557920521, "grad_norm": 0.055567558854818344, "learning_rate": 0.0001999906103561023, "loss": 1.602, "step": 390 }, { "epoch": 0.004373259364479291, "grad_norm": 0.06654093414545059, "learning_rate": 0.00019999056214310733, "loss": 1.5815, "step": 391 }, { "epoch": 0.004384444171038062, "grad_norm": 0.06477198004722595, "learning_rate": 0.0001999905138066554, "loss": 1.5883, "step": 392 }, { "epoch": 0.004395628977596833, "grad_norm": 0.0623844638466835, "learning_rate": 0.00019999046534674656, "loss": 1.5877, "step": 393 }, { "epoch": 0.004406813784155603, "grad_norm": 0.059734586626291275, "learning_rate": 0.00019999041676338077, "loss": 1.5855, "step": 394 }, { "epoch": 0.0044179985907143735, "grad_norm": 0.05187408998608589, "learning_rate": 0.0001999903680565582, "loss": 1.5827, "step": 395 }, { "epoch": 0.004429183397273144, "grad_norm": 0.05175703763961792, "learning_rate": 0.00019999031922627886, "loss": 1.5758, "step": 396 }, { "epoch": 0.004440368203831915, "grad_norm": 0.05059249326586723, "learning_rate": 0.00019999027027254286, "loss": 1.5758, "step": 397 }, { "epoch": 0.004451553010390685, "grad_norm": 0.050220511853694916, "learning_rate": 0.0001999902211953502, "loss": 1.5771, "step": 398 }, { "epoch": 0.004462737816949456, "grad_norm": 0.057194001972675323, "learning_rate": 0.00019999017199470094, "loss": 1.5828, "step": 399 }, { "epoch": 0.004473922623508227, "grad_norm": 0.07026943564414978, "learning_rate": 0.00019999012267059519, "loss": 1.5728, "step": 400 }, { "epoch": 0.004485107430066997, "grad_norm": 0.08094791322946548, "learning_rate": 0.00019999007322303296, "loss": 1.5634, "step": 401 }, { "epoch": 0.004496292236625767, "grad_norm": 0.08370808511972427, "learning_rate": 0.0001999900236520144, "loss": 1.5747, "step": 402 }, { "epoch": 0.004507477043184538, "grad_norm": 0.09409447014331818, "learning_rate": 0.00019998997395753945, "loss": 1.5703, "step": 403 }, { "epoch": 0.004518661849743309, "grad_norm": 0.09207552671432495, "learning_rate": 0.0001999899241396082, "loss": 1.5628, "step": 404 }, { "epoch": 0.004529846656302079, "grad_norm": 0.07077619433403015, "learning_rate": 0.0001999898741982208, "loss": 1.5826, "step": 405 }, { "epoch": 0.0045410314628608495, "grad_norm": 0.06743451207876205, "learning_rate": 0.00019998982413337724, "loss": 1.6047, "step": 406 }, { "epoch": 0.004552216269419621, "grad_norm": 0.10414531826972961, "learning_rate": 0.0001999897739450776, "loss": 1.6005, "step": 407 }, { "epoch": 0.004563401075978391, "grad_norm": 0.10947459191083908, "learning_rate": 0.0001999897236333219, "loss": 1.6025, "step": 408 }, { "epoch": 0.004574585882537161, "grad_norm": 0.06659277528524399, "learning_rate": 0.00019998967319811027, "loss": 1.6059, "step": 409 }, { "epoch": 0.004585770689095932, "grad_norm": 0.06038579344749451, "learning_rate": 0.00019998962263944274, "loss": 1.5826, "step": 410 }, { "epoch": 0.004596955495654703, "grad_norm": 0.08695773035287857, "learning_rate": 0.00019998957195731934, "loss": 1.5779, "step": 411 }, { "epoch": 0.004608140302213473, "grad_norm": 0.07446157187223434, "learning_rate": 0.0001999895211517402, "loss": 1.5812, "step": 412 }, { "epoch": 0.0046193251087722435, "grad_norm": 0.04878260940313339, "learning_rate": 0.00019998947022270534, "loss": 1.6033, "step": 413 }, { "epoch": 0.004630509915331015, "grad_norm": 0.06283459812402725, "learning_rate": 0.00019998941917021484, "loss": 1.5879, "step": 414 }, { "epoch": 0.004641694721889785, "grad_norm": 0.05891675129532814, "learning_rate": 0.00019998936799426874, "loss": 1.59, "step": 415 }, { "epoch": 0.004652879528448555, "grad_norm": 0.04745139181613922, "learning_rate": 0.0001999893166948671, "loss": 1.5981, "step": 416 }, { "epoch": 0.0046640643350073265, "grad_norm": 0.05297010764479637, "learning_rate": 0.00019998926527201003, "loss": 1.5937, "step": 417 }, { "epoch": 0.004675249141566097, "grad_norm": 0.05115678906440735, "learning_rate": 0.00019998921372569757, "loss": 1.5941, "step": 418 }, { "epoch": 0.004686433948124867, "grad_norm": 0.05678752437233925, "learning_rate": 0.00019998916205592974, "loss": 1.5631, "step": 419 }, { "epoch": 0.004697618754683637, "grad_norm": 0.05227034166455269, "learning_rate": 0.00019998911026270668, "loss": 1.5755, "step": 420 }, { "epoch": 0.004708803561242409, "grad_norm": 0.05871938541531563, "learning_rate": 0.0001999890583460284, "loss": 1.5843, "step": 421 }, { "epoch": 0.004719988367801179, "grad_norm": 0.06751812249422073, "learning_rate": 0.00019998900630589493, "loss": 1.5922, "step": 422 }, { "epoch": 0.004731173174359949, "grad_norm": 0.061179131269454956, "learning_rate": 0.00019998895414230646, "loss": 1.5939, "step": 423 }, { "epoch": 0.00474235798091872, "grad_norm": 0.06404510140419006, "learning_rate": 0.00019998890185526292, "loss": 1.5857, "step": 424 }, { "epoch": 0.004753542787477491, "grad_norm": 0.07117751985788345, "learning_rate": 0.0001999888494447645, "loss": 1.5676, "step": 425 }, { "epoch": 0.004764727594036261, "grad_norm": 0.06996233761310577, "learning_rate": 0.00019998879691081114, "loss": 1.5769, "step": 426 }, { "epoch": 0.004775912400595031, "grad_norm": 0.0711674690246582, "learning_rate": 0.00019998874425340298, "loss": 1.5622, "step": 427 }, { "epoch": 0.0047870972071538025, "grad_norm": 0.079228475689888, "learning_rate": 0.0001999886914725401, "loss": 1.571, "step": 428 }, { "epoch": 0.004798282013712573, "grad_norm": 0.08016793429851532, "learning_rate": 0.00019998863856822248, "loss": 1.6029, "step": 429 }, { "epoch": 0.004809466820271343, "grad_norm": 0.0795593336224556, "learning_rate": 0.00019998858554045026, "loss": 1.6062, "step": 430 }, { "epoch": 0.004820651626830114, "grad_norm": 0.06341350823640823, "learning_rate": 0.00019998853238922348, "loss": 1.5884, "step": 431 }, { "epoch": 0.004831836433388885, "grad_norm": 0.05454142764210701, "learning_rate": 0.00019998847911454219, "loss": 1.5921, "step": 432 }, { "epoch": 0.004843021239947655, "grad_norm": 0.0625983327627182, "learning_rate": 0.0001999884257164065, "loss": 1.582, "step": 433 }, { "epoch": 0.004854206046506425, "grad_norm": 0.06116988882422447, "learning_rate": 0.00019998837219481645, "loss": 1.5804, "step": 434 }, { "epoch": 0.004865390853065196, "grad_norm": 0.057872429490089417, "learning_rate": 0.0001999883185497721, "loss": 1.5721, "step": 435 }, { "epoch": 0.004876575659623967, "grad_norm": 0.04977230727672577, "learning_rate": 0.00019998826478127352, "loss": 1.5539, "step": 436 }, { "epoch": 0.004887760466182737, "grad_norm": 0.06137122958898544, "learning_rate": 0.00019998821088932077, "loss": 1.5715, "step": 437 }, { "epoch": 0.004898945272741508, "grad_norm": 0.05595165491104126, "learning_rate": 0.00019998815687391396, "loss": 1.5841, "step": 438 }, { "epoch": 0.0049101300793002785, "grad_norm": 0.049001295119524, "learning_rate": 0.00019998810273505311, "loss": 1.571, "step": 439 }, { "epoch": 0.004921314885859049, "grad_norm": 0.04792420566082001, "learning_rate": 0.00019998804847273828, "loss": 1.5695, "step": 440 }, { "epoch": 0.00493249969241782, "grad_norm": 0.052222106605768204, "learning_rate": 0.00019998799408696956, "loss": 1.5671, "step": 441 }, { "epoch": 0.00494368449897659, "grad_norm": 0.05545085668563843, "learning_rate": 0.00019998793957774703, "loss": 1.5679, "step": 442 }, { "epoch": 0.004954869305535361, "grad_norm": 0.06147260218858719, "learning_rate": 0.00019998788494507075, "loss": 1.5746, "step": 443 }, { "epoch": 0.004966054112094131, "grad_norm": 0.06299655884504318, "learning_rate": 0.00019998783018894073, "loss": 1.5719, "step": 444 }, { "epoch": 0.004977238918652902, "grad_norm": 0.05477641522884369, "learning_rate": 0.00019998777530935713, "loss": 1.5933, "step": 445 }, { "epoch": 0.0049884237252116725, "grad_norm": 0.054924603551626205, "learning_rate": 0.00019998772030631993, "loss": 1.6381, "step": 446 }, { "epoch": 0.004999608531770443, "grad_norm": 0.05982334539294243, "learning_rate": 0.0001999876651798293, "loss": 1.6049, "step": 447 }, { "epoch": 0.005010793338329214, "grad_norm": 0.07177302241325378, "learning_rate": 0.0001999876099298852, "loss": 1.5885, "step": 448 }, { "epoch": 0.005021978144887984, "grad_norm": 0.06020566448569298, "learning_rate": 0.00019998755455648778, "loss": 1.5918, "step": 449 }, { "epoch": 0.005033162951446755, "grad_norm": 0.0725252702832222, "learning_rate": 0.00019998749905963706, "loss": 1.5948, "step": 450 }, { "epoch": 0.005044347758005525, "grad_norm": 0.07799220085144043, "learning_rate": 0.00019998744343933313, "loss": 1.5903, "step": 451 }, { "epoch": 0.005055532564564296, "grad_norm": 0.06732252240180969, "learning_rate": 0.00019998738769557605, "loss": 1.6123, "step": 452 }, { "epoch": 0.005066717371123066, "grad_norm": 0.056653380393981934, "learning_rate": 0.0001999873318283659, "loss": 1.5976, "step": 453 }, { "epoch": 0.005077902177681837, "grad_norm": 0.06148442253470421, "learning_rate": 0.00019998727583770274, "loss": 1.5826, "step": 454 }, { "epoch": 0.005089086984240608, "grad_norm": 0.0657142624258995, "learning_rate": 0.00019998721972358662, "loss": 1.5504, "step": 455 }, { "epoch": 0.005100271790799378, "grad_norm": 0.06259225308895111, "learning_rate": 0.00019998716348601766, "loss": 1.552, "step": 456 }, { "epoch": 0.0051114565973581485, "grad_norm": 0.08781653642654419, "learning_rate": 0.00019998710712499585, "loss": 1.5523, "step": 457 }, { "epoch": 0.005122641403916919, "grad_norm": 0.0888068750500679, "learning_rate": 0.00019998705064052137, "loss": 1.5581, "step": 458 }, { "epoch": 0.00513382621047569, "grad_norm": 0.11727220565080643, "learning_rate": 0.0001999869940325942, "loss": 1.5714, "step": 459 }, { "epoch": 0.00514501101703446, "grad_norm": 0.10440998524427414, "learning_rate": 0.00019998693730121443, "loss": 1.5567, "step": 460 }, { "epoch": 0.005156195823593231, "grad_norm": 0.0791282206773758, "learning_rate": 0.00019998688044638215, "loss": 1.5605, "step": 461 }, { "epoch": 0.005167380630152002, "grad_norm": 0.07670507580041885, "learning_rate": 0.0001999868234680974, "loss": 1.5676, "step": 462 }, { "epoch": 0.005178565436710772, "grad_norm": 0.09401030838489532, "learning_rate": 0.0001999867663663603, "loss": 1.555, "step": 463 }, { "epoch": 0.0051897502432695424, "grad_norm": 0.08279041945934296, "learning_rate": 0.00019998670914117087, "loss": 1.5436, "step": 464 }, { "epoch": 0.005200935049828314, "grad_norm": 0.17070412635803223, "learning_rate": 0.0001999866517925292, "loss": 1.5575, "step": 465 }, { "epoch": 0.005212119856387084, "grad_norm": 0.09067776054143906, "learning_rate": 0.00019998659432043537, "loss": 1.5665, "step": 466 }, { "epoch": 0.005223304662945854, "grad_norm": 0.08899036049842834, "learning_rate": 0.00019998653672488942, "loss": 1.5678, "step": 467 }, { "epoch": 0.005234489469504625, "grad_norm": 0.06300584226846695, "learning_rate": 0.00019998647900589144, "loss": 1.5826, "step": 468 }, { "epoch": 0.005245674276063396, "grad_norm": 0.07066696137189865, "learning_rate": 0.00019998642116344156, "loss": 1.5696, "step": 469 }, { "epoch": 0.005256859082622166, "grad_norm": 0.0663958340883255, "learning_rate": 0.00019998636319753973, "loss": 1.5878, "step": 470 }, { "epoch": 0.005268043889180936, "grad_norm": 0.06905809789896011, "learning_rate": 0.00019998630510818612, "loss": 1.5667, "step": 471 }, { "epoch": 0.0052792286957397076, "grad_norm": 0.05643589049577713, "learning_rate": 0.00019998624689538077, "loss": 1.5742, "step": 472 }, { "epoch": 0.005290413502298478, "grad_norm": 0.05323821306228638, "learning_rate": 0.00019998618855912375, "loss": 1.5965, "step": 473 }, { "epoch": 0.005301598308857248, "grad_norm": 0.07279177010059357, "learning_rate": 0.0001999861300994151, "loss": 1.5954, "step": 474 }, { "epoch": 0.0053127831154160185, "grad_norm": 0.06261339038610458, "learning_rate": 0.00019998607151625497, "loss": 1.5771, "step": 475 }, { "epoch": 0.00532396792197479, "grad_norm": 0.0605684369802475, "learning_rate": 0.00019998601280964335, "loss": 1.5674, "step": 476 }, { "epoch": 0.00533515272853356, "grad_norm": 0.05855708196759224, "learning_rate": 0.0001999859539795804, "loss": 1.5641, "step": 477 }, { "epoch": 0.00534633753509233, "grad_norm": 0.04459947720170021, "learning_rate": 0.0001999858950260661, "loss": 1.5706, "step": 478 }, { "epoch": 0.0053575223416511015, "grad_norm": 0.05174221470952034, "learning_rate": 0.00019998583594910057, "loss": 1.5525, "step": 479 }, { "epoch": 0.005368707148209872, "grad_norm": 0.047726552933454514, "learning_rate": 0.0001999857767486839, "loss": 1.5613, "step": 480 }, { "epoch": 0.005379891954768642, "grad_norm": 0.056866295635700226, "learning_rate": 0.0001999857174248161, "loss": 1.5851, "step": 481 }, { "epoch": 0.005391076761327412, "grad_norm": 0.05624596029520035, "learning_rate": 0.00019998565797749732, "loss": 1.5862, "step": 482 }, { "epoch": 0.005402261567886184, "grad_norm": 0.057655058801174164, "learning_rate": 0.0001999855984067276, "loss": 1.5848, "step": 483 }, { "epoch": 0.005413446374444954, "grad_norm": 0.06511086970567703, "learning_rate": 0.000199985538712507, "loss": 1.6081, "step": 484 }, { "epoch": 0.005424631181003724, "grad_norm": 0.0913616269826889, "learning_rate": 0.0001999854788948356, "loss": 1.5973, "step": 485 }, { "epoch": 0.005435815987562495, "grad_norm": 0.11560064554214478, "learning_rate": 0.00019998541895371345, "loss": 1.5767, "step": 486 }, { "epoch": 0.005447000794121266, "grad_norm": 0.10205356776714325, "learning_rate": 0.00019998535888914073, "loss": 1.5641, "step": 487 }, { "epoch": 0.005458185600680036, "grad_norm": 0.06806248426437378, "learning_rate": 0.00019998529870111735, "loss": 1.5586, "step": 488 }, { "epoch": 0.005469370407238807, "grad_norm": 0.06459362804889679, "learning_rate": 0.00019998523838964355, "loss": 1.5532, "step": 489 }, { "epoch": 0.0054805552137975775, "grad_norm": 0.07120149582624435, "learning_rate": 0.00019998517795471928, "loss": 1.5515, "step": 490 }, { "epoch": 0.005491740020356348, "grad_norm": 0.059738751500844955, "learning_rate": 0.00019998511739634464, "loss": 1.5567, "step": 491 }, { "epoch": 0.005502924826915118, "grad_norm": 0.06517786532640457, "learning_rate": 0.00019998505671451976, "loss": 1.595, "step": 492 }, { "epoch": 0.005514109633473889, "grad_norm": 0.06841952353715897, "learning_rate": 0.0001999849959092447, "loss": 1.5805, "step": 493 }, { "epoch": 0.00552529444003266, "grad_norm": 0.06286690384149551, "learning_rate": 0.0001999849349805195, "loss": 1.571, "step": 494 }, { "epoch": 0.00553647924659143, "grad_norm": 0.0559217631816864, "learning_rate": 0.00019998487392834422, "loss": 1.5569, "step": 495 }, { "epoch": 0.005547664053150201, "grad_norm": 0.0687541738152504, "learning_rate": 0.000199984812752719, "loss": 1.557, "step": 496 }, { "epoch": 0.0055588488597089715, "grad_norm": 0.06690400838851929, "learning_rate": 0.00019998475145364383, "loss": 1.567, "step": 497 }, { "epoch": 0.005570033666267742, "grad_norm": 0.06469254940748215, "learning_rate": 0.00019998469003111892, "loss": 1.5783, "step": 498 }, { "epoch": 0.005581218472826512, "grad_norm": 0.06279771029949188, "learning_rate": 0.0001999846284851442, "loss": 1.588, "step": 499 }, { "epoch": 0.005592403279385283, "grad_norm": 0.05500609427690506, "learning_rate": 0.00019998456681571982, "loss": 1.5775, "step": 500 }, { "epoch": 0.005603588085944054, "grad_norm": 0.05660603567957878, "learning_rate": 0.00019998450502284584, "loss": 1.5847, "step": 501 }, { "epoch": 0.005614772892502824, "grad_norm": 0.06350179761648178, "learning_rate": 0.00019998444310652237, "loss": 1.5859, "step": 502 }, { "epoch": 0.005625957699061595, "grad_norm": 0.06947285681962967, "learning_rate": 0.00019998438106674945, "loss": 1.6142, "step": 503 }, { "epoch": 0.005637142505620365, "grad_norm": 0.08556090295314789, "learning_rate": 0.00019998431890352712, "loss": 1.5949, "step": 504 }, { "epoch": 0.005648327312179136, "grad_norm": 0.10163229703903198, "learning_rate": 0.00019998425661685553, "loss": 1.5819, "step": 505 }, { "epoch": 0.005659512118737906, "grad_norm": 0.11865612119436264, "learning_rate": 0.00019998419420673476, "loss": 1.5799, "step": 506 }, { "epoch": 0.005670696925296677, "grad_norm": 0.12710048258304596, "learning_rate": 0.0001999841316731648, "loss": 1.5837, "step": 507 }, { "epoch": 0.0056818817318554475, "grad_norm": 0.10893180966377258, "learning_rate": 0.00019998406901614583, "loss": 1.5547, "step": 508 }, { "epoch": 0.005693066538414218, "grad_norm": 0.06662983447313309, "learning_rate": 0.00019998400623567788, "loss": 1.5412, "step": 509 }, { "epoch": 0.005704251344972989, "grad_norm": 0.06717602163553238, "learning_rate": 0.000199983943331761, "loss": 1.5471, "step": 510 }, { "epoch": 0.005715436151531759, "grad_norm": 0.08228597790002823, "learning_rate": 0.0001999838803043953, "loss": 1.5446, "step": 511 }, { "epoch": 0.00572662095809053, "grad_norm": 0.07614196836948395, "learning_rate": 0.00019998381715358084, "loss": 1.5574, "step": 512 }, { "epoch": 0.005737805764649301, "grad_norm": 0.06075645610690117, "learning_rate": 0.00019998375387931774, "loss": 1.5597, "step": 513 }, { "epoch": 0.005748990571208071, "grad_norm": 0.05882800742983818, "learning_rate": 0.00019998369048160604, "loss": 1.5559, "step": 514 }, { "epoch": 0.0057601753777668414, "grad_norm": 0.07097506523132324, "learning_rate": 0.0001999836269604458, "loss": 1.5648, "step": 515 }, { "epoch": 0.005771360184325612, "grad_norm": 0.06486310064792633, "learning_rate": 0.00019998356331583716, "loss": 1.5735, "step": 516 }, { "epoch": 0.005782544990884383, "grad_norm": 0.05333361402153969, "learning_rate": 0.00019998349954778016, "loss": 1.567, "step": 517 }, { "epoch": 0.005793729797443153, "grad_norm": 0.07817003130912781, "learning_rate": 0.00019998343565627488, "loss": 1.5902, "step": 518 }, { "epoch": 0.0058049146040019236, "grad_norm": 0.07619974762201309, "learning_rate": 0.00019998337164132138, "loss": 1.5819, "step": 519 }, { "epoch": 0.005816099410560695, "grad_norm": 0.06044092774391174, "learning_rate": 0.0001999833075029198, "loss": 1.6163, "step": 520 }, { "epoch": 0.005827284217119465, "grad_norm": 0.06666608154773712, "learning_rate": 0.00019998324324107015, "loss": 1.5977, "step": 521 }, { "epoch": 0.005838469023678235, "grad_norm": 0.06902644038200378, "learning_rate": 0.00019998317885577254, "loss": 1.5818, "step": 522 }, { "epoch": 0.005849653830237006, "grad_norm": 0.05606195330619812, "learning_rate": 0.00019998311434702703, "loss": 1.5784, "step": 523 }, { "epoch": 0.005860838636795777, "grad_norm": 0.08465290814638138, "learning_rate": 0.00019998304971483374, "loss": 1.5653, "step": 524 }, { "epoch": 0.005872023443354547, "grad_norm": 0.08544803410768509, "learning_rate": 0.00019998298495919274, "loss": 1.5616, "step": 525 }, { "epoch": 0.0058832082499133175, "grad_norm": 0.06527858972549438, "learning_rate": 0.0001999829200801041, "loss": 1.5879, "step": 526 }, { "epoch": 0.005894393056472089, "grad_norm": 0.07150562107563019, "learning_rate": 0.00019998285507756789, "loss": 1.5642, "step": 527 }, { "epoch": 0.005905577863030859, "grad_norm": 0.08152669668197632, "learning_rate": 0.00019998278995158418, "loss": 1.5744, "step": 528 }, { "epoch": 0.005916762669589629, "grad_norm": 0.09149385243654251, "learning_rate": 0.0001999827247021531, "loss": 1.5715, "step": 529 }, { "epoch": 0.0059279474761484, "grad_norm": 0.08523254096508026, "learning_rate": 0.00019998265932927466, "loss": 1.5822, "step": 530 }, { "epoch": 0.005939132282707171, "grad_norm": 0.062498655170202255, "learning_rate": 0.000199982593832949, "loss": 1.5862, "step": 531 }, { "epoch": 0.005950317089265941, "grad_norm": 0.07431355118751526, "learning_rate": 0.0001999825282131762, "loss": 1.574, "step": 532 }, { "epoch": 0.005961501895824711, "grad_norm": 0.0720391795039177, "learning_rate": 0.00019998246246995632, "loss": 1.57, "step": 533 }, { "epoch": 0.005972686702383483, "grad_norm": 0.06915175914764404, "learning_rate": 0.00019998239660328943, "loss": 1.5721, "step": 534 }, { "epoch": 0.005983871508942253, "grad_norm": 0.058932509273290634, "learning_rate": 0.00019998233061317561, "loss": 1.583, "step": 535 }, { "epoch": 0.005995056315501023, "grad_norm": 0.05271697789430618, "learning_rate": 0.000199982264499615, "loss": 1.5762, "step": 536 }, { "epoch": 0.006006241122059794, "grad_norm": 0.05659927427768707, "learning_rate": 0.0001999821982626076, "loss": 1.5718, "step": 537 }, { "epoch": 0.006017425928618565, "grad_norm": 0.05749264359474182, "learning_rate": 0.00019998213190215353, "loss": 1.5897, "step": 538 }, { "epoch": 0.006028610735177335, "grad_norm": 0.06748930364847183, "learning_rate": 0.0001999820654182529, "loss": 1.5844, "step": 539 }, { "epoch": 0.006039795541736105, "grad_norm": 0.06751269847154617, "learning_rate": 0.0001999819988109057, "loss": 1.5926, "step": 540 }, { "epoch": 0.0060509803482948765, "grad_norm": 0.06434184312820435, "learning_rate": 0.00019998193208011213, "loss": 1.5742, "step": 541 }, { "epoch": 0.006062165154853647, "grad_norm": 0.06833555549383163, "learning_rate": 0.0001999818652258722, "loss": 1.5818, "step": 542 }, { "epoch": 0.006073349961412417, "grad_norm": 0.06202944368124008, "learning_rate": 0.000199981798248186, "loss": 1.5806, "step": 543 }, { "epoch": 0.006084534767971188, "grad_norm": 0.06603850424289703, "learning_rate": 0.00019998173114705366, "loss": 1.5892, "step": 544 }, { "epoch": 0.006095719574529959, "grad_norm": 0.04424119368195534, "learning_rate": 0.00019998166392247522, "loss": 1.6017, "step": 545 }, { "epoch": 0.006106904381088729, "grad_norm": 0.04886182025074959, "learning_rate": 0.00019998159657445074, "loss": 1.5971, "step": 546 }, { "epoch": 0.006118089187647499, "grad_norm": 0.055009353905916214, "learning_rate": 0.00019998152910298035, "loss": 1.5942, "step": 547 }, { "epoch": 0.0061292739942062704, "grad_norm": 0.04949553683400154, "learning_rate": 0.00019998146150806411, "loss": 1.5851, "step": 548 }, { "epoch": 0.006140458800765041, "grad_norm": 0.05669408291578293, "learning_rate": 0.0001999813937897021, "loss": 1.5565, "step": 549 }, { "epoch": 0.006151643607323811, "grad_norm": 0.052044518291950226, "learning_rate": 0.00019998132594789444, "loss": 1.5652, "step": 550 }, { "epoch": 0.006162828413882582, "grad_norm": 0.055031049996614456, "learning_rate": 0.00019998125798264117, "loss": 1.5509, "step": 551 }, { "epoch": 0.006174013220441353, "grad_norm": 0.0653780847787857, "learning_rate": 0.0001999811898939424, "loss": 1.5672, "step": 552 }, { "epoch": 0.006185198027000123, "grad_norm": 0.06647983938455582, "learning_rate": 0.00019998112168179822, "loss": 1.5744, "step": 553 }, { "epoch": 0.006196382833558893, "grad_norm": 0.062013398855924606, "learning_rate": 0.00019998105334620867, "loss": 1.5545, "step": 554 }, { "epoch": 0.006207567640117664, "grad_norm": 0.04633625969290733, "learning_rate": 0.00019998098488717384, "loss": 1.5643, "step": 555 }, { "epoch": 0.006218752446676435, "grad_norm": 0.04178265854716301, "learning_rate": 0.00019998091630469387, "loss": 1.5484, "step": 556 }, { "epoch": 0.006229937253235205, "grad_norm": 0.04647913947701454, "learning_rate": 0.00019998084759876883, "loss": 1.554, "step": 557 }, { "epoch": 0.006241122059793976, "grad_norm": 0.05254526063799858, "learning_rate": 0.00019998077876939876, "loss": 1.5671, "step": 558 }, { "epoch": 0.0062523068663527465, "grad_norm": 0.054666776210069656, "learning_rate": 0.00019998070981658376, "loss": 1.5709, "step": 559 }, { "epoch": 0.006263491672911517, "grad_norm": 0.04982587322592735, "learning_rate": 0.00019998064074032396, "loss": 1.5645, "step": 560 }, { "epoch": 0.006274676479470288, "grad_norm": 0.05644576624035835, "learning_rate": 0.00019998057154061938, "loss": 1.5512, "step": 561 }, { "epoch": 0.006285861286029058, "grad_norm": 0.05311651527881622, "learning_rate": 0.00019998050221747016, "loss": 1.5399, "step": 562 }, { "epoch": 0.006297046092587829, "grad_norm": 0.05626964941620827, "learning_rate": 0.00019998043277087634, "loss": 1.5389, "step": 563 }, { "epoch": 0.006308230899146599, "grad_norm": 0.06463497877120972, "learning_rate": 0.00019998036320083808, "loss": 1.549, "step": 564 }, { "epoch": 0.00631941570570537, "grad_norm": 0.07779069244861603, "learning_rate": 0.00019998029350735538, "loss": 1.5591, "step": 565 }, { "epoch": 0.00633060051226414, "grad_norm": 0.10419348627328873, "learning_rate": 0.00019998022369042837, "loss": 1.5669, "step": 566 }, { "epoch": 0.006341785318822911, "grad_norm": 0.11543929576873779, "learning_rate": 0.00019998015375005709, "loss": 1.5669, "step": 567 }, { "epoch": 0.006352970125381682, "grad_norm": 0.08400971442461014, "learning_rate": 0.0001999800836862417, "loss": 1.589, "step": 568 }, { "epoch": 0.006364154931940452, "grad_norm": 0.056216295808553696, "learning_rate": 0.00019998001349898225, "loss": 1.5781, "step": 569 }, { "epoch": 0.0063753397384992225, "grad_norm": 0.07171747833490372, "learning_rate": 0.0001999799431882788, "loss": 1.5722, "step": 570 }, { "epoch": 0.006386524545057993, "grad_norm": 0.07911943644285202, "learning_rate": 0.0001999798727541315, "loss": 1.5617, "step": 571 }, { "epoch": 0.006397709351616764, "grad_norm": 0.07471580803394318, "learning_rate": 0.0001999798021965404, "loss": 1.5674, "step": 572 }, { "epoch": 0.006408894158175534, "grad_norm": 0.06016454100608826, "learning_rate": 0.00019997973151550556, "loss": 1.589, "step": 573 }, { "epoch": 0.006420078964734305, "grad_norm": 0.06692295521497726, "learning_rate": 0.00019997966071102713, "loss": 1.5721, "step": 574 }, { "epoch": 0.006431263771293076, "grad_norm": 0.06640581041574478, "learning_rate": 0.00019997958978310514, "loss": 1.5781, "step": 575 }, { "epoch": 0.006442448577851846, "grad_norm": 0.058826372027397156, "learning_rate": 0.0001999795187317397, "loss": 1.5666, "step": 576 }, { "epoch": 0.0064536333844106165, "grad_norm": 0.055648185312747955, "learning_rate": 0.0001999794475569309, "loss": 1.5707, "step": 577 }, { "epoch": 0.006464818190969387, "grad_norm": 0.058248959481716156, "learning_rate": 0.00019997937625867884, "loss": 1.57, "step": 578 }, { "epoch": 0.006476002997528158, "grad_norm": 0.05667665973305702, "learning_rate": 0.00019997930483698357, "loss": 1.5715, "step": 579 }, { "epoch": 0.006487187804086928, "grad_norm": 0.051860544830560684, "learning_rate": 0.00019997923329184524, "loss": 1.5875, "step": 580 }, { "epoch": 0.006498372610645699, "grad_norm": 0.05429021269083023, "learning_rate": 0.00019997916162326385, "loss": 1.606, "step": 581 }, { "epoch": 0.00650955741720447, "grad_norm": 0.055650923401117325, "learning_rate": 0.00019997908983123956, "loss": 1.6024, "step": 582 }, { "epoch": 0.00652074222376324, "grad_norm": 0.061447255313396454, "learning_rate": 0.00019997901791577244, "loss": 1.5888, "step": 583 }, { "epoch": 0.00653192703032201, "grad_norm": 0.06065785884857178, "learning_rate": 0.00019997894587686255, "loss": 1.5739, "step": 584 }, { "epoch": 0.006543111836880782, "grad_norm": 0.07358521968126297, "learning_rate": 0.00019997887371451002, "loss": 1.5682, "step": 585 }, { "epoch": 0.006554296643439552, "grad_norm": 0.08286885917186737, "learning_rate": 0.00019997880142871494, "loss": 1.5702, "step": 586 }, { "epoch": 0.006565481449998322, "grad_norm": 0.09056065231561661, "learning_rate": 0.0001999787290194774, "loss": 1.5822, "step": 587 }, { "epoch": 0.0065766662565570925, "grad_norm": 0.08298853039741516, "learning_rate": 0.00019997865648679745, "loss": 1.5818, "step": 588 }, { "epoch": 0.006587851063115864, "grad_norm": 0.08499585837125778, "learning_rate": 0.00019997858383067517, "loss": 1.5775, "step": 589 }, { "epoch": 0.006599035869674634, "grad_norm": 0.08271525800228119, "learning_rate": 0.00019997851105111073, "loss": 1.5756, "step": 590 }, { "epoch": 0.006610220676233404, "grad_norm": 0.07318850606679916, "learning_rate": 0.00019997843814810416, "loss": 1.5674, "step": 591 }, { "epoch": 0.0066214054827921755, "grad_norm": 0.07372857630252838, "learning_rate": 0.00019997836512165558, "loss": 1.5589, "step": 592 }, { "epoch": 0.006632590289350946, "grad_norm": 0.09608045220375061, "learning_rate": 0.00019997829197176503, "loss": 1.5483, "step": 593 }, { "epoch": 0.006643775095909716, "grad_norm": 0.13775509595870972, "learning_rate": 0.00019997821869843264, "loss": 1.5534, "step": 594 }, { "epoch": 0.0066549599024684864, "grad_norm": 0.1282949000597, "learning_rate": 0.00019997814530165847, "loss": 1.5707, "step": 595 }, { "epoch": 0.006666144709027258, "grad_norm": 0.09030576795339584, "learning_rate": 0.00019997807178144268, "loss": 1.5759, "step": 596 }, { "epoch": 0.006677329515586028, "grad_norm": 0.08960919827222824, "learning_rate": 0.00019997799813778531, "loss": 1.5747, "step": 597 }, { "epoch": 0.006688514322144798, "grad_norm": 0.08592968434095383, "learning_rate": 0.00019997792437068644, "loss": 1.5837, "step": 598 }, { "epoch": 0.0066996991287035694, "grad_norm": 0.07291566580533981, "learning_rate": 0.00019997785048014616, "loss": 1.5797, "step": 599 }, { "epoch": 0.00671088393526234, "grad_norm": 0.07706471532583237, "learning_rate": 0.0001999777764661646, "loss": 1.5715, "step": 600 }, { "epoch": 0.00672206874182111, "grad_norm": 0.06954386830329895, "learning_rate": 0.00019997770232874182, "loss": 1.563, "step": 601 }, { "epoch": 0.00673325354837988, "grad_norm": 0.06999648362398148, "learning_rate": 0.00019997762806787792, "loss": 1.5717, "step": 602 }, { "epoch": 0.0067444383549386516, "grad_norm": 0.05400196090340614, "learning_rate": 0.00019997755368357298, "loss": 1.5862, "step": 603 }, { "epoch": 0.006755623161497422, "grad_norm": 0.06418072432279587, "learning_rate": 0.00019997747917582714, "loss": 1.5908, "step": 604 }, { "epoch": 0.006766807968056192, "grad_norm": 0.05838518589735031, "learning_rate": 0.00019997740454464044, "loss": 1.5718, "step": 605 }, { "epoch": 0.006777992774614963, "grad_norm": 0.05882187560200691, "learning_rate": 0.00019997732979001298, "loss": 1.5769, "step": 606 }, { "epoch": 0.006789177581173734, "grad_norm": 0.05842543765902519, "learning_rate": 0.00019997725491194482, "loss": 1.5746, "step": 607 }, { "epoch": 0.006800362387732504, "grad_norm": 0.0527958944439888, "learning_rate": 0.00019997717991043616, "loss": 1.5682, "step": 608 }, { "epoch": 0.006811547194291274, "grad_norm": 0.06266690045595169, "learning_rate": 0.00019997710478548698, "loss": 1.572, "step": 609 }, { "epoch": 0.0068227320008500455, "grad_norm": 0.05554317682981491, "learning_rate": 0.00019997702953709746, "loss": 1.583, "step": 610 }, { "epoch": 0.006833916807408816, "grad_norm": 0.04651861637830734, "learning_rate": 0.00019997695416526761, "loss": 1.5738, "step": 611 }, { "epoch": 0.006845101613967586, "grad_norm": 0.053717561066150665, "learning_rate": 0.0001999768786699976, "loss": 1.5766, "step": 612 }, { "epoch": 0.006856286420526357, "grad_norm": 0.050605516880750656, "learning_rate": 0.0001999768030512875, "loss": 1.606, "step": 613 }, { "epoch": 0.006867471227085128, "grad_norm": 0.054307371377944946, "learning_rate": 0.00019997672730913735, "loss": 1.6187, "step": 614 }, { "epoch": 0.006878656033643898, "grad_norm": 0.06506580859422684, "learning_rate": 0.00019997665144354728, "loss": 1.6113, "step": 615 }, { "epoch": 0.006889840840202669, "grad_norm": 0.06480210274457932, "learning_rate": 0.00019997657545451744, "loss": 1.594, "step": 616 }, { "epoch": 0.006901025646761439, "grad_norm": 0.04906410723924637, "learning_rate": 0.00019997649934204784, "loss": 1.5809, "step": 617 }, { "epoch": 0.00691221045332021, "grad_norm": 0.05194586515426636, "learning_rate": 0.00019997642310613857, "loss": 1.5913, "step": 618 }, { "epoch": 0.00692339525987898, "grad_norm": 0.05839546024799347, "learning_rate": 0.0001999763467467898, "loss": 1.5746, "step": 619 }, { "epoch": 0.006934580066437751, "grad_norm": 0.06750357896089554, "learning_rate": 0.0001999762702640016, "loss": 1.575, "step": 620 }, { "epoch": 0.0069457648729965215, "grad_norm": 0.07982991635799408, "learning_rate": 0.00019997619365777402, "loss": 1.576, "step": 621 }, { "epoch": 0.006956949679555292, "grad_norm": 0.08816216886043549, "learning_rate": 0.00019997611692810718, "loss": 1.5744, "step": 622 }, { "epoch": 0.006968134486114063, "grad_norm": 0.09619053453207016, "learning_rate": 0.0001999760400750012, "loss": 1.5791, "step": 623 }, { "epoch": 0.006979319292672833, "grad_norm": 0.09412987530231476, "learning_rate": 0.00019997596309845612, "loss": 1.5799, "step": 624 }, { "epoch": 0.006990504099231604, "grad_norm": 0.09227743744850159, "learning_rate": 0.0001999758859984721, "loss": 1.574, "step": 625 }, { "epoch": 0.007001688905790374, "grad_norm": 0.07984034717082977, "learning_rate": 0.00019997580877504918, "loss": 1.5612, "step": 626 }, { "epoch": 0.007012873712349145, "grad_norm": 0.05941009521484375, "learning_rate": 0.00019997573142818752, "loss": 1.5655, "step": 627 }, { "epoch": 0.0070240585189079155, "grad_norm": 0.06787893921136856, "learning_rate": 0.0001999756539578871, "loss": 1.5662, "step": 628 }, { "epoch": 0.007035243325466686, "grad_norm": 0.07556013017892838, "learning_rate": 0.00019997557636414816, "loss": 1.5877, "step": 629 }, { "epoch": 0.007046428132025457, "grad_norm": 0.06565730273723602, "learning_rate": 0.0001999754986469707, "loss": 1.5701, "step": 630 }, { "epoch": 0.007057612938584227, "grad_norm": 0.05801456421613693, "learning_rate": 0.00019997542080635482, "loss": 1.5604, "step": 631 }, { "epoch": 0.007068797745142998, "grad_norm": 0.058777451515197754, "learning_rate": 0.00019997534284230066, "loss": 1.5695, "step": 632 }, { "epoch": 0.007079982551701768, "grad_norm": 0.0650622621178627, "learning_rate": 0.0001999752647548083, "loss": 1.5612, "step": 633 }, { "epoch": 0.007091167358260539, "grad_norm": 0.05834876000881195, "learning_rate": 0.00019997518654387783, "loss": 1.5617, "step": 634 }, { "epoch": 0.007102352164819309, "grad_norm": 0.06384813785552979, "learning_rate": 0.00019997510820950933, "loss": 1.5567, "step": 635 }, { "epoch": 0.00711353697137808, "grad_norm": 0.05401776731014252, "learning_rate": 0.00019997502975170291, "loss": 1.5558, "step": 636 }, { "epoch": 0.007124721777936851, "grad_norm": 0.06590646505355835, "learning_rate": 0.00019997495117045867, "loss": 1.5474, "step": 637 }, { "epoch": 0.007135906584495621, "grad_norm": 0.0560823492705822, "learning_rate": 0.00019997487246577674, "loss": 1.5404, "step": 638 }, { "epoch": 0.0071470913910543915, "grad_norm": 0.0544624887406826, "learning_rate": 0.00019997479363765717, "loss": 1.5321, "step": 639 }, { "epoch": 0.007158276197613163, "grad_norm": 0.04914103075861931, "learning_rate": 0.00019997471468610005, "loss": 1.5395, "step": 640 }, { "epoch": 0.007169461004171933, "grad_norm": 0.0481346994638443, "learning_rate": 0.00019997463561110553, "loss": 1.5513, "step": 641 }, { "epoch": 0.007180645810730703, "grad_norm": 0.04910167306661606, "learning_rate": 0.00019997455641267367, "loss": 1.5436, "step": 642 }, { "epoch": 0.007191830617289474, "grad_norm": 0.05214869976043701, "learning_rate": 0.00019997447709080456, "loss": 1.5589, "step": 643 }, { "epoch": 0.007203015423848245, "grad_norm": 0.06242618337273598, "learning_rate": 0.00019997439764549832, "loss": 1.5395, "step": 644 }, { "epoch": 0.007214200230407015, "grad_norm": 0.07024102658033371, "learning_rate": 0.00019997431807675505, "loss": 1.5624, "step": 645 }, { "epoch": 0.0072253850369657854, "grad_norm": 0.07082174718379974, "learning_rate": 0.0001999742383845748, "loss": 1.5505, "step": 646 }, { "epoch": 0.007236569843524557, "grad_norm": 0.06821414083242416, "learning_rate": 0.00019997415856895775, "loss": 1.5489, "step": 647 }, { "epoch": 0.007247754650083327, "grad_norm": 0.062424443662166595, "learning_rate": 0.00019997407862990395, "loss": 1.5341, "step": 648 }, { "epoch": 0.007258939456642097, "grad_norm": 0.05251247063279152, "learning_rate": 0.00019997399856741348, "loss": 1.5257, "step": 649 }, { "epoch": 0.0072701242632008676, "grad_norm": 0.04626723378896713, "learning_rate": 0.0001999739183814865, "loss": 1.5387, "step": 650 }, { "epoch": 0.007281309069759639, "grad_norm": 0.05231785774230957, "learning_rate": 0.00019997383807212306, "loss": 1.5378, "step": 651 }, { "epoch": 0.007292493876318409, "grad_norm": 0.06309188902378082, "learning_rate": 0.00019997375763932323, "loss": 1.5531, "step": 652 }, { "epoch": 0.007303678682877179, "grad_norm": 0.05309786647558212, "learning_rate": 0.0001999736770830872, "loss": 1.5694, "step": 653 }, { "epoch": 0.0073148634894359505, "grad_norm": 0.04923882707953453, "learning_rate": 0.000199973596403415, "loss": 1.5837, "step": 654 }, { "epoch": 0.007326048295994721, "grad_norm": 0.05534524843096733, "learning_rate": 0.00019997351560030677, "loss": 1.5918, "step": 655 }, { "epoch": 0.007337233102553491, "grad_norm": 0.1240246444940567, "learning_rate": 0.00019997343467376258, "loss": 1.5881, "step": 656 }, { "epoch": 0.0073484179091122615, "grad_norm": 0.06112068518996239, "learning_rate": 0.00019997335362378254, "loss": 1.5761, "step": 657 }, { "epoch": 0.007359602715671033, "grad_norm": 0.06589160114526749, "learning_rate": 0.00019997327245036673, "loss": 1.5641, "step": 658 }, { "epoch": 0.007370787522229803, "grad_norm": 0.060181207954883575, "learning_rate": 0.0001999731911535153, "loss": 1.5573, "step": 659 }, { "epoch": 0.007381972328788573, "grad_norm": 0.06380990892648697, "learning_rate": 0.0001999731097332283, "loss": 1.5624, "step": 660 }, { "epoch": 0.0073931571353473445, "grad_norm": 0.06176357343792915, "learning_rate": 0.00019997302818950584, "loss": 1.5499, "step": 661 }, { "epoch": 0.007404341941906115, "grad_norm": 0.055721499025821686, "learning_rate": 0.00019997294652234805, "loss": 1.557, "step": 662 }, { "epoch": 0.007415526748464885, "grad_norm": 0.051978956907987595, "learning_rate": 0.000199972864731755, "loss": 1.5623, "step": 663 }, { "epoch": 0.007426711555023656, "grad_norm": 0.04754827544093132, "learning_rate": 0.00019997278281772682, "loss": 1.5465, "step": 664 }, { "epoch": 0.007437896361582427, "grad_norm": 0.0538279265165329, "learning_rate": 0.0001999727007802636, "loss": 1.5326, "step": 665 }, { "epoch": 0.007449081168141197, "grad_norm": 0.0629352405667305, "learning_rate": 0.00019997261861936543, "loss": 1.5365, "step": 666 }, { "epoch": 0.007460265974699967, "grad_norm": 0.06892745941877365, "learning_rate": 0.00019997253633503238, "loss": 1.5607, "step": 667 }, { "epoch": 0.007471450781258738, "grad_norm": 0.07525767385959625, "learning_rate": 0.00019997245392726465, "loss": 1.5728, "step": 668 }, { "epoch": 0.007482635587817509, "grad_norm": 0.10010071098804474, "learning_rate": 0.00019997237139606224, "loss": 1.559, "step": 669 }, { "epoch": 0.007493820394376279, "grad_norm": 0.12011202424764633, "learning_rate": 0.0001999722887414253, "loss": 1.5694, "step": 670 }, { "epoch": 0.00750500520093505, "grad_norm": 0.12278566509485245, "learning_rate": 0.00019997220596335393, "loss": 1.5939, "step": 671 }, { "epoch": 0.0075161900074938205, "grad_norm": 0.10163605213165283, "learning_rate": 0.00019997212306184823, "loss": 1.5722, "step": 672 }, { "epoch": 0.007527374814052591, "grad_norm": 0.09386469423770905, "learning_rate": 0.00019997204003690828, "loss": 1.5748, "step": 673 }, { "epoch": 0.007538559620611361, "grad_norm": 0.1031983494758606, "learning_rate": 0.00019997195688853422, "loss": 1.5778, "step": 674 }, { "epoch": 0.007549744427170132, "grad_norm": 0.09082422405481339, "learning_rate": 0.00019997187361672615, "loss": 1.5904, "step": 675 }, { "epoch": 0.007560929233728903, "grad_norm": 0.05761239677667618, "learning_rate": 0.0001999717902214841, "loss": 1.5753, "step": 676 }, { "epoch": 0.007572114040287673, "grad_norm": 0.0772882029414177, "learning_rate": 0.0001999717067028083, "loss": 1.5631, "step": 677 }, { "epoch": 0.007583298846846444, "grad_norm": 0.09266892075538635, "learning_rate": 0.00019997162306069875, "loss": 1.5798, "step": 678 }, { "epoch": 0.0075944836534052144, "grad_norm": 0.07755053043365479, "learning_rate": 0.00019997153929515558, "loss": 1.5969, "step": 679 }, { "epoch": 0.007605668459963985, "grad_norm": 0.061833951622247696, "learning_rate": 0.0001999714554061789, "loss": 1.5781, "step": 680 }, { "epoch": 0.007616853266522755, "grad_norm": 0.07911964505910873, "learning_rate": 0.00019997137139376883, "loss": 1.611, "step": 681 }, { "epoch": 0.007628038073081526, "grad_norm": 0.07502644509077072, "learning_rate": 0.00019997128725792544, "loss": 1.6201, "step": 682 }, { "epoch": 0.007639222879640297, "grad_norm": 0.11084458976984024, "learning_rate": 0.00019997120299864886, "loss": 1.5882, "step": 683 }, { "epoch": 0.007650407686199067, "grad_norm": 0.1428053230047226, "learning_rate": 0.00019997111861593921, "loss": 1.5737, "step": 684 }, { "epoch": 0.007661592492757838, "grad_norm": 0.1456058770418167, "learning_rate": 0.00019997103410979652, "loss": 1.583, "step": 685 }, { "epoch": 0.007672777299316608, "grad_norm": 0.10741148889064789, "learning_rate": 0.00019997094948022098, "loss": 1.5736, "step": 686 }, { "epoch": 0.007683962105875379, "grad_norm": 0.08248301595449448, "learning_rate": 0.00019997086472721263, "loss": 1.5559, "step": 687 }, { "epoch": 0.00769514691243415, "grad_norm": 0.09595336019992828, "learning_rate": 0.00019997077985077163, "loss": 1.5513, "step": 688 }, { "epoch": 0.00770633171899292, "grad_norm": 0.06806618720293045, "learning_rate": 0.00019997069485089804, "loss": 1.5624, "step": 689 }, { "epoch": 0.0077175165255516905, "grad_norm": 0.07510481029748917, "learning_rate": 0.00019997060972759198, "loss": 1.5401, "step": 690 }, { "epoch": 0.007728701332110461, "grad_norm": 0.06557908654212952, "learning_rate": 0.00019997052448085358, "loss": 1.5507, "step": 691 }, { "epoch": 0.007739886138669232, "grad_norm": 0.07286231964826584, "learning_rate": 0.0001999704391106829, "loss": 1.5597, "step": 692 }, { "epoch": 0.007751070945228002, "grad_norm": 0.06357460469007492, "learning_rate": 0.0001999703536170801, "loss": 1.5591, "step": 693 }, { "epoch": 0.007762255751786773, "grad_norm": 0.06700731813907623, "learning_rate": 0.00019997026800004522, "loss": 1.5521, "step": 694 }, { "epoch": 0.007773440558345544, "grad_norm": 0.05840053781867027, "learning_rate": 0.00019997018225957839, "loss": 1.5688, "step": 695 }, { "epoch": 0.007784625364904314, "grad_norm": 0.06327050924301147, "learning_rate": 0.00019997009639567974, "loss": 1.5547, "step": 696 }, { "epoch": 0.007795810171463084, "grad_norm": 0.06035961955785751, "learning_rate": 0.00019997001040834936, "loss": 1.5701, "step": 697 }, { "epoch": 0.007806994978021855, "grad_norm": 0.05573936179280281, "learning_rate": 0.0001999699242975874, "loss": 1.5878, "step": 698 }, { "epoch": 0.007818179784580626, "grad_norm": 0.05611170828342438, "learning_rate": 0.00019996983806339387, "loss": 1.5899, "step": 699 }, { "epoch": 0.007829364591139396, "grad_norm": 0.05826570466160774, "learning_rate": 0.00019996975170576896, "loss": 1.5533, "step": 700 }, { "epoch": 0.007840549397698167, "grad_norm": 0.050937049090862274, "learning_rate": 0.00019996966522471273, "loss": 1.5545, "step": 701 }, { "epoch": 0.007851734204256937, "grad_norm": 0.06593479216098785, "learning_rate": 0.0001999695786202253, "loss": 1.5685, "step": 702 }, { "epoch": 0.007862919010815707, "grad_norm": 0.06405465304851532, "learning_rate": 0.0001999694918923068, "loss": 1.6121, "step": 703 }, { "epoch": 0.00787410381737448, "grad_norm": 0.052820343524217606, "learning_rate": 0.0001999694050409573, "loss": 1.6007, "step": 704 }, { "epoch": 0.00788528862393325, "grad_norm": 0.05512186512351036, "learning_rate": 0.00019996931806617695, "loss": 1.5792, "step": 705 }, { "epoch": 0.00789647343049202, "grad_norm": 0.0432184673845768, "learning_rate": 0.0001999692309679658, "loss": 1.5852, "step": 706 }, { "epoch": 0.00790765823705079, "grad_norm": 0.05248282849788666, "learning_rate": 0.00019996914374632402, "loss": 1.5917, "step": 707 }, { "epoch": 0.00791884304360956, "grad_norm": 0.04413476958870888, "learning_rate": 0.00019996905640125165, "loss": 1.5584, "step": 708 }, { "epoch": 0.00793002785016833, "grad_norm": 0.04878908023238182, "learning_rate": 0.00019996896893274886, "loss": 1.5506, "step": 709 }, { "epoch": 0.007941212656727101, "grad_norm": 0.04344234988093376, "learning_rate": 0.00019996888134081575, "loss": 1.5684, "step": 710 }, { "epoch": 0.007952397463285873, "grad_norm": 0.047158923000097275, "learning_rate": 0.0001999687936254524, "loss": 1.6133, "step": 711 }, { "epoch": 0.007963582269844643, "grad_norm": 0.050282686948776245, "learning_rate": 0.00019996870578665893, "loss": 1.6021, "step": 712 }, { "epoch": 0.007974767076403414, "grad_norm": 0.043916840106248856, "learning_rate": 0.0001999686178244354, "loss": 1.5794, "step": 713 }, { "epoch": 0.007985951882962184, "grad_norm": 0.079423688352108, "learning_rate": 0.00019996852973878205, "loss": 1.5666, "step": 714 }, { "epoch": 0.007997136689520954, "grad_norm": 0.047040194272994995, "learning_rate": 0.00019996844152969884, "loss": 1.5643, "step": 715 }, { "epoch": 0.008008321496079725, "grad_norm": 0.04954817518591881, "learning_rate": 0.00019996835319718596, "loss": 1.5756, "step": 716 }, { "epoch": 0.008019506302638495, "grad_norm": 0.0529201366007328, "learning_rate": 0.00019996826474124352, "loss": 1.5693, "step": 717 }, { "epoch": 0.008030691109197267, "grad_norm": 0.0555887334048748, "learning_rate": 0.00019996817616187162, "loss": 1.5699, "step": 718 }, { "epoch": 0.008041875915756037, "grad_norm": 0.05515376478433609, "learning_rate": 0.00019996808745907036, "loss": 1.5729, "step": 719 }, { "epoch": 0.008053060722314808, "grad_norm": 0.05125884339213371, "learning_rate": 0.0001999679986328398, "loss": 1.5618, "step": 720 }, { "epoch": 0.008064245528873578, "grad_norm": 0.046284329146146774, "learning_rate": 0.0001999679096831802, "loss": 1.5723, "step": 721 }, { "epoch": 0.008075430335432348, "grad_norm": 0.07273488491773605, "learning_rate": 0.0001999678206100915, "loss": 1.5701, "step": 722 }, { "epoch": 0.008086615141991119, "grad_norm": 0.047560565173625946, "learning_rate": 0.0001999677314135739, "loss": 1.5579, "step": 723 }, { "epoch": 0.00809779994854989, "grad_norm": 0.060283761471509933, "learning_rate": 0.00019996764209362749, "loss": 1.5615, "step": 724 }, { "epoch": 0.008108984755108661, "grad_norm": 0.0602620430290699, "learning_rate": 0.00019996755265025236, "loss": 1.5602, "step": 725 }, { "epoch": 0.008120169561667431, "grad_norm": 0.05383098125457764, "learning_rate": 0.00019996746308344868, "loss": 1.5769, "step": 726 }, { "epoch": 0.008131354368226202, "grad_norm": 0.04577267915010452, "learning_rate": 0.0001999673733932165, "loss": 1.619, "step": 727 }, { "epoch": 0.008142539174784972, "grad_norm": 0.04550078883767128, "learning_rate": 0.00019996728357955595, "loss": 1.5907, "step": 728 }, { "epoch": 0.008153723981343742, "grad_norm": 0.050287820398807526, "learning_rate": 0.00019996719364246714, "loss": 1.5809, "step": 729 }, { "epoch": 0.008164908787902513, "grad_norm": 0.05841783806681633, "learning_rate": 0.00019996710358195018, "loss": 1.5521, "step": 730 }, { "epoch": 0.008176093594461285, "grad_norm": 0.07749857753515244, "learning_rate": 0.0001999670133980052, "loss": 1.5848, "step": 731 }, { "epoch": 0.008187278401020055, "grad_norm": 0.08802466839551926, "learning_rate": 0.00019996692309063232, "loss": 1.6046, "step": 732 }, { "epoch": 0.008198463207578825, "grad_norm": 0.09324830025434494, "learning_rate": 0.00019996683265983162, "loss": 1.5969, "step": 733 }, { "epoch": 0.008209648014137596, "grad_norm": 0.07845516502857208, "learning_rate": 0.0001999667421056032, "loss": 1.5831, "step": 734 }, { "epoch": 0.008220832820696366, "grad_norm": 0.06912586092948914, "learning_rate": 0.0001999666514279472, "loss": 1.5706, "step": 735 }, { "epoch": 0.008232017627255136, "grad_norm": 0.0572381317615509, "learning_rate": 0.00019996656062686374, "loss": 1.5609, "step": 736 }, { "epoch": 0.008243202433813906, "grad_norm": 0.06219245865941048, "learning_rate": 0.00019996646970235287, "loss": 1.5541, "step": 737 }, { "epoch": 0.008254387240372679, "grad_norm": 0.0628521591424942, "learning_rate": 0.0001999663786544148, "loss": 1.5556, "step": 738 }, { "epoch": 0.008265572046931449, "grad_norm": 0.06389934569597244, "learning_rate": 0.0001999662874830496, "loss": 1.5235, "step": 739 }, { "epoch": 0.00827675685349022, "grad_norm": 0.052320901304483414, "learning_rate": 0.00019996619618825733, "loss": 1.539, "step": 740 }, { "epoch": 0.00828794166004899, "grad_norm": 0.05470295995473862, "learning_rate": 0.00019996610477003817, "loss": 1.5415, "step": 741 }, { "epoch": 0.00829912646660776, "grad_norm": 0.06103771552443504, "learning_rate": 0.00019996601322839222, "loss": 1.5422, "step": 742 }, { "epoch": 0.00831031127316653, "grad_norm": 0.06434791535139084, "learning_rate": 0.00019996592156331958, "loss": 1.5527, "step": 743 }, { "epoch": 0.0083214960797253, "grad_norm": 0.06087024137377739, "learning_rate": 0.00019996582977482036, "loss": 1.562, "step": 744 }, { "epoch": 0.008332680886284072, "grad_norm": 0.060757141560316086, "learning_rate": 0.00019996573786289465, "loss": 1.5641, "step": 745 }, { "epoch": 0.008343865692842843, "grad_norm": 0.07097544521093369, "learning_rate": 0.00019996564582754265, "loss": 1.542, "step": 746 }, { "epoch": 0.008355050499401613, "grad_norm": 0.07591135054826736, "learning_rate": 0.00019996555366876437, "loss": 1.5557, "step": 747 }, { "epoch": 0.008366235305960383, "grad_norm": 0.07860101759433746, "learning_rate": 0.00019996546138655998, "loss": 1.5592, "step": 748 }, { "epoch": 0.008377420112519154, "grad_norm": 0.08454012125730515, "learning_rate": 0.00019996536898092958, "loss": 1.5428, "step": 749 }, { "epoch": 0.008388604919077924, "grad_norm": 0.08686886727809906, "learning_rate": 0.0001999652764518733, "loss": 1.5604, "step": 750 }, { "epoch": 0.008399789725636694, "grad_norm": 0.07752903550863266, "learning_rate": 0.00019996518379939126, "loss": 1.5663, "step": 751 }, { "epoch": 0.008410974532195466, "grad_norm": 0.07272690534591675, "learning_rate": 0.00019996509102348356, "loss": 1.5463, "step": 752 }, { "epoch": 0.008422159338754237, "grad_norm": 0.07069668918848038, "learning_rate": 0.00019996499812415026, "loss": 1.5403, "step": 753 }, { "epoch": 0.008433344145313007, "grad_norm": 0.06617298722267151, "learning_rate": 0.00019996490510139155, "loss": 1.5452, "step": 754 }, { "epoch": 0.008444528951871777, "grad_norm": 0.06795412302017212, "learning_rate": 0.00019996481195520756, "loss": 1.5355, "step": 755 }, { "epoch": 0.008455713758430548, "grad_norm": 0.06670048087835312, "learning_rate": 0.00019996471868559832, "loss": 1.5529, "step": 756 }, { "epoch": 0.008466898564989318, "grad_norm": 0.06750231981277466, "learning_rate": 0.000199964625292564, "loss": 1.5626, "step": 757 }, { "epoch": 0.008478083371548088, "grad_norm": 0.06446841359138489, "learning_rate": 0.0001999645317761047, "loss": 1.5599, "step": 758 }, { "epoch": 0.00848926817810686, "grad_norm": 0.0593569353222847, "learning_rate": 0.00019996443813622057, "loss": 1.5538, "step": 759 }, { "epoch": 0.00850045298466563, "grad_norm": 0.0496729277074337, "learning_rate": 0.00019996434437291168, "loss": 1.5427, "step": 760 }, { "epoch": 0.008511637791224401, "grad_norm": 0.04995394125580788, "learning_rate": 0.00019996425048617814, "loss": 1.5326, "step": 761 }, { "epoch": 0.008522822597783171, "grad_norm": 0.061305031180381775, "learning_rate": 0.00019996415647602014, "loss": 1.5553, "step": 762 }, { "epoch": 0.008534007404341942, "grad_norm": 0.07046514004468918, "learning_rate": 0.0001999640623424377, "loss": 1.5305, "step": 763 }, { "epoch": 0.008545192210900712, "grad_norm": 0.0729839950799942, "learning_rate": 0.00019996396808543102, "loss": 1.5448, "step": 764 }, { "epoch": 0.008556377017459482, "grad_norm": 0.07597866654396057, "learning_rate": 0.00019996387370500016, "loss": 1.5339, "step": 765 }, { "epoch": 0.008567561824018254, "grad_norm": 0.07808911800384521, "learning_rate": 0.00019996377920114525, "loss": 1.5474, "step": 766 }, { "epoch": 0.008578746630577025, "grad_norm": 0.07077853381633759, "learning_rate": 0.0001999636845738664, "loss": 1.5456, "step": 767 }, { "epoch": 0.008589931437135795, "grad_norm": 0.05932854115962982, "learning_rate": 0.00019996358982316378, "loss": 1.5581, "step": 768 }, { "epoch": 0.008601116243694565, "grad_norm": 0.055467307567596436, "learning_rate": 0.00019996349494903743, "loss": 1.552, "step": 769 }, { "epoch": 0.008612301050253336, "grad_norm": 0.0684780701994896, "learning_rate": 0.0001999633999514875, "loss": 1.5691, "step": 770 }, { "epoch": 0.008623485856812106, "grad_norm": 0.07558051496744156, "learning_rate": 0.0001999633048305141, "loss": 1.5704, "step": 771 }, { "epoch": 0.008634670663370878, "grad_norm": 0.07451245933771133, "learning_rate": 0.0001999632095861174, "loss": 1.562, "step": 772 }, { "epoch": 0.008645855469929648, "grad_norm": 0.06852173060178757, "learning_rate": 0.00019996311421829744, "loss": 1.5582, "step": 773 }, { "epoch": 0.008657040276488418, "grad_norm": 0.05201677978038788, "learning_rate": 0.00019996301872705438, "loss": 1.5405, "step": 774 }, { "epoch": 0.008668225083047189, "grad_norm": 0.05331201106309891, "learning_rate": 0.00019996292311238832, "loss": 1.5389, "step": 775 }, { "epoch": 0.008679409889605959, "grad_norm": 0.06298915296792984, "learning_rate": 0.00019996282737429942, "loss": 1.5544, "step": 776 }, { "epoch": 0.00869059469616473, "grad_norm": 0.06354406476020813, "learning_rate": 0.00019996273151278774, "loss": 1.5654, "step": 777 }, { "epoch": 0.0087017795027235, "grad_norm": 0.0683928057551384, "learning_rate": 0.00019996263552785344, "loss": 1.5515, "step": 778 }, { "epoch": 0.008712964309282272, "grad_norm": 0.08236062526702881, "learning_rate": 0.0001999625394194966, "loss": 1.5617, "step": 779 }, { "epoch": 0.008724149115841042, "grad_norm": 0.07203904539346695, "learning_rate": 0.0001999624431877174, "loss": 1.555, "step": 780 }, { "epoch": 0.008735333922399812, "grad_norm": 0.06245394051074982, "learning_rate": 0.0001999623468325159, "loss": 1.5696, "step": 781 }, { "epoch": 0.008746518728958583, "grad_norm": 0.06451458483934402, "learning_rate": 0.00019996225035389222, "loss": 1.5695, "step": 782 }, { "epoch": 0.008757703535517353, "grad_norm": 0.06131128594279289, "learning_rate": 0.00019996215375184652, "loss": 1.5749, "step": 783 }, { "epoch": 0.008768888342076123, "grad_norm": 0.07188650965690613, "learning_rate": 0.00019996205702637888, "loss": 1.5647, "step": 784 }, { "epoch": 0.008780073148634894, "grad_norm": 0.0647532194852829, "learning_rate": 0.00019996196017748948, "loss": 1.5906, "step": 785 }, { "epoch": 0.008791257955193666, "grad_norm": 0.06955672800540924, "learning_rate": 0.00019996186320517836, "loss": 1.5923, "step": 786 }, { "epoch": 0.008802442761752436, "grad_norm": 0.08375248312950134, "learning_rate": 0.00019996176610944568, "loss": 1.5949, "step": 787 }, { "epoch": 0.008813627568311206, "grad_norm": 0.07389501482248306, "learning_rate": 0.00019996166889029156, "loss": 1.5859, "step": 788 }, { "epoch": 0.008824812374869977, "grad_norm": 0.08891887962818146, "learning_rate": 0.0001999615715477161, "loss": 1.5799, "step": 789 }, { "epoch": 0.008835997181428747, "grad_norm": 0.08630786836147308, "learning_rate": 0.00019996147408171948, "loss": 1.5648, "step": 790 }, { "epoch": 0.008847181987987517, "grad_norm": 0.09771011024713516, "learning_rate": 0.00019996137649230176, "loss": 1.5505, "step": 791 }, { "epoch": 0.008858366794546288, "grad_norm": 0.09192012995481491, "learning_rate": 0.00019996127877946307, "loss": 1.5704, "step": 792 }, { "epoch": 0.00886955160110506, "grad_norm": 0.07876724004745483, "learning_rate": 0.00019996118094320355, "loss": 1.5822, "step": 793 }, { "epoch": 0.00888073640766383, "grad_norm": 0.06203979253768921, "learning_rate": 0.00019996108298352328, "loss": 1.5599, "step": 794 }, { "epoch": 0.0088919212142226, "grad_norm": 0.06725753843784332, "learning_rate": 0.00019996098490042242, "loss": 1.562, "step": 795 }, { "epoch": 0.00890310602078137, "grad_norm": 0.07860880345106125, "learning_rate": 0.0001999608866939011, "loss": 1.5554, "step": 796 }, { "epoch": 0.008914290827340141, "grad_norm": 0.07922618836164474, "learning_rate": 0.0001999607883639594, "loss": 1.564, "step": 797 }, { "epoch": 0.008925475633898911, "grad_norm": 0.07509887218475342, "learning_rate": 0.0001999606899105975, "loss": 1.5531, "step": 798 }, { "epoch": 0.008936660440457682, "grad_norm": 0.0813961774110794, "learning_rate": 0.00019996059133381547, "loss": 1.5479, "step": 799 }, { "epoch": 0.008947845247016454, "grad_norm": 0.09687768667936325, "learning_rate": 0.00019996049263361343, "loss": 1.5527, "step": 800 } ], "logging_steps": 1, "max_steps": 89407, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6661988212201226e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }