{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 2.2997210025787354, "learning_rate": 0.00015998457923856519, "loss": 1.2401, "step": 1 }, { "epoch": 0.025, "grad_norm": 24.28518295288086, "learning_rate": 0.00015993832289925785, "loss": 4.0648, "step": 2 }, { "epoch": 0.0375, "grad_norm": 12.727800369262695, "learning_rate": 0.0001598612488147773, "loss": 2.7044, "step": 3 }, { "epoch": 0.05, "grad_norm": 4.934032440185547, "learning_rate": 0.00015975338669865026, "loss": 1.994, "step": 4 }, { "epoch": 0.0625, "grad_norm": 8.1886625289917, "learning_rate": 0.00015961477813377576, "loss": 2.1426, "step": 5 }, { "epoch": 0.075, "grad_norm": 2.910872220993042, "learning_rate": 0.00015944547655639412, "loss": 1.7254, "step": 6 }, { "epoch": 0.0875, "grad_norm": 1.565290093421936, "learning_rate": 0.00015924554723548617, "loss": 1.5187, "step": 7 }, { "epoch": 0.1, "grad_norm": 2.9080827236175537, "learning_rate": 0.00015901506724761103, "loss": 1.5405, "step": 8 }, { "epoch": 0.1125, "grad_norm": 1.875126838684082, "learning_rate": 0.00015875412544719134, "loss": 1.4493, "step": 9 }, { "epoch": 0.125, "grad_norm": 1.2589935064315796, "learning_rate": 0.00015846282243225845, "loss": 1.4103, "step": 10 }, { "epoch": 0.1375, "grad_norm": 1.2925529479980469, "learning_rate": 0.0001581412705056698, "loss": 1.3507, "step": 11 }, { "epoch": 0.15, "grad_norm": 1.4467802047729492, "learning_rate": 0.00015778959363181415, "loss": 1.3, "step": 12 }, { "epoch": 0.1625, "grad_norm": 1.267639398574829, "learning_rate": 0.0001574079273888208, "loss": 1.2974, "step": 13 }, { "epoch": 0.175, "grad_norm": 1.0911085605621338, "learning_rate": 0.00015699641891629178, "loss": 1.2635, "step": 14 }, { "epoch": 0.1875, "grad_norm": 0.9065354466438293, "learning_rate": 0.00015655522685857672, "loss": 1.2119, "step": 15 }, { "epoch": 0.2, "grad_norm": 0.7415559887886047, "learning_rate": 0.0001560845213036123, "loss": 1.2337, "step": 16 }, { "epoch": 0.2125, "grad_norm": 0.7553166151046753, "learning_rate": 0.00015558448371735025, "loss": 1.1884, "step": 17 }, { "epoch": 0.225, "grad_norm": 0.5407947301864624, "learning_rate": 0.00015505530687379875, "loss": 1.17, "step": 18 }, { "epoch": 0.2375, "grad_norm": 0.5162355899810791, "learning_rate": 0.00015449719478070428, "loss": 1.1879, "step": 19 }, { "epoch": 0.25, "grad_norm": 0.5688554644584656, "learning_rate": 0.00015391036260090294, "loss": 1.1767, "step": 20 }, { "epoch": 0.2625, "grad_norm": 0.48555564880371094, "learning_rate": 0.0001532950365693709, "loss": 1.1726, "step": 21 }, { "epoch": 0.275, "grad_norm": 0.4502723515033722, "learning_rate": 0.00015265145390600652, "loss": 1.163, "step": 22 }, { "epoch": 0.2875, "grad_norm": 0.3590157926082611, "learning_rate": 0.00015197986272417774, "loss": 1.1839, "step": 23 }, { "epoch": 0.3, "grad_norm": 0.38364410400390625, "learning_rate": 0.00015128052193506944, "loss": 1.1642, "step": 24 }, { "epoch": 0.3125, "grad_norm": 0.36856546998023987, "learning_rate": 0.0001505537011478684, "loss": 1.1495, "step": 25 }, { "epoch": 0.325, "grad_norm": 0.3514528274536133, "learning_rate": 0.0001497996805658238, "loss": 1.1257, "step": 26 }, { "epoch": 0.3375, "grad_norm": 0.42414528131484985, "learning_rate": 0.00014901875087822337, "loss": 1.1463, "step": 27 }, { "epoch": 0.35, "grad_norm": 0.35511669516563416, "learning_rate": 0.0001482112131483274, "loss": 1.141, "step": 28 }, { "epoch": 0.3625, "grad_norm": 0.3799460530281067, "learning_rate": 0.00014737737869730292, "loss": 1.1414, "step": 29 }, { "epoch": 0.375, "grad_norm": 0.26333189010620117, "learning_rate": 0.00014651756898420365, "loss": 1.1352, "step": 30 }, { "epoch": 0.3875, "grad_norm": 0.37996864318847656, "learning_rate": 0.0001456321154820411, "loss": 1.1167, "step": 31 }, { "epoch": 0.4, "grad_norm": 0.3210310637950897, "learning_rate": 0.00014472135954999581, "loss": 1.113, "step": 32 }, { "epoch": 0.4125, "grad_norm": 0.342960923910141, "learning_rate": 0.00014378565230181657, "loss": 1.1201, "step": 33 }, { "epoch": 0.425, "grad_norm": 0.30171331763267517, "learning_rate": 0.0001428253544704596, "loss": 1.1303, "step": 34 }, { "epoch": 0.4375, "grad_norm": 0.3308579623699188, "learning_rate": 0.00014184083626901897, "loss": 1.135, "step": 35 }, { "epoch": 0.45, "grad_norm": 0.33749139308929443, "learning_rate": 0.0001408324772480025, "loss": 1.1413, "step": 36 }, { "epoch": 0.4625, "grad_norm": 0.29873886704444885, "learning_rate": 0.00013980066614900776, "loss": 1.1406, "step": 37 }, { "epoch": 0.475, "grad_norm": 0.25276514887809753, "learning_rate": 0.00013874580075485485, "loss": 1.1421, "step": 38 }, { "epoch": 0.4875, "grad_norm": 0.2849913537502289, "learning_rate": 0.00013766828773623352, "loss": 1.1298, "step": 39 }, { "epoch": 0.5, "grad_norm": 0.27665936946868896, "learning_rate": 0.00013656854249492382, "loss": 1.1052, "step": 40 }, { "epoch": 0.5125, "grad_norm": 0.31618547439575195, "learning_rate": 0.0001354469890036509, "loss": 1.1124, "step": 41 }, { "epoch": 0.525, "grad_norm": 0.30855098366737366, "learning_rate": 0.00013430405964263536, "loss": 1.1164, "step": 42 }, { "epoch": 0.5375, "grad_norm": 0.24974325299263, "learning_rate": 0.00013314019503290255, "loss": 1.1379, "step": 43 }, { "epoch": 0.55, "grad_norm": 0.259245365858078, "learning_rate": 0.00013195584386641469, "loss": 1.0963, "step": 44 }, { "epoch": 0.5625, "grad_norm": 0.3342917561531067, "learning_rate": 0.00013075146273309164, "loss": 1.1089, "step": 45 }, { "epoch": 0.575, "grad_norm": 0.3317720293998718, "learning_rate": 0.00012952751594478675, "loss": 1.1226, "step": 46 }, { "epoch": 0.5875, "grad_norm": 0.2566727101802826, "learning_rate": 0.0001282844753562857, "loss": 1.1035, "step": 47 }, { "epoch": 0.6, "grad_norm": 0.25012263655662537, "learning_rate": 0.00012702282018339786, "loss": 1.0713, "step": 48 }, { "epoch": 0.6125, "grad_norm": 0.2855740189552307, "learning_rate": 0.00012574303681820898, "loss": 1.1232, "step": 49 }, { "epoch": 0.625, "grad_norm": 0.21377117931842804, "learning_rate": 0.0001244456186415682, "loss": 1.0726, "step": 50 }, { "epoch": 0.6375, "grad_norm": 0.3012278079986572, "learning_rate": 0.00012313106583288004, "loss": 1.0855, "step": 51 }, { "epoch": 0.65, "grad_norm": 0.2754627764225006, "learning_rate": 0.00012179988517727591, "loss": 1.113, "step": 52 }, { "epoch": 0.6625, "grad_norm": 0.2773728668689728, "learning_rate": 0.00012045258987023879, "loss": 1.0931, "step": 53 }, { "epoch": 0.675, "grad_norm": 0.3616091012954712, "learning_rate": 0.00011908969931975641, "loss": 1.1007, "step": 54 }, { "epoch": 0.6875, "grad_norm": 0.28011709451675415, "learning_rate": 0.00011771173894607985, "loss": 1.1312, "step": 55 }, { "epoch": 0.7, "grad_norm": 0.19245974719524384, "learning_rate": 0.00011631923997916375, "loss": 1.0784, "step": 56 }, { "epoch": 0.7125, "grad_norm": 0.2807864844799042, "learning_rate": 0.00011491273925386736, "loss": 1.0766, "step": 57 }, { "epoch": 0.725, "grad_norm": 0.25869062542915344, "learning_rate": 0.00011349277900299426, "loss": 1.0929, "step": 58 }, { "epoch": 0.7375, "grad_norm": 0.20374780893325806, "learning_rate": 0.00011205990664825127, "loss": 1.0977, "step": 59 }, { "epoch": 0.75, "grad_norm": 0.275302529335022, "learning_rate": 0.00011061467458920719, "loss": 1.1218, "step": 60 }, { "epoch": 0.7625, "grad_norm": 0.26479312777519226, "learning_rate": 0.00010915763999033201, "loss": 1.0972, "step": 61 }, { "epoch": 0.775, "grad_norm": 0.20327049493789673, "learning_rate": 0.00010768936456619945, "loss": 1.0723, "step": 62 }, { "epoch": 0.7875, "grad_norm": 0.18908362090587616, "learning_rate": 0.0001062104143649355, "loss": 1.1059, "step": 63 }, { "epoch": 0.8, "grad_norm": 0.2153932750225067, "learning_rate": 0.0001047213595499958, "loss": 1.1112, "step": 64 }, { "epoch": 0.8125, "grad_norm": 0.23449215292930603, "learning_rate": 0.000103222774180357, "loss": 1.1125, "step": 65 }, { "epoch": 0.825, "grad_norm": 0.19739866256713867, "learning_rate": 0.00010171523598920594, "loss": 1.0506, "step": 66 }, { "epoch": 0.8375, "grad_norm": 0.579247236251831, "learning_rate": 0.00010019932616121264, "loss": 1.0599, "step": 67 }, { "epoch": 0.85, "grad_norm": 0.2158878892660141, "learning_rate": 9.867562910847246e-05, "loss": 1.1116, "step": 68 }, { "epoch": 0.8625, "grad_norm": 0.24534355103969574, "learning_rate": 9.714473224520406e-05, "loss": 1.1039, "step": 69 }, { "epoch": 0.875, "grad_norm": 0.1604059487581253, "learning_rate": 9.560722576129029e-05, "loss": 1.1007, "step": 70 }, { "epoch": 0.8875, "grad_norm": 0.24135896563529968, "learning_rate": 9.406370239474839e-05, "loss": 1.0976, "step": 71 }, { "epoch": 0.9, "grad_norm": 0.2200448215007782, "learning_rate": 9.251475720321848e-05, "loss": 1.1001, "step": 72 }, { "epoch": 0.9125, "grad_norm": 0.17517372965812683, "learning_rate": 9.096098733455746e-05, "loss": 1.0864, "step": 73 }, { "epoch": 0.925, "grad_norm": 0.23631267249584198, "learning_rate": 8.940299179662703e-05, "loss": 1.0976, "step": 74 }, { "epoch": 0.9375, "grad_norm": 0.17627741396427155, "learning_rate": 8.784137122636488e-05, "loss": 1.1049, "step": 75 }, { "epoch": 0.95, "grad_norm": 0.1840021312236786, "learning_rate": 8.627672765822762e-05, "loss": 1.0504, "step": 76 }, { "epoch": 0.9625, "grad_norm": 0.1925836205482483, "learning_rate": 8.470966429209512e-05, "loss": 1.1028, "step": 77 }, { "epoch": 0.975, "grad_norm": 0.18122681975364685, "learning_rate": 8.31407852607255e-05, "loss": 1.0605, "step": 78 }, { "epoch": 0.9875, "grad_norm": 0.16970321536064148, "learning_rate": 8.157069539685026e-05, "loss": 1.1069, "step": 79 }, { "epoch": 1.0, "grad_norm": 0.2044173628091812, "learning_rate": 8e-05, "loss": 1.1173, "step": 80 }, { "epoch": 1.0125, "grad_norm": 0.27160316705703735, "learning_rate": 7.842930460314975e-05, "loss": 1.0536, "step": 81 }, { "epoch": 1.025, "grad_norm": 0.46974608302116394, "learning_rate": 7.685921473927454e-05, "loss": 1.1236, "step": 82 }, { "epoch": 1.0375, "grad_norm": 0.4345127046108246, "learning_rate": 7.529033570790488e-05, "loss": 1.0927, "step": 83 }, { "epoch": 1.05, "grad_norm": 0.3712212145328522, "learning_rate": 7.372327234177242e-05, "loss": 1.0861, "step": 84 }, { "epoch": 1.0625, "grad_norm": 0.3404874801635742, "learning_rate": 7.215862877363515e-05, "loss": 1.108, "step": 85 }, { "epoch": 1.075, "grad_norm": 0.411827951669693, "learning_rate": 7.0597008203373e-05, "loss": 1.0958, "step": 86 }, { "epoch": 1.0875, "grad_norm": 0.28986656665802, "learning_rate": 6.903901266544258e-05, "loss": 1.0456, "step": 87 }, { "epoch": 1.1, "grad_norm": 0.35029786825180054, "learning_rate": 6.748524279678152e-05, "loss": 1.1224, "step": 88 }, { "epoch": 1.1125, "grad_norm": 0.2584652900695801, "learning_rate": 6.593629760525164e-05, "loss": 1.0932, "step": 89 }, { "epoch": 1.125, "grad_norm": 0.30658528208732605, "learning_rate": 6.439277423870975e-05, "loss": 1.1091, "step": 90 }, { "epoch": 1.1375, "grad_norm": 0.2145494520664215, "learning_rate": 6.285526775479596e-05, "loss": 1.0921, "step": 91 }, { "epoch": 1.15, "grad_norm": 0.2773503065109253, "learning_rate": 6.13243708915276e-05, "loss": 1.0642, "step": 92 }, { "epoch": 1.1625, "grad_norm": 0.2383718639612198, "learning_rate": 5.9800673838787364e-05, "loss": 1.0832, "step": 93 }, { "epoch": 1.175, "grad_norm": 0.23632678389549255, "learning_rate": 5.828476401079407e-05, "loss": 1.0666, "step": 94 }, { "epoch": 1.1875, "grad_norm": 0.21097873151302338, "learning_rate": 5.677722581964303e-05, "loss": 1.0342, "step": 95 }, { "epoch": 1.2, "grad_norm": 0.20526330173015594, "learning_rate": 5.5278640450004216e-05, "loss": 1.0668, "step": 96 }, { "epoch": 1.2125, "grad_norm": 0.19396792352199554, "learning_rate": 5.3789585635064534e-05, "loss": 1.0359, "step": 97 }, { "epoch": 1.225, "grad_norm": 0.18029625713825226, "learning_rate": 5.231063543380055e-05, "loss": 1.0305, "step": 98 }, { "epoch": 1.2375, "grad_norm": 0.16607148945331573, "learning_rate": 5.084236000966803e-05, "loss": 1.0516, "step": 99 }, { "epoch": 1.25, "grad_norm": 0.1904178261756897, "learning_rate": 4.9385325410792824e-05, "loss": 1.0456, "step": 100 }, { "epoch": 1.2625, "grad_norm": 0.16171352565288544, "learning_rate": 4.794009335174874e-05, "loss": 1.0482, "step": 101 }, { "epoch": 1.275, "grad_norm": 0.17175588011741638, "learning_rate": 4.650722099700578e-05, "loss": 1.0442, "step": 102 }, { "epoch": 1.2875, "grad_norm": 0.147624671459198, "learning_rate": 4.508726074613262e-05, "loss": 1.0694, "step": 103 }, { "epoch": 1.3, "grad_norm": 0.168911874294281, "learning_rate": 4.3680760020836266e-05, "loss": 1.0556, "step": 104 }, { "epoch": 1.3125, "grad_norm": 0.15126970410346985, "learning_rate": 4.2288261053920186e-05, "loss": 1.044, "step": 105 }, { "epoch": 1.325, "grad_norm": 0.15997202694416046, "learning_rate": 4.0910300680243636e-05, "loss": 1.0231, "step": 106 }, { "epoch": 1.3375, "grad_norm": 0.16348402202129364, "learning_rate": 3.954741012976125e-05, "loss": 1.0468, "step": 107 }, { "epoch": 1.35, "grad_norm": 0.16921056807041168, "learning_rate": 3.8200114822724096e-05, "loss": 1.0452, "step": 108 }, { "epoch": 1.3625, "grad_norm": 0.1568213850259781, "learning_rate": 3.686893416711998e-05, "loss": 1.0487, "step": 109 }, { "epoch": 1.375, "grad_norm": 0.12868821620941162, "learning_rate": 3.5554381358431845e-05, "loss": 1.0447, "step": 110 }, { "epoch": 1.3875, "grad_norm": 0.14700743556022644, "learning_rate": 3.425696318179103e-05, "loss": 1.0288, "step": 111 }, { "epoch": 1.4, "grad_norm": 0.1439713090658188, "learning_rate": 3.297717981660216e-05, "loss": 1.0269, "step": 112 }, { "epoch": 1.4125, "grad_norm": 0.12184255570173264, "learning_rate": 3.1715524643714286e-05, "loss": 1.0361, "step": 113 }, { "epoch": 1.425, "grad_norm": 0.14732472598552704, "learning_rate": 3.0472484055213276e-05, "loss": 1.0476, "step": 114 }, { "epoch": 1.4375, "grad_norm": 0.14000268280506134, "learning_rate": 2.9248537266908373e-05, "loss": 1.0531, "step": 115 }, { "epoch": 1.45, "grad_norm": 0.12124885618686676, "learning_rate": 2.804415613358532e-05, "loss": 1.0611, "step": 116 }, { "epoch": 1.4625, "grad_norm": 0.13823653757572174, "learning_rate": 2.685980496709749e-05, "loss": 1.0625, "step": 117 }, { "epoch": 1.475, "grad_norm": 0.1250331848859787, "learning_rate": 2.569594035736466e-05, "loss": 1.0647, "step": 118 }, { "epoch": 1.4875, "grad_norm": 0.11904938519001007, "learning_rate": 2.4553010996349143e-05, "loss": 1.0543, "step": 119 }, { "epoch": 1.5, "grad_norm": 0.12068229168653488, "learning_rate": 2.3431457505076205e-05, "loss": 1.0325, "step": 120 }, { "epoch": 1.5125, "grad_norm": 0.12019164115190506, "learning_rate": 2.2331712263766495e-05, "loss": 1.041, "step": 121 }, { "epoch": 1.525, "grad_norm": 0.12757574021816254, "learning_rate": 2.1254199245145177e-05, "loss": 1.0433, "step": 122 }, { "epoch": 1.5375, "grad_norm": 0.12285693734884262, "learning_rate": 2.0199333850992245e-05, "loss": 1.0657, "step": 123 }, { "epoch": 1.55, "grad_norm": 0.12083045393228531, "learning_rate": 1.9167522751997527e-05, "loss": 1.0292, "step": 124 }, { "epoch": 1.5625, "grad_norm": 0.11348845809698105, "learning_rate": 1.815916373098104e-05, "loss": 1.04, "step": 125 }, { "epoch": 1.575, "grad_norm": 0.12212307006120682, "learning_rate": 1.7174645529540424e-05, "loss": 1.0556, "step": 126 }, { "epoch": 1.5875, "grad_norm": 0.1128983274102211, "learning_rate": 1.621434769818344e-05, "loss": 1.0365, "step": 127 }, { "epoch": 1.6, "grad_norm": 0.11498909443616867, "learning_rate": 1.5278640450004213e-05, "loss": 1.0072, "step": 128 }, { "epoch": 1.6125, "grad_norm": 0.12018037587404251, "learning_rate": 1.4367884517958914e-05, "loss": 1.0589, "step": 129 }, { "epoch": 1.625, "grad_norm": 0.10916531831026077, "learning_rate": 1.3482431015796373e-05, "loss": 1.0112, "step": 130 }, { "epoch": 1.6375, "grad_norm": 0.10843243449926376, "learning_rate": 1.2622621302697087e-05, "loss": 1.0249, "step": 131 }, { "epoch": 1.65, "grad_norm": 0.111574187874794, "learning_rate": 1.1788786851672628e-05, "loss": 1.0507, "step": 132 }, { "epoch": 1.6625, "grad_norm": 0.10717642307281494, "learning_rate": 1.0981249121776654e-05, "loss": 1.0328, "step": 133 }, { "epoch": 1.675, "grad_norm": 0.11177720874547958, "learning_rate": 1.0200319434176227e-05, "loss": 1.0412, "step": 134 }, { "epoch": 1.6875, "grad_norm": 0.10437416285276413, "learning_rate": 9.446298852131605e-06, "loss": 1.072, "step": 135 }, { "epoch": 1.7, "grad_norm": 0.10602446645498276, "learning_rate": 8.719478064930578e-06, "loss": 1.022, "step": 136 }, { "epoch": 1.7125, "grad_norm": 0.10757151246070862, "learning_rate": 8.020137275822297e-06, "loss": 1.021, "step": 137 }, { "epoch": 1.725, "grad_norm": 0.10059204697608948, "learning_rate": 7.348546093993492e-06, "loss": 1.0369, "step": 138 }, { "epoch": 1.7375, "grad_norm": 0.09817243367433548, "learning_rate": 6.704963430629132e-06, "loss": 1.0433, "step": 139 }, { "epoch": 1.75, "grad_norm": 1.0110856294631958, "learning_rate": 6.0896373990970614e-06, "loss": 1.0893, "step": 140 }, { "epoch": 1.7625, "grad_norm": 0.1014508381485939, "learning_rate": 5.502805219295715e-06, "loss": 1.0433, "step": 141 }, { "epoch": 1.775, "grad_norm": 0.09631301462650299, "learning_rate": 4.944693126201276e-06, "loss": 1.0207, "step": 142 }, { "epoch": 1.7875, "grad_norm": 0.09827422350645065, "learning_rate": 4.415516282649756e-06, "loss": 1.0538, "step": 143 }, { "epoch": 1.8, "grad_norm": 0.100394107401371, "learning_rate": 3.915478696387718e-06, "loss": 1.0606, "step": 144 }, { "epoch": 1.8125, "grad_norm": 0.0930555984377861, "learning_rate": 3.4447731414232945e-06, "loss": 1.0614, "step": 145 }, { "epoch": 1.825, "grad_norm": 0.09103138744831085, "learning_rate": 3.0035810837082267e-06, "loss": 1.0028, "step": 146 }, { "epoch": 1.8375, "grad_norm": 0.09185432642698288, "learning_rate": 2.5920726111792195e-06, "loss": 1.0137, "step": 147 }, { "epoch": 1.85, "grad_norm": 0.09481006115674973, "learning_rate": 2.2104063681858757e-06, "loss": 1.0631, "step": 148 }, { "epoch": 1.8625, "grad_norm": 0.08825208991765976, "learning_rate": 1.8587294943302092e-06, "loss": 1.0562, "step": 149 }, { "epoch": 1.875, "grad_norm": 0.09086597710847855, "learning_rate": 1.5371775677415656e-06, "loss": 1.0534, "step": 150 }, { "epoch": 1.8875, "grad_norm": 0.09239210933446884, "learning_rate": 1.2458745528086723e-06, "loss": 1.0517, "step": 151 }, { "epoch": 1.9, "grad_norm": 0.091704361140728, "learning_rate": 9.849327523889873e-07, "loss": 1.0547, "step": 152 }, { "epoch": 1.9125, "grad_norm": 0.0930432379245758, "learning_rate": 7.5445276451382e-07, "loss": 1.0418, "step": 153 }, { "epoch": 1.925, "grad_norm": 0.08982589095830917, "learning_rate": 5.545234436058966e-07, "loss": 1.0541, "step": 154 }, { "epoch": 1.9375, "grad_norm": 0.09276595711708069, "learning_rate": 3.852218662242546e-07, "loss": 1.0613, "step": 155 }, { "epoch": 1.95, "grad_norm": 0.08679981529712677, "learning_rate": 2.466133013497629e-07, "loss": 1.0097, "step": 156 }, { "epoch": 1.9625, "grad_norm": 0.08747802674770355, "learning_rate": 1.3875118522273412e-07, "loss": 1.0606, "step": 157 }, { "epoch": 1.975, "grad_norm": 0.08833891898393631, "learning_rate": 6.167710074216792e-08, "loss": 1.0207, "step": 158 }, { "epoch": 1.9875, "grad_norm": 0.0912172719836235, "learning_rate": 1.5420761434814523e-08, "loss": 1.066, "step": 159 }, { "epoch": 2.0, "grad_norm": 0.09457752853631973, "learning_rate": 0.0, "loss": 1.0772, "step": 160 }, { "epoch": 2.0, "step": 160, "total_flos": 5.276128179356959e+18, "train_loss": 0.8005891665816307, "train_runtime": 11560.7477, "train_samples_per_second": 3.543, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.276128179356959e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }