{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01987918725974739, "eval_steps": 500, "global_step": 181, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010982976386600769, "grad_norm": 18.411802291870117, "learning_rate": 0.0, "loss": 0.3251, "step": 1 }, { "epoch": 0.00021965952773201537, "grad_norm": 14.977940559387207, "learning_rate": 5.2631578947368416e-08, "loss": 0.3606, "step": 2 }, { "epoch": 0.00032948929159802305, "grad_norm": 14.84136962890625, "learning_rate": 1.0526315789473683e-07, "loss": 0.2777, "step": 3 }, { "epoch": 0.00043931905546403075, "grad_norm": 15.656201362609863, "learning_rate": 1.5789473684210525e-07, "loss": 0.293, "step": 4 }, { "epoch": 0.0005491488193300384, "grad_norm": 11.50175666809082, "learning_rate": 2.1052631578947366e-07, "loss": 0.271, "step": 5 }, { "epoch": 0.0006589785831960461, "grad_norm": 13.30196762084961, "learning_rate": 2.631578947368421e-07, "loss": 0.3014, "step": 6 }, { "epoch": 0.0007688083470620538, "grad_norm": 18.187772750854492, "learning_rate": 3.157894736842105e-07, "loss": 0.2838, "step": 7 }, { "epoch": 0.0008786381109280615, "grad_norm": 12.342240333557129, "learning_rate": 3.684210526315789e-07, "loss": 0.2961, "step": 8 }, { "epoch": 0.0009884678747940692, "grad_norm": 12.453307151794434, "learning_rate": 4.2105263157894733e-07, "loss": 0.2727, "step": 9 }, { "epoch": 0.001098297638660077, "grad_norm": 13.236316680908203, "learning_rate": 4.7368421052631574e-07, "loss": 0.2777, "step": 10 }, { "epoch": 0.0012081274025260845, "grad_norm": 9.186564445495605, "learning_rate": 5.263157894736842e-07, "loss": 0.2362, "step": 11 }, { "epoch": 0.0013179571663920922, "grad_norm": 14.68601131439209, "learning_rate": 5.789473684210526e-07, "loss": 0.3246, "step": 12 }, { "epoch": 0.0014277869302581, "grad_norm": 12.118836402893066, "learning_rate": 6.31578947368421e-07, "loss": 0.2334, "step": 13 }, { "epoch": 0.0015376166941241077, "grad_norm": 9.764254570007324, "learning_rate": 6.842105263157895e-07, "loss": 0.2382, "step": 14 }, { "epoch": 0.0016474464579901153, "grad_norm": 7.129339694976807, "learning_rate": 7.368421052631578e-07, "loss": 0.2399, "step": 15 }, { "epoch": 0.001757276221856123, "grad_norm": 6.999264717102051, "learning_rate": 7.894736842105263e-07, "loss": 0.1915, "step": 16 }, { "epoch": 0.0018671059857221306, "grad_norm": 6.912661075592041, "learning_rate": 8.421052631578947e-07, "loss": 0.1996, "step": 17 }, { "epoch": 0.0019769357495881385, "grad_norm": 6.077408313751221, "learning_rate": 8.947368421052631e-07, "loss": 0.1915, "step": 18 }, { "epoch": 0.002086765513454146, "grad_norm": 4.185582160949707, "learning_rate": 9.473684210526315e-07, "loss": 0.176, "step": 19 }, { "epoch": 0.002196595277320154, "grad_norm": 2.6936287879943848, "learning_rate": 1e-06, "loss": 0.1309, "step": 20 }, { "epoch": 0.0023064250411861617, "grad_norm": 3.5675835609436035, "learning_rate": 1e-06, "loss": 0.1515, "step": 21 }, { "epoch": 0.002416254805052169, "grad_norm": 2.7305240631103516, "learning_rate": 1e-06, "loss": 0.132, "step": 22 }, { "epoch": 0.002526084568918177, "grad_norm": 2.0667977333068848, "learning_rate": 1e-06, "loss": 0.107, "step": 23 }, { "epoch": 0.0026359143327841844, "grad_norm": 2.3355114459991455, "learning_rate": 1e-06, "loss": 0.1252, "step": 24 }, { "epoch": 0.0027457440966501922, "grad_norm": 2.30021333694458, "learning_rate": 1e-06, "loss": 0.1262, "step": 25 }, { "epoch": 0.0028555738605162, "grad_norm": 2.639949083328247, "learning_rate": 1e-06, "loss": 0.1611, "step": 26 }, { "epoch": 0.0029654036243822075, "grad_norm": 2.6157100200653076, "learning_rate": 1e-06, "loss": 0.1133, "step": 27 }, { "epoch": 0.0030752333882482154, "grad_norm": 2.25714373588562, "learning_rate": 1e-06, "loss": 0.1175, "step": 28 }, { "epoch": 0.003185063152114223, "grad_norm": 2.413874864578247, "learning_rate": 1e-06, "loss": 0.1335, "step": 29 }, { "epoch": 0.0032948929159802307, "grad_norm": 2.9752049446105957, "learning_rate": 1e-06, "loss": 0.1549, "step": 30 }, { "epoch": 0.003404722679846238, "grad_norm": 2.191883087158203, "learning_rate": 1e-06, "loss": 0.1072, "step": 31 }, { "epoch": 0.003514552443712246, "grad_norm": 2.182326078414917, "learning_rate": 1e-06, "loss": 0.1322, "step": 32 }, { "epoch": 0.003624382207578254, "grad_norm": 2.0499420166015625, "learning_rate": 1e-06, "loss": 0.1252, "step": 33 }, { "epoch": 0.0037342119714442613, "grad_norm": 2.898209571838379, "learning_rate": 1e-06, "loss": 0.1564, "step": 34 }, { "epoch": 0.003844041735310269, "grad_norm": 2.6807024478912354, "learning_rate": 1e-06, "loss": 0.1215, "step": 35 }, { "epoch": 0.003953871499176277, "grad_norm": 2.6628167629241943, "learning_rate": 1e-06, "loss": 0.125, "step": 36 }, { "epoch": 0.004063701263042284, "grad_norm": 2.3685622215270996, "learning_rate": 1e-06, "loss": 0.1423, "step": 37 }, { "epoch": 0.004173531026908292, "grad_norm": 2.0525412559509277, "learning_rate": 1e-06, "loss": 0.1321, "step": 38 }, { "epoch": 0.0042833607907743, "grad_norm": 2.064305305480957, "learning_rate": 1e-06, "loss": 0.1139, "step": 39 }, { "epoch": 0.004393190554640308, "grad_norm": 2.255208969116211, "learning_rate": 1e-06, "loss": 0.1075, "step": 40 }, { "epoch": 0.004503020318506315, "grad_norm": 2.1674306392669678, "learning_rate": 1e-06, "loss": 0.1231, "step": 41 }, { "epoch": 0.004612850082372323, "grad_norm": 2.1255414485931396, "learning_rate": 1e-06, "loss": 0.149, "step": 42 }, { "epoch": 0.004722679846238331, "grad_norm": 1.9542911052703857, "learning_rate": 1e-06, "loss": 0.1406, "step": 43 }, { "epoch": 0.004832509610104338, "grad_norm": 2.3152379989624023, "learning_rate": 1e-06, "loss": 0.122, "step": 44 }, { "epoch": 0.004942339373970346, "grad_norm": 2.2259297370910645, "learning_rate": 1e-06, "loss": 0.1273, "step": 45 }, { "epoch": 0.005052169137836354, "grad_norm": 2.1162829399108887, "learning_rate": 1e-06, "loss": 0.1071, "step": 46 }, { "epoch": 0.005161998901702361, "grad_norm": 2.2870123386383057, "learning_rate": 1e-06, "loss": 0.1222, "step": 47 }, { "epoch": 0.005271828665568369, "grad_norm": 2.1178293228149414, "learning_rate": 1e-06, "loss": 0.1128, "step": 48 }, { "epoch": 0.005381658429434377, "grad_norm": 2.811563730239868, "learning_rate": 1e-06, "loss": 0.113, "step": 49 }, { "epoch": 0.0054914881933003845, "grad_norm": 1.9846251010894775, "learning_rate": 1e-06, "loss": 0.1083, "step": 50 }, { "epoch": 0.005601317957166392, "grad_norm": 4.522186279296875, "learning_rate": 1e-06, "loss": 0.1283, "step": 51 }, { "epoch": 0.0057111477210324, "grad_norm": 1.9514762163162231, "learning_rate": 1e-06, "loss": 0.117, "step": 52 }, { "epoch": 0.005820977484898408, "grad_norm": 1.644494652748108, "learning_rate": 1e-06, "loss": 0.1097, "step": 53 }, { "epoch": 0.005930807248764415, "grad_norm": 2.26704740524292, "learning_rate": 1e-06, "loss": 0.0996, "step": 54 }, { "epoch": 0.0060406370126304225, "grad_norm": 1.8278131484985352, "learning_rate": 1e-06, "loss": 0.1175, "step": 55 }, { "epoch": 0.006150466776496431, "grad_norm": 2.9004828929901123, "learning_rate": 1e-06, "loss": 0.118, "step": 56 }, { "epoch": 0.006260296540362438, "grad_norm": 2.1837573051452637, "learning_rate": 1e-06, "loss": 0.1052, "step": 57 }, { "epoch": 0.006370126304228446, "grad_norm": 2.207904577255249, "learning_rate": 1e-06, "loss": 0.1057, "step": 58 }, { "epoch": 0.006479956068094454, "grad_norm": 1.937738060951233, "learning_rate": 1e-06, "loss": 0.1377, "step": 59 }, { "epoch": 0.006589785831960461, "grad_norm": 1.8323129415512085, "learning_rate": 1e-06, "loss": 0.0962, "step": 60 }, { "epoch": 0.006699615595826469, "grad_norm": 1.9134773015975952, "learning_rate": 1e-06, "loss": 0.0991, "step": 61 }, { "epoch": 0.006809445359692476, "grad_norm": 2.54829478263855, "learning_rate": 1e-06, "loss": 0.1199, "step": 62 }, { "epoch": 0.0069192751235584845, "grad_norm": 2.0665552616119385, "learning_rate": 1e-06, "loss": 0.1242, "step": 63 }, { "epoch": 0.007029104887424492, "grad_norm": 1.9088149070739746, "learning_rate": 1e-06, "loss": 0.1187, "step": 64 }, { "epoch": 0.007138934651290499, "grad_norm": 1.6347726583480835, "learning_rate": 1e-06, "loss": 0.1087, "step": 65 }, { "epoch": 0.007248764415156508, "grad_norm": 1.8846579790115356, "learning_rate": 1e-06, "loss": 0.115, "step": 66 }, { "epoch": 0.007358594179022515, "grad_norm": 2.108840227127075, "learning_rate": 1e-06, "loss": 0.1296, "step": 67 }, { "epoch": 0.0074684239428885225, "grad_norm": 2.0860307216644287, "learning_rate": 1e-06, "loss": 0.1088, "step": 68 }, { "epoch": 0.007578253706754531, "grad_norm": 2.0085508823394775, "learning_rate": 1e-06, "loss": 0.1208, "step": 69 }, { "epoch": 0.007688083470620538, "grad_norm": 1.8251607418060303, "learning_rate": 1e-06, "loss": 0.1112, "step": 70 }, { "epoch": 0.007797913234486546, "grad_norm": 1.9405887126922607, "learning_rate": 1e-06, "loss": 0.1014, "step": 71 }, { "epoch": 0.007907742998352554, "grad_norm": 2.248020887374878, "learning_rate": 1e-06, "loss": 0.107, "step": 72 }, { "epoch": 0.008017572762218561, "grad_norm": 1.9570263624191284, "learning_rate": 1e-06, "loss": 0.1071, "step": 73 }, { "epoch": 0.008127402526084569, "grad_norm": 2.1239218711853027, "learning_rate": 1e-06, "loss": 0.133, "step": 74 }, { "epoch": 0.008237232289950576, "grad_norm": 1.7767431735992432, "learning_rate": 1e-06, "loss": 0.102, "step": 75 }, { "epoch": 0.008347062053816584, "grad_norm": 2.4217135906219482, "learning_rate": 1e-06, "loss": 0.1004, "step": 76 }, { "epoch": 0.008456891817682591, "grad_norm": 1.89665687084198, "learning_rate": 1e-06, "loss": 0.1241, "step": 77 }, { "epoch": 0.0085667215815486, "grad_norm": 1.431766152381897, "learning_rate": 1e-06, "loss": 0.0849, "step": 78 }, { "epoch": 0.008676551345414608, "grad_norm": 2.6968767642974854, "learning_rate": 1e-06, "loss": 0.1129, "step": 79 }, { "epoch": 0.008786381109280615, "grad_norm": 1.8317389488220215, "learning_rate": 1e-06, "loss": 0.1033, "step": 80 }, { "epoch": 0.008896210873146623, "grad_norm": 2.0611159801483154, "learning_rate": 1e-06, "loss": 0.1075, "step": 81 }, { "epoch": 0.00900604063701263, "grad_norm": 2.4678268432617188, "learning_rate": 1e-06, "loss": 0.0914, "step": 82 }, { "epoch": 0.009115870400878637, "grad_norm": 2.097642183303833, "learning_rate": 1e-06, "loss": 0.1033, "step": 83 }, { "epoch": 0.009225700164744647, "grad_norm": 1.9599785804748535, "learning_rate": 1e-06, "loss": 0.1063, "step": 84 }, { "epoch": 0.009335529928610654, "grad_norm": 1.938198447227478, "learning_rate": 1e-06, "loss": 0.0968, "step": 85 }, { "epoch": 0.009445359692476661, "grad_norm": 2.0601954460144043, "learning_rate": 1e-06, "loss": 0.1077, "step": 86 }, { "epoch": 0.009555189456342669, "grad_norm": 1.9235936403274536, "learning_rate": 1e-06, "loss": 0.1218, "step": 87 }, { "epoch": 0.009665019220208676, "grad_norm": 1.6672967672348022, "learning_rate": 1e-06, "loss": 0.0982, "step": 88 }, { "epoch": 0.009774848984074684, "grad_norm": 1.9302681684494019, "learning_rate": 1e-06, "loss": 0.1124, "step": 89 }, { "epoch": 0.009884678747940691, "grad_norm": 2.2827959060668945, "learning_rate": 1e-06, "loss": 0.1178, "step": 90 }, { "epoch": 0.0099945085118067, "grad_norm": 1.8714021444320679, "learning_rate": 1e-06, "loss": 0.0984, "step": 91 }, { "epoch": 0.010104338275672708, "grad_norm": 2.1995835304260254, "learning_rate": 1e-06, "loss": 0.1319, "step": 92 }, { "epoch": 0.010214168039538715, "grad_norm": 1.92769455909729, "learning_rate": 1e-06, "loss": 0.1026, "step": 93 }, { "epoch": 0.010323997803404723, "grad_norm": 1.9699769020080566, "learning_rate": 1e-06, "loss": 0.1189, "step": 94 }, { "epoch": 0.01043382756727073, "grad_norm": 2.7029881477355957, "learning_rate": 1e-06, "loss": 0.1282, "step": 95 }, { "epoch": 0.010543657331136738, "grad_norm": 1.6077944040298462, "learning_rate": 1e-06, "loss": 0.1096, "step": 96 }, { "epoch": 0.010653487095002745, "grad_norm": 2.0745413303375244, "learning_rate": 1e-06, "loss": 0.1154, "step": 97 }, { "epoch": 0.010763316858868754, "grad_norm": 1.8612251281738281, "learning_rate": 1e-06, "loss": 0.1014, "step": 98 }, { "epoch": 0.010873146622734762, "grad_norm": 1.8795632123947144, "learning_rate": 1e-06, "loss": 0.1029, "step": 99 }, { "epoch": 0.010982976386600769, "grad_norm": 1.8857154846191406, "learning_rate": 1e-06, "loss": 0.1014, "step": 100 }, { "epoch": 0.011092806150466776, "grad_norm": 1.87457275390625, "learning_rate": 1e-06, "loss": 0.1047, "step": 101 }, { "epoch": 0.011202635914332784, "grad_norm": 2.02274489402771, "learning_rate": 1e-06, "loss": 0.117, "step": 102 }, { "epoch": 0.011312465678198791, "grad_norm": 2.1100752353668213, "learning_rate": 1e-06, "loss": 0.1116, "step": 103 }, { "epoch": 0.0114222954420648, "grad_norm": 2.1528773307800293, "learning_rate": 1e-06, "loss": 0.1147, "step": 104 }, { "epoch": 0.011532125205930808, "grad_norm": 5.85520076751709, "learning_rate": 1e-06, "loss": 0.1089, "step": 105 }, { "epoch": 0.011641954969796815, "grad_norm": 2.007204532623291, "learning_rate": 1e-06, "loss": 0.1244, "step": 106 }, { "epoch": 0.011751784733662823, "grad_norm": 1.9761431217193604, "learning_rate": 1e-06, "loss": 0.104, "step": 107 }, { "epoch": 0.01186161449752883, "grad_norm": 1.6352622509002686, "learning_rate": 1e-06, "loss": 0.1098, "step": 108 }, { "epoch": 0.011971444261394838, "grad_norm": 1.898520588874817, "learning_rate": 1e-06, "loss": 0.1122, "step": 109 }, { "epoch": 0.012081274025260845, "grad_norm": 1.6044663190841675, "learning_rate": 1e-06, "loss": 0.1044, "step": 110 }, { "epoch": 0.012191103789126854, "grad_norm": 1.7996292114257812, "learning_rate": 1e-06, "loss": 0.1107, "step": 111 }, { "epoch": 0.012300933552992862, "grad_norm": 1.949839472770691, "learning_rate": 1e-06, "loss": 0.1345, "step": 112 }, { "epoch": 0.012410763316858869, "grad_norm": 1.7750391960144043, "learning_rate": 1e-06, "loss": 0.1023, "step": 113 }, { "epoch": 0.012520593080724876, "grad_norm": 1.8512459993362427, "learning_rate": 1e-06, "loss": 0.0921, "step": 114 }, { "epoch": 0.012630422844590884, "grad_norm": 1.8420369625091553, "learning_rate": 1e-06, "loss": 0.0949, "step": 115 }, { "epoch": 0.012740252608456891, "grad_norm": 1.885312795639038, "learning_rate": 1e-06, "loss": 0.1118, "step": 116 }, { "epoch": 0.012850082372322899, "grad_norm": 2.293736457824707, "learning_rate": 1e-06, "loss": 0.1227, "step": 117 }, { "epoch": 0.012959912136188908, "grad_norm": 1.9772549867630005, "learning_rate": 1e-06, "loss": 0.1212, "step": 118 }, { "epoch": 0.013069741900054915, "grad_norm": 2.280238151550293, "learning_rate": 1e-06, "loss": 0.1192, "step": 119 }, { "epoch": 0.013179571663920923, "grad_norm": 1.9888858795166016, "learning_rate": 1e-06, "loss": 0.1027, "step": 120 }, { "epoch": 0.01328940142778693, "grad_norm": 1.5730584859848022, "learning_rate": 1e-06, "loss": 0.0913, "step": 121 }, { "epoch": 0.013399231191652938, "grad_norm": 1.7493692636489868, "learning_rate": 1e-06, "loss": 0.097, "step": 122 }, { "epoch": 0.013509060955518945, "grad_norm": 2.1915624141693115, "learning_rate": 1e-06, "loss": 0.1217, "step": 123 }, { "epoch": 0.013618890719384952, "grad_norm": 2.0627121925354004, "learning_rate": 1e-06, "loss": 0.1138, "step": 124 }, { "epoch": 0.013728720483250962, "grad_norm": 1.8700608015060425, "learning_rate": 1e-06, "loss": 0.0957, "step": 125 }, { "epoch": 0.013838550247116969, "grad_norm": 4.867977142333984, "learning_rate": 1e-06, "loss": 0.1143, "step": 126 }, { "epoch": 0.013948380010982976, "grad_norm": 1.6484566926956177, "learning_rate": 1e-06, "loss": 0.0883, "step": 127 }, { "epoch": 0.014058209774848984, "grad_norm": 1.7756706476211548, "learning_rate": 1e-06, "loss": 0.0992, "step": 128 }, { "epoch": 0.014168039538714991, "grad_norm": 23.05768394470215, "learning_rate": 1e-06, "loss": 0.0779, "step": 129 }, { "epoch": 0.014277869302580999, "grad_norm": 1.8049583435058594, "learning_rate": 1e-06, "loss": 0.1014, "step": 130 }, { "epoch": 0.014387699066447008, "grad_norm": 1.9289432764053345, "learning_rate": 1e-06, "loss": 0.0958, "step": 131 }, { "epoch": 0.014497528830313015, "grad_norm": 1.9363269805908203, "learning_rate": 1e-06, "loss": 0.1306, "step": 132 }, { "epoch": 0.014607358594179023, "grad_norm": 1.7127413749694824, "learning_rate": 1e-06, "loss": 0.1015, "step": 133 }, { "epoch": 0.01471718835804503, "grad_norm": 2.036144733428955, "learning_rate": 1e-06, "loss": 0.1002, "step": 134 }, { "epoch": 0.014827018121911038, "grad_norm": 2.464301347732544, "learning_rate": 1e-06, "loss": 0.1119, "step": 135 }, { "epoch": 0.014936847885777045, "grad_norm": 1.9304972887039185, "learning_rate": 1e-06, "loss": 0.1148, "step": 136 }, { "epoch": 0.015046677649643053, "grad_norm": 1.6702697277069092, "learning_rate": 1e-06, "loss": 0.1113, "step": 137 }, { "epoch": 0.015156507413509062, "grad_norm": 1.804840326309204, "learning_rate": 1e-06, "loss": 0.0956, "step": 138 }, { "epoch": 0.015266337177375069, "grad_norm": 1.7338131666183472, "learning_rate": 1e-06, "loss": 0.107, "step": 139 }, { "epoch": 0.015376166941241077, "grad_norm": 1.8909966945648193, "learning_rate": 1e-06, "loss": 0.093, "step": 140 }, { "epoch": 0.015485996705107084, "grad_norm": 1.957493543624878, "learning_rate": 1e-06, "loss": 0.0968, "step": 141 }, { "epoch": 0.015595826468973091, "grad_norm": 2.450575590133667, "learning_rate": 1e-06, "loss": 0.1133, "step": 142 }, { "epoch": 0.0157056562328391, "grad_norm": 2.1446874141693115, "learning_rate": 1e-06, "loss": 0.1388, "step": 143 }, { "epoch": 0.015815485996705108, "grad_norm": 2.1485435962677, "learning_rate": 1e-06, "loss": 0.1315, "step": 144 }, { "epoch": 0.015925315760571115, "grad_norm": 2.152599573135376, "learning_rate": 1e-06, "loss": 0.1326, "step": 145 }, { "epoch": 0.016035145524437123, "grad_norm": 1.9794915914535522, "learning_rate": 1e-06, "loss": 0.1277, "step": 146 }, { "epoch": 0.01614497528830313, "grad_norm": 1.7658240795135498, "learning_rate": 1e-06, "loss": 0.1031, "step": 147 }, { "epoch": 0.016254805052169138, "grad_norm": 1.8528263568878174, "learning_rate": 1e-06, "loss": 0.0952, "step": 148 }, { "epoch": 0.016364634816035145, "grad_norm": 1.5647423267364502, "learning_rate": 1e-06, "loss": 0.1088, "step": 149 }, { "epoch": 0.016474464579901153, "grad_norm": 2.419384717941284, "learning_rate": 1e-06, "loss": 0.0967, "step": 150 }, { "epoch": 0.01658429434376716, "grad_norm": 1.5838412046432495, "learning_rate": 1e-06, "loss": 0.0839, "step": 151 }, { "epoch": 0.016694124107633167, "grad_norm": 2.1404707431793213, "learning_rate": 1e-06, "loss": 0.1142, "step": 152 }, { "epoch": 0.016803953871499175, "grad_norm": 1.6462748050689697, "learning_rate": 1e-06, "loss": 0.0947, "step": 153 }, { "epoch": 0.016913783635365182, "grad_norm": 2.057058811187744, "learning_rate": 1e-06, "loss": 0.1123, "step": 154 }, { "epoch": 0.017023613399231193, "grad_norm": 1.7520567178726196, "learning_rate": 1e-06, "loss": 0.1008, "step": 155 }, { "epoch": 0.0171334431630972, "grad_norm": 2.039196014404297, "learning_rate": 1e-06, "loss": 0.1322, "step": 156 }, { "epoch": 0.017243272926963208, "grad_norm": 1.8729941844940186, "learning_rate": 1e-06, "loss": 0.102, "step": 157 }, { "epoch": 0.017353102690829215, "grad_norm": 1.9198112487792969, "learning_rate": 1e-06, "loss": 0.1141, "step": 158 }, { "epoch": 0.017462932454695223, "grad_norm": 1.690664529800415, "learning_rate": 1e-06, "loss": 0.0938, "step": 159 }, { "epoch": 0.01757276221856123, "grad_norm": 2.284759044647217, "learning_rate": 1e-06, "loss": 0.1171, "step": 160 }, { "epoch": 0.017682591982427238, "grad_norm": 1.7743721008300781, "learning_rate": 1e-06, "loss": 0.0976, "step": 161 }, { "epoch": 0.017792421746293245, "grad_norm": 2.1249804496765137, "learning_rate": 1e-06, "loss": 0.0904, "step": 162 }, { "epoch": 0.017902251510159253, "grad_norm": 3.607625722885132, "learning_rate": 1e-06, "loss": 0.1277, "step": 163 }, { "epoch": 0.01801208127402526, "grad_norm": 1.950108289718628, "learning_rate": 1e-06, "loss": 0.1016, "step": 164 }, { "epoch": 0.018121911037891267, "grad_norm": 1.6242471933364868, "learning_rate": 1e-06, "loss": 0.0919, "step": 165 }, { "epoch": 0.018231740801757275, "grad_norm": 2.4311513900756836, "learning_rate": 1e-06, "loss": 0.1112, "step": 166 }, { "epoch": 0.018341570565623282, "grad_norm": 1.5507546663284302, "learning_rate": 1e-06, "loss": 0.0981, "step": 167 }, { "epoch": 0.018451400329489293, "grad_norm": 1.9630911350250244, "learning_rate": 1e-06, "loss": 0.1087, "step": 168 }, { "epoch": 0.0185612300933553, "grad_norm": 1.6163691282272339, "learning_rate": 1e-06, "loss": 0.092, "step": 169 }, { "epoch": 0.018671059857221308, "grad_norm": 1.647873878479004, "learning_rate": 1e-06, "loss": 0.0867, "step": 170 }, { "epoch": 0.018780889621087316, "grad_norm": 2.0003228187561035, "learning_rate": 1e-06, "loss": 0.1166, "step": 171 }, { "epoch": 0.018890719384953323, "grad_norm": 2.019808053970337, "learning_rate": 1e-06, "loss": 0.1139, "step": 172 }, { "epoch": 0.01900054914881933, "grad_norm": 1.6541454792022705, "learning_rate": 1e-06, "loss": 0.0857, "step": 173 }, { "epoch": 0.019110378912685338, "grad_norm": 2.194434642791748, "learning_rate": 1e-06, "loss": 0.0919, "step": 174 }, { "epoch": 0.019220208676551345, "grad_norm": 2.8449411392211914, "learning_rate": 1e-06, "loss": 0.113, "step": 175 }, { "epoch": 0.019330038440417353, "grad_norm": 2.208855152130127, "learning_rate": 1e-06, "loss": 0.0964, "step": 176 }, { "epoch": 0.01943986820428336, "grad_norm": 1.9803837537765503, "learning_rate": 1e-06, "loss": 0.0968, "step": 177 }, { "epoch": 0.019549697968149368, "grad_norm": 1.8835409879684448, "learning_rate": 1e-06, "loss": 0.1059, "step": 178 }, { "epoch": 0.019659527732015375, "grad_norm": 2.523775339126587, "learning_rate": 1e-06, "loss": 0.1217, "step": 179 }, { "epoch": 0.019769357495881382, "grad_norm": 2.160933494567871, "learning_rate": 1e-06, "loss": 0.0974, "step": 180 }, { "epoch": 0.01987918725974739, "grad_norm": 1.6890003681182861, "learning_rate": 1e-06, "loss": 0.0856, "step": 181 }, { "epoch": 0.01987918725974739, "step": 181, "total_flos": 0.0, "train_loss": 0.05325274675755211, "train_runtime": 2257.3022, "train_samples_per_second": 1.283, "train_steps_per_second": 0.08 } ], "logging_steps": 1, "max_steps": 181, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 91, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }