{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5756837734952569, "epoch": 0.045454545454545456, "grad_norm": 5.5625, "learning_rate": 0.0, "loss": 1.4932, "mean_token_accuracy": 0.6398156471550465, "num_tokens": 213256.0, "step": 1 }, { "entropy": 0.5991804897785187, "epoch": 0.09090909090909091, "grad_norm": 5.0, "learning_rate": 1.4285714285714286e-06, "loss": 1.487, "mean_token_accuracy": 0.6374303828924894, "num_tokens": 427738.0, "step": 2 }, { "entropy": 0.6032040221616626, "epoch": 0.13636363636363635, "grad_norm": 5.6875, "learning_rate": 2.8571428571428573e-06, "loss": 1.4726, "mean_token_accuracy": 0.6417305879294872, "num_tokens": 627248.0, "step": 3 }, { "entropy": 0.5772652318701148, "epoch": 0.18181818181818182, "grad_norm": 4.84375, "learning_rate": 4.2857142857142855e-06, "loss": 1.484, "mean_token_accuracy": 0.6436302307993174, "num_tokens": 863658.0, "step": 4 }, { "entropy": 0.5723715648055077, "epoch": 0.22727272727272727, "grad_norm": 4.625, "learning_rate": 5.7142857142857145e-06, "loss": 1.4338, "mean_token_accuracy": 0.647509815171361, "num_tokens": 1086491.0, "step": 5 }, { "entropy": 0.6264624744653702, "epoch": 0.2727272727272727, "grad_norm": 4.21875, "learning_rate": 7.1428571428571436e-06, "loss": 1.4245, "mean_token_accuracy": 0.6502398364245892, "num_tokens": 1285338.0, "step": 6 }, { "entropy": 0.5932183619588614, "epoch": 0.3181818181818182, "grad_norm": 3.703125, "learning_rate": 8.571428571428571e-06, "loss": 1.4026, "mean_token_accuracy": 0.6524394080042839, "num_tokens": 1518366.0, "step": 7 }, { "entropy": 0.6039981953799725, "epoch": 0.36363636363636365, "grad_norm": 3.328125, "learning_rate": 1e-05, "loss": 1.3217, "mean_token_accuracy": 0.6681927014142275, "num_tokens": 1734339.0, "step": 8 }, { "entropy": 0.5884840972721577, "epoch": 0.4090909090909091, "grad_norm": 3.140625, "learning_rate": 9.9984209464165e-06, "loss": 1.2699, "mean_token_accuracy": 0.6729655731469393, "num_tokens": 1958264.0, "step": 9 }, { "entropy": 0.6048963023349643, "epoch": 0.45454545454545453, "grad_norm": 2.984375, "learning_rate": 9.99368478303009e-06, "loss": 1.237, "mean_token_accuracy": 0.6799562647938728, "num_tokens": 2166887.0, "step": 10 }, { "entropy": 0.5761522091925144, "epoch": 0.5, "grad_norm": 2.84375, "learning_rate": 9.98579450130307e-06, "loss": 1.2118, "mean_token_accuracy": 0.6838508639484644, "num_tokens": 2388908.0, "step": 11 }, { "entropy": 0.5694795530289412, "epoch": 0.5454545454545454, "grad_norm": 2.5625, "learning_rate": 9.974755084906503e-06, "loss": 1.1741, "mean_token_accuracy": 0.6954156272113323, "num_tokens": 2605601.0, "step": 12 }, { "entropy": 0.5798916202038527, "epoch": 0.5909090909090909, "grad_norm": 2.75, "learning_rate": 9.960573506572391e-06, "loss": 1.1211, "mean_token_accuracy": 0.6951853409409523, "num_tokens": 2817700.0, "step": 13 }, { "entropy": 0.5453419080004096, "epoch": 0.6363636363636364, "grad_norm": 3.1875, "learning_rate": 9.94325872368957e-06, "loss": 1.1015, "mean_token_accuracy": 0.7093381285667419, "num_tokens": 3040280.0, "step": 14 }, { "entropy": 0.5564651843160391, "epoch": 0.6818181818181818, "grad_norm": 2.296875, "learning_rate": 9.922821672646028e-06, "loss": 1.081, "mean_token_accuracy": 0.7106043919920921, "num_tokens": 3247106.0, "step": 15 }, { "entropy": 0.5601713042706251, "epoch": 0.7272727272727273, "grad_norm": 2.328125, "learning_rate": 9.899275261921236e-06, "loss": 1.0923, "mean_token_accuracy": 0.7116606514900923, "num_tokens": 3458426.0, "step": 16 }, { "entropy": 0.5526782963424921, "epoch": 0.7727272727272727, "grad_norm": 2.25, "learning_rate": 9.872634363932887e-06, "loss": 1.0756, "mean_token_accuracy": 0.7068049628287554, "num_tokens": 3676439.0, "step": 17 }, { "entropy": 0.5471083605661988, "epoch": 0.8181818181818182, "grad_norm": 2.359375, "learning_rate": 9.842915805643156e-06, "loss": 1.0221, "mean_token_accuracy": 0.7173160128295422, "num_tokens": 3894572.0, "step": 18 }, { "entropy": 0.5807137787342072, "epoch": 0.8636363636363636, "grad_norm": 2.1875, "learning_rate": 9.81013835793043e-06, "loss": 0.9976, "mean_token_accuracy": 0.7263931501656771, "num_tokens": 4098419.0, "step": 19 }, { "entropy": 0.5447382191196084, "epoch": 0.9090909090909091, "grad_norm": 2.34375, "learning_rate": 9.774322723733216e-06, "loss": 0.9973, "mean_token_accuracy": 0.721901087090373, "num_tokens": 4317337.0, "step": 20 }, { "entropy": 0.5411477498710155, "epoch": 0.9545454545454546, "grad_norm": 2.0, "learning_rate": 9.735491524973723e-06, "loss": 1.0118, "mean_token_accuracy": 0.7198995053768158, "num_tokens": 4541085.0, "step": 21 }, { "entropy": 0.5523755457252264, "epoch": 1.0, "grad_norm": 2.109375, "learning_rate": 9.693669288269371e-06, "loss": 0.981, "mean_token_accuracy": 0.7275056149810553, "num_tokens": 4761424.0, "step": 22 }, { "entropy": 0.5443653706461191, "epoch": 1.0454545454545454, "grad_norm": 2.140625, "learning_rate": 9.648882429441258e-06, "loss": 0.9525, "mean_token_accuracy": 0.7372480425983667, "num_tokens": 4972261.0, "step": 23 }, { "entropy": 0.5743730887770653, "epoch": 1.0909090909090908, "grad_norm": 1.921875, "learning_rate": 9.601159236829353e-06, "loss": 0.9451, "mean_token_accuracy": 0.7314453404396772, "num_tokens": 5175446.0, "step": 24 }, { "entropy": 0.5375164104625583, "epoch": 1.1363636363636362, "grad_norm": 2.1875, "learning_rate": 9.550529853424979e-06, "loss": 0.9413, "mean_token_accuracy": 0.7362319473177195, "num_tokens": 5383844.0, "step": 25 }, { "entropy": 0.5443251971155405, "epoch": 1.1818181818181819, "grad_norm": 2.0625, "learning_rate": 9.497026257831856e-06, "loss": 0.976, "mean_token_accuracy": 0.7270661573857069, "num_tokens": 5603658.0, "step": 26 }, { "entropy": 0.517438679933548, "epoch": 1.2272727272727273, "grad_norm": 1.984375, "learning_rate": 9.440682244067724e-06, "loss": 0.9339, "mean_token_accuracy": 0.7412919718772173, "num_tokens": 5833420.0, "step": 27 }, { "entropy": 0.5209640683606267, "epoch": 1.2727272727272727, "grad_norm": 2.546875, "learning_rate": 9.381533400219319e-06, "loss": 0.9073, "mean_token_accuracy": 0.7471859473735094, "num_tokens": 6049437.0, "step": 28 }, { "entropy": 0.5130348689854145, "epoch": 1.3181818181818181, "grad_norm": 2.390625, "learning_rate": 9.319617085964177e-06, "loss": 0.8903, "mean_token_accuracy": 0.7493606805801392, "num_tokens": 6272070.0, "step": 29 }, { "entropy": 0.5314307110384107, "epoch": 1.3636363636363638, "grad_norm": 2.09375, "learning_rate": 9.25497240897346e-06, "loss": 0.8725, "mean_token_accuracy": 0.7509097009897232, "num_tokens": 6480742.0, "step": 30 }, { "entropy": 0.5306935114786029, "epoch": 1.4090909090909092, "grad_norm": 4.21875, "learning_rate": 9.18764020021071e-06, "loss": 0.8547, "mean_token_accuracy": 0.752067357301712, "num_tokens": 6683773.0, "step": 31 }, { "entropy": 0.5351670542731881, "epoch": 1.4545454545454546, "grad_norm": 2.015625, "learning_rate": 9.117662988142138e-06, "loss": 0.8583, "mean_token_accuracy": 0.7536429259926081, "num_tokens": 6886687.0, "step": 32 }, { "entropy": 0.5129835363477468, "epoch": 1.5, "grad_norm": 2.078125, "learning_rate": 9.045084971874738e-06, "loss": 0.89, "mean_token_accuracy": 0.7455399166792631, "num_tokens": 7115408.0, "step": 33 }, { "entropy": 0.5456664310768247, "epoch": 1.5454545454545454, "grad_norm": 2.171875, "learning_rate": 8.969951993239177e-06, "loss": 0.8931, "mean_token_accuracy": 0.7431404571980238, "num_tokens": 7311605.0, "step": 34 }, { "entropy": 0.5233238851651549, "epoch": 1.5909090909090908, "grad_norm": 2.0, "learning_rate": 8.892311507835118e-06, "loss": 0.894, "mean_token_accuracy": 0.7455066256225109, "num_tokens": 7536281.0, "step": 35 }, { "entropy": 0.5304507119581103, "epoch": 1.6363636363636362, "grad_norm": 1.9921875, "learning_rate": 8.81221255505724e-06, "loss": 0.8802, "mean_token_accuracy": 0.7477323599159718, "num_tokens": 7750772.0, "step": 36 }, { "entropy": 0.5277951331809163, "epoch": 1.6818181818181817, "grad_norm": 1.9375, "learning_rate": 8.729705727120911e-06, "loss": 0.8726, "mean_token_accuracy": 0.7508427072316408, "num_tokens": 7974590.0, "step": 37 }, { "entropy": 0.5125188445672393, "epoch": 1.7272727272727273, "grad_norm": 2.890625, "learning_rate": 8.644843137107058e-06, "loss": 0.8935, "mean_token_accuracy": 0.7437247112393379, "num_tokens": 8201132.0, "step": 38 }, { "entropy": 0.5137846125289798, "epoch": 1.7727272727272727, "grad_norm": 2.46875, "learning_rate": 8.557678386046429e-06, "loss": 0.8641, "mean_token_accuracy": 0.7521512098610401, "num_tokens": 8422575.0, "step": 39 }, { "entropy": 0.515689549036324, "epoch": 1.8181818181818183, "grad_norm": 2.28125, "learning_rate": 8.468266529064025e-06, "loss": 0.8577, "mean_token_accuracy": 0.752921536564827, "num_tokens": 8635107.0, "step": 40 }, { "entropy": 0.5162663543596864, "epoch": 1.8636363636363638, "grad_norm": 2.296875, "learning_rate": 8.376664040605122e-06, "loss": 0.864, "mean_token_accuracy": 0.7545531969517469, "num_tokens": 8842212.0, "step": 41 }, { "entropy": 0.514712393283844, "epoch": 1.9090909090909092, "grad_norm": 1.84375, "learning_rate": 8.282928778764783e-06, "loss": 0.8809, "mean_token_accuracy": 0.7484686318784952, "num_tokens": 9070194.0, "step": 42 }, { "entropy": 0.5233986722305417, "epoch": 1.9545454545454546, "grad_norm": 2.265625, "learning_rate": 8.18711994874345e-06, "loss": 0.8139, "mean_token_accuracy": 0.7604574281722307, "num_tokens": 9267652.0, "step": 43 }, { "entropy": 0.521757710725069, "epoch": 2.0, "grad_norm": 1.90625, "learning_rate": 8.089298065451673e-06, "loss": 0.8508, "mean_token_accuracy": 0.75594780780375, "num_tokens": 9490017.0, "step": 44 }, { "entropy": 0.5071435309946537, "epoch": 2.0454545454545454, "grad_norm": 1.90625, "learning_rate": 7.989524915287595e-06, "loss": 0.8516, "mean_token_accuracy": 0.7543456256389618, "num_tokens": 9711919.0, "step": 45 }, { "entropy": 0.5045623360201716, "epoch": 2.090909090909091, "grad_norm": 2.03125, "learning_rate": 7.887863517111337e-06, "loss": 0.8573, "mean_token_accuracy": 0.7523710802197456, "num_tokens": 9949908.0, "step": 46 }, { "entropy": 0.5323316175490618, "epoch": 2.1363636363636362, "grad_norm": 2.25, "learning_rate": 7.78437808244094e-06, "loss": 0.8448, "mean_token_accuracy": 0.7550894934684038, "num_tokens": 10155466.0, "step": 47 }, { "entropy": 0.5101688215509057, "epoch": 2.1818181818181817, "grad_norm": 1.953125, "learning_rate": 7.679133974894984e-06, "loss": 0.8411, "mean_token_accuracy": 0.7548086270689964, "num_tokens": 10371898.0, "step": 48 }, { "entropy": 0.5281436312943697, "epoch": 2.227272727272727, "grad_norm": 1.96875, "learning_rate": 7.572197668907533e-06, "loss": 0.8431, "mean_token_accuracy": 0.7579620629549026, "num_tokens": 10586035.0, "step": 49 }, { "entropy": 0.5026519363746047, "epoch": 2.2727272727272725, "grad_norm": 1.859375, "learning_rate": 7.463636707741458e-06, "loss": 0.8355, "mean_token_accuracy": 0.7575494665652514, "num_tokens": 10814372.0, "step": 50 }, { "entropy": 0.5014037564396858, "epoch": 2.3181818181818183, "grad_norm": 2.046875, "learning_rate": 7.353519660826665e-06, "loss": 0.8185, "mean_token_accuracy": 0.7590322364121675, "num_tokens": 11035168.0, "step": 51 }, { "entropy": 0.5116605255752802, "epoch": 2.3636363636363638, "grad_norm": 1.9140625, "learning_rate": 7.241916080450163e-06, "loss": 0.8359, "mean_token_accuracy": 0.7558552380651236, "num_tokens": 11249229.0, "step": 52 }, { "entropy": 0.5000560870394111, "epoch": 2.409090909090909, "grad_norm": 2.609375, "learning_rate": 7.128896457825364e-06, "loss": 0.8081, "mean_token_accuracy": 0.7626348324120045, "num_tokens": 11462723.0, "step": 53 }, { "entropy": 0.5258028572425246, "epoch": 2.4545454545454546, "grad_norm": 2.015625, "learning_rate": 7.014532178568314e-06, "loss": 0.83, "mean_token_accuracy": 0.7577290665358305, "num_tokens": 11664590.0, "step": 54 }, { "entropy": 0.49798215832561255, "epoch": 2.5, "grad_norm": 2.125, "learning_rate": 6.898895477609007e-06, "loss": 0.83, "mean_token_accuracy": 0.757939899340272, "num_tokens": 11894045.0, "step": 55 }, { "entropy": 0.5007442878559232, "epoch": 2.5454545454545454, "grad_norm": 2.03125, "learning_rate": 6.782059393566254e-06, "loss": 0.8315, "mean_token_accuracy": 0.75820074044168, "num_tokens": 12115353.0, "step": 56 }, { "entropy": 0.5033395420759916, "epoch": 2.590909090909091, "grad_norm": 2.078125, "learning_rate": 6.664097722614934e-06, "loss": 0.8273, "mean_token_accuracy": 0.7577789463102818, "num_tokens": 12332745.0, "step": 57 }, { "entropy": 0.5042030932381749, "epoch": 2.6363636363636362, "grad_norm": 2.4375, "learning_rate": 6.545084971874738e-06, "loss": 0.8343, "mean_token_accuracy": 0.7549517750740051, "num_tokens": 12554302.0, "step": 58 }, { "entropy": 0.509183426387608, "epoch": 2.6818181818181817, "grad_norm": 2.015625, "learning_rate": 6.425096312349881e-06, "loss": 0.796, "mean_token_accuracy": 0.7663499284535646, "num_tokens": 12764709.0, "step": 59 }, { "entropy": 0.5077009173110127, "epoch": 2.7272727272727275, "grad_norm": 2.0625, "learning_rate": 6.304207531449486e-06, "loss": 0.8174, "mean_token_accuracy": 0.7621090263128281, "num_tokens": 12983370.0, "step": 60 }, { "entropy": 0.5197898102924228, "epoch": 2.7727272727272725, "grad_norm": 2.0625, "learning_rate": 6.182494985118625e-06, "loss": 0.8088, "mean_token_accuracy": 0.7662495765835047, "num_tokens": 13195618.0, "step": 61 }, { "entropy": 0.49907076358795166, "epoch": 2.8181818181818183, "grad_norm": 2.109375, "learning_rate": 6.060035549610275e-06, "loss": 0.7853, "mean_token_accuracy": 0.7684814091771841, "num_tokens": 13409539.0, "step": 62 }, { "entropy": 0.5073017841205001, "epoch": 2.8636363636363638, "grad_norm": 2.015625, "learning_rate": 5.936906572928625e-06, "loss": 0.8065, "mean_token_accuracy": 0.7650269959121943, "num_tokens": 13620769.0, "step": 63 }, { "entropy": 0.5056956252083182, "epoch": 2.909090909090909, "grad_norm": 2.0, "learning_rate": 5.813185825974419e-06, "loss": 0.8059, "mean_token_accuracy": 0.7649325635284185, "num_tokens": 13842038.0, "step": 64 }, { "entropy": 0.5035996483638883, "epoch": 2.9545454545454546, "grad_norm": 1.921875, "learning_rate": 5.68895145342319e-06, "loss": 0.8247, "mean_token_accuracy": 0.7576907705515623, "num_tokens": 14060587.0, "step": 65 }, { "entropy": 0.5117500508204103, "epoch": 3.0, "grad_norm": 2.0625, "learning_rate": 5.5642819243674085e-06, "loss": 0.7885, "mean_token_accuracy": 0.7658284697681665, "num_tokens": 14271664.0, "step": 66 }, { "entropy": 0.5234015788882971, "epoch": 3.0454545454545454, "grad_norm": 1.859375, "learning_rate": 5.439255982753717e-06, "loss": 0.8105, "mean_token_accuracy": 0.763946671038866, "num_tokens": 14482081.0, "step": 67 }, { "entropy": 0.49382613878697157, "epoch": 3.090909090909091, "grad_norm": 2.4375, "learning_rate": 5.3139525976465675e-06, "loss": 0.8101, "mean_token_accuracy": 0.7636537831276655, "num_tokens": 14716672.0, "step": 68 }, { "entropy": 0.5024409759789705, "epoch": 3.1363636363636362, "grad_norm": 2.03125, "learning_rate": 5.188450913349674e-06, "loss": 0.7897, "mean_token_accuracy": 0.7669059839099646, "num_tokens": 14933081.0, "step": 69 }, { "entropy": 0.5273290555924177, "epoch": 3.1818181818181817, "grad_norm": 2.03125, "learning_rate": 5.062830199416764e-06, "loss": 0.7848, "mean_token_accuracy": 0.7632801961153746, "num_tokens": 15136984.0, "step": 70 }, { "entropy": 0.5051014283671975, "epoch": 3.227272727272727, "grad_norm": 2.03125, "learning_rate": 4.937169800583237e-06, "loss": 0.7991, "mean_token_accuracy": 0.7659726981073618, "num_tokens": 15350959.0, "step": 71 }, { "entropy": 0.5017923256382346, "epoch": 3.2727272727272725, "grad_norm": 2.03125, "learning_rate": 4.811549086650327e-06, "loss": 0.7963, "mean_token_accuracy": 0.7660163976252079, "num_tokens": 15568699.0, "step": 72 }, { "entropy": 0.5082337036728859, "epoch": 3.3181818181818183, "grad_norm": 2.140625, "learning_rate": 4.686047402353433e-06, "loss": 0.8051, "mean_token_accuracy": 0.7663215212523937, "num_tokens": 15778760.0, "step": 73 }, { "entropy": 0.49399478174746037, "epoch": 3.3636363636363638, "grad_norm": 2.0625, "learning_rate": 4.560744017246284e-06, "loss": 0.787, "mean_token_accuracy": 0.7704313322901726, "num_tokens": 15997159.0, "step": 74 }, { "entropy": 0.5041727554053068, "epoch": 3.409090909090909, "grad_norm": 2.1875, "learning_rate": 4.4357180756325915e-06, "loss": 0.7801, "mean_token_accuracy": 0.7677918504923582, "num_tokens": 16215568.0, "step": 75 }, { "entropy": 0.4995924336835742, "epoch": 3.4545454545454546, "grad_norm": 1.9140625, "learning_rate": 4.31104854657681e-06, "loss": 0.7761, "mean_token_accuracy": 0.7708524893969297, "num_tokens": 16425520.0, "step": 76 }, { "entropy": 0.5208209808915854, "epoch": 3.5, "grad_norm": 2.390625, "learning_rate": 4.186814174025582e-06, "loss": 0.7868, "mean_token_accuracy": 0.7692476995289326, "num_tokens": 16623924.0, "step": 77 }, { "entropy": 0.5112534500658512, "epoch": 3.5454545454545454, "grad_norm": 2.3125, "learning_rate": 4.063093427071376e-06, "loss": 0.8215, "mean_token_accuracy": 0.7575955875217915, "num_tokens": 16837670.0, "step": 78 }, { "entropy": 0.48737939167767763, "epoch": 3.590909090909091, "grad_norm": 2.125, "learning_rate": 3.939964450389728e-06, "loss": 0.7774, "mean_token_accuracy": 0.7722372729331255, "num_tokens": 17055501.0, "step": 79 }, { "entropy": 0.50137943867594, "epoch": 3.6363636363636362, "grad_norm": 2.0, "learning_rate": 3.817505014881378e-06, "loss": 0.7922, "mean_token_accuracy": 0.7668295446783304, "num_tokens": 17276503.0, "step": 80 }, { "entropy": 0.4918771870434284, "epoch": 3.6818181818181817, "grad_norm": 2.28125, "learning_rate": 3.695792468550517e-06, "loss": 0.797, "mean_token_accuracy": 0.7658107988536358, "num_tokens": 17500421.0, "step": 81 }, { "entropy": 0.49470567237585783, "epoch": 3.7272727272727275, "grad_norm": 1.96875, "learning_rate": 3.5749036876501196e-06, "loss": 0.7736, "mean_token_accuracy": 0.770993497222662, "num_tokens": 17726906.0, "step": 82 }, { "entropy": 0.49783285334706306, "epoch": 3.7727272727272725, "grad_norm": 2.046875, "learning_rate": 3.4549150281252635e-06, "loss": 0.8268, "mean_token_accuracy": 0.759495971724391, "num_tokens": 17936432.0, "step": 83 }, { "entropy": 0.5077159395441413, "epoch": 3.8181818181818183, "grad_norm": 1.9453125, "learning_rate": 3.3359022773850673e-06, "loss": 0.787, "mean_token_accuracy": 0.7670105397701263, "num_tokens": 18151745.0, "step": 84 }, { "entropy": 0.4940468706190586, "epoch": 3.8636363636363638, "grad_norm": 2.015625, "learning_rate": 3.217940606433747e-06, "loss": 0.7725, "mean_token_accuracy": 0.7713121753185987, "num_tokens": 18368653.0, "step": 85 }, { "entropy": 0.5109028052538633, "epoch": 3.909090909090909, "grad_norm": 2.046875, "learning_rate": 3.1011045223909954e-06, "loss": 0.7891, "mean_token_accuracy": 0.765845526009798, "num_tokens": 18580544.0, "step": 86 }, { "entropy": 0.5074712671339512, "epoch": 3.9545454545454546, "grad_norm": 2.453125, "learning_rate": 2.9854678214316875e-06, "loss": 0.7784, "mean_token_accuracy": 0.770260414108634, "num_tokens": 18793479.0, "step": 87 }, { "entropy": 0.5031467285007238, "epoch": 4.0, "grad_norm": 1.9765625, "learning_rate": 2.871103542174637e-06, "loss": 0.8017, "mean_token_accuracy": 0.7639777194708586, "num_tokens": 19017990.0, "step": 88 }, { "entropy": 0.5008434914052486, "epoch": 4.045454545454546, "grad_norm": 1.984375, "learning_rate": 2.7580839195498397e-06, "loss": 0.7876, "mean_token_accuracy": 0.765197154134512, "num_tokens": 19232836.0, "step": 89 }, { "entropy": 0.5124122239649296, "epoch": 4.090909090909091, "grad_norm": 1.9140625, "learning_rate": 2.646480339173337e-06, "loss": 0.7836, "mean_token_accuracy": 0.7690869830548763, "num_tokens": 19439837.0, "step": 90 }, { "entropy": 0.505384799093008, "epoch": 4.136363636363637, "grad_norm": 1.9765625, "learning_rate": 2.536363292258543e-06, "loss": 0.7888, "mean_token_accuracy": 0.767447579652071, "num_tokens": 19649689.0, "step": 91 }, { "entropy": 0.5008498653769493, "epoch": 4.181818181818182, "grad_norm": 1.9765625, "learning_rate": 2.4278023310924676e-06, "loss": 0.7858, "mean_token_accuracy": 0.7692194785922766, "num_tokens": 19865021.0, "step": 92 }, { "entropy": 0.49786832462996244, "epoch": 4.2272727272727275, "grad_norm": 1.859375, "learning_rate": 2.320866025105016e-06, "loss": 0.7896, "mean_token_accuracy": 0.7659407686442137, "num_tokens": 20088549.0, "step": 93 }, { "entropy": 0.4926151381805539, "epoch": 4.2727272727272725, "grad_norm": 1.8671875, "learning_rate": 2.2156219175590623e-06, "loss": 0.7791, "mean_token_accuracy": 0.7721114605665207, "num_tokens": 20314204.0, "step": 94 }, { "entropy": 0.4964353507384658, "epoch": 4.318181818181818, "grad_norm": 2.328125, "learning_rate": 2.112136482888663e-06, "loss": 0.7619, "mean_token_accuracy": 0.7737916205078363, "num_tokens": 20523190.0, "step": 95 }, { "entropy": 0.5042525352910161, "epoch": 4.363636363636363, "grad_norm": 2.21875, "learning_rate": 2.0104750847124075e-06, "loss": 0.7674, "mean_token_accuracy": 0.7730825170874596, "num_tokens": 20729046.0, "step": 96 }, { "entropy": 0.4980954099446535, "epoch": 4.409090909090909, "grad_norm": 2.453125, "learning_rate": 1.910701934548329e-06, "loss": 0.7684, "mean_token_accuracy": 0.7743080649524927, "num_tokens": 20925674.0, "step": 97 }, { "entropy": 0.5134334182366729, "epoch": 4.454545454545454, "grad_norm": 2.734375, "learning_rate": 1.8128800512565514e-06, "loss": 0.7852, "mean_token_accuracy": 0.7661070600152016, "num_tokens": 21149856.0, "step": 98 }, { "entropy": 0.48121114261448383, "epoch": 4.5, "grad_norm": 1.953125, "learning_rate": 1.7170712212352187e-06, "loss": 0.781, "mean_token_accuracy": 0.7670229282230139, "num_tokens": 21380719.0, "step": 99 }, { "entropy": 0.4965139916166663, "epoch": 4.545454545454545, "grad_norm": 1.8125, "learning_rate": 1.6233359593948777e-06, "loss": 0.7825, "mean_token_accuracy": 0.7711650598794222, "num_tokens": 21593512.0, "step": 100 }, { "entropy": 0.49770417250692844, "epoch": 4.590909090909091, "grad_norm": 1.828125, "learning_rate": 1.531733470935976e-06, "loss": 0.7974, "mean_token_accuracy": 0.7630838695913553, "num_tokens": 21813186.0, "step": 101 }, { "entropy": 0.5052190851420164, "epoch": 4.636363636363637, "grad_norm": 2.421875, "learning_rate": 1.4423216139535735e-06, "loss": 0.7991, "mean_token_accuracy": 0.7624437268823385, "num_tokens": 22026623.0, "step": 102 }, { "entropy": 0.5221615890040994, "epoch": 4.681818181818182, "grad_norm": 1.796875, "learning_rate": 1.3551568628929434e-06, "loss": 0.7913, "mean_token_accuracy": 0.7647000271826982, "num_tokens": 22244422.0, "step": 103 }, { "entropy": 0.5008693430572748, "epoch": 4.7272727272727275, "grad_norm": 2.21875, "learning_rate": 1.2702942728790897e-06, "loss": 0.7834, "mean_token_accuracy": 0.7710994388908148, "num_tokens": 22463984.0, "step": 104 }, { "entropy": 0.48945672158151865, "epoch": 4.7727272727272725, "grad_norm": 2.171875, "learning_rate": 1.18778744494276e-06, "loss": 0.7905, "mean_token_accuracy": 0.7667012866586447, "num_tokens": 22682817.0, "step": 105 }, { "entropy": 0.48861549887806177, "epoch": 4.818181818181818, "grad_norm": 4.28125, "learning_rate": 1.1076884921648834e-06, "loss": 0.8045, "mean_token_accuracy": 0.7612419724464417, "num_tokens": 22917878.0, "step": 106 }, { "entropy": 0.49805399868637323, "epoch": 4.863636363636363, "grad_norm": 1.875, "learning_rate": 1.0300480067608232e-06, "loss": 0.7937, "mean_token_accuracy": 0.7647292520850897, "num_tokens": 23123886.0, "step": 107 }, { "entropy": 0.5075469352304935, "epoch": 4.909090909090909, "grad_norm": 2.390625, "learning_rate": 9.549150281252633e-07, "loss": 0.7662, "mean_token_accuracy": 0.7728671338409185, "num_tokens": 23326280.0, "step": 108 }, { "entropy": 0.4884511986747384, "epoch": 4.954545454545455, "grad_norm": 1.7890625, "learning_rate": 8.823370118578628e-07, "loss": 0.7868, "mean_token_accuracy": 0.7672145701944828, "num_tokens": 23570721.0, "step": 109 }, { "entropy": 0.4963974915444851, "epoch": 5.0, "grad_norm": 2.140625, "learning_rate": 8.123597997892918e-07, "loss": 0.8003, "mean_token_accuracy": 0.7601921837776899, "num_tokens": 23797581.0, "step": 110 }, { "entropy": 0.5236662002280354, "epoch": 5.045454545454546, "grad_norm": 2.125, "learning_rate": 7.450275910265415e-07, "loss": 0.7908, "mean_token_accuracy": 0.7661727890372276, "num_tokens": 23988050.0, "step": 111 }, { "entropy": 0.5031342897564173, "epoch": 5.090909090909091, "grad_norm": 2.5, "learning_rate": 6.803829140358237e-07, "loss": 0.7843, "mean_token_accuracy": 0.769377738237381, "num_tokens": 24194925.0, "step": 112 }, { "entropy": 0.49556155782192945, "epoch": 5.136363636363637, "grad_norm": 1.8984375, "learning_rate": 6.184665997806832e-07, "loss": 0.7705, "mean_token_accuracy": 0.771408112719655, "num_tokens": 24414755.0, "step": 113 }, { "entropy": 0.4983775094151497, "epoch": 5.181818181818182, "grad_norm": 1.875, "learning_rate": 5.593177559322776e-07, "loss": 0.771, "mean_token_accuracy": 0.7732552234083414, "num_tokens": 24630985.0, "step": 114 }, { "entropy": 0.509802995249629, "epoch": 5.2272727272727275, "grad_norm": 1.984375, "learning_rate": 5.029737421681446e-07, "loss": 0.7953, "mean_token_accuracy": 0.7667434271425009, "num_tokens": 24854501.0, "step": 115 }, { "entropy": 0.49124314822256565, "epoch": 5.2727272727272725, "grad_norm": 2.34375, "learning_rate": 4.494701465750217e-07, "loss": 0.7816, "mean_token_accuracy": 0.7701654080301523, "num_tokens": 25085284.0, "step": 116 }, { "entropy": 0.5039074392989278, "epoch": 5.318181818181818, "grad_norm": 1.796875, "learning_rate": 3.9884076317064813e-07, "loss": 0.7772, "mean_token_accuracy": 0.7688853479921818, "num_tokens": 25297733.0, "step": 117 }, { "entropy": 0.49862359277904034, "epoch": 5.363636363636363, "grad_norm": 1.9765625, "learning_rate": 3.511175705587433e-07, "loss": 0.7853, "mean_token_accuracy": 0.7682102452963591, "num_tokens": 25518971.0, "step": 118 }, { "entropy": 0.4892881168052554, "epoch": 5.409090909090909, "grad_norm": 1.921875, "learning_rate": 3.0633071173062966e-07, "loss": 0.788, "mean_token_accuracy": 0.767511548474431, "num_tokens": 25748568.0, "step": 119 }, { "entropy": 0.5163639336824417, "epoch": 5.454545454545454, "grad_norm": 1.9375, "learning_rate": 2.6450847502627883e-07, "loss": 0.7587, "mean_token_accuracy": 0.7713187728077173, "num_tokens": 25945280.0, "step": 120 }, { "entropy": 0.49786319863051176, "epoch": 5.5, "grad_norm": 2.0625, "learning_rate": 2.2567727626678527e-07, "loss": 0.7704, "mean_token_accuracy": 0.7727676276117563, "num_tokens": 26165085.0, "step": 121 }, { "entropy": 0.48858580458909273, "epoch": 5.545454545454545, "grad_norm": 1.8984375, "learning_rate": 1.8986164206957037e-07, "loss": 0.7763, "mean_token_accuracy": 0.7700363770127296, "num_tokens": 26389656.0, "step": 122 }, { "entropy": 0.49373806826770306, "epoch": 5.590909090909091, "grad_norm": 2.0625, "learning_rate": 1.5708419435684463e-07, "loss": 0.7705, "mean_token_accuracy": 0.770933760330081, "num_tokens": 26612348.0, "step": 123 }, { "entropy": 0.5205885702744126, "epoch": 5.636363636363637, "grad_norm": 2.40625, "learning_rate": 1.2736563606711384e-07, "loss": 0.7881, "mean_token_accuracy": 0.7669357471168041, "num_tokens": 26814128.0, "step": 124 }, { "entropy": 0.5114659816026688, "epoch": 5.681818181818182, "grad_norm": 2.640625, "learning_rate": 1.007247380787657e-07, "loss": 0.7778, "mean_token_accuracy": 0.7689048480242491, "num_tokens": 27017995.0, "step": 125 }, { "entropy": 0.5094327395781875, "epoch": 5.7272727272727275, "grad_norm": 2.296875, "learning_rate": 7.717832735397335e-08, "loss": 0.8061, "mean_token_accuracy": 0.7618958260864019, "num_tokens": 27236093.0, "step": 126 }, { "entropy": 0.49905121326446533, "epoch": 5.7727272727272725, "grad_norm": 2.28125, "learning_rate": 5.674127631043025e-08, "loss": 0.7821, "mean_token_accuracy": 0.765771547332406, "num_tokens": 27444153.0, "step": 127 }, { "entropy": 0.5201311567798257, "epoch": 5.818181818181818, "grad_norm": 2.03125, "learning_rate": 3.9426493427611177e-08, "loss": 0.7799, "mean_token_accuracy": 0.7713412661105394, "num_tokens": 27648072.0, "step": 128 }, { "entropy": 0.4998158114030957, "epoch": 5.863636363636363, "grad_norm": 2.3125, "learning_rate": 2.5244915093499134e-08, "loss": 0.7712, "mean_token_accuracy": 0.7739516459405422, "num_tokens": 27855673.0, "step": 129 }, { "entropy": 0.5035147462040186, "epoch": 5.909090909090909, "grad_norm": 2.0, "learning_rate": 1.4205498696930332e-08, "loss": 0.7693, "mean_token_accuracy": 0.7732928432524204, "num_tokens": 28069882.0, "step": 130 }, { "entropy": 0.50436632335186, "epoch": 5.954545454545455, "grad_norm": 2.1875, "learning_rate": 6.315216969912663e-09, "loss": 0.7765, "mean_token_accuracy": 0.7761289775371552, "num_tokens": 28286266.0, "step": 131 }, { "entropy": 0.4877948518842459, "epoch": 6.0, "grad_norm": 1.859375, "learning_rate": 1.5790535835003006e-09, "loss": 0.7997, "mean_token_accuracy": 0.7679535001516342, "num_tokens": 28515234.0, "step": 132 }, { "epoch": 6.0, "step": 132, "total_flos": 6.216626585809981e+17, "train_loss": 0.8825002660353979, "train_runtime": 2572.3849, "train_samples_per_second": 2.808, "train_steps_per_second": 0.051 } ], "logging_steps": 1, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.216626585809981e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }