{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.8042294681072235, "epoch": 0.025806451612903226, "grad_norm": 1.166382074356079, "learning_rate": 0.0, "loss": 2.5975, "mean_token_accuracy": 0.4834420457482338, "num_tokens": 1533.0, "step": 1 }, { "entropy": 1.8224012553691864, "epoch": 0.05161290322580645, "grad_norm": 1.568097472190857, "learning_rate": 8.333333333333334e-06, "loss": 2.6194, "mean_token_accuracy": 0.5228946506977081, "num_tokens": 2447.0, "step": 2 }, { "entropy": 2.1347350478172302, "epoch": 0.07741935483870968, "grad_norm": 1.6636226177215576, "learning_rate": 1.6666666666666667e-05, "loss": 3.1216, "mean_token_accuracy": 0.4500608742237091, "num_tokens": 3252.0, "step": 3 }, { "entropy": 2.042035460472107, "epoch": 0.1032258064516129, "grad_norm": 1.8585174083709717, "learning_rate": 2.5e-05, "loss": 3.0927, "mean_token_accuracy": 0.434286504983902, "num_tokens": 3990.0, "step": 4 }, { "entropy": 2.0793383419513702, "epoch": 0.12903225806451613, "grad_norm": 2.271517753601074, "learning_rate": 3.3333333333333335e-05, "loss": 3.1323, "mean_token_accuracy": 0.44490282237529755, "num_tokens": 4623.0, "step": 5 }, { "entropy": 2.078058958053589, "epoch": 0.15483870967741936, "grad_norm": 2.0911874771118164, "learning_rate": 4.166666666666667e-05, "loss": 3.0791, "mean_token_accuracy": 0.4434494748711586, "num_tokens": 5202.0, "step": 6 }, { "entropy": 1.9296036958694458, "epoch": 0.18064516129032257, "grad_norm": 2.447918176651001, "learning_rate": 5e-05, "loss": 2.9283, "mean_token_accuracy": 0.5010824277997017, "num_tokens": 5738.0, "step": 7 }, { "entropy": 2.1553411781787872, "epoch": 0.2064516129032258, "grad_norm": 2.70611572265625, "learning_rate": 5.833333333333334e-05, "loss": 2.8435, "mean_token_accuracy": 0.498832605779171, "num_tokens": 6235.0, "step": 8 }, { "entropy": 2.148306369781494, "epoch": 0.23225806451612904, "grad_norm": 2.3149070739746094, "learning_rate": 6.666666666666667e-05, "loss": 2.8677, "mean_token_accuracy": 0.46573129296302795, "num_tokens": 6703.0, "step": 9 }, { "entropy": 1.9346267580986023, "epoch": 0.25806451612903225, "grad_norm": 1.3574178218841553, "learning_rate": 7.500000000000001e-05, "loss": 2.4543, "mean_token_accuracy": 0.5017582848668098, "num_tokens": 8003.0, "step": 10 }, { "entropy": 2.2560064792633057, "epoch": 0.2838709677419355, "grad_norm": 1.4286997318267822, "learning_rate": 8.333333333333334e-05, "loss": 2.4076, "mean_token_accuracy": 0.516123816370964, "num_tokens": 8830.0, "step": 11 }, { "entropy": 2.271284520626068, "epoch": 0.3096774193548387, "grad_norm": 1.289847493171692, "learning_rate": 9.166666666666667e-05, "loss": 2.2502, "mean_token_accuracy": 0.581367239356041, "num_tokens": 9586.0, "step": 12 }, { "entropy": 2.506469488143921, "epoch": 0.33548387096774196, "grad_norm": 1.698026418685913, "learning_rate": 0.0001, "loss": 2.5559, "mean_token_accuracy": 0.5279825925827026, "num_tokens": 10255.0, "step": 13 }, { "entropy": 2.488889992237091, "epoch": 0.36129032258064514, "grad_norm": 2.1104917526245117, "learning_rate": 9.999827315381885e-05, "loss": 2.3051, "mean_token_accuracy": 0.5456234812736511, "num_tokens": 10842.0, "step": 14 }, { "entropy": 2.494838774204254, "epoch": 0.3870967741935484, "grad_norm": 1.7446825504302979, "learning_rate": 9.999309273455528e-05, "loss": 2.1948, "mean_token_accuracy": 0.5685414522886276, "num_tokens": 11363.0, "step": 15 }, { "entropy": 2.623446822166443, "epoch": 0.4129032258064516, "grad_norm": 1.934134840965271, "learning_rate": 9.998445910004082e-05, "loss": 2.2624, "mean_token_accuracy": 0.5481147766113281, "num_tokens": 11819.0, "step": 16 }, { "entropy": 2.3205150961875916, "epoch": 0.43870967741935485, "grad_norm": 1.6750158071517944, "learning_rate": 9.997237284663379e-05, "loss": 1.8547, "mean_token_accuracy": 0.6086297482252121, "num_tokens": 12247.0, "step": 17 }, { "entropy": 2.435093104839325, "epoch": 0.4645161290322581, "grad_norm": 1.8602609634399414, "learning_rate": 9.995683480917821e-05, "loss": 2.1032, "mean_token_accuracy": 0.5650125294923782, "num_tokens": 12646.0, "step": 18 }, { "entropy": 2.1141549050807953, "epoch": 0.49032258064516127, "grad_norm": 0.9358610510826111, "learning_rate": 9.993784606094612e-05, "loss": 1.9903, "mean_token_accuracy": 0.5407712012529373, "num_tokens": 14509.0, "step": 19 }, { "entropy": 2.083885967731476, "epoch": 0.5161290322580645, "grad_norm": 1.1308526992797852, "learning_rate": 9.991540791356342e-05, "loss": 1.8726, "mean_token_accuracy": 0.5599013864994049, "num_tokens": 15617.0, "step": 20 }, { "entropy": 2.3853049874305725, "epoch": 0.5419354838709678, "grad_norm": 1.350138545036316, "learning_rate": 9.988952191691925e-05, "loss": 2.251, "mean_token_accuracy": 0.5332682132720947, "num_tokens": 16449.0, "step": 21 }, { "entropy": 2.1798684000968933, "epoch": 0.567741935483871, "grad_norm": 1.3853743076324463, "learning_rate": 9.986018985905901e-05, "loss": 1.9656, "mean_token_accuracy": 0.5732992142438889, "num_tokens": 17216.0, "step": 22 }, { "entropy": 2.2904029488563538, "epoch": 0.5935483870967742, "grad_norm": 2.5513713359832764, "learning_rate": 9.982741376606078e-05, "loss": 2.1948, "mean_token_accuracy": 0.5600379034876823, "num_tokens": 17868.0, "step": 23 }, { "entropy": 1.961841881275177, "epoch": 0.6193548387096774, "grad_norm": 1.9767720699310303, "learning_rate": 9.97911959018954e-05, "loss": 1.9528, "mean_token_accuracy": 0.5889081507921219, "num_tokens": 18439.0, "step": 24 }, { "entropy": 2.061126083135605, "epoch": 0.6451612903225806, "grad_norm": 1.8903456926345825, "learning_rate": 9.975153876827008e-05, "loss": 1.9973, "mean_token_accuracy": 0.5782413184642792, "num_tokens": 18947.0, "step": 25 }, { "entropy": 1.953830897808075, "epoch": 0.6709677419354839, "grad_norm": 2.247823715209961, "learning_rate": 9.97084451044556e-05, "loss": 1.8999, "mean_token_accuracy": 0.5786410048604012, "num_tokens": 19410.0, "step": 26 }, { "entropy": 1.8129592537879944, "epoch": 0.6967741935483871, "grad_norm": 2.3078598976135254, "learning_rate": 9.966191788709716e-05, "loss": 1.6035, "mean_token_accuracy": 0.6230615079402924, "num_tokens": 19831.0, "step": 27 }, { "entropy": 1.9399387836456299, "epoch": 0.7225806451612903, "grad_norm": 1.3792117834091187, "learning_rate": 9.961196033000861e-05, "loss": 1.9753, "mean_token_accuracy": 0.5892214328050613, "num_tokens": 20970.0, "step": 28 }, { "entropy": 1.8130147755146027, "epoch": 0.7483870967741936, "grad_norm": 1.5490132570266724, "learning_rate": 9.955857588395065e-05, "loss": 1.7023, "mean_token_accuracy": 0.6110316589474678, "num_tokens": 21755.0, "step": 29 }, { "entropy": 2.077410489320755, "epoch": 0.7741935483870968, "grad_norm": 1.8052752017974854, "learning_rate": 9.950176823639233e-05, "loss": 1.9752, "mean_token_accuracy": 0.6064967960119247, "num_tokens": 22504.0, "step": 30 }, { "entropy": 1.9274516999721527, "epoch": 0.8, "grad_norm": 1.9139018058776855, "learning_rate": 9.944154131125642e-05, "loss": 2.0548, "mean_token_accuracy": 0.5636427998542786, "num_tokens": 23183.0, "step": 31 }, { "entropy": 1.9550862610340118, "epoch": 0.8258064516129032, "grad_norm": 1.9849357604980469, "learning_rate": 9.937789926864838e-05, "loss": 1.8553, "mean_token_accuracy": 0.5807601362466812, "num_tokens": 23774.0, "step": 32 }, { "entropy": 1.8781771957874298, "epoch": 0.8516129032258064, "grad_norm": 2.0134923458099365, "learning_rate": 9.931084650456892e-05, "loss": 1.7917, "mean_token_accuracy": 0.6070037335157394, "num_tokens": 24313.0, "step": 33 }, { "entropy": 1.897193729877472, "epoch": 0.8774193548387097, "grad_norm": 2.607464551925659, "learning_rate": 9.924038765061042e-05, "loss": 1.7723, "mean_token_accuracy": 0.6191761344671249, "num_tokens": 24779.0, "step": 34 }, { "entropy": 1.7519680559635162, "epoch": 0.9032258064516129, "grad_norm": 2.4835267066955566, "learning_rate": 9.916652757363698e-05, "loss": 1.5883, "mean_token_accuracy": 0.6609883904457092, "num_tokens": 25211.0, "step": 35 }, { "entropy": 1.9292193055152893, "epoch": 0.9290322580645162, "grad_norm": 2.3735604286193848, "learning_rate": 9.90892713754483e-05, "loss": 1.8049, "mean_token_accuracy": 0.5980570763349533, "num_tokens": 25599.0, "step": 36 }, { "entropy": 1.881245195865631, "epoch": 0.9548387096774194, "grad_norm": 1.9849742650985718, "learning_rate": 9.900862439242719e-05, "loss": 1.7902, "mean_token_accuracy": 0.5820632129907608, "num_tokens": 26408.0, "step": 37 }, { "entropy": 2.113930821418762, "epoch": 0.9806451612903225, "grad_norm": 3.527271270751953, "learning_rate": 9.892459219517108e-05, "loss": 2.2025, "mean_token_accuracy": 0.5260728523135185, "num_tokens": 27021.0, "step": 38 }, { "entropy": 1.7831549247105916, "epoch": 1.0, "grad_norm": 2.5327165126800537, "learning_rate": 9.883718058810707e-05, "loss": 1.4478, "mean_token_accuracy": 0.6935366789499918, "num_tokens": 27353.0, "step": 39 }, { "entropy": 1.797234058380127, "epoch": 1.0258064516129033, "grad_norm": 1.3197723627090454, "learning_rate": 9.874639560909117e-05, "loss": 1.8934, "mean_token_accuracy": 0.5857948064804077, "num_tokens": 28829.0, "step": 40 }, { "entropy": 1.9161739647388458, "epoch": 1.0516129032258064, "grad_norm": 1.5616050958633423, "learning_rate": 9.865224352899119e-05, "loss": 1.7257, "mean_token_accuracy": 0.6109496206045151, "num_tokens": 29650.0, "step": 41 }, { "entropy": 1.8019072711467743, "epoch": 1.0774193548387097, "grad_norm": 1.8876160383224487, "learning_rate": 9.85547308512535e-05, "loss": 1.8085, "mean_token_accuracy": 0.5969990640878677, "num_tokens": 30359.0, "step": 42 }, { "entropy": 1.7833741307258606, "epoch": 1.103225806451613, "grad_norm": 2.0070252418518066, "learning_rate": 9.84538643114539e-05, "loss": 1.6704, "mean_token_accuracy": 0.5969647467136383, "num_tokens": 30961.0, "step": 43 }, { "entropy": 1.7372365295886993, "epoch": 1.129032258064516, "grad_norm": 1.8577375411987305, "learning_rate": 9.834965087683236e-05, "loss": 1.6159, "mean_token_accuracy": 0.6475881487131119, "num_tokens": 31527.0, "step": 44 }, { "entropy": 1.636292964220047, "epoch": 1.1548387096774193, "grad_norm": 1.8432772159576416, "learning_rate": 9.824209774581174e-05, "loss": 1.5197, "mean_token_accuracy": 0.6530560553073883, "num_tokens": 32050.0, "step": 45 }, { "entropy": 1.6075344681739807, "epoch": 1.1806451612903226, "grad_norm": 1.869754672050476, "learning_rate": 9.81312123475006e-05, "loss": 1.3557, "mean_token_accuracy": 0.6470372080802917, "num_tokens": 32532.0, "step": 46 }, { "entropy": 1.6146334111690521, "epoch": 1.206451612903226, "grad_norm": 2.099989175796509, "learning_rate": 9.801700234117999e-05, "loss": 1.2998, "mean_token_accuracy": 0.6936827301979065, "num_tokens": 32967.0, "step": 47 }, { "entropy": 1.7050741314888, "epoch": 1.232258064516129, "grad_norm": 2.504159688949585, "learning_rate": 9.789947561577445e-05, "loss": 1.5017, "mean_token_accuracy": 0.622559979557991, "num_tokens": 33363.0, "step": 48 }, { "entropy": 1.6869353950023651, "epoch": 1.2580645161290323, "grad_norm": 1.2886877059936523, "learning_rate": 9.777864028930705e-05, "loss": 1.6731, "mean_token_accuracy": 0.6039082556962967, "num_tokens": 35015.0, "step": 49 }, { "entropy": 1.6093480288982391, "epoch": 1.2838709677419355, "grad_norm": 1.6378092765808105, "learning_rate": 9.765450470833865e-05, "loss": 1.4894, "mean_token_accuracy": 0.6367563456296921, "num_tokens": 35999.0, "step": 50 }, { "entropy": 1.6687067151069641, "epoch": 1.3096774193548386, "grad_norm": 1.8195027112960815, "learning_rate": 9.752707744739145e-05, "loss": 1.5385, "mean_token_accuracy": 0.6437539905309677, "num_tokens": 36850.0, "step": 51 }, { "entropy": 1.4987359642982483, "epoch": 1.335483870967742, "grad_norm": 1.8060271739959717, "learning_rate": 9.73963673083566e-05, "loss": 1.3978, "mean_token_accuracy": 0.661731407046318, "num_tokens": 37604.0, "step": 52 }, { "entropy": 1.5831853449344635, "epoch": 1.3612903225806452, "grad_norm": 2.213078260421753, "learning_rate": 9.726238331988624e-05, "loss": 1.7863, "mean_token_accuracy": 0.6147271245718002, "num_tokens": 38314.0, "step": 53 }, { "entropy": 1.5708496272563934, "epoch": 1.3870967741935485, "grad_norm": 3.098945140838623, "learning_rate": 9.712513473676996e-05, "loss": 1.6752, "mean_token_accuracy": 0.6371889561414719, "num_tokens": 38941.0, "step": 54 }, { "entropy": 1.4293319284915924, "epoch": 1.4129032258064516, "grad_norm": 2.6225318908691406, "learning_rate": 9.698463103929542e-05, "loss": 1.5132, "mean_token_accuracy": 0.6733423620462418, "num_tokens": 39485.0, "step": 55 }, { "entropy": 1.4221723973751068, "epoch": 1.4387096774193548, "grad_norm": 2.834839105606079, "learning_rate": 9.684088193259355e-05, "loss": 1.4956, "mean_token_accuracy": 0.6675658673048019, "num_tokens": 39954.0, "step": 56 }, { "entropy": 1.3391860723495483, "epoch": 1.4645161290322581, "grad_norm": 2.185546398162842, "learning_rate": 9.669389734596819e-05, "loss": 1.1981, "mean_token_accuracy": 0.7050470858812332, "num_tokens": 40374.0, "step": 57 }, { "entropy": 1.6070669293403625, "epoch": 1.4903225806451612, "grad_norm": 1.3461191654205322, "learning_rate": 9.654368743221022e-05, "loss": 1.6617, "mean_token_accuracy": 0.5980251729488373, "num_tokens": 42027.0, "step": 58 }, { "entropy": 1.6520465910434723, "epoch": 1.5161290322580645, "grad_norm": 1.6961472034454346, "learning_rate": 9.639026256689628e-05, "loss": 1.577, "mean_token_accuracy": 0.6316726058721542, "num_tokens": 42916.0, "step": 59 }, { "entropy": 1.7670880556106567, "epoch": 1.5419354838709678, "grad_norm": 2.0527658462524414, "learning_rate": 9.623363334767208e-05, "loss": 1.7517, "mean_token_accuracy": 0.6005731225013733, "num_tokens": 43719.0, "step": 60 }, { "entropy": 1.5744120478630066, "epoch": 1.567741935483871, "grad_norm": 2.1162519454956055, "learning_rate": 9.607381059352038e-05, "loss": 1.5544, "mean_token_accuracy": 0.6523573398590088, "num_tokens": 44493.0, "step": 61 }, { "entropy": 1.728984385728836, "epoch": 1.5935483870967742, "grad_norm": 2.0401268005371094, "learning_rate": 9.591080534401371e-05, "loss": 1.699, "mean_token_accuracy": 0.6030448973178864, "num_tokens": 45170.0, "step": 62 }, { "entropy": 1.5222464203834534, "epoch": 1.6193548387096774, "grad_norm": 2.430859327316284, "learning_rate": 9.574462885855174e-05, "loss": 1.2944, "mean_token_accuracy": 0.6946325898170471, "num_tokens": 45755.0, "step": 63 }, { "entropy": 1.528793841600418, "epoch": 1.6451612903225805, "grad_norm": 2.3277854919433594, "learning_rate": 9.557529261558367e-05, "loss": 1.3969, "mean_token_accuracy": 0.6722464263439178, "num_tokens": 46268.0, "step": 64 }, { "entropy": 1.6062091886997223, "epoch": 1.6709677419354838, "grad_norm": 2.8640811443328857, "learning_rate": 9.540280831181525e-05, "loss": 1.3636, "mean_token_accuracy": 0.6864263862371445, "num_tokens": 46737.0, "step": 65 }, { "entropy": 1.336740493774414, "epoch": 1.696774193548387, "grad_norm": 2.5550613403320312, "learning_rate": 9.522718786140097e-05, "loss": 1.0106, "mean_token_accuracy": 0.7365925908088684, "num_tokens": 47163.0, "step": 66 }, { "entropy": 1.789841502904892, "epoch": 1.7225806451612904, "grad_norm": 1.9967743158340454, "learning_rate": 9.504844339512095e-05, "loss": 1.715, "mean_token_accuracy": 0.614040270447731, "num_tokens": 48108.0, "step": 67 }, { "entropy": 1.5481957495212555, "epoch": 1.7483870967741937, "grad_norm": 1.912815809249878, "learning_rate": 9.486658725954321e-05, "loss": 1.3063, "mean_token_accuracy": 0.6685247123241425, "num_tokens": 48901.0, "step": 68 }, { "entropy": 1.618812471628189, "epoch": 1.7741935483870968, "grad_norm": 2.1326448917388916, "learning_rate": 9.468163201617062e-05, "loss": 1.4826, "mean_token_accuracy": 0.6648016273975372, "num_tokens": 49668.0, "step": 69 }, { "entropy": 1.4738461375236511, "epoch": 1.8, "grad_norm": 2.2856757640838623, "learning_rate": 9.449359044057345e-05, "loss": 1.5099, "mean_token_accuracy": 0.6307590007781982, "num_tokens": 50353.0, "step": 70 }, { "entropy": 1.42239710688591, "epoch": 1.8258064516129031, "grad_norm": 2.272261381149292, "learning_rate": 9.430247552150673e-05, "loss": 1.4451, "mean_token_accuracy": 0.6698804646730423, "num_tokens": 50954.0, "step": 71 }, { "entropy": 1.5603100061416626, "epoch": 1.8516129032258064, "grad_norm": 2.444957971572876, "learning_rate": 9.410830046001321e-05, "loss": 1.5631, "mean_token_accuracy": 0.6537315994501114, "num_tokens": 51493.0, "step": 72 }, { "entropy": 1.421448290348053, "epoch": 1.8774193548387097, "grad_norm": 2.62430477142334, "learning_rate": 9.391107866851143e-05, "loss": 1.442, "mean_token_accuracy": 0.6888918429613113, "num_tokens": 51976.0, "step": 73 }, { "entropy": 1.3042734861373901, "epoch": 1.903225806451613, "grad_norm": 2.522318124771118, "learning_rate": 9.371082376986928e-05, "loss": 1.2438, "mean_token_accuracy": 0.6721822023391724, "num_tokens": 52413.0, "step": 74 }, { "entropy": 1.0973184555768967, "epoch": 1.9290322580645163, "grad_norm": 2.2152483463287354, "learning_rate": 9.350754959646306e-05, "loss": 0.9649, "mean_token_accuracy": 0.7464027404785156, "num_tokens": 52812.0, "step": 75 }, { "entropy": 1.4785442054271698, "epoch": 1.9548387096774194, "grad_norm": 1.778226613998413, "learning_rate": 9.330127018922194e-05, "loss": 1.5472, "mean_token_accuracy": 0.6413073837757111, "num_tokens": 53810.0, "step": 76 }, { "entropy": 1.4850931763648987, "epoch": 1.9806451612903224, "grad_norm": 2.324070453643799, "learning_rate": 9.30919997966582e-05, "loss": 1.4766, "mean_token_accuracy": 0.6507462114095688, "num_tokens": 54370.0, "step": 77 }, { "entropy": 1.5041760206222534, "epoch": 2.0, "grad_norm": 2.711214542388916, "learning_rate": 9.287975287388298e-05, "loss": 1.3224, "mean_token_accuracy": 0.6853142380714417, "num_tokens": 54706.0, "step": 78 }, { "entropy": 1.561076819896698, "epoch": 2.0258064516129033, "grad_norm": 1.4298901557922363, "learning_rate": 9.266454408160779e-05, "loss": 1.5017, "mean_token_accuracy": 0.6616432368755341, "num_tokens": 56147.0, "step": 79 }, { "entropy": 1.4342933893203735, "epoch": 2.0516129032258066, "grad_norm": 1.9477201700210571, "learning_rate": 9.244638828513187e-05, "loss": 1.0989, "mean_token_accuracy": 0.7380426079034805, "num_tokens": 56998.0, "step": 80 }, { "entropy": 1.3799369037151337, "epoch": 2.07741935483871, "grad_norm": 1.899839162826538, "learning_rate": 9.22253005533154e-05, "loss": 1.0685, "mean_token_accuracy": 0.7503155916929245, "num_tokens": 57799.0, "step": 81 }, { "entropy": 1.2785212695598602, "epoch": 2.1032258064516127, "grad_norm": 2.1526200771331787, "learning_rate": 9.200129615753859e-05, "loss": 1.0346, "mean_token_accuracy": 0.7295394539833069, "num_tokens": 58548.0, "step": 82 }, { "entropy": 1.1957830488681793, "epoch": 2.129032258064516, "grad_norm": 2.5215909481048584, "learning_rate": 9.177439057064683e-05, "loss": 1.0066, "mean_token_accuracy": 0.7433657646179199, "num_tokens": 59174.0, "step": 83 }, { "entropy": 1.3421072363853455, "epoch": 2.1548387096774193, "grad_norm": 2.606336832046509, "learning_rate": 9.154459946588198e-05, "loss": 1.1666, "mean_token_accuracy": 0.7091180384159088, "num_tokens": 59769.0, "step": 84 }, { "entropy": 1.032430723309517, "epoch": 2.1806451612903226, "grad_norm": 2.835961103439331, "learning_rate": 9.131193871579975e-05, "loss": 0.9103, "mean_token_accuracy": 0.7784561067819595, "num_tokens": 60295.0, "step": 85 }, { "entropy": 1.0069421231746674, "epoch": 2.206451612903226, "grad_norm": 3.632134437561035, "learning_rate": 9.107642439117321e-05, "loss": 0.7677, "mean_token_accuracy": 0.7896548062562943, "num_tokens": 60744.0, "step": 86 }, { "entropy": 0.784252293407917, "epoch": 2.232258064516129, "grad_norm": 3.14766526222229, "learning_rate": 9.083807275988284e-05, "loss": 0.6092, "mean_token_accuracy": 0.8186918497085571, "num_tokens": 61151.0, "step": 87 }, { "entropy": 1.1425200402736664, "epoch": 2.258064516129032, "grad_norm": 2.9548776149749756, "learning_rate": 9.059690028579283e-05, "loss": 1.2423, "mean_token_accuracy": 0.67966029047966, "num_tokens": 62417.0, "step": 88 }, { "entropy": 1.075703114271164, "epoch": 2.2838709677419353, "grad_norm": 2.6472651958465576, "learning_rate": 9.035292362761381e-05, "loss": 1.1406, "mean_token_accuracy": 0.7184228450059891, "num_tokens": 63270.0, "step": 89 }, { "entropy": 0.9899384379386902, "epoch": 2.3096774193548386, "grad_norm": 2.6800777912139893, "learning_rate": 9.01061596377522e-05, "loss": 0.9555, "mean_token_accuracy": 0.759021058678627, "num_tokens": 64027.0, "step": 90 }, { "entropy": 1.2101148664951324, "epoch": 2.335483870967742, "grad_norm": 3.1797468662261963, "learning_rate": 8.985662536114613e-05, "loss": 1.2574, "mean_token_accuracy": 0.707681193947792, "num_tokens": 64701.0, "step": 91 }, { "entropy": 0.9667136818170547, "epoch": 2.361290322580645, "grad_norm": 2.6233391761779785, "learning_rate": 8.960433803408813e-05, "loss": 0.7913, "mean_token_accuracy": 0.7882635146379471, "num_tokens": 65308.0, "step": 92 }, { "entropy": 0.9306632727384567, "epoch": 2.3870967741935485, "grad_norm": 2.395880699157715, "learning_rate": 8.934931508303445e-05, "loss": 0.7301, "mean_token_accuracy": 0.7955707758665085, "num_tokens": 65878.0, "step": 93 }, { "entropy": 1.0530627965927124, "epoch": 2.412903225806452, "grad_norm": 2.9347379207611084, "learning_rate": 8.90915741234015e-05, "loss": 0.8363, "mean_token_accuracy": 0.775736004114151, "num_tokens": 66364.0, "step": 94 }, { "entropy": 1.0531336814165115, "epoch": 2.4387096774193546, "grad_norm": 3.1018309593200684, "learning_rate": 8.883113295834892e-05, "loss": 0.8268, "mean_token_accuracy": 0.7704032361507416, "num_tokens": 66820.0, "step": 95 }, { "entropy": 1.0696537494659424, "epoch": 2.464516129032258, "grad_norm": 3.423306941986084, "learning_rate": 8.856800957755e-05, "loss": 0.7847, "mean_token_accuracy": 0.7773692905902863, "num_tokens": 67214.0, "step": 96 }, { "entropy": 1.306801289319992, "epoch": 2.490322580645161, "grad_norm": 1.6437768936157227, "learning_rate": 8.83022221559489e-05, "loss": 1.2357, "mean_token_accuracy": 0.669854074716568, "num_tokens": 68733.0, "step": 97 }, { "entropy": 1.1772551238536835, "epoch": 2.5161290322580645, "grad_norm": 2.4962806701660156, "learning_rate": 8.803378905250544e-05, "loss": 1.0752, "mean_token_accuracy": 0.711113303899765, "num_tokens": 69580.0, "step": 98 }, { "entropy": 1.166929692029953, "epoch": 2.541935483870968, "grad_norm": 2.8279449939727783, "learning_rate": 8.776272880892675e-05, "loss": 1.0135, "mean_token_accuracy": 0.7302903383970261, "num_tokens": 70359.0, "step": 99 }, { "entropy": 1.2368881702423096, "epoch": 2.567741935483871, "grad_norm": 2.812784194946289, "learning_rate": 8.748906014838672e-05, "loss": 1.0997, "mean_token_accuracy": 0.7428575754165649, "num_tokens": 71051.0, "step": 100 }, { "entropy": 1.0734427571296692, "epoch": 2.5935483870967744, "grad_norm": 3.168055772781372, "learning_rate": 8.721280197423258e-05, "loss": 0.9557, "mean_token_accuracy": 0.7500255256891251, "num_tokens": 71653.0, "step": 101 }, { "entropy": 1.0182117372751236, "epoch": 2.6193548387096772, "grad_norm": 2.928173065185547, "learning_rate": 8.69339733686793e-05, "loss": 0.7934, "mean_token_accuracy": 0.7967472970485687, "num_tokens": 72206.0, "step": 102 }, { "entropy": 0.9825232028961182, "epoch": 2.6451612903225805, "grad_norm": 3.5911121368408203, "learning_rate": 8.665259359149132e-05, "loss": 0.7435, "mean_token_accuracy": 0.7856406420469284, "num_tokens": 72709.0, "step": 103 }, { "entropy": 0.8393460661172867, "epoch": 2.670967741935484, "grad_norm": 3.1751551628112793, "learning_rate": 8.636868207865244e-05, "loss": 0.5727, "mean_token_accuracy": 0.8536647707223892, "num_tokens": 73172.0, "step": 104 }, { "entropy": 0.733843207359314, "epoch": 2.696774193548387, "grad_norm": 3.002105951309204, "learning_rate": 8.60822584410231e-05, "loss": 0.4306, "mean_token_accuracy": 0.9011064171791077, "num_tokens": 73601.0, "step": 105 }, { "entropy": 1.09127739071846, "epoch": 2.7225806451612904, "grad_norm": 2.7801899909973145, "learning_rate": 8.579334246298593e-05, "loss": 1.3229, "mean_token_accuracy": 0.6847837716341019, "num_tokens": 75066.0, "step": 106 }, { "entropy": 1.1003702282905579, "epoch": 2.7483870967741937, "grad_norm": 2.8465728759765625, "learning_rate": 8.550195410107902e-05, "loss": 1.026, "mean_token_accuracy": 0.7287466824054718, "num_tokens": 75935.0, "step": 107 }, { "entropy": 1.0054174661636353, "epoch": 2.774193548387097, "grad_norm": 2.6831374168395996, "learning_rate": 8.520811348261759e-05, "loss": 0.8887, "mean_token_accuracy": 0.7784150391817093, "num_tokens": 76730.0, "step": 108 }, { "entropy": 1.1102914214134216, "epoch": 2.8, "grad_norm": 3.408310651779175, "learning_rate": 8.491184090430364e-05, "loss": 1.0831, "mean_token_accuracy": 0.7278113067150116, "num_tokens": 77474.0, "step": 109 }, { "entropy": 0.999423012137413, "epoch": 2.825806451612903, "grad_norm": 3.7338831424713135, "learning_rate": 8.461315683082399e-05, "loss": 1.0257, "mean_token_accuracy": 0.7361829876899719, "num_tokens": 78068.0, "step": 110 }, { "entropy": 0.9764816612005234, "epoch": 2.8516129032258064, "grad_norm": 3.499826192855835, "learning_rate": 8.43120818934367e-05, "loss": 0.8335, "mean_token_accuracy": 0.764112114906311, "num_tokens": 78589.0, "step": 111 }, { "entropy": 0.9866785109043121, "epoch": 2.8774193548387097, "grad_norm": 3.31439471244812, "learning_rate": 8.400863688854597e-05, "loss": 0.9472, "mean_token_accuracy": 0.7592662870883942, "num_tokens": 79080.0, "step": 112 }, { "entropy": 0.8102796524763107, "epoch": 2.903225806451613, "grad_norm": 3.768465757369995, "learning_rate": 8.370284277626577e-05, "loss": 0.6879, "mean_token_accuracy": 0.7918446511030197, "num_tokens": 79518.0, "step": 113 }, { "entropy": 0.7523371577262878, "epoch": 2.9290322580645163, "grad_norm": 3.107103109359741, "learning_rate": 8.339472067897187e-05, "loss": 0.5142, "mean_token_accuracy": 0.8337104171514511, "num_tokens": 79925.0, "step": 114 }, { "entropy": 1.2405670583248138, "epoch": 2.9548387096774196, "grad_norm": 2.0415544509887695, "learning_rate": 8.308429187984297e-05, "loss": 1.2469, "mean_token_accuracy": 0.6947166323661804, "num_tokens": 81111.0, "step": 115 }, { "entropy": 1.0534760355949402, "epoch": 2.9806451612903224, "grad_norm": 3.243969440460205, "learning_rate": 8.27715778213905e-05, "loss": 1.0014, "mean_token_accuracy": 0.752901017665863, "num_tokens": 81717.0, "step": 116 }, { "entropy": 0.8150668541590372, "epoch": 3.0, "grad_norm": 3.7620151042938232, "learning_rate": 8.24566001039776e-05, "loss": 0.6544, "mean_token_accuracy": 0.8201234340667725, "num_tokens": 82059.0, "step": 117 }, { "entropy": 1.2218182981014252, "epoch": 3.0258064516129033, "grad_norm": 2.0384461879730225, "learning_rate": 8.213938048432697e-05, "loss": 0.9903, "mean_token_accuracy": 0.7458517551422119, "num_tokens": 83704.0, "step": 118 }, { "entropy": 1.0654624998569489, "epoch": 3.0516129032258066, "grad_norm": 2.7097387313842773, "learning_rate": 8.181994087401819e-05, "loss": 0.6589, "mean_token_accuracy": 0.8282175809144974, "num_tokens": 84564.0, "step": 119 }, { "entropy": 0.9389624744653702, "epoch": 3.07741935483871, "grad_norm": 3.422351360321045, "learning_rate": 8.149830333797407e-05, "loss": 0.6736, "mean_token_accuracy": 0.8170457482337952, "num_tokens": 85305.0, "step": 120 }, { "entropy": 0.891632542014122, "epoch": 3.1032258064516127, "grad_norm": 2.9999988079071045, "learning_rate": 8.117449009293668e-05, "loss": 0.5435, "mean_token_accuracy": 0.8579341620206833, "num_tokens": 85927.0, "step": 121 }, { "entropy": 0.7080177962779999, "epoch": 3.129032258064516, "grad_norm": 2.7167727947235107, "learning_rate": 8.084852350593264e-05, "loss": 0.386, "mean_token_accuracy": 0.9050543904304504, "num_tokens": 86500.0, "step": 122 }, { "entropy": 0.5361127704381943, "epoch": 3.1548387096774193, "grad_norm": 3.051241874694824, "learning_rate": 8.052042609272817e-05, "loss": 0.314, "mean_token_accuracy": 0.9146886169910431, "num_tokens": 87009.0, "step": 123 }, { "entropy": 0.5324621573090553, "epoch": 3.1806451612903226, "grad_norm": 3.0022952556610107, "learning_rate": 8.019022051627388e-05, "loss": 0.3141, "mean_token_accuracy": 0.9247495979070663, "num_tokens": 87467.0, "step": 124 }, { "entropy": 0.40624529123306274, "epoch": 3.206451612903226, "grad_norm": 3.094412326812744, "learning_rate": 7.985792958513931e-05, "loss": 0.26, "mean_token_accuracy": 0.9299735277891159, "num_tokens": 87885.0, "step": 125 }, { "entropy": 0.3672215938568115, "epoch": 3.232258064516129, "grad_norm": 3.4929354190826416, "learning_rate": 7.952357625193749e-05, "loss": 0.2392, "mean_token_accuracy": 0.9306517392396927, "num_tokens": 88260.0, "step": 126 }, { "entropy": 0.8298548460006714, "epoch": 3.258064516129032, "grad_norm": 2.836134672164917, "learning_rate": 7.91871836117395e-05, "loss": 0.7053, "mean_token_accuracy": 0.8246497809886932, "num_tokens": 89262.0, "step": 127 }, { "entropy": 0.5190232917666435, "epoch": 3.2838709677419353, "grad_norm": 5.216272830963135, "learning_rate": 7.884877490047915e-05, "loss": 0.565, "mean_token_accuracy": 0.8471736311912537, "num_tokens": 90062.0, "step": 128 }, { "entropy": 0.4947461038827896, "epoch": 3.3096774193548386, "grad_norm": 4.143370628356934, "learning_rate": 7.85083734933481e-05, "loss": 0.5013, "mean_token_accuracy": 0.8697308301925659, "num_tokens": 90841.0, "step": 129 }, { "entropy": 0.5702934339642525, "epoch": 3.335483870967742, "grad_norm": 5.3610520362854, "learning_rate": 7.81660029031811e-05, "loss": 0.657, "mean_token_accuracy": 0.8270199149847031, "num_tokens": 91591.0, "step": 130 }, { "entropy": 0.5612503439188004, "epoch": 3.361290322580645, "grad_norm": 4.896009922027588, "learning_rate": 7.782168677883206e-05, "loss": 0.638, "mean_token_accuracy": 0.8336956202983856, "num_tokens": 92304.0, "step": 131 }, { "entropy": 0.47641437500715256, "epoch": 3.3870967741935485, "grad_norm": 5.059084415435791, "learning_rate": 7.74754489035403e-05, "loss": 0.516, "mean_token_accuracy": 0.8493129163980484, "num_tokens": 92920.0, "step": 132 }, { "entropy": 0.5311232656240463, "epoch": 3.412903225806452, "grad_norm": 3.7369489669799805, "learning_rate": 7.712731319328798e-05, "loss": 0.4084, "mean_token_accuracy": 0.8949003219604492, "num_tokens": 93468.0, "step": 133 }, { "entropy": 0.4599653482437134, "epoch": 3.4387096774193546, "grad_norm": 4.457752704620361, "learning_rate": 7.677730369514793e-05, "loss": 0.4303, "mean_token_accuracy": 0.8998099863529205, "num_tokens": 93952.0, "step": 134 }, { "entropy": 0.3341464288532734, "epoch": 3.464516129032258, "grad_norm": 2.74814772605896, "learning_rate": 7.642544458562278e-05, "loss": 0.2045, "mean_token_accuracy": 0.9389902055263519, "num_tokens": 94378.0, "step": 135 }, { "entropy": 0.7704500108957291, "epoch": 3.490322580645161, "grad_norm": 2.1899735927581787, "learning_rate": 7.60717601689749e-05, "loss": 0.7928, "mean_token_accuracy": 0.7940146774053574, "num_tokens": 96188.0, "step": 136 }, { "entropy": 0.8460464626550674, "epoch": 3.5161290322580645, "grad_norm": 2.439542531967163, "learning_rate": 7.571627487554769e-05, "loss": 0.7167, "mean_token_accuracy": 0.7986479252576828, "num_tokens": 97250.0, "step": 137 }, { "entropy": 0.683267816901207, "epoch": 3.541935483870968, "grad_norm": 3.4693028926849365, "learning_rate": 7.535901326007795e-05, "loss": 0.5391, "mean_token_accuracy": 0.8488983660936356, "num_tokens": 98028.0, "step": 138 }, { "entropy": 0.6665534228086472, "epoch": 3.567741935483871, "grad_norm": 3.313450336456299, "learning_rate": 7.500000000000001e-05, "loss": 0.4977, "mean_token_accuracy": 0.8638099581003189, "num_tokens": 98727.0, "step": 139 }, { "entropy": 0.6375805735588074, "epoch": 3.5935483870967744, "grad_norm": 3.621342897415161, "learning_rate": 7.463925989374089e-05, "loss": 0.521, "mean_token_accuracy": 0.8624279350042343, "num_tokens": 99329.0, "step": 140 }, { "entropy": 0.5712595283985138, "epoch": 3.6193548387096772, "grad_norm": 3.667834520339966, "learning_rate": 7.427681785900761e-05, "loss": 0.4579, "mean_token_accuracy": 0.8609372973442078, "num_tokens": 99866.0, "step": 141 }, { "entropy": 0.5664890855550766, "epoch": 3.6451612903225805, "grad_norm": 3.193061113357544, "learning_rate": 7.391269893106592e-05, "loss": 0.3498, "mean_token_accuracy": 0.9016094356775284, "num_tokens": 100358.0, "step": 142 }, { "entropy": 0.4809069186449051, "epoch": 3.670967741935484, "grad_norm": 2.9797909259796143, "learning_rate": 7.354692826101102e-05, "loss": 0.239, "mean_token_accuracy": 0.937361553311348, "num_tokens": 100810.0, "step": 143 }, { "entropy": 0.3825264722108841, "epoch": 3.696774193548387, "grad_norm": 2.5916123390197754, "learning_rate": 7.317953111403029e-05, "loss": 0.2293, "mean_token_accuracy": 0.959057167172432, "num_tokens": 101224.0, "step": 144 }, { "entropy": 1.0315645188093185, "epoch": 3.7225806451612904, "grad_norm": 2.4332456588745117, "learning_rate": 7.281053286765815e-05, "loss": 0.9734, "mean_token_accuracy": 0.7563262432813644, "num_tokens": 102666.0, "step": 145 }, { "entropy": 0.7324022054672241, "epoch": 3.7483870967741937, "grad_norm": 3.319155693054199, "learning_rate": 7.243995901002312e-05, "loss": 0.526, "mean_token_accuracy": 0.862901970744133, "num_tokens": 103560.0, "step": 146 }, { "entropy": 0.7977930456399918, "epoch": 3.774193548387097, "grad_norm": 3.708766460418701, "learning_rate": 7.20678351380872e-05, "loss": 0.5996, "mean_token_accuracy": 0.8376729637384415, "num_tokens": 104386.0, "step": 147 }, { "entropy": 0.67112597823143, "epoch": 3.8, "grad_norm": 3.474480152130127, "learning_rate": 7.169418695587791e-05, "loss": 0.5283, "mean_token_accuracy": 0.8518707603216171, "num_tokens": 105173.0, "step": 148 }, { "entropy": 0.6674353927373886, "epoch": 3.825806451612903, "grad_norm": 4.0479736328125, "learning_rate": 7.13190402727127e-05, "loss": 0.5836, "mean_token_accuracy": 0.8252883553504944, "num_tokens": 105827.0, "step": 149 }, { "entropy": 0.6601278185844421, "epoch": 3.8516129032258064, "grad_norm": 3.1081454753875732, "learning_rate": 7.094242100141625e-05, "loss": 0.4519, "mean_token_accuracy": 0.8595046997070312, "num_tokens": 106405.0, "step": 150 }, { "entropy": 0.397666834294796, "epoch": 3.8774193548387097, "grad_norm": 2.5936572551727295, "learning_rate": 7.056435515653059e-05, "loss": 0.2092, "mean_token_accuracy": 0.9478294253349304, "num_tokens": 106926.0, "step": 151 }, { "entropy": 0.5597369372844696, "epoch": 3.903225806451613, "grad_norm": 4.103569984436035, "learning_rate": 7.018486885251812e-05, "loss": 0.4531, "mean_token_accuracy": 0.8746808618307114, "num_tokens": 107392.0, "step": 152 }, { "entropy": 0.38467343896627426, "epoch": 3.9290322580645163, "grad_norm": 3.1950509548187256, "learning_rate": 6.980398830195785e-05, "loss": 0.212, "mean_token_accuracy": 0.9444408565759659, "num_tokens": 107827.0, "step": 153 }, { "entropy": 0.5698762461543083, "epoch": 3.9548387096774196, "grad_norm": 3.3116562366485596, "learning_rate": 6.942173981373474e-05, "loss": 0.4076, "mean_token_accuracy": 0.8756328076124191, "num_tokens": 108519.0, "step": 154 }, { "entropy": 0.5269991233944893, "epoch": 3.9806451612903224, "grad_norm": 3.074373483657837, "learning_rate": 6.903814979122249e-05, "loss": 0.3577, "mean_token_accuracy": 0.9049306809902191, "num_tokens": 109080.0, "step": 155 }, { "entropy": 0.411786029736201, "epoch": 4.0, "grad_norm": 3.1152896881103516, "learning_rate": 6.86532447304597e-05, "loss": 0.2401, "mean_token_accuracy": 0.9342868526776632, "num_tokens": 109412.0, "step": 156 }, { "entropy": 0.672158882021904, "epoch": 4.025806451612903, "grad_norm": 2.576361894607544, "learning_rate": 6.826705121831976e-05, "loss": 0.5307, "mean_token_accuracy": 0.8603871315717697, "num_tokens": 110911.0, "step": 157 }, { "entropy": 0.5033400803804398, "epoch": 4.051612903225807, "grad_norm": 2.3417139053344727, "learning_rate": 6.78795959306743e-05, "loss": 0.2862, "mean_token_accuracy": 0.9291664808988571, "num_tokens": 111773.0, "step": 158 }, { "entropy": 0.3818225935101509, "epoch": 4.077419354838709, "grad_norm": 2.526963233947754, "learning_rate": 6.749090563055076e-05, "loss": 0.204, "mean_token_accuracy": 0.9366898983716965, "num_tokens": 112552.0, "step": 159 }, { "entropy": 0.5210211500525475, "epoch": 4.103225806451613, "grad_norm": 3.331657648086548, "learning_rate": 6.710100716628344e-05, "loss": 0.3463, "mean_token_accuracy": 0.9079622030258179, "num_tokens": 113279.0, "step": 160 }, { "entropy": 0.4078049287199974, "epoch": 4.129032258064516, "grad_norm": 2.643353223800659, "learning_rate": 6.670992746965938e-05, "loss": 0.2458, "mean_token_accuracy": 0.9378542304039001, "num_tokens": 113927.0, "step": 161 }, { "entropy": 0.2958051636815071, "epoch": 4.15483870967742, "grad_norm": 2.6562397480010986, "learning_rate": 6.63176935540578e-05, "loss": 0.2228, "mean_token_accuracy": 0.9389047920703888, "num_tokens": 114535.0, "step": 162 }, { "entropy": 0.2642120160162449, "epoch": 4.180645161290323, "grad_norm": 3.720411539077759, "learning_rate": 6.592433251258423e-05, "loss": 0.1609, "mean_token_accuracy": 0.9546155333518982, "num_tokens": 115092.0, "step": 163 }, { "entropy": 0.2038814201951027, "epoch": 4.2064516129032254, "grad_norm": 3.742655038833618, "learning_rate": 6.552987151619919e-05, "loss": 0.1438, "mean_token_accuracy": 0.9577045887708664, "num_tokens": 115572.0, "step": 164 }, { "entropy": 0.21509704366326332, "epoch": 4.232258064516129, "grad_norm": 4.123962879180908, "learning_rate": 6.51343378118413e-05, "loss": 0.1326, "mean_token_accuracy": 0.955599308013916, "num_tokens": 116004.0, "step": 165 }, { "entropy": 0.5882035046815872, "epoch": 4.258064516129032, "grad_norm": 2.629396438598633, "learning_rate": 6.473775872054521e-05, "loss": 0.5174, "mean_token_accuracy": 0.855495274066925, "num_tokens": 117713.0, "step": 166 }, { "entropy": 0.4447134956717491, "epoch": 4.283870967741936, "grad_norm": 5.003028869628906, "learning_rate": 6.434016163555452e-05, "loss": 0.4682, "mean_token_accuracy": 0.8714989423751831, "num_tokens": 118624.0, "step": 167 }, { "entropy": 0.3722687065601349, "epoch": 4.309677419354839, "grad_norm": 3.819241762161255, "learning_rate": 6.394157402042951e-05, "loss": 0.3207, "mean_token_accuracy": 0.9076657742261887, "num_tokens": 119441.0, "step": 168 }, { "entropy": 0.2616325728595257, "epoch": 4.335483870967742, "grad_norm": 3.4206392765045166, "learning_rate": 6.354202340715026e-05, "loss": 0.205, "mean_token_accuracy": 0.9454829543828964, "num_tokens": 120187.0, "step": 169 }, { "entropy": 0.3457096070051193, "epoch": 4.361290322580645, "grad_norm": 3.556037425994873, "learning_rate": 6.314153739421476e-05, "loss": 0.2697, "mean_token_accuracy": 0.9172067493200302, "num_tokens": 120838.0, "step": 170 }, { "entropy": 0.2511453256011009, "epoch": 4.387096774193548, "grad_norm": 2.943145751953125, "learning_rate": 6.274014364473274e-05, "loss": 0.1491, "mean_token_accuracy": 0.9682914614677429, "num_tokens": 121408.0, "step": 171 }, { "entropy": 0.23977105692029, "epoch": 4.412903225806452, "grad_norm": 3.426252603530884, "learning_rate": 6.233786988451468e-05, "loss": 0.1645, "mean_token_accuracy": 0.9556652754545212, "num_tokens": 121915.0, "step": 172 }, { "entropy": 0.19089676067233086, "epoch": 4.438709677419355, "grad_norm": 2.1618521213531494, "learning_rate": 6.19347439001569e-05, "loss": 0.1059, "mean_token_accuracy": 0.97336345911026, "num_tokens": 122368.0, "step": 173 }, { "entropy": 0.19364609941840172, "epoch": 4.464516129032258, "grad_norm": 3.3634703159332275, "learning_rate": 6.153079353712201e-05, "loss": 0.1285, "mean_token_accuracy": 0.9543762654066086, "num_tokens": 122767.0, "step": 174 }, { "entropy": 0.6687990427017212, "epoch": 4.490322580645161, "grad_norm": 2.883437395095825, "learning_rate": 6.112604669781572e-05, "loss": 0.5348, "mean_token_accuracy": 0.8638840764760971, "num_tokens": 124288.0, "step": 175 }, { "entropy": 0.472368985414505, "epoch": 4.516129032258064, "grad_norm": 2.9869871139526367, "learning_rate": 6.072053133965938e-05, "loss": 0.2776, "mean_token_accuracy": 0.9314542561769485, "num_tokens": 125161.0, "step": 176 }, { "entropy": 0.4055846929550171, "epoch": 4.541935483870968, "grad_norm": 3.554269552230835, "learning_rate": 6.031427547315889e-05, "loss": 0.3152, "mean_token_accuracy": 0.9113509654998779, "num_tokens": 125955.0, "step": 177 }, { "entropy": 0.3913852721452713, "epoch": 4.567741935483871, "grad_norm": 3.3943800926208496, "learning_rate": 5.9907307159969884e-05, "loss": 0.2882, "mean_token_accuracy": 0.9336675554513931, "num_tokens": 126654.0, "step": 178 }, { "entropy": 0.2266981489956379, "epoch": 4.593548387096774, "grad_norm": 2.6177566051483154, "learning_rate": 5.949965451095951e-05, "loss": 0.1521, "mean_token_accuracy": 0.9607619494199753, "num_tokens": 127200.0, "step": 179 }, { "entropy": 0.2510114349424839, "epoch": 4.619354838709677, "grad_norm": 2.9274792671203613, "learning_rate": 5.9091345684264546e-05, "loss": 0.1527, "mean_token_accuracy": 0.9545964151620865, "num_tokens": 127710.0, "step": 180 }, { "entropy": 0.27408041059970856, "epoch": 4.645161290322581, "grad_norm": 3.970353841781616, "learning_rate": 5.868240888334653e-05, "loss": 0.2088, "mean_token_accuracy": 0.9431939721107483, "num_tokens": 128171.0, "step": 181 }, { "entropy": 0.21555104106664658, "epoch": 4.670967741935484, "grad_norm": 2.1485326290130615, "learning_rate": 5.827287235504356e-05, "loss": 0.1231, "mean_token_accuracy": 0.9743186682462692, "num_tokens": 128603.0, "step": 182 }, { "entropy": 0.1890631914138794, "epoch": 4.6967741935483875, "grad_norm": 3.0446012020111084, "learning_rate": 5.786276438761927e-05, "loss": 0.166, "mean_token_accuracy": 0.9585428386926651, "num_tokens": 129018.0, "step": 183 }, { "entropy": 0.4911561757326126, "epoch": 4.72258064516129, "grad_norm": 2.324612617492676, "learning_rate": 5.745211330880872e-05, "loss": 0.3596, "mean_token_accuracy": 0.9241899400949478, "num_tokens": 130189.0, "step": 184 }, { "entropy": 0.3451598323881626, "epoch": 4.748387096774193, "grad_norm": 3.1134896278381348, "learning_rate": 5.704094748386184e-05, "loss": 0.2163, "mean_token_accuracy": 0.9265208840370178, "num_tokens": 130996.0, "step": 185 }, { "entropy": 0.39382658153772354, "epoch": 4.774193548387097, "grad_norm": 3.3759310245513916, "learning_rate": 5.6629295313583974e-05, "loss": 0.266, "mean_token_accuracy": 0.923931747674942, "num_tokens": 131734.0, "step": 186 }, { "entropy": 0.362373985350132, "epoch": 4.8, "grad_norm": 3.549544095993042, "learning_rate": 5.621718523237427e-05, "loss": 0.2415, "mean_token_accuracy": 0.9290976673364639, "num_tokens": 132406.0, "step": 187 }, { "entropy": 0.33830052614212036, "epoch": 4.825806451612904, "grad_norm": 2.8866331577301025, "learning_rate": 5.5804645706261514e-05, "loss": 0.2333, "mean_token_accuracy": 0.93567855656147, "num_tokens": 133001.0, "step": 188 }, { "entropy": 0.2700263783335686, "epoch": 4.851612903225806, "grad_norm": 2.9685375690460205, "learning_rate": 5.539170523093794e-05, "loss": 0.1737, "mean_token_accuracy": 0.9484844356775284, "num_tokens": 133568.0, "step": 189 }, { "entropy": 0.2686317004263401, "epoch": 4.877419354838709, "grad_norm": 2.7458479404449463, "learning_rate": 5.497839232979084e-05, "loss": 0.1727, "mean_token_accuracy": 0.9658856242895126, "num_tokens": 134062.0, "step": 190 }, { "entropy": 0.2341674156486988, "epoch": 4.903225806451613, "grad_norm": 2.944103956222534, "learning_rate": 5.456473555193242e-05, "loss": 0.1788, "mean_token_accuracy": 0.9528596550226212, "num_tokens": 134514.0, "step": 191 }, { "entropy": 0.22099602594971657, "epoch": 4.929032258064516, "grad_norm": 3.862736940383911, "learning_rate": 5.415076347022776e-05, "loss": 0.1657, "mean_token_accuracy": 0.9679511785507202, "num_tokens": 134923.0, "step": 192 }, { "entropy": 0.5313196182250977, "epoch": 4.95483870967742, "grad_norm": 3.1668918132781982, "learning_rate": 5.373650467932122e-05, "loss": 0.5281, "mean_token_accuracy": 0.8866761773824692, "num_tokens": 135869.0, "step": 193 }, { "entropy": 0.2688843570649624, "epoch": 4.980645161290322, "grad_norm": 2.9400172233581543, "learning_rate": 5.332198779366122e-05, "loss": 0.1822, "mean_token_accuracy": 0.9536565244197845, "num_tokens": 136435.0, "step": 194 }, { "entropy": 0.34634942809740704, "epoch": 5.0, "grad_norm": 4.880941867828369, "learning_rate": 5.290724144552379e-05, "loss": 0.2718, "mean_token_accuracy": 0.9203394254048666, "num_tokens": 136765.0, "step": 195 }, { "entropy": 0.5787394121289253, "epoch": 5.025806451612903, "grad_norm": 2.429058313369751, "learning_rate": 5.249229428303486e-05, "loss": 0.3105, "mean_token_accuracy": 0.9199163019657135, "num_tokens": 138102.0, "step": 196 }, { "entropy": 0.3213765248656273, "epoch": 5.051612903225807, "grad_norm": 2.9777679443359375, "learning_rate": 5.2077174968191346e-05, "loss": 0.1813, "mean_token_accuracy": 0.9481654316186905, "num_tokens": 138950.0, "step": 197 }, { "entropy": 0.2601848617196083, "epoch": 5.077419354838709, "grad_norm": 2.173152446746826, "learning_rate": 5.166191217488133e-05, "loss": 0.1352, "mean_token_accuracy": 0.9740329831838608, "num_tokens": 139722.0, "step": 198 }, { "entropy": 0.27228355780243874, "epoch": 5.103225806451613, "grad_norm": 2.206040859222412, "learning_rate": 5.124653458690365e-05, "loss": 0.1203, "mean_token_accuracy": 0.9656965136528015, "num_tokens": 140396.0, "step": 199 }, { "entropy": 0.17130273580551147, "epoch": 5.129032258064516, "grad_norm": 2.000005006790161, "learning_rate": 5.083107089598632e-05, "loss": 0.0938, "mean_token_accuracy": 0.9830586761236191, "num_tokens": 140987.0, "step": 200 }, { "entropy": 0.19337046518921852, "epoch": 5.15483870967742, "grad_norm": 2.180755376815796, "learning_rate": 5.041554979980486e-05, "loss": 0.092, "mean_token_accuracy": 0.9733314365148544, "num_tokens": 141517.0, "step": 201 }, { "entropy": 0.16925612837076187, "epoch": 5.180645161290323, "grad_norm": 1.6496930122375488, "learning_rate": 5e-05, "loss": 0.0819, "mean_token_accuracy": 0.9781141579151154, "num_tokens": 142025.0, "step": 202 }, { "entropy": 0.20112577825784683, "epoch": 5.2064516129032254, "grad_norm": 2.5295193195343018, "learning_rate": 4.9584450200195156e-05, "loss": 0.1113, "mean_token_accuracy": 0.972536712884903, "num_tokens": 142501.0, "step": 203 }, { "entropy": 0.12446376867592335, "epoch": 5.232258064516129, "grad_norm": 1.8126459121704102, "learning_rate": 4.9168929104013697e-05, "loss": 0.1119, "mean_token_accuracy": 0.9784018099308014, "num_tokens": 142930.0, "step": 204 }, { "entropy": 0.3357328027486801, "epoch": 5.258064516129032, "grad_norm": 2.69579815864563, "learning_rate": 4.875346541309637e-05, "loss": 0.2933, "mean_token_accuracy": 0.9279916733503342, "num_tokens": 144619.0, "step": 205 }, { "entropy": 0.27347391098737717, "epoch": 5.283870967741936, "grad_norm": 3.0113985538482666, "learning_rate": 4.8338087825118675e-05, "loss": 0.2147, "mean_token_accuracy": 0.9462355375289917, "num_tokens": 145485.0, "step": 206 }, { "entropy": 0.18706193938851357, "epoch": 5.309677419354839, "grad_norm": 2.3350462913513184, "learning_rate": 4.792282503180867e-05, "loss": 0.1089, "mean_token_accuracy": 0.9645346254110336, "num_tokens": 146253.0, "step": 207 }, { "entropy": 0.23134352639317513, "epoch": 5.335483870967742, "grad_norm": 2.53825306892395, "learning_rate": 4.750770571696514e-05, "loss": 0.139, "mean_token_accuracy": 0.9644808024168015, "num_tokens": 146961.0, "step": 208 }, { "entropy": 0.18409648537635803, "epoch": 5.361290322580645, "grad_norm": 3.6751139163970947, "learning_rate": 4.709275855447621e-05, "loss": 0.1271, "mean_token_accuracy": 0.9647018611431122, "num_tokens": 147585.0, "step": 209 }, { "entropy": 0.13805431686341763, "epoch": 5.387096774193548, "grad_norm": 2.252584218978882, "learning_rate": 4.6678012206338793e-05, "loss": 0.11, "mean_token_accuracy": 0.9786661118268967, "num_tokens": 148137.0, "step": 210 }, { "entropy": 0.1293979026377201, "epoch": 5.412903225806452, "grad_norm": 3.228670358657837, "learning_rate": 4.626349532067879e-05, "loss": 0.1009, "mean_token_accuracy": 0.9756647497415543, "num_tokens": 148635.0, "step": 211 }, { "entropy": 0.15355101972818375, "epoch": 5.438709677419355, "grad_norm": 2.5168986320495605, "learning_rate": 4.584923652977224e-05, "loss": 0.0966, "mean_token_accuracy": 0.9696203321218491, "num_tokens": 149098.0, "step": 212 }, { "entropy": 0.12985192984342575, "epoch": 5.464516129032258, "grad_norm": 1.9614430665969849, "learning_rate": 4.543526444806759e-05, "loss": 0.0876, "mean_token_accuracy": 0.9787871986627579, "num_tokens": 149525.0, "step": 213 }, { "entropy": 0.41858533024787903, "epoch": 5.490322580645161, "grad_norm": 2.3210058212280273, "learning_rate": 4.502160767020918e-05, "loss": 0.3106, "mean_token_accuracy": 0.9150111377239227, "num_tokens": 151159.0, "step": 214 }, { "entropy": 0.23978786170482635, "epoch": 5.516129032258064, "grad_norm": 2.6100656986236572, "learning_rate": 4.4608294769062075e-05, "loss": 0.131, "mean_token_accuracy": 0.969085082411766, "num_tokens": 151972.0, "step": 215 }, { "entropy": 0.20062651857733727, "epoch": 5.541935483870968, "grad_norm": 2.6525464057922363, "learning_rate": 4.4195354293738484e-05, "loss": 0.1297, "mean_token_accuracy": 0.9647854268550873, "num_tokens": 152742.0, "step": 216 }, { "entropy": 0.18283047527074814, "epoch": 5.567741935483871, "grad_norm": 1.9218651056289673, "learning_rate": 4.378281476762576e-05, "loss": 0.1113, "mean_token_accuracy": 0.9758298695087433, "num_tokens": 153456.0, "step": 217 }, { "entropy": 0.17359177768230438, "epoch": 5.593548387096774, "grad_norm": 2.074409008026123, "learning_rate": 4.337070468641604e-05, "loss": 0.1127, "mean_token_accuracy": 0.9679757952690125, "num_tokens": 154114.0, "step": 218 }, { "entropy": 0.15452994219958782, "epoch": 5.619354838709677, "grad_norm": 1.4686728715896606, "learning_rate": 4.295905251613817e-05, "loss": 0.083, "mean_token_accuracy": 0.9716224670410156, "num_tokens": 154710.0, "step": 219 }, { "entropy": 0.1461981236934662, "epoch": 5.645161290322581, "grad_norm": 2.090766191482544, "learning_rate": 4.254788669119127e-05, "loss": 0.0915, "mean_token_accuracy": 0.9731487780809402, "num_tokens": 155272.0, "step": 220 }, { "entropy": 0.14948130398988724, "epoch": 5.670967741935484, "grad_norm": 2.874465227127075, "learning_rate": 4.213723561238074e-05, "loss": 0.1213, "mean_token_accuracy": 0.9657130539417267, "num_tokens": 155765.0, "step": 221 }, { "entropy": 0.147341663017869, "epoch": 5.6967741935483875, "grad_norm": 2.8784825801849365, "learning_rate": 4.172712764495644e-05, "loss": 0.1131, "mean_token_accuracy": 0.9677340090274811, "num_tokens": 156170.0, "step": 222 }, { "entropy": 0.37899941951036453, "epoch": 5.72258064516129, "grad_norm": 2.1102116107940674, "learning_rate": 4.131759111665349e-05, "loss": 0.2919, "mean_token_accuracy": 0.9289288818836212, "num_tokens": 157544.0, "step": 223 }, { "entropy": 0.1955309621989727, "epoch": 5.748387096774193, "grad_norm": 2.2968599796295166, "learning_rate": 4.0908654315735466e-05, "loss": 0.1214, "mean_token_accuracy": 0.9681131392717361, "num_tokens": 158450.0, "step": 224 }, { "entropy": 0.19111444801092148, "epoch": 5.774193548387097, "grad_norm": 2.6387436389923096, "learning_rate": 4.0500345489040515e-05, "loss": 0.1412, "mean_token_accuracy": 0.9579745233058929, "num_tokens": 159264.0, "step": 225 }, { "entropy": 0.1776861809194088, "epoch": 5.8, "grad_norm": 2.6175966262817383, "learning_rate": 4.0092692840030134e-05, "loss": 0.1223, "mean_token_accuracy": 0.9692755341529846, "num_tokens": 159933.0, "step": 226 }, { "entropy": 0.15603690408170223, "epoch": 5.825806451612904, "grad_norm": 2.4090588092803955, "learning_rate": 3.968572452684113e-05, "loss": 0.1004, "mean_token_accuracy": 0.9694436490535736, "num_tokens": 160526.0, "step": 227 }, { "entropy": 0.14687431044876575, "epoch": 5.851612903225806, "grad_norm": 2.5552449226379395, "learning_rate": 3.9279468660340626e-05, "loss": 0.1015, "mean_token_accuracy": 0.9700941145420074, "num_tokens": 161001.0, "step": 228 }, { "entropy": 0.14584726840257645, "epoch": 5.877419354838709, "grad_norm": 2.417149782180786, "learning_rate": 3.887395330218429e-05, "loss": 0.1257, "mean_token_accuracy": 0.969669446349144, "num_tokens": 161434.0, "step": 229 }, { "entropy": 0.12778180465102196, "epoch": 5.903225806451613, "grad_norm": 1.2179059982299805, "learning_rate": 3.846920646287799e-05, "loss": 0.0758, "mean_token_accuracy": 0.9738518297672272, "num_tokens": 161858.0, "step": 230 }, { "entropy": 0.1522289477288723, "epoch": 5.929032258064516, "grad_norm": 2.0130059719085693, "learning_rate": 3.806525609984312e-05, "loss": 0.1062, "mean_token_accuracy": 0.9636791348457336, "num_tokens": 162250.0, "step": 231 }, { "entropy": 0.23178323358297348, "epoch": 5.95483870967742, "grad_norm": 2.4759209156036377, "learning_rate": 3.7662130115485314e-05, "loss": 0.1228, "mean_token_accuracy": 0.9636365175247192, "num_tokens": 163108.0, "step": 232 }, { "entropy": 0.16584154963493347, "epoch": 5.980645161290322, "grad_norm": 2.447923421859741, "learning_rate": 3.7259856355267273e-05, "loss": 0.1304, "mean_token_accuracy": 0.9603947103023529, "num_tokens": 163768.0, "step": 233 }, { "entropy": 0.12077461183071136, "epoch": 6.0, "grad_norm": 3.7384884357452393, "learning_rate": 3.685846260578524e-05, "loss": 0.0966, "mean_token_accuracy": 0.9680581092834473, "num_tokens": 164118.0, "step": 234 } ], "logging_steps": 1, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7434558634475520.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }