{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 534, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5478973388671875, "epoch": 0.011235955056179775, "grad_norm": 383.7328462293848, "learning_rate": 0.0, "loss": 8.3388, "mean_token_accuracy": 0.0, "num_tokens": 844265.0, "step": 1 }, { "entropy": 0.5487899780273438, "epoch": 0.02247191011235955, "grad_norm": 382.2250379061165, "learning_rate": 1.8518518518518518e-07, "loss": 8.3331, "mean_token_accuracy": 0.0, "num_tokens": 1688860.0, "step": 2 }, { "entropy": 0.5494003295898438, "epoch": 0.033707865168539325, "grad_norm": 385.6801519392013, "learning_rate": 3.7037037037037036e-07, "loss": 8.2895, "mean_token_accuracy": 0.0, "num_tokens": 2512910.0, "step": 3 }, { "entropy": 0.5540695190429688, "epoch": 0.0449438202247191, "grad_norm": 387.5163435160337, "learning_rate": 5.555555555555555e-07, "loss": 8.2596, "mean_token_accuracy": 0.0, "num_tokens": 3345813.0, "step": 4 }, { "entropy": 0.5646514892578125, "epoch": 0.056179775280898875, "grad_norm": 390.56658814859105, "learning_rate": 7.407407407407407e-07, "loss": 8.1342, "mean_token_accuracy": 0.0, "num_tokens": 4158244.0, "step": 5 }, { "entropy": 0.56011962890625, "epoch": 0.06741573033707865, "grad_norm": 396.82031188523996, "learning_rate": 9.259259259259259e-07, "loss": 8.0144, "mean_token_accuracy": 0.0, "num_tokens": 4967109.0, "step": 6 }, { "entropy": 0.5560760498046875, "epoch": 0.07865168539325842, "grad_norm": 399.44225638760815, "learning_rate": 1.111111111111111e-06, "loss": 7.4644, "mean_token_accuracy": 0.0, "num_tokens": 5797482.0, "step": 7 }, { "entropy": 0.5502700805664062, "epoch": 0.0898876404494382, "grad_norm": 271.48936847645507, "learning_rate": 1.2962962962962962e-06, "loss": 5.8786, "mean_token_accuracy": 0.0026041667442768812, "num_tokens": 6640065.0, "step": 8 }, { "entropy": 0.54205322265625, "epoch": 0.10112359550561797, "grad_norm": 230.51967558204245, "learning_rate": 1.4814814814814815e-06, "loss": 5.5918, "mean_token_accuracy": 0.006510416860692203, "num_tokens": 7494647.0, "step": 9 }, { "entropy": 0.553466796875, "epoch": 0.11235955056179775, "grad_norm": 186.8557668882384, "learning_rate": 1.6666666666666667e-06, "loss": 5.264, "mean_token_accuracy": 0.01953125058207661, "num_tokens": 8336619.0, "step": 10 }, { "entropy": 0.5583953857421875, "epoch": 0.12359550561797752, "grad_norm": 102.72564448300426, "learning_rate": 1.8518518518518519e-06, "loss": 4.112, "mean_token_accuracy": 0.5247395989717916, "num_tokens": 9153404.0, "step": 11 }, { "entropy": 0.550445556640625, "epoch": 0.1348314606741573, "grad_norm": 96.89583143635592, "learning_rate": 2.037037037037037e-06, "loss": 4.0343, "mean_token_accuracy": 0.5078125151339918, "num_tokens": 10007098.0, "step": 12 }, { "entropy": 0.5568389892578125, "epoch": 0.14606741573033707, "grad_norm": 82.87420696019375, "learning_rate": 2.222222222222222e-06, "loss": 3.8298, "mean_token_accuracy": 0.5117187652504072, "num_tokens": 10832783.0, "step": 13 }, { "entropy": 0.5591354370117188, "epoch": 0.15730337078651685, "grad_norm": 74.53414115193272, "learning_rate": 2.4074074074074075e-06, "loss": 3.7077, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 11666567.0, "step": 14 }, { "entropy": 0.5399703979492188, "epoch": 0.16853932584269662, "grad_norm": 59.49474774838589, "learning_rate": 2.5925925925925925e-06, "loss": 3.2713, "mean_token_accuracy": 0.4973958481568843, "num_tokens": 12505279.0, "step": 15 }, { "entropy": 0.5487747192382812, "epoch": 0.1797752808988764, "grad_norm": 58.34965057030908, "learning_rate": 2.7777777777777783e-06, "loss": 3.2007, "mean_token_accuracy": 0.5299479324603453, "num_tokens": 13330043.0, "step": 16 }, { "entropy": 0.561126708984375, "epoch": 0.19101123595505617, "grad_norm": 57.55503720354528, "learning_rate": 2.962962962962963e-06, "loss": 3.1543, "mean_token_accuracy": 0.5169270987389609, "num_tokens": 14127916.0, "step": 17 }, { "entropy": 0.5448684692382812, "epoch": 0.20224719101123595, "grad_norm": 57.669979135570635, "learning_rate": 3.1481481481481483e-06, "loss": 3.0899, "mean_token_accuracy": 0.537760432693176, "num_tokens": 14969669.0, "step": 18 }, { "entropy": 0.56536865234375, "epoch": 0.21348314606741572, "grad_norm": 57.626889014580236, "learning_rate": 3.3333333333333333e-06, "loss": 3.0513, "mean_token_accuracy": 0.5273437657160684, "num_tokens": 15742573.0, "step": 19 }, { "entropy": 0.536224365234375, "epoch": 0.2247191011235955, "grad_norm": 57.83925364642696, "learning_rate": 3.5185185185185187e-06, "loss": 2.9626, "mean_token_accuracy": 0.5403645994374529, "num_tokens": 16604330.0, "step": 20 }, { "entropy": 0.5584945678710938, "epoch": 0.23595505617977527, "grad_norm": 57.73861838272076, "learning_rate": 3.7037037037037037e-06, "loss": 2.9248, "mean_token_accuracy": 0.5416666828095913, "num_tokens": 17401202.0, "step": 21 }, { "entropy": 0.5339431762695312, "epoch": 0.24719101123595505, "grad_norm": 58.12584008334574, "learning_rate": 3.88888888888889e-06, "loss": 2.9143, "mean_token_accuracy": 0.5325520992046222, "num_tokens": 18227760.0, "step": 22 }, { "entropy": 0.5297012329101562, "epoch": 0.25842696629213485, "grad_norm": 63.216344809460054, "learning_rate": 4.074074074074074e-06, "loss": 2.9183, "mean_token_accuracy": 0.5364583493210375, "num_tokens": 19085561.0, "step": 23 }, { "entropy": 0.538482666015625, "epoch": 0.2696629213483146, "grad_norm": 56.95197027840473, "learning_rate": 4.2592592592592596e-06, "loss": 2.8617, "mean_token_accuracy": 0.5429687661817297, "num_tokens": 19905165.0, "step": 24 }, { "entropy": 0.5329513549804688, "epoch": 0.2808988764044944, "grad_norm": 58.492963708867535, "learning_rate": 4.444444444444444e-06, "loss": 2.8492, "mean_token_accuracy": 0.5364583493210375, "num_tokens": 20746635.0, "step": 25 }, { "entropy": 0.5399398803710938, "epoch": 0.29213483146067415, "grad_norm": 57.60957116637501, "learning_rate": 4.62962962962963e-06, "loss": 2.815, "mean_token_accuracy": 0.5390625160653144, "num_tokens": 21555958.0, "step": 26 }, { "entropy": 0.5479583740234375, "epoch": 0.30337078651685395, "grad_norm": 57.30072476472811, "learning_rate": 4.814814814814815e-06, "loss": 2.7716, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 22363221.0, "step": 27 }, { "entropy": 0.5298919677734375, "epoch": 0.3146067415730337, "grad_norm": 57.24926645812191, "learning_rate": 5e-06, "loss": 2.7507, "mean_token_accuracy": 0.5442708495538682, "num_tokens": 23200357.0, "step": 28 }, { "entropy": 0.5341720581054688, "epoch": 0.3258426966292135, "grad_norm": 57.30241919840906, "learning_rate": 4.999952005391863e-06, "loss": 2.7141, "mean_token_accuracy": 0.5520833497866988, "num_tokens": 24032340.0, "step": 29 }, { "entropy": 0.5347061157226562, "epoch": 0.33707865168539325, "grad_norm": 57.286490592876675, "learning_rate": 4.999808023410233e-06, "loss": 2.6785, "mean_token_accuracy": 0.5546875165309757, "num_tokens": 24875983.0, "step": 30 }, { "entropy": 0.523590087890625, "epoch": 0.34831460674157305, "grad_norm": 57.68819839964531, "learning_rate": 4.999568059583401e-06, "loss": 2.6613, "mean_token_accuracy": 0.5533854331588373, "num_tokens": 25724705.0, "step": 31 }, { "entropy": 0.5200119018554688, "epoch": 0.3595505617977528, "grad_norm": 58.4245124088433, "learning_rate": 4.9992321231249425e-06, "loss": 2.6243, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 26582233.0, "step": 32 }, { "entropy": 0.5226516723632812, "epoch": 0.3707865168539326, "grad_norm": 58.32722377916457, "learning_rate": 4.998800226933367e-06, "loss": 2.5931, "mean_token_accuracy": 0.570312516996637, "num_tokens": 27422365.0, "step": 33 }, { "entropy": 0.540557861328125, "epoch": 0.38202247191011235, "grad_norm": 58.734213041913115, "learning_rate": 4.998272387591625e-06, "loss": 2.5598, "mean_token_accuracy": 0.5794271006016061, "num_tokens": 28242002.0, "step": 34 }, { "entropy": 0.5180587768554688, "epoch": 0.39325842696629215, "grad_norm": 59.22626114064095, "learning_rate": 4.997648625366471e-06, "loss": 2.5452, "mean_token_accuracy": 0.5677083502523601, "num_tokens": 29080530.0, "step": 35 }, { "entropy": 0.5167007446289062, "epoch": 0.4044943820224719, "grad_norm": 60.7804863069846, "learning_rate": 4.996928964207685e-06, "loss": 2.5519, "mean_token_accuracy": 0.5651041835080832, "num_tokens": 29911104.0, "step": 36 }, { "entropy": 0.5139617919921875, "epoch": 0.4157303370786517, "grad_norm": 59.02823200509279, "learning_rate": 4.99611343174715e-06, "loss": 2.4802, "mean_token_accuracy": 0.5664062668802217, "num_tokens": 30785022.0, "step": 37 }, { "entropy": 0.5275650024414062, "epoch": 0.42696629213483145, "grad_norm": 59.560232474144826, "learning_rate": 4.995202059297795e-06, "loss": 2.4654, "mean_token_accuracy": 0.5729166837409139, "num_tokens": 31611994.0, "step": 38 }, { "entropy": 0.5351028442382812, "epoch": 0.43820224719101125, "grad_norm": 59.72600726311165, "learning_rate": 4.99419488185239e-06, "loss": 2.4412, "mean_token_accuracy": 0.570312516996637, "num_tokens": 32393343.0, "step": 39 }, { "entropy": 0.527740478515625, "epoch": 0.449438202247191, "grad_norm": 59.57641648822912, "learning_rate": 4.993091938082206e-06, "loss": 2.4243, "mean_token_accuracy": 0.5690104336244985, "num_tokens": 33198379.0, "step": 40 }, { "entropy": 0.515106201171875, "epoch": 0.4606741573033708, "grad_norm": 60.12617134689506, "learning_rate": 4.991893270335526e-06, "loss": 2.4111, "mean_token_accuracy": 0.558593766647391, "num_tokens": 34054233.0, "step": 41 }, { "entropy": 0.5322647094726562, "epoch": 0.47191011235955055, "grad_norm": 59.660173498647474, "learning_rate": 4.990598924636019e-06, "loss": 2.3815, "mean_token_accuracy": 0.5625000167638063, "num_tokens": 34878164.0, "step": 42 }, { "entropy": 0.5286941528320312, "epoch": 0.48314606741573035, "grad_norm": 60.54371226870739, "learning_rate": 4.989208950680979e-06, "loss": 2.3666, "mean_token_accuracy": 0.558593766647391, "num_tokens": 35703689.0, "step": 43 }, { "entropy": 0.5433731079101562, "epoch": 0.4943820224719101, "grad_norm": 60.28449067908698, "learning_rate": 4.987723401839409e-06, "loss": 2.3225, "mean_token_accuracy": 0.5950521006016061, "num_tokens": 36503596.0, "step": 44 }, { "entropy": 0.5287551879882812, "epoch": 0.5056179775280899, "grad_norm": 60.062655344477925, "learning_rate": 4.9861423351499786e-06, "loss": 2.3121, "mean_token_accuracy": 0.6861979308305308, "num_tokens": 37321035.0, "step": 45 }, { "entropy": 0.5376815795898438, "epoch": 0.5168539325842697, "grad_norm": 61.069743149192924, "learning_rate": 4.984465811318826e-06, "loss": 2.2812, "mean_token_accuracy": 0.826822925475426, "num_tokens": 38143678.0, "step": 46 }, { "entropy": 0.5267181396484375, "epoch": 0.5280898876404494, "grad_norm": 60.3671855103254, "learning_rate": 4.982693894717237e-06, "loss": 2.2576, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 39005372.0, "step": 47 }, { "entropy": 0.5442123413085938, "epoch": 0.5393258426966292, "grad_norm": 60.49661142976516, "learning_rate": 4.980826653379163e-06, "loss": 2.2092, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 39808850.0, "step": 48 }, { "entropy": 0.538360595703125, "epoch": 0.550561797752809, "grad_norm": 60.88409758608409, "learning_rate": 4.97886415899862e-06, "loss": 2.1876, "mean_token_accuracy": 0.923177087912336, "num_tokens": 40635450.0, "step": 49 }, { "entropy": 0.5340042114257812, "epoch": 0.5617977528089888, "grad_norm": 60.57809081511029, "learning_rate": 4.976806486926926e-06, "loss": 2.176, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 41464069.0, "step": 50 }, { "entropy": 0.5406875610351562, "epoch": 0.5730337078651685, "grad_norm": 61.75540998165706, "learning_rate": 4.9746537161698125e-06, "loss": 2.1636, "mean_token_accuracy": 0.901041672565043, "num_tokens": 42275662.0, "step": 51 }, { "entropy": 0.560394287109375, "epoch": 0.5842696629213483, "grad_norm": 60.16814853435, "learning_rate": 4.972405929384391e-06, "loss": 2.1153, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 43057777.0, "step": 52 }, { "entropy": 0.5278244018554688, "epoch": 0.5955056179775281, "grad_norm": 60.91973249877592, "learning_rate": 4.970063212875979e-06, "loss": 2.1079, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 43898689.0, "step": 53 }, { "entropy": 0.5439300537109375, "epoch": 0.6067415730337079, "grad_norm": 59.911507237292454, "learning_rate": 4.967625656594782e-06, "loss": 2.0699, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 44744002.0, "step": 54 }, { "entropy": 0.5342025756835938, "epoch": 0.6179775280898876, "grad_norm": 59.838647030374474, "learning_rate": 4.965093354132451e-06, "loss": 2.044, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 45628240.0, "step": 55 }, { "entropy": 0.536773681640625, "epoch": 0.6292134831460674, "grad_norm": 60.35729117488591, "learning_rate": 4.962466402718475e-06, "loss": 2.0351, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 46468344.0, "step": 56 }, { "entropy": 0.5527420043945312, "epoch": 0.6404494382022472, "grad_norm": 59.71609925946788, "learning_rate": 4.959744903216458e-06, "loss": 1.9982, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 47283576.0, "step": 57 }, { "entropy": 0.53558349609375, "epoch": 0.651685393258427, "grad_norm": 59.63500869576208, "learning_rate": 4.9569289601202405e-06, "loss": 1.9785, "mean_token_accuracy": 0.9023437558207661, "num_tokens": 48111866.0, "step": 58 }, { "entropy": 0.551788330078125, "epoch": 0.6629213483146067, "grad_norm": 60.06109571239089, "learning_rate": 4.954018681549891e-06, "loss": 1.9583, "mean_token_accuracy": 0.901041672565043, "num_tokens": 48917746.0, "step": 59 }, { "entropy": 0.538818359375, "epoch": 0.6741573033707865, "grad_norm": 59.102981798917874, "learning_rate": 4.951014179247555e-06, "loss": 1.9142, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 49747914.0, "step": 60 }, { "entropy": 0.5444412231445312, "epoch": 0.6853932584269663, "grad_norm": 59.269303577585205, "learning_rate": 4.9479155685731595e-06, "loss": 1.9104, "mean_token_accuracy": 0.9062500055879354, "num_tokens": 50576281.0, "step": 61 }, { "entropy": 0.543304443359375, "epoch": 0.6966292134831461, "grad_norm": 58.631307105889, "learning_rate": 4.944722968499989e-06, "loss": 1.8554, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 51410056.0, "step": 62 }, { "entropy": 0.5558624267578125, "epoch": 0.7078651685393258, "grad_norm": 58.765453799476205, "learning_rate": 4.9414365016101144e-06, "loss": 1.8217, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 52208397.0, "step": 63 }, { "entropy": 0.5417938232421875, "epoch": 0.7191011235955056, "grad_norm": 59.35803234115679, "learning_rate": 4.938056294089689e-06, "loss": 1.8217, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 53054896.0, "step": 64 }, { "entropy": 0.5324325561523438, "epoch": 0.7303370786516854, "grad_norm": 58.481973969356844, "learning_rate": 4.934582475724101e-06, "loss": 1.7979, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 53923624.0, "step": 65 }, { "entropy": 0.5482559204101562, "epoch": 0.7415730337078652, "grad_norm": 59.26597816016183, "learning_rate": 4.93101517989299e-06, "loss": 1.7507, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 54743941.0, "step": 66 }, { "entropy": 0.533050537109375, "epoch": 0.7528089887640449, "grad_norm": 58.16716667827391, "learning_rate": 4.927354543565131e-06, "loss": 1.7286, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 55583113.0, "step": 67 }, { "entropy": 0.5493240356445312, "epoch": 0.7640449438202247, "grad_norm": 58.77068748544961, "learning_rate": 4.923600707293166e-06, "loss": 1.7072, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 56411372.0, "step": 68 }, { "entropy": 0.5332260131835938, "epoch": 0.7752808988764045, "grad_norm": 58.030678964904006, "learning_rate": 4.919753815208218e-06, "loss": 1.6664, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 57243688.0, "step": 69 }, { "entropy": 0.5486373901367188, "epoch": 0.7865168539325843, "grad_norm": 58.850517970345265, "learning_rate": 4.915814015014349e-06, "loss": 1.6563, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 58065169.0, "step": 70 }, { "entropy": 0.5496139526367188, "epoch": 0.797752808988764, "grad_norm": 58.53066867858742, "learning_rate": 4.91178145798289e-06, "loss": 1.6189, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 58862994.0, "step": 71 }, { "entropy": 0.531982421875, "epoch": 0.8089887640449438, "grad_norm": 59.11337145551544, "learning_rate": 4.90765629894664e-06, "loss": 1.5988, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 59712732.0, "step": 72 }, { "entropy": 0.5450286865234375, "epoch": 0.8202247191011236, "grad_norm": 58.17233978561868, "learning_rate": 4.90343869629391e-06, "loss": 1.5525, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 60540733.0, "step": 73 }, { "entropy": 0.5311508178710938, "epoch": 0.8314606741573034, "grad_norm": 58.64402722570219, "learning_rate": 4.89912881196245e-06, "loss": 1.5378, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 61392029.0, "step": 74 }, { "entropy": 0.5337066650390625, "epoch": 0.8426966292134831, "grad_norm": 58.691322870736734, "learning_rate": 4.8947268114332276e-06, "loss": 1.5081, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 62239350.0, "step": 75 }, { "entropy": 0.5332107543945312, "epoch": 0.8539325842696629, "grad_norm": 59.494275039320314, "learning_rate": 4.890232863724075e-06, "loss": 1.4997, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 63070023.0, "step": 76 }, { "entropy": 0.5474319458007812, "epoch": 0.8651685393258427, "grad_norm": 58.54978299815276, "learning_rate": 4.8856471413831995e-06, "loss": 1.4526, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 63896515.0, "step": 77 }, { "entropy": 0.5431747436523438, "epoch": 0.8764044943820225, "grad_norm": 58.6331170246829, "learning_rate": 4.880969820482559e-06, "loss": 1.4351, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 64711098.0, "step": 78 }, { "entropy": 0.5443191528320312, "epoch": 0.8876404494382022, "grad_norm": 60.77368585176177, "learning_rate": 4.8762010806111e-06, "loss": 1.4007, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 65528767.0, "step": 79 }, { "entropy": 0.5321044921875, "epoch": 0.898876404494382, "grad_norm": 58.41490381142874, "learning_rate": 4.8713411048678635e-06, "loss": 1.3736, "mean_token_accuracy": 0.923177087912336, "num_tokens": 66376396.0, "step": 80 }, { "entropy": 0.5301437377929688, "epoch": 0.9101123595505618, "grad_norm": 58.31714038287807, "learning_rate": 4.866390079854956e-06, "loss": 1.3632, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 67234926.0, "step": 81 }, { "entropy": 0.54400634765625, "epoch": 0.9213483146067416, "grad_norm": 58.481418531052604, "learning_rate": 4.861348195670381e-06, "loss": 1.2982, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 68053260.0, "step": 82 }, { "entropy": 0.534088134765625, "epoch": 0.9325842696629213, "grad_norm": 57.95432573120232, "learning_rate": 4.856215645900742e-06, "loss": 1.2632, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 68867802.0, "step": 83 }, { "entropy": 0.5389633178710938, "epoch": 0.9438202247191011, "grad_norm": 58.38827044496076, "learning_rate": 4.850992627613812e-06, "loss": 1.251, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 69694869.0, "step": 84 }, { "entropy": 0.5363616943359375, "epoch": 0.9550561797752809, "grad_norm": 57.860955575276655, "learning_rate": 4.845679341350963e-06, "loss": 1.2127, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 70531155.0, "step": 85 }, { "entropy": 0.5427932739257812, "epoch": 0.9662921348314607, "grad_norm": 58.932378976256814, "learning_rate": 4.8402759911194705e-06, "loss": 1.1981, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 71363290.0, "step": 86 }, { "entropy": 0.5369033813476562, "epoch": 0.9775280898876404, "grad_norm": 59.60454994673109, "learning_rate": 4.834782784384674e-06, "loss": 1.1884, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 72178708.0, "step": 87 }, { "entropy": 0.5212631225585938, "epoch": 0.9887640449438202, "grad_norm": 57.158221171397074, "learning_rate": 4.8291999320620185e-06, "loss": 1.142, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 73067536.0, "step": 88 }, { "entropy": 0.53387451171875, "epoch": 1.0, "grad_norm": 57.71387886521076, "learning_rate": 4.823527648508951e-06, "loss": 1.1127, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 73887526.0, "step": 89 }, { "entropy": 0.54193115234375, "epoch": 1.0112359550561798, "grad_norm": 56.86485522019607, "learning_rate": 4.817766151516693e-06, "loss": 1.0862, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 74697192.0, "step": 90 }, { "entropy": 0.5283203125, "epoch": 1.0224719101123596, "grad_norm": 57.46020827331217, "learning_rate": 4.811915662301877e-06, "loss": 1.0731, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 75539407.0, "step": 91 }, { "entropy": 0.5219268798828125, "epoch": 1.0337078651685394, "grad_norm": 56.91787187127946, "learning_rate": 4.805976405498052e-06, "loss": 1.0468, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 76388888.0, "step": 92 }, { "entropy": 0.5343704223632812, "epoch": 1.0449438202247192, "grad_norm": 57.27346521029154, "learning_rate": 4.799948609147061e-06, "loss": 1.0094, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 77233062.0, "step": 93 }, { "entropy": 0.527801513671875, "epoch": 1.0561797752808988, "grad_norm": 56.33009259771239, "learning_rate": 4.793832504690283e-06, "loss": 0.9796, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 78066352.0, "step": 94 }, { "entropy": 0.52886962890625, "epoch": 1.0674157303370786, "grad_norm": 57.343778352318104, "learning_rate": 4.787628326959747e-06, "loss": 0.9711, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 78900853.0, "step": 95 }, { "entropy": 0.5222854614257812, "epoch": 1.0786516853932584, "grad_norm": 56.24835996693155, "learning_rate": 4.7813363141691166e-06, "loss": 0.947, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 79755419.0, "step": 96 }, { "entropy": 0.5308151245117188, "epoch": 1.0898876404494382, "grad_norm": 55.743629737936764, "learning_rate": 4.774956707904542e-06, "loss": 0.905, "mean_token_accuracy": 0.945312503259629, "num_tokens": 80595236.0, "step": 97 }, { "entropy": 0.53717041015625, "epoch": 1.101123595505618, "grad_norm": 55.52186768937809, "learning_rate": 4.768489753115386e-06, "loss": 0.8817, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 81420449.0, "step": 98 }, { "entropy": 0.5405044555664062, "epoch": 1.1123595505617978, "grad_norm": 54.95236799373694, "learning_rate": 4.761935698104817e-06, "loss": 0.852, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 82231257.0, "step": 99 }, { "entropy": 0.5181732177734375, "epoch": 1.1235955056179776, "grad_norm": 54.78774457440945, "learning_rate": 4.755294794520277e-06, "loss": 0.859, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 83098626.0, "step": 100 }, { "entropy": 0.5196533203125, "epoch": 1.1348314606741572, "grad_norm": 54.27301765434061, "learning_rate": 4.7485672973438175e-06, "loss": 0.805, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 83964827.0, "step": 101 }, { "entropy": 0.5354232788085938, "epoch": 1.146067415730337, "grad_norm": 53.98544893534557, "learning_rate": 4.741753464882312e-06, "loss": 0.8019, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 84783212.0, "step": 102 }, { "entropy": 0.5196075439453125, "epoch": 1.1573033707865168, "grad_norm": 53.76533745223255, "learning_rate": 4.734853558757534e-06, "loss": 0.7712, "mean_token_accuracy": 0.945312503259629, "num_tokens": 85630983.0, "step": 103 }, { "entropy": 0.5174789428710938, "epoch": 1.1685393258426966, "grad_norm": 53.14821983286506, "learning_rate": 4.727867843896116e-06, "loss": 0.7418, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 86471990.0, "step": 104 }, { "entropy": 0.535003662109375, "epoch": 1.1797752808988764, "grad_norm": 52.63736880368936, "learning_rate": 4.72079658851938e-06, "loss": 0.722, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 87254432.0, "step": 105 }, { "entropy": 0.5264739990234375, "epoch": 1.1910112359550562, "grad_norm": 51.9971299352231, "learning_rate": 4.7136400641330245e-06, "loss": 0.6939, "mean_token_accuracy": 0.945312503259629, "num_tokens": 88074800.0, "step": 106 }, { "entropy": 0.524566650390625, "epoch": 1.202247191011236, "grad_norm": 51.764163888935016, "learning_rate": 4.706398545516722e-06, "loss": 0.6962, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 88905071.0, "step": 107 }, { "entropy": 0.52813720703125, "epoch": 1.2134831460674158, "grad_norm": 52.44796000632313, "learning_rate": 4.6990723107135475e-06, "loss": 0.6476, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 89698224.0, "step": 108 }, { "entropy": 0.5216827392578125, "epoch": 1.2247191011235956, "grad_norm": 53.80256967623094, "learning_rate": 4.691661641019316e-06, "loss": 0.6913, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 90537702.0, "step": 109 }, { "entropy": 0.5336227416992188, "epoch": 1.2359550561797752, "grad_norm": 49.47024184934409, "learning_rate": 4.684166820971779e-06, "loss": 0.6087, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 91352264.0, "step": 110 }, { "entropy": 0.5501785278320312, "epoch": 1.247191011235955, "grad_norm": 47.99596183660331, "learning_rate": 4.6765881383396985e-06, "loss": 0.5901, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 92109673.0, "step": 111 }, { "entropy": 0.542724609375, "epoch": 1.2584269662921348, "grad_norm": 46.25282102422423, "learning_rate": 4.6689258841117946e-06, "loss": 0.5642, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 92900953.0, "step": 112 }, { "entropy": 0.5264129638671875, "epoch": 1.2696629213483146, "grad_norm": 45.38074896415223, "learning_rate": 4.6611803524855805e-06, "loss": 0.5528, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 93718876.0, "step": 113 }, { "entropy": 0.5328903198242188, "epoch": 1.2808988764044944, "grad_norm": 44.871573881233765, "learning_rate": 4.65335184085606e-06, "loss": 0.5572, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 94529005.0, "step": 114 }, { "entropy": 0.5391159057617188, "epoch": 1.2921348314606742, "grad_norm": 44.02274928854262, "learning_rate": 4.64544064980431e-06, "loss": 0.507, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 95341154.0, "step": 115 }, { "entropy": 0.5222015380859375, "epoch": 1.303370786516854, "grad_norm": 42.75676583668566, "learning_rate": 4.637447083085944e-06, "loss": 0.5046, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 96167635.0, "step": 116 }, { "entropy": 0.5244522094726562, "epoch": 1.3146067415730336, "grad_norm": 45.15532414937935, "learning_rate": 4.629371447619443e-06, "loss": 0.4978, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 97010200.0, "step": 117 }, { "entropy": 0.5244598388671875, "epoch": 1.3258426966292136, "grad_norm": 56.13362888266017, "learning_rate": 4.621214053474374e-06, "loss": 0.6082, "mean_token_accuracy": 0.8971354227978736, "num_tokens": 97852254.0, "step": 118 }, { "entropy": 0.539215087890625, "epoch": 1.3370786516853932, "grad_norm": 38.98016965907191, "learning_rate": 4.612975213859487e-06, "loss": 0.4563, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 98679091.0, "step": 119 }, { "entropy": 0.5576171875, "epoch": 1.348314606741573, "grad_norm": 40.255740846718474, "learning_rate": 4.604655245110684e-06, "loss": 0.4792, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 99483720.0, "step": 120 }, { "entropy": 0.5303115844726562, "epoch": 1.3595505617977528, "grad_norm": 35.28949683471456, "learning_rate": 4.596254466678877e-06, "loss": 0.4717, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 100322701.0, "step": 121 }, { "entropy": 0.5253524780273438, "epoch": 1.3707865168539326, "grad_norm": 35.972964160783164, "learning_rate": 4.5877732011177215e-06, "loss": 0.4626, "mean_token_accuracy": 0.9023437558207661, "num_tokens": 101165734.0, "step": 122 }, { "entropy": 0.5372314453125, "epoch": 1.3820224719101124, "grad_norm": 34.071476491719324, "learning_rate": 4.579211774071229e-06, "loss": 0.4247, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 101981751.0, "step": 123 }, { "entropy": 0.51861572265625, "epoch": 1.3932584269662922, "grad_norm": 33.00619895893129, "learning_rate": 4.570570514261272e-06, "loss": 0.4043, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 102873816.0, "step": 124 }, { "entropy": 0.5457382202148438, "epoch": 1.404494382022472, "grad_norm": 33.283611733855004, "learning_rate": 4.561849753474951e-06, "loss": 0.3831, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 103679964.0, "step": 125 }, { "entropy": 0.551483154296875, "epoch": 1.4157303370786516, "grad_norm": 30.450265998368877, "learning_rate": 4.553049826551864e-06, "loss": 0.3586, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 104483969.0, "step": 126 }, { "entropy": 0.544921875, "epoch": 1.4269662921348314, "grad_norm": 33.037901010787365, "learning_rate": 4.544171071371246e-06, "loss": 0.3743, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 105281682.0, "step": 127 }, { "entropy": 0.5335235595703125, "epoch": 1.4382022471910112, "grad_norm": 29.309084873235943, "learning_rate": 4.535213828838998e-06, "loss": 0.3215, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 106098113.0, "step": 128 }, { "entropy": 0.536895751953125, "epoch": 1.449438202247191, "grad_norm": 29.992570415099404, "learning_rate": 4.526178442874596e-06, "loss": 0.3275, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 106932062.0, "step": 129 }, { "entropy": 0.53948974609375, "epoch": 1.4606741573033708, "grad_norm": 29.198467800844845, "learning_rate": 4.517065260397887e-06, "loss": 0.3151, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 107772618.0, "step": 130 }, { "entropy": 0.5549697875976562, "epoch": 1.4719101123595506, "grad_norm": 26.144198778784126, "learning_rate": 4.5078746313157684e-06, "loss": 0.2783, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 108550420.0, "step": 131 }, { "entropy": 0.5575103759765625, "epoch": 1.4831460674157304, "grad_norm": 26.803101306468495, "learning_rate": 4.498606908508754e-06, "loss": 0.2814, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 109318814.0, "step": 132 }, { "entropy": 0.5332794189453125, "epoch": 1.49438202247191, "grad_norm": 23.65296150298368, "learning_rate": 4.489262447817421e-06, "loss": 0.2551, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 110158987.0, "step": 133 }, { "entropy": 0.531585693359375, "epoch": 1.50561797752809, "grad_norm": 22.686143776318026, "learning_rate": 4.479841608028756e-06, "loss": 0.2753, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 111014455.0, "step": 134 }, { "entropy": 0.5310440063476562, "epoch": 1.5168539325842696, "grad_norm": 22.75785890829976, "learning_rate": 4.470344750862369e-06, "loss": 0.2744, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 111844565.0, "step": 135 }, { "entropy": 0.5299530029296875, "epoch": 1.5280898876404494, "grad_norm": 19.646757022226197, "learning_rate": 4.460772240956609e-06, "loss": 0.236, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 112693858.0, "step": 136 }, { "entropy": 0.530914306640625, "epoch": 1.5393258426966292, "grad_norm": 18.581871649623455, "learning_rate": 4.4511244458545666e-06, "loss": 0.2337, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 113552373.0, "step": 137 }, { "entropy": 0.5373458862304688, "epoch": 1.550561797752809, "grad_norm": 18.480906621711842, "learning_rate": 4.441401735989958e-06, "loss": 0.224, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 114386318.0, "step": 138 }, { "entropy": 0.5466995239257812, "epoch": 1.5617977528089888, "grad_norm": 19.092891408382183, "learning_rate": 4.431604484672905e-06, "loss": 0.2181, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 115190002.0, "step": 139 }, { "entropy": 0.5388870239257812, "epoch": 1.5730337078651684, "grad_norm": 16.375029133835223, "learning_rate": 4.421733068075596e-06, "loss": 0.218, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 116010674.0, "step": 140 }, { "entropy": 0.5213394165039062, "epoch": 1.5842696629213484, "grad_norm": 15.182778483601002, "learning_rate": 4.411787865217847e-06, "loss": 0.2018, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 116866501.0, "step": 141 }, { "entropy": 0.5382003784179688, "epoch": 1.595505617977528, "grad_norm": 16.027990698815895, "learning_rate": 4.401769257952551e-06, "loss": 0.1885, "mean_token_accuracy": 0.9570312525611371, "num_tokens": 117690211.0, "step": 142 }, { "entropy": 0.51544189453125, "epoch": 1.606741573033708, "grad_norm": 19.861091704846544, "learning_rate": 4.3916776309510115e-06, "loss": 0.1953, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 118570203.0, "step": 143 }, { "entropy": 0.5500030517578125, "epoch": 1.6179775280898876, "grad_norm": 20.061583831622784, "learning_rate": 4.381513371688174e-06, "loss": 0.184, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 119362860.0, "step": 144 }, { "entropy": 0.5243148803710938, "epoch": 1.6292134831460674, "grad_norm": 13.08141096028655, "learning_rate": 4.3712768704277535e-06, "loss": 0.1714, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 120222412.0, "step": 145 }, { "entropy": 0.5271530151367188, "epoch": 1.6404494382022472, "grad_norm": 30.698080213743623, "learning_rate": 4.360968520207241e-06, "loss": 0.2563, "mean_token_accuracy": 0.9036458390764892, "num_tokens": 121059163.0, "step": 146 }, { "entropy": 0.5256195068359375, "epoch": 1.651685393258427, "grad_norm": 11.372090831152036, "learning_rate": 4.35058871682282e-06, "loss": 0.1618, "mean_token_accuracy": 0.955729169305414, "num_tokens": 121918453.0, "step": 147 }, { "entropy": 0.537628173828125, "epoch": 1.6629213483146068, "grad_norm": 18.04796633205754, "learning_rate": 4.340137858814168e-06, "loss": 0.1884, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 122723340.0, "step": 148 }, { "entropy": 0.536773681640625, "epoch": 1.6741573033707864, "grad_norm": 10.93001525080465, "learning_rate": 4.329616347449154e-06, "loss": 0.1552, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 123541295.0, "step": 149 }, { "entropy": 0.5089187622070312, "epoch": 1.6853932584269664, "grad_norm": 20.39620342458635, "learning_rate": 4.3190245867084275e-06, "loss": 0.1989, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 124435851.0, "step": 150 }, { "entropy": 0.5318374633789062, "epoch": 1.696629213483146, "grad_norm": 13.331468443274261, "learning_rate": 4.308362983269916e-06, "loss": 0.159, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 125265567.0, "step": 151 }, { "entropy": 0.53662109375, "epoch": 1.7078651685393258, "grad_norm": 23.59150612291166, "learning_rate": 4.297631946493202e-06, "loss": 0.2394, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 126117998.0, "step": 152 }, { "entropy": 0.5291519165039062, "epoch": 1.7191011235955056, "grad_norm": 16.96931279554441, "learning_rate": 4.2868318884038075e-06, "loss": 0.1939, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 126968965.0, "step": 153 }, { "entropy": 0.5551834106445312, "epoch": 1.7303370786516854, "grad_norm": 9.706337234757527, "learning_rate": 4.275963223677379e-06, "loss": 0.1495, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 127735103.0, "step": 154 }, { "entropy": 0.55059814453125, "epoch": 1.7415730337078652, "grad_norm": 14.998195966114025, "learning_rate": 4.265026369623761e-06, "loss": 0.1691, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 128567014.0, "step": 155 }, { "entropy": 0.5133056640625, "epoch": 1.7528089887640448, "grad_norm": 8.38785965664328, "learning_rate": 4.254021746170972e-06, "loss": 0.1674, "mean_token_accuracy": 0.9505208362825215, "num_tokens": 129446494.0, "step": 156 }, { "entropy": 0.5388946533203125, "epoch": 1.7640449438202248, "grad_norm": 8.628009379073635, "learning_rate": 4.242949775849083e-06, "loss": 0.1342, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 130282793.0, "step": 157 }, { "entropy": 0.5218429565429688, "epoch": 1.7752808988764044, "grad_norm": 8.979344060078741, "learning_rate": 4.231810883773999e-06, "loss": 0.1498, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 131144403.0, "step": 158 }, { "entropy": 0.5305633544921875, "epoch": 1.7865168539325844, "grad_norm": 10.386798650341824, "learning_rate": 4.220605497631125e-06, "loss": 0.1208, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 131969457.0, "step": 159 }, { "entropy": 0.5267715454101562, "epoch": 1.797752808988764, "grad_norm": 7.845366778688198, "learning_rate": 4.209334047658956e-06, "loss": 0.1339, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 132802344.0, "step": 160 }, { "entropy": 0.51898193359375, "epoch": 1.8089887640449438, "grad_norm": 15.048259815135598, "learning_rate": 4.197996966632551e-06, "loss": 0.1477, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 133686657.0, "step": 161 }, { "entropy": 0.5142669677734375, "epoch": 1.8202247191011236, "grad_norm": 6.170398947994508, "learning_rate": 4.186594689846919e-06, "loss": 0.1196, "mean_token_accuracy": 0.9544270860496908, "num_tokens": 134569286.0, "step": 162 }, { "entropy": 0.5417022705078125, "epoch": 1.8314606741573034, "grad_norm": 6.977797842018329, "learning_rate": 4.175127655100306e-06, "loss": 0.1176, "mean_token_accuracy": 0.955729169305414, "num_tokens": 135384479.0, "step": 163 }, { "entropy": 0.5277252197265625, "epoch": 1.8426966292134832, "grad_norm": 12.599667992263445, "learning_rate": 4.163596302677383e-06, "loss": 0.1273, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 136199109.0, "step": 164 }, { "entropy": 0.523162841796875, "epoch": 1.8539325842696628, "grad_norm": 6.79687806434144, "learning_rate": 4.152001075332342e-06, "loss": 0.1134, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 137047704.0, "step": 165 }, { "entropy": 0.5387954711914062, "epoch": 1.8651685393258428, "grad_norm": 5.293968888823296, "learning_rate": 4.140342418271897e-06, "loss": 0.0948, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 137837799.0, "step": 166 }, { "entropy": 0.5280380249023438, "epoch": 1.8764044943820224, "grad_norm": 9.734221142762577, "learning_rate": 4.128620779138191e-06, "loss": 0.1198, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 138667003.0, "step": 167 }, { "entropy": 0.5301589965820312, "epoch": 1.8876404494382022, "grad_norm": 10.983716082693723, "learning_rate": 4.116836607991603e-06, "loss": 0.1103, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 139486069.0, "step": 168 }, { "entropy": 0.5339202880859375, "epoch": 1.898876404494382, "grad_norm": 4.754192213758688, "learning_rate": 4.104990357293478e-06, "loss": 0.0976, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 140301431.0, "step": 169 }, { "entropy": 0.5370941162109375, "epoch": 1.9101123595505618, "grad_norm": 7.392070355069441, "learning_rate": 4.09308248188874e-06, "loss": 0.0953, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 141127372.0, "step": 170 }, { "entropy": 0.5226898193359375, "epoch": 1.9213483146067416, "grad_norm": 14.855299560449547, "learning_rate": 4.081113438988443e-06, "loss": 0.1426, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 141960887.0, "step": 171 }, { "entropy": 0.5215377807617188, "epoch": 1.9325842696629212, "grad_norm": 5.677690032968136, "learning_rate": 4.069083688152206e-06, "loss": 0.0999, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 142790358.0, "step": 172 }, { "entropy": 0.5326385498046875, "epoch": 1.9438202247191012, "grad_norm": 17.868174534877028, "learning_rate": 4.056993691270569e-06, "loss": 0.1516, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 143606992.0, "step": 173 }, { "entropy": 0.5331192016601562, "epoch": 1.9550561797752808, "grad_norm": 10.649577515960202, "learning_rate": 4.044843912547262e-06, "loss": 0.1173, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 144422922.0, "step": 174 }, { "entropy": 0.5289459228515625, "epoch": 1.9662921348314608, "grad_norm": 19.71433225206811, "learning_rate": 4.032634818481382e-06, "loss": 0.1615, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 145257110.0, "step": 175 }, { "entropy": 0.5237350463867188, "epoch": 1.9775280898876404, "grad_norm": 23.109552808231093, "learning_rate": 4.020366877849477e-06, "loss": 0.1769, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 146089047.0, "step": 176 }, { "entropy": 0.5413665771484375, "epoch": 1.9887640449438202, "grad_norm": 11.897669907113016, "learning_rate": 4.008040561687549e-06, "loss": 0.1259, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 146909622.0, "step": 177 }, { "entropy": 0.5229339599609375, "epoch": 2.0, "grad_norm": 14.618156823772459, "learning_rate": 3.995656343272969e-06, "loss": 0.1309, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 147777645.0, "step": 178 }, { "entropy": 0.522613525390625, "epoch": 2.0112359550561796, "grad_norm": 22.246161812882423, "learning_rate": 3.983214698106305e-06, "loss": 0.1777, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 148648659.0, "step": 179 }, { "entropy": 0.5502471923828125, "epoch": 2.0224719101123596, "grad_norm": 8.734834674837812, "learning_rate": 3.970716103893065e-06, "loss": 0.1015, "mean_token_accuracy": 0.967447918606922, "num_tokens": 149458601.0, "step": 180 }, { "entropy": 0.5521392822265625, "epoch": 2.033707865168539, "grad_norm": 5.921310602189352, "learning_rate": 3.958161040525354e-06, "loss": 0.1132, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 150274927.0, "step": 181 }, { "entropy": 0.539154052734375, "epoch": 2.044943820224719, "grad_norm": 12.815169602354262, "learning_rate": 3.94554999006345e-06, "loss": 0.1233, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 151122405.0, "step": 182 }, { "entropy": 0.5272445678710938, "epoch": 2.056179775280899, "grad_norm": 8.808389478714032, "learning_rate": 3.932883436717291e-06, "loss": 0.1029, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 151962476.0, "step": 183 }, { "entropy": 0.5394363403320312, "epoch": 2.067415730337079, "grad_norm": 3.5484560249089445, "learning_rate": 3.92016186682789e-06, "loss": 0.0751, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 152793064.0, "step": 184 }, { "entropy": 0.516876220703125, "epoch": 2.0786516853932584, "grad_norm": 8.176851356833051, "learning_rate": 3.907385768848656e-06, "loss": 0.1042, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 153670203.0, "step": 185 }, { "entropy": 0.5299301147460938, "epoch": 2.0898876404494384, "grad_norm": 5.999812660978029, "learning_rate": 3.894555633326642e-06, "loss": 0.0825, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 154512429.0, "step": 186 }, { "entropy": 0.522918701171875, "epoch": 2.101123595505618, "grad_norm": 10.700306121124873, "learning_rate": 3.88167195288371e-06, "loss": 0.0943, "mean_token_accuracy": 0.967447918606922, "num_tokens": 155359706.0, "step": 187 }, { "entropy": 0.5169754028320312, "epoch": 2.1123595505617976, "grad_norm": 8.896828713381819, "learning_rate": 3.868735222197614e-06, "loss": 0.0897, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 156201601.0, "step": 188 }, { "entropy": 0.5218658447265625, "epoch": 2.1235955056179776, "grad_norm": 10.96828625347169, "learning_rate": 3.85574593798301e-06, "loss": 0.0896, "mean_token_accuracy": 0.967447918606922, "num_tokens": 157033011.0, "step": 189 }, { "entropy": 0.5191421508789062, "epoch": 2.134831460674157, "grad_norm": 10.179919221478066, "learning_rate": 3.842704598972384e-06, "loss": 0.0886, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 157878092.0, "step": 190 }, { "entropy": 0.524810791015625, "epoch": 2.146067415730337, "grad_norm": 5.580920039451122, "learning_rate": 3.8296117058969e-06, "loss": 0.1033, "mean_token_accuracy": 0.9635416688397527, "num_tokens": 158716668.0, "step": 191 }, { "entropy": 0.51739501953125, "epoch": 2.157303370786517, "grad_norm": 8.577423884096872, "learning_rate": 3.816467761467175e-06, "loss": 0.0731, "mean_token_accuracy": 0.977864584652707, "num_tokens": 159542526.0, "step": 192 }, { "entropy": 0.51837158203125, "epoch": 2.168539325842697, "grad_norm": 6.116622062216189, "learning_rate": 3.80327327035398e-06, "loss": 0.0775, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 160368637.0, "step": 193 }, { "entropy": 0.5116424560546875, "epoch": 2.1797752808988764, "grad_norm": 4.322788821439791, "learning_rate": 3.7900287391688584e-06, "loss": 0.0765, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 161223870.0, "step": 194 }, { "entropy": 0.506072998046875, "epoch": 2.191011235955056, "grad_norm": 5.948522126434665, "learning_rate": 3.776734676444678e-06, "loss": 0.0751, "mean_token_accuracy": 0.977864584652707, "num_tokens": 162059887.0, "step": 195 }, { "entropy": 0.53485107421875, "epoch": 2.202247191011236, "grad_norm": 6.939964205005145, "learning_rate": 3.763391592616104e-06, "loss": 0.0912, "mean_token_accuracy": 0.967447918606922, "num_tokens": 162850931.0, "step": 196 }, { "entropy": 0.5248184204101562, "epoch": 2.2134831460674156, "grad_norm": 6.111124160198039, "learning_rate": 3.7500000000000005e-06, "loss": 0.0627, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 163662524.0, "step": 197 }, { "entropy": 0.513763427734375, "epoch": 2.2247191011235956, "grad_norm": 4.072432706569934, "learning_rate": 3.7365604127757584e-06, "loss": 0.0562, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 164489538.0, "step": 198 }, { "entropy": 0.5129623413085938, "epoch": 2.235955056179775, "grad_norm": 9.616365421634299, "learning_rate": 3.7230733469655554e-06, "loss": 0.0767, "mean_token_accuracy": 0.977864584652707, "num_tokens": 165320322.0, "step": 199 }, { "entropy": 0.51519775390625, "epoch": 2.247191011235955, "grad_norm": 4.926175655349057, "learning_rate": 3.709539320414544e-06, "loss": 0.0689, "mean_token_accuracy": 0.977864584652707, "num_tokens": 166151934.0, "step": 200 }, { "entropy": 0.5228424072265625, "epoch": 2.258426966292135, "grad_norm": 7.13563776574761, "learning_rate": 3.6959588527709635e-06, "loss": 0.0801, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 166958027.0, "step": 201 }, { "entropy": 0.515106201171875, "epoch": 2.2696629213483144, "grad_norm": 8.484197218001166, "learning_rate": 3.6823324654661923e-06, "loss": 0.0756, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 167797700.0, "step": 202 }, { "entropy": 0.5139694213867188, "epoch": 2.2808988764044944, "grad_norm": 4.551393455292689, "learning_rate": 3.6686606816947264e-06, "loss": 0.0668, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 168633047.0, "step": 203 }, { "entropy": 0.511627197265625, "epoch": 2.292134831460674, "grad_norm": 13.590007002387853, "learning_rate": 3.6549440263940878e-06, "loss": 0.1093, "mean_token_accuracy": 0.955729169305414, "num_tokens": 169477710.0, "step": 204 }, { "entropy": 0.5248260498046875, "epoch": 2.303370786516854, "grad_norm": 8.339146711131134, "learning_rate": 3.6411830262246755e-06, "loss": 0.0797, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 170293561.0, "step": 205 }, { "entropy": 0.52813720703125, "epoch": 2.3146067415730336, "grad_norm": 6.075928027622734, "learning_rate": 3.627378209549537e-06, "loss": 0.0611, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 171094703.0, "step": 206 }, { "entropy": 0.5260696411132812, "epoch": 2.3258426966292136, "grad_norm": 8.027691228204391, "learning_rate": 3.6135301064140856e-06, "loss": 0.0788, "mean_token_accuracy": 0.967447918606922, "num_tokens": 171940750.0, "step": 207 }, { "entropy": 0.5362701416015625, "epoch": 2.337078651685393, "grad_norm": 4.7127759183440405, "learning_rate": 3.599639248525749e-06, "loss": 0.0656, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 172746388.0, "step": 208 }, { "entropy": 0.5174713134765625, "epoch": 2.348314606741573, "grad_norm": 12.123204299345215, "learning_rate": 3.5857061692335503e-06, "loss": 0.1167, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 173596511.0, "step": 209 }, { "entropy": 0.5310440063476562, "epoch": 2.359550561797753, "grad_norm": 3.6583692027389065, "learning_rate": 3.5717314035076355e-06, "loss": 0.0552, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 174399527.0, "step": 210 }, { "entropy": 0.5257034301757812, "epoch": 2.370786516853933, "grad_norm": 10.990027992636044, "learning_rate": 3.5577154879187286e-06, "loss": 0.1029, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 175213185.0, "step": 211 }, { "entropy": 0.5306854248046875, "epoch": 2.3820224719101124, "grad_norm": 9.765128326680557, "learning_rate": 3.5436589606175296e-06, "loss": 0.0794, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 176038856.0, "step": 212 }, { "entropy": 0.5189208984375, "epoch": 2.393258426966292, "grad_norm": 8.116475495273617, "learning_rate": 3.5295623613140563e-06, "loss": 0.0727, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 176888989.0, "step": 213 }, { "entropy": 0.529022216796875, "epoch": 2.404494382022472, "grad_norm": 7.3861491529542045, "learning_rate": 3.5154262312569134e-06, "loss": 0.0751, "mean_token_accuracy": 0.967447918606922, "num_tokens": 177734742.0, "step": 214 }, { "entropy": 0.5261993408203125, "epoch": 2.4157303370786516, "grad_norm": 3.2893739082717977, "learning_rate": 3.501251113212521e-06, "loss": 0.0606, "mean_token_accuracy": 0.977864584652707, "num_tokens": 178565645.0, "step": 215 }, { "entropy": 0.5221099853515625, "epoch": 2.4269662921348316, "grad_norm": 4.62174053824965, "learning_rate": 3.4870375514442677e-06, "loss": 0.0474, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 179403274.0, "step": 216 }, { "entropy": 0.5177154541015625, "epoch": 2.438202247191011, "grad_norm": 4.936685319427876, "learning_rate": 3.4727860916916143e-06, "loss": 0.0742, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 180259819.0, "step": 217 }, { "entropy": 0.5179824829101562, "epoch": 2.449438202247191, "grad_norm": 5.420191287061157, "learning_rate": 3.458497281149143e-06, "loss": 0.0727, "mean_token_accuracy": 0.977864584652707, "num_tokens": 181106828.0, "step": 218 }, { "entropy": 0.5365829467773438, "epoch": 2.460674157303371, "grad_norm": 5.80819935292701, "learning_rate": 3.444171668445544e-06, "loss": 0.0576, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 181903812.0, "step": 219 }, { "entropy": 0.5154647827148438, "epoch": 2.4719101123595504, "grad_norm": 4.646194834229386, "learning_rate": 3.429809803622551e-06, "loss": 0.0652, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 182736365.0, "step": 220 }, { "entropy": 0.5201644897460938, "epoch": 2.4831460674157304, "grad_norm": 4.3253828750360075, "learning_rate": 3.415412238113823e-06, "loss": 0.0592, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 183557658.0, "step": 221 }, { "entropy": 0.5149002075195312, "epoch": 2.49438202247191, "grad_norm": 3.335909978206754, "learning_rate": 3.400979524723773e-06, "loss": 0.0446, "mean_token_accuracy": 0.989583333954215, "num_tokens": 184415937.0, "step": 222 }, { "entropy": 0.5210342407226562, "epoch": 2.50561797752809, "grad_norm": 4.304889690569454, "learning_rate": 3.386512217606339e-06, "loss": 0.0582, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 185226530.0, "step": 223 }, { "entropy": 0.5176010131835938, "epoch": 2.5168539325842696, "grad_norm": 8.647373614702433, "learning_rate": 3.372010872243711e-06, "loss": 0.0624, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 186062362.0, "step": 224 }, { "entropy": 0.5189895629882812, "epoch": 2.5280898876404496, "grad_norm": 5.434934187796716, "learning_rate": 3.357476045424998e-06, "loss": 0.0524, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 186907709.0, "step": 225 }, { "entropy": 0.51287841796875, "epoch": 2.539325842696629, "grad_norm": 4.204918090662119, "learning_rate": 3.342908295224854e-06, "loss": 0.048, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 187754524.0, "step": 226 }, { "entropy": 0.5017776489257812, "epoch": 2.550561797752809, "grad_norm": 7.2126840274010755, "learning_rate": 3.32830818098205e-06, "loss": 0.0712, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 188612569.0, "step": 227 }, { "entropy": 0.5189361572265625, "epoch": 2.561797752808989, "grad_norm": 4.49147927637142, "learning_rate": 3.313676263277995e-06, "loss": 0.0506, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 189431556.0, "step": 228 }, { "entropy": 0.5114974975585938, "epoch": 2.5730337078651684, "grad_norm": 8.23007248713336, "learning_rate": 3.299013103915214e-06, "loss": 0.069, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 190280003.0, "step": 229 }, { "entropy": 0.5097122192382812, "epoch": 2.5842696629213484, "grad_norm": 9.609630695704968, "learning_rate": 3.2843192658957775e-06, "loss": 0.0753, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 191150162.0, "step": 230 }, { "entropy": 0.5175933837890625, "epoch": 2.595505617977528, "grad_norm": 4.841549926893903, "learning_rate": 3.269595313399683e-06, "loss": 0.0595, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 191982499.0, "step": 231 }, { "entropy": 0.53167724609375, "epoch": 2.606741573033708, "grad_norm": 5.9233921841701065, "learning_rate": 3.2548418117631952e-06, "loss": 0.0464, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 192798994.0, "step": 232 }, { "entropy": 0.5178909301757812, "epoch": 2.6179775280898876, "grad_norm": 5.634708544843851, "learning_rate": 3.240059327457138e-06, "loss": 0.0541, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 193626603.0, "step": 233 }, { "entropy": 0.5294189453125, "epoch": 2.629213483146067, "grad_norm": 4.84001706365338, "learning_rate": 3.2252484280651453e-06, "loss": 0.0558, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 194426242.0, "step": 234 }, { "entropy": 0.5013351440429688, "epoch": 2.640449438202247, "grad_norm": 3.350014032586222, "learning_rate": 3.2104096822618657e-06, "loss": 0.0402, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 195272929.0, "step": 235 }, { "entropy": 0.5089569091796875, "epoch": 2.6516853932584272, "grad_norm": 4.702507794220196, "learning_rate": 3.195543659791132e-06, "loss": 0.0414, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 196112635.0, "step": 236 }, { "entropy": 0.5014801025390625, "epoch": 2.662921348314607, "grad_norm": 5.663346108909122, "learning_rate": 3.1806509314440827e-06, "loss": 0.0399, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 196967166.0, "step": 237 }, { "entropy": 0.5024566650390625, "epoch": 2.6741573033707864, "grad_norm": 7.232339951811718, "learning_rate": 3.1657320690372464e-06, "loss": 0.055, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 197827153.0, "step": 238 }, { "entropy": 0.51092529296875, "epoch": 2.6853932584269664, "grad_norm": 6.540851314197654, "learning_rate": 3.150787645390587e-06, "loss": 0.0593, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 198648707.0, "step": 239 }, { "entropy": 0.5111083984375, "epoch": 2.696629213483146, "grad_norm": 5.662581940141526, "learning_rate": 3.135818234305511e-06, "loss": 0.0396, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 199450306.0, "step": 240 }, { "entropy": 0.5034255981445312, "epoch": 2.7078651685393256, "grad_norm": 5.8373831882494684, "learning_rate": 3.120824410542833e-06, "loss": 0.0321, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 200298341.0, "step": 241 }, { "entropy": 0.50079345703125, "epoch": 2.7191011235955056, "grad_norm": 5.910065701044768, "learning_rate": 3.1058067498007094e-06, "loss": 0.0422, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 201149610.0, "step": 242 }, { "entropy": 0.5079193115234375, "epoch": 2.7303370786516856, "grad_norm": 4.478641894279129, "learning_rate": 3.090765828692534e-06, "loss": 0.0379, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 202005054.0, "step": 243 }, { "entropy": 0.515045166015625, "epoch": 2.741573033707865, "grad_norm": 5.6413331362338806, "learning_rate": 3.0757022247248e-06, "loss": 0.0439, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 202822761.0, "step": 244 }, { "entropy": 0.5003509521484375, "epoch": 2.752808988764045, "grad_norm": 4.675637549204775, "learning_rate": 3.0606165162749212e-06, "loss": 0.0304, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 203654329.0, "step": 245 }, { "entropy": 0.5175704956054688, "epoch": 2.764044943820225, "grad_norm": 4.266714217449698, "learning_rate": 3.045509282569031e-06, "loss": 0.0355, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 204452028.0, "step": 246 }, { "entropy": 0.510162353515625, "epoch": 2.7752808988764044, "grad_norm": 3.472266417067048, "learning_rate": 3.0303811036597395e-06, "loss": 0.0275, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 205254485.0, "step": 247 }, { "entropy": 0.5191574096679688, "epoch": 2.7865168539325844, "grad_norm": 2.8786418244959275, "learning_rate": 3.01523256040386e-06, "loss": 0.0212, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 206055392.0, "step": 248 }, { "entropy": 0.5145721435546875, "epoch": 2.797752808988764, "grad_norm": 4.496243851753138, "learning_rate": 3.0000642344401115e-06, "loss": 0.0372, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 206869050.0, "step": 249 }, { "entropy": 0.49834442138671875, "epoch": 2.808988764044944, "grad_norm": 5.002231345777496, "learning_rate": 2.9848767081667823e-06, "loss": 0.028, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 207712295.0, "step": 250 }, { "entropy": 0.49251556396484375, "epoch": 2.8202247191011236, "grad_norm": 4.859586789233871, "learning_rate": 2.9696705647193695e-06, "loss": 0.0445, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 208580496.0, "step": 251 }, { "entropy": 0.5194778442382812, "epoch": 2.831460674157303, "grad_norm": 3.9520698327060177, "learning_rate": 2.9544463879481914e-06, "loss": 0.0315, "mean_token_accuracy": 0.989583333954215, "num_tokens": 209388304.0, "step": 252 }, { "entropy": 0.5085906982421875, "epoch": 2.842696629213483, "grad_norm": 4.449886428662148, "learning_rate": 2.9392047623959653e-06, "loss": 0.0307, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 210216106.0, "step": 253 }, { "entropy": 0.4924468994140625, "epoch": 2.853932584269663, "grad_norm": 4.102595454995453, "learning_rate": 2.923946273275369e-06, "loss": 0.0377, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 211096912.0, "step": 254 }, { "entropy": 0.501220703125, "epoch": 2.865168539325843, "grad_norm": 3.5065827045256714, "learning_rate": 2.908671506446566e-06, "loss": 0.0402, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 211917717.0, "step": 255 }, { "entropy": 0.5018081665039062, "epoch": 2.8764044943820224, "grad_norm": 4.772724836206971, "learning_rate": 2.8933810483947156e-06, "loss": 0.0387, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 212760942.0, "step": 256 }, { "entropy": 0.5093460083007812, "epoch": 2.8876404494382024, "grad_norm": 5.280967737032615, "learning_rate": 2.878075486207452e-06, "loss": 0.042, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 213577211.0, "step": 257 }, { "entropy": 0.5251312255859375, "epoch": 2.898876404494382, "grad_norm": 3.0721132607558403, "learning_rate": 2.8627554075523426e-06, "loss": 0.0276, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 214374237.0, "step": 258 }, { "entropy": 0.509552001953125, "epoch": 2.9101123595505616, "grad_norm": 3.418094859784156, "learning_rate": 2.8474214006543255e-06, "loss": 0.0304, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 215204658.0, "step": 259 }, { "entropy": 0.49858856201171875, "epoch": 2.9213483146067416, "grad_norm": 3.4215791011808947, "learning_rate": 2.832074054273121e-06, "loss": 0.0337, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 216059885.0, "step": 260 }, { "entropy": 0.5129241943359375, "epoch": 2.932584269662921, "grad_norm": 3.013036330316608, "learning_rate": 2.8167139576806306e-06, "loss": 0.028, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 216892678.0, "step": 261 }, { "entropy": 0.49480438232421875, "epoch": 2.943820224719101, "grad_norm": 3.191221312139142, "learning_rate": 2.8013417006383078e-06, "loss": 0.0291, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 217739914.0, "step": 262 }, { "entropy": 0.4976348876953125, "epoch": 2.955056179775281, "grad_norm": 2.906401361021095, "learning_rate": 2.7859578733745153e-06, "loss": 0.0178, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 218586521.0, "step": 263 }, { "entropy": 0.48983001708984375, "epoch": 2.966292134831461, "grad_norm": 4.7556975923651335, "learning_rate": 2.7705630665618605e-06, "loss": 0.0232, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 219419106.0, "step": 264 }, { "entropy": 0.4918670654296875, "epoch": 2.9775280898876404, "grad_norm": 5.022257244997175, "learning_rate": 2.755157871294521e-06, "loss": 0.0283, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 220252072.0, "step": 265 }, { "entropy": 0.487274169921875, "epoch": 2.98876404494382, "grad_norm": 8.564888081434068, "learning_rate": 2.7397428790655447e-06, "loss": 0.0497, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 221111708.0, "step": 266 }, { "entropy": 0.5102157592773438, "epoch": 3.0, "grad_norm": 4.460036763086264, "learning_rate": 2.7243186817441403e-06, "loss": 0.0385, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 221886114.0, "step": 267 }, { "entropy": 0.48343658447265625, "epoch": 3.0112359550561796, "grad_norm": 9.20662230947583, "learning_rate": 2.708885871552954e-06, "loss": 0.021, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 222747070.0, "step": 268 }, { "entropy": 0.48952484130859375, "epoch": 3.0224719101123596, "grad_norm": 5.187135606996883, "learning_rate": 2.693445041045326e-06, "loss": 0.0245, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 223568902.0, "step": 269 }, { "entropy": 0.47576141357421875, "epoch": 3.033707865168539, "grad_norm": 3.251501070378064, "learning_rate": 2.6779967830825454e-06, "loss": 0.0214, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 224413441.0, "step": 270 }, { "entropy": 0.48839569091796875, "epoch": 3.044943820224719, "grad_norm": 3.429178666958642, "learning_rate": 2.6625416908110825e-06, "loss": 0.0204, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 225228850.0, "step": 271 }, { "entropy": 0.4837799072265625, "epoch": 3.056179775280899, "grad_norm": 5.639294809675952, "learning_rate": 2.647080357639813e-06, "loss": 0.0403, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 226050198.0, "step": 272 }, { "entropy": 0.48087310791015625, "epoch": 3.067415730337079, "grad_norm": 4.675070193494617, "learning_rate": 2.6316133772172403e-06, "loss": 0.0288, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 226889816.0, "step": 273 }, { "entropy": 0.49853515625, "epoch": 3.0786516853932584, "grad_norm": 4.8721179152633445, "learning_rate": 2.616141343408696e-06, "loss": 0.033, "mean_token_accuracy": 0.989583333954215, "num_tokens": 227673295.0, "step": 274 }, { "entropy": 0.49887847900390625, "epoch": 3.0898876404494384, "grad_norm": 3.780990923057449, "learning_rate": 2.6006648502735384e-06, "loss": 0.0237, "mean_token_accuracy": 0.989583333954215, "num_tokens": 228493992.0, "step": 275 }, { "entropy": 0.48746490478515625, "epoch": 3.101123595505618, "grad_norm": 3.463349045434377, "learning_rate": 2.5851844920423473e-06, "loss": 0.0172, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 229329744.0, "step": 276 }, { "entropy": 0.5026702880859375, "epoch": 3.1123595505617976, "grad_norm": 3.277726971307229, "learning_rate": 2.569700863094104e-06, "loss": 0.0256, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 230150690.0, "step": 277 }, { "entropy": 0.489990234375, "epoch": 3.1235955056179776, "grad_norm": 6.501478175738681, "learning_rate": 2.554214557933372e-06, "loss": 0.0306, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 230974007.0, "step": 278 }, { "entropy": 0.5017242431640625, "epoch": 3.134831460674157, "grad_norm": 6.205617015245652, "learning_rate": 2.5387261711674695e-06, "loss": 0.0327, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 231788328.0, "step": 279 }, { "entropy": 0.49090576171875, "epoch": 3.146067415730337, "grad_norm": 3.537442099096884, "learning_rate": 2.5232362974836394e-06, "loss": 0.0204, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 232603310.0, "step": 280 }, { "entropy": 0.48221588134765625, "epoch": 3.157303370786517, "grad_norm": 5.701029872012899, "learning_rate": 2.507745531626215e-06, "loss": 0.0432, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 233460022.0, "step": 281 }, { "entropy": 0.4851226806640625, "epoch": 3.168539325842697, "grad_norm": 6.175135064457726, "learning_rate": 2.4922544683737857e-06, "loss": 0.027, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 234261620.0, "step": 282 }, { "entropy": 0.4852294921875, "epoch": 3.1797752808988764, "grad_norm": 3.6828139032465312, "learning_rate": 2.4767637025163614e-06, "loss": 0.0199, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 235088969.0, "step": 283 }, { "entropy": 0.48902130126953125, "epoch": 3.191011235955056, "grad_norm": 3.512834879515185, "learning_rate": 2.461273828832531e-06, "loss": 0.0258, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 235913452.0, "step": 284 }, { "entropy": 0.4867401123046875, "epoch": 3.202247191011236, "grad_norm": 4.189500977619063, "learning_rate": 2.445785442066628e-06, "loss": 0.0245, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 236724207.0, "step": 285 }, { "entropy": 0.4837646484375, "epoch": 3.2134831460674156, "grad_norm": 4.540256503622475, "learning_rate": 2.4302991369058963e-06, "loss": 0.0245, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 237532582.0, "step": 286 }, { "entropy": 0.48180389404296875, "epoch": 3.2247191011235956, "grad_norm": 3.461981571769003, "learning_rate": 2.414815507957653e-06, "loss": 0.0316, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 238364201.0, "step": 287 }, { "entropy": 0.4748687744140625, "epoch": 3.235955056179775, "grad_norm": 3.5186529222593323, "learning_rate": 2.399335149726463e-06, "loss": 0.0254, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 239227490.0, "step": 288 }, { "entropy": 0.48622894287109375, "epoch": 3.247191011235955, "grad_norm": 2.4622140466695583, "learning_rate": 2.3838586565913053e-06, "loss": 0.0219, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 240071974.0, "step": 289 }, { "entropy": 0.49408721923828125, "epoch": 3.258426966292135, "grad_norm": 2.7289333058731686, "learning_rate": 2.3683866227827605e-06, "loss": 0.0139, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 240885558.0, "step": 290 }, { "entropy": 0.4902801513671875, "epoch": 3.2696629213483144, "grad_norm": 3.099007161228062, "learning_rate": 2.352919642360188e-06, "loss": 0.0209, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 241726366.0, "step": 291 }, { "entropy": 0.48973846435546875, "epoch": 3.2808988764044944, "grad_norm": 3.073212878993699, "learning_rate": 2.3374583091889188e-06, "loss": 0.0205, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 242540731.0, "step": 292 }, { "entropy": 0.49147796630859375, "epoch": 3.292134831460674, "grad_norm": 3.4111012195131845, "learning_rate": 2.322003216917455e-06, "loss": 0.025, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 243359456.0, "step": 293 }, { "entropy": 0.48925018310546875, "epoch": 3.303370786516854, "grad_norm": 3.398747668599176, "learning_rate": 2.3065549589546747e-06, "loss": 0.0174, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 244207041.0, "step": 294 }, { "entropy": 0.48650360107421875, "epoch": 3.3146067415730336, "grad_norm": 3.7147965496555604, "learning_rate": 2.2911141284470466e-06, "loss": 0.0226, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 245038695.0, "step": 295 }, { "entropy": 0.48868560791015625, "epoch": 3.3258426966292136, "grad_norm": 3.6350776720835167, "learning_rate": 2.27568131825586e-06, "loss": 0.0241, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 245862029.0, "step": 296 }, { "entropy": 0.5014572143554688, "epoch": 3.337078651685393, "grad_norm": 1.9496739139045483, "learning_rate": 2.260257120934456e-06, "loss": 0.008, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 246649993.0, "step": 297 }, { "entropy": 0.486724853515625, "epoch": 3.348314606741573, "grad_norm": 3.179926005223232, "learning_rate": 2.2448421287054794e-06, "loss": 0.014, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 247496847.0, "step": 298 }, { "entropy": 0.49832916259765625, "epoch": 3.359550561797753, "grad_norm": 2.4689200808317735, "learning_rate": 2.229436933438141e-06, "loss": 0.0206, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 248289120.0, "step": 299 }, { "entropy": 0.486846923828125, "epoch": 3.370786516853933, "grad_norm": 2.8695936224449703, "learning_rate": 2.214042126625486e-06, "loss": 0.0152, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 249116159.0, "step": 300 }, { "entropy": 0.4945526123046875, "epoch": 3.3820224719101124, "grad_norm": 2.8107533462269965, "learning_rate": 2.1986582993616926e-06, "loss": 0.0239, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 249937868.0, "step": 301 }, { "entropy": 0.47344970703125, "epoch": 3.393258426966292, "grad_norm": 3.2328511428876383, "learning_rate": 2.1832860423193703e-06, "loss": 0.0137, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 250814526.0, "step": 302 }, { "entropy": 0.4880523681640625, "epoch": 3.404494382022472, "grad_norm": 4.262015191629379, "learning_rate": 2.1679259457268796e-06, "loss": 0.0106, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 251627640.0, "step": 303 }, { "entropy": 0.48815155029296875, "epoch": 3.4157303370786516, "grad_norm": 2.8300412513355524, "learning_rate": 2.1525785993456753e-06, "loss": 0.0142, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 252464366.0, "step": 304 }, { "entropy": 0.5023727416992188, "epoch": 3.4269662921348316, "grad_norm": 4.510851185477219, "learning_rate": 2.1372445924476578e-06, "loss": 0.0172, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 253246625.0, "step": 305 }, { "entropy": 0.47849273681640625, "epoch": 3.438202247191011, "grad_norm": 8.834249485370332, "learning_rate": 2.1219245137925482e-06, "loss": 0.0192, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 254096661.0, "step": 306 }, { "entropy": 0.4962005615234375, "epoch": 3.449438202247191, "grad_norm": 3.6441410619934462, "learning_rate": 2.1066189516052848e-06, "loss": 0.0286, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 254907096.0, "step": 307 }, { "entropy": 0.477813720703125, "epoch": 3.460674157303371, "grad_norm": 6.015307012511412, "learning_rate": 2.0913284935534345e-06, "loss": 0.014, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 255741853.0, "step": 308 }, { "entropy": 0.48021697998046875, "epoch": 3.4719101123595504, "grad_norm": 6.152174736765347, "learning_rate": 2.0760537267246316e-06, "loss": 0.0229, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 256570850.0, "step": 309 }, { "entropy": 0.48302459716796875, "epoch": 3.4831460674157304, "grad_norm": 2.359802121670126, "learning_rate": 2.0607952376040355e-06, "loss": 0.0095, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 257402947.0, "step": 310 }, { "entropy": 0.47766876220703125, "epoch": 3.49438202247191, "grad_norm": 3.3257588322733906, "learning_rate": 2.0455536120518094e-06, "loss": 0.0104, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 258264842.0, "step": 311 }, { "entropy": 0.4766845703125, "epoch": 3.50561797752809, "grad_norm": 1.603503598370096, "learning_rate": 2.0303294352806313e-06, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 259122986.0, "step": 312 }, { "entropy": 0.47867584228515625, "epoch": 3.5168539325842696, "grad_norm": 4.431892256593624, "learning_rate": 2.0151232918332186e-06, "loss": 0.0131, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 259973474.0, "step": 313 }, { "entropy": 0.48300933837890625, "epoch": 3.5280898876404496, "grad_norm": 4.326635672396205, "learning_rate": 1.9999357655598894e-06, "loss": 0.0242, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 260798300.0, "step": 314 }, { "entropy": 0.48386383056640625, "epoch": 3.539325842696629, "grad_norm": 4.607397469663412, "learning_rate": 1.9847674395961407e-06, "loss": 0.0193, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 261627223.0, "step": 315 }, { "entropy": 0.4709930419921875, "epoch": 3.550561797752809, "grad_norm": 2.6921209318158956, "learning_rate": 1.9696188963402613e-06, "loss": 0.0131, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 262492309.0, "step": 316 }, { "entropy": 0.49495697021484375, "epoch": 3.561797752808989, "grad_norm": 4.038453476304001, "learning_rate": 1.9544907174309693e-06, "loss": 0.0124, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 263296839.0, "step": 317 }, { "entropy": 0.47100830078125, "epoch": 3.5730337078651684, "grad_norm": 3.907050390180474, "learning_rate": 1.939383483725079e-06, "loss": 0.011, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 264166709.0, "step": 318 }, { "entropy": 0.48455047607421875, "epoch": 3.5842696629213484, "grad_norm": 4.629582756662647, "learning_rate": 1.9242977752752006e-06, "loss": 0.0209, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 265009474.0, "step": 319 }, { "entropy": 0.492584228515625, "epoch": 3.595505617977528, "grad_norm": 3.206501722582584, "learning_rate": 1.909234171307466e-06, "loss": 0.0106, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 265828804.0, "step": 320 }, { "entropy": 0.48418426513671875, "epoch": 3.606741573033708, "grad_norm": 2.9833193666984594, "learning_rate": 1.8941932501992915e-06, "loss": 0.0133, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 266691166.0, "step": 321 }, { "entropy": 0.48262786865234375, "epoch": 3.6179775280898876, "grad_norm": 2.9165593805140437, "learning_rate": 1.879175589457168e-06, "loss": 0.0154, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 267543284.0, "step": 322 }, { "entropy": 0.4853363037109375, "epoch": 3.629213483146067, "grad_norm": 3.4411795063924315, "learning_rate": 1.8641817656944894e-06, "loss": 0.009, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 268391851.0, "step": 323 }, { "entropy": 0.5037002563476562, "epoch": 3.640449438202247, "grad_norm": 7.219711770249014, "learning_rate": 1.8492123546094132e-06, "loss": 0.0161, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 269181570.0, "step": 324 }, { "entropy": 0.48888397216796875, "epoch": 3.6516853932584272, "grad_norm": 2.760692104580548, "learning_rate": 1.8342679309627545e-06, "loss": 0.0196, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 269996062.0, "step": 325 }, { "entropy": 0.48822021484375, "epoch": 3.662921348314607, "grad_norm": 1.9046282601388171, "learning_rate": 1.8193490685559179e-06, "loss": 0.0054, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 270800147.0, "step": 326 }, { "entropy": 0.49137115478515625, "epoch": 3.6741573033707864, "grad_norm": 3.419975494500157, "learning_rate": 1.8044563402088686e-06, "loss": 0.0194, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 271644718.0, "step": 327 }, { "entropy": 0.488311767578125, "epoch": 3.6853932584269664, "grad_norm": 2.7970940768512844, "learning_rate": 1.7895903177381351e-06, "loss": 0.0063, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 272494584.0, "step": 328 }, { "entropy": 0.46905517578125, "epoch": 3.696629213483146, "grad_norm": 3.2974393189689017, "learning_rate": 1.7747515719348551e-06, "loss": 0.03, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 273379667.0, "step": 329 }, { "entropy": 0.5005645751953125, "epoch": 3.7078651685393256, "grad_norm": 3.6205966064162824, "learning_rate": 1.759940672542862e-06, "loss": 0.016, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 274191441.0, "step": 330 }, { "entropy": 0.479522705078125, "epoch": 3.7191011235955056, "grad_norm": 3.725884043346435, "learning_rate": 1.7451581882368052e-06, "loss": 0.0128, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 275059931.0, "step": 331 }, { "entropy": 0.49169158935546875, "epoch": 3.7303370786516856, "grad_norm": 2.6597945294485483, "learning_rate": 1.7304046866003183e-06, "loss": 0.0067, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 275888389.0, "step": 332 }, { "entropy": 0.49629974365234375, "epoch": 3.741573033707865, "grad_norm": 2.433832686288709, "learning_rate": 1.7156807341042242e-06, "loss": 0.006, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 276689357.0, "step": 333 }, { "entropy": 0.494659423828125, "epoch": 3.752808988764045, "grad_norm": 3.7349731267491526, "learning_rate": 1.700986896084787e-06, "loss": 0.0164, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 277503649.0, "step": 334 }, { "entropy": 0.4885711669921875, "epoch": 3.764044943820225, "grad_norm": 3.3812622930224125, "learning_rate": 1.686323736722006e-06, "loss": 0.0149, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 278326436.0, "step": 335 }, { "entropy": 0.5007553100585938, "epoch": 3.7752808988764044, "grad_norm": 1.6952250172415233, "learning_rate": 1.671691819017951e-06, "loss": 0.0071, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 279115112.0, "step": 336 }, { "entropy": 0.5091018676757812, "epoch": 3.7865168539325844, "grad_norm": 2.403559137434532, "learning_rate": 1.6570917047751465e-06, "loss": 0.0153, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 279883193.0, "step": 337 }, { "entropy": 0.4808349609375, "epoch": 3.797752808988764, "grad_norm": 5.835484541379819, "learning_rate": 1.642523954575003e-06, "loss": 0.0224, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 280724938.0, "step": 338 }, { "entropy": 0.48146820068359375, "epoch": 3.808988764044944, "grad_norm": 2.8914536871971994, "learning_rate": 1.6279891277562896e-06, "loss": 0.0105, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 281570918.0, "step": 339 }, { "entropy": 0.49124908447265625, "epoch": 3.8202247191011236, "grad_norm": 2.7829856607409718, "learning_rate": 1.613487782393661e-06, "loss": 0.0149, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 282400350.0, "step": 340 }, { "entropy": 0.46373748779296875, "epoch": 3.831460674157303, "grad_norm": 2.70570741985553, "learning_rate": 1.5990204752762273e-06, "loss": 0.0102, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 283283614.0, "step": 341 }, { "entropy": 0.48815155029296875, "epoch": 3.842696629213483, "grad_norm": 3.3600186528383973, "learning_rate": 1.5845877618861769e-06, "loss": 0.007, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 284125946.0, "step": 342 }, { "entropy": 0.48841094970703125, "epoch": 3.853932584269663, "grad_norm": 2.7396108465149895, "learning_rate": 1.5701901963774504e-06, "loss": 0.0041, "mean_token_accuracy": 1.0, "num_tokens": 284927663.0, "step": 343 }, { "entropy": 0.502349853515625, "epoch": 3.865168539325843, "grad_norm": 2.5440235934327706, "learning_rate": 1.555828331554457e-06, "loss": 0.0053, "mean_token_accuracy": 1.0, "num_tokens": 285727563.0, "step": 344 }, { "entropy": 0.47910308837890625, "epoch": 3.8764044943820224, "grad_norm": 2.1966498948802724, "learning_rate": 1.5415027188508574e-06, "loss": 0.01, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 286574749.0, "step": 345 }, { "entropy": 0.49324798583984375, "epoch": 3.8876404494382024, "grad_norm": 2.8523138768326963, "learning_rate": 1.5272139083083865e-06, "loss": 0.0042, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 287366871.0, "step": 346 }, { "entropy": 0.48845672607421875, "epoch": 3.898876404494382, "grad_norm": 2.874883145772125, "learning_rate": 1.5129624485557331e-06, "loss": 0.0126, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 288166095.0, "step": 347 }, { "entropy": 0.471282958984375, "epoch": 3.9101123595505616, "grad_norm": 2.916158906263163, "learning_rate": 1.4987488867874798e-06, "loss": 0.007, "mean_token_accuracy": 1.0, "num_tokens": 289017040.0, "step": 348 }, { "entropy": 0.467559814453125, "epoch": 3.9213483146067416, "grad_norm": 8.16739036316256, "learning_rate": 1.4845737687430875e-06, "loss": 0.0189, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 289874408.0, "step": 349 }, { "entropy": 0.4731292724609375, "epoch": 3.932584269662921, "grad_norm": 5.468947396548159, "learning_rate": 1.4704376386859447e-06, "loss": 0.0146, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 290697280.0, "step": 350 }, { "entropy": 0.4630584716796875, "epoch": 3.943820224719101, "grad_norm": 3.8244914261266194, "learning_rate": 1.4563410393824701e-06, "loss": 0.0131, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 291557021.0, "step": 351 }, { "entropy": 0.4810333251953125, "epoch": 3.955056179775281, "grad_norm": 4.933507820150018, "learning_rate": 1.4422845120812718e-06, "loss": 0.0068, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 292378445.0, "step": 352 }, { "entropy": 0.46092987060546875, "epoch": 3.966292134831461, "grad_norm": 4.751144080267217, "learning_rate": 1.4282685964923643e-06, "loss": 0.0249, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 293233946.0, "step": 353 }, { "entropy": 0.478057861328125, "epoch": 3.9775280898876404, "grad_norm": 3.6775002976703437, "learning_rate": 1.4142938307664505e-06, "loss": 0.0102, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 294052224.0, "step": 354 }, { "entropy": 0.46146392822265625, "epoch": 3.98876404494382, "grad_norm": 2.7844187721067266, "learning_rate": 1.400360751474253e-06, "loss": 0.0116, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 294920656.0, "step": 355 }, { "entropy": 0.47667694091796875, "epoch": 4.0, "grad_norm": 0.627168832321652, "learning_rate": 1.3864698935859153e-06, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 295750761.0, "step": 356 }, { "entropy": 0.47129058837890625, "epoch": 4.01123595505618, "grad_norm": 3.237981990670737, "learning_rate": 1.3726217904504636e-06, "loss": 0.0105, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 296580033.0, "step": 357 }, { "entropy": 0.47356414794921875, "epoch": 4.022471910112359, "grad_norm": 0.6977481548583607, "learning_rate": 1.3588169737753258e-06, "loss": 0.0027, "mean_token_accuracy": 1.0, "num_tokens": 297422608.0, "step": 358 }, { "entropy": 0.46782684326171875, "epoch": 4.033707865168539, "grad_norm": 3.064591085863403, "learning_rate": 1.3450559736059126e-06, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 298269567.0, "step": 359 }, { "entropy": 0.47429656982421875, "epoch": 4.044943820224719, "grad_norm": 2.069688101699077, "learning_rate": 1.3313393183052747e-06, "loss": 0.0057, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 299104214.0, "step": 360 }, { "entropy": 0.4663238525390625, "epoch": 4.056179775280899, "grad_norm": 3.907869758579254, "learning_rate": 1.3176675345338085e-06, "loss": 0.0072, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 299969102.0, "step": 361 }, { "entropy": 0.4740753173828125, "epoch": 4.067415730337078, "grad_norm": 0.6875226849852165, "learning_rate": 1.304041147229037e-06, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 300808720.0, "step": 362 }, { "entropy": 0.4696197509765625, "epoch": 4.078651685393258, "grad_norm": 1.2333744557990824, "learning_rate": 1.2904606795854562e-06, "loss": 0.0026, "mean_token_accuracy": 1.0, "num_tokens": 301660462.0, "step": 363 }, { "entropy": 0.4776153564453125, "epoch": 4.089887640449438, "grad_norm": 3.4213409887705275, "learning_rate": 1.276926653034444e-06, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 302476904.0, "step": 364 }, { "entropy": 0.486846923828125, "epoch": 4.101123595505618, "grad_norm": 1.6568151958809272, "learning_rate": 1.2634395872242433e-06, "loss": 0.003, "mean_token_accuracy": 1.0, "num_tokens": 303281350.0, "step": 365 }, { "entropy": 0.4847259521484375, "epoch": 4.112359550561798, "grad_norm": 1.43202146868413, "learning_rate": 1.2500000000000007e-06, "loss": 0.0089, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 304106181.0, "step": 366 }, { "entropy": 0.4637298583984375, "epoch": 4.123595505617978, "grad_norm": 4.19515608091639, "learning_rate": 1.2366084073838963e-06, "loss": 0.0122, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 304980353.0, "step": 367 }, { "entropy": 0.46537017822265625, "epoch": 4.134831460674158, "grad_norm": 10.286022538713485, "learning_rate": 1.223265323555323e-06, "loss": 0.0153, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 305825415.0, "step": 368 }, { "entropy": 0.486083984375, "epoch": 4.146067415730337, "grad_norm": 4.05182835847461, "learning_rate": 1.2099712608311426e-06, "loss": 0.0068, "mean_token_accuracy": 1.0, "num_tokens": 306605981.0, "step": 369 }, { "entropy": 0.47647857666015625, "epoch": 4.157303370786517, "grad_norm": 7.930574622117007, "learning_rate": 1.1967267296460208e-06, "loss": 0.0138, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 307430230.0, "step": 370 }, { "entropy": 0.471038818359375, "epoch": 4.168539325842697, "grad_norm": 4.274711351283673, "learning_rate": 1.183532238532826e-06, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 308274797.0, "step": 371 }, { "entropy": 0.45969390869140625, "epoch": 4.179775280898877, "grad_norm": 4.565232175830148, "learning_rate": 1.1703882941031012e-06, "loss": 0.0055, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 309133614.0, "step": 372 }, { "entropy": 0.47374725341796875, "epoch": 4.191011235955056, "grad_norm": 3.485577354310045, "learning_rate": 1.157295401027616e-06, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 309964446.0, "step": 373 }, { "entropy": 0.4734649658203125, "epoch": 4.202247191011236, "grad_norm": 4.503566249088593, "learning_rate": 1.1442540620169906e-06, "loss": 0.008, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 310780776.0, "step": 374 }, { "entropy": 0.47723388671875, "epoch": 4.213483146067416, "grad_norm": 3.683632589736363, "learning_rate": 1.131264777802387e-06, "loss": 0.0101, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 311606897.0, "step": 375 }, { "entropy": 0.46724700927734375, "epoch": 4.224719101123595, "grad_norm": 7.188150042856905, "learning_rate": 1.1183280471162916e-06, "loss": 0.0165, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 312474482.0, "step": 376 }, { "entropy": 0.4607391357421875, "epoch": 4.235955056179775, "grad_norm": 1.8555417077203353, "learning_rate": 1.1054443666733586e-06, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 313320360.0, "step": 377 }, { "entropy": 0.4653167724609375, "epoch": 4.247191011235955, "grad_norm": 3.089259401627071, "learning_rate": 1.0926142311513453e-06, "loss": 0.0061, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 314179757.0, "step": 378 }, { "entropy": 0.48670196533203125, "epoch": 4.258426966292135, "grad_norm": 0.8449253758558445, "learning_rate": 1.079838133172111e-06, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 314983724.0, "step": 379 }, { "entropy": 0.46709442138671875, "epoch": 4.269662921348314, "grad_norm": 2.5990482901945238, "learning_rate": 1.0671165632827097e-06, "loss": 0.0051, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 315817708.0, "step": 380 }, { "entropy": 0.45947265625, "epoch": 4.280898876404494, "grad_norm": 4.100355461394568, "learning_rate": 1.0544500099365515e-06, "loss": 0.0055, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 316672629.0, "step": 381 }, { "entropy": 0.4636688232421875, "epoch": 4.292134831460674, "grad_norm": 1.3481744831139768, "learning_rate": 1.0418389594746462e-06, "loss": 0.0027, "mean_token_accuracy": 1.0, "num_tokens": 317519153.0, "step": 382 }, { "entropy": 0.46529388427734375, "epoch": 4.303370786516854, "grad_norm": 0.45901882778115105, "learning_rate": 1.0292838961069348e-06, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 318332221.0, "step": 383 }, { "entropy": 0.4586639404296875, "epoch": 4.314606741573034, "grad_norm": 2.721713545239488, "learning_rate": 1.0167853018936955e-06, "loss": 0.0083, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 319192190.0, "step": 384 }, { "entropy": 0.4707489013671875, "epoch": 4.325842696629214, "grad_norm": 0.43133743566324173, "learning_rate": 1.0043436567270313e-06, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 320016084.0, "step": 385 }, { "entropy": 0.4701690673828125, "epoch": 4.337078651685394, "grad_norm": 1.8268064344668493, "learning_rate": 9.919594383124512e-07, "loss": 0.003, "mean_token_accuracy": 1.0, "num_tokens": 320849099.0, "step": 386 }, { "entropy": 0.4613800048828125, "epoch": 4.348314606741573, "grad_norm": 6.220960050767719, "learning_rate": 9.796331221505235e-07, "loss": 0.0249, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 321673985.0, "step": 387 }, { "entropy": 0.4572601318359375, "epoch": 4.359550561797753, "grad_norm": 2.9553010407276807, "learning_rate": 9.673651815186186e-07, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 322512898.0, "step": 388 }, { "entropy": 0.4861297607421875, "epoch": 4.370786516853933, "grad_norm": 0.5559132664588449, "learning_rate": 9.551560874527385e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 323304527.0, "step": 389 }, { "entropy": 0.47039794921875, "epoch": 4.382022471910112, "grad_norm": 2.5011975751042828, "learning_rate": 9.43006308729432e-07, "loss": 0.0045, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 324144587.0, "step": 390 }, { "entropy": 0.464385986328125, "epoch": 4.393258426966292, "grad_norm": 3.587804289407284, "learning_rate": 9.309163118477954e-07, "loss": 0.007, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 324983109.0, "step": 391 }, { "entropy": 0.4694671630859375, "epoch": 4.404494382022472, "grad_norm": 2.6638824884363426, "learning_rate": 9.188865610115572e-07, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 325801705.0, "step": 392 }, { "entropy": 0.47499847412109375, "epoch": 4.415730337078652, "grad_norm": 6.179339696907779, "learning_rate": 9.069175181112597e-07, "loss": 0.0059, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 326592032.0, "step": 393 }, { "entropy": 0.4689788818359375, "epoch": 4.426966292134831, "grad_norm": 6.644329248384315, "learning_rate": 8.950096427065232e-07, "loss": 0.0106, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 327393253.0, "step": 394 }, { "entropy": 0.47740936279296875, "epoch": 4.438202247191011, "grad_norm": 1.8175645575817623, "learning_rate": 8.831633920083968e-07, "loss": 0.0032, "mean_token_accuracy": 1.0, "num_tokens": 328202359.0, "step": 395 }, { "entropy": 0.4592132568359375, "epoch": 4.449438202247191, "grad_norm": 1.984527870774781, "learning_rate": 8.713792208618097e-07, "loss": 0.0154, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 329038112.0, "step": 396 }, { "entropy": 0.47894287109375, "epoch": 4.460674157303371, "grad_norm": 0.2554716930753162, "learning_rate": 8.596575817281036e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 329827400.0, "step": 397 }, { "entropy": 0.47702789306640625, "epoch": 4.47191011235955, "grad_norm": 1.639558137815307, "learning_rate": 8.479989246676595e-07, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 330624750.0, "step": 398 }, { "entropy": 0.4718017578125, "epoch": 4.48314606741573, "grad_norm": 1.3769833756904124, "learning_rate": 8.36403697322618e-07, "loss": 0.0089, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 331441801.0, "step": 399 }, { "entropy": 0.47125244140625, "epoch": 4.49438202247191, "grad_norm": 1.3987859409727157, "learning_rate": 8.248723448996942e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 332277359.0, "step": 400 }, { "entropy": 0.48626708984375, "epoch": 4.50561797752809, "grad_norm": 2.9221300133808876, "learning_rate": 8.134053101530814e-07, "loss": 0.0054, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 333072813.0, "step": 401 }, { "entropy": 0.469268798828125, "epoch": 4.51685393258427, "grad_norm": 1.4654016785797133, "learning_rate": 8.020030333674498e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 333897777.0, "step": 402 }, { "entropy": 0.4660797119140625, "epoch": 4.52808988764045, "grad_norm": 4.200397575849315, "learning_rate": 7.906659523410445e-07, "loss": 0.0085, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 334695942.0, "step": 403 }, { "entropy": 0.46422576904296875, "epoch": 4.539325842696629, "grad_norm": 0.2658219325430272, "learning_rate": 7.793945023688756e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 335532907.0, "step": 404 }, { "entropy": 0.47092437744140625, "epoch": 4.550561797752809, "grad_norm": 3.1678718102940757, "learning_rate": 7.681891162260016e-07, "loss": 0.0059, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 336377217.0, "step": 405 }, { "entropy": 0.47515869140625, "epoch": 4.561797752808989, "grad_norm": 3.518189377752025, "learning_rate": 7.570502241509162e-07, "loss": 0.0054, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 337185785.0, "step": 406 }, { "entropy": 0.46327972412109375, "epoch": 4.573033707865169, "grad_norm": 0.9320317151845362, "learning_rate": 7.459782538290289e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 338041393.0, "step": 407 }, { "entropy": 0.47505950927734375, "epoch": 4.584269662921348, "grad_norm": 3.001322377230109, "learning_rate": 7.349736303762392e-07, "loss": 0.0056, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 338852941.0, "step": 408 }, { "entropy": 0.470977783203125, "epoch": 4.595505617977528, "grad_norm": 1.8828580251184197, "learning_rate": 7.240367763226214e-07, "loss": 0.0037, "mean_token_accuracy": 1.0, "num_tokens": 339676128.0, "step": 409 }, { "entropy": 0.47037506103515625, "epoch": 4.606741573033708, "grad_norm": 1.6546966909357694, "learning_rate": 7.13168111596193e-07, "loss": 0.0049, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 340476500.0, "step": 410 }, { "entropy": 0.47141265869140625, "epoch": 4.617977528089888, "grad_norm": 3.08125141351816, "learning_rate": 7.023680535067998e-07, "loss": 0.0098, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 341300800.0, "step": 411 }, { "entropy": 0.46193695068359375, "epoch": 4.629213483146067, "grad_norm": 1.4950626456597516, "learning_rate": 6.916370167300846e-07, "loss": 0.0033, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 342142892.0, "step": 412 }, { "entropy": 0.463287353515625, "epoch": 4.640449438202247, "grad_norm": 0.9730184386637767, "learning_rate": 6.809754132915722e-07, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 342972325.0, "step": 413 }, { "entropy": 0.45442962646484375, "epoch": 4.651685393258427, "grad_norm": 2.602421273978658, "learning_rate": 6.70383652550847e-07, "loss": 0.0042, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 343795139.0, "step": 414 }, { "entropy": 0.4595947265625, "epoch": 4.662921348314606, "grad_norm": 2.3960654933272565, "learning_rate": 6.59862141185832e-07, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 344644354.0, "step": 415 }, { "entropy": 0.45831298828125, "epoch": 4.674157303370786, "grad_norm": 2.8939251687083365, "learning_rate": 6.494112831771801e-07, "loss": 0.0063, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 345494609.0, "step": 416 }, { "entropy": 0.45064544677734375, "epoch": 4.685393258426966, "grad_norm": 3.5912419311907327, "learning_rate": 6.390314797927601e-07, "loss": 0.003, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 346343237.0, "step": 417 }, { "entropy": 0.4564056396484375, "epoch": 4.696629213483146, "grad_norm": 0.40858073845148135, "learning_rate": 6.28723129572247e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 347197500.0, "step": 418 }, { "entropy": 0.46680450439453125, "epoch": 4.707865168539326, "grad_norm": 0.251709990148425, "learning_rate": 6.184866283118254e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 348020958.0, "step": 419 }, { "entropy": 0.4546051025390625, "epoch": 4.719101123595506, "grad_norm": 0.25235852831165106, "learning_rate": 6.083223690489901e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 348861858.0, "step": 420 }, { "entropy": 0.46105194091796875, "epoch": 4.730337078651686, "grad_norm": 3.858488259482445, "learning_rate": 5.982307420474501e-07, "loss": 0.0048, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 349689679.0, "step": 421 }, { "entropy": 0.46041107177734375, "epoch": 4.741573033707866, "grad_norm": 1.7123619920632656, "learning_rate": 5.882121347821537e-07, "loss": 0.002, "mean_token_accuracy": 1.0, "num_tokens": 350521593.0, "step": 422 }, { "entropy": 0.464263916015625, "epoch": 4.752808988764045, "grad_norm": 1.354957155560715, "learning_rate": 5.782669319244058e-07, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 351359395.0, "step": 423 }, { "entropy": 0.45088958740234375, "epoch": 4.764044943820225, "grad_norm": 2.9102479875837206, "learning_rate": 5.683955153270959e-07, "loss": 0.0039, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 352212880.0, "step": 424 }, { "entropy": 0.460906982421875, "epoch": 4.775280898876405, "grad_norm": 2.001881536246789, "learning_rate": 5.585982640100416e-07, "loss": 0.0049, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 353039297.0, "step": 425 }, { "entropy": 0.45557403564453125, "epoch": 4.786516853932584, "grad_norm": 0.27617622178466006, "learning_rate": 5.488755541454335e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 353892046.0, "step": 426 }, { "entropy": 0.462371826171875, "epoch": 4.797752808988764, "grad_norm": 0.2591913786994451, "learning_rate": 5.39227759043392e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 354714658.0, "step": 427 }, { "entropy": 0.4490509033203125, "epoch": 4.808988764044944, "grad_norm": 2.4389875882288288, "learning_rate": 5.296552491376322e-07, "loss": 0.0085, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 355576030.0, "step": 428 }, { "entropy": 0.45798492431640625, "epoch": 4.820224719101123, "grad_norm": 0.6433650960240046, "learning_rate": 5.201583919712441e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 356392627.0, "step": 429 }, { "entropy": 0.4479827880859375, "epoch": 4.831460674157303, "grad_norm": 0.44501458316967063, "learning_rate": 5.107375521825791e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 357241360.0, "step": 430 }, { "entropy": 0.44903564453125, "epoch": 4.842696629213483, "grad_norm": 5.289790657216879, "learning_rate": 5.013930914912477e-07, "loss": 0.0152, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 358079861.0, "step": 431 }, { "entropy": 0.44850921630859375, "epoch": 4.853932584269663, "grad_norm": 0.485665337525698, "learning_rate": 4.921253686842323e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 358944222.0, "step": 432 }, { "entropy": 0.4571380615234375, "epoch": 4.865168539325842, "grad_norm": 2.5252971133780107, "learning_rate": 4.829347396021142e-07, "loss": 0.0141, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 359793305.0, "step": 433 }, { "entropy": 0.48027801513671875, "epoch": 4.876404494382022, "grad_norm": 0.6691097376008813, "learning_rate": 4.7382155712540484e-07, "loss": 0.0101, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 360562830.0, "step": 434 }, { "entropy": 0.46465301513671875, "epoch": 4.887640449438202, "grad_norm": 0.43370214444423943, "learning_rate": 4.6478617116100244e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 361364772.0, "step": 435 }, { "entropy": 0.46259307861328125, "epoch": 4.898876404494382, "grad_norm": 2.123319638794875, "learning_rate": 4.5582892862875457e-07, "loss": 0.0027, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 362205959.0, "step": 436 }, { "entropy": 0.46250152587890625, "epoch": 4.910112359550562, "grad_norm": 0.2966276056762264, "learning_rate": 4.469501734481363e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 363013439.0, "step": 437 }, { "entropy": 0.46131134033203125, "epoch": 4.921348314606742, "grad_norm": 0.2595312919643192, "learning_rate": 4.3815024652504897e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 363838845.0, "step": 438 }, { "entropy": 0.4611930847167969, "epoch": 4.932584269662922, "grad_norm": 0.27109994066163073, "learning_rate": 4.294294857387285e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 364673110.0, "step": 439 }, { "entropy": 0.45296478271484375, "epoch": 4.943820224719101, "grad_norm": 0.24175150457269287, "learning_rate": 4.2078822592877074e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 365535079.0, "step": 440 }, { "entropy": 0.47109222412109375, "epoch": 4.955056179775281, "grad_norm": 4.033092061276572, "learning_rate": 4.122267988822792e-07, "loss": 0.0033, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 366360305.0, "step": 441 }, { "entropy": 0.455108642578125, "epoch": 4.966292134831461, "grad_norm": 3.0118455984225885, "learning_rate": 4.0374553332112374e-07, "loss": 0.0072, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 367193457.0, "step": 442 }, { "entropy": 0.46689605712890625, "epoch": 4.97752808988764, "grad_norm": 0.281003717289293, "learning_rate": 3.953447548893169e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 367997504.0, "step": 443 }, { "entropy": 0.4514923095703125, "epoch": 4.98876404494382, "grad_norm": 0.6207453388018358, "learning_rate": 3.8702478614051353e-07, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 368860805.0, "step": 444 }, { "entropy": 0.4544677734375, "epoch": 5.0, "grad_norm": 0.5793207742642461, "learning_rate": 3.787859465256258e-07, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 369686022.0, "step": 445 }, { "entropy": 0.4565582275390625, "epoch": 5.01123595505618, "grad_norm": 3.2248965312750055, "learning_rate": 3.706285523805578e-07, "loss": 0.0044, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 370534769.0, "step": 446 }, { "entropy": 0.47351837158203125, "epoch": 5.022471910112359, "grad_norm": 0.23588422102883916, "learning_rate": 3.625529169140565e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 371333765.0, "step": 447 }, { "entropy": 0.46183013916015625, "epoch": 5.033707865168539, "grad_norm": 0.26051034061662975, "learning_rate": 3.545593501956901e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 372158519.0, "step": 448 }, { "entropy": 0.46891021728515625, "epoch": 5.044943820224719, "grad_norm": 0.7897977948934947, "learning_rate": 3.4664815914394106e-07, "loss": 0.0054, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 372972872.0, "step": 449 }, { "entropy": 0.4658966064453125, "epoch": 5.056179775280899, "grad_norm": 0.23854614330513269, "learning_rate": 3.3881964751441984e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 373764531.0, "step": 450 }, { "entropy": 0.4539337158203125, "epoch": 5.067415730337078, "grad_norm": 0.2582411348351849, "learning_rate": 3.3107411588820527e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 374610508.0, "step": 451 }, { "entropy": 0.44156646728515625, "epoch": 5.078651685393258, "grad_norm": 0.23327005612977159, "learning_rate": 3.2341186166030214e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 375494315.0, "step": 452 }, { "entropy": 0.46539306640625, "epoch": 5.089887640449438, "grad_norm": 0.30673979741348245, "learning_rate": 3.1583317902822127e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 376304953.0, "step": 453 }, { "entropy": 0.4680938720703125, "epoch": 5.101123595505618, "grad_norm": 0.23035448377867607, "learning_rate": 3.083383589806846e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 377109236.0, "step": 454 }, { "entropy": 0.46471405029296875, "epoch": 5.112359550561798, "grad_norm": 0.22962165278983354, "learning_rate": 3.0092768928645375e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 377915350.0, "step": 455 }, { "entropy": 0.4534149169921875, "epoch": 5.123595505617978, "grad_norm": 0.25075614257575074, "learning_rate": 2.936014544832794e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 378732812.0, "step": 456 }, { "entropy": 0.45656585693359375, "epoch": 5.134831460674158, "grad_norm": 2.0839651517450366, "learning_rate": 2.8635993586697555e-07, "loss": 0.003, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 379535203.0, "step": 457 }, { "entropy": 0.4586334228515625, "epoch": 5.146067415730337, "grad_norm": 0.22756596903598503, "learning_rate": 2.792034114806211e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 380351317.0, "step": 458 }, { "entropy": 0.45229339599609375, "epoch": 5.157303370786517, "grad_norm": 0.22757916112457438, "learning_rate": 2.7213215610388364e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 381191082.0, "step": 459 }, { "entropy": 0.44696807861328125, "epoch": 5.168539325842697, "grad_norm": 0.22565516577479386, "learning_rate": 2.6514644124246675e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 382014981.0, "step": 460 }, { "entropy": 0.46833038330078125, "epoch": 5.179775280898877, "grad_norm": 0.38263012185270356, "learning_rate": 2.582465351176891e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 382843849.0, "step": 461 }, { "entropy": 0.4453125, "epoch": 5.191011235955056, "grad_norm": 1.941394944381824, "learning_rate": 2.514327026561833e-07, "loss": 0.0114, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 383684445.0, "step": 462 }, { "entropy": 0.456756591796875, "epoch": 5.202247191011236, "grad_norm": 0.3217712106473879, "learning_rate": 2.447052054797233e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 384501685.0, "step": 463 }, { "entropy": 0.437744140625, "epoch": 5.213483146067416, "grad_norm": 1.3017516617976264, "learning_rate": 2.3806430189518337e-07, "loss": 0.0099, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 385365419.0, "step": 464 }, { "entropy": 0.45667266845703125, "epoch": 5.224719101123595, "grad_norm": 0.220187005935604, "learning_rate": 2.3151024688461422e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 386192220.0, "step": 465 }, { "entropy": 0.46067047119140625, "epoch": 5.235955056179775, "grad_norm": 0.22892437261891452, "learning_rate": 2.2504329209545846e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 387013925.0, "step": 466 }, { "entropy": 0.4477996826171875, "epoch": 5.247191011235955, "grad_norm": 0.22510109028620062, "learning_rate": 2.186636858308841e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 387859694.0, "step": 467 }, { "entropy": 0.4438323974609375, "epoch": 5.258426966292135, "grad_norm": 0.22217029391538748, "learning_rate": 2.1237167304025336e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 388717543.0, "step": 468 }, { "entropy": 0.43897247314453125, "epoch": 5.269662921348314, "grad_norm": 1.3611717994727468, "learning_rate": 2.0616749530971785e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 389594969.0, "step": 469 }, { "entropy": 0.438812255859375, "epoch": 5.280898876404494, "grad_norm": 4.6338923879333365, "learning_rate": 2.0005139085293945e-07, "loss": 0.0027, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 390462095.0, "step": 470 }, { "entropy": 0.45113372802734375, "epoch": 5.292134831460674, "grad_norm": 3.1222985309280444, "learning_rate": 1.9402359450194836e-07, "loss": 0.0024, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 391317304.0, "step": 471 }, { "entropy": 0.45957183837890625, "epoch": 5.303370786516854, "grad_norm": 0.26983711489676676, "learning_rate": 1.8808433769812367e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 392117648.0, "step": 472 }, { "entropy": 0.452423095703125, "epoch": 5.314606741573034, "grad_norm": 0.29167546299948666, "learning_rate": 1.8223384848330723e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 392958679.0, "step": 473 }, { "entropy": 0.44556427001953125, "epoch": 5.325842696629214, "grad_norm": 3.9531950686501087, "learning_rate": 1.7647235149104908e-07, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 393804414.0, "step": 474 }, { "entropy": 0.465728759765625, "epoch": 5.337078651685394, "grad_norm": 0.2611165051574614, "learning_rate": 1.7080006793798176e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 394606982.0, "step": 475 }, { "entropy": 0.4496002197265625, "epoch": 5.348314606741573, "grad_norm": 0.28831706738624335, "learning_rate": 1.6521721561532645e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 395464681.0, "step": 476 }, { "entropy": 0.454864501953125, "epoch": 5.359550561797753, "grad_norm": 0.27801351147825365, "learning_rate": 1.597240088805302e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 396272787.0, "step": 477 }, { "entropy": 0.4578704833984375, "epoch": 5.370786516853933, "grad_norm": 3.7652732902520767, "learning_rate": 1.54320658649037e-07, "loss": 0.003, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 397106952.0, "step": 478 }, { "entropy": 0.45423126220703125, "epoch": 5.382022471910112, "grad_norm": 1.0222681737176318, "learning_rate": 1.4900737238618874e-07, "loss": 0.0017, "mean_token_accuracy": 1.0, "num_tokens": 397922607.0, "step": 479 }, { "entropy": 0.4500579833984375, "epoch": 5.393258426966292, "grad_norm": 3.2055117041350987, "learning_rate": 1.4378435409925868e-07, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 398762139.0, "step": 480 }, { "entropy": 0.44394683837890625, "epoch": 5.404494382022472, "grad_norm": 0.27203919336113547, "learning_rate": 1.3865180432961977e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 399606807.0, "step": 481 }, { "entropy": 0.4582977294921875, "epoch": 5.415730337078652, "grad_norm": 0.2626129462808237, "learning_rate": 1.3360992014504414e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 400433406.0, "step": 482 }, { "entropy": 0.4539947509765625, "epoch": 5.426966292134831, "grad_norm": 1.3110655834081466, "learning_rate": 1.286588951321363e-07, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 401267844.0, "step": 483 }, { "entropy": 0.45575714111328125, "epoch": 5.438202247191011, "grad_norm": 0.30139852738632256, "learning_rate": 1.237989193889e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 402087652.0, "step": 484 }, { "entropy": 0.4398040771484375, "epoch": 5.449438202247191, "grad_norm": 0.2772895954120305, "learning_rate": 1.1903017951744144e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 402960724.0, "step": 485 }, { "entropy": 0.45752716064453125, "epoch": 5.460674157303371, "grad_norm": 0.28043078838229096, "learning_rate": 1.1435285861680106e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 403770083.0, "step": 486 }, { "entropy": 0.4523468017578125, "epoch": 5.47191011235955, "grad_norm": 0.290486600050334, "learning_rate": 1.0976713627592561e-07, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 404586068.0, "step": 487 }, { "entropy": 0.448974609375, "epoch": 5.48314606741573, "grad_norm": 0.2602876207673978, "learning_rate": 1.0527318856677293e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 405433958.0, "step": 488 }, { "entropy": 0.4537506103515625, "epoch": 5.49438202247191, "grad_norm": 0.2255688115426861, "learning_rate": 1.0087118803755069e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 406236735.0, "step": 489 }, { "entropy": 0.44225311279296875, "epoch": 5.50561797752809, "grad_norm": 0.23839279319796375, "learning_rate": 9.656130370609057e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 407098652.0, "step": 490 }, { "entropy": 0.44016265869140625, "epoch": 5.51685393258427, "grad_norm": 0.4044689455010622, "learning_rate": 9.234370105336039e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 407954406.0, "step": 491 }, { "entropy": 0.4402008056640625, "epoch": 5.52808988764045, "grad_norm": 0.2430544169170237, "learning_rate": 8.821854201711027e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 408805475.0, "step": 492 }, { "entropy": 0.45285797119140625, "epoch": 5.539325842696629, "grad_norm": 1.8753482122817826, "learning_rate": 8.418598498565217e-08, "loss": 0.0077, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 409625911.0, "step": 493 }, { "entropy": 0.44989013671875, "epoch": 5.550561797752809, "grad_norm": 0.22453644772138204, "learning_rate": 8.024618479178237e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 410448377.0, "step": 494 }, { "entropy": 0.45960235595703125, "epoch": 5.561797752808989, "grad_norm": 0.22608759145228952, "learning_rate": 7.639929270683438e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 411254496.0, "step": 495 }, { "entropy": 0.447357177734375, "epoch": 5.573033707865169, "grad_norm": 0.22231664608313986, "learning_rate": 7.264545643486997e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 412093632.0, "step": 496 }, { "entropy": 0.44928741455078125, "epoch": 5.584269662921348, "grad_norm": 0.7770114677040165, "learning_rate": 6.898482010701036e-08, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 412912725.0, "step": 497 }, { "entropy": 0.45952606201171875, "epoch": 5.595505617977528, "grad_norm": 0.22205123157021062, "learning_rate": 6.541752427590004e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 413704471.0, "step": 498 }, { "entropy": 0.45113372802734375, "epoch": 5.606741573033708, "grad_norm": 0.21940329918848647, "learning_rate": 6.194370591031174e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 414537095.0, "step": 499 }, { "entropy": 0.44696807861328125, "epoch": 5.617977528089888, "grad_norm": 0.2289707035324365, "learning_rate": 5.856349838988612e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 415392981.0, "step": 500 }, { "entropy": 0.45949554443359375, "epoch": 5.629213483146067, "grad_norm": 0.2211713582495607, "learning_rate": 5.5277031500011734e-08, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 416223543.0, "step": 501 }, { "entropy": 0.4473876953125, "epoch": 5.640449438202247, "grad_norm": 1.182133604490211, "learning_rate": 5.208443142684094e-08, "loss": 0.0104, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 417060727.0, "step": 502 }, { "entropy": 0.44683074951171875, "epoch": 5.651685393258427, "grad_norm": 0.21884403629787713, "learning_rate": 4.8985820752445177e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 417890059.0, "step": 503 }, { "entropy": 0.44339752197265625, "epoch": 5.662921348314606, "grad_norm": 0.21637706969512882, "learning_rate": 4.5981318450109e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 418737697.0, "step": 504 }, { "entropy": 0.45195770263671875, "epoch": 5.674157303370786, "grad_norm": 0.2173727523833432, "learning_rate": 4.307103987976041e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 419554252.0, "step": 505 }, { "entropy": 0.45410919189453125, "epoch": 5.685393258426966, "grad_norm": 0.21430018532606634, "learning_rate": 4.0255096783543e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 420379301.0, "step": 506 }, { "entropy": 0.44255828857421875, "epoch": 5.696629213483146, "grad_norm": 0.21992408445816547, "learning_rate": 3.75335972815255e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 421251190.0, "step": 507 }, { "entropy": 0.45482635498046875, "epoch": 5.707865168539326, "grad_norm": 0.21487068594235248, "learning_rate": 3.4906645867549547e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 422076483.0, "step": 508 }, { "entropy": 0.45025634765625, "epoch": 5.719101123595506, "grad_norm": 0.22095064700201622, "learning_rate": 3.237434340521789e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 422892953.0, "step": 509 }, { "entropy": 0.4595794677734375, "epoch": 5.730337078651686, "grad_norm": 0.21347314128040096, "learning_rate": 2.993678712402221e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 423725323.0, "step": 510 }, { "entropy": 0.44622802734375, "epoch": 5.741573033707866, "grad_norm": 0.21762320050905556, "learning_rate": 2.7594070615609426e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 424568879.0, "step": 511 }, { "entropy": 0.4506378173828125, "epoch": 5.752808988764045, "grad_norm": 0.2399851374545116, "learning_rate": 2.5346283830187667e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 425422220.0, "step": 512 }, { "entropy": 0.45760345458984375, "epoch": 5.764044943820225, "grad_norm": 3.618323663153434, "learning_rate": 2.319351307307427e-08, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 426238802.0, "step": 513 }, { "entropy": 0.463531494140625, "epoch": 5.775280898876405, "grad_norm": 3.020973889001048, "learning_rate": 2.1135841001380386e-08, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 427029167.0, "step": 514 }, { "entropy": 0.449920654296875, "epoch": 5.786516853932584, "grad_norm": 0.21466505136379285, "learning_rate": 1.917334662083714e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 427868982.0, "step": 515 }, { "entropy": 0.4319610595703125, "epoch": 5.797752808988764, "grad_norm": 0.21193776091797573, "learning_rate": 1.7306105282764162e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 428747997.0, "step": 516 }, { "entropy": 0.4481201171875, "epoch": 5.808988764044944, "grad_norm": 3.581869315768432, "learning_rate": 1.55341886811744e-08, "loss": 0.0027, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 429565485.0, "step": 517 }, { "entropy": 0.450286865234375, "epoch": 5.820224719101123, "grad_norm": 0.21526921468453963, "learning_rate": 1.3857664850022157e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 430396686.0, "step": 518 }, { "entropy": 0.455841064453125, "epoch": 5.831460674157303, "grad_norm": 0.21786690914163995, "learning_rate": 1.2276598160590736e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 431218771.0, "step": 519 }, { "entropy": 0.44765472412109375, "epoch": 5.842696629213483, "grad_norm": 0.22217126637320583, "learning_rate": 1.0791049319021086e-08, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 432065865.0, "step": 520 }, { "entropy": 0.45168304443359375, "epoch": 5.853932584269663, "grad_norm": 0.2152958362352817, "learning_rate": 9.401075363981438e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 432899235.0, "step": 521 }, { "entropy": 0.44405364990234375, "epoch": 5.865168539325842, "grad_norm": 0.2167849625384711, "learning_rate": 8.106729664475178e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 433737697.0, "step": 522 }, { "entropy": 0.445068359375, "epoch": 5.876404494382022, "grad_norm": 0.22185722429726154, "learning_rate": 6.908061917794417e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 434580120.0, "step": 523 }, { "entropy": 0.43703460693359375, "epoch": 5.887640449438202, "grad_norm": 0.2189020524852438, "learning_rate": 5.805118147610145e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 435459272.0, "step": 524 }, { "entropy": 0.456207275390625, "epoch": 5.898876404494382, "grad_norm": 0.22301334936598696, "learning_rate": 4.797940702205572e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 436268641.0, "step": 525 }, { "entropy": 0.4462432861328125, "epoch": 5.910112359550562, "grad_norm": 0.2160379000791954, "learning_rate": 3.8865682528504975e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 437126939.0, "step": 526 }, { "entropy": 0.44385528564453125, "epoch": 5.921348314606742, "grad_norm": 4.116469262583432, "learning_rate": 3.071035792315269e-09, "loss": 0.0178, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 437951169.0, "step": 527 }, { "entropy": 0.45404815673828125, "epoch": 5.932584269662922, "grad_norm": 4.6861858731438435, "learning_rate": 2.351374633528802e-09, "loss": 0.0046, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 438768686.0, "step": 528 }, { "entropy": 0.472015380859375, "epoch": 5.943820224719101, "grad_norm": 0.21812174178615043, "learning_rate": 1.7276124083753788e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 439558604.0, "step": 529 }, { "entropy": 0.44561004638671875, "epoch": 5.955056179775281, "grad_norm": 0.22063825113739127, "learning_rate": 1.1997730666338248e-09, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 440395957.0, "step": 530 }, { "entropy": 0.4581298828125, "epoch": 5.966292134831461, "grad_norm": 1.4192369358126131, "learning_rate": 7.678768750579713e-10, "loss": 0.0111, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 441212037.0, "step": 531 }, { "entropy": 0.45439910888671875, "epoch": 5.97752808988764, "grad_norm": 0.2134790958132504, "learning_rate": 4.3194041659866405e-10, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 442053098.0, "step": 532 }, { "entropy": 0.43907928466796875, "epoch": 5.98876404494382, "grad_norm": 0.21385742865545188, "learning_rate": 1.9197658976677358e-10, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 442888433.0, "step": 533 }, { "entropy": 0.44598388671875, "epoch": 6.0, "grad_norm": 0.2177543367348665, "learning_rate": 4.799460813803558e-11, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 443721575.0, "step": 534 }, { "epoch": 6.0, "step": 534, "total_flos": 522066352668672.0, "train_loss": 0.5458187263237473, "train_runtime": 71514.3468, "train_samples_per_second": 3.454, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 534, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 45, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 522066352668672.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }