{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5595855712890625, "epoch": 0.011363636363636364, "grad_norm": 381.7861402310546, "learning_rate": 0.0, "loss": 8.3191, "mean_token_accuracy": 0.0, "num_tokens": 852123.0, "step": 1 }, { "entropy": 0.5646438598632812, "epoch": 0.022727272727272728, "grad_norm": 384.6525251162329, "learning_rate": 3.7037037037037036e-07, "loss": 8.2985, "mean_token_accuracy": 0.0, "num_tokens": 1667244.0, "step": 2 }, { "entropy": 0.553924560546875, "epoch": 0.03409090909090909, "grad_norm": 384.8117222961788, "learning_rate": 7.407407407407407e-07, "loss": 8.2789, "mean_token_accuracy": 0.0, "num_tokens": 2503572.0, "step": 3 }, { "entropy": 0.5484161376953125, "epoch": 0.045454545454545456, "grad_norm": 389.3572097207638, "learning_rate": 1.111111111111111e-06, "loss": 8.0992, "mean_token_accuracy": 0.0, "num_tokens": 3345459.0, "step": 4 }, { "entropy": 0.5574264526367188, "epoch": 0.056818181818181816, "grad_norm": 403.9101556918202, "learning_rate": 1.4814814814814815e-06, "loss": 7.4779, "mean_token_accuracy": 0.0, "num_tokens": 4166938.0, "step": 5 }, { "entropy": 0.5470428466796875, "epoch": 0.06818181818181818, "grad_norm": 395.2760506482167, "learning_rate": 1.8518518518518519e-06, "loss": 7.2347, "mean_token_accuracy": 0.0, "num_tokens": 5016940.0, "step": 6 }, { "entropy": 0.5523529052734375, "epoch": 0.07954545454545454, "grad_norm": 223.35456023967305, "learning_rate": 2.222222222222222e-06, "loss": 5.576, "mean_token_accuracy": 0.007812500232830644, "num_tokens": 5848503.0, "step": 7 }, { "entropy": 0.5435867309570312, "epoch": 0.09090909090909091, "grad_norm": 112.29477147905564, "learning_rate": 2.5925925925925925e-06, "loss": 4.2732, "mean_token_accuracy": 0.5026041816454381, "num_tokens": 6709898.0, "step": 8 }, { "entropy": 0.5592041015625, "epoch": 0.10227272727272728, "grad_norm": 95.74272291998905, "learning_rate": 2.962962962962963e-06, "loss": 4.0579, "mean_token_accuracy": 0.505208348389715, "num_tokens": 7560854.0, "step": 9 }, { "entropy": 0.5602951049804688, "epoch": 0.11363636363636363, "grad_norm": 80.94962958572077, "learning_rate": 3.3333333333333333e-06, "loss": 3.8263, "mean_token_accuracy": 0.5117187652504072, "num_tokens": 8391135.0, "step": 10 }, { "entropy": 0.5577774047851562, "epoch": 0.125, "grad_norm": 59.51647665780966, "learning_rate": 3.7037037037037037e-06, "loss": 3.3053, "mean_token_accuracy": 0.505208348389715, "num_tokens": 9185279.0, "step": 11 }, { "entropy": 0.5463104248046875, "epoch": 0.13636363636363635, "grad_norm": 57.89777609345863, "learning_rate": 4.074074074074074e-06, "loss": 3.2147, "mean_token_accuracy": 0.5169270987389609, "num_tokens": 10024891.0, "step": 12 }, { "entropy": 0.5550765991210938, "epoch": 0.14772727272727273, "grad_norm": 57.43984061602855, "learning_rate": 4.444444444444444e-06, "loss": 3.1308, "mean_token_accuracy": 0.5273437657160684, "num_tokens": 10842191.0, "step": 13 }, { "entropy": 0.5534210205078125, "epoch": 0.1590909090909091, "grad_norm": 57.49098275744853, "learning_rate": 4.814814814814815e-06, "loss": 3.0731, "mean_token_accuracy": 0.5351562659488991, "num_tokens": 11650475.0, "step": 14 }, { "entropy": 0.5523452758789062, "epoch": 0.17045454545454544, "grad_norm": 60.44468793760702, "learning_rate": 5.185185185185185e-06, "loss": 2.9645, "mean_token_accuracy": 0.5208333488553762, "num_tokens": 12464155.0, "step": 15 }, { "entropy": 0.5218353271484375, "epoch": 0.18181818181818182, "grad_norm": 66.21017634939703, "learning_rate": 5.555555555555557e-06, "loss": 2.9797, "mean_token_accuracy": 0.5013020982732996, "num_tokens": 13346836.0, "step": 16 }, { "entropy": 0.5411224365234375, "epoch": 0.19318181818181818, "grad_norm": 57.306031059188456, "learning_rate": 5.925925925925926e-06, "loss": 2.9136, "mean_token_accuracy": 0.5234375155996531, "num_tokens": 14174968.0, "step": 17 }, { "entropy": 0.5549850463867188, "epoch": 0.20454545454545456, "grad_norm": 57.48037572896507, "learning_rate": 6.296296296296297e-06, "loss": 2.8744, "mean_token_accuracy": 0.5130208486225456, "num_tokens": 14975189.0, "step": 18 }, { "entropy": 0.558502197265625, "epoch": 0.2159090909090909, "grad_norm": 57.18753706899099, "learning_rate": 6.666666666666667e-06, "loss": 2.8435, "mean_token_accuracy": 0.5247395989717916, "num_tokens": 15764524.0, "step": 19 }, { "entropy": 0.5551605224609375, "epoch": 0.22727272727272727, "grad_norm": 57.099620732693666, "learning_rate": 7.0370370370370375e-06, "loss": 2.8033, "mean_token_accuracy": 0.5611979333916679, "num_tokens": 16594212.0, "step": 20 }, { "entropy": 0.54339599609375, "epoch": 0.23863636363636365, "grad_norm": 56.761099934785754, "learning_rate": 7.4074074074074075e-06, "loss": 2.7555, "mean_token_accuracy": 0.5455729329260066, "num_tokens": 17431524.0, "step": 21 }, { "entropy": 0.5582275390625, "epoch": 0.25, "grad_norm": 56.67625638500944, "learning_rate": 7.77777777777778e-06, "loss": 2.7206, "mean_token_accuracy": 0.5286458490882069, "num_tokens": 18240206.0, "step": 22 }, { "entropy": 0.544647216796875, "epoch": 0.26136363636363635, "grad_norm": 57.07526498149015, "learning_rate": 8.148148148148148e-06, "loss": 2.6748, "mean_token_accuracy": 0.5638021001359448, "num_tokens": 19067759.0, "step": 23 }, { "entropy": 0.5390625, "epoch": 0.2727272727272727, "grad_norm": 57.35919508340141, "learning_rate": 8.518518518518519e-06, "loss": 2.6279, "mean_token_accuracy": 0.5690104336244985, "num_tokens": 19896857.0, "step": 24 }, { "entropy": 0.5362167358398438, "epoch": 0.2840909090909091, "grad_norm": 58.5953667217032, "learning_rate": 8.888888888888888e-06, "loss": 2.5885, "mean_token_accuracy": 0.558593766647391, "num_tokens": 20712844.0, "step": 25 }, { "entropy": 0.5246353149414062, "epoch": 0.29545454545454547, "grad_norm": 59.726659251879035, "learning_rate": 9.25925925925926e-06, "loss": 2.5624, "mean_token_accuracy": 0.5572916832752526, "num_tokens": 21562110.0, "step": 26 }, { "entropy": 0.517822265625, "epoch": 0.3068181818181818, "grad_norm": 61.90672236852855, "learning_rate": 9.62962962962963e-06, "loss": 2.5112, "mean_token_accuracy": 0.5598958500195295, "num_tokens": 22406352.0, "step": 27 }, { "entropy": 0.5255279541015625, "epoch": 0.3181818181818182, "grad_norm": 70.11627842380628, "learning_rate": 1e-05, "loss": 2.5404, "mean_token_accuracy": 0.5195312654832378, "num_tokens": 23252892.0, "step": 28 }, { "entropy": 0.54864501953125, "epoch": 0.32954545454545453, "grad_norm": 59.62883231844681, "learning_rate": 9.999901697881075e-06, "loss": 2.4391, "mean_token_accuracy": 0.558593766647391, "num_tokens": 24068796.0, "step": 29 }, { "entropy": 0.5512237548828125, "epoch": 0.3409090909090909, "grad_norm": 60.88601090681198, "learning_rate": 9.999606795389623e-06, "loss": 2.4135, "mean_token_accuracy": 0.5442708487389609, "num_tokens": 24888182.0, "step": 30 }, { "entropy": 0.5544052124023438, "epoch": 0.3522727272727273, "grad_norm": 60.86420068870514, "learning_rate": 9.999115304121459e-06, "loss": 2.3746, "mean_token_accuracy": 0.8619791747769341, "num_tokens": 25701818.0, "step": 31 }, { "entropy": 0.5430526733398438, "epoch": 0.36363636363636365, "grad_norm": 60.251653944045856, "learning_rate": 9.998427243402437e-06, "loss": 2.3292, "mean_token_accuracy": 0.8854166734963655, "num_tokens": 26550803.0, "step": 32 }, { "entropy": 0.5553741455078125, "epoch": 0.375, "grad_norm": 60.3298612444737, "learning_rate": 9.997542640287686e-06, "loss": 2.2774, "mean_token_accuracy": 0.8841145902406424, "num_tokens": 27362749.0, "step": 33 }, { "entropy": 0.5552520751953125, "epoch": 0.38636363636363635, "grad_norm": 60.62608864413497, "learning_rate": 9.996461529560553e-06, "loss": 2.2108, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 28197226.0, "step": 34 }, { "entropy": 0.5430068969726562, "epoch": 0.3977272727272727, "grad_norm": 60.45843293213623, "learning_rate": 9.995183953731225e-06, "loss": 2.1694, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 29070075.0, "step": 35 }, { "entropy": 0.5520477294921875, "epoch": 0.4090909090909091, "grad_norm": 60.60470764646535, "learning_rate": 9.99370996303507e-06, "loss": 2.1127, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 29909320.0, "step": 36 }, { "entropy": 0.533050537109375, "epoch": 0.42045454545454547, "grad_norm": 60.8797781205061, "learning_rate": 9.992039615430648e-06, "loss": 2.071, "mean_token_accuracy": 0.912760421866551, "num_tokens": 30778680.0, "step": 37 }, { "entropy": 0.5433807373046875, "epoch": 0.4318181818181818, "grad_norm": 60.866971731966345, "learning_rate": 9.990172976597446e-06, "loss": 2.0387, "mean_token_accuracy": 0.8958333395421505, "num_tokens": 31596240.0, "step": 38 }, { "entropy": 0.5598068237304688, "epoch": 0.4431818181818182, "grad_norm": 61.325213343510775, "learning_rate": 9.988110119933281e-06, "loss": 1.9883, "mean_token_accuracy": 0.8971354227978736, "num_tokens": 32396599.0, "step": 39 }, { "entropy": 0.5557174682617188, "epoch": 0.45454545454545453, "grad_norm": 60.349279291270896, "learning_rate": 9.985851126551428e-06, "loss": 1.9158, "mean_token_accuracy": 0.9192708381451666, "num_tokens": 33192180.0, "step": 40 }, { "entropy": 0.5460128784179688, "epoch": 0.4659090909090909, "grad_norm": 60.29280940957208, "learning_rate": 9.983396085277421e-06, "loss": 1.8879, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 34019780.0, "step": 41 }, { "entropy": 0.5501022338867188, "epoch": 0.4772727272727273, "grad_norm": 58.97902817615115, "learning_rate": 9.980745092645564e-06, "loss": 1.8189, "mean_token_accuracy": 0.9101562553551048, "num_tokens": 34845898.0, "step": 42 }, { "entropy": 0.55059814453125, "epoch": 0.48863636363636365, "grad_norm": 59.56331193069494, "learning_rate": 9.977898252895133e-06, "loss": 1.7845, "mean_token_accuracy": 0.912760421866551, "num_tokens": 35658003.0, "step": 43 }, { "entropy": 0.5355606079101562, "epoch": 0.5, "grad_norm": 59.1190449560066, "learning_rate": 9.974855677966283e-06, "loss": 1.7301, "mean_token_accuracy": 0.9049479223322123, "num_tokens": 36513090.0, "step": 44 }, { "entropy": 0.5511093139648438, "epoch": 0.5113636363636364, "grad_norm": 58.84552514073676, "learning_rate": 9.971617487495635e-06, "loss": 1.6771, "mean_token_accuracy": 0.8997395893093199, "num_tokens": 37335753.0, "step": 45 }, { "entropy": 0.5509262084960938, "epoch": 0.5227272727272727, "grad_norm": 58.05247290731136, "learning_rate": 9.968183808811586e-06, "loss": 1.6113, "mean_token_accuracy": 0.912760421866551, "num_tokens": 38171714.0, "step": 46 }, { "entropy": 0.5467910766601562, "epoch": 0.5340909090909091, "grad_norm": 57.62644994665755, "learning_rate": 9.964554776929289e-06, "loss": 1.5464, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 39004676.0, "step": 47 }, { "entropy": 0.554779052734375, "epoch": 0.5454545454545454, "grad_norm": 57.493682548859965, "learning_rate": 9.960730534545357e-06, "loss": 1.507, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 39839454.0, "step": 48 }, { "entropy": 0.5637130737304688, "epoch": 0.5568181818181818, "grad_norm": 57.70900607158181, "learning_rate": 9.95671123203224e-06, "loss": 1.4657, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 40636201.0, "step": 49 }, { "entropy": 0.5558547973632812, "epoch": 0.5681818181818182, "grad_norm": 57.39395222476001, "learning_rate": 9.95249702743232e-06, "loss": 1.3995, "mean_token_accuracy": 0.923177087912336, "num_tokens": 41456439.0, "step": 50 }, { "entropy": 0.5491790771484375, "epoch": 0.5795454545454546, "grad_norm": 57.41371703432652, "learning_rate": 9.948088086451692e-06, "loss": 1.3504, "mean_token_accuracy": 0.923177087912336, "num_tokens": 42306168.0, "step": 51 }, { "entropy": 0.5419998168945312, "epoch": 0.5909090909090909, "grad_norm": 57.708036609750216, "learning_rate": 9.943484582453653e-06, "loss": 1.2953, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 43156058.0, "step": 52 }, { "entropy": 0.5348358154296875, "epoch": 0.6022727272727273, "grad_norm": 57.73872770246758, "learning_rate": 9.938686696451884e-06, "loss": 1.2523, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 44026197.0, "step": 53 }, { "entropy": 0.5535354614257812, "epoch": 0.6136363636363636, "grad_norm": 57.777576405881355, "learning_rate": 9.933694617103328e-06, "loss": 1.1934, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 44830346.0, "step": 54 }, { "entropy": 0.5513992309570312, "epoch": 0.625, "grad_norm": 57.34605611556142, "learning_rate": 9.928508540700775e-06, "loss": 1.147, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 45639894.0, "step": 55 }, { "entropy": 0.5546798706054688, "epoch": 0.6363636363636364, "grad_norm": 57.04857568973497, "learning_rate": 9.923128671165145e-06, "loss": 1.109, "mean_token_accuracy": 0.9023437558207661, "num_tokens": 46453026.0, "step": 56 }, { "entropy": 0.5538177490234375, "epoch": 0.6477272727272727, "grad_norm": 56.470712174285566, "learning_rate": 9.917555220037469e-06, "loss": 1.0488, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 47264316.0, "step": 57 }, { "entropy": 0.5666427612304688, "epoch": 0.6590909090909091, "grad_norm": 55.841215147852594, "learning_rate": 9.91178840647057e-06, "loss": 1.0016, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 48069506.0, "step": 58 }, { "entropy": 0.5659713745117188, "epoch": 0.6704545454545454, "grad_norm": 55.98626961303541, "learning_rate": 9.905828457220442e-06, "loss": 0.9377, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 48869835.0, "step": 59 }, { "entropy": 0.5512847900390625, "epoch": 0.6818181818181818, "grad_norm": 54.8634329832115, "learning_rate": 9.899675606637344e-06, "loss": 0.8998, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 49727599.0, "step": 60 }, { "entropy": 0.551544189453125, "epoch": 0.6931818181818182, "grad_norm": 54.36819144170542, "learning_rate": 9.893330096656576e-06, "loss": 0.837, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 50548928.0, "step": 61 }, { "entropy": 0.5626449584960938, "epoch": 0.7045454545454546, "grad_norm": 54.914774939776095, "learning_rate": 9.886792176788964e-06, "loss": 0.8096, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 51360270.0, "step": 62 }, { "entropy": 0.557525634765625, "epoch": 0.7159090909090909, "grad_norm": 55.61783293820803, "learning_rate": 9.880062104111064e-06, "loss": 0.8107, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 52194089.0, "step": 63 }, { "entropy": 0.55279541015625, "epoch": 0.7272727272727273, "grad_norm": 52.63001268534479, "learning_rate": 9.873140143255035e-06, "loss": 0.7542, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 53031403.0, "step": 64 }, { "entropy": 0.5531845092773438, "epoch": 0.7386363636363636, "grad_norm": 49.54331333494459, "learning_rate": 9.866026566398248e-06, "loss": 0.6812, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 53844933.0, "step": 65 }, { "entropy": 0.5571212768554688, "epoch": 0.75, "grad_norm": 48.50824783464925, "learning_rate": 9.858721653252571e-06, "loss": 0.6659, "mean_token_accuracy": 0.9075520888436586, "num_tokens": 54693823.0, "step": 66 }, { "entropy": 0.5417633056640625, "epoch": 0.7613636363636364, "grad_norm": 46.31315205932425, "learning_rate": 9.851225691053382e-06, "loss": 0.6124, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 55549536.0, "step": 67 }, { "entropy": 0.552581787109375, "epoch": 0.7727272727272727, "grad_norm": 44.52806747500585, "learning_rate": 9.843538974548264e-06, "loss": 0.5685, "mean_token_accuracy": 0.923177087912336, "num_tokens": 56378172.0, "step": 68 }, { "entropy": 0.5659942626953125, "epoch": 0.7840909090909091, "grad_norm": 41.985635746009535, "learning_rate": 9.835661805985432e-06, "loss": 0.5224, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 57189636.0, "step": 69 }, { "entropy": 0.5614395141601562, "epoch": 0.7954545454545454, "grad_norm": 39.34370735906953, "learning_rate": 9.827594495101824e-06, "loss": 0.4839, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 58000089.0, "step": 70 }, { "entropy": 0.5698471069335938, "epoch": 0.8068181818181818, "grad_norm": 37.74374421075821, "learning_rate": 9.819337359110945e-06, "loss": 0.4483, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 58802829.0, "step": 71 }, { "entropy": 0.5551528930664062, "epoch": 0.8181818181818182, "grad_norm": 36.45001846956263, "learning_rate": 9.81089072269038e-06, "loss": 0.4381, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 59625095.0, "step": 72 }, { "entropy": 0.5639801025390625, "epoch": 0.8295454545454546, "grad_norm": 32.91100179668808, "learning_rate": 9.802254917969033e-06, "loss": 0.4023, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 60419806.0, "step": 73 }, { "entropy": 0.5682144165039062, "epoch": 0.8409090909090909, "grad_norm": 36.672735804041565, "learning_rate": 9.793430284514063e-06, "loss": 0.4244, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 61242019.0, "step": 74 }, { "entropy": 0.5541458129882812, "epoch": 0.8522727272727273, "grad_norm": 28.896417954347815, "learning_rate": 9.78441716931754e-06, "loss": 0.3576, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 62099804.0, "step": 75 }, { "entropy": 0.5613327026367188, "epoch": 0.8636363636363636, "grad_norm": 27.80615299063651, "learning_rate": 9.775215926782788e-06, "loss": 0.3477, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 62934736.0, "step": 76 }, { "entropy": 0.5644607543945312, "epoch": 0.875, "grad_norm": 24.175728899608025, "learning_rate": 9.765826918710466e-06, "loss": 0.3243, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 63755694.0, "step": 77 }, { "entropy": 0.5602569580078125, "epoch": 0.8863636363636364, "grad_norm": 23.17376499958452, "learning_rate": 9.75625051428433e-06, "loss": 0.3052, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 64618678.0, "step": 78 }, { "entropy": 0.5586624145507812, "epoch": 0.8977272727272727, "grad_norm": 19.233654316377294, "learning_rate": 9.746487090056712e-06, "loss": 0.2687, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 65481725.0, "step": 79 }, { "entropy": 0.544525146484375, "epoch": 0.9090909090909091, "grad_norm": 21.651323416304713, "learning_rate": 9.736537029933738e-06, "loss": 0.2901, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 66326767.0, "step": 80 }, { "entropy": 0.5647964477539062, "epoch": 0.9204545454545454, "grad_norm": 19.420139513068413, "learning_rate": 9.726400725160199e-06, "loss": 0.2637, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 67132451.0, "step": 81 }, { "entropy": 0.5595474243164062, "epoch": 0.9318181818181818, "grad_norm": 14.444919061371778, "learning_rate": 9.71607857430419e-06, "loss": 0.2071, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 67951468.0, "step": 82 }, { "entropy": 0.5521011352539062, "epoch": 0.9431818181818182, "grad_norm": 16.094060936634268, "learning_rate": 9.705570983241433e-06, "loss": 0.2253, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 68794669.0, "step": 83 }, { "entropy": 0.5579681396484375, "epoch": 0.9545454545454546, "grad_norm": 15.839955166577948, "learning_rate": 9.694878365139313e-06, "loss": 0.242, "mean_token_accuracy": 0.9179687548894435, "num_tokens": 69610011.0, "step": 84 }, { "entropy": 0.5581436157226562, "epoch": 0.9659090909090909, "grad_norm": 10.342555180693717, "learning_rate": 9.68400114044064e-06, "loss": 0.2039, "mean_token_accuracy": 0.9361979204695672, "num_tokens": 70406656.0, "step": 85 }, { "entropy": 0.5596466064453125, "epoch": 0.9772727272727273, "grad_norm": 9.046442023037564, "learning_rate": 9.672939736847104e-06, "loss": 0.1837, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 71214893.0, "step": 86 }, { "entropy": 0.5611190795898438, "epoch": 0.9886363636363636, "grad_norm": 8.777636166267405, "learning_rate": 9.661694589302471e-06, "loss": 0.1819, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 72038768.0, "step": 87 }, { "entropy": 0.5543975830078125, "epoch": 1.0, "grad_norm": 7.180103618322581, "learning_rate": 9.650266139975474e-06, "loss": 0.1855, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 72847782.0, "step": 88 }, { "entropy": 0.546783447265625, "epoch": 1.0113636363636365, "grad_norm": 11.51915215226843, "learning_rate": 9.63865483824243e-06, "loss": 0.1778, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 73701972.0, "step": 89 }, { "entropy": 0.5644760131835938, "epoch": 1.0227272727272727, "grad_norm": 16.213117035444654, "learning_rate": 9.62686114066956e-06, "loss": 0.2025, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 74539661.0, "step": 90 }, { "entropy": 0.551055908203125, "epoch": 1.0340909090909092, "grad_norm": 5.356675563856126, "learning_rate": 9.614885510995047e-06, "loss": 0.1652, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 75393325.0, "step": 91 }, { "entropy": 0.5531463623046875, "epoch": 1.0454545454545454, "grad_norm": 21.01084472531982, "learning_rate": 9.602728420110807e-06, "loss": 0.2453, "mean_token_accuracy": 0.8932291730307043, "num_tokens": 76230771.0, "step": 92 }, { "entropy": 0.5424957275390625, "epoch": 1.0568181818181819, "grad_norm": 9.730742448823642, "learning_rate": 9.590390346043952e-06, "loss": 0.1919, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 77097214.0, "step": 93 }, { "entropy": 0.5310592651367188, "epoch": 1.0681818181818181, "grad_norm": 19.44105394481565, "learning_rate": 9.577871773938013e-06, "loss": 0.2412, "mean_token_accuracy": 0.8736979241948575, "num_tokens": 78024772.0, "step": 94 }, { "entropy": 0.5767440795898438, "epoch": 1.0795454545454546, "grad_norm": 16.703429142070803, "learning_rate": 9.565173196033855e-06, "loss": 0.2218, "mean_token_accuracy": 0.8763020907063037, "num_tokens": 78828734.0, "step": 95 }, { "entropy": 0.5640106201171875, "epoch": 1.0909090909090908, "grad_norm": 4.368012071519686, "learning_rate": 9.552295111650328e-06, "loss": 0.1877, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 79685825.0, "step": 96 }, { "entropy": 0.5703125, "epoch": 1.1022727272727273, "grad_norm": 7.72202950747282, "learning_rate": 9.539238027164618e-06, "loss": 0.1969, "mean_token_accuracy": 0.9049479223322123, "num_tokens": 80538841.0, "step": 97 }, { "entropy": 0.5514984130859375, "epoch": 1.1136363636363635, "grad_norm": 8.583643682888079, "learning_rate": 9.526002455992361e-06, "loss": 0.1799, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 81406964.0, "step": 98 }, { "entropy": 0.5702438354492188, "epoch": 1.125, "grad_norm": 6.688536010514189, "learning_rate": 9.512588918567429e-06, "loss": 0.2044, "mean_token_accuracy": 0.9166666716337204, "num_tokens": 82228992.0, "step": 99 }, { "entropy": 0.5829925537109375, "epoch": 1.1363636363636362, "grad_norm": 3.019991683808097, "learning_rate": 9.498997942321484e-06, "loss": 0.1495, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 83019198.0, "step": 100 }, { "entropy": 0.5686721801757812, "epoch": 1.1477272727272727, "grad_norm": 4.776977543639304, "learning_rate": 9.48523006166323e-06, "loss": 0.1832, "mean_token_accuracy": 0.8984375060535967, "num_tokens": 83850145.0, "step": 101 }, { "entropy": 0.5742645263671875, "epoch": 1.1590909090909092, "grad_norm": 2.5324395623496323, "learning_rate": 9.471285817957407e-06, "loss": 0.1641, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 84659245.0, "step": 102 }, { "entropy": 0.5639724731445312, "epoch": 1.1704545454545454, "grad_norm": 3.1158639729762867, "learning_rate": 9.457165759503492e-06, "loss": 0.1662, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 85506138.0, "step": 103 }, { "entropy": 0.5506134033203125, "epoch": 1.1818181818181819, "grad_norm": 2.507395364410583, "learning_rate": 9.442870441514155e-06, "loss": 0.1461, "mean_token_accuracy": 0.9283854209352285, "num_tokens": 86335548.0, "step": 104 }, { "entropy": 0.55096435546875, "epoch": 1.1931818181818181, "grad_norm": 12.463466410754858, "learning_rate": 9.428400426093413e-06, "loss": 0.1831, "mean_token_accuracy": 0.9140625051222742, "num_tokens": 87158155.0, "step": 105 }, { "entropy": 0.5426483154296875, "epoch": 1.2045454545454546, "grad_norm": 4.335881181165624, "learning_rate": 9.413756282214538e-06, "loss": 0.1358, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 87999932.0, "step": 106 }, { "entropy": 0.5401229858398438, "epoch": 1.2159090909090908, "grad_norm": 15.619862153125911, "learning_rate": 9.398938585697679e-06, "loss": 0.2466, "mean_token_accuracy": 0.9114583386108279, "num_tokens": 88819905.0, "step": 107 }, { "entropy": 0.5477371215820312, "epoch": 1.2272727272727273, "grad_norm": 12.902228668846844, "learning_rate": 9.383947919187219e-06, "loss": 0.2149, "mean_token_accuracy": 0.9244791711680591, "num_tokens": 89634909.0, "step": 108 }, { "entropy": 0.5696182250976562, "epoch": 1.2386363636363638, "grad_norm": 2.68826428324166, "learning_rate": 9.368784872128877e-06, "loss": 0.1646, "mean_token_accuracy": 0.923177087912336, "num_tokens": 90402321.0, "step": 109 }, { "entropy": 0.564300537109375, "epoch": 1.25, "grad_norm": 10.378868916072332, "learning_rate": 9.35345004074651e-06, "loss": 0.1955, "mean_token_accuracy": 0.8945312562864274, "num_tokens": 91203638.0, "step": 110 }, { "entropy": 0.5643157958984375, "epoch": 1.2613636363636362, "grad_norm": 12.255195984734726, "learning_rate": 9.337944028018689e-06, "loss": 0.2115, "mean_token_accuracy": 0.8632812581490725, "num_tokens": 92020252.0, "step": 111 }, { "entropy": 0.5825271606445312, "epoch": 1.2727272727272727, "grad_norm": 7.770388720635199, "learning_rate": 9.322267443654974e-06, "loss": 0.1836, "mean_token_accuracy": 0.9088541720993817, "num_tokens": 92816807.0, "step": 112 }, { "entropy": 0.560638427734375, "epoch": 1.2840909090909092, "grad_norm": 1.6431578654617318, "learning_rate": 9.306420904071949e-06, "loss": 0.1725, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 93649991.0, "step": 113 }, { "entropy": 0.561920166015625, "epoch": 1.2954545454545454, "grad_norm": 1.4017962904890229, "learning_rate": 9.290405032368983e-06, "loss": 0.1653, "mean_token_accuracy": 0.9270833376795053, "num_tokens": 94459730.0, "step": 114 }, { "entropy": 0.5430450439453125, "epoch": 1.3068181818181819, "grad_norm": 1.6568698646332456, "learning_rate": 9.274220458303727e-06, "loss": 0.1466, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 95302339.0, "step": 115 }, { "entropy": 0.5311431884765625, "epoch": 1.3181818181818181, "grad_norm": 4.761720493612297, "learning_rate": 9.257867818267347e-06, "loss": 0.1553, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 96149112.0, "step": 116 }, { "entropy": 0.5357666015625, "epoch": 1.3295454545454546, "grad_norm": 2.1880525559156156, "learning_rate": 9.241347755259514e-06, "loss": 0.1222, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 96974920.0, "step": 117 }, { "entropy": 0.5627593994140625, "epoch": 1.3409090909090908, "grad_norm": 5.770062896951192, "learning_rate": 9.224660918863104e-06, "loss": 0.1592, "mean_token_accuracy": 0.9218750046566129, "num_tokens": 97741872.0, "step": 118 }, { "entropy": 0.532928466796875, "epoch": 1.3522727272727273, "grad_norm": 1.7482037476041852, "learning_rate": 9.207807965218668e-06, "loss": 0.1545, "mean_token_accuracy": 0.9153645883779973, "num_tokens": 98575899.0, "step": 119 }, { "entropy": 0.5299148559570312, "epoch": 1.3636363636363638, "grad_norm": 5.278489286192921, "learning_rate": 9.190789556998627e-06, "loss": 0.1656, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 99431810.0, "step": 120 }, { "entropy": 0.5542984008789062, "epoch": 1.375, "grad_norm": 2.007494136264229, "learning_rate": 9.173606363381218e-06, "loss": 0.1351, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 100229559.0, "step": 121 }, { "entropy": 0.5370101928710938, "epoch": 1.3863636363636362, "grad_norm": 3.033501872375309, "learning_rate": 9.156259060024177e-06, "loss": 0.1464, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 101066027.0, "step": 122 }, { "entropy": 0.5522689819335938, "epoch": 1.3977272727272727, "grad_norm": 3.9482667306063894, "learning_rate": 9.138748329038175e-06, "loss": 0.1251, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 101861078.0, "step": 123 }, { "entropy": 0.5521087646484375, "epoch": 1.4090909090909092, "grad_norm": 6.9998063616091954, "learning_rate": 9.121074858959997e-06, "loss": 0.1495, "mean_token_accuracy": 0.9296875041909516, "num_tokens": 102654586.0, "step": 124 }, { "entropy": 0.5347137451171875, "epoch": 1.4204545454545454, "grad_norm": 2.3410945679167954, "learning_rate": 9.103239344725465e-06, "loss": 0.1254, "mean_token_accuracy": 0.945312503259629, "num_tokens": 103497056.0, "step": 125 }, { "entropy": 0.5460281372070312, "epoch": 1.4318181818181819, "grad_norm": 2.2767941229194495, "learning_rate": 9.085242487642117e-06, "loss": 0.1238, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 104315903.0, "step": 126 }, { "entropy": 0.5574874877929688, "epoch": 1.4431818181818181, "grad_norm": 4.61478807505284, "learning_rate": 9.067084995361623e-06, "loss": 0.1352, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 105097772.0, "step": 127 }, { "entropy": 0.53790283203125, "epoch": 1.4545454545454546, "grad_norm": 2.123720821874447, "learning_rate": 9.048767581851973e-06, "loss": 0.1419, "mean_token_accuracy": 0.9335937539581209, "num_tokens": 105938088.0, "step": 128 }, { "entropy": 0.5608444213867188, "epoch": 1.4659090909090908, "grad_norm": 1.5993090480931784, "learning_rate": 9.030290967369392e-06, "loss": 0.1217, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 106725926.0, "step": 129 }, { "entropy": 0.5270538330078125, "epoch": 1.4772727272727273, "grad_norm": 1.8273550840469062, "learning_rate": 9.011655878430018e-06, "loss": 0.1352, "mean_token_accuracy": 0.9388020869810134, "num_tokens": 107586088.0, "step": 130 }, { "entropy": 0.5430755615234375, "epoch": 1.4886363636363638, "grad_norm": 2.38930161410884, "learning_rate": 8.992863047781346e-06, "loss": 0.1219, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 108406686.0, "step": 131 }, { "entropy": 0.5335464477539062, "epoch": 1.5, "grad_norm": 4.0374627826678005, "learning_rate": 8.973913214373405e-06, "loss": 0.1383, "mean_token_accuracy": 0.9322916707023978, "num_tokens": 109266458.0, "step": 132 }, { "entropy": 0.53369140625, "epoch": 1.5113636363636362, "grad_norm": 2.267529849479496, "learning_rate": 8.954807123329703e-06, "loss": 0.1408, "mean_token_accuracy": 0.945312503259629, "num_tokens": 110097799.0, "step": 133 }, { "entropy": 0.5591506958007812, "epoch": 1.5227272727272727, "grad_norm": 1.4402288066852007, "learning_rate": 8.935545525917936e-06, "loss": 0.1294, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 110885530.0, "step": 134 }, { "entropy": 0.5544586181640625, "epoch": 1.5340909090909092, "grad_norm": 1.7978385448039365, "learning_rate": 8.916129179520443e-06, "loss": 0.1341, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 111709692.0, "step": 135 }, { "entropy": 0.549468994140625, "epoch": 1.5454545454545454, "grad_norm": 5.401067092612967, "learning_rate": 8.896558847604414e-06, "loss": 0.125, "mean_token_accuracy": 0.9414062534924597, "num_tokens": 112521769.0, "step": 136 }, { "entropy": 0.5504837036132812, "epoch": 1.5568181818181817, "grad_norm": 3.341966167312368, "learning_rate": 8.876835299691892e-06, "loss": 0.1175, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 113374792.0, "step": 137 }, { "entropy": 0.5494613647460938, "epoch": 1.5681818181818183, "grad_norm": 10.390450710003204, "learning_rate": 8.856959311329495e-06, "loss": 0.1683, "mean_token_accuracy": 0.9205729214008898, "num_tokens": 114209235.0, "step": 138 }, { "entropy": 0.5491485595703125, "epoch": 1.5795454545454546, "grad_norm": 7.8130355045508395, "learning_rate": 8.836931664057935e-06, "loss": 0.1425, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 115024770.0, "step": 139 }, { "entropy": 0.5506820678710938, "epoch": 1.5909090909090908, "grad_norm": 2.7886680033055837, "learning_rate": 8.816753145381276e-06, "loss": 0.1168, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 115851204.0, "step": 140 }, { "entropy": 0.5492782592773438, "epoch": 1.6022727272727273, "grad_norm": 1.9667205066850295, "learning_rate": 8.796424548735975e-06, "loss": 0.134, "mean_token_accuracy": 0.9427083367481828, "num_tokens": 116699283.0, "step": 141 }, { "entropy": 0.5760650634765625, "epoch": 1.6136363636363638, "grad_norm": 5.35821863023592, "learning_rate": 8.775946673459682e-06, "loss": 0.1229, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 117503423.0, "step": 142 }, { "entropy": 0.5779876708984375, "epoch": 1.625, "grad_norm": 2.8148266983890413, "learning_rate": 8.755320324759808e-06, "loss": 0.109, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 118291651.0, "step": 143 }, { "entropy": 0.5753326416015625, "epoch": 1.6363636363636362, "grad_norm": 3.2866194746376927, "learning_rate": 8.734546313681869e-06, "loss": 0.1232, "mean_token_accuracy": 0.9479166697710752, "num_tokens": 119098075.0, "step": 144 }, { "entropy": 0.5827407836914062, "epoch": 1.6477272727272727, "grad_norm": 2.9644999295746874, "learning_rate": 8.713625457077585e-06, "loss": 0.1191, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 119888254.0, "step": 145 }, { "entropy": 0.5566940307617188, "epoch": 1.6590909090909092, "grad_norm": 1.2629263567013618, "learning_rate": 8.692558577572773e-06, "loss": 0.1199, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 120730204.0, "step": 146 }, { "entropy": 0.5731048583984375, "epoch": 1.6704545454545454, "grad_norm": 4.651622739066796, "learning_rate": 8.671346503534987e-06, "loss": 0.1216, "mean_token_accuracy": 0.9401041702367365, "num_tokens": 121547571.0, "step": 147 }, { "entropy": 0.5564804077148438, "epoch": 1.6818181818181817, "grad_norm": 1.8545038590057121, "learning_rate": 8.64999006904096e-06, "loss": 0.0951, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 122398891.0, "step": 148 }, { "entropy": 0.5613327026367188, "epoch": 1.6931818181818183, "grad_norm": 6.832643774674837, "learning_rate": 8.628490113843798e-06, "loss": 0.1649, "mean_token_accuracy": 0.9309895874466747, "num_tokens": 123217975.0, "step": 149 }, { "entropy": 0.5449295043945312, "epoch": 1.7045454545454546, "grad_norm": 6.765128841750841, "learning_rate": 8.606847483339957e-06, "loss": 0.1562, "mean_token_accuracy": 0.9257812544237822, "num_tokens": 124067259.0, "step": 150 }, { "entropy": 0.5537567138671875, "epoch": 1.7159090909090908, "grad_norm": 1.8067052651080437, "learning_rate": 8.585063028536015e-06, "loss": 0.0881, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 124928545.0, "step": 151 }, { "entropy": 0.5481033325195312, "epoch": 1.7272727272727273, "grad_norm": 1.936600476892622, "learning_rate": 8.563137606015201e-06, "loss": 0.1096, "mean_token_accuracy": 0.9531250027939677, "num_tokens": 125802700.0, "step": 152 }, { "entropy": 0.5662612915039062, "epoch": 1.7386363636363638, "grad_norm": 3.3667078901079286, "learning_rate": 8.54107207790371e-06, "loss": 0.0913, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 126615551.0, "step": 153 }, { "entropy": 0.5416641235351562, "epoch": 1.75, "grad_norm": 1.8440360775274136, "learning_rate": 8.518867311836808e-06, "loss": 0.1098, "mean_token_accuracy": 0.955729169305414, "num_tokens": 127457356.0, "step": 154 }, { "entropy": 0.539337158203125, "epoch": 1.7613636363636362, "grad_norm": 2.928897176060182, "learning_rate": 8.49652418092472e-06, "loss": 0.0814, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 128340971.0, "step": 155 }, { "entropy": 0.5511016845703125, "epoch": 1.7727272727272727, "grad_norm": 2.5333879176767247, "learning_rate": 8.474043563718287e-06, "loss": 0.0981, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 129194640.0, "step": 156 }, { "entropy": 0.5571365356445312, "epoch": 1.7840909090909092, "grad_norm": 3.34543946127575, "learning_rate": 8.451426344174433e-06, "loss": 0.0957, "mean_token_accuracy": 0.955729169305414, "num_tokens": 130035771.0, "step": 157 }, { "entropy": 0.5454559326171875, "epoch": 1.7954545454545454, "grad_norm": 1.7036283510875303, "learning_rate": 8.4286734116214e-06, "loss": 0.0966, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 130925665.0, "step": 158 }, { "entropy": 0.5605239868164062, "epoch": 1.8068181818181817, "grad_norm": 1.665281365638825, "learning_rate": 8.405785660723784e-06, "loss": 0.0988, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 131762678.0, "step": 159 }, { "entropy": 0.5491256713867188, "epoch": 1.8181818181818183, "grad_norm": 7.473940273244859, "learning_rate": 8.382763991447344e-06, "loss": 0.1227, "mean_token_accuracy": 0.9440104200039059, "num_tokens": 132615569.0, "step": 160 }, { "entropy": 0.5663986206054688, "epoch": 1.8295454545454546, "grad_norm": 7.633821231684359, "learning_rate": 8.359609309023632e-06, "loss": 0.125, "mean_token_accuracy": 0.9492187530267984, "num_tokens": 133443981.0, "step": 161 }, { "entropy": 0.5599212646484375, "epoch": 1.8409090909090908, "grad_norm": 1.7052593935067812, "learning_rate": 8.336322523914385e-06, "loss": 0.0974, "mean_token_accuracy": 0.9596354190725833, "num_tokens": 134283417.0, "step": 162 }, { "entropy": 0.551239013671875, "epoch": 1.8522727272727273, "grad_norm": 2.9955966815957433, "learning_rate": 8.312904551775731e-06, "loss": 0.096, "mean_token_accuracy": 0.955729169305414, "num_tokens": 135129505.0, "step": 163 }, { "entropy": 0.54931640625, "epoch": 1.8636363636363638, "grad_norm": 1.605468865667034, "learning_rate": 8.289356313422182e-06, "loss": 0.116, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 135982474.0, "step": 164 }, { "entropy": 0.5581436157226562, "epoch": 1.875, "grad_norm": 1.6624611761909598, "learning_rate": 8.26567873479043e-06, "loss": 0.084, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 136810572.0, "step": 165 }, { "entropy": 0.564849853515625, "epoch": 1.8863636363636362, "grad_norm": 2.682316743161302, "learning_rate": 8.241872746902934e-06, "loss": 0.0893, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 137623231.0, "step": 166 }, { "entropy": 0.5549545288085938, "epoch": 1.8977272727272727, "grad_norm": 3.8888251203366844, "learning_rate": 8.217939285831315e-06, "loss": 0.1113, "mean_token_accuracy": 0.9466145865153521, "num_tokens": 138474048.0, "step": 167 }, { "entropy": 0.5504684448242188, "epoch": 1.9090909090909092, "grad_norm": 1.723647194606046, "learning_rate": 8.19387929265955e-06, "loss": 0.0836, "mean_token_accuracy": 0.967447918606922, "num_tokens": 139318039.0, "step": 168 }, { "entropy": 0.5573348999023438, "epoch": 1.9204545454545454, "grad_norm": 5.091415219779504, "learning_rate": 8.16969371344696e-06, "loss": 0.094, "mean_token_accuracy": 0.9687500018626451, "num_tokens": 140136159.0, "step": 169 }, { "entropy": 0.558563232421875, "epoch": 1.9318181818181817, "grad_norm": 3.8416271732252576, "learning_rate": 8.14538349919102e-06, "loss": 0.1123, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 140989018.0, "step": 170 }, { "entropy": 0.5544891357421875, "epoch": 1.9431818181818183, "grad_norm": 4.006864387265136, "learning_rate": 8.12094960578996e-06, "loss": 0.0885, "mean_token_accuracy": 0.9609375023283064, "num_tokens": 141834446.0, "step": 171 }, { "entropy": 0.5765304565429688, "epoch": 1.9545454545454546, "grad_norm": 2.0489624758048204, "learning_rate": 8.096392994005177e-06, "loss": 0.0784, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 142637643.0, "step": 172 }, { "entropy": 0.5684738159179688, "epoch": 1.9659090909090908, "grad_norm": 1.7193238150531707, "learning_rate": 8.071714629423459e-06, "loss": 0.0671, "mean_token_accuracy": 0.9804687511641532, "num_tokens": 143446972.0, "step": 173 }, { "entropy": 0.5797042846679688, "epoch": 1.9772727272727273, "grad_norm": 1.8892931068119632, "learning_rate": 8.046915482419018e-06, "loss": 0.0761, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 144219843.0, "step": 174 }, { "entropy": 0.5546188354492188, "epoch": 1.9886363636363638, "grad_norm": 2.89175754473916, "learning_rate": 8.021996528115335e-06, "loss": 0.0843, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 145065280.0, "step": 175 }, { "entropy": 0.5574493408203125, "epoch": 2.0, "grad_norm": 1.7760622157547898, "learning_rate": 7.996958746346812e-06, "loss": 0.0593, "mean_token_accuracy": 0.977864584652707, "num_tokens": 145884441.0, "step": 176 }, { "entropy": 0.5555419921875, "epoch": 2.0113636363636362, "grad_norm": 1.6411360190424782, "learning_rate": 7.971803121620252e-06, "loss": 0.0667, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 146692231.0, "step": 177 }, { "entropy": 0.5388107299804688, "epoch": 2.022727272727273, "grad_norm": 2.3653599867436776, "learning_rate": 7.946530643076138e-06, "loss": 0.0652, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 147528006.0, "step": 178 }, { "entropy": 0.5242385864257812, "epoch": 2.034090909090909, "grad_norm": 3.2222161790722694, "learning_rate": 7.921142304449744e-06, "loss": 0.0675, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 148417260.0, "step": 179 }, { "entropy": 0.5447845458984375, "epoch": 2.0454545454545454, "grad_norm": 2.463707776669041, "learning_rate": 7.895639104032071e-06, "loss": 0.066, "mean_token_accuracy": 0.9791666679084301, "num_tokens": 149242794.0, "step": 180 }, { "entropy": 0.5285720825195312, "epoch": 2.0568181818181817, "grad_norm": 7.236977149302813, "learning_rate": 7.870022044630569e-06, "loss": 0.1021, "mean_token_accuracy": 0.9622395855840296, "num_tokens": 150098081.0, "step": 181 }, { "entropy": 0.54425048828125, "epoch": 2.0681818181818183, "grad_norm": 3.3362947055324494, "learning_rate": 7.844292133529727e-06, "loss": 0.0811, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 150917507.0, "step": 182 }, { "entropy": 0.5337677001953125, "epoch": 2.0795454545454546, "grad_norm": 10.193693903709919, "learning_rate": 7.818450382451457e-06, "loss": 0.1514, "mean_token_accuracy": 0.9348958372138441, "num_tokens": 151774062.0, "step": 183 }, { "entropy": 0.5380935668945312, "epoch": 2.090909090909091, "grad_norm": 6.471329983707537, "learning_rate": 7.792497807515317e-06, "loss": 0.1129, "mean_token_accuracy": 0.9518229195382446, "num_tokens": 152611097.0, "step": 184 }, { "entropy": 0.5472564697265625, "epoch": 2.102272727272727, "grad_norm": 1.6410409359900417, "learning_rate": 7.766435429198547e-06, "loss": 0.0666, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 153466669.0, "step": 185 }, { "entropy": 0.5562591552734375, "epoch": 2.1136363636363638, "grad_norm": 6.992530287146324, "learning_rate": 7.740264272295954e-06, "loss": 0.1071, "mean_token_accuracy": 0.9583333358168602, "num_tokens": 154298660.0, "step": 186 }, { "entropy": 0.5470352172851562, "epoch": 2.125, "grad_norm": 6.0492157426773385, "learning_rate": 7.713985365879607e-06, "loss": 0.0907, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 155130292.0, "step": 187 }, { "entropy": 0.54876708984375, "epoch": 2.1363636363636362, "grad_norm": 2.698030226800749, "learning_rate": 7.68759974325838e-06, "loss": 0.0665, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 155975863.0, "step": 188 }, { "entropy": 0.5512847900390625, "epoch": 2.147727272727273, "grad_norm": 5.25550884455084, "learning_rate": 7.661108441937321e-06, "loss": 0.0861, "mean_token_accuracy": 0.9661458353511989, "num_tokens": 156800322.0, "step": 189 }, { "entropy": 0.5344924926757812, "epoch": 2.159090909090909, "grad_norm": 9.276367829707043, "learning_rate": 7.63451250357685e-06, "loss": 0.1309, "mean_token_accuracy": 0.9375000037252903, "num_tokens": 157668790.0, "step": 190 }, { "entropy": 0.5440673828125, "epoch": 2.1704545454545454, "grad_norm": 5.295760824612763, "learning_rate": 7.607812973951802e-06, "loss": 0.0848, "mean_token_accuracy": 0.9648437520954758, "num_tokens": 158496829.0, "step": 191 }, { "entropy": 0.5383377075195312, "epoch": 2.1818181818181817, "grad_norm": 1.4547423731248486, "learning_rate": 7.581010902910316e-06, "loss": 0.0592, "mean_token_accuracy": 0.9739583348855376, "num_tokens": 159347676.0, "step": 192 }, { "entropy": 0.5419235229492188, "epoch": 2.1931818181818183, "grad_norm": 3.45460465485423, "learning_rate": 7.55410734433254e-06, "loss": 0.0756, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 160186343.0, "step": 193 }, { "entropy": 0.5480194091796875, "epoch": 2.2045454545454546, "grad_norm": 4.242880928264725, "learning_rate": 7.5271033560892e-06, "loss": 0.0786, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 160994811.0, "step": 194 }, { "entropy": 0.54876708984375, "epoch": 2.215909090909091, "grad_norm": 2.01718177176073, "learning_rate": 7.500000000000001e-06, "loss": 0.0441, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 161804760.0, "step": 195 }, { "entropy": 0.5249099731445312, "epoch": 2.227272727272727, "grad_norm": 3.119173047157026, "learning_rate": 7.472798341791877e-06, "loss": 0.0502, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 162669635.0, "step": 196 }, { "entropy": 0.5340576171875, "epoch": 2.2386363636363638, "grad_norm": 3.2640184382157327, "learning_rate": 7.445499451057083e-06, "loss": 0.0751, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 163509429.0, "step": 197 }, { "entropy": 0.5215835571289062, "epoch": 2.25, "grad_norm": 2.5349734692769106, "learning_rate": 7.418104401211144e-06, "loss": 0.045, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 164350633.0, "step": 198 }, { "entropy": 0.5251007080078125, "epoch": 2.2613636363636362, "grad_norm": 2.6297238209432, "learning_rate": 7.390614269450633e-06, "loss": 0.0509, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 165187521.0, "step": 199 }, { "entropy": 0.5383987426757812, "epoch": 2.2727272727272725, "grad_norm": 3.562530401980587, "learning_rate": 7.363030136710837e-06, "loss": 0.05, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 166006751.0, "step": 200 }, { "entropy": 0.5240249633789062, "epoch": 2.284090909090909, "grad_norm": 3.7171813986264657, "learning_rate": 7.3353530876232315e-06, "loss": 0.0712, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 166843238.0, "step": 201 }, { "entropy": 0.5258255004882812, "epoch": 2.2954545454545454, "grad_norm": 2.0334492418178454, "learning_rate": 7.3075842104728445e-06, "loss": 0.0463, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 167691986.0, "step": 202 }, { "entropy": 0.5163116455078125, "epoch": 2.3068181818181817, "grad_norm": 5.950230395278793, "learning_rate": 7.279724597155463e-06, "loss": 0.0688, "mean_token_accuracy": 0.9700520851183683, "num_tokens": 168532989.0, "step": 203 }, { "entropy": 0.5151596069335938, "epoch": 2.3181818181818183, "grad_norm": 6.82055119678414, "learning_rate": 7.251775343134695e-06, "loss": 0.0818, "mean_token_accuracy": 0.9713541683740914, "num_tokens": 169363455.0, "step": 204 }, { "entropy": 0.5315170288085938, "epoch": 2.3295454545454546, "grad_norm": 3.65607888800478, "learning_rate": 7.223737547398898e-06, "loss": 0.0681, "mean_token_accuracy": 0.9752604181412607, "num_tokens": 170177432.0, "step": 205 }, { "entropy": 0.5236282348632812, "epoch": 2.340909090909091, "grad_norm": 3.922669241526332, "learning_rate": 7.195612312417964e-06, "loss": 0.0405, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 171014423.0, "step": 206 }, { "entropy": 0.5333251953125, "epoch": 2.3522727272727275, "grad_norm": 5.835006780995604, "learning_rate": 7.1674007440999706e-06, "loss": 0.0701, "mean_token_accuracy": 0.967447918606922, "num_tokens": 171832784.0, "step": 207 }, { "entropy": 0.5201034545898438, "epoch": 2.3636363636363638, "grad_norm": 3.762781794806702, "learning_rate": 7.139103951747694e-06, "loss": 0.0689, "mean_token_accuracy": 0.9726562516298145, "num_tokens": 172678098.0, "step": 208 }, { "entropy": 0.5207901000976562, "epoch": 2.375, "grad_norm": 2.8820018215511296, "learning_rate": 7.110723048014996e-06, "loss": 0.051, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 173521043.0, "step": 209 }, { "entropy": 0.5383834838867188, "epoch": 2.3863636363636362, "grad_norm": 4.032203821493698, "learning_rate": 7.082259148863064e-06, "loss": 0.0514, "mean_token_accuracy": 0.9765625013969839, "num_tokens": 174330129.0, "step": 210 }, { "entropy": 0.5193099975585938, "epoch": 2.3977272727272725, "grad_norm": 4.031643917333958, "learning_rate": 7.053713373516538e-06, "loss": 0.0599, "mean_token_accuracy": 0.977864584652707, "num_tokens": 175179051.0, "step": 211 }, { "entropy": 0.5137481689453125, "epoch": 2.409090909090909, "grad_norm": 1.494980753055014, "learning_rate": 7.0250868444195e-06, "loss": 0.0463, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 176037909.0, "step": 212 }, { "entropy": 0.5253143310546875, "epoch": 2.4204545454545454, "grad_norm": 1.7895769023957353, "learning_rate": 6.996380687191335e-06, "loss": 0.0445, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 176860131.0, "step": 213 }, { "entropy": 0.5091476440429688, "epoch": 2.4318181818181817, "grad_norm": 2.1956945351245682, "learning_rate": 6.9675960305824785e-06, "loss": 0.0398, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 177705136.0, "step": 214 }, { "entropy": 0.523956298828125, "epoch": 2.4431818181818183, "grad_norm": 2.703363904992189, "learning_rate": 6.9387340064300234e-06, "loss": 0.0425, "mean_token_accuracy": 0.9817708344198763, "num_tokens": 178510671.0, "step": 215 }, { "entropy": 0.5203170776367188, "epoch": 2.4545454545454546, "grad_norm": 2.1620913420308496, "learning_rate": 6.909795749613223e-06, "loss": 0.034, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 179329651.0, "step": 216 }, { "entropy": 0.5090408325195312, "epoch": 2.465909090909091, "grad_norm": 2.3276449435155357, "learning_rate": 6.880782398008862e-06, "loss": 0.0318, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 180158843.0, "step": 217 }, { "entropy": 0.501129150390625, "epoch": 2.4772727272727275, "grad_norm": 3.2599600902110795, "learning_rate": 6.851695092446517e-06, "loss": 0.0347, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 181003645.0, "step": 218 }, { "entropy": 0.49993133544921875, "epoch": 2.4886363636363638, "grad_norm": 2.2988867090225753, "learning_rate": 6.822534976663695e-06, "loss": 0.0249, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 181838208.0, "step": 219 }, { "entropy": 0.49395751953125, "epoch": 2.5, "grad_norm": 2.8150814971598033, "learning_rate": 6.7933031972608644e-06, "loss": 0.0229, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 182681925.0, "step": 220 }, { "entropy": 0.49420928955078125, "epoch": 2.5113636363636362, "grad_norm": 2.4377006101904937, "learning_rate": 6.764000903656367e-06, "loss": 0.0219, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 183519031.0, "step": 221 }, { "entropy": 0.49845123291015625, "epoch": 2.5227272727272725, "grad_norm": 1.9554050092260766, "learning_rate": 6.734629248041226e-06, "loss": 0.0282, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 184358917.0, "step": 222 }, { "entropy": 0.4810333251953125, "epoch": 2.534090909090909, "grad_norm": 1.9724903668294038, "learning_rate": 6.70518938533383e-06, "loss": 0.0223, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 185224412.0, "step": 223 }, { "entropy": 0.5009841918945312, "epoch": 2.5454545454545454, "grad_norm": 2.716850789628636, "learning_rate": 6.675682473134536e-06, "loss": 0.0354, "mean_token_accuracy": 0.989583333954215, "num_tokens": 186048899.0, "step": 224 }, { "entropy": 0.494140625, "epoch": 2.5568181818181817, "grad_norm": 3.486680710445784, "learning_rate": 6.64610967168014e-06, "loss": 0.045, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 186880043.0, "step": 225 }, { "entropy": 0.5100173950195312, "epoch": 2.5681818181818183, "grad_norm": 3.387793071023026, "learning_rate": 6.61647214379826e-06, "loss": 0.0482, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 187654506.0, "step": 226 }, { "entropy": 0.5018386840820312, "epoch": 2.5795454545454546, "grad_norm": 2.526453396302034, "learning_rate": 6.586771054861613e-06, "loss": 0.0263, "mean_token_accuracy": 0.989583333954215, "num_tokens": 188464920.0, "step": 227 }, { "entropy": 0.5029830932617188, "epoch": 2.590909090909091, "grad_norm": 3.0298112078189554, "learning_rate": 6.55700757274219e-06, "loss": 0.0374, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 189293301.0, "step": 228 }, { "entropy": 0.5063552856445312, "epoch": 2.6022727272727275, "grad_norm": 2.394723685020552, "learning_rate": 6.527182867765333e-06, "loss": 0.023, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 190121007.0, "step": 229 }, { "entropy": 0.5308837890625, "epoch": 2.6136363636363638, "grad_norm": 2.945677885521069, "learning_rate": 6.497298112663721e-06, "loss": 0.0335, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 190881003.0, "step": 230 }, { "entropy": 0.506683349609375, "epoch": 2.625, "grad_norm": 2.431103647089613, "learning_rate": 6.467354482531254e-06, "loss": 0.0189, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 191699288.0, "step": 231 }, { "entropy": 0.50860595703125, "epoch": 2.6363636363636362, "grad_norm": 2.6832616012101975, "learning_rate": 6.437353154776848e-06, "loss": 0.0289, "mean_token_accuracy": 0.989583333954215, "num_tokens": 192526668.0, "step": 232 }, { "entropy": 0.5082778930664062, "epoch": 2.6477272727272725, "grad_norm": 2.2653553256013796, "learning_rate": 6.407295309078139e-06, "loss": 0.0302, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 193349142.0, "step": 233 }, { "entropy": 0.5096511840820312, "epoch": 2.659090909090909, "grad_norm": 1.5361613738548292, "learning_rate": 6.377182127335096e-06, "loss": 0.0145, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 194176787.0, "step": 234 }, { "entropy": 0.5134658813476562, "epoch": 2.6704545454545454, "grad_norm": 2.8189785812143295, "learning_rate": 6.3470147936235485e-06, "loss": 0.0238, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 195004474.0, "step": 235 }, { "entropy": 0.5214920043945312, "epoch": 2.6818181818181817, "grad_norm": 1.7469149362286367, "learning_rate": 6.316794494148625e-06, "loss": 0.0224, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 195807358.0, "step": 236 }, { "entropy": 0.5124053955078125, "epoch": 2.6931818181818183, "grad_norm": 3.217590245044298, "learning_rate": 6.286522417198115e-06, "loss": 0.0284, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 196617818.0, "step": 237 }, { "entropy": 0.5110855102539062, "epoch": 2.7045454545454546, "grad_norm": 1.6664060279228878, "learning_rate": 6.256199753095745e-06, "loss": 0.0188, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 197430311.0, "step": 238 }, { "entropy": 0.505767822265625, "epoch": 2.715909090909091, "grad_norm": 3.3115585706561586, "learning_rate": 6.225827694154365e-06, "loss": 0.0287, "mean_token_accuracy": 0.989583333954215, "num_tokens": 198277321.0, "step": 239 }, { "entropy": 0.5065383911132812, "epoch": 2.7272727272727275, "grad_norm": 2.545417377201447, "learning_rate": 6.1954074346290775e-06, "loss": 0.0186, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 199108815.0, "step": 240 }, { "entropy": 0.5145187377929688, "epoch": 2.7386363636363638, "grad_norm": 2.333232181354277, "learning_rate": 6.164940170670266e-06, "loss": 0.0467, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 199899561.0, "step": 241 }, { "entropy": 0.516876220703125, "epoch": 2.75, "grad_norm": 2.914196581871424, "learning_rate": 6.134427100276579e-06, "loss": 0.0562, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 200725431.0, "step": 242 }, { "entropy": 0.5063247680664062, "epoch": 2.7613636363636362, "grad_norm": 1.8740763274483776, "learning_rate": 6.1038694232478e-06, "loss": 0.0263, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 201560765.0, "step": 243 }, { "entropy": 0.5165557861328125, "epoch": 2.7727272727272725, "grad_norm": 2.8713490854584705, "learning_rate": 6.073268341137694e-06, "loss": 0.0273, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 202381503.0, "step": 244 }, { "entropy": 0.5127792358398438, "epoch": 2.784090909090909, "grad_norm": 1.4259016961929543, "learning_rate": 6.042625057206742e-06, "loss": 0.0237, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 203203187.0, "step": 245 }, { "entropy": 0.5303802490234375, "epoch": 2.7954545454545454, "grad_norm": 2.761830170391252, "learning_rate": 6.0119407763748465e-06, "loss": 0.0256, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 203986073.0, "step": 246 }, { "entropy": 0.5184249877929688, "epoch": 2.8068181818181817, "grad_norm": 2.702130210407872, "learning_rate": 5.98121670517393e-06, "loss": 0.0274, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 204805437.0, "step": 247 }, { "entropy": 0.5221176147460938, "epoch": 2.8181818181818183, "grad_norm": 2.0126057256121936, "learning_rate": 5.950454051700519e-06, "loss": 0.0324, "mean_token_accuracy": 0.989583333954215, "num_tokens": 205608028.0, "step": 248 }, { "entropy": 0.5098419189453125, "epoch": 2.8295454545454546, "grad_norm": 3.2909474534699363, "learning_rate": 5.919654025568216e-06, "loss": 0.0392, "mean_token_accuracy": 0.9856770841870457, "num_tokens": 206426839.0, "step": 249 }, { "entropy": 0.49706268310546875, "epoch": 2.840909090909091, "grad_norm": 2.8470716408708774, "learning_rate": 5.8888178378601565e-06, "loss": 0.0344, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 207279265.0, "step": 250 }, { "entropy": 0.5188217163085938, "epoch": 2.8522727272727275, "grad_norm": 3.5895784761020173, "learning_rate": 5.85794670108138e-06, "loss": 0.0305, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 208074284.0, "step": 251 }, { "entropy": 0.5012741088867188, "epoch": 2.8636363636363638, "grad_norm": 6.393556799364061, "learning_rate": 5.827041829111144e-06, "loss": 0.0626, "mean_token_accuracy": 0.9830729176755995, "num_tokens": 208920165.0, "step": 252 }, { "entropy": 0.499847412109375, "epoch": 2.875, "grad_norm": 3.127734745296954, "learning_rate": 5.796104437155213e-06, "loss": 0.0261, "mean_token_accuracy": 0.989583333954215, "num_tokens": 209763076.0, "step": 253 }, { "entropy": 0.498291015625, "epoch": 2.8863636363636362, "grad_norm": 2.259346877145594, "learning_rate": 5.765135741698058e-06, "loss": 0.023, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 210578485.0, "step": 254 }, { "entropy": 0.5106277465820312, "epoch": 2.8977272727272725, "grad_norm": 3.030246237871133, "learning_rate": 5.734136960455035e-06, "loss": 0.0405, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 211388341.0, "step": 255 }, { "entropy": 0.49578857421875, "epoch": 2.909090909090909, "grad_norm": 2.750429015412331, "learning_rate": 5.703109312324493e-06, "loss": 0.0358, "mean_token_accuracy": 0.9882812506984919, "num_tokens": 212229038.0, "step": 256 }, { "entropy": 0.5024261474609375, "epoch": 2.9204545454545454, "grad_norm": 1.2863557607521734, "learning_rate": 5.672054017339855e-06, "loss": 0.0148, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 213072715.0, "step": 257 }, { "entropy": 0.49884796142578125, "epoch": 2.9318181818181817, "grad_norm": 3.02221939510891, "learning_rate": 5.640972296621644e-06, "loss": 0.0238, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 213912498.0, "step": 258 }, { "entropy": 0.5084075927734375, "epoch": 2.9431818181818183, "grad_norm": 4.428890553493783, "learning_rate": 5.609865372329461e-06, "loss": 0.0398, "mean_token_accuracy": 0.9843750009313226, "num_tokens": 214726839.0, "step": 259 }, { "entropy": 0.5108566284179688, "epoch": 2.9545454545454546, "grad_norm": 2.5752296561401664, "learning_rate": 5.578734467613933e-06, "loss": 0.0223, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 215527901.0, "step": 260 }, { "entropy": 0.4960784912109375, "epoch": 2.965909090909091, "grad_norm": 1.401132052463041, "learning_rate": 5.547580806568621e-06, "loss": 0.0187, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 216370628.0, "step": 261 }, { "entropy": 0.5012435913085938, "epoch": 2.9772727272727275, "grad_norm": 1.8759810858997696, "learning_rate": 5.516405614181883e-06, "loss": 0.0164, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 217191647.0, "step": 262 }, { "entropy": 0.494384765625, "epoch": 2.9886363636363638, "grad_norm": 1.708292087377222, "learning_rate": 5.485210116288704e-06, "loss": 0.0128, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 218045769.0, "step": 263 }, { "entropy": 0.4870147705078125, "epoch": 3.0, "grad_norm": 3.1469843111715337, "learning_rate": 5.453995539522503e-06, "loss": 0.0294, "mean_token_accuracy": 0.9869791674427688, "num_tokens": 218903155.0, "step": 264 }, { "entropy": 0.49308013916015625, "epoch": 3.0113636363636362, "grad_norm": 0.910668285240801, "learning_rate": 5.4227631112668955e-06, "loss": 0.0086, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 219733127.0, "step": 265 }, { "entropy": 0.497039794921875, "epoch": 3.022727272727273, "grad_norm": 1.4708773619805977, "learning_rate": 5.391514059607431e-06, "loss": 0.0098, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 220560193.0, "step": 266 }, { "entropy": 0.4811553955078125, "epoch": 3.034090909090909, "grad_norm": 2.026974085665039, "learning_rate": 5.360249613283308e-06, "loss": 0.014, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 221419000.0, "step": 267 }, { "entropy": 0.5044631958007812, "epoch": 3.0454545454545454, "grad_norm": 1.7384277706071183, "learning_rate": 5.328971001639054e-06, "loss": 0.0138, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 222209715.0, "step": 268 }, { "entropy": 0.4959716796875, "epoch": 3.0568181818181817, "grad_norm": 2.1623095900546883, "learning_rate": 5.2976794545761886e-06, "loss": 0.0188, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 223024902.0, "step": 269 }, { "entropy": 0.49108123779296875, "epoch": 3.0681818181818183, "grad_norm": 1.509606911341433, "learning_rate": 5.266376202504866e-06, "loss": 0.0217, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 223854643.0, "step": 270 }, { "entropy": 0.48877716064453125, "epoch": 3.0795454545454546, "grad_norm": 3.1043150088135776, "learning_rate": 5.235062476295488e-06, "loss": 0.0311, "mean_token_accuracy": 0.989583333954215, "num_tokens": 224719358.0, "step": 271 }, { "entropy": 0.4909515380859375, "epoch": 3.090909090909091, "grad_norm": 3.13964195517409, "learning_rate": 5.203739507230311e-06, "loss": 0.028, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 225549159.0, "step": 272 }, { "entropy": 0.50506591796875, "epoch": 3.102272727272727, "grad_norm": 1.6160317091643448, "learning_rate": 5.172408526955025e-06, "loss": 0.0071, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 226368587.0, "step": 273 }, { "entropy": 0.5047378540039062, "epoch": 3.1136363636363638, "grad_norm": 1.353500778978189, "learning_rate": 5.141070767430331e-06, "loss": 0.0108, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 227156622.0, "step": 274 }, { "entropy": 0.5011825561523438, "epoch": 3.125, "grad_norm": 6.367060575954853, "learning_rate": 5.109727460883496e-06, "loss": 0.0531, "mean_token_accuracy": 0.989583333954215, "num_tokens": 227980137.0, "step": 275 }, { "entropy": 0.5009002685546875, "epoch": 3.1363636363636362, "grad_norm": 3.0144130557709854, "learning_rate": 5.078379839759895e-06, "loss": 0.0151, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 228789801.0, "step": 276 }, { "entropy": 0.490264892578125, "epoch": 3.147727272727273, "grad_norm": 1.8289965387337097, "learning_rate": 5.047029136674563e-06, "loss": 0.0068, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 229631483.0, "step": 277 }, { "entropy": 0.49471282958984375, "epoch": 3.159090909090909, "grad_norm": 3.5661554320566493, "learning_rate": 5.015676584363716e-06, "loss": 0.0311, "mean_token_accuracy": 0.989583333954215, "num_tokens": 230473829.0, "step": 278 }, { "entropy": 0.49500274658203125, "epoch": 3.1704545454545454, "grad_norm": 2.297701797368414, "learning_rate": 4.984323415636285e-06, "loss": 0.0147, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 231297503.0, "step": 279 }, { "entropy": 0.48564910888671875, "epoch": 3.1818181818181817, "grad_norm": 1.5108735884549447, "learning_rate": 4.95297086332544e-06, "loss": 0.0141, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 232167203.0, "step": 280 }, { "entropy": 0.5065155029296875, "epoch": 3.1931818181818183, "grad_norm": 1.4197005931026672, "learning_rate": 4.921620160240107e-06, "loss": 0.0086, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 232982992.0, "step": 281 }, { "entropy": 0.50030517578125, "epoch": 3.2045454545454546, "grad_norm": 2.0658589438724086, "learning_rate": 4.890272539116508e-06, "loss": 0.0177, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 233794951.0, "step": 282 }, { "entropy": 0.5062942504882812, "epoch": 3.215909090909091, "grad_norm": 1.390316826284385, "learning_rate": 4.858929232569671e-06, "loss": 0.0116, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 234627063.0, "step": 283 }, { "entropy": 0.508026123046875, "epoch": 3.227272727272727, "grad_norm": 0.8411098837665795, "learning_rate": 4.827591473044978e-06, "loss": 0.0114, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 235422472.0, "step": 284 }, { "entropy": 0.5152587890625, "epoch": 3.2386363636363638, "grad_norm": 1.109489038564382, "learning_rate": 4.796260492769691e-06, "loss": 0.0101, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 236250854.0, "step": 285 }, { "entropy": 0.526885986328125, "epoch": 3.25, "grad_norm": 1.618921272165497, "learning_rate": 4.7649375237045135e-06, "loss": 0.0202, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 237025724.0, "step": 286 }, { "entropy": 0.51226806640625, "epoch": 3.2613636363636362, "grad_norm": 0.8364422992296099, "learning_rate": 4.733623797495136e-06, "loss": 0.0196, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 237847006.0, "step": 287 }, { "entropy": 0.5173263549804688, "epoch": 3.2727272727272725, "grad_norm": 0.9380743709915682, "learning_rate": 4.702320545423814e-06, "loss": 0.0072, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 238659400.0, "step": 288 }, { "entropy": 0.50469970703125, "epoch": 3.284090909090909, "grad_norm": 0.8521605214578054, "learning_rate": 4.671028998360947e-06, "loss": 0.0062, "mean_token_accuracy": 1.0, "num_tokens": 239520566.0, "step": 289 }, { "entropy": 0.5128173828125, "epoch": 3.2954545454545454, "grad_norm": 1.593635812033709, "learning_rate": 4.639750386716693e-06, "loss": 0.0092, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 240338169.0, "step": 290 }, { "entropy": 0.49086761474609375, "epoch": 3.3068181818181817, "grad_norm": 1.5856201719842262, "learning_rate": 4.60848594039257e-06, "loss": 0.0061, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 241205913.0, "step": 291 }, { "entropy": 0.49979400634765625, "epoch": 3.3181818181818183, "grad_norm": 2.4064616761833673, "learning_rate": 4.5772368887331044e-06, "loss": 0.0191, "mean_token_accuracy": 0.9908854172099382, "num_tokens": 242031860.0, "step": 292 }, { "entropy": 0.501129150390625, "epoch": 3.3295454545454546, "grad_norm": 1.786638161744003, "learning_rate": 4.5460044604774986e-06, "loss": 0.0121, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 242863430.0, "step": 293 }, { "entropy": 0.49781036376953125, "epoch": 3.340909090909091, "grad_norm": 1.3504594379263928, "learning_rate": 4.514789883711296e-06, "loss": 0.0042, "mean_token_accuracy": 1.0, "num_tokens": 243706209.0, "step": 294 }, { "entropy": 0.5142593383789062, "epoch": 3.3522727272727275, "grad_norm": 2.0637007989663494, "learning_rate": 4.483594385818119e-06, "loss": 0.0112, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 244495154.0, "step": 295 }, { "entropy": 0.491363525390625, "epoch": 3.3636363636363638, "grad_norm": 2.887309248591406, "learning_rate": 4.452419193431379e-06, "loss": 0.022, "mean_token_accuracy": 0.9934895837213844, "num_tokens": 245361556.0, "step": 296 }, { "entropy": 0.503753662109375, "epoch": 3.375, "grad_norm": 1.701023939664883, "learning_rate": 4.4212655323860685e-06, "loss": 0.0115, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 246179032.0, "step": 297 }, { "entropy": 0.4980621337890625, "epoch": 3.3863636363636362, "grad_norm": 1.5662274219193635, "learning_rate": 4.39013462767054e-06, "loss": 0.0035, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 247044591.0, "step": 298 }, { "entropy": 0.49640655517578125, "epoch": 3.3977272727272725, "grad_norm": 4.085649656756541, "learning_rate": 4.359027703378357e-06, "loss": 0.0117, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 247849642.0, "step": 299 }, { "entropy": 0.4882965087890625, "epoch": 3.409090909090909, "grad_norm": 2.0545604630857364, "learning_rate": 4.327945982660146e-06, "loss": 0.005, "mean_token_accuracy": 1.0, "num_tokens": 248687645.0, "step": 300 }, { "entropy": 0.49010467529296875, "epoch": 3.4204545454545454, "grad_norm": 4.205117061041687, "learning_rate": 4.29689068767551e-06, "loss": 0.0202, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 249523174.0, "step": 301 }, { "entropy": 0.48773193359375, "epoch": 3.4318181818181817, "grad_norm": 1.0834267264499315, "learning_rate": 4.265863039544967e-06, "loss": 0.0051, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 250381087.0, "step": 302 }, { "entropy": 0.4884796142578125, "epoch": 3.4431818181818183, "grad_norm": 3.6072628291290694, "learning_rate": 4.234864258301943e-06, "loss": 0.0073, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 251231901.0, "step": 303 }, { "entropy": 0.4911956787109375, "epoch": 3.4545454545454546, "grad_norm": 2.36608886636574, "learning_rate": 4.203895562844789e-06, "loss": 0.0044, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 252066427.0, "step": 304 }, { "entropy": 0.49538421630859375, "epoch": 3.465909090909091, "grad_norm": 2.887363512478274, "learning_rate": 4.172958170888858e-06, "loss": 0.0104, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 252898682.0, "step": 305 }, { "entropy": 0.47566986083984375, "epoch": 3.4772727272727275, "grad_norm": 2.4132663080029713, "learning_rate": 4.142053298918622e-06, "loss": 0.0112, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 253766783.0, "step": 306 }, { "entropy": 0.48429107666015625, "epoch": 3.4886363636363638, "grad_norm": 4.288908543594292, "learning_rate": 4.111182162139844e-06, "loss": 0.0124, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 254619003.0, "step": 307 }, { "entropy": 0.49538421630859375, "epoch": 3.5, "grad_norm": 7.886183702925642, "learning_rate": 4.080345974431786e-06, "loss": 0.0136, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 255449310.0, "step": 308 }, { "entropy": 0.49535369873046875, "epoch": 3.5113636363636362, "grad_norm": 1.364174024741626, "learning_rate": 4.049545948299482e-06, "loss": 0.0112, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 256269971.0, "step": 309 }, { "entropy": 0.48786163330078125, "epoch": 3.5227272727272725, "grad_norm": 2.0856928688130427, "learning_rate": 4.018783294826071e-06, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 257110990.0, "step": 310 }, { "entropy": 0.47814178466796875, "epoch": 3.534090909090909, "grad_norm": 1.4364086187206067, "learning_rate": 3.988059223625155e-06, "loss": 0.0057, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 257969152.0, "step": 311 }, { "entropy": 0.49022674560546875, "epoch": 3.5454545454545454, "grad_norm": 2.3462745888843757, "learning_rate": 3.957374942793259e-06, "loss": 0.0069, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 258787610.0, "step": 312 }, { "entropy": 0.5002593994140625, "epoch": 3.5568181818181817, "grad_norm": 1.076206746462636, "learning_rate": 3.926731658862307e-06, "loss": 0.0049, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 259591761.0, "step": 313 }, { "entropy": 0.493865966796875, "epoch": 3.5681818181818183, "grad_norm": 1.9316523298812094, "learning_rate": 3.8961305767522015e-06, "loss": 0.0106, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 260397894.0, "step": 314 }, { "entropy": 0.5008392333984375, "epoch": 3.5795454545454546, "grad_norm": 6.898639876304829, "learning_rate": 3.865572899723423e-06, "loss": 0.0285, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 261200683.0, "step": 315 }, { "entropy": 0.487060546875, "epoch": 3.590909090909091, "grad_norm": 2.0707747690495437, "learning_rate": 3.8350598293297345e-06, "loss": 0.0079, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 262040136.0, "step": 316 }, { "entropy": 0.490570068359375, "epoch": 3.6022727272727275, "grad_norm": 1.3196169683117467, "learning_rate": 3.8045925653709238e-06, "loss": 0.0117, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 262885485.0, "step": 317 }, { "entropy": 0.48920440673828125, "epoch": 3.6136363636363638, "grad_norm": 3.8532041553000016, "learning_rate": 3.774172305845636e-06, "loss": 0.027, "mean_token_accuracy": 0.9921875004656613, "num_tokens": 263732921.0, "step": 318 }, { "entropy": 0.5139923095703125, "epoch": 3.625, "grad_norm": 0.8491195918322743, "learning_rate": 3.7438002469042567e-06, "loss": 0.0035, "mean_token_accuracy": 1.0, "num_tokens": 264525028.0, "step": 319 }, { "entropy": 0.49022674560546875, "epoch": 3.6363636363636362, "grad_norm": 2.12743923318016, "learning_rate": 3.7134775828018864e-06, "loss": 0.0078, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 265358646.0, "step": 320 }, { "entropy": 0.47808837890625, "epoch": 3.6477272727272725, "grad_norm": 2.4356510539940985, "learning_rate": 3.683205505851377e-06, "loss": 0.0134, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 266215215.0, "step": 321 }, { "entropy": 0.47763824462890625, "epoch": 3.659090909090909, "grad_norm": 1.624290644861085, "learning_rate": 3.652985206376455e-06, "loss": 0.0085, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 267080581.0, "step": 322 }, { "entropy": 0.5059356689453125, "epoch": 3.6704545454545454, "grad_norm": 4.112024944011851, "learning_rate": 3.622817872664905e-06, "loss": 0.0107, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 267854230.0, "step": 323 }, { "entropy": 0.507568359375, "epoch": 3.6818181818181817, "grad_norm": 3.8307619158031723, "learning_rate": 3.5927046909218634e-06, "loss": 0.008, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 268647574.0, "step": 324 }, { "entropy": 0.49181365966796875, "epoch": 3.6931818181818183, "grad_norm": 1.635004844085684, "learning_rate": 3.5626468452231534e-06, "loss": 0.0082, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 269479857.0, "step": 325 }, { "entropy": 0.501556396484375, "epoch": 3.7045454545454546, "grad_norm": 2.6044052865970957, "learning_rate": 3.532645517468748e-06, "loss": 0.0089, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 270278460.0, "step": 326 }, { "entropy": 0.4834136962890625, "epoch": 3.715909090909091, "grad_norm": 2.713985450438721, "learning_rate": 3.50270188733628e-06, "loss": 0.0107, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 271125235.0, "step": 327 }, { "entropy": 0.4970855712890625, "epoch": 3.7272727272727275, "grad_norm": 2.55244138710167, "learning_rate": 3.472817132234669e-06, "loss": 0.0065, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 271947014.0, "step": 328 }, { "entropy": 0.4950103759765625, "epoch": 3.7386363636363638, "grad_norm": 1.5809093118699922, "learning_rate": 3.442992427257812e-06, "loss": 0.0041, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 272778931.0, "step": 329 }, { "entropy": 0.4849395751953125, "epoch": 3.75, "grad_norm": 3.7521409608312437, "learning_rate": 3.4132289451383866e-06, "loss": 0.0078, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 273609373.0, "step": 330 }, { "entropy": 0.48828125, "epoch": 3.7613636363636362, "grad_norm": 1.3638130294760002, "learning_rate": 3.3835278562017405e-06, "loss": 0.0045, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 274452240.0, "step": 331 }, { "entropy": 0.49332427978515625, "epoch": 3.7727272727272725, "grad_norm": 1.6122316611301655, "learning_rate": 3.353890328319861e-06, "loss": 0.0177, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 275279923.0, "step": 332 }, { "entropy": 0.48734283447265625, "epoch": 3.784090909090909, "grad_norm": 1.4018731489506076, "learning_rate": 3.3243175268654656e-06, "loss": 0.0117, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 276099706.0, "step": 333 }, { "entropy": 0.48419189453125, "epoch": 3.7954545454545454, "grad_norm": 1.3425953438370244, "learning_rate": 3.29481061466617e-06, "loss": 0.0107, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 276948323.0, "step": 334 }, { "entropy": 0.504119873046875, "epoch": 3.8068181818181817, "grad_norm": 1.973194371711739, "learning_rate": 3.2653707519587756e-06, "loss": 0.0198, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 277752317.0, "step": 335 }, { "entropy": 0.5062255859375, "epoch": 3.8181818181818183, "grad_norm": 3.5019643819627295, "learning_rate": 3.235999096343633e-06, "loss": 0.0112, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 278553380.0, "step": 336 }, { "entropy": 0.5022354125976562, "epoch": 3.8295454545454546, "grad_norm": 2.0390847173402866, "learning_rate": 3.2066968027391377e-06, "loss": 0.0129, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 279380180.0, "step": 337 }, { "entropy": 0.5066680908203125, "epoch": 3.840909090909091, "grad_norm": 1.000856198418521, "learning_rate": 3.177465023336306e-06, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 280190463.0, "step": 338 }, { "entropy": 0.4948577880859375, "epoch": 3.8522727272727275, "grad_norm": 0.4576350562763719, "learning_rate": 3.1483049075534853e-06, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 281038954.0, "step": 339 }, { "entropy": 0.4932098388671875, "epoch": 3.8636363636363638, "grad_norm": 0.43387840159329444, "learning_rate": 3.119217601991139e-06, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 281892856.0, "step": 340 }, { "entropy": 0.5100860595703125, "epoch": 3.875, "grad_norm": 1.5329998698285614, "learning_rate": 3.090204250386779e-06, "loss": 0.0056, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 282714631.0, "step": 341 }, { "entropy": 0.48583221435546875, "epoch": 3.8863636363636362, "grad_norm": 0.367306149196234, "learning_rate": 3.0612659935699774e-06, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 283568255.0, "step": 342 }, { "entropy": 0.49864959716796875, "epoch": 3.8977272727272725, "grad_norm": 2.4133813718148036, "learning_rate": 3.032403969417523e-06, "loss": 0.0119, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 284396996.0, "step": 343 }, { "entropy": 0.4964447021484375, "epoch": 3.909090909090909, "grad_norm": 1.0290489541589731, "learning_rate": 3.0036193128086667e-06, "loss": 0.0029, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 285208621.0, "step": 344 }, { "entropy": 0.48796844482421875, "epoch": 3.9204545454545454, "grad_norm": 1.2919335824908056, "learning_rate": 2.9749131555805035e-06, "loss": 0.004, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 286068274.0, "step": 345 }, { "entropy": 0.5021209716796875, "epoch": 3.9318181818181817, "grad_norm": 2.7963876596419777, "learning_rate": 2.946286626483463e-06, "loss": 0.0094, "mean_token_accuracy": 0.9947916669771075, "num_tokens": 286884909.0, "step": 346 }, { "entropy": 0.48792266845703125, "epoch": 3.9431818181818183, "grad_norm": 1.1255514150822787, "learning_rate": 2.9177408511369395e-06, "loss": 0.0039, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 287730733.0, "step": 347 }, { "entropy": 0.49262237548828125, "epoch": 3.9545454545454546, "grad_norm": 1.0668263081925125, "learning_rate": 2.889276951985005e-06, "loss": 0.0124, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 288550016.0, "step": 348 }, { "entropy": 0.5013275146484375, "epoch": 3.965909090909091, "grad_norm": 0.5501779477247107, "learning_rate": 2.8608960482523058e-06, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 289378888.0, "step": 349 }, { "entropy": 0.46614837646484375, "epoch": 3.9772727272727275, "grad_norm": 1.2675364954133146, "learning_rate": 2.8325992559000315e-06, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 290281736.0, "step": 350 }, { "entropy": 0.5061798095703125, "epoch": 3.9886363636363638, "grad_norm": 0.994131535402032, "learning_rate": 2.8043876875820363e-06, "loss": 0.0107, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 291064628.0, "step": 351 }, { "entropy": 0.49474334716796875, "epoch": 4.0, "grad_norm": 0.5646989718771591, "learning_rate": 2.776262452601104e-06, "loss": 0.0085, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 291890322.0, "step": 352 }, { "entropy": 0.50128173828125, "epoch": 4.011363636363637, "grad_norm": 0.2305855975577037, "learning_rate": 2.748224656865304e-06, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 292701363.0, "step": 353 }, { "entropy": 0.49643707275390625, "epoch": 4.0227272727272725, "grad_norm": 1.0613672820064046, "learning_rate": 2.7202754028445375e-06, "loss": 0.0024, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 293532137.0, "step": 354 }, { "entropy": 0.48738861083984375, "epoch": 4.034090909090909, "grad_norm": 0.35673670151406833, "learning_rate": 2.6924157895271563e-06, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 294375049.0, "step": 355 }, { "entropy": 0.5010528564453125, "epoch": 4.045454545454546, "grad_norm": 0.7012641916420297, "learning_rate": 2.6646469123767694e-06, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 295187394.0, "step": 356 }, { "entropy": 0.5065155029296875, "epoch": 4.056818181818182, "grad_norm": 1.5537278669101064, "learning_rate": 2.636969863289164e-06, "loss": 0.008, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 295972208.0, "step": 357 }, { "entropy": 0.47309112548828125, "epoch": 4.068181818181818, "grad_norm": 1.0646019233100874, "learning_rate": 2.6093857305493666e-06, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 296841049.0, "step": 358 }, { "entropy": 0.4745330810546875, "epoch": 4.079545454545454, "grad_norm": 1.4480106846744918, "learning_rate": 2.581895598788857e-06, "loss": 0.0119, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 297687674.0, "step": 359 }, { "entropy": 0.4698486328125, "epoch": 4.090909090909091, "grad_norm": 3.08458120657145, "learning_rate": 2.5545005489429185e-06, "loss": 0.0034, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 298539579.0, "step": 360 }, { "entropy": 0.4842376708984375, "epoch": 4.1022727272727275, "grad_norm": 2.598857092464848, "learning_rate": 2.5272016582081236e-06, "loss": 0.0064, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 299370858.0, "step": 361 }, { "entropy": 0.48838043212890625, "epoch": 4.113636363636363, "grad_norm": 2.7477071900327545, "learning_rate": 2.5000000000000015e-06, "loss": 0.0111, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 300193508.0, "step": 362 }, { "entropy": 0.479339599609375, "epoch": 4.125, "grad_norm": 1.1736957699343047, "learning_rate": 2.472896643910802e-06, "loss": 0.0044, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 301045850.0, "step": 363 }, { "entropy": 0.49602508544921875, "epoch": 4.136363636363637, "grad_norm": 0.37729375908384405, "learning_rate": 2.445892655667462e-06, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 301841078.0, "step": 364 }, { "entropy": 0.48199462890625, "epoch": 4.1477272727272725, "grad_norm": 0.1355837306910303, "learning_rate": 2.418989097089685e-06, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 302689150.0, "step": 365 }, { "entropy": 0.51080322265625, "epoch": 4.159090909090909, "grad_norm": 1.1637477153658335, "learning_rate": 2.392187026048198e-06, "loss": 0.0064, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 303488060.0, "step": 366 }, { "entropy": 0.47991180419921875, "epoch": 4.170454545454546, "grad_norm": 0.38030490067585737, "learning_rate": 2.365487496423152e-06, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 304341157.0, "step": 367 }, { "entropy": 0.47565460205078125, "epoch": 4.181818181818182, "grad_norm": 0.10997153487552541, "learning_rate": 2.3388915580626807e-06, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 305186459.0, "step": 368 }, { "entropy": 0.5027694702148438, "epoch": 4.193181818181818, "grad_norm": 0.14510941025692908, "learning_rate": 2.31240025674162e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 305978157.0, "step": 369 }, { "entropy": 0.49146270751953125, "epoch": 4.204545454545454, "grad_norm": 1.5883776551936257, "learning_rate": 2.2860146341203936e-06, "loss": 0.0059, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 306769045.0, "step": 370 }, { "entropy": 0.49044036865234375, "epoch": 4.215909090909091, "grad_norm": 1.992656406279671, "learning_rate": 2.2597357277040494e-06, "loss": 0.015, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 307562505.0, "step": 371 }, { "entropy": 0.47119903564453125, "epoch": 4.2272727272727275, "grad_norm": 1.2199582939244709, "learning_rate": 2.233564570801453e-06, "loss": 0.0079, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 308441553.0, "step": 372 }, { "entropy": 0.4873199462890625, "epoch": 4.238636363636363, "grad_norm": 2.149306783936389, "learning_rate": 2.207502192484685e-06, "loss": 0.0101, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 309253697.0, "step": 373 }, { "entropy": 0.47591400146484375, "epoch": 4.25, "grad_norm": 0.35564477739269224, "learning_rate": 2.1815496175485433e-06, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 310096440.0, "step": 374 }, { "entropy": 0.47376251220703125, "epoch": 4.261363636363637, "grad_norm": 1.5466708854824467, "learning_rate": 2.1557078664702747e-06, "loss": 0.0053, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 310946699.0, "step": 375 }, { "entropy": 0.47772979736328125, "epoch": 4.2727272727272725, "grad_norm": 3.903707776896074, "learning_rate": 2.1299779553694323e-06, "loss": 0.0099, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 311780327.0, "step": 376 }, { "entropy": 0.4879913330078125, "epoch": 4.284090909090909, "grad_norm": 1.28385949827407, "learning_rate": 2.1043608959679302e-06, "loss": 0.0029, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 312599293.0, "step": 377 }, { "entropy": 0.48241424560546875, "epoch": 4.295454545454546, "grad_norm": 1.3157851336620305, "learning_rate": 2.0788576955502547e-06, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 313433046.0, "step": 378 }, { "entropy": 0.497406005859375, "epoch": 4.306818181818182, "grad_norm": 0.4986614424581066, "learning_rate": 2.053469356923865e-06, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 314231321.0, "step": 379 }, { "entropy": 0.47122955322265625, "epoch": 4.318181818181818, "grad_norm": 0.9605458456996543, "learning_rate": 2.028196878379749e-06, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 315090907.0, "step": 380 }, { "entropy": 0.46869659423828125, "epoch": 4.329545454545454, "grad_norm": 0.07665020057891755, "learning_rate": 2.0030412536531896e-06, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 315934937.0, "step": 381 }, { "entropy": 0.4843597412109375, "epoch": 4.340909090909091, "grad_norm": 2.4860913585370934, "learning_rate": 1.9780034718846653e-06, "loss": 0.0088, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 316744484.0, "step": 382 }, { "entropy": 0.4806060791015625, "epoch": 4.3522727272727275, "grad_norm": 1.610995719001932, "learning_rate": 1.9530845175809838e-06, "loss": 0.0032, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 317569584.0, "step": 383 }, { "entropy": 0.48772430419921875, "epoch": 4.363636363636363, "grad_norm": 0.3630643065135508, "learning_rate": 1.9282853705765435e-06, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 318380963.0, "step": 384 }, { "entropy": 0.4920654296875, "epoch": 4.375, "grad_norm": 0.11318106327604302, "learning_rate": 1.9036070059948253e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 319177808.0, "step": 385 }, { "entropy": 0.48041534423828125, "epoch": 4.386363636363637, "grad_norm": 0.12828443108853602, "learning_rate": 1.8790503942100413e-06, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 319991752.0, "step": 386 }, { "entropy": 0.48044586181640625, "epoch": 4.3977272727272725, "grad_norm": 0.12009248173561284, "learning_rate": 1.8546165008089806e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 320823781.0, "step": 387 }, { "entropy": 0.46346282958984375, "epoch": 4.409090909090909, "grad_norm": 1.4373424468930374, "learning_rate": 1.8303062865530407e-06, "loss": 0.0031, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 321672380.0, "step": 388 }, { "entropy": 0.48361968994140625, "epoch": 4.420454545454546, "grad_norm": 0.45259690051953183, "learning_rate": 1.8061207073404507e-06, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 322475535.0, "step": 389 }, { "entropy": 0.4696197509765625, "epoch": 4.431818181818182, "grad_norm": 0.4233541567189679, "learning_rate": 1.7820607141686846e-06, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 323285742.0, "step": 390 }, { "entropy": 0.4728851318359375, "epoch": 4.443181818181818, "grad_norm": 0.1816069998282873, "learning_rate": 1.7581272530970666e-06, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 324121368.0, "step": 391 }, { "entropy": 0.47039794921875, "epoch": 4.454545454545454, "grad_norm": 0.3483848284787408, "learning_rate": 1.734321265209572e-06, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 324973947.0, "step": 392 }, { "entropy": 0.4647674560546875, "epoch": 4.465909090909091, "grad_norm": 0.1144960907716229, "learning_rate": 1.7106436865778182e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 325804782.0, "step": 393 }, { "entropy": 0.48302459716796875, "epoch": 4.4772727272727275, "grad_norm": 0.1139971153359024, "learning_rate": 1.6870954482242707e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 326603097.0, "step": 394 }, { "entropy": 0.4611968994140625, "epoch": 4.488636363636363, "grad_norm": 1.2769097861232268, "learning_rate": 1.663677476085616e-06, "loss": 0.0022, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 327455117.0, "step": 395 }, { "entropy": 0.46791839599609375, "epoch": 4.5, "grad_norm": 1.6141901294301404, "learning_rate": 1.6403906909763688e-06, "loss": 0.0062, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 328271844.0, "step": 396 }, { "entropy": 0.46714019775390625, "epoch": 4.511363636363637, "grad_norm": 3.407028832408574, "learning_rate": 1.6172360085526567e-06, "loss": 0.0144, "mean_token_accuracy": 0.9960937502328306, "num_tokens": 329105170.0, "step": 397 }, { "entropy": 0.4729766845703125, "epoch": 4.5227272727272725, "grad_norm": 1.1263314457968352, "learning_rate": 1.5942143392762178e-06, "loss": 0.0047, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 329922811.0, "step": 398 }, { "entropy": 0.4759521484375, "epoch": 4.534090909090909, "grad_norm": 2.280759600794648, "learning_rate": 1.5713265883786e-06, "loss": 0.0027, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 330722929.0, "step": 399 }, { "entropy": 0.4548187255859375, "epoch": 4.545454545454545, "grad_norm": 0.3194534227852655, "learning_rate": 1.54857365582557e-06, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 331559428.0, "step": 400 }, { "entropy": 0.4586944580078125, "epoch": 4.556818181818182, "grad_norm": 0.4687987791903233, "learning_rate": 1.5259564362817147e-06, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 332413515.0, "step": 401 }, { "entropy": 0.454254150390625, "epoch": 4.568181818181818, "grad_norm": 0.415545362925253, "learning_rate": 1.5034758190752836e-06, "loss": 0.001, "mean_token_accuracy": 1.0, "num_tokens": 333265076.0, "step": 402 }, { "entropy": 0.46869659423828125, "epoch": 4.579545454545455, "grad_norm": 0.7752038225629332, "learning_rate": 1.4811326881631937e-06, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 334094385.0, "step": 403 }, { "entropy": 0.4632415771484375, "epoch": 4.590909090909091, "grad_norm": 0.21104740454493934, "learning_rate": 1.4589279220962922e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 334947835.0, "step": 404 }, { "entropy": 0.4703216552734375, "epoch": 4.6022727272727275, "grad_norm": 0.42765647679577773, "learning_rate": 1.4368623939848003e-06, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 335753110.0, "step": 405 }, { "entropy": 0.45865631103515625, "epoch": 4.613636363636363, "grad_norm": 0.05702480437380243, "learning_rate": 1.4149369714639856e-06, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 336600931.0, "step": 406 }, { "entropy": 0.4477996826171875, "epoch": 4.625, "grad_norm": 0.05129594018064339, "learning_rate": 1.3931525166600447e-06, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 337462613.0, "step": 407 }, { "entropy": 0.46739959716796875, "epoch": 4.636363636363637, "grad_norm": 0.05649629590246661, "learning_rate": 1.371509886156206e-06, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 338298847.0, "step": 408 }, { "entropy": 0.46347808837890625, "epoch": 4.6477272727272725, "grad_norm": 0.05392471241424441, "learning_rate": 1.3500099309590397e-06, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 339113689.0, "step": 409 }, { "entropy": 0.46538543701171875, "epoch": 4.659090909090909, "grad_norm": 0.04261517453780075, "learning_rate": 1.3286534964650121e-06, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 339940988.0, "step": 410 }, { "entropy": 0.475616455078125, "epoch": 4.670454545454545, "grad_norm": 0.041270752321869116, "learning_rate": 1.3074414224272287e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 340752578.0, "step": 411 }, { "entropy": 0.47516632080078125, "epoch": 4.681818181818182, "grad_norm": 0.3298133997421892, "learning_rate": 1.2863745429224145e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 341554983.0, "step": 412 }, { "entropy": 0.4729461669921875, "epoch": 4.693181818181818, "grad_norm": 3.708268542945646, "learning_rate": 1.2654536863181328e-06, "loss": 0.0095, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 342358183.0, "step": 413 }, { "entropy": 0.4546356201171875, "epoch": 4.704545454545455, "grad_norm": 1.8590922166659447, "learning_rate": 1.2446796752401912e-06, "loss": 0.0025, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 343205856.0, "step": 414 }, { "entropy": 0.45923614501953125, "epoch": 4.715909090909091, "grad_norm": 1.2867278575579393, "learning_rate": 1.22405332654032e-06, "loss": 0.0122, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 344033478.0, "step": 415 }, { "entropy": 0.4714202880859375, "epoch": 4.7272727272727275, "grad_norm": 0.04144212296397527, "learning_rate": 1.2035754512640263e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 344827009.0, "step": 416 }, { "entropy": 0.4552764892578125, "epoch": 4.738636363636363, "grad_norm": 0.04260184992648092, "learning_rate": 1.1832468546187248e-06, "loss": 0.0002, "mean_token_accuracy": 1.0, "num_tokens": 345670205.0, "step": 417 }, { "entropy": 0.46997833251953125, "epoch": 4.75, "grad_norm": 0.8777265760304463, "learning_rate": 1.1630683359420653e-06, "loss": 0.0012, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 346471682.0, "step": 418 }, { "entropy": 0.45848846435546875, "epoch": 4.761363636363637, "grad_norm": 2.1795793503430803, "learning_rate": 1.1430406886705053e-06, "loss": 0.0019, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 347308458.0, "step": 419 }, { "entropy": 0.455657958984375, "epoch": 4.7727272727272725, "grad_norm": 0.11627180910044319, "learning_rate": 1.1231647003081092e-06, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 348141445.0, "step": 420 }, { "entropy": 0.45229339599609375, "epoch": 4.784090909090909, "grad_norm": 0.29998009764725636, "learning_rate": 1.103441152395588e-06, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 349005773.0, "step": 421 }, { "entropy": 0.46309661865234375, "epoch": 4.795454545454545, "grad_norm": 0.2991841726066073, "learning_rate": 1.0838708204795584e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 349829599.0, "step": 422 }, { "entropy": 0.4610137939453125, "epoch": 4.806818181818182, "grad_norm": 1.0609883118459258, "learning_rate": 1.064454474082064e-06, "loss": 0.0126, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 350647234.0, "step": 423 }, { "entropy": 0.43990325927734375, "epoch": 4.818181818181818, "grad_norm": 0.16789793270696388, "learning_rate": 1.045192876670298e-06, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 351529720.0, "step": 424 }, { "entropy": 0.455780029296875, "epoch": 4.829545454545455, "grad_norm": 0.21827131764117488, "learning_rate": 1.0260867856265967e-06, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 352366018.0, "step": 425 }, { "entropy": 0.46533966064453125, "epoch": 4.840909090909091, "grad_norm": 1.2044394471044855, "learning_rate": 1.0071369522186546e-06, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 353187956.0, "step": 426 }, { "entropy": 0.4680633544921875, "epoch": 4.8522727272727275, "grad_norm": 2.817809119636871, "learning_rate": 9.883441215699824e-07, "loss": 0.0021, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 354005342.0, "step": 427 }, { "entropy": 0.45850372314453125, "epoch": 4.863636363636363, "grad_norm": 1.5308707690431358, "learning_rate": 9.697090326306096e-07, "loss": 0.0018, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 354847121.0, "step": 428 }, { "entropy": 0.45076751708984375, "epoch": 4.875, "grad_norm": 0.6612914507279286, "learning_rate": 9.51232418148027e-07, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 355685163.0, "step": 429 }, { "entropy": 0.464630126953125, "epoch": 4.886363636363637, "grad_norm": 0.564486291517846, "learning_rate": 9.329150046383773e-07, "loss": 0.0008, "mean_token_accuracy": 1.0, "num_tokens": 356523319.0, "step": 430 }, { "entropy": 0.4573822021484375, "epoch": 4.8977272727272725, "grad_norm": 0.9866803918640974, "learning_rate": 9.147575123578845e-07, "loss": 0.0036, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 357363691.0, "step": 431 }, { "entropy": 0.46978759765625, "epoch": 4.909090909090909, "grad_norm": 0.17811715621871, "learning_rate": 8.967606552745361e-07, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 358180072.0, "step": 432 }, { "entropy": 0.45139312744140625, "epoch": 4.920454545454545, "grad_norm": 0.12562428430327954, "learning_rate": 8.789251410400024e-07, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 359024240.0, "step": 433 }, { "entropy": 0.46982574462890625, "epoch": 4.931818181818182, "grad_norm": 0.6932111763771747, "learning_rate": 8.612516709618251e-07, "loss": 0.0009, "mean_token_accuracy": 1.0, "num_tokens": 359847230.0, "step": 434 }, { "entropy": 0.46353912353515625, "epoch": 4.943181818181818, "grad_norm": 0.23164408202084988, "learning_rate": 8.437409399758234e-07, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 360685374.0, "step": 435 }, { "entropy": 0.451324462890625, "epoch": 4.954545454545455, "grad_norm": 0.19196243101290555, "learning_rate": 8.263936366187825e-07, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 361526820.0, "step": 436 }, { "entropy": 0.4579963684082031, "epoch": 4.965909090909091, "grad_norm": 0.0687299728812095, "learning_rate": 8.092104430013737e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 362340123.0, "step": 437 }, { "entropy": 0.45263671875, "epoch": 4.9772727272727275, "grad_norm": 0.07585912602037853, "learning_rate": 7.921920347813333e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 363177369.0, "step": 438 }, { "entropy": 0.46087646484375, "epoch": 4.988636363636363, "grad_norm": 0.10025359489418027, "learning_rate": 7.753390811368972e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 364019483.0, "step": 439 }, { "entropy": 0.47498321533203125, "epoch": 5.0, "grad_norm": 0.08401001803633867, "learning_rate": 7.586522447404882e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 364810814.0, "step": 440 }, { "entropy": 0.4663848876953125, "epoch": 5.011363636363637, "grad_norm": 0.06753832442651661, "learning_rate": 7.421321817326527e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 365630038.0, "step": 441 }, { "entropy": 0.4595947265625, "epoch": 5.0227272727272725, "grad_norm": 0.052693763957122906, "learning_rate": 7.257795416962754e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 366463667.0, "step": 442 }, { "entropy": 0.45317840576171875, "epoch": 5.034090909090909, "grad_norm": 1.3697325708926469, "learning_rate": 7.095949676310171e-07, "loss": 0.0017, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 367311117.0, "step": 443 }, { "entropy": 0.4619865417480469, "epoch": 5.045454545454546, "grad_norm": 0.15729396392326342, "learning_rate": 6.935790959280525e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 368129489.0, "step": 444 }, { "entropy": 0.46871185302734375, "epoch": 5.056818181818182, "grad_norm": 0.06924769182275127, "learning_rate": 6.777325563450282e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 368926469.0, "step": 445 }, { "entropy": 0.47141265869140625, "epoch": 5.068181818181818, "grad_norm": 0.6783294778672456, "learning_rate": 6.62055971981313e-07, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 369725096.0, "step": 446 }, { "entropy": 0.451629638671875, "epoch": 5.079545454545454, "grad_norm": 0.1031929872686621, "learning_rate": 6.465499592534902e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 370550137.0, "step": 447 }, { "entropy": 0.455291748046875, "epoch": 5.090909090909091, "grad_norm": 0.0772291963261543, "learning_rate": 6.312151278711237e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 371400483.0, "step": 448 }, { "entropy": 0.46193695068359375, "epoch": 5.1022727272727275, "grad_norm": 1.326339689563553, "learning_rate": 6.160520808127807e-07, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 372240572.0, "step": 449 }, { "entropy": 0.44817352294921875, "epoch": 5.113636363636363, "grad_norm": 0.09897383514384525, "learning_rate": 6.010614143023231e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 373094230.0, "step": 450 }, { "entropy": 0.46028900146484375, "epoch": 5.125, "grad_norm": 0.09472823323409073, "learning_rate": 5.862437177854629e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 373920609.0, "step": 451 }, { "entropy": 0.46224212646484375, "epoch": 5.136363636363637, "grad_norm": 0.044213112080802454, "learning_rate": 5.715995739065877e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 374735909.0, "step": 452 }, { "entropy": 0.45038604736328125, "epoch": 5.1477272727272725, "grad_norm": 1.2764131215349144, "learning_rate": 5.571295584858466e-07, "loss": 0.0018, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 375593425.0, "step": 453 }, { "entropy": 0.4536018371582031, "epoch": 5.159090909090909, "grad_norm": 0.041178158611641445, "learning_rate": 5.428342404965076e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 376407379.0, "step": 454 }, { "entropy": 0.459197998046875, "epoch": 5.170454545454546, "grad_norm": 0.043318758692832936, "learning_rate": 5.287141820425945e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 377243881.0, "step": 455 }, { "entropy": 0.4656829833984375, "epoch": 5.181818181818182, "grad_norm": 0.046883302108158394, "learning_rate": 5.147699383367705e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 378061945.0, "step": 456 }, { "entropy": 0.4624786376953125, "epoch": 5.193181818181818, "grad_norm": 0.043128817920418165, "learning_rate": 5.010020576785174e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 378873096.0, "step": 457 }, { "entropy": 0.4590301513671875, "epoch": 5.204545454545454, "grad_norm": 0.7610317787113492, "learning_rate": 4.874110814325723e-07, "loss": 0.0144, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 379691075.0, "step": 458 }, { "entropy": 0.4679222106933594, "epoch": 5.215909090909091, "grad_norm": 0.043349450851136395, "learning_rate": 4.739975440076405e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 380488242.0, "step": 459 }, { "entropy": 0.44734954833984375, "epoch": 5.2272727272727275, "grad_norm": 0.04658326574454793, "learning_rate": 4.607619728353818e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 381344689.0, "step": 460 }, { "entropy": 0.44751739501953125, "epoch": 5.238636363636363, "grad_norm": 0.4068762779710976, "learning_rate": 4.4770488834967486e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 382191810.0, "step": 461 }, { "entropy": 0.46425628662109375, "epoch": 5.25, "grad_norm": 2.1205539558566877, "learning_rate": 4.348268039661452e-07, "loss": 0.0092, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 383013514.0, "step": 462 }, { "entropy": 0.4523468017578125, "epoch": 5.261363636363637, "grad_norm": 2.899603346657294, "learning_rate": 4.221282260619891e-07, "loss": 0.0066, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 383851533.0, "step": 463 }, { "entropy": 0.4504852294921875, "epoch": 5.2727272727272725, "grad_norm": 0.06079617521527757, "learning_rate": 4.0960965395605015e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 384690195.0, "step": 464 }, { "entropy": 0.4595794677734375, "epoch": 5.284090909090909, "grad_norm": 0.0850218217128288, "learning_rate": 3.972715798891952e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 385510439.0, "step": 465 }, { "entropy": 0.4385986328125, "epoch": 5.295454545454546, "grad_norm": 0.062420950188709565, "learning_rate": 3.851144890049535e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 386394511.0, "step": 466 }, { "entropy": 0.457122802734375, "epoch": 5.306818181818182, "grad_norm": 1.135348931750305, "learning_rate": 3.731388593304425e-07, "loss": 0.009, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 387221719.0, "step": 467 }, { "entropy": 0.46808624267578125, "epoch": 5.318181818181818, "grad_norm": 0.10620184695975321, "learning_rate": 3.6134516175757193e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 388021747.0, "step": 468 }, { "entropy": 0.4699668884277344, "epoch": 5.329545454545454, "grad_norm": 1.0873134956301034, "learning_rate": 3.497338600245254e-07, "loss": 0.0022, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 388858436.0, "step": 469 }, { "entropy": 0.4461669921875, "epoch": 5.340909090909091, "grad_norm": 0.08362416650926273, "learning_rate": 3.383054106975292e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 389713106.0, "step": 470 }, { "entropy": 0.45136260986328125, "epoch": 5.3522727272727275, "grad_norm": 0.11019270156426565, "learning_rate": 3.270602631528968e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 390560663.0, "step": 471 }, { "entropy": 0.45748138427734375, "epoch": 5.363636363636363, "grad_norm": 0.12689398877694197, "learning_rate": 3.159988595593616e-07, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 391378014.0, "step": 472 }, { "entropy": 0.4533958435058594, "epoch": 5.375, "grad_norm": 0.08597158452505722, "learning_rate": 3.051216348606867e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 392217186.0, "step": 473 }, { "entropy": 0.46509552001953125, "epoch": 5.386363636363637, "grad_norm": 0.4545345190127941, "learning_rate": 2.944290167585684e-07, "loss": 0.0007, "mean_token_accuracy": 1.0, "num_tokens": 393043338.0, "step": 474 }, { "entropy": 0.4675140380859375, "epoch": 5.3977272727272725, "grad_norm": 0.09329400176802885, "learning_rate": 2.839214256958106e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 393861984.0, "step": 475 }, { "entropy": 0.4590911865234375, "epoch": 5.409090909090909, "grad_norm": 0.12809045767232038, "learning_rate": 2.7359927483980254e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 394694902.0, "step": 476 }, { "entropy": 0.45597076416015625, "epoch": 5.420454545454546, "grad_norm": 0.0741422272419836, "learning_rate": 2.634629700662628e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 395512370.0, "step": 477 }, { "entropy": 0.455474853515625, "epoch": 5.431818181818182, "grad_norm": 0.0948517795191296, "learning_rate": 2.5351290994328703e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 396351709.0, "step": 478 }, { "entropy": 0.443359375, "epoch": 5.443181818181818, "grad_norm": 0.09645060067855557, "learning_rate": 2.4374948571567246e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 397205354.0, "step": 479 }, { "entropy": 0.44438934326171875, "epoch": 5.454545454545454, "grad_norm": 0.08367706408648884, "learning_rate": 2.3417308128953486e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 398068964.0, "step": 480 }, { "entropy": 0.44445037841796875, "epoch": 5.465909090909091, "grad_norm": 0.08096091553646355, "learning_rate": 2.2478407321721295e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 398931333.0, "step": 481 }, { "entropy": 0.46445465087890625, "epoch": 5.4772727272727275, "grad_norm": 0.07120740238741786, "learning_rate": 2.1558283068246254e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 399743561.0, "step": 482 }, { "entropy": 0.44690704345703125, "epoch": 5.488636363636363, "grad_norm": 1.3626841259762659, "learning_rate": 2.065697154859375e-07, "loss": 0.0015, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 400597274.0, "step": 483 }, { "entropy": 0.44899749755859375, "epoch": 5.5, "grad_norm": 0.0643260899222743, "learning_rate": 1.9774508203096843e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 401443536.0, "step": 484 }, { "entropy": 0.4696502685546875, "epoch": 5.511363636363637, "grad_norm": 1.74414056253339, "learning_rate": 1.8910927730962038e-07, "loss": 0.0059, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 402253531.0, "step": 485 }, { "entropy": 0.45011138916015625, "epoch": 5.5227272727272725, "grad_norm": 3.4760475953885583, "learning_rate": 1.806626408890555e-07, "loss": 0.0039, "mean_token_accuracy": 0.9973958334885538, "num_tokens": 403096077.0, "step": 486 }, { "entropy": 0.45471954345703125, "epoch": 5.534090909090909, "grad_norm": 0.06840228217105021, "learning_rate": 1.7240550489817652e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 403931464.0, "step": 487 }, { "entropy": 0.4575042724609375, "epoch": 5.545454545454545, "grad_norm": 1.7006292138027919, "learning_rate": 1.6433819401456996e-07, "loss": 0.0058, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 404753168.0, "step": 488 }, { "entropy": 0.4656829833984375, "epoch": 5.556818181818182, "grad_norm": 0.07530184857523241, "learning_rate": 1.5646102545173625e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 405574379.0, "step": 489 }, { "entropy": 0.4536285400390625, "epoch": 5.568181818181818, "grad_norm": 0.06879577484091869, "learning_rate": 1.4877430894662037e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 406389523.0, "step": 490 }, { "entropy": 0.46601104736328125, "epoch": 5.579545454545455, "grad_norm": 0.07631570022562256, "learning_rate": 1.412783467474299e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 407214371.0, "step": 491 }, { "entropy": 0.45583343505859375, "epoch": 5.590909090909091, "grad_norm": 0.07293315492618727, "learning_rate": 1.3397343360175287e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 408046275.0, "step": 492 }, { "entropy": 0.44484710693359375, "epoch": 5.6022727272727275, "grad_norm": 0.06505986294350438, "learning_rate": 1.268598567449647e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 408911193.0, "step": 493 }, { "entropy": 0.4643096923828125, "epoch": 5.613636363636363, "grad_norm": 0.12132793601368723, "learning_rate": 1.1993789588893634e-07, "loss": 0.0005, "mean_token_accuracy": 1.0, "num_tokens": 409719072.0, "step": 494 }, { "entropy": 0.4685325622558594, "epoch": 5.625, "grad_norm": 0.07053942071636507, "learning_rate": 1.1320782321103673e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 410525184.0, "step": 495 }, { "entropy": 0.45401763916015625, "epoch": 5.636363636363637, "grad_norm": 0.08594584151577633, "learning_rate": 1.0666990334342708e-07, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 411347407.0, "step": 496 }, { "entropy": 0.458099365234375, "epoch": 5.6477272727272725, "grad_norm": 0.08486605252448229, "learning_rate": 1.0032439336265742e-07, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 412178040.0, "step": 497 }, { "entropy": 0.4579010009765625, "epoch": 5.659090909090909, "grad_norm": 0.06659241573883855, "learning_rate": 9.417154277955864e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 413017601.0, "step": 498 }, { "entropy": 0.454986572265625, "epoch": 5.670454545454545, "grad_norm": 0.07546447247108004, "learning_rate": 8.821159352943142e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 413853566.0, "step": 499 }, { "entropy": 0.4441070556640625, "epoch": 5.681818181818182, "grad_norm": 0.05281085008423308, "learning_rate": 8.244477996253109e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 414704010.0, "step": 500 }, { "entropy": 0.4505615234375, "epoch": 5.693181818181818, "grad_norm": 0.06989309481120927, "learning_rate": 7.687132883485548e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 415551512.0, "step": 501 }, { "entropy": 0.45363616943359375, "epoch": 5.704545454545455, "grad_norm": 0.3490127804874226, "learning_rate": 7.149145929922607e-08, "loss": 0.0006, "mean_token_accuracy": 1.0, "num_tokens": 416393119.0, "step": 502 }, { "entropy": 0.451324462890625, "epoch": 5.715909090909091, "grad_norm": 0.0683123728452308, "learning_rate": 6.630538289667365e-08, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 417211364.0, "step": 503 }, { "entropy": 0.44922637939453125, "epoch": 5.7272727272727275, "grad_norm": 0.06803576881976604, "learning_rate": 6.131330354811616e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 418047976.0, "step": 504 }, { "entropy": 0.46166229248046875, "epoch": 5.738636363636363, "grad_norm": 0.0677087702491836, "learning_rate": 5.651541754634726e-08, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 418866422.0, "step": 505 }, { "entropy": 0.4704399108886719, "epoch": 5.75, "grad_norm": 0.060578646422546296, "learning_rate": 5.1911913548309266e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 419667732.0, "step": 506 }, { "entropy": 0.45671844482421875, "epoch": 5.761363636363637, "grad_norm": 0.6548315170801939, "learning_rate": 4.750297256768177e-08, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 420500583.0, "step": 507 }, { "entropy": 0.46508026123046875, "epoch": 5.7727272727272725, "grad_norm": 0.0817666940431779, "learning_rate": 4.328876796776071e-08, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 421327571.0, "step": 508 }, { "entropy": 0.447998046875, "epoch": 5.784090909090909, "grad_norm": 0.051109468067632835, "learning_rate": 3.926946545464327e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 422199122.0, "step": 509 }, { "entropy": 0.456024169921875, "epoch": 5.795454545454545, "grad_norm": 0.07490586230437635, "learning_rate": 3.544522307071085e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 423032124.0, "step": 510 }, { "entropy": 0.4470672607421875, "epoch": 5.806818181818182, "grad_norm": 0.057700792289358926, "learning_rate": 3.181619118841517e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 423876920.0, "step": 511 }, { "entropy": 0.454193115234375, "epoch": 5.818181818181818, "grad_norm": 0.07045615281014198, "learning_rate": 2.838251250436519e-08, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 424725606.0, "step": 512 }, { "entropy": 0.45302581787109375, "epoch": 5.829545454545455, "grad_norm": 0.05710442232271105, "learning_rate": 2.5144322033717748e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 425566413.0, "step": 513 }, { "entropy": 0.46292877197265625, "epoch": 5.840909090909091, "grad_norm": 0.052942049545793506, "learning_rate": 2.210174710486679e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 426383910.0, "step": 514 }, { "entropy": 0.44835662841796875, "epoch": 5.8522727272727275, "grad_norm": 0.716754002813711, "learning_rate": 1.9254907354436804e-08, "loss": 0.0016, "mean_token_accuracy": 0.9986979167442769, "num_tokens": 427233159.0, "step": 515 }, { "entropy": 0.4639892578125, "epoch": 5.863636363636363, "grad_norm": 0.06450919221085051, "learning_rate": 1.6603914722579938e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 428045659.0, "step": 516 }, { "entropy": 0.4636421203613281, "epoch": 5.875, "grad_norm": 1.5156528649134724, "learning_rate": 1.4148873448573408e-08, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 428849891.0, "step": 517 }, { "entropy": 0.4610748291015625, "epoch": 5.886363636363637, "grad_norm": 0.054638375766997926, "learning_rate": 1.1889880066720538e-08, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 429665132.0, "step": 518 }, { "entropy": 0.45429229736328125, "epoch": 5.8977272727272725, "grad_norm": 0.053427197947298284, "learning_rate": 9.827023402556035e-09, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 430492556.0, "step": 519 }, { "entropy": 0.44527435302734375, "epoch": 5.909090909090909, "grad_norm": 0.06448753790680099, "learning_rate": 7.96038456935322e-09, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 431344266.0, "step": 520 }, { "entropy": 0.45428466796875, "epoch": 5.920454545454545, "grad_norm": 0.05394024331607267, "learning_rate": 6.2900369649315785e-09, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 432174654.0, "step": 521 }, { "entropy": 0.45616912841796875, "epoch": 5.931818181818182, "grad_norm": 0.07017412516356693, "learning_rate": 4.816046268775742e-09, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 433016456.0, "step": 522 }, { "entropy": 0.4521484375, "epoch": 5.943181818181818, "grad_norm": 0.1719029795976701, "learning_rate": 3.538470439448105e-09, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 433844975.0, "step": 523 }, { "entropy": 0.44696807861328125, "epoch": 5.954545454545455, "grad_norm": 0.05060097656295442, "learning_rate": 2.4573597123145333e-09, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 434715113.0, "step": 524 }, { "entropy": 0.46170806884765625, "epoch": 5.965909090909091, "grad_norm": 0.06786098871780048, "learning_rate": 1.5727565975642844e-09, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 435516656.0, "step": 525 }, { "entropy": 0.47309112548828125, "epoch": 5.9772727272727275, "grad_norm": 0.08698342318910769, "learning_rate": 8.846958785418969e-10, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 436285686.0, "step": 526 }, { "entropy": 0.4729576110839844, "epoch": 5.988636363636363, "grad_norm": 0.06953938165656462, "learning_rate": 3.9320461037772873e-10, "loss": 0.0004, "mean_token_accuracy": 1.0, "num_tokens": 437058962.0, "step": 527 }, { "entropy": 0.458892822265625, "epoch": 6.0, "grad_norm": 0.0538137181935737, "learning_rate": 9.830211892492004e-11, "loss": 0.0003, "mean_token_accuracy": 1.0, "num_tokens": 437879517.0, "step": 528 }, { "epoch": 6.0, "step": 528, "total_flos": 515196244262912.0, "train_loss": 0.37932722877818986, "train_runtime": 69903.6938, "train_samples_per_second": 3.522, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 528, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 44, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 515196244262912.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }