{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07820617381880747, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.7426682472229, "epoch": 0.00011172310545543924, "grad_norm": 6.25, "learning_rate": 2e-06, "loss": 10.5231, "mean_token_accuracy": 0.0, "num_tokens": 4250.0, "step": 5 }, { "entropy": 10.742681884765625, "epoch": 0.00022344621091087847, "grad_norm": 7.03125, "learning_rate": 4.5e-06, "loss": 10.4891, "mean_token_accuracy": 0.0, "num_tokens": 8228.0, "step": 10 }, { "entropy": 10.742702770233155, "epoch": 0.0003351693163663177, "grad_norm": 6.8125, "learning_rate": 7e-06, "loss": 10.4445, "mean_token_accuracy": 0.0003105590119957924, "num_tokens": 12209.0, "step": 15 }, { "entropy": 10.742706871032714, "epoch": 0.00044689242182175694, "grad_norm": 6.15625, "learning_rate": 9.5e-06, "loss": 10.3987, "mean_token_accuracy": 0.0004866180010139942, "num_tokens": 16225.0, "step": 20 }, { "entropy": 10.742706108093262, "epoch": 0.0005586155272771962, "grad_norm": 5.53125, "learning_rate": 1.2e-05, "loss": 10.2787, "mean_token_accuracy": 0.010022059944458307, "num_tokens": 20212.0, "step": 25 }, { "entropy": 10.742425346374512, "epoch": 0.0006703386327326354, "grad_norm": 5.0, "learning_rate": 1.4500000000000002e-05, "loss": 10.173, "mean_token_accuracy": 0.04117634426802397, "num_tokens": 24445.0, "step": 30 }, { "entropy": 10.741415786743165, "epoch": 0.0007820617381880747, "grad_norm": 4.3125, "learning_rate": 1.7000000000000003e-05, "loss": 9.9901, "mean_token_accuracy": 0.052767305821180346, "num_tokens": 28365.0, "step": 35 }, { "entropy": 10.739409732818604, "epoch": 0.0008937848436435139, "grad_norm": 3.578125, "learning_rate": 1.95e-05, "loss": 9.9621, "mean_token_accuracy": 0.052475782483816145, "num_tokens": 33055.0, "step": 40 }, { "entropy": 10.7363787651062, "epoch": 0.0010055079490989532, "grad_norm": 2.9375, "learning_rate": 2.2e-05, "loss": 9.811, "mean_token_accuracy": 0.062037082761526106, "num_tokens": 37599.0, "step": 45 }, { "entropy": 10.733420372009277, "epoch": 0.0011172310545543925, "grad_norm": 2.59375, "learning_rate": 2.4500000000000003e-05, "loss": 9.6744, "mean_token_accuracy": 0.06838746592402459, "num_tokens": 41934.0, "step": 50 }, { "entropy": 10.731625938415528, "epoch": 0.0012289541600098315, "grad_norm": 2.53125, "learning_rate": 2.7e-05, "loss": 9.6365, "mean_token_accuracy": 0.05915887728333473, "num_tokens": 46178.0, "step": 55 }, { "entropy": 10.729843425750733, "epoch": 0.0013406772654652708, "grad_norm": 2.5, "learning_rate": 2.95e-05, "loss": 9.5494, "mean_token_accuracy": 0.0692246112972498, "num_tokens": 50513.0, "step": 60 }, { "entropy": 10.727421474456786, "epoch": 0.0014524003709207101, "grad_norm": 2.46875, "learning_rate": 3.2e-05, "loss": 9.5028, "mean_token_accuracy": 0.0702465757727623, "num_tokens": 54924.0, "step": 65 }, { "entropy": 10.722854518890381, "epoch": 0.0015641234763761494, "grad_norm": 2.46875, "learning_rate": 3.4500000000000005e-05, "loss": 9.4107, "mean_token_accuracy": 0.06344567574560642, "num_tokens": 59083.0, "step": 70 }, { "entropy": 10.71631669998169, "epoch": 0.0016758465818315885, "grad_norm": 2.65625, "learning_rate": 3.7e-05, "loss": 9.3233, "mean_token_accuracy": 0.06774163469672204, "num_tokens": 63324.0, "step": 75 }, { "entropy": 10.705671787261963, "epoch": 0.0017875696872870278, "grad_norm": 2.34375, "learning_rate": 3.95e-05, "loss": 9.3567, "mean_token_accuracy": 0.06332114227116108, "num_tokens": 67738.0, "step": 80 }, { "entropy": 10.693737983703613, "epoch": 0.001899292792742467, "grad_norm": 2.234375, "learning_rate": 4.2000000000000004e-05, "loss": 9.2866, "mean_token_accuracy": 0.06349676214158535, "num_tokens": 72305.0, "step": 85 }, { "entropy": 10.675388622283936, "epoch": 0.0020110158981979064, "grad_norm": 2.53125, "learning_rate": 4.45e-05, "loss": 9.0821, "mean_token_accuracy": 0.06834135130047798, "num_tokens": 76579.0, "step": 90 }, { "entropy": 10.65205717086792, "epoch": 0.0021227390036533456, "grad_norm": 2.234375, "learning_rate": 4.7000000000000004e-05, "loss": 9.0421, "mean_token_accuracy": 0.06946654319763183, "num_tokens": 80812.0, "step": 95 }, { "entropy": 10.615971279144286, "epoch": 0.002234462109108785, "grad_norm": 2.109375, "learning_rate": 4.9500000000000004e-05, "loss": 8.9523, "mean_token_accuracy": 0.06732719540596008, "num_tokens": 85090.0, "step": 100 }, { "entropy": 10.59008207321167, "epoch": 0.0023461852145642242, "grad_norm": 2.234375, "learning_rate": 5.2e-05, "loss": 8.888, "mean_token_accuracy": 0.06908667460083961, "num_tokens": 89578.0, "step": 105 }, { "entropy": 10.536420917510986, "epoch": 0.002457908320019663, "grad_norm": 1.96875, "learning_rate": 5.45e-05, "loss": 8.789, "mean_token_accuracy": 0.0728236336261034, "num_tokens": 94117.0, "step": 110 }, { "entropy": 10.488511657714843, "epoch": 0.0025696314254751024, "grad_norm": 2.0625, "learning_rate": 5.7e-05, "loss": 8.6132, "mean_token_accuracy": 0.07127482630312443, "num_tokens": 98082.0, "step": 115 }, { "entropy": 10.439968013763428, "epoch": 0.0026813545309305417, "grad_norm": 1.9765625, "learning_rate": 5.9499999999999996e-05, "loss": 8.5714, "mean_token_accuracy": 0.07672090865671635, "num_tokens": 102327.0, "step": 120 }, { "entropy": 10.355792045593262, "epoch": 0.002793077636385981, "grad_norm": 1.8359375, "learning_rate": 6.2e-05, "loss": 8.4426, "mean_token_accuracy": 0.0740627009421587, "num_tokens": 106567.0, "step": 125 }, { "entropy": 10.286309623718262, "epoch": 0.0029048007418414202, "grad_norm": 1.7890625, "learning_rate": 6.450000000000001e-05, "loss": 8.3003, "mean_token_accuracy": 0.07362989187240601, "num_tokens": 110654.0, "step": 130 }, { "entropy": 10.204053020477295, "epoch": 0.0030165238472968595, "grad_norm": 1.875, "learning_rate": 6.7e-05, "loss": 8.2511, "mean_token_accuracy": 0.062000279501080516, "num_tokens": 114679.0, "step": 135 }, { "entropy": 10.102067852020264, "epoch": 0.003128246952752299, "grad_norm": 1.6171875, "learning_rate": 6.950000000000001e-05, "loss": 8.1849, "mean_token_accuracy": 0.06811538599431514, "num_tokens": 118817.0, "step": 140 }, { "entropy": 9.926943397521972, "epoch": 0.003239970058207738, "grad_norm": 1.484375, "learning_rate": 7.2e-05, "loss": 8.0767, "mean_token_accuracy": 0.06979594528675079, "num_tokens": 123188.0, "step": 145 }, { "entropy": 9.793034744262695, "epoch": 0.003351693163663177, "grad_norm": 1.5078125, "learning_rate": 7.45e-05, "loss": 7.981, "mean_token_accuracy": 0.06847230046987533, "num_tokens": 127767.0, "step": 150 }, { "entropy": 9.643688774108886, "epoch": 0.0034634162691186163, "grad_norm": 1.4921875, "learning_rate": 7.7e-05, "loss": 7.7945, "mean_token_accuracy": 0.06945906654000282, "num_tokens": 131837.0, "step": 155 }, { "entropy": 9.430543518066406, "epoch": 0.0035751393745740555, "grad_norm": 1.2890625, "learning_rate": 7.950000000000001e-05, "loss": 7.7734, "mean_token_accuracy": 0.07027286775410176, "num_tokens": 136247.0, "step": 160 }, { "entropy": 9.239261722564697, "epoch": 0.003686862480029495, "grad_norm": 1.4609375, "learning_rate": 8.2e-05, "loss": 7.5788, "mean_token_accuracy": 0.07950169630348683, "num_tokens": 140170.0, "step": 165 }, { "entropy": 8.976140880584717, "epoch": 0.003798585585484934, "grad_norm": 1.21875, "learning_rate": 8.450000000000001e-05, "loss": 7.6177, "mean_token_accuracy": 0.07785017378628253, "num_tokens": 144139.0, "step": 170 }, { "entropy": 8.843453693389893, "epoch": 0.003910308690940373, "grad_norm": 1.2578125, "learning_rate": 8.7e-05, "loss": 7.5659, "mean_token_accuracy": 0.07487303391098976, "num_tokens": 148792.0, "step": 175 }, { "entropy": 8.658325004577637, "epoch": 0.004022031796395813, "grad_norm": 1.3515625, "learning_rate": 8.95e-05, "loss": 7.4988, "mean_token_accuracy": 0.07942216768860817, "num_tokens": 152844.0, "step": 180 }, { "entropy": 8.59526195526123, "epoch": 0.0041337549018512516, "grad_norm": 1.046875, "learning_rate": 9.2e-05, "loss": 7.527, "mean_token_accuracy": 0.07417443916201591, "num_tokens": 157366.0, "step": 185 }, { "entropy": 8.467089462280274, "epoch": 0.004245478007306691, "grad_norm": 1.0390625, "learning_rate": 9.45e-05, "loss": 7.3623, "mean_token_accuracy": 0.07755868881940842, "num_tokens": 161348.0, "step": 190 }, { "entropy": 8.307873630523682, "epoch": 0.00435720111276213, "grad_norm": 1.140625, "learning_rate": 9.7e-05, "loss": 7.3815, "mean_token_accuracy": 0.08716461397707462, "num_tokens": 165647.0, "step": 195 }, { "entropy": 8.236515140533447, "epoch": 0.00446892421821757, "grad_norm": 1.4453125, "learning_rate": 9.95e-05, "loss": 7.2754, "mean_token_accuracy": 0.08076057620346547, "num_tokens": 169521.0, "step": 200 }, { "entropy": 8.256762790679932, "epoch": 0.004580647323673009, "grad_norm": 1.359375, "learning_rate": 0.000102, "loss": 7.3426, "mean_token_accuracy": 0.0812241055071354, "num_tokens": 173466.0, "step": 205 }, { "entropy": 8.131280899047852, "epoch": 0.0046923704291284484, "grad_norm": 1.1484375, "learning_rate": 0.00010449999999999999, "loss": 7.2826, "mean_token_accuracy": 0.07643571458756923, "num_tokens": 177663.0, "step": 210 }, { "entropy": 8.097990989685059, "epoch": 0.004804093534583887, "grad_norm": 1.21875, "learning_rate": 0.000107, "loss": 7.2745, "mean_token_accuracy": 0.08235705867409707, "num_tokens": 181778.0, "step": 215 }, { "entropy": 8.089111948013306, "epoch": 0.004915816640039326, "grad_norm": 1.6171875, "learning_rate": 0.0001095, "loss": 7.2736, "mean_token_accuracy": 0.08633389472961425, "num_tokens": 185525.0, "step": 220 }, { "entropy": 8.083420944213866, "epoch": 0.005027539745494766, "grad_norm": 1.3671875, "learning_rate": 0.000112, "loss": 7.153, "mean_token_accuracy": 0.08806331530213356, "num_tokens": 189418.0, "step": 225 }, { "entropy": 7.933328151702881, "epoch": 0.005139262850950205, "grad_norm": 1.359375, "learning_rate": 0.0001145, "loss": 7.2217, "mean_token_accuracy": 0.08842612579464912, "num_tokens": 193494.0, "step": 230 }, { "entropy": 8.018900680541993, "epoch": 0.0052509859564056445, "grad_norm": 1.1484375, "learning_rate": 0.00011700000000000001, "loss": 7.2661, "mean_token_accuracy": 0.08137304298579692, "num_tokens": 198018.0, "step": 235 }, { "entropy": 7.955441856384278, "epoch": 0.005362709061861083, "grad_norm": 1.2578125, "learning_rate": 0.00011949999999999999, "loss": 7.1847, "mean_token_accuracy": 0.08625513166189194, "num_tokens": 202296.0, "step": 240 }, { "entropy": 7.9594367980957035, "epoch": 0.005474432167316523, "grad_norm": 1.3203125, "learning_rate": 0.000122, "loss": 7.1706, "mean_token_accuracy": 0.08195730969309807, "num_tokens": 206694.0, "step": 245 }, { "entropy": 7.792031574249267, "epoch": 0.005586155272771962, "grad_norm": 1.390625, "learning_rate": 0.0001245, "loss": 7.2007, "mean_token_accuracy": 0.08904931843280792, "num_tokens": 210810.0, "step": 250 }, { "entropy": 7.920461797714234, "epoch": 0.005697878378227402, "grad_norm": 1.1796875, "learning_rate": 0.000127, "loss": 7.1818, "mean_token_accuracy": 0.0905133418738842, "num_tokens": 215044.0, "step": 255 }, { "entropy": 7.8493430614471436, "epoch": 0.0058096014836828405, "grad_norm": 1.3828125, "learning_rate": 0.0001295, "loss": 7.28, "mean_token_accuracy": 0.08591654896736145, "num_tokens": 219235.0, "step": 260 }, { "entropy": 7.84934287071228, "epoch": 0.005921324589138279, "grad_norm": 1.1328125, "learning_rate": 0.000132, "loss": 7.0922, "mean_token_accuracy": 0.0903876356780529, "num_tokens": 223639.0, "step": 265 }, { "entropy": 7.785561227798462, "epoch": 0.006033047694593719, "grad_norm": 1.25, "learning_rate": 0.00013450000000000002, "loss": 7.1258, "mean_token_accuracy": 0.09057728350162506, "num_tokens": 227873.0, "step": 270 }, { "entropy": 7.707937574386596, "epoch": 0.006144770800049158, "grad_norm": 1.3671875, "learning_rate": 0.00013700000000000002, "loss": 7.0661, "mean_token_accuracy": 0.09807337448000908, "num_tokens": 232147.0, "step": 275 }, { "entropy": 7.739069509506225, "epoch": 0.006256493905504598, "grad_norm": 1.6015625, "learning_rate": 0.0001395, "loss": 7.1358, "mean_token_accuracy": 0.09250000454485416, "num_tokens": 236456.0, "step": 280 }, { "entropy": 7.7190714359283445, "epoch": 0.0063682170109600365, "grad_norm": 1.1953125, "learning_rate": 0.00014199999999999998, "loss": 7.1583, "mean_token_accuracy": 0.09051149562001229, "num_tokens": 241039.0, "step": 285 }, { "entropy": 7.938947439193726, "epoch": 0.006479940116415476, "grad_norm": 1.6640625, "learning_rate": 0.0001445, "loss": 7.1915, "mean_token_accuracy": 0.08653632178902626, "num_tokens": 245132.0, "step": 290 }, { "entropy": 7.673107481002807, "epoch": 0.006591663221870915, "grad_norm": 1.28125, "learning_rate": 0.000147, "loss": 7.0872, "mean_token_accuracy": 0.09988043382763863, "num_tokens": 249152.0, "step": 295 }, { "entropy": 7.712965631484986, "epoch": 0.006703386327326354, "grad_norm": 1.21875, "learning_rate": 0.0001495, "loss": 7.0503, "mean_token_accuracy": 0.09596830010414123, "num_tokens": 253439.0, "step": 300 }, { "entropy": 7.6600532054901125, "epoch": 0.006815109432781794, "grad_norm": 1.546875, "learning_rate": 0.000152, "loss": 7.0731, "mean_token_accuracy": 0.09302671104669571, "num_tokens": 258066.0, "step": 305 }, { "entropy": 7.665358448028565, "epoch": 0.0069268325382372325, "grad_norm": 1.4375, "learning_rate": 0.00015450000000000001, "loss": 7.0332, "mean_token_accuracy": 0.0973147690296173, "num_tokens": 261954.0, "step": 310 }, { "entropy": 7.616210794448852, "epoch": 0.007038555643692672, "grad_norm": 1.4453125, "learning_rate": 0.000157, "loss": 7.0779, "mean_token_accuracy": 0.10462095588445663, "num_tokens": 266650.0, "step": 315 }, { "entropy": 7.689846324920654, "epoch": 0.007150278749148111, "grad_norm": 1.3203125, "learning_rate": 0.0001595, "loss": 7.1433, "mean_token_accuracy": 0.09891897812485695, "num_tokens": 271069.0, "step": 320 }, { "entropy": 7.705677938461304, "epoch": 0.007262001854603551, "grad_norm": 1.359375, "learning_rate": 0.000162, "loss": 7.0039, "mean_token_accuracy": 0.10242248028516769, "num_tokens": 275084.0, "step": 325 }, { "entropy": 7.603102445602417, "epoch": 0.00737372496005899, "grad_norm": 1.390625, "learning_rate": 0.00016450000000000001, "loss": 7.0745, "mean_token_accuracy": 0.1031483568251133, "num_tokens": 279721.0, "step": 330 }, { "entropy": 7.619607782363891, "epoch": 0.007485448065514429, "grad_norm": 1.296875, "learning_rate": 0.00016700000000000002, "loss": 7.0708, "mean_token_accuracy": 0.10527726709842682, "num_tokens": 284317.0, "step": 335 }, { "entropy": 7.600710487365722, "epoch": 0.007597171170969868, "grad_norm": 1.3984375, "learning_rate": 0.00016950000000000003, "loss": 7.0451, "mean_token_accuracy": 0.10766607597470283, "num_tokens": 288870.0, "step": 340 }, { "entropy": 7.61973729133606, "epoch": 0.007708894276425307, "grad_norm": 1.3046875, "learning_rate": 0.00017199999999999998, "loss": 6.9812, "mean_token_accuracy": 0.11351362988352776, "num_tokens": 292996.0, "step": 345 }, { "entropy": 7.5854551792144775, "epoch": 0.007820617381880746, "grad_norm": 1.4765625, "learning_rate": 0.00017449999999999999, "loss": 6.9806, "mean_token_accuracy": 0.10384939089417458, "num_tokens": 297238.0, "step": 350 }, { "entropy": 7.531381893157959, "epoch": 0.007932340487336187, "grad_norm": 1.484375, "learning_rate": 0.000177, "loss": 6.9793, "mean_token_accuracy": 0.1117280475795269, "num_tokens": 301453.0, "step": 355 }, { "entropy": 7.653309726715088, "epoch": 0.008044063592791625, "grad_norm": 1.7890625, "learning_rate": 0.0001795, "loss": 6.9327, "mean_token_accuracy": 0.10786554217338562, "num_tokens": 305949.0, "step": 360 }, { "entropy": 7.534815788269043, "epoch": 0.008155786698247064, "grad_norm": 1.5625, "learning_rate": 0.000182, "loss": 6.9583, "mean_token_accuracy": 0.11513907313346863, "num_tokens": 310097.0, "step": 365 }, { "entropy": 7.547474193572998, "epoch": 0.008267509803702503, "grad_norm": 1.390625, "learning_rate": 0.0001845, "loss": 6.9517, "mean_token_accuracy": 0.10539396926760673, "num_tokens": 314567.0, "step": 370 }, { "entropy": 7.457708692550659, "epoch": 0.008379232909157944, "grad_norm": 1.4375, "learning_rate": 0.000187, "loss": 7.0323, "mean_token_accuracy": 0.10927818715572357, "num_tokens": 319166.0, "step": 375 }, { "entropy": 7.515052604675293, "epoch": 0.008490956014613383, "grad_norm": 1.421875, "learning_rate": 0.0001895, "loss": 6.9204, "mean_token_accuracy": 0.11223233640193939, "num_tokens": 323682.0, "step": 380 }, { "entropy": 7.488761281967163, "epoch": 0.008602679120068821, "grad_norm": 1.421875, "learning_rate": 0.000192, "loss": 6.8299, "mean_token_accuracy": 0.12143486216664315, "num_tokens": 327994.0, "step": 385 }, { "entropy": 7.412152099609375, "epoch": 0.00871440222552426, "grad_norm": 1.3671875, "learning_rate": 0.0001945, "loss": 6.9058, "mean_token_accuracy": 0.11854805946350097, "num_tokens": 332026.0, "step": 390 }, { "entropy": 7.578387832641601, "epoch": 0.0088261253309797, "grad_norm": 1.796875, "learning_rate": 0.00019700000000000002, "loss": 6.9475, "mean_token_accuracy": 0.11811894848942757, "num_tokens": 336552.0, "step": 395 }, { "entropy": 7.504688882827759, "epoch": 0.00893784843643514, "grad_norm": 1.4921875, "learning_rate": 0.00019950000000000002, "loss": 6.8884, "mean_token_accuracy": 0.1116393692791462, "num_tokens": 340643.0, "step": 400 }, { "entropy": 7.4580738067626955, "epoch": 0.009049571541890579, "grad_norm": 1.375, "learning_rate": 0.000202, "loss": 6.8029, "mean_token_accuracy": 0.12075437754392623, "num_tokens": 344886.0, "step": 405 }, { "entropy": 7.3586314678192135, "epoch": 0.009161294647346017, "grad_norm": 1.328125, "learning_rate": 0.00020449999999999998, "loss": 6.8632, "mean_token_accuracy": 0.1191755935549736, "num_tokens": 349115.0, "step": 410 }, { "entropy": 7.527571535110473, "epoch": 0.009273017752801456, "grad_norm": 1.5625, "learning_rate": 0.000207, "loss": 6.8235, "mean_token_accuracy": 0.12523479163646697, "num_tokens": 353368.0, "step": 415 }, { "entropy": 7.509571599960327, "epoch": 0.009384740858256897, "grad_norm": 1.484375, "learning_rate": 0.0002095, "loss": 6.8657, "mean_token_accuracy": 0.10757644474506378, "num_tokens": 357382.0, "step": 420 }, { "entropy": 7.441834354400635, "epoch": 0.009496463963712336, "grad_norm": 1.7265625, "learning_rate": 0.000212, "loss": 6.7799, "mean_token_accuracy": 0.12460733354091644, "num_tokens": 361542.0, "step": 425 }, { "entropy": 7.419776153564453, "epoch": 0.009608187069167775, "grad_norm": 1.2578125, "learning_rate": 0.0002145, "loss": 6.84, "mean_token_accuracy": 0.11263928636908531, "num_tokens": 366006.0, "step": 430 }, { "entropy": 7.367758464813233, "epoch": 0.009719910174623213, "grad_norm": 1.578125, "learning_rate": 0.00021700000000000002, "loss": 6.7506, "mean_token_accuracy": 0.12978531718254088, "num_tokens": 370021.0, "step": 435 }, { "entropy": 7.408233594894409, "epoch": 0.009831633280078652, "grad_norm": 1.703125, "learning_rate": 0.0002195, "loss": 6.8773, "mean_token_accuracy": 0.12414649501442909, "num_tokens": 374434.0, "step": 440 }, { "entropy": 7.441655158996582, "epoch": 0.009943356385534093, "grad_norm": 1.3515625, "learning_rate": 0.000222, "loss": 6.9027, "mean_token_accuracy": 0.11324851140379906, "num_tokens": 378634.0, "step": 445 }, { "entropy": 7.315918588638306, "epoch": 0.010055079490989532, "grad_norm": 1.4453125, "learning_rate": 0.0002245, "loss": 6.7869, "mean_token_accuracy": 0.1252473659813404, "num_tokens": 382904.0, "step": 450 }, { "entropy": 7.470485210418701, "epoch": 0.01016680259644497, "grad_norm": 1.328125, "learning_rate": 0.00022700000000000002, "loss": 6.7635, "mean_token_accuracy": 0.12090180814266205, "num_tokens": 386970.0, "step": 455 }, { "entropy": 7.348088216781616, "epoch": 0.01027852570190041, "grad_norm": 1.25, "learning_rate": 0.00022950000000000002, "loss": 6.729, "mean_token_accuracy": 0.13390202075242996, "num_tokens": 391043.0, "step": 460 }, { "entropy": 7.392842721939087, "epoch": 0.01039024880735585, "grad_norm": 1.3125, "learning_rate": 0.00023200000000000003, "loss": 6.7204, "mean_token_accuracy": 0.13383440747857095, "num_tokens": 395413.0, "step": 465 }, { "entropy": 7.40152382850647, "epoch": 0.010501971912811289, "grad_norm": 1.6015625, "learning_rate": 0.00023449999999999998, "loss": 6.8385, "mean_token_accuracy": 0.12566340565681458, "num_tokens": 399821.0, "step": 470 }, { "entropy": 7.2655271053314205, "epoch": 0.010613695018266728, "grad_norm": 1.34375, "learning_rate": 0.000237, "loss": 6.6582, "mean_token_accuracy": 0.13715523406863211, "num_tokens": 404043.0, "step": 475 }, { "entropy": 7.422811889648438, "epoch": 0.010725418123722167, "grad_norm": 1.65625, "learning_rate": 0.0002395, "loss": 6.803, "mean_token_accuracy": 0.1260749615728855, "num_tokens": 408339.0, "step": 480 }, { "entropy": 7.263138484954834, "epoch": 0.010837141229177606, "grad_norm": 1.4453125, "learning_rate": 0.000242, "loss": 6.6856, "mean_token_accuracy": 0.13459724336862564, "num_tokens": 412384.0, "step": 485 }, { "entropy": 7.3362548828125, "epoch": 0.010948864334633046, "grad_norm": 1.4921875, "learning_rate": 0.0002445, "loss": 6.8108, "mean_token_accuracy": 0.12614913210272788, "num_tokens": 416891.0, "step": 490 }, { "entropy": 7.36023063659668, "epoch": 0.011060587440088485, "grad_norm": 1.4921875, "learning_rate": 0.000247, "loss": 6.6642, "mean_token_accuracy": 0.1329216368496418, "num_tokens": 420980.0, "step": 495 }, { "entropy": 7.2991362571716305, "epoch": 0.011172310545543924, "grad_norm": 1.6640625, "learning_rate": 0.0002495, "loss": 6.7212, "mean_token_accuracy": 0.13121648952364923, "num_tokens": 425454.0, "step": 500 }, { "entropy": 7.339645147323608, "epoch": 0.011284033650999363, "grad_norm": 1.40625, "learning_rate": 0.000252, "loss": 6.7456, "mean_token_accuracy": 0.12466516643762589, "num_tokens": 430087.0, "step": 505 }, { "entropy": 7.289543581008911, "epoch": 0.011395756756454803, "grad_norm": 1.78125, "learning_rate": 0.0002545, "loss": 6.674, "mean_token_accuracy": 0.13790097907185556, "num_tokens": 434062.0, "step": 510 }, { "entropy": 7.032429838180542, "epoch": 0.011507479861910242, "grad_norm": 1.53125, "learning_rate": 0.000257, "loss": 6.6567, "mean_token_accuracy": 0.13795162215828896, "num_tokens": 437882.0, "step": 515 }, { "entropy": 7.306787776947021, "epoch": 0.011619202967365681, "grad_norm": 1.4296875, "learning_rate": 0.0002595, "loss": 6.6863, "mean_token_accuracy": 0.13145707920193672, "num_tokens": 442248.0, "step": 520 }, { "entropy": 7.304118633270264, "epoch": 0.01173092607282112, "grad_norm": 1.953125, "learning_rate": 0.000262, "loss": 6.6543, "mean_token_accuracy": 0.13509279638528823, "num_tokens": 446492.0, "step": 525 }, { "entropy": 7.187564706802368, "epoch": 0.011842649178276559, "grad_norm": 1.4140625, "learning_rate": 0.00026450000000000003, "loss": 6.6878, "mean_token_accuracy": 0.1363551899790764, "num_tokens": 450359.0, "step": 530 }, { "entropy": 7.3090503215789795, "epoch": 0.011954372283732, "grad_norm": 1.4609375, "learning_rate": 0.00026700000000000004, "loss": 6.6973, "mean_token_accuracy": 0.13162412643432617, "num_tokens": 454508.0, "step": 535 }, { "entropy": 7.349436283111572, "epoch": 0.012066095389187438, "grad_norm": 1.421875, "learning_rate": 0.00026950000000000005, "loss": 6.666, "mean_token_accuracy": 0.133739610016346, "num_tokens": 459028.0, "step": 540 }, { "entropy": 7.165412759780883, "epoch": 0.012177818494642877, "grad_norm": 1.546875, "learning_rate": 0.00027200000000000005, "loss": 6.6084, "mean_token_accuracy": 0.13254478350281715, "num_tokens": 462911.0, "step": 545 }, { "entropy": 7.141500329971313, "epoch": 0.012289541600098316, "grad_norm": 1.4296875, "learning_rate": 0.0002745, "loss": 6.6564, "mean_token_accuracy": 0.13796778842806817, "num_tokens": 467129.0, "step": 550 }, { "entropy": 7.303054237365723, "epoch": 0.012401264705553756, "grad_norm": 1.515625, "learning_rate": 0.000277, "loss": 6.5808, "mean_token_accuracy": 0.139659858494997, "num_tokens": 471324.0, "step": 555 }, { "entropy": 7.217333889007568, "epoch": 0.012512987811009195, "grad_norm": 1.5625, "learning_rate": 0.0002795, "loss": 6.7315, "mean_token_accuracy": 0.12687695473432542, "num_tokens": 475761.0, "step": 560 }, { "entropy": 7.198446226119995, "epoch": 0.012624710916464634, "grad_norm": 1.375, "learning_rate": 0.00028199999999999997, "loss": 6.5304, "mean_token_accuracy": 0.13995881900191307, "num_tokens": 479532.0, "step": 565 }, { "entropy": 7.236463642120361, "epoch": 0.012736434021920073, "grad_norm": 1.4296875, "learning_rate": 0.0002845, "loss": 6.6868, "mean_token_accuracy": 0.13790254518389702, "num_tokens": 483844.0, "step": 570 }, { "entropy": 7.325732755661011, "epoch": 0.012848157127375512, "grad_norm": 1.3984375, "learning_rate": 0.000287, "loss": 6.7171, "mean_token_accuracy": 0.1334671013057232, "num_tokens": 488402.0, "step": 575 }, { "entropy": 7.063196992874145, "epoch": 0.012959880232830952, "grad_norm": 1.4375, "learning_rate": 0.0002895, "loss": 6.4612, "mean_token_accuracy": 0.14302808195352554, "num_tokens": 492473.0, "step": 580 }, { "entropy": 7.110111331939697, "epoch": 0.013071603338286391, "grad_norm": 1.59375, "learning_rate": 0.000292, "loss": 6.568, "mean_token_accuracy": 0.14078054577112198, "num_tokens": 496830.0, "step": 585 }, { "entropy": 7.2420226573944095, "epoch": 0.01318332644374183, "grad_norm": 1.53125, "learning_rate": 0.0002945, "loss": 6.631, "mean_token_accuracy": 0.13996972143650055, "num_tokens": 500996.0, "step": 590 }, { "entropy": 7.113031435012817, "epoch": 0.013295049549197269, "grad_norm": 1.5390625, "learning_rate": 0.000297, "loss": 6.5937, "mean_token_accuracy": 0.13735369965434074, "num_tokens": 505174.0, "step": 595 }, { "entropy": 7.149052238464355, "epoch": 0.013406772654652708, "grad_norm": 1.4921875, "learning_rate": 0.0002995, "loss": 6.5965, "mean_token_accuracy": 0.14343740195035934, "num_tokens": 509425.0, "step": 600 }, { "entropy": 7.0243888854980465, "epoch": 0.013518495760108148, "grad_norm": 1.796875, "learning_rate": 0.000302, "loss": 6.4748, "mean_token_accuracy": 0.14360842779278754, "num_tokens": 513271.0, "step": 605 }, { "entropy": 7.198608922958374, "epoch": 0.013630218865563587, "grad_norm": 1.4375, "learning_rate": 0.0003045, "loss": 6.641, "mean_token_accuracy": 0.146073829382658, "num_tokens": 517879.0, "step": 610 }, { "entropy": 7.045837879180908, "epoch": 0.013741941971019026, "grad_norm": 1.3984375, "learning_rate": 0.000307, "loss": 6.6375, "mean_token_accuracy": 0.13332991302013397, "num_tokens": 522321.0, "step": 615 }, { "entropy": 7.1533918380737305, "epoch": 0.013853665076474465, "grad_norm": 1.59375, "learning_rate": 0.0003095, "loss": 6.4453, "mean_token_accuracy": 0.15030871629714965, "num_tokens": 525884.0, "step": 620 }, { "entropy": 7.081359624862671, "epoch": 0.013965388181929906, "grad_norm": 1.6484375, "learning_rate": 0.000312, "loss": 6.5558, "mean_token_accuracy": 0.145179907977581, "num_tokens": 530373.0, "step": 625 }, { "entropy": 7.055321979522705, "epoch": 0.014077111287385344, "grad_norm": 1.4765625, "learning_rate": 0.0003145, "loss": 6.3843, "mean_token_accuracy": 0.15800822898745537, "num_tokens": 534571.0, "step": 630 }, { "entropy": 7.117716646194458, "epoch": 0.014188834392840783, "grad_norm": 1.53125, "learning_rate": 0.000317, "loss": 6.5609, "mean_token_accuracy": 0.13290601670742036, "num_tokens": 538938.0, "step": 635 }, { "entropy": 7.100346279144287, "epoch": 0.014300557498296222, "grad_norm": 1.5234375, "learning_rate": 0.0003195, "loss": 6.5165, "mean_token_accuracy": 0.1455024905502796, "num_tokens": 542977.0, "step": 640 }, { "entropy": 7.071889972686767, "epoch": 0.014412280603751661, "grad_norm": 1.5859375, "learning_rate": 0.000322, "loss": 6.4884, "mean_token_accuracy": 0.1465342827141285, "num_tokens": 547005.0, "step": 645 }, { "entropy": 7.0628125190734865, "epoch": 0.014524003709207102, "grad_norm": 1.40625, "learning_rate": 0.00032450000000000003, "loss": 6.5272, "mean_token_accuracy": 0.13975051417946815, "num_tokens": 551361.0, "step": 650 }, { "entropy": 7.023661518096924, "epoch": 0.01463572681466254, "grad_norm": 1.546875, "learning_rate": 0.00032700000000000003, "loss": 6.4833, "mean_token_accuracy": 0.14340668320655822, "num_tokens": 555574.0, "step": 655 }, { "entropy": 6.989748096466064, "epoch": 0.01474744992011798, "grad_norm": 1.515625, "learning_rate": 0.00032950000000000004, "loss": 6.5127, "mean_token_accuracy": 0.14289377629756927, "num_tokens": 560368.0, "step": 660 }, { "entropy": 7.077346086502075, "epoch": 0.014859173025573418, "grad_norm": 1.5390625, "learning_rate": 0.00033200000000000005, "loss": 6.5634, "mean_token_accuracy": 0.1373551793396473, "num_tokens": 564884.0, "step": 665 }, { "entropy": 7.1274079322814945, "epoch": 0.014970896131028859, "grad_norm": 1.4453125, "learning_rate": 0.00033450000000000005, "loss": 6.5586, "mean_token_accuracy": 0.1458572693169117, "num_tokens": 569494.0, "step": 670 }, { "entropy": 7.002107572555542, "epoch": 0.015082619236484298, "grad_norm": 1.515625, "learning_rate": 0.000337, "loss": 6.5547, "mean_token_accuracy": 0.13712208420038224, "num_tokens": 574161.0, "step": 675 }, { "entropy": 7.072530221939087, "epoch": 0.015194342341939737, "grad_norm": 1.8828125, "learning_rate": 0.0003395, "loss": 6.4626, "mean_token_accuracy": 0.14953978583216668, "num_tokens": 578510.0, "step": 680 }, { "entropy": 6.912496089935303, "epoch": 0.015306065447395175, "grad_norm": 1.7265625, "learning_rate": 0.000342, "loss": 6.4979, "mean_token_accuracy": 0.1424515500664711, "num_tokens": 582695.0, "step": 685 }, { "entropy": 7.025566148757934, "epoch": 0.015417788552850614, "grad_norm": 1.4453125, "learning_rate": 0.00034449999999999997, "loss": 6.4376, "mean_token_accuracy": 0.13332833126187324, "num_tokens": 586622.0, "step": 690 }, { "entropy": 7.013347387313843, "epoch": 0.015529511658306055, "grad_norm": 1.8046875, "learning_rate": 0.000347, "loss": 6.4683, "mean_token_accuracy": 0.14548797011375428, "num_tokens": 591704.0, "step": 695 }, { "entropy": 6.755953073501587, "epoch": 0.015641234763761492, "grad_norm": 1.5546875, "learning_rate": 0.0003495, "loss": 6.276, "mean_token_accuracy": 0.15882887542247773, "num_tokens": 596029.0, "step": 700 }, { "entropy": 6.987126111984253, "epoch": 0.015752957869216933, "grad_norm": 1.5, "learning_rate": 0.000352, "loss": 6.4419, "mean_token_accuracy": 0.14538582488894464, "num_tokens": 600115.0, "step": 705 }, { "entropy": 6.84918212890625, "epoch": 0.015864680974672373, "grad_norm": 1.625, "learning_rate": 0.0003545, "loss": 6.4459, "mean_token_accuracy": 0.14390605613589286, "num_tokens": 604488.0, "step": 710 }, { "entropy": 7.084057378768921, "epoch": 0.01597640408012781, "grad_norm": 1.5625, "learning_rate": 0.000357, "loss": 6.4439, "mean_token_accuracy": 0.14581422209739686, "num_tokens": 608776.0, "step": 715 }, { "entropy": 6.7030833721160885, "epoch": 0.01608812718558325, "grad_norm": 1.5625, "learning_rate": 0.0003595, "loss": 6.4504, "mean_token_accuracy": 0.15099047794938086, "num_tokens": 612771.0, "step": 720 }, { "entropy": 6.734662055969238, "epoch": 0.01619985029103869, "grad_norm": 1.453125, "learning_rate": 0.000362, "loss": 6.3566, "mean_token_accuracy": 0.1516010656952858, "num_tokens": 616947.0, "step": 725 }, { "entropy": 6.9721879959106445, "epoch": 0.01631157339649413, "grad_norm": 1.4453125, "learning_rate": 0.0003645, "loss": 6.4281, "mean_token_accuracy": 0.15130600407719613, "num_tokens": 621064.0, "step": 730 }, { "entropy": 6.856808233261108, "epoch": 0.01642329650194957, "grad_norm": 1.515625, "learning_rate": 0.000367, "loss": 6.4363, "mean_token_accuracy": 0.14606723934412003, "num_tokens": 625349.0, "step": 735 }, { "entropy": 6.863543367385864, "epoch": 0.016535019607405006, "grad_norm": 1.3984375, "learning_rate": 0.0003695, "loss": 6.3498, "mean_token_accuracy": 0.15212762504816055, "num_tokens": 629754.0, "step": 740 }, { "entropy": 6.9013481616973875, "epoch": 0.016646742712860447, "grad_norm": 1.453125, "learning_rate": 0.000372, "loss": 6.3994, "mean_token_accuracy": 0.13600233122706412, "num_tokens": 634111.0, "step": 745 }, { "entropy": 6.783719491958618, "epoch": 0.016758465818315887, "grad_norm": 1.5390625, "learning_rate": 0.0003745, "loss": 6.2659, "mean_token_accuracy": 0.16154912412166594, "num_tokens": 638527.0, "step": 750 }, { "entropy": 6.681515502929687, "epoch": 0.016870188923771325, "grad_norm": 1.6875, "learning_rate": 0.000377, "loss": 6.2959, "mean_token_accuracy": 0.15193597078323365, "num_tokens": 642530.0, "step": 755 }, { "entropy": 6.832862663269043, "epoch": 0.016981912029226765, "grad_norm": 1.4609375, "learning_rate": 0.0003795, "loss": 6.3628, "mean_token_accuracy": 0.15937515050172807, "num_tokens": 646707.0, "step": 760 }, { "entropy": 6.755436944961548, "epoch": 0.017093635134682202, "grad_norm": 1.3984375, "learning_rate": 0.000382, "loss": 6.4113, "mean_token_accuracy": 0.15184309035539628, "num_tokens": 651326.0, "step": 765 }, { "entropy": 6.871073818206787, "epoch": 0.017205358240137643, "grad_norm": 1.4453125, "learning_rate": 0.0003845, "loss": 6.4618, "mean_token_accuracy": 0.15124865621328354, "num_tokens": 655651.0, "step": 770 }, { "entropy": 6.727626895904541, "epoch": 0.017317081345593083, "grad_norm": 1.4140625, "learning_rate": 0.00038700000000000003, "loss": 6.3946, "mean_token_accuracy": 0.14350455030798911, "num_tokens": 659868.0, "step": 775 }, { "entropy": 6.921767711639404, "epoch": 0.01742880445104852, "grad_norm": 1.484375, "learning_rate": 0.00038950000000000003, "loss": 6.4403, "mean_token_accuracy": 0.14834593906998633, "num_tokens": 664404.0, "step": 780 }, { "entropy": 6.699166393280029, "epoch": 0.01754052755650396, "grad_norm": 1.6953125, "learning_rate": 0.00039200000000000004, "loss": 6.2549, "mean_token_accuracy": 0.1540958382189274, "num_tokens": 668543.0, "step": 785 }, { "entropy": 6.646093511581421, "epoch": 0.0176522506619594, "grad_norm": 1.421875, "learning_rate": 0.00039450000000000005, "loss": 6.2322, "mean_token_accuracy": 0.15864449143409728, "num_tokens": 672637.0, "step": 790 }, { "entropy": 6.7683539390563965, "epoch": 0.01776397376741484, "grad_norm": 1.390625, "learning_rate": 0.00039700000000000005, "loss": 6.3473, "mean_token_accuracy": 0.15486485213041307, "num_tokens": 676929.0, "step": 795 }, { "entropy": 6.6748552322387695, "epoch": 0.01787569687287028, "grad_norm": 1.4375, "learning_rate": 0.0003995, "loss": 6.3935, "mean_token_accuracy": 0.14576203897595405, "num_tokens": 681287.0, "step": 800 }, { "entropy": 6.773136854171753, "epoch": 0.017987419978325717, "grad_norm": 1.4765625, "learning_rate": 0.000402, "loss": 6.2896, "mean_token_accuracy": 0.14774783104658126, "num_tokens": 685543.0, "step": 805 }, { "entropy": 6.736042118072509, "epoch": 0.018099143083781157, "grad_norm": 1.4765625, "learning_rate": 0.0004045, "loss": 6.2738, "mean_token_accuracy": 0.14571748450398445, "num_tokens": 689479.0, "step": 810 }, { "entropy": 6.654993534088135, "epoch": 0.018210866189236594, "grad_norm": 1.4453125, "learning_rate": 0.00040699999999999997, "loss": 6.2274, "mean_token_accuracy": 0.16590481102466584, "num_tokens": 693564.0, "step": 815 }, { "entropy": 6.655237770080566, "epoch": 0.018322589294692035, "grad_norm": 1.5234375, "learning_rate": 0.0004095, "loss": 6.3025, "mean_token_accuracy": 0.15087175816297532, "num_tokens": 697719.0, "step": 820 }, { "entropy": 6.695311594009399, "epoch": 0.018434312400147475, "grad_norm": 1.3828125, "learning_rate": 0.000412, "loss": 6.4423, "mean_token_accuracy": 0.1513037145137787, "num_tokens": 701937.0, "step": 825 }, { "entropy": 6.683870124816894, "epoch": 0.018546035505602913, "grad_norm": 1.4609375, "learning_rate": 0.0004145, "loss": 6.3653, "mean_token_accuracy": 0.14852157458662987, "num_tokens": 706457.0, "step": 830 }, { "entropy": 6.849027347564697, "epoch": 0.018657758611058353, "grad_norm": 1.59375, "learning_rate": 0.000417, "loss": 6.38, "mean_token_accuracy": 0.15221845954656602, "num_tokens": 710538.0, "step": 835 }, { "entropy": 6.5683678150177, "epoch": 0.018769481716513794, "grad_norm": 1.5078125, "learning_rate": 0.0004195, "loss": 6.248, "mean_token_accuracy": 0.16236387193202972, "num_tokens": 714663.0, "step": 840 }, { "entropy": 6.697820091247559, "epoch": 0.01888120482196923, "grad_norm": 1.3984375, "learning_rate": 0.000422, "loss": 6.2517, "mean_token_accuracy": 0.1534279391169548, "num_tokens": 718828.0, "step": 845 }, { "entropy": 6.571414041519165, "epoch": 0.01899292792742467, "grad_norm": 1.375, "learning_rate": 0.0004245, "loss": 6.2619, "mean_token_accuracy": 0.15856588035821914, "num_tokens": 723155.0, "step": 850 }, { "entropy": 6.808944749832153, "epoch": 0.01910465103288011, "grad_norm": 1.3984375, "learning_rate": 0.000427, "loss": 6.3217, "mean_token_accuracy": 0.15492385476827622, "num_tokens": 727605.0, "step": 855 }, { "entropy": 6.5332495212554935, "epoch": 0.01921637413833555, "grad_norm": 1.53125, "learning_rate": 0.0004295, "loss": 6.2777, "mean_token_accuracy": 0.14969536513090134, "num_tokens": 731510.0, "step": 860 }, { "entropy": 6.71437783241272, "epoch": 0.01932809724379099, "grad_norm": 1.34375, "learning_rate": 0.000432, "loss": 6.4619, "mean_token_accuracy": 0.1470765456557274, "num_tokens": 736048.0, "step": 865 }, { "entropy": 6.678787279129028, "epoch": 0.019439820349246427, "grad_norm": 1.421875, "learning_rate": 0.0004345, "loss": 6.2512, "mean_token_accuracy": 0.15854543596506118, "num_tokens": 740714.0, "step": 870 }, { "entropy": 6.619452238082886, "epoch": 0.019551543454701868, "grad_norm": 1.6171875, "learning_rate": 0.000437, "loss": 6.3257, "mean_token_accuracy": 0.15650657266378404, "num_tokens": 745167.0, "step": 875 }, { "entropy": 6.694642496109009, "epoch": 0.019663266560157305, "grad_norm": 1.296875, "learning_rate": 0.0004395, "loss": 6.1537, "mean_token_accuracy": 0.16280549690127372, "num_tokens": 749401.0, "step": 880 }, { "entropy": 6.627922868728637, "epoch": 0.019774989665612745, "grad_norm": 1.359375, "learning_rate": 0.000442, "loss": 6.2224, "mean_token_accuracy": 0.16411646455526352, "num_tokens": 753554.0, "step": 885 }, { "entropy": 6.636331701278687, "epoch": 0.019886712771068186, "grad_norm": 1.5234375, "learning_rate": 0.0004445, "loss": 6.2725, "mean_token_accuracy": 0.14831542521715163, "num_tokens": 757744.0, "step": 890 }, { "entropy": 6.588339996337891, "epoch": 0.019998435876523623, "grad_norm": 1.4375, "learning_rate": 0.000447, "loss": 6.1633, "mean_token_accuracy": 0.15396574288606643, "num_tokens": 761858.0, "step": 895 }, { "entropy": 6.522528743743896, "epoch": 0.020110158981979064, "grad_norm": 1.453125, "learning_rate": 0.00044950000000000003, "loss": 6.1908, "mean_token_accuracy": 0.16618741899728776, "num_tokens": 765852.0, "step": 900 }, { "entropy": 6.504194116592407, "epoch": 0.0202218820874345, "grad_norm": 1.515625, "learning_rate": 0.00045200000000000004, "loss": 6.195, "mean_token_accuracy": 0.1562537170946598, "num_tokens": 770274.0, "step": 905 }, { "entropy": 6.654917287826538, "epoch": 0.02033360519288994, "grad_norm": 1.3203125, "learning_rate": 0.00045450000000000004, "loss": 6.2299, "mean_token_accuracy": 0.15019148588180542, "num_tokens": 774775.0, "step": 910 }, { "entropy": 6.611723136901856, "epoch": 0.020445328298345382, "grad_norm": 1.421875, "learning_rate": 0.00045700000000000005, "loss": 6.2769, "mean_token_accuracy": 0.16535960435867308, "num_tokens": 779374.0, "step": 915 }, { "entropy": 6.6483289241790775, "epoch": 0.02055705140380082, "grad_norm": 1.5390625, "learning_rate": 0.00045950000000000006, "loss": 6.2519, "mean_token_accuracy": 0.1526936858892441, "num_tokens": 783537.0, "step": 920 }, { "entropy": 6.512672090530396, "epoch": 0.02066877450925626, "grad_norm": 1.4609375, "learning_rate": 0.000462, "loss": 6.3001, "mean_token_accuracy": 0.15667854249477386, "num_tokens": 787562.0, "step": 925 }, { "entropy": 6.7299144744873045, "epoch": 0.0207804976147117, "grad_norm": 1.4140625, "learning_rate": 0.0004645, "loss": 6.3687, "mean_token_accuracy": 0.1443271040916443, "num_tokens": 792303.0, "step": 930 }, { "entropy": 6.5588274002075195, "epoch": 0.020892220720167137, "grad_norm": 1.3359375, "learning_rate": 0.000467, "loss": 6.1584, "mean_token_accuracy": 0.16573118567466735, "num_tokens": 796402.0, "step": 935 }, { "entropy": 6.613452911376953, "epoch": 0.021003943825622578, "grad_norm": 1.5, "learning_rate": 0.0004695, "loss": 6.2314, "mean_token_accuracy": 0.16508372873067856, "num_tokens": 800004.0, "step": 940 }, { "entropy": 6.341346979141235, "epoch": 0.021115666931078015, "grad_norm": 1.4140625, "learning_rate": 0.000472, "loss": 6.117, "mean_token_accuracy": 0.16131858825683593, "num_tokens": 803782.0, "step": 945 }, { "entropy": 6.510941028594971, "epoch": 0.021227390036533456, "grad_norm": 1.328125, "learning_rate": 0.0004745, "loss": 6.1673, "mean_token_accuracy": 0.16098449528217315, "num_tokens": 808013.0, "step": 950 }, { "entropy": 6.4510125637054445, "epoch": 0.021339113141988896, "grad_norm": 1.3359375, "learning_rate": 0.000477, "loss": 6.1669, "mean_token_accuracy": 0.1604609191417694, "num_tokens": 812130.0, "step": 955 }, { "entropy": 6.501435708999634, "epoch": 0.021450836247444333, "grad_norm": 1.4296875, "learning_rate": 0.0004795, "loss": 6.1342, "mean_token_accuracy": 0.16159728765487671, "num_tokens": 816246.0, "step": 960 }, { "entropy": 6.510243511199951, "epoch": 0.021562559352899774, "grad_norm": 1.4140625, "learning_rate": 0.000482, "loss": 6.1317, "mean_token_accuracy": 0.16103896945714952, "num_tokens": 820628.0, "step": 965 }, { "entropy": 6.387259864807129, "epoch": 0.02167428245835521, "grad_norm": 1.3125, "learning_rate": 0.0004845, "loss": 6.1367, "mean_token_accuracy": 0.1684929057955742, "num_tokens": 824864.0, "step": 970 }, { "entropy": 6.61003623008728, "epoch": 0.02178600556381065, "grad_norm": 1.59375, "learning_rate": 0.000487, "loss": 6.183, "mean_token_accuracy": 0.15778308734297752, "num_tokens": 828887.0, "step": 975 }, { "entropy": 6.448976230621338, "epoch": 0.021897728669266092, "grad_norm": 1.328125, "learning_rate": 0.0004895, "loss": 6.0739, "mean_token_accuracy": 0.16717422604560853, "num_tokens": 832784.0, "step": 980 }, { "entropy": 6.41447229385376, "epoch": 0.02200945177472153, "grad_norm": 1.3359375, "learning_rate": 0.000492, "loss": 6.0937, "mean_token_accuracy": 0.15710036903619767, "num_tokens": 837408.0, "step": 985 }, { "entropy": 6.451285934448242, "epoch": 0.02212117488017697, "grad_norm": 1.3671875, "learning_rate": 0.0004945, "loss": 6.1512, "mean_token_accuracy": 0.16300837993621825, "num_tokens": 841536.0, "step": 990 }, { "entropy": 6.426507759094238, "epoch": 0.022232897985632407, "grad_norm": 1.453125, "learning_rate": 0.000497, "loss": 6.0582, "mean_token_accuracy": 0.17121603935956956, "num_tokens": 846044.0, "step": 995 }, { "entropy": 6.320924186706543, "epoch": 0.022344621091087848, "grad_norm": 1.2734375, "learning_rate": 0.0004995, "loss": 6.1118, "mean_token_accuracy": 0.1685657724738121, "num_tokens": 850254.0, "step": 1000 }, { "entropy": 6.453368425369263, "epoch": 0.022456344196543288, "grad_norm": 1.3828125, "learning_rate": 0.000499998026082006, "loss": 6.0715, "mean_token_accuracy": 0.16939653158187867, "num_tokens": 854548.0, "step": 1005 }, { "entropy": 6.386558151245117, "epoch": 0.022568067301998725, "grad_norm": 1.328125, "learning_rate": 0.0004999900070995136, "loss": 6.2133, "mean_token_accuracy": 0.1603351503610611, "num_tokens": 858837.0, "step": 1010 }, { "entropy": 6.328588962554932, "epoch": 0.022679790407454166, "grad_norm": 1.4375, "learning_rate": 0.0004999758199023239, "loss": 6.0295, "mean_token_accuracy": 0.16671301424503326, "num_tokens": 862772.0, "step": 1015 }, { "entropy": 6.418800020217896, "epoch": 0.022791513512909607, "grad_norm": 1.3515625, "learning_rate": 0.0004999554648793858, "loss": 6.1171, "mean_token_accuracy": 0.1632360801100731, "num_tokens": 866909.0, "step": 1020 }, { "entropy": 6.417377662658692, "epoch": 0.022903236618365044, "grad_norm": 1.4140625, "learning_rate": 0.0004999289425887425, "loss": 6.2569, "mean_token_accuracy": 0.1513159267604351, "num_tokens": 871127.0, "step": 1025 }, { "entropy": 6.434281063079834, "epoch": 0.023014959723820484, "grad_norm": 1.515625, "learning_rate": 0.0004998962537575161, "loss": 6.1777, "mean_token_accuracy": 0.1645081579685211, "num_tokens": 875292.0, "step": 1030 }, { "entropy": 6.387321043014526, "epoch": 0.02312668282927592, "grad_norm": 1.4140625, "learning_rate": 0.0004998573992818874, "loss": 6.0829, "mean_token_accuracy": 0.15736222565174102, "num_tokens": 879286.0, "step": 1035 }, { "entropy": 6.443514776229859, "epoch": 0.023238405934731362, "grad_norm": 1.3359375, "learning_rate": 0.0004998123802270715, "loss": 6.1128, "mean_token_accuracy": 0.16863536983728408, "num_tokens": 883490.0, "step": 1040 }, { "entropy": 6.404753255844116, "epoch": 0.023350129040186803, "grad_norm": 1.4140625, "learning_rate": 0.0004997611978272886, "loss": 6.0757, "mean_token_accuracy": 0.1634724885225296, "num_tokens": 887786.0, "step": 1045 }, { "entropy": 6.409213829040527, "epoch": 0.02346185214564224, "grad_norm": 1.3359375, "learning_rate": 0.0004997038534857298, "loss": 6.0974, "mean_token_accuracy": 0.16956386119127273, "num_tokens": 892035.0, "step": 1050 }, { "entropy": 6.431076288223267, "epoch": 0.02357357525109768, "grad_norm": 1.3515625, "learning_rate": 0.0004996403487745194, "loss": 6.0877, "mean_token_accuracy": 0.16654614359140396, "num_tokens": 896099.0, "step": 1055 }, { "entropy": 6.382920742034912, "epoch": 0.023685298356553117, "grad_norm": 1.5390625, "learning_rate": 0.000499570685434671, "loss": 6.0172, "mean_token_accuracy": 0.17494994923472404, "num_tokens": 900448.0, "step": 1060 }, { "entropy": 6.291054439544678, "epoch": 0.023797021462008558, "grad_norm": 1.4453125, "learning_rate": 0.0004994948653760405, "loss": 6.0807, "mean_token_accuracy": 0.16231946498155594, "num_tokens": 904538.0, "step": 1065 }, { "entropy": 6.47989239692688, "epoch": 0.023908744567464, "grad_norm": 1.546875, "learning_rate": 0.0004994128906772729, "loss": 6.1925, "mean_token_accuracy": 0.15721007734537124, "num_tokens": 908680.0, "step": 1070 }, { "entropy": 6.449333000183105, "epoch": 0.024020467672919436, "grad_norm": 1.4375, "learning_rate": 0.000499324763585746, "loss": 6.1782, "mean_token_accuracy": 0.15536463409662246, "num_tokens": 912868.0, "step": 1075 }, { "entropy": 6.176778936386109, "epoch": 0.024132190778374876, "grad_norm": 1.53125, "learning_rate": 0.0004992304865175085, "loss": 6.1923, "mean_token_accuracy": 0.15709846168756486, "num_tokens": 917312.0, "step": 1080 }, { "entropy": 6.612870168685913, "epoch": 0.024243913883830313, "grad_norm": 1.203125, "learning_rate": 0.0004991300620572138, "loss": 6.1957, "mean_token_accuracy": 0.16389840096235275, "num_tokens": 921977.0, "step": 1085 }, { "entropy": 6.3305353164672855, "epoch": 0.024355636989285754, "grad_norm": 1.3671875, "learning_rate": 0.0004990234929580494, "loss": 6.1261, "mean_token_accuracy": 0.17108615338802338, "num_tokens": 925964.0, "step": 1090 }, { "entropy": 6.32656307220459, "epoch": 0.024467360094741195, "grad_norm": 1.40625, "learning_rate": 0.0004989107821416609, "loss": 5.9851, "mean_token_accuracy": 0.1709235593676567, "num_tokens": 930077.0, "step": 1095 }, { "entropy": 6.294446897506714, "epoch": 0.02457908320019663, "grad_norm": 1.2734375, "learning_rate": 0.0004987919326980723, "loss": 6.0255, "mean_token_accuracy": 0.17414780110120773, "num_tokens": 934547.0, "step": 1100 }, { "entropy": 6.341350984573364, "epoch": 0.024690806305652072, "grad_norm": 1.46875, "learning_rate": 0.0004986669478856011, "loss": 6.0495, "mean_token_accuracy": 0.17528252005577089, "num_tokens": 938583.0, "step": 1105 }, { "entropy": 6.444618558883667, "epoch": 0.024802529411107513, "grad_norm": 1.3125, "learning_rate": 0.0004985358311307688, "loss": 6.0981, "mean_token_accuracy": 0.16719860881567, "num_tokens": 943007.0, "step": 1110 }, { "entropy": 6.317926597595215, "epoch": 0.02491425251656295, "grad_norm": 1.3671875, "learning_rate": 0.0004983985860282081, "loss": 6.0836, "mean_token_accuracy": 0.16416927874088288, "num_tokens": 947281.0, "step": 1115 }, { "entropy": 6.2807488441467285, "epoch": 0.02502597562201839, "grad_norm": 1.4375, "learning_rate": 0.0004982552163405623, "loss": 5.9665, "mean_token_accuracy": 0.1739448979496956, "num_tokens": 951443.0, "step": 1120 }, { "entropy": 6.281340885162353, "epoch": 0.025137698727473828, "grad_norm": 1.484375, "learning_rate": 0.0004981057259983839, "loss": 6.068, "mean_token_accuracy": 0.15851529389619828, "num_tokens": 955131.0, "step": 1125 }, { "entropy": 6.28678936958313, "epoch": 0.02524942183292927, "grad_norm": 1.296875, "learning_rate": 0.0004979501191000262, "loss": 6.0043, "mean_token_accuracy": 0.17229326516389848, "num_tokens": 959408.0, "step": 1130 }, { "entropy": 6.292270088195801, "epoch": 0.02536114493838471, "grad_norm": 1.3984375, "learning_rate": 0.0004977883999115311, "loss": 6.067, "mean_token_accuracy": 0.1669071465730667, "num_tokens": 963657.0, "step": 1135 }, { "entropy": 6.207029819488525, "epoch": 0.025472868043840146, "grad_norm": 1.3828125, "learning_rate": 0.0004976205728665113, "loss": 6.0802, "mean_token_accuracy": 0.16384815871715547, "num_tokens": 967973.0, "step": 1140 }, { "entropy": 6.254129791259766, "epoch": 0.025584591149295587, "grad_norm": 1.265625, "learning_rate": 0.0004974466425660307, "loss": 5.8691, "mean_token_accuracy": 0.1844624474644661, "num_tokens": 971957.0, "step": 1145 }, { "entropy": 6.198523759841919, "epoch": 0.025696314254751024, "grad_norm": 1.4140625, "learning_rate": 0.0004972666137784759, "loss": 5.9141, "mean_token_accuracy": 0.1774628072977066, "num_tokens": 975679.0, "step": 1150 }, { "entropy": 6.320246267318725, "epoch": 0.025808037360206464, "grad_norm": 1.4296875, "learning_rate": 0.0004970804914394271, "loss": 6.1461, "mean_token_accuracy": 0.16569039970636368, "num_tokens": 979806.0, "step": 1155 }, { "entropy": 6.319731616973877, "epoch": 0.025919760465661905, "grad_norm": 1.296875, "learning_rate": 0.0004968882806515225, "loss": 6.0224, "mean_token_accuracy": 0.17389359325170517, "num_tokens": 984221.0, "step": 1160 }, { "entropy": 6.255821323394775, "epoch": 0.026031483571117342, "grad_norm": 1.3203125, "learning_rate": 0.0004966899866843177, "loss": 5.9962, "mean_token_accuracy": 0.16791471540927888, "num_tokens": 988323.0, "step": 1165 }, { "entropy": 6.295364618301392, "epoch": 0.026143206676572783, "grad_norm": 1.296875, "learning_rate": 0.000496485614974142, "loss": 6.0453, "mean_token_accuracy": 0.16588685512542725, "num_tokens": 992643.0, "step": 1170 }, { "entropy": 6.16099534034729, "epoch": 0.02625492978202822, "grad_norm": 1.359375, "learning_rate": 0.0004962751711239492, "loss": 5.8792, "mean_token_accuracy": 0.18837386816740037, "num_tokens": 996994.0, "step": 1175 }, { "entropy": 6.207149171829224, "epoch": 0.02636665288748366, "grad_norm": 1.359375, "learning_rate": 0.0004960586609031636, "loss": 6.0087, "mean_token_accuracy": 0.17770088016986846, "num_tokens": 1001345.0, "step": 1180 }, { "entropy": 6.242070484161377, "epoch": 0.0264783759929391, "grad_norm": 1.3515625, "learning_rate": 0.0004958360902475224, "loss": 5.9216, "mean_token_accuracy": 0.17433482706546782, "num_tokens": 1005839.0, "step": 1185 }, { "entropy": 6.214767789840698, "epoch": 0.026590099098394538, "grad_norm": 1.296875, "learning_rate": 0.0004956074652589125, "loss": 6.0737, "mean_token_accuracy": 0.16556215435266494, "num_tokens": 1010698.0, "step": 1190 }, { "entropy": 6.351368093490601, "epoch": 0.02670182220384998, "grad_norm": 1.2890625, "learning_rate": 0.0004953727922052035, "loss": 6.029, "mean_token_accuracy": 0.17102244794368743, "num_tokens": 1014982.0, "step": 1195 }, { "entropy": 6.117547225952149, "epoch": 0.026813545309305416, "grad_norm": 1.5234375, "learning_rate": 0.0004951320775200756, "loss": 5.9538, "mean_token_accuracy": 0.1834021046757698, "num_tokens": 1019235.0, "step": 1200 }, { "entropy": 6.2841479778289795, "epoch": 0.026925268414760856, "grad_norm": 1.484375, "learning_rate": 0.0004948853278028436, "loss": 6.037, "mean_token_accuracy": 0.17534659653902054, "num_tokens": 1023855.0, "step": 1205 }, { "entropy": 6.276741075515747, "epoch": 0.027036991520216297, "grad_norm": 1.2421875, "learning_rate": 0.0004946325498182755, "loss": 5.9946, "mean_token_accuracy": 0.179076386988163, "num_tokens": 1028464.0, "step": 1210 }, { "entropy": 6.3194655418396, "epoch": 0.027148714625671734, "grad_norm": 1.1640625, "learning_rate": 0.0004943737504964076, "loss": 5.9019, "mean_token_accuracy": 0.17992053031921387, "num_tokens": 1033110.0, "step": 1215 }, { "entropy": 6.059362745285034, "epoch": 0.027260437731127175, "grad_norm": 1.2578125, "learning_rate": 0.000494108936932354, "loss": 6.0265, "mean_token_accuracy": 0.16747127920389177, "num_tokens": 1037470.0, "step": 1220 }, { "entropy": 6.275005626678467, "epoch": 0.027372160836582615, "grad_norm": 1.40625, "learning_rate": 0.0004938381163861124, "loss": 5.9895, "mean_token_accuracy": 0.1710431009531021, "num_tokens": 1041662.0, "step": 1225 }, { "entropy": 6.280491781234741, "epoch": 0.027483883942038052, "grad_norm": 1.328125, "learning_rate": 0.0004935612962823645, "loss": 5.9146, "mean_token_accuracy": 0.17440603524446488, "num_tokens": 1046041.0, "step": 1230 }, { "entropy": 6.002849531173706, "epoch": 0.027595607047493493, "grad_norm": 1.3125, "learning_rate": 0.0004932784842102739, "loss": 5.9419, "mean_token_accuracy": 0.17400026842951774, "num_tokens": 1049893.0, "step": 1235 }, { "entropy": 6.270329856872559, "epoch": 0.02770733015294893, "grad_norm": 1.390625, "learning_rate": 0.0004929896879232758, "loss": 6.0921, "mean_token_accuracy": 0.16643197387456893, "num_tokens": 1054048.0, "step": 1240 }, { "entropy": 6.235016345977783, "epoch": 0.02781905325840437, "grad_norm": 1.296875, "learning_rate": 0.0004926949153388668, "loss": 6.0027, "mean_token_accuracy": 0.16786330044269562, "num_tokens": 1058176.0, "step": 1245 }, { "entropy": 6.242469263076782, "epoch": 0.02793077636385981, "grad_norm": 1.359375, "learning_rate": 0.0004923941745383859, "loss": 5.9481, "mean_token_accuracy": 0.16720382124185562, "num_tokens": 1062031.0, "step": 1250 }, { "entropy": 6.049459505081177, "epoch": 0.02804249946931525, "grad_norm": 1.34375, "learning_rate": 0.000492087473766794, "loss": 5.9364, "mean_token_accuracy": 0.1834051489830017, "num_tokens": 1066705.0, "step": 1255 }, { "entropy": 6.242766857147217, "epoch": 0.02815422257477069, "grad_norm": 1.328125, "learning_rate": 0.000491774821432448, "loss": 5.9192, "mean_token_accuracy": 0.18238281905651094, "num_tokens": 1070960.0, "step": 1260 }, { "entropy": 6.3449784278869625, "epoch": 0.028265945680226126, "grad_norm": 1.3125, "learning_rate": 0.0004914562261068693, "loss": 6.1001, "mean_token_accuracy": 0.16833141073584557, "num_tokens": 1075576.0, "step": 1265 }, { "entropy": 6.116433477401733, "epoch": 0.028377668785681567, "grad_norm": 1.328125, "learning_rate": 0.0004911316965245098, "loss": 5.8615, "mean_token_accuracy": 0.17545292675495147, "num_tokens": 1079538.0, "step": 1270 }, { "entropy": 6.0287150859832765, "epoch": 0.028489391891137007, "grad_norm": 1.4609375, "learning_rate": 0.000490801241582512, "loss": 5.9247, "mean_token_accuracy": 0.17888442128896714, "num_tokens": 1083296.0, "step": 1275 }, { "entropy": 6.130237722396851, "epoch": 0.028601114996592444, "grad_norm": 1.296875, "learning_rate": 0.000490464870340465, "loss": 5.9678, "mean_token_accuracy": 0.17468917220830918, "num_tokens": 1087688.0, "step": 1280 }, { "entropy": 6.155571603775025, "epoch": 0.028712838102047885, "grad_norm": 1.296875, "learning_rate": 0.0004901225920201563, "loss": 5.9098, "mean_token_accuracy": 0.1857583463191986, "num_tokens": 1091988.0, "step": 1285 }, { "entropy": 6.23262209892273, "epoch": 0.028824561207503322, "grad_norm": 1.3046875, "learning_rate": 0.000489774416005319, "loss": 5.9385, "mean_token_accuracy": 0.17543941140174865, "num_tokens": 1095949.0, "step": 1290 }, { "entropy": 6.130452013015747, "epoch": 0.028936284312958763, "grad_norm": 1.296875, "learning_rate": 0.0004894203518413742, "loss": 5.8586, "mean_token_accuracy": 0.17781955897808074, "num_tokens": 1099795.0, "step": 1295 }, { "entropy": 5.954253005981445, "epoch": 0.029048007418414203, "grad_norm": 1.40625, "learning_rate": 0.0004890604092351701, "loss": 5.7856, "mean_token_accuracy": 0.1847798377275467, "num_tokens": 1103894.0, "step": 1300 }, { "entropy": 6.213930320739746, "epoch": 0.02915973052386964, "grad_norm": 1.296875, "learning_rate": 0.000488694598054715, "loss": 6.0079, "mean_token_accuracy": 0.17599031776189805, "num_tokens": 1108382.0, "step": 1305 }, { "entropy": 6.095987224578858, "epoch": 0.02927145362932508, "grad_norm": 1.3203125, "learning_rate": 0.0004883229283289071, "loss": 5.7812, "mean_token_accuracy": 0.18635666817426683, "num_tokens": 1112841.0, "step": 1310 }, { "entropy": 6.131918239593506, "epoch": 0.02938317673478052, "grad_norm": 1.3046875, "learning_rate": 0.00048794541024725993, "loss": 6.0198, "mean_token_accuracy": 0.17015290707349778, "num_tokens": 1117319.0, "step": 1315 }, { "entropy": 6.14964075088501, "epoch": 0.02949489984023596, "grad_norm": 1.390625, "learning_rate": 0.0004875620541596221, "loss": 5.9832, "mean_token_accuracy": 0.17692959010601045, "num_tokens": 1121371.0, "step": 1320 }, { "entropy": 6.087015199661255, "epoch": 0.0296066229456914, "grad_norm": 1.2734375, "learning_rate": 0.00048717287057589454, "loss": 5.8647, "mean_token_accuracy": 0.1832718223333359, "num_tokens": 1125496.0, "step": 1325 }, { "entropy": 6.156226062774659, "epoch": 0.029718346051146836, "grad_norm": 1.25, "learning_rate": 0.0004867778701657417, "loss": 5.9415, "mean_token_accuracy": 0.18602917045354844, "num_tokens": 1129929.0, "step": 1330 }, { "entropy": 6.084760046005249, "epoch": 0.029830069156602277, "grad_norm": 1.3671875, "learning_rate": 0.00048637706375829955, "loss": 5.8466, "mean_token_accuracy": 0.18009257912635804, "num_tokens": 1133982.0, "step": 1335 }, { "entropy": 6.020449161529541, "epoch": 0.029941792262057718, "grad_norm": 1.359375, "learning_rate": 0.000485970462341878, "loss": 5.8284, "mean_token_accuracy": 0.176700659096241, "num_tokens": 1138270.0, "step": 1340 }, { "entropy": 6.241879224777222, "epoch": 0.030053515367513155, "grad_norm": 1.4140625, "learning_rate": 0.00048555807706366044, "loss": 5.9128, "mean_token_accuracy": 0.1819134071469307, "num_tokens": 1142197.0, "step": 1345 }, { "entropy": 6.1059808254241945, "epoch": 0.030165238472968595, "grad_norm": 1.4453125, "learning_rate": 0.00048513991922939756, "loss": 5.9763, "mean_token_accuracy": 0.17749614864587784, "num_tokens": 1146439.0, "step": 1350 }, { "entropy": 6.060887145996094, "epoch": 0.030276961578424032, "grad_norm": 1.3046875, "learning_rate": 0.00048471600030309744, "loss": 5.9363, "mean_token_accuracy": 0.1883004993200302, "num_tokens": 1150809.0, "step": 1355 }, { "entropy": 6.27038950920105, "epoch": 0.030388684683879473, "grad_norm": 1.2578125, "learning_rate": 0.00048428633190671186, "loss": 6.0562, "mean_token_accuracy": 0.1610119305551052, "num_tokens": 1155132.0, "step": 1360 }, { "entropy": 6.380424118041992, "epoch": 0.030500407789334914, "grad_norm": 1.3046875, "learning_rate": 0.0004838509258198167, "loss": 5.9975, "mean_token_accuracy": 0.167344368994236, "num_tokens": 1159694.0, "step": 1365 }, { "entropy": 6.135957479476929, "epoch": 0.03061213089479035, "grad_norm": 1.296875, "learning_rate": 0.00048340979397929, "loss": 5.9235, "mean_token_accuracy": 0.1690131738781929, "num_tokens": 1164070.0, "step": 1370 }, { "entropy": 6.074416303634644, "epoch": 0.03072385400024579, "grad_norm": 1.328125, "learning_rate": 0.00048296294847898386, "loss": 5.7706, "mean_token_accuracy": 0.19714766144752502, "num_tokens": 1168097.0, "step": 1375 }, { "entropy": 6.055906534194946, "epoch": 0.03083557710570123, "grad_norm": 1.3359375, "learning_rate": 0.0004825104015693934, "loss": 5.9315, "mean_token_accuracy": 0.1687031865119934, "num_tokens": 1172375.0, "step": 1380 }, { "entropy": 6.07122950553894, "epoch": 0.03094730021115667, "grad_norm": 1.34375, "learning_rate": 0.0004820521656573208, "loss": 5.962, "mean_token_accuracy": 0.16933582127094268, "num_tokens": 1176628.0, "step": 1385 }, { "entropy": 6.064983272552491, "epoch": 0.03105902331661211, "grad_norm": 1.328125, "learning_rate": 0.00048158825330553505, "loss": 5.9208, "mean_token_accuracy": 0.1758138954639435, "num_tokens": 1180500.0, "step": 1390 }, { "entropy": 6.065640687942505, "epoch": 0.031170746422067547, "grad_norm": 1.375, "learning_rate": 0.00048111867723242763, "loss": 5.9367, "mean_token_accuracy": 0.17107723653316498, "num_tokens": 1184883.0, "step": 1395 }, { "entropy": 6.242153358459473, "epoch": 0.031282469527522984, "grad_norm": 1.3203125, "learning_rate": 0.0004806434503116637, "loss": 5.905, "mean_token_accuracy": 0.17483978867530822, "num_tokens": 1188723.0, "step": 1400 }, { "entropy": 6.178732633590698, "epoch": 0.03139419263297843, "grad_norm": 1.2109375, "learning_rate": 0.0004801625855718296, "loss": 6.0475, "mean_token_accuracy": 0.1655319780111313, "num_tokens": 1193318.0, "step": 1405 }, { "entropy": 6.136021947860717, "epoch": 0.031505915738433865, "grad_norm": 1.5, "learning_rate": 0.00047967609619607477, "loss": 5.9253, "mean_token_accuracy": 0.17083597481250762, "num_tokens": 1197663.0, "step": 1410 }, { "entropy": 5.984335994720459, "epoch": 0.0316176388438893, "grad_norm": 1.25, "learning_rate": 0.0004791839955217513, "loss": 5.7723, "mean_token_accuracy": 0.19360799193382264, "num_tokens": 1201949.0, "step": 1415 }, { "entropy": 6.23677077293396, "epoch": 0.031729361949344746, "grad_norm": 1.3203125, "learning_rate": 0.00047868629704004786, "loss": 6.0375, "mean_token_accuracy": 0.1644550994038582, "num_tokens": 1206360.0, "step": 1420 }, { "entropy": 6.234195375442505, "epoch": 0.03184108505480018, "grad_norm": 1.4609375, "learning_rate": 0.00047818301439561965, "loss": 5.8892, "mean_token_accuracy": 0.17676099985837937, "num_tokens": 1210738.0, "step": 1425 }, { "entropy": 6.093136215209961, "epoch": 0.03195280816025562, "grad_norm": 1.3671875, "learning_rate": 0.00047767416138621454, "loss": 5.8809, "mean_token_accuracy": 0.17498217523097992, "num_tokens": 1214722.0, "step": 1430 }, { "entropy": 6.140736865997314, "epoch": 0.032064531265711065, "grad_norm": 1.2421875, "learning_rate": 0.000477159751962295, "loss": 5.9662, "mean_token_accuracy": 0.17408566623926164, "num_tokens": 1218811.0, "step": 1435 }, { "entropy": 6.182424402236938, "epoch": 0.0321762543711665, "grad_norm": 1.3125, "learning_rate": 0.00047663980022665507, "loss": 5.9361, "mean_token_accuracy": 0.17186458110809327, "num_tokens": 1223089.0, "step": 1440 }, { "entropy": 5.910596895217895, "epoch": 0.03228797747662194, "grad_norm": 1.3984375, "learning_rate": 0.00047611432043403437, "loss": 5.7536, "mean_token_accuracy": 0.18107606768608092, "num_tokens": 1226970.0, "step": 1445 }, { "entropy": 5.991322135925293, "epoch": 0.03239970058207738, "grad_norm": 1.34375, "learning_rate": 0.0004755833269907267, "loss": 5.8273, "mean_token_accuracy": 0.18494699150323868, "num_tokens": 1230734.0, "step": 1450 }, { "entropy": 6.019584083557129, "epoch": 0.03251142368753282, "grad_norm": 1.1328125, "learning_rate": 0.0004750468344541857, "loss": 5.887, "mean_token_accuracy": 0.1700140006840229, "num_tokens": 1235363.0, "step": 1455 }, { "entropy": 6.014257526397705, "epoch": 0.03262314679298826, "grad_norm": 1.2421875, "learning_rate": 0.00047450485753262525, "loss": 5.7371, "mean_token_accuracy": 0.18364810347557067, "num_tokens": 1239826.0, "step": 1460 }, { "entropy": 6.125292587280273, "epoch": 0.032734869898443694, "grad_norm": 1.4765625, "learning_rate": 0.00047395741108461633, "loss": 5.9677, "mean_token_accuracy": 0.16234042197465898, "num_tokens": 1244067.0, "step": 1465 }, { "entropy": 6.030685091018677, "epoch": 0.03284659300389914, "grad_norm": 1.203125, "learning_rate": 0.00047340451011867985, "loss": 5.7734, "mean_token_accuracy": 0.18260960876941681, "num_tokens": 1248307.0, "step": 1470 }, { "entropy": 5.964048194885254, "epoch": 0.032958316109354575, "grad_norm": 1.3984375, "learning_rate": 0.00047284616979287515, "loss": 5.8281, "mean_token_accuracy": 0.17982411235570908, "num_tokens": 1252504.0, "step": 1475 }, { "entropy": 5.956363296508789, "epoch": 0.03307003921481001, "grad_norm": 1.2890625, "learning_rate": 0.00047228240541438433, "loss": 5.8804, "mean_token_accuracy": 0.1755678966641426, "num_tokens": 1256823.0, "step": 1480 }, { "entropy": 6.03358826637268, "epoch": 0.03318176232026546, "grad_norm": 1.234375, "learning_rate": 0.00047171323243909257, "loss": 5.8242, "mean_token_accuracy": 0.17837631106376647, "num_tokens": 1261236.0, "step": 1485 }, { "entropy": 6.114140462875366, "epoch": 0.033293485425720894, "grad_norm": 1.265625, "learning_rate": 0.00047113866647116457, "loss": 5.84, "mean_token_accuracy": 0.17883239686489105, "num_tokens": 1265360.0, "step": 1490 }, { "entropy": 5.907846260070801, "epoch": 0.03340520853117633, "grad_norm": 1.28125, "learning_rate": 0.0004705587232626164, "loss": 5.7961, "mean_token_accuracy": 0.1842688113451004, "num_tokens": 1269683.0, "step": 1495 }, { "entropy": 5.970038223266601, "epoch": 0.033516931636631775, "grad_norm": 1.296875, "learning_rate": 0.00046997341871288424, "loss": 5.8931, "mean_token_accuracy": 0.1749550759792328, "num_tokens": 1273843.0, "step": 1500 }, { "entropy": 6.059222936630249, "epoch": 0.03362865474208721, "grad_norm": 1.2421875, "learning_rate": 0.0004693827688683879, "loss": 5.886, "mean_token_accuracy": 0.182131527364254, "num_tokens": 1278299.0, "step": 1505 }, { "entropy": 5.9348760604858395, "epoch": 0.03374037784754265, "grad_norm": 1.296875, "learning_rate": 0.0004687867899220914, "loss": 5.7727, "mean_token_accuracy": 0.18791095465421676, "num_tokens": 1282541.0, "step": 1510 }, { "entropy": 6.046141624450684, "epoch": 0.033852100952998086, "grad_norm": 1.2734375, "learning_rate": 0.00046818549821305846, "loss": 5.6971, "mean_token_accuracy": 0.18971533626317977, "num_tokens": 1286318.0, "step": 1515 }, { "entropy": 5.891553258895874, "epoch": 0.03396382405845353, "grad_norm": 1.3671875, "learning_rate": 0.00046757891022600494, "loss": 5.7956, "mean_token_accuracy": 0.18425178676843643, "num_tokens": 1290713.0, "step": 1520 }, { "entropy": 6.0908989906311035, "epoch": 0.03407554716390897, "grad_norm": 1.3359375, "learning_rate": 0.0004669670425908471, "loss": 5.7639, "mean_token_accuracy": 0.17793523371219636, "num_tokens": 1294820.0, "step": 1525 }, { "entropy": 6.103552007675171, "epoch": 0.034187270269364405, "grad_norm": 1.265625, "learning_rate": 0.0004663499120822451, "loss": 5.7944, "mean_token_accuracy": 0.1779856227338314, "num_tokens": 1299232.0, "step": 1530 }, { "entropy": 5.96611704826355, "epoch": 0.03429899337481985, "grad_norm": 1.3515625, "learning_rate": 0.0004657275356191437, "loss": 5.8039, "mean_token_accuracy": 0.18248388022184373, "num_tokens": 1303287.0, "step": 1535 }, { "entropy": 5.9684473991394045, "epoch": 0.034410716480275286, "grad_norm": 1.28125, "learning_rate": 0.00046509993026430804, "loss": 5.778, "mean_token_accuracy": 0.18929605185985565, "num_tokens": 1307563.0, "step": 1540 }, { "entropy": 5.80886549949646, "epoch": 0.03452243958573072, "grad_norm": 1.328125, "learning_rate": 0.0004644671132238558, "loss": 5.6779, "mean_token_accuracy": 0.18525782525539397, "num_tokens": 1311484.0, "step": 1545 }, { "entropy": 6.063689661026001, "epoch": 0.03463416269118617, "grad_norm": 1.265625, "learning_rate": 0.00046382910184678585, "loss": 5.9013, "mean_token_accuracy": 0.17240882217884063, "num_tokens": 1316093.0, "step": 1550 }, { "entropy": 6.139868497848511, "epoch": 0.034745885796641604, "grad_norm": 1.2265625, "learning_rate": 0.0004631859136245025, "loss": 5.8746, "mean_token_accuracy": 0.1783792719244957, "num_tokens": 1321001.0, "step": 1555 }, { "entropy": 6.060155868530273, "epoch": 0.03485760890209704, "grad_norm": 1.2265625, "learning_rate": 0.0004625375661903357, "loss": 5.7142, "mean_token_accuracy": 0.18678450286388398, "num_tokens": 1325069.0, "step": 1560 }, { "entropy": 6.023049211502075, "epoch": 0.034969332007552485, "grad_norm": 1.3828125, "learning_rate": 0.00046188407731905787, "loss": 5.6733, "mean_token_accuracy": 0.18095604628324508, "num_tokens": 1328887.0, "step": 1565 }, { "entropy": 6.019297122955322, "epoch": 0.03508105511300792, "grad_norm": 1.25, "learning_rate": 0.00046122546492639643, "loss": 5.9267, "mean_token_accuracy": 0.17412633448839188, "num_tokens": 1333395.0, "step": 1570 }, { "entropy": 6.182040786743164, "epoch": 0.03519277821846336, "grad_norm": 1.4140625, "learning_rate": 0.000460561747068543, "loss": 6.0356, "mean_token_accuracy": 0.16573426127433777, "num_tokens": 1337457.0, "step": 1575 }, { "entropy": 5.961935520172119, "epoch": 0.0353045013239188, "grad_norm": 1.3203125, "learning_rate": 0.0004598929419416578, "loss": 5.6475, "mean_token_accuracy": 0.1947586268186569, "num_tokens": 1341768.0, "step": 1580 }, { "entropy": 5.973260307312012, "epoch": 0.03541622442937424, "grad_norm": 1.234375, "learning_rate": 0.00045921906788137123, "loss": 5.7183, "mean_token_accuracy": 0.1928910493850708, "num_tokens": 1346191.0, "step": 1585 }, { "entropy": 5.966824626922607, "epoch": 0.03552794753482968, "grad_norm": 1.359375, "learning_rate": 0.00045854014336228115, "loss": 5.7661, "mean_token_accuracy": 0.18546913117170333, "num_tokens": 1350228.0, "step": 1590 }, { "entropy": 5.945509767532348, "epoch": 0.035639670640285115, "grad_norm": 1.2734375, "learning_rate": 0.00045785618699744615, "loss": 5.8698, "mean_token_accuracy": 0.17462340891361236, "num_tokens": 1354722.0, "step": 1595 }, { "entropy": 6.016447973251343, "epoch": 0.03575139374574056, "grad_norm": 1.2734375, "learning_rate": 0.00045716721753787543, "loss": 5.863, "mean_token_accuracy": 0.18090844303369522, "num_tokens": 1358985.0, "step": 1600 }, { "entropy": 6.129895305633545, "epoch": 0.035863116851195996, "grad_norm": 1.2734375, "learning_rate": 0.0004564732538720148, "loss": 5.7705, "mean_token_accuracy": 0.18566034287214278, "num_tokens": 1363281.0, "step": 1605 }, { "entropy": 6.03486967086792, "epoch": 0.03597483995665143, "grad_norm": 1.3515625, "learning_rate": 0.00045577431502522877, "loss": 5.8546, "mean_token_accuracy": 0.17603144496679307, "num_tokens": 1367777.0, "step": 1610 }, { "entropy": 5.92125449180603, "epoch": 0.03608656306210688, "grad_norm": 1.203125, "learning_rate": 0.0004550704201592787, "loss": 5.9066, "mean_token_accuracy": 0.16197986379265786, "num_tokens": 1372537.0, "step": 1615 }, { "entropy": 5.871002578735352, "epoch": 0.036198286167562314, "grad_norm": 1.171875, "learning_rate": 0.0004543615885717981, "loss": 5.6652, "mean_token_accuracy": 0.19525430649518966, "num_tokens": 1377341.0, "step": 1620 }, { "entropy": 6.010153532028198, "epoch": 0.03631000927301775, "grad_norm": 1.296875, "learning_rate": 0.00045364783969576296, "loss": 5.7759, "mean_token_accuracy": 0.17927465140819548, "num_tokens": 1381480.0, "step": 1625 }, { "entropy": 6.026579904556274, "epoch": 0.03642173237847319, "grad_norm": 1.1796875, "learning_rate": 0.0004529291930989592, "loss": 5.7237, "mean_token_accuracy": 0.17483969032764435, "num_tokens": 1386057.0, "step": 1630 }, { "entropy": 5.877592515945435, "epoch": 0.03653345548392863, "grad_norm": 1.2890625, "learning_rate": 0.0004522056684834464, "loss": 5.7144, "mean_token_accuracy": 0.18803704977035524, "num_tokens": 1390405.0, "step": 1635 }, { "entropy": 5.916935634613037, "epoch": 0.03664517858938407, "grad_norm": 1.203125, "learning_rate": 0.0004514772856850173, "loss": 5.7084, "mean_token_accuracy": 0.18442935198545457, "num_tokens": 1394972.0, "step": 1640 }, { "entropy": 5.940288639068603, "epoch": 0.03675690169483951, "grad_norm": 1.328125, "learning_rate": 0.0004507440646726542, "loss": 5.687, "mean_token_accuracy": 0.19444524943828584, "num_tokens": 1399270.0, "step": 1645 }, { "entropy": 5.900350522994995, "epoch": 0.03686862480029495, "grad_norm": 1.28125, "learning_rate": 0.0004500060255479818, "loss": 5.686, "mean_token_accuracy": 0.19073100686073302, "num_tokens": 1403506.0, "step": 1650 }, { "entropy": 5.816886520385742, "epoch": 0.03698034790575039, "grad_norm": 1.265625, "learning_rate": 0.0004492631885447151, "loss": 5.8022, "mean_token_accuracy": 0.18439093232154846, "num_tokens": 1407493.0, "step": 1655 }, { "entropy": 5.997721147537232, "epoch": 0.037092071011205825, "grad_norm": 1.28125, "learning_rate": 0.00044851557402810616, "loss": 5.7482, "mean_token_accuracy": 0.1847753569483757, "num_tokens": 1411871.0, "step": 1660 }, { "entropy": 5.972122716903686, "epoch": 0.03720379411666127, "grad_norm": 1.25, "learning_rate": 0.00044776320249438444, "loss": 5.7221, "mean_token_accuracy": 0.18191029131412506, "num_tokens": 1415951.0, "step": 1665 }, { "entropy": 5.838072156906128, "epoch": 0.037315517222116706, "grad_norm": 1.2890625, "learning_rate": 0.00044700609457019565, "loss": 5.7248, "mean_token_accuracy": 0.18715287148952484, "num_tokens": 1420394.0, "step": 1670 }, { "entropy": 6.02448787689209, "epoch": 0.037427240327572144, "grad_norm": 1.2578125, "learning_rate": 0.0004462442710120359, "loss": 5.8614, "mean_token_accuracy": 0.1815338611602783, "num_tokens": 1424921.0, "step": 1675 }, { "entropy": 5.867510223388672, "epoch": 0.03753896343302759, "grad_norm": 1.296875, "learning_rate": 0.000445477752705683, "loss": 5.6933, "mean_token_accuracy": 0.1878269910812378, "num_tokens": 1428638.0, "step": 1680 }, { "entropy": 5.935184717178345, "epoch": 0.037650686538483025, "grad_norm": 1.3359375, "learning_rate": 0.00044470656066562336, "loss": 5.7843, "mean_token_accuracy": 0.18289850652217865, "num_tokens": 1432780.0, "step": 1685 }, { "entropy": 5.906549501419067, "epoch": 0.03776240964393846, "grad_norm": 1.3125, "learning_rate": 0.0004439307160344765, "loss": 5.7056, "mean_token_accuracy": 0.18908923119306564, "num_tokens": 1437178.0, "step": 1690 }, { "entropy": 6.072110605239868, "epoch": 0.0378741327493939, "grad_norm": 1.4140625, "learning_rate": 0.00044315024008241473, "loss": 5.7695, "mean_token_accuracy": 0.17374685257673264, "num_tokens": 1441040.0, "step": 1695 }, { "entropy": 5.934421253204346, "epoch": 0.03798585585484934, "grad_norm": 1.2265625, "learning_rate": 0.0004423651542065806, "loss": 5.7489, "mean_token_accuracy": 0.18246821612119674, "num_tokens": 1445530.0, "step": 1700 }, { "entropy": 5.934323310852051, "epoch": 0.03809757896030478, "grad_norm": 1.265625, "learning_rate": 0.00044157547993050006, "loss": 5.8347, "mean_token_accuracy": 0.18564676642417907, "num_tokens": 1449623.0, "step": 1705 }, { "entropy": 5.9197345733642575, "epoch": 0.03820930206576022, "grad_norm": 1.1796875, "learning_rate": 0.00044078123890349227, "loss": 5.6831, "mean_token_accuracy": 0.18841421008110046, "num_tokens": 1454167.0, "step": 1710 }, { "entropy": 5.848270463943481, "epoch": 0.03832102517121566, "grad_norm": 1.21875, "learning_rate": 0.00043998245290007606, "loss": 5.6999, "mean_token_accuracy": 0.18317352831363679, "num_tokens": 1458666.0, "step": 1715 }, { "entropy": 5.861880540847778, "epoch": 0.0384327482766711, "grad_norm": 1.203125, "learning_rate": 0.00043917914381937323, "loss": 5.7067, "mean_token_accuracy": 0.19191939234733582, "num_tokens": 1463253.0, "step": 1720 }, { "entropy": 5.804866981506348, "epoch": 0.038544471382126536, "grad_norm": 1.265625, "learning_rate": 0.00043837133368450815, "loss": 5.6554, "mean_token_accuracy": 0.18410271555185317, "num_tokens": 1467487.0, "step": 1725 }, { "entropy": 5.91278042793274, "epoch": 0.03865619448758198, "grad_norm": 1.2890625, "learning_rate": 0.0004375590446420037, "loss": 5.7621, "mean_token_accuracy": 0.1775332771241665, "num_tokens": 1471408.0, "step": 1730 }, { "entropy": 5.8655084609985355, "epoch": 0.03876791759303742, "grad_norm": 1.203125, "learning_rate": 0.0004367422989611743, "loss": 5.5388, "mean_token_accuracy": 0.20361874550580977, "num_tokens": 1475296.0, "step": 1735 }, { "entropy": 5.9305091381073, "epoch": 0.038879640698492854, "grad_norm": 1.3828125, "learning_rate": 0.0004359211190335153, "loss": 5.7256, "mean_token_accuracy": 0.1764163628220558, "num_tokens": 1479593.0, "step": 1740 }, { "entropy": 5.91884412765503, "epoch": 0.0389913638039483, "grad_norm": 1.375, "learning_rate": 0.00043509552737208923, "loss": 5.6551, "mean_token_accuracy": 0.19346691966056823, "num_tokens": 1483587.0, "step": 1745 }, { "entropy": 5.924755525588989, "epoch": 0.039103086909403735, "grad_norm": 1.21875, "learning_rate": 0.00043426554661090853, "loss": 5.6987, "mean_token_accuracy": 0.188627889752388, "num_tokens": 1488152.0, "step": 1750 }, { "entropy": 5.852688217163086, "epoch": 0.03921481001485917, "grad_norm": 1.2578125, "learning_rate": 0.00043343119950431516, "loss": 5.7215, "mean_token_accuracy": 0.18358622789382933, "num_tokens": 1492511.0, "step": 1755 }, { "entropy": 5.978724813461303, "epoch": 0.03932653312031461, "grad_norm": 1.1875, "learning_rate": 0.00043259250892635644, "loss": 5.6617, "mean_token_accuracy": 0.18464895635843276, "num_tokens": 1496634.0, "step": 1760 }, { "entropy": 5.969769763946533, "epoch": 0.03943825622577005, "grad_norm": 1.265625, "learning_rate": 0.0004317494978701582, "loss": 5.7043, "mean_token_accuracy": 0.18896432220935822, "num_tokens": 1500904.0, "step": 1765 }, { "entropy": 5.869477558135986, "epoch": 0.03954997933122549, "grad_norm": 1.3359375, "learning_rate": 0.0004309021894472943, "loss": 5.618, "mean_token_accuracy": 0.19045006930828096, "num_tokens": 1504612.0, "step": 1770 }, { "entropy": 5.796671152114868, "epoch": 0.03966170243668093, "grad_norm": 1.234375, "learning_rate": 0.0004300506068871534, "loss": 5.6544, "mean_token_accuracy": 0.1906888410449028, "num_tokens": 1508867.0, "step": 1775 }, { "entropy": 5.729340410232544, "epoch": 0.03977342554213637, "grad_norm": 1.609375, "learning_rate": 0.00042919477353630135, "loss": 5.4135, "mean_token_accuracy": 0.21768646389245988, "num_tokens": 1512355.0, "step": 1780 }, { "entropy": 5.7904619693756105, "epoch": 0.03988514864759181, "grad_norm": 1.3125, "learning_rate": 0.000428334712857842, "loss": 5.6296, "mean_token_accuracy": 0.19588664174079895, "num_tokens": 1516609.0, "step": 1785 }, { "entropy": 5.907398986816406, "epoch": 0.039996871753047246, "grad_norm": 1.296875, "learning_rate": 0.00042747044843077304, "loss": 5.7548, "mean_token_accuracy": 0.18738234639167786, "num_tokens": 1520945.0, "step": 1790 }, { "entropy": 5.954821968078614, "epoch": 0.04010859485850269, "grad_norm": 1.3359375, "learning_rate": 0.00042660200394934047, "loss": 5.7919, "mean_token_accuracy": 0.17330280616879462, "num_tokens": 1525136.0, "step": 1795 }, { "entropy": 5.911829948425293, "epoch": 0.04022031796395813, "grad_norm": 1.4140625, "learning_rate": 0.00042572940322238844, "loss": 5.7335, "mean_token_accuracy": 0.17516723573207854, "num_tokens": 1529088.0, "step": 1800 }, { "entropy": 5.8669532299041744, "epoch": 0.040332041069413564, "grad_norm": 1.2578125, "learning_rate": 0.00042485267017270664, "loss": 5.7184, "mean_token_accuracy": 0.1801592469215393, "num_tokens": 1533923.0, "step": 1805 }, { "entropy": 5.9007471084594725, "epoch": 0.040443764174869, "grad_norm": 1.3828125, "learning_rate": 0.0004239718288363745, "loss": 5.6071, "mean_token_accuracy": 0.2007177233695984, "num_tokens": 1538031.0, "step": 1810 }, { "entropy": 5.870078706741333, "epoch": 0.040555487280324445, "grad_norm": 1.2265625, "learning_rate": 0.0004230869033621023, "loss": 5.7082, "mean_token_accuracy": 0.17969376146793364, "num_tokens": 1542217.0, "step": 1815 }, { "entropy": 5.898741054534912, "epoch": 0.04066721038577988, "grad_norm": 1.3359375, "learning_rate": 0.0004221979180105688, "loss": 5.6301, "mean_token_accuracy": 0.1878265455365181, "num_tokens": 1546102.0, "step": 1820 }, { "entropy": 5.808369493484497, "epoch": 0.04077893349123532, "grad_norm": 1.234375, "learning_rate": 0.00042130489715375645, "loss": 5.5982, "mean_token_accuracy": 0.19695605784654618, "num_tokens": 1550292.0, "step": 1825 }, { "entropy": 5.843883419036866, "epoch": 0.040890656596690764, "grad_norm": 1.3828125, "learning_rate": 0.00042040786527428335, "loss": 5.6667, "mean_token_accuracy": 0.18189924210309982, "num_tokens": 1554231.0, "step": 1830 }, { "entropy": 5.756465053558349, "epoch": 0.0410023797021462, "grad_norm": 1.296875, "learning_rate": 0.0004195068469647315, "loss": 5.6449, "mean_token_accuracy": 0.19178105145692825, "num_tokens": 1558101.0, "step": 1835 }, { "entropy": 5.7194788455963135, "epoch": 0.04111410280760164, "grad_norm": 1.25, "learning_rate": 0.00041860186692697297, "loss": 5.5983, "mean_token_accuracy": 0.19210630059242248, "num_tokens": 1562735.0, "step": 1840 }, { "entropy": 5.858875274658203, "epoch": 0.04122582591305708, "grad_norm": 1.28125, "learning_rate": 0.00041769294997149264, "loss": 5.5543, "mean_token_accuracy": 0.2053885355591774, "num_tokens": 1566953.0, "step": 1845 }, { "entropy": 5.8189124584198, "epoch": 0.04133754901851252, "grad_norm": 1.2265625, "learning_rate": 0.0004167801210167081, "loss": 5.6008, "mean_token_accuracy": 0.1976646065711975, "num_tokens": 1571389.0, "step": 1850 }, { "entropy": 5.766389560699463, "epoch": 0.041449272123967956, "grad_norm": 1.296875, "learning_rate": 0.0004158634050882861, "loss": 5.7083, "mean_token_accuracy": 0.18083323091268538, "num_tokens": 1575737.0, "step": 1855 }, { "entropy": 5.973462867736816, "epoch": 0.0415609952294234, "grad_norm": 1.2734375, "learning_rate": 0.0004149428273184569, "loss": 5.7318, "mean_token_accuracy": 0.18451347351074218, "num_tokens": 1580371.0, "step": 1860 }, { "entropy": 5.9336131572723385, "epoch": 0.04167271833487884, "grad_norm": 1.203125, "learning_rate": 0.0004140184129453253, "loss": 5.7415, "mean_token_accuracy": 0.1852435752749443, "num_tokens": 1584569.0, "step": 1865 }, { "entropy": 5.808266925811767, "epoch": 0.041784441440334275, "grad_norm": 1.171875, "learning_rate": 0.000413090187312178, "loss": 5.6746, "mean_token_accuracy": 0.1897304803133011, "num_tokens": 1588869.0, "step": 1870 }, { "entropy": 5.92993016242981, "epoch": 0.04189616454578971, "grad_norm": 1.3125, "learning_rate": 0.0004121581758667898, "loss": 5.699, "mean_token_accuracy": 0.18758964240550996, "num_tokens": 1593254.0, "step": 1875 }, { "entropy": 5.851748323440551, "epoch": 0.042007887651245156, "grad_norm": 1.0859375, "learning_rate": 0.00041122240416072533, "loss": 5.5884, "mean_token_accuracy": 0.1970451608300209, "num_tokens": 1597506.0, "step": 1880 }, { "entropy": 5.822835302352905, "epoch": 0.04211961075670059, "grad_norm": 1.1953125, "learning_rate": 0.0004102828978486385, "loss": 5.748, "mean_token_accuracy": 0.18840789794921875, "num_tokens": 1601930.0, "step": 1885 }, { "entropy": 5.930711030960083, "epoch": 0.04223133386215603, "grad_norm": 1.328125, "learning_rate": 0.0004093396826875695, "loss": 5.7416, "mean_token_accuracy": 0.1909443974494934, "num_tokens": 1606634.0, "step": 1890 }, { "entropy": 5.928756856918335, "epoch": 0.042343056967611474, "grad_norm": 1.3671875, "learning_rate": 0.00040839278453623837, "loss": 5.636, "mean_token_accuracy": 0.1966735064983368, "num_tokens": 1610457.0, "step": 1895 }, { "entropy": 5.8650257110595705, "epoch": 0.04245478007306691, "grad_norm": 1.3515625, "learning_rate": 0.0004074422293543363, "loss": 5.5561, "mean_token_accuracy": 0.19099107682704924, "num_tokens": 1614315.0, "step": 1900 }, { "entropy": 5.836633396148682, "epoch": 0.04256650317852235, "grad_norm": 1.2421875, "learning_rate": 0.0004064880432018137, "loss": 5.6984, "mean_token_accuracy": 0.1857185408473015, "num_tokens": 1618613.0, "step": 1905 }, { "entropy": 5.989507627487183, "epoch": 0.04267822628397779, "grad_norm": 1.3359375, "learning_rate": 0.00040553025223816615, "loss": 5.5825, "mean_token_accuracy": 0.19654418975114823, "num_tokens": 1622874.0, "step": 1910 }, { "entropy": 5.865914678573608, "epoch": 0.04278994938943323, "grad_norm": 1.25, "learning_rate": 0.00040456888272171653, "loss": 5.5832, "mean_token_accuracy": 0.19483028203248978, "num_tokens": 1626799.0, "step": 1915 }, { "entropy": 5.614121818542481, "epoch": 0.04290167249488867, "grad_norm": 1.3984375, "learning_rate": 0.00040360396100889577, "loss": 5.6461, "mean_token_accuracy": 0.1878322646021843, "num_tokens": 1631082.0, "step": 1920 }, { "entropy": 5.902209091186523, "epoch": 0.043013395600344104, "grad_norm": 1.34375, "learning_rate": 0.0004026355135535202, "loss": 5.6043, "mean_token_accuracy": 0.194015434384346, "num_tokens": 1635075.0, "step": 1925 }, { "entropy": 5.905219459533692, "epoch": 0.04312511870579955, "grad_norm": 1.25, "learning_rate": 0.000401663566906066, "loss": 5.655, "mean_token_accuracy": 0.18685375303030013, "num_tokens": 1639663.0, "step": 1930 }, { "entropy": 5.824585723876953, "epoch": 0.043236841811254985, "grad_norm": 1.125, "learning_rate": 0.00040068814771294134, "loss": 5.7887, "mean_token_accuracy": 0.17511658370494843, "num_tokens": 1644285.0, "step": 1935 }, { "entropy": 5.836113452911377, "epoch": 0.04334856491671042, "grad_norm": 1.359375, "learning_rate": 0.0003997092827157562, "loss": 5.6278, "mean_token_accuracy": 0.19246720820665358, "num_tokens": 1648542.0, "step": 1940 }, { "entropy": 5.852768421173096, "epoch": 0.043460288022165866, "grad_norm": 1.25, "learning_rate": 0.000398726998750589, "loss": 5.688, "mean_token_accuracy": 0.18379789143800734, "num_tokens": 1652982.0, "step": 1945 }, { "entropy": 5.867275238037109, "epoch": 0.0435720111276213, "grad_norm": 1.34375, "learning_rate": 0.00039774132274725076, "loss": 5.5677, "mean_token_accuracy": 0.19981538355350495, "num_tokens": 1657111.0, "step": 1950 }, { "entropy": 5.74615364074707, "epoch": 0.04368373423307674, "grad_norm": 1.3203125, "learning_rate": 0.00039675228172854707, "loss": 5.5114, "mean_token_accuracy": 0.20943004339933396, "num_tokens": 1661541.0, "step": 1955 }, { "entropy": 5.7881919860839846, "epoch": 0.043795457338532184, "grad_norm": 1.3984375, "learning_rate": 0.0003957599028095371, "loss": 5.6962, "mean_token_accuracy": 0.1931190684437752, "num_tokens": 1665606.0, "step": 1960 }, { "entropy": 5.764306926727295, "epoch": 0.04390718044398762, "grad_norm": 1.390625, "learning_rate": 0.00039476421319679017, "loss": 5.5577, "mean_token_accuracy": 0.20224445462226867, "num_tokens": 1670110.0, "step": 1965 }, { "entropy": 5.783484220504761, "epoch": 0.04401890354944306, "grad_norm": 1.1796875, "learning_rate": 0.00039376524018764, "loss": 5.6797, "mean_token_accuracy": 0.19021057486534118, "num_tokens": 1674724.0, "step": 1970 }, { "entropy": 5.712728071212768, "epoch": 0.0441306266548985, "grad_norm": 1.2734375, "learning_rate": 0.00039276301116943616, "loss": 5.4867, "mean_token_accuracy": 0.20342362821102142, "num_tokens": 1679022.0, "step": 1975 }, { "entropy": 5.82525110244751, "epoch": 0.04424234976035394, "grad_norm": 1.296875, "learning_rate": 0.0003917575536187936, "loss": 5.5966, "mean_token_accuracy": 0.1854166254401207, "num_tokens": 1683192.0, "step": 1980 }, { "entropy": 5.9117003917694095, "epoch": 0.04435407286580938, "grad_norm": 1.2578125, "learning_rate": 0.00039074889510083894, "loss": 5.5508, "mean_token_accuracy": 0.1991869479417801, "num_tokens": 1687442.0, "step": 1985 }, { "entropy": 5.892691707611084, "epoch": 0.044465795971264814, "grad_norm": 1.3125, "learning_rate": 0.00038973706326845495, "loss": 5.6187, "mean_token_accuracy": 0.19358569532632827, "num_tokens": 1691528.0, "step": 1990 }, { "entropy": 5.792757225036621, "epoch": 0.04457751907672026, "grad_norm": 1.4140625, "learning_rate": 0.0003887220858615225, "loss": 5.7013, "mean_token_accuracy": 0.18120188936591147, "num_tokens": 1695936.0, "step": 1995 }, { "entropy": 5.806407785415649, "epoch": 0.044689242182175695, "grad_norm": 1.1796875, "learning_rate": 0.0003877039907061597, "loss": 5.647, "mean_token_accuracy": 0.1883596122264862, "num_tokens": 1700729.0, "step": 2000 }, { "entropy": 5.762160444259644, "epoch": 0.04480096528763113, "grad_norm": 1.1875, "learning_rate": 0.0003866828057139598, "loss": 5.5854, "mean_token_accuracy": 0.1900200754404068, "num_tokens": 1704962.0, "step": 2005 }, { "entropy": 5.903286409378052, "epoch": 0.044912688393086576, "grad_norm": 1.296875, "learning_rate": 0.00038565855888122503, "loss": 5.7253, "mean_token_accuracy": 0.188412207365036, "num_tokens": 1709408.0, "step": 2010 }, { "entropy": 5.749392414093018, "epoch": 0.045024411498542014, "grad_norm": 1.21875, "learning_rate": 0.00038463127828819975, "loss": 5.544, "mean_token_accuracy": 0.20539478808641434, "num_tokens": 1713894.0, "step": 2015 }, { "entropy": 5.758798837661743, "epoch": 0.04513613460399745, "grad_norm": 1.2421875, "learning_rate": 0.00038360099209830043, "loss": 5.4963, "mean_token_accuracy": 0.19964819103479386, "num_tokens": 1718477.0, "step": 2020 }, { "entropy": 5.695912027359009, "epoch": 0.045247857709452895, "grad_norm": 1.28125, "learning_rate": 0.0003825677285573433, "loss": 5.4753, "mean_token_accuracy": 0.2068771407008171, "num_tokens": 1722438.0, "step": 2025 }, { "entropy": 5.75974669456482, "epoch": 0.04535958081490833, "grad_norm": 1.34375, "learning_rate": 0.00038153151599277027, "loss": 5.5831, "mean_token_accuracy": 0.19000767022371293, "num_tokens": 1726360.0, "step": 2030 }, { "entropy": 5.711872291564942, "epoch": 0.04547130392036377, "grad_norm": 1.2890625, "learning_rate": 0.0003804923828128723, "loss": 5.5884, "mean_token_accuracy": 0.19782641232013704, "num_tokens": 1730550.0, "step": 2035 }, { "entropy": 5.848684644699096, "epoch": 0.04558302702581921, "grad_norm": 1.1953125, "learning_rate": 0.0003794503575060104, "loss": 5.6181, "mean_token_accuracy": 0.19873436838388442, "num_tokens": 1734685.0, "step": 2040 }, { "entropy": 5.871687984466552, "epoch": 0.04569475013127465, "grad_norm": 1.21875, "learning_rate": 0.00037840546863983484, "loss": 5.6398, "mean_token_accuracy": 0.18782007247209548, "num_tokens": 1738714.0, "step": 2045 }, { "entropy": 5.738994503021241, "epoch": 0.04580647323673009, "grad_norm": 1.375, "learning_rate": 0.0003773577448605015, "loss": 5.626, "mean_token_accuracy": 0.19532090276479722, "num_tokens": 1742714.0, "step": 2050 }, { "entropy": 5.682830095291138, "epoch": 0.045918196342185524, "grad_norm": 1.265625, "learning_rate": 0.0003763072148918872, "loss": 5.5304, "mean_token_accuracy": 0.19776215255260468, "num_tokens": 1747053.0, "step": 2055 }, { "entropy": 5.786202430725098, "epoch": 0.04602991944764097, "grad_norm": 1.3359375, "learning_rate": 0.0003752539075348017, "loss": 5.6494, "mean_token_accuracy": 0.18340060263872146, "num_tokens": 1750766.0, "step": 2060 }, { "entropy": 5.702880001068115, "epoch": 0.046141642553096406, "grad_norm": 1.3046875, "learning_rate": 0.00037419785166619817, "loss": 5.5526, "mean_token_accuracy": 0.20375038385391236, "num_tokens": 1754835.0, "step": 2065 }, { "entropy": 5.871013212203979, "epoch": 0.04625336565855184, "grad_norm": 1.3359375, "learning_rate": 0.0003731390762383818, "loss": 5.6323, "mean_token_accuracy": 0.19306133091449737, "num_tokens": 1759088.0, "step": 2070 }, { "entropy": 5.733165454864502, "epoch": 0.04636508876400729, "grad_norm": 1.234375, "learning_rate": 0.0003720776102782158, "loss": 5.4566, "mean_token_accuracy": 0.19918377846479415, "num_tokens": 1763493.0, "step": 2075 }, { "entropy": 5.687415218353271, "epoch": 0.046476811869462724, "grad_norm": 1.2421875, "learning_rate": 0.00037101348288632555, "loss": 5.6178, "mean_token_accuracy": 0.19442993849515916, "num_tokens": 1767859.0, "step": 2080 }, { "entropy": 5.834854173660278, "epoch": 0.04658853497491816, "grad_norm": 1.3046875, "learning_rate": 0.0003699467232363012, "loss": 5.6958, "mean_token_accuracy": 0.1966529980301857, "num_tokens": 1772448.0, "step": 2085 }, { "entropy": 5.876561212539673, "epoch": 0.046700258080373605, "grad_norm": 1.3125, "learning_rate": 0.0003688773605738973, "loss": 5.5735, "mean_token_accuracy": 0.19237288534641267, "num_tokens": 1776420.0, "step": 2090 }, { "entropy": 5.78769850730896, "epoch": 0.04681198118582904, "grad_norm": 1.2734375, "learning_rate": 0.00036780542421623134, "loss": 5.4762, "mean_token_accuracy": 0.20187776684761047, "num_tokens": 1780503.0, "step": 2095 }, { "entropy": 5.806005382537842, "epoch": 0.04692370429128448, "grad_norm": 1.375, "learning_rate": 0.0003667309435509802, "loss": 5.5254, "mean_token_accuracy": 0.18709133192896843, "num_tokens": 1784730.0, "step": 2100 }, { "entropy": 5.821065235137939, "epoch": 0.047035427396739916, "grad_norm": 1.265625, "learning_rate": 0.0003656539480355741, "loss": 5.5357, "mean_token_accuracy": 0.19565414786338806, "num_tokens": 1789317.0, "step": 2105 }, { "entropy": 5.8124915599823, "epoch": 0.04714715050219536, "grad_norm": 1.28125, "learning_rate": 0.0003645744671963891, "loss": 5.7168, "mean_token_accuracy": 0.18089606761932372, "num_tokens": 1793431.0, "step": 2110 }, { "entropy": 5.83145956993103, "epoch": 0.0472588736076508, "grad_norm": 1.3359375, "learning_rate": 0.0003634925306279376, "loss": 5.5886, "mean_token_accuracy": 0.19206707179546356, "num_tokens": 1797546.0, "step": 2115 }, { "entropy": 5.815211296081543, "epoch": 0.047370596713106235, "grad_norm": 1.171875, "learning_rate": 0.0003624081679920574, "loss": 5.5849, "mean_token_accuracy": 0.19378799498081206, "num_tokens": 1801995.0, "step": 2120 }, { "entropy": 5.810037565231323, "epoch": 0.04748231981856168, "grad_norm": 1.359375, "learning_rate": 0.0003613214090170977, "loss": 5.5494, "mean_token_accuracy": 0.1948327451944351, "num_tokens": 1805957.0, "step": 2125 }, { "entropy": 5.692611360549927, "epoch": 0.047594042924017116, "grad_norm": 1.2421875, "learning_rate": 0.0003602322834971048, "loss": 5.538, "mean_token_accuracy": 0.20250043869018555, "num_tokens": 1810216.0, "step": 2130 }, { "entropy": 5.737909555435181, "epoch": 0.04770576602947255, "grad_norm": 1.328125, "learning_rate": 0.0003591408212910051, "loss": 5.5878, "mean_token_accuracy": 0.19820891618728637, "num_tokens": 1814727.0, "step": 2135 }, { "entropy": 5.843041563034058, "epoch": 0.047817489134928, "grad_norm": 1.328125, "learning_rate": 0.0003580470523217863, "loss": 5.5271, "mean_token_accuracy": 0.2012895479798317, "num_tokens": 1819076.0, "step": 2140 }, { "entropy": 5.887670660018921, "epoch": 0.047929212240383434, "grad_norm": 1.2578125, "learning_rate": 0.0003569510065756771, "loss": 5.6315, "mean_token_accuracy": 0.19086363166570663, "num_tokens": 1823289.0, "step": 2145 }, { "entropy": 5.619873428344727, "epoch": 0.04804093534583887, "grad_norm": 1.234375, "learning_rate": 0.0003558527141013254, "loss": 5.4709, "mean_token_accuracy": 0.2047662228345871, "num_tokens": 1827301.0, "step": 2150 }, { "entropy": 5.79863338470459, "epoch": 0.048152658451294315, "grad_norm": 1.3125, "learning_rate": 0.0003547522050089742, "loss": 5.6093, "mean_token_accuracy": 0.19400935769081115, "num_tokens": 1831300.0, "step": 2155 }, { "entropy": 5.751356601715088, "epoch": 0.04826438155674975, "grad_norm": 1.2890625, "learning_rate": 0.00035364950946963606, "loss": 5.4207, "mean_token_accuracy": 0.20203897207975388, "num_tokens": 1835445.0, "step": 2160 }, { "entropy": 5.778922367095947, "epoch": 0.04837610466220519, "grad_norm": 1.3203125, "learning_rate": 0.0003525446577142663, "loss": 5.5592, "mean_token_accuracy": 0.19020436555147172, "num_tokens": 1839773.0, "step": 2165 }, { "entropy": 5.876074647903442, "epoch": 0.04848782776766063, "grad_norm": 1.234375, "learning_rate": 0.00035143768003293395, "loss": 5.5561, "mean_token_accuracy": 0.19834407269954682, "num_tokens": 1844011.0, "step": 2170 }, { "entropy": 5.866834402084351, "epoch": 0.04859955087311607, "grad_norm": 1.296875, "learning_rate": 0.0003503286067739913, "loss": 5.6563, "mean_token_accuracy": 0.18741424530744552, "num_tokens": 1848063.0, "step": 2175 }, { "entropy": 5.783164978027344, "epoch": 0.04871127397857151, "grad_norm": 1.2578125, "learning_rate": 0.00034921746834324193, "loss": 5.702, "mean_token_accuracy": 0.18896054923534394, "num_tokens": 1852403.0, "step": 2180 }, { "entropy": 5.707462406158447, "epoch": 0.048822997084026945, "grad_norm": 1.3984375, "learning_rate": 0.0003481042952031072, "loss": 5.5782, "mean_token_accuracy": 0.19232478588819504, "num_tokens": 1856680.0, "step": 2185 }, { "entropy": 5.941496753692627, "epoch": 0.04893472018948239, "grad_norm": 1.25, "learning_rate": 0.0003469891178717911, "loss": 5.6988, "mean_token_accuracy": 0.18841493278741836, "num_tokens": 1861166.0, "step": 2190 }, { "entropy": 5.853933238983155, "epoch": 0.049046443294937826, "grad_norm": 1.375, "learning_rate": 0.0003458719669224436, "loss": 5.5343, "mean_token_accuracy": 0.19651836454868316, "num_tokens": 1865334.0, "step": 2195 }, { "entropy": 5.7885599613189695, "epoch": 0.04915816640039326, "grad_norm": 1.265625, "learning_rate": 0.0003447528729823221, "loss": 5.5103, "mean_token_accuracy": 0.20324531346559524, "num_tokens": 1869391.0, "step": 2200 }, { "entropy": 5.779803848266601, "epoch": 0.04926988950584871, "grad_norm": 1.421875, "learning_rate": 0.0003436318667319525, "loss": 5.626, "mean_token_accuracy": 0.18151400834321976, "num_tokens": 1873811.0, "step": 2205 }, { "entropy": 5.938524341583252, "epoch": 0.049381612611304145, "grad_norm": 1.1328125, "learning_rate": 0.00034250897890428716, "loss": 5.6113, "mean_token_accuracy": 0.197547022998333, "num_tokens": 1878269.0, "step": 2210 }, { "entropy": 5.9333361148834225, "epoch": 0.04949333571675958, "grad_norm": 1.46875, "learning_rate": 0.0003413842402838633, "loss": 5.72, "mean_token_accuracy": 0.1759050115942955, "num_tokens": 1882800.0, "step": 2215 }, { "entropy": 5.799074602127075, "epoch": 0.049605058822215026, "grad_norm": 1.2578125, "learning_rate": 0.00034025768170595834, "loss": 5.6057, "mean_token_accuracy": 0.19079000651836395, "num_tokens": 1886968.0, "step": 2220 }, { "entropy": 5.835526609420777, "epoch": 0.04971678192767046, "grad_norm": 1.28125, "learning_rate": 0.0003391293340557446, "loss": 5.6423, "mean_token_accuracy": 0.18531539142131806, "num_tokens": 1891158.0, "step": 2225 }, { "entropy": 5.666534328460694, "epoch": 0.0498285050331259, "grad_norm": 1.25, "learning_rate": 0.0003379992282674431, "loss": 5.3402, "mean_token_accuracy": 0.2212604433298111, "num_tokens": 1895431.0, "step": 2230 }, { "entropy": 5.787649869918823, "epoch": 0.04994022813858134, "grad_norm": 1.2578125, "learning_rate": 0.0003368673953234749, "loss": 5.5506, "mean_token_accuracy": 0.1960133582353592, "num_tokens": 1899750.0, "step": 2235 }, { "entropy": 5.787043285369873, "epoch": 0.05005195124403678, "grad_norm": 1.2265625, "learning_rate": 0.00033573386625361176, "loss": 5.4654, "mean_token_accuracy": 0.20220616459846497, "num_tokens": 1904076.0, "step": 2240 }, { "entropy": 5.723090791702271, "epoch": 0.05016367434949222, "grad_norm": 1.265625, "learning_rate": 0.00033459867213412567, "loss": 5.5961, "mean_token_accuracy": 0.1981295756995678, "num_tokens": 1908859.0, "step": 2245 }, { "entropy": 5.7092540740966795, "epoch": 0.050275397454947655, "grad_norm": 1.328125, "learning_rate": 0.000333461844086937, "loss": 5.3811, "mean_token_accuracy": 0.21009975224733352, "num_tokens": 1913049.0, "step": 2250 }, { "entropy": 5.761197090148926, "epoch": 0.0503871205604031, "grad_norm": 1.3046875, "learning_rate": 0.00033232341327876097, "loss": 5.5858, "mean_token_accuracy": 0.1940508484840393, "num_tokens": 1917247.0, "step": 2255 }, { "entropy": 5.7653045654296875, "epoch": 0.05049884366585854, "grad_norm": 1.4609375, "learning_rate": 0.0003311834109202531, "loss": 5.5715, "mean_token_accuracy": 0.2005481407046318, "num_tokens": 1920958.0, "step": 2260 }, { "entropy": 5.650889492034912, "epoch": 0.050610566771313974, "grad_norm": 1.2265625, "learning_rate": 0.00033004186826515416, "loss": 5.4762, "mean_token_accuracy": 0.19745519757270813, "num_tokens": 1925184.0, "step": 2265 }, { "entropy": 5.788166809082031, "epoch": 0.05072228987676942, "grad_norm": 1.2734375, "learning_rate": 0.0003288988166094324, "loss": 5.476, "mean_token_accuracy": 0.20047650933265687, "num_tokens": 1929392.0, "step": 2270 }, { "entropy": 5.705606317520141, "epoch": 0.050834012982224855, "grad_norm": 1.34375, "learning_rate": 0.00032775428729042656, "loss": 5.5656, "mean_token_accuracy": 0.2057395651936531, "num_tokens": 1933535.0, "step": 2275 }, { "entropy": 5.746721363067627, "epoch": 0.05094573608768029, "grad_norm": 1.296875, "learning_rate": 0.000326608311685986, "loss": 5.4757, "mean_token_accuracy": 0.1967824012041092, "num_tokens": 1937830.0, "step": 2280 }, { "entropy": 5.682202482223511, "epoch": 0.05105745919313573, "grad_norm": 1.4453125, "learning_rate": 0.0003254609212136108, "loss": 5.469, "mean_token_accuracy": 0.20526605397462844, "num_tokens": 1941521.0, "step": 2285 }, { "entropy": 5.666811180114746, "epoch": 0.05116918229859117, "grad_norm": 1.3203125, "learning_rate": 0.00032431214732959036, "loss": 5.5247, "mean_token_accuracy": 0.19909686148166655, "num_tokens": 1945662.0, "step": 2290 }, { "entropy": 5.675921297073364, "epoch": 0.05128090540404661, "grad_norm": 1.2578125, "learning_rate": 0.000323162021528141, "loss": 5.3343, "mean_token_accuracy": 0.20175234079360962, "num_tokens": 1949529.0, "step": 2295 }, { "entropy": 5.5916831493377686, "epoch": 0.05139262850950205, "grad_norm": 1.2578125, "learning_rate": 0.00032201057534054264, "loss": 5.3595, "mean_token_accuracy": 0.21346658766269683, "num_tokens": 1953541.0, "step": 2300 }, { "entropy": 5.603648614883423, "epoch": 0.05150435161495749, "grad_norm": 1.2265625, "learning_rate": 0.00032085784033427414, "loss": 5.429, "mean_token_accuracy": 0.20238481909036637, "num_tokens": 1957863.0, "step": 2305 }, { "entropy": 5.6920647621154785, "epoch": 0.05161607472041293, "grad_norm": 1.15625, "learning_rate": 0.0003197038481121478, "loss": 5.4622, "mean_token_accuracy": 0.20448255985975267, "num_tokens": 1962085.0, "step": 2310 }, { "entropy": 5.818530702590943, "epoch": 0.051727797825868366, "grad_norm": 1.21875, "learning_rate": 0.0003185486303114436, "loss": 5.46, "mean_token_accuracy": 0.20918918401002884, "num_tokens": 1966707.0, "step": 2315 }, { "entropy": 5.782559156417847, "epoch": 0.05183952093132381, "grad_norm": 1.2421875, "learning_rate": 0.0003173922186030409, "loss": 5.5782, "mean_token_accuracy": 0.18843235522508622, "num_tokens": 1970461.0, "step": 2320 }, { "entropy": 5.730897331237793, "epoch": 0.05195124403677925, "grad_norm": 1.265625, "learning_rate": 0.000316234644690551, "loss": 5.4926, "mean_token_accuracy": 0.19218226224184037, "num_tokens": 1974611.0, "step": 2325 }, { "entropy": 5.58878173828125, "epoch": 0.052062967142234684, "grad_norm": 1.3828125, "learning_rate": 0.0003150759403094473, "loss": 5.3545, "mean_token_accuracy": 0.21814181208610534, "num_tokens": 1978644.0, "step": 2330 }, { "entropy": 5.569992017745972, "epoch": 0.05217469024769013, "grad_norm": 1.1796875, "learning_rate": 0.00031391613722619587, "loss": 5.3576, "mean_token_accuracy": 0.2130546897649765, "num_tokens": 1983045.0, "step": 2335 }, { "entropy": 5.643571615219116, "epoch": 0.052286413353145565, "grad_norm": 1.2265625, "learning_rate": 0.000312755267237384, "loss": 5.4264, "mean_token_accuracy": 0.19753186404705048, "num_tokens": 1987046.0, "step": 2340 }, { "entropy": 5.820702457427979, "epoch": 0.052398136458601, "grad_norm": 1.28125, "learning_rate": 0.0003115933621688488, "loss": 5.6737, "mean_token_accuracy": 0.19394189715385438, "num_tokens": 1992201.0, "step": 2345 }, { "entropy": 5.794189882278443, "epoch": 0.05250985956405644, "grad_norm": 1.296875, "learning_rate": 0.00031043045387480487, "loss": 5.5028, "mean_token_accuracy": 0.20308198630809784, "num_tokens": 1996124.0, "step": 2350 }, { "entropy": 5.6884153366088865, "epoch": 0.052621582669511884, "grad_norm": 1.21875, "learning_rate": 0.0003092665742369703, "loss": 5.5133, "mean_token_accuracy": 0.19611742794513704, "num_tokens": 2000602.0, "step": 2355 }, { "entropy": 5.69493989944458, "epoch": 0.05273330577496732, "grad_norm": 1.234375, "learning_rate": 0.00030810175516369343, "loss": 5.5597, "mean_token_accuracy": 0.1906090185046196, "num_tokens": 2005538.0, "step": 2360 }, { "entropy": 5.730038738250732, "epoch": 0.05284502888042276, "grad_norm": 1.1484375, "learning_rate": 0.0003069360285890775, "loss": 5.5006, "mean_token_accuracy": 0.19747251272201538, "num_tokens": 2010158.0, "step": 2365 }, { "entropy": 5.713155364990234, "epoch": 0.0529567519858782, "grad_norm": 1.3203125, "learning_rate": 0.00030576942647210547, "loss": 5.4504, "mean_token_accuracy": 0.20709324926137923, "num_tokens": 2013993.0, "step": 2370 }, { "entropy": 5.664393043518066, "epoch": 0.05306847509133364, "grad_norm": 1.296875, "learning_rate": 0.00030460198079576355, "loss": 5.4985, "mean_token_accuracy": 0.20383214950561523, "num_tokens": 2018312.0, "step": 2375 }, { "entropy": 5.771481895446778, "epoch": 0.053180198196789076, "grad_norm": 1.28125, "learning_rate": 0.0003034337235661648, "loss": 5.6226, "mean_token_accuracy": 0.19073883444070816, "num_tokens": 2022957.0, "step": 2380 }, { "entropy": 5.73010287284851, "epoch": 0.05329192130224452, "grad_norm": 1.2734375, "learning_rate": 0.0003022646868116714, "loss": 5.548, "mean_token_accuracy": 0.19659071415662766, "num_tokens": 2027546.0, "step": 2385 }, { "entropy": 5.743465614318848, "epoch": 0.05340364440769996, "grad_norm": 1.3515625, "learning_rate": 0.0003010949025820163, "loss": 5.362, "mean_token_accuracy": 0.19935485869646072, "num_tokens": 2031370.0, "step": 2390 }, { "entropy": 5.73956298828125, "epoch": 0.053515367513155394, "grad_norm": 1.1484375, "learning_rate": 0.0002999244029474252, "loss": 5.5286, "mean_token_accuracy": 0.19233475774526596, "num_tokens": 2036314.0, "step": 2395 }, { "entropy": 5.790511226654052, "epoch": 0.05362709061861083, "grad_norm": 1.2109375, "learning_rate": 0.00029875321999773684, "loss": 5.4899, "mean_token_accuracy": 0.1918132334947586, "num_tokens": 2040819.0, "step": 2400 }, { "entropy": 5.736824607849121, "epoch": 0.053738813724066276, "grad_norm": 1.265625, "learning_rate": 0.00029758138584152333, "loss": 5.4628, "mean_token_accuracy": 0.20366894453763962, "num_tokens": 2045230.0, "step": 2405 }, { "entropy": 5.79299521446228, "epoch": 0.05385053682952171, "grad_norm": 1.359375, "learning_rate": 0.0002964089326052102, "loss": 5.4941, "mean_token_accuracy": 0.20016811043024063, "num_tokens": 2049688.0, "step": 2410 }, { "entropy": 5.674882650375366, "epoch": 0.05396225993497715, "grad_norm": 1.296875, "learning_rate": 0.0002952358924321949, "loss": 5.4647, "mean_token_accuracy": 0.20398021638393402, "num_tokens": 2053811.0, "step": 2415 }, { "entropy": 5.634827518463135, "epoch": 0.054073983040432594, "grad_norm": 1.3984375, "learning_rate": 0.00029406229748196657, "loss": 5.5145, "mean_token_accuracy": 0.19332455545663835, "num_tokens": 2058018.0, "step": 2420 }, { "entropy": 5.709335565567017, "epoch": 0.05418570614588803, "grad_norm": 1.3203125, "learning_rate": 0.0002928881799292235, "loss": 5.5103, "mean_token_accuracy": 0.1937290146946907, "num_tokens": 2062258.0, "step": 2425 }, { "entropy": 5.741044902801514, "epoch": 0.05429742925134347, "grad_norm": 1.296875, "learning_rate": 0.00029171357196299154, "loss": 5.5401, "mean_token_accuracy": 0.1956827312707901, "num_tokens": 2066465.0, "step": 2430 }, { "entropy": 5.652325916290283, "epoch": 0.05440915235679891, "grad_norm": 1.3359375, "learning_rate": 0.0002905385057857414, "loss": 5.3751, "mean_token_accuracy": 0.21417505145072938, "num_tokens": 2070679.0, "step": 2435 }, { "entropy": 5.665175867080689, "epoch": 0.05452087546225435, "grad_norm": 1.34375, "learning_rate": 0.0002893630136125058, "loss": 5.4719, "mean_token_accuracy": 0.19800220280885697, "num_tokens": 2075443.0, "step": 2440 }, { "entropy": 5.677209901809692, "epoch": 0.054632598567709786, "grad_norm": 1.4453125, "learning_rate": 0.0002881871276699967, "loss": 5.4908, "mean_token_accuracy": 0.18837572783231735, "num_tokens": 2079726.0, "step": 2445 }, { "entropy": 5.596740770339966, "epoch": 0.05474432167316523, "grad_norm": 1.3359375, "learning_rate": 0.00028701088019572114, "loss": 5.4139, "mean_token_accuracy": 0.20456188321113586, "num_tokens": 2083868.0, "step": 2450 }, { "entropy": 5.6920839786529545, "epoch": 0.05485604477862067, "grad_norm": 1.2265625, "learning_rate": 0.0002858343034370977, "loss": 5.4324, "mean_token_accuracy": 0.20738107711076736, "num_tokens": 2087722.0, "step": 2455 }, { "entropy": 5.734287595748901, "epoch": 0.054967767884076105, "grad_norm": 1.296875, "learning_rate": 0.00028465742965057267, "loss": 5.3726, "mean_token_accuracy": 0.21619302332401275, "num_tokens": 2092293.0, "step": 2460 }, { "entropy": 5.639990568161011, "epoch": 0.05507949098953154, "grad_norm": 1.3203125, "learning_rate": 0.00028348029110073533, "loss": 5.4867, "mean_token_accuracy": 0.21042072176933288, "num_tokens": 2096715.0, "step": 2465 }, { "entropy": 5.737297534942627, "epoch": 0.055191214094986986, "grad_norm": 1.2578125, "learning_rate": 0.00028230292005943365, "loss": 5.5661, "mean_token_accuracy": 0.19853804111480713, "num_tokens": 2100810.0, "step": 2470 }, { "entropy": 5.691304540634155, "epoch": 0.05530293720044242, "grad_norm": 1.28125, "learning_rate": 0.00028112534880488945, "loss": 5.4729, "mean_token_accuracy": 0.19330257475376128, "num_tokens": 2105125.0, "step": 2475 }, { "entropy": 5.716330480575562, "epoch": 0.05541466030589786, "grad_norm": 1.296875, "learning_rate": 0.0002799476096208137, "loss": 5.5589, "mean_token_accuracy": 0.19316209107637405, "num_tokens": 2109218.0, "step": 2480 }, { "entropy": 5.671482372283935, "epoch": 0.055526383411353304, "grad_norm": 1.171875, "learning_rate": 0.00027876973479552087, "loss": 5.4623, "mean_token_accuracy": 0.19808712005615234, "num_tokens": 2113361.0, "step": 2485 }, { "entropy": 5.645354509353638, "epoch": 0.05563810651680874, "grad_norm": 1.3203125, "learning_rate": 0.00027759175662104424, "loss": 5.4073, "mean_token_accuracy": 0.20510194897651673, "num_tokens": 2117457.0, "step": 2490 }, { "entropy": 5.7331404209136965, "epoch": 0.05574982962226418, "grad_norm": 1.453125, "learning_rate": 0.0002764137073922508, "loss": 5.4521, "mean_token_accuracy": 0.21233909726142883, "num_tokens": 2121198.0, "step": 2495 }, { "entropy": 5.696898126602173, "epoch": 0.05586155272771962, "grad_norm": 1.140625, "learning_rate": 0.00027523561940595505, "loss": 5.5453, "mean_token_accuracy": 0.19986631721258163, "num_tokens": 2125742.0, "step": 2500 }, { "entropy": 5.617610836029053, "epoch": 0.05597327583317506, "grad_norm": 1.1953125, "learning_rate": 0.0002740575249600342, "loss": 5.4545, "mean_token_accuracy": 0.1947040691971779, "num_tokens": 2129851.0, "step": 2505 }, { "entropy": 5.578350973129273, "epoch": 0.0560849989386305, "grad_norm": 1.2890625, "learning_rate": 0.00027287945635254263, "loss": 5.241, "mean_token_accuracy": 0.2216262102127075, "num_tokens": 2133814.0, "step": 2510 }, { "entropy": 5.724449157714844, "epoch": 0.05619672204408594, "grad_norm": 1.28125, "learning_rate": 0.00027170144588082635, "loss": 5.5208, "mean_token_accuracy": 0.20132671147584916, "num_tokens": 2138191.0, "step": 2515 }, { "entropy": 5.729740953445434, "epoch": 0.05630844514954138, "grad_norm": 1.328125, "learning_rate": 0.00027052352584063763, "loss": 5.5569, "mean_token_accuracy": 0.19500400125980377, "num_tokens": 2142144.0, "step": 2520 }, { "entropy": 5.7269703388214115, "epoch": 0.056420168254996815, "grad_norm": 1.3671875, "learning_rate": 0.00026934572852524907, "loss": 5.4114, "mean_token_accuracy": 0.1983845517039299, "num_tokens": 2146308.0, "step": 2525 }, { "entropy": 5.745017242431641, "epoch": 0.05653189136045225, "grad_norm": 1.296875, "learning_rate": 0.00026816808622456937, "loss": 5.5356, "mean_token_accuracy": 0.187234328687191, "num_tokens": 2150810.0, "step": 2530 }, { "entropy": 5.739377689361572, "epoch": 0.056643614465907696, "grad_norm": 1.34375, "learning_rate": 0.0002669906312242569, "loss": 5.5433, "mean_token_accuracy": 0.18481508195400237, "num_tokens": 2154865.0, "step": 2535 }, { "entropy": 5.647379541397095, "epoch": 0.05675533757136313, "grad_norm": 1.2265625, "learning_rate": 0.00026581339580483525, "loss": 5.3846, "mean_token_accuracy": 0.2152353584766388, "num_tokens": 2159045.0, "step": 2540 }, { "entropy": 5.694047212600708, "epoch": 0.05686706067681857, "grad_norm": 1.3671875, "learning_rate": 0.0002646364122408082, "loss": 5.43, "mean_token_accuracy": 0.20127338021993638, "num_tokens": 2163151.0, "step": 2545 }, { "entropy": 5.623006582260132, "epoch": 0.056978783782274015, "grad_norm": 1.296875, "learning_rate": 0.0002634597127997749, "loss": 5.4784, "mean_token_accuracy": 0.20224980711936952, "num_tokens": 2167099.0, "step": 2550 }, { "entropy": 5.720960998535157, "epoch": 0.05709050688772945, "grad_norm": 1.3203125, "learning_rate": 0.0002622833297415445, "loss": 5.5428, "mean_token_accuracy": 0.19093946367502213, "num_tokens": 2171299.0, "step": 2555 }, { "entropy": 5.754732465744018, "epoch": 0.05720222999318489, "grad_norm": 1.3359375, "learning_rate": 0.0002611072953172531, "loss": 5.5369, "mean_token_accuracy": 0.19886625409126282, "num_tokens": 2175491.0, "step": 2560 }, { "entropy": 5.778254556655884, "epoch": 0.05731395309864033, "grad_norm": 1.4375, "learning_rate": 0.00025993164176847845, "loss": 5.3294, "mean_token_accuracy": 0.19855942130088805, "num_tokens": 2179677.0, "step": 2565 }, { "entropy": 5.752972841262817, "epoch": 0.05742567620409577, "grad_norm": 1.1875, "learning_rate": 0.0002587564013263564, "loss": 5.4079, "mean_token_accuracy": 0.20372159034013748, "num_tokens": 2184038.0, "step": 2570 }, { "entropy": 5.733305263519287, "epoch": 0.05753739930955121, "grad_norm": 1.265625, "learning_rate": 0.0002575816062106974, "loss": 5.4757, "mean_token_accuracy": 0.19013774991035462, "num_tokens": 2188278.0, "step": 2575 }, { "entropy": 5.702356815338135, "epoch": 0.057649122415006644, "grad_norm": 1.2578125, "learning_rate": 0.00025640728862910293, "loss": 5.6095, "mean_token_accuracy": 0.19494371265172958, "num_tokens": 2192402.0, "step": 2580 }, { "entropy": 5.578436946868896, "epoch": 0.05776084552046209, "grad_norm": 1.3203125, "learning_rate": 0.00025523348077608285, "loss": 5.3235, "mean_token_accuracy": 0.21250910609960555, "num_tokens": 2196631.0, "step": 2585 }, { "entropy": 5.638101482391358, "epoch": 0.057872568625917525, "grad_norm": 1.2109375, "learning_rate": 0.00025406021483217225, "loss": 5.4355, "mean_token_accuracy": 0.20848297327756882, "num_tokens": 2201016.0, "step": 2590 }, { "entropy": 5.593278789520264, "epoch": 0.05798429173137296, "grad_norm": 1.328125, "learning_rate": 0.00025288752296304963, "loss": 5.3912, "mean_token_accuracy": 0.21250712126493454, "num_tokens": 2205091.0, "step": 2595 }, { "entropy": 5.684863185882568, "epoch": 0.05809601483682841, "grad_norm": 1.28125, "learning_rate": 0.000251715437318655, "loss": 5.4442, "mean_token_accuracy": 0.20237696319818496, "num_tokens": 2209525.0, "step": 2600 }, { "entropy": 5.655977392196656, "epoch": 0.058207737942283844, "grad_norm": 1.1875, "learning_rate": 0.0002505439900323084, "loss": 5.409, "mean_token_accuracy": 0.20060928910970688, "num_tokens": 2214093.0, "step": 2605 }, { "entropy": 5.663346529006958, "epoch": 0.05831946104773928, "grad_norm": 1.265625, "learning_rate": 0.00024937321321982894, "loss": 5.4939, "mean_token_accuracy": 0.19938003718852998, "num_tokens": 2218619.0, "step": 2610 }, { "entropy": 5.671671676635742, "epoch": 0.058431184153194725, "grad_norm": 1.3671875, "learning_rate": 0.00024820313897865433, "loss": 5.3489, "mean_token_accuracy": 0.20987917482852936, "num_tokens": 2222953.0, "step": 2615 }, { "entropy": 5.648585557937622, "epoch": 0.05854290725865016, "grad_norm": 1.2578125, "learning_rate": 0.00024703379938696105, "loss": 5.4249, "mean_token_accuracy": 0.2062181681394577, "num_tokens": 2227162.0, "step": 2620 }, { "entropy": 5.658475542068482, "epoch": 0.0586546303641056, "grad_norm": 1.21875, "learning_rate": 0.00024586522650278447, "loss": 5.3822, "mean_token_accuracy": 0.19644873440265656, "num_tokens": 2231333.0, "step": 2625 }, { "entropy": 5.695336866378784, "epoch": 0.05876635346956104, "grad_norm": 1.34375, "learning_rate": 0.00024469745236314064, "loss": 5.4931, "mean_token_accuracy": 0.19799769371747972, "num_tokens": 2235160.0, "step": 2630 }, { "entropy": 5.700191783905029, "epoch": 0.05887807657501648, "grad_norm": 1.2578125, "learning_rate": 0.00024353050898314767, "loss": 5.4899, "mean_token_accuracy": 0.20430560261011124, "num_tokens": 2239699.0, "step": 2635 }, { "entropy": 5.764710092544556, "epoch": 0.05898979968047192, "grad_norm": 1.2734375, "learning_rate": 0.00024236442835514743, "loss": 5.4771, "mean_token_accuracy": 0.1954497367143631, "num_tokens": 2244146.0, "step": 2640 }, { "entropy": 5.732822465896606, "epoch": 0.059101522785927355, "grad_norm": 1.3984375, "learning_rate": 0.00024119924244782965, "loss": 5.5227, "mean_token_accuracy": 0.19945912808179855, "num_tokens": 2248190.0, "step": 2645 }, { "entropy": 5.663884496688842, "epoch": 0.0592132458913828, "grad_norm": 1.2890625, "learning_rate": 0.00024003498320535462, "loss": 5.3814, "mean_token_accuracy": 0.2188004434108734, "num_tokens": 2252389.0, "step": 2650 }, { "entropy": 5.584512186050415, "epoch": 0.059324968996838236, "grad_norm": 1.3125, "learning_rate": 0.00023887168254647727, "loss": 5.3835, "mean_token_accuracy": 0.21272893846035004, "num_tokens": 2256430.0, "step": 2655 }, { "entropy": 5.609366416931152, "epoch": 0.05943669210229367, "grad_norm": 1.265625, "learning_rate": 0.00023770937236367308, "loss": 5.5235, "mean_token_accuracy": 0.20157335847616195, "num_tokens": 2260632.0, "step": 2660 }, { "entropy": 5.64447250366211, "epoch": 0.05954841520774912, "grad_norm": 1.3046875, "learning_rate": 0.00023654808452226278, "loss": 5.374, "mean_token_accuracy": 0.20789485424757004, "num_tokens": 2264593.0, "step": 2665 }, { "entropy": 5.657564306259156, "epoch": 0.059660138313204554, "grad_norm": 1.2890625, "learning_rate": 0.00023538785085953912, "loss": 5.3631, "mean_token_accuracy": 0.20994101464748383, "num_tokens": 2268994.0, "step": 2670 }, { "entropy": 5.724624824523926, "epoch": 0.05977186141865999, "grad_norm": 1.28125, "learning_rate": 0.00023422870318389404, "loss": 5.4608, "mean_token_accuracy": 0.20652345567941666, "num_tokens": 2273355.0, "step": 2675 }, { "entropy": 5.622627449035645, "epoch": 0.059883584524115435, "grad_norm": 1.3828125, "learning_rate": 0.0002330706732739468, "loss": 5.4408, "mean_token_accuracy": 0.21060604006052017, "num_tokens": 2277618.0, "step": 2680 }, { "entropy": 5.578470468521118, "epoch": 0.05999530762957087, "grad_norm": 1.2109375, "learning_rate": 0.00023191379287767211, "loss": 5.4836, "mean_token_accuracy": 0.20465112030506133, "num_tokens": 2281751.0, "step": 2685 }, { "entropy": 5.6411576747894285, "epoch": 0.06010703073502631, "grad_norm": 1.4921875, "learning_rate": 0.0002307580937115305, "loss": 5.5372, "mean_token_accuracy": 0.19300513565540314, "num_tokens": 2286581.0, "step": 2690 }, { "entropy": 5.68374400138855, "epoch": 0.06021875384048175, "grad_norm": 1.3515625, "learning_rate": 0.00022960360745959846, "loss": 5.4147, "mean_token_accuracy": 0.20215319246053695, "num_tokens": 2290733.0, "step": 2695 }, { "entropy": 5.723708868026733, "epoch": 0.06033047694593719, "grad_norm": 1.2421875, "learning_rate": 0.00022845036577269972, "loss": 5.3536, "mean_token_accuracy": 0.20597321391105652, "num_tokens": 2295150.0, "step": 2700 }, { "entropy": 5.7238153457641605, "epoch": 0.06044220005139263, "grad_norm": 1.34375, "learning_rate": 0.00022729840026753777, "loss": 5.4375, "mean_token_accuracy": 0.20747863054275512, "num_tokens": 2299461.0, "step": 2705 }, { "entropy": 5.587930822372437, "epoch": 0.060553923156848065, "grad_norm": 1.28125, "learning_rate": 0.0002261477425258287, "loss": 5.2821, "mean_token_accuracy": 0.22456400692462922, "num_tokens": 2303981.0, "step": 2710 }, { "entropy": 5.83951325416565, "epoch": 0.06066564626230351, "grad_norm": 1.296875, "learning_rate": 0.0002249984240934358, "loss": 5.6461, "mean_token_accuracy": 0.18310097455978394, "num_tokens": 2308514.0, "step": 2715 }, { "entropy": 5.807481193542481, "epoch": 0.060777369367758946, "grad_norm": 1.3203125, "learning_rate": 0.00022385047647950464, "loss": 5.5157, "mean_token_accuracy": 0.20032941550016403, "num_tokens": 2312803.0, "step": 2720 }, { "entropy": 5.706876802444458, "epoch": 0.06088909247321438, "grad_norm": 1.25, "learning_rate": 0.0002227039311555986, "loss": 5.4204, "mean_token_accuracy": 0.20374609380960465, "num_tokens": 2317136.0, "step": 2725 }, { "entropy": 5.711637687683106, "epoch": 0.06100081557866983, "grad_norm": 1.3203125, "learning_rate": 0.0002215588195548372, "loss": 5.4106, "mean_token_accuracy": 0.20451716631650924, "num_tokens": 2321064.0, "step": 2730 }, { "entropy": 5.683818435668945, "epoch": 0.061112538684125264, "grad_norm": 1.2578125, "learning_rate": 0.00022041517307103337, "loss": 5.3412, "mean_token_accuracy": 0.21318926066160201, "num_tokens": 2324971.0, "step": 2735 }, { "entropy": 5.679719495773315, "epoch": 0.0612242617895807, "grad_norm": 1.375, "learning_rate": 0.0002192730230578331, "loss": 5.4779, "mean_token_accuracy": 0.19736759066581727, "num_tokens": 2329433.0, "step": 2740 }, { "entropy": 5.600196123123169, "epoch": 0.061335984895036146, "grad_norm": 1.265625, "learning_rate": 0.0002181324008278559, "loss": 5.3563, "mean_token_accuracy": 0.2150166630744934, "num_tokens": 2333493.0, "step": 2745 }, { "entropy": 5.57360200881958, "epoch": 0.06144770800049158, "grad_norm": 1.2890625, "learning_rate": 0.00021699333765183655, "loss": 5.2304, "mean_token_accuracy": 0.21621160507202147, "num_tokens": 2337663.0, "step": 2750 }, { "entropy": 5.678966856002807, "epoch": 0.06155943110594702, "grad_norm": 1.1796875, "learning_rate": 0.0002158558647577673, "loss": 5.2662, "mean_token_accuracy": 0.2184838131070137, "num_tokens": 2342291.0, "step": 2755 }, { "entropy": 5.676230812072754, "epoch": 0.06167115421140246, "grad_norm": 1.3671875, "learning_rate": 0.00021472001333004215, "loss": 5.4342, "mean_token_accuracy": 0.20801472216844558, "num_tokens": 2346660.0, "step": 2760 }, { "entropy": 5.669618463516235, "epoch": 0.0617828773168579, "grad_norm": 1.4453125, "learning_rate": 0.00021358581450860186, "loss": 5.3922, "mean_token_accuracy": 0.20119996964931489, "num_tokens": 2350605.0, "step": 2765 }, { "entropy": 5.683516979217529, "epoch": 0.06189460042231334, "grad_norm": 1.265625, "learning_rate": 0.0002124532993880799, "loss": 5.3747, "mean_token_accuracy": 0.20869618207216262, "num_tokens": 2355125.0, "step": 2770 }, { "entropy": 5.705159473419189, "epoch": 0.062006323527768775, "grad_norm": 1.3203125, "learning_rate": 0.00021132249901695044, "loss": 5.5159, "mean_token_accuracy": 0.194733664393425, "num_tokens": 2359058.0, "step": 2775 }, { "entropy": 5.627893495559692, "epoch": 0.06211804663322422, "grad_norm": 1.4140625, "learning_rate": 0.00021019344439667705, "loss": 5.463, "mean_token_accuracy": 0.21798547804355622, "num_tokens": 2363326.0, "step": 2780 }, { "entropy": 5.7318541526794435, "epoch": 0.062229769738679656, "grad_norm": 1.375, "learning_rate": 0.00020906616648086213, "loss": 5.2807, "mean_token_accuracy": 0.21216789335012437, "num_tokens": 2367241.0, "step": 2785 }, { "entropy": 5.78978590965271, "epoch": 0.062341492844135094, "grad_norm": 1.2421875, "learning_rate": 0.00020794069617439942, "loss": 5.4647, "mean_token_accuracy": 0.1928023025393486, "num_tokens": 2371660.0, "step": 2790 }, { "entropy": 5.7157361030578615, "epoch": 0.06245321594959054, "grad_norm": 1.140625, "learning_rate": 0.00020681706433262593, "loss": 5.5075, "mean_token_accuracy": 0.20195768922567367, "num_tokens": 2376083.0, "step": 2795 }, { "entropy": 5.644212436676026, "epoch": 0.06256493905504597, "grad_norm": 1.390625, "learning_rate": 0.00020569530176047602, "loss": 5.3724, "mean_token_accuracy": 0.21047956198453904, "num_tokens": 2380006.0, "step": 2800 }, { "entropy": 5.660019779205323, "epoch": 0.06267666216050141, "grad_norm": 1.28125, "learning_rate": 0.0002045754392116374, "loss": 5.4079, "mean_token_accuracy": 0.20587846338748933, "num_tokens": 2384382.0, "step": 2805 }, { "entropy": 5.658632707595825, "epoch": 0.06278838526595686, "grad_norm": 1.21875, "learning_rate": 0.00020345750738770757, "loss": 5.3543, "mean_token_accuracy": 0.20837244987487794, "num_tokens": 2388550.0, "step": 2810 }, { "entropy": 5.6060553073883055, "epoch": 0.06290010837141229, "grad_norm": 1.296875, "learning_rate": 0.00020234153693735214, "loss": 5.3939, "mean_token_accuracy": 0.20029546022415162, "num_tokens": 2392189.0, "step": 2815 }, { "entropy": 5.601018953323364, "epoch": 0.06301183147686773, "grad_norm": 1.28125, "learning_rate": 0.0002012275584554647, "loss": 5.3059, "mean_token_accuracy": 0.22086506187915803, "num_tokens": 2396016.0, "step": 2820 }, { "entropy": 5.631944751739502, "epoch": 0.06312355458232317, "grad_norm": 1.21875, "learning_rate": 0.00020011560248232803, "loss": 5.4003, "mean_token_accuracy": 0.20547943860292434, "num_tokens": 2400053.0, "step": 2825 }, { "entropy": 5.736320638656617, "epoch": 0.0632352776877786, "grad_norm": 1.34375, "learning_rate": 0.00019900569950277692, "loss": 5.498, "mean_token_accuracy": 0.18274902701377868, "num_tokens": 2404164.0, "step": 2830 }, { "entropy": 5.6216331958770756, "epoch": 0.06334700079323405, "grad_norm": 1.1953125, "learning_rate": 0.00019789787994536228, "loss": 5.4779, "mean_token_accuracy": 0.2058979496359825, "num_tokens": 2408477.0, "step": 2835 }, { "entropy": 5.798714542388916, "epoch": 0.06345872389868949, "grad_norm": 1.3828125, "learning_rate": 0.00019679217418151667, "loss": 5.5246, "mean_token_accuracy": 0.19199300259351731, "num_tokens": 2413299.0, "step": 2840 }, { "entropy": 5.629748058319092, "epoch": 0.06357044700414492, "grad_norm": 1.3203125, "learning_rate": 0.00019568861252472236, "loss": 5.2433, "mean_token_accuracy": 0.2218858554959297, "num_tokens": 2417668.0, "step": 2845 }, { "entropy": 5.650796365737915, "epoch": 0.06368217010960037, "grad_norm": 1.265625, "learning_rate": 0.00019458722522967952, "loss": 5.4082, "mean_token_accuracy": 0.20668836534023285, "num_tokens": 2422531.0, "step": 2850 }, { "entropy": 5.572155332565307, "epoch": 0.06379389321505581, "grad_norm": 1.2578125, "learning_rate": 0.00019348804249147723, "loss": 5.2035, "mean_token_accuracy": 0.222446571290493, "num_tokens": 2426958.0, "step": 2855 }, { "entropy": 5.589015960693359, "epoch": 0.06390561632051124, "grad_norm": 1.2578125, "learning_rate": 0.0001923910944447655, "loss": 5.3825, "mean_token_accuracy": 0.21515996307134627, "num_tokens": 2431011.0, "step": 2860 }, { "entropy": 5.543829345703125, "epoch": 0.06401733942596669, "grad_norm": 1.359375, "learning_rate": 0.00019129641116292928, "loss": 5.34, "mean_token_accuracy": 0.206784251332283, "num_tokens": 2435050.0, "step": 2865 }, { "entropy": 5.694529294967651, "epoch": 0.06412906253142213, "grad_norm": 1.40625, "learning_rate": 0.00019020402265726343, "loss": 5.4851, "mean_token_accuracy": 0.2040119305253029, "num_tokens": 2440017.0, "step": 2870 }, { "entropy": 5.667089891433716, "epoch": 0.06424078563687756, "grad_norm": 1.3671875, "learning_rate": 0.0001891139588761509, "loss": 5.3921, "mean_token_accuracy": 0.20922284126281737, "num_tokens": 2444177.0, "step": 2875 }, { "entropy": 5.710749530792237, "epoch": 0.064352508742333, "grad_norm": 1.3203125, "learning_rate": 0.00018802624970424076, "loss": 5.5013, "mean_token_accuracy": 0.19369867593050002, "num_tokens": 2448287.0, "step": 2880 }, { "entropy": 5.707183218002319, "epoch": 0.06446423184778845, "grad_norm": 1.2265625, "learning_rate": 0.00018694092496162945, "loss": 5.4414, "mean_token_accuracy": 0.20463113188743592, "num_tokens": 2452901.0, "step": 2885 }, { "entropy": 5.6640900611877445, "epoch": 0.06457595495324388, "grad_norm": 1.3671875, "learning_rate": 0.00018585801440304306, "loss": 5.2873, "mean_token_accuracy": 0.22243628948926925, "num_tokens": 2457333.0, "step": 2890 }, { "entropy": 5.768609142303466, "epoch": 0.06468767805869932, "grad_norm": 1.234375, "learning_rate": 0.00018477754771702165, "loss": 5.4756, "mean_token_accuracy": 0.19730397388339044, "num_tokens": 2461502.0, "step": 2895 }, { "entropy": 5.744687557220459, "epoch": 0.06479940116415477, "grad_norm": 1.296875, "learning_rate": 0.00018369955452510506, "loss": 5.5625, "mean_token_accuracy": 0.19384552538394928, "num_tokens": 2465742.0, "step": 2900 }, { "entropy": 5.689142560958862, "epoch": 0.0649111242696102, "grad_norm": 1.3515625, "learning_rate": 0.0001826240643810212, "loss": 5.4767, "mean_token_accuracy": 0.19787981808185579, "num_tokens": 2469768.0, "step": 2905 }, { "entropy": 5.614039134979248, "epoch": 0.06502284737506564, "grad_norm": 1.2734375, "learning_rate": 0.0001815511067698758, "loss": 5.4224, "mean_token_accuracy": 0.2049763947725296, "num_tokens": 2473867.0, "step": 2910 }, { "entropy": 5.599695920944214, "epoch": 0.06513457048052107, "grad_norm": 1.3125, "learning_rate": 0.0001804807111073436, "loss": 5.3444, "mean_token_accuracy": 0.20889384001493455, "num_tokens": 2477783.0, "step": 2915 }, { "entropy": 5.6680515766143795, "epoch": 0.06524629358597651, "grad_norm": 1.3203125, "learning_rate": 0.0001794129067388625, "loss": 5.5353, "mean_token_accuracy": 0.19455234706401825, "num_tokens": 2482135.0, "step": 2920 }, { "entropy": 5.678571319580078, "epoch": 0.06535801669143196, "grad_norm": 1.21875, "learning_rate": 0.00017834772293882868, "loss": 5.466, "mean_token_accuracy": 0.19640394151210785, "num_tokens": 2486171.0, "step": 2925 }, { "entropy": 5.572101306915283, "epoch": 0.06546973979688739, "grad_norm": 1.3671875, "learning_rate": 0.000177285188909794, "loss": 5.2695, "mean_token_accuracy": 0.21688248813152314, "num_tokens": 2490372.0, "step": 2930 }, { "entropy": 5.638118267059326, "epoch": 0.06558146290234283, "grad_norm": 1.2421875, "learning_rate": 0.0001762253337816656, "loss": 5.2066, "mean_token_accuracy": 0.2306307077407837, "num_tokens": 2494662.0, "step": 2935 }, { "entropy": 5.559669113159179, "epoch": 0.06569318600779828, "grad_norm": 1.453125, "learning_rate": 0.00017516818661090738, "loss": 5.3592, "mean_token_accuracy": 0.21668146252632142, "num_tokens": 2498164.0, "step": 2940 }, { "entropy": 5.663913297653198, "epoch": 0.0658049091132537, "grad_norm": 1.2890625, "learning_rate": 0.0001741137763797428, "loss": 5.4256, "mean_token_accuracy": 0.1958498999476433, "num_tokens": 2502335.0, "step": 2945 }, { "entropy": 5.715854167938232, "epoch": 0.06591663221870915, "grad_norm": 1.328125, "learning_rate": 0.00017306213199536115, "loss": 5.5713, "mean_token_accuracy": 0.1957806333899498, "num_tokens": 2506792.0, "step": 2950 }, { "entropy": 5.641438007354736, "epoch": 0.0660283553241646, "grad_norm": 1.1875, "learning_rate": 0.0001720132822891243, "loss": 5.3534, "mean_token_accuracy": 0.21230535209178925, "num_tokens": 2510863.0, "step": 2955 }, { "entropy": 5.697113370895385, "epoch": 0.06614007842962003, "grad_norm": 1.3203125, "learning_rate": 0.0001709672560157769, "loss": 5.4717, "mean_token_accuracy": 0.19569053649902343, "num_tokens": 2515220.0, "step": 2960 }, { "entropy": 5.630364465713501, "epoch": 0.06625180153507547, "grad_norm": 1.34375, "learning_rate": 0.00016992408185265758, "loss": 5.345, "mean_token_accuracy": 0.20241880565881729, "num_tokens": 2519501.0, "step": 2965 }, { "entropy": 5.6378789901733395, "epoch": 0.06636352464053091, "grad_norm": 1.359375, "learning_rate": 0.00016888378839891298, "loss": 5.388, "mean_token_accuracy": 0.20747827142477035, "num_tokens": 2523838.0, "step": 2970 }, { "entropy": 5.594701528549194, "epoch": 0.06647524774598634, "grad_norm": 1.3359375, "learning_rate": 0.0001678464041747137, "loss": 5.375, "mean_token_accuracy": 0.20893704146146774, "num_tokens": 2528480.0, "step": 2975 }, { "entropy": 5.649026823043823, "epoch": 0.06658697085144179, "grad_norm": 1.265625, "learning_rate": 0.00016681195762047223, "loss": 5.4124, "mean_token_accuracy": 0.19796885550022125, "num_tokens": 2532596.0, "step": 2980 }, { "entropy": 5.646418809890747, "epoch": 0.06669869395689723, "grad_norm": 1.2265625, "learning_rate": 0.00016578047709606337, "loss": 5.3353, "mean_token_accuracy": 0.21195341944694518, "num_tokens": 2536980.0, "step": 2985 }, { "entropy": 5.716257476806641, "epoch": 0.06681041706235266, "grad_norm": 1.171875, "learning_rate": 0.00016475199088004678, "loss": 5.4948, "mean_token_accuracy": 0.1920865833759308, "num_tokens": 2540729.0, "step": 2990 }, { "entropy": 5.747273063659668, "epoch": 0.0669221401678081, "grad_norm": 1.28125, "learning_rate": 0.00016372652716889163, "loss": 5.5499, "mean_token_accuracy": 0.18710134774446488, "num_tokens": 2545164.0, "step": 2995 }, { "entropy": 5.672609186172485, "epoch": 0.06703386327326355, "grad_norm": 1.2265625, "learning_rate": 0.0001627041140762035, "loss": 5.2965, "mean_token_accuracy": 0.21778741478919983, "num_tokens": 2549182.0, "step": 3000 }, { "entropy": 5.636187219619751, "epoch": 0.06714558637871898, "grad_norm": 1.2890625, "learning_rate": 0.00016168477963195382, "loss": 5.3853, "mean_token_accuracy": 0.2133116140961647, "num_tokens": 2553673.0, "step": 3005 }, { "entropy": 5.695971250534058, "epoch": 0.06725730948417442, "grad_norm": 1.1484375, "learning_rate": 0.0001606685517817114, "loss": 5.4821, "mean_token_accuracy": 0.20681501924991608, "num_tokens": 2558494.0, "step": 3010 }, { "entropy": 5.677333641052246, "epoch": 0.06736903258962987, "grad_norm": 1.3828125, "learning_rate": 0.00015965545838587592, "loss": 5.3658, "mean_token_accuracy": 0.20497429519891738, "num_tokens": 2562488.0, "step": 3015 }, { "entropy": 5.659304904937744, "epoch": 0.0674807556950853, "grad_norm": 1.46875, "learning_rate": 0.00015864552721891467, "loss": 5.3, "mean_token_accuracy": 0.2087144747376442, "num_tokens": 2566404.0, "step": 3020 }, { "entropy": 5.549917411804199, "epoch": 0.06759247880054074, "grad_norm": 1.3046875, "learning_rate": 0.00015763878596860076, "loss": 5.3035, "mean_token_accuracy": 0.21576655209064483, "num_tokens": 2571000.0, "step": 3025 }, { "entropy": 5.63624153137207, "epoch": 0.06770420190599617, "grad_norm": 1.125, "learning_rate": 0.00015663526223525412, "loss": 5.4649, "mean_token_accuracy": 0.19927534759044646, "num_tokens": 2575795.0, "step": 3030 }, { "entropy": 5.582642936706543, "epoch": 0.06781592501145162, "grad_norm": 1.265625, "learning_rate": 0.0001556349835309848, "loss": 5.2806, "mean_token_accuracy": 0.21977338343858718, "num_tokens": 2579885.0, "step": 3035 }, { "entropy": 5.667189407348633, "epoch": 0.06792764811690706, "grad_norm": 1.328125, "learning_rate": 0.0001546379772789389, "loss": 5.3114, "mean_token_accuracy": 0.2110684707760811, "num_tokens": 2583951.0, "step": 3040 }, { "entropy": 5.559815740585327, "epoch": 0.06803937122236249, "grad_norm": 1.234375, "learning_rate": 0.00015364427081254622, "loss": 5.2818, "mean_token_accuracy": 0.21681709736585617, "num_tokens": 2587881.0, "step": 3045 }, { "entropy": 5.647535800933838, "epoch": 0.06815109432781793, "grad_norm": 1.2265625, "learning_rate": 0.00015265389137477165, "loss": 5.3734, "mean_token_accuracy": 0.21438211798667908, "num_tokens": 2592123.0, "step": 3050 }, { "entropy": 5.74369068145752, "epoch": 0.06826281743327338, "grad_norm": 1.3046875, "learning_rate": 0.00015166686611736786, "loss": 5.5144, "mean_token_accuracy": 0.1928423747420311, "num_tokens": 2596315.0, "step": 3055 }, { "entropy": 5.70271782875061, "epoch": 0.06837454053872881, "grad_norm": 1.234375, "learning_rate": 0.00015068322210013064, "loss": 5.3366, "mean_token_accuracy": 0.20409525334835052, "num_tokens": 2600472.0, "step": 3060 }, { "entropy": 5.5629678726196286, "epoch": 0.06848626364418425, "grad_norm": 1.2578125, "learning_rate": 0.0001497029862901578, "loss": 5.2077, "mean_token_accuracy": 0.22850425839424132, "num_tokens": 2604310.0, "step": 3065 }, { "entropy": 5.63937201499939, "epoch": 0.0685979867496397, "grad_norm": 1.2109375, "learning_rate": 0.00014872618556110905, "loss": 5.4446, "mean_token_accuracy": 0.20147960931062697, "num_tokens": 2608771.0, "step": 3070 }, { "entropy": 5.563421058654785, "epoch": 0.06870970985509513, "grad_norm": 1.2734375, "learning_rate": 0.00014775284669246992, "loss": 5.327, "mean_token_accuracy": 0.21189119219779967, "num_tokens": 2613299.0, "step": 3075 }, { "entropy": 5.701922273635864, "epoch": 0.06882143296055057, "grad_norm": 1.2578125, "learning_rate": 0.00014678299636881716, "loss": 5.3712, "mean_token_accuracy": 0.21400164812803268, "num_tokens": 2617778.0, "step": 3080 }, { "entropy": 5.610776948928833, "epoch": 0.06893315606600602, "grad_norm": 1.2890625, "learning_rate": 0.0001458166611790873, "loss": 5.3401, "mean_token_accuracy": 0.2187138855457306, "num_tokens": 2621636.0, "step": 3085 }, { "entropy": 5.6394589900970455, "epoch": 0.06904487917146145, "grad_norm": 1.3125, "learning_rate": 0.00014485386761584773, "loss": 5.3942, "mean_token_accuracy": 0.20630815178155898, "num_tokens": 2625923.0, "step": 3090 }, { "entropy": 5.7048985958099365, "epoch": 0.06915660227691689, "grad_norm": 1.328125, "learning_rate": 0.00014389464207457042, "loss": 5.4896, "mean_token_accuracy": 0.1937684491276741, "num_tokens": 2630282.0, "step": 3095 }, { "entropy": 5.595674705505371, "epoch": 0.06926832538237233, "grad_norm": 1.359375, "learning_rate": 0.00014293901085290795, "loss": 5.2659, "mean_token_accuracy": 0.21248525232076645, "num_tokens": 2634376.0, "step": 3100 }, { "entropy": 5.653559637069702, "epoch": 0.06938004848782776, "grad_norm": 1.4453125, "learning_rate": 0.00014198700014997307, "loss": 5.3802, "mean_token_accuracy": 0.21333143562078477, "num_tokens": 2638486.0, "step": 3105 }, { "entropy": 5.716269636154175, "epoch": 0.06949177159328321, "grad_norm": 1.265625, "learning_rate": 0.00014103863606562016, "loss": 5.4523, "mean_token_accuracy": 0.19313299655914307, "num_tokens": 2643023.0, "step": 3110 }, { "entropy": 5.643144178390503, "epoch": 0.06960349469873865, "grad_norm": 1.375, "learning_rate": 0.00014009394459972964, "loss": 5.3403, "mean_token_accuracy": 0.21734200417995453, "num_tokens": 2646977.0, "step": 3115 }, { "entropy": 5.687381267547607, "epoch": 0.06971521780419408, "grad_norm": 1.3984375, "learning_rate": 0.00013915295165149513, "loss": 5.3712, "mean_token_accuracy": 0.20944836586713791, "num_tokens": 2651296.0, "step": 3120 }, { "entropy": 5.641043186187744, "epoch": 0.06982694090964953, "grad_norm": 1.3125, "learning_rate": 0.00013821568301871384, "loss": 5.4203, "mean_token_accuracy": 0.2098002940416336, "num_tokens": 2655511.0, "step": 3125 }, { "entropy": 5.659207010269165, "epoch": 0.06993866401510497, "grad_norm": 1.1640625, "learning_rate": 0.00013728216439707862, "loss": 5.3967, "mean_token_accuracy": 0.20127485394477845, "num_tokens": 2660457.0, "step": 3130 }, { "entropy": 5.628619861602783, "epoch": 0.0700503871205604, "grad_norm": 1.2578125, "learning_rate": 0.00013635242137947419, "loss": 5.3159, "mean_token_accuracy": 0.21965348869562148, "num_tokens": 2664675.0, "step": 3135 }, { "entropy": 5.6215027332305905, "epoch": 0.07016211022601584, "grad_norm": 1.359375, "learning_rate": 0.00013542647945527498, "loss": 5.2894, "mean_token_accuracy": 0.22159690260887147, "num_tokens": 2668809.0, "step": 3140 }, { "entropy": 5.630403709411621, "epoch": 0.07027383333147127, "grad_norm": 1.2578125, "learning_rate": 0.0001345043640096465, "loss": 5.3259, "mean_token_accuracy": 0.20712538212537765, "num_tokens": 2672974.0, "step": 3145 }, { "entropy": 5.648602390289307, "epoch": 0.07038555643692672, "grad_norm": 1.1796875, "learning_rate": 0.00013358610032284957, "loss": 5.3497, "mean_token_accuracy": 0.20255878418684006, "num_tokens": 2677633.0, "step": 3150 }, { "entropy": 5.608045816421509, "epoch": 0.07049727954238216, "grad_norm": 1.3046875, "learning_rate": 0.000132671713569547, "loss": 5.2446, "mean_token_accuracy": 0.21367690563201905, "num_tokens": 2681398.0, "step": 3155 }, { "entropy": 5.71210880279541, "epoch": 0.0706090026478376, "grad_norm": 1.21875, "learning_rate": 0.0001317612288181136, "loss": 5.4702, "mean_token_accuracy": 0.19393738508224487, "num_tokens": 2685785.0, "step": 3160 }, { "entropy": 5.611098861694336, "epoch": 0.07072072575329304, "grad_norm": 1.3359375, "learning_rate": 0.00013085467102994864, "loss": 5.333, "mean_token_accuracy": 0.21265119463205337, "num_tokens": 2689804.0, "step": 3165 }, { "entropy": 5.705452013015747, "epoch": 0.07083244885874848, "grad_norm": 1.296875, "learning_rate": 0.00012995206505879198, "loss": 5.4991, "mean_token_accuracy": 0.20156707912683486, "num_tokens": 2694632.0, "step": 3170 }, { "entropy": 5.57891731262207, "epoch": 0.07094417196420391, "grad_norm": 1.3359375, "learning_rate": 0.0001290534356500421, "loss": 5.1713, "mean_token_accuracy": 0.22638255208730698, "num_tokens": 2698723.0, "step": 3175 }, { "entropy": 5.4940540313720705, "epoch": 0.07105589506965936, "grad_norm": 1.28125, "learning_rate": 0.00012815880744007827, "loss": 5.116, "mean_token_accuracy": 0.2263195186853409, "num_tokens": 2702746.0, "step": 3180 }, { "entropy": 5.621317625045776, "epoch": 0.0711676181751148, "grad_norm": 1.390625, "learning_rate": 0.00012726820495558483, "loss": 5.3352, "mean_token_accuracy": 0.2047779694199562, "num_tokens": 2707037.0, "step": 3185 }, { "entropy": 5.711771535873413, "epoch": 0.07127934128057023, "grad_norm": 1.2578125, "learning_rate": 0.00012638165261287868, "loss": 5.4795, "mean_token_accuracy": 0.19314364492893218, "num_tokens": 2711529.0, "step": 3190 }, { "entropy": 5.598011302947998, "epoch": 0.07139106438602567, "grad_norm": 1.2421875, "learning_rate": 0.0001254991747172402, "loss": 5.3215, "mean_token_accuracy": 0.21524787843227386, "num_tokens": 2715804.0, "step": 3195 }, { "entropy": 5.548861265182495, "epoch": 0.07150278749148112, "grad_norm": 1.203125, "learning_rate": 0.00012462079546224662, "loss": 5.2071, "mean_token_accuracy": 0.2398407518863678, "num_tokens": 2719902.0, "step": 3200 }, { "entropy": 5.5862783908844, "epoch": 0.07161451059693655, "grad_norm": 1.2734375, "learning_rate": 0.00012374653892910896, "loss": 5.1621, "mean_token_accuracy": 0.22908655554056168, "num_tokens": 2723673.0, "step": 3205 }, { "entropy": 5.6736372947692875, "epoch": 0.07172623370239199, "grad_norm": 1.375, "learning_rate": 0.00012287642908601166, "loss": 5.4287, "mean_token_accuracy": 0.20150194317102432, "num_tokens": 2727634.0, "step": 3210 }, { "entropy": 5.567438411712646, "epoch": 0.07183795680784744, "grad_norm": 1.203125, "learning_rate": 0.00012201048978745569, "loss": 5.1691, "mean_token_accuracy": 0.23436965495347978, "num_tokens": 2731931.0, "step": 3215 }, { "entropy": 5.5576332092285154, "epoch": 0.07194967991330287, "grad_norm": 1.359375, "learning_rate": 0.00012114874477360427, "loss": 5.2348, "mean_token_accuracy": 0.2189928859472275, "num_tokens": 2736212.0, "step": 3220 }, { "entropy": 5.672902536392212, "epoch": 0.07206140301875831, "grad_norm": 1.3359375, "learning_rate": 0.00012029121766963236, "loss": 5.4208, "mean_token_accuracy": 0.20269766449928284, "num_tokens": 2740696.0, "step": 3225 }, { "entropy": 5.5735211849212645, "epoch": 0.07217312612421375, "grad_norm": 1.3359375, "learning_rate": 0.00011943793198507858, "loss": 5.3422, "mean_token_accuracy": 0.2063916340470314, "num_tokens": 2744695.0, "step": 3230 }, { "entropy": 5.624039936065674, "epoch": 0.07228484922966918, "grad_norm": 1.4375, "learning_rate": 0.00011858891111320104, "loss": 5.382, "mean_token_accuracy": 0.2006018877029419, "num_tokens": 2748855.0, "step": 3235 }, { "entropy": 5.558242464065552, "epoch": 0.07239657233512463, "grad_norm": 1.25, "learning_rate": 0.0001177441783303359, "loss": 5.2445, "mean_token_accuracy": 0.21730558276176454, "num_tokens": 2753099.0, "step": 3240 }, { "entropy": 5.604915714263916, "epoch": 0.07250829544058007, "grad_norm": 1.234375, "learning_rate": 0.00011690375679525896, "loss": 5.3702, "mean_token_accuracy": 0.20517390370368957, "num_tokens": 2757290.0, "step": 3245 }, { "entropy": 5.621156454086304, "epoch": 0.0726200185460355, "grad_norm": 1.3671875, "learning_rate": 0.00011606766954855124, "loss": 5.4317, "mean_token_accuracy": 0.2096293345093727, "num_tokens": 2761044.0, "step": 3250 }, { "entropy": 5.661703586578369, "epoch": 0.07273174165149095, "grad_norm": 1.21875, "learning_rate": 0.00011523593951196702, "loss": 5.3989, "mean_token_accuracy": 0.2107509046792984, "num_tokens": 2765353.0, "step": 3255 }, { "entropy": 5.6444660186767575, "epoch": 0.07284346475694638, "grad_norm": 1.34375, "learning_rate": 0.00011440858948780523, "loss": 5.2984, "mean_token_accuracy": 0.21078112572431565, "num_tokens": 2769530.0, "step": 3260 }, { "entropy": 5.711241579055786, "epoch": 0.07295518786240182, "grad_norm": 1.2734375, "learning_rate": 0.00011358564215828484, "loss": 5.3533, "mean_token_accuracy": 0.21310799419879914, "num_tokens": 2773876.0, "step": 3265 }, { "entropy": 5.765763902664185, "epoch": 0.07306691096785727, "grad_norm": 1.21875, "learning_rate": 0.00011276712008492254, "loss": 5.5361, "mean_token_accuracy": 0.20008978992700577, "num_tokens": 2778648.0, "step": 3270 }, { "entropy": 5.591998624801636, "epoch": 0.0731786340733127, "grad_norm": 1.4375, "learning_rate": 0.00011195304570791451, "loss": 5.2927, "mean_token_accuracy": 0.2178100273013115, "num_tokens": 2782651.0, "step": 3275 }, { "entropy": 5.670967674255371, "epoch": 0.07329035717876814, "grad_norm": 1.3125, "learning_rate": 0.00011114344134552094, "loss": 5.3911, "mean_token_accuracy": 0.2107714906334877, "num_tokens": 2786665.0, "step": 3280 }, { "entropy": 5.638211393356324, "epoch": 0.07340208028422358, "grad_norm": 1.21875, "learning_rate": 0.0001103383291934545, "loss": 5.33, "mean_token_accuracy": 0.21447014063596725, "num_tokens": 2791037.0, "step": 3285 }, { "entropy": 5.675831651687622, "epoch": 0.07351380338967901, "grad_norm": 1.3125, "learning_rate": 0.00010953773132427141, "loss": 5.3722, "mean_token_accuracy": 0.20269206464290618, "num_tokens": 2795026.0, "step": 3290 }, { "entropy": 5.5699504852294925, "epoch": 0.07362552649513446, "grad_norm": 1.1953125, "learning_rate": 0.00010874166968676677, "loss": 5.1734, "mean_token_accuracy": 0.22651610225439073, "num_tokens": 2798998.0, "step": 3295 }, { "entropy": 5.586806106567383, "epoch": 0.0737372496005899, "grad_norm": 1.2890625, "learning_rate": 0.00010795016610537251, "loss": 5.2785, "mean_token_accuracy": 0.21073253005743026, "num_tokens": 2803676.0, "step": 3300 }, { "entropy": 5.584428548812866, "epoch": 0.07384897270604533, "grad_norm": 1.203125, "learning_rate": 0.00010716324227955904, "loss": 5.2331, "mean_token_accuracy": 0.2215863883495331, "num_tokens": 2808227.0, "step": 3305 }, { "entropy": 5.554582262039185, "epoch": 0.07396069581150078, "grad_norm": 1.21875, "learning_rate": 0.0001063809197832406, "loss": 5.3332, "mean_token_accuracy": 0.2192509040236473, "num_tokens": 2812340.0, "step": 3310 }, { "entropy": 5.553052473068237, "epoch": 0.07407241891695622, "grad_norm": 1.3359375, "learning_rate": 0.00010560322006418368, "loss": 5.2239, "mean_token_accuracy": 0.22243622392416001, "num_tokens": 2816329.0, "step": 3315 }, { "entropy": 5.721336460113525, "epoch": 0.07418414202241165, "grad_norm": 1.359375, "learning_rate": 0.00010483016444341887, "loss": 5.5219, "mean_token_accuracy": 0.1935374766588211, "num_tokens": 2820759.0, "step": 3320 }, { "entropy": 5.531715250015258, "epoch": 0.0742958651278671, "grad_norm": 1.234375, "learning_rate": 0.00010406177411465654, "loss": 5.1382, "mean_token_accuracy": 0.23273815512657164, "num_tokens": 2825463.0, "step": 3325 }, { "entropy": 5.6134782314300535, "epoch": 0.07440758823332254, "grad_norm": 1.28125, "learning_rate": 0.00010329807014370562, "loss": 5.3523, "mean_token_accuracy": 0.2063085600733757, "num_tokens": 2829453.0, "step": 3330 }, { "entropy": 5.577965021133423, "epoch": 0.07451931133877797, "grad_norm": 1.3203125, "learning_rate": 0.00010253907346789632, "loss": 5.3323, "mean_token_accuracy": 0.21914800852537156, "num_tokens": 2833526.0, "step": 3335 }, { "entropy": 5.664664268493652, "epoch": 0.07463103444423341, "grad_norm": 1.359375, "learning_rate": 0.00010178480489550596, "loss": 5.4293, "mean_token_accuracy": 0.21083945482969285, "num_tokens": 2837634.0, "step": 3340 }, { "entropy": 5.626935768127441, "epoch": 0.07474275754968886, "grad_norm": 1.34375, "learning_rate": 0.00010103528510518836, "loss": 5.2617, "mean_token_accuracy": 0.21807632148265838, "num_tokens": 2842126.0, "step": 3345 }, { "entropy": 5.598409366607666, "epoch": 0.07485448065514429, "grad_norm": 1.1953125, "learning_rate": 0.0001002905346454073, "loss": 5.3069, "mean_token_accuracy": 0.21519531011581422, "num_tokens": 2846518.0, "step": 3350 }, { "entropy": 5.6287015914917, "epoch": 0.07496620376059973, "grad_norm": 1.3984375, "learning_rate": 9.955057393387285e-05, "loss": 5.3828, "mean_token_accuracy": 0.21342982202768326, "num_tokens": 2851045.0, "step": 3355 }, { "entropy": 5.626803350448609, "epoch": 0.07507792686605518, "grad_norm": 1.296875, "learning_rate": 9.88154232569816e-05, "loss": 5.3587, "mean_token_accuracy": 0.20877184420824052, "num_tokens": 2855235.0, "step": 3360 }, { "entropy": 5.675837659835816, "epoch": 0.0751896499715106, "grad_norm": 1.2421875, "learning_rate": 9.808510276926075e-05, "loss": 5.2999, "mean_token_accuracy": 0.21340236514806749, "num_tokens": 2859507.0, "step": 3365 }, { "entropy": 5.663285732269287, "epoch": 0.07530137307696605, "grad_norm": 1.2421875, "learning_rate": 9.735963249281549e-05, "loss": 5.4368, "mean_token_accuracy": 0.19709139168262482, "num_tokens": 2863833.0, "step": 3370 }, { "entropy": 5.717996978759766, "epoch": 0.0754130961824215, "grad_norm": 1.3671875, "learning_rate": 9.663903231677974e-05, "loss": 5.5231, "mean_token_accuracy": 0.1840850055217743, "num_tokens": 2868540.0, "step": 3375 }, { "entropy": 5.716697931289673, "epoch": 0.07552481928787692, "grad_norm": 1.359375, "learning_rate": 9.592332199677145e-05, "loss": 5.2744, "mean_token_accuracy": 0.2100656121969223, "num_tokens": 2872407.0, "step": 3380 }, { "entropy": 5.754004335403442, "epoch": 0.07563654239333237, "grad_norm": 1.3828125, "learning_rate": 9.521252115435061e-05, "loss": 5.3745, "mean_token_accuracy": 0.2050818309187889, "num_tokens": 2876428.0, "step": 3385 }, { "entropy": 5.767968940734863, "epoch": 0.0757482654987878, "grad_norm": 1.1484375, "learning_rate": 9.450664927648126e-05, "loss": 5.4116, "mean_token_accuracy": 0.2025663897395134, "num_tokens": 2880536.0, "step": 3390 }, { "entropy": 5.68847975730896, "epoch": 0.07585998860424324, "grad_norm": 1.2734375, "learning_rate": 9.380572571499758e-05, "loss": 5.3452, "mean_token_accuracy": 0.21021927148103714, "num_tokens": 2884607.0, "step": 3395 }, { "entropy": 5.550797319412231, "epoch": 0.07597171170969869, "grad_norm": 1.34375, "learning_rate": 9.310976968607307e-05, "loss": 5.3404, "mean_token_accuracy": 0.21931682974100114, "num_tokens": 2888996.0, "step": 3400 }, { "entropy": 5.603079700469971, "epoch": 0.07608343481515412, "grad_norm": 1.2578125, "learning_rate": 9.241880026969381e-05, "loss": 5.3063, "mean_token_accuracy": 0.2104686364531517, "num_tokens": 2893526.0, "step": 3405 }, { "entropy": 5.647873449325561, "epoch": 0.07619515792060956, "grad_norm": 1.1796875, "learning_rate": 9.173283640913537e-05, "loss": 5.4548, "mean_token_accuracy": 0.21058177202939987, "num_tokens": 2898170.0, "step": 3410 }, { "entropy": 5.669381523132325, "epoch": 0.076306881026065, "grad_norm": 1.1640625, "learning_rate": 9.10518969104436e-05, "loss": 5.3745, "mean_token_accuracy": 0.2074264630675316, "num_tokens": 2902694.0, "step": 3415 }, { "entropy": 5.736430358886719, "epoch": 0.07641860413152043, "grad_norm": 1.328125, "learning_rate": 9.037600044191868e-05, "loss": 5.3976, "mean_token_accuracy": 0.20373451411724092, "num_tokens": 2906529.0, "step": 3420 }, { "entropy": 5.738633918762207, "epoch": 0.07653032723697588, "grad_norm": 1.3984375, "learning_rate": 8.970516553360383e-05, "loss": 5.4598, "mean_token_accuracy": 0.19842032194137574, "num_tokens": 2910643.0, "step": 3425 }, { "entropy": 5.631137943267822, "epoch": 0.07664205034243132, "grad_norm": 1.328125, "learning_rate": 8.903941057677692e-05, "loss": 5.2826, "mean_token_accuracy": 0.20652553886175157, "num_tokens": 2914755.0, "step": 3430 }, { "entropy": 5.6463858127594, "epoch": 0.07675377344788675, "grad_norm": 1.3359375, "learning_rate": 8.837875382344635e-05, "loss": 5.3067, "mean_token_accuracy": 0.21481942981481553, "num_tokens": 2919054.0, "step": 3435 }, { "entropy": 5.491642808914184, "epoch": 0.0768654965533422, "grad_norm": 1.296875, "learning_rate": 8.772321338585076e-05, "loss": 5.0677, "mean_token_accuracy": 0.23272379338741303, "num_tokens": 2922887.0, "step": 3440 }, { "entropy": 5.581666898727417, "epoch": 0.07697721965879764, "grad_norm": 1.2578125, "learning_rate": 8.707280723596242e-05, "loss": 5.3983, "mean_token_accuracy": 0.2076255962252617, "num_tokens": 2927587.0, "step": 3445 }, { "entropy": 5.558794689178467, "epoch": 0.07708894276425307, "grad_norm": 1.5, "learning_rate": 8.64275532049944e-05, "loss": 5.3603, "mean_token_accuracy": 0.21038010716438293, "num_tokens": 2931485.0, "step": 3450 }, { "entropy": 5.5361310005187985, "epoch": 0.07720066586970852, "grad_norm": 1.375, "learning_rate": 8.578746898291198e-05, "loss": 5.2803, "mean_token_accuracy": 0.22317569255828856, "num_tokens": 2935592.0, "step": 3455 }, { "entropy": 5.620584774017334, "epoch": 0.07731238897516396, "grad_norm": 1.328125, "learning_rate": 8.515257211794742e-05, "loss": 5.3616, "mean_token_accuracy": 0.2116774022579193, "num_tokens": 2939758.0, "step": 3460 }, { "entropy": 5.554155349731445, "epoch": 0.07742411208061939, "grad_norm": 1.4453125, "learning_rate": 8.452288001611896e-05, "loss": 5.2242, "mean_token_accuracy": 0.22089575827121735, "num_tokens": 2943927.0, "step": 3465 }, { "entropy": 5.634997081756592, "epoch": 0.07753583518607483, "grad_norm": 1.2734375, "learning_rate": 8.389840994075379e-05, "loss": 5.3202, "mean_token_accuracy": 0.20712267607450485, "num_tokens": 2948310.0, "step": 3470 }, { "entropy": 5.6394532203674315, "epoch": 0.07764755829153028, "grad_norm": 1.359375, "learning_rate": 8.327917901201435e-05, "loss": 5.3149, "mean_token_accuracy": 0.21153984516859053, "num_tokens": 2952647.0, "step": 3475 }, { "entropy": 5.589263820648194, "epoch": 0.07775928139698571, "grad_norm": 1.28125, "learning_rate": 8.266520420642931e-05, "loss": 5.3369, "mean_token_accuracy": 0.20888011008501053, "num_tokens": 2956940.0, "step": 3480 }, { "entropy": 5.528928422927857, "epoch": 0.07787100450244115, "grad_norm": 1.2109375, "learning_rate": 8.205650235642828e-05, "loss": 5.1551, "mean_token_accuracy": 0.21771908700466155, "num_tokens": 2960954.0, "step": 3485 }, { "entropy": 5.64394326210022, "epoch": 0.0779827276078966, "grad_norm": 1.421875, "learning_rate": 8.145309014987978e-05, "loss": 5.4679, "mean_token_accuracy": 0.202695769071579, "num_tokens": 2965161.0, "step": 3490 }, { "entropy": 5.660602188110351, "epoch": 0.07809445071335203, "grad_norm": 1.40625, "learning_rate": 8.085498412963437e-05, "loss": 5.3331, "mean_token_accuracy": 0.20686759054660797, "num_tokens": 2969071.0, "step": 3495 }, { "entropy": 5.631470584869385, "epoch": 0.07820617381880747, "grad_norm": 1.1953125, "learning_rate": 8.026220069307078e-05, "loss": 5.3428, "mean_token_accuracy": 0.2097361460328102, "num_tokens": 2973593.0, "step": 3500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 639010367078400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }